diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,450204 @@ +{ + "best_metric": 7.352176189422607, + "best_model_checkpoint": "final_models/focus_kin_phi_focus_trained/checkpoint-10717", + "epoch": 6.0, + "eval_steps": 500, + "global_step": 64302, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 9.330969487729775e-05, + "grad_norm": 41.65060671422142, + "learning_rate": 1.5e-07, + "loss": 11.2417, + "step": 1 + }, + { + "epoch": 0.0001866193897545955, + "grad_norm": 36.42862082147988, + "learning_rate": 3e-07, + "loss": 10.8435, + "step": 2 + }, + { + "epoch": 0.00027992908463189323, + "grad_norm": 34.87984726045636, + "learning_rate": 4.5e-07, + "loss": 11.3653, + "step": 3 + }, + { + "epoch": 0.000373238779509191, + "grad_norm": 89.55973917352621, + "learning_rate": 6e-07, + "loss": 10.6237, + "step": 4 + }, + { + "epoch": 0.00046654847438648876, + "grad_norm": 32.312573095641525, + "learning_rate": 7.499999999999999e-07, + "loss": 10.234, + "step": 5 + }, + { + "epoch": 0.0005598581692637865, + "grad_norm": 42.667901274782395, + "learning_rate": 9e-07, + "loss": 11.1672, + "step": 6 + }, + { + "epoch": 0.0006531678641410843, + "grad_norm": 46.517219491337094, + "learning_rate": 1.05e-06, + "loss": 10.4678, + "step": 7 + }, + { + "epoch": 0.000746477559018382, + "grad_norm": 40.75897232955888, + "learning_rate": 1.2e-06, + "loss": 10.7627, + "step": 8 + }, + { + "epoch": 0.0008397872538956798, + "grad_norm": 37.10035474359133, + "learning_rate": 1.3499999999999998e-06, + "loss": 10.4277, + "step": 9 + }, + { + "epoch": 0.0009330969487729775, + "grad_norm": 38.31751947338769, + "learning_rate": 1.4999999999999998e-06, + "loss": 10.5138, + "step": 10 + }, + { + "epoch": 0.0010264066436502753, + "grad_norm": 34.71768215819602, + "learning_rate": 1.6499999999999997e-06, + "loss": 10.5572, + "step": 11 + }, + { + "epoch": 0.001119716338527573, + "grad_norm": 35.52008207136885, + "learning_rate": 1.8e-06, + "loss": 10.309, + "step": 12 + }, + { + "epoch": 0.0012130260334048707, + "grad_norm": 112.93150943141897, + "learning_rate": 1.9499999999999995e-06, + "loss": 10.3894, + "step": 13 + }, + { + "epoch": 0.0013063357282821686, + "grad_norm": 55.01548468706236, + "learning_rate": 2.1e-06, + "loss": 10.1456, + "step": 14 + }, + { + "epoch": 0.0013996454231594662, + "grad_norm": 29.05189738996078, + "learning_rate": 2.2499999999999996e-06, + "loss": 10.2925, + "step": 15 + }, + { + "epoch": 0.001492955118036764, + "grad_norm": 29.929546968169024, + "learning_rate": 2.4e-06, + "loss": 10.179, + "step": 16 + }, + { + "epoch": 0.0015862648129140618, + "grad_norm": 22.785145161799708, + "learning_rate": 2.55e-06, + "loss": 9.5812, + "step": 17 + }, + { + "epoch": 0.0016795745077913596, + "grad_norm": 26.907581268837237, + "learning_rate": 2.6999999999999996e-06, + "loss": 10.224, + "step": 18 + }, + { + "epoch": 0.0017728842026686572, + "grad_norm": 18.9500462155395, + "learning_rate": 2.85e-06, + "loss": 9.2961, + "step": 19 + }, + { + "epoch": 0.001866193897545955, + "grad_norm": 24.755622123814515, + "learning_rate": 2.9999999999999997e-06, + "loss": 9.2247, + "step": 20 + }, + { + "epoch": 0.001959503592423253, + "grad_norm": 43.16672826707894, + "learning_rate": 3.15e-06, + "loss": 9.6067, + "step": 21 + }, + { + "epoch": 0.0020528132873005507, + "grad_norm": 12.449844180609615, + "learning_rate": 3.2999999999999993e-06, + "loss": 9.2865, + "step": 22 + }, + { + "epoch": 0.0021461229821778485, + "grad_norm": 196.16932419708482, + "learning_rate": 3.4499999999999996e-06, + "loss": 9.1827, + "step": 23 + }, + { + "epoch": 0.002239432677055146, + "grad_norm": 27.336545406788186, + "learning_rate": 3.6e-06, + "loss": 9.0039, + "step": 24 + }, + { + "epoch": 0.0023327423719324437, + "grad_norm": 16.110282715223843, + "learning_rate": 3.7499999999999997e-06, + "loss": 9.3198, + "step": 25 + }, + { + "epoch": 0.0024260520668097415, + "grad_norm": 17.791663479739864, + "learning_rate": 3.899999999999999e-06, + "loss": 9.0418, + "step": 26 + }, + { + "epoch": 0.0025193617616870393, + "grad_norm": 15.249711573903593, + "learning_rate": 4.049999999999999e-06, + "loss": 8.7604, + "step": 27 + }, + { + "epoch": 0.002612671456564337, + "grad_norm": 15.179883934283508, + "learning_rate": 4.2e-06, + "loss": 9.0337, + "step": 28 + }, + { + "epoch": 0.002705981151441635, + "grad_norm": 12.764756212145844, + "learning_rate": 4.35e-06, + "loss": 9.1824, + "step": 29 + }, + { + "epoch": 0.0027992908463189323, + "grad_norm": 11.50130037786626, + "learning_rate": 4.499999999999999e-06, + "loss": 8.5465, + "step": 30 + }, + { + "epoch": 0.00289260054119623, + "grad_norm": 8.546441539602156, + "learning_rate": 4.6499999999999995e-06, + "loss": 9.2249, + "step": 31 + }, + { + "epoch": 0.002985910236073528, + "grad_norm": 7.378021634564103, + "learning_rate": 4.8e-06, + "loss": 8.7139, + "step": 32 + }, + { + "epoch": 0.0030792199309508258, + "grad_norm": 7.822365876112508, + "learning_rate": 4.95e-06, + "loss": 8.9553, + "step": 33 + }, + { + "epoch": 0.0031725296258281236, + "grad_norm": 5.998198730483377, + "learning_rate": 5.1e-06, + "loss": 8.7271, + "step": 34 + }, + { + "epoch": 0.0032658393207054214, + "grad_norm": 5.986017712521886, + "learning_rate": 5.25e-06, + "loss": 8.9162, + "step": 35 + }, + { + "epoch": 0.0033591490155827192, + "grad_norm": 4.82132914523193, + "learning_rate": 5.399999999999999e-06, + "loss": 8.8316, + "step": 36 + }, + { + "epoch": 0.0034524587104600166, + "grad_norm": 6.1884771299182, + "learning_rate": 5.549999999999999e-06, + "loss": 8.5976, + "step": 37 + }, + { + "epoch": 0.0035457684053373144, + "grad_norm": 9.352374719891351, + "learning_rate": 5.7e-06, + "loss": 8.6948, + "step": 38 + }, + { + "epoch": 0.0036390781002146122, + "grad_norm": 5.543360253147177, + "learning_rate": 5.85e-06, + "loss": 8.9893, + "step": 39 + }, + { + "epoch": 0.00373238779509191, + "grad_norm": 6.295907682841182, + "learning_rate": 5.999999999999999e-06, + "loss": 8.9372, + "step": 40 + }, + { + "epoch": 0.003825697489969208, + "grad_norm": 5.6146235888388984, + "learning_rate": 6.1499999999999996e-06, + "loss": 8.4865, + "step": 41 + }, + { + "epoch": 0.003919007184846506, + "grad_norm": 5.110700515720652, + "learning_rate": 6.3e-06, + "loss": 8.894, + "step": 42 + }, + { + "epoch": 0.0040123168797238035, + "grad_norm": 5.955139165661321, + "learning_rate": 6.449999999999999e-06, + "loss": 8.5863, + "step": 43 + }, + { + "epoch": 0.004105626574601101, + "grad_norm": 3.463669314061232, + "learning_rate": 6.599999999999999e-06, + "loss": 8.6124, + "step": 44 + }, + { + "epoch": 0.004198936269478399, + "grad_norm": 4.3354779420542435, + "learning_rate": 6.749999999999999e-06, + "loss": 8.381, + "step": 45 + }, + { + "epoch": 0.004292245964355697, + "grad_norm": 5.0002041278459, + "learning_rate": 6.899999999999999e-06, + "loss": 8.5738, + "step": 46 + }, + { + "epoch": 0.004385555659232994, + "grad_norm": 4.7202424926014945, + "learning_rate": 7.049999999999999e-06, + "loss": 8.4764, + "step": 47 + }, + { + "epoch": 0.004478865354110292, + "grad_norm": 4.005315856360288, + "learning_rate": 7.2e-06, + "loss": 8.5264, + "step": 48 + }, + { + "epoch": 0.0045721750489875895, + "grad_norm": 3.1461653115960946, + "learning_rate": 7.35e-06, + "loss": 8.2713, + "step": 49 + }, + { + "epoch": 0.004665484743864887, + "grad_norm": 3.991948469289704, + "learning_rate": 7.499999999999999e-06, + "loss": 8.4762, + "step": 50 + }, + { + "epoch": 0.004758794438742185, + "grad_norm": 3.147819853114948, + "learning_rate": 7.65e-06, + "loss": 8.4119, + "step": 51 + }, + { + "epoch": 0.004852104133619483, + "grad_norm": 2.9079867706190656, + "learning_rate": 7.799999999999998e-06, + "loss": 8.2957, + "step": 52 + }, + { + "epoch": 0.004945413828496781, + "grad_norm": 3.994527643219513, + "learning_rate": 7.949999999999998e-06, + "loss": 8.6199, + "step": 53 + }, + { + "epoch": 0.005038723523374079, + "grad_norm": 3.3219977303770047, + "learning_rate": 8.099999999999999e-06, + "loss": 8.3188, + "step": 54 + }, + { + "epoch": 0.005132033218251376, + "grad_norm": 2.9517928087543384, + "learning_rate": 8.249999999999999e-06, + "loss": 8.137, + "step": 55 + }, + { + "epoch": 0.005225342913128674, + "grad_norm": 4.188619761227864, + "learning_rate": 8.4e-06, + "loss": 8.2551, + "step": 56 + }, + { + "epoch": 0.005318652608005972, + "grad_norm": 4.3937243199206, + "learning_rate": 8.55e-06, + "loss": 8.3754, + "step": 57 + }, + { + "epoch": 0.00541196230288327, + "grad_norm": 3.2102598807671407, + "learning_rate": 8.7e-06, + "loss": 8.0628, + "step": 58 + }, + { + "epoch": 0.005505271997760568, + "grad_norm": 3.0875961386034847, + "learning_rate": 8.849999999999998e-06, + "loss": 8.3747, + "step": 59 + }, + { + "epoch": 0.005598581692637865, + "grad_norm": 5.26017759481169, + "learning_rate": 8.999999999999999e-06, + "loss": 8.0659, + "step": 60 + }, + { + "epoch": 0.0056918913875151625, + "grad_norm": 3.5344137087466776, + "learning_rate": 9.149999999999999e-06, + "loss": 8.0665, + "step": 61 + }, + { + "epoch": 0.00578520108239246, + "grad_norm": 2.7417319378175318, + "learning_rate": 9.299999999999999e-06, + "loss": 8.1376, + "step": 62 + }, + { + "epoch": 0.005878510777269758, + "grad_norm": 3.40866684351427, + "learning_rate": 9.45e-06, + "loss": 8.0815, + "step": 63 + }, + { + "epoch": 0.005971820472147056, + "grad_norm": 3.9261982093121732, + "learning_rate": 9.6e-06, + "loss": 8.1893, + "step": 64 + }, + { + "epoch": 0.006065130167024354, + "grad_norm": 3.4113478171055993, + "learning_rate": 9.75e-06, + "loss": 8.1681, + "step": 65 + }, + { + "epoch": 0.0061584398619016515, + "grad_norm": 4.038209755036266, + "learning_rate": 9.9e-06, + "loss": 8.6241, + "step": 66 + }, + { + "epoch": 0.006251749556778949, + "grad_norm": 2.5640616726293852, + "learning_rate": 1.005e-05, + "loss": 8.1298, + "step": 67 + }, + { + "epoch": 0.006345059251656247, + "grad_norm": 3.5126306839667594, + "learning_rate": 1.02e-05, + "loss": 8.1354, + "step": 68 + }, + { + "epoch": 0.006438368946533545, + "grad_norm": 2.4103151150598103, + "learning_rate": 1.035e-05, + "loss": 8.3014, + "step": 69 + }, + { + "epoch": 0.006531678641410843, + "grad_norm": 3.3185932611894984, + "learning_rate": 1.05e-05, + "loss": 7.972, + "step": 70 + }, + { + "epoch": 0.006624988336288141, + "grad_norm": 3.514509260149007, + "learning_rate": 1.0649999999999998e-05, + "loss": 8.1771, + "step": 71 + }, + { + "epoch": 0.0067182980311654384, + "grad_norm": 3.866072077888053, + "learning_rate": 1.0799999999999998e-05, + "loss": 8.6317, + "step": 72 + }, + { + "epoch": 0.006811607726042736, + "grad_norm": 3.8212299625405435, + "learning_rate": 1.0949999999999998e-05, + "loss": 8.0982, + "step": 73 + }, + { + "epoch": 0.006904917420920033, + "grad_norm": 3.8352818125928048, + "learning_rate": 1.1099999999999999e-05, + "loss": 8.0498, + "step": 74 + }, + { + "epoch": 0.006998227115797331, + "grad_norm": 3.741038723205592, + "learning_rate": 1.1249999999999999e-05, + "loss": 7.8925, + "step": 75 + }, + { + "epoch": 0.007091536810674629, + "grad_norm": 4.643662631556424, + "learning_rate": 1.14e-05, + "loss": 8.1105, + "step": 76 + }, + { + "epoch": 0.007184846505551927, + "grad_norm": 2.717847430780462, + "learning_rate": 1.155e-05, + "loss": 8.1172, + "step": 77 + }, + { + "epoch": 0.0072781562004292245, + "grad_norm": 4.470016203171416, + "learning_rate": 1.17e-05, + "loss": 8.1258, + "step": 78 + }, + { + "epoch": 0.007371465895306522, + "grad_norm": 3.2947657529213408, + "learning_rate": 1.1849999999999998e-05, + "loss": 8.0074, + "step": 79 + }, + { + "epoch": 0.00746477559018382, + "grad_norm": 2.615267971344696, + "learning_rate": 1.1999999999999999e-05, + "loss": 8.1308, + "step": 80 + }, + { + "epoch": 0.007558085285061118, + "grad_norm": 3.6427714481295017, + "learning_rate": 1.2149999999999999e-05, + "loss": 7.8749, + "step": 81 + }, + { + "epoch": 0.007651394979938416, + "grad_norm": 4.361763779721175, + "learning_rate": 1.2299999999999999e-05, + "loss": 7.9242, + "step": 82 + }, + { + "epoch": 0.0077447046748157135, + "grad_norm": 4.4181237433021545, + "learning_rate": 1.245e-05, + "loss": 7.8988, + "step": 83 + }, + { + "epoch": 0.007838014369693011, + "grad_norm": 2.754003497382784, + "learning_rate": 1.26e-05, + "loss": 7.6704, + "step": 84 + }, + { + "epoch": 0.00793132406457031, + "grad_norm": 4.661965615104413, + "learning_rate": 1.275e-05, + "loss": 8.095, + "step": 85 + }, + { + "epoch": 0.008024633759447607, + "grad_norm": 5.614237629204432, + "learning_rate": 1.2899999999999998e-05, + "loss": 7.993, + "step": 86 + }, + { + "epoch": 0.008117943454324905, + "grad_norm": 2.8854686725852527, + "learning_rate": 1.3049999999999999e-05, + "loss": 7.5904, + "step": 87 + }, + { + "epoch": 0.008211253149202203, + "grad_norm": 6.37272074365425, + "learning_rate": 1.3199999999999997e-05, + "loss": 8.0211, + "step": 88 + }, + { + "epoch": 0.0083045628440795, + "grad_norm": 4.708584464989698, + "learning_rate": 1.3349999999999998e-05, + "loss": 7.9822, + "step": 89 + }, + { + "epoch": 0.008397872538956798, + "grad_norm": 3.3347839577203406, + "learning_rate": 1.3499999999999998e-05, + "loss": 7.827, + "step": 90 + }, + { + "epoch": 0.008491182233834096, + "grad_norm": 2.9650053261741456, + "learning_rate": 1.3649999999999998e-05, + "loss": 7.9708, + "step": 91 + }, + { + "epoch": 0.008584491928711394, + "grad_norm": 4.461460393997173, + "learning_rate": 1.3799999999999998e-05, + "loss": 8.2882, + "step": 92 + }, + { + "epoch": 0.008677801623588692, + "grad_norm": 5.0099836257861945, + "learning_rate": 1.3949999999999999e-05, + "loss": 7.8462, + "step": 93 + }, + { + "epoch": 0.008771111318465988, + "grad_norm": 5.960424142164332, + "learning_rate": 1.4099999999999999e-05, + "loss": 7.8037, + "step": 94 + }, + { + "epoch": 0.008864421013343286, + "grad_norm": 3.790072321995639, + "learning_rate": 1.4249999999999999e-05, + "loss": 7.9718, + "step": 95 + }, + { + "epoch": 0.008957730708220583, + "grad_norm": 3.877491179344736, + "learning_rate": 1.44e-05, + "loss": 8.2236, + "step": 96 + }, + { + "epoch": 0.009051040403097881, + "grad_norm": 3.0625013217925634, + "learning_rate": 1.455e-05, + "loss": 8.0429, + "step": 97 + }, + { + "epoch": 0.009144350097975179, + "grad_norm": 3.61857990180993, + "learning_rate": 1.47e-05, + "loss": 7.9261, + "step": 98 + }, + { + "epoch": 0.009237659792852477, + "grad_norm": 3.5935393061932386, + "learning_rate": 1.485e-05, + "loss": 8.031, + "step": 99 + }, + { + "epoch": 0.009330969487729775, + "grad_norm": 3.395370150232705, + "learning_rate": 1.4999999999999999e-05, + "loss": 7.9012, + "step": 100 + }, + { + "epoch": 0.009424279182607073, + "grad_norm": 3.31989793172357, + "learning_rate": 1.5149999999999999e-05, + "loss": 7.5289, + "step": 101 + }, + { + "epoch": 0.00951758887748437, + "grad_norm": 4.546247369290937, + "learning_rate": 1.53e-05, + "loss": 7.7636, + "step": 102 + }, + { + "epoch": 0.009610898572361668, + "grad_norm": 6.863903430800889, + "learning_rate": 1.545e-05, + "loss": 7.7951, + "step": 103 + }, + { + "epoch": 0.009704208267238966, + "grad_norm": 4.166630669086163, + "learning_rate": 1.5599999999999996e-05, + "loss": 7.5793, + "step": 104 + }, + { + "epoch": 0.009797517962116264, + "grad_norm": 6.2908858607130815, + "learning_rate": 1.5749999999999997e-05, + "loss": 7.8458, + "step": 105 + }, + { + "epoch": 0.009890827656993562, + "grad_norm": 3.5217519898308476, + "learning_rate": 1.5899999999999997e-05, + "loss": 7.4832, + "step": 106 + }, + { + "epoch": 0.00998413735187086, + "grad_norm": 3.476877189975084, + "learning_rate": 1.6049999999999997e-05, + "loss": 7.7484, + "step": 107 + }, + { + "epoch": 0.010077447046748157, + "grad_norm": 3.925838715665591, + "learning_rate": 1.6199999999999997e-05, + "loss": 7.6837, + "step": 108 + }, + { + "epoch": 0.010170756741625455, + "grad_norm": 4.1320770115099945, + "learning_rate": 1.6349999999999998e-05, + "loss": 7.8955, + "step": 109 + }, + { + "epoch": 0.010264066436502753, + "grad_norm": 3.0996859098639873, + "learning_rate": 1.6499999999999998e-05, + "loss": 7.6554, + "step": 110 + }, + { + "epoch": 0.01035737613138005, + "grad_norm": 3.1682681647609248, + "learning_rate": 1.6649999999999998e-05, + "loss": 7.85, + "step": 111 + }, + { + "epoch": 0.010450685826257348, + "grad_norm": 4.2244368433061075, + "learning_rate": 1.68e-05, + "loss": 8.0661, + "step": 112 + }, + { + "epoch": 0.010543995521134646, + "grad_norm": 3.948558628668473, + "learning_rate": 1.695e-05, + "loss": 8.0686, + "step": 113 + }, + { + "epoch": 0.010637305216011944, + "grad_norm": 3.1571090156425927, + "learning_rate": 1.71e-05, + "loss": 7.7062, + "step": 114 + }, + { + "epoch": 0.010730614910889242, + "grad_norm": 8.038035369252375, + "learning_rate": 1.725e-05, + "loss": 7.8486, + "step": 115 + }, + { + "epoch": 0.01082392460576654, + "grad_norm": 4.643144227810235, + "learning_rate": 1.74e-05, + "loss": 7.679, + "step": 116 + }, + { + "epoch": 0.010917234300643838, + "grad_norm": 4.233750360497723, + "learning_rate": 1.755e-05, + "loss": 7.7302, + "step": 117 + }, + { + "epoch": 0.011010543995521135, + "grad_norm": 3.3488873569634094, + "learning_rate": 1.7699999999999997e-05, + "loss": 7.2937, + "step": 118 + }, + { + "epoch": 0.011103853690398433, + "grad_norm": 5.148665674164206, + "learning_rate": 1.7849999999999997e-05, + "loss": 7.7405, + "step": 119 + }, + { + "epoch": 0.01119716338527573, + "grad_norm": 5.185779585148215, + "learning_rate": 1.7999999999999997e-05, + "loss": 7.4593, + "step": 120 + }, + { + "epoch": 0.011290473080153027, + "grad_norm": 3.9149115914509416, + "learning_rate": 1.8149999999999997e-05, + "loss": 7.4392, + "step": 121 + }, + { + "epoch": 0.011383782775030325, + "grad_norm": 6.015571776838503, + "learning_rate": 1.8299999999999998e-05, + "loss": 7.7622, + "step": 122 + }, + { + "epoch": 0.011477092469907623, + "grad_norm": 3.1170806951424916, + "learning_rate": 1.8449999999999998e-05, + "loss": 7.6168, + "step": 123 + }, + { + "epoch": 0.01157040216478492, + "grad_norm": 3.55929291936924, + "learning_rate": 1.8599999999999998e-05, + "loss": 7.3728, + "step": 124 + }, + { + "epoch": 0.011663711859662218, + "grad_norm": 5.803006517161098, + "learning_rate": 1.875e-05, + "loss": 7.6946, + "step": 125 + }, + { + "epoch": 0.011757021554539516, + "grad_norm": 4.2129615695630465, + "learning_rate": 1.89e-05, + "loss": 7.7969, + "step": 126 + }, + { + "epoch": 0.011850331249416814, + "grad_norm": 3.8661269941910055, + "learning_rate": 1.905e-05, + "loss": 7.6772, + "step": 127 + }, + { + "epoch": 0.011943640944294112, + "grad_norm": 14.835382063455132, + "learning_rate": 1.92e-05, + "loss": 8.1045, + "step": 128 + }, + { + "epoch": 0.01203695063917141, + "grad_norm": 4.677204171040953, + "learning_rate": 1.935e-05, + "loss": 7.5502, + "step": 129 + }, + { + "epoch": 0.012130260334048707, + "grad_norm": 3.6410995503494963, + "learning_rate": 1.95e-05, + "loss": 7.5798, + "step": 130 + }, + { + "epoch": 0.012223570028926005, + "grad_norm": 3.3028405948870083, + "learning_rate": 1.965e-05, + "loss": 7.6924, + "step": 131 + }, + { + "epoch": 0.012316879723803303, + "grad_norm": 6.340018509080736, + "learning_rate": 1.98e-05, + "loss": 7.6865, + "step": 132 + }, + { + "epoch": 0.012410189418680601, + "grad_norm": 5.544766526579473, + "learning_rate": 1.995e-05, + "loss": 7.6555, + "step": 133 + }, + { + "epoch": 0.012503499113557899, + "grad_norm": 5.450135441498356, + "learning_rate": 2.01e-05, + "loss": 7.6028, + "step": 134 + }, + { + "epoch": 0.012596808808435197, + "grad_norm": 5.707408396145421, + "learning_rate": 2.025e-05, + "loss": 7.5575, + "step": 135 + }, + { + "epoch": 0.012690118503312494, + "grad_norm": 4.9854713723354465, + "learning_rate": 2.04e-05, + "loss": 7.561, + "step": 136 + }, + { + "epoch": 0.012783428198189792, + "grad_norm": 4.070062636664643, + "learning_rate": 2.055e-05, + "loss": 7.4858, + "step": 137 + }, + { + "epoch": 0.01287673789306709, + "grad_norm": 3.2874392878529135, + "learning_rate": 2.07e-05, + "loss": 7.6304, + "step": 138 + }, + { + "epoch": 0.012970047587944388, + "grad_norm": 4.718436518407453, + "learning_rate": 2.085e-05, + "loss": 7.4336, + "step": 139 + }, + { + "epoch": 0.013063357282821686, + "grad_norm": 7.879944027065325, + "learning_rate": 2.1e-05, + "loss": 7.844, + "step": 140 + }, + { + "epoch": 0.013156666977698983, + "grad_norm": 4.703416778191686, + "learning_rate": 2.1149999999999996e-05, + "loss": 7.6773, + "step": 141 + }, + { + "epoch": 0.013249976672576281, + "grad_norm": 4.117406916216655, + "learning_rate": 2.1299999999999996e-05, + "loss": 7.6568, + "step": 142 + }, + { + "epoch": 0.013343286367453579, + "grad_norm": 4.326671002202663, + "learning_rate": 2.1449999999999996e-05, + "loss": 7.3589, + "step": 143 + }, + { + "epoch": 0.013436596062330877, + "grad_norm": 4.225190736053071, + "learning_rate": 2.1599999999999996e-05, + "loss": 7.4144, + "step": 144 + }, + { + "epoch": 0.013529905757208175, + "grad_norm": 4.819609212653457, + "learning_rate": 2.1749999999999997e-05, + "loss": 7.3123, + "step": 145 + }, + { + "epoch": 0.013623215452085473, + "grad_norm": 2.367674711926792, + "learning_rate": 2.1899999999999997e-05, + "loss": 7.2923, + "step": 146 + }, + { + "epoch": 0.013716525146962769, + "grad_norm": 2.3966481512624087, + "learning_rate": 2.2049999999999997e-05, + "loss": 7.5923, + "step": 147 + }, + { + "epoch": 0.013809834841840066, + "grad_norm": 6.889859611251114, + "learning_rate": 2.2199999999999998e-05, + "loss": 7.4811, + "step": 148 + }, + { + "epoch": 0.013903144536717364, + "grad_norm": 2.3440196711928096, + "learning_rate": 2.2349999999999998e-05, + "loss": 7.3955, + "step": 149 + }, + { + "epoch": 0.013996454231594662, + "grad_norm": 3.911946631202622, + "learning_rate": 2.2499999999999998e-05, + "loss": 6.9465, + "step": 150 + }, + { + "epoch": 0.01408976392647196, + "grad_norm": 3.016009732624104, + "learning_rate": 2.2649999999999998e-05, + "loss": 7.6624, + "step": 151 + }, + { + "epoch": 0.014183073621349258, + "grad_norm": 2.792375413169816, + "learning_rate": 2.28e-05, + "loss": 7.4123, + "step": 152 + }, + { + "epoch": 0.014276383316226555, + "grad_norm": 4.317185197605814, + "learning_rate": 2.295e-05, + "loss": 7.4018, + "step": 153 + }, + { + "epoch": 0.014369693011103853, + "grad_norm": 3.4364559638887027, + "learning_rate": 2.31e-05, + "loss": 7.5242, + "step": 154 + }, + { + "epoch": 0.014463002705981151, + "grad_norm": 5.282053992041776, + "learning_rate": 2.325e-05, + "loss": 7.3955, + "step": 155 + }, + { + "epoch": 0.014556312400858449, + "grad_norm": 20.15277765985429, + "learning_rate": 2.34e-05, + "loss": 7.0172, + "step": 156 + }, + { + "epoch": 0.014649622095735747, + "grad_norm": 4.718333312182343, + "learning_rate": 2.3549999999999996e-05, + "loss": 7.4035, + "step": 157 + }, + { + "epoch": 0.014742931790613045, + "grad_norm": 3.70549238312693, + "learning_rate": 2.3699999999999997e-05, + "loss": 6.8672, + "step": 158 + }, + { + "epoch": 0.014836241485490342, + "grad_norm": 2.7394535711118073, + "learning_rate": 2.3849999999999997e-05, + "loss": 7.4794, + "step": 159 + }, + { + "epoch": 0.01492955118036764, + "grad_norm": 2.7331577234681057, + "learning_rate": 2.3999999999999997e-05, + "loss": 7.2183, + "step": 160 + }, + { + "epoch": 0.015022860875244938, + "grad_norm": 3.3059758037133955, + "learning_rate": 2.4149999999999997e-05, + "loss": 7.2642, + "step": 161 + }, + { + "epoch": 0.015116170570122236, + "grad_norm": 2.916621932335848, + "learning_rate": 2.4299999999999998e-05, + "loss": 7.2231, + "step": 162 + }, + { + "epoch": 0.015209480264999534, + "grad_norm": 2.384881858947198, + "learning_rate": 2.4449999999999998e-05, + "loss": 7.2977, + "step": 163 + }, + { + "epoch": 0.015302789959876831, + "grad_norm": 2.7048174020599545, + "learning_rate": 2.4599999999999998e-05, + "loss": 7.276, + "step": 164 + }, + { + "epoch": 0.01539609965475413, + "grad_norm": 3.6522763236924485, + "learning_rate": 2.475e-05, + "loss": 6.8073, + "step": 165 + }, + { + "epoch": 0.015489409349631427, + "grad_norm": 9.088202375627258, + "learning_rate": 2.49e-05, + "loss": 7.9549, + "step": 166 + }, + { + "epoch": 0.015582719044508725, + "grad_norm": 3.302836285369822, + "learning_rate": 2.505e-05, + "loss": 7.4871, + "step": 167 + }, + { + "epoch": 0.015676028739386023, + "grad_norm": 2.728496347145704, + "learning_rate": 2.52e-05, + "loss": 7.1096, + "step": 168 + }, + { + "epoch": 0.01576933843426332, + "grad_norm": 5.994061347143837, + "learning_rate": 2.535e-05, + "loss": 7.2635, + "step": 169 + }, + { + "epoch": 0.01586264812914062, + "grad_norm": 2.76917824501617, + "learning_rate": 2.55e-05, + "loss": 7.5564, + "step": 170 + }, + { + "epoch": 0.015955957824017916, + "grad_norm": 2.8293139361451463, + "learning_rate": 2.565e-05, + "loss": 7.0417, + "step": 171 + }, + { + "epoch": 0.016049267518895214, + "grad_norm": 5.422568108853521, + "learning_rate": 2.5799999999999997e-05, + "loss": 7.2922, + "step": 172 + }, + { + "epoch": 0.016142577213772512, + "grad_norm": 4.1893209937216715, + "learning_rate": 2.5949999999999997e-05, + "loss": 7.9673, + "step": 173 + }, + { + "epoch": 0.01623588690864981, + "grad_norm": 2.8935894298741416, + "learning_rate": 2.6099999999999997e-05, + "loss": 7.2703, + "step": 174 + }, + { + "epoch": 0.016329196603527107, + "grad_norm": 3.8054636367698604, + "learning_rate": 2.6249999999999998e-05, + "loss": 7.4039, + "step": 175 + }, + { + "epoch": 0.016422506298404405, + "grad_norm": 2.8685813637731084, + "learning_rate": 2.6399999999999995e-05, + "loss": 7.192, + "step": 176 + }, + { + "epoch": 0.016515815993281703, + "grad_norm": 3.3976796922993895, + "learning_rate": 2.6549999999999995e-05, + "loss": 7.9884, + "step": 177 + }, + { + "epoch": 0.016609125688159, + "grad_norm": 8.931447067080795, + "learning_rate": 2.6699999999999995e-05, + "loss": 6.9554, + "step": 178 + }, + { + "epoch": 0.0167024353830363, + "grad_norm": 4.802337264394083, + "learning_rate": 2.6849999999999995e-05, + "loss": 7.1861, + "step": 179 + }, + { + "epoch": 0.016795745077913597, + "grad_norm": 3.188882480616817, + "learning_rate": 2.6999999999999996e-05, + "loss": 7.2031, + "step": 180 + }, + { + "epoch": 0.016889054772790894, + "grad_norm": 3.7081894210813195, + "learning_rate": 2.7149999999999996e-05, + "loss": 6.8983, + "step": 181 + }, + { + "epoch": 0.016982364467668192, + "grad_norm": 3.561905256294169, + "learning_rate": 2.7299999999999996e-05, + "loss": 7.0777, + "step": 182 + }, + { + "epoch": 0.01707567416254549, + "grad_norm": 2.6026402651462868, + "learning_rate": 2.7449999999999996e-05, + "loss": 7.1559, + "step": 183 + }, + { + "epoch": 0.017168983857422788, + "grad_norm": 2.6276163710256935, + "learning_rate": 2.7599999999999997e-05, + "loss": 7.5201, + "step": 184 + }, + { + "epoch": 0.017262293552300086, + "grad_norm": 2.641794724511697, + "learning_rate": 2.7749999999999997e-05, + "loss": 7.3678, + "step": 185 + }, + { + "epoch": 0.017355603247177383, + "grad_norm": 2.5623426198411394, + "learning_rate": 2.7899999999999997e-05, + "loss": 7.1622, + "step": 186 + }, + { + "epoch": 0.017448912942054678, + "grad_norm": 2.4606462306194796, + "learning_rate": 2.8049999999999997e-05, + "loss": 7.1453, + "step": 187 + }, + { + "epoch": 0.017542222636931976, + "grad_norm": 2.778624142308761, + "learning_rate": 2.8199999999999998e-05, + "loss": 7.0597, + "step": 188 + }, + { + "epoch": 0.017635532331809273, + "grad_norm": 2.717665663846328, + "learning_rate": 2.8349999999999998e-05, + "loss": 6.8793, + "step": 189 + }, + { + "epoch": 0.01772884202668657, + "grad_norm": 2.743813314925308, + "learning_rate": 2.8499999999999998e-05, + "loss": 7.24, + "step": 190 + }, + { + "epoch": 0.01782215172156387, + "grad_norm": 3.0109127396625293, + "learning_rate": 2.865e-05, + "loss": 6.9549, + "step": 191 + }, + { + "epoch": 0.017915461416441167, + "grad_norm": 2.1132327784363536, + "learning_rate": 2.88e-05, + "loss": 7.178, + "step": 192 + }, + { + "epoch": 0.018008771111318465, + "grad_norm": 3.6235238848936406, + "learning_rate": 2.895e-05, + "loss": 7.0309, + "step": 193 + }, + { + "epoch": 0.018102080806195762, + "grad_norm": 2.1590555774434663, + "learning_rate": 2.91e-05, + "loss": 7.0727, + "step": 194 + }, + { + "epoch": 0.01819539050107306, + "grad_norm": 2.0429794542785347, + "learning_rate": 2.925e-05, + "loss": 7.0757, + "step": 195 + }, + { + "epoch": 0.018288700195950358, + "grad_norm": 3.4930837869492684, + "learning_rate": 2.94e-05, + "loss": 7.1273, + "step": 196 + }, + { + "epoch": 0.018382009890827656, + "grad_norm": 2.642428443227357, + "learning_rate": 2.955e-05, + "loss": 7.3142, + "step": 197 + }, + { + "epoch": 0.018475319585704954, + "grad_norm": 4.012832464731586, + "learning_rate": 2.97e-05, + "loss": 7.1009, + "step": 198 + }, + { + "epoch": 0.01856862928058225, + "grad_norm": 2.0982462382090326, + "learning_rate": 2.985e-05, + "loss": 7.3238, + "step": 199 + }, + { + "epoch": 0.01866193897545955, + "grad_norm": 3.574750245158072, + "learning_rate": 2.9999999999999997e-05, + "loss": 7.0674, + "step": 200 + }, + { + "epoch": 0.018755248670336847, + "grad_norm": 1.7506516879882215, + "learning_rate": 3.0149999999999998e-05, + "loss": 7.1385, + "step": 201 + }, + { + "epoch": 0.018848558365214145, + "grad_norm": 1.814651877892708, + "learning_rate": 3.0299999999999998e-05, + "loss": 7.1536, + "step": 202 + }, + { + "epoch": 0.018941868060091443, + "grad_norm": 2.123258063816407, + "learning_rate": 3.0449999999999998e-05, + "loss": 6.9449, + "step": 203 + }, + { + "epoch": 0.01903517775496874, + "grad_norm": 4.72529867160204, + "learning_rate": 3.06e-05, + "loss": 7.4031, + "step": 204 + }, + { + "epoch": 0.01912848744984604, + "grad_norm": 3.164318311347015, + "learning_rate": 3.0749999999999995e-05, + "loss": 7.0539, + "step": 205 + }, + { + "epoch": 0.019221797144723336, + "grad_norm": 1.985013025550353, + "learning_rate": 3.09e-05, + "loss": 7.0807, + "step": 206 + }, + { + "epoch": 0.019315106839600634, + "grad_norm": 1.9495347073866955, + "learning_rate": 3.1049999999999996e-05, + "loss": 7.0507, + "step": 207 + }, + { + "epoch": 0.019408416534477932, + "grad_norm": 1.9247385793042622, + "learning_rate": 3.119999999999999e-05, + "loss": 6.7912, + "step": 208 + }, + { + "epoch": 0.01950172622935523, + "grad_norm": 1.9181839775885599, + "learning_rate": 3.1349999999999996e-05, + "loss": 6.804, + "step": 209 + }, + { + "epoch": 0.019595035924232528, + "grad_norm": 1.9023229971231608, + "learning_rate": 3.149999999999999e-05, + "loss": 6.9396, + "step": 210 + }, + { + "epoch": 0.019688345619109825, + "grad_norm": 2.2411768976996362, + "learning_rate": 3.165e-05, + "loss": 7.1747, + "step": 211 + }, + { + "epoch": 0.019781655313987123, + "grad_norm": 2.2995575378864936, + "learning_rate": 3.1799999999999994e-05, + "loss": 7.3781, + "step": 212 + }, + { + "epoch": 0.01987496500886442, + "grad_norm": 2.929779945554089, + "learning_rate": 3.195e-05, + "loss": 7.1723, + "step": 213 + }, + { + "epoch": 0.01996827470374172, + "grad_norm": 2.4138476632968864, + "learning_rate": 3.2099999999999994e-05, + "loss": 7.0351, + "step": 214 + }, + { + "epoch": 0.020061584398619017, + "grad_norm": 2.8431504276939283, + "learning_rate": 3.225e-05, + "loss": 7.4947, + "step": 215 + }, + { + "epoch": 0.020154894093496314, + "grad_norm": 3.920453395341559, + "learning_rate": 3.2399999999999995e-05, + "loss": 6.9284, + "step": 216 + }, + { + "epoch": 0.020248203788373612, + "grad_norm": 2.36676815741544, + "learning_rate": 3.255e-05, + "loss": 7.2348, + "step": 217 + }, + { + "epoch": 0.02034151348325091, + "grad_norm": 2.415367463880595, + "learning_rate": 3.2699999999999995e-05, + "loss": 7.0518, + "step": 218 + }, + { + "epoch": 0.020434823178128208, + "grad_norm": 3.295643693461853, + "learning_rate": 3.285e-05, + "loss": 6.9691, + "step": 219 + }, + { + "epoch": 0.020528132873005506, + "grad_norm": 2.0148549171253403, + "learning_rate": 3.2999999999999996e-05, + "loss": 7.2372, + "step": 220 + }, + { + "epoch": 0.020621442567882804, + "grad_norm": 2.398294971475451, + "learning_rate": 3.315e-05, + "loss": 6.8075, + "step": 221 + }, + { + "epoch": 0.0207147522627601, + "grad_norm": 2.653116063657824, + "learning_rate": 3.3299999999999996e-05, + "loss": 6.9841, + "step": 222 + }, + { + "epoch": 0.0208080619576374, + "grad_norm": 2.5317135410825955, + "learning_rate": 3.345e-05, + "loss": 6.9453, + "step": 223 + }, + { + "epoch": 0.020901371652514697, + "grad_norm": 3.6502117039945947, + "learning_rate": 3.36e-05, + "loss": 7.4429, + "step": 224 + }, + { + "epoch": 0.020994681347391995, + "grad_norm": 2.8012535957638396, + "learning_rate": 3.375e-05, + "loss": 7.0072, + "step": 225 + }, + { + "epoch": 0.021087991042269293, + "grad_norm": 2.2869289983789183, + "learning_rate": 3.39e-05, + "loss": 6.781, + "step": 226 + }, + { + "epoch": 0.02118130073714659, + "grad_norm": 2.2118751296445582, + "learning_rate": 3.405e-05, + "loss": 7.0965, + "step": 227 + }, + { + "epoch": 0.021274610432023888, + "grad_norm": 2.6155209435865565, + "learning_rate": 3.42e-05, + "loss": 7.0541, + "step": 228 + }, + { + "epoch": 0.021367920126901186, + "grad_norm": 2.998110788815042, + "learning_rate": 3.435e-05, + "loss": 6.5346, + "step": 229 + }, + { + "epoch": 0.021461229821778484, + "grad_norm": 2.7632568392006123, + "learning_rate": 3.45e-05, + "loss": 6.7837, + "step": 230 + }, + { + "epoch": 0.02155453951665578, + "grad_norm": 1.9014141889838017, + "learning_rate": 3.465e-05, + "loss": 6.8603, + "step": 231 + }, + { + "epoch": 0.02164784921153308, + "grad_norm": 3.3588611089963107, + "learning_rate": 3.48e-05, + "loss": 7.2635, + "step": 232 + }, + { + "epoch": 0.021741158906410377, + "grad_norm": 3.5963366185340484, + "learning_rate": 3.4949999999999996e-05, + "loss": 7.1083, + "step": 233 + }, + { + "epoch": 0.021834468601287675, + "grad_norm": 2.1336689178446138, + "learning_rate": 3.51e-05, + "loss": 7.1983, + "step": 234 + }, + { + "epoch": 0.021927778296164973, + "grad_norm": 2.9605241277839545, + "learning_rate": 3.5249999999999996e-05, + "loss": 6.8923, + "step": 235 + }, + { + "epoch": 0.02202108799104227, + "grad_norm": 2.5824484518372715, + "learning_rate": 3.539999999999999e-05, + "loss": 7.0754, + "step": 236 + }, + { + "epoch": 0.02211439768591957, + "grad_norm": 2.1485393820173533, + "learning_rate": 3.555e-05, + "loss": 6.9909, + "step": 237 + }, + { + "epoch": 0.022207707380796866, + "grad_norm": 3.5799684263959035, + "learning_rate": 3.5699999999999994e-05, + "loss": 7.0826, + "step": 238 + }, + { + "epoch": 0.022301017075674164, + "grad_norm": 2.6165370371247216, + "learning_rate": 3.585e-05, + "loss": 7.0195, + "step": 239 + }, + { + "epoch": 0.02239432677055146, + "grad_norm": 2.2164799196528855, + "learning_rate": 3.5999999999999994e-05, + "loss": 6.6222, + "step": 240 + }, + { + "epoch": 0.022487636465428756, + "grad_norm": 1.944506656209241, + "learning_rate": 3.615e-05, + "loss": 6.6194, + "step": 241 + }, + { + "epoch": 0.022580946160306054, + "grad_norm": 2.8363603845628202, + "learning_rate": 3.6299999999999995e-05, + "loss": 7.1579, + "step": 242 + }, + { + "epoch": 0.022674255855183352, + "grad_norm": 2.145481507440039, + "learning_rate": 3.645e-05, + "loss": 6.9946, + "step": 243 + }, + { + "epoch": 0.02276756555006065, + "grad_norm": 2.2760937677648845, + "learning_rate": 3.6599999999999995e-05, + "loss": 7.1975, + "step": 244 + }, + { + "epoch": 0.022860875244937948, + "grad_norm": 2.982013615694382, + "learning_rate": 3.675e-05, + "loss": 6.8555, + "step": 245 + }, + { + "epoch": 0.022954184939815245, + "grad_norm": 2.698505327492211, + "learning_rate": 3.6899999999999996e-05, + "loss": 7.058, + "step": 246 + }, + { + "epoch": 0.023047494634692543, + "grad_norm": 2.321088670517965, + "learning_rate": 3.705e-05, + "loss": 7.2093, + "step": 247 + }, + { + "epoch": 0.02314080432956984, + "grad_norm": 2.1244625612917427, + "learning_rate": 3.7199999999999996e-05, + "loss": 6.7406, + "step": 248 + }, + { + "epoch": 0.02323411402444714, + "grad_norm": 8.246620278997872, + "learning_rate": 3.735e-05, + "loss": 6.8494, + "step": 249 + }, + { + "epoch": 0.023327423719324437, + "grad_norm": 2.3015537110263904, + "learning_rate": 3.75e-05, + "loss": 7.1768, + "step": 250 + }, + { + "epoch": 0.023420733414201735, + "grad_norm": 3.063808696325007, + "learning_rate": 3.7649999999999994e-05, + "loss": 7.1611, + "step": 251 + }, + { + "epoch": 0.023514043109079032, + "grad_norm": 3.5575348222554406, + "learning_rate": 3.78e-05, + "loss": 7.0611, + "step": 252 + }, + { + "epoch": 0.02360735280395633, + "grad_norm": 2.7253325396754673, + "learning_rate": 3.7949999999999994e-05, + "loss": 7.2871, + "step": 253 + }, + { + "epoch": 0.023700662498833628, + "grad_norm": 2.531277593540482, + "learning_rate": 3.81e-05, + "loss": 6.843, + "step": 254 + }, + { + "epoch": 0.023793972193710926, + "grad_norm": 1.9950543479182865, + "learning_rate": 3.8249999999999995e-05, + "loss": 6.5042, + "step": 255 + }, + { + "epoch": 0.023887281888588224, + "grad_norm": 2.7855058938153276, + "learning_rate": 3.84e-05, + "loss": 7.0203, + "step": 256 + }, + { + "epoch": 0.02398059158346552, + "grad_norm": 2.3721958251540767, + "learning_rate": 3.8549999999999995e-05, + "loss": 6.9375, + "step": 257 + }, + { + "epoch": 0.02407390127834282, + "grad_norm": 2.2351365778435905, + "learning_rate": 3.87e-05, + "loss": 6.6671, + "step": 258 + }, + { + "epoch": 0.024167210973220117, + "grad_norm": 2.4764231809046446, + "learning_rate": 3.8849999999999996e-05, + "loss": 6.6639, + "step": 259 + }, + { + "epoch": 0.024260520668097415, + "grad_norm": 2.8802772464582835, + "learning_rate": 3.9e-05, + "loss": 6.856, + "step": 260 + }, + { + "epoch": 0.024353830362974713, + "grad_norm": 2.3258605423228995, + "learning_rate": 3.9149999999999996e-05, + "loss": 7.0232, + "step": 261 + }, + { + "epoch": 0.02444714005785201, + "grad_norm": 2.8882770836150105, + "learning_rate": 3.93e-05, + "loss": 6.9187, + "step": 262 + }, + { + "epoch": 0.02454044975272931, + "grad_norm": 1.7347349401258199, + "learning_rate": 3.945e-05, + "loss": 6.9874, + "step": 263 + }, + { + "epoch": 0.024633759447606606, + "grad_norm": 4.338942065102009, + "learning_rate": 3.96e-05, + "loss": 7.1224, + "step": 264 + }, + { + "epoch": 0.024727069142483904, + "grad_norm": 4.771897969140107, + "learning_rate": 3.975e-05, + "loss": 6.9715, + "step": 265 + }, + { + "epoch": 0.024820378837361202, + "grad_norm": 4.901360972453769, + "learning_rate": 3.99e-05, + "loss": 6.9423, + "step": 266 + }, + { + "epoch": 0.0249136885322385, + "grad_norm": 2.0422540889871126, + "learning_rate": 4.005e-05, + "loss": 6.9266, + "step": 267 + }, + { + "epoch": 0.025006998227115797, + "grad_norm": 5.634320027355875, + "learning_rate": 4.02e-05, + "loss": 7.0669, + "step": 268 + }, + { + "epoch": 0.025100307921993095, + "grad_norm": 2.9300153933428494, + "learning_rate": 4.035e-05, + "loss": 7.0546, + "step": 269 + }, + { + "epoch": 0.025193617616870393, + "grad_norm": 3.886106780551248, + "learning_rate": 4.05e-05, + "loss": 6.6174, + "step": 270 + }, + { + "epoch": 0.02528692731174769, + "grad_norm": 2.7597145085108705, + "learning_rate": 4.065e-05, + "loss": 6.6251, + "step": 271 + }, + { + "epoch": 0.02538023700662499, + "grad_norm": 1.7069966099102916, + "learning_rate": 4.08e-05, + "loss": 6.7354, + "step": 272 + }, + { + "epoch": 0.025473546701502287, + "grad_norm": 3.5635967710955683, + "learning_rate": 4.095e-05, + "loss": 7.021, + "step": 273 + }, + { + "epoch": 0.025566856396379584, + "grad_norm": 2.663939142899624, + "learning_rate": 4.11e-05, + "loss": 6.6579, + "step": 274 + }, + { + "epoch": 0.025660166091256882, + "grad_norm": 4.142556504105736, + "learning_rate": 4.125e-05, + "loss": 7.1126, + "step": 275 + }, + { + "epoch": 0.02575347578613418, + "grad_norm": 4.29610058437747, + "learning_rate": 4.14e-05, + "loss": 6.8175, + "step": 276 + }, + { + "epoch": 0.025846785481011478, + "grad_norm": 2.332541617651578, + "learning_rate": 4.155e-05, + "loss": 6.5559, + "step": 277 + }, + { + "epoch": 0.025940095175888776, + "grad_norm": 1.9100755867945796, + "learning_rate": 4.17e-05, + "loss": 6.9011, + "step": 278 + }, + { + "epoch": 0.026033404870766073, + "grad_norm": 1.8971899435045787, + "learning_rate": 4.185e-05, + "loss": 6.8355, + "step": 279 + }, + { + "epoch": 0.02612671456564337, + "grad_norm": 3.8993481182422, + "learning_rate": 4.2e-05, + "loss": 6.6943, + "step": 280 + }, + { + "epoch": 0.02622002426052067, + "grad_norm": 3.4019488827442514, + "learning_rate": 4.215e-05, + "loss": 6.6957, + "step": 281 + }, + { + "epoch": 0.026313333955397967, + "grad_norm": 2.1146807370091207, + "learning_rate": 4.229999999999999e-05, + "loss": 6.9012, + "step": 282 + }, + { + "epoch": 0.026406643650275265, + "grad_norm": 2.097624636154274, + "learning_rate": 4.2449999999999995e-05, + "loss": 6.7156, + "step": 283 + }, + { + "epoch": 0.026499953345152562, + "grad_norm": 1.7212296758453676, + "learning_rate": 4.259999999999999e-05, + "loss": 6.9508, + "step": 284 + }, + { + "epoch": 0.02659326304002986, + "grad_norm": 2.452514495622238, + "learning_rate": 4.2749999999999996e-05, + "loss": 6.8343, + "step": 285 + }, + { + "epoch": 0.026686572734907158, + "grad_norm": 3.1427679899583767, + "learning_rate": 4.289999999999999e-05, + "loss": 6.9751, + "step": 286 + }, + { + "epoch": 0.026779882429784456, + "grad_norm": 2.3799253084430196, + "learning_rate": 4.3049999999999996e-05, + "loss": 6.9765, + "step": 287 + }, + { + "epoch": 0.026873192124661754, + "grad_norm": 2.2532279460133933, + "learning_rate": 4.319999999999999e-05, + "loss": 6.8433, + "step": 288 + }, + { + "epoch": 0.02696650181953905, + "grad_norm": 2.5829649164790247, + "learning_rate": 4.334999999999999e-05, + "loss": 6.9877, + "step": 289 + }, + { + "epoch": 0.02705981151441635, + "grad_norm": 5.519387594923731, + "learning_rate": 4.3499999999999993e-05, + "loss": 6.6403, + "step": 290 + }, + { + "epoch": 0.027153121209293647, + "grad_norm": 2.9979889475292527, + "learning_rate": 4.364999999999999e-05, + "loss": 6.8204, + "step": 291 + }, + { + "epoch": 0.027246430904170945, + "grad_norm": 3.2516056927235644, + "learning_rate": 4.3799999999999994e-05, + "loss": 7.1223, + "step": 292 + }, + { + "epoch": 0.027339740599048243, + "grad_norm": 1.894354807906214, + "learning_rate": 4.394999999999999e-05, + "loss": 7.0515, + "step": 293 + }, + { + "epoch": 0.027433050293925537, + "grad_norm": 3.439455233923546, + "learning_rate": 4.4099999999999995e-05, + "loss": 7.0158, + "step": 294 + }, + { + "epoch": 0.027526359988802835, + "grad_norm": 1.7839613321713998, + "learning_rate": 4.424999999999999e-05, + "loss": 6.4484, + "step": 295 + }, + { + "epoch": 0.027619669683680133, + "grad_norm": 2.3208679465848694, + "learning_rate": 4.4399999999999995e-05, + "loss": 6.9463, + "step": 296 + }, + { + "epoch": 0.02771297937855743, + "grad_norm": 1.987394286033976, + "learning_rate": 4.454999999999999e-05, + "loss": 7.1663, + "step": 297 + }, + { + "epoch": 0.02780628907343473, + "grad_norm": 2.18667380924321, + "learning_rate": 4.4699999999999996e-05, + "loss": 6.5976, + "step": 298 + }, + { + "epoch": 0.027899598768312026, + "grad_norm": 4.246928656039885, + "learning_rate": 4.484999999999999e-05, + "loss": 6.9526, + "step": 299 + }, + { + "epoch": 0.027992908463189324, + "grad_norm": 2.3304168975199637, + "learning_rate": 4.4999999999999996e-05, + "loss": 6.7993, + "step": 300 + }, + { + "epoch": 0.028086218158066622, + "grad_norm": 2.7044862787327126, + "learning_rate": 4.514999999999999e-05, + "loss": 6.6207, + "step": 301 + }, + { + "epoch": 0.02817952785294392, + "grad_norm": 2.9144351283477308, + "learning_rate": 4.5299999999999997e-05, + "loss": 6.7866, + "step": 302 + }, + { + "epoch": 0.028272837547821218, + "grad_norm": 2.011443384933232, + "learning_rate": 4.5449999999999993e-05, + "loss": 6.5413, + "step": 303 + }, + { + "epoch": 0.028366147242698515, + "grad_norm": 3.12303803657147, + "learning_rate": 4.56e-05, + "loss": 6.4639, + "step": 304 + }, + { + "epoch": 0.028459456937575813, + "grad_norm": 4.135549347901571, + "learning_rate": 4.5749999999999994e-05, + "loss": 6.9701, + "step": 305 + }, + { + "epoch": 0.02855276663245311, + "grad_norm": 2.378697883872549, + "learning_rate": 4.59e-05, + "loss": 6.8141, + "step": 306 + }, + { + "epoch": 0.02864607632733041, + "grad_norm": 3.573729201744551, + "learning_rate": 4.6049999999999994e-05, + "loss": 6.837, + "step": 307 + }, + { + "epoch": 0.028739386022207707, + "grad_norm": 2.3581753085429145, + "learning_rate": 4.62e-05, + "loss": 7.0484, + "step": 308 + }, + { + "epoch": 0.028832695717085004, + "grad_norm": 1.8530350138897498, + "learning_rate": 4.6349999999999995e-05, + "loss": 6.9525, + "step": 309 + }, + { + "epoch": 0.028926005411962302, + "grad_norm": 1.652701842103095, + "learning_rate": 4.65e-05, + "loss": 7.0121, + "step": 310 + }, + { + "epoch": 0.0290193151068396, + "grad_norm": 1.6247099771208149, + "learning_rate": 4.6649999999999996e-05, + "loss": 6.6929, + "step": 311 + }, + { + "epoch": 0.029112624801716898, + "grad_norm": 4.75002060252995, + "learning_rate": 4.68e-05, + "loss": 6.7543, + "step": 312 + }, + { + "epoch": 0.029205934496594196, + "grad_norm": 1.619648075762717, + "learning_rate": 4.6949999999999996e-05, + "loss": 6.7422, + "step": 313 + }, + { + "epoch": 0.029299244191471494, + "grad_norm": 1.9451664269380826, + "learning_rate": 4.709999999999999e-05, + "loss": 7.1112, + "step": 314 + }, + { + "epoch": 0.02939255388634879, + "grad_norm": 2.5162083320906943, + "learning_rate": 4.7249999999999997e-05, + "loss": 6.764, + "step": 315 + }, + { + "epoch": 0.02948586358122609, + "grad_norm": 1.8507644970914505, + "learning_rate": 4.7399999999999993e-05, + "loss": 6.9892, + "step": 316 + }, + { + "epoch": 0.029579173276103387, + "grad_norm": 4.098041023557062, + "learning_rate": 4.755e-05, + "loss": 6.8884, + "step": 317 + }, + { + "epoch": 0.029672482970980685, + "grad_norm": 2.6019074553225203, + "learning_rate": 4.7699999999999994e-05, + "loss": 6.7606, + "step": 318 + }, + { + "epoch": 0.029765792665857983, + "grad_norm": 2.1189162366725482, + "learning_rate": 4.785e-05, + "loss": 6.8806, + "step": 319 + }, + { + "epoch": 0.02985910236073528, + "grad_norm": 2.7450617411550793, + "learning_rate": 4.7999999999999994e-05, + "loss": 6.8619, + "step": 320 + }, + { + "epoch": 0.029952412055612578, + "grad_norm": 4.90602785632482, + "learning_rate": 4.815e-05, + "loss": 6.6038, + "step": 321 + }, + { + "epoch": 0.030045721750489876, + "grad_norm": 2.4689190121117015, + "learning_rate": 4.8299999999999995e-05, + "loss": 7.1821, + "step": 322 + }, + { + "epoch": 0.030139031445367174, + "grad_norm": 3.5375655713993224, + "learning_rate": 4.845e-05, + "loss": 7.0018, + "step": 323 + }, + { + "epoch": 0.03023234114024447, + "grad_norm": 1.7242325304764248, + "learning_rate": 4.8599999999999995e-05, + "loss": 6.583, + "step": 324 + }, + { + "epoch": 0.03032565083512177, + "grad_norm": 1.7459676469468937, + "learning_rate": 4.875e-05, + "loss": 6.9033, + "step": 325 + }, + { + "epoch": 0.030418960529999067, + "grad_norm": 1.793430212664617, + "learning_rate": 4.8899999999999996e-05, + "loss": 6.6752, + "step": 326 + }, + { + "epoch": 0.030512270224876365, + "grad_norm": 1.855007065816703, + "learning_rate": 4.905e-05, + "loss": 6.9869, + "step": 327 + }, + { + "epoch": 0.030605579919753663, + "grad_norm": 2.31248351464449, + "learning_rate": 4.9199999999999997e-05, + "loss": 6.8709, + "step": 328 + }, + { + "epoch": 0.03069888961463096, + "grad_norm": 2.5102537382654595, + "learning_rate": 4.935e-05, + "loss": 6.4722, + "step": 329 + }, + { + "epoch": 0.03079219930950826, + "grad_norm": 2.4689772055745682, + "learning_rate": 4.95e-05, + "loss": 6.9859, + "step": 330 + }, + { + "epoch": 0.030885509004385556, + "grad_norm": 2.4118755493931694, + "learning_rate": 4.965e-05, + "loss": 6.7944, + "step": 331 + }, + { + "epoch": 0.030978818699262854, + "grad_norm": 3.4467728284362718, + "learning_rate": 4.98e-05, + "loss": 6.8915, + "step": 332 + }, + { + "epoch": 0.031072128394140152, + "grad_norm": 3.956650633918563, + "learning_rate": 4.995e-05, + "loss": 6.7834, + "step": 333 + }, + { + "epoch": 0.03116543808901745, + "grad_norm": 16.818262923509092, + "learning_rate": 5.01e-05, + "loss": 6.7589, + "step": 334 + }, + { + "epoch": 0.031258747783894744, + "grad_norm": 4.072019802244985, + "learning_rate": 5.025e-05, + "loss": 6.8555, + "step": 335 + }, + { + "epoch": 0.031352057478772045, + "grad_norm": 2.1524046962949566, + "learning_rate": 5.04e-05, + "loss": 6.6517, + "step": 336 + }, + { + "epoch": 0.03144536717364934, + "grad_norm": 3.8302092577734634, + "learning_rate": 5.055e-05, + "loss": 7.0356, + "step": 337 + }, + { + "epoch": 0.03153867686852664, + "grad_norm": 3.048258377795021, + "learning_rate": 5.07e-05, + "loss": 6.7292, + "step": 338 + }, + { + "epoch": 0.031631986563403935, + "grad_norm": 2.5706186282644388, + "learning_rate": 5.0849999999999996e-05, + "loss": 6.9759, + "step": 339 + }, + { + "epoch": 0.03172529625828124, + "grad_norm": 1.9540628734533232, + "learning_rate": 5.1e-05, + "loss": 6.4456, + "step": 340 + }, + { + "epoch": 0.03181860595315853, + "grad_norm": 2.601461047859829, + "learning_rate": 5.1149999999999996e-05, + "loss": 7.1181, + "step": 341 + }, + { + "epoch": 0.03191191564803583, + "grad_norm": 1.9371085737113005, + "learning_rate": 5.13e-05, + "loss": 6.5343, + "step": 342 + }, + { + "epoch": 0.03200522534291313, + "grad_norm": 3.042934466709292, + "learning_rate": 5.145e-05, + "loss": 6.9885, + "step": 343 + }, + { + "epoch": 0.03209853503779043, + "grad_norm": 2.0442211708812477, + "learning_rate": 5.1599999999999994e-05, + "loss": 6.8809, + "step": 344 + }, + { + "epoch": 0.03219184473266772, + "grad_norm": 2.361794304120133, + "learning_rate": 5.174999999999999e-05, + "loss": 7.0135, + "step": 345 + }, + { + "epoch": 0.032285154427545024, + "grad_norm": 2.5314040641568214, + "learning_rate": 5.1899999999999994e-05, + "loss": 6.9162, + "step": 346 + }, + { + "epoch": 0.03237846412242232, + "grad_norm": 1.9032504237629242, + "learning_rate": 5.204999999999999e-05, + "loss": 6.8525, + "step": 347 + }, + { + "epoch": 0.03247177381729962, + "grad_norm": 2.19205021330874, + "learning_rate": 5.2199999999999995e-05, + "loss": 6.9526, + "step": 348 + }, + { + "epoch": 0.032565083512176914, + "grad_norm": 1.8500466689296424, + "learning_rate": 5.234999999999999e-05, + "loss": 6.5604, + "step": 349 + }, + { + "epoch": 0.032658393207054215, + "grad_norm": 2.6697110383403633, + "learning_rate": 5.2499999999999995e-05, + "loss": 6.8725, + "step": 350 + }, + { + "epoch": 0.03275170290193151, + "grad_norm": 3.907148915985684, + "learning_rate": 5.264999999999999e-05, + "loss": 6.9152, + "step": 351 + }, + { + "epoch": 0.03284501259680881, + "grad_norm": 1.8020944951307882, + "learning_rate": 5.279999999999999e-05, + "loss": 6.7976, + "step": 352 + }, + { + "epoch": 0.032938322291686105, + "grad_norm": 1.5681329400184183, + "learning_rate": 5.294999999999999e-05, + "loss": 6.9273, + "step": 353 + }, + { + "epoch": 0.033031631986563406, + "grad_norm": 1.8513924600969476, + "learning_rate": 5.309999999999999e-05, + "loss": 6.6117, + "step": 354 + }, + { + "epoch": 0.0331249416814407, + "grad_norm": 1.6393516111930533, + "learning_rate": 5.324999999999999e-05, + "loss": 6.6243, + "step": 355 + }, + { + "epoch": 0.033218251376318, + "grad_norm": 2.142437907668519, + "learning_rate": 5.339999999999999e-05, + "loss": 6.7379, + "step": 356 + }, + { + "epoch": 0.033311561071195296, + "grad_norm": 3.3468252946857238, + "learning_rate": 5.3549999999999994e-05, + "loss": 7.0563, + "step": 357 + }, + { + "epoch": 0.0334048707660726, + "grad_norm": 1.7717778796489365, + "learning_rate": 5.369999999999999e-05, + "loss": 6.9711, + "step": 358 + }, + { + "epoch": 0.03349818046094989, + "grad_norm": 2.3208265418661322, + "learning_rate": 5.3849999999999994e-05, + "loss": 7.0828, + "step": 359 + }, + { + "epoch": 0.03359149015582719, + "grad_norm": 2.969668649120806, + "learning_rate": 5.399999999999999e-05, + "loss": 6.7383, + "step": 360 + }, + { + "epoch": 0.03368479985070449, + "grad_norm": 2.0590633169781514, + "learning_rate": 5.4149999999999995e-05, + "loss": 6.975, + "step": 361 + }, + { + "epoch": 0.03377810954558179, + "grad_norm": 2.4010916561348328, + "learning_rate": 5.429999999999999e-05, + "loss": 6.8061, + "step": 362 + }, + { + "epoch": 0.03387141924045908, + "grad_norm": 2.7440354773067313, + "learning_rate": 5.4449999999999995e-05, + "loss": 6.8446, + "step": 363 + }, + { + "epoch": 0.033964728935336384, + "grad_norm": 4.4187869916068525, + "learning_rate": 5.459999999999999e-05, + "loss": 6.5958, + "step": 364 + }, + { + "epoch": 0.03405803863021368, + "grad_norm": 1.8263026446303645, + "learning_rate": 5.4749999999999996e-05, + "loss": 6.9217, + "step": 365 + }, + { + "epoch": 0.03415134832509098, + "grad_norm": 1.9629803927740797, + "learning_rate": 5.489999999999999e-05, + "loss": 6.8283, + "step": 366 + }, + { + "epoch": 0.034244658019968274, + "grad_norm": 1.7220897216121127, + "learning_rate": 5.5049999999999996e-05, + "loss": 6.3543, + "step": 367 + }, + { + "epoch": 0.034337967714845576, + "grad_norm": 2.2742093810391095, + "learning_rate": 5.519999999999999e-05, + "loss": 6.8901, + "step": 368 + }, + { + "epoch": 0.03443127740972287, + "grad_norm": 2.535722039534486, + "learning_rate": 5.535e-05, + "loss": 6.8421, + "step": 369 + }, + { + "epoch": 0.03452458710460017, + "grad_norm": 2.0869043833089, + "learning_rate": 5.5499999999999994e-05, + "loss": 6.8343, + "step": 370 + }, + { + "epoch": 0.034617896799477466, + "grad_norm": 2.4014723265384306, + "learning_rate": 5.565e-05, + "loss": 7.0, + "step": 371 + }, + { + "epoch": 0.03471120649435477, + "grad_norm": 1.543883356560879, + "learning_rate": 5.5799999999999994e-05, + "loss": 6.5189, + "step": 372 + }, + { + "epoch": 0.03480451618923206, + "grad_norm": 1.7687325778406104, + "learning_rate": 5.595e-05, + "loss": 6.9758, + "step": 373 + }, + { + "epoch": 0.034897825884109356, + "grad_norm": 1.632097455336093, + "learning_rate": 5.6099999999999995e-05, + "loss": 6.5959, + "step": 374 + }, + { + "epoch": 0.03499113557898666, + "grad_norm": 2.1348435446453355, + "learning_rate": 5.625e-05, + "loss": 6.8304, + "step": 375 + }, + { + "epoch": 0.03508444527386395, + "grad_norm": 1.4443243770412246, + "learning_rate": 5.6399999999999995e-05, + "loss": 6.8456, + "step": 376 + }, + { + "epoch": 0.03517775496874125, + "grad_norm": 2.5567983361496482, + "learning_rate": 5.654999999999999e-05, + "loss": 7.1518, + "step": 377 + }, + { + "epoch": 0.03527106466361855, + "grad_norm": 1.6445501678178396, + "learning_rate": 5.6699999999999996e-05, + "loss": 6.5795, + "step": 378 + }, + { + "epoch": 0.03536437435849585, + "grad_norm": 2.1081163233931512, + "learning_rate": 5.684999999999999e-05, + "loss": 6.8127, + "step": 379 + }, + { + "epoch": 0.03545768405337314, + "grad_norm": 2.649392054564806, + "learning_rate": 5.6999999999999996e-05, + "loss": 6.9747, + "step": 380 + }, + { + "epoch": 0.035550993748250444, + "grad_norm": 2.4514312953331476, + "learning_rate": 5.714999999999999e-05, + "loss": 6.646, + "step": 381 + }, + { + "epoch": 0.03564430344312774, + "grad_norm": 1.806409523666454, + "learning_rate": 5.73e-05, + "loss": 6.7308, + "step": 382 + }, + { + "epoch": 0.03573761313800504, + "grad_norm": 2.234186646170075, + "learning_rate": 5.7449999999999994e-05, + "loss": 6.6541, + "step": 383 + }, + { + "epoch": 0.035830922832882334, + "grad_norm": 2.7019879422647244, + "learning_rate": 5.76e-05, + "loss": 6.5877, + "step": 384 + }, + { + "epoch": 0.035924232527759635, + "grad_norm": 2.5217303580805597, + "learning_rate": 5.7749999999999994e-05, + "loss": 6.6303, + "step": 385 + }, + { + "epoch": 0.03601754222263693, + "grad_norm": 4.066900608743942, + "learning_rate": 5.79e-05, + "loss": 6.9251, + "step": 386 + }, + { + "epoch": 0.03611085191751423, + "grad_norm": 2.529434789319295, + "learning_rate": 5.8049999999999995e-05, + "loss": 6.8225, + "step": 387 + }, + { + "epoch": 0.036204161612391525, + "grad_norm": 2.5467926757703934, + "learning_rate": 5.82e-05, + "loss": 6.9738, + "step": 388 + }, + { + "epoch": 0.036297471307268826, + "grad_norm": 2.151199213442986, + "learning_rate": 5.8349999999999995e-05, + "loss": 6.5741, + "step": 389 + }, + { + "epoch": 0.03639078100214612, + "grad_norm": 3.43279442713623, + "learning_rate": 5.85e-05, + "loss": 6.6467, + "step": 390 + }, + { + "epoch": 0.03648409069702342, + "grad_norm": 2.471112766036535, + "learning_rate": 5.8649999999999996e-05, + "loss": 6.3968, + "step": 391 + }, + { + "epoch": 0.036577400391900716, + "grad_norm": 1.5576352843389007, + "learning_rate": 5.88e-05, + "loss": 6.8435, + "step": 392 + }, + { + "epoch": 0.03667071008677802, + "grad_norm": 1.715363315912774, + "learning_rate": 5.8949999999999996e-05, + "loss": 6.4326, + "step": 393 + }, + { + "epoch": 0.03676401978165531, + "grad_norm": 1.845801090217926, + "learning_rate": 5.91e-05, + "loss": 6.7806, + "step": 394 + }, + { + "epoch": 0.03685732947653261, + "grad_norm": 2.026249104939755, + "learning_rate": 5.925e-05, + "loss": 7.1112, + "step": 395 + }, + { + "epoch": 0.03695063917140991, + "grad_norm": 1.6727863426678549, + "learning_rate": 5.94e-05, + "loss": 6.7757, + "step": 396 + }, + { + "epoch": 0.03704394886628721, + "grad_norm": 2.865608002574905, + "learning_rate": 5.955e-05, + "loss": 6.9446, + "step": 397 + }, + { + "epoch": 0.0371372585611645, + "grad_norm": 2.800640390662696, + "learning_rate": 5.97e-05, + "loss": 6.789, + "step": 398 + }, + { + "epoch": 0.037230568256041804, + "grad_norm": 2.4992220050059997, + "learning_rate": 5.985e-05, + "loss": 6.6742, + "step": 399 + }, + { + "epoch": 0.0373238779509191, + "grad_norm": 2.1268217805351517, + "learning_rate": 5.9999999999999995e-05, + "loss": 6.7988, + "step": 400 + }, + { + "epoch": 0.0374171876457964, + "grad_norm": 5.395096613299505, + "learning_rate": 6.015e-05, + "loss": 6.8678, + "step": 401 + }, + { + "epoch": 0.037510497340673694, + "grad_norm": 1.926470277098988, + "learning_rate": 6.0299999999999995e-05, + "loss": 6.7328, + "step": 402 + }, + { + "epoch": 0.037603807035550996, + "grad_norm": 2.281866258028689, + "learning_rate": 6.045e-05, + "loss": 6.6511, + "step": 403 + }, + { + "epoch": 0.03769711673042829, + "grad_norm": 2.74798724762753, + "learning_rate": 6.0599999999999996e-05, + "loss": 6.5271, + "step": 404 + }, + { + "epoch": 0.03779042642530559, + "grad_norm": 2.073225456857603, + "learning_rate": 6.075e-05, + "loss": 6.7716, + "step": 405 + }, + { + "epoch": 0.037883736120182886, + "grad_norm": 1.6974218586599088, + "learning_rate": 6.0899999999999996e-05, + "loss": 6.8408, + "step": 406 + }, + { + "epoch": 0.03797704581506019, + "grad_norm": 42.39506377639845, + "learning_rate": 6.104999999999999e-05, + "loss": 6.945, + "step": 407 + }, + { + "epoch": 0.03807035550993748, + "grad_norm": 3.2145706096801607, + "learning_rate": 6.12e-05, + "loss": 6.839, + "step": 408 + }, + { + "epoch": 0.03816366520481478, + "grad_norm": 1.7643666867816687, + "learning_rate": 6.134999999999999e-05, + "loss": 6.6878, + "step": 409 + }, + { + "epoch": 0.03825697489969208, + "grad_norm": 2.3420957209886417, + "learning_rate": 6.149999999999999e-05, + "loss": 6.6565, + "step": 410 + }, + { + "epoch": 0.03835028459456938, + "grad_norm": 7.08439383101099, + "learning_rate": 6.165e-05, + "loss": 6.7486, + "step": 411 + }, + { + "epoch": 0.03844359428944667, + "grad_norm": 3.523456715023551, + "learning_rate": 6.18e-05, + "loss": 6.9227, + "step": 412 + }, + { + "epoch": 0.038536903984323974, + "grad_norm": 5.002944048955482, + "learning_rate": 6.194999999999999e-05, + "loss": 7.0937, + "step": 413 + }, + { + "epoch": 0.03863021367920127, + "grad_norm": 8.787900263294265, + "learning_rate": 6.209999999999999e-05, + "loss": 7.1987, + "step": 414 + }, + { + "epoch": 0.03872352337407857, + "grad_norm": 15.38785239727991, + "learning_rate": 6.225e-05, + "loss": 6.9863, + "step": 415 + }, + { + "epoch": 0.038816833068955864, + "grad_norm": 3.598465031570538, + "learning_rate": 6.239999999999999e-05, + "loss": 7.0014, + "step": 416 + }, + { + "epoch": 0.038910142763833165, + "grad_norm": 11.834596312216004, + "learning_rate": 6.254999999999999e-05, + "loss": 6.9219, + "step": 417 + }, + { + "epoch": 0.03900345245871046, + "grad_norm": 2.3498080743725427, + "learning_rate": 6.269999999999999e-05, + "loss": 6.5956, + "step": 418 + }, + { + "epoch": 0.03909676215358776, + "grad_norm": 4.77411451780441, + "learning_rate": 6.285e-05, + "loss": 7.2208, + "step": 419 + }, + { + "epoch": 0.039190071848465055, + "grad_norm": 2.524798813252006, + "learning_rate": 6.299999999999999e-05, + "loss": 7.0057, + "step": 420 + }, + { + "epoch": 0.039283381543342356, + "grad_norm": 2.0393446621804387, + "learning_rate": 6.314999999999999e-05, + "loss": 6.7472, + "step": 421 + }, + { + "epoch": 0.03937669123821965, + "grad_norm": 2.333153816669667, + "learning_rate": 6.33e-05, + "loss": 6.6129, + "step": 422 + }, + { + "epoch": 0.03947000093309695, + "grad_norm": 1.9568555827950362, + "learning_rate": 6.345e-05, + "loss": 6.7832, + "step": 423 + }, + { + "epoch": 0.039563310627974246, + "grad_norm": 1.7435637961736432, + "learning_rate": 6.359999999999999e-05, + "loss": 7.1011, + "step": 424 + }, + { + "epoch": 0.03965662032285155, + "grad_norm": 1.4988619112041623, + "learning_rate": 6.374999999999999e-05, + "loss": 6.6407, + "step": 425 + }, + { + "epoch": 0.03974993001772884, + "grad_norm": 1.4318842110981587, + "learning_rate": 6.39e-05, + "loss": 6.8526, + "step": 426 + }, + { + "epoch": 0.039843239712606136, + "grad_norm": 1.5048669357937812, + "learning_rate": 6.405e-05, + "loss": 6.8242, + "step": 427 + }, + { + "epoch": 0.03993654940748344, + "grad_norm": 2.217615175354302, + "learning_rate": 6.419999999999999e-05, + "loss": 6.6316, + "step": 428 + }, + { + "epoch": 0.04002985910236073, + "grad_norm": 1.9874634186853763, + "learning_rate": 6.434999999999999e-05, + "loss": 7.0518, + "step": 429 + }, + { + "epoch": 0.04012316879723803, + "grad_norm": 2.0393006772718834, + "learning_rate": 6.45e-05, + "loss": 6.8575, + "step": 430 + }, + { + "epoch": 0.04021647849211533, + "grad_norm": 2.378773223381117, + "learning_rate": 6.465e-05, + "loss": 6.8785, + "step": 431 + }, + { + "epoch": 0.04030978818699263, + "grad_norm": 1.5749557276747963, + "learning_rate": 6.479999999999999e-05, + "loss": 6.6166, + "step": 432 + }, + { + "epoch": 0.04040309788186992, + "grad_norm": 4.914182739117475, + "learning_rate": 6.494999999999999e-05, + "loss": 6.8728, + "step": 433 + }, + { + "epoch": 0.040496407576747225, + "grad_norm": 2.0975966230881613, + "learning_rate": 6.51e-05, + "loss": 6.9613, + "step": 434 + }, + { + "epoch": 0.04058971727162452, + "grad_norm": 2.4238780411917533, + "learning_rate": 6.525e-05, + "loss": 6.6452, + "step": 435 + }, + { + "epoch": 0.04068302696650182, + "grad_norm": 1.750806119648488, + "learning_rate": 6.539999999999999e-05, + "loss": 6.6875, + "step": 436 + }, + { + "epoch": 0.040776336661379115, + "grad_norm": 2.2668515207238737, + "learning_rate": 6.555e-05, + "loss": 6.9074, + "step": 437 + }, + { + "epoch": 0.040869646356256416, + "grad_norm": 2.3761774485689084, + "learning_rate": 6.57e-05, + "loss": 6.5909, + "step": 438 + }, + { + "epoch": 0.04096295605113371, + "grad_norm": 1.8403631726468188, + "learning_rate": 6.584999999999999e-05, + "loss": 6.9433, + "step": 439 + }, + { + "epoch": 0.04105626574601101, + "grad_norm": 2.1550126835922936, + "learning_rate": 6.599999999999999e-05, + "loss": 6.5735, + "step": 440 + }, + { + "epoch": 0.041149575440888306, + "grad_norm": 2.3483567405236943, + "learning_rate": 6.615e-05, + "loss": 7.1064, + "step": 441 + }, + { + "epoch": 0.04124288513576561, + "grad_norm": 1.804006777483578, + "learning_rate": 6.63e-05, + "loss": 6.8805, + "step": 442 + }, + { + "epoch": 0.0413361948306429, + "grad_norm": 3.260127379844714, + "learning_rate": 6.644999999999999e-05, + "loss": 6.9448, + "step": 443 + }, + { + "epoch": 0.0414295045255202, + "grad_norm": 3.406501613046933, + "learning_rate": 6.659999999999999e-05, + "loss": 6.9805, + "step": 444 + }, + { + "epoch": 0.0415228142203975, + "grad_norm": 3.402634017511404, + "learning_rate": 6.675e-05, + "loss": 6.7385, + "step": 445 + }, + { + "epoch": 0.0416161239152748, + "grad_norm": 1.9350069008043547, + "learning_rate": 6.69e-05, + "loss": 6.2635, + "step": 446 + }, + { + "epoch": 0.04170943361015209, + "grad_norm": 1.7253792512823272, + "learning_rate": 6.704999999999999e-05, + "loss": 6.7723, + "step": 447 + }, + { + "epoch": 0.041802743305029394, + "grad_norm": 3.1210426889792293, + "learning_rate": 6.72e-05, + "loss": 6.8976, + "step": 448 + }, + { + "epoch": 0.04189605299990669, + "grad_norm": 1.7710114686540397, + "learning_rate": 6.735e-05, + "loss": 6.8772, + "step": 449 + }, + { + "epoch": 0.04198936269478399, + "grad_norm": 1.5565816936291799, + "learning_rate": 6.75e-05, + "loss": 6.9478, + "step": 450 + }, + { + "epoch": 0.042082672389661284, + "grad_norm": 1.5452759893391366, + "learning_rate": 6.764999999999999e-05, + "loss": 6.7165, + "step": 451 + }, + { + "epoch": 0.042175982084538585, + "grad_norm": 1.5778971094667391, + "learning_rate": 6.78e-05, + "loss": 6.2239, + "step": 452 + }, + { + "epoch": 0.04226929177941588, + "grad_norm": 2.809865858984599, + "learning_rate": 6.795e-05, + "loss": 6.3874, + "step": 453 + }, + { + "epoch": 0.04236260147429318, + "grad_norm": 1.8425707804033693, + "learning_rate": 6.81e-05, + "loss": 6.7706, + "step": 454 + }, + { + "epoch": 0.042455911169170475, + "grad_norm": 2.549592426849233, + "learning_rate": 6.824999999999999e-05, + "loss": 6.8394, + "step": 455 + }, + { + "epoch": 0.042549220864047776, + "grad_norm": 1.8722170147481711, + "learning_rate": 6.84e-05, + "loss": 6.8857, + "step": 456 + }, + { + "epoch": 0.04264253055892507, + "grad_norm": 1.6135618971627785, + "learning_rate": 6.855e-05, + "loss": 6.8528, + "step": 457 + }, + { + "epoch": 0.04273584025380237, + "grad_norm": 1.8438091488494044, + "learning_rate": 6.87e-05, + "loss": 6.644, + "step": 458 + }, + { + "epoch": 0.042829149948679666, + "grad_norm": 8.216747760870089, + "learning_rate": 6.884999999999999e-05, + "loss": 6.6653, + "step": 459 + }, + { + "epoch": 0.04292245964355697, + "grad_norm": 1.8259310602978462, + "learning_rate": 6.9e-05, + "loss": 6.8291, + "step": 460 + }, + { + "epoch": 0.04301576933843426, + "grad_norm": 2.18928933691788, + "learning_rate": 6.915e-05, + "loss": 6.8668, + "step": 461 + }, + { + "epoch": 0.04310907903331156, + "grad_norm": 1.5327946962426204, + "learning_rate": 6.93e-05, + "loss": 6.7537, + "step": 462 + }, + { + "epoch": 0.04320238872818886, + "grad_norm": 1.951082395465456, + "learning_rate": 6.945e-05, + "loss": 6.5836, + "step": 463 + }, + { + "epoch": 0.04329569842306616, + "grad_norm": 1.598613995243499, + "learning_rate": 6.96e-05, + "loss": 6.697, + "step": 464 + }, + { + "epoch": 0.04338900811794345, + "grad_norm": 1.8387546964768424, + "learning_rate": 6.975e-05, + "loss": 6.648, + "step": 465 + }, + { + "epoch": 0.043482317812820755, + "grad_norm": 1.8327002053295383, + "learning_rate": 6.989999999999999e-05, + "loss": 7.0438, + "step": 466 + }, + { + "epoch": 0.04357562750769805, + "grad_norm": 2.569405312022309, + "learning_rate": 7.005e-05, + "loss": 6.5295, + "step": 467 + }, + { + "epoch": 0.04366893720257535, + "grad_norm": 1.6312796372648652, + "learning_rate": 7.02e-05, + "loss": 6.923, + "step": 468 + }, + { + "epoch": 0.043762246897452645, + "grad_norm": 2.1935472837101626, + "learning_rate": 7.034999999999999e-05, + "loss": 7.3355, + "step": 469 + }, + { + "epoch": 0.043855556592329946, + "grad_norm": 1.3926334047746605, + "learning_rate": 7.049999999999999e-05, + "loss": 6.8254, + "step": 470 + }, + { + "epoch": 0.04394886628720724, + "grad_norm": 48.986610689630716, + "learning_rate": 7.065e-05, + "loss": 6.8514, + "step": 471 + }, + { + "epoch": 0.04404217598208454, + "grad_norm": 1.6745527188272151, + "learning_rate": 7.079999999999999e-05, + "loss": 6.7861, + "step": 472 + }, + { + "epoch": 0.044135485676961836, + "grad_norm": 1.5482637299936983, + "learning_rate": 7.094999999999999e-05, + "loss": 6.8743, + "step": 473 + }, + { + "epoch": 0.04422879537183914, + "grad_norm": 1.5931735286464932, + "learning_rate": 7.11e-05, + "loss": 6.5998, + "step": 474 + }, + { + "epoch": 0.04432210506671643, + "grad_norm": 1.522718526207836, + "learning_rate": 7.125e-05, + "loss": 6.8452, + "step": 475 + }, + { + "epoch": 0.04441541476159373, + "grad_norm": 1.9283841234040648, + "learning_rate": 7.139999999999999e-05, + "loss": 7.0066, + "step": 476 + }, + { + "epoch": 0.04450872445647103, + "grad_norm": 1.637905741454851, + "learning_rate": 7.154999999999999e-05, + "loss": 6.5349, + "step": 477 + }, + { + "epoch": 0.04460203415134833, + "grad_norm": 7.213195698306576, + "learning_rate": 7.17e-05, + "loss": 7.302, + "step": 478 + }, + { + "epoch": 0.04469534384622562, + "grad_norm": 3.090817094627032, + "learning_rate": 7.184999999999998e-05, + "loss": 6.6824, + "step": 479 + }, + { + "epoch": 0.04478865354110292, + "grad_norm": 4.09115465273653, + "learning_rate": 7.199999999999999e-05, + "loss": 6.638, + "step": 480 + }, + { + "epoch": 0.04488196323598022, + "grad_norm": 2.4669258828345906, + "learning_rate": 7.214999999999999e-05, + "loss": 7.0398, + "step": 481 + }, + { + "epoch": 0.04497527293085751, + "grad_norm": 2.0756755090777355, + "learning_rate": 7.23e-05, + "loss": 6.6587, + "step": 482 + }, + { + "epoch": 0.045068582625734814, + "grad_norm": 2.5719497294058327, + "learning_rate": 7.244999999999999e-05, + "loss": 6.4837, + "step": 483 + }, + { + "epoch": 0.04516189232061211, + "grad_norm": 2.4214765451018776, + "learning_rate": 7.259999999999999e-05, + "loss": 6.9397, + "step": 484 + }, + { + "epoch": 0.04525520201548941, + "grad_norm": 3.48863381747684, + "learning_rate": 7.274999999999999e-05, + "loss": 7.0237, + "step": 485 + }, + { + "epoch": 0.045348511710366704, + "grad_norm": 103.1073718357277, + "learning_rate": 7.29e-05, + "loss": 7.2867, + "step": 486 + }, + { + "epoch": 0.045441821405244005, + "grad_norm": 6.7574146899060485, + "learning_rate": 7.304999999999999e-05, + "loss": 6.8285, + "step": 487 + }, + { + "epoch": 0.0455351311001213, + "grad_norm": 8.54414319384612, + "learning_rate": 7.319999999999999e-05, + "loss": 7.0125, + "step": 488 + }, + { + "epoch": 0.0456284407949986, + "grad_norm": 2.7565282674198706, + "learning_rate": 7.335e-05, + "loss": 6.7334, + "step": 489 + }, + { + "epoch": 0.045721750489875895, + "grad_norm": 3.650960352282086, + "learning_rate": 7.35e-05, + "loss": 6.678, + "step": 490 + }, + { + "epoch": 0.0458150601847532, + "grad_norm": 1.869315196347962, + "learning_rate": 7.364999999999999e-05, + "loss": 6.7673, + "step": 491 + }, + { + "epoch": 0.04590836987963049, + "grad_norm": 2.694025098533977, + "learning_rate": 7.379999999999999e-05, + "loss": 6.7091, + "step": 492 + }, + { + "epoch": 0.04600167957450779, + "grad_norm": 3.9577572876874276, + "learning_rate": 7.395e-05, + "loss": 7.0398, + "step": 493 + }, + { + "epoch": 0.04609498926938509, + "grad_norm": 3.460335574823485, + "learning_rate": 7.41e-05, + "loss": 6.6756, + "step": 494 + }, + { + "epoch": 0.04618829896426239, + "grad_norm": 2.069339009489845, + "learning_rate": 7.424999999999999e-05, + "loss": 6.6769, + "step": 495 + }, + { + "epoch": 0.04628160865913968, + "grad_norm": 3.313031519115293, + "learning_rate": 7.439999999999999e-05, + "loss": 6.8022, + "step": 496 + }, + { + "epoch": 0.046374918354016983, + "grad_norm": 39.35513096872072, + "learning_rate": 7.455e-05, + "loss": 6.9818, + "step": 497 + }, + { + "epoch": 0.04646822804889428, + "grad_norm": 2.499697152838618, + "learning_rate": 7.47e-05, + "loss": 6.8295, + "step": 498 + }, + { + "epoch": 0.04656153774377158, + "grad_norm": 1.6909189437087202, + "learning_rate": 7.484999999999999e-05, + "loss": 6.6823, + "step": 499 + }, + { + "epoch": 0.04665484743864887, + "grad_norm": 2.7074348116015154, + "learning_rate": 7.5e-05, + "loss": 6.6319, + "step": 500 + }, + { + "epoch": 0.046748157133526175, + "grad_norm": 2.7468976909104335, + "learning_rate": 7.515e-05, + "loss": 6.9496, + "step": 501 + }, + { + "epoch": 0.04684146682840347, + "grad_norm": 3.403217651120694, + "learning_rate": 7.529999999999999e-05, + "loss": 6.8917, + "step": 502 + }, + { + "epoch": 0.04693477652328077, + "grad_norm": 3.43334840878711, + "learning_rate": 7.544999999999999e-05, + "loss": 6.7171, + "step": 503 + }, + { + "epoch": 0.047028086218158065, + "grad_norm": 2.6448636832554078, + "learning_rate": 7.56e-05, + "loss": 6.5438, + "step": 504 + }, + { + "epoch": 0.047121395913035366, + "grad_norm": 2.3838738841890152, + "learning_rate": 7.575e-05, + "loss": 6.608, + "step": 505 + }, + { + "epoch": 0.04721470560791266, + "grad_norm": 2.476591542730458, + "learning_rate": 7.589999999999999e-05, + "loss": 6.6217, + "step": 506 + }, + { + "epoch": 0.04730801530278996, + "grad_norm": 1.535930067310971, + "learning_rate": 7.604999999999999e-05, + "loss": 6.8284, + "step": 507 + }, + { + "epoch": 0.047401324997667256, + "grad_norm": 2.44297690765291, + "learning_rate": 7.62e-05, + "loss": 6.7241, + "step": 508 + }, + { + "epoch": 0.04749463469254456, + "grad_norm": 1.7198720001239602, + "learning_rate": 7.635e-05, + "loss": 6.4045, + "step": 509 + }, + { + "epoch": 0.04758794438742185, + "grad_norm": 2.0128981115461726, + "learning_rate": 7.649999999999999e-05, + "loss": 6.6736, + "step": 510 + }, + { + "epoch": 0.04768125408229915, + "grad_norm": 1.7526928741866414, + "learning_rate": 7.664999999999999e-05, + "loss": 6.5583, + "step": 511 + }, + { + "epoch": 0.04777456377717645, + "grad_norm": 1.5352003720670793, + "learning_rate": 7.68e-05, + "loss": 6.3372, + "step": 512 + }, + { + "epoch": 0.04786787347205375, + "grad_norm": 1.9185344111713465, + "learning_rate": 7.695e-05, + "loss": 6.7462, + "step": 513 + }, + { + "epoch": 0.04796118316693104, + "grad_norm": 3.392393677633703, + "learning_rate": 7.709999999999999e-05, + "loss": 6.5737, + "step": 514 + }, + { + "epoch": 0.048054492861808344, + "grad_norm": 2.073776656753986, + "learning_rate": 7.725e-05, + "loss": 6.9541, + "step": 515 + }, + { + "epoch": 0.04814780255668564, + "grad_norm": 2.057918813274513, + "learning_rate": 7.74e-05, + "loss": 6.8184, + "step": 516 + }, + { + "epoch": 0.04824111225156294, + "grad_norm": 1.898061872842493, + "learning_rate": 7.755e-05, + "loss": 6.9441, + "step": 517 + }, + { + "epoch": 0.048334421946440234, + "grad_norm": 1.8469187346822995, + "learning_rate": 7.769999999999999e-05, + "loss": 6.6068, + "step": 518 + }, + { + "epoch": 0.048427731641317535, + "grad_norm": 2.7718733738476335, + "learning_rate": 7.785e-05, + "loss": 6.6642, + "step": 519 + }, + { + "epoch": 0.04852104133619483, + "grad_norm": 1.4641011707692404, + "learning_rate": 7.8e-05, + "loss": 6.8286, + "step": 520 + }, + { + "epoch": 0.04861435103107213, + "grad_norm": 1.6809768038054496, + "learning_rate": 7.815e-05, + "loss": 6.5197, + "step": 521 + }, + { + "epoch": 0.048707660725949425, + "grad_norm": 2.5860877384966794, + "learning_rate": 7.829999999999999e-05, + "loss": 6.669, + "step": 522 + }, + { + "epoch": 0.04880097042082673, + "grad_norm": 1.4529222341214887, + "learning_rate": 7.845e-05, + "loss": 6.4523, + "step": 523 + }, + { + "epoch": 0.04889428011570402, + "grad_norm": 2.201785799631261, + "learning_rate": 7.86e-05, + "loss": 6.6106, + "step": 524 + }, + { + "epoch": 0.04898758981058132, + "grad_norm": 2.367145347587744, + "learning_rate": 7.874999999999999e-05, + "loss": 7.1036, + "step": 525 + }, + { + "epoch": 0.04908089950545862, + "grad_norm": 1.4615708596103225, + "learning_rate": 7.89e-05, + "loss": 6.415, + "step": 526 + }, + { + "epoch": 0.04917420920033592, + "grad_norm": 1.3872471240447406, + "learning_rate": 7.905e-05, + "loss": 6.7049, + "step": 527 + }, + { + "epoch": 0.04926751889521321, + "grad_norm": 2.473666608969332, + "learning_rate": 7.92e-05, + "loss": 6.2935, + "step": 528 + }, + { + "epoch": 0.049360828590090514, + "grad_norm": 1.5650611084241879, + "learning_rate": 7.934999999999999e-05, + "loss": 6.5995, + "step": 529 + }, + { + "epoch": 0.04945413828496781, + "grad_norm": 1.4377630799856755, + "learning_rate": 7.95e-05, + "loss": 6.2979, + "step": 530 + }, + { + "epoch": 0.04954744797984511, + "grad_norm": 1.6887689816676623, + "learning_rate": 7.965e-05, + "loss": 6.6888, + "step": 531 + }, + { + "epoch": 0.049640757674722404, + "grad_norm": 2.434730479268664, + "learning_rate": 7.98e-05, + "loss": 6.9246, + "step": 532 + }, + { + "epoch": 0.049734067369599705, + "grad_norm": 2.9007892386632275, + "learning_rate": 7.994999999999999e-05, + "loss": 6.8725, + "step": 533 + }, + { + "epoch": 0.049827377064477, + "grad_norm": 1.8843105602916848, + "learning_rate": 8.01e-05, + "loss": 7.139, + "step": 534 + }, + { + "epoch": 0.049920686759354294, + "grad_norm": 4.8324475743859505, + "learning_rate": 8.025e-05, + "loss": 6.491, + "step": 535 + }, + { + "epoch": 0.050013996454231595, + "grad_norm": 1.6465446249469335, + "learning_rate": 8.04e-05, + "loss": 6.5866, + "step": 536 + }, + { + "epoch": 0.05010730614910889, + "grad_norm": 1.4599321564651584, + "learning_rate": 8.054999999999999e-05, + "loss": 6.6715, + "step": 537 + }, + { + "epoch": 0.05020061584398619, + "grad_norm": 1.5929427282295912, + "learning_rate": 8.07e-05, + "loss": 6.7307, + "step": 538 + }, + { + "epoch": 0.050293925538863485, + "grad_norm": 2.594350068607909, + "learning_rate": 8.085e-05, + "loss": 6.5058, + "step": 539 + }, + { + "epoch": 0.050387235233740786, + "grad_norm": 1.9761679157783907, + "learning_rate": 8.1e-05, + "loss": 6.8358, + "step": 540 + }, + { + "epoch": 0.05048054492861808, + "grad_norm": 1.8291164580407644, + "learning_rate": 8.115e-05, + "loss": 6.8159, + "step": 541 + }, + { + "epoch": 0.05057385462349538, + "grad_norm": 2.8068347531978524, + "learning_rate": 8.13e-05, + "loss": 6.768, + "step": 542 + }, + { + "epoch": 0.050667164318372676, + "grad_norm": 2.406222526845439, + "learning_rate": 8.145e-05, + "loss": 6.9345, + "step": 543 + }, + { + "epoch": 0.05076047401324998, + "grad_norm": 1.391190948967499, + "learning_rate": 8.16e-05, + "loss": 6.354, + "step": 544 + }, + { + "epoch": 0.05085378370812727, + "grad_norm": 1.5437895832193596, + "learning_rate": 8.175e-05, + "loss": 6.355, + "step": 545 + }, + { + "epoch": 0.05094709340300457, + "grad_norm": 2.555416695313407, + "learning_rate": 8.19e-05, + "loss": 6.5101, + "step": 546 + }, + { + "epoch": 0.05104040309788187, + "grad_norm": 2.1530658094926336, + "learning_rate": 8.205e-05, + "loss": 6.5278, + "step": 547 + }, + { + "epoch": 0.05113371279275917, + "grad_norm": 4.697888489320164, + "learning_rate": 8.22e-05, + "loss": 6.764, + "step": 548 + }, + { + "epoch": 0.05122702248763646, + "grad_norm": 2.2828848094646705, + "learning_rate": 8.235e-05, + "loss": 6.6856, + "step": 549 + }, + { + "epoch": 0.051320332182513764, + "grad_norm": 1.9256505473546692, + "learning_rate": 8.25e-05, + "loss": 6.7406, + "step": 550 + }, + { + "epoch": 0.05141364187739106, + "grad_norm": 1.4764281952091673, + "learning_rate": 8.265e-05, + "loss": 6.4804, + "step": 551 + }, + { + "epoch": 0.05150695157226836, + "grad_norm": 1.984362843569923, + "learning_rate": 8.28e-05, + "loss": 6.5495, + "step": 552 + }, + { + "epoch": 0.051600261267145654, + "grad_norm": 1.6254741841890725, + "learning_rate": 8.295e-05, + "loss": 6.767, + "step": 553 + }, + { + "epoch": 0.051693570962022956, + "grad_norm": 1.4132504349236032, + "learning_rate": 8.31e-05, + "loss": 6.4835, + "step": 554 + }, + { + "epoch": 0.05178688065690025, + "grad_norm": 1.63390536052909, + "learning_rate": 8.325e-05, + "loss": 6.5635, + "step": 555 + }, + { + "epoch": 0.05188019035177755, + "grad_norm": 1.710174539966927, + "learning_rate": 8.34e-05, + "loss": 6.6702, + "step": 556 + }, + { + "epoch": 0.051973500046654846, + "grad_norm": 2.2797732511254027, + "learning_rate": 8.355e-05, + "loss": 6.6697, + "step": 557 + }, + { + "epoch": 0.05206680974153215, + "grad_norm": 1.6148672141524638, + "learning_rate": 8.37e-05, + "loss": 6.7046, + "step": 558 + }, + { + "epoch": 0.05216011943640944, + "grad_norm": 1.3788059020927161, + "learning_rate": 8.385e-05, + "loss": 6.5822, + "step": 559 + }, + { + "epoch": 0.05225342913128674, + "grad_norm": 1.5766479306346688, + "learning_rate": 8.4e-05, + "loss": 6.7229, + "step": 560 + }, + { + "epoch": 0.05234673882616404, + "grad_norm": 4.021931773495056, + "learning_rate": 8.415e-05, + "loss": 6.9856, + "step": 561 + }, + { + "epoch": 0.05244004852104134, + "grad_norm": 1.4844742387867755, + "learning_rate": 8.43e-05, + "loss": 6.5598, + "step": 562 + }, + { + "epoch": 0.05253335821591863, + "grad_norm": 1.768517756013192, + "learning_rate": 8.444999999999998e-05, + "loss": 6.318, + "step": 563 + }, + { + "epoch": 0.052626667910795934, + "grad_norm": 2.4997808263372923, + "learning_rate": 8.459999999999998e-05, + "loss": 7.0339, + "step": 564 + }, + { + "epoch": 0.05271997760567323, + "grad_norm": 1.4462692589447308, + "learning_rate": 8.474999999999999e-05, + "loss": 6.5102, + "step": 565 + }, + { + "epoch": 0.05281328730055053, + "grad_norm": 1.3303136014720292, + "learning_rate": 8.489999999999999e-05, + "loss": 6.6304, + "step": 566 + }, + { + "epoch": 0.052906596995427824, + "grad_norm": 1.931542690107203, + "learning_rate": 8.504999999999998e-05, + "loss": 6.7984, + "step": 567 + }, + { + "epoch": 0.052999906690305125, + "grad_norm": 1.9417051272909915, + "learning_rate": 8.519999999999998e-05, + "loss": 6.3396, + "step": 568 + }, + { + "epoch": 0.05309321638518242, + "grad_norm": 1.629569841080011, + "learning_rate": 8.534999999999999e-05, + "loss": 6.7394, + "step": 569 + }, + { + "epoch": 0.05318652608005972, + "grad_norm": 1.388904333791548, + "learning_rate": 8.549999999999999e-05, + "loss": 7.0262, + "step": 570 + }, + { + "epoch": 0.053279835774937015, + "grad_norm": 1.770235848180796, + "learning_rate": 8.564999999999998e-05, + "loss": 6.5435, + "step": 571 + }, + { + "epoch": 0.053373145469814316, + "grad_norm": 1.545112331559322, + "learning_rate": 8.579999999999998e-05, + "loss": 6.7372, + "step": 572 + }, + { + "epoch": 0.05346645516469161, + "grad_norm": 1.876704636137121, + "learning_rate": 8.594999999999999e-05, + "loss": 6.5982, + "step": 573 + }, + { + "epoch": 0.05355976485956891, + "grad_norm": 1.683806712498047, + "learning_rate": 8.609999999999999e-05, + "loss": 6.5194, + "step": 574 + }, + { + "epoch": 0.053653074554446206, + "grad_norm": 1.6034819693982907, + "learning_rate": 8.624999999999998e-05, + "loss": 6.704, + "step": 575 + }, + { + "epoch": 0.05374638424932351, + "grad_norm": 1.6895925423700833, + "learning_rate": 8.639999999999999e-05, + "loss": 6.3675, + "step": 576 + }, + { + "epoch": 0.0538396939442008, + "grad_norm": 1.5212148516922452, + "learning_rate": 8.654999999999999e-05, + "loss": 6.7479, + "step": 577 + }, + { + "epoch": 0.0539330036390781, + "grad_norm": 1.466449502143248, + "learning_rate": 8.669999999999998e-05, + "loss": 6.6013, + "step": 578 + }, + { + "epoch": 0.0540263133339554, + "grad_norm": 1.5867726068594619, + "learning_rate": 8.684999999999998e-05, + "loss": 6.358, + "step": 579 + }, + { + "epoch": 0.0541196230288327, + "grad_norm": 1.3887669548285873, + "learning_rate": 8.699999999999999e-05, + "loss": 6.8527, + "step": 580 + }, + { + "epoch": 0.05421293272370999, + "grad_norm": 1.341372753224642, + "learning_rate": 8.714999999999999e-05, + "loss": 6.6541, + "step": 581 + }, + { + "epoch": 0.054306242418587294, + "grad_norm": 1.562131359122336, + "learning_rate": 8.729999999999998e-05, + "loss": 6.5511, + "step": 582 + }, + { + "epoch": 0.05439955211346459, + "grad_norm": 1.5080822454394494, + "learning_rate": 8.744999999999998e-05, + "loss": 6.6994, + "step": 583 + }, + { + "epoch": 0.05449286180834189, + "grad_norm": 1.2544177089603556, + "learning_rate": 8.759999999999999e-05, + "loss": 6.5842, + "step": 584 + }, + { + "epoch": 0.054586171503219184, + "grad_norm": 1.4236096580497961, + "learning_rate": 8.774999999999999e-05, + "loss": 6.7652, + "step": 585 + }, + { + "epoch": 0.054679481198096486, + "grad_norm": 1.3901940081507036, + "learning_rate": 8.789999999999998e-05, + "loss": 6.7039, + "step": 586 + }, + { + "epoch": 0.05477279089297378, + "grad_norm": 1.3869187682036412, + "learning_rate": 8.804999999999999e-05, + "loss": 6.4812, + "step": 587 + }, + { + "epoch": 0.054866100587851074, + "grad_norm": 1.2491969930848001, + "learning_rate": 8.819999999999999e-05, + "loss": 6.5197, + "step": 588 + }, + { + "epoch": 0.054959410282728376, + "grad_norm": 1.4651946096018593, + "learning_rate": 8.834999999999999e-05, + "loss": 6.4817, + "step": 589 + }, + { + "epoch": 0.05505271997760567, + "grad_norm": 1.4738709376754404, + "learning_rate": 8.849999999999998e-05, + "loss": 6.7737, + "step": 590 + }, + { + "epoch": 0.05514602967248297, + "grad_norm": 1.9055398435185755, + "learning_rate": 8.864999999999999e-05, + "loss": 6.7276, + "step": 591 + }, + { + "epoch": 0.055239339367360266, + "grad_norm": 2.6958230049117455, + "learning_rate": 8.879999999999999e-05, + "loss": 6.6127, + "step": 592 + }, + { + "epoch": 0.05533264906223757, + "grad_norm": 1.301033244052073, + "learning_rate": 8.895e-05, + "loss": 6.7696, + "step": 593 + }, + { + "epoch": 0.05542595875711486, + "grad_norm": 1.4717414747098134, + "learning_rate": 8.909999999999998e-05, + "loss": 6.5351, + "step": 594 + }, + { + "epoch": 0.05551926845199216, + "grad_norm": 1.4956081822805873, + "learning_rate": 8.924999999999999e-05, + "loss": 6.6948, + "step": 595 + }, + { + "epoch": 0.05561257814686946, + "grad_norm": 1.6037945965185043, + "learning_rate": 8.939999999999999e-05, + "loss": 6.6515, + "step": 596 + }, + { + "epoch": 0.05570588784174676, + "grad_norm": 1.6027235831037094, + "learning_rate": 8.955e-05, + "loss": 6.4426, + "step": 597 + }, + { + "epoch": 0.05579919753662405, + "grad_norm": 2.06332200376337, + "learning_rate": 8.969999999999998e-05, + "loss": 6.7787, + "step": 598 + }, + { + "epoch": 0.055892507231501354, + "grad_norm": 1.2908045363038683, + "learning_rate": 8.984999999999999e-05, + "loss": 6.337, + "step": 599 + }, + { + "epoch": 0.05598581692637865, + "grad_norm": 1.8000036526889491, + "learning_rate": 8.999999999999999e-05, + "loss": 6.5686, + "step": 600 + }, + { + "epoch": 0.05607912662125595, + "grad_norm": 3.2755582506488135, + "learning_rate": 9.014999999999998e-05, + "loss": 6.8122, + "step": 601 + }, + { + "epoch": 0.056172436316133244, + "grad_norm": 2.486737283288589, + "learning_rate": 9.029999999999999e-05, + "loss": 6.7646, + "step": 602 + }, + { + "epoch": 0.056265746011010545, + "grad_norm": 1.9264908000687926, + "learning_rate": 9.044999999999999e-05, + "loss": 6.3586, + "step": 603 + }, + { + "epoch": 0.05635905570588784, + "grad_norm": 1.851114803187875, + "learning_rate": 9.059999999999999e-05, + "loss": 6.7047, + "step": 604 + }, + { + "epoch": 0.05645236540076514, + "grad_norm": 4.10519520175052, + "learning_rate": 9.074999999999998e-05, + "loss": 6.5827, + "step": 605 + }, + { + "epoch": 0.056545675095642435, + "grad_norm": 2.157706612683392, + "learning_rate": 9.089999999999999e-05, + "loss": 6.7269, + "step": 606 + }, + { + "epoch": 0.056638984790519736, + "grad_norm": 5.978078005336673, + "learning_rate": 9.104999999999999e-05, + "loss": 6.6709, + "step": 607 + }, + { + "epoch": 0.05673229448539703, + "grad_norm": 3.33120342529082, + "learning_rate": 9.12e-05, + "loss": 6.746, + "step": 608 + }, + { + "epoch": 0.05682560418027433, + "grad_norm": 1.5601612856671083, + "learning_rate": 9.134999999999998e-05, + "loss": 6.3943, + "step": 609 + }, + { + "epoch": 0.056918913875151626, + "grad_norm": 1.7180368672403559, + "learning_rate": 9.149999999999999e-05, + "loss": 6.6571, + "step": 610 + }, + { + "epoch": 0.05701222357002893, + "grad_norm": 1.2172716341538379, + "learning_rate": 9.164999999999999e-05, + "loss": 6.6539, + "step": 611 + }, + { + "epoch": 0.05710553326490622, + "grad_norm": 1.7669006672528336, + "learning_rate": 9.18e-05, + "loss": 6.4761, + "step": 612 + }, + { + "epoch": 0.05719884295978352, + "grad_norm": 1.2206439038488268, + "learning_rate": 9.194999999999999e-05, + "loss": 6.5154, + "step": 613 + }, + { + "epoch": 0.05729215265466082, + "grad_norm": 1.4379319565016038, + "learning_rate": 9.209999999999999e-05, + "loss": 6.4411, + "step": 614 + }, + { + "epoch": 0.05738546234953812, + "grad_norm": 2.0295815040989247, + "learning_rate": 9.224999999999999e-05, + "loss": 6.5676, + "step": 615 + }, + { + "epoch": 0.05747877204441541, + "grad_norm": 1.2449100354190357, + "learning_rate": 9.24e-05, + "loss": 6.5017, + "step": 616 + }, + { + "epoch": 0.057572081739292714, + "grad_norm": 1.572341095115172, + "learning_rate": 9.254999999999999e-05, + "loss": 6.6258, + "step": 617 + }, + { + "epoch": 0.05766539143417001, + "grad_norm": 1.4645614953360122, + "learning_rate": 9.269999999999999e-05, + "loss": 6.4667, + "step": 618 + }, + { + "epoch": 0.05775870112904731, + "grad_norm": 1.3126419489270784, + "learning_rate": 9.285e-05, + "loss": 6.575, + "step": 619 + }, + { + "epoch": 0.057852010823924604, + "grad_norm": 1.4609407980887146, + "learning_rate": 9.3e-05, + "loss": 6.6108, + "step": 620 + }, + { + "epoch": 0.057945320518801906, + "grad_norm": 9.465836752101962, + "learning_rate": 9.314999999999999e-05, + "loss": 6.698, + "step": 621 + }, + { + "epoch": 0.0580386302136792, + "grad_norm": 1.2354194167011214, + "learning_rate": 9.329999999999999e-05, + "loss": 6.7071, + "step": 622 + }, + { + "epoch": 0.0581319399085565, + "grad_norm": 1.4986137148516405, + "learning_rate": 9.345e-05, + "loss": 6.5792, + "step": 623 + }, + { + "epoch": 0.058225249603433796, + "grad_norm": 1.8348332941626966, + "learning_rate": 9.36e-05, + "loss": 6.351, + "step": 624 + }, + { + "epoch": 0.0583185592983111, + "grad_norm": 1.3911154875400393, + "learning_rate": 9.374999999999999e-05, + "loss": 6.3985, + "step": 625 + }, + { + "epoch": 0.05841186899318839, + "grad_norm": 3.2383073816466155, + "learning_rate": 9.389999999999999e-05, + "loss": 6.7244, + "step": 626 + }, + { + "epoch": 0.05850517868806569, + "grad_norm": 3.580199952900078, + "learning_rate": 9.405e-05, + "loss": 6.5552, + "step": 627 + }, + { + "epoch": 0.05859848838294299, + "grad_norm": 1.7320419492321215, + "learning_rate": 9.419999999999999e-05, + "loss": 6.7034, + "step": 628 + }, + { + "epoch": 0.05869179807782029, + "grad_norm": 2.814241026714129, + "learning_rate": 9.434999999999999e-05, + "loss": 6.3131, + "step": 629 + }, + { + "epoch": 0.05878510777269758, + "grad_norm": 1.630202534485676, + "learning_rate": 9.449999999999999e-05, + "loss": 6.6869, + "step": 630 + }, + { + "epoch": 0.058878417467574884, + "grad_norm": 2.7255810967140097, + "learning_rate": 9.465e-05, + "loss": 6.5609, + "step": 631 + }, + { + "epoch": 0.05897172716245218, + "grad_norm": 1.6074126406311458, + "learning_rate": 9.479999999999999e-05, + "loss": 6.2175, + "step": 632 + }, + { + "epoch": 0.05906503685732948, + "grad_norm": 2.874277523538436, + "learning_rate": 9.494999999999999e-05, + "loss": 6.6233, + "step": 633 + }, + { + "epoch": 0.059158346552206774, + "grad_norm": 1.7375249438221947, + "learning_rate": 9.51e-05, + "loss": 6.6793, + "step": 634 + }, + { + "epoch": 0.059251656247084075, + "grad_norm": 1.625990426122923, + "learning_rate": 9.525e-05, + "loss": 6.6331, + "step": 635 + }, + { + "epoch": 0.05934496594196137, + "grad_norm": 1.7346901866277822, + "learning_rate": 9.539999999999999e-05, + "loss": 6.7645, + "step": 636 + }, + { + "epoch": 0.05943827563683867, + "grad_norm": 2.257244840208406, + "learning_rate": 9.554999999999999e-05, + "loss": 6.5358, + "step": 637 + }, + { + "epoch": 0.059531585331715965, + "grad_norm": 1.3884745506375207, + "learning_rate": 9.57e-05, + "loss": 6.633, + "step": 638 + }, + { + "epoch": 0.059624895026593266, + "grad_norm": 1.9707575778003272, + "learning_rate": 9.585e-05, + "loss": 6.6126, + "step": 639 + }, + { + "epoch": 0.05971820472147056, + "grad_norm": 2.636247294314707, + "learning_rate": 9.599999999999999e-05, + "loss": 6.7522, + "step": 640 + }, + { + "epoch": 0.059811514416347855, + "grad_norm": 2.084619453084209, + "learning_rate": 9.614999999999999e-05, + "loss": 6.5111, + "step": 641 + }, + { + "epoch": 0.059904824111225156, + "grad_norm": 2.1846012061940905, + "learning_rate": 9.63e-05, + "loss": 6.5745, + "step": 642 + }, + { + "epoch": 0.05999813380610245, + "grad_norm": 1.923432736230155, + "learning_rate": 9.645e-05, + "loss": 6.6016, + "step": 643 + }, + { + "epoch": 0.06009144350097975, + "grad_norm": 2.3146047544615787, + "learning_rate": 9.659999999999999e-05, + "loss": 6.3646, + "step": 644 + }, + { + "epoch": 0.060184753195857046, + "grad_norm": 1.4761475663173567, + "learning_rate": 9.675e-05, + "loss": 6.7626, + "step": 645 + }, + { + "epoch": 0.06027806289073435, + "grad_norm": 5.142223389261606, + "learning_rate": 9.69e-05, + "loss": 6.4434, + "step": 646 + }, + { + "epoch": 0.06037137258561164, + "grad_norm": 1.3730964769712053, + "learning_rate": 9.705e-05, + "loss": 6.4181, + "step": 647 + }, + { + "epoch": 0.06046468228048894, + "grad_norm": 1.776110075160921, + "learning_rate": 9.719999999999999e-05, + "loss": 6.6274, + "step": 648 + }, + { + "epoch": 0.06055799197536624, + "grad_norm": 1.3255597085010034, + "learning_rate": 9.735e-05, + "loss": 6.3948, + "step": 649 + }, + { + "epoch": 0.06065130167024354, + "grad_norm": 1.413349412401672, + "learning_rate": 9.75e-05, + "loss": 6.4361, + "step": 650 + }, + { + "epoch": 0.06074461136512083, + "grad_norm": 1.306595864425494, + "learning_rate": 9.764999999999999e-05, + "loss": 6.5039, + "step": 651 + }, + { + "epoch": 0.060837921059998135, + "grad_norm": 1.2577687419462065, + "learning_rate": 9.779999999999999e-05, + "loss": 6.5333, + "step": 652 + }, + { + "epoch": 0.06093123075487543, + "grad_norm": 2.292955661225484, + "learning_rate": 9.795e-05, + "loss": 6.9748, + "step": 653 + }, + { + "epoch": 0.06102454044975273, + "grad_norm": 1.5491925045728958, + "learning_rate": 9.81e-05, + "loss": 6.3867, + "step": 654 + }, + { + "epoch": 0.061117850144630025, + "grad_norm": 2.3657441092586304, + "learning_rate": 9.824999999999999e-05, + "loss": 6.4241, + "step": 655 + }, + { + "epoch": 0.061211159839507326, + "grad_norm": 2.652090164472632, + "learning_rate": 9.839999999999999e-05, + "loss": 6.9209, + "step": 656 + }, + { + "epoch": 0.06130446953438462, + "grad_norm": 2.9751656841121314, + "learning_rate": 9.855e-05, + "loss": 6.8289, + "step": 657 + }, + { + "epoch": 0.06139777922926192, + "grad_norm": 1.2049199220762419, + "learning_rate": 9.87e-05, + "loss": 6.3429, + "step": 658 + }, + { + "epoch": 0.061491088924139216, + "grad_norm": 3.1754030320544917, + "learning_rate": 9.884999999999999e-05, + "loss": 6.8736, + "step": 659 + }, + { + "epoch": 0.06158439861901652, + "grad_norm": 2.1551321226488738, + "learning_rate": 9.9e-05, + "loss": 6.6105, + "step": 660 + }, + { + "epoch": 0.06167770831389381, + "grad_norm": 1.5152616657176492, + "learning_rate": 9.915e-05, + "loss": 6.5005, + "step": 661 + }, + { + "epoch": 0.06177101800877111, + "grad_norm": 1.470421078153687, + "learning_rate": 9.93e-05, + "loss": 6.4697, + "step": 662 + }, + { + "epoch": 0.06186432770364841, + "grad_norm": 2.418880893230635, + "learning_rate": 9.944999999999999e-05, + "loss": 6.7097, + "step": 663 + }, + { + "epoch": 0.06195763739852571, + "grad_norm": 1.4004925961726913, + "learning_rate": 9.96e-05, + "loss": 6.4322, + "step": 664 + }, + { + "epoch": 0.062050947093403, + "grad_norm": 2.4438864742939868, + "learning_rate": 9.975e-05, + "loss": 6.6955, + "step": 665 + }, + { + "epoch": 0.062144256788280304, + "grad_norm": 1.827419638468658, + "learning_rate": 9.99e-05, + "loss": 6.562, + "step": 666 + }, + { + "epoch": 0.0622375664831576, + "grad_norm": 1.5330261004510215, + "learning_rate": 0.00010004999999999999, + "loss": 6.6787, + "step": 667 + }, + { + "epoch": 0.0623308761780349, + "grad_norm": 1.3071578128260664, + "learning_rate": 0.0001002, + "loss": 6.501, + "step": 668 + }, + { + "epoch": 0.062424185872912194, + "grad_norm": 1.4354519287172727, + "learning_rate": 0.00010035, + "loss": 6.9784, + "step": 669 + }, + { + "epoch": 0.06251749556778949, + "grad_norm": 6.72430184343885, + "learning_rate": 0.0001005, + "loss": 6.5477, + "step": 670 + }, + { + "epoch": 0.0626108052626668, + "grad_norm": 1.1996272239853953, + "learning_rate": 0.00010065, + "loss": 6.3868, + "step": 671 + }, + { + "epoch": 0.06270411495754409, + "grad_norm": 1.3215646930105334, + "learning_rate": 0.0001008, + "loss": 6.7254, + "step": 672 + }, + { + "epoch": 0.06279742465242139, + "grad_norm": 1.2715721182513475, + "learning_rate": 0.00010095, + "loss": 6.8106, + "step": 673 + }, + { + "epoch": 0.06289073434729868, + "grad_norm": 2.1872573950590257, + "learning_rate": 0.0001011, + "loss": 6.3253, + "step": 674 + }, + { + "epoch": 0.06298404404217599, + "grad_norm": 1.371196724746652, + "learning_rate": 0.00010125, + "loss": 6.7998, + "step": 675 + }, + { + "epoch": 0.06307735373705328, + "grad_norm": 1.4191480095370408, + "learning_rate": 0.0001014, + "loss": 6.5954, + "step": 676 + }, + { + "epoch": 0.06317066343193058, + "grad_norm": 1.4601411324530333, + "learning_rate": 0.00010155, + "loss": 6.467, + "step": 677 + }, + { + "epoch": 0.06326397312680787, + "grad_norm": 1.1581613344308255, + "learning_rate": 0.00010169999999999999, + "loss": 6.5759, + "step": 678 + }, + { + "epoch": 0.06335728282168518, + "grad_norm": 1.517844099147466, + "learning_rate": 0.00010185, + "loss": 6.5342, + "step": 679 + }, + { + "epoch": 0.06345059251656247, + "grad_norm": 2.0533061062710547, + "learning_rate": 0.000102, + "loss": 6.5608, + "step": 680 + }, + { + "epoch": 0.06354390221143977, + "grad_norm": 1.4023881455751568, + "learning_rate": 0.00010215, + "loss": 6.4721, + "step": 681 + }, + { + "epoch": 0.06363721190631706, + "grad_norm": 1.6265536990011569, + "learning_rate": 0.00010229999999999999, + "loss": 6.4182, + "step": 682 + }, + { + "epoch": 0.06373052160119437, + "grad_norm": 1.3429849589296932, + "learning_rate": 0.00010245, + "loss": 6.3969, + "step": 683 + }, + { + "epoch": 0.06382383129607166, + "grad_norm": 12.05367235118124, + "learning_rate": 0.0001026, + "loss": 6.5104, + "step": 684 + }, + { + "epoch": 0.06391714099094896, + "grad_norm": 1.9068169111269777, + "learning_rate": 0.00010275, + "loss": 6.6282, + "step": 685 + }, + { + "epoch": 0.06401045068582625, + "grad_norm": 1.538684380311488, + "learning_rate": 0.0001029, + "loss": 6.5101, + "step": 686 + }, + { + "epoch": 0.06410376038070356, + "grad_norm": 1.493689373264097, + "learning_rate": 0.00010305, + "loss": 6.6907, + "step": 687 + }, + { + "epoch": 0.06419707007558086, + "grad_norm": 2.325238209192523, + "learning_rate": 0.00010319999999999999, + "loss": 6.4855, + "step": 688 + }, + { + "epoch": 0.06429037977045815, + "grad_norm": 2.3000487543033974, + "learning_rate": 0.00010334999999999998, + "loss": 6.2275, + "step": 689 + }, + { + "epoch": 0.06438368946533544, + "grad_norm": 2.1048473697064383, + "learning_rate": 0.00010349999999999998, + "loss": 6.6914, + "step": 690 + }, + { + "epoch": 0.06447699916021275, + "grad_norm": 6.714509717273078, + "learning_rate": 0.00010364999999999999, + "loss": 6.568, + "step": 691 + }, + { + "epoch": 0.06457030885509005, + "grad_norm": 2.3826144669019467, + "learning_rate": 0.00010379999999999999, + "loss": 6.5583, + "step": 692 + }, + { + "epoch": 0.06466361854996734, + "grad_norm": 1.365840882060761, + "learning_rate": 0.00010394999999999998, + "loss": 6.1538, + "step": 693 + }, + { + "epoch": 0.06475692824484464, + "grad_norm": 2.3059213829484175, + "learning_rate": 0.00010409999999999998, + "loss": 6.5245, + "step": 694 + }, + { + "epoch": 0.06485023793972193, + "grad_norm": 2.3728779037732064, + "learning_rate": 0.00010424999999999999, + "loss": 6.2104, + "step": 695 + }, + { + "epoch": 0.06494354763459924, + "grad_norm": 5.543481138728798, + "learning_rate": 0.00010439999999999999, + "loss": 6.8285, + "step": 696 + }, + { + "epoch": 0.06503685732947653, + "grad_norm": 2.4464449332987463, + "learning_rate": 0.00010454999999999998, + "loss": 6.7073, + "step": 697 + }, + { + "epoch": 0.06513016702435383, + "grad_norm": 1.4755825506923081, + "learning_rate": 0.00010469999999999998, + "loss": 6.4839, + "step": 698 + }, + { + "epoch": 0.06522347671923112, + "grad_norm": 1.504295678370013, + "learning_rate": 0.00010484999999999999, + "loss": 6.4958, + "step": 699 + }, + { + "epoch": 0.06531678641410843, + "grad_norm": 1.6029584846394747, + "learning_rate": 0.00010499999999999999, + "loss": 6.8787, + "step": 700 + }, + { + "epoch": 0.06541009610898572, + "grad_norm": 1.5769740913416435, + "learning_rate": 0.00010514999999999998, + "loss": 6.5094, + "step": 701 + }, + { + "epoch": 0.06550340580386302, + "grad_norm": 1.1201348434965253, + "learning_rate": 0.00010529999999999998, + "loss": 6.6637, + "step": 702 + }, + { + "epoch": 0.06559671549874031, + "grad_norm": 1.8503262132211153, + "learning_rate": 0.00010544999999999999, + "loss": 6.3001, + "step": 703 + }, + { + "epoch": 0.06569002519361762, + "grad_norm": 2.1297556184097304, + "learning_rate": 0.00010559999999999998, + "loss": 6.8552, + "step": 704 + }, + { + "epoch": 0.06578333488849492, + "grad_norm": 1.9434501664526656, + "learning_rate": 0.00010574999999999998, + "loss": 6.5302, + "step": 705 + }, + { + "epoch": 0.06587664458337221, + "grad_norm": 1.9671936769285197, + "learning_rate": 0.00010589999999999999, + "loss": 6.7823, + "step": 706 + }, + { + "epoch": 0.0659699542782495, + "grad_norm": 1.5683630476403876, + "learning_rate": 0.00010604999999999999, + "loss": 6.4736, + "step": 707 + }, + { + "epoch": 0.06606326397312681, + "grad_norm": 2.2603570288490924, + "learning_rate": 0.00010619999999999998, + "loss": 6.7538, + "step": 708 + }, + { + "epoch": 0.0661565736680041, + "grad_norm": 1.986879751024604, + "learning_rate": 0.00010634999999999998, + "loss": 6.8364, + "step": 709 + }, + { + "epoch": 0.0662498833628814, + "grad_norm": 1.3797379430101955, + "learning_rate": 0.00010649999999999999, + "loss": 6.9102, + "step": 710 + }, + { + "epoch": 0.0663431930577587, + "grad_norm": 1.643827088265431, + "learning_rate": 0.00010664999999999999, + "loss": 6.4626, + "step": 711 + }, + { + "epoch": 0.066436502752636, + "grad_norm": 2.071885944932851, + "learning_rate": 0.00010679999999999998, + "loss": 6.6163, + "step": 712 + }, + { + "epoch": 0.0665298124475133, + "grad_norm": 1.3737705736010026, + "learning_rate": 0.00010694999999999998, + "loss": 6.7399, + "step": 713 + }, + { + "epoch": 0.06662312214239059, + "grad_norm": 1.9480059631202007, + "learning_rate": 0.00010709999999999999, + "loss": 6.6517, + "step": 714 + }, + { + "epoch": 0.06671643183726789, + "grad_norm": 1.5981615522385129, + "learning_rate": 0.00010724999999999999, + "loss": 6.3718, + "step": 715 + }, + { + "epoch": 0.0668097415321452, + "grad_norm": 1.2793216703716528, + "learning_rate": 0.00010739999999999998, + "loss": 6.3307, + "step": 716 + }, + { + "epoch": 0.06690305122702249, + "grad_norm": 1.4582507945779195, + "learning_rate": 0.00010754999999999999, + "loss": 6.5189, + "step": 717 + }, + { + "epoch": 0.06699636092189978, + "grad_norm": 2.1224475431233705, + "learning_rate": 0.00010769999999999999, + "loss": 6.5788, + "step": 718 + }, + { + "epoch": 0.06708967061677708, + "grad_norm": 1.7263709692933664, + "learning_rate": 0.00010784999999999999, + "loss": 6.6537, + "step": 719 + }, + { + "epoch": 0.06718298031165439, + "grad_norm": 1.6314403858953217, + "learning_rate": 0.00010799999999999998, + "loss": 6.5922, + "step": 720 + }, + { + "epoch": 0.06727629000653168, + "grad_norm": 1.9580610905571645, + "learning_rate": 0.00010814999999999999, + "loss": 6.8622, + "step": 721 + }, + { + "epoch": 0.06736959970140897, + "grad_norm": 7.597886557041851, + "learning_rate": 0.00010829999999999999, + "loss": 6.6262, + "step": 722 + }, + { + "epoch": 0.06746290939628627, + "grad_norm": 2.3296351028932443, + "learning_rate": 0.00010845, + "loss": 7.0, + "step": 723 + }, + { + "epoch": 0.06755621909116358, + "grad_norm": 1.2941224578938773, + "learning_rate": 0.00010859999999999998, + "loss": 6.455, + "step": 724 + }, + { + "epoch": 0.06764952878604087, + "grad_norm": 2.7161772190122986, + "learning_rate": 0.00010874999999999999, + "loss": 6.6654, + "step": 725 + }, + { + "epoch": 0.06774283848091817, + "grad_norm": 1.9492012686878817, + "learning_rate": 0.00010889999999999999, + "loss": 6.4019, + "step": 726 + }, + { + "epoch": 0.06783614817579546, + "grad_norm": 1.435652975808796, + "learning_rate": 0.00010904999999999998, + "loss": 6.5661, + "step": 727 + }, + { + "epoch": 0.06792945787067277, + "grad_norm": 1.6723963989766766, + "learning_rate": 0.00010919999999999998, + "loss": 6.5316, + "step": 728 + }, + { + "epoch": 0.06802276756555006, + "grad_norm": 1.978987039951302, + "learning_rate": 0.00010934999999999999, + "loss": 6.4446, + "step": 729 + }, + { + "epoch": 0.06811607726042736, + "grad_norm": 1.4883342433806623, + "learning_rate": 0.00010949999999999999, + "loss": 6.2584, + "step": 730 + }, + { + "epoch": 0.06820938695530465, + "grad_norm": 2.077000026011006, + "learning_rate": 0.00010964999999999998, + "loss": 6.6195, + "step": 731 + }, + { + "epoch": 0.06830269665018196, + "grad_norm": 1.8962458466451908, + "learning_rate": 0.00010979999999999999, + "loss": 6.3937, + "step": 732 + }, + { + "epoch": 0.06839600634505925, + "grad_norm": 4.265958287398598, + "learning_rate": 0.00010994999999999999, + "loss": 6.636, + "step": 733 + }, + { + "epoch": 0.06848931603993655, + "grad_norm": 1.4533620826160916, + "learning_rate": 0.00011009999999999999, + "loss": 6.6292, + "step": 734 + }, + { + "epoch": 0.06858262573481384, + "grad_norm": 1.2552344524216061, + "learning_rate": 0.00011024999999999998, + "loss": 6.4947, + "step": 735 + }, + { + "epoch": 0.06867593542969115, + "grad_norm": 1.6302730456594332, + "learning_rate": 0.00011039999999999999, + "loss": 6.1924, + "step": 736 + }, + { + "epoch": 0.06876924512456845, + "grad_norm": 1.364558079413236, + "learning_rate": 0.00011054999999999999, + "loss": 6.7826, + "step": 737 + }, + { + "epoch": 0.06886255481944574, + "grad_norm": 3.029565029464239, + "learning_rate": 0.0001107, + "loss": 6.3676, + "step": 738 + }, + { + "epoch": 0.06895586451432303, + "grad_norm": 2.1464448084760885, + "learning_rate": 0.00011084999999999998, + "loss": 6.5826, + "step": 739 + }, + { + "epoch": 0.06904917420920034, + "grad_norm": 5.1364261727179334, + "learning_rate": 0.00011099999999999999, + "loss": 6.3445, + "step": 740 + }, + { + "epoch": 0.06914248390407764, + "grad_norm": 1.558204268438497, + "learning_rate": 0.00011114999999999999, + "loss": 6.5912, + "step": 741 + }, + { + "epoch": 0.06923579359895493, + "grad_norm": 1.4374797114414386, + "learning_rate": 0.0001113, + "loss": 6.6011, + "step": 742 + }, + { + "epoch": 0.06932910329383223, + "grad_norm": 1.9914204224244103, + "learning_rate": 0.00011144999999999998, + "loss": 6.6804, + "step": 743 + }, + { + "epoch": 0.06942241298870953, + "grad_norm": 1.6603964086185306, + "learning_rate": 0.00011159999999999999, + "loss": 6.227, + "step": 744 + }, + { + "epoch": 0.06951572268358683, + "grad_norm": 1.6038423336246717, + "learning_rate": 0.00011174999999999999, + "loss": 6.6119, + "step": 745 + }, + { + "epoch": 0.06960903237846412, + "grad_norm": 1.5691499016640085, + "learning_rate": 0.0001119, + "loss": 6.5015, + "step": 746 + }, + { + "epoch": 0.06970234207334142, + "grad_norm": 2.8261978490018347, + "learning_rate": 0.00011204999999999999, + "loss": 6.4758, + "step": 747 + }, + { + "epoch": 0.06979565176821871, + "grad_norm": 1.6994873778068584, + "learning_rate": 0.00011219999999999999, + "loss": 6.5891, + "step": 748 + }, + { + "epoch": 0.06988896146309602, + "grad_norm": 1.5440620811963224, + "learning_rate": 0.00011235, + "loss": 6.5931, + "step": 749 + }, + { + "epoch": 0.06998227115797331, + "grad_norm": 1.8504686125019305, + "learning_rate": 0.0001125, + "loss": 6.6412, + "step": 750 + }, + { + "epoch": 0.07007558085285061, + "grad_norm": 1.4381572035876946, + "learning_rate": 0.00011264999999999999, + "loss": 6.4869, + "step": 751 + }, + { + "epoch": 0.0701688905477279, + "grad_norm": 1.6688917022750263, + "learning_rate": 0.00011279999999999999, + "loss": 6.4998, + "step": 752 + }, + { + "epoch": 0.07026220024260521, + "grad_norm": 3.0203201060817912, + "learning_rate": 0.00011295, + "loss": 6.4853, + "step": 753 + }, + { + "epoch": 0.0703555099374825, + "grad_norm": 1.3016122358805482, + "learning_rate": 0.00011309999999999998, + "loss": 6.7537, + "step": 754 + }, + { + "epoch": 0.0704488196323598, + "grad_norm": 1.6198310564295868, + "learning_rate": 0.00011324999999999999, + "loss": 6.4287, + "step": 755 + }, + { + "epoch": 0.0705421293272371, + "grad_norm": 1.8294026539492854, + "learning_rate": 0.00011339999999999999, + "loss": 6.3764, + "step": 756 + }, + { + "epoch": 0.0706354390221144, + "grad_norm": 6.199336810294012, + "learning_rate": 0.00011355, + "loss": 6.3479, + "step": 757 + }, + { + "epoch": 0.0707287487169917, + "grad_norm": 1.3117184483888389, + "learning_rate": 0.00011369999999999999, + "loss": 6.0451, + "step": 758 + }, + { + "epoch": 0.07082205841186899, + "grad_norm": 4.01055922965651, + "learning_rate": 0.00011384999999999999, + "loss": 6.2705, + "step": 759 + }, + { + "epoch": 0.07091536810674628, + "grad_norm": 1.5659381816766471, + "learning_rate": 0.00011399999999999999, + "loss": 6.2726, + "step": 760 + }, + { + "epoch": 0.0710086778016236, + "grad_norm": 1.3044362892134707, + "learning_rate": 0.00011415, + "loss": 6.2756, + "step": 761 + }, + { + "epoch": 0.07110198749650089, + "grad_norm": 1.577111310088106, + "learning_rate": 0.00011429999999999999, + "loss": 6.7698, + "step": 762 + }, + { + "epoch": 0.07119529719137818, + "grad_norm": 1.785348050498168, + "learning_rate": 0.00011444999999999999, + "loss": 6.4389, + "step": 763 + }, + { + "epoch": 0.07128860688625548, + "grad_norm": 1.857634084053082, + "learning_rate": 0.0001146, + "loss": 6.7614, + "step": 764 + }, + { + "epoch": 0.07138191658113278, + "grad_norm": 1.657776020790231, + "learning_rate": 0.00011475, + "loss": 6.6383, + "step": 765 + }, + { + "epoch": 0.07147522627601008, + "grad_norm": 1.671605732396042, + "learning_rate": 0.00011489999999999999, + "loss": 6.4047, + "step": 766 + }, + { + "epoch": 0.07156853597088737, + "grad_norm": 2.6219974672641975, + "learning_rate": 0.00011504999999999999, + "loss": 6.3977, + "step": 767 + }, + { + "epoch": 0.07166184566576467, + "grad_norm": 1.5255397872956102, + "learning_rate": 0.0001152, + "loss": 6.6928, + "step": 768 + }, + { + "epoch": 0.07175515536064198, + "grad_norm": 1.3868896151885182, + "learning_rate": 0.00011535, + "loss": 6.5113, + "step": 769 + }, + { + "epoch": 0.07184846505551927, + "grad_norm": 1.7987935649775342, + "learning_rate": 0.00011549999999999999, + "loss": 6.4583, + "step": 770 + }, + { + "epoch": 0.07194177475039656, + "grad_norm": 2.454324530910088, + "learning_rate": 0.00011564999999999999, + "loss": 6.5353, + "step": 771 + }, + { + "epoch": 0.07203508444527386, + "grad_norm": 1.454933370406595, + "learning_rate": 0.0001158, + "loss": 6.8396, + "step": 772 + }, + { + "epoch": 0.07212839414015117, + "grad_norm": 1.4491447718652852, + "learning_rate": 0.00011595, + "loss": 6.3384, + "step": 773 + }, + { + "epoch": 0.07222170383502846, + "grad_norm": 1.441477396737543, + "learning_rate": 0.00011609999999999999, + "loss": 6.5162, + "step": 774 + }, + { + "epoch": 0.07231501352990576, + "grad_norm": 2.71501062276025, + "learning_rate": 0.00011624999999999999, + "loss": 6.5383, + "step": 775 + }, + { + "epoch": 0.07240832322478305, + "grad_norm": 1.3383009020954946, + "learning_rate": 0.0001164, + "loss": 5.9623, + "step": 776 + }, + { + "epoch": 0.07250163291966036, + "grad_norm": 1.3769900288055394, + "learning_rate": 0.00011654999999999999, + "loss": 6.5685, + "step": 777 + }, + { + "epoch": 0.07259494261453765, + "grad_norm": 1.634094977585275, + "learning_rate": 0.00011669999999999999, + "loss": 6.7032, + "step": 778 + }, + { + "epoch": 0.07268825230941495, + "grad_norm": 3.110704097720863, + "learning_rate": 0.00011685, + "loss": 6.5637, + "step": 779 + }, + { + "epoch": 0.07278156200429224, + "grad_norm": 1.3146754376936263, + "learning_rate": 0.000117, + "loss": 6.2793, + "step": 780 + }, + { + "epoch": 0.07287487169916955, + "grad_norm": 1.6903574678249418, + "learning_rate": 0.00011714999999999999, + "loss": 6.3218, + "step": 781 + }, + { + "epoch": 0.07296818139404684, + "grad_norm": 1.3507490990527529, + "learning_rate": 0.00011729999999999999, + "loss": 6.3932, + "step": 782 + }, + { + "epoch": 0.07306149108892414, + "grad_norm": 1.264663609075453, + "learning_rate": 0.00011745, + "loss": 6.382, + "step": 783 + }, + { + "epoch": 0.07315480078380143, + "grad_norm": 2.0348433260466483, + "learning_rate": 0.0001176, + "loss": 5.9621, + "step": 784 + }, + { + "epoch": 0.07324811047867874, + "grad_norm": 1.3420852040952438, + "learning_rate": 0.00011774999999999999, + "loss": 6.0871, + "step": 785 + }, + { + "epoch": 0.07334142017355604, + "grad_norm": 1.9955948761758202, + "learning_rate": 0.00011789999999999999, + "loss": 6.2546, + "step": 786 + }, + { + "epoch": 0.07343472986843333, + "grad_norm": 1.48526296732338, + "learning_rate": 0.00011805, + "loss": 6.556, + "step": 787 + }, + { + "epoch": 0.07352803956331062, + "grad_norm": 1.4646425435108716, + "learning_rate": 0.0001182, + "loss": 6.2144, + "step": 788 + }, + { + "epoch": 0.07362134925818793, + "grad_norm": 2.3288342267336994, + "learning_rate": 0.00011834999999999999, + "loss": 6.7151, + "step": 789 + }, + { + "epoch": 0.07371465895306523, + "grad_norm": 1.1928326788106318, + "learning_rate": 0.0001185, + "loss": 6.7389, + "step": 790 + }, + { + "epoch": 0.07380796864794252, + "grad_norm": 1.5394067390611488, + "learning_rate": 0.00011865, + "loss": 6.6165, + "step": 791 + }, + { + "epoch": 0.07390127834281982, + "grad_norm": 1.8072715885103796, + "learning_rate": 0.0001188, + "loss": 6.5481, + "step": 792 + }, + { + "epoch": 0.07399458803769712, + "grad_norm": 2.0690417173828446, + "learning_rate": 0.00011894999999999999, + "loss": 6.1615, + "step": 793 + }, + { + "epoch": 0.07408789773257442, + "grad_norm": 1.302335945056349, + "learning_rate": 0.0001191, + "loss": 6.2416, + "step": 794 + }, + { + "epoch": 0.07418120742745171, + "grad_norm": 1.809316493357154, + "learning_rate": 0.00011925, + "loss": 6.6709, + "step": 795 + }, + { + "epoch": 0.074274517122329, + "grad_norm": 2.1392999567469677, + "learning_rate": 0.0001194, + "loss": 6.5538, + "step": 796 + }, + { + "epoch": 0.07436782681720631, + "grad_norm": 1.4971621326531626, + "learning_rate": 0.00011954999999999999, + "loss": 6.3269, + "step": 797 + }, + { + "epoch": 0.07446113651208361, + "grad_norm": 1.3800543728376613, + "learning_rate": 0.0001197, + "loss": 6.4261, + "step": 798 + }, + { + "epoch": 0.0745544462069609, + "grad_norm": 1.4626585606107014, + "learning_rate": 0.00011985, + "loss": 6.3114, + "step": 799 + }, + { + "epoch": 0.0746477559018382, + "grad_norm": 1.5259416009883968, + "learning_rate": 0.00011999999999999999, + "loss": 6.4931, + "step": 800 + }, + { + "epoch": 0.07474106559671549, + "grad_norm": 1.2086120709219725, + "learning_rate": 0.00012014999999999999, + "loss": 5.9856, + "step": 801 + }, + { + "epoch": 0.0748343752915928, + "grad_norm": 1.314493635252369, + "learning_rate": 0.0001203, + "loss": 6.4326, + "step": 802 + }, + { + "epoch": 0.0749276849864701, + "grad_norm": 1.602557544890895, + "learning_rate": 0.00012045, + "loss": 6.5268, + "step": 803 + }, + { + "epoch": 0.07502099468134739, + "grad_norm": 1.528057053636823, + "learning_rate": 0.00012059999999999999, + "loss": 6.6151, + "step": 804 + }, + { + "epoch": 0.07511430437622468, + "grad_norm": 1.490332112453394, + "learning_rate": 0.00012075, + "loss": 6.6427, + "step": 805 + }, + { + "epoch": 0.07520761407110199, + "grad_norm": 1.1653496785649047, + "learning_rate": 0.0001209, + "loss": 6.2561, + "step": 806 + }, + { + "epoch": 0.07530092376597929, + "grad_norm": 1.286154765907209, + "learning_rate": 0.00012105, + "loss": 6.4334, + "step": 807 + }, + { + "epoch": 0.07539423346085658, + "grad_norm": 1.3632044942693098, + "learning_rate": 0.00012119999999999999, + "loss": 6.5536, + "step": 808 + }, + { + "epoch": 0.07548754315573387, + "grad_norm": 1.4271117169656284, + "learning_rate": 0.00012135, + "loss": 6.6054, + "step": 809 + }, + { + "epoch": 0.07558085285061118, + "grad_norm": 1.2839249983261731, + "learning_rate": 0.0001215, + "loss": 6.3626, + "step": 810 + }, + { + "epoch": 0.07567416254548848, + "grad_norm": 1.541933471600715, + "learning_rate": 0.00012165, + "loss": 6.3726, + "step": 811 + }, + { + "epoch": 0.07576747224036577, + "grad_norm": 1.8753834781901908, + "learning_rate": 0.00012179999999999999, + "loss": 6.148, + "step": 812 + }, + { + "epoch": 0.07586078193524307, + "grad_norm": 1.5523710525105163, + "learning_rate": 0.00012194999999999998, + "loss": 5.9241, + "step": 813 + }, + { + "epoch": 0.07595409163012037, + "grad_norm": 2.0465576001530903, + "learning_rate": 0.00012209999999999999, + "loss": 6.2675, + "step": 814 + }, + { + "epoch": 0.07604740132499767, + "grad_norm": 1.39545090443235, + "learning_rate": 0.00012225, + "loss": 6.3698, + "step": 815 + }, + { + "epoch": 0.07614071101987496, + "grad_norm": 1.765991426790649, + "learning_rate": 0.0001224, + "loss": 6.6967, + "step": 816 + }, + { + "epoch": 0.07623402071475226, + "grad_norm": 1.3912020381360133, + "learning_rate": 0.00012254999999999997, + "loss": 6.5179, + "step": 817 + }, + { + "epoch": 0.07632733040962957, + "grad_norm": 1.9684819708323196, + "learning_rate": 0.00012269999999999997, + "loss": 7.023, + "step": 818 + }, + { + "epoch": 0.07642064010450686, + "grad_norm": 1.9929264302870657, + "learning_rate": 0.00012284999999999998, + "loss": 6.3762, + "step": 819 + }, + { + "epoch": 0.07651394979938415, + "grad_norm": 1.2520789055690282, + "learning_rate": 0.00012299999999999998, + "loss": 6.6276, + "step": 820 + }, + { + "epoch": 0.07660725949426145, + "grad_norm": 1.7886206063965775, + "learning_rate": 0.00012314999999999998, + "loss": 6.4958, + "step": 821 + }, + { + "epoch": 0.07670056918913876, + "grad_norm": 1.7745696019544202, + "learning_rate": 0.0001233, + "loss": 6.2438, + "step": 822 + }, + { + "epoch": 0.07679387888401605, + "grad_norm": 1.4626821058637358, + "learning_rate": 0.00012345, + "loss": 6.4945, + "step": 823 + }, + { + "epoch": 0.07688718857889335, + "grad_norm": 1.436284465595357, + "learning_rate": 0.0001236, + "loss": 6.5954, + "step": 824 + }, + { + "epoch": 0.07698049827377064, + "grad_norm": 1.311319335739438, + "learning_rate": 0.00012374999999999997, + "loss": 6.5057, + "step": 825 + }, + { + "epoch": 0.07707380796864795, + "grad_norm": 2.260615774077124, + "learning_rate": 0.00012389999999999998, + "loss": 6.5333, + "step": 826 + }, + { + "epoch": 0.07716711766352524, + "grad_norm": 1.9231373364675333, + "learning_rate": 0.00012404999999999998, + "loss": 6.1731, + "step": 827 + }, + { + "epoch": 0.07726042735840254, + "grad_norm": 1.3540092966777426, + "learning_rate": 0.00012419999999999998, + "loss": 6.5061, + "step": 828 + }, + { + "epoch": 0.07735373705327983, + "grad_norm": 1.783599420105879, + "learning_rate": 0.00012435, + "loss": 6.067, + "step": 829 + }, + { + "epoch": 0.07744704674815714, + "grad_norm": 2.1716604471126337, + "learning_rate": 0.0001245, + "loss": 6.3958, + "step": 830 + }, + { + "epoch": 0.07754035644303443, + "grad_norm": 1.4877179036880561, + "learning_rate": 0.00012465, + "loss": 6.1533, + "step": 831 + }, + { + "epoch": 0.07763366613791173, + "grad_norm": 1.4763859716505399, + "learning_rate": 0.00012479999999999997, + "loss": 6.5844, + "step": 832 + }, + { + "epoch": 0.07772697583278902, + "grad_norm": 1.52226921952858, + "learning_rate": 0.00012494999999999997, + "loss": 6.145, + "step": 833 + }, + { + "epoch": 0.07782028552766633, + "grad_norm": 1.133813877850951, + "learning_rate": 0.00012509999999999998, + "loss": 6.5732, + "step": 834 + }, + { + "epoch": 0.07791359522254362, + "grad_norm": 1.5309902385676748, + "learning_rate": 0.00012524999999999998, + "loss": 6.4882, + "step": 835 + }, + { + "epoch": 0.07800690491742092, + "grad_norm": 1.5685963284992166, + "learning_rate": 0.00012539999999999999, + "loss": 6.3626, + "step": 836 + }, + { + "epoch": 0.07810021461229821, + "grad_norm": 1.392075823711747, + "learning_rate": 0.00012555, + "loss": 6.2344, + "step": 837 + }, + { + "epoch": 0.07819352430717552, + "grad_norm": 1.3669145161225762, + "learning_rate": 0.0001257, + "loss": 6.5032, + "step": 838 + }, + { + "epoch": 0.07828683400205282, + "grad_norm": 1.4723472068917287, + "learning_rate": 0.00012585, + "loss": 6.459, + "step": 839 + }, + { + "epoch": 0.07838014369693011, + "grad_norm": 1.4733952660779783, + "learning_rate": 0.00012599999999999997, + "loss": 6.0339, + "step": 840 + }, + { + "epoch": 0.0784734533918074, + "grad_norm": 1.2452293549502669, + "learning_rate": 0.00012614999999999998, + "loss": 6.4707, + "step": 841 + }, + { + "epoch": 0.07856676308668471, + "grad_norm": 2.0853593052937613, + "learning_rate": 0.00012629999999999998, + "loss": 6.4846, + "step": 842 + }, + { + "epoch": 0.07866007278156201, + "grad_norm": 2.0963686618505792, + "learning_rate": 0.00012644999999999998, + "loss": 6.7074, + "step": 843 + }, + { + "epoch": 0.0787533824764393, + "grad_norm": 1.261901165452889, + "learning_rate": 0.0001266, + "loss": 6.386, + "step": 844 + }, + { + "epoch": 0.0788466921713166, + "grad_norm": 1.60684648844085, + "learning_rate": 0.00012675, + "loss": 6.5122, + "step": 845 + }, + { + "epoch": 0.0789400018661939, + "grad_norm": 1.306448415194855, + "learning_rate": 0.0001269, + "loss": 6.5601, + "step": 846 + }, + { + "epoch": 0.0790333115610712, + "grad_norm": 1.258290281167767, + "learning_rate": 0.00012705, + "loss": 6.5686, + "step": 847 + }, + { + "epoch": 0.07912662125594849, + "grad_norm": 1.1924806950241387, + "learning_rate": 0.00012719999999999997, + "loss": 6.6892, + "step": 848 + }, + { + "epoch": 0.07921993095082579, + "grad_norm": 1.578433000914274, + "learning_rate": 0.00012734999999999998, + "loss": 6.3202, + "step": 849 + }, + { + "epoch": 0.0793132406457031, + "grad_norm": 1.5173584055416625, + "learning_rate": 0.00012749999999999998, + "loss": 6.3004, + "step": 850 + }, + { + "epoch": 0.07940655034058039, + "grad_norm": 1.2089281178733993, + "learning_rate": 0.00012764999999999999, + "loss": 6.2056, + "step": 851 + }, + { + "epoch": 0.07949986003545768, + "grad_norm": 1.7904552801402367, + "learning_rate": 0.0001278, + "loss": 6.0052, + "step": 852 + }, + { + "epoch": 0.07959316973033498, + "grad_norm": 1.3626376637705573, + "learning_rate": 0.00012795, + "loss": 6.3024, + "step": 853 + }, + { + "epoch": 0.07968647942521227, + "grad_norm": 1.727547111359358, + "learning_rate": 0.0001281, + "loss": 6.2193, + "step": 854 + }, + { + "epoch": 0.07977978912008958, + "grad_norm": 1.3659949841777652, + "learning_rate": 0.00012824999999999997, + "loss": 6.6175, + "step": 855 + }, + { + "epoch": 0.07987309881496688, + "grad_norm": 1.460171421408208, + "learning_rate": 0.00012839999999999998, + "loss": 6.6644, + "step": 856 + }, + { + "epoch": 0.07996640850984417, + "grad_norm": 1.4278678055778216, + "learning_rate": 0.00012854999999999998, + "loss": 6.3026, + "step": 857 + }, + { + "epoch": 0.08005971820472146, + "grad_norm": 1.2537445267021143, + "learning_rate": 0.00012869999999999998, + "loss": 6.4224, + "step": 858 + }, + { + "epoch": 0.08015302789959877, + "grad_norm": 1.128645390026562, + "learning_rate": 0.00012885, + "loss": 6.5839, + "step": 859 + }, + { + "epoch": 0.08024633759447607, + "grad_norm": 1.565540114658358, + "learning_rate": 0.000129, + "loss": 6.4916, + "step": 860 + }, + { + "epoch": 0.08033964728935336, + "grad_norm": 1.4074479113637735, + "learning_rate": 0.00012915, + "loss": 6.5635, + "step": 861 + }, + { + "epoch": 0.08043295698423066, + "grad_norm": 1.4643830107336648, + "learning_rate": 0.0001293, + "loss": 6.024, + "step": 862 + }, + { + "epoch": 0.08052626667910796, + "grad_norm": 1.2035920412261496, + "learning_rate": 0.00012944999999999998, + "loss": 6.2241, + "step": 863 + }, + { + "epoch": 0.08061957637398526, + "grad_norm": 1.2677019384241537, + "learning_rate": 0.00012959999999999998, + "loss": 6.3387, + "step": 864 + }, + { + "epoch": 0.08071288606886255, + "grad_norm": 1.333113632776727, + "learning_rate": 0.00012974999999999998, + "loss": 6.1753, + "step": 865 + }, + { + "epoch": 0.08080619576373985, + "grad_norm": 1.536256385411371, + "learning_rate": 0.00012989999999999999, + "loss": 6.4208, + "step": 866 + }, + { + "epoch": 0.08089950545861715, + "grad_norm": 1.4045378553234138, + "learning_rate": 0.00013005, + "loss": 6.4159, + "step": 867 + }, + { + "epoch": 0.08099281515349445, + "grad_norm": 1.8742788943822597, + "learning_rate": 0.0001302, + "loss": 6.5707, + "step": 868 + }, + { + "epoch": 0.08108612484837174, + "grad_norm": 1.3594614417733126, + "learning_rate": 0.00013035, + "loss": 5.9648, + "step": 869 + }, + { + "epoch": 0.08117943454324904, + "grad_norm": 1.1977466273516046, + "learning_rate": 0.0001305, + "loss": 6.3257, + "step": 870 + }, + { + "epoch": 0.08127274423812635, + "grad_norm": 1.2520254815806786, + "learning_rate": 0.00013064999999999998, + "loss": 6.5817, + "step": 871 + }, + { + "epoch": 0.08136605393300364, + "grad_norm": 1.3959328284060684, + "learning_rate": 0.00013079999999999998, + "loss": 5.9216, + "step": 872 + }, + { + "epoch": 0.08145936362788093, + "grad_norm": 1.655866713366638, + "learning_rate": 0.00013094999999999998, + "loss": 6.2147, + "step": 873 + }, + { + "epoch": 0.08155267332275823, + "grad_norm": 1.3609645336655705, + "learning_rate": 0.0001311, + "loss": 5.9097, + "step": 874 + }, + { + "epoch": 0.08164598301763554, + "grad_norm": 1.215434028537456, + "learning_rate": 0.00013125, + "loss": 6.1597, + "step": 875 + }, + { + "epoch": 0.08173929271251283, + "grad_norm": 1.6102165166625213, + "learning_rate": 0.0001314, + "loss": 6.4886, + "step": 876 + }, + { + "epoch": 0.08183260240739013, + "grad_norm": 1.3274128920555104, + "learning_rate": 0.00013155, + "loss": 6.2954, + "step": 877 + }, + { + "epoch": 0.08192591210226742, + "grad_norm": 1.3189749013446928, + "learning_rate": 0.00013169999999999998, + "loss": 5.9429, + "step": 878 + }, + { + "epoch": 0.08201922179714473, + "grad_norm": 1.2113569325314453, + "learning_rate": 0.00013184999999999998, + "loss": 6.6613, + "step": 879 + }, + { + "epoch": 0.08211253149202202, + "grad_norm": 1.1770708777942642, + "learning_rate": 0.00013199999999999998, + "loss": 6.0107, + "step": 880 + }, + { + "epoch": 0.08220584118689932, + "grad_norm": 1.2390950121152433, + "learning_rate": 0.00013215, + "loss": 6.2766, + "step": 881 + }, + { + "epoch": 0.08229915088177661, + "grad_norm": 1.5449713030496548, + "learning_rate": 0.0001323, + "loss": 6.3767, + "step": 882 + }, + { + "epoch": 0.08239246057665392, + "grad_norm": 1.220795567497897, + "learning_rate": 0.00013245, + "loss": 6.655, + "step": 883 + }, + { + "epoch": 0.08248577027153121, + "grad_norm": 1.4036718541951927, + "learning_rate": 0.0001326, + "loss": 6.2578, + "step": 884 + }, + { + "epoch": 0.08257907996640851, + "grad_norm": 1.3070238899218913, + "learning_rate": 0.00013275, + "loss": 6.3598, + "step": 885 + }, + { + "epoch": 0.0826723896612858, + "grad_norm": 2.0027334359051463, + "learning_rate": 0.00013289999999999998, + "loss": 6.2107, + "step": 886 + }, + { + "epoch": 0.08276569935616311, + "grad_norm": 1.814792439140556, + "learning_rate": 0.00013304999999999998, + "loss": 6.3501, + "step": 887 + }, + { + "epoch": 0.0828590090510404, + "grad_norm": 1.9838018800142971, + "learning_rate": 0.00013319999999999999, + "loss": 6.4398, + "step": 888 + }, + { + "epoch": 0.0829523187459177, + "grad_norm": 1.677314446003066, + "learning_rate": 0.00013335, + "loss": 6.5261, + "step": 889 + }, + { + "epoch": 0.083045628440795, + "grad_norm": 1.5019955692917817, + "learning_rate": 0.0001335, + "loss": 6.1962, + "step": 890 + }, + { + "epoch": 0.0831389381356723, + "grad_norm": 2.365426283753135, + "learning_rate": 0.00013365, + "loss": 6.5733, + "step": 891 + }, + { + "epoch": 0.0832322478305496, + "grad_norm": 1.7198035091933261, + "learning_rate": 0.0001338, + "loss": 6.0229, + "step": 892 + }, + { + "epoch": 0.08332555752542689, + "grad_norm": 1.4732368023190767, + "learning_rate": 0.00013395, + "loss": 6.1801, + "step": 893 + }, + { + "epoch": 0.08341886722030419, + "grad_norm": 1.2121457110937943, + "learning_rate": 0.00013409999999999998, + "loss": 6.3904, + "step": 894 + }, + { + "epoch": 0.0835121769151815, + "grad_norm": 1.7778683601078973, + "learning_rate": 0.00013424999999999998, + "loss": 6.3549, + "step": 895 + }, + { + "epoch": 0.08360548661005879, + "grad_norm": 1.5763036778328892, + "learning_rate": 0.0001344, + "loss": 6.6478, + "step": 896 + }, + { + "epoch": 0.08369879630493608, + "grad_norm": 1.8630946284374774, + "learning_rate": 0.00013455, + "loss": 6.6245, + "step": 897 + }, + { + "epoch": 0.08379210599981338, + "grad_norm": 1.4030167603307593, + "learning_rate": 0.0001347, + "loss": 6.6949, + "step": 898 + }, + { + "epoch": 0.08388541569469068, + "grad_norm": 1.5745389472767464, + "learning_rate": 0.00013485, + "loss": 6.3891, + "step": 899 + }, + { + "epoch": 0.08397872538956798, + "grad_norm": 2.356312103594305, + "learning_rate": 0.000135, + "loss": 6.268, + "step": 900 + }, + { + "epoch": 0.08407203508444527, + "grad_norm": 1.451270728528241, + "learning_rate": 0.00013514999999999998, + "loss": 6.3188, + "step": 901 + }, + { + "epoch": 0.08416534477932257, + "grad_norm": 1.7229795408520099, + "learning_rate": 0.00013529999999999998, + "loss": 6.228, + "step": 902 + }, + { + "epoch": 0.08425865447419988, + "grad_norm": 2.9015200305497486, + "learning_rate": 0.00013544999999999999, + "loss": 6.4785, + "step": 903 + }, + { + "epoch": 0.08435196416907717, + "grad_norm": 2.138461735192999, + "learning_rate": 0.0001356, + "loss": 6.4512, + "step": 904 + }, + { + "epoch": 0.08444527386395446, + "grad_norm": 1.220169162264846, + "learning_rate": 0.00013575, + "loss": 6.1107, + "step": 905 + }, + { + "epoch": 0.08453858355883176, + "grad_norm": 1.6354540867456915, + "learning_rate": 0.0001359, + "loss": 6.413, + "step": 906 + }, + { + "epoch": 0.08463189325370905, + "grad_norm": 1.88444846667468, + "learning_rate": 0.00013605, + "loss": 6.4786, + "step": 907 + }, + { + "epoch": 0.08472520294858636, + "grad_norm": 1.2386755954298523, + "learning_rate": 0.0001362, + "loss": 6.427, + "step": 908 + }, + { + "epoch": 0.08481851264346366, + "grad_norm": 1.1620455988809937, + "learning_rate": 0.00013634999999999998, + "loss": 6.4063, + "step": 909 + }, + { + "epoch": 0.08491182233834095, + "grad_norm": 1.6270742422426776, + "learning_rate": 0.00013649999999999998, + "loss": 6.362, + "step": 910 + }, + { + "epoch": 0.08500513203321824, + "grad_norm": 1.1566566461925498, + "learning_rate": 0.00013665, + "loss": 6.3839, + "step": 911 + }, + { + "epoch": 0.08509844172809555, + "grad_norm": 23.80243571877916, + "learning_rate": 0.0001368, + "loss": 6.1715, + "step": 912 + }, + { + "epoch": 0.08519175142297285, + "grad_norm": 1.7032969545955414, + "learning_rate": 0.00013695, + "loss": 6.269, + "step": 913 + }, + { + "epoch": 0.08528506111785014, + "grad_norm": 7.5149990875914545, + "learning_rate": 0.0001371, + "loss": 6.3265, + "step": 914 + }, + { + "epoch": 0.08537837081272744, + "grad_norm": 1.4956572871962561, + "learning_rate": 0.00013725, + "loss": 6.4407, + "step": 915 + }, + { + "epoch": 0.08547168050760474, + "grad_norm": 4.56724335119866, + "learning_rate": 0.0001374, + "loss": 6.5709, + "step": 916 + }, + { + "epoch": 0.08556499020248204, + "grad_norm": 1.1175636481211777, + "learning_rate": 0.00013754999999999998, + "loss": 6.3753, + "step": 917 + }, + { + "epoch": 0.08565829989735933, + "grad_norm": 1.270253185657069, + "learning_rate": 0.00013769999999999999, + "loss": 6.4776, + "step": 918 + }, + { + "epoch": 0.08575160959223663, + "grad_norm": 11.570404176796469, + "learning_rate": 0.00013785, + "loss": 6.5986, + "step": 919 + }, + { + "epoch": 0.08584491928711394, + "grad_norm": 1.4838305867885069, + "learning_rate": 0.000138, + "loss": 6.7017, + "step": 920 + }, + { + "epoch": 0.08593822898199123, + "grad_norm": 2.6302113305436006, + "learning_rate": 0.00013815, + "loss": 6.1249, + "step": 921 + }, + { + "epoch": 0.08603153867686852, + "grad_norm": 63.31786395403053, + "learning_rate": 0.0001383, + "loss": 6.8973, + "step": 922 + }, + { + "epoch": 0.08612484837174582, + "grad_norm": 3.3309119200329347, + "learning_rate": 0.00013845, + "loss": 6.2643, + "step": 923 + }, + { + "epoch": 0.08621815806662313, + "grad_norm": 3.0782341720349238, + "learning_rate": 0.0001386, + "loss": 6.5003, + "step": 924 + }, + { + "epoch": 0.08631146776150042, + "grad_norm": 4.669419777258113, + "learning_rate": 0.00013874999999999998, + "loss": 6.6943, + "step": 925 + }, + { + "epoch": 0.08640477745637772, + "grad_norm": 2.7682037014186074, + "learning_rate": 0.0001389, + "loss": 6.7674, + "step": 926 + }, + { + "epoch": 0.08649808715125501, + "grad_norm": 6.199266510756549, + "learning_rate": 0.00013905, + "loss": 6.2648, + "step": 927 + }, + { + "epoch": 0.08659139684613232, + "grad_norm": 2.8317808788832193, + "learning_rate": 0.0001392, + "loss": 6.0558, + "step": 928 + }, + { + "epoch": 0.08668470654100961, + "grad_norm": 2.490271817352842, + "learning_rate": 0.00013935, + "loss": 6.6224, + "step": 929 + }, + { + "epoch": 0.0867780162358869, + "grad_norm": 2.49346863792448, + "learning_rate": 0.0001395, + "loss": 6.363, + "step": 930 + }, + { + "epoch": 0.0868713259307642, + "grad_norm": 3.528572427822931, + "learning_rate": 0.00013965, + "loss": 6.5706, + "step": 931 + }, + { + "epoch": 0.08696463562564151, + "grad_norm": 1.6371208400081982, + "learning_rate": 0.00013979999999999998, + "loss": 6.7689, + "step": 932 + }, + { + "epoch": 0.0870579453205188, + "grad_norm": 2.1113160950833647, + "learning_rate": 0.00013995, + "loss": 6.7192, + "step": 933 + }, + { + "epoch": 0.0871512550153961, + "grad_norm": 2.5010322455924205, + "learning_rate": 0.0001401, + "loss": 6.6158, + "step": 934 + }, + { + "epoch": 0.08724456471027339, + "grad_norm": 1.6290578832947726, + "learning_rate": 0.00014025, + "loss": 6.1582, + "step": 935 + }, + { + "epoch": 0.0873378744051507, + "grad_norm": 1.6464821318639253, + "learning_rate": 0.0001404, + "loss": 6.506, + "step": 936 + }, + { + "epoch": 0.087431184100028, + "grad_norm": 1.6725842039107484, + "learning_rate": 0.00014055, + "loss": 6.3388, + "step": 937 + }, + { + "epoch": 0.08752449379490529, + "grad_norm": 1.8544415637782476, + "learning_rate": 0.00014069999999999998, + "loss": 6.1745, + "step": 938 + }, + { + "epoch": 0.08761780348978258, + "grad_norm": 1.825442204529072, + "learning_rate": 0.00014084999999999998, + "loss": 6.3804, + "step": 939 + }, + { + "epoch": 0.08771111318465989, + "grad_norm": 2.4748947576292735, + "learning_rate": 0.00014099999999999998, + "loss": 6.4673, + "step": 940 + }, + { + "epoch": 0.08780442287953719, + "grad_norm": 1.739788720423207, + "learning_rate": 0.00014115, + "loss": 5.9149, + "step": 941 + }, + { + "epoch": 0.08789773257441448, + "grad_norm": 1.661904508327584, + "learning_rate": 0.0001413, + "loss": 6.2541, + "step": 942 + }, + { + "epoch": 0.08799104226929177, + "grad_norm": 1.4339189646775752, + "learning_rate": 0.00014144999999999997, + "loss": 6.104, + "step": 943 + }, + { + "epoch": 0.08808435196416908, + "grad_norm": 1.3310638891179236, + "learning_rate": 0.00014159999999999997, + "loss": 6.117, + "step": 944 + }, + { + "epoch": 0.08817766165904638, + "grad_norm": 1.604040529371643, + "learning_rate": 0.00014174999999999998, + "loss": 6.5598, + "step": 945 + }, + { + "epoch": 0.08827097135392367, + "grad_norm": 2.3474384766850123, + "learning_rate": 0.00014189999999999998, + "loss": 6.3398, + "step": 946 + }, + { + "epoch": 0.08836428104880097, + "grad_norm": 1.1718557188703085, + "learning_rate": 0.00014204999999999998, + "loss": 6.5092, + "step": 947 + }, + { + "epoch": 0.08845759074367827, + "grad_norm": 1.367533849771808, + "learning_rate": 0.0001422, + "loss": 6.201, + "step": 948 + }, + { + "epoch": 0.08855090043855557, + "grad_norm": 1.2802430097316349, + "learning_rate": 0.00014235, + "loss": 5.9876, + "step": 949 + }, + { + "epoch": 0.08864421013343286, + "grad_norm": 1.954012661508481, + "learning_rate": 0.0001425, + "loss": 6.1548, + "step": 950 + }, + { + "epoch": 0.08873751982831016, + "grad_norm": 1.2828378101632358, + "learning_rate": 0.00014264999999999997, + "loss": 6.4549, + "step": 951 + }, + { + "epoch": 0.08883082952318747, + "grad_norm": 1.5951452571918863, + "learning_rate": 0.00014279999999999997, + "loss": 6.4509, + "step": 952 + }, + { + "epoch": 0.08892413921806476, + "grad_norm": 1.552101387513093, + "learning_rate": 0.00014294999999999998, + "loss": 6.5319, + "step": 953 + }, + { + "epoch": 0.08901744891294205, + "grad_norm": 2.1034477088654433, + "learning_rate": 0.00014309999999999998, + "loss": 6.0116, + "step": 954 + }, + { + "epoch": 0.08911075860781935, + "grad_norm": 1.832757629789991, + "learning_rate": 0.00014324999999999999, + "loss": 6.2426, + "step": 955 + }, + { + "epoch": 0.08920406830269666, + "grad_norm": 1.2808378886229967, + "learning_rate": 0.0001434, + "loss": 6.3245, + "step": 956 + }, + { + "epoch": 0.08929737799757395, + "grad_norm": 2.5521703780702127, + "learning_rate": 0.00014355, + "loss": 6.5533, + "step": 957 + }, + { + "epoch": 0.08939068769245125, + "grad_norm": 2.439984876863113, + "learning_rate": 0.00014369999999999997, + "loss": 6.319, + "step": 958 + }, + { + "epoch": 0.08948399738732854, + "grad_norm": 1.5115846079800872, + "learning_rate": 0.00014384999999999997, + "loss": 6.2533, + "step": 959 + }, + { + "epoch": 0.08957730708220583, + "grad_norm": 3.246506794954226, + "learning_rate": 0.00014399999999999998, + "loss": 6.1705, + "step": 960 + }, + { + "epoch": 0.08967061677708314, + "grad_norm": 1.3200696908404492, + "learning_rate": 0.00014414999999999998, + "loss": 5.9965, + "step": 961 + }, + { + "epoch": 0.08976392647196044, + "grad_norm": 1.2135237043317766, + "learning_rate": 0.00014429999999999998, + "loss": 5.8471, + "step": 962 + }, + { + "epoch": 0.08985723616683773, + "grad_norm": 2.0689050522913757, + "learning_rate": 0.00014445, + "loss": 6.5263, + "step": 963 + }, + { + "epoch": 0.08995054586171503, + "grad_norm": 1.5487592898209372, + "learning_rate": 0.0001446, + "loss": 6.4936, + "step": 964 + }, + { + "epoch": 0.09004385555659233, + "grad_norm": 1.3474264295519107, + "learning_rate": 0.00014475, + "loss": 6.1078, + "step": 965 + }, + { + "epoch": 0.09013716525146963, + "grad_norm": 2.0639440517785816, + "learning_rate": 0.00014489999999999997, + "loss": 6.5453, + "step": 966 + }, + { + "epoch": 0.09023047494634692, + "grad_norm": 1.4430569827972595, + "learning_rate": 0.00014504999999999997, + "loss": 6.3341, + "step": 967 + }, + { + "epoch": 0.09032378464122422, + "grad_norm": 1.301028286679285, + "learning_rate": 0.00014519999999999998, + "loss": 6.5885, + "step": 968 + }, + { + "epoch": 0.09041709433610153, + "grad_norm": 2.180590337228208, + "learning_rate": 0.00014534999999999998, + "loss": 6.2098, + "step": 969 + }, + { + "epoch": 0.09051040403097882, + "grad_norm": 1.1148306775128265, + "learning_rate": 0.00014549999999999999, + "loss": 6.0658, + "step": 970 + }, + { + "epoch": 0.09060371372585611, + "grad_norm": 1.2202108646949439, + "learning_rate": 0.00014565, + "loss": 6.5256, + "step": 971 + }, + { + "epoch": 0.09069702342073341, + "grad_norm": 1.8297936222386277, + "learning_rate": 0.0001458, + "loss": 6.5596, + "step": 972 + }, + { + "epoch": 0.09079033311561072, + "grad_norm": 1.2465187841638048, + "learning_rate": 0.00014595, + "loss": 6.4058, + "step": 973 + }, + { + "epoch": 0.09088364281048801, + "grad_norm": 1.211875199964932, + "learning_rate": 0.00014609999999999997, + "loss": 6.2203, + "step": 974 + }, + { + "epoch": 0.0909769525053653, + "grad_norm": 1.494883978245697, + "learning_rate": 0.00014624999999999998, + "loss": 6.1276, + "step": 975 + }, + { + "epoch": 0.0910702622002426, + "grad_norm": 1.402376957643712, + "learning_rate": 0.00014639999999999998, + "loss": 6.2259, + "step": 976 + }, + { + "epoch": 0.09116357189511991, + "grad_norm": 1.7938642472006994, + "learning_rate": 0.00014654999999999998, + "loss": 5.7707, + "step": 977 + }, + { + "epoch": 0.0912568815899972, + "grad_norm": 1.4344945459967329, + "learning_rate": 0.0001467, + "loss": 6.4725, + "step": 978 + }, + { + "epoch": 0.0913501912848745, + "grad_norm": 1.835272402438821, + "learning_rate": 0.00014685, + "loss": 6.2807, + "step": 979 + }, + { + "epoch": 0.09144350097975179, + "grad_norm": 1.300360900968715, + "learning_rate": 0.000147, + "loss": 6.271, + "step": 980 + }, + { + "epoch": 0.0915368106746291, + "grad_norm": 2.037112946334879, + "learning_rate": 0.00014714999999999997, + "loss": 5.6927, + "step": 981 + }, + { + "epoch": 0.0916301203695064, + "grad_norm": 1.327668189145624, + "learning_rate": 0.00014729999999999998, + "loss": 6.2316, + "step": 982 + }, + { + "epoch": 0.09172343006438369, + "grad_norm": 1.2034996092827093, + "learning_rate": 0.00014744999999999998, + "loss": 6.4422, + "step": 983 + }, + { + "epoch": 0.09181673975926098, + "grad_norm": 1.3276980250479151, + "learning_rate": 0.00014759999999999998, + "loss": 5.9573, + "step": 984 + }, + { + "epoch": 0.09191004945413829, + "grad_norm": 1.366399351355341, + "learning_rate": 0.00014774999999999999, + "loss": 6.4088, + "step": 985 + }, + { + "epoch": 0.09200335914901558, + "grad_norm": 1.1959363206070643, + "learning_rate": 0.0001479, + "loss": 6.323, + "step": 986 + }, + { + "epoch": 0.09209666884389288, + "grad_norm": 1.2045853979917045, + "learning_rate": 0.00014805, + "loss": 6.3844, + "step": 987 + }, + { + "epoch": 0.09218997853877017, + "grad_norm": 1.1486735231649656, + "learning_rate": 0.0001482, + "loss": 5.8419, + "step": 988 + }, + { + "epoch": 0.09228328823364748, + "grad_norm": 1.3986351583792818, + "learning_rate": 0.00014834999999999997, + "loss": 6.0956, + "step": 989 + }, + { + "epoch": 0.09237659792852478, + "grad_norm": 1.456910599496067, + "learning_rate": 0.00014849999999999998, + "loss": 6.4848, + "step": 990 + }, + { + "epoch": 0.09246990762340207, + "grad_norm": 1.1452903926123608, + "learning_rate": 0.00014864999999999998, + "loss": 6.0661, + "step": 991 + }, + { + "epoch": 0.09256321731827936, + "grad_norm": 1.254359818607674, + "learning_rate": 0.00014879999999999998, + "loss": 6.222, + "step": 992 + }, + { + "epoch": 0.09265652701315667, + "grad_norm": 1.1486771159723588, + "learning_rate": 0.00014895, + "loss": 6.583, + "step": 993 + }, + { + "epoch": 0.09274983670803397, + "grad_norm": 1.9511531951774208, + "learning_rate": 0.0001491, + "loss": 6.3879, + "step": 994 + }, + { + "epoch": 0.09284314640291126, + "grad_norm": 1.6842041399733716, + "learning_rate": 0.00014925, + "loss": 6.2931, + "step": 995 + }, + { + "epoch": 0.09293645609778856, + "grad_norm": 1.1805433246116188, + "learning_rate": 0.0001494, + "loss": 6.1143, + "step": 996 + }, + { + "epoch": 0.09302976579266586, + "grad_norm": 1.615631057927885, + "learning_rate": 0.00014954999999999998, + "loss": 6.7117, + "step": 997 + }, + { + "epoch": 0.09312307548754316, + "grad_norm": 1.933037846211515, + "learning_rate": 0.00014969999999999998, + "loss": 6.271, + "step": 998 + }, + { + "epoch": 0.09321638518242045, + "grad_norm": 1.4732388225828184, + "learning_rate": 0.00014984999999999998, + "loss": 6.3458, + "step": 999 + }, + { + "epoch": 0.09330969487729775, + "grad_norm": 2.798657262902148, + "learning_rate": 0.00015, + "loss": 5.8935, + "step": 1000 + }, + { + "epoch": 0.09340300457217506, + "grad_norm": 7.028429142744828, + "learning_rate": 0.00015014999999999996, + "loss": 6.4806, + "step": 1001 + }, + { + "epoch": 0.09349631426705235, + "grad_norm": 1.5224086775246597, + "learning_rate": 0.0001503, + "loss": 6.2031, + "step": 1002 + }, + { + "epoch": 0.09358962396192964, + "grad_norm": 3.5133025374760436, + "learning_rate": 0.00015044999999999997, + "loss": 6.0845, + "step": 1003 + }, + { + "epoch": 0.09368293365680694, + "grad_norm": 1.594802349125917, + "learning_rate": 0.00015059999999999997, + "loss": 6.0197, + "step": 1004 + }, + { + "epoch": 0.09377624335168425, + "grad_norm": 1.7528873012769384, + "learning_rate": 0.00015074999999999998, + "loss": 6.535, + "step": 1005 + }, + { + "epoch": 0.09386955304656154, + "grad_norm": 1.8533665799453014, + "learning_rate": 0.00015089999999999998, + "loss": 6.1578, + "step": 1006 + }, + { + "epoch": 0.09396286274143884, + "grad_norm": 1.8495367528396955, + "learning_rate": 0.00015104999999999996, + "loss": 6.5655, + "step": 1007 + }, + { + "epoch": 0.09405617243631613, + "grad_norm": 1.4903041304641782, + "learning_rate": 0.0001512, + "loss": 6.6563, + "step": 1008 + }, + { + "epoch": 0.09414948213119344, + "grad_norm": 1.25092896260294, + "learning_rate": 0.00015134999999999997, + "loss": 6.2775, + "step": 1009 + }, + { + "epoch": 0.09424279182607073, + "grad_norm": 1.3379831523032375, + "learning_rate": 0.0001515, + "loss": 6.6551, + "step": 1010 + }, + { + "epoch": 0.09433610152094803, + "grad_norm": 1.469064293408222, + "learning_rate": 0.00015164999999999997, + "loss": 6.3741, + "step": 1011 + }, + { + "epoch": 0.09442941121582532, + "grad_norm": 1.2322405640143446, + "learning_rate": 0.00015179999999999998, + "loss": 6.3636, + "step": 1012 + }, + { + "epoch": 0.09452272091070263, + "grad_norm": 3.3105859622644815, + "learning_rate": 0.00015194999999999998, + "loss": 6.042, + "step": 1013 + }, + { + "epoch": 0.09461603060557992, + "grad_norm": 2.2875905897001165, + "learning_rate": 0.00015209999999999998, + "loss": 5.921, + "step": 1014 + }, + { + "epoch": 0.09470934030045722, + "grad_norm": 1.9059698033762926, + "learning_rate": 0.00015224999999999996, + "loss": 6.548, + "step": 1015 + }, + { + "epoch": 0.09480264999533451, + "grad_norm": 1.3968534499292178, + "learning_rate": 0.0001524, + "loss": 6.6674, + "step": 1016 + }, + { + "epoch": 0.0948959596902118, + "grad_norm": 1.2520784535121507, + "learning_rate": 0.00015254999999999997, + "loss": 6.1905, + "step": 1017 + }, + { + "epoch": 0.09498926938508911, + "grad_norm": 1.2937188283015384, + "learning_rate": 0.0001527, + "loss": 6.2381, + "step": 1018 + }, + { + "epoch": 0.09508257907996641, + "grad_norm": 1.3708759046819783, + "learning_rate": 0.00015284999999999997, + "loss": 6.1876, + "step": 1019 + }, + { + "epoch": 0.0951758887748437, + "grad_norm": 4.113008382375953, + "learning_rate": 0.00015299999999999998, + "loss": 6.4466, + "step": 1020 + }, + { + "epoch": 0.095269198469721, + "grad_norm": 1.4747663310640629, + "learning_rate": 0.00015314999999999998, + "loss": 5.8638, + "step": 1021 + }, + { + "epoch": 0.0953625081645983, + "grad_norm": 1.5657146086631801, + "learning_rate": 0.00015329999999999999, + "loss": 6.1283, + "step": 1022 + }, + { + "epoch": 0.0954558178594756, + "grad_norm": 1.3369291270702854, + "learning_rate": 0.00015344999999999996, + "loss": 5.8254, + "step": 1023 + }, + { + "epoch": 0.0955491275543529, + "grad_norm": 1.4352342283908985, + "learning_rate": 0.0001536, + "loss": 5.9661, + "step": 1024 + }, + { + "epoch": 0.09564243724923019, + "grad_norm": 1.3727060465483532, + "learning_rate": 0.00015374999999999997, + "loss": 6.2905, + "step": 1025 + }, + { + "epoch": 0.0957357469441075, + "grad_norm": 1.8448348469334164, + "learning_rate": 0.0001539, + "loss": 6.7535, + "step": 1026 + }, + { + "epoch": 0.09582905663898479, + "grad_norm": 1.540463285733382, + "learning_rate": 0.00015404999999999998, + "loss": 5.9634, + "step": 1027 + }, + { + "epoch": 0.09592236633386209, + "grad_norm": 1.2426627580060952, + "learning_rate": 0.00015419999999999998, + "loss": 5.7708, + "step": 1028 + }, + { + "epoch": 0.09601567602873938, + "grad_norm": 1.3976875912490005, + "learning_rate": 0.00015434999999999998, + "loss": 6.2282, + "step": 1029 + }, + { + "epoch": 0.09610898572361669, + "grad_norm": 1.3560124078997478, + "learning_rate": 0.0001545, + "loss": 6.552, + "step": 1030 + }, + { + "epoch": 0.09620229541849398, + "grad_norm": 2.0046192622177745, + "learning_rate": 0.00015464999999999996, + "loss": 6.3362, + "step": 1031 + }, + { + "epoch": 0.09629560511337128, + "grad_norm": 1.1597827383536514, + "learning_rate": 0.0001548, + "loss": 6.3253, + "step": 1032 + }, + { + "epoch": 0.09638891480824857, + "grad_norm": 1.1429292174981605, + "learning_rate": 0.00015494999999999997, + "loss": 6.4675, + "step": 1033 + }, + { + "epoch": 0.09648222450312588, + "grad_norm": 1.088084118815608, + "learning_rate": 0.0001551, + "loss": 6.3202, + "step": 1034 + }, + { + "epoch": 0.09657553419800317, + "grad_norm": 1.0798602358776, + "learning_rate": 0.00015524999999999998, + "loss": 6.2113, + "step": 1035 + }, + { + "epoch": 0.09666884389288047, + "grad_norm": 1.557681532875864, + "learning_rate": 0.00015539999999999998, + "loss": 6.2055, + "step": 1036 + }, + { + "epoch": 0.09676215358775776, + "grad_norm": 1.2347256739569412, + "learning_rate": 0.00015554999999999999, + "loss": 6.6795, + "step": 1037 + }, + { + "epoch": 0.09685546328263507, + "grad_norm": 1.544815147957025, + "learning_rate": 0.0001557, + "loss": 6.099, + "step": 1038 + }, + { + "epoch": 0.09694877297751237, + "grad_norm": 1.9113486816874252, + "learning_rate": 0.00015584999999999997, + "loss": 6.0767, + "step": 1039 + }, + { + "epoch": 0.09704208267238966, + "grad_norm": 1.5978558607776703, + "learning_rate": 0.000156, + "loss": 6.3359, + "step": 1040 + }, + { + "epoch": 0.09713539236726695, + "grad_norm": 1.8647501138437708, + "learning_rate": 0.00015614999999999997, + "loss": 5.999, + "step": 1041 + }, + { + "epoch": 0.09722870206214426, + "grad_norm": 1.3394029498958255, + "learning_rate": 0.0001563, + "loss": 6.2139, + "step": 1042 + }, + { + "epoch": 0.09732201175702156, + "grad_norm": 1.4742949691149332, + "learning_rate": 0.00015644999999999998, + "loss": 6.3803, + "step": 1043 + }, + { + "epoch": 0.09741532145189885, + "grad_norm": 1.2289612479243135, + "learning_rate": 0.00015659999999999998, + "loss": 6.0495, + "step": 1044 + }, + { + "epoch": 0.09750863114677615, + "grad_norm": 1.1421500931248447, + "learning_rate": 0.00015675, + "loss": 6.108, + "step": 1045 + }, + { + "epoch": 0.09760194084165345, + "grad_norm": 1.6027302676727868, + "learning_rate": 0.0001569, + "loss": 6.3306, + "step": 1046 + }, + { + "epoch": 0.09769525053653075, + "grad_norm": 1.5133306793249608, + "learning_rate": 0.00015704999999999997, + "loss": 6.3004, + "step": 1047 + }, + { + "epoch": 0.09778856023140804, + "grad_norm": 1.7027637166177307, + "learning_rate": 0.0001572, + "loss": 6.5447, + "step": 1048 + }, + { + "epoch": 0.09788186992628534, + "grad_norm": 1.5285742575309829, + "learning_rate": 0.00015734999999999998, + "loss": 6.2464, + "step": 1049 + }, + { + "epoch": 0.09797517962116264, + "grad_norm": 1.1350703632234806, + "learning_rate": 0.00015749999999999998, + "loss": 6.3154, + "step": 1050 + }, + { + "epoch": 0.09806848931603994, + "grad_norm": 1.1622597128377568, + "learning_rate": 0.00015764999999999998, + "loss": 6.2465, + "step": 1051 + }, + { + "epoch": 0.09816179901091723, + "grad_norm": 2.2050394652297634, + "learning_rate": 0.0001578, + "loss": 6.1391, + "step": 1052 + }, + { + "epoch": 0.09825510870579453, + "grad_norm": 1.3735734879732313, + "learning_rate": 0.00015794999999999996, + "loss": 6.4089, + "step": 1053 + }, + { + "epoch": 0.09834841840067184, + "grad_norm": 1.257280149510842, + "learning_rate": 0.0001581, + "loss": 6.5068, + "step": 1054 + }, + { + "epoch": 0.09844172809554913, + "grad_norm": 2.3057629052911186, + "learning_rate": 0.00015824999999999997, + "loss": 5.9554, + "step": 1055 + }, + { + "epoch": 0.09853503779042642, + "grad_norm": 1.5356844140500037, + "learning_rate": 0.0001584, + "loss": 6.3464, + "step": 1056 + }, + { + "epoch": 0.09862834748530372, + "grad_norm": 1.9171672852289605, + "learning_rate": 0.00015854999999999998, + "loss": 6.2438, + "step": 1057 + }, + { + "epoch": 0.09872165718018103, + "grad_norm": 1.396578702044599, + "learning_rate": 0.00015869999999999998, + "loss": 6.7214, + "step": 1058 + }, + { + "epoch": 0.09881496687505832, + "grad_norm": 1.2613004824281713, + "learning_rate": 0.00015884999999999999, + "loss": 6.1819, + "step": 1059 + }, + { + "epoch": 0.09890827656993562, + "grad_norm": 1.7814848727253965, + "learning_rate": 0.000159, + "loss": 6.0629, + "step": 1060 + }, + { + "epoch": 0.09900158626481291, + "grad_norm": 1.1342738200133677, + "learning_rate": 0.00015914999999999997, + "loss": 6.3483, + "step": 1061 + }, + { + "epoch": 0.09909489595969022, + "grad_norm": 1.2679969913263938, + "learning_rate": 0.0001593, + "loss": 5.968, + "step": 1062 + }, + { + "epoch": 0.09918820565456751, + "grad_norm": 1.3817290981714831, + "learning_rate": 0.00015944999999999997, + "loss": 6.0731, + "step": 1063 + }, + { + "epoch": 0.09928151534944481, + "grad_norm": 1.325044950851954, + "learning_rate": 0.0001596, + "loss": 6.478, + "step": 1064 + }, + { + "epoch": 0.0993748250443221, + "grad_norm": 1.6380323035736615, + "learning_rate": 0.00015974999999999998, + "loss": 6.2131, + "step": 1065 + }, + { + "epoch": 0.09946813473919941, + "grad_norm": 1.327498790517761, + "learning_rate": 0.00015989999999999998, + "loss": 6.4848, + "step": 1066 + }, + { + "epoch": 0.0995614444340767, + "grad_norm": 1.8954994771308546, + "learning_rate": 0.00016005, + "loss": 6.464, + "step": 1067 + }, + { + "epoch": 0.099654754128954, + "grad_norm": 1.2955220339109486, + "learning_rate": 0.0001602, + "loss": 5.9411, + "step": 1068 + }, + { + "epoch": 0.09974806382383129, + "grad_norm": 1.2933539823321596, + "learning_rate": 0.00016034999999999997, + "loss": 6.122, + "step": 1069 + }, + { + "epoch": 0.09984137351870859, + "grad_norm": 1.2470648997982403, + "learning_rate": 0.0001605, + "loss": 6.6225, + "step": 1070 + }, + { + "epoch": 0.0999346832135859, + "grad_norm": 1.2232588191024751, + "learning_rate": 0.00016064999999999997, + "loss": 6.2922, + "step": 1071 + }, + { + "epoch": 0.10002799290846319, + "grad_norm": 1.2568464737753478, + "learning_rate": 0.0001608, + "loss": 6.0226, + "step": 1072 + }, + { + "epoch": 0.10012130260334048, + "grad_norm": 1.3126116385432909, + "learning_rate": 0.00016094999999999998, + "loss": 6.1798, + "step": 1073 + }, + { + "epoch": 0.10021461229821778, + "grad_norm": 1.1451559031495222, + "learning_rate": 0.00016109999999999999, + "loss": 6.1303, + "step": 1074 + }, + { + "epoch": 0.10030792199309509, + "grad_norm": 1.0643290690205447, + "learning_rate": 0.00016125, + "loss": 6.4426, + "step": 1075 + }, + { + "epoch": 0.10040123168797238, + "grad_norm": 1.1466955410772646, + "learning_rate": 0.0001614, + "loss": 6.4992, + "step": 1076 + }, + { + "epoch": 0.10049454138284968, + "grad_norm": 1.4236828160019692, + "learning_rate": 0.00016154999999999997, + "loss": 6.3588, + "step": 1077 + }, + { + "epoch": 0.10058785107772697, + "grad_norm": 1.1839453407964127, + "learning_rate": 0.0001617, + "loss": 6.3924, + "step": 1078 + }, + { + "epoch": 0.10068116077260428, + "grad_norm": 1.1262059324227671, + "learning_rate": 0.00016184999999999998, + "loss": 6.1265, + "step": 1079 + }, + { + "epoch": 0.10077447046748157, + "grad_norm": 1.4465380927556837, + "learning_rate": 0.000162, + "loss": 6.3394, + "step": 1080 + }, + { + "epoch": 0.10086778016235887, + "grad_norm": 1.1511369439786292, + "learning_rate": 0.00016214999999999998, + "loss": 6.105, + "step": 1081 + }, + { + "epoch": 0.10096108985723616, + "grad_norm": 1.6218500526072512, + "learning_rate": 0.0001623, + "loss": 6.5907, + "step": 1082 + }, + { + "epoch": 0.10105439955211347, + "grad_norm": 1.9704429417775167, + "learning_rate": 0.00016245, + "loss": 6.1456, + "step": 1083 + }, + { + "epoch": 0.10114770924699076, + "grad_norm": 1.1261301083798267, + "learning_rate": 0.0001626, + "loss": 6.2072, + "step": 1084 + }, + { + "epoch": 0.10124101894186806, + "grad_norm": 1.2607886211578687, + "learning_rate": 0.00016274999999999997, + "loss": 6.5205, + "step": 1085 + }, + { + "epoch": 0.10133432863674535, + "grad_norm": 1.3573921981154151, + "learning_rate": 0.0001629, + "loss": 6.0619, + "step": 1086 + }, + { + "epoch": 0.10142763833162266, + "grad_norm": 1.5835977709164846, + "learning_rate": 0.00016304999999999998, + "loss": 6.1975, + "step": 1087 + }, + { + "epoch": 0.10152094802649995, + "grad_norm": 1.1545344080901103, + "learning_rate": 0.0001632, + "loss": 6.2884, + "step": 1088 + }, + { + "epoch": 0.10161425772137725, + "grad_norm": 1.0937001592786355, + "learning_rate": 0.00016334999999999999, + "loss": 6.1379, + "step": 1089 + }, + { + "epoch": 0.10170756741625454, + "grad_norm": 3.1051579685083777, + "learning_rate": 0.0001635, + "loss": 6.4148, + "step": 1090 + }, + { + "epoch": 0.10180087711113185, + "grad_norm": 1.2236045718348578, + "learning_rate": 0.00016365, + "loss": 6.2634, + "step": 1091 + }, + { + "epoch": 0.10189418680600915, + "grad_norm": 1.6890837186562602, + "learning_rate": 0.0001638, + "loss": 6.0817, + "step": 1092 + }, + { + "epoch": 0.10198749650088644, + "grad_norm": 1.5259426240539102, + "learning_rate": 0.00016394999999999997, + "loss": 6.5001, + "step": 1093 + }, + { + "epoch": 0.10208080619576373, + "grad_norm": 1.2926632250732808, + "learning_rate": 0.0001641, + "loss": 6.2561, + "step": 1094 + }, + { + "epoch": 0.10217411589064104, + "grad_norm": 1.218960033489852, + "learning_rate": 0.00016424999999999998, + "loss": 6.348, + "step": 1095 + }, + { + "epoch": 0.10226742558551834, + "grad_norm": 1.3046996962069919, + "learning_rate": 0.0001644, + "loss": 6.5526, + "step": 1096 + }, + { + "epoch": 0.10236073528039563, + "grad_norm": 1.360435336764549, + "learning_rate": 0.00016455, + "loss": 6.4868, + "step": 1097 + }, + { + "epoch": 0.10245404497527293, + "grad_norm": 1.3920150481341276, + "learning_rate": 0.0001647, + "loss": 6.0476, + "step": 1098 + }, + { + "epoch": 0.10254735467015023, + "grad_norm": 1.4332618085288222, + "learning_rate": 0.00016485, + "loss": 6.1765, + "step": 1099 + }, + { + "epoch": 0.10264066436502753, + "grad_norm": 1.4702358400247548, + "learning_rate": 0.000165, + "loss": 6.334, + "step": 1100 + }, + { + "epoch": 0.10273397405990482, + "grad_norm": 1.7519945520336284, + "learning_rate": 0.00016514999999999998, + "loss": 6.1632, + "step": 1101 + }, + { + "epoch": 0.10282728375478212, + "grad_norm": 1.2608802223385442, + "learning_rate": 0.0001653, + "loss": 6.0158, + "step": 1102 + }, + { + "epoch": 0.10292059344965943, + "grad_norm": 1.3438947828844299, + "learning_rate": 0.00016544999999999998, + "loss": 6.2643, + "step": 1103 + }, + { + "epoch": 0.10301390314453672, + "grad_norm": 1.4144235049397518, + "learning_rate": 0.0001656, + "loss": 6.0268, + "step": 1104 + }, + { + "epoch": 0.10310721283941401, + "grad_norm": 2.081469316146863, + "learning_rate": 0.00016575, + "loss": 6.2673, + "step": 1105 + }, + { + "epoch": 0.10320052253429131, + "grad_norm": 1.4056426350824331, + "learning_rate": 0.0001659, + "loss": 6.0867, + "step": 1106 + }, + { + "epoch": 0.10329383222916862, + "grad_norm": 1.2112064806309273, + "learning_rate": 0.00016604999999999997, + "loss": 6.3625, + "step": 1107 + }, + { + "epoch": 0.10338714192404591, + "grad_norm": 1.658537168247383, + "learning_rate": 0.0001662, + "loss": 6.0665, + "step": 1108 + }, + { + "epoch": 0.1034804516189232, + "grad_norm": 1.7394687843843857, + "learning_rate": 0.00016634999999999998, + "loss": 6.3233, + "step": 1109 + }, + { + "epoch": 0.1035737613138005, + "grad_norm": 1.0638950897776105, + "learning_rate": 0.0001665, + "loss": 6.2827, + "step": 1110 + }, + { + "epoch": 0.10366707100867781, + "grad_norm": 1.2490476719425774, + "learning_rate": 0.00016664999999999998, + "loss": 6.2941, + "step": 1111 + }, + { + "epoch": 0.1037603807035551, + "grad_norm": 1.2257761046172413, + "learning_rate": 0.0001668, + "loss": 6.0848, + "step": 1112 + }, + { + "epoch": 0.1038536903984324, + "grad_norm": 2.044795056065527, + "learning_rate": 0.00016695, + "loss": 6.0383, + "step": 1113 + }, + { + "epoch": 0.10394700009330969, + "grad_norm": 1.388089136761368, + "learning_rate": 0.0001671, + "loss": 6.1116, + "step": 1114 + }, + { + "epoch": 0.104040309788187, + "grad_norm": 1.2990159169918174, + "learning_rate": 0.00016724999999999997, + "loss": 6.1659, + "step": 1115 + }, + { + "epoch": 0.1041336194830643, + "grad_norm": 1.5031573977865405, + "learning_rate": 0.0001674, + "loss": 6.0682, + "step": 1116 + }, + { + "epoch": 0.10422692917794159, + "grad_norm": 1.6907147604353827, + "learning_rate": 0.00016754999999999998, + "loss": 6.239, + "step": 1117 + }, + { + "epoch": 0.10432023887281888, + "grad_norm": 1.1327607498746357, + "learning_rate": 0.0001677, + "loss": 6.2738, + "step": 1118 + }, + { + "epoch": 0.10441354856769619, + "grad_norm": 1.3447547148533563, + "learning_rate": 0.00016785, + "loss": 6.4015, + "step": 1119 + }, + { + "epoch": 0.10450685826257348, + "grad_norm": 1.419186380906778, + "learning_rate": 0.000168, + "loss": 6.2756, + "step": 1120 + }, + { + "epoch": 0.10460016795745078, + "grad_norm": 3.073536462257597, + "learning_rate": 0.00016815, + "loss": 6.5085, + "step": 1121 + }, + { + "epoch": 0.10469347765232807, + "grad_norm": 1.5278765475771425, + "learning_rate": 0.0001683, + "loss": 5.8374, + "step": 1122 + }, + { + "epoch": 0.10478678734720537, + "grad_norm": 1.1439820825877434, + "learning_rate": 0.00016844999999999997, + "loss": 6.3891, + "step": 1123 + }, + { + "epoch": 0.10488009704208268, + "grad_norm": 1.1165790558415005, + "learning_rate": 0.0001686, + "loss": 6.3153, + "step": 1124 + }, + { + "epoch": 0.10497340673695997, + "grad_norm": 1.228682120204116, + "learning_rate": 0.00016874999999999998, + "loss": 6.348, + "step": 1125 + }, + { + "epoch": 0.10506671643183726, + "grad_norm": 1.2804019447998432, + "learning_rate": 0.00016889999999999996, + "loss": 6.4099, + "step": 1126 + }, + { + "epoch": 0.10516002612671456, + "grad_norm": 1.2122839107710346, + "learning_rate": 0.00016905, + "loss": 6.0399, + "step": 1127 + }, + { + "epoch": 0.10525333582159187, + "grad_norm": 1.3041201938855982, + "learning_rate": 0.00016919999999999997, + "loss": 5.8886, + "step": 1128 + }, + { + "epoch": 0.10534664551646916, + "grad_norm": 1.2945872183065612, + "learning_rate": 0.00016935, + "loss": 6.1503, + "step": 1129 + }, + { + "epoch": 0.10543995521134646, + "grad_norm": 1.7823570248404352, + "learning_rate": 0.00016949999999999997, + "loss": 6.1216, + "step": 1130 + }, + { + "epoch": 0.10553326490622375, + "grad_norm": 1.2984397813037274, + "learning_rate": 0.00016964999999999998, + "loss": 6.0332, + "step": 1131 + }, + { + "epoch": 0.10562657460110106, + "grad_norm": 1.4435861716897276, + "learning_rate": 0.00016979999999999998, + "loss": 6.1786, + "step": 1132 + }, + { + "epoch": 0.10571988429597835, + "grad_norm": 1.548236849919478, + "learning_rate": 0.00016994999999999998, + "loss": 5.9609, + "step": 1133 + }, + { + "epoch": 0.10581319399085565, + "grad_norm": 1.20571182874518, + "learning_rate": 0.00017009999999999996, + "loss": 6.4688, + "step": 1134 + }, + { + "epoch": 0.10590650368573294, + "grad_norm": 1.127057636935576, + "learning_rate": 0.00017025, + "loss": 6.2688, + "step": 1135 + }, + { + "epoch": 0.10599981338061025, + "grad_norm": 1.3254923734850799, + "learning_rate": 0.00017039999999999997, + "loss": 6.1645, + "step": 1136 + }, + { + "epoch": 0.10609312307548754, + "grad_norm": 1.113771302850239, + "learning_rate": 0.00017055, + "loss": 6.1163, + "step": 1137 + }, + { + "epoch": 0.10618643277036484, + "grad_norm": 1.528163002415355, + "learning_rate": 0.00017069999999999998, + "loss": 6.2377, + "step": 1138 + }, + { + "epoch": 0.10627974246524213, + "grad_norm": 1.337581246763928, + "learning_rate": 0.00017084999999999998, + "loss": 5.9449, + "step": 1139 + }, + { + "epoch": 0.10637305216011944, + "grad_norm": 4.116986019034809, + "learning_rate": 0.00017099999999999998, + "loss": 6.3348, + "step": 1140 + }, + { + "epoch": 0.10646636185499674, + "grad_norm": 1.4305544317798478, + "learning_rate": 0.00017114999999999999, + "loss": 5.9189, + "step": 1141 + }, + { + "epoch": 0.10655967154987403, + "grad_norm": 10.421932951676027, + "learning_rate": 0.00017129999999999996, + "loss": 6.4357, + "step": 1142 + }, + { + "epoch": 0.10665298124475132, + "grad_norm": 1.0321573658411387, + "learning_rate": 0.00017145, + "loss": 6.3527, + "step": 1143 + }, + { + "epoch": 0.10674629093962863, + "grad_norm": 1.4971363537734814, + "learning_rate": 0.00017159999999999997, + "loss": 6.1553, + "step": 1144 + }, + { + "epoch": 0.10683960063450593, + "grad_norm": 1.5041385570294319, + "learning_rate": 0.00017175, + "loss": 6.6487, + "step": 1145 + }, + { + "epoch": 0.10693291032938322, + "grad_norm": 8.227221273314903, + "learning_rate": 0.00017189999999999998, + "loss": 6.6842, + "step": 1146 + }, + { + "epoch": 0.10702622002426052, + "grad_norm": 1.423996197905602, + "learning_rate": 0.00017204999999999998, + "loss": 6.2848, + "step": 1147 + }, + { + "epoch": 0.10711952971913782, + "grad_norm": 1.750816380960834, + "learning_rate": 0.00017219999999999998, + "loss": 6.2314, + "step": 1148 + }, + { + "epoch": 0.10721283941401512, + "grad_norm": 1.3324140209914686, + "learning_rate": 0.00017235, + "loss": 6.1764, + "step": 1149 + }, + { + "epoch": 0.10730614910889241, + "grad_norm": 2.932727257520526, + "learning_rate": 0.00017249999999999996, + "loss": 6.2221, + "step": 1150 + }, + { + "epoch": 0.1073994588037697, + "grad_norm": 1.6929960438782754, + "learning_rate": 0.00017265, + "loss": 6.0586, + "step": 1151 + }, + { + "epoch": 0.10749276849864701, + "grad_norm": 1.2524387801782704, + "learning_rate": 0.00017279999999999997, + "loss": 6.4219, + "step": 1152 + }, + { + "epoch": 0.10758607819352431, + "grad_norm": 1.4576891895079755, + "learning_rate": 0.00017294999999999998, + "loss": 6.3397, + "step": 1153 + }, + { + "epoch": 0.1076793878884016, + "grad_norm": 1.2555404287675127, + "learning_rate": 0.00017309999999999998, + "loss": 6.5876, + "step": 1154 + }, + { + "epoch": 0.1077726975832789, + "grad_norm": 1.2810874545125654, + "learning_rate": 0.00017324999999999998, + "loss": 6.6005, + "step": 1155 + }, + { + "epoch": 0.1078660072781562, + "grad_norm": 1.4282244111003726, + "learning_rate": 0.00017339999999999996, + "loss": 6.1518, + "step": 1156 + }, + { + "epoch": 0.1079593169730335, + "grad_norm": 1.5427377016398653, + "learning_rate": 0.00017355, + "loss": 6.3314, + "step": 1157 + }, + { + "epoch": 0.1080526266679108, + "grad_norm": 1.9341338356035978, + "learning_rate": 0.00017369999999999997, + "loss": 6.0414, + "step": 1158 + }, + { + "epoch": 0.10814593636278809, + "grad_norm": 1.6126202269740204, + "learning_rate": 0.00017385, + "loss": 6.322, + "step": 1159 + }, + { + "epoch": 0.1082392460576654, + "grad_norm": 1.6186378111884283, + "learning_rate": 0.00017399999999999997, + "loss": 6.1507, + "step": 1160 + }, + { + "epoch": 0.10833255575254269, + "grad_norm": 1.2876624880721945, + "learning_rate": 0.00017414999999999998, + "loss": 5.792, + "step": 1161 + }, + { + "epoch": 0.10842586544741999, + "grad_norm": 1.2286987176416262, + "learning_rate": 0.00017429999999999998, + "loss": 6.1564, + "step": 1162 + }, + { + "epoch": 0.10851917514229728, + "grad_norm": 1.7819927675537754, + "learning_rate": 0.00017444999999999998, + "loss": 6.4793, + "step": 1163 + }, + { + "epoch": 0.10861248483717459, + "grad_norm": 1.4341972714537852, + "learning_rate": 0.00017459999999999996, + "loss": 5.8031, + "step": 1164 + }, + { + "epoch": 0.10870579453205188, + "grad_norm": 1.62404494919765, + "learning_rate": 0.00017475, + "loss": 6.264, + "step": 1165 + }, + { + "epoch": 0.10879910422692918, + "grad_norm": 1.3341910800555026, + "learning_rate": 0.00017489999999999997, + "loss": 6.4006, + "step": 1166 + }, + { + "epoch": 0.10889241392180647, + "grad_norm": 1.414310954100489, + "learning_rate": 0.00017505, + "loss": 6.0767, + "step": 1167 + }, + { + "epoch": 0.10898572361668378, + "grad_norm": 1.934618478562656, + "learning_rate": 0.00017519999999999998, + "loss": 6.2311, + "step": 1168 + }, + { + "epoch": 0.10907903331156107, + "grad_norm": 1.892718185185759, + "learning_rate": 0.00017534999999999998, + "loss": 6.2271, + "step": 1169 + }, + { + "epoch": 0.10917234300643837, + "grad_norm": 1.4644820511483727, + "learning_rate": 0.00017549999999999998, + "loss": 6.4757, + "step": 1170 + }, + { + "epoch": 0.10926565270131566, + "grad_norm": 1.6303572355421359, + "learning_rate": 0.00017565, + "loss": 5.8133, + "step": 1171 + }, + { + "epoch": 0.10935896239619297, + "grad_norm": 1.5320300749820772, + "learning_rate": 0.00017579999999999996, + "loss": 5.8975, + "step": 1172 + }, + { + "epoch": 0.10945227209107027, + "grad_norm": 1.4395948526023576, + "learning_rate": 0.00017595, + "loss": 5.9986, + "step": 1173 + }, + { + "epoch": 0.10954558178594756, + "grad_norm": 1.3267755918812805, + "learning_rate": 0.00017609999999999997, + "loss": 6.3658, + "step": 1174 + }, + { + "epoch": 0.10963889148082485, + "grad_norm": 2.7220582150522867, + "learning_rate": 0.00017625, + "loss": 6.3582, + "step": 1175 + }, + { + "epoch": 0.10973220117570215, + "grad_norm": 1.1861137901109577, + "learning_rate": 0.00017639999999999998, + "loss": 6.1154, + "step": 1176 + }, + { + "epoch": 0.10982551087057946, + "grad_norm": 1.1430210415635158, + "learning_rate": 0.00017654999999999998, + "loss": 6.1619, + "step": 1177 + }, + { + "epoch": 0.10991882056545675, + "grad_norm": 1.4188300041802235, + "learning_rate": 0.00017669999999999999, + "loss": 6.1456, + "step": 1178 + }, + { + "epoch": 0.11001213026033405, + "grad_norm": 1.5214823488517113, + "learning_rate": 0.00017685, + "loss": 6.4833, + "step": 1179 + }, + { + "epoch": 0.11010543995521134, + "grad_norm": 1.715798376642998, + "learning_rate": 0.00017699999999999997, + "loss": 6.1152, + "step": 1180 + }, + { + "epoch": 0.11019874965008865, + "grad_norm": 1.2909134783659197, + "learning_rate": 0.00017715, + "loss": 6.2982, + "step": 1181 + }, + { + "epoch": 0.11029205934496594, + "grad_norm": 1.3205017328617572, + "learning_rate": 0.00017729999999999997, + "loss": 6.1782, + "step": 1182 + }, + { + "epoch": 0.11038536903984324, + "grad_norm": 1.25988898416261, + "learning_rate": 0.00017745, + "loss": 5.8781, + "step": 1183 + }, + { + "epoch": 0.11047867873472053, + "grad_norm": 1.570315192464473, + "learning_rate": 0.00017759999999999998, + "loss": 6.2829, + "step": 1184 + }, + { + "epoch": 0.11057198842959784, + "grad_norm": 1.2890368112118231, + "learning_rate": 0.00017774999999999998, + "loss": 5.725, + "step": 1185 + }, + { + "epoch": 0.11066529812447513, + "grad_norm": 1.4298279778746639, + "learning_rate": 0.0001779, + "loss": 6.1598, + "step": 1186 + }, + { + "epoch": 0.11075860781935243, + "grad_norm": 2.315729238298323, + "learning_rate": 0.00017805, + "loss": 6.3239, + "step": 1187 + }, + { + "epoch": 0.11085191751422972, + "grad_norm": 1.299700687808431, + "learning_rate": 0.00017819999999999997, + "loss": 6.0062, + "step": 1188 + }, + { + "epoch": 0.11094522720910703, + "grad_norm": 1.2393711285241322, + "learning_rate": 0.00017835, + "loss": 6.5156, + "step": 1189 + }, + { + "epoch": 0.11103853690398433, + "grad_norm": 11.919122823722214, + "learning_rate": 0.00017849999999999997, + "loss": 6.5002, + "step": 1190 + }, + { + "epoch": 0.11113184659886162, + "grad_norm": 1.4738370646547263, + "learning_rate": 0.00017865, + "loss": 6.2753, + "step": 1191 + }, + { + "epoch": 0.11122515629373891, + "grad_norm": 4.022789388650392, + "learning_rate": 0.00017879999999999998, + "loss": 6.1922, + "step": 1192 + }, + { + "epoch": 0.11131846598861622, + "grad_norm": 1.0906658204573239, + "learning_rate": 0.00017894999999999999, + "loss": 6.0019, + "step": 1193 + }, + { + "epoch": 0.11141177568349352, + "grad_norm": 1.0951661757454678, + "learning_rate": 0.0001791, + "loss": 6.1715, + "step": 1194 + }, + { + "epoch": 0.11150508537837081, + "grad_norm": 2.2012555078890923, + "learning_rate": 0.00017925, + "loss": 6.2367, + "step": 1195 + }, + { + "epoch": 0.1115983950732481, + "grad_norm": 2.070359887865638, + "learning_rate": 0.00017939999999999997, + "loss": 6.1842, + "step": 1196 + }, + { + "epoch": 0.11169170476812541, + "grad_norm": 1.2904593614890094, + "learning_rate": 0.00017955, + "loss": 6.3313, + "step": 1197 + }, + { + "epoch": 0.11178501446300271, + "grad_norm": 2.2202392825895667, + "learning_rate": 0.00017969999999999998, + "loss": 6.6573, + "step": 1198 + }, + { + "epoch": 0.11187832415788, + "grad_norm": 1.188933196365007, + "learning_rate": 0.00017984999999999998, + "loss": 5.9656, + "step": 1199 + }, + { + "epoch": 0.1119716338527573, + "grad_norm": 1.3127531285292373, + "learning_rate": 0.00017999999999999998, + "loss": 6.1279, + "step": 1200 + }, + { + "epoch": 0.1120649435476346, + "grad_norm": 1.2365841485680538, + "learning_rate": 0.00018015, + "loss": 6.024, + "step": 1201 + }, + { + "epoch": 0.1121582532425119, + "grad_norm": 1.7968980184251717, + "learning_rate": 0.00018029999999999996, + "loss": 6.4184, + "step": 1202 + }, + { + "epoch": 0.1122515629373892, + "grad_norm": 1.2752420378540628, + "learning_rate": 0.00018045, + "loss": 6.1679, + "step": 1203 + }, + { + "epoch": 0.11234487263226649, + "grad_norm": 1.1286710881834257, + "learning_rate": 0.00018059999999999997, + "loss": 5.8399, + "step": 1204 + }, + { + "epoch": 0.1124381823271438, + "grad_norm": 1.4878129578573613, + "learning_rate": 0.00018075, + "loss": 5.9555, + "step": 1205 + }, + { + "epoch": 0.11253149202202109, + "grad_norm": 16.38687905203824, + "learning_rate": 0.00018089999999999998, + "loss": 6.2788, + "step": 1206 + }, + { + "epoch": 0.11262480171689838, + "grad_norm": 1.1279054141212974, + "learning_rate": 0.00018104999999999998, + "loss": 6.2619, + "step": 1207 + }, + { + "epoch": 0.11271811141177568, + "grad_norm": 1.4374190635685302, + "learning_rate": 0.00018119999999999999, + "loss": 6.0022, + "step": 1208 + }, + { + "epoch": 0.11281142110665299, + "grad_norm": 1.4376362458621972, + "learning_rate": 0.00018135, + "loss": 5.9823, + "step": 1209 + }, + { + "epoch": 0.11290473080153028, + "grad_norm": 1.1982896033359198, + "learning_rate": 0.00018149999999999997, + "loss": 6.4617, + "step": 1210 + }, + { + "epoch": 0.11299804049640758, + "grad_norm": 1.4997659925320068, + "learning_rate": 0.00018165, + "loss": 6.2515, + "step": 1211 + }, + { + "epoch": 0.11309135019128487, + "grad_norm": 2.0882111085220596, + "learning_rate": 0.00018179999999999997, + "loss": 6.2741, + "step": 1212 + }, + { + "epoch": 0.11318465988616218, + "grad_norm": 1.0201316071855455, + "learning_rate": 0.00018195, + "loss": 6.4122, + "step": 1213 + }, + { + "epoch": 0.11327796958103947, + "grad_norm": 1.7051046097355367, + "learning_rate": 0.00018209999999999998, + "loss": 6.2912, + "step": 1214 + }, + { + "epoch": 0.11337127927591677, + "grad_norm": 1.3396671381413425, + "learning_rate": 0.00018224999999999998, + "loss": 6.3778, + "step": 1215 + }, + { + "epoch": 0.11346458897079406, + "grad_norm": 1.0909052068603269, + "learning_rate": 0.0001824, + "loss": 6.2653, + "step": 1216 + }, + { + "epoch": 0.11355789866567137, + "grad_norm": 1.0994114921030458, + "learning_rate": 0.00018255, + "loss": 6.2327, + "step": 1217 + }, + { + "epoch": 0.11365120836054866, + "grad_norm": 2.3588381866826102, + "learning_rate": 0.00018269999999999997, + "loss": 5.9844, + "step": 1218 + }, + { + "epoch": 0.11374451805542596, + "grad_norm": 4.119332381287453, + "learning_rate": 0.00018285, + "loss": 6.0718, + "step": 1219 + }, + { + "epoch": 0.11383782775030325, + "grad_norm": 2.6262278033683883, + "learning_rate": 0.00018299999999999998, + "loss": 6.2418, + "step": 1220 + }, + { + "epoch": 0.11393113744518056, + "grad_norm": 1.1571025536344675, + "learning_rate": 0.00018315, + "loss": 6.3895, + "step": 1221 + }, + { + "epoch": 0.11402444714005786, + "grad_norm": 1.502941483333602, + "learning_rate": 0.00018329999999999998, + "loss": 5.9251, + "step": 1222 + }, + { + "epoch": 0.11411775683493515, + "grad_norm": 1.2853000314583856, + "learning_rate": 0.00018345, + "loss": 6.0483, + "step": 1223 + }, + { + "epoch": 0.11421106652981244, + "grad_norm": 1.4279370011667267, + "learning_rate": 0.0001836, + "loss": 6.3284, + "step": 1224 + }, + { + "epoch": 0.11430437622468975, + "grad_norm": 1.1289318707800846, + "learning_rate": 0.00018375, + "loss": 6.1881, + "step": 1225 + }, + { + "epoch": 0.11439768591956705, + "grad_norm": 2.2353643439394983, + "learning_rate": 0.00018389999999999997, + "loss": 6.4979, + "step": 1226 + }, + { + "epoch": 0.11449099561444434, + "grad_norm": 1.6977374681573978, + "learning_rate": 0.00018405, + "loss": 6.1704, + "step": 1227 + }, + { + "epoch": 0.11458430530932164, + "grad_norm": 1.2588799532801085, + "learning_rate": 0.00018419999999999998, + "loss": 6.3525, + "step": 1228 + }, + { + "epoch": 0.11467761500419893, + "grad_norm": 1.5796689333651848, + "learning_rate": 0.00018435, + "loss": 5.8455, + "step": 1229 + }, + { + "epoch": 0.11477092469907624, + "grad_norm": 1.4996696934401659, + "learning_rate": 0.00018449999999999999, + "loss": 6.2702, + "step": 1230 + }, + { + "epoch": 0.11486423439395353, + "grad_norm": 1.3456653869733033, + "learning_rate": 0.00018465, + "loss": 6.1649, + "step": 1231 + }, + { + "epoch": 0.11495754408883083, + "grad_norm": 1.4315780654878754, + "learning_rate": 0.0001848, + "loss": 6.0027, + "step": 1232 + }, + { + "epoch": 0.11505085378370812, + "grad_norm": 1.0588735550022346, + "learning_rate": 0.00018495, + "loss": 6.3381, + "step": 1233 + }, + { + "epoch": 0.11514416347858543, + "grad_norm": 1.3549344997395742, + "learning_rate": 0.00018509999999999997, + "loss": 6.0581, + "step": 1234 + }, + { + "epoch": 0.11523747317346272, + "grad_norm": 1.1288009274191242, + "learning_rate": 0.00018525, + "loss": 6.0058, + "step": 1235 + }, + { + "epoch": 0.11533078286834002, + "grad_norm": 1.2219152851956177, + "learning_rate": 0.00018539999999999998, + "loss": 6.2534, + "step": 1236 + }, + { + "epoch": 0.11542409256321731, + "grad_norm": 1.163864453792022, + "learning_rate": 0.00018555, + "loss": 6.3036, + "step": 1237 + }, + { + "epoch": 0.11551740225809462, + "grad_norm": 1.577120660140526, + "learning_rate": 0.0001857, + "loss": 5.9332, + "step": 1238 + }, + { + "epoch": 0.11561071195297191, + "grad_norm": 1.2271036819589818, + "learning_rate": 0.00018585, + "loss": 6.2377, + "step": 1239 + }, + { + "epoch": 0.11570402164784921, + "grad_norm": 1.231976252986545, + "learning_rate": 0.000186, + "loss": 6.0941, + "step": 1240 + }, + { + "epoch": 0.1157973313427265, + "grad_norm": 1.0754253826910236, + "learning_rate": 0.00018615, + "loss": 6.3612, + "step": 1241 + }, + { + "epoch": 0.11589064103760381, + "grad_norm": 1.2773464933362413, + "learning_rate": 0.00018629999999999997, + "loss": 5.3099, + "step": 1242 + }, + { + "epoch": 0.1159839507324811, + "grad_norm": 1.1661879793760686, + "learning_rate": 0.00018645, + "loss": 6.3979, + "step": 1243 + }, + { + "epoch": 0.1160772604273584, + "grad_norm": 4.29897623864643, + "learning_rate": 0.00018659999999999998, + "loss": 6.1675, + "step": 1244 + }, + { + "epoch": 0.1161705701222357, + "grad_norm": 1.3429460099583301, + "learning_rate": 0.00018675, + "loss": 6.1593, + "step": 1245 + }, + { + "epoch": 0.116263879817113, + "grad_norm": 1.1907741317424458, + "learning_rate": 0.0001869, + "loss": 6.0997, + "step": 1246 + }, + { + "epoch": 0.1163571895119903, + "grad_norm": 2.0694790015086095, + "learning_rate": 0.00018705, + "loss": 5.9918, + "step": 1247 + }, + { + "epoch": 0.11645049920686759, + "grad_norm": 1.1025168551420077, + "learning_rate": 0.0001872, + "loss": 5.9498, + "step": 1248 + }, + { + "epoch": 0.11654380890174489, + "grad_norm": 1.102291072733115, + "learning_rate": 0.00018735, + "loss": 5.5148, + "step": 1249 + }, + { + "epoch": 0.1166371185966222, + "grad_norm": 1.4320510510145106, + "learning_rate": 0.00018749999999999998, + "loss": 6.1932, + "step": 1250 + }, + { + "epoch": 0.11673042829149949, + "grad_norm": 1.437391731154843, + "learning_rate": 0.00018764999999999998, + "loss": 6.0865, + "step": 1251 + }, + { + "epoch": 0.11682373798637678, + "grad_norm": 1.2330188914138591, + "learning_rate": 0.00018779999999999998, + "loss": 6.1431, + "step": 1252 + }, + { + "epoch": 0.11691704768125408, + "grad_norm": 1.1757313567266483, + "learning_rate": 0.00018794999999999996, + "loss": 6.358, + "step": 1253 + }, + { + "epoch": 0.11701035737613139, + "grad_norm": 1.348047852642334, + "learning_rate": 0.0001881, + "loss": 6.2996, + "step": 1254 + }, + { + "epoch": 0.11710366707100868, + "grad_norm": 2.1001941941534517, + "learning_rate": 0.00018824999999999997, + "loss": 5.8463, + "step": 1255 + }, + { + "epoch": 0.11719697676588597, + "grad_norm": 2.0968693283584274, + "learning_rate": 0.00018839999999999997, + "loss": 6.6829, + "step": 1256 + }, + { + "epoch": 0.11729028646076327, + "grad_norm": 1.1648417811953298, + "learning_rate": 0.00018854999999999998, + "loss": 6.1012, + "step": 1257 + }, + { + "epoch": 0.11738359615564058, + "grad_norm": 1.3598456873588518, + "learning_rate": 0.00018869999999999998, + "loss": 5.6534, + "step": 1258 + }, + { + "epoch": 0.11747690585051787, + "grad_norm": 1.3264653128039599, + "learning_rate": 0.00018884999999999996, + "loss": 6.1813, + "step": 1259 + }, + { + "epoch": 0.11757021554539517, + "grad_norm": 1.1055873779231646, + "learning_rate": 0.00018899999999999999, + "loss": 6.277, + "step": 1260 + }, + { + "epoch": 0.11766352524027246, + "grad_norm": 1.2833468212326333, + "learning_rate": 0.00018914999999999996, + "loss": 5.8966, + "step": 1261 + }, + { + "epoch": 0.11775683493514977, + "grad_norm": 1.2779387559954, + "learning_rate": 0.0001893, + "loss": 5.9431, + "step": 1262 + }, + { + "epoch": 0.11785014463002706, + "grad_norm": 1.2698757690597455, + "learning_rate": 0.00018944999999999997, + "loss": 6.0825, + "step": 1263 + }, + { + "epoch": 0.11794345432490436, + "grad_norm": 1.1067925063361284, + "learning_rate": 0.00018959999999999997, + "loss": 6.3763, + "step": 1264 + }, + { + "epoch": 0.11803676401978165, + "grad_norm": 0.9720062113883495, + "learning_rate": 0.00018974999999999998, + "loss": 5.9958, + "step": 1265 + }, + { + "epoch": 0.11813007371465896, + "grad_norm": 1.220321361925994, + "learning_rate": 0.00018989999999999998, + "loss": 5.8438, + "step": 1266 + }, + { + "epoch": 0.11822338340953625, + "grad_norm": 3.815608486242096, + "learning_rate": 0.00019004999999999996, + "loss": 6.2543, + "step": 1267 + }, + { + "epoch": 0.11831669310441355, + "grad_norm": 1.0888776764658823, + "learning_rate": 0.0001902, + "loss": 5.9996, + "step": 1268 + }, + { + "epoch": 0.11841000279929084, + "grad_norm": 1.3603974758404256, + "learning_rate": 0.00019034999999999996, + "loss": 5.9159, + "step": 1269 + }, + { + "epoch": 0.11850331249416815, + "grad_norm": 4.7637742480041485, + "learning_rate": 0.0001905, + "loss": 6.0199, + "step": 1270 + }, + { + "epoch": 0.11859662218904544, + "grad_norm": 1.2280839845799463, + "learning_rate": 0.00019064999999999997, + "loss": 6.2018, + "step": 1271 + }, + { + "epoch": 0.11868993188392274, + "grad_norm": 1.3486576112480078, + "learning_rate": 0.00019079999999999998, + "loss": 6.5593, + "step": 1272 + }, + { + "epoch": 0.11878324157880003, + "grad_norm": 1.74588747969383, + "learning_rate": 0.00019094999999999998, + "loss": 5.9166, + "step": 1273 + }, + { + "epoch": 0.11887655127367734, + "grad_norm": 1.7777374143515323, + "learning_rate": 0.00019109999999999998, + "loss": 6.3446, + "step": 1274 + }, + { + "epoch": 0.11896986096855464, + "grad_norm": 36.37704112630846, + "learning_rate": 0.00019124999999999996, + "loss": 6.3638, + "step": 1275 + }, + { + "epoch": 0.11906317066343193, + "grad_norm": 1.5606723225779717, + "learning_rate": 0.0001914, + "loss": 6.122, + "step": 1276 + }, + { + "epoch": 0.11915648035830922, + "grad_norm": 1.6508423667204186, + "learning_rate": 0.00019154999999999997, + "loss": 6.1121, + "step": 1277 + }, + { + "epoch": 0.11924979005318653, + "grad_norm": 2.0396005119464617, + "learning_rate": 0.0001917, + "loss": 6.162, + "step": 1278 + }, + { + "epoch": 0.11934309974806383, + "grad_norm": 1.7003818016446854, + "learning_rate": 0.00019184999999999997, + "loss": 6.1558, + "step": 1279 + }, + { + "epoch": 0.11943640944294112, + "grad_norm": 1.4973279972300293, + "learning_rate": 0.00019199999999999998, + "loss": 6.2955, + "step": 1280 + }, + { + "epoch": 0.11952971913781842, + "grad_norm": 1.6201099075906773, + "learning_rate": 0.00019214999999999998, + "loss": 6.1139, + "step": 1281 + }, + { + "epoch": 0.11962302883269571, + "grad_norm": 1.132432387495407, + "learning_rate": 0.00019229999999999999, + "loss": 6.4182, + "step": 1282 + }, + { + "epoch": 0.11971633852757302, + "grad_norm": 2.0852316294465396, + "learning_rate": 0.00019244999999999996, + "loss": 6.3333, + "step": 1283 + }, + { + "epoch": 0.11980964822245031, + "grad_norm": 1.0572834744089286, + "learning_rate": 0.0001926, + "loss": 6.2206, + "step": 1284 + }, + { + "epoch": 0.11990295791732761, + "grad_norm": 1.94354068496833, + "learning_rate": 0.00019274999999999997, + "loss": 6.0775, + "step": 1285 + }, + { + "epoch": 0.1199962676122049, + "grad_norm": 1.268657546823371, + "learning_rate": 0.0001929, + "loss": 5.8247, + "step": 1286 + }, + { + "epoch": 0.12008957730708221, + "grad_norm": 1.371666614801123, + "learning_rate": 0.00019304999999999998, + "loss": 6.3138, + "step": 1287 + }, + { + "epoch": 0.1201828870019595, + "grad_norm": 1.5742744533473958, + "learning_rate": 0.00019319999999999998, + "loss": 6.5358, + "step": 1288 + }, + { + "epoch": 0.1202761966968368, + "grad_norm": 2.8241105596252294, + "learning_rate": 0.00019334999999999998, + "loss": 6.5158, + "step": 1289 + }, + { + "epoch": 0.12036950639171409, + "grad_norm": 1.896689416143004, + "learning_rate": 0.0001935, + "loss": 6.2802, + "step": 1290 + }, + { + "epoch": 0.1204628160865914, + "grad_norm": 1.412854105879125, + "learning_rate": 0.00019364999999999996, + "loss": 6.1516, + "step": 1291 + }, + { + "epoch": 0.1205561257814687, + "grad_norm": 2.0297027536042767, + "learning_rate": 0.0001938, + "loss": 5.9494, + "step": 1292 + }, + { + "epoch": 0.12064943547634599, + "grad_norm": 4.973940738293538, + "learning_rate": 0.00019394999999999997, + "loss": 6.4489, + "step": 1293 + }, + { + "epoch": 0.12074274517122328, + "grad_norm": 1.3756648178452109, + "learning_rate": 0.0001941, + "loss": 6.4138, + "step": 1294 + }, + { + "epoch": 0.12083605486610059, + "grad_norm": 1.628461664418833, + "learning_rate": 0.00019424999999999998, + "loss": 6.1903, + "step": 1295 + }, + { + "epoch": 0.12092936456097789, + "grad_norm": 2.069476159922491, + "learning_rate": 0.00019439999999999998, + "loss": 6.0344, + "step": 1296 + }, + { + "epoch": 0.12102267425585518, + "grad_norm": 1.6240790443760444, + "learning_rate": 0.00019454999999999999, + "loss": 6.3867, + "step": 1297 + }, + { + "epoch": 0.12111598395073248, + "grad_norm": 1.036301572943905, + "learning_rate": 0.0001947, + "loss": 6.1067, + "step": 1298 + }, + { + "epoch": 0.12120929364560978, + "grad_norm": 1.555160591808436, + "learning_rate": 0.00019484999999999997, + "loss": 6.3493, + "step": 1299 + }, + { + "epoch": 0.12130260334048708, + "grad_norm": 1.3742296887353906, + "learning_rate": 0.000195, + "loss": 6.289, + "step": 1300 + }, + { + "epoch": 0.12139591303536437, + "grad_norm": 1.1478287142632024, + "learning_rate": 0.00019514999999999997, + "loss": 6.2646, + "step": 1301 + }, + { + "epoch": 0.12148922273024167, + "grad_norm": 1.2709103331960143, + "learning_rate": 0.00019529999999999998, + "loss": 6.1714, + "step": 1302 + }, + { + "epoch": 0.12158253242511897, + "grad_norm": 1.1194556801093474, + "learning_rate": 0.00019544999999999998, + "loss": 6.2497, + "step": 1303 + }, + { + "epoch": 0.12167584211999627, + "grad_norm": 1.2086757906187022, + "learning_rate": 0.00019559999999999998, + "loss": 6.2128, + "step": 1304 + }, + { + "epoch": 0.12176915181487356, + "grad_norm": 1.2823970802175972, + "learning_rate": 0.00019574999999999996, + "loss": 5.8269, + "step": 1305 + }, + { + "epoch": 0.12186246150975086, + "grad_norm": 1.2559628075759166, + "learning_rate": 0.0001959, + "loss": 6.0685, + "step": 1306 + }, + { + "epoch": 0.12195577120462817, + "grad_norm": 1.1067252506533887, + "learning_rate": 0.00019604999999999997, + "loss": 6.2466, + "step": 1307 + }, + { + "epoch": 0.12204908089950546, + "grad_norm": 1.7906244372521296, + "learning_rate": 0.0001962, + "loss": 6.4284, + "step": 1308 + }, + { + "epoch": 0.12214239059438275, + "grad_norm": 1.1718764850130214, + "learning_rate": 0.00019634999999999998, + "loss": 6.3111, + "step": 1309 + }, + { + "epoch": 0.12223570028926005, + "grad_norm": 1.5950672285805523, + "learning_rate": 0.00019649999999999998, + "loss": 6.1147, + "step": 1310 + }, + { + "epoch": 0.12232900998413736, + "grad_norm": 1.479977005873723, + "learning_rate": 0.00019664999999999998, + "loss": 5.999, + "step": 1311 + }, + { + "epoch": 0.12242231967901465, + "grad_norm": 1.259044481155946, + "learning_rate": 0.00019679999999999999, + "loss": 6.1133, + "step": 1312 + }, + { + "epoch": 0.12251562937389195, + "grad_norm": 1.351025360841291, + "learning_rate": 0.00019694999999999996, + "loss": 6.2309, + "step": 1313 + }, + { + "epoch": 0.12260893906876924, + "grad_norm": 1.3785279506992119, + "learning_rate": 0.0001971, + "loss": 6.2637, + "step": 1314 + }, + { + "epoch": 0.12270224876364655, + "grad_norm": 1.415300428989068, + "learning_rate": 0.00019724999999999997, + "loss": 6.1538, + "step": 1315 + }, + { + "epoch": 0.12279555845852384, + "grad_norm": 1.1770373439768491, + "learning_rate": 0.0001974, + "loss": 6.1622, + "step": 1316 + }, + { + "epoch": 0.12288886815340114, + "grad_norm": 1.1977745410840734, + "learning_rate": 0.00019754999999999998, + "loss": 6.0745, + "step": 1317 + }, + { + "epoch": 0.12298217784827843, + "grad_norm": 2.3506076148455164, + "learning_rate": 0.00019769999999999998, + "loss": 5.9647, + "step": 1318 + }, + { + "epoch": 0.12307548754315574, + "grad_norm": 1.5506859766161036, + "learning_rate": 0.00019784999999999998, + "loss": 6.485, + "step": 1319 + }, + { + "epoch": 0.12316879723803303, + "grad_norm": 1.1452255368062778, + "learning_rate": 0.000198, + "loss": 6.1545, + "step": 1320 + }, + { + "epoch": 0.12326210693291033, + "grad_norm": 1.1544485097104737, + "learning_rate": 0.00019814999999999996, + "loss": 6.1365, + "step": 1321 + }, + { + "epoch": 0.12335541662778762, + "grad_norm": 1.4750661396061173, + "learning_rate": 0.0001983, + "loss": 6.1199, + "step": 1322 + }, + { + "epoch": 0.12344872632266493, + "grad_norm": 1.1945584310534507, + "learning_rate": 0.00019844999999999997, + "loss": 6.2507, + "step": 1323 + }, + { + "epoch": 0.12354203601754223, + "grad_norm": 1.2136384680347991, + "learning_rate": 0.0001986, + "loss": 6.1357, + "step": 1324 + }, + { + "epoch": 0.12363534571241952, + "grad_norm": 1.4018168117905023, + "learning_rate": 0.00019874999999999998, + "loss": 6.0508, + "step": 1325 + }, + { + "epoch": 0.12372865540729681, + "grad_norm": 1.1711386384437066, + "learning_rate": 0.00019889999999999998, + "loss": 6.1958, + "step": 1326 + }, + { + "epoch": 0.12382196510217412, + "grad_norm": 1.153662227051574, + "learning_rate": 0.00019905, + "loss": 6.2191, + "step": 1327 + }, + { + "epoch": 0.12391527479705142, + "grad_norm": 1.2194857040804994, + "learning_rate": 0.0001992, + "loss": 6.3598, + "step": 1328 + }, + { + "epoch": 0.12400858449192871, + "grad_norm": 1.2212326062466818, + "learning_rate": 0.00019934999999999997, + "loss": 5.8261, + "step": 1329 + }, + { + "epoch": 0.124101894186806, + "grad_norm": 1.09032003870761, + "learning_rate": 0.0001995, + "loss": 6.3924, + "step": 1330 + }, + { + "epoch": 0.12419520388168331, + "grad_norm": 1.3077507984254098, + "learning_rate": 0.00019964999999999997, + "loss": 6.2077, + "step": 1331 + }, + { + "epoch": 0.12428851357656061, + "grad_norm": 1.224329345956318, + "learning_rate": 0.0001998, + "loss": 6.4696, + "step": 1332 + }, + { + "epoch": 0.1243818232714379, + "grad_norm": 1.30441124129955, + "learning_rate": 0.00019994999999999998, + "loss": 6.0727, + "step": 1333 + }, + { + "epoch": 0.1244751329663152, + "grad_norm": 1.3601436061145156, + "learning_rate": 0.00020009999999999998, + "loss": 6.1098, + "step": 1334 + }, + { + "epoch": 0.12456844266119249, + "grad_norm": 1.9977260342492955, + "learning_rate": 0.00020025, + "loss": 5.9648, + "step": 1335 + }, + { + "epoch": 0.1246617523560698, + "grad_norm": 1.2144740676663832, + "learning_rate": 0.0002004, + "loss": 5.9389, + "step": 1336 + }, + { + "epoch": 0.1247550620509471, + "grad_norm": 1.5253034963809107, + "learning_rate": 0.00020054999999999997, + "loss": 6.1902, + "step": 1337 + }, + { + "epoch": 0.12484837174582439, + "grad_norm": 1.1456161640241254, + "learning_rate": 0.0002007, + "loss": 6.3516, + "step": 1338 + }, + { + "epoch": 0.12494168144070168, + "grad_norm": 1.3497125299268002, + "learning_rate": 0.00020084999999999998, + "loss": 6.0532, + "step": 1339 + }, + { + "epoch": 0.12503499113557898, + "grad_norm": 1.3023430321534757, + "learning_rate": 0.000201, + "loss": 5.6271, + "step": 1340 + }, + { + "epoch": 0.12512830083045628, + "grad_norm": 1.2208232209547463, + "learning_rate": 0.00020114999999999998, + "loss": 6.6254, + "step": 1341 + }, + { + "epoch": 0.1252216105253336, + "grad_norm": 1.7972978922473424, + "learning_rate": 0.0002013, + "loss": 6.1083, + "step": 1342 + }, + { + "epoch": 0.12531492022021087, + "grad_norm": 1.3121083335889026, + "learning_rate": 0.00020145, + "loss": 5.9603, + "step": 1343 + }, + { + "epoch": 0.12540822991508818, + "grad_norm": 1.968714837924387, + "learning_rate": 0.0002016, + "loss": 6.077, + "step": 1344 + }, + { + "epoch": 0.12550153960996546, + "grad_norm": 1.1765898863655209, + "learning_rate": 0.00020174999999999997, + "loss": 6.3484, + "step": 1345 + }, + { + "epoch": 0.12559484930484277, + "grad_norm": 1.2206248011272884, + "learning_rate": 0.0002019, + "loss": 5.7127, + "step": 1346 + }, + { + "epoch": 0.12568815899972008, + "grad_norm": 1.2229664010408434, + "learning_rate": 0.00020204999999999998, + "loss": 5.9499, + "step": 1347 + }, + { + "epoch": 0.12578146869459736, + "grad_norm": 2.1291226949203828, + "learning_rate": 0.0002022, + "loss": 6.1948, + "step": 1348 + }, + { + "epoch": 0.12587477838947467, + "grad_norm": 1.4757728474750804, + "learning_rate": 0.00020234999999999999, + "loss": 6.0511, + "step": 1349 + }, + { + "epoch": 0.12596808808435198, + "grad_norm": 1.954654536890291, + "learning_rate": 0.0002025, + "loss": 6.4039, + "step": 1350 + }, + { + "epoch": 0.12606139777922926, + "grad_norm": 1.222244142258998, + "learning_rate": 0.00020264999999999997, + "loss": 5.3566, + "step": 1351 + }, + { + "epoch": 0.12615470747410656, + "grad_norm": 1.188333581291439, + "learning_rate": 0.0002028, + "loss": 5.9607, + "step": 1352 + }, + { + "epoch": 0.12624801716898384, + "grad_norm": 1.4001591331543415, + "learning_rate": 0.00020294999999999997, + "loss": 6.0142, + "step": 1353 + }, + { + "epoch": 0.12634132686386115, + "grad_norm": 1.2488578361205478, + "learning_rate": 0.0002031, + "loss": 6.2435, + "step": 1354 + }, + { + "epoch": 0.12643463655873846, + "grad_norm": 1.1844861770948623, + "learning_rate": 0.00020324999999999998, + "loss": 6.0037, + "step": 1355 + }, + { + "epoch": 0.12652794625361574, + "grad_norm": 1.1204919037840897, + "learning_rate": 0.00020339999999999998, + "loss": 6.3478, + "step": 1356 + }, + { + "epoch": 0.12662125594849305, + "grad_norm": 1.3087617527456676, + "learning_rate": 0.00020355, + "loss": 6.0215, + "step": 1357 + }, + { + "epoch": 0.12671456564337036, + "grad_norm": 1.1200217236965198, + "learning_rate": 0.0002037, + "loss": 6.0275, + "step": 1358 + }, + { + "epoch": 0.12680787533824764, + "grad_norm": 1.7187516057323593, + "learning_rate": 0.00020384999999999997, + "loss": 6.0684, + "step": 1359 + }, + { + "epoch": 0.12690118503312495, + "grad_norm": 1.097181225834667, + "learning_rate": 0.000204, + "loss": 6.0369, + "step": 1360 + }, + { + "epoch": 0.12699449472800223, + "grad_norm": 1.2682594421856883, + "learning_rate": 0.00020414999999999997, + "loss": 6.0031, + "step": 1361 + }, + { + "epoch": 0.12708780442287954, + "grad_norm": 2.259624436133716, + "learning_rate": 0.0002043, + "loss": 5.805, + "step": 1362 + }, + { + "epoch": 0.12718111411775684, + "grad_norm": 1.9497601052899247, + "learning_rate": 0.00020444999999999998, + "loss": 6.0488, + "step": 1363 + }, + { + "epoch": 0.12727442381263412, + "grad_norm": 1.2027356720651734, + "learning_rate": 0.00020459999999999999, + "loss": 6.2323, + "step": 1364 + }, + { + "epoch": 0.12736773350751143, + "grad_norm": 2.2921585542865524, + "learning_rate": 0.00020475, + "loss": 5.8261, + "step": 1365 + }, + { + "epoch": 0.12746104320238874, + "grad_norm": 1.2491703058429722, + "learning_rate": 0.0002049, + "loss": 6.0555, + "step": 1366 + }, + { + "epoch": 0.12755435289726602, + "grad_norm": 1.3409090004217405, + "learning_rate": 0.00020504999999999997, + "loss": 6.3904, + "step": 1367 + }, + { + "epoch": 0.12764766259214333, + "grad_norm": 1.265125853383758, + "learning_rate": 0.0002052, + "loss": 6.1353, + "step": 1368 + }, + { + "epoch": 0.1277409722870206, + "grad_norm": 4.396454554896641, + "learning_rate": 0.00020534999999999998, + "loss": 5.5532, + "step": 1369 + }, + { + "epoch": 0.12783428198189792, + "grad_norm": 1.2201292055396438, + "learning_rate": 0.0002055, + "loss": 6.3168, + "step": 1370 + }, + { + "epoch": 0.12792759167677523, + "grad_norm": 1.2206790913061583, + "learning_rate": 0.00020564999999999998, + "loss": 6.3057, + "step": 1371 + }, + { + "epoch": 0.1280209013716525, + "grad_norm": 1.1499743161736136, + "learning_rate": 0.0002058, + "loss": 5.8312, + "step": 1372 + }, + { + "epoch": 0.12811421106652982, + "grad_norm": 1.1389898444813449, + "learning_rate": 0.00020595, + "loss": 6.1371, + "step": 1373 + }, + { + "epoch": 0.12820752076140712, + "grad_norm": 1.4786050903861194, + "learning_rate": 0.0002061, + "loss": 6.2373, + "step": 1374 + }, + { + "epoch": 0.1283008304562844, + "grad_norm": 1.282690533931662, + "learning_rate": 0.00020624999999999997, + "loss": 5.8156, + "step": 1375 + }, + { + "epoch": 0.1283941401511617, + "grad_norm": 1.1087596149047225, + "learning_rate": 0.00020639999999999998, + "loss": 6.3952, + "step": 1376 + }, + { + "epoch": 0.128487449846039, + "grad_norm": 1.1285913817358442, + "learning_rate": 0.00020654999999999998, + "loss": 6.2803, + "step": 1377 + }, + { + "epoch": 0.1285807595409163, + "grad_norm": 1.224642677671005, + "learning_rate": 0.00020669999999999996, + "loss": 5.8658, + "step": 1378 + }, + { + "epoch": 0.1286740692357936, + "grad_norm": 1.235519474042189, + "learning_rate": 0.00020684999999999999, + "loss": 6.1993, + "step": 1379 + }, + { + "epoch": 0.1287673789306709, + "grad_norm": 1.156579145124003, + "learning_rate": 0.00020699999999999996, + "loss": 6.2995, + "step": 1380 + }, + { + "epoch": 0.1288606886255482, + "grad_norm": 1.1699734629554936, + "learning_rate": 0.00020715, + "loss": 6.5098, + "step": 1381 + }, + { + "epoch": 0.1289539983204255, + "grad_norm": 1.101240066547141, + "learning_rate": 0.00020729999999999997, + "loss": 6.1424, + "step": 1382 + }, + { + "epoch": 0.1290473080153028, + "grad_norm": 1.5257149197152484, + "learning_rate": 0.00020744999999999997, + "loss": 6.0205, + "step": 1383 + }, + { + "epoch": 0.1291406177101801, + "grad_norm": 1.1162098354931427, + "learning_rate": 0.00020759999999999998, + "loss": 5.9398, + "step": 1384 + }, + { + "epoch": 0.12923392740505737, + "grad_norm": 1.3806156096285058, + "learning_rate": 0.00020774999999999998, + "loss": 6.1198, + "step": 1385 + }, + { + "epoch": 0.12932723709993468, + "grad_norm": 1.1688948714628447, + "learning_rate": 0.00020789999999999996, + "loss": 6.1439, + "step": 1386 + }, + { + "epoch": 0.129420546794812, + "grad_norm": 1.4151776038551258, + "learning_rate": 0.00020805, + "loss": 6.2262, + "step": 1387 + }, + { + "epoch": 0.12951385648968927, + "grad_norm": 1.1093606114972263, + "learning_rate": 0.00020819999999999996, + "loss": 6.1634, + "step": 1388 + }, + { + "epoch": 0.12960716618456658, + "grad_norm": 1.4077271670618594, + "learning_rate": 0.00020835, + "loss": 5.9944, + "step": 1389 + }, + { + "epoch": 0.12970047587944386, + "grad_norm": 1.1409166129579984, + "learning_rate": 0.00020849999999999997, + "loss": 6.4963, + "step": 1390 + }, + { + "epoch": 0.12979378557432117, + "grad_norm": 1.3060017256205505, + "learning_rate": 0.00020864999999999998, + "loss": 5.9859, + "step": 1391 + }, + { + "epoch": 0.12988709526919848, + "grad_norm": 1.222666405583401, + "learning_rate": 0.00020879999999999998, + "loss": 6.2795, + "step": 1392 + }, + { + "epoch": 0.12998040496407576, + "grad_norm": 1.1201339435489703, + "learning_rate": 0.00020894999999999998, + "loss": 6.0662, + "step": 1393 + }, + { + "epoch": 0.13007371465895307, + "grad_norm": 1.0925662743920432, + "learning_rate": 0.00020909999999999996, + "loss": 6.0294, + "step": 1394 + }, + { + "epoch": 0.13016702435383037, + "grad_norm": 2.808553904490928, + "learning_rate": 0.00020925, + "loss": 5.9675, + "step": 1395 + }, + { + "epoch": 0.13026033404870765, + "grad_norm": 1.1191468783974226, + "learning_rate": 0.00020939999999999997, + "loss": 5.9947, + "step": 1396 + }, + { + "epoch": 0.13035364374358496, + "grad_norm": 1.8109622999671509, + "learning_rate": 0.00020955, + "loss": 5.9515, + "step": 1397 + }, + { + "epoch": 0.13044695343846224, + "grad_norm": 1.847709490354984, + "learning_rate": 0.00020969999999999997, + "loss": 6.2523, + "step": 1398 + }, + { + "epoch": 0.13054026313333955, + "grad_norm": 1.230314209643686, + "learning_rate": 0.00020984999999999998, + "loss": 6.1811, + "step": 1399 + }, + { + "epoch": 0.13063357282821686, + "grad_norm": 1.3699284432574852, + "learning_rate": 0.00020999999999999998, + "loss": 6.0622, + "step": 1400 + }, + { + "epoch": 0.13072688252309414, + "grad_norm": 1.5252683629586794, + "learning_rate": 0.00021014999999999999, + "loss": 5.819, + "step": 1401 + }, + { + "epoch": 0.13082019221797145, + "grad_norm": 1.730575798864281, + "learning_rate": 0.00021029999999999996, + "loss": 6.3357, + "step": 1402 + }, + { + "epoch": 0.13091350191284876, + "grad_norm": 1.5159181878947814, + "learning_rate": 0.00021045, + "loss": 6.3834, + "step": 1403 + }, + { + "epoch": 0.13100681160772604, + "grad_norm": 1.7426512041781015, + "learning_rate": 0.00021059999999999997, + "loss": 5.9466, + "step": 1404 + }, + { + "epoch": 0.13110012130260335, + "grad_norm": 1.3625262223203487, + "learning_rate": 0.00021074999999999997, + "loss": 5.8807, + "step": 1405 + }, + { + "epoch": 0.13119343099748063, + "grad_norm": 1.7172349249241463, + "learning_rate": 0.00021089999999999998, + "loss": 6.4048, + "step": 1406 + }, + { + "epoch": 0.13128674069235793, + "grad_norm": 1.2094985488130763, + "learning_rate": 0.00021104999999999998, + "loss": 6.2176, + "step": 1407 + }, + { + "epoch": 0.13138005038723524, + "grad_norm": 4.006616444757551, + "learning_rate": 0.00021119999999999996, + "loss": 6.1878, + "step": 1408 + }, + { + "epoch": 0.13147336008211252, + "grad_norm": 1.2400191038737642, + "learning_rate": 0.00021135, + "loss": 6.3787, + "step": 1409 + }, + { + "epoch": 0.13156666977698983, + "grad_norm": 1.0784043118177946, + "learning_rate": 0.00021149999999999996, + "loss": 6.3204, + "step": 1410 + }, + { + "epoch": 0.13165997947186714, + "grad_norm": 1.139976428624587, + "learning_rate": 0.00021165, + "loss": 6.0972, + "step": 1411 + }, + { + "epoch": 0.13175328916674442, + "grad_norm": 2.09384955922344, + "learning_rate": 0.00021179999999999997, + "loss": 6.3865, + "step": 1412 + }, + { + "epoch": 0.13184659886162173, + "grad_norm": 1.9118768205717909, + "learning_rate": 0.00021194999999999997, + "loss": 6.3546, + "step": 1413 + }, + { + "epoch": 0.131939908556499, + "grad_norm": 4.177503835542689, + "learning_rate": 0.00021209999999999998, + "loss": 6.4851, + "step": 1414 + }, + { + "epoch": 0.13203321825137632, + "grad_norm": 2.0911997463314127, + "learning_rate": 0.00021224999999999998, + "loss": 6.0747, + "step": 1415 + }, + { + "epoch": 0.13212652794625362, + "grad_norm": 1.2637208613303064, + "learning_rate": 0.00021239999999999996, + "loss": 6.0394, + "step": 1416 + }, + { + "epoch": 0.1322198376411309, + "grad_norm": 5.549192247696427, + "learning_rate": 0.00021255, + "loss": 6.5192, + "step": 1417 + }, + { + "epoch": 0.1323131473360082, + "grad_norm": 1.9544864988529016, + "learning_rate": 0.00021269999999999997, + "loss": 6.4972, + "step": 1418 + }, + { + "epoch": 0.13240645703088552, + "grad_norm": 1.75582303629644, + "learning_rate": 0.00021285, + "loss": 6.1916, + "step": 1419 + }, + { + "epoch": 0.1324997667257628, + "grad_norm": 1.5435587552657424, + "learning_rate": 0.00021299999999999997, + "loss": 6.0751, + "step": 1420 + }, + { + "epoch": 0.1325930764206401, + "grad_norm": 2.3248250268801165, + "learning_rate": 0.00021314999999999998, + "loss": 6.1608, + "step": 1421 + }, + { + "epoch": 0.1326863861155174, + "grad_norm": 1.8476676651163249, + "learning_rate": 0.00021329999999999998, + "loss": 5.9665, + "step": 1422 + }, + { + "epoch": 0.1327796958103947, + "grad_norm": 1.2596583453105812, + "learning_rate": 0.00021344999999999998, + "loss": 6.3533, + "step": 1423 + }, + { + "epoch": 0.132873005505272, + "grad_norm": 2.4909659626194776, + "learning_rate": 0.00021359999999999996, + "loss": 6.3132, + "step": 1424 + }, + { + "epoch": 0.1329663152001493, + "grad_norm": 1.9090696229487372, + "learning_rate": 0.00021375, + "loss": 6.3497, + "step": 1425 + }, + { + "epoch": 0.1330596248950266, + "grad_norm": 1.0819377228858287, + "learning_rate": 0.00021389999999999997, + "loss": 6.2852, + "step": 1426 + }, + { + "epoch": 0.1331529345899039, + "grad_norm": 1.1097680855683565, + "learning_rate": 0.00021405, + "loss": 6.0087, + "step": 1427 + }, + { + "epoch": 0.13324624428478118, + "grad_norm": 1.1124580558362192, + "learning_rate": 0.00021419999999999998, + "loss": 5.9938, + "step": 1428 + }, + { + "epoch": 0.1333395539796585, + "grad_norm": 1.0633416023460815, + "learning_rate": 0.00021434999999999998, + "loss": 6.1679, + "step": 1429 + }, + { + "epoch": 0.13343286367453577, + "grad_norm": 1.2019547858421231, + "learning_rate": 0.00021449999999999998, + "loss": 6.426, + "step": 1430 + }, + { + "epoch": 0.13352617336941308, + "grad_norm": 1.194278206412398, + "learning_rate": 0.00021464999999999999, + "loss": 5.6427, + "step": 1431 + }, + { + "epoch": 0.1336194830642904, + "grad_norm": 1.3762736769972292, + "learning_rate": 0.00021479999999999996, + "loss": 6.2697, + "step": 1432 + }, + { + "epoch": 0.13371279275916767, + "grad_norm": 1.1311200943820845, + "learning_rate": 0.00021495, + "loss": 6.2499, + "step": 1433 + }, + { + "epoch": 0.13380610245404498, + "grad_norm": 2.105909136648668, + "learning_rate": 0.00021509999999999997, + "loss": 6.2878, + "step": 1434 + }, + { + "epoch": 0.1338994121489223, + "grad_norm": 1.2123185918458483, + "learning_rate": 0.00021525, + "loss": 5.9928, + "step": 1435 + }, + { + "epoch": 0.13399272184379957, + "grad_norm": 1.4342591100214266, + "learning_rate": 0.00021539999999999998, + "loss": 6.2566, + "step": 1436 + }, + { + "epoch": 0.13408603153867688, + "grad_norm": 1.264269688589814, + "learning_rate": 0.00021554999999999998, + "loss": 6.1531, + "step": 1437 + }, + { + "epoch": 0.13417934123355416, + "grad_norm": 1.3746618633726095, + "learning_rate": 0.00021569999999999998, + "loss": 5.9624, + "step": 1438 + }, + { + "epoch": 0.13427265092843146, + "grad_norm": 2.9424092634550343, + "learning_rate": 0.00021585, + "loss": 5.8914, + "step": 1439 + }, + { + "epoch": 0.13436596062330877, + "grad_norm": 1.3570773875154238, + "learning_rate": 0.00021599999999999996, + "loss": 6.3477, + "step": 1440 + }, + { + "epoch": 0.13445927031818605, + "grad_norm": 1.3496456218081856, + "learning_rate": 0.00021615, + "loss": 6.2719, + "step": 1441 + }, + { + "epoch": 0.13455258001306336, + "grad_norm": 1.338957472378986, + "learning_rate": 0.00021629999999999997, + "loss": 6.2379, + "step": 1442 + }, + { + "epoch": 0.13464588970794064, + "grad_norm": 1.7434997960710532, + "learning_rate": 0.00021645, + "loss": 6.1898, + "step": 1443 + }, + { + "epoch": 0.13473919940281795, + "grad_norm": 1.217945923050817, + "learning_rate": 0.00021659999999999998, + "loss": 5.9831, + "step": 1444 + }, + { + "epoch": 0.13483250909769526, + "grad_norm": 1.2487337483339374, + "learning_rate": 0.00021674999999999998, + "loss": 6.0904, + "step": 1445 + }, + { + "epoch": 0.13492581879257254, + "grad_norm": 1.1494003365763654, + "learning_rate": 0.0002169, + "loss": 5.7152, + "step": 1446 + }, + { + "epoch": 0.13501912848744985, + "grad_norm": 1.0181640534167278, + "learning_rate": 0.00021705, + "loss": 6.2957, + "step": 1447 + }, + { + "epoch": 0.13511243818232715, + "grad_norm": 1.086963249449302, + "learning_rate": 0.00021719999999999997, + "loss": 5.9845, + "step": 1448 + }, + { + "epoch": 0.13520574787720444, + "grad_norm": 1.1353090828242258, + "learning_rate": 0.00021735, + "loss": 6.1515, + "step": 1449 + }, + { + "epoch": 0.13529905757208174, + "grad_norm": 1.1099812308138615, + "learning_rate": 0.00021749999999999997, + "loss": 6.3605, + "step": 1450 + }, + { + "epoch": 0.13539236726695902, + "grad_norm": 1.2046067747938873, + "learning_rate": 0.00021764999999999998, + "loss": 6.4793, + "step": 1451 + }, + { + "epoch": 0.13548567696183633, + "grad_norm": 1.1469159764042922, + "learning_rate": 0.00021779999999999998, + "loss": 6.3066, + "step": 1452 + }, + { + "epoch": 0.13557898665671364, + "grad_norm": 1.1463936638399397, + "learning_rate": 0.00021794999999999999, + "loss": 6.3705, + "step": 1453 + }, + { + "epoch": 0.13567229635159092, + "grad_norm": 1.327046428167801, + "learning_rate": 0.00021809999999999996, + "loss": 6.1627, + "step": 1454 + }, + { + "epoch": 0.13576560604646823, + "grad_norm": 1.1643988072405076, + "learning_rate": 0.00021825, + "loss": 5.9638, + "step": 1455 + }, + { + "epoch": 0.13585891574134554, + "grad_norm": 1.6279908352150474, + "learning_rate": 0.00021839999999999997, + "loss": 6.1149, + "step": 1456 + }, + { + "epoch": 0.13595222543622282, + "grad_norm": 1.1856143274661248, + "learning_rate": 0.00021855, + "loss": 6.1604, + "step": 1457 + }, + { + "epoch": 0.13604553513110013, + "grad_norm": 1.0702421852921662, + "learning_rate": 0.00021869999999999998, + "loss": 6.413, + "step": 1458 + }, + { + "epoch": 0.1361388448259774, + "grad_norm": 1.2950819736045889, + "learning_rate": 0.00021884999999999998, + "loss": 6.2922, + "step": 1459 + }, + { + "epoch": 0.13623215452085471, + "grad_norm": 1.0673034454126509, + "learning_rate": 0.00021899999999999998, + "loss": 6.1216, + "step": 1460 + }, + { + "epoch": 0.13632546421573202, + "grad_norm": 1.07528603169276, + "learning_rate": 0.00021915, + "loss": 6.1543, + "step": 1461 + }, + { + "epoch": 0.1364187739106093, + "grad_norm": 1.835522231059361, + "learning_rate": 0.00021929999999999996, + "loss": 6.4474, + "step": 1462 + }, + { + "epoch": 0.1365120836054866, + "grad_norm": 1.5105259665362452, + "learning_rate": 0.00021945, + "loss": 6.2764, + "step": 1463 + }, + { + "epoch": 0.13660539330036392, + "grad_norm": 1.5435138906623709, + "learning_rate": 0.00021959999999999997, + "loss": 5.9158, + "step": 1464 + }, + { + "epoch": 0.1366987029952412, + "grad_norm": 1.4584044834143177, + "learning_rate": 0.00021975, + "loss": 6.3685, + "step": 1465 + }, + { + "epoch": 0.1367920126901185, + "grad_norm": 1.1275232842439027, + "learning_rate": 0.00021989999999999998, + "loss": 6.0, + "step": 1466 + }, + { + "epoch": 0.1368853223849958, + "grad_norm": 1.1004544134145362, + "learning_rate": 0.00022004999999999998, + "loss": 5.9739, + "step": 1467 + }, + { + "epoch": 0.1369786320798731, + "grad_norm": 1.3313942141882553, + "learning_rate": 0.00022019999999999999, + "loss": 5.8058, + "step": 1468 + }, + { + "epoch": 0.1370719417747504, + "grad_norm": 1.774903304117492, + "learning_rate": 0.00022035, + "loss": 5.9945, + "step": 1469 + }, + { + "epoch": 0.13716525146962769, + "grad_norm": 1.3107441847403432, + "learning_rate": 0.00022049999999999997, + "loss": 5.9705, + "step": 1470 + }, + { + "epoch": 0.137258561164505, + "grad_norm": 1.3952750091488564, + "learning_rate": 0.00022065, + "loss": 6.182, + "step": 1471 + }, + { + "epoch": 0.1373518708593823, + "grad_norm": 1.6226102633647148, + "learning_rate": 0.00022079999999999997, + "loss": 6.1052, + "step": 1472 + }, + { + "epoch": 0.13744518055425958, + "grad_norm": 1.310851186901282, + "learning_rate": 0.00022095, + "loss": 6.0346, + "step": 1473 + }, + { + "epoch": 0.1375384902491369, + "grad_norm": 1.0728236416566572, + "learning_rate": 0.00022109999999999998, + "loss": 5.6957, + "step": 1474 + }, + { + "epoch": 0.13763179994401417, + "grad_norm": 2.0071763012501105, + "learning_rate": 0.00022124999999999998, + "loss": 6.0429, + "step": 1475 + }, + { + "epoch": 0.13772510963889148, + "grad_norm": 1.5835857123104642, + "learning_rate": 0.0002214, + "loss": 6.2882, + "step": 1476 + }, + { + "epoch": 0.1378184193337688, + "grad_norm": 1.2906649894401594, + "learning_rate": 0.00022155, + "loss": 5.8941, + "step": 1477 + }, + { + "epoch": 0.13791172902864607, + "grad_norm": 1.8825299188476745, + "learning_rate": 0.00022169999999999997, + "loss": 6.085, + "step": 1478 + }, + { + "epoch": 0.13800503872352338, + "grad_norm": 2.2568594098259056, + "learning_rate": 0.00022185, + "loss": 6.0577, + "step": 1479 + }, + { + "epoch": 0.13809834841840068, + "grad_norm": 1.1216247480828676, + "learning_rate": 0.00022199999999999998, + "loss": 6.1911, + "step": 1480 + }, + { + "epoch": 0.13819165811327797, + "grad_norm": 1.2154764534562632, + "learning_rate": 0.00022215, + "loss": 6.1295, + "step": 1481 + }, + { + "epoch": 0.13828496780815527, + "grad_norm": 1.0960133242635377, + "learning_rate": 0.00022229999999999998, + "loss": 6.1092, + "step": 1482 + }, + { + "epoch": 0.13837827750303255, + "grad_norm": 1.253449986573928, + "learning_rate": 0.00022244999999999999, + "loss": 5.9991, + "step": 1483 + }, + { + "epoch": 0.13847158719790986, + "grad_norm": 1.2074271590686714, + "learning_rate": 0.0002226, + "loss": 5.6911, + "step": 1484 + }, + { + "epoch": 0.13856489689278717, + "grad_norm": 1.8010968429431964, + "learning_rate": 0.00022275, + "loss": 6.3349, + "step": 1485 + }, + { + "epoch": 0.13865820658766445, + "grad_norm": 1.1736932644083253, + "learning_rate": 0.00022289999999999997, + "loss": 6.0057, + "step": 1486 + }, + { + "epoch": 0.13875151628254176, + "grad_norm": 1.1255446841457617, + "learning_rate": 0.00022305, + "loss": 6.0923, + "step": 1487 + }, + { + "epoch": 0.13884482597741907, + "grad_norm": 1.2453400877213654, + "learning_rate": 0.00022319999999999998, + "loss": 6.1207, + "step": 1488 + }, + { + "epoch": 0.13893813567229635, + "grad_norm": 1.6869807247185116, + "learning_rate": 0.00022335, + "loss": 6.0073, + "step": 1489 + }, + { + "epoch": 0.13903144536717366, + "grad_norm": 1.0787543422272658, + "learning_rate": 0.00022349999999999998, + "loss": 6.2661, + "step": 1490 + }, + { + "epoch": 0.13912475506205094, + "grad_norm": 1.1989304252908837, + "learning_rate": 0.00022365, + "loss": 6.0569, + "step": 1491 + }, + { + "epoch": 0.13921806475692824, + "grad_norm": 1.0639499274615203, + "learning_rate": 0.0002238, + "loss": 6.1721, + "step": 1492 + }, + { + "epoch": 0.13931137445180555, + "grad_norm": 1.0850511940995364, + "learning_rate": 0.00022395, + "loss": 6.2112, + "step": 1493 + }, + { + "epoch": 0.13940468414668283, + "grad_norm": 0.9541542825144239, + "learning_rate": 0.00022409999999999997, + "loss": 6.0763, + "step": 1494 + }, + { + "epoch": 0.13949799384156014, + "grad_norm": 1.4407454687545114, + "learning_rate": 0.00022425, + "loss": 6.4864, + "step": 1495 + }, + { + "epoch": 0.13959130353643742, + "grad_norm": 1.1096281527203413, + "learning_rate": 0.00022439999999999998, + "loss": 5.9707, + "step": 1496 + }, + { + "epoch": 0.13968461323131473, + "grad_norm": 1.030694940153786, + "learning_rate": 0.00022455, + "loss": 6.173, + "step": 1497 + }, + { + "epoch": 0.13977792292619204, + "grad_norm": 1.084096384983346, + "learning_rate": 0.0002247, + "loss": 5.9573, + "step": 1498 + }, + { + "epoch": 0.13987123262106932, + "grad_norm": 1.2189601411836775, + "learning_rate": 0.00022485, + "loss": 5.9801, + "step": 1499 + }, + { + "epoch": 0.13996454231594663, + "grad_norm": 1.2866540148373604, + "learning_rate": 0.000225, + "loss": 6.1902, + "step": 1500 + }, + { + "epoch": 0.14005785201082394, + "grad_norm": 7.088785384679744, + "learning_rate": 0.00022514999999999997, + "loss": 5.8336, + "step": 1501 + }, + { + "epoch": 0.14015116170570122, + "grad_norm": 1.0876545737841021, + "learning_rate": 0.00022529999999999997, + "loss": 5.8908, + "step": 1502 + }, + { + "epoch": 0.14024447140057852, + "grad_norm": 1.457307964177532, + "learning_rate": 0.00022544999999999995, + "loss": 6.0182, + "step": 1503 + }, + { + "epoch": 0.1403377810954558, + "grad_norm": 1.5345466331520743, + "learning_rate": 0.00022559999999999998, + "loss": 5.6391, + "step": 1504 + }, + { + "epoch": 0.1404310907903331, + "grad_norm": 1.5483761089003192, + "learning_rate": 0.00022574999999999996, + "loss": 5.9398, + "step": 1505 + }, + { + "epoch": 0.14052440048521042, + "grad_norm": 1.274095569860916, + "learning_rate": 0.0002259, + "loss": 6.1742, + "step": 1506 + }, + { + "epoch": 0.1406177101800877, + "grad_norm": 1.6359781978870045, + "learning_rate": 0.00022604999999999997, + "loss": 5.7548, + "step": 1507 + }, + { + "epoch": 0.140711019874965, + "grad_norm": 1.1882486452209193, + "learning_rate": 0.00022619999999999997, + "loss": 5.6854, + "step": 1508 + }, + { + "epoch": 0.14080432956984232, + "grad_norm": 1.4251403764340456, + "learning_rate": 0.00022634999999999997, + "loss": 5.8732, + "step": 1509 + }, + { + "epoch": 0.1408976392647196, + "grad_norm": 1.466796834886676, + "learning_rate": 0.00022649999999999998, + "loss": 6.5356, + "step": 1510 + }, + { + "epoch": 0.1409909489595969, + "grad_norm": 1.5782477656002585, + "learning_rate": 0.00022664999999999995, + "loss": 6.1199, + "step": 1511 + }, + { + "epoch": 0.1410842586544742, + "grad_norm": 2.120003389197898, + "learning_rate": 0.00022679999999999998, + "loss": 6.5512, + "step": 1512 + }, + { + "epoch": 0.1411775683493515, + "grad_norm": 1.1643663066424605, + "learning_rate": 0.00022694999999999996, + "loss": 5.7535, + "step": 1513 + }, + { + "epoch": 0.1412708780442288, + "grad_norm": 1.1208543137609506, + "learning_rate": 0.0002271, + "loss": 5.9419, + "step": 1514 + }, + { + "epoch": 0.14136418773910608, + "grad_norm": 1.2928479126017602, + "learning_rate": 0.00022724999999999997, + "loss": 5.6591, + "step": 1515 + }, + { + "epoch": 0.1414574974339834, + "grad_norm": 1.3423332050699108, + "learning_rate": 0.00022739999999999997, + "loss": 6.091, + "step": 1516 + }, + { + "epoch": 0.1415508071288607, + "grad_norm": 1.783354817845397, + "learning_rate": 0.00022754999999999997, + "loss": 5.9362, + "step": 1517 + }, + { + "epoch": 0.14164411682373798, + "grad_norm": 1.5098061661948168, + "learning_rate": 0.00022769999999999998, + "loss": 6.0037, + "step": 1518 + }, + { + "epoch": 0.1417374265186153, + "grad_norm": 2.2789166072616616, + "learning_rate": 0.00022784999999999995, + "loss": 5.7146, + "step": 1519 + }, + { + "epoch": 0.14183073621349257, + "grad_norm": 2.5912744706308053, + "learning_rate": 0.00022799999999999999, + "loss": 6.0159, + "step": 1520 + }, + { + "epoch": 0.14192404590836988, + "grad_norm": 1.7652767266157812, + "learning_rate": 0.00022814999999999996, + "loss": 6.304, + "step": 1521 + }, + { + "epoch": 0.1420173556032472, + "grad_norm": 1.4672729459664335, + "learning_rate": 0.0002283, + "loss": 5.9273, + "step": 1522 + }, + { + "epoch": 0.14211066529812447, + "grad_norm": 2.3690284405998523, + "learning_rate": 0.00022844999999999997, + "loss": 5.5563, + "step": 1523 + }, + { + "epoch": 0.14220397499300177, + "grad_norm": 1.6795306180695317, + "learning_rate": 0.00022859999999999997, + "loss": 5.6919, + "step": 1524 + }, + { + "epoch": 0.14229728468787908, + "grad_norm": 1.462579196363749, + "learning_rate": 0.00022874999999999998, + "loss": 5.7951, + "step": 1525 + }, + { + "epoch": 0.14239059438275636, + "grad_norm": 1.3515560817003385, + "learning_rate": 0.00022889999999999998, + "loss": 6.0127, + "step": 1526 + }, + { + "epoch": 0.14248390407763367, + "grad_norm": 1.5405870358068294, + "learning_rate": 0.00022904999999999996, + "loss": 6.0026, + "step": 1527 + }, + { + "epoch": 0.14257721377251095, + "grad_norm": 2.3621922454765087, + "learning_rate": 0.0002292, + "loss": 6.1472, + "step": 1528 + }, + { + "epoch": 0.14267052346738826, + "grad_norm": 1.1527044231503485, + "learning_rate": 0.00022934999999999996, + "loss": 6.244, + "step": 1529 + }, + { + "epoch": 0.14276383316226557, + "grad_norm": 1.300731497747626, + "learning_rate": 0.0002295, + "loss": 6.1284, + "step": 1530 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 1.1078068775042358, + "learning_rate": 0.00022964999999999997, + "loss": 5.6432, + "step": 1531 + }, + { + "epoch": 0.14295045255202016, + "grad_norm": 1.498701911889005, + "learning_rate": 0.00022979999999999997, + "loss": 6.1671, + "step": 1532 + }, + { + "epoch": 0.14304376224689747, + "grad_norm": 1.4633309896228204, + "learning_rate": 0.00022994999999999998, + "loss": 6.2793, + "step": 1533 + }, + { + "epoch": 0.14313707194177475, + "grad_norm": 1.197980663203834, + "learning_rate": 0.00023009999999999998, + "loss": 6.2066, + "step": 1534 + }, + { + "epoch": 0.14323038163665205, + "grad_norm": 1.3644371374466042, + "learning_rate": 0.00023024999999999996, + "loss": 6.248, + "step": 1535 + }, + { + "epoch": 0.14332369133152933, + "grad_norm": 2.3299356164528473, + "learning_rate": 0.0002304, + "loss": 5.7192, + "step": 1536 + }, + { + "epoch": 0.14341700102640664, + "grad_norm": 1.4600504261469014, + "learning_rate": 0.00023054999999999997, + "loss": 6.1221, + "step": 1537 + }, + { + "epoch": 0.14351031072128395, + "grad_norm": 0.9961182940528043, + "learning_rate": 0.0002307, + "loss": 6.3046, + "step": 1538 + }, + { + "epoch": 0.14360362041616123, + "grad_norm": 1.8213529535576407, + "learning_rate": 0.00023084999999999997, + "loss": 5.544, + "step": 1539 + }, + { + "epoch": 0.14369693011103854, + "grad_norm": 1.4930282574767029, + "learning_rate": 0.00023099999999999998, + "loss": 5.9844, + "step": 1540 + }, + { + "epoch": 0.14379023980591585, + "grad_norm": 1.458362299634727, + "learning_rate": 0.00023114999999999998, + "loss": 6.069, + "step": 1541 + }, + { + "epoch": 0.14388354950079313, + "grad_norm": 1.8729613816513024, + "learning_rate": 0.00023129999999999998, + "loss": 6.0998, + "step": 1542 + }, + { + "epoch": 0.14397685919567044, + "grad_norm": 1.216205957548438, + "learning_rate": 0.00023144999999999996, + "loss": 6.0778, + "step": 1543 + }, + { + "epoch": 0.14407016889054772, + "grad_norm": 1.3482577419974693, + "learning_rate": 0.0002316, + "loss": 6.2662, + "step": 1544 + }, + { + "epoch": 0.14416347858542503, + "grad_norm": 2.266341700112728, + "learning_rate": 0.00023174999999999997, + "loss": 6.0546, + "step": 1545 + }, + { + "epoch": 0.14425678828030233, + "grad_norm": 3.2110736331411913, + "learning_rate": 0.0002319, + "loss": 6.0751, + "step": 1546 + }, + { + "epoch": 0.14435009797517961, + "grad_norm": 1.1481295231304225, + "learning_rate": 0.00023204999999999998, + "loss": 6.1464, + "step": 1547 + }, + { + "epoch": 0.14444340767005692, + "grad_norm": 1.4259679212337055, + "learning_rate": 0.00023219999999999998, + "loss": 6.1306, + "step": 1548 + }, + { + "epoch": 0.1445367173649342, + "grad_norm": 1.299091610800162, + "learning_rate": 0.00023234999999999998, + "loss": 6.0569, + "step": 1549 + }, + { + "epoch": 0.1446300270598115, + "grad_norm": 1.2268748521820108, + "learning_rate": 0.00023249999999999999, + "loss": 5.7201, + "step": 1550 + }, + { + "epoch": 0.14472333675468882, + "grad_norm": 1.324827493047507, + "learning_rate": 0.00023264999999999996, + "loss": 6.0575, + "step": 1551 + }, + { + "epoch": 0.1448166464495661, + "grad_norm": 1.4196664533119459, + "learning_rate": 0.0002328, + "loss": 5.8135, + "step": 1552 + }, + { + "epoch": 0.1449099561444434, + "grad_norm": 1.5258528540710545, + "learning_rate": 0.00023294999999999997, + "loss": 6.2715, + "step": 1553 + }, + { + "epoch": 0.14500326583932072, + "grad_norm": 1.2692500206960453, + "learning_rate": 0.00023309999999999997, + "loss": 6.033, + "step": 1554 + }, + { + "epoch": 0.145096575534198, + "grad_norm": 1.4696488766924176, + "learning_rate": 0.00023324999999999998, + "loss": 5.9557, + "step": 1555 + }, + { + "epoch": 0.1451898852290753, + "grad_norm": 1.2367430383647535, + "learning_rate": 0.00023339999999999998, + "loss": 6.3734, + "step": 1556 + }, + { + "epoch": 0.14528319492395259, + "grad_norm": 1.0999788162513628, + "learning_rate": 0.00023354999999999996, + "loss": 6.2675, + "step": 1557 + }, + { + "epoch": 0.1453765046188299, + "grad_norm": 1.346141881118203, + "learning_rate": 0.0002337, + "loss": 6.1667, + "step": 1558 + }, + { + "epoch": 0.1454698143137072, + "grad_norm": 1.197643007383513, + "learning_rate": 0.00023384999999999997, + "loss": 5.9784, + "step": 1559 + }, + { + "epoch": 0.14556312400858448, + "grad_norm": 1.5076616674179641, + "learning_rate": 0.000234, + "loss": 5.8532, + "step": 1560 + }, + { + "epoch": 0.1456564337034618, + "grad_norm": 1.182061674747634, + "learning_rate": 0.00023414999999999997, + "loss": 6.2054, + "step": 1561 + }, + { + "epoch": 0.1457497433983391, + "grad_norm": 1.1464570251165531, + "learning_rate": 0.00023429999999999998, + "loss": 5.8882, + "step": 1562 + }, + { + "epoch": 0.14584305309321638, + "grad_norm": 1.8998952496272266, + "learning_rate": 0.00023444999999999998, + "loss": 6.0488, + "step": 1563 + }, + { + "epoch": 0.1459363627880937, + "grad_norm": 1.5822076920983446, + "learning_rate": 0.00023459999999999998, + "loss": 5.7923, + "step": 1564 + }, + { + "epoch": 0.14602967248297097, + "grad_norm": 1.1514187405410903, + "learning_rate": 0.00023474999999999996, + "loss": 5.9698, + "step": 1565 + }, + { + "epoch": 0.14612298217784828, + "grad_norm": 2.040544316894019, + "learning_rate": 0.0002349, + "loss": 5.8733, + "step": 1566 + }, + { + "epoch": 0.14621629187272558, + "grad_norm": 1.5292953498011594, + "learning_rate": 0.00023504999999999997, + "loss": 5.941, + "step": 1567 + }, + { + "epoch": 0.14630960156760286, + "grad_norm": 1.1947625906381196, + "learning_rate": 0.0002352, + "loss": 6.3089, + "step": 1568 + }, + { + "epoch": 0.14640291126248017, + "grad_norm": 1.456426141799462, + "learning_rate": 0.00023534999999999997, + "loss": 5.9325, + "step": 1569 + }, + { + "epoch": 0.14649622095735748, + "grad_norm": 1.3203591797452197, + "learning_rate": 0.00023549999999999998, + "loss": 5.7022, + "step": 1570 + }, + { + "epoch": 0.14658953065223476, + "grad_norm": 1.363713342313653, + "learning_rate": 0.00023564999999999998, + "loss": 5.4368, + "step": 1571 + }, + { + "epoch": 0.14668284034711207, + "grad_norm": 2.334081323152148, + "learning_rate": 0.00023579999999999999, + "loss": 6.2817, + "step": 1572 + }, + { + "epoch": 0.14677615004198935, + "grad_norm": 1.6245624486408488, + "learning_rate": 0.00023594999999999996, + "loss": 6.0066, + "step": 1573 + }, + { + "epoch": 0.14686945973686666, + "grad_norm": 1.0711790249251028, + "learning_rate": 0.0002361, + "loss": 6.3881, + "step": 1574 + }, + { + "epoch": 0.14696276943174397, + "grad_norm": 1.153586532138504, + "learning_rate": 0.00023624999999999997, + "loss": 5.8907, + "step": 1575 + }, + { + "epoch": 0.14705607912662125, + "grad_norm": 1.542687635320667, + "learning_rate": 0.0002364, + "loss": 6.3154, + "step": 1576 + }, + { + "epoch": 0.14714938882149856, + "grad_norm": 1.1761623673004047, + "learning_rate": 0.00023654999999999998, + "loss": 5.9045, + "step": 1577 + }, + { + "epoch": 0.14724269851637586, + "grad_norm": 3.163269170842302, + "learning_rate": 0.00023669999999999998, + "loss": 6.1984, + "step": 1578 + }, + { + "epoch": 0.14733600821125314, + "grad_norm": 1.4747009804508153, + "learning_rate": 0.00023684999999999998, + "loss": 6.2177, + "step": 1579 + }, + { + "epoch": 0.14742931790613045, + "grad_norm": 1.1170667300545754, + "learning_rate": 0.000237, + "loss": 6.1377, + "step": 1580 + }, + { + "epoch": 0.14752262760100773, + "grad_norm": 0.9808115253437152, + "learning_rate": 0.00023714999999999996, + "loss": 6.012, + "step": 1581 + }, + { + "epoch": 0.14761593729588504, + "grad_norm": 1.6181420873872459, + "learning_rate": 0.0002373, + "loss": 5.5005, + "step": 1582 + }, + { + "epoch": 0.14770924699076235, + "grad_norm": 1.7483743241146836, + "learning_rate": 0.00023744999999999997, + "loss": 6.1352, + "step": 1583 + }, + { + "epoch": 0.14780255668563963, + "grad_norm": 1.4634922168012183, + "learning_rate": 0.0002376, + "loss": 5.971, + "step": 1584 + }, + { + "epoch": 0.14789586638051694, + "grad_norm": 1.229129137901754, + "learning_rate": 0.00023774999999999998, + "loss": 6.108, + "step": 1585 + }, + { + "epoch": 0.14798917607539425, + "grad_norm": 1.3130140655264533, + "learning_rate": 0.00023789999999999998, + "loss": 6.077, + "step": 1586 + }, + { + "epoch": 0.14808248577027153, + "grad_norm": 1.2978824816571488, + "learning_rate": 0.00023804999999999999, + "loss": 6.2794, + "step": 1587 + }, + { + "epoch": 0.14817579546514884, + "grad_norm": 1.5093597571111523, + "learning_rate": 0.0002382, + "loss": 6.1172, + "step": 1588 + }, + { + "epoch": 0.14826910516002612, + "grad_norm": 1.4513486794351866, + "learning_rate": 0.00023834999999999997, + "loss": 5.7937, + "step": 1589 + }, + { + "epoch": 0.14836241485490342, + "grad_norm": 1.0934208795276728, + "learning_rate": 0.0002385, + "loss": 5.5746, + "step": 1590 + }, + { + "epoch": 0.14845572454978073, + "grad_norm": 2.386799048766373, + "learning_rate": 0.00023864999999999997, + "loss": 6.1399, + "step": 1591 + }, + { + "epoch": 0.148549034244658, + "grad_norm": 3.4447348075833664, + "learning_rate": 0.0002388, + "loss": 6.0611, + "step": 1592 + }, + { + "epoch": 0.14864234393953532, + "grad_norm": 1.3967206270688344, + "learning_rate": 0.00023894999999999998, + "loss": 6.073, + "step": 1593 + }, + { + "epoch": 0.14873565363441263, + "grad_norm": 1.1764795497766027, + "learning_rate": 0.00023909999999999998, + "loss": 6.14, + "step": 1594 + }, + { + "epoch": 0.1488289633292899, + "grad_norm": 1.4010837440266932, + "learning_rate": 0.00023925, + "loss": 6.3999, + "step": 1595 + }, + { + "epoch": 0.14892227302416722, + "grad_norm": 1.4108402637820514, + "learning_rate": 0.0002394, + "loss": 6.4195, + "step": 1596 + }, + { + "epoch": 0.1490155827190445, + "grad_norm": 108.7932963254385, + "learning_rate": 0.00023954999999999997, + "loss": 6.2382, + "step": 1597 + }, + { + "epoch": 0.1491088924139218, + "grad_norm": 1.2845198598989103, + "learning_rate": 0.0002397, + "loss": 6.0949, + "step": 1598 + }, + { + "epoch": 0.14920220210879911, + "grad_norm": 1.178176598243144, + "learning_rate": 0.00023984999999999998, + "loss": 5.9668, + "step": 1599 + }, + { + "epoch": 0.1492955118036764, + "grad_norm": 3.665231987119466, + "learning_rate": 0.00023999999999999998, + "loss": 6.4743, + "step": 1600 + }, + { + "epoch": 0.1493888214985537, + "grad_norm": 1.5831882208618417, + "learning_rate": 0.00024014999999999998, + "loss": 6.2823, + "step": 1601 + }, + { + "epoch": 0.14948213119343098, + "grad_norm": 5.57437704475879, + "learning_rate": 0.00024029999999999999, + "loss": 5.496, + "step": 1602 + }, + { + "epoch": 0.1495754408883083, + "grad_norm": 6.493253627416996, + "learning_rate": 0.00024044999999999996, + "loss": 6.282, + "step": 1603 + }, + { + "epoch": 0.1496687505831856, + "grad_norm": 2.7012678071779783, + "learning_rate": 0.0002406, + "loss": 5.9046, + "step": 1604 + }, + { + "epoch": 0.14976206027806288, + "grad_norm": 2.403021006838836, + "learning_rate": 0.00024074999999999997, + "loss": 5.4818, + "step": 1605 + }, + { + "epoch": 0.1498553699729402, + "grad_norm": 1.3799115262374058, + "learning_rate": 0.0002409, + "loss": 5.9219, + "step": 1606 + }, + { + "epoch": 0.1499486796678175, + "grad_norm": 1.4602979517175878, + "learning_rate": 0.00024104999999999998, + "loss": 6.0796, + "step": 1607 + }, + { + "epoch": 0.15004198936269478, + "grad_norm": 4.634254920728987, + "learning_rate": 0.00024119999999999998, + "loss": 6.6451, + "step": 1608 + }, + { + "epoch": 0.15013529905757209, + "grad_norm": 2.5853623602207603, + "learning_rate": 0.00024134999999999998, + "loss": 5.9897, + "step": 1609 + }, + { + "epoch": 0.15022860875244937, + "grad_norm": 1.9937868858668841, + "learning_rate": 0.0002415, + "loss": 6.461, + "step": 1610 + }, + { + "epoch": 0.15032191844732667, + "grad_norm": 3.3203879703230763, + "learning_rate": 0.00024164999999999996, + "loss": 6.1032, + "step": 1611 + }, + { + "epoch": 0.15041522814220398, + "grad_norm": 2.6340340515608056, + "learning_rate": 0.0002418, + "loss": 6.0074, + "step": 1612 + }, + { + "epoch": 0.15050853783708126, + "grad_norm": 1.9532615807494658, + "learning_rate": 0.00024194999999999997, + "loss": 6.0798, + "step": 1613 + }, + { + "epoch": 0.15060184753195857, + "grad_norm": 3.644718645294615, + "learning_rate": 0.0002421, + "loss": 6.7983, + "step": 1614 + }, + { + "epoch": 0.15069515722683588, + "grad_norm": 1.4971091700634032, + "learning_rate": 0.00024224999999999998, + "loss": 6.0318, + "step": 1615 + }, + { + "epoch": 0.15078846692171316, + "grad_norm": 1.9805821059011188, + "learning_rate": 0.00024239999999999998, + "loss": 6.0774, + "step": 1616 + }, + { + "epoch": 0.15088177661659047, + "grad_norm": 133.53585710159317, + "learning_rate": 0.00024255, + "loss": 6.1958, + "step": 1617 + }, + { + "epoch": 0.15097508631146775, + "grad_norm": 2.705680328541319, + "learning_rate": 0.0002427, + "loss": 6.3514, + "step": 1618 + }, + { + "epoch": 0.15106839600634506, + "grad_norm": 3.9620839664539167, + "learning_rate": 0.00024284999999999997, + "loss": 5.942, + "step": 1619 + }, + { + "epoch": 0.15116170570122237, + "grad_norm": 2.2190733135690466, + "learning_rate": 0.000243, + "loss": 6.4133, + "step": 1620 + }, + { + "epoch": 0.15125501539609965, + "grad_norm": 1.9622781839451442, + "learning_rate": 0.00024314999999999997, + "loss": 5.8496, + "step": 1621 + }, + { + "epoch": 0.15134832509097695, + "grad_norm": 2.4431213866326025, + "learning_rate": 0.0002433, + "loss": 6.3267, + "step": 1622 + }, + { + "epoch": 0.15144163478585426, + "grad_norm": 2.1266506315666067, + "learning_rate": 0.00024344999999999998, + "loss": 6.0693, + "step": 1623 + }, + { + "epoch": 0.15153494448073154, + "grad_norm": 1.4441409847185622, + "learning_rate": 0.00024359999999999999, + "loss": 6.4613, + "step": 1624 + }, + { + "epoch": 0.15162825417560885, + "grad_norm": 4.358682717705284, + "learning_rate": 0.00024375, + "loss": 5.9972, + "step": 1625 + }, + { + "epoch": 0.15172156387048613, + "grad_norm": 2.731044041614414, + "learning_rate": 0.00024389999999999997, + "loss": 5.7335, + "step": 1626 + }, + { + "epoch": 0.15181487356536344, + "grad_norm": 1.6899953471359042, + "learning_rate": 0.00024404999999999997, + "loss": 6.2408, + "step": 1627 + }, + { + "epoch": 0.15190818326024075, + "grad_norm": 3.3069350437546916, + "learning_rate": 0.00024419999999999997, + "loss": 6.2464, + "step": 1628 + }, + { + "epoch": 0.15200149295511803, + "grad_norm": 1.2688726341738317, + "learning_rate": 0.00024435, + "loss": 6.2889, + "step": 1629 + }, + { + "epoch": 0.15209480264999534, + "grad_norm": 1.8655953433663865, + "learning_rate": 0.0002445, + "loss": 5.9435, + "step": 1630 + }, + { + "epoch": 0.15218811234487264, + "grad_norm": 1.7160367163003862, + "learning_rate": 0.00024464999999999996, + "loss": 6.3692, + "step": 1631 + }, + { + "epoch": 0.15228142203974993, + "grad_norm": 1.205595973179143, + "learning_rate": 0.0002448, + "loss": 5.9836, + "step": 1632 + }, + { + "epoch": 0.15237473173462723, + "grad_norm": 1.4349299188486557, + "learning_rate": 0.00024494999999999996, + "loss": 6.3428, + "step": 1633 + }, + { + "epoch": 0.15246804142950451, + "grad_norm": 1.8390788721113833, + "learning_rate": 0.00024509999999999994, + "loss": 6.4171, + "step": 1634 + }, + { + "epoch": 0.15256135112438182, + "grad_norm": 2.06453313418655, + "learning_rate": 0.00024524999999999997, + "loss": 6.3183, + "step": 1635 + }, + { + "epoch": 0.15265466081925913, + "grad_norm": 2.101263311573955, + "learning_rate": 0.00024539999999999995, + "loss": 5.9252, + "step": 1636 + }, + { + "epoch": 0.1527479705141364, + "grad_norm": 2.5115832946699204, + "learning_rate": 0.00024555, + "loss": 6.2647, + "step": 1637 + }, + { + "epoch": 0.15284128020901372, + "grad_norm": 2.1682479436217386, + "learning_rate": 0.00024569999999999995, + "loss": 5.6352, + "step": 1638 + }, + { + "epoch": 0.15293458990389103, + "grad_norm": 5.463019268970865, + "learning_rate": 0.00024585, + "loss": 6.5336, + "step": 1639 + }, + { + "epoch": 0.1530278995987683, + "grad_norm": 1.7869101947000723, + "learning_rate": 0.00024599999999999996, + "loss": 5.9129, + "step": 1640 + }, + { + "epoch": 0.15312120929364562, + "grad_norm": 2.0085960465459465, + "learning_rate": 0.00024615, + "loss": 6.3829, + "step": 1641 + }, + { + "epoch": 0.1532145189885229, + "grad_norm": 1.6425429752722176, + "learning_rate": 0.00024629999999999997, + "loss": 6.0242, + "step": 1642 + }, + { + "epoch": 0.1533078286834002, + "grad_norm": 10.78072898903019, + "learning_rate": 0.00024645, + "loss": 6.4166, + "step": 1643 + }, + { + "epoch": 0.1534011383782775, + "grad_norm": 11.47259767928769, + "learning_rate": 0.0002466, + "loss": 6.6456, + "step": 1644 + }, + { + "epoch": 0.1534944480731548, + "grad_norm": 2.0043342264351764, + "learning_rate": 0.00024675, + "loss": 6.5086, + "step": 1645 + }, + { + "epoch": 0.1535877577680321, + "grad_norm": 1.7247614490239112, + "learning_rate": 0.0002469, + "loss": 6.402, + "step": 1646 + }, + { + "epoch": 0.1536810674629094, + "grad_norm": 2.453616292223442, + "learning_rate": 0.00024704999999999996, + "loss": 6.1119, + "step": 1647 + }, + { + "epoch": 0.1537743771577867, + "grad_norm": 2.133230339292896, + "learning_rate": 0.0002472, + "loss": 5.6975, + "step": 1648 + }, + { + "epoch": 0.153867686852664, + "grad_norm": 2.7848059312227638, + "learning_rate": 0.00024734999999999997, + "loss": 6.6517, + "step": 1649 + }, + { + "epoch": 0.15396099654754128, + "grad_norm": 1.6105605356750956, + "learning_rate": 0.00024749999999999994, + "loss": 6.2875, + "step": 1650 + }, + { + "epoch": 0.1540543062424186, + "grad_norm": 1.2156913998635435, + "learning_rate": 0.00024765, + "loss": 6.0363, + "step": 1651 + }, + { + "epoch": 0.1541476159372959, + "grad_norm": 3.242953249616661, + "learning_rate": 0.00024779999999999995, + "loss": 6.1359, + "step": 1652 + }, + { + "epoch": 0.15424092563217318, + "grad_norm": 1.9440212844343971, + "learning_rate": 0.00024795, + "loss": 6.1542, + "step": 1653 + }, + { + "epoch": 0.15433423532705048, + "grad_norm": 1.444369613240847, + "learning_rate": 0.00024809999999999996, + "loss": 6.2024, + "step": 1654 + }, + { + "epoch": 0.15442754502192776, + "grad_norm": 2.591872718801743, + "learning_rate": 0.00024825, + "loss": 6.0126, + "step": 1655 + }, + { + "epoch": 0.15452085471680507, + "grad_norm": 2.3445632900267896, + "learning_rate": 0.00024839999999999997, + "loss": 6.598, + "step": 1656 + }, + { + "epoch": 0.15461416441168238, + "grad_norm": 1.9157634896482796, + "learning_rate": 0.00024855, + "loss": 5.4971, + "step": 1657 + }, + { + "epoch": 0.15470747410655966, + "grad_norm": 1.327332804795297, + "learning_rate": 0.0002487, + "loss": 6.1092, + "step": 1658 + }, + { + "epoch": 0.15480078380143697, + "grad_norm": 2.0435338412268864, + "learning_rate": 0.00024885, + "loss": 6.4531, + "step": 1659 + }, + { + "epoch": 0.15489409349631428, + "grad_norm": 1.5082300409859073, + "learning_rate": 0.000249, + "loss": 6.54, + "step": 1660 + }, + { + "epoch": 0.15498740319119156, + "grad_norm": 1.8905861439306713, + "learning_rate": 0.00024914999999999996, + "loss": 5.549, + "step": 1661 + }, + { + "epoch": 0.15508071288606887, + "grad_norm": 1.1765375172112948, + "learning_rate": 0.0002493, + "loss": 6.3343, + "step": 1662 + }, + { + "epoch": 0.15517402258094615, + "grad_norm": 1.9662055153788909, + "learning_rate": 0.00024944999999999996, + "loss": 6.1667, + "step": 1663 + }, + { + "epoch": 0.15526733227582346, + "grad_norm": 1.3487987272220028, + "learning_rate": 0.00024959999999999994, + "loss": 6.3366, + "step": 1664 + }, + { + "epoch": 0.15536064197070076, + "grad_norm": 1.5867252497007858, + "learning_rate": 0.00024974999999999997, + "loss": 6.2052, + "step": 1665 + }, + { + "epoch": 0.15545395166557804, + "grad_norm": 1.2652882852838725, + "learning_rate": 0.00024989999999999995, + "loss": 6.1554, + "step": 1666 + }, + { + "epoch": 0.15554726136045535, + "grad_norm": 1.230397654647555, + "learning_rate": 0.00025005, + "loss": 6.0874, + "step": 1667 + }, + { + "epoch": 0.15564057105533266, + "grad_norm": 1.3984533837922277, + "learning_rate": 0.00025019999999999996, + "loss": 6.1454, + "step": 1668 + }, + { + "epoch": 0.15573388075020994, + "grad_norm": 1.1191217604196697, + "learning_rate": 0.00025035, + "loss": 6.019, + "step": 1669 + }, + { + "epoch": 0.15582719044508725, + "grad_norm": 1.3450268709926367, + "learning_rate": 0.00025049999999999996, + "loss": 5.8461, + "step": 1670 + }, + { + "epoch": 0.15592050013996453, + "grad_norm": 1.8816806982340322, + "learning_rate": 0.00025065, + "loss": 5.6711, + "step": 1671 + }, + { + "epoch": 0.15601380983484184, + "grad_norm": 1.4059074254023989, + "learning_rate": 0.00025079999999999997, + "loss": 6.0348, + "step": 1672 + }, + { + "epoch": 0.15610711952971915, + "grad_norm": 1.1867621535633623, + "learning_rate": 0.00025095, + "loss": 5.8093, + "step": 1673 + }, + { + "epoch": 0.15620042922459643, + "grad_norm": 1.1189725993519666, + "learning_rate": 0.0002511, + "loss": 5.681, + "step": 1674 + }, + { + "epoch": 0.15629373891947373, + "grad_norm": 1.3847845588472654, + "learning_rate": 0.00025125, + "loss": 6.0398, + "step": 1675 + }, + { + "epoch": 0.15638704861435104, + "grad_norm": 1.1478774637301508, + "learning_rate": 0.0002514, + "loss": 6.4159, + "step": 1676 + }, + { + "epoch": 0.15648035830922832, + "grad_norm": 1.3247295630132225, + "learning_rate": 0.00025154999999999996, + "loss": 6.3613, + "step": 1677 + }, + { + "epoch": 0.15657366800410563, + "grad_norm": 1.3316523738552228, + "learning_rate": 0.0002517, + "loss": 6.5265, + "step": 1678 + }, + { + "epoch": 0.1566669776989829, + "grad_norm": 1.438567142798783, + "learning_rate": 0.00025184999999999997, + "loss": 6.3096, + "step": 1679 + }, + { + "epoch": 0.15676028739386022, + "grad_norm": 1.3610048354001028, + "learning_rate": 0.00025199999999999995, + "loss": 6.1024, + "step": 1680 + }, + { + "epoch": 0.15685359708873753, + "grad_norm": 1.3617079569307597, + "learning_rate": 0.00025215, + "loss": 6.0185, + "step": 1681 + }, + { + "epoch": 0.1569469067836148, + "grad_norm": 1.111273798517925, + "learning_rate": 0.00025229999999999995, + "loss": 6.2766, + "step": 1682 + }, + { + "epoch": 0.15704021647849212, + "grad_norm": 1.220552296887176, + "learning_rate": 0.00025245, + "loss": 6.2633, + "step": 1683 + }, + { + "epoch": 0.15713352617336943, + "grad_norm": 1.2778862591496487, + "learning_rate": 0.00025259999999999996, + "loss": 5.7476, + "step": 1684 + }, + { + "epoch": 0.1572268358682467, + "grad_norm": 1.225167732794752, + "learning_rate": 0.00025275, + "loss": 6.1371, + "step": 1685 + }, + { + "epoch": 0.15732014556312401, + "grad_norm": 1.083187388679878, + "learning_rate": 0.00025289999999999997, + "loss": 5.7838, + "step": 1686 + }, + { + "epoch": 0.1574134552580013, + "grad_norm": 1.1843164463905735, + "learning_rate": 0.00025305, + "loss": 6.2005, + "step": 1687 + }, + { + "epoch": 0.1575067649528786, + "grad_norm": 1.3590341198997902, + "learning_rate": 0.0002532, + "loss": 6.027, + "step": 1688 + }, + { + "epoch": 0.1576000746477559, + "grad_norm": 1.2086573182819997, + "learning_rate": 0.00025335, + "loss": 6.16, + "step": 1689 + }, + { + "epoch": 0.1576933843426332, + "grad_norm": 1.2993948667771182, + "learning_rate": 0.0002535, + "loss": 6.3075, + "step": 1690 + }, + { + "epoch": 0.1577866940375105, + "grad_norm": 1.576314289962441, + "learning_rate": 0.00025365, + "loss": 5.8712, + "step": 1691 + }, + { + "epoch": 0.1578800037323878, + "grad_norm": 1.4959487646059668, + "learning_rate": 0.0002538, + "loss": 5.8348, + "step": 1692 + }, + { + "epoch": 0.1579733134272651, + "grad_norm": 1.1405281402318694, + "learning_rate": 0.00025394999999999997, + "loss": 5.8301, + "step": 1693 + }, + { + "epoch": 0.1580666231221424, + "grad_norm": 1.564780047145138, + "learning_rate": 0.0002541, + "loss": 6.1415, + "step": 1694 + }, + { + "epoch": 0.15815993281701968, + "grad_norm": 2.1417921751429483, + "learning_rate": 0.00025425, + "loss": 5.7133, + "step": 1695 + }, + { + "epoch": 0.15825324251189699, + "grad_norm": 1.2560322527524517, + "learning_rate": 0.00025439999999999995, + "loss": 6.4917, + "step": 1696 + }, + { + "epoch": 0.1583465522067743, + "grad_norm": 1.3978602951882706, + "learning_rate": 0.00025455, + "loss": 5.8582, + "step": 1697 + }, + { + "epoch": 0.15843986190165157, + "grad_norm": 1.3235843282662627, + "learning_rate": 0.00025469999999999996, + "loss": 6.3091, + "step": 1698 + }, + { + "epoch": 0.15853317159652888, + "grad_norm": 1.1750605499378375, + "learning_rate": 0.00025485, + "loss": 6.2457, + "step": 1699 + }, + { + "epoch": 0.1586264812914062, + "grad_norm": 1.7546897514249817, + "learning_rate": 0.00025499999999999996, + "loss": 6.1142, + "step": 1700 + }, + { + "epoch": 0.15871979098628347, + "grad_norm": 1.423550338696953, + "learning_rate": 0.00025515, + "loss": 6.0562, + "step": 1701 + }, + { + "epoch": 0.15881310068116078, + "grad_norm": 1.3575407561066177, + "learning_rate": 0.00025529999999999997, + "loss": 5.788, + "step": 1702 + }, + { + "epoch": 0.15890641037603806, + "grad_norm": 1.3057421215517833, + "learning_rate": 0.00025545, + "loss": 5.4749, + "step": 1703 + }, + { + "epoch": 0.15899972007091537, + "grad_norm": 1.519412833246476, + "learning_rate": 0.0002556, + "loss": 6.0435, + "step": 1704 + }, + { + "epoch": 0.15909302976579268, + "grad_norm": 1.3319835729667449, + "learning_rate": 0.00025575, + "loss": 5.8022, + "step": 1705 + }, + { + "epoch": 0.15918633946066996, + "grad_norm": 1.3933253026772874, + "learning_rate": 0.0002559, + "loss": 6.3166, + "step": 1706 + }, + { + "epoch": 0.15927964915554726, + "grad_norm": 1.3511784857333997, + "learning_rate": 0.00025604999999999996, + "loss": 5.7936, + "step": 1707 + }, + { + "epoch": 0.15937295885042455, + "grad_norm": 1.4181010443381639, + "learning_rate": 0.0002562, + "loss": 5.9593, + "step": 1708 + }, + { + "epoch": 0.15946626854530185, + "grad_norm": 1.3021360830235875, + "learning_rate": 0.00025634999999999997, + "loss": 5.5561, + "step": 1709 + }, + { + "epoch": 0.15955957824017916, + "grad_norm": 2.472650086715092, + "learning_rate": 0.00025649999999999995, + "loss": 6.3638, + "step": 1710 + }, + { + "epoch": 0.15965288793505644, + "grad_norm": 1.1977712243607843, + "learning_rate": 0.00025665, + "loss": 6.0247, + "step": 1711 + }, + { + "epoch": 0.15974619762993375, + "grad_norm": 1.2598516567097688, + "learning_rate": 0.00025679999999999995, + "loss": 5.6802, + "step": 1712 + }, + { + "epoch": 0.15983950732481106, + "grad_norm": 0.9983143376734399, + "learning_rate": 0.00025695, + "loss": 6.1802, + "step": 1713 + }, + { + "epoch": 0.15993281701968834, + "grad_norm": 1.1192087573690273, + "learning_rate": 0.00025709999999999996, + "loss": 5.7139, + "step": 1714 + }, + { + "epoch": 0.16002612671456565, + "grad_norm": 1.2193173473576875, + "learning_rate": 0.00025725, + "loss": 6.1288, + "step": 1715 + }, + { + "epoch": 0.16011943640944293, + "grad_norm": 1.2852427770530066, + "learning_rate": 0.00025739999999999997, + "loss": 6.0623, + "step": 1716 + }, + { + "epoch": 0.16021274610432024, + "grad_norm": 1.1880657999146909, + "learning_rate": 0.00025755, + "loss": 6.0908, + "step": 1717 + }, + { + "epoch": 0.16030605579919754, + "grad_norm": 1.1944568885086069, + "learning_rate": 0.0002577, + "loss": 6.1735, + "step": 1718 + }, + { + "epoch": 0.16039936549407482, + "grad_norm": 1.2348265081713863, + "learning_rate": 0.00025785, + "loss": 6.1942, + "step": 1719 + }, + { + "epoch": 0.16049267518895213, + "grad_norm": 1.4438522432319791, + "learning_rate": 0.000258, + "loss": 6.2112, + "step": 1720 + }, + { + "epoch": 0.16058598488382944, + "grad_norm": 1.3118466868075458, + "learning_rate": 0.00025815, + "loss": 5.8064, + "step": 1721 + }, + { + "epoch": 0.16067929457870672, + "grad_norm": 1.1204570227825819, + "learning_rate": 0.0002583, + "loss": 5.9375, + "step": 1722 + }, + { + "epoch": 0.16077260427358403, + "grad_norm": 1.3135782809870664, + "learning_rate": 0.00025844999999999997, + "loss": 6.1393, + "step": 1723 + }, + { + "epoch": 0.1608659139684613, + "grad_norm": 1.3381976875810622, + "learning_rate": 0.0002586, + "loss": 6.3053, + "step": 1724 + }, + { + "epoch": 0.16095922366333862, + "grad_norm": 1.1374155107387116, + "learning_rate": 0.00025875, + "loss": 5.4195, + "step": 1725 + }, + { + "epoch": 0.16105253335821593, + "grad_norm": 1.1415973619608224, + "learning_rate": 0.00025889999999999995, + "loss": 6.1436, + "step": 1726 + }, + { + "epoch": 0.1611458430530932, + "grad_norm": 1.2269215304307357, + "learning_rate": 0.00025905, + "loss": 6.1884, + "step": 1727 + }, + { + "epoch": 0.16123915274797052, + "grad_norm": 1.1246424550989342, + "learning_rate": 0.00025919999999999996, + "loss": 5.9756, + "step": 1728 + }, + { + "epoch": 0.16133246244284782, + "grad_norm": 1.5070503607165753, + "learning_rate": 0.00025935, + "loss": 5.8369, + "step": 1729 + }, + { + "epoch": 0.1614257721377251, + "grad_norm": 1.2163596165320902, + "learning_rate": 0.00025949999999999997, + "loss": 5.7697, + "step": 1730 + }, + { + "epoch": 0.1615190818326024, + "grad_norm": 1.116096212595931, + "learning_rate": 0.00025965, + "loss": 5.9296, + "step": 1731 + }, + { + "epoch": 0.1616123915274797, + "grad_norm": 1.3359783978990345, + "learning_rate": 0.00025979999999999997, + "loss": 6.2802, + "step": 1732 + }, + { + "epoch": 0.161705701222357, + "grad_norm": 1.104888869168171, + "learning_rate": 0.00025995, + "loss": 5.9542, + "step": 1733 + }, + { + "epoch": 0.1617990109172343, + "grad_norm": 0.9923404911124063, + "learning_rate": 0.0002601, + "loss": 6.2732, + "step": 1734 + }, + { + "epoch": 0.1618923206121116, + "grad_norm": 2.7857752359071255, + "learning_rate": 0.00026025, + "loss": 5.5544, + "step": 1735 + }, + { + "epoch": 0.1619856303069889, + "grad_norm": 1.7223006282563726, + "learning_rate": 0.0002604, + "loss": 5.8061, + "step": 1736 + }, + { + "epoch": 0.1620789400018662, + "grad_norm": 2.130052382169402, + "learning_rate": 0.00026055, + "loss": 6.0788, + "step": 1737 + }, + { + "epoch": 0.1621722496967435, + "grad_norm": 2.133592239073034, + "learning_rate": 0.0002607, + "loss": 5.9631, + "step": 1738 + }, + { + "epoch": 0.1622655593916208, + "grad_norm": 1.115975692044565, + "learning_rate": 0.00026084999999999997, + "loss": 6.1491, + "step": 1739 + }, + { + "epoch": 0.16235886908649808, + "grad_norm": 1.04420015400804, + "learning_rate": 0.000261, + "loss": 5.9384, + "step": 1740 + }, + { + "epoch": 0.16245217878137538, + "grad_norm": 1.2346460240743966, + "learning_rate": 0.00026115, + "loss": 5.9648, + "step": 1741 + }, + { + "epoch": 0.1625454884762527, + "grad_norm": 1.249773428440131, + "learning_rate": 0.00026129999999999995, + "loss": 6.0073, + "step": 1742 + }, + { + "epoch": 0.16263879817112997, + "grad_norm": 1.431651413537032, + "learning_rate": 0.00026145, + "loss": 6.1834, + "step": 1743 + }, + { + "epoch": 0.16273210786600728, + "grad_norm": 1.0772775159296573, + "learning_rate": 0.00026159999999999996, + "loss": 5.9992, + "step": 1744 + }, + { + "epoch": 0.1628254175608846, + "grad_norm": 1.2216815808932737, + "learning_rate": 0.00026175, + "loss": 5.9745, + "step": 1745 + }, + { + "epoch": 0.16291872725576187, + "grad_norm": 1.225057619318257, + "learning_rate": 0.00026189999999999997, + "loss": 6.0734, + "step": 1746 + }, + { + "epoch": 0.16301203695063918, + "grad_norm": 1.0943552328459438, + "learning_rate": 0.00026205, + "loss": 6.0468, + "step": 1747 + }, + { + "epoch": 0.16310534664551646, + "grad_norm": 1.8051799266018034, + "learning_rate": 0.0002622, + "loss": 6.1872, + "step": 1748 + }, + { + "epoch": 0.16319865634039377, + "grad_norm": 1.7176298492434536, + "learning_rate": 0.00026235, + "loss": 6.1116, + "step": 1749 + }, + { + "epoch": 0.16329196603527107, + "grad_norm": 1.2439737390748866, + "learning_rate": 0.0002625, + "loss": 5.6996, + "step": 1750 + }, + { + "epoch": 0.16338527573014835, + "grad_norm": 1.4031604420856267, + "learning_rate": 0.00026264999999999996, + "loss": 6.173, + "step": 1751 + }, + { + "epoch": 0.16347858542502566, + "grad_norm": 1.1565884466873473, + "learning_rate": 0.0002628, + "loss": 6.1133, + "step": 1752 + }, + { + "epoch": 0.16357189511990297, + "grad_norm": 1.179672963854827, + "learning_rate": 0.00026294999999999997, + "loss": 6.2252, + "step": 1753 + }, + { + "epoch": 0.16366520481478025, + "grad_norm": 1.0762371471250034, + "learning_rate": 0.0002631, + "loss": 5.8274, + "step": 1754 + }, + { + "epoch": 0.16375851450965756, + "grad_norm": 1.4879099681440295, + "learning_rate": 0.00026325, + "loss": 6.2267, + "step": 1755 + }, + { + "epoch": 0.16385182420453484, + "grad_norm": 1.036506297652239, + "learning_rate": 0.00026339999999999995, + "loss": 6.0217, + "step": 1756 + }, + { + "epoch": 0.16394513389941215, + "grad_norm": 1.045385565547776, + "learning_rate": 0.00026355, + "loss": 5.771, + "step": 1757 + }, + { + "epoch": 0.16403844359428946, + "grad_norm": 1.2647493857624137, + "learning_rate": 0.00026369999999999996, + "loss": 5.6243, + "step": 1758 + }, + { + "epoch": 0.16413175328916674, + "grad_norm": 1.0134880428464317, + "learning_rate": 0.00026384999999999994, + "loss": 5.6167, + "step": 1759 + }, + { + "epoch": 0.16422506298404405, + "grad_norm": 4.97079715457025, + "learning_rate": 0.00026399999999999997, + "loss": 5.6719, + "step": 1760 + }, + { + "epoch": 0.16431837267892133, + "grad_norm": 1.1893926290211432, + "learning_rate": 0.00026414999999999994, + "loss": 5.9814, + "step": 1761 + }, + { + "epoch": 0.16441168237379863, + "grad_norm": 1.1185198353536305, + "learning_rate": 0.0002643, + "loss": 5.4905, + "step": 1762 + }, + { + "epoch": 0.16450499206867594, + "grad_norm": 1.3855382976183825, + "learning_rate": 0.00026444999999999995, + "loss": 6.4502, + "step": 1763 + }, + { + "epoch": 0.16459830176355322, + "grad_norm": 2.508450303296694, + "learning_rate": 0.0002646, + "loss": 5.8257, + "step": 1764 + }, + { + "epoch": 0.16469161145843053, + "grad_norm": 1.5162168439514707, + "learning_rate": 0.00026474999999999996, + "loss": 6.2122, + "step": 1765 + }, + { + "epoch": 0.16478492115330784, + "grad_norm": 1.2553932715222689, + "learning_rate": 0.0002649, + "loss": 6.1079, + "step": 1766 + }, + { + "epoch": 0.16487823084818512, + "grad_norm": 1.1616703536783126, + "learning_rate": 0.00026504999999999996, + "loss": 5.8059, + "step": 1767 + }, + { + "epoch": 0.16497154054306243, + "grad_norm": 1.3691665679087996, + "learning_rate": 0.0002652, + "loss": 5.9612, + "step": 1768 + }, + { + "epoch": 0.1650648502379397, + "grad_norm": 1.1027360232171155, + "learning_rate": 0.00026534999999999997, + "loss": 5.9194, + "step": 1769 + }, + { + "epoch": 0.16515815993281702, + "grad_norm": 1.3429965811233582, + "learning_rate": 0.0002655, + "loss": 6.1991, + "step": 1770 + }, + { + "epoch": 0.16525146962769433, + "grad_norm": 0.990141727326018, + "learning_rate": 0.00026565, + "loss": 5.9947, + "step": 1771 + }, + { + "epoch": 0.1653447793225716, + "grad_norm": 1.4297058587713243, + "learning_rate": 0.00026579999999999996, + "loss": 6.1396, + "step": 1772 + }, + { + "epoch": 0.1654380890174489, + "grad_norm": 1.4331002335595773, + "learning_rate": 0.00026595, + "loss": 5.7898, + "step": 1773 + }, + { + "epoch": 0.16553139871232622, + "grad_norm": 1.5112041511420067, + "learning_rate": 0.00026609999999999996, + "loss": 5.8482, + "step": 1774 + }, + { + "epoch": 0.1656247084072035, + "grad_norm": 1.6320027980222664, + "learning_rate": 0.00026624999999999994, + "loss": 6.2066, + "step": 1775 + }, + { + "epoch": 0.1657180181020808, + "grad_norm": 1.1390983850952052, + "learning_rate": 0.00026639999999999997, + "loss": 5.8264, + "step": 1776 + }, + { + "epoch": 0.1658113277969581, + "grad_norm": 1.1873099753145782, + "learning_rate": 0.00026654999999999995, + "loss": 6.3797, + "step": 1777 + }, + { + "epoch": 0.1659046374918354, + "grad_norm": 1.6960783434423408, + "learning_rate": 0.0002667, + "loss": 5.934, + "step": 1778 + }, + { + "epoch": 0.1659979471867127, + "grad_norm": 1.3350364932717618, + "learning_rate": 0.00026684999999999995, + "loss": 5.8002, + "step": 1779 + }, + { + "epoch": 0.16609125688159, + "grad_norm": 1.074700286357636, + "learning_rate": 0.000267, + "loss": 6.185, + "step": 1780 + }, + { + "epoch": 0.1661845665764673, + "grad_norm": 2.503102414909486, + "learning_rate": 0.00026714999999999996, + "loss": 6.068, + "step": 1781 + }, + { + "epoch": 0.1662778762713446, + "grad_norm": 1.6636190152529986, + "learning_rate": 0.0002673, + "loss": 5.9682, + "step": 1782 + }, + { + "epoch": 0.16637118596622189, + "grad_norm": 1.4606476167664677, + "learning_rate": 0.00026744999999999997, + "loss": 6.1276, + "step": 1783 + }, + { + "epoch": 0.1664644956610992, + "grad_norm": 3.2232720959766588, + "learning_rate": 0.0002676, + "loss": 5.5454, + "step": 1784 + }, + { + "epoch": 0.16655780535597647, + "grad_norm": 1.865073395568299, + "learning_rate": 0.00026775, + "loss": 6.0339, + "step": 1785 + }, + { + "epoch": 0.16665111505085378, + "grad_norm": 1.9615187822462081, + "learning_rate": 0.0002679, + "loss": 6.194, + "step": 1786 + }, + { + "epoch": 0.1667444247457311, + "grad_norm": 1.1022314708780765, + "learning_rate": 0.00026805, + "loss": 6.069, + "step": 1787 + }, + { + "epoch": 0.16683773444060837, + "grad_norm": 1.521519103311825, + "learning_rate": 0.00026819999999999996, + "loss": 6.2218, + "step": 1788 + }, + { + "epoch": 0.16693104413548568, + "grad_norm": 2.511825380550982, + "learning_rate": 0.00026835, + "loss": 6.0035, + "step": 1789 + }, + { + "epoch": 0.167024353830363, + "grad_norm": 1.144568561569331, + "learning_rate": 0.00026849999999999997, + "loss": 6.0772, + "step": 1790 + }, + { + "epoch": 0.16711766352524027, + "grad_norm": 15.354580417021632, + "learning_rate": 0.00026864999999999994, + "loss": 7.8819, + "step": 1791 + }, + { + "epoch": 0.16721097322011758, + "grad_norm": 1.9480943817650065, + "learning_rate": 0.0002688, + "loss": 5.8618, + "step": 1792 + }, + { + "epoch": 0.16730428291499486, + "grad_norm": 1.953210690230277, + "learning_rate": 0.00026894999999999995, + "loss": 5.9765, + "step": 1793 + }, + { + "epoch": 0.16739759260987216, + "grad_norm": 1.767087332561718, + "learning_rate": 0.0002691, + "loss": 6.4301, + "step": 1794 + }, + { + "epoch": 0.16749090230474947, + "grad_norm": 1.4384196236097433, + "learning_rate": 0.00026924999999999996, + "loss": 5.9699, + "step": 1795 + }, + { + "epoch": 0.16758421199962675, + "grad_norm": 1.2593479298073766, + "learning_rate": 0.0002694, + "loss": 6.4438, + "step": 1796 + }, + { + "epoch": 0.16767752169450406, + "grad_norm": 2.52653119336251, + "learning_rate": 0.00026954999999999997, + "loss": 6.314, + "step": 1797 + }, + { + "epoch": 0.16777083138938137, + "grad_norm": 2.0998953485569167, + "learning_rate": 0.0002697, + "loss": 6.1248, + "step": 1798 + }, + { + "epoch": 0.16786414108425865, + "grad_norm": 2.892366945297402, + "learning_rate": 0.00026984999999999997, + "loss": 5.9535, + "step": 1799 + }, + { + "epoch": 0.16795745077913596, + "grad_norm": 1.459575037058231, + "learning_rate": 0.00027, + "loss": 5.9808, + "step": 1800 + }, + { + "epoch": 0.16805076047401324, + "grad_norm": 1.4281081742084467, + "learning_rate": 0.00027015, + "loss": 6.2695, + "step": 1801 + }, + { + "epoch": 0.16814407016889055, + "grad_norm": 1.5964856486974988, + "learning_rate": 0.00027029999999999996, + "loss": 6.2793, + "step": 1802 + }, + { + "epoch": 0.16823737986376786, + "grad_norm": 1.8141466281468503, + "learning_rate": 0.00027045, + "loss": 6.0638, + "step": 1803 + }, + { + "epoch": 0.16833068955864514, + "grad_norm": 1.2739354170954909, + "learning_rate": 0.00027059999999999996, + "loss": 6.1758, + "step": 1804 + }, + { + "epoch": 0.16842399925352244, + "grad_norm": 1.3963434373932788, + "learning_rate": 0.00027074999999999994, + "loss": 5.6053, + "step": 1805 + }, + { + "epoch": 0.16851730894839975, + "grad_norm": 1.3878602181488342, + "learning_rate": 0.00027089999999999997, + "loss": 6.1288, + "step": 1806 + }, + { + "epoch": 0.16861061864327703, + "grad_norm": 1.6705801160428149, + "learning_rate": 0.00027104999999999995, + "loss": 6.0766, + "step": 1807 + }, + { + "epoch": 0.16870392833815434, + "grad_norm": 1.1359337430592205, + "learning_rate": 0.0002712, + "loss": 5.7218, + "step": 1808 + }, + { + "epoch": 0.16879723803303162, + "grad_norm": 1.4683665499223697, + "learning_rate": 0.00027134999999999995, + "loss": 6.4435, + "step": 1809 + }, + { + "epoch": 0.16889054772790893, + "grad_norm": 1.8699528634588025, + "learning_rate": 0.0002715, + "loss": 6.3143, + "step": 1810 + }, + { + "epoch": 0.16898385742278624, + "grad_norm": 1.2517679535555053, + "learning_rate": 0.00027164999999999996, + "loss": 6.245, + "step": 1811 + }, + { + "epoch": 0.16907716711766352, + "grad_norm": 1.2882175760926566, + "learning_rate": 0.0002718, + "loss": 5.8323, + "step": 1812 + }, + { + "epoch": 0.16917047681254083, + "grad_norm": 1.1377288171862359, + "learning_rate": 0.00027194999999999997, + "loss": 5.9521, + "step": 1813 + }, + { + "epoch": 0.1692637865074181, + "grad_norm": 1.7412714068531434, + "learning_rate": 0.0002721, + "loss": 5.9716, + "step": 1814 + }, + { + "epoch": 0.16935709620229542, + "grad_norm": 1.4049128804268423, + "learning_rate": 0.00027225, + "loss": 6.3987, + "step": 1815 + }, + { + "epoch": 0.16945040589717272, + "grad_norm": 1.5261563165757683, + "learning_rate": 0.0002724, + "loss": 5.5642, + "step": 1816 + }, + { + "epoch": 0.16954371559205, + "grad_norm": 1.1256580277909538, + "learning_rate": 0.00027255, + "loss": 5.3559, + "step": 1817 + }, + { + "epoch": 0.1696370252869273, + "grad_norm": 2.576346025832345, + "learning_rate": 0.00027269999999999996, + "loss": 6.2721, + "step": 1818 + }, + { + "epoch": 0.16973033498180462, + "grad_norm": 1.0918299589408456, + "learning_rate": 0.00027285, + "loss": 6.3157, + "step": 1819 + }, + { + "epoch": 0.1698236446766819, + "grad_norm": 1.9936465317502254, + "learning_rate": 0.00027299999999999997, + "loss": 6.2595, + "step": 1820 + }, + { + "epoch": 0.1699169543715592, + "grad_norm": 1.343670355004733, + "learning_rate": 0.00027314999999999994, + "loss": 6.1148, + "step": 1821 + }, + { + "epoch": 0.1700102640664365, + "grad_norm": 1.2071236720185754, + "learning_rate": 0.0002733, + "loss": 5.6095, + "step": 1822 + }, + { + "epoch": 0.1701035737613138, + "grad_norm": 1.4089926818493697, + "learning_rate": 0.00027344999999999995, + "loss": 6.2476, + "step": 1823 + }, + { + "epoch": 0.1701968834561911, + "grad_norm": 1.1147308134991054, + "learning_rate": 0.0002736, + "loss": 6.0824, + "step": 1824 + }, + { + "epoch": 0.1702901931510684, + "grad_norm": 3.077658417026185, + "learning_rate": 0.00027374999999999996, + "loss": 6.274, + "step": 1825 + }, + { + "epoch": 0.1703835028459457, + "grad_norm": 1.1443272630902848, + "learning_rate": 0.0002739, + "loss": 6.0803, + "step": 1826 + }, + { + "epoch": 0.170476812540823, + "grad_norm": 1.0003744629836113, + "learning_rate": 0.00027404999999999997, + "loss": 6.1463, + "step": 1827 + }, + { + "epoch": 0.17057012223570028, + "grad_norm": 1.1450828789388534, + "learning_rate": 0.0002742, + "loss": 6.0623, + "step": 1828 + }, + { + "epoch": 0.1706634319305776, + "grad_norm": 1.4250658686462119, + "learning_rate": 0.00027435, + "loss": 5.9946, + "step": 1829 + }, + { + "epoch": 0.17075674162545487, + "grad_norm": 1.3193146447016917, + "learning_rate": 0.0002745, + "loss": 6.0466, + "step": 1830 + }, + { + "epoch": 0.17085005132033218, + "grad_norm": 1.4390360507582285, + "learning_rate": 0.00027465, + "loss": 6.1784, + "step": 1831 + }, + { + "epoch": 0.1709433610152095, + "grad_norm": 1.183413033225423, + "learning_rate": 0.0002748, + "loss": 6.2447, + "step": 1832 + }, + { + "epoch": 0.17103667071008677, + "grad_norm": 1.3031805981688862, + "learning_rate": 0.00027495, + "loss": 5.9383, + "step": 1833 + }, + { + "epoch": 0.17112998040496408, + "grad_norm": 1.4855058136130543, + "learning_rate": 0.00027509999999999996, + "loss": 5.9877, + "step": 1834 + }, + { + "epoch": 0.17122329009984139, + "grad_norm": 1.1500429232826193, + "learning_rate": 0.00027525, + "loss": 6.0302, + "step": 1835 + }, + { + "epoch": 0.17131659979471867, + "grad_norm": 1.1263519283523984, + "learning_rate": 0.00027539999999999997, + "loss": 6.3632, + "step": 1836 + }, + { + "epoch": 0.17140990948959597, + "grad_norm": 1.9337242025183716, + "learning_rate": 0.00027554999999999995, + "loss": 6.0351, + "step": 1837 + }, + { + "epoch": 0.17150321918447325, + "grad_norm": 1.565728490299732, + "learning_rate": 0.0002757, + "loss": 6.2602, + "step": 1838 + }, + { + "epoch": 0.17159652887935056, + "grad_norm": 1.4290090855065367, + "learning_rate": 0.00027584999999999996, + "loss": 5.7152, + "step": 1839 + }, + { + "epoch": 0.17168983857422787, + "grad_norm": 1.121179202982176, + "learning_rate": 0.000276, + "loss": 5.9747, + "step": 1840 + }, + { + "epoch": 0.17178314826910515, + "grad_norm": 1.4060217356300864, + "learning_rate": 0.00027614999999999996, + "loss": 5.8772, + "step": 1841 + }, + { + "epoch": 0.17187645796398246, + "grad_norm": 1.070737531163851, + "learning_rate": 0.0002763, + "loss": 6.0949, + "step": 1842 + }, + { + "epoch": 0.17196976765885977, + "grad_norm": 1.0648929803746594, + "learning_rate": 0.00027644999999999997, + "loss": 5.8195, + "step": 1843 + }, + { + "epoch": 0.17206307735373705, + "grad_norm": 1.2075463055981133, + "learning_rate": 0.0002766, + "loss": 5.7342, + "step": 1844 + }, + { + "epoch": 0.17215638704861436, + "grad_norm": 1.091452333106686, + "learning_rate": 0.00027675, + "loss": 6.1203, + "step": 1845 + }, + { + "epoch": 0.17224969674349164, + "grad_norm": 1.1508841985575928, + "learning_rate": 0.0002769, + "loss": 5.9852, + "step": 1846 + }, + { + "epoch": 0.17234300643836895, + "grad_norm": 1.2990829762100484, + "learning_rate": 0.00027705, + "loss": 5.7137, + "step": 1847 + }, + { + "epoch": 0.17243631613324625, + "grad_norm": 1.0560094209210393, + "learning_rate": 0.0002772, + "loss": 6.1564, + "step": 1848 + }, + { + "epoch": 0.17252962582812353, + "grad_norm": 1.3109891272893768, + "learning_rate": 0.00027735, + "loss": 6.1512, + "step": 1849 + }, + { + "epoch": 0.17262293552300084, + "grad_norm": 1.4530193929788198, + "learning_rate": 0.00027749999999999997, + "loss": 5.83, + "step": 1850 + }, + { + "epoch": 0.17271624521787815, + "grad_norm": 1.0665403026932405, + "learning_rate": 0.00027764999999999995, + "loss": 5.7521, + "step": 1851 + }, + { + "epoch": 0.17280955491275543, + "grad_norm": 1.0131611959312334, + "learning_rate": 0.0002778, + "loss": 6.0855, + "step": 1852 + }, + { + "epoch": 0.17290286460763274, + "grad_norm": 1.1867275059121296, + "learning_rate": 0.00027794999999999995, + "loss": 5.8117, + "step": 1853 + }, + { + "epoch": 0.17299617430251002, + "grad_norm": 1.5120063058009936, + "learning_rate": 0.0002781, + "loss": 5.5373, + "step": 1854 + }, + { + "epoch": 0.17308948399738733, + "grad_norm": 1.0917181291899134, + "learning_rate": 0.00027824999999999996, + "loss": 5.7177, + "step": 1855 + }, + { + "epoch": 0.17318279369226464, + "grad_norm": 1.2890890998742186, + "learning_rate": 0.0002784, + "loss": 5.8458, + "step": 1856 + }, + { + "epoch": 0.17327610338714192, + "grad_norm": 1.114549694108299, + "learning_rate": 0.00027854999999999997, + "loss": 6.4029, + "step": 1857 + }, + { + "epoch": 0.17336941308201922, + "grad_norm": 1.4041966375847035, + "learning_rate": 0.0002787, + "loss": 5.8457, + "step": 1858 + }, + { + "epoch": 0.17346272277689653, + "grad_norm": 1.0200930333989542, + "learning_rate": 0.00027885, + "loss": 5.8298, + "step": 1859 + }, + { + "epoch": 0.1735560324717738, + "grad_norm": 1.0570059623220087, + "learning_rate": 0.000279, + "loss": 6.1923, + "step": 1860 + }, + { + "epoch": 0.17364934216665112, + "grad_norm": 1.1479988652340907, + "learning_rate": 0.00027915, + "loss": 5.7747, + "step": 1861 + }, + { + "epoch": 0.1737426518615284, + "grad_norm": 1.478241821211861, + "learning_rate": 0.0002793, + "loss": 6.0629, + "step": 1862 + }, + { + "epoch": 0.1738359615564057, + "grad_norm": 1.1729917968873607, + "learning_rate": 0.00027945, + "loss": 6.3, + "step": 1863 + }, + { + "epoch": 0.17392927125128302, + "grad_norm": 1.0937047739549457, + "learning_rate": 0.00027959999999999997, + "loss": 5.9848, + "step": 1864 + }, + { + "epoch": 0.1740225809461603, + "grad_norm": 1.1431721207353447, + "learning_rate": 0.00027975, + "loss": 6.2741, + "step": 1865 + }, + { + "epoch": 0.1741158906410376, + "grad_norm": 1.1649911709302943, + "learning_rate": 0.0002799, + "loss": 6.1331, + "step": 1866 + }, + { + "epoch": 0.1742092003359149, + "grad_norm": 1.228421762157625, + "learning_rate": 0.00028004999999999995, + "loss": 5.7912, + "step": 1867 + }, + { + "epoch": 0.1743025100307922, + "grad_norm": 1.2004735009132848, + "learning_rate": 0.0002802, + "loss": 5.8625, + "step": 1868 + }, + { + "epoch": 0.1743958197256695, + "grad_norm": 1.2991846914907255, + "learning_rate": 0.00028034999999999996, + "loss": 5.8469, + "step": 1869 + }, + { + "epoch": 0.17448912942054678, + "grad_norm": 1.1562896098115414, + "learning_rate": 0.0002805, + "loss": 6.3442, + "step": 1870 + }, + { + "epoch": 0.1745824391154241, + "grad_norm": 1.0996656457819103, + "learning_rate": 0.00028064999999999996, + "loss": 6.0243, + "step": 1871 + }, + { + "epoch": 0.1746757488103014, + "grad_norm": 1.2529164472014687, + "learning_rate": 0.0002808, + "loss": 6.1558, + "step": 1872 + }, + { + "epoch": 0.17476905850517868, + "grad_norm": 1.2797446426516759, + "learning_rate": 0.00028094999999999997, + "loss": 5.6098, + "step": 1873 + }, + { + "epoch": 0.174862368200056, + "grad_norm": 1.3050358908706228, + "learning_rate": 0.0002811, + "loss": 6.0697, + "step": 1874 + }, + { + "epoch": 0.17495567789493327, + "grad_norm": 1.096818317115401, + "learning_rate": 0.00028125, + "loss": 6.1255, + "step": 1875 + }, + { + "epoch": 0.17504898758981058, + "grad_norm": 1.9511033541368037, + "learning_rate": 0.00028139999999999996, + "loss": 5.6482, + "step": 1876 + }, + { + "epoch": 0.1751422972846879, + "grad_norm": 1.1755802008727283, + "learning_rate": 0.00028155, + "loss": 5.8379, + "step": 1877 + }, + { + "epoch": 0.17523560697956517, + "grad_norm": 1.1150862536417219, + "learning_rate": 0.00028169999999999996, + "loss": 6.0311, + "step": 1878 + }, + { + "epoch": 0.17532891667444248, + "grad_norm": 1.1154629061674666, + "learning_rate": 0.00028185, + "loss": 6.1312, + "step": 1879 + }, + { + "epoch": 0.17542222636931978, + "grad_norm": 1.6147716492355306, + "learning_rate": 0.00028199999999999997, + "loss": 5.8431, + "step": 1880 + }, + { + "epoch": 0.17551553606419706, + "grad_norm": 1.1971734337315332, + "learning_rate": 0.00028215, + "loss": 5.817, + "step": 1881 + }, + { + "epoch": 0.17560884575907437, + "grad_norm": 1.272084690279385, + "learning_rate": 0.0002823, + "loss": 6.0112, + "step": 1882 + }, + { + "epoch": 0.17570215545395165, + "grad_norm": 1.346812777236933, + "learning_rate": 0.00028244999999999995, + "loss": 5.853, + "step": 1883 + }, + { + "epoch": 0.17579546514882896, + "grad_norm": 1.256644711746057, + "learning_rate": 0.0002826, + "loss": 6.26, + "step": 1884 + }, + { + "epoch": 0.17588877484370627, + "grad_norm": 1.866496727759012, + "learning_rate": 0.00028274999999999996, + "loss": 5.7321, + "step": 1885 + }, + { + "epoch": 0.17598208453858355, + "grad_norm": 1.1111784108832687, + "learning_rate": 0.00028289999999999994, + "loss": 6.0959, + "step": 1886 + }, + { + "epoch": 0.17607539423346086, + "grad_norm": 1.2054902007475483, + "learning_rate": 0.00028304999999999997, + "loss": 5.8909, + "step": 1887 + }, + { + "epoch": 0.17616870392833817, + "grad_norm": 1.7058460118158796, + "learning_rate": 0.00028319999999999994, + "loss": 6.352, + "step": 1888 + }, + { + "epoch": 0.17626201362321545, + "grad_norm": 2.88790487680784, + "learning_rate": 0.00028335, + "loss": 5.9166, + "step": 1889 + }, + { + "epoch": 0.17635532331809275, + "grad_norm": 1.1763040309478483, + "learning_rate": 0.00028349999999999995, + "loss": 6.1202, + "step": 1890 + }, + { + "epoch": 0.17644863301297004, + "grad_norm": 2.663837014727327, + "learning_rate": 0.00028365, + "loss": 5.3209, + "step": 1891 + }, + { + "epoch": 0.17654194270784734, + "grad_norm": 1.0547337612925138, + "learning_rate": 0.00028379999999999996, + "loss": 6.2033, + "step": 1892 + }, + { + "epoch": 0.17663525240272465, + "grad_norm": 1.395012538501387, + "learning_rate": 0.00028395, + "loss": 6.0426, + "step": 1893 + }, + { + "epoch": 0.17672856209760193, + "grad_norm": 1.0886469884440533, + "learning_rate": 0.00028409999999999997, + "loss": 6.2651, + "step": 1894 + }, + { + "epoch": 0.17682187179247924, + "grad_norm": 1.2696531599447087, + "learning_rate": 0.00028425, + "loss": 5.9838, + "step": 1895 + }, + { + "epoch": 0.17691518148735655, + "grad_norm": 1.183838995034986, + "learning_rate": 0.0002844, + "loss": 5.7954, + "step": 1896 + }, + { + "epoch": 0.17700849118223383, + "grad_norm": 1.3813072541502254, + "learning_rate": 0.00028455, + "loss": 5.6922, + "step": 1897 + }, + { + "epoch": 0.17710180087711114, + "grad_norm": 1.5895503196511367, + "learning_rate": 0.0002847, + "loss": 6.0677, + "step": 1898 + }, + { + "epoch": 0.17719511057198842, + "grad_norm": 4.885341911220412, + "learning_rate": 0.00028484999999999996, + "loss": 5.7086, + "step": 1899 + }, + { + "epoch": 0.17728842026686573, + "grad_norm": 1.1552881053992397, + "learning_rate": 0.000285, + "loss": 6.1716, + "step": 1900 + }, + { + "epoch": 0.17738172996174303, + "grad_norm": 2.2251685930417695, + "learning_rate": 0.00028514999999999997, + "loss": 6.1242, + "step": 1901 + }, + { + "epoch": 0.17747503965662031, + "grad_norm": 1.1506088430657397, + "learning_rate": 0.00028529999999999994, + "loss": 6.1415, + "step": 1902 + }, + { + "epoch": 0.17756834935149762, + "grad_norm": 1.701963536790812, + "learning_rate": 0.00028544999999999997, + "loss": 6.1112, + "step": 1903 + }, + { + "epoch": 0.17766165904637493, + "grad_norm": 1.6476992886838104, + "learning_rate": 0.00028559999999999995, + "loss": 5.593, + "step": 1904 + }, + { + "epoch": 0.1777549687412522, + "grad_norm": 1.421259011867925, + "learning_rate": 0.00028575, + "loss": 6.1147, + "step": 1905 + }, + { + "epoch": 0.17784827843612952, + "grad_norm": 1.2328794903569977, + "learning_rate": 0.00028589999999999996, + "loss": 5.905, + "step": 1906 + }, + { + "epoch": 0.1779415881310068, + "grad_norm": 1.0887908631868863, + "learning_rate": 0.00028605, + "loss": 6.4583, + "step": 1907 + }, + { + "epoch": 0.1780348978258841, + "grad_norm": 1.4952223744341593, + "learning_rate": 0.00028619999999999996, + "loss": 5.6555, + "step": 1908 + }, + { + "epoch": 0.17812820752076142, + "grad_norm": 1.3246079991134152, + "learning_rate": 0.00028635, + "loss": 6.056, + "step": 1909 + }, + { + "epoch": 0.1782215172156387, + "grad_norm": 1.1669032961780479, + "learning_rate": 0.00028649999999999997, + "loss": 5.6934, + "step": 1910 + }, + { + "epoch": 0.178314826910516, + "grad_norm": 1.2253112729171536, + "learning_rate": 0.00028665, + "loss": 6.1025, + "step": 1911 + }, + { + "epoch": 0.1784081366053933, + "grad_norm": 1.0896704599245908, + "learning_rate": 0.0002868, + "loss": 5.7105, + "step": 1912 + }, + { + "epoch": 0.1785014463002706, + "grad_norm": 1.7425378819200117, + "learning_rate": 0.00028694999999999995, + "loss": 5.3635, + "step": 1913 + }, + { + "epoch": 0.1785947559951479, + "grad_norm": 1.000375467699386, + "learning_rate": 0.0002871, + "loss": 5.9369, + "step": 1914 + }, + { + "epoch": 0.17868806569002518, + "grad_norm": 1.2721134669506504, + "learning_rate": 0.00028724999999999996, + "loss": 5.905, + "step": 1915 + }, + { + "epoch": 0.1787813753849025, + "grad_norm": 1.0431185864526105, + "learning_rate": 0.00028739999999999994, + "loss": 5.693, + "step": 1916 + }, + { + "epoch": 0.1788746850797798, + "grad_norm": 2.131135622058938, + "learning_rate": 0.00028754999999999997, + "loss": 5.4154, + "step": 1917 + }, + { + "epoch": 0.17896799477465708, + "grad_norm": 1.0875697312850696, + "learning_rate": 0.00028769999999999995, + "loss": 5.6271, + "step": 1918 + }, + { + "epoch": 0.1790613044695344, + "grad_norm": 2.441170093311157, + "learning_rate": 0.00028785, + "loss": 5.5607, + "step": 1919 + }, + { + "epoch": 0.17915461416441167, + "grad_norm": 1.847298940759434, + "learning_rate": 0.00028799999999999995, + "loss": 6.0818, + "step": 1920 + }, + { + "epoch": 0.17924792385928898, + "grad_norm": 1.1364287275279374, + "learning_rate": 0.00028815, + "loss": 5.9034, + "step": 1921 + }, + { + "epoch": 0.17934123355416628, + "grad_norm": 1.0833273630637132, + "learning_rate": 0.00028829999999999996, + "loss": 5.912, + "step": 1922 + }, + { + "epoch": 0.17943454324904357, + "grad_norm": 1.6857778678826414, + "learning_rate": 0.00028845, + "loss": 6.0119, + "step": 1923 + }, + { + "epoch": 0.17952785294392087, + "grad_norm": 1.0881472233818268, + "learning_rate": 0.00028859999999999997, + "loss": 6.3065, + "step": 1924 + }, + { + "epoch": 0.17962116263879818, + "grad_norm": 1.032985167349083, + "learning_rate": 0.00028875, + "loss": 5.707, + "step": 1925 + }, + { + "epoch": 0.17971447233367546, + "grad_norm": 1.3623566939442069, + "learning_rate": 0.0002889, + "loss": 6.1784, + "step": 1926 + }, + { + "epoch": 0.17980778202855277, + "grad_norm": 1.4204386024378415, + "learning_rate": 0.00028905, + "loss": 6.2117, + "step": 1927 + }, + { + "epoch": 0.17990109172343005, + "grad_norm": 1.7627520619855779, + "learning_rate": 0.0002892, + "loss": 5.8642, + "step": 1928 + }, + { + "epoch": 0.17999440141830736, + "grad_norm": 1.0380122057691934, + "learning_rate": 0.00028934999999999996, + "loss": 5.9217, + "step": 1929 + }, + { + "epoch": 0.18008771111318467, + "grad_norm": 1.14310670745735, + "learning_rate": 0.0002895, + "loss": 5.5448, + "step": 1930 + }, + { + "epoch": 0.18018102080806195, + "grad_norm": 1.2639097299134152, + "learning_rate": 0.00028964999999999997, + "loss": 5.9081, + "step": 1931 + }, + { + "epoch": 0.18027433050293926, + "grad_norm": 1.0338324600134623, + "learning_rate": 0.00028979999999999994, + "loss": 6.154, + "step": 1932 + }, + { + "epoch": 0.18036764019781656, + "grad_norm": 1.132513245780658, + "learning_rate": 0.00028995, + "loss": 6.3748, + "step": 1933 + }, + { + "epoch": 0.18046094989269384, + "grad_norm": 1.1918130992013205, + "learning_rate": 0.00029009999999999995, + "loss": 5.9919, + "step": 1934 + }, + { + "epoch": 0.18055425958757115, + "grad_norm": 1.258475333226215, + "learning_rate": 0.00029025, + "loss": 5.9552, + "step": 1935 + }, + { + "epoch": 0.18064756928244843, + "grad_norm": 1.0015849956374014, + "learning_rate": 0.00029039999999999996, + "loss": 6.0405, + "step": 1936 + }, + { + "epoch": 0.18074087897732574, + "grad_norm": 0.9974764306559696, + "learning_rate": 0.00029055, + "loss": 5.6113, + "step": 1937 + }, + { + "epoch": 0.18083418867220305, + "grad_norm": 1.8556168784003642, + "learning_rate": 0.00029069999999999996, + "loss": 5.1934, + "step": 1938 + }, + { + "epoch": 0.18092749836708033, + "grad_norm": 2.2271926403638775, + "learning_rate": 0.00029085, + "loss": 6.1956, + "step": 1939 + }, + { + "epoch": 0.18102080806195764, + "grad_norm": 1.2892473727338591, + "learning_rate": 0.00029099999999999997, + "loss": 6.0978, + "step": 1940 + }, + { + "epoch": 0.18111411775683495, + "grad_norm": 1.3255309741887062, + "learning_rate": 0.00029115, + "loss": 5.8536, + "step": 1941 + }, + { + "epoch": 0.18120742745171223, + "grad_norm": 1.1833958804404616, + "learning_rate": 0.0002913, + "loss": 6.0344, + "step": 1942 + }, + { + "epoch": 0.18130073714658954, + "grad_norm": 1.2801899252167568, + "learning_rate": 0.00029145, + "loss": 6.2436, + "step": 1943 + }, + { + "epoch": 0.18139404684146682, + "grad_norm": 1.0879686784446911, + "learning_rate": 0.0002916, + "loss": 5.938, + "step": 1944 + }, + { + "epoch": 0.18148735653634412, + "grad_norm": 1.5201180711134177, + "learning_rate": 0.00029174999999999996, + "loss": 6.41, + "step": 1945 + }, + { + "epoch": 0.18158066623122143, + "grad_norm": 1.012543196512442, + "learning_rate": 0.0002919, + "loss": 5.7447, + "step": 1946 + }, + { + "epoch": 0.1816739759260987, + "grad_norm": 1.5207914725112415, + "learning_rate": 0.00029204999999999997, + "loss": 6.1802, + "step": 1947 + }, + { + "epoch": 0.18176728562097602, + "grad_norm": 1.1694628389936967, + "learning_rate": 0.00029219999999999995, + "loss": 6.4633, + "step": 1948 + }, + { + "epoch": 0.18186059531585333, + "grad_norm": 2.3175003831222725, + "learning_rate": 0.00029235, + "loss": 6.2756, + "step": 1949 + }, + { + "epoch": 0.1819539050107306, + "grad_norm": 1.1130524138779003, + "learning_rate": 0.00029249999999999995, + "loss": 6.1381, + "step": 1950 + }, + { + "epoch": 0.18204721470560792, + "grad_norm": 1.1283494693046876, + "learning_rate": 0.00029265, + "loss": 5.7265, + "step": 1951 + }, + { + "epoch": 0.1821405244004852, + "grad_norm": 1.7099262471676993, + "learning_rate": 0.00029279999999999996, + "loss": 6.0628, + "step": 1952 + }, + { + "epoch": 0.1822338340953625, + "grad_norm": 1.2843969338520138, + "learning_rate": 0.00029295, + "loss": 5.7766, + "step": 1953 + }, + { + "epoch": 0.18232714379023982, + "grad_norm": 1.6978553071595892, + "learning_rate": 0.00029309999999999997, + "loss": 6.0995, + "step": 1954 + }, + { + "epoch": 0.1824204534851171, + "grad_norm": 1.8144226157383971, + "learning_rate": 0.00029325, + "loss": 6.1426, + "step": 1955 + }, + { + "epoch": 0.1825137631799944, + "grad_norm": 1.2731338249040465, + "learning_rate": 0.0002934, + "loss": 6.2719, + "step": 1956 + }, + { + "epoch": 0.1826070728748717, + "grad_norm": 1.1998102709876957, + "learning_rate": 0.00029355, + "loss": 5.8262, + "step": 1957 + }, + { + "epoch": 0.182700382569749, + "grad_norm": 1.1925580092240922, + "learning_rate": 0.0002937, + "loss": 6.2803, + "step": 1958 + }, + { + "epoch": 0.1827936922646263, + "grad_norm": 1.0513292732960746, + "learning_rate": 0.00029384999999999996, + "loss": 5.8856, + "step": 1959 + }, + { + "epoch": 0.18288700195950358, + "grad_norm": 1.1753207921448268, + "learning_rate": 0.000294, + "loss": 6.0431, + "step": 1960 + }, + { + "epoch": 0.1829803116543809, + "grad_norm": 1.1985383036510173, + "learning_rate": 0.00029414999999999997, + "loss": 5.9621, + "step": 1961 + }, + { + "epoch": 0.1830736213492582, + "grad_norm": 1.2640212260290897, + "learning_rate": 0.00029429999999999994, + "loss": 5.9085, + "step": 1962 + }, + { + "epoch": 0.18316693104413548, + "grad_norm": 1.0322137288691293, + "learning_rate": 0.00029445, + "loss": 5.4881, + "step": 1963 + }, + { + "epoch": 0.1832602407390128, + "grad_norm": 1.1607485226378844, + "learning_rate": 0.00029459999999999995, + "loss": 6.4013, + "step": 1964 + }, + { + "epoch": 0.1833535504338901, + "grad_norm": 1.356097484527584, + "learning_rate": 0.00029475, + "loss": 6.1911, + "step": 1965 + }, + { + "epoch": 0.18344686012876738, + "grad_norm": 1.123130490181038, + "learning_rate": 0.00029489999999999996, + "loss": 5.582, + "step": 1966 + }, + { + "epoch": 0.18354016982364468, + "grad_norm": 1.1179890526099163, + "learning_rate": 0.00029505, + "loss": 6.0848, + "step": 1967 + }, + { + "epoch": 0.18363347951852196, + "grad_norm": 1.0462464648058303, + "learning_rate": 0.00029519999999999997, + "loss": 6.3455, + "step": 1968 + }, + { + "epoch": 0.18372678921339927, + "grad_norm": 1.1345436602920742, + "learning_rate": 0.00029535, + "loss": 6.1751, + "step": 1969 + }, + { + "epoch": 0.18382009890827658, + "grad_norm": 1.1053892596649382, + "learning_rate": 0.00029549999999999997, + "loss": 6.0283, + "step": 1970 + }, + { + "epoch": 0.18391340860315386, + "grad_norm": 2.5778637083289366, + "learning_rate": 0.00029565, + "loss": 6.0435, + "step": 1971 + }, + { + "epoch": 0.18400671829803117, + "grad_norm": 1.067685318464396, + "learning_rate": 0.0002958, + "loss": 6.1485, + "step": 1972 + }, + { + "epoch": 0.18410002799290845, + "grad_norm": 1.085856251987143, + "learning_rate": 0.00029595, + "loss": 6.1545, + "step": 1973 + }, + { + "epoch": 0.18419333768778576, + "grad_norm": 1.4768045465472328, + "learning_rate": 0.0002961, + "loss": 6.0978, + "step": 1974 + }, + { + "epoch": 0.18428664738266307, + "grad_norm": 1.037770952549316, + "learning_rate": 0.00029624999999999996, + "loss": 5.9912, + "step": 1975 + }, + { + "epoch": 0.18437995707754035, + "grad_norm": 0.9990361789161678, + "learning_rate": 0.0002964, + "loss": 6.1878, + "step": 1976 + }, + { + "epoch": 0.18447326677241765, + "grad_norm": 1.1113391419524095, + "learning_rate": 0.00029654999999999997, + "loss": 6.363, + "step": 1977 + }, + { + "epoch": 0.18456657646729496, + "grad_norm": 1.030005874398822, + "learning_rate": 0.00029669999999999995, + "loss": 6.0947, + "step": 1978 + }, + { + "epoch": 0.18465988616217224, + "grad_norm": 1.0238926812764273, + "learning_rate": 0.00029685, + "loss": 6.1061, + "step": 1979 + }, + { + "epoch": 0.18475319585704955, + "grad_norm": 1.0286829408868456, + "learning_rate": 0.00029699999999999996, + "loss": 6.49, + "step": 1980 + }, + { + "epoch": 0.18484650555192683, + "grad_norm": 1.254701111858773, + "learning_rate": 0.00029715, + "loss": 6.0652, + "step": 1981 + }, + { + "epoch": 0.18493981524680414, + "grad_norm": 1.5114752839596943, + "learning_rate": 0.00029729999999999996, + "loss": 6.3089, + "step": 1982 + }, + { + "epoch": 0.18503312494168145, + "grad_norm": 1.191171585495524, + "learning_rate": 0.00029745, + "loss": 6.1616, + "step": 1983 + }, + { + "epoch": 0.18512643463655873, + "grad_norm": 0.9908470961698465, + "learning_rate": 0.00029759999999999997, + "loss": 6.0677, + "step": 1984 + }, + { + "epoch": 0.18521974433143604, + "grad_norm": 5.62967426086178, + "learning_rate": 0.00029775, + "loss": 5.5828, + "step": 1985 + }, + { + "epoch": 0.18531305402631335, + "grad_norm": 1.2940991914839852, + "learning_rate": 0.0002979, + "loss": 5.6932, + "step": 1986 + }, + { + "epoch": 0.18540636372119063, + "grad_norm": 1.069299118937804, + "learning_rate": 0.00029805, + "loss": 5.7402, + "step": 1987 + }, + { + "epoch": 0.18549967341606793, + "grad_norm": 1.073087148407434, + "learning_rate": 0.0002982, + "loss": 6.0892, + "step": 1988 + }, + { + "epoch": 0.18559298311094521, + "grad_norm": 1.4890061649597592, + "learning_rate": 0.00029835, + "loss": 5.7992, + "step": 1989 + }, + { + "epoch": 0.18568629280582252, + "grad_norm": 1.099214567025864, + "learning_rate": 0.0002985, + "loss": 6.0244, + "step": 1990 + }, + { + "epoch": 0.18577960250069983, + "grad_norm": 1.3591194226170638, + "learning_rate": 0.00029864999999999997, + "loss": 5.9927, + "step": 1991 + }, + { + "epoch": 0.1858729121955771, + "grad_norm": 1.078638002808935, + "learning_rate": 0.0002988, + "loss": 6.5679, + "step": 1992 + }, + { + "epoch": 0.18596622189045442, + "grad_norm": 1.689365454209772, + "learning_rate": 0.00029895, + "loss": 5.6289, + "step": 1993 + }, + { + "epoch": 0.18605953158533173, + "grad_norm": 1.2857931570483987, + "learning_rate": 0.00029909999999999995, + "loss": 5.9902, + "step": 1994 + }, + { + "epoch": 0.186152841280209, + "grad_norm": 1.0517968549246732, + "learning_rate": 0.00029925, + "loss": 5.934, + "step": 1995 + }, + { + "epoch": 0.18624615097508632, + "grad_norm": 0.9894135139779832, + "learning_rate": 0.00029939999999999996, + "loss": 6.2507, + "step": 1996 + }, + { + "epoch": 0.1863394606699636, + "grad_norm": 1.4034631411983352, + "learning_rate": 0.00029955, + "loss": 6.5006, + "step": 1997 + }, + { + "epoch": 0.1864327703648409, + "grad_norm": 1.4498174062458442, + "learning_rate": 0.00029969999999999997, + "loss": 5.6659, + "step": 1998 + }, + { + "epoch": 0.1865260800597182, + "grad_norm": 1.1870711127971085, + "learning_rate": 0.00029985, + "loss": 6.0142, + "step": 1999 + }, + { + "epoch": 0.1866193897545955, + "grad_norm": 1.1464279247970808, + "learning_rate": 0.0003, + "loss": 5.9402, + "step": 2000 + }, + { + "epoch": 0.1867126994494728, + "grad_norm": 1.2396411833629826, + "learning_rate": 0.00029999999980929716, + "loss": 5.9427, + "step": 2001 + }, + { + "epoch": 0.1868060091443501, + "grad_norm": 1.301117710162386, + "learning_rate": 0.0002999999992371888, + "loss": 6.3624, + "step": 2002 + }, + { + "epoch": 0.1868993188392274, + "grad_norm": 1.3269541342299478, + "learning_rate": 0.0002999999982836749, + "loss": 5.7433, + "step": 2003 + }, + { + "epoch": 0.1869926285341047, + "grad_norm": 2.4334197948249323, + "learning_rate": 0.00029999999694875544, + "loss": 6.233, + "step": 2004 + }, + { + "epoch": 0.18708593822898198, + "grad_norm": 1.0413928183382413, + "learning_rate": 0.00029999999523243044, + "loss": 5.6815, + "step": 2005 + }, + { + "epoch": 0.1871792479238593, + "grad_norm": 1.0587156176423758, + "learning_rate": 0.00029999999313469987, + "loss": 5.5379, + "step": 2006 + }, + { + "epoch": 0.1872725576187366, + "grad_norm": 1.0437294821212613, + "learning_rate": 0.00029999999065556367, + "loss": 6.0619, + "step": 2007 + }, + { + "epoch": 0.18736586731361388, + "grad_norm": 1.0102244750181155, + "learning_rate": 0.000299999987795022, + "loss": 6.0661, + "step": 2008 + }, + { + "epoch": 0.18745917700849118, + "grad_norm": 1.2134786234849428, + "learning_rate": 0.00029999998455307477, + "loss": 6.1329, + "step": 2009 + }, + { + "epoch": 0.1875524867033685, + "grad_norm": 2.4666044049127938, + "learning_rate": 0.000299999980929722, + "loss": 5.7667, + "step": 2010 + }, + { + "epoch": 0.18764579639824577, + "grad_norm": 1.3679967373752884, + "learning_rate": 0.00029999997692496373, + "loss": 5.4202, + "step": 2011 + }, + { + "epoch": 0.18773910609312308, + "grad_norm": 1.0731858831170809, + "learning_rate": 0.0002999999725388, + "loss": 5.7317, + "step": 2012 + }, + { + "epoch": 0.18783241578800036, + "grad_norm": 1.094944113366521, + "learning_rate": 0.00029999996777123073, + "loss": 5.997, + "step": 2013 + }, + { + "epoch": 0.18792572548287767, + "grad_norm": 1.2745083115109292, + "learning_rate": 0.00029999996262225595, + "loss": 5.8737, + "step": 2014 + }, + { + "epoch": 0.18801903517775498, + "grad_norm": 1.6258932365629801, + "learning_rate": 0.00029999995709187576, + "loss": 6.128, + "step": 2015 + }, + { + "epoch": 0.18811234487263226, + "grad_norm": 1.5586382039183604, + "learning_rate": 0.00029999995118009005, + "loss": 6.0907, + "step": 2016 + }, + { + "epoch": 0.18820565456750957, + "grad_norm": 1.143807690202979, + "learning_rate": 0.00029999994488689893, + "loss": 5.7207, + "step": 2017 + }, + { + "epoch": 0.18829896426238688, + "grad_norm": 1.0958317897716574, + "learning_rate": 0.0002999999382123024, + "loss": 6.2445, + "step": 2018 + }, + { + "epoch": 0.18839227395726416, + "grad_norm": 1.1102382070577497, + "learning_rate": 0.0002999999311563004, + "loss": 5.837, + "step": 2019 + }, + { + "epoch": 0.18848558365214146, + "grad_norm": 2.8778400633969996, + "learning_rate": 0.00029999992371889305, + "loss": 5.4621, + "step": 2020 + }, + { + "epoch": 0.18857889334701874, + "grad_norm": 1.2134285547980685, + "learning_rate": 0.00029999991590008035, + "loss": 5.469, + "step": 2021 + }, + { + "epoch": 0.18867220304189605, + "grad_norm": 1.118924689111731, + "learning_rate": 0.0002999999076998623, + "loss": 6.2327, + "step": 2022 + }, + { + "epoch": 0.18876551273677336, + "grad_norm": 0.9782491714871444, + "learning_rate": 0.00029999989911823886, + "loss": 5.9294, + "step": 2023 + }, + { + "epoch": 0.18885882243165064, + "grad_norm": 0.977032545392469, + "learning_rate": 0.00029999989015521013, + "loss": 5.9867, + "step": 2024 + }, + { + "epoch": 0.18895213212652795, + "grad_norm": 1.025457227071101, + "learning_rate": 0.00029999988081077616, + "loss": 5.8457, + "step": 2025 + }, + { + "epoch": 0.18904544182140526, + "grad_norm": 1.0602098935669433, + "learning_rate": 0.0002999998710849369, + "loss": 5.5696, + "step": 2026 + }, + { + "epoch": 0.18913875151628254, + "grad_norm": 0.9958842948869079, + "learning_rate": 0.00029999986097769236, + "loss": 6.0891, + "step": 2027 + }, + { + "epoch": 0.18923206121115985, + "grad_norm": 0.9585990327757482, + "learning_rate": 0.00029999985048904265, + "loss": 6.2777, + "step": 2028 + }, + { + "epoch": 0.18932537090603713, + "grad_norm": 0.991845740025733, + "learning_rate": 0.0002999998396189877, + "loss": 5.9919, + "step": 2029 + }, + { + "epoch": 0.18941868060091444, + "grad_norm": 1.2426600972471034, + "learning_rate": 0.00029999982836752763, + "loss": 6.1562, + "step": 2030 + }, + { + "epoch": 0.18951199029579174, + "grad_norm": 1.0851232391788839, + "learning_rate": 0.00029999981673466244, + "loss": 6.2973, + "step": 2031 + }, + { + "epoch": 0.18960529999066902, + "grad_norm": 1.0910057017309398, + "learning_rate": 0.00029999980472039217, + "loss": 6.1557, + "step": 2032 + }, + { + "epoch": 0.18969860968554633, + "grad_norm": 2.2130617030146382, + "learning_rate": 0.00029999979232471675, + "loss": 5.528, + "step": 2033 + }, + { + "epoch": 0.1897919193804236, + "grad_norm": 1.3308505035383948, + "learning_rate": 0.00029999977954763636, + "loss": 5.5969, + "step": 2034 + }, + { + "epoch": 0.18988522907530092, + "grad_norm": 1.696143517012187, + "learning_rate": 0.00029999976638915094, + "loss": 6.2861, + "step": 2035 + }, + { + "epoch": 0.18997853877017823, + "grad_norm": 1.325368948874948, + "learning_rate": 0.0002999997528492606, + "loss": 5.8569, + "step": 2036 + }, + { + "epoch": 0.1900718484650555, + "grad_norm": 1.2472687234071105, + "learning_rate": 0.00029999973892796527, + "loss": 5.6778, + "step": 2037 + }, + { + "epoch": 0.19016515815993282, + "grad_norm": 1.2133421000128792, + "learning_rate": 0.00029999972462526497, + "loss": 5.9012, + "step": 2038 + }, + { + "epoch": 0.19025846785481013, + "grad_norm": 1.1389095901613864, + "learning_rate": 0.0002999997099411599, + "loss": 6.0355, + "step": 2039 + }, + { + "epoch": 0.1903517775496874, + "grad_norm": 1.3083171515100809, + "learning_rate": 0.00029999969487565, + "loss": 6.1108, + "step": 2040 + }, + { + "epoch": 0.19044508724456471, + "grad_norm": 1.2316161557780805, + "learning_rate": 0.00029999967942873524, + "loss": 5.6585, + "step": 2041 + }, + { + "epoch": 0.190538396939442, + "grad_norm": 1.2498869106125696, + "learning_rate": 0.0002999996636004158, + "loss": 6.0993, + "step": 2042 + }, + { + "epoch": 0.1906317066343193, + "grad_norm": 1.2429341771162148, + "learning_rate": 0.00029999964739069163, + "loss": 5.8221, + "step": 2043 + }, + { + "epoch": 0.1907250163291966, + "grad_norm": 1.3465596713427237, + "learning_rate": 0.00029999963079956277, + "loss": 5.8764, + "step": 2044 + }, + { + "epoch": 0.1908183260240739, + "grad_norm": 1.126697229409073, + "learning_rate": 0.0002999996138270293, + "loss": 6.2249, + "step": 2045 + }, + { + "epoch": 0.1909116357189512, + "grad_norm": 1.0803867622767853, + "learning_rate": 0.0002999995964730912, + "loss": 5.8642, + "step": 2046 + }, + { + "epoch": 0.1910049454138285, + "grad_norm": 1.1508916790056551, + "learning_rate": 0.0002999995787377486, + "loss": 5.9666, + "step": 2047 + }, + { + "epoch": 0.1910982551087058, + "grad_norm": 1.2221938977192273, + "learning_rate": 0.0002999995606210015, + "loss": 5.9069, + "step": 2048 + }, + { + "epoch": 0.1911915648035831, + "grad_norm": 1.2067276932618283, + "learning_rate": 0.00029999954212285, + "loss": 5.9555, + "step": 2049 + }, + { + "epoch": 0.19128487449846038, + "grad_norm": 1.3975155918023063, + "learning_rate": 0.00029999952324329403, + "loss": 6.1583, + "step": 2050 + }, + { + "epoch": 0.19137818419333769, + "grad_norm": 1.0860577057254484, + "learning_rate": 0.0002999995039823337, + "loss": 6.1996, + "step": 2051 + }, + { + "epoch": 0.191471493888215, + "grad_norm": 7.921610906200255, + "learning_rate": 0.0002999994843399691, + "loss": 5.7596, + "step": 2052 + }, + { + "epoch": 0.19156480358309227, + "grad_norm": 1.063442822298501, + "learning_rate": 0.00029999946431620025, + "loss": 5.7924, + "step": 2053 + }, + { + "epoch": 0.19165811327796958, + "grad_norm": 1.1098059448629152, + "learning_rate": 0.00029999944391102714, + "loss": 6.3752, + "step": 2054 + }, + { + "epoch": 0.1917514229728469, + "grad_norm": 1.4232747116323814, + "learning_rate": 0.00029999942312444993, + "loss": 5.8258, + "step": 2055 + }, + { + "epoch": 0.19184473266772417, + "grad_norm": 1.1179474937020026, + "learning_rate": 0.0002999994019564686, + "loss": 5.9129, + "step": 2056 + }, + { + "epoch": 0.19193804236260148, + "grad_norm": 1.0953974215726547, + "learning_rate": 0.0002999993804070833, + "loss": 5.5803, + "step": 2057 + }, + { + "epoch": 0.19203135205747876, + "grad_norm": 1.7048234212430582, + "learning_rate": 0.0002999993584762939, + "loss": 6.2611, + "step": 2058 + }, + { + "epoch": 0.19212466175235607, + "grad_norm": 1.3592342209929675, + "learning_rate": 0.0002999993361641006, + "loss": 6.3985, + "step": 2059 + }, + { + "epoch": 0.19221797144723338, + "grad_norm": 1.1848788070630945, + "learning_rate": 0.0002999993134705034, + "loss": 6.2515, + "step": 2060 + }, + { + "epoch": 0.19231128114211066, + "grad_norm": 1.5772057617593298, + "learning_rate": 0.0002999992903955024, + "loss": 6.423, + "step": 2061 + }, + { + "epoch": 0.19240459083698797, + "grad_norm": 1.3757408282610097, + "learning_rate": 0.0002999992669390977, + "loss": 5.9522, + "step": 2062 + }, + { + "epoch": 0.19249790053186527, + "grad_norm": 1.4669473399755686, + "learning_rate": 0.0002999992431012892, + "loss": 6.2755, + "step": 2063 + }, + { + "epoch": 0.19259121022674255, + "grad_norm": 1.2444374957514492, + "learning_rate": 0.0002999992188820771, + "loss": 6.0545, + "step": 2064 + }, + { + "epoch": 0.19268451992161986, + "grad_norm": 1.2525471689967165, + "learning_rate": 0.00029999919428146143, + "loss": 5.7469, + "step": 2065 + }, + { + "epoch": 0.19277782961649714, + "grad_norm": 1.1596908171386804, + "learning_rate": 0.0002999991692994422, + "loss": 6.0438, + "step": 2066 + }, + { + "epoch": 0.19287113931137445, + "grad_norm": 1.1607023716495395, + "learning_rate": 0.0002999991439360196, + "loss": 5.7994, + "step": 2067 + }, + { + "epoch": 0.19296444900625176, + "grad_norm": 1.3304766309036504, + "learning_rate": 0.00029999911819119355, + "loss": 6.1513, + "step": 2068 + }, + { + "epoch": 0.19305775870112904, + "grad_norm": 1.308567604128919, + "learning_rate": 0.00029999909206496417, + "loss": 5.9528, + "step": 2069 + }, + { + "epoch": 0.19315106839600635, + "grad_norm": 1.1219718297806398, + "learning_rate": 0.00029999906555733155, + "loss": 5.7337, + "step": 2070 + }, + { + "epoch": 0.19324437809088366, + "grad_norm": 1.241685878187484, + "learning_rate": 0.0002999990386682957, + "loss": 5.5063, + "step": 2071 + }, + { + "epoch": 0.19333768778576094, + "grad_norm": 1.1737624913053388, + "learning_rate": 0.00029999901139785674, + "loss": 5.7441, + "step": 2072 + }, + { + "epoch": 0.19343099748063824, + "grad_norm": 1.4411784986562188, + "learning_rate": 0.00029999898374601477, + "loss": 6.1873, + "step": 2073 + }, + { + "epoch": 0.19352430717551553, + "grad_norm": 1.236652949695254, + "learning_rate": 0.00029999895571276977, + "loss": 5.9201, + "step": 2074 + }, + { + "epoch": 0.19361761687039283, + "grad_norm": 0.9972244545319723, + "learning_rate": 0.00029999892729812186, + "loss": 5.9558, + "step": 2075 + }, + { + "epoch": 0.19371092656527014, + "grad_norm": 1.3406141134228389, + "learning_rate": 0.0002999988985020712, + "loss": 6.289, + "step": 2076 + }, + { + "epoch": 0.19380423626014742, + "grad_norm": 1.1561234266033031, + "learning_rate": 0.00029999886932461766, + "loss": 5.6645, + "step": 2077 + }, + { + "epoch": 0.19389754595502473, + "grad_norm": 1.9726013284783632, + "learning_rate": 0.0002999988397657615, + "loss": 5.3437, + "step": 2078 + }, + { + "epoch": 0.19399085564990204, + "grad_norm": 1.2575750391315381, + "learning_rate": 0.00029999880982550266, + "loss": 5.6076, + "step": 2079 + }, + { + "epoch": 0.19408416534477932, + "grad_norm": 1.0598831944730538, + "learning_rate": 0.00029999877950384135, + "loss": 5.6668, + "step": 2080 + }, + { + "epoch": 0.19417747503965663, + "grad_norm": 1.0825725140390123, + "learning_rate": 0.00029999874880077756, + "loss": 6.1126, + "step": 2081 + }, + { + "epoch": 0.1942707847345339, + "grad_norm": 1.1049172491234107, + "learning_rate": 0.00029999871771631135, + "loss": 5.7643, + "step": 2082 + }, + { + "epoch": 0.19436409442941122, + "grad_norm": 1.941313271233444, + "learning_rate": 0.0002999986862504428, + "loss": 5.9916, + "step": 2083 + }, + { + "epoch": 0.19445740412428852, + "grad_norm": 1.0220758706240503, + "learning_rate": 0.0002999986544031721, + "loss": 5.7171, + "step": 2084 + }, + { + "epoch": 0.1945507138191658, + "grad_norm": 1.2289486797928328, + "learning_rate": 0.00029999862217449925, + "loss": 5.7372, + "step": 2085 + }, + { + "epoch": 0.1946440235140431, + "grad_norm": 1.4852559605867512, + "learning_rate": 0.0002999985895644243, + "loss": 5.8582, + "step": 2086 + }, + { + "epoch": 0.1947373332089204, + "grad_norm": 1.43866074370563, + "learning_rate": 0.00029999855657294745, + "loss": 6.1967, + "step": 2087 + }, + { + "epoch": 0.1948306429037977, + "grad_norm": 1.1926813420022653, + "learning_rate": 0.0002999985232000686, + "loss": 5.9654, + "step": 2088 + }, + { + "epoch": 0.194923952598675, + "grad_norm": 1.9994161416291176, + "learning_rate": 0.000299998489445788, + "loss": 6.1005, + "step": 2089 + }, + { + "epoch": 0.1950172622935523, + "grad_norm": 1.6660259943154003, + "learning_rate": 0.00029999845531010564, + "loss": 6.0094, + "step": 2090 + }, + { + "epoch": 0.1951105719884296, + "grad_norm": 1.0420935126833901, + "learning_rate": 0.0002999984207930217, + "loss": 6.014, + "step": 2091 + }, + { + "epoch": 0.1952038816833069, + "grad_norm": 1.090096633975986, + "learning_rate": 0.0002999983858945361, + "loss": 5.6616, + "step": 2092 + }, + { + "epoch": 0.1952971913781842, + "grad_norm": 1.0724001372323224, + "learning_rate": 0.00029999835061464907, + "loss": 6.006, + "step": 2093 + }, + { + "epoch": 0.1953905010730615, + "grad_norm": 12.019648710627488, + "learning_rate": 0.0002999983149533607, + "loss": 6.05, + "step": 2094 + }, + { + "epoch": 0.19548381076793878, + "grad_norm": 1.6145704770787297, + "learning_rate": 0.00029999827891067103, + "loss": 6.0811, + "step": 2095 + }, + { + "epoch": 0.19557712046281608, + "grad_norm": 0.9931788135845205, + "learning_rate": 0.00029999824248658024, + "loss": 5.9641, + "step": 2096 + }, + { + "epoch": 0.1956704301576934, + "grad_norm": 2.371945788019116, + "learning_rate": 0.00029999820568108826, + "loss": 6.3184, + "step": 2097 + }, + { + "epoch": 0.19576373985257067, + "grad_norm": 1.1818572516062655, + "learning_rate": 0.00029999816849419537, + "loss": 5.4832, + "step": 2098 + }, + { + "epoch": 0.19585704954744798, + "grad_norm": 1.5193343790411211, + "learning_rate": 0.00029999813092590146, + "loss": 5.8812, + "step": 2099 + }, + { + "epoch": 0.1959503592423253, + "grad_norm": 1.0992567005330796, + "learning_rate": 0.0002999980929762068, + "loss": 5.795, + "step": 2100 + }, + { + "epoch": 0.19604366893720257, + "grad_norm": 1.3521754394402645, + "learning_rate": 0.0002999980546451114, + "loss": 6.2213, + "step": 2101 + }, + { + "epoch": 0.19613697863207988, + "grad_norm": 1.367781999363231, + "learning_rate": 0.0002999980159326154, + "loss": 5.719, + "step": 2102 + }, + { + "epoch": 0.19623028832695716, + "grad_norm": 1.8517360047231712, + "learning_rate": 0.0002999979768387189, + "loss": 6.0057, + "step": 2103 + }, + { + "epoch": 0.19632359802183447, + "grad_norm": 1.043024520566012, + "learning_rate": 0.0002999979373634219, + "loss": 5.9708, + "step": 2104 + }, + { + "epoch": 0.19641690771671177, + "grad_norm": 1.3493324503928605, + "learning_rate": 0.00029999789750672463, + "loss": 6.0043, + "step": 2105 + }, + { + "epoch": 0.19651021741158906, + "grad_norm": 1.4121989900706327, + "learning_rate": 0.00029999785726862716, + "loss": 5.6019, + "step": 2106 + }, + { + "epoch": 0.19660352710646636, + "grad_norm": 1.1493330877705765, + "learning_rate": 0.00029999781664912954, + "loss": 6.069, + "step": 2107 + }, + { + "epoch": 0.19669683680134367, + "grad_norm": 1.0341787302838248, + "learning_rate": 0.0002999977756482319, + "loss": 6.0165, + "step": 2108 + }, + { + "epoch": 0.19679014649622095, + "grad_norm": 1.1290231961284902, + "learning_rate": 0.00029999773426593433, + "loss": 5.8777, + "step": 2109 + }, + { + "epoch": 0.19688345619109826, + "grad_norm": 1.0537337778080833, + "learning_rate": 0.000299997692502237, + "loss": 6.18, + "step": 2110 + }, + { + "epoch": 0.19697676588597554, + "grad_norm": 1.1359392706118154, + "learning_rate": 0.00029999765035714, + "loss": 5.5841, + "step": 2111 + }, + { + "epoch": 0.19707007558085285, + "grad_norm": 1.0663441384094055, + "learning_rate": 0.00029999760783064333, + "loss": 5.9845, + "step": 2112 + }, + { + "epoch": 0.19716338527573016, + "grad_norm": 1.0686417169688687, + "learning_rate": 0.0002999975649227472, + "loss": 5.8952, + "step": 2113 + }, + { + "epoch": 0.19725669497060744, + "grad_norm": 1.1778159762666571, + "learning_rate": 0.00029999752163345175, + "loss": 6.0176, + "step": 2114 + }, + { + "epoch": 0.19735000466548475, + "grad_norm": 1.082501938851595, + "learning_rate": 0.0002999974779627569, + "loss": 5.8482, + "step": 2115 + }, + { + "epoch": 0.19744331436036205, + "grad_norm": 1.33936321509152, + "learning_rate": 0.000299997433910663, + "loss": 5.3439, + "step": 2116 + }, + { + "epoch": 0.19753662405523933, + "grad_norm": 1.077154208980563, + "learning_rate": 0.00029999738947717006, + "loss": 5.6429, + "step": 2117 + }, + { + "epoch": 0.19762993375011664, + "grad_norm": 1.1298334700293655, + "learning_rate": 0.0002999973446622782, + "loss": 6.0861, + "step": 2118 + }, + { + "epoch": 0.19772324344499392, + "grad_norm": 1.2320337881759456, + "learning_rate": 0.00029999729946598747, + "loss": 6.4542, + "step": 2119 + }, + { + "epoch": 0.19781655313987123, + "grad_norm": 1.2322806864986695, + "learning_rate": 0.00029999725388829806, + "loss": 5.9853, + "step": 2120 + }, + { + "epoch": 0.19790986283474854, + "grad_norm": 1.3595812686519622, + "learning_rate": 0.00029999720792921007, + "loss": 5.7292, + "step": 2121 + }, + { + "epoch": 0.19800317252962582, + "grad_norm": 1.520626391508239, + "learning_rate": 0.0002999971615887236, + "loss": 5.4493, + "step": 2122 + }, + { + "epoch": 0.19809648222450313, + "grad_norm": 1.0951255164508464, + "learning_rate": 0.00029999711486683884, + "loss": 5.7584, + "step": 2123 + }, + { + "epoch": 0.19818979191938044, + "grad_norm": 1.7898558227654506, + "learning_rate": 0.00029999706776355576, + "loss": 5.9806, + "step": 2124 + }, + { + "epoch": 0.19828310161425772, + "grad_norm": 1.301386166653145, + "learning_rate": 0.00029999702027887464, + "loss": 6.173, + "step": 2125 + }, + { + "epoch": 0.19837641130913503, + "grad_norm": 1.2918462977819103, + "learning_rate": 0.0002999969724127955, + "loss": 6.3259, + "step": 2126 + }, + { + "epoch": 0.1984697210040123, + "grad_norm": 1.3420080538680603, + "learning_rate": 0.0002999969241653185, + "loss": 6.0023, + "step": 2127 + }, + { + "epoch": 0.19856303069888961, + "grad_norm": 1.1405041509138227, + "learning_rate": 0.00029999687553644377, + "loss": 5.9882, + "step": 2128 + }, + { + "epoch": 0.19865634039376692, + "grad_norm": 1.1877696909611626, + "learning_rate": 0.00029999682652617136, + "loss": 6.2793, + "step": 2129 + }, + { + "epoch": 0.1987496500886442, + "grad_norm": 1.3183199261966856, + "learning_rate": 0.0002999967771345015, + "loss": 5.8451, + "step": 2130 + }, + { + "epoch": 0.1988429597835215, + "grad_norm": 1.0212132400499938, + "learning_rate": 0.0002999967273614343, + "loss": 5.9918, + "step": 2131 + }, + { + "epoch": 0.19893626947839882, + "grad_norm": 1.2782649416181027, + "learning_rate": 0.0002999966772069698, + "loss": 6.3413, + "step": 2132 + }, + { + "epoch": 0.1990295791732761, + "grad_norm": 1.4004878066579243, + "learning_rate": 0.0002999966266711082, + "loss": 5.9459, + "step": 2133 + }, + { + "epoch": 0.1991228888681534, + "grad_norm": 1.129243605707388, + "learning_rate": 0.0002999965757538496, + "loss": 6.2354, + "step": 2134 + }, + { + "epoch": 0.1992161985630307, + "grad_norm": 1.0140338443641579, + "learning_rate": 0.00029999652445519414, + "loss": 6.0363, + "step": 2135 + }, + { + "epoch": 0.199309508257908, + "grad_norm": 1.4083396387339346, + "learning_rate": 0.00029999647277514197, + "loss": 5.9264, + "step": 2136 + }, + { + "epoch": 0.1994028179527853, + "grad_norm": 1.9345297928123217, + "learning_rate": 0.0002999964207136932, + "loss": 6.0331, + "step": 2137 + }, + { + "epoch": 0.19949612764766259, + "grad_norm": 1.40264772973353, + "learning_rate": 0.0002999963682708479, + "loss": 5.83, + "step": 2138 + }, + { + "epoch": 0.1995894373425399, + "grad_norm": 1.091254152153047, + "learning_rate": 0.0002999963154466064, + "loss": 6.0874, + "step": 2139 + }, + { + "epoch": 0.19968274703741717, + "grad_norm": 1.2230429122532254, + "learning_rate": 0.0002999962622409686, + "loss": 6.1357, + "step": 2140 + }, + { + "epoch": 0.19977605673229448, + "grad_norm": 1.0624386250894906, + "learning_rate": 0.00029999620865393475, + "loss": 6.3691, + "step": 2141 + }, + { + "epoch": 0.1998693664271718, + "grad_norm": 1.180697950029592, + "learning_rate": 0.00029999615468550493, + "loss": 6.0511, + "step": 2142 + }, + { + "epoch": 0.19996267612204907, + "grad_norm": 1.176500968714766, + "learning_rate": 0.00029999610033567945, + "loss": 5.7282, + "step": 2143 + }, + { + "epoch": 0.20005598581692638, + "grad_norm": 1.0664552299479384, + "learning_rate": 0.0002999960456044582, + "loss": 6.2546, + "step": 2144 + }, + { + "epoch": 0.2001492955118037, + "grad_norm": 1.332298405440769, + "learning_rate": 0.0002999959904918415, + "loss": 5.9278, + "step": 2145 + }, + { + "epoch": 0.20024260520668097, + "grad_norm": 1.0997136316210878, + "learning_rate": 0.0002999959349978294, + "loss": 6.0633, + "step": 2146 + }, + { + "epoch": 0.20033591490155828, + "grad_norm": 0.9807387737041154, + "learning_rate": 0.00029999587912242205, + "loss": 5.9748, + "step": 2147 + }, + { + "epoch": 0.20042922459643556, + "grad_norm": 1.2931760808051596, + "learning_rate": 0.00029999582286561967, + "loss": 6.2037, + "step": 2148 + }, + { + "epoch": 0.20052253429131286, + "grad_norm": 2.4886205330394615, + "learning_rate": 0.00029999576622742233, + "loss": 6.2544, + "step": 2149 + }, + { + "epoch": 0.20061584398619017, + "grad_norm": 1.0104187513361846, + "learning_rate": 0.00029999570920783016, + "loss": 5.6894, + "step": 2150 + }, + { + "epoch": 0.20070915368106745, + "grad_norm": 1.2236177502675514, + "learning_rate": 0.00029999565180684337, + "loss": 5.5865, + "step": 2151 + }, + { + "epoch": 0.20080246337594476, + "grad_norm": 1.51530010644068, + "learning_rate": 0.000299995594024462, + "loss": 6.216, + "step": 2152 + }, + { + "epoch": 0.20089577307082207, + "grad_norm": 1.4650872420752465, + "learning_rate": 0.00029999553586068634, + "loss": 6.1235, + "step": 2153 + }, + { + "epoch": 0.20098908276569935, + "grad_norm": 2.5988217945731353, + "learning_rate": 0.00029999547731551643, + "loss": 5.7937, + "step": 2154 + }, + { + "epoch": 0.20108239246057666, + "grad_norm": 1.2234989132590464, + "learning_rate": 0.0002999954183889525, + "loss": 5.7413, + "step": 2155 + }, + { + "epoch": 0.20117570215545394, + "grad_norm": 2.785922637298358, + "learning_rate": 0.0002999953590809946, + "loss": 5.2171, + "step": 2156 + }, + { + "epoch": 0.20126901185033125, + "grad_norm": 2.1628511515905053, + "learning_rate": 0.00029999529939164294, + "loss": 5.6941, + "step": 2157 + }, + { + "epoch": 0.20136232154520856, + "grad_norm": 1.185501492248828, + "learning_rate": 0.0002999952393208977, + "loss": 6.0631, + "step": 2158 + }, + { + "epoch": 0.20145563124008584, + "grad_norm": 2.3877613703684535, + "learning_rate": 0.00029999517886875895, + "loss": 5.3087, + "step": 2159 + }, + { + "epoch": 0.20154894093496314, + "grad_norm": 1.3664727068017528, + "learning_rate": 0.0002999951180352269, + "loss": 6.005, + "step": 2160 + }, + { + "epoch": 0.20164225062984045, + "grad_norm": 7.129434066603312, + "learning_rate": 0.0002999950568203017, + "loss": 6.2019, + "step": 2161 + }, + { + "epoch": 0.20173556032471773, + "grad_norm": 1.0452679515022534, + "learning_rate": 0.0002999949952239835, + "loss": 6.0142, + "step": 2162 + }, + { + "epoch": 0.20182887001959504, + "grad_norm": 1.1245717480496493, + "learning_rate": 0.0002999949332462725, + "loss": 5.758, + "step": 2163 + }, + { + "epoch": 0.20192217971447232, + "grad_norm": 3.579296578683165, + "learning_rate": 0.0002999948708871688, + "loss": 6.1079, + "step": 2164 + }, + { + "epoch": 0.20201548940934963, + "grad_norm": 1.3841029811069863, + "learning_rate": 0.00029999480814667256, + "loss": 5.6388, + "step": 2165 + }, + { + "epoch": 0.20210879910422694, + "grad_norm": 1.489314490186503, + "learning_rate": 0.00029999474502478393, + "loss": 6.2339, + "step": 2166 + }, + { + "epoch": 0.20220210879910422, + "grad_norm": 2.4594858730484823, + "learning_rate": 0.0002999946815215031, + "loss": 5.4742, + "step": 2167 + }, + { + "epoch": 0.20229541849398153, + "grad_norm": 2.4337650199415424, + "learning_rate": 0.0002999946176368303, + "loss": 6.1678, + "step": 2168 + }, + { + "epoch": 0.20238872818885884, + "grad_norm": 1.363533418766389, + "learning_rate": 0.00029999455337076554, + "loss": 6.299, + "step": 2169 + }, + { + "epoch": 0.20248203788373612, + "grad_norm": 1.3085698244254815, + "learning_rate": 0.0002999944887233091, + "loss": 5.8629, + "step": 2170 + }, + { + "epoch": 0.20257534757861342, + "grad_norm": 1.3014312203207241, + "learning_rate": 0.0002999944236944611, + "loss": 6.1401, + "step": 2171 + }, + { + "epoch": 0.2026686572734907, + "grad_norm": 1.1558779831928907, + "learning_rate": 0.00029999435828422167, + "loss": 6.1257, + "step": 2172 + }, + { + "epoch": 0.202761966968368, + "grad_norm": 1.1339620251201488, + "learning_rate": 0.00029999429249259106, + "loss": 5.7761, + "step": 2173 + }, + { + "epoch": 0.20285527666324532, + "grad_norm": 1.1226590134975403, + "learning_rate": 0.0002999942263195694, + "loss": 5.8602, + "step": 2174 + }, + { + "epoch": 0.2029485863581226, + "grad_norm": 1.2211067077199114, + "learning_rate": 0.0002999941597651569, + "loss": 5.9658, + "step": 2175 + }, + { + "epoch": 0.2030418960529999, + "grad_norm": 2.295767387344587, + "learning_rate": 0.0002999940928293536, + "loss": 5.7326, + "step": 2176 + }, + { + "epoch": 0.20313520574787722, + "grad_norm": 2.196822145571162, + "learning_rate": 0.0002999940255121598, + "loss": 6.3821, + "step": 2177 + }, + { + "epoch": 0.2032285154427545, + "grad_norm": 4.717030529257068, + "learning_rate": 0.0002999939578135756, + "loss": 6.0371, + "step": 2178 + }, + { + "epoch": 0.2033218251376318, + "grad_norm": 1.130917858127149, + "learning_rate": 0.00029999388973360117, + "loss": 6.2126, + "step": 2179 + }, + { + "epoch": 0.2034151348325091, + "grad_norm": 1.4486325214178986, + "learning_rate": 0.0002999938212722368, + "loss": 6.0716, + "step": 2180 + }, + { + "epoch": 0.2035084445273864, + "grad_norm": 1.526880129021064, + "learning_rate": 0.0002999937524294825, + "loss": 6.0265, + "step": 2181 + }, + { + "epoch": 0.2036017542222637, + "grad_norm": 1.2697958013663335, + "learning_rate": 0.0002999936832053386, + "loss": 6.0815, + "step": 2182 + }, + { + "epoch": 0.20369506391714098, + "grad_norm": 3.5359594168715724, + "learning_rate": 0.0002999936135998051, + "loss": 5.7715, + "step": 2183 + }, + { + "epoch": 0.2037883736120183, + "grad_norm": 1.5876704093266427, + "learning_rate": 0.00029999354361288233, + "loss": 5.99, + "step": 2184 + }, + { + "epoch": 0.2038816833068956, + "grad_norm": 1.4253148452316955, + "learning_rate": 0.00029999347324457035, + "loss": 5.6337, + "step": 2185 + }, + { + "epoch": 0.20397499300177288, + "grad_norm": 1.3041463706854735, + "learning_rate": 0.00029999340249486944, + "loss": 6.1514, + "step": 2186 + }, + { + "epoch": 0.2040683026966502, + "grad_norm": 1.1109861792871376, + "learning_rate": 0.00029999333136377976, + "loss": 5.6521, + "step": 2187 + }, + { + "epoch": 0.20416161239152747, + "grad_norm": 1.2541845983811368, + "learning_rate": 0.0002999932598513014, + "loss": 5.8267, + "step": 2188 + }, + { + "epoch": 0.20425492208640478, + "grad_norm": 1.1296547728340849, + "learning_rate": 0.0002999931879574347, + "loss": 5.8768, + "step": 2189 + }, + { + "epoch": 0.20434823178128209, + "grad_norm": 1.3045108458353531, + "learning_rate": 0.0002999931156821797, + "loss": 6.077, + "step": 2190 + }, + { + "epoch": 0.20444154147615937, + "grad_norm": 1.11343156363241, + "learning_rate": 0.0002999930430255366, + "loss": 6.2446, + "step": 2191 + }, + { + "epoch": 0.20453485117103667, + "grad_norm": 1.0914936655077192, + "learning_rate": 0.0002999929699875057, + "loss": 6.0285, + "step": 2192 + }, + { + "epoch": 0.20462816086591396, + "grad_norm": 1.0680643245345778, + "learning_rate": 0.00029999289656808704, + "loss": 5.9385, + "step": 2193 + }, + { + "epoch": 0.20472147056079126, + "grad_norm": 2.8557852928994043, + "learning_rate": 0.0002999928227672809, + "loss": 6.1833, + "step": 2194 + }, + { + "epoch": 0.20481478025566857, + "grad_norm": 1.2361772991327546, + "learning_rate": 0.0002999927485850875, + "loss": 5.5344, + "step": 2195 + }, + { + "epoch": 0.20490808995054585, + "grad_norm": 1.162439272286462, + "learning_rate": 0.0002999926740215069, + "loss": 5.6808, + "step": 2196 + }, + { + "epoch": 0.20500139964542316, + "grad_norm": 1.5274374898447005, + "learning_rate": 0.0002999925990765394, + "loss": 6.1681, + "step": 2197 + }, + { + "epoch": 0.20509470934030047, + "grad_norm": 1.1367206173003637, + "learning_rate": 0.00029999252375018516, + "loss": 6.2999, + "step": 2198 + }, + { + "epoch": 0.20518801903517775, + "grad_norm": 1.399226434131158, + "learning_rate": 0.00029999244804244436, + "loss": 5.7037, + "step": 2199 + }, + { + "epoch": 0.20528132873005506, + "grad_norm": 1.241605255037183, + "learning_rate": 0.0002999923719533171, + "loss": 6.1575, + "step": 2200 + }, + { + "epoch": 0.20537463842493234, + "grad_norm": 1.2530253391720145, + "learning_rate": 0.0002999922954828038, + "loss": 5.7164, + "step": 2201 + }, + { + "epoch": 0.20546794811980965, + "grad_norm": 1.1181270530267546, + "learning_rate": 0.00029999221863090446, + "loss": 6.1511, + "step": 2202 + }, + { + "epoch": 0.20556125781468695, + "grad_norm": 1.1921231210260212, + "learning_rate": 0.00029999214139761937, + "loss": 6.1196, + "step": 2203 + }, + { + "epoch": 0.20565456750956423, + "grad_norm": 1.093919262245409, + "learning_rate": 0.00029999206378294866, + "loss": 6.2215, + "step": 2204 + }, + { + "epoch": 0.20574787720444154, + "grad_norm": 1.3851651761897354, + "learning_rate": 0.0002999919857868926, + "loss": 5.7342, + "step": 2205 + }, + { + "epoch": 0.20584118689931885, + "grad_norm": 1.436509320952605, + "learning_rate": 0.00029999190740945134, + "loss": 6.2471, + "step": 2206 + }, + { + "epoch": 0.20593449659419613, + "grad_norm": 1.3320447608371617, + "learning_rate": 0.00029999182865062505, + "loss": 5.9345, + "step": 2207 + }, + { + "epoch": 0.20602780628907344, + "grad_norm": 1.1734561123398901, + "learning_rate": 0.000299991749510414, + "loss": 5.976, + "step": 2208 + }, + { + "epoch": 0.20612111598395072, + "grad_norm": 1.3130419565680718, + "learning_rate": 0.0002999916699888184, + "loss": 5.9101, + "step": 2209 + }, + { + "epoch": 0.20621442567882803, + "grad_norm": 1.0867235719651092, + "learning_rate": 0.00029999159008583837, + "loss": 6.0996, + "step": 2210 + }, + { + "epoch": 0.20630773537370534, + "grad_norm": 3.5728103864857395, + "learning_rate": 0.00029999150980147414, + "loss": 5.8291, + "step": 2211 + }, + { + "epoch": 0.20640104506858262, + "grad_norm": 1.3163217484610221, + "learning_rate": 0.00029999142913572597, + "loss": 5.7431, + "step": 2212 + }, + { + "epoch": 0.20649435476345993, + "grad_norm": 1.2018738222110583, + "learning_rate": 0.000299991348088594, + "loss": 5.6963, + "step": 2213 + }, + { + "epoch": 0.20658766445833723, + "grad_norm": 1.952797689915655, + "learning_rate": 0.00029999126666007845, + "loss": 5.4401, + "step": 2214 + }, + { + "epoch": 0.20668097415321451, + "grad_norm": 1.3525636905439373, + "learning_rate": 0.0002999911848501796, + "loss": 6.0903, + "step": 2215 + }, + { + "epoch": 0.20677428384809182, + "grad_norm": 1.2845893818500624, + "learning_rate": 0.00029999110265889754, + "loss": 5.6709, + "step": 2216 + }, + { + "epoch": 0.2068675935429691, + "grad_norm": 4.535632764730783, + "learning_rate": 0.0002999910200862326, + "loss": 5.5707, + "step": 2217 + }, + { + "epoch": 0.2069609032378464, + "grad_norm": 8.436747356669935, + "learning_rate": 0.0002999909371321849, + "loss": 5.2191, + "step": 2218 + }, + { + "epoch": 0.20705421293272372, + "grad_norm": 1.1062734165418047, + "learning_rate": 0.00029999085379675463, + "loss": 5.9084, + "step": 2219 + }, + { + "epoch": 0.207147522627601, + "grad_norm": 1.3229230169751185, + "learning_rate": 0.00029999077007994207, + "loss": 4.7673, + "step": 2220 + }, + { + "epoch": 0.2072408323224783, + "grad_norm": 1.630778484594659, + "learning_rate": 0.00029999068598174746, + "loss": 6.0031, + "step": 2221 + }, + { + "epoch": 0.20733414201735562, + "grad_norm": 1.9209486317800277, + "learning_rate": 0.0002999906015021709, + "loss": 6.1508, + "step": 2222 + }, + { + "epoch": 0.2074274517122329, + "grad_norm": 1.306398918127076, + "learning_rate": 0.0002999905166412127, + "loss": 5.805, + "step": 2223 + }, + { + "epoch": 0.2075207614071102, + "grad_norm": 1.5701895217440929, + "learning_rate": 0.0002999904313988731, + "loss": 6.0277, + "step": 2224 + }, + { + "epoch": 0.20761407110198749, + "grad_norm": 1.4469577644986427, + "learning_rate": 0.00029999034577515214, + "loss": 6.0644, + "step": 2225 + }, + { + "epoch": 0.2077073807968648, + "grad_norm": 1.3571841757180745, + "learning_rate": 0.00029999025977005023, + "loss": 6.0722, + "step": 2226 + }, + { + "epoch": 0.2078006904917421, + "grad_norm": 1.088448021490351, + "learning_rate": 0.0002999901733835675, + "loss": 6.0147, + "step": 2227 + }, + { + "epoch": 0.20789400018661938, + "grad_norm": 1.85466250499084, + "learning_rate": 0.0002999900866157042, + "loss": 6.405, + "step": 2228 + }, + { + "epoch": 0.2079873098814967, + "grad_norm": 1.2656842160767612, + "learning_rate": 0.0002999899994664606, + "loss": 6.0077, + "step": 2229 + }, + { + "epoch": 0.208080619576374, + "grad_norm": 1.5043179673556863, + "learning_rate": 0.0002999899119358368, + "loss": 5.8834, + "step": 2230 + }, + { + "epoch": 0.20817392927125128, + "grad_norm": 1.9996722030456808, + "learning_rate": 0.0002999898240238331, + "loss": 6.0366, + "step": 2231 + }, + { + "epoch": 0.2082672389661286, + "grad_norm": 1.1953015057106597, + "learning_rate": 0.0002999897357304497, + "loss": 5.9387, + "step": 2232 + }, + { + "epoch": 0.20836054866100587, + "grad_norm": 1.5910949085498083, + "learning_rate": 0.0002999896470556869, + "loss": 6.174, + "step": 2233 + }, + { + "epoch": 0.20845385835588318, + "grad_norm": 1.299994015627141, + "learning_rate": 0.00029998955799954475, + "loss": 6.137, + "step": 2234 + }, + { + "epoch": 0.20854716805076048, + "grad_norm": 1.47697851943804, + "learning_rate": 0.0002999894685620236, + "loss": 5.3717, + "step": 2235 + }, + { + "epoch": 0.20864047774563776, + "grad_norm": 1.4524993037268392, + "learning_rate": 0.0002999893787431237, + "loss": 5.8715, + "step": 2236 + }, + { + "epoch": 0.20873378744051507, + "grad_norm": 2.7174123710879137, + "learning_rate": 0.00029998928854284523, + "loss": 5.5648, + "step": 2237 + }, + { + "epoch": 0.20882709713539238, + "grad_norm": 1.359071996283176, + "learning_rate": 0.0002999891979611884, + "loss": 5.9431, + "step": 2238 + }, + { + "epoch": 0.20892040683026966, + "grad_norm": 1.446710846363339, + "learning_rate": 0.0002999891069981536, + "loss": 6.1742, + "step": 2239 + }, + { + "epoch": 0.20901371652514697, + "grad_norm": 1.612774969486751, + "learning_rate": 0.0002999890156537408, + "loss": 5.9644, + "step": 2240 + }, + { + "epoch": 0.20910702622002425, + "grad_norm": 1.2704595046262268, + "learning_rate": 0.00029998892392795037, + "loss": 5.8638, + "step": 2241 + }, + { + "epoch": 0.20920033591490156, + "grad_norm": 1.1259526262697852, + "learning_rate": 0.00029998883182078256, + "loss": 6.0094, + "step": 2242 + }, + { + "epoch": 0.20929364560977887, + "grad_norm": 1.1025643813991932, + "learning_rate": 0.0002999887393322376, + "loss": 5.8507, + "step": 2243 + }, + { + "epoch": 0.20938695530465615, + "grad_norm": 1.2420121396846224, + "learning_rate": 0.0002999886464623157, + "loss": 5.8634, + "step": 2244 + }, + { + "epoch": 0.20948026499953346, + "grad_norm": 1.5684664902711691, + "learning_rate": 0.00029998855321101704, + "loss": 6.2892, + "step": 2245 + }, + { + "epoch": 0.20957357469441074, + "grad_norm": 1.2253850232781722, + "learning_rate": 0.000299988459578342, + "loss": 6.1869, + "step": 2246 + }, + { + "epoch": 0.20966688438928804, + "grad_norm": 1.0416975693490347, + "learning_rate": 0.0002999883655642907, + "loss": 5.8833, + "step": 2247 + }, + { + "epoch": 0.20976019408416535, + "grad_norm": 1.064324457293611, + "learning_rate": 0.0002999882711688634, + "loss": 5.892, + "step": 2248 + }, + { + "epoch": 0.20985350377904263, + "grad_norm": 1.073907576714571, + "learning_rate": 0.0002999881763920604, + "loss": 5.5515, + "step": 2249 + }, + { + "epoch": 0.20994681347391994, + "grad_norm": 1.3361316988851477, + "learning_rate": 0.0002999880812338819, + "loss": 6.15, + "step": 2250 + }, + { + "epoch": 0.21004012316879725, + "grad_norm": 1.3231731966455957, + "learning_rate": 0.00029998798569432807, + "loss": 6.23, + "step": 2251 + }, + { + "epoch": 0.21013343286367453, + "grad_norm": 1.4817429110381946, + "learning_rate": 0.00029998788977339926, + "loss": 6.1857, + "step": 2252 + }, + { + "epoch": 0.21022674255855184, + "grad_norm": 1.1031072129356378, + "learning_rate": 0.0002999877934710957, + "loss": 5.417, + "step": 2253 + }, + { + "epoch": 0.21032005225342912, + "grad_norm": 1.054753393199807, + "learning_rate": 0.00029998769678741757, + "loss": 5.5232, + "step": 2254 + }, + { + "epoch": 0.21041336194830643, + "grad_norm": 1.364489971659812, + "learning_rate": 0.0002999875997223652, + "loss": 5.491, + "step": 2255 + }, + { + "epoch": 0.21050667164318373, + "grad_norm": 1.42920742026732, + "learning_rate": 0.0002999875022759388, + "loss": 5.4464, + "step": 2256 + }, + { + "epoch": 0.21059998133806102, + "grad_norm": 1.075090812083121, + "learning_rate": 0.0002999874044481386, + "loss": 5.8363, + "step": 2257 + }, + { + "epoch": 0.21069329103293832, + "grad_norm": 1.454765633566013, + "learning_rate": 0.0002999873062389648, + "loss": 5.8505, + "step": 2258 + }, + { + "epoch": 0.21078660072781563, + "grad_norm": 1.202493794689062, + "learning_rate": 0.00029998720764841776, + "loss": 5.8243, + "step": 2259 + }, + { + "epoch": 0.2108799104226929, + "grad_norm": 1.4531535616231077, + "learning_rate": 0.0002999871086764977, + "loss": 6.1198, + "step": 2260 + }, + { + "epoch": 0.21097322011757022, + "grad_norm": 1.148710140960721, + "learning_rate": 0.0002999870093232048, + "loss": 6.268, + "step": 2261 + }, + { + "epoch": 0.2110665298124475, + "grad_norm": 1.1956060496747198, + "learning_rate": 0.0002999869095885394, + "loss": 6.1719, + "step": 2262 + }, + { + "epoch": 0.2111598395073248, + "grad_norm": 1.3480196608858523, + "learning_rate": 0.0002999868094725017, + "loss": 6.0758, + "step": 2263 + }, + { + "epoch": 0.21125314920220212, + "grad_norm": 1.178642318556221, + "learning_rate": 0.000299986708975092, + "loss": 5.9204, + "step": 2264 + }, + { + "epoch": 0.2113464588970794, + "grad_norm": 1.1707542477866295, + "learning_rate": 0.0002999866080963105, + "loss": 6.0304, + "step": 2265 + }, + { + "epoch": 0.2114397685919567, + "grad_norm": 1.1271677011758183, + "learning_rate": 0.0002999865068361575, + "loss": 6.136, + "step": 2266 + }, + { + "epoch": 0.21153307828683401, + "grad_norm": 1.0458133090341133, + "learning_rate": 0.00029998640519463324, + "loss": 5.9669, + "step": 2267 + }, + { + "epoch": 0.2116263879817113, + "grad_norm": 1.099591924558181, + "learning_rate": 0.000299986303171738, + "loss": 6.2652, + "step": 2268 + }, + { + "epoch": 0.2117196976765886, + "grad_norm": 1.3581282518137423, + "learning_rate": 0.000299986200767472, + "loss": 6.224, + "step": 2269 + }, + { + "epoch": 0.21181300737146588, + "grad_norm": 1.0154468578781441, + "learning_rate": 0.0002999860979818355, + "loss": 5.9569, + "step": 2270 + }, + { + "epoch": 0.2119063170663432, + "grad_norm": 1.2059204187467252, + "learning_rate": 0.0002999859948148288, + "loss": 5.7112, + "step": 2271 + }, + { + "epoch": 0.2119996267612205, + "grad_norm": 1.1046207873252916, + "learning_rate": 0.00029998589126645214, + "loss": 5.7531, + "step": 2272 + }, + { + "epoch": 0.21209293645609778, + "grad_norm": 1.0942042520700759, + "learning_rate": 0.00029998578733670577, + "loss": 5.9397, + "step": 2273 + }, + { + "epoch": 0.2121862461509751, + "grad_norm": 1.076236141060183, + "learning_rate": 0.00029998568302559003, + "loss": 5.9048, + "step": 2274 + }, + { + "epoch": 0.2122795558458524, + "grad_norm": 1.2351857770620351, + "learning_rate": 0.0002999855783331051, + "loss": 6.261, + "step": 2275 + }, + { + "epoch": 0.21237286554072968, + "grad_norm": 1.1468146716454344, + "learning_rate": 0.00029998547325925124, + "loss": 5.9077, + "step": 2276 + }, + { + "epoch": 0.21246617523560699, + "grad_norm": 1.152483024749221, + "learning_rate": 0.0002999853678040288, + "loss": 5.6507, + "step": 2277 + }, + { + "epoch": 0.21255948493048427, + "grad_norm": 1.2536625432137443, + "learning_rate": 0.00029998526196743793, + "loss": 5.8022, + "step": 2278 + }, + { + "epoch": 0.21265279462536157, + "grad_norm": 1.2178222425630718, + "learning_rate": 0.0002999851557494791, + "loss": 6.024, + "step": 2279 + }, + { + "epoch": 0.21274610432023888, + "grad_norm": 1.2040286187663884, + "learning_rate": 0.00029998504915015227, + "loss": 5.864, + "step": 2280 + }, + { + "epoch": 0.21283941401511616, + "grad_norm": 1.2592923679340107, + "learning_rate": 0.000299984942169458, + "loss": 5.6015, + "step": 2281 + }, + { + "epoch": 0.21293272370999347, + "grad_norm": 1.2785624703021583, + "learning_rate": 0.00029998483480739643, + "loss": 5.7276, + "step": 2282 + }, + { + "epoch": 0.21302603340487078, + "grad_norm": 1.2495385983675529, + "learning_rate": 0.0002999847270639679, + "loss": 5.9391, + "step": 2283 + }, + { + "epoch": 0.21311934309974806, + "grad_norm": 1.6819503744838766, + "learning_rate": 0.0002999846189391726, + "loss": 6.0695, + "step": 2284 + }, + { + "epoch": 0.21321265279462537, + "grad_norm": 1.2629442202028558, + "learning_rate": 0.0002999845104330108, + "loss": 5.7188, + "step": 2285 + }, + { + "epoch": 0.21330596248950265, + "grad_norm": 1.1533524262872659, + "learning_rate": 0.0002999844015454829, + "loss": 6.1225, + "step": 2286 + }, + { + "epoch": 0.21339927218437996, + "grad_norm": 1.2423923908676955, + "learning_rate": 0.000299984292276589, + "loss": 5.9117, + "step": 2287 + }, + { + "epoch": 0.21349258187925726, + "grad_norm": 1.1655435826928453, + "learning_rate": 0.0002999841826263295, + "loss": 6.23, + "step": 2288 + }, + { + "epoch": 0.21358589157413455, + "grad_norm": 1.1506841886533126, + "learning_rate": 0.0002999840725947047, + "loss": 6.2612, + "step": 2289 + }, + { + "epoch": 0.21367920126901185, + "grad_norm": 1.4100686569065704, + "learning_rate": 0.0002999839621817148, + "loss": 5.8243, + "step": 2290 + }, + { + "epoch": 0.21377251096388916, + "grad_norm": 1.1172441914125253, + "learning_rate": 0.00029998385138736017, + "loss": 6.0003, + "step": 2291 + }, + { + "epoch": 0.21386582065876644, + "grad_norm": 1.4757910820531686, + "learning_rate": 0.00029998374021164093, + "loss": 5.5547, + "step": 2292 + }, + { + "epoch": 0.21395913035364375, + "grad_norm": 1.696033693951165, + "learning_rate": 0.00029998362865455753, + "loss": 5.8947, + "step": 2293 + }, + { + "epoch": 0.21405244004852103, + "grad_norm": 1.185972574044201, + "learning_rate": 0.00029998351671611014, + "loss": 5.439, + "step": 2294 + }, + { + "epoch": 0.21414574974339834, + "grad_norm": 1.3072718355710577, + "learning_rate": 0.0002999834043962992, + "loss": 5.8078, + "step": 2295 + }, + { + "epoch": 0.21423905943827565, + "grad_norm": 1.3003609187470826, + "learning_rate": 0.0002999832916951248, + "loss": 5.6851, + "step": 2296 + }, + { + "epoch": 0.21433236913315293, + "grad_norm": 1.6302167287754383, + "learning_rate": 0.00029998317861258737, + "loss": 6.2202, + "step": 2297 + }, + { + "epoch": 0.21442567882803024, + "grad_norm": 1.2477608367656219, + "learning_rate": 0.0002999830651486871, + "loss": 5.8802, + "step": 2298 + }, + { + "epoch": 0.21451898852290752, + "grad_norm": 7.080502144855866, + "learning_rate": 0.00029998295130342434, + "loss": 5.2941, + "step": 2299 + }, + { + "epoch": 0.21461229821778482, + "grad_norm": 1.1282443709611751, + "learning_rate": 0.0002999828370767994, + "loss": 5.8114, + "step": 2300 + }, + { + "epoch": 0.21470560791266213, + "grad_norm": 1.3204850512365494, + "learning_rate": 0.0002999827224688125, + "loss": 6.4581, + "step": 2301 + }, + { + "epoch": 0.2147989176075394, + "grad_norm": 1.3516428070454896, + "learning_rate": 0.00029998260747946394, + "loss": 5.5246, + "step": 2302 + }, + { + "epoch": 0.21489222730241672, + "grad_norm": 1.4526577803304948, + "learning_rate": 0.00029998249210875404, + "loss": 6.1383, + "step": 2303 + }, + { + "epoch": 0.21498553699729403, + "grad_norm": 1.1246072799771925, + "learning_rate": 0.00029998237635668314, + "loss": 5.5172, + "step": 2304 + }, + { + "epoch": 0.2150788466921713, + "grad_norm": 1.1117648224896757, + "learning_rate": 0.0002999822602232515, + "loss": 5.9421, + "step": 2305 + }, + { + "epoch": 0.21517215638704862, + "grad_norm": 1.111686835100085, + "learning_rate": 0.0002999821437084593, + "loss": 5.5402, + "step": 2306 + }, + { + "epoch": 0.2152654660819259, + "grad_norm": 1.1817331768980646, + "learning_rate": 0.00029998202681230706, + "loss": 5.922, + "step": 2307 + }, + { + "epoch": 0.2153587757768032, + "grad_norm": 2.101432451278081, + "learning_rate": 0.00029998190953479486, + "loss": 5.1333, + "step": 2308 + }, + { + "epoch": 0.21545208547168052, + "grad_norm": 1.4383599245408085, + "learning_rate": 0.0002999817918759231, + "loss": 6.1352, + "step": 2309 + }, + { + "epoch": 0.2155453951665578, + "grad_norm": 1.223653810817536, + "learning_rate": 0.0002999816738356921, + "loss": 5.8165, + "step": 2310 + }, + { + "epoch": 0.2156387048614351, + "grad_norm": 7.910073538752316, + "learning_rate": 0.00029998155541410213, + "loss": 5.5969, + "step": 2311 + }, + { + "epoch": 0.2157320145563124, + "grad_norm": 1.776597431017663, + "learning_rate": 0.0002999814366111535, + "loss": 6.0012, + "step": 2312 + }, + { + "epoch": 0.2158253242511897, + "grad_norm": 1.4755419716801315, + "learning_rate": 0.00029998131742684647, + "loss": 6.028, + "step": 2313 + }, + { + "epoch": 0.215918633946067, + "grad_norm": 1.3309465025055276, + "learning_rate": 0.00029998119786118143, + "loss": 6.244, + "step": 2314 + }, + { + "epoch": 0.21601194364094428, + "grad_norm": 1.2385104743401611, + "learning_rate": 0.0002999810779141586, + "loss": 6.0506, + "step": 2315 + }, + { + "epoch": 0.2161052533358216, + "grad_norm": 1.0946274302951793, + "learning_rate": 0.0002999809575857783, + "loss": 5.5115, + "step": 2316 + }, + { + "epoch": 0.2161985630306989, + "grad_norm": 1.469996876217754, + "learning_rate": 0.00029998083687604084, + "loss": 6.0129, + "step": 2317 + }, + { + "epoch": 0.21629187272557618, + "grad_norm": 1.0512471555255565, + "learning_rate": 0.0002999807157849466, + "loss": 5.9509, + "step": 2318 + }, + { + "epoch": 0.2163851824204535, + "grad_norm": 1.2030517032421582, + "learning_rate": 0.0002999805943124958, + "loss": 6.163, + "step": 2319 + }, + { + "epoch": 0.2164784921153308, + "grad_norm": 3.3051838229960104, + "learning_rate": 0.00029998047245868875, + "loss": 5.8708, + "step": 2320 + }, + { + "epoch": 0.21657180181020808, + "grad_norm": 1.0418418418769129, + "learning_rate": 0.00029998035022352584, + "loss": 6.1412, + "step": 2321 + }, + { + "epoch": 0.21666511150508538, + "grad_norm": 1.389429300989522, + "learning_rate": 0.0002999802276070073, + "loss": 5.9264, + "step": 2322 + }, + { + "epoch": 0.21675842119996266, + "grad_norm": 0.9556932110575043, + "learning_rate": 0.0002999801046091334, + "loss": 5.5712, + "step": 2323 + }, + { + "epoch": 0.21685173089483997, + "grad_norm": 1.0609415083268652, + "learning_rate": 0.00029997998122990464, + "loss": 5.7946, + "step": 2324 + }, + { + "epoch": 0.21694504058971728, + "grad_norm": 19.245905840800237, + "learning_rate": 0.0002999798574693211, + "loss": 5.4809, + "step": 2325 + }, + { + "epoch": 0.21703835028459456, + "grad_norm": 2.135086378145173, + "learning_rate": 0.0002999797333273833, + "loss": 5.3364, + "step": 2326 + }, + { + "epoch": 0.21713165997947187, + "grad_norm": 1.0928445508410345, + "learning_rate": 0.0002999796088040914, + "loss": 5.7875, + "step": 2327 + }, + { + "epoch": 0.21722496967434918, + "grad_norm": 1.3153713084871994, + "learning_rate": 0.00029997948389944587, + "loss": 6.0692, + "step": 2328 + }, + { + "epoch": 0.21731827936922646, + "grad_norm": 1.5978484035936622, + "learning_rate": 0.00029997935861344685, + "loss": 6.0489, + "step": 2329 + }, + { + "epoch": 0.21741158906410377, + "grad_norm": 1.1973026218956528, + "learning_rate": 0.0002999792329460948, + "loss": 6.1265, + "step": 2330 + }, + { + "epoch": 0.21750489875898105, + "grad_norm": 2.2605406442342137, + "learning_rate": 0.00029997910689739, + "loss": 6.0076, + "step": 2331 + }, + { + "epoch": 0.21759820845385835, + "grad_norm": 1.2881412712736473, + "learning_rate": 0.0002999789804673327, + "loss": 6.0024, + "step": 2332 + }, + { + "epoch": 0.21769151814873566, + "grad_norm": 1.246544007410345, + "learning_rate": 0.0002999788536559233, + "loss": 5.9648, + "step": 2333 + }, + { + "epoch": 0.21778482784361294, + "grad_norm": 14.652554862575833, + "learning_rate": 0.00029997872646316215, + "loss": 6.0103, + "step": 2334 + }, + { + "epoch": 0.21787813753849025, + "grad_norm": 5.335263523700777, + "learning_rate": 0.00029997859888904947, + "loss": 5.5568, + "step": 2335 + }, + { + "epoch": 0.21797144723336756, + "grad_norm": 1.702090254650533, + "learning_rate": 0.00029997847093358565, + "loss": 5.7658, + "step": 2336 + }, + { + "epoch": 0.21806475692824484, + "grad_norm": 1.0139689572533086, + "learning_rate": 0.000299978342596771, + "loss": 6.1213, + "step": 2337 + }, + { + "epoch": 0.21815806662312215, + "grad_norm": 1.3444203974022033, + "learning_rate": 0.0002999782138786059, + "loss": 5.3763, + "step": 2338 + }, + { + "epoch": 0.21825137631799943, + "grad_norm": 1.5949238977791682, + "learning_rate": 0.0002999780847790906, + "loss": 5.9408, + "step": 2339 + }, + { + "epoch": 0.21834468601287674, + "grad_norm": 1.7630621824756254, + "learning_rate": 0.0002999779552982254, + "loss": 5.9796, + "step": 2340 + }, + { + "epoch": 0.21843799570775405, + "grad_norm": 3.109832164916591, + "learning_rate": 0.00029997782543601077, + "loss": 5.8506, + "step": 2341 + }, + { + "epoch": 0.21853130540263133, + "grad_norm": 1.8776716662279411, + "learning_rate": 0.00029997769519244695, + "loss": 6.2039, + "step": 2342 + }, + { + "epoch": 0.21862461509750863, + "grad_norm": 1.1255993691009958, + "learning_rate": 0.0002999775645675342, + "loss": 6.0442, + "step": 2343 + }, + { + "epoch": 0.21871792479238594, + "grad_norm": 1.5516055891970666, + "learning_rate": 0.00029997743356127303, + "loss": 5.8639, + "step": 2344 + }, + { + "epoch": 0.21881123448726322, + "grad_norm": 1.0862440571614163, + "learning_rate": 0.00029997730217366363, + "loss": 5.7047, + "step": 2345 + }, + { + "epoch": 0.21890454418214053, + "grad_norm": 1.1237795783795774, + "learning_rate": 0.00029997717040470634, + "loss": 5.9811, + "step": 2346 + }, + { + "epoch": 0.2189978538770178, + "grad_norm": 1.392641819642867, + "learning_rate": 0.0002999770382544016, + "loss": 6.2995, + "step": 2347 + }, + { + "epoch": 0.21909116357189512, + "grad_norm": 1.6918238387123095, + "learning_rate": 0.0002999769057227496, + "loss": 5.969, + "step": 2348 + }, + { + "epoch": 0.21918447326677243, + "grad_norm": 1.5737229477325294, + "learning_rate": 0.0002999767728097508, + "loss": 5.996, + "step": 2349 + }, + { + "epoch": 0.2192777829616497, + "grad_norm": 1.3704167683114452, + "learning_rate": 0.0002999766395154055, + "loss": 6.0282, + "step": 2350 + }, + { + "epoch": 0.21937109265652702, + "grad_norm": 1.2306571839114124, + "learning_rate": 0.00029997650583971406, + "loss": 5.8665, + "step": 2351 + }, + { + "epoch": 0.2194644023514043, + "grad_norm": 1.3166309882901572, + "learning_rate": 0.0002999763717826767, + "loss": 5.7506, + "step": 2352 + }, + { + "epoch": 0.2195577120462816, + "grad_norm": 1.047533009750917, + "learning_rate": 0.00029997623734429393, + "loss": 5.7523, + "step": 2353 + }, + { + "epoch": 0.2196510217411589, + "grad_norm": 1.5900837930232263, + "learning_rate": 0.00029997610252456596, + "loss": 6.0532, + "step": 2354 + }, + { + "epoch": 0.2197443314360362, + "grad_norm": 1.8780791296701962, + "learning_rate": 0.00029997596732349324, + "loss": 6.0131, + "step": 2355 + }, + { + "epoch": 0.2198376411309135, + "grad_norm": 1.2216963767604072, + "learning_rate": 0.00029997583174107604, + "loss": 6.0455, + "step": 2356 + }, + { + "epoch": 0.2199309508257908, + "grad_norm": 1.8701443172577439, + "learning_rate": 0.00029997569577731474, + "loss": 5.8391, + "step": 2357 + }, + { + "epoch": 0.2200242605206681, + "grad_norm": 1.5448710784533959, + "learning_rate": 0.00029997555943220966, + "loss": 6.1203, + "step": 2358 + }, + { + "epoch": 0.2201175702155454, + "grad_norm": 3.7658083425829796, + "learning_rate": 0.00029997542270576114, + "loss": 6.1425, + "step": 2359 + }, + { + "epoch": 0.22021087991042268, + "grad_norm": 3.0370531750518, + "learning_rate": 0.0002999752855979696, + "loss": 5.5237, + "step": 2360 + }, + { + "epoch": 0.2203041896053, + "grad_norm": 1.8322348217948672, + "learning_rate": 0.00029997514810883527, + "loss": 5.9891, + "step": 2361 + }, + { + "epoch": 0.2203974993001773, + "grad_norm": 1.9975360609866166, + "learning_rate": 0.0002999750102383586, + "loss": 5.2432, + "step": 2362 + }, + { + "epoch": 0.22049080899505458, + "grad_norm": 5.69166749925803, + "learning_rate": 0.0002999748719865399, + "loss": 6.3697, + "step": 2363 + }, + { + "epoch": 0.22058411868993189, + "grad_norm": 2.3392820798541245, + "learning_rate": 0.00029997473335337954, + "loss": 6.1211, + "step": 2364 + }, + { + "epoch": 0.2206774283848092, + "grad_norm": 1.4323366786109664, + "learning_rate": 0.00029997459433887786, + "loss": 6.0105, + "step": 2365 + }, + { + "epoch": 0.22077073807968647, + "grad_norm": 2.4817008732538133, + "learning_rate": 0.0002999744549430352, + "loss": 6.4146, + "step": 2366 + }, + { + "epoch": 0.22086404777456378, + "grad_norm": 1.8870573944348, + "learning_rate": 0.0002999743151658519, + "loss": 6.063, + "step": 2367 + }, + { + "epoch": 0.22095735746944106, + "grad_norm": 3.986126148732609, + "learning_rate": 0.00029997417500732834, + "loss": 6.0323, + "step": 2368 + }, + { + "epoch": 0.22105066716431837, + "grad_norm": 2.9173701559575917, + "learning_rate": 0.0002999740344674649, + "loss": 5.8188, + "step": 2369 + }, + { + "epoch": 0.22114397685919568, + "grad_norm": 5.674413088696671, + "learning_rate": 0.0002999738935462619, + "loss": 6.1835, + "step": 2370 + }, + { + "epoch": 0.22123728655407296, + "grad_norm": 1.913842552975384, + "learning_rate": 0.00029997375224371977, + "loss": 6.4373, + "step": 2371 + }, + { + "epoch": 0.22133059624895027, + "grad_norm": 2.7521450218479195, + "learning_rate": 0.0002999736105598387, + "loss": 6.0763, + "step": 2372 + }, + { + "epoch": 0.22142390594382758, + "grad_norm": 3.582263436565503, + "learning_rate": 0.0002999734684946193, + "loss": 6.2251, + "step": 2373 + }, + { + "epoch": 0.22151721563870486, + "grad_norm": 1.2586182309085083, + "learning_rate": 0.00029997332604806174, + "loss": 5.6096, + "step": 2374 + }, + { + "epoch": 0.22161052533358216, + "grad_norm": 1.6765844383307176, + "learning_rate": 0.00029997318322016645, + "loss": 5.5868, + "step": 2375 + }, + { + "epoch": 0.22170383502845945, + "grad_norm": 1.2533759814382288, + "learning_rate": 0.0002999730400109338, + "loss": 5.7124, + "step": 2376 + }, + { + "epoch": 0.22179714472333675, + "grad_norm": 1.117415984533133, + "learning_rate": 0.00029997289642036407, + "loss": 5.9727, + "step": 2377 + }, + { + "epoch": 0.22189045441821406, + "grad_norm": 1.996396972099825, + "learning_rate": 0.0002999727524484577, + "loss": 6.1614, + "step": 2378 + }, + { + "epoch": 0.22198376411309134, + "grad_norm": 52.443159810013846, + "learning_rate": 0.00029997260809521505, + "loss": 5.4705, + "step": 2379 + }, + { + "epoch": 0.22207707380796865, + "grad_norm": 1.5747088841028405, + "learning_rate": 0.0002999724633606365, + "loss": 5.9144, + "step": 2380 + }, + { + "epoch": 0.22217038350284596, + "grad_norm": 1.2900576597390851, + "learning_rate": 0.0002999723182447224, + "loss": 5.975, + "step": 2381 + }, + { + "epoch": 0.22226369319772324, + "grad_norm": 2.1927946313449325, + "learning_rate": 0.00029997217274747314, + "loss": 6.2596, + "step": 2382 + }, + { + "epoch": 0.22235700289260055, + "grad_norm": 2.1328371375940933, + "learning_rate": 0.000299972026868889, + "loss": 6.0064, + "step": 2383 + }, + { + "epoch": 0.22245031258747783, + "grad_norm": 2.308498569969896, + "learning_rate": 0.0002999718806089705, + "loss": 5.8147, + "step": 2384 + }, + { + "epoch": 0.22254362228235514, + "grad_norm": 5.2621817241370765, + "learning_rate": 0.0002999717339677179, + "loss": 6.0111, + "step": 2385 + }, + { + "epoch": 0.22263693197723244, + "grad_norm": 7.415600054592513, + "learning_rate": 0.0002999715869451316, + "loss": 6.524, + "step": 2386 + }, + { + "epoch": 0.22273024167210972, + "grad_norm": 2.5322688468881527, + "learning_rate": 0.00029997143954121197, + "loss": 6.2136, + "step": 2387 + }, + { + "epoch": 0.22282355136698703, + "grad_norm": 3.029120720205851, + "learning_rate": 0.0002999712917559594, + "loss": 6.2928, + "step": 2388 + }, + { + "epoch": 0.22291686106186434, + "grad_norm": 1.7929571093617456, + "learning_rate": 0.0002999711435893743, + "loss": 6.1004, + "step": 2389 + }, + { + "epoch": 0.22301017075674162, + "grad_norm": 8.61899497029606, + "learning_rate": 0.00029997099504145693, + "loss": 6.0823, + "step": 2390 + }, + { + "epoch": 0.22310348045161893, + "grad_norm": 30.26955246752881, + "learning_rate": 0.0002999708461122078, + "loss": 5.998, + "step": 2391 + }, + { + "epoch": 0.2231967901464962, + "grad_norm": 4.389663279115934, + "learning_rate": 0.00029997069680162724, + "loss": 6.0267, + "step": 2392 + }, + { + "epoch": 0.22329009984137352, + "grad_norm": 4.893102947424498, + "learning_rate": 0.0002999705471097156, + "loss": 6.7549, + "step": 2393 + }, + { + "epoch": 0.22338340953625083, + "grad_norm": 4.01844946219581, + "learning_rate": 0.0002999703970364733, + "loss": 6.1053, + "step": 2394 + }, + { + "epoch": 0.2234767192311281, + "grad_norm": 11.966202179094699, + "learning_rate": 0.00029997024658190067, + "loss": 6.8683, + "step": 2395 + }, + { + "epoch": 0.22357002892600542, + "grad_norm": 3.6421814311729777, + "learning_rate": 0.0002999700957459982, + "loss": 6.4086, + "step": 2396 + }, + { + "epoch": 0.22366333862088272, + "grad_norm": 6.173799751748622, + "learning_rate": 0.0002999699445287661, + "loss": 5.441, + "step": 2397 + }, + { + "epoch": 0.22375664831576, + "grad_norm": 8.416846250416544, + "learning_rate": 0.0002999697929302049, + "loss": 6.7128, + "step": 2398 + }, + { + "epoch": 0.2238499580106373, + "grad_norm": 4.620282227732891, + "learning_rate": 0.000299969640950315, + "loss": 6.6443, + "step": 2399 + }, + { + "epoch": 0.2239432677055146, + "grad_norm": 61.94268290622032, + "learning_rate": 0.0002999694885890966, + "loss": 5.8694, + "step": 2400 + }, + { + "epoch": 0.2240365774003919, + "grad_norm": 4.3660559957325855, + "learning_rate": 0.00029996933584655036, + "loss": 5.6445, + "step": 2401 + }, + { + "epoch": 0.2241298870952692, + "grad_norm": 9.914049285468511, + "learning_rate": 0.0002999691827226764, + "loss": 6.8899, + "step": 2402 + }, + { + "epoch": 0.2242231967901465, + "grad_norm": 11.468056298095233, + "learning_rate": 0.0002999690292174753, + "loss": 7.3821, + "step": 2403 + }, + { + "epoch": 0.2243165064850238, + "grad_norm": 6.80744427161086, + "learning_rate": 0.00029996887533094733, + "loss": 6.6091, + "step": 2404 + }, + { + "epoch": 0.22440981617990108, + "grad_norm": 3.808094068736954, + "learning_rate": 0.000299968721063093, + "loss": 6.7132, + "step": 2405 + }, + { + "epoch": 0.2245031258747784, + "grad_norm": 71.24555904007222, + "learning_rate": 0.0002999685664139126, + "loss": 6.8138, + "step": 2406 + }, + { + "epoch": 0.2245964355696557, + "grad_norm": 5.882579128275463, + "learning_rate": 0.00029996841138340657, + "loss": 7.1525, + "step": 2407 + }, + { + "epoch": 0.22468974526453298, + "grad_norm": 23.151361271755967, + "learning_rate": 0.0002999682559715753, + "loss": 7.7434, + "step": 2408 + }, + { + "epoch": 0.22478305495941028, + "grad_norm": 22.323746495552133, + "learning_rate": 0.0002999681001784192, + "loss": 7.3791, + "step": 2409 + }, + { + "epoch": 0.2248763646542876, + "grad_norm": 8.824314508299633, + "learning_rate": 0.0002999679440039386, + "loss": 7.884, + "step": 2410 + }, + { + "epoch": 0.22496967434916487, + "grad_norm": 713.0264758543179, + "learning_rate": 0.00029996778744813393, + "loss": 7.6328, + "step": 2411 + }, + { + "epoch": 0.22506298404404218, + "grad_norm": 826.3157407748029, + "learning_rate": 0.00029996763051100565, + "loss": 7.1886, + "step": 2412 + }, + { + "epoch": 0.22515629373891946, + "grad_norm": 7.449887291712951, + "learning_rate": 0.0002999674731925541, + "loss": 7.4979, + "step": 2413 + }, + { + "epoch": 0.22524960343379677, + "grad_norm": 6.955935469290764, + "learning_rate": 0.0002999673154927797, + "loss": 7.4705, + "step": 2414 + }, + { + "epoch": 0.22534291312867408, + "grad_norm": 5.720893706157774, + "learning_rate": 0.0002999671574116828, + "loss": 7.3472, + "step": 2415 + }, + { + "epoch": 0.22543622282355136, + "grad_norm": 6.246711081044645, + "learning_rate": 0.00029996699894926384, + "loss": 7.3019, + "step": 2416 + }, + { + "epoch": 0.22552953251842867, + "grad_norm": 82486.0036678095, + "learning_rate": 0.0002999668401055232, + "loss": 7.4829, + "step": 2417 + }, + { + "epoch": 0.22562284221330597, + "grad_norm": 3.2798974747680525, + "learning_rate": 0.0002999666808804614, + "loss": 7.6577, + "step": 2418 + }, + { + "epoch": 0.22571615190818325, + "grad_norm": 3.0618901604175823, + "learning_rate": 0.00029996652127407873, + "loss": 7.7076, + "step": 2419 + }, + { + "epoch": 0.22580946160306056, + "grad_norm": 1.4730173200895036, + "learning_rate": 0.0002999663612863756, + "loss": 7.4574, + "step": 2420 + }, + { + "epoch": 0.22590277129793784, + "grad_norm": 7.1585210780192075, + "learning_rate": 0.0002999662009173524, + "loss": 7.772, + "step": 2421 + }, + { + "epoch": 0.22599608099281515, + "grad_norm": 3.8536567309883725, + "learning_rate": 0.0002999660401670096, + "loss": 7.9115, + "step": 2422 + }, + { + "epoch": 0.22608939068769246, + "grad_norm": 2.719173902914598, + "learning_rate": 0.00029996587903534757, + "loss": 8.1549, + "step": 2423 + }, + { + "epoch": 0.22618270038256974, + "grad_norm": 15.38163501592201, + "learning_rate": 0.0002999657175223668, + "loss": 8.3395, + "step": 2424 + }, + { + "epoch": 0.22627601007744705, + "grad_norm": 5.217743520166177, + "learning_rate": 0.00029996555562806755, + "loss": 8.1375, + "step": 2425 + }, + { + "epoch": 0.22636931977232436, + "grad_norm": 9.171008952368549, + "learning_rate": 0.00029996539335245035, + "loss": 8.2136, + "step": 2426 + }, + { + "epoch": 0.22646262946720164, + "grad_norm": 8.448558879905077, + "learning_rate": 0.0002999652306955156, + "loss": 7.7405, + "step": 2427 + }, + { + "epoch": 0.22655593916207895, + "grad_norm": 10.271299025868705, + "learning_rate": 0.00029996506765726365, + "loss": 8.0949, + "step": 2428 + }, + { + "epoch": 0.22664924885695623, + "grad_norm": 6.332165059338473, + "learning_rate": 0.000299964904237695, + "loss": 7.6269, + "step": 2429 + }, + { + "epoch": 0.22674255855183353, + "grad_norm": 2.983293714942881, + "learning_rate": 0.00029996474043681, + "loss": 7.7416, + "step": 2430 + }, + { + "epoch": 0.22683586824671084, + "grad_norm": 3.83226669961221, + "learning_rate": 0.0002999645762546091, + "loss": 7.3541, + "step": 2431 + }, + { + "epoch": 0.22692917794158812, + "grad_norm": 7.794686260878543, + "learning_rate": 0.0002999644116910927, + "loss": 7.5142, + "step": 2432 + }, + { + "epoch": 0.22702248763646543, + "grad_norm": 5.133226979475811, + "learning_rate": 0.0002999642467462612, + "loss": 7.8398, + "step": 2433 + }, + { + "epoch": 0.22711579733134274, + "grad_norm": 3.478015078277813, + "learning_rate": 0.00029996408142011507, + "loss": 8.0893, + "step": 2434 + }, + { + "epoch": 0.22720910702622002, + "grad_norm": 3.580629006247248, + "learning_rate": 0.00029996391571265467, + "loss": 8.0448, + "step": 2435 + }, + { + "epoch": 0.22730241672109733, + "grad_norm": 5.804020087354577, + "learning_rate": 0.0002999637496238805, + "loss": 8.0375, + "step": 2436 + }, + { + "epoch": 0.2273957264159746, + "grad_norm": 6.177423858348546, + "learning_rate": 0.00029996358315379293, + "loss": 7.7782, + "step": 2437 + }, + { + "epoch": 0.22748903611085192, + "grad_norm": 2.885132782677425, + "learning_rate": 0.00029996341630239234, + "loss": 7.5895, + "step": 2438 + }, + { + "epoch": 0.22758234580572922, + "grad_norm": 2.9078219390823885, + "learning_rate": 0.00029996324906967926, + "loss": 7.6918, + "step": 2439 + }, + { + "epoch": 0.2276756555006065, + "grad_norm": 7.2941570948925465, + "learning_rate": 0.000299963081455654, + "loss": 7.8075, + "step": 2440 + }, + { + "epoch": 0.2277689651954838, + "grad_norm": 5.96110541459796, + "learning_rate": 0.0002999629134603171, + "loss": 7.6185, + "step": 2441 + }, + { + "epoch": 0.22786227489036112, + "grad_norm": 349.96000531039306, + "learning_rate": 0.00029996274508366894, + "loss": 8.2147, + "step": 2442 + }, + { + "epoch": 0.2279555845852384, + "grad_norm": 5.730920063686958, + "learning_rate": 0.0002999625763257099, + "loss": 7.6481, + "step": 2443 + }, + { + "epoch": 0.2280488942801157, + "grad_norm": 4.947240289904961, + "learning_rate": 0.0002999624071864405, + "loss": 7.9812, + "step": 2444 + }, + { + "epoch": 0.228142203974993, + "grad_norm": 6.684345434169988, + "learning_rate": 0.00029996223766586103, + "loss": 7.8054, + "step": 2445 + }, + { + "epoch": 0.2282355136698703, + "grad_norm": 3.2045175276854074, + "learning_rate": 0.00029996206776397214, + "loss": 7.9415, + "step": 2446 + }, + { + "epoch": 0.2283288233647476, + "grad_norm": 3.156441720077023, + "learning_rate": 0.00029996189748077406, + "loss": 7.5996, + "step": 2447 + }, + { + "epoch": 0.2284221330596249, + "grad_norm": 18.055360891075516, + "learning_rate": 0.0002999617268162673, + "loss": 7.9316, + "step": 2448 + }, + { + "epoch": 0.2285154427545022, + "grad_norm": 5.07609443353742, + "learning_rate": 0.0002999615557704523, + "loss": 7.5663, + "step": 2449 + }, + { + "epoch": 0.2286087524493795, + "grad_norm": 9.66601652542733, + "learning_rate": 0.00029996138434332947, + "loss": 7.8883, + "step": 2450 + }, + { + "epoch": 0.22870206214425678, + "grad_norm": 8.097887146045462, + "learning_rate": 0.00029996121253489925, + "loss": 7.9889, + "step": 2451 + }, + { + "epoch": 0.2287953718391341, + "grad_norm": 1.9297650458618725, + "learning_rate": 0.0002999610403451621, + "loss": 7.8707, + "step": 2452 + }, + { + "epoch": 0.22888868153401137, + "grad_norm": 3.828636808005992, + "learning_rate": 0.00029996086777411845, + "loss": 7.8401, + "step": 2453 + }, + { + "epoch": 0.22898199122888868, + "grad_norm": 5.398817015448068, + "learning_rate": 0.00029996069482176875, + "loss": 8.1271, + "step": 2454 + }, + { + "epoch": 0.229075300923766, + "grad_norm": 5.782163282033069, + "learning_rate": 0.00029996052148811343, + "loss": 7.7816, + "step": 2455 + }, + { + "epoch": 0.22916861061864327, + "grad_norm": 4.751904767109379, + "learning_rate": 0.00029996034777315293, + "loss": 7.5025, + "step": 2456 + }, + { + "epoch": 0.22926192031352058, + "grad_norm": 1.4319588718714094, + "learning_rate": 0.00029996017367688764, + "loss": 7.885, + "step": 2457 + }, + { + "epoch": 0.22935523000839786, + "grad_norm": 12.012245928979189, + "learning_rate": 0.0002999599991993181, + "loss": 7.4442, + "step": 2458 + }, + { + "epoch": 0.22944853970327517, + "grad_norm": 10.011432026524483, + "learning_rate": 0.0002999598243404447, + "loss": 8.2169, + "step": 2459 + }, + { + "epoch": 0.22954184939815248, + "grad_norm": 7.68628944956349, + "learning_rate": 0.0002999596491002678, + "loss": 7.6841, + "step": 2460 + }, + { + "epoch": 0.22963515909302976, + "grad_norm": 5.892006128082311, + "learning_rate": 0.00029995947347878806, + "loss": 7.8788, + "step": 2461 + }, + { + "epoch": 0.22972846878790706, + "grad_norm": 1.6872772556948834, + "learning_rate": 0.0002999592974760057, + "loss": 7.9249, + "step": 2462 + }, + { + "epoch": 0.22982177848278437, + "grad_norm": 3.3346028960636303, + "learning_rate": 0.00029995912109192135, + "loss": 7.831, + "step": 2463 + }, + { + "epoch": 0.22991508817766165, + "grad_norm": 4.54630286356007, + "learning_rate": 0.00029995894432653533, + "loss": 7.9415, + "step": 2464 + }, + { + "epoch": 0.23000839787253896, + "grad_norm": 5.502125902851042, + "learning_rate": 0.00029995876717984815, + "loss": 7.9348, + "step": 2465 + }, + { + "epoch": 0.23010170756741624, + "grad_norm": 5.686274793718125, + "learning_rate": 0.00029995858965186023, + "loss": 7.7973, + "step": 2466 + }, + { + "epoch": 0.23019501726229355, + "grad_norm": 4.358028303250122, + "learning_rate": 0.0002999584117425721, + "loss": 7.7972, + "step": 2467 + }, + { + "epoch": 0.23028832695717086, + "grad_norm": 1.947955077326251, + "learning_rate": 0.00029995823345198405, + "loss": 7.8336, + "step": 2468 + }, + { + "epoch": 0.23038163665204814, + "grad_norm": 1296.0468430544342, + "learning_rate": 0.0002999580547800967, + "loss": 7.8352, + "step": 2469 + }, + { + "epoch": 0.23047494634692545, + "grad_norm": 4.786736183892429, + "learning_rate": 0.0002999578757269104, + "loss": 7.6884, + "step": 2470 + }, + { + "epoch": 0.23056825604180275, + "grad_norm": 6.234470459618513, + "learning_rate": 0.0002999576962924257, + "loss": 7.7819, + "step": 2471 + }, + { + "epoch": 0.23066156573668004, + "grad_norm": 3.8512816965029324, + "learning_rate": 0.0002999575164766429, + "loss": 7.7022, + "step": 2472 + }, + { + "epoch": 0.23075487543155734, + "grad_norm": 1.177060901069851, + "learning_rate": 0.00029995733627956263, + "loss": 7.4076, + "step": 2473 + }, + { + "epoch": 0.23084818512643462, + "grad_norm": 2.245123704652132, + "learning_rate": 0.0002999571557011853, + "loss": 7.5612, + "step": 2474 + }, + { + "epoch": 0.23094149482131193, + "grad_norm": 2.0535023524367593, + "learning_rate": 0.00029995697474151123, + "loss": 7.6554, + "step": 2475 + }, + { + "epoch": 0.23103480451618924, + "grad_norm": 3.1000932246751187, + "learning_rate": 0.0002999567934005411, + "loss": 7.6107, + "step": 2476 + }, + { + "epoch": 0.23112811421106652, + "grad_norm": 2.9557583995339756, + "learning_rate": 0.00029995661167827524, + "loss": 7.4913, + "step": 2477 + }, + { + "epoch": 0.23122142390594383, + "grad_norm": 2.1415596826023213, + "learning_rate": 0.00029995642957471416, + "loss": 7.4418, + "step": 2478 + }, + { + "epoch": 0.23131473360082114, + "grad_norm": 4.167442509116276, + "learning_rate": 0.00029995624708985827, + "loss": 7.7895, + "step": 2479 + }, + { + "epoch": 0.23140804329569842, + "grad_norm": 2.575111109170731, + "learning_rate": 0.00029995606422370804, + "loss": 7.9897, + "step": 2480 + }, + { + "epoch": 0.23150135299057573, + "grad_norm": 1.2507380521818063, + "learning_rate": 0.00029995588097626397, + "loss": 8.0648, + "step": 2481 + }, + { + "epoch": 0.231594662685453, + "grad_norm": 2.7570833409641526, + "learning_rate": 0.00029995569734752654, + "loss": 7.3967, + "step": 2482 + }, + { + "epoch": 0.23168797238033031, + "grad_norm": 1.8450608408595768, + "learning_rate": 0.0002999555133374962, + "loss": 7.8437, + "step": 2483 + }, + { + "epoch": 0.23178128207520762, + "grad_norm": 1.524728764692411, + "learning_rate": 0.0002999553289461734, + "loss": 7.4775, + "step": 2484 + }, + { + "epoch": 0.2318745917700849, + "grad_norm": 4.43803128032088, + "learning_rate": 0.0002999551441735586, + "loss": 7.3708, + "step": 2485 + }, + { + "epoch": 0.2319679014649622, + "grad_norm": 1.9790088314241872, + "learning_rate": 0.0002999549590196523, + "loss": 7.894, + "step": 2486 + }, + { + "epoch": 0.23206121115983952, + "grad_norm": 1.755109529637287, + "learning_rate": 0.00029995477348445493, + "loss": 7.4977, + "step": 2487 + }, + { + "epoch": 0.2321545208547168, + "grad_norm": 1.608450104390941, + "learning_rate": 0.000299954587567967, + "loss": 7.6008, + "step": 2488 + }, + { + "epoch": 0.2322478305495941, + "grad_norm": 1.3254813626304365, + "learning_rate": 0.000299954401270189, + "loss": 7.5789, + "step": 2489 + }, + { + "epoch": 0.2323411402444714, + "grad_norm": 1.4930141950945612, + "learning_rate": 0.00029995421459112136, + "loss": 7.6671, + "step": 2490 + }, + { + "epoch": 0.2324344499393487, + "grad_norm": 2.228942502141077, + "learning_rate": 0.0002999540275307646, + "loss": 7.1687, + "step": 2491 + }, + { + "epoch": 0.232527759634226, + "grad_norm": 1.657023153086721, + "learning_rate": 0.00029995384008911914, + "loss": 7.1827, + "step": 2492 + }, + { + "epoch": 0.23262106932910329, + "grad_norm": 0.8683118036789852, + "learning_rate": 0.0002999536522661855, + "loss": 7.5007, + "step": 2493 + }, + { + "epoch": 0.2327143790239806, + "grad_norm": 1.8627520927625654, + "learning_rate": 0.00029995346406196414, + "loss": 7.944, + "step": 2494 + }, + { + "epoch": 0.2328076887188579, + "grad_norm": 1.0978204084015202, + "learning_rate": 0.0002999532754764555, + "loss": 7.5622, + "step": 2495 + }, + { + "epoch": 0.23290099841373518, + "grad_norm": 1.8296000020468255, + "learning_rate": 0.00029995308650966016, + "loss": 7.9774, + "step": 2496 + }, + { + "epoch": 0.2329943081086125, + "grad_norm": 1.0657423427452697, + "learning_rate": 0.0002999528971615785, + "loss": 7.6465, + "step": 2497 + }, + { + "epoch": 0.23308761780348977, + "grad_norm": 1.3735993043074683, + "learning_rate": 0.0002999527074322111, + "loss": 7.5616, + "step": 2498 + }, + { + "epoch": 0.23318092749836708, + "grad_norm": 1.2375174934927007, + "learning_rate": 0.00029995251732155834, + "loss": 7.7129, + "step": 2499 + }, + { + "epoch": 0.2332742371932444, + "grad_norm": 1.9261779765197362, + "learning_rate": 0.0002999523268296207, + "loss": 7.5089, + "step": 2500 + }, + { + "epoch": 0.23336754688812167, + "grad_norm": 1.4092588292462578, + "learning_rate": 0.0002999521359563988, + "loss": 7.63, + "step": 2501 + }, + { + "epoch": 0.23346085658299898, + "grad_norm": 2.0687159635281227, + "learning_rate": 0.00029995194470189295, + "loss": 7.7861, + "step": 2502 + }, + { + "epoch": 0.23355416627787629, + "grad_norm": 0.9378797939900296, + "learning_rate": 0.0002999517530661038, + "loss": 7.5941, + "step": 2503 + }, + { + "epoch": 0.23364747597275357, + "grad_norm": 1.0003356712760885, + "learning_rate": 0.00029995156104903174, + "loss": 7.5701, + "step": 2504 + }, + { + "epoch": 0.23374078566763087, + "grad_norm": 0.7998369639079455, + "learning_rate": 0.0002999513686506772, + "loss": 7.4844, + "step": 2505 + }, + { + "epoch": 0.23383409536250815, + "grad_norm": 1.5190989209639567, + "learning_rate": 0.00029995117587104084, + "loss": 7.5153, + "step": 2506 + }, + { + "epoch": 0.23392740505738546, + "grad_norm": 1.2937252260868493, + "learning_rate": 0.0002999509827101231, + "loss": 7.6444, + "step": 2507 + }, + { + "epoch": 0.23402071475226277, + "grad_norm": 1.566094363359153, + "learning_rate": 0.0002999507891679243, + "loss": 7.8779, + "step": 2508 + }, + { + "epoch": 0.23411402444714005, + "grad_norm": 0.9450474032808731, + "learning_rate": 0.00029995059524444515, + "loss": 7.4674, + "step": 2509 + }, + { + "epoch": 0.23420733414201736, + "grad_norm": 0.9603195417885623, + "learning_rate": 0.000299950400939686, + "loss": 7.3449, + "step": 2510 + }, + { + "epoch": 0.23430064383689464, + "grad_norm": 0.8521452592688227, + "learning_rate": 0.00029995020625364746, + "loss": 7.7685, + "step": 2511 + }, + { + "epoch": 0.23439395353177195, + "grad_norm": 0.5825566584503677, + "learning_rate": 0.00029995001118632997, + "loss": 7.4008, + "step": 2512 + }, + { + "epoch": 0.23448726322664926, + "grad_norm": 1.2796370522302178, + "learning_rate": 0.00029994981573773396, + "loss": 7.6428, + "step": 2513 + }, + { + "epoch": 0.23458057292152654, + "grad_norm": 1.0803575006629558, + "learning_rate": 0.00029994961990786, + "loss": 7.0873, + "step": 2514 + }, + { + "epoch": 0.23467388261640384, + "grad_norm": 1.4025925825390289, + "learning_rate": 0.0002999494236967086, + "loss": 7.7609, + "step": 2515 + }, + { + "epoch": 0.23476719231128115, + "grad_norm": 0.9180548655259043, + "learning_rate": 0.00029994922710428024, + "loss": 7.3074, + "step": 2516 + }, + { + "epoch": 0.23486050200615843, + "grad_norm": 0.8169398873218523, + "learning_rate": 0.00029994903013057543, + "loss": 7.2879, + "step": 2517 + }, + { + "epoch": 0.23495381170103574, + "grad_norm": 0.5929179956738158, + "learning_rate": 0.0002999488327755946, + "loss": 7.391, + "step": 2518 + }, + { + "epoch": 0.23504712139591302, + "grad_norm": 1.6270329200675926, + "learning_rate": 0.0002999486350393383, + "loss": 7.5166, + "step": 2519 + }, + { + "epoch": 0.23514043109079033, + "grad_norm": 1.0155253579191934, + "learning_rate": 0.0002999484369218071, + "loss": 7.4733, + "step": 2520 + }, + { + "epoch": 0.23523374078566764, + "grad_norm": 0.8840833553220876, + "learning_rate": 0.00029994823842300136, + "loss": 7.055, + "step": 2521 + }, + { + "epoch": 0.23532705048054492, + "grad_norm": 1.2114423974344404, + "learning_rate": 0.00029994803954292175, + "loss": 7.2061, + "step": 2522 + }, + { + "epoch": 0.23542036017542223, + "grad_norm": 0.8645357215477198, + "learning_rate": 0.00029994784028156867, + "loss": 7.474, + "step": 2523 + }, + { + "epoch": 0.23551366987029954, + "grad_norm": 0.9892585934067356, + "learning_rate": 0.0002999476406389426, + "loss": 7.504, + "step": 2524 + }, + { + "epoch": 0.23560697956517682, + "grad_norm": 1.0181984177753185, + "learning_rate": 0.00029994744061504413, + "loss": 7.6897, + "step": 2525 + }, + { + "epoch": 0.23570028926005412, + "grad_norm": 0.7477409078998145, + "learning_rate": 0.00029994724020987375, + "loss": 7.5315, + "step": 2526 + }, + { + "epoch": 0.2357935989549314, + "grad_norm": 0.6680664625034998, + "learning_rate": 0.000299947039423432, + "loss": 7.7794, + "step": 2527 + }, + { + "epoch": 0.2358869086498087, + "grad_norm": 1.3751939197714802, + "learning_rate": 0.0002999468382557193, + "loss": 7.8089, + "step": 2528 + }, + { + "epoch": 0.23598021834468602, + "grad_norm": 0.7738721269282339, + "learning_rate": 0.00029994663670673615, + "loss": 7.6645, + "step": 2529 + }, + { + "epoch": 0.2360735280395633, + "grad_norm": 1.5614193786040207, + "learning_rate": 0.0002999464347764832, + "loss": 7.2733, + "step": 2530 + }, + { + "epoch": 0.2361668377344406, + "grad_norm": 0.8789377350867592, + "learning_rate": 0.00029994623246496085, + "loss": 7.811, + "step": 2531 + }, + { + "epoch": 0.23626014742931792, + "grad_norm": 0.914528485760572, + "learning_rate": 0.0002999460297721697, + "loss": 7.5761, + "step": 2532 + }, + { + "epoch": 0.2363534571241952, + "grad_norm": 0.784985767235721, + "learning_rate": 0.0002999458266981101, + "loss": 7.2236, + "step": 2533 + }, + { + "epoch": 0.2364467668190725, + "grad_norm": 0.8745009938475238, + "learning_rate": 0.00029994562324278277, + "loss": 7.3248, + "step": 2534 + }, + { + "epoch": 0.2365400765139498, + "grad_norm": 1.4914080219923311, + "learning_rate": 0.00029994541940618814, + "loss": 7.9718, + "step": 2535 + }, + { + "epoch": 0.2366333862088271, + "grad_norm": 0.5917883078940946, + "learning_rate": 0.0002999452151883267, + "loss": 7.446, + "step": 2536 + }, + { + "epoch": 0.2367266959037044, + "grad_norm": 1.2416507979532978, + "learning_rate": 0.000299945010589199, + "loss": 7.321, + "step": 2537 + }, + { + "epoch": 0.23682000559858168, + "grad_norm": 0.9784411295388499, + "learning_rate": 0.00029994480560880554, + "loss": 7.9799, + "step": 2538 + }, + { + "epoch": 0.236913315293459, + "grad_norm": 0.6753139986351423, + "learning_rate": 0.00029994460024714687, + "loss": 7.3637, + "step": 2539 + }, + { + "epoch": 0.2370066249883363, + "grad_norm": 2.44647907639499, + "learning_rate": 0.0002999443945042235, + "loss": 7.4308, + "step": 2540 + }, + { + "epoch": 0.23709993468321358, + "grad_norm": 1.5544283383387654, + "learning_rate": 0.0002999441883800359, + "loss": 7.3776, + "step": 2541 + }, + { + "epoch": 0.2371932443780909, + "grad_norm": 1.8330499925033177, + "learning_rate": 0.00029994398187458473, + "loss": 7.6514, + "step": 2542 + }, + { + "epoch": 0.23728655407296817, + "grad_norm": 0.8361490416887725, + "learning_rate": 0.0002999437749878704, + "loss": 7.4842, + "step": 2543 + }, + { + "epoch": 0.23737986376784548, + "grad_norm": 1.795966424975601, + "learning_rate": 0.0002999435677198935, + "loss": 7.4278, + "step": 2544 + }, + { + "epoch": 0.2374731734627228, + "grad_norm": 1.4312878596145626, + "learning_rate": 0.00029994336007065444, + "loss": 7.3739, + "step": 2545 + }, + { + "epoch": 0.23756648315760007, + "grad_norm": 1.1357005844416261, + "learning_rate": 0.0002999431520401539, + "loss": 7.6024, + "step": 2546 + }, + { + "epoch": 0.23765979285247738, + "grad_norm": 1.1678729309340925, + "learning_rate": 0.0002999429436283923, + "loss": 7.6189, + "step": 2547 + }, + { + "epoch": 0.23775310254735468, + "grad_norm": 0.7465323673496161, + "learning_rate": 0.00029994273483537027, + "loss": 7.4005, + "step": 2548 + }, + { + "epoch": 0.23784641224223196, + "grad_norm": 13.344711299149946, + "learning_rate": 0.0002999425256610882, + "loss": 7.6719, + "step": 2549 + }, + { + "epoch": 0.23793972193710927, + "grad_norm": 2.024277203167904, + "learning_rate": 0.0002999423161055468, + "loss": 7.4131, + "step": 2550 + }, + { + "epoch": 0.23803303163198655, + "grad_norm": 2.5114460463034733, + "learning_rate": 0.0002999421061687464, + "loss": 7.5757, + "step": 2551 + }, + { + "epoch": 0.23812634132686386, + "grad_norm": 1.6939390496783784, + "learning_rate": 0.0002999418958506877, + "loss": 7.6241, + "step": 2552 + }, + { + "epoch": 0.23821965102174117, + "grad_norm": 0.9169078879166505, + "learning_rate": 0.00029994168515137115, + "loss": 7.9, + "step": 2553 + }, + { + "epoch": 0.23831296071661845, + "grad_norm": 0.8359429838767044, + "learning_rate": 0.00029994147407079734, + "loss": 7.098, + "step": 2554 + }, + { + "epoch": 0.23840627041149576, + "grad_norm": 3.620299807728983, + "learning_rate": 0.00029994126260896677, + "loss": 7.3498, + "step": 2555 + }, + { + "epoch": 0.23849958010637307, + "grad_norm": 2.485396847072849, + "learning_rate": 0.00029994105076587996, + "loss": 7.9723, + "step": 2556 + }, + { + "epoch": 0.23859288980125035, + "grad_norm": 0.8387502910594195, + "learning_rate": 0.00029994083854153754, + "loss": 7.6263, + "step": 2557 + }, + { + "epoch": 0.23868619949612765, + "grad_norm": 1.2501276241794879, + "learning_rate": 0.0002999406259359399, + "loss": 7.7963, + "step": 2558 + }, + { + "epoch": 0.23877950919100493, + "grad_norm": 2.0063152934020096, + "learning_rate": 0.0002999404129490877, + "loss": 7.6587, + "step": 2559 + }, + { + "epoch": 0.23887281888588224, + "grad_norm": 2.3193126108276485, + "learning_rate": 0.00029994019958098147, + "loss": 7.5272, + "step": 2560 + }, + { + "epoch": 0.23896612858075955, + "grad_norm": 0.9031898521413302, + "learning_rate": 0.00029993998583162173, + "loss": 7.7239, + "step": 2561 + }, + { + "epoch": 0.23905943827563683, + "grad_norm": 7.394324540890611, + "learning_rate": 0.0002999397717010089, + "loss": 7.4313, + "step": 2562 + }, + { + "epoch": 0.23915274797051414, + "grad_norm": 2.9168995371679425, + "learning_rate": 0.00029993955718914377, + "loss": 7.7234, + "step": 2563 + }, + { + "epoch": 0.23924605766539142, + "grad_norm": 1.6853795261138635, + "learning_rate": 0.00029993934229602674, + "loss": 7.4808, + "step": 2564 + }, + { + "epoch": 0.23933936736026873, + "grad_norm": 1.907429348306149, + "learning_rate": 0.00029993912702165837, + "loss": 7.5039, + "step": 2565 + }, + { + "epoch": 0.23943267705514604, + "grad_norm": 1.8051550575608126, + "learning_rate": 0.00029993891136603925, + "loss": 7.7322, + "step": 2566 + }, + { + "epoch": 0.23952598675002332, + "grad_norm": 2.2155766309386404, + "learning_rate": 0.00029993869532916987, + "loss": 7.5663, + "step": 2567 + }, + { + "epoch": 0.23961929644490063, + "grad_norm": 7028.113600315249, + "learning_rate": 0.00029993847891105076, + "loss": 7.1414, + "step": 2568 + }, + { + "epoch": 0.23971260613977793, + "grad_norm": 0.9007817946576253, + "learning_rate": 0.00029993826211168254, + "loss": 7.558, + "step": 2569 + }, + { + "epoch": 0.23980591583465521, + "grad_norm": 1.1980791823044432, + "learning_rate": 0.0002999380449310658, + "loss": 7.8324, + "step": 2570 + }, + { + "epoch": 0.23989922552953252, + "grad_norm": 1.496219791674786, + "learning_rate": 0.0002999378273692009, + "loss": 7.8948, + "step": 2571 + }, + { + "epoch": 0.2399925352244098, + "grad_norm": 1.3466268471702747, + "learning_rate": 0.0002999376094260886, + "loss": 7.1969, + "step": 2572 + }, + { + "epoch": 0.2400858449192871, + "grad_norm": 0.5449503894042039, + "learning_rate": 0.00029993739110172936, + "loss": 7.6199, + "step": 2573 + }, + { + "epoch": 0.24017915461416442, + "grad_norm": 1.083866732988532, + "learning_rate": 0.00029993717239612376, + "loss": 7.409, + "step": 2574 + }, + { + "epoch": 0.2402724643090417, + "grad_norm": 1.0366828752578934, + "learning_rate": 0.00029993695330927234, + "loss": 7.5145, + "step": 2575 + }, + { + "epoch": 0.240365774003919, + "grad_norm": 0.6303715858168296, + "learning_rate": 0.00029993673384117565, + "loss": 7.6389, + "step": 2576 + }, + { + "epoch": 0.24045908369879632, + "grad_norm": 1.0731527391179743, + "learning_rate": 0.0002999365139918342, + "loss": 7.1711, + "step": 2577 + }, + { + "epoch": 0.2405523933936736, + "grad_norm": 0.8519123939705435, + "learning_rate": 0.0002999362937612487, + "loss": 7.4758, + "step": 2578 + }, + { + "epoch": 0.2406457030885509, + "grad_norm": 0.7013062187963796, + "learning_rate": 0.0002999360731494196, + "loss": 7.1124, + "step": 2579 + }, + { + "epoch": 0.24073901278342819, + "grad_norm": 1.2210064846604927, + "learning_rate": 0.0002999358521563475, + "loss": 7.4591, + "step": 2580 + }, + { + "epoch": 0.2408323224783055, + "grad_norm": 0.6392216838016958, + "learning_rate": 0.000299935630782033, + "loss": 7.5164, + "step": 2581 + }, + { + "epoch": 0.2409256321731828, + "grad_norm": 0.6067784823817359, + "learning_rate": 0.00029993540902647646, + "loss": 7.5409, + "step": 2582 + }, + { + "epoch": 0.24101894186806008, + "grad_norm": 0.7706953604963317, + "learning_rate": 0.0002999351868896786, + "loss": 7.6349, + "step": 2583 + }, + { + "epoch": 0.2411122515629374, + "grad_norm": 0.9380265380143504, + "learning_rate": 0.00029993496437164007, + "loss": 7.4465, + "step": 2584 + }, + { + "epoch": 0.2412055612578147, + "grad_norm": 0.862620576881464, + "learning_rate": 0.00029993474147236134, + "loss": 7.3695, + "step": 2585 + }, + { + "epoch": 0.24129887095269198, + "grad_norm": 0.6112529509546186, + "learning_rate": 0.0002999345181918429, + "loss": 7.279, + "step": 2586 + }, + { + "epoch": 0.2413921806475693, + "grad_norm": 1.1706779393381168, + "learning_rate": 0.00029993429453008545, + "loss": 7.4468, + "step": 2587 + }, + { + "epoch": 0.24148549034244657, + "grad_norm": 0.9594421215744093, + "learning_rate": 0.00029993407048708944, + "loss": 7.5181, + "step": 2588 + }, + { + "epoch": 0.24157880003732388, + "grad_norm": 0.4875024996636557, + "learning_rate": 0.0002999338460628556, + "loss": 7.454, + "step": 2589 + }, + { + "epoch": 0.24167210973220118, + "grad_norm": 0.8966912049569754, + "learning_rate": 0.0002999336212573843, + "loss": 7.0909, + "step": 2590 + }, + { + "epoch": 0.24176541942707847, + "grad_norm": 0.6133313834092748, + "learning_rate": 0.00029993339607067626, + "loss": 7.395, + "step": 2591 + }, + { + "epoch": 0.24185872912195577, + "grad_norm": 0.9972017483121208, + "learning_rate": 0.000299933170502732, + "loss": 7.7903, + "step": 2592 + }, + { + "epoch": 0.24195203881683308, + "grad_norm": 0.7261913863284337, + "learning_rate": 0.0002999329445535521, + "loss": 7.3889, + "step": 2593 + }, + { + "epoch": 0.24204534851171036, + "grad_norm": 3.321334550990193, + "learning_rate": 0.00029993271822313715, + "loss": 7.356, + "step": 2594 + }, + { + "epoch": 0.24213865820658767, + "grad_norm": 1.5822623478414506, + "learning_rate": 0.0002999324915114877, + "loss": 7.4903, + "step": 2595 + }, + { + "epoch": 0.24223196790146495, + "grad_norm": 1.181634831872127, + "learning_rate": 0.00029993226441860435, + "loss": 7.8329, + "step": 2596 + }, + { + "epoch": 0.24232527759634226, + "grad_norm": 1.2440292084646052, + "learning_rate": 0.00029993203694448766, + "loss": 7.3883, + "step": 2597 + }, + { + "epoch": 0.24241858729121957, + "grad_norm": 0.5135181189682655, + "learning_rate": 0.0002999318090891382, + "loss": 7.4068, + "step": 2598 + }, + { + "epoch": 0.24251189698609685, + "grad_norm": 1.6126317588100063, + "learning_rate": 0.00029993158085255656, + "loss": 7.2033, + "step": 2599 + }, + { + "epoch": 0.24260520668097416, + "grad_norm": 1.6723614401861449, + "learning_rate": 0.00029993135223474335, + "loss": 7.7193, + "step": 2600 + }, + { + "epoch": 0.24269851637585146, + "grad_norm": 0.8261536028449973, + "learning_rate": 0.0002999311232356991, + "loss": 7.3655, + "step": 2601 + }, + { + "epoch": 0.24279182607072874, + "grad_norm": 0.9871120125736415, + "learning_rate": 0.0002999308938554245, + "loss": 7.4886, + "step": 2602 + }, + { + "epoch": 0.24288513576560605, + "grad_norm": 1.354439382585426, + "learning_rate": 0.00029993066409391995, + "loss": 7.3318, + "step": 2603 + }, + { + "epoch": 0.24297844546048333, + "grad_norm": 0.897852007407524, + "learning_rate": 0.00029993043395118614, + "loss": 7.4387, + "step": 2604 + }, + { + "epoch": 0.24307175515536064, + "grad_norm": 0.6305396633721325, + "learning_rate": 0.00029993020342722364, + "loss": 7.2525, + "step": 2605 + }, + { + "epoch": 0.24316506485023795, + "grad_norm": 2.0804555685194837, + "learning_rate": 0.00029992997252203306, + "loss": 7.5344, + "step": 2606 + }, + { + "epoch": 0.24325837454511523, + "grad_norm": 1.2662268479613474, + "learning_rate": 0.00029992974123561504, + "loss": 7.7537, + "step": 2607 + }, + { + "epoch": 0.24335168423999254, + "grad_norm": 0.6542191209057938, + "learning_rate": 0.00029992950956797006, + "loss": 7.6933, + "step": 2608 + }, + { + "epoch": 0.24344499393486985, + "grad_norm": 1.886697230029725, + "learning_rate": 0.00029992927751909873, + "loss": 7.4437, + "step": 2609 + }, + { + "epoch": 0.24353830362974713, + "grad_norm": 2.2014117186798368, + "learning_rate": 0.00029992904508900165, + "loss": 7.5351, + "step": 2610 + }, + { + "epoch": 0.24363161332462444, + "grad_norm": 0.9424399825631408, + "learning_rate": 0.0002999288122776794, + "loss": 7.4251, + "step": 2611 + }, + { + "epoch": 0.24372492301950172, + "grad_norm": 1.2250843029195895, + "learning_rate": 0.0002999285790851327, + "loss": 7.5213, + "step": 2612 + }, + { + "epoch": 0.24381823271437902, + "grad_norm": 2.9618585944053577, + "learning_rate": 0.00029992834551136196, + "loss": 7.6732, + "step": 2613 + }, + { + "epoch": 0.24391154240925633, + "grad_norm": 0.8176519780329625, + "learning_rate": 0.00029992811155636785, + "loss": 7.3555, + "step": 2614 + }, + { + "epoch": 0.2440048521041336, + "grad_norm": 1.071750278141308, + "learning_rate": 0.000299927877220151, + "loss": 7.5606, + "step": 2615 + }, + { + "epoch": 0.24409816179901092, + "grad_norm": 1.8233168316986235, + "learning_rate": 0.00029992764250271186, + "loss": 7.3715, + "step": 2616 + }, + { + "epoch": 0.2441914714938882, + "grad_norm": 1.5488628421123285, + "learning_rate": 0.00029992740740405127, + "loss": 7.3547, + "step": 2617 + }, + { + "epoch": 0.2442847811887655, + "grad_norm": 0.975277248809733, + "learning_rate": 0.0002999271719241696, + "loss": 7.6756, + "step": 2618 + }, + { + "epoch": 0.24437809088364282, + "grad_norm": 1.2602362386767785, + "learning_rate": 0.0002999269360630676, + "loss": 7.4964, + "step": 2619 + }, + { + "epoch": 0.2444714005785201, + "grad_norm": 0.969693661029349, + "learning_rate": 0.0002999266998207458, + "loss": 6.9945, + "step": 2620 + }, + { + "epoch": 0.2445647102733974, + "grad_norm": 0.7461930630285311, + "learning_rate": 0.0002999264631972048, + "loss": 7.533, + "step": 2621 + }, + { + "epoch": 0.24465801996827471, + "grad_norm": 0.7034386323313229, + "learning_rate": 0.00029992622619244525, + "loss": 7.4193, + "step": 2622 + }, + { + "epoch": 0.244751329663152, + "grad_norm": 1.2758438069931424, + "learning_rate": 0.0002999259888064677, + "loss": 7.3745, + "step": 2623 + }, + { + "epoch": 0.2448446393580293, + "grad_norm": 1.0988550715562693, + "learning_rate": 0.0002999257510392727, + "loss": 7.3775, + "step": 2624 + }, + { + "epoch": 0.24493794905290658, + "grad_norm": 0.5539782954233492, + "learning_rate": 0.000299925512890861, + "loss": 7.1756, + "step": 2625 + }, + { + "epoch": 0.2450312587477839, + "grad_norm": 1.2046852752277097, + "learning_rate": 0.00029992527436123315, + "loss": 7.264, + "step": 2626 + }, + { + "epoch": 0.2451245684426612, + "grad_norm": 1.0860547423810472, + "learning_rate": 0.0002999250354503897, + "loss": 7.2088, + "step": 2627 + }, + { + "epoch": 0.24521787813753848, + "grad_norm": 0.9619320972744199, + "learning_rate": 0.00029992479615833134, + "loss": 7.4872, + "step": 2628 + }, + { + "epoch": 0.2453111878324158, + "grad_norm": 0.7723938258960117, + "learning_rate": 0.00029992455648505863, + "loss": 7.4668, + "step": 2629 + }, + { + "epoch": 0.2454044975272931, + "grad_norm": 0.9648373617504453, + "learning_rate": 0.00029992431643057214, + "loss": 7.4682, + "step": 2630 + }, + { + "epoch": 0.24549780722217038, + "grad_norm": 1.3623300352406453, + "learning_rate": 0.00029992407599487253, + "loss": 7.3307, + "step": 2631 + }, + { + "epoch": 0.24559111691704769, + "grad_norm": 0.47316466458684026, + "learning_rate": 0.00029992383517796044, + "loss": 7.279, + "step": 2632 + }, + { + "epoch": 0.24568442661192497, + "grad_norm": 0.7443503419425256, + "learning_rate": 0.0002999235939798364, + "loss": 7.1855, + "step": 2633 + }, + { + "epoch": 0.24577773630680227, + "grad_norm": 1.57285876470888, + "learning_rate": 0.0002999233524005011, + "loss": 7.3613, + "step": 2634 + }, + { + "epoch": 0.24587104600167958, + "grad_norm": 0.6389795400031374, + "learning_rate": 0.0002999231104399551, + "loss": 7.4097, + "step": 2635 + }, + { + "epoch": 0.24596435569655686, + "grad_norm": 0.5742759540528528, + "learning_rate": 0.0002999228680981991, + "loss": 7.4452, + "step": 2636 + }, + { + "epoch": 0.24605766539143417, + "grad_norm": 1.480454680220708, + "learning_rate": 0.0002999226253752336, + "loss": 7.3631, + "step": 2637 + }, + { + "epoch": 0.24615097508631148, + "grad_norm": 0.7713313441916609, + "learning_rate": 0.0002999223822710593, + "loss": 7.4285, + "step": 2638 + }, + { + "epoch": 0.24624428478118876, + "grad_norm": 0.7378451615035521, + "learning_rate": 0.0002999221387856768, + "loss": 7.3216, + "step": 2639 + }, + { + "epoch": 0.24633759447606607, + "grad_norm": 1.1610807344266219, + "learning_rate": 0.0002999218949190867, + "loss": 7.1128, + "step": 2640 + }, + { + "epoch": 0.24643090417094335, + "grad_norm": 0.7324774295386155, + "learning_rate": 0.0002999216506712896, + "loss": 7.3121, + "step": 2641 + }, + { + "epoch": 0.24652421386582066, + "grad_norm": 0.6937918187586012, + "learning_rate": 0.0002999214060422862, + "loss": 7.2247, + "step": 2642 + }, + { + "epoch": 0.24661752356069797, + "grad_norm": 1.0712547715842524, + "learning_rate": 0.0002999211610320771, + "loss": 7.9088, + "step": 2643 + }, + { + "epoch": 0.24671083325557525, + "grad_norm": 1.0522573474291694, + "learning_rate": 0.0002999209156406628, + "loss": 7.3035, + "step": 2644 + }, + { + "epoch": 0.24680414295045255, + "grad_norm": 0.8824269187388591, + "learning_rate": 0.0002999206698680441, + "loss": 7.1678, + "step": 2645 + }, + { + "epoch": 0.24689745264532986, + "grad_norm": 1.15381002745134, + "learning_rate": 0.0002999204237142215, + "loss": 7.6041, + "step": 2646 + }, + { + "epoch": 0.24699076234020714, + "grad_norm": 0.5639032840727654, + "learning_rate": 0.0002999201771791957, + "loss": 7.2983, + "step": 2647 + }, + { + "epoch": 0.24708407203508445, + "grad_norm": 0.6424610738305957, + "learning_rate": 0.0002999199302629673, + "loss": 7.428, + "step": 2648 + }, + { + "epoch": 0.24717738172996173, + "grad_norm": 0.5432009308332609, + "learning_rate": 0.0002999196829655369, + "loss": 7.3032, + "step": 2649 + }, + { + "epoch": 0.24727069142483904, + "grad_norm": 0.6499983662811186, + "learning_rate": 0.00029991943528690515, + "loss": 7.0763, + "step": 2650 + }, + { + "epoch": 0.24736400111971635, + "grad_norm": 0.6646713443568599, + "learning_rate": 0.00029991918722707277, + "loss": 7.3326, + "step": 2651 + }, + { + "epoch": 0.24745731081459363, + "grad_norm": 0.980935416404551, + "learning_rate": 0.0002999189387860402, + "loss": 7.463, + "step": 2652 + }, + { + "epoch": 0.24755062050947094, + "grad_norm": 0.8845595623363487, + "learning_rate": 0.00029991868996380824, + "loss": 7.3815, + "step": 2653 + }, + { + "epoch": 0.24764393020434824, + "grad_norm": 1.1007557498272464, + "learning_rate": 0.00029991844076037745, + "loss": 7.7922, + "step": 2654 + }, + { + "epoch": 0.24773723989922553, + "grad_norm": 0.6664398876610934, + "learning_rate": 0.00029991819117574845, + "loss": 7.6321, + "step": 2655 + }, + { + "epoch": 0.24783054959410283, + "grad_norm": 1.3337181616420415, + "learning_rate": 0.0002999179412099219, + "loss": 7.3976, + "step": 2656 + }, + { + "epoch": 0.24792385928898011, + "grad_norm": 1.0059917618495802, + "learning_rate": 0.00029991769086289843, + "loss": 7.5465, + "step": 2657 + }, + { + "epoch": 0.24801716898385742, + "grad_norm": 0.4751996379135879, + "learning_rate": 0.0002999174401346787, + "loss": 7.4578, + "step": 2658 + }, + { + "epoch": 0.24811047867873473, + "grad_norm": 0.6903955680485677, + "learning_rate": 0.00029991718902526335, + "loss": 7.1683, + "step": 2659 + }, + { + "epoch": 0.248203788373612, + "grad_norm": 1.1009395544376797, + "learning_rate": 0.000299916937534653, + "loss": 7.6102, + "step": 2660 + }, + { + "epoch": 0.24829709806848932, + "grad_norm": 0.8250604162738757, + "learning_rate": 0.00029991668566284823, + "loss": 7.2614, + "step": 2661 + }, + { + "epoch": 0.24839040776336663, + "grad_norm": 0.5701996545987017, + "learning_rate": 0.00029991643340984985, + "loss": 7.6451, + "step": 2662 + }, + { + "epoch": 0.2484837174582439, + "grad_norm": 0.9111643693177771, + "learning_rate": 0.0002999161807756583, + "loss": 7.199, + "step": 2663 + }, + { + "epoch": 0.24857702715312122, + "grad_norm": 0.9146369939557693, + "learning_rate": 0.0002999159277602743, + "loss": 7.3385, + "step": 2664 + }, + { + "epoch": 0.2486703368479985, + "grad_norm": 1.0219060979996266, + "learning_rate": 0.0002999156743636985, + "loss": 7.5619, + "step": 2665 + }, + { + "epoch": 0.2487636465428758, + "grad_norm": 1.5932527778824228, + "learning_rate": 0.0002999154205859316, + "loss": 7.4763, + "step": 2666 + }, + { + "epoch": 0.2488569562377531, + "grad_norm": 0.53979052594829, + "learning_rate": 0.00029991516642697413, + "loss": 6.9969, + "step": 2667 + }, + { + "epoch": 0.2489502659326304, + "grad_norm": 0.6993748513044651, + "learning_rate": 0.0002999149118868269, + "loss": 7.1691, + "step": 2668 + }, + { + "epoch": 0.2490435756275077, + "grad_norm": 0.6648846061487197, + "learning_rate": 0.00029991465696549034, + "loss": 7.4156, + "step": 2669 + }, + { + "epoch": 0.24913688532238498, + "grad_norm": 1.2655439242284263, + "learning_rate": 0.0002999144016629653, + "loss": 7.1048, + "step": 2670 + }, + { + "epoch": 0.2492301950172623, + "grad_norm": 0.8311406783711126, + "learning_rate": 0.0002999141459792523, + "loss": 7.1247, + "step": 2671 + }, + { + "epoch": 0.2493235047121396, + "grad_norm": 0.956308627422243, + "learning_rate": 0.0002999138899143521, + "loss": 7.3531, + "step": 2672 + }, + { + "epoch": 0.24941681440701688, + "grad_norm": 0.8734372290382104, + "learning_rate": 0.00029991363346826516, + "loss": 7.5579, + "step": 2673 + }, + { + "epoch": 0.2495101241018942, + "grad_norm": 0.6061377857437111, + "learning_rate": 0.0002999133766409924, + "loss": 7.3817, + "step": 2674 + }, + { + "epoch": 0.2496034337967715, + "grad_norm": 1.0493956131778235, + "learning_rate": 0.00029991311943253424, + "loss": 7.3718, + "step": 2675 + }, + { + "epoch": 0.24969674349164878, + "grad_norm": 0.7968148436673853, + "learning_rate": 0.00029991286184289143, + "loss": 7.247, + "step": 2676 + }, + { + "epoch": 0.24979005318652608, + "grad_norm": 1.1189396755654597, + "learning_rate": 0.00029991260387206466, + "loss": 7.6415, + "step": 2677 + }, + { + "epoch": 0.24988336288140336, + "grad_norm": 0.6735303380269618, + "learning_rate": 0.0002999123455200545, + "loss": 7.3398, + "step": 2678 + }, + { + "epoch": 0.24997667257628067, + "grad_norm": 0.5641458122431658, + "learning_rate": 0.0002999120867868617, + "loss": 7.1136, + "step": 2679 + }, + { + "epoch": 0.25006998227115795, + "grad_norm": 0.7885153107334336, + "learning_rate": 0.00029991182767248684, + "loss": 7.4877, + "step": 2680 + }, + { + "epoch": 0.25016329196603526, + "grad_norm": 0.9438944180131286, + "learning_rate": 0.0002999115681769306, + "loss": 7.2998, + "step": 2681 + }, + { + "epoch": 0.25025660166091257, + "grad_norm": 0.7145534137122075, + "learning_rate": 0.0002999113083001937, + "loss": 7.0625, + "step": 2682 + }, + { + "epoch": 0.2503499113557899, + "grad_norm": 0.8648025665249007, + "learning_rate": 0.0002999110480422767, + "loss": 7.2166, + "step": 2683 + }, + { + "epoch": 0.2504432210506672, + "grad_norm": 1.031421038262731, + "learning_rate": 0.00029991078740318033, + "loss": 7.3742, + "step": 2684 + }, + { + "epoch": 0.25053653074554444, + "grad_norm": 0.8117433183790838, + "learning_rate": 0.0002999105263829052, + "loss": 7.4249, + "step": 2685 + }, + { + "epoch": 0.25062984044042175, + "grad_norm": 1.2879518328515005, + "learning_rate": 0.00029991026498145206, + "loss": 7.3251, + "step": 2686 + }, + { + "epoch": 0.25072315013529906, + "grad_norm": 0.9340632098225078, + "learning_rate": 0.00029991000319882154, + "loss": 7.1416, + "step": 2687 + }, + { + "epoch": 0.25081645983017636, + "grad_norm": 0.8290113703509984, + "learning_rate": 0.00029990974103501417, + "loss": 7.1749, + "step": 2688 + }, + { + "epoch": 0.25090976952505367, + "grad_norm": 1.4805613024486932, + "learning_rate": 0.00029990947849003086, + "loss": 7.6653, + "step": 2689 + }, + { + "epoch": 0.2510030792199309, + "grad_norm": 0.8896489014347145, + "learning_rate": 0.00029990921556387215, + "loss": 7.4048, + "step": 2690 + }, + { + "epoch": 0.25109638891480823, + "grad_norm": 1.7213964889634885, + "learning_rate": 0.00029990895225653864, + "loss": 6.9986, + "step": 2691 + }, + { + "epoch": 0.25118969860968554, + "grad_norm": 1.358535832517339, + "learning_rate": 0.0002999086885680311, + "loss": 6.9189, + "step": 2692 + }, + { + "epoch": 0.25128300830456285, + "grad_norm": 1.6676501725652502, + "learning_rate": 0.0002999084244983502, + "loss": 7.3654, + "step": 2693 + }, + { + "epoch": 0.25137631799944016, + "grad_norm": 1.5979034950461202, + "learning_rate": 0.0002999081600474965, + "loss": 7.453, + "step": 2694 + }, + { + "epoch": 0.25146962769431747, + "grad_norm": 0.65869112347782, + "learning_rate": 0.00029990789521547085, + "loss": 7.2071, + "step": 2695 + }, + { + "epoch": 0.2515629373891947, + "grad_norm": 1.1710468850372635, + "learning_rate": 0.0002999076300022738, + "loss": 6.987, + "step": 2696 + }, + { + "epoch": 0.251656247084072, + "grad_norm": 1.30030830149584, + "learning_rate": 0.000299907364407906, + "loss": 6.7528, + "step": 2697 + }, + { + "epoch": 0.25174955677894933, + "grad_norm": 0.7632113905475801, + "learning_rate": 0.00029990709843236824, + "loss": 6.9005, + "step": 2698 + }, + { + "epoch": 0.25184286647382664, + "grad_norm": 1.2899515116778317, + "learning_rate": 0.0002999068320756611, + "loss": 6.9591, + "step": 2699 + }, + { + "epoch": 0.25193617616870395, + "grad_norm": 1.4678318719124255, + "learning_rate": 0.00029990656533778533, + "loss": 7.4044, + "step": 2700 + }, + { + "epoch": 0.2520294858635812, + "grad_norm": 1.0219220825650484, + "learning_rate": 0.00029990629821874154, + "loss": 7.4664, + "step": 2701 + }, + { + "epoch": 0.2521227955584585, + "grad_norm": 1.5754691301151333, + "learning_rate": 0.00029990603071853043, + "loss": 7.3169, + "step": 2702 + }, + { + "epoch": 0.2522161052533358, + "grad_norm": 1.737934242053776, + "learning_rate": 0.0002999057628371527, + "loss": 7.4039, + "step": 2703 + }, + { + "epoch": 0.25230941494821313, + "grad_norm": 0.9188768587791404, + "learning_rate": 0.000299905494574609, + "loss": 7.2261, + "step": 2704 + }, + { + "epoch": 0.25240272464309044, + "grad_norm": 0.986973113268894, + "learning_rate": 0.00029990522593090007, + "loss": 7.2262, + "step": 2705 + }, + { + "epoch": 0.2524960343379677, + "grad_norm": 1.0973955073310409, + "learning_rate": 0.00029990495690602656, + "loss": 6.697, + "step": 2706 + }, + { + "epoch": 0.252589344032845, + "grad_norm": 0.7548439983162402, + "learning_rate": 0.00029990468749998916, + "loss": 7.2473, + "step": 2707 + }, + { + "epoch": 0.2526826537277223, + "grad_norm": 1.2898683269502238, + "learning_rate": 0.0002999044177127885, + "loss": 6.9494, + "step": 2708 + }, + { + "epoch": 0.2527759634225996, + "grad_norm": 1.0865697052387455, + "learning_rate": 0.00029990414754442533, + "loss": 7.376, + "step": 2709 + }, + { + "epoch": 0.2528692731174769, + "grad_norm": 0.6570176076276731, + "learning_rate": 0.00029990387699490033, + "loss": 7.2909, + "step": 2710 + }, + { + "epoch": 0.25296258281235423, + "grad_norm": 1.8433825282537495, + "learning_rate": 0.00029990360606421413, + "loss": 7.8129, + "step": 2711 + }, + { + "epoch": 0.2530558925072315, + "grad_norm": 1.8117076238303744, + "learning_rate": 0.0002999033347523675, + "loss": 7.388, + "step": 2712 + }, + { + "epoch": 0.2531492022021088, + "grad_norm": 0.7793006182329988, + "learning_rate": 0.0002999030630593611, + "loss": 6.7894, + "step": 2713 + }, + { + "epoch": 0.2532425118969861, + "grad_norm": 1.1071489235281413, + "learning_rate": 0.0002999027909851956, + "loss": 7.1287, + "step": 2714 + }, + { + "epoch": 0.2533358215918634, + "grad_norm": 0.9776768302755666, + "learning_rate": 0.0002999025185298717, + "loss": 7.1045, + "step": 2715 + }, + { + "epoch": 0.2534291312867407, + "grad_norm": 1.047339580169443, + "learning_rate": 0.0002999022456933901, + "loss": 7.2528, + "step": 2716 + }, + { + "epoch": 0.25352244098161797, + "grad_norm": 1.2117756403885118, + "learning_rate": 0.0002999019724757515, + "loss": 7.462, + "step": 2717 + }, + { + "epoch": 0.2536157506764953, + "grad_norm": 0.7253970282513357, + "learning_rate": 0.0002999016988769566, + "loss": 7.1798, + "step": 2718 + }, + { + "epoch": 0.2537090603713726, + "grad_norm": 1.5142436816721228, + "learning_rate": 0.00029990142489700613, + "loss": 7.1463, + "step": 2719 + }, + { + "epoch": 0.2538023700662499, + "grad_norm": 1.825893322980153, + "learning_rate": 0.0002999011505359007, + "loss": 7.1722, + "step": 2720 + }, + { + "epoch": 0.2538956797611272, + "grad_norm": 1.4601486899924625, + "learning_rate": 0.000299900875793641, + "loss": 7.2522, + "step": 2721 + }, + { + "epoch": 0.25398898945600445, + "grad_norm": 1.5163614821805715, + "learning_rate": 0.00029990060067022783, + "loss": 7.1299, + "step": 2722 + }, + { + "epoch": 0.25408229915088176, + "grad_norm": 0.8700634835033962, + "learning_rate": 0.0002999003251656618, + "loss": 7.4275, + "step": 2723 + }, + { + "epoch": 0.25417560884575907, + "grad_norm": 1.1766425268320047, + "learning_rate": 0.00029990004927994365, + "loss": 7.0926, + "step": 2724 + }, + { + "epoch": 0.2542689185406364, + "grad_norm": 0.857278314097692, + "learning_rate": 0.0002998997730130741, + "loss": 7.146, + "step": 2725 + }, + { + "epoch": 0.2543622282355137, + "grad_norm": 1.1111224292851256, + "learning_rate": 0.00029989949636505383, + "loss": 7.0699, + "step": 2726 + }, + { + "epoch": 0.25445553793039094, + "grad_norm": 0.6735819140541554, + "learning_rate": 0.00029989921933588356, + "loss": 7.1727, + "step": 2727 + }, + { + "epoch": 0.25454884762526825, + "grad_norm": 0.7280624671476945, + "learning_rate": 0.0002998989419255639, + "loss": 7.1914, + "step": 2728 + }, + { + "epoch": 0.25464215732014556, + "grad_norm": 1.1106470325521631, + "learning_rate": 0.00029989866413409575, + "loss": 7.0124, + "step": 2729 + }, + { + "epoch": 0.25473546701502287, + "grad_norm": 0.7046068476506434, + "learning_rate": 0.0002998983859614796, + "loss": 7.1886, + "step": 2730 + }, + { + "epoch": 0.2548287767099002, + "grad_norm": 0.7430031592057198, + "learning_rate": 0.00029989810740771634, + "loss": 7.1271, + "step": 2731 + }, + { + "epoch": 0.2549220864047775, + "grad_norm": 0.8342074673894073, + "learning_rate": 0.0002998978284728065, + "loss": 7.0636, + "step": 2732 + }, + { + "epoch": 0.25501539609965473, + "grad_norm": 0.7437020104520016, + "learning_rate": 0.000299897549156751, + "loss": 7.328, + "step": 2733 + }, + { + "epoch": 0.25510870579453204, + "grad_norm": 1.130128647957407, + "learning_rate": 0.00029989726945955035, + "loss": 6.7697, + "step": 2734 + }, + { + "epoch": 0.25520201548940935, + "grad_norm": 0.6765994203124536, + "learning_rate": 0.0002998969893812054, + "loss": 7.0429, + "step": 2735 + }, + { + "epoch": 0.25529532518428666, + "grad_norm": 16.601560031898227, + "learning_rate": 0.00029989670892171676, + "loss": 7.7215, + "step": 2736 + }, + { + "epoch": 0.25538863487916397, + "grad_norm": 1.130603582281056, + "learning_rate": 0.0002998964280810852, + "loss": 7.0078, + "step": 2737 + }, + { + "epoch": 0.2554819445740412, + "grad_norm": 0.9973625041100456, + "learning_rate": 0.00029989614685931146, + "loss": 7.1812, + "step": 2738 + }, + { + "epoch": 0.25557525426891853, + "grad_norm": 1.1273130401080207, + "learning_rate": 0.0002998958652563962, + "loss": 7.1186, + "step": 2739 + }, + { + "epoch": 0.25566856396379584, + "grad_norm": 0.827476185145613, + "learning_rate": 0.0002998955832723402, + "loss": 7.2343, + "step": 2740 + }, + { + "epoch": 0.25576187365867314, + "grad_norm": 0.9476824908969727, + "learning_rate": 0.00029989530090714404, + "loss": 7.2219, + "step": 2741 + }, + { + "epoch": 0.25585518335355045, + "grad_norm": 0.7052723717044384, + "learning_rate": 0.00029989501816080863, + "loss": 7.187, + "step": 2742 + }, + { + "epoch": 0.2559484930484277, + "grad_norm": 0.9370285351760061, + "learning_rate": 0.00029989473503333453, + "loss": 7.0904, + "step": 2743 + }, + { + "epoch": 0.256041802743305, + "grad_norm": 1.4806085543479977, + "learning_rate": 0.00029989445152472254, + "loss": 7.149, + "step": 2744 + }, + { + "epoch": 0.2561351124381823, + "grad_norm": 1.0227295645635377, + "learning_rate": 0.0002998941676349733, + "loss": 7.0225, + "step": 2745 + }, + { + "epoch": 0.25622842213305963, + "grad_norm": 0.8133472874139136, + "learning_rate": 0.0002998938833640877, + "loss": 7.0571, + "step": 2746 + }, + { + "epoch": 0.25632173182793694, + "grad_norm": 1.010338881818985, + "learning_rate": 0.0002998935987120663, + "loss": 6.9045, + "step": 2747 + }, + { + "epoch": 0.25641504152281425, + "grad_norm": 0.6959205852743542, + "learning_rate": 0.0002998933136789099, + "loss": 7.1839, + "step": 2748 + }, + { + "epoch": 0.2565083512176915, + "grad_norm": 1.0515264558208068, + "learning_rate": 0.0002998930282646192, + "loss": 6.9568, + "step": 2749 + }, + { + "epoch": 0.2566016609125688, + "grad_norm": 0.7018667237576836, + "learning_rate": 0.0002998927424691949, + "loss": 6.8547, + "step": 2750 + }, + { + "epoch": 0.2566949706074461, + "grad_norm": 1.2028227375442804, + "learning_rate": 0.0002998924562926378, + "loss": 6.8477, + "step": 2751 + }, + { + "epoch": 0.2567882803023234, + "grad_norm": 0.7032943338183415, + "learning_rate": 0.00029989216973494854, + "loss": 7.1339, + "step": 2752 + }, + { + "epoch": 0.25688158999720073, + "grad_norm": 1.0344266071888857, + "learning_rate": 0.0002998918827961279, + "loss": 7.0619, + "step": 2753 + }, + { + "epoch": 0.256974899692078, + "grad_norm": 1.2180340096445086, + "learning_rate": 0.00029989159547617666, + "loss": 6.7012, + "step": 2754 + }, + { + "epoch": 0.2570682093869553, + "grad_norm": 2.5348055685530086, + "learning_rate": 0.0002998913077750955, + "loss": 7.0115, + "step": 2755 + }, + { + "epoch": 0.2571615190818326, + "grad_norm": 1.1925087369692269, + "learning_rate": 0.00029989101969288506, + "loss": 6.5463, + "step": 2756 + }, + { + "epoch": 0.2572548287767099, + "grad_norm": 1.925909623268966, + "learning_rate": 0.0002998907312295462, + "loss": 6.971, + "step": 2757 + }, + { + "epoch": 0.2573481384715872, + "grad_norm": 3.216575802231962, + "learning_rate": 0.0002998904423850796, + "loss": 7.1201, + "step": 2758 + }, + { + "epoch": 0.25744144816646447, + "grad_norm": 1.6720501351291104, + "learning_rate": 0.000299890153159486, + "loss": 6.9545, + "step": 2759 + }, + { + "epoch": 0.2575347578613418, + "grad_norm": 1.4027575275901394, + "learning_rate": 0.0002998898635527662, + "loss": 7.0396, + "step": 2760 + }, + { + "epoch": 0.2576280675562191, + "grad_norm": 0.9073241269668102, + "learning_rate": 0.0002998895735649208, + "loss": 6.9581, + "step": 2761 + }, + { + "epoch": 0.2577213772510964, + "grad_norm": 1.15532762062441, + "learning_rate": 0.00029988928319595065, + "loss": 6.9935, + "step": 2762 + }, + { + "epoch": 0.2578146869459737, + "grad_norm": 1.0737365575353597, + "learning_rate": 0.00029988899244585645, + "loss": 7.0994, + "step": 2763 + }, + { + "epoch": 0.257907996640851, + "grad_norm": 1.1868394384311078, + "learning_rate": 0.000299888701314639, + "loss": 7.4042, + "step": 2764 + }, + { + "epoch": 0.25800130633572826, + "grad_norm": 0.8750664209687217, + "learning_rate": 0.0002998884098022989, + "loss": 7.0697, + "step": 2765 + }, + { + "epoch": 0.2580946160306056, + "grad_norm": 1.5100255078421574, + "learning_rate": 0.000299888117908837, + "loss": 6.9115, + "step": 2766 + }, + { + "epoch": 0.2581879257254829, + "grad_norm": 2.7271441787147457, + "learning_rate": 0.0002998878256342541, + "loss": 6.9888, + "step": 2767 + }, + { + "epoch": 0.2582812354203602, + "grad_norm": 2.3687891755742276, + "learning_rate": 0.0002998875329785507, + "loss": 6.923, + "step": 2768 + }, + { + "epoch": 0.2583745451152375, + "grad_norm": 1.0860165195315872, + "learning_rate": 0.00029988723994172777, + "loss": 7.2396, + "step": 2769 + }, + { + "epoch": 0.25846785481011475, + "grad_norm": 1.7119201526063994, + "learning_rate": 0.000299886946523786, + "loss": 6.8372, + "step": 2770 + }, + { + "epoch": 0.25856116450499206, + "grad_norm": 1.4164405565192166, + "learning_rate": 0.0002998866527247262, + "loss": 7.0983, + "step": 2771 + }, + { + "epoch": 0.25865447419986937, + "grad_norm": 0.707225051529312, + "learning_rate": 0.00029988635854454894, + "loss": 6.5508, + "step": 2772 + }, + { + "epoch": 0.2587477838947467, + "grad_norm": 1.6155420548649513, + "learning_rate": 0.0002998860639832551, + "loss": 7.0203, + "step": 2773 + }, + { + "epoch": 0.258841093589624, + "grad_norm": 1.13010891579668, + "learning_rate": 0.0002998857690408454, + "loss": 6.6454, + "step": 2774 + }, + { + "epoch": 0.25893440328450124, + "grad_norm": 0.829876233289909, + "learning_rate": 0.0002998854737173206, + "loss": 7.1643, + "step": 2775 + }, + { + "epoch": 0.25902771297937854, + "grad_norm": 1.4583870414947728, + "learning_rate": 0.0002998851780126814, + "loss": 6.9109, + "step": 2776 + }, + { + "epoch": 0.25912102267425585, + "grad_norm": 1.1209007619737839, + "learning_rate": 0.0002998848819269286, + "loss": 6.5288, + "step": 2777 + }, + { + "epoch": 0.25921433236913316, + "grad_norm": 1.3931515524950158, + "learning_rate": 0.00029988458546006296, + "loss": 6.8676, + "step": 2778 + }, + { + "epoch": 0.25930764206401047, + "grad_norm": 1.8582259868202469, + "learning_rate": 0.00029988428861208523, + "loss": 7.4222, + "step": 2779 + }, + { + "epoch": 0.2594009517588877, + "grad_norm": 0.7091254629728435, + "learning_rate": 0.0002998839913829962, + "loss": 7.1345, + "step": 2780 + }, + { + "epoch": 0.25949426145376503, + "grad_norm": 1.1512719778302685, + "learning_rate": 0.0002998836937727965, + "loss": 6.8586, + "step": 2781 + }, + { + "epoch": 0.25958757114864234, + "grad_norm": 1.1694771402458375, + "learning_rate": 0.000299883395781487, + "loss": 6.7171, + "step": 2782 + }, + { + "epoch": 0.25968088084351965, + "grad_norm": 1.685950060923811, + "learning_rate": 0.00029988309740906836, + "loss": 6.7449, + "step": 2783 + }, + { + "epoch": 0.25977419053839695, + "grad_norm": 0.7318691339196545, + "learning_rate": 0.0002998827986555415, + "loss": 6.777, + "step": 2784 + }, + { + "epoch": 0.25986750023327426, + "grad_norm": 1.2708924243159023, + "learning_rate": 0.000299882499520907, + "loss": 6.892, + "step": 2785 + }, + { + "epoch": 0.2599608099281515, + "grad_norm": 1.3176699601864517, + "learning_rate": 0.00029988220000516576, + "loss": 6.6858, + "step": 2786 + }, + { + "epoch": 0.2600541196230288, + "grad_norm": 0.7875303285546784, + "learning_rate": 0.0002998819001083185, + "loss": 6.3747, + "step": 2787 + }, + { + "epoch": 0.26014742931790613, + "grad_norm": 0.9320754355039413, + "learning_rate": 0.00029988159983036593, + "loss": 6.7624, + "step": 2788 + }, + { + "epoch": 0.26024073901278344, + "grad_norm": 2.320705202242556, + "learning_rate": 0.00029988129917130876, + "loss": 6.3827, + "step": 2789 + }, + { + "epoch": 0.26033404870766075, + "grad_norm": 1.6658799309923438, + "learning_rate": 0.000299880998131148, + "loss": 6.8969, + "step": 2790 + }, + { + "epoch": 0.260427358402538, + "grad_norm": 1.3511181894377153, + "learning_rate": 0.00029988069670988416, + "loss": 6.547, + "step": 2791 + }, + { + "epoch": 0.2605206680974153, + "grad_norm": 0.8741375440473018, + "learning_rate": 0.0002998803949075181, + "loss": 6.6461, + "step": 2792 + }, + { + "epoch": 0.2606139777922926, + "grad_norm": 1.9751024738799838, + "learning_rate": 0.0002998800927240507, + "loss": 6.5653, + "step": 2793 + }, + { + "epoch": 0.2607072874871699, + "grad_norm": 1.3737301379541365, + "learning_rate": 0.0002998797901594825, + "loss": 6.3034, + "step": 2794 + }, + { + "epoch": 0.26080059718204723, + "grad_norm": 0.9876405117613397, + "learning_rate": 0.00029987948721381445, + "loss": 6.1794, + "step": 2795 + }, + { + "epoch": 0.2608939068769245, + "grad_norm": 0.7944193911636318, + "learning_rate": 0.0002998791838870473, + "loss": 6.3914, + "step": 2796 + }, + { + "epoch": 0.2609872165718018, + "grad_norm": 1.351939854304221, + "learning_rate": 0.0002998788801791817, + "loss": 6.7087, + "step": 2797 + }, + { + "epoch": 0.2610805262666791, + "grad_norm": 1.4972792218065851, + "learning_rate": 0.00029987857609021856, + "loss": 6.8745, + "step": 2798 + }, + { + "epoch": 0.2611738359615564, + "grad_norm": 0.8050292812723596, + "learning_rate": 0.0002998782716201586, + "loss": 6.4374, + "step": 2799 + }, + { + "epoch": 0.2612671456564337, + "grad_norm": 0.908795793108601, + "learning_rate": 0.00029987796676900257, + "loss": 6.5916, + "step": 2800 + }, + { + "epoch": 0.261360455351311, + "grad_norm": 0.9480039661974778, + "learning_rate": 0.00029987766153675126, + "loss": 6.3903, + "step": 2801 + }, + { + "epoch": 0.2614537650461883, + "grad_norm": 0.9352082988320429, + "learning_rate": 0.00029987735592340543, + "loss": 7.0441, + "step": 2802 + }, + { + "epoch": 0.2615470747410656, + "grad_norm": 1.1003546450108046, + "learning_rate": 0.00029987704992896595, + "loss": 6.704, + "step": 2803 + }, + { + "epoch": 0.2616403844359429, + "grad_norm": 1.0771909728545042, + "learning_rate": 0.00029987674355343353, + "loss": 6.4595, + "step": 2804 + }, + { + "epoch": 0.2617336941308202, + "grad_norm": 1.701885267973432, + "learning_rate": 0.0002998764367968089, + "loss": 6.133, + "step": 2805 + }, + { + "epoch": 0.2618270038256975, + "grad_norm": 2.9471908036349235, + "learning_rate": 0.0002998761296590929, + "loss": 6.4779, + "step": 2806 + }, + { + "epoch": 0.26192031352057477, + "grad_norm": 1.3927661638255793, + "learning_rate": 0.00029987582214028625, + "loss": 6.0914, + "step": 2807 + }, + { + "epoch": 0.2620136232154521, + "grad_norm": 0.8572521700483432, + "learning_rate": 0.00029987551424038987, + "loss": 6.432, + "step": 2808 + }, + { + "epoch": 0.2621069329103294, + "grad_norm": 3.679378325813522, + "learning_rate": 0.00029987520595940444, + "loss": 6.3051, + "step": 2809 + }, + { + "epoch": 0.2622002426052067, + "grad_norm": 1.8566175152530962, + "learning_rate": 0.0002998748972973307, + "loss": 6.1653, + "step": 2810 + }, + { + "epoch": 0.262293552300084, + "grad_norm": 123.78476843144507, + "learning_rate": 0.00029987458825416955, + "loss": 6.9083, + "step": 2811 + }, + { + "epoch": 0.26238686199496125, + "grad_norm": 1.7104026186702108, + "learning_rate": 0.0002998742788299217, + "loss": 6.6539, + "step": 2812 + }, + { + "epoch": 0.26248017168983856, + "grad_norm": 1.7158298778477958, + "learning_rate": 0.00029987396902458793, + "loss": 6.4463, + "step": 2813 + }, + { + "epoch": 0.26257348138471587, + "grad_norm": 1.8261768321271012, + "learning_rate": 0.00029987365883816906, + "loss": 6.9289, + "step": 2814 + }, + { + "epoch": 0.2626667910795932, + "grad_norm": 1.6870219637675292, + "learning_rate": 0.0002998733482706659, + "loss": 6.9755, + "step": 2815 + }, + { + "epoch": 0.2627601007744705, + "grad_norm": 1.160780342277205, + "learning_rate": 0.00029987303732207916, + "loss": 6.3758, + "step": 2816 + }, + { + "epoch": 0.2628534104693478, + "grad_norm": 1.1505581090769632, + "learning_rate": 0.0002998727259924097, + "loss": 6.9523, + "step": 2817 + }, + { + "epoch": 0.26294672016422505, + "grad_norm": 1.1267197684113246, + "learning_rate": 0.00029987241428165837, + "loss": 6.4758, + "step": 2818 + }, + { + "epoch": 0.26304002985910235, + "grad_norm": 1.1214975065422155, + "learning_rate": 0.0002998721021898258, + "loss": 7.0031, + "step": 2819 + }, + { + "epoch": 0.26313333955397966, + "grad_norm": 1.8631159515091265, + "learning_rate": 0.0002998717897169129, + "loss": 7.3058, + "step": 2820 + }, + { + "epoch": 0.26322664924885697, + "grad_norm": 1.5604722284095949, + "learning_rate": 0.0002998714768629204, + "loss": 7.3805, + "step": 2821 + }, + { + "epoch": 0.2633199589437343, + "grad_norm": 1.3031845989766162, + "learning_rate": 0.0002998711636278492, + "loss": 6.633, + "step": 2822 + }, + { + "epoch": 0.26341326863861153, + "grad_norm": 1.5199417780166244, + "learning_rate": 0.00029987085001169996, + "loss": 6.773, + "step": 2823 + }, + { + "epoch": 0.26350657833348884, + "grad_norm": 1.2713872968371889, + "learning_rate": 0.00029987053601447357, + "loss": 6.5231, + "step": 2824 + }, + { + "epoch": 0.26359988802836615, + "grad_norm": 0.8722030120699005, + "learning_rate": 0.00029987022163617083, + "loss": 7.0051, + "step": 2825 + }, + { + "epoch": 0.26369319772324346, + "grad_norm": 1.6924626807901315, + "learning_rate": 0.00029986990687679244, + "loss": 6.8168, + "step": 2826 + }, + { + "epoch": 0.26378650741812076, + "grad_norm": 1.237906129760944, + "learning_rate": 0.0002998695917363393, + "loss": 6.5949, + "step": 2827 + }, + { + "epoch": 0.263879817112998, + "grad_norm": 1.063024216626195, + "learning_rate": 0.0002998692762148122, + "loss": 6.3902, + "step": 2828 + }, + { + "epoch": 0.2639731268078753, + "grad_norm": 1.9562022023735273, + "learning_rate": 0.0002998689603122119, + "loss": 6.4588, + "step": 2829 + }, + { + "epoch": 0.26406643650275263, + "grad_norm": 1.3752088287000774, + "learning_rate": 0.0002998686440285393, + "loss": 6.8247, + "step": 2830 + }, + { + "epoch": 0.26415974619762994, + "grad_norm": 1.3622182300529317, + "learning_rate": 0.0002998683273637951, + "loss": 6.5251, + "step": 2831 + }, + { + "epoch": 0.26425305589250725, + "grad_norm": 1.5866712920629535, + "learning_rate": 0.0002998680103179801, + "loss": 6.8754, + "step": 2832 + }, + { + "epoch": 0.2643463655873845, + "grad_norm": 1.5861473653011227, + "learning_rate": 0.00029986769289109513, + "loss": 6.7142, + "step": 2833 + }, + { + "epoch": 0.2644396752822618, + "grad_norm": 1.881050947243346, + "learning_rate": 0.000299867375083141, + "loss": 6.6181, + "step": 2834 + }, + { + "epoch": 0.2645329849771391, + "grad_norm": 0.9514471679664168, + "learning_rate": 0.00029986705689411864, + "loss": 6.6959, + "step": 2835 + }, + { + "epoch": 0.2646262946720164, + "grad_norm": 1.0265179852611912, + "learning_rate": 0.0002998667383240287, + "loss": 6.58, + "step": 2836 + }, + { + "epoch": 0.26471960436689373, + "grad_norm": 0.7499786980990184, + "learning_rate": 0.00029986641937287197, + "loss": 6.5638, + "step": 2837 + }, + { + "epoch": 0.26481291406177104, + "grad_norm": 0.7910483906262609, + "learning_rate": 0.0002998661000406494, + "loss": 6.3068, + "step": 2838 + }, + { + "epoch": 0.2649062237566483, + "grad_norm": 0.8829451208542559, + "learning_rate": 0.0002998657803273617, + "loss": 6.6163, + "step": 2839 + }, + { + "epoch": 0.2649995334515256, + "grad_norm": 1.1237416398275653, + "learning_rate": 0.0002998654602330097, + "loss": 6.6788, + "step": 2840 + }, + { + "epoch": 0.2650928431464029, + "grad_norm": 0.9278138010579025, + "learning_rate": 0.0002998651397575943, + "loss": 6.7083, + "step": 2841 + }, + { + "epoch": 0.2651861528412802, + "grad_norm": 1.044817536679954, + "learning_rate": 0.0002998648189011162, + "loss": 6.5495, + "step": 2842 + }, + { + "epoch": 0.26527946253615753, + "grad_norm": 0.9432033880972603, + "learning_rate": 0.00029986449766357626, + "loss": 6.1747, + "step": 2843 + }, + { + "epoch": 0.2653727722310348, + "grad_norm": 0.8288989720167861, + "learning_rate": 0.00029986417604497526, + "loss": 5.9815, + "step": 2844 + }, + { + "epoch": 0.2654660819259121, + "grad_norm": 1.3203463554501202, + "learning_rate": 0.00029986385404531406, + "loss": 6.3273, + "step": 2845 + }, + { + "epoch": 0.2655593916207894, + "grad_norm": 1.8569603803425576, + "learning_rate": 0.0002998635316645935, + "loss": 6.9409, + "step": 2846 + }, + { + "epoch": 0.2656527013156667, + "grad_norm": 0.9805233217495881, + "learning_rate": 0.0002998632089028144, + "loss": 6.2218, + "step": 2847 + }, + { + "epoch": 0.265746011010544, + "grad_norm": 1.1077325501537751, + "learning_rate": 0.0002998628857599775, + "loss": 6.834, + "step": 2848 + }, + { + "epoch": 0.26583932070542127, + "grad_norm": 1.332276015625214, + "learning_rate": 0.00029986256223608367, + "loss": 6.1787, + "step": 2849 + }, + { + "epoch": 0.2659326304002986, + "grad_norm": 0.988754978290132, + "learning_rate": 0.0002998622383311338, + "loss": 6.4543, + "step": 2850 + }, + { + "epoch": 0.2660259400951759, + "grad_norm": 1.110649221954072, + "learning_rate": 0.0002998619140451286, + "loss": 6.2549, + "step": 2851 + }, + { + "epoch": 0.2661192497900532, + "grad_norm": 1.806910288764524, + "learning_rate": 0.0002998615893780689, + "loss": 6.6523, + "step": 2852 + }, + { + "epoch": 0.2662125594849305, + "grad_norm": 1.2151413240335072, + "learning_rate": 0.00029986126432995566, + "loss": 6.7841, + "step": 2853 + }, + { + "epoch": 0.2663058691798078, + "grad_norm": 1.3117610538843678, + "learning_rate": 0.0002998609389007896, + "loss": 6.4434, + "step": 2854 + }, + { + "epoch": 0.26639917887468506, + "grad_norm": 1.5117343447242015, + "learning_rate": 0.0002998606130905716, + "loss": 6.6025, + "step": 2855 + }, + { + "epoch": 0.26649248856956237, + "grad_norm": 1.3207084328710839, + "learning_rate": 0.0002998602868993024, + "loss": 6.4595, + "step": 2856 + }, + { + "epoch": 0.2665857982644397, + "grad_norm": 1.0117093042304015, + "learning_rate": 0.00029985996032698284, + "loss": 6.2209, + "step": 2857 + }, + { + "epoch": 0.266679107959317, + "grad_norm": 1.32977456152614, + "learning_rate": 0.00029985963337361384, + "loss": 6.5832, + "step": 2858 + }, + { + "epoch": 0.2667724176541943, + "grad_norm": 1.136668604062389, + "learning_rate": 0.0002998593060391962, + "loss": 6.4968, + "step": 2859 + }, + { + "epoch": 0.26686572734907155, + "grad_norm": 0.9209272686395955, + "learning_rate": 0.0002998589783237307, + "loss": 6.6459, + "step": 2860 + }, + { + "epoch": 0.26695903704394885, + "grad_norm": 1.0560013937023491, + "learning_rate": 0.0002998586502272182, + "loss": 6.0673, + "step": 2861 + }, + { + "epoch": 0.26705234673882616, + "grad_norm": 1.0242382708896545, + "learning_rate": 0.0002998583217496596, + "loss": 6.3049, + "step": 2862 + }, + { + "epoch": 0.26714565643370347, + "grad_norm": 1.0532931525109541, + "learning_rate": 0.00029985799289105567, + "loss": 6.0465, + "step": 2863 + }, + { + "epoch": 0.2672389661285808, + "grad_norm": 1.3532943831524478, + "learning_rate": 0.0002998576636514072, + "loss": 6.7483, + "step": 2864 + }, + { + "epoch": 0.26733227582345803, + "grad_norm": 1.6313083532063344, + "learning_rate": 0.00029985733403071514, + "loss": 6.3766, + "step": 2865 + }, + { + "epoch": 0.26742558551833534, + "grad_norm": 0.9374545101608638, + "learning_rate": 0.0002998570040289803, + "loss": 6.3202, + "step": 2866 + }, + { + "epoch": 0.26751889521321265, + "grad_norm": 1.3729810074943327, + "learning_rate": 0.00029985667364620336, + "loss": 6.4592, + "step": 2867 + }, + { + "epoch": 0.26761220490808996, + "grad_norm": 1.3331984433497737, + "learning_rate": 0.0002998563428823854, + "loss": 6.4405, + "step": 2868 + }, + { + "epoch": 0.26770551460296726, + "grad_norm": 0.9528729939108638, + "learning_rate": 0.00029985601173752706, + "loss": 5.8694, + "step": 2869 + }, + { + "epoch": 0.2677988242978446, + "grad_norm": 1.1406260107745358, + "learning_rate": 0.0002998556802116294, + "loss": 6.5403, + "step": 2870 + }, + { + "epoch": 0.2678921339927218, + "grad_norm": 0.9435767009736282, + "learning_rate": 0.00029985534830469305, + "loss": 6.4655, + "step": 2871 + }, + { + "epoch": 0.26798544368759913, + "grad_norm": 1.0621470923214267, + "learning_rate": 0.0002998550160167189, + "loss": 6.2813, + "step": 2872 + }, + { + "epoch": 0.26807875338247644, + "grad_norm": 0.9595184952319845, + "learning_rate": 0.0002998546833477079, + "loss": 6.3168, + "step": 2873 + }, + { + "epoch": 0.26817206307735375, + "grad_norm": 0.8342460394228779, + "learning_rate": 0.00029985435029766084, + "loss": 6.4907, + "step": 2874 + }, + { + "epoch": 0.26826537277223106, + "grad_norm": 1.0610324933131103, + "learning_rate": 0.00029985401686657856, + "loss": 5.9569, + "step": 2875 + }, + { + "epoch": 0.2683586824671083, + "grad_norm": 0.8546686013263918, + "learning_rate": 0.0002998536830544619, + "loss": 6.4376, + "step": 2876 + }, + { + "epoch": 0.2684519921619856, + "grad_norm": 1.3733420077467682, + "learning_rate": 0.00029985334886131163, + "loss": 6.7954, + "step": 2877 + }, + { + "epoch": 0.26854530185686293, + "grad_norm": 0.9484846714223644, + "learning_rate": 0.00029985301428712876, + "loss": 6.1352, + "step": 2878 + }, + { + "epoch": 0.26863861155174024, + "grad_norm": 0.9106942160702017, + "learning_rate": 0.00029985267933191405, + "loss": 6.6763, + "step": 2879 + }, + { + "epoch": 0.26873192124661754, + "grad_norm": 1.0897194682119125, + "learning_rate": 0.0002998523439956684, + "loss": 6.4752, + "step": 2880 + }, + { + "epoch": 0.2688252309414948, + "grad_norm": 1.3560182941328, + "learning_rate": 0.0002998520082783926, + "loss": 6.4745, + "step": 2881 + }, + { + "epoch": 0.2689185406363721, + "grad_norm": 1.1252688958707169, + "learning_rate": 0.00029985167218008754, + "loss": 6.4519, + "step": 2882 + }, + { + "epoch": 0.2690118503312494, + "grad_norm": 1.323808117367373, + "learning_rate": 0.00029985133570075405, + "loss": 6.2683, + "step": 2883 + }, + { + "epoch": 0.2691051600261267, + "grad_norm": 1.3698096685734458, + "learning_rate": 0.000299850998840393, + "loss": 6.1935, + "step": 2884 + }, + { + "epoch": 0.26919846972100403, + "grad_norm": 1.1036107056098696, + "learning_rate": 0.00029985066159900523, + "loss": 6.0434, + "step": 2885 + }, + { + "epoch": 0.2692917794158813, + "grad_norm": 1.657384236942326, + "learning_rate": 0.0002998503239765917, + "loss": 6.0529, + "step": 2886 + }, + { + "epoch": 0.2693850891107586, + "grad_norm": 1.1672002803762613, + "learning_rate": 0.0002998499859731531, + "loss": 6.479, + "step": 2887 + }, + { + "epoch": 0.2694783988056359, + "grad_norm": 1.0512349345124927, + "learning_rate": 0.00029984964758869044, + "loss": 5.9528, + "step": 2888 + }, + { + "epoch": 0.2695717085005132, + "grad_norm": 1.2446490755115542, + "learning_rate": 0.00029984930882320446, + "loss": 6.5538, + "step": 2889 + }, + { + "epoch": 0.2696650181953905, + "grad_norm": 1.1491765618068799, + "learning_rate": 0.0002998489696766961, + "loss": 6.3363, + "step": 2890 + }, + { + "epoch": 0.2697583278902678, + "grad_norm": 1.099217671871206, + "learning_rate": 0.0002998486301491662, + "loss": 6.4027, + "step": 2891 + }, + { + "epoch": 0.2698516375851451, + "grad_norm": 1.1935560071353433, + "learning_rate": 0.00029984829024061563, + "loss": 5.7848, + "step": 2892 + }, + { + "epoch": 0.2699449472800224, + "grad_norm": 1.3036896536072902, + "learning_rate": 0.0002998479499510452, + "loss": 6.3302, + "step": 2893 + }, + { + "epoch": 0.2700382569748997, + "grad_norm": 1.1791734898559998, + "learning_rate": 0.00029984760928045584, + "loss": 6.1023, + "step": 2894 + }, + { + "epoch": 0.270131566669777, + "grad_norm": 2.737661180441508, + "learning_rate": 0.0002998472682288484, + "loss": 6.1595, + "step": 2895 + }, + { + "epoch": 0.2702248763646543, + "grad_norm": 1.002350232814558, + "learning_rate": 0.0002998469267962238, + "loss": 6.3745, + "step": 2896 + }, + { + "epoch": 0.27031818605953156, + "grad_norm": 1.9054441751390194, + "learning_rate": 0.0002998465849825828, + "loss": 6.4242, + "step": 2897 + }, + { + "epoch": 0.27041149575440887, + "grad_norm": 2.1812389747926915, + "learning_rate": 0.00029984624278792636, + "loss": 6.3837, + "step": 2898 + }, + { + "epoch": 0.2705048054492862, + "grad_norm": 1.6929827302132987, + "learning_rate": 0.0002998459002122553, + "loss": 5.836, + "step": 2899 + }, + { + "epoch": 0.2705981151441635, + "grad_norm": 1.3143445037688004, + "learning_rate": 0.0002998455572555705, + "loss": 6.4038, + "step": 2900 + }, + { + "epoch": 0.2706914248390408, + "grad_norm": 0.9523028816074974, + "learning_rate": 0.00029984521391787275, + "loss": 6.1436, + "step": 2901 + }, + { + "epoch": 0.27078473453391805, + "grad_norm": 4.251667170556117, + "learning_rate": 0.0002998448701991631, + "loss": 6.2964, + "step": 2902 + }, + { + "epoch": 0.27087804422879536, + "grad_norm": 2.068106039897803, + "learning_rate": 0.00029984452609944234, + "loss": 5.8982, + "step": 2903 + }, + { + "epoch": 0.27097135392367266, + "grad_norm": 1.2602370320192926, + "learning_rate": 0.00029984418161871135, + "loss": 6.1066, + "step": 2904 + }, + { + "epoch": 0.27106466361855, + "grad_norm": 1.4511304150619195, + "learning_rate": 0.00029984383675697094, + "loss": 6.0468, + "step": 2905 + }, + { + "epoch": 0.2711579733134273, + "grad_norm": 1.118766078712981, + "learning_rate": 0.00029984349151422206, + "loss": 6.2919, + "step": 2906 + }, + { + "epoch": 0.2712512830083046, + "grad_norm": 1.8496123534287199, + "learning_rate": 0.00029984314589046556, + "loss": 6.2943, + "step": 2907 + }, + { + "epoch": 0.27134459270318184, + "grad_norm": 1.6605801213285505, + "learning_rate": 0.00029984279988570236, + "loss": 6.1516, + "step": 2908 + }, + { + "epoch": 0.27143790239805915, + "grad_norm": 0.9981841615832835, + "learning_rate": 0.0002998424534999332, + "loss": 6.3753, + "step": 2909 + }, + { + "epoch": 0.27153121209293646, + "grad_norm": 1.2397761412889559, + "learning_rate": 0.0002998421067331592, + "loss": 6.3324, + "step": 2910 + }, + { + "epoch": 0.27162452178781377, + "grad_norm": 1.8212714110624002, + "learning_rate": 0.00029984175958538105, + "loss": 6.2984, + "step": 2911 + }, + { + "epoch": 0.2717178314826911, + "grad_norm": 1.1204192720834054, + "learning_rate": 0.0002998414120565997, + "loss": 5.8514, + "step": 2912 + }, + { + "epoch": 0.2718111411775683, + "grad_norm": 1.2527001917604408, + "learning_rate": 0.00029984106414681597, + "loss": 5.5361, + "step": 2913 + }, + { + "epoch": 0.27190445087244564, + "grad_norm": 0.9469174641238001, + "learning_rate": 0.0002998407158560309, + "loss": 6.2415, + "step": 2914 + }, + { + "epoch": 0.27199776056732294, + "grad_norm": 1.0154877875310926, + "learning_rate": 0.0002998403671842452, + "loss": 6.413, + "step": 2915 + }, + { + "epoch": 0.27209107026220025, + "grad_norm": 1.3732889994691995, + "learning_rate": 0.0002998400181314599, + "loss": 5.9931, + "step": 2916 + }, + { + "epoch": 0.27218437995707756, + "grad_norm": 1.2154555597420507, + "learning_rate": 0.0002998396686976757, + "loss": 6.3181, + "step": 2917 + }, + { + "epoch": 0.2722776896519548, + "grad_norm": 0.9998802360338029, + "learning_rate": 0.00029983931888289376, + "loss": 6.3006, + "step": 2918 + }, + { + "epoch": 0.2723709993468321, + "grad_norm": 1.3094004886019772, + "learning_rate": 0.00029983896868711476, + "loss": 6.2943, + "step": 2919 + }, + { + "epoch": 0.27246430904170943, + "grad_norm": 0.9119184692063845, + "learning_rate": 0.00029983861811033963, + "loss": 6.2393, + "step": 2920 + }, + { + "epoch": 0.27255761873658674, + "grad_norm": 0.8737407341970833, + "learning_rate": 0.00029983826715256924, + "loss": 6.2567, + "step": 2921 + }, + { + "epoch": 0.27265092843146405, + "grad_norm": 1.317997773008493, + "learning_rate": 0.0002998379158138046, + "loss": 6.1019, + "step": 2922 + }, + { + "epoch": 0.27274423812634135, + "grad_norm": 1.1325804987864876, + "learning_rate": 0.0002998375640940465, + "loss": 6.3643, + "step": 2923 + }, + { + "epoch": 0.2728375478212186, + "grad_norm": 1.0429569574663868, + "learning_rate": 0.0002998372119932958, + "loss": 5.8283, + "step": 2924 + }, + { + "epoch": 0.2729308575160959, + "grad_norm": 1.621797270332888, + "learning_rate": 0.0002998368595115535, + "loss": 6.3843, + "step": 2925 + }, + { + "epoch": 0.2730241672109732, + "grad_norm": 0.9155348493074348, + "learning_rate": 0.0002998365066488205, + "loss": 6.2059, + "step": 2926 + }, + { + "epoch": 0.27311747690585053, + "grad_norm": 1.1251231210015156, + "learning_rate": 0.0002998361534050976, + "loss": 6.0947, + "step": 2927 + }, + { + "epoch": 0.27321078660072784, + "grad_norm": 1.4642730614637596, + "learning_rate": 0.0002998357997803857, + "loss": 6.0805, + "step": 2928 + }, + { + "epoch": 0.2733040962956051, + "grad_norm": 2.1880745111219038, + "learning_rate": 0.00029983544577468586, + "loss": 5.5612, + "step": 2929 + }, + { + "epoch": 0.2733974059904824, + "grad_norm": 1.0059345903591634, + "learning_rate": 0.0002998350913879988, + "loss": 6.3397, + "step": 2930 + }, + { + "epoch": 0.2734907156853597, + "grad_norm": 1.5823094513185247, + "learning_rate": 0.00029983473662032546, + "loss": 6.3805, + "step": 2931 + }, + { + "epoch": 0.273584025380237, + "grad_norm": 1.1560913002777857, + "learning_rate": 0.0002998343814716668, + "loss": 6.3372, + "step": 2932 + }, + { + "epoch": 0.2736773350751143, + "grad_norm": 1.756165695054389, + "learning_rate": 0.0002998340259420237, + "loss": 6.0746, + "step": 2933 + }, + { + "epoch": 0.2737706447699916, + "grad_norm": 1.266560564422136, + "learning_rate": 0.00029983367003139707, + "loss": 6.4185, + "step": 2934 + }, + { + "epoch": 0.2738639544648689, + "grad_norm": 1.021678142647575, + "learning_rate": 0.0002998333137397878, + "loss": 5.961, + "step": 2935 + }, + { + "epoch": 0.2739572641597462, + "grad_norm": 1.0523730967412759, + "learning_rate": 0.00029983295706719675, + "loss": 5.8644, + "step": 2936 + }, + { + "epoch": 0.2740505738546235, + "grad_norm": 1.3959491094089762, + "learning_rate": 0.0002998326000136249, + "loss": 6.3192, + "step": 2937 + }, + { + "epoch": 0.2741438835495008, + "grad_norm": 1.2944631422180497, + "learning_rate": 0.00029983224257907313, + "loss": 6.2921, + "step": 2938 + }, + { + "epoch": 0.27423719324437806, + "grad_norm": 1.0869739946279957, + "learning_rate": 0.00029983188476354235, + "loss": 6.1003, + "step": 2939 + }, + { + "epoch": 0.27433050293925537, + "grad_norm": 1.0153355273415228, + "learning_rate": 0.0002998315265670335, + "loss": 6.2195, + "step": 2940 + }, + { + "epoch": 0.2744238126341327, + "grad_norm": 1.335507944449289, + "learning_rate": 0.0002998311679895474, + "loss": 5.7326, + "step": 2941 + }, + { + "epoch": 0.27451712232901, + "grad_norm": 1.3707972880662636, + "learning_rate": 0.000299830809031085, + "loss": 6.609, + "step": 2942 + }, + { + "epoch": 0.2746104320238873, + "grad_norm": 1.233503282737017, + "learning_rate": 0.00029983044969164737, + "loss": 6.269, + "step": 2943 + }, + { + "epoch": 0.2747037417187646, + "grad_norm": 1.061922994620398, + "learning_rate": 0.00029983008997123517, + "loss": 6.1291, + "step": 2944 + }, + { + "epoch": 0.27479705141364186, + "grad_norm": 1.2178713008393542, + "learning_rate": 0.0002998297298698495, + "loss": 5.588, + "step": 2945 + }, + { + "epoch": 0.27489036110851917, + "grad_norm": 1.7553175303248525, + "learning_rate": 0.0002998293693874912, + "loss": 5.9039, + "step": 2946 + }, + { + "epoch": 0.2749836708033965, + "grad_norm": 0.9723744163609467, + "learning_rate": 0.0002998290085241612, + "loss": 5.7118, + "step": 2947 + }, + { + "epoch": 0.2750769804982738, + "grad_norm": 0.9787471008844737, + "learning_rate": 0.0002998286472798604, + "loss": 5.735, + "step": 2948 + }, + { + "epoch": 0.2751702901931511, + "grad_norm": 1.5901356746697834, + "learning_rate": 0.0002998282856545897, + "loss": 6.0246, + "step": 2949 + }, + { + "epoch": 0.27526359988802834, + "grad_norm": 1.535031078527316, + "learning_rate": 0.0002998279236483501, + "loss": 6.1124, + "step": 2950 + }, + { + "epoch": 0.27535690958290565, + "grad_norm": 1.2822787637840218, + "learning_rate": 0.00029982756126114245, + "loss": 5.8884, + "step": 2951 + }, + { + "epoch": 0.27545021927778296, + "grad_norm": 1.5977394249314394, + "learning_rate": 0.0002998271984929677, + "loss": 6.2037, + "step": 2952 + }, + { + "epoch": 0.27554352897266027, + "grad_norm": 1.37157100122592, + "learning_rate": 0.00029982683534382677, + "loss": 5.2745, + "step": 2953 + }, + { + "epoch": 0.2756368386675376, + "grad_norm": 1.4006467862130458, + "learning_rate": 0.0002998264718137206, + "loss": 6.1828, + "step": 2954 + }, + { + "epoch": 0.27573014836241483, + "grad_norm": 1.0877709032179919, + "learning_rate": 0.00029982610790265003, + "loss": 6.0217, + "step": 2955 + }, + { + "epoch": 0.27582345805729214, + "grad_norm": 0.9808594330932818, + "learning_rate": 0.0002998257436106161, + "loss": 6.0512, + "step": 2956 + }, + { + "epoch": 0.27591676775216945, + "grad_norm": 1.2487696292204982, + "learning_rate": 0.00029982537893761966, + "loss": 6.2564, + "step": 2957 + }, + { + "epoch": 0.27601007744704675, + "grad_norm": 1.2459195008638408, + "learning_rate": 0.0002998250138836617, + "loss": 6.3074, + "step": 2958 + }, + { + "epoch": 0.27610338714192406, + "grad_norm": 1.126271858072282, + "learning_rate": 0.0002998246484487431, + "loss": 6.4608, + "step": 2959 + }, + { + "epoch": 0.27619669683680137, + "grad_norm": 1.0791564262009685, + "learning_rate": 0.0002998242826328648, + "loss": 6.1133, + "step": 2960 + }, + { + "epoch": 0.2762900065316786, + "grad_norm": 0.9861183586679917, + "learning_rate": 0.00029982391643602766, + "loss": 5.9086, + "step": 2961 + }, + { + "epoch": 0.27638331622655593, + "grad_norm": 1.0745943677819911, + "learning_rate": 0.00029982354985823274, + "loss": 6.039, + "step": 2962 + }, + { + "epoch": 0.27647662592143324, + "grad_norm": 0.9964070686184006, + "learning_rate": 0.0002998231828994809, + "loss": 6.1078, + "step": 2963 + }, + { + "epoch": 0.27656993561631055, + "grad_norm": 1.310020553933076, + "learning_rate": 0.00029982281555977313, + "loss": 5.3433, + "step": 2964 + }, + { + "epoch": 0.27666324531118786, + "grad_norm": 1.586719395261847, + "learning_rate": 0.0002998224478391103, + "loss": 5.9382, + "step": 2965 + }, + { + "epoch": 0.2767565550060651, + "grad_norm": 1.3061951669348533, + "learning_rate": 0.00029982207973749335, + "loss": 6.0939, + "step": 2966 + }, + { + "epoch": 0.2768498647009424, + "grad_norm": 1.1471162814281965, + "learning_rate": 0.0002998217112549232, + "loss": 5.731, + "step": 2967 + }, + { + "epoch": 0.2769431743958197, + "grad_norm": 1.3275580603303208, + "learning_rate": 0.0002998213423914009, + "loss": 6.2014, + "step": 2968 + }, + { + "epoch": 0.27703648409069703, + "grad_norm": 0.9805465298683034, + "learning_rate": 0.00029982097314692726, + "loss": 5.7225, + "step": 2969 + }, + { + "epoch": 0.27712979378557434, + "grad_norm": 1.0322530750009677, + "learning_rate": 0.0002998206035215033, + "loss": 5.8716, + "step": 2970 + }, + { + "epoch": 0.2772231034804516, + "grad_norm": 1.066057249679239, + "learning_rate": 0.0002998202335151299, + "loss": 6.0163, + "step": 2971 + }, + { + "epoch": 0.2773164131753289, + "grad_norm": 0.988383828272117, + "learning_rate": 0.000299819863127808, + "loss": 5.4573, + "step": 2972 + }, + { + "epoch": 0.2774097228702062, + "grad_norm": 1.069587193461704, + "learning_rate": 0.0002998194923595386, + "loss": 5.6896, + "step": 2973 + }, + { + "epoch": 0.2775030325650835, + "grad_norm": 1.0967343034822306, + "learning_rate": 0.0002998191212103226, + "loss": 6.0005, + "step": 2974 + }, + { + "epoch": 0.2775963422599608, + "grad_norm": 1.1608564487810549, + "learning_rate": 0.0002998187496801609, + "loss": 6.3783, + "step": 2975 + }, + { + "epoch": 0.27768965195483813, + "grad_norm": 1.0187806685406544, + "learning_rate": 0.00029981837776905457, + "loss": 5.5864, + "step": 2976 + }, + { + "epoch": 0.2777829616497154, + "grad_norm": 1.1646850475298953, + "learning_rate": 0.0002998180054770045, + "loss": 5.6472, + "step": 2977 + }, + { + "epoch": 0.2778762713445927, + "grad_norm": 1.2434932552752302, + "learning_rate": 0.0002998176328040116, + "loss": 5.7463, + "step": 2978 + }, + { + "epoch": 0.27796958103947, + "grad_norm": 1.4746333685620834, + "learning_rate": 0.00029981725975007683, + "loss": 6.3641, + "step": 2979 + }, + { + "epoch": 0.2780628907343473, + "grad_norm": 1.261927763379834, + "learning_rate": 0.00029981688631520114, + "loss": 6.3668, + "step": 2980 + }, + { + "epoch": 0.2781562004292246, + "grad_norm": 1.014440599619093, + "learning_rate": 0.0002998165124993855, + "loss": 6.3564, + "step": 2981 + }, + { + "epoch": 0.2782495101241019, + "grad_norm": 1.0069155897332154, + "learning_rate": 0.0002998161383026308, + "loss": 5.7403, + "step": 2982 + }, + { + "epoch": 0.2783428198189792, + "grad_norm": 1.2562767165616824, + "learning_rate": 0.0002998157637249381, + "loss": 5.6512, + "step": 2983 + }, + { + "epoch": 0.2784361295138565, + "grad_norm": 0.9974340116135324, + "learning_rate": 0.00029981538876630825, + "loss": 6.1222, + "step": 2984 + }, + { + "epoch": 0.2785294392087338, + "grad_norm": 1.010336403720735, + "learning_rate": 0.0002998150134267423, + "loss": 5.8248, + "step": 2985 + }, + { + "epoch": 0.2786227489036111, + "grad_norm": 12.065763526438227, + "learning_rate": 0.0002998146377062411, + "loss": 5.9042, + "step": 2986 + }, + { + "epoch": 0.27871605859848836, + "grad_norm": 1.1461736260510205, + "learning_rate": 0.00029981426160480566, + "loss": 6.1959, + "step": 2987 + }, + { + "epoch": 0.27880936829336567, + "grad_norm": 29.333035237527852, + "learning_rate": 0.00029981388512243694, + "loss": 5.8688, + "step": 2988 + }, + { + "epoch": 0.278902677988243, + "grad_norm": 2.4052941767379674, + "learning_rate": 0.0002998135082591359, + "loss": 6.1481, + "step": 2989 + }, + { + "epoch": 0.2789959876831203, + "grad_norm": 2.235736018496747, + "learning_rate": 0.00029981313101490347, + "loss": 6.6893, + "step": 2990 + }, + { + "epoch": 0.2790892973779976, + "grad_norm": 2.966499825224412, + "learning_rate": 0.0002998127533897406, + "loss": 6.3988, + "step": 2991 + }, + { + "epoch": 0.27918260707287484, + "grad_norm": 1.979755933670715, + "learning_rate": 0.00029981237538364837, + "loss": 6.7602, + "step": 2992 + }, + { + "epoch": 0.27927591676775215, + "grad_norm": 903.861997973456, + "learning_rate": 0.0002998119969966276, + "loss": 6.2522, + "step": 2993 + }, + { + "epoch": 0.27936922646262946, + "grad_norm": 2.1655992385981127, + "learning_rate": 0.0002998116182286793, + "loss": 6.7235, + "step": 2994 + }, + { + "epoch": 0.27946253615750677, + "grad_norm": 4.693890135461939, + "learning_rate": 0.0002998112390798044, + "loss": 6.9832, + "step": 2995 + }, + { + "epoch": 0.2795558458523841, + "grad_norm": 4.267954119690116, + "learning_rate": 0.0002998108595500039, + "loss": 6.8969, + "step": 2996 + }, + { + "epoch": 0.2796491555472614, + "grad_norm": 2.675015302382177, + "learning_rate": 0.0002998104796392788, + "loss": 6.8901, + "step": 2997 + }, + { + "epoch": 0.27974246524213864, + "grad_norm": 2.23788872377728, + "learning_rate": 0.00029981009934762997, + "loss": 6.9205, + "step": 2998 + }, + { + "epoch": 0.27983577493701595, + "grad_norm": 2.810449766144606, + "learning_rate": 0.00029980971867505843, + "loss": 6.9626, + "step": 2999 + }, + { + "epoch": 0.27992908463189325, + "grad_norm": 4.334376805717342, + "learning_rate": 0.0002998093376215652, + "loss": 7.1596, + "step": 3000 + }, + { + "epoch": 0.28002239432677056, + "grad_norm": 2.476952136878105, + "learning_rate": 0.0002998089561871512, + "loss": 7.0791, + "step": 3001 + }, + { + "epoch": 0.28011570402164787, + "grad_norm": 2.9609204201532404, + "learning_rate": 0.0002998085743718174, + "loss": 7.0238, + "step": 3002 + }, + { + "epoch": 0.2802090137165251, + "grad_norm": 3.5015752108761764, + "learning_rate": 0.0002998081921755648, + "loss": 6.7914, + "step": 3003 + }, + { + "epoch": 0.28030232341140243, + "grad_norm": 3.3280143032137848, + "learning_rate": 0.00029980780959839426, + "loss": 7.0227, + "step": 3004 + }, + { + "epoch": 0.28039563310627974, + "grad_norm": 2.927797716684128, + "learning_rate": 0.0002998074266403069, + "loss": 7.1933, + "step": 3005 + }, + { + "epoch": 0.28048894280115705, + "grad_norm": 10.570884364013287, + "learning_rate": 0.0002998070433013036, + "loss": 6.9237, + "step": 3006 + }, + { + "epoch": 0.28058225249603436, + "grad_norm": 2.2578687647856373, + "learning_rate": 0.0002998066595813854, + "loss": 6.7161, + "step": 3007 + }, + { + "epoch": 0.2806755621909116, + "grad_norm": 5.773694380278683, + "learning_rate": 0.0002998062754805532, + "loss": 6.9494, + "step": 3008 + }, + { + "epoch": 0.2807688718857889, + "grad_norm": 5.080194720978736, + "learning_rate": 0.00029980589099880804, + "loss": 7.0836, + "step": 3009 + }, + { + "epoch": 0.2808621815806662, + "grad_norm": 3.456986971595064, + "learning_rate": 0.0002998055061361509, + "loss": 7.0437, + "step": 3010 + }, + { + "epoch": 0.28095549127554353, + "grad_norm": 1.8223493082133797, + "learning_rate": 0.00029980512089258267, + "loss": 7.0224, + "step": 3011 + }, + { + "epoch": 0.28104880097042084, + "grad_norm": 2.4260218147983, + "learning_rate": 0.0002998047352681044, + "loss": 7.1712, + "step": 3012 + }, + { + "epoch": 0.28114211066529815, + "grad_norm": 1.8025695078752935, + "learning_rate": 0.00029980434926271714, + "loss": 6.5315, + "step": 3013 + }, + { + "epoch": 0.2812354203601754, + "grad_norm": 1.7628476421151378, + "learning_rate": 0.00029980396287642173, + "loss": 6.6727, + "step": 3014 + }, + { + "epoch": 0.2813287300550527, + "grad_norm": 1.6070709728077972, + "learning_rate": 0.00029980357610921927, + "loss": 6.7767, + "step": 3015 + }, + { + "epoch": 0.28142203974993, + "grad_norm": 2.753210988780816, + "learning_rate": 0.00029980318896111066, + "loss": 6.976, + "step": 3016 + }, + { + "epoch": 0.28151534944480733, + "grad_norm": 1.520352288437766, + "learning_rate": 0.0002998028014320969, + "loss": 6.6753, + "step": 3017 + }, + { + "epoch": 0.28160865913968464, + "grad_norm": 5.690015047446787, + "learning_rate": 0.00029980241352217896, + "loss": 6.5954, + "step": 3018 + }, + { + "epoch": 0.2817019688345619, + "grad_norm": 1.9064460044503417, + "learning_rate": 0.00029980202523135796, + "loss": 6.7617, + "step": 3019 + }, + { + "epoch": 0.2817952785294392, + "grad_norm": 2.0948155960890746, + "learning_rate": 0.0002998016365596347, + "loss": 6.4138, + "step": 3020 + }, + { + "epoch": 0.2818885882243165, + "grad_norm": 1.6569190567511753, + "learning_rate": 0.00029980124750701026, + "loss": 6.4664, + "step": 3021 + }, + { + "epoch": 0.2819818979191938, + "grad_norm": 3.1383744130263835, + "learning_rate": 0.00029980085807348566, + "loss": 6.766, + "step": 3022 + }, + { + "epoch": 0.2820752076140711, + "grad_norm": 2.5418289512233296, + "learning_rate": 0.0002998004682590618, + "loss": 6.7509, + "step": 3023 + }, + { + "epoch": 0.2821685173089484, + "grad_norm": 146.0479876464063, + "learning_rate": 0.0002998000780637398, + "loss": 6.185, + "step": 3024 + }, + { + "epoch": 0.2822618270038257, + "grad_norm": 1.9538837203645707, + "learning_rate": 0.0002997996874875205, + "loss": 6.3865, + "step": 3025 + }, + { + "epoch": 0.282355136698703, + "grad_norm": 1.5957091276260218, + "learning_rate": 0.000299799296530405, + "loss": 6.6636, + "step": 3026 + }, + { + "epoch": 0.2824484463935803, + "grad_norm": 2.006288391981306, + "learning_rate": 0.00029979890519239423, + "loss": 6.3464, + "step": 3027 + }, + { + "epoch": 0.2825417560884576, + "grad_norm": 27.505513308251743, + "learning_rate": 0.00029979851347348925, + "loss": 6.5545, + "step": 3028 + }, + { + "epoch": 0.2826350657833349, + "grad_norm": 4.65438054733735, + "learning_rate": 0.00029979812137369104, + "loss": 6.5939, + "step": 3029 + }, + { + "epoch": 0.28272837547821217, + "grad_norm": 2.498220939682252, + "learning_rate": 0.0002997977288930006, + "loss": 6.5017, + "step": 3030 + }, + { + "epoch": 0.2828216851730895, + "grad_norm": 6.210074975273446, + "learning_rate": 0.00029979733603141887, + "loss": 6.8699, + "step": 3031 + }, + { + "epoch": 0.2829149948679668, + "grad_norm": 2.5121184764730318, + "learning_rate": 0.00029979694278894686, + "loss": 6.5465, + "step": 3032 + }, + { + "epoch": 0.2830083045628441, + "grad_norm": 3.3404237104607755, + "learning_rate": 0.00029979654916558563, + "loss": 6.6876, + "step": 3033 + }, + { + "epoch": 0.2831016142577214, + "grad_norm": 2.1278332642369593, + "learning_rate": 0.00029979615516133616, + "loss": 6.4891, + "step": 3034 + }, + { + "epoch": 0.28319492395259865, + "grad_norm": 1.424365550338529, + "learning_rate": 0.0002997957607761994, + "loss": 6.7632, + "step": 3035 + }, + { + "epoch": 0.28328823364747596, + "grad_norm": 2.0310394261106355, + "learning_rate": 0.0002997953660101765, + "loss": 6.3079, + "step": 3036 + }, + { + "epoch": 0.28338154334235327, + "grad_norm": 2.6615704871435355, + "learning_rate": 0.0002997949708632682, + "loss": 6.5032, + "step": 3037 + }, + { + "epoch": 0.2834748530372306, + "grad_norm": 2.13538589159806, + "learning_rate": 0.00029979457533547575, + "loss": 6.5136, + "step": 3038 + }, + { + "epoch": 0.2835681627321079, + "grad_norm": 2.877924441594578, + "learning_rate": 0.00029979417942680006, + "loss": 6.6011, + "step": 3039 + }, + { + "epoch": 0.28366147242698514, + "grad_norm": 1.0536234060874032, + "learning_rate": 0.00029979378313724214, + "loss": 5.975, + "step": 3040 + }, + { + "epoch": 0.28375478212186245, + "grad_norm": 1.33772766903319, + "learning_rate": 0.00029979338646680297, + "loss": 6.0831, + "step": 3041 + }, + { + "epoch": 0.28384809181673976, + "grad_norm": 2.279722121891094, + "learning_rate": 0.00029979298941548363, + "loss": 6.3553, + "step": 3042 + }, + { + "epoch": 0.28394140151161706, + "grad_norm": 1.3230681378429885, + "learning_rate": 0.0002997925919832851, + "loss": 5.8427, + "step": 3043 + }, + { + "epoch": 0.2840347112064944, + "grad_norm": 5.222997352568799, + "learning_rate": 0.0002997921941702084, + "loss": 6.0236, + "step": 3044 + }, + { + "epoch": 0.2841280209013716, + "grad_norm": 1.619802082516946, + "learning_rate": 0.00029979179597625446, + "loss": 6.1076, + "step": 3045 + }, + { + "epoch": 0.28422133059624893, + "grad_norm": 1.4408226683205405, + "learning_rate": 0.00029979139740142437, + "loss": 6.5339, + "step": 3046 + }, + { + "epoch": 0.28431464029112624, + "grad_norm": 1.7554693447595802, + "learning_rate": 0.00029979099844571914, + "loss": 6.3201, + "step": 3047 + }, + { + "epoch": 0.28440794998600355, + "grad_norm": 7.561051865950881, + "learning_rate": 0.00029979059910913977, + "loss": 6.2125, + "step": 3048 + }, + { + "epoch": 0.28450125968088086, + "grad_norm": 4.7385011087863065, + "learning_rate": 0.00029979019939168727, + "loss": 6.1005, + "step": 3049 + }, + { + "epoch": 0.28459456937575817, + "grad_norm": 1.4408312189271058, + "learning_rate": 0.0002997897992933627, + "loss": 5.7291, + "step": 3050 + }, + { + "epoch": 0.2846878790706354, + "grad_norm": 1.7571700790715004, + "learning_rate": 0.000299789398814167, + "loss": 6.5619, + "step": 3051 + }, + { + "epoch": 0.2847811887655127, + "grad_norm": 1.5413245197131946, + "learning_rate": 0.0002997889979541012, + "loss": 6.5471, + "step": 3052 + }, + { + "epoch": 0.28487449846039004, + "grad_norm": 1.6372909599183294, + "learning_rate": 0.0002997885967131664, + "loss": 5.9619, + "step": 3053 + }, + { + "epoch": 0.28496780815526734, + "grad_norm": 1.3112332755946563, + "learning_rate": 0.0002997881950913636, + "loss": 6.072, + "step": 3054 + }, + { + "epoch": 0.28506111785014465, + "grad_norm": 2.5371144936870818, + "learning_rate": 0.0002997877930886937, + "loss": 5.5633, + "step": 3055 + }, + { + "epoch": 0.2851544275450219, + "grad_norm": 1.4837614375149895, + "learning_rate": 0.0002997873907051579, + "loss": 6.1262, + "step": 3056 + }, + { + "epoch": 0.2852477372398992, + "grad_norm": 2.1027973324345894, + "learning_rate": 0.0002997869879407571, + "loss": 6.3116, + "step": 3057 + }, + { + "epoch": 0.2853410469347765, + "grad_norm": 1.7569023463226934, + "learning_rate": 0.0002997865847954923, + "loss": 6.2284, + "step": 3058 + }, + { + "epoch": 0.28543435662965383, + "grad_norm": 5.02490482571683, + "learning_rate": 0.00029978618126936466, + "loss": 6.2897, + "step": 3059 + }, + { + "epoch": 0.28552766632453114, + "grad_norm": 1.2321866780408273, + "learning_rate": 0.00029978577736237516, + "loss": 6.078, + "step": 3060 + }, + { + "epoch": 0.2856209760194084, + "grad_norm": 1.4960722554195962, + "learning_rate": 0.00029978537307452473, + "loss": 5.9524, + "step": 3061 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 1.3203736802854635, + "learning_rate": 0.0002997849684058145, + "loss": 6.0844, + "step": 3062 + }, + { + "epoch": 0.285807595409163, + "grad_norm": 1.3324710297127873, + "learning_rate": 0.00029978456335624543, + "loss": 5.8693, + "step": 3063 + }, + { + "epoch": 0.2859009051040403, + "grad_norm": 1.0303378876352693, + "learning_rate": 0.0002997841579258186, + "loss": 6.2455, + "step": 3064 + }, + { + "epoch": 0.2859942147989176, + "grad_norm": 1.5601671454269512, + "learning_rate": 0.00029978375211453507, + "loss": 6.3854, + "step": 3065 + }, + { + "epoch": 0.28608752449379493, + "grad_norm": 1.0734208505602771, + "learning_rate": 0.0002997833459223958, + "loss": 6.3801, + "step": 3066 + }, + { + "epoch": 0.2861808341886722, + "grad_norm": 1.1222847326532506, + "learning_rate": 0.0002997829393494018, + "loss": 6.4871, + "step": 3067 + }, + { + "epoch": 0.2862741438835495, + "grad_norm": 0.9686976643160488, + "learning_rate": 0.0002997825323955542, + "loss": 6.3562, + "step": 3068 + }, + { + "epoch": 0.2863674535784268, + "grad_norm": 1.0677429369353606, + "learning_rate": 0.000299782125060854, + "loss": 6.4729, + "step": 3069 + }, + { + "epoch": 0.2864607632733041, + "grad_norm": 1.0862184579790575, + "learning_rate": 0.00029978171734530224, + "loss": 5.8338, + "step": 3070 + }, + { + "epoch": 0.2865540729681814, + "grad_norm": 1.1537556874116697, + "learning_rate": 0.00029978130924889987, + "loss": 6.1529, + "step": 3071 + }, + { + "epoch": 0.28664738266305867, + "grad_norm": 1.3123008916054835, + "learning_rate": 0.00029978090077164806, + "loss": 6.1648, + "step": 3072 + }, + { + "epoch": 0.286740692357936, + "grad_norm": 1.2334354647406487, + "learning_rate": 0.00029978049191354774, + "loss": 5.6443, + "step": 3073 + }, + { + "epoch": 0.2868340020528133, + "grad_norm": 1.0032431038747538, + "learning_rate": 0.00029978008267460006, + "loss": 6.238, + "step": 3074 + }, + { + "epoch": 0.2869273117476906, + "grad_norm": 0.9115175877572829, + "learning_rate": 0.00029977967305480597, + "loss": 6.3002, + "step": 3075 + }, + { + "epoch": 0.2870206214425679, + "grad_norm": 1.0002579057952343, + "learning_rate": 0.0002997792630541665, + "loss": 6.1658, + "step": 3076 + }, + { + "epoch": 0.28711393113744516, + "grad_norm": 1.0810047921037782, + "learning_rate": 0.0002997788526726828, + "loss": 6.2625, + "step": 3077 + }, + { + "epoch": 0.28720724083232246, + "grad_norm": 1.384113980958795, + "learning_rate": 0.0002997784419103558, + "loss": 6.3814, + "step": 3078 + }, + { + "epoch": 0.28730055052719977, + "grad_norm": 0.9254589308568401, + "learning_rate": 0.0002997780307671866, + "loss": 5.837, + "step": 3079 + }, + { + "epoch": 0.2873938602220771, + "grad_norm": 1.0311894726291506, + "learning_rate": 0.0002997776192431762, + "loss": 6.05, + "step": 3080 + }, + { + "epoch": 0.2874871699169544, + "grad_norm": 1.1770883667404881, + "learning_rate": 0.00029977720733832575, + "loss": 5.7965, + "step": 3081 + }, + { + "epoch": 0.2875804796118317, + "grad_norm": 1.022511368513701, + "learning_rate": 0.00029977679505263616, + "loss": 6.0148, + "step": 3082 + }, + { + "epoch": 0.28767378930670895, + "grad_norm": 1.0226500392812345, + "learning_rate": 0.00029977638238610864, + "loss": 5.5243, + "step": 3083 + }, + { + "epoch": 0.28776709900158626, + "grad_norm": 1.2582830438950183, + "learning_rate": 0.00029977596933874405, + "loss": 6.0366, + "step": 3084 + }, + { + "epoch": 0.28786040869646357, + "grad_norm": 1.023218402568481, + "learning_rate": 0.00029977555591054357, + "loss": 6.2212, + "step": 3085 + }, + { + "epoch": 0.2879537183913409, + "grad_norm": 0.9990317146877201, + "learning_rate": 0.00029977514210150825, + "loss": 6.0374, + "step": 3086 + }, + { + "epoch": 0.2880470280862182, + "grad_norm": 1.1528502121674618, + "learning_rate": 0.00029977472791163905, + "loss": 6.196, + "step": 3087 + }, + { + "epoch": 0.28814033778109543, + "grad_norm": 1.0234805588004834, + "learning_rate": 0.0002997743133409372, + "loss": 5.8234, + "step": 3088 + }, + { + "epoch": 0.28823364747597274, + "grad_norm": 0.9414490355952937, + "learning_rate": 0.0002997738983894035, + "loss": 6.1548, + "step": 3089 + }, + { + "epoch": 0.28832695717085005, + "grad_norm": 1.5626548923753376, + "learning_rate": 0.00029977348305703917, + "loss": 6.1994, + "step": 3090 + }, + { + "epoch": 0.28842026686572736, + "grad_norm": 1.0188958257673408, + "learning_rate": 0.0002997730673438453, + "loss": 5.7647, + "step": 3091 + }, + { + "epoch": 0.28851357656060467, + "grad_norm": 0.984705736938159, + "learning_rate": 0.00029977265124982286, + "loss": 5.7733, + "step": 3092 + }, + { + "epoch": 0.2886068862554819, + "grad_norm": 0.9559140215444275, + "learning_rate": 0.0002997722347749729, + "loss": 6.0828, + "step": 3093 + }, + { + "epoch": 0.28870019595035923, + "grad_norm": 1.1088593515446683, + "learning_rate": 0.0002997718179192966, + "loss": 6.4061, + "step": 3094 + }, + { + "epoch": 0.28879350564523654, + "grad_norm": 0.942385171050877, + "learning_rate": 0.00029977140068279483, + "loss": 6.1389, + "step": 3095 + }, + { + "epoch": 0.28888681534011384, + "grad_norm": 1.2191093278962641, + "learning_rate": 0.0002997709830654688, + "loss": 5.8278, + "step": 3096 + }, + { + "epoch": 0.28898012503499115, + "grad_norm": 1.0067430867305613, + "learning_rate": 0.0002997705650673195, + "loss": 6.2781, + "step": 3097 + }, + { + "epoch": 0.2890734347298684, + "grad_norm": 0.9939855724559198, + "learning_rate": 0.0002997701466883481, + "loss": 6.2024, + "step": 3098 + }, + { + "epoch": 0.2891667444247457, + "grad_norm": 1.36097127509377, + "learning_rate": 0.0002997697279285555, + "loss": 5.8543, + "step": 3099 + }, + { + "epoch": 0.289260054119623, + "grad_norm": 1.7298825261582613, + "learning_rate": 0.00029976930878794283, + "loss": 6.0305, + "step": 3100 + }, + { + "epoch": 0.28935336381450033, + "grad_norm": 1.1019481506662627, + "learning_rate": 0.00029976888926651124, + "loss": 5.9564, + "step": 3101 + }, + { + "epoch": 0.28944667350937764, + "grad_norm": 1.391890051386765, + "learning_rate": 0.0002997684693642617, + "loss": 6.5374, + "step": 3102 + }, + { + "epoch": 0.28953998320425495, + "grad_norm": 1.1568426541169439, + "learning_rate": 0.0002997680490811953, + "loss": 5.5979, + "step": 3103 + }, + { + "epoch": 0.2896332928991322, + "grad_norm": 1.1231693991938796, + "learning_rate": 0.0002997676284173131, + "loss": 6.1317, + "step": 3104 + }, + { + "epoch": 0.2897266025940095, + "grad_norm": 1.0479260914541757, + "learning_rate": 0.00029976720737261625, + "loss": 5.8167, + "step": 3105 + }, + { + "epoch": 0.2898199122888868, + "grad_norm": 1.6063610354347722, + "learning_rate": 0.0002997667859471057, + "loss": 6.4426, + "step": 3106 + }, + { + "epoch": 0.2899132219837641, + "grad_norm": 1.181480862501627, + "learning_rate": 0.0002997663641407826, + "loss": 5.4421, + "step": 3107 + }, + { + "epoch": 0.29000653167864143, + "grad_norm": 1.462640873637773, + "learning_rate": 0.00029976594195364796, + "loss": 5.7576, + "step": 3108 + }, + { + "epoch": 0.2900998413735187, + "grad_norm": 1.8654955356614409, + "learning_rate": 0.00029976551938570297, + "loss": 6.1228, + "step": 3109 + }, + { + "epoch": 0.290193151068396, + "grad_norm": 1.3004875505807973, + "learning_rate": 0.00029976509643694856, + "loss": 5.8542, + "step": 3110 + }, + { + "epoch": 0.2902864607632733, + "grad_norm": 1.4311453462344779, + "learning_rate": 0.0002997646731073859, + "loss": 5.9579, + "step": 3111 + }, + { + "epoch": 0.2903797704581506, + "grad_norm": 1.6124978082629642, + "learning_rate": 0.00029976424939701606, + "loss": 6.0068, + "step": 3112 + }, + { + "epoch": 0.2904730801530279, + "grad_norm": 1.5577234289179438, + "learning_rate": 0.00029976382530584004, + "loss": 6.0471, + "step": 3113 + }, + { + "epoch": 0.29056638984790517, + "grad_norm": 1.0808171854343693, + "learning_rate": 0.00029976340083385903, + "loss": 5.9043, + "step": 3114 + }, + { + "epoch": 0.2906596995427825, + "grad_norm": 1.3618458923898213, + "learning_rate": 0.00029976297598107403, + "loss": 5.7494, + "step": 3115 + }, + { + "epoch": 0.2907530092376598, + "grad_norm": 1.0483029659750025, + "learning_rate": 0.00029976255074748616, + "loss": 6.0754, + "step": 3116 + }, + { + "epoch": 0.2908463189325371, + "grad_norm": 0.9463922110948506, + "learning_rate": 0.00029976212513309645, + "loss": 5.8943, + "step": 3117 + }, + { + "epoch": 0.2909396286274144, + "grad_norm": 1.2667821583318135, + "learning_rate": 0.00029976169913790605, + "loss": 5.8409, + "step": 3118 + }, + { + "epoch": 0.2910329383222917, + "grad_norm": 0.973646501613584, + "learning_rate": 0.00029976127276191603, + "loss": 6.0423, + "step": 3119 + }, + { + "epoch": 0.29112624801716896, + "grad_norm": 1.5989165198823947, + "learning_rate": 0.00029976084600512743, + "loss": 5.9579, + "step": 3120 + }, + { + "epoch": 0.2912195577120463, + "grad_norm": 1.388628869568069, + "learning_rate": 0.0002997604188675414, + "loss": 6.1473, + "step": 3121 + }, + { + "epoch": 0.2913128674069236, + "grad_norm": 1.0161938884982709, + "learning_rate": 0.00029975999134915893, + "loss": 5.5891, + "step": 3122 + }, + { + "epoch": 0.2914061771018009, + "grad_norm": 1.0742330984327655, + "learning_rate": 0.00029975956344998114, + "loss": 6.1415, + "step": 3123 + }, + { + "epoch": 0.2914994867966782, + "grad_norm": 1.0481587928691647, + "learning_rate": 0.0002997591351700092, + "loss": 6.0378, + "step": 3124 + }, + { + "epoch": 0.29159279649155545, + "grad_norm": 0.9864797956277496, + "learning_rate": 0.0002997587065092442, + "loss": 5.8193, + "step": 3125 + }, + { + "epoch": 0.29168610618643276, + "grad_norm": 0.9677719622695482, + "learning_rate": 0.000299758277467687, + "loss": 6.1605, + "step": 3126 + }, + { + "epoch": 0.29177941588131007, + "grad_norm": 1.0850475223265919, + "learning_rate": 0.000299757848045339, + "loss": 6.1749, + "step": 3127 + }, + { + "epoch": 0.2918727255761874, + "grad_norm": 1.0239047149552327, + "learning_rate": 0.0002997574182422011, + "loss": 6.2071, + "step": 3128 + }, + { + "epoch": 0.2919660352710647, + "grad_norm": 1.2804426472845087, + "learning_rate": 0.0002997569880582745, + "loss": 6.0231, + "step": 3129 + }, + { + "epoch": 0.29205934496594194, + "grad_norm": 1.0638000517052468, + "learning_rate": 0.0002997565574935602, + "loss": 5.9875, + "step": 3130 + }, + { + "epoch": 0.29215265466081924, + "grad_norm": 0.9670198509119878, + "learning_rate": 0.00029975612654805934, + "loss": 6.134, + "step": 3131 + }, + { + "epoch": 0.29224596435569655, + "grad_norm": 0.9478114392274788, + "learning_rate": 0.00029975569522177296, + "loss": 6.4439, + "step": 3132 + }, + { + "epoch": 0.29233927405057386, + "grad_norm": 0.9797189523458464, + "learning_rate": 0.0002997552635147023, + "loss": 6.0964, + "step": 3133 + }, + { + "epoch": 0.29243258374545117, + "grad_norm": 1.078116062672675, + "learning_rate": 0.0002997548314268483, + "loss": 5.962, + "step": 3134 + }, + { + "epoch": 0.2925258934403285, + "grad_norm": 1.0117925650365498, + "learning_rate": 0.0002997543989582122, + "loss": 5.9342, + "step": 3135 + }, + { + "epoch": 0.29261920313520573, + "grad_norm": 1.0830485324820873, + "learning_rate": 0.00029975396610879495, + "loss": 6.1002, + "step": 3136 + }, + { + "epoch": 0.29271251283008304, + "grad_norm": 1.2651897294351242, + "learning_rate": 0.00029975353287859773, + "loss": 5.9982, + "step": 3137 + }, + { + "epoch": 0.29280582252496035, + "grad_norm": 1.0561365906727913, + "learning_rate": 0.00029975309926762167, + "loss": 5.534, + "step": 3138 + }, + { + "epoch": 0.29289913221983765, + "grad_norm": 1.0610888235115314, + "learning_rate": 0.0002997526652758678, + "loss": 5.7583, + "step": 3139 + }, + { + "epoch": 0.29299244191471496, + "grad_norm": 1.0107525480042392, + "learning_rate": 0.00029975223090333733, + "loss": 5.6779, + "step": 3140 + }, + { + "epoch": 0.2930857516095922, + "grad_norm": 1.0583373880167823, + "learning_rate": 0.0002997517961500312, + "loss": 5.9664, + "step": 3141 + }, + { + "epoch": 0.2931790613044695, + "grad_norm": 1.134560403281879, + "learning_rate": 0.00029975136101595065, + "loss": 5.8839, + "step": 3142 + }, + { + "epoch": 0.29327237099934683, + "grad_norm": 1.0028114428183008, + "learning_rate": 0.0002997509255010968, + "loss": 6.1854, + "step": 3143 + }, + { + "epoch": 0.29336568069422414, + "grad_norm": 1.197686527053807, + "learning_rate": 0.0002997504896054707, + "loss": 5.7891, + "step": 3144 + }, + { + "epoch": 0.29345899038910145, + "grad_norm": 1.910203541676267, + "learning_rate": 0.0002997500533290734, + "loss": 5.1001, + "step": 3145 + }, + { + "epoch": 0.2935523000839787, + "grad_norm": 1.1412958899604442, + "learning_rate": 0.0002997496166719061, + "loss": 6.0505, + "step": 3146 + }, + { + "epoch": 0.293645609778856, + "grad_norm": 1.0168554593253927, + "learning_rate": 0.0002997491796339699, + "loss": 5.9226, + "step": 3147 + }, + { + "epoch": 0.2937389194737333, + "grad_norm": 1.2001153156734803, + "learning_rate": 0.0002997487422152659, + "loss": 6.3222, + "step": 3148 + }, + { + "epoch": 0.2938322291686106, + "grad_norm": 1.6036718245216812, + "learning_rate": 0.00029974830441579516, + "loss": 5.0086, + "step": 3149 + }, + { + "epoch": 0.29392553886348793, + "grad_norm": 1.0339239458689427, + "learning_rate": 0.0002997478662355589, + "loss": 6.2541, + "step": 3150 + }, + { + "epoch": 0.2940188485583652, + "grad_norm": 1.2903130988929865, + "learning_rate": 0.00029974742767455815, + "loss": 6.0001, + "step": 3151 + }, + { + "epoch": 0.2941121582532425, + "grad_norm": 1.5457234855875444, + "learning_rate": 0.00029974698873279403, + "loss": 5.7386, + "step": 3152 + }, + { + "epoch": 0.2942054679481198, + "grad_norm": 1.1550519332228009, + "learning_rate": 0.0002997465494102677, + "loss": 5.8037, + "step": 3153 + }, + { + "epoch": 0.2942987776429971, + "grad_norm": 1.058457549278051, + "learning_rate": 0.00029974610970698025, + "loss": 5.7375, + "step": 3154 + }, + { + "epoch": 0.2943920873378744, + "grad_norm": 1.6380482154046687, + "learning_rate": 0.0002997456696229328, + "loss": 5.8099, + "step": 3155 + }, + { + "epoch": 0.29448539703275173, + "grad_norm": 1.1789650642716318, + "learning_rate": 0.0002997452291581264, + "loss": 5.9294, + "step": 3156 + }, + { + "epoch": 0.294578706727629, + "grad_norm": 7.841474657447209, + "learning_rate": 0.00029974478831256237, + "loss": 5.6985, + "step": 3157 + }, + { + "epoch": 0.2946720164225063, + "grad_norm": 1.043811040316713, + "learning_rate": 0.00029974434708624165, + "loss": 6.1314, + "step": 3158 + }, + { + "epoch": 0.2947653261173836, + "grad_norm": 6.719301058203395, + "learning_rate": 0.0002997439054791654, + "loss": 6.2772, + "step": 3159 + }, + { + "epoch": 0.2948586358122609, + "grad_norm": 1.7397708883270946, + "learning_rate": 0.0002997434634913347, + "loss": 6.142, + "step": 3160 + }, + { + "epoch": 0.2949519455071382, + "grad_norm": 45.37752448548843, + "learning_rate": 0.00029974302112275084, + "loss": 6.1652, + "step": 3161 + }, + { + "epoch": 0.29504525520201547, + "grad_norm": 2.627530628715651, + "learning_rate": 0.0002997425783734148, + "loss": 6.0051, + "step": 3162 + }, + { + "epoch": 0.2951385648968928, + "grad_norm": 7.775140253982714, + "learning_rate": 0.0002997421352433277, + "loss": 6.2911, + "step": 3163 + }, + { + "epoch": 0.2952318745917701, + "grad_norm": 2.326937844651894, + "learning_rate": 0.00029974169173249075, + "loss": 6.1925, + "step": 3164 + }, + { + "epoch": 0.2953251842866474, + "grad_norm": 1.946507432032608, + "learning_rate": 0.000299741247840905, + "loss": 6.5434, + "step": 3165 + }, + { + "epoch": 0.2954184939815247, + "grad_norm": 6.074919151589873, + "learning_rate": 0.0002997408035685716, + "loss": 6.2592, + "step": 3166 + }, + { + "epoch": 0.29551180367640195, + "grad_norm": 5.082371297986588, + "learning_rate": 0.00029974035891549173, + "loss": 6.235, + "step": 3167 + }, + { + "epoch": 0.29560511337127926, + "grad_norm": 2.887404242885888, + "learning_rate": 0.00029973991388166645, + "loss": 6.416, + "step": 3168 + }, + { + "epoch": 0.29569842306615657, + "grad_norm": 1.670618568892004, + "learning_rate": 0.00029973946846709697, + "loss": 6.0972, + "step": 3169 + }, + { + "epoch": 0.2957917327610339, + "grad_norm": 2.858509761503505, + "learning_rate": 0.00029973902267178436, + "loss": 6.0793, + "step": 3170 + }, + { + "epoch": 0.2958850424559112, + "grad_norm": 1.5365981176953813, + "learning_rate": 0.00029973857649572976, + "loss": 6.2905, + "step": 3171 + }, + { + "epoch": 0.2959783521507885, + "grad_norm": 1.5651193438404707, + "learning_rate": 0.0002997381299389343, + "loss": 6.2828, + "step": 3172 + }, + { + "epoch": 0.29607166184566575, + "grad_norm": 1.5474214484479958, + "learning_rate": 0.0002997376830013992, + "loss": 5.8351, + "step": 3173 + }, + { + "epoch": 0.29616497154054305, + "grad_norm": 1.5676964718290538, + "learning_rate": 0.0002997372356831255, + "loss": 6.1219, + "step": 3174 + }, + { + "epoch": 0.29625828123542036, + "grad_norm": 1.9926004857557336, + "learning_rate": 0.0002997367879841143, + "loss": 6.2821, + "step": 3175 + }, + { + "epoch": 0.29635159093029767, + "grad_norm": 1.7239803605994788, + "learning_rate": 0.00029973633990436685, + "loss": 6.1629, + "step": 3176 + }, + { + "epoch": 0.296444900625175, + "grad_norm": 1.379257245632112, + "learning_rate": 0.0002997358914438843, + "loss": 6.3481, + "step": 3177 + }, + { + "epoch": 0.29653821032005223, + "grad_norm": 1.904663580014301, + "learning_rate": 0.0002997354426026676, + "loss": 6.4285, + "step": 3178 + }, + { + "epoch": 0.29663152001492954, + "grad_norm": 1.5836317601415475, + "learning_rate": 0.00029973499338071814, + "loss": 6.0352, + "step": 3179 + }, + { + "epoch": 0.29672482970980685, + "grad_norm": 2.4652910342655643, + "learning_rate": 0.0002997345437780369, + "loss": 5.6865, + "step": 3180 + }, + { + "epoch": 0.29681813940468416, + "grad_norm": 1.9402329574513268, + "learning_rate": 0.0002997340937946251, + "loss": 6.2103, + "step": 3181 + }, + { + "epoch": 0.29691144909956146, + "grad_norm": 1.487404484284663, + "learning_rate": 0.0002997336434304838, + "loss": 6.1484, + "step": 3182 + }, + { + "epoch": 0.2970047587944387, + "grad_norm": 1.486051887654797, + "learning_rate": 0.00029973319268561426, + "loss": 5.9059, + "step": 3183 + }, + { + "epoch": 0.297098068489316, + "grad_norm": 1.3750011586377586, + "learning_rate": 0.00029973274156001753, + "loss": 6.094, + "step": 3184 + }, + { + "epoch": 0.29719137818419333, + "grad_norm": 1.3121646897070642, + "learning_rate": 0.0002997322900536948, + "loss": 6.1034, + "step": 3185 + }, + { + "epoch": 0.29728468787907064, + "grad_norm": 1.2537297826522953, + "learning_rate": 0.0002997318381666472, + "loss": 6.249, + "step": 3186 + }, + { + "epoch": 0.29737799757394795, + "grad_norm": 1.267512680120541, + "learning_rate": 0.0002997313858988759, + "loss": 5.9899, + "step": 3187 + }, + { + "epoch": 0.29747130726882526, + "grad_norm": 1.1952953108105333, + "learning_rate": 0.00029973093325038204, + "loss": 6.1028, + "step": 3188 + }, + { + "epoch": 0.2975646169637025, + "grad_norm": 1.054278964006496, + "learning_rate": 0.00029973048022116677, + "loss": 6.3102, + "step": 3189 + }, + { + "epoch": 0.2976579266585798, + "grad_norm": 1.2362638549898781, + "learning_rate": 0.00029973002681123125, + "loss": 6.1005, + "step": 3190 + }, + { + "epoch": 0.2977512363534571, + "grad_norm": 1.1357868837705505, + "learning_rate": 0.00029972957302057656, + "loss": 5.9229, + "step": 3191 + }, + { + "epoch": 0.29784454604833444, + "grad_norm": 1.1300152796953176, + "learning_rate": 0.000299729118849204, + "loss": 6.1301, + "step": 3192 + }, + { + "epoch": 0.29793785574321174, + "grad_norm": 1.0741610423061654, + "learning_rate": 0.00029972866429711463, + "loss": 5.9383, + "step": 3193 + }, + { + "epoch": 0.298031165438089, + "grad_norm": 1.2172900987034996, + "learning_rate": 0.0002997282093643096, + "loss": 6.0801, + "step": 3194 + }, + { + "epoch": 0.2981244751329663, + "grad_norm": 1.1765221251321216, + "learning_rate": 0.00029972775405079014, + "loss": 5.6044, + "step": 3195 + }, + { + "epoch": 0.2982177848278436, + "grad_norm": 1.2191125379794343, + "learning_rate": 0.0002997272983565573, + "loss": 5.6944, + "step": 3196 + }, + { + "epoch": 0.2983110945227209, + "grad_norm": 1.238989721560939, + "learning_rate": 0.0002997268422816123, + "loss": 5.4334, + "step": 3197 + }, + { + "epoch": 0.29840440421759823, + "grad_norm": 0.9856097339550298, + "learning_rate": 0.00029972638582595625, + "loss": 6.1798, + "step": 3198 + }, + { + "epoch": 0.2984977139124755, + "grad_norm": 1.8063817135710887, + "learning_rate": 0.0002997259289895904, + "loss": 5.8744, + "step": 3199 + }, + { + "epoch": 0.2985910236073528, + "grad_norm": 1.089408331688774, + "learning_rate": 0.0002997254717725159, + "loss": 6.0387, + "step": 3200 + }, + { + "epoch": 0.2986843333022301, + "grad_norm": 1.0390480594681977, + "learning_rate": 0.00029972501417473385, + "loss": 6.2263, + "step": 3201 + }, + { + "epoch": 0.2987776429971074, + "grad_norm": 1.1261889105069265, + "learning_rate": 0.0002997245561962454, + "loss": 5.6935, + "step": 3202 + }, + { + "epoch": 0.2988709526919847, + "grad_norm": 1.2964793162542319, + "learning_rate": 0.00029972409783705174, + "loss": 6.0471, + "step": 3203 + }, + { + "epoch": 0.29896426238686197, + "grad_norm": 1.1083534783847975, + "learning_rate": 0.0002997236390971541, + "loss": 5.7959, + "step": 3204 + }, + { + "epoch": 0.2990575720817393, + "grad_norm": 1.02242040063745, + "learning_rate": 0.00029972317997655363, + "loss": 6.2031, + "step": 3205 + }, + { + "epoch": 0.2991508817766166, + "grad_norm": 1.3669905461650715, + "learning_rate": 0.0002997227204752514, + "loss": 6.0668, + "step": 3206 + }, + { + "epoch": 0.2992441914714939, + "grad_norm": 1.0971358856500306, + "learning_rate": 0.0002997222605932487, + "loss": 6.0506, + "step": 3207 + }, + { + "epoch": 0.2993375011663712, + "grad_norm": 1.185582583411242, + "learning_rate": 0.0002997218003305466, + "loss": 5.777, + "step": 3208 + }, + { + "epoch": 0.2994308108612485, + "grad_norm": 1.036449727659758, + "learning_rate": 0.00029972133968714633, + "loss": 6.1207, + "step": 3209 + }, + { + "epoch": 0.29952412055612576, + "grad_norm": 0.9906432598134477, + "learning_rate": 0.00029972087866304907, + "loss": 5.9116, + "step": 3210 + }, + { + "epoch": 0.29961743025100307, + "grad_norm": 1.123217524481729, + "learning_rate": 0.0002997204172582559, + "loss": 5.5906, + "step": 3211 + }, + { + "epoch": 0.2997107399458804, + "grad_norm": 1.027265063049628, + "learning_rate": 0.0002997199554727681, + "loss": 5.9947, + "step": 3212 + }, + { + "epoch": 0.2998040496407577, + "grad_norm": 0.9581600524125455, + "learning_rate": 0.00029971949330658676, + "loss": 5.5472, + "step": 3213 + }, + { + "epoch": 0.299897359335635, + "grad_norm": 1.0798727417953264, + "learning_rate": 0.0002997190307597132, + "loss": 6.4225, + "step": 3214 + }, + { + "epoch": 0.29999066903051225, + "grad_norm": 2.522774025612745, + "learning_rate": 0.0002997185678321484, + "loss": 5.2443, + "step": 3215 + }, + { + "epoch": 0.30008397872538956, + "grad_norm": 0.9781099906832157, + "learning_rate": 0.00029971810452389364, + "loss": 6.0529, + "step": 3216 + }, + { + "epoch": 0.30017728842026686, + "grad_norm": 1.0677274610324856, + "learning_rate": 0.00029971764083495017, + "loss": 5.9359, + "step": 3217 + }, + { + "epoch": 0.30027059811514417, + "grad_norm": 2.3215204207691538, + "learning_rate": 0.000299717176765319, + "loss": 5.2013, + "step": 3218 + }, + { + "epoch": 0.3003639078100215, + "grad_norm": 1.0242711194101648, + "learning_rate": 0.0002997167123150014, + "loss": 5.8917, + "step": 3219 + }, + { + "epoch": 0.30045721750489873, + "grad_norm": 1.505751008551042, + "learning_rate": 0.00029971624748399863, + "loss": 6.1218, + "step": 3220 + }, + { + "epoch": 0.30055052719977604, + "grad_norm": 1.140023228674955, + "learning_rate": 0.0002997157822723117, + "loss": 6.1218, + "step": 3221 + }, + { + "epoch": 0.30064383689465335, + "grad_norm": 1.323146428499374, + "learning_rate": 0.00029971531667994197, + "loss": 5.9732, + "step": 3222 + }, + { + "epoch": 0.30073714658953066, + "grad_norm": 1.1453179889906553, + "learning_rate": 0.0002997148507068905, + "loss": 6.0806, + "step": 3223 + }, + { + "epoch": 0.30083045628440797, + "grad_norm": 2.051741685710033, + "learning_rate": 0.0002997143843531585, + "loss": 6.2385, + "step": 3224 + }, + { + "epoch": 0.3009237659792853, + "grad_norm": 1.7582217031004077, + "learning_rate": 0.00029971391761874716, + "loss": 5.9196, + "step": 3225 + }, + { + "epoch": 0.3010170756741625, + "grad_norm": 1.4524233466266538, + "learning_rate": 0.0002997134505036577, + "loss": 5.4673, + "step": 3226 + }, + { + "epoch": 0.30111038536903983, + "grad_norm": 1.278483539402082, + "learning_rate": 0.0002997129830078913, + "loss": 5.993, + "step": 3227 + }, + { + "epoch": 0.30120369506391714, + "grad_norm": 1.415353706349868, + "learning_rate": 0.0002997125151314491, + "loss": 5.6699, + "step": 3228 + }, + { + "epoch": 0.30129700475879445, + "grad_norm": 2.828052117744277, + "learning_rate": 0.0002997120468743323, + "loss": 4.9979, + "step": 3229 + }, + { + "epoch": 0.30139031445367176, + "grad_norm": 1.4456611148659986, + "learning_rate": 0.0002997115782365422, + "loss": 6.2647, + "step": 3230 + }, + { + "epoch": 0.301483624148549, + "grad_norm": 1.4616189459172229, + "learning_rate": 0.0002997111092180798, + "loss": 6.0851, + "step": 3231 + }, + { + "epoch": 0.3015769338434263, + "grad_norm": 1.2504446359117107, + "learning_rate": 0.0002997106398189465, + "loss": 5.8159, + "step": 3232 + }, + { + "epoch": 0.30167024353830363, + "grad_norm": 2.320201325934824, + "learning_rate": 0.0002997101700391433, + "loss": 6.0605, + "step": 3233 + }, + { + "epoch": 0.30176355323318094, + "grad_norm": 1.0654346865354103, + "learning_rate": 0.00029970969987867156, + "loss": 6.0635, + "step": 3234 + }, + { + "epoch": 0.30185686292805824, + "grad_norm": 1.0819804706458547, + "learning_rate": 0.0002997092293375323, + "loss": 5.8848, + "step": 3235 + }, + { + "epoch": 0.3019501726229355, + "grad_norm": 1.0471079967942205, + "learning_rate": 0.0002997087584157269, + "loss": 5.8262, + "step": 3236 + }, + { + "epoch": 0.3020434823178128, + "grad_norm": 1.551534419148612, + "learning_rate": 0.00029970828711325644, + "loss": 6.0408, + "step": 3237 + }, + { + "epoch": 0.3021367920126901, + "grad_norm": 1.0893587354166927, + "learning_rate": 0.0002997078154301222, + "loss": 5.9546, + "step": 3238 + }, + { + "epoch": 0.3022301017075674, + "grad_norm": 1.6828428303769245, + "learning_rate": 0.0002997073433663253, + "loss": 6.107, + "step": 3239 + }, + { + "epoch": 0.30232341140244473, + "grad_norm": 1.7540154801901502, + "learning_rate": 0.00029970687092186697, + "loss": 6.1866, + "step": 3240 + }, + { + "epoch": 0.30241672109732204, + "grad_norm": 3.4081510096596954, + "learning_rate": 0.0002997063980967484, + "loss": 5.2589, + "step": 3241 + }, + { + "epoch": 0.3025100307921993, + "grad_norm": 1.4269493944514409, + "learning_rate": 0.0002997059248909709, + "loss": 6.177, + "step": 3242 + }, + { + "epoch": 0.3026033404870766, + "grad_norm": 1.2692679826010747, + "learning_rate": 0.00029970545130453545, + "loss": 5.5725, + "step": 3243 + }, + { + "epoch": 0.3026966501819539, + "grad_norm": 1.3687736104401333, + "learning_rate": 0.0002997049773374434, + "loss": 6.2643, + "step": 3244 + }, + { + "epoch": 0.3027899598768312, + "grad_norm": 1.5860763220522227, + "learning_rate": 0.000299704502989696, + "loss": 6.1917, + "step": 3245 + }, + { + "epoch": 0.3028832695717085, + "grad_norm": 1.111303601638784, + "learning_rate": 0.0002997040282612944, + "loss": 6.1722, + "step": 3246 + }, + { + "epoch": 0.3029765792665858, + "grad_norm": 1.4835085014179987, + "learning_rate": 0.00029970355315223974, + "loss": 6.0695, + "step": 3247 + }, + { + "epoch": 0.3030698889614631, + "grad_norm": 1.6309230362441731, + "learning_rate": 0.0002997030776625333, + "loss": 6.3684, + "step": 3248 + }, + { + "epoch": 0.3031631986563404, + "grad_norm": 1.159157097906023, + "learning_rate": 0.0002997026017921763, + "loss": 6.0215, + "step": 3249 + }, + { + "epoch": 0.3032565083512177, + "grad_norm": 1.7395383411489487, + "learning_rate": 0.00029970212554116996, + "loss": 5.4724, + "step": 3250 + }, + { + "epoch": 0.303349818046095, + "grad_norm": 1.5836863954514857, + "learning_rate": 0.0002997016489095154, + "loss": 5.9785, + "step": 3251 + }, + { + "epoch": 0.30344312774097226, + "grad_norm": 1.8390882117336158, + "learning_rate": 0.00029970117189721395, + "loss": 6.229, + "step": 3252 + }, + { + "epoch": 0.30353643743584957, + "grad_norm": 1.9973100815353801, + "learning_rate": 0.0002997006945042667, + "loss": 6.0391, + "step": 3253 + }, + { + "epoch": 0.3036297471307269, + "grad_norm": 1.3864106307681072, + "learning_rate": 0.00029970021673067493, + "loss": 6.0431, + "step": 3254 + }, + { + "epoch": 0.3037230568256042, + "grad_norm": 1.177741973381195, + "learning_rate": 0.0002996997385764399, + "loss": 6.1696, + "step": 3255 + }, + { + "epoch": 0.3038163665204815, + "grad_norm": 2.128287924182368, + "learning_rate": 0.0002996992600415628, + "loss": 6.3584, + "step": 3256 + }, + { + "epoch": 0.30390967621535875, + "grad_norm": 1.8882035026763428, + "learning_rate": 0.00029969878112604475, + "loss": 6.1109, + "step": 3257 + }, + { + "epoch": 0.30400298591023606, + "grad_norm": 1.3571330164040205, + "learning_rate": 0.00029969830182988714, + "loss": 5.7943, + "step": 3258 + }, + { + "epoch": 0.30409629560511336, + "grad_norm": 1.2795534163241447, + "learning_rate": 0.00029969782215309097, + "loss": 6.2142, + "step": 3259 + }, + { + "epoch": 0.3041896052999907, + "grad_norm": 1.0568270324864326, + "learning_rate": 0.0002996973420956577, + "loss": 5.9419, + "step": 3260 + }, + { + "epoch": 0.304282914994868, + "grad_norm": 2.3311277480393287, + "learning_rate": 0.00029969686165758834, + "loss": 6.0637, + "step": 3261 + }, + { + "epoch": 0.3043762246897453, + "grad_norm": 2.314791485075328, + "learning_rate": 0.00029969638083888426, + "loss": 6.0582, + "step": 3262 + }, + { + "epoch": 0.30446953438462254, + "grad_norm": 1.4569300729583694, + "learning_rate": 0.0002996958996395466, + "loss": 6.1196, + "step": 3263 + }, + { + "epoch": 0.30456284407949985, + "grad_norm": 2.0339853483875596, + "learning_rate": 0.00029969541805957664, + "loss": 6.0783, + "step": 3264 + }, + { + "epoch": 0.30465615377437716, + "grad_norm": 2.922831120150641, + "learning_rate": 0.00029969493609897555, + "loss": 6.0612, + "step": 3265 + }, + { + "epoch": 0.30474946346925447, + "grad_norm": 1.1930238163202893, + "learning_rate": 0.00029969445375774463, + "loss": 6.2855, + "step": 3266 + }, + { + "epoch": 0.3048427731641318, + "grad_norm": 1.691695457082353, + "learning_rate": 0.000299693971035885, + "loss": 6.2976, + "step": 3267 + }, + { + "epoch": 0.30493608285900903, + "grad_norm": 1.1279010744118303, + "learning_rate": 0.0002996934879333979, + "loss": 5.3864, + "step": 3268 + }, + { + "epoch": 0.30502939255388634, + "grad_norm": 1.218028172457505, + "learning_rate": 0.0002996930044502847, + "loss": 5.7906, + "step": 3269 + }, + { + "epoch": 0.30512270224876364, + "grad_norm": 1.1135216490186592, + "learning_rate": 0.0002996925205865465, + "loss": 5.6358, + "step": 3270 + }, + { + "epoch": 0.30521601194364095, + "grad_norm": 1.434722664199669, + "learning_rate": 0.00029969203634218453, + "loss": 6.1609, + "step": 3271 + }, + { + "epoch": 0.30530932163851826, + "grad_norm": 1.3946643712559534, + "learning_rate": 0.00029969155171720007, + "loss": 6.0711, + "step": 3272 + }, + { + "epoch": 0.3054026313333955, + "grad_norm": 4.235116212261096, + "learning_rate": 0.00029969106671159435, + "loss": 5.9175, + "step": 3273 + }, + { + "epoch": 0.3054959410282728, + "grad_norm": 6.377848586505673, + "learning_rate": 0.00029969058132536857, + "loss": 5.9043, + "step": 3274 + }, + { + "epoch": 0.30558925072315013, + "grad_norm": 11.44149369466534, + "learning_rate": 0.000299690095558524, + "loss": 6.5598, + "step": 3275 + }, + { + "epoch": 0.30568256041802744, + "grad_norm": 57.218889789688326, + "learning_rate": 0.0002996896094110619, + "loss": 6.098, + "step": 3276 + }, + { + "epoch": 0.30577587011290475, + "grad_norm": 2.7415649833180264, + "learning_rate": 0.00029968912288298337, + "loss": 6.178, + "step": 3277 + }, + { + "epoch": 0.30586917980778205, + "grad_norm": 1.6765555714420188, + "learning_rate": 0.0002996886359742898, + "loss": 6.5129, + "step": 3278 + }, + { + "epoch": 0.3059624895026593, + "grad_norm": 2.2098183675520477, + "learning_rate": 0.0002996881486849824, + "loss": 6.8652, + "step": 3279 + }, + { + "epoch": 0.3060557991975366, + "grad_norm": 2.5001918027173784, + "learning_rate": 0.0002996876610150623, + "loss": 6.5742, + "step": 3280 + }, + { + "epoch": 0.3061491088924139, + "grad_norm": 3.535659453607676, + "learning_rate": 0.00029968717296453086, + "loss": 6.576, + "step": 3281 + }, + { + "epoch": 0.30624241858729123, + "grad_norm": 166.22826851678164, + "learning_rate": 0.0002996866845333893, + "loss": 6.8405, + "step": 3282 + }, + { + "epoch": 0.30633572828216854, + "grad_norm": 5.210414120373424, + "learning_rate": 0.00029968619572163884, + "loss": 6.9599, + "step": 3283 + }, + { + "epoch": 0.3064290379770458, + "grad_norm": 2.630501233002575, + "learning_rate": 0.0002996857065292807, + "loss": 7.0291, + "step": 3284 + }, + { + "epoch": 0.3065223476719231, + "grad_norm": 3.95082103071334, + "learning_rate": 0.00029968521695631614, + "loss": 7.4132, + "step": 3285 + }, + { + "epoch": 0.3066156573668004, + "grad_norm": 4.056533358943556, + "learning_rate": 0.00029968472700274645, + "loss": 7.392, + "step": 3286 + }, + { + "epoch": 0.3067089670616777, + "grad_norm": 4.556183669927088, + "learning_rate": 0.0002996842366685728, + "loss": 7.1856, + "step": 3287 + }, + { + "epoch": 0.306802276756555, + "grad_norm": 220.30602006079835, + "learning_rate": 0.0002996837459537965, + "loss": 6.9825, + "step": 3288 + }, + { + "epoch": 0.3068955864514323, + "grad_norm": 3.748424008106601, + "learning_rate": 0.0002996832548584188, + "loss": 7.556, + "step": 3289 + }, + { + "epoch": 0.3069888961463096, + "grad_norm": 10.34480527529873, + "learning_rate": 0.0002996827633824409, + "loss": 7.7428, + "step": 3290 + }, + { + "epoch": 0.3070822058411869, + "grad_norm": 12.282968624330922, + "learning_rate": 0.00029968227152586403, + "loss": 7.7742, + "step": 3291 + }, + { + "epoch": 0.3071755155360642, + "grad_norm": 16.092903623506565, + "learning_rate": 0.0002996817792886895, + "loss": 7.3196, + "step": 3292 + }, + { + "epoch": 0.3072688252309415, + "grad_norm": 26.470387204516708, + "learning_rate": 0.00029968128667091855, + "loss": 7.3798, + "step": 3293 + }, + { + "epoch": 0.3073621349258188, + "grad_norm": 10.384962534596852, + "learning_rate": 0.0002996807936725524, + "loss": 7.5609, + "step": 3294 + }, + { + "epoch": 0.3074554446206961, + "grad_norm": 35.97658626037903, + "learning_rate": 0.0002996803002935924, + "loss": 7.5539, + "step": 3295 + }, + { + "epoch": 0.3075487543155734, + "grad_norm": 34.238974056836916, + "learning_rate": 0.0002996798065340397, + "loss": 7.8247, + "step": 3296 + }, + { + "epoch": 0.3076420640104507, + "grad_norm": 27.90017256555198, + "learning_rate": 0.0002996793123938956, + "loss": 7.6288, + "step": 3297 + }, + { + "epoch": 0.307735373705328, + "grad_norm": 16.13410940864432, + "learning_rate": 0.00029967881787316134, + "loss": 7.6008, + "step": 3298 + }, + { + "epoch": 0.3078286834002053, + "grad_norm": 3.7546286650318454, + "learning_rate": 0.0002996783229718382, + "loss": 7.3142, + "step": 3299 + }, + { + "epoch": 0.30792199309508256, + "grad_norm": 389.4419089694259, + "learning_rate": 0.0002996778276899274, + "loss": 8.0803, + "step": 3300 + }, + { + "epoch": 0.30801530278995987, + "grad_norm": 4.529068758765648, + "learning_rate": 0.0002996773320274302, + "loss": 7.2438, + "step": 3301 + }, + { + "epoch": 0.3081086124848372, + "grad_norm": 8.200852496328004, + "learning_rate": 0.0002996768359843479, + "loss": 7.4143, + "step": 3302 + }, + { + "epoch": 0.3082019221797145, + "grad_norm": 8.162195219302612, + "learning_rate": 0.00029967633956068173, + "loss": 7.53, + "step": 3303 + }, + { + "epoch": 0.3082952318745918, + "grad_norm": 3.278037300947941, + "learning_rate": 0.000299675842756433, + "loss": 7.835, + "step": 3304 + }, + { + "epoch": 0.30838854156946904, + "grad_norm": 3.4076577125238567, + "learning_rate": 0.00029967534557160297, + "loss": 7.5222, + "step": 3305 + }, + { + "epoch": 0.30848185126434635, + "grad_norm": 3.056193490687372, + "learning_rate": 0.0002996748480061928, + "loss": 7.5463, + "step": 3306 + }, + { + "epoch": 0.30857516095922366, + "grad_norm": 4.479185698451695, + "learning_rate": 0.00029967435006020385, + "loss": 7.4022, + "step": 3307 + }, + { + "epoch": 0.30866847065410097, + "grad_norm": 3.7659964209948424, + "learning_rate": 0.00029967385173363737, + "loss": 7.4512, + "step": 3308 + }, + { + "epoch": 0.3087617803489783, + "grad_norm": 2.259842689894789, + "learning_rate": 0.0002996733530264947, + "loss": 7.1573, + "step": 3309 + }, + { + "epoch": 0.30885509004385553, + "grad_norm": 6.046261055122479, + "learning_rate": 0.0002996728539387769, + "loss": 7.2876, + "step": 3310 + }, + { + "epoch": 0.30894839973873284, + "grad_norm": 2.3654111398501496, + "learning_rate": 0.0002996723544704854, + "loss": 7.1907, + "step": 3311 + }, + { + "epoch": 0.30904170943361015, + "grad_norm": 187.7136306345294, + "learning_rate": 0.00029967185462162146, + "loss": 7.0352, + "step": 3312 + }, + { + "epoch": 0.30913501912848745, + "grad_norm": 1.9032626072454413, + "learning_rate": 0.00029967135439218633, + "loss": 7.4527, + "step": 3313 + }, + { + "epoch": 0.30922832882336476, + "grad_norm": 22.5590228731397, + "learning_rate": 0.0002996708537821813, + "loss": 7.4586, + "step": 3314 + }, + { + "epoch": 0.30932163851824207, + "grad_norm": 2.044657206975972, + "learning_rate": 0.0002996703527916076, + "loss": 7.6079, + "step": 3315 + }, + { + "epoch": 0.3094149482131193, + "grad_norm": 3.618449559545947, + "learning_rate": 0.00029966985142046654, + "loss": 7.5714, + "step": 3316 + }, + { + "epoch": 0.30950825790799663, + "grad_norm": 2.049182391434748, + "learning_rate": 0.0002996693496687594, + "loss": 7.826, + "step": 3317 + }, + { + "epoch": 0.30960156760287394, + "grad_norm": 2.713800207142277, + "learning_rate": 0.00029966884753648745, + "loss": 7.3123, + "step": 3318 + }, + { + "epoch": 0.30969487729775125, + "grad_norm": 2.209020146765191, + "learning_rate": 0.0002996683450236519, + "loss": 7.012, + "step": 3319 + }, + { + "epoch": 0.30978818699262856, + "grad_norm": 3.32531411356578, + "learning_rate": 0.00029966784213025414, + "loss": 7.4227, + "step": 3320 + }, + { + "epoch": 0.3098814966875058, + "grad_norm": 5.092003822663377, + "learning_rate": 0.00029966733885629533, + "loss": 7.7243, + "step": 3321 + }, + { + "epoch": 0.3099748063823831, + "grad_norm": 3.6322230202523436, + "learning_rate": 0.00029966683520177685, + "loss": 7.2118, + "step": 3322 + }, + { + "epoch": 0.3100681160772604, + "grad_norm": 3.6046775618607834, + "learning_rate": 0.00029966633116669994, + "loss": 7.0771, + "step": 3323 + }, + { + "epoch": 0.31016142577213773, + "grad_norm": 1.7649084486193969, + "learning_rate": 0.00029966582675106584, + "loss": 7.403, + "step": 3324 + }, + { + "epoch": 0.31025473546701504, + "grad_norm": 1.5730540742138144, + "learning_rate": 0.0002996653219548759, + "loss": 7.269, + "step": 3325 + }, + { + "epoch": 0.3103480451618923, + "grad_norm": 2.0563903715062266, + "learning_rate": 0.00029966481677813145, + "loss": 7.5212, + "step": 3326 + }, + { + "epoch": 0.3104413548567696, + "grad_norm": 2.364103500754601, + "learning_rate": 0.00029966431122083366, + "loss": 7.3283, + "step": 3327 + }, + { + "epoch": 0.3105346645516469, + "grad_norm": 2.2116410004548452, + "learning_rate": 0.0002996638052829839, + "loss": 7.2093, + "step": 3328 + }, + { + "epoch": 0.3106279742465242, + "grad_norm": 1.4376901721479503, + "learning_rate": 0.0002996632989645833, + "loss": 7.2495, + "step": 3329 + }, + { + "epoch": 0.3107212839414015, + "grad_norm": 1.1530420704186441, + "learning_rate": 0.0002996627922656333, + "loss": 7.2288, + "step": 3330 + }, + { + "epoch": 0.31081459363627884, + "grad_norm": 1.3947567029707266, + "learning_rate": 0.00029966228518613525, + "loss": 7.1971, + "step": 3331 + }, + { + "epoch": 0.3109079033311561, + "grad_norm": 1.6922723821372503, + "learning_rate": 0.0002996617777260903, + "loss": 7.2056, + "step": 3332 + }, + { + "epoch": 0.3110012130260334, + "grad_norm": 1.7596636317171719, + "learning_rate": 0.0002996612698854997, + "loss": 7.2227, + "step": 3333 + }, + { + "epoch": 0.3110945227209107, + "grad_norm": 1.6347676199075705, + "learning_rate": 0.0002996607616643649, + "loss": 6.9141, + "step": 3334 + }, + { + "epoch": 0.311187832415788, + "grad_norm": 1.5234779362277342, + "learning_rate": 0.00029966025306268706, + "loss": 7.1901, + "step": 3335 + }, + { + "epoch": 0.3112811421106653, + "grad_norm": 1.3530757517716434, + "learning_rate": 0.0002996597440804676, + "loss": 7.1643, + "step": 3336 + }, + { + "epoch": 0.3113744518055426, + "grad_norm": 1.0700762534423922, + "learning_rate": 0.00029965923471770775, + "loss": 7.3102, + "step": 3337 + }, + { + "epoch": 0.3114677615004199, + "grad_norm": 2.5356502614925156, + "learning_rate": 0.00029965872497440875, + "loss": 6.8643, + "step": 3338 + }, + { + "epoch": 0.3115610711952972, + "grad_norm": 2.635438267614494, + "learning_rate": 0.0002996582148505719, + "loss": 7.3225, + "step": 3339 + }, + { + "epoch": 0.3116543808901745, + "grad_norm": 1.7926076777136335, + "learning_rate": 0.0002996577043461986, + "loss": 7.113, + "step": 3340 + }, + { + "epoch": 0.3117476905850518, + "grad_norm": 1.245127072396644, + "learning_rate": 0.0002996571934612901, + "loss": 7.3898, + "step": 3341 + }, + { + "epoch": 0.31184100027992906, + "grad_norm": 2.3427033828078523, + "learning_rate": 0.0002996566821958477, + "loss": 6.9413, + "step": 3342 + }, + { + "epoch": 0.31193430997480637, + "grad_norm": 1.8144851995433, + "learning_rate": 0.00029965617054987265, + "loss": 7.0958, + "step": 3343 + }, + { + "epoch": 0.3120276196696837, + "grad_norm": 2.217144354441095, + "learning_rate": 0.00029965565852336633, + "loss": 7.1485, + "step": 3344 + }, + { + "epoch": 0.312120929364561, + "grad_norm": 1.499231618514282, + "learning_rate": 0.00029965514611632994, + "loss": 7.1845, + "step": 3345 + }, + { + "epoch": 0.3122142390594383, + "grad_norm": 1.0181443795152039, + "learning_rate": 0.0002996546333287649, + "loss": 6.8962, + "step": 3346 + }, + { + "epoch": 0.3123075487543156, + "grad_norm": 1.5676231336905715, + "learning_rate": 0.00029965412016067243, + "loss": 6.6886, + "step": 3347 + }, + { + "epoch": 0.31240085844919285, + "grad_norm": 1.6265712154317755, + "learning_rate": 0.00029965360661205387, + "loss": 7.0009, + "step": 3348 + }, + { + "epoch": 0.31249416814407016, + "grad_norm": 0.8689772558647978, + "learning_rate": 0.00029965309268291054, + "loss": 7.0341, + "step": 3349 + }, + { + "epoch": 0.31258747783894747, + "grad_norm": 2.9727618844317396, + "learning_rate": 0.00029965257837324374, + "loss": 7.3748, + "step": 3350 + }, + { + "epoch": 0.3126807875338248, + "grad_norm": 1.6148726045879593, + "learning_rate": 0.00029965206368305473, + "loss": 7.4112, + "step": 3351 + }, + { + "epoch": 0.3127740972287021, + "grad_norm": 1.6528163750611984, + "learning_rate": 0.00029965154861234485, + "loss": 6.7853, + "step": 3352 + }, + { + "epoch": 0.31286740692357934, + "grad_norm": 2.39386980084742, + "learning_rate": 0.0002996510331611154, + "loss": 6.8692, + "step": 3353 + }, + { + "epoch": 0.31296071661845665, + "grad_norm": 2.276425739213265, + "learning_rate": 0.0002996505173293677, + "loss": 7.0844, + "step": 3354 + }, + { + "epoch": 0.31305402631333396, + "grad_norm": 2.394045533471139, + "learning_rate": 0.0002996500011171031, + "loss": 6.7389, + "step": 3355 + }, + { + "epoch": 0.31314733600821126, + "grad_norm": 2.1523823564544653, + "learning_rate": 0.0002996494845243229, + "loss": 6.8038, + "step": 3356 + }, + { + "epoch": 0.31324064570308857, + "grad_norm": 3.3338100158045996, + "learning_rate": 0.0002996489675510283, + "loss": 6.9581, + "step": 3357 + }, + { + "epoch": 0.3133339553979658, + "grad_norm": 1.4452126016245619, + "learning_rate": 0.00029964845019722077, + "loss": 7.1963, + "step": 3358 + }, + { + "epoch": 0.31342726509284313, + "grad_norm": 2.5995352362704405, + "learning_rate": 0.00029964793246290154, + "loss": 6.9189, + "step": 3359 + }, + { + "epoch": 0.31352057478772044, + "grad_norm": 1.5298445067261064, + "learning_rate": 0.00029964741434807197, + "loss": 6.9004, + "step": 3360 + }, + { + "epoch": 0.31361388448259775, + "grad_norm": 1.1559269205076723, + "learning_rate": 0.00029964689585273334, + "loss": 6.9439, + "step": 3361 + }, + { + "epoch": 0.31370719417747506, + "grad_norm": 1.9301472833489839, + "learning_rate": 0.000299646376976887, + "loss": 7.0263, + "step": 3362 + }, + { + "epoch": 0.3138005038723523, + "grad_norm": 0.876561873806568, + "learning_rate": 0.00029964585772053416, + "loss": 6.288, + "step": 3363 + }, + { + "epoch": 0.3138938135672296, + "grad_norm": 1.1593878857207718, + "learning_rate": 0.0002996453380836763, + "loss": 7.0017, + "step": 3364 + }, + { + "epoch": 0.3139871232621069, + "grad_norm": 1.7491815250241265, + "learning_rate": 0.0002996448180663147, + "loss": 6.6816, + "step": 3365 + }, + { + "epoch": 0.31408043295698423, + "grad_norm": 1.9617475177525305, + "learning_rate": 0.0002996442976684506, + "loss": 6.5344, + "step": 3366 + }, + { + "epoch": 0.31417374265186154, + "grad_norm": 0.9696460127211092, + "learning_rate": 0.0002996437768900854, + "loss": 6.6014, + "step": 3367 + }, + { + "epoch": 0.31426705234673885, + "grad_norm": 1.5183312499779, + "learning_rate": 0.0002996432557312204, + "loss": 6.3338, + "step": 3368 + }, + { + "epoch": 0.3143603620416161, + "grad_norm": 1.728955450492949, + "learning_rate": 0.000299642734191857, + "loss": 6.7192, + "step": 3369 + }, + { + "epoch": 0.3144536717364934, + "grad_norm": 1.1295998473321118, + "learning_rate": 0.0002996422122719964, + "loss": 6.7507, + "step": 3370 + }, + { + "epoch": 0.3145469814313707, + "grad_norm": 1.058267967517208, + "learning_rate": 0.00029964168997164, + "loss": 6.8768, + "step": 3371 + }, + { + "epoch": 0.31464029112624803, + "grad_norm": 1.6733392071618267, + "learning_rate": 0.00029964116729078904, + "loss": 6.4775, + "step": 3372 + }, + { + "epoch": 0.31473360082112534, + "grad_norm": 1.8132362557630273, + "learning_rate": 0.00029964064422944496, + "loss": 6.6323, + "step": 3373 + }, + { + "epoch": 0.3148269105160026, + "grad_norm": 1.4312905179217572, + "learning_rate": 0.0002996401207876091, + "loss": 7.0368, + "step": 3374 + }, + { + "epoch": 0.3149202202108799, + "grad_norm": 1.1903433387156834, + "learning_rate": 0.00029963959696528264, + "loss": 6.1883, + "step": 3375 + }, + { + "epoch": 0.3150135299057572, + "grad_norm": 2.3178337417431156, + "learning_rate": 0.00029963907276246704, + "loss": 6.2039, + "step": 3376 + }, + { + "epoch": 0.3151068396006345, + "grad_norm": 1.3159292133742995, + "learning_rate": 0.00029963854817916365, + "loss": 6.8393, + "step": 3377 + }, + { + "epoch": 0.3152001492955118, + "grad_norm": 2.5382454400940535, + "learning_rate": 0.00029963802321537376, + "loss": 6.5521, + "step": 3378 + }, + { + "epoch": 0.3152934589903891, + "grad_norm": 1.2927717694588094, + "learning_rate": 0.0002996374978710987, + "loss": 6.5266, + "step": 3379 + }, + { + "epoch": 0.3153867686852664, + "grad_norm": 1.3943238219440957, + "learning_rate": 0.00029963697214633976, + "loss": 6.1245, + "step": 3380 + }, + { + "epoch": 0.3154800783801437, + "grad_norm": 1.0875078940387397, + "learning_rate": 0.00029963644604109836, + "loss": 6.5431, + "step": 3381 + }, + { + "epoch": 0.315573388075021, + "grad_norm": 2.0924265670088253, + "learning_rate": 0.0002996359195553758, + "loss": 6.8587, + "step": 3382 + }, + { + "epoch": 0.3156666977698983, + "grad_norm": 1.6218487984121621, + "learning_rate": 0.0002996353926891734, + "loss": 6.6456, + "step": 3383 + }, + { + "epoch": 0.3157600074647756, + "grad_norm": 2.0280651273562516, + "learning_rate": 0.0002996348654424926, + "loss": 6.8614, + "step": 3384 + }, + { + "epoch": 0.31585331715965287, + "grad_norm": 1.431148164948035, + "learning_rate": 0.00029963433781533453, + "loss": 6.5051, + "step": 3385 + }, + { + "epoch": 0.3159466268545302, + "grad_norm": 2.0879211773939543, + "learning_rate": 0.00029963380980770076, + "loss": 6.4345, + "step": 3386 + }, + { + "epoch": 0.3160399365494075, + "grad_norm": 1.2680564560648384, + "learning_rate": 0.00029963328141959254, + "loss": 6.7713, + "step": 3387 + }, + { + "epoch": 0.3161332462442848, + "grad_norm": 1.1366934433095437, + "learning_rate": 0.00029963275265101116, + "loss": 7.0145, + "step": 3388 + }, + { + "epoch": 0.3162265559391621, + "grad_norm": 2.138290816085884, + "learning_rate": 0.000299632223501958, + "loss": 6.511, + "step": 3389 + }, + { + "epoch": 0.31631986563403935, + "grad_norm": 1.629333691505421, + "learning_rate": 0.0002996316939724345, + "loss": 6.5679, + "step": 3390 + }, + { + "epoch": 0.31641317532891666, + "grad_norm": 1.473867424020266, + "learning_rate": 0.0002996311640624419, + "loss": 6.7008, + "step": 3391 + }, + { + "epoch": 0.31650648502379397, + "grad_norm": 1.293525251350132, + "learning_rate": 0.00029963063377198157, + "loss": 6.5547, + "step": 3392 + }, + { + "epoch": 0.3165997947186713, + "grad_norm": 1.4691665393195534, + "learning_rate": 0.0002996301031010548, + "loss": 5.8662, + "step": 3393 + }, + { + "epoch": 0.3166931044135486, + "grad_norm": 1.56158053577733, + "learning_rate": 0.0002996295720496631, + "loss": 6.4626, + "step": 3394 + }, + { + "epoch": 0.31678641410842584, + "grad_norm": 1.1549512098418593, + "learning_rate": 0.00029962904061780766, + "loss": 6.443, + "step": 3395 + }, + { + "epoch": 0.31687972380330315, + "grad_norm": 1.864105143630114, + "learning_rate": 0.0002996285088054899, + "loss": 6.3968, + "step": 3396 + }, + { + "epoch": 0.31697303349818046, + "grad_norm": 9.190935568337691, + "learning_rate": 0.0002996279766127112, + "loss": 6.2237, + "step": 3397 + }, + { + "epoch": 0.31706634319305776, + "grad_norm": 1.6449890773876052, + "learning_rate": 0.00029962744403947284, + "loss": 6.419, + "step": 3398 + }, + { + "epoch": 0.3171596528879351, + "grad_norm": 3.0776773083274307, + "learning_rate": 0.0002996269110857762, + "loss": 6.3699, + "step": 3399 + }, + { + "epoch": 0.3172529625828124, + "grad_norm": 2.2988664763837092, + "learning_rate": 0.00029962637775162273, + "loss": 6.8798, + "step": 3400 + }, + { + "epoch": 0.31734627227768963, + "grad_norm": 4.326948652767106, + "learning_rate": 0.0002996258440370136, + "loss": 6.2226, + "step": 3401 + }, + { + "epoch": 0.31743958197256694, + "grad_norm": 2.0267805302786632, + "learning_rate": 0.00029962530994195034, + "loss": 6.7838, + "step": 3402 + }, + { + "epoch": 0.31753289166744425, + "grad_norm": 154922.4977791225, + "learning_rate": 0.0002996247754664342, + "loss": 6.6022, + "step": 3403 + }, + { + "epoch": 0.31762620136232156, + "grad_norm": 2.5821429695951084, + "learning_rate": 0.00029962424061046657, + "loss": 7.0062, + "step": 3404 + }, + { + "epoch": 0.31771951105719887, + "grad_norm": 2.4661381932702424, + "learning_rate": 0.0002996237053740489, + "loss": 6.8464, + "step": 3405 + }, + { + "epoch": 0.3178128207520761, + "grad_norm": 1.9395170606894496, + "learning_rate": 0.0002996231697571824, + "loss": 7.1926, + "step": 3406 + }, + { + "epoch": 0.3179061304469534, + "grad_norm": 2.8766828264679454, + "learning_rate": 0.00029962263375986846, + "loss": 6.9848, + "step": 3407 + }, + { + "epoch": 0.31799944014183074, + "grad_norm": 2.1125169910116868, + "learning_rate": 0.00029962209738210855, + "loss": 6.9795, + "step": 3408 + }, + { + "epoch": 0.31809274983670804, + "grad_norm": 2.255323629332996, + "learning_rate": 0.000299621560623904, + "loss": 7.1058, + "step": 3409 + }, + { + "epoch": 0.31818605953158535, + "grad_norm": 2.4443333951651383, + "learning_rate": 0.0002996210234852561, + "loss": 7.3844, + "step": 3410 + }, + { + "epoch": 0.3182793692264626, + "grad_norm": 2.5845842801457555, + "learning_rate": 0.00029962048596616624, + "loss": 6.9018, + "step": 3411 + }, + { + "epoch": 0.3183726789213399, + "grad_norm": 2.1391154937167025, + "learning_rate": 0.0002996199480666358, + "loss": 6.9993, + "step": 3412 + }, + { + "epoch": 0.3184659886162172, + "grad_norm": 8.229754775409246, + "learning_rate": 0.0002996194097866662, + "loss": 7.1011, + "step": 3413 + }, + { + "epoch": 0.31855929831109453, + "grad_norm": 2.777967718494061, + "learning_rate": 0.0002996188711262587, + "loss": 7.2082, + "step": 3414 + }, + { + "epoch": 0.31865260800597184, + "grad_norm": 4.029675450877059, + "learning_rate": 0.00029961833208541475, + "loss": 6.7705, + "step": 3415 + }, + { + "epoch": 0.3187459177008491, + "grad_norm": 4.381600878672249, + "learning_rate": 0.00029961779266413573, + "loss": 6.953, + "step": 3416 + }, + { + "epoch": 0.3188392273957264, + "grad_norm": 2.1572678420516485, + "learning_rate": 0.000299617252862423, + "loss": 6.9705, + "step": 3417 + }, + { + "epoch": 0.3189325370906037, + "grad_norm": 2.5559940754570185, + "learning_rate": 0.00029961671268027786, + "loss": 7.1766, + "step": 3418 + }, + { + "epoch": 0.319025846785481, + "grad_norm": 1.515077879597823, + "learning_rate": 0.00029961617211770176, + "loss": 6.734, + "step": 3419 + }, + { + "epoch": 0.3191191564803583, + "grad_norm": 1.9375646091528753, + "learning_rate": 0.0002996156311746961, + "loss": 7.014, + "step": 3420 + }, + { + "epoch": 0.31921246617523563, + "grad_norm": 1.9980287715202734, + "learning_rate": 0.00029961508985126216, + "loss": 6.9722, + "step": 3421 + }, + { + "epoch": 0.3193057758701129, + "grad_norm": 2.254953008134551, + "learning_rate": 0.00029961454814740135, + "loss": 7.0995, + "step": 3422 + }, + { + "epoch": 0.3193990855649902, + "grad_norm": 1.7252879822279878, + "learning_rate": 0.0002996140060631151, + "loss": 7.1059, + "step": 3423 + }, + { + "epoch": 0.3194923952598675, + "grad_norm": 1.7757495159585406, + "learning_rate": 0.00029961346359840473, + "loss": 7.0194, + "step": 3424 + }, + { + "epoch": 0.3195857049547448, + "grad_norm": 1.4881344524825995, + "learning_rate": 0.00029961292075327163, + "loss": 6.8622, + "step": 3425 + }, + { + "epoch": 0.3196790146496221, + "grad_norm": 1.1932029900911687, + "learning_rate": 0.00029961237752771725, + "loss": 6.7461, + "step": 3426 + }, + { + "epoch": 0.31977232434449937, + "grad_norm": 1.5011627522072217, + "learning_rate": 0.00029961183392174283, + "loss": 6.7976, + "step": 3427 + }, + { + "epoch": 0.3198656340393767, + "grad_norm": 1.4590400861736874, + "learning_rate": 0.0002996112899353499, + "loss": 6.7065, + "step": 3428 + }, + { + "epoch": 0.319958943734254, + "grad_norm": 1.1050131233676919, + "learning_rate": 0.00029961074556853975, + "loss": 6.3888, + "step": 3429 + }, + { + "epoch": 0.3200522534291313, + "grad_norm": 2.7695681213547805, + "learning_rate": 0.0002996102008213138, + "loss": 6.4205, + "step": 3430 + }, + { + "epoch": 0.3201455631240086, + "grad_norm": 1.5684133380206449, + "learning_rate": 0.00029960965569367345, + "loss": 7.0013, + "step": 3431 + }, + { + "epoch": 0.32023887281888586, + "grad_norm": 2.3126781693030893, + "learning_rate": 0.00029960911018562, + "loss": 6.7184, + "step": 3432 + }, + { + "epoch": 0.32033218251376316, + "grad_norm": 1.5291448959461664, + "learning_rate": 0.00029960856429715494, + "loss": 6.6904, + "step": 3433 + }, + { + "epoch": 0.32042549220864047, + "grad_norm": 1.7171275275530093, + "learning_rate": 0.00029960801802827964, + "loss": 6.7152, + "step": 3434 + }, + { + "epoch": 0.3205188019035178, + "grad_norm": 1.7944358873284392, + "learning_rate": 0.00029960747137899543, + "loss": 6.941, + "step": 3435 + }, + { + "epoch": 0.3206121115983951, + "grad_norm": 1.8907629322332582, + "learning_rate": 0.00029960692434930375, + "loss": 6.7048, + "step": 3436 + }, + { + "epoch": 0.3207054212932724, + "grad_norm": 1.78221692831175, + "learning_rate": 0.000299606376939206, + "loss": 6.7299, + "step": 3437 + }, + { + "epoch": 0.32079873098814965, + "grad_norm": 1.1612546237829107, + "learning_rate": 0.0002996058291487035, + "loss": 6.3982, + "step": 3438 + }, + { + "epoch": 0.32089204068302696, + "grad_norm": 1.305213911914595, + "learning_rate": 0.00029960528097779777, + "loss": 6.6293, + "step": 3439 + }, + { + "epoch": 0.32098535037790427, + "grad_norm": 1.593937427009202, + "learning_rate": 0.00029960473242649004, + "loss": 6.7875, + "step": 3440 + }, + { + "epoch": 0.3210786600727816, + "grad_norm": 3.310719921735257, + "learning_rate": 0.00029960418349478184, + "loss": 7.3388, + "step": 3441 + }, + { + "epoch": 0.3211719697676589, + "grad_norm": 1.7500636710739368, + "learning_rate": 0.00029960363418267454, + "loss": 6.5703, + "step": 3442 + }, + { + "epoch": 0.32126527946253614, + "grad_norm": 2.1564626854193145, + "learning_rate": 0.00029960308449016943, + "loss": 6.812, + "step": 3443 + }, + { + "epoch": 0.32135858915741344, + "grad_norm": 1.8273318421654943, + "learning_rate": 0.0002996025344172681, + "loss": 6.9078, + "step": 3444 + }, + { + "epoch": 0.32145189885229075, + "grad_norm": 1.5910442456173384, + "learning_rate": 0.00029960198396397175, + "loss": 6.7919, + "step": 3445 + }, + { + "epoch": 0.32154520854716806, + "grad_norm": 7.762780018694255, + "learning_rate": 0.00029960143313028195, + "loss": 6.742, + "step": 3446 + }, + { + "epoch": 0.32163851824204537, + "grad_norm": 1.6289360374163642, + "learning_rate": 0.00029960088191619997, + "loss": 6.5396, + "step": 3447 + }, + { + "epoch": 0.3217318279369226, + "grad_norm": 1.5188157727503147, + "learning_rate": 0.00029960033032172724, + "loss": 6.6318, + "step": 3448 + }, + { + "epoch": 0.32182513763179993, + "grad_norm": 1.310914728848977, + "learning_rate": 0.0002995997783468652, + "loss": 6.3746, + "step": 3449 + }, + { + "epoch": 0.32191844732667724, + "grad_norm": 1.0562720908623198, + "learning_rate": 0.0002995992259916153, + "loss": 6.3765, + "step": 3450 + }, + { + "epoch": 0.32201175702155455, + "grad_norm": 1.9652177008547898, + "learning_rate": 0.0002995986732559788, + "loss": 6.5533, + "step": 3451 + }, + { + "epoch": 0.32210506671643185, + "grad_norm": 2.661822738528564, + "learning_rate": 0.0002995981201399572, + "loss": 6.8443, + "step": 3452 + }, + { + "epoch": 0.32219837641130916, + "grad_norm": 1.5569683220974702, + "learning_rate": 0.00029959756664355193, + "loss": 6.5531, + "step": 3453 + }, + { + "epoch": 0.3222916861061864, + "grad_norm": 1.9826703861613706, + "learning_rate": 0.0002995970127667644, + "loss": 6.6679, + "step": 3454 + }, + { + "epoch": 0.3223849958010637, + "grad_norm": 2.475972873594254, + "learning_rate": 0.00029959645850959586, + "loss": 6.686, + "step": 3455 + }, + { + "epoch": 0.32247830549594103, + "grad_norm": 4.925400581350343, + "learning_rate": 0.00029959590387204795, + "loss": 6.7477, + "step": 3456 + }, + { + "epoch": 0.32257161519081834, + "grad_norm": 1.5498146925666596, + "learning_rate": 0.0002995953488541219, + "loss": 6.1856, + "step": 3457 + }, + { + "epoch": 0.32266492488569565, + "grad_norm": 1.5411129931319743, + "learning_rate": 0.0002995947934558192, + "loss": 6.7343, + "step": 3458 + }, + { + "epoch": 0.3227582345805729, + "grad_norm": 1.3869609110574062, + "learning_rate": 0.0002995942376771413, + "loss": 6.6215, + "step": 3459 + }, + { + "epoch": 0.3228515442754502, + "grad_norm": 2.481246290816785, + "learning_rate": 0.00029959368151808954, + "loss": 6.7999, + "step": 3460 + }, + { + "epoch": 0.3229448539703275, + "grad_norm": 2.3109635552829504, + "learning_rate": 0.0002995931249786654, + "loss": 6.6857, + "step": 3461 + }, + { + "epoch": 0.3230381636652048, + "grad_norm": 2.4934568995269117, + "learning_rate": 0.0002995925680588702, + "loss": 6.8317, + "step": 3462 + }, + { + "epoch": 0.32313147336008213, + "grad_norm": 2.4826536820392904, + "learning_rate": 0.00029959201075870545, + "loss": 6.7826, + "step": 3463 + }, + { + "epoch": 0.3232247830549594, + "grad_norm": 1.4508401623195402, + "learning_rate": 0.00029959145307817246, + "loss": 6.7975, + "step": 3464 + }, + { + "epoch": 0.3233180927498367, + "grad_norm": 1.3476727567500242, + "learning_rate": 0.0002995908950172728, + "loss": 6.1268, + "step": 3465 + }, + { + "epoch": 0.323411402444714, + "grad_norm": 5.131132819007855, + "learning_rate": 0.00029959033657600776, + "loss": 6.5399, + "step": 3466 + }, + { + "epoch": 0.3235047121395913, + "grad_norm": 2.769724769025756, + "learning_rate": 0.0002995897777543788, + "loss": 7.1005, + "step": 3467 + }, + { + "epoch": 0.3235980218344686, + "grad_norm": 2.09429439986575, + "learning_rate": 0.0002995892185523874, + "loss": 6.6445, + "step": 3468 + }, + { + "epoch": 0.32369133152934587, + "grad_norm": 3.8952656603530773, + "learning_rate": 0.0002995886589700349, + "loss": 6.7475, + "step": 3469 + }, + { + "epoch": 0.3237846412242232, + "grad_norm": 1.487340054219697, + "learning_rate": 0.00029958809900732273, + "loss": 6.5877, + "step": 3470 + }, + { + "epoch": 0.3238779509191005, + "grad_norm": 2.1769868964616648, + "learning_rate": 0.00029958753866425234, + "loss": 6.6614, + "step": 3471 + }, + { + "epoch": 0.3239712606139778, + "grad_norm": 2.4484582869185263, + "learning_rate": 0.0002995869779408252, + "loss": 6.607, + "step": 3472 + }, + { + "epoch": 0.3240645703088551, + "grad_norm": 2.2763056786202047, + "learning_rate": 0.0002995864168370426, + "loss": 6.8912, + "step": 3473 + }, + { + "epoch": 0.3241578800037324, + "grad_norm": 2.037849048484423, + "learning_rate": 0.0002995858553529061, + "loss": 6.6185, + "step": 3474 + }, + { + "epoch": 0.32425118969860967, + "grad_norm": 1.6659430415138985, + "learning_rate": 0.0002995852934884171, + "loss": 6.784, + "step": 3475 + }, + { + "epoch": 0.324344499393487, + "grad_norm": 1.6138690278200274, + "learning_rate": 0.000299584731243577, + "loss": 6.8862, + "step": 3476 + }, + { + "epoch": 0.3244378090883643, + "grad_norm": 3.0670749125000767, + "learning_rate": 0.00029958416861838725, + "loss": 6.5858, + "step": 3477 + }, + { + "epoch": 0.3245311187832416, + "grad_norm": 4.301532309041833, + "learning_rate": 0.0002995836056128492, + "loss": 6.5928, + "step": 3478 + }, + { + "epoch": 0.3246244284781189, + "grad_norm": 1.713003695311036, + "learning_rate": 0.0002995830422269644, + "loss": 6.7227, + "step": 3479 + }, + { + "epoch": 0.32471773817299615, + "grad_norm": 3.887883612666169, + "learning_rate": 0.00029958247846073425, + "loss": 6.496, + "step": 3480 + }, + { + "epoch": 0.32481104786787346, + "grad_norm": 1.8294717765191482, + "learning_rate": 0.0002995819143141601, + "loss": 6.7321, + "step": 3481 + }, + { + "epoch": 0.32490435756275077, + "grad_norm": 2.446455859810892, + "learning_rate": 0.00029958134978724354, + "loss": 6.4402, + "step": 3482 + }, + { + "epoch": 0.3249976672576281, + "grad_norm": 6.401890558909593, + "learning_rate": 0.0002995807848799859, + "loss": 6.6076, + "step": 3483 + }, + { + "epoch": 0.3250909769525054, + "grad_norm": 4.944764883023873, + "learning_rate": 0.00029958021959238857, + "loss": 6.3678, + "step": 3484 + }, + { + "epoch": 0.32518428664738264, + "grad_norm": 2.452318493609916, + "learning_rate": 0.0002995796539244531, + "loss": 6.3855, + "step": 3485 + }, + { + "epoch": 0.32527759634225994, + "grad_norm": 1.3940161103763153, + "learning_rate": 0.0002995790878761809, + "loss": 6.203, + "step": 3486 + }, + { + "epoch": 0.32537090603713725, + "grad_norm": 3.806824587695105, + "learning_rate": 0.0002995785214475733, + "loss": 6.5123, + "step": 3487 + }, + { + "epoch": 0.32546421573201456, + "grad_norm": 1.9927684736430924, + "learning_rate": 0.00029957795463863196, + "loss": 6.1489, + "step": 3488 + }, + { + "epoch": 0.32555752542689187, + "grad_norm": 1.6462351885925781, + "learning_rate": 0.00029957738744935806, + "loss": 6.5496, + "step": 3489 + }, + { + "epoch": 0.3256508351217692, + "grad_norm": 2.6792720440527837, + "learning_rate": 0.00029957681987975325, + "loss": 6.8305, + "step": 3490 + }, + { + "epoch": 0.32574414481664643, + "grad_norm": 3.1272339115652787, + "learning_rate": 0.0002995762519298189, + "loss": 6.6182, + "step": 3491 + }, + { + "epoch": 0.32583745451152374, + "grad_norm": 2.02381924590526, + "learning_rate": 0.0002995756835995564, + "loss": 6.4413, + "step": 3492 + }, + { + "epoch": 0.32593076420640105, + "grad_norm": 2.025202726481116, + "learning_rate": 0.00029957511488896727, + "loss": 6.5618, + "step": 3493 + }, + { + "epoch": 0.32602407390127836, + "grad_norm": 6.358918602010023, + "learning_rate": 0.00029957454579805295, + "loss": 6.3399, + "step": 3494 + }, + { + "epoch": 0.32611738359615566, + "grad_norm": 1.55509559207534, + "learning_rate": 0.0002995739763268148, + "loss": 6.3067, + "step": 3495 + }, + { + "epoch": 0.3262106932910329, + "grad_norm": 2.0097848531381444, + "learning_rate": 0.0002995734064752544, + "loss": 6.6835, + "step": 3496 + }, + { + "epoch": 0.3263040029859102, + "grad_norm": 2.711889554485398, + "learning_rate": 0.0002995728362433731, + "loss": 6.2728, + "step": 3497 + }, + { + "epoch": 0.32639731268078753, + "grad_norm": 1.421349483112208, + "learning_rate": 0.0002995722656311724, + "loss": 6.337, + "step": 3498 + }, + { + "epoch": 0.32649062237566484, + "grad_norm": 1.7964922832901964, + "learning_rate": 0.0002995716946386537, + "loss": 6.5031, + "step": 3499 + }, + { + "epoch": 0.32658393207054215, + "grad_norm": 3.1476314018415814, + "learning_rate": 0.00029957112326581855, + "loss": 5.9561, + "step": 3500 + }, + { + "epoch": 0.3266772417654194, + "grad_norm": 5.788279089094578, + "learning_rate": 0.0002995705515126683, + "loss": 6.6011, + "step": 3501 + }, + { + "epoch": 0.3267705514602967, + "grad_norm": 5.41154167075424, + "learning_rate": 0.0002995699793792045, + "loss": 6.6344, + "step": 3502 + }, + { + "epoch": 0.326863861155174, + "grad_norm": 1.2885815481388823, + "learning_rate": 0.00029956940686542846, + "loss": 6.4751, + "step": 3503 + }, + { + "epoch": 0.3269571708500513, + "grad_norm": 2.6285567678077273, + "learning_rate": 0.00029956883397134175, + "loss": 6.6672, + "step": 3504 + }, + { + "epoch": 0.32705048054492863, + "grad_norm": 1.4077387254577256, + "learning_rate": 0.00029956826069694586, + "loss": 6.4481, + "step": 3505 + }, + { + "epoch": 0.32714379023980594, + "grad_norm": 1.9190285390114687, + "learning_rate": 0.0002995676870422421, + "loss": 6.7608, + "step": 3506 + }, + { + "epoch": 0.3272370999346832, + "grad_norm": 1.3321910059347746, + "learning_rate": 0.0002995671130072321, + "loss": 6.3688, + "step": 3507 + }, + { + "epoch": 0.3273304096295605, + "grad_norm": 1.763571603935637, + "learning_rate": 0.0002995665385919172, + "loss": 6.6352, + "step": 3508 + }, + { + "epoch": 0.3274237193244378, + "grad_norm": 3.1491149290015095, + "learning_rate": 0.0002995659637962989, + "loss": 6.9169, + "step": 3509 + }, + { + "epoch": 0.3275170290193151, + "grad_norm": 4.835720502090978, + "learning_rate": 0.0002995653886203786, + "loss": 6.3992, + "step": 3510 + }, + { + "epoch": 0.32761033871419243, + "grad_norm": 1.4146831414259045, + "learning_rate": 0.00029956481306415793, + "loss": 6.3217, + "step": 3511 + }, + { + "epoch": 0.3277036484090697, + "grad_norm": 1.8916653171576645, + "learning_rate": 0.00029956423712763816, + "loss": 6.1969, + "step": 3512 + }, + { + "epoch": 0.327796958103947, + "grad_norm": 1.6777659020604498, + "learning_rate": 0.00029956366081082086, + "loss": 6.5773, + "step": 3513 + }, + { + "epoch": 0.3278902677988243, + "grad_norm": 1.5418655374124743, + "learning_rate": 0.0002995630841137075, + "loss": 6.7231, + "step": 3514 + }, + { + "epoch": 0.3279835774937016, + "grad_norm": 1.4770792481911492, + "learning_rate": 0.0002995625070362995, + "loss": 6.4168, + "step": 3515 + }, + { + "epoch": 0.3280768871885789, + "grad_norm": 6.467166368896856, + "learning_rate": 0.00029956192957859834, + "loss": 6.5164, + "step": 3516 + }, + { + "epoch": 0.32817019688345617, + "grad_norm": 1.7968579378108454, + "learning_rate": 0.00029956135174060556, + "loss": 6.7484, + "step": 3517 + }, + { + "epoch": 0.3282635065783335, + "grad_norm": 2.6231704747756295, + "learning_rate": 0.00029956077352232246, + "loss": 6.5988, + "step": 3518 + }, + { + "epoch": 0.3283568162732108, + "grad_norm": 2.033441768984592, + "learning_rate": 0.0002995601949237507, + "loss": 6.5726, + "step": 3519 + }, + { + "epoch": 0.3284501259680881, + "grad_norm": 2.058910497249853, + "learning_rate": 0.00029955961594489163, + "loss": 6.3962, + "step": 3520 + }, + { + "epoch": 0.3285434356629654, + "grad_norm": 2.077313789742227, + "learning_rate": 0.00029955903658574677, + "loss": 6.686, + "step": 3521 + }, + { + "epoch": 0.32863674535784265, + "grad_norm": 2.4892041825428675, + "learning_rate": 0.00029955845684631756, + "loss": 6.5217, + "step": 3522 + }, + { + "epoch": 0.32873005505271996, + "grad_norm": 2.6589366891502384, + "learning_rate": 0.00029955787672660553, + "loss": 6.6949, + "step": 3523 + }, + { + "epoch": 0.32882336474759727, + "grad_norm": 1.6423593104451155, + "learning_rate": 0.00029955729622661213, + "loss": 6.5196, + "step": 3524 + }, + { + "epoch": 0.3289166744424746, + "grad_norm": 1.5961778501126271, + "learning_rate": 0.0002995567153463388, + "loss": 6.3519, + "step": 3525 + }, + { + "epoch": 0.3290099841373519, + "grad_norm": 2.6023799388834217, + "learning_rate": 0.00029955613408578705, + "loss": 6.5166, + "step": 3526 + }, + { + "epoch": 0.3291032938322292, + "grad_norm": 1.48783574030821, + "learning_rate": 0.00029955555244495834, + "loss": 6.4701, + "step": 3527 + }, + { + "epoch": 0.32919660352710645, + "grad_norm": 1.8353294166068148, + "learning_rate": 0.0002995549704238542, + "loss": 6.415, + "step": 3528 + }, + { + "epoch": 0.32928991322198375, + "grad_norm": 1.1909877861497775, + "learning_rate": 0.000299554388022476, + "loss": 6.4002, + "step": 3529 + }, + { + "epoch": 0.32938322291686106, + "grad_norm": 4.384585554310168, + "learning_rate": 0.00029955380524082534, + "loss": 6.2507, + "step": 3530 + }, + { + "epoch": 0.32947653261173837, + "grad_norm": 2.7487732784154115, + "learning_rate": 0.0002995532220789037, + "loss": 6.3107, + "step": 3531 + }, + { + "epoch": 0.3295698423066157, + "grad_norm": 2.389658239824183, + "learning_rate": 0.00029955263853671246, + "loss": 6.4374, + "step": 3532 + }, + { + "epoch": 0.32966315200149293, + "grad_norm": 5.395576874288757, + "learning_rate": 0.00029955205461425314, + "loss": 6.2945, + "step": 3533 + }, + { + "epoch": 0.32975646169637024, + "grad_norm": 2.6033773492500254, + "learning_rate": 0.0002995514703115273, + "loss": 6.6355, + "step": 3534 + }, + { + "epoch": 0.32984977139124755, + "grad_norm": 15.42651999565507, + "learning_rate": 0.00029955088562853637, + "loss": 6.5401, + "step": 3535 + }, + { + "epoch": 0.32994308108612486, + "grad_norm": 8.582996596989878, + "learning_rate": 0.0002995503005652818, + "loss": 6.3881, + "step": 3536 + }, + { + "epoch": 0.33003639078100216, + "grad_norm": 2.159979079093086, + "learning_rate": 0.00029954971512176516, + "loss": 6.5967, + "step": 3537 + }, + { + "epoch": 0.3301297004758794, + "grad_norm": 1.438839787213543, + "learning_rate": 0.00029954912929798787, + "loss": 6.2528, + "step": 3538 + }, + { + "epoch": 0.3302230101707567, + "grad_norm": 7.552433660025885, + "learning_rate": 0.00029954854309395144, + "loss": 6.5487, + "step": 3539 + }, + { + "epoch": 0.33031631986563403, + "grad_norm": 1.4559141310763783, + "learning_rate": 0.00029954795650965734, + "loss": 5.9834, + "step": 3540 + }, + { + "epoch": 0.33040962956051134, + "grad_norm": 3.071220960457651, + "learning_rate": 0.00029954736954510713, + "loss": 6.6601, + "step": 3541 + }, + { + "epoch": 0.33050293925538865, + "grad_norm": 1.3489297055485792, + "learning_rate": 0.0002995467822003023, + "loss": 6.3921, + "step": 3542 + }, + { + "epoch": 0.33059624895026596, + "grad_norm": 1.439755483838545, + "learning_rate": 0.00029954619447524416, + "loss": 6.5798, + "step": 3543 + }, + { + "epoch": 0.3306895586451432, + "grad_norm": 42.2427580353901, + "learning_rate": 0.0002995456063699345, + "loss": 6.4937, + "step": 3544 + }, + { + "epoch": 0.3307828683400205, + "grad_norm": 13.914863752217508, + "learning_rate": 0.00029954501788437456, + "loss": 6.8304, + "step": 3545 + }, + { + "epoch": 0.3308761780348978, + "grad_norm": 2.839722800173688, + "learning_rate": 0.00029954442901856596, + "loss": 6.3435, + "step": 3546 + }, + { + "epoch": 0.33096948772977514, + "grad_norm": 1.8369992085811662, + "learning_rate": 0.0002995438397725102, + "loss": 6.6628, + "step": 3547 + }, + { + "epoch": 0.33106279742465244, + "grad_norm": 2.5396544151614613, + "learning_rate": 0.00029954325014620875, + "loss": 6.6916, + "step": 3548 + }, + { + "epoch": 0.3311561071195297, + "grad_norm": 1.786867651780834, + "learning_rate": 0.000299542660139663, + "loss": 6.2447, + "step": 3549 + }, + { + "epoch": 0.331249416814407, + "grad_norm": 1.9339971467492398, + "learning_rate": 0.0002995420697528747, + "loss": 6.6587, + "step": 3550 + }, + { + "epoch": 0.3313427265092843, + "grad_norm": 1.5563732294120824, + "learning_rate": 0.0002995414789858452, + "loss": 6.3865, + "step": 3551 + }, + { + "epoch": 0.3314360362041616, + "grad_norm": 1.6593204531598618, + "learning_rate": 0.00029954088783857604, + "loss": 6.4001, + "step": 3552 + }, + { + "epoch": 0.33152934589903893, + "grad_norm": 2.7319254791510748, + "learning_rate": 0.0002995402963110686, + "loss": 6.5243, + "step": 3553 + }, + { + "epoch": 0.3316226555939162, + "grad_norm": 1.7249224100385392, + "learning_rate": 0.00029953970440332455, + "loss": 6.385, + "step": 3554 + }, + { + "epoch": 0.3317159652887935, + "grad_norm": 1.6149924433120568, + "learning_rate": 0.0002995391121153453, + "loss": 6.4797, + "step": 3555 + }, + { + "epoch": 0.3318092749836708, + "grad_norm": 1.7769537426934636, + "learning_rate": 0.0002995385194471324, + "loss": 6.5547, + "step": 3556 + }, + { + "epoch": 0.3319025846785481, + "grad_norm": 1.5237038724186296, + "learning_rate": 0.00029953792639868736, + "loss": 6.5558, + "step": 3557 + }, + { + "epoch": 0.3319958943734254, + "grad_norm": 1.9607427600591048, + "learning_rate": 0.00029953733297001165, + "loss": 6.6793, + "step": 3558 + }, + { + "epoch": 0.3320892040683027, + "grad_norm": 2.6442680645116603, + "learning_rate": 0.00029953673916110677, + "loss": 6.5692, + "step": 3559 + }, + { + "epoch": 0.33218251376318, + "grad_norm": 1.5287343840820062, + "learning_rate": 0.0002995361449719743, + "loss": 6.5459, + "step": 3560 + }, + { + "epoch": 0.3322758234580573, + "grad_norm": 4.698537515183524, + "learning_rate": 0.00029953555040261576, + "loss": 6.5736, + "step": 3561 + }, + { + "epoch": 0.3323691331529346, + "grad_norm": 1.8365255984332447, + "learning_rate": 0.0002995349554530325, + "loss": 6.4136, + "step": 3562 + }, + { + "epoch": 0.3324624428478119, + "grad_norm": 1.902015344985132, + "learning_rate": 0.0002995343601232262, + "loss": 6.3565, + "step": 3563 + }, + { + "epoch": 0.3325557525426892, + "grad_norm": 1.4894188621189208, + "learning_rate": 0.0002995337644131983, + "loss": 6.1167, + "step": 3564 + }, + { + "epoch": 0.33264906223756646, + "grad_norm": 1.3795379623049122, + "learning_rate": 0.00029953316832295035, + "loss": 6.3269, + "step": 3565 + }, + { + "epoch": 0.33274237193244377, + "grad_norm": 1.369782881116122, + "learning_rate": 0.00029953257185248385, + "loss": 6.0551, + "step": 3566 + }, + { + "epoch": 0.3328356816273211, + "grad_norm": 1.306981795229108, + "learning_rate": 0.00029953197500180034, + "loss": 6.4142, + "step": 3567 + }, + { + "epoch": 0.3329289913221984, + "grad_norm": 1.428680375177703, + "learning_rate": 0.0002995313777709013, + "loss": 5.9708, + "step": 3568 + }, + { + "epoch": 0.3330223010170757, + "grad_norm": 1.3871385004650385, + "learning_rate": 0.00029953078015978817, + "loss": 6.6392, + "step": 3569 + }, + { + "epoch": 0.33311561071195295, + "grad_norm": 1.6848696660702915, + "learning_rate": 0.00029953018216846266, + "loss": 6.5614, + "step": 3570 + }, + { + "epoch": 0.33320892040683026, + "grad_norm": 1.675603170105531, + "learning_rate": 0.00029952958379692615, + "loss": 6.5244, + "step": 3571 + }, + { + "epoch": 0.33330223010170756, + "grad_norm": 1.5379322358460423, + "learning_rate": 0.0002995289850451802, + "loss": 6.5131, + "step": 3572 + }, + { + "epoch": 0.33339553979658487, + "grad_norm": 1.4453217771912183, + "learning_rate": 0.00029952838591322637, + "loss": 6.2023, + "step": 3573 + }, + { + "epoch": 0.3334888494914622, + "grad_norm": 1.459933869296828, + "learning_rate": 0.0002995277864010661, + "loss": 6.3982, + "step": 3574 + }, + { + "epoch": 0.33358215918633943, + "grad_norm": 1.4173685458622187, + "learning_rate": 0.000299527186508701, + "loss": 6.3438, + "step": 3575 + }, + { + "epoch": 0.33367546888121674, + "grad_norm": 2.1172654744262416, + "learning_rate": 0.00029952658623613255, + "loss": 6.2764, + "step": 3576 + }, + { + "epoch": 0.33376877857609405, + "grad_norm": 1.917046176091507, + "learning_rate": 0.00029952598558336224, + "loss": 6.5643, + "step": 3577 + }, + { + "epoch": 0.33386208827097136, + "grad_norm": 2.791656206151246, + "learning_rate": 0.0002995253845503917, + "loss": 6.5393, + "step": 3578 + }, + { + "epoch": 0.33395539796584867, + "grad_norm": 1.1051696042288557, + "learning_rate": 0.0002995247831372224, + "loss": 6.4021, + "step": 3579 + }, + { + "epoch": 0.334048707660726, + "grad_norm": 1.9192264790651135, + "learning_rate": 0.00029952418134385585, + "loss": 6.7079, + "step": 3580 + }, + { + "epoch": 0.3341420173556032, + "grad_norm": 1.2103763362296662, + "learning_rate": 0.0002995235791702936, + "loss": 6.2874, + "step": 3581 + }, + { + "epoch": 0.33423532705048054, + "grad_norm": 1.9165627799597238, + "learning_rate": 0.0002995229766165372, + "loss": 6.4319, + "step": 3582 + }, + { + "epoch": 0.33432863674535784, + "grad_norm": 1.4886084383973486, + "learning_rate": 0.0002995223736825881, + "loss": 6.6268, + "step": 3583 + }, + { + "epoch": 0.33442194644023515, + "grad_norm": 1.7652085067737855, + "learning_rate": 0.00029952177036844793, + "loss": 5.98, + "step": 3584 + }, + { + "epoch": 0.33451525613511246, + "grad_norm": 3.1836828561476245, + "learning_rate": 0.0002995211666741182, + "loss": 6.2976, + "step": 3585 + }, + { + "epoch": 0.3346085658299897, + "grad_norm": 2.3307461912521497, + "learning_rate": 0.0002995205625996005, + "loss": 6.3141, + "step": 3586 + }, + { + "epoch": 0.334701875524867, + "grad_norm": 1.5338097841686296, + "learning_rate": 0.0002995199581448962, + "loss": 6.6275, + "step": 3587 + }, + { + "epoch": 0.33479518521974433, + "grad_norm": 1.735329913635411, + "learning_rate": 0.00029951935331000695, + "loss": 6.6741, + "step": 3588 + }, + { + "epoch": 0.33488849491462164, + "grad_norm": 1.8209335659056745, + "learning_rate": 0.0002995187480949343, + "loss": 6.3126, + "step": 3589 + }, + { + "epoch": 0.33498180460949895, + "grad_norm": 1.186461587286892, + "learning_rate": 0.00029951814249967973, + "loss": 6.3617, + "step": 3590 + }, + { + "epoch": 0.3350751143043762, + "grad_norm": 1.4401396912897542, + "learning_rate": 0.00029951753652424486, + "loss": 5.9937, + "step": 3591 + }, + { + "epoch": 0.3351684239992535, + "grad_norm": 1.5101901299320897, + "learning_rate": 0.0002995169301686312, + "loss": 6.2282, + "step": 3592 + }, + { + "epoch": 0.3352617336941308, + "grad_norm": 1.4882896904500769, + "learning_rate": 0.00029951632343284024, + "loss": 5.9622, + "step": 3593 + }, + { + "epoch": 0.3353550433890081, + "grad_norm": 1.2997711842251891, + "learning_rate": 0.00029951571631687353, + "loss": 6.5198, + "step": 3594 + }, + { + "epoch": 0.33544835308388543, + "grad_norm": 1.6577269272210824, + "learning_rate": 0.0002995151088207327, + "loss": 6.2979, + "step": 3595 + }, + { + "epoch": 0.33554166277876274, + "grad_norm": 1.9699257847350813, + "learning_rate": 0.0002995145009444192, + "loss": 6.3938, + "step": 3596 + }, + { + "epoch": 0.33563497247364, + "grad_norm": 1.5284763108411432, + "learning_rate": 0.00029951389268793464, + "loss": 6.2899, + "step": 3597 + }, + { + "epoch": 0.3357282821685173, + "grad_norm": 1.956482473713604, + "learning_rate": 0.00029951328405128055, + "loss": 6.1704, + "step": 3598 + }, + { + "epoch": 0.3358215918633946, + "grad_norm": 2.588434899414739, + "learning_rate": 0.00029951267503445844, + "loss": 6.397, + "step": 3599 + }, + { + "epoch": 0.3359149015582719, + "grad_norm": 1.7608118201853638, + "learning_rate": 0.0002995120656374699, + "loss": 6.1602, + "step": 3600 + }, + { + "epoch": 0.3360082112531492, + "grad_norm": 1.2698120344843016, + "learning_rate": 0.0002995114558603165, + "loss": 6.4099, + "step": 3601 + }, + { + "epoch": 0.3361015209480265, + "grad_norm": 1.4246124485596576, + "learning_rate": 0.00029951084570299967, + "loss": 6.1449, + "step": 3602 + }, + { + "epoch": 0.3361948306429038, + "grad_norm": 1.63662523718457, + "learning_rate": 0.0002995102351655211, + "loss": 6.5478, + "step": 3603 + }, + { + "epoch": 0.3362881403377811, + "grad_norm": 1.7184275281530819, + "learning_rate": 0.0002995096242478823, + "loss": 6.7524, + "step": 3604 + }, + { + "epoch": 0.3363814500326584, + "grad_norm": 2.3443907787311655, + "learning_rate": 0.00029950901295008474, + "loss": 6.3566, + "step": 3605 + }, + { + "epoch": 0.3364747597275357, + "grad_norm": 1.2596696961122673, + "learning_rate": 0.00029950840127213014, + "loss": 6.2511, + "step": 3606 + }, + { + "epoch": 0.33656806942241296, + "grad_norm": 1.2221756126333243, + "learning_rate": 0.00029950778921401995, + "loss": 6.2674, + "step": 3607 + }, + { + "epoch": 0.33666137911729027, + "grad_norm": 1.2534062509585007, + "learning_rate": 0.0002995071767757557, + "loss": 6.1684, + "step": 3608 + }, + { + "epoch": 0.3367546888121676, + "grad_norm": 1.1745833592819008, + "learning_rate": 0.000299506563957339, + "loss": 6.5705, + "step": 3609 + }, + { + "epoch": 0.3368479985070449, + "grad_norm": 1.1082998327225466, + "learning_rate": 0.0002995059507587714, + "loss": 6.4601, + "step": 3610 + }, + { + "epoch": 0.3369413082019222, + "grad_norm": 1.123722952185218, + "learning_rate": 0.0002995053371800545, + "loss": 6.4138, + "step": 3611 + }, + { + "epoch": 0.3370346178967995, + "grad_norm": 1.3780846241596607, + "learning_rate": 0.0002995047232211897, + "loss": 6.2673, + "step": 3612 + }, + { + "epoch": 0.33712792759167676, + "grad_norm": 1.1665047376150446, + "learning_rate": 0.00029950410888217874, + "loss": 6.1956, + "step": 3613 + }, + { + "epoch": 0.33722123728655407, + "grad_norm": 1.5910792413122656, + "learning_rate": 0.0002995034941630232, + "loss": 6.3291, + "step": 3614 + }, + { + "epoch": 0.3373145469814314, + "grad_norm": 1.512305866943639, + "learning_rate": 0.0002995028790637244, + "loss": 6.1821, + "step": 3615 + }, + { + "epoch": 0.3374078566763087, + "grad_norm": 1.2017375331584066, + "learning_rate": 0.00029950226358428417, + "loss": 5.8213, + "step": 3616 + }, + { + "epoch": 0.337501166371186, + "grad_norm": 2.58007596700557, + "learning_rate": 0.000299501647724704, + "loss": 5.8047, + "step": 3617 + }, + { + "epoch": 0.33759447606606324, + "grad_norm": 1.267579815341795, + "learning_rate": 0.00029950103148498533, + "loss": 6.2909, + "step": 3618 + }, + { + "epoch": 0.33768778576094055, + "grad_norm": 1.9045445633866858, + "learning_rate": 0.0002995004148651299, + "loss": 6.4266, + "step": 3619 + }, + { + "epoch": 0.33778109545581786, + "grad_norm": 2.092970874864839, + "learning_rate": 0.00029949979786513914, + "loss": 6.202, + "step": 3620 + }, + { + "epoch": 0.33787440515069517, + "grad_norm": 0.9750615624529092, + "learning_rate": 0.0002994991804850147, + "loss": 6.2871, + "step": 3621 + }, + { + "epoch": 0.3379677148455725, + "grad_norm": 2.237494971256988, + "learning_rate": 0.00029949856272475816, + "loss": 6.7222, + "step": 3622 + }, + { + "epoch": 0.33806102454044973, + "grad_norm": 1.250064087958693, + "learning_rate": 0.000299497944584371, + "loss": 5.7118, + "step": 3623 + }, + { + "epoch": 0.33815433423532704, + "grad_norm": 1.3040162755113067, + "learning_rate": 0.00029949732606385487, + "loss": 6.1786, + "step": 3624 + }, + { + "epoch": 0.33824764393020434, + "grad_norm": 1.3509318498164236, + "learning_rate": 0.00029949670716321135, + "loss": 5.8392, + "step": 3625 + }, + { + "epoch": 0.33834095362508165, + "grad_norm": 1.8079574432123375, + "learning_rate": 0.00029949608788244194, + "loss": 6.1394, + "step": 3626 + }, + { + "epoch": 0.33843426331995896, + "grad_norm": 2.219182060554689, + "learning_rate": 0.00029949546822154833, + "loss": 5.9088, + "step": 3627 + }, + { + "epoch": 0.3385275730148362, + "grad_norm": 1.552389861751933, + "learning_rate": 0.000299494848180532, + "loss": 6.5394, + "step": 3628 + }, + { + "epoch": 0.3386208827097135, + "grad_norm": 1.28149133913818, + "learning_rate": 0.00029949422775939454, + "loss": 6.4737, + "step": 3629 + }, + { + "epoch": 0.33871419240459083, + "grad_norm": 1.2763062240816037, + "learning_rate": 0.0002994936069581375, + "loss": 6.4882, + "step": 3630 + }, + { + "epoch": 0.33880750209946814, + "grad_norm": 2.716070557054848, + "learning_rate": 0.00029949298577676257, + "loss": 5.8841, + "step": 3631 + }, + { + "epoch": 0.33890081179434545, + "grad_norm": 1.3413406717480105, + "learning_rate": 0.0002994923642152713, + "loss": 6.4083, + "step": 3632 + }, + { + "epoch": 0.33899412148922276, + "grad_norm": 1.8388900832288935, + "learning_rate": 0.00029949174227366515, + "loss": 6.4142, + "step": 3633 + }, + { + "epoch": 0.3390874311841, + "grad_norm": 1.8039653954485049, + "learning_rate": 0.0002994911199519458, + "loss": 6.3307, + "step": 3634 + }, + { + "epoch": 0.3391807408789773, + "grad_norm": 1.1973095357647539, + "learning_rate": 0.00029949049725011486, + "loss": 6.2163, + "step": 3635 + }, + { + "epoch": 0.3392740505738546, + "grad_norm": 1.2690554416269901, + "learning_rate": 0.0002994898741681738, + "loss": 6.2205, + "step": 3636 + }, + { + "epoch": 0.33936736026873193, + "grad_norm": 1.3203530146029674, + "learning_rate": 0.00029948925070612427, + "loss": 6.3744, + "step": 3637 + }, + { + "epoch": 0.33946066996360924, + "grad_norm": 1.3558988428740655, + "learning_rate": 0.0002994886268639679, + "loss": 5.9992, + "step": 3638 + }, + { + "epoch": 0.3395539796584865, + "grad_norm": 1.4220914355778813, + "learning_rate": 0.0002994880026417063, + "loss": 6.1348, + "step": 3639 + }, + { + "epoch": 0.3396472893533638, + "grad_norm": 1.3737558911772163, + "learning_rate": 0.00029948737803934085, + "loss": 6.4248, + "step": 3640 + }, + { + "epoch": 0.3397405990482411, + "grad_norm": 1.0918871997954265, + "learning_rate": 0.0002994867530568734, + "loss": 6.331, + "step": 3641 + }, + { + "epoch": 0.3398339087431184, + "grad_norm": 1.101525475942415, + "learning_rate": 0.00029948612769430534, + "loss": 5.9717, + "step": 3642 + }, + { + "epoch": 0.3399272184379957, + "grad_norm": 1.2793925779030442, + "learning_rate": 0.0002994855019516383, + "loss": 6.3854, + "step": 3643 + }, + { + "epoch": 0.340020528132873, + "grad_norm": 1.6560922668639206, + "learning_rate": 0.00029948487582887403, + "loss": 6.4712, + "step": 3644 + }, + { + "epoch": 0.3401138378277503, + "grad_norm": 1.275405712366622, + "learning_rate": 0.0002994842493260139, + "loss": 6.0711, + "step": 3645 + }, + { + "epoch": 0.3402071475226276, + "grad_norm": 1.2736238242401783, + "learning_rate": 0.0002994836224430597, + "loss": 5.5915, + "step": 3646 + }, + { + "epoch": 0.3403004572175049, + "grad_norm": 1.9039303581536533, + "learning_rate": 0.00029948299518001284, + "loss": 6.2512, + "step": 3647 + }, + { + "epoch": 0.3403937669123822, + "grad_norm": 1.3678331953918583, + "learning_rate": 0.00029948236753687507, + "loss": 6.0398, + "step": 3648 + }, + { + "epoch": 0.3404870766072595, + "grad_norm": 1.4322457798610324, + "learning_rate": 0.00029948173951364784, + "loss": 6.3146, + "step": 3649 + }, + { + "epoch": 0.3405803863021368, + "grad_norm": 1.8278708656312228, + "learning_rate": 0.0002994811111103329, + "loss": 6.5797, + "step": 3650 + }, + { + "epoch": 0.3406736959970141, + "grad_norm": 1.792117639866391, + "learning_rate": 0.00029948048232693173, + "loss": 6.3304, + "step": 3651 + }, + { + "epoch": 0.3407670056918914, + "grad_norm": 1.3966444562341487, + "learning_rate": 0.000299479853163446, + "loss": 6.1506, + "step": 3652 + }, + { + "epoch": 0.3408603153867687, + "grad_norm": 1.6267739596563529, + "learning_rate": 0.00029947922361987724, + "loss": 6.2568, + "step": 3653 + }, + { + "epoch": 0.340953625081646, + "grad_norm": 1.7702906782339523, + "learning_rate": 0.00029947859369622716, + "loss": 6.3378, + "step": 3654 + }, + { + "epoch": 0.34104693477652326, + "grad_norm": 1.7947089554690114, + "learning_rate": 0.00029947796339249727, + "loss": 5.8714, + "step": 3655 + }, + { + "epoch": 0.34114024447140057, + "grad_norm": 1.131446215634353, + "learning_rate": 0.00029947733270868914, + "loss": 5.9742, + "step": 3656 + }, + { + "epoch": 0.3412335541662779, + "grad_norm": 3.671187006761232, + "learning_rate": 0.0002994767016448045, + "loss": 6.2337, + "step": 3657 + }, + { + "epoch": 0.3413268638611552, + "grad_norm": 1.3097316345718866, + "learning_rate": 0.0002994760702008449, + "loss": 6.213, + "step": 3658 + }, + { + "epoch": 0.3414201735560325, + "grad_norm": 1.5752569965415333, + "learning_rate": 0.00029947543837681185, + "loss": 6.0563, + "step": 3659 + }, + { + "epoch": 0.34151348325090974, + "grad_norm": 1.3539080819663911, + "learning_rate": 0.0002994748061727071, + "loss": 6.5656, + "step": 3660 + }, + { + "epoch": 0.34160679294578705, + "grad_norm": 1.3024607078489197, + "learning_rate": 0.00029947417358853215, + "loss": 6.4852, + "step": 3661 + }, + { + "epoch": 0.34170010264066436, + "grad_norm": 1.2680442256874425, + "learning_rate": 0.00029947354062428874, + "loss": 6.2241, + "step": 3662 + }, + { + "epoch": 0.34179341233554167, + "grad_norm": 1.4319438790191175, + "learning_rate": 0.0002994729072799783, + "loss": 5.8418, + "step": 3663 + }, + { + "epoch": 0.341886722030419, + "grad_norm": 1.363401645277857, + "learning_rate": 0.00029947227355560255, + "loss": 6.1964, + "step": 3664 + }, + { + "epoch": 0.3419800317252963, + "grad_norm": 1.120582863067763, + "learning_rate": 0.00029947163945116313, + "loss": 6.3693, + "step": 3665 + }, + { + "epoch": 0.34207334142017354, + "grad_norm": 1.2835710841018058, + "learning_rate": 0.00029947100496666157, + "loss": 5.9437, + "step": 3666 + }, + { + "epoch": 0.34216665111505085, + "grad_norm": 2.252524616088902, + "learning_rate": 0.00029947037010209955, + "loss": 6.0446, + "step": 3667 + }, + { + "epoch": 0.34225996080992815, + "grad_norm": 1.1669872829854488, + "learning_rate": 0.00029946973485747865, + "loss": 6.1428, + "step": 3668 + }, + { + "epoch": 0.34235327050480546, + "grad_norm": 1.3037023914053636, + "learning_rate": 0.0002994690992328005, + "loss": 6.0718, + "step": 3669 + }, + { + "epoch": 0.34244658019968277, + "grad_norm": 1.5035380147250241, + "learning_rate": 0.00029946846322806664, + "loss": 6.4368, + "step": 3670 + }, + { + "epoch": 0.34253988989456, + "grad_norm": 1.5179351106190924, + "learning_rate": 0.00029946782684327883, + "loss": 5.9845, + "step": 3671 + }, + { + "epoch": 0.34263319958943733, + "grad_norm": 1.1887033293232498, + "learning_rate": 0.00029946719007843854, + "loss": 6.1513, + "step": 3672 + }, + { + "epoch": 0.34272650928431464, + "grad_norm": 1.2242962667308308, + "learning_rate": 0.00029946655293354755, + "loss": 6.2634, + "step": 3673 + }, + { + "epoch": 0.34281981897919195, + "grad_norm": 1.168145552084383, + "learning_rate": 0.0002994659154086073, + "loss": 5.7156, + "step": 3674 + }, + { + "epoch": 0.34291312867406926, + "grad_norm": 1.4358370338785056, + "learning_rate": 0.0002994652775036196, + "loss": 6.3384, + "step": 3675 + }, + { + "epoch": 0.3430064383689465, + "grad_norm": 1.7516564154581262, + "learning_rate": 0.00029946463921858593, + "loss": 6.4916, + "step": 3676 + }, + { + "epoch": 0.3430997480638238, + "grad_norm": 1.3856416482470537, + "learning_rate": 0.0002994640005535079, + "loss": 6.1277, + "step": 3677 + }, + { + "epoch": 0.3431930577587011, + "grad_norm": 1.218819152290107, + "learning_rate": 0.0002994633615083873, + "loss": 5.5024, + "step": 3678 + }, + { + "epoch": 0.34328636745357843, + "grad_norm": 2.188655576315645, + "learning_rate": 0.0002994627220832256, + "loss": 6.2969, + "step": 3679 + }, + { + "epoch": 0.34337967714845574, + "grad_norm": 1.869920060157139, + "learning_rate": 0.0002994620822780244, + "loss": 5.9907, + "step": 3680 + }, + { + "epoch": 0.343472986843333, + "grad_norm": 1.4336397166028945, + "learning_rate": 0.0002994614420927855, + "loss": 6.1077, + "step": 3681 + }, + { + "epoch": 0.3435662965382103, + "grad_norm": 1.236772065403358, + "learning_rate": 0.0002994608015275103, + "loss": 6.555, + "step": 3682 + }, + { + "epoch": 0.3436596062330876, + "grad_norm": 1.323619359111022, + "learning_rate": 0.0002994601605822007, + "loss": 6.1085, + "step": 3683 + }, + { + "epoch": 0.3437529159279649, + "grad_norm": 1.4292304748996643, + "learning_rate": 0.0002994595192568581, + "loss": 6.2603, + "step": 3684 + }, + { + "epoch": 0.3438462256228422, + "grad_norm": 1.060976152975432, + "learning_rate": 0.00029945887755148423, + "loss": 5.7914, + "step": 3685 + }, + { + "epoch": 0.34393953531771954, + "grad_norm": 1.50268121887699, + "learning_rate": 0.0002994582354660807, + "loss": 6.3157, + "step": 3686 + }, + { + "epoch": 0.3440328450125968, + "grad_norm": 1.9585656860730045, + "learning_rate": 0.0002994575930006492, + "loss": 6.0418, + "step": 3687 + }, + { + "epoch": 0.3441261547074741, + "grad_norm": 1.1442624700398116, + "learning_rate": 0.00029945695015519126, + "loss": 6.0899, + "step": 3688 + }, + { + "epoch": 0.3442194644023514, + "grad_norm": 1.263852733932567, + "learning_rate": 0.0002994563069297086, + "loss": 6.3062, + "step": 3689 + }, + { + "epoch": 0.3443127740972287, + "grad_norm": 1.1328149244097627, + "learning_rate": 0.0002994556633242028, + "loss": 6.0374, + "step": 3690 + }, + { + "epoch": 0.344406083792106, + "grad_norm": 1.1139009695283215, + "learning_rate": 0.0002994550193386756, + "loss": 6.3584, + "step": 3691 + }, + { + "epoch": 0.3444993934869833, + "grad_norm": 1.002330609785436, + "learning_rate": 0.0002994543749731285, + "loss": 6.2695, + "step": 3692 + }, + { + "epoch": 0.3445927031818606, + "grad_norm": 2.533563417104384, + "learning_rate": 0.00029945373022756313, + "loss": 5.5776, + "step": 3693 + }, + { + "epoch": 0.3446860128767379, + "grad_norm": 1.6583166504998756, + "learning_rate": 0.00029945308510198127, + "loss": 5.8122, + "step": 3694 + }, + { + "epoch": 0.3447793225716152, + "grad_norm": 1.5424000979330545, + "learning_rate": 0.0002994524395963845, + "loss": 6.2101, + "step": 3695 + }, + { + "epoch": 0.3448726322664925, + "grad_norm": 2.4048976355592284, + "learning_rate": 0.00029945179371077435, + "loss": 5.1326, + "step": 3696 + }, + { + "epoch": 0.34496594196136976, + "grad_norm": 2.010043392575825, + "learning_rate": 0.00029945114744515266, + "loss": 6.3313, + "step": 3697 + }, + { + "epoch": 0.34505925165624707, + "grad_norm": 1.2324971637655189, + "learning_rate": 0.0002994505007995209, + "loss": 5.894, + "step": 3698 + }, + { + "epoch": 0.3451525613511244, + "grad_norm": 1.6726074575854823, + "learning_rate": 0.00029944985377388083, + "loss": 6.1976, + "step": 3699 + }, + { + "epoch": 0.3452458710460017, + "grad_norm": 1.1139986427189725, + "learning_rate": 0.00029944920636823405, + "loss": 6.5188, + "step": 3700 + }, + { + "epoch": 0.345339180740879, + "grad_norm": 1.2670588807729877, + "learning_rate": 0.0002994485585825822, + "loss": 5.506, + "step": 3701 + }, + { + "epoch": 0.3454324904357563, + "grad_norm": 1.1174152692915347, + "learning_rate": 0.00029944791041692695, + "loss": 6.0911, + "step": 3702 + }, + { + "epoch": 0.34552580013063355, + "grad_norm": 2.9913638873637165, + "learning_rate": 0.00029944726187126994, + "loss": 6.2965, + "step": 3703 + }, + { + "epoch": 0.34561910982551086, + "grad_norm": 1.9462174234366096, + "learning_rate": 0.0002994466129456128, + "loss": 6.1438, + "step": 3704 + }, + { + "epoch": 0.34571241952038817, + "grad_norm": 1.7966431845612005, + "learning_rate": 0.0002994459636399572, + "loss": 6.1885, + "step": 3705 + }, + { + "epoch": 0.3458057292152655, + "grad_norm": 1.2608076412209954, + "learning_rate": 0.00029944531395430474, + "loss": 6.4386, + "step": 3706 + }, + { + "epoch": 0.3458990389101428, + "grad_norm": 1.6135663954898682, + "learning_rate": 0.00029944466388865716, + "loss": 5.3089, + "step": 3707 + }, + { + "epoch": 0.34599234860502004, + "grad_norm": 1.1511710801134791, + "learning_rate": 0.00029944401344301607, + "loss": 6.0952, + "step": 3708 + }, + { + "epoch": 0.34608565829989735, + "grad_norm": 1.0729873885357928, + "learning_rate": 0.0002994433626173831, + "loss": 6.0792, + "step": 3709 + }, + { + "epoch": 0.34617896799477466, + "grad_norm": 1.2310555235962826, + "learning_rate": 0.00029944271141175993, + "loss": 5.7646, + "step": 3710 + }, + { + "epoch": 0.34627227768965196, + "grad_norm": 1.6413117314447512, + "learning_rate": 0.00029944205982614824, + "loss": 6.0657, + "step": 3711 + }, + { + "epoch": 0.34636558738452927, + "grad_norm": 1.0471975028172094, + "learning_rate": 0.00029944140786054965, + "loss": 6.282, + "step": 3712 + }, + { + "epoch": 0.3464588970794065, + "grad_norm": 1.2438554972623286, + "learning_rate": 0.0002994407555149658, + "loss": 6.1812, + "step": 3713 + }, + { + "epoch": 0.34655220677428383, + "grad_norm": 1.0899527804474751, + "learning_rate": 0.00029944010278939835, + "loss": 6.4208, + "step": 3714 + }, + { + "epoch": 0.34664551646916114, + "grad_norm": 1.225986634223154, + "learning_rate": 0.00029943944968384905, + "loss": 6.2688, + "step": 3715 + }, + { + "epoch": 0.34673882616403845, + "grad_norm": 1.63733537701341, + "learning_rate": 0.0002994387961983195, + "loss": 6.2694, + "step": 3716 + }, + { + "epoch": 0.34683213585891576, + "grad_norm": 1.169214609523902, + "learning_rate": 0.00029943814233281135, + "loss": 6.1003, + "step": 3717 + }, + { + "epoch": 0.34692544555379307, + "grad_norm": 1.7301862704254127, + "learning_rate": 0.00029943748808732625, + "loss": 6.4393, + "step": 3718 + }, + { + "epoch": 0.3470187552486703, + "grad_norm": 1.2455619709320094, + "learning_rate": 0.00029943683346186586, + "loss": 6.1946, + "step": 3719 + }, + { + "epoch": 0.3471120649435476, + "grad_norm": 0.9771054572801475, + "learning_rate": 0.0002994361784564319, + "loss": 6.2009, + "step": 3720 + }, + { + "epoch": 0.34720537463842494, + "grad_norm": 1.2101709855827532, + "learning_rate": 0.000299435523071026, + "loss": 6.3179, + "step": 3721 + }, + { + "epoch": 0.34729868433330224, + "grad_norm": 1.2422997900253983, + "learning_rate": 0.00029943486730564983, + "loss": 5.8587, + "step": 3722 + }, + { + "epoch": 0.34739199402817955, + "grad_norm": 2.7444989474449284, + "learning_rate": 0.00029943421116030507, + "loss": 5.7712, + "step": 3723 + }, + { + "epoch": 0.3474853037230568, + "grad_norm": 2.7490535349748084, + "learning_rate": 0.0002994335546349934, + "loss": 6.0154, + "step": 3724 + }, + { + "epoch": 0.3475786134179341, + "grad_norm": 1.0198006394641708, + "learning_rate": 0.00029943289772971644, + "loss": 6.2786, + "step": 3725 + }, + { + "epoch": 0.3476719231128114, + "grad_norm": 1.2301819293559735, + "learning_rate": 0.00029943224044447583, + "loss": 5.8915, + "step": 3726 + }, + { + "epoch": 0.34776523280768873, + "grad_norm": 1.2083693908812008, + "learning_rate": 0.00029943158277927337, + "loss": 6.3405, + "step": 3727 + }, + { + "epoch": 0.34785854250256604, + "grad_norm": 1.3354953817894157, + "learning_rate": 0.0002994309247341107, + "loss": 5.877, + "step": 3728 + }, + { + "epoch": 0.3479518521974433, + "grad_norm": 1.4897578915770386, + "learning_rate": 0.00029943026630898934, + "loss": 6.212, + "step": 3729 + }, + { + "epoch": 0.3480451618923206, + "grad_norm": 2.1115688464995377, + "learning_rate": 0.00029942960750391114, + "loss": 6.412, + "step": 3730 + }, + { + "epoch": 0.3481384715871979, + "grad_norm": 1.651068521117459, + "learning_rate": 0.00029942894831887766, + "loss": 6.3349, + "step": 3731 + }, + { + "epoch": 0.3482317812820752, + "grad_norm": 1.1959835886288155, + "learning_rate": 0.00029942828875389066, + "loss": 6.4095, + "step": 3732 + }, + { + "epoch": 0.3483250909769525, + "grad_norm": 1.5004979025536076, + "learning_rate": 0.0002994276288089518, + "loss": 6.3931, + "step": 3733 + }, + { + "epoch": 0.3484184006718298, + "grad_norm": 1.434712804100788, + "learning_rate": 0.00029942696848406274, + "loss": 6.0161, + "step": 3734 + }, + { + "epoch": 0.3485117103667071, + "grad_norm": 1.230368449054946, + "learning_rate": 0.00029942630777922516, + "loss": 6.412, + "step": 3735 + }, + { + "epoch": 0.3486050200615844, + "grad_norm": 1.3460907049406134, + "learning_rate": 0.0002994256466944408, + "loss": 5.9669, + "step": 3736 + }, + { + "epoch": 0.3486983297564617, + "grad_norm": 1.1697169547119308, + "learning_rate": 0.0002994249852297112, + "loss": 5.834, + "step": 3737 + }, + { + "epoch": 0.348791639451339, + "grad_norm": 1.133282593655985, + "learning_rate": 0.00029942432338503816, + "loss": 5.9113, + "step": 3738 + }, + { + "epoch": 0.3488849491462163, + "grad_norm": 1.2786431624228778, + "learning_rate": 0.00029942366116042334, + "loss": 6.1292, + "step": 3739 + }, + { + "epoch": 0.34897825884109357, + "grad_norm": 15.78837731511928, + "learning_rate": 0.0002994229985558684, + "loss": 6.4718, + "step": 3740 + }, + { + "epoch": 0.3490715685359709, + "grad_norm": 49.308246304635645, + "learning_rate": 0.00029942233557137503, + "loss": 6.3446, + "step": 3741 + }, + { + "epoch": 0.3491648782308482, + "grad_norm": 1.7405549430797596, + "learning_rate": 0.0002994216722069449, + "loss": 6.3904, + "step": 3742 + }, + { + "epoch": 0.3492581879257255, + "grad_norm": 1.87075312238632, + "learning_rate": 0.0002994210084625797, + "loss": 6.4114, + "step": 3743 + }, + { + "epoch": 0.3493514976206028, + "grad_norm": 3.130853047974628, + "learning_rate": 0.00029942034433828125, + "loss": 6.61, + "step": 3744 + }, + { + "epoch": 0.34944480731548005, + "grad_norm": 1.9783609877374446, + "learning_rate": 0.000299419679834051, + "loss": 6.4244, + "step": 3745 + }, + { + "epoch": 0.34953811701035736, + "grad_norm": 2.6266695818914374, + "learning_rate": 0.00029941901494989087, + "loss": 6.5829, + "step": 3746 + }, + { + "epoch": 0.34963142670523467, + "grad_norm": 2.3161253985462364, + "learning_rate": 0.0002994183496858024, + "loss": 6.6561, + "step": 3747 + }, + { + "epoch": 0.349724736400112, + "grad_norm": 1.4726298441420254, + "learning_rate": 0.0002994176840417873, + "loss": 6.62, + "step": 3748 + }, + { + "epoch": 0.3498180460949893, + "grad_norm": 96.65737473554957, + "learning_rate": 0.0002994170180178473, + "loss": 6.3086, + "step": 3749 + }, + { + "epoch": 0.34991135578986654, + "grad_norm": 13.346807202175164, + "learning_rate": 0.0002994163516139841, + "loss": 6.45, + "step": 3750 + }, + { + "epoch": 0.35000466548474385, + "grad_norm": 1.7266307763655122, + "learning_rate": 0.0002994156848301994, + "loss": 6.3829, + "step": 3751 + }, + { + "epoch": 0.35009797517962116, + "grad_norm": 6.468403531226608, + "learning_rate": 0.0002994150176664949, + "loss": 6.4215, + "step": 3752 + }, + { + "epoch": 0.35019128487449847, + "grad_norm": 17.66412791522354, + "learning_rate": 0.00029941435012287217, + "loss": 6.7605, + "step": 3753 + }, + { + "epoch": 0.3502845945693758, + "grad_norm": 2.3459995624923193, + "learning_rate": 0.0002994136821993331, + "loss": 6.4717, + "step": 3754 + }, + { + "epoch": 0.3503779042642531, + "grad_norm": 1897.3381034236404, + "learning_rate": 0.00029941301389587924, + "loss": 6.9013, + "step": 3755 + }, + { + "epoch": 0.35047121395913033, + "grad_norm": 4.230862740214558, + "learning_rate": 0.00029941234521251237, + "loss": 7.0384, + "step": 3756 + }, + { + "epoch": 0.35056452365400764, + "grad_norm": 6.558498014724198, + "learning_rate": 0.0002994116761492341, + "loss": 7.5364, + "step": 3757 + }, + { + "epoch": 0.35065783334888495, + "grad_norm": 8.21523097601501, + "learning_rate": 0.0002994110067060463, + "loss": 7.3887, + "step": 3758 + }, + { + "epoch": 0.35075114304376226, + "grad_norm": 5.215587174537649, + "learning_rate": 0.0002994103368829505, + "loss": 7.1036, + "step": 3759 + }, + { + "epoch": 0.35084445273863957, + "grad_norm": 4.334123825730367, + "learning_rate": 0.0002994096666799485, + "loss": 7.3884, + "step": 3760 + }, + { + "epoch": 0.3509377624335168, + "grad_norm": 4.214390591446475, + "learning_rate": 0.00029940899609704196, + "loss": 7.4023, + "step": 3761 + }, + { + "epoch": 0.35103107212839413, + "grad_norm": 193585.7229709136, + "learning_rate": 0.0002994083251342326, + "loss": 7.35, + "step": 3762 + }, + { + "epoch": 0.35112438182327144, + "grad_norm": 7.255614724949423, + "learning_rate": 0.00029940765379152213, + "loss": 7.4859, + "step": 3763 + }, + { + "epoch": 0.35121769151814874, + "grad_norm": 7.53356961371791, + "learning_rate": 0.00029940698206891225, + "loss": 7.4487, + "step": 3764 + }, + { + "epoch": 0.35131100121302605, + "grad_norm": 5.020291748111419, + "learning_rate": 0.0002994063099664047, + "loss": 7.1627, + "step": 3765 + }, + { + "epoch": 0.3514043109079033, + "grad_norm": 3.295890376425153, + "learning_rate": 0.00029940563748400117, + "loss": 7.5433, + "step": 3766 + }, + { + "epoch": 0.3514976206027806, + "grad_norm": 3.194948455404182, + "learning_rate": 0.00029940496462170334, + "loss": 7.5086, + "step": 3767 + }, + { + "epoch": 0.3515909302976579, + "grad_norm": 5.93819539966494, + "learning_rate": 0.00029940429137951293, + "loss": 7.4985, + "step": 3768 + }, + { + "epoch": 0.35168423999253523, + "grad_norm": 5.991155308795401, + "learning_rate": 0.0002994036177574317, + "loss": 7.0991, + "step": 3769 + }, + { + "epoch": 0.35177754968741254, + "grad_norm": 4.907273117677635, + "learning_rate": 0.0002994029437554613, + "loss": 7.5499, + "step": 3770 + }, + { + "epoch": 0.35187085938228985, + "grad_norm": 3.12943364119155, + "learning_rate": 0.0002994022693736035, + "loss": 7.3576, + "step": 3771 + }, + { + "epoch": 0.3519641690771671, + "grad_norm": 5.114800907432513, + "learning_rate": 0.00029940159461185994, + "loss": 7.7054, + "step": 3772 + }, + { + "epoch": 0.3520574787720444, + "grad_norm": 5.700230887289347, + "learning_rate": 0.00029940091947023233, + "loss": 7.5042, + "step": 3773 + }, + { + "epoch": 0.3521507884669217, + "grad_norm": 2.3296997273970046, + "learning_rate": 0.0002994002439487225, + "loss": 7.3935, + "step": 3774 + }, + { + "epoch": 0.352244098161799, + "grad_norm": 3.0685091948500554, + "learning_rate": 0.00029939956804733215, + "loss": 7.3746, + "step": 3775 + }, + { + "epoch": 0.35233740785667633, + "grad_norm": 3.497890264770341, + "learning_rate": 0.0002993988917660629, + "loss": 7.3387, + "step": 3776 + }, + { + "epoch": 0.3524307175515536, + "grad_norm": 4.331317012470052, + "learning_rate": 0.00029939821510491655, + "loss": 7.3293, + "step": 3777 + }, + { + "epoch": 0.3525240272464309, + "grad_norm": 3.488443586654214, + "learning_rate": 0.0002993975380638948, + "loss": 7.769, + "step": 3778 + }, + { + "epoch": 0.3526173369413082, + "grad_norm": 5.472769616891022, + "learning_rate": 0.0002993968606429993, + "loss": 7.772, + "step": 3779 + }, + { + "epoch": 0.3527106466361855, + "grad_norm": 4.231832352817065, + "learning_rate": 0.0002993961828422319, + "loss": 7.4214, + "step": 3780 + }, + { + "epoch": 0.3528039563310628, + "grad_norm": 6.037749727398716, + "learning_rate": 0.00029939550466159424, + "loss": 7.7053, + "step": 3781 + }, + { + "epoch": 0.35289726602594007, + "grad_norm": 2.7343442498083523, + "learning_rate": 0.00029939482610108806, + "loss": 7.3636, + "step": 3782 + }, + { + "epoch": 0.3529905757208174, + "grad_norm": 2.08800554001087, + "learning_rate": 0.0002993941471607151, + "loss": 7.6041, + "step": 3783 + }, + { + "epoch": 0.3530838854156947, + "grad_norm": 2.795261993036617, + "learning_rate": 0.0002993934678404771, + "loss": 7.1046, + "step": 3784 + }, + { + "epoch": 0.353177195110572, + "grad_norm": 4.063343261602866, + "learning_rate": 0.00029939278814037577, + "loss": 7.527, + "step": 3785 + }, + { + "epoch": 0.3532705048054493, + "grad_norm": 2.6218962656369174, + "learning_rate": 0.0002993921080604128, + "loss": 7.3757, + "step": 3786 + }, + { + "epoch": 0.35336381450032656, + "grad_norm": 2.26350464022497, + "learning_rate": 0.0002993914276005899, + "loss": 7.1136, + "step": 3787 + }, + { + "epoch": 0.35345712419520386, + "grad_norm": 1.894444709049515, + "learning_rate": 0.00029939074676090893, + "loss": 7.5745, + "step": 3788 + }, + { + "epoch": 0.3535504338900812, + "grad_norm": 3.681871627818342, + "learning_rate": 0.00029939006554137156, + "loss": 7.2114, + "step": 3789 + }, + { + "epoch": 0.3536437435849585, + "grad_norm": 1.7769018175078675, + "learning_rate": 0.00029938938394197945, + "loss": 7.2751, + "step": 3790 + }, + { + "epoch": 0.3537370532798358, + "grad_norm": 1.2086302590723565, + "learning_rate": 0.00029938870196273445, + "loss": 7.2153, + "step": 3791 + }, + { + "epoch": 0.3538303629747131, + "grad_norm": 3.9252947384195025, + "learning_rate": 0.00029938801960363814, + "loss": 7.0902, + "step": 3792 + }, + { + "epoch": 0.35392367266959035, + "grad_norm": 3.653534025471394, + "learning_rate": 0.0002993873368646924, + "loss": 7.3224, + "step": 3793 + }, + { + "epoch": 0.35401698236446766, + "grad_norm": 2.3267014972096844, + "learning_rate": 0.00029938665374589895, + "loss": 7.3743, + "step": 3794 + }, + { + "epoch": 0.35411029205934497, + "grad_norm": 3.1252743635137223, + "learning_rate": 0.0002993859702472595, + "loss": 7.4056, + "step": 3795 + }, + { + "epoch": 0.3542036017542223, + "grad_norm": 1.422998582162854, + "learning_rate": 0.0002993852863687757, + "loss": 7.2863, + "step": 3796 + }, + { + "epoch": 0.3542969114490996, + "grad_norm": 1.7404471070503202, + "learning_rate": 0.0002993846021104494, + "loss": 6.8106, + "step": 3797 + }, + { + "epoch": 0.35439022114397684, + "grad_norm": 1.653093824850358, + "learning_rate": 0.0002993839174722823, + "loss": 7.3937, + "step": 3798 + }, + { + "epoch": 0.35448353083885414, + "grad_norm": 1.3233823573200996, + "learning_rate": 0.00029938323245427617, + "loss": 7.2286, + "step": 3799 + }, + { + "epoch": 0.35457684053373145, + "grad_norm": 1.2115561803728185, + "learning_rate": 0.00029938254705643276, + "loss": 7.2364, + "step": 3800 + }, + { + "epoch": 0.35467015022860876, + "grad_norm": 3.297123072194155, + "learning_rate": 0.0002993818612787537, + "loss": 7.5866, + "step": 3801 + }, + { + "epoch": 0.35476345992348607, + "grad_norm": 1.2551895377186268, + "learning_rate": 0.0002993811751212409, + "loss": 7.139, + "step": 3802 + }, + { + "epoch": 0.3548567696183633, + "grad_norm": 1.8594759620566135, + "learning_rate": 0.00029938048858389597, + "loss": 7.1083, + "step": 3803 + }, + { + "epoch": 0.35495007931324063, + "grad_norm": 2.601081950284984, + "learning_rate": 0.0002993798016667207, + "loss": 7.3983, + "step": 3804 + }, + { + "epoch": 0.35504338900811794, + "grad_norm": 1.7595820123482009, + "learning_rate": 0.0002993791143697169, + "loss": 7.1201, + "step": 3805 + }, + { + "epoch": 0.35513669870299525, + "grad_norm": 1.7561817846470928, + "learning_rate": 0.0002993784266928862, + "loss": 7.3663, + "step": 3806 + }, + { + "epoch": 0.35523000839787255, + "grad_norm": 3.680991791610406, + "learning_rate": 0.00029937773863623044, + "loss": 7.414, + "step": 3807 + }, + { + "epoch": 0.35532331809274986, + "grad_norm": 1.21481363406263, + "learning_rate": 0.0002993770501997513, + "loss": 7.1898, + "step": 3808 + }, + { + "epoch": 0.3554166277876271, + "grad_norm": 4.29257927207879, + "learning_rate": 0.00029937636138345063, + "loss": 7.134, + "step": 3809 + }, + { + "epoch": 0.3555099374825044, + "grad_norm": 3.0556825725267704, + "learning_rate": 0.00029937567218733005, + "loss": 7.1184, + "step": 3810 + }, + { + "epoch": 0.35560324717738173, + "grad_norm": 1.6422400244429947, + "learning_rate": 0.00029937498261139143, + "loss": 7.0701, + "step": 3811 + }, + { + "epoch": 0.35569655687225904, + "grad_norm": 3.1811240336861455, + "learning_rate": 0.0002993742926556365, + "loss": 7.2563, + "step": 3812 + }, + { + "epoch": 0.35578986656713635, + "grad_norm": 22.36764011958173, + "learning_rate": 0.000299373602320067, + "loss": 7.4249, + "step": 3813 + }, + { + "epoch": 0.3558831762620136, + "grad_norm": 1.7040616669412447, + "learning_rate": 0.00029937291160468465, + "loss": 7.2702, + "step": 3814 + }, + { + "epoch": 0.3559764859568909, + "grad_norm": 1.5486437646941638, + "learning_rate": 0.0002993722205094912, + "loss": 7.3637, + "step": 3815 + }, + { + "epoch": 0.3560697956517682, + "grad_norm": 2.140994221581931, + "learning_rate": 0.0002993715290344885, + "loss": 7.5511, + "step": 3816 + }, + { + "epoch": 0.3561631053466455, + "grad_norm": 1.6613309519063333, + "learning_rate": 0.0002993708371796782, + "loss": 7.2691, + "step": 3817 + }, + { + "epoch": 0.35625641504152283, + "grad_norm": 1.642383322017843, + "learning_rate": 0.0002993701449450621, + "loss": 7.274, + "step": 3818 + }, + { + "epoch": 0.3563497247364001, + "grad_norm": 2.7861475413994885, + "learning_rate": 0.00029936945233064203, + "loss": 7.1094, + "step": 3819 + }, + { + "epoch": 0.3564430344312774, + "grad_norm": 3.740608967846815, + "learning_rate": 0.0002993687593364197, + "loss": 7.0473, + "step": 3820 + }, + { + "epoch": 0.3565363441261547, + "grad_norm": 2.122949722555117, + "learning_rate": 0.0002993680659623968, + "loss": 7.3386, + "step": 3821 + }, + { + "epoch": 0.356629653821032, + "grad_norm": 5.297311385539133, + "learning_rate": 0.00029936737220857514, + "loss": 7.1926, + "step": 3822 + }, + { + "epoch": 0.3567229635159093, + "grad_norm": 5.873777284451131, + "learning_rate": 0.00029936667807495656, + "loss": 7.0309, + "step": 3823 + }, + { + "epoch": 0.3568162732107866, + "grad_norm": 2.1825093263209814, + "learning_rate": 0.0002993659835615427, + "loss": 7.5533, + "step": 3824 + }, + { + "epoch": 0.3569095829056639, + "grad_norm": 1.5790348701411525, + "learning_rate": 0.00029936528866833544, + "loss": 7.2823, + "step": 3825 + }, + { + "epoch": 0.3570028926005412, + "grad_norm": 5.014647526107266, + "learning_rate": 0.0002993645933953365, + "loss": 7.6014, + "step": 3826 + }, + { + "epoch": 0.3570962022954185, + "grad_norm": 1.6970098334330752, + "learning_rate": 0.0002993638977425476, + "loss": 7.3181, + "step": 3827 + }, + { + "epoch": 0.3571895119902958, + "grad_norm": 2.7620839092364364, + "learning_rate": 0.0002993632017099706, + "loss": 7.3537, + "step": 3828 + }, + { + "epoch": 0.3572828216851731, + "grad_norm": 2.2929415377760907, + "learning_rate": 0.0002993625052976072, + "loss": 6.8289, + "step": 3829 + }, + { + "epoch": 0.35737613138005037, + "grad_norm": 3.9657839078229493, + "learning_rate": 0.0002993618085054592, + "loss": 7.1557, + "step": 3830 + }, + { + "epoch": 0.3574694410749277, + "grad_norm": 2.9054593094446455, + "learning_rate": 0.00029936111133352833, + "loss": 7.2657, + "step": 3831 + }, + { + "epoch": 0.357562750769805, + "grad_norm": 1.6361889279021848, + "learning_rate": 0.0002993604137818164, + "loss": 7.3156, + "step": 3832 + }, + { + "epoch": 0.3576560604646823, + "grad_norm": 0.9322910812355797, + "learning_rate": 0.0002993597158503252, + "loss": 7.3369, + "step": 3833 + }, + { + "epoch": 0.3577493701595596, + "grad_norm": 3.00841625171535, + "learning_rate": 0.0002993590175390565, + "loss": 7.0121, + "step": 3834 + }, + { + "epoch": 0.35784267985443685, + "grad_norm": 2.8677150900846504, + "learning_rate": 0.00029935831884801207, + "loss": 6.9954, + "step": 3835 + }, + { + "epoch": 0.35793598954931416, + "grad_norm": 1.2532213539583639, + "learning_rate": 0.00029935761977719364, + "loss": 7.22, + "step": 3836 + }, + { + "epoch": 0.35802929924419147, + "grad_norm": 0.884608443598875, + "learning_rate": 0.0002993569203266031, + "loss": 6.9065, + "step": 3837 + }, + { + "epoch": 0.3581226089390688, + "grad_norm": 1.668861087948258, + "learning_rate": 0.00029935622049624206, + "loss": 6.6363, + "step": 3838 + }, + { + "epoch": 0.3582159186339461, + "grad_norm": 1.720997938991013, + "learning_rate": 0.0002993555202861125, + "loss": 6.7627, + "step": 3839 + }, + { + "epoch": 0.35830922832882334, + "grad_norm": 2.1055075698697574, + "learning_rate": 0.00029935481969621595, + "loss": 7.0249, + "step": 3840 + }, + { + "epoch": 0.35840253802370065, + "grad_norm": 2.4155016283060617, + "learning_rate": 0.00029935411872655446, + "loss": 6.6692, + "step": 3841 + }, + { + "epoch": 0.35849584771857795, + "grad_norm": 2.6029496132719316, + "learning_rate": 0.00029935341737712963, + "loss": 7.3411, + "step": 3842 + }, + { + "epoch": 0.35858915741345526, + "grad_norm": 3.919995830961553, + "learning_rate": 0.0002993527156479433, + "loss": 7.0144, + "step": 3843 + }, + { + "epoch": 0.35868246710833257, + "grad_norm": 3.7868582665646295, + "learning_rate": 0.0002993520135389973, + "loss": 7.4944, + "step": 3844 + }, + { + "epoch": 0.3587757768032099, + "grad_norm": 3.4029602046126697, + "learning_rate": 0.0002993513110502933, + "loss": 7.042, + "step": 3845 + }, + { + "epoch": 0.35886908649808713, + "grad_norm": 4.012754488574656, + "learning_rate": 0.0002993506081818332, + "loss": 7.1563, + "step": 3846 + }, + { + "epoch": 0.35896239619296444, + "grad_norm": 3.2329637273881024, + "learning_rate": 0.0002993499049336188, + "loss": 7.0527, + "step": 3847 + }, + { + "epoch": 0.35905570588784175, + "grad_norm": 3.7093306368896894, + "learning_rate": 0.00029934920130565175, + "loss": 6.8497, + "step": 3848 + }, + { + "epoch": 0.35914901558271906, + "grad_norm": 2.4907413446805444, + "learning_rate": 0.0002993484972979339, + "loss": 6.7765, + "step": 3849 + }, + { + "epoch": 0.35924232527759636, + "grad_norm": 2.117378888866078, + "learning_rate": 0.0002993477929104672, + "loss": 7.0986, + "step": 3850 + }, + { + "epoch": 0.3593356349724736, + "grad_norm": 1.7364344417731594, + "learning_rate": 0.0002993470881432532, + "loss": 6.9422, + "step": 3851 + }, + { + "epoch": 0.3594289446673509, + "grad_norm": 6.231618404457278, + "learning_rate": 0.0002993463829962938, + "loss": 6.9622, + "step": 3852 + }, + { + "epoch": 0.35952225436222823, + "grad_norm": 2.9041479571330684, + "learning_rate": 0.00029934567746959075, + "loss": 7.0368, + "step": 3853 + }, + { + "epoch": 0.35961556405710554, + "grad_norm": 1.4053445771895405, + "learning_rate": 0.00029934497156314596, + "loss": 6.6059, + "step": 3854 + }, + { + "epoch": 0.35970887375198285, + "grad_norm": 1.2026553748746285, + "learning_rate": 0.0002993442652769611, + "loss": 6.6915, + "step": 3855 + }, + { + "epoch": 0.3598021834468601, + "grad_norm": 2.485281059223112, + "learning_rate": 0.00029934355861103805, + "loss": 7.1457, + "step": 3856 + }, + { + "epoch": 0.3598954931417374, + "grad_norm": 2.075981240768823, + "learning_rate": 0.00029934285156537856, + "loss": 6.8216, + "step": 3857 + }, + { + "epoch": 0.3599888028366147, + "grad_norm": 1.627356849045822, + "learning_rate": 0.0002993421441399844, + "loss": 6.9514, + "step": 3858 + }, + { + "epoch": 0.360082112531492, + "grad_norm": 1.2103884882223472, + "learning_rate": 0.0002993414363348574, + "loss": 6.574, + "step": 3859 + }, + { + "epoch": 0.36017542222636934, + "grad_norm": 2.2501661838582505, + "learning_rate": 0.00029934072814999943, + "loss": 6.9356, + "step": 3860 + }, + { + "epoch": 0.36026873192124664, + "grad_norm": 1.9868980267276306, + "learning_rate": 0.00029934001958541217, + "loss": 6.7701, + "step": 3861 + }, + { + "epoch": 0.3603620416161239, + "grad_norm": 1.444441136951385, + "learning_rate": 0.0002993393106410975, + "loss": 7.0985, + "step": 3862 + }, + { + "epoch": 0.3604553513110012, + "grad_norm": 1.2245782022650182, + "learning_rate": 0.0002993386013170572, + "loss": 6.8895, + "step": 3863 + }, + { + "epoch": 0.3605486610058785, + "grad_norm": 36.60892959452322, + "learning_rate": 0.00029933789161329307, + "loss": 7.1524, + "step": 3864 + }, + { + "epoch": 0.3606419707007558, + "grad_norm": 1.4960247812819734, + "learning_rate": 0.0002993371815298069, + "loss": 6.9858, + "step": 3865 + }, + { + "epoch": 0.36073528039563313, + "grad_norm": 2.888753152459878, + "learning_rate": 0.00029933647106660053, + "loss": 6.4341, + "step": 3866 + }, + { + "epoch": 0.3608285900905104, + "grad_norm": 1.7522597949661678, + "learning_rate": 0.00029933576022367575, + "loss": 6.7682, + "step": 3867 + }, + { + "epoch": 0.3609218997853877, + "grad_norm": 2.7364573757614603, + "learning_rate": 0.0002993350490010344, + "loss": 6.5725, + "step": 3868 + }, + { + "epoch": 0.361015209480265, + "grad_norm": 2.8949854635410643, + "learning_rate": 0.00029933433739867823, + "loss": 6.7136, + "step": 3869 + }, + { + "epoch": 0.3611085191751423, + "grad_norm": 0.904420279562007, + "learning_rate": 0.0002993336254166091, + "loss": 6.8103, + "step": 3870 + }, + { + "epoch": 0.3612018288700196, + "grad_norm": 2.223990948041122, + "learning_rate": 0.0002993329130548287, + "loss": 6.6438, + "step": 3871 + }, + { + "epoch": 0.36129513856489687, + "grad_norm": 2.7751771357575112, + "learning_rate": 0.0002993322003133391, + "loss": 6.4463, + "step": 3872 + }, + { + "epoch": 0.3613884482597742, + "grad_norm": 3.135696996233878, + "learning_rate": 0.00029933148719214184, + "loss": 6.6134, + "step": 3873 + }, + { + "epoch": 0.3614817579546515, + "grad_norm": 1.1473918267237058, + "learning_rate": 0.00029933077369123884, + "loss": 6.4627, + "step": 3874 + }, + { + "epoch": 0.3615750676495288, + "grad_norm": 1.381070097918491, + "learning_rate": 0.000299330059810632, + "loss": 6.8083, + "step": 3875 + }, + { + "epoch": 0.3616683773444061, + "grad_norm": 2.8178636793459892, + "learning_rate": 0.00029932934555032293, + "loss": 6.7931, + "step": 3876 + }, + { + "epoch": 0.3617616870392834, + "grad_norm": 4.537046640809028, + "learning_rate": 0.00029932863091031365, + "loss": 6.9558, + "step": 3877 + }, + { + "epoch": 0.36185499673416066, + "grad_norm": 1.673062163639331, + "learning_rate": 0.00029932791589060587, + "loss": 6.5382, + "step": 3878 + }, + { + "epoch": 0.36194830642903797, + "grad_norm": 1.800310164596211, + "learning_rate": 0.00029932720049120144, + "loss": 6.8605, + "step": 3879 + }, + { + "epoch": 0.3620416161239153, + "grad_norm": 0.948108559017535, + "learning_rate": 0.0002993264847121022, + "loss": 6.2018, + "step": 3880 + }, + { + "epoch": 0.3621349258187926, + "grad_norm": 1.9125581547393737, + "learning_rate": 0.00029932576855330994, + "loss": 6.6514, + "step": 3881 + }, + { + "epoch": 0.3622282355136699, + "grad_norm": 1.4661689783002578, + "learning_rate": 0.00029932505201482644, + "loss": 6.4724, + "step": 3882 + }, + { + "epoch": 0.36232154520854715, + "grad_norm": 5.533923429928786, + "learning_rate": 0.0002993243350966536, + "loss": 7.0995, + "step": 3883 + }, + { + "epoch": 0.36241485490342445, + "grad_norm": 1.3016455276092236, + "learning_rate": 0.0002993236177987932, + "loss": 6.5288, + "step": 3884 + }, + { + "epoch": 0.36250816459830176, + "grad_norm": 2.366860592400384, + "learning_rate": 0.00029932290012124705, + "loss": 6.9293, + "step": 3885 + }, + { + "epoch": 0.36260147429317907, + "grad_norm": 2.1653410619194218, + "learning_rate": 0.00029932218206401705, + "loss": 6.6123, + "step": 3886 + }, + { + "epoch": 0.3626947839880564, + "grad_norm": 3.0195254590146767, + "learning_rate": 0.00029932146362710493, + "loss": 6.5966, + "step": 3887 + }, + { + "epoch": 0.36278809368293363, + "grad_norm": 1.8820835989913622, + "learning_rate": 0.0002993207448105126, + "loss": 6.5347, + "step": 3888 + }, + { + "epoch": 0.36288140337781094, + "grad_norm": 71.04196671253548, + "learning_rate": 0.0002993200256142418, + "loss": 6.4898, + "step": 3889 + }, + { + "epoch": 0.36297471307268825, + "grad_norm": 2.192274626361559, + "learning_rate": 0.0002993193060382944, + "loss": 6.8376, + "step": 3890 + }, + { + "epoch": 0.36306802276756556, + "grad_norm": 7.00309613630599, + "learning_rate": 0.00029931858608267227, + "loss": 8.1688, + "step": 3891 + }, + { + "epoch": 0.36316133246244287, + "grad_norm": 419.1580686756999, + "learning_rate": 0.0002993178657473772, + "loss": 7.1685, + "step": 3892 + }, + { + "epoch": 0.3632546421573201, + "grad_norm": 3.9654881624766136, + "learning_rate": 0.00029931714503241106, + "loss": 7.5236, + "step": 3893 + }, + { + "epoch": 0.3633479518521974, + "grad_norm": 4.458331870284248, + "learning_rate": 0.00029931642393777566, + "loss": 8.6591, + "step": 3894 + }, + { + "epoch": 0.36344126154707473, + "grad_norm": 3.400293212672469, + "learning_rate": 0.00029931570246347273, + "loss": 8.1231, + "step": 3895 + }, + { + "epoch": 0.36353457124195204, + "grad_norm": 8.300857108048007, + "learning_rate": 0.00029931498060950426, + "loss": 8.1087, + "step": 3896 + }, + { + "epoch": 0.36362788093682935, + "grad_norm": 6.1998103033020255, + "learning_rate": 0.00029931425837587204, + "loss": 8.2119, + "step": 3897 + }, + { + "epoch": 0.36372119063170666, + "grad_norm": 5.230659613424784, + "learning_rate": 0.00029931353576257786, + "loss": 8.1277, + "step": 3898 + }, + { + "epoch": 0.3638145003265839, + "grad_norm": 7.078496780477515, + "learning_rate": 0.0002993128127696236, + "loss": 7.894, + "step": 3899 + }, + { + "epoch": 0.3639078100214612, + "grad_norm": 6.781361357608478, + "learning_rate": 0.0002993120893970111, + "loss": 7.87, + "step": 3900 + }, + { + "epoch": 0.36400111971633853, + "grad_norm": 11.302303439995733, + "learning_rate": 0.00029931136564474215, + "loss": 7.8169, + "step": 3901 + }, + { + "epoch": 0.36409442941121584, + "grad_norm": 9.107802786096569, + "learning_rate": 0.0002993106415128187, + "loss": 7.9904, + "step": 3902 + }, + { + "epoch": 0.36418773910609314, + "grad_norm": 11.916276845810126, + "learning_rate": 0.00029930991700124245, + "loss": 7.7576, + "step": 3903 + }, + { + "epoch": 0.3642810488009704, + "grad_norm": 9.959814000702602, + "learning_rate": 0.00029930919211001533, + "loss": 7.8584, + "step": 3904 + }, + { + "epoch": 0.3643743584958477, + "grad_norm": 9.792433741363881, + "learning_rate": 0.0002993084668391391, + "loss": 8.197, + "step": 3905 + }, + { + "epoch": 0.364467668190725, + "grad_norm": 3.657854814620441, + "learning_rate": 0.00029930774118861576, + "loss": 7.5222, + "step": 3906 + }, + { + "epoch": 0.3645609778856023, + "grad_norm": 22.34943293008139, + "learning_rate": 0.00029930701515844705, + "loss": 8.1726, + "step": 3907 + }, + { + "epoch": 0.36465428758047963, + "grad_norm": 3.237145375755384, + "learning_rate": 0.0002993062887486348, + "loss": 7.9816, + "step": 3908 + }, + { + "epoch": 0.3647475972753569, + "grad_norm": 7.516025454266711, + "learning_rate": 0.0002993055619591809, + "loss": 7.7913, + "step": 3909 + }, + { + "epoch": 0.3648409069702342, + "grad_norm": 8.028873932161607, + "learning_rate": 0.0002993048347900872, + "loss": 7.5985, + "step": 3910 + }, + { + "epoch": 0.3649342166651115, + "grad_norm": 9.320013427575956, + "learning_rate": 0.00029930410724135546, + "loss": 7.5494, + "step": 3911 + }, + { + "epoch": 0.3650275263599888, + "grad_norm": 5.591002957739234, + "learning_rate": 0.0002993033793129877, + "loss": 7.362, + "step": 3912 + }, + { + "epoch": 0.3651208360548661, + "grad_norm": 13.320752756775963, + "learning_rate": 0.0002993026510049856, + "loss": 7.8453, + "step": 3913 + }, + { + "epoch": 0.3652141457497434, + "grad_norm": 4.985219309782814, + "learning_rate": 0.0002993019223173511, + "loss": 7.3335, + "step": 3914 + }, + { + "epoch": 0.3653074554446207, + "grad_norm": 5.206783560867957, + "learning_rate": 0.00029930119325008604, + "loss": 7.3763, + "step": 3915 + }, + { + "epoch": 0.365400765139498, + "grad_norm": 5.650402329741779, + "learning_rate": 0.0002993004638031923, + "loss": 7.4517, + "step": 3916 + }, + { + "epoch": 0.3654940748343753, + "grad_norm": 4.981293322133708, + "learning_rate": 0.0002992997339766717, + "loss": 7.4966, + "step": 3917 + }, + { + "epoch": 0.3655873845292526, + "grad_norm": 3.996527083682415, + "learning_rate": 0.0002992990037705261, + "loss": 7.4969, + "step": 3918 + }, + { + "epoch": 0.3656806942241299, + "grad_norm": 4.150558083702964, + "learning_rate": 0.00029929827318475735, + "loss": 7.4367, + "step": 3919 + }, + { + "epoch": 0.36577400391900716, + "grad_norm": 4.819945819025011, + "learning_rate": 0.0002992975422193673, + "loss": 7.5732, + "step": 3920 + }, + { + "epoch": 0.36586731361388447, + "grad_norm": 2.1287294161141053, + "learning_rate": 0.00029929681087435784, + "loss": 7.1565, + "step": 3921 + }, + { + "epoch": 0.3659606233087618, + "grad_norm": 3.1523194655498825, + "learning_rate": 0.00029929607914973077, + "loss": 7.1607, + "step": 3922 + }, + { + "epoch": 0.3660539330036391, + "grad_norm": 2.0388362957334576, + "learning_rate": 0.0002992953470454881, + "loss": 7.1796, + "step": 3923 + }, + { + "epoch": 0.3661472426985164, + "grad_norm": 2.811878650402719, + "learning_rate": 0.0002992946145616315, + "loss": 7.2115, + "step": 3924 + }, + { + "epoch": 0.36624055239339365, + "grad_norm": 2.3950095523855586, + "learning_rate": 0.00029929388169816295, + "loss": 7.3438, + "step": 3925 + }, + { + "epoch": 0.36633386208827096, + "grad_norm": 1.9112865766389908, + "learning_rate": 0.0002992931484550843, + "loss": 6.963, + "step": 3926 + }, + { + "epoch": 0.36642717178314826, + "grad_norm": 1.6525816341220032, + "learning_rate": 0.0002992924148323974, + "loss": 7.1753, + "step": 3927 + }, + { + "epoch": 0.3665204814780256, + "grad_norm": 2.599736122711423, + "learning_rate": 0.0002992916808301041, + "loss": 6.8393, + "step": 3928 + }, + { + "epoch": 0.3666137911729029, + "grad_norm": 1.7047564215576143, + "learning_rate": 0.0002992909464482062, + "loss": 6.8351, + "step": 3929 + }, + { + "epoch": 0.3667071008677802, + "grad_norm": 1.6796322612596364, + "learning_rate": 0.0002992902116867057, + "loss": 6.8822, + "step": 3930 + }, + { + "epoch": 0.36680041056265744, + "grad_norm": 2.7533715166703927, + "learning_rate": 0.00029928947654560446, + "loss": 7.0549, + "step": 3931 + }, + { + "epoch": 0.36689372025753475, + "grad_norm": 6.217285062377164, + "learning_rate": 0.0002992887410249043, + "loss": 7.0184, + "step": 3932 + }, + { + "epoch": 0.36698702995241206, + "grad_norm": 5.02724665047313, + "learning_rate": 0.00029928800512460705, + "loss": 6.7798, + "step": 3933 + }, + { + "epoch": 0.36708033964728937, + "grad_norm": 3.749066575715841, + "learning_rate": 0.00029928726884471463, + "loss": 7.1068, + "step": 3934 + }, + { + "epoch": 0.3671736493421667, + "grad_norm": 2.2573494206842284, + "learning_rate": 0.0002992865321852289, + "loss": 7.2219, + "step": 3935 + }, + { + "epoch": 0.3672669590370439, + "grad_norm": 2.225715053732881, + "learning_rate": 0.0002992857951461518, + "loss": 6.7048, + "step": 3936 + }, + { + "epoch": 0.36736026873192124, + "grad_norm": 1.1027222983148124, + "learning_rate": 0.0002992850577274851, + "loss": 6.9738, + "step": 3937 + }, + { + "epoch": 0.36745357842679854, + "grad_norm": 1.533881160196764, + "learning_rate": 0.0002992843199292307, + "loss": 6.329, + "step": 3938 + }, + { + "epoch": 0.36754688812167585, + "grad_norm": 2.652245193728251, + "learning_rate": 0.0002992835817513905, + "loss": 6.7082, + "step": 3939 + }, + { + "epoch": 0.36764019781655316, + "grad_norm": 1.900692313517647, + "learning_rate": 0.00029928284319396647, + "loss": 6.8785, + "step": 3940 + }, + { + "epoch": 0.3677335075114304, + "grad_norm": 2.4751380122107296, + "learning_rate": 0.0002992821042569603, + "loss": 6.8485, + "step": 3941 + }, + { + "epoch": 0.3678268172063077, + "grad_norm": 1.0239125774282727, + "learning_rate": 0.00029928136494037396, + "loss": 6.7057, + "step": 3942 + }, + { + "epoch": 0.36792012690118503, + "grad_norm": 1.1854813650483476, + "learning_rate": 0.00029928062524420935, + "loss": 6.4203, + "step": 3943 + }, + { + "epoch": 0.36801343659606234, + "grad_norm": 5.855393479262327, + "learning_rate": 0.0002992798851684683, + "loss": 6.5003, + "step": 3944 + }, + { + "epoch": 0.36810674629093965, + "grad_norm": 1.9756530293998185, + "learning_rate": 0.00029927914471315276, + "loss": 6.6758, + "step": 3945 + }, + { + "epoch": 0.3682000559858169, + "grad_norm": 2.1226086836256854, + "learning_rate": 0.0002992784038782646, + "loss": 7.2488, + "step": 3946 + }, + { + "epoch": 0.3682933656806942, + "grad_norm": 1.3769503946797539, + "learning_rate": 0.0002992776626638056, + "loss": 6.6864, + "step": 3947 + }, + { + "epoch": 0.3683866753755715, + "grad_norm": 48.36668080257113, + "learning_rate": 0.00029927692106977775, + "loss": 6.5638, + "step": 3948 + }, + { + "epoch": 0.3684799850704488, + "grad_norm": 2.3189348248032857, + "learning_rate": 0.000299276179096183, + "loss": 6.6175, + "step": 3949 + }, + { + "epoch": 0.36857329476532613, + "grad_norm": 3.0036454684488443, + "learning_rate": 0.00029927543674302304, + "loss": 6.8622, + "step": 3950 + }, + { + "epoch": 0.36866660446020344, + "grad_norm": 5.723436094288706, + "learning_rate": 0.0002992746940102999, + "loss": 6.4235, + "step": 3951 + }, + { + "epoch": 0.3687599141550807, + "grad_norm": 6.517422989809144, + "learning_rate": 0.0002992739508980154, + "loss": 7.2641, + "step": 3952 + }, + { + "epoch": 0.368853223849958, + "grad_norm": 4.315383208568654, + "learning_rate": 0.00029927320740617147, + "loss": 7.0674, + "step": 3953 + }, + { + "epoch": 0.3689465335448353, + "grad_norm": 2.63474497291817, + "learning_rate": 0.00029927246353477003, + "loss": 7.0506, + "step": 3954 + }, + { + "epoch": 0.3690398432397126, + "grad_norm": 3.87683529787196, + "learning_rate": 0.0002992717192838129, + "loss": 6.9092, + "step": 3955 + }, + { + "epoch": 0.3691331529345899, + "grad_norm": 2.6936429362912917, + "learning_rate": 0.000299270974653302, + "loss": 6.7596, + "step": 3956 + }, + { + "epoch": 0.3692264626294672, + "grad_norm": 2.772715666712289, + "learning_rate": 0.00029927022964323924, + "loss": 6.8847, + "step": 3957 + }, + { + "epoch": 0.3693197723243445, + "grad_norm": 3.1056050494226706, + "learning_rate": 0.0002992694842536265, + "loss": 6.448, + "step": 3958 + }, + { + "epoch": 0.3694130820192218, + "grad_norm": 2.4707752519149198, + "learning_rate": 0.0002992687384844657, + "loss": 6.634, + "step": 3959 + }, + { + "epoch": 0.3695063917140991, + "grad_norm": 5.828162508884565, + "learning_rate": 0.0002992679923357587, + "loss": 7.233, + "step": 3960 + }, + { + "epoch": 0.3695997014089764, + "grad_norm": 2.02737598117637, + "learning_rate": 0.00029926724580750736, + "loss": 6.6805, + "step": 3961 + }, + { + "epoch": 0.36969301110385366, + "grad_norm": 2.7038024208523295, + "learning_rate": 0.0002992664988997137, + "loss": 6.594, + "step": 3962 + }, + { + "epoch": 0.36978632079873097, + "grad_norm": 3.6496620118068934, + "learning_rate": 0.00029926575161237954, + "loss": 6.395, + "step": 3963 + }, + { + "epoch": 0.3698796304936083, + "grad_norm": 3.6851332093050555, + "learning_rate": 0.0002992650039455068, + "loss": 6.8184, + "step": 3964 + }, + { + "epoch": 0.3699729401884856, + "grad_norm": 314.77793055880176, + "learning_rate": 0.0002992642558990973, + "loss": 6.8733, + "step": 3965 + }, + { + "epoch": 0.3700662498833629, + "grad_norm": 3.2677620561845147, + "learning_rate": 0.0002992635074731531, + "loss": 7.2247, + "step": 3966 + }, + { + "epoch": 0.3701595595782402, + "grad_norm": 2.082797026116714, + "learning_rate": 0.000299262758667676, + "loss": 7.4524, + "step": 3967 + }, + { + "epoch": 0.37025286927311746, + "grad_norm": 2.5865470247698448, + "learning_rate": 0.00029926200948266786, + "loss": 7.4717, + "step": 3968 + }, + { + "epoch": 0.37034617896799477, + "grad_norm": 1.6787337046971582, + "learning_rate": 0.0002992612599181307, + "loss": 7.3761, + "step": 3969 + }, + { + "epoch": 0.3704394886628721, + "grad_norm": 2.3690441775082407, + "learning_rate": 0.00029926050997406637, + "loss": 7.0969, + "step": 3970 + }, + { + "epoch": 0.3705327983577494, + "grad_norm": 1.5037444608466997, + "learning_rate": 0.00029925975965047676, + "loss": 6.842, + "step": 3971 + }, + { + "epoch": 0.3706261080526267, + "grad_norm": 2.6443409740088515, + "learning_rate": 0.00029925900894736375, + "loss": 7.2095, + "step": 3972 + }, + { + "epoch": 0.37071941774750394, + "grad_norm": 3.050612556714316, + "learning_rate": 0.00029925825786472935, + "loss": 6.7531, + "step": 3973 + }, + { + "epoch": 0.37081272744238125, + "grad_norm": 2.332756576092519, + "learning_rate": 0.00029925750640257546, + "loss": 7.4071, + "step": 3974 + }, + { + "epoch": 0.37090603713725856, + "grad_norm": 1.889497670223854, + "learning_rate": 0.00029925675456090385, + "loss": 7.2436, + "step": 3975 + }, + { + "epoch": 0.37099934683213587, + "grad_norm": 4.2382701381867784, + "learning_rate": 0.0002992560023397166, + "loss": 7.0732, + "step": 3976 + }, + { + "epoch": 0.3710926565270132, + "grad_norm": 3.7294864404003905, + "learning_rate": 0.0002992552497390155, + "loss": 7.0617, + "step": 3977 + }, + { + "epoch": 0.37118596622189043, + "grad_norm": 6.2860100968414026, + "learning_rate": 0.0002992544967588025, + "loss": 7.005, + "step": 3978 + }, + { + "epoch": 0.37127927591676774, + "grad_norm": 1.3394975916017298, + "learning_rate": 0.0002992537433990796, + "loss": 6.5751, + "step": 3979 + }, + { + "epoch": 0.37137258561164505, + "grad_norm": 2.6360646107698487, + "learning_rate": 0.00029925298965984855, + "loss": 6.762, + "step": 3980 + }, + { + "epoch": 0.37146589530652235, + "grad_norm": 1.6484885662756164, + "learning_rate": 0.0002992522355411114, + "loss": 6.9272, + "step": 3981 + }, + { + "epoch": 0.37155920500139966, + "grad_norm": 1.7827361219536826, + "learning_rate": 0.00029925148104287003, + "loss": 7.1544, + "step": 3982 + }, + { + "epoch": 0.37165251469627697, + "grad_norm": 2.0943576235029204, + "learning_rate": 0.00029925072616512633, + "loss": 6.6547, + "step": 3983 + }, + { + "epoch": 0.3717458243911542, + "grad_norm": 1.7269066919122629, + "learning_rate": 0.00029924997090788227, + "loss": 7.0092, + "step": 3984 + }, + { + "epoch": 0.37183913408603153, + "grad_norm": 1.4345668362680322, + "learning_rate": 0.00029924921527113975, + "loss": 6.88, + "step": 3985 + }, + { + "epoch": 0.37193244378090884, + "grad_norm": 2.7200674559309532, + "learning_rate": 0.00029924845925490066, + "loss": 6.9347, + "step": 3986 + }, + { + "epoch": 0.37202575347578615, + "grad_norm": 1.2639586251761554, + "learning_rate": 0.000299247702859167, + "loss": 6.9185, + "step": 3987 + }, + { + "epoch": 0.37211906317066346, + "grad_norm": 2.580309780336597, + "learning_rate": 0.0002992469460839406, + "loss": 6.4154, + "step": 3988 + }, + { + "epoch": 0.3722123728655407, + "grad_norm": 2.5580156918063928, + "learning_rate": 0.0002992461889292235, + "loss": 6.9149, + "step": 3989 + }, + { + "epoch": 0.372305682560418, + "grad_norm": 1.243482553076633, + "learning_rate": 0.0002992454313950175, + "loss": 6.6556, + "step": 3990 + }, + { + "epoch": 0.3723989922552953, + "grad_norm": 1.721476280389132, + "learning_rate": 0.0002992446734813245, + "loss": 6.7843, + "step": 3991 + }, + { + "epoch": 0.37249230195017263, + "grad_norm": 1.862689132264018, + "learning_rate": 0.0002992439151881466, + "loss": 6.3668, + "step": 3992 + }, + { + "epoch": 0.37258561164504994, + "grad_norm": 1.3529603264703667, + "learning_rate": 0.0002992431565154856, + "loss": 6.6871, + "step": 3993 + }, + { + "epoch": 0.3726789213399272, + "grad_norm": 1.5842913183344247, + "learning_rate": 0.00029924239746334345, + "loss": 6.6008, + "step": 3994 + }, + { + "epoch": 0.3727722310348045, + "grad_norm": 1.6318857034616399, + "learning_rate": 0.00029924163803172213, + "loss": 6.1599, + "step": 3995 + }, + { + "epoch": 0.3728655407296818, + "grad_norm": 1.4368869429724633, + "learning_rate": 0.00029924087822062354, + "loss": 6.3636, + "step": 3996 + }, + { + "epoch": 0.3729588504245591, + "grad_norm": 1.3784149815993185, + "learning_rate": 0.00029924011803004953, + "loss": 6.6413, + "step": 3997 + }, + { + "epoch": 0.3730521601194364, + "grad_norm": 1.3612953993923536, + "learning_rate": 0.00029923935746000216, + "loss": 6.5809, + "step": 3998 + }, + { + "epoch": 0.3731454698143137, + "grad_norm": 1.4484142836157685, + "learning_rate": 0.0002992385965104833, + "loss": 6.6308, + "step": 3999 + }, + { + "epoch": 0.373238779509191, + "grad_norm": 1.4317732248113235, + "learning_rate": 0.0002992378351814949, + "loss": 6.1187, + "step": 4000 + }, + { + "epoch": 0.3733320892040683, + "grad_norm": 1.161136300719391, + "learning_rate": 0.00029923707347303895, + "loss": 6.6911, + "step": 4001 + }, + { + "epoch": 0.3734253988989456, + "grad_norm": 1.3475878109734418, + "learning_rate": 0.00029923631138511727, + "loss": 6.3198, + "step": 4002 + }, + { + "epoch": 0.3735187085938229, + "grad_norm": 1.5133653703270646, + "learning_rate": 0.00029923554891773187, + "loss": 6.6161, + "step": 4003 + }, + { + "epoch": 0.3736120182887002, + "grad_norm": 1.2457377263661726, + "learning_rate": 0.00029923478607088466, + "loss": 6.0255, + "step": 4004 + }, + { + "epoch": 0.3737053279835775, + "grad_norm": 1.9124432802968583, + "learning_rate": 0.00029923402284457763, + "loss": 6.4856, + "step": 4005 + }, + { + "epoch": 0.3737986376784548, + "grad_norm": 1.3036449400313546, + "learning_rate": 0.00029923325923881264, + "loss": 6.7587, + "step": 4006 + }, + { + "epoch": 0.3738919473733321, + "grad_norm": 1.1705982335157687, + "learning_rate": 0.0002992324952535917, + "loss": 6.7401, + "step": 4007 + }, + { + "epoch": 0.3739852570682094, + "grad_norm": 2.1268490609743917, + "learning_rate": 0.0002992317308889167, + "loss": 6.4909, + "step": 4008 + }, + { + "epoch": 0.3740785667630867, + "grad_norm": 7.3941929874255425, + "learning_rate": 0.0002992309661447897, + "loss": 5.9974, + "step": 4009 + }, + { + "epoch": 0.37417187645796396, + "grad_norm": 2.1494787668672, + "learning_rate": 0.0002992302010212125, + "loss": 6.4305, + "step": 4010 + }, + { + "epoch": 0.37426518615284127, + "grad_norm": 1.1395484908366527, + "learning_rate": 0.0002992294355181871, + "loss": 6.646, + "step": 4011 + }, + { + "epoch": 0.3743584958477186, + "grad_norm": 1.8165328651468093, + "learning_rate": 0.00029922866963571544, + "loss": 6.7162, + "step": 4012 + }, + { + "epoch": 0.3744518055425959, + "grad_norm": 1.944862995610746, + "learning_rate": 0.00029922790337379954, + "loss": 6.8674, + "step": 4013 + }, + { + "epoch": 0.3745451152374732, + "grad_norm": 2.5091104051624726, + "learning_rate": 0.00029922713673244123, + "loss": 6.4945, + "step": 4014 + }, + { + "epoch": 0.37463842493235044, + "grad_norm": 1.3723069625859428, + "learning_rate": 0.0002992263697116425, + "loss": 6.7591, + "step": 4015 + }, + { + "epoch": 0.37473173462722775, + "grad_norm": 1.1455530535907494, + "learning_rate": 0.00029922560231140536, + "loss": 6.7959, + "step": 4016 + }, + { + "epoch": 0.37482504432210506, + "grad_norm": 3.131295016516139, + "learning_rate": 0.0002992248345317317, + "loss": 7.0143, + "step": 4017 + }, + { + "epoch": 0.37491835401698237, + "grad_norm": 1.4356892209947079, + "learning_rate": 0.0002992240663726235, + "loss": 6.6486, + "step": 4018 + }, + { + "epoch": 0.3750116637118597, + "grad_norm": 1.2086657557906597, + "learning_rate": 0.0002992232978340827, + "loss": 6.5144, + "step": 4019 + }, + { + "epoch": 0.375104973406737, + "grad_norm": 2.3267933196732717, + "learning_rate": 0.00029922252891611124, + "loss": 6.7075, + "step": 4020 + }, + { + "epoch": 0.37519828310161424, + "grad_norm": 1.2603646701450024, + "learning_rate": 0.0002992217596187111, + "loss": 6.8736, + "step": 4021 + }, + { + "epoch": 0.37529159279649155, + "grad_norm": 1.4942197970872084, + "learning_rate": 0.0002992209899418842, + "loss": 6.8927, + "step": 4022 + }, + { + "epoch": 0.37538490249136885, + "grad_norm": 1.1891550882773523, + "learning_rate": 0.0002992202198856326, + "loss": 6.1973, + "step": 4023 + }, + { + "epoch": 0.37547821218624616, + "grad_norm": 0.9517491324014715, + "learning_rate": 0.0002992194494499581, + "loss": 6.7641, + "step": 4024 + }, + { + "epoch": 0.37557152188112347, + "grad_norm": 1.5703990418008444, + "learning_rate": 0.00029921867863486284, + "loss": 6.5476, + "step": 4025 + }, + { + "epoch": 0.3756648315760007, + "grad_norm": 0.8862401255389107, + "learning_rate": 0.00029921790744034856, + "loss": 6.2642, + "step": 4026 + }, + { + "epoch": 0.37575814127087803, + "grad_norm": 1.525253270718908, + "learning_rate": 0.0002992171358664174, + "loss": 6.4854, + "step": 4027 + }, + { + "epoch": 0.37585145096575534, + "grad_norm": 0.9971163210385283, + "learning_rate": 0.0002992163639130713, + "loss": 6.2367, + "step": 4028 + }, + { + "epoch": 0.37594476066063265, + "grad_norm": 1.1805967177400423, + "learning_rate": 0.00029921559158031216, + "loss": 6.6938, + "step": 4029 + }, + { + "epoch": 0.37603807035550996, + "grad_norm": 1.2955774166794627, + "learning_rate": 0.000299214818868142, + "loss": 6.4101, + "step": 4030 + }, + { + "epoch": 0.3761313800503872, + "grad_norm": 1.3900746832675857, + "learning_rate": 0.0002992140457765627, + "loss": 6.059, + "step": 4031 + }, + { + "epoch": 0.3762246897452645, + "grad_norm": 1.2650159450972571, + "learning_rate": 0.0002992132723055763, + "loss": 6.4211, + "step": 4032 + }, + { + "epoch": 0.3763179994401418, + "grad_norm": 1.4967432486165893, + "learning_rate": 0.00029921249845518477, + "loss": 6.5544, + "step": 4033 + }, + { + "epoch": 0.37641130913501913, + "grad_norm": 1.0308176773490438, + "learning_rate": 0.00029921172422539007, + "loss": 6.1062, + "step": 4034 + }, + { + "epoch": 0.37650461882989644, + "grad_norm": 2.142681460488028, + "learning_rate": 0.0002992109496161941, + "loss": 6.4034, + "step": 4035 + }, + { + "epoch": 0.37659792852477375, + "grad_norm": 1.104337979046688, + "learning_rate": 0.0002992101746275989, + "loss": 6.4481, + "step": 4036 + }, + { + "epoch": 0.376691238219651, + "grad_norm": 1.349786480716133, + "learning_rate": 0.00029920939925960643, + "loss": 6.0644, + "step": 4037 + }, + { + "epoch": 0.3767845479145283, + "grad_norm": 3.5921185086492433, + "learning_rate": 0.0002992086235122187, + "loss": 6.1064, + "step": 4038 + }, + { + "epoch": 0.3768778576094056, + "grad_norm": 8.789997934079672, + "learning_rate": 0.0002992078473854376, + "loss": 6.5979, + "step": 4039 + }, + { + "epoch": 0.37697116730428293, + "grad_norm": 1.6906360594824354, + "learning_rate": 0.00029920707087926514, + "loss": 6.9166, + "step": 4040 + }, + { + "epoch": 0.37706447699916024, + "grad_norm": 1.2059293330157315, + "learning_rate": 0.0002992062939937033, + "loss": 6.7722, + "step": 4041 + }, + { + "epoch": 0.3771577866940375, + "grad_norm": 1.2230892014224932, + "learning_rate": 0.00029920551672875406, + "loss": 6.5666, + "step": 4042 + }, + { + "epoch": 0.3772510963889148, + "grad_norm": 1.5643435582416256, + "learning_rate": 0.0002992047390844194, + "loss": 6.2948, + "step": 4043 + }, + { + "epoch": 0.3773444060837921, + "grad_norm": 1.2825777277944517, + "learning_rate": 0.0002992039610607013, + "loss": 6.4165, + "step": 4044 + }, + { + "epoch": 0.3774377157786694, + "grad_norm": 2.0706712326951995, + "learning_rate": 0.00029920318265760166, + "loss": 6.4201, + "step": 4045 + }, + { + "epoch": 0.3775310254735467, + "grad_norm": 1.6576236512590417, + "learning_rate": 0.00029920240387512257, + "loss": 6.5281, + "step": 4046 + }, + { + "epoch": 0.377624335168424, + "grad_norm": 1.3824304277968305, + "learning_rate": 0.0002992016247132659, + "loss": 6.1494, + "step": 4047 + }, + { + "epoch": 0.3777176448633013, + "grad_norm": 1.4461282529346067, + "learning_rate": 0.00029920084517203377, + "loss": 6.5899, + "step": 4048 + }, + { + "epoch": 0.3778109545581786, + "grad_norm": 1.4052507809115247, + "learning_rate": 0.0002992000652514281, + "loss": 6.4582, + "step": 4049 + }, + { + "epoch": 0.3779042642530559, + "grad_norm": 3.0088757346030546, + "learning_rate": 0.00029919928495145083, + "loss": 6.0903, + "step": 4050 + }, + { + "epoch": 0.3779975739479332, + "grad_norm": 1.2336410518311338, + "learning_rate": 0.000299198504272104, + "loss": 6.2576, + "step": 4051 + }, + { + "epoch": 0.3780908836428105, + "grad_norm": 1.3870888549397924, + "learning_rate": 0.00029919772321338947, + "loss": 6.2145, + "step": 4052 + }, + { + "epoch": 0.37818419333768777, + "grad_norm": 1.2879497764424785, + "learning_rate": 0.00029919694177530943, + "loss": 6.5967, + "step": 4053 + }, + { + "epoch": 0.3782775030325651, + "grad_norm": 966.6518366639455, + "learning_rate": 0.00029919615995786575, + "loss": 6.7722, + "step": 4054 + }, + { + "epoch": 0.3783708127274424, + "grad_norm": 129.14613359979148, + "learning_rate": 0.00029919537776106037, + "loss": 6.8749, + "step": 4055 + }, + { + "epoch": 0.3784641224223197, + "grad_norm": 3.1743430913929314, + "learning_rate": 0.00029919459518489537, + "loss": 7.0375, + "step": 4056 + }, + { + "epoch": 0.378557432117197, + "grad_norm": 3.71493865936724, + "learning_rate": 0.00029919381222937274, + "loss": 7.06, + "step": 4057 + }, + { + "epoch": 0.37865074181207425, + "grad_norm": 1.6325830025878894, + "learning_rate": 0.0002991930288944944, + "loss": 6.5173, + "step": 4058 + }, + { + "epoch": 0.37874405150695156, + "grad_norm": 1.498413904527288, + "learning_rate": 0.0002991922451802624, + "loss": 6.6835, + "step": 4059 + }, + { + "epoch": 0.37883736120182887, + "grad_norm": 3.0188756436469344, + "learning_rate": 0.00029919146108667874, + "loss": 6.7845, + "step": 4060 + }, + { + "epoch": 0.3789306708967062, + "grad_norm": 1.793066045668384, + "learning_rate": 0.00029919067661374535, + "loss": 7.0189, + "step": 4061 + }, + { + "epoch": 0.3790239805915835, + "grad_norm": 1.6205624047762022, + "learning_rate": 0.00029918989176146426, + "loss": 6.559, + "step": 4062 + }, + { + "epoch": 0.37911729028646074, + "grad_norm": 1.3923444931788787, + "learning_rate": 0.0002991891065298375, + "loss": 6.347, + "step": 4063 + }, + { + "epoch": 0.37921059998133805, + "grad_norm": 1.4972416601953846, + "learning_rate": 0.00029918832091886704, + "loss": 6.7254, + "step": 4064 + }, + { + "epoch": 0.37930390967621536, + "grad_norm": 1.2112056748814335, + "learning_rate": 0.0002991875349285548, + "loss": 6.6547, + "step": 4065 + }, + { + "epoch": 0.37939721937109266, + "grad_norm": 1.8485464831016372, + "learning_rate": 0.0002991867485589029, + "loss": 6.5748, + "step": 4066 + }, + { + "epoch": 0.37949052906597, + "grad_norm": 1.8049146754059244, + "learning_rate": 0.00029918596180991333, + "loss": 6.6969, + "step": 4067 + }, + { + "epoch": 0.3795838387608472, + "grad_norm": 1.5320160593750678, + "learning_rate": 0.00029918517468158803, + "loss": 6.4316, + "step": 4068 + }, + { + "epoch": 0.37967714845572453, + "grad_norm": 1.2034612817516097, + "learning_rate": 0.000299184387173929, + "loss": 6.6444, + "step": 4069 + }, + { + "epoch": 0.37977045815060184, + "grad_norm": 2.235079735163689, + "learning_rate": 0.0002991835992869383, + "loss": 6.8361, + "step": 4070 + }, + { + "epoch": 0.37986376784547915, + "grad_norm": 1.2504352691703693, + "learning_rate": 0.00029918281102061785, + "loss": 6.5569, + "step": 4071 + }, + { + "epoch": 0.37995707754035646, + "grad_norm": 1.3812719317616735, + "learning_rate": 0.0002991820223749697, + "loss": 6.705, + "step": 4072 + }, + { + "epoch": 0.38005038723523377, + "grad_norm": 1.5359668894901886, + "learning_rate": 0.0002991812333499959, + "loss": 6.3689, + "step": 4073 + }, + { + "epoch": 0.380143696930111, + "grad_norm": 1.4674056050084199, + "learning_rate": 0.0002991804439456984, + "loss": 6.4251, + "step": 4074 + }, + { + "epoch": 0.3802370066249883, + "grad_norm": 1.5214931487536447, + "learning_rate": 0.0002991796541620792, + "loss": 6.7279, + "step": 4075 + }, + { + "epoch": 0.38033031631986564, + "grad_norm": 2.425007818438043, + "learning_rate": 0.00029917886399914034, + "loss": 6.8376, + "step": 4076 + }, + { + "epoch": 0.38042362601474294, + "grad_norm": 2.0086242330724287, + "learning_rate": 0.00029917807345688385, + "loss": 6.3613, + "step": 4077 + }, + { + "epoch": 0.38051693570962025, + "grad_norm": 1.4161385912698514, + "learning_rate": 0.00029917728253531163, + "loss": 6.5713, + "step": 4078 + }, + { + "epoch": 0.3806102454044975, + "grad_norm": 2.633377401607802, + "learning_rate": 0.00029917649123442587, + "loss": 6.5583, + "step": 4079 + }, + { + "epoch": 0.3807035550993748, + "grad_norm": 1.5371868018921717, + "learning_rate": 0.00029917569955422845, + "loss": 6.5325, + "step": 4080 + }, + { + "epoch": 0.3807968647942521, + "grad_norm": 1.0410280986775557, + "learning_rate": 0.0002991749074947214, + "loss": 6.6824, + "step": 4081 + }, + { + "epoch": 0.38089017448912943, + "grad_norm": 1.6327044192486715, + "learning_rate": 0.0002991741150559068, + "loss": 6.6515, + "step": 4082 + }, + { + "epoch": 0.38098348418400674, + "grad_norm": 2.021486766869574, + "learning_rate": 0.00029917332223778655, + "loss": 6.8394, + "step": 4083 + }, + { + "epoch": 0.381076793878884, + "grad_norm": 1.0812926085025334, + "learning_rate": 0.00029917252904036273, + "loss": 6.1631, + "step": 4084 + }, + { + "epoch": 0.3811701035737613, + "grad_norm": 2.3164658520739847, + "learning_rate": 0.00029917173546363735, + "loss": 6.7955, + "step": 4085 + }, + { + "epoch": 0.3812634132686386, + "grad_norm": 1.5574300181235434, + "learning_rate": 0.00029917094150761246, + "loss": 6.6837, + "step": 4086 + }, + { + "epoch": 0.3813567229635159, + "grad_norm": 1.8275740172559567, + "learning_rate": 0.00029917014717229006, + "loss": 6.5472, + "step": 4087 + }, + { + "epoch": 0.3814500326583932, + "grad_norm": 1.6408522983096545, + "learning_rate": 0.00029916935245767216, + "loss": 6.7898, + "step": 4088 + }, + { + "epoch": 0.38154334235327053, + "grad_norm": 1.2588217141539497, + "learning_rate": 0.00029916855736376077, + "loss": 6.4928, + "step": 4089 + }, + { + "epoch": 0.3816366520481478, + "grad_norm": 1.1950982312585057, + "learning_rate": 0.00029916776189055794, + "loss": 6.3401, + "step": 4090 + }, + { + "epoch": 0.3817299617430251, + "grad_norm": 2.2879950943524916, + "learning_rate": 0.00029916696603806564, + "loss": 6.5215, + "step": 4091 + }, + { + "epoch": 0.3818232714379024, + "grad_norm": 1.185623713018332, + "learning_rate": 0.000299166169806286, + "loss": 6.5114, + "step": 4092 + }, + { + "epoch": 0.3819165811327797, + "grad_norm": 1.2505399261710697, + "learning_rate": 0.00029916537319522093, + "loss": 6.4507, + "step": 4093 + }, + { + "epoch": 0.382009890827657, + "grad_norm": 2.2972548783303584, + "learning_rate": 0.0002991645762048725, + "loss": 6.471, + "step": 4094 + }, + { + "epoch": 0.38210320052253427, + "grad_norm": 2.306377812485381, + "learning_rate": 0.0002991637788352428, + "loss": 6.5511, + "step": 4095 + }, + { + "epoch": 0.3821965102174116, + "grad_norm": 1.1108684946546867, + "learning_rate": 0.0002991629810863337, + "loss": 6.391, + "step": 4096 + }, + { + "epoch": 0.3822898199122889, + "grad_norm": 1.2895783155646816, + "learning_rate": 0.0002991621829581474, + "loss": 6.6801, + "step": 4097 + }, + { + "epoch": 0.3823831296071662, + "grad_norm": 2.010996177258373, + "learning_rate": 0.00029916138445068585, + "loss": 6.616, + "step": 4098 + }, + { + "epoch": 0.3824764393020435, + "grad_norm": 1.6997340447165494, + "learning_rate": 0.000299160585563951, + "loss": 6.6369, + "step": 4099 + }, + { + "epoch": 0.38256974899692076, + "grad_norm": 1.2935988950219288, + "learning_rate": 0.00029915978629794506, + "loss": 6.5097, + "step": 4100 + }, + { + "epoch": 0.38266305869179806, + "grad_norm": 2.093539216337037, + "learning_rate": 0.00029915898665266994, + "loss": 6.3669, + "step": 4101 + }, + { + "epoch": 0.38275636838667537, + "grad_norm": 0.968707892224988, + "learning_rate": 0.00029915818662812766, + "loss": 6.7825, + "step": 4102 + }, + { + "epoch": 0.3828496780815527, + "grad_norm": 2.0810999552919367, + "learning_rate": 0.00029915738622432036, + "loss": 6.3842, + "step": 4103 + }, + { + "epoch": 0.38294298777643, + "grad_norm": 1.2022073880468658, + "learning_rate": 0.00029915658544124997, + "loss": 6.5879, + "step": 4104 + }, + { + "epoch": 0.3830362974713073, + "grad_norm": 3.520613543776457, + "learning_rate": 0.00029915578427891856, + "loss": 6.2483, + "step": 4105 + }, + { + "epoch": 0.38312960716618455, + "grad_norm": 1.8504351962407481, + "learning_rate": 0.0002991549827373282, + "loss": 6.526, + "step": 4106 + }, + { + "epoch": 0.38322291686106186, + "grad_norm": 17.416787406958274, + "learning_rate": 0.00029915418081648086, + "loss": 6.4546, + "step": 4107 + }, + { + "epoch": 0.38331622655593917, + "grad_norm": 1.4756894189167902, + "learning_rate": 0.00029915337851637864, + "loss": 6.6484, + "step": 4108 + }, + { + "epoch": 0.3834095362508165, + "grad_norm": 2.067310683026741, + "learning_rate": 0.00029915257583702364, + "loss": 6.582, + "step": 4109 + }, + { + "epoch": 0.3835028459456938, + "grad_norm": 1.4154524004601785, + "learning_rate": 0.00029915177277841775, + "loss": 6.587, + "step": 4110 + }, + { + "epoch": 0.38359615564057103, + "grad_norm": 1.4012048879438492, + "learning_rate": 0.0002991509693405631, + "loss": 6.4167, + "step": 4111 + }, + { + "epoch": 0.38368946533544834, + "grad_norm": 1.8725149361263396, + "learning_rate": 0.00029915016552346167, + "loss": 6.6869, + "step": 4112 + }, + { + "epoch": 0.38378277503032565, + "grad_norm": 1.2293307405542309, + "learning_rate": 0.0002991493613271156, + "loss": 6.3576, + "step": 4113 + }, + { + "epoch": 0.38387608472520296, + "grad_norm": 1.1301907246247198, + "learning_rate": 0.0002991485567515269, + "loss": 6.5423, + "step": 4114 + }, + { + "epoch": 0.38396939442008027, + "grad_norm": 0.9917011144935743, + "learning_rate": 0.00029914775179669753, + "loss": 6.1685, + "step": 4115 + }, + { + "epoch": 0.3840627041149575, + "grad_norm": 1.0658223594155734, + "learning_rate": 0.00029914694646262965, + "loss": 6.4798, + "step": 4116 + }, + { + "epoch": 0.38415601380983483, + "grad_norm": 1.028314200341546, + "learning_rate": 0.00029914614074932525, + "loss": 6.14, + "step": 4117 + }, + { + "epoch": 0.38424932350471214, + "grad_norm": 1.030000799977928, + "learning_rate": 0.00029914533465678645, + "loss": 6.2857, + "step": 4118 + }, + { + "epoch": 0.38434263319958945, + "grad_norm": 2.4694454695805965, + "learning_rate": 0.0002991445281850152, + "loss": 6.92, + "step": 4119 + }, + { + "epoch": 0.38443594289446675, + "grad_norm": 1.1550503874502422, + "learning_rate": 0.0002991437213340136, + "loss": 6.6044, + "step": 4120 + }, + { + "epoch": 0.384529252589344, + "grad_norm": 1.4973604208021072, + "learning_rate": 0.0002991429141037837, + "loss": 6.6414, + "step": 4121 + }, + { + "epoch": 0.3846225622842213, + "grad_norm": 0.9943242207194051, + "learning_rate": 0.00029914210649432753, + "loss": 6.6637, + "step": 4122 + }, + { + "epoch": 0.3847158719790986, + "grad_norm": 0.8061325438679189, + "learning_rate": 0.00029914129850564723, + "loss": 6.4652, + "step": 4123 + }, + { + "epoch": 0.38480918167397593, + "grad_norm": 1.7741499471601097, + "learning_rate": 0.00029914049013774467, + "loss": 6.5534, + "step": 4124 + }, + { + "epoch": 0.38490249136885324, + "grad_norm": 1.25769196320811, + "learning_rate": 0.0002991396813906221, + "loss": 6.178, + "step": 4125 + }, + { + "epoch": 0.38499580106373055, + "grad_norm": 1.4344766433942053, + "learning_rate": 0.0002991388722642815, + "loss": 6.2355, + "step": 4126 + }, + { + "epoch": 0.3850891107586078, + "grad_norm": 0.9818889451455708, + "learning_rate": 0.00029913806275872493, + "loss": 6.8114, + "step": 4127 + }, + { + "epoch": 0.3851824204534851, + "grad_norm": 1.3840232199207778, + "learning_rate": 0.0002991372528739544, + "loss": 6.0269, + "step": 4128 + }, + { + "epoch": 0.3852757301483624, + "grad_norm": 3.6104525038806248, + "learning_rate": 0.00029913644260997203, + "loss": 6.5551, + "step": 4129 + }, + { + "epoch": 0.3853690398432397, + "grad_norm": 1.2669596664630007, + "learning_rate": 0.00029913563196677987, + "loss": 6.3828, + "step": 4130 + }, + { + "epoch": 0.38546234953811703, + "grad_norm": 1.1302580948704637, + "learning_rate": 0.00029913482094437997, + "loss": 6.3195, + "step": 4131 + }, + { + "epoch": 0.3855556592329943, + "grad_norm": 1.4243692474943865, + "learning_rate": 0.00029913400954277445, + "loss": 6.7373, + "step": 4132 + }, + { + "epoch": 0.3856489689278716, + "grad_norm": 1.1628124725255755, + "learning_rate": 0.00029913319776196526, + "loss": 6.4044, + "step": 4133 + }, + { + "epoch": 0.3857422786227489, + "grad_norm": 1.3948360981390784, + "learning_rate": 0.0002991323856019545, + "loss": 6.4708, + "step": 4134 + }, + { + "epoch": 0.3858355883176262, + "grad_norm": 1.2841638920813452, + "learning_rate": 0.00029913157306274435, + "loss": 6.5628, + "step": 4135 + }, + { + "epoch": 0.3859288980125035, + "grad_norm": 1.6271734119859358, + "learning_rate": 0.00029913076014433673, + "loss": 5.9215, + "step": 4136 + }, + { + "epoch": 0.38602220770738077, + "grad_norm": 1.6166464661973776, + "learning_rate": 0.00029912994684673375, + "loss": 6.5384, + "step": 4137 + }, + { + "epoch": 0.3861155174022581, + "grad_norm": 0.9324673335756438, + "learning_rate": 0.00029912913316993756, + "loss": 6.2907, + "step": 4138 + }, + { + "epoch": 0.3862088270971354, + "grad_norm": 0.9731949661656263, + "learning_rate": 0.00029912831911395007, + "loss": 6.3093, + "step": 4139 + }, + { + "epoch": 0.3863021367920127, + "grad_norm": 1.078955070477442, + "learning_rate": 0.0002991275046787735, + "loss": 6.5069, + "step": 4140 + }, + { + "epoch": 0.38639544648689, + "grad_norm": 1.5891011342975594, + "learning_rate": 0.00029912668986440983, + "loss": 6.225, + "step": 4141 + }, + { + "epoch": 0.3864887561817673, + "grad_norm": 1.0012520758127423, + "learning_rate": 0.00029912587467086115, + "loss": 6.2624, + "step": 4142 + }, + { + "epoch": 0.38658206587664456, + "grad_norm": 1.4390940002171502, + "learning_rate": 0.00029912505909812957, + "loss": 6.431, + "step": 4143 + }, + { + "epoch": 0.3866753755715219, + "grad_norm": 1.0849480319693998, + "learning_rate": 0.00029912424314621714, + "loss": 6.2729, + "step": 4144 + }, + { + "epoch": 0.3867686852663992, + "grad_norm": 1.183602279224011, + "learning_rate": 0.00029912342681512593, + "loss": 5.9286, + "step": 4145 + }, + { + "epoch": 0.3868619949612765, + "grad_norm": 1.151755788432176, + "learning_rate": 0.00029912261010485805, + "loss": 6.291, + "step": 4146 + }, + { + "epoch": 0.3869553046561538, + "grad_norm": 1.189857506138684, + "learning_rate": 0.0002991217930154155, + "loss": 5.6252, + "step": 4147 + }, + { + "epoch": 0.38704861435103105, + "grad_norm": 1.589505939284602, + "learning_rate": 0.0002991209755468004, + "loss": 6.6434, + "step": 4148 + }, + { + "epoch": 0.38714192404590836, + "grad_norm": 1.278633951373921, + "learning_rate": 0.00029912015769901487, + "loss": 6.3998, + "step": 4149 + }, + { + "epoch": 0.38723523374078567, + "grad_norm": 1.233193661322253, + "learning_rate": 0.0002991193394720609, + "loss": 6.45, + "step": 4150 + }, + { + "epoch": 0.387328543435663, + "grad_norm": 1.0975934389023458, + "learning_rate": 0.0002991185208659407, + "loss": 6.318, + "step": 4151 + }, + { + "epoch": 0.3874218531305403, + "grad_norm": 1.414174421274212, + "learning_rate": 0.0002991177018806562, + "loss": 6.3077, + "step": 4152 + }, + { + "epoch": 0.38751516282541754, + "grad_norm": 1.3046913715377304, + "learning_rate": 0.00029911688251620957, + "loss": 6.3314, + "step": 4153 + }, + { + "epoch": 0.38760847252029484, + "grad_norm": 0.9641918020010786, + "learning_rate": 0.0002991160627726029, + "loss": 6.059, + "step": 4154 + }, + { + "epoch": 0.38770178221517215, + "grad_norm": 0.9247960637797613, + "learning_rate": 0.00029911524264983824, + "loss": 6.5258, + "step": 4155 + }, + { + "epoch": 0.38779509191004946, + "grad_norm": 1.6558265514211998, + "learning_rate": 0.0002991144221479177, + "loss": 5.9848, + "step": 4156 + }, + { + "epoch": 0.38788840160492677, + "grad_norm": 1.0678035138548934, + "learning_rate": 0.0002991136012668434, + "loss": 6.3211, + "step": 4157 + }, + { + "epoch": 0.3879817112998041, + "grad_norm": 1.7894072577701796, + "learning_rate": 0.0002991127800066173, + "loss": 6.3441, + "step": 4158 + }, + { + "epoch": 0.38807502099468133, + "grad_norm": 1.457881770823991, + "learning_rate": 0.0002991119583672416, + "loss": 6.5456, + "step": 4159 + }, + { + "epoch": 0.38816833068955864, + "grad_norm": 1.2511050275003894, + "learning_rate": 0.0002991111363487184, + "loss": 6.4432, + "step": 4160 + }, + { + "epoch": 0.38826164038443595, + "grad_norm": 1.1752499127603606, + "learning_rate": 0.0002991103139510497, + "loss": 6.4026, + "step": 4161 + }, + { + "epoch": 0.38835495007931325, + "grad_norm": 69.228166913049, + "learning_rate": 0.00029910949117423766, + "loss": 6.6902, + "step": 4162 + }, + { + "epoch": 0.38844825977419056, + "grad_norm": 0.9921843090810278, + "learning_rate": 0.0002991086680182843, + "loss": 5.8916, + "step": 4163 + }, + { + "epoch": 0.3885415694690678, + "grad_norm": 1.1669180159147792, + "learning_rate": 0.0002991078444831918, + "loss": 6.6, + "step": 4164 + }, + { + "epoch": 0.3886348791639451, + "grad_norm": 1.3391689253527375, + "learning_rate": 0.0002991070205689623, + "loss": 6.4364, + "step": 4165 + }, + { + "epoch": 0.38872818885882243, + "grad_norm": 1.2869422912376942, + "learning_rate": 0.00029910619627559776, + "loss": 6.2644, + "step": 4166 + }, + { + "epoch": 0.38882149855369974, + "grad_norm": 1.180875282157369, + "learning_rate": 0.0002991053716031003, + "loss": 6.3967, + "step": 4167 + }, + { + "epoch": 0.38891480824857705, + "grad_norm": 1.074630271167139, + "learning_rate": 0.00029910454655147206, + "loss": 6.622, + "step": 4168 + }, + { + "epoch": 0.3890081179434543, + "grad_norm": 1.10893153091305, + "learning_rate": 0.00029910372112071517, + "loss": 5.913, + "step": 4169 + }, + { + "epoch": 0.3891014276383316, + "grad_norm": 0.9381255373985985, + "learning_rate": 0.00029910289531083164, + "loss": 6.4968, + "step": 4170 + }, + { + "epoch": 0.3891947373332089, + "grad_norm": 0.9839445675599751, + "learning_rate": 0.0002991020691218236, + "loss": 6.4213, + "step": 4171 + }, + { + "epoch": 0.3892880470280862, + "grad_norm": 0.9912618178487879, + "learning_rate": 0.0002991012425536932, + "loss": 6.4792, + "step": 4172 + }, + { + "epoch": 0.38938135672296353, + "grad_norm": 1.2460521307292332, + "learning_rate": 0.00029910041560644255, + "loss": 6.333, + "step": 4173 + }, + { + "epoch": 0.3894746664178408, + "grad_norm": 1.1279113032222985, + "learning_rate": 0.0002990995882800737, + "loss": 6.3246, + "step": 4174 + }, + { + "epoch": 0.3895679761127181, + "grad_norm": 1.3936511296308536, + "learning_rate": 0.00029909876057458864, + "loss": 6.4771, + "step": 4175 + }, + { + "epoch": 0.3896612858075954, + "grad_norm": 3.2914336748649315, + "learning_rate": 0.0002990979324899897, + "loss": 6.784, + "step": 4176 + }, + { + "epoch": 0.3897545955024727, + "grad_norm": 1.4707374844236254, + "learning_rate": 0.00029909710402627887, + "loss": 6.5254, + "step": 4177 + }, + { + "epoch": 0.38984790519735, + "grad_norm": 0.928773972839987, + "learning_rate": 0.00029909627518345827, + "loss": 6.2168, + "step": 4178 + }, + { + "epoch": 0.38994121489222733, + "grad_norm": 1.134928607643775, + "learning_rate": 0.00029909544596153, + "loss": 6.3949, + "step": 4179 + }, + { + "epoch": 0.3900345245871046, + "grad_norm": 1.282111525481518, + "learning_rate": 0.0002990946163604962, + "loss": 6.642, + "step": 4180 + }, + { + "epoch": 0.3901278342819819, + "grad_norm": 1.4806404712994574, + "learning_rate": 0.00029909378638035895, + "loss": 6.3556, + "step": 4181 + }, + { + "epoch": 0.3902211439768592, + "grad_norm": 1.0850003444616756, + "learning_rate": 0.00029909295602112033, + "loss": 6.0823, + "step": 4182 + }, + { + "epoch": 0.3903144536717365, + "grad_norm": 1.2114542824835455, + "learning_rate": 0.0002990921252827825, + "loss": 6.4145, + "step": 4183 + }, + { + "epoch": 0.3904077633666138, + "grad_norm": 1.3403131868771305, + "learning_rate": 0.00029909129416534754, + "loss": 6.1088, + "step": 4184 + }, + { + "epoch": 0.39050107306149107, + "grad_norm": 0.8093402876430695, + "learning_rate": 0.00029909046266881766, + "loss": 6.3976, + "step": 4185 + }, + { + "epoch": 0.3905943827563684, + "grad_norm": 0.9871973679354932, + "learning_rate": 0.0002990896307931948, + "loss": 6.546, + "step": 4186 + }, + { + "epoch": 0.3906876924512457, + "grad_norm": 1.3722020659960694, + "learning_rate": 0.00029908879853848125, + "loss": 6.0827, + "step": 4187 + }, + { + "epoch": 0.390781002146123, + "grad_norm": 1.0206319586829393, + "learning_rate": 0.000299087965904679, + "loss": 6.3478, + "step": 4188 + }, + { + "epoch": 0.3908743118410003, + "grad_norm": 1.332242684538043, + "learning_rate": 0.00029908713289179026, + "loss": 6.3049, + "step": 4189 + }, + { + "epoch": 0.39096762153587755, + "grad_norm": 0.9336279728048327, + "learning_rate": 0.00029908629949981704, + "loss": 6.2371, + "step": 4190 + }, + { + "epoch": 0.39106093123075486, + "grad_norm": 0.9840184104671449, + "learning_rate": 0.0002990854657287616, + "loss": 6.237, + "step": 4191 + }, + { + "epoch": 0.39115424092563217, + "grad_norm": 1.0483405866043185, + "learning_rate": 0.00029908463157862596, + "loss": 6.0856, + "step": 4192 + }, + { + "epoch": 0.3912475506205095, + "grad_norm": 0.869508465564784, + "learning_rate": 0.0002990837970494122, + "loss": 6.4519, + "step": 4193 + }, + { + "epoch": 0.3913408603153868, + "grad_norm": 1.2805453478682276, + "learning_rate": 0.00029908296214112257, + "loss": 6.4166, + "step": 4194 + }, + { + "epoch": 0.3914341700102641, + "grad_norm": 1.1063653748519684, + "learning_rate": 0.0002990821268537591, + "loss": 6.0846, + "step": 4195 + }, + { + "epoch": 0.39152747970514135, + "grad_norm": 1.4242619643080119, + "learning_rate": 0.000299081291187324, + "loss": 6.3797, + "step": 4196 + }, + { + "epoch": 0.39162078940001865, + "grad_norm": 1.023551730608166, + "learning_rate": 0.0002990804551418193, + "loss": 6.0971, + "step": 4197 + }, + { + "epoch": 0.39171409909489596, + "grad_norm": 1.0199480740019995, + "learning_rate": 0.0002990796187172472, + "loss": 6.4845, + "step": 4198 + }, + { + "epoch": 0.39180740878977327, + "grad_norm": 1.0404006798136136, + "learning_rate": 0.00029907878191360975, + "loss": 6.456, + "step": 4199 + }, + { + "epoch": 0.3919007184846506, + "grad_norm": 1.0518715008121822, + "learning_rate": 0.0002990779447309091, + "loss": 6.4396, + "step": 4200 + }, + { + "epoch": 0.39199402817952783, + "grad_norm": 0.8357822733536249, + "learning_rate": 0.0002990771071691474, + "loss": 6.1829, + "step": 4201 + }, + { + "epoch": 0.39208733787440514, + "grad_norm": 1.3298164970918964, + "learning_rate": 0.00029907626922832676, + "loss": 6.3834, + "step": 4202 + }, + { + "epoch": 0.39218064756928245, + "grad_norm": 0.9839252242922365, + "learning_rate": 0.0002990754309084494, + "loss": 6.4722, + "step": 4203 + }, + { + "epoch": 0.39227395726415976, + "grad_norm": 0.9347019433971062, + "learning_rate": 0.00029907459220951735, + "loss": 6.3295, + "step": 4204 + }, + { + "epoch": 0.39236726695903706, + "grad_norm": 0.9091835158204316, + "learning_rate": 0.0002990737531315327, + "loss": 6.2709, + "step": 4205 + }, + { + "epoch": 0.3924605766539143, + "grad_norm": 0.8898906793452394, + "learning_rate": 0.00029907291367449777, + "loss": 6.5259, + "step": 4206 + }, + { + "epoch": 0.3925538863487916, + "grad_norm": 1.766833150034851, + "learning_rate": 0.0002990720738384145, + "loss": 6.031, + "step": 4207 + }, + { + "epoch": 0.39264719604366893, + "grad_norm": 1.5761492784337032, + "learning_rate": 0.00029907123362328514, + "loss": 6.1789, + "step": 4208 + }, + { + "epoch": 0.39274050573854624, + "grad_norm": 1.0325193439123614, + "learning_rate": 0.0002990703930291117, + "loss": 6.4139, + "step": 4209 + }, + { + "epoch": 0.39283381543342355, + "grad_norm": 29616.34252059512, + "learning_rate": 0.00029906955205589655, + "loss": 6.1949, + "step": 4210 + }, + { + "epoch": 0.39292712512830086, + "grad_norm": 1.1428221805569347, + "learning_rate": 0.0002990687107036416, + "loss": 6.5377, + "step": 4211 + }, + { + "epoch": 0.3930204348231781, + "grad_norm": 1.4935109426884494, + "learning_rate": 0.0002990678689723491, + "loss": 6.2958, + "step": 4212 + }, + { + "epoch": 0.3931137445180554, + "grad_norm": 1.2925349050611932, + "learning_rate": 0.00029906702686202115, + "loss": 6.8104, + "step": 4213 + }, + { + "epoch": 0.3932070542129327, + "grad_norm": 2.5538577261217346, + "learning_rate": 0.00029906618437265995, + "loss": 6.4424, + "step": 4214 + }, + { + "epoch": 0.39330036390781004, + "grad_norm": 1.9445993388034077, + "learning_rate": 0.00029906534150426757, + "loss": 7.0761, + "step": 4215 + }, + { + "epoch": 0.39339367360268734, + "grad_norm": 1.5439386248539246, + "learning_rate": 0.0002990644982568462, + "loss": 6.9605, + "step": 4216 + }, + { + "epoch": 0.3934869832975646, + "grad_norm": 1.6263921161508839, + "learning_rate": 0.0002990636546303979, + "loss": 6.9587, + "step": 4217 + }, + { + "epoch": 0.3935802929924419, + "grad_norm": 1.6889548122239202, + "learning_rate": 0.00029906281062492494, + "loss": 6.8171, + "step": 4218 + }, + { + "epoch": 0.3936736026873192, + "grad_norm": 1.389738189547104, + "learning_rate": 0.0002990619662404294, + "loss": 6.9575, + "step": 4219 + }, + { + "epoch": 0.3937669123821965, + "grad_norm": 1.8062185555863646, + "learning_rate": 0.0002990611214769135, + "loss": 6.6456, + "step": 4220 + }, + { + "epoch": 0.39386022207707383, + "grad_norm": 2.350181768008514, + "learning_rate": 0.00029906027633437923, + "loss": 6.9102, + "step": 4221 + }, + { + "epoch": 0.3939535317719511, + "grad_norm": 2.599711330551181, + "learning_rate": 0.0002990594308128289, + "loss": 6.915, + "step": 4222 + }, + { + "epoch": 0.3940468414668284, + "grad_norm": 2.9430103152357043, + "learning_rate": 0.0002990585849122645, + "loss": 6.8488, + "step": 4223 + }, + { + "epoch": 0.3941401511617057, + "grad_norm": 2.0523983230195744, + "learning_rate": 0.00029905773863268836, + "loss": 6.8882, + "step": 4224 + }, + { + "epoch": 0.394233460856583, + "grad_norm": 2.387863054140195, + "learning_rate": 0.0002990568919741025, + "loss": 6.8952, + "step": 4225 + }, + { + "epoch": 0.3943267705514603, + "grad_norm": 1.3564819173368419, + "learning_rate": 0.0002990560449365091, + "loss": 7.0094, + "step": 4226 + }, + { + "epoch": 0.39442008024633757, + "grad_norm": 1.69133207396018, + "learning_rate": 0.0002990551975199104, + "loss": 7.1038, + "step": 4227 + }, + { + "epoch": 0.3945133899412149, + "grad_norm": 1.584902881196301, + "learning_rate": 0.00029905434972430847, + "loss": 6.7261, + "step": 4228 + }, + { + "epoch": 0.3946066996360922, + "grad_norm": 1.9735026401988711, + "learning_rate": 0.00029905350154970544, + "loss": 6.8214, + "step": 4229 + }, + { + "epoch": 0.3947000093309695, + "grad_norm": 2.3294934369515, + "learning_rate": 0.0002990526529961035, + "loss": 6.6583, + "step": 4230 + }, + { + "epoch": 0.3947933190258468, + "grad_norm": 1.4457821402881423, + "learning_rate": 0.00029905180406350484, + "loss": 6.6626, + "step": 4231 + }, + { + "epoch": 0.3948866287207241, + "grad_norm": 1.7077159028568203, + "learning_rate": 0.0002990509547519116, + "loss": 6.8985, + "step": 4232 + }, + { + "epoch": 0.39497993841560136, + "grad_norm": 2.793609774943464, + "learning_rate": 0.000299050105061326, + "loss": 7.7082, + "step": 4233 + }, + { + "epoch": 0.39507324811047867, + "grad_norm": 2.8291569281962246, + "learning_rate": 0.00029904925499175006, + "loss": 6.9552, + "step": 4234 + }, + { + "epoch": 0.395166557805356, + "grad_norm": 2.2800641463618003, + "learning_rate": 0.000299048404543186, + "loss": 7.2165, + "step": 4235 + }, + { + "epoch": 0.3952598675002333, + "grad_norm": 1.3410470452879448, + "learning_rate": 0.000299047553715636, + "loss": 7.103, + "step": 4236 + }, + { + "epoch": 0.3953531771951106, + "grad_norm": 2.923559943852146, + "learning_rate": 0.0002990467025091023, + "loss": 6.5806, + "step": 4237 + }, + { + "epoch": 0.39544648688998785, + "grad_norm": 2.447297026351733, + "learning_rate": 0.0002990458509235869, + "loss": 7.2081, + "step": 4238 + }, + { + "epoch": 0.39553979658486516, + "grad_norm": 1.7628227964165917, + "learning_rate": 0.0002990449989590921, + "loss": 6.4262, + "step": 4239 + }, + { + "epoch": 0.39563310627974246, + "grad_norm": 2.0029465687777073, + "learning_rate": 0.00029904414661561997, + "loss": 6.8622, + "step": 4240 + }, + { + "epoch": 0.39572641597461977, + "grad_norm": 1.6828948291368142, + "learning_rate": 0.00029904329389317274, + "loss": 6.5186, + "step": 4241 + }, + { + "epoch": 0.3958197256694971, + "grad_norm": 1.5360129933040394, + "learning_rate": 0.0002990424407917526, + "loss": 6.901, + "step": 4242 + }, + { + "epoch": 0.39591303536437433, + "grad_norm": 1.4236222750265133, + "learning_rate": 0.00029904158731136165, + "loss": 6.9963, + "step": 4243 + }, + { + "epoch": 0.39600634505925164, + "grad_norm": 1.1695231865024274, + "learning_rate": 0.00029904073345200213, + "loss": 6.6876, + "step": 4244 + }, + { + "epoch": 0.39609965475412895, + "grad_norm": 1.4290462958439027, + "learning_rate": 0.0002990398792136761, + "loss": 6.8344, + "step": 4245 + }, + { + "epoch": 0.39619296444900626, + "grad_norm": 1.413470568879574, + "learning_rate": 0.0002990390245963859, + "loss": 6.7113, + "step": 4246 + }, + { + "epoch": 0.39628627414388357, + "grad_norm": 1.4096014961933527, + "learning_rate": 0.0002990381696001336, + "loss": 6.4479, + "step": 4247 + }, + { + "epoch": 0.3963795838387609, + "grad_norm": 1.1376814608846906, + "learning_rate": 0.0002990373142249213, + "loss": 6.4814, + "step": 4248 + }, + { + "epoch": 0.3964728935336381, + "grad_norm": 1.5524856277924324, + "learning_rate": 0.0002990364584707513, + "loss": 6.8061, + "step": 4249 + }, + { + "epoch": 0.39656620322851543, + "grad_norm": 1.4191895049977383, + "learning_rate": 0.00029903560233762575, + "loss": 7.0394, + "step": 4250 + }, + { + "epoch": 0.39665951292339274, + "grad_norm": 1.6895590516661065, + "learning_rate": 0.0002990347458255468, + "loss": 6.3063, + "step": 4251 + }, + { + "epoch": 0.39675282261827005, + "grad_norm": 1.165010506071183, + "learning_rate": 0.00029903388893451663, + "loss": 6.4913, + "step": 4252 + }, + { + "epoch": 0.39684613231314736, + "grad_norm": 1.940253486208601, + "learning_rate": 0.00029903303166453746, + "loss": 6.8306, + "step": 4253 + }, + { + "epoch": 0.3969394420080246, + "grad_norm": 1.8584315601066674, + "learning_rate": 0.0002990321740156114, + "loss": 7.0332, + "step": 4254 + }, + { + "epoch": 0.3970327517029019, + "grad_norm": 1.1841818985356445, + "learning_rate": 0.0002990313159877407, + "loss": 6.5711, + "step": 4255 + }, + { + "epoch": 0.39712606139777923, + "grad_norm": 1.198891504080212, + "learning_rate": 0.0002990304575809275, + "loss": 6.9432, + "step": 4256 + }, + { + "epoch": 0.39721937109265654, + "grad_norm": 1.2618545153536556, + "learning_rate": 0.000299029598795174, + "loss": 6.5542, + "step": 4257 + }, + { + "epoch": 0.39731268078753385, + "grad_norm": 1.1244974924944602, + "learning_rate": 0.0002990287396304823, + "loss": 6.6018, + "step": 4258 + }, + { + "epoch": 0.3974059904824111, + "grad_norm": 2.4812106082989254, + "learning_rate": 0.00029902788008685473, + "loss": 6.9115, + "step": 4259 + }, + { + "epoch": 0.3974993001772884, + "grad_norm": 2.277857210515966, + "learning_rate": 0.0002990270201642934, + "loss": 6.8523, + "step": 4260 + }, + { + "epoch": 0.3975926098721657, + "grad_norm": 1.0327934229540163, + "learning_rate": 0.0002990261598628005, + "loss": 6.9601, + "step": 4261 + }, + { + "epoch": 0.397685919567043, + "grad_norm": 1.3565101433172242, + "learning_rate": 0.00029902529918237827, + "loss": 6.2533, + "step": 4262 + }, + { + "epoch": 0.39777922926192033, + "grad_norm": 1.1304007826580436, + "learning_rate": 0.00029902443812302874, + "loss": 6.8958, + "step": 4263 + }, + { + "epoch": 0.39787253895679764, + "grad_norm": 1.457031086171537, + "learning_rate": 0.0002990235766847543, + "loss": 6.5092, + "step": 4264 + }, + { + "epoch": 0.3979658486516749, + "grad_norm": 2.7937177769506514, + "learning_rate": 0.000299022714867557, + "loss": 6.8571, + "step": 4265 + }, + { + "epoch": 0.3980591583465522, + "grad_norm": 1.0208741748229007, + "learning_rate": 0.0002990218526714391, + "loss": 6.7069, + "step": 4266 + }, + { + "epoch": 0.3981524680414295, + "grad_norm": 1.833202213465874, + "learning_rate": 0.00029902099009640275, + "loss": 6.8044, + "step": 4267 + }, + { + "epoch": 0.3982457777363068, + "grad_norm": 1.2354342910730636, + "learning_rate": 0.0002990201271424502, + "loss": 6.5171, + "step": 4268 + }, + { + "epoch": 0.3983390874311841, + "grad_norm": 1.4876078674095898, + "learning_rate": 0.0002990192638095836, + "loss": 6.3784, + "step": 4269 + }, + { + "epoch": 0.3984323971260614, + "grad_norm": 0.9680663512585056, + "learning_rate": 0.00029901840009780516, + "loss": 6.5291, + "step": 4270 + }, + { + "epoch": 0.3985257068209387, + "grad_norm": 1.0915162900289959, + "learning_rate": 0.00029901753600711706, + "loss": 6.3881, + "step": 4271 + }, + { + "epoch": 0.398619016515816, + "grad_norm": 1.2191184015410523, + "learning_rate": 0.00029901667153752154, + "loss": 6.6133, + "step": 4272 + }, + { + "epoch": 0.3987123262106933, + "grad_norm": 1.240923500804903, + "learning_rate": 0.00029901580668902067, + "loss": 6.6683, + "step": 4273 + }, + { + "epoch": 0.3988056359055706, + "grad_norm": 1.2308740964419354, + "learning_rate": 0.00029901494146161684, + "loss": 6.4339, + "step": 4274 + }, + { + "epoch": 0.39889894560044786, + "grad_norm": 1.1755555173780223, + "learning_rate": 0.00029901407585531216, + "loss": 6.3441, + "step": 4275 + }, + { + "epoch": 0.39899225529532517, + "grad_norm": 1.942826719367273, + "learning_rate": 0.00029901320987010875, + "loss": 5.9401, + "step": 4276 + }, + { + "epoch": 0.3990855649902025, + "grad_norm": 1.5280355315765228, + "learning_rate": 0.00029901234350600893, + "loss": 6.6595, + "step": 4277 + }, + { + "epoch": 0.3991788746850798, + "grad_norm": 1.4009082262639099, + "learning_rate": 0.0002990114767630149, + "loss": 6.2401, + "step": 4278 + }, + { + "epoch": 0.3992721843799571, + "grad_norm": 1.1340947998087858, + "learning_rate": 0.0002990106096411287, + "loss": 6.5586, + "step": 4279 + }, + { + "epoch": 0.39936549407483435, + "grad_norm": 1.2574959702459647, + "learning_rate": 0.0002990097421403528, + "loss": 6.2527, + "step": 4280 + }, + { + "epoch": 0.39945880376971166, + "grad_norm": 1.8067631540896725, + "learning_rate": 0.0002990088742606892, + "loss": 6.5894, + "step": 4281 + }, + { + "epoch": 0.39955211346458896, + "grad_norm": 1.329501157904478, + "learning_rate": 0.0002990080060021402, + "loss": 7.0224, + "step": 4282 + }, + { + "epoch": 0.3996454231594663, + "grad_norm": 1.0582037183337485, + "learning_rate": 0.00029900713736470794, + "loss": 6.3651, + "step": 4283 + }, + { + "epoch": 0.3997387328543436, + "grad_norm": 1.0490788310758037, + "learning_rate": 0.0002990062683483947, + "loss": 6.688, + "step": 4284 + }, + { + "epoch": 0.3998320425492209, + "grad_norm": 1.6986552707256308, + "learning_rate": 0.00029900539895320264, + "loss": 6.7853, + "step": 4285 + }, + { + "epoch": 0.39992535224409814, + "grad_norm": 1.3988452894053973, + "learning_rate": 0.000299004529179134, + "loss": 6.2869, + "step": 4286 + }, + { + "epoch": 0.40001866193897545, + "grad_norm": 1.3137849877572016, + "learning_rate": 0.000299003659026191, + "loss": 6.6499, + "step": 4287 + }, + { + "epoch": 0.40011197163385276, + "grad_norm": 1.2516411717957694, + "learning_rate": 0.00029900278849437584, + "loss": 5.8479, + "step": 4288 + }, + { + "epoch": 0.40020528132873007, + "grad_norm": 1.60216168214307, + "learning_rate": 0.00029900191758369064, + "loss": 6.28, + "step": 4289 + }, + { + "epoch": 0.4002985910236074, + "grad_norm": 1.508856045290139, + "learning_rate": 0.0002990010462941378, + "loss": 6.4613, + "step": 4290 + }, + { + "epoch": 0.40039190071848463, + "grad_norm": 1.4805310597465342, + "learning_rate": 0.0002990001746257194, + "loss": 6.2, + "step": 4291 + }, + { + "epoch": 0.40048521041336194, + "grad_norm": 1.2750025896810102, + "learning_rate": 0.00029899930257843766, + "loss": 6.3923, + "step": 4292 + }, + { + "epoch": 0.40057852010823924, + "grad_norm": 1.1793537298456287, + "learning_rate": 0.0002989984301522949, + "loss": 6.1562, + "step": 4293 + }, + { + "epoch": 0.40067182980311655, + "grad_norm": 1.100058083948488, + "learning_rate": 0.0002989975573472932, + "loss": 6.3053, + "step": 4294 + }, + { + "epoch": 0.40076513949799386, + "grad_norm": 1.5730293987758037, + "learning_rate": 0.0002989966841634349, + "loss": 5.9696, + "step": 4295 + }, + { + "epoch": 0.4008584491928711, + "grad_norm": 1.628933730973054, + "learning_rate": 0.0002989958106007221, + "loss": 6.67, + "step": 4296 + }, + { + "epoch": 0.4009517588877484, + "grad_norm": 1.0584359935415808, + "learning_rate": 0.00029899493665915715, + "loss": 6.5097, + "step": 4297 + }, + { + "epoch": 0.40104506858262573, + "grad_norm": 1.4257067148254483, + "learning_rate": 0.0002989940623387422, + "loss": 6.1999, + "step": 4298 + }, + { + "epoch": 0.40113837827750304, + "grad_norm": 1.1596746115375696, + "learning_rate": 0.00029899318763947953, + "loss": 6.5501, + "step": 4299 + }, + { + "epoch": 0.40123168797238035, + "grad_norm": 1.1858160934942092, + "learning_rate": 0.00029899231256137125, + "loss": 6.5563, + "step": 4300 + }, + { + "epoch": 0.40132499766725765, + "grad_norm": 1.0944166364897174, + "learning_rate": 0.0002989914371044197, + "loss": 6.5338, + "step": 4301 + }, + { + "epoch": 0.4014183073621349, + "grad_norm": 1.985817124476812, + "learning_rate": 0.000298990561268627, + "loss": 6.5633, + "step": 4302 + }, + { + "epoch": 0.4015116170570122, + "grad_norm": 1.636068323427966, + "learning_rate": 0.00029898968505399546, + "loss": 6.1915, + "step": 4303 + }, + { + "epoch": 0.4016049267518895, + "grad_norm": 1.1039770962469688, + "learning_rate": 0.00029898880846052734, + "loss": 6.4398, + "step": 4304 + }, + { + "epoch": 0.40169823644676683, + "grad_norm": 1.6875757436639602, + "learning_rate": 0.00029898793148822473, + "loss": 6.5413, + "step": 4305 + }, + { + "epoch": 0.40179154614164414, + "grad_norm": 1.6706055725268243, + "learning_rate": 0.00029898705413709, + "loss": 6.449, + "step": 4306 + }, + { + "epoch": 0.4018848558365214, + "grad_norm": 1.240702422846965, + "learning_rate": 0.0002989861764071253, + "loss": 6.7018, + "step": 4307 + }, + { + "epoch": 0.4019781655313987, + "grad_norm": 1.3870139262904273, + "learning_rate": 0.0002989852982983329, + "loss": 6.2356, + "step": 4308 + }, + { + "epoch": 0.402071475226276, + "grad_norm": 1.0963720747014527, + "learning_rate": 0.000298984419810715, + "loss": 6.5466, + "step": 4309 + }, + { + "epoch": 0.4021647849211533, + "grad_norm": 1.3831593485045086, + "learning_rate": 0.00029898354094427387, + "loss": 6.0796, + "step": 4310 + }, + { + "epoch": 0.4022580946160306, + "grad_norm": 0.9674918135520383, + "learning_rate": 0.00029898266169901176, + "loss": 6.3136, + "step": 4311 + }, + { + "epoch": 0.4023514043109079, + "grad_norm": 1.4745370594378753, + "learning_rate": 0.00029898178207493083, + "loss": 6.5653, + "step": 4312 + }, + { + "epoch": 0.4024447140057852, + "grad_norm": 1.0970773368961046, + "learning_rate": 0.00029898090207203335, + "loss": 6.0631, + "step": 4313 + }, + { + "epoch": 0.4025380237006625, + "grad_norm": 1.2917812825682615, + "learning_rate": 0.00029898002169032164, + "loss": 6.2504, + "step": 4314 + }, + { + "epoch": 0.4026313333955398, + "grad_norm": 1.1785424270137819, + "learning_rate": 0.0002989791409297978, + "loss": 6.3575, + "step": 4315 + }, + { + "epoch": 0.4027246430904171, + "grad_norm": 1.0102593871083774, + "learning_rate": 0.00029897825979046413, + "loss": 6.6523, + "step": 4316 + }, + { + "epoch": 0.4028179527852944, + "grad_norm": 1.2977430411780326, + "learning_rate": 0.0002989773782723229, + "loss": 6.3314, + "step": 4317 + }, + { + "epoch": 0.4029112624801717, + "grad_norm": 0.8762722244745489, + "learning_rate": 0.00029897649637537635, + "loss": 6.4177, + "step": 4318 + }, + { + "epoch": 0.403004572175049, + "grad_norm": 1.436270042820527, + "learning_rate": 0.00029897561409962666, + "loss": 6.259, + "step": 4319 + }, + { + "epoch": 0.4030978818699263, + "grad_norm": 1.0015859238804998, + "learning_rate": 0.0002989747314450761, + "loss": 6.2448, + "step": 4320 + }, + { + "epoch": 0.4031911915648036, + "grad_norm": 1.0876663802524478, + "learning_rate": 0.00029897384841172695, + "loss": 6.3913, + "step": 4321 + }, + { + "epoch": 0.4032845012596809, + "grad_norm": 1.4429610920124643, + "learning_rate": 0.00029897296499958143, + "loss": 6.469, + "step": 4322 + }, + { + "epoch": 0.40337781095455816, + "grad_norm": 1.4029495586681278, + "learning_rate": 0.0002989720812086418, + "loss": 6.4228, + "step": 4323 + }, + { + "epoch": 0.40347112064943547, + "grad_norm": 1.1143826059106228, + "learning_rate": 0.0002989711970389103, + "loss": 6.4254, + "step": 4324 + }, + { + "epoch": 0.4035644303443128, + "grad_norm": 1.125768742985748, + "learning_rate": 0.00029897031249038916, + "loss": 6.6632, + "step": 4325 + }, + { + "epoch": 0.4036577400391901, + "grad_norm": 1.2781807449133786, + "learning_rate": 0.00029896942756308065, + "loss": 6.2599, + "step": 4326 + }, + { + "epoch": 0.4037510497340674, + "grad_norm": 1.1398271732891088, + "learning_rate": 0.00029896854225698697, + "loss": 6.3865, + "step": 4327 + }, + { + "epoch": 0.40384435942894464, + "grad_norm": 1.0002295902904583, + "learning_rate": 0.00029896765657211045, + "loss": 6.492, + "step": 4328 + }, + { + "epoch": 0.40393766912382195, + "grad_norm": 1.058540234045873, + "learning_rate": 0.0002989667705084534, + "loss": 6.3112, + "step": 4329 + }, + { + "epoch": 0.40403097881869926, + "grad_norm": 1.0314735622626419, + "learning_rate": 0.00029896588406601787, + "loss": 6.6471, + "step": 4330 + }, + { + "epoch": 0.40412428851357657, + "grad_norm": 0.9976355611342688, + "learning_rate": 0.0002989649972448062, + "loss": 6.5252, + "step": 4331 + }, + { + "epoch": 0.4042175982084539, + "grad_norm": 1.1527031811060908, + "learning_rate": 0.00029896411004482073, + "loss": 6.1894, + "step": 4332 + }, + { + "epoch": 0.40431090790333113, + "grad_norm": 1.329397427324029, + "learning_rate": 0.00029896322246606365, + "loss": 6.5581, + "step": 4333 + }, + { + "epoch": 0.40440421759820844, + "grad_norm": 1.1365096693252505, + "learning_rate": 0.0002989623345085372, + "loss": 6.643, + "step": 4334 + }, + { + "epoch": 0.40449752729308575, + "grad_norm": 1.0381132429294537, + "learning_rate": 0.00029896144617224367, + "loss": 5.9605, + "step": 4335 + }, + { + "epoch": 0.40459083698796305, + "grad_norm": 0.8915715279824995, + "learning_rate": 0.0002989605574571853, + "loss": 6.3523, + "step": 4336 + }, + { + "epoch": 0.40468414668284036, + "grad_norm": 0.8828407136523849, + "learning_rate": 0.0002989596683633644, + "loss": 6.2438, + "step": 4337 + }, + { + "epoch": 0.40477745637771767, + "grad_norm": 0.8490731988794797, + "learning_rate": 0.00029895877889078314, + "loss": 6.2749, + "step": 4338 + }, + { + "epoch": 0.4048707660725949, + "grad_norm": 1.1819856161223787, + "learning_rate": 0.0002989578890394438, + "loss": 6.4946, + "step": 4339 + }, + { + "epoch": 0.40496407576747223, + "grad_norm": 1.0005083412823628, + "learning_rate": 0.0002989569988093488, + "loss": 6.2524, + "step": 4340 + }, + { + "epoch": 0.40505738546234954, + "grad_norm": 1.2141077064523003, + "learning_rate": 0.00029895610820050017, + "loss": 5.9735, + "step": 4341 + }, + { + "epoch": 0.40515069515722685, + "grad_norm": 1.3006174514826965, + "learning_rate": 0.0002989552172129003, + "loss": 6.3992, + "step": 4342 + }, + { + "epoch": 0.40524400485210416, + "grad_norm": 1.3178624337817555, + "learning_rate": 0.00029895432584655144, + "loss": 6.8099, + "step": 4343 + }, + { + "epoch": 0.4053373145469814, + "grad_norm": 1.241636770882523, + "learning_rate": 0.0002989534341014558, + "loss": 6.0055, + "step": 4344 + }, + { + "epoch": 0.4054306242418587, + "grad_norm": 2.0655144280465705, + "learning_rate": 0.0002989525419776158, + "loss": 6.5513, + "step": 4345 + }, + { + "epoch": 0.405523933936736, + "grad_norm": 3.000104891958767, + "learning_rate": 0.00029895164947503353, + "loss": 6.2674, + "step": 4346 + }, + { + "epoch": 0.40561724363161333, + "grad_norm": 5.682236012405088, + "learning_rate": 0.00029895075659371143, + "loss": 6.4303, + "step": 4347 + }, + { + "epoch": 0.40571055332649064, + "grad_norm": 1.2633618449382098, + "learning_rate": 0.0002989498633336516, + "loss": 6.5862, + "step": 4348 + }, + { + "epoch": 0.4058038630213679, + "grad_norm": 4.512864853733437, + "learning_rate": 0.0002989489696948564, + "loss": 6.4203, + "step": 4349 + }, + { + "epoch": 0.4058971727162452, + "grad_norm": 5.765818738052536, + "learning_rate": 0.00029894807567732806, + "loss": 6.2515, + "step": 4350 + }, + { + "epoch": 0.4059904824111225, + "grad_norm": 1.40744208690749, + "learning_rate": 0.00029894718128106897, + "loss": 6.3951, + "step": 4351 + }, + { + "epoch": 0.4060837921059998, + "grad_norm": 1.3117747182988035, + "learning_rate": 0.00029894628650608125, + "loss": 6.5999, + "step": 4352 + }, + { + "epoch": 0.4061771018008771, + "grad_norm": 1.8568073198510044, + "learning_rate": 0.0002989453913523673, + "loss": 6.0639, + "step": 4353 + }, + { + "epoch": 0.40627041149575444, + "grad_norm": 1.4877412461673716, + "learning_rate": 0.0002989444958199293, + "loss": 6.5037, + "step": 4354 + }, + { + "epoch": 0.4063637211906317, + "grad_norm": 1.6390644040403508, + "learning_rate": 0.00029894359990876955, + "loss": 6.6548, + "step": 4355 + }, + { + "epoch": 0.406457030885509, + "grad_norm": 1.0400527481868647, + "learning_rate": 0.0002989427036188904, + "loss": 6.3953, + "step": 4356 + }, + { + "epoch": 0.4065503405803863, + "grad_norm": 1.179192289774448, + "learning_rate": 0.00029894180695029407, + "loss": 6.3235, + "step": 4357 + }, + { + "epoch": 0.4066436502752636, + "grad_norm": 1.2061061951247798, + "learning_rate": 0.0002989409099029828, + "loss": 6.2278, + "step": 4358 + }, + { + "epoch": 0.4067369599701409, + "grad_norm": 45.42774873478709, + "learning_rate": 0.00029894001247695895, + "loss": 6.1891, + "step": 4359 + }, + { + "epoch": 0.4068302696650182, + "grad_norm": 2.1219261389854966, + "learning_rate": 0.00029893911467222475, + "loss": 6.4221, + "step": 4360 + }, + { + "epoch": 0.4069235793598955, + "grad_norm": 1.8474661571657605, + "learning_rate": 0.0002989382164887825, + "loss": 6.5185, + "step": 4361 + }, + { + "epoch": 0.4070168890547728, + "grad_norm": 3.5009363098139104, + "learning_rate": 0.0002989373179266345, + "loss": 6.5831, + "step": 4362 + }, + { + "epoch": 0.4071101987496501, + "grad_norm": 2.5618809178018442, + "learning_rate": 0.000298936418985783, + "loss": 6.4682, + "step": 4363 + }, + { + "epoch": 0.4072035084445274, + "grad_norm": 2.3292341798429175, + "learning_rate": 0.00029893551966623036, + "loss": 7.0158, + "step": 4364 + }, + { + "epoch": 0.40729681813940466, + "grad_norm": 2.0177249802839903, + "learning_rate": 0.00029893461996797875, + "loss": 6.7409, + "step": 4365 + }, + { + "epoch": 0.40739012783428197, + "grad_norm": 1.7646013379834253, + "learning_rate": 0.00029893371989103055, + "loss": 6.6128, + "step": 4366 + }, + { + "epoch": 0.4074834375291593, + "grad_norm": 2.9140876084070255, + "learning_rate": 0.000298932819435388, + "loss": 6.1735, + "step": 4367 + }, + { + "epoch": 0.4075767472240366, + "grad_norm": 3.135408501092857, + "learning_rate": 0.0002989319186010534, + "loss": 6.8267, + "step": 4368 + }, + { + "epoch": 0.4076700569189139, + "grad_norm": 1.8564370457960129, + "learning_rate": 0.00029893101738802906, + "loss": 6.6145, + "step": 4369 + }, + { + "epoch": 0.4077633666137912, + "grad_norm": 1.6252877082965813, + "learning_rate": 0.00029893011579631724, + "loss": 6.2525, + "step": 4370 + }, + { + "epoch": 0.40785667630866845, + "grad_norm": 2.062888877560823, + "learning_rate": 0.00029892921382592026, + "loss": 6.6751, + "step": 4371 + }, + { + "epoch": 0.40794998600354576, + "grad_norm": 1.710232789341886, + "learning_rate": 0.0002989283114768405, + "loss": 6.3123, + "step": 4372 + }, + { + "epoch": 0.40804329569842307, + "grad_norm": 1.6117956807049587, + "learning_rate": 0.00029892740874908004, + "loss": 6.2993, + "step": 4373 + }, + { + "epoch": 0.4081366053933004, + "grad_norm": 1.5099796686018927, + "learning_rate": 0.0002989265056426413, + "loss": 6.5142, + "step": 4374 + }, + { + "epoch": 0.4082299150881777, + "grad_norm": 1.534519117331692, + "learning_rate": 0.00029892560215752666, + "loss": 5.9385, + "step": 4375 + }, + { + "epoch": 0.40832322478305494, + "grad_norm": 1.4512062252785898, + "learning_rate": 0.0002989246982937382, + "loss": 6.6381, + "step": 4376 + }, + { + "epoch": 0.40841653447793225, + "grad_norm": 1.2551164908301151, + "learning_rate": 0.00029892379405127844, + "loss": 6.2829, + "step": 4377 + }, + { + "epoch": 0.40850984417280956, + "grad_norm": 1.7079250173505813, + "learning_rate": 0.00029892288943014956, + "loss": 6.5462, + "step": 4378 + }, + { + "epoch": 0.40860315386768686, + "grad_norm": 8.05848023927949, + "learning_rate": 0.0002989219844303539, + "loss": 6.3042, + "step": 4379 + }, + { + "epoch": 0.40869646356256417, + "grad_norm": 1.229033411581752, + "learning_rate": 0.0002989210790518937, + "loss": 6.2292, + "step": 4380 + }, + { + "epoch": 0.4087897732574414, + "grad_norm": 1.0900328479331718, + "learning_rate": 0.0002989201732947714, + "loss": 6.4581, + "step": 4381 + }, + { + "epoch": 0.40888308295231873, + "grad_norm": 1.128062715339358, + "learning_rate": 0.00029891926715898913, + "loss": 6.4038, + "step": 4382 + }, + { + "epoch": 0.40897639264719604, + "grad_norm": 1.5222618266637786, + "learning_rate": 0.0002989183606445493, + "loss": 6.8036, + "step": 4383 + }, + { + "epoch": 0.40906970234207335, + "grad_norm": 1.4647227150181557, + "learning_rate": 0.0002989174537514542, + "loss": 6.6875, + "step": 4384 + }, + { + "epoch": 0.40916301203695066, + "grad_norm": 1.877418746787283, + "learning_rate": 0.00029891654647970614, + "loss": 6.3034, + "step": 4385 + }, + { + "epoch": 0.4092563217318279, + "grad_norm": 2.0634780631227314, + "learning_rate": 0.0002989156388293074, + "loss": 6.4876, + "step": 4386 + }, + { + "epoch": 0.4093496314267052, + "grad_norm": 2.02137269063386, + "learning_rate": 0.0002989147308002603, + "loss": 6.3047, + "step": 4387 + }, + { + "epoch": 0.4094429411215825, + "grad_norm": 1.1976416607978024, + "learning_rate": 0.0002989138223925672, + "loss": 6.4329, + "step": 4388 + }, + { + "epoch": 0.40953625081645983, + "grad_norm": 3.070932994981251, + "learning_rate": 0.0002989129136062303, + "loss": 6.5678, + "step": 4389 + }, + { + "epoch": 0.40962956051133714, + "grad_norm": 2.0650390346089402, + "learning_rate": 0.00029891200444125197, + "loss": 6.2461, + "step": 4390 + }, + { + "epoch": 0.40972287020621445, + "grad_norm": 1.8824813863996905, + "learning_rate": 0.0002989110948976346, + "loss": 6.1562, + "step": 4391 + }, + { + "epoch": 0.4098161799010917, + "grad_norm": 1.3113595665446043, + "learning_rate": 0.00029891018497538033, + "loss": 6.3067, + "step": 4392 + }, + { + "epoch": 0.409909489595969, + "grad_norm": 1.7716560582208565, + "learning_rate": 0.0002989092746744916, + "loss": 6.4359, + "step": 4393 + }, + { + "epoch": 0.4100027992908463, + "grad_norm": 1.7156460244946112, + "learning_rate": 0.0002989083639949707, + "loss": 6.1536, + "step": 4394 + }, + { + "epoch": 0.41009610898572363, + "grad_norm": 0.89723962096593, + "learning_rate": 0.00029890745293681994, + "loss": 6.1697, + "step": 4395 + }, + { + "epoch": 0.41018941868060094, + "grad_norm": 1.723113710147617, + "learning_rate": 0.0002989065415000416, + "loss": 6.3623, + "step": 4396 + }, + { + "epoch": 0.4102827283754782, + "grad_norm": 1.3588786798132093, + "learning_rate": 0.0002989056296846381, + "loss": 6.4006, + "step": 4397 + }, + { + "epoch": 0.4103760380703555, + "grad_norm": 0.9629981891714184, + "learning_rate": 0.0002989047174906117, + "loss": 6.4852, + "step": 4398 + }, + { + "epoch": 0.4104693477652328, + "grad_norm": 1.554444254486569, + "learning_rate": 0.0002989038049179647, + "loss": 6.1929, + "step": 4399 + }, + { + "epoch": 0.4105626574601101, + "grad_norm": 1.2583155228488019, + "learning_rate": 0.00029890289196669937, + "loss": 6.4468, + "step": 4400 + }, + { + "epoch": 0.4106559671549874, + "grad_norm": 1.4248761330157915, + "learning_rate": 0.00029890197863681814, + "loss": 5.914, + "step": 4401 + }, + { + "epoch": 0.4107492768498647, + "grad_norm": 1.4657428847760074, + "learning_rate": 0.00029890106492832326, + "loss": 5.8833, + "step": 4402 + }, + { + "epoch": 0.410842586544742, + "grad_norm": 0.9332344178850369, + "learning_rate": 0.0002989001508412171, + "loss": 5.8882, + "step": 4403 + }, + { + "epoch": 0.4109358962396193, + "grad_norm": 1.6527051213883994, + "learning_rate": 0.000298899236375502, + "loss": 6.5259, + "step": 4404 + }, + { + "epoch": 0.4110292059344966, + "grad_norm": 1.3663541804836126, + "learning_rate": 0.00029889832153118023, + "loss": 5.7569, + "step": 4405 + }, + { + "epoch": 0.4111225156293739, + "grad_norm": 2.1137008123295002, + "learning_rate": 0.00029889740630825414, + "loss": 6.2023, + "step": 4406 + }, + { + "epoch": 0.4112158253242512, + "grad_norm": 1.1917672130100736, + "learning_rate": 0.000298896490706726, + "loss": 6.3043, + "step": 4407 + }, + { + "epoch": 0.41130913501912847, + "grad_norm": 4.395475950306776, + "learning_rate": 0.00029889557472659827, + "loss": 6.6145, + "step": 4408 + }, + { + "epoch": 0.4114024447140058, + "grad_norm": 1.7522682189490355, + "learning_rate": 0.0002988946583678731, + "loss": 6.0658, + "step": 4409 + }, + { + "epoch": 0.4114957544088831, + "grad_norm": 1.506488790018202, + "learning_rate": 0.000298893741630553, + "loss": 6.1901, + "step": 4410 + }, + { + "epoch": 0.4115890641037604, + "grad_norm": 1.2171096106926846, + "learning_rate": 0.0002988928245146402, + "loss": 5.8204, + "step": 4411 + }, + { + "epoch": 0.4116823737986377, + "grad_norm": 1.2340287443320404, + "learning_rate": 0.000298891907020137, + "loss": 6.0102, + "step": 4412 + }, + { + "epoch": 0.41177568349351495, + "grad_norm": 1.4734870246798917, + "learning_rate": 0.00029889098914704585, + "loss": 5.6591, + "step": 4413 + }, + { + "epoch": 0.41186899318839226, + "grad_norm": 1.3449999290347696, + "learning_rate": 0.000298890070895369, + "loss": 6.6198, + "step": 4414 + }, + { + "epoch": 0.41196230288326957, + "grad_norm": 1.2598459172940804, + "learning_rate": 0.0002988891522651088, + "loss": 5.8356, + "step": 4415 + }, + { + "epoch": 0.4120556125781469, + "grad_norm": 1.1816004069209605, + "learning_rate": 0.0002988882332562676, + "loss": 6.2443, + "step": 4416 + }, + { + "epoch": 0.4121489222730242, + "grad_norm": 1.268335706932497, + "learning_rate": 0.0002988873138688477, + "loss": 6.433, + "step": 4417 + }, + { + "epoch": 0.41224223196790144, + "grad_norm": 1.0762119941941317, + "learning_rate": 0.0002988863941028515, + "loss": 6.2397, + "step": 4418 + }, + { + "epoch": 0.41233554166277875, + "grad_norm": 1.4644685287596626, + "learning_rate": 0.0002988854739582813, + "loss": 6.2942, + "step": 4419 + }, + { + "epoch": 0.41242885135765606, + "grad_norm": 1.1481998295953022, + "learning_rate": 0.0002988845534351394, + "loss": 6.405, + "step": 4420 + }, + { + "epoch": 0.41252216105253336, + "grad_norm": 1.091363036092342, + "learning_rate": 0.00029888363253342825, + "loss": 6.3215, + "step": 4421 + }, + { + "epoch": 0.4126154707474107, + "grad_norm": 1.004590666245663, + "learning_rate": 0.00029888271125315006, + "loss": 6.2342, + "step": 4422 + }, + { + "epoch": 0.412708780442288, + "grad_norm": 1.2880296331348184, + "learning_rate": 0.0002988817895943073, + "loss": 6.2934, + "step": 4423 + }, + { + "epoch": 0.41280209013716523, + "grad_norm": 0.8414077404093603, + "learning_rate": 0.0002988808675569022, + "loss": 6.2417, + "step": 4424 + }, + { + "epoch": 0.41289539983204254, + "grad_norm": 0.931922690456802, + "learning_rate": 0.0002988799451409372, + "loss": 6.4928, + "step": 4425 + }, + { + "epoch": 0.41298870952691985, + "grad_norm": 1.0081700417718635, + "learning_rate": 0.0002988790223464146, + "loss": 6.215, + "step": 4426 + }, + { + "epoch": 0.41308201922179716, + "grad_norm": 1.6200113598618444, + "learning_rate": 0.0002988780991733367, + "loss": 6.2222, + "step": 4427 + }, + { + "epoch": 0.41317532891667447, + "grad_norm": 1.274845369364824, + "learning_rate": 0.00029887717562170594, + "loss": 6.1094, + "step": 4428 + }, + { + "epoch": 0.4132686386115517, + "grad_norm": 0.9174851257926984, + "learning_rate": 0.00029887625169152456, + "loss": 6.1549, + "step": 4429 + }, + { + "epoch": 0.41336194830642903, + "grad_norm": 1.7704320618032734, + "learning_rate": 0.000298875327382795, + "loss": 6.2184, + "step": 4430 + }, + { + "epoch": 0.41345525800130634, + "grad_norm": 0.9138756030437861, + "learning_rate": 0.00029887440269551964, + "loss": 6.0674, + "step": 4431 + }, + { + "epoch": 0.41354856769618364, + "grad_norm": 1.5307856992686877, + "learning_rate": 0.0002988734776297007, + "loss": 6.6219, + "step": 4432 + }, + { + "epoch": 0.41364187739106095, + "grad_norm": 1.8460870363029995, + "learning_rate": 0.00029887255218534063, + "loss": 6.2954, + "step": 4433 + }, + { + "epoch": 0.4137351870859382, + "grad_norm": 1.6218471909005965, + "learning_rate": 0.00029887162636244176, + "loss": 6.2985, + "step": 4434 + }, + { + "epoch": 0.4138284967808155, + "grad_norm": 1.297059875980226, + "learning_rate": 0.00029887070016100645, + "loss": 6.4659, + "step": 4435 + }, + { + "epoch": 0.4139218064756928, + "grad_norm": 0.9336120130908748, + "learning_rate": 0.00029886977358103707, + "loss": 6.2418, + "step": 4436 + }, + { + "epoch": 0.41401511617057013, + "grad_norm": 1.5861915486263716, + "learning_rate": 0.00029886884662253595, + "loss": 6.4022, + "step": 4437 + }, + { + "epoch": 0.41410842586544744, + "grad_norm": 1.4996249194664424, + "learning_rate": 0.0002988679192855054, + "loss": 6.521, + "step": 4438 + }, + { + "epoch": 0.4142017355603247, + "grad_norm": 1.453016161264509, + "learning_rate": 0.0002988669915699479, + "loss": 6.2338, + "step": 4439 + }, + { + "epoch": 0.414295045255202, + "grad_norm": 0.8938151469362965, + "learning_rate": 0.0002988660634758657, + "loss": 6.0746, + "step": 4440 + }, + { + "epoch": 0.4143883549500793, + "grad_norm": 1.5376435919515061, + "learning_rate": 0.00029886513500326117, + "loss": 5.5072, + "step": 4441 + }, + { + "epoch": 0.4144816646449566, + "grad_norm": 1.1735564318579337, + "learning_rate": 0.00029886420615213673, + "loss": 6.5353, + "step": 4442 + }, + { + "epoch": 0.4145749743398339, + "grad_norm": 1.3880409438190136, + "learning_rate": 0.0002988632769224947, + "loss": 5.7564, + "step": 4443 + }, + { + "epoch": 0.41466828403471123, + "grad_norm": 0.9613504395729549, + "learning_rate": 0.00029886234731433745, + "loss": 5.9309, + "step": 4444 + }, + { + "epoch": 0.4147615937295885, + "grad_norm": 1.2330762332623968, + "learning_rate": 0.0002988614173276674, + "loss": 6.5893, + "step": 4445 + }, + { + "epoch": 0.4148549034244658, + "grad_norm": 1.1174065719943593, + "learning_rate": 0.0002988604869624868, + "loss": 6.2127, + "step": 4446 + }, + { + "epoch": 0.4149482131193431, + "grad_norm": 1.1377211051582032, + "learning_rate": 0.00029885955621879813, + "loss": 6.284, + "step": 4447 + }, + { + "epoch": 0.4150415228142204, + "grad_norm": 1.2037009172943451, + "learning_rate": 0.00029885862509660365, + "loss": 6.2703, + "step": 4448 + }, + { + "epoch": 0.4151348325090977, + "grad_norm": 0.7670184049806007, + "learning_rate": 0.0002988576935959058, + "loss": 6.2491, + "step": 4449 + }, + { + "epoch": 0.41522814220397497, + "grad_norm": 1.046996702355772, + "learning_rate": 0.0002988567617167069, + "loss": 6.6268, + "step": 4450 + }, + { + "epoch": 0.4153214518988523, + "grad_norm": 0.9058058872139297, + "learning_rate": 0.0002988558294590094, + "loss": 6.1625, + "step": 4451 + }, + { + "epoch": 0.4154147615937296, + "grad_norm": 1.281854942981288, + "learning_rate": 0.0002988548968228156, + "loss": 6.2091, + "step": 4452 + }, + { + "epoch": 0.4155080712886069, + "grad_norm": 0.89825454962793, + "learning_rate": 0.0002988539638081279, + "loss": 5.6406, + "step": 4453 + }, + { + "epoch": 0.4156013809834842, + "grad_norm": 0.9991871645053975, + "learning_rate": 0.00029885303041494863, + "loss": 5.9955, + "step": 4454 + }, + { + "epoch": 0.41569469067836146, + "grad_norm": 1.1325394265482536, + "learning_rate": 0.00029885209664328024, + "loss": 6.4471, + "step": 4455 + }, + { + "epoch": 0.41578800037323876, + "grad_norm": 1.2396918256840905, + "learning_rate": 0.00029885116249312506, + "loss": 6.3456, + "step": 4456 + }, + { + "epoch": 0.4158813100681161, + "grad_norm": 1.3608225531597606, + "learning_rate": 0.0002988502279644855, + "loss": 6.1513, + "step": 4457 + }, + { + "epoch": 0.4159746197629934, + "grad_norm": 1.023564064868097, + "learning_rate": 0.0002988492930573638, + "loss": 6.2602, + "step": 4458 + }, + { + "epoch": 0.4160679294578707, + "grad_norm": 0.9625733004976113, + "learning_rate": 0.0002988483577717625, + "loss": 5.6915, + "step": 4459 + }, + { + "epoch": 0.416161239152748, + "grad_norm": 1.2740414127392472, + "learning_rate": 0.0002988474221076839, + "loss": 6.3439, + "step": 4460 + }, + { + "epoch": 0.41625454884762525, + "grad_norm": 4.842595832821185, + "learning_rate": 0.00029884648606513044, + "loss": 6.2406, + "step": 4461 + }, + { + "epoch": 0.41634785854250256, + "grad_norm": 2.0862075056931415, + "learning_rate": 0.00029884554964410443, + "loss": 6.4586, + "step": 4462 + }, + { + "epoch": 0.41644116823737987, + "grad_norm": 657.4752145745036, + "learning_rate": 0.00029884461284460827, + "loss": 5.6961, + "step": 4463 + }, + { + "epoch": 0.4165344779322572, + "grad_norm": 1.0880753250615371, + "learning_rate": 0.00029884367566664435, + "loss": 6.369, + "step": 4464 + }, + { + "epoch": 0.4166277876271345, + "grad_norm": 1.2966368204758412, + "learning_rate": 0.00029884273811021505, + "loss": 5.7705, + "step": 4465 + }, + { + "epoch": 0.41672109732201174, + "grad_norm": 1.5211802965520334, + "learning_rate": 0.00029884180017532275, + "loss": 6.0184, + "step": 4466 + }, + { + "epoch": 0.41681440701688904, + "grad_norm": 1.5925861985002403, + "learning_rate": 0.0002988408618619699, + "loss": 6.5818, + "step": 4467 + }, + { + "epoch": 0.41690771671176635, + "grad_norm": 1.9627607255934973, + "learning_rate": 0.00029883992317015877, + "loss": 6.8655, + "step": 4468 + }, + { + "epoch": 0.41700102640664366, + "grad_norm": 1.779642290721212, + "learning_rate": 0.00029883898409989185, + "loss": 6.5212, + "step": 4469 + }, + { + "epoch": 0.41709433610152097, + "grad_norm": 1.5308203513919247, + "learning_rate": 0.00029883804465117143, + "loss": 6.4736, + "step": 4470 + }, + { + "epoch": 0.4171876457963982, + "grad_norm": 1.1654362762139023, + "learning_rate": 0.000298837104824, + "loss": 6.4548, + "step": 4471 + }, + { + "epoch": 0.41728095549127553, + "grad_norm": 2.670924839548389, + "learning_rate": 0.0002988361646183799, + "loss": 6.5976, + "step": 4472 + }, + { + "epoch": 0.41737426518615284, + "grad_norm": 1.6667151028785334, + "learning_rate": 0.00029883522403431346, + "loss": 6.5324, + "step": 4473 + }, + { + "epoch": 0.41746757488103015, + "grad_norm": 1.8702016120319553, + "learning_rate": 0.0002988342830718032, + "loss": 6.5731, + "step": 4474 + }, + { + "epoch": 0.41756088457590745, + "grad_norm": 2.330717392992776, + "learning_rate": 0.0002988333417308514, + "loss": 6.7989, + "step": 4475 + }, + { + "epoch": 0.41765419427078476, + "grad_norm": 1.7336090100826196, + "learning_rate": 0.00029883240001146055, + "loss": 6.1827, + "step": 4476 + }, + { + "epoch": 0.417747503965662, + "grad_norm": 1.3630450920602772, + "learning_rate": 0.00029883145791363295, + "loss": 6.6627, + "step": 4477 + }, + { + "epoch": 0.4178408136605393, + "grad_norm": 3.4744057655230387, + "learning_rate": 0.00029883051543737104, + "loss": 6.7695, + "step": 4478 + }, + { + "epoch": 0.41793412335541663, + "grad_norm": 1.54132449899338, + "learning_rate": 0.00029882957258267726, + "loss": 6.3932, + "step": 4479 + }, + { + "epoch": 0.41802743305029394, + "grad_norm": 2.111663386915353, + "learning_rate": 0.0002988286293495539, + "loss": 6.3934, + "step": 4480 + }, + { + "epoch": 0.41812074274517125, + "grad_norm": 2.528910088946883, + "learning_rate": 0.0002988276857380035, + "loss": 6.2286, + "step": 4481 + }, + { + "epoch": 0.4182140524400485, + "grad_norm": 1.2183970835603677, + "learning_rate": 0.0002988267417480283, + "loss": 6.2928, + "step": 4482 + }, + { + "epoch": 0.4183073621349258, + "grad_norm": 1.479763525062313, + "learning_rate": 0.0002988257973796308, + "loss": 6.2479, + "step": 4483 + }, + { + "epoch": 0.4184006718298031, + "grad_norm": 3.7494165896340483, + "learning_rate": 0.0002988248526328134, + "loss": 6.4975, + "step": 4484 + }, + { + "epoch": 0.4184939815246804, + "grad_norm": 1.4524925319548374, + "learning_rate": 0.00029882390750757856, + "loss": 6.5968, + "step": 4485 + }, + { + "epoch": 0.41858729121955773, + "grad_norm": 1.4198473341347397, + "learning_rate": 0.0002988229620039285, + "loss": 6.0925, + "step": 4486 + }, + { + "epoch": 0.418680600914435, + "grad_norm": 2.1375073144041723, + "learning_rate": 0.00029882201612186574, + "loss": 6.6782, + "step": 4487 + }, + { + "epoch": 0.4187739106093123, + "grad_norm": 3.0203210430403593, + "learning_rate": 0.0002988210698613927, + "loss": 6.2988, + "step": 4488 + }, + { + "epoch": 0.4188672203041896, + "grad_norm": 1.4292077620691808, + "learning_rate": 0.00029882012322251177, + "loss": 6.3655, + "step": 4489 + }, + { + "epoch": 0.4189605299990669, + "grad_norm": 1.1674689234574587, + "learning_rate": 0.00029881917620522535, + "loss": 6.0466, + "step": 4490 + }, + { + "epoch": 0.4190538396939442, + "grad_norm": 1.683866968061916, + "learning_rate": 0.0002988182288095359, + "loss": 6.627, + "step": 4491 + }, + { + "epoch": 0.41914714938882147, + "grad_norm": 2.441297122240872, + "learning_rate": 0.0002988172810354457, + "loss": 6.248, + "step": 4492 + }, + { + "epoch": 0.4192404590836988, + "grad_norm": 1.5953943421587875, + "learning_rate": 0.0002988163328829573, + "loss": 6.1967, + "step": 4493 + }, + { + "epoch": 0.4193337687785761, + "grad_norm": 8.33488082072242, + "learning_rate": 0.000298815384352073, + "loss": 6.3345, + "step": 4494 + }, + { + "epoch": 0.4194270784734534, + "grad_norm": 23.487612593846038, + "learning_rate": 0.0002988144354427952, + "loss": 6.6132, + "step": 4495 + }, + { + "epoch": 0.4195203881683307, + "grad_norm": 1.9506177875011614, + "learning_rate": 0.0002988134861551265, + "loss": 6.516, + "step": 4496 + }, + { + "epoch": 0.419613697863208, + "grad_norm": 3.187716426991958, + "learning_rate": 0.00029881253648906907, + "loss": 6.7628, + "step": 4497 + }, + { + "epoch": 0.41970700755808527, + "grad_norm": 3.8108189209564034, + "learning_rate": 0.00029881158644462556, + "loss": 6.7352, + "step": 4498 + }, + { + "epoch": 0.4198003172529626, + "grad_norm": 2.418630964736474, + "learning_rate": 0.0002988106360217982, + "loss": 6.1077, + "step": 4499 + }, + { + "epoch": 0.4198936269478399, + "grad_norm": 138.55452344941102, + "learning_rate": 0.00029880968522058945, + "loss": 6.727, + "step": 4500 + }, + { + "epoch": 0.4199869366427172, + "grad_norm": 2.830516020736446, + "learning_rate": 0.0002988087340410018, + "loss": 6.7987, + "step": 4501 + }, + { + "epoch": 0.4200802463375945, + "grad_norm": 1.7510475471799707, + "learning_rate": 0.00029880778248303757, + "loss": 6.4186, + "step": 4502 + }, + { + "epoch": 0.42017355603247175, + "grad_norm": 5.898450753128356, + "learning_rate": 0.00029880683054669927, + "loss": 6.5373, + "step": 4503 + }, + { + "epoch": 0.42026686572734906, + "grad_norm": 2.9078644371257116, + "learning_rate": 0.00029880587823198927, + "loss": 6.9859, + "step": 4504 + }, + { + "epoch": 0.42036017542222637, + "grad_norm": 14.86997713239367, + "learning_rate": 0.00029880492553891006, + "loss": 6.6931, + "step": 4505 + }, + { + "epoch": 0.4204534851171037, + "grad_norm": 1.433731532039686, + "learning_rate": 0.0002988039724674639, + "loss": 6.5101, + "step": 4506 + }, + { + "epoch": 0.420546794811981, + "grad_norm": 1.6372172333217554, + "learning_rate": 0.0002988030190176534, + "loss": 6.719, + "step": 4507 + }, + { + "epoch": 0.42064010450685824, + "grad_norm": 2.0142097306020155, + "learning_rate": 0.00029880206518948085, + "loss": 6.866, + "step": 4508 + }, + { + "epoch": 0.42073341420173554, + "grad_norm": 1.7134290168933695, + "learning_rate": 0.00029880111098294876, + "loss": 6.0683, + "step": 4509 + }, + { + "epoch": 0.42082672389661285, + "grad_norm": 2.3317447339287303, + "learning_rate": 0.0002988001563980595, + "loss": 6.6378, + "step": 4510 + }, + { + "epoch": 0.42092003359149016, + "grad_norm": 1.4330716579597529, + "learning_rate": 0.0002987992014348155, + "loss": 6.7589, + "step": 4511 + }, + { + "epoch": 0.42101334328636747, + "grad_norm": 2.2477886666479177, + "learning_rate": 0.00029879824609321923, + "loss": 7.0198, + "step": 4512 + }, + { + "epoch": 0.4211066529812448, + "grad_norm": 2.0197416723866715, + "learning_rate": 0.0002987972903732731, + "loss": 6.9306, + "step": 4513 + }, + { + "epoch": 0.42119996267612203, + "grad_norm": 2.2404250693895458, + "learning_rate": 0.0002987963342749795, + "loss": 6.7219, + "step": 4514 + }, + { + "epoch": 0.42129327237099934, + "grad_norm": 8.473347811480114, + "learning_rate": 0.0002987953777983409, + "loss": 6.2645, + "step": 4515 + }, + { + "epoch": 0.42138658206587665, + "grad_norm": 1.6221799276443922, + "learning_rate": 0.0002987944209433598, + "loss": 6.6153, + "step": 4516 + }, + { + "epoch": 0.42147989176075396, + "grad_norm": 1.4456136883861037, + "learning_rate": 0.00029879346371003857, + "loss": 6.3394, + "step": 4517 + }, + { + "epoch": 0.42157320145563126, + "grad_norm": 1.7369483231841427, + "learning_rate": 0.0002987925060983796, + "loss": 6.378, + "step": 4518 + }, + { + "epoch": 0.4216665111505085, + "grad_norm": 1.8792166946143405, + "learning_rate": 0.00029879154810838536, + "loss": 6.5157, + "step": 4519 + }, + { + "epoch": 0.4217598208453858, + "grad_norm": 10193.59554524889, + "learning_rate": 0.0002987905897400583, + "loss": 6.454, + "step": 4520 + }, + { + "epoch": 0.42185313054026313, + "grad_norm": 1.6258692172247957, + "learning_rate": 0.00029878963099340086, + "loss": 6.5434, + "step": 4521 + }, + { + "epoch": 0.42194644023514044, + "grad_norm": 15.650968583849869, + "learning_rate": 0.0002987886718684154, + "loss": 6.7067, + "step": 4522 + }, + { + "epoch": 0.42203974993001775, + "grad_norm": 3.637178293928423, + "learning_rate": 0.00029878771236510447, + "loss": 6.6754, + "step": 4523 + }, + { + "epoch": 0.422133059624895, + "grad_norm": 2.7592155820650612, + "learning_rate": 0.00029878675248347044, + "loss": 6.9339, + "step": 4524 + }, + { + "epoch": 0.4222263693197723, + "grad_norm": 2.325668047885875, + "learning_rate": 0.00029878579222351577, + "loss": 6.957, + "step": 4525 + }, + { + "epoch": 0.4223196790146496, + "grad_norm": 3.646523823057819, + "learning_rate": 0.00029878483158524296, + "loss": 7.0496, + "step": 4526 + }, + { + "epoch": 0.4224129887095269, + "grad_norm": 6.298704700097966, + "learning_rate": 0.0002987838705686543, + "loss": 7.1955, + "step": 4527 + }, + { + "epoch": 0.42250629840440423, + "grad_norm": 3.662587364042801, + "learning_rate": 0.0002987829091737524, + "loss": 6.9929, + "step": 4528 + }, + { + "epoch": 0.42259960809928154, + "grad_norm": 2.1601488490378395, + "learning_rate": 0.0002987819474005396, + "loss": 7.0874, + "step": 4529 + }, + { + "epoch": 0.4226929177941588, + "grad_norm": 2.5070214621028617, + "learning_rate": 0.0002987809852490184, + "loss": 7.18, + "step": 4530 + }, + { + "epoch": 0.4227862274890361, + "grad_norm": 4.3944061495989555, + "learning_rate": 0.00029878002271919123, + "loss": 7.0743, + "step": 4531 + }, + { + "epoch": 0.4228795371839134, + "grad_norm": 6.183465692710332, + "learning_rate": 0.00029877905981106056, + "loss": 7.1681, + "step": 4532 + }, + { + "epoch": 0.4229728468787907, + "grad_norm": 9.777999144230206, + "learning_rate": 0.00029877809652462876, + "loss": 7.2304, + "step": 4533 + }, + { + "epoch": 0.42306615657366803, + "grad_norm": 4.604141836925342, + "learning_rate": 0.0002987771328598984, + "loss": 6.9153, + "step": 4534 + }, + { + "epoch": 0.4231594662685453, + "grad_norm": 16.447164811388895, + "learning_rate": 0.0002987761688168718, + "loss": 6.8551, + "step": 4535 + }, + { + "epoch": 0.4232527759634226, + "grad_norm": 2.892789258186517, + "learning_rate": 0.00029877520439555147, + "loss": 7.3372, + "step": 4536 + }, + { + "epoch": 0.4233460856582999, + "grad_norm": 3.2630580600142207, + "learning_rate": 0.0002987742395959399, + "loss": 7.1533, + "step": 4537 + }, + { + "epoch": 0.4234393953531772, + "grad_norm": 2.9016123991766194, + "learning_rate": 0.0002987732744180395, + "loss": 7.384, + "step": 4538 + }, + { + "epoch": 0.4235327050480545, + "grad_norm": 15.263525450918921, + "learning_rate": 0.0002987723088618527, + "loss": 7.0711, + "step": 4539 + }, + { + "epoch": 0.42362601474293177, + "grad_norm": 26.715940790362566, + "learning_rate": 0.000298771342927382, + "loss": 6.6729, + "step": 4540 + }, + { + "epoch": 0.4237193244378091, + "grad_norm": 2.454524866087036, + "learning_rate": 0.00029877037661462985, + "loss": 7.3936, + "step": 4541 + }, + { + "epoch": 0.4238126341326864, + "grad_norm": 2.204742388519253, + "learning_rate": 0.00029876940992359875, + "loss": 7.2205, + "step": 4542 + }, + { + "epoch": 0.4239059438275637, + "grad_norm": 3.56831247185155, + "learning_rate": 0.0002987684428542911, + "loss": 6.988, + "step": 4543 + }, + { + "epoch": 0.423999253522441, + "grad_norm": 1.9755586252638502, + "learning_rate": 0.0002987674754067093, + "loss": 7.2243, + "step": 4544 + }, + { + "epoch": 0.42409256321731825, + "grad_norm": 3.147088024774465, + "learning_rate": 0.0002987665075808559, + "loss": 7.3041, + "step": 4545 + }, + { + "epoch": 0.42418587291219556, + "grad_norm": 2.7610416370836233, + "learning_rate": 0.00029876553937673337, + "loss": 6.9804, + "step": 4546 + }, + { + "epoch": 0.42427918260707287, + "grad_norm": 1.9486222446068244, + "learning_rate": 0.0002987645707943441, + "loss": 7.3511, + "step": 4547 + }, + { + "epoch": 0.4243724923019502, + "grad_norm": 1.5006265499668554, + "learning_rate": 0.00029876360183369064, + "loss": 7.2649, + "step": 4548 + }, + { + "epoch": 0.4244658019968275, + "grad_norm": 2.1738504886087524, + "learning_rate": 0.0002987626324947754, + "loss": 7.1985, + "step": 4549 + }, + { + "epoch": 0.4245591116917048, + "grad_norm": 13.068371114308956, + "learning_rate": 0.00029876166277760085, + "loss": 7.6213, + "step": 4550 + }, + { + "epoch": 0.42465242138658205, + "grad_norm": 2.2728418028880197, + "learning_rate": 0.00029876069268216946, + "loss": 7.236, + "step": 4551 + }, + { + "epoch": 0.42474573108145935, + "grad_norm": 73.88955423560488, + "learning_rate": 0.00029875972220848367, + "loss": 7.0186, + "step": 4552 + }, + { + "epoch": 0.42483904077633666, + "grad_norm": 5.376722647673379, + "learning_rate": 0.0002987587513565459, + "loss": 7.2917, + "step": 4553 + }, + { + "epoch": 0.42493235047121397, + "grad_norm": 95.91840218787354, + "learning_rate": 0.0002987577801263589, + "loss": 7.0804, + "step": 4554 + }, + { + "epoch": 0.4250256601660913, + "grad_norm": 2.5801244957438803, + "learning_rate": 0.00029875680851792477, + "loss": 7.045, + "step": 4555 + }, + { + "epoch": 0.42511896986096853, + "grad_norm": 4.318260576153903, + "learning_rate": 0.0002987558365312462, + "loss": 7.0141, + "step": 4556 + }, + { + "epoch": 0.42521227955584584, + "grad_norm": 4.9704477173312585, + "learning_rate": 0.00029875486416632555, + "loss": 6.9877, + "step": 4557 + }, + { + "epoch": 0.42530558925072315, + "grad_norm": 1.639994591164919, + "learning_rate": 0.00029875389142316534, + "loss": 6.5332, + "step": 4558 + }, + { + "epoch": 0.42539889894560046, + "grad_norm": 3546.675213234011, + "learning_rate": 0.00029875291830176807, + "loss": 6.9878, + "step": 4559 + }, + { + "epoch": 0.42549220864047776, + "grad_norm": 2.337348253882495, + "learning_rate": 0.00029875194480213625, + "loss": 7.3435, + "step": 4560 + }, + { + "epoch": 0.425585518335355, + "grad_norm": 7.675518741566026, + "learning_rate": 0.0002987509709242722, + "loss": 6.9301, + "step": 4561 + }, + { + "epoch": 0.4256788280302323, + "grad_norm": 2.2333481084408127, + "learning_rate": 0.0002987499966681786, + "loss": 7.0669, + "step": 4562 + }, + { + "epoch": 0.42577213772510963, + "grad_norm": 3.462931210157374, + "learning_rate": 0.00029874902203385777, + "loss": 7.1144, + "step": 4563 + }, + { + "epoch": 0.42586544741998694, + "grad_norm": 782.6943097706175, + "learning_rate": 0.00029874804702131224, + "loss": 7.0244, + "step": 4564 + }, + { + "epoch": 0.42595875711486425, + "grad_norm": 2.943313203965607, + "learning_rate": 0.00029874707163054446, + "loss": 6.9678, + "step": 4565 + }, + { + "epoch": 0.42605206680974156, + "grad_norm": 3.9613719162494236, + "learning_rate": 0.000298746095861557, + "loss": 7.198, + "step": 4566 + }, + { + "epoch": 0.4261453765046188, + "grad_norm": 2.242122406711647, + "learning_rate": 0.00029874511971435226, + "loss": 7.0382, + "step": 4567 + }, + { + "epoch": 0.4262386861994961, + "grad_norm": 23.016348152404415, + "learning_rate": 0.0002987441431889327, + "loss": 7.1186, + "step": 4568 + }, + { + "epoch": 0.42633199589437343, + "grad_norm": 3.465418323894953, + "learning_rate": 0.00029874316628530087, + "loss": 6.9156, + "step": 4569 + }, + { + "epoch": 0.42642530558925074, + "grad_norm": 6328422.094536547, + "learning_rate": 0.00029874218900345927, + "loss": 6.6709, + "step": 4570 + }, + { + "epoch": 0.42651861528412804, + "grad_norm": 3.5058586401926246, + "learning_rate": 0.00029874121134341027, + "loss": 7.0478, + "step": 4571 + }, + { + "epoch": 0.4266119249790053, + "grad_norm": 21.656842681634927, + "learning_rate": 0.0002987402333051565, + "loss": 6.6812, + "step": 4572 + }, + { + "epoch": 0.4267052346738826, + "grad_norm": 22.901713400040993, + "learning_rate": 0.00029873925488870033, + "loss": 6.9478, + "step": 4573 + }, + { + "epoch": 0.4267985443687599, + "grad_norm": 1.292326019916219, + "learning_rate": 0.0002987382760940443, + "loss": 6.61, + "step": 4574 + }, + { + "epoch": 0.4268918540636372, + "grad_norm": 21.320864039081627, + "learning_rate": 0.0002987372969211909, + "loss": 6.4977, + "step": 4575 + }, + { + "epoch": 0.42698516375851453, + "grad_norm": 2.0978992850923235, + "learning_rate": 0.0002987363173701426, + "loss": 7.0724, + "step": 4576 + }, + { + "epoch": 0.4270784734533918, + "grad_norm": 1.7205059983120683, + "learning_rate": 0.00029873533744090194, + "loss": 6.613, + "step": 4577 + }, + { + "epoch": 0.4271717831482691, + "grad_norm": 7.185607140684328, + "learning_rate": 0.00029873435713347133, + "loss": 6.6704, + "step": 4578 + }, + { + "epoch": 0.4272650928431464, + "grad_norm": 2.9345358175603127, + "learning_rate": 0.00029873337644785333, + "loss": 6.8623, + "step": 4579 + }, + { + "epoch": 0.4273584025380237, + "grad_norm": 169.2277337535444, + "learning_rate": 0.00029873239538405044, + "loss": 7.1427, + "step": 4580 + }, + { + "epoch": 0.427451712232901, + "grad_norm": 2.9237024823139692, + "learning_rate": 0.00029873141394206514, + "loss": 6.8348, + "step": 4581 + }, + { + "epoch": 0.4275450219277783, + "grad_norm": 4.163737425191498, + "learning_rate": 0.0002987304321218998, + "loss": 7.0669, + "step": 4582 + }, + { + "epoch": 0.4276383316226556, + "grad_norm": 2.8972762376975894, + "learning_rate": 0.00029872944992355714, + "loss": 7.1471, + "step": 4583 + }, + { + "epoch": 0.4277316413175329, + "grad_norm": 1052.9179733267433, + "learning_rate": 0.0002987284673470395, + "loss": 6.9617, + "step": 4584 + }, + { + "epoch": 0.4278249510124102, + "grad_norm": 8410.38427664401, + "learning_rate": 0.00029872748439234946, + "loss": 6.8908, + "step": 4585 + }, + { + "epoch": 0.4279182607072875, + "grad_norm": 808.8219655083536, + "learning_rate": 0.00029872650105948944, + "loss": 6.9455, + "step": 4586 + }, + { + "epoch": 0.4280115704021648, + "grad_norm": 3279.972723478519, + "learning_rate": 0.000298725517348462, + "loss": 7.1858, + "step": 4587 + }, + { + "epoch": 0.42810488009704206, + "grad_norm": 105.08260374000861, + "learning_rate": 0.00029872453325926964, + "loss": 7.237, + "step": 4588 + }, + { + "epoch": 0.42819818979191937, + "grad_norm": 814.4719702588036, + "learning_rate": 0.00029872354879191483, + "loss": 6.987, + "step": 4589 + }, + { + "epoch": 0.4282914994867967, + "grad_norm": 1.5120464583952045, + "learning_rate": 0.00029872256394640007, + "loss": 7.433, + "step": 4590 + }, + { + "epoch": 0.428384809181674, + "grad_norm": 2.3377143990032607, + "learning_rate": 0.00029872157872272794, + "loss": 7.4835, + "step": 4591 + }, + { + "epoch": 0.4284781188765513, + "grad_norm": 1.353691555055379, + "learning_rate": 0.00029872059312090085, + "loss": 6.7438, + "step": 4592 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 2.6995823719832495, + "learning_rate": 0.00029871960714092133, + "loss": 7.0997, + "step": 4593 + }, + { + "epoch": 0.42866473826630586, + "grad_norm": 1.910328017431076, + "learning_rate": 0.00029871862078279193, + "loss": 6.9782, + "step": 4594 + }, + { + "epoch": 0.42875804796118316, + "grad_norm": 3.17648550475911, + "learning_rate": 0.0002987176340465151, + "loss": 7.3534, + "step": 4595 + }, + { + "epoch": 0.4288513576560605, + "grad_norm": 3.6409429253579195, + "learning_rate": 0.00029871664693209343, + "loss": 6.8665, + "step": 4596 + }, + { + "epoch": 0.4289446673509378, + "grad_norm": 47.11448831934271, + "learning_rate": 0.0002987156594395293, + "loss": 7.295, + "step": 4597 + }, + { + "epoch": 0.42903797704581503, + "grad_norm": 2.6775640094369058, + "learning_rate": 0.00029871467156882545, + "loss": 6.9524, + "step": 4598 + }, + { + "epoch": 0.42913128674069234, + "grad_norm": 7.5649810287304815, + "learning_rate": 0.0002987136833199841, + "loss": 7.469, + "step": 4599 + }, + { + "epoch": 0.42922459643556965, + "grad_norm": 2.784096562979179, + "learning_rate": 0.000298712694693008, + "loss": 6.5375, + "step": 4600 + }, + { + "epoch": 0.42931790613044696, + "grad_norm": 1.459512995830534, + "learning_rate": 0.0002987117056878995, + "loss": 6.9396, + "step": 4601 + }, + { + "epoch": 0.42941121582532427, + "grad_norm": 2.8658046362850293, + "learning_rate": 0.0002987107163046612, + "loss": 7.207, + "step": 4602 + }, + { + "epoch": 0.4295045255202016, + "grad_norm": 10.847826753091995, + "learning_rate": 0.0002987097265432956, + "loss": 7.3716, + "step": 4603 + }, + { + "epoch": 0.4295978352150788, + "grad_norm": 23.250619987621086, + "learning_rate": 0.00029870873640380524, + "loss": 7.0082, + "step": 4604 + }, + { + "epoch": 0.42969114490995614, + "grad_norm": 1.7048318867928793, + "learning_rate": 0.0002987077458861926, + "loss": 7.0337, + "step": 4605 + }, + { + "epoch": 0.42978445460483344, + "grad_norm": 3.795649287502455, + "learning_rate": 0.0002987067549904602, + "loss": 6.8396, + "step": 4606 + }, + { + "epoch": 0.42987776429971075, + "grad_norm": 3.027892415505643, + "learning_rate": 0.0002987057637166106, + "loss": 6.9894, + "step": 4607 + }, + { + "epoch": 0.42997107399458806, + "grad_norm": 3.8816363986235607, + "learning_rate": 0.00029870477206464627, + "loss": 7.2789, + "step": 4608 + }, + { + "epoch": 0.4300643836894653, + "grad_norm": 3.0471566993337538, + "learning_rate": 0.00029870378003456975, + "loss": 6.5509, + "step": 4609 + }, + { + "epoch": 0.4301576933843426, + "grad_norm": 1.6698902468963566, + "learning_rate": 0.0002987027876263836, + "loss": 6.8485, + "step": 4610 + }, + { + "epoch": 0.43025100307921993, + "grad_norm": 1.1344372798752171, + "learning_rate": 0.0002987017948400903, + "loss": 6.8936, + "step": 4611 + }, + { + "epoch": 0.43034431277409724, + "grad_norm": 2.6409960102648933, + "learning_rate": 0.00029870080167569235, + "loss": 7.0227, + "step": 4612 + }, + { + "epoch": 0.43043762246897455, + "grad_norm": 1.5960224112232397, + "learning_rate": 0.00029869980813319234, + "loss": 6.963, + "step": 4613 + }, + { + "epoch": 0.4305309321638518, + "grad_norm": 2.9650183645715242, + "learning_rate": 0.00029869881421259277, + "loss": 6.6571, + "step": 4614 + }, + { + "epoch": 0.4306242418587291, + "grad_norm": 1.4855320428216277, + "learning_rate": 0.00029869781991389617, + "loss": 6.6578, + "step": 4615 + }, + { + "epoch": 0.4307175515536064, + "grad_norm": 8.773249869906635, + "learning_rate": 0.000298696825237105, + "loss": 6.6131, + "step": 4616 + }, + { + "epoch": 0.4308108612484837, + "grad_norm": 1.8552812845391966, + "learning_rate": 0.00029869583018222193, + "loss": 7.0526, + "step": 4617 + }, + { + "epoch": 0.43090417094336103, + "grad_norm": 1.209530371298808, + "learning_rate": 0.00029869483474924933, + "loss": 6.5205, + "step": 4618 + }, + { + "epoch": 0.43099748063823834, + "grad_norm": 3.7775741780908674, + "learning_rate": 0.00029869383893818985, + "loss": 6.2638, + "step": 4619 + }, + { + "epoch": 0.4310907903331156, + "grad_norm": 1.9304235787733632, + "learning_rate": 0.00029869284274904604, + "loss": 6.5724, + "step": 4620 + }, + { + "epoch": 0.4311841000279929, + "grad_norm": 1.2131691793658672, + "learning_rate": 0.00029869184618182027, + "loss": 6.9009, + "step": 4621 + }, + { + "epoch": 0.4312774097228702, + "grad_norm": 1.91811616038846, + "learning_rate": 0.00029869084923651526, + "loss": 6.8326, + "step": 4622 + }, + { + "epoch": 0.4313707194177475, + "grad_norm": 0.9217008461753621, + "learning_rate": 0.00029868985191313345, + "loss": 6.5815, + "step": 4623 + }, + { + "epoch": 0.4314640291126248, + "grad_norm": 2.9840649420234113, + "learning_rate": 0.0002986888542116774, + "loss": 6.6134, + "step": 4624 + }, + { + "epoch": 0.4315573388075021, + "grad_norm": 2.8942458527848087, + "learning_rate": 0.0002986878561321496, + "loss": 6.2936, + "step": 4625 + }, + { + "epoch": 0.4316506485023794, + "grad_norm": 5.397496593668857, + "learning_rate": 0.0002986868576745527, + "loss": 6.5815, + "step": 4626 + }, + { + "epoch": 0.4317439581972567, + "grad_norm": 1.327412158521904, + "learning_rate": 0.0002986858588388891, + "loss": 6.6363, + "step": 4627 + }, + { + "epoch": 0.431837267892134, + "grad_norm": 1.2966176706242212, + "learning_rate": 0.0002986848596251614, + "loss": 6.5889, + "step": 4628 + }, + { + "epoch": 0.4319305775870113, + "grad_norm": 1.6702503738880836, + "learning_rate": 0.00029868386003337216, + "loss": 6.8259, + "step": 4629 + }, + { + "epoch": 0.43202388728188856, + "grad_norm": 2.0431224025027164, + "learning_rate": 0.0002986828600635239, + "loss": 7.1236, + "step": 4630 + }, + { + "epoch": 0.43211719697676587, + "grad_norm": 1.5252660820638482, + "learning_rate": 0.00029868185971561925, + "loss": 6.3558, + "step": 4631 + }, + { + "epoch": 0.4322105066716432, + "grad_norm": 1.995702576603371, + "learning_rate": 0.00029868085898966055, + "loss": 6.2501, + "step": 4632 + }, + { + "epoch": 0.4323038163665205, + "grad_norm": 1.7344983264374525, + "learning_rate": 0.00029867985788565053, + "loss": 6.3416, + "step": 4633 + }, + { + "epoch": 0.4323971260613978, + "grad_norm": 1.4617528414204455, + "learning_rate": 0.0002986788564035917, + "loss": 6.6354, + "step": 4634 + }, + { + "epoch": 0.4324904357562751, + "grad_norm": 16.039472225174404, + "learning_rate": 0.00029867785454348655, + "loss": 6.8023, + "step": 4635 + }, + { + "epoch": 0.43258374545115236, + "grad_norm": 1.0729968768021185, + "learning_rate": 0.00029867685230533763, + "loss": 7.0267, + "step": 4636 + }, + { + "epoch": 0.43267705514602967, + "grad_norm": 265.1619837364344, + "learning_rate": 0.00029867584968914753, + "loss": 6.706, + "step": 4637 + }, + { + "epoch": 0.432770364840907, + "grad_norm": 2.464230102247246, + "learning_rate": 0.00029867484669491884, + "loss": 6.8269, + "step": 4638 + }, + { + "epoch": 0.4328636745357843, + "grad_norm": 420.2116724089941, + "learning_rate": 0.00029867384332265396, + "loss": 6.7885, + "step": 4639 + }, + { + "epoch": 0.4329569842306616, + "grad_norm": 1.575002053617238, + "learning_rate": 0.00029867283957235564, + "loss": 6.5237, + "step": 4640 + }, + { + "epoch": 0.43305029392553884, + "grad_norm": 2.5822943529278612, + "learning_rate": 0.00029867183544402627, + "loss": 7.0548, + "step": 4641 + }, + { + "epoch": 0.43314360362041615, + "grad_norm": 1.7169312112203072, + "learning_rate": 0.00029867083093766846, + "loss": 6.5782, + "step": 4642 + }, + { + "epoch": 0.43323691331529346, + "grad_norm": 2.0236185007866325, + "learning_rate": 0.0002986698260532848, + "loss": 6.9152, + "step": 4643 + }, + { + "epoch": 0.43333022301017077, + "grad_norm": 2.455892295456273, + "learning_rate": 0.0002986688207908778, + "loss": 6.5939, + "step": 4644 + }, + { + "epoch": 0.4334235327050481, + "grad_norm": 1.1656050119514723, + "learning_rate": 0.00029866781515045, + "loss": 6.5467, + "step": 4645 + }, + { + "epoch": 0.43351684239992533, + "grad_norm": 2.756233088303948, + "learning_rate": 0.00029866680913200397, + "loss": 6.5277, + "step": 4646 + }, + { + "epoch": 0.43361015209480264, + "grad_norm": 2.7616452917027345, + "learning_rate": 0.0002986658027355423, + "loss": 6.7757, + "step": 4647 + }, + { + "epoch": 0.43370346178967994, + "grad_norm": 11.406951105693928, + "learning_rate": 0.0002986647959610676, + "loss": 6.776, + "step": 4648 + }, + { + "epoch": 0.43379677148455725, + "grad_norm": 527.6838751739904, + "learning_rate": 0.0002986637888085823, + "loss": 6.6786, + "step": 4649 + }, + { + "epoch": 0.43389008117943456, + "grad_norm": 7.290353783891779, + "learning_rate": 0.00029866278127808904, + "loss": 6.735, + "step": 4650 + }, + { + "epoch": 0.4339833908743118, + "grad_norm": 3.8464631952421033, + "learning_rate": 0.00029866177336959035, + "loss": 6.6734, + "step": 4651 + }, + { + "epoch": 0.4340767005691891, + "grad_norm": 6.495659103538366, + "learning_rate": 0.0002986607650830888, + "loss": 6.9783, + "step": 4652 + }, + { + "epoch": 0.43417001026406643, + "grad_norm": 2.496153693096699, + "learning_rate": 0.000298659756418587, + "loss": 7.1875, + "step": 4653 + }, + { + "epoch": 0.43426331995894374, + "grad_norm": 2.2641675812545663, + "learning_rate": 0.00029865874737608744, + "loss": 6.691, + "step": 4654 + }, + { + "epoch": 0.43435662965382105, + "grad_norm": 3.4436743848337086, + "learning_rate": 0.00029865773795559276, + "loss": 7.1471, + "step": 4655 + }, + { + "epoch": 0.43444993934869836, + "grad_norm": 3.0256546117503573, + "learning_rate": 0.0002986567281571054, + "loss": 6.6295, + "step": 4656 + }, + { + "epoch": 0.4345432490435756, + "grad_norm": 1.9033808646415542, + "learning_rate": 0.0002986557179806281, + "loss": 6.6229, + "step": 4657 + }, + { + "epoch": 0.4346365587384529, + "grad_norm": 3.203243669754278, + "learning_rate": 0.0002986547074261633, + "loss": 6.5364, + "step": 4658 + }, + { + "epoch": 0.4347298684333302, + "grad_norm": 2.4137092025758617, + "learning_rate": 0.00029865369649371363, + "loss": 6.9098, + "step": 4659 + }, + { + "epoch": 0.43482317812820753, + "grad_norm": 2.3007424084554016, + "learning_rate": 0.00029865268518328164, + "loss": 6.803, + "step": 4660 + }, + { + "epoch": 0.43491648782308484, + "grad_norm": 2.6394993256686745, + "learning_rate": 0.00029865167349486994, + "loss": 6.892, + "step": 4661 + }, + { + "epoch": 0.4350097975179621, + "grad_norm": 2.3827710643922386, + "learning_rate": 0.000298650661428481, + "loss": 6.9981, + "step": 4662 + }, + { + "epoch": 0.4351031072128394, + "grad_norm": 2.3701505996730994, + "learning_rate": 0.00029864964898411754, + "loss": 6.879, + "step": 4663 + }, + { + "epoch": 0.4351964169077167, + "grad_norm": 2.1043671634959993, + "learning_rate": 0.00029864863616178204, + "loss": 6.7208, + "step": 4664 + }, + { + "epoch": 0.435289726602594, + "grad_norm": 1.6862354454682458, + "learning_rate": 0.00029864762296147703, + "loss": 6.3964, + "step": 4665 + }, + { + "epoch": 0.4353830362974713, + "grad_norm": 2.278519147228464, + "learning_rate": 0.0002986466093832052, + "loss": 6.6488, + "step": 4666 + }, + { + "epoch": 0.4354763459923486, + "grad_norm": 6027.107745010831, + "learning_rate": 0.00029864559542696907, + "loss": 6.3561, + "step": 4667 + }, + { + "epoch": 0.4355696556872259, + "grad_norm": 1.3736815487017953, + "learning_rate": 0.0002986445810927712, + "loss": 6.5179, + "step": 4668 + }, + { + "epoch": 0.4356629653821032, + "grad_norm": 2.2127043827562005, + "learning_rate": 0.0002986435663806142, + "loss": 6.9775, + "step": 4669 + }, + { + "epoch": 0.4357562750769805, + "grad_norm": 3.439748076360214, + "learning_rate": 0.00029864255129050067, + "loss": 7.1824, + "step": 4670 + }, + { + "epoch": 0.4358495847718578, + "grad_norm": 2.502473894158177, + "learning_rate": 0.0002986415358224332, + "loss": 6.6788, + "step": 4671 + }, + { + "epoch": 0.4359428944667351, + "grad_norm": 877.2435357252139, + "learning_rate": 0.0002986405199764142, + "loss": 7.3226, + "step": 4672 + }, + { + "epoch": 0.4360362041616124, + "grad_norm": 2.600134831889146, + "learning_rate": 0.00029863950375244645, + "loss": 7.3622, + "step": 4673 + }, + { + "epoch": 0.4361295138564897, + "grad_norm": 8.8887501928463, + "learning_rate": 0.00029863848715053253, + "loss": 6.9712, + "step": 4674 + }, + { + "epoch": 0.436222823551367, + "grad_norm": 1.201072981898165, + "learning_rate": 0.0002986374701706749, + "loss": 6.5419, + "step": 4675 + }, + { + "epoch": 0.4363161332462443, + "grad_norm": 64.64840125385709, + "learning_rate": 0.00029863645281287624, + "loss": 6.9304, + "step": 4676 + }, + { + "epoch": 0.4364094429411216, + "grad_norm": 1.8562961764901047, + "learning_rate": 0.00029863543507713914, + "loss": 6.7903, + "step": 4677 + }, + { + "epoch": 0.43650275263599886, + "grad_norm": 1.5920079040139847, + "learning_rate": 0.00029863441696346613, + "loss": 7.3821, + "step": 4678 + }, + { + "epoch": 0.43659606233087617, + "grad_norm": 1.5731572295796306, + "learning_rate": 0.0002986333984718598, + "loss": 7.0582, + "step": 4679 + }, + { + "epoch": 0.4366893720257535, + "grad_norm": 1.7652357252991107, + "learning_rate": 0.0002986323796023228, + "loss": 7.3301, + "step": 4680 + }, + { + "epoch": 0.4367826817206308, + "grad_norm": 3.294109685822627, + "learning_rate": 0.0002986313603548577, + "loss": 7.0887, + "step": 4681 + }, + { + "epoch": 0.4368759914155081, + "grad_norm": 1.945793313494201, + "learning_rate": 0.000298630340729467, + "loss": 6.897, + "step": 4682 + }, + { + "epoch": 0.43696930111038534, + "grad_norm": 2.389571689178921, + "learning_rate": 0.00029862932072615344, + "loss": 7.2224, + "step": 4683 + }, + { + "epoch": 0.43706261080526265, + "grad_norm": 1.2060467866934619, + "learning_rate": 0.00029862830034491956, + "loss": 7.3176, + "step": 4684 + }, + { + "epoch": 0.43715592050013996, + "grad_norm": 2.3776821862948023, + "learning_rate": 0.00029862727958576793, + "loss": 6.8905, + "step": 4685 + }, + { + "epoch": 0.43724923019501727, + "grad_norm": 2.5805949834076047, + "learning_rate": 0.00029862625844870105, + "loss": 7.3124, + "step": 4686 + }, + { + "epoch": 0.4373425398898946, + "grad_norm": 2.3377739733985172, + "learning_rate": 0.00029862523693372173, + "loss": 6.8412, + "step": 4687 + }, + { + "epoch": 0.4374358495847719, + "grad_norm": 2.8072768299851276, + "learning_rate": 0.0002986242150408324, + "loss": 7.1411, + "step": 4688 + }, + { + "epoch": 0.43752915927964914, + "grad_norm": 1.4191716019439347, + "learning_rate": 0.0002986231927700358, + "loss": 7.2584, + "step": 4689 + }, + { + "epoch": 0.43762246897452645, + "grad_norm": 2.3315583807247013, + "learning_rate": 0.0002986221701213344, + "loss": 7.1192, + "step": 4690 + }, + { + "epoch": 0.43771577866940375, + "grad_norm": 1.4562240072948476, + "learning_rate": 0.00029862114709473087, + "loss": 7.055, + "step": 4691 + }, + { + "epoch": 0.43780908836428106, + "grad_norm": 4.333357162839777, + "learning_rate": 0.0002986201236902277, + "loss": 7.3255, + "step": 4692 + }, + { + "epoch": 0.43790239805915837, + "grad_norm": 1.12472436616639, + "learning_rate": 0.00029861909990782764, + "loss": 6.9698, + "step": 4693 + }, + { + "epoch": 0.4379957077540356, + "grad_norm": 2.191017050839031, + "learning_rate": 0.00029861807574753325, + "loss": 6.8402, + "step": 4694 + }, + { + "epoch": 0.43808901744891293, + "grad_norm": 1.6309006266238666, + "learning_rate": 0.00029861705120934714, + "loss": 6.6869, + "step": 4695 + }, + { + "epoch": 0.43818232714379024, + "grad_norm": 1.8924619166644852, + "learning_rate": 0.00029861602629327184, + "loss": 7.1146, + "step": 4696 + }, + { + "epoch": 0.43827563683866755, + "grad_norm": 1.1039612287387521, + "learning_rate": 0.00029861500099930997, + "loss": 7.0393, + "step": 4697 + }, + { + "epoch": 0.43836894653354486, + "grad_norm": 1.9718984896188585, + "learning_rate": 0.0002986139753274643, + "loss": 7.2272, + "step": 4698 + }, + { + "epoch": 0.4384622562284221, + "grad_norm": 1.1086006374374189, + "learning_rate": 0.0002986129492777372, + "loss": 6.7141, + "step": 4699 + }, + { + "epoch": 0.4385555659232994, + "grad_norm": 1.859649661841181, + "learning_rate": 0.00029861192285013144, + "loss": 7.0269, + "step": 4700 + }, + { + "epoch": 0.4386488756181767, + "grad_norm": 2.214950235651908, + "learning_rate": 0.00029861089604464955, + "loss": 6.9517, + "step": 4701 + }, + { + "epoch": 0.43874218531305403, + "grad_norm": 1.5000910143844761, + "learning_rate": 0.00029860986886129425, + "loss": 7.0275, + "step": 4702 + }, + { + "epoch": 0.43883549500793134, + "grad_norm": 1.3192041348498789, + "learning_rate": 0.000298608841300068, + "loss": 7.116, + "step": 4703 + }, + { + "epoch": 0.4389288047028086, + "grad_norm": 1.720493881132413, + "learning_rate": 0.0002986078133609735, + "loss": 6.9464, + "step": 4704 + }, + { + "epoch": 0.4390221143976859, + "grad_norm": 1.1612525488347174, + "learning_rate": 0.00029860678504401334, + "loss": 6.8837, + "step": 4705 + }, + { + "epoch": 0.4391154240925632, + "grad_norm": 0.962522234738292, + "learning_rate": 0.00029860575634919017, + "loss": 6.5662, + "step": 4706 + }, + { + "epoch": 0.4392087337874405, + "grad_norm": 1.2744009511453356, + "learning_rate": 0.0002986047272765066, + "loss": 6.7433, + "step": 4707 + }, + { + "epoch": 0.4393020434823178, + "grad_norm": 1.4889725033729713, + "learning_rate": 0.00029860369782596524, + "loss": 6.6527, + "step": 4708 + }, + { + "epoch": 0.43939535317719514, + "grad_norm": 1.4617033988211878, + "learning_rate": 0.0002986026679975687, + "loss": 6.9577, + "step": 4709 + }, + { + "epoch": 0.4394886628720724, + "grad_norm": 1.2530686777402043, + "learning_rate": 0.0002986016377913195, + "loss": 7.2368, + "step": 4710 + }, + { + "epoch": 0.4395819725669497, + "grad_norm": 1.0348625660715727, + "learning_rate": 0.0002986006072072204, + "loss": 6.6877, + "step": 4711 + }, + { + "epoch": 0.439675282261827, + "grad_norm": 1.110359831356295, + "learning_rate": 0.000298599576245274, + "loss": 6.6503, + "step": 4712 + }, + { + "epoch": 0.4397685919567043, + "grad_norm": 1.1657409811497468, + "learning_rate": 0.0002985985449054829, + "loss": 6.7095, + "step": 4713 + }, + { + "epoch": 0.4398619016515816, + "grad_norm": 2.611476926879591, + "learning_rate": 0.0002985975131878497, + "loss": 7.1326, + "step": 4714 + }, + { + "epoch": 0.4399552113464589, + "grad_norm": 1.191399928711461, + "learning_rate": 0.000298596481092377, + "loss": 6.894, + "step": 4715 + }, + { + "epoch": 0.4400485210413362, + "grad_norm": 1.6960649541534378, + "learning_rate": 0.00029859544861906754, + "loss": 6.9163, + "step": 4716 + }, + { + "epoch": 0.4401418307362135, + "grad_norm": 6.3559046604476785, + "learning_rate": 0.00029859441576792384, + "loss": 6.8535, + "step": 4717 + }, + { + "epoch": 0.4402351404310908, + "grad_norm": 1.879483721259231, + "learning_rate": 0.0002985933825389486, + "loss": 6.7693, + "step": 4718 + }, + { + "epoch": 0.4403284501259681, + "grad_norm": 15.386860120993157, + "learning_rate": 0.00029859234893214433, + "loss": 7.0051, + "step": 4719 + }, + { + "epoch": 0.44042175982084536, + "grad_norm": 1.5401193598509078, + "learning_rate": 0.0002985913149475138, + "loss": 6.5179, + "step": 4720 + }, + { + "epoch": 0.44051506951572267, + "grad_norm": 2.0757707276059123, + "learning_rate": 0.0002985902805850595, + "loss": 6.9757, + "step": 4721 + }, + { + "epoch": 0.4406083792106, + "grad_norm": 6.375165521018236, + "learning_rate": 0.0002985892458447842, + "loss": 6.6065, + "step": 4722 + }, + { + "epoch": 0.4407016889054773, + "grad_norm": 1.172529008739629, + "learning_rate": 0.0002985882107266904, + "loss": 6.8155, + "step": 4723 + }, + { + "epoch": 0.4407949986003546, + "grad_norm": 2.4828945760803935, + "learning_rate": 0.00029858717523078085, + "loss": 7.092, + "step": 4724 + }, + { + "epoch": 0.4408883082952319, + "grad_norm": 0.7924640900671857, + "learning_rate": 0.0002985861393570581, + "loss": 6.6787, + "step": 4725 + }, + { + "epoch": 0.44098161799010915, + "grad_norm": 1.6221110307250315, + "learning_rate": 0.0002985851031055248, + "loss": 6.7164, + "step": 4726 + }, + { + "epoch": 0.44107492768498646, + "grad_norm": 1.1670096459250292, + "learning_rate": 0.0002985840664761837, + "loss": 6.9545, + "step": 4727 + }, + { + "epoch": 0.44116823737986377, + "grad_norm": 1.171268182886207, + "learning_rate": 0.0002985830294690372, + "loss": 6.7194, + "step": 4728 + }, + { + "epoch": 0.4412615470747411, + "grad_norm": 0.9688216371849964, + "learning_rate": 0.00029858199208408813, + "loss": 6.7697, + "step": 4729 + }, + { + "epoch": 0.4413548567696184, + "grad_norm": 1.3267991024331607, + "learning_rate": 0.000298580954321339, + "loss": 6.8112, + "step": 4730 + }, + { + "epoch": 0.44144816646449564, + "grad_norm": 21.34867975735765, + "learning_rate": 0.0002985799161807926, + "loss": 6.8833, + "step": 4731 + }, + { + "epoch": 0.44154147615937295, + "grad_norm": 1.2486849742370334, + "learning_rate": 0.0002985788776624515, + "loss": 6.5322, + "step": 4732 + }, + { + "epoch": 0.44163478585425026, + "grad_norm": 1.3504882427390175, + "learning_rate": 0.00029857783876631824, + "loss": 6.928, + "step": 4733 + }, + { + "epoch": 0.44172809554912756, + "grad_norm": 1.1461840302353088, + "learning_rate": 0.0002985767994923956, + "loss": 6.5694, + "step": 4734 + }, + { + "epoch": 0.4418214052440049, + "grad_norm": 484.6191457891721, + "learning_rate": 0.00029857575984068623, + "loss": 6.7652, + "step": 4735 + }, + { + "epoch": 0.4419147149388821, + "grad_norm": 2.1321121695676553, + "learning_rate": 0.0002985747198111926, + "loss": 6.8775, + "step": 4736 + }, + { + "epoch": 0.44200802463375943, + "grad_norm": 3.5859764647203933, + "learning_rate": 0.00029857367940391757, + "loss": 6.4618, + "step": 4737 + }, + { + "epoch": 0.44210133432863674, + "grad_norm": 1.5600037143759078, + "learning_rate": 0.0002985726386188636, + "loss": 7.0077, + "step": 4738 + }, + { + "epoch": 0.44219464402351405, + "grad_norm": 744.6290288019063, + "learning_rate": 0.00029857159745603346, + "loss": 6.7053, + "step": 4739 + }, + { + "epoch": 0.44228795371839136, + "grad_norm": 1.282539770900032, + "learning_rate": 0.00029857055591542975, + "loss": 6.8242, + "step": 4740 + }, + { + "epoch": 0.44238126341326867, + "grad_norm": 2.0641591860014765, + "learning_rate": 0.0002985695139970551, + "loss": 6.6943, + "step": 4741 + }, + { + "epoch": 0.4424745731081459, + "grad_norm": 1.2018014665713377, + "learning_rate": 0.0002985684717009122, + "loss": 6.8595, + "step": 4742 + }, + { + "epoch": 0.4425678828030232, + "grad_norm": 196.9408529600988, + "learning_rate": 0.0002985674290270037, + "loss": 6.6716, + "step": 4743 + }, + { + "epoch": 0.44266119249790054, + "grad_norm": 1.9597784138467644, + "learning_rate": 0.0002985663859753322, + "loss": 6.6676, + "step": 4744 + }, + { + "epoch": 0.44275450219277784, + "grad_norm": 2.2493883639899175, + "learning_rate": 0.0002985653425459005, + "loss": 6.4473, + "step": 4745 + }, + { + "epoch": 0.44284781188765515, + "grad_norm": 2.8215677939407398, + "learning_rate": 0.00029856429873871104, + "loss": 7.0906, + "step": 4746 + }, + { + "epoch": 0.4429411215825324, + "grad_norm": 2.743518054141487, + "learning_rate": 0.0002985632545537666, + "loss": 6.7561, + "step": 4747 + }, + { + "epoch": 0.4430344312774097, + "grad_norm": 2.557787312973557, + "learning_rate": 0.0002985622099910698, + "loss": 7.1256, + "step": 4748 + }, + { + "epoch": 0.443127740972287, + "grad_norm": 2.044121183074794, + "learning_rate": 0.0002985611650506233, + "loss": 6.7971, + "step": 4749 + }, + { + "epoch": 0.44322105066716433, + "grad_norm": 4.808786390112261, + "learning_rate": 0.0002985601197324298, + "loss": 7.2707, + "step": 4750 + }, + { + "epoch": 0.44331436036204164, + "grad_norm": 3.6098250276542365, + "learning_rate": 0.00029855907403649194, + "loss": 6.3815, + "step": 4751 + }, + { + "epoch": 0.4434076700569189, + "grad_norm": 2.111842730311393, + "learning_rate": 0.00029855802796281233, + "loss": 7.0344, + "step": 4752 + }, + { + "epoch": 0.4435009797517962, + "grad_norm": 1.8362181568421276, + "learning_rate": 0.00029855698151139365, + "loss": 6.9132, + "step": 4753 + }, + { + "epoch": 0.4435942894466735, + "grad_norm": 32770.227317359415, + "learning_rate": 0.00029855593468223855, + "loss": 7.2263, + "step": 4754 + }, + { + "epoch": 0.4436875991415508, + "grad_norm": 2.0701430914990016, + "learning_rate": 0.00029855488747534975, + "loss": 6.7648, + "step": 4755 + }, + { + "epoch": 0.4437809088364281, + "grad_norm": 3.293026875036319, + "learning_rate": 0.0002985538398907299, + "loss": 6.9841, + "step": 4756 + }, + { + "epoch": 0.4438742185313054, + "grad_norm": 2.870156346579552, + "learning_rate": 0.0002985527919283816, + "loss": 6.8018, + "step": 4757 + }, + { + "epoch": 0.4439675282261827, + "grad_norm": 14695271.94600257, + "learning_rate": 0.00029855174358830755, + "loss": 6.9699, + "step": 4758 + }, + { + "epoch": 0.44406083792106, + "grad_norm": 6.573150734516484, + "learning_rate": 0.00029855069487051044, + "loss": 7.7414, + "step": 4759 + }, + { + "epoch": 0.4441541476159373, + "grad_norm": 17963333.111697502, + "learning_rate": 0.0002985496457749929, + "loss": 7.4908, + "step": 4760 + }, + { + "epoch": 0.4442474573108146, + "grad_norm": 19320789.95339789, + "learning_rate": 0.00029854859630175763, + "loss": 7.4545, + "step": 4761 + }, + { + "epoch": 0.4443407670056919, + "grad_norm": 49244.09809041782, + "learning_rate": 0.00029854754645080727, + "loss": 8.2896, + "step": 4762 + }, + { + "epoch": 0.44443407670056917, + "grad_norm": 16.577917286627923, + "learning_rate": 0.00029854649622214455, + "loss": 7.4104, + "step": 4763 + }, + { + "epoch": 0.4445273863954465, + "grad_norm": 7.355092651132516, + "learning_rate": 0.00029854544561577203, + "loss": 7.6247, + "step": 4764 + }, + { + "epoch": 0.4446206960903238, + "grad_norm": 5.036837380863419, + "learning_rate": 0.0002985443946316925, + "loss": 7.9816, + "step": 4765 + }, + { + "epoch": 0.4447140057852011, + "grad_norm": 3.76551909188416, + "learning_rate": 0.0002985433432699085, + "loss": 7.8572, + "step": 4766 + }, + { + "epoch": 0.4448073154800784, + "grad_norm": 4.060184790134472, + "learning_rate": 0.00029854229153042285, + "loss": 7.9188, + "step": 4767 + }, + { + "epoch": 0.44490062517495566, + "grad_norm": 8.590801446625054, + "learning_rate": 0.00029854123941323814, + "loss": 8.0024, + "step": 4768 + }, + { + "epoch": 0.44499393486983296, + "grad_norm": 4.742162188853487, + "learning_rate": 0.00029854018691835707, + "loss": 7.3762, + "step": 4769 + }, + { + "epoch": 0.44508724456471027, + "grad_norm": 5.375717541965763, + "learning_rate": 0.0002985391340457823, + "loss": 7.4331, + "step": 4770 + }, + { + "epoch": 0.4451805542595876, + "grad_norm": 6644280.996955419, + "learning_rate": 0.0002985380807955165, + "loss": 7.4497, + "step": 4771 + }, + { + "epoch": 0.4452738639544649, + "grad_norm": 19143334.594275944, + "learning_rate": 0.00029853702716756234, + "loss": 8.2193, + "step": 4772 + }, + { + "epoch": 0.44536717364934214, + "grad_norm": 6.225714344068106, + "learning_rate": 0.00029853597316192247, + "loss": 7.6655, + "step": 4773 + }, + { + "epoch": 0.44546048334421945, + "grad_norm": 1.9946715106529942, + "learning_rate": 0.00029853491877859974, + "loss": 7.7593, + "step": 4774 + }, + { + "epoch": 0.44555379303909676, + "grad_norm": 5.569571233405516, + "learning_rate": 0.00029853386401759664, + "loss": 7.6435, + "step": 4775 + }, + { + "epoch": 0.44564710273397407, + "grad_norm": 5.091875649172669, + "learning_rate": 0.00029853280887891595, + "loss": 7.7633, + "step": 4776 + }, + { + "epoch": 0.4457404124288514, + "grad_norm": 7.0686672228028105, + "learning_rate": 0.0002985317533625603, + "loss": 7.629, + "step": 4777 + }, + { + "epoch": 0.4458337221237287, + "grad_norm": 35189.961695989856, + "learning_rate": 0.00029853069746853245, + "loss": 7.6014, + "step": 4778 + }, + { + "epoch": 0.44592703181860593, + "grad_norm": 4.100348149303418, + "learning_rate": 0.00029852964119683495, + "loss": 7.6094, + "step": 4779 + }, + { + "epoch": 0.44602034151348324, + "grad_norm": 6.17049061597964, + "learning_rate": 0.00029852858454747065, + "loss": 7.8569, + "step": 4780 + }, + { + "epoch": 0.44611365120836055, + "grad_norm": 6.581937460059724, + "learning_rate": 0.00029852752752044213, + "loss": 7.6195, + "step": 4781 + }, + { + "epoch": 0.44620696090323786, + "grad_norm": 8.297941872520777, + "learning_rate": 0.0002985264701157521, + "loss": 7.8156, + "step": 4782 + }, + { + "epoch": 0.44630027059811517, + "grad_norm": 6.703655855148213, + "learning_rate": 0.00029852541233340327, + "loss": 7.7898, + "step": 4783 + }, + { + "epoch": 0.4463935802929924, + "grad_norm": 3.4068525165547645, + "learning_rate": 0.0002985243541733983, + "loss": 7.3878, + "step": 4784 + }, + { + "epoch": 0.44648688998786973, + "grad_norm": 128917.00539365334, + "learning_rate": 0.0002985232956357399, + "loss": 7.3381, + "step": 4785 + }, + { + "epoch": 0.44658019968274704, + "grad_norm": 4.974376413529145, + "learning_rate": 0.00029852223672043076, + "loss": 7.6723, + "step": 4786 + }, + { + "epoch": 0.44667350937762434, + "grad_norm": 733031.8115290292, + "learning_rate": 0.00029852117742747353, + "loss": 7.2224, + "step": 4787 + }, + { + "epoch": 0.44676681907250165, + "grad_norm": 12.57262569031547, + "learning_rate": 0.00029852011775687097, + "loss": 7.481, + "step": 4788 + }, + { + "epoch": 0.4468601287673789, + "grad_norm": 2.8800858235845697, + "learning_rate": 0.00029851905770862577, + "loss": 7.0641, + "step": 4789 + }, + { + "epoch": 0.4469534384622562, + "grad_norm": 8.863067970257722, + "learning_rate": 0.00029851799728274054, + "loss": 7.6895, + "step": 4790 + }, + { + "epoch": 0.4470467481571335, + "grad_norm": 3.7874126626280997, + "learning_rate": 0.0002985169364792181, + "loss": 7.1788, + "step": 4791 + }, + { + "epoch": 0.44714005785201083, + "grad_norm": 6.668199948340626, + "learning_rate": 0.000298515875298061, + "loss": 7.1946, + "step": 4792 + }, + { + "epoch": 0.44723336754688814, + "grad_norm": 1565970.4001383455, + "learning_rate": 0.0002985148137392721, + "loss": 7.5418, + "step": 4793 + }, + { + "epoch": 0.44732667724176545, + "grad_norm": 3.95566090228977, + "learning_rate": 0.000298513751802854, + "loss": 7.5106, + "step": 4794 + }, + { + "epoch": 0.4474199869366427, + "grad_norm": 32.833499460462164, + "learning_rate": 0.0002985126894888094, + "loss": 7.2776, + "step": 4795 + }, + { + "epoch": 0.44751329663152, + "grad_norm": 4.2062653280495, + "learning_rate": 0.00029851162679714105, + "loss": 7.2159, + "step": 4796 + }, + { + "epoch": 0.4476066063263973, + "grad_norm": 1.9022374082420381, + "learning_rate": 0.0002985105637278516, + "loss": 7.1284, + "step": 4797 + }, + { + "epoch": 0.4476999160212746, + "grad_norm": 5.580196164920447, + "learning_rate": 0.0002985095002809438, + "loss": 7.3646, + "step": 4798 + }, + { + "epoch": 0.44779322571615193, + "grad_norm": 1.6806073939455437, + "learning_rate": 0.0002985084364564204, + "loss": 7.5878, + "step": 4799 + }, + { + "epoch": 0.4478865354110292, + "grad_norm": 45072302.88707096, + "learning_rate": 0.0002985073722542839, + "loss": 7.3879, + "step": 4800 + }, + { + "epoch": 0.4479798451059065, + "grad_norm": 2.374059682020147, + "learning_rate": 0.00029850630767453725, + "loss": 7.8807, + "step": 4801 + }, + { + "epoch": 0.4480731548007838, + "grad_norm": 4.127856524414257, + "learning_rate": 0.000298505242717183, + "loss": 7.0212, + "step": 4802 + }, + { + "epoch": 0.4481664644956611, + "grad_norm": 2.6675949484872876, + "learning_rate": 0.0002985041773822239, + "loss": 7.3361, + "step": 4803 + }, + { + "epoch": 0.4482597741905384, + "grad_norm": 16.267374007231716, + "learning_rate": 0.0002985031116696627, + "loss": 7.3347, + "step": 4804 + }, + { + "epoch": 0.44835308388541567, + "grad_norm": 1.8719248689396533, + "learning_rate": 0.00029850204557950206, + "loss": 7.3683, + "step": 4805 + }, + { + "epoch": 0.448446393580293, + "grad_norm": 3495.2335757997553, + "learning_rate": 0.00029850097911174474, + "loss": 7.1704, + "step": 4806 + }, + { + "epoch": 0.4485397032751703, + "grad_norm": 2.3968289456396477, + "learning_rate": 0.00029849991226639337, + "loss": 7.506, + "step": 4807 + }, + { + "epoch": 0.4486330129700476, + "grad_norm": 4.802225454969221, + "learning_rate": 0.00029849884504345076, + "loss": 7.0722, + "step": 4808 + }, + { + "epoch": 0.4487263226649249, + "grad_norm": 4.843887197839845, + "learning_rate": 0.0002984977774429195, + "loss": 7.5022, + "step": 4809 + }, + { + "epoch": 0.44881963235980216, + "grad_norm": 391330.01578979427, + "learning_rate": 0.00029849670946480243, + "loss": 7.5193, + "step": 4810 + }, + { + "epoch": 0.44891294205467946, + "grad_norm": 3.729976541998843, + "learning_rate": 0.00029849564110910224, + "loss": 7.8437, + "step": 4811 + }, + { + "epoch": 0.4490062517495568, + "grad_norm": 14.903631947325135, + "learning_rate": 0.0002984945723758216, + "loss": 7.4954, + "step": 4812 + }, + { + "epoch": 0.4490995614444341, + "grad_norm": 4.548392563179469, + "learning_rate": 0.0002984935032649633, + "loss": 7.3559, + "step": 4813 + }, + { + "epoch": 0.4491928711393114, + "grad_norm": 3.408878040143805, + "learning_rate": 0.0002984924337765299, + "loss": 7.0545, + "step": 4814 + }, + { + "epoch": 0.4492861808341887, + "grad_norm": 2.669724144836123, + "learning_rate": 0.0002984913639105243, + "loss": 7.5348, + "step": 4815 + }, + { + "epoch": 0.44937949052906595, + "grad_norm": 2.4213708889636325, + "learning_rate": 0.0002984902936669491, + "loss": 7.7167, + "step": 4816 + }, + { + "epoch": 0.44947280022394326, + "grad_norm": 16.41170519594737, + "learning_rate": 0.0002984892230458071, + "loss": 7.5834, + "step": 4817 + }, + { + "epoch": 0.44956610991882057, + "grad_norm": 5.442964219583861, + "learning_rate": 0.00029848815204710106, + "loss": 7.3575, + "step": 4818 + }, + { + "epoch": 0.4496594196136979, + "grad_norm": 2.431518116961368, + "learning_rate": 0.0002984870806708335, + "loss": 7.5376, + "step": 4819 + }, + { + "epoch": 0.4497527293085752, + "grad_norm": 38.05842842788842, + "learning_rate": 0.00029848600891700737, + "loss": 7.5178, + "step": 4820 + }, + { + "epoch": 0.44984603900345244, + "grad_norm": 2.821508318045152, + "learning_rate": 0.00029848493678562527, + "loss": 8.0558, + "step": 4821 + }, + { + "epoch": 0.44993934869832974, + "grad_norm": 6.021262122172912, + "learning_rate": 0.00029848386427669, + "loss": 7.596, + "step": 4822 + }, + { + "epoch": 0.45003265839320705, + "grad_norm": 2.349584183346098, + "learning_rate": 0.0002984827913902042, + "loss": 7.5215, + "step": 4823 + }, + { + "epoch": 0.45012596808808436, + "grad_norm": 3.1798513359572698, + "learning_rate": 0.0002984817181261707, + "loss": 7.6557, + "step": 4824 + }, + { + "epoch": 0.45021927778296167, + "grad_norm": 2.81378794647262, + "learning_rate": 0.0002984806444845921, + "loss": 7.3491, + "step": 4825 + }, + { + "epoch": 0.4503125874778389, + "grad_norm": 4.051468912946784, + "learning_rate": 0.00029847957046547123, + "loss": 7.5757, + "step": 4826 + }, + { + "epoch": 0.45040589717271623, + "grad_norm": 2.4839187211774862, + "learning_rate": 0.00029847849606881084, + "loss": 7.239, + "step": 4827 + }, + { + "epoch": 0.45049920686759354, + "grad_norm": 4.736528182285887, + "learning_rate": 0.00029847742129461357, + "loss": 7.338, + "step": 4828 + }, + { + "epoch": 0.45059251656247085, + "grad_norm": 3.074118606054449, + "learning_rate": 0.0002984763461428822, + "loss": 7.4033, + "step": 4829 + }, + { + "epoch": 0.45068582625734815, + "grad_norm": 1.6871198552334306, + "learning_rate": 0.0002984752706136195, + "loss": 7.4553, + "step": 4830 + }, + { + "epoch": 0.45077913595222546, + "grad_norm": 2.9618703746701227, + "learning_rate": 0.00029847419470682815, + "loss": 7.3635, + "step": 4831 + }, + { + "epoch": 0.4508724456471027, + "grad_norm": 1.543763798836506, + "learning_rate": 0.0002984731184225109, + "loss": 7.3717, + "step": 4832 + }, + { + "epoch": 0.45096575534198, + "grad_norm": 1.9293636293691545, + "learning_rate": 0.0002984720417606705, + "loss": 7.3736, + "step": 4833 + }, + { + "epoch": 0.45105906503685733, + "grad_norm": 1.8839514548518927, + "learning_rate": 0.0002984709647213096, + "loss": 7.3908, + "step": 4834 + }, + { + "epoch": 0.45115237473173464, + "grad_norm": 7951.255439288713, + "learning_rate": 0.00029846988730443115, + "loss": 7.4126, + "step": 4835 + }, + { + "epoch": 0.45124568442661195, + "grad_norm": 1.6779163335189289, + "learning_rate": 0.00029846880951003765, + "loss": 7.1321, + "step": 4836 + }, + { + "epoch": 0.4513389941214892, + "grad_norm": 2.2846846574852964, + "learning_rate": 0.000298467731338132, + "loss": 7.5124, + "step": 4837 + }, + { + "epoch": 0.4514323038163665, + "grad_norm": 273.0871126115977, + "learning_rate": 0.00029846665278871685, + "loss": 7.0762, + "step": 4838 + }, + { + "epoch": 0.4515256135112438, + "grad_norm": 1.88088073804668, + "learning_rate": 0.00029846557386179503, + "loss": 7.4705, + "step": 4839 + }, + { + "epoch": 0.4516189232061211, + "grad_norm": 2.875058767932646, + "learning_rate": 0.00029846449455736925, + "loss": 7.8006, + "step": 4840 + }, + { + "epoch": 0.45171223290099843, + "grad_norm": 1.7303417619808532, + "learning_rate": 0.00029846341487544217, + "loss": 7.5359, + "step": 4841 + }, + { + "epoch": 0.4518055425958757, + "grad_norm": 2.4079045969778, + "learning_rate": 0.00029846233481601664, + "loss": 7.423, + "step": 4842 + }, + { + "epoch": 0.451898852290753, + "grad_norm": 3.241318764819014, + "learning_rate": 0.0002984612543790954, + "loss": 7.2563, + "step": 4843 + }, + { + "epoch": 0.4519921619856303, + "grad_norm": 2.8856777323429714, + "learning_rate": 0.0002984601735646811, + "loss": 7.5496, + "step": 4844 + }, + { + "epoch": 0.4520854716805076, + "grad_norm": 1.620967457750902, + "learning_rate": 0.0002984590923727766, + "loss": 7.1836, + "step": 4845 + }, + { + "epoch": 0.4521787813753849, + "grad_norm": 1.5337731552548084, + "learning_rate": 0.0002984580108033846, + "loss": 7.4464, + "step": 4846 + }, + { + "epoch": 0.4522720910702622, + "grad_norm": 1.4813585704619892, + "learning_rate": 0.0002984569288565078, + "loss": 7.4104, + "step": 4847 + }, + { + "epoch": 0.4523654007651395, + "grad_norm": 2.685747965557594, + "learning_rate": 0.0002984558465321491, + "loss": 7.3641, + "step": 4848 + }, + { + "epoch": 0.4524587104600168, + "grad_norm": 3.05495062118575, + "learning_rate": 0.0002984547638303111, + "loss": 7.3736, + "step": 4849 + }, + { + "epoch": 0.4525520201548941, + "grad_norm": 63.17434842777646, + "learning_rate": 0.00029845368075099665, + "loss": 7.4209, + "step": 4850 + }, + { + "epoch": 0.4526453298497714, + "grad_norm": 7.855267235433308, + "learning_rate": 0.00029845259729420845, + "loss": 7.3821, + "step": 4851 + }, + { + "epoch": 0.4527386395446487, + "grad_norm": 1.8557431381561498, + "learning_rate": 0.00029845151345994923, + "loss": 7.5831, + "step": 4852 + }, + { + "epoch": 0.45283194923952597, + "grad_norm": 2680.8929634028054, + "learning_rate": 0.00029845042924822184, + "loss": 7.0699, + "step": 4853 + }, + { + "epoch": 0.4529252589344033, + "grad_norm": 1.757240676029145, + "learning_rate": 0.00029844934465902897, + "loss": 7.319, + "step": 4854 + }, + { + "epoch": 0.4530185686292806, + "grad_norm": 1.8805064923038777, + "learning_rate": 0.00029844825969237333, + "loss": 7.536, + "step": 4855 + }, + { + "epoch": 0.4531118783241579, + "grad_norm": 2.035347480978539, + "learning_rate": 0.0002984471743482578, + "loss": 7.3925, + "step": 4856 + }, + { + "epoch": 0.4532051880190352, + "grad_norm": 3.237393020292786, + "learning_rate": 0.0002984460886266851, + "loss": 7.2347, + "step": 4857 + }, + { + "epoch": 0.45329849771391245, + "grad_norm": 2.5676175946084907, + "learning_rate": 0.0002984450025276579, + "loss": 7.3985, + "step": 4858 + }, + { + "epoch": 0.45339180740878976, + "grad_norm": 1.929350900508061, + "learning_rate": 0.0002984439160511791, + "loss": 7.3657, + "step": 4859 + }, + { + "epoch": 0.45348511710366707, + "grad_norm": 1.9486249547884433, + "learning_rate": 0.00029844282919725133, + "loss": 7.2549, + "step": 4860 + }, + { + "epoch": 0.4535784267985444, + "grad_norm": 1.7032350406512224, + "learning_rate": 0.0002984417419658775, + "loss": 7.2608, + "step": 4861 + }, + { + "epoch": 0.4536717364934217, + "grad_norm": 1.7317328278503907, + "learning_rate": 0.0002984406543570602, + "loss": 7.1935, + "step": 4862 + }, + { + "epoch": 0.45376504618829894, + "grad_norm": 1.4108574886257055, + "learning_rate": 0.0002984395663708023, + "loss": 7.1114, + "step": 4863 + }, + { + "epoch": 0.45385835588317625, + "grad_norm": 1.8834065384650325, + "learning_rate": 0.00029843847800710665, + "loss": 7.0919, + "step": 4864 + }, + { + "epoch": 0.45395166557805355, + "grad_norm": 3.316496959364836, + "learning_rate": 0.0002984373892659758, + "loss": 7.5937, + "step": 4865 + }, + { + "epoch": 0.45404497527293086, + "grad_norm": 2.5198472747087988, + "learning_rate": 0.0002984363001474127, + "loss": 7.181, + "step": 4866 + }, + { + "epoch": 0.45413828496780817, + "grad_norm": 7.289128649733775, + "learning_rate": 0.0002984352106514201, + "loss": 6.9882, + "step": 4867 + }, + { + "epoch": 0.4542315946626855, + "grad_norm": 2.4221091410370947, + "learning_rate": 0.00029843412077800065, + "loss": 7.3722, + "step": 4868 + }, + { + "epoch": 0.45432490435756273, + "grad_norm": 2.2291957964385674, + "learning_rate": 0.0002984330305271572, + "loss": 7.3865, + "step": 4869 + }, + { + "epoch": 0.45441821405244004, + "grad_norm": 10.147370166077973, + "learning_rate": 0.0002984319398988926, + "loss": 7.1557, + "step": 4870 + }, + { + "epoch": 0.45451152374731735, + "grad_norm": 9.17380444473201, + "learning_rate": 0.0002984308488932095, + "loss": 7.3723, + "step": 4871 + }, + { + "epoch": 0.45460483344219466, + "grad_norm": 1.4846610733302594, + "learning_rate": 0.0002984297575101107, + "loss": 7.27, + "step": 4872 + }, + { + "epoch": 0.45469814313707196, + "grad_norm": 1.61710182921209, + "learning_rate": 0.00029842866574959905, + "loss": 7.292, + "step": 4873 + }, + { + "epoch": 0.4547914528319492, + "grad_norm": 151088.61022372494, + "learning_rate": 0.00029842757361167723, + "loss": 7.4373, + "step": 4874 + }, + { + "epoch": 0.4548847625268265, + "grad_norm": 2328.6847857623156, + "learning_rate": 0.00029842648109634805, + "loss": 7.2248, + "step": 4875 + }, + { + "epoch": 0.45497807222170383, + "grad_norm": 1.8519510438205855, + "learning_rate": 0.0002984253882036143, + "loss": 7.136, + "step": 4876 + }, + { + "epoch": 0.45507138191658114, + "grad_norm": 1.502552692580503, + "learning_rate": 0.0002984242949334788, + "loss": 7.3218, + "step": 4877 + }, + { + "epoch": 0.45516469161145845, + "grad_norm": 3.932094882018982, + "learning_rate": 0.0002984232012859442, + "loss": 7.477, + "step": 4878 + }, + { + "epoch": 0.4552580013063357, + "grad_norm": 11.527822575004855, + "learning_rate": 0.00029842210726101345, + "loss": 6.8402, + "step": 4879 + }, + { + "epoch": 0.455351311001213, + "grad_norm": 8.679692958056549, + "learning_rate": 0.0002984210128586892, + "loss": 7.4036, + "step": 4880 + }, + { + "epoch": 0.4554446206960903, + "grad_norm": 7.545091005894895, + "learning_rate": 0.0002984199180789743, + "loss": 6.9278, + "step": 4881 + }, + { + "epoch": 0.4555379303909676, + "grad_norm": 561455.4799740022, + "learning_rate": 0.0002984188229218715, + "loss": 7.4288, + "step": 4882 + }, + { + "epoch": 0.45563124008584494, + "grad_norm": 2.626235208691402, + "learning_rate": 0.0002984177273873836, + "loss": 7.4277, + "step": 4883 + }, + { + "epoch": 0.45572454978072224, + "grad_norm": 3.4614666197649218, + "learning_rate": 0.0002984166314755134, + "loss": 7.3574, + "step": 4884 + }, + { + "epoch": 0.4558178594755995, + "grad_norm": 2.775879168311972, + "learning_rate": 0.00029841553518626364, + "loss": 7.1715, + "step": 4885 + }, + { + "epoch": 0.4559111691704768, + "grad_norm": 6.074729451433741, + "learning_rate": 0.00029841443851963715, + "loss": 7.3158, + "step": 4886 + }, + { + "epoch": 0.4560044788653541, + "grad_norm": 1.947300850576257, + "learning_rate": 0.00029841334147563675, + "loss": 7.2501, + "step": 4887 + }, + { + "epoch": 0.4560977885602314, + "grad_norm": 3.0592228726898787, + "learning_rate": 0.0002984122440542651, + "loss": 7.3958, + "step": 4888 + }, + { + "epoch": 0.45619109825510873, + "grad_norm": 3.056310373823969, + "learning_rate": 0.0002984111462555251, + "loss": 7.3457, + "step": 4889 + }, + { + "epoch": 0.456284407949986, + "grad_norm": 7966.161782441209, + "learning_rate": 0.0002984100480794196, + "loss": 7.2659, + "step": 4890 + }, + { + "epoch": 0.4563777176448633, + "grad_norm": 1.553447491877009, + "learning_rate": 0.0002984089495259512, + "loss": 7.3213, + "step": 4891 + }, + { + "epoch": 0.4564710273397406, + "grad_norm": 2.5360248107237626, + "learning_rate": 0.00029840785059512286, + "loss": 7.2629, + "step": 4892 + }, + { + "epoch": 0.4565643370346179, + "grad_norm": 2.8373722804620662, + "learning_rate": 0.0002984067512869373, + "loss": 7.5591, + "step": 4893 + }, + { + "epoch": 0.4566576467294952, + "grad_norm": 1.3789463900949028, + "learning_rate": 0.00029840565160139736, + "loss": 7.2525, + "step": 4894 + }, + { + "epoch": 0.45675095642437247, + "grad_norm": 1.6958360734604951, + "learning_rate": 0.00029840455153850584, + "loss": 7.1473, + "step": 4895 + }, + { + "epoch": 0.4568442661192498, + "grad_norm": 1.2866148276276983, + "learning_rate": 0.00029840345109826544, + "loss": 7.0952, + "step": 4896 + }, + { + "epoch": 0.4569375758141271, + "grad_norm": 5032.623632439567, + "learning_rate": 0.00029840235028067905, + "loss": 7.1958, + "step": 4897 + }, + { + "epoch": 0.4570308855090044, + "grad_norm": 1.5949967124045912, + "learning_rate": 0.0002984012490857494, + "loss": 7.1863, + "step": 4898 + }, + { + "epoch": 0.4571241952038817, + "grad_norm": 7.367626786190327, + "learning_rate": 0.0002984001475134794, + "loss": 7.1944, + "step": 4899 + }, + { + "epoch": 0.457217504898759, + "grad_norm": 3.4879435378297416, + "learning_rate": 0.00029839904556387177, + "loss": 6.8969, + "step": 4900 + }, + { + "epoch": 0.45731081459363626, + "grad_norm": 1.7668044268792171, + "learning_rate": 0.0002983979432369293, + "loss": 7.2724, + "step": 4901 + }, + { + "epoch": 0.45740412428851357, + "grad_norm": 13.01677940096093, + "learning_rate": 0.0002983968405326548, + "loss": 7.1718, + "step": 4902 + }, + { + "epoch": 0.4574974339833909, + "grad_norm": 2.093462971765808, + "learning_rate": 0.00029839573745105114, + "loss": 7.1028, + "step": 4903 + }, + { + "epoch": 0.4575907436782682, + "grad_norm": 2.2568763427109717, + "learning_rate": 0.00029839463399212107, + "loss": 7.2516, + "step": 4904 + }, + { + "epoch": 0.4576840533731455, + "grad_norm": 5.072038088895305, + "learning_rate": 0.0002983935301558674, + "loss": 7.0281, + "step": 4905 + }, + { + "epoch": 0.45777736306802275, + "grad_norm": 2.230038637420673, + "learning_rate": 0.0002983924259422929, + "loss": 7.3515, + "step": 4906 + }, + { + "epoch": 0.45787067276290006, + "grad_norm": 2.051077140626386, + "learning_rate": 0.00029839132135140047, + "loss": 7.7265, + "step": 4907 + }, + { + "epoch": 0.45796398245777736, + "grad_norm": 2.4438534153039457, + "learning_rate": 0.0002983902163831928, + "loss": 7.0931, + "step": 4908 + }, + { + "epoch": 0.45805729215265467, + "grad_norm": 6.987435094943846, + "learning_rate": 0.00029838911103767285, + "loss": 7.2519, + "step": 4909 + }, + { + "epoch": 0.458150601847532, + "grad_norm": 65634.47784421961, + "learning_rate": 0.0002983880053148433, + "loss": 7.0055, + "step": 4910 + }, + { + "epoch": 0.45824391154240923, + "grad_norm": 1.6188267662793967, + "learning_rate": 0.000298386899214707, + "loss": 6.9943, + "step": 4911 + }, + { + "epoch": 0.45833722123728654, + "grad_norm": 5.013721719413091, + "learning_rate": 0.0002983857927372668, + "loss": 7.3184, + "step": 4912 + }, + { + "epoch": 0.45843053093216385, + "grad_norm": 1.6325933052994512, + "learning_rate": 0.0002983846858825255, + "loss": 7.227, + "step": 4913 + }, + { + "epoch": 0.45852384062704116, + "grad_norm": 4335624.07935285, + "learning_rate": 0.0002983835786504859, + "loss": 7.4007, + "step": 4914 + }, + { + "epoch": 0.45861715032191847, + "grad_norm": 1.7452087900948898, + "learning_rate": 0.0002983824710411508, + "loss": 6.9987, + "step": 4915 + }, + { + "epoch": 0.4587104600167957, + "grad_norm": 1.4072192961714862, + "learning_rate": 0.000298381363054523, + "loss": 7.1963, + "step": 4916 + }, + { + "epoch": 0.458803769711673, + "grad_norm": 3.500280335275566, + "learning_rate": 0.00029838025469060536, + "loss": 7.3062, + "step": 4917 + }, + { + "epoch": 0.45889707940655033, + "grad_norm": 2.7111063501062205, + "learning_rate": 0.00029837914594940066, + "loss": 7.25, + "step": 4918 + }, + { + "epoch": 0.45899038910142764, + "grad_norm": 2.077414320960117, + "learning_rate": 0.00029837803683091177, + "loss": 7.3072, + "step": 4919 + }, + { + "epoch": 0.45908369879630495, + "grad_norm": 1.2409912152235891, + "learning_rate": 0.00029837692733514156, + "loss": 7.4278, + "step": 4920 + }, + { + "epoch": 0.45917700849118226, + "grad_norm": 1.96425063547238, + "learning_rate": 0.0002983758174620927, + "loss": 7.1252, + "step": 4921 + }, + { + "epoch": 0.4592703181860595, + "grad_norm": 25.27549220578812, + "learning_rate": 0.00029837470721176806, + "loss": 7.5856, + "step": 4922 + }, + { + "epoch": 0.4593636278809368, + "grad_norm": 2.41732298611327, + "learning_rate": 0.0002983735965841705, + "loss": 7.3281, + "step": 4923 + }, + { + "epoch": 0.45945693757581413, + "grad_norm": 49033.40904602899, + "learning_rate": 0.0002983724855793029, + "loss": 7.324, + "step": 4924 + }, + { + "epoch": 0.45955024727069144, + "grad_norm": 2.574415857219401, + "learning_rate": 0.00029837137419716796, + "loss": 7.1859, + "step": 4925 + }, + { + "epoch": 0.45964355696556874, + "grad_norm": 2.3228470092758995, + "learning_rate": 0.0002983702624377686, + "loss": 7.2578, + "step": 4926 + }, + { + "epoch": 0.459736866660446, + "grad_norm": 1.9970856148450717, + "learning_rate": 0.0002983691503011076, + "loss": 7.4477, + "step": 4927 + }, + { + "epoch": 0.4598301763553233, + "grad_norm": 2.198072121303155, + "learning_rate": 0.0002983680377871878, + "loss": 7.5991, + "step": 4928 + }, + { + "epoch": 0.4599234860502006, + "grad_norm": 1.5144531908638923, + "learning_rate": 0.00029836692489601204, + "loss": 7.1164, + "step": 4929 + }, + { + "epoch": 0.4600167957450779, + "grad_norm": 22.24755426247372, + "learning_rate": 0.00029836581162758316, + "loss": 7.2826, + "step": 4930 + }, + { + "epoch": 0.46011010543995523, + "grad_norm": 2.0397007135731764, + "learning_rate": 0.0002983646979819039, + "loss": 7.2397, + "step": 4931 + }, + { + "epoch": 0.4602034151348325, + "grad_norm": 2.1180180094589547, + "learning_rate": 0.0002983635839589772, + "loss": 7.2646, + "step": 4932 + }, + { + "epoch": 0.4602967248297098, + "grad_norm": 218.52222879267927, + "learning_rate": 0.0002983624695588059, + "loss": 7.2812, + "step": 4933 + }, + { + "epoch": 0.4603900345245871, + "grad_norm": 2.125064025192131, + "learning_rate": 0.0002983613547813927, + "loss": 6.8205, + "step": 4934 + }, + { + "epoch": 0.4604833442194644, + "grad_norm": 1.8534219835545416, + "learning_rate": 0.0002983602396267406, + "loss": 7.3504, + "step": 4935 + }, + { + "epoch": 0.4605766539143417, + "grad_norm": 1.1247955383021997, + "learning_rate": 0.00029835912409485234, + "loss": 7.145, + "step": 4936 + }, + { + "epoch": 0.460669963609219, + "grad_norm": 13440.468276887692, + "learning_rate": 0.00029835800818573077, + "loss": 7.2886, + "step": 4937 + }, + { + "epoch": 0.4607632733040963, + "grad_norm": 2.217802788737678, + "learning_rate": 0.0002983568918993787, + "loss": 7.225, + "step": 4938 + }, + { + "epoch": 0.4608565829989736, + "grad_norm": 1.4365964746512392, + "learning_rate": 0.000298355775235799, + "loss": 7.309, + "step": 4939 + }, + { + "epoch": 0.4609498926938509, + "grad_norm": 1.4860786078387853, + "learning_rate": 0.00029835465819499457, + "loss": 7.1806, + "step": 4940 + }, + { + "epoch": 0.4610432023887282, + "grad_norm": 6913.764945724561, + "learning_rate": 0.00029835354077696814, + "loss": 7.2183, + "step": 4941 + }, + { + "epoch": 0.4611365120836055, + "grad_norm": 9646.211663500299, + "learning_rate": 0.0002983524229817226, + "loss": 7.4012, + "step": 4942 + }, + { + "epoch": 0.46122982177848276, + "grad_norm": 1.4958419801862644, + "learning_rate": 0.0002983513048092608, + "loss": 7.1172, + "step": 4943 + }, + { + "epoch": 0.46132313147336007, + "grad_norm": 12.385982065129962, + "learning_rate": 0.0002983501862595856, + "loss": 7.1801, + "step": 4944 + }, + { + "epoch": 0.4614164411682374, + "grad_norm": 3.74962436972908, + "learning_rate": 0.00029834906733269985, + "loss": 7.2293, + "step": 4945 + }, + { + "epoch": 0.4615097508631147, + "grad_norm": 2.4412147671408606, + "learning_rate": 0.00029834794802860627, + "loss": 7.5043, + "step": 4946 + }, + { + "epoch": 0.461603060557992, + "grad_norm": 1.1085058072722132, + "learning_rate": 0.00029834682834730785, + "loss": 7.1567, + "step": 4947 + }, + { + "epoch": 0.46169637025286925, + "grad_norm": 1.3492677204593815, + "learning_rate": 0.0002983457082888074, + "loss": 6.9806, + "step": 4948 + }, + { + "epoch": 0.46178967994774656, + "grad_norm": 86.44641193640132, + "learning_rate": 0.0002983445878531078, + "loss": 7.1548, + "step": 4949 + }, + { + "epoch": 0.46188298964262386, + "grad_norm": 1.8035536750299168, + "learning_rate": 0.0002983434670402118, + "loss": 7.4285, + "step": 4950 + }, + { + "epoch": 0.4619762993375012, + "grad_norm": 2942.316018789192, + "learning_rate": 0.0002983423458501223, + "loss": 6.9733, + "step": 4951 + }, + { + "epoch": 0.4620696090323785, + "grad_norm": 2.820876366321255, + "learning_rate": 0.00029834122428284217, + "loss": 6.8436, + "step": 4952 + }, + { + "epoch": 0.4621629187272558, + "grad_norm": 1.6116786994988441, + "learning_rate": 0.0002983401023383743, + "loss": 7.228, + "step": 4953 + }, + { + "epoch": 0.46225622842213304, + "grad_norm": 1.687730630780724, + "learning_rate": 0.0002983389800167214, + "loss": 7.2352, + "step": 4954 + }, + { + "epoch": 0.46234953811701035, + "grad_norm": 1.3876230847137185, + "learning_rate": 0.00029833785731788647, + "loss": 6.926, + "step": 4955 + }, + { + "epoch": 0.46244284781188766, + "grad_norm": 1.7576227081260836, + "learning_rate": 0.0002983367342418723, + "loss": 7.4041, + "step": 4956 + }, + { + "epoch": 0.46253615750676497, + "grad_norm": 2.2121263726121234, + "learning_rate": 0.0002983356107886817, + "loss": 7.0441, + "step": 4957 + }, + { + "epoch": 0.4626294672016423, + "grad_norm": 1.8351658945906222, + "learning_rate": 0.00029833448695831767, + "loss": 7.4288, + "step": 4958 + }, + { + "epoch": 0.4627227768965195, + "grad_norm": 1.8412997275215433, + "learning_rate": 0.00029833336275078293, + "loss": 7.0266, + "step": 4959 + }, + { + "epoch": 0.46281608659139684, + "grad_norm": 2331.6654303829378, + "learning_rate": 0.0002983322381660804, + "loss": 7.2897, + "step": 4960 + }, + { + "epoch": 0.46290939628627414, + "grad_norm": 1.2159452149385872, + "learning_rate": 0.0002983311132042129, + "loss": 6.8467, + "step": 4961 + }, + { + "epoch": 0.46300270598115145, + "grad_norm": 3571.516214279807, + "learning_rate": 0.0002983299878651834, + "loss": 7.0065, + "step": 4962 + }, + { + "epoch": 0.46309601567602876, + "grad_norm": 2.3627813314104555, + "learning_rate": 0.0002983288621489946, + "loss": 7.3089, + "step": 4963 + }, + { + "epoch": 0.463189325370906, + "grad_norm": 56.32682181815641, + "learning_rate": 0.00029832773605564946, + "loss": 7.1599, + "step": 4964 + }, + { + "epoch": 0.4632826350657833, + "grad_norm": 2.535869299502148, + "learning_rate": 0.0002983266095851508, + "loss": 6.6863, + "step": 4965 + }, + { + "epoch": 0.46337594476066063, + "grad_norm": 8.549507412150831, + "learning_rate": 0.00029832548273750156, + "loss": 7.3069, + "step": 4966 + }, + { + "epoch": 0.46346925445553794, + "grad_norm": 1.4005181390040888, + "learning_rate": 0.00029832435551270455, + "loss": 7.1241, + "step": 4967 + }, + { + "epoch": 0.46356256415041525, + "grad_norm": 1.2463070583009361, + "learning_rate": 0.00029832322791076266, + "loss": 7.0032, + "step": 4968 + }, + { + "epoch": 0.4636558738452925, + "grad_norm": 1.7697012255082913, + "learning_rate": 0.0002983220999316787, + "loss": 6.8051, + "step": 4969 + }, + { + "epoch": 0.4637491835401698, + "grad_norm": 1.9126870101559845, + "learning_rate": 0.0002983209715754556, + "loss": 7.0148, + "step": 4970 + }, + { + "epoch": 0.4638424932350471, + "grad_norm": 2.8594391769584875, + "learning_rate": 0.00029831984284209615, + "loss": 7.3907, + "step": 4971 + }, + { + "epoch": 0.4639358029299244, + "grad_norm": 6.115666213340543, + "learning_rate": 0.0002983187137316033, + "loss": 7.2489, + "step": 4972 + }, + { + "epoch": 0.46402911262480173, + "grad_norm": 1.4201239190902262, + "learning_rate": 0.00029831758424397995, + "loss": 7.3387, + "step": 4973 + }, + { + "epoch": 0.46412242231967904, + "grad_norm": 2.6362776102628427, + "learning_rate": 0.0002983164543792288, + "loss": 7.2136, + "step": 4974 + }, + { + "epoch": 0.4642157320145563, + "grad_norm": 1.8470302842297177, + "learning_rate": 0.000298315324137353, + "loss": 7.0501, + "step": 4975 + }, + { + "epoch": 0.4643090417094336, + "grad_norm": 1.7071677497744948, + "learning_rate": 0.00029831419351835514, + "loss": 7.3266, + "step": 4976 + }, + { + "epoch": 0.4644023514043109, + "grad_norm": 64.3798381193793, + "learning_rate": 0.0002983130625222383, + "loss": 7.0941, + "step": 4977 + }, + { + "epoch": 0.4644956610991882, + "grad_norm": 3353.529283345852, + "learning_rate": 0.0002983119311490052, + "loss": 6.8989, + "step": 4978 + }, + { + "epoch": 0.4645889707940655, + "grad_norm": 1.2004723648122666, + "learning_rate": 0.00029831079939865885, + "loss": 6.9697, + "step": 4979 + }, + { + "epoch": 0.4646822804889428, + "grad_norm": 2.7003336408286804, + "learning_rate": 0.00029830966727120207, + "loss": 7.379, + "step": 4980 + }, + { + "epoch": 0.4647755901838201, + "grad_norm": 1.444420981621679, + "learning_rate": 0.00029830853476663766, + "loss": 6.8402, + "step": 4981 + }, + { + "epoch": 0.4648688998786974, + "grad_norm": 5.162307223929595, + "learning_rate": 0.00029830740188496865, + "loss": 6.9612, + "step": 4982 + }, + { + "epoch": 0.4649622095735747, + "grad_norm": 1.6094250311672016, + "learning_rate": 0.00029830626862619786, + "loss": 6.5437, + "step": 4983 + }, + { + "epoch": 0.465055519268452, + "grad_norm": 42713.04653779243, + "learning_rate": 0.0002983051349903281, + "loss": 7.5225, + "step": 4984 + }, + { + "epoch": 0.46514882896332926, + "grad_norm": 2.343867902884079, + "learning_rate": 0.0002983040009773624, + "loss": 7.3553, + "step": 4985 + }, + { + "epoch": 0.46524213865820657, + "grad_norm": 4.157111603071138, + "learning_rate": 0.0002983028665873035, + "loss": 6.6772, + "step": 4986 + }, + { + "epoch": 0.4653354483530839, + "grad_norm": 2.820873401810536, + "learning_rate": 0.00029830173182015434, + "loss": 7.2112, + "step": 4987 + }, + { + "epoch": 0.4654287580479612, + "grad_norm": 1.1919678588459122, + "learning_rate": 0.0002983005966759178, + "loss": 7.3044, + "step": 4988 + }, + { + "epoch": 0.4655220677428385, + "grad_norm": 2.811014458227754, + "learning_rate": 0.0002982994611545968, + "loss": 7.162, + "step": 4989 + }, + { + "epoch": 0.4656153774377158, + "grad_norm": 2.1007033462882587, + "learning_rate": 0.00029829832525619417, + "loss": 7.0474, + "step": 4990 + }, + { + "epoch": 0.46570868713259306, + "grad_norm": 2.889692686004495, + "learning_rate": 0.0002982971889807128, + "loss": 7.2919, + "step": 4991 + }, + { + "epoch": 0.46580199682747037, + "grad_norm": 1.3939367769233735, + "learning_rate": 0.0002982960523281557, + "loss": 7.0237, + "step": 4992 + }, + { + "epoch": 0.4658953065223477, + "grad_norm": 2.53227731304005, + "learning_rate": 0.0002982949152985256, + "loss": 7.2118, + "step": 4993 + }, + { + "epoch": 0.465988616217225, + "grad_norm": 97256.45076278984, + "learning_rate": 0.0002982937778918255, + "loss": 7.2943, + "step": 4994 + }, + { + "epoch": 0.4660819259121023, + "grad_norm": 2.466715970305697, + "learning_rate": 0.0002982926401080582, + "loss": 7.4427, + "step": 4995 + }, + { + "epoch": 0.46617523560697954, + "grad_norm": 8.286249309588905, + "learning_rate": 0.0002982915019472267, + "loss": 7.0676, + "step": 4996 + }, + { + "epoch": 0.46626854530185685, + "grad_norm": 3.7094917111641514, + "learning_rate": 0.0002982903634093338, + "loss": 7.1328, + "step": 4997 + }, + { + "epoch": 0.46636185499673416, + "grad_norm": 87363.46094805331, + "learning_rate": 0.0002982892244943824, + "loss": 7.2304, + "step": 4998 + }, + { + "epoch": 0.46645516469161147, + "grad_norm": 1605688.928847391, + "learning_rate": 0.0002982880852023755, + "loss": 7.3481, + "step": 4999 + }, + { + "epoch": 0.4665484743864888, + "grad_norm": 2.6985976795878455, + "learning_rate": 0.0002982869455333159, + "loss": 7.0787, + "step": 5000 + }, + { + "epoch": 0.46664178408136603, + "grad_norm": 14833.612301653062, + "learning_rate": 0.0002982858054872065, + "loss": 7.3554, + "step": 5001 + }, + { + "epoch": 0.46673509377624334, + "grad_norm": 23.41245734801036, + "learning_rate": 0.0002982846650640503, + "loss": 7.3794, + "step": 5002 + }, + { + "epoch": 0.46682840347112065, + "grad_norm": 8.895648672750541, + "learning_rate": 0.00029828352426385004, + "loss": 7.5028, + "step": 5003 + }, + { + "epoch": 0.46692171316599795, + "grad_norm": 106367800.29115531, + "learning_rate": 0.0002982823830866087, + "loss": 7.7826, + "step": 5004 + }, + { + "epoch": 0.46701502286087526, + "grad_norm": 17.33335955408663, + "learning_rate": 0.00029828124153232925, + "loss": 7.812, + "step": 5005 + }, + { + "epoch": 0.46710833255575257, + "grad_norm": 4.057062774225269, + "learning_rate": 0.00029828009960101444, + "loss": 7.5435, + "step": 5006 + }, + { + "epoch": 0.4672016422506298, + "grad_norm": 2.7532285324391172, + "learning_rate": 0.0002982789572926673, + "loss": 7.5074, + "step": 5007 + }, + { + "epoch": 0.46729495194550713, + "grad_norm": 6.738582161394691, + "learning_rate": 0.00029827781460729075, + "loss": 7.9741, + "step": 5008 + }, + { + "epoch": 0.46738826164038444, + "grad_norm": 7.084121562789856, + "learning_rate": 0.00029827667154488756, + "loss": 7.4206, + "step": 5009 + }, + { + "epoch": 0.46748157133526175, + "grad_norm": 5.583116393540679, + "learning_rate": 0.00029827552810546076, + "loss": 7.4901, + "step": 5010 + }, + { + "epoch": 0.46757488103013906, + "grad_norm": 3.0092668262844073, + "learning_rate": 0.0002982743842890132, + "loss": 7.592, + "step": 5011 + }, + { + "epoch": 0.4676681907250163, + "grad_norm": 3.343766059473866, + "learning_rate": 0.0002982732400955478, + "loss": 7.6923, + "step": 5012 + }, + { + "epoch": 0.4677615004198936, + "grad_norm": 4.851325500042696, + "learning_rate": 0.0002982720955250675, + "loss": 7.551, + "step": 5013 + }, + { + "epoch": 0.4678548101147709, + "grad_norm": 5.873484772459893, + "learning_rate": 0.00029827095057757515, + "loss": 7.6024, + "step": 5014 + }, + { + "epoch": 0.46794811980964823, + "grad_norm": 3.538373737263464, + "learning_rate": 0.00029826980525307373, + "loss": 7.8916, + "step": 5015 + }, + { + "epoch": 0.46804142950452554, + "grad_norm": 3.261456044677215, + "learning_rate": 0.00029826865955156606, + "loss": 7.3762, + "step": 5016 + }, + { + "epoch": 0.4681347391994028, + "grad_norm": 189585.38433228162, + "learning_rate": 0.00029826751347305515, + "loss": 8.004, + "step": 5017 + }, + { + "epoch": 0.4682280488942801, + "grad_norm": 4.201952708739109, + "learning_rate": 0.00029826636701754385, + "loss": 7.5584, + "step": 5018 + }, + { + "epoch": 0.4683213585891574, + "grad_norm": 353709298.0641201, + "learning_rate": 0.0002982652201850351, + "loss": 7.5381, + "step": 5019 + }, + { + "epoch": 0.4684146682840347, + "grad_norm": 57758860155.759125, + "learning_rate": 0.0002982640729755319, + "loss": 7.6256, + "step": 5020 + }, + { + "epoch": 0.468507977978912, + "grad_norm": 6.765294786912438, + "learning_rate": 0.000298262925389037, + "loss": 7.7065, + "step": 5021 + }, + { + "epoch": 0.4686012876737893, + "grad_norm": 5.492349417716848, + "learning_rate": 0.00029826177742555337, + "loss": 7.828, + "step": 5022 + }, + { + "epoch": 0.4686945973686666, + "grad_norm": 7.551562847212036, + "learning_rate": 0.000298260629085084, + "loss": 7.7368, + "step": 5023 + }, + { + "epoch": 0.4687879070635439, + "grad_norm": 6.4103221451613415, + "learning_rate": 0.0002982594803676318, + "loss": 7.8077, + "step": 5024 + }, + { + "epoch": 0.4688812167584212, + "grad_norm": 15.255593347498131, + "learning_rate": 0.0002982583312731996, + "loss": 7.8179, + "step": 5025 + }, + { + "epoch": 0.4689745264532985, + "grad_norm": 1230269214885104.5, + "learning_rate": 0.0002982571818017904, + "loss": 7.2314, + "step": 5026 + }, + { + "epoch": 0.4690678361481758, + "grad_norm": 3.3340031730589197, + "learning_rate": 0.00029825603195340715, + "loss": 7.8349, + "step": 5027 + }, + { + "epoch": 0.4691611458430531, + "grad_norm": 5120855875444.942, + "learning_rate": 0.0002982548817280526, + "loss": 7.3431, + "step": 5028 + }, + { + "epoch": 0.4692544555379304, + "grad_norm": 622389644957891.8, + "learning_rate": 0.00029825373112572995, + "loss": 7.3726, + "step": 5029 + }, + { + "epoch": 0.4693477652328077, + "grad_norm": 3.4575113013188714, + "learning_rate": 0.0002982525801464419, + "loss": 7.432, + "step": 5030 + }, + { + "epoch": 0.469441074927685, + "grad_norm": 2.535339104944334, + "learning_rate": 0.0002982514287901915, + "loss": 7.6767, + "step": 5031 + }, + { + "epoch": 0.4695343846225623, + "grad_norm": 7.19855301541784, + "learning_rate": 0.0002982502770569816, + "loss": 7.5681, + "step": 5032 + }, + { + "epoch": 0.46962769431743956, + "grad_norm": 5.780540399945457, + "learning_rate": 0.0002982491249468151, + "loss": 7.6868, + "step": 5033 + }, + { + "epoch": 0.46972100401231687, + "grad_norm": 129.75799485383934, + "learning_rate": 0.0002982479724596951, + "loss": 7.6903, + "step": 5034 + }, + { + "epoch": 0.4698143137071942, + "grad_norm": 9.091918683676257, + "learning_rate": 0.0002982468195956244, + "loss": 7.8169, + "step": 5035 + }, + { + "epoch": 0.4699076234020715, + "grad_norm": 19.61468638831002, + "learning_rate": 0.00029824566635460587, + "loss": 7.8658, + "step": 5036 + }, + { + "epoch": 0.4700009330969488, + "grad_norm": 7.527168328680452, + "learning_rate": 0.0002982445127366426, + "loss": 7.6904, + "step": 5037 + }, + { + "epoch": 0.47009424279182604, + "grad_norm": 3.68364625966224, + "learning_rate": 0.00029824335874173744, + "loss": 7.6518, + "step": 5038 + }, + { + "epoch": 0.47018755248670335, + "grad_norm": 2.8511144804896533, + "learning_rate": 0.0002982422043698933, + "loss": 7.2373, + "step": 5039 + }, + { + "epoch": 0.47028086218158066, + "grad_norm": 5.485487552135906, + "learning_rate": 0.00029824104962111313, + "loss": 7.6672, + "step": 5040 + }, + { + "epoch": 0.47037417187645797, + "grad_norm": 2.823909538673522, + "learning_rate": 0.00029823989449539995, + "loss": 7.7015, + "step": 5041 + }, + { + "epoch": 0.4704674815713353, + "grad_norm": 28.795121254218504, + "learning_rate": 0.0002982387389927566, + "loss": 7.3299, + "step": 5042 + }, + { + "epoch": 0.4705607912662126, + "grad_norm": 3.108156791096897, + "learning_rate": 0.000298237583113186, + "loss": 7.5194, + "step": 5043 + }, + { + "epoch": 0.47065410096108984, + "grad_norm": 2.441341975282072, + "learning_rate": 0.0002982364268566912, + "loss": 7.0478, + "step": 5044 + }, + { + "epoch": 0.47074741065596715, + "grad_norm": 33.1063109016743, + "learning_rate": 0.00029823527022327506, + "loss": 7.4452, + "step": 5045 + }, + { + "epoch": 0.47084072035084445, + "grad_norm": 5.134288323671176, + "learning_rate": 0.0002982341132129406, + "loss": 7.4202, + "step": 5046 + }, + { + "epoch": 0.47093403004572176, + "grad_norm": 3.0840717570563103, + "learning_rate": 0.00029823295582569057, + "loss": 7.213, + "step": 5047 + }, + { + "epoch": 0.47102733974059907, + "grad_norm": 2.889408041130914, + "learning_rate": 0.00029823179806152814, + "loss": 7.4241, + "step": 5048 + }, + { + "epoch": 0.4711206494354763, + "grad_norm": 5640451651792954.0, + "learning_rate": 0.0002982306399204561, + "loss": 7.7143, + "step": 5049 + }, + { + "epoch": 0.47121395913035363, + "grad_norm": 1.1166427685326532e+16, + "learning_rate": 0.0002982294814024775, + "loss": 7.5626, + "step": 5050 + }, + { + "epoch": 0.47130726882523094, + "grad_norm": 7.451369299283056, + "learning_rate": 0.0002982283225075953, + "loss": 7.811, + "step": 5051 + }, + { + "epoch": 0.47140057852010825, + "grad_norm": 2.753710566240479, + "learning_rate": 0.0002982271632358122, + "loss": 7.6243, + "step": 5052 + }, + { + "epoch": 0.47149388821498556, + "grad_norm": 4.54114368680455, + "learning_rate": 0.0002982260035871315, + "loss": 7.4965, + "step": 5053 + }, + { + "epoch": 0.4715871979098628, + "grad_norm": 3.5095416240168404, + "learning_rate": 0.0002982248435615559, + "loss": 7.4407, + "step": 5054 + }, + { + "epoch": 0.4716805076047401, + "grad_norm": 8933984380110.266, + "learning_rate": 0.0002982236831590884, + "loss": 7.5109, + "step": 5055 + }, + { + "epoch": 0.4717738172996174, + "grad_norm": 4.445615484001498, + "learning_rate": 0.00029822252237973203, + "loss": 7.188, + "step": 5056 + }, + { + "epoch": 0.47186712699449473, + "grad_norm": 10.170821263501894, + "learning_rate": 0.0002982213612234897, + "loss": 7.3504, + "step": 5057 + }, + { + "epoch": 0.47196043668937204, + "grad_norm": 7114326134.017593, + "learning_rate": 0.00029822019969036437, + "loss": 7.2607, + "step": 5058 + }, + { + "epoch": 0.47205374638424935, + "grad_norm": 2.537192820544298, + "learning_rate": 0.00029821903778035893, + "loss": 7.6145, + "step": 5059 + }, + { + "epoch": 0.4721470560791266, + "grad_norm": 21653452653.28671, + "learning_rate": 0.0002982178754934764, + "loss": 7.3765, + "step": 5060 + }, + { + "epoch": 0.4722403657740039, + "grad_norm": 3.040469618958706, + "learning_rate": 0.00029821671282971975, + "loss": 7.2571, + "step": 5061 + }, + { + "epoch": 0.4723336754688812, + "grad_norm": 4.84643788399825, + "learning_rate": 0.00029821554978909186, + "loss": 7.2414, + "step": 5062 + }, + { + "epoch": 0.47242698516375853, + "grad_norm": 4.795733914681581, + "learning_rate": 0.0002982143863715958, + "loss": 7.2933, + "step": 5063 + }, + { + "epoch": 0.47252029485863584, + "grad_norm": 3.3408111906929956, + "learning_rate": 0.0002982132225772344, + "loss": 7.3118, + "step": 5064 + }, + { + "epoch": 0.4726136045535131, + "grad_norm": 2.964266819343061, + "learning_rate": 0.00029821205840601067, + "loss": 7.233, + "step": 5065 + }, + { + "epoch": 0.4727069142483904, + "grad_norm": 4.125184017066435, + "learning_rate": 0.0002982108938579276, + "loss": 7.1239, + "step": 5066 + }, + { + "epoch": 0.4728002239432677, + "grad_norm": 3.2592697551628196, + "learning_rate": 0.00029820972893298815, + "loss": 7.3945, + "step": 5067 + }, + { + "epoch": 0.472893533638145, + "grad_norm": 3.0687559065657557, + "learning_rate": 0.0002982085636311952, + "loss": 7.4704, + "step": 5068 + }, + { + "epoch": 0.4729868433330223, + "grad_norm": 1.530881107272883, + "learning_rate": 0.0002982073979525518, + "loss": 7.349, + "step": 5069 + }, + { + "epoch": 0.4730801530278996, + "grad_norm": 172154566154.7955, + "learning_rate": 0.0002982062318970609, + "loss": 7.2082, + "step": 5070 + }, + { + "epoch": 0.4731734627227769, + "grad_norm": 79232919118.15002, + "learning_rate": 0.0002982050654647255, + "loss": 7.3819, + "step": 5071 + }, + { + "epoch": 0.4732667724176542, + "grad_norm": 2.426663409378487, + "learning_rate": 0.00029820389865554844, + "loss": 7.1736, + "step": 5072 + }, + { + "epoch": 0.4733600821125315, + "grad_norm": 2.6073846311943134, + "learning_rate": 0.00029820273146953285, + "loss": 7.2531, + "step": 5073 + }, + { + "epoch": 0.4734533918074088, + "grad_norm": 6.371610425236423, + "learning_rate": 0.00029820156390668155, + "loss": 7.9426, + "step": 5074 + }, + { + "epoch": 0.47354670150228606, + "grad_norm": 30368836.124003846, + "learning_rate": 0.00029820039596699756, + "loss": 7.2114, + "step": 5075 + }, + { + "epoch": 0.47364001119716337, + "grad_norm": 2.72359322100784, + "learning_rate": 0.00029819922765048394, + "loss": 7.2965, + "step": 5076 + }, + { + "epoch": 0.4737333208920407, + "grad_norm": 3.284640411728044, + "learning_rate": 0.00029819805895714354, + "loss": 7.1973, + "step": 5077 + }, + { + "epoch": 0.473826630586918, + "grad_norm": 2.40777169185485, + "learning_rate": 0.0002981968898869794, + "loss": 7.3946, + "step": 5078 + }, + { + "epoch": 0.4739199402817953, + "grad_norm": 4294702301.528205, + "learning_rate": 0.0002981957204399944, + "loss": 7.0641, + "step": 5079 + }, + { + "epoch": 0.4740132499766726, + "grad_norm": 3.180699572155655, + "learning_rate": 0.00029819455061619164, + "loss": 7.3619, + "step": 5080 + }, + { + "epoch": 0.47410655967154985, + "grad_norm": 2.6059669243170105, + "learning_rate": 0.000298193380415574, + "loss": 7.3601, + "step": 5081 + }, + { + "epoch": 0.47419986936642716, + "grad_norm": 2.0521284425347694, + "learning_rate": 0.0002981922098381445, + "loss": 7.1707, + "step": 5082 + }, + { + "epoch": 0.47429317906130447, + "grad_norm": 2.434909933372139, + "learning_rate": 0.00029819103888390614, + "loss": 7.2728, + "step": 5083 + }, + { + "epoch": 0.4743864887561818, + "grad_norm": 34620001.745196775, + "learning_rate": 0.0002981898675528618, + "loss": 7.149, + "step": 5084 + }, + { + "epoch": 0.4744797984510591, + "grad_norm": 4.930513443936311, + "learning_rate": 0.00029818869584501465, + "loss": 7.2451, + "step": 5085 + }, + { + "epoch": 0.47457310814593634, + "grad_norm": 2.872052287101981, + "learning_rate": 0.00029818752376036743, + "loss": 7.2298, + "step": 5086 + }, + { + "epoch": 0.47466641784081365, + "grad_norm": 2.833830738996039, + "learning_rate": 0.0002981863512989233, + "loss": 7.154, + "step": 5087 + }, + { + "epoch": 0.47475972753569096, + "grad_norm": 1.877274221259927, + "learning_rate": 0.0002981851784606851, + "loss": 6.8436, + "step": 5088 + }, + { + "epoch": 0.47485303723056826, + "grad_norm": 56248694945.459236, + "learning_rate": 0.00029818400524565596, + "loss": 7.1901, + "step": 5089 + }, + { + "epoch": 0.4749463469254456, + "grad_norm": 36691631.202680096, + "learning_rate": 0.00029818283165383876, + "loss": 7.2326, + "step": 5090 + }, + { + "epoch": 0.4750396566203228, + "grad_norm": 36243341029.63689, + "learning_rate": 0.0002981816576852365, + "loss": 6.9996, + "step": 5091 + }, + { + "epoch": 0.47513296631520013, + "grad_norm": 4.201353935232238, + "learning_rate": 0.0002981804833398522, + "loss": 7.3453, + "step": 5092 + }, + { + "epoch": 0.47522627601007744, + "grad_norm": 1.8247864251997061, + "learning_rate": 0.00029817930861768884, + "loss": 7.4065, + "step": 5093 + }, + { + "epoch": 0.47531958570495475, + "grad_norm": 7.101684326481297, + "learning_rate": 0.00029817813351874935, + "loss": 7.5369, + "step": 5094 + }, + { + "epoch": 0.47541289539983206, + "grad_norm": 4.51822028427469, + "learning_rate": 0.00029817695804303676, + "loss": 7.2304, + "step": 5095 + }, + { + "epoch": 0.47550620509470937, + "grad_norm": 3.4521296300031787, + "learning_rate": 0.0002981757821905541, + "loss": 7.1553, + "step": 5096 + }, + { + "epoch": 0.4755995147895866, + "grad_norm": 4.644488462281108, + "learning_rate": 0.0002981746059613043, + "loss": 7.1633, + "step": 5097 + }, + { + "epoch": 0.4756928244844639, + "grad_norm": 3.487790597065913, + "learning_rate": 0.0002981734293552904, + "loss": 7.1051, + "step": 5098 + }, + { + "epoch": 0.47578613417934124, + "grad_norm": 2.4669556291055246, + "learning_rate": 0.0002981722523725153, + "loss": 7.2604, + "step": 5099 + }, + { + "epoch": 0.47587944387421854, + "grad_norm": 2.7515889817997836, + "learning_rate": 0.00029817107501298206, + "loss": 7.6412, + "step": 5100 + }, + { + "epoch": 0.47597275356909585, + "grad_norm": 1.8696262634121579, + "learning_rate": 0.00029816989727669374, + "loss": 7.3342, + "step": 5101 + }, + { + "epoch": 0.4760660632639731, + "grad_norm": 1.6166733178377501, + "learning_rate": 0.0002981687191636532, + "loss": 6.9374, + "step": 5102 + }, + { + "epoch": 0.4761593729588504, + "grad_norm": 3.082902757037187, + "learning_rate": 0.0002981675406738635, + "loss": 7.1845, + "step": 5103 + }, + { + "epoch": 0.4762526826537277, + "grad_norm": 9.179494591548144, + "learning_rate": 0.00029816636180732763, + "loss": 7.1278, + "step": 5104 + }, + { + "epoch": 0.47634599234860503, + "grad_norm": 6.809139852442443, + "learning_rate": 0.0002981651825640486, + "loss": 7.5812, + "step": 5105 + }, + { + "epoch": 0.47643930204348234, + "grad_norm": 2.4781128735679046, + "learning_rate": 0.0002981640029440294, + "loss": 7.2273, + "step": 5106 + }, + { + "epoch": 0.4765326117383596, + "grad_norm": 3.7679686318221046, + "learning_rate": 0.000298162822947273, + "loss": 7.0333, + "step": 5107 + }, + { + "epoch": 0.4766259214332369, + "grad_norm": 161947397498.12766, + "learning_rate": 0.0002981616425737825, + "loss": 7.0171, + "step": 5108 + }, + { + "epoch": 0.4767192311281142, + "grad_norm": 2.594292864684604, + "learning_rate": 0.00029816046182356084, + "loss": 7.2658, + "step": 5109 + }, + { + "epoch": 0.4768125408229915, + "grad_norm": 2.429204936396782, + "learning_rate": 0.000298159280696611, + "loss": 7.3274, + "step": 5110 + }, + { + "epoch": 0.4769058505178688, + "grad_norm": 2.8698655989663746, + "learning_rate": 0.00029815809919293596, + "loss": 7.0667, + "step": 5111 + }, + { + "epoch": 0.47699916021274613, + "grad_norm": 3.354745097650775, + "learning_rate": 0.00029815691731253875, + "loss": 7.5032, + "step": 5112 + }, + { + "epoch": 0.4770924699076234, + "grad_norm": 2.4034639471993593, + "learning_rate": 0.0002981557350554224, + "loss": 7.4393, + "step": 5113 + }, + { + "epoch": 0.4771857796025007, + "grad_norm": 3.914759808701611, + "learning_rate": 0.00029815455242158994, + "loss": 7.186, + "step": 5114 + }, + { + "epoch": 0.477279089297378, + "grad_norm": 3.6368317533576633, + "learning_rate": 0.0002981533694110443, + "loss": 7.3073, + "step": 5115 + }, + { + "epoch": 0.4773723989922553, + "grad_norm": 17.435082375746966, + "learning_rate": 0.00029815218602378854, + "loss": 7.4326, + "step": 5116 + }, + { + "epoch": 0.4774657086871326, + "grad_norm": 3.2111799727272574, + "learning_rate": 0.00029815100225982565, + "loss": 7.0591, + "step": 5117 + }, + { + "epoch": 0.47755901838200987, + "grad_norm": 1.5317288936426954, + "learning_rate": 0.0002981498181191587, + "loss": 7.0527, + "step": 5118 + }, + { + "epoch": 0.4776523280768872, + "grad_norm": 4.6110064373801825, + "learning_rate": 0.0002981486336017906, + "loss": 6.9253, + "step": 5119 + }, + { + "epoch": 0.4777456377717645, + "grad_norm": 2.3203635621287515, + "learning_rate": 0.0002981474487077244, + "loss": 7.5036, + "step": 5120 + }, + { + "epoch": 0.4778389474666418, + "grad_norm": 1.6353511081955403, + "learning_rate": 0.0002981462634369631, + "loss": 7.23, + "step": 5121 + }, + { + "epoch": 0.4779322571615191, + "grad_norm": 7.699737534176782, + "learning_rate": 0.0002981450777895098, + "loss": 7.3729, + "step": 5122 + }, + { + "epoch": 0.47802556685639636, + "grad_norm": 4.6737925472366655, + "learning_rate": 0.0002981438917653674, + "loss": 7.2307, + "step": 5123 + }, + { + "epoch": 0.47811887655127366, + "grad_norm": 2.190306669729915, + "learning_rate": 0.00029814270536453896, + "loss": 7.1478, + "step": 5124 + }, + { + "epoch": 0.47821218624615097, + "grad_norm": 2.6995349719666093, + "learning_rate": 0.00029814151858702757, + "loss": 7.1031, + "step": 5125 + }, + { + "epoch": 0.4783054959410283, + "grad_norm": 2.1886784741006022, + "learning_rate": 0.00029814033143283616, + "loss": 7.3039, + "step": 5126 + }, + { + "epoch": 0.4783988056359056, + "grad_norm": 1.6192913150058237, + "learning_rate": 0.0002981391439019677, + "loss": 7.31, + "step": 5127 + }, + { + "epoch": 0.47849211533078284, + "grad_norm": 1.7237896420718413, + "learning_rate": 0.00029813795599442535, + "loss": 6.921, + "step": 5128 + }, + { + "epoch": 0.47858542502566015, + "grad_norm": 1.903667866677483, + "learning_rate": 0.00029813676771021203, + "loss": 7.3984, + "step": 5129 + }, + { + "epoch": 0.47867873472053746, + "grad_norm": 1.5562174196007414, + "learning_rate": 0.00029813557904933083, + "loss": 7.4345, + "step": 5130 + }, + { + "epoch": 0.47877204441541477, + "grad_norm": 1.5434169588688362, + "learning_rate": 0.00029813439001178467, + "loss": 7.0536, + "step": 5131 + }, + { + "epoch": 0.4788653541102921, + "grad_norm": 5.566291094992747, + "learning_rate": 0.0002981332005975767, + "loss": 6.9471, + "step": 5132 + }, + { + "epoch": 0.4789586638051694, + "grad_norm": 3.038615583548446, + "learning_rate": 0.00029813201080670985, + "loss": 7.2796, + "step": 5133 + }, + { + "epoch": 0.47905197350004664, + "grad_norm": 3.192600940439201, + "learning_rate": 0.00029813082063918716, + "loss": 7.528, + "step": 5134 + }, + { + "epoch": 0.47914528319492394, + "grad_norm": 3.4208500305807097, + "learning_rate": 0.0002981296300950117, + "loss": 7.3123, + "step": 5135 + }, + { + "epoch": 0.47923859288980125, + "grad_norm": 1.9860598257693256, + "learning_rate": 0.0002981284391741864, + "loss": 6.996, + "step": 5136 + }, + { + "epoch": 0.47933190258467856, + "grad_norm": 1.8433691143966864, + "learning_rate": 0.00029812724787671446, + "loss": 6.741, + "step": 5137 + }, + { + "epoch": 0.47942521227955587, + "grad_norm": 3.0129518014872962, + "learning_rate": 0.0002981260562025988, + "loss": 7.2026, + "step": 5138 + }, + { + "epoch": 0.4795185219744331, + "grad_norm": 24.68897126280131, + "learning_rate": 0.0002981248641518424, + "loss": 7.4392, + "step": 5139 + }, + { + "epoch": 0.47961183166931043, + "grad_norm": 1.8621483538003014, + "learning_rate": 0.0002981236717244484, + "loss": 7.2363, + "step": 5140 + }, + { + "epoch": 0.47970514136418774, + "grad_norm": 1.9449868009574405, + "learning_rate": 0.00029812247892041976, + "loss": 7.1461, + "step": 5141 + }, + { + "epoch": 0.47979845105906505, + "grad_norm": 5835642913.274269, + "learning_rate": 0.00029812128573975953, + "loss": 7.0117, + "step": 5142 + }, + { + "epoch": 0.47989176075394235, + "grad_norm": 2.254092937866946, + "learning_rate": 0.0002981200921824707, + "loss": 7.1393, + "step": 5143 + }, + { + "epoch": 0.4799850704488196, + "grad_norm": 376171374326.34863, + "learning_rate": 0.0002981188982485564, + "loss": 7.4119, + "step": 5144 + }, + { + "epoch": 0.4800783801436969, + "grad_norm": 83224947964.8161, + "learning_rate": 0.0002981177039380196, + "loss": 6.9126, + "step": 5145 + }, + { + "epoch": 0.4801716898385742, + "grad_norm": 2.83098484837123, + "learning_rate": 0.0002981165092508634, + "loss": 7.1681, + "step": 5146 + }, + { + "epoch": 0.48026499953345153, + "grad_norm": 3092889834.23538, + "learning_rate": 0.00029811531418709073, + "loss": 7.1385, + "step": 5147 + }, + { + "epoch": 0.48035830922832884, + "grad_norm": 2.2588161285573043, + "learning_rate": 0.00029811411874670473, + "loss": 7.0971, + "step": 5148 + }, + { + "epoch": 0.48045161892320615, + "grad_norm": 3.8063874743115464, + "learning_rate": 0.00029811292292970836, + "loss": 7.5291, + "step": 5149 + }, + { + "epoch": 0.4805449286180834, + "grad_norm": 4.771378619965249, + "learning_rate": 0.00029811172673610477, + "loss": 7.0898, + "step": 5150 + }, + { + "epoch": 0.4806382383129607, + "grad_norm": 3.247100383930365, + "learning_rate": 0.0002981105301658969, + "loss": 7.3304, + "step": 5151 + }, + { + "epoch": 0.480731548007838, + "grad_norm": 3.626324219194193, + "learning_rate": 0.0002981093332190878, + "loss": 7.1819, + "step": 5152 + }, + { + "epoch": 0.4808248577027153, + "grad_norm": 3.1652740709675333, + "learning_rate": 0.0002981081358956806, + "loss": 7.1358, + "step": 5153 + }, + { + "epoch": 0.48091816739759263, + "grad_norm": 4.556453005708733, + "learning_rate": 0.00029810693819567823, + "loss": 7.2898, + "step": 5154 + }, + { + "epoch": 0.4810114770924699, + "grad_norm": 15.567230768187692, + "learning_rate": 0.0002981057401190838, + "loss": 7.4477, + "step": 5155 + }, + { + "epoch": 0.4811047867873472, + "grad_norm": 2.290699239439125, + "learning_rate": 0.00029810454166590033, + "loss": 7.4582, + "step": 5156 + }, + { + "epoch": 0.4811980964822245, + "grad_norm": 5.7394303646414455, + "learning_rate": 0.00029810334283613093, + "loss": 7.1504, + "step": 5157 + }, + { + "epoch": 0.4812914061771018, + "grad_norm": 5.747390759148776, + "learning_rate": 0.00029810214362977855, + "loss": 7.1973, + "step": 5158 + }, + { + "epoch": 0.4813847158719791, + "grad_norm": 480.45042688473444, + "learning_rate": 0.0002981009440468463, + "loss": 7.1475, + "step": 5159 + }, + { + "epoch": 0.48147802556685637, + "grad_norm": 30.75897424436535, + "learning_rate": 0.00029809974408733723, + "loss": 7.1216, + "step": 5160 + }, + { + "epoch": 0.4815713352617337, + "grad_norm": 3.2115732440439486, + "learning_rate": 0.00029809854375125437, + "loss": 7.0232, + "step": 5161 + }, + { + "epoch": 0.481664644956611, + "grad_norm": 3.6361632205623917, + "learning_rate": 0.0002980973430386008, + "loss": 7.2513, + "step": 5162 + }, + { + "epoch": 0.4817579546514883, + "grad_norm": 5.0210495124227625, + "learning_rate": 0.0002980961419493796, + "loss": 7.4483, + "step": 5163 + }, + { + "epoch": 0.4818512643463656, + "grad_norm": 6.866361545955033, + "learning_rate": 0.0002980949404835937, + "loss": 6.7792, + "step": 5164 + }, + { + "epoch": 0.4819445740412429, + "grad_norm": 6.474417116511548, + "learning_rate": 0.0002980937386412463, + "loss": 7.3871, + "step": 5165 + }, + { + "epoch": 0.48203788373612017, + "grad_norm": 7.3876175162860465, + "learning_rate": 0.0002980925364223403, + "loss": 7.1184, + "step": 5166 + }, + { + "epoch": 0.4821311934309975, + "grad_norm": 3.337716136068199, + "learning_rate": 0.00029809133382687896, + "loss": 6.9692, + "step": 5167 + }, + { + "epoch": 0.4822245031258748, + "grad_norm": 48777976004.07005, + "learning_rate": 0.0002980901308548651, + "loss": 7.3558, + "step": 5168 + }, + { + "epoch": 0.4823178128207521, + "grad_norm": 12.987745914202534, + "learning_rate": 0.000298088927506302, + "loss": 7.335, + "step": 5169 + }, + { + "epoch": 0.4824111225156294, + "grad_norm": 5.284069003452433, + "learning_rate": 0.00029808772378119263, + "loss": 7.3247, + "step": 5170 + }, + { + "epoch": 0.48250443221050665, + "grad_norm": 10.591905229920156, + "learning_rate": 0.00029808651967954005, + "loss": 7.1532, + "step": 5171 + }, + { + "epoch": 0.48259774190538396, + "grad_norm": 11948.121703717283, + "learning_rate": 0.00029808531520134724, + "loss": 7.6162, + "step": 5172 + }, + { + "epoch": 0.48269105160026127, + "grad_norm": 54.28833959270344, + "learning_rate": 0.00029808411034661737, + "loss": 7.2075, + "step": 5173 + }, + { + "epoch": 0.4827843612951386, + "grad_norm": 64.10589554462518, + "learning_rate": 0.0002980829051153535, + "loss": 7.485, + "step": 5174 + }, + { + "epoch": 0.4828776709900159, + "grad_norm": 8.207574720296979, + "learning_rate": 0.00029808169950755866, + "loss": 7.1692, + "step": 5175 + }, + { + "epoch": 0.48297098068489314, + "grad_norm": 1963387.2383752908, + "learning_rate": 0.00029808049352323594, + "loss": 7.5013, + "step": 5176 + }, + { + "epoch": 0.48306429037977044, + "grad_norm": 92.79751044038494, + "learning_rate": 0.0002980792871623883, + "loss": 7.565, + "step": 5177 + }, + { + "epoch": 0.48315760007464775, + "grad_norm": 70.09013652995907, + "learning_rate": 0.00029807808042501904, + "loss": 7.5102, + "step": 5178 + }, + { + "epoch": 0.48325090976952506, + "grad_norm": 8.793136458619063, + "learning_rate": 0.00029807687331113097, + "loss": 7.2581, + "step": 5179 + }, + { + "epoch": 0.48334421946440237, + "grad_norm": 110.28352722179609, + "learning_rate": 0.0002980756658207273, + "loss": 7.6654, + "step": 5180 + }, + { + "epoch": 0.4834375291592796, + "grad_norm": 12.194382107615132, + "learning_rate": 0.0002980744579538111, + "loss": 7.226, + "step": 5181 + }, + { + "epoch": 0.48353083885415693, + "grad_norm": 113.10916209600698, + "learning_rate": 0.0002980732497103854, + "loss": 7.4372, + "step": 5182 + }, + { + "epoch": 0.48362414854903424, + "grad_norm": 103.29835144540377, + "learning_rate": 0.0002980720410904533, + "loss": 7.4331, + "step": 5183 + }, + { + "epoch": 0.48371745824391155, + "grad_norm": 7.9336828975847355, + "learning_rate": 0.00029807083209401787, + "loss": 7.1912, + "step": 5184 + }, + { + "epoch": 0.48381076793878885, + "grad_norm": 109.57241400804244, + "learning_rate": 0.00029806962272108215, + "loss": 7.2308, + "step": 5185 + }, + { + "epoch": 0.48390407763366616, + "grad_norm": 2129.5351662064786, + "learning_rate": 0.00029806841297164925, + "loss": 7.0837, + "step": 5186 + }, + { + "epoch": 0.4839973873285434, + "grad_norm": 15.465936338547493, + "learning_rate": 0.00029806720284572227, + "loss": 7.4126, + "step": 5187 + }, + { + "epoch": 0.4840906970234207, + "grad_norm": 27.62057151431949, + "learning_rate": 0.0002980659923433042, + "loss": 7.6079, + "step": 5188 + }, + { + "epoch": 0.48418400671829803, + "grad_norm": 16.336912322479478, + "learning_rate": 0.0002980647814643982, + "loss": 7.3769, + "step": 5189 + }, + { + "epoch": 0.48427731641317534, + "grad_norm": 6.198379312833082, + "learning_rate": 0.0002980635702090073, + "loss": 7.6087, + "step": 5190 + }, + { + "epoch": 0.48437062610805265, + "grad_norm": 13.954334476866604, + "learning_rate": 0.0002980623585771346, + "loss": 7.4404, + "step": 5191 + }, + { + "epoch": 0.4844639358029299, + "grad_norm": 6.6271004496963615, + "learning_rate": 0.0002980611465687832, + "loss": 7.0335, + "step": 5192 + }, + { + "epoch": 0.4845572454978072, + "grad_norm": 6.515735601386028, + "learning_rate": 0.0002980599341839562, + "loss": 7.378, + "step": 5193 + }, + { + "epoch": 0.4846505551926845, + "grad_norm": 12.184960169085715, + "learning_rate": 0.0002980587214226566, + "loss": 7.2155, + "step": 5194 + }, + { + "epoch": 0.4847438648875618, + "grad_norm": 185801.46420585713, + "learning_rate": 0.0002980575082848875, + "loss": 7.1722, + "step": 5195 + }, + { + "epoch": 0.48483717458243913, + "grad_norm": 321261.2933542172, + "learning_rate": 0.00029805629477065205, + "loss": 7.3439, + "step": 5196 + }, + { + "epoch": 0.4849304842773164, + "grad_norm": 8.415934930050614, + "learning_rate": 0.00029805508087995327, + "loss": 7.4256, + "step": 5197 + }, + { + "epoch": 0.4850237939721937, + "grad_norm": 4.742141719642352, + "learning_rate": 0.00029805386661279433, + "loss": 7.4809, + "step": 5198 + }, + { + "epoch": 0.485117103667071, + "grad_norm": 43.32503984240332, + "learning_rate": 0.00029805265196917823, + "loss": 7.2903, + "step": 5199 + }, + { + "epoch": 0.4852104133619483, + "grad_norm": 10.966187806861857, + "learning_rate": 0.00029805143694910816, + "loss": 7.4021, + "step": 5200 + }, + { + "epoch": 0.4853037230568256, + "grad_norm": 35.13860647166791, + "learning_rate": 0.00029805022155258705, + "loss": 7.5891, + "step": 5201 + }, + { + "epoch": 0.48539703275170293, + "grad_norm": 25.497219641097683, + "learning_rate": 0.00029804900577961815, + "loss": 7.3191, + "step": 5202 + }, + { + "epoch": 0.4854903424465802, + "grad_norm": 4.994858479795526, + "learning_rate": 0.00029804778963020445, + "loss": 7.7923, + "step": 5203 + }, + { + "epoch": 0.4855836521414575, + "grad_norm": 3.6721359609845017, + "learning_rate": 0.0002980465731043491, + "loss": 7.3784, + "step": 5204 + }, + { + "epoch": 0.4856769618363348, + "grad_norm": 3.2046993951445772, + "learning_rate": 0.0002980453562020551, + "loss": 7.5028, + "step": 5205 + }, + { + "epoch": 0.4857702715312121, + "grad_norm": 17.325693304563316, + "learning_rate": 0.00029804413892332566, + "loss": 7.3643, + "step": 5206 + }, + { + "epoch": 0.4858635812260894, + "grad_norm": 16.44666735502411, + "learning_rate": 0.00029804292126816385, + "loss": 7.383, + "step": 5207 + }, + { + "epoch": 0.48595689092096667, + "grad_norm": 17.217629879128143, + "learning_rate": 0.0002980417032365727, + "loss": 7.5138, + "step": 5208 + }, + { + "epoch": 0.486050200615844, + "grad_norm": 49.18863006043412, + "learning_rate": 0.00029804048482855535, + "loss": 7.0969, + "step": 5209 + }, + { + "epoch": 0.4861435103107213, + "grad_norm": 362.6221748785238, + "learning_rate": 0.00029803926604411496, + "loss": 7.2467, + "step": 5210 + }, + { + "epoch": 0.4862368200055986, + "grad_norm": 117.6946518834278, + "learning_rate": 0.00029803804688325453, + "loss": 7.3117, + "step": 5211 + }, + { + "epoch": 0.4863301297004759, + "grad_norm": 490.187940758864, + "learning_rate": 0.0002980368273459772, + "loss": 7.3304, + "step": 5212 + }, + { + "epoch": 0.48642343939535315, + "grad_norm": 8.350469194527069, + "learning_rate": 0.0002980356074322861, + "loss": 7.4678, + "step": 5213 + }, + { + "epoch": 0.48651674909023046, + "grad_norm": 229.59233772577983, + "learning_rate": 0.0002980343871421843, + "loss": 7.494, + "step": 5214 + }, + { + "epoch": 0.48661005878510777, + "grad_norm": 34.485180719488255, + "learning_rate": 0.00029803316647567484, + "loss": 7.3527, + "step": 5215 + }, + { + "epoch": 0.4867033684799851, + "grad_norm": 20.265318452026776, + "learning_rate": 0.0002980319454327609, + "loss": 7.0035, + "step": 5216 + }, + { + "epoch": 0.4867966781748624, + "grad_norm": 8.419940253534804, + "learning_rate": 0.00029803072401344557, + "loss": 7.3016, + "step": 5217 + }, + { + "epoch": 0.4868899878697397, + "grad_norm": 37.607513811894954, + "learning_rate": 0.00029802950221773203, + "loss": 7.423, + "step": 5218 + }, + { + "epoch": 0.48698329756461695, + "grad_norm": 12.881568650644688, + "learning_rate": 0.0002980282800456233, + "loss": 7.1344, + "step": 5219 + }, + { + "epoch": 0.48707660725949425, + "grad_norm": 4.839548069472758, + "learning_rate": 0.00029802705749712243, + "loss": 7.0523, + "step": 5220 + }, + { + "epoch": 0.48716991695437156, + "grad_norm": 9.000149691110042, + "learning_rate": 0.00029802583457223266, + "loss": 7.325, + "step": 5221 + }, + { + "epoch": 0.48726322664924887, + "grad_norm": 373285.3692298086, + "learning_rate": 0.00029802461127095706, + "loss": 7.0516, + "step": 5222 + }, + { + "epoch": 0.4873565363441262, + "grad_norm": 5851.750114153603, + "learning_rate": 0.00029802338759329867, + "loss": 7.3531, + "step": 5223 + }, + { + "epoch": 0.48744984603900343, + "grad_norm": 221417.95796035902, + "learning_rate": 0.0002980221635392607, + "loss": 7.354, + "step": 5224 + }, + { + "epoch": 0.48754315573388074, + "grad_norm": 12.401547769942917, + "learning_rate": 0.00029802093910884616, + "loss": 7.2705, + "step": 5225 + }, + { + "epoch": 0.48763646542875805, + "grad_norm": 25.46022770491459, + "learning_rate": 0.00029801971430205826, + "loss": 7.2052, + "step": 5226 + }, + { + "epoch": 0.48772977512363536, + "grad_norm": 14246.232232341563, + "learning_rate": 0.00029801848911890005, + "loss": 7.4883, + "step": 5227 + }, + { + "epoch": 0.48782308481851266, + "grad_norm": 3.8450509764431464, + "learning_rate": 0.0002980172635593747, + "loss": 7.3109, + "step": 5228 + }, + { + "epoch": 0.4879163945133899, + "grad_norm": 59.41656013557999, + "learning_rate": 0.00029801603762348523, + "loss": 7.2614, + "step": 5229 + }, + { + "epoch": 0.4880097042082672, + "grad_norm": 6.327929672385446, + "learning_rate": 0.00029801481131123487, + "loss": 7.3138, + "step": 5230 + }, + { + "epoch": 0.48810301390314453, + "grad_norm": 3.3001252683128888, + "learning_rate": 0.0002980135846226267, + "loss": 7.241, + "step": 5231 + }, + { + "epoch": 0.48819632359802184, + "grad_norm": 8.38559221704068, + "learning_rate": 0.0002980123575576638, + "loss": 7.4076, + "step": 5232 + }, + { + "epoch": 0.48828963329289915, + "grad_norm": 22.703605816152013, + "learning_rate": 0.0002980111301163494, + "loss": 7.6852, + "step": 5233 + }, + { + "epoch": 0.4883829429877764, + "grad_norm": 5.130654077543891, + "learning_rate": 0.00029800990229868646, + "loss": 7.4635, + "step": 5234 + }, + { + "epoch": 0.4884762526826537, + "grad_norm": 23.304631542080436, + "learning_rate": 0.0002980086741046782, + "loss": 6.9178, + "step": 5235 + }, + { + "epoch": 0.488569562377531, + "grad_norm": 116942.16289156675, + "learning_rate": 0.00029800744553432773, + "loss": 7.2151, + "step": 5236 + }, + { + "epoch": 0.4886628720724083, + "grad_norm": 10846.162784049795, + "learning_rate": 0.00029800621658763814, + "loss": 7.0826, + "step": 5237 + }, + { + "epoch": 0.48875618176728564, + "grad_norm": 3.572843991048622, + "learning_rate": 0.0002980049872646126, + "loss": 6.963, + "step": 5238 + }, + { + "epoch": 0.48884949146216294, + "grad_norm": 4.852358054100147, + "learning_rate": 0.00029800375756525426, + "loss": 6.7372, + "step": 5239 + }, + { + "epoch": 0.4889428011570402, + "grad_norm": 4.254578622723792, + "learning_rate": 0.00029800252748956614, + "loss": 7.1341, + "step": 5240 + }, + { + "epoch": 0.4890361108519175, + "grad_norm": 3.760007159505304, + "learning_rate": 0.0002980012970375515, + "loss": 7.0778, + "step": 5241 + }, + { + "epoch": 0.4891294205467948, + "grad_norm": 8.363559286560983, + "learning_rate": 0.00029800006620921333, + "loss": 7.0429, + "step": 5242 + }, + { + "epoch": 0.4892227302416721, + "grad_norm": 19.16404465281064, + "learning_rate": 0.00029799883500455487, + "loss": 7.3107, + "step": 5243 + }, + { + "epoch": 0.48931603993654943, + "grad_norm": 37.41919911012492, + "learning_rate": 0.0002979976034235792, + "loss": 7.4663, + "step": 5244 + }, + { + "epoch": 0.4894093496314267, + "grad_norm": 3.152930457105719, + "learning_rate": 0.0002979963714662895, + "loss": 7.3047, + "step": 5245 + }, + { + "epoch": 0.489502659326304, + "grad_norm": 7.902698313947849, + "learning_rate": 0.0002979951391326889, + "loss": 7.313, + "step": 5246 + }, + { + "epoch": 0.4895959690211813, + "grad_norm": 144.2684331753071, + "learning_rate": 0.0002979939064227804, + "loss": 7.337, + "step": 5247 + }, + { + "epoch": 0.4896892787160586, + "grad_norm": 64.42288496711986, + "learning_rate": 0.0002979926733365673, + "loss": 6.991, + "step": 5248 + }, + { + "epoch": 0.4897825884109359, + "grad_norm": 45.96227867887696, + "learning_rate": 0.00029799143987405266, + "loss": 7.093, + "step": 5249 + }, + { + "epoch": 0.48987589810581317, + "grad_norm": 3.653379313212867, + "learning_rate": 0.0002979902060352396, + "loss": 7.0554, + "step": 5250 + }, + { + "epoch": 0.4899692078006905, + "grad_norm": 13.30694851808211, + "learning_rate": 0.0002979889718201313, + "loss": 7.53, + "step": 5251 + }, + { + "epoch": 0.4900625174955678, + "grad_norm": 17.76739192046269, + "learning_rate": 0.0002979877372287309, + "loss": 7.1696, + "step": 5252 + }, + { + "epoch": 0.4901558271904451, + "grad_norm": 174.6332076368918, + "learning_rate": 0.00029798650226104154, + "loss": 8.0457, + "step": 5253 + }, + { + "epoch": 0.4902491368853224, + "grad_norm": 14.797548777440495, + "learning_rate": 0.00029798526691706626, + "loss": 6.8555, + "step": 5254 + }, + { + "epoch": 0.4903424465801997, + "grad_norm": 9.251143242558724, + "learning_rate": 0.00029798403119680837, + "loss": 7.4358, + "step": 5255 + }, + { + "epoch": 0.49043575627507696, + "grad_norm": 8.495706155548756, + "learning_rate": 0.00029798279510027085, + "loss": 7.136, + "step": 5256 + }, + { + "epoch": 0.49052906596995427, + "grad_norm": 121.62851721978147, + "learning_rate": 0.00029798155862745696, + "loss": 7.5939, + "step": 5257 + }, + { + "epoch": 0.4906223756648316, + "grad_norm": 17.88808673158068, + "learning_rate": 0.00029798032177836984, + "loss": 7.6569, + "step": 5258 + }, + { + "epoch": 0.4907156853597089, + "grad_norm": 10.265381076338576, + "learning_rate": 0.0002979790845530125, + "loss": 7.1765, + "step": 5259 + }, + { + "epoch": 0.4908089950545862, + "grad_norm": 804701.6722798308, + "learning_rate": 0.00029797784695138825, + "loss": 7.0299, + "step": 5260 + }, + { + "epoch": 0.49090230474946345, + "grad_norm": 634.7780262999597, + "learning_rate": 0.00029797660897350013, + "loss": 7.0191, + "step": 5261 + }, + { + "epoch": 0.49099561444434076, + "grad_norm": 12.493121440765334, + "learning_rate": 0.00029797537061935136, + "loss": 7.4276, + "step": 5262 + }, + { + "epoch": 0.49108892413921806, + "grad_norm": 6.732018865922802, + "learning_rate": 0.0002979741318889451, + "loss": 6.9713, + "step": 5263 + }, + { + "epoch": 0.49118223383409537, + "grad_norm": 23733.478861171658, + "learning_rate": 0.0002979728927822844, + "loss": 6.8269, + "step": 5264 + }, + { + "epoch": 0.4912755435289727, + "grad_norm": 6.812834868307656, + "learning_rate": 0.00029797165329937246, + "loss": 7.1952, + "step": 5265 + }, + { + "epoch": 0.49136885322384993, + "grad_norm": 6.2713956047369255, + "learning_rate": 0.00029797041344021245, + "loss": 7.2218, + "step": 5266 + }, + { + "epoch": 0.49146216291872724, + "grad_norm": 10.807869943875541, + "learning_rate": 0.00029796917320480746, + "loss": 7.1426, + "step": 5267 + }, + { + "epoch": 0.49155547261360455, + "grad_norm": 78.47656687724437, + "learning_rate": 0.0002979679325931608, + "loss": 6.8385, + "step": 5268 + }, + { + "epoch": 0.49164878230848186, + "grad_norm": 12.394389530593225, + "learning_rate": 0.0002979666916052754, + "loss": 6.9158, + "step": 5269 + }, + { + "epoch": 0.49174209200335917, + "grad_norm": 1895.7905100013204, + "learning_rate": 0.0002979654502411546, + "loss": 7.0496, + "step": 5270 + }, + { + "epoch": 0.4918354016982365, + "grad_norm": 15.765456207894664, + "learning_rate": 0.00029796420850080154, + "loss": 7.1344, + "step": 5271 + }, + { + "epoch": 0.4919287113931137, + "grad_norm": 15.269630818595914, + "learning_rate": 0.00029796296638421926, + "loss": 7.2579, + "step": 5272 + }, + { + "epoch": 0.49202202108799103, + "grad_norm": 29.213436395589166, + "learning_rate": 0.00029796172389141097, + "loss": 7.2603, + "step": 5273 + }, + { + "epoch": 0.49211533078286834, + "grad_norm": 2.7734482039100956, + "learning_rate": 0.0002979604810223799, + "loss": 7.3083, + "step": 5274 + }, + { + "epoch": 0.49220864047774565, + "grad_norm": 78.97265986102194, + "learning_rate": 0.00029795923777712913, + "loss": 7.0043, + "step": 5275 + }, + { + "epoch": 0.49230195017262296, + "grad_norm": 631551.1493955078, + "learning_rate": 0.0002979579941556618, + "loss": 7.0665, + "step": 5276 + }, + { + "epoch": 0.4923952598675002, + "grad_norm": 22.35101336493707, + "learning_rate": 0.00029795675015798124, + "loss": 7.225, + "step": 5277 + }, + { + "epoch": 0.4924885695623775, + "grad_norm": 27.63072472961117, + "learning_rate": 0.0002979555057840904, + "loss": 7.0716, + "step": 5278 + }, + { + "epoch": 0.49258187925725483, + "grad_norm": 27.502903216166672, + "learning_rate": 0.00029795426103399256, + "loss": 7.1962, + "step": 5279 + }, + { + "epoch": 0.49267518895213214, + "grad_norm": 3.8453268892499786, + "learning_rate": 0.00029795301590769085, + "loss": 6.9138, + "step": 5280 + }, + { + "epoch": 0.49276849864700945, + "grad_norm": 14.97131653129927, + "learning_rate": 0.00029795177040518845, + "loss": 7.1264, + "step": 5281 + }, + { + "epoch": 0.4928618083418867, + "grad_norm": 31.38928606196414, + "learning_rate": 0.00029795052452648855, + "loss": 7.036, + "step": 5282 + }, + { + "epoch": 0.492955118036764, + "grad_norm": 7.331172618084224, + "learning_rate": 0.0002979492782715943, + "loss": 7.1345, + "step": 5283 + }, + { + "epoch": 0.4930484277316413, + "grad_norm": 9.587281761016996, + "learning_rate": 0.0002979480316405088, + "loss": 6.9416, + "step": 5284 + }, + { + "epoch": 0.4931417374265186, + "grad_norm": 6.059558397690048, + "learning_rate": 0.00029794678463323533, + "loss": 7.1017, + "step": 5285 + }, + { + "epoch": 0.49323504712139593, + "grad_norm": 1499.4285241760115, + "learning_rate": 0.000297945537249777, + "loss": 7.0332, + "step": 5286 + }, + { + "epoch": 0.4933283568162732, + "grad_norm": 6.158458122022152, + "learning_rate": 0.000297944289490137, + "loss": 7.4691, + "step": 5287 + }, + { + "epoch": 0.4934216665111505, + "grad_norm": 5.276511805612033, + "learning_rate": 0.00029794304135431846, + "loss": 7.2711, + "step": 5288 + }, + { + "epoch": 0.4935149762060278, + "grad_norm": 1505436.6508797102, + "learning_rate": 0.00029794179284232464, + "loss": 6.9512, + "step": 5289 + }, + { + "epoch": 0.4936082859009051, + "grad_norm": 4.582948127246696, + "learning_rate": 0.00029794054395415864, + "loss": 7.1077, + "step": 5290 + }, + { + "epoch": 0.4937015955957824, + "grad_norm": 3.1663090878842506, + "learning_rate": 0.00029793929468982364, + "loss": 7.0738, + "step": 5291 + }, + { + "epoch": 0.4937949052906597, + "grad_norm": 282.2251441138056, + "learning_rate": 0.00029793804504932287, + "loss": 7.2971, + "step": 5292 + }, + { + "epoch": 0.493888214985537, + "grad_norm": 19.962440650322943, + "learning_rate": 0.0002979367950326595, + "loss": 6.9022, + "step": 5293 + }, + { + "epoch": 0.4939815246804143, + "grad_norm": 4.4131288881168045, + "learning_rate": 0.0002979355446398366, + "loss": 7.0641, + "step": 5294 + }, + { + "epoch": 0.4940748343752916, + "grad_norm": 3.4662825145701452, + "learning_rate": 0.00029793429387085747, + "loss": 6.9236, + "step": 5295 + }, + { + "epoch": 0.4941681440701689, + "grad_norm": 11.151746954000044, + "learning_rate": 0.00029793304272572526, + "loss": 7.0603, + "step": 5296 + }, + { + "epoch": 0.4942614537650462, + "grad_norm": 2.9028170523947323, + "learning_rate": 0.0002979317912044432, + "loss": 6.8392, + "step": 5297 + }, + { + "epoch": 0.49435476345992346, + "grad_norm": 3.113208522167318, + "learning_rate": 0.0002979305393070143, + "loss": 7.0844, + "step": 5298 + }, + { + "epoch": 0.49444807315480077, + "grad_norm": 12434.805266181173, + "learning_rate": 0.00029792928703344194, + "loss": 7.2071, + "step": 5299 + }, + { + "epoch": 0.4945413828496781, + "grad_norm": 8.34987911347481, + "learning_rate": 0.0002979280343837292, + "loss": 7.1725, + "step": 5300 + }, + { + "epoch": 0.4946346925445554, + "grad_norm": 2.8290165827745817, + "learning_rate": 0.0002979267813578793, + "loss": 7.0554, + "step": 5301 + }, + { + "epoch": 0.4947280022394327, + "grad_norm": 20.428958176113717, + "learning_rate": 0.00029792552795589536, + "loss": 6.9749, + "step": 5302 + }, + { + "epoch": 0.49482131193430995, + "grad_norm": 4.122837283826665, + "learning_rate": 0.00029792427417778066, + "loss": 7.1299, + "step": 5303 + }, + { + "epoch": 0.49491462162918726, + "grad_norm": 5.711331442736774, + "learning_rate": 0.0002979230200235384, + "loss": 6.9911, + "step": 5304 + }, + { + "epoch": 0.49500793132406457, + "grad_norm": 15.029655402640998, + "learning_rate": 0.00029792176549317165, + "loss": 6.7108, + "step": 5305 + }, + { + "epoch": 0.4951012410189419, + "grad_norm": 505865.10968427756, + "learning_rate": 0.0002979205105866837, + "loss": 6.7046, + "step": 5306 + }, + { + "epoch": 0.4951945507138192, + "grad_norm": 771339.5485496407, + "learning_rate": 0.0002979192553040777, + "loss": 6.9919, + "step": 5307 + }, + { + "epoch": 0.4952878604086965, + "grad_norm": 12.961330173174924, + "learning_rate": 0.0002979179996453568, + "loss": 6.9461, + "step": 5308 + }, + { + "epoch": 0.49538117010357374, + "grad_norm": 2.7432849480606727, + "learning_rate": 0.0002979167436105243, + "loss": 6.8619, + "step": 5309 + }, + { + "epoch": 0.49547447979845105, + "grad_norm": 4.145996872804888, + "learning_rate": 0.0002979154871995833, + "loss": 7.0648, + "step": 5310 + }, + { + "epoch": 0.49556778949332836, + "grad_norm": 3.110418578792898, + "learning_rate": 0.00029791423041253707, + "loss": 6.6528, + "step": 5311 + }, + { + "epoch": 0.49566109918820567, + "grad_norm": 1660.2052944162872, + "learning_rate": 0.00029791297324938874, + "loss": 6.892, + "step": 5312 + }, + { + "epoch": 0.495754408883083, + "grad_norm": 9.062167448129324, + "learning_rate": 0.0002979117157101416, + "loss": 7.0719, + "step": 5313 + }, + { + "epoch": 0.49584771857796023, + "grad_norm": 652.3850381346962, + "learning_rate": 0.00029791045779479873, + "loss": 7.3308, + "step": 5314 + }, + { + "epoch": 0.49594102827283754, + "grad_norm": 7.156042244934325, + "learning_rate": 0.0002979091995033634, + "loss": 7.0804, + "step": 5315 + }, + { + "epoch": 0.49603433796771484, + "grad_norm": 117.24302275116999, + "learning_rate": 0.00029790794083583874, + "loss": 7.1244, + "step": 5316 + }, + { + "epoch": 0.49612764766259215, + "grad_norm": 20.47532146908912, + "learning_rate": 0.00029790668179222805, + "loss": 7.1807, + "step": 5317 + }, + { + "epoch": 0.49622095735746946, + "grad_norm": 2.955633946458351, + "learning_rate": 0.00029790542237253446, + "loss": 6.9573, + "step": 5318 + }, + { + "epoch": 0.4963142670523467, + "grad_norm": 3.3276166195455175, + "learning_rate": 0.0002979041625767612, + "loss": 7.4301, + "step": 5319 + }, + { + "epoch": 0.496407576747224, + "grad_norm": 7.014423568208355, + "learning_rate": 0.00029790290240491144, + "loss": 6.9683, + "step": 5320 + }, + { + "epoch": 0.49650088644210133, + "grad_norm": 22.828275775739126, + "learning_rate": 0.00029790164185698845, + "loss": 7.5137, + "step": 5321 + }, + { + "epoch": 0.49659419613697864, + "grad_norm": 19.537938365876425, + "learning_rate": 0.0002979003809329954, + "loss": 7.095, + "step": 5322 + }, + { + "epoch": 0.49668750583185595, + "grad_norm": 5.601194948344606, + "learning_rate": 0.00029789911963293546, + "loss": 7.181, + "step": 5323 + }, + { + "epoch": 0.49678081552673325, + "grad_norm": 10.063526434293486, + "learning_rate": 0.00029789785795681186, + "loss": 7.0502, + "step": 5324 + }, + { + "epoch": 0.4968741252216105, + "grad_norm": 3.9319213491591727, + "learning_rate": 0.0002978965959046279, + "loss": 7.266, + "step": 5325 + }, + { + "epoch": 0.4969674349164878, + "grad_norm": 1.9797536665455695, + "learning_rate": 0.0002978953334763866, + "loss": 7.1595, + "step": 5326 + }, + { + "epoch": 0.4970607446113651, + "grad_norm": 14.4351604463745, + "learning_rate": 0.0002978940706720913, + "loss": 7.45, + "step": 5327 + }, + { + "epoch": 0.49715405430624243, + "grad_norm": 24.60288657899314, + "learning_rate": 0.00029789280749174525, + "loss": 7.2173, + "step": 5328 + }, + { + "epoch": 0.49724736400111974, + "grad_norm": 37.32376721385024, + "learning_rate": 0.00029789154393535156, + "loss": 7.2212, + "step": 5329 + }, + { + "epoch": 0.497340673695997, + "grad_norm": 3.9693730175890685, + "learning_rate": 0.00029789028000291347, + "loss": 7.1961, + "step": 5330 + }, + { + "epoch": 0.4974339833908743, + "grad_norm": 2.4479455012681584, + "learning_rate": 0.00029788901569443427, + "loss": 7.042, + "step": 5331 + }, + { + "epoch": 0.4975272930857516, + "grad_norm": 3.521975897265611, + "learning_rate": 0.00029788775100991706, + "loss": 7.0496, + "step": 5332 + }, + { + "epoch": 0.4976206027806289, + "grad_norm": 398525.31530688365, + "learning_rate": 0.0002978864859493651, + "loss": 7.1357, + "step": 5333 + }, + { + "epoch": 0.4977139124755062, + "grad_norm": 11.387766308397628, + "learning_rate": 0.0002978852205127816, + "loss": 7.0565, + "step": 5334 + }, + { + "epoch": 0.4978072221703835, + "grad_norm": 25.891514858572368, + "learning_rate": 0.0002978839547001698, + "loss": 7.197, + "step": 5335 + }, + { + "epoch": 0.4979005318652608, + "grad_norm": 2.822428448433664, + "learning_rate": 0.00029788268851153296, + "loss": 7.0537, + "step": 5336 + }, + { + "epoch": 0.4979938415601381, + "grad_norm": 3.381426198533066, + "learning_rate": 0.0002978814219468742, + "loss": 7.0724, + "step": 5337 + }, + { + "epoch": 0.4980871512550154, + "grad_norm": 9.296155761149333, + "learning_rate": 0.00029788015500619683, + "loss": 7.1772, + "step": 5338 + }, + { + "epoch": 0.4981804609498927, + "grad_norm": 27922802.725411076, + "learning_rate": 0.00029787888768950394, + "loss": 7.1063, + "step": 5339 + }, + { + "epoch": 0.49827377064476996, + "grad_norm": 3.3725999387481367, + "learning_rate": 0.00029787761999679896, + "loss": 7.1794, + "step": 5340 + }, + { + "epoch": 0.4983670803396473, + "grad_norm": 24.26278400392318, + "learning_rate": 0.0002978763519280849, + "loss": 7.1371, + "step": 5341 + }, + { + "epoch": 0.4984603900345246, + "grad_norm": 67377.42821078705, + "learning_rate": 0.00029787508348336515, + "loss": 6.8424, + "step": 5342 + }, + { + "epoch": 0.4985536997294019, + "grad_norm": 5.269643306650461, + "learning_rate": 0.00029787381466264283, + "loss": 7.4443, + "step": 5343 + }, + { + "epoch": 0.4986470094242792, + "grad_norm": 2.453492803504854, + "learning_rate": 0.00029787254546592126, + "loss": 7.1325, + "step": 5344 + }, + { + "epoch": 0.4987403191191565, + "grad_norm": 2.499687366871481, + "learning_rate": 0.0002978712758932036, + "loss": 7.0739, + "step": 5345 + }, + { + "epoch": 0.49883362881403376, + "grad_norm": 52414460.348885536, + "learning_rate": 0.000297870005944493, + "loss": 7.0342, + "step": 5346 + }, + { + "epoch": 0.49892693850891107, + "grad_norm": 1.771829259712686, + "learning_rate": 0.0002978687356197928, + "loss": 7.0306, + "step": 5347 + }, + { + "epoch": 0.4990202482037884, + "grad_norm": 1.6457331143162204, + "learning_rate": 0.00029786746491910625, + "loss": 7.2766, + "step": 5348 + }, + { + "epoch": 0.4991135578986657, + "grad_norm": 3.7835248187413084, + "learning_rate": 0.00029786619384243655, + "loss": 6.6664, + "step": 5349 + }, + { + "epoch": 0.499206867593543, + "grad_norm": 3.1086034912010456, + "learning_rate": 0.0002978649223897869, + "loss": 7.1859, + "step": 5350 + }, + { + "epoch": 0.49930017728842024, + "grad_norm": 1927893139.6176667, + "learning_rate": 0.00029786365056116054, + "loss": 7.3141, + "step": 5351 + }, + { + "epoch": 0.49939348698329755, + "grad_norm": 5.071219321111608, + "learning_rate": 0.0002978623783565607, + "loss": 7.3505, + "step": 5352 + }, + { + "epoch": 0.49948679667817486, + "grad_norm": 142.8507298987062, + "learning_rate": 0.0002978611057759907, + "loss": 6.9731, + "step": 5353 + }, + { + "epoch": 0.49958010637305217, + "grad_norm": 3.1552684568858718, + "learning_rate": 0.0002978598328194537, + "loss": 7.158, + "step": 5354 + }, + { + "epoch": 0.4996734160679295, + "grad_norm": 13.747978062200588, + "learning_rate": 0.0002978585594869529, + "loss": 7.0494, + "step": 5355 + }, + { + "epoch": 0.49976672576280673, + "grad_norm": 2.6316402088639785, + "learning_rate": 0.0002978572857784916, + "loss": 7.212, + "step": 5356 + }, + { + "epoch": 0.49986003545768404, + "grad_norm": 12774050.166685322, + "learning_rate": 0.00029785601169407294, + "loss": 6.9922, + "step": 5357 + }, + { + "epoch": 0.49995334515256135, + "grad_norm": 7.730366359531491, + "learning_rate": 0.0002978547372337003, + "loss": 6.8583, + "step": 5358 + }, + { + "epoch": 0.5000466548474386, + "grad_norm": 3074791855.9575033, + "learning_rate": 0.0002978534623973769, + "loss": 7.2934, + "step": 5359 + }, + { + "epoch": 0.5001399645423159, + "grad_norm": 8.49705498352404, + "learning_rate": 0.00029785218718510594, + "loss": 7.3401, + "step": 5360 + }, + { + "epoch": 0.5002332742371932, + "grad_norm": 3.0475597093433695, + "learning_rate": 0.00029785091159689064, + "loss": 7.4112, + "step": 5361 + }, + { + "epoch": 0.5003265839320705, + "grad_norm": 2.2095679425757075, + "learning_rate": 0.00029784963563273426, + "loss": 7.0259, + "step": 5362 + }, + { + "epoch": 0.5004198936269478, + "grad_norm": 3.215945743123154, + "learning_rate": 0.00029784835929264004, + "loss": 7.2877, + "step": 5363 + }, + { + "epoch": 0.5005132033218251, + "grad_norm": 7.125009518032129, + "learning_rate": 0.00029784708257661125, + "loss": 7.6271, + "step": 5364 + }, + { + "epoch": 0.5006065130167024, + "grad_norm": 1.6565057571595294, + "learning_rate": 0.0002978458054846512, + "loss": 7.0055, + "step": 5365 + }, + { + "epoch": 0.5006998227115798, + "grad_norm": 2.6975676260889117, + "learning_rate": 0.000297844528016763, + "loss": 7.0848, + "step": 5366 + }, + { + "epoch": 0.5007931324064571, + "grad_norm": 3.8766220430357676, + "learning_rate": 0.0002978432501729499, + "loss": 6.847, + "step": 5367 + }, + { + "epoch": 0.5008864421013344, + "grad_norm": 1.7851658628096403, + "learning_rate": 0.0002978419719532153, + "loss": 7.0956, + "step": 5368 + }, + { + "epoch": 0.5009797517962117, + "grad_norm": 4.491385751780679, + "learning_rate": 0.0002978406933575623, + "loss": 6.9094, + "step": 5369 + }, + { + "epoch": 0.5010730614910889, + "grad_norm": 31332185.90823383, + "learning_rate": 0.00029783941438599426, + "loss": 7.2413, + "step": 5370 + }, + { + "epoch": 0.5011663711859662, + "grad_norm": 2.471703192034325, + "learning_rate": 0.00029783813503851436, + "loss": 6.9247, + "step": 5371 + }, + { + "epoch": 0.5012596808808435, + "grad_norm": 223.65321306159194, + "learning_rate": 0.0002978368553151259, + "loss": 7.5448, + "step": 5372 + }, + { + "epoch": 0.5013529905757208, + "grad_norm": 2.0473360578941264, + "learning_rate": 0.0002978355752158321, + "loss": 7.015, + "step": 5373 + }, + { + "epoch": 0.5014463002705981, + "grad_norm": 5.943703273266295, + "learning_rate": 0.0002978342947406362, + "loss": 7.6743, + "step": 5374 + }, + { + "epoch": 0.5015396099654754, + "grad_norm": 5.267951857226929, + "learning_rate": 0.0002978330138895415, + "loss": 6.9638, + "step": 5375 + }, + { + "epoch": 0.5016329196603527, + "grad_norm": 86.34302323627223, + "learning_rate": 0.00029783173266255115, + "loss": 7.0472, + "step": 5376 + }, + { + "epoch": 0.50172622935523, + "grad_norm": 52.60299844653936, + "learning_rate": 0.00029783045105966863, + "loss": 7.3693, + "step": 5377 + }, + { + "epoch": 0.5018195390501073, + "grad_norm": 1407850215.1896317, + "learning_rate": 0.000297829169080897, + "loss": 7.6567, + "step": 5378 + }, + { + "epoch": 0.5019128487449847, + "grad_norm": 154.04609189270568, + "learning_rate": 0.00029782788672623955, + "loss": 7.3852, + "step": 5379 + }, + { + "epoch": 0.5020061584398618, + "grad_norm": 52.379235686554075, + "learning_rate": 0.00029782660399569965, + "loss": 7.576, + "step": 5380 + }, + { + "epoch": 0.5020994681347392, + "grad_norm": 33.5359346231621, + "learning_rate": 0.00029782532088928047, + "loss": 7.1065, + "step": 5381 + }, + { + "epoch": 0.5021927778296165, + "grad_norm": 3.599050594422902, + "learning_rate": 0.0002978240374069852, + "loss": 7.1981, + "step": 5382 + }, + { + "epoch": 0.5022860875244938, + "grad_norm": 11.143093865912652, + "learning_rate": 0.0002978227535488173, + "loss": 7.1319, + "step": 5383 + }, + { + "epoch": 0.5023793972193711, + "grad_norm": 30.560387662410225, + "learning_rate": 0.00029782146931477984, + "loss": 7.3281, + "step": 5384 + }, + { + "epoch": 0.5024727069142484, + "grad_norm": 29.18228332039682, + "learning_rate": 0.00029782018470487625, + "loss": 7.4497, + "step": 5385 + }, + { + "epoch": 0.5025660166091257, + "grad_norm": 23.03963956825272, + "learning_rate": 0.0002978188997191097, + "loss": 7.4598, + "step": 5386 + }, + { + "epoch": 0.502659326304003, + "grad_norm": 22.20043568126034, + "learning_rate": 0.0002978176143574834, + "loss": 7.3276, + "step": 5387 + }, + { + "epoch": 0.5027526359988803, + "grad_norm": 23.0394751705211, + "learning_rate": 0.0002978163286200008, + "loss": 7.3857, + "step": 5388 + }, + { + "epoch": 0.5028459456937576, + "grad_norm": 8.551948189261255, + "learning_rate": 0.000297815042506665, + "loss": 7.3151, + "step": 5389 + }, + { + "epoch": 0.5029392553886349, + "grad_norm": 5.98759604565041, + "learning_rate": 0.0002978137560174794, + "loss": 7.5142, + "step": 5390 + }, + { + "epoch": 0.5030325650835121, + "grad_norm": 3.2808844776500217, + "learning_rate": 0.00029781246915244714, + "loss": 7.1821, + "step": 5391 + }, + { + "epoch": 0.5031258747783894, + "grad_norm": 7.614816942222911, + "learning_rate": 0.00029781118191157157, + "loss": 7.2667, + "step": 5392 + }, + { + "epoch": 0.5032191844732667, + "grad_norm": 2.554022151687408, + "learning_rate": 0.0002978098942948559, + "loss": 7.4076, + "step": 5393 + }, + { + "epoch": 0.503312494168144, + "grad_norm": 4.073098900875231, + "learning_rate": 0.0002978086063023035, + "loss": 6.9082, + "step": 5394 + }, + { + "epoch": 0.5034058038630214, + "grad_norm": 19.58757165720929, + "learning_rate": 0.0002978073179339176, + "loss": 7.242, + "step": 5395 + }, + { + "epoch": 0.5034991135578987, + "grad_norm": 41.738785449158854, + "learning_rate": 0.0002978060291897015, + "loss": 7.1746, + "step": 5396 + }, + { + "epoch": 0.503592423252776, + "grad_norm": 16.09521242163596, + "learning_rate": 0.0002978047400696584, + "loss": 7.5622, + "step": 5397 + }, + { + "epoch": 0.5036857329476533, + "grad_norm": 4.190556164836859, + "learning_rate": 0.0002978034505737916, + "loss": 7.5177, + "step": 5398 + }, + { + "epoch": 0.5037790426425306, + "grad_norm": 7.454521014313919, + "learning_rate": 0.0002978021607021045, + "loss": 7.5164, + "step": 5399 + }, + { + "epoch": 0.5038723523374079, + "grad_norm": 17925314188.68197, + "learning_rate": 0.0002978008704546003, + "loss": 7.291, + "step": 5400 + }, + { + "epoch": 0.5039656620322851, + "grad_norm": 5.413012565428496, + "learning_rate": 0.0002977995798312822, + "loss": 6.8263, + "step": 5401 + }, + { + "epoch": 0.5040589717271624, + "grad_norm": 3.2455126476412985, + "learning_rate": 0.00029779828883215356, + "loss": 6.7861, + "step": 5402 + }, + { + "epoch": 0.5041522814220397, + "grad_norm": 3.3047688387239047, + "learning_rate": 0.0002977969974572177, + "loss": 7.3785, + "step": 5403 + }, + { + "epoch": 0.504245591116917, + "grad_norm": 26823421493.11387, + "learning_rate": 0.0002977957057064778, + "loss": 6.87, + "step": 5404 + }, + { + "epoch": 0.5043389008117943, + "grad_norm": 2.8980019381575444, + "learning_rate": 0.0002977944135799372, + "loss": 7.6523, + "step": 5405 + }, + { + "epoch": 0.5044322105066716, + "grad_norm": 1.990710688420936, + "learning_rate": 0.0002977931210775992, + "loss": 7.1087, + "step": 5406 + }, + { + "epoch": 0.504525520201549, + "grad_norm": 2.5362605680166297, + "learning_rate": 0.00029779182819946706, + "loss": 7.3803, + "step": 5407 + }, + { + "epoch": 0.5046188298964263, + "grad_norm": 1.7956239028319185, + "learning_rate": 0.00029779053494554406, + "loss": 7.3131, + "step": 5408 + }, + { + "epoch": 0.5047121395913036, + "grad_norm": 2.0865960306084177, + "learning_rate": 0.0002977892413158336, + "loss": 7.6306, + "step": 5409 + }, + { + "epoch": 0.5048054492861809, + "grad_norm": 2.4097478324748587, + "learning_rate": 0.00029778794731033883, + "loss": 7.5871, + "step": 5410 + }, + { + "epoch": 0.5048987589810582, + "grad_norm": 120.76874056576519, + "learning_rate": 0.00029778665292906305, + "loss": 7.8002, + "step": 5411 + }, + { + "epoch": 0.5049920686759354, + "grad_norm": 1.8069744095198212, + "learning_rate": 0.00029778535817200964, + "loss": 7.4841, + "step": 5412 + }, + { + "epoch": 0.5050853783708127, + "grad_norm": 1.485334747899244, + "learning_rate": 0.0002977840630391818, + "loss": 7.6076, + "step": 5413 + }, + { + "epoch": 0.50517868806569, + "grad_norm": 1.4821713342682379, + "learning_rate": 0.0002977827675305829, + "loss": 7.3849, + "step": 5414 + }, + { + "epoch": 0.5052719977605673, + "grad_norm": 2.092990587572788, + "learning_rate": 0.00029778147164621617, + "loss": 7.5602, + "step": 5415 + }, + { + "epoch": 0.5053653074554446, + "grad_norm": 2.7047974108167043, + "learning_rate": 0.0002977801753860849, + "loss": 7.4229, + "step": 5416 + }, + { + "epoch": 0.5054586171503219, + "grad_norm": 1.2890249680759105, + "learning_rate": 0.0002977788787501925, + "loss": 7.4519, + "step": 5417 + }, + { + "epoch": 0.5055519268451992, + "grad_norm": 2.094246947701609, + "learning_rate": 0.00029777758173854215, + "loss": 7.8937, + "step": 5418 + }, + { + "epoch": 0.5056452365400765, + "grad_norm": 2.37961308950055, + "learning_rate": 0.00029777628435113723, + "loss": 7.7948, + "step": 5419 + }, + { + "epoch": 0.5057385462349538, + "grad_norm": 6.289079824996155, + "learning_rate": 0.0002977749865879809, + "loss": 7.4755, + "step": 5420 + }, + { + "epoch": 0.5058318559298312, + "grad_norm": 3.4590074794086614, + "learning_rate": 0.00029777368844907665, + "loss": 7.5249, + "step": 5421 + }, + { + "epoch": 0.5059251656247085, + "grad_norm": 2.179134749115988, + "learning_rate": 0.0002977723899344276, + "loss": 7.4367, + "step": 5422 + }, + { + "epoch": 0.5060184753195857, + "grad_norm": 1.3473933597197734, + "learning_rate": 0.0002977710910440372, + "loss": 7.5594, + "step": 5423 + }, + { + "epoch": 0.506111785014463, + "grad_norm": 6.739151018891678, + "learning_rate": 0.0002977697917779087, + "loss": 7.2119, + "step": 5424 + }, + { + "epoch": 0.5062050947093403, + "grad_norm": 4.1399882585379935, + "learning_rate": 0.0002977684921360453, + "loss": 7.4526, + "step": 5425 + }, + { + "epoch": 0.5062984044042176, + "grad_norm": 3.697559098587702, + "learning_rate": 0.0002977671921184505, + "loss": 7.79, + "step": 5426 + }, + { + "epoch": 0.5063917140990949, + "grad_norm": 1.4397770518203896, + "learning_rate": 0.0002977658917251274, + "loss": 7.5579, + "step": 5427 + }, + { + "epoch": 0.5064850237939722, + "grad_norm": 3.039025684863913, + "learning_rate": 0.0002977645909560795, + "loss": 7.4648, + "step": 5428 + }, + { + "epoch": 0.5065783334888495, + "grad_norm": 2.262165335868829, + "learning_rate": 0.00029776328981131, + "loss": 7.7013, + "step": 5429 + }, + { + "epoch": 0.5066716431837268, + "grad_norm": 2.1329630884592183, + "learning_rate": 0.00029776198829082213, + "loss": 7.5876, + "step": 5430 + }, + { + "epoch": 0.5067649528786041, + "grad_norm": 1.9132329278175604, + "learning_rate": 0.0002977606863946194, + "loss": 7.1453, + "step": 5431 + }, + { + "epoch": 0.5068582625734814, + "grad_norm": 1.6132969441929574, + "learning_rate": 0.000297759384122705, + "loss": 7.652, + "step": 5432 + }, + { + "epoch": 0.5069515722683586, + "grad_norm": 2.5841736947210543, + "learning_rate": 0.00029775808147508223, + "loss": 7.347, + "step": 5433 + }, + { + "epoch": 0.5070448819632359, + "grad_norm": 3.3684807851604726, + "learning_rate": 0.00029775677845175443, + "loss": 7.7597, + "step": 5434 + }, + { + "epoch": 0.5071381916581132, + "grad_norm": 1.3320106503481959, + "learning_rate": 0.00029775547505272494, + "loss": 7.5872, + "step": 5435 + }, + { + "epoch": 0.5072315013529906, + "grad_norm": 1.5142561425955463, + "learning_rate": 0.000297754171277997, + "loss": 7.5394, + "step": 5436 + }, + { + "epoch": 0.5073248110478679, + "grad_norm": 2.645338137815826, + "learning_rate": 0.000297752867127574, + "loss": 7.4341, + "step": 5437 + }, + { + "epoch": 0.5074181207427452, + "grad_norm": 2.372622018775389, + "learning_rate": 0.0002977515626014592, + "loss": 7.2746, + "step": 5438 + }, + { + "epoch": 0.5075114304376225, + "grad_norm": 1.240791199598423, + "learning_rate": 0.000297750257699656, + "loss": 7.2732, + "step": 5439 + }, + { + "epoch": 0.5076047401324998, + "grad_norm": 2.7727725834480057, + "learning_rate": 0.00029774895242216763, + "loss": 7.6834, + "step": 5440 + }, + { + "epoch": 0.5076980498273771, + "grad_norm": 1.7844756092916603, + "learning_rate": 0.0002977476467689974, + "loss": 7.3084, + "step": 5441 + }, + { + "epoch": 0.5077913595222544, + "grad_norm": 2.627851753224328, + "learning_rate": 0.0002977463407401487, + "loss": 7.6416, + "step": 5442 + }, + { + "epoch": 0.5078846692171317, + "grad_norm": 1.8706731098366747, + "learning_rate": 0.0002977450343356248, + "loss": 7.5238, + "step": 5443 + }, + { + "epoch": 0.5079779789120089, + "grad_norm": 2.3156630970710443, + "learning_rate": 0.0002977437275554291, + "loss": 7.5789, + "step": 5444 + }, + { + "epoch": 0.5080712886068862, + "grad_norm": 2.8038336645884763, + "learning_rate": 0.00029774242039956486, + "loss": 7.4766, + "step": 5445 + }, + { + "epoch": 0.5081645983017635, + "grad_norm": 4.222161209560802, + "learning_rate": 0.00029774111286803536, + "loss": 7.3222, + "step": 5446 + }, + { + "epoch": 0.5082579079966408, + "grad_norm": 1.4333995366288104, + "learning_rate": 0.000297739804960844, + "loss": 7.4119, + "step": 5447 + }, + { + "epoch": 0.5083512176915181, + "grad_norm": 3.5738231027379883, + "learning_rate": 0.00029773849667799406, + "loss": 7.2776, + "step": 5448 + }, + { + "epoch": 0.5084445273863955, + "grad_norm": 5.620813043973837, + "learning_rate": 0.00029773718801948893, + "loss": 7.6173, + "step": 5449 + }, + { + "epoch": 0.5085378370812728, + "grad_norm": 1.7536694720197066, + "learning_rate": 0.0002977358789853318, + "loss": 7.6068, + "step": 5450 + }, + { + "epoch": 0.5086311467761501, + "grad_norm": 2.6819787825652845, + "learning_rate": 0.00029773456957552625, + "loss": 6.9848, + "step": 5451 + }, + { + "epoch": 0.5087244564710274, + "grad_norm": 2.370053810957274, + "learning_rate": 0.00029773325979007536, + "loss": 7.224, + "step": 5452 + }, + { + "epoch": 0.5088177661659047, + "grad_norm": 1.4974746592896122, + "learning_rate": 0.00029773194962898257, + "loss": 7.3942, + "step": 5453 + }, + { + "epoch": 0.5089110758607819, + "grad_norm": 1.7295600753605624, + "learning_rate": 0.00029773063909225114, + "loss": 7.4133, + "step": 5454 + }, + { + "epoch": 0.5090043855556592, + "grad_norm": 3.443819512676362, + "learning_rate": 0.00029772932817988447, + "loss": 7.5278, + "step": 5455 + }, + { + "epoch": 0.5090976952505365, + "grad_norm": 1.092228620926108, + "learning_rate": 0.0002977280168918859, + "loss": 7.3627, + "step": 5456 + }, + { + "epoch": 0.5091910049454138, + "grad_norm": 2.248784164045549, + "learning_rate": 0.00029772670522825877, + "loss": 7.6644, + "step": 5457 + }, + { + "epoch": 0.5092843146402911, + "grad_norm": 1.7601383223800937, + "learning_rate": 0.0002977253931890064, + "loss": 7.5006, + "step": 5458 + }, + { + "epoch": 0.5093776243351684, + "grad_norm": 1.9962407011742678, + "learning_rate": 0.00029772408077413204, + "loss": 6.9978, + "step": 5459 + }, + { + "epoch": 0.5094709340300457, + "grad_norm": 1.2794727300201667, + "learning_rate": 0.00029772276798363913, + "loss": 7.7014, + "step": 5460 + }, + { + "epoch": 0.509564243724923, + "grad_norm": 1.3206472070315975, + "learning_rate": 0.000297721454817531, + "loss": 7.5722, + "step": 5461 + }, + { + "epoch": 0.5096575534198003, + "grad_norm": 1.8947034524334605, + "learning_rate": 0.000297720141275811, + "loss": 7.2869, + "step": 5462 + }, + { + "epoch": 0.5097508631146777, + "grad_norm": 1.1641202159934878, + "learning_rate": 0.0002977188273584824, + "loss": 7.567, + "step": 5463 + }, + { + "epoch": 0.509844172809555, + "grad_norm": 6.3245326827259545, + "learning_rate": 0.00029771751306554857, + "loss": 7.5847, + "step": 5464 + }, + { + "epoch": 0.5099374825044322, + "grad_norm": 1.0612238479441636, + "learning_rate": 0.0002977161983970129, + "loss": 7.7496, + "step": 5465 + }, + { + "epoch": 0.5100307921993095, + "grad_norm": 1.1563289980454023, + "learning_rate": 0.0002977148833528786, + "loss": 7.5729, + "step": 5466 + }, + { + "epoch": 0.5101241018941868, + "grad_norm": 2.175868210328023, + "learning_rate": 0.0002977135679331492, + "loss": 7.2838, + "step": 5467 + }, + { + "epoch": 0.5102174115890641, + "grad_norm": 1.5824800779133992, + "learning_rate": 0.0002977122521378279, + "loss": 7.1491, + "step": 5468 + }, + { + "epoch": 0.5103107212839414, + "grad_norm": 4.112073487585894, + "learning_rate": 0.00029771093596691814, + "loss": 7.3447, + "step": 5469 + }, + { + "epoch": 0.5104040309788187, + "grad_norm": 1.1449623138466158, + "learning_rate": 0.0002977096194204232, + "loss": 7.3627, + "step": 5470 + }, + { + "epoch": 0.510497340673696, + "grad_norm": 2.4761880960461884, + "learning_rate": 0.00029770830249834646, + "loss": 7.6565, + "step": 5471 + }, + { + "epoch": 0.5105906503685733, + "grad_norm": 0.94808790388345, + "learning_rate": 0.0002977069852006913, + "loss": 7.4448, + "step": 5472 + }, + { + "epoch": 0.5106839600634506, + "grad_norm": 1.7642026283861307, + "learning_rate": 0.000297705667527461, + "loss": 7.5894, + "step": 5473 + }, + { + "epoch": 0.5107772697583279, + "grad_norm": 1.7901035945345756, + "learning_rate": 0.0002977043494786589, + "loss": 7.3373, + "step": 5474 + }, + { + "epoch": 0.5108705794532052, + "grad_norm": 1.6220727599539686, + "learning_rate": 0.00029770303105428845, + "loss": 7.2464, + "step": 5475 + }, + { + "epoch": 0.5109638891480824, + "grad_norm": 0.9469613230650809, + "learning_rate": 0.0002977017122543529, + "loss": 7.4211, + "step": 5476 + }, + { + "epoch": 0.5110571988429597, + "grad_norm": 10.545977450644092, + "learning_rate": 0.0002977003930788557, + "loss": 7.3734, + "step": 5477 + }, + { + "epoch": 0.5111505085378371, + "grad_norm": 1.2866047068766906, + "learning_rate": 0.00029769907352780007, + "loss": 7.3263, + "step": 5478 + }, + { + "epoch": 0.5112438182327144, + "grad_norm": 0.9996976663924821, + "learning_rate": 0.0002976977536011895, + "loss": 7.4842, + "step": 5479 + }, + { + "epoch": 0.5113371279275917, + "grad_norm": 1.1809013726234736, + "learning_rate": 0.0002976964332990273, + "loss": 7.2744, + "step": 5480 + }, + { + "epoch": 0.511430437622469, + "grad_norm": 1.4029801835641005, + "learning_rate": 0.00029769511262131684, + "loss": 7.2195, + "step": 5481 + }, + { + "epoch": 0.5115237473173463, + "grad_norm": 1.0323710649875961, + "learning_rate": 0.0002976937915680614, + "loss": 7.1763, + "step": 5482 + }, + { + "epoch": 0.5116170570122236, + "grad_norm": 9.906525348769293, + "learning_rate": 0.00029769247013926444, + "loss": 6.9223, + "step": 5483 + }, + { + "epoch": 0.5117103667071009, + "grad_norm": 0.7552496068691048, + "learning_rate": 0.0002976911483349293, + "loss": 7.3719, + "step": 5484 + }, + { + "epoch": 0.5118036764019782, + "grad_norm": 0.9091494528243481, + "learning_rate": 0.0002976898261550593, + "loss": 7.3538, + "step": 5485 + }, + { + "epoch": 0.5118969860968554, + "grad_norm": 1.1039247205818399, + "learning_rate": 0.0002976885035996578, + "loss": 7.1295, + "step": 5486 + }, + { + "epoch": 0.5119902957917327, + "grad_norm": 0.8865803592937442, + "learning_rate": 0.00029768718066872817, + "loss": 7.2179, + "step": 5487 + }, + { + "epoch": 0.51208360548661, + "grad_norm": 0.7361395851119547, + "learning_rate": 0.00029768585736227385, + "loss": 7.2448, + "step": 5488 + }, + { + "epoch": 0.5121769151814873, + "grad_norm": 16.42893513017974, + "learning_rate": 0.0002976845336802981, + "loss": 7.074, + "step": 5489 + }, + { + "epoch": 0.5122702248763646, + "grad_norm": 1.2832228356405544, + "learning_rate": 0.0002976832096228043, + "loss": 7.3055, + "step": 5490 + }, + { + "epoch": 0.512363534571242, + "grad_norm": 1.23849647576753, + "learning_rate": 0.0002976818851897959, + "loss": 7.4106, + "step": 5491 + }, + { + "epoch": 0.5124568442661193, + "grad_norm": 5.20220714484956, + "learning_rate": 0.0002976805603812762, + "loss": 7.421, + "step": 5492 + }, + { + "epoch": 0.5125501539609966, + "grad_norm": 1.4884882335207854, + "learning_rate": 0.0002976792351972486, + "loss": 7.2615, + "step": 5493 + }, + { + "epoch": 0.5126434636558739, + "grad_norm": 1.0209921738194696, + "learning_rate": 0.0002976779096377164, + "loss": 7.3111, + "step": 5494 + }, + { + "epoch": 0.5127367733507512, + "grad_norm": 8.564989511513406, + "learning_rate": 0.000297676583702683, + "loss": 7.2858, + "step": 5495 + }, + { + "epoch": 0.5128300830456285, + "grad_norm": 6.53877531873847, + "learning_rate": 0.0002976752573921519, + "loss": 7.2199, + "step": 5496 + }, + { + "epoch": 0.5129233927405057, + "grad_norm": 7.456764710397896, + "learning_rate": 0.00029767393070612624, + "loss": 6.9577, + "step": 5497 + }, + { + "epoch": 0.513016702435383, + "grad_norm": 0.7545511835787401, + "learning_rate": 0.00029767260364460963, + "loss": 7.1701, + "step": 5498 + }, + { + "epoch": 0.5131100121302603, + "grad_norm": 0.9526117418988547, + "learning_rate": 0.00029767127620760524, + "loss": 7.0663, + "step": 5499 + }, + { + "epoch": 0.5132033218251376, + "grad_norm": 3.527035257593942, + "learning_rate": 0.0002976699483951166, + "loss": 7.4756, + "step": 5500 + }, + { + "epoch": 0.5132966315200149, + "grad_norm": 0.8955354576499531, + "learning_rate": 0.000297668620207147, + "loss": 7.495, + "step": 5501 + }, + { + "epoch": 0.5133899412148922, + "grad_norm": 0.8601139642245975, + "learning_rate": 0.00029766729164369985, + "loss": 7.3779, + "step": 5502 + }, + { + "epoch": 0.5134832509097695, + "grad_norm": 0.7171361478305803, + "learning_rate": 0.0002976659627047785, + "loss": 6.9758, + "step": 5503 + }, + { + "epoch": 0.5135765606046468, + "grad_norm": 1.1678557334712483, + "learning_rate": 0.00029766463339038637, + "loss": 7.3086, + "step": 5504 + }, + { + "epoch": 0.5136698702995242, + "grad_norm": 0.8505038314092759, + "learning_rate": 0.0002976633037005268, + "loss": 7.2182, + "step": 5505 + }, + { + "epoch": 0.5137631799944015, + "grad_norm": 0.7784461388938968, + "learning_rate": 0.00029766197363520323, + "loss": 7.4315, + "step": 5506 + }, + { + "epoch": 0.5138564896892787, + "grad_norm": 0.7153791909312185, + "learning_rate": 0.00029766064319441894, + "loss": 7.2947, + "step": 5507 + }, + { + "epoch": 0.513949799384156, + "grad_norm": 0.8388616833270206, + "learning_rate": 0.0002976593123781774, + "loss": 7.3658, + "step": 5508 + }, + { + "epoch": 0.5140431090790333, + "grad_norm": 0.8664933912257158, + "learning_rate": 0.000297657981186482, + "loss": 7.3464, + "step": 5509 + }, + { + "epoch": 0.5141364187739106, + "grad_norm": 0.7664179973028062, + "learning_rate": 0.00029765664961933606, + "loss": 7.4552, + "step": 5510 + }, + { + "epoch": 0.5142297284687879, + "grad_norm": 1.0496945096275085, + "learning_rate": 0.00029765531767674306, + "loss": 7.624, + "step": 5511 + }, + { + "epoch": 0.5143230381636652, + "grad_norm": 0.7753507192079101, + "learning_rate": 0.00029765398535870624, + "loss": 7.3983, + "step": 5512 + }, + { + "epoch": 0.5144163478585425, + "grad_norm": 1.1202295927848163, + "learning_rate": 0.00029765265266522913, + "loss": 7.2707, + "step": 5513 + }, + { + "epoch": 0.5145096575534198, + "grad_norm": 0.7807289086070184, + "learning_rate": 0.00029765131959631503, + "loss": 7.0685, + "step": 5514 + }, + { + "epoch": 0.5146029672482971, + "grad_norm": 1.1808894247379007, + "learning_rate": 0.00029764998615196736, + "loss": 7.6824, + "step": 5515 + }, + { + "epoch": 0.5146962769431744, + "grad_norm": 0.7145406187414036, + "learning_rate": 0.00029764865233218953, + "loss": 7.1894, + "step": 5516 + }, + { + "epoch": 0.5147895866380517, + "grad_norm": 0.747957593698464, + "learning_rate": 0.0002976473181369849, + "loss": 7.1148, + "step": 5517 + }, + { + "epoch": 0.5148828963329289, + "grad_norm": 0.6578403823722342, + "learning_rate": 0.0002976459835663569, + "loss": 7.1215, + "step": 5518 + }, + { + "epoch": 0.5149762060278062, + "grad_norm": 5.339064215320555, + "learning_rate": 0.0002976446486203089, + "loss": 7.079, + "step": 5519 + }, + { + "epoch": 0.5150695157226836, + "grad_norm": 22.881278895817236, + "learning_rate": 0.0002976433132988443, + "loss": 7.1357, + "step": 5520 + }, + { + "epoch": 0.5151628254175609, + "grad_norm": 1.1991512041393506, + "learning_rate": 0.00029764197760196647, + "loss": 7.2489, + "step": 5521 + }, + { + "epoch": 0.5152561351124382, + "grad_norm": 81.29160467090757, + "learning_rate": 0.00029764064152967883, + "loss": 7.3695, + "step": 5522 + }, + { + "epoch": 0.5153494448073155, + "grad_norm": 1.0908037050187949, + "learning_rate": 0.00029763930508198473, + "loss": 7.0259, + "step": 5523 + }, + { + "epoch": 0.5154427545021928, + "grad_norm": 1.4409351117872247, + "learning_rate": 0.0002976379682588877, + "loss": 7.3389, + "step": 5524 + }, + { + "epoch": 0.5155360641970701, + "grad_norm": 0.8296821234839477, + "learning_rate": 0.00029763663106039095, + "loss": 7.1565, + "step": 5525 + }, + { + "epoch": 0.5156293738919474, + "grad_norm": 68.14014469708306, + "learning_rate": 0.00029763529348649805, + "loss": 7.181, + "step": 5526 + }, + { + "epoch": 0.5157226835868247, + "grad_norm": 1.2511746689589116, + "learning_rate": 0.0002976339555372123, + "loss": 7.3431, + "step": 5527 + }, + { + "epoch": 0.515815993281702, + "grad_norm": 1.0988395630937224, + "learning_rate": 0.00029763261721253714, + "loss": 7.227, + "step": 5528 + }, + { + "epoch": 0.5159093029765792, + "grad_norm": 1.2919478131644846, + "learning_rate": 0.00029763127851247596, + "loss": 6.9212, + "step": 5529 + }, + { + "epoch": 0.5160026126714565, + "grad_norm": 0.7842007769442945, + "learning_rate": 0.0002976299394370322, + "loss": 6.9363, + "step": 5530 + }, + { + "epoch": 0.5160959223663338, + "grad_norm": 40.59145470728087, + "learning_rate": 0.0002976285999862092, + "loss": 7.2493, + "step": 5531 + }, + { + "epoch": 0.5161892320612111, + "grad_norm": 2.6944255373404893, + "learning_rate": 0.0002976272601600104, + "loss": 7.1502, + "step": 5532 + }, + { + "epoch": 0.5162825417560885, + "grad_norm": 1.4878521143104961, + "learning_rate": 0.00029762591995843927, + "loss": 7.3006, + "step": 5533 + }, + { + "epoch": 0.5163758514509658, + "grad_norm": 16.877004096530193, + "learning_rate": 0.0002976245793814991, + "loss": 7.2564, + "step": 5534 + }, + { + "epoch": 0.5164691611458431, + "grad_norm": 1.0304495227318855, + "learning_rate": 0.0002976232384291934, + "loss": 7.1961, + "step": 5535 + }, + { + "epoch": 0.5165624708407204, + "grad_norm": 0.9761730287708172, + "learning_rate": 0.0002976218971015255, + "loss": 7.0755, + "step": 5536 + }, + { + "epoch": 0.5166557805355977, + "grad_norm": 1.3315998208556752, + "learning_rate": 0.00029762055539849885, + "loss": 7.4246, + "step": 5537 + }, + { + "epoch": 0.516749090230475, + "grad_norm": 136.26910947386466, + "learning_rate": 0.0002976192133201168, + "loss": 7.063, + "step": 5538 + }, + { + "epoch": 0.5168423999253522, + "grad_norm": 1.1267681142989188, + "learning_rate": 0.0002976178708663829, + "loss": 7.2129, + "step": 5539 + }, + { + "epoch": 0.5169357096202295, + "grad_norm": 1.3086218088495847, + "learning_rate": 0.0002976165280373004, + "loss": 7.0492, + "step": 5540 + }, + { + "epoch": 0.5170290193151068, + "grad_norm": 1.1180452983997955, + "learning_rate": 0.0002976151848328729, + "loss": 6.943, + "step": 5541 + }, + { + "epoch": 0.5171223290099841, + "grad_norm": 0.7993311237122068, + "learning_rate": 0.00029761384125310365, + "loss": 7.2954, + "step": 5542 + }, + { + "epoch": 0.5172156387048614, + "grad_norm": 1.2688501207835539, + "learning_rate": 0.0002976124972979961, + "loss": 7.0306, + "step": 5543 + }, + { + "epoch": 0.5173089483997387, + "grad_norm": 1.9169644644052082, + "learning_rate": 0.00029761115296755375, + "loss": 7.0856, + "step": 5544 + }, + { + "epoch": 0.517402258094616, + "grad_norm": 1.0897914306907628, + "learning_rate": 0.00029760980826178, + "loss": 7.2646, + "step": 5545 + }, + { + "epoch": 0.5174955677894933, + "grad_norm": 1.101984071736771, + "learning_rate": 0.00029760846318067815, + "loss": 7.2608, + "step": 5546 + }, + { + "epoch": 0.5175888774843707, + "grad_norm": 1.7268386880029596, + "learning_rate": 0.0002976071177242517, + "loss": 6.8837, + "step": 5547 + }, + { + "epoch": 0.517682187179248, + "grad_norm": 1376.4612133499909, + "learning_rate": 0.0002976057718925041, + "loss": 7.2344, + "step": 5548 + }, + { + "epoch": 0.5177754968741253, + "grad_norm": 1.3616195183592008, + "learning_rate": 0.0002976044256854388, + "loss": 7.0586, + "step": 5549 + }, + { + "epoch": 0.5178688065690025, + "grad_norm": 1.4548048777648035, + "learning_rate": 0.0002976030791030591, + "loss": 7.4702, + "step": 5550 + }, + { + "epoch": 0.5179621162638798, + "grad_norm": 0.8157738407952728, + "learning_rate": 0.0002976017321453685, + "loss": 7.216, + "step": 5551 + }, + { + "epoch": 0.5180554259587571, + "grad_norm": 1.555393160706654, + "learning_rate": 0.00029760038481237047, + "loss": 6.8621, + "step": 5552 + }, + { + "epoch": 0.5181487356536344, + "grad_norm": 6708.36666535818, + "learning_rate": 0.0002975990371040683, + "loss": 7.1124, + "step": 5553 + }, + { + "epoch": 0.5182420453485117, + "grad_norm": 1.0049586669969761, + "learning_rate": 0.00029759768902046555, + "loss": 7.1633, + "step": 5554 + }, + { + "epoch": 0.518335355043389, + "grad_norm": 1.3069865266987708, + "learning_rate": 0.0002975963405615656, + "loss": 6.9606, + "step": 5555 + }, + { + "epoch": 0.5184286647382663, + "grad_norm": 1460.8872319070947, + "learning_rate": 0.00029759499172737187, + "loss": 7.3226, + "step": 5556 + }, + { + "epoch": 0.5185219744331436, + "grad_norm": 1.7458598871112223, + "learning_rate": 0.0002975936425178878, + "loss": 7.2905, + "step": 5557 + }, + { + "epoch": 0.5186152841280209, + "grad_norm": 1.1268674607365596, + "learning_rate": 0.00029759229293311684, + "loss": 6.8495, + "step": 5558 + }, + { + "epoch": 0.5187085938228982, + "grad_norm": 1.473206365509928, + "learning_rate": 0.0002975909429730624, + "loss": 7.0531, + "step": 5559 + }, + { + "epoch": 0.5188019035177754, + "grad_norm": 1.652209165989912, + "learning_rate": 0.00029758959263772786, + "loss": 7.1605, + "step": 5560 + }, + { + "epoch": 0.5188952132126528, + "grad_norm": 1.1564654874666267, + "learning_rate": 0.0002975882419271168, + "loss": 7.3618, + "step": 5561 + }, + { + "epoch": 0.5189885229075301, + "grad_norm": 1.7095456233516806, + "learning_rate": 0.00029758689084123245, + "loss": 7.3965, + "step": 5562 + }, + { + "epoch": 0.5190818326024074, + "grad_norm": 1.3111580991358445, + "learning_rate": 0.0002975855393800784, + "loss": 7.2898, + "step": 5563 + }, + { + "epoch": 0.5191751422972847, + "grad_norm": 1.5296694769430936, + "learning_rate": 0.00029758418754365806, + "loss": 7.1925, + "step": 5564 + }, + { + "epoch": 0.519268451992162, + "grad_norm": 1.5650023246297098, + "learning_rate": 0.0002975828353319748, + "loss": 7.2165, + "step": 5565 + }, + { + "epoch": 0.5193617616870393, + "grad_norm": 12510039.606382644, + "learning_rate": 0.00029758148274503215, + "loss": 7.1895, + "step": 5566 + }, + { + "epoch": 0.5194550713819166, + "grad_norm": 1.303160695721859, + "learning_rate": 0.0002975801297828335, + "loss": 7.4686, + "step": 5567 + }, + { + "epoch": 0.5195483810767939, + "grad_norm": 1.0252426777787438, + "learning_rate": 0.0002975787764453823, + "loss": 7.139, + "step": 5568 + }, + { + "epoch": 0.5196416907716712, + "grad_norm": 1.4665403326865414, + "learning_rate": 0.00029757742273268196, + "loss": 7.7627, + "step": 5569 + }, + { + "epoch": 0.5197350004665485, + "grad_norm": 1.0164876330203079, + "learning_rate": 0.00029757606864473597, + "loss": 7.3234, + "step": 5570 + }, + { + "epoch": 0.5198283101614257, + "grad_norm": 5351.083945093774, + "learning_rate": 0.0002975747141815477, + "loss": 7.1477, + "step": 5571 + }, + { + "epoch": 0.519921619856303, + "grad_norm": 1.7680207735500904, + "learning_rate": 0.0002975733593431207, + "loss": 6.9169, + "step": 5572 + }, + { + "epoch": 0.5200149295511803, + "grad_norm": 0.7730848293732631, + "learning_rate": 0.00029757200412945835, + "loss": 7.2317, + "step": 5573 + }, + { + "epoch": 0.5201082392460576, + "grad_norm": 325.154125066738, + "learning_rate": 0.0002975706485405642, + "loss": 7.2266, + "step": 5574 + }, + { + "epoch": 0.520201548940935, + "grad_norm": 71.44592249693216, + "learning_rate": 0.00029756929257644145, + "loss": 7.4582, + "step": 5575 + }, + { + "epoch": 0.5202948586358123, + "grad_norm": 1.465370844693978, + "learning_rate": 0.0002975679362370938, + "loss": 7.5087, + "step": 5576 + }, + { + "epoch": 0.5203881683306896, + "grad_norm": 1.1037849293873456, + "learning_rate": 0.00029756657952252456, + "loss": 7.2771, + "step": 5577 + }, + { + "epoch": 0.5204814780255669, + "grad_norm": 2.0349169862485614, + "learning_rate": 0.00029756522243273723, + "loss": 7.2306, + "step": 5578 + }, + { + "epoch": 0.5205747877204442, + "grad_norm": 1.5222434198301071, + "learning_rate": 0.0002975638649677352, + "loss": 7.266, + "step": 5579 + }, + { + "epoch": 0.5206680974153215, + "grad_norm": 5.843886144327941, + "learning_rate": 0.00029756250712752207, + "loss": 6.9053, + "step": 5580 + }, + { + "epoch": 0.5207614071101988, + "grad_norm": 2.270061740062785, + "learning_rate": 0.0002975611489121012, + "loss": 7.3047, + "step": 5581 + }, + { + "epoch": 0.520854716805076, + "grad_norm": 1.79613817210494, + "learning_rate": 0.000297559790321476, + "loss": 7.13, + "step": 5582 + }, + { + "epoch": 0.5209480264999533, + "grad_norm": 1.88944900456797, + "learning_rate": 0.0002975584313556499, + "loss": 7.515, + "step": 5583 + }, + { + "epoch": 0.5210413361948306, + "grad_norm": 0.9723348958186208, + "learning_rate": 0.00029755707201462646, + "loss": 7.3528, + "step": 5584 + }, + { + "epoch": 0.5211346458897079, + "grad_norm": 2.2115827150189835, + "learning_rate": 0.00029755571229840913, + "loss": 7.2443, + "step": 5585 + }, + { + "epoch": 0.5212279555845852, + "grad_norm": 4.041940553853615, + "learning_rate": 0.0002975543522070013, + "loss": 7.4173, + "step": 5586 + }, + { + "epoch": 0.5213212652794625, + "grad_norm": 1.826751601207955, + "learning_rate": 0.0002975529917404065, + "loss": 7.3686, + "step": 5587 + }, + { + "epoch": 0.5214145749743399, + "grad_norm": 0.9180549266097208, + "learning_rate": 0.0002975516308986281, + "loss": 7.0002, + "step": 5588 + }, + { + "epoch": 0.5215078846692172, + "grad_norm": 1.4076752323589126, + "learning_rate": 0.0002975502696816696, + "loss": 7.4953, + "step": 5589 + }, + { + "epoch": 0.5216011943640945, + "grad_norm": 3.7872580365977297, + "learning_rate": 0.0002975489080895346, + "loss": 7.173, + "step": 5590 + }, + { + "epoch": 0.5216945040589718, + "grad_norm": 1.7664902125957553, + "learning_rate": 0.0002975475461222263, + "loss": 7.49, + "step": 5591 + }, + { + "epoch": 0.521787813753849, + "grad_norm": 1.0472486649314323, + "learning_rate": 0.00029754618377974835, + "loss": 7.3434, + "step": 5592 + }, + { + "epoch": 0.5218811234487263, + "grad_norm": 1.417216069812287, + "learning_rate": 0.00029754482106210417, + "loss": 7.3236, + "step": 5593 + }, + { + "epoch": 0.5219744331436036, + "grad_norm": 1.6547125036242492, + "learning_rate": 0.00029754345796929714, + "loss": 7.3923, + "step": 5594 + }, + { + "epoch": 0.5220677428384809, + "grad_norm": 2.2114678872072684, + "learning_rate": 0.0002975420945013309, + "loss": 7.3533, + "step": 5595 + }, + { + "epoch": 0.5221610525333582, + "grad_norm": 1.4055988179725156, + "learning_rate": 0.0002975407306582088, + "loss": 7.564, + "step": 5596 + }, + { + "epoch": 0.5222543622282355, + "grad_norm": 2.6373929749554037, + "learning_rate": 0.00029753936643993426, + "loss": 7.4373, + "step": 5597 + }, + { + "epoch": 0.5223476719231128, + "grad_norm": 1.0584238913517083, + "learning_rate": 0.00029753800184651087, + "loss": 7.5082, + "step": 5598 + }, + { + "epoch": 0.5224409816179901, + "grad_norm": 1.1768799461507713, + "learning_rate": 0.000297536636877942, + "loss": 7.2374, + "step": 5599 + }, + { + "epoch": 0.5225342913128674, + "grad_norm": 1.6569315087719512, + "learning_rate": 0.0002975352715342312, + "loss": 7.0557, + "step": 5600 + }, + { + "epoch": 0.5226276010077447, + "grad_norm": 1.8747579250301976, + "learning_rate": 0.00029753390581538196, + "loss": 7.3315, + "step": 5601 + }, + { + "epoch": 0.522720910702622, + "grad_norm": 4.973119312309435, + "learning_rate": 0.00029753253972139766, + "loss": 7.2379, + "step": 5602 + }, + { + "epoch": 0.5228142203974993, + "grad_norm": 1.8935040184891485, + "learning_rate": 0.0002975311732522818, + "loss": 7.6248, + "step": 5603 + }, + { + "epoch": 0.5229075300923766, + "grad_norm": 1.077484506338242, + "learning_rate": 0.00029752980640803783, + "loss": 7.3252, + "step": 5604 + }, + { + "epoch": 0.5230008397872539, + "grad_norm": 2.64833278129616, + "learning_rate": 0.0002975284391886693, + "loss": 7.4234, + "step": 5605 + }, + { + "epoch": 0.5230941494821312, + "grad_norm": 1.300275209421068, + "learning_rate": 0.00029752707159417965, + "loss": 7.5269, + "step": 5606 + }, + { + "epoch": 0.5231874591770085, + "grad_norm": 1.3610081152994256, + "learning_rate": 0.0002975257036245724, + "loss": 7.0855, + "step": 5607 + }, + { + "epoch": 0.5232807688718858, + "grad_norm": 0.9729433079909985, + "learning_rate": 0.00029752433527985094, + "loss": 7.388, + "step": 5608 + }, + { + "epoch": 0.5233740785667631, + "grad_norm": 3.9179777687606414, + "learning_rate": 0.0002975229665600188, + "loss": 7.8255, + "step": 5609 + }, + { + "epoch": 0.5234673882616404, + "grad_norm": 0.8922821090717099, + "learning_rate": 0.00029752159746507944, + "loss": 7.2986, + "step": 5610 + }, + { + "epoch": 0.5235606979565177, + "grad_norm": 1.563295195921573, + "learning_rate": 0.0002975202279950364, + "loss": 7.0124, + "step": 5611 + }, + { + "epoch": 0.523654007651395, + "grad_norm": 3.3610051915715906, + "learning_rate": 0.00029751885814989303, + "loss": 7.4652, + "step": 5612 + }, + { + "epoch": 0.5237473173462722, + "grad_norm": 0.6943193808750039, + "learning_rate": 0.000297517487929653, + "loss": 7.2628, + "step": 5613 + }, + { + "epoch": 0.5238406270411495, + "grad_norm": 1.3472412027802556, + "learning_rate": 0.00029751611733431967, + "loss": 7.444, + "step": 5614 + }, + { + "epoch": 0.5239339367360268, + "grad_norm": 1.73159543755245, + "learning_rate": 0.00029751474636389655, + "loss": 7.681, + "step": 5615 + }, + { + "epoch": 0.5240272464309041, + "grad_norm": 0.9911058278519963, + "learning_rate": 0.0002975133750183871, + "loss": 7.3331, + "step": 5616 + }, + { + "epoch": 0.5241205561257815, + "grad_norm": 1.5555648293432032, + "learning_rate": 0.00029751200329779486, + "loss": 7.1604, + "step": 5617 + }, + { + "epoch": 0.5242138658206588, + "grad_norm": 1.3989704896942214, + "learning_rate": 0.0002975106312021233, + "loss": 7.1694, + "step": 5618 + }, + { + "epoch": 0.5243071755155361, + "grad_norm": 2.1098494709420463, + "learning_rate": 0.0002975092587313759, + "loss": 7.333, + "step": 5619 + }, + { + "epoch": 0.5244004852104134, + "grad_norm": 1.519520312806898, + "learning_rate": 0.0002975078858855561, + "loss": 7.3015, + "step": 5620 + }, + { + "epoch": 0.5244937949052907, + "grad_norm": 1.3692988206244119, + "learning_rate": 0.00029750651266466745, + "loss": 7.4498, + "step": 5621 + }, + { + "epoch": 0.524587104600168, + "grad_norm": 0.8032610354589735, + "learning_rate": 0.0002975051390687135, + "loss": 7.4211, + "step": 5622 + }, + { + "epoch": 0.5246804142950453, + "grad_norm": 3.579509297101848, + "learning_rate": 0.0002975037650976976, + "loss": 7.2671, + "step": 5623 + }, + { + "epoch": 0.5247737239899225, + "grad_norm": 1.6444346323103087, + "learning_rate": 0.0002975023907516234, + "loss": 7.076, + "step": 5624 + }, + { + "epoch": 0.5248670336847998, + "grad_norm": 1.7745578372364965, + "learning_rate": 0.00029750101603049424, + "loss": 7.1821, + "step": 5625 + }, + { + "epoch": 0.5249603433796771, + "grad_norm": 0.7960041540158261, + "learning_rate": 0.0002974996409343137, + "loss": 7.1183, + "step": 5626 + }, + { + "epoch": 0.5250536530745544, + "grad_norm": 1.5879151803536706, + "learning_rate": 0.00029749826546308537, + "loss": 7.0002, + "step": 5627 + }, + { + "epoch": 0.5251469627694317, + "grad_norm": 2.531366146218508, + "learning_rate": 0.0002974968896168125, + "loss": 7.6221, + "step": 5628 + }, + { + "epoch": 0.525240272464309, + "grad_norm": 2.5867595205376976, + "learning_rate": 0.0002974955133954988, + "loss": 7.1906, + "step": 5629 + }, + { + "epoch": 0.5253335821591864, + "grad_norm": 1.7386421374822463, + "learning_rate": 0.0002974941367991477, + "loss": 7.0918, + "step": 5630 + }, + { + "epoch": 0.5254268918540637, + "grad_norm": 1.6779151395866472, + "learning_rate": 0.0002974927598277627, + "loss": 7.3817, + "step": 5631 + }, + { + "epoch": 0.525520201548941, + "grad_norm": 1.263807787582815, + "learning_rate": 0.0002974913824813473, + "loss": 7.2172, + "step": 5632 + }, + { + "epoch": 0.5256135112438183, + "grad_norm": 0.7775918413028816, + "learning_rate": 0.000297490004759905, + "loss": 7.2118, + "step": 5633 + }, + { + "epoch": 0.5257068209386956, + "grad_norm": 1.541500154775004, + "learning_rate": 0.0002974886266634393, + "loss": 7.3998, + "step": 5634 + }, + { + "epoch": 0.5258001306335728, + "grad_norm": 1.5239516450177433, + "learning_rate": 0.0002974872481919537, + "loss": 7.4803, + "step": 5635 + }, + { + "epoch": 0.5258934403284501, + "grad_norm": 0.9538535087670147, + "learning_rate": 0.00029748586934545175, + "loss": 7.3942, + "step": 5636 + }, + { + "epoch": 0.5259867500233274, + "grad_norm": 1.1358593642154653, + "learning_rate": 0.0002974844901239369, + "loss": 7.3431, + "step": 5637 + }, + { + "epoch": 0.5260800597182047, + "grad_norm": 1.4623564102678797, + "learning_rate": 0.0002974831105274127, + "loss": 7.2912, + "step": 5638 + }, + { + "epoch": 0.526173369413082, + "grad_norm": 1.601125621371205, + "learning_rate": 0.0002974817305558826, + "loss": 7.135, + "step": 5639 + }, + { + "epoch": 0.5262666791079593, + "grad_norm": 1.0632334956397642, + "learning_rate": 0.00029748035020935016, + "loss": 7.283, + "step": 5640 + }, + { + "epoch": 0.5263599888028366, + "grad_norm": 1.1411438394873568, + "learning_rate": 0.0002974789694878189, + "loss": 6.9645, + "step": 5641 + }, + { + "epoch": 0.5264532984977139, + "grad_norm": 1.9396243477592858, + "learning_rate": 0.0002974775883912923, + "loss": 7.6187, + "step": 5642 + }, + { + "epoch": 0.5265466081925912, + "grad_norm": 0.9237413160030805, + "learning_rate": 0.0002974762069197739, + "loss": 6.7311, + "step": 5643 + }, + { + "epoch": 0.5266399178874686, + "grad_norm": 0.7891333505297808, + "learning_rate": 0.00029747482507326713, + "loss": 7.0911, + "step": 5644 + }, + { + "epoch": 0.5267332275823458, + "grad_norm": 0.9798510518157677, + "learning_rate": 0.0002974734428517756, + "loss": 7.2651, + "step": 5645 + }, + { + "epoch": 0.5268265372772231, + "grad_norm": 3.8271620706067306, + "learning_rate": 0.0002974720602553028, + "loss": 7.2643, + "step": 5646 + }, + { + "epoch": 0.5269198469721004, + "grad_norm": 4.894447195780574, + "learning_rate": 0.00029747067728385225, + "loss": 6.9846, + "step": 5647 + }, + { + "epoch": 0.5270131566669777, + "grad_norm": 0.8181775032980565, + "learning_rate": 0.0002974692939374274, + "loss": 6.9808, + "step": 5648 + }, + { + "epoch": 0.527106466361855, + "grad_norm": 1.7135994005591437, + "learning_rate": 0.0002974679102160318, + "loss": 7.3806, + "step": 5649 + }, + { + "epoch": 0.5271997760567323, + "grad_norm": 0.8714887890774375, + "learning_rate": 0.00029746652611966907, + "loss": 7.0771, + "step": 5650 + }, + { + "epoch": 0.5272930857516096, + "grad_norm": 0.7957005241723136, + "learning_rate": 0.00029746514164834264, + "loss": 7.1174, + "step": 5651 + }, + { + "epoch": 0.5273863954464869, + "grad_norm": 1.0801385903083816, + "learning_rate": 0.000297463756802056, + "loss": 7.1993, + "step": 5652 + }, + { + "epoch": 0.5274797051413642, + "grad_norm": 3.451229743420949, + "learning_rate": 0.00029746237158081274, + "loss": 7.2242, + "step": 5653 + }, + { + "epoch": 0.5275730148362415, + "grad_norm": 0.7857129580633532, + "learning_rate": 0.0002974609859846163, + "loss": 7.3658, + "step": 5654 + }, + { + "epoch": 0.5276663245311188, + "grad_norm": 1.041663009349726, + "learning_rate": 0.0002974596000134703, + "loss": 7.3418, + "step": 5655 + }, + { + "epoch": 0.527759634225996, + "grad_norm": 1.1079770151419666, + "learning_rate": 0.0002974582136673782, + "loss": 7.3039, + "step": 5656 + }, + { + "epoch": 0.5278529439208733, + "grad_norm": 0.7695343534041593, + "learning_rate": 0.0002974568269463435, + "loss": 7.3763, + "step": 5657 + }, + { + "epoch": 0.5279462536157506, + "grad_norm": 0.9335398208561628, + "learning_rate": 0.0002974554398503698, + "loss": 6.8612, + "step": 5658 + }, + { + "epoch": 0.528039563310628, + "grad_norm": 13.73689340562289, + "learning_rate": 0.00029745405237946063, + "loss": 7.1448, + "step": 5659 + }, + { + "epoch": 0.5281328730055053, + "grad_norm": 0.5949347415494634, + "learning_rate": 0.00029745266453361945, + "loss": 7.1759, + "step": 5660 + }, + { + "epoch": 0.5282261827003826, + "grad_norm": 3.195167267587028, + "learning_rate": 0.0002974512763128499, + "loss": 7.1353, + "step": 5661 + }, + { + "epoch": 0.5283194923952599, + "grad_norm": 1.1426714589363969, + "learning_rate": 0.0002974498877171553, + "loss": 7.1713, + "step": 5662 + }, + { + "epoch": 0.5284128020901372, + "grad_norm": 10.406978135645822, + "learning_rate": 0.0002974484987465394, + "loss": 7.3479, + "step": 5663 + }, + { + "epoch": 0.5285061117850145, + "grad_norm": 1.030776793116021, + "learning_rate": 0.0002974471094010056, + "loss": 7.1374, + "step": 5664 + }, + { + "epoch": 0.5285994214798918, + "grad_norm": 0.8389086772024406, + "learning_rate": 0.0002974457196805575, + "loss": 7.3583, + "step": 5665 + }, + { + "epoch": 0.528692731174769, + "grad_norm": 5.832425245438535, + "learning_rate": 0.00029744432958519866, + "loss": 7.4214, + "step": 5666 + }, + { + "epoch": 0.5287860408696463, + "grad_norm": 0.766275450650267, + "learning_rate": 0.0002974429391149325, + "loss": 7.1991, + "step": 5667 + }, + { + "epoch": 0.5288793505645236, + "grad_norm": 1.5408089353708492, + "learning_rate": 0.00029744154826976267, + "loss": 7.0001, + "step": 5668 + }, + { + "epoch": 0.5289726602594009, + "grad_norm": 0.815582817481556, + "learning_rate": 0.0002974401570496926, + "loss": 6.9245, + "step": 5669 + }, + { + "epoch": 0.5290659699542782, + "grad_norm": 1.1122578869226505, + "learning_rate": 0.00029743876545472594, + "loss": 7.2444, + "step": 5670 + }, + { + "epoch": 0.5291592796491555, + "grad_norm": 13.372707813446462, + "learning_rate": 0.00029743737348486614, + "loss": 7.3638, + "step": 5671 + }, + { + "epoch": 0.5292525893440329, + "grad_norm": 1.5302960038395765, + "learning_rate": 0.00029743598114011677, + "loss": 6.8401, + "step": 5672 + }, + { + "epoch": 0.5293458990389102, + "grad_norm": 0.949791107451352, + "learning_rate": 0.0002974345884204814, + "loss": 7.0312, + "step": 5673 + }, + { + "epoch": 0.5294392087337875, + "grad_norm": 1.580470885100769, + "learning_rate": 0.0002974331953259635, + "loss": 7.0842, + "step": 5674 + }, + { + "epoch": 0.5295325184286648, + "grad_norm": 1.4619168778718212, + "learning_rate": 0.00029743180185656673, + "loss": 7.1873, + "step": 5675 + }, + { + "epoch": 0.5296258281235421, + "grad_norm": 0.9582863202707437, + "learning_rate": 0.00029743040801229447, + "loss": 7.0851, + "step": 5676 + }, + { + "epoch": 0.5297191378184193, + "grad_norm": 0.8996313629709567, + "learning_rate": 0.0002974290137931504, + "loss": 7.0424, + "step": 5677 + }, + { + "epoch": 0.5298124475132966, + "grad_norm": 21.567329815794587, + "learning_rate": 0.00029742761919913805, + "loss": 7.1562, + "step": 5678 + }, + { + "epoch": 0.5299057572081739, + "grad_norm": 0.9835770833121605, + "learning_rate": 0.0002974262242302609, + "loss": 7.2219, + "step": 5679 + }, + { + "epoch": 0.5299990669030512, + "grad_norm": 0.750922584388463, + "learning_rate": 0.0002974248288865226, + "loss": 7.3668, + "step": 5680 + }, + { + "epoch": 0.5300923765979285, + "grad_norm": 0.8653468257073605, + "learning_rate": 0.00029742343316792655, + "loss": 7.4645, + "step": 5681 + }, + { + "epoch": 0.5301856862928058, + "grad_norm": 1.048357369749519, + "learning_rate": 0.00029742203707447634, + "loss": 7.4014, + "step": 5682 + }, + { + "epoch": 0.5302789959876831, + "grad_norm": 0.6313895256115395, + "learning_rate": 0.00029742064060617564, + "loss": 7.3408, + "step": 5683 + }, + { + "epoch": 0.5303723056825604, + "grad_norm": 1.0444399426755338, + "learning_rate": 0.00029741924376302793, + "loss": 7.2865, + "step": 5684 + }, + { + "epoch": 0.5304656153774377, + "grad_norm": 0.6340577676848844, + "learning_rate": 0.0002974178465450367, + "loss": 7.0825, + "step": 5685 + }, + { + "epoch": 0.5305589250723151, + "grad_norm": 2.3680899059645326, + "learning_rate": 0.0002974164489522056, + "loss": 7.2213, + "step": 5686 + }, + { + "epoch": 0.5306522347671924, + "grad_norm": 0.8446519046394415, + "learning_rate": 0.00029741505098453814, + "loss": 6.9489, + "step": 5687 + }, + { + "epoch": 0.5307455444620696, + "grad_norm": 0.729073730687658, + "learning_rate": 0.0002974136526420378, + "loss": 6.6873, + "step": 5688 + }, + { + "epoch": 0.5308388541569469, + "grad_norm": 5.740926944222237, + "learning_rate": 0.0002974122539247083, + "loss": 6.9647, + "step": 5689 + }, + { + "epoch": 0.5309321638518242, + "grad_norm": 0.9589555647723443, + "learning_rate": 0.000297410854832553, + "loss": 7.063, + "step": 5690 + }, + { + "epoch": 0.5310254735467015, + "grad_norm": 0.629399942978361, + "learning_rate": 0.00029740945536557565, + "loss": 7.1003, + "step": 5691 + }, + { + "epoch": 0.5311187832415788, + "grad_norm": 1.1336327682635554, + "learning_rate": 0.0002974080555237797, + "loss": 7.1301, + "step": 5692 + }, + { + "epoch": 0.5312120929364561, + "grad_norm": 1.3377826420854988, + "learning_rate": 0.0002974066553071687, + "loss": 6.8069, + "step": 5693 + }, + { + "epoch": 0.5313054026313334, + "grad_norm": 19.52092048401554, + "learning_rate": 0.0002974052547157463, + "loss": 7.053, + "step": 5694 + }, + { + "epoch": 0.5313987123262107, + "grad_norm": 0.9144889623064456, + "learning_rate": 0.000297403853749516, + "loss": 6.9715, + "step": 5695 + }, + { + "epoch": 0.531492022021088, + "grad_norm": 1.4691511711077039, + "learning_rate": 0.0002974024524084813, + "loss": 7.5144, + "step": 5696 + }, + { + "epoch": 0.5315853317159653, + "grad_norm": 0.8433064796954075, + "learning_rate": 0.00029740105069264586, + "loss": 6.9639, + "step": 5697 + }, + { + "epoch": 0.5316786414108425, + "grad_norm": 1.1456696393249484, + "learning_rate": 0.0002973996486020132, + "loss": 7.1895, + "step": 5698 + }, + { + "epoch": 0.5317719511057198, + "grad_norm": 0.7130043056635007, + "learning_rate": 0.0002973982461365869, + "loss": 7.221, + "step": 5699 + }, + { + "epoch": 0.5318652608005972, + "grad_norm": 18.670768177566824, + "learning_rate": 0.00029739684329637057, + "loss": 6.9929, + "step": 5700 + }, + { + "epoch": 0.5319585704954745, + "grad_norm": 45.18846121973099, + "learning_rate": 0.00029739544008136766, + "loss": 7.1837, + "step": 5701 + }, + { + "epoch": 0.5320518801903518, + "grad_norm": 1.1014186840822524, + "learning_rate": 0.00029739403649158187, + "loss": 7.3174, + "step": 5702 + }, + { + "epoch": 0.5321451898852291, + "grad_norm": 0.9412444610973492, + "learning_rate": 0.00029739263252701666, + "loss": 7.0413, + "step": 5703 + }, + { + "epoch": 0.5322384995801064, + "grad_norm": 58.4276999995343, + "learning_rate": 0.00029739122818767566, + "loss": 6.8904, + "step": 5704 + }, + { + "epoch": 0.5323318092749837, + "grad_norm": 1.4593772543317458, + "learning_rate": 0.0002973898234735624, + "loss": 7.1455, + "step": 5705 + }, + { + "epoch": 0.532425118969861, + "grad_norm": 1.0513392204353218, + "learning_rate": 0.00029738841838468057, + "loss": 6.9615, + "step": 5706 + }, + { + "epoch": 0.5325184286647383, + "grad_norm": 1.0555211733031191, + "learning_rate": 0.0002973870129210336, + "loss": 6.8303, + "step": 5707 + }, + { + "epoch": 0.5326117383596156, + "grad_norm": 0.8692924563067432, + "learning_rate": 0.00029738560708262506, + "loss": 7.2051, + "step": 5708 + }, + { + "epoch": 0.5327050480544928, + "grad_norm": 1.1335662950090009, + "learning_rate": 0.00029738420086945864, + "loss": 7.0407, + "step": 5709 + }, + { + "epoch": 0.5327983577493701, + "grad_norm": 0.9896950613486795, + "learning_rate": 0.0002973827942815378, + "loss": 6.9903, + "step": 5710 + }, + { + "epoch": 0.5328916674442474, + "grad_norm": 149.53413388076032, + "learning_rate": 0.00029738138731886626, + "loss": 7.339, + "step": 5711 + }, + { + "epoch": 0.5329849771391247, + "grad_norm": 1.6214634017267253, + "learning_rate": 0.00029737997998144746, + "loss": 7.6054, + "step": 5712 + }, + { + "epoch": 0.533078286834002, + "grad_norm": 0.9316436919215888, + "learning_rate": 0.000297378572269285, + "loss": 7.0598, + "step": 5713 + }, + { + "epoch": 0.5331715965288794, + "grad_norm": 16.976508478752496, + "learning_rate": 0.0002973771641823825, + "loss": 7.351, + "step": 5714 + }, + { + "epoch": 0.5332649062237567, + "grad_norm": 4.271246152268633, + "learning_rate": 0.0002973757557207436, + "loss": 7.1009, + "step": 5715 + }, + { + "epoch": 0.533358215918634, + "grad_norm": 0.9105121089956145, + "learning_rate": 0.0002973743468843717, + "loss": 6.9111, + "step": 5716 + }, + { + "epoch": 0.5334515256135113, + "grad_norm": 146.79263033174823, + "learning_rate": 0.0002973729376732706, + "loss": 6.8945, + "step": 5717 + }, + { + "epoch": 0.5335448353083886, + "grad_norm": 1.8033119143000478, + "learning_rate": 0.0002973715280874437, + "loss": 7.2167, + "step": 5718 + }, + { + "epoch": 0.5336381450032658, + "grad_norm": 248.53193618589702, + "learning_rate": 0.00029737011812689464, + "loss": 6.669, + "step": 5719 + }, + { + "epoch": 0.5337314546981431, + "grad_norm": 2.425915229026633, + "learning_rate": 0.00029736870779162707, + "loss": 7.2859, + "step": 5720 + }, + { + "epoch": 0.5338247643930204, + "grad_norm": 2.3731800388481807, + "learning_rate": 0.00029736729708164454, + "loss": 6.8212, + "step": 5721 + }, + { + "epoch": 0.5339180740878977, + "grad_norm": 2.7256445250070365, + "learning_rate": 0.0002973658859969506, + "loss": 7.6242, + "step": 5722 + }, + { + "epoch": 0.534011383782775, + "grad_norm": 2.003673474249611, + "learning_rate": 0.0002973644745375489, + "loss": 7.8665, + "step": 5723 + }, + { + "epoch": 0.5341046934776523, + "grad_norm": 10.663996408962966, + "learning_rate": 0.00029736306270344295, + "loss": 7.5972, + "step": 5724 + }, + { + "epoch": 0.5341980031725296, + "grad_norm": 34.211192211695696, + "learning_rate": 0.0002973616504946364, + "loss": 7.1385, + "step": 5725 + }, + { + "epoch": 0.5342913128674069, + "grad_norm": 38.42230572404231, + "learning_rate": 0.00029736023791113284, + "loss": 7.3632, + "step": 5726 + }, + { + "epoch": 0.5343846225622843, + "grad_norm": 6.41670684603685, + "learning_rate": 0.00029735882495293583, + "loss": 7.3712, + "step": 5727 + }, + { + "epoch": 0.5344779322571616, + "grad_norm": 4.61436838678969, + "learning_rate": 0.00029735741162004896, + "loss": 7.8136, + "step": 5728 + }, + { + "epoch": 0.5345712419520389, + "grad_norm": 181.01955893579455, + "learning_rate": 0.00029735599791247587, + "loss": 7.9549, + "step": 5729 + }, + { + "epoch": 0.5346645516469161, + "grad_norm": 2.1362136640407154, + "learning_rate": 0.0002973545838302201, + "loss": 7.6942, + "step": 5730 + }, + { + "epoch": 0.5347578613417934, + "grad_norm": 2.1427484537906913, + "learning_rate": 0.00029735316937328535, + "loss": 8.0762, + "step": 5731 + }, + { + "epoch": 0.5348511710366707, + "grad_norm": 2.2302279567593626, + "learning_rate": 0.0002973517545416751, + "loss": 7.8546, + "step": 5732 + }, + { + "epoch": 0.534944480731548, + "grad_norm": 1.5755984247883, + "learning_rate": 0.00029735033933539295, + "loss": 7.6432, + "step": 5733 + }, + { + "epoch": 0.5350377904264253, + "grad_norm": 1.395759853812917, + "learning_rate": 0.0002973489237544426, + "loss": 7.6408, + "step": 5734 + }, + { + "epoch": 0.5351311001213026, + "grad_norm": 2.3302352183689177, + "learning_rate": 0.0002973475077988275, + "loss": 7.6263, + "step": 5735 + }, + { + "epoch": 0.5352244098161799, + "grad_norm": 1.9727834171972325, + "learning_rate": 0.00029734609146855136, + "loss": 8.3775, + "step": 5736 + }, + { + "epoch": 0.5353177195110572, + "grad_norm": 1.5018810070560276, + "learning_rate": 0.0002973446747636178, + "loss": 7.6897, + "step": 5737 + }, + { + "epoch": 0.5354110292059345, + "grad_norm": 27.763188183564914, + "learning_rate": 0.00029734325768403035, + "loss": 7.3861, + "step": 5738 + }, + { + "epoch": 0.5355043389008118, + "grad_norm": 1.6525050031785793, + "learning_rate": 0.0002973418402297926, + "loss": 7.6949, + "step": 5739 + }, + { + "epoch": 0.5355976485956891, + "grad_norm": 1.70002013481258, + "learning_rate": 0.00029734042240090825, + "loss": 7.3435, + "step": 5740 + }, + { + "epoch": 0.5356909582905663, + "grad_norm": 13.473259591217385, + "learning_rate": 0.00029733900419738087, + "loss": 7.361, + "step": 5741 + }, + { + "epoch": 0.5357842679854437, + "grad_norm": 1.5517863728592514, + "learning_rate": 0.00029733758561921404, + "loss": 7.1467, + "step": 5742 + }, + { + "epoch": 0.535877577680321, + "grad_norm": 1.80793686222509, + "learning_rate": 0.0002973361666664113, + "loss": 7.1183, + "step": 5743 + }, + { + "epoch": 0.5359708873751983, + "grad_norm": 1.1339415235063885, + "learning_rate": 0.0002973347473389764, + "loss": 7.1969, + "step": 5744 + }, + { + "epoch": 0.5360641970700756, + "grad_norm": 1.4964983799913372, + "learning_rate": 0.0002973333276369129, + "loss": 7.5788, + "step": 5745 + }, + { + "epoch": 0.5361575067649529, + "grad_norm": 1.472875952858933, + "learning_rate": 0.00029733190756022436, + "loss": 7.5703, + "step": 5746 + }, + { + "epoch": 0.5362508164598302, + "grad_norm": 1.537166462000583, + "learning_rate": 0.0002973304871089144, + "loss": 7.415, + "step": 5747 + }, + { + "epoch": 0.5363441261547075, + "grad_norm": 1.7974124388390678, + "learning_rate": 0.0002973290662829867, + "loss": 7.3981, + "step": 5748 + }, + { + "epoch": 0.5364374358495848, + "grad_norm": 0.9301404651410403, + "learning_rate": 0.00029732764508244477, + "loss": 7.2689, + "step": 5749 + }, + { + "epoch": 0.5365307455444621, + "grad_norm": 1.5407942634420781, + "learning_rate": 0.0002973262235072923, + "loss": 7.2996, + "step": 5750 + }, + { + "epoch": 0.5366240552393393, + "grad_norm": 1.0209792307347885, + "learning_rate": 0.00029732480155753293, + "loss": 7.4006, + "step": 5751 + }, + { + "epoch": 0.5367173649342166, + "grad_norm": 1.2635620819741322, + "learning_rate": 0.0002973233792331702, + "loss": 7.4745, + "step": 5752 + }, + { + "epoch": 0.5368106746290939, + "grad_norm": 1.00287116412599, + "learning_rate": 0.0002973219565342077, + "loss": 7.2723, + "step": 5753 + }, + { + "epoch": 0.5369039843239712, + "grad_norm": 1.098897964736465, + "learning_rate": 0.0002973205334606492, + "loss": 7.4616, + "step": 5754 + }, + { + "epoch": 0.5369972940188485, + "grad_norm": 1.313863613334851, + "learning_rate": 0.0002973191100124982, + "loss": 7.4494, + "step": 5755 + }, + { + "epoch": 0.5370906037137259, + "grad_norm": 139.7262558280033, + "learning_rate": 0.0002973176861897583, + "loss": 7.0498, + "step": 5756 + }, + { + "epoch": 0.5371839134086032, + "grad_norm": 289.3711664157415, + "learning_rate": 0.00029731626199243317, + "loss": 7.2583, + "step": 5757 + }, + { + "epoch": 0.5372772231034805, + "grad_norm": 1.9168575569599589, + "learning_rate": 0.00029731483742052644, + "loss": 7.3006, + "step": 5758 + }, + { + "epoch": 0.5373705327983578, + "grad_norm": 3.6180910995338715, + "learning_rate": 0.00029731341247404174, + "loss": 7.1327, + "step": 5759 + }, + { + "epoch": 0.5374638424932351, + "grad_norm": 1.4130346864376282, + "learning_rate": 0.00029731198715298265, + "loss": 6.8575, + "step": 5760 + }, + { + "epoch": 0.5375571521881124, + "grad_norm": 1.5842155774850404, + "learning_rate": 0.0002973105614573528, + "loss": 7.3911, + "step": 5761 + }, + { + "epoch": 0.5376504618829896, + "grad_norm": 1.6054533506219435, + "learning_rate": 0.0002973091353871558, + "loss": 7.4868, + "step": 5762 + }, + { + "epoch": 0.5377437715778669, + "grad_norm": 2.2516479206153326, + "learning_rate": 0.00029730770894239537, + "loss": 7.0777, + "step": 5763 + }, + { + "epoch": 0.5378370812727442, + "grad_norm": 2.1061970016451896, + "learning_rate": 0.00029730628212307507, + "loss": 7.5388, + "step": 5764 + }, + { + "epoch": 0.5379303909676215, + "grad_norm": 1.8650772864584695, + "learning_rate": 0.0002973048549291985, + "loss": 6.9989, + "step": 5765 + }, + { + "epoch": 0.5380237006624988, + "grad_norm": 18.585795828821148, + "learning_rate": 0.00029730342736076934, + "loss": 7.3965, + "step": 5766 + }, + { + "epoch": 0.5381170103573761, + "grad_norm": 1.9445319675879977, + "learning_rate": 0.0002973019994177911, + "loss": 7.3861, + "step": 5767 + }, + { + "epoch": 0.5382103200522534, + "grad_norm": 10.902032120528471, + "learning_rate": 0.0002973005711002676, + "loss": 7.2247, + "step": 5768 + }, + { + "epoch": 0.5383036297471308, + "grad_norm": 1.869947911657366, + "learning_rate": 0.0002972991424082024, + "loss": 7.3995, + "step": 5769 + }, + { + "epoch": 0.5383969394420081, + "grad_norm": 1.4218970147791516, + "learning_rate": 0.0002972977133415991, + "loss": 7.1471, + "step": 5770 + }, + { + "epoch": 0.5384902491368854, + "grad_norm": 1.5291007086463047, + "learning_rate": 0.00029729628390046136, + "loss": 7.3521, + "step": 5771 + }, + { + "epoch": 0.5385835588317626, + "grad_norm": 1.9172077416210043, + "learning_rate": 0.0002972948540847928, + "loss": 7.2549, + "step": 5772 + }, + { + "epoch": 0.5386768685266399, + "grad_norm": 2.0362533987638822, + "learning_rate": 0.00029729342389459696, + "loss": 7.2823, + "step": 5773 + }, + { + "epoch": 0.5387701782215172, + "grad_norm": 3.307024798190899, + "learning_rate": 0.0002972919933298777, + "loss": 6.8806, + "step": 5774 + }, + { + "epoch": 0.5388634879163945, + "grad_norm": 1.9838748908844779, + "learning_rate": 0.0002972905623906385, + "loss": 7.1062, + "step": 5775 + }, + { + "epoch": 0.5389567976112718, + "grad_norm": 2.179656888109616, + "learning_rate": 0.000297289131076883, + "loss": 7.0724, + "step": 5776 + }, + { + "epoch": 0.5390501073061491, + "grad_norm": 1.351941199934948, + "learning_rate": 0.00029728769938861487, + "loss": 7.136, + "step": 5777 + }, + { + "epoch": 0.5391434170010264, + "grad_norm": 1.5149551168754392, + "learning_rate": 0.0002972862673258378, + "loss": 7.1843, + "step": 5778 + }, + { + "epoch": 0.5392367266959037, + "grad_norm": 1.493176882798339, + "learning_rate": 0.0002972848348885553, + "loss": 6.9742, + "step": 5779 + }, + { + "epoch": 0.539330036390781, + "grad_norm": 2.294662296429572, + "learning_rate": 0.00029728340207677115, + "loss": 7.391, + "step": 5780 + }, + { + "epoch": 0.5394233460856583, + "grad_norm": 0.7596378079860604, + "learning_rate": 0.00029728196889048893, + "loss": 7.0968, + "step": 5781 + }, + { + "epoch": 0.5395166557805356, + "grad_norm": 2.0559126617972074, + "learning_rate": 0.0002972805353297123, + "loss": 6.887, + "step": 5782 + }, + { + "epoch": 0.5396099654754128, + "grad_norm": 85.09587831247502, + "learning_rate": 0.0002972791013944449, + "loss": 7.2574, + "step": 5783 + }, + { + "epoch": 0.5397032751702902, + "grad_norm": 1.425003279073121, + "learning_rate": 0.00029727766708469036, + "loss": 7.0648, + "step": 5784 + }, + { + "epoch": 0.5397965848651675, + "grad_norm": 233.53616529139774, + "learning_rate": 0.0002972762324004523, + "loss": 7.2266, + "step": 5785 + }, + { + "epoch": 0.5398898945600448, + "grad_norm": 3.3543160594627057, + "learning_rate": 0.00029727479734173445, + "loss": 7.5946, + "step": 5786 + }, + { + "epoch": 0.5399832042549221, + "grad_norm": 1.0270895635552082, + "learning_rate": 0.0002972733619085404, + "loss": 7.3837, + "step": 5787 + }, + { + "epoch": 0.5400765139497994, + "grad_norm": 17.550218926999694, + "learning_rate": 0.0002972719261008738, + "loss": 7.4372, + "step": 5788 + }, + { + "epoch": 0.5401698236446767, + "grad_norm": 2.4968065998167495, + "learning_rate": 0.00029727048991873837, + "loss": 7.2664, + "step": 5789 + }, + { + "epoch": 0.540263133339554, + "grad_norm": 2.14582625198886, + "learning_rate": 0.0002972690533621377, + "loss": 7.7712, + "step": 5790 + }, + { + "epoch": 0.5403564430344313, + "grad_norm": 1.8504362808600212, + "learning_rate": 0.00029726761643107544, + "loss": 7.1641, + "step": 5791 + }, + { + "epoch": 0.5404497527293086, + "grad_norm": 4.225058459435568, + "learning_rate": 0.00029726617912555523, + "loss": 7.2001, + "step": 5792 + }, + { + "epoch": 0.5405430624241859, + "grad_norm": 8.314085946833675, + "learning_rate": 0.00029726474144558076, + "loss": 7.5214, + "step": 5793 + }, + { + "epoch": 0.5406363721190631, + "grad_norm": 3.0099056673299285, + "learning_rate": 0.0002972633033911557, + "loss": 7.2034, + "step": 5794 + }, + { + "epoch": 0.5407296818139404, + "grad_norm": 364.14703325879657, + "learning_rate": 0.00029726186496228365, + "loss": 6.9981, + "step": 5795 + }, + { + "epoch": 0.5408229915088177, + "grad_norm": 1.8524832486192386, + "learning_rate": 0.00029726042615896834, + "loss": 7.7633, + "step": 5796 + }, + { + "epoch": 0.540916301203695, + "grad_norm": 1.961710525054444, + "learning_rate": 0.00029725898698121334, + "loss": 7.1231, + "step": 5797 + }, + { + "epoch": 0.5410096108985724, + "grad_norm": 1.393059259298727, + "learning_rate": 0.00029725754742902235, + "loss": 7.55, + "step": 5798 + }, + { + "epoch": 0.5411029205934497, + "grad_norm": 2.0360970556631646, + "learning_rate": 0.0002972561075023991, + "loss": 7.1881, + "step": 5799 + }, + { + "epoch": 0.541196230288327, + "grad_norm": 3.03059232506594, + "learning_rate": 0.0002972546672013471, + "loss": 7.7962, + "step": 5800 + }, + { + "epoch": 0.5412895399832043, + "grad_norm": 3.2528932729600184, + "learning_rate": 0.0002972532265258702, + "loss": 7.7696, + "step": 5801 + }, + { + "epoch": 0.5413828496780816, + "grad_norm": 2.834542756539481, + "learning_rate": 0.000297251785475972, + "loss": 7.5093, + "step": 5802 + }, + { + "epoch": 0.5414761593729589, + "grad_norm": 2.6099861150430765, + "learning_rate": 0.00029725034405165597, + "loss": 7.391, + "step": 5803 + }, + { + "epoch": 0.5415694690678361, + "grad_norm": 5.733938941201403, + "learning_rate": 0.00029724890225292603, + "loss": 7.6281, + "step": 5804 + }, + { + "epoch": 0.5416627787627134, + "grad_norm": 57.425833066559065, + "learning_rate": 0.00029724746007978573, + "loss": 7.1016, + "step": 5805 + }, + { + "epoch": 0.5417560884575907, + "grad_norm": 1.886335086903091, + "learning_rate": 0.00029724601753223877, + "loss": 7.1651, + "step": 5806 + }, + { + "epoch": 0.541849398152468, + "grad_norm": 1.836878413709324, + "learning_rate": 0.00029724457461028877, + "loss": 7.2366, + "step": 5807 + }, + { + "epoch": 0.5419427078473453, + "grad_norm": 2.367647393491478, + "learning_rate": 0.00029724313131393947, + "loss": 7.3676, + "step": 5808 + }, + { + "epoch": 0.5420360175422226, + "grad_norm": 1.8605334714272292, + "learning_rate": 0.0002972416876431945, + "loss": 7.4869, + "step": 5809 + }, + { + "epoch": 0.5421293272371, + "grad_norm": 1.5638762184560315, + "learning_rate": 0.0002972402435980575, + "loss": 7.1995, + "step": 5810 + }, + { + "epoch": 0.5422226369319773, + "grad_norm": 3.8506951251750365, + "learning_rate": 0.0002972387991785323, + "loss": 7.3357, + "step": 5811 + }, + { + "epoch": 0.5423159466268546, + "grad_norm": 1.541476341770195, + "learning_rate": 0.0002972373543846223, + "loss": 7.453, + "step": 5812 + }, + { + "epoch": 0.5424092563217319, + "grad_norm": 1.3185985625495251, + "learning_rate": 0.0002972359092163314, + "loss": 7.6557, + "step": 5813 + }, + { + "epoch": 0.5425025660166092, + "grad_norm": 1.3319204642332685, + "learning_rate": 0.00029723446367366317, + "loss": 7.566, + "step": 5814 + }, + { + "epoch": 0.5425958757114864, + "grad_norm": 141.29711108542637, + "learning_rate": 0.00029723301775662136, + "loss": 7.3159, + "step": 5815 + }, + { + "epoch": 0.5426891854063637, + "grad_norm": 1.4718285251493268, + "learning_rate": 0.00029723157146520953, + "loss": 7.2957, + "step": 5816 + }, + { + "epoch": 0.542782495101241, + "grad_norm": 1202.5672811386603, + "learning_rate": 0.0002972301247994315, + "loss": 7.8528, + "step": 5817 + }, + { + "epoch": 0.5428758047961183, + "grad_norm": 16.826552810736946, + "learning_rate": 0.00029722867775929077, + "loss": 7.5624, + "step": 5818 + }, + { + "epoch": 0.5429691144909956, + "grad_norm": 387.2342031812883, + "learning_rate": 0.0002972272303447912, + "loss": 7.3332, + "step": 5819 + }, + { + "epoch": 0.5430624241858729, + "grad_norm": 2.685433873028737, + "learning_rate": 0.00029722578255593637, + "loss": 7.3288, + "step": 5820 + }, + { + "epoch": 0.5431557338807502, + "grad_norm": 3.707150535981251, + "learning_rate": 0.00029722433439273, + "loss": 7.3013, + "step": 5821 + }, + { + "epoch": 0.5432490435756275, + "grad_norm": 3.635312302485934, + "learning_rate": 0.00029722288585517576, + "loss": 7.252, + "step": 5822 + }, + { + "epoch": 0.5433423532705048, + "grad_norm": 3.1279443677357315, + "learning_rate": 0.0002972214369432773, + "loss": 7.6075, + "step": 5823 + }, + { + "epoch": 0.5434356629653821, + "grad_norm": 3.609480035491041, + "learning_rate": 0.0002972199876570384, + "loss": 7.6564, + "step": 5824 + }, + { + "epoch": 0.5435289726602593, + "grad_norm": 3.055715304584755, + "learning_rate": 0.00029721853799646267, + "loss": 7.6034, + "step": 5825 + }, + { + "epoch": 0.5436222823551367, + "grad_norm": 16.692238603386336, + "learning_rate": 0.00029721708796155374, + "loss": 7.9084, + "step": 5826 + }, + { + "epoch": 0.543715592050014, + "grad_norm": 83.25801567251868, + "learning_rate": 0.00029721563755231545, + "loss": 6.997, + "step": 5827 + }, + { + "epoch": 0.5438089017448913, + "grad_norm": 3.132766209205991, + "learning_rate": 0.0002972141867687513, + "loss": 7.3989, + "step": 5828 + }, + { + "epoch": 0.5439022114397686, + "grad_norm": 3.375725575199407, + "learning_rate": 0.0002972127356108652, + "loss": 7.7126, + "step": 5829 + }, + { + "epoch": 0.5439955211346459, + "grad_norm": 1696.6779727953055, + "learning_rate": 0.00029721128407866063, + "loss": 7.6764, + "step": 5830 + }, + { + "epoch": 0.5440888308295232, + "grad_norm": 2.6468846447307866, + "learning_rate": 0.0002972098321721414, + "loss": 7.5096, + "step": 5831 + }, + { + "epoch": 0.5441821405244005, + "grad_norm": 1.9736065722024052, + "learning_rate": 0.0002972083798913112, + "loss": 7.4002, + "step": 5832 + }, + { + "epoch": 0.5442754502192778, + "grad_norm": 2.3442670386161177, + "learning_rate": 0.00029720692723617364, + "loss": 7.5291, + "step": 5833 + }, + { + "epoch": 0.5443687599141551, + "grad_norm": 2.968653360464321, + "learning_rate": 0.0002972054742067325, + "loss": 7.8047, + "step": 5834 + }, + { + "epoch": 0.5444620696090324, + "grad_norm": 2.5802248036313338, + "learning_rate": 0.00029720402080299146, + "loss": 7.2564, + "step": 5835 + }, + { + "epoch": 0.5445553793039096, + "grad_norm": 1927.1288561569886, + "learning_rate": 0.00029720256702495416, + "loss": 7.3415, + "step": 5836 + }, + { + "epoch": 0.5446486889987869, + "grad_norm": 2.1548109904876873, + "learning_rate": 0.0002972011128726244, + "loss": 7.4702, + "step": 5837 + }, + { + "epoch": 0.5447419986936642, + "grad_norm": 2.0136510737683073, + "learning_rate": 0.00029719965834600575, + "loss": 7.8748, + "step": 5838 + }, + { + "epoch": 0.5448353083885416, + "grad_norm": 1.986425671896492, + "learning_rate": 0.00029719820344510206, + "loss": 8.5421, + "step": 5839 + }, + { + "epoch": 0.5449286180834189, + "grad_norm": 2.25454115071543, + "learning_rate": 0.0002971967481699168, + "loss": 8.7429, + "step": 5840 + }, + { + "epoch": 0.5450219277782962, + "grad_norm": 6.335985234564621, + "learning_rate": 0.0002971952925204539, + "loss": 8.0812, + "step": 5841 + }, + { + "epoch": 0.5451152374731735, + "grad_norm": 1.9026965542402443, + "learning_rate": 0.000297193836496717, + "loss": 8.5616, + "step": 5842 + }, + { + "epoch": 0.5452085471680508, + "grad_norm": 2.0426521223296628, + "learning_rate": 0.0002971923800987097, + "loss": 8.2599, + "step": 5843 + }, + { + "epoch": 0.5453018568629281, + "grad_norm": 2.591350629941379, + "learning_rate": 0.00029719092332643584, + "loss": 7.8501, + "step": 5844 + }, + { + "epoch": 0.5453951665578054, + "grad_norm": 2.2146465426501427, + "learning_rate": 0.0002971894661798991, + "loss": 7.9141, + "step": 5845 + }, + { + "epoch": 0.5454884762526827, + "grad_norm": 1.291540933696487, + "learning_rate": 0.00029718800865910304, + "loss": 7.8553, + "step": 5846 + }, + { + "epoch": 0.5455817859475599, + "grad_norm": 2.112708836986732, + "learning_rate": 0.00029718655076405156, + "loss": 7.9952, + "step": 5847 + }, + { + "epoch": 0.5456750956424372, + "grad_norm": 2.7093645824157595, + "learning_rate": 0.00029718509249474824, + "loss": 7.8039, + "step": 5848 + }, + { + "epoch": 0.5457684053373145, + "grad_norm": 1.411529561592856, + "learning_rate": 0.0002971836338511968, + "loss": 7.8808, + "step": 5849 + }, + { + "epoch": 0.5458617150321918, + "grad_norm": 2.1395594783128, + "learning_rate": 0.00029718217483340107, + "loss": 7.7128, + "step": 5850 + }, + { + "epoch": 0.5459550247270691, + "grad_norm": 1.4573141540203822, + "learning_rate": 0.0002971807154413646, + "loss": 7.517, + "step": 5851 + }, + { + "epoch": 0.5460483344219464, + "grad_norm": 2.0012544722643675, + "learning_rate": 0.0002971792556750912, + "loss": 8.0186, + "step": 5852 + }, + { + "epoch": 0.5461416441168238, + "grad_norm": 1.6474313127752351, + "learning_rate": 0.00029717779553458453, + "loss": 7.7585, + "step": 5853 + }, + { + "epoch": 0.5462349538117011, + "grad_norm": 3.543128919731307, + "learning_rate": 0.0002971763350198483, + "loss": 7.487, + "step": 5854 + }, + { + "epoch": 0.5463282635065784, + "grad_norm": 2.3657226589622105, + "learning_rate": 0.0002971748741308863, + "loss": 7.4746, + "step": 5855 + }, + { + "epoch": 0.5464215732014557, + "grad_norm": 1.8804992067601882, + "learning_rate": 0.0002971734128677022, + "loss": 7.3965, + "step": 5856 + }, + { + "epoch": 0.5465148828963329, + "grad_norm": 3.209108872613453, + "learning_rate": 0.0002971719512302996, + "loss": 7.6972, + "step": 5857 + }, + { + "epoch": 0.5466081925912102, + "grad_norm": 2.383698909074647, + "learning_rate": 0.00029717048921868247, + "loss": 7.5047, + "step": 5858 + }, + { + "epoch": 0.5467015022860875, + "grad_norm": 1.5578728277316176, + "learning_rate": 0.00029716902683285426, + "loss": 7.0854, + "step": 5859 + }, + { + "epoch": 0.5467948119809648, + "grad_norm": 2.3575469779799154, + "learning_rate": 0.00029716756407281887, + "loss": 7.2475, + "step": 5860 + }, + { + "epoch": 0.5468881216758421, + "grad_norm": 2.5601743774661223, + "learning_rate": 0.00029716610093857994, + "loss": 7.3789, + "step": 5861 + }, + { + "epoch": 0.5469814313707194, + "grad_norm": 1.697092542601849, + "learning_rate": 0.0002971646374301412, + "loss": 7.7276, + "step": 5862 + }, + { + "epoch": 0.5470747410655967, + "grad_norm": 1.3844226845842924, + "learning_rate": 0.0002971631735475064, + "loss": 7.3512, + "step": 5863 + }, + { + "epoch": 0.547168050760474, + "grad_norm": 2.605070141565732, + "learning_rate": 0.00029716170929067925, + "loss": 7.5077, + "step": 5864 + }, + { + "epoch": 0.5472613604553513, + "grad_norm": 1.302156400010503, + "learning_rate": 0.0002971602446596635, + "loss": 7.6523, + "step": 5865 + }, + { + "epoch": 0.5473546701502287, + "grad_norm": 1.935077493047659, + "learning_rate": 0.00029715877965446274, + "loss": 7.3835, + "step": 5866 + }, + { + "epoch": 0.547447979845106, + "grad_norm": 2.1196352124819438, + "learning_rate": 0.0002971573142750809, + "loss": 7.4878, + "step": 5867 + }, + { + "epoch": 0.5475412895399832, + "grad_norm": 1.7604653727214354, + "learning_rate": 0.0002971558485215215, + "loss": 7.656, + "step": 5868 + }, + { + "epoch": 0.5476345992348605, + "grad_norm": 1.9034502956707589, + "learning_rate": 0.0002971543823937884, + "loss": 7.6332, + "step": 5869 + }, + { + "epoch": 0.5477279089297378, + "grad_norm": 2.888026097245784, + "learning_rate": 0.00029715291589188535, + "loss": 7.6233, + "step": 5870 + }, + { + "epoch": 0.5478212186246151, + "grad_norm": 1.768638109907338, + "learning_rate": 0.000297151449015816, + "loss": 7.4784, + "step": 5871 + }, + { + "epoch": 0.5479145283194924, + "grad_norm": 1.2397922101956094, + "learning_rate": 0.00029714998176558406, + "loss": 7.4045, + "step": 5872 + }, + { + "epoch": 0.5480078380143697, + "grad_norm": 1.0269142598422216, + "learning_rate": 0.0002971485141411934, + "loss": 7.2136, + "step": 5873 + }, + { + "epoch": 0.548101147709247, + "grad_norm": 1.2139389472974487, + "learning_rate": 0.0002971470461426476, + "loss": 7.1915, + "step": 5874 + }, + { + "epoch": 0.5481944574041243, + "grad_norm": 1.7918898715724272, + "learning_rate": 0.0002971455777699504, + "loss": 7.4113, + "step": 5875 + }, + { + "epoch": 0.5482877670990016, + "grad_norm": 1.295626421150132, + "learning_rate": 0.00029714410902310565, + "loss": 7.221, + "step": 5876 + }, + { + "epoch": 0.5483810767938789, + "grad_norm": 1.0943788170631294, + "learning_rate": 0.000297142639902117, + "loss": 6.997, + "step": 5877 + }, + { + "epoch": 0.5484743864887561, + "grad_norm": 1.314241452844411, + "learning_rate": 0.00029714117040698817, + "loss": 7.3557, + "step": 5878 + }, + { + "epoch": 0.5485676961836334, + "grad_norm": 3.5316805713261643, + "learning_rate": 0.00029713970053772297, + "loss": 7.6499, + "step": 5879 + }, + { + "epoch": 0.5486610058785107, + "grad_norm": 1.2420495058688439, + "learning_rate": 0.0002971382302943251, + "loss": 7.607, + "step": 5880 + }, + { + "epoch": 0.548754315573388, + "grad_norm": 2.5209053742124183, + "learning_rate": 0.0002971367596767983, + "loss": 7.9474, + "step": 5881 + }, + { + "epoch": 0.5488476252682654, + "grad_norm": 2.660160238772233, + "learning_rate": 0.00029713528868514623, + "loss": 7.2172, + "step": 5882 + }, + { + "epoch": 0.5489409349631427, + "grad_norm": 1.787655267084048, + "learning_rate": 0.0002971338173193728, + "loss": 7.5719, + "step": 5883 + }, + { + "epoch": 0.54903424465802, + "grad_norm": 1.3155630532584575, + "learning_rate": 0.0002971323455794816, + "loss": 7.4043, + "step": 5884 + }, + { + "epoch": 0.5491275543528973, + "grad_norm": 1.9121366210765662, + "learning_rate": 0.0002971308734654765, + "loss": 7.44, + "step": 5885 + }, + { + "epoch": 0.5492208640477746, + "grad_norm": 1.8272621655841186, + "learning_rate": 0.00029712940097736105, + "loss": 7.3293, + "step": 5886 + }, + { + "epoch": 0.5493141737426519, + "grad_norm": 1.037473530057755, + "learning_rate": 0.0002971279281151392, + "loss": 7.4148, + "step": 5887 + }, + { + "epoch": 0.5494074834375292, + "grad_norm": 1.9054128823328347, + "learning_rate": 0.0002971264548788146, + "loss": 7.29, + "step": 5888 + }, + { + "epoch": 0.5495007931324064, + "grad_norm": 1.4832217052831786, + "learning_rate": 0.00029712498126839104, + "loss": 7.5704, + "step": 5889 + }, + { + "epoch": 0.5495941028272837, + "grad_norm": 1.3722280486813239, + "learning_rate": 0.00029712350728387224, + "loss": 7.1937, + "step": 5890 + }, + { + "epoch": 0.549687412522161, + "grad_norm": 1.470385442572477, + "learning_rate": 0.0002971220329252619, + "loss": 7.4153, + "step": 5891 + }, + { + "epoch": 0.5497807222170383, + "grad_norm": 1.0532257928505122, + "learning_rate": 0.0002971205581925638, + "loss": 7.2528, + "step": 5892 + }, + { + "epoch": 0.5498740319119156, + "grad_norm": 10.440768486886563, + "learning_rate": 0.0002971190830857817, + "loss": 7.5818, + "step": 5893 + }, + { + "epoch": 0.549967341606793, + "grad_norm": 1.2419110627145344, + "learning_rate": 0.0002971176076049194, + "loss": 7.6022, + "step": 5894 + }, + { + "epoch": 0.5500606513016703, + "grad_norm": 2.621956378969274, + "learning_rate": 0.00029711613174998055, + "loss": 7.2146, + "step": 5895 + }, + { + "epoch": 0.5501539609965476, + "grad_norm": 1.686297163537079, + "learning_rate": 0.00029711465552096896, + "loss": 7.3051, + "step": 5896 + }, + { + "epoch": 0.5502472706914249, + "grad_norm": 1.5721116741917196, + "learning_rate": 0.0002971131789178884, + "loss": 7.4318, + "step": 5897 + }, + { + "epoch": 0.5503405803863022, + "grad_norm": 1.3480038830016907, + "learning_rate": 0.0002971117019407426, + "loss": 7.1145, + "step": 5898 + }, + { + "epoch": 0.5504338900811795, + "grad_norm": 2.2913443747724678, + "learning_rate": 0.00029711022458953534, + "loss": 7.1208, + "step": 5899 + }, + { + "epoch": 0.5505271997760567, + "grad_norm": 1.9368846076505468, + "learning_rate": 0.00029710874686427033, + "loss": 7.3112, + "step": 5900 + }, + { + "epoch": 0.550620509470934, + "grad_norm": 2.6247840124783814, + "learning_rate": 0.0002971072687649514, + "loss": 7.1525, + "step": 5901 + }, + { + "epoch": 0.5507138191658113, + "grad_norm": 1.4198261258413705, + "learning_rate": 0.00029710579029158216, + "loss": 7.3157, + "step": 5902 + }, + { + "epoch": 0.5508071288606886, + "grad_norm": 1.9173506665613638, + "learning_rate": 0.00029710431144416654, + "loss": 7.27, + "step": 5903 + }, + { + "epoch": 0.5509004385555659, + "grad_norm": 1.1628527291623667, + "learning_rate": 0.00029710283222270827, + "loss": 7.3265, + "step": 5904 + }, + { + "epoch": 0.5509937482504432, + "grad_norm": 1.1770537949020021, + "learning_rate": 0.000297101352627211, + "loss": 7.2252, + "step": 5905 + }, + { + "epoch": 0.5510870579453205, + "grad_norm": 1.863703944202908, + "learning_rate": 0.0002970998726576786, + "loss": 7.2512, + "step": 5906 + }, + { + "epoch": 0.5511803676401978, + "grad_norm": 1.2218881536822361, + "learning_rate": 0.00029709839231411477, + "loss": 7.3884, + "step": 5907 + }, + { + "epoch": 0.5512736773350752, + "grad_norm": 1.091721141696869, + "learning_rate": 0.0002970969115965233, + "loss": 7.085, + "step": 5908 + }, + { + "epoch": 0.5513669870299525, + "grad_norm": 1.4460746232342996, + "learning_rate": 0.00029709543050490797, + "loss": 7.2754, + "step": 5909 + }, + { + "epoch": 0.5514602967248297, + "grad_norm": 1.0253946854188147, + "learning_rate": 0.0002970939490392726, + "loss": 7.0572, + "step": 5910 + }, + { + "epoch": 0.551553606419707, + "grad_norm": 0.6799985797639899, + "learning_rate": 0.00029709246719962084, + "loss": 7.3383, + "step": 5911 + }, + { + "epoch": 0.5516469161145843, + "grad_norm": 4.761694040293295, + "learning_rate": 0.0002970909849859565, + "loss": 7.285, + "step": 5912 + }, + { + "epoch": 0.5517402258094616, + "grad_norm": 1.7479911865636804, + "learning_rate": 0.00029708950239828335, + "loss": 7.1584, + "step": 5913 + }, + { + "epoch": 0.5518335355043389, + "grad_norm": 0.7474322384677524, + "learning_rate": 0.00029708801943660514, + "loss": 7.1702, + "step": 5914 + }, + { + "epoch": 0.5519268451992162, + "grad_norm": 8.23342794455183, + "learning_rate": 0.0002970865361009257, + "loss": 7.1635, + "step": 5915 + }, + { + "epoch": 0.5520201548940935, + "grad_norm": 2.283676072739915, + "learning_rate": 0.0002970850523912488, + "loss": 6.8264, + "step": 5916 + }, + { + "epoch": 0.5521134645889708, + "grad_norm": 0.8606559226033087, + "learning_rate": 0.00029708356830757815, + "loss": 7.0362, + "step": 5917 + }, + { + "epoch": 0.5522067742838481, + "grad_norm": 1.233514779124736, + "learning_rate": 0.00029708208384991757, + "loss": 7.1806, + "step": 5918 + }, + { + "epoch": 0.5523000839787254, + "grad_norm": 6.087120438595438, + "learning_rate": 0.00029708059901827077, + "loss": 7.1924, + "step": 5919 + }, + { + "epoch": 0.5523933936736027, + "grad_norm": 0.7471405119210712, + "learning_rate": 0.0002970791138126416, + "loss": 7.1291, + "step": 5920 + }, + { + "epoch": 0.5524867033684799, + "grad_norm": 1.0944052799891537, + "learning_rate": 0.0002970776282330339, + "loss": 7.343, + "step": 5921 + }, + { + "epoch": 0.5525800130633572, + "grad_norm": 1.3416665582987743, + "learning_rate": 0.0002970761422794512, + "loss": 7.1576, + "step": 5922 + }, + { + "epoch": 0.5526733227582346, + "grad_norm": 1.837257600284458, + "learning_rate": 0.00029707465595189754, + "loss": 7.2246, + "step": 5923 + }, + { + "epoch": 0.5527666324531119, + "grad_norm": 0.5881323309430009, + "learning_rate": 0.00029707316925037654, + "loss": 7.22, + "step": 5924 + }, + { + "epoch": 0.5528599421479892, + "grad_norm": 1.3255971929217456, + "learning_rate": 0.00029707168217489214, + "loss": 6.8806, + "step": 5925 + }, + { + "epoch": 0.5529532518428665, + "grad_norm": 30.051585334212966, + "learning_rate": 0.0002970701947254479, + "loss": 7.0774, + "step": 5926 + }, + { + "epoch": 0.5530465615377438, + "grad_norm": 0.8465242268523889, + "learning_rate": 0.0002970687069020478, + "loss": 7.1079, + "step": 5927 + }, + { + "epoch": 0.5531398712326211, + "grad_norm": 1.0035989082690018, + "learning_rate": 0.0002970672187046955, + "loss": 7.0999, + "step": 5928 + }, + { + "epoch": 0.5532331809274984, + "grad_norm": 1.759280577566833, + "learning_rate": 0.0002970657301333949, + "loss": 6.894, + "step": 5929 + }, + { + "epoch": 0.5533264906223757, + "grad_norm": 2.121153181072591, + "learning_rate": 0.0002970642411881496, + "loss": 7.134, + "step": 5930 + }, + { + "epoch": 0.5534198003172529, + "grad_norm": 32.038217917676995, + "learning_rate": 0.0002970627518689636, + "loss": 6.8606, + "step": 5931 + }, + { + "epoch": 0.5535131100121302, + "grad_norm": 1.2731551135709538, + "learning_rate": 0.0002970612621758405, + "loss": 7.5366, + "step": 5932 + }, + { + "epoch": 0.5536064197070075, + "grad_norm": 42.31908235576048, + "learning_rate": 0.0002970597721087842, + "loss": 7.2839, + "step": 5933 + }, + { + "epoch": 0.5536997294018848, + "grad_norm": 1.3063900917646396, + "learning_rate": 0.0002970582816677985, + "loss": 6.9781, + "step": 5934 + }, + { + "epoch": 0.5537930390967621, + "grad_norm": 2.202005812976686, + "learning_rate": 0.0002970567908528871, + "loss": 6.9488, + "step": 5935 + }, + { + "epoch": 0.5538863487916394, + "grad_norm": 1.310766550863814, + "learning_rate": 0.0002970552996640539, + "loss": 7.2127, + "step": 5936 + }, + { + "epoch": 0.5539796584865168, + "grad_norm": 1.8445894698641498, + "learning_rate": 0.0002970538081013026, + "loss": 7.0655, + "step": 5937 + }, + { + "epoch": 0.5540729681813941, + "grad_norm": 3.4320972369608147, + "learning_rate": 0.000297052316164637, + "loss": 7.439, + "step": 5938 + }, + { + "epoch": 0.5541662778762714, + "grad_norm": 1.5033860334023283, + "learning_rate": 0.00029705082385406096, + "loss": 7.549, + "step": 5939 + }, + { + "epoch": 0.5542595875711487, + "grad_norm": 3.551986309338173, + "learning_rate": 0.0002970493311695783, + "loss": 7.0847, + "step": 5940 + }, + { + "epoch": 0.554352897266026, + "grad_norm": 1.902224417832948, + "learning_rate": 0.0002970478381111926, + "loss": 6.8926, + "step": 5941 + }, + { + "epoch": 0.5544462069609032, + "grad_norm": 27.2930846809305, + "learning_rate": 0.0002970463446789079, + "loss": 7.2274, + "step": 5942 + }, + { + "epoch": 0.5545395166557805, + "grad_norm": 2.136608161093885, + "learning_rate": 0.00029704485087272785, + "loss": 7.0736, + "step": 5943 + }, + { + "epoch": 0.5546328263506578, + "grad_norm": 2.430959213244903, + "learning_rate": 0.00029704335669265635, + "loss": 6.9264, + "step": 5944 + }, + { + "epoch": 0.5547261360455351, + "grad_norm": 7.395347704331361, + "learning_rate": 0.00029704186213869713, + "loss": 6.976, + "step": 5945 + }, + { + "epoch": 0.5548194457404124, + "grad_norm": 0.9844919609228122, + "learning_rate": 0.000297040367210854, + "loss": 7.1923, + "step": 5946 + }, + { + "epoch": 0.5549127554352897, + "grad_norm": 1.136820677780664, + "learning_rate": 0.0002970388719091308, + "loss": 7.3216, + "step": 5947 + }, + { + "epoch": 0.555006065130167, + "grad_norm": 1.031700657094215, + "learning_rate": 0.0002970373762335313, + "loss": 7.2678, + "step": 5948 + }, + { + "epoch": 0.5550993748250443, + "grad_norm": 44.027734906672066, + "learning_rate": 0.0002970358801840593, + "loss": 7.2024, + "step": 5949 + }, + { + "epoch": 0.5551926845199217, + "grad_norm": 0.7951222184420027, + "learning_rate": 0.0002970343837607186, + "loss": 7.3361, + "step": 5950 + }, + { + "epoch": 0.555285994214799, + "grad_norm": 0.8476900712393037, + "learning_rate": 0.0002970328869635131, + "loss": 7.3179, + "step": 5951 + }, + { + "epoch": 0.5553793039096763, + "grad_norm": 1.4307386399136093, + "learning_rate": 0.00029703138979244643, + "loss": 7.4487, + "step": 5952 + }, + { + "epoch": 0.5554726136045535, + "grad_norm": 0.9161835309475798, + "learning_rate": 0.0002970298922475225, + "loss": 7.1214, + "step": 5953 + }, + { + "epoch": 0.5555659232994308, + "grad_norm": 1.2564078524422908, + "learning_rate": 0.00029702839432874513, + "loss": 7.048, + "step": 5954 + }, + { + "epoch": 0.5556592329943081, + "grad_norm": 1.2900395807273024, + "learning_rate": 0.0002970268960361181, + "loss": 7.1095, + "step": 5955 + }, + { + "epoch": 0.5557525426891854, + "grad_norm": 2.4703087327618216, + "learning_rate": 0.0002970253973696452, + "loss": 7.24, + "step": 5956 + }, + { + "epoch": 0.5558458523840627, + "grad_norm": 1.4761595833528398, + "learning_rate": 0.0002970238983293303, + "loss": 7.2553, + "step": 5957 + }, + { + "epoch": 0.55593916207894, + "grad_norm": 1.287366461581749, + "learning_rate": 0.0002970223989151772, + "loss": 7.5103, + "step": 5958 + }, + { + "epoch": 0.5560324717738173, + "grad_norm": 0.5777811402920159, + "learning_rate": 0.00029702089912718964, + "loss": 7.0845, + "step": 5959 + }, + { + "epoch": 0.5561257814686946, + "grad_norm": 1.0645668355431561, + "learning_rate": 0.0002970193989653715, + "loss": 7.2233, + "step": 5960 + }, + { + "epoch": 0.5562190911635719, + "grad_norm": 1.284791972124591, + "learning_rate": 0.00029701789842972656, + "loss": 7.0243, + "step": 5961 + }, + { + "epoch": 0.5563124008584492, + "grad_norm": 424.24885576373447, + "learning_rate": 0.0002970163975202587, + "loss": 7.0349, + "step": 5962 + }, + { + "epoch": 0.5564057105533264, + "grad_norm": 0.8674089246591098, + "learning_rate": 0.00029701489623697164, + "loss": 7.2772, + "step": 5963 + }, + { + "epoch": 0.5564990202482037, + "grad_norm": 1.1320205481329935, + "learning_rate": 0.0002970133945798693, + "loss": 7.2055, + "step": 5964 + }, + { + "epoch": 0.556592329943081, + "grad_norm": 0.8140511434553125, + "learning_rate": 0.00029701189254895545, + "loss": 6.5732, + "step": 5965 + }, + { + "epoch": 0.5566856396379584, + "grad_norm": 0.9407751543473448, + "learning_rate": 0.00029701039014423387, + "loss": 7.1652, + "step": 5966 + }, + { + "epoch": 0.5567789493328357, + "grad_norm": 2.424372252726458, + "learning_rate": 0.0002970088873657084, + "loss": 7.1436, + "step": 5967 + }, + { + "epoch": 0.556872259027713, + "grad_norm": 0.8299173260526628, + "learning_rate": 0.0002970073842133829, + "loss": 7.1392, + "step": 5968 + }, + { + "epoch": 0.5569655687225903, + "grad_norm": 1.3551487715045065, + "learning_rate": 0.00029700588068726116, + "loss": 7.124, + "step": 5969 + }, + { + "epoch": 0.5570588784174676, + "grad_norm": 1.090771729612798, + "learning_rate": 0.00029700437678734705, + "loss": 7.5211, + "step": 5970 + }, + { + "epoch": 0.5571521881123449, + "grad_norm": 0.9838777139921864, + "learning_rate": 0.00029700287251364433, + "loss": 7.2497, + "step": 5971 + }, + { + "epoch": 0.5572454978072222, + "grad_norm": 1.2568135393948443, + "learning_rate": 0.00029700136786615683, + "loss": 7.2693, + "step": 5972 + }, + { + "epoch": 0.5573388075020995, + "grad_norm": 2.6375956752673493, + "learning_rate": 0.0002969998628448884, + "loss": 7.0114, + "step": 5973 + }, + { + "epoch": 0.5574321171969767, + "grad_norm": 0.9345207064832223, + "learning_rate": 0.00029699835744984287, + "loss": 7.0123, + "step": 5974 + }, + { + "epoch": 0.557525426891854, + "grad_norm": 1.0430667511289802, + "learning_rate": 0.0002969968516810241, + "loss": 7.3159, + "step": 5975 + }, + { + "epoch": 0.5576187365867313, + "grad_norm": 1.1836256217351213, + "learning_rate": 0.00029699534553843585, + "loss": 6.8805, + "step": 5976 + }, + { + "epoch": 0.5577120462816086, + "grad_norm": 1.192627320903884, + "learning_rate": 0.00029699383902208196, + "loss": 6.9949, + "step": 5977 + }, + { + "epoch": 0.557805355976486, + "grad_norm": 1.5294850506278026, + "learning_rate": 0.0002969923321319663, + "loss": 6.9709, + "step": 5978 + }, + { + "epoch": 0.5578986656713633, + "grad_norm": 0.8790764252125718, + "learning_rate": 0.0002969908248680927, + "loss": 6.9952, + "step": 5979 + }, + { + "epoch": 0.5579919753662406, + "grad_norm": 1.6451668951341314, + "learning_rate": 0.00029698931723046493, + "loss": 7.5173, + "step": 5980 + }, + { + "epoch": 0.5580852850611179, + "grad_norm": 1.6136893880912202, + "learning_rate": 0.0002969878092190869, + "loss": 7.103, + "step": 5981 + }, + { + "epoch": 0.5581785947559952, + "grad_norm": 1.8776407524493677, + "learning_rate": 0.0002969863008339624, + "loss": 6.8524, + "step": 5982 + }, + { + "epoch": 0.5582719044508725, + "grad_norm": 968.6086623826751, + "learning_rate": 0.00029698479207509527, + "loss": 7.0475, + "step": 5983 + }, + { + "epoch": 0.5583652141457497, + "grad_norm": 2.111787946859903, + "learning_rate": 0.0002969832829424894, + "loss": 6.9343, + "step": 5984 + }, + { + "epoch": 0.558458523840627, + "grad_norm": 1290.3106917293276, + "learning_rate": 0.0002969817734361485, + "loss": 6.9622, + "step": 5985 + }, + { + "epoch": 0.5585518335355043, + "grad_norm": 11353.273592653557, + "learning_rate": 0.00029698026355607656, + "loss": 7.2877, + "step": 5986 + }, + { + "epoch": 0.5586451432303816, + "grad_norm": 2.6063255518301838, + "learning_rate": 0.0002969787533022773, + "loss": 7.805, + "step": 5987 + }, + { + "epoch": 0.5587384529252589, + "grad_norm": 867.5685893830525, + "learning_rate": 0.00029697724267475467, + "loss": 7.9156, + "step": 5988 + }, + { + "epoch": 0.5588317626201362, + "grad_norm": 3.7374134564181594, + "learning_rate": 0.0002969757316735124, + "loss": 7.7614, + "step": 5989 + }, + { + "epoch": 0.5589250723150135, + "grad_norm": 1.9563544784966134, + "learning_rate": 0.0002969742202985544, + "loss": 8.1586, + "step": 5990 + }, + { + "epoch": 0.5590183820098908, + "grad_norm": 21.531984169890023, + "learning_rate": 0.0002969727085498845, + "loss": 7.9988, + "step": 5991 + }, + { + "epoch": 0.5591116917047682, + "grad_norm": 3.6978367693247636, + "learning_rate": 0.0002969711964275065, + "loss": 8.0425, + "step": 5992 + }, + { + "epoch": 0.5592050013996455, + "grad_norm": 3.475900671927064, + "learning_rate": 0.0002969696839314243, + "loss": 8.1491, + "step": 5993 + }, + { + "epoch": 0.5592983110945228, + "grad_norm": 1.604727980829227, + "learning_rate": 0.00029696817106164173, + "loss": 7.8142, + "step": 5994 + }, + { + "epoch": 0.5593916207894, + "grad_norm": 6.750677109277488, + "learning_rate": 0.0002969666578181627, + "loss": 8.3337, + "step": 5995 + }, + { + "epoch": 0.5594849304842773, + "grad_norm": 5.4938289508233655, + "learning_rate": 0.0002969651442009909, + "loss": 8.2047, + "step": 5996 + }, + { + "epoch": 0.5595782401791546, + "grad_norm": 3.902657783684809, + "learning_rate": 0.00029696363021013036, + "loss": 8.0173, + "step": 5997 + }, + { + "epoch": 0.5596715498740319, + "grad_norm": 5.4445977648387744, + "learning_rate": 0.00029696211584558476, + "loss": 7.4832, + "step": 5998 + }, + { + "epoch": 0.5597648595689092, + "grad_norm": 2.077428708709524, + "learning_rate": 0.00029696060110735804, + "loss": 7.5317, + "step": 5999 + }, + { + "epoch": 0.5598581692637865, + "grad_norm": 3.9150615476681283, + "learning_rate": 0.0002969590859954541, + "loss": 7.9891, + "step": 6000 + }, + { + "epoch": 0.5599514789586638, + "grad_norm": 3.3272001703166163, + "learning_rate": 0.0002969575705098767, + "loss": 7.5642, + "step": 6001 + }, + { + "epoch": 0.5600447886535411, + "grad_norm": 3.9500636852024877, + "learning_rate": 0.00029695605465062975, + "loss": 7.4962, + "step": 6002 + }, + { + "epoch": 0.5601380983484184, + "grad_norm": 3.1002671972355627, + "learning_rate": 0.0002969545384177171, + "loss": 7.5026, + "step": 6003 + }, + { + "epoch": 0.5602314080432957, + "grad_norm": 3.175228306295062, + "learning_rate": 0.00029695302181114255, + "loss": 7.3987, + "step": 6004 + }, + { + "epoch": 0.560324717738173, + "grad_norm": 3.679216937924253, + "learning_rate": 0.00029695150483091, + "loss": 7.6302, + "step": 6005 + }, + { + "epoch": 0.5604180274330502, + "grad_norm": 3.0955935183870467, + "learning_rate": 0.00029694998747702335, + "loss": 7.8097, + "step": 6006 + }, + { + "epoch": 0.5605113371279276, + "grad_norm": 1.8728657584632382, + "learning_rate": 0.00029694846974948633, + "loss": 7.3864, + "step": 6007 + }, + { + "epoch": 0.5606046468228049, + "grad_norm": 1.3203689706331163, + "learning_rate": 0.0002969469516483029, + "loss": 7.2724, + "step": 6008 + }, + { + "epoch": 0.5606979565176822, + "grad_norm": 3.875126125728873, + "learning_rate": 0.00029694543317347697, + "loss": 7.1261, + "step": 6009 + }, + { + "epoch": 0.5607912662125595, + "grad_norm": 6.274100612332289, + "learning_rate": 0.0002969439143250123, + "loss": 7.6431, + "step": 6010 + }, + { + "epoch": 0.5608845759074368, + "grad_norm": 9.111311132019239, + "learning_rate": 0.00029694239510291275, + "loss": 7.0093, + "step": 6011 + }, + { + "epoch": 0.5609778856023141, + "grad_norm": 8.066302164382288, + "learning_rate": 0.00029694087550718225, + "loss": 7.393, + "step": 6012 + }, + { + "epoch": 0.5610711952971914, + "grad_norm": 2.6265468519080732, + "learning_rate": 0.00029693935553782463, + "loss": 7.6033, + "step": 6013 + }, + { + "epoch": 0.5611645049920687, + "grad_norm": 2.8649754265947256, + "learning_rate": 0.0002969378351948437, + "loss": 7.5858, + "step": 6014 + }, + { + "epoch": 0.561257814686946, + "grad_norm": 2.270296983833154, + "learning_rate": 0.00029693631447824345, + "loss": 7.4444, + "step": 6015 + }, + { + "epoch": 0.5613511243818232, + "grad_norm": 1.5670086301327477, + "learning_rate": 0.00029693479338802764, + "loss": 7.0511, + "step": 6016 + }, + { + "epoch": 0.5614444340767005, + "grad_norm": 2.094082409894055, + "learning_rate": 0.0002969332719242002, + "loss": 7.3511, + "step": 6017 + }, + { + "epoch": 0.5615377437715778, + "grad_norm": 1.9135160885689717, + "learning_rate": 0.00029693175008676495, + "loss": 7.4213, + "step": 6018 + }, + { + "epoch": 0.5616310534664551, + "grad_norm": 8.276434599021982, + "learning_rate": 0.0002969302278757258, + "loss": 7.6114, + "step": 6019 + }, + { + "epoch": 0.5617243631613325, + "grad_norm": 1.5614757130568224, + "learning_rate": 0.0002969287052910866, + "loss": 7.3737, + "step": 6020 + }, + { + "epoch": 0.5618176728562098, + "grad_norm": 2.6480526788915255, + "learning_rate": 0.00029692718233285124, + "loss": 7.408, + "step": 6021 + }, + { + "epoch": 0.5619109825510871, + "grad_norm": 2.1833196550450005, + "learning_rate": 0.0002969256590010236, + "loss": 8.0588, + "step": 6022 + }, + { + "epoch": 0.5620042922459644, + "grad_norm": 3.1450726213915616, + "learning_rate": 0.00029692413529560746, + "loss": 7.5814, + "step": 6023 + }, + { + "epoch": 0.5620976019408417, + "grad_norm": 41.68958015158389, + "learning_rate": 0.0002969226112166068, + "loss": 7.5077, + "step": 6024 + }, + { + "epoch": 0.562190911635719, + "grad_norm": 5.440785673907822, + "learning_rate": 0.0002969210867640255, + "loss": 7.3761, + "step": 6025 + }, + { + "epoch": 0.5622842213305963, + "grad_norm": 2.802302330190988, + "learning_rate": 0.00029691956193786737, + "loss": 7.3505, + "step": 6026 + }, + { + "epoch": 0.5623775310254735, + "grad_norm": 9847886.691934751, + "learning_rate": 0.0002969180367381363, + "loss": 7.7674, + "step": 6027 + }, + { + "epoch": 0.5624708407203508, + "grad_norm": 4.6215123976752555, + "learning_rate": 0.0002969165111648362, + "loss": 8.1777, + "step": 6028 + }, + { + "epoch": 0.5625641504152281, + "grad_norm": 3865.9726008427683, + "learning_rate": 0.0002969149852179709, + "loss": 8.1474, + "step": 6029 + }, + { + "epoch": 0.5626574601101054, + "grad_norm": 2.5547484280186175, + "learning_rate": 0.00029691345889754436, + "loss": 8.2992, + "step": 6030 + }, + { + "epoch": 0.5627507698049827, + "grad_norm": 2.879467027575963, + "learning_rate": 0.00029691193220356035, + "loss": 7.9678, + "step": 6031 + }, + { + "epoch": 0.56284407949986, + "grad_norm": 3.2086294558410264, + "learning_rate": 0.0002969104051360229, + "loss": 8.2549, + "step": 6032 + }, + { + "epoch": 0.5629373891947373, + "grad_norm": 5.0315861409254214, + "learning_rate": 0.00029690887769493574, + "loss": 8.1583, + "step": 6033 + }, + { + "epoch": 0.5630306988896147, + "grad_norm": 2.3119289095206916, + "learning_rate": 0.0002969073498803028, + "loss": 8.2008, + "step": 6034 + }, + { + "epoch": 0.563124008584492, + "grad_norm": 2.145251677864567, + "learning_rate": 0.0002969058216921281, + "loss": 7.9982, + "step": 6035 + }, + { + "epoch": 0.5632173182793693, + "grad_norm": 3.064567085193833, + "learning_rate": 0.0002969042931304153, + "loss": 7.8859, + "step": 6036 + }, + { + "epoch": 0.5633106279742465, + "grad_norm": 4.089984980816349, + "learning_rate": 0.0002969027641951685, + "loss": 7.8505, + "step": 6037 + }, + { + "epoch": 0.5634039376691238, + "grad_norm": 2.9542795091275793, + "learning_rate": 0.0002969012348863914, + "loss": 7.9849, + "step": 6038 + }, + { + "epoch": 0.5634972473640011, + "grad_norm": 4.404342078441114, + "learning_rate": 0.000296899705204088, + "loss": 8.0659, + "step": 6039 + }, + { + "epoch": 0.5635905570588784, + "grad_norm": 3.3654060555994256, + "learning_rate": 0.00029689817514826213, + "loss": 8.2614, + "step": 6040 + }, + { + "epoch": 0.5636838667537557, + "grad_norm": 2.5743760597907035, + "learning_rate": 0.0002968966447189178, + "loss": 8.1839, + "step": 6041 + }, + { + "epoch": 0.563777176448633, + "grad_norm": 5.970378146117661, + "learning_rate": 0.00029689511391605875, + "loss": 7.7543, + "step": 6042 + }, + { + "epoch": 0.5638704861435103, + "grad_norm": 3.765544871886004, + "learning_rate": 0.000296893582739689, + "loss": 7.8738, + "step": 6043 + }, + { + "epoch": 0.5639637958383876, + "grad_norm": 2.4716021037108415, + "learning_rate": 0.0002968920511898123, + "loss": 7.5922, + "step": 6044 + }, + { + "epoch": 0.5640571055332649, + "grad_norm": 5.071340233392537, + "learning_rate": 0.00029689051926643265, + "loss": 7.8062, + "step": 6045 + }, + { + "epoch": 0.5641504152281422, + "grad_norm": 1.9928434915979922, + "learning_rate": 0.0002968889869695539, + "loss": 7.7995, + "step": 6046 + }, + { + "epoch": 0.5642437249230196, + "grad_norm": 4.4892249696027955, + "learning_rate": 0.00029688745429918005, + "loss": 8.2424, + "step": 6047 + }, + { + "epoch": 0.5643370346178967, + "grad_norm": 3.7747935151440952, + "learning_rate": 0.00029688592125531486, + "loss": 7.8304, + "step": 6048 + }, + { + "epoch": 0.5644303443127741, + "grad_norm": 4.096549645514174, + "learning_rate": 0.0002968843878379623, + "loss": 7.6979, + "step": 6049 + }, + { + "epoch": 0.5645236540076514, + "grad_norm": 3.5157350486295837, + "learning_rate": 0.00029688285404712625, + "loss": 8.118, + "step": 6050 + }, + { + "epoch": 0.5646169637025287, + "grad_norm": 5.481594210884873, + "learning_rate": 0.0002968813198828106, + "loss": 7.7986, + "step": 6051 + }, + { + "epoch": 0.564710273397406, + "grad_norm": 4.070903469315188, + "learning_rate": 0.00029687978534501927, + "loss": 7.6267, + "step": 6052 + }, + { + "epoch": 0.5648035830922833, + "grad_norm": 2.348468959051563, + "learning_rate": 0.0002968782504337562, + "loss": 7.9292, + "step": 6053 + }, + { + "epoch": 0.5648968927871606, + "grad_norm": 7.217234191871221, + "learning_rate": 0.00029687671514902513, + "loss": 7.9335, + "step": 6054 + }, + { + "epoch": 0.5649902024820379, + "grad_norm": 3.7346863715664087, + "learning_rate": 0.0002968751794908302, + "loss": 7.8183, + "step": 6055 + }, + { + "epoch": 0.5650835121769152, + "grad_norm": 1.6917288972572002, + "learning_rate": 0.00029687364345917517, + "loss": 7.6331, + "step": 6056 + }, + { + "epoch": 0.5651768218717925, + "grad_norm": 3.7195588338154066, + "learning_rate": 0.00029687210705406396, + "loss": 7.8494, + "step": 6057 + }, + { + "epoch": 0.5652701315666698, + "grad_norm": 4.21950181758867, + "learning_rate": 0.00029687057027550046, + "loss": 7.6359, + "step": 6058 + }, + { + "epoch": 0.565363441261547, + "grad_norm": 3.2831250841092676, + "learning_rate": 0.0002968690331234886, + "loss": 7.6831, + "step": 6059 + }, + { + "epoch": 0.5654567509564243, + "grad_norm": 2.1679261880083147, + "learning_rate": 0.0002968674955980323, + "loss": 7.5882, + "step": 6060 + }, + { + "epoch": 0.5655500606513016, + "grad_norm": 2.9092492984334504, + "learning_rate": 0.0002968659576991355, + "loss": 7.4882, + "step": 6061 + }, + { + "epoch": 0.565643370346179, + "grad_norm": 4.507268443778923, + "learning_rate": 0.0002968644194268021, + "loss": 7.7493, + "step": 6062 + }, + { + "epoch": 0.5657366800410563, + "grad_norm": 2.7449151518065396, + "learning_rate": 0.0002968628807810359, + "loss": 7.6995, + "step": 6063 + }, + { + "epoch": 0.5658299897359336, + "grad_norm": 1.8216735103544273, + "learning_rate": 0.00029686134176184093, + "loss": 7.6856, + "step": 6064 + }, + { + "epoch": 0.5659232994308109, + "grad_norm": 2.9328044775104956, + "learning_rate": 0.0002968598023692211, + "loss": 8.3331, + "step": 6065 + }, + { + "epoch": 0.5660166091256882, + "grad_norm": 2.4041977973959545, + "learning_rate": 0.0002968582626031803, + "loss": 7.922, + "step": 6066 + }, + { + "epoch": 0.5661099188205655, + "grad_norm": 2.838723059770169, + "learning_rate": 0.00029685672246372244, + "loss": 7.9652, + "step": 6067 + }, + { + "epoch": 0.5662032285154428, + "grad_norm": 3.91684765780007, + "learning_rate": 0.0002968551819508514, + "loss": 7.7938, + "step": 6068 + }, + { + "epoch": 0.56629653821032, + "grad_norm": 4.567902879998186, + "learning_rate": 0.0002968536410645712, + "loss": 7.6163, + "step": 6069 + }, + { + "epoch": 0.5663898479051973, + "grad_norm": 2.552552035955207, + "learning_rate": 0.0002968520998048856, + "loss": 7.7153, + "step": 6070 + }, + { + "epoch": 0.5664831576000746, + "grad_norm": 3.681839379633101, + "learning_rate": 0.0002968505581717987, + "loss": 7.8238, + "step": 6071 + }, + { + "epoch": 0.5665764672949519, + "grad_norm": 5.458856357881192, + "learning_rate": 0.0002968490161653143, + "loss": 7.6333, + "step": 6072 + }, + { + "epoch": 0.5666697769898292, + "grad_norm": 3.279560638276304, + "learning_rate": 0.0002968474737854363, + "loss": 7.854, + "step": 6073 + }, + { + "epoch": 0.5667630866847065, + "grad_norm": 2.7906687748769063, + "learning_rate": 0.00029684593103216874, + "loss": 7.8703, + "step": 6074 + }, + { + "epoch": 0.5668563963795838, + "grad_norm": 3.400943898083769, + "learning_rate": 0.0002968443879055154, + "loss": 7.724, + "step": 6075 + }, + { + "epoch": 0.5669497060744612, + "grad_norm": 5.278627681175933, + "learning_rate": 0.00029684284440548034, + "loss": 7.7295, + "step": 6076 + }, + { + "epoch": 0.5670430157693385, + "grad_norm": 4.624647165172099, + "learning_rate": 0.00029684130053206744, + "loss": 7.5733, + "step": 6077 + }, + { + "epoch": 0.5671363254642158, + "grad_norm": 2.371361770955671, + "learning_rate": 0.00029683975628528057, + "loss": 7.5316, + "step": 6078 + }, + { + "epoch": 0.5672296351590931, + "grad_norm": 1.5433796231411006, + "learning_rate": 0.0002968382116651237, + "loss": 7.4488, + "step": 6079 + }, + { + "epoch": 0.5673229448539703, + "grad_norm": 4.662371869865492, + "learning_rate": 0.00029683666667160077, + "loss": 7.6479, + "step": 6080 + }, + { + "epoch": 0.5674162545488476, + "grad_norm": 5.672527222769188, + "learning_rate": 0.0002968351213047157, + "loss": 7.7323, + "step": 6081 + }, + { + "epoch": 0.5675095642437249, + "grad_norm": 2.0082178337263015, + "learning_rate": 0.00029683357556447237, + "loss": 7.6414, + "step": 6082 + }, + { + "epoch": 0.5676028739386022, + "grad_norm": 2.2990077788206387, + "learning_rate": 0.0002968320294508748, + "loss": 7.5987, + "step": 6083 + }, + { + "epoch": 0.5676961836334795, + "grad_norm": 2.5330421878778906, + "learning_rate": 0.0002968304829639268, + "loss": 7.74, + "step": 6084 + }, + { + "epoch": 0.5677894933283568, + "grad_norm": 3.712140795679561, + "learning_rate": 0.0002968289361036325, + "loss": 7.5274, + "step": 6085 + }, + { + "epoch": 0.5678828030232341, + "grad_norm": 2.335770176935912, + "learning_rate": 0.00029682738886999566, + "loss": 7.8874, + "step": 6086 + }, + { + "epoch": 0.5679761127181114, + "grad_norm": 2.784920052285174, + "learning_rate": 0.0002968258412630202, + "loss": 7.4874, + "step": 6087 + }, + { + "epoch": 0.5680694224129887, + "grad_norm": 1.6346482850264055, + "learning_rate": 0.0002968242932827102, + "loss": 7.5203, + "step": 6088 + }, + { + "epoch": 0.568162732107866, + "grad_norm": 3.0209922493249213, + "learning_rate": 0.00029682274492906946, + "loss": 7.5935, + "step": 6089 + }, + { + "epoch": 0.5682560418027433, + "grad_norm": 5.6940458014898665, + "learning_rate": 0.000296821196202102, + "loss": 7.8431, + "step": 6090 + }, + { + "epoch": 0.5683493514976206, + "grad_norm": 1.7918409304709317, + "learning_rate": 0.00029681964710181173, + "loss": 7.541, + "step": 6091 + }, + { + "epoch": 0.5684426611924979, + "grad_norm": 3.052284988297741, + "learning_rate": 0.0002968180976282026, + "loss": 7.4533, + "step": 6092 + }, + { + "epoch": 0.5685359708873752, + "grad_norm": 2.043074208062487, + "learning_rate": 0.0002968165477812785, + "loss": 7.9275, + "step": 6093 + }, + { + "epoch": 0.5686292805822525, + "grad_norm": 1.8174624755222644, + "learning_rate": 0.00029681499756104343, + "loss": 7.7792, + "step": 6094 + }, + { + "epoch": 0.5687225902771298, + "grad_norm": 3.1707822304547695, + "learning_rate": 0.0002968134469675013, + "loss": 7.8511, + "step": 6095 + }, + { + "epoch": 0.5688158999720071, + "grad_norm": 2.4852528315263664, + "learning_rate": 0.00029681189600065606, + "loss": 7.4047, + "step": 6096 + }, + { + "epoch": 0.5689092096668844, + "grad_norm": 2.183029024100369, + "learning_rate": 0.00029681034466051166, + "loss": 7.7317, + "step": 6097 + }, + { + "epoch": 0.5690025193617617, + "grad_norm": 1.943052680645987, + "learning_rate": 0.00029680879294707207, + "loss": 7.4751, + "step": 6098 + }, + { + "epoch": 0.569095829056639, + "grad_norm": 3.9025944189744317, + "learning_rate": 0.00029680724086034117, + "loss": 7.8202, + "step": 6099 + }, + { + "epoch": 0.5691891387515163, + "grad_norm": 1.4489524070749669, + "learning_rate": 0.000296805688400323, + "loss": 7.2408, + "step": 6100 + }, + { + "epoch": 0.5692824484463935, + "grad_norm": 2.279639883733487, + "learning_rate": 0.00029680413556702143, + "loss": 7.7525, + "step": 6101 + }, + { + "epoch": 0.5693757581412708, + "grad_norm": 2.522280885133202, + "learning_rate": 0.0002968025823604404, + "loss": 7.5977, + "step": 6102 + }, + { + "epoch": 0.5694690678361481, + "grad_norm": 2.165531708988841, + "learning_rate": 0.0002968010287805839, + "loss": 7.6686, + "step": 6103 + }, + { + "epoch": 0.5695623775310255, + "grad_norm": 1.6397672618351649, + "learning_rate": 0.0002967994748274559, + "loss": 7.7985, + "step": 6104 + }, + { + "epoch": 0.5696556872259028, + "grad_norm": 2.9853931634676276, + "learning_rate": 0.00029679792050106026, + "loss": 7.2624, + "step": 6105 + }, + { + "epoch": 0.5697489969207801, + "grad_norm": 1.6906173443630683, + "learning_rate": 0.000296796365801401, + "loss": 7.5829, + "step": 6106 + }, + { + "epoch": 0.5698423066156574, + "grad_norm": 2.2716612445943585, + "learning_rate": 0.0002967948107284821, + "loss": 7.1278, + "step": 6107 + }, + { + "epoch": 0.5699356163105347, + "grad_norm": 3.2764495504758457, + "learning_rate": 0.0002967932552823075, + "loss": 8.0928, + "step": 6108 + }, + { + "epoch": 0.570028926005412, + "grad_norm": 1.314769885590832, + "learning_rate": 0.0002967916994628811, + "loss": 7.173, + "step": 6109 + }, + { + "epoch": 0.5701222357002893, + "grad_norm": 1.354905023000486, + "learning_rate": 0.0002967901432702069, + "loss": 7.6452, + "step": 6110 + }, + { + "epoch": 0.5702155453951666, + "grad_norm": 1.283476494565277, + "learning_rate": 0.00029678858670428886, + "loss": 7.8742, + "step": 6111 + }, + { + "epoch": 0.5703088550900438, + "grad_norm": 2.4157580046664537, + "learning_rate": 0.0002967870297651309, + "loss": 7.2728, + "step": 6112 + }, + { + "epoch": 0.5704021647849211, + "grad_norm": 2.1634797235888135, + "learning_rate": 0.000296785472452737, + "loss": 7.7755, + "step": 6113 + }, + { + "epoch": 0.5704954744797984, + "grad_norm": 1.256242031473568, + "learning_rate": 0.00029678391476711113, + "loss": 7.8225, + "step": 6114 + }, + { + "epoch": 0.5705887841746757, + "grad_norm": 2.433920641928137, + "learning_rate": 0.00029678235670825727, + "loss": 7.9417, + "step": 6115 + }, + { + "epoch": 0.570682093869553, + "grad_norm": 1.1527913961711889, + "learning_rate": 0.00029678079827617933, + "loss": 7.5091, + "step": 6116 + }, + { + "epoch": 0.5707754035644304, + "grad_norm": 1.2423487235648818, + "learning_rate": 0.00029677923947088133, + "loss": 7.4206, + "step": 6117 + }, + { + "epoch": 0.5708687132593077, + "grad_norm": 2.070393793006707, + "learning_rate": 0.0002967776802923672, + "loss": 7.6475, + "step": 6118 + }, + { + "epoch": 0.570962022954185, + "grad_norm": 2.6590895192200312, + "learning_rate": 0.0002967761207406409, + "loss": 7.4268, + "step": 6119 + }, + { + "epoch": 0.5710553326490623, + "grad_norm": 1.7986745285232189, + "learning_rate": 0.00029677456081570646, + "loss": 7.5443, + "step": 6120 + }, + { + "epoch": 0.5711486423439396, + "grad_norm": 1.160055642715038, + "learning_rate": 0.00029677300051756766, + "loss": 7.3832, + "step": 6121 + }, + { + "epoch": 0.5712419520388168, + "grad_norm": 1.1793334407540945, + "learning_rate": 0.0002967714398462287, + "loss": 7.5952, + "step": 6122 + }, + { + "epoch": 0.5713352617336941, + "grad_norm": 2.0244410391352683, + "learning_rate": 0.0002967698788016934, + "loss": 7.3177, + "step": 6123 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 1.9332970916795982, + "learning_rate": 0.0002967683173839658, + "loss": 7.3279, + "step": 6124 + }, + { + "epoch": 0.5715218811234487, + "grad_norm": 1.0079149341797622, + "learning_rate": 0.00029676675559304984, + "loss": 7.3321, + "step": 6125 + }, + { + "epoch": 0.571615190818326, + "grad_norm": 2.1727516186066147, + "learning_rate": 0.00029676519342894945, + "loss": 7.6163, + "step": 6126 + }, + { + "epoch": 0.5717085005132033, + "grad_norm": 2.0192436401300413, + "learning_rate": 0.0002967636308916687, + "loss": 7.3044, + "step": 6127 + }, + { + "epoch": 0.5718018102080806, + "grad_norm": 1.200207034858287, + "learning_rate": 0.0002967620679812115, + "loss": 7.4769, + "step": 6128 + }, + { + "epoch": 0.5718951199029579, + "grad_norm": 1.2341955383692276, + "learning_rate": 0.0002967605046975818, + "loss": 7.1661, + "step": 6129 + }, + { + "epoch": 0.5719884295978352, + "grad_norm": 1.490348768362506, + "learning_rate": 0.0002967589410407837, + "loss": 7.2741, + "step": 6130 + }, + { + "epoch": 0.5720817392927126, + "grad_norm": 1.1168649312271417, + "learning_rate": 0.00029675737701082103, + "loss": 7.3215, + "step": 6131 + }, + { + "epoch": 0.5721750489875899, + "grad_norm": 1.7628858987924403, + "learning_rate": 0.0002967558126076978, + "loss": 7.7024, + "step": 6132 + }, + { + "epoch": 0.5722683586824671, + "grad_norm": 1.406852745609696, + "learning_rate": 0.000296754247831418, + "loss": 7.671, + "step": 6133 + }, + { + "epoch": 0.5723616683773444, + "grad_norm": 1.8812707111567635, + "learning_rate": 0.0002967526826819857, + "loss": 7.6371, + "step": 6134 + }, + { + "epoch": 0.5724549780722217, + "grad_norm": 1.6578445104325739, + "learning_rate": 0.00029675111715940473, + "loss": 7.8121, + "step": 6135 + }, + { + "epoch": 0.572548287767099, + "grad_norm": 1.676672638551667, + "learning_rate": 0.00029674955126367914, + "loss": 7.7596, + "step": 6136 + }, + { + "epoch": 0.5726415974619763, + "grad_norm": 1.373336119531998, + "learning_rate": 0.00029674798499481296, + "loss": 7.9981, + "step": 6137 + }, + { + "epoch": 0.5727349071568536, + "grad_norm": 1.7900299284159178, + "learning_rate": 0.0002967464183528101, + "loss": 7.5128, + "step": 6138 + }, + { + "epoch": 0.5728282168517309, + "grad_norm": 1.1924269792538118, + "learning_rate": 0.00029674485133767455, + "loss": 7.1299, + "step": 6139 + }, + { + "epoch": 0.5729215265466082, + "grad_norm": 2.1182979793170698, + "learning_rate": 0.00029674328394941035, + "loss": 7.5615, + "step": 6140 + }, + { + "epoch": 0.5730148362414855, + "grad_norm": 1.3098624375346664, + "learning_rate": 0.0002967417161880214, + "loss": 7.3681, + "step": 6141 + }, + { + "epoch": 0.5731081459363628, + "grad_norm": 1.0547928939283935, + "learning_rate": 0.00029674014805351176, + "loss": 7.564, + "step": 6142 + }, + { + "epoch": 0.57320145563124, + "grad_norm": 1.967763264207776, + "learning_rate": 0.0002967385795458854, + "loss": 7.072, + "step": 6143 + }, + { + "epoch": 0.5732947653261173, + "grad_norm": 1.2184756224742874, + "learning_rate": 0.00029673701066514627, + "loss": 7.6765, + "step": 6144 + }, + { + "epoch": 0.5733880750209946, + "grad_norm": 1.1045253402337911, + "learning_rate": 0.0002967354414112984, + "loss": 7.6494, + "step": 6145 + }, + { + "epoch": 0.573481384715872, + "grad_norm": 1.305017417930611, + "learning_rate": 0.0002967338717843458, + "loss": 7.9434, + "step": 6146 + }, + { + "epoch": 0.5735746944107493, + "grad_norm": 1.2488660721571572, + "learning_rate": 0.00029673230178429234, + "loss": 7.4493, + "step": 6147 + }, + { + "epoch": 0.5736680041056266, + "grad_norm": 1.0897499713752903, + "learning_rate": 0.0002967307314111422, + "loss": 7.373, + "step": 6148 + }, + { + "epoch": 0.5737613138005039, + "grad_norm": 1.0097481952308975, + "learning_rate": 0.00029672916066489927, + "loss": 7.443, + "step": 6149 + }, + { + "epoch": 0.5738546234953812, + "grad_norm": 1.7215551435852574, + "learning_rate": 0.0002967275895455675, + "loss": 7.812, + "step": 6150 + }, + { + "epoch": 0.5739479331902585, + "grad_norm": 1.0035477343596406, + "learning_rate": 0.00029672601805315097, + "loss": 7.126, + "step": 6151 + }, + { + "epoch": 0.5740412428851358, + "grad_norm": 1.1245429050723987, + "learning_rate": 0.0002967244461876536, + "loss": 7.6859, + "step": 6152 + }, + { + "epoch": 0.5741345525800131, + "grad_norm": 1.2991970401422483, + "learning_rate": 0.00029672287394907947, + "loss": 7.5255, + "step": 6153 + }, + { + "epoch": 0.5742278622748903, + "grad_norm": 0.9582523793627209, + "learning_rate": 0.0002967213013374325, + "loss": 7.5478, + "step": 6154 + }, + { + "epoch": 0.5743211719697676, + "grad_norm": 1.0420998730369273, + "learning_rate": 0.0002967197283527167, + "loss": 7.3246, + "step": 6155 + }, + { + "epoch": 0.5744144816646449, + "grad_norm": 1.7364227720719096, + "learning_rate": 0.0002967181549949362, + "loss": 7.7454, + "step": 6156 + }, + { + "epoch": 0.5745077913595222, + "grad_norm": 1.1884243975200512, + "learning_rate": 0.0002967165812640948, + "loss": 7.0447, + "step": 6157 + }, + { + "epoch": 0.5746011010543995, + "grad_norm": 0.9958374617791734, + "learning_rate": 0.0002967150071601966, + "loss": 7.0143, + "step": 6158 + }, + { + "epoch": 0.5746944107492769, + "grad_norm": 1.0761463331276397, + "learning_rate": 0.00029671343268324567, + "loss": 7.3211, + "step": 6159 + }, + { + "epoch": 0.5747877204441542, + "grad_norm": 0.9690006308069947, + "learning_rate": 0.00029671185783324586, + "loss": 7.3625, + "step": 6160 + }, + { + "epoch": 0.5748810301390315, + "grad_norm": 1.3237827188466755, + "learning_rate": 0.0002967102826102013, + "loss": 7.3804, + "step": 6161 + }, + { + "epoch": 0.5749743398339088, + "grad_norm": 0.6897995983219597, + "learning_rate": 0.0002967087070141159, + "loss": 7.3208, + "step": 6162 + }, + { + "epoch": 0.5750676495287861, + "grad_norm": 1.3412340545098183, + "learning_rate": 0.00029670713104499376, + "loss": 7.4949, + "step": 6163 + }, + { + "epoch": 0.5751609592236634, + "grad_norm": 1.7325969563450856, + "learning_rate": 0.0002967055547028388, + "loss": 7.44, + "step": 6164 + }, + { + "epoch": 0.5752542689185406, + "grad_norm": 1.175803803772611, + "learning_rate": 0.0002967039779876551, + "loss": 7.2474, + "step": 6165 + }, + { + "epoch": 0.5753475786134179, + "grad_norm": 0.7918968720526014, + "learning_rate": 0.00029670240089944665, + "loss": 7.3218, + "step": 6166 + }, + { + "epoch": 0.5754408883082952, + "grad_norm": 2.997125764382104, + "learning_rate": 0.00029670082343821746, + "loss": 7.7724, + "step": 6167 + }, + { + "epoch": 0.5755341980031725, + "grad_norm": 0.8344275776009342, + "learning_rate": 0.0002966992456039715, + "loss": 7.2627, + "step": 6168 + }, + { + "epoch": 0.5756275076980498, + "grad_norm": 1.3832968900075524, + "learning_rate": 0.00029669766739671283, + "loss": 6.9865, + "step": 6169 + }, + { + "epoch": 0.5757208173929271, + "grad_norm": 1.096864052815106, + "learning_rate": 0.00029669608881644546, + "loss": 7.3614, + "step": 6170 + }, + { + "epoch": 0.5758141270878044, + "grad_norm": 1.6840408327742153, + "learning_rate": 0.0002966945098631734, + "loss": 7.129, + "step": 6171 + }, + { + "epoch": 0.5759074367826817, + "grad_norm": 0.8217232741471114, + "learning_rate": 0.0002966929305369006, + "loss": 7.48, + "step": 6172 + }, + { + "epoch": 0.5760007464775591, + "grad_norm": 1.1861252869840073, + "learning_rate": 0.00029669135083763116, + "loss": 7.555, + "step": 6173 + }, + { + "epoch": 0.5760940561724364, + "grad_norm": 0.9675239213093448, + "learning_rate": 0.00029668977076536906, + "loss": 7.3411, + "step": 6174 + }, + { + "epoch": 0.5761873658673136, + "grad_norm": 0.6991367223052035, + "learning_rate": 0.00029668819032011836, + "loss": 7.2952, + "step": 6175 + }, + { + "epoch": 0.5762806755621909, + "grad_norm": 1.4615926258718792, + "learning_rate": 0.000296686609501883, + "loss": 7.6218, + "step": 6176 + }, + { + "epoch": 0.5763739852570682, + "grad_norm": 1.3450614752678358, + "learning_rate": 0.0002966850283106671, + "loss": 7.1211, + "step": 6177 + }, + { + "epoch": 0.5764672949519455, + "grad_norm": 0.9691441790188162, + "learning_rate": 0.00029668344674647456, + "loss": 7.2664, + "step": 6178 + }, + { + "epoch": 0.5765606046468228, + "grad_norm": 0.6521050344541987, + "learning_rate": 0.00029668186480930953, + "loss": 7.3514, + "step": 6179 + }, + { + "epoch": 0.5766539143417001, + "grad_norm": 0.8734257218387224, + "learning_rate": 0.0002966802824991759, + "loss": 7.5859, + "step": 6180 + }, + { + "epoch": 0.5767472240365774, + "grad_norm": 0.9670443503069478, + "learning_rate": 0.0002966786998160778, + "loss": 7.4896, + "step": 6181 + }, + { + "epoch": 0.5768405337314547, + "grad_norm": 0.554957309617575, + "learning_rate": 0.00029667711676001926, + "loss": 7.2686, + "step": 6182 + }, + { + "epoch": 0.576933843426332, + "grad_norm": 0.6500484921343256, + "learning_rate": 0.0002966755333310042, + "loss": 7.389, + "step": 6183 + }, + { + "epoch": 0.5770271531212093, + "grad_norm": 1.3435609308905165, + "learning_rate": 0.0002966739495290367, + "loss": 6.9097, + "step": 6184 + }, + { + "epoch": 0.5771204628160866, + "grad_norm": 0.712726367957933, + "learning_rate": 0.00029667236535412076, + "loss": 7.0436, + "step": 6185 + }, + { + "epoch": 0.5772137725109638, + "grad_norm": 0.8108495854287726, + "learning_rate": 0.00029667078080626054, + "loss": 7.141, + "step": 6186 + }, + { + "epoch": 0.5773070822058411, + "grad_norm": 1.218143675536453, + "learning_rate": 0.0002966691958854599, + "loss": 7.2633, + "step": 6187 + }, + { + "epoch": 0.5774003919007185, + "grad_norm": 0.6161019510456037, + "learning_rate": 0.000296667610591723, + "loss": 7.3956, + "step": 6188 + }, + { + "epoch": 0.5774937015955958, + "grad_norm": 0.796290566822915, + "learning_rate": 0.0002966660249250538, + "loss": 7.5739, + "step": 6189 + }, + { + "epoch": 0.5775870112904731, + "grad_norm": 1.2012689938614054, + "learning_rate": 0.0002966644388854563, + "loss": 7.448, + "step": 6190 + }, + { + "epoch": 0.5776803209853504, + "grad_norm": 1.5436687091490822, + "learning_rate": 0.0002966628524729346, + "loss": 7.291, + "step": 6191 + }, + { + "epoch": 0.5777736306802277, + "grad_norm": 1.1812004974003953, + "learning_rate": 0.0002966612656874927, + "loss": 7.4622, + "step": 6192 + }, + { + "epoch": 0.577866940375105, + "grad_norm": 1.0317102087163363, + "learning_rate": 0.0002966596785291347, + "loss": 7.7484, + "step": 6193 + }, + { + "epoch": 0.5779602500699823, + "grad_norm": 0.8609051715703238, + "learning_rate": 0.0002966580909978645, + "loss": 7.2603, + "step": 6194 + }, + { + "epoch": 0.5780535597648596, + "grad_norm": 0.7605974114285308, + "learning_rate": 0.0002966565030936863, + "loss": 6.9929, + "step": 6195 + }, + { + "epoch": 0.5781468694597368, + "grad_norm": 1.1303062435612645, + "learning_rate": 0.00029665491481660396, + "loss": 7.4505, + "step": 6196 + }, + { + "epoch": 0.5782401791546141, + "grad_norm": 0.6727269082352597, + "learning_rate": 0.00029665332616662167, + "loss": 7.2441, + "step": 6197 + }, + { + "epoch": 0.5783334888494914, + "grad_norm": 0.9914227712081609, + "learning_rate": 0.00029665173714374344, + "loss": 7.3981, + "step": 6198 + }, + { + "epoch": 0.5784267985443687, + "grad_norm": 0.7256235845606418, + "learning_rate": 0.0002966501477479733, + "loss": 7.2654, + "step": 6199 + }, + { + "epoch": 0.578520108239246, + "grad_norm": 1.29079751583837, + "learning_rate": 0.0002966485579793152, + "loss": 6.8044, + "step": 6200 + }, + { + "epoch": 0.5786134179341234, + "grad_norm": 1.2303800420224933, + "learning_rate": 0.00029664696783777327, + "loss": 7.3635, + "step": 6201 + }, + { + "epoch": 0.5787067276290007, + "grad_norm": 0.9322605960363536, + "learning_rate": 0.00029664537732335155, + "loss": 7.2659, + "step": 6202 + }, + { + "epoch": 0.578800037323878, + "grad_norm": 1.2212463775085194, + "learning_rate": 0.0002966437864360541, + "loss": 7.3509, + "step": 6203 + }, + { + "epoch": 0.5788933470187553, + "grad_norm": 0.734682019754327, + "learning_rate": 0.00029664219517588496, + "loss": 7.3332, + "step": 6204 + }, + { + "epoch": 0.5789866567136326, + "grad_norm": 0.7677191318074343, + "learning_rate": 0.00029664060354284806, + "loss": 7.6536, + "step": 6205 + }, + { + "epoch": 0.5790799664085099, + "grad_norm": 1.1780598018783395, + "learning_rate": 0.0002966390115369476, + "loss": 7.3315, + "step": 6206 + }, + { + "epoch": 0.5791732761033871, + "grad_norm": 1.2572933884390116, + "learning_rate": 0.00029663741915818764, + "loss": 6.9602, + "step": 6207 + }, + { + "epoch": 0.5792665857982644, + "grad_norm": 0.8501179158874296, + "learning_rate": 0.00029663582640657203, + "loss": 7.2046, + "step": 6208 + }, + { + "epoch": 0.5793598954931417, + "grad_norm": 1.3590783163901763, + "learning_rate": 0.00029663423328210505, + "loss": 7.2367, + "step": 6209 + }, + { + "epoch": 0.579453205188019, + "grad_norm": 1.7471293852558378, + "learning_rate": 0.0002966326397847906, + "loss": 7.6637, + "step": 6210 + }, + { + "epoch": 0.5795465148828963, + "grad_norm": 0.7766755295725658, + "learning_rate": 0.0002966310459146328, + "loss": 7.2108, + "step": 6211 + }, + { + "epoch": 0.5796398245777736, + "grad_norm": 1.040461721051717, + "learning_rate": 0.0002966294516716357, + "loss": 7.1883, + "step": 6212 + }, + { + "epoch": 0.5797331342726509, + "grad_norm": 1.4248249201945191, + "learning_rate": 0.0002966278570558033, + "loss": 6.9783, + "step": 6213 + }, + { + "epoch": 0.5798264439675282, + "grad_norm": 0.5641750122775704, + "learning_rate": 0.0002966262620671397, + "loss": 7.1832, + "step": 6214 + }, + { + "epoch": 0.5799197536624056, + "grad_norm": 1.1467998232145795, + "learning_rate": 0.000296624666705649, + "loss": 7.3949, + "step": 6215 + }, + { + "epoch": 0.5800130633572829, + "grad_norm": 0.8305781725774122, + "learning_rate": 0.0002966230709713352, + "loss": 7.1516, + "step": 6216 + }, + { + "epoch": 0.5801063730521602, + "grad_norm": 1.0257273406326548, + "learning_rate": 0.0002966214748642023, + "loss": 7.4439, + "step": 6217 + }, + { + "epoch": 0.5801996827470374, + "grad_norm": 0.8480072635748923, + "learning_rate": 0.00029661987838425447, + "loss": 7.0312, + "step": 6218 + }, + { + "epoch": 0.5802929924419147, + "grad_norm": 1.1207657140704785, + "learning_rate": 0.0002966182815314957, + "loss": 7.1738, + "step": 6219 + }, + { + "epoch": 0.580386302136792, + "grad_norm": 0.7342909878868733, + "learning_rate": 0.0002966166843059301, + "loss": 7.2487, + "step": 6220 + }, + { + "epoch": 0.5804796118316693, + "grad_norm": 0.49743788839408176, + "learning_rate": 0.00029661508670756167, + "loss": 7.072, + "step": 6221 + }, + { + "epoch": 0.5805729215265466, + "grad_norm": 1.068314741745784, + "learning_rate": 0.0002966134887363945, + "loss": 7.2988, + "step": 6222 + }, + { + "epoch": 0.5806662312214239, + "grad_norm": 0.9961477300079599, + "learning_rate": 0.0002966118903924327, + "loss": 7.1414, + "step": 6223 + }, + { + "epoch": 0.5807595409163012, + "grad_norm": 0.5616173148596546, + "learning_rate": 0.0002966102916756803, + "loss": 7.183, + "step": 6224 + }, + { + "epoch": 0.5808528506111785, + "grad_norm": 0.9874725882875224, + "learning_rate": 0.0002966086925861413, + "loss": 7.1238, + "step": 6225 + }, + { + "epoch": 0.5809461603060558, + "grad_norm": 0.5528712871956013, + "learning_rate": 0.00029660709312381985, + "loss": 7.3983, + "step": 6226 + }, + { + "epoch": 0.5810394700009331, + "grad_norm": 0.5212398428389944, + "learning_rate": 0.00029660549328872, + "loss": 7.3704, + "step": 6227 + }, + { + "epoch": 0.5811327796958103, + "grad_norm": 0.622938160508679, + "learning_rate": 0.0002966038930808458, + "loss": 7.2926, + "step": 6228 + }, + { + "epoch": 0.5812260893906877, + "grad_norm": 0.7808945420531304, + "learning_rate": 0.00029660229250020134, + "loss": 7.3906, + "step": 6229 + }, + { + "epoch": 0.581319399085565, + "grad_norm": 0.5854982926804618, + "learning_rate": 0.00029660069154679064, + "loss": 7.0214, + "step": 6230 + }, + { + "epoch": 0.5814127087804423, + "grad_norm": 0.7083044054034241, + "learning_rate": 0.00029659909022061785, + "loss": 7.0738, + "step": 6231 + }, + { + "epoch": 0.5815060184753196, + "grad_norm": 0.608426143575775, + "learning_rate": 0.000296597488521687, + "loss": 7.2388, + "step": 6232 + }, + { + "epoch": 0.5815993281701969, + "grad_norm": 0.6374277981945015, + "learning_rate": 0.00029659588645000214, + "loss": 7.1243, + "step": 6233 + }, + { + "epoch": 0.5816926378650742, + "grad_norm": 0.5229123937089617, + "learning_rate": 0.0002965942840055674, + "loss": 7.2321, + "step": 6234 + }, + { + "epoch": 0.5817859475599515, + "grad_norm": 0.6505808596280311, + "learning_rate": 0.0002965926811883867, + "loss": 7.1229, + "step": 6235 + }, + { + "epoch": 0.5818792572548288, + "grad_norm": 0.8886777610767148, + "learning_rate": 0.00029659107799846437, + "loss": 7.58, + "step": 6236 + }, + { + "epoch": 0.5819725669497061, + "grad_norm": 0.9731350364034412, + "learning_rate": 0.0002965894744358043, + "loss": 6.976, + "step": 6237 + }, + { + "epoch": 0.5820658766445834, + "grad_norm": 0.8225606375134054, + "learning_rate": 0.00029658787050041064, + "loss": 6.9612, + "step": 6238 + }, + { + "epoch": 0.5821591863394606, + "grad_norm": 0.7964789451536337, + "learning_rate": 0.00029658626619228745, + "loss": 7.1059, + "step": 6239 + }, + { + "epoch": 0.5822524960343379, + "grad_norm": 0.712440824350567, + "learning_rate": 0.00029658466151143874, + "loss": 7.0551, + "step": 6240 + }, + { + "epoch": 0.5823458057292152, + "grad_norm": 0.6664213352841641, + "learning_rate": 0.00029658305645786876, + "loss": 7.2341, + "step": 6241 + }, + { + "epoch": 0.5824391154240925, + "grad_norm": 0.8730144826672648, + "learning_rate": 0.00029658145103158147, + "loss": 6.8609, + "step": 6242 + }, + { + "epoch": 0.5825324251189699, + "grad_norm": 0.8285717471300932, + "learning_rate": 0.00029657984523258086, + "loss": 7.1935, + "step": 6243 + }, + { + "epoch": 0.5826257348138472, + "grad_norm": 0.6451630605892074, + "learning_rate": 0.00029657823906087123, + "loss": 7.4433, + "step": 6244 + }, + { + "epoch": 0.5827190445087245, + "grad_norm": 0.7310574724566198, + "learning_rate": 0.00029657663251645653, + "loss": 7.3796, + "step": 6245 + }, + { + "epoch": 0.5828123542036018, + "grad_norm": 0.6872509966036392, + "learning_rate": 0.0002965750255993409, + "loss": 7.2879, + "step": 6246 + }, + { + "epoch": 0.5829056638984791, + "grad_norm": 0.6268624873598685, + "learning_rate": 0.0002965734183095283, + "loss": 7.0474, + "step": 6247 + }, + { + "epoch": 0.5829989735933564, + "grad_norm": 0.9237880703192685, + "learning_rate": 0.000296571810647023, + "loss": 7.0383, + "step": 6248 + }, + { + "epoch": 0.5830922832882336, + "grad_norm": 0.5337154604030961, + "learning_rate": 0.000296570202611829, + "loss": 7.1494, + "step": 6249 + }, + { + "epoch": 0.5831855929831109, + "grad_norm": 1.2849315021930736, + "learning_rate": 0.0002965685942039504, + "loss": 7.3439, + "step": 6250 + }, + { + "epoch": 0.5832789026779882, + "grad_norm": 0.6870373213873828, + "learning_rate": 0.0002965669854233913, + "loss": 7.1742, + "step": 6251 + }, + { + "epoch": 0.5833722123728655, + "grad_norm": 1.4850003791075437, + "learning_rate": 0.0002965653762701557, + "loss": 7.0297, + "step": 6252 + }, + { + "epoch": 0.5834655220677428, + "grad_norm": 1.0290015681829645, + "learning_rate": 0.0002965637667442478, + "loss": 7.3397, + "step": 6253 + }, + { + "epoch": 0.5835588317626201, + "grad_norm": 0.8642302659800044, + "learning_rate": 0.0002965621568456717, + "loss": 7.265, + "step": 6254 + }, + { + "epoch": 0.5836521414574974, + "grad_norm": 0.6614382174554577, + "learning_rate": 0.00029656054657443135, + "loss": 7.0324, + "step": 6255 + }, + { + "epoch": 0.5837454511523748, + "grad_norm": 0.7399968702248633, + "learning_rate": 0.00029655893593053104, + "loss": 6.962, + "step": 6256 + }, + { + "epoch": 0.5838387608472521, + "grad_norm": 0.6666225671615621, + "learning_rate": 0.0002965573249139747, + "loss": 7.3969, + "step": 6257 + }, + { + "epoch": 0.5839320705421294, + "grad_norm": 0.859040195161754, + "learning_rate": 0.0002965557135247665, + "loss": 7.3103, + "step": 6258 + }, + { + "epoch": 0.5840253802370067, + "grad_norm": 1.0763103435072654, + "learning_rate": 0.00029655410176291063, + "loss": 7.18, + "step": 6259 + }, + { + "epoch": 0.5841186899318839, + "grad_norm": 1.1008608073985993, + "learning_rate": 0.000296552489628411, + "loss": 7.0043, + "step": 6260 + }, + { + "epoch": 0.5842119996267612, + "grad_norm": 0.8013759314472277, + "learning_rate": 0.00029655087712127185, + "loss": 7.2592, + "step": 6261 + }, + { + "epoch": 0.5843053093216385, + "grad_norm": 1.108725778668916, + "learning_rate": 0.0002965492642414972, + "loss": 7.268, + "step": 6262 + }, + { + "epoch": 0.5843986190165158, + "grad_norm": 0.8985916386621723, + "learning_rate": 0.00029654765098909117, + "loss": 7.2753, + "step": 6263 + }, + { + "epoch": 0.5844919287113931, + "grad_norm": 1.41084742043513, + "learning_rate": 0.0002965460373640579, + "loss": 7.3387, + "step": 6264 + }, + { + "epoch": 0.5845852384062704, + "grad_norm": 1.1514453112983976, + "learning_rate": 0.00029654442336640144, + "loss": 7.1549, + "step": 6265 + }, + { + "epoch": 0.5846785481011477, + "grad_norm": 1.5095008252814146, + "learning_rate": 0.0002965428089961259, + "loss": 7.0812, + "step": 6266 + }, + { + "epoch": 0.584771857796025, + "grad_norm": 1.1046662034983779, + "learning_rate": 0.00029654119425323545, + "loss": 7.0067, + "step": 6267 + }, + { + "epoch": 0.5848651674909023, + "grad_norm": 1.3292740711634283, + "learning_rate": 0.0002965395791377342, + "loss": 7.1934, + "step": 6268 + }, + { + "epoch": 0.5849584771857796, + "grad_norm": 1.5423376017898944, + "learning_rate": 0.0002965379636496261, + "loss": 7.5541, + "step": 6269 + }, + { + "epoch": 0.585051786880657, + "grad_norm": 1.0622579414242772, + "learning_rate": 0.0002965363477889155, + "loss": 7.413, + "step": 6270 + }, + { + "epoch": 0.5851450965755342, + "grad_norm": 0.6498018133454416, + "learning_rate": 0.00029653473155560623, + "loss": 7.6502, + "step": 6271 + }, + { + "epoch": 0.5852384062704115, + "grad_norm": 1.7578898281488349, + "learning_rate": 0.0002965331149497026, + "loss": 7.0566, + "step": 6272 + }, + { + "epoch": 0.5853317159652888, + "grad_norm": 1.2168307473902056, + "learning_rate": 0.0002965314979712087, + "loss": 7.0435, + "step": 6273 + }, + { + "epoch": 0.5854250256601661, + "grad_norm": 0.922135983007727, + "learning_rate": 0.00029652988062012854, + "loss": 6.9188, + "step": 6274 + }, + { + "epoch": 0.5855183353550434, + "grad_norm": 1.4768929159993112, + "learning_rate": 0.00029652826289646643, + "loss": 7.1171, + "step": 6275 + }, + { + "epoch": 0.5856116450499207, + "grad_norm": 1.4043803993497637, + "learning_rate": 0.0002965266448002262, + "loss": 7.2094, + "step": 6276 + }, + { + "epoch": 0.585704954744798, + "grad_norm": 0.5918027267460375, + "learning_rate": 0.0002965250263314122, + "loss": 6.8806, + "step": 6277 + }, + { + "epoch": 0.5857982644396753, + "grad_norm": 0.7297443061381101, + "learning_rate": 0.0002965234074900284, + "loss": 7.0525, + "step": 6278 + }, + { + "epoch": 0.5858915741345526, + "grad_norm": 0.8551558239089629, + "learning_rate": 0.000296521788276079, + "loss": 7.1102, + "step": 6279 + }, + { + "epoch": 0.5859848838294299, + "grad_norm": 0.9469363436511526, + "learning_rate": 0.0002965201686895682, + "loss": 7.2231, + "step": 6280 + }, + { + "epoch": 0.5860781935243071, + "grad_norm": 1.046790443650493, + "learning_rate": 0.0002965185487304999, + "loss": 7.4247, + "step": 6281 + }, + { + "epoch": 0.5861715032191844, + "grad_norm": 0.540472224467065, + "learning_rate": 0.00029651692839887837, + "loss": 7.152, + "step": 6282 + }, + { + "epoch": 0.5862648129140617, + "grad_norm": 0.6337346601282134, + "learning_rate": 0.00029651530769470773, + "loss": 6.9976, + "step": 6283 + }, + { + "epoch": 0.586358122608939, + "grad_norm": 0.8640619464714684, + "learning_rate": 0.000296513686617992, + "loss": 7.1781, + "step": 6284 + }, + { + "epoch": 0.5864514323038164, + "grad_norm": 1.419097437687406, + "learning_rate": 0.0002965120651687354, + "loss": 6.8934, + "step": 6285 + }, + { + "epoch": 0.5865447419986937, + "grad_norm": 0.8113635202422569, + "learning_rate": 0.000296510443346942, + "loss": 7.1959, + "step": 6286 + }, + { + "epoch": 0.586638051693571, + "grad_norm": 1.4016359853192126, + "learning_rate": 0.0002965088211526159, + "loss": 7.0454, + "step": 6287 + }, + { + "epoch": 0.5867313613884483, + "grad_norm": 1.1496953038974203, + "learning_rate": 0.00029650719858576133, + "loss": 7.0287, + "step": 6288 + }, + { + "epoch": 0.5868246710833256, + "grad_norm": 1.0580090480869373, + "learning_rate": 0.0002965055756463823, + "loss": 7.3368, + "step": 6289 + }, + { + "epoch": 0.5869179807782029, + "grad_norm": 1.523211021903563, + "learning_rate": 0.0002965039523344831, + "loss": 7.4241, + "step": 6290 + }, + { + "epoch": 0.5870112904730802, + "grad_norm": 0.7069712551173747, + "learning_rate": 0.0002965023286500676, + "loss": 7.3836, + "step": 6291 + }, + { + "epoch": 0.5871046001679574, + "grad_norm": 1.1882707989347538, + "learning_rate": 0.0002965007045931402, + "loss": 6.9707, + "step": 6292 + }, + { + "epoch": 0.5871979098628347, + "grad_norm": 0.903002530495291, + "learning_rate": 0.0002964990801637048, + "loss": 7.0529, + "step": 6293 + }, + { + "epoch": 0.587291219557712, + "grad_norm": 0.6320658152261335, + "learning_rate": 0.0002964974553617657, + "loss": 7.2052, + "step": 6294 + }, + { + "epoch": 0.5873845292525893, + "grad_norm": 1.09023711616062, + "learning_rate": 0.00029649583018732695, + "loss": 7.2562, + "step": 6295 + }, + { + "epoch": 0.5874778389474666, + "grad_norm": 1.3742178751258038, + "learning_rate": 0.0002964942046403927, + "loss": 7.1946, + "step": 6296 + }, + { + "epoch": 0.5875711486423439, + "grad_norm": 0.6131853057342037, + "learning_rate": 0.00029649257872096707, + "loss": 7.0126, + "step": 6297 + }, + { + "epoch": 0.5876644583372213, + "grad_norm": 2.414791313918829, + "learning_rate": 0.00029649095242905425, + "loss": 7.2266, + "step": 6298 + }, + { + "epoch": 0.5877577680320986, + "grad_norm": 3.2704683457729127, + "learning_rate": 0.0002964893257646583, + "loss": 7.3547, + "step": 6299 + }, + { + "epoch": 0.5878510777269759, + "grad_norm": 1.6659023661367418, + "learning_rate": 0.0002964876987277834, + "loss": 7.2078, + "step": 6300 + }, + { + "epoch": 0.5879443874218532, + "grad_norm": 1.560143692876193, + "learning_rate": 0.00029648607131843363, + "loss": 7.1971, + "step": 6301 + }, + { + "epoch": 0.5880376971167304, + "grad_norm": 2.2537362858818755, + "learning_rate": 0.0002964844435366132, + "loss": 7.0513, + "step": 6302 + }, + { + "epoch": 0.5881310068116077, + "grad_norm": 1.973914802415694, + "learning_rate": 0.00029648281538232626, + "loss": 7.5626, + "step": 6303 + }, + { + "epoch": 0.588224316506485, + "grad_norm": 0.7786779319920278, + "learning_rate": 0.0002964811868555768, + "loss": 7.3082, + "step": 6304 + }, + { + "epoch": 0.5883176262013623, + "grad_norm": 0.8131331725005064, + "learning_rate": 0.00029647955795636925, + "loss": 7.3443, + "step": 6305 + }, + { + "epoch": 0.5884109358962396, + "grad_norm": 0.9696698346295664, + "learning_rate": 0.00029647792868470744, + "loss": 7.1694, + "step": 6306 + }, + { + "epoch": 0.5885042455911169, + "grad_norm": 7.977222068865424, + "learning_rate": 0.00029647629904059567, + "loss": 7.4451, + "step": 6307 + }, + { + "epoch": 0.5885975552859942, + "grad_norm": 1.025258391723112, + "learning_rate": 0.00029647466902403807, + "loss": 7.2663, + "step": 6308 + }, + { + "epoch": 0.5886908649808715, + "grad_norm": 0.7211292179257591, + "learning_rate": 0.0002964730386350388, + "loss": 7.1271, + "step": 6309 + }, + { + "epoch": 0.5887841746757488, + "grad_norm": 15.0988250888436, + "learning_rate": 0.000296471407873602, + "loss": 7.2651, + "step": 6310 + }, + { + "epoch": 0.5888774843706261, + "grad_norm": 1.2341314788123956, + "learning_rate": 0.0002964697767397318, + "loss": 7.2641, + "step": 6311 + }, + { + "epoch": 0.5889707940655035, + "grad_norm": 29.86569555390264, + "learning_rate": 0.00029646814523343225, + "loss": 6.9841, + "step": 6312 + }, + { + "epoch": 0.5890641037603807, + "grad_norm": 7.340304298732794, + "learning_rate": 0.0002964665133547077, + "loss": 7.4389, + "step": 6313 + }, + { + "epoch": 0.589157413455258, + "grad_norm": 69.16394565487653, + "learning_rate": 0.0002964648811035621, + "loss": 7.2296, + "step": 6314 + }, + { + "epoch": 0.5892507231501353, + "grad_norm": 0.9699639371447578, + "learning_rate": 0.00029646324847999977, + "loss": 7.2324, + "step": 6315 + }, + { + "epoch": 0.5893440328450126, + "grad_norm": 210.17223062744762, + "learning_rate": 0.00029646161548402473, + "loss": 7.0493, + "step": 6316 + }, + { + "epoch": 0.5894373425398899, + "grad_norm": 923.0196442428879, + "learning_rate": 0.0002964599821156412, + "loss": 7.4805, + "step": 6317 + }, + { + "epoch": 0.5895306522347672, + "grad_norm": 1.6752332059380985, + "learning_rate": 0.0002964583483748533, + "loss": 7.6231, + "step": 6318 + }, + { + "epoch": 0.5896239619296445, + "grad_norm": 2.4225411044178133, + "learning_rate": 0.0002964567142616653, + "loss": 7.5714, + "step": 6319 + }, + { + "epoch": 0.5897172716245218, + "grad_norm": 7.4021655998984945, + "learning_rate": 0.00029645507977608113, + "loss": 8.0782, + "step": 6320 + }, + { + "epoch": 0.5898105813193991, + "grad_norm": 6.17726717158249, + "learning_rate": 0.00029645344491810514, + "loss": 8.2138, + "step": 6321 + }, + { + "epoch": 0.5899038910142764, + "grad_norm": 2.2451433095155635, + "learning_rate": 0.0002964518096877414, + "loss": 7.2835, + "step": 6322 + }, + { + "epoch": 0.5899972007091537, + "grad_norm": 4.038066701417754, + "learning_rate": 0.00029645017408499416, + "loss": 8.2133, + "step": 6323 + }, + { + "epoch": 0.5900905104040309, + "grad_norm": 3.1097512384660737, + "learning_rate": 0.0002964485381098675, + "loss": 7.5206, + "step": 6324 + }, + { + "epoch": 0.5901838200989082, + "grad_norm": 2901054842.454615, + "learning_rate": 0.00029644690176236553, + "loss": 7.7262, + "step": 6325 + }, + { + "epoch": 0.5902771297937855, + "grad_norm": 3.245984309209732, + "learning_rate": 0.0002964452650424925, + "loss": 7.6366, + "step": 6326 + }, + { + "epoch": 0.5903704394886629, + "grad_norm": 5.575604882966819, + "learning_rate": 0.00029644362795025253, + "loss": 7.3158, + "step": 6327 + }, + { + "epoch": 0.5904637491835402, + "grad_norm": 1.8353615327814325, + "learning_rate": 0.00029644199048564976, + "loss": 7.5858, + "step": 6328 + }, + { + "epoch": 0.5905570588784175, + "grad_norm": 2.7438271811307477, + "learning_rate": 0.00029644035264868844, + "loss": 7.5845, + "step": 6329 + }, + { + "epoch": 0.5906503685732948, + "grad_norm": 7.2321385325612795, + "learning_rate": 0.0002964387144393726, + "loss": 7.7792, + "step": 6330 + }, + { + "epoch": 0.5907436782681721, + "grad_norm": 2.4129442682389493, + "learning_rate": 0.0002964370758577066, + "loss": 7.6081, + "step": 6331 + }, + { + "epoch": 0.5908369879630494, + "grad_norm": 3.8035679474785904, + "learning_rate": 0.00029643543690369443, + "loss": 7.6771, + "step": 6332 + }, + { + "epoch": 0.5909302976579267, + "grad_norm": 3.7992586339713155, + "learning_rate": 0.00029643379757734035, + "loss": 7.4534, + "step": 6333 + }, + { + "epoch": 0.5910236073528039, + "grad_norm": 2.3513673052488713, + "learning_rate": 0.0002964321578786485, + "loss": 7.7721, + "step": 6334 + }, + { + "epoch": 0.5911169170476812, + "grad_norm": 2.4503392999390905, + "learning_rate": 0.00029643051780762304, + "loss": 7.2108, + "step": 6335 + }, + { + "epoch": 0.5912102267425585, + "grad_norm": 1.387679053514568, + "learning_rate": 0.0002964288773642681, + "loss": 7.376, + "step": 6336 + }, + { + "epoch": 0.5913035364374358, + "grad_norm": 4.005367634923926, + "learning_rate": 0.00029642723654858794, + "loss": 7.2874, + "step": 6337 + }, + { + "epoch": 0.5913968461323131, + "grad_norm": 7.2674890574525195, + "learning_rate": 0.0002964255953605867, + "loss": 7.4349, + "step": 6338 + }, + { + "epoch": 0.5914901558271904, + "grad_norm": 5.655722915242241, + "learning_rate": 0.00029642395380026856, + "loss": 7.6201, + "step": 6339 + }, + { + "epoch": 0.5915834655220678, + "grad_norm": 3.733663878129182, + "learning_rate": 0.0002964223118676377, + "loss": 7.801, + "step": 6340 + }, + { + "epoch": 0.5916767752169451, + "grad_norm": 2.4147549776563597, + "learning_rate": 0.0002964206695626982, + "loss": 7.7584, + "step": 6341 + }, + { + "epoch": 0.5917700849118224, + "grad_norm": 36.68407010026657, + "learning_rate": 0.00029641902688545433, + "loss": 7.5864, + "step": 6342 + }, + { + "epoch": 0.5918633946066997, + "grad_norm": 2.3219986388422953, + "learning_rate": 0.00029641738383591026, + "loss": 7.7554, + "step": 6343 + }, + { + "epoch": 0.591956704301577, + "grad_norm": 4.10299465232237, + "learning_rate": 0.0002964157404140702, + "loss": 7.8822, + "step": 6344 + }, + { + "epoch": 0.5920500139964542, + "grad_norm": 3.840764859691267, + "learning_rate": 0.00029641409661993823, + "loss": 7.6017, + "step": 6345 + }, + { + "epoch": 0.5921433236913315, + "grad_norm": 2.267808217025745, + "learning_rate": 0.0002964124524535186, + "loss": 7.4165, + "step": 6346 + }, + { + "epoch": 0.5922366333862088, + "grad_norm": 1.2521381063875179, + "learning_rate": 0.00029641080791481544, + "loss": 7.4985, + "step": 6347 + }, + { + "epoch": 0.5923299430810861, + "grad_norm": 1.3933779093160819, + "learning_rate": 0.000296409163003833, + "loss": 7.4746, + "step": 6348 + }, + { + "epoch": 0.5924232527759634, + "grad_norm": 23.813155495733156, + "learning_rate": 0.00029640751772057545, + "loss": 7.6136, + "step": 6349 + }, + { + "epoch": 0.5925165624708407, + "grad_norm": 4.225302002294734, + "learning_rate": 0.00029640587206504693, + "loss": 7.9064, + "step": 6350 + }, + { + "epoch": 0.592609872165718, + "grad_norm": 4.523486161419635, + "learning_rate": 0.00029640422603725164, + "loss": 7.6648, + "step": 6351 + }, + { + "epoch": 0.5927031818605953, + "grad_norm": 2.1367472670234147, + "learning_rate": 0.00029640257963719374, + "loss": 7.2673, + "step": 6352 + }, + { + "epoch": 0.5927964915554726, + "grad_norm": 1.6843957372055014, + "learning_rate": 0.0002964009328648775, + "loss": 7.3206, + "step": 6353 + }, + { + "epoch": 0.59288980125035, + "grad_norm": 1.994996377215416, + "learning_rate": 0.00029639928572030704, + "loss": 7.5828, + "step": 6354 + }, + { + "epoch": 0.5929831109452272, + "grad_norm": 3.451030077396634, + "learning_rate": 0.00029639763820348653, + "loss": 7.6123, + "step": 6355 + }, + { + "epoch": 0.5930764206401045, + "grad_norm": 1.9934887773850365, + "learning_rate": 0.00029639599031442023, + "loss": 7.7266, + "step": 6356 + }, + { + "epoch": 0.5931697303349818, + "grad_norm": 3.2727161401640066, + "learning_rate": 0.0002963943420531123, + "loss": 7.2023, + "step": 6357 + }, + { + "epoch": 0.5932630400298591, + "grad_norm": 1.8339260113059377, + "learning_rate": 0.00029639269341956686, + "loss": 7.5081, + "step": 6358 + }, + { + "epoch": 0.5933563497247364, + "grad_norm": 4.028846560270093, + "learning_rate": 0.0002963910444137882, + "loss": 7.8489, + "step": 6359 + }, + { + "epoch": 0.5934496594196137, + "grad_norm": 6.6398176234517665, + "learning_rate": 0.0002963893950357805, + "loss": 7.3675, + "step": 6360 + }, + { + "epoch": 0.593542969114491, + "grad_norm": 7.836926906681715, + "learning_rate": 0.00029638774528554796, + "loss": 7.0258, + "step": 6361 + }, + { + "epoch": 0.5936362788093683, + "grad_norm": 5.04014215631767, + "learning_rate": 0.0002963860951630947, + "loss": 7.4641, + "step": 6362 + }, + { + "epoch": 0.5937295885042456, + "grad_norm": 4.858064562065806, + "learning_rate": 0.0002963844446684249, + "loss": 7.4996, + "step": 6363 + }, + { + "epoch": 0.5938228981991229, + "grad_norm": 6.85749284499751, + "learning_rate": 0.0002963827938015429, + "loss": 7.3784, + "step": 6364 + }, + { + "epoch": 0.5939162078940002, + "grad_norm": 3.0636002801203905, + "learning_rate": 0.0002963811425624529, + "loss": 7.504, + "step": 6365 + }, + { + "epoch": 0.5940095175888774, + "grad_norm": 1.7582471884731568, + "learning_rate": 0.0002963794909511589, + "loss": 7.4037, + "step": 6366 + }, + { + "epoch": 0.5941028272837547, + "grad_norm": 1.265247999713027, + "learning_rate": 0.0002963778389676652, + "loss": 7.5052, + "step": 6367 + }, + { + "epoch": 0.594196136978632, + "grad_norm": 1.2653987758929235, + "learning_rate": 0.0002963761866119761, + "loss": 7.5948, + "step": 6368 + }, + { + "epoch": 0.5942894466735094, + "grad_norm": 6.047364521393459, + "learning_rate": 0.00029637453388409564, + "loss": 7.2877, + "step": 6369 + }, + { + "epoch": 0.5943827563683867, + "grad_norm": 3.1212796637069964, + "learning_rate": 0.00029637288078402816, + "loss": 7.5077, + "step": 6370 + }, + { + "epoch": 0.594476066063264, + "grad_norm": 1.2891436872174185, + "learning_rate": 0.00029637122731177777, + "loss": 7.4565, + "step": 6371 + }, + { + "epoch": 0.5945693757581413, + "grad_norm": 2.549590436696176, + "learning_rate": 0.0002963695734673487, + "loss": 7.5715, + "step": 6372 + }, + { + "epoch": 0.5946626854530186, + "grad_norm": 2.171658818877086, + "learning_rate": 0.0002963679192507452, + "loss": 7.4384, + "step": 6373 + }, + { + "epoch": 0.5947559951478959, + "grad_norm": 6.382481931841194, + "learning_rate": 0.0002963662646619714, + "loss": 7.2937, + "step": 6374 + }, + { + "epoch": 0.5948493048427732, + "grad_norm": 1.2744682717707227, + "learning_rate": 0.00029636460970103155, + "loss": 7.3386, + "step": 6375 + }, + { + "epoch": 0.5949426145376505, + "grad_norm": 2.0634196885110234, + "learning_rate": 0.0002963629543679299, + "loss": 7.542, + "step": 6376 + }, + { + "epoch": 0.5950359242325277, + "grad_norm": 1.0402898627964188, + "learning_rate": 0.0002963612986626705, + "loss": 7.4106, + "step": 6377 + }, + { + "epoch": 0.595129233927405, + "grad_norm": 1.8963767154940498, + "learning_rate": 0.0002963596425852578, + "loss": 7.1981, + "step": 6378 + }, + { + "epoch": 0.5952225436222823, + "grad_norm": 29.31764475045633, + "learning_rate": 0.0002963579861356958, + "loss": 7.1677, + "step": 6379 + }, + { + "epoch": 0.5953158533171596, + "grad_norm": 13.599967908781869, + "learning_rate": 0.00029635632931398885, + "loss": 7.3822, + "step": 6380 + }, + { + "epoch": 0.595409163012037, + "grad_norm": 2.8024673182866437, + "learning_rate": 0.0002963546721201411, + "loss": 7.7066, + "step": 6381 + }, + { + "epoch": 0.5955024727069143, + "grad_norm": 29.469798648031496, + "learning_rate": 0.0002963530145541567, + "loss": 7.2885, + "step": 6382 + }, + { + "epoch": 0.5955957824017916, + "grad_norm": 2.897009674220351, + "learning_rate": 0.00029635135661604, + "loss": 7.5394, + "step": 6383 + }, + { + "epoch": 0.5956890920966689, + "grad_norm": 1.41724590036068, + "learning_rate": 0.0002963496983057952, + "loss": 7.2045, + "step": 6384 + }, + { + "epoch": 0.5957824017915462, + "grad_norm": 0.8355555392090399, + "learning_rate": 0.0002963480396234264, + "loss": 7.4213, + "step": 6385 + }, + { + "epoch": 0.5958757114864235, + "grad_norm": 0.8399706895457019, + "learning_rate": 0.0002963463805689379, + "loss": 7.2966, + "step": 6386 + }, + { + "epoch": 0.5959690211813007, + "grad_norm": 37.062092633584776, + "learning_rate": 0.0002963447211423339, + "loss": 7.6863, + "step": 6387 + }, + { + "epoch": 0.596062330876178, + "grad_norm": 23.397555917431244, + "learning_rate": 0.0002963430613436186, + "loss": 7.1944, + "step": 6388 + }, + { + "epoch": 0.5961556405710553, + "grad_norm": 1.1664584135598743, + "learning_rate": 0.00029634140117279636, + "loss": 7.1519, + "step": 6389 + }, + { + "epoch": 0.5962489502659326, + "grad_norm": 1.8166929736495765, + "learning_rate": 0.0002963397406298712, + "loss": 7.2577, + "step": 6390 + }, + { + "epoch": 0.5963422599608099, + "grad_norm": 1.5229076149282978, + "learning_rate": 0.0002963380797148474, + "loss": 7.5047, + "step": 6391 + }, + { + "epoch": 0.5964355696556872, + "grad_norm": 2.102757267112403, + "learning_rate": 0.0002963364184277293, + "loss": 7.307, + "step": 6392 + }, + { + "epoch": 0.5965288793505645, + "grad_norm": 0.6626434666438457, + "learning_rate": 0.00029633475676852094, + "loss": 7.3118, + "step": 6393 + }, + { + "epoch": 0.5966221890454418, + "grad_norm": 2.5269498096892637, + "learning_rate": 0.00029633309473722666, + "loss": 7.4207, + "step": 6394 + }, + { + "epoch": 0.5967154987403192, + "grad_norm": 1.799602072975306, + "learning_rate": 0.0002963314323338507, + "loss": 7.8435, + "step": 6395 + }, + { + "epoch": 0.5968088084351965, + "grad_norm": 6.606993339753778, + "learning_rate": 0.0002963297695583972, + "loss": 7.6907, + "step": 6396 + }, + { + "epoch": 0.5969021181300738, + "grad_norm": 1.2699850598024576, + "learning_rate": 0.00029632810641087057, + "loss": 7.526, + "step": 6397 + }, + { + "epoch": 0.596995427824951, + "grad_norm": 2.243309882282919, + "learning_rate": 0.0002963264428912748, + "loss": 7.2572, + "step": 6398 + }, + { + "epoch": 0.5970887375198283, + "grad_norm": 1.6833659445778684, + "learning_rate": 0.00029632477899961426, + "loss": 7.108, + "step": 6399 + }, + { + "epoch": 0.5971820472147056, + "grad_norm": 2.1118837122151333, + "learning_rate": 0.0002963231147358932, + "loss": 7.1909, + "step": 6400 + }, + { + "epoch": 0.5972753569095829, + "grad_norm": 2.621421773953814, + "learning_rate": 0.0002963214501001157, + "loss": 7.2606, + "step": 6401 + }, + { + "epoch": 0.5973686666044602, + "grad_norm": 1.1704393453895978, + "learning_rate": 0.0002963197850922862, + "loss": 7.3477, + "step": 6402 + }, + { + "epoch": 0.5974619762993375, + "grad_norm": 1.748919540651312, + "learning_rate": 0.0002963181197124088, + "loss": 7.0079, + "step": 6403 + }, + { + "epoch": 0.5975552859942148, + "grad_norm": 2.0017265797611885, + "learning_rate": 0.00029631645396048766, + "loss": 7.3022, + "step": 6404 + }, + { + "epoch": 0.5976485956890921, + "grad_norm": 2.0637350472849, + "learning_rate": 0.00029631478783652724, + "loss": 7.1775, + "step": 6405 + }, + { + "epoch": 0.5977419053839694, + "grad_norm": 0.9209906906049228, + "learning_rate": 0.00029631312134053157, + "loss": 7.286, + "step": 6406 + }, + { + "epoch": 0.5978352150788467, + "grad_norm": 2.5944585133993225, + "learning_rate": 0.00029631145447250504, + "loss": 7.7563, + "step": 6407 + }, + { + "epoch": 0.5979285247737239, + "grad_norm": 1.4297677255511492, + "learning_rate": 0.0002963097872324518, + "loss": 7.0223, + "step": 6408 + }, + { + "epoch": 0.5980218344686012, + "grad_norm": 0.8603692892294059, + "learning_rate": 0.0002963081196203761, + "loss": 7.2976, + "step": 6409 + }, + { + "epoch": 0.5981151441634786, + "grad_norm": 3.3299824255884474, + "learning_rate": 0.0002963064516362822, + "loss": 7.2317, + "step": 6410 + }, + { + "epoch": 0.5982084538583559, + "grad_norm": 17.64917382656928, + "learning_rate": 0.0002963047832801744, + "loss": 7.4904, + "step": 6411 + }, + { + "epoch": 0.5983017635532332, + "grad_norm": 3.637512380179204, + "learning_rate": 0.0002963031145520568, + "loss": 7.4577, + "step": 6412 + }, + { + "epoch": 0.5983950732481105, + "grad_norm": 2.164756970242436, + "learning_rate": 0.0002963014454519337, + "loss": 7.6619, + "step": 6413 + }, + { + "epoch": 0.5984883829429878, + "grad_norm": 6.350730407272721, + "learning_rate": 0.0002962997759798094, + "loss": 7.1145, + "step": 6414 + }, + { + "epoch": 0.5985816926378651, + "grad_norm": 1.1358497507111551, + "learning_rate": 0.00029629810613568814, + "loss": 7.4789, + "step": 6415 + }, + { + "epoch": 0.5986750023327424, + "grad_norm": 58.53658649429999, + "learning_rate": 0.0002962964359195741, + "loss": 7.1748, + "step": 6416 + }, + { + "epoch": 0.5987683120276197, + "grad_norm": 9.820065635792808, + "learning_rate": 0.00029629476533147154, + "loss": 7.5667, + "step": 6417 + }, + { + "epoch": 0.598861621722497, + "grad_norm": 1.7134927032976726, + "learning_rate": 0.0002962930943713847, + "loss": 7.2359, + "step": 6418 + }, + { + "epoch": 0.5989549314173742, + "grad_norm": 1.3216125173746995, + "learning_rate": 0.0002962914230393179, + "loss": 7.4687, + "step": 6419 + }, + { + "epoch": 0.5990482411122515, + "grad_norm": 2.018751419182759, + "learning_rate": 0.0002962897513352754, + "loss": 7.5382, + "step": 6420 + }, + { + "epoch": 0.5991415508071288, + "grad_norm": 0.7777952640389758, + "learning_rate": 0.00029628807925926134, + "loss": 7.085, + "step": 6421 + }, + { + "epoch": 0.5992348605020061, + "grad_norm": 1.0987613319467022, + "learning_rate": 0.00029628640681127996, + "loss": 7.0519, + "step": 6422 + }, + { + "epoch": 0.5993281701968834, + "grad_norm": 62.29141477966852, + "learning_rate": 0.0002962847339913357, + "loss": 7.1849, + "step": 6423 + }, + { + "epoch": 0.5994214798917608, + "grad_norm": 0.9558498328175176, + "learning_rate": 0.00029628306079943265, + "loss": 7.1342, + "step": 6424 + }, + { + "epoch": 0.5995147895866381, + "grad_norm": 1.7648393157789704, + "learning_rate": 0.00029628138723557506, + "loss": 7.2969, + "step": 6425 + }, + { + "epoch": 0.5996080992815154, + "grad_norm": 154.17650148498757, + "learning_rate": 0.0002962797132997673, + "loss": 7.2612, + "step": 6426 + }, + { + "epoch": 0.5997014089763927, + "grad_norm": 0.6006471515066886, + "learning_rate": 0.00029627803899201353, + "loss": 7.2695, + "step": 6427 + }, + { + "epoch": 0.59979471867127, + "grad_norm": 2.085018591008417, + "learning_rate": 0.00029627636431231805, + "loss": 7.2562, + "step": 6428 + }, + { + "epoch": 0.5998880283661473, + "grad_norm": 51.59246193120887, + "learning_rate": 0.00029627468926068506, + "loss": 7.0926, + "step": 6429 + }, + { + "epoch": 0.5999813380610245, + "grad_norm": 2.1187156947290804, + "learning_rate": 0.0002962730138371189, + "loss": 7.5314, + "step": 6430 + }, + { + "epoch": 0.6000746477559018, + "grad_norm": 1.9834181635929193, + "learning_rate": 0.0002962713380416238, + "loss": 7.1204, + "step": 6431 + }, + { + "epoch": 0.6001679574507791, + "grad_norm": 1.0801795492098758, + "learning_rate": 0.00029626966187420404, + "loss": 7.2052, + "step": 6432 + }, + { + "epoch": 0.6002612671456564, + "grad_norm": 2.1745363359163483, + "learning_rate": 0.00029626798533486385, + "loss": 6.9452, + "step": 6433 + }, + { + "epoch": 0.6003545768405337, + "grad_norm": 1.8166025701919504, + "learning_rate": 0.00029626630842360745, + "loss": 6.998, + "step": 6434 + }, + { + "epoch": 0.600447886535411, + "grad_norm": 1.493996430597873, + "learning_rate": 0.00029626463114043925, + "loss": 7.255, + "step": 6435 + }, + { + "epoch": 0.6005411962302883, + "grad_norm": 1.4851316340730896, + "learning_rate": 0.00029626295348536335, + "loss": 7.1562, + "step": 6436 + }, + { + "epoch": 0.6006345059251657, + "grad_norm": 3.2578398363485874, + "learning_rate": 0.0002962612754583841, + "loss": 7.1228, + "step": 6437 + }, + { + "epoch": 0.600727815620043, + "grad_norm": 4.1222370800094605, + "learning_rate": 0.00029625959705950573, + "loss": 7.3077, + "step": 6438 + }, + { + "epoch": 0.6008211253149203, + "grad_norm": 86.37286774436791, + "learning_rate": 0.00029625791828873256, + "loss": 6.765, + "step": 6439 + }, + { + "epoch": 0.6009144350097975, + "grad_norm": 1.5842080712750715, + "learning_rate": 0.0002962562391460688, + "loss": 6.8582, + "step": 6440 + }, + { + "epoch": 0.6010077447046748, + "grad_norm": 41.90941864152565, + "learning_rate": 0.0002962545596315187, + "loss": 7.4094, + "step": 6441 + }, + { + "epoch": 0.6011010543995521, + "grad_norm": 1.2057860315179878, + "learning_rate": 0.0002962528797450867, + "loss": 7.5734, + "step": 6442 + }, + { + "epoch": 0.6011943640944294, + "grad_norm": 196.95220559144852, + "learning_rate": 0.0002962511994867769, + "loss": 7.4946, + "step": 6443 + }, + { + "epoch": 0.6012876737893067, + "grad_norm": 0.6444188286298232, + "learning_rate": 0.0002962495188565936, + "loss": 7.0936, + "step": 6444 + }, + { + "epoch": 0.601380983484184, + "grad_norm": 2.2584064758960736, + "learning_rate": 0.0002962478378545411, + "loss": 6.8696, + "step": 6445 + }, + { + "epoch": 0.6014742931790613, + "grad_norm": 332.9270086164168, + "learning_rate": 0.00029624615648062366, + "loss": 7.1302, + "step": 6446 + }, + { + "epoch": 0.6015676028739386, + "grad_norm": 0.8669936517533675, + "learning_rate": 0.00029624447473484565, + "loss": 7.1484, + "step": 6447 + }, + { + "epoch": 0.6016609125688159, + "grad_norm": 1.7587921783820493, + "learning_rate": 0.0002962427926172112, + "loss": 7.2317, + "step": 6448 + }, + { + "epoch": 0.6017542222636932, + "grad_norm": 1.0671638784643502, + "learning_rate": 0.0002962411101277246, + "loss": 7.2486, + "step": 6449 + }, + { + "epoch": 0.6018475319585705, + "grad_norm": 0.5749860160679126, + "learning_rate": 0.00029623942726639024, + "loss": 7.3321, + "step": 6450 + }, + { + "epoch": 0.6019408416534477, + "grad_norm": 1.4112383344280544, + "learning_rate": 0.00029623774403321235, + "loss": 7.2157, + "step": 6451 + }, + { + "epoch": 0.602034151348325, + "grad_norm": 4.566935176924814, + "learning_rate": 0.0002962360604281951, + "loss": 6.7399, + "step": 6452 + }, + { + "epoch": 0.6021274610432024, + "grad_norm": 3.036296553942203, + "learning_rate": 0.00029623437645134295, + "loss": 6.9769, + "step": 6453 + }, + { + "epoch": 0.6022207707380797, + "grad_norm": 3.151028702143228, + "learning_rate": 0.00029623269210266013, + "loss": 7.4671, + "step": 6454 + }, + { + "epoch": 0.602314080432957, + "grad_norm": 1.3573946366824279, + "learning_rate": 0.0002962310073821508, + "loss": 7.0343, + "step": 6455 + }, + { + "epoch": 0.6024073901278343, + "grad_norm": 1.2160575292998292, + "learning_rate": 0.00029622932228981936, + "loss": 7.2186, + "step": 6456 + }, + { + "epoch": 0.6025006998227116, + "grad_norm": 1.2916207788984622, + "learning_rate": 0.0002962276368256701, + "loss": 7.326, + "step": 6457 + }, + { + "epoch": 0.6025940095175889, + "grad_norm": 41.59726193611959, + "learning_rate": 0.0002962259509897073, + "loss": 7.0022, + "step": 6458 + }, + { + "epoch": 0.6026873192124662, + "grad_norm": 1.4536287247618516, + "learning_rate": 0.00029622426478193514, + "loss": 7.261, + "step": 6459 + }, + { + "epoch": 0.6027806289073435, + "grad_norm": 2.0723661891375675, + "learning_rate": 0.0002962225782023581, + "loss": 7.0381, + "step": 6460 + }, + { + "epoch": 0.6028739386022207, + "grad_norm": 150.43640909110113, + "learning_rate": 0.0002962208912509802, + "loss": 7.111, + "step": 6461 + }, + { + "epoch": 0.602967248297098, + "grad_norm": 0.9724864364477286, + "learning_rate": 0.00029621920392780603, + "loss": 7.337, + "step": 6462 + }, + { + "epoch": 0.6030605579919753, + "grad_norm": 542.4969035998255, + "learning_rate": 0.00029621751623283967, + "loss": 7.085, + "step": 6463 + }, + { + "epoch": 0.6031538676868526, + "grad_norm": 1.5275170239102673, + "learning_rate": 0.0002962158281660855, + "loss": 7.0707, + "step": 6464 + }, + { + "epoch": 0.60324717738173, + "grad_norm": 1.6349688320053302, + "learning_rate": 0.00029621413972754783, + "loss": 7.3064, + "step": 6465 + }, + { + "epoch": 0.6033404870766073, + "grad_norm": 0.6502719374053829, + "learning_rate": 0.0002962124509172308, + "loss": 7.1714, + "step": 6466 + }, + { + "epoch": 0.6034337967714846, + "grad_norm": 1.0891296046175654, + "learning_rate": 0.0002962107617351389, + "loss": 6.8688, + "step": 6467 + }, + { + "epoch": 0.6035271064663619, + "grad_norm": 523.4727939344501, + "learning_rate": 0.00029620907218127633, + "loss": 6.9915, + "step": 6468 + }, + { + "epoch": 0.6036204161612392, + "grad_norm": 0.671870347708053, + "learning_rate": 0.00029620738225564743, + "loss": 7.2292, + "step": 6469 + }, + { + "epoch": 0.6037137258561165, + "grad_norm": 1.1231514880350786, + "learning_rate": 0.00029620569195825644, + "loss": 7.4264, + "step": 6470 + }, + { + "epoch": 0.6038070355509938, + "grad_norm": 1.2192280351033145, + "learning_rate": 0.0002962040012891077, + "loss": 6.7937, + "step": 6471 + }, + { + "epoch": 0.603900345245871, + "grad_norm": 1.4720889709413385, + "learning_rate": 0.00029620231024820546, + "loss": 7.5579, + "step": 6472 + }, + { + "epoch": 0.6039936549407483, + "grad_norm": 0.5778733400330097, + "learning_rate": 0.0002962006188355541, + "loss": 7.2012, + "step": 6473 + }, + { + "epoch": 0.6040869646356256, + "grad_norm": 68.21459359771376, + "learning_rate": 0.0002961989270511578, + "loss": 7.0039, + "step": 6474 + }, + { + "epoch": 0.6041802743305029, + "grad_norm": 1.0713112150821307, + "learning_rate": 0.000296197234895021, + "loss": 7.3199, + "step": 6475 + }, + { + "epoch": 0.6042735840253802, + "grad_norm": 2.7867077958216053, + "learning_rate": 0.0002961955423671479, + "loss": 7.1495, + "step": 6476 + }, + { + "epoch": 0.6043668937202575, + "grad_norm": 1.7689135871818567, + "learning_rate": 0.0002961938494675429, + "loss": 6.8104, + "step": 6477 + }, + { + "epoch": 0.6044602034151348, + "grad_norm": 0.7758399991691133, + "learning_rate": 0.0002961921561962102, + "loss": 6.872, + "step": 6478 + }, + { + "epoch": 0.6045535131100122, + "grad_norm": 0.9571464023523414, + "learning_rate": 0.0002961904625531542, + "loss": 7.3771, + "step": 6479 + }, + { + "epoch": 0.6046468228048895, + "grad_norm": 2.0222532104894175, + "learning_rate": 0.00029618876853837906, + "loss": 7.0629, + "step": 6480 + }, + { + "epoch": 0.6047401324997668, + "grad_norm": 0.9585450418325825, + "learning_rate": 0.0002961870741518893, + "loss": 7.258, + "step": 6481 + }, + { + "epoch": 0.6048334421946441, + "grad_norm": 1432.354184956763, + "learning_rate": 0.000296185379393689, + "loss": 6.8094, + "step": 6482 + }, + { + "epoch": 0.6049267518895213, + "grad_norm": 421.9171395761489, + "learning_rate": 0.00029618368426378267, + "loss": 6.974, + "step": 6483 + }, + { + "epoch": 0.6050200615843986, + "grad_norm": 1.2902809035657887, + "learning_rate": 0.00029618198876217453, + "loss": 7.1703, + "step": 6484 + }, + { + "epoch": 0.6051133712792759, + "grad_norm": 1.5548866148899048, + "learning_rate": 0.0002961802928888688, + "loss": 7.2721, + "step": 6485 + }, + { + "epoch": 0.6052066809741532, + "grad_norm": 2.116380654392, + "learning_rate": 0.00029617859664386997, + "loss": 6.9659, + "step": 6486 + }, + { + "epoch": 0.6052999906690305, + "grad_norm": 1.0049564242212286, + "learning_rate": 0.0002961769000271823, + "loss": 6.7916, + "step": 6487 + }, + { + "epoch": 0.6053933003639078, + "grad_norm": 108.47611961541479, + "learning_rate": 0.00029617520303880997, + "loss": 7.1603, + "step": 6488 + }, + { + "epoch": 0.6054866100587851, + "grad_norm": 1.689198589386683, + "learning_rate": 0.0002961735056787575, + "loss": 7.1398, + "step": 6489 + }, + { + "epoch": 0.6055799197536624, + "grad_norm": 0.991268684922997, + "learning_rate": 0.00029617180794702907, + "loss": 7.2207, + "step": 6490 + }, + { + "epoch": 0.6056732294485397, + "grad_norm": 0.9975246798090388, + "learning_rate": 0.00029617010984362904, + "loss": 6.8515, + "step": 6491 + }, + { + "epoch": 0.605766539143417, + "grad_norm": 1.6702860852221124, + "learning_rate": 0.0002961684113685617, + "loss": 6.9599, + "step": 6492 + }, + { + "epoch": 0.6058598488382942, + "grad_norm": 1.0567570907876604, + "learning_rate": 0.00029616671252183135, + "loss": 6.826, + "step": 6493 + }, + { + "epoch": 0.6059531585331716, + "grad_norm": 0.707380282777123, + "learning_rate": 0.0002961650133034424, + "loss": 7.115, + "step": 6494 + }, + { + "epoch": 0.6060464682280489, + "grad_norm": 0.7737796329371074, + "learning_rate": 0.0002961633137133991, + "loss": 7.4618, + "step": 6495 + }, + { + "epoch": 0.6061397779229262, + "grad_norm": 1.0341014593247184, + "learning_rate": 0.00029616161375170584, + "loss": 7.5386, + "step": 6496 + }, + { + "epoch": 0.6062330876178035, + "grad_norm": 0.5878415984560483, + "learning_rate": 0.00029615991341836687, + "loss": 7.2634, + "step": 6497 + }, + { + "epoch": 0.6063263973126808, + "grad_norm": 0.9073010626873886, + "learning_rate": 0.00029615821271338654, + "loss": 7.1181, + "step": 6498 + }, + { + "epoch": 0.6064197070075581, + "grad_norm": 0.945872029832418, + "learning_rate": 0.00029615651163676914, + "loss": 7.1644, + "step": 6499 + }, + { + "epoch": 0.6065130167024354, + "grad_norm": 1.1289467340685784, + "learning_rate": 0.00029615481018851905, + "loss": 7.3402, + "step": 6500 + }, + { + "epoch": 0.6066063263973127, + "grad_norm": 1.0147499988508648, + "learning_rate": 0.00029615310836864057, + "loss": 7.0538, + "step": 6501 + }, + { + "epoch": 0.60669963609219, + "grad_norm": 54.40880665194085, + "learning_rate": 0.000296151406177138, + "loss": 7.4854, + "step": 6502 + }, + { + "epoch": 0.6067929457870673, + "grad_norm": 1.178890526584183, + "learning_rate": 0.0002961497036140157, + "loss": 7.1321, + "step": 6503 + }, + { + "epoch": 0.6068862554819445, + "grad_norm": 1.3010732514269705, + "learning_rate": 0.000296148000679278, + "loss": 7.247, + "step": 6504 + }, + { + "epoch": 0.6069795651768218, + "grad_norm": 0.7935940837507428, + "learning_rate": 0.00029614629737292924, + "loss": 7.0269, + "step": 6505 + }, + { + "epoch": 0.6070728748716991, + "grad_norm": 7476.830285673273, + "learning_rate": 0.00029614459369497375, + "loss": 7.2559, + "step": 6506 + }, + { + "epoch": 0.6071661845665765, + "grad_norm": 1846.691392849108, + "learning_rate": 0.0002961428896454158, + "loss": 7.3028, + "step": 6507 + }, + { + "epoch": 0.6072594942614538, + "grad_norm": 1.3221630229209302, + "learning_rate": 0.00029614118522425985, + "loss": 7.1116, + "step": 6508 + }, + { + "epoch": 0.6073528039563311, + "grad_norm": 1.412587312015002, + "learning_rate": 0.0002961394804315101, + "loss": 7.4241, + "step": 6509 + }, + { + "epoch": 0.6074461136512084, + "grad_norm": 1.141078787475211, + "learning_rate": 0.00029613777526717096, + "loss": 7.0006, + "step": 6510 + }, + { + "epoch": 0.6075394233460857, + "grad_norm": 0.6976171320851028, + "learning_rate": 0.0002961360697312468, + "loss": 7.3607, + "step": 6511 + }, + { + "epoch": 0.607632733040963, + "grad_norm": 1.3167371250678255, + "learning_rate": 0.0002961343638237418, + "loss": 6.993, + "step": 6512 + }, + { + "epoch": 0.6077260427358403, + "grad_norm": 0.5158259160077733, + "learning_rate": 0.0002961326575446604, + "loss": 7.127, + "step": 6513 + }, + { + "epoch": 0.6078193524307175, + "grad_norm": 0.9547772387835662, + "learning_rate": 0.000296130950894007, + "loss": 6.724, + "step": 6514 + }, + { + "epoch": 0.6079126621255948, + "grad_norm": 0.8746920401244932, + "learning_rate": 0.0002961292438717859, + "loss": 7.2637, + "step": 6515 + }, + { + "epoch": 0.6080059718204721, + "grad_norm": 0.47098544812407944, + "learning_rate": 0.0002961275364780013, + "loss": 7.1298, + "step": 6516 + }, + { + "epoch": 0.6080992815153494, + "grad_norm": 2055.438982069114, + "learning_rate": 0.0002961258287126578, + "loss": 7.1384, + "step": 6517 + }, + { + "epoch": 0.6081925912102267, + "grad_norm": 5003.948212275166, + "learning_rate": 0.00029612412057575954, + "loss": 7.1272, + "step": 6518 + }, + { + "epoch": 0.608285900905104, + "grad_norm": 0.7828861195058189, + "learning_rate": 0.00029612241206731095, + "loss": 7.5142, + "step": 6519 + }, + { + "epoch": 0.6083792105999813, + "grad_norm": 1.2029165934097308, + "learning_rate": 0.0002961207031873163, + "loss": 6.9549, + "step": 6520 + }, + { + "epoch": 0.6084725202948587, + "grad_norm": 1.8125179353046557, + "learning_rate": 0.0002961189939357801, + "loss": 6.8158, + "step": 6521 + }, + { + "epoch": 0.608565829989736, + "grad_norm": 2703.3179632406736, + "learning_rate": 0.00029611728431270645, + "loss": 7.0901, + "step": 6522 + }, + { + "epoch": 0.6086591396846133, + "grad_norm": 0.9563053837449894, + "learning_rate": 0.00029611557431809987, + "loss": 7.2418, + "step": 6523 + }, + { + "epoch": 0.6087524493794906, + "grad_norm": 6114.624285633296, + "learning_rate": 0.0002961138639519647, + "loss": 7.2785, + "step": 6524 + }, + { + "epoch": 0.6088457590743678, + "grad_norm": 2.765997940990585, + "learning_rate": 0.00029611215321430524, + "loss": 7.1044, + "step": 6525 + }, + { + "epoch": 0.6089390687692451, + "grad_norm": 7.5940851711140995, + "learning_rate": 0.0002961104421051258, + "loss": 7.3185, + "step": 6526 + }, + { + "epoch": 0.6090323784641224, + "grad_norm": 5.3171521948312535, + "learning_rate": 0.00029610873062443086, + "loss": 6.8276, + "step": 6527 + }, + { + "epoch": 0.6091256881589997, + "grad_norm": 2.329274437033004, + "learning_rate": 0.0002961070187722247, + "loss": 7.4849, + "step": 6528 + }, + { + "epoch": 0.609218997853877, + "grad_norm": 21849.63236223329, + "learning_rate": 0.00029610530654851163, + "loss": 7.1257, + "step": 6529 + }, + { + "epoch": 0.6093123075487543, + "grad_norm": 1.7198083269029247, + "learning_rate": 0.0002961035939532961, + "loss": 7.3936, + "step": 6530 + }, + { + "epoch": 0.6094056172436316, + "grad_norm": 2.246853617092913, + "learning_rate": 0.00029610188098658235, + "loss": 7.5061, + "step": 6531 + }, + { + "epoch": 0.6094989269385089, + "grad_norm": 3.3974210251507455, + "learning_rate": 0.0002961001676483748, + "loss": 7.6188, + "step": 6532 + }, + { + "epoch": 0.6095922366333862, + "grad_norm": 9.465711487752877, + "learning_rate": 0.00029609845393867784, + "loss": 7.7092, + "step": 6533 + }, + { + "epoch": 0.6096855463282636, + "grad_norm": 4.139855410703897, + "learning_rate": 0.0002960967398574958, + "loss": 7.4216, + "step": 6534 + }, + { + "epoch": 0.6097788560231409, + "grad_norm": 1.1566764189210412, + "learning_rate": 0.00029609502540483293, + "loss": 7.147, + "step": 6535 + }, + { + "epoch": 0.6098721657180181, + "grad_norm": 2.387946013785382, + "learning_rate": 0.0002960933105806938, + "loss": 7.8808, + "step": 6536 + }, + { + "epoch": 0.6099654754128954, + "grad_norm": 1.171822517795155, + "learning_rate": 0.0002960915953850826, + "loss": 7.3828, + "step": 6537 + }, + { + "epoch": 0.6100587851077727, + "grad_norm": 1.2598555263658666, + "learning_rate": 0.00029608987981800375, + "loss": 7.3411, + "step": 6538 + }, + { + "epoch": 0.61015209480265, + "grad_norm": 1.0433736251985646, + "learning_rate": 0.0002960881638794616, + "loss": 7.3991, + "step": 6539 + }, + { + "epoch": 0.6102454044975273, + "grad_norm": 1.2603533390497126, + "learning_rate": 0.0002960864475694606, + "loss": 7.1554, + "step": 6540 + }, + { + "epoch": 0.6103387141924046, + "grad_norm": 375.8532646439272, + "learning_rate": 0.00029608473088800496, + "loss": 7.1465, + "step": 6541 + }, + { + "epoch": 0.6104320238872819, + "grad_norm": 1.4483462459668346, + "learning_rate": 0.0002960830138350991, + "loss": 7.21, + "step": 6542 + }, + { + "epoch": 0.6105253335821592, + "grad_norm": 1.8480486736613868, + "learning_rate": 0.00029608129641074747, + "loss": 7.528, + "step": 6543 + }, + { + "epoch": 0.6106186432770365, + "grad_norm": 4.096881382708808, + "learning_rate": 0.0002960795786149544, + "loss": 7.4169, + "step": 6544 + }, + { + "epoch": 0.6107119529719138, + "grad_norm": 2.8592603035058186, + "learning_rate": 0.0002960778604477242, + "loss": 7.7035, + "step": 6545 + }, + { + "epoch": 0.610805262666791, + "grad_norm": 16380.474073595575, + "learning_rate": 0.0002960761419090612, + "loss": 7.6214, + "step": 6546 + }, + { + "epoch": 0.6108985723616683, + "grad_norm": 1.3893128439972844, + "learning_rate": 0.00029607442299896996, + "loss": 7.0691, + "step": 6547 + }, + { + "epoch": 0.6109918820565456, + "grad_norm": 11.321469768612536, + "learning_rate": 0.0002960727037174547, + "loss": 8.3693, + "step": 6548 + }, + { + "epoch": 0.611085191751423, + "grad_norm": 3.9045680852583455, + "learning_rate": 0.00029607098406451976, + "loss": 8.5997, + "step": 6549 + }, + { + "epoch": 0.6111785014463003, + "grad_norm": 4.885065977551312, + "learning_rate": 0.00029606926404016965, + "loss": 8.2732, + "step": 6550 + }, + { + "epoch": 0.6112718111411776, + "grad_norm": 1.786227923891179, + "learning_rate": 0.00029606754364440863, + "loss": 8.7178, + "step": 6551 + }, + { + "epoch": 0.6113651208360549, + "grad_norm": 714.1060962427159, + "learning_rate": 0.0002960658228772411, + "loss": 7.994, + "step": 6552 + }, + { + "epoch": 0.6114584305309322, + "grad_norm": 3.565756749135153, + "learning_rate": 0.0002960641017386715, + "loss": 7.9173, + "step": 6553 + }, + { + "epoch": 0.6115517402258095, + "grad_norm": 1.4906265313428355, + "learning_rate": 0.00029606238022870414, + "loss": 7.9965, + "step": 6554 + }, + { + "epoch": 0.6116450499206868, + "grad_norm": 4.047646903287381, + "learning_rate": 0.0002960606583473434, + "loss": 8.5061, + "step": 6555 + }, + { + "epoch": 0.6117383596155641, + "grad_norm": 3.454834350235144, + "learning_rate": 0.0002960589360945937, + "loss": 7.8819, + "step": 6556 + }, + { + "epoch": 0.6118316693104413, + "grad_norm": 3.8910060476678203, + "learning_rate": 0.0002960572134704593, + "loss": 8.2072, + "step": 6557 + }, + { + "epoch": 0.6119249790053186, + "grad_norm": 1.8569931158627586, + "learning_rate": 0.00029605549047494477, + "loss": 7.8331, + "step": 6558 + }, + { + "epoch": 0.6120182887001959, + "grad_norm": 2.940167295419059, + "learning_rate": 0.0002960537671080544, + "loss": 7.3905, + "step": 6559 + }, + { + "epoch": 0.6121115983950732, + "grad_norm": 2.6134645528917506, + "learning_rate": 0.0002960520433697925, + "loss": 7.6261, + "step": 6560 + }, + { + "epoch": 0.6122049080899505, + "grad_norm": 1.669842869670385, + "learning_rate": 0.00029605031926016357, + "loss": 7.854, + "step": 6561 + }, + { + "epoch": 0.6122982177848278, + "grad_norm": 1.9644592593612737, + "learning_rate": 0.0002960485947791719, + "loss": 7.3538, + "step": 6562 + }, + { + "epoch": 0.6123915274797052, + "grad_norm": 2.339353833524571, + "learning_rate": 0.00029604686992682193, + "loss": 7.8439, + "step": 6563 + }, + { + "epoch": 0.6124848371745825, + "grad_norm": 23.824817271948042, + "learning_rate": 0.000296045144703118, + "loss": 7.8071, + "step": 6564 + }, + { + "epoch": 0.6125781468694598, + "grad_norm": 2.8084940159146528, + "learning_rate": 0.0002960434191080646, + "loss": 7.7092, + "step": 6565 + }, + { + "epoch": 0.6126714565643371, + "grad_norm": 2.1872746320008676, + "learning_rate": 0.000296041693141666, + "loss": 7.214, + "step": 6566 + }, + { + "epoch": 0.6127647662592143, + "grad_norm": 2.5255925805981754, + "learning_rate": 0.00029603996680392663, + "loss": 7.49, + "step": 6567 + }, + { + "epoch": 0.6128580759540916, + "grad_norm": 1.3300709608190417, + "learning_rate": 0.00029603824009485094, + "loss": 7.2767, + "step": 6568 + }, + { + "epoch": 0.6129513856489689, + "grad_norm": 234.71754547974092, + "learning_rate": 0.0002960365130144432, + "loss": 7.4886, + "step": 6569 + }, + { + "epoch": 0.6130446953438462, + "grad_norm": 1.3241674945734656, + "learning_rate": 0.0002960347855627079, + "loss": 7.3706, + "step": 6570 + }, + { + "epoch": 0.6131380050387235, + "grad_norm": 1.6905397888998999, + "learning_rate": 0.00029603305773964945, + "loss": 7.6256, + "step": 6571 + }, + { + "epoch": 0.6132313147336008, + "grad_norm": 0.9243262033326616, + "learning_rate": 0.0002960313295452721, + "loss": 7.5403, + "step": 6572 + }, + { + "epoch": 0.6133246244284781, + "grad_norm": 1.2117074685455187, + "learning_rate": 0.0002960296009795804, + "loss": 7.5848, + "step": 6573 + }, + { + "epoch": 0.6134179341233554, + "grad_norm": 1.0207305550076562, + "learning_rate": 0.0002960278720425786, + "loss": 7.446, + "step": 6574 + }, + { + "epoch": 0.6135112438182327, + "grad_norm": 1.12529881578715, + "learning_rate": 0.00029602614273427125, + "loss": 7.3955, + "step": 6575 + }, + { + "epoch": 0.61360455351311, + "grad_norm": 1.0016809403569071, + "learning_rate": 0.00029602441305466267, + "loss": 7.1659, + "step": 6576 + }, + { + "epoch": 0.6136978632079874, + "grad_norm": 1.378733985144175, + "learning_rate": 0.00029602268300375724, + "loss": 7.6054, + "step": 6577 + }, + { + "epoch": 0.6137911729028646, + "grad_norm": 1.255940291410124, + "learning_rate": 0.0002960209525815594, + "loss": 7.3081, + "step": 6578 + }, + { + "epoch": 0.6138844825977419, + "grad_norm": 1.0413572755618326, + "learning_rate": 0.0002960192217880735, + "loss": 7.1772, + "step": 6579 + }, + { + "epoch": 0.6139777922926192, + "grad_norm": 526.8686731282809, + "learning_rate": 0.000296017490623304, + "loss": 7.3706, + "step": 6580 + }, + { + "epoch": 0.6140711019874965, + "grad_norm": 1676.9213309455097, + "learning_rate": 0.00029601575908725524, + "loss": 7.4995, + "step": 6581 + }, + { + "epoch": 0.6141644116823738, + "grad_norm": 14.32042857482655, + "learning_rate": 0.00029601402717993174, + "loss": 7.6798, + "step": 6582 + }, + { + "epoch": 0.6142577213772511, + "grad_norm": 4.939054958677594, + "learning_rate": 0.0002960122949013378, + "loss": 7.2796, + "step": 6583 + }, + { + "epoch": 0.6143510310721284, + "grad_norm": 0.942282201034243, + "learning_rate": 0.0002960105622514778, + "loss": 7.3103, + "step": 6584 + }, + { + "epoch": 0.6144443407670057, + "grad_norm": 3358.1470794133575, + "learning_rate": 0.0002960088292303562, + "loss": 7.3509, + "step": 6585 + }, + { + "epoch": 0.614537650461883, + "grad_norm": 865.7185114848987, + "learning_rate": 0.0002960070958379774, + "loss": 7.4126, + "step": 6586 + }, + { + "epoch": 0.6146309601567603, + "grad_norm": 13875.80297520527, + "learning_rate": 0.00029600536207434583, + "loss": 7.3627, + "step": 6587 + }, + { + "epoch": 0.6147242698516376, + "grad_norm": 0.9400889371212953, + "learning_rate": 0.0002960036279394658, + "loss": 7.2644, + "step": 6588 + }, + { + "epoch": 0.6148175795465148, + "grad_norm": 74676.38149425328, + "learning_rate": 0.00029600189343334185, + "loss": 7.8137, + "step": 6589 + }, + { + "epoch": 0.6149108892413921, + "grad_norm": 1.734968755496483, + "learning_rate": 0.00029600015855597834, + "loss": 7.3556, + "step": 6590 + }, + { + "epoch": 0.6150041989362695, + "grad_norm": 1.174194901733081, + "learning_rate": 0.00029599842330737966, + "loss": 7.6102, + "step": 6591 + }, + { + "epoch": 0.6150975086311468, + "grad_norm": 1.993714168423794, + "learning_rate": 0.00029599668768755027, + "loss": 7.1873, + "step": 6592 + }, + { + "epoch": 0.6151908183260241, + "grad_norm": 6.007873165995547, + "learning_rate": 0.0002959949516964945, + "loss": 7.539, + "step": 6593 + }, + { + "epoch": 0.6152841280209014, + "grad_norm": 121694.22806197956, + "learning_rate": 0.0002959932153342169, + "loss": 7.8142, + "step": 6594 + }, + { + "epoch": 0.6153774377157787, + "grad_norm": 2.335754956575567, + "learning_rate": 0.0002959914786007217, + "loss": 7.2067, + "step": 6595 + }, + { + "epoch": 0.615470747410656, + "grad_norm": 34.49487421463732, + "learning_rate": 0.0002959897414960135, + "loss": 7.6705, + "step": 6596 + }, + { + "epoch": 0.6155640571055333, + "grad_norm": 4.354908805699149, + "learning_rate": 0.0002959880040200966, + "loss": 7.0418, + "step": 6597 + }, + { + "epoch": 0.6156573668004106, + "grad_norm": 9726369.58020372, + "learning_rate": 0.0002959862661729754, + "loss": 7.094, + "step": 6598 + }, + { + "epoch": 0.6157506764952878, + "grad_norm": 3.134985179480617, + "learning_rate": 0.00029598452795465445, + "loss": 7.4554, + "step": 6599 + }, + { + "epoch": 0.6158439861901651, + "grad_norm": 1.808106762845508, + "learning_rate": 0.000295982789365138, + "loss": 7.3404, + "step": 6600 + }, + { + "epoch": 0.6159372958850424, + "grad_norm": 4.841337035000832, + "learning_rate": 0.00029598105040443064, + "loss": 7.7776, + "step": 6601 + }, + { + "epoch": 0.6160306055799197, + "grad_norm": 9.188608640474259, + "learning_rate": 0.0002959793110725367, + "loss": 7.9866, + "step": 6602 + }, + { + "epoch": 0.616123915274797, + "grad_norm": 13.740448710148696, + "learning_rate": 0.0002959775713694606, + "loss": 8.4641, + "step": 6603 + }, + { + "epoch": 0.6162172249696743, + "grad_norm": 6.432274836592997, + "learning_rate": 0.00029597583129520674, + "loss": 8.9799, + "step": 6604 + }, + { + "epoch": 0.6163105346645517, + "grad_norm": 7.029485106632873, + "learning_rate": 0.0002959740908497796, + "loss": 8.091, + "step": 6605 + }, + { + "epoch": 0.616403844359429, + "grad_norm": 4.738111729035465, + "learning_rate": 0.0002959723500331836, + "loss": 8.1058, + "step": 6606 + }, + { + "epoch": 0.6164971540543063, + "grad_norm": 33997.53277437778, + "learning_rate": 0.0002959706088454232, + "loss": 8.4649, + "step": 6607 + }, + { + "epoch": 0.6165904637491836, + "grad_norm": 7.802443203752792, + "learning_rate": 0.00029596886728650273, + "loss": 8.3377, + "step": 6608 + }, + { + "epoch": 0.6166837734440609, + "grad_norm": 2.891120138345526, + "learning_rate": 0.0002959671253564267, + "loss": 7.7876, + "step": 6609 + }, + { + "epoch": 0.6167770831389381, + "grad_norm": 3.0137669911565554, + "learning_rate": 0.00029596538305519946, + "loss": 7.4006, + "step": 6610 + }, + { + "epoch": 0.6168703928338154, + "grad_norm": 10026.3824313085, + "learning_rate": 0.00029596364038282554, + "loss": 7.7609, + "step": 6611 + }, + { + "epoch": 0.6169637025286927, + "grad_norm": 3.8923973517984742, + "learning_rate": 0.00029596189733930927, + "loss": 7.8029, + "step": 6612 + }, + { + "epoch": 0.61705701222357, + "grad_norm": 3.64829560458673, + "learning_rate": 0.0002959601539246552, + "loss": 7.9516, + "step": 6613 + }, + { + "epoch": 0.6171503219184473, + "grad_norm": 4.95119425544506, + "learning_rate": 0.0002959584101388676, + "loss": 7.6132, + "step": 6614 + }, + { + "epoch": 0.6172436316133246, + "grad_norm": 1.3999063537685397, + "learning_rate": 0.00029595666598195106, + "loss": 7.57, + "step": 6615 + }, + { + "epoch": 0.6173369413082019, + "grad_norm": 3.294840713523134, + "learning_rate": 0.00029595492145390994, + "loss": 7.2725, + "step": 6616 + }, + { + "epoch": 0.6174302510030792, + "grad_norm": 3.2875019696419487, + "learning_rate": 0.00029595317655474874, + "loss": 7.4644, + "step": 6617 + }, + { + "epoch": 0.6175235606979566, + "grad_norm": 4587.3433888568115, + "learning_rate": 0.00029595143128447176, + "loss": 7.4365, + "step": 6618 + }, + { + "epoch": 0.6176168703928339, + "grad_norm": 16.273353877064743, + "learning_rate": 0.0002959496856430836, + "loss": 7.4546, + "step": 6619 + }, + { + "epoch": 0.6177101800877111, + "grad_norm": 2.3170895446480593, + "learning_rate": 0.0002959479396305886, + "loss": 7.752, + "step": 6620 + }, + { + "epoch": 0.6178034897825884, + "grad_norm": 2.627617257817643, + "learning_rate": 0.0002959461932469912, + "loss": 7.4902, + "step": 6621 + }, + { + "epoch": 0.6178967994774657, + "grad_norm": 10.222344551617194, + "learning_rate": 0.00029594444649229585, + "loss": 7.4347, + "step": 6622 + }, + { + "epoch": 0.617990109172343, + "grad_norm": 4.456081874361872, + "learning_rate": 0.0002959426993665071, + "loss": 7.6507, + "step": 6623 + }, + { + "epoch": 0.6180834188672203, + "grad_norm": 2.004322235830387, + "learning_rate": 0.0002959409518696292, + "loss": 7.1069, + "step": 6624 + }, + { + "epoch": 0.6181767285620976, + "grad_norm": 6.605104386048309, + "learning_rate": 0.0002959392040016667, + "loss": 7.6249, + "step": 6625 + }, + { + "epoch": 0.6182700382569749, + "grad_norm": 209.43066433992306, + "learning_rate": 0.00029593745576262406, + "loss": 7.3422, + "step": 6626 + }, + { + "epoch": 0.6183633479518522, + "grad_norm": 295.91561875442557, + "learning_rate": 0.00029593570715250565, + "loss": 7.2399, + "step": 6627 + }, + { + "epoch": 0.6184566576467295, + "grad_norm": 4.332420930847917, + "learning_rate": 0.000295933958171316, + "loss": 7.465, + "step": 6628 + }, + { + "epoch": 0.6185499673416068, + "grad_norm": 1.241506521479358, + "learning_rate": 0.00029593220881905954, + "loss": 7.1655, + "step": 6629 + }, + { + "epoch": 0.6186432770364841, + "grad_norm": 165.0960119200473, + "learning_rate": 0.0002959304590957407, + "loss": 7.4677, + "step": 6630 + }, + { + "epoch": 0.6187365867313613, + "grad_norm": 3.83707862656926, + "learning_rate": 0.00029592870900136396, + "loss": 7.6518, + "step": 6631 + }, + { + "epoch": 0.6188298964262386, + "grad_norm": 3.5770990016900597, + "learning_rate": 0.0002959269585359337, + "loss": 7.2598, + "step": 6632 + }, + { + "epoch": 0.618923206121116, + "grad_norm": 2.1566728391166063, + "learning_rate": 0.0002959252076994544, + "loss": 7.3446, + "step": 6633 + }, + { + "epoch": 0.6190165158159933, + "grad_norm": 32.4136286328622, + "learning_rate": 0.0002959234564919306, + "loss": 7.3015, + "step": 6634 + }, + { + "epoch": 0.6191098255108706, + "grad_norm": 4.064386073453344, + "learning_rate": 0.0002959217049133666, + "loss": 7.4702, + "step": 6635 + }, + { + "epoch": 0.6192031352057479, + "grad_norm": 2.109320836104819, + "learning_rate": 0.00029591995296376694, + "loss": 7.5275, + "step": 6636 + }, + { + "epoch": 0.6192964449006252, + "grad_norm": 3.0505400891020678, + "learning_rate": 0.00029591820064313605, + "loss": 7.5638, + "step": 6637 + }, + { + "epoch": 0.6193897545955025, + "grad_norm": 2.3542683353433502, + "learning_rate": 0.0002959164479514784, + "loss": 7.9416, + "step": 6638 + }, + { + "epoch": 0.6194830642903798, + "grad_norm": 2.1768891324662794, + "learning_rate": 0.00029591469488879845, + "loss": 7.5036, + "step": 6639 + }, + { + "epoch": 0.6195763739852571, + "grad_norm": 6.526219108669136, + "learning_rate": 0.0002959129414551007, + "loss": 7.5543, + "step": 6640 + }, + { + "epoch": 0.6196696836801344, + "grad_norm": 84.71424151822856, + "learning_rate": 0.0002959111876503895, + "loss": 7.1464, + "step": 6641 + }, + { + "epoch": 0.6197629933750116, + "grad_norm": 1.9505904726056214, + "learning_rate": 0.0002959094334746694, + "loss": 7.2729, + "step": 6642 + }, + { + "epoch": 0.6198563030698889, + "grad_norm": 129.51414430791692, + "learning_rate": 0.0002959076789279449, + "loss": 7.3558, + "step": 6643 + }, + { + "epoch": 0.6199496127647662, + "grad_norm": 9.53652241848506, + "learning_rate": 0.0002959059240102203, + "loss": 7.1267, + "step": 6644 + }, + { + "epoch": 0.6200429224596435, + "grad_norm": 4.076676269048437, + "learning_rate": 0.00029590416872150014, + "loss": 7.1562, + "step": 6645 + }, + { + "epoch": 0.6201362321545209, + "grad_norm": 14.63533209757607, + "learning_rate": 0.00029590241306178894, + "loss": 7.6061, + "step": 6646 + }, + { + "epoch": 0.6202295418493982, + "grad_norm": 4.4227119995337585, + "learning_rate": 0.00029590065703109114, + "loss": 7.3157, + "step": 6647 + }, + { + "epoch": 0.6203228515442755, + "grad_norm": 4.237926519792196, + "learning_rate": 0.00029589890062941117, + "loss": 6.9672, + "step": 6648 + }, + { + "epoch": 0.6204161612391528, + "grad_norm": 27.902248149688923, + "learning_rate": 0.00029589714385675354, + "loss": 7.3796, + "step": 6649 + }, + { + "epoch": 0.6205094709340301, + "grad_norm": 15.67893182824135, + "learning_rate": 0.00029589538671312264, + "loss": 7.2746, + "step": 6650 + }, + { + "epoch": 0.6206027806289074, + "grad_norm": 3204.813100809794, + "learning_rate": 0.000295893629198523, + "loss": 7.3282, + "step": 6651 + }, + { + "epoch": 0.6206960903237846, + "grad_norm": 2.5087877541165087, + "learning_rate": 0.0002958918713129591, + "loss": 7.1783, + "step": 6652 + }, + { + "epoch": 0.6207894000186619, + "grad_norm": 18.187215530172377, + "learning_rate": 0.00029589011305643546, + "loss": 7.422, + "step": 6653 + }, + { + "epoch": 0.6208827097135392, + "grad_norm": 4.4580502501967825, + "learning_rate": 0.00029588835442895635, + "loss": 7.5642, + "step": 6654 + }, + { + "epoch": 0.6209760194084165, + "grad_norm": 2.0415702541898804, + "learning_rate": 0.0002958865954305264, + "loss": 7.2625, + "step": 6655 + }, + { + "epoch": 0.6210693291032938, + "grad_norm": 2.2274895567836652, + "learning_rate": 0.0002958848360611501, + "loss": 7.3599, + "step": 6656 + }, + { + "epoch": 0.6211626387981711, + "grad_norm": 0.6432333032037734, + "learning_rate": 0.00029588307632083186, + "loss": 7.2724, + "step": 6657 + }, + { + "epoch": 0.6212559484930484, + "grad_norm": 20.18016247080477, + "learning_rate": 0.00029588131620957616, + "loss": 7.5781, + "step": 6658 + }, + { + "epoch": 0.6213492581879257, + "grad_norm": 1.4610150771004513, + "learning_rate": 0.0002958795557273875, + "loss": 7.1076, + "step": 6659 + }, + { + "epoch": 0.621442567882803, + "grad_norm": 1.2772054391913086, + "learning_rate": 0.0002958777948742703, + "loss": 7.028, + "step": 6660 + }, + { + "epoch": 0.6215358775776804, + "grad_norm": 1.4440319121152614, + "learning_rate": 0.0002958760336502291, + "loss": 7.2015, + "step": 6661 + }, + { + "epoch": 0.6216291872725577, + "grad_norm": 3.067421022511424, + "learning_rate": 0.0002958742720552684, + "loss": 7.258, + "step": 6662 + }, + { + "epoch": 0.6217224969674349, + "grad_norm": 1.130915315706854, + "learning_rate": 0.0002958725100893926, + "loss": 7.3081, + "step": 6663 + }, + { + "epoch": 0.6218158066623122, + "grad_norm": 14645.643579292737, + "learning_rate": 0.00029587074775260625, + "loss": 7.6375, + "step": 6664 + }, + { + "epoch": 0.6219091163571895, + "grad_norm": 25.79986854977133, + "learning_rate": 0.0002958689850449137, + "loss": 6.9606, + "step": 6665 + }, + { + "epoch": 0.6220024260520668, + "grad_norm": 71.05802024802358, + "learning_rate": 0.0002958672219663197, + "loss": 7.4219, + "step": 6666 + }, + { + "epoch": 0.6220957357469441, + "grad_norm": 12.62673881955452, + "learning_rate": 0.00029586545851682843, + "loss": 7.2857, + "step": 6667 + }, + { + "epoch": 0.6221890454418214, + "grad_norm": 6.9666151400210445, + "learning_rate": 0.0002958636946964445, + "loss": 7.2748, + "step": 6668 + }, + { + "epoch": 0.6222823551366987, + "grad_norm": 8.70719704131928, + "learning_rate": 0.0002958619305051725, + "loss": 7.2735, + "step": 6669 + }, + { + "epoch": 0.622375664831576, + "grad_norm": 814.1808837053061, + "learning_rate": 0.00029586016594301676, + "loss": 7.6386, + "step": 6670 + }, + { + "epoch": 0.6224689745264533, + "grad_norm": 16.944377368495672, + "learning_rate": 0.0002958584010099818, + "loss": 7.7039, + "step": 6671 + }, + { + "epoch": 0.6225622842213306, + "grad_norm": 13.392690243600258, + "learning_rate": 0.0002958566357060722, + "loss": 7.3879, + "step": 6672 + }, + { + "epoch": 0.6226555939162078, + "grad_norm": 472.3217863383622, + "learning_rate": 0.0002958548700312923, + "loss": 7.4781, + "step": 6673 + }, + { + "epoch": 0.6227489036110851, + "grad_norm": 3.3599227595672696, + "learning_rate": 0.00029585310398564675, + "loss": 7.4905, + "step": 6674 + }, + { + "epoch": 0.6228422133059625, + "grad_norm": 5.683689463386194, + "learning_rate": 0.0002958513375691399, + "loss": 7.7813, + "step": 6675 + }, + { + "epoch": 0.6229355230008398, + "grad_norm": 6.172554294214025, + "learning_rate": 0.00029584957078177633, + "loss": 8.0206, + "step": 6676 + }, + { + "epoch": 0.6230288326957171, + "grad_norm": 12007203.55347155, + "learning_rate": 0.0002958478036235605, + "loss": 7.5552, + "step": 6677 + }, + { + "epoch": 0.6231221423905944, + "grad_norm": 53.594814986860825, + "learning_rate": 0.0002958460360944969, + "loss": 7.2648, + "step": 6678 + }, + { + "epoch": 0.6232154520854717, + "grad_norm": 83.37082932020326, + "learning_rate": 0.00029584426819459004, + "loss": 7.2166, + "step": 6679 + }, + { + "epoch": 0.623308761780349, + "grad_norm": 1.3603341323365676, + "learning_rate": 0.00029584249992384445, + "loss": 7.4002, + "step": 6680 + }, + { + "epoch": 0.6234020714752263, + "grad_norm": 838.6937144379125, + "learning_rate": 0.0002958407312822645, + "loss": 7.2691, + "step": 6681 + }, + { + "epoch": 0.6234953811701036, + "grad_norm": 3.6169876459974435, + "learning_rate": 0.00029583896226985487, + "loss": 7.5197, + "step": 6682 + }, + { + "epoch": 0.6235886908649809, + "grad_norm": 11.658539042693425, + "learning_rate": 0.0002958371928866199, + "loss": 7.8854, + "step": 6683 + }, + { + "epoch": 0.6236820005598581, + "grad_norm": 6.0450683425421134, + "learning_rate": 0.00029583542313256413, + "loss": 7.662, + "step": 6684 + }, + { + "epoch": 0.6237753102547354, + "grad_norm": 4.492837374085431, + "learning_rate": 0.0002958336530076921, + "loss": 7.7031, + "step": 6685 + }, + { + "epoch": 0.6238686199496127, + "grad_norm": 2.0599913147812203, + "learning_rate": 0.00029583188251200833, + "loss": 7.4126, + "step": 6686 + }, + { + "epoch": 0.62396192964449, + "grad_norm": 3.0138659584648075, + "learning_rate": 0.00029583011164551723, + "loss": 7.284, + "step": 6687 + }, + { + "epoch": 0.6240552393393674, + "grad_norm": 3.452313271855895, + "learning_rate": 0.00029582834040822335, + "loss": 7.2632, + "step": 6688 + }, + { + "epoch": 0.6241485490342447, + "grad_norm": 2.25127405376506, + "learning_rate": 0.00029582656880013125, + "loss": 7.6322, + "step": 6689 + }, + { + "epoch": 0.624241858729122, + "grad_norm": 3.4064513258524545, + "learning_rate": 0.00029582479682124536, + "loss": 7.4208, + "step": 6690 + }, + { + "epoch": 0.6243351684239993, + "grad_norm": 1.2786861866577652, + "learning_rate": 0.0002958230244715702, + "loss": 7.7986, + "step": 6691 + }, + { + "epoch": 0.6244284781188766, + "grad_norm": 1.5382370219339065, + "learning_rate": 0.0002958212517511103, + "loss": 6.9986, + "step": 6692 + }, + { + "epoch": 0.6245217878137539, + "grad_norm": 1.0731937132220402, + "learning_rate": 0.0002958194786598701, + "loss": 7.2675, + "step": 6693 + }, + { + "epoch": 0.6246150975086312, + "grad_norm": 2.6589243309591444, + "learning_rate": 0.0002958177051978542, + "loss": 7.3348, + "step": 6694 + }, + { + "epoch": 0.6247084072035084, + "grad_norm": 3.5984440868807193, + "learning_rate": 0.00029581593136506705, + "loss": 7.4283, + "step": 6695 + }, + { + "epoch": 0.6248017168983857, + "grad_norm": 2.8571195237547498, + "learning_rate": 0.0002958141571615132, + "loss": 7.5206, + "step": 6696 + }, + { + "epoch": 0.624895026593263, + "grad_norm": 1.1374157938775855, + "learning_rate": 0.00029581238258719717, + "loss": 7.3745, + "step": 6697 + }, + { + "epoch": 0.6249883362881403, + "grad_norm": 1.113914394832123, + "learning_rate": 0.0002958106076421234, + "loss": 7.2309, + "step": 6698 + }, + { + "epoch": 0.6250816459830176, + "grad_norm": 1.163422081924971, + "learning_rate": 0.0002958088323262964, + "loss": 7.4533, + "step": 6699 + }, + { + "epoch": 0.6251749556778949, + "grad_norm": 1.3416391979231062, + "learning_rate": 0.0002958070566397209, + "loss": 7.672, + "step": 6700 + }, + { + "epoch": 0.6252682653727722, + "grad_norm": 5.035647812253561, + "learning_rate": 0.0002958052805824011, + "loss": 7.2571, + "step": 6701 + }, + { + "epoch": 0.6253615750676496, + "grad_norm": 4.386921901647107, + "learning_rate": 0.0002958035041543417, + "loss": 7.3426, + "step": 6702 + }, + { + "epoch": 0.6254548847625269, + "grad_norm": 1.3687393332390678, + "learning_rate": 0.00029580172735554715, + "loss": 7.5727, + "step": 6703 + }, + { + "epoch": 0.6255481944574042, + "grad_norm": 0.8622769444003844, + "learning_rate": 0.000295799950186022, + "loss": 7.3176, + "step": 6704 + }, + { + "epoch": 0.6256415041522814, + "grad_norm": 1.0293387054578187, + "learning_rate": 0.00029579817264577084, + "loss": 7.3875, + "step": 6705 + }, + { + "epoch": 0.6257348138471587, + "grad_norm": 0.7719878460411023, + "learning_rate": 0.00029579639473479804, + "loss": 7.1872, + "step": 6706 + }, + { + "epoch": 0.625828123542036, + "grad_norm": 2.6746832871419706, + "learning_rate": 0.0002957946164531083, + "loss": 7.5575, + "step": 6707 + }, + { + "epoch": 0.6259214332369133, + "grad_norm": 2.0372288105599328, + "learning_rate": 0.00029579283780070594, + "loss": 7.7309, + "step": 6708 + }, + { + "epoch": 0.6260147429317906, + "grad_norm": 1.0190636859230966, + "learning_rate": 0.0002957910587775956, + "loss": 7.3892, + "step": 6709 + }, + { + "epoch": 0.6261080526266679, + "grad_norm": 4.019337198261372, + "learning_rate": 0.00029578927938378176, + "loss": 7.3168, + "step": 6710 + }, + { + "epoch": 0.6262013623215452, + "grad_norm": 1.1519771000343555, + "learning_rate": 0.000295787499619269, + "loss": 7.3957, + "step": 6711 + }, + { + "epoch": 0.6262946720164225, + "grad_norm": 1.567320894419876, + "learning_rate": 0.0002957857194840618, + "loss": 7.4163, + "step": 6712 + }, + { + "epoch": 0.6263879817112998, + "grad_norm": 1.0005059006684056, + "learning_rate": 0.00029578393897816474, + "loss": 7.4513, + "step": 6713 + }, + { + "epoch": 0.6264812914061771, + "grad_norm": 1.086512460011875, + "learning_rate": 0.0002957821581015823, + "loss": 7.6155, + "step": 6714 + }, + { + "epoch": 0.6265746011010545, + "grad_norm": 1.2503372029932065, + "learning_rate": 0.0002957803768543189, + "loss": 7.7034, + "step": 6715 + }, + { + "epoch": 0.6266679107959316, + "grad_norm": 1.1784987453438884, + "learning_rate": 0.00029577859523637935, + "loss": 7.3216, + "step": 6716 + }, + { + "epoch": 0.626761220490809, + "grad_norm": 1.3302550357461025, + "learning_rate": 0.00029577681324776794, + "loss": 7.8041, + "step": 6717 + }, + { + "epoch": 0.6268545301856863, + "grad_norm": 0.8449268714884421, + "learning_rate": 0.0002957750308884892, + "loss": 7.4652, + "step": 6718 + }, + { + "epoch": 0.6269478398805636, + "grad_norm": 5.928133605581734, + "learning_rate": 0.00029577324815854784, + "loss": 8.2817, + "step": 6719 + }, + { + "epoch": 0.6270411495754409, + "grad_norm": 1.6676527645301287, + "learning_rate": 0.00029577146505794827, + "loss": 7.586, + "step": 6720 + }, + { + "epoch": 0.6271344592703182, + "grad_norm": 2.415079906319561, + "learning_rate": 0.00029576968158669505, + "loss": 7.5858, + "step": 6721 + }, + { + "epoch": 0.6272277689651955, + "grad_norm": 1.477557831993897, + "learning_rate": 0.00029576789774479273, + "loss": 7.266, + "step": 6722 + }, + { + "epoch": 0.6273210786600728, + "grad_norm": 1.7006646190544799, + "learning_rate": 0.0002957661135322458, + "loss": 7.6291, + "step": 6723 + }, + { + "epoch": 0.6274143883549501, + "grad_norm": 1.4484981656018279, + "learning_rate": 0.0002957643289490588, + "loss": 7.2452, + "step": 6724 + }, + { + "epoch": 0.6275076980498274, + "grad_norm": 0.7823407014867795, + "learning_rate": 0.0002957625439952363, + "loss": 7.438, + "step": 6725 + }, + { + "epoch": 0.6276010077447046, + "grad_norm": 1.276324886163159, + "learning_rate": 0.0002957607586707829, + "loss": 7.3936, + "step": 6726 + }, + { + "epoch": 0.6276943174395819, + "grad_norm": 0.8832990003491406, + "learning_rate": 0.00029575897297570296, + "loss": 7.4201, + "step": 6727 + }, + { + "epoch": 0.6277876271344592, + "grad_norm": 2.9020687349103724, + "learning_rate": 0.0002957571869100012, + "loss": 7.411, + "step": 6728 + }, + { + "epoch": 0.6278809368293365, + "grad_norm": 0.8020153407200029, + "learning_rate": 0.0002957554004736821, + "loss": 7.4403, + "step": 6729 + }, + { + "epoch": 0.6279742465242139, + "grad_norm": 0.6403823765682349, + "learning_rate": 0.0002957536136667501, + "loss": 7.4353, + "step": 6730 + }, + { + "epoch": 0.6280675562190912, + "grad_norm": 0.7860121185555259, + "learning_rate": 0.00029575182648920994, + "loss": 7.4608, + "step": 6731 + }, + { + "epoch": 0.6281608659139685, + "grad_norm": 0.8347413417353129, + "learning_rate": 0.00029575003894106604, + "loss": 7.6238, + "step": 6732 + }, + { + "epoch": 0.6282541756088458, + "grad_norm": 0.9261437539490994, + "learning_rate": 0.0002957482510223229, + "loss": 7.4501, + "step": 6733 + }, + { + "epoch": 0.6283474853037231, + "grad_norm": 1.3801796106103303, + "learning_rate": 0.0002957464627329851, + "loss": 7.4969, + "step": 6734 + }, + { + "epoch": 0.6284407949986004, + "grad_norm": 1.3995801050657137, + "learning_rate": 0.0002957446740730574, + "loss": 7.6607, + "step": 6735 + }, + { + "epoch": 0.6285341046934777, + "grad_norm": 0.8596788125822069, + "learning_rate": 0.000295742885042544, + "loss": 7.5246, + "step": 6736 + }, + { + "epoch": 0.6286274143883549, + "grad_norm": 1.5937693957572308, + "learning_rate": 0.0002957410956414497, + "loss": 7.2428, + "step": 6737 + }, + { + "epoch": 0.6287207240832322, + "grad_norm": 1.0432629208953685, + "learning_rate": 0.00029573930586977894, + "loss": 7.4044, + "step": 6738 + }, + { + "epoch": 0.6288140337781095, + "grad_norm": 1.2087512112030063, + "learning_rate": 0.0002957375157275363, + "loss": 7.3818, + "step": 6739 + }, + { + "epoch": 0.6289073434729868, + "grad_norm": 0.9904715225579317, + "learning_rate": 0.00029573572521472626, + "loss": 7.0001, + "step": 6740 + }, + { + "epoch": 0.6290006531678641, + "grad_norm": 0.5806644453007923, + "learning_rate": 0.00029573393433135356, + "loss": 7.2421, + "step": 6741 + }, + { + "epoch": 0.6290939628627414, + "grad_norm": 9.774041858458933, + "learning_rate": 0.00029573214307742257, + "loss": 7.5276, + "step": 6742 + }, + { + "epoch": 0.6291872725576187, + "grad_norm": 3.3027295037574915, + "learning_rate": 0.00029573035145293795, + "loss": 7.7529, + "step": 6743 + }, + { + "epoch": 0.6292805822524961, + "grad_norm": 2.7478993158039775, + "learning_rate": 0.00029572855945790414, + "loss": 7.581, + "step": 6744 + }, + { + "epoch": 0.6293738919473734, + "grad_norm": 1.3289613247395957, + "learning_rate": 0.0002957267670923258, + "loss": 7.4816, + "step": 6745 + }, + { + "epoch": 0.6294672016422507, + "grad_norm": 0.5963118358373443, + "learning_rate": 0.0002957249743562075, + "loss": 7.5466, + "step": 6746 + }, + { + "epoch": 0.629560511337128, + "grad_norm": 1.8412867626179787, + "learning_rate": 0.0002957231812495537, + "loss": 7.3454, + "step": 6747 + }, + { + "epoch": 0.6296538210320052, + "grad_norm": 2.247316007113961, + "learning_rate": 0.00029572138777236905, + "loss": 7.1208, + "step": 6748 + }, + { + "epoch": 0.6297471307268825, + "grad_norm": 1.3909840594088976, + "learning_rate": 0.0002957195939246581, + "loss": 7.8206, + "step": 6749 + }, + { + "epoch": 0.6298404404217598, + "grad_norm": 1.4646776219667421, + "learning_rate": 0.00029571779970642535, + "loss": 7.4873, + "step": 6750 + }, + { + "epoch": 0.6299337501166371, + "grad_norm": 1.0760334413705703, + "learning_rate": 0.00029571600511767545, + "loss": 7.2452, + "step": 6751 + }, + { + "epoch": 0.6300270598115144, + "grad_norm": 1.008949245611362, + "learning_rate": 0.0002957142101584128, + "loss": 7.6577, + "step": 6752 + }, + { + "epoch": 0.6301203695063917, + "grad_norm": 0.7717005835999026, + "learning_rate": 0.00029571241482864214, + "loss": 7.2909, + "step": 6753 + }, + { + "epoch": 0.630213679201269, + "grad_norm": 0.9226515351824623, + "learning_rate": 0.000295710619128368, + "loss": 7.1608, + "step": 6754 + }, + { + "epoch": 0.6303069888961463, + "grad_norm": 1.0610176236945603, + "learning_rate": 0.0002957088230575949, + "loss": 7.2613, + "step": 6755 + }, + { + "epoch": 0.6304002985910236, + "grad_norm": 0.8418596155819636, + "learning_rate": 0.00029570702661632745, + "loss": 7.5527, + "step": 6756 + }, + { + "epoch": 0.630493608285901, + "grad_norm": 0.5437734590674885, + "learning_rate": 0.0002957052298045702, + "loss": 7.3521, + "step": 6757 + }, + { + "epoch": 0.6305869179807781, + "grad_norm": 0.5570232449305444, + "learning_rate": 0.0002957034326223277, + "loss": 7.6672, + "step": 6758 + }, + { + "epoch": 0.6306802276756555, + "grad_norm": 0.9696080585653112, + "learning_rate": 0.0002957016350696045, + "loss": 7.539, + "step": 6759 + }, + { + "epoch": 0.6307735373705328, + "grad_norm": 0.9118461416528562, + "learning_rate": 0.00029569983714640516, + "loss": 7.6163, + "step": 6760 + }, + { + "epoch": 0.6308668470654101, + "grad_norm": 1.2955461250573488, + "learning_rate": 0.0002956980388527344, + "loss": 7.2958, + "step": 6761 + }, + { + "epoch": 0.6309601567602874, + "grad_norm": 1.0243485035411395, + "learning_rate": 0.00029569624018859663, + "loss": 7.3767, + "step": 6762 + }, + { + "epoch": 0.6310534664551647, + "grad_norm": 2.0097009482282506, + "learning_rate": 0.00029569444115399646, + "loss": 7.4955, + "step": 6763 + }, + { + "epoch": 0.631146776150042, + "grad_norm": 1.1062798118900563, + "learning_rate": 0.0002956926417489385, + "loss": 7.3609, + "step": 6764 + }, + { + "epoch": 0.6312400858449193, + "grad_norm": 0.906679749003987, + "learning_rate": 0.0002956908419734273, + "loss": 7.6804, + "step": 6765 + }, + { + "epoch": 0.6313333955397966, + "grad_norm": 0.7592876970551372, + "learning_rate": 0.0002956890418274675, + "loss": 7.481, + "step": 6766 + }, + { + "epoch": 0.6314267052346739, + "grad_norm": 0.9731320146983485, + "learning_rate": 0.00029568724131106357, + "loss": 7.3314, + "step": 6767 + }, + { + "epoch": 0.6315200149295512, + "grad_norm": 0.506802952898517, + "learning_rate": 0.00029568544042422016, + "loss": 7.6629, + "step": 6768 + }, + { + "epoch": 0.6316133246244284, + "grad_norm": 1.6107805915964433, + "learning_rate": 0.00029568363916694185, + "loss": 7.428, + "step": 6769 + }, + { + "epoch": 0.6317066343193057, + "grad_norm": 1.0086198899610406, + "learning_rate": 0.0002956818375392332, + "loss": 7.6339, + "step": 6770 + }, + { + "epoch": 0.631799944014183, + "grad_norm": 1.0317117267782827, + "learning_rate": 0.0002956800355410987, + "loss": 7.6017, + "step": 6771 + }, + { + "epoch": 0.6318932537090604, + "grad_norm": 0.8448388040960075, + "learning_rate": 0.00029567823317254313, + "loss": 7.5074, + "step": 6772 + }, + { + "epoch": 0.6319865634039377, + "grad_norm": 0.9052446825566195, + "learning_rate": 0.0002956764304335709, + "loss": 7.0822, + "step": 6773 + }, + { + "epoch": 0.632079873098815, + "grad_norm": 0.7105298116157587, + "learning_rate": 0.00029567462732418674, + "loss": 7.2384, + "step": 6774 + }, + { + "epoch": 0.6321731827936923, + "grad_norm": 1.1429264884688752, + "learning_rate": 0.0002956728238443951, + "loss": 7.976, + "step": 6775 + }, + { + "epoch": 0.6322664924885696, + "grad_norm": 0.6544011100194461, + "learning_rate": 0.00029567101999420064, + "loss": 7.3058, + "step": 6776 + }, + { + "epoch": 0.6323598021834469, + "grad_norm": 1.0534822573855742, + "learning_rate": 0.0002956692157736079, + "loss": 6.9976, + "step": 6777 + }, + { + "epoch": 0.6324531118783242, + "grad_norm": 1.155811869455907, + "learning_rate": 0.0002956674111826215, + "loss": 7.1863, + "step": 6778 + }, + { + "epoch": 0.6325464215732014, + "grad_norm": 0.6256033320712523, + "learning_rate": 0.000295665606221246, + "loss": 7.2652, + "step": 6779 + }, + { + "epoch": 0.6326397312680787, + "grad_norm": 0.745635740784606, + "learning_rate": 0.00029566380088948607, + "loss": 7.5676, + "step": 6780 + }, + { + "epoch": 0.632733040962956, + "grad_norm": 1.2249151979365438, + "learning_rate": 0.0002956619951873462, + "loss": 7.5065, + "step": 6781 + }, + { + "epoch": 0.6328263506578333, + "grad_norm": 0.48386352758346246, + "learning_rate": 0.00029566018911483105, + "loss": 6.9032, + "step": 6782 + }, + { + "epoch": 0.6329196603527106, + "grad_norm": 0.6027321514690829, + "learning_rate": 0.00029565838267194515, + "loss": 7.5238, + "step": 6783 + }, + { + "epoch": 0.6330129700475879, + "grad_norm": 1.7522080329782472, + "learning_rate": 0.00029565657585869313, + "loss": 7.1608, + "step": 6784 + }, + { + "epoch": 0.6331062797424652, + "grad_norm": 0.8535146878095476, + "learning_rate": 0.00029565476867507965, + "loss": 7.4725, + "step": 6785 + }, + { + "epoch": 0.6331995894373426, + "grad_norm": 0.9670484845712739, + "learning_rate": 0.0002956529611211092, + "loss": 7.2606, + "step": 6786 + }, + { + "epoch": 0.6332928991322199, + "grad_norm": 1.7023913176470598, + "learning_rate": 0.00029565115319678644, + "loss": 7.1478, + "step": 6787 + }, + { + "epoch": 0.6333862088270972, + "grad_norm": 1.2790928626010867, + "learning_rate": 0.0002956493449021159, + "loss": 7.136, + "step": 6788 + }, + { + "epoch": 0.6334795185219745, + "grad_norm": 0.8126603473664519, + "learning_rate": 0.0002956475362371022, + "loss": 7.507, + "step": 6789 + }, + { + "epoch": 0.6335728282168517, + "grad_norm": 0.7405289568934648, + "learning_rate": 0.00029564572720175, + "loss": 7.1948, + "step": 6790 + }, + { + "epoch": 0.633666137911729, + "grad_norm": 1.003033431332323, + "learning_rate": 0.00029564391779606387, + "loss": 7.1594, + "step": 6791 + }, + { + "epoch": 0.6337594476066063, + "grad_norm": 1.2304859232591552, + "learning_rate": 0.00029564210802004835, + "loss": 7.3567, + "step": 6792 + }, + { + "epoch": 0.6338527573014836, + "grad_norm": 0.6964189171961034, + "learning_rate": 0.00029564029787370813, + "loss": 7.3675, + "step": 6793 + }, + { + "epoch": 0.6339460669963609, + "grad_norm": 0.9802402151602088, + "learning_rate": 0.0002956384873570478, + "loss": 7.166, + "step": 6794 + }, + { + "epoch": 0.6340393766912382, + "grad_norm": 0.687754045173004, + "learning_rate": 0.0002956366764700719, + "loss": 7.438, + "step": 6795 + }, + { + "epoch": 0.6341326863861155, + "grad_norm": 1.2897235420604316, + "learning_rate": 0.0002956348652127851, + "loss": 7.1683, + "step": 6796 + }, + { + "epoch": 0.6342259960809928, + "grad_norm": 0.6442499995645495, + "learning_rate": 0.00029563305358519195, + "loss": 7.4768, + "step": 6797 + }, + { + "epoch": 0.6343193057758701, + "grad_norm": 6.527778479988806, + "learning_rate": 0.00029563124158729707, + "loss": 7.1941, + "step": 6798 + }, + { + "epoch": 0.6344126154707475, + "grad_norm": 0.8377185448054181, + "learning_rate": 0.0002956294292191052, + "loss": 7.3988, + "step": 6799 + }, + { + "epoch": 0.6345059251656248, + "grad_norm": 0.7763569468356787, + "learning_rate": 0.0002956276164806207, + "loss": 7.2398, + "step": 6800 + }, + { + "epoch": 0.634599234860502, + "grad_norm": 1.538878133416518, + "learning_rate": 0.0002956258033718483, + "loss": 7.6113, + "step": 6801 + }, + { + "epoch": 0.6346925445553793, + "grad_norm": 1.165892409120171, + "learning_rate": 0.0002956239898927927, + "loss": 7.593, + "step": 6802 + }, + { + "epoch": 0.6347858542502566, + "grad_norm": 0.8409933960425253, + "learning_rate": 0.0002956221760434584, + "loss": 7.3338, + "step": 6803 + }, + { + "epoch": 0.6348791639451339, + "grad_norm": 0.7553150517180723, + "learning_rate": 0.00029562036182385003, + "loss": 7.5893, + "step": 6804 + }, + { + "epoch": 0.6349724736400112, + "grad_norm": 1.2099656528315583, + "learning_rate": 0.00029561854723397226, + "loss": 7.2382, + "step": 6805 + }, + { + "epoch": 0.6350657833348885, + "grad_norm": 1.3060531945996454, + "learning_rate": 0.0002956167322738296, + "loss": 7.0811, + "step": 6806 + }, + { + "epoch": 0.6351590930297658, + "grad_norm": 0.48229417356402254, + "learning_rate": 0.00029561491694342677, + "loss": 7.1367, + "step": 6807 + }, + { + "epoch": 0.6352524027246431, + "grad_norm": 0.8634117651127632, + "learning_rate": 0.0002956131012427683, + "loss": 7.3633, + "step": 6808 + }, + { + "epoch": 0.6353457124195204, + "grad_norm": 1.242037735123686, + "learning_rate": 0.0002956112851718589, + "loss": 7.1806, + "step": 6809 + }, + { + "epoch": 0.6354390221143977, + "grad_norm": 1.369777743839262, + "learning_rate": 0.0002956094687307031, + "loss": 7.312, + "step": 6810 + }, + { + "epoch": 0.6355323318092749, + "grad_norm": 1.0275715364684939, + "learning_rate": 0.0002956076519193055, + "loss": 7.4066, + "step": 6811 + }, + { + "epoch": 0.6356256415041522, + "grad_norm": 8.71505009966165, + "learning_rate": 0.00029560583473767086, + "loss": 7.2014, + "step": 6812 + }, + { + "epoch": 0.6357189511990295, + "grad_norm": 1.3207624376795761, + "learning_rate": 0.00029560401718580365, + "loss": 7.0274, + "step": 6813 + }, + { + "epoch": 0.6358122608939069, + "grad_norm": 1.052346342625026, + "learning_rate": 0.0002956021992637086, + "loss": 7.2536, + "step": 6814 + }, + { + "epoch": 0.6359055705887842, + "grad_norm": 0.9127108772763329, + "learning_rate": 0.0002956003809713903, + "loss": 7.2978, + "step": 6815 + }, + { + "epoch": 0.6359988802836615, + "grad_norm": 1.19779863360947, + "learning_rate": 0.00029559856230885334, + "loss": 7.6347, + "step": 6816 + }, + { + "epoch": 0.6360921899785388, + "grad_norm": 1.0612810343930026, + "learning_rate": 0.00029559674327610236, + "loss": 7.6233, + "step": 6817 + }, + { + "epoch": 0.6361854996734161, + "grad_norm": 0.7290835212896289, + "learning_rate": 0.000295594923873142, + "loss": 7.474, + "step": 6818 + }, + { + "epoch": 0.6362788093682934, + "grad_norm": 1.0311258946347512, + "learning_rate": 0.0002955931040999769, + "loss": 7.3575, + "step": 6819 + }, + { + "epoch": 0.6363721190631707, + "grad_norm": 0.7403656234416243, + "learning_rate": 0.00029559128395661164, + "loss": 6.8234, + "step": 6820 + }, + { + "epoch": 0.636465428758048, + "grad_norm": 0.4476188908241481, + "learning_rate": 0.0002955894634430509, + "loss": 7.002, + "step": 6821 + }, + { + "epoch": 0.6365587384529252, + "grad_norm": 1.8650078279531066, + "learning_rate": 0.0002955876425592992, + "loss": 7.8582, + "step": 6822 + }, + { + "epoch": 0.6366520481478025, + "grad_norm": 0.8068704816094694, + "learning_rate": 0.0002955858213053613, + "loss": 7.1485, + "step": 6823 + }, + { + "epoch": 0.6367453578426798, + "grad_norm": 0.7496164017255279, + "learning_rate": 0.0002955839996812418, + "loss": 7.4539, + "step": 6824 + }, + { + "epoch": 0.6368386675375571, + "grad_norm": 0.5658888136036012, + "learning_rate": 0.00029558217768694525, + "loss": 7.1252, + "step": 6825 + }, + { + "epoch": 0.6369319772324344, + "grad_norm": 0.9159331558155032, + "learning_rate": 0.0002955803553224764, + "loss": 7.0221, + "step": 6826 + }, + { + "epoch": 0.6370252869273118, + "grad_norm": 0.5237828390927762, + "learning_rate": 0.00029557853258783983, + "loss": 7.2604, + "step": 6827 + }, + { + "epoch": 0.6371185966221891, + "grad_norm": 0.7837124428099858, + "learning_rate": 0.00029557670948304017, + "loss": 7.1011, + "step": 6828 + }, + { + "epoch": 0.6372119063170664, + "grad_norm": 4.7493486941168035, + "learning_rate": 0.00029557488600808206, + "loss": 7.6588, + "step": 6829 + }, + { + "epoch": 0.6373052160119437, + "grad_norm": 2.021110184997737, + "learning_rate": 0.0002955730621629701, + "loss": 7.4342, + "step": 6830 + }, + { + "epoch": 0.637398525706821, + "grad_norm": 0.42813552268544836, + "learning_rate": 0.000295571237947709, + "loss": 7.373, + "step": 6831 + }, + { + "epoch": 0.6374918354016982, + "grad_norm": 0.8670018726520881, + "learning_rate": 0.0002955694133623034, + "loss": 7.3623, + "step": 6832 + }, + { + "epoch": 0.6375851450965755, + "grad_norm": 1.4629319739206748, + "learning_rate": 0.0002955675884067579, + "loss": 6.9556, + "step": 6833 + }, + { + "epoch": 0.6376784547914528, + "grad_norm": 0.6938416501654899, + "learning_rate": 0.0002955657630810771, + "loss": 7.371, + "step": 6834 + }, + { + "epoch": 0.6377717644863301, + "grad_norm": 0.8883902282583633, + "learning_rate": 0.0002955639373852657, + "loss": 7.1896, + "step": 6835 + }, + { + "epoch": 0.6378650741812074, + "grad_norm": 2.9535820908998724, + "learning_rate": 0.0002955621113193283, + "loss": 7.4243, + "step": 6836 + }, + { + "epoch": 0.6379583838760847, + "grad_norm": 1.6113313794034032, + "learning_rate": 0.0002955602848832696, + "loss": 7.3288, + "step": 6837 + }, + { + "epoch": 0.638051693570962, + "grad_norm": 1.368202026417356, + "learning_rate": 0.00029555845807709423, + "loss": 7.1433, + "step": 6838 + }, + { + "epoch": 0.6381450032658393, + "grad_norm": 2.734436412666859, + "learning_rate": 0.0002955566309008068, + "loss": 7.5503, + "step": 6839 + }, + { + "epoch": 0.6382383129607166, + "grad_norm": 2.2279826973312824, + "learning_rate": 0.000295554803354412, + "loss": 7.6459, + "step": 6840 + }, + { + "epoch": 0.638331622655594, + "grad_norm": 1.225117259281565, + "learning_rate": 0.00029555297543791434, + "loss": 7.7235, + "step": 6841 + }, + { + "epoch": 0.6384249323504713, + "grad_norm": 2.282399167892356, + "learning_rate": 0.00029555114715131865, + "loss": 7.0007, + "step": 6842 + }, + { + "epoch": 0.6385182420453485, + "grad_norm": 2.2272489651790153, + "learning_rate": 0.0002955493184946295, + "loss": 7.2962, + "step": 6843 + }, + { + "epoch": 0.6386115517402258, + "grad_norm": 1.3886970627542903, + "learning_rate": 0.0002955474894678516, + "loss": 7.395, + "step": 6844 + }, + { + "epoch": 0.6387048614351031, + "grad_norm": 24.553790266086587, + "learning_rate": 0.0002955456600709895, + "loss": 7.0605, + "step": 6845 + }, + { + "epoch": 0.6387981711299804, + "grad_norm": 1.165072661282399, + "learning_rate": 0.00029554383030404795, + "loss": 7.3817, + "step": 6846 + }, + { + "epoch": 0.6388914808248577, + "grad_norm": 1.7621711642091646, + "learning_rate": 0.0002955420001670315, + "loss": 7.3683, + "step": 6847 + }, + { + "epoch": 0.638984790519735, + "grad_norm": 1.2516452568809597, + "learning_rate": 0.0002955401696599449, + "loss": 6.7909, + "step": 6848 + }, + { + "epoch": 0.6390781002146123, + "grad_norm": 1.022120899013728, + "learning_rate": 0.0002955383387827927, + "loss": 7.121, + "step": 6849 + }, + { + "epoch": 0.6391714099094896, + "grad_norm": 0.4732546653842149, + "learning_rate": 0.0002955365075355797, + "loss": 7.0881, + "step": 6850 + }, + { + "epoch": 0.6392647196043669, + "grad_norm": 0.5271056978225275, + "learning_rate": 0.0002955346759183104, + "loss": 7.354, + "step": 6851 + }, + { + "epoch": 0.6393580292992442, + "grad_norm": 1.4823094267671408, + "learning_rate": 0.00029553284393098953, + "loss": 6.8908, + "step": 6852 + }, + { + "epoch": 0.6394513389941215, + "grad_norm": 3.365786118994906, + "learning_rate": 0.00029553101157362175, + "loss": 7.0332, + "step": 6853 + }, + { + "epoch": 0.6395446486889987, + "grad_norm": 1.0729605133947948, + "learning_rate": 0.00029552917884621176, + "loss": 7.6586, + "step": 6854 + }, + { + "epoch": 0.639637958383876, + "grad_norm": 4.414780423840307, + "learning_rate": 0.0002955273457487641, + "loss": 7.3948, + "step": 6855 + }, + { + "epoch": 0.6397312680787534, + "grad_norm": 2.475168610705475, + "learning_rate": 0.0002955255122812836, + "loss": 7.3263, + "step": 6856 + }, + { + "epoch": 0.6398245777736307, + "grad_norm": 1.817523117329412, + "learning_rate": 0.0002955236784437748, + "loss": 7.6043, + "step": 6857 + }, + { + "epoch": 0.639917887468508, + "grad_norm": 1.159737304552165, + "learning_rate": 0.0002955218442362424, + "loss": 7.6774, + "step": 6858 + }, + { + "epoch": 0.6400111971633853, + "grad_norm": 1.6979677546763186, + "learning_rate": 0.00029552000965869107, + "loss": 7.1592, + "step": 6859 + }, + { + "epoch": 0.6401045068582626, + "grad_norm": 1.136789932952597, + "learning_rate": 0.00029551817471112543, + "loss": 7.3925, + "step": 6860 + }, + { + "epoch": 0.6401978165531399, + "grad_norm": 1.5882105094383312, + "learning_rate": 0.00029551633939355016, + "loss": 6.7988, + "step": 6861 + }, + { + "epoch": 0.6402911262480172, + "grad_norm": 2.4364416825458024, + "learning_rate": 0.00029551450370597, + "loss": 6.9803, + "step": 6862 + }, + { + "epoch": 0.6403844359428945, + "grad_norm": 1.6744697378816389, + "learning_rate": 0.0002955126676483895, + "loss": 7.3374, + "step": 6863 + }, + { + "epoch": 0.6404777456377717, + "grad_norm": 0.7194947352166166, + "learning_rate": 0.0002955108312208134, + "loss": 7.3452, + "step": 6864 + }, + { + "epoch": 0.640571055332649, + "grad_norm": 0.8186493360784606, + "learning_rate": 0.00029550899442324643, + "loss": 7.2641, + "step": 6865 + }, + { + "epoch": 0.6406643650275263, + "grad_norm": 1.3982282991045771, + "learning_rate": 0.00029550715725569314, + "loss": 7.2783, + "step": 6866 + }, + { + "epoch": 0.6407576747224036, + "grad_norm": 2.7616742547013784, + "learning_rate": 0.0002955053197181582, + "loss": 7.28, + "step": 6867 + }, + { + "epoch": 0.6408509844172809, + "grad_norm": 1.3603537515703512, + "learning_rate": 0.0002955034818106464, + "loss": 7.2628, + "step": 6868 + }, + { + "epoch": 0.6409442941121583, + "grad_norm": 0.9093841587718428, + "learning_rate": 0.0002955016435331623, + "loss": 6.9396, + "step": 6869 + }, + { + "epoch": 0.6410376038070356, + "grad_norm": 1.7926179615860547, + "learning_rate": 0.00029549980488571066, + "loss": 7.3141, + "step": 6870 + }, + { + "epoch": 0.6411309135019129, + "grad_norm": 2.5905414684392194, + "learning_rate": 0.00029549796586829613, + "loss": 7.2183, + "step": 6871 + }, + { + "epoch": 0.6412242231967902, + "grad_norm": 1.2616307243529, + "learning_rate": 0.0002954961264809233, + "loss": 7.2083, + "step": 6872 + }, + { + "epoch": 0.6413175328916675, + "grad_norm": 1.415849940616505, + "learning_rate": 0.000295494286723597, + "loss": 7.1246, + "step": 6873 + }, + { + "epoch": 0.6414108425865448, + "grad_norm": 0.660906814674425, + "learning_rate": 0.0002954924465963218, + "loss": 7.3235, + "step": 6874 + }, + { + "epoch": 0.641504152281422, + "grad_norm": 3.5366475265072586, + "learning_rate": 0.00029549060609910236, + "loss": 7.2331, + "step": 6875 + }, + { + "epoch": 0.6415974619762993, + "grad_norm": 1.355722819787944, + "learning_rate": 0.00029548876523194346, + "loss": 7.4675, + "step": 6876 + }, + { + "epoch": 0.6416907716711766, + "grad_norm": 4.070936395569061, + "learning_rate": 0.00029548692399484973, + "loss": 7.3796, + "step": 6877 + }, + { + "epoch": 0.6417840813660539, + "grad_norm": 1.1334040559599763, + "learning_rate": 0.00029548508238782584, + "loss": 7.1742, + "step": 6878 + }, + { + "epoch": 0.6418773910609312, + "grad_norm": 4.055837251719046, + "learning_rate": 0.0002954832404108765, + "loss": 7.1313, + "step": 6879 + }, + { + "epoch": 0.6419707007558085, + "grad_norm": 3.089068700390468, + "learning_rate": 0.00029548139806400634, + "loss": 7.3071, + "step": 6880 + }, + { + "epoch": 0.6420640104506858, + "grad_norm": 1.167991322240001, + "learning_rate": 0.00029547955534722005, + "loss": 7.2004, + "step": 6881 + }, + { + "epoch": 0.6421573201455631, + "grad_norm": 1.4329008950028606, + "learning_rate": 0.0002954777122605224, + "loss": 7.4837, + "step": 6882 + }, + { + "epoch": 0.6422506298404405, + "grad_norm": 0.8334363713797022, + "learning_rate": 0.00029547586880391804, + "loss": 7.4751, + "step": 6883 + }, + { + "epoch": 0.6423439395353178, + "grad_norm": 20.026609732805852, + "learning_rate": 0.00029547402497741163, + "loss": 6.9247, + "step": 6884 + }, + { + "epoch": 0.642437249230195, + "grad_norm": 4.900532232362919, + "learning_rate": 0.0002954721807810078, + "loss": 7.1534, + "step": 6885 + }, + { + "epoch": 0.6425305589250723, + "grad_norm": 8.535573098884768, + "learning_rate": 0.00029547033621471133, + "loss": 7.1341, + "step": 6886 + }, + { + "epoch": 0.6426238686199496, + "grad_norm": 5.046573162274151, + "learning_rate": 0.00029546849127852693, + "loss": 7.3998, + "step": 6887 + }, + { + "epoch": 0.6427171783148269, + "grad_norm": 2.4956608852724194, + "learning_rate": 0.00029546664597245926, + "loss": 7.1267, + "step": 6888 + }, + { + "epoch": 0.6428104880097042, + "grad_norm": 7.924217281618736, + "learning_rate": 0.000295464800296513, + "loss": 7.0592, + "step": 6889 + }, + { + "epoch": 0.6429037977045815, + "grad_norm": 1.4431913316724805, + "learning_rate": 0.00029546295425069276, + "loss": 7.1053, + "step": 6890 + }, + { + "epoch": 0.6429971073994588, + "grad_norm": 0.5300810120840321, + "learning_rate": 0.00029546110783500336, + "loss": 7.3604, + "step": 6891 + }, + { + "epoch": 0.6430904170943361, + "grad_norm": 1.7361550156958583, + "learning_rate": 0.00029545926104944945, + "loss": 7.1586, + "step": 6892 + }, + { + "epoch": 0.6431837267892134, + "grad_norm": 1.7073964081854458, + "learning_rate": 0.0002954574138940357, + "loss": 7.4918, + "step": 6893 + }, + { + "epoch": 0.6432770364840907, + "grad_norm": 22.16068011397024, + "learning_rate": 0.00029545556636876685, + "loss": 7.1172, + "step": 6894 + }, + { + "epoch": 0.643370346178968, + "grad_norm": 1.766439286172723, + "learning_rate": 0.00029545371847364765, + "loss": 7.2177, + "step": 6895 + }, + { + "epoch": 0.6434636558738452, + "grad_norm": 1.620787252423054, + "learning_rate": 0.0002954518702086827, + "loss": 7.3886, + "step": 6896 + }, + { + "epoch": 0.6435569655687225, + "grad_norm": 1.4022527473762785, + "learning_rate": 0.00029545002157387664, + "loss": 7.3152, + "step": 6897 + }, + { + "epoch": 0.6436502752635999, + "grad_norm": 2.426754441482777, + "learning_rate": 0.00029544817256923437, + "loss": 7.4509, + "step": 6898 + }, + { + "epoch": 0.6437435849584772, + "grad_norm": 17.254837634503772, + "learning_rate": 0.0002954463231947604, + "loss": 7.3071, + "step": 6899 + }, + { + "epoch": 0.6438368946533545, + "grad_norm": 2.336961712109559, + "learning_rate": 0.0002954444734504596, + "loss": 7.4162, + "step": 6900 + }, + { + "epoch": 0.6439302043482318, + "grad_norm": 2.0309093448562283, + "learning_rate": 0.0002954426233363365, + "loss": 7.3332, + "step": 6901 + }, + { + "epoch": 0.6440235140431091, + "grad_norm": 2.357890851460656, + "learning_rate": 0.00029544077285239596, + "loss": 7.5639, + "step": 6902 + }, + { + "epoch": 0.6441168237379864, + "grad_norm": 226.44256360315723, + "learning_rate": 0.0002954389219986426, + "loss": 7.2068, + "step": 6903 + }, + { + "epoch": 0.6442101334328637, + "grad_norm": 8.37799811181858, + "learning_rate": 0.0002954370707750811, + "loss": 8.1899, + "step": 6904 + }, + { + "epoch": 0.644303443127741, + "grad_norm": 1.64241168158837, + "learning_rate": 0.00029543521918171627, + "loss": 7.5394, + "step": 6905 + }, + { + "epoch": 0.6443967528226183, + "grad_norm": 13.088963345402714, + "learning_rate": 0.0002954333672185527, + "loss": 8.4172, + "step": 6906 + }, + { + "epoch": 0.6444900625174955, + "grad_norm": 8.997139383590257, + "learning_rate": 0.00029543151488559523, + "loss": 8.2774, + "step": 6907 + }, + { + "epoch": 0.6445833722123728, + "grad_norm": 4.722161868823934, + "learning_rate": 0.0002954296621828485, + "loss": 8.1547, + "step": 6908 + }, + { + "epoch": 0.6446766819072501, + "grad_norm": 3.076106875590823, + "learning_rate": 0.0002954278091103171, + "loss": 7.2839, + "step": 6909 + }, + { + "epoch": 0.6447699916021274, + "grad_norm": 4.020304702917531, + "learning_rate": 0.000295425955668006, + "loss": 8.2638, + "step": 6910 + }, + { + "epoch": 0.6448633012970048, + "grad_norm": 3.9130450842394975, + "learning_rate": 0.0002954241018559197, + "loss": 8.0618, + "step": 6911 + }, + { + "epoch": 0.6449566109918821, + "grad_norm": 5.360814393947799, + "learning_rate": 0.000295422247674063, + "loss": 8.4548, + "step": 6912 + }, + { + "epoch": 0.6450499206867594, + "grad_norm": 4.934797773044187, + "learning_rate": 0.0002954203931224406, + "loss": 8.3587, + "step": 6913 + }, + { + "epoch": 0.6451432303816367, + "grad_norm": 19.057504447798696, + "learning_rate": 0.0002954185382010572, + "loss": 8.2144, + "step": 6914 + }, + { + "epoch": 0.645236540076514, + "grad_norm": 5.238028339668806, + "learning_rate": 0.0002954166829099176, + "loss": 8.166, + "step": 6915 + }, + { + "epoch": 0.6453298497713913, + "grad_norm": 5.8050633458403595, + "learning_rate": 0.00029541482724902636, + "loss": 8.1667, + "step": 6916 + }, + { + "epoch": 0.6454231594662685, + "grad_norm": 6.343234409506036, + "learning_rate": 0.0002954129712183883, + "loss": 8.1233, + "step": 6917 + }, + { + "epoch": 0.6455164691611458, + "grad_norm": 4.556925948068755, + "learning_rate": 0.00029541111481800816, + "loss": 7.8918, + "step": 6918 + }, + { + "epoch": 0.6456097788560231, + "grad_norm": 5.15401181241479, + "learning_rate": 0.0002954092580478907, + "loss": 7.6878, + "step": 6919 + }, + { + "epoch": 0.6457030885509004, + "grad_norm": 3.5186159956833025, + "learning_rate": 0.0002954074009080405, + "loss": 7.6489, + "step": 6920 + }, + { + "epoch": 0.6457963982457777, + "grad_norm": 3.6382286219500126, + "learning_rate": 0.00029540554339846234, + "loss": 7.5922, + "step": 6921 + }, + { + "epoch": 0.645889707940655, + "grad_norm": 8.78922000269322, + "learning_rate": 0.00029540368551916093, + "loss": 7.9749, + "step": 6922 + }, + { + "epoch": 0.6459830176355323, + "grad_norm": 10.038441962021968, + "learning_rate": 0.0002954018272701411, + "loss": 7.7799, + "step": 6923 + }, + { + "epoch": 0.6460763273304096, + "grad_norm": 5.065298236238662, + "learning_rate": 0.0002953999686514074, + "loss": 7.6138, + "step": 6924 + }, + { + "epoch": 0.646169637025287, + "grad_norm": 1.9567028146328622, + "learning_rate": 0.00029539810966296474, + "loss": 7.561, + "step": 6925 + }, + { + "epoch": 0.6462629467201643, + "grad_norm": 2.3623893785499637, + "learning_rate": 0.00029539625030481774, + "loss": 7.6754, + "step": 6926 + }, + { + "epoch": 0.6463562564150416, + "grad_norm": 3.128167452606714, + "learning_rate": 0.0002953943905769711, + "loss": 7.8591, + "step": 6927 + }, + { + "epoch": 0.6464495661099188, + "grad_norm": 4.163540460558416, + "learning_rate": 0.0002953925304794296, + "loss": 7.7866, + "step": 6928 + }, + { + "epoch": 0.6465428758047961, + "grad_norm": 4.798254287982123, + "learning_rate": 0.000295390670012198, + "loss": 7.7609, + "step": 6929 + }, + { + "epoch": 0.6466361854996734, + "grad_norm": 1.6315215227711601, + "learning_rate": 0.000295388809175281, + "loss": 7.4812, + "step": 6930 + }, + { + "epoch": 0.6467294951945507, + "grad_norm": 4.632694375013199, + "learning_rate": 0.0002953869479686833, + "loss": 7.5811, + "step": 6931 + }, + { + "epoch": 0.646822804889428, + "grad_norm": 3.745601439308481, + "learning_rate": 0.0002953850863924096, + "loss": 7.6779, + "step": 6932 + }, + { + "epoch": 0.6469161145843053, + "grad_norm": 2.553264418244648, + "learning_rate": 0.0002953832244464648, + "loss": 7.5311, + "step": 6933 + }, + { + "epoch": 0.6470094242791826, + "grad_norm": 2.012687360726176, + "learning_rate": 0.00029538136213085346, + "loss": 7.3715, + "step": 6934 + }, + { + "epoch": 0.6471027339740599, + "grad_norm": 3.5972972802265946, + "learning_rate": 0.00029537949944558034, + "loss": 7.5101, + "step": 6935 + }, + { + "epoch": 0.6471960436689372, + "grad_norm": 4.8177877014722394, + "learning_rate": 0.0002953776363906503, + "loss": 7.6712, + "step": 6936 + }, + { + "epoch": 0.6472893533638145, + "grad_norm": 4.924268051412997, + "learning_rate": 0.000295375772966068, + "loss": 7.5976, + "step": 6937 + }, + { + "epoch": 0.6473826630586917, + "grad_norm": 3.287208055366615, + "learning_rate": 0.0002953739091718381, + "loss": 7.3491, + "step": 6938 + }, + { + "epoch": 0.647475972753569, + "grad_norm": 1.12015756869055, + "learning_rate": 0.00029537204500796547, + "loss": 7.5833, + "step": 6939 + }, + { + "epoch": 0.6475692824484464, + "grad_norm": 1.6940379803286199, + "learning_rate": 0.0002953701804744548, + "loss": 7.5182, + "step": 6940 + }, + { + "epoch": 0.6476625921433237, + "grad_norm": 2.1214593140214313, + "learning_rate": 0.00029536831557131084, + "loss": 7.6587, + "step": 6941 + }, + { + "epoch": 0.647755901838201, + "grad_norm": 3.25408572386153, + "learning_rate": 0.00029536645029853823, + "loss": 7.6218, + "step": 6942 + }, + { + "epoch": 0.6478492115330783, + "grad_norm": 2.976895536702646, + "learning_rate": 0.00029536458465614186, + "loss": 7.6554, + "step": 6943 + }, + { + "epoch": 0.6479425212279556, + "grad_norm": 3.068492605471979, + "learning_rate": 0.0002953627186441264, + "loss": 7.2572, + "step": 6944 + }, + { + "epoch": 0.6480358309228329, + "grad_norm": 1.2565302246159882, + "learning_rate": 0.00029536085226249657, + "loss": 7.4546, + "step": 6945 + }, + { + "epoch": 0.6481291406177102, + "grad_norm": 1.5001386724357229, + "learning_rate": 0.0002953589855112572, + "loss": 7.7741, + "step": 6946 + }, + { + "epoch": 0.6482224503125875, + "grad_norm": 3.7639224192241394, + "learning_rate": 0.0002953571183904129, + "loss": 8.0801, + "step": 6947 + }, + { + "epoch": 0.6483157600074648, + "grad_norm": 1.5904177087924118, + "learning_rate": 0.00029535525089996863, + "loss": 7.3515, + "step": 6948 + }, + { + "epoch": 0.648409069702342, + "grad_norm": 1.0590689868269203, + "learning_rate": 0.000295353383039929, + "loss": 7.5618, + "step": 6949 + }, + { + "epoch": 0.6485023793972193, + "grad_norm": 1.4971803110862711, + "learning_rate": 0.0002953515148102987, + "loss": 7.6191, + "step": 6950 + }, + { + "epoch": 0.6485956890920966, + "grad_norm": 1.7751166153790805, + "learning_rate": 0.0002953496462110826, + "loss": 7.3843, + "step": 6951 + }, + { + "epoch": 0.648688998786974, + "grad_norm": 1.814029247332023, + "learning_rate": 0.0002953477772422854, + "loss": 7.6158, + "step": 6952 + }, + { + "epoch": 0.6487823084818513, + "grad_norm": 1.4171629454357906, + "learning_rate": 0.00029534590790391185, + "loss": 7.682, + "step": 6953 + }, + { + "epoch": 0.6488756181767286, + "grad_norm": 1.8600927493702422, + "learning_rate": 0.0002953440381959667, + "loss": 7.6086, + "step": 6954 + }, + { + "epoch": 0.6489689278716059, + "grad_norm": 1.28055806675688, + "learning_rate": 0.00029534216811845474, + "loss": 7.6768, + "step": 6955 + }, + { + "epoch": 0.6490622375664832, + "grad_norm": 1.5458425143640022, + "learning_rate": 0.00029534029767138066, + "loss": 7.1993, + "step": 6956 + }, + { + "epoch": 0.6491555472613605, + "grad_norm": 2.820888352464006, + "learning_rate": 0.00029533842685474927, + "loss": 7.7847, + "step": 6957 + }, + { + "epoch": 0.6492488569562378, + "grad_norm": 1.3762204053635294, + "learning_rate": 0.00029533655566856535, + "loss": 7.6028, + "step": 6958 + }, + { + "epoch": 0.6493421666511151, + "grad_norm": 0.8813305181093992, + "learning_rate": 0.00029533468411283356, + "loss": 7.3238, + "step": 6959 + }, + { + "epoch": 0.6494354763459923, + "grad_norm": 1.0822577725238733, + "learning_rate": 0.0002953328121875588, + "loss": 7.2441, + "step": 6960 + }, + { + "epoch": 0.6495287860408696, + "grad_norm": 1.2891373812443205, + "learning_rate": 0.00029533093989274567, + "loss": 7.3361, + "step": 6961 + }, + { + "epoch": 0.6496220957357469, + "grad_norm": 1.8681348782278278, + "learning_rate": 0.00029532906722839905, + "loss": 7.2401, + "step": 6962 + }, + { + "epoch": 0.6497154054306242, + "grad_norm": 0.8591810557758727, + "learning_rate": 0.00029532719419452365, + "loss": 7.0782, + "step": 6963 + }, + { + "epoch": 0.6498087151255015, + "grad_norm": 1.2080069062454486, + "learning_rate": 0.0002953253207911243, + "loss": 7.3982, + "step": 6964 + }, + { + "epoch": 0.6499020248203788, + "grad_norm": 1.6774997568809447, + "learning_rate": 0.0002953234470182056, + "loss": 7.5713, + "step": 6965 + }, + { + "epoch": 0.6499953345152562, + "grad_norm": 0.7884329866698991, + "learning_rate": 0.0002953215728757725, + "loss": 7.042, + "step": 6966 + }, + { + "epoch": 0.6500886442101335, + "grad_norm": 1.1339941768696409, + "learning_rate": 0.0002953196983638297, + "loss": 7.4481, + "step": 6967 + }, + { + "epoch": 0.6501819539050108, + "grad_norm": 1.1137418223746922, + "learning_rate": 0.0002953178234823819, + "loss": 7.2881, + "step": 6968 + }, + { + "epoch": 0.6502752635998881, + "grad_norm": 0.9535451883153725, + "learning_rate": 0.00029531594823143395, + "loss": 7.3301, + "step": 6969 + }, + { + "epoch": 0.6503685732947653, + "grad_norm": 1.1378049909050254, + "learning_rate": 0.0002953140726109906, + "loss": 7.1224, + "step": 6970 + }, + { + "epoch": 0.6504618829896426, + "grad_norm": 0.8072119397258549, + "learning_rate": 0.00029531219662105656, + "loss": 7.108, + "step": 6971 + }, + { + "epoch": 0.6505551926845199, + "grad_norm": 1.7457731503061324, + "learning_rate": 0.0002953103202616367, + "loss": 7.7083, + "step": 6972 + }, + { + "epoch": 0.6506485023793972, + "grad_norm": 1.2641907597071746, + "learning_rate": 0.0002953084435327357, + "loss": 7.5792, + "step": 6973 + }, + { + "epoch": 0.6507418120742745, + "grad_norm": 1.6509752008425786, + "learning_rate": 0.0002953065664343584, + "loss": 7.8191, + "step": 6974 + }, + { + "epoch": 0.6508351217691518, + "grad_norm": 1.3193790421160556, + "learning_rate": 0.0002953046889665095, + "loss": 7.3072, + "step": 6975 + }, + { + "epoch": 0.6509284314640291, + "grad_norm": 1.7717374573285891, + "learning_rate": 0.00029530281112919387, + "loss": 6.9895, + "step": 6976 + }, + { + "epoch": 0.6510217411589064, + "grad_norm": 10.355231180744703, + "learning_rate": 0.00029530093292241626, + "loss": 7.2521, + "step": 6977 + }, + { + "epoch": 0.6511150508537837, + "grad_norm": 0.7768583318960253, + "learning_rate": 0.00029529905434618137, + "loss": 7.4117, + "step": 6978 + }, + { + "epoch": 0.651208360548661, + "grad_norm": 1.713592618570852, + "learning_rate": 0.000295297175400494, + "loss": 7.2794, + "step": 6979 + }, + { + "epoch": 0.6513016702435384, + "grad_norm": 1.260831417207902, + "learning_rate": 0.00029529529608535903, + "loss": 7.2539, + "step": 6980 + }, + { + "epoch": 0.6513949799384156, + "grad_norm": 1.5973469626911196, + "learning_rate": 0.0002952934164007811, + "loss": 7.685, + "step": 6981 + }, + { + "epoch": 0.6514882896332929, + "grad_norm": 0.5869918271106945, + "learning_rate": 0.00029529153634676506, + "loss": 7.2526, + "step": 6982 + }, + { + "epoch": 0.6515815993281702, + "grad_norm": 0.7141960540019713, + "learning_rate": 0.0002952896559233157, + "loss": 7.5474, + "step": 6983 + }, + { + "epoch": 0.6516749090230475, + "grad_norm": 1.5029167952074745, + "learning_rate": 0.0002952877751304378, + "loss": 7.7615, + "step": 6984 + }, + { + "epoch": 0.6517682187179248, + "grad_norm": 1.6271680931783972, + "learning_rate": 0.00029528589396813607, + "loss": 7.2029, + "step": 6985 + }, + { + "epoch": 0.6518615284128021, + "grad_norm": 1.550864445890939, + "learning_rate": 0.00029528401243641533, + "loss": 7.0717, + "step": 6986 + }, + { + "epoch": 0.6519548381076794, + "grad_norm": 0.7294434523345097, + "learning_rate": 0.00029528213053528046, + "loss": 7.3594, + "step": 6987 + }, + { + "epoch": 0.6520481478025567, + "grad_norm": 0.8449134697055263, + "learning_rate": 0.0002952802482647361, + "loss": 7.3714, + "step": 6988 + }, + { + "epoch": 0.652141457497434, + "grad_norm": 2.306991450326655, + "learning_rate": 0.0002952783656247872, + "loss": 7.5889, + "step": 6989 + }, + { + "epoch": 0.6522347671923113, + "grad_norm": 0.7775054669299898, + "learning_rate": 0.00029527648261543836, + "loss": 7.0524, + "step": 6990 + }, + { + "epoch": 0.6523280768871885, + "grad_norm": 0.7251678729366068, + "learning_rate": 0.00029527459923669455, + "loss": 7.1389, + "step": 6991 + }, + { + "epoch": 0.6524213865820658, + "grad_norm": 0.6170638467846519, + "learning_rate": 0.0002952727154885604, + "loss": 7.3003, + "step": 6992 + }, + { + "epoch": 0.6525146962769431, + "grad_norm": 0.9638824408525852, + "learning_rate": 0.00029527083137104073, + "loss": 7.1411, + "step": 6993 + }, + { + "epoch": 0.6526080059718204, + "grad_norm": 1.0506088000853153, + "learning_rate": 0.00029526894688414044, + "loss": 7.2304, + "step": 6994 + }, + { + "epoch": 0.6527013156666978, + "grad_norm": 0.6483626199362365, + "learning_rate": 0.00029526706202786417, + "loss": 7.2844, + "step": 6995 + }, + { + "epoch": 0.6527946253615751, + "grad_norm": 0.9997342108086374, + "learning_rate": 0.00029526517680221687, + "loss": 7.2845, + "step": 6996 + }, + { + "epoch": 0.6528879350564524, + "grad_norm": 8.186235108406569, + "learning_rate": 0.00029526329120720324, + "loss": 7.4134, + "step": 6997 + }, + { + "epoch": 0.6529812447513297, + "grad_norm": 1.0665541913862935, + "learning_rate": 0.0002952614052428281, + "loss": 7.4576, + "step": 6998 + }, + { + "epoch": 0.653074554446207, + "grad_norm": 0.816895227096294, + "learning_rate": 0.0002952595189090962, + "loss": 7.3443, + "step": 6999 + }, + { + "epoch": 0.6531678641410843, + "grad_norm": 1.1972767421507402, + "learning_rate": 0.0002952576322060124, + "loss": 7.081, + "step": 7000 + }, + { + "epoch": 0.6532611738359616, + "grad_norm": 0.9266608273105088, + "learning_rate": 0.0002952557451335815, + "loss": 7.3365, + "step": 7001 + }, + { + "epoch": 0.6533544835308388, + "grad_norm": 0.7502454886806174, + "learning_rate": 0.0002952538576918082, + "loss": 7.1232, + "step": 7002 + }, + { + "epoch": 0.6534477932257161, + "grad_norm": 0.6853780980361247, + "learning_rate": 0.0002952519698806974, + "loss": 7.0549, + "step": 7003 + }, + { + "epoch": 0.6535411029205934, + "grad_norm": 0.9914686523083488, + "learning_rate": 0.00029525008170025387, + "loss": 7.3531, + "step": 7004 + }, + { + "epoch": 0.6536344126154707, + "grad_norm": 5.0259407336201845, + "learning_rate": 0.00029524819315048233, + "loss": 7.6021, + "step": 7005 + }, + { + "epoch": 0.653727722310348, + "grad_norm": 1.3372468793724501, + "learning_rate": 0.0002952463042313878, + "loss": 7.3152, + "step": 7006 + }, + { + "epoch": 0.6538210320052253, + "grad_norm": 0.6089728576672527, + "learning_rate": 0.00029524441494297484, + "loss": 7.0841, + "step": 7007 + }, + { + "epoch": 0.6539143417001027, + "grad_norm": 0.623108387351855, + "learning_rate": 0.00029524252528524843, + "loss": 7.3008, + "step": 7008 + }, + { + "epoch": 0.65400765139498, + "grad_norm": 1.4148042262862144, + "learning_rate": 0.0002952406352582133, + "loss": 7.202, + "step": 7009 + }, + { + "epoch": 0.6541009610898573, + "grad_norm": 0.7739680147666296, + "learning_rate": 0.0002952387448618742, + "loss": 7.6592, + "step": 7010 + }, + { + "epoch": 0.6541942707847346, + "grad_norm": 314.5004716444082, + "learning_rate": 0.000295236854096236, + "loss": 7.1542, + "step": 7011 + }, + { + "epoch": 0.6542875804796119, + "grad_norm": 0.8457212990242523, + "learning_rate": 0.00029523496296130354, + "loss": 7.1828, + "step": 7012 + }, + { + "epoch": 0.6543808901744891, + "grad_norm": 1.5769177199932476, + "learning_rate": 0.00029523307145708157, + "loss": 7.1925, + "step": 7013 + }, + { + "epoch": 0.6544741998693664, + "grad_norm": 1.3137327238541765, + "learning_rate": 0.0002952311795835749, + "loss": 7.3387, + "step": 7014 + }, + { + "epoch": 0.6545675095642437, + "grad_norm": 1.2086368685299242, + "learning_rate": 0.0002952292873407884, + "loss": 7.3933, + "step": 7015 + }, + { + "epoch": 0.654660819259121, + "grad_norm": 1.265031768206294, + "learning_rate": 0.0002952273947287268, + "loss": 7.2222, + "step": 7016 + }, + { + "epoch": 0.6547541289539983, + "grad_norm": 0.7090355773507366, + "learning_rate": 0.000295225501747395, + "loss": 7.4279, + "step": 7017 + }, + { + "epoch": 0.6548474386488756, + "grad_norm": 1.4394949293833985, + "learning_rate": 0.00029522360839679773, + "loss": 6.9701, + "step": 7018 + }, + { + "epoch": 0.6549407483437529, + "grad_norm": 1.0830794418015413, + "learning_rate": 0.0002952217146769399, + "loss": 7.2847, + "step": 7019 + }, + { + "epoch": 0.6550340580386302, + "grad_norm": 1.028376120426434, + "learning_rate": 0.0002952198205878262, + "loss": 7.45, + "step": 7020 + }, + { + "epoch": 0.6551273677335075, + "grad_norm": 0.7486895008726557, + "learning_rate": 0.00029521792612946155, + "loss": 7.353, + "step": 7021 + }, + { + "epoch": 0.6552206774283849, + "grad_norm": 0.5758804968233605, + "learning_rate": 0.0002952160313018507, + "loss": 7.3763, + "step": 7022 + }, + { + "epoch": 0.655313987123262, + "grad_norm": 0.8509679541436085, + "learning_rate": 0.00029521413610499855, + "loss": 7.1949, + "step": 7023 + }, + { + "epoch": 0.6554072968181394, + "grad_norm": 0.8543277987264026, + "learning_rate": 0.00029521224053890985, + "loss": 7.4038, + "step": 7024 + }, + { + "epoch": 0.6555006065130167, + "grad_norm": 0.7027931166540171, + "learning_rate": 0.0002952103446035894, + "loss": 7.4523, + "step": 7025 + }, + { + "epoch": 0.655593916207894, + "grad_norm": 0.8180972404611299, + "learning_rate": 0.0002952084482990421, + "loss": 7.6719, + "step": 7026 + }, + { + "epoch": 0.6556872259027713, + "grad_norm": 1.1618958256295435, + "learning_rate": 0.0002952065516252727, + "loss": 7.4099, + "step": 7027 + }, + { + "epoch": 0.6557805355976486, + "grad_norm": 1.2222682682133559, + "learning_rate": 0.0002952046545822861, + "loss": 7.2882, + "step": 7028 + }, + { + "epoch": 0.6558738452925259, + "grad_norm": 0.7723059936482717, + "learning_rate": 0.000295202757170087, + "loss": 7.0965, + "step": 7029 + }, + { + "epoch": 0.6559671549874032, + "grad_norm": 0.43455489005477443, + "learning_rate": 0.00029520085938868036, + "loss": 7.3289, + "step": 7030 + }, + { + "epoch": 0.6560604646822805, + "grad_norm": 1.5093454910566537, + "learning_rate": 0.00029519896123807095, + "loss": 7.5077, + "step": 7031 + }, + { + "epoch": 0.6561537743771578, + "grad_norm": 0.69637956528568, + "learning_rate": 0.00029519706271826355, + "loss": 7.2421, + "step": 7032 + }, + { + "epoch": 0.6562470840720351, + "grad_norm": 0.747927103683333, + "learning_rate": 0.00029519516382926303, + "loss": 7.2752, + "step": 7033 + }, + { + "epoch": 0.6563403937669123, + "grad_norm": 0.5809172455006034, + "learning_rate": 0.0002951932645710742, + "loss": 7.2408, + "step": 7034 + }, + { + "epoch": 0.6564337034617896, + "grad_norm": 0.7214535652604706, + "learning_rate": 0.000295191364943702, + "loss": 7.2797, + "step": 7035 + }, + { + "epoch": 0.656527013156667, + "grad_norm": 1.057851249694541, + "learning_rate": 0.00029518946494715116, + "loss": 7.1012, + "step": 7036 + }, + { + "epoch": 0.6566203228515443, + "grad_norm": 0.8047816664578014, + "learning_rate": 0.0002951875645814264, + "loss": 7.4039, + "step": 7037 + }, + { + "epoch": 0.6567136325464216, + "grad_norm": 0.7875844952766745, + "learning_rate": 0.00029518566384653276, + "loss": 7.2324, + "step": 7038 + }, + { + "epoch": 0.6568069422412989, + "grad_norm": 3.103922692142533, + "learning_rate": 0.000295183762742475, + "loss": 7.2082, + "step": 7039 + }, + { + "epoch": 0.6569002519361762, + "grad_norm": 1.3365547644260931, + "learning_rate": 0.0002951818612692579, + "loss": 7.6614, + "step": 7040 + }, + { + "epoch": 0.6569935616310535, + "grad_norm": 0.6423935280394985, + "learning_rate": 0.0002951799594268863, + "loss": 7.3886, + "step": 7041 + }, + { + "epoch": 0.6570868713259308, + "grad_norm": 6.13263373782843, + "learning_rate": 0.00029517805721536515, + "loss": 7.4037, + "step": 7042 + }, + { + "epoch": 0.6571801810208081, + "grad_norm": 0.766433014738868, + "learning_rate": 0.0002951761546346992, + "loss": 7.2956, + "step": 7043 + }, + { + "epoch": 0.6572734907156853, + "grad_norm": 1.5259071924097012, + "learning_rate": 0.00029517425168489324, + "loss": 7.1149, + "step": 7044 + }, + { + "epoch": 0.6573668004105626, + "grad_norm": 0.8432301027515445, + "learning_rate": 0.00029517234836595215, + "loss": 7.6087, + "step": 7045 + }, + { + "epoch": 0.6574601101054399, + "grad_norm": 0.8725739679160109, + "learning_rate": 0.0002951704446778808, + "loss": 7.3986, + "step": 7046 + }, + { + "epoch": 0.6575534198003172, + "grad_norm": 0.9912836499292638, + "learning_rate": 0.000295168540620684, + "loss": 7.1473, + "step": 7047 + }, + { + "epoch": 0.6576467294951945, + "grad_norm": 0.562145869005665, + "learning_rate": 0.0002951666361943667, + "loss": 7.191, + "step": 7048 + }, + { + "epoch": 0.6577400391900718, + "grad_norm": 0.5819106530423387, + "learning_rate": 0.00029516473139893355, + "loss": 7.0522, + "step": 7049 + }, + { + "epoch": 0.6578333488849492, + "grad_norm": 0.7494980029584579, + "learning_rate": 0.0002951628262343895, + "loss": 7.1423, + "step": 7050 + }, + { + "epoch": 0.6579266585798265, + "grad_norm": 0.8862485968893902, + "learning_rate": 0.00029516092070073943, + "loss": 7.4654, + "step": 7051 + }, + { + "epoch": 0.6580199682747038, + "grad_norm": 3.431301953631388, + "learning_rate": 0.00029515901479798805, + "loss": 7.3645, + "step": 7052 + }, + { + "epoch": 0.6581132779695811, + "grad_norm": 2.0176350943872747, + "learning_rate": 0.0002951571085261404, + "loss": 7.2602, + "step": 7053 + }, + { + "epoch": 0.6582065876644584, + "grad_norm": 1.2619826936328276, + "learning_rate": 0.00029515520188520117, + "loss": 7.2319, + "step": 7054 + }, + { + "epoch": 0.6582998973593356, + "grad_norm": 9.471730397067313, + "learning_rate": 0.00029515329487517526, + "loss": 7.4075, + "step": 7055 + }, + { + "epoch": 0.6583932070542129, + "grad_norm": 8.73501794459169, + "learning_rate": 0.00029515138749606747, + "loss": 7.2683, + "step": 7056 + }, + { + "epoch": 0.6584865167490902, + "grad_norm": 0.6496117766794529, + "learning_rate": 0.0002951494797478828, + "loss": 6.976, + "step": 7057 + }, + { + "epoch": 0.6585798264439675, + "grad_norm": 3.0905887642557164, + "learning_rate": 0.0002951475716306259, + "loss": 6.8718, + "step": 7058 + }, + { + "epoch": 0.6586731361388448, + "grad_norm": 0.5451062269295914, + "learning_rate": 0.00029514566314430175, + "loss": 6.968, + "step": 7059 + }, + { + "epoch": 0.6587664458337221, + "grad_norm": 0.8749305474673359, + "learning_rate": 0.0002951437542889152, + "loss": 7.5052, + "step": 7060 + }, + { + "epoch": 0.6588597555285994, + "grad_norm": 0.7421874171219236, + "learning_rate": 0.0002951418450644711, + "loss": 7.1745, + "step": 7061 + }, + { + "epoch": 0.6589530652234767, + "grad_norm": 0.7361449014102732, + "learning_rate": 0.00029513993547097426, + "loss": 7.2916, + "step": 7062 + }, + { + "epoch": 0.659046374918354, + "grad_norm": 0.8317496604827547, + "learning_rate": 0.00029513802550842954, + "loss": 7.1906, + "step": 7063 + }, + { + "epoch": 0.6591396846132314, + "grad_norm": 1.2025202029449762, + "learning_rate": 0.0002951361151768418, + "loss": 7.5279, + "step": 7064 + }, + { + "epoch": 0.6592329943081087, + "grad_norm": 0.6731494987479653, + "learning_rate": 0.00029513420447621597, + "loss": 7.4938, + "step": 7065 + }, + { + "epoch": 0.6593263040029859, + "grad_norm": 0.6972658770300049, + "learning_rate": 0.0002951322934065568, + "loss": 7.4582, + "step": 7066 + }, + { + "epoch": 0.6594196136978632, + "grad_norm": 0.6811703286319052, + "learning_rate": 0.0002951303819678692, + "loss": 7.227, + "step": 7067 + }, + { + "epoch": 0.6595129233927405, + "grad_norm": 0.9514023841172723, + "learning_rate": 0.00029512847016015806, + "loss": 7.2951, + "step": 7068 + }, + { + "epoch": 0.6596062330876178, + "grad_norm": 0.5147335858042436, + "learning_rate": 0.00029512655798342817, + "loss": 7.1167, + "step": 7069 + }, + { + "epoch": 0.6596995427824951, + "grad_norm": 0.6133492342420885, + "learning_rate": 0.0002951246454376845, + "loss": 7.0096, + "step": 7070 + }, + { + "epoch": 0.6597928524773724, + "grad_norm": 0.9337891658938192, + "learning_rate": 0.0002951227325229318, + "loss": 7.1443, + "step": 7071 + }, + { + "epoch": 0.6598861621722497, + "grad_norm": 0.998007512356914, + "learning_rate": 0.00029512081923917493, + "loss": 7.5152, + "step": 7072 + }, + { + "epoch": 0.659979471867127, + "grad_norm": 0.6604779413659899, + "learning_rate": 0.00029511890558641887, + "loss": 7.2995, + "step": 7073 + }, + { + "epoch": 0.6600727815620043, + "grad_norm": 0.886980089376891, + "learning_rate": 0.00029511699156466837, + "loss": 7.1936, + "step": 7074 + }, + { + "epoch": 0.6601660912568816, + "grad_norm": 1.0764140288696025, + "learning_rate": 0.0002951150771739284, + "loss": 6.8604, + "step": 7075 + }, + { + "epoch": 0.6602594009517588, + "grad_norm": 0.48801277763752343, + "learning_rate": 0.0002951131624142038, + "loss": 7.2408, + "step": 7076 + }, + { + "epoch": 0.6603527106466361, + "grad_norm": 0.7343454518292178, + "learning_rate": 0.00029511124728549934, + "loss": 7.0496, + "step": 7077 + }, + { + "epoch": 0.6604460203415135, + "grad_norm": 0.7609961519214078, + "learning_rate": 0.00029510933178782, + "loss": 7.1784, + "step": 7078 + }, + { + "epoch": 0.6605393300363908, + "grad_norm": 0.4134851446783363, + "learning_rate": 0.0002951074159211706, + "loss": 6.9358, + "step": 7079 + }, + { + "epoch": 0.6606326397312681, + "grad_norm": 0.4573551619363148, + "learning_rate": 0.0002951054996855561, + "loss": 7.2465, + "step": 7080 + }, + { + "epoch": 0.6607259494261454, + "grad_norm": 0.9348555933993351, + "learning_rate": 0.0002951035830809812, + "loss": 6.9399, + "step": 7081 + }, + { + "epoch": 0.6608192591210227, + "grad_norm": 0.9321593119810874, + "learning_rate": 0.0002951016661074509, + "loss": 6.9475, + "step": 7082 + }, + { + "epoch": 0.6609125688159, + "grad_norm": 0.9508723505131267, + "learning_rate": 0.00029509974876497006, + "loss": 7.236, + "step": 7083 + }, + { + "epoch": 0.6610058785107773, + "grad_norm": 0.5043444715784565, + "learning_rate": 0.0002950978310535435, + "loss": 6.7502, + "step": 7084 + }, + { + "epoch": 0.6610991882056546, + "grad_norm": 1.2995349251151715, + "learning_rate": 0.00029509591297317616, + "loss": 7.1464, + "step": 7085 + }, + { + "epoch": 0.6611924979005319, + "grad_norm": 0.9077267644189002, + "learning_rate": 0.00029509399452387294, + "loss": 7.0623, + "step": 7086 + }, + { + "epoch": 0.6612858075954091, + "grad_norm": 0.6109642502643154, + "learning_rate": 0.00029509207570563864, + "loss": 7.2876, + "step": 7087 + }, + { + "epoch": 0.6613791172902864, + "grad_norm": 1.1310570391308419, + "learning_rate": 0.00029509015651847816, + "loss": 7.2059, + "step": 7088 + }, + { + "epoch": 0.6614724269851637, + "grad_norm": 0.9919641491962867, + "learning_rate": 0.00029508823696239636, + "loss": 7.2853, + "step": 7089 + }, + { + "epoch": 0.661565736680041, + "grad_norm": 0.961091726769661, + "learning_rate": 0.0002950863170373982, + "loss": 7.1102, + "step": 7090 + }, + { + "epoch": 0.6616590463749183, + "grad_norm": 0.5138316673065617, + "learning_rate": 0.0002950843967434885, + "loss": 7.0445, + "step": 7091 + }, + { + "epoch": 0.6617523560697957, + "grad_norm": 2.0715793602126102, + "learning_rate": 0.0002950824760806721, + "loss": 7.2036, + "step": 7092 + }, + { + "epoch": 0.661845665764673, + "grad_norm": 0.6072334614360295, + "learning_rate": 0.00029508055504895404, + "loss": 6.9356, + "step": 7093 + }, + { + "epoch": 0.6619389754595503, + "grad_norm": 0.6661421353287867, + "learning_rate": 0.0002950786336483391, + "loss": 6.6857, + "step": 7094 + }, + { + "epoch": 0.6620322851544276, + "grad_norm": 0.651011537028933, + "learning_rate": 0.0002950767118788321, + "loss": 6.9998, + "step": 7095 + }, + { + "epoch": 0.6621255948493049, + "grad_norm": 0.7802520179711563, + "learning_rate": 0.000295074789740438, + "loss": 6.8435, + "step": 7096 + }, + { + "epoch": 0.6622189045441821, + "grad_norm": 0.5383783805682751, + "learning_rate": 0.00029507286723316176, + "loss": 7.0268, + "step": 7097 + }, + { + "epoch": 0.6623122142390594, + "grad_norm": 0.843889999187487, + "learning_rate": 0.0002950709443570081, + "loss": 7.1942, + "step": 7098 + }, + { + "epoch": 0.6624055239339367, + "grad_norm": 0.5738038808212851, + "learning_rate": 0.0002950690211119821, + "loss": 7.2757, + "step": 7099 + }, + { + "epoch": 0.662498833628814, + "grad_norm": 0.5939922738320024, + "learning_rate": 0.00029506709749808847, + "loss": 6.8526, + "step": 7100 + }, + { + "epoch": 0.6625921433236913, + "grad_norm": 0.7623494082185356, + "learning_rate": 0.0002950651735153322, + "loss": 6.6513, + "step": 7101 + }, + { + "epoch": 0.6626854530185686, + "grad_norm": 0.5909271128186431, + "learning_rate": 0.00029506324916371817, + "loss": 6.8167, + "step": 7102 + }, + { + "epoch": 0.6627787627134459, + "grad_norm": 0.6647971057089643, + "learning_rate": 0.00029506132444325124, + "loss": 6.9286, + "step": 7103 + }, + { + "epoch": 0.6628720724083232, + "grad_norm": 0.9346166578744342, + "learning_rate": 0.0002950593993539364, + "loss": 7.1455, + "step": 7104 + }, + { + "epoch": 0.6629653821032006, + "grad_norm": 1.4951993231284462, + "learning_rate": 0.00029505747389577846, + "loss": 7.024, + "step": 7105 + }, + { + "epoch": 0.6630586917980779, + "grad_norm": 0.5326583076274397, + "learning_rate": 0.0002950555480687823, + "loss": 7.2005, + "step": 7106 + }, + { + "epoch": 0.6631520014929552, + "grad_norm": 7.010985585534067, + "learning_rate": 0.00029505362187295285, + "loss": 7.1687, + "step": 7107 + }, + { + "epoch": 0.6632453111878324, + "grad_norm": 1.4879304808099263, + "learning_rate": 0.00029505169530829505, + "loss": 6.844, + "step": 7108 + }, + { + "epoch": 0.6633386208827097, + "grad_norm": 0.8578297751026648, + "learning_rate": 0.0002950497683748137, + "loss": 7.3619, + "step": 7109 + }, + { + "epoch": 0.663431930577587, + "grad_norm": 0.8096446892517316, + "learning_rate": 0.0002950478410725138, + "loss": 7.1941, + "step": 7110 + }, + { + "epoch": 0.6635252402724643, + "grad_norm": 0.8983049756646491, + "learning_rate": 0.0002950459134014002, + "loss": 7.2018, + "step": 7111 + }, + { + "epoch": 0.6636185499673416, + "grad_norm": 0.6357799181550484, + "learning_rate": 0.00029504398536147784, + "loss": 6.9267, + "step": 7112 + }, + { + "epoch": 0.6637118596622189, + "grad_norm": 0.7397039844376445, + "learning_rate": 0.0002950420569527515, + "loss": 6.9978, + "step": 7113 + }, + { + "epoch": 0.6638051693570962, + "grad_norm": 0.8272800969004994, + "learning_rate": 0.00029504012817522623, + "loss": 7.2904, + "step": 7114 + }, + { + "epoch": 0.6638984790519735, + "grad_norm": 0.7523473411943269, + "learning_rate": 0.00029503819902890687, + "loss": 7.1195, + "step": 7115 + }, + { + "epoch": 0.6639917887468508, + "grad_norm": 1.3673230758688146, + "learning_rate": 0.00029503626951379836, + "loss": 7.3261, + "step": 7116 + }, + { + "epoch": 0.6640850984417281, + "grad_norm": 0.7274313213716566, + "learning_rate": 0.00029503433962990553, + "loss": 7.0902, + "step": 7117 + }, + { + "epoch": 0.6641784081366054, + "grad_norm": 4.297414348234019, + "learning_rate": 0.00029503240937723337, + "loss": 7.1729, + "step": 7118 + }, + { + "epoch": 0.6642717178314826, + "grad_norm": 4.8872478169583236, + "learning_rate": 0.00029503047875578675, + "loss": 7.3708, + "step": 7119 + }, + { + "epoch": 0.66436502752636, + "grad_norm": 0.4840066510186353, + "learning_rate": 0.00029502854776557055, + "loss": 6.8584, + "step": 7120 + }, + { + "epoch": 0.6644583372212373, + "grad_norm": 0.9638992867099958, + "learning_rate": 0.0002950266164065898, + "loss": 6.8828, + "step": 7121 + }, + { + "epoch": 0.6645516469161146, + "grad_norm": 0.8712429689139022, + "learning_rate": 0.0002950246846788492, + "loss": 6.7312, + "step": 7122 + }, + { + "epoch": 0.6646449566109919, + "grad_norm": 0.8040103025583011, + "learning_rate": 0.0002950227525823539, + "loss": 7.0275, + "step": 7123 + }, + { + "epoch": 0.6647382663058692, + "grad_norm": 0.6750248685515994, + "learning_rate": 0.00029502082011710867, + "loss": 7.2931, + "step": 7124 + }, + { + "epoch": 0.6648315760007465, + "grad_norm": 2.418373608071755, + "learning_rate": 0.0002950188872831184, + "loss": 6.7021, + "step": 7125 + }, + { + "epoch": 0.6649248856956238, + "grad_norm": 3.0367146045605704, + "learning_rate": 0.00029501695408038815, + "loss": 7.0238, + "step": 7126 + }, + { + "epoch": 0.6650181953905011, + "grad_norm": 24.826169014004034, + "learning_rate": 0.00029501502050892267, + "loss": 6.8453, + "step": 7127 + }, + { + "epoch": 0.6651115050853784, + "grad_norm": 1193.2517560022916, + "learning_rate": 0.0002950130865687269, + "loss": 6.7996, + "step": 7128 + }, + { + "epoch": 0.6652048147802556, + "grad_norm": 11.809258881153774, + "learning_rate": 0.0002950111522598059, + "loss": 7.1181, + "step": 7129 + }, + { + "epoch": 0.6652981244751329, + "grad_norm": 33227.43190491291, + "learning_rate": 0.0002950092175821645, + "loss": 6.7849, + "step": 7130 + }, + { + "epoch": 0.6653914341700102, + "grad_norm": 3.5257105724840225, + "learning_rate": 0.0002950072825358076, + "loss": 6.8069, + "step": 7131 + }, + { + "epoch": 0.6654847438648875, + "grad_norm": 4.479884527229305, + "learning_rate": 0.0002950053471207401, + "loss": 7.3422, + "step": 7132 + }, + { + "epoch": 0.6655780535597648, + "grad_norm": 19338.74210732594, + "learning_rate": 0.000295003411336967, + "loss": 7.2542, + "step": 7133 + }, + { + "epoch": 0.6656713632546422, + "grad_norm": 2.4964365846324275, + "learning_rate": 0.00029500147518449313, + "loss": 7.1183, + "step": 7134 + }, + { + "epoch": 0.6657646729495195, + "grad_norm": 38.18388815000175, + "learning_rate": 0.00029499953866332347, + "loss": 6.9175, + "step": 7135 + }, + { + "epoch": 0.6658579826443968, + "grad_norm": 2.687285445932068, + "learning_rate": 0.00029499760177346295, + "loss": 7.3735, + "step": 7136 + }, + { + "epoch": 0.6659512923392741, + "grad_norm": 1.420347866590833, + "learning_rate": 0.0002949956645149165, + "loss": 7.1693, + "step": 7137 + }, + { + "epoch": 0.6660446020341514, + "grad_norm": 422098287.4688012, + "learning_rate": 0.000294993726887689, + "loss": 7.3741, + "step": 7138 + }, + { + "epoch": 0.6661379117290287, + "grad_norm": 15.960345364581851, + "learning_rate": 0.0002949917888917854, + "loss": 7.5723, + "step": 7139 + }, + { + "epoch": 0.6662312214239059, + "grad_norm": 171052332.6559933, + "learning_rate": 0.0002949898505272106, + "loss": 7.0685, + "step": 7140 + }, + { + "epoch": 0.6663245311187832, + "grad_norm": 2.080261160611902, + "learning_rate": 0.0002949879117939696, + "loss": 7.8389, + "step": 7141 + }, + { + "epoch": 0.6664178408136605, + "grad_norm": 35.14831707529015, + "learning_rate": 0.0002949859726920673, + "loss": 7.2131, + "step": 7142 + }, + { + "epoch": 0.6665111505085378, + "grad_norm": 3.1849432348806643, + "learning_rate": 0.00029498403322150864, + "loss": 7.7639, + "step": 7143 + }, + { + "epoch": 0.6666044602034151, + "grad_norm": 2.5829963896694816, + "learning_rate": 0.0002949820933822985, + "loss": 7.841, + "step": 7144 + }, + { + "epoch": 0.6666977698982924, + "grad_norm": 2.2213137361151025, + "learning_rate": 0.00029498015317444185, + "loss": 7.8943, + "step": 7145 + }, + { + "epoch": 0.6667910795931697, + "grad_norm": 6.489757403414579, + "learning_rate": 0.0002949782125979436, + "loss": 7.5567, + "step": 7146 + }, + { + "epoch": 0.666884389288047, + "grad_norm": 400245908.2894883, + "learning_rate": 0.00029497627165280866, + "loss": 7.6287, + "step": 7147 + }, + { + "epoch": 0.6669776989829244, + "grad_norm": 5.0885244248462485, + "learning_rate": 0.0002949743303390421, + "loss": 7.676, + "step": 7148 + }, + { + "epoch": 0.6670710086778017, + "grad_norm": 5.115120876905319, + "learning_rate": 0.0002949723886566487, + "loss": 7.6369, + "step": 7149 + }, + { + "epoch": 0.6671643183726789, + "grad_norm": 11.66376252274414, + "learning_rate": 0.0002949704466056335, + "loss": 7.5825, + "step": 7150 + }, + { + "epoch": 0.6672576280675562, + "grad_norm": 2.407336768736242, + "learning_rate": 0.0002949685041860014, + "loss": 7.2823, + "step": 7151 + }, + { + "epoch": 0.6673509377624335, + "grad_norm": 55.88070584537889, + "learning_rate": 0.00029496656139775724, + "loss": 7.5333, + "step": 7152 + }, + { + "epoch": 0.6674442474573108, + "grad_norm": 1.6228171678606618, + "learning_rate": 0.00029496461824090616, + "loss": 7.6347, + "step": 7153 + }, + { + "epoch": 0.6675375571521881, + "grad_norm": 12108122188052.178, + "learning_rate": 0.000294962674715453, + "loss": 7.7403, + "step": 7154 + }, + { + "epoch": 0.6676308668470654, + "grad_norm": 34628907077.15343, + "learning_rate": 0.0002949607308214026, + "loss": 7.3314, + "step": 7155 + }, + { + "epoch": 0.6677241765419427, + "grad_norm": 30.7639444704913, + "learning_rate": 0.00029495878655876013, + "loss": 7.4216, + "step": 7156 + }, + { + "epoch": 0.66781748623682, + "grad_norm": 9.180426331066805, + "learning_rate": 0.0002949568419275303, + "loss": 7.317, + "step": 7157 + }, + { + "epoch": 0.6679107959316973, + "grad_norm": 4.100496405242715, + "learning_rate": 0.0002949548969277182, + "loss": 7.3692, + "step": 7158 + }, + { + "epoch": 0.6680041056265746, + "grad_norm": 25.846539385149516, + "learning_rate": 0.00029495295155932875, + "loss": 7.7195, + "step": 7159 + }, + { + "epoch": 0.668097415321452, + "grad_norm": 1078858476.1947775, + "learning_rate": 0.00029495100582236684, + "loss": 7.5948, + "step": 7160 + }, + { + "epoch": 0.6681907250163291, + "grad_norm": 4.737066584751702, + "learning_rate": 0.0002949490597168375, + "loss": 7.4572, + "step": 7161 + }, + { + "epoch": 0.6682840347112065, + "grad_norm": 15.468165278850911, + "learning_rate": 0.00029494711324274563, + "loss": 7.7183, + "step": 7162 + }, + { + "epoch": 0.6683773444060838, + "grad_norm": 12.552461491791936, + "learning_rate": 0.0002949451664000962, + "loss": 7.318, + "step": 7163 + }, + { + "epoch": 0.6684706541009611, + "grad_norm": 3.98853770481569, + "learning_rate": 0.0002949432191888941, + "loss": 7.6088, + "step": 7164 + }, + { + "epoch": 0.6685639637958384, + "grad_norm": 27.833503470827804, + "learning_rate": 0.0002949412716091444, + "loss": 7.3479, + "step": 7165 + }, + { + "epoch": 0.6686572734907157, + "grad_norm": 2.847307370514328, + "learning_rate": 0.00029493932366085195, + "loss": 7.4458, + "step": 7166 + }, + { + "epoch": 0.668750583185593, + "grad_norm": 2.386325886184096, + "learning_rate": 0.0002949373753440217, + "loss": 7.6806, + "step": 7167 + }, + { + "epoch": 0.6688438928804703, + "grad_norm": 2.6064188433111495, + "learning_rate": 0.00029493542665865873, + "loss": 7.5526, + "step": 7168 + }, + { + "epoch": 0.6689372025753476, + "grad_norm": 5.797150024445457, + "learning_rate": 0.00029493347760476784, + "loss": 7.8695, + "step": 7169 + }, + { + "epoch": 0.6690305122702249, + "grad_norm": 13105540.07379685, + "learning_rate": 0.000294931528182354, + "loss": 7.4467, + "step": 7170 + }, + { + "epoch": 0.6691238219651022, + "grad_norm": 1.9020156612830725, + "learning_rate": 0.0002949295783914223, + "loss": 7.8319, + "step": 7171 + }, + { + "epoch": 0.6692171316599794, + "grad_norm": 2.006485595557474, + "learning_rate": 0.00029492762823197757, + "loss": 7.4746, + "step": 7172 + }, + { + "epoch": 0.6693104413548567, + "grad_norm": 3.6876500610156087, + "learning_rate": 0.0002949256777040248, + "loss": 7.2889, + "step": 7173 + }, + { + "epoch": 0.669403751049734, + "grad_norm": 22926119.11157739, + "learning_rate": 0.00029492372680756904, + "loss": 7.1058, + "step": 7174 + }, + { + "epoch": 0.6694970607446113, + "grad_norm": 9.012084955842814, + "learning_rate": 0.00029492177554261506, + "loss": 7.4447, + "step": 7175 + }, + { + "epoch": 0.6695903704394887, + "grad_norm": 8.2508550252232, + "learning_rate": 0.00029491982390916804, + "loss": 7.4663, + "step": 7176 + }, + { + "epoch": 0.669683680134366, + "grad_norm": 1.908216986716822, + "learning_rate": 0.00029491787190723273, + "loss": 7.3328, + "step": 7177 + }, + { + "epoch": 0.6697769898292433, + "grad_norm": 6.4902822999376415, + "learning_rate": 0.00029491591953681424, + "loss": 7.355, + "step": 7178 + }, + { + "epoch": 0.6698702995241206, + "grad_norm": 4.616824305588737, + "learning_rate": 0.00029491396679791755, + "loss": 7.4852, + "step": 7179 + }, + { + "epoch": 0.6699636092189979, + "grad_norm": 14.84226991469574, + "learning_rate": 0.0002949120136905475, + "loss": 7.5024, + "step": 7180 + }, + { + "epoch": 0.6700569189138752, + "grad_norm": 2.8837702578887785, + "learning_rate": 0.00029491006021470915, + "loss": 7.4745, + "step": 7181 + }, + { + "epoch": 0.6701502286087524, + "grad_norm": 2.1246917786751065, + "learning_rate": 0.00029490810637040747, + "loss": 7.6441, + "step": 7182 + }, + { + "epoch": 0.6702435383036297, + "grad_norm": 11.436566705964552, + "learning_rate": 0.0002949061521576474, + "loss": 7.4338, + "step": 7183 + }, + { + "epoch": 0.670336847998507, + "grad_norm": 318.51765323977077, + "learning_rate": 0.00029490419757643386, + "loss": 7.2915, + "step": 7184 + }, + { + "epoch": 0.6704301576933843, + "grad_norm": 59250020.78152674, + "learning_rate": 0.0002949022426267719, + "loss": 7.2352, + "step": 7185 + }, + { + "epoch": 0.6705234673882616, + "grad_norm": 262026345.68932614, + "learning_rate": 0.00029490028730866645, + "loss": 7.4574, + "step": 7186 + }, + { + "epoch": 0.6706167770831389, + "grad_norm": 2.9425095454526047, + "learning_rate": 0.0002948983316221225, + "loss": 7.7505, + "step": 7187 + }, + { + "epoch": 0.6707100867780162, + "grad_norm": 8.6037806986514, + "learning_rate": 0.000294896375567145, + "loss": 7.4548, + "step": 7188 + }, + { + "epoch": 0.6708033964728936, + "grad_norm": 5.739265243567782, + "learning_rate": 0.00029489441914373895, + "loss": 7.3556, + "step": 7189 + }, + { + "epoch": 0.6708967061677709, + "grad_norm": 2.1664316543020745, + "learning_rate": 0.0002948924623519093, + "loss": 7.2346, + "step": 7190 + }, + { + "epoch": 0.6709900158626482, + "grad_norm": 2.139950854816778, + "learning_rate": 0.00029489050519166104, + "loss": 7.4065, + "step": 7191 + }, + { + "epoch": 0.6710833255575255, + "grad_norm": 162994379.60690644, + "learning_rate": 0.0002948885476629992, + "loss": 7.4225, + "step": 7192 + }, + { + "epoch": 0.6711766352524027, + "grad_norm": 1.7412307780424143, + "learning_rate": 0.0002948865897659286, + "loss": 7.6893, + "step": 7193 + }, + { + "epoch": 0.67126994494728, + "grad_norm": 22.33237495014229, + "learning_rate": 0.00029488463150045445, + "loss": 7.4832, + "step": 7194 + }, + { + "epoch": 0.6713632546421573, + "grad_norm": 2.222296142975704, + "learning_rate": 0.00029488267286658146, + "loss": 7.5139, + "step": 7195 + }, + { + "epoch": 0.6714565643370346, + "grad_norm": 2.161738755714835, + "learning_rate": 0.00029488071386431485, + "loss": 7.2751, + "step": 7196 + }, + { + "epoch": 0.6715498740319119, + "grad_norm": 5.571825819813341, + "learning_rate": 0.0002948787544936595, + "loss": 7.4274, + "step": 7197 + }, + { + "epoch": 0.6716431837267892, + "grad_norm": 1.904520652158966, + "learning_rate": 0.0002948767947546203, + "loss": 7.4167, + "step": 7198 + }, + { + "epoch": 0.6717364934216665, + "grad_norm": 1.987390547808389, + "learning_rate": 0.0002948748346472024, + "loss": 7.5486, + "step": 7199 + }, + { + "epoch": 0.6718298031165438, + "grad_norm": 11.46647443355504, + "learning_rate": 0.0002948728741714107, + "loss": 7.2439, + "step": 7200 + }, + { + "epoch": 0.6719231128114211, + "grad_norm": 2.9506925977168907, + "learning_rate": 0.00029487091332725023, + "loss": 7.3125, + "step": 7201 + }, + { + "epoch": 0.6720164225062984, + "grad_norm": 4.647046130496012, + "learning_rate": 0.0002948689521147259, + "loss": 7.5673, + "step": 7202 + }, + { + "epoch": 0.6721097322011756, + "grad_norm": 3140954590.22502, + "learning_rate": 0.00029486699053384274, + "loss": 7.668, + "step": 7203 + }, + { + "epoch": 0.672203041896053, + "grad_norm": 460067586.2555457, + "learning_rate": 0.00029486502858460577, + "loss": 7.2887, + "step": 7204 + }, + { + "epoch": 0.6722963515909303, + "grad_norm": 6.008647499978461, + "learning_rate": 0.0002948630662670199, + "loss": 7.458, + "step": 7205 + }, + { + "epoch": 0.6723896612858076, + "grad_norm": 1.8458654641510903, + "learning_rate": 0.0002948611035810902, + "loss": 7.7144, + "step": 7206 + }, + { + "epoch": 0.6724829709806849, + "grad_norm": 9.180238287458037, + "learning_rate": 0.0002948591405268216, + "loss": 7.4947, + "step": 7207 + }, + { + "epoch": 0.6725762806755622, + "grad_norm": 6.981191121273984, + "learning_rate": 0.00029485717710421913, + "loss": 7.2277, + "step": 7208 + }, + { + "epoch": 0.6726695903704395, + "grad_norm": 5.192229780330686, + "learning_rate": 0.0002948552133132877, + "loss": 7.2697, + "step": 7209 + }, + { + "epoch": 0.6727629000653168, + "grad_norm": 1.4816227270689624, + "learning_rate": 0.00029485324915403245, + "loss": 7.7268, + "step": 7210 + }, + { + "epoch": 0.6728562097601941, + "grad_norm": 135790925380.33072, + "learning_rate": 0.0002948512846264583, + "loss": 7.4918, + "step": 7211 + }, + { + "epoch": 0.6729495194550714, + "grad_norm": 14.510075813893414, + "learning_rate": 0.0002948493197305702, + "loss": 7.5156, + "step": 7212 + }, + { + "epoch": 0.6730428291499487, + "grad_norm": 3.295374602326323, + "learning_rate": 0.00029484735446637324, + "loss": 7.3844, + "step": 7213 + }, + { + "epoch": 0.6731361388448259, + "grad_norm": 3.7220332568905157, + "learning_rate": 0.0002948453888338723, + "loss": 7.4029, + "step": 7214 + }, + { + "epoch": 0.6732294485397032, + "grad_norm": 13.737092610489611, + "learning_rate": 0.00029484342283307247, + "loss": 7.8614, + "step": 7215 + }, + { + "epoch": 0.6733227582345805, + "grad_norm": 3.0424194660894344, + "learning_rate": 0.00029484145646397876, + "loss": 7.6225, + "step": 7216 + }, + { + "epoch": 0.6734160679294579, + "grad_norm": 3.873340925342319, + "learning_rate": 0.00029483948972659605, + "loss": 8.3945, + "step": 7217 + }, + { + "epoch": 0.6735093776243352, + "grad_norm": 10.941753293666673, + "learning_rate": 0.00029483752262092955, + "loss": 7.9535, + "step": 7218 + }, + { + "epoch": 0.6736026873192125, + "grad_norm": 5545146869.583827, + "learning_rate": 0.000294835555146984, + "loss": 7.3647, + "step": 7219 + }, + { + "epoch": 0.6736959970140898, + "grad_norm": 3.170525158183669, + "learning_rate": 0.0002948335873047646, + "loss": 7.6594, + "step": 7220 + }, + { + "epoch": 0.6737893067089671, + "grad_norm": 651964.5953637107, + "learning_rate": 0.0002948316190942763, + "loss": 7.7738, + "step": 7221 + }, + { + "epoch": 0.6738826164038444, + "grad_norm": 4.58874103150802, + "learning_rate": 0.0002948296505155241, + "loss": 8.2139, + "step": 7222 + }, + { + "epoch": 0.6739759260987217, + "grad_norm": 63.17269462003077, + "learning_rate": 0.0002948276815685129, + "loss": 7.9481, + "step": 7223 + }, + { + "epoch": 0.674069235793599, + "grad_norm": 556.512931074255, + "learning_rate": 0.0002948257122532479, + "loss": 7.6461, + "step": 7224 + }, + { + "epoch": 0.6741625454884762, + "grad_norm": 10096.499819907396, + "learning_rate": 0.000294823742569734, + "loss": 7.6957, + "step": 7225 + }, + { + "epoch": 0.6742558551833535, + "grad_norm": 46.63366524897697, + "learning_rate": 0.0002948217725179762, + "loss": 7.6716, + "step": 7226 + }, + { + "epoch": 0.6743491648782308, + "grad_norm": 8.54826538430521, + "learning_rate": 0.0002948198020979796, + "loss": 7.4714, + "step": 7227 + }, + { + "epoch": 0.6744424745731081, + "grad_norm": 34.699060884610006, + "learning_rate": 0.00029481783130974907, + "loss": 7.8938, + "step": 7228 + }, + { + "epoch": 0.6745357842679854, + "grad_norm": 133.48373967813117, + "learning_rate": 0.0002948158601532897, + "loss": 7.5701, + "step": 7229 + }, + { + "epoch": 0.6746290939628627, + "grad_norm": 10.254394805777855, + "learning_rate": 0.0002948138886286065, + "loss": 7.3477, + "step": 7230 + }, + { + "epoch": 0.67472240365774, + "grad_norm": 4.152511885416853, + "learning_rate": 0.0002948119167357045, + "loss": 7.9747, + "step": 7231 + }, + { + "epoch": 0.6748157133526174, + "grad_norm": 6.070266736077356, + "learning_rate": 0.0002948099444745887, + "loss": 7.5096, + "step": 7232 + }, + { + "epoch": 0.6749090230474947, + "grad_norm": 3250.740813887817, + "learning_rate": 0.0002948079718452641, + "loss": 7.4785, + "step": 7233 + }, + { + "epoch": 0.675002332742372, + "grad_norm": 325.6252092509243, + "learning_rate": 0.0002948059988477357, + "loss": 8.1216, + "step": 7234 + }, + { + "epoch": 0.6750956424372492, + "grad_norm": 93.35375418665451, + "learning_rate": 0.00029480402548200856, + "loss": 7.5118, + "step": 7235 + }, + { + "epoch": 0.6751889521321265, + "grad_norm": 23.12343337032824, + "learning_rate": 0.00029480205174808763, + "loss": 7.7162, + "step": 7236 + }, + { + "epoch": 0.6752822618270038, + "grad_norm": 17.28793036404099, + "learning_rate": 0.00029480007764597804, + "loss": 7.7206, + "step": 7237 + }, + { + "epoch": 0.6753755715218811, + "grad_norm": 11.755877239590635, + "learning_rate": 0.0002947981031756847, + "loss": 7.7034, + "step": 7238 + }, + { + "epoch": 0.6754688812167584, + "grad_norm": 2.95384121214288, + "learning_rate": 0.0002947961283372127, + "loss": 7.4656, + "step": 7239 + }, + { + "epoch": 0.6755621909116357, + "grad_norm": 364.22435814141903, + "learning_rate": 0.00029479415313056706, + "loss": 7.4768, + "step": 7240 + }, + { + "epoch": 0.675655500606513, + "grad_norm": 4.981138917517324, + "learning_rate": 0.00029479217755575275, + "loss": 7.6091, + "step": 7241 + }, + { + "epoch": 0.6757488103013903, + "grad_norm": 331.3211059644056, + "learning_rate": 0.00029479020161277483, + "loss": 7.6055, + "step": 7242 + }, + { + "epoch": 0.6758421199962676, + "grad_norm": 157.95310944490717, + "learning_rate": 0.00029478822530163833, + "loss": 7.6018, + "step": 7243 + }, + { + "epoch": 0.675935429691145, + "grad_norm": 2.056183225494845, + "learning_rate": 0.00029478624862234825, + "loss": 7.7689, + "step": 7244 + }, + { + "epoch": 0.6760287393860223, + "grad_norm": 1.4329833361883455, + "learning_rate": 0.0002947842715749096, + "loss": 7.8469, + "step": 7245 + }, + { + "epoch": 0.6761220490808995, + "grad_norm": 2.09143375595681, + "learning_rate": 0.0002947822941593275, + "loss": 7.3842, + "step": 7246 + }, + { + "epoch": 0.6762153587757768, + "grad_norm": 7.446792352141467, + "learning_rate": 0.0002947803163756069, + "loss": 7.6453, + "step": 7247 + }, + { + "epoch": 0.6763086684706541, + "grad_norm": 20.00404833490849, + "learning_rate": 0.00029477833822375276, + "loss": 7.6548, + "step": 7248 + }, + { + "epoch": 0.6764019781655314, + "grad_norm": 45.32805202315458, + "learning_rate": 0.0002947763597037702, + "loss": 7.6159, + "step": 7249 + }, + { + "epoch": 0.6764952878604087, + "grad_norm": 599.3574613217364, + "learning_rate": 0.00029477438081566436, + "loss": 7.478, + "step": 7250 + }, + { + "epoch": 0.676588597555286, + "grad_norm": 14.481221465528135, + "learning_rate": 0.00029477240155944005, + "loss": 7.412, + "step": 7251 + }, + { + "epoch": 0.6766819072501633, + "grad_norm": 21.129726354012096, + "learning_rate": 0.0002947704219351025, + "loss": 7.2761, + "step": 7252 + }, + { + "epoch": 0.6767752169450406, + "grad_norm": 6.454778580514693, + "learning_rate": 0.00029476844194265656, + "loss": 7.3772, + "step": 7253 + }, + { + "epoch": 0.6768685266399179, + "grad_norm": 9.034724805211754, + "learning_rate": 0.0002947664615821074, + "loss": 7.8671, + "step": 7254 + }, + { + "epoch": 0.6769618363347952, + "grad_norm": 3.882889631013997, + "learning_rate": 0.00029476448085346, + "loss": 7.412, + "step": 7255 + }, + { + "epoch": 0.6770551460296724, + "grad_norm": 5.235402131637755, + "learning_rate": 0.0002947624997567194, + "loss": 7.4799, + "step": 7256 + }, + { + "epoch": 0.6771484557245497, + "grad_norm": 1.571821969222271, + "learning_rate": 0.00029476051829189066, + "loss": 7.4667, + "step": 7257 + }, + { + "epoch": 0.677241765419427, + "grad_norm": 1841.567221249083, + "learning_rate": 0.0002947585364589788, + "loss": 7.8813, + "step": 7258 + }, + { + "epoch": 0.6773350751143044, + "grad_norm": 1829.0077903447598, + "learning_rate": 0.00029475655425798887, + "loss": 7.333, + "step": 7259 + }, + { + "epoch": 0.6774283848091817, + "grad_norm": 2.121751029212473, + "learning_rate": 0.0002947545716889259, + "loss": 7.5751, + "step": 7260 + }, + { + "epoch": 0.677521694504059, + "grad_norm": 34.45119051585541, + "learning_rate": 0.00029475258875179494, + "loss": 7.4696, + "step": 7261 + }, + { + "epoch": 0.6776150041989363, + "grad_norm": 3.2221578007190104, + "learning_rate": 0.000294750605446601, + "loss": 7.633, + "step": 7262 + }, + { + "epoch": 0.6777083138938136, + "grad_norm": 10.759147284341722, + "learning_rate": 0.0002947486217733491, + "loss": 7.6441, + "step": 7263 + }, + { + "epoch": 0.6778016235886909, + "grad_norm": 15.093331456036958, + "learning_rate": 0.00029474663773204445, + "loss": 7.8672, + "step": 7264 + }, + { + "epoch": 0.6778949332835682, + "grad_norm": 243.6101338034143, + "learning_rate": 0.00029474465332269195, + "loss": 7.0628, + "step": 7265 + }, + { + "epoch": 0.6779882429784455, + "grad_norm": 665.9035033522151, + "learning_rate": 0.0002947426685452966, + "loss": 7.403, + "step": 7266 + }, + { + "epoch": 0.6780815526733227, + "grad_norm": 0.9109713331710019, + "learning_rate": 0.0002947406833998636, + "loss": 7.1473, + "step": 7267 + }, + { + "epoch": 0.6781748623682, + "grad_norm": 14.189399788057093, + "learning_rate": 0.0002947386978863979, + "loss": 7.3612, + "step": 7268 + }, + { + "epoch": 0.6782681720630773, + "grad_norm": 2.5841293882124994, + "learning_rate": 0.0002947367120049045, + "loss": 7.6491, + "step": 7269 + }, + { + "epoch": 0.6783614817579546, + "grad_norm": 2.0298450706479434, + "learning_rate": 0.00029473472575538857, + "loss": 7.3531, + "step": 7270 + }, + { + "epoch": 0.6784547914528319, + "grad_norm": 3.3761118388547553, + "learning_rate": 0.0002947327391378551, + "loss": 7.6141, + "step": 7271 + }, + { + "epoch": 0.6785481011477092, + "grad_norm": 32.10032819291748, + "learning_rate": 0.0002947307521523092, + "loss": 7.5323, + "step": 7272 + }, + { + "epoch": 0.6786414108425866, + "grad_norm": 1.6806057426640428, + "learning_rate": 0.00029472876479875575, + "loss": 7.2366, + "step": 7273 + }, + { + "epoch": 0.6787347205374639, + "grad_norm": 740.1682932545973, + "learning_rate": 0.00029472677707720004, + "loss": 7.6658, + "step": 7274 + }, + { + "epoch": 0.6788280302323412, + "grad_norm": 15.91866148122539, + "learning_rate": 0.0002947247889876469, + "loss": 7.6301, + "step": 7275 + }, + { + "epoch": 0.6789213399272185, + "grad_norm": 8.622953965682466, + "learning_rate": 0.0002947228005301016, + "loss": 7.8615, + "step": 7276 + }, + { + "epoch": 0.6790146496220958, + "grad_norm": 3.686829637636916, + "learning_rate": 0.000294720811704569, + "loss": 7.5084, + "step": 7277 + }, + { + "epoch": 0.679107959316973, + "grad_norm": 5.5303829552833665, + "learning_rate": 0.0002947188225110543, + "loss": 7.652, + "step": 7278 + }, + { + "epoch": 0.6792012690118503, + "grad_norm": 2171.339841444829, + "learning_rate": 0.0002947168329495625, + "loss": 7.4633, + "step": 7279 + }, + { + "epoch": 0.6792945787067276, + "grad_norm": 261.987628649878, + "learning_rate": 0.00029471484302009867, + "loss": 7.5999, + "step": 7280 + }, + { + "epoch": 0.6793878884016049, + "grad_norm": 384.3014332564203, + "learning_rate": 0.0002947128527226678, + "loss": 7.4403, + "step": 7281 + }, + { + "epoch": 0.6794811980964822, + "grad_norm": 4.045515713795966, + "learning_rate": 0.00029471086205727507, + "loss": 7.3553, + "step": 7282 + }, + { + "epoch": 0.6795745077913595, + "grad_norm": 704.343334810001, + "learning_rate": 0.0002947088710239255, + "loss": 7.3905, + "step": 7283 + }, + { + "epoch": 0.6796678174862368, + "grad_norm": 11.88492023404307, + "learning_rate": 0.0002947068796226241, + "loss": 7.5744, + "step": 7284 + }, + { + "epoch": 0.6797611271811141, + "grad_norm": 22.049531096796134, + "learning_rate": 0.00029470488785337596, + "loss": 7.122, + "step": 7285 + }, + { + "epoch": 0.6798544368759915, + "grad_norm": 1.3388608740239016, + "learning_rate": 0.0002947028957161861, + "loss": 7.2182, + "step": 7286 + }, + { + "epoch": 0.6799477465708688, + "grad_norm": 86.26486946641867, + "learning_rate": 0.00029470090321105977, + "loss": 7.6968, + "step": 7287 + }, + { + "epoch": 0.680041056265746, + "grad_norm": 106.40404826206935, + "learning_rate": 0.00029469891033800184, + "loss": 7.2474, + "step": 7288 + }, + { + "epoch": 0.6801343659606233, + "grad_norm": 1.0864253155229817, + "learning_rate": 0.00029469691709701747, + "loss": 7.1973, + "step": 7289 + }, + { + "epoch": 0.6802276756555006, + "grad_norm": 1.1344203628476466, + "learning_rate": 0.00029469492348811163, + "loss": 7.5892, + "step": 7290 + }, + { + "epoch": 0.6803209853503779, + "grad_norm": 1.0360715264220963, + "learning_rate": 0.00029469292951128954, + "loss": 7.5205, + "step": 7291 + }, + { + "epoch": 0.6804142950452552, + "grad_norm": 31.155970708515607, + "learning_rate": 0.00029469093516655613, + "loss": 7.2019, + "step": 7292 + }, + { + "epoch": 0.6805076047401325, + "grad_norm": 1.0306580832236585, + "learning_rate": 0.0002946889404539166, + "loss": 7.528, + "step": 7293 + }, + { + "epoch": 0.6806009144350098, + "grad_norm": 43.84837338252151, + "learning_rate": 0.00029468694537337594, + "loss": 7.2486, + "step": 7294 + }, + { + "epoch": 0.6806942241298871, + "grad_norm": 481.46708357939906, + "learning_rate": 0.00029468494992493925, + "loss": 7.2845, + "step": 7295 + }, + { + "epoch": 0.6807875338247644, + "grad_norm": 103.25726953228974, + "learning_rate": 0.0002946829541086115, + "loss": 7.4595, + "step": 7296 + }, + { + "epoch": 0.6808808435196417, + "grad_norm": 1.8649439994796118, + "learning_rate": 0.0002946809579243979, + "loss": 7.3485, + "step": 7297 + }, + { + "epoch": 0.680974153214519, + "grad_norm": 3.915138784383694, + "learning_rate": 0.00029467896137230353, + "loss": 7.1789, + "step": 7298 + }, + { + "epoch": 0.6810674629093962, + "grad_norm": 3323.972068099968, + "learning_rate": 0.0002946769644523334, + "loss": 7.6357, + "step": 7299 + }, + { + "epoch": 0.6811607726042735, + "grad_norm": 2.616777457707158, + "learning_rate": 0.00029467496716449257, + "loss": 7.1125, + "step": 7300 + }, + { + "epoch": 0.6812540822991509, + "grad_norm": 1.2169114577503748, + "learning_rate": 0.0002946729695087862, + "loss": 7.4395, + "step": 7301 + }, + { + "epoch": 0.6813473919940282, + "grad_norm": 43.28450343153227, + "learning_rate": 0.0002946709714852193, + "loss": 7.4111, + "step": 7302 + }, + { + "epoch": 0.6814407016889055, + "grad_norm": 0.8787433119096433, + "learning_rate": 0.00029466897309379694, + "loss": 7.4004, + "step": 7303 + }, + { + "epoch": 0.6815340113837828, + "grad_norm": 0.6661130466615385, + "learning_rate": 0.0002946669743345243, + "loss": 7.423, + "step": 7304 + }, + { + "epoch": 0.6816273210786601, + "grad_norm": 12.697432164185521, + "learning_rate": 0.00029466497520740635, + "loss": 7.2033, + "step": 7305 + }, + { + "epoch": 0.6817206307735374, + "grad_norm": 1.0394880363912877, + "learning_rate": 0.0002946629757124483, + "loss": 7.6727, + "step": 7306 + }, + { + "epoch": 0.6818139404684147, + "grad_norm": 12.171759473672335, + "learning_rate": 0.00029466097584965507, + "loss": 7.5953, + "step": 7307 + }, + { + "epoch": 0.681907250163292, + "grad_norm": 7.735523368462785, + "learning_rate": 0.00029465897561903185, + "loss": 7.4374, + "step": 7308 + }, + { + "epoch": 0.6820005598581692, + "grad_norm": 589.3310545679079, + "learning_rate": 0.00029465697502058366, + "loss": 7.3491, + "step": 7309 + }, + { + "epoch": 0.6820938695530465, + "grad_norm": 1.1966028236341786, + "learning_rate": 0.0002946549740543157, + "loss": 7.3696, + "step": 7310 + }, + { + "epoch": 0.6821871792479238, + "grad_norm": 0.7122433089559952, + "learning_rate": 0.000294652972720233, + "loss": 7.4515, + "step": 7311 + }, + { + "epoch": 0.6822804889428011, + "grad_norm": 1.0671087689672205, + "learning_rate": 0.00029465097101834064, + "loss": 7.1112, + "step": 7312 + }, + { + "epoch": 0.6823737986376784, + "grad_norm": 267.8013861469452, + "learning_rate": 0.00029464896894864365, + "loss": 7.3534, + "step": 7313 + }, + { + "epoch": 0.6824671083325557, + "grad_norm": 1.705347453514782, + "learning_rate": 0.00029464696651114724, + "loss": 7.2629, + "step": 7314 + }, + { + "epoch": 0.6825604180274331, + "grad_norm": 5.02145694406877, + "learning_rate": 0.0002946449637058564, + "loss": 7.5093, + "step": 7315 + }, + { + "epoch": 0.6826537277223104, + "grad_norm": 61.80837661378976, + "learning_rate": 0.00029464296053277627, + "loss": 7.4783, + "step": 7316 + }, + { + "epoch": 0.6827470374171877, + "grad_norm": 1.1662199237064075, + "learning_rate": 0.00029464095699191197, + "loss": 7.6081, + "step": 7317 + }, + { + "epoch": 0.682840347112065, + "grad_norm": 18.1750140665411, + "learning_rate": 0.0002946389530832685, + "loss": 7.3823, + "step": 7318 + }, + { + "epoch": 0.6829336568069423, + "grad_norm": 3.7896105923364853, + "learning_rate": 0.0002946369488068511, + "loss": 7.5493, + "step": 7319 + }, + { + "epoch": 0.6830269665018195, + "grad_norm": 6.334072844449593, + "learning_rate": 0.0002946349441626647, + "loss": 7.4497, + "step": 7320 + }, + { + "epoch": 0.6831202761966968, + "grad_norm": 4.436666542658925, + "learning_rate": 0.0002946329391507145, + "loss": 7.224, + "step": 7321 + }, + { + "epoch": 0.6832135858915741, + "grad_norm": 7.8008529456599325, + "learning_rate": 0.00029463093377100566, + "loss": 7.4935, + "step": 7322 + }, + { + "epoch": 0.6833068955864514, + "grad_norm": 1.709921453864366, + "learning_rate": 0.0002946289280235431, + "loss": 7.3837, + "step": 7323 + }, + { + "epoch": 0.6834002052813287, + "grad_norm": 6390.101647446181, + "learning_rate": 0.0002946269219083321, + "loss": 7.5136, + "step": 7324 + }, + { + "epoch": 0.683493514976206, + "grad_norm": 3.0471694230200708, + "learning_rate": 0.00029462491542537765, + "loss": 7.8548, + "step": 7325 + }, + { + "epoch": 0.6835868246710833, + "grad_norm": 2.0660365606535405, + "learning_rate": 0.00029462290857468486, + "loss": 7.0822, + "step": 7326 + }, + { + "epoch": 0.6836801343659606, + "grad_norm": 1.4479904373765111, + "learning_rate": 0.0002946209013562589, + "loss": 7.1742, + "step": 7327 + }, + { + "epoch": 0.683773444060838, + "grad_norm": 4.100554922076198, + "learning_rate": 0.00029461889377010476, + "loss": 7.274, + "step": 7328 + }, + { + "epoch": 0.6838667537557153, + "grad_norm": 6.019201965850126, + "learning_rate": 0.00029461688581622765, + "loss": 7.6423, + "step": 7329 + }, + { + "epoch": 0.6839600634505926, + "grad_norm": 51.2384899410029, + "learning_rate": 0.00029461487749463263, + "loss": 7.8876, + "step": 7330 + }, + { + "epoch": 0.6840533731454698, + "grad_norm": 4.995639679211572, + "learning_rate": 0.0002946128688053248, + "loss": 7.2416, + "step": 7331 + }, + { + "epoch": 0.6841466828403471, + "grad_norm": 3.3543458876688645, + "learning_rate": 0.00029461085974830933, + "loss": 7.7007, + "step": 7332 + }, + { + "epoch": 0.6842399925352244, + "grad_norm": 22.186804250454617, + "learning_rate": 0.00029460885032359124, + "loss": 7.5364, + "step": 7333 + }, + { + "epoch": 0.6843333022301017, + "grad_norm": 7.8966235465087635, + "learning_rate": 0.0002946068405311757, + "loss": 7.2001, + "step": 7334 + }, + { + "epoch": 0.684426611924979, + "grad_norm": 3168.460895303382, + "learning_rate": 0.00029460483037106776, + "loss": 7.1682, + "step": 7335 + }, + { + "epoch": 0.6845199216198563, + "grad_norm": 60.13756361609806, + "learning_rate": 0.00029460281984327257, + "loss": 7.149, + "step": 7336 + }, + { + "epoch": 0.6846132313147336, + "grad_norm": 0.5878404939191868, + "learning_rate": 0.0002946008089477953, + "loss": 7.3054, + "step": 7337 + }, + { + "epoch": 0.6847065410096109, + "grad_norm": 15.568860543966487, + "learning_rate": 0.00029459879768464097, + "loss": 7.1868, + "step": 7338 + }, + { + "epoch": 0.6847998507044882, + "grad_norm": 26.88976670072345, + "learning_rate": 0.00029459678605381475, + "loss": 7.3648, + "step": 7339 + }, + { + "epoch": 0.6848931603993655, + "grad_norm": 2.1392334577592593, + "learning_rate": 0.0002945947740553217, + "loss": 7.6055, + "step": 7340 + }, + { + "epoch": 0.6849864700942427, + "grad_norm": 0.5405553707557185, + "learning_rate": 0.000294592761689167, + "loss": 7.4066, + "step": 7341 + }, + { + "epoch": 0.68507977978912, + "grad_norm": 1.0237972057444762, + "learning_rate": 0.00029459074895535575, + "loss": 7.0738, + "step": 7342 + }, + { + "epoch": 0.6851730894839974, + "grad_norm": 3.9615595402502133, + "learning_rate": 0.00029458873585389304, + "loss": 7.3895, + "step": 7343 + }, + { + "epoch": 0.6852663991788747, + "grad_norm": 27.515873310058716, + "learning_rate": 0.000294586722384784, + "loss": 6.9509, + "step": 7344 + }, + { + "epoch": 0.685359708873752, + "grad_norm": 2.0658867252310023, + "learning_rate": 0.0002945847085480338, + "loss": 7.3751, + "step": 7345 + }, + { + "epoch": 0.6854530185686293, + "grad_norm": 7.906777151424723, + "learning_rate": 0.00029458269434364746, + "loss": 7.442, + "step": 7346 + }, + { + "epoch": 0.6855463282635066, + "grad_norm": 43.13201688013242, + "learning_rate": 0.00029458067977163017, + "loss": 7.1753, + "step": 7347 + }, + { + "epoch": 0.6856396379583839, + "grad_norm": 1.1856368856272212, + "learning_rate": 0.0002945786648319871, + "loss": 7.1603, + "step": 7348 + }, + { + "epoch": 0.6857329476532612, + "grad_norm": 2554.3697386127014, + "learning_rate": 0.00029457664952472324, + "loss": 7.6284, + "step": 7349 + }, + { + "epoch": 0.6858262573481385, + "grad_norm": 11.57556525477212, + "learning_rate": 0.0002945746338498438, + "loss": 7.448, + "step": 7350 + }, + { + "epoch": 0.6859195670430158, + "grad_norm": 33.368580800240196, + "learning_rate": 0.0002945726178073539, + "loss": 7.3757, + "step": 7351 + }, + { + "epoch": 0.686012876737893, + "grad_norm": 1.0518333233133261, + "learning_rate": 0.0002945706013972586, + "loss": 7.6589, + "step": 7352 + }, + { + "epoch": 0.6861061864327703, + "grad_norm": 14.805554492907428, + "learning_rate": 0.0002945685846195632, + "loss": 7.3121, + "step": 7353 + }, + { + "epoch": 0.6861994961276476, + "grad_norm": 21.78545842301116, + "learning_rate": 0.0002945665674742726, + "loss": 7.5133, + "step": 7354 + }, + { + "epoch": 0.6862928058225249, + "grad_norm": 0.5464754666267209, + "learning_rate": 0.0002945645499613921, + "loss": 7.3026, + "step": 7355 + }, + { + "epoch": 0.6863861155174023, + "grad_norm": 9.619675943383204, + "learning_rate": 0.0002945625320809268, + "loss": 7.7001, + "step": 7356 + }, + { + "epoch": 0.6864794252122796, + "grad_norm": 0.9644216997623156, + "learning_rate": 0.00029456051383288176, + "loss": 7.3162, + "step": 7357 + }, + { + "epoch": 0.6865727349071569, + "grad_norm": 7.567710970378461, + "learning_rate": 0.00029455849521726216, + "loss": 7.4565, + "step": 7358 + }, + { + "epoch": 0.6866660446020342, + "grad_norm": 38.32942978143785, + "learning_rate": 0.00029455647623407314, + "loss": 7.1817, + "step": 7359 + }, + { + "epoch": 0.6867593542969115, + "grad_norm": 21469.471121580515, + "learning_rate": 0.00029455445688331985, + "loss": 7.4303, + "step": 7360 + }, + { + "epoch": 0.6868526639917888, + "grad_norm": 624.1066667742018, + "learning_rate": 0.00029455243716500733, + "loss": 7.2582, + "step": 7361 + }, + { + "epoch": 0.686945973686666, + "grad_norm": 6.462752164049424, + "learning_rate": 0.00029455041707914084, + "loss": 7.1189, + "step": 7362 + }, + { + "epoch": 0.6870392833815433, + "grad_norm": 3235.995308524178, + "learning_rate": 0.0002945483966257254, + "loss": 7.3061, + "step": 7363 + }, + { + "epoch": 0.6871325930764206, + "grad_norm": 3.3177664656456463, + "learning_rate": 0.00029454637580476623, + "loss": 7.5346, + "step": 7364 + }, + { + "epoch": 0.6872259027712979, + "grad_norm": 13.21780577768489, + "learning_rate": 0.00029454435461626847, + "loss": 7.5166, + "step": 7365 + }, + { + "epoch": 0.6873192124661752, + "grad_norm": 33.76739710974822, + "learning_rate": 0.0002945423330602372, + "loss": 7.3266, + "step": 7366 + }, + { + "epoch": 0.6874125221610525, + "grad_norm": 22.988840394764413, + "learning_rate": 0.00029454031113667766, + "loss": 7.3823, + "step": 7367 + }, + { + "epoch": 0.6875058318559298, + "grad_norm": 8408.00519595546, + "learning_rate": 0.00029453828884559485, + "loss": 7.5018, + "step": 7368 + }, + { + "epoch": 0.6875991415508071, + "grad_norm": 11.913152081041034, + "learning_rate": 0.000294536266186994, + "loss": 7.4355, + "step": 7369 + }, + { + "epoch": 0.6876924512456845, + "grad_norm": 4.954430846832594, + "learning_rate": 0.0002945342431608803, + "loss": 7.4622, + "step": 7370 + }, + { + "epoch": 0.6877857609405618, + "grad_norm": 15.637700383550168, + "learning_rate": 0.00029453221976725877, + "loss": 7.4575, + "step": 7371 + }, + { + "epoch": 0.6878790706354391, + "grad_norm": 37881.943294633136, + "learning_rate": 0.00029453019600613465, + "loss": 7.5625, + "step": 7372 + }, + { + "epoch": 0.6879723803303163, + "grad_norm": 66.71838330387824, + "learning_rate": 0.00029452817187751305, + "loss": 7.5027, + "step": 7373 + }, + { + "epoch": 0.6880656900251936, + "grad_norm": 8.107271888751988, + "learning_rate": 0.0002945261473813991, + "loss": 7.5138, + "step": 7374 + }, + { + "epoch": 0.6881589997200709, + "grad_norm": 5524.782629970869, + "learning_rate": 0.000294524122517798, + "loss": 7.3142, + "step": 7375 + }, + { + "epoch": 0.6882523094149482, + "grad_norm": 12.890774626043681, + "learning_rate": 0.0002945220972867148, + "loss": 7.6392, + "step": 7376 + }, + { + "epoch": 0.6883456191098255, + "grad_norm": 85.22060696928875, + "learning_rate": 0.0002945200716881548, + "loss": 7.799, + "step": 7377 + }, + { + "epoch": 0.6884389288047028, + "grad_norm": 1.9720835162448347, + "learning_rate": 0.00029451804572212304, + "loss": 7.4138, + "step": 7378 + }, + { + "epoch": 0.6885322384995801, + "grad_norm": 1.3672722019000425, + "learning_rate": 0.00029451601938862464, + "loss": 7.302, + "step": 7379 + }, + { + "epoch": 0.6886255481944574, + "grad_norm": 4165.902485076085, + "learning_rate": 0.0002945139926876649, + "loss": 7.227, + "step": 7380 + }, + { + "epoch": 0.6887188578893347, + "grad_norm": 5.292812926036697, + "learning_rate": 0.00029451196561924885, + "loss": 7.2249, + "step": 7381 + }, + { + "epoch": 0.688812167584212, + "grad_norm": 8.15969867945261, + "learning_rate": 0.0002945099381833817, + "loss": 7.3021, + "step": 7382 + }, + { + "epoch": 0.6889054772790894, + "grad_norm": 1.3403241997133999, + "learning_rate": 0.00029450791038006854, + "loss": 7.315, + "step": 7383 + }, + { + "epoch": 0.6889987869739665, + "grad_norm": 2296.845712768656, + "learning_rate": 0.00029450588220931463, + "loss": 7.5408, + "step": 7384 + }, + { + "epoch": 0.6890920966688439, + "grad_norm": 8.771144817000403, + "learning_rate": 0.000294503853671125, + "loss": 7.0415, + "step": 7385 + }, + { + "epoch": 0.6891854063637212, + "grad_norm": 197.15076029191098, + "learning_rate": 0.0002945018247655049, + "loss": 7.3534, + "step": 7386 + }, + { + "epoch": 0.6892787160585985, + "grad_norm": 195.54698960086637, + "learning_rate": 0.0002944997954924595, + "loss": 7.3575, + "step": 7387 + }, + { + "epoch": 0.6893720257534758, + "grad_norm": 10445.15945027791, + "learning_rate": 0.00029449776585199387, + "loss": 7.2597, + "step": 7388 + }, + { + "epoch": 0.6894653354483531, + "grad_norm": 1.1694275037034676, + "learning_rate": 0.00029449573584411327, + "loss": 7.3658, + "step": 7389 + }, + { + "epoch": 0.6895586451432304, + "grad_norm": 2.2893540410228432, + "learning_rate": 0.00029449370546882274, + "loss": 7.5636, + "step": 7390 + }, + { + "epoch": 0.6896519548381077, + "grad_norm": 5.139116568456646, + "learning_rate": 0.00029449167472612764, + "loss": 7.5498, + "step": 7391 + }, + { + "epoch": 0.689745264532985, + "grad_norm": 4.482614087503806, + "learning_rate": 0.0002944896436160329, + "loss": 7.6809, + "step": 7392 + }, + { + "epoch": 0.6898385742278623, + "grad_norm": 3.942900764105557, + "learning_rate": 0.00029448761213854386, + "loss": 7.1072, + "step": 7393 + }, + { + "epoch": 0.6899318839227395, + "grad_norm": 65.11229066846893, + "learning_rate": 0.0002944855802936656, + "loss": 7.797, + "step": 7394 + }, + { + "epoch": 0.6900251936176168, + "grad_norm": 1.696917659322383, + "learning_rate": 0.00029448354808140335, + "loss": 7.1064, + "step": 7395 + }, + { + "epoch": 0.6901185033124941, + "grad_norm": 2.0576922739459915, + "learning_rate": 0.0002944815155017622, + "loss": 7.0541, + "step": 7396 + }, + { + "epoch": 0.6902118130073714, + "grad_norm": 1.703402548334158, + "learning_rate": 0.00029447948255474735, + "loss": 7.2856, + "step": 7397 + }, + { + "epoch": 0.6903051227022488, + "grad_norm": 0.8623177683599774, + "learning_rate": 0.00029447744924036395, + "loss": 7.485, + "step": 7398 + }, + { + "epoch": 0.6903984323971261, + "grad_norm": 1200.17937871135, + "learning_rate": 0.00029447541555861724, + "loss": 7.1191, + "step": 7399 + }, + { + "epoch": 0.6904917420920034, + "grad_norm": 1.7495328564420638, + "learning_rate": 0.0002944733815095123, + "loss": 7.1485, + "step": 7400 + }, + { + "epoch": 0.6905850517868807, + "grad_norm": 874.4030677170412, + "learning_rate": 0.0002944713470930544, + "loss": 7.2535, + "step": 7401 + }, + { + "epoch": 0.690678361481758, + "grad_norm": 25.34310187078968, + "learning_rate": 0.0002944693123092486, + "loss": 6.9476, + "step": 7402 + }, + { + "epoch": 0.6907716711766353, + "grad_norm": 1.5208193830382912, + "learning_rate": 0.0002944672771581002, + "loss": 7.2552, + "step": 7403 + }, + { + "epoch": 0.6908649808715126, + "grad_norm": 6.8655017184784155, + "learning_rate": 0.0002944652416396143, + "loss": 7.3684, + "step": 7404 + }, + { + "epoch": 0.6909582905663898, + "grad_norm": 11.389125398620532, + "learning_rate": 0.00029446320575379607, + "loss": 7.647, + "step": 7405 + }, + { + "epoch": 0.6910516002612671, + "grad_norm": 3.889510771870502, + "learning_rate": 0.0002944611695006507, + "loss": 7.4272, + "step": 7406 + }, + { + "epoch": 0.6911449099561444, + "grad_norm": 126.7330918022834, + "learning_rate": 0.0002944591328801833, + "loss": 7.7617, + "step": 7407 + }, + { + "epoch": 0.6912382196510217, + "grad_norm": 2979.6725971573064, + "learning_rate": 0.0002944570958923992, + "loss": 7.3924, + "step": 7408 + }, + { + "epoch": 0.691331529345899, + "grad_norm": 11.274102981359567, + "learning_rate": 0.0002944550585373035, + "loss": 7.8804, + "step": 7409 + }, + { + "epoch": 0.6914248390407763, + "grad_norm": 2200.4734738899547, + "learning_rate": 0.00029445302081490136, + "loss": 7.3961, + "step": 7410 + }, + { + "epoch": 0.6915181487356536, + "grad_norm": 15.733775456663109, + "learning_rate": 0.00029445098272519796, + "loss": 7.6027, + "step": 7411 + }, + { + "epoch": 0.691611458430531, + "grad_norm": 42.966023448415335, + "learning_rate": 0.0002944489442681985, + "loss": 7.1934, + "step": 7412 + }, + { + "epoch": 0.6917047681254083, + "grad_norm": 1.355400195171999, + "learning_rate": 0.0002944469054439082, + "loss": 7.4253, + "step": 7413 + }, + { + "epoch": 0.6917980778202856, + "grad_norm": 16.59207211057284, + "learning_rate": 0.00029444486625233217, + "loss": 7.2247, + "step": 7414 + }, + { + "epoch": 0.6918913875151628, + "grad_norm": 2414.6324730506194, + "learning_rate": 0.0002944428266934756, + "loss": 7.1844, + "step": 7415 + }, + { + "epoch": 0.6919846972100401, + "grad_norm": 213110.14131504684, + "learning_rate": 0.0002944407867673438, + "loss": 7.0623, + "step": 7416 + }, + { + "epoch": 0.6920780069049174, + "grad_norm": 10.110772240460989, + "learning_rate": 0.00029443874647394185, + "loss": 7.667, + "step": 7417 + }, + { + "epoch": 0.6921713165997947, + "grad_norm": 1.8107269025807324, + "learning_rate": 0.00029443670581327493, + "loss": 7.2286, + "step": 7418 + }, + { + "epoch": 0.692264626294672, + "grad_norm": 402.60182679883064, + "learning_rate": 0.00029443466478534824, + "loss": 7.1935, + "step": 7419 + }, + { + "epoch": 0.6923579359895493, + "grad_norm": 393.33654068993167, + "learning_rate": 0.000294432623390167, + "loss": 7.234, + "step": 7420 + }, + { + "epoch": 0.6924512456844266, + "grad_norm": 0.9072561069095479, + "learning_rate": 0.00029443058162773636, + "loss": 7.3661, + "step": 7421 + }, + { + "epoch": 0.6925445553793039, + "grad_norm": 162220.3544283863, + "learning_rate": 0.0002944285394980615, + "loss": 7.3559, + "step": 7422 + }, + { + "epoch": 0.6926378650741812, + "grad_norm": 3008.716397939246, + "learning_rate": 0.00029442649700114773, + "loss": 7.1098, + "step": 7423 + }, + { + "epoch": 0.6927311747690585, + "grad_norm": 27.702241254554693, + "learning_rate": 0.00029442445413700014, + "loss": 7.7168, + "step": 7424 + }, + { + "epoch": 0.6928244844639359, + "grad_norm": 1.3142718186024012, + "learning_rate": 0.0002944224109056239, + "loss": 7.4704, + "step": 7425 + }, + { + "epoch": 0.692917794158813, + "grad_norm": 15.155793211808309, + "learning_rate": 0.0002944203673070243, + "loss": 7.2965, + "step": 7426 + }, + { + "epoch": 0.6930111038536904, + "grad_norm": 6486.711169648875, + "learning_rate": 0.00029441832334120647, + "loss": 7.5552, + "step": 7427 + }, + { + "epoch": 0.6931044135485677, + "grad_norm": 1.4722642952082219, + "learning_rate": 0.0002944162790081756, + "loss": 7.3665, + "step": 7428 + }, + { + "epoch": 0.693197723243445, + "grad_norm": 38.392866907745, + "learning_rate": 0.00029441423430793693, + "loss": 7.4817, + "step": 7429 + }, + { + "epoch": 0.6932910329383223, + "grad_norm": 3.7311529586634835, + "learning_rate": 0.00029441218924049564, + "loss": 7.4277, + "step": 7430 + }, + { + "epoch": 0.6933843426331996, + "grad_norm": 1.1781450310309276, + "learning_rate": 0.0002944101438058569, + "loss": 7.3256, + "step": 7431 + }, + { + "epoch": 0.6934776523280769, + "grad_norm": 5.439694362834183, + "learning_rate": 0.00029440809800402603, + "loss": 7.5595, + "step": 7432 + }, + { + "epoch": 0.6935709620229542, + "grad_norm": 29.521524477637556, + "learning_rate": 0.00029440605183500806, + "loss": 7.6916, + "step": 7433 + }, + { + "epoch": 0.6936642717178315, + "grad_norm": 31.970633634056387, + "learning_rate": 0.00029440400529880833, + "loss": 7.4705, + "step": 7434 + }, + { + "epoch": 0.6937575814127088, + "grad_norm": 13.513293435533527, + "learning_rate": 0.00029440195839543193, + "loss": 7.0574, + "step": 7435 + }, + { + "epoch": 0.6938508911075861, + "grad_norm": 20.654878239557956, + "learning_rate": 0.0002943999111248842, + "loss": 7.523, + "step": 7436 + }, + { + "epoch": 0.6939442008024633, + "grad_norm": 12.448432609248021, + "learning_rate": 0.00029439786348717023, + "loss": 7.2465, + "step": 7437 + }, + { + "epoch": 0.6940375104973406, + "grad_norm": 225.32948069150245, + "learning_rate": 0.0002943958154822953, + "loss": 7.47, + "step": 7438 + }, + { + "epoch": 0.694130820192218, + "grad_norm": 22.65528292639431, + "learning_rate": 0.00029439376711026447, + "loss": 7.2466, + "step": 7439 + }, + { + "epoch": 0.6942241298870953, + "grad_norm": 1.0307691476284322, + "learning_rate": 0.0002943917183710831, + "loss": 7.3729, + "step": 7440 + }, + { + "epoch": 0.6943174395819726, + "grad_norm": 1.9458422553716272, + "learning_rate": 0.00029438966926475645, + "loss": 7.1036, + "step": 7441 + }, + { + "epoch": 0.6944107492768499, + "grad_norm": 4.350568215120782, + "learning_rate": 0.0002943876197912896, + "loss": 7.2827, + "step": 7442 + }, + { + "epoch": 0.6945040589717272, + "grad_norm": 12246.890400305098, + "learning_rate": 0.0002943855699506878, + "loss": 7.3942, + "step": 7443 + }, + { + "epoch": 0.6945973686666045, + "grad_norm": 11.932448064740694, + "learning_rate": 0.0002943835197429563, + "loss": 7.0505, + "step": 7444 + }, + { + "epoch": 0.6946906783614818, + "grad_norm": 2.602190524252706, + "learning_rate": 0.00029438146916810023, + "loss": 7.3253, + "step": 7445 + }, + { + "epoch": 0.6947839880563591, + "grad_norm": 1.3907553425872514, + "learning_rate": 0.00029437941822612484, + "loss": 7.4779, + "step": 7446 + }, + { + "epoch": 0.6948772977512363, + "grad_norm": 18.141604666673928, + "learning_rate": 0.0002943773669170354, + "loss": 7.0566, + "step": 7447 + }, + { + "epoch": 0.6949706074461136, + "grad_norm": 37.16144427697655, + "learning_rate": 0.00029437531524083706, + "loss": 7.4292, + "step": 7448 + }, + { + "epoch": 0.6950639171409909, + "grad_norm": 7.077386911126933, + "learning_rate": 0.0002943732631975351, + "loss": 7.3882, + "step": 7449 + }, + { + "epoch": 0.6951572268358682, + "grad_norm": 4.2761005163034955, + "learning_rate": 0.0002943712107871347, + "loss": 7.27, + "step": 7450 + }, + { + "epoch": 0.6952505365307455, + "grad_norm": 2.453752928055806, + "learning_rate": 0.000294369158009641, + "loss": 7.1637, + "step": 7451 + }, + { + "epoch": 0.6953438462256228, + "grad_norm": 190.5356597001076, + "learning_rate": 0.00029436710486505937, + "loss": 7.7174, + "step": 7452 + }, + { + "epoch": 0.6954371559205001, + "grad_norm": 16.613954807095567, + "learning_rate": 0.0002943650513533949, + "loss": 7.6565, + "step": 7453 + }, + { + "epoch": 0.6955304656153775, + "grad_norm": 9572.588029417067, + "learning_rate": 0.0002943629974746529, + "loss": 7.1412, + "step": 7454 + }, + { + "epoch": 0.6956237753102548, + "grad_norm": 1.3062671241553252, + "learning_rate": 0.0002943609432288386, + "loss": 7.2407, + "step": 7455 + }, + { + "epoch": 0.6957170850051321, + "grad_norm": 20.34287497044367, + "learning_rate": 0.0002943588886159571, + "loss": 7.2842, + "step": 7456 + }, + { + "epoch": 0.6958103947000094, + "grad_norm": 33247.232311804844, + "learning_rate": 0.00029435683363601375, + "loss": 7.3904, + "step": 7457 + }, + { + "epoch": 0.6959037043948866, + "grad_norm": 6.986817451418972, + "learning_rate": 0.0002943547782890138, + "loss": 7.3579, + "step": 7458 + }, + { + "epoch": 0.6959970140897639, + "grad_norm": 14.097577530986657, + "learning_rate": 0.00029435272257496227, + "loss": 7.2541, + "step": 7459 + }, + { + "epoch": 0.6960903237846412, + "grad_norm": 2.387724762115768, + "learning_rate": 0.00029435066649386463, + "loss": 7.5607, + "step": 7460 + }, + { + "epoch": 0.6961836334795185, + "grad_norm": 1.7435268173933574, + "learning_rate": 0.000294348610045726, + "loss": 7.3154, + "step": 7461 + }, + { + "epoch": 0.6962769431743958, + "grad_norm": 5515.263135854745, + "learning_rate": 0.00029434655323055156, + "loss": 7.3029, + "step": 7462 + }, + { + "epoch": 0.6963702528692731, + "grad_norm": 3.8128306497817004, + "learning_rate": 0.00029434449604834664, + "loss": 7.4667, + "step": 7463 + }, + { + "epoch": 0.6964635625641504, + "grad_norm": 2.7972800670724314, + "learning_rate": 0.0002943424384991164, + "loss": 7.489, + "step": 7464 + }, + { + "epoch": 0.6965568722590277, + "grad_norm": 0.9196973208646371, + "learning_rate": 0.0002943403805828661, + "loss": 7.2467, + "step": 7465 + }, + { + "epoch": 0.696650181953905, + "grad_norm": 0.9468823720786057, + "learning_rate": 0.000294338322299601, + "loss": 7.3019, + "step": 7466 + }, + { + "epoch": 0.6967434916487824, + "grad_norm": 3.4007011614166593, + "learning_rate": 0.0002943362636493263, + "loss": 7.2851, + "step": 7467 + }, + { + "epoch": 0.6968368013436596, + "grad_norm": 20.951243867706552, + "learning_rate": 0.0002943342046320472, + "loss": 7.3556, + "step": 7468 + }, + { + "epoch": 0.6969301110385369, + "grad_norm": 7.881309110979847, + "learning_rate": 0.00029433214524776896, + "loss": 7.6426, + "step": 7469 + }, + { + "epoch": 0.6970234207334142, + "grad_norm": 3.108796134212256, + "learning_rate": 0.00029433008549649685, + "loss": 7.5803, + "step": 7470 + }, + { + "epoch": 0.6971167304282915, + "grad_norm": 33.333278277218234, + "learning_rate": 0.0002943280253782361, + "loss": 7.6299, + "step": 7471 + }, + { + "epoch": 0.6972100401231688, + "grad_norm": 997.0164183189949, + "learning_rate": 0.00029432596489299196, + "loss": 7.4797, + "step": 7472 + }, + { + "epoch": 0.6973033498180461, + "grad_norm": 1.6553595147200508, + "learning_rate": 0.0002943239040407696, + "loss": 7.2644, + "step": 7473 + }, + { + "epoch": 0.6973966595129234, + "grad_norm": 14519.440985363242, + "learning_rate": 0.00029432184282157425, + "loss": 7.3018, + "step": 7474 + }, + { + "epoch": 0.6974899692078007, + "grad_norm": 1.8614541024825182, + "learning_rate": 0.0002943197812354113, + "loss": 7.6488, + "step": 7475 + }, + { + "epoch": 0.697583278902678, + "grad_norm": 1.999625893316745, + "learning_rate": 0.00029431771928228585, + "loss": 7.3346, + "step": 7476 + }, + { + "epoch": 0.6976765885975553, + "grad_norm": 21.278761926179737, + "learning_rate": 0.00029431565696220324, + "loss": 7.3968, + "step": 7477 + }, + { + "epoch": 0.6977698982924326, + "grad_norm": 9.075852143582264, + "learning_rate": 0.0002943135942751686, + "loss": 7.1928, + "step": 7478 + }, + { + "epoch": 0.6978632079873098, + "grad_norm": 1462594.192717567, + "learning_rate": 0.0002943115312211873, + "loss": 7.4916, + "step": 7479 + }, + { + "epoch": 0.6979565176821871, + "grad_norm": 48825.70394123329, + "learning_rate": 0.0002943094678002645, + "loss": 7.4785, + "step": 7480 + }, + { + "epoch": 0.6980498273770644, + "grad_norm": 2.9677832826979045, + "learning_rate": 0.00029430740401240545, + "loss": 7.7492, + "step": 7481 + }, + { + "epoch": 0.6981431370719418, + "grad_norm": 8.016158523159396, + "learning_rate": 0.00029430533985761543, + "loss": 7.6069, + "step": 7482 + }, + { + "epoch": 0.6982364467668191, + "grad_norm": 1.2212654481227843, + "learning_rate": 0.0002943032753358997, + "loss": 7.279, + "step": 7483 + }, + { + "epoch": 0.6983297564616964, + "grad_norm": 39.868287548312416, + "learning_rate": 0.0002943012104472634, + "loss": 7.129, + "step": 7484 + }, + { + "epoch": 0.6984230661565737, + "grad_norm": 36.378585969691585, + "learning_rate": 0.000294299145191712, + "loss": 7.3065, + "step": 7485 + }, + { + "epoch": 0.698516375851451, + "grad_norm": 32.67012744347994, + "learning_rate": 0.00029429707956925055, + "loss": 7.5578, + "step": 7486 + }, + { + "epoch": 0.6986096855463283, + "grad_norm": 92.71970640419221, + "learning_rate": 0.00029429501357988437, + "loss": 7.2066, + "step": 7487 + }, + { + "epoch": 0.6987029952412056, + "grad_norm": 4.045792359137818, + "learning_rate": 0.0002942929472236187, + "loss": 7.2326, + "step": 7488 + }, + { + "epoch": 0.6987963049360829, + "grad_norm": 5.582265728915548, + "learning_rate": 0.0002942908805004588, + "loss": 7.7572, + "step": 7489 + }, + { + "epoch": 0.6988896146309601, + "grad_norm": 5.717059724323019, + "learning_rate": 0.00029428881341040996, + "loss": 7.5645, + "step": 7490 + }, + { + "epoch": 0.6989829243258374, + "grad_norm": 14.348275771314942, + "learning_rate": 0.0002942867459534774, + "loss": 7.5041, + "step": 7491 + }, + { + "epoch": 0.6990762340207147, + "grad_norm": 281.14408061633196, + "learning_rate": 0.0002942846781296664, + "loss": 7.1642, + "step": 7492 + }, + { + "epoch": 0.699169543715592, + "grad_norm": 24716.65774920691, + "learning_rate": 0.00029428260993898223, + "loss": 8.048, + "step": 7493 + }, + { + "epoch": 0.6992628534104693, + "grad_norm": 6.16580493827975, + "learning_rate": 0.0002942805413814301, + "loss": 7.5816, + "step": 7494 + }, + { + "epoch": 0.6993561631053467, + "grad_norm": 11717294.213540757, + "learning_rate": 0.0002942784724570152, + "loss": 7.4203, + "step": 7495 + }, + { + "epoch": 0.699449472800224, + "grad_norm": 241.1422629688341, + "learning_rate": 0.00029427640316574303, + "loss": 7.3807, + "step": 7496 + }, + { + "epoch": 0.6995427824951013, + "grad_norm": 1.8533881946849573, + "learning_rate": 0.0002942743335076186, + "loss": 7.5704, + "step": 7497 + }, + { + "epoch": 0.6996360921899786, + "grad_norm": 3.826806301246015, + "learning_rate": 0.00029427226348264737, + "loss": 7.3704, + "step": 7498 + }, + { + "epoch": 0.6997294018848559, + "grad_norm": 1.7018130910871665, + "learning_rate": 0.0002942701930908344, + "loss": 7.3599, + "step": 7499 + }, + { + "epoch": 0.6998227115797331, + "grad_norm": 16.088195419465055, + "learning_rate": 0.00029426812233218516, + "loss": 7.521, + "step": 7500 + }, + { + "epoch": 0.6999160212746104, + "grad_norm": 3.6609895491460467, + "learning_rate": 0.0002942660512067048, + "loss": 7.3692, + "step": 7501 + }, + { + "epoch": 0.7000093309694877, + "grad_norm": 11.283738590668417, + "learning_rate": 0.00029426397971439854, + "loss": 7.6176, + "step": 7502 + }, + { + "epoch": 0.700102640664365, + "grad_norm": 252.28732395599985, + "learning_rate": 0.0002942619078552718, + "loss": 7.3271, + "step": 7503 + }, + { + "epoch": 0.7001959503592423, + "grad_norm": 15.662447765709931, + "learning_rate": 0.0002942598356293297, + "loss": 7.5009, + "step": 7504 + }, + { + "epoch": 0.7002892600541196, + "grad_norm": 4.057721704081301, + "learning_rate": 0.0002942577630365776, + "loss": 7.3544, + "step": 7505 + }, + { + "epoch": 0.7003825697489969, + "grad_norm": 11.42341223864731, + "learning_rate": 0.00029425569007702075, + "loss": 7.797, + "step": 7506 + }, + { + "epoch": 0.7004758794438742, + "grad_norm": 769.3100329704669, + "learning_rate": 0.0002942536167506644, + "loss": 7.4058, + "step": 7507 + }, + { + "epoch": 0.7005691891387515, + "grad_norm": 8.554165787754648, + "learning_rate": 0.0002942515430575138, + "loss": 7.3895, + "step": 7508 + }, + { + "epoch": 0.7006624988336289, + "grad_norm": 2.042496124112732, + "learning_rate": 0.0002942494689975743, + "loss": 7.5227, + "step": 7509 + }, + { + "epoch": 0.7007558085285062, + "grad_norm": 11.386696133393619, + "learning_rate": 0.0002942473945708511, + "loss": 7.1863, + "step": 7510 + }, + { + "epoch": 0.7008491182233834, + "grad_norm": 1.8492004941826499, + "learning_rate": 0.00029424531977734956, + "loss": 7.5271, + "step": 7511 + }, + { + "epoch": 0.7009424279182607, + "grad_norm": 2.2465133376654878, + "learning_rate": 0.00029424324461707483, + "loss": 7.3197, + "step": 7512 + }, + { + "epoch": 0.701035737613138, + "grad_norm": 2.912027016893866, + "learning_rate": 0.00029424116909003233, + "loss": 7.4018, + "step": 7513 + }, + { + "epoch": 0.7011290473080153, + "grad_norm": 12.042783073649211, + "learning_rate": 0.0002942390931962272, + "loss": 7.3774, + "step": 7514 + }, + { + "epoch": 0.7012223570028926, + "grad_norm": 12.03899732430994, + "learning_rate": 0.00029423701693566477, + "loss": 7.6751, + "step": 7515 + }, + { + "epoch": 0.7013156666977699, + "grad_norm": 2.246477936740809, + "learning_rate": 0.00029423494030835043, + "loss": 7.0814, + "step": 7516 + }, + { + "epoch": 0.7014089763926472, + "grad_norm": 2.3932593263799884, + "learning_rate": 0.00029423286331428926, + "loss": 7.4151, + "step": 7517 + }, + { + "epoch": 0.7015022860875245, + "grad_norm": 2.418941155510867, + "learning_rate": 0.0002942307859534867, + "loss": 7.5383, + "step": 7518 + }, + { + "epoch": 0.7015955957824018, + "grad_norm": 3.0900355521557805, + "learning_rate": 0.00029422870822594794, + "loss": 7.1857, + "step": 7519 + }, + { + "epoch": 0.7016889054772791, + "grad_norm": 7.104388636404518, + "learning_rate": 0.00029422663013167836, + "loss": 7.5571, + "step": 7520 + }, + { + "epoch": 0.7017822151721563, + "grad_norm": 2.2047608953472158, + "learning_rate": 0.0002942245516706831, + "loss": 7.3098, + "step": 7521 + }, + { + "epoch": 0.7018755248670336, + "grad_norm": 4.327669853040609, + "learning_rate": 0.0002942224728429676, + "loss": 7.5788, + "step": 7522 + }, + { + "epoch": 0.701968834561911, + "grad_norm": 1.6860683616450842, + "learning_rate": 0.00029422039364853704, + "loss": 7.2403, + "step": 7523 + }, + { + "epoch": 0.7020621442567883, + "grad_norm": 22.333939069453844, + "learning_rate": 0.0002942183140873968, + "loss": 7.1383, + "step": 7524 + }, + { + "epoch": 0.7021554539516656, + "grad_norm": 10.760438966005909, + "learning_rate": 0.000294216234159552, + "loss": 7.3712, + "step": 7525 + }, + { + "epoch": 0.7022487636465429, + "grad_norm": 4.307640444938296, + "learning_rate": 0.0002942141538650081, + "loss": 7.3721, + "step": 7526 + }, + { + "epoch": 0.7023420733414202, + "grad_norm": 2.3576983901249715, + "learning_rate": 0.00029421207320377027, + "loss": 7.6796, + "step": 7527 + }, + { + "epoch": 0.7024353830362975, + "grad_norm": 5.848974609486211, + "learning_rate": 0.0002942099921758439, + "loss": 7.5456, + "step": 7528 + }, + { + "epoch": 0.7025286927311748, + "grad_norm": 16.170208018110785, + "learning_rate": 0.0002942079107812343, + "loss": 7.2566, + "step": 7529 + }, + { + "epoch": 0.7026220024260521, + "grad_norm": 2.4349156001866246, + "learning_rate": 0.0002942058290199466, + "loss": 7.5188, + "step": 7530 + }, + { + "epoch": 0.7027153121209294, + "grad_norm": 7.987709354477937, + "learning_rate": 0.00029420374689198624, + "loss": 7.6794, + "step": 7531 + }, + { + "epoch": 0.7028086218158066, + "grad_norm": 16.22836504539707, + "learning_rate": 0.00029420166439735843, + "loss": 7.5248, + "step": 7532 + }, + { + "epoch": 0.7029019315106839, + "grad_norm": 279.17517622926414, + "learning_rate": 0.0002941995815360685, + "loss": 7.8529, + "step": 7533 + }, + { + "epoch": 0.7029952412055612, + "grad_norm": 4.796988910115489, + "learning_rate": 0.0002941974983081218, + "loss": 7.2544, + "step": 7534 + }, + { + "epoch": 0.7030885509004385, + "grad_norm": 7.245231886129335, + "learning_rate": 0.0002941954147135236, + "loss": 7.4436, + "step": 7535 + }, + { + "epoch": 0.7031818605953158, + "grad_norm": 2.285634639012637, + "learning_rate": 0.00029419333075227913, + "loss": 7.1679, + "step": 7536 + }, + { + "epoch": 0.7032751702901932, + "grad_norm": 411.60708540439964, + "learning_rate": 0.0002941912464243937, + "loss": 7.6491, + "step": 7537 + }, + { + "epoch": 0.7033684799850705, + "grad_norm": 3.3868256242205925, + "learning_rate": 0.0002941891617298727, + "loss": 7.409, + "step": 7538 + }, + { + "epoch": 0.7034617896799478, + "grad_norm": 33.30332278149362, + "learning_rate": 0.00029418707666872135, + "loss": 7.4145, + "step": 7539 + }, + { + "epoch": 0.7035550993748251, + "grad_norm": 1.2701752608541577, + "learning_rate": 0.00029418499124094497, + "loss": 7.6246, + "step": 7540 + }, + { + "epoch": 0.7036484090697024, + "grad_norm": 154.22342487872527, + "learning_rate": 0.0002941829054465489, + "loss": 6.9703, + "step": 7541 + }, + { + "epoch": 0.7037417187645797, + "grad_norm": 21.029767994654552, + "learning_rate": 0.00029418081928553834, + "loss": 7.3104, + "step": 7542 + }, + { + "epoch": 0.7038350284594569, + "grad_norm": 7.434281477373079, + "learning_rate": 0.00029417873275791877, + "loss": 7.5805, + "step": 7543 + }, + { + "epoch": 0.7039283381543342, + "grad_norm": 12.213224797643095, + "learning_rate": 0.0002941766458636953, + "loss": 6.9636, + "step": 7544 + }, + { + "epoch": 0.7040216478492115, + "grad_norm": 2.6827178571948718, + "learning_rate": 0.0002941745586028734, + "loss": 7.4091, + "step": 7545 + }, + { + "epoch": 0.7041149575440888, + "grad_norm": 8.895106811244329, + "learning_rate": 0.0002941724709754583, + "loss": 7.597, + "step": 7546 + }, + { + "epoch": 0.7042082672389661, + "grad_norm": 73.12331119150048, + "learning_rate": 0.0002941703829814552, + "loss": 7.1587, + "step": 7547 + }, + { + "epoch": 0.7043015769338434, + "grad_norm": 8.71911150540216, + "learning_rate": 0.0002941682946208696, + "loss": 7.4842, + "step": 7548 + }, + { + "epoch": 0.7043948866287207, + "grad_norm": 7.874739389299537, + "learning_rate": 0.00029416620589370676, + "loss": 7.2897, + "step": 7549 + }, + { + "epoch": 0.704488196323598, + "grad_norm": 2.3170578859796622, + "learning_rate": 0.00029416411679997195, + "loss": 7.6745, + "step": 7550 + }, + { + "epoch": 0.7045815060184754, + "grad_norm": 2.415278999737635, + "learning_rate": 0.0002941620273396705, + "loss": 7.3015, + "step": 7551 + }, + { + "epoch": 0.7046748157133527, + "grad_norm": 4.7903918176437505, + "learning_rate": 0.0002941599375128077, + "loss": 7.6279, + "step": 7552 + }, + { + "epoch": 0.7047681254082299, + "grad_norm": 3.9736816673749735, + "learning_rate": 0.0002941578473193889, + "loss": 7.9425, + "step": 7553 + }, + { + "epoch": 0.7048614351031072, + "grad_norm": 10.389249245727214, + "learning_rate": 0.0002941557567594194, + "loss": 7.652, + "step": 7554 + }, + { + "epoch": 0.7049547447979845, + "grad_norm": 2.6407532785745333, + "learning_rate": 0.00029415366583290445, + "loss": 7.1314, + "step": 7555 + }, + { + "epoch": 0.7050480544928618, + "grad_norm": 252.1890531071373, + "learning_rate": 0.0002941515745398495, + "loss": 7.4589, + "step": 7556 + }, + { + "epoch": 0.7051413641877391, + "grad_norm": 5.139242308318233, + "learning_rate": 0.00029414948288025976, + "loss": 7.4202, + "step": 7557 + }, + { + "epoch": 0.7052346738826164, + "grad_norm": 70.99074235560649, + "learning_rate": 0.0002941473908541406, + "loss": 7.453, + "step": 7558 + }, + { + "epoch": 0.7053279835774937, + "grad_norm": 32.90299123980295, + "learning_rate": 0.0002941452984614973, + "loss": 7.4627, + "step": 7559 + }, + { + "epoch": 0.705421293272371, + "grad_norm": 22.15080069139947, + "learning_rate": 0.00029414320570233523, + "loss": 7.7029, + "step": 7560 + }, + { + "epoch": 0.7055146029672483, + "grad_norm": 2.0843152830876472, + "learning_rate": 0.0002941411125766597, + "loss": 7.6606, + "step": 7561 + }, + { + "epoch": 0.7056079126621256, + "grad_norm": 5.3338795438688535, + "learning_rate": 0.00029413901908447603, + "loss": 7.618, + "step": 7562 + }, + { + "epoch": 0.7057012223570029, + "grad_norm": 9.12522178653658, + "learning_rate": 0.0002941369252257895, + "loss": 7.7131, + "step": 7563 + }, + { + "epoch": 0.7057945320518801, + "grad_norm": 4125.684917348984, + "learning_rate": 0.00029413483100060545, + "loss": 7.2491, + "step": 7564 + }, + { + "epoch": 0.7058878417467574, + "grad_norm": 138.74382394267468, + "learning_rate": 0.00029413273640892923, + "loss": 7.6895, + "step": 7565 + }, + { + "epoch": 0.7059811514416348, + "grad_norm": 6.908842890247481, + "learning_rate": 0.0002941306414507662, + "loss": 7.4589, + "step": 7566 + }, + { + "epoch": 0.7060744611365121, + "grad_norm": 15.099310102796283, + "learning_rate": 0.0002941285461261216, + "loss": 7.1318, + "step": 7567 + }, + { + "epoch": 0.7061677708313894, + "grad_norm": 4232.525347092831, + "learning_rate": 0.0002941264504350008, + "loss": 7.4074, + "step": 7568 + }, + { + "epoch": 0.7062610805262667, + "grad_norm": 9.16687226037386, + "learning_rate": 0.00029412435437740914, + "loss": 7.2871, + "step": 7569 + }, + { + "epoch": 0.706354390221144, + "grad_norm": 6.1757665506974915, + "learning_rate": 0.0002941222579533519, + "loss": 7.4537, + "step": 7570 + }, + { + "epoch": 0.7064476999160213, + "grad_norm": 2.733786896293786, + "learning_rate": 0.0002941201611628345, + "loss": 7.2179, + "step": 7571 + }, + { + "epoch": 0.7065410096108986, + "grad_norm": 664.0173665577777, + "learning_rate": 0.0002941180640058622, + "loss": 7.0671, + "step": 7572 + }, + { + "epoch": 0.7066343193057759, + "grad_norm": 1.824379625603924, + "learning_rate": 0.0002941159664824404, + "loss": 7.6361, + "step": 7573 + }, + { + "epoch": 0.7067276290006531, + "grad_norm": 18.480492736218036, + "learning_rate": 0.00029411386859257437, + "loss": 7.431, + "step": 7574 + }, + { + "epoch": 0.7068209386955304, + "grad_norm": 431.5536229337865, + "learning_rate": 0.0002941117703362694, + "loss": 7.4293, + "step": 7575 + }, + { + "epoch": 0.7069142483904077, + "grad_norm": 9.3629226526294, + "learning_rate": 0.00029410967171353096, + "loss": 7.2653, + "step": 7576 + }, + { + "epoch": 0.707007558085285, + "grad_norm": 7.920510772621493, + "learning_rate": 0.0002941075727243643, + "loss": 7.3962, + "step": 7577 + }, + { + "epoch": 0.7071008677801623, + "grad_norm": 199.16343705093698, + "learning_rate": 0.00029410547336877475, + "loss": 7.0825, + "step": 7578 + }, + { + "epoch": 0.7071941774750397, + "grad_norm": 5.414326407308149, + "learning_rate": 0.00029410337364676765, + "loss": 7.6389, + "step": 7579 + }, + { + "epoch": 0.707287487169917, + "grad_norm": 1.7515593882775091, + "learning_rate": 0.0002941012735583484, + "loss": 7.2411, + "step": 7580 + }, + { + "epoch": 0.7073807968647943, + "grad_norm": 4.798163011993974, + "learning_rate": 0.00029409917310352227, + "loss": 7.5236, + "step": 7581 + }, + { + "epoch": 0.7074741065596716, + "grad_norm": 1974.2585053494433, + "learning_rate": 0.0002940970722822946, + "loss": 7.42, + "step": 7582 + }, + { + "epoch": 0.7075674162545489, + "grad_norm": 5.4979821650679845, + "learning_rate": 0.0002940949710946709, + "loss": 7.3946, + "step": 7583 + }, + { + "epoch": 0.7076607259494262, + "grad_norm": 23.870980232906664, + "learning_rate": 0.0002940928695406562, + "loss": 7.4554, + "step": 7584 + }, + { + "epoch": 0.7077540356443034, + "grad_norm": 22.75009317696326, + "learning_rate": 0.0002940907676202561, + "loss": 7.3401, + "step": 7585 + }, + { + "epoch": 0.7078473453391807, + "grad_norm": 2.884028829770254, + "learning_rate": 0.00029408866533347583, + "loss": 7.5778, + "step": 7586 + }, + { + "epoch": 0.707940655034058, + "grad_norm": 1.6981550798328675, + "learning_rate": 0.0002940865626803208, + "loss": 7.4078, + "step": 7587 + }, + { + "epoch": 0.7080339647289353, + "grad_norm": 1.6790762047956698, + "learning_rate": 0.0002940844596607963, + "loss": 7.544, + "step": 7588 + }, + { + "epoch": 0.7081272744238126, + "grad_norm": 2.208494900035978, + "learning_rate": 0.0002940823562749077, + "loss": 7.5094, + "step": 7589 + }, + { + "epoch": 0.7082205841186899, + "grad_norm": 18.932308245092514, + "learning_rate": 0.0002940802525226604, + "loss": 7.4941, + "step": 7590 + }, + { + "epoch": 0.7083138938135672, + "grad_norm": 25710.267578629566, + "learning_rate": 0.00029407814840405963, + "loss": 7.292, + "step": 7591 + }, + { + "epoch": 0.7084072035084445, + "grad_norm": 1.0804275071033593, + "learning_rate": 0.0002940760439191108, + "loss": 7.2445, + "step": 7592 + }, + { + "epoch": 0.7085005132033219, + "grad_norm": 86.00582152806535, + "learning_rate": 0.00029407393906781933, + "loss": 7.589, + "step": 7593 + }, + { + "epoch": 0.7085938228981992, + "grad_norm": 1787.6040962725272, + "learning_rate": 0.00029407183385019047, + "loss": 7.313, + "step": 7594 + }, + { + "epoch": 0.7086871325930765, + "grad_norm": 17.0409912480714, + "learning_rate": 0.0002940697282662296, + "loss": 7.5421, + "step": 7595 + }, + { + "epoch": 0.7087804422879537, + "grad_norm": 1.3121434110307104, + "learning_rate": 0.0002940676223159421, + "loss": 7.188, + "step": 7596 + }, + { + "epoch": 0.708873751982831, + "grad_norm": 28.528729612278767, + "learning_rate": 0.00029406551599933336, + "loss": 7.0833, + "step": 7597 + }, + { + "epoch": 0.7089670616777083, + "grad_norm": 13.379587234275807, + "learning_rate": 0.00029406340931640863, + "loss": 7.4067, + "step": 7598 + }, + { + "epoch": 0.7090603713725856, + "grad_norm": 3.6241945033000524, + "learning_rate": 0.00029406130226717334, + "loss": 7.4134, + "step": 7599 + }, + { + "epoch": 0.7091536810674629, + "grad_norm": 1.576936919511795, + "learning_rate": 0.0002940591948516329, + "loss": 7.2889, + "step": 7600 + }, + { + "epoch": 0.7092469907623402, + "grad_norm": 6.074651251440895, + "learning_rate": 0.00029405708706979253, + "loss": 7.3314, + "step": 7601 + }, + { + "epoch": 0.7093403004572175, + "grad_norm": 26.167103787556464, + "learning_rate": 0.0002940549789216577, + "loss": 7.1392, + "step": 7602 + }, + { + "epoch": 0.7094336101520948, + "grad_norm": 11.24165171315326, + "learning_rate": 0.00029405287040723364, + "loss": 7.3798, + "step": 7603 + }, + { + "epoch": 0.7095269198469721, + "grad_norm": 19.77862922518186, + "learning_rate": 0.0002940507615265259, + "loss": 7.5095, + "step": 7604 + }, + { + "epoch": 0.7096202295418494, + "grad_norm": 3.7335496848025516, + "learning_rate": 0.00029404865227953967, + "loss": 6.9648, + "step": 7605 + }, + { + "epoch": 0.7097135392367266, + "grad_norm": 1.3119759959888955, + "learning_rate": 0.00029404654266628047, + "loss": 7.2478, + "step": 7606 + }, + { + "epoch": 0.709806848931604, + "grad_norm": 4.5383896167367865, + "learning_rate": 0.0002940444326867535, + "loss": 7.6608, + "step": 7607 + }, + { + "epoch": 0.7099001586264813, + "grad_norm": 3.434789466727485, + "learning_rate": 0.00029404232234096425, + "loss": 7.5286, + "step": 7608 + }, + { + "epoch": 0.7099934683213586, + "grad_norm": 64.7503322363885, + "learning_rate": 0.00029404021162891803, + "loss": 7.2108, + "step": 7609 + }, + { + "epoch": 0.7100867780162359, + "grad_norm": 1280813.8575944747, + "learning_rate": 0.00029403810055062023, + "loss": 7.7232, + "step": 7610 + }, + { + "epoch": 0.7101800877111132, + "grad_norm": 180.74132933650327, + "learning_rate": 0.0002940359891060762, + "loss": 7.5846, + "step": 7611 + }, + { + "epoch": 0.7102733974059905, + "grad_norm": 141.26456471158795, + "learning_rate": 0.00029403387729529134, + "loss": 7.522, + "step": 7612 + }, + { + "epoch": 0.7103667071008678, + "grad_norm": 2777.0051739661426, + "learning_rate": 0.000294031765118271, + "loss": 8.0486, + "step": 7613 + }, + { + "epoch": 0.7104600167957451, + "grad_norm": 573.6485086706235, + "learning_rate": 0.00029402965257502046, + "loss": 7.7419, + "step": 7614 + }, + { + "epoch": 0.7105533264906224, + "grad_norm": 1082.870832605124, + "learning_rate": 0.00029402753966554525, + "loss": 7.4895, + "step": 7615 + }, + { + "epoch": 0.7106466361854997, + "grad_norm": 1.3657457432881233, + "learning_rate": 0.0002940254263898507, + "loss": 7.4415, + "step": 7616 + }, + { + "epoch": 0.7107399458803769, + "grad_norm": 69.1302833751553, + "learning_rate": 0.00029402331274794205, + "loss": 7.3117, + "step": 7617 + }, + { + "epoch": 0.7108332555752542, + "grad_norm": 4.706800356353898, + "learning_rate": 0.0002940211987398249, + "loss": 7.3028, + "step": 7618 + }, + { + "epoch": 0.7109265652701315, + "grad_norm": 169.4687262511966, + "learning_rate": 0.0002940190843655044, + "loss": 7.5601, + "step": 7619 + }, + { + "epoch": 0.7110198749650088, + "grad_norm": 1.4752245189802553, + "learning_rate": 0.0002940169696249861, + "loss": 7.4266, + "step": 7620 + }, + { + "epoch": 0.7111131846598862, + "grad_norm": 73.44947310504439, + "learning_rate": 0.00029401485451827525, + "loss": 7.7151, + "step": 7621 + }, + { + "epoch": 0.7112064943547635, + "grad_norm": 1.746332256543853, + "learning_rate": 0.0002940127390453773, + "loss": 7.3016, + "step": 7622 + }, + { + "epoch": 0.7112998040496408, + "grad_norm": 2.619848480292373, + "learning_rate": 0.0002940106232062977, + "loss": 7.4631, + "step": 7623 + }, + { + "epoch": 0.7113931137445181, + "grad_norm": 1.3492056679133206, + "learning_rate": 0.0002940085070010417, + "loss": 7.2624, + "step": 7624 + }, + { + "epoch": 0.7114864234393954, + "grad_norm": 1.031816542938988, + "learning_rate": 0.00029400639042961467, + "loss": 7.2615, + "step": 7625 + }, + { + "epoch": 0.7115797331342727, + "grad_norm": 1.2051213885274228, + "learning_rate": 0.0002940042734920221, + "loss": 7.5493, + "step": 7626 + }, + { + "epoch": 0.7116730428291499, + "grad_norm": 57.62946648691784, + "learning_rate": 0.0002940021561882693, + "loss": 7.396, + "step": 7627 + }, + { + "epoch": 0.7117663525240272, + "grad_norm": 0.8845811619172055, + "learning_rate": 0.00029400003851836167, + "loss": 7.3756, + "step": 7628 + }, + { + "epoch": 0.7118596622189045, + "grad_norm": 0.9388676635846478, + "learning_rate": 0.0002939979204823046, + "loss": 7.4889, + "step": 7629 + }, + { + "epoch": 0.7119529719137818, + "grad_norm": 1.6816336700748238, + "learning_rate": 0.00029399580208010347, + "loss": 7.4374, + "step": 7630 + }, + { + "epoch": 0.7120462816086591, + "grad_norm": 0.9946807097490019, + "learning_rate": 0.0002939936833117637, + "loss": 7.3628, + "step": 7631 + }, + { + "epoch": 0.7121395913035364, + "grad_norm": 15.82999137089567, + "learning_rate": 0.0002939915641772906, + "loss": 7.1747, + "step": 7632 + }, + { + "epoch": 0.7122329009984137, + "grad_norm": 0.8590174022591571, + "learning_rate": 0.00029398944467668966, + "loss": 7.3245, + "step": 7633 + }, + { + "epoch": 0.712326210693291, + "grad_norm": 42.20149778992895, + "learning_rate": 0.00029398732480996614, + "loss": 7.3746, + "step": 7634 + }, + { + "epoch": 0.7124195203881684, + "grad_norm": 1.5274376573392672, + "learning_rate": 0.00029398520457712556, + "loss": 7.6926, + "step": 7635 + }, + { + "epoch": 0.7125128300830457, + "grad_norm": 9.145665820658945, + "learning_rate": 0.00029398308397817326, + "loss": 7.3523, + "step": 7636 + }, + { + "epoch": 0.712606139777923, + "grad_norm": 40.41737597923349, + "learning_rate": 0.0002939809630131146, + "loss": 7.6322, + "step": 7637 + }, + { + "epoch": 0.7126994494728002, + "grad_norm": 6.213183765685881, + "learning_rate": 0.000293978841681955, + "loss": 7.2458, + "step": 7638 + }, + { + "epoch": 0.7127927591676775, + "grad_norm": 0.7075741828935782, + "learning_rate": 0.0002939767199846999, + "loss": 7.5332, + "step": 7639 + }, + { + "epoch": 0.7128860688625548, + "grad_norm": 0.7678126075728148, + "learning_rate": 0.00029397459792135464, + "loss": 7.4976, + "step": 7640 + }, + { + "epoch": 0.7129793785574321, + "grad_norm": 0.6094034643723005, + "learning_rate": 0.00029397247549192457, + "loss": 7.5934, + "step": 7641 + }, + { + "epoch": 0.7130726882523094, + "grad_norm": 2.4721152456612137, + "learning_rate": 0.00029397035269641516, + "loss": 7.3915, + "step": 7642 + }, + { + "epoch": 0.7131659979471867, + "grad_norm": 0.9733408714309956, + "learning_rate": 0.0002939682295348318, + "loss": 7.2767, + "step": 7643 + }, + { + "epoch": 0.713259307642064, + "grad_norm": 0.6256442221662907, + "learning_rate": 0.0002939661060071799, + "loss": 7.2511, + "step": 7644 + }, + { + "epoch": 0.7133526173369413, + "grad_norm": 30.015747469144905, + "learning_rate": 0.0002939639821134649, + "loss": 7.374, + "step": 7645 + }, + { + "epoch": 0.7134459270318186, + "grad_norm": 0.7269375450986397, + "learning_rate": 0.000293961857853692, + "loss": 7.0882, + "step": 7646 + }, + { + "epoch": 0.713539236726696, + "grad_norm": 1.1834738476410827, + "learning_rate": 0.0002939597332278669, + "loss": 7.3807, + "step": 7647 + }, + { + "epoch": 0.7136325464215733, + "grad_norm": 1.450692182321186, + "learning_rate": 0.0002939576082359947, + "loss": 7.5296, + "step": 7648 + }, + { + "epoch": 0.7137258561164505, + "grad_norm": 0.47665282354139415, + "learning_rate": 0.000293955482878081, + "loss": 7.2283, + "step": 7649 + }, + { + "epoch": 0.7138191658113278, + "grad_norm": 0.9785550255032812, + "learning_rate": 0.00029395335715413115, + "loss": 7.5664, + "step": 7650 + }, + { + "epoch": 0.7139124755062051, + "grad_norm": 1.23812129288439, + "learning_rate": 0.00029395123106415054, + "loss": 7.4719, + "step": 7651 + }, + { + "epoch": 0.7140057852010824, + "grad_norm": 6.594546812877382, + "learning_rate": 0.0002939491046081446, + "loss": 7.3942, + "step": 7652 + }, + { + "epoch": 0.7140990948959597, + "grad_norm": 206.67471743271872, + "learning_rate": 0.0002939469777861187, + "loss": 7.2721, + "step": 7653 + }, + { + "epoch": 0.714192404590837, + "grad_norm": 1.073181073935075, + "learning_rate": 0.00029394485059807833, + "loss": 7.2471, + "step": 7654 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 3.3882114735858084, + "learning_rate": 0.0002939427230440288, + "loss": 7.3636, + "step": 7655 + }, + { + "epoch": 0.7143790239805916, + "grad_norm": 0.9775748622263449, + "learning_rate": 0.00029394059512397557, + "loss": 7.41, + "step": 7656 + }, + { + "epoch": 0.7144723336754689, + "grad_norm": 66.85711705360144, + "learning_rate": 0.0002939384668379241, + "loss": 7.1777, + "step": 7657 + }, + { + "epoch": 0.7145656433703462, + "grad_norm": 1.3389484966288376, + "learning_rate": 0.00029393633818587965, + "loss": 7.603, + "step": 7658 + }, + { + "epoch": 0.7146589530652234, + "grad_norm": 0.6624750986290903, + "learning_rate": 0.0002939342091678478, + "loss": 7.2642, + "step": 7659 + }, + { + "epoch": 0.7147522627601007, + "grad_norm": 1.3684859389971904, + "learning_rate": 0.00029393207978383384, + "loss": 7.524, + "step": 7660 + }, + { + "epoch": 0.714845572454978, + "grad_norm": 121.0178005920298, + "learning_rate": 0.00029392995003384327, + "loss": 7.354, + "step": 7661 + }, + { + "epoch": 0.7149388821498553, + "grad_norm": 1.0325095060151277, + "learning_rate": 0.0002939278199178815, + "loss": 7.2891, + "step": 7662 + }, + { + "epoch": 0.7150321918447327, + "grad_norm": 1.1441477570254501, + "learning_rate": 0.0002939256894359539, + "loss": 7.1904, + "step": 7663 + }, + { + "epoch": 0.71512550153961, + "grad_norm": 0.6485511380227308, + "learning_rate": 0.0002939235585880659, + "loss": 7.4025, + "step": 7664 + }, + { + "epoch": 0.7152188112344873, + "grad_norm": 0.5802293603960231, + "learning_rate": 0.0002939214273742229, + "loss": 7.2394, + "step": 7665 + }, + { + "epoch": 0.7153121209293646, + "grad_norm": 1.8970509905588429, + "learning_rate": 0.00029391929579443036, + "loss": 7.7145, + "step": 7666 + }, + { + "epoch": 0.7154054306242419, + "grad_norm": 1.0120880421855118, + "learning_rate": 0.0002939171638486937, + "loss": 7.2552, + "step": 7667 + }, + { + "epoch": 0.7154987403191192, + "grad_norm": 0.8132751325773296, + "learning_rate": 0.00029391503153701834, + "loss": 7.3932, + "step": 7668 + }, + { + "epoch": 0.7155920500139965, + "grad_norm": 0.6039439960163967, + "learning_rate": 0.0002939128988594096, + "loss": 7.2289, + "step": 7669 + }, + { + "epoch": 0.7156853597088737, + "grad_norm": 0.5458987917815854, + "learning_rate": 0.0002939107658158731, + "loss": 7.3347, + "step": 7670 + }, + { + "epoch": 0.715778669403751, + "grad_norm": 0.9673897372229824, + "learning_rate": 0.0002939086324064141, + "loss": 7.1119, + "step": 7671 + }, + { + "epoch": 0.7158719790986283, + "grad_norm": 533.8719138064611, + "learning_rate": 0.00029390649863103807, + "loss": 7.2512, + "step": 7672 + }, + { + "epoch": 0.7159652887935056, + "grad_norm": 1.3453608118778935, + "learning_rate": 0.00029390436448975044, + "loss": 7.168, + "step": 7673 + }, + { + "epoch": 0.7160585984883829, + "grad_norm": 0.3749288812653031, + "learning_rate": 0.00029390222998255667, + "loss": 7.2881, + "step": 7674 + }, + { + "epoch": 0.7161519081832602, + "grad_norm": 0.7453738746404783, + "learning_rate": 0.00029390009510946207, + "loss": 7.3902, + "step": 7675 + }, + { + "epoch": 0.7162452178781376, + "grad_norm": 0.5578375895817917, + "learning_rate": 0.00029389795987047227, + "loss": 7.261, + "step": 7676 + }, + { + "epoch": 0.7163385275730149, + "grad_norm": 12.953644460165354, + "learning_rate": 0.00029389582426559254, + "loss": 7.7255, + "step": 7677 + }, + { + "epoch": 0.7164318372678922, + "grad_norm": 0.7905208822831097, + "learning_rate": 0.00029389368829482835, + "loss": 7.166, + "step": 7678 + }, + { + "epoch": 0.7165251469627695, + "grad_norm": 574.0835473163256, + "learning_rate": 0.00029389155195818506, + "loss": 7.1928, + "step": 7679 + }, + { + "epoch": 0.7166184566576467, + "grad_norm": 1086.0687130350248, + "learning_rate": 0.0002938894152556683, + "loss": 7.3116, + "step": 7680 + }, + { + "epoch": 0.716711766352524, + "grad_norm": 0.9778710072468095, + "learning_rate": 0.0002938872781872833, + "loss": 7.2352, + "step": 7681 + }, + { + "epoch": 0.7168050760474013, + "grad_norm": 0.7607173240958796, + "learning_rate": 0.0002938851407530356, + "loss": 7.0762, + "step": 7682 + }, + { + "epoch": 0.7168983857422786, + "grad_norm": 0.431837699629038, + "learning_rate": 0.00029388300295293064, + "loss": 7.1937, + "step": 7683 + }, + { + "epoch": 0.7169916954371559, + "grad_norm": 243.06385519092433, + "learning_rate": 0.0002938808647869738, + "loss": 7.5594, + "step": 7684 + }, + { + "epoch": 0.7170850051320332, + "grad_norm": 1269.360309611312, + "learning_rate": 0.0002938787262551705, + "loss": 7.1185, + "step": 7685 + }, + { + "epoch": 0.7171783148269105, + "grad_norm": 0.8538401390840045, + "learning_rate": 0.00029387658735752627, + "loss": 7.1801, + "step": 7686 + }, + { + "epoch": 0.7172716245217878, + "grad_norm": 4.115717734756041, + "learning_rate": 0.00029387444809404647, + "loss": 7.2254, + "step": 7687 + }, + { + "epoch": 0.7173649342166651, + "grad_norm": 1.4224891016140508, + "learning_rate": 0.0002938723084647366, + "loss": 7.686, + "step": 7688 + }, + { + "epoch": 0.7174582439115424, + "grad_norm": 2.030342262902594, + "learning_rate": 0.00029387016846960204, + "loss": 7.1719, + "step": 7689 + }, + { + "epoch": 0.7175515536064198, + "grad_norm": 2.2832780516464894, + "learning_rate": 0.00029386802810864825, + "loss": 7.9194, + "step": 7690 + }, + { + "epoch": 0.717644863301297, + "grad_norm": 2.7557455346846567, + "learning_rate": 0.00029386588738188074, + "loss": 7.6578, + "step": 7691 + }, + { + "epoch": 0.7177381729961743, + "grad_norm": 2.916843013292216, + "learning_rate": 0.0002938637462893048, + "loss": 7.8839, + "step": 7692 + }, + { + "epoch": 0.7178314826910516, + "grad_norm": 2.515639174171577, + "learning_rate": 0.0002938616048309261, + "loss": 7.7469, + "step": 7693 + }, + { + "epoch": 0.7179247923859289, + "grad_norm": 9.755539750000453, + "learning_rate": 0.0002938594630067498, + "loss": 7.9145, + "step": 7694 + }, + { + "epoch": 0.7180181020808062, + "grad_norm": 1.2839075526409054, + "learning_rate": 0.0002938573208167816, + "loss": 7.7091, + "step": 7695 + }, + { + "epoch": 0.7181114117756835, + "grad_norm": 1.0586404368904403, + "learning_rate": 0.0002938551782610268, + "loss": 7.5968, + "step": 7696 + }, + { + "epoch": 0.7182047214705608, + "grad_norm": 1.7947565551561533, + "learning_rate": 0.00029385303533949094, + "loss": 7.6755, + "step": 7697 + }, + { + "epoch": 0.7182980311654381, + "grad_norm": 2.137693257196279, + "learning_rate": 0.00029385089205217937, + "loss": 7.3174, + "step": 7698 + }, + { + "epoch": 0.7183913408603154, + "grad_norm": 2.133894861428657, + "learning_rate": 0.00029384874839909767, + "loss": 7.6426, + "step": 7699 + }, + { + "epoch": 0.7184846505551927, + "grad_norm": 1.4046750552862897, + "learning_rate": 0.00029384660438025115, + "loss": 7.5836, + "step": 7700 + }, + { + "epoch": 0.71857796025007, + "grad_norm": 3.335349881523776, + "learning_rate": 0.00029384445999564533, + "loss": 7.8762, + "step": 7701 + }, + { + "epoch": 0.7186712699449472, + "grad_norm": 2.375480570593548, + "learning_rate": 0.00029384231524528566, + "loss": 8.0966, + "step": 7702 + }, + { + "epoch": 0.7187645796398245, + "grad_norm": 1.4461048511876642, + "learning_rate": 0.00029384017012917764, + "loss": 7.7897, + "step": 7703 + }, + { + "epoch": 0.7188578893347018, + "grad_norm": 1.8595410067959486, + "learning_rate": 0.0002938380246473266, + "loss": 7.3983, + "step": 7704 + }, + { + "epoch": 0.7189511990295792, + "grad_norm": 1.9081625041581503, + "learning_rate": 0.0002938358787997381, + "loss": 7.4475, + "step": 7705 + }, + { + "epoch": 0.7190445087244565, + "grad_norm": 1.4145406797064508, + "learning_rate": 0.0002938337325864176, + "loss": 7.4084, + "step": 7706 + }, + { + "epoch": 0.7191378184193338, + "grad_norm": 1.2527178752780117, + "learning_rate": 0.0002938315860073705, + "loss": 7.2783, + "step": 7707 + }, + { + "epoch": 0.7192311281142111, + "grad_norm": 1.4363075790179336, + "learning_rate": 0.0002938294390626023, + "loss": 7.2829, + "step": 7708 + }, + { + "epoch": 0.7193244378090884, + "grad_norm": 2.61123896844163, + "learning_rate": 0.0002938272917521184, + "loss": 7.4883, + "step": 7709 + }, + { + "epoch": 0.7194177475039657, + "grad_norm": 2.104644838927439, + "learning_rate": 0.00029382514407592437, + "loss": 7.3913, + "step": 7710 + }, + { + "epoch": 0.719511057198843, + "grad_norm": 1.9068685452789622, + "learning_rate": 0.0002938229960340255, + "loss": 7.776, + "step": 7711 + }, + { + "epoch": 0.7196043668937202, + "grad_norm": 1.2213848000454255, + "learning_rate": 0.00029382084762642746, + "loss": 7.7355, + "step": 7712 + }, + { + "epoch": 0.7196976765885975, + "grad_norm": 2.4836512277213543, + "learning_rate": 0.00029381869885313557, + "loss": 7.5552, + "step": 7713 + }, + { + "epoch": 0.7197909862834748, + "grad_norm": 2.351625830837886, + "learning_rate": 0.00029381654971415535, + "loss": 7.5441, + "step": 7714 + }, + { + "epoch": 0.7198842959783521, + "grad_norm": 1.695342009036449, + "learning_rate": 0.0002938144002094922, + "loss": 7.2298, + "step": 7715 + }, + { + "epoch": 0.7199776056732294, + "grad_norm": 1.107792059799516, + "learning_rate": 0.00029381225033915166, + "loss": 7.2047, + "step": 7716 + }, + { + "epoch": 0.7200709153681067, + "grad_norm": 2.6534972925523914, + "learning_rate": 0.00029381010010313914, + "loss": 7.8925, + "step": 7717 + }, + { + "epoch": 0.720164225062984, + "grad_norm": 2.8532973201646468, + "learning_rate": 0.00029380794950146014, + "loss": 7.5799, + "step": 7718 + }, + { + "epoch": 0.7202575347578614, + "grad_norm": 3.4235573233222234, + "learning_rate": 0.00029380579853412013, + "loss": 7.4226, + "step": 7719 + }, + { + "epoch": 0.7203508444527387, + "grad_norm": 1.8186502130109896, + "learning_rate": 0.0002938036472011246, + "loss": 7.8594, + "step": 7720 + }, + { + "epoch": 0.720444154147616, + "grad_norm": 1.3673019408749756, + "learning_rate": 0.000293801495502479, + "loss": 7.73, + "step": 7721 + }, + { + "epoch": 0.7205374638424933, + "grad_norm": 2.1016357348498143, + "learning_rate": 0.0002937993434381887, + "loss": 7.5002, + "step": 7722 + }, + { + "epoch": 0.7206307735373705, + "grad_norm": 1.741460590936247, + "learning_rate": 0.0002937971910082594, + "loss": 7.607, + "step": 7723 + }, + { + "epoch": 0.7207240832322478, + "grad_norm": 1.6393721830930394, + "learning_rate": 0.00029379503821269637, + "loss": 7.2592, + "step": 7724 + }, + { + "epoch": 0.7208173929271251, + "grad_norm": 2.0055214155631935, + "learning_rate": 0.00029379288505150514, + "loss": 7.6054, + "step": 7725 + }, + { + "epoch": 0.7209107026220024, + "grad_norm": 2.8013779900428712, + "learning_rate": 0.00029379073152469124, + "loss": 7.6781, + "step": 7726 + }, + { + "epoch": 0.7210040123168797, + "grad_norm": 1.1124190131387328, + "learning_rate": 0.00029378857763226007, + "loss": 7.3088, + "step": 7727 + }, + { + "epoch": 0.721097322011757, + "grad_norm": 1.238859181353852, + "learning_rate": 0.0002937864233742171, + "loss": 7.4563, + "step": 7728 + }, + { + "epoch": 0.7211906317066343, + "grad_norm": 3.070645539378536, + "learning_rate": 0.0002937842687505679, + "loss": 7.4676, + "step": 7729 + }, + { + "epoch": 0.7212839414015116, + "grad_norm": 1.0572279288702162, + "learning_rate": 0.0002937821137613179, + "loss": 7.4353, + "step": 7730 + }, + { + "epoch": 0.721377251096389, + "grad_norm": 0.9176365687434876, + "learning_rate": 0.00029377995840647255, + "loss": 7.3226, + "step": 7731 + }, + { + "epoch": 0.7214705607912663, + "grad_norm": 0.7220926119758431, + "learning_rate": 0.00029377780268603743, + "loss": 7.4977, + "step": 7732 + }, + { + "epoch": 0.7215638704861435, + "grad_norm": 1.5802660706441227, + "learning_rate": 0.0002937756466000179, + "loss": 7.3333, + "step": 7733 + }, + { + "epoch": 0.7216571801810208, + "grad_norm": 1.674321232364289, + "learning_rate": 0.00029377349014841946, + "loss": 7.7077, + "step": 7734 + }, + { + "epoch": 0.7217504898758981, + "grad_norm": 1.7314052510037603, + "learning_rate": 0.00029377133333124766, + "loss": 7.6531, + "step": 7735 + }, + { + "epoch": 0.7218437995707754, + "grad_norm": 3.5088411454653157, + "learning_rate": 0.0002937691761485079, + "loss": 7.4363, + "step": 7736 + }, + { + "epoch": 0.7219371092656527, + "grad_norm": 0.7159100381518106, + "learning_rate": 0.00029376701860020575, + "loss": 7.4785, + "step": 7737 + }, + { + "epoch": 0.72203041896053, + "grad_norm": 0.6674175359626511, + "learning_rate": 0.0002937648606863467, + "loss": 7.3752, + "step": 7738 + }, + { + "epoch": 0.7221237286554073, + "grad_norm": 1.0764347791144289, + "learning_rate": 0.00029376270240693615, + "loss": 7.2199, + "step": 7739 + }, + { + "epoch": 0.7222170383502846, + "grad_norm": 13.678794726824432, + "learning_rate": 0.0002937605437619796, + "loss": 7.2686, + "step": 7740 + }, + { + "epoch": 0.7223103480451619, + "grad_norm": 1.1311641395435879, + "learning_rate": 0.0002937583847514826, + "loss": 7.5167, + "step": 7741 + }, + { + "epoch": 0.7224036577400392, + "grad_norm": 0.622874141974698, + "learning_rate": 0.0002937562253754506, + "loss": 7.3748, + "step": 7742 + }, + { + "epoch": 0.7224969674349165, + "grad_norm": 1.557554825733583, + "learning_rate": 0.00029375406563388913, + "loss": 7.5332, + "step": 7743 + }, + { + "epoch": 0.7225902771297937, + "grad_norm": 0.7625875197886665, + "learning_rate": 0.0002937519055268037, + "loss": 7.4162, + "step": 7744 + }, + { + "epoch": 0.722683586824671, + "grad_norm": 0.985673703352911, + "learning_rate": 0.00029374974505419965, + "loss": 7.3496, + "step": 7745 + }, + { + "epoch": 0.7227768965195484, + "grad_norm": 5.709460501260533, + "learning_rate": 0.00029374758421608265, + "loss": 7.3735, + "step": 7746 + }, + { + "epoch": 0.7228702062144257, + "grad_norm": 308.91684322809743, + "learning_rate": 0.0002937454230124581, + "loss": 7.3001, + "step": 7747 + }, + { + "epoch": 0.722963515909303, + "grad_norm": 1.7418109184544903, + "learning_rate": 0.0002937432614433315, + "loss": 7.4624, + "step": 7748 + }, + { + "epoch": 0.7230568256041803, + "grad_norm": 1.2188241789848304, + "learning_rate": 0.00029374109950870843, + "loss": 7.5915, + "step": 7749 + }, + { + "epoch": 0.7231501352990576, + "grad_norm": 0.8803922983029606, + "learning_rate": 0.0002937389372085943, + "loss": 7.5401, + "step": 7750 + }, + { + "epoch": 0.7232434449939349, + "grad_norm": 1697.0366154326323, + "learning_rate": 0.00029373677454299464, + "loss": 7.3368, + "step": 7751 + }, + { + "epoch": 0.7233367546888122, + "grad_norm": 0.9127194959014506, + "learning_rate": 0.00029373461151191487, + "loss": 7.1961, + "step": 7752 + }, + { + "epoch": 0.7234300643836895, + "grad_norm": 2.0085829869233494, + "learning_rate": 0.0002937324481153606, + "loss": 7.5109, + "step": 7753 + }, + { + "epoch": 0.7235233740785668, + "grad_norm": 1.8951472500442363, + "learning_rate": 0.00029373028435333734, + "loss": 7.1988, + "step": 7754 + }, + { + "epoch": 0.723616683773444, + "grad_norm": 0.7297110037684924, + "learning_rate": 0.0002937281202258505, + "loss": 7.2795, + "step": 7755 + }, + { + "epoch": 0.7237099934683213, + "grad_norm": 1.7266902637383277, + "learning_rate": 0.00029372595573290563, + "loss": 7.359, + "step": 7756 + }, + { + "epoch": 0.7238033031631986, + "grad_norm": 1.5102725646527766, + "learning_rate": 0.0002937237908745082, + "loss": 7.9112, + "step": 7757 + }, + { + "epoch": 0.7238966128580759, + "grad_norm": 1.970567316514831, + "learning_rate": 0.0002937216256506638, + "loss": 7.6469, + "step": 7758 + }, + { + "epoch": 0.7239899225529532, + "grad_norm": 2.060163015568972, + "learning_rate": 0.00029371946006137784, + "loss": 7.5001, + "step": 7759 + }, + { + "epoch": 0.7240832322478306, + "grad_norm": 1.0186710570330004, + "learning_rate": 0.0002937172941066559, + "loss": 7.7212, + "step": 7760 + }, + { + "epoch": 0.7241765419427079, + "grad_norm": 1.736215426938354, + "learning_rate": 0.0002937151277865034, + "loss": 7.5302, + "step": 7761 + }, + { + "epoch": 0.7242698516375852, + "grad_norm": 48.47426830733758, + "learning_rate": 0.0002937129611009259, + "loss": 7.5624, + "step": 7762 + }, + { + "epoch": 0.7243631613324625, + "grad_norm": 1.1224309915482327, + "learning_rate": 0.000293710794049929, + "loss": 7.4, + "step": 7763 + }, + { + "epoch": 0.7244564710273398, + "grad_norm": 4.984156026202343, + "learning_rate": 0.000293708626633518, + "loss": 7.429, + "step": 7764 + }, + { + "epoch": 0.724549780722217, + "grad_norm": 1.6931852244855001, + "learning_rate": 0.00029370645885169863, + "loss": 7.4708, + "step": 7765 + }, + { + "epoch": 0.7246430904170943, + "grad_norm": 1.842547973722912, + "learning_rate": 0.00029370429070447627, + "loss": 7.3891, + "step": 7766 + }, + { + "epoch": 0.7247364001119716, + "grad_norm": 0.7408212195462076, + "learning_rate": 0.0002937021221918565, + "loss": 7.4183, + "step": 7767 + }, + { + "epoch": 0.7248297098068489, + "grad_norm": 1.407009651876214, + "learning_rate": 0.0002936999533138447, + "loss": 7.6863, + "step": 7768 + }, + { + "epoch": 0.7249230195017262, + "grad_norm": 0.8487303968562181, + "learning_rate": 0.0002936977840704466, + "loss": 7.5551, + "step": 7769 + }, + { + "epoch": 0.7250163291966035, + "grad_norm": 1.2112030151658448, + "learning_rate": 0.00029369561446166754, + "loss": 7.7106, + "step": 7770 + }, + { + "epoch": 0.7251096388914808, + "grad_norm": 1.2462474042513847, + "learning_rate": 0.00029369344448751313, + "loss": 7.8562, + "step": 7771 + }, + { + "epoch": 0.7252029485863581, + "grad_norm": 1.0010113148135114, + "learning_rate": 0.0002936912741479888, + "loss": 7.5725, + "step": 7772 + }, + { + "epoch": 0.7252962582812355, + "grad_norm": 0.993326262479829, + "learning_rate": 0.0002936891034431002, + "loss": 7.4142, + "step": 7773 + }, + { + "epoch": 0.7253895679761128, + "grad_norm": 36.57414639954212, + "learning_rate": 0.0002936869323728527, + "loss": 7.3552, + "step": 7774 + }, + { + "epoch": 0.7254828776709901, + "grad_norm": 1.1026412340053662, + "learning_rate": 0.00029368476093725196, + "loss": 7.5081, + "step": 7775 + }, + { + "epoch": 0.7255761873658673, + "grad_norm": 6.865102020655879, + "learning_rate": 0.0002936825891363034, + "loss": 7.2228, + "step": 7776 + }, + { + "epoch": 0.7256694970607446, + "grad_norm": 1.4769478465265147, + "learning_rate": 0.0002936804169700126, + "loss": 7.2652, + "step": 7777 + }, + { + "epoch": 0.7257628067556219, + "grad_norm": 1.401588785110451, + "learning_rate": 0.000293678244438385, + "loss": 7.2954, + "step": 7778 + }, + { + "epoch": 0.7258561164504992, + "grad_norm": 1.342596870074644, + "learning_rate": 0.00029367607154142625, + "loss": 7.5607, + "step": 7779 + }, + { + "epoch": 0.7259494261453765, + "grad_norm": 1.109139855702738, + "learning_rate": 0.0002936738982791418, + "loss": 7.5105, + "step": 7780 + }, + { + "epoch": 0.7260427358402538, + "grad_norm": 1.2953834299448774, + "learning_rate": 0.0002936717246515372, + "loss": 7.447, + "step": 7781 + }, + { + "epoch": 0.7261360455351311, + "grad_norm": 67.08307539855247, + "learning_rate": 0.0002936695506586179, + "loss": 7.142, + "step": 7782 + }, + { + "epoch": 0.7262293552300084, + "grad_norm": 1.1702046663744763, + "learning_rate": 0.00029366737630038956, + "loss": 7.5303, + "step": 7783 + }, + { + "epoch": 0.7263226649248857, + "grad_norm": 1.188849344052473, + "learning_rate": 0.00029366520157685756, + "loss": 7.5632, + "step": 7784 + }, + { + "epoch": 0.726415974619763, + "grad_norm": 0.907903188004766, + "learning_rate": 0.0002936630264880276, + "loss": 7.2514, + "step": 7785 + }, + { + "epoch": 0.7265092843146402, + "grad_norm": 4.135446567408281, + "learning_rate": 0.0002936608510339051, + "loss": 7.5085, + "step": 7786 + }, + { + "epoch": 0.7266025940095175, + "grad_norm": 1.3602218047099381, + "learning_rate": 0.00029365867521449556, + "loss": 7.6595, + "step": 7787 + }, + { + "epoch": 0.7266959037043949, + "grad_norm": 1.15768260225276, + "learning_rate": 0.0002936564990298046, + "loss": 7.6901, + "step": 7788 + }, + { + "epoch": 0.7267892133992722, + "grad_norm": 0.8204612199885777, + "learning_rate": 0.0002936543224798377, + "loss": 7.4225, + "step": 7789 + }, + { + "epoch": 0.7268825230941495, + "grad_norm": 0.9544538159975359, + "learning_rate": 0.00029365214556460045, + "loss": 7.6751, + "step": 7790 + }, + { + "epoch": 0.7269758327890268, + "grad_norm": 0.9428090875868398, + "learning_rate": 0.0002936499682840983, + "loss": 7.4736, + "step": 7791 + }, + { + "epoch": 0.7270691424839041, + "grad_norm": 1.3178887285377803, + "learning_rate": 0.0002936477906383369, + "loss": 7.4525, + "step": 7792 + }, + { + "epoch": 0.7271624521787814, + "grad_norm": 0.9792510641835159, + "learning_rate": 0.00029364561262732163, + "loss": 7.3147, + "step": 7793 + }, + { + "epoch": 0.7272557618736587, + "grad_norm": 0.9051709683707502, + "learning_rate": 0.00029364343425105813, + "loss": 7.4054, + "step": 7794 + }, + { + "epoch": 0.727349071568536, + "grad_norm": 1.4202578391421545, + "learning_rate": 0.000293641255509552, + "loss": 7.4264, + "step": 7795 + }, + { + "epoch": 0.7274423812634133, + "grad_norm": 115.36033744736916, + "learning_rate": 0.00029363907640280867, + "loss": 7.0588, + "step": 7796 + }, + { + "epoch": 0.7275356909582905, + "grad_norm": 0.8334038952645373, + "learning_rate": 0.0002936368969308337, + "loss": 7.6281, + "step": 7797 + }, + { + "epoch": 0.7276290006531678, + "grad_norm": 0.9226210334859329, + "learning_rate": 0.00029363471709363266, + "loss": 7.2598, + "step": 7798 + }, + { + "epoch": 0.7277223103480451, + "grad_norm": 1308.7920002666885, + "learning_rate": 0.00029363253689121106, + "loss": 6.9907, + "step": 7799 + }, + { + "epoch": 0.7278156200429224, + "grad_norm": 6182.091137284333, + "learning_rate": 0.0002936303563235745, + "loss": 7.5428, + "step": 7800 + }, + { + "epoch": 0.7279089297377997, + "grad_norm": 1.578851786643593, + "learning_rate": 0.00029362817539072846, + "loss": 7.115, + "step": 7801 + }, + { + "epoch": 0.7280022394326771, + "grad_norm": 0.8299948617887799, + "learning_rate": 0.00029362599409267856, + "loss": 7.5106, + "step": 7802 + }, + { + "epoch": 0.7280955491275544, + "grad_norm": 1.1376567517598397, + "learning_rate": 0.0002936238124294303, + "loss": 7.2274, + "step": 7803 + }, + { + "epoch": 0.7281888588224317, + "grad_norm": 14907.160267839932, + "learning_rate": 0.00029362163040098915, + "loss": 7.3519, + "step": 7804 + }, + { + "epoch": 0.728282168517309, + "grad_norm": 2.328133315350598, + "learning_rate": 0.0002936194480073608, + "loss": 7.7739, + "step": 7805 + }, + { + "epoch": 0.7283754782121863, + "grad_norm": 418423.35505178984, + "learning_rate": 0.00029361726524855075, + "loss": 7.2372, + "step": 7806 + }, + { + "epoch": 0.7284687879070636, + "grad_norm": 1.4552004986671516, + "learning_rate": 0.00029361508212456453, + "loss": 7.5344, + "step": 7807 + }, + { + "epoch": 0.7285620976019408, + "grad_norm": 1.3827035098543736, + "learning_rate": 0.0002936128986354077, + "loss": 7.35, + "step": 7808 + }, + { + "epoch": 0.7286554072968181, + "grad_norm": 2.3323583001157355, + "learning_rate": 0.0002936107147810858, + "loss": 7.6628, + "step": 7809 + }, + { + "epoch": 0.7287487169916954, + "grad_norm": 247.39799452596256, + "learning_rate": 0.00029360853056160436, + "loss": 7.3981, + "step": 7810 + }, + { + "epoch": 0.7288420266865727, + "grad_norm": 1316.4427239730676, + "learning_rate": 0.000293606345976969, + "loss": 7.6142, + "step": 7811 + }, + { + "epoch": 0.72893533638145, + "grad_norm": 162.71781128552382, + "learning_rate": 0.00029360416102718523, + "loss": 7.7942, + "step": 7812 + }, + { + "epoch": 0.7290286460763273, + "grad_norm": 1.8828492880066536, + "learning_rate": 0.00029360197571225865, + "loss": 7.6185, + "step": 7813 + }, + { + "epoch": 0.7291219557712046, + "grad_norm": 1.0109165521622716, + "learning_rate": 0.00029359979003219474, + "loss": 7.3238, + "step": 7814 + }, + { + "epoch": 0.729215265466082, + "grad_norm": 2.05050780685654, + "learning_rate": 0.00029359760398699913, + "loss": 7.5633, + "step": 7815 + }, + { + "epoch": 0.7293085751609593, + "grad_norm": 2.235495558634785, + "learning_rate": 0.00029359541757667735, + "loss": 7.3678, + "step": 7816 + }, + { + "epoch": 0.7294018848558366, + "grad_norm": 3.599956978300766, + "learning_rate": 0.000293593230801235, + "loss": 7.9266, + "step": 7817 + }, + { + "epoch": 0.7294951945507138, + "grad_norm": 2.8460583463987814, + "learning_rate": 0.0002935910436606775, + "loss": 7.8359, + "step": 7818 + }, + { + "epoch": 0.7295885042455911, + "grad_norm": 1.569876745282074, + "learning_rate": 0.00029358885615501056, + "loss": 7.7311, + "step": 7819 + }, + { + "epoch": 0.7296818139404684, + "grad_norm": 1.642360982441394, + "learning_rate": 0.00029358666828423966, + "loss": 7.3416, + "step": 7820 + }, + { + "epoch": 0.7297751236353457, + "grad_norm": 1.8100193563440565, + "learning_rate": 0.0002935844800483704, + "loss": 7.3254, + "step": 7821 + }, + { + "epoch": 0.729868433330223, + "grad_norm": 1.754172193909663, + "learning_rate": 0.00029358229144740844, + "loss": 7.4829, + "step": 7822 + }, + { + "epoch": 0.7299617430251003, + "grad_norm": 1.339642515125646, + "learning_rate": 0.00029358010248135916, + "loss": 7.515, + "step": 7823 + }, + { + "epoch": 0.7300550527199776, + "grad_norm": 1.6450067513183078, + "learning_rate": 0.00029357791315022823, + "loss": 7.2268, + "step": 7824 + }, + { + "epoch": 0.7301483624148549, + "grad_norm": 0.7456050325215346, + "learning_rate": 0.00029357572345402116, + "loss": 7.578, + "step": 7825 + }, + { + "epoch": 0.7302416721097322, + "grad_norm": 1.6240423788721459, + "learning_rate": 0.0002935735333927436, + "loss": 7.5325, + "step": 7826 + }, + { + "epoch": 0.7303349818046095, + "grad_norm": 39.80476073587756, + "learning_rate": 0.00029357134296640106, + "loss": 7.7423, + "step": 7827 + }, + { + "epoch": 0.7304282914994868, + "grad_norm": 1.0744580260250525, + "learning_rate": 0.00029356915217499914, + "loss": 7.5536, + "step": 7828 + }, + { + "epoch": 0.730521601194364, + "grad_norm": 1.552739237377804, + "learning_rate": 0.0002935669610185434, + "loss": 7.5165, + "step": 7829 + }, + { + "epoch": 0.7306149108892414, + "grad_norm": 0.8121611644944978, + "learning_rate": 0.00029356476949703934, + "loss": 7.4239, + "step": 7830 + }, + { + "epoch": 0.7307082205841187, + "grad_norm": 1.033150203700757, + "learning_rate": 0.00029356257761049267, + "loss": 7.4773, + "step": 7831 + }, + { + "epoch": 0.730801530278996, + "grad_norm": 0.9661612103215922, + "learning_rate": 0.00029356038535890885, + "loss": 7.6308, + "step": 7832 + }, + { + "epoch": 0.7308948399738733, + "grad_norm": 0.9078627321994566, + "learning_rate": 0.0002935581927422935, + "loss": 7.5828, + "step": 7833 + }, + { + "epoch": 0.7309881496687506, + "grad_norm": 0.9916148540443551, + "learning_rate": 0.0002935559997606522, + "loss": 7.6952, + "step": 7834 + }, + { + "epoch": 0.7310814593636279, + "grad_norm": 1.4193315999174994, + "learning_rate": 0.00029355380641399054, + "loss": 7.2176, + "step": 7835 + }, + { + "epoch": 0.7311747690585052, + "grad_norm": 1.318663163300992, + "learning_rate": 0.00029355161270231404, + "loss": 7.2793, + "step": 7836 + }, + { + "epoch": 0.7312680787533825, + "grad_norm": 0.8776976717136848, + "learning_rate": 0.00029354941862562835, + "loss": 7.3651, + "step": 7837 + }, + { + "epoch": 0.7313613884482598, + "grad_norm": 0.8205692769226224, + "learning_rate": 0.000293547224183939, + "loss": 7.4013, + "step": 7838 + }, + { + "epoch": 0.731454698143137, + "grad_norm": 0.8725914821158016, + "learning_rate": 0.00029354502937725154, + "loss": 7.1777, + "step": 7839 + }, + { + "epoch": 0.7315480078380143, + "grad_norm": 2.0545175085919225, + "learning_rate": 0.00029354283420557163, + "loss": 7.7442, + "step": 7840 + }, + { + "epoch": 0.7316413175328916, + "grad_norm": 0.9030277439348632, + "learning_rate": 0.0002935406386689048, + "loss": 7.4919, + "step": 7841 + }, + { + "epoch": 0.7317346272277689, + "grad_norm": 0.7388189439394066, + "learning_rate": 0.00029353844276725665, + "loss": 7.0688, + "step": 7842 + }, + { + "epoch": 0.7318279369226462, + "grad_norm": 0.7580413526397566, + "learning_rate": 0.0002935362465006328, + "loss": 7.3931, + "step": 7843 + }, + { + "epoch": 0.7319212466175236, + "grad_norm": 1.0593671023656341, + "learning_rate": 0.00029353404986903867, + "loss": 7.2728, + "step": 7844 + }, + { + "epoch": 0.7320145563124009, + "grad_norm": 1.3389109177926932, + "learning_rate": 0.0002935318528724801, + "loss": 8.0423, + "step": 7845 + }, + { + "epoch": 0.7321078660072782, + "grad_norm": 0.728050271745356, + "learning_rate": 0.00029352965551096246, + "loss": 7.7253, + "step": 7846 + }, + { + "epoch": 0.7322011757021555, + "grad_norm": 0.8154565056685578, + "learning_rate": 0.00029352745778449144, + "loss": 7.5262, + "step": 7847 + }, + { + "epoch": 0.7322944853970328, + "grad_norm": 1.1206742018640798, + "learning_rate": 0.00029352525969307265, + "loss": 7.4782, + "step": 7848 + }, + { + "epoch": 0.7323877950919101, + "grad_norm": 1.0002749821162669, + "learning_rate": 0.0002935230612367116, + "loss": 7.2548, + "step": 7849 + }, + { + "epoch": 0.7324811047867873, + "grad_norm": 0.8469105272598954, + "learning_rate": 0.00029352086241541395, + "loss": 7.2046, + "step": 7850 + }, + { + "epoch": 0.7325744144816646, + "grad_norm": 1.5183027250600982, + "learning_rate": 0.00029351866322918527, + "loss": 7.8713, + "step": 7851 + }, + { + "epoch": 0.7326677241765419, + "grad_norm": 1.4976584518757936, + "learning_rate": 0.0002935164636780311, + "loss": 7.6578, + "step": 7852 + }, + { + "epoch": 0.7327610338714192, + "grad_norm": 1.0562971255347176, + "learning_rate": 0.00029351426376195714, + "loss": 7.5211, + "step": 7853 + }, + { + "epoch": 0.7328543435662965, + "grad_norm": 0.715560821835286, + "learning_rate": 0.00029351206348096886, + "loss": 7.5287, + "step": 7854 + }, + { + "epoch": 0.7329476532611738, + "grad_norm": 1.2067598590804502, + "learning_rate": 0.00029350986283507196, + "loss": 7.0835, + "step": 7855 + }, + { + "epoch": 0.7330409629560511, + "grad_norm": 0.6514404093928944, + "learning_rate": 0.0002935076618242719, + "loss": 7.3413, + "step": 7856 + }, + { + "epoch": 0.7331342726509285, + "grad_norm": 0.6559014279872011, + "learning_rate": 0.00029350546044857445, + "loss": 7.2554, + "step": 7857 + }, + { + "epoch": 0.7332275823458058, + "grad_norm": 0.9339382342529967, + "learning_rate": 0.00029350325870798513, + "loss": 7.7359, + "step": 7858 + }, + { + "epoch": 0.7333208920406831, + "grad_norm": 0.9470492754683193, + "learning_rate": 0.00029350105660250946, + "loss": 7.2442, + "step": 7859 + }, + { + "epoch": 0.7334142017355604, + "grad_norm": 1.0283451607574488, + "learning_rate": 0.00029349885413215314, + "loss": 7.7261, + "step": 7860 + }, + { + "epoch": 0.7335075114304376, + "grad_norm": 0.6324383207929648, + "learning_rate": 0.0002934966512969218, + "loss": 7.4499, + "step": 7861 + }, + { + "epoch": 0.7336008211253149, + "grad_norm": 1.019922088591668, + "learning_rate": 0.0002934944480968209, + "loss": 7.3689, + "step": 7862 + }, + { + "epoch": 0.7336941308201922, + "grad_norm": 0.7456088039231142, + "learning_rate": 0.0002934922445318562, + "loss": 7.4583, + "step": 7863 + }, + { + "epoch": 0.7337874405150695, + "grad_norm": 1.0889471736800904, + "learning_rate": 0.00029349004060203317, + "loss": 7.3755, + "step": 7864 + }, + { + "epoch": 0.7338807502099468, + "grad_norm": 0.6092898151464223, + "learning_rate": 0.0002934878363073575, + "loss": 7.434, + "step": 7865 + }, + { + "epoch": 0.7339740599048241, + "grad_norm": 0.7544771731885117, + "learning_rate": 0.00029348563164783474, + "loss": 7.3973, + "step": 7866 + }, + { + "epoch": 0.7340673695997014, + "grad_norm": 0.5578943432719913, + "learning_rate": 0.0002934834266234706, + "loss": 7.0804, + "step": 7867 + }, + { + "epoch": 0.7341606792945787, + "grad_norm": 0.8254788601678765, + "learning_rate": 0.0002934812212342705, + "loss": 7.4373, + "step": 7868 + }, + { + "epoch": 0.734253988989456, + "grad_norm": 0.5657386633121168, + "learning_rate": 0.0002934790154802402, + "loss": 7.2852, + "step": 7869 + }, + { + "epoch": 0.7343472986843333, + "grad_norm": 0.6762271890885183, + "learning_rate": 0.0002934768093613853, + "loss": 7.1042, + "step": 7870 + }, + { + "epoch": 0.7344406083792105, + "grad_norm": 0.630305664053805, + "learning_rate": 0.00029347460287771135, + "loss": 7.3755, + "step": 7871 + }, + { + "epoch": 0.7345339180740879, + "grad_norm": 1.037516186979142, + "learning_rate": 0.000293472396029224, + "loss": 7.4387, + "step": 7872 + }, + { + "epoch": 0.7346272277689652, + "grad_norm": 1.2232952035987958, + "learning_rate": 0.00029347018881592876, + "loss": 7.8313, + "step": 7873 + }, + { + "epoch": 0.7347205374638425, + "grad_norm": 1.0154396089980475, + "learning_rate": 0.00029346798123783144, + "loss": 7.1434, + "step": 7874 + }, + { + "epoch": 0.7348138471587198, + "grad_norm": 1.0955321829839302, + "learning_rate": 0.0002934657732949375, + "loss": 7.4169, + "step": 7875 + }, + { + "epoch": 0.7349071568535971, + "grad_norm": 1.3834964163534724, + "learning_rate": 0.00029346356498725256, + "loss": 7.2304, + "step": 7876 + }, + { + "epoch": 0.7350004665484744, + "grad_norm": 0.5277838498401587, + "learning_rate": 0.00029346135631478233, + "loss": 7.3598, + "step": 7877 + }, + { + "epoch": 0.7350937762433517, + "grad_norm": 0.537100360980731, + "learning_rate": 0.00029345914727753233, + "loss": 7.263, + "step": 7878 + }, + { + "epoch": 0.735187085938229, + "grad_norm": 1.138663948993013, + "learning_rate": 0.00029345693787550826, + "loss": 7.5561, + "step": 7879 + }, + { + "epoch": 0.7352803956331063, + "grad_norm": 0.8744356380096119, + "learning_rate": 0.00029345472810871564, + "loss": 7.2961, + "step": 7880 + }, + { + "epoch": 0.7353737053279836, + "grad_norm": 0.7155476102657159, + "learning_rate": 0.0002934525179771601, + "loss": 7.131, + "step": 7881 + }, + { + "epoch": 0.7354670150228608, + "grad_norm": 1.011987295873645, + "learning_rate": 0.00029345030748084743, + "loss": 7.2264, + "step": 7882 + }, + { + "epoch": 0.7355603247177381, + "grad_norm": 0.7465754430132612, + "learning_rate": 0.00029344809661978306, + "loss": 7.3382, + "step": 7883 + }, + { + "epoch": 0.7356536344126154, + "grad_norm": 1.155358792360951, + "learning_rate": 0.00029344588539397264, + "loss": 7.0528, + "step": 7884 + }, + { + "epoch": 0.7357469441074928, + "grad_norm": 0.7093372991632574, + "learning_rate": 0.00029344367380342186, + "loss": 7.7128, + "step": 7885 + }, + { + "epoch": 0.7358402538023701, + "grad_norm": 0.5179298960348854, + "learning_rate": 0.0002934414618481363, + "loss": 7.2846, + "step": 7886 + }, + { + "epoch": 0.7359335634972474, + "grad_norm": 0.9451611313320316, + "learning_rate": 0.0002934392495281216, + "loss": 7.5646, + "step": 7887 + }, + { + "epoch": 0.7360268731921247, + "grad_norm": 0.6191629386391628, + "learning_rate": 0.00029343703684338334, + "loss": 7.4434, + "step": 7888 + }, + { + "epoch": 0.736120182887002, + "grad_norm": 0.4384050106580797, + "learning_rate": 0.00029343482379392725, + "loss": 7.3099, + "step": 7889 + }, + { + "epoch": 0.7362134925818793, + "grad_norm": 0.6746137154966473, + "learning_rate": 0.0002934326103797588, + "loss": 7.1036, + "step": 7890 + }, + { + "epoch": 0.7363068022767566, + "grad_norm": 0.487241044113081, + "learning_rate": 0.0002934303966008838, + "loss": 7.3117, + "step": 7891 + }, + { + "epoch": 0.7364001119716338, + "grad_norm": 0.5296461674483163, + "learning_rate": 0.00029342818245730776, + "loss": 7.2883, + "step": 7892 + }, + { + "epoch": 0.7364934216665111, + "grad_norm": 0.6982862531782661, + "learning_rate": 0.00029342596794903634, + "loss": 7.5344, + "step": 7893 + }, + { + "epoch": 0.7365867313613884, + "grad_norm": 0.9472474716850933, + "learning_rate": 0.00029342375307607515, + "loss": 7.0755, + "step": 7894 + }, + { + "epoch": 0.7366800410562657, + "grad_norm": 1.1175468006197171, + "learning_rate": 0.0002934215378384298, + "loss": 7.2036, + "step": 7895 + }, + { + "epoch": 0.736773350751143, + "grad_norm": 0.9106161325557631, + "learning_rate": 0.0002934193222361061, + "loss": 7.6542, + "step": 7896 + }, + { + "epoch": 0.7368666604460203, + "grad_norm": 1.035359317486576, + "learning_rate": 0.0002934171062691095, + "loss": 7.4839, + "step": 7897 + }, + { + "epoch": 0.7369599701408976, + "grad_norm": 0.6600865208437044, + "learning_rate": 0.0002934148899374456, + "loss": 7.5681, + "step": 7898 + }, + { + "epoch": 0.737053279835775, + "grad_norm": 0.7367186746795193, + "learning_rate": 0.0002934126732411202, + "loss": 7.479, + "step": 7899 + }, + { + "epoch": 0.7371465895306523, + "grad_norm": 2.4254439791525804, + "learning_rate": 0.0002934104561801388, + "loss": 7.4766, + "step": 7900 + }, + { + "epoch": 0.7372398992255296, + "grad_norm": 0.898426645425766, + "learning_rate": 0.0002934082387545071, + "loss": 7.309, + "step": 7901 + }, + { + "epoch": 0.7373332089204069, + "grad_norm": 1.8313707489802744, + "learning_rate": 0.00029340602096423075, + "loss": 7.2348, + "step": 7902 + }, + { + "epoch": 0.7374265186152841, + "grad_norm": 1.7084089052467024, + "learning_rate": 0.00029340380280931536, + "loss": 7.3227, + "step": 7903 + }, + { + "epoch": 0.7375198283101614, + "grad_norm": 1.6521222144702254, + "learning_rate": 0.0002934015842897665, + "loss": 7.2133, + "step": 7904 + }, + { + "epoch": 0.7376131380050387, + "grad_norm": 0.8671972679592067, + "learning_rate": 0.00029339936540559, + "loss": 6.9967, + "step": 7905 + }, + { + "epoch": 0.737706447699916, + "grad_norm": 1.0361937402167538, + "learning_rate": 0.00029339714615679135, + "loss": 7.2938, + "step": 7906 + }, + { + "epoch": 0.7377997573947933, + "grad_norm": 0.9739620675608677, + "learning_rate": 0.0002933949265433763, + "loss": 7.5393, + "step": 7907 + }, + { + "epoch": 0.7378930670896706, + "grad_norm": 0.5992261278150399, + "learning_rate": 0.00029339270656535035, + "loss": 7.6654, + "step": 7908 + }, + { + "epoch": 0.7379863767845479, + "grad_norm": 4.854347580861979, + "learning_rate": 0.00029339048622271924, + "loss": 7.2633, + "step": 7909 + }, + { + "epoch": 0.7380796864794252, + "grad_norm": 1.2773178195798345, + "learning_rate": 0.0002933882655154886, + "loss": 7.4714, + "step": 7910 + }, + { + "epoch": 0.7381729961743025, + "grad_norm": 0.8297351912166462, + "learning_rate": 0.0002933860444436641, + "loss": 7.3236, + "step": 7911 + }, + { + "epoch": 0.7382663058691799, + "grad_norm": 6.8585928300267245, + "learning_rate": 0.00029338382300725136, + "loss": 7.0773, + "step": 7912 + }, + { + "epoch": 0.7383596155640572, + "grad_norm": 6.7386992140494355, + "learning_rate": 0.00029338160120625595, + "loss": 7.1176, + "step": 7913 + }, + { + "epoch": 0.7384529252589344, + "grad_norm": 1.2153400717812355, + "learning_rate": 0.0002933793790406837, + "loss": 7.0651, + "step": 7914 + }, + { + "epoch": 0.7385462349538117, + "grad_norm": 1.3772688107615418, + "learning_rate": 0.0002933771565105401, + "loss": 7.2623, + "step": 7915 + }, + { + "epoch": 0.738639544648689, + "grad_norm": 77.47430351654867, + "learning_rate": 0.0002933749336158309, + "loss": 7.6101, + "step": 7916 + }, + { + "epoch": 0.7387328543435663, + "grad_norm": 1.3221390471597125, + "learning_rate": 0.0002933727103565617, + "loss": 7.4755, + "step": 7917 + }, + { + "epoch": 0.7388261640384436, + "grad_norm": 102.4486058253301, + "learning_rate": 0.00029337048673273814, + "loss": 7.4448, + "step": 7918 + }, + { + "epoch": 0.7389194737333209, + "grad_norm": 0.8228755657895653, + "learning_rate": 0.00029336826274436594, + "loss": 7.36, + "step": 7919 + }, + { + "epoch": 0.7390127834281982, + "grad_norm": 1.2386645146674187, + "learning_rate": 0.00029336603839145075, + "loss": 7.2037, + "step": 7920 + }, + { + "epoch": 0.7391060931230755, + "grad_norm": 1.0040899217699044, + "learning_rate": 0.0002933638136739981, + "loss": 7.222, + "step": 7921 + }, + { + "epoch": 0.7391994028179528, + "grad_norm": 1.1645932205767275, + "learning_rate": 0.00029336158859201385, + "loss": 7.3282, + "step": 7922 + }, + { + "epoch": 0.7392927125128301, + "grad_norm": 724.9526982250558, + "learning_rate": 0.00029335936314550345, + "loss": 7.4931, + "step": 7923 + }, + { + "epoch": 0.7393860222077073, + "grad_norm": 1.2306532829198447, + "learning_rate": 0.00029335713733447274, + "loss": 7.166, + "step": 7924 + }, + { + "epoch": 0.7394793319025846, + "grad_norm": 1.0804726247365868, + "learning_rate": 0.00029335491115892724, + "loss": 7.2567, + "step": 7925 + }, + { + "epoch": 0.7395726415974619, + "grad_norm": 543.4855906651502, + "learning_rate": 0.00029335268461887264, + "loss": 7.2256, + "step": 7926 + }, + { + "epoch": 0.7396659512923393, + "grad_norm": 30.803015202137665, + "learning_rate": 0.0002933504577143147, + "loss": 7.6679, + "step": 7927 + }, + { + "epoch": 0.7397592609872166, + "grad_norm": 0.5761471158465383, + "learning_rate": 0.000293348230445259, + "loss": 7.3602, + "step": 7928 + }, + { + "epoch": 0.7398525706820939, + "grad_norm": 1.422033028641083, + "learning_rate": 0.00029334600281171114, + "loss": 7.1941, + "step": 7929 + }, + { + "epoch": 0.7399458803769712, + "grad_norm": 0.9080270399538033, + "learning_rate": 0.00029334377481367687, + "loss": 7.3819, + "step": 7930 + }, + { + "epoch": 0.7400391900718485, + "grad_norm": 1.2593870288557443, + "learning_rate": 0.0002933415464511619, + "loss": 7.325, + "step": 7931 + }, + { + "epoch": 0.7401324997667258, + "grad_norm": 1.303971522512578, + "learning_rate": 0.00029333931772417183, + "loss": 7.3812, + "step": 7932 + }, + { + "epoch": 0.7402258094616031, + "grad_norm": 2.661026802562429, + "learning_rate": 0.00029333708863271235, + "loss": 7.8645, + "step": 7933 + }, + { + "epoch": 0.7403191191564804, + "grad_norm": 1.7990898968147544, + "learning_rate": 0.00029333485917678903, + "loss": 7.2142, + "step": 7934 + }, + { + "epoch": 0.7404124288513576, + "grad_norm": 2.380811090726226, + "learning_rate": 0.0002933326293564077, + "loss": 7.6142, + "step": 7935 + }, + { + "epoch": 0.7405057385462349, + "grad_norm": 16.628646052893643, + "learning_rate": 0.0002933303991715739, + "loss": 7.5728, + "step": 7936 + }, + { + "epoch": 0.7405990482411122, + "grad_norm": 3.05463089430178, + "learning_rate": 0.00029332816862229336, + "loss": 7.4161, + "step": 7937 + }, + { + "epoch": 0.7406923579359895, + "grad_norm": 0.6916516698904709, + "learning_rate": 0.0002933259377085718, + "loss": 7.504, + "step": 7938 + }, + { + "epoch": 0.7407856676308668, + "grad_norm": 0.9741212703543722, + "learning_rate": 0.00029332370643041477, + "loss": 7.4697, + "step": 7939 + }, + { + "epoch": 0.7408789773257441, + "grad_norm": 2.333229366476805, + "learning_rate": 0.000293321474787828, + "loss": 7.5987, + "step": 7940 + }, + { + "epoch": 0.7409722870206215, + "grad_norm": 0.5674930152024661, + "learning_rate": 0.00029331924278081726, + "loss": 7.4885, + "step": 7941 + }, + { + "epoch": 0.7410655967154988, + "grad_norm": 1.0403176153221694, + "learning_rate": 0.0002933170104093881, + "loss": 7.4662, + "step": 7942 + }, + { + "epoch": 0.7411589064103761, + "grad_norm": 1.5487342628189822, + "learning_rate": 0.00029331477767354625, + "loss": 7.6837, + "step": 7943 + }, + { + "epoch": 0.7412522161052534, + "grad_norm": 1.007733827761656, + "learning_rate": 0.0002933125445732973, + "loss": 7.0692, + "step": 7944 + }, + { + "epoch": 0.7413455258001306, + "grad_norm": 1.316449083460836, + "learning_rate": 0.0002933103111086471, + "loss": 7.802, + "step": 7945 + }, + { + "epoch": 0.7414388354950079, + "grad_norm": 0.5929055760837783, + "learning_rate": 0.00029330807727960117, + "loss": 7.3737, + "step": 7946 + }, + { + "epoch": 0.7415321451898852, + "grad_norm": 2.1651102278600374, + "learning_rate": 0.0002933058430861653, + "loss": 7.5696, + "step": 7947 + }, + { + "epoch": 0.7416254548847625, + "grad_norm": 0.7440668601349602, + "learning_rate": 0.00029330360852834503, + "loss": 7.8311, + "step": 7948 + }, + { + "epoch": 0.7417187645796398, + "grad_norm": 2.15969952075935, + "learning_rate": 0.0002933013736061462, + "loss": 7.8844, + "step": 7949 + }, + { + "epoch": 0.7418120742745171, + "grad_norm": 0.696326823267602, + "learning_rate": 0.00029329913831957445, + "loss": 7.3704, + "step": 7950 + }, + { + "epoch": 0.7419053839693944, + "grad_norm": 1.212548583527233, + "learning_rate": 0.00029329690266863533, + "loss": 7.1508, + "step": 7951 + }, + { + "epoch": 0.7419986936642717, + "grad_norm": 0.801183776489053, + "learning_rate": 0.00029329466665333474, + "loss": 7.3114, + "step": 7952 + }, + { + "epoch": 0.742092003359149, + "grad_norm": 1.0247584109906573, + "learning_rate": 0.0002932924302736782, + "loss": 7.0452, + "step": 7953 + }, + { + "epoch": 0.7421853130540264, + "grad_norm": 0.9102676313988611, + "learning_rate": 0.00029329019352967154, + "loss": 7.1881, + "step": 7954 + }, + { + "epoch": 0.7422786227489037, + "grad_norm": 0.8251639346392526, + "learning_rate": 0.00029328795642132033, + "loss": 7.3025, + "step": 7955 + }, + { + "epoch": 0.7423719324437809, + "grad_norm": 1.3816800208300815, + "learning_rate": 0.00029328571894863024, + "loss": 7.4311, + "step": 7956 + }, + { + "epoch": 0.7424652421386582, + "grad_norm": 0.9389158839343814, + "learning_rate": 0.000293283481111607, + "loss": 7.2694, + "step": 7957 + }, + { + "epoch": 0.7425585518335355, + "grad_norm": 0.6578918739901226, + "learning_rate": 0.00029328124291025636, + "loss": 7.5308, + "step": 7958 + }, + { + "epoch": 0.7426518615284128, + "grad_norm": 4.919319513716818, + "learning_rate": 0.00029327900434458396, + "loss": 7.544, + "step": 7959 + }, + { + "epoch": 0.7427451712232901, + "grad_norm": 0.5677938527228845, + "learning_rate": 0.0002932767654145955, + "loss": 7.3641, + "step": 7960 + }, + { + "epoch": 0.7428384809181674, + "grad_norm": 0.5926332995729247, + "learning_rate": 0.0002932745261202967, + "loss": 7.3988, + "step": 7961 + }, + { + "epoch": 0.7429317906130447, + "grad_norm": 0.6284624917583064, + "learning_rate": 0.00029327228646169315, + "loss": 7.177, + "step": 7962 + }, + { + "epoch": 0.743025100307922, + "grad_norm": 1.2083785948820143, + "learning_rate": 0.00029327004643879063, + "loss": 7.592, + "step": 7963 + }, + { + "epoch": 0.7431184100027993, + "grad_norm": 0.3806470061185782, + "learning_rate": 0.0002932678060515948, + "loss": 7.353, + "step": 7964 + }, + { + "epoch": 0.7432117196976766, + "grad_norm": 0.5502004089585302, + "learning_rate": 0.00029326556530011143, + "loss": 7.2372, + "step": 7965 + }, + { + "epoch": 0.7433050293925539, + "grad_norm": 0.778443579517566, + "learning_rate": 0.0002932633241843462, + "loss": 7.0755, + "step": 7966 + }, + { + "epoch": 0.7433983390874311, + "grad_norm": 2.10924440850915, + "learning_rate": 0.0002932610827043047, + "loss": 7.6516, + "step": 7967 + }, + { + "epoch": 0.7434916487823084, + "grad_norm": 1.4461310004411236, + "learning_rate": 0.00029325884085999276, + "loss": 7.1488, + "step": 7968 + }, + { + "epoch": 0.7435849584771858, + "grad_norm": 0.6920384210403525, + "learning_rate": 0.0002932565986514159, + "loss": 7.2333, + "step": 7969 + }, + { + "epoch": 0.7436782681720631, + "grad_norm": 0.5295147901226591, + "learning_rate": 0.0002932543560785801, + "loss": 7.2589, + "step": 7970 + }, + { + "epoch": 0.7437715778669404, + "grad_norm": 1.6947055182860513, + "learning_rate": 0.00029325211314149084, + "loss": 7.2163, + "step": 7971 + }, + { + "epoch": 0.7438648875618177, + "grad_norm": 0.9945189184097719, + "learning_rate": 0.0002932498698401539, + "loss": 7.2877, + "step": 7972 + }, + { + "epoch": 0.743958197256695, + "grad_norm": 0.6041869591179996, + "learning_rate": 0.000293247626174575, + "loss": 7.0356, + "step": 7973 + }, + { + "epoch": 0.7440515069515723, + "grad_norm": 0.8904634395051566, + "learning_rate": 0.0002932453821447597, + "loss": 7.7298, + "step": 7974 + }, + { + "epoch": 0.7441448166464496, + "grad_norm": 1.024045499635571, + "learning_rate": 0.000293243137750714, + "loss": 7.3178, + "step": 7975 + }, + { + "epoch": 0.7442381263413269, + "grad_norm": 10.452691733384718, + "learning_rate": 0.0002932408929924433, + "loss": 7.7542, + "step": 7976 + }, + { + "epoch": 0.7443314360362041, + "grad_norm": 8.841541125305774, + "learning_rate": 0.0002932386478699535, + "loss": 7.4674, + "step": 7977 + }, + { + "epoch": 0.7444247457310814, + "grad_norm": 1.6345554729623548, + "learning_rate": 0.0002932364023832502, + "loss": 7.5171, + "step": 7978 + }, + { + "epoch": 0.7445180554259587, + "grad_norm": 1.1169879180581455, + "learning_rate": 0.00029323415653233925, + "loss": 7.2, + "step": 7979 + }, + { + "epoch": 0.744611365120836, + "grad_norm": 1.8678971811253644, + "learning_rate": 0.0002932319103172262, + "loss": 7.7098, + "step": 7980 + }, + { + "epoch": 0.7447046748157133, + "grad_norm": 1.1395768610233676, + "learning_rate": 0.00029322966373791684, + "loss": 7.4303, + "step": 7981 + }, + { + "epoch": 0.7447979845105906, + "grad_norm": 2.727461804438223, + "learning_rate": 0.00029322741679441687, + "loss": 7.6488, + "step": 7982 + }, + { + "epoch": 0.744891294205468, + "grad_norm": 0.9583595537456179, + "learning_rate": 0.000293225169486732, + "loss": 7.3653, + "step": 7983 + }, + { + "epoch": 0.7449846039003453, + "grad_norm": 1.2742428072516783, + "learning_rate": 0.000293222921814868, + "loss": 7.2764, + "step": 7984 + }, + { + "epoch": 0.7450779135952226, + "grad_norm": 0.5947769985229491, + "learning_rate": 0.00029322067377883043, + "loss": 7.5049, + "step": 7985 + }, + { + "epoch": 0.7451712232900999, + "grad_norm": 0.6453081834376381, + "learning_rate": 0.0002932184253786252, + "loss": 7.5972, + "step": 7986 + }, + { + "epoch": 0.7452645329849772, + "grad_norm": 1.059117819659827, + "learning_rate": 0.00029321617661425795, + "loss": 7.337, + "step": 7987 + }, + { + "epoch": 0.7453578426798544, + "grad_norm": 0.5988311781122599, + "learning_rate": 0.00029321392748573433, + "loss": 7.2408, + "step": 7988 + }, + { + "epoch": 0.7454511523747317, + "grad_norm": 1.631034730547737, + "learning_rate": 0.00029321167799306017, + "loss": 6.8418, + "step": 7989 + }, + { + "epoch": 0.745544462069609, + "grad_norm": 2.044886476088687, + "learning_rate": 0.0002932094281362411, + "loss": 7.5064, + "step": 7990 + }, + { + "epoch": 0.7456377717644863, + "grad_norm": 1.3838848730240174, + "learning_rate": 0.0002932071779152829, + "loss": 7.4376, + "step": 7991 + }, + { + "epoch": 0.7457310814593636, + "grad_norm": 0.8572786516078031, + "learning_rate": 0.00029320492733019124, + "loss": 7.5588, + "step": 7992 + }, + { + "epoch": 0.7458243911542409, + "grad_norm": 0.8182062428416782, + "learning_rate": 0.00029320267638097185, + "loss": 7.2419, + "step": 7993 + }, + { + "epoch": 0.7459177008491182, + "grad_norm": 7.345312583925489, + "learning_rate": 0.00029320042506763053, + "loss": 7.4343, + "step": 7994 + }, + { + "epoch": 0.7460110105439955, + "grad_norm": 1.314663779900062, + "learning_rate": 0.00029319817339017293, + "loss": 7.4718, + "step": 7995 + }, + { + "epoch": 0.7461043202388729, + "grad_norm": 3.388918614581523, + "learning_rate": 0.0002931959213486048, + "loss": 7.3676, + "step": 7996 + }, + { + "epoch": 0.7461976299337502, + "grad_norm": 0.7085732498665374, + "learning_rate": 0.00029319366894293183, + "loss": 7.4799, + "step": 7997 + }, + { + "epoch": 0.7462909396286274, + "grad_norm": 1.1231189110049462, + "learning_rate": 0.0002931914161731598, + "loss": 7.161, + "step": 7998 + }, + { + "epoch": 0.7463842493235047, + "grad_norm": 0.5607122522367688, + "learning_rate": 0.0002931891630392944, + "loss": 7.3525, + "step": 7999 + }, + { + "epoch": 0.746477559018382, + "grad_norm": 0.9136853738934135, + "learning_rate": 0.00029318690954134135, + "loss": 7.2336, + "step": 8000 + }, + { + "epoch": 0.7465708687132593, + "grad_norm": 7.789798900982798, + "learning_rate": 0.00029318465567930647, + "loss": 7.4784, + "step": 8001 + }, + { + "epoch": 0.7466641784081366, + "grad_norm": 2.293416076891602, + "learning_rate": 0.0002931824014531954, + "loss": 7.7617, + "step": 8002 + }, + { + "epoch": 0.7467574881030139, + "grad_norm": 0.4192319767187546, + "learning_rate": 0.00029318014686301386, + "loss": 7.3548, + "step": 8003 + }, + { + "epoch": 0.7468507977978912, + "grad_norm": 4.150297224082741, + "learning_rate": 0.0002931778919087677, + "loss": 7.4863, + "step": 8004 + }, + { + "epoch": 0.7469441074927685, + "grad_norm": 1.8909912144105345, + "learning_rate": 0.0002931756365904625, + "loss": 7.0292, + "step": 8005 + }, + { + "epoch": 0.7470374171876458, + "grad_norm": 1.4512617471487719, + "learning_rate": 0.0002931733809081041, + "loss": 7.345, + "step": 8006 + }, + { + "epoch": 0.7471307268825231, + "grad_norm": 0.7032930898469381, + "learning_rate": 0.0002931711248616982, + "loss": 7.1329, + "step": 8007 + }, + { + "epoch": 0.7472240365774004, + "grad_norm": 0.781660451030658, + "learning_rate": 0.0002931688684512505, + "loss": 7.1915, + "step": 8008 + }, + { + "epoch": 0.7473173462722776, + "grad_norm": 0.38244253397434946, + "learning_rate": 0.0002931666116767668, + "loss": 6.9563, + "step": 8009 + }, + { + "epoch": 0.747410655967155, + "grad_norm": 2.4304112582580557, + "learning_rate": 0.00029316435453825287, + "loss": 7.1131, + "step": 8010 + }, + { + "epoch": 0.7475039656620323, + "grad_norm": 1.6246898313210816, + "learning_rate": 0.0002931620970357143, + "loss": 7.268, + "step": 8011 + }, + { + "epoch": 0.7475972753569096, + "grad_norm": 143.56971824172945, + "learning_rate": 0.000293159839169157, + "loss": 7.2656, + "step": 8012 + }, + { + "epoch": 0.7476905850517869, + "grad_norm": 290.809610296334, + "learning_rate": 0.0002931575809385866, + "loss": 7.1854, + "step": 8013 + }, + { + "epoch": 0.7477838947466642, + "grad_norm": 1.0077563679822914, + "learning_rate": 0.00029315532234400893, + "loss": 7.451, + "step": 8014 + }, + { + "epoch": 0.7478772044415415, + "grad_norm": 1.3218110820419418, + "learning_rate": 0.0002931530633854296, + "loss": 7.3503, + "step": 8015 + }, + { + "epoch": 0.7479705141364188, + "grad_norm": 42650.77659782516, + "learning_rate": 0.0002931508040628545, + "loss": 7.3643, + "step": 8016 + }, + { + "epoch": 0.7480638238312961, + "grad_norm": 2.4660557345170626, + "learning_rate": 0.00029314854437628924, + "loss": 7.8393, + "step": 8017 + }, + { + "epoch": 0.7481571335261734, + "grad_norm": 1.4621869643152645, + "learning_rate": 0.0002931462843257397, + "loss": 7.9127, + "step": 8018 + }, + { + "epoch": 0.7482504432210507, + "grad_norm": 6.962644738527613, + "learning_rate": 0.0002931440239112115, + "loss": 8.063, + "step": 8019 + }, + { + "epoch": 0.7483437529159279, + "grad_norm": 8.962003942577581, + "learning_rate": 0.0002931417631327105, + "loss": 7.8081, + "step": 8020 + }, + { + "epoch": 0.7484370626108052, + "grad_norm": 3.4787418374306176, + "learning_rate": 0.00029313950199024244, + "loss": 8.2508, + "step": 8021 + }, + { + "epoch": 0.7485303723056825, + "grad_norm": 2.6294981500969543, + "learning_rate": 0.000293137240483813, + "loss": 7.8634, + "step": 8022 + }, + { + "epoch": 0.7486236820005598, + "grad_norm": 11.243214306094005, + "learning_rate": 0.00029313497861342794, + "loss": 7.5639, + "step": 8023 + }, + { + "epoch": 0.7487169916954372, + "grad_norm": 2.6799117676300686, + "learning_rate": 0.00029313271637909297, + "loss": 7.8668, + "step": 8024 + }, + { + "epoch": 0.7488103013903145, + "grad_norm": 4.118833867864081, + "learning_rate": 0.000293130453780814, + "loss": 7.9512, + "step": 8025 + }, + { + "epoch": 0.7489036110851918, + "grad_norm": 12.343587264996279, + "learning_rate": 0.00029312819081859663, + "loss": 7.7519, + "step": 8026 + }, + { + "epoch": 0.7489969207800691, + "grad_norm": 3.0013424771862294, + "learning_rate": 0.00029312592749244665, + "loss": 7.4744, + "step": 8027 + }, + { + "epoch": 0.7490902304749464, + "grad_norm": 1.6013777664578293, + "learning_rate": 0.0002931236638023699, + "loss": 7.6695, + "step": 8028 + }, + { + "epoch": 0.7491835401698237, + "grad_norm": 1.8586812522717602, + "learning_rate": 0.000293121399748372, + "loss": 7.3451, + "step": 8029 + }, + { + "epoch": 0.7492768498647009, + "grad_norm": 2.6413544330008074, + "learning_rate": 0.0002931191353304588, + "loss": 7.918, + "step": 8030 + }, + { + "epoch": 0.7493701595595782, + "grad_norm": 1071401429.2498881, + "learning_rate": 0.00029311687054863606, + "loss": 7.6996, + "step": 8031 + }, + { + "epoch": 0.7494634692544555, + "grad_norm": 9.31533758221415, + "learning_rate": 0.00029311460540290947, + "loss": 7.7724, + "step": 8032 + }, + { + "epoch": 0.7495567789493328, + "grad_norm": 9.644409736763096, + "learning_rate": 0.00029311233989328484, + "loss": 7.9243, + "step": 8033 + }, + { + "epoch": 0.7496500886442101, + "grad_norm": 5.622586526475371, + "learning_rate": 0.000293110074019768, + "loss": 8.2066, + "step": 8034 + }, + { + "epoch": 0.7497433983390874, + "grad_norm": 7.286243670974119, + "learning_rate": 0.00029310780778236457, + "loss": 8.6343, + "step": 8035 + }, + { + "epoch": 0.7498367080339647, + "grad_norm": 22338756768556.777, + "learning_rate": 0.00029310554118108036, + "loss": 8.8692, + "step": 8036 + }, + { + "epoch": 0.749930017728842, + "grad_norm": 8.181113040877584, + "learning_rate": 0.00029310327421592114, + "loss": 8.1537, + "step": 8037 + }, + { + "epoch": 0.7500233274237194, + "grad_norm": 6.0580058204879, + "learning_rate": 0.00029310100688689273, + "loss": 8.0694, + "step": 8038 + }, + { + "epoch": 0.7501166371185967, + "grad_norm": 12.389555785511554, + "learning_rate": 0.0002930987391940008, + "loss": 7.8206, + "step": 8039 + }, + { + "epoch": 0.750209946813474, + "grad_norm": 9.460067556236853, + "learning_rate": 0.00029309647113725124, + "loss": 7.5221, + "step": 8040 + }, + { + "epoch": 0.7503032565083512, + "grad_norm": 4.423787652146722, + "learning_rate": 0.0002930942027166497, + "loss": 7.7892, + "step": 8041 + }, + { + "epoch": 0.7503965662032285, + "grad_norm": 27.575719744613586, + "learning_rate": 0.00029309193393220196, + "loss": 7.7478, + "step": 8042 + }, + { + "epoch": 0.7504898758981058, + "grad_norm": 6.818919484671299, + "learning_rate": 0.00029308966478391386, + "loss": 7.6897, + "step": 8043 + }, + { + "epoch": 0.7505831855929831, + "grad_norm": 3.3807493679437863, + "learning_rate": 0.00029308739527179114, + "loss": 7.635, + "step": 8044 + }, + { + "epoch": 0.7506764952878604, + "grad_norm": 4.3621711072311165, + "learning_rate": 0.0002930851253958395, + "loss": 7.283, + "step": 8045 + }, + { + "epoch": 0.7507698049827377, + "grad_norm": 2.307583979199303, + "learning_rate": 0.00029308285515606484, + "loss": 7.2944, + "step": 8046 + }, + { + "epoch": 0.750863114677615, + "grad_norm": 11.626998325727447, + "learning_rate": 0.0002930805845524728, + "loss": 7.6123, + "step": 8047 + }, + { + "epoch": 0.7509564243724923, + "grad_norm": 2.430779388266343, + "learning_rate": 0.00029307831358506926, + "loss": 7.421, + "step": 8048 + }, + { + "epoch": 0.7510497340673696, + "grad_norm": 1.53520374519722, + "learning_rate": 0.00029307604225385994, + "loss": 7.1, + "step": 8049 + }, + { + "epoch": 0.7511430437622469, + "grad_norm": 3.076032102891254, + "learning_rate": 0.00029307377055885064, + "loss": 7.3826, + "step": 8050 + }, + { + "epoch": 0.7512363534571243, + "grad_norm": 956500.0384300639, + "learning_rate": 0.0002930714985000471, + "loss": 7.4126, + "step": 8051 + }, + { + "epoch": 0.7513296631520014, + "grad_norm": 2.1882743689716877, + "learning_rate": 0.0002930692260774552, + "loss": 7.6097, + "step": 8052 + }, + { + "epoch": 0.7514229728468788, + "grad_norm": 2.855112303288983, + "learning_rate": 0.00029306695329108054, + "loss": 7.3698, + "step": 8053 + }, + { + "epoch": 0.7515162825417561, + "grad_norm": 150.7907006603548, + "learning_rate": 0.00029306468014092904, + "loss": 7.4676, + "step": 8054 + }, + { + "epoch": 0.7516095922366334, + "grad_norm": 3.897587323281161, + "learning_rate": 0.00029306240662700634, + "loss": 7.3268, + "step": 8055 + }, + { + "epoch": 0.7517029019315107, + "grad_norm": 2.259487219445734, + "learning_rate": 0.0002930601327493184, + "loss": 7.3032, + "step": 8056 + }, + { + "epoch": 0.751796211626388, + "grad_norm": 0.7274260164189498, + "learning_rate": 0.00029305785850787096, + "loss": 7.3549, + "step": 8057 + }, + { + "epoch": 0.7518895213212653, + "grad_norm": 1.2134494927717088, + "learning_rate": 0.0002930555839026697, + "loss": 7.1744, + "step": 8058 + }, + { + "epoch": 0.7519828310161426, + "grad_norm": 1.4250959810214572, + "learning_rate": 0.0002930533089337205, + "loss": 7.2663, + "step": 8059 + }, + { + "epoch": 0.7520761407110199, + "grad_norm": 521649.585805953, + "learning_rate": 0.0002930510336010291, + "loss": 7.3753, + "step": 8060 + }, + { + "epoch": 0.7521694504058972, + "grad_norm": 11256589.232876968, + "learning_rate": 0.0002930487579046013, + "loss": 7.3966, + "step": 8061 + }, + { + "epoch": 0.7522627601007744, + "grad_norm": 18847429.47821582, + "learning_rate": 0.0002930464818444429, + "loss": 7.6265, + "step": 8062 + }, + { + "epoch": 0.7523560697956517, + "grad_norm": 1.5892802510193464, + "learning_rate": 0.00029304420542055964, + "loss": 7.2601, + "step": 8063 + }, + { + "epoch": 0.752449379490529, + "grad_norm": 2.645486026943508, + "learning_rate": 0.00029304192863295733, + "loss": 7.4457, + "step": 8064 + }, + { + "epoch": 0.7525426891854063, + "grad_norm": 1.3273504927261928, + "learning_rate": 0.0002930396514816418, + "loss": 7.2206, + "step": 8065 + }, + { + "epoch": 0.7526359988802837, + "grad_norm": 1.732887404950229, + "learning_rate": 0.0002930373739666188, + "loss": 7.3254, + "step": 8066 + }, + { + "epoch": 0.752729308575161, + "grad_norm": 117899.6633245731, + "learning_rate": 0.0002930350960878941, + "loss": 7.2748, + "step": 8067 + }, + { + "epoch": 0.7528226182700383, + "grad_norm": 1.8476575836348383, + "learning_rate": 0.00029303281784547356, + "loss": 7.3035, + "step": 8068 + }, + { + "epoch": 0.7529159279649156, + "grad_norm": 2.2915032511127102, + "learning_rate": 0.0002930305392393629, + "loss": 7.2832, + "step": 8069 + }, + { + "epoch": 0.7530092376597929, + "grad_norm": 3.814471119500933, + "learning_rate": 0.00029302826026956797, + "loss": 7.6116, + "step": 8070 + }, + { + "epoch": 0.7531025473546702, + "grad_norm": 37672.38428670362, + "learning_rate": 0.00029302598093609454, + "loss": 7.2135, + "step": 8071 + }, + { + "epoch": 0.7531958570495475, + "grad_norm": 1.6463222056329951, + "learning_rate": 0.0002930237012389484, + "loss": 7.4969, + "step": 8072 + }, + { + "epoch": 0.7532891667444247, + "grad_norm": 64.56761150578751, + "learning_rate": 0.00029302142117813536, + "loss": 7.1576, + "step": 8073 + }, + { + "epoch": 0.753382476439302, + "grad_norm": 587154.351279332, + "learning_rate": 0.0002930191407536612, + "loss": 7.2098, + "step": 8074 + }, + { + "epoch": 0.7534757861341793, + "grad_norm": 0.9618499383572341, + "learning_rate": 0.0002930168599655318, + "loss": 7.3939, + "step": 8075 + }, + { + "epoch": 0.7535690958290566, + "grad_norm": 62.8310775782939, + "learning_rate": 0.0002930145788137528, + "loss": 7.5122, + "step": 8076 + }, + { + "epoch": 0.7536624055239339, + "grad_norm": 0.8855095304095754, + "learning_rate": 0.0002930122972983301, + "loss": 7.4338, + "step": 8077 + }, + { + "epoch": 0.7537557152188112, + "grad_norm": 548563.278257223, + "learning_rate": 0.00029301001541926955, + "loss": 7.3183, + "step": 8078 + }, + { + "epoch": 0.7538490249136885, + "grad_norm": 0.7945330739112668, + "learning_rate": 0.0002930077331765768, + "loss": 7.0986, + "step": 8079 + }, + { + "epoch": 0.7539423346085659, + "grad_norm": 30205635.548584256, + "learning_rate": 0.00029300545057025785, + "loss": 7.3152, + "step": 8080 + }, + { + "epoch": 0.7540356443034432, + "grad_norm": 28.23200957883115, + "learning_rate": 0.00029300316760031835, + "loss": 7.0652, + "step": 8081 + }, + { + "epoch": 0.7541289539983205, + "grad_norm": 2.9040069721173833, + "learning_rate": 0.00029300088426676416, + "loss": 7.4592, + "step": 8082 + }, + { + "epoch": 0.7542222636931977, + "grad_norm": 746467278.4056494, + "learning_rate": 0.0002929986005696011, + "loss": 7.571, + "step": 8083 + }, + { + "epoch": 0.754315573388075, + "grad_norm": 1.2223860251227447, + "learning_rate": 0.00029299631650883494, + "loss": 7.3519, + "step": 8084 + }, + { + "epoch": 0.7544088830829523, + "grad_norm": 78548.4186474139, + "learning_rate": 0.0002929940320844715, + "loss": 7.4555, + "step": 8085 + }, + { + "epoch": 0.7545021927778296, + "grad_norm": 1.4571502475693827, + "learning_rate": 0.0002929917472965166, + "loss": 7.1082, + "step": 8086 + }, + { + "epoch": 0.7545955024727069, + "grad_norm": 1.1210858198193125, + "learning_rate": 0.00029298946214497605, + "loss": 7.2087, + "step": 8087 + }, + { + "epoch": 0.7546888121675842, + "grad_norm": 0.6537935033422344, + "learning_rate": 0.0002929871766298556, + "loss": 7.2249, + "step": 8088 + }, + { + "epoch": 0.7547821218624615, + "grad_norm": 0.6930994580943617, + "learning_rate": 0.0002929848907511612, + "loss": 7.3116, + "step": 8089 + }, + { + "epoch": 0.7548754315573388, + "grad_norm": 0.6359380548377168, + "learning_rate": 0.0002929826045088985, + "loss": 7.07, + "step": 8090 + }, + { + "epoch": 0.7549687412522161, + "grad_norm": 3.7071009414033314, + "learning_rate": 0.00029298031790307347, + "loss": 7.2596, + "step": 8091 + }, + { + "epoch": 0.7550620509470934, + "grad_norm": 1.8999883403965752, + "learning_rate": 0.0002929780309336918, + "loss": 7.6247, + "step": 8092 + }, + { + "epoch": 0.7551553606419708, + "grad_norm": 1.0410953362057591, + "learning_rate": 0.00029297574360075937, + "loss": 7.3207, + "step": 8093 + }, + { + "epoch": 0.755248670336848, + "grad_norm": 10.15133483726507, + "learning_rate": 0.00029297345590428197, + "loss": 7.3616, + "step": 8094 + }, + { + "epoch": 0.7553419800317253, + "grad_norm": 1.0310937047280573, + "learning_rate": 0.0002929711678442654, + "loss": 7.2992, + "step": 8095 + }, + { + "epoch": 0.7554352897266026, + "grad_norm": 1.185989846178217, + "learning_rate": 0.0002929688794207155, + "loss": 7.0226, + "step": 8096 + }, + { + "epoch": 0.7555285994214799, + "grad_norm": 0.8081914952635064, + "learning_rate": 0.0002929665906336381, + "loss": 7.232, + "step": 8097 + }, + { + "epoch": 0.7556219091163572, + "grad_norm": 0.703824919565603, + "learning_rate": 0.00029296430148303905, + "loss": 7.1859, + "step": 8098 + }, + { + "epoch": 0.7557152188112345, + "grad_norm": 1.1551237355364836, + "learning_rate": 0.0002929620119689241, + "loss": 7.4031, + "step": 8099 + }, + { + "epoch": 0.7558085285061118, + "grad_norm": 20264586.336603705, + "learning_rate": 0.0002929597220912991, + "loss": 7.4338, + "step": 8100 + }, + { + "epoch": 0.7559018382009891, + "grad_norm": 3.5802475240551312, + "learning_rate": 0.0002929574318501699, + "loss": 7.3801, + "step": 8101 + }, + { + "epoch": 0.7559951478958664, + "grad_norm": 0.5654611766684561, + "learning_rate": 0.00029295514124554224, + "loss": 7.5019, + "step": 8102 + }, + { + "epoch": 0.7560884575907437, + "grad_norm": 19019914.284988474, + "learning_rate": 0.00029295285027742206, + "loss": 7.1621, + "step": 8103 + }, + { + "epoch": 0.756181767285621, + "grad_norm": 0.6607418548015707, + "learning_rate": 0.00029295055894581513, + "loss": 7.2212, + "step": 8104 + }, + { + "epoch": 0.7562750769804982, + "grad_norm": 1.3417213907413994, + "learning_rate": 0.0002929482672507272, + "loss": 7.4549, + "step": 8105 + }, + { + "epoch": 0.7563683866753755, + "grad_norm": 0.5516519203305162, + "learning_rate": 0.0002929459751921642, + "loss": 7.1682, + "step": 8106 + }, + { + "epoch": 0.7564616963702528, + "grad_norm": 0.5756050717491539, + "learning_rate": 0.000292943682770132, + "loss": 7.1713, + "step": 8107 + }, + { + "epoch": 0.7565550060651302, + "grad_norm": 1164248.3351241925, + "learning_rate": 0.0002929413899846363, + "loss": 7.4378, + "step": 8108 + }, + { + "epoch": 0.7566483157600075, + "grad_norm": 1.107773955867667, + "learning_rate": 0.000292939096835683, + "loss": 7.5085, + "step": 8109 + }, + { + "epoch": 0.7567416254548848, + "grad_norm": 18474386.655744806, + "learning_rate": 0.0002929368033232779, + "loss": 7.44, + "step": 8110 + }, + { + "epoch": 0.7568349351497621, + "grad_norm": 2.0024484758263505, + "learning_rate": 0.00029293450944742686, + "loss": 7.2755, + "step": 8111 + }, + { + "epoch": 0.7569282448446394, + "grad_norm": 124513.9316733765, + "learning_rate": 0.0002929322152081357, + "loss": 7.2413, + "step": 8112 + }, + { + "epoch": 0.7570215545395167, + "grad_norm": 2.8228989453555857, + "learning_rate": 0.00029292992060541025, + "loss": 7.3779, + "step": 8113 + }, + { + "epoch": 0.757114864234394, + "grad_norm": 1.1958879409320016, + "learning_rate": 0.0002929276256392564, + "loss": 6.9112, + "step": 8114 + }, + { + "epoch": 0.7572081739292712, + "grad_norm": 3.6708887122079044, + "learning_rate": 0.00029292533030967987, + "loss": 7.1653, + "step": 8115 + }, + { + "epoch": 0.7573014836241485, + "grad_norm": 0.7301360314621995, + "learning_rate": 0.0002929230346166866, + "loss": 7.3573, + "step": 8116 + }, + { + "epoch": 0.7573947933190258, + "grad_norm": 1.578414277390963, + "learning_rate": 0.00029292073856028233, + "loss": 7.6225, + "step": 8117 + }, + { + "epoch": 0.7574881030139031, + "grad_norm": 2.285906813983925, + "learning_rate": 0.000292918442140473, + "loss": 7.3049, + "step": 8118 + }, + { + "epoch": 0.7575814127087804, + "grad_norm": 1511832.52993321, + "learning_rate": 0.00029291614535726447, + "loss": 7.0867, + "step": 8119 + }, + { + "epoch": 0.7576747224036577, + "grad_norm": 1.0670549641412304, + "learning_rate": 0.00029291384821066247, + "loss": 7.2231, + "step": 8120 + }, + { + "epoch": 0.757768032098535, + "grad_norm": 5541053545.216059, + "learning_rate": 0.00029291155070067286, + "loss": 7.387, + "step": 8121 + }, + { + "epoch": 0.7578613417934124, + "grad_norm": 0.8497191191455735, + "learning_rate": 0.0002929092528273015, + "loss": 7.499, + "step": 8122 + }, + { + "epoch": 0.7579546514882897, + "grad_norm": 1.1138135525358752, + "learning_rate": 0.00029290695459055434, + "loss": 7.4284, + "step": 8123 + }, + { + "epoch": 0.758047961183167, + "grad_norm": 5.795943530180539, + "learning_rate": 0.000292904655990437, + "loss": 7.0947, + "step": 8124 + }, + { + "epoch": 0.7581412708780443, + "grad_norm": 1.2410938197520454, + "learning_rate": 0.0002929023570269555, + "loss": 7.4271, + "step": 8125 + }, + { + "epoch": 0.7582345805729215, + "grad_norm": 18.609822506799013, + "learning_rate": 0.0002929000577001157, + "loss": 7.4068, + "step": 8126 + }, + { + "epoch": 0.7583278902677988, + "grad_norm": 1.4736522141451018, + "learning_rate": 0.00029289775800992335, + "loss": 7.4617, + "step": 8127 + }, + { + "epoch": 0.7584211999626761, + "grad_norm": 0.8301587626622813, + "learning_rate": 0.0002928954579563843, + "loss": 7.5598, + "step": 8128 + }, + { + "epoch": 0.7585145096575534, + "grad_norm": 0.6228980358285798, + "learning_rate": 0.0002928931575395044, + "loss": 7.3534, + "step": 8129 + }, + { + "epoch": 0.7586078193524307, + "grad_norm": 0.6980327667444846, + "learning_rate": 0.0002928908567592896, + "loss": 7.4083, + "step": 8130 + }, + { + "epoch": 0.758701129047308, + "grad_norm": 572098331.4308013, + "learning_rate": 0.0002928885556157457, + "loss": 7.2177, + "step": 8131 + }, + { + "epoch": 0.7587944387421853, + "grad_norm": 1.0343743227046063, + "learning_rate": 0.00029288625410887843, + "loss": 7.3286, + "step": 8132 + }, + { + "epoch": 0.7588877484370626, + "grad_norm": 1.3111442340245023, + "learning_rate": 0.0002928839522386938, + "loss": 7.3333, + "step": 8133 + }, + { + "epoch": 0.75898105813194, + "grad_norm": 0.6178620688484097, + "learning_rate": 0.00029288165000519755, + "loss": 7.3473, + "step": 8134 + }, + { + "epoch": 0.7590743678268173, + "grad_norm": 746554599.5838776, + "learning_rate": 0.0002928793474083957, + "loss": 7.5048, + "step": 8135 + }, + { + "epoch": 0.7591676775216945, + "grad_norm": 0.7655099513029892, + "learning_rate": 0.00029287704444829393, + "loss": 7.4407, + "step": 8136 + }, + { + "epoch": 0.7592609872165718, + "grad_norm": 0.8287185387896531, + "learning_rate": 0.00029287474112489815, + "loss": 7.3642, + "step": 8137 + }, + { + "epoch": 0.7593542969114491, + "grad_norm": 0.735059971814863, + "learning_rate": 0.00029287243743821424, + "loss": 7.431, + "step": 8138 + }, + { + "epoch": 0.7594476066063264, + "grad_norm": 0.5498900527020003, + "learning_rate": 0.000292870133388248, + "loss": 7.2165, + "step": 8139 + }, + { + "epoch": 0.7595409163012037, + "grad_norm": 0.4231886243902307, + "learning_rate": 0.0002928678289750054, + "loss": 7.3866, + "step": 8140 + }, + { + "epoch": 0.759634225996081, + "grad_norm": 0.5230261174859853, + "learning_rate": 0.0002928655241984922, + "loss": 7.4475, + "step": 8141 + }, + { + "epoch": 0.7597275356909583, + "grad_norm": 1.1192758194127852, + "learning_rate": 0.0002928632190587143, + "loss": 7.4809, + "step": 8142 + }, + { + "epoch": 0.7598208453858356, + "grad_norm": 0.7140273637843257, + "learning_rate": 0.00029286091355567754, + "loss": 7.4038, + "step": 8143 + }, + { + "epoch": 0.7599141550807129, + "grad_norm": 0.7779086480150476, + "learning_rate": 0.00029285860768938775, + "loss": 7.2608, + "step": 8144 + }, + { + "epoch": 0.7600074647755902, + "grad_norm": 0.982028426003893, + "learning_rate": 0.00029285630145985093, + "loss": 6.9112, + "step": 8145 + }, + { + "epoch": 0.7601007744704675, + "grad_norm": 0.4545697249774902, + "learning_rate": 0.0002928539948670728, + "loss": 7.2899, + "step": 8146 + }, + { + "epoch": 0.7601940841653447, + "grad_norm": 0.7857002874880615, + "learning_rate": 0.0002928516879110593, + "loss": 7.3492, + "step": 8147 + }, + { + "epoch": 0.760287393860222, + "grad_norm": 2.47168863781276, + "learning_rate": 0.00029284938059181623, + "loss": 7.4022, + "step": 8148 + }, + { + "epoch": 0.7603807035550993, + "grad_norm": 106871484.09023921, + "learning_rate": 0.00029284707290934956, + "loss": 7.087, + "step": 8149 + }, + { + "epoch": 0.7604740132499767, + "grad_norm": 2190937.1744334344, + "learning_rate": 0.0002928447648636651, + "loss": 7.1593, + "step": 8150 + }, + { + "epoch": 0.760567322944854, + "grad_norm": 0.9014510249129073, + "learning_rate": 0.00029284245645476865, + "loss": 7.3851, + "step": 8151 + }, + { + "epoch": 0.7606606326397313, + "grad_norm": 0.6743270061972386, + "learning_rate": 0.00029284014768266617, + "loss": 7.3474, + "step": 8152 + }, + { + "epoch": 0.7607539423346086, + "grad_norm": 0.7157244610752186, + "learning_rate": 0.00029283783854736355, + "loss": 7.28, + "step": 8153 + }, + { + "epoch": 0.7608472520294859, + "grad_norm": 0.7555459957168903, + "learning_rate": 0.0002928355290488666, + "loss": 7.5064, + "step": 8154 + }, + { + "epoch": 0.7609405617243632, + "grad_norm": 11046233251.47922, + "learning_rate": 0.0002928332191871812, + "loss": 7.4699, + "step": 8155 + }, + { + "epoch": 0.7610338714192405, + "grad_norm": 0.7397758641700907, + "learning_rate": 0.00029283090896231324, + "loss": 6.9978, + "step": 8156 + }, + { + "epoch": 0.7611271811141178, + "grad_norm": 1.551826776354161, + "learning_rate": 0.00029282859837426856, + "loss": 7.6236, + "step": 8157 + }, + { + "epoch": 0.761220490808995, + "grad_norm": 1801721.147584821, + "learning_rate": 0.0002928262874230531, + "loss": 7.084, + "step": 8158 + }, + { + "epoch": 0.7613138005038723, + "grad_norm": 2.4512316213934096, + "learning_rate": 0.0002928239761086727, + "loss": 7.5406, + "step": 8159 + }, + { + "epoch": 0.7614071101987496, + "grad_norm": 0.7620277264434191, + "learning_rate": 0.0002928216644311332, + "loss": 7.1601, + "step": 8160 + }, + { + "epoch": 0.7615004198936269, + "grad_norm": 3.3641458143879897, + "learning_rate": 0.0002928193523904405, + "loss": 7.2886, + "step": 8161 + }, + { + "epoch": 0.7615937295885042, + "grad_norm": 7460808.520320194, + "learning_rate": 0.0002928170399866005, + "loss": 7.3646, + "step": 8162 + }, + { + "epoch": 0.7616870392833816, + "grad_norm": 0.8661208104337715, + "learning_rate": 0.0002928147272196191, + "loss": 7.2725, + "step": 8163 + }, + { + "epoch": 0.7617803489782589, + "grad_norm": 0.7096758993474892, + "learning_rate": 0.0002928124140895022, + "loss": 7.3762, + "step": 8164 + }, + { + "epoch": 0.7618736586731362, + "grad_norm": 1.46133544153897, + "learning_rate": 0.00029281010059625557, + "loss": 7.0655, + "step": 8165 + }, + { + "epoch": 0.7619669683680135, + "grad_norm": 1.0678369362523679, + "learning_rate": 0.00029280778673988517, + "loss": 7.3754, + "step": 8166 + }, + { + "epoch": 0.7620602780628908, + "grad_norm": 0.8818328313339139, + "learning_rate": 0.00029280547252039693, + "loss": 7.4605, + "step": 8167 + }, + { + "epoch": 0.762153587757768, + "grad_norm": 1.012898196640558, + "learning_rate": 0.00029280315793779664, + "loss": 7.3007, + "step": 8168 + }, + { + "epoch": 0.7622468974526453, + "grad_norm": 2.3658661417967073, + "learning_rate": 0.00029280084299209016, + "loss": 7.1765, + "step": 8169 + }, + { + "epoch": 0.7623402071475226, + "grad_norm": 398455178330.1915, + "learning_rate": 0.0002927985276832835, + "loss": 7.2073, + "step": 8170 + }, + { + "epoch": 0.7624335168423999, + "grad_norm": 0.8211917223155252, + "learning_rate": 0.00029279621201138247, + "loss": 7.3591, + "step": 8171 + }, + { + "epoch": 0.7625268265372772, + "grad_norm": 5.967034934196324, + "learning_rate": 0.0002927938959763929, + "loss": 7.442, + "step": 8172 + }, + { + "epoch": 0.7626201362321545, + "grad_norm": 0.6055026351824986, + "learning_rate": 0.00029279157957832087, + "loss": 7.356, + "step": 8173 + }, + { + "epoch": 0.7627134459270318, + "grad_norm": 0.8161944422429774, + "learning_rate": 0.0002927892628171721, + "loss": 7.4479, + "step": 8174 + }, + { + "epoch": 0.7628067556219091, + "grad_norm": 23000694955.34518, + "learning_rate": 0.00029278694569295256, + "loss": 7.5213, + "step": 8175 + }, + { + "epoch": 0.7629000653167864, + "grad_norm": 0.6859152208547085, + "learning_rate": 0.0002927846282056681, + "loss": 7.4466, + "step": 8176 + }, + { + "epoch": 0.7629933750116638, + "grad_norm": 1.3181847194071057, + "learning_rate": 0.00029278231035532456, + "loss": 7.2666, + "step": 8177 + }, + { + "epoch": 0.7630866847065411, + "grad_norm": 17.53625040892316, + "learning_rate": 0.000292779992141928, + "loss": 7.4324, + "step": 8178 + }, + { + "epoch": 0.7631799944014183, + "grad_norm": 0.7356932186327788, + "learning_rate": 0.00029277767356548413, + "loss": 7.4486, + "step": 8179 + }, + { + "epoch": 0.7632733040962956, + "grad_norm": 1.4235616567495293, + "learning_rate": 0.000292775354625999, + "loss": 7.2186, + "step": 8180 + }, + { + "epoch": 0.7633666137911729, + "grad_norm": 0.5766915927312078, + "learning_rate": 0.0002927730353234784, + "loss": 7.1484, + "step": 8181 + }, + { + "epoch": 0.7634599234860502, + "grad_norm": 0.8379827122243996, + "learning_rate": 0.00029277071565792824, + "loss": 7.4455, + "step": 8182 + }, + { + "epoch": 0.7635532331809275, + "grad_norm": 1.4762169354915229, + "learning_rate": 0.00029276839562935447, + "loss": 7.3716, + "step": 8183 + }, + { + "epoch": 0.7636465428758048, + "grad_norm": 241564042707.60153, + "learning_rate": 0.000292766075237763, + "loss": 7.5219, + "step": 8184 + }, + { + "epoch": 0.7637398525706821, + "grad_norm": 0.9793502688101527, + "learning_rate": 0.0002927637544831596, + "loss": 7.2917, + "step": 8185 + }, + { + "epoch": 0.7638331622655594, + "grad_norm": 1.0062399209034083, + "learning_rate": 0.0002927614333655503, + "loss": 7.3259, + "step": 8186 + }, + { + "epoch": 0.7639264719604367, + "grad_norm": 1.1746767010070789, + "learning_rate": 0.000292759111884941, + "loss": 7.2528, + "step": 8187 + }, + { + "epoch": 0.764019781655314, + "grad_norm": 24874898267.199, + "learning_rate": 0.00029275679004133755, + "loss": 7.3408, + "step": 8188 + }, + { + "epoch": 0.7641130913501912, + "grad_norm": 0.965402508402506, + "learning_rate": 0.0002927544678347459, + "loss": 7.26, + "step": 8189 + }, + { + "epoch": 0.7642064010450685, + "grad_norm": 0.771628062710779, + "learning_rate": 0.0002927521452651718, + "loss": 7.2192, + "step": 8190 + }, + { + "epoch": 0.7642997107399458, + "grad_norm": 2705617729.8811, + "learning_rate": 0.00029274982233262146, + "loss": 7.1812, + "step": 8191 + }, + { + "epoch": 0.7643930204348232, + "grad_norm": 2.3174227208329468, + "learning_rate": 0.0002927474990371005, + "loss": 7.7409, + "step": 8192 + }, + { + "epoch": 0.7644863301297005, + "grad_norm": 7608370943.020411, + "learning_rate": 0.0002927451753786149, + "loss": 7.1699, + "step": 8193 + }, + { + "epoch": 0.7645796398245778, + "grad_norm": 2005167141.7504268, + "learning_rate": 0.00029274285135717066, + "loss": 7.0866, + "step": 8194 + }, + { + "epoch": 0.7646729495194551, + "grad_norm": 21.53643038707756, + "learning_rate": 0.0002927405269727737, + "loss": 7.2832, + "step": 8195 + }, + { + "epoch": 0.7647662592143324, + "grad_norm": 5.342627891050481, + "learning_rate": 0.0002927382022254298, + "loss": 7.263, + "step": 8196 + }, + { + "epoch": 0.7648595689092097, + "grad_norm": 1.3391770962382747, + "learning_rate": 0.00029273587711514486, + "loss": 7.1965, + "step": 8197 + }, + { + "epoch": 0.764952878604087, + "grad_norm": 9.499367664182639, + "learning_rate": 0.00029273355164192495, + "loss": 7.4601, + "step": 8198 + }, + { + "epoch": 0.7650461882989643, + "grad_norm": 0.7858825183029648, + "learning_rate": 0.00029273122580577584, + "loss": 7.152, + "step": 8199 + }, + { + "epoch": 0.7651394979938415, + "grad_norm": 0.9784554837319596, + "learning_rate": 0.0002927288996067036, + "loss": 7.2828, + "step": 8200 + }, + { + "epoch": 0.7652328076887188, + "grad_norm": 1.2936493294974554, + "learning_rate": 0.00029272657304471394, + "loss": 7.4061, + "step": 8201 + }, + { + "epoch": 0.7653261173835961, + "grad_norm": 1.4592590506682486, + "learning_rate": 0.00029272424611981296, + "loss": 7.3702, + "step": 8202 + }, + { + "epoch": 0.7654194270784734, + "grad_norm": 1.522722222976088, + "learning_rate": 0.0002927219188320065, + "loss": 7.844, + "step": 8203 + }, + { + "epoch": 0.7655127367733507, + "grad_norm": 201633543997.27878, + "learning_rate": 0.0002927195911813004, + "loss": 7.3242, + "step": 8204 + }, + { + "epoch": 0.765606046468228, + "grad_norm": 14.186784039604222, + "learning_rate": 0.00029271726316770075, + "loss": 7.1256, + "step": 8205 + }, + { + "epoch": 0.7656993561631054, + "grad_norm": 1.428854128350833, + "learning_rate": 0.00029271493479121337, + "loss": 7.061, + "step": 8206 + }, + { + "epoch": 0.7657926658579827, + "grad_norm": 0.6597039859565217, + "learning_rate": 0.0002927126060518441, + "loss": 7.6335, + "step": 8207 + }, + { + "epoch": 0.76588597555286, + "grad_norm": 3.041185727714845, + "learning_rate": 0.00029271027694959906, + "loss": 7.0967, + "step": 8208 + }, + { + "epoch": 0.7659792852477373, + "grad_norm": 21252914.974000815, + "learning_rate": 0.00029270794748448395, + "loss": 7.4962, + "step": 8209 + }, + { + "epoch": 0.7660725949426146, + "grad_norm": 77624828.7463922, + "learning_rate": 0.00029270561765650486, + "loss": 7.3473, + "step": 8210 + }, + { + "epoch": 0.7661659046374918, + "grad_norm": 1.0941603192715508, + "learning_rate": 0.00029270328746566765, + "loss": 7.2905, + "step": 8211 + }, + { + "epoch": 0.7662592143323691, + "grad_norm": 1.536223486835396, + "learning_rate": 0.0002927009569119782, + "loss": 7.7015, + "step": 8212 + }, + { + "epoch": 0.7663525240272464, + "grad_norm": 2.538046281349378, + "learning_rate": 0.0002926986259954426, + "loss": 7.6454, + "step": 8213 + }, + { + "epoch": 0.7664458337221237, + "grad_norm": 1.7286796929745867, + "learning_rate": 0.00029269629471606655, + "loss": 7.4493, + "step": 8214 + }, + { + "epoch": 0.766539143417001, + "grad_norm": 1.8300165871402367, + "learning_rate": 0.0002926939630738562, + "loss": 7.5913, + "step": 8215 + }, + { + "epoch": 0.7666324531118783, + "grad_norm": 1.1741861764632304, + "learning_rate": 0.0002926916310688173, + "loss": 7.3582, + "step": 8216 + }, + { + "epoch": 0.7667257628067556, + "grad_norm": 0.9570778727482484, + "learning_rate": 0.00029268929870095585, + "loss": 7.4169, + "step": 8217 + }, + { + "epoch": 0.766819072501633, + "grad_norm": 32.387076696198214, + "learning_rate": 0.0002926869659702778, + "loss": 7.4209, + "step": 8218 + }, + { + "epoch": 0.7669123821965103, + "grad_norm": 1.0443252501769917, + "learning_rate": 0.00029268463287678904, + "loss": 7.4739, + "step": 8219 + }, + { + "epoch": 0.7670056918913876, + "grad_norm": 1.5219944511799672, + "learning_rate": 0.00029268229942049556, + "loss": 7.3314, + "step": 8220 + }, + { + "epoch": 0.7670990015862648, + "grad_norm": 79.56869988432673, + "learning_rate": 0.00029267996560140326, + "loss": 7.3922, + "step": 8221 + }, + { + "epoch": 0.7671923112811421, + "grad_norm": 13054963.13402736, + "learning_rate": 0.00029267763141951805, + "loss": 7.0977, + "step": 8222 + }, + { + "epoch": 0.7672856209760194, + "grad_norm": 1.222298192643095, + "learning_rate": 0.00029267529687484583, + "loss": 7.5698, + "step": 8223 + }, + { + "epoch": 0.7673789306708967, + "grad_norm": 0.6490076990043951, + "learning_rate": 0.0002926729619673927, + "loss": 7.1916, + "step": 8224 + }, + { + "epoch": 0.767472240365774, + "grad_norm": 2.6569317660610063, + "learning_rate": 0.00029267062669716446, + "loss": 7.3552, + "step": 8225 + }, + { + "epoch": 0.7675655500606513, + "grad_norm": 78479979.83797905, + "learning_rate": 0.00029266829106416707, + "loss": 7.3581, + "step": 8226 + }, + { + "epoch": 0.7676588597555286, + "grad_norm": 0.7487541256317816, + "learning_rate": 0.0002926659550684064, + "loss": 7.207, + "step": 8227 + }, + { + "epoch": 0.7677521694504059, + "grad_norm": 0.7861110148431417, + "learning_rate": 0.0002926636187098886, + "loss": 7.1752, + "step": 8228 + }, + { + "epoch": 0.7678454791452832, + "grad_norm": 0.9635338663094409, + "learning_rate": 0.0002926612819886194, + "loss": 7.1945, + "step": 8229 + }, + { + "epoch": 0.7679387888401605, + "grad_norm": 0.43977396637320354, + "learning_rate": 0.00029265894490460487, + "loss": 7.3115, + "step": 8230 + }, + { + "epoch": 0.7680320985350378, + "grad_norm": 14236004.195551198, + "learning_rate": 0.00029265660745785086, + "loss": 7.1485, + "step": 8231 + }, + { + "epoch": 0.768125408229915, + "grad_norm": 1.4266150395663961, + "learning_rate": 0.00029265426964836336, + "loss": 7.3764, + "step": 8232 + }, + { + "epoch": 0.7682187179247923, + "grad_norm": 1.1943561762298787, + "learning_rate": 0.00029265193147614833, + "loss": 7.399, + "step": 8233 + }, + { + "epoch": 0.7683120276196697, + "grad_norm": 0.566387770881462, + "learning_rate": 0.0002926495929412116, + "loss": 7.371, + "step": 8234 + }, + { + "epoch": 0.768405337314547, + "grad_norm": 0.4644691663322432, + "learning_rate": 0.00029264725404355935, + "loss": 7.0595, + "step": 8235 + }, + { + "epoch": 0.7684986470094243, + "grad_norm": 0.69534123315486, + "learning_rate": 0.0002926449147831973, + "loss": 7.1334, + "step": 8236 + }, + { + "epoch": 0.7685919567043016, + "grad_norm": 1.025018610261918, + "learning_rate": 0.0002926425751601315, + "loss": 7.368, + "step": 8237 + }, + { + "epoch": 0.7686852663991789, + "grad_norm": 0.9150196140424364, + "learning_rate": 0.0002926402351743679, + "loss": 7.119, + "step": 8238 + }, + { + "epoch": 0.7687785760940562, + "grad_norm": 0.6276895346978089, + "learning_rate": 0.0002926378948259125, + "loss": 7.4835, + "step": 8239 + }, + { + "epoch": 0.7688718857889335, + "grad_norm": 0.7043499986973032, + "learning_rate": 0.00029263555411477107, + "loss": 7.4792, + "step": 8240 + }, + { + "epoch": 0.7689651954838108, + "grad_norm": 29275039.17924501, + "learning_rate": 0.0002926332130409498, + "loss": 7.4177, + "step": 8241 + }, + { + "epoch": 0.769058505178688, + "grad_norm": 1.5031309749756512, + "learning_rate": 0.0002926308716044544, + "loss": 7.1886, + "step": 8242 + }, + { + "epoch": 0.7691518148735653, + "grad_norm": 0.8774737653995645, + "learning_rate": 0.00029262852980529097, + "loss": 7.4272, + "step": 8243 + }, + { + "epoch": 0.7692451245684426, + "grad_norm": 0.5818292428063918, + "learning_rate": 0.0002926261876434655, + "loss": 7.2403, + "step": 8244 + }, + { + "epoch": 0.7693384342633199, + "grad_norm": 2.1453624373566718, + "learning_rate": 0.0002926238451189838, + "loss": 7.6837, + "step": 8245 + }, + { + "epoch": 0.7694317439581972, + "grad_norm": 0.9947809446446738, + "learning_rate": 0.000292621502231852, + "loss": 6.9285, + "step": 8246 + }, + { + "epoch": 0.7695250536530746, + "grad_norm": 6.77537427059991, + "learning_rate": 0.00029261915898207586, + "loss": 7.6995, + "step": 8247 + }, + { + "epoch": 0.7696183633479519, + "grad_norm": 13802302.6850017, + "learning_rate": 0.00029261681536966154, + "loss": 7.1287, + "step": 8248 + }, + { + "epoch": 0.7697116730428292, + "grad_norm": 1.24285484680296, + "learning_rate": 0.00029261447139461487, + "loss": 7.0994, + "step": 8249 + }, + { + "epoch": 0.7698049827377065, + "grad_norm": 124.44661925273076, + "learning_rate": 0.00029261212705694183, + "loss": 7.0853, + "step": 8250 + }, + { + "epoch": 0.7698982924325838, + "grad_norm": 1.490067232512301, + "learning_rate": 0.0002926097823566484, + "loss": 7.18, + "step": 8251 + }, + { + "epoch": 0.7699916021274611, + "grad_norm": 7.0908087312671935, + "learning_rate": 0.00029260743729374056, + "loss": 6.8784, + "step": 8252 + }, + { + "epoch": 0.7700849118223383, + "grad_norm": 1.0945657396173736, + "learning_rate": 0.00029260509186822427, + "loss": 7.2646, + "step": 8253 + }, + { + "epoch": 0.7701782215172156, + "grad_norm": 0.9875478709038908, + "learning_rate": 0.00029260274608010545, + "loss": 7.1723, + "step": 8254 + }, + { + "epoch": 0.7702715312120929, + "grad_norm": 4.6190653712162755, + "learning_rate": 0.00029260039992939007, + "loss": 7.0333, + "step": 8255 + }, + { + "epoch": 0.7703648409069702, + "grad_norm": 1.0874430637572576, + "learning_rate": 0.0002925980534160841, + "loss": 7.8141, + "step": 8256 + }, + { + "epoch": 0.7704581506018475, + "grad_norm": 8962087.21196967, + "learning_rate": 0.0002925957065401936, + "loss": 7.471, + "step": 8257 + }, + { + "epoch": 0.7705514602967248, + "grad_norm": 0.9640741819583976, + "learning_rate": 0.0002925933593017244, + "loss": 7.5428, + "step": 8258 + }, + { + "epoch": 0.7706447699916021, + "grad_norm": 1.4566310721346167, + "learning_rate": 0.0002925910117006825, + "loss": 7.299, + "step": 8259 + }, + { + "epoch": 0.7707380796864794, + "grad_norm": 1.4330342329736971, + "learning_rate": 0.0002925886637370739, + "loss": 7.3311, + "step": 8260 + }, + { + "epoch": 0.7708313893813568, + "grad_norm": 0.961826692279991, + "learning_rate": 0.00029258631541090465, + "loss": 7.2666, + "step": 8261 + }, + { + "epoch": 0.7709246990762341, + "grad_norm": 1.8081207103930705, + "learning_rate": 0.00029258396672218056, + "loss": 6.9636, + "step": 8262 + }, + { + "epoch": 0.7710180087711114, + "grad_norm": 1.3890887864349144, + "learning_rate": 0.0002925816176709077, + "loss": 7.2635, + "step": 8263 + }, + { + "epoch": 0.7711113184659886, + "grad_norm": 2.9750799644641086, + "learning_rate": 0.000292579268257092, + "loss": 7.4505, + "step": 8264 + }, + { + "epoch": 0.7712046281608659, + "grad_norm": 1.4037174978415474, + "learning_rate": 0.0002925769184807395, + "loss": 7.2434, + "step": 8265 + }, + { + "epoch": 0.7712979378557432, + "grad_norm": 1.3668691665282644, + "learning_rate": 0.0002925745683418561, + "loss": 7.2142, + "step": 8266 + }, + { + "epoch": 0.7713912475506205, + "grad_norm": 0.4703832004405953, + "learning_rate": 0.00029257221784044783, + "loss": 7.3593, + "step": 8267 + }, + { + "epoch": 0.7714845572454978, + "grad_norm": 0.6830915319151291, + "learning_rate": 0.00029256986697652064, + "loss": 7.4817, + "step": 8268 + }, + { + "epoch": 0.7715778669403751, + "grad_norm": 1.1829286070854703, + "learning_rate": 0.0002925675157500805, + "loss": 7.353, + "step": 8269 + }, + { + "epoch": 0.7716711766352524, + "grad_norm": 1.4572994816706766, + "learning_rate": 0.00029256516416113336, + "loss": 7.3441, + "step": 8270 + }, + { + "epoch": 0.7717644863301297, + "grad_norm": 1.1675713084647161, + "learning_rate": 0.00029256281220968526, + "loss": 7.3201, + "step": 8271 + }, + { + "epoch": 0.771857796025007, + "grad_norm": 1.2322544297584141, + "learning_rate": 0.00029256045989574223, + "loss": 7.3611, + "step": 8272 + }, + { + "epoch": 0.7719511057198843, + "grad_norm": 0.626663473472697, + "learning_rate": 0.0002925581072193101, + "loss": 7.0886, + "step": 8273 + }, + { + "epoch": 0.7720444154147615, + "grad_norm": 20851947.015676323, + "learning_rate": 0.00029255575418039495, + "loss": 7.3089, + "step": 8274 + }, + { + "epoch": 0.7721377251096389, + "grad_norm": 1.2886891244489098, + "learning_rate": 0.0002925534007790028, + "loss": 7.3501, + "step": 8275 + }, + { + "epoch": 0.7722310348045162, + "grad_norm": 1.764387305566607, + "learning_rate": 0.0002925510470151395, + "loss": 7.6381, + "step": 8276 + }, + { + "epoch": 0.7723243444993935, + "grad_norm": 2.627574979952993, + "learning_rate": 0.00029254869288881116, + "loss": 7.2063, + "step": 8277 + }, + { + "epoch": 0.7724176541942708, + "grad_norm": 0.4347271993293246, + "learning_rate": 0.0002925463384000237, + "loss": 7.0973, + "step": 8278 + }, + { + "epoch": 0.7725109638891481, + "grad_norm": 0.5686169291146599, + "learning_rate": 0.0002925439835487831, + "loss": 7.4141, + "step": 8279 + }, + { + "epoch": 0.7726042735840254, + "grad_norm": 0.6267590627865302, + "learning_rate": 0.00029254162833509544, + "loss": 7.5777, + "step": 8280 + }, + { + "epoch": 0.7726975832789027, + "grad_norm": 1.0601951877941753, + "learning_rate": 0.0002925392727589666, + "loss": 7.3086, + "step": 8281 + }, + { + "epoch": 0.77279089297378, + "grad_norm": 0.8311573675882562, + "learning_rate": 0.0002925369168204026, + "loss": 7.4367, + "step": 8282 + }, + { + "epoch": 0.7728842026686573, + "grad_norm": 1.706837493200096, + "learning_rate": 0.00029253456051940953, + "loss": 7.3891, + "step": 8283 + }, + { + "epoch": 0.7729775123635346, + "grad_norm": 1.3260664037341643, + "learning_rate": 0.0002925322038559932, + "loss": 7.2531, + "step": 8284 + }, + { + "epoch": 0.7730708220584118, + "grad_norm": 66782.44756034107, + "learning_rate": 0.00029252984683015974, + "loss": 7.3428, + "step": 8285 + }, + { + "epoch": 0.7731641317532891, + "grad_norm": 5.289250133677749, + "learning_rate": 0.00029252748944191506, + "loss": 7.4212, + "step": 8286 + }, + { + "epoch": 0.7732574414481664, + "grad_norm": 2.1711972741009853, + "learning_rate": 0.00029252513169126523, + "loss": 7.2472, + "step": 8287 + }, + { + "epoch": 0.7733507511430437, + "grad_norm": 1.1634174810757072, + "learning_rate": 0.0002925227735782162, + "loss": 7.33, + "step": 8288 + }, + { + "epoch": 0.773444060837921, + "grad_norm": 0.8500367290457701, + "learning_rate": 0.00029252041510277393, + "loss": 7.4315, + "step": 8289 + }, + { + "epoch": 0.7735373705327984, + "grad_norm": 0.40515069934246145, + "learning_rate": 0.00029251805626494455, + "loss": 6.9511, + "step": 8290 + }, + { + "epoch": 0.7736306802276757, + "grad_norm": 0.6540359793062858, + "learning_rate": 0.0002925156970647339, + "loss": 7.3738, + "step": 8291 + }, + { + "epoch": 0.773723989922553, + "grad_norm": 220119.72062581358, + "learning_rate": 0.00029251333750214813, + "loss": 6.928, + "step": 8292 + }, + { + "epoch": 0.7738172996174303, + "grad_norm": 327013.27067138674, + "learning_rate": 0.0002925109775771931, + "loss": 7.1168, + "step": 8293 + }, + { + "epoch": 0.7739106093123076, + "grad_norm": 1.6417815369217255, + "learning_rate": 0.0002925086172898749, + "loss": 7.5649, + "step": 8294 + }, + { + "epoch": 0.7740039190071848, + "grad_norm": 2.647986173938601, + "learning_rate": 0.00029250625664019943, + "loss": 7.1458, + "step": 8295 + }, + { + "epoch": 0.7740972287020621, + "grad_norm": 0.6318534606684946, + "learning_rate": 0.0002925038956281728, + "loss": 7.2718, + "step": 8296 + }, + { + "epoch": 0.7741905383969394, + "grad_norm": 1.7437061528939692, + "learning_rate": 0.000292501534253801, + "loss": 7.4311, + "step": 8297 + }, + { + "epoch": 0.7742838480918167, + "grad_norm": 1.3371421969080854, + "learning_rate": 0.00029249917251709, + "loss": 7.2216, + "step": 8298 + }, + { + "epoch": 0.774377157786694, + "grad_norm": 0.6073137340941898, + "learning_rate": 0.00029249681041804577, + "loss": 7.415, + "step": 8299 + }, + { + "epoch": 0.7744704674815713, + "grad_norm": 0.46076152912311663, + "learning_rate": 0.00029249444795667443, + "loss": 7.2968, + "step": 8300 + }, + { + "epoch": 0.7745637771764486, + "grad_norm": 0.561467389973262, + "learning_rate": 0.00029249208513298187, + "loss": 7.2484, + "step": 8301 + }, + { + "epoch": 0.774657086871326, + "grad_norm": 0.693217279336345, + "learning_rate": 0.0002924897219469742, + "loss": 7.5352, + "step": 8302 + }, + { + "epoch": 0.7747503965662033, + "grad_norm": 4.067533838075362, + "learning_rate": 0.00029248735839865734, + "loss": 7.5541, + "step": 8303 + }, + { + "epoch": 0.7748437062610806, + "grad_norm": 1.4280158523678428, + "learning_rate": 0.0002924849944880373, + "loss": 7.1144, + "step": 8304 + }, + { + "epoch": 0.7749370159559579, + "grad_norm": 0.706000034132562, + "learning_rate": 0.00029248263021512016, + "loss": 7.22, + "step": 8305 + }, + { + "epoch": 0.7750303256508351, + "grad_norm": 0.7118942581478696, + "learning_rate": 0.0002924802655799119, + "loss": 7.2139, + "step": 8306 + }, + { + "epoch": 0.7751236353457124, + "grad_norm": 0.46730649714972666, + "learning_rate": 0.00029247790058241856, + "loss": 7.1482, + "step": 8307 + }, + { + "epoch": 0.7752169450405897, + "grad_norm": 3473497.004875253, + "learning_rate": 0.0002924755352226461, + "loss": 7.381, + "step": 8308 + }, + { + "epoch": 0.775310254735467, + "grad_norm": 0.9276625789520254, + "learning_rate": 0.00029247316950060053, + "loss": 7.4813, + "step": 8309 + }, + { + "epoch": 0.7754035644303443, + "grad_norm": 0.6850085757805339, + "learning_rate": 0.00029247080341628795, + "loss": 7.1435, + "step": 8310 + }, + { + "epoch": 0.7754968741252216, + "grad_norm": 149718515.74335858, + "learning_rate": 0.00029246843696971426, + "loss": 7.5698, + "step": 8311 + }, + { + "epoch": 0.7755901838200989, + "grad_norm": 2.9719564895997164, + "learning_rate": 0.0002924660701608856, + "loss": 7.2537, + "step": 8312 + }, + { + "epoch": 0.7756834935149762, + "grad_norm": 0.6576228240001841, + "learning_rate": 0.0002924637029898079, + "loss": 7.2973, + "step": 8313 + }, + { + "epoch": 0.7757768032098535, + "grad_norm": 0.4896528845489754, + "learning_rate": 0.0002924613354564872, + "loss": 7.1271, + "step": 8314 + }, + { + "epoch": 0.7758701129047308, + "grad_norm": 6929066108.184937, + "learning_rate": 0.0002924589675609295, + "loss": 7.3048, + "step": 8315 + }, + { + "epoch": 0.7759634225996082, + "grad_norm": 1.8734120396892673, + "learning_rate": 0.0002924565993031409, + "loss": 7.0278, + "step": 8316 + }, + { + "epoch": 0.7760567322944854, + "grad_norm": 11.507516168814828, + "learning_rate": 0.0002924542306831273, + "loss": 7.201, + "step": 8317 + }, + { + "epoch": 0.7761500419893627, + "grad_norm": 0.6507424348839352, + "learning_rate": 0.00029245186170089477, + "loss": 7.3771, + "step": 8318 + }, + { + "epoch": 0.77624335168424, + "grad_norm": 0.4632551764432626, + "learning_rate": 0.0002924494923564494, + "loss": 6.999, + "step": 8319 + }, + { + "epoch": 0.7763366613791173, + "grad_norm": 1.664903480934075, + "learning_rate": 0.0002924471226497972, + "loss": 7.3474, + "step": 8320 + }, + { + "epoch": 0.7764299710739946, + "grad_norm": 0.9532008200785909, + "learning_rate": 0.0002924447525809441, + "loss": 7.3754, + "step": 8321 + }, + { + "epoch": 0.7765232807688719, + "grad_norm": 1.7480113229927383, + "learning_rate": 0.00029244238214989627, + "loss": 7.3089, + "step": 8322 + }, + { + "epoch": 0.7766165904637492, + "grad_norm": 54.02536785092681, + "learning_rate": 0.00029244001135665963, + "loss": 7.5215, + "step": 8323 + }, + { + "epoch": 0.7767099001586265, + "grad_norm": 0.7374670225729553, + "learning_rate": 0.0002924376402012402, + "loss": 7.0824, + "step": 8324 + }, + { + "epoch": 0.7768032098535038, + "grad_norm": 0.8663384385232185, + "learning_rate": 0.00029243526868364403, + "loss": 7.2824, + "step": 8325 + }, + { + "epoch": 0.7768965195483811, + "grad_norm": 1.695743950142144, + "learning_rate": 0.00029243289680387717, + "loss": 7.5256, + "step": 8326 + }, + { + "epoch": 0.7769898292432583, + "grad_norm": 0.7134452395616309, + "learning_rate": 0.0002924305245619457, + "loss": 7.4193, + "step": 8327 + }, + { + "epoch": 0.7770831389381356, + "grad_norm": 0.6218693991159747, + "learning_rate": 0.00029242815195785555, + "loss": 7.3133, + "step": 8328 + }, + { + "epoch": 0.7771764486330129, + "grad_norm": 144384245.2658182, + "learning_rate": 0.0002924257789916128, + "loss": 7.3072, + "step": 8329 + }, + { + "epoch": 0.7772697583278902, + "grad_norm": 27996945.385808628, + "learning_rate": 0.0002924234056632235, + "loss": 7.3303, + "step": 8330 + }, + { + "epoch": 0.7773630680227676, + "grad_norm": 0.6939432035619043, + "learning_rate": 0.0002924210319726937, + "loss": 7.1961, + "step": 8331 + }, + { + "epoch": 0.7774563777176449, + "grad_norm": 9.24665815248864, + "learning_rate": 0.0002924186579200294, + "loss": 7.5548, + "step": 8332 + }, + { + "epoch": 0.7775496874125222, + "grad_norm": 2.767638084667929, + "learning_rate": 0.0002924162835052366, + "loss": 7.3459, + "step": 8333 + }, + { + "epoch": 0.7776429971073995, + "grad_norm": 6.589779078681357, + "learning_rate": 0.00029241390872832137, + "loss": 7.4451, + "step": 8334 + }, + { + "epoch": 0.7777363068022768, + "grad_norm": 0.7586288289425137, + "learning_rate": 0.0002924115335892898, + "loss": 7.1136, + "step": 8335 + }, + { + "epoch": 0.7778296164971541, + "grad_norm": 31175552.653784122, + "learning_rate": 0.00029240915808814786, + "loss": 7.4344, + "step": 8336 + }, + { + "epoch": 0.7779229261920314, + "grad_norm": 57023212.95593107, + "learning_rate": 0.00029240678222490154, + "loss": 7.3223, + "step": 8337 + }, + { + "epoch": 0.7780162358869086, + "grad_norm": 0.9708504356335037, + "learning_rate": 0.00029240440599955707, + "loss": 7.4764, + "step": 8338 + }, + { + "epoch": 0.7781095455817859, + "grad_norm": 0.5296440894190914, + "learning_rate": 0.00029240202941212035, + "loss": 7.3144, + "step": 8339 + }, + { + "epoch": 0.7782028552766632, + "grad_norm": 33729817.38955698, + "learning_rate": 0.00029239965246259745, + "loss": 7.132, + "step": 8340 + }, + { + "epoch": 0.7782961649715405, + "grad_norm": 1.9938353004103766, + "learning_rate": 0.0002923972751509944, + "loss": 7.4904, + "step": 8341 + }, + { + "epoch": 0.7783894746664178, + "grad_norm": 1.8770566960316823, + "learning_rate": 0.0002923948974773173, + "loss": 7.4939, + "step": 8342 + }, + { + "epoch": 0.7784827843612951, + "grad_norm": 1.234901941901803, + "learning_rate": 0.0002923925194415721, + "loss": 7.4743, + "step": 8343 + }, + { + "epoch": 0.7785760940561725, + "grad_norm": 0.7330440020970737, + "learning_rate": 0.00029239014104376495, + "loss": 7.8017, + "step": 8344 + }, + { + "epoch": 0.7786694037510498, + "grad_norm": 27532665.860245086, + "learning_rate": 0.0002923877622839018, + "loss": 7.311, + "step": 8345 + }, + { + "epoch": 0.7787627134459271, + "grad_norm": 377501785.95091647, + "learning_rate": 0.0002923853831619888, + "loss": 7.5208, + "step": 8346 + }, + { + "epoch": 0.7788560231408044, + "grad_norm": 2.001706123301679, + "learning_rate": 0.0002923830036780319, + "loss": 7.2006, + "step": 8347 + }, + { + "epoch": 0.7789493328356816, + "grad_norm": 0.8704590083486954, + "learning_rate": 0.00029238062383203726, + "loss": 7.3641, + "step": 8348 + }, + { + "epoch": 0.7790426425305589, + "grad_norm": 0.9428083806160968, + "learning_rate": 0.0002923782436240108, + "loss": 7.0545, + "step": 8349 + }, + { + "epoch": 0.7791359522254362, + "grad_norm": 1.1843964122720902, + "learning_rate": 0.0002923758630539587, + "loss": 7.3753, + "step": 8350 + }, + { + "epoch": 0.7792292619203135, + "grad_norm": 2.920457999940587, + "learning_rate": 0.0002923734821218869, + "loss": 7.3479, + "step": 8351 + }, + { + "epoch": 0.7793225716151908, + "grad_norm": 0.9818158264235588, + "learning_rate": 0.0002923711008278016, + "loss": 7.3605, + "step": 8352 + }, + { + "epoch": 0.7794158813100681, + "grad_norm": 0.6445239628085924, + "learning_rate": 0.00029236871917170867, + "loss": 7.7477, + "step": 8353 + }, + { + "epoch": 0.7795091910049454, + "grad_norm": 1.2046940985361116, + "learning_rate": 0.0002923663371536143, + "loss": 7.4759, + "step": 8354 + }, + { + "epoch": 0.7796025006998227, + "grad_norm": 2.4438816042502753, + "learning_rate": 0.00029236395477352453, + "loss": 7.1477, + "step": 8355 + }, + { + "epoch": 0.7796958103947, + "grad_norm": 0.6155524542035032, + "learning_rate": 0.0002923615720314453, + "loss": 7.1815, + "step": 8356 + }, + { + "epoch": 0.7797891200895773, + "grad_norm": 0.6202064200888693, + "learning_rate": 0.00029235918892738286, + "loss": 7.3416, + "step": 8357 + }, + { + "epoch": 0.7798824297844547, + "grad_norm": 1.7073089114071762, + "learning_rate": 0.00029235680546134315, + "loss": 7.1877, + "step": 8358 + }, + { + "epoch": 0.7799757394793319, + "grad_norm": 1.393410851813106, + "learning_rate": 0.00029235442163333224, + "loss": 7.4415, + "step": 8359 + }, + { + "epoch": 0.7800690491742092, + "grad_norm": 3869044342.281957, + "learning_rate": 0.0002923520374433562, + "loss": 7.2004, + "step": 8360 + }, + { + "epoch": 0.7801623588690865, + "grad_norm": 1.3411963788961987, + "learning_rate": 0.0002923496528914211, + "loss": 7.2525, + "step": 8361 + }, + { + "epoch": 0.7802556685639638, + "grad_norm": 86.8794223521973, + "learning_rate": 0.00029234726797753305, + "loss": 7.6999, + "step": 8362 + }, + { + "epoch": 0.7803489782588411, + "grad_norm": 3.0506669782681466, + "learning_rate": 0.00029234488270169797, + "loss": 7.2163, + "step": 8363 + }, + { + "epoch": 0.7804422879537184, + "grad_norm": 14701802583.054152, + "learning_rate": 0.0002923424970639221, + "loss": 7.4909, + "step": 8364 + }, + { + "epoch": 0.7805355976485957, + "grad_norm": 42.50103253884644, + "learning_rate": 0.00029234011106421136, + "loss": 7.3262, + "step": 8365 + }, + { + "epoch": 0.780628907343473, + "grad_norm": 23.657268895339335, + "learning_rate": 0.000292337724702572, + "loss": 7.2905, + "step": 8366 + }, + { + "epoch": 0.7807222170383503, + "grad_norm": 10.751053670664783, + "learning_rate": 0.0002923353379790098, + "loss": 7.4926, + "step": 8367 + }, + { + "epoch": 0.7808155267332276, + "grad_norm": 0.9806174473925859, + "learning_rate": 0.0002923329508935311, + "loss": 7.3277, + "step": 8368 + }, + { + "epoch": 0.7809088364281049, + "grad_norm": 8.313108646759192, + "learning_rate": 0.00029233056344614184, + "loss": 7.2098, + "step": 8369 + }, + { + "epoch": 0.7810021461229821, + "grad_norm": 4.10931770001212, + "learning_rate": 0.0002923281756368481, + "loss": 7.4939, + "step": 8370 + }, + { + "epoch": 0.7810954558178594, + "grad_norm": 8.913095196865374, + "learning_rate": 0.00029232578746565607, + "loss": 6.8813, + "step": 8371 + }, + { + "epoch": 0.7811887655127367, + "grad_norm": 0.9921666063173948, + "learning_rate": 0.00029232339893257164, + "loss": 7.0927, + "step": 8372 + }, + { + "epoch": 0.7812820752076141, + "grad_norm": 4.386427284070142, + "learning_rate": 0.00029232101003760097, + "loss": 7.5744, + "step": 8373 + }, + { + "epoch": 0.7813753849024914, + "grad_norm": 4.072498610830454, + "learning_rate": 0.00029231862078075017, + "loss": 7.2674, + "step": 8374 + }, + { + "epoch": 0.7814686945973687, + "grad_norm": 1.3686245684793716, + "learning_rate": 0.0002923162311620252, + "loss": 7.0933, + "step": 8375 + }, + { + "epoch": 0.781562004292246, + "grad_norm": 0.7097806397088865, + "learning_rate": 0.0002923138411814323, + "loss": 7.3507, + "step": 8376 + }, + { + "epoch": 0.7816553139871233, + "grad_norm": 1.26872735095576, + "learning_rate": 0.0002923114508389774, + "loss": 7.1332, + "step": 8377 + }, + { + "epoch": 0.7817486236820006, + "grad_norm": 155.269478394203, + "learning_rate": 0.00029230906013466665, + "loss": 7.5789, + "step": 8378 + }, + { + "epoch": 0.7818419333768779, + "grad_norm": 2.802088604338141, + "learning_rate": 0.0002923066690685061, + "loss": 7.3958, + "step": 8379 + }, + { + "epoch": 0.7819352430717551, + "grad_norm": 1.31253091022748, + "learning_rate": 0.00029230427764050186, + "loss": 6.9861, + "step": 8380 + }, + { + "epoch": 0.7820285527666324, + "grad_norm": 1.6814742858004565, + "learning_rate": 0.00029230188585066, + "loss": 6.8968, + "step": 8381 + }, + { + "epoch": 0.7821218624615097, + "grad_norm": 0.7252424788036615, + "learning_rate": 0.0002922994936989866, + "loss": 7.0641, + "step": 8382 + }, + { + "epoch": 0.782215172156387, + "grad_norm": 1.3960556237240878, + "learning_rate": 0.00029229710118548773, + "loss": 7.2324, + "step": 8383 + }, + { + "epoch": 0.7823084818512643, + "grad_norm": 60322.9853447073, + "learning_rate": 0.0002922947083101695, + "loss": 7.2175, + "step": 8384 + }, + { + "epoch": 0.7824017915461416, + "grad_norm": 44.22645531953319, + "learning_rate": 0.0002922923150730379, + "loss": 7.2965, + "step": 8385 + }, + { + "epoch": 0.782495101241019, + "grad_norm": 108877.51425050525, + "learning_rate": 0.00029228992147409917, + "loss": 7.5058, + "step": 8386 + }, + { + "epoch": 0.7825884109358963, + "grad_norm": 93.02423112381176, + "learning_rate": 0.00029228752751335934, + "loss": 7.5427, + "step": 8387 + }, + { + "epoch": 0.7826817206307736, + "grad_norm": 6.737009506301153, + "learning_rate": 0.0002922851331908244, + "loss": 7.2486, + "step": 8388 + }, + { + "epoch": 0.7827750303256509, + "grad_norm": 26.28963818080088, + "learning_rate": 0.00029228273850650057, + "loss": 7.5494, + "step": 8389 + }, + { + "epoch": 0.7828683400205282, + "grad_norm": 2.0271212893132735, + "learning_rate": 0.0002922803434603939, + "loss": 7.2902, + "step": 8390 + }, + { + "epoch": 0.7829616497154054, + "grad_norm": 4.474235034871334, + "learning_rate": 0.0002922779480525104, + "loss": 7.2873, + "step": 8391 + }, + { + "epoch": 0.7830549594102827, + "grad_norm": 3.7681349955269683, + "learning_rate": 0.00029227555228285626, + "loss": 7.2015, + "step": 8392 + }, + { + "epoch": 0.78314826910516, + "grad_norm": 1.5673312574668903, + "learning_rate": 0.00029227315615143755, + "loss": 7.4415, + "step": 8393 + }, + { + "epoch": 0.7832415788000373, + "grad_norm": 55.01860198120453, + "learning_rate": 0.0002922707596582603, + "loss": 7.3766, + "step": 8394 + }, + { + "epoch": 0.7833348884949146, + "grad_norm": 2.6896714380848077, + "learning_rate": 0.0002922683628033307, + "loss": 7.2539, + "step": 8395 + }, + { + "epoch": 0.7834281981897919, + "grad_norm": 13541.192931936697, + "learning_rate": 0.00029226596558665474, + "loss": 6.8538, + "step": 8396 + }, + { + "epoch": 0.7835215078846692, + "grad_norm": 10.042423745060837, + "learning_rate": 0.0002922635680082386, + "loss": 7.0962, + "step": 8397 + }, + { + "epoch": 0.7836148175795465, + "grad_norm": 0.541289959612754, + "learning_rate": 0.0002922611700680884, + "loss": 7.2096, + "step": 8398 + }, + { + "epoch": 0.7837081272744238, + "grad_norm": 5.166749527839149, + "learning_rate": 0.0002922587717662102, + "loss": 6.9359, + "step": 8399 + }, + { + "epoch": 0.7838014369693012, + "grad_norm": 17991.228590706596, + "learning_rate": 0.00029225637310260997, + "loss": 7.3041, + "step": 8400 + }, + { + "epoch": 0.7838947466641784, + "grad_norm": 4.561225455916792, + "learning_rate": 0.000292253974077294, + "loss": 7.1306, + "step": 8401 + }, + { + "epoch": 0.7839880563590557, + "grad_norm": 3.9610769834178905, + "learning_rate": 0.0002922515746902683, + "loss": 7.5783, + "step": 8402 + }, + { + "epoch": 0.784081366053933, + "grad_norm": 44.35488417435674, + "learning_rate": 0.00029224917494153897, + "loss": 7.4523, + "step": 8403 + }, + { + "epoch": 0.7841746757488103, + "grad_norm": 0.6631496673660943, + "learning_rate": 0.00029224677483111205, + "loss": 7.1152, + "step": 8404 + }, + { + "epoch": 0.7842679854436876, + "grad_norm": 5.848322119706285, + "learning_rate": 0.0002922443743589939, + "loss": 7.4294, + "step": 8405 + }, + { + "epoch": 0.7843612951385649, + "grad_norm": 1.8384430452626441, + "learning_rate": 0.0002922419735251903, + "loss": 7.2925, + "step": 8406 + }, + { + "epoch": 0.7844546048334422, + "grad_norm": 3.874518460358717, + "learning_rate": 0.0002922395723297075, + "loss": 7.2372, + "step": 8407 + }, + { + "epoch": 0.7845479145283195, + "grad_norm": 26.76179525214036, + "learning_rate": 0.00029223717077255163, + "loss": 7.1424, + "step": 8408 + }, + { + "epoch": 0.7846412242231968, + "grad_norm": 115934.92419455039, + "learning_rate": 0.00029223476885372883, + "loss": 7.3176, + "step": 8409 + }, + { + "epoch": 0.7847345339180741, + "grad_norm": 39.952601290945665, + "learning_rate": 0.0002922323665732451, + "loss": 7.3372, + "step": 8410 + }, + { + "epoch": 0.7848278436129514, + "grad_norm": 1.6136843002794834, + "learning_rate": 0.00029222996393110656, + "loss": 7.3552, + "step": 8411 + }, + { + "epoch": 0.7849211533078286, + "grad_norm": 1.9834939303421733, + "learning_rate": 0.0002922275609273194, + "loss": 7.3359, + "step": 8412 + }, + { + "epoch": 0.7850144630027059, + "grad_norm": 6.486790710511283, + "learning_rate": 0.00029222515756188966, + "loss": 7.1772, + "step": 8413 + }, + { + "epoch": 0.7851077726975833, + "grad_norm": 172436.72972803572, + "learning_rate": 0.00029222275383482347, + "loss": 7.3632, + "step": 8414 + }, + { + "epoch": 0.7852010823924606, + "grad_norm": 19.04216947661014, + "learning_rate": 0.00029222034974612696, + "loss": 7.3885, + "step": 8415 + }, + { + "epoch": 0.7852943920873379, + "grad_norm": 2.0998949448485975, + "learning_rate": 0.0002922179452958062, + "loss": 7.6399, + "step": 8416 + }, + { + "epoch": 0.7853877017822152, + "grad_norm": 2.2735114147778126, + "learning_rate": 0.00029221554048386737, + "loss": 7.3262, + "step": 8417 + }, + { + "epoch": 0.7854810114770925, + "grad_norm": 1.4981230331820643, + "learning_rate": 0.00029221313531031654, + "loss": 7.6614, + "step": 8418 + }, + { + "epoch": 0.7855743211719698, + "grad_norm": 3.83223472665322, + "learning_rate": 0.00029221072977515984, + "loss": 6.8862, + "step": 8419 + }, + { + "epoch": 0.7856676308668471, + "grad_norm": 328829.06741769804, + "learning_rate": 0.00029220832387840334, + "loss": 7.4045, + "step": 8420 + }, + { + "epoch": 0.7857609405617244, + "grad_norm": 2.0390040133626255, + "learning_rate": 0.00029220591762005326, + "loss": 7.5463, + "step": 8421 + }, + { + "epoch": 0.7858542502566017, + "grad_norm": 1.0959480052501767, + "learning_rate": 0.0002922035110001156, + "loss": 7.6759, + "step": 8422 + }, + { + "epoch": 0.7859475599514789, + "grad_norm": 5.24733751125175, + "learning_rate": 0.0002922011040185966, + "loss": 7.5008, + "step": 8423 + }, + { + "epoch": 0.7860408696463562, + "grad_norm": 55.48063549995646, + "learning_rate": 0.00029219869667550225, + "loss": 7.3731, + "step": 8424 + }, + { + "epoch": 0.7861341793412335, + "grad_norm": 18779.563865917222, + "learning_rate": 0.00029219628897083873, + "loss": 7.0364, + "step": 8425 + }, + { + "epoch": 0.7862274890361108, + "grad_norm": 76893.63754230854, + "learning_rate": 0.0002921938809046122, + "loss": 7.3431, + "step": 8426 + }, + { + "epoch": 0.7863207987309881, + "grad_norm": 466105.4897331099, + "learning_rate": 0.0002921914724768288, + "loss": 7.458, + "step": 8427 + }, + { + "epoch": 0.7864141084258655, + "grad_norm": 2.8333959179417816, + "learning_rate": 0.0002921890636874945, + "loss": 7.1405, + "step": 8428 + }, + { + "epoch": 0.7865074181207428, + "grad_norm": 68883.08766577512, + "learning_rate": 0.0002921866545366156, + "loss": 7.1904, + "step": 8429 + }, + { + "epoch": 0.7866007278156201, + "grad_norm": 0.6802791349548161, + "learning_rate": 0.00029218424502419816, + "loss": 7.4662, + "step": 8430 + }, + { + "epoch": 0.7866940375104974, + "grad_norm": 0.9080769291729064, + "learning_rate": 0.00029218183515024825, + "loss": 7.5729, + "step": 8431 + }, + { + "epoch": 0.7867873472053747, + "grad_norm": 129919.7996324783, + "learning_rate": 0.00029217942491477215, + "loss": 7.3045, + "step": 8432 + }, + { + "epoch": 0.7868806569002519, + "grad_norm": 4.094329780561216, + "learning_rate": 0.00029217701431777577, + "loss": 7.1226, + "step": 8433 + }, + { + "epoch": 0.7869739665951292, + "grad_norm": 15.744909195680334, + "learning_rate": 0.00029217460335926546, + "loss": 7.1468, + "step": 8434 + }, + { + "epoch": 0.7870672762900065, + "grad_norm": 0.6663341534565312, + "learning_rate": 0.0002921721920392472, + "loss": 7.1574, + "step": 8435 + }, + { + "epoch": 0.7871605859848838, + "grad_norm": 1.0538490806084906, + "learning_rate": 0.00029216978035772717, + "loss": 7.3673, + "step": 8436 + }, + { + "epoch": 0.7872538956797611, + "grad_norm": 0.8468013021435875, + "learning_rate": 0.0002921673683147115, + "loss": 7.5501, + "step": 8437 + }, + { + "epoch": 0.7873472053746384, + "grad_norm": 0.6255241421562112, + "learning_rate": 0.0002921649559102063, + "loss": 7.3165, + "step": 8438 + }, + { + "epoch": 0.7874405150695157, + "grad_norm": 331855.68704295147, + "learning_rate": 0.0002921625431442178, + "loss": 7.3849, + "step": 8439 + }, + { + "epoch": 0.787533824764393, + "grad_norm": 1.2227124782610186, + "learning_rate": 0.000292160130016752, + "loss": 7.5825, + "step": 8440 + }, + { + "epoch": 0.7876271344592704, + "grad_norm": 12.580062604400672, + "learning_rate": 0.0002921577165278152, + "loss": 7.4651, + "step": 8441 + }, + { + "epoch": 0.7877204441541477, + "grad_norm": 20.472373726145587, + "learning_rate": 0.0002921553026774133, + "loss": 7.3609, + "step": 8442 + }, + { + "epoch": 0.787813753849025, + "grad_norm": 0.7863600561872602, + "learning_rate": 0.00029215288846555265, + "loss": 7.33, + "step": 8443 + }, + { + "epoch": 0.7879070635439022, + "grad_norm": 1.0213117155671863, + "learning_rate": 0.0002921504738922393, + "loss": 7.3225, + "step": 8444 + }, + { + "epoch": 0.7880003732387795, + "grad_norm": 1.3284039490072415, + "learning_rate": 0.0002921480589574794, + "loss": 6.8509, + "step": 8445 + }, + { + "epoch": 0.7880936829336568, + "grad_norm": 6.711242911587666, + "learning_rate": 0.00029214564366127906, + "loss": 7.2253, + "step": 8446 + }, + { + "epoch": 0.7881869926285341, + "grad_norm": 1.5272651764225909, + "learning_rate": 0.0002921432280036446, + "loss": 7.5258, + "step": 8447 + }, + { + "epoch": 0.7882803023234114, + "grad_norm": 1.530960777290043, + "learning_rate": 0.00029214081198458187, + "loss": 7.3698, + "step": 8448 + }, + { + "epoch": 0.7883736120182887, + "grad_norm": 1.3133088126076349, + "learning_rate": 0.00029213839560409717, + "loss": 7.4838, + "step": 8449 + }, + { + "epoch": 0.788466921713166, + "grad_norm": 0.8066801844102383, + "learning_rate": 0.00029213597886219666, + "loss": 7.2892, + "step": 8450 + }, + { + "epoch": 0.7885602314080433, + "grad_norm": 703904.6555029999, + "learning_rate": 0.00029213356175888645, + "loss": 7.2043, + "step": 8451 + }, + { + "epoch": 0.7886535411029206, + "grad_norm": 8.478963449870859, + "learning_rate": 0.00029213114429417275, + "loss": 7.4483, + "step": 8452 + }, + { + "epoch": 0.7887468507977979, + "grad_norm": 0.7857456861525087, + "learning_rate": 0.0002921287264680616, + "loss": 7.2523, + "step": 8453 + }, + { + "epoch": 0.7888401604926751, + "grad_norm": 0.705530659208977, + "learning_rate": 0.0002921263082805592, + "loss": 7.3155, + "step": 8454 + }, + { + "epoch": 0.7889334701875524, + "grad_norm": 2.337264306888548, + "learning_rate": 0.00029212388973167173, + "loss": 7.1284, + "step": 8455 + }, + { + "epoch": 0.7890267798824298, + "grad_norm": 1.3631284021816223, + "learning_rate": 0.0002921214708214053, + "loss": 7.4022, + "step": 8456 + }, + { + "epoch": 0.7891200895773071, + "grad_norm": 1.325879187070677, + "learning_rate": 0.000292119051549766, + "loss": 7.3753, + "step": 8457 + }, + { + "epoch": 0.7892133992721844, + "grad_norm": 3.9479166495999096, + "learning_rate": 0.0002921166319167601, + "loss": 7.2669, + "step": 8458 + }, + { + "epoch": 0.7893067089670617, + "grad_norm": 2.7648034215604227, + "learning_rate": 0.00029211421192239375, + "loss": 6.9702, + "step": 8459 + }, + { + "epoch": 0.789400018661939, + "grad_norm": 43.14865753324885, + "learning_rate": 0.000292111791566673, + "loss": 7.4313, + "step": 8460 + }, + { + "epoch": 0.7894933283568163, + "grad_norm": 31455548.866960533, + "learning_rate": 0.00029210937084960415, + "loss": 7.3573, + "step": 8461 + }, + { + "epoch": 0.7895866380516936, + "grad_norm": 1.3263505644228857, + "learning_rate": 0.00029210694977119316, + "loss": 7.09, + "step": 8462 + }, + { + "epoch": 0.7896799477465709, + "grad_norm": 87174551.92709006, + "learning_rate": 0.0002921045283314463, + "loss": 7.3433, + "step": 8463 + }, + { + "epoch": 0.7897732574414482, + "grad_norm": 1.0445236269014664, + "learning_rate": 0.0002921021065303698, + "loss": 7.2711, + "step": 8464 + }, + { + "epoch": 0.7898665671363254, + "grad_norm": 117938837.10935259, + "learning_rate": 0.00029209968436796967, + "loss": 7.1662, + "step": 8465 + }, + { + "epoch": 0.7899598768312027, + "grad_norm": 43.50401838605123, + "learning_rate": 0.0002920972618442521, + "loss": 7.5166, + "step": 8466 + }, + { + "epoch": 0.79005318652608, + "grad_norm": 2.154190834563899, + "learning_rate": 0.00029209483895922334, + "loss": 7.4437, + "step": 8467 + }, + { + "epoch": 0.7901464962209573, + "grad_norm": 0.5163754110412737, + "learning_rate": 0.0002920924157128895, + "loss": 7.2169, + "step": 8468 + }, + { + "epoch": 0.7902398059158346, + "grad_norm": 1.1762927306810913, + "learning_rate": 0.00029208999210525677, + "loss": 7.2951, + "step": 8469 + }, + { + "epoch": 0.790333115610712, + "grad_norm": 3.0786731742506905, + "learning_rate": 0.00029208756813633123, + "loss": 7.2079, + "step": 8470 + }, + { + "epoch": 0.7904264253055893, + "grad_norm": 2.3748112944009407, + "learning_rate": 0.0002920851438061191, + "loss": 6.8957, + "step": 8471 + }, + { + "epoch": 0.7905197350004666, + "grad_norm": 2322599848.9140306, + "learning_rate": 0.00029208271911462656, + "loss": 7.3373, + "step": 8472 + }, + { + "epoch": 0.7906130446953439, + "grad_norm": 2977271249.545571, + "learning_rate": 0.0002920802940618597, + "loss": 7.4128, + "step": 8473 + }, + { + "epoch": 0.7907063543902212, + "grad_norm": 24.192914338043284, + "learning_rate": 0.00029207786864782475, + "loss": 7.2165, + "step": 8474 + }, + { + "epoch": 0.7907996640850985, + "grad_norm": 261868584317.94437, + "learning_rate": 0.0002920754428725279, + "loss": 7.7892, + "step": 8475 + }, + { + "epoch": 0.7908929737799757, + "grad_norm": 1.1098858877916384, + "learning_rate": 0.0002920730167359753, + "loss": 7.4936, + "step": 8476 + }, + { + "epoch": 0.790986283474853, + "grad_norm": 0.8844545954757467, + "learning_rate": 0.0002920705902381731, + "loss": 7.2548, + "step": 8477 + }, + { + "epoch": 0.7910795931697303, + "grad_norm": 2.2481318326032405, + "learning_rate": 0.00029206816337912743, + "loss": 7.7609, + "step": 8478 + }, + { + "epoch": 0.7911729028646076, + "grad_norm": 0.6342013744115739, + "learning_rate": 0.0002920657361588445, + "loss": 7.4176, + "step": 8479 + }, + { + "epoch": 0.7912662125594849, + "grad_norm": 0.8873815844015387, + "learning_rate": 0.00029206330857733053, + "loss": 7.4531, + "step": 8480 + }, + { + "epoch": 0.7913595222543622, + "grad_norm": 78.14143566183677, + "learning_rate": 0.00029206088063459164, + "loss": 7.4739, + "step": 8481 + }, + { + "epoch": 0.7914528319492395, + "grad_norm": 11.847447540063035, + "learning_rate": 0.000292058452330634, + "loss": 7.4411, + "step": 8482 + }, + { + "epoch": 0.7915461416441169, + "grad_norm": 7.020705979842609, + "learning_rate": 0.0002920560236654638, + "loss": 7.365, + "step": 8483 + }, + { + "epoch": 0.7916394513389942, + "grad_norm": 13.604679866505334, + "learning_rate": 0.00029205359463908723, + "loss": 7.7782, + "step": 8484 + }, + { + "epoch": 0.7917327610338715, + "grad_norm": 29.72998334455206, + "learning_rate": 0.0002920511652515104, + "loss": 7.2747, + "step": 8485 + }, + { + "epoch": 0.7918260707287487, + "grad_norm": 4179416784.21484, + "learning_rate": 0.00029204873550273954, + "loss": 7.3455, + "step": 8486 + }, + { + "epoch": 0.791919380423626, + "grad_norm": 2.263801819032047, + "learning_rate": 0.0002920463053927809, + "loss": 7.3382, + "step": 8487 + }, + { + "epoch": 0.7920126901185033, + "grad_norm": 2.324125087723818, + "learning_rate": 0.00029204387492164053, + "loss": 7.5861, + "step": 8488 + }, + { + "epoch": 0.7921059998133806, + "grad_norm": 4.654547828515939, + "learning_rate": 0.00029204144408932466, + "loss": 7.2054, + "step": 8489 + }, + { + "epoch": 0.7921993095082579, + "grad_norm": 35.251545168356486, + "learning_rate": 0.00029203901289583947, + "loss": 7.7595, + "step": 8490 + }, + { + "epoch": 0.7922926192031352, + "grad_norm": 32493786326.8822, + "learning_rate": 0.00029203658134119114, + "loss": 7.1548, + "step": 8491 + }, + { + "epoch": 0.7923859288980125, + "grad_norm": 5.599938947611306, + "learning_rate": 0.0002920341494253859, + "loss": 7.4902, + "step": 8492 + }, + { + "epoch": 0.7924792385928898, + "grad_norm": 1.3292744950650597, + "learning_rate": 0.00029203171714842983, + "loss": 7.4573, + "step": 8493 + }, + { + "epoch": 0.7925725482877671, + "grad_norm": 3.4969833619680806, + "learning_rate": 0.0002920292845103292, + "loss": 7.4871, + "step": 8494 + }, + { + "epoch": 0.7926658579826444, + "grad_norm": 18.300640988610656, + "learning_rate": 0.0002920268515110902, + "loss": 7.2942, + "step": 8495 + }, + { + "epoch": 0.7927591676775217, + "grad_norm": 22.771063641030597, + "learning_rate": 0.0002920244181507189, + "loss": 7.5854, + "step": 8496 + }, + { + "epoch": 0.792852477372399, + "grad_norm": 32847541910.972866, + "learning_rate": 0.0002920219844292217, + "loss": 7.1625, + "step": 8497 + }, + { + "epoch": 0.7929457870672763, + "grad_norm": 3.661721518323653, + "learning_rate": 0.0002920195503466046, + "loss": 7.3683, + "step": 8498 + }, + { + "epoch": 0.7930390967621536, + "grad_norm": 27.594281229831996, + "learning_rate": 0.0002920171159028739, + "loss": 7.2869, + "step": 8499 + }, + { + "epoch": 0.7931324064570309, + "grad_norm": 8.252103855800602, + "learning_rate": 0.00029201468109803566, + "loss": 7.5738, + "step": 8500 + }, + { + "epoch": 0.7932257161519082, + "grad_norm": 1.4386713862274831, + "learning_rate": 0.0002920122459320962, + "loss": 7.5491, + "step": 8501 + }, + { + "epoch": 0.7933190258467855, + "grad_norm": 11.742733282491546, + "learning_rate": 0.00029200981040506166, + "loss": 7.7297, + "step": 8502 + }, + { + "epoch": 0.7934123355416628, + "grad_norm": 2.339446402159239, + "learning_rate": 0.00029200737451693824, + "loss": 7.0729, + "step": 8503 + }, + { + "epoch": 0.7935056452365401, + "grad_norm": 899645824.5949155, + "learning_rate": 0.0002920049382677321, + "loss": 7.5471, + "step": 8504 + }, + { + "epoch": 0.7935989549314174, + "grad_norm": 172133327.80052397, + "learning_rate": 0.0002920025016574495, + "loss": 7.3297, + "step": 8505 + }, + { + "epoch": 0.7936922646262947, + "grad_norm": 44.506664368243364, + "learning_rate": 0.00029200006468609664, + "loss": 7.4824, + "step": 8506 + }, + { + "epoch": 0.7937855743211719, + "grad_norm": 12.631379710368416, + "learning_rate": 0.00029199762735367964, + "loss": 7.6391, + "step": 8507 + }, + { + "epoch": 0.7938788840160492, + "grad_norm": 1160511348.3775303, + "learning_rate": 0.0002919951896602047, + "loss": 7.0876, + "step": 8508 + }, + { + "epoch": 0.7939721937109265, + "grad_norm": 3.611143466770344, + "learning_rate": 0.0002919927516056781, + "loss": 7.5209, + "step": 8509 + }, + { + "epoch": 0.7940655034058038, + "grad_norm": 0.7207580736224598, + "learning_rate": 0.000291990313190106, + "loss": 7.2538, + "step": 8510 + }, + { + "epoch": 0.7941588131006811, + "grad_norm": 4.3767294985633, + "learning_rate": 0.0002919878744134946, + "loss": 7.3073, + "step": 8511 + }, + { + "epoch": 0.7942521227955585, + "grad_norm": 3.511078169948091, + "learning_rate": 0.00029198543527585006, + "loss": 7.8529, + "step": 8512 + }, + { + "epoch": 0.7943454324904358, + "grad_norm": 2.9393719805635836, + "learning_rate": 0.00029198299577717863, + "loss": 7.2156, + "step": 8513 + }, + { + "epoch": 0.7944387421853131, + "grad_norm": 1.420616462953416, + "learning_rate": 0.0002919805559174865, + "loss": 7.4626, + "step": 8514 + }, + { + "epoch": 0.7945320518801904, + "grad_norm": 1.1419993687334318, + "learning_rate": 0.0002919781156967799, + "loss": 7.3574, + "step": 8515 + }, + { + "epoch": 0.7946253615750677, + "grad_norm": 32.745865019076014, + "learning_rate": 0.00029197567511506496, + "loss": 7.4754, + "step": 8516 + }, + { + "epoch": 0.794718671269945, + "grad_norm": 199405447.66392303, + "learning_rate": 0.00029197323417234797, + "loss": 7.4073, + "step": 8517 + }, + { + "epoch": 0.7948119809648222, + "grad_norm": 1.3148091458632933, + "learning_rate": 0.0002919707928686351, + "loss": 7.4472, + "step": 8518 + }, + { + "epoch": 0.7949052906596995, + "grad_norm": 0.9574040798012702, + "learning_rate": 0.00029196835120393256, + "loss": 7.0468, + "step": 8519 + }, + { + "epoch": 0.7949986003545768, + "grad_norm": 2.0118807660192752, + "learning_rate": 0.0002919659091782465, + "loss": 7.2362, + "step": 8520 + }, + { + "epoch": 0.7950919100494541, + "grad_norm": 0.7215906849063709, + "learning_rate": 0.00029196346679158326, + "loss": 7.1488, + "step": 8521 + }, + { + "epoch": 0.7951852197443314, + "grad_norm": 2069323891.4546647, + "learning_rate": 0.00029196102404394894, + "loss": 7.3062, + "step": 8522 + }, + { + "epoch": 0.7952785294392087, + "grad_norm": 2.7658857579342953, + "learning_rate": 0.00029195858093534983, + "loss": 7.5703, + "step": 8523 + }, + { + "epoch": 0.795371839134086, + "grad_norm": 2.078002289603633, + "learning_rate": 0.00029195613746579207, + "loss": 7.3847, + "step": 8524 + }, + { + "epoch": 0.7954651488289634, + "grad_norm": 1.5321369537308929, + "learning_rate": 0.0002919536936352819, + "loss": 7.4199, + "step": 8525 + }, + { + "epoch": 0.7955584585238407, + "grad_norm": 1.3742148880201333, + "learning_rate": 0.00029195124944382555, + "loss": 7.2318, + "step": 8526 + }, + { + "epoch": 0.795651768218718, + "grad_norm": 1.6604055277758247, + "learning_rate": 0.0002919488048914292, + "loss": 7.5243, + "step": 8527 + }, + { + "epoch": 0.7957450779135953, + "grad_norm": 1.9752849923283855, + "learning_rate": 0.0002919463599780991, + "loss": 7.633, + "step": 8528 + }, + { + "epoch": 0.7958383876084725, + "grad_norm": 3084768800.9098396, + "learning_rate": 0.00029194391470384144, + "loss": 7.2625, + "step": 8529 + }, + { + "epoch": 0.7959316973033498, + "grad_norm": 3.039544206908599, + "learning_rate": 0.00029194146906866243, + "loss": 7.6188, + "step": 8530 + }, + { + "epoch": 0.7960250069982271, + "grad_norm": 5.1976146846672, + "learning_rate": 0.0002919390230725684, + "loss": 7.1081, + "step": 8531 + }, + { + "epoch": 0.7961183166931044, + "grad_norm": 62490582900.28283, + "learning_rate": 0.0002919365767155654, + "loss": 7.4159, + "step": 8532 + }, + { + "epoch": 0.7962116263879817, + "grad_norm": 48.71297095984982, + "learning_rate": 0.0002919341299976597, + "loss": 7.6116, + "step": 8533 + }, + { + "epoch": 0.796304936082859, + "grad_norm": 63.1227919800274, + "learning_rate": 0.0002919316829188577, + "loss": 7.6254, + "step": 8534 + }, + { + "epoch": 0.7963982457777363, + "grad_norm": 3.168320172909481, + "learning_rate": 0.0002919292354791653, + "loss": 7.5521, + "step": 8535 + }, + { + "epoch": 0.7964915554726136, + "grad_norm": 1.2019826759715346, + "learning_rate": 0.000291926787678589, + "loss": 7.624, + "step": 8536 + }, + { + "epoch": 0.7965848651674909, + "grad_norm": 30.54821478069479, + "learning_rate": 0.0002919243395171349, + "loss": 7.3343, + "step": 8537 + }, + { + "epoch": 0.7966781748623682, + "grad_norm": 2.2709344507467972, + "learning_rate": 0.00029192189099480924, + "loss": 7.6303, + "step": 8538 + }, + { + "epoch": 0.7967714845572454, + "grad_norm": 13.713918374828951, + "learning_rate": 0.0002919194421116182, + "loss": 7.4868, + "step": 8539 + }, + { + "epoch": 0.7968647942521228, + "grad_norm": 19145415221.95224, + "learning_rate": 0.0002919169928675682, + "loss": 7.6438, + "step": 8540 + }, + { + "epoch": 0.7969581039470001, + "grad_norm": 3.047808737923919, + "learning_rate": 0.0002919145432626652, + "loss": 7.7216, + "step": 8541 + }, + { + "epoch": 0.7970514136418774, + "grad_norm": 8.67028532521751, + "learning_rate": 0.0002919120932969156, + "loss": 7.617, + "step": 8542 + }, + { + "epoch": 0.7971447233367547, + "grad_norm": 14.659193240927038, + "learning_rate": 0.00029190964297032555, + "loss": 7.9376, + "step": 8543 + }, + { + "epoch": 0.797238033031632, + "grad_norm": 27529912609.47193, + "learning_rate": 0.00029190719228290135, + "loss": 7.5049, + "step": 8544 + }, + { + "epoch": 0.7973313427265093, + "grad_norm": 1.3748241325193584, + "learning_rate": 0.0002919047412346492, + "loss": 7.5546, + "step": 8545 + }, + { + "epoch": 0.7974246524213866, + "grad_norm": 101092952.74774866, + "learning_rate": 0.0002919022898255753, + "loss": 7.8139, + "step": 8546 + }, + { + "epoch": 0.7975179621162639, + "grad_norm": 4.1814181114091875, + "learning_rate": 0.0002918998380556859, + "loss": 7.799, + "step": 8547 + }, + { + "epoch": 0.7976112718111412, + "grad_norm": 1.349271658348078, + "learning_rate": 0.00029189738592498733, + "loss": 7.6354, + "step": 8548 + }, + { + "epoch": 0.7977045815060185, + "grad_norm": 1.312637223565919, + "learning_rate": 0.0002918949334334857, + "loss": 7.3153, + "step": 8549 + }, + { + "epoch": 0.7977978912008957, + "grad_norm": 1.4373160745976354, + "learning_rate": 0.00029189248058118723, + "loss": 7.8488, + "step": 8550 + }, + { + "epoch": 0.797891200895773, + "grad_norm": 0.7180863866586877, + "learning_rate": 0.00029189002736809824, + "loss": 7.087, + "step": 8551 + }, + { + "epoch": 0.7979845105906503, + "grad_norm": 3395909524.6710973, + "learning_rate": 0.000291887573794225, + "loss": 7.4805, + "step": 8552 + }, + { + "epoch": 0.7980778202855277, + "grad_norm": 15.79136458260227, + "learning_rate": 0.00029188511985957363, + "loss": 7.8592, + "step": 8553 + }, + { + "epoch": 0.798171129980405, + "grad_norm": 1.0088959863356546, + "learning_rate": 0.00029188266556415043, + "loss": 7.7547, + "step": 8554 + }, + { + "epoch": 0.7982644396752823, + "grad_norm": 0.9970311733920342, + "learning_rate": 0.0002918802109079616, + "loss": 7.373, + "step": 8555 + }, + { + "epoch": 0.7983577493701596, + "grad_norm": 1.963950188082622, + "learning_rate": 0.00029187775589101353, + "loss": 7.2321, + "step": 8556 + }, + { + "epoch": 0.7984510590650369, + "grad_norm": 7.001848500865314, + "learning_rate": 0.0002918753005133123, + "loss": 7.4214, + "step": 8557 + }, + { + "epoch": 0.7985443687599142, + "grad_norm": 1.0746277580261374, + "learning_rate": 0.0002918728447748642, + "loss": 7.0396, + "step": 8558 + }, + { + "epoch": 0.7986376784547915, + "grad_norm": 156789.60486756236, + "learning_rate": 0.00029187038867567553, + "loss": 7.3775, + "step": 8559 + }, + { + "epoch": 0.7987309881496687, + "grad_norm": 5.038867417429068, + "learning_rate": 0.0002918679322157524, + "loss": 7.2357, + "step": 8560 + }, + { + "epoch": 0.798824297844546, + "grad_norm": 1.5231654243128365, + "learning_rate": 0.0002918654753951012, + "loss": 7.4929, + "step": 8561 + }, + { + "epoch": 0.7989176075394233, + "grad_norm": 1.5592416153455282, + "learning_rate": 0.0002918630182137281, + "loss": 7.557, + "step": 8562 + }, + { + "epoch": 0.7990109172343006, + "grad_norm": 1481.9939384481388, + "learning_rate": 0.0002918605606716394, + "loss": 7.5121, + "step": 8563 + }, + { + "epoch": 0.7991042269291779, + "grad_norm": 3.5291203671090807, + "learning_rate": 0.00029185810276884127, + "loss": 7.6524, + "step": 8564 + }, + { + "epoch": 0.7991975366240552, + "grad_norm": 9327236.22433792, + "learning_rate": 0.00029185564450534004, + "loss": 7.1096, + "step": 8565 + }, + { + "epoch": 0.7992908463189325, + "grad_norm": 1.5301121132473339, + "learning_rate": 0.0002918531858811419, + "loss": 7.2754, + "step": 8566 + }, + { + "epoch": 0.7993841560138099, + "grad_norm": 1.0770601389095624, + "learning_rate": 0.00029185072689625317, + "loss": 7.4729, + "step": 8567 + }, + { + "epoch": 0.7994774657086872, + "grad_norm": 1.6008802249851388, + "learning_rate": 0.00029184826755068, + "loss": 7.5262, + "step": 8568 + }, + { + "epoch": 0.7995707754035645, + "grad_norm": 2.0229443003057406, + "learning_rate": 0.00029184580784442877, + "loss": 7.5659, + "step": 8569 + }, + { + "epoch": 0.7996640850984418, + "grad_norm": 5.555732162067937, + "learning_rate": 0.0002918433477775056, + "loss": 7.292, + "step": 8570 + }, + { + "epoch": 0.799757394793319, + "grad_norm": 2.231385631880682, + "learning_rate": 0.0002918408873499169, + "loss": 7.4169, + "step": 8571 + }, + { + "epoch": 0.7998507044881963, + "grad_norm": 4.898100687773479, + "learning_rate": 0.0002918384265616688, + "loss": 7.3343, + "step": 8572 + }, + { + "epoch": 0.7999440141830736, + "grad_norm": 780680504.230104, + "learning_rate": 0.0002918359654127676, + "loss": 7.311, + "step": 8573 + }, + { + "epoch": 0.8000373238779509, + "grad_norm": 37.74066902630752, + "learning_rate": 0.0002918335039032195, + "loss": 7.5126, + "step": 8574 + }, + { + "epoch": 0.8001306335728282, + "grad_norm": 0.7980527965150399, + "learning_rate": 0.00029183104203303086, + "loss": 7.5063, + "step": 8575 + }, + { + "epoch": 0.8002239432677055, + "grad_norm": 0.9449001251804472, + "learning_rate": 0.00029182857980220787, + "loss": 7.4898, + "step": 8576 + }, + { + "epoch": 0.8003172529625828, + "grad_norm": 2.136903026307096, + "learning_rate": 0.00029182611721075686, + "loss": 7.5201, + "step": 8577 + }, + { + "epoch": 0.8004105626574601, + "grad_norm": 1.3531853776485017, + "learning_rate": 0.000291823654258684, + "loss": 7.4144, + "step": 8578 + }, + { + "epoch": 0.8005038723523374, + "grad_norm": 57780791.14213881, + "learning_rate": 0.0002918211909459956, + "loss": 7.5166, + "step": 8579 + }, + { + "epoch": 0.8005971820472148, + "grad_norm": 2.1605848242179624, + "learning_rate": 0.00029181872727269796, + "loss": 7.3127, + "step": 8580 + }, + { + "epoch": 0.8006904917420921, + "grad_norm": 1.3371108482659178, + "learning_rate": 0.00029181626323879727, + "loss": 7.2358, + "step": 8581 + }, + { + "epoch": 0.8007838014369693, + "grad_norm": 1.841312773815445, + "learning_rate": 0.0002918137988442999, + "loss": 7.4987, + "step": 8582 + }, + { + "epoch": 0.8008771111318466, + "grad_norm": 275583450.71622777, + "learning_rate": 0.00029181133408921197, + "loss": 7.1575, + "step": 8583 + }, + { + "epoch": 0.8009704208267239, + "grad_norm": 0.8246468860705797, + "learning_rate": 0.00029180886897353984, + "loss": 6.8649, + "step": 8584 + }, + { + "epoch": 0.8010637305216012, + "grad_norm": 0.9022760279553615, + "learning_rate": 0.00029180640349728976, + "loss": 7.3069, + "step": 8585 + }, + { + "epoch": 0.8011570402164785, + "grad_norm": 0.735070807326234, + "learning_rate": 0.000291803937660468, + "loss": 7.2543, + "step": 8586 + }, + { + "epoch": 0.8012503499113558, + "grad_norm": 203933007.25248674, + "learning_rate": 0.0002918014714630809, + "loss": 7.1499, + "step": 8587 + }, + { + "epoch": 0.8013436596062331, + "grad_norm": 18791302.591504212, + "learning_rate": 0.0002917990049051346, + "loss": 7.1185, + "step": 8588 + }, + { + "epoch": 0.8014369693011104, + "grad_norm": 0.6322092224424644, + "learning_rate": 0.0002917965379866354, + "loss": 7.4917, + "step": 8589 + }, + { + "epoch": 0.8015302789959877, + "grad_norm": 2.191999655303696, + "learning_rate": 0.0002917940707075897, + "loss": 7.072, + "step": 8590 + }, + { + "epoch": 0.801623588690865, + "grad_norm": 2.5756445053630626, + "learning_rate": 0.00029179160306800364, + "loss": 7.2884, + "step": 8591 + }, + { + "epoch": 0.8017168983857422, + "grad_norm": 0.8431601919152588, + "learning_rate": 0.0002917891350678835, + "loss": 7.7741, + "step": 8592 + }, + { + "epoch": 0.8018102080806195, + "grad_norm": 3.059707104003218, + "learning_rate": 0.00029178666670723563, + "loss": 7.2995, + "step": 8593 + }, + { + "epoch": 0.8019035177754968, + "grad_norm": 6.422953677363855, + "learning_rate": 0.00029178419798606625, + "loss": 7.6017, + "step": 8594 + }, + { + "epoch": 0.8019968274703742, + "grad_norm": 2.7594435761061966, + "learning_rate": 0.0002917817289043817, + "loss": 7.4395, + "step": 8595 + }, + { + "epoch": 0.8020901371652515, + "grad_norm": 0.5128280454768387, + "learning_rate": 0.00029177925946218816, + "loss": 7.2631, + "step": 8596 + }, + { + "epoch": 0.8021834468601288, + "grad_norm": 21.444548374698613, + "learning_rate": 0.000291776789659492, + "loss": 7.4168, + "step": 8597 + }, + { + "epoch": 0.8022767565550061, + "grad_norm": 7.275312289055664, + "learning_rate": 0.0002917743194962994, + "loss": 7.3325, + "step": 8598 + }, + { + "epoch": 0.8023700662498834, + "grad_norm": 0.735379779733574, + "learning_rate": 0.00029177184897261673, + "loss": 7.1944, + "step": 8599 + }, + { + "epoch": 0.8024633759447607, + "grad_norm": 1718163151.23197, + "learning_rate": 0.00029176937808845024, + "loss": 7.2262, + "step": 8600 + }, + { + "epoch": 0.802556685639638, + "grad_norm": 1311428453.7939284, + "learning_rate": 0.00029176690684380626, + "loss": 7.5659, + "step": 8601 + }, + { + "epoch": 0.8026499953345153, + "grad_norm": 4.1364669166351336, + "learning_rate": 0.000291764435238691, + "loss": 7.4381, + "step": 8602 + }, + { + "epoch": 0.8027433050293925, + "grad_norm": 83033363.21346651, + "learning_rate": 0.00029176196327311077, + "loss": 7.3473, + "step": 8603 + }, + { + "epoch": 0.8028366147242698, + "grad_norm": 99.09726882963074, + "learning_rate": 0.00029175949094707184, + "loss": 7.3459, + "step": 8604 + }, + { + "epoch": 0.8029299244191471, + "grad_norm": 0.4482554575346646, + "learning_rate": 0.00029175701826058057, + "loss": 7.5149, + "step": 8605 + }, + { + "epoch": 0.8030232341140244, + "grad_norm": 1985785806.1598003, + "learning_rate": 0.00029175454521364314, + "loss": 7.28, + "step": 8606 + }, + { + "epoch": 0.8031165438089017, + "grad_norm": 0.8362760681906712, + "learning_rate": 0.0002917520718062659, + "loss": 7.3261, + "step": 8607 + }, + { + "epoch": 0.803209853503779, + "grad_norm": 5.168281139144198, + "learning_rate": 0.0002917495980384552, + "loss": 7.259, + "step": 8608 + }, + { + "epoch": 0.8033031631986564, + "grad_norm": 0.7383726021834656, + "learning_rate": 0.0002917471239102172, + "loss": 7.3625, + "step": 8609 + }, + { + "epoch": 0.8033964728935337, + "grad_norm": 0.8398609843266054, + "learning_rate": 0.00029174464942155823, + "loss": 7.2675, + "step": 8610 + }, + { + "epoch": 0.803489782588411, + "grad_norm": 0.62974780651587, + "learning_rate": 0.0002917421745724846, + "loss": 7.2726, + "step": 8611 + }, + { + "epoch": 0.8035830922832883, + "grad_norm": 2.9035421020282, + "learning_rate": 0.0002917396993630027, + "loss": 7.2844, + "step": 8612 + }, + { + "epoch": 0.8036764019781655, + "grad_norm": 2.3808149650596238, + "learning_rate": 0.00029173722379311867, + "loss": 7.2728, + "step": 8613 + }, + { + "epoch": 0.8037697116730428, + "grad_norm": 2.7279035708317556, + "learning_rate": 0.00029173474786283886, + "loss": 7.5099, + "step": 8614 + }, + { + "epoch": 0.8038630213679201, + "grad_norm": 8.81811995227567, + "learning_rate": 0.0002917322715721696, + "loss": 7.1937, + "step": 8615 + }, + { + "epoch": 0.8039563310627974, + "grad_norm": 36628769080.966606, + "learning_rate": 0.00029172979492111717, + "loss": 7.5367, + "step": 8616 + }, + { + "epoch": 0.8040496407576747, + "grad_norm": 1.5236620513513814, + "learning_rate": 0.00029172731790968776, + "loss": 7.1372, + "step": 8617 + }, + { + "epoch": 0.804142950452552, + "grad_norm": 228.5326113981374, + "learning_rate": 0.0002917248405378878, + "loss": 7.344, + "step": 8618 + }, + { + "epoch": 0.8042362601474293, + "grad_norm": 0.547046549534308, + "learning_rate": 0.0002917223628057236, + "loss": 7.276, + "step": 8619 + }, + { + "epoch": 0.8043295698423066, + "grad_norm": 1.2889790932504535, + "learning_rate": 0.0002917198847132014, + "loss": 7.6128, + "step": 8620 + }, + { + "epoch": 0.8044228795371839, + "grad_norm": 2.274973599365072, + "learning_rate": 0.0002917174062603275, + "loss": 7.482, + "step": 8621 + }, + { + "epoch": 0.8045161892320613, + "grad_norm": 0.8901238500203442, + "learning_rate": 0.0002917149274471082, + "loss": 7.0445, + "step": 8622 + }, + { + "epoch": 0.8046094989269386, + "grad_norm": 12.77145042553792, + "learning_rate": 0.00029171244827354986, + "loss": 7.3342, + "step": 8623 + }, + { + "epoch": 0.8047028086218158, + "grad_norm": 4076179006.5181975, + "learning_rate": 0.00029170996873965877, + "loss": 7.5403, + "step": 8624 + }, + { + "epoch": 0.8047961183166931, + "grad_norm": 1.1521588474743194, + "learning_rate": 0.0002917074888454411, + "loss": 7.0905, + "step": 8625 + }, + { + "epoch": 0.8048894280115704, + "grad_norm": 3.1653678447372102, + "learning_rate": 0.0002917050085909034, + "loss": 7.2752, + "step": 8626 + }, + { + "epoch": 0.8049827377064477, + "grad_norm": 16.75367079797329, + "learning_rate": 0.00029170252797605173, + "loss": 7.2515, + "step": 8627 + }, + { + "epoch": 0.805076047401325, + "grad_norm": 2139919164.9913611, + "learning_rate": 0.00029170004700089253, + "loss": 7.4616, + "step": 8628 + }, + { + "epoch": 0.8051693570962023, + "grad_norm": 1.508886786310022, + "learning_rate": 0.0002916975656654321, + "loss": 7.2982, + "step": 8629 + }, + { + "epoch": 0.8052626667910796, + "grad_norm": 4.531736819975781, + "learning_rate": 0.0002916950839696767, + "loss": 7.4702, + "step": 8630 + }, + { + "epoch": 0.8053559764859569, + "grad_norm": 2.6996744053840076, + "learning_rate": 0.00029169260191363273, + "loss": 7.1813, + "step": 8631 + }, + { + "epoch": 0.8054492861808342, + "grad_norm": 24.25699638918701, + "learning_rate": 0.0002916901194973064, + "loss": 7.3471, + "step": 8632 + }, + { + "epoch": 0.8055425958757115, + "grad_norm": 12.061502207293236, + "learning_rate": 0.0002916876367207041, + "loss": 7.5, + "step": 8633 + }, + { + "epoch": 0.8056359055705888, + "grad_norm": 1.0119321699755082, + "learning_rate": 0.0002916851535838321, + "loss": 7.3492, + "step": 8634 + }, + { + "epoch": 0.805729215265466, + "grad_norm": 2.731618281838146, + "learning_rate": 0.0002916826700866967, + "loss": 7.2638, + "step": 8635 + }, + { + "epoch": 0.8058225249603433, + "grad_norm": 2.2427792561762248, + "learning_rate": 0.0002916801862293043, + "loss": 7.3823, + "step": 8636 + }, + { + "epoch": 0.8059158346552207, + "grad_norm": 19.855105892480815, + "learning_rate": 0.00029167770201166105, + "loss": 7.7016, + "step": 8637 + }, + { + "epoch": 0.806009144350098, + "grad_norm": 6.972196872210479, + "learning_rate": 0.0002916752174337735, + "loss": 7.6007, + "step": 8638 + }, + { + "epoch": 0.8061024540449753, + "grad_norm": 206.73845157294372, + "learning_rate": 0.0002916727324956478, + "loss": 7.3773, + "step": 8639 + }, + { + "epoch": 0.8061957637398526, + "grad_norm": 1.0898038963680454, + "learning_rate": 0.00029167024719729026, + "loss": 7.6315, + "step": 8640 + }, + { + "epoch": 0.8062890734347299, + "grad_norm": 3.8735930735789488, + "learning_rate": 0.00029166776153870727, + "loss": 7.5387, + "step": 8641 + }, + { + "epoch": 0.8063823831296072, + "grad_norm": 1.2182330335316847, + "learning_rate": 0.0002916652755199051, + "loss": 7.7602, + "step": 8642 + }, + { + "epoch": 0.8064756928244845, + "grad_norm": 1.0069751924389034, + "learning_rate": 0.00029166278914089015, + "loss": 7.8053, + "step": 8643 + }, + { + "epoch": 0.8065690025193618, + "grad_norm": 1.238533865840918, + "learning_rate": 0.00029166030240166866, + "loss": 7.4062, + "step": 8644 + }, + { + "epoch": 0.806662312214239, + "grad_norm": 1.8320269378453995, + "learning_rate": 0.000291657815302247, + "loss": 7.6361, + "step": 8645 + }, + { + "epoch": 0.8067556219091163, + "grad_norm": 1.4667248112496265, + "learning_rate": 0.00029165532784263145, + "loss": 7.8033, + "step": 8646 + }, + { + "epoch": 0.8068489316039936, + "grad_norm": 1.2834568399261708, + "learning_rate": 0.0002916528400228284, + "loss": 7.2749, + "step": 8647 + }, + { + "epoch": 0.8069422412988709, + "grad_norm": 77345288722412.11, + "learning_rate": 0.00029165035184284415, + "loss": 7.2718, + "step": 8648 + }, + { + "epoch": 0.8070355509937482, + "grad_norm": 1.1977622737868647, + "learning_rate": 0.00029164786330268495, + "loss": 7.7946, + "step": 8649 + }, + { + "epoch": 0.8071288606886255, + "grad_norm": 2.143758156785559, + "learning_rate": 0.00029164537440235725, + "loss": 7.9849, + "step": 8650 + }, + { + "epoch": 0.8072221703835029, + "grad_norm": 24852583592834.062, + "learning_rate": 0.0002916428851418673, + "loss": 7.2137, + "step": 8651 + }, + { + "epoch": 0.8073154800783802, + "grad_norm": 1.5894276391443358, + "learning_rate": 0.0002916403955212215, + "loss": 7.5632, + "step": 8652 + }, + { + "epoch": 0.8074087897732575, + "grad_norm": 1.1644013320071267, + "learning_rate": 0.00029163790554042604, + "loss": 7.4976, + "step": 8653 + }, + { + "epoch": 0.8075020994681348, + "grad_norm": 1.1656186106262385, + "learning_rate": 0.00029163541519948736, + "loss": 7.2704, + "step": 8654 + }, + { + "epoch": 0.8075954091630121, + "grad_norm": 2.438360304815841, + "learning_rate": 0.0002916329244984118, + "loss": 7.5836, + "step": 8655 + }, + { + "epoch": 0.8076887188578893, + "grad_norm": 0.5817807047092802, + "learning_rate": 0.0002916304334372057, + "loss": 7.3798, + "step": 8656 + }, + { + "epoch": 0.8077820285527666, + "grad_norm": 0.533394798439258, + "learning_rate": 0.00029162794201587533, + "loss": 7.2043, + "step": 8657 + }, + { + "epoch": 0.8078753382476439, + "grad_norm": 1274373677344.0676, + "learning_rate": 0.00029162545023442705, + "loss": 7.8026, + "step": 8658 + }, + { + "epoch": 0.8079686479425212, + "grad_norm": 1291339559278.75, + "learning_rate": 0.0002916229580928672, + "loss": 7.6575, + "step": 8659 + }, + { + "epoch": 0.8080619576373985, + "grad_norm": 1.7862482441811356, + "learning_rate": 0.0002916204655912022, + "loss": 7.9736, + "step": 8660 + }, + { + "epoch": 0.8081552673322758, + "grad_norm": 1374960941248.859, + "learning_rate": 0.00029161797272943816, + "loss": 7.4611, + "step": 8661 + }, + { + "epoch": 0.8082485770271531, + "grad_norm": 2.584415278034166, + "learning_rate": 0.00029161547950758165, + "loss": 7.9201, + "step": 8662 + }, + { + "epoch": 0.8083418867220304, + "grad_norm": 7.0467635562902355, + "learning_rate": 0.0002916129859256389, + "loss": 7.6473, + "step": 8663 + }, + { + "epoch": 0.8084351964169078, + "grad_norm": 1.5430939361569147, + "learning_rate": 0.00029161049198361637, + "loss": 7.7473, + "step": 8664 + }, + { + "epoch": 0.8085285061117851, + "grad_norm": 1.6775930331520348, + "learning_rate": 0.0002916079976815202, + "loss": 7.5763, + "step": 8665 + }, + { + "epoch": 0.8086218158066623, + "grad_norm": 1.3321007825348028, + "learning_rate": 0.0002916055030193568, + "loss": 7.5577, + "step": 8666 + }, + { + "epoch": 0.8087151255015396, + "grad_norm": 1.2798646569588876, + "learning_rate": 0.00029160300799713267, + "loss": 7.9413, + "step": 8667 + }, + { + "epoch": 0.8088084351964169, + "grad_norm": 257402215575.16687, + "learning_rate": 0.000291600512614854, + "loss": 7.5906, + "step": 8668 + }, + { + "epoch": 0.8089017448912942, + "grad_norm": 1.2741807231869242, + "learning_rate": 0.00029159801687252715, + "loss": 7.5603, + "step": 8669 + }, + { + "epoch": 0.8089950545861715, + "grad_norm": 1.0313415392618817, + "learning_rate": 0.0002915955207701585, + "loss": 7.3895, + "step": 8670 + }, + { + "epoch": 0.8090883642810488, + "grad_norm": 1.685964894441955, + "learning_rate": 0.0002915930243077544, + "loss": 7.9876, + "step": 8671 + }, + { + "epoch": 0.8091816739759261, + "grad_norm": 3.7531139978567776, + "learning_rate": 0.00029159052748532115, + "loss": 7.6796, + "step": 8672 + }, + { + "epoch": 0.8092749836708034, + "grad_norm": 627.3394745784504, + "learning_rate": 0.00029158803030286516, + "loss": 7.792, + "step": 8673 + }, + { + "epoch": 0.8093682933656807, + "grad_norm": 1.4716244741739894, + "learning_rate": 0.00029158553276039274, + "loss": 7.884, + "step": 8674 + }, + { + "epoch": 0.809461603060558, + "grad_norm": 1.3818618720287077, + "learning_rate": 0.0002915830348579102, + "loss": 7.8355, + "step": 8675 + }, + { + "epoch": 0.8095549127554353, + "grad_norm": 1.2645122661603891, + "learning_rate": 0.000291580536595424, + "loss": 7.7752, + "step": 8676 + }, + { + "epoch": 0.8096482224503125, + "grad_norm": 1.5617785473131007, + "learning_rate": 0.0002915780379729404, + "loss": 7.5446, + "step": 8677 + }, + { + "epoch": 0.8097415321451898, + "grad_norm": 0.9619522584269878, + "learning_rate": 0.00029157553899046584, + "loss": 7.7088, + "step": 8678 + }, + { + "epoch": 0.8098348418400672, + "grad_norm": 0.7483351155719994, + "learning_rate": 0.0002915730396480066, + "loss": 7.4306, + "step": 8679 + }, + { + "epoch": 0.8099281515349445, + "grad_norm": 1.2917138136770778, + "learning_rate": 0.00029157053994556905, + "loss": 7.7148, + "step": 8680 + }, + { + "epoch": 0.8100214612298218, + "grad_norm": 3143152487.5101914, + "learning_rate": 0.0002915680398831595, + "loss": 7.4378, + "step": 8681 + }, + { + "epoch": 0.8101147709246991, + "grad_norm": 1.3059522252394362, + "learning_rate": 0.00029156553946078445, + "loss": 7.4305, + "step": 8682 + }, + { + "epoch": 0.8102080806195764, + "grad_norm": 1.5923907743291743, + "learning_rate": 0.00029156303867845015, + "loss": 7.6757, + "step": 8683 + }, + { + "epoch": 0.8103013903144537, + "grad_norm": 494947144142.87506, + "learning_rate": 0.00029156053753616294, + "loss": 7.7958, + "step": 8684 + }, + { + "epoch": 0.810394700009331, + "grad_norm": 5419940133.441003, + "learning_rate": 0.0002915580360339293, + "loss": 7.5795, + "step": 8685 + }, + { + "epoch": 0.8104880097042083, + "grad_norm": 2.0679780668279006, + "learning_rate": 0.0002915555341717554, + "loss": 7.5477, + "step": 8686 + }, + { + "epoch": 0.8105813193990856, + "grad_norm": 2.2864739343225398, + "learning_rate": 0.00029155303194964774, + "loss": 7.2829, + "step": 8687 + }, + { + "epoch": 0.8106746290939628, + "grad_norm": 1.727837669463973, + "learning_rate": 0.00029155052936761264, + "loss": 7.3877, + "step": 8688 + }, + { + "epoch": 0.8107679387888401, + "grad_norm": 10502699487.031807, + "learning_rate": 0.0002915480264256565, + "loss": 7.3329, + "step": 8689 + }, + { + "epoch": 0.8108612484837174, + "grad_norm": 1.2464659990806723, + "learning_rate": 0.0002915455231237857, + "loss": 7.6403, + "step": 8690 + }, + { + "epoch": 0.8109545581785947, + "grad_norm": 1.5061432042150171, + "learning_rate": 0.00029154301946200645, + "loss": 7.373, + "step": 8691 + }, + { + "epoch": 0.811047867873472, + "grad_norm": 1.847211677867772, + "learning_rate": 0.0002915405154403253, + "loss": 7.4495, + "step": 8692 + }, + { + "epoch": 0.8111411775683494, + "grad_norm": 1.715426004060339, + "learning_rate": 0.0002915380110587486, + "loss": 7.653, + "step": 8693 + }, + { + "epoch": 0.8112344872632267, + "grad_norm": 0.8165850398179346, + "learning_rate": 0.0002915355063172826, + "loss": 7.2617, + "step": 8694 + }, + { + "epoch": 0.811327796958104, + "grad_norm": 1.0581343129630802, + "learning_rate": 0.00029153300121593373, + "loss": 7.8509, + "step": 8695 + }, + { + "epoch": 0.8114211066529813, + "grad_norm": 1.5553953002153769, + "learning_rate": 0.0002915304957547084, + "loss": 7.5534, + "step": 8696 + }, + { + "epoch": 0.8115144163478586, + "grad_norm": 1.5696604832467904, + "learning_rate": 0.0002915279899336129, + "loss": 7.641, + "step": 8697 + }, + { + "epoch": 0.8116077260427358, + "grad_norm": 1.240656436205625, + "learning_rate": 0.00029152548375265366, + "loss": 7.6624, + "step": 8698 + }, + { + "epoch": 0.8117010357376131, + "grad_norm": 2.75916701498275, + "learning_rate": 0.00029152297721183706, + "loss": 7.3793, + "step": 8699 + }, + { + "epoch": 0.8117943454324904, + "grad_norm": 1804732391.3304858, + "learning_rate": 0.0002915204703111694, + "loss": 8.0759, + "step": 8700 + }, + { + "epoch": 0.8118876551273677, + "grad_norm": 0.8928515026781805, + "learning_rate": 0.00029151796305065716, + "loss": 7.429, + "step": 8701 + }, + { + "epoch": 0.811980964822245, + "grad_norm": 62350654720.73286, + "learning_rate": 0.0002915154554303066, + "loss": 7.2396, + "step": 8702 + }, + { + "epoch": 0.8120742745171223, + "grad_norm": 0.7594646132719645, + "learning_rate": 0.0002915129474501242, + "loss": 7.216, + "step": 8703 + }, + { + "epoch": 0.8121675842119996, + "grad_norm": 1.2398937328176314, + "learning_rate": 0.0002915104391101163, + "loss": 7.6869, + "step": 8704 + }, + { + "epoch": 0.812260893906877, + "grad_norm": 1.1977718828049895, + "learning_rate": 0.00029150793041028927, + "loss": 7.5345, + "step": 8705 + }, + { + "epoch": 0.8123542036017543, + "grad_norm": 1.064732971722802, + "learning_rate": 0.0002915054213506495, + "loss": 7.167, + "step": 8706 + }, + { + "epoch": 0.8124475132966316, + "grad_norm": 0.6731241097262053, + "learning_rate": 0.00029150291193120336, + "loss": 7.6157, + "step": 8707 + }, + { + "epoch": 0.8125408229915089, + "grad_norm": 1.0766937077278709, + "learning_rate": 0.00029150040215195717, + "loss": 7.6471, + "step": 8708 + }, + { + "epoch": 0.8126341326863861, + "grad_norm": 1.2520549292912175, + "learning_rate": 0.0002914978920129174, + "loss": 7.4431, + "step": 8709 + }, + { + "epoch": 0.8127274423812634, + "grad_norm": 2.497057100461244, + "learning_rate": 0.00029149538151409046, + "loss": 7.3268, + "step": 8710 + }, + { + "epoch": 0.8128207520761407, + "grad_norm": 1.1289725520302138, + "learning_rate": 0.0002914928706554826, + "loss": 7.5555, + "step": 8711 + }, + { + "epoch": 0.812914061771018, + "grad_norm": 1.018119924407064, + "learning_rate": 0.00029149035943710035, + "loss": 7.7871, + "step": 8712 + }, + { + "epoch": 0.8130073714658953, + "grad_norm": 0.7122959587914454, + "learning_rate": 0.00029148784785895, + "loss": 7.3756, + "step": 8713 + }, + { + "epoch": 0.8131006811607726, + "grad_norm": 0.6390425270122767, + "learning_rate": 0.00029148533592103797, + "loss": 7.6576, + "step": 8714 + }, + { + "epoch": 0.8131939908556499, + "grad_norm": 1.358422116440131, + "learning_rate": 0.00029148282362337065, + "loss": 7.6917, + "step": 8715 + }, + { + "epoch": 0.8132873005505272, + "grad_norm": 1.4384497819315614, + "learning_rate": 0.0002914803109659544, + "loss": 7.4014, + "step": 8716 + }, + { + "epoch": 0.8133806102454045, + "grad_norm": 397539957616.1707, + "learning_rate": 0.00029147779794879564, + "loss": 7.4203, + "step": 8717 + }, + { + "epoch": 0.8134739199402818, + "grad_norm": 366004448760.0785, + "learning_rate": 0.00029147528457190074, + "loss": 7.6044, + "step": 8718 + }, + { + "epoch": 0.813567229635159, + "grad_norm": 1.379899211750611, + "learning_rate": 0.0002914727708352761, + "loss": 7.3246, + "step": 8719 + }, + { + "epoch": 0.8136605393300363, + "grad_norm": 0.7427714701368545, + "learning_rate": 0.0002914702567389281, + "loss": 7.4986, + "step": 8720 + }, + { + "epoch": 0.8137538490249137, + "grad_norm": 10298983552.211655, + "learning_rate": 0.0002914677422828632, + "loss": 7.3304, + "step": 8721 + }, + { + "epoch": 0.813847158719791, + "grad_norm": 0.6864853876748626, + "learning_rate": 0.0002914652274670877, + "loss": 7.4206, + "step": 8722 + }, + { + "epoch": 0.8139404684146683, + "grad_norm": 0.9929058919823328, + "learning_rate": 0.000291462712291608, + "loss": 7.6512, + "step": 8723 + }, + { + "epoch": 0.8140337781095456, + "grad_norm": 16.32021416363816, + "learning_rate": 0.00029146019675643054, + "loss": 7.4656, + "step": 8724 + }, + { + "epoch": 0.8141270878044229, + "grad_norm": 0.7967227884799309, + "learning_rate": 0.00029145768086156175, + "loss": 7.2954, + "step": 8725 + }, + { + "epoch": 0.8142203974993002, + "grad_norm": 0.5855090745608349, + "learning_rate": 0.00029145516460700796, + "loss": 7.3901, + "step": 8726 + }, + { + "epoch": 0.8143137071941775, + "grad_norm": 1.0714127535170113, + "learning_rate": 0.0002914526479927756, + "loss": 7.4245, + "step": 8727 + }, + { + "epoch": 0.8144070168890548, + "grad_norm": 0.897037628044998, + "learning_rate": 0.000291450131018871, + "loss": 7.6257, + "step": 8728 + }, + { + "epoch": 0.8145003265839321, + "grad_norm": 0.6661308395901309, + "learning_rate": 0.00029144761368530067, + "loss": 7.3189, + "step": 8729 + }, + { + "epoch": 0.8145936362788093, + "grad_norm": 1.0562198153640643, + "learning_rate": 0.0002914450959920709, + "loss": 7.4546, + "step": 8730 + }, + { + "epoch": 0.8146869459736866, + "grad_norm": 1.4446668559192706, + "learning_rate": 0.0002914425779391882, + "loss": 7.7939, + "step": 8731 + }, + { + "epoch": 0.8147802556685639, + "grad_norm": 0.6461618791598002, + "learning_rate": 0.0002914400595266589, + "loss": 7.3456, + "step": 8732 + }, + { + "epoch": 0.8148735653634412, + "grad_norm": 0.7537994291989439, + "learning_rate": 0.0002914375407544895, + "loss": 7.2761, + "step": 8733 + }, + { + "epoch": 0.8149668750583186, + "grad_norm": 1.196669625396778, + "learning_rate": 0.0002914350216226862, + "loss": 7.8243, + "step": 8734 + }, + { + "epoch": 0.8150601847531959, + "grad_norm": 1.793523768015727, + "learning_rate": 0.0002914325021312556, + "loss": 7.6617, + "step": 8735 + }, + { + "epoch": 0.8151534944480732, + "grad_norm": 0.9167181515205441, + "learning_rate": 0.00029142998228020407, + "loss": 7.8977, + "step": 8736 + }, + { + "epoch": 0.8152468041429505, + "grad_norm": 3376510468.0446286, + "learning_rate": 0.000291427462069538, + "loss": 7.2591, + "step": 8737 + }, + { + "epoch": 0.8153401138378278, + "grad_norm": 8722533032.811508, + "learning_rate": 0.0002914249414992638, + "loss": 7.4705, + "step": 8738 + }, + { + "epoch": 0.8154334235327051, + "grad_norm": 0.942308164644138, + "learning_rate": 0.0002914224205693878, + "loss": 7.0731, + "step": 8739 + }, + { + "epoch": 0.8155267332275824, + "grad_norm": 1.5725844856017313, + "learning_rate": 0.0002914198992799165, + "loss": 7.3213, + "step": 8740 + }, + { + "epoch": 0.8156200429224596, + "grad_norm": 3.6492090096733674, + "learning_rate": 0.0002914173776308563, + "loss": 7.4916, + "step": 8741 + }, + { + "epoch": 0.8157133526173369, + "grad_norm": 0.7468303293023941, + "learning_rate": 0.0002914148556222136, + "loss": 7.3051, + "step": 8742 + }, + { + "epoch": 0.8158066623122142, + "grad_norm": 1.0095045186189706, + "learning_rate": 0.0002914123332539947, + "loss": 7.4072, + "step": 8743 + }, + { + "epoch": 0.8158999720070915, + "grad_norm": 0.6614037924020508, + "learning_rate": 0.0002914098105262063, + "loss": 7.2227, + "step": 8744 + }, + { + "epoch": 0.8159932817019688, + "grad_norm": 0.719136733526121, + "learning_rate": 0.00029140728743885455, + "loss": 7.0746, + "step": 8745 + }, + { + "epoch": 0.8160865913968461, + "grad_norm": 1.0088885723652765, + "learning_rate": 0.000291404763991946, + "loss": 7.5444, + "step": 8746 + }, + { + "epoch": 0.8161799010917234, + "grad_norm": 0.9003076949818276, + "learning_rate": 0.00029140224018548696, + "loss": 7.5515, + "step": 8747 + }, + { + "epoch": 0.8162732107866008, + "grad_norm": 13655562538.787577, + "learning_rate": 0.00029139971601948396, + "loss": 7.4965, + "step": 8748 + }, + { + "epoch": 0.8163665204814781, + "grad_norm": 1.1241428256805595, + "learning_rate": 0.00029139719149394336, + "loss": 7.3435, + "step": 8749 + }, + { + "epoch": 0.8164598301763554, + "grad_norm": 0.8605057840563447, + "learning_rate": 0.00029139466660887156, + "loss": 7.4381, + "step": 8750 + }, + { + "epoch": 0.8165531398712326, + "grad_norm": 11575398660.147516, + "learning_rate": 0.000291392141364275, + "loss": 7.2871, + "step": 8751 + }, + { + "epoch": 0.8166464495661099, + "grad_norm": 0.7103323238423275, + "learning_rate": 0.00029138961576016017, + "loss": 7.3849, + "step": 8752 + }, + { + "epoch": 0.8167397592609872, + "grad_norm": 39.86794760327878, + "learning_rate": 0.0002913870897965334, + "loss": 7.6851, + "step": 8753 + }, + { + "epoch": 0.8168330689558645, + "grad_norm": 1.622825201226141, + "learning_rate": 0.00029138456347340113, + "loss": 7.2649, + "step": 8754 + }, + { + "epoch": 0.8169263786507418, + "grad_norm": 5.632708788261843, + "learning_rate": 0.0002913820367907698, + "loss": 7.6054, + "step": 8755 + }, + { + "epoch": 0.8170196883456191, + "grad_norm": 0.9158545851068084, + "learning_rate": 0.00029137950974864583, + "loss": 7.2168, + "step": 8756 + }, + { + "epoch": 0.8171129980404964, + "grad_norm": 1.1190732674352106, + "learning_rate": 0.0002913769823470357, + "loss": 7.2799, + "step": 8757 + }, + { + "epoch": 0.8172063077353737, + "grad_norm": 7656008862.159146, + "learning_rate": 0.0002913744545859457, + "loss": 7.3207, + "step": 8758 + }, + { + "epoch": 0.817299617430251, + "grad_norm": 1.2257498601915109, + "learning_rate": 0.0002913719264653824, + "loss": 7.2894, + "step": 8759 + }, + { + "epoch": 0.8173929271251283, + "grad_norm": 0.9009192242470748, + "learning_rate": 0.0002913693979853521, + "loss": 7.4142, + "step": 8760 + }, + { + "epoch": 0.8174862368200057, + "grad_norm": 0.7334615013541679, + "learning_rate": 0.00029136686914586134, + "loss": 7.1985, + "step": 8761 + }, + { + "epoch": 0.8175795465148828, + "grad_norm": 7447589194.692687, + "learning_rate": 0.0002913643399469165, + "loss": 7.4087, + "step": 8762 + }, + { + "epoch": 0.8176728562097602, + "grad_norm": 1.3193865169574688, + "learning_rate": 0.00029136181038852405, + "loss": 7.5777, + "step": 8763 + }, + { + "epoch": 0.8177661659046375, + "grad_norm": 5.087978527264775, + "learning_rate": 0.00029135928047069033, + "loss": 7.3506, + "step": 8764 + }, + { + "epoch": 0.8178594755995148, + "grad_norm": 0.7293961619358431, + "learning_rate": 0.00029135675019342186, + "loss": 7.4554, + "step": 8765 + }, + { + "epoch": 0.8179527852943921, + "grad_norm": 0.6305309584700004, + "learning_rate": 0.0002913542195567251, + "loss": 7.4326, + "step": 8766 + }, + { + "epoch": 0.8180460949892694, + "grad_norm": 0.8516171440846108, + "learning_rate": 0.0002913516885606064, + "loss": 7.1031, + "step": 8767 + }, + { + "epoch": 0.8181394046841467, + "grad_norm": 1.0553128168024237, + "learning_rate": 0.0002913491572050722, + "loss": 6.9628, + "step": 8768 + }, + { + "epoch": 0.818232714379024, + "grad_norm": 0.751994084140242, + "learning_rate": 0.000291346625490129, + "loss": 7.3104, + "step": 8769 + }, + { + "epoch": 0.8183260240739013, + "grad_norm": 0.8154253272036758, + "learning_rate": 0.0002913440934157832, + "loss": 7.2331, + "step": 8770 + }, + { + "epoch": 0.8184193337687786, + "grad_norm": 329576825924.9425, + "learning_rate": 0.0002913415609820412, + "loss": 7.6261, + "step": 8771 + }, + { + "epoch": 0.8185126434636558, + "grad_norm": 0.7084534141595871, + "learning_rate": 0.0002913390281889095, + "loss": 7.2772, + "step": 8772 + }, + { + "epoch": 0.8186059531585331, + "grad_norm": 0.8005780361107808, + "learning_rate": 0.00029133649503639453, + "loss": 7.3601, + "step": 8773 + }, + { + "epoch": 0.8186992628534104, + "grad_norm": 146736049551.95358, + "learning_rate": 0.0002913339615245027, + "loss": 7.4965, + "step": 8774 + }, + { + "epoch": 0.8187925725482877, + "grad_norm": 0.7936552993900433, + "learning_rate": 0.0002913314276532405, + "loss": 7.2756, + "step": 8775 + }, + { + "epoch": 0.818885882243165, + "grad_norm": 10429335043.868782, + "learning_rate": 0.0002913288934226143, + "loss": 7.5494, + "step": 8776 + }, + { + "epoch": 0.8189791919380424, + "grad_norm": 1.007152279011637, + "learning_rate": 0.00029132635883263066, + "loss": 7.5398, + "step": 8777 + }, + { + "epoch": 0.8190725016329197, + "grad_norm": 1.3665955118380124, + "learning_rate": 0.00029132382388329586, + "loss": 7.3452, + "step": 8778 + }, + { + "epoch": 0.819165811327797, + "grad_norm": 0.982471577650078, + "learning_rate": 0.0002913212885746165, + "loss": 7.2079, + "step": 8779 + }, + { + "epoch": 0.8192591210226743, + "grad_norm": 0.6947876162646447, + "learning_rate": 0.00029131875290659893, + "loss": 7.3724, + "step": 8780 + }, + { + "epoch": 0.8193524307175516, + "grad_norm": 4.899046119995506, + "learning_rate": 0.00029131621687924965, + "loss": 7.6321, + "step": 8781 + }, + { + "epoch": 0.8194457404124289, + "grad_norm": 803387545815.9863, + "learning_rate": 0.0002913136804925751, + "loss": 7.5215, + "step": 8782 + }, + { + "epoch": 0.8195390501073061, + "grad_norm": 0.5993109492121776, + "learning_rate": 0.00029131114374658174, + "loss": 7.1855, + "step": 8783 + }, + { + "epoch": 0.8196323598021834, + "grad_norm": 0.6220925715354448, + "learning_rate": 0.000291308606641276, + "loss": 7.1983, + "step": 8784 + }, + { + "epoch": 0.8197256694970607, + "grad_norm": 1.0891432717531677, + "learning_rate": 0.0002913060691766643, + "loss": 7.6475, + "step": 8785 + }, + { + "epoch": 0.819818979191938, + "grad_norm": 0.8509770328138995, + "learning_rate": 0.00029130353135275316, + "loss": 7.2474, + "step": 8786 + }, + { + "epoch": 0.8199122888868153, + "grad_norm": 1.43261790551526, + "learning_rate": 0.0002913009931695489, + "loss": 7.5913, + "step": 8787 + }, + { + "epoch": 0.8200055985816926, + "grad_norm": 2.6705298766568246, + "learning_rate": 0.00029129845462705817, + "loss": 7.5315, + "step": 8788 + }, + { + "epoch": 0.82009890827657, + "grad_norm": 1.258000648104103, + "learning_rate": 0.0002912959157252873, + "loss": 7.2856, + "step": 8789 + }, + { + "epoch": 0.8201922179714473, + "grad_norm": 8.461330234782247, + "learning_rate": 0.0002912933764642428, + "loss": 7.2836, + "step": 8790 + }, + { + "epoch": 0.8202855276663246, + "grad_norm": 123177758.2624378, + "learning_rate": 0.0002912908368439311, + "loss": 7.5292, + "step": 8791 + }, + { + "epoch": 0.8203788373612019, + "grad_norm": 0.6618402179400116, + "learning_rate": 0.00029128829686435857, + "loss": 7.3867, + "step": 8792 + }, + { + "epoch": 0.8204721470560792, + "grad_norm": 0.6623038166975335, + "learning_rate": 0.0002912857565255318, + "loss": 7.3547, + "step": 8793 + }, + { + "epoch": 0.8205654567509564, + "grad_norm": 1.061997485149465, + "learning_rate": 0.00029128321582745723, + "loss": 7.5294, + "step": 8794 + }, + { + "epoch": 0.8206587664458337, + "grad_norm": 3689612675.6156273, + "learning_rate": 0.0002912806747701413, + "loss": 7.1182, + "step": 8795 + }, + { + "epoch": 0.820752076140711, + "grad_norm": 12.324446891468925, + "learning_rate": 0.00029127813335359044, + "loss": 7.2718, + "step": 8796 + }, + { + "epoch": 0.8208453858355883, + "grad_norm": 1.6481765430826754, + "learning_rate": 0.0002912755915778111, + "loss": 7.375, + "step": 8797 + }, + { + "epoch": 0.8209386955304656, + "grad_norm": 1.007454567579938, + "learning_rate": 0.00029127304944280986, + "loss": 7.2451, + "step": 8798 + }, + { + "epoch": 0.8210320052253429, + "grad_norm": 1.3010643511500941, + "learning_rate": 0.000291270506948593, + "loss": 8.0358, + "step": 8799 + }, + { + "epoch": 0.8211253149202202, + "grad_norm": 1.0103962546753227, + "learning_rate": 0.00029126796409516715, + "loss": 7.3721, + "step": 8800 + }, + { + "epoch": 0.8212186246150975, + "grad_norm": 1.0037407107355827, + "learning_rate": 0.00029126542088253875, + "loss": 7.5834, + "step": 8801 + }, + { + "epoch": 0.8213119343099748, + "grad_norm": 1.200236012451704, + "learning_rate": 0.0002912628773107142, + "loss": 7.3282, + "step": 8802 + }, + { + "epoch": 0.8214052440048522, + "grad_norm": 0.9709017498986954, + "learning_rate": 0.0002912603333797, + "loss": 7.456, + "step": 8803 + }, + { + "epoch": 0.8214985536997294, + "grad_norm": 0.8929380631895227, + "learning_rate": 0.00029125778908950257, + "loss": 7.1211, + "step": 8804 + }, + { + "epoch": 0.8215918633946067, + "grad_norm": 0.9970196894375041, + "learning_rate": 0.00029125524444012843, + "loss": 7.5728, + "step": 8805 + }, + { + "epoch": 0.821685173089484, + "grad_norm": 0.925805608603405, + "learning_rate": 0.0002912526994315841, + "loss": 7.3906, + "step": 8806 + }, + { + "epoch": 0.8217784827843613, + "grad_norm": 0.6900826118841304, + "learning_rate": 0.000291250154063876, + "loss": 7.0898, + "step": 8807 + }, + { + "epoch": 0.8218717924792386, + "grad_norm": 1.1855974580116952, + "learning_rate": 0.00029124760833701055, + "loss": 7.7622, + "step": 8808 + }, + { + "epoch": 0.8219651021741159, + "grad_norm": 162334066548.83585, + "learning_rate": 0.0002912450622509943, + "loss": 7.1107, + "step": 8809 + }, + { + "epoch": 0.8220584118689932, + "grad_norm": 0.9594140582847499, + "learning_rate": 0.0002912425158058337, + "loss": 7.4162, + "step": 8810 + }, + { + "epoch": 0.8221517215638705, + "grad_norm": 0.6493119136248188, + "learning_rate": 0.0002912399690015352, + "loss": 7.3843, + "step": 8811 + }, + { + "epoch": 0.8222450312587478, + "grad_norm": 581524453029.9327, + "learning_rate": 0.0002912374218381053, + "loss": 7.0709, + "step": 8812 + }, + { + "epoch": 0.8223383409536251, + "grad_norm": 0.7739341124213243, + "learning_rate": 0.00029123487431555054, + "loss": 7.2462, + "step": 8813 + }, + { + "epoch": 0.8224316506485024, + "grad_norm": 0.6559546488551924, + "learning_rate": 0.00029123232643387726, + "loss": 7.4002, + "step": 8814 + }, + { + "epoch": 0.8225249603433796, + "grad_norm": 15041233151479.6, + "learning_rate": 0.00029122977819309205, + "loss": 7.488, + "step": 8815 + }, + { + "epoch": 0.8226182700382569, + "grad_norm": 0.8648983166248945, + "learning_rate": 0.0002912272295932013, + "loss": 7.2333, + "step": 8816 + }, + { + "epoch": 0.8227115797331342, + "grad_norm": 1.3984480471675362, + "learning_rate": 0.00029122468063421156, + "loss": 7.8547, + "step": 8817 + }, + { + "epoch": 0.8228048894280116, + "grad_norm": 1.5842961194817962, + "learning_rate": 0.0002912221313161293, + "loss": 7.4709, + "step": 8818 + }, + { + "epoch": 0.8228981991228889, + "grad_norm": 358122390893.1172, + "learning_rate": 0.000291219581638961, + "loss": 7.3315, + "step": 8819 + }, + { + "epoch": 0.8229915088177662, + "grad_norm": 0.8863273078970518, + "learning_rate": 0.00029121703160271315, + "loss": 7.3886, + "step": 8820 + }, + { + "epoch": 0.8230848185126435, + "grad_norm": 1927527228104.3354, + "learning_rate": 0.00029121448120739216, + "loss": 7.2943, + "step": 8821 + }, + { + "epoch": 0.8231781282075208, + "grad_norm": 1.0953640448597064, + "learning_rate": 0.0002912119304530046, + "loss": 7.3045, + "step": 8822 + }, + { + "epoch": 0.8232714379023981, + "grad_norm": 1.160047022234658, + "learning_rate": 0.00029120937933955696, + "loss": 7.2926, + "step": 8823 + }, + { + "epoch": 0.8233647475972754, + "grad_norm": 0.7017501074002815, + "learning_rate": 0.0002912068278670557, + "loss": 7.387, + "step": 8824 + }, + { + "epoch": 0.8234580572921526, + "grad_norm": 1.421227682700633, + "learning_rate": 0.00029120427603550726, + "loss": 7.4247, + "step": 8825 + }, + { + "epoch": 0.8235513669870299, + "grad_norm": 1.5027868803796687, + "learning_rate": 0.00029120172384491824, + "loss": 7.6567, + "step": 8826 + }, + { + "epoch": 0.8236446766819072, + "grad_norm": 1.6564671776147994, + "learning_rate": 0.000291199171295295, + "loss": 7.4257, + "step": 8827 + }, + { + "epoch": 0.8237379863767845, + "grad_norm": 0.7554990264771599, + "learning_rate": 0.00029119661838664416, + "loss": 7.5215, + "step": 8828 + }, + { + "epoch": 0.8238312960716618, + "grad_norm": 0.8102694593866497, + "learning_rate": 0.00029119406511897205, + "loss": 7.3121, + "step": 8829 + }, + { + "epoch": 0.8239246057665391, + "grad_norm": 0.7712970282381305, + "learning_rate": 0.0002911915114922853, + "loss": 7.545, + "step": 8830 + }, + { + "epoch": 0.8240179154614165, + "grad_norm": 3981871703558.9277, + "learning_rate": 0.0002911889575065904, + "loss": 7.561, + "step": 8831 + }, + { + "epoch": 0.8241112251562938, + "grad_norm": 1.6864421386439519, + "learning_rate": 0.00029118640316189376, + "loss": 7.5362, + "step": 8832 + }, + { + "epoch": 0.8242045348511711, + "grad_norm": 1.0829603776041072, + "learning_rate": 0.00029118384845820195, + "loss": 7.3518, + "step": 8833 + }, + { + "epoch": 0.8242978445460484, + "grad_norm": 0.9420411010482108, + "learning_rate": 0.0002911812933955214, + "loss": 7.2955, + "step": 8834 + }, + { + "epoch": 0.8243911542409257, + "grad_norm": 0.584012982162206, + "learning_rate": 0.0002911787379738587, + "loss": 7.2499, + "step": 8835 + }, + { + "epoch": 0.8244844639358029, + "grad_norm": 23.362402678715153, + "learning_rate": 0.00029117618219322023, + "loss": 7.167, + "step": 8836 + }, + { + "epoch": 0.8245777736306802, + "grad_norm": 0.8414463788232636, + "learning_rate": 0.0002911736260536126, + "loss": 7.3185, + "step": 8837 + }, + { + "epoch": 0.8246710833255575, + "grad_norm": 1.0757016836169249, + "learning_rate": 0.00029117106955504224, + "loss": 7.5581, + "step": 8838 + }, + { + "epoch": 0.8247643930204348, + "grad_norm": 1.0915487939246085, + "learning_rate": 0.0002911685126975156, + "loss": 7.4848, + "step": 8839 + }, + { + "epoch": 0.8248577027153121, + "grad_norm": 1.9108062257919496, + "learning_rate": 0.0002911659554810393, + "loss": 7.0143, + "step": 8840 + }, + { + "epoch": 0.8249510124101894, + "grad_norm": 34.197064733184874, + "learning_rate": 0.00029116339790561985, + "loss": 7.4179, + "step": 8841 + }, + { + "epoch": 0.8250443221050667, + "grad_norm": 1.6502812108683804, + "learning_rate": 0.0002911608399712636, + "loss": 7.2294, + "step": 8842 + }, + { + "epoch": 0.825137631799944, + "grad_norm": 1.2737991550560874, + "learning_rate": 0.0002911582816779772, + "loss": 7.2036, + "step": 8843 + }, + { + "epoch": 0.8252309414948213, + "grad_norm": 1.0221704289985853, + "learning_rate": 0.00029115572302576705, + "loss": 7.2732, + "step": 8844 + }, + { + "epoch": 0.8253242511896987, + "grad_norm": 410937617.55460304, + "learning_rate": 0.00029115316401463976, + "loss": 7.3911, + "step": 8845 + }, + { + "epoch": 0.825417560884576, + "grad_norm": 1033784742.5808771, + "learning_rate": 0.0002911506046446018, + "loss": 7.3555, + "step": 8846 + }, + { + "epoch": 0.8255108705794532, + "grad_norm": 1.218827210320993, + "learning_rate": 0.00029114804491565965, + "loss": 7.3401, + "step": 8847 + }, + { + "epoch": 0.8256041802743305, + "grad_norm": 587928246.0890406, + "learning_rate": 0.0002911454848278198, + "loss": 7.5973, + "step": 8848 + }, + { + "epoch": 0.8256974899692078, + "grad_norm": 1.3792135496587368, + "learning_rate": 0.00029114292438108886, + "loss": 7.7775, + "step": 8849 + }, + { + "epoch": 0.8257907996640851, + "grad_norm": 723041513.7740871, + "learning_rate": 0.00029114036357547323, + "loss": 7.5313, + "step": 8850 + }, + { + "epoch": 0.8258841093589624, + "grad_norm": 158118764.9862626, + "learning_rate": 0.0002911378024109795, + "loss": 7.4076, + "step": 8851 + }, + { + "epoch": 0.8259774190538397, + "grad_norm": 4.7487585038702225, + "learning_rate": 0.00029113524088761404, + "loss": 7.3025, + "step": 8852 + }, + { + "epoch": 0.826070728748717, + "grad_norm": 3.7565519392422955, + "learning_rate": 0.00029113267900538354, + "loss": 7.7657, + "step": 8853 + }, + { + "epoch": 0.8261640384435943, + "grad_norm": 2.4983112684600255, + "learning_rate": 0.0002911301167642944, + "loss": 7.8154, + "step": 8854 + }, + { + "epoch": 0.8262573481384716, + "grad_norm": 39678605.65083927, + "learning_rate": 0.00029112755416435326, + "loss": 7.6211, + "step": 8855 + }, + { + "epoch": 0.8263506578333489, + "grad_norm": 7.1757069411664665, + "learning_rate": 0.00029112499120556655, + "loss": 7.4543, + "step": 8856 + }, + { + "epoch": 0.8264439675282261, + "grad_norm": 1.8308340034300967, + "learning_rate": 0.0002911224278879408, + "loss": 7.5113, + "step": 8857 + }, + { + "epoch": 0.8265372772231034, + "grad_norm": 3134005.816123182, + "learning_rate": 0.00029111986421148244, + "loss": 7.3974, + "step": 8858 + }, + { + "epoch": 0.8266305869179807, + "grad_norm": 1.362679563704035, + "learning_rate": 0.0002911173001761981, + "loss": 7.2424, + "step": 8859 + }, + { + "epoch": 0.8267238966128581, + "grad_norm": 3.6247016495608193, + "learning_rate": 0.0002911147357820943, + "loss": 7.3382, + "step": 8860 + }, + { + "epoch": 0.8268172063077354, + "grad_norm": 1829843.354846083, + "learning_rate": 0.0002911121710291775, + "loss": 7.3143, + "step": 8861 + }, + { + "epoch": 0.8269105160026127, + "grad_norm": 1.315728986330431, + "learning_rate": 0.00029110960591745424, + "loss": 7.5298, + "step": 8862 + }, + { + "epoch": 0.82700382569749, + "grad_norm": 1.4395378997955732, + "learning_rate": 0.0002911070404469311, + "loss": 7.6169, + "step": 8863 + }, + { + "epoch": 0.8270971353923673, + "grad_norm": 4781954.954434474, + "learning_rate": 0.0002911044746176145, + "loss": 7.4018, + "step": 8864 + }, + { + "epoch": 0.8271904450872446, + "grad_norm": 0.9976303959714611, + "learning_rate": 0.000291101908429511, + "loss": 7.5967, + "step": 8865 + }, + { + "epoch": 0.8272837547821219, + "grad_norm": 1.3858362349956508, + "learning_rate": 0.0002910993418826272, + "loss": 7.3578, + "step": 8866 + }, + { + "epoch": 0.8273770644769992, + "grad_norm": 7.058417756317323, + "learning_rate": 0.0002910967749769696, + "loss": 7.8715, + "step": 8867 + }, + { + "epoch": 0.8274703741718764, + "grad_norm": 131356.09420726667, + "learning_rate": 0.00029109420771254464, + "loss": 7.4189, + "step": 8868 + }, + { + "epoch": 0.8275636838667537, + "grad_norm": 3.216256872856093, + "learning_rate": 0.0002910916400893589, + "loss": 7.4435, + "step": 8869 + }, + { + "epoch": 0.827656993561631, + "grad_norm": 1.2723241669278003, + "learning_rate": 0.0002910890721074189, + "loss": 7.6111, + "step": 8870 + }, + { + "epoch": 0.8277503032565083, + "grad_norm": 2.5675215017026134, + "learning_rate": 0.00029108650376673126, + "loss": 7.7592, + "step": 8871 + }, + { + "epoch": 0.8278436129513856, + "grad_norm": 2.0512630262325753, + "learning_rate": 0.0002910839350673024, + "loss": 7.212, + "step": 8872 + }, + { + "epoch": 0.827936922646263, + "grad_norm": 1.337832599825318, + "learning_rate": 0.00029108136600913885, + "loss": 7.6264, + "step": 8873 + }, + { + "epoch": 0.8280302323411403, + "grad_norm": 1.49134258995353, + "learning_rate": 0.0002910787965922472, + "loss": 7.0739, + "step": 8874 + }, + { + "epoch": 0.8281235420360176, + "grad_norm": 174812.79209378266, + "learning_rate": 0.00029107622681663397, + "loss": 7.6413, + "step": 8875 + }, + { + "epoch": 0.8282168517308949, + "grad_norm": 4.339007195798144, + "learning_rate": 0.0002910736566823057, + "loss": 7.5138, + "step": 8876 + }, + { + "epoch": 0.8283101614257722, + "grad_norm": 1.1699569116305593, + "learning_rate": 0.00029107108618926884, + "loss": 7.5291, + "step": 8877 + }, + { + "epoch": 0.8284034711206494, + "grad_norm": 14.542839750834595, + "learning_rate": 0.00029106851533753004, + "loss": 7.2754, + "step": 8878 + }, + { + "epoch": 0.8284967808155267, + "grad_norm": 14.049675662238228, + "learning_rate": 0.0002910659441270958, + "loss": 7.6098, + "step": 8879 + }, + { + "epoch": 0.828590090510404, + "grad_norm": 51382.84546669715, + "learning_rate": 0.00029106337255797263, + "loss": 7.4396, + "step": 8880 + }, + { + "epoch": 0.8286834002052813, + "grad_norm": 1.2111912326429053, + "learning_rate": 0.00029106080063016714, + "loss": 7.6866, + "step": 8881 + }, + { + "epoch": 0.8287767099001586, + "grad_norm": 1.7708605306358343, + "learning_rate": 0.0002910582283436858, + "loss": 8.0106, + "step": 8882 + }, + { + "epoch": 0.8288700195950359, + "grad_norm": 4.3524032952403, + "learning_rate": 0.00029105565569853515, + "loss": 7.5555, + "step": 8883 + }, + { + "epoch": 0.8289633292899132, + "grad_norm": 5.877376500599356, + "learning_rate": 0.0002910530826947218, + "loss": 7.2918, + "step": 8884 + }, + { + "epoch": 0.8290566389847905, + "grad_norm": 2.1178395195676627, + "learning_rate": 0.00029105050933225216, + "loss": 7.7097, + "step": 8885 + }, + { + "epoch": 0.8291499486796678, + "grad_norm": 66967.02104001246, + "learning_rate": 0.0002910479356111329, + "loss": 7.4782, + "step": 8886 + }, + { + "epoch": 0.8292432583745452, + "grad_norm": 10070.921413292503, + "learning_rate": 0.0002910453615313705, + "loss": 7.19, + "step": 8887 + }, + { + "epoch": 0.8293365680694225, + "grad_norm": 142790.0246136832, + "learning_rate": 0.00029104278709297157, + "loss": 7.3984, + "step": 8888 + }, + { + "epoch": 0.8294298777642997, + "grad_norm": 2.6863631834389485, + "learning_rate": 0.0002910402122959426, + "loss": 7.5366, + "step": 8889 + }, + { + "epoch": 0.829523187459177, + "grad_norm": 1.916341056479437, + "learning_rate": 0.00029103763714029014, + "loss": 7.5678, + "step": 8890 + }, + { + "epoch": 0.8296164971540543, + "grad_norm": 1.7709217639469519, + "learning_rate": 0.00029103506162602073, + "loss": 7.4585, + "step": 8891 + }, + { + "epoch": 0.8297098068489316, + "grad_norm": 1.240521316084125, + "learning_rate": 0.00029103248575314093, + "loss": 7.5093, + "step": 8892 + }, + { + "epoch": 0.8298031165438089, + "grad_norm": 3.279660394833253, + "learning_rate": 0.00029102990952165736, + "loss": 7.8089, + "step": 8893 + }, + { + "epoch": 0.8298964262386862, + "grad_norm": 1.7761691426439676, + "learning_rate": 0.0002910273329315765, + "loss": 7.4512, + "step": 8894 + }, + { + "epoch": 0.8299897359335635, + "grad_norm": 1.9125050008017241, + "learning_rate": 0.00029102475598290483, + "loss": 7.5155, + "step": 8895 + }, + { + "epoch": 0.8300830456284408, + "grad_norm": 1.91573000444559, + "learning_rate": 0.000291022178675649, + "loss": 7.59, + "step": 8896 + }, + { + "epoch": 0.8301763553233181, + "grad_norm": 3.589064609924674, + "learning_rate": 0.00029101960100981554, + "loss": 7.2363, + "step": 8897 + }, + { + "epoch": 0.8302696650181954, + "grad_norm": 3.4027590072087563, + "learning_rate": 0.00029101702298541107, + "loss": 7.7294, + "step": 8898 + }, + { + "epoch": 0.8303629747130727, + "grad_norm": 2.259244164862064, + "learning_rate": 0.000291014444602442, + "loss": 7.6338, + "step": 8899 + }, + { + "epoch": 0.8304562844079499, + "grad_norm": 5.16288250784429, + "learning_rate": 0.000291011865860915, + "loss": 7.5782, + "step": 8900 + }, + { + "epoch": 0.8305495941028272, + "grad_norm": 3.4907451994439924, + "learning_rate": 0.00029100928676083665, + "loss": 7.3395, + "step": 8901 + }, + { + "epoch": 0.8306429037977046, + "grad_norm": 1.912053509481784, + "learning_rate": 0.0002910067073022134, + "loss": 7.7194, + "step": 8902 + }, + { + "epoch": 0.8307362134925819, + "grad_norm": 4.712639539181809, + "learning_rate": 0.0002910041274850519, + "loss": 7.6349, + "step": 8903 + }, + { + "epoch": 0.8308295231874592, + "grad_norm": 2.2648575863672677, + "learning_rate": 0.00029100154730935863, + "loss": 7.1788, + "step": 8904 + }, + { + "epoch": 0.8309228328823365, + "grad_norm": 2.7800860298821455, + "learning_rate": 0.00029099896677514024, + "loss": 7.2806, + "step": 8905 + }, + { + "epoch": 0.8310161425772138, + "grad_norm": 1.7267316186879451, + "learning_rate": 0.0002909963858824032, + "loss": 7.2353, + "step": 8906 + }, + { + "epoch": 0.8311094522720911, + "grad_norm": 2574959.326071131, + "learning_rate": 0.0002909938046311541, + "loss": 7.3878, + "step": 8907 + }, + { + "epoch": 0.8312027619669684, + "grad_norm": 3.125138769302822, + "learning_rate": 0.00029099122302139957, + "loss": 7.497, + "step": 8908 + }, + { + "epoch": 0.8312960716618457, + "grad_norm": 28621.69169720623, + "learning_rate": 0.0002909886410531461, + "loss": 7.6121, + "step": 8909 + }, + { + "epoch": 0.8313893813567229, + "grad_norm": 2.4443451141849915, + "learning_rate": 0.0002909860587264003, + "loss": 7.6639, + "step": 8910 + }, + { + "epoch": 0.8314826910516002, + "grad_norm": 2.352648179594977, + "learning_rate": 0.0002909834760411687, + "loss": 7.5038, + "step": 8911 + }, + { + "epoch": 0.8315760007464775, + "grad_norm": 89281.96926667751, + "learning_rate": 0.00029098089299745786, + "loss": 7.4646, + "step": 8912 + }, + { + "epoch": 0.8316693104413548, + "grad_norm": 324246.3443308986, + "learning_rate": 0.00029097830959527444, + "loss": 7.2924, + "step": 8913 + }, + { + "epoch": 0.8317626201362321, + "grad_norm": 52258.869605413856, + "learning_rate": 0.0002909757258346249, + "loss": 7.5112, + "step": 8914 + }, + { + "epoch": 0.8318559298311095, + "grad_norm": 1.509765181046946, + "learning_rate": 0.0002909731417155158, + "loss": 7.6062, + "step": 8915 + }, + { + "epoch": 0.8319492395259868, + "grad_norm": 85113.85070672868, + "learning_rate": 0.0002909705572379538, + "loss": 7.2132, + "step": 8916 + }, + { + "epoch": 0.8320425492208641, + "grad_norm": 451279.8241824979, + "learning_rate": 0.00029096797240194547, + "loss": 7.4201, + "step": 8917 + }, + { + "epoch": 0.8321358589157414, + "grad_norm": 2.5813899573916688, + "learning_rate": 0.0002909653872074973, + "loss": 7.4761, + "step": 8918 + }, + { + "epoch": 0.8322291686106187, + "grad_norm": 1.5895817939580283, + "learning_rate": 0.0002909628016546159, + "loss": 7.3237, + "step": 8919 + }, + { + "epoch": 0.832322478305496, + "grad_norm": 13.922740718010026, + "learning_rate": 0.0002909602157433079, + "loss": 7.3598, + "step": 8920 + }, + { + "epoch": 0.8324157880003732, + "grad_norm": 0.8318307831742734, + "learning_rate": 0.0002909576294735798, + "loss": 7.195, + "step": 8921 + }, + { + "epoch": 0.8325090976952505, + "grad_norm": 2.2465294640831375, + "learning_rate": 0.0002909550428454382, + "loss": 7.4826, + "step": 8922 + }, + { + "epoch": 0.8326024073901278, + "grad_norm": 421036.3818108874, + "learning_rate": 0.0002909524558588897, + "loss": 7.0754, + "step": 8923 + }, + { + "epoch": 0.8326957170850051, + "grad_norm": 2.865899754600898, + "learning_rate": 0.0002909498685139408, + "loss": 7.4289, + "step": 8924 + }, + { + "epoch": 0.8327890267798824, + "grad_norm": 6.410555675428392, + "learning_rate": 0.0002909472808105982, + "loss": 7.5423, + "step": 8925 + }, + { + "epoch": 0.8328823364747597, + "grad_norm": 15.97902585080355, + "learning_rate": 0.0002909446927488684, + "loss": 7.2839, + "step": 8926 + }, + { + "epoch": 0.832975646169637, + "grad_norm": 2.971620258915693, + "learning_rate": 0.00029094210432875796, + "loss": 7.6319, + "step": 8927 + }, + { + "epoch": 0.8330689558645143, + "grad_norm": 1.6134018606910596, + "learning_rate": 0.0002909395155502735, + "loss": 7.3567, + "step": 8928 + }, + { + "epoch": 0.8331622655593917, + "grad_norm": 1.4043700085420732, + "learning_rate": 0.00029093692641342166, + "loss": 7.3867, + "step": 8929 + }, + { + "epoch": 0.833255575254269, + "grad_norm": 29.52243116515924, + "learning_rate": 0.00029093433691820895, + "loss": 7.4601, + "step": 8930 + }, + { + "epoch": 0.8333488849491462, + "grad_norm": 0.8259087573137656, + "learning_rate": 0.00029093174706464194, + "loss": 7.2237, + "step": 8931 + }, + { + "epoch": 0.8334421946440235, + "grad_norm": 1.496913823060475, + "learning_rate": 0.00029092915685272725, + "loss": 7.5197, + "step": 8932 + }, + { + "epoch": 0.8335355043389008, + "grad_norm": 2.288384703062113, + "learning_rate": 0.0002909265662824715, + "loss": 7.9285, + "step": 8933 + }, + { + "epoch": 0.8336288140337781, + "grad_norm": 0.7571257553867337, + "learning_rate": 0.00029092397535388116, + "loss": 7.3715, + "step": 8934 + }, + { + "epoch": 0.8337221237286554, + "grad_norm": 11.678665505474271, + "learning_rate": 0.00029092138406696294, + "loss": 7.4503, + "step": 8935 + }, + { + "epoch": 0.8338154334235327, + "grad_norm": 19423686.259852346, + "learning_rate": 0.0002909187924217234, + "loss": 7.3824, + "step": 8936 + }, + { + "epoch": 0.83390874311841, + "grad_norm": 1.566683875241745, + "learning_rate": 0.0002909162004181691, + "loss": 7.3995, + "step": 8937 + }, + { + "epoch": 0.8340020528132873, + "grad_norm": 2.119336252203828, + "learning_rate": 0.00029091360805630665, + "loss": 7.3903, + "step": 8938 + }, + { + "epoch": 0.8340953625081646, + "grad_norm": 1.829615410989931, + "learning_rate": 0.0002909110153361426, + "loss": 7.1172, + "step": 8939 + }, + { + "epoch": 0.8341886722030419, + "grad_norm": 3.7490754551371928, + "learning_rate": 0.00029090842225768367, + "loss": 7.6294, + "step": 8940 + }, + { + "epoch": 0.8342819818979192, + "grad_norm": 2.6744991009967127, + "learning_rate": 0.00029090582882093625, + "loss": 7.2594, + "step": 8941 + }, + { + "epoch": 0.8343752915927964, + "grad_norm": 0.9448070077981097, + "learning_rate": 0.0002909032350259071, + "loss": 7.3739, + "step": 8942 + }, + { + "epoch": 0.8344686012876738, + "grad_norm": 1.0293675721220577, + "learning_rate": 0.0002909006408726028, + "loss": 7.3226, + "step": 8943 + }, + { + "epoch": 0.8345619109825511, + "grad_norm": 0.8622746540164009, + "learning_rate": 0.00029089804636102986, + "loss": 7.0424, + "step": 8944 + }, + { + "epoch": 0.8346552206774284, + "grad_norm": 2.2413986159499966, + "learning_rate": 0.0002908954514911949, + "loss": 7.6871, + "step": 8945 + }, + { + "epoch": 0.8347485303723057, + "grad_norm": 1.7231634728789436, + "learning_rate": 0.0002908928562631046, + "loss": 7.3396, + "step": 8946 + }, + { + "epoch": 0.834841840067183, + "grad_norm": 14.318606980874364, + "learning_rate": 0.00029089026067676547, + "loss": 7.1637, + "step": 8947 + }, + { + "epoch": 0.8349351497620603, + "grad_norm": 1.649326348588024, + "learning_rate": 0.0002908876647321841, + "loss": 7.5616, + "step": 8948 + }, + { + "epoch": 0.8350284594569376, + "grad_norm": 586880397.8121622, + "learning_rate": 0.0002908850684293672, + "loss": 7.6167, + "step": 8949 + }, + { + "epoch": 0.8351217691518149, + "grad_norm": 4.3840035336757595, + "learning_rate": 0.0002908824717683213, + "loss": 7.5174, + "step": 8950 + }, + { + "epoch": 0.8352150788466922, + "grad_norm": 2.983410832483812, + "learning_rate": 0.000290879874749053, + "loss": 7.5195, + "step": 8951 + }, + { + "epoch": 0.8353083885415695, + "grad_norm": 3.354054231626007, + "learning_rate": 0.0002908772773715689, + "loss": 7.4399, + "step": 8952 + }, + { + "epoch": 0.8354016982364467, + "grad_norm": 1.2892673076120045, + "learning_rate": 0.0002908746796358756, + "loss": 7.4807, + "step": 8953 + }, + { + "epoch": 0.835495007931324, + "grad_norm": 0.8291192963537477, + "learning_rate": 0.0002908720815419798, + "loss": 7.2592, + "step": 8954 + }, + { + "epoch": 0.8355883176262013, + "grad_norm": 2.9135065553988784, + "learning_rate": 0.00029086948308988794, + "loss": 7.3937, + "step": 8955 + }, + { + "epoch": 0.8356816273210786, + "grad_norm": 35.795150482540855, + "learning_rate": 0.0002908668842796067, + "loss": 7.7159, + "step": 8956 + }, + { + "epoch": 0.835774937015956, + "grad_norm": 839003548.2411987, + "learning_rate": 0.00029086428511114276, + "loss": 7.3439, + "step": 8957 + }, + { + "epoch": 0.8358682467108333, + "grad_norm": 1.316492154989582, + "learning_rate": 0.00029086168558450264, + "loss": 7.5799, + "step": 8958 + }, + { + "epoch": 0.8359615564057106, + "grad_norm": 1.781382299044018, + "learning_rate": 0.000290859085699693, + "loss": 7.351, + "step": 8959 + }, + { + "epoch": 0.8360548661005879, + "grad_norm": 245823825.05387208, + "learning_rate": 0.00029085648545672043, + "loss": 7.3572, + "step": 8960 + }, + { + "epoch": 0.8361481757954652, + "grad_norm": 9.534521129808018, + "learning_rate": 0.0002908538848555915, + "loss": 7.5688, + "step": 8961 + }, + { + "epoch": 0.8362414854903425, + "grad_norm": 50898835.807914354, + "learning_rate": 0.00029085128389631294, + "loss": 7.6774, + "step": 8962 + }, + { + "epoch": 0.8363347951852197, + "grad_norm": 3.3980497419944418, + "learning_rate": 0.0002908486825788912, + "loss": 7.4787, + "step": 8963 + }, + { + "epoch": 0.836428104880097, + "grad_norm": 1.156358509458697, + "learning_rate": 0.00029084608090333306, + "loss": 7.2788, + "step": 8964 + }, + { + "epoch": 0.8365214145749743, + "grad_norm": 5.031738298328793, + "learning_rate": 0.000290843478869645, + "loss": 7.5338, + "step": 8965 + }, + { + "epoch": 0.8366147242698516, + "grad_norm": 1.0569592128174423, + "learning_rate": 0.0002908408764778337, + "loss": 7.5309, + "step": 8966 + }, + { + "epoch": 0.8367080339647289, + "grad_norm": 1.5548035027510156, + "learning_rate": 0.0002908382737279058, + "loss": 7.262, + "step": 8967 + }, + { + "epoch": 0.8368013436596062, + "grad_norm": 2.4380523624650037, + "learning_rate": 0.0002908356706198679, + "loss": 7.4964, + "step": 8968 + }, + { + "epoch": 0.8368946533544835, + "grad_norm": 2.8858276440467723, + "learning_rate": 0.0002908330671537266, + "loss": 7.2965, + "step": 8969 + }, + { + "epoch": 0.8369879630493609, + "grad_norm": 8.87211964828008, + "learning_rate": 0.0002908304633294885, + "loss": 7.5416, + "step": 8970 + }, + { + "epoch": 0.8370812727442382, + "grad_norm": 2.221614723879509, + "learning_rate": 0.0002908278591471602, + "loss": 7.6876, + "step": 8971 + }, + { + "epoch": 0.8371745824391155, + "grad_norm": 0.8818277881348061, + "learning_rate": 0.0002908252546067485, + "loss": 7.4183, + "step": 8972 + }, + { + "epoch": 0.8372678921339928, + "grad_norm": 2.0574648415495074, + "learning_rate": 0.00029082264970825984, + "loss": 7.6163, + "step": 8973 + }, + { + "epoch": 0.83736120182887, + "grad_norm": 1.1665292484436947, + "learning_rate": 0.00029082004445170085, + "loss": 7.5148, + "step": 8974 + }, + { + "epoch": 0.8374545115237473, + "grad_norm": 1.134230848373603, + "learning_rate": 0.00029081743883707824, + "loss": 7.4134, + "step": 8975 + }, + { + "epoch": 0.8375478212186246, + "grad_norm": 1.181822137617232, + "learning_rate": 0.0002908148328643986, + "loss": 7.6643, + "step": 8976 + }, + { + "epoch": 0.8376411309135019, + "grad_norm": 451.2673995556229, + "learning_rate": 0.00029081222653366855, + "loss": 7.4709, + "step": 8977 + }, + { + "epoch": 0.8377344406083792, + "grad_norm": 1.5572410466864512, + "learning_rate": 0.0002908096198448947, + "loss": 7.6419, + "step": 8978 + }, + { + "epoch": 0.8378277503032565, + "grad_norm": 2.0387455564112087, + "learning_rate": 0.00029080701279808374, + "loss": 7.3896, + "step": 8979 + }, + { + "epoch": 0.8379210599981338, + "grad_norm": 19085937.14686057, + "learning_rate": 0.00029080440539324217, + "loss": 7.4478, + "step": 8980 + }, + { + "epoch": 0.8380143696930111, + "grad_norm": 0.9054336241706801, + "learning_rate": 0.0002908017976303768, + "loss": 7.4098, + "step": 8981 + }, + { + "epoch": 0.8381076793878884, + "grad_norm": 1.1272231896686111, + "learning_rate": 0.00029079918950949415, + "loss": 7.4812, + "step": 8982 + }, + { + "epoch": 0.8382009890827657, + "grad_norm": 1.402991819468495, + "learning_rate": 0.00029079658103060087, + "loss": 7.2356, + "step": 8983 + }, + { + "epoch": 0.8382942987776429, + "grad_norm": 31.614839180012694, + "learning_rate": 0.00029079397219370354, + "loss": 7.2623, + "step": 8984 + }, + { + "epoch": 0.8383876084725203, + "grad_norm": 0.702931498554807, + "learning_rate": 0.0002907913629988089, + "loss": 7.3458, + "step": 8985 + }, + { + "epoch": 0.8384809181673976, + "grad_norm": 438381553.96580416, + "learning_rate": 0.0002907887534459235, + "loss": 7.4494, + "step": 8986 + }, + { + "epoch": 0.8385742278622749, + "grad_norm": 8126361.041667146, + "learning_rate": 0.00029078614353505403, + "loss": 7.3753, + "step": 8987 + }, + { + "epoch": 0.8386675375571522, + "grad_norm": 0.8499591264960236, + "learning_rate": 0.0002907835332662071, + "loss": 7.3941, + "step": 8988 + }, + { + "epoch": 0.8387608472520295, + "grad_norm": 0.7667305397315886, + "learning_rate": 0.0002907809226393893, + "loss": 7.4217, + "step": 8989 + }, + { + "epoch": 0.8388541569469068, + "grad_norm": 526442847.45740587, + "learning_rate": 0.0002907783116546073, + "loss": 7.2305, + "step": 8990 + }, + { + "epoch": 0.8389474666417841, + "grad_norm": 1460974.9114936225, + "learning_rate": 0.0002907757003118678, + "loss": 7.4842, + "step": 8991 + }, + { + "epoch": 0.8390407763366614, + "grad_norm": 237.96798843586814, + "learning_rate": 0.0002907730886111774, + "loss": 7.3578, + "step": 8992 + }, + { + "epoch": 0.8391340860315387, + "grad_norm": 1.152791409159053, + "learning_rate": 0.00029077047655254274, + "loss": 7.585, + "step": 8993 + }, + { + "epoch": 0.839227395726416, + "grad_norm": 13.915325443546969, + "learning_rate": 0.0002907678641359705, + "loss": 7.2618, + "step": 8994 + }, + { + "epoch": 0.8393207054212932, + "grad_norm": 64101.28293753896, + "learning_rate": 0.0002907652513614672, + "loss": 7.4689, + "step": 8995 + }, + { + "epoch": 0.8394140151161705, + "grad_norm": 1.0540942104139481, + "learning_rate": 0.00029076263822903957, + "loss": 7.5478, + "step": 8996 + }, + { + "epoch": 0.8395073248110478, + "grad_norm": 1.3044269962600625, + "learning_rate": 0.00029076002473869426, + "loss": 7.6094, + "step": 8997 + }, + { + "epoch": 0.8396006345059251, + "grad_norm": 7.877574588758244, + "learning_rate": 0.00029075741089043793, + "loss": 7.676, + "step": 8998 + }, + { + "epoch": 0.8396939442008025, + "grad_norm": 0.8020296041017145, + "learning_rate": 0.00029075479668427714, + "loss": 7.3303, + "step": 8999 + }, + { + "epoch": 0.8397872538956798, + "grad_norm": 3.4617014629355425, + "learning_rate": 0.0002907521821202187, + "loss": 7.7799, + "step": 9000 + }, + { + "epoch": 0.8398805635905571, + "grad_norm": 435357.1281184499, + "learning_rate": 0.00029074956719826907, + "loss": 7.3921, + "step": 9001 + }, + { + "epoch": 0.8399738732854344, + "grad_norm": 414858.05997762963, + "learning_rate": 0.00029074695191843507, + "loss": 7.3918, + "step": 9002 + }, + { + "epoch": 0.8400671829803117, + "grad_norm": 4.052764490706791, + "learning_rate": 0.00029074433628072316, + "loss": 7.5721, + "step": 9003 + }, + { + "epoch": 0.840160492675189, + "grad_norm": 1.1215213929483308, + "learning_rate": 0.00029074172028514014, + "loss": 7.2875, + "step": 9004 + }, + { + "epoch": 0.8402538023700663, + "grad_norm": 25.65901695179884, + "learning_rate": 0.0002907391039316926, + "loss": 7.5577, + "step": 9005 + }, + { + "epoch": 0.8403471120649435, + "grad_norm": 0.9134547081273342, + "learning_rate": 0.0002907364872203872, + "loss": 7.4691, + "step": 9006 + }, + { + "epoch": 0.8404404217598208, + "grad_norm": 1.4225063753993359, + "learning_rate": 0.00029073387015123063, + "loss": 7.9565, + "step": 9007 + }, + { + "epoch": 0.8405337314546981, + "grad_norm": 8.333338958280041, + "learning_rate": 0.00029073125272422953, + "loss": 7.3919, + "step": 9008 + }, + { + "epoch": 0.8406270411495754, + "grad_norm": 1.8559734797294418, + "learning_rate": 0.00029072863493939053, + "loss": 7.6033, + "step": 9009 + }, + { + "epoch": 0.8407203508444527, + "grad_norm": 7.857573026002642, + "learning_rate": 0.00029072601679672026, + "loss": 7.4042, + "step": 9010 + }, + { + "epoch": 0.84081366053933, + "grad_norm": 2.351897414330079, + "learning_rate": 0.0002907233982962255, + "loss": 7.3867, + "step": 9011 + }, + { + "epoch": 0.8409069702342074, + "grad_norm": 1.0293823034921146, + "learning_rate": 0.00029072077943791277, + "loss": 7.3679, + "step": 9012 + }, + { + "epoch": 0.8410002799290847, + "grad_norm": 270990.5649098902, + "learning_rate": 0.0002907181602217888, + "loss": 7.2922, + "step": 9013 + }, + { + "epoch": 0.841093589623962, + "grad_norm": 1.181608391597646, + "learning_rate": 0.0002907155406478602, + "loss": 7.5487, + "step": 9014 + }, + { + "epoch": 0.8411868993188393, + "grad_norm": 1.5744183369159088, + "learning_rate": 0.00029071292071613364, + "loss": 7.7413, + "step": 9015 + }, + { + "epoch": 0.8412802090137165, + "grad_norm": 1.374133755545326, + "learning_rate": 0.00029071030042661584, + "loss": 7.143, + "step": 9016 + }, + { + "epoch": 0.8413735187085938, + "grad_norm": 1.70915285905243, + "learning_rate": 0.0002907076797793134, + "loss": 7.2832, + "step": 9017 + }, + { + "epoch": 0.8414668284034711, + "grad_norm": 5.388396924968503, + "learning_rate": 0.0002907050587742331, + "loss": 7.4269, + "step": 9018 + }, + { + "epoch": 0.8415601380983484, + "grad_norm": 0.8246300970617273, + "learning_rate": 0.0002907024374113814, + "loss": 7.6219, + "step": 9019 + }, + { + "epoch": 0.8416534477932257, + "grad_norm": 1.5552700768972396, + "learning_rate": 0.0002906998156907652, + "loss": 7.3971, + "step": 9020 + }, + { + "epoch": 0.841746757488103, + "grad_norm": 1.970740159941069, + "learning_rate": 0.00029069719361239096, + "loss": 7.1974, + "step": 9021 + }, + { + "epoch": 0.8418400671829803, + "grad_norm": 62.175927335348646, + "learning_rate": 0.00029069457117626544, + "loss": 7.2765, + "step": 9022 + }, + { + "epoch": 0.8419333768778576, + "grad_norm": 1.6291918180375955, + "learning_rate": 0.00029069194838239533, + "loss": 7.5035, + "step": 9023 + }, + { + "epoch": 0.8420266865727349, + "grad_norm": 5.206332658483616, + "learning_rate": 0.00029068932523078724, + "loss": 7.1063, + "step": 9024 + }, + { + "epoch": 0.8421199962676122, + "grad_norm": 2247883.342364904, + "learning_rate": 0.00029068670172144795, + "loss": 7.4804, + "step": 9025 + }, + { + "epoch": 0.8422133059624896, + "grad_norm": 1803848.8428629434, + "learning_rate": 0.00029068407785438395, + "loss": 7.391, + "step": 9026 + }, + { + "epoch": 0.8423066156573668, + "grad_norm": 0.6652449159228468, + "learning_rate": 0.0002906814536296021, + "loss": 7.43, + "step": 9027 + }, + { + "epoch": 0.8423999253522441, + "grad_norm": 0.7871354492341318, + "learning_rate": 0.0002906788290471089, + "loss": 7.4495, + "step": 9028 + }, + { + "epoch": 0.8424932350471214, + "grad_norm": 3.741955518152251, + "learning_rate": 0.00029067620410691113, + "loss": 7.395, + "step": 9029 + }, + { + "epoch": 0.8425865447419987, + "grad_norm": 4.410242950013278, + "learning_rate": 0.0002906735788090155, + "loss": 7.5138, + "step": 9030 + }, + { + "epoch": 0.842679854436876, + "grad_norm": 0.6180270599718949, + "learning_rate": 0.00029067095315342856, + "loss": 7.247, + "step": 9031 + }, + { + "epoch": 0.8427731641317533, + "grad_norm": 1.3354744749551524, + "learning_rate": 0.00029066832714015714, + "loss": 7.3764, + "step": 9032 + }, + { + "epoch": 0.8428664738266306, + "grad_norm": 2863596.9897786, + "learning_rate": 0.0002906657007692078, + "loss": 7.5769, + "step": 9033 + }, + { + "epoch": 0.8429597835215079, + "grad_norm": 0.7723229069013271, + "learning_rate": 0.0002906630740405872, + "loss": 7.4048, + "step": 9034 + }, + { + "epoch": 0.8430530932163852, + "grad_norm": 10.991692568957083, + "learning_rate": 0.0002906604469543021, + "loss": 7.1964, + "step": 9035 + }, + { + "epoch": 0.8431464029112625, + "grad_norm": 1.1663059671996368, + "learning_rate": 0.00029065781951035925, + "loss": 7.0199, + "step": 9036 + }, + { + "epoch": 0.8432397126061397, + "grad_norm": 0.7193649105096235, + "learning_rate": 0.0002906551917087651, + "loss": 7.4134, + "step": 9037 + }, + { + "epoch": 0.843333022301017, + "grad_norm": 187169.88924482983, + "learning_rate": 0.0002906525635495265, + "loss": 7.2982, + "step": 9038 + }, + { + "epoch": 0.8434263319958943, + "grad_norm": 1.6270404031236385, + "learning_rate": 0.00029064993503265013, + "loss": 7.6111, + "step": 9039 + }, + { + "epoch": 0.8435196416907716, + "grad_norm": 3.5169335677766567, + "learning_rate": 0.00029064730615814256, + "loss": 7.5335, + "step": 9040 + }, + { + "epoch": 0.843612951385649, + "grad_norm": 1.3978960887940606, + "learning_rate": 0.00029064467692601064, + "loss": 7.6552, + "step": 9041 + }, + { + "epoch": 0.8437062610805263, + "grad_norm": 1.3720348950974102, + "learning_rate": 0.0002906420473362609, + "loss": 7.0645, + "step": 9042 + }, + { + "epoch": 0.8437995707754036, + "grad_norm": 1.4645762128969317, + "learning_rate": 0.0002906394173889001, + "loss": 7.1102, + "step": 9043 + }, + { + "epoch": 0.8438928804702809, + "grad_norm": 1.5236823328148126, + "learning_rate": 0.00029063678708393496, + "loss": 6.9947, + "step": 9044 + }, + { + "epoch": 0.8439861901651582, + "grad_norm": 1.1033287780509957, + "learning_rate": 0.0002906341564213721, + "loss": 7.1747, + "step": 9045 + }, + { + "epoch": 0.8440794998600355, + "grad_norm": 1.389010609342792, + "learning_rate": 0.00029063152540121826, + "loss": 7.1534, + "step": 9046 + }, + { + "epoch": 0.8441728095549128, + "grad_norm": 4.61607098396713, + "learning_rate": 0.0002906288940234801, + "loss": 7.0053, + "step": 9047 + }, + { + "epoch": 0.84426611924979, + "grad_norm": 2.0624403146122763, + "learning_rate": 0.0002906262622881644, + "loss": 7.3189, + "step": 9048 + }, + { + "epoch": 0.8443594289446673, + "grad_norm": 5.696190333568695, + "learning_rate": 0.00029062363019527763, + "loss": 7.2683, + "step": 9049 + }, + { + "epoch": 0.8444527386395446, + "grad_norm": 2.008980389644598, + "learning_rate": 0.0002906209977448267, + "loss": 7.599, + "step": 9050 + }, + { + "epoch": 0.8445460483344219, + "grad_norm": 666680.9536010729, + "learning_rate": 0.0002906183649368182, + "loss": 7.2891, + "step": 9051 + }, + { + "epoch": 0.8446393580292992, + "grad_norm": 1.377329469559428, + "learning_rate": 0.0002906157317712589, + "loss": 7.1281, + "step": 9052 + }, + { + "epoch": 0.8447326677241765, + "grad_norm": 293928.0777888555, + "learning_rate": 0.0002906130982481554, + "loss": 7.4908, + "step": 9053 + }, + { + "epoch": 0.8448259774190539, + "grad_norm": 2.158459709547158, + "learning_rate": 0.00029061046436751445, + "loss": 7.5494, + "step": 9054 + }, + { + "epoch": 0.8449192871139312, + "grad_norm": 1.4904777089324204, + "learning_rate": 0.0002906078301293428, + "loss": 7.4646, + "step": 9055 + }, + { + "epoch": 0.8450125968088085, + "grad_norm": 0.6850986043286688, + "learning_rate": 0.00029060519553364704, + "loss": 7.3691, + "step": 9056 + }, + { + "epoch": 0.8451059065036858, + "grad_norm": 0.8542128300591458, + "learning_rate": 0.0002906025605804339, + "loss": 7.2226, + "step": 9057 + }, + { + "epoch": 0.8451992161985631, + "grad_norm": 1.1608276049271036, + "learning_rate": 0.00029059992526971013, + "loss": 7.429, + "step": 9058 + }, + { + "epoch": 0.8452925258934403, + "grad_norm": 447227.8637146651, + "learning_rate": 0.0002905972896014824, + "loss": 7.3985, + "step": 9059 + }, + { + "epoch": 0.8453858355883176, + "grad_norm": 1.5519462020402808, + "learning_rate": 0.00029059465357575735, + "loss": 7.4593, + "step": 9060 + }, + { + "epoch": 0.8454791452831949, + "grad_norm": 1.1773478447125278, + "learning_rate": 0.0002905920171925418, + "loss": 7.3655, + "step": 9061 + }, + { + "epoch": 0.8455724549780722, + "grad_norm": 0.8534230733719804, + "learning_rate": 0.00029058938045184234, + "loss": 7.4258, + "step": 9062 + }, + { + "epoch": 0.8456657646729495, + "grad_norm": 0.9844602082835726, + "learning_rate": 0.0002905867433536658, + "loss": 7.1367, + "step": 9063 + }, + { + "epoch": 0.8457590743678268, + "grad_norm": 0.9041867183989158, + "learning_rate": 0.00029058410589801885, + "loss": 7.2966, + "step": 9064 + }, + { + "epoch": 0.8458523840627041, + "grad_norm": 4.144608197600802, + "learning_rate": 0.00029058146808490806, + "loss": 7.1125, + "step": 9065 + }, + { + "epoch": 0.8459456937575814, + "grad_norm": 1.2591248274601492, + "learning_rate": 0.0002905788299143403, + "loss": 7.4802, + "step": 9066 + }, + { + "epoch": 0.8460390034524587, + "grad_norm": 0.7445459793481476, + "learning_rate": 0.0002905761913863222, + "loss": 7.4686, + "step": 9067 + }, + { + "epoch": 0.8461323131473361, + "grad_norm": 1104.9358345015362, + "learning_rate": 0.0002905735525008605, + "loss": 7.5332, + "step": 9068 + }, + { + "epoch": 0.8462256228422133, + "grad_norm": 22.44602543258642, + "learning_rate": 0.0002905709132579619, + "loss": 7.3755, + "step": 9069 + }, + { + "epoch": 0.8463189325370906, + "grad_norm": 0.9327651485887836, + "learning_rate": 0.00029056827365763307, + "loss": 7.4752, + "step": 9070 + }, + { + "epoch": 0.8464122422319679, + "grad_norm": 8.435622488833744, + "learning_rate": 0.0002905656336998808, + "loss": 7.3186, + "step": 9071 + }, + { + "epoch": 0.8465055519268452, + "grad_norm": 2.747674206412536, + "learning_rate": 0.00029056299338471174, + "loss": 7.1347, + "step": 9072 + }, + { + "epoch": 0.8465988616217225, + "grad_norm": 5.380684528333792, + "learning_rate": 0.00029056035271213265, + "loss": 7.4625, + "step": 9073 + }, + { + "epoch": 0.8466921713165998, + "grad_norm": 1.0451415270058417, + "learning_rate": 0.0002905577116821502, + "loss": 7.5685, + "step": 9074 + }, + { + "epoch": 0.8467854810114771, + "grad_norm": 1.1018737380784525, + "learning_rate": 0.0002905550702947711, + "loss": 7.444, + "step": 9075 + }, + { + "epoch": 0.8468787907063544, + "grad_norm": 1.5158949535044894, + "learning_rate": 0.00029055242855000215, + "loss": 7.435, + "step": 9076 + }, + { + "epoch": 0.8469721004012317, + "grad_norm": 302.2580310598522, + "learning_rate": 0.00029054978644785, + "loss": 7.346, + "step": 9077 + }, + { + "epoch": 0.847065410096109, + "grad_norm": 17.430903385664994, + "learning_rate": 0.0002905471439883213, + "loss": 7.5079, + "step": 9078 + }, + { + "epoch": 0.8471587197909863, + "grad_norm": 53.155533388403406, + "learning_rate": 0.00029054450117142294, + "loss": 7.3859, + "step": 9079 + }, + { + "epoch": 0.8472520294858635, + "grad_norm": 92.2203946700308, + "learning_rate": 0.0002905418579971615, + "loss": 7.3974, + "step": 9080 + }, + { + "epoch": 0.8473453391807408, + "grad_norm": 1.0845798773861046, + "learning_rate": 0.00029053921446554377, + "loss": 7.7436, + "step": 9081 + }, + { + "epoch": 0.8474386488756182, + "grad_norm": 0.694280205897419, + "learning_rate": 0.0002905365705765764, + "loss": 7.2875, + "step": 9082 + }, + { + "epoch": 0.8475319585704955, + "grad_norm": 0.7312249901813886, + "learning_rate": 0.00029053392633026623, + "loss": 7.2926, + "step": 9083 + }, + { + "epoch": 0.8476252682653728, + "grad_norm": 0.8168015051427214, + "learning_rate": 0.00029053128172661986, + "loss": 7.208, + "step": 9084 + }, + { + "epoch": 0.8477185779602501, + "grad_norm": 1.2472848456576242, + "learning_rate": 0.0002905286367656441, + "loss": 7.4955, + "step": 9085 + }, + { + "epoch": 0.8478118876551274, + "grad_norm": 0.5395643770358702, + "learning_rate": 0.00029052599144734564, + "loss": 7.5033, + "step": 9086 + }, + { + "epoch": 0.8479051973500047, + "grad_norm": 0.9625621511107995, + "learning_rate": 0.00029052334577173123, + "loss": 7.5089, + "step": 9087 + }, + { + "epoch": 0.847998507044882, + "grad_norm": 2.1868447541230505, + "learning_rate": 0.00029052069973880754, + "loss": 7.1392, + "step": 9088 + }, + { + "epoch": 0.8480918167397593, + "grad_norm": 1.2889580422911995, + "learning_rate": 0.00029051805334858133, + "loss": 7.3296, + "step": 9089 + }, + { + "epoch": 0.8481851264346365, + "grad_norm": 0.9022544431945474, + "learning_rate": 0.00029051540660105933, + "loss": 7.5995, + "step": 9090 + }, + { + "epoch": 0.8482784361295138, + "grad_norm": 3.18823793247338, + "learning_rate": 0.0002905127594962483, + "loss": 7.3109, + "step": 9091 + }, + { + "epoch": 0.8483717458243911, + "grad_norm": 1.0973940750744113, + "learning_rate": 0.00029051011203415497, + "loss": 7.4482, + "step": 9092 + }, + { + "epoch": 0.8484650555192684, + "grad_norm": 0.5721596598980643, + "learning_rate": 0.00029050746421478604, + "loss": 7.2648, + "step": 9093 + }, + { + "epoch": 0.8485583652141457, + "grad_norm": 0.6559676656703746, + "learning_rate": 0.0002905048160381482, + "loss": 7.5644, + "step": 9094 + }, + { + "epoch": 0.848651674909023, + "grad_norm": 0.8339451572718378, + "learning_rate": 0.00029050216750424826, + "loss": 7.0327, + "step": 9095 + }, + { + "epoch": 0.8487449846039004, + "grad_norm": 0.7931916107851097, + "learning_rate": 0.00029049951861309295, + "loss": 7.4682, + "step": 9096 + }, + { + "epoch": 0.8488382942987777, + "grad_norm": 0.7002148549804174, + "learning_rate": 0.00029049686936468897, + "loss": 7.319, + "step": 9097 + }, + { + "epoch": 0.848931603993655, + "grad_norm": 2.1468484193020894, + "learning_rate": 0.000290494219759043, + "loss": 7.4616, + "step": 9098 + }, + { + "epoch": 0.8490249136885323, + "grad_norm": 1.5828984434839826, + "learning_rate": 0.00029049156979616195, + "loss": 7.3691, + "step": 9099 + }, + { + "epoch": 0.8491182233834096, + "grad_norm": 29.335709763560754, + "learning_rate": 0.0002904889194760524, + "loss": 7.3807, + "step": 9100 + }, + { + "epoch": 0.8492115330782868, + "grad_norm": 1.1332753901591244, + "learning_rate": 0.0002904862687987212, + "loss": 7.2565, + "step": 9101 + }, + { + "epoch": 0.8493048427731641, + "grad_norm": 6.5711114733001, + "learning_rate": 0.000290483617764175, + "loss": 7.0438, + "step": 9102 + }, + { + "epoch": 0.8493981524680414, + "grad_norm": 0.6435784998078379, + "learning_rate": 0.00029048096637242053, + "loss": 7.2951, + "step": 9103 + }, + { + "epoch": 0.8494914621629187, + "grad_norm": 0.751325289330868, + "learning_rate": 0.00029047831462346464, + "loss": 7.3347, + "step": 9104 + }, + { + "epoch": 0.849584771857796, + "grad_norm": 0.5183306041303892, + "learning_rate": 0.00029047566251731396, + "loss": 7.3851, + "step": 9105 + }, + { + "epoch": 0.8496780815526733, + "grad_norm": 37.86684939036431, + "learning_rate": 0.0002904730100539753, + "loss": 7.0907, + "step": 9106 + }, + { + "epoch": 0.8497713912475506, + "grad_norm": 0.9353065042870258, + "learning_rate": 0.00029047035723345536, + "loss": 7.1075, + "step": 9107 + }, + { + "epoch": 0.8498647009424279, + "grad_norm": 0.3795393713012896, + "learning_rate": 0.00029046770405576094, + "loss": 7.4304, + "step": 9108 + }, + { + "epoch": 0.8499580106373053, + "grad_norm": 0.7390645399543251, + "learning_rate": 0.0002904650505208988, + "loss": 7.2778, + "step": 9109 + }, + { + "epoch": 0.8500513203321826, + "grad_norm": 0.5189680971558416, + "learning_rate": 0.0002904623966288756, + "loss": 7.2392, + "step": 9110 + }, + { + "epoch": 0.8501446300270599, + "grad_norm": 0.6532446089541074, + "learning_rate": 0.00029045974237969815, + "loss": 7.5907, + "step": 9111 + }, + { + "epoch": 0.8502379397219371, + "grad_norm": 0.7057661026593829, + "learning_rate": 0.0002904570877733732, + "loss": 7.3539, + "step": 9112 + }, + { + "epoch": 0.8503312494168144, + "grad_norm": 0.46970640912784994, + "learning_rate": 0.00029045443280990743, + "loss": 7.3002, + "step": 9113 + }, + { + "epoch": 0.8504245591116917, + "grad_norm": 0.632549438817482, + "learning_rate": 0.00029045177748930764, + "loss": 7.3602, + "step": 9114 + }, + { + "epoch": 0.850517868806569, + "grad_norm": 0.8156789760211337, + "learning_rate": 0.00029044912181158065, + "loss": 7.2508, + "step": 9115 + }, + { + "epoch": 0.8506111785014463, + "grad_norm": 1.5044111603676094, + "learning_rate": 0.0002904464657767331, + "loss": 7.4495, + "step": 9116 + }, + { + "epoch": 0.8507044881963236, + "grad_norm": 0.6654716945342147, + "learning_rate": 0.0002904438093847718, + "loss": 7.3672, + "step": 9117 + }, + { + "epoch": 0.8507977978912009, + "grad_norm": 1.0356260116092817, + "learning_rate": 0.00029044115263570345, + "loss": 7.2068, + "step": 9118 + }, + { + "epoch": 0.8508911075860782, + "grad_norm": 7.228705163691357, + "learning_rate": 0.0002904384955295349, + "loss": 7.3694, + "step": 9119 + }, + { + "epoch": 0.8509844172809555, + "grad_norm": 1.5745477806854493, + "learning_rate": 0.00029043583806627287, + "loss": 7.263, + "step": 9120 + }, + { + "epoch": 0.8510777269758328, + "grad_norm": 1.3734936642376474, + "learning_rate": 0.0002904331802459241, + "loss": 7.753, + "step": 9121 + }, + { + "epoch": 0.85117103667071, + "grad_norm": 28.270253405201952, + "learning_rate": 0.0002904305220684953, + "loss": 7.2707, + "step": 9122 + }, + { + "epoch": 0.8512643463655873, + "grad_norm": 1.0796286615243464, + "learning_rate": 0.00029042786353399334, + "loss": 7.2622, + "step": 9123 + }, + { + "epoch": 0.8513576560604647, + "grad_norm": 0.9993086928388013, + "learning_rate": 0.0002904252046424249, + "loss": 7.3822, + "step": 9124 + }, + { + "epoch": 0.851450965755342, + "grad_norm": 0.9562756149800861, + "learning_rate": 0.0002904225453937968, + "loss": 7.7052, + "step": 9125 + }, + { + "epoch": 0.8515442754502193, + "grad_norm": 1.0727554753680648, + "learning_rate": 0.0002904198857881157, + "loss": 7.217, + "step": 9126 + }, + { + "epoch": 0.8516375851450966, + "grad_norm": 0.5305180248506421, + "learning_rate": 0.00029041722582538845, + "loss": 7.3301, + "step": 9127 + }, + { + "epoch": 0.8517308948399739, + "grad_norm": 0.4300947170717326, + "learning_rate": 0.0002904145655056218, + "loss": 7.1639, + "step": 9128 + }, + { + "epoch": 0.8518242045348512, + "grad_norm": 0.43549134937220685, + "learning_rate": 0.00029041190482882246, + "loss": 7.2507, + "step": 9129 + }, + { + "epoch": 0.8519175142297285, + "grad_norm": 2.2202314962817344, + "learning_rate": 0.00029040924379499726, + "loss": 7.1732, + "step": 9130 + }, + { + "epoch": 0.8520108239246058, + "grad_norm": 5.735836800680983, + "learning_rate": 0.00029040658240415295, + "loss": 7.5252, + "step": 9131 + }, + { + "epoch": 0.8521041336194831, + "grad_norm": 1.5455785325536087, + "learning_rate": 0.0002904039206562963, + "loss": 7.5608, + "step": 9132 + }, + { + "epoch": 0.8521974433143603, + "grad_norm": 0.723072077947633, + "learning_rate": 0.0002904012585514341, + "loss": 7.235, + "step": 9133 + }, + { + "epoch": 0.8522907530092376, + "grad_norm": 0.7520021670978151, + "learning_rate": 0.0002903985960895731, + "loss": 7.6166, + "step": 9134 + }, + { + "epoch": 0.8523840627041149, + "grad_norm": 18.130452437462218, + "learning_rate": 0.00029039593327071997, + "loss": 7.4481, + "step": 9135 + }, + { + "epoch": 0.8524773723989922, + "grad_norm": 1.3213171963768438, + "learning_rate": 0.00029039327009488164, + "loss": 7.4206, + "step": 9136 + }, + { + "epoch": 0.8525706820938695, + "grad_norm": 0.6515648812065477, + "learning_rate": 0.0002903906065620648, + "loss": 7.4377, + "step": 9137 + }, + { + "epoch": 0.8526639917887469, + "grad_norm": 0.622487613588657, + "learning_rate": 0.0002903879426722762, + "loss": 7.4947, + "step": 9138 + }, + { + "epoch": 0.8527573014836242, + "grad_norm": 0.5091056795340204, + "learning_rate": 0.0002903852784255227, + "loss": 7.1289, + "step": 9139 + }, + { + "epoch": 0.8528506111785015, + "grad_norm": 1.2828040539861805, + "learning_rate": 0.000290382613821811, + "loss": 7.5404, + "step": 9140 + }, + { + "epoch": 0.8529439208733788, + "grad_norm": 5.854886770392086, + "learning_rate": 0.00029037994886114786, + "loss": 7.4121, + "step": 9141 + }, + { + "epoch": 0.8530372305682561, + "grad_norm": 0.8429512873282247, + "learning_rate": 0.00029037728354354013, + "loss": 7.4669, + "step": 9142 + }, + { + "epoch": 0.8531305402631333, + "grad_norm": 9.572273343528625, + "learning_rate": 0.00029037461786899456, + "loss": 7.5881, + "step": 9143 + }, + { + "epoch": 0.8532238499580106, + "grad_norm": 0.7590986791584096, + "learning_rate": 0.0002903719518375179, + "loss": 7.7107, + "step": 9144 + }, + { + "epoch": 0.8533171596528879, + "grad_norm": 0.641627351687729, + "learning_rate": 0.000290369285449117, + "loss": 7.5532, + "step": 9145 + }, + { + "epoch": 0.8534104693477652, + "grad_norm": 0.5931724632028884, + "learning_rate": 0.0002903666187037985, + "loss": 7.6255, + "step": 9146 + }, + { + "epoch": 0.8535037790426425, + "grad_norm": 0.8494721738268338, + "learning_rate": 0.0002903639516015693, + "loss": 7.4837, + "step": 9147 + }, + { + "epoch": 0.8535970887375198, + "grad_norm": 1.271009694706383, + "learning_rate": 0.00029036128414243614, + "loss": 7.1868, + "step": 9148 + }, + { + "epoch": 0.8536903984323971, + "grad_norm": 9.535206546736621, + "learning_rate": 0.0002903586163264059, + "loss": 7.3839, + "step": 9149 + }, + { + "epoch": 0.8537837081272744, + "grad_norm": 0.9922415247785028, + "learning_rate": 0.0002903559481534852, + "loss": 7.4216, + "step": 9150 + }, + { + "epoch": 0.8538770178221518, + "grad_norm": 1.080651882344436, + "learning_rate": 0.00029035327962368085, + "loss": 7.3442, + "step": 9151 + }, + { + "epoch": 0.8539703275170291, + "grad_norm": 2.8010414386460485, + "learning_rate": 0.0002903506107369998, + "loss": 7.1309, + "step": 9152 + }, + { + "epoch": 0.8540636372119064, + "grad_norm": 9.668695270015574, + "learning_rate": 0.0002903479414934487, + "loss": 7.4362, + "step": 9153 + }, + { + "epoch": 0.8541569469067836, + "grad_norm": 1.630092429499156, + "learning_rate": 0.0002903452718930343, + "loss": 7.8899, + "step": 9154 + }, + { + "epoch": 0.8542502566016609, + "grad_norm": 0.8692142166968249, + "learning_rate": 0.00029034260193576346, + "loss": 7.521, + "step": 9155 + }, + { + "epoch": 0.8543435662965382, + "grad_norm": 1.47967068822178, + "learning_rate": 0.000290339931621643, + "loss": 7.1172, + "step": 9156 + }, + { + "epoch": 0.8544368759914155, + "grad_norm": 5.316438973518141, + "learning_rate": 0.00029033726095067957, + "loss": 7.3363, + "step": 9157 + }, + { + "epoch": 0.8545301856862928, + "grad_norm": 1.2189489998594816, + "learning_rate": 0.00029033458992288013, + "loss": 7.42, + "step": 9158 + }, + { + "epoch": 0.8546234953811701, + "grad_norm": 1.527278070004393, + "learning_rate": 0.0002903319185382514, + "loss": 7.3687, + "step": 9159 + }, + { + "epoch": 0.8547168050760474, + "grad_norm": 5.617657765434792, + "learning_rate": 0.00029032924679680015, + "loss": 7.2417, + "step": 9160 + }, + { + "epoch": 0.8548101147709247, + "grad_norm": 0.42830189832416116, + "learning_rate": 0.0002903265746985332, + "loss": 7.3136, + "step": 9161 + }, + { + "epoch": 0.854903424465802, + "grad_norm": 1.5245797163981252, + "learning_rate": 0.00029032390224345735, + "loss": 7.6499, + "step": 9162 + }, + { + "epoch": 0.8549967341606793, + "grad_norm": 83.23137138493564, + "learning_rate": 0.0002903212294315794, + "loss": 7.5858, + "step": 9163 + }, + { + "epoch": 0.8550900438555566, + "grad_norm": 32.351725145273974, + "learning_rate": 0.0002903185562629061, + "loss": 7.4101, + "step": 9164 + }, + { + "epoch": 0.8551833535504338, + "grad_norm": 0.691942554862843, + "learning_rate": 0.00029031588273744426, + "loss": 7.4937, + "step": 9165 + }, + { + "epoch": 0.8552766632453112, + "grad_norm": 0.6574844863108793, + "learning_rate": 0.0002903132088552007, + "loss": 7.3272, + "step": 9166 + }, + { + "epoch": 0.8553699729401885, + "grad_norm": 0.6102522815567297, + "learning_rate": 0.0002903105346161822, + "loss": 7.0911, + "step": 9167 + }, + { + "epoch": 0.8554632826350658, + "grad_norm": 2.63919619244701, + "learning_rate": 0.0002903078600203957, + "loss": 7.5338, + "step": 9168 + }, + { + "epoch": 0.8555565923299431, + "grad_norm": 1.5998932356150664, + "learning_rate": 0.00029030518506784774, + "loss": 7.3619, + "step": 9169 + }, + { + "epoch": 0.8556499020248204, + "grad_norm": 0.683057631109024, + "learning_rate": 0.0002903025097585453, + "loss": 7.4521, + "step": 9170 + }, + { + "epoch": 0.8557432117196977, + "grad_norm": 0.6371316657771069, + "learning_rate": 0.0002902998340924951, + "loss": 7.5485, + "step": 9171 + }, + { + "epoch": 0.855836521414575, + "grad_norm": 0.7988119095971816, + "learning_rate": 0.00029029715806970404, + "loss": 7.3261, + "step": 9172 + }, + { + "epoch": 0.8559298311094523, + "grad_norm": 0.6633472324479693, + "learning_rate": 0.0002902944816901788, + "loss": 7.3206, + "step": 9173 + }, + { + "epoch": 0.8560231408043296, + "grad_norm": 0.922573383200562, + "learning_rate": 0.0002902918049539263, + "loss": 7.3807, + "step": 9174 + }, + { + "epoch": 0.8561164504992068, + "grad_norm": 6.089464865280774, + "learning_rate": 0.0002902891278609533, + "loss": 7.2981, + "step": 9175 + }, + { + "epoch": 0.8562097601940841, + "grad_norm": 2.1723358964737227, + "learning_rate": 0.00029028645041126656, + "loss": 7.0886, + "step": 9176 + }, + { + "epoch": 0.8563030698889614, + "grad_norm": 1.1941896959182654, + "learning_rate": 0.000290283772604873, + "loss": 7.2601, + "step": 9177 + }, + { + "epoch": 0.8563963795838387, + "grad_norm": 0.4579778284890735, + "learning_rate": 0.0002902810944417793, + "loss": 6.9848, + "step": 9178 + }, + { + "epoch": 0.856489689278716, + "grad_norm": 0.45409921858168745, + "learning_rate": 0.00029027841592199236, + "loss": 7.3925, + "step": 9179 + }, + { + "epoch": 0.8565829989735934, + "grad_norm": 1.7817137602158053, + "learning_rate": 0.00029027573704551896, + "loss": 8.0706, + "step": 9180 + }, + { + "epoch": 0.8566763086684707, + "grad_norm": 1.268655144966194, + "learning_rate": 0.00029027305781236597, + "loss": 7.4496, + "step": 9181 + }, + { + "epoch": 0.856769618363348, + "grad_norm": 1.302918732352721, + "learning_rate": 0.00029027037822254006, + "loss": 7.1556, + "step": 9182 + }, + { + "epoch": 0.8568629280582253, + "grad_norm": 1.535081447645429, + "learning_rate": 0.0002902676982760481, + "loss": 7.3286, + "step": 9183 + }, + { + "epoch": 0.8569562377531026, + "grad_norm": 18.139353051142947, + "learning_rate": 0.00029026501797289704, + "loss": 7.2201, + "step": 9184 + }, + { + "epoch": 0.8570495474479799, + "grad_norm": 8.832249709798567, + "learning_rate": 0.00029026233731309353, + "loss": 7.1562, + "step": 9185 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 1.6327677949651245, + "learning_rate": 0.0002902596562966445, + "loss": 7.6926, + "step": 9186 + }, + { + "epoch": 0.8572361668377344, + "grad_norm": 1.9918805306056964, + "learning_rate": 0.0002902569749235566, + "loss": 7.711, + "step": 9187 + }, + { + "epoch": 0.8573294765326117, + "grad_norm": 0.540615017278725, + "learning_rate": 0.0002902542931938369, + "loss": 7.3652, + "step": 9188 + }, + { + "epoch": 0.857422786227489, + "grad_norm": 6.669117115086091, + "learning_rate": 0.000290251611107492, + "loss": 7.4817, + "step": 9189 + }, + { + "epoch": 0.8575160959223663, + "grad_norm": 0.7855149830051762, + "learning_rate": 0.0002902489286645288, + "loss": 7.2928, + "step": 9190 + }, + { + "epoch": 0.8576094056172436, + "grad_norm": 0.8385290853311074, + "learning_rate": 0.0002902462458649541, + "loss": 7.4231, + "step": 9191 + }, + { + "epoch": 0.857702715312121, + "grad_norm": 41.074634004822066, + "learning_rate": 0.0002902435627087748, + "loss": 7.3942, + "step": 9192 + }, + { + "epoch": 0.8577960250069983, + "grad_norm": 1.0199489719531016, + "learning_rate": 0.00029024087919599757, + "loss": 7.4238, + "step": 9193 + }, + { + "epoch": 0.8578893347018756, + "grad_norm": 2.38069630181559, + "learning_rate": 0.0002902381953266294, + "loss": 7.3233, + "step": 9194 + }, + { + "epoch": 0.8579826443967529, + "grad_norm": 1.588582164803448, + "learning_rate": 0.00029023551110067704, + "loss": 7.1392, + "step": 9195 + }, + { + "epoch": 0.8580759540916301, + "grad_norm": 1.0411036190649556, + "learning_rate": 0.0002902328265181473, + "loss": 7.1998, + "step": 9196 + }, + { + "epoch": 0.8581692637865074, + "grad_norm": 44.55660136304231, + "learning_rate": 0.00029023014157904703, + "loss": 7.2697, + "step": 9197 + }, + { + "epoch": 0.8582625734813847, + "grad_norm": 1.26208720975866, + "learning_rate": 0.00029022745628338304, + "loss": 7.2585, + "step": 9198 + }, + { + "epoch": 0.858355883176262, + "grad_norm": 0.6637196183693954, + "learning_rate": 0.0002902247706311622, + "loss": 7.3786, + "step": 9199 + }, + { + "epoch": 0.8584491928711393, + "grad_norm": 0.6699878333207858, + "learning_rate": 0.0002902220846223912, + "loss": 7.1026, + "step": 9200 + }, + { + "epoch": 0.8585425025660166, + "grad_norm": 1.0717330504063214, + "learning_rate": 0.0002902193982570771, + "loss": 7.4475, + "step": 9201 + }, + { + "epoch": 0.8586358122608939, + "grad_norm": 0.7086109951484364, + "learning_rate": 0.00029021671153522656, + "loss": 7.5011, + "step": 9202 + }, + { + "epoch": 0.8587291219557712, + "grad_norm": 83.42645216305509, + "learning_rate": 0.00029021402445684644, + "loss": 7.2794, + "step": 9203 + }, + { + "epoch": 0.8588224316506485, + "grad_norm": 0.9450640600374857, + "learning_rate": 0.00029021133702194363, + "loss": 7.3907, + "step": 9204 + }, + { + "epoch": 0.8589157413455258, + "grad_norm": 68.42270756267882, + "learning_rate": 0.0002902086492305249, + "loss": 7.1858, + "step": 9205 + }, + { + "epoch": 0.8590090510404031, + "grad_norm": 1.4130314287575974, + "learning_rate": 0.0002902059610825971, + "loss": 7.7705, + "step": 9206 + }, + { + "epoch": 0.8591023607352803, + "grad_norm": 23.6789732217836, + "learning_rate": 0.00029020327257816707, + "loss": 7.0653, + "step": 9207 + }, + { + "epoch": 0.8591956704301577, + "grad_norm": 434.5497762430438, + "learning_rate": 0.00029020058371724165, + "loss": 7.6231, + "step": 9208 + }, + { + "epoch": 0.859288980125035, + "grad_norm": 12.807199461851546, + "learning_rate": 0.0002901978944998277, + "loss": 7.4469, + "step": 9209 + }, + { + "epoch": 0.8593822898199123, + "grad_norm": 3.778215572754037, + "learning_rate": 0.00029019520492593205, + "loss": 7.4683, + "step": 9210 + }, + { + "epoch": 0.8594755995147896, + "grad_norm": 350.2638272700393, + "learning_rate": 0.0002901925149955615, + "loss": 7.2502, + "step": 9211 + }, + { + "epoch": 0.8595689092096669, + "grad_norm": 1.0246085256958137, + "learning_rate": 0.0002901898247087229, + "loss": 7.2077, + "step": 9212 + }, + { + "epoch": 0.8596622189045442, + "grad_norm": 1.1288699482361868, + "learning_rate": 0.00029018713406542306, + "loss": 7.2743, + "step": 9213 + }, + { + "epoch": 0.8597555285994215, + "grad_norm": 1.0162701456509333, + "learning_rate": 0.0002901844430656689, + "loss": 7.0188, + "step": 9214 + }, + { + "epoch": 0.8598488382942988, + "grad_norm": 1.165965833112428, + "learning_rate": 0.0002901817517094673, + "loss": 7.6325, + "step": 9215 + }, + { + "epoch": 0.8599421479891761, + "grad_norm": 1.180276771344227, + "learning_rate": 0.00029017905999682495, + "loss": 7.2177, + "step": 9216 + }, + { + "epoch": 0.8600354576840534, + "grad_norm": 0.9647647218676743, + "learning_rate": 0.0002901763679277488, + "loss": 7.4491, + "step": 9217 + }, + { + "epoch": 0.8601287673789306, + "grad_norm": 0.7429900641056336, + "learning_rate": 0.0002901736755022456, + "loss": 7.4104, + "step": 9218 + }, + { + "epoch": 0.8602220770738079, + "grad_norm": 616.2104213510681, + "learning_rate": 0.00029017098272032235, + "loss": 7.359, + "step": 9219 + }, + { + "epoch": 0.8603153867686852, + "grad_norm": 0.9666155200708137, + "learning_rate": 0.0002901682895819858, + "loss": 7.3383, + "step": 9220 + }, + { + "epoch": 0.8604086964635626, + "grad_norm": 1617.7543011612902, + "learning_rate": 0.0002901655960872429, + "loss": 7.38, + "step": 9221 + }, + { + "epoch": 0.8605020061584399, + "grad_norm": 0.73750672536873, + "learning_rate": 0.00029016290223610023, + "loss": 7.3188, + "step": 9222 + }, + { + "epoch": 0.8605953158533172, + "grad_norm": 0.7266041431490281, + "learning_rate": 0.0002901602080285649, + "loss": 7.3303, + "step": 9223 + }, + { + "epoch": 0.8606886255481945, + "grad_norm": 0.5288993250089471, + "learning_rate": 0.0002901575134646437, + "loss": 7.1319, + "step": 9224 + }, + { + "epoch": 0.8607819352430718, + "grad_norm": 868.5320931727593, + "learning_rate": 0.00029015481854434346, + "loss": 7.0799, + "step": 9225 + }, + { + "epoch": 0.8608752449379491, + "grad_norm": 0.36451917671476797, + "learning_rate": 0.00029015212326767104, + "loss": 7.1809, + "step": 9226 + }, + { + "epoch": 0.8609685546328264, + "grad_norm": 2.8353320395997605, + "learning_rate": 0.00029014942763463323, + "loss": 7.0996, + "step": 9227 + }, + { + "epoch": 0.8610618643277036, + "grad_norm": 2363.5528743391874, + "learning_rate": 0.000290146731645237, + "loss": 7.4017, + "step": 9228 + }, + { + "epoch": 0.8611551740225809, + "grad_norm": 1.00262037841339, + "learning_rate": 0.00029014403529948907, + "loss": 7.521, + "step": 9229 + }, + { + "epoch": 0.8612484837174582, + "grad_norm": 4308.200937203872, + "learning_rate": 0.00029014133859739643, + "loss": 7.313, + "step": 9230 + }, + { + "epoch": 0.8613417934123355, + "grad_norm": 1.8028517305605374, + "learning_rate": 0.0002901386415389659, + "loss": 7.33, + "step": 9231 + }, + { + "epoch": 0.8614351031072128, + "grad_norm": 2766.6067811428343, + "learning_rate": 0.00029013594412420423, + "loss": 7.3397, + "step": 9232 + }, + { + "epoch": 0.8615284128020901, + "grad_norm": 1.6037389794836188, + "learning_rate": 0.00029013324635311844, + "loss": 7.192, + "step": 9233 + }, + { + "epoch": 0.8616217224969674, + "grad_norm": 0.9340627469171952, + "learning_rate": 0.0002901305482257153, + "loss": 7.4198, + "step": 9234 + }, + { + "epoch": 0.8617150321918448, + "grad_norm": 1.1985065148604597, + "learning_rate": 0.0002901278497420017, + "loss": 7.4032, + "step": 9235 + }, + { + "epoch": 0.8618083418867221, + "grad_norm": 0.7999479819847516, + "learning_rate": 0.00029012515090198445, + "loss": 7.342, + "step": 9236 + }, + { + "epoch": 0.8619016515815994, + "grad_norm": 0.9409973117285511, + "learning_rate": 0.00029012245170567045, + "loss": 7.1647, + "step": 9237 + }, + { + "epoch": 0.8619949612764767, + "grad_norm": 27814.92073085154, + "learning_rate": 0.00029011975215306664, + "loss": 7.5018, + "step": 9238 + }, + { + "epoch": 0.8620882709713539, + "grad_norm": 0.8656035604909127, + "learning_rate": 0.0002901170522441797, + "loss": 7.5893, + "step": 9239 + }, + { + "epoch": 0.8621815806662312, + "grad_norm": 5596.502114723754, + "learning_rate": 0.0002901143519790167, + "loss": 7.21, + "step": 9240 + }, + { + "epoch": 0.8622748903611085, + "grad_norm": 1.9139027503427029, + "learning_rate": 0.00029011165135758437, + "loss": 7.5228, + "step": 9241 + }, + { + "epoch": 0.8623682000559858, + "grad_norm": 16.197114920725298, + "learning_rate": 0.00029010895037988955, + "loss": 7.0948, + "step": 9242 + }, + { + "epoch": 0.8624615097508631, + "grad_norm": 0.9212584897671706, + "learning_rate": 0.00029010624904593924, + "loss": 7.7076, + "step": 9243 + }, + { + "epoch": 0.8625548194457404, + "grad_norm": 0.6965624564307095, + "learning_rate": 0.0002901035473557402, + "loss": 7.5408, + "step": 9244 + }, + { + "epoch": 0.8626481291406177, + "grad_norm": 0.9073037926482921, + "learning_rate": 0.0002901008453092994, + "loss": 7.1727, + "step": 9245 + }, + { + "epoch": 0.862741438835495, + "grad_norm": 8.263255971140005, + "learning_rate": 0.0002900981429066236, + "loss": 7.2638, + "step": 9246 + }, + { + "epoch": 0.8628347485303723, + "grad_norm": 0.9914391686408414, + "learning_rate": 0.0002900954401477198, + "loss": 7.4174, + "step": 9247 + }, + { + "epoch": 0.8629280582252497, + "grad_norm": 2.0907647971064893, + "learning_rate": 0.0002900927370325947, + "loss": 7.3381, + "step": 9248 + }, + { + "epoch": 0.8630213679201268, + "grad_norm": 4.89403906671189, + "learning_rate": 0.0002900900335612553, + "loss": 7.3484, + "step": 9249 + }, + { + "epoch": 0.8631146776150042, + "grad_norm": 2.199038408438771, + "learning_rate": 0.00029008732973370845, + "loss": 7.4016, + "step": 9250 + }, + { + "epoch": 0.8632079873098815, + "grad_norm": 473678.1724810268, + "learning_rate": 0.000290084625549961, + "loss": 7.2939, + "step": 9251 + }, + { + "epoch": 0.8633012970047588, + "grad_norm": 0.7340639310843535, + "learning_rate": 0.00029008192101001985, + "loss": 7.1898, + "step": 9252 + }, + { + "epoch": 0.8633946066996361, + "grad_norm": 1.9384002395343716, + "learning_rate": 0.0002900792161138919, + "loss": 7.2206, + "step": 9253 + }, + { + "epoch": 0.8634879163945134, + "grad_norm": 2.0705231411964244, + "learning_rate": 0.00029007651086158394, + "loss": 7.4289, + "step": 9254 + }, + { + "epoch": 0.8635812260893907, + "grad_norm": 3.9067073645417163, + "learning_rate": 0.000290073805253103, + "loss": 7.5959, + "step": 9255 + }, + { + "epoch": 0.863674535784268, + "grad_norm": 1.7195805534967843, + "learning_rate": 0.0002900710992884558, + "loss": 7.5264, + "step": 9256 + }, + { + "epoch": 0.8637678454791453, + "grad_norm": 1.553354624861099, + "learning_rate": 0.0002900683929676493, + "loss": 7.3291, + "step": 9257 + }, + { + "epoch": 0.8638611551740226, + "grad_norm": 1.233888417370732, + "learning_rate": 0.0002900656862906903, + "loss": 7.3503, + "step": 9258 + }, + { + "epoch": 0.8639544648688999, + "grad_norm": 73.27662752052544, + "learning_rate": 0.00029006297925758583, + "loss": 7.5406, + "step": 9259 + }, + { + "epoch": 0.8640477745637771, + "grad_norm": 4.525374775681682, + "learning_rate": 0.0002900602718683427, + "loss": 7.2187, + "step": 9260 + }, + { + "epoch": 0.8641410842586544, + "grad_norm": 0.9889103577022225, + "learning_rate": 0.00029005756412296777, + "loss": 7.4574, + "step": 9261 + }, + { + "epoch": 0.8642343939535317, + "grad_norm": 1.4724351567533092, + "learning_rate": 0.00029005485602146794, + "loss": 7.2168, + "step": 9262 + }, + { + "epoch": 0.864327703648409, + "grad_norm": 1.9997327963759002, + "learning_rate": 0.0002900521475638501, + "loss": 7.4485, + "step": 9263 + }, + { + "epoch": 0.8644210133432864, + "grad_norm": 1.7213449035322657, + "learning_rate": 0.0002900494387501211, + "loss": 7.4646, + "step": 9264 + }, + { + "epoch": 0.8645143230381637, + "grad_norm": 0.8340470850783823, + "learning_rate": 0.0002900467295802879, + "loss": 7.5053, + "step": 9265 + }, + { + "epoch": 0.864607632733041, + "grad_norm": 2.091554811393442, + "learning_rate": 0.00029004402005435736, + "loss": 7.2011, + "step": 9266 + }, + { + "epoch": 0.8647009424279183, + "grad_norm": 1.2347888151098612, + "learning_rate": 0.0002900413101723364, + "loss": 7.5401, + "step": 9267 + }, + { + "epoch": 0.8647942521227956, + "grad_norm": 0.9611614402826806, + "learning_rate": 0.00029003859993423175, + "loss": 7.1943, + "step": 9268 + }, + { + "epoch": 0.8648875618176729, + "grad_norm": 1.0761469497095015, + "learning_rate": 0.0002900358893400505, + "loss": 7.4677, + "step": 9269 + }, + { + "epoch": 0.8649808715125502, + "grad_norm": 27.923997004219174, + "learning_rate": 0.00029003317838979944, + "loss": 7.4837, + "step": 9270 + }, + { + "epoch": 0.8650741812074274, + "grad_norm": 1.1898976084114607, + "learning_rate": 0.0002900304670834855, + "loss": 7.2781, + "step": 9271 + }, + { + "epoch": 0.8651674909023047, + "grad_norm": 1.2568083257954359, + "learning_rate": 0.0002900277554211156, + "loss": 7.5653, + "step": 9272 + }, + { + "epoch": 0.865260800597182, + "grad_norm": 0.779043399904389, + "learning_rate": 0.00029002504340269655, + "loss": 7.5188, + "step": 9273 + }, + { + "epoch": 0.8653541102920593, + "grad_norm": 1.1136545202803823, + "learning_rate": 0.00029002233102823534, + "loss": 7.4177, + "step": 9274 + }, + { + "epoch": 0.8654474199869366, + "grad_norm": 0.780874236144765, + "learning_rate": 0.0002900196182977387, + "loss": 7.3928, + "step": 9275 + }, + { + "epoch": 0.865540729681814, + "grad_norm": 1.0761397863976132, + "learning_rate": 0.00029001690521121375, + "loss": 7.643, + "step": 9276 + }, + { + "epoch": 0.8656340393766913, + "grad_norm": 17.257720948179724, + "learning_rate": 0.0002900141917686673, + "loss": 7.3474, + "step": 9277 + }, + { + "epoch": 0.8657273490715686, + "grad_norm": 1.2568211649708592, + "learning_rate": 0.0002900114779701062, + "loss": 7.46, + "step": 9278 + }, + { + "epoch": 0.8658206587664459, + "grad_norm": 1.2573202955740326, + "learning_rate": 0.0002900087638155374, + "loss": 7.3044, + "step": 9279 + }, + { + "epoch": 0.8659139684613232, + "grad_norm": 0.9107477459644798, + "learning_rate": 0.0002900060493049678, + "loss": 7.3096, + "step": 9280 + }, + { + "epoch": 0.8660072781562004, + "grad_norm": 16.767143467822457, + "learning_rate": 0.00029000333443840427, + "loss": 7.6371, + "step": 9281 + }, + { + "epoch": 0.8661005878510777, + "grad_norm": 1.0924515510188808, + "learning_rate": 0.0002900006192158537, + "loss": 7.4574, + "step": 9282 + }, + { + "epoch": 0.866193897545955, + "grad_norm": 1.1252973017344243, + "learning_rate": 0.00028999790363732307, + "loss": 7.3733, + "step": 9283 + }, + { + "epoch": 0.8662872072408323, + "grad_norm": 0.8112418028128667, + "learning_rate": 0.00028999518770281925, + "loss": 7.2783, + "step": 9284 + }, + { + "epoch": 0.8663805169357096, + "grad_norm": 1.0099652007276265, + "learning_rate": 0.00028999247141234914, + "loss": 7.1477, + "step": 9285 + }, + { + "epoch": 0.8664738266305869, + "grad_norm": 1.040102299428328, + "learning_rate": 0.00028998975476591957, + "loss": 7.1222, + "step": 9286 + }, + { + "epoch": 0.8665671363254642, + "grad_norm": 6.516560305686237, + "learning_rate": 0.0002899870377635376, + "loss": 7.4309, + "step": 9287 + }, + { + "epoch": 0.8666604460203415, + "grad_norm": 1.5333935346847007, + "learning_rate": 0.00028998432040521, + "loss": 7.7202, + "step": 9288 + }, + { + "epoch": 0.8667537557152188, + "grad_norm": 27.930902195937236, + "learning_rate": 0.0002899816026909438, + "loss": 7.3192, + "step": 9289 + }, + { + "epoch": 0.8668470654100962, + "grad_norm": 40.08587729763829, + "learning_rate": 0.0002899788846207458, + "loss": 7.6161, + "step": 9290 + }, + { + "epoch": 0.8669403751049735, + "grad_norm": 1.5103646986285983, + "learning_rate": 0.000289976166194623, + "loss": 7.1933, + "step": 9291 + }, + { + "epoch": 0.8670336847998507, + "grad_norm": 0.9945776161237706, + "learning_rate": 0.00028997344741258225, + "loss": 7.5937, + "step": 9292 + }, + { + "epoch": 0.867126994494728, + "grad_norm": 1.0293843081243574, + "learning_rate": 0.0002899707282746305, + "loss": 7.5, + "step": 9293 + }, + { + "epoch": 0.8672203041896053, + "grad_norm": 0.7137095198170091, + "learning_rate": 0.00028996800878077465, + "loss": 7.4245, + "step": 9294 + }, + { + "epoch": 0.8673136138844826, + "grad_norm": 4.760055360300395, + "learning_rate": 0.0002899652889310216, + "loss": 7.2519, + "step": 9295 + }, + { + "epoch": 0.8674069235793599, + "grad_norm": 1.1043163746343971, + "learning_rate": 0.00028996256872537826, + "loss": 7.5963, + "step": 9296 + }, + { + "epoch": 0.8675002332742372, + "grad_norm": 0.6554627351362475, + "learning_rate": 0.0002899598481638516, + "loss": 7.0294, + "step": 9297 + }, + { + "epoch": 0.8675935429691145, + "grad_norm": 1.5066693674140226, + "learning_rate": 0.0002899571272464485, + "loss": 7.5836, + "step": 9298 + }, + { + "epoch": 0.8676868526639918, + "grad_norm": 1.6859456910748978, + "learning_rate": 0.0002899544059731759, + "loss": 7.8255, + "step": 9299 + }, + { + "epoch": 0.8677801623588691, + "grad_norm": 82.185260720477, + "learning_rate": 0.00028995168434404073, + "loss": 7.5003, + "step": 9300 + }, + { + "epoch": 0.8678734720537464, + "grad_norm": 17.801319717799185, + "learning_rate": 0.0002899489623590498, + "loss": 7.7104, + "step": 9301 + }, + { + "epoch": 0.8679667817486236, + "grad_norm": 0.7123602656076877, + "learning_rate": 0.0002899462400182102, + "loss": 7.4009, + "step": 9302 + }, + { + "epoch": 0.8680600914435009, + "grad_norm": 1289.0050646485583, + "learning_rate": 0.00028994351732152874, + "loss": 8.0357, + "step": 9303 + }, + { + "epoch": 0.8681534011383782, + "grad_norm": 1.7570927955650304, + "learning_rate": 0.0002899407942690123, + "loss": 7.28, + "step": 9304 + }, + { + "epoch": 0.8682467108332556, + "grad_norm": 0.8377070367449853, + "learning_rate": 0.0002899380708606679, + "loss": 7.5978, + "step": 9305 + }, + { + "epoch": 0.8683400205281329, + "grad_norm": 0.7270485251119472, + "learning_rate": 0.00028993534709650254, + "loss": 7.3553, + "step": 9306 + }, + { + "epoch": 0.8684333302230102, + "grad_norm": 0.836151658807595, + "learning_rate": 0.0002899326229765229, + "loss": 7.3707, + "step": 9307 + }, + { + "epoch": 0.8685266399178875, + "grad_norm": 0.7222959080466048, + "learning_rate": 0.0002899298985007362, + "loss": 7.1477, + "step": 9308 + }, + { + "epoch": 0.8686199496127648, + "grad_norm": 0.6852553652361597, + "learning_rate": 0.00028992717366914915, + "loss": 7.3973, + "step": 9309 + }, + { + "epoch": 0.8687132593076421, + "grad_norm": 4.367656795746897, + "learning_rate": 0.0002899244484817687, + "loss": 7.6306, + "step": 9310 + }, + { + "epoch": 0.8688065690025194, + "grad_norm": 0.9624852813704191, + "learning_rate": 0.0002899217229386019, + "loss": 7.5162, + "step": 9311 + }, + { + "epoch": 0.8688998786973967, + "grad_norm": 0.483334072859049, + "learning_rate": 0.0002899189970396555, + "loss": 7.4241, + "step": 9312 + }, + { + "epoch": 0.8689931883922739, + "grad_norm": 4.613948775723939, + "learning_rate": 0.0002899162707849366, + "loss": 7.5239, + "step": 9313 + }, + { + "epoch": 0.8690864980871512, + "grad_norm": 20.20494820346809, + "learning_rate": 0.0002899135441744521, + "loss": 7.402, + "step": 9314 + }, + { + "epoch": 0.8691798077820285, + "grad_norm": 1.0276358575097857, + "learning_rate": 0.00028991081720820887, + "loss": 7.2033, + "step": 9315 + }, + { + "epoch": 0.8692731174769058, + "grad_norm": 1.3394420951682044, + "learning_rate": 0.0002899080898862139, + "loss": 7.2997, + "step": 9316 + }, + { + "epoch": 0.8693664271717831, + "grad_norm": 31.794082212989327, + "learning_rate": 0.00028990536220847407, + "loss": 7.788, + "step": 9317 + }, + { + "epoch": 0.8694597368666604, + "grad_norm": 0.9225587793163454, + "learning_rate": 0.0002899026341749964, + "loss": 7.1377, + "step": 9318 + }, + { + "epoch": 0.8695530465615378, + "grad_norm": 18.01350389946765, + "learning_rate": 0.0002898999057857877, + "loss": 7.1783, + "step": 9319 + }, + { + "epoch": 0.8696463562564151, + "grad_norm": 0.7601098377700087, + "learning_rate": 0.000289897177040855, + "loss": 7.2773, + "step": 9320 + }, + { + "epoch": 0.8697396659512924, + "grad_norm": 0.7190186063668812, + "learning_rate": 0.0002898944479402053, + "loss": 7.0258, + "step": 9321 + }, + { + "epoch": 0.8698329756461697, + "grad_norm": 1.486265036215435, + "learning_rate": 0.00028989171848384536, + "loss": 7.9178, + "step": 9322 + }, + { + "epoch": 0.869926285341047, + "grad_norm": 1.0324636799929279, + "learning_rate": 0.0002898889886717823, + "loss": 7.1174, + "step": 9323 + }, + { + "epoch": 0.8700195950359242, + "grad_norm": 0.6951325497419251, + "learning_rate": 0.0002898862585040229, + "loss": 7.2419, + "step": 9324 + }, + { + "epoch": 0.8701129047308015, + "grad_norm": 0.661888342971145, + "learning_rate": 0.00028988352798057425, + "loss": 7.3986, + "step": 9325 + }, + { + "epoch": 0.8702062144256788, + "grad_norm": 163.0746855713323, + "learning_rate": 0.0002898807971014432, + "loss": 7.6765, + "step": 9326 + }, + { + "epoch": 0.8702995241205561, + "grad_norm": 1.1670313039017648, + "learning_rate": 0.00028987806586663667, + "loss": 7.4186, + "step": 9327 + }, + { + "epoch": 0.8703928338154334, + "grad_norm": 162.813301996845, + "learning_rate": 0.0002898753342761617, + "loss": 7.4619, + "step": 9328 + }, + { + "epoch": 0.8704861435103107, + "grad_norm": 0.743454905806292, + "learning_rate": 0.00028987260233002515, + "loss": 7.3969, + "step": 9329 + }, + { + "epoch": 0.870579453205188, + "grad_norm": 0.6425076332132406, + "learning_rate": 0.0002898698700282341, + "loss": 7.3759, + "step": 9330 + }, + { + "epoch": 0.8706727629000653, + "grad_norm": 0.6323583091682624, + "learning_rate": 0.0002898671373707953, + "loss": 7.2032, + "step": 9331 + }, + { + "epoch": 0.8707660725949427, + "grad_norm": 0.8575800157836925, + "learning_rate": 0.00028986440435771585, + "loss": 7.4472, + "step": 9332 + }, + { + "epoch": 0.87085938228982, + "grad_norm": 1.4344469896474776, + "learning_rate": 0.0002898616709890027, + "loss": 7.5416, + "step": 9333 + }, + { + "epoch": 0.8709526919846972, + "grad_norm": 217.94737427407205, + "learning_rate": 0.0002898589372646626, + "loss": 7.0215, + "step": 9334 + }, + { + "epoch": 0.8710460016795745, + "grad_norm": 0.6693223231213795, + "learning_rate": 0.0002898562031847028, + "loss": 7.278, + "step": 9335 + }, + { + "epoch": 0.8711393113744518, + "grad_norm": 1.2628239040251, + "learning_rate": 0.00028985346874913, + "loss": 7.4677, + "step": 9336 + }, + { + "epoch": 0.8712326210693291, + "grad_norm": 0.5473480132963986, + "learning_rate": 0.00028985073395795135, + "loss": 7.5111, + "step": 9337 + }, + { + "epoch": 0.8713259307642064, + "grad_norm": 1.249128251669033, + "learning_rate": 0.00028984799881117367, + "loss": 7.215, + "step": 9338 + }, + { + "epoch": 0.8714192404590837, + "grad_norm": 0.5796910122473761, + "learning_rate": 0.00028984526330880396, + "loss": 7.268, + "step": 9339 + }, + { + "epoch": 0.871512550153961, + "grad_norm": 0.7831634343964876, + "learning_rate": 0.00028984252745084917, + "loss": 7.2399, + "step": 9340 + }, + { + "epoch": 0.8716058598488383, + "grad_norm": 0.9292355022607348, + "learning_rate": 0.00028983979123731624, + "loss": 7.3188, + "step": 9341 + }, + { + "epoch": 0.8716991695437156, + "grad_norm": 3.065570690191814, + "learning_rate": 0.00028983705466821216, + "loss": 7.4224, + "step": 9342 + }, + { + "epoch": 0.8717924792385929, + "grad_norm": 0.8323808273346878, + "learning_rate": 0.00028983431774354387, + "loss": 7.4465, + "step": 9343 + }, + { + "epoch": 0.8718857889334702, + "grad_norm": 0.4880452924095542, + "learning_rate": 0.0002898315804633183, + "loss": 7.2152, + "step": 9344 + }, + { + "epoch": 0.8719790986283474, + "grad_norm": 38.343679841550625, + "learning_rate": 0.0002898288428275425, + "loss": 7.7839, + "step": 9345 + }, + { + "epoch": 0.8720724083232247, + "grad_norm": 1.000912201880045, + "learning_rate": 0.00028982610483622333, + "loss": 7.1809, + "step": 9346 + }, + { + "epoch": 0.872165718018102, + "grad_norm": 265.30490110339645, + "learning_rate": 0.00028982336648936786, + "loss": 7.263, + "step": 9347 + }, + { + "epoch": 0.8722590277129794, + "grad_norm": 5.6586320574372415, + "learning_rate": 0.0002898206277869829, + "loss": 7.3529, + "step": 9348 + }, + { + "epoch": 0.8723523374078567, + "grad_norm": 378.4771743972688, + "learning_rate": 0.00028981788872907557, + "loss": 7.5482, + "step": 9349 + }, + { + "epoch": 0.872445647102734, + "grad_norm": 0.9984618974267792, + "learning_rate": 0.0002898151493156527, + "loss": 7.2006, + "step": 9350 + }, + { + "epoch": 0.8725389567976113, + "grad_norm": 1.2973757197385405, + "learning_rate": 0.0002898124095467214, + "loss": 7.6214, + "step": 9351 + }, + { + "epoch": 0.8726322664924886, + "grad_norm": 0.7576716963543044, + "learning_rate": 0.0002898096694222885, + "loss": 7.3015, + "step": 9352 + }, + { + "epoch": 0.8727255761873659, + "grad_norm": 502.2473268384202, + "learning_rate": 0.00028980692894236106, + "loss": 7.1784, + "step": 9353 + }, + { + "epoch": 0.8728188858822432, + "grad_norm": 0.757258175111546, + "learning_rate": 0.00028980418810694593, + "loss": 7.2904, + "step": 9354 + }, + { + "epoch": 0.8729121955771204, + "grad_norm": 244.40183377291564, + "learning_rate": 0.00028980144691605024, + "loss": 7.182, + "step": 9355 + }, + { + "epoch": 0.8730055052719977, + "grad_norm": 662.5875940371474, + "learning_rate": 0.00028979870536968087, + "loss": 7.341, + "step": 9356 + }, + { + "epoch": 0.873098814966875, + "grad_norm": 1.4094993020382183, + "learning_rate": 0.0002897959634678448, + "loss": 7.6185, + "step": 9357 + }, + { + "epoch": 0.8731921246617523, + "grad_norm": 0.6622355715585327, + "learning_rate": 0.000289793221210549, + "loss": 7.3622, + "step": 9358 + }, + { + "epoch": 0.8732854343566296, + "grad_norm": 0.6646830890079928, + "learning_rate": 0.0002897904785978005, + "loss": 8.0747, + "step": 9359 + }, + { + "epoch": 0.873378744051507, + "grad_norm": 1.454422950046978, + "learning_rate": 0.0002897877356296061, + "loss": 7.3018, + "step": 9360 + }, + { + "epoch": 0.8734720537463843, + "grad_norm": 7.242833247609325, + "learning_rate": 0.00028978499230597303, + "loss": 7.7398, + "step": 9361 + }, + { + "epoch": 0.8735653634412616, + "grad_norm": 1.3666290812319861, + "learning_rate": 0.00028978224862690805, + "loss": 7.2714, + "step": 9362 + }, + { + "epoch": 0.8736586731361389, + "grad_norm": 1.033401064686736, + "learning_rate": 0.0002897795045924183, + "loss": 7.4512, + "step": 9363 + }, + { + "epoch": 0.8737519828310162, + "grad_norm": 2.2186133671789814, + "learning_rate": 0.0002897767602025106, + "loss": 7.3392, + "step": 9364 + }, + { + "epoch": 0.8738452925258935, + "grad_norm": 6.589402775308127, + "learning_rate": 0.000289774015457192, + "loss": 7.9287, + "step": 9365 + }, + { + "epoch": 0.8739386022207707, + "grad_norm": 2.0975722774122922, + "learning_rate": 0.0002897712703564695, + "loss": 7.8676, + "step": 9366 + }, + { + "epoch": 0.874031911915648, + "grad_norm": 1.4737293396287194, + "learning_rate": 0.00028976852490035005, + "loss": 7.6497, + "step": 9367 + }, + { + "epoch": 0.8741252216105253, + "grad_norm": 1.0048300288772438, + "learning_rate": 0.0002897657790888407, + "loss": 7.3757, + "step": 9368 + }, + { + "epoch": 0.8742185313054026, + "grad_norm": 2.043201094832899, + "learning_rate": 0.0002897630329219484, + "loss": 7.4779, + "step": 9369 + }, + { + "epoch": 0.8743118410002799, + "grad_norm": 2.05995049869907, + "learning_rate": 0.00028976028639968003, + "loss": 7.5061, + "step": 9370 + }, + { + "epoch": 0.8744051506951572, + "grad_norm": 14.523659877062416, + "learning_rate": 0.0002897575395220426, + "loss": 7.3383, + "step": 9371 + }, + { + "epoch": 0.8744984603900345, + "grad_norm": 1.85122046954678, + "learning_rate": 0.0002897547922890432, + "loss": 7.3125, + "step": 9372 + }, + { + "epoch": 0.8745917700849118, + "grad_norm": 1.007825508706391, + "learning_rate": 0.0002897520447006888, + "loss": 7.2357, + "step": 9373 + }, + { + "epoch": 0.8746850797797892, + "grad_norm": 1.8955342060099922, + "learning_rate": 0.00028974929675698634, + "loss": 7.3926, + "step": 9374 + }, + { + "epoch": 0.8747783894746665, + "grad_norm": 2.2175355614901395, + "learning_rate": 0.0002897465484579428, + "loss": 7.5644, + "step": 9375 + }, + { + "epoch": 0.8748716991695438, + "grad_norm": 1.4973076491636053, + "learning_rate": 0.0002897437998035652, + "loss": 7.1488, + "step": 9376 + }, + { + "epoch": 0.874965008864421, + "grad_norm": 0.8343010475604473, + "learning_rate": 0.0002897410507938605, + "loss": 7.384, + "step": 9377 + }, + { + "epoch": 0.8750583185592983, + "grad_norm": 8.421945822722337, + "learning_rate": 0.00028973830142883566, + "loss": 7.5375, + "step": 9378 + }, + { + "epoch": 0.8751516282541756, + "grad_norm": 1.7228020427936437, + "learning_rate": 0.00028973555170849774, + "loss": 7.5593, + "step": 9379 + }, + { + "epoch": 0.8752449379490529, + "grad_norm": 1.8727405974543192, + "learning_rate": 0.00028973280163285375, + "loss": 7.4679, + "step": 9380 + }, + { + "epoch": 0.8753382476439302, + "grad_norm": 1.1464660794087753, + "learning_rate": 0.00028973005120191064, + "loss": 7.6192, + "step": 9381 + }, + { + "epoch": 0.8754315573388075, + "grad_norm": 2.8975639948149303, + "learning_rate": 0.00028972730041567534, + "loss": 7.2049, + "step": 9382 + }, + { + "epoch": 0.8755248670336848, + "grad_norm": 1.1210459119442164, + "learning_rate": 0.0002897245492741549, + "loss": 7.5546, + "step": 9383 + }, + { + "epoch": 0.8756181767285621, + "grad_norm": 3.261606829506991, + "learning_rate": 0.0002897217977773564, + "loss": 7.3712, + "step": 9384 + }, + { + "epoch": 0.8757114864234394, + "grad_norm": 3.415138115108889, + "learning_rate": 0.00028971904592528665, + "loss": 7.1494, + "step": 9385 + }, + { + "epoch": 0.8758047961183167, + "grad_norm": 1.818877669813565, + "learning_rate": 0.0002897162937179528, + "loss": 7.3837, + "step": 9386 + }, + { + "epoch": 0.8758981058131939, + "grad_norm": 1.5397660504723685, + "learning_rate": 0.0002897135411553618, + "loss": 7.3566, + "step": 9387 + }, + { + "epoch": 0.8759914155080712, + "grad_norm": 1.6697113401263397, + "learning_rate": 0.0002897107882375207, + "loss": 7.3064, + "step": 9388 + }, + { + "epoch": 0.8760847252029486, + "grad_norm": 1.0344573045519243, + "learning_rate": 0.00028970803496443647, + "loss": 7.5196, + "step": 9389 + }, + { + "epoch": 0.8761780348978259, + "grad_norm": 1.1657356289841994, + "learning_rate": 0.000289705281336116, + "loss": 7.7601, + "step": 9390 + }, + { + "epoch": 0.8762713445927032, + "grad_norm": 1.241153624799302, + "learning_rate": 0.0002897025273525664, + "loss": 7.5721, + "step": 9391 + }, + { + "epoch": 0.8763646542875805, + "grad_norm": 2.5070795137839226, + "learning_rate": 0.00028969977301379466, + "loss": 7.2929, + "step": 9392 + }, + { + "epoch": 0.8764579639824578, + "grad_norm": 2.2052402016256267, + "learning_rate": 0.0002896970183198078, + "loss": 7.3297, + "step": 9393 + }, + { + "epoch": 0.8765512736773351, + "grad_norm": 1.6738810159180455, + "learning_rate": 0.00028969426327061284, + "loss": 7.385, + "step": 9394 + }, + { + "epoch": 0.8766445833722124, + "grad_norm": 1.1615423446379296, + "learning_rate": 0.00028969150786621665, + "loss": 7.5607, + "step": 9395 + }, + { + "epoch": 0.8767378930670897, + "grad_norm": 2.3837456894371547, + "learning_rate": 0.0002896887521066264, + "loss": 8.0994, + "step": 9396 + }, + { + "epoch": 0.876831202761967, + "grad_norm": 1.9300681836483053, + "learning_rate": 0.0002896859959918491, + "loss": 7.5312, + "step": 9397 + }, + { + "epoch": 0.8769245124568442, + "grad_norm": 2.8308333403265644, + "learning_rate": 0.0002896832395218916, + "loss": 7.4706, + "step": 9398 + }, + { + "epoch": 0.8770178221517215, + "grad_norm": 0.7569342319026285, + "learning_rate": 0.000289680482696761, + "loss": 7.3055, + "step": 9399 + }, + { + "epoch": 0.8771111318465988, + "grad_norm": 1.3131421794185094, + "learning_rate": 0.00028967772551646435, + "loss": 7.2369, + "step": 9400 + }, + { + "epoch": 0.8772044415414761, + "grad_norm": 1.4826583948088006, + "learning_rate": 0.00028967496798100857, + "loss": 7.3869, + "step": 9401 + }, + { + "epoch": 0.8772977512363535, + "grad_norm": 1.030958961486052, + "learning_rate": 0.00028967221009040074, + "loss": 7.1353, + "step": 9402 + }, + { + "epoch": 0.8773910609312308, + "grad_norm": 0.8361006148462048, + "learning_rate": 0.00028966945184464785, + "loss": 7.3066, + "step": 9403 + }, + { + "epoch": 0.8774843706261081, + "grad_norm": 1.1724276051919023, + "learning_rate": 0.00028966669324375694, + "loss": 7.8021, + "step": 9404 + }, + { + "epoch": 0.8775776803209854, + "grad_norm": 1.1513639608017086, + "learning_rate": 0.00028966393428773497, + "loss": 7.7031, + "step": 9405 + }, + { + "epoch": 0.8776709900158627, + "grad_norm": 0.616942679733852, + "learning_rate": 0.000289661174976589, + "loss": 7.3952, + "step": 9406 + }, + { + "epoch": 0.87776429971074, + "grad_norm": 0.7956141332970289, + "learning_rate": 0.00028965841531032604, + "loss": 7.2404, + "step": 9407 + }, + { + "epoch": 0.8778576094056172, + "grad_norm": 0.9303756412528832, + "learning_rate": 0.00028965565528895314, + "loss": 7.2661, + "step": 9408 + }, + { + "epoch": 0.8779509191004945, + "grad_norm": 0.5412101677003484, + "learning_rate": 0.0002896528949124772, + "loss": 7.381, + "step": 9409 + }, + { + "epoch": 0.8780442287953718, + "grad_norm": 0.8233383719512747, + "learning_rate": 0.00028965013418090534, + "loss": 7.1925, + "step": 9410 + }, + { + "epoch": 0.8781375384902491, + "grad_norm": 1.6422748586540903, + "learning_rate": 0.00028964737309424455, + "loss": 7.4989, + "step": 9411 + }, + { + "epoch": 0.8782308481851264, + "grad_norm": 0.9574028468662945, + "learning_rate": 0.0002896446116525019, + "loss": 7.5711, + "step": 9412 + }, + { + "epoch": 0.8783241578800037, + "grad_norm": 1.2654744525611443, + "learning_rate": 0.0002896418498556843, + "loss": 7.7308, + "step": 9413 + }, + { + "epoch": 0.878417467574881, + "grad_norm": 0.7532345033013718, + "learning_rate": 0.0002896390877037989, + "loss": 7.8212, + "step": 9414 + }, + { + "epoch": 0.8785107772697583, + "grad_norm": 1.683646407191956, + "learning_rate": 0.00028963632519685264, + "loss": 7.3389, + "step": 9415 + }, + { + "epoch": 0.8786040869646357, + "grad_norm": 1.0617866415383386, + "learning_rate": 0.0002896335623348526, + "loss": 7.231, + "step": 9416 + }, + { + "epoch": 0.878697396659513, + "grad_norm": 1.0648330023781287, + "learning_rate": 0.0002896307991178057, + "loss": 7.5773, + "step": 9417 + }, + { + "epoch": 0.8787907063543903, + "grad_norm": 1.0721091718195388, + "learning_rate": 0.00028962803554571915, + "loss": 7.5365, + "step": 9418 + }, + { + "epoch": 0.8788840160492675, + "grad_norm": 0.6436656380270872, + "learning_rate": 0.00028962527161859974, + "loss": 7.4693, + "step": 9419 + }, + { + "epoch": 0.8789773257441448, + "grad_norm": 1.2026135602511323, + "learning_rate": 0.0002896225073364547, + "loss": 7.4179, + "step": 9420 + }, + { + "epoch": 0.8790706354390221, + "grad_norm": 1.0738291167485527, + "learning_rate": 0.000289619742699291, + "loss": 7.3369, + "step": 9421 + }, + { + "epoch": 0.8791639451338994, + "grad_norm": 0.9529536895137503, + "learning_rate": 0.00028961697770711565, + "loss": 7.6193, + "step": 9422 + }, + { + "epoch": 0.8792572548287767, + "grad_norm": 1.610913098809646, + "learning_rate": 0.0002896142123599357, + "loss": 7.563, + "step": 9423 + }, + { + "epoch": 0.879350564523654, + "grad_norm": 0.85952418069538, + "learning_rate": 0.00028961144665775804, + "loss": 7.4908, + "step": 9424 + }, + { + "epoch": 0.8794438742185313, + "grad_norm": 0.7735913166714502, + "learning_rate": 0.00028960868060058995, + "loss": 7.4043, + "step": 9425 + }, + { + "epoch": 0.8795371839134086, + "grad_norm": 1.057731117272469, + "learning_rate": 0.0002896059141884383, + "loss": 7.3206, + "step": 9426 + }, + { + "epoch": 0.8796304936082859, + "grad_norm": 0.8066677568774536, + "learning_rate": 0.0002896031474213101, + "loss": 7.441, + "step": 9427 + }, + { + "epoch": 0.8797238033031632, + "grad_norm": 1.2138571584038382, + "learning_rate": 0.0002896003802992125, + "loss": 7.7921, + "step": 9428 + }, + { + "epoch": 0.8798171129980406, + "grad_norm": 24.068905879473746, + "learning_rate": 0.0002895976128221525, + "loss": 7.1883, + "step": 9429 + }, + { + "epoch": 0.8799104226929177, + "grad_norm": 4.194564732321075, + "learning_rate": 0.00028959484499013717, + "loss": 7.3854, + "step": 9430 + }, + { + "epoch": 0.8800037323877951, + "grad_norm": 0.9769466777343624, + "learning_rate": 0.00028959207680317344, + "loss": 7.6121, + "step": 9431 + }, + { + "epoch": 0.8800970420826724, + "grad_norm": 0.49983104331894657, + "learning_rate": 0.00028958930826126843, + "loss": 7.4022, + "step": 9432 + }, + { + "epoch": 0.8801903517775497, + "grad_norm": 0.6539840916036915, + "learning_rate": 0.00028958653936442914, + "loss": 7.5816, + "step": 9433 + }, + { + "epoch": 0.880283661472427, + "grad_norm": 1.395943454858951, + "learning_rate": 0.00028958377011266265, + "loss": 7.2424, + "step": 9434 + }, + { + "epoch": 0.8803769711673043, + "grad_norm": 1.3830147934811798, + "learning_rate": 0.000289581000505976, + "loss": 7.3722, + "step": 9435 + }, + { + "epoch": 0.8804702808621816, + "grad_norm": 2.548186718505454, + "learning_rate": 0.00028957823054437615, + "loss": 7.4488, + "step": 9436 + }, + { + "epoch": 0.8805635905570589, + "grad_norm": 1.1288170831213904, + "learning_rate": 0.0002895754602278702, + "loss": 7.3727, + "step": 9437 + }, + { + "epoch": 0.8806569002519362, + "grad_norm": 1.4016940456513531, + "learning_rate": 0.00028957268955646524, + "loss": 7.1317, + "step": 9438 + }, + { + "epoch": 0.8807502099468135, + "grad_norm": 0.7948736050837941, + "learning_rate": 0.0002895699185301683, + "loss": 7.4115, + "step": 9439 + }, + { + "epoch": 0.8808435196416907, + "grad_norm": 0.7657916714444677, + "learning_rate": 0.00028956714714898637, + "loss": 7.5426, + "step": 9440 + }, + { + "epoch": 0.880936829336568, + "grad_norm": 1.0519149789546336, + "learning_rate": 0.0002895643754129265, + "loss": 7.411, + "step": 9441 + }, + { + "epoch": 0.8810301390314453, + "grad_norm": 1.4466841185367396, + "learning_rate": 0.00028956160332199585, + "loss": 7.6442, + "step": 9442 + }, + { + "epoch": 0.8811234487263226, + "grad_norm": 0.7602395663033066, + "learning_rate": 0.00028955883087620126, + "loss": 7.3958, + "step": 9443 + }, + { + "epoch": 0.8812167584212, + "grad_norm": 0.6823554808396092, + "learning_rate": 0.00028955605807555, + "loss": 7.4478, + "step": 9444 + }, + { + "epoch": 0.8813100681160773, + "grad_norm": 2.0862921715232656, + "learning_rate": 0.00028955328492004904, + "loss": 7.257, + "step": 9445 + }, + { + "epoch": 0.8814033778109546, + "grad_norm": 0.6655176428222573, + "learning_rate": 0.0002895505114097054, + "loss": 7.1975, + "step": 9446 + }, + { + "epoch": 0.8814966875058319, + "grad_norm": 0.6759036055573885, + "learning_rate": 0.0002895477375445261, + "loss": 7.3616, + "step": 9447 + }, + { + "epoch": 0.8815899972007092, + "grad_norm": 0.5950041686480693, + "learning_rate": 0.0002895449633245182, + "loss": 7.3038, + "step": 9448 + }, + { + "epoch": 0.8816833068955865, + "grad_norm": 0.7651495562175076, + "learning_rate": 0.0002895421887496889, + "loss": 7.1987, + "step": 9449 + }, + { + "epoch": 0.8817766165904638, + "grad_norm": 0.9125115074993106, + "learning_rate": 0.0002895394138200451, + "loss": 7.3053, + "step": 9450 + }, + { + "epoch": 0.881869926285341, + "grad_norm": 1.2244173873854318, + "learning_rate": 0.0002895366385355939, + "loss": 7.4584, + "step": 9451 + }, + { + "epoch": 0.8819632359802183, + "grad_norm": 0.9943959027285, + "learning_rate": 0.0002895338628963424, + "loss": 7.4255, + "step": 9452 + }, + { + "epoch": 0.8820565456750956, + "grad_norm": 0.518354087656328, + "learning_rate": 0.0002895310869022976, + "loss": 7.3001, + "step": 9453 + }, + { + "epoch": 0.8821498553699729, + "grad_norm": 3.2480546171467415, + "learning_rate": 0.0002895283105534665, + "loss": 7.5466, + "step": 9454 + }, + { + "epoch": 0.8822431650648502, + "grad_norm": 0.4175207618746702, + "learning_rate": 0.00028952553384985636, + "loss": 7.1784, + "step": 9455 + }, + { + "epoch": 0.8823364747597275, + "grad_norm": 0.6851489458988428, + "learning_rate": 0.000289522756791474, + "loss": 7.5427, + "step": 9456 + }, + { + "epoch": 0.8824297844546048, + "grad_norm": 0.6918038048345216, + "learning_rate": 0.0002895199793783267, + "loss": 7.2386, + "step": 9457 + }, + { + "epoch": 0.8825230941494822, + "grad_norm": 0.9417712502931291, + "learning_rate": 0.0002895172016104214, + "loss": 7.5466, + "step": 9458 + }, + { + "epoch": 0.8826164038443595, + "grad_norm": 0.5546504612387847, + "learning_rate": 0.00028951442348776514, + "loss": 7.1777, + "step": 9459 + }, + { + "epoch": 0.8827097135392368, + "grad_norm": 0.5331738442836536, + "learning_rate": 0.0002895116450103651, + "loss": 7.2141, + "step": 9460 + }, + { + "epoch": 0.882803023234114, + "grad_norm": 0.44827613328248617, + "learning_rate": 0.0002895088661782282, + "loss": 7.1891, + "step": 9461 + }, + { + "epoch": 0.8828963329289913, + "grad_norm": 1.2529807986356887, + "learning_rate": 0.0002895060869913616, + "loss": 7.669, + "step": 9462 + }, + { + "epoch": 0.8829896426238686, + "grad_norm": 1.2327937831767901, + "learning_rate": 0.0002895033074497724, + "loss": 7.4397, + "step": 9463 + }, + { + "epoch": 0.8830829523187459, + "grad_norm": 0.5292397049176709, + "learning_rate": 0.00028950052755346753, + "loss": 7.3611, + "step": 9464 + }, + { + "epoch": 0.8831762620136232, + "grad_norm": 1.1279960497925199, + "learning_rate": 0.0002894977473024542, + "loss": 7.8631, + "step": 9465 + }, + { + "epoch": 0.8832695717085005, + "grad_norm": 0.6747865262240875, + "learning_rate": 0.0002894949666967394, + "loss": 7.3066, + "step": 9466 + }, + { + "epoch": 0.8833628814033778, + "grad_norm": 0.8924174098772665, + "learning_rate": 0.0002894921857363302, + "loss": 7.3116, + "step": 9467 + }, + { + "epoch": 0.8834561910982551, + "grad_norm": 5.219812889555897, + "learning_rate": 0.0002894894044212338, + "loss": 7.3457, + "step": 9468 + }, + { + "epoch": 0.8835495007931324, + "grad_norm": 1.6326660233262826, + "learning_rate": 0.00028948662275145705, + "loss": 7.3074, + "step": 9469 + }, + { + "epoch": 0.8836428104880097, + "grad_norm": 1.3320826487894457, + "learning_rate": 0.00028948384072700716, + "loss": 7.4627, + "step": 9470 + }, + { + "epoch": 0.883736120182887, + "grad_norm": 1.1554791711026733, + "learning_rate": 0.00028948105834789115, + "loss": 7.4373, + "step": 9471 + }, + { + "epoch": 0.8838294298777642, + "grad_norm": 3.358712286250624, + "learning_rate": 0.0002894782756141162, + "loss": 7.2348, + "step": 9472 + }, + { + "epoch": 0.8839227395726416, + "grad_norm": 0.6962600947636554, + "learning_rate": 0.00028947549252568926, + "loss": 7.3219, + "step": 9473 + }, + { + "epoch": 0.8840160492675189, + "grad_norm": 5.653157274428364, + "learning_rate": 0.0002894727090826175, + "loss": 7.486, + "step": 9474 + }, + { + "epoch": 0.8841093589623962, + "grad_norm": 0.5963530394119019, + "learning_rate": 0.0002894699252849079, + "loss": 7.1884, + "step": 9475 + }, + { + "epoch": 0.8842026686572735, + "grad_norm": 4.495771763344113, + "learning_rate": 0.0002894671411325676, + "loss": 7.0125, + "step": 9476 + }, + { + "epoch": 0.8842959783521508, + "grad_norm": 2.1446243956551885, + "learning_rate": 0.0002894643566256037, + "loss": 7.3772, + "step": 9477 + }, + { + "epoch": 0.8843892880470281, + "grad_norm": 0.4383128310877142, + "learning_rate": 0.00028946157176402325, + "loss": 7.3678, + "step": 9478 + }, + { + "epoch": 0.8844825977419054, + "grad_norm": 1.1448110490482049, + "learning_rate": 0.0002894587865478333, + "loss": 7.3208, + "step": 9479 + }, + { + "epoch": 0.8845759074367827, + "grad_norm": 1.0274762593221256, + "learning_rate": 0.00028945600097704104, + "loss": 7.2855, + "step": 9480 + }, + { + "epoch": 0.88466921713166, + "grad_norm": 17.606172789383486, + "learning_rate": 0.00028945321505165347, + "loss": 7.0613, + "step": 9481 + }, + { + "epoch": 0.8847625268265373, + "grad_norm": 1.0352950426924792, + "learning_rate": 0.00028945042877167763, + "loss": 7.3841, + "step": 9482 + }, + { + "epoch": 0.8848558365214145, + "grad_norm": 1.002725492365469, + "learning_rate": 0.0002894476421371207, + "loss": 7.0873, + "step": 9483 + }, + { + "epoch": 0.8849491462162918, + "grad_norm": 1461.6647026377073, + "learning_rate": 0.00028944485514798967, + "loss": 7.4394, + "step": 9484 + }, + { + "epoch": 0.8850424559111691, + "grad_norm": 1993.5416614077726, + "learning_rate": 0.00028944206780429175, + "loss": 7.1586, + "step": 9485 + }, + { + "epoch": 0.8851357656060465, + "grad_norm": 16611.142828232187, + "learning_rate": 0.0002894392801060339, + "loss": 7.5173, + "step": 9486 + }, + { + "epoch": 0.8852290753009238, + "grad_norm": 2.9849974590764763, + "learning_rate": 0.00028943649205322327, + "loss": 7.3742, + "step": 9487 + }, + { + "epoch": 0.8853223849958011, + "grad_norm": 5.292205166009531, + "learning_rate": 0.000289433703645867, + "loss": 7.3053, + "step": 9488 + }, + { + "epoch": 0.8854156946906784, + "grad_norm": 1.3505057396505311, + "learning_rate": 0.00028943091488397205, + "loss": 7.7589, + "step": 9489 + }, + { + "epoch": 0.8855090043855557, + "grad_norm": 124145.48121409226, + "learning_rate": 0.0002894281257675456, + "loss": 7.6801, + "step": 9490 + }, + { + "epoch": 0.885602314080433, + "grad_norm": 4.870383277116008, + "learning_rate": 0.0002894253362965948, + "loss": 7.8468, + "step": 9491 + }, + { + "epoch": 0.8856956237753103, + "grad_norm": 96.35286990319433, + "learning_rate": 0.00028942254647112666, + "loss": 7.878, + "step": 9492 + }, + { + "epoch": 0.8857889334701875, + "grad_norm": 14.60598558512723, + "learning_rate": 0.0002894197562911482, + "loss": 7.625, + "step": 9493 + }, + { + "epoch": 0.8858822431650648, + "grad_norm": 5.192505199804326, + "learning_rate": 0.00028941696575666666, + "loss": 7.3776, + "step": 9494 + }, + { + "epoch": 0.8859755528599421, + "grad_norm": 5.82651435138743, + "learning_rate": 0.000289414174867689, + "loss": 7.6294, + "step": 9495 + }, + { + "epoch": 0.8860688625548194, + "grad_norm": 4.5764366131129295, + "learning_rate": 0.0002894113836242225, + "loss": 8.0236, + "step": 9496 + }, + { + "epoch": 0.8861621722496967, + "grad_norm": 2.2121096507847264, + "learning_rate": 0.0002894085920262741, + "loss": 7.7524, + "step": 9497 + }, + { + "epoch": 0.886255481944574, + "grad_norm": 1.6403193842447987, + "learning_rate": 0.00028940580007385093, + "loss": 7.6463, + "step": 9498 + }, + { + "epoch": 0.8863487916394513, + "grad_norm": 176070154425.09302, + "learning_rate": 0.0002894030077669601, + "loss": 7.3558, + "step": 9499 + }, + { + "epoch": 0.8864421013343287, + "grad_norm": 3.5330445357853217, + "learning_rate": 0.00028940021510560877, + "loss": 7.8936, + "step": 9500 + }, + { + "epoch": 0.886535411029206, + "grad_norm": 3.736670233757186, + "learning_rate": 0.00028939742208980397, + "loss": 7.741, + "step": 9501 + }, + { + "epoch": 0.8866287207240833, + "grad_norm": 4.525355144052764, + "learning_rate": 0.00028939462871955277, + "loss": 7.898, + "step": 9502 + }, + { + "epoch": 0.8867220304189606, + "grad_norm": 383.2436005649597, + "learning_rate": 0.0002893918349948624, + "loss": 7.7816, + "step": 9503 + }, + { + "epoch": 0.8868153401138378, + "grad_norm": 2.6207957253287746, + "learning_rate": 0.0002893890409157398, + "loss": 7.573, + "step": 9504 + }, + { + "epoch": 0.8869086498087151, + "grad_norm": 1917515623187.617, + "learning_rate": 0.00028938624648219217, + "loss": 7.7813, + "step": 9505 + }, + { + "epoch": 0.8870019595035924, + "grad_norm": 2.5430616160113475, + "learning_rate": 0.00028938345169422666, + "loss": 7.6131, + "step": 9506 + }, + { + "epoch": 0.8870952691984697, + "grad_norm": 1.9688489378643776, + "learning_rate": 0.0002893806565518503, + "loss": 7.5046, + "step": 9507 + }, + { + "epoch": 0.887188578893347, + "grad_norm": 1.210718813947361, + "learning_rate": 0.0002893778610550702, + "loss": 7.6584, + "step": 9508 + }, + { + "epoch": 0.8872818885882243, + "grad_norm": 1688144890196.5044, + "learning_rate": 0.00028937506520389347, + "loss": 7.4903, + "step": 9509 + }, + { + "epoch": 0.8873751982831016, + "grad_norm": 1.245011626557837, + "learning_rate": 0.0002893722689983273, + "loss": 7.3461, + "step": 9510 + }, + { + "epoch": 0.8874685079779789, + "grad_norm": 1.6490059702344801, + "learning_rate": 0.0002893694724383787, + "loss": 7.4755, + "step": 9511 + }, + { + "epoch": 0.8875618176728562, + "grad_norm": 138153948106.1325, + "learning_rate": 0.00028936667552405477, + "loss": 7.3, + "step": 9512 + }, + { + "epoch": 0.8876551273677336, + "grad_norm": 2.9911066027178337, + "learning_rate": 0.00028936387825536275, + "loss": 7.1323, + "step": 9513 + }, + { + "epoch": 0.8877484370626108, + "grad_norm": 3.0987645939755413, + "learning_rate": 0.0002893610806323096, + "loss": 7.6382, + "step": 9514 + }, + { + "epoch": 0.8878417467574881, + "grad_norm": 2.295505320921533, + "learning_rate": 0.00028935828265490255, + "loss": 7.4562, + "step": 9515 + }, + { + "epoch": 0.8879350564523654, + "grad_norm": 1.3256618136402818, + "learning_rate": 0.0002893554843231486, + "loss": 7.0395, + "step": 9516 + }, + { + "epoch": 0.8880283661472427, + "grad_norm": 18529111750.668186, + "learning_rate": 0.00028935268563705504, + "loss": 7.0919, + "step": 9517 + }, + { + "epoch": 0.88812167584212, + "grad_norm": 1.3802265089223842, + "learning_rate": 0.0002893498865966288, + "loss": 7.5777, + "step": 9518 + }, + { + "epoch": 0.8882149855369973, + "grad_norm": 0.7935418678679651, + "learning_rate": 0.00028934708720187714, + "loss": 7.3004, + "step": 9519 + }, + { + "epoch": 0.8883082952318746, + "grad_norm": 1.5156587103207084, + "learning_rate": 0.000289344287452807, + "loss": 7.3518, + "step": 9520 + }, + { + "epoch": 0.8884016049267519, + "grad_norm": 2.4777421744273163, + "learning_rate": 0.0002893414873494257, + "loss": 7.1016, + "step": 9521 + }, + { + "epoch": 0.8884949146216292, + "grad_norm": 1.8994299938323664, + "learning_rate": 0.0002893386868917403, + "loss": 7.3927, + "step": 9522 + }, + { + "epoch": 0.8885882243165065, + "grad_norm": 3.4381060195334094, + "learning_rate": 0.00028933588607975785, + "loss": 7.4737, + "step": 9523 + }, + { + "epoch": 0.8886815340113838, + "grad_norm": 3.8548404847640594, + "learning_rate": 0.0002893330849134855, + "loss": 7.2206, + "step": 9524 + }, + { + "epoch": 0.888774843706261, + "grad_norm": 0.868060678029742, + "learning_rate": 0.0002893302833929304, + "loss": 7.379, + "step": 9525 + }, + { + "epoch": 0.8888681534011383, + "grad_norm": 1.2021070546103851, + "learning_rate": 0.00028932748151809973, + "loss": 7.4007, + "step": 9526 + }, + { + "epoch": 0.8889614630960156, + "grad_norm": 0.643720241132051, + "learning_rate": 0.00028932467928900047, + "loss": 7.458, + "step": 9527 + }, + { + "epoch": 0.889054772790893, + "grad_norm": 1.378569595682508, + "learning_rate": 0.0002893218767056398, + "loss": 7.3555, + "step": 9528 + }, + { + "epoch": 0.8891480824857703, + "grad_norm": 1.2138609326130763, + "learning_rate": 0.00028931907376802496, + "loss": 7.3803, + "step": 9529 + }, + { + "epoch": 0.8892413921806476, + "grad_norm": 1.2164950846912006, + "learning_rate": 0.0002893162704761629, + "loss": 7.3356, + "step": 9530 + }, + { + "epoch": 0.8893347018755249, + "grad_norm": 1.5454498017579146, + "learning_rate": 0.00028931346683006085, + "loss": 7.6474, + "step": 9531 + }, + { + "epoch": 0.8894280115704022, + "grad_norm": 1.4459711800771218, + "learning_rate": 0.000289310662829726, + "loss": 7.5906, + "step": 9532 + }, + { + "epoch": 0.8895213212652795, + "grad_norm": 0.6400277429148064, + "learning_rate": 0.00028930785847516534, + "loss": 7.2261, + "step": 9533 + }, + { + "epoch": 0.8896146309601568, + "grad_norm": 5.802825556650733, + "learning_rate": 0.00028930505376638605, + "loss": 7.4923, + "step": 9534 + }, + { + "epoch": 0.8897079406550341, + "grad_norm": 0.6398226387242009, + "learning_rate": 0.00028930224870339524, + "loss": 7.5809, + "step": 9535 + }, + { + "epoch": 0.8898012503499113, + "grad_norm": 1.1792594972272648, + "learning_rate": 0.00028929944328620014, + "loss": 7.5691, + "step": 9536 + }, + { + "epoch": 0.8898945600447886, + "grad_norm": 1153740339.3094969, + "learning_rate": 0.0002892966375148078, + "loss": 7.3352, + "step": 9537 + }, + { + "epoch": 0.8899878697396659, + "grad_norm": 22.41082696589334, + "learning_rate": 0.00028929383138922535, + "loss": 7.5088, + "step": 9538 + }, + { + "epoch": 0.8900811794345432, + "grad_norm": 477693816.5574266, + "learning_rate": 0.00028929102490945997, + "loss": 7.62, + "step": 9539 + }, + { + "epoch": 0.8901744891294205, + "grad_norm": 5.15497179913377, + "learning_rate": 0.0002892882180755188, + "loss": 7.1367, + "step": 9540 + }, + { + "epoch": 0.8902677988242979, + "grad_norm": 3.756812674491688, + "learning_rate": 0.00028928541088740897, + "loss": 7.4427, + "step": 9541 + }, + { + "epoch": 0.8903611085191752, + "grad_norm": 2.515439752567935, + "learning_rate": 0.00028928260334513755, + "loss": 7.33, + "step": 9542 + }, + { + "epoch": 0.8904544182140525, + "grad_norm": 1.5633860777221702, + "learning_rate": 0.0002892797954487117, + "loss": 7.6907, + "step": 9543 + }, + { + "epoch": 0.8905477279089298, + "grad_norm": 2.0344512277235025, + "learning_rate": 0.00028927698719813865, + "loss": 7.5134, + "step": 9544 + }, + { + "epoch": 0.8906410376038071, + "grad_norm": 2.14738294408176, + "learning_rate": 0.00028927417859342543, + "loss": 7.3608, + "step": 9545 + }, + { + "epoch": 0.8907343472986843, + "grad_norm": 1.560848822784265, + "learning_rate": 0.00028927136963457925, + "loss": 7.4741, + "step": 9546 + }, + { + "epoch": 0.8908276569935616, + "grad_norm": 1.3302539702780043, + "learning_rate": 0.0002892685603216073, + "loss": 7.5368, + "step": 9547 + }, + { + "epoch": 0.8909209666884389, + "grad_norm": 3.111096946907548, + "learning_rate": 0.00028926575065451655, + "loss": 7.3161, + "step": 9548 + }, + { + "epoch": 0.8910142763833162, + "grad_norm": 2243178976.9530706, + "learning_rate": 0.0002892629406333143, + "loss": 7.1285, + "step": 9549 + }, + { + "epoch": 0.8911075860781935, + "grad_norm": 1.2588855982261378, + "learning_rate": 0.0002892601302580076, + "loss": 7.7794, + "step": 9550 + }, + { + "epoch": 0.8912008957730708, + "grad_norm": 1.0288177389343582, + "learning_rate": 0.00028925731952860373, + "loss": 7.3698, + "step": 9551 + }, + { + "epoch": 0.8912942054679481, + "grad_norm": 1435735015.0280123, + "learning_rate": 0.00028925450844510966, + "loss": 7.2749, + "step": 9552 + }, + { + "epoch": 0.8913875151628254, + "grad_norm": 5.457466006195502, + "learning_rate": 0.00028925169700753266, + "loss": 7.3865, + "step": 9553 + }, + { + "epoch": 0.8914808248577027, + "grad_norm": 1.1257202383276015, + "learning_rate": 0.00028924888521587985, + "loss": 7.298, + "step": 9554 + }, + { + "epoch": 0.89157413455258, + "grad_norm": 14.841214050359852, + "learning_rate": 0.0002892460730701584, + "loss": 7.4045, + "step": 9555 + }, + { + "epoch": 0.8916674442474574, + "grad_norm": 2.787429217191414, + "learning_rate": 0.00028924326057037534, + "loss": 7.517, + "step": 9556 + }, + { + "epoch": 0.8917607539423346, + "grad_norm": 1.929664435092851, + "learning_rate": 0.000289240447716538, + "loss": 7.668, + "step": 9557 + }, + { + "epoch": 0.8918540636372119, + "grad_norm": 9.841097949125412, + "learning_rate": 0.00028923763450865343, + "loss": 7.3625, + "step": 9558 + }, + { + "epoch": 0.8919473733320892, + "grad_norm": 0.7329187215830847, + "learning_rate": 0.00028923482094672876, + "loss": 7.3104, + "step": 9559 + }, + { + "epoch": 0.8920406830269665, + "grad_norm": 2.7317394347640622, + "learning_rate": 0.0002892320070307712, + "loss": 7.3465, + "step": 9560 + }, + { + "epoch": 0.8921339927218438, + "grad_norm": 1530204718.8719914, + "learning_rate": 0.00028922919276078797, + "loss": 7.2969, + "step": 9561 + }, + { + "epoch": 0.8922273024167211, + "grad_norm": 5.97848445308766, + "learning_rate": 0.00028922637813678606, + "loss": 7.2514, + "step": 9562 + }, + { + "epoch": 0.8923206121115984, + "grad_norm": 4736954392.68298, + "learning_rate": 0.0002892235631587727, + "loss": 7.4765, + "step": 9563 + }, + { + "epoch": 0.8924139218064757, + "grad_norm": 1.1248313615869414, + "learning_rate": 0.00028922074782675514, + "loss": 7.2113, + "step": 9564 + }, + { + "epoch": 0.892507231501353, + "grad_norm": 1.781320329789996, + "learning_rate": 0.0002892179321407404, + "loss": 7.3269, + "step": 9565 + }, + { + "epoch": 0.8926005411962303, + "grad_norm": 3.921187322350201, + "learning_rate": 0.0002892151161007357, + "loss": 7.1864, + "step": 9566 + }, + { + "epoch": 0.8926938508911075, + "grad_norm": 4732548422.854726, + "learning_rate": 0.0002892122997067482, + "loss": 7.2015, + "step": 9567 + }, + { + "epoch": 0.8927871605859848, + "grad_norm": 4.637592057894114, + "learning_rate": 0.00028920948295878507, + "loss": 7.4919, + "step": 9568 + }, + { + "epoch": 0.8928804702808621, + "grad_norm": 0.9666043447368742, + "learning_rate": 0.00028920666585685346, + "loss": 7.3438, + "step": 9569 + }, + { + "epoch": 0.8929737799757395, + "grad_norm": 0.659483535835815, + "learning_rate": 0.0002892038484009605, + "loss": 7.1939, + "step": 9570 + }, + { + "epoch": 0.8930670896706168, + "grad_norm": 1.0219569591212179, + "learning_rate": 0.00028920103059111346, + "loss": 7.2202, + "step": 9571 + }, + { + "epoch": 0.8931603993654941, + "grad_norm": 15.625165768930117, + "learning_rate": 0.00028919821242731945, + "loss": 7.2743, + "step": 9572 + }, + { + "epoch": 0.8932537090603714, + "grad_norm": 0.9574533610913202, + "learning_rate": 0.00028919539390958557, + "loss": 7.4516, + "step": 9573 + }, + { + "epoch": 0.8933470187552487, + "grad_norm": 181431650656.14352, + "learning_rate": 0.000289192575037919, + "loss": 7.3357, + "step": 9574 + }, + { + "epoch": 0.893440328450126, + "grad_norm": 1.4351025872188954, + "learning_rate": 0.000289189755812327, + "loss": 7.5892, + "step": 9575 + }, + { + "epoch": 0.8935336381450033, + "grad_norm": 10.021437722971225, + "learning_rate": 0.0002891869362328166, + "loss": 7.3848, + "step": 9576 + }, + { + "epoch": 0.8936269478398806, + "grad_norm": 1.7499405177276364, + "learning_rate": 0.00028918411629939514, + "loss": 7.265, + "step": 9577 + }, + { + "epoch": 0.8937202575347578, + "grad_norm": 0.6451736546190555, + "learning_rate": 0.0002891812960120697, + "loss": 7.2877, + "step": 9578 + }, + { + "epoch": 0.8938135672296351, + "grad_norm": 2.111133578405264, + "learning_rate": 0.0002891784753708474, + "loss": 7.228, + "step": 9579 + }, + { + "epoch": 0.8939068769245124, + "grad_norm": 50.39362297116697, + "learning_rate": 0.0002891756543757355, + "loss": 7.2085, + "step": 9580 + }, + { + "epoch": 0.8940001866193897, + "grad_norm": 255.32329062847057, + "learning_rate": 0.0002891728330267411, + "loss": 7.2736, + "step": 9581 + }, + { + "epoch": 0.894093496314267, + "grad_norm": 5.620071366434848, + "learning_rate": 0.00028917001132387147, + "loss": 7.5383, + "step": 9582 + }, + { + "epoch": 0.8941868060091444, + "grad_norm": 0.5256696982068416, + "learning_rate": 0.00028916718926713365, + "loss": 7.6302, + "step": 9583 + }, + { + "epoch": 0.8942801157040217, + "grad_norm": 1.361783357061317, + "learning_rate": 0.00028916436685653496, + "loss": 7.0392, + "step": 9584 + }, + { + "epoch": 0.894373425398899, + "grad_norm": 3552530481.9094815, + "learning_rate": 0.0002891615440920825, + "loss": 7.005, + "step": 9585 + }, + { + "epoch": 0.8944667350937763, + "grad_norm": 1.001686535137313, + "learning_rate": 0.0002891587209737834, + "loss": 7.2721, + "step": 9586 + }, + { + "epoch": 0.8945600447886536, + "grad_norm": 1.7951506695956656, + "learning_rate": 0.000289155897501645, + "loss": 7.4726, + "step": 9587 + }, + { + "epoch": 0.8946533544835309, + "grad_norm": 4.8304084438218835, + "learning_rate": 0.0002891530736756743, + "loss": 7.4193, + "step": 9588 + }, + { + "epoch": 0.8947466641784081, + "grad_norm": 0.9666262686678052, + "learning_rate": 0.00028915024949587856, + "loss": 7.1906, + "step": 9589 + }, + { + "epoch": 0.8948399738732854, + "grad_norm": 10007510005.655796, + "learning_rate": 0.00028914742496226494, + "loss": 7.2689, + "step": 9590 + }, + { + "epoch": 0.8949332835681627, + "grad_norm": 0.6097895225022641, + "learning_rate": 0.0002891446000748407, + "loss": 7.3364, + "step": 9591 + }, + { + "epoch": 0.89502659326304, + "grad_norm": 0.7001503076981211, + "learning_rate": 0.0002891417748336129, + "loss": 7.7459, + "step": 9592 + }, + { + "epoch": 0.8951199029579173, + "grad_norm": 1.5150257557099547, + "learning_rate": 0.0002891389492385888, + "loss": 7.2522, + "step": 9593 + }, + { + "epoch": 0.8952132126527946, + "grad_norm": 1.2359286944497612, + "learning_rate": 0.00028913612328977553, + "loss": 7.4111, + "step": 9594 + }, + { + "epoch": 0.8953065223476719, + "grad_norm": 1.5520060616191114, + "learning_rate": 0.00028913329698718036, + "loss": 7.183, + "step": 9595 + }, + { + "epoch": 0.8953998320425492, + "grad_norm": 32765216513.019962, + "learning_rate": 0.00028913047033081045, + "loss": 7.3223, + "step": 9596 + }, + { + "epoch": 0.8954931417374266, + "grad_norm": 0.9462524734903947, + "learning_rate": 0.0002891276433206729, + "loss": 7.2531, + "step": 9597 + }, + { + "epoch": 0.8955864514323039, + "grad_norm": 6.020839347633625, + "learning_rate": 0.000289124815956775, + "loss": 7.2619, + "step": 9598 + }, + { + "epoch": 0.8956797611271811, + "grad_norm": 1.4970522209112178, + "learning_rate": 0.00028912198823912393, + "loss": 7.532, + "step": 9599 + }, + { + "epoch": 0.8957730708220584, + "grad_norm": 1.9609945743049084, + "learning_rate": 0.00028911916016772685, + "loss": 7.2434, + "step": 9600 + }, + { + "epoch": 0.8958663805169357, + "grad_norm": 3559506139.6852083, + "learning_rate": 0.00028911633174259093, + "loss": 7.3688, + "step": 9601 + }, + { + "epoch": 0.895959690211813, + "grad_norm": 1507060860.5570009, + "learning_rate": 0.00028911350296372337, + "loss": 7.1791, + "step": 9602 + }, + { + "epoch": 0.8960529999066903, + "grad_norm": 1.069371092299589, + "learning_rate": 0.00028911067383113144, + "loss": 7.6962, + "step": 9603 + }, + { + "epoch": 0.8961463096015676, + "grad_norm": 6898668670.249193, + "learning_rate": 0.00028910784434482223, + "loss": 7.3287, + "step": 9604 + }, + { + "epoch": 0.8962396192964449, + "grad_norm": 3.7139744076886445, + "learning_rate": 0.000289105014504803, + "loss": 7.5565, + "step": 9605 + }, + { + "epoch": 0.8963329289913222, + "grad_norm": 1.6076364252559785, + "learning_rate": 0.0002891021843110809, + "loss": 7.5943, + "step": 9606 + }, + { + "epoch": 0.8964262386861995, + "grad_norm": 2.5675583940923494, + "learning_rate": 0.0002890993537636632, + "loss": 7.7232, + "step": 9607 + }, + { + "epoch": 0.8965195483810768, + "grad_norm": 1.672217081516262, + "learning_rate": 0.000289096522862557, + "loss": 7.2327, + "step": 9608 + }, + { + "epoch": 0.8966128580759541, + "grad_norm": 1.0115097427116282, + "learning_rate": 0.00028909369160776957, + "loss": 7.6662, + "step": 9609 + }, + { + "epoch": 0.8967061677708313, + "grad_norm": 1558769307.0041246, + "learning_rate": 0.000289090859999308, + "loss": 7.4385, + "step": 9610 + }, + { + "epoch": 0.8967994774657086, + "grad_norm": 1391.0155084944286, + "learning_rate": 0.0002890880280371797, + "loss": 7.5266, + "step": 9611 + }, + { + "epoch": 0.896892787160586, + "grad_norm": 1.152350141047886, + "learning_rate": 0.0002890851957213917, + "loss": 7.5084, + "step": 9612 + }, + { + "epoch": 0.8969860968554633, + "grad_norm": 0.9283560689736859, + "learning_rate": 0.0002890823630519512, + "loss": 7.3087, + "step": 9613 + }, + { + "epoch": 0.8970794065503406, + "grad_norm": 0.9866063366505893, + "learning_rate": 0.0002890795300288655, + "loss": 7.5139, + "step": 9614 + }, + { + "epoch": 0.8971727162452179, + "grad_norm": 0.715776105008564, + "learning_rate": 0.00028907669665214177, + "loss": 7.4404, + "step": 9615 + }, + { + "epoch": 0.8972660259400952, + "grad_norm": 25024008.51546858, + "learning_rate": 0.0002890738629217872, + "loss": 7.1694, + "step": 9616 + }, + { + "epoch": 0.8973593356349725, + "grad_norm": 0.8782781186017555, + "learning_rate": 0.00028907102883780894, + "loss": 7.1878, + "step": 9617 + }, + { + "epoch": 0.8974526453298498, + "grad_norm": 0.6761199386651892, + "learning_rate": 0.0002890681944002143, + "loss": 7.4907, + "step": 9618 + }, + { + "epoch": 0.8975459550247271, + "grad_norm": 0.8991421469327096, + "learning_rate": 0.0002890653596090104, + "loss": 7.2449, + "step": 9619 + }, + { + "epoch": 0.8976392647196043, + "grad_norm": 3.6294407608570993, + "learning_rate": 0.00028906252446420447, + "loss": 7.5806, + "step": 9620 + }, + { + "epoch": 0.8977325744144816, + "grad_norm": 1.1906347580452357, + "learning_rate": 0.00028905968896580377, + "loss": 7.6957, + "step": 9621 + }, + { + "epoch": 0.8978258841093589, + "grad_norm": 0.7235165727695495, + "learning_rate": 0.0002890568531138155, + "loss": 7.2362, + "step": 9622 + }, + { + "epoch": 0.8979191938042362, + "grad_norm": 0.5507790373999335, + "learning_rate": 0.0002890540169082468, + "loss": 7.4453, + "step": 9623 + }, + { + "epoch": 0.8980125034991135, + "grad_norm": 0.8630615906608239, + "learning_rate": 0.0002890511803491049, + "loss": 7.3882, + "step": 9624 + }, + { + "epoch": 0.8981058131939909, + "grad_norm": 0.7150021737928854, + "learning_rate": 0.0002890483434363971, + "loss": 7.287, + "step": 9625 + }, + { + "epoch": 0.8981991228888682, + "grad_norm": 0.7661248176992315, + "learning_rate": 0.0002890455061701305, + "loss": 7.4438, + "step": 9626 + }, + { + "epoch": 0.8982924325837455, + "grad_norm": 197042331.25016502, + "learning_rate": 0.0002890426685503124, + "loss": 7.1552, + "step": 9627 + }, + { + "epoch": 0.8983857422786228, + "grad_norm": 1.073102713941197, + "learning_rate": 0.00028903983057695, + "loss": 7.2075, + "step": 9628 + }, + { + "epoch": 0.8984790519735001, + "grad_norm": 0.8659750114068716, + "learning_rate": 0.0002890369922500504, + "loss": 7.4268, + "step": 9629 + }, + { + "epoch": 0.8985723616683774, + "grad_norm": 4.30307116156542, + "learning_rate": 0.000289034153569621, + "loss": 7.2474, + "step": 9630 + }, + { + "epoch": 0.8986656713632546, + "grad_norm": 17.5179608589605, + "learning_rate": 0.0002890313145356689, + "loss": 7.5753, + "step": 9631 + }, + { + "epoch": 0.8987589810581319, + "grad_norm": 1.777114288452536, + "learning_rate": 0.0002890284751482014, + "loss": 7.566, + "step": 9632 + }, + { + "epoch": 0.8988522907530092, + "grad_norm": 1.122524071895918, + "learning_rate": 0.00028902563540722563, + "loss": 7.2887, + "step": 9633 + }, + { + "epoch": 0.8989456004478865, + "grad_norm": 0.7998342519220738, + "learning_rate": 0.00028902279531274887, + "loss": 7.4674, + "step": 9634 + }, + { + "epoch": 0.8990389101427638, + "grad_norm": 1.5041602103138116, + "learning_rate": 0.0002890199548647783, + "loss": 7.5162, + "step": 9635 + }, + { + "epoch": 0.8991322198376411, + "grad_norm": 0.8050300471271401, + "learning_rate": 0.0002890171140633212, + "loss": 7.5033, + "step": 9636 + }, + { + "epoch": 0.8992255295325184, + "grad_norm": 0.6514165186483525, + "learning_rate": 0.00028901427290838473, + "loss": 7.5729, + "step": 9637 + }, + { + "epoch": 0.8993188392273957, + "grad_norm": 0.977840010768316, + "learning_rate": 0.0002890114313999761, + "loss": 7.6101, + "step": 9638 + }, + { + "epoch": 0.8994121489222731, + "grad_norm": 0.6518049615692088, + "learning_rate": 0.0002890085895381027, + "loss": 7.6366, + "step": 9639 + }, + { + "epoch": 0.8995054586171504, + "grad_norm": 1262.829021378385, + "learning_rate": 0.00028900574732277154, + "loss": 7.3974, + "step": 9640 + }, + { + "epoch": 0.8995987683120277, + "grad_norm": 0.7283633592547929, + "learning_rate": 0.00028900290475398994, + "loss": 7.3334, + "step": 9641 + }, + { + "epoch": 0.8996920780069049, + "grad_norm": 0.7798569375480954, + "learning_rate": 0.00028900006183176516, + "loss": 7.4589, + "step": 9642 + }, + { + "epoch": 0.8997853877017822, + "grad_norm": 166011407.18890113, + "learning_rate": 0.0002889972185561044, + "loss": 7.4764, + "step": 9643 + }, + { + "epoch": 0.8998786973966595, + "grad_norm": 841562765.3397567, + "learning_rate": 0.0002889943749270149, + "loss": 7.3313, + "step": 9644 + }, + { + "epoch": 0.8999720070915368, + "grad_norm": 0.9524879242384715, + "learning_rate": 0.0002889915309445038, + "loss": 7.507, + "step": 9645 + }, + { + "epoch": 0.9000653167864141, + "grad_norm": 0.6082736322073147, + "learning_rate": 0.0002889886866085785, + "loss": 7.336, + "step": 9646 + }, + { + "epoch": 0.9001586264812914, + "grad_norm": 0.9494338249582808, + "learning_rate": 0.0002889858419192461, + "loss": 7.4673, + "step": 9647 + }, + { + "epoch": 0.9002519361761687, + "grad_norm": 1.9071474908973456, + "learning_rate": 0.00028898299687651386, + "loss": 7.5639, + "step": 9648 + }, + { + "epoch": 0.900345245871046, + "grad_norm": 0.8066798277176365, + "learning_rate": 0.0002889801514803891, + "loss": 7.212, + "step": 9649 + }, + { + "epoch": 0.9004385555659233, + "grad_norm": 0.7198086679586058, + "learning_rate": 0.0002889773057308789, + "loss": 7.3312, + "step": 9650 + }, + { + "epoch": 0.9005318652608006, + "grad_norm": 1.0361110581181467, + "learning_rate": 0.0002889744596279906, + "loss": 7.7171, + "step": 9651 + }, + { + "epoch": 0.9006251749556778, + "grad_norm": 0.7099514480276878, + "learning_rate": 0.00028897161317173144, + "loss": 7.122, + "step": 9652 + }, + { + "epoch": 0.9007184846505552, + "grad_norm": 3.1143514298748736, + "learning_rate": 0.0002889687663621086, + "loss": 7.724, + "step": 9653 + }, + { + "epoch": 0.9008117943454325, + "grad_norm": 1403442336.0772986, + "learning_rate": 0.00028896591919912943, + "loss": 7.1898, + "step": 9654 + }, + { + "epoch": 0.9009051040403098, + "grad_norm": 0.6646304225982632, + "learning_rate": 0.000288963071682801, + "loss": 7.4601, + "step": 9655 + }, + { + "epoch": 0.9009984137351871, + "grad_norm": 0.8618528644800891, + "learning_rate": 0.0002889602238131307, + "loss": 7.2098, + "step": 9656 + }, + { + "epoch": 0.9010917234300644, + "grad_norm": 0.6950012265115916, + "learning_rate": 0.0002889573755901257, + "loss": 7.4253, + "step": 9657 + }, + { + "epoch": 0.9011850331249417, + "grad_norm": 0.9339857398005939, + "learning_rate": 0.00028895452701379324, + "loss": 7.2213, + "step": 9658 + }, + { + "epoch": 0.901278342819819, + "grad_norm": 1584405649.549838, + "learning_rate": 0.0002889516780841406, + "loss": 7.323, + "step": 9659 + }, + { + "epoch": 0.9013716525146963, + "grad_norm": 0.8794466887278529, + "learning_rate": 0.000288948828801175, + "loss": 7.5454, + "step": 9660 + }, + { + "epoch": 0.9014649622095736, + "grad_norm": 0.8731535041822731, + "learning_rate": 0.00028894597916490365, + "loss": 7.5935, + "step": 9661 + }, + { + "epoch": 0.9015582719044509, + "grad_norm": 0.7457807817798499, + "learning_rate": 0.0002889431291753339, + "loss": 7.5936, + "step": 9662 + }, + { + "epoch": 0.9016515815993281, + "grad_norm": 0.5585887984950004, + "learning_rate": 0.0002889402788324729, + "loss": 7.4056, + "step": 9663 + }, + { + "epoch": 0.9017448912942054, + "grad_norm": 0.7296208750330151, + "learning_rate": 0.0002889374281363279, + "loss": 7.5159, + "step": 9664 + }, + { + "epoch": 0.9018382009890827, + "grad_norm": 1298211786.0432956, + "learning_rate": 0.0002889345770869062, + "loss": 7.2794, + "step": 9665 + }, + { + "epoch": 0.90193151068396, + "grad_norm": 0.8804954814456577, + "learning_rate": 0.00028893172568421506, + "loss": 7.4549, + "step": 9666 + }, + { + "epoch": 0.9020248203788374, + "grad_norm": 797293815.1820521, + "learning_rate": 0.00028892887392826166, + "loss": 7.4372, + "step": 9667 + }, + { + "epoch": 0.9021181300737147, + "grad_norm": 8563156553.978167, + "learning_rate": 0.00028892602181905324, + "loss": 7.5443, + "step": 9668 + }, + { + "epoch": 0.902211439768592, + "grad_norm": 0.7428742657644767, + "learning_rate": 0.00028892316935659717, + "loss": 7.4022, + "step": 9669 + }, + { + "epoch": 0.9023047494634693, + "grad_norm": 9.791858197952697, + "learning_rate": 0.0002889203165409006, + "loss": 7.4537, + "step": 9670 + }, + { + "epoch": 0.9023980591583466, + "grad_norm": 0.8716714935718329, + "learning_rate": 0.0002889174633719708, + "loss": 7.2432, + "step": 9671 + }, + { + "epoch": 0.9024913688532239, + "grad_norm": 0.6886967034514596, + "learning_rate": 0.0002889146098498151, + "loss": 7.2736, + "step": 9672 + }, + { + "epoch": 0.9025846785481011, + "grad_norm": 0.7495532897484986, + "learning_rate": 0.0002889117559744407, + "loss": 7.3832, + "step": 9673 + }, + { + "epoch": 0.9026779882429784, + "grad_norm": 0.8963028679003602, + "learning_rate": 0.0002889089017458548, + "loss": 7.806, + "step": 9674 + }, + { + "epoch": 0.9027712979378557, + "grad_norm": 0.8455969480442127, + "learning_rate": 0.0002889060471640647, + "loss": 7.4303, + "step": 9675 + }, + { + "epoch": 0.902864607632733, + "grad_norm": 10515954.08137915, + "learning_rate": 0.00028890319222907773, + "loss": 7.1453, + "step": 9676 + }, + { + "epoch": 0.9029579173276103, + "grad_norm": 0.8063613835613267, + "learning_rate": 0.00028890033694090104, + "loss": 7.4758, + "step": 9677 + }, + { + "epoch": 0.9030512270224876, + "grad_norm": 1.186533048857876, + "learning_rate": 0.000288897481299542, + "loss": 7.4899, + "step": 9678 + }, + { + "epoch": 0.9031445367173649, + "grad_norm": 0.76988192800563, + "learning_rate": 0.00028889462530500774, + "loss": 7.4457, + "step": 9679 + }, + { + "epoch": 0.9032378464122423, + "grad_norm": 259790302.72891197, + "learning_rate": 0.0002888917689573056, + "loss": 7.3036, + "step": 9680 + }, + { + "epoch": 0.9033311561071196, + "grad_norm": 0.7434015147228181, + "learning_rate": 0.00028888891225644286, + "loss": 7.5145, + "step": 9681 + }, + { + "epoch": 0.9034244658019969, + "grad_norm": 1.1839230445775806, + "learning_rate": 0.00028888605520242675, + "loss": 7.4018, + "step": 9682 + }, + { + "epoch": 0.9035177754968742, + "grad_norm": 0.7821125858719119, + "learning_rate": 0.0002888831977952645, + "loss": 7.5012, + "step": 9683 + }, + { + "epoch": 0.9036110851917514, + "grad_norm": 0.6830544447849969, + "learning_rate": 0.0002888803400349635, + "loss": 7.5915, + "step": 9684 + }, + { + "epoch": 0.9037043948866287, + "grad_norm": 0.7747962687295072, + "learning_rate": 0.0002888774819215309, + "loss": 7.3824, + "step": 9685 + }, + { + "epoch": 0.903797704581506, + "grad_norm": 1.5677819969228362, + "learning_rate": 0.000288874623454974, + "loss": 7.2859, + "step": 9686 + }, + { + "epoch": 0.9038910142763833, + "grad_norm": 0.5891046716698259, + "learning_rate": 0.00028887176463530007, + "loss": 7.4556, + "step": 9687 + }, + { + "epoch": 0.9039843239712606, + "grad_norm": 229418438.29118165, + "learning_rate": 0.00028886890546251634, + "loss": 7.392, + "step": 9688 + }, + { + "epoch": 0.9040776336661379, + "grad_norm": 0.5624712908959538, + "learning_rate": 0.0002888660459366302, + "loss": 7.3101, + "step": 9689 + }, + { + "epoch": 0.9041709433610152, + "grad_norm": 0.6002048744517718, + "learning_rate": 0.00028886318605764877, + "loss": 7.3347, + "step": 9690 + }, + { + "epoch": 0.9042642530558925, + "grad_norm": 0.8962707590897403, + "learning_rate": 0.0002888603258255794, + "loss": 7.3688, + "step": 9691 + }, + { + "epoch": 0.9043575627507698, + "grad_norm": 3.66128094998545, + "learning_rate": 0.0002888574652404294, + "loss": 7.4369, + "step": 9692 + }, + { + "epoch": 0.9044508724456471, + "grad_norm": 0.7203510610534003, + "learning_rate": 0.000288854604302206, + "loss": 7.3602, + "step": 9693 + }, + { + "epoch": 0.9045441821405245, + "grad_norm": 16.129601589710497, + "learning_rate": 0.00028885174301091643, + "loss": 7.3718, + "step": 9694 + }, + { + "epoch": 0.9046374918354017, + "grad_norm": 68169532.21348888, + "learning_rate": 0.00028884888136656797, + "loss": 7.2777, + "step": 9695 + }, + { + "epoch": 0.904730801530279, + "grad_norm": 1201785650.903927, + "learning_rate": 0.000288846019369168, + "loss": 7.3667, + "step": 9696 + }, + { + "epoch": 0.9048241112251563, + "grad_norm": 0.9400516165747381, + "learning_rate": 0.0002888431570187237, + "loss": 7.3919, + "step": 9697 + }, + { + "epoch": 0.9049174209200336, + "grad_norm": 0.8712779241169754, + "learning_rate": 0.00028884029431524243, + "loss": 7.4626, + "step": 9698 + }, + { + "epoch": 0.9050107306149109, + "grad_norm": 0.88475004768086, + "learning_rate": 0.0002888374312587314, + "loss": 7.2571, + "step": 9699 + }, + { + "epoch": 0.9051040403097882, + "grad_norm": 0.9944820159723459, + "learning_rate": 0.0002888345678491979, + "loss": 7.7369, + "step": 9700 + }, + { + "epoch": 0.9051973500046655, + "grad_norm": 917802840.0638072, + "learning_rate": 0.0002888317040866492, + "loss": 7.4484, + "step": 9701 + }, + { + "epoch": 0.9052906596995428, + "grad_norm": 1.020282558842427, + "learning_rate": 0.0002888288399710926, + "loss": 7.2378, + "step": 9702 + }, + { + "epoch": 0.9053839693944201, + "grad_norm": 3.026612250050622, + "learning_rate": 0.00028882597550253545, + "loss": 7.0511, + "step": 9703 + }, + { + "epoch": 0.9054772790892974, + "grad_norm": 0.7043804046523254, + "learning_rate": 0.0002888231106809849, + "loss": 7.327, + "step": 9704 + }, + { + "epoch": 0.9055705887841746, + "grad_norm": 0.8495658378264971, + "learning_rate": 0.0002888202455064483, + "loss": 7.485, + "step": 9705 + }, + { + "epoch": 0.9056638984790519, + "grad_norm": 0.8035621028332842, + "learning_rate": 0.000288817379978933, + "loss": 7.3607, + "step": 9706 + }, + { + "epoch": 0.9057572081739292, + "grad_norm": 0.8507596486291763, + "learning_rate": 0.0002888145140984462, + "loss": 7.3924, + "step": 9707 + }, + { + "epoch": 0.9058505178688065, + "grad_norm": 0.5804945265747629, + "learning_rate": 0.00028881164786499514, + "loss": 7.1375, + "step": 9708 + }, + { + "epoch": 0.9059438275636839, + "grad_norm": 0.8275941553695376, + "learning_rate": 0.0002888087812785873, + "loss": 7.0024, + "step": 9709 + }, + { + "epoch": 0.9060371372585612, + "grad_norm": 10968293676.509653, + "learning_rate": 0.00028880591433922976, + "loss": 7.4614, + "step": 9710 + }, + { + "epoch": 0.9061304469534385, + "grad_norm": 0.6009381198356479, + "learning_rate": 0.0002888030470469299, + "loss": 7.4629, + "step": 9711 + }, + { + "epoch": 0.9062237566483158, + "grad_norm": 1.4149131081992958, + "learning_rate": 0.000288800179401695, + "loss": 7.7797, + "step": 9712 + }, + { + "epoch": 0.9063170663431931, + "grad_norm": 0.9477630781033807, + "learning_rate": 0.00028879731140353244, + "loss": 7.1938, + "step": 9713 + }, + { + "epoch": 0.9064103760380704, + "grad_norm": 1.1115366378989182, + "learning_rate": 0.00028879444305244937, + "loss": 7.4281, + "step": 9714 + }, + { + "epoch": 0.9065036857329477, + "grad_norm": 0.9489912666297187, + "learning_rate": 0.0002887915743484531, + "loss": 7.4414, + "step": 9715 + }, + { + "epoch": 0.9065969954278249, + "grad_norm": 1.1152438562532077, + "learning_rate": 0.000288788705291551, + "loss": 7.6839, + "step": 9716 + }, + { + "epoch": 0.9066903051227022, + "grad_norm": 0.8339360321735462, + "learning_rate": 0.0002887858358817504, + "loss": 7.4882, + "step": 9717 + }, + { + "epoch": 0.9067836148175795, + "grad_norm": 0.5094433601271576, + "learning_rate": 0.00028878296611905847, + "loss": 7.1947, + "step": 9718 + }, + { + "epoch": 0.9068769245124568, + "grad_norm": 0.8400626747721925, + "learning_rate": 0.0002887800960034826, + "loss": 7.3204, + "step": 9719 + }, + { + "epoch": 0.9069702342073341, + "grad_norm": 496610155.65518963, + "learning_rate": 0.00028877722553503004, + "loss": 7.2158, + "step": 9720 + }, + { + "epoch": 0.9070635439022114, + "grad_norm": 1.7254409875227186, + "learning_rate": 0.0002887743547137081, + "loss": 7.3107, + "step": 9721 + }, + { + "epoch": 0.9071568535970888, + "grad_norm": 0.7263660506396972, + "learning_rate": 0.0002887714835395241, + "loss": 7.5706, + "step": 9722 + }, + { + "epoch": 0.9072501632919661, + "grad_norm": 0.5773870556374877, + "learning_rate": 0.0002887686120124853, + "loss": 7.2487, + "step": 9723 + }, + { + "epoch": 0.9073434729868434, + "grad_norm": 0.6480094405964302, + "learning_rate": 0.00028876574013259905, + "loss": 7.5852, + "step": 9724 + }, + { + "epoch": 0.9074367826817207, + "grad_norm": 0.64408433533218, + "learning_rate": 0.0002887628678998726, + "loss": 7.1111, + "step": 9725 + }, + { + "epoch": 0.9075300923765979, + "grad_norm": 1.2496258272796328, + "learning_rate": 0.0002887599953143133, + "loss": 7.1249, + "step": 9726 + }, + { + "epoch": 0.9076234020714752, + "grad_norm": 0.9062928796187341, + "learning_rate": 0.00028875712237592846, + "loss": 7.3082, + "step": 9727 + }, + { + "epoch": 0.9077167117663525, + "grad_norm": 37183210346.75327, + "learning_rate": 0.00028875424908472535, + "loss": 7.4293, + "step": 9728 + }, + { + "epoch": 0.9078100214612298, + "grad_norm": 1.3189052731720299, + "learning_rate": 0.00028875137544071125, + "loss": 7.2516, + "step": 9729 + }, + { + "epoch": 0.9079033311561071, + "grad_norm": 0.7684312930960893, + "learning_rate": 0.00028874850144389354, + "loss": 7.1762, + "step": 9730 + }, + { + "epoch": 0.9079966408509844, + "grad_norm": 1.7615087849943025, + "learning_rate": 0.0002887456270942794, + "loss": 7.4806, + "step": 9731 + }, + { + "epoch": 0.9080899505458617, + "grad_norm": 2104586423.5241647, + "learning_rate": 0.0002887427523918764, + "loss": 7.3712, + "step": 9732 + }, + { + "epoch": 0.908183260240739, + "grad_norm": 0.8668603059567069, + "learning_rate": 0.00028873987733669157, + "loss": 7.1858, + "step": 9733 + }, + { + "epoch": 0.9082765699356163, + "grad_norm": 2.749356687935105, + "learning_rate": 0.0002887370019287323, + "loss": 7.3111, + "step": 9734 + }, + { + "epoch": 0.9083698796304936, + "grad_norm": 169298692.77876246, + "learning_rate": 0.00028873412616800596, + "loss": 7.3757, + "step": 9735 + }, + { + "epoch": 0.908463189325371, + "grad_norm": 1568091232.7576625, + "learning_rate": 0.00028873125005451987, + "loss": 7.0825, + "step": 9736 + }, + { + "epoch": 0.9085564990202482, + "grad_norm": 1.0739594039750635, + "learning_rate": 0.00028872837358828133, + "loss": 7.3945, + "step": 9737 + }, + { + "epoch": 0.9086498087151255, + "grad_norm": 1.4656999260884302, + "learning_rate": 0.00028872549676929756, + "loss": 7.3662, + "step": 9738 + }, + { + "epoch": 0.9087431184100028, + "grad_norm": 17.960991654653576, + "learning_rate": 0.00028872261959757593, + "loss": 7.316, + "step": 9739 + }, + { + "epoch": 0.9088364281048801, + "grad_norm": 11.02383119705887, + "learning_rate": 0.0002887197420731238, + "loss": 7.2102, + "step": 9740 + }, + { + "epoch": 0.9089297377997574, + "grad_norm": 2.141025328303701, + "learning_rate": 0.00028871686419594853, + "loss": 7.4328, + "step": 9741 + }, + { + "epoch": 0.9090230474946347, + "grad_norm": 0.7149361956980714, + "learning_rate": 0.0002887139859660573, + "loss": 7.0438, + "step": 9742 + }, + { + "epoch": 0.909116357189512, + "grad_norm": 0.6717563762886684, + "learning_rate": 0.00028871110738345747, + "loss": 7.2251, + "step": 9743 + }, + { + "epoch": 0.9092096668843893, + "grad_norm": 0.8345593747715634, + "learning_rate": 0.00028870822844815643, + "loss": 7.2673, + "step": 9744 + }, + { + "epoch": 0.9093029765792666, + "grad_norm": 1.3792945616956969, + "learning_rate": 0.00028870534916016144, + "loss": 7.8187, + "step": 9745 + }, + { + "epoch": 0.9093962862741439, + "grad_norm": 1.0772490193012372, + "learning_rate": 0.0002887024695194798, + "loss": 7.2697, + "step": 9746 + }, + { + "epoch": 0.9094895959690212, + "grad_norm": 1.3803237016818255, + "learning_rate": 0.00028869958952611896, + "loss": 7.3668, + "step": 9747 + }, + { + "epoch": 0.9095829056638984, + "grad_norm": 0.5545260742059306, + "learning_rate": 0.00028869670918008604, + "loss": 7.2628, + "step": 9748 + }, + { + "epoch": 0.9096762153587757, + "grad_norm": 0.9367486948727589, + "learning_rate": 0.00028869382848138855, + "loss": 6.9925, + "step": 9749 + }, + { + "epoch": 0.909769525053653, + "grad_norm": 331.9808702066351, + "learning_rate": 0.0002886909474300337, + "loss": 7.2528, + "step": 9750 + }, + { + "epoch": 0.9098628347485304, + "grad_norm": 739809951.8138524, + "learning_rate": 0.0002886880660260288, + "loss": 6.9857, + "step": 9751 + }, + { + "epoch": 0.9099561444434077, + "grad_norm": 0.8598703888510968, + "learning_rate": 0.00028868518426938133, + "loss": 7.1553, + "step": 9752 + }, + { + "epoch": 0.910049454138285, + "grad_norm": 1.0388812497658535, + "learning_rate": 0.0002886823021600985, + "loss": 7.3199, + "step": 9753 + }, + { + "epoch": 0.9101427638331623, + "grad_norm": 474460218.3387101, + "learning_rate": 0.0002886794196981876, + "loss": 7.1793, + "step": 9754 + }, + { + "epoch": 0.9102360735280396, + "grad_norm": 1.7484068780894406, + "learning_rate": 0.00028867653688365605, + "loss": 7.3863, + "step": 9755 + }, + { + "epoch": 0.9103293832229169, + "grad_norm": 3.13752788397429, + "learning_rate": 0.0002886736537165112, + "loss": 7.4959, + "step": 9756 + }, + { + "epoch": 0.9104226929177942, + "grad_norm": 0.630396836566457, + "learning_rate": 0.00028867077019676025, + "loss": 7.441, + "step": 9757 + }, + { + "epoch": 0.9105160026126714, + "grad_norm": 0.7343621782391211, + "learning_rate": 0.0002886678863244106, + "loss": 7.5536, + "step": 9758 + }, + { + "epoch": 0.9106093123075487, + "grad_norm": 1.368073699621033, + "learning_rate": 0.00028866500209946965, + "loss": 7.1975, + "step": 9759 + }, + { + "epoch": 0.910702622002426, + "grad_norm": 1.3593102634801864, + "learning_rate": 0.00028866211752194467, + "loss": 7.1512, + "step": 9760 + }, + { + "epoch": 0.9107959316973033, + "grad_norm": 1.3365856008156116, + "learning_rate": 0.00028865923259184295, + "loss": 7.1047, + "step": 9761 + }, + { + "epoch": 0.9108892413921806, + "grad_norm": 6.137121895566022, + "learning_rate": 0.0002886563473091719, + "loss": 7.3092, + "step": 9762 + }, + { + "epoch": 0.910982551087058, + "grad_norm": 1.0228271221545382, + "learning_rate": 0.00028865346167393887, + "loss": 7.1249, + "step": 9763 + }, + { + "epoch": 0.9110758607819353, + "grad_norm": 2.7963084985046622, + "learning_rate": 0.0002886505756861511, + "loss": 7.6315, + "step": 9764 + }, + { + "epoch": 0.9111691704768126, + "grad_norm": 1.0948196932740555, + "learning_rate": 0.00028864768934581606, + "loss": 7.3254, + "step": 9765 + }, + { + "epoch": 0.9112624801716899, + "grad_norm": 241852664.12447888, + "learning_rate": 0.000288644802652941, + "loss": 7.8459, + "step": 9766 + }, + { + "epoch": 0.9113557898665672, + "grad_norm": 0.8471012233155609, + "learning_rate": 0.00028864191560753317, + "loss": 7.2323, + "step": 9767 + }, + { + "epoch": 0.9114490995614445, + "grad_norm": 0.9119381569923058, + "learning_rate": 0.0002886390282096001, + "loss": 7.2276, + "step": 9768 + }, + { + "epoch": 0.9115424092563217, + "grad_norm": 0.7334682701933403, + "learning_rate": 0.000288636140459149, + "loss": 7.329, + "step": 9769 + }, + { + "epoch": 0.911635718951199, + "grad_norm": 0.9032295150578485, + "learning_rate": 0.0002886332523561874, + "loss": 7.5649, + "step": 9770 + }, + { + "epoch": 0.9117290286460763, + "grad_norm": 1.002562638538388, + "learning_rate": 0.0002886303639007224, + "loss": 7.4833, + "step": 9771 + }, + { + "epoch": 0.9118223383409536, + "grad_norm": 1.2218935813531737, + "learning_rate": 0.0002886274750927614, + "loss": 7.4874, + "step": 9772 + }, + { + "epoch": 0.9119156480358309, + "grad_norm": 0.8422876090249013, + "learning_rate": 0.0002886245859323119, + "loss": 7.2675, + "step": 9773 + }, + { + "epoch": 0.9120089577307082, + "grad_norm": 0.5919956041660709, + "learning_rate": 0.0002886216964193811, + "loss": 7.2181, + "step": 9774 + }, + { + "epoch": 0.9121022674255855, + "grad_norm": 0.6066295898413414, + "learning_rate": 0.00028861880655397636, + "loss": 7.534, + "step": 9775 + }, + { + "epoch": 0.9121955771204628, + "grad_norm": 10.55316352134452, + "learning_rate": 0.0002886159163361051, + "loss": 7.698, + "step": 9776 + }, + { + "epoch": 0.9122888868153401, + "grad_norm": 72.94767890608384, + "learning_rate": 0.0002886130257657746, + "loss": 7.7985, + "step": 9777 + }, + { + "epoch": 0.9123821965102175, + "grad_norm": 0.8667627883721941, + "learning_rate": 0.00028861013484299226, + "loss": 7.5493, + "step": 9778 + }, + { + "epoch": 0.9124755062050947, + "grad_norm": 0.8179056955272592, + "learning_rate": 0.00028860724356776537, + "loss": 7.4436, + "step": 9779 + }, + { + "epoch": 0.912568815899972, + "grad_norm": 0.9093003883074595, + "learning_rate": 0.00028860435194010133, + "loss": 7.2814, + "step": 9780 + }, + { + "epoch": 0.9126621255948493, + "grad_norm": 1.1131854444648113, + "learning_rate": 0.0002886014599600075, + "loss": 7.3385, + "step": 9781 + }, + { + "epoch": 0.9127554352897266, + "grad_norm": 0.6429982401134168, + "learning_rate": 0.0002885985676274912, + "loss": 7.4819, + "step": 9782 + }, + { + "epoch": 0.9128487449846039, + "grad_norm": 1682053.6848434207, + "learning_rate": 0.00028859567494255985, + "loss": 7.8443, + "step": 9783 + }, + { + "epoch": 0.9129420546794812, + "grad_norm": 11.98786168888397, + "learning_rate": 0.0002885927819052207, + "loss": 7.288, + "step": 9784 + }, + { + "epoch": 0.9130353643743585, + "grad_norm": 0.6040100141318242, + "learning_rate": 0.00028858988851548114, + "loss": 7.6917, + "step": 9785 + }, + { + "epoch": 0.9131286740692358, + "grad_norm": 3.705351638153509, + "learning_rate": 0.00028858699477334857, + "loss": 7.2093, + "step": 9786 + }, + { + "epoch": 0.9132219837641131, + "grad_norm": 0.5081744650207336, + "learning_rate": 0.00028858410067883033, + "loss": 7.383, + "step": 9787 + }, + { + "epoch": 0.9133152934589904, + "grad_norm": 1.4271586883004461, + "learning_rate": 0.00028858120623193376, + "loss": 7.6665, + "step": 9788 + }, + { + "epoch": 0.9134086031538677, + "grad_norm": 0.9100225933048119, + "learning_rate": 0.0002885783114326663, + "loss": 7.6478, + "step": 9789 + }, + { + "epoch": 0.9135019128487449, + "grad_norm": 0.9246472421404779, + "learning_rate": 0.0002885754162810352, + "loss": 7.7133, + "step": 9790 + }, + { + "epoch": 0.9135952225436222, + "grad_norm": 1.007465085147123, + "learning_rate": 0.0002885725207770478, + "loss": 7.3963, + "step": 9791 + }, + { + "epoch": 0.9136885322384996, + "grad_norm": 19419179.21684729, + "learning_rate": 0.0002885696249207116, + "loss": 7.1999, + "step": 9792 + }, + { + "epoch": 0.9137818419333769, + "grad_norm": 0.7531035652957293, + "learning_rate": 0.00028856672871203383, + "loss": 7.3355, + "step": 9793 + }, + { + "epoch": 0.9138751516282542, + "grad_norm": 0.973508173992455, + "learning_rate": 0.000288563832151022, + "loss": 7.3038, + "step": 9794 + }, + { + "epoch": 0.9139684613231315, + "grad_norm": 1.5711994933012912, + "learning_rate": 0.00028856093523768335, + "loss": 7.8149, + "step": 9795 + }, + { + "epoch": 0.9140617710180088, + "grad_norm": 0.6757047060621135, + "learning_rate": 0.0002885580379720253, + "loss": 7.4605, + "step": 9796 + }, + { + "epoch": 0.9141550807128861, + "grad_norm": 20186261.4317399, + "learning_rate": 0.00028855514035405516, + "loss": 7.4081, + "step": 9797 + }, + { + "epoch": 0.9142483904077634, + "grad_norm": 0.5611867973907807, + "learning_rate": 0.0002885522423837804, + "loss": 7.3724, + "step": 9798 + }, + { + "epoch": 0.9143417001026407, + "grad_norm": 3.951653208696496, + "learning_rate": 0.00028854934406120827, + "loss": 7.5025, + "step": 9799 + }, + { + "epoch": 0.914435009797518, + "grad_norm": 3369477.4764548484, + "learning_rate": 0.0002885464453863462, + "loss": 7.1738, + "step": 9800 + }, + { + "epoch": 0.9145283194923952, + "grad_norm": 35523406.18218047, + "learning_rate": 0.00028854354635920163, + "loss": 7.3558, + "step": 9801 + }, + { + "epoch": 0.9146216291872725, + "grad_norm": 1.1245787678285835, + "learning_rate": 0.0002885406469797818, + "loss": 7.4613, + "step": 9802 + }, + { + "epoch": 0.9147149388821498, + "grad_norm": 3667184.5600834354, + "learning_rate": 0.0002885377472480942, + "loss": 7.0213, + "step": 9803 + }, + { + "epoch": 0.9148082485770271, + "grad_norm": 1.3814816574809596, + "learning_rate": 0.0002885348471641461, + "loss": 7.0691, + "step": 9804 + }, + { + "epoch": 0.9149015582719044, + "grad_norm": 0.7409952646563005, + "learning_rate": 0.000288531946727945, + "loss": 7.1857, + "step": 9805 + }, + { + "epoch": 0.9149948679667818, + "grad_norm": 0.8681695910916452, + "learning_rate": 0.000288529045939498, + "loss": 7.2456, + "step": 9806 + }, + { + "epoch": 0.9150881776616591, + "grad_norm": 0.8085562142662948, + "learning_rate": 0.00028852614479881285, + "loss": 7.1807, + "step": 9807 + }, + { + "epoch": 0.9151814873565364, + "grad_norm": 0.664983089117752, + "learning_rate": 0.0002885232433058967, + "loss": 7.2104, + "step": 9808 + }, + { + "epoch": 0.9152747970514137, + "grad_norm": 0.586299729460095, + "learning_rate": 0.000288520341460757, + "loss": 7.255, + "step": 9809 + }, + { + "epoch": 0.915368106746291, + "grad_norm": 0.7148298431327427, + "learning_rate": 0.00028851743926340106, + "loss": 7.5546, + "step": 9810 + }, + { + "epoch": 0.9154614164411682, + "grad_norm": 0.8708687593551762, + "learning_rate": 0.0002885145367138363, + "loss": 7.6297, + "step": 9811 + }, + { + "epoch": 0.9155547261360455, + "grad_norm": 1.3463508247046112, + "learning_rate": 0.00028851163381207013, + "loss": 7.3737, + "step": 9812 + }, + { + "epoch": 0.9156480358309228, + "grad_norm": 216216430.90660977, + "learning_rate": 0.0002885087305581099, + "loss": 7.4814, + "step": 9813 + }, + { + "epoch": 0.9157413455258001, + "grad_norm": 0.8374415165472302, + "learning_rate": 0.000288505826951963, + "loss": 7.4812, + "step": 9814 + }, + { + "epoch": 0.9158346552206774, + "grad_norm": 1.2975248186973587, + "learning_rate": 0.0002885029229936368, + "loss": 6.9178, + "step": 9815 + }, + { + "epoch": 0.9159279649155547, + "grad_norm": 0.6838109340756384, + "learning_rate": 0.00028850001868313877, + "loss": 7.3609, + "step": 9816 + }, + { + "epoch": 0.916021274610432, + "grad_norm": 1.0206447674063692, + "learning_rate": 0.0002884971140204761, + "loss": 7.4223, + "step": 9817 + }, + { + "epoch": 0.9161145843053093, + "grad_norm": 21800058.64793972, + "learning_rate": 0.00028849420900565635, + "loss": 7.1817, + "step": 9818 + }, + { + "epoch": 0.9162078940001867, + "grad_norm": 1.787205324660882, + "learning_rate": 0.0002884913036386868, + "loss": 7.7032, + "step": 9819 + }, + { + "epoch": 0.916301203695064, + "grad_norm": 1.2175431097678635, + "learning_rate": 0.000288488397919575, + "loss": 7.3778, + "step": 9820 + }, + { + "epoch": 0.9163945133899413, + "grad_norm": 0.9686639054830694, + "learning_rate": 0.0002884854918483281, + "loss": 7.5523, + "step": 9821 + }, + { + "epoch": 0.9164878230848185, + "grad_norm": 0.5906278232124736, + "learning_rate": 0.00028848258542495375, + "loss": 7.5369, + "step": 9822 + }, + { + "epoch": 0.9165811327796958, + "grad_norm": 61010254.16287282, + "learning_rate": 0.0002884796786494591, + "loss": 7.2403, + "step": 9823 + }, + { + "epoch": 0.9166744424745731, + "grad_norm": 1.2602740297633983, + "learning_rate": 0.0002884767715218517, + "loss": 7.3107, + "step": 9824 + }, + { + "epoch": 0.9167677521694504, + "grad_norm": 1708653.6083951022, + "learning_rate": 0.0002884738640421389, + "loss": 7.3084, + "step": 9825 + }, + { + "epoch": 0.9168610618643277, + "grad_norm": 8.325895386960733, + "learning_rate": 0.000288470956210328, + "loss": 7.3728, + "step": 9826 + }, + { + "epoch": 0.916954371559205, + "grad_norm": 0.5659912281126084, + "learning_rate": 0.0002884680480264266, + "loss": 7.3044, + "step": 9827 + }, + { + "epoch": 0.9170476812540823, + "grad_norm": 0.6172063479929634, + "learning_rate": 0.0002884651394904419, + "loss": 7.5615, + "step": 9828 + }, + { + "epoch": 0.9171409909489596, + "grad_norm": 0.7150189742916948, + "learning_rate": 0.00028846223060238134, + "loss": 7.1124, + "step": 9829 + }, + { + "epoch": 0.9172343006438369, + "grad_norm": 1.8700149304204987, + "learning_rate": 0.00028845932136225237, + "loss": 7.3992, + "step": 9830 + }, + { + "epoch": 0.9173276103387142, + "grad_norm": 1.2967168363519275, + "learning_rate": 0.0002884564117700624, + "loss": 7.36, + "step": 9831 + }, + { + "epoch": 0.9174209200335914, + "grad_norm": 7244546.1882026205, + "learning_rate": 0.0002884535018258188, + "loss": 7.4027, + "step": 9832 + }, + { + "epoch": 0.9175142297284687, + "grad_norm": 0.6398347502481027, + "learning_rate": 0.00028845059152952886, + "loss": 7.2664, + "step": 9833 + }, + { + "epoch": 0.917607539423346, + "grad_norm": 0.9180719778797971, + "learning_rate": 0.0002884476808812001, + "loss": 7.1857, + "step": 9834 + }, + { + "epoch": 0.9177008491182234, + "grad_norm": 5.344471516916849, + "learning_rate": 0.00028844476988083997, + "loss": 7.4749, + "step": 9835 + }, + { + "epoch": 0.9177941588131007, + "grad_norm": 1910865.8083390717, + "learning_rate": 0.00028844185852845575, + "loss": 7.3182, + "step": 9836 + }, + { + "epoch": 0.917887468507978, + "grad_norm": 0.515509215251107, + "learning_rate": 0.00028843894682405494, + "loss": 7.3943, + "step": 9837 + }, + { + "epoch": 0.9179807782028553, + "grad_norm": 0.40712503500245373, + "learning_rate": 0.0002884360347676448, + "loss": 7.3508, + "step": 9838 + }, + { + "epoch": 0.9180740878977326, + "grad_norm": 0.501064500441632, + "learning_rate": 0.00028843312235923293, + "loss": 7.1937, + "step": 9839 + }, + { + "epoch": 0.9181673975926099, + "grad_norm": 0.4913794444974268, + "learning_rate": 0.00028843020959882663, + "loss": 7.3534, + "step": 9840 + }, + { + "epoch": 0.9182607072874872, + "grad_norm": 0.6295257867634487, + "learning_rate": 0.00028842729648643323, + "loss": 7.3465, + "step": 9841 + }, + { + "epoch": 0.9183540169823645, + "grad_norm": 37108634.35295154, + "learning_rate": 0.00028842438302206026, + "loss": 7.3354, + "step": 9842 + }, + { + "epoch": 0.9184473266772417, + "grad_norm": 9313294.745907439, + "learning_rate": 0.00028842146920571516, + "loss": 7.275, + "step": 9843 + }, + { + "epoch": 0.918540636372119, + "grad_norm": 0.5945219135922443, + "learning_rate": 0.00028841855503740517, + "loss": 7.2161, + "step": 9844 + }, + { + "epoch": 0.9186339460669963, + "grad_norm": 0.414493699683316, + "learning_rate": 0.0002884156405171378, + "loss": 7.3564, + "step": 9845 + }, + { + "epoch": 0.9187272557618736, + "grad_norm": 0.48921487260136604, + "learning_rate": 0.00028841272564492055, + "loss": 7.3482, + "step": 9846 + }, + { + "epoch": 0.918820565456751, + "grad_norm": 23149468.983374465, + "learning_rate": 0.00028840981042076066, + "loss": 7.2917, + "step": 9847 + }, + { + "epoch": 0.9189138751516283, + "grad_norm": 1.065950741182078, + "learning_rate": 0.00028840689484466565, + "loss": 7.538, + "step": 9848 + }, + { + "epoch": 0.9190071848465056, + "grad_norm": 257175763.99646005, + "learning_rate": 0.0002884039789166429, + "loss": 7.18, + "step": 9849 + }, + { + "epoch": 0.9191004945413829, + "grad_norm": 2.1319007504581022, + "learning_rate": 0.00028840106263669984, + "loss": 7.3792, + "step": 9850 + }, + { + "epoch": 0.9191938042362602, + "grad_norm": 3.632270124415899, + "learning_rate": 0.00028839814600484383, + "loss": 7.5314, + "step": 9851 + }, + { + "epoch": 0.9192871139311375, + "grad_norm": 0.5824669033275978, + "learning_rate": 0.00028839522902108236, + "loss": 7.0664, + "step": 9852 + }, + { + "epoch": 0.9193804236260148, + "grad_norm": 0.7518549057843644, + "learning_rate": 0.00028839231168542286, + "loss": 7.2965, + "step": 9853 + }, + { + "epoch": 0.919473733320892, + "grad_norm": 0.6297254732432195, + "learning_rate": 0.00028838939399787264, + "loss": 7.3501, + "step": 9854 + }, + { + "epoch": 0.9195670430157693, + "grad_norm": 0.4807765670642741, + "learning_rate": 0.00028838647595843923, + "loss": 7.1547, + "step": 9855 + }, + { + "epoch": 0.9196603527106466, + "grad_norm": 7.523135398950149, + "learning_rate": 0.00028838355756713, + "loss": 7.4125, + "step": 9856 + }, + { + "epoch": 0.9197536624055239, + "grad_norm": 0.4438434186157976, + "learning_rate": 0.00028838063882395237, + "loss": 7.3413, + "step": 9857 + }, + { + "epoch": 0.9198469721004012, + "grad_norm": 1.0143459479486991, + "learning_rate": 0.00028837771972891376, + "loss": 6.9928, + "step": 9858 + }, + { + "epoch": 0.9199402817952785, + "grad_norm": 1.0971179629680694, + "learning_rate": 0.0002883748002820216, + "loss": 7.2368, + "step": 9859 + }, + { + "epoch": 0.9200335914901558, + "grad_norm": 0.6301452672776906, + "learning_rate": 0.00028837188048328334, + "loss": 7.578, + "step": 9860 + }, + { + "epoch": 0.9201269011850332, + "grad_norm": 0.4893635302003458, + "learning_rate": 0.0002883689603327063, + "loss": 7.1883, + "step": 9861 + }, + { + "epoch": 0.9202202108799105, + "grad_norm": 1638860059.7313921, + "learning_rate": 0.00028836603983029805, + "loss": 7.2027, + "step": 9862 + }, + { + "epoch": 0.9203135205747878, + "grad_norm": 0.8999991025362147, + "learning_rate": 0.00028836311897606595, + "loss": 7.5283, + "step": 9863 + }, + { + "epoch": 0.920406830269665, + "grad_norm": 1.0156696352218098, + "learning_rate": 0.00028836019777001736, + "loss": 7.2923, + "step": 9864 + }, + { + "epoch": 0.9205001399645423, + "grad_norm": 0.4261055605633498, + "learning_rate": 0.00028835727621215985, + "loss": 7.2975, + "step": 9865 + }, + { + "epoch": 0.9205934496594196, + "grad_norm": 15.005800291063789, + "learning_rate": 0.00028835435430250075, + "loss": 7.5854, + "step": 9866 + }, + { + "epoch": 0.9206867593542969, + "grad_norm": 0.9603679081638393, + "learning_rate": 0.0002883514320410475, + "loss": 7.1353, + "step": 9867 + }, + { + "epoch": 0.9207800690491742, + "grad_norm": 37581893306.792946, + "learning_rate": 0.0002883485094278075, + "loss": 7.1404, + "step": 9868 + }, + { + "epoch": 0.9208733787440515, + "grad_norm": 0.5887135931754026, + "learning_rate": 0.0002883455864627883, + "loss": 7.1628, + "step": 9869 + }, + { + "epoch": 0.9209666884389288, + "grad_norm": 1.696670242661968, + "learning_rate": 0.0002883426631459972, + "loss": 7.2915, + "step": 9870 + }, + { + "epoch": 0.9210599981338061, + "grad_norm": 0.5544366681323192, + "learning_rate": 0.0002883397394774417, + "loss": 7.3863, + "step": 9871 + }, + { + "epoch": 0.9211533078286834, + "grad_norm": 0.4461073821180479, + "learning_rate": 0.00028833681545712927, + "loss": 7.1835, + "step": 9872 + }, + { + "epoch": 0.9212466175235607, + "grad_norm": 32471479116.194023, + "learning_rate": 0.0002883338910850672, + "loss": 7.3963, + "step": 9873 + }, + { + "epoch": 0.921339927218438, + "grad_norm": 0.35383656202303787, + "learning_rate": 0.0002883309663612631, + "loss": 7.0583, + "step": 9874 + }, + { + "epoch": 0.9214332369133152, + "grad_norm": 0.48420523054909514, + "learning_rate": 0.00028832804128572426, + "loss": 7.169, + "step": 9875 + }, + { + "epoch": 0.9215265466081926, + "grad_norm": 0.6812853363660512, + "learning_rate": 0.00028832511585845827, + "loss": 7.4354, + "step": 9876 + }, + { + "epoch": 0.9216198563030699, + "grad_norm": 0.4083007174743787, + "learning_rate": 0.0002883221900794725, + "loss": 7.3032, + "step": 9877 + }, + { + "epoch": 0.9217131659979472, + "grad_norm": 67409678821.778206, + "learning_rate": 0.0002883192639487743, + "loss": 7.4003, + "step": 9878 + }, + { + "epoch": 0.9218064756928245, + "grad_norm": 1754517642.6702585, + "learning_rate": 0.0002883163374663712, + "loss": 7.0883, + "step": 9879 + }, + { + "epoch": 0.9218997853877018, + "grad_norm": 0.549598539071856, + "learning_rate": 0.0002883134106322706, + "loss": 7.2714, + "step": 9880 + }, + { + "epoch": 0.9219930950825791, + "grad_norm": 0.747090822083238, + "learning_rate": 0.00028831048344648007, + "loss": 7.5597, + "step": 9881 + }, + { + "epoch": 0.9220864047774564, + "grad_norm": 90104112.62393552, + "learning_rate": 0.00028830755590900683, + "loss": 7.2383, + "step": 9882 + }, + { + "epoch": 0.9221797144723337, + "grad_norm": 11858943915.988878, + "learning_rate": 0.0002883046280198585, + "loss": 7.7634, + "step": 9883 + }, + { + "epoch": 0.922273024167211, + "grad_norm": 0.8651468498604218, + "learning_rate": 0.00028830169977904247, + "loss": 7.7735, + "step": 9884 + }, + { + "epoch": 0.9223663338620882, + "grad_norm": 8654856351.32975, + "learning_rate": 0.0002882987711865662, + "loss": 7.4614, + "step": 9885 + }, + { + "epoch": 0.9224596435569655, + "grad_norm": 1.1393906955309383, + "learning_rate": 0.0002882958422424371, + "loss": 7.2205, + "step": 9886 + }, + { + "epoch": 0.9225529532518428, + "grad_norm": 1.1911650001783087, + "learning_rate": 0.0002882929129466626, + "loss": 7.425, + "step": 9887 + }, + { + "epoch": 0.9226462629467201, + "grad_norm": 660584.4916464324, + "learning_rate": 0.0002882899832992503, + "loss": 7.7542, + "step": 9888 + }, + { + "epoch": 0.9227395726415974, + "grad_norm": 7.539391370285734, + "learning_rate": 0.00028828705330020743, + "loss": 7.4541, + "step": 9889 + }, + { + "epoch": 0.9228328823364748, + "grad_norm": 1.291676380698759, + "learning_rate": 0.0002882841229495416, + "loss": 7.2374, + "step": 9890 + }, + { + "epoch": 0.9229261920313521, + "grad_norm": 17.52085079733263, + "learning_rate": 0.0002882811922472602, + "loss": 7.4438, + "step": 9891 + }, + { + "epoch": 0.9230195017262294, + "grad_norm": 22.845938391432718, + "learning_rate": 0.00028827826119337064, + "loss": 7.4974, + "step": 9892 + }, + { + "epoch": 0.9231128114211067, + "grad_norm": 5.939623104566282, + "learning_rate": 0.0002882753297878805, + "loss": 7.4687, + "step": 9893 + }, + { + "epoch": 0.923206121115984, + "grad_norm": 6.7889550468902495, + "learning_rate": 0.00028827239803079713, + "loss": 7.5671, + "step": 9894 + }, + { + "epoch": 0.9232994308108613, + "grad_norm": 10983050.206697345, + "learning_rate": 0.000288269465922128, + "loss": 7.5435, + "step": 9895 + }, + { + "epoch": 0.9233927405057385, + "grad_norm": 2.867214474028326, + "learning_rate": 0.0002882665334618806, + "loss": 7.6645, + "step": 9896 + }, + { + "epoch": 0.9234860502006158, + "grad_norm": 2.145403746348634, + "learning_rate": 0.0002882636006500623, + "loss": 7.4402, + "step": 9897 + }, + { + "epoch": 0.9235793598954931, + "grad_norm": 11.830659203275424, + "learning_rate": 0.00028826066748668065, + "loss": 7.7299, + "step": 9898 + }, + { + "epoch": 0.9236726695903704, + "grad_norm": 3.630880924989341, + "learning_rate": 0.0002882577339717431, + "loss": 7.6627, + "step": 9899 + }, + { + "epoch": 0.9237659792852477, + "grad_norm": 1.7839283988995533, + "learning_rate": 0.0002882548001052571, + "loss": 7.5219, + "step": 9900 + }, + { + "epoch": 0.923859288980125, + "grad_norm": 27.486587467132654, + "learning_rate": 0.0002882518658872301, + "loss": 7.5837, + "step": 9901 + }, + { + "epoch": 0.9239525986750023, + "grad_norm": 2.518962458298486, + "learning_rate": 0.00028824893131766953, + "loss": 7.6211, + "step": 9902 + }, + { + "epoch": 0.9240459083698797, + "grad_norm": 1.4800037344590307, + "learning_rate": 0.0002882459963965829, + "loss": 7.9442, + "step": 9903 + }, + { + "epoch": 0.924139218064757, + "grad_norm": 0.9084835472158839, + "learning_rate": 0.00028824306112397766, + "loss": 7.544, + "step": 9904 + }, + { + "epoch": 0.9242325277596343, + "grad_norm": 4.274438402857904, + "learning_rate": 0.0002882401254998612, + "loss": 7.8093, + "step": 9905 + }, + { + "epoch": 0.9243258374545116, + "grad_norm": 1.781833721140427, + "learning_rate": 0.0002882371895242411, + "loss": 7.2314, + "step": 9906 + }, + { + "epoch": 0.9244191471493888, + "grad_norm": 8.438134823842324, + "learning_rate": 0.0002882342531971248, + "loss": 7.6355, + "step": 9907 + }, + { + "epoch": 0.9245124568442661, + "grad_norm": 205196.79008949135, + "learning_rate": 0.0002882313165185197, + "loss": 7.486, + "step": 9908 + }, + { + "epoch": 0.9246057665391434, + "grad_norm": 2.9068557515932985, + "learning_rate": 0.0002882283794884333, + "loss": 7.5067, + "step": 9909 + }, + { + "epoch": 0.9246990762340207, + "grad_norm": 1.336443007064225, + "learning_rate": 0.0002882254421068731, + "loss": 7.4626, + "step": 9910 + }, + { + "epoch": 0.924792385928898, + "grad_norm": 1.6469127286445242, + "learning_rate": 0.0002882225043738466, + "loss": 7.6223, + "step": 9911 + }, + { + "epoch": 0.9248856956237753, + "grad_norm": 44.59046974576397, + "learning_rate": 0.00028821956628936115, + "loss": 7.279, + "step": 9912 + }, + { + "epoch": 0.9249790053186526, + "grad_norm": 1.0752583894371888, + "learning_rate": 0.00028821662785342425, + "loss": 7.3836, + "step": 9913 + }, + { + "epoch": 0.9250723150135299, + "grad_norm": 12534.364466193936, + "learning_rate": 0.00028821368906604343, + "loss": 7.3935, + "step": 9914 + }, + { + "epoch": 0.9251656247084072, + "grad_norm": 0.7581337833723633, + "learning_rate": 0.0002882107499272262, + "loss": 7.5492, + "step": 9915 + }, + { + "epoch": 0.9252589344032845, + "grad_norm": 44.52199000526255, + "learning_rate": 0.00028820781043697996, + "loss": 7.3456, + "step": 9916 + }, + { + "epoch": 0.9253522440981617, + "grad_norm": 2.185548237519249, + "learning_rate": 0.00028820487059531217, + "loss": 7.5136, + "step": 9917 + }, + { + "epoch": 0.925445553793039, + "grad_norm": 1.923212901738329, + "learning_rate": 0.00028820193040223033, + "loss": 7.7297, + "step": 9918 + }, + { + "epoch": 0.9255388634879164, + "grad_norm": 1.526337256966379, + "learning_rate": 0.000288198989857742, + "loss": 7.4252, + "step": 9919 + }, + { + "epoch": 0.9256321731827937, + "grad_norm": 1.0770017109281016, + "learning_rate": 0.00028819604896185443, + "loss": 7.5105, + "step": 9920 + }, + { + "epoch": 0.925725482877671, + "grad_norm": 4.799687037507409, + "learning_rate": 0.00028819310771457533, + "loss": 7.4531, + "step": 9921 + }, + { + "epoch": 0.9258187925725483, + "grad_norm": 0.9139219175800404, + "learning_rate": 0.0002881901661159121, + "loss": 7.742, + "step": 9922 + }, + { + "epoch": 0.9259121022674256, + "grad_norm": 13.834828557261494, + "learning_rate": 0.0002881872241658722, + "loss": 7.5305, + "step": 9923 + }, + { + "epoch": 0.9260054119623029, + "grad_norm": 10348.629725797211, + "learning_rate": 0.0002881842818644631, + "loss": 7.7779, + "step": 9924 + }, + { + "epoch": 0.9260987216571802, + "grad_norm": 0.6630846695590223, + "learning_rate": 0.00028818133921169236, + "loss": 7.5447, + "step": 9925 + }, + { + "epoch": 0.9261920313520575, + "grad_norm": 2.076868035202854, + "learning_rate": 0.00028817839620756734, + "loss": 7.7165, + "step": 9926 + }, + { + "epoch": 0.9262853410469348, + "grad_norm": 1.0103248960268383, + "learning_rate": 0.00028817545285209566, + "loss": 7.4024, + "step": 9927 + }, + { + "epoch": 0.926378650741812, + "grad_norm": 1906.4136081731278, + "learning_rate": 0.0002881725091452847, + "loss": 7.4942, + "step": 9928 + }, + { + "epoch": 0.9264719604366893, + "grad_norm": 1.2762249699926151, + "learning_rate": 0.00028816956508714197, + "loss": 7.3599, + "step": 9929 + }, + { + "epoch": 0.9265652701315666, + "grad_norm": 1806.1430109956023, + "learning_rate": 0.00028816662067767497, + "loss": 7.4077, + "step": 9930 + }, + { + "epoch": 0.926658579826444, + "grad_norm": 2.3085287185567775, + "learning_rate": 0.00028816367591689117, + "loss": 7.5218, + "step": 9931 + }, + { + "epoch": 0.9267518895213213, + "grad_norm": 3.438118187788632, + "learning_rate": 0.0002881607308047981, + "loss": 7.7409, + "step": 9932 + }, + { + "epoch": 0.9268451992161986, + "grad_norm": 8.87584441360568, + "learning_rate": 0.00028815778534140317, + "loss": 7.3172, + "step": 9933 + }, + { + "epoch": 0.9269385089110759, + "grad_norm": 0.7699363319387841, + "learning_rate": 0.00028815483952671393, + "loss": 7.4104, + "step": 9934 + }, + { + "epoch": 0.9270318186059532, + "grad_norm": 3.767718771714324, + "learning_rate": 0.0002881518933607379, + "loss": 7.6618, + "step": 9935 + }, + { + "epoch": 0.9271251283008305, + "grad_norm": 1.7488735371846913, + "learning_rate": 0.0002881489468434825, + "loss": 7.3234, + "step": 9936 + }, + { + "epoch": 0.9272184379957078, + "grad_norm": 7.526291980346218, + "learning_rate": 0.00028814599997495523, + "loss": 7.4044, + "step": 9937 + }, + { + "epoch": 0.927311747690585, + "grad_norm": 783.0794957025331, + "learning_rate": 0.00028814305275516366, + "loss": 7.3783, + "step": 9938 + }, + { + "epoch": 0.9274050573854623, + "grad_norm": 1.1970754360987736, + "learning_rate": 0.00028814010518411515, + "loss": 7.5736, + "step": 9939 + }, + { + "epoch": 0.9274983670803396, + "grad_norm": 4.944108086861496, + "learning_rate": 0.0002881371572618174, + "loss": 7.3029, + "step": 9940 + }, + { + "epoch": 0.9275916767752169, + "grad_norm": 2.1408578740932143, + "learning_rate": 0.00028813420898827764, + "loss": 7.4635, + "step": 9941 + }, + { + "epoch": 0.9276849864700942, + "grad_norm": 1.2735694869659349, + "learning_rate": 0.00028813126036350355, + "loss": 7.3609, + "step": 9942 + }, + { + "epoch": 0.9277782961649715, + "grad_norm": 0.8478534469503661, + "learning_rate": 0.0002881283113875026, + "loss": 7.4333, + "step": 9943 + }, + { + "epoch": 0.9278716058598488, + "grad_norm": 1.5533274227436988, + "learning_rate": 0.0002881253620602823, + "loss": 7.9096, + "step": 9944 + }, + { + "epoch": 0.9279649155547262, + "grad_norm": 4.680891929640921, + "learning_rate": 0.00028812241238185005, + "loss": 7.4438, + "step": 9945 + }, + { + "epoch": 0.9280582252496035, + "grad_norm": 1335.16300841817, + "learning_rate": 0.00028811946235221346, + "loss": 7.4405, + "step": 9946 + }, + { + "epoch": 0.9281515349444808, + "grad_norm": 884.2308226625794, + "learning_rate": 0.00028811651197138, + "loss": 7.3969, + "step": 9947 + }, + { + "epoch": 0.9282448446393581, + "grad_norm": 3676.931592543273, + "learning_rate": 0.0002881135612393572, + "loss": 7.2576, + "step": 9948 + }, + { + "epoch": 0.9283381543342353, + "grad_norm": 0.6488875243367083, + "learning_rate": 0.00028811061015615244, + "loss": 7.511, + "step": 9949 + }, + { + "epoch": 0.9284314640291126, + "grad_norm": 2.490385714036773, + "learning_rate": 0.0002881076587217733, + "loss": 7.7012, + "step": 9950 + }, + { + "epoch": 0.9285247737239899, + "grad_norm": 6.3732419989088775, + "learning_rate": 0.00028810470693622735, + "loss": 7.367, + "step": 9951 + }, + { + "epoch": 0.9286180834188672, + "grad_norm": 17.719450722563764, + "learning_rate": 0.00028810175479952206, + "loss": 7.4514, + "step": 9952 + }, + { + "epoch": 0.9287113931137445, + "grad_norm": 0.6507532815877958, + "learning_rate": 0.0002880988023116649, + "loss": 7.289, + "step": 9953 + }, + { + "epoch": 0.9288047028086218, + "grad_norm": 1.0877165825591202, + "learning_rate": 0.0002880958494726633, + "loss": 7.8477, + "step": 9954 + }, + { + "epoch": 0.9288980125034991, + "grad_norm": 2.2198597564620743, + "learning_rate": 0.000288092896282525, + "loss": 7.7146, + "step": 9955 + }, + { + "epoch": 0.9289913221983764, + "grad_norm": 2.0493531178317275, + "learning_rate": 0.0002880899427412573, + "loss": 7.2751, + "step": 9956 + }, + { + "epoch": 0.9290846318932537, + "grad_norm": 2177.341546597192, + "learning_rate": 0.00028808698884886776, + "loss": 7.5211, + "step": 9957 + }, + { + "epoch": 0.929177941588131, + "grad_norm": 1.5824292752087514, + "learning_rate": 0.00028808403460536396, + "loss": 7.46, + "step": 9958 + }, + { + "epoch": 0.9292712512830084, + "grad_norm": 2.239986254150886, + "learning_rate": 0.0002880810800107534, + "loss": 7.5175, + "step": 9959 + }, + { + "epoch": 0.9293645609778856, + "grad_norm": 0.7949147507075021, + "learning_rate": 0.0002880781250650435, + "loss": 7.5307, + "step": 9960 + }, + { + "epoch": 0.9294578706727629, + "grad_norm": 1.025543621626051, + "learning_rate": 0.0002880751697682418, + "loss": 7.3151, + "step": 9961 + }, + { + "epoch": 0.9295511803676402, + "grad_norm": 1.4252686433332349, + "learning_rate": 0.0002880722141203559, + "loss": 7.801, + "step": 9962 + }, + { + "epoch": 0.9296444900625175, + "grad_norm": 902.3732642061698, + "learning_rate": 0.0002880692581213932, + "loss": 7.5697, + "step": 9963 + }, + { + "epoch": 0.9297377997573948, + "grad_norm": 0.6154547075426844, + "learning_rate": 0.00028806630177136133, + "loss": 7.4499, + "step": 9964 + }, + { + "epoch": 0.9298311094522721, + "grad_norm": 866.8792018926413, + "learning_rate": 0.00028806334507026774, + "loss": 7.7731, + "step": 9965 + }, + { + "epoch": 0.9299244191471494, + "grad_norm": 1.045034721367844, + "learning_rate": 0.00028806038801811994, + "loss": 7.3648, + "step": 9966 + }, + { + "epoch": 0.9300177288420267, + "grad_norm": 281.95206371530475, + "learning_rate": 0.00028805743061492554, + "loss": 7.2911, + "step": 9967 + }, + { + "epoch": 0.930111038536904, + "grad_norm": 25.572027976270352, + "learning_rate": 0.00028805447286069194, + "loss": 7.3905, + "step": 9968 + }, + { + "epoch": 0.9302043482317813, + "grad_norm": 11.74142336314245, + "learning_rate": 0.00028805151475542665, + "loss": 7.151, + "step": 9969 + }, + { + "epoch": 0.9302976579266585, + "grad_norm": 0.8010027733633944, + "learning_rate": 0.0002880485562991373, + "loss": 7.5433, + "step": 9970 + }, + { + "epoch": 0.9303909676215358, + "grad_norm": 0.8336752475795394, + "learning_rate": 0.0002880455974918314, + "loss": 7.2633, + "step": 9971 + }, + { + "epoch": 0.9304842773164131, + "grad_norm": 2.596679327667928, + "learning_rate": 0.00028804263833351643, + "loss": 7.5235, + "step": 9972 + }, + { + "epoch": 0.9305775870112905, + "grad_norm": 0.8723904225040526, + "learning_rate": 0.0002880396788241999, + "loss": 7.205, + "step": 9973 + }, + { + "epoch": 0.9306708967061678, + "grad_norm": 2.00363138863015, + "learning_rate": 0.0002880367189638894, + "loss": 7.4342, + "step": 9974 + }, + { + "epoch": 0.9307642064010451, + "grad_norm": 2.5363942086749436, + "learning_rate": 0.0002880337587525923, + "loss": 7.6684, + "step": 9975 + }, + { + "epoch": 0.9308575160959224, + "grad_norm": 0.9096789477775667, + "learning_rate": 0.00028803079819031635, + "loss": 7.2786, + "step": 9976 + }, + { + "epoch": 0.9309508257907997, + "grad_norm": 5.607564865082535, + "learning_rate": 0.0002880278372770689, + "loss": 7.5142, + "step": 9977 + }, + { + "epoch": 0.931044135485677, + "grad_norm": 0.8116060314011572, + "learning_rate": 0.0002880248760128576, + "loss": 7.8178, + "step": 9978 + }, + { + "epoch": 0.9311374451805543, + "grad_norm": 1.5615818475198864, + "learning_rate": 0.00028802191439768985, + "loss": 7.5606, + "step": 9979 + }, + { + "epoch": 0.9312307548754316, + "grad_norm": 0.9031428479374445, + "learning_rate": 0.00028801895243157335, + "loss": 7.3727, + "step": 9980 + }, + { + "epoch": 0.9313240645703088, + "grad_norm": 784.841965237835, + "learning_rate": 0.0002880159901145155, + "loss": 7.3933, + "step": 9981 + }, + { + "epoch": 0.9314173742651861, + "grad_norm": 0.7600194467471357, + "learning_rate": 0.0002880130274465239, + "loss": 7.3872, + "step": 9982 + }, + { + "epoch": 0.9315106839600634, + "grad_norm": 1.1911280096168408, + "learning_rate": 0.000288010064427606, + "loss": 7.4494, + "step": 9983 + }, + { + "epoch": 0.9316039936549407, + "grad_norm": 1.3819586012413283, + "learning_rate": 0.0002880071010577694, + "loss": 7.3989, + "step": 9984 + }, + { + "epoch": 0.931697303349818, + "grad_norm": 6.595651396795508, + "learning_rate": 0.0002880041373370216, + "loss": 7.4466, + "step": 9985 + }, + { + "epoch": 0.9317906130446953, + "grad_norm": 5.337221343191941, + "learning_rate": 0.00028800117326537023, + "loss": 7.3458, + "step": 9986 + }, + { + "epoch": 0.9318839227395727, + "grad_norm": 0.5848176029359626, + "learning_rate": 0.00028799820884282273, + "loss": 7.4655, + "step": 9987 + }, + { + "epoch": 0.93197723243445, + "grad_norm": 5.074691323373423, + "learning_rate": 0.00028799524406938665, + "loss": 7.6204, + "step": 9988 + }, + { + "epoch": 0.9320705421293273, + "grad_norm": 1.7413981166523755, + "learning_rate": 0.0002879922789450695, + "loss": 7.3994, + "step": 9989 + }, + { + "epoch": 0.9321638518242046, + "grad_norm": 1.7031367780851216, + "learning_rate": 0.00028798931346987896, + "loss": 7.1923, + "step": 9990 + }, + { + "epoch": 0.9322571615190818, + "grad_norm": 3.129317449426592, + "learning_rate": 0.00028798634764382237, + "loss": 7.3962, + "step": 9991 + }, + { + "epoch": 0.9323504712139591, + "grad_norm": 3.008641652392097, + "learning_rate": 0.0002879833814669074, + "loss": 7.4343, + "step": 9992 + }, + { + "epoch": 0.9324437809088364, + "grad_norm": 0.7983609464498005, + "learning_rate": 0.0002879804149391416, + "loss": 7.6723, + "step": 9993 + }, + { + "epoch": 0.9325370906037137, + "grad_norm": 0.9021287288931138, + "learning_rate": 0.00028797744806053244, + "loss": 6.9933, + "step": 9994 + }, + { + "epoch": 0.932630400298591, + "grad_norm": 2.985990368242071, + "learning_rate": 0.00028797448083108754, + "loss": 7.4185, + "step": 9995 + }, + { + "epoch": 0.9327237099934683, + "grad_norm": 5316.073912873718, + "learning_rate": 0.0002879715132508144, + "loss": 7.4076, + "step": 9996 + }, + { + "epoch": 0.9328170196883456, + "grad_norm": 0.5868994193967932, + "learning_rate": 0.0002879685453197205, + "loss": 7.143, + "step": 9997 + }, + { + "epoch": 0.9329103293832229, + "grad_norm": 1.227570573038983, + "learning_rate": 0.0002879655770378136, + "loss": 7.7231, + "step": 9998 + }, + { + "epoch": 0.9330036390781002, + "grad_norm": 0.8773383853149141, + "learning_rate": 0.000287962608405101, + "loss": 7.4631, + "step": 9999 + }, + { + "epoch": 0.9330969487729776, + "grad_norm": 14376.796292190322, + "learning_rate": 0.0002879596394215904, + "loss": 7.6443, + "step": 10000 + }, + { + "epoch": 0.9331902584678549, + "grad_norm": 2659.5493494035027, + "learning_rate": 0.0002879566700872893, + "loss": 7.3529, + "step": 10001 + }, + { + "epoch": 0.9332835681627321, + "grad_norm": 524.0889474904743, + "learning_rate": 0.00028795370040220517, + "loss": 7.2038, + "step": 10002 + }, + { + "epoch": 0.9333768778576094, + "grad_norm": 1.5023157349800327, + "learning_rate": 0.0002879507303663457, + "loss": 7.3505, + "step": 10003 + }, + { + "epoch": 0.9334701875524867, + "grad_norm": 4.4369168385487265, + "learning_rate": 0.00028794775997971847, + "loss": 7.4224, + "step": 10004 + }, + { + "epoch": 0.933563497247364, + "grad_norm": 3.0799971902834034, + "learning_rate": 0.0002879447892423308, + "loss": 7.5159, + "step": 10005 + }, + { + "epoch": 0.9336568069422413, + "grad_norm": 1.8271623453834427, + "learning_rate": 0.0002879418181541905, + "loss": 7.2132, + "step": 10006 + }, + { + "epoch": 0.9337501166371186, + "grad_norm": 1.4954662441614117, + "learning_rate": 0.00028793884671530497, + "loss": 7.6653, + "step": 10007 + }, + { + "epoch": 0.9338434263319959, + "grad_norm": 2.4050102613589344, + "learning_rate": 0.00028793587492568183, + "loss": 7.3607, + "step": 10008 + }, + { + "epoch": 0.9339367360268732, + "grad_norm": 0.8462378735519801, + "learning_rate": 0.0002879329027853286, + "loss": 7.5598, + "step": 10009 + }, + { + "epoch": 0.9340300457217505, + "grad_norm": 2554.0359908688124, + "learning_rate": 0.00028792993029425296, + "loss": 7.6517, + "step": 10010 + }, + { + "epoch": 0.9341233554166278, + "grad_norm": 1610.8136593940487, + "learning_rate": 0.00028792695745246224, + "loss": 7.275, + "step": 10011 + }, + { + "epoch": 0.9342166651115051, + "grad_norm": 4.092551295007288, + "learning_rate": 0.00028792398425996416, + "loss": 7.3724, + "step": 10012 + }, + { + "epoch": 0.9343099748063823, + "grad_norm": 8864.176002600534, + "learning_rate": 0.00028792101071676624, + "loss": 7.6107, + "step": 10013 + }, + { + "epoch": 0.9344032845012596, + "grad_norm": 256.8111892279175, + "learning_rate": 0.00028791803682287604, + "loss": 7.429, + "step": 10014 + }, + { + "epoch": 0.934496594196137, + "grad_norm": 0.537961349179205, + "learning_rate": 0.0002879150625783011, + "loss": 7.3251, + "step": 10015 + }, + { + "epoch": 0.9345899038910143, + "grad_norm": 1.1007342988124322, + "learning_rate": 0.0002879120879830491, + "loss": 7.4508, + "step": 10016 + }, + { + "epoch": 0.9346832135858916, + "grad_norm": 154.25789208960887, + "learning_rate": 0.0002879091130371275, + "loss": 7.3173, + "step": 10017 + }, + { + "epoch": 0.9347765232807689, + "grad_norm": 95.04542724673468, + "learning_rate": 0.00028790613774054384, + "loss": 7.3514, + "step": 10018 + }, + { + "epoch": 0.9348698329756462, + "grad_norm": 4.677770716917154, + "learning_rate": 0.0002879031620933057, + "loss": 7.528, + "step": 10019 + }, + { + "epoch": 0.9349631426705235, + "grad_norm": 1332.2087844106313, + "learning_rate": 0.0002879001860954207, + "loss": 7.0668, + "step": 10020 + }, + { + "epoch": 0.9350564523654008, + "grad_norm": 0.7530050327399932, + "learning_rate": 0.00028789720974689635, + "loss": 7.2561, + "step": 10021 + }, + { + "epoch": 0.9351497620602781, + "grad_norm": 31.80059876547533, + "learning_rate": 0.00028789423304774025, + "loss": 7.5016, + "step": 10022 + }, + { + "epoch": 0.9352430717551553, + "grad_norm": 2.2559816722190673, + "learning_rate": 0.00028789125599796, + "loss": 7.0546, + "step": 10023 + }, + { + "epoch": 0.9353363814500326, + "grad_norm": 12.885388632679156, + "learning_rate": 0.00028788827859756313, + "loss": 7.6205, + "step": 10024 + }, + { + "epoch": 0.9354296911449099, + "grad_norm": 0.9212696998541046, + "learning_rate": 0.0002878853008465571, + "loss": 7.4018, + "step": 10025 + }, + { + "epoch": 0.9355230008397872, + "grad_norm": 7.2340954061249265, + "learning_rate": 0.00028788232274494974, + "loss": 7.605, + "step": 10026 + }, + { + "epoch": 0.9356163105346645, + "grad_norm": 0.7163173241597451, + "learning_rate": 0.0002878793442927484, + "loss": 7.1623, + "step": 10027 + }, + { + "epoch": 0.9357096202295418, + "grad_norm": 7.727772494675365, + "learning_rate": 0.0002878763654899608, + "loss": 7.4275, + "step": 10028 + }, + { + "epoch": 0.9358029299244192, + "grad_norm": 11.370288920352245, + "learning_rate": 0.0002878733863365944, + "loss": 7.1101, + "step": 10029 + }, + { + "epoch": 0.9358962396192965, + "grad_norm": 1.1486370252321627, + "learning_rate": 0.00028787040683265683, + "loss": 7.029, + "step": 10030 + }, + { + "epoch": 0.9359895493141738, + "grad_norm": 1.7640464425718791, + "learning_rate": 0.0002878674269781556, + "loss": 7.9148, + "step": 10031 + }, + { + "epoch": 0.9360828590090511, + "grad_norm": 1.3275238860989849, + "learning_rate": 0.00028786444677309837, + "loss": 7.629, + "step": 10032 + }, + { + "epoch": 0.9361761687039284, + "grad_norm": 1.2510647235954073, + "learning_rate": 0.0002878614662174927, + "loss": 7.3936, + "step": 10033 + }, + { + "epoch": 0.9362694783988056, + "grad_norm": 0.9687510763165096, + "learning_rate": 0.0002878584853113462, + "loss": 7.3785, + "step": 10034 + }, + { + "epoch": 0.9363627880936829, + "grad_norm": 2.3819142274142004, + "learning_rate": 0.00028785550405466633, + "loss": 7.5255, + "step": 10035 + }, + { + "epoch": 0.9364560977885602, + "grad_norm": 1.4030220460657028, + "learning_rate": 0.00028785252244746084, + "loss": 7.0958, + "step": 10036 + }, + { + "epoch": 0.9365494074834375, + "grad_norm": 2.13323595717101, + "learning_rate": 0.00028784954048973713, + "loss": 7.4227, + "step": 10037 + }, + { + "epoch": 0.9366427171783148, + "grad_norm": 2.3779974886314887, + "learning_rate": 0.00028784655818150294, + "loss": 7.3671, + "step": 10038 + }, + { + "epoch": 0.9367360268731921, + "grad_norm": 0.7600116683882415, + "learning_rate": 0.00028784357552276576, + "loss": 7.2536, + "step": 10039 + }, + { + "epoch": 0.9368293365680694, + "grad_norm": 3.518502923135271, + "learning_rate": 0.00028784059251353323, + "loss": 7.1044, + "step": 10040 + }, + { + "epoch": 0.9369226462629467, + "grad_norm": 652.1017677997208, + "learning_rate": 0.00028783760915381283, + "loss": 7.327, + "step": 10041 + }, + { + "epoch": 0.937015955957824, + "grad_norm": 23917.654570030605, + "learning_rate": 0.00028783462544361226, + "loss": 7.9417, + "step": 10042 + }, + { + "epoch": 0.9371092656527014, + "grad_norm": 7.511775831335926, + "learning_rate": 0.00028783164138293906, + "loss": 7.3527, + "step": 10043 + }, + { + "epoch": 0.9372025753475786, + "grad_norm": 0.9661155787958705, + "learning_rate": 0.00028782865697180086, + "loss": 7.4387, + "step": 10044 + }, + { + "epoch": 0.9372958850424559, + "grad_norm": 17926.268671695336, + "learning_rate": 0.00028782567221020516, + "loss": 7.6768, + "step": 10045 + }, + { + "epoch": 0.9373891947373332, + "grad_norm": 2.6212946912929755, + "learning_rate": 0.0002878226870981597, + "loss": 7.5126, + "step": 10046 + }, + { + "epoch": 0.9374825044322105, + "grad_norm": 4.413121750831781, + "learning_rate": 0.00028781970163567185, + "loss": 7.509, + "step": 10047 + }, + { + "epoch": 0.9375758141270878, + "grad_norm": 1.7004684618750514, + "learning_rate": 0.0002878167158227494, + "loss": 7.1334, + "step": 10048 + }, + { + "epoch": 0.9376691238219651, + "grad_norm": 31.08352941549549, + "learning_rate": 0.00028781372965939984, + "loss": 7.4654, + "step": 10049 + }, + { + "epoch": 0.9377624335168424, + "grad_norm": 7.427085054970451, + "learning_rate": 0.0002878107431456308, + "loss": 7.2846, + "step": 10050 + }, + { + "epoch": 0.9378557432117197, + "grad_norm": 9053.260159845539, + "learning_rate": 0.00028780775628144985, + "loss": 7.5555, + "step": 10051 + }, + { + "epoch": 0.937949052906597, + "grad_norm": 1.756626637478533, + "learning_rate": 0.0002878047690668646, + "loss": 7.4165, + "step": 10052 + }, + { + "epoch": 0.9380423626014743, + "grad_norm": 3.8934175108853157, + "learning_rate": 0.00028780178150188265, + "loss": 7.3909, + "step": 10053 + }, + { + "epoch": 0.9381356722963516, + "grad_norm": 1.6833689488700847, + "learning_rate": 0.00028779879358651163, + "loss": 7.6522, + "step": 10054 + }, + { + "epoch": 0.9382289819912288, + "grad_norm": 0.7710255986057488, + "learning_rate": 0.00028779580532075906, + "loss": 7.561, + "step": 10055 + }, + { + "epoch": 0.9383222916861061, + "grad_norm": 1.4336871828051474, + "learning_rate": 0.0002877928167046325, + "loss": 7.2591, + "step": 10056 + }, + { + "epoch": 0.9384156013809835, + "grad_norm": 2.257004338820098, + "learning_rate": 0.00028778982773813974, + "loss": 7.5172, + "step": 10057 + }, + { + "epoch": 0.9385089110758608, + "grad_norm": 197.4815879576014, + "learning_rate": 0.00028778683842128823, + "loss": 7.3279, + "step": 10058 + }, + { + "epoch": 0.9386022207707381, + "grad_norm": 0.7523854512300473, + "learning_rate": 0.0002877838487540856, + "loss": 7.8027, + "step": 10059 + }, + { + "epoch": 0.9386955304656154, + "grad_norm": 158.45388064935585, + "learning_rate": 0.0002877808587365394, + "loss": 7.7109, + "step": 10060 + }, + { + "epoch": 0.9387888401604927, + "grad_norm": 15410.061351978302, + "learning_rate": 0.00028777786836865733, + "loss": 7.7633, + "step": 10061 + }, + { + "epoch": 0.93888214985537, + "grad_norm": 0.8504095116382718, + "learning_rate": 0.000287774877650447, + "loss": 7.5908, + "step": 10062 + }, + { + "epoch": 0.9389754595502473, + "grad_norm": 0.875337930045573, + "learning_rate": 0.00028777188658191593, + "loss": 7.2016, + "step": 10063 + }, + { + "epoch": 0.9390687692451246, + "grad_norm": 4.767334895323983, + "learning_rate": 0.0002877688951630718, + "loss": 7.0031, + "step": 10064 + }, + { + "epoch": 0.9391620789400019, + "grad_norm": 3.200096292324104, + "learning_rate": 0.0002877659033939221, + "loss": 7.7428, + "step": 10065 + }, + { + "epoch": 0.9392553886348791, + "grad_norm": 1.2049270725691301, + "learning_rate": 0.00028776291127447455, + "loss": 7.482, + "step": 10066 + }, + { + "epoch": 0.9393486983297564, + "grad_norm": 0.4804324700463375, + "learning_rate": 0.00028775991880473673, + "loss": 7.3604, + "step": 10067 + }, + { + "epoch": 0.9394420080246337, + "grad_norm": 0.8918231566238168, + "learning_rate": 0.0002877569259847163, + "loss": 7.4207, + "step": 10068 + }, + { + "epoch": 0.939535317719511, + "grad_norm": 6.674649994206392, + "learning_rate": 0.0002877539328144207, + "loss": 7.4309, + "step": 10069 + }, + { + "epoch": 0.9396286274143884, + "grad_norm": 7.909384784235944, + "learning_rate": 0.00028775093929385773, + "loss": 7.4368, + "step": 10070 + }, + { + "epoch": 0.9397219371092657, + "grad_norm": 0.7515626963279111, + "learning_rate": 0.000287747945423035, + "loss": 7.697, + "step": 10071 + }, + { + "epoch": 0.939815246804143, + "grad_norm": 1.0681940554502516, + "learning_rate": 0.0002877449512019599, + "loss": 7.2757, + "step": 10072 + }, + { + "epoch": 0.9399085564990203, + "grad_norm": 0.8787496206561546, + "learning_rate": 0.00028774195663064026, + "loss": 7.6942, + "step": 10073 + }, + { + "epoch": 0.9400018661938976, + "grad_norm": 1.17359588076227, + "learning_rate": 0.00028773896170908363, + "loss": 7.7245, + "step": 10074 + }, + { + "epoch": 0.9400951758887749, + "grad_norm": 1.4616023891772134, + "learning_rate": 0.0002877359664372976, + "loss": 7.385, + "step": 10075 + }, + { + "epoch": 0.9401884855836521, + "grad_norm": 0.9500092811924357, + "learning_rate": 0.00028773297081528983, + "loss": 7.4029, + "step": 10076 + }, + { + "epoch": 0.9402817952785294, + "grad_norm": 0.9074160754540671, + "learning_rate": 0.00028772997484306795, + "loss": 7.6625, + "step": 10077 + }, + { + "epoch": 0.9403751049734067, + "grad_norm": 2.6949384519661006, + "learning_rate": 0.0002877269785206395, + "loss": 7.6229, + "step": 10078 + }, + { + "epoch": 0.940468414668284, + "grad_norm": 1.04540449712449, + "learning_rate": 0.0002877239818480121, + "loss": 7.2771, + "step": 10079 + }, + { + "epoch": 0.9405617243631613, + "grad_norm": 1.2854313357484881, + "learning_rate": 0.00028772098482519344, + "loss": 7.4055, + "step": 10080 + }, + { + "epoch": 0.9406550340580386, + "grad_norm": 0.7177354132618929, + "learning_rate": 0.00028771798745219114, + "loss": 7.3201, + "step": 10081 + }, + { + "epoch": 0.9407483437529159, + "grad_norm": 0.9363473903687634, + "learning_rate": 0.0002877149897290128, + "loss": 7.5228, + "step": 10082 + }, + { + "epoch": 0.9408416534477932, + "grad_norm": 2.028648779560421, + "learning_rate": 0.00028771199165566596, + "loss": 7.3293, + "step": 10083 + }, + { + "epoch": 0.9409349631426706, + "grad_norm": 2.357815879844162, + "learning_rate": 0.00028770899323215843, + "loss": 7.3575, + "step": 10084 + }, + { + "epoch": 0.9410282728375479, + "grad_norm": 3532.9259680847413, + "learning_rate": 0.00028770599445849767, + "loss": 7.1407, + "step": 10085 + }, + { + "epoch": 0.9411215825324252, + "grad_norm": 0.8634201255358851, + "learning_rate": 0.0002877029953346913, + "loss": 7.4498, + "step": 10086 + }, + { + "epoch": 0.9412148922273024, + "grad_norm": 0.6258253721899002, + "learning_rate": 0.00028769999586074704, + "loss": 7.2431, + "step": 10087 + }, + { + "epoch": 0.9413082019221797, + "grad_norm": 0.7392105419813959, + "learning_rate": 0.0002876969960366725, + "loss": 7.4008, + "step": 10088 + }, + { + "epoch": 0.941401511617057, + "grad_norm": 1.2760582929538904, + "learning_rate": 0.0002876939958624753, + "loss": 7.1748, + "step": 10089 + }, + { + "epoch": 0.9414948213119343, + "grad_norm": 838.5253405913712, + "learning_rate": 0.0002876909953381631, + "loss": 7.1998, + "step": 10090 + }, + { + "epoch": 0.9415881310068116, + "grad_norm": 1.0272854357577887, + "learning_rate": 0.00028768799446374335, + "loss": 7.2955, + "step": 10091 + }, + { + "epoch": 0.9416814407016889, + "grad_norm": 2.961438530194083, + "learning_rate": 0.0002876849932392239, + "loss": 7.5544, + "step": 10092 + }, + { + "epoch": 0.9417747503965662, + "grad_norm": 1.8909418933056448, + "learning_rate": 0.0002876819916646123, + "loss": 7.3142, + "step": 10093 + }, + { + "epoch": 0.9418680600914435, + "grad_norm": 1.7462389737223598, + "learning_rate": 0.00028767898973991614, + "loss": 7.5335, + "step": 10094 + }, + { + "epoch": 0.9419613697863208, + "grad_norm": 0.5678258459012289, + "learning_rate": 0.00028767598746514317, + "loss": 7.336, + "step": 10095 + }, + { + "epoch": 0.9420546794811981, + "grad_norm": 2.8256865545415444, + "learning_rate": 0.0002876729848403009, + "loss": 7.4517, + "step": 10096 + }, + { + "epoch": 0.9421479891760753, + "grad_norm": 1.4734049324277683, + "learning_rate": 0.000287669981865397, + "loss": 7.497, + "step": 10097 + }, + { + "epoch": 0.9422412988709526, + "grad_norm": 0.6346437224305492, + "learning_rate": 0.0002876669785404391, + "loss": 7.2429, + "step": 10098 + }, + { + "epoch": 0.94233460856583, + "grad_norm": 0.7904641527237791, + "learning_rate": 0.0002876639748654349, + "loss": 7.3169, + "step": 10099 + }, + { + "epoch": 0.9424279182607073, + "grad_norm": 0.7654348497418754, + "learning_rate": 0.00028766097084039193, + "loss": 7.3299, + "step": 10100 + }, + { + "epoch": 0.9425212279555846, + "grad_norm": 0.7887531417404214, + "learning_rate": 0.00028765796646531796, + "loss": 7.3111, + "step": 10101 + }, + { + "epoch": 0.9426145376504619, + "grad_norm": 3767.100701894125, + "learning_rate": 0.00028765496174022054, + "loss": 7.2292, + "step": 10102 + }, + { + "epoch": 0.9427078473453392, + "grad_norm": 1044.0644392120091, + "learning_rate": 0.0002876519566651073, + "loss": 7.3365, + "step": 10103 + }, + { + "epoch": 0.9428011570402165, + "grad_norm": 7.243678625726317, + "learning_rate": 0.0002876489512399859, + "loss": 7.5569, + "step": 10104 + }, + { + "epoch": 0.9428944667350938, + "grad_norm": 10334.79057662482, + "learning_rate": 0.00028764594546486406, + "loss": 7.6241, + "step": 10105 + }, + { + "epoch": 0.9429877764299711, + "grad_norm": 2.7366349155926284, + "learning_rate": 0.0002876429393397493, + "loss": 7.5169, + "step": 10106 + }, + { + "epoch": 0.9430810861248484, + "grad_norm": 5888.038179556157, + "learning_rate": 0.0002876399328646493, + "loss": 7.256, + "step": 10107 + }, + { + "epoch": 0.9431743958197256, + "grad_norm": 1.5048593261184058, + "learning_rate": 0.0002876369260395718, + "loss": 7.2675, + "step": 10108 + }, + { + "epoch": 0.9432677055146029, + "grad_norm": 2.983312987996008, + "learning_rate": 0.0002876339188645243, + "loss": 7.6546, + "step": 10109 + }, + { + "epoch": 0.9433610152094802, + "grad_norm": 1.350559654461555, + "learning_rate": 0.0002876309113395145, + "loss": 7.4202, + "step": 10110 + }, + { + "epoch": 0.9434543249043575, + "grad_norm": 1.7231653952799146, + "learning_rate": 0.0002876279034645502, + "loss": 7.4332, + "step": 10111 + }, + { + "epoch": 0.9435476345992349, + "grad_norm": 1.4297214468292203, + "learning_rate": 0.00028762489523963876, + "loss": 7.3091, + "step": 10112 + }, + { + "epoch": 0.9436409442941122, + "grad_norm": 0.8728545318248501, + "learning_rate": 0.000287621886664788, + "loss": 7.2844, + "step": 10113 + }, + { + "epoch": 0.9437342539889895, + "grad_norm": 1.2765215141965025, + "learning_rate": 0.0002876188777400056, + "loss": 7.3957, + "step": 10114 + }, + { + "epoch": 0.9438275636838668, + "grad_norm": 1.9210481240245745, + "learning_rate": 0.0002876158684652991, + "loss": 7.1176, + "step": 10115 + }, + { + "epoch": 0.9439208733787441, + "grad_norm": 0.5726627266471707, + "learning_rate": 0.0002876128588406763, + "loss": 7.3314, + "step": 10116 + }, + { + "epoch": 0.9440141830736214, + "grad_norm": 1083.744498782126, + "learning_rate": 0.0002876098488661447, + "loss": 7.8415, + "step": 10117 + }, + { + "epoch": 0.9441074927684987, + "grad_norm": 1.1525724323361295, + "learning_rate": 0.000287606838541712, + "loss": 7.3424, + "step": 10118 + }, + { + "epoch": 0.9442008024633759, + "grad_norm": 0.6305542336530576, + "learning_rate": 0.00028760382786738596, + "loss": 7.5703, + "step": 10119 + }, + { + "epoch": 0.9442941121582532, + "grad_norm": 2989.025635689098, + "learning_rate": 0.00028760081684317404, + "loss": 7.2462, + "step": 10120 + }, + { + "epoch": 0.9443874218531305, + "grad_norm": 5.057798512963654, + "learning_rate": 0.00028759780546908407, + "loss": 7.4674, + "step": 10121 + }, + { + "epoch": 0.9444807315480078, + "grad_norm": 1.041831694083805, + "learning_rate": 0.0002875947937451236, + "loss": 7.3227, + "step": 10122 + }, + { + "epoch": 0.9445740412428851, + "grad_norm": 1.0215394733630712, + "learning_rate": 0.00028759178167130035, + "loss": 7.3289, + "step": 10123 + }, + { + "epoch": 0.9446673509377624, + "grad_norm": 2.1995304420349635, + "learning_rate": 0.00028758876924762195, + "loss": 7.2439, + "step": 10124 + }, + { + "epoch": 0.9447606606326397, + "grad_norm": 1.6778634252940947, + "learning_rate": 0.00028758575647409607, + "loss": 7.8199, + "step": 10125 + }, + { + "epoch": 0.9448539703275171, + "grad_norm": 1.8770745315726594, + "learning_rate": 0.0002875827433507304, + "loss": 7.2797, + "step": 10126 + }, + { + "epoch": 0.9449472800223944, + "grad_norm": 4.197720240997659, + "learning_rate": 0.00028757972987753254, + "loss": 7.4077, + "step": 10127 + }, + { + "epoch": 0.9450405897172717, + "grad_norm": 24089.64549847606, + "learning_rate": 0.0002875767160545101, + "loss": 7.2164, + "step": 10128 + }, + { + "epoch": 0.9451338994121489, + "grad_norm": 8203.594059150519, + "learning_rate": 0.00028757370188167095, + "loss": 7.1326, + "step": 10129 + }, + { + "epoch": 0.9452272091070262, + "grad_norm": 1.6656339812018945, + "learning_rate": 0.0002875706873590226, + "loss": 7.7836, + "step": 10130 + }, + { + "epoch": 0.9453205188019035, + "grad_norm": 32.759548438650775, + "learning_rate": 0.0002875676724865727, + "loss": 7.5619, + "step": 10131 + }, + { + "epoch": 0.9454138284967808, + "grad_norm": 48445.350495886116, + "learning_rate": 0.000287564657264329, + "loss": 7.3941, + "step": 10132 + }, + { + "epoch": 0.9455071381916581, + "grad_norm": 0.9256505007960292, + "learning_rate": 0.0002875616416922991, + "loss": 7.1864, + "step": 10133 + }, + { + "epoch": 0.9456004478865354, + "grad_norm": 19282.8608970745, + "learning_rate": 0.0002875586257704907, + "loss": 6.9426, + "step": 10134 + }, + { + "epoch": 0.9456937575814127, + "grad_norm": 2.1788918799053105, + "learning_rate": 0.00028755560949891146, + "loss": 7.4851, + "step": 10135 + }, + { + "epoch": 0.94578706727629, + "grad_norm": 7290.391391125735, + "learning_rate": 0.0002875525928775691, + "loss": 7.3637, + "step": 10136 + }, + { + "epoch": 0.9458803769711673, + "grad_norm": 8.189822921016434, + "learning_rate": 0.0002875495759064712, + "loss": 7.339, + "step": 10137 + }, + { + "epoch": 0.9459736866660446, + "grad_norm": 0.7088923211926621, + "learning_rate": 0.00028754655858562547, + "loss": 7.2148, + "step": 10138 + }, + { + "epoch": 0.946066996360922, + "grad_norm": 1.9332825249351049, + "learning_rate": 0.00028754354091503956, + "loss": 7.1409, + "step": 10139 + }, + { + "epoch": 0.9461603060557991, + "grad_norm": 15121.243617371734, + "learning_rate": 0.0002875405228947212, + "loss": 7.514, + "step": 10140 + }, + { + "epoch": 0.9462536157506765, + "grad_norm": 32307.665217724334, + "learning_rate": 0.00028753750452467804, + "loss": 7.4224, + "step": 10141 + }, + { + "epoch": 0.9463469254455538, + "grad_norm": 1.6726782788342756, + "learning_rate": 0.0002875344858049177, + "loss": 7.5321, + "step": 10142 + }, + { + "epoch": 0.9464402351404311, + "grad_norm": 2.9683949073795555, + "learning_rate": 0.000287531466735448, + "loss": 7.7616, + "step": 10143 + }, + { + "epoch": 0.9465335448353084, + "grad_norm": 2.248822138829282, + "learning_rate": 0.0002875284473162764, + "loss": 7.5863, + "step": 10144 + }, + { + "epoch": 0.9466268545301857, + "grad_norm": 1.4929125505132919, + "learning_rate": 0.0002875254275474108, + "loss": 7.4806, + "step": 10145 + }, + { + "epoch": 0.946720164225063, + "grad_norm": 1.7958978840553226, + "learning_rate": 0.00028752240742885873, + "loss": 7.2809, + "step": 10146 + }, + { + "epoch": 0.9468134739199403, + "grad_norm": 100645.77838752273, + "learning_rate": 0.00028751938696062795, + "loss": 7.4941, + "step": 10147 + }, + { + "epoch": 0.9469067836148176, + "grad_norm": 1.2161727132537359, + "learning_rate": 0.0002875163661427261, + "loss": 7.3674, + "step": 10148 + }, + { + "epoch": 0.9470000933096949, + "grad_norm": 0.7944588304423817, + "learning_rate": 0.00028751334497516083, + "loss": 7.2995, + "step": 10149 + }, + { + "epoch": 0.9470934030045721, + "grad_norm": 67633.74154123836, + "learning_rate": 0.00028751032345793984, + "loss": 7.6429, + "step": 10150 + }, + { + "epoch": 0.9471867126994494, + "grad_norm": 7.287535907696301, + "learning_rate": 0.00028750730159107085, + "loss": 7.2379, + "step": 10151 + }, + { + "epoch": 0.9472800223943267, + "grad_norm": 67060.24247596045, + "learning_rate": 0.0002875042793745615, + "loss": 7.5332, + "step": 10152 + }, + { + "epoch": 0.947373332089204, + "grad_norm": 1.2298309236175407, + "learning_rate": 0.0002875012568084196, + "loss": 7.3394, + "step": 10153 + }, + { + "epoch": 0.9474666417840814, + "grad_norm": 150583.20001118534, + "learning_rate": 0.00028749823389265266, + "loss": 7.3381, + "step": 10154 + }, + { + "epoch": 0.9475599514789587, + "grad_norm": 5.667957479671331, + "learning_rate": 0.00028749521062726847, + "loss": 7.3354, + "step": 10155 + }, + { + "epoch": 0.947653261173836, + "grad_norm": 5.752506690751603, + "learning_rate": 0.0002874921870122746, + "loss": 7.3039, + "step": 10156 + }, + { + "epoch": 0.9477465708687133, + "grad_norm": 7.509223712746741, + "learning_rate": 0.00028748916304767897, + "loss": 7.491, + "step": 10157 + }, + { + "epoch": 0.9478398805635906, + "grad_norm": 2.7856463854434717, + "learning_rate": 0.00028748613873348904, + "loss": 7.4363, + "step": 10158 + }, + { + "epoch": 0.9479331902584679, + "grad_norm": 14.127534764711283, + "learning_rate": 0.0002874831140697126, + "loss": 7.3917, + "step": 10159 + }, + { + "epoch": 0.9480264999533452, + "grad_norm": 3.1707692376836647, + "learning_rate": 0.0002874800890563573, + "loss": 7.4204, + "step": 10160 + }, + { + "epoch": 0.9481198096482224, + "grad_norm": 8.963306460781302, + "learning_rate": 0.0002874770636934309, + "loss": 7.3319, + "step": 10161 + }, + { + "epoch": 0.9482131193430997, + "grad_norm": 2.5416599877007497, + "learning_rate": 0.00028747403798094106, + "loss": 7.2506, + "step": 10162 + }, + { + "epoch": 0.948306429037977, + "grad_norm": 3.864888239684287, + "learning_rate": 0.0002874710119188954, + "loss": 7.4592, + "step": 10163 + }, + { + "epoch": 0.9483997387328543, + "grad_norm": 1.7277574600477588, + "learning_rate": 0.00028746798550730176, + "loss": 7.5777, + "step": 10164 + }, + { + "epoch": 0.9484930484277316, + "grad_norm": 1.179951235204356, + "learning_rate": 0.00028746495874616766, + "loss": 7.8785, + "step": 10165 + }, + { + "epoch": 0.9485863581226089, + "grad_norm": 1.8519813334038566, + "learning_rate": 0.000287461931635501, + "loss": 7.3614, + "step": 10166 + }, + { + "epoch": 0.9486796678174862, + "grad_norm": 1.9849098034670907, + "learning_rate": 0.00028745890417530926, + "loss": 7.6126, + "step": 10167 + }, + { + "epoch": 0.9487729775123636, + "grad_norm": 2.4387174663463607, + "learning_rate": 0.0002874558763656003, + "loss": 7.2232, + "step": 10168 + }, + { + "epoch": 0.9488662872072409, + "grad_norm": 1.6392524254276903, + "learning_rate": 0.00028745284820638176, + "loss": 7.5001, + "step": 10169 + }, + { + "epoch": 0.9489595969021182, + "grad_norm": 2657427864742.845, + "learning_rate": 0.0002874498196976613, + "loss": 7.4839, + "step": 10170 + }, + { + "epoch": 0.9490529065969955, + "grad_norm": 7.951422654910905, + "learning_rate": 0.00028744679083944673, + "loss": 7.4919, + "step": 10171 + }, + { + "epoch": 0.9491462162918727, + "grad_norm": 5.778688646388296, + "learning_rate": 0.00028744376163174565, + "loss": 7.5773, + "step": 10172 + }, + { + "epoch": 0.94923952598675, + "grad_norm": 3.108591562132036, + "learning_rate": 0.0002874407320745658, + "loss": 7.7348, + "step": 10173 + }, + { + "epoch": 0.9493328356816273, + "grad_norm": 2.813934507885305, + "learning_rate": 0.00028743770216791494, + "loss": 7.1939, + "step": 10174 + }, + { + "epoch": 0.9494261453765046, + "grad_norm": 531.4567563796741, + "learning_rate": 0.00028743467191180065, + "loss": 7.3831, + "step": 10175 + }, + { + "epoch": 0.9495194550713819, + "grad_norm": 1.281412127211449, + "learning_rate": 0.00028743164130623066, + "loss": 7.3857, + "step": 10176 + }, + { + "epoch": 0.9496127647662592, + "grad_norm": 3.8361644727793816, + "learning_rate": 0.0002874286103512128, + "loss": 7.6357, + "step": 10177 + }, + { + "epoch": 0.9497060744611365, + "grad_norm": 11.613256391010927, + "learning_rate": 0.00028742557904675467, + "loss": 7.7007, + "step": 10178 + }, + { + "epoch": 0.9497993841560138, + "grad_norm": 14.610311054365548, + "learning_rate": 0.000287422547392864, + "loss": 7.566, + "step": 10179 + }, + { + "epoch": 0.9498926938508911, + "grad_norm": 13.961525482108218, + "learning_rate": 0.0002874195153895485, + "loss": 7.4539, + "step": 10180 + }, + { + "epoch": 0.9499860035457685, + "grad_norm": 9.452294028868266, + "learning_rate": 0.00028741648303681587, + "loss": 7.6854, + "step": 10181 + }, + { + "epoch": 0.9500793132406457, + "grad_norm": 3.4798745014895336, + "learning_rate": 0.00028741345033467383, + "loss": 7.2468, + "step": 10182 + }, + { + "epoch": 0.950172622935523, + "grad_norm": 8527299.014850298, + "learning_rate": 0.00028741041728313013, + "loss": 7.5618, + "step": 10183 + }, + { + "epoch": 0.9502659326304003, + "grad_norm": 7.442209456235592, + "learning_rate": 0.00028740738388219236, + "loss": 7.41, + "step": 10184 + }, + { + "epoch": 0.9503592423252776, + "grad_norm": 36049321.54856337, + "learning_rate": 0.0002874043501318684, + "loss": 7.6747, + "step": 10185 + }, + { + "epoch": 0.9504525520201549, + "grad_norm": 9.160913959625987, + "learning_rate": 0.00028740131603216583, + "loss": 6.918, + "step": 10186 + }, + { + "epoch": 0.9505458617150322, + "grad_norm": 182834945.4098047, + "learning_rate": 0.0002873982815830924, + "loss": 7.2709, + "step": 10187 + }, + { + "epoch": 0.9506391714099095, + "grad_norm": 61.44178674103805, + "learning_rate": 0.0002873952467846559, + "loss": 7.8191, + "step": 10188 + }, + { + "epoch": 0.9507324811047868, + "grad_norm": 5.81929459780717, + "learning_rate": 0.00028739221163686393, + "loss": 7.4006, + "step": 10189 + }, + { + "epoch": 0.9508257907996641, + "grad_norm": 5.285634023468675, + "learning_rate": 0.0002873891761397243, + "loss": 7.3674, + "step": 10190 + }, + { + "epoch": 0.9509191004945414, + "grad_norm": 14.44901339186364, + "learning_rate": 0.0002873861402932447, + "loss": 7.0532, + "step": 10191 + }, + { + "epoch": 0.9510124101894187, + "grad_norm": 10.179442005983418, + "learning_rate": 0.0002873831040974328, + "loss": 7.9145, + "step": 10192 + }, + { + "epoch": 0.9511057198842959, + "grad_norm": 15.651533933444632, + "learning_rate": 0.0002873800675522964, + "loss": 7.1086, + "step": 10193 + }, + { + "epoch": 0.9511990295791732, + "grad_norm": 3.253046024005159, + "learning_rate": 0.00028737703065784314, + "loss": 7.5361, + "step": 10194 + }, + { + "epoch": 0.9512923392740505, + "grad_norm": 2.6024999726321374, + "learning_rate": 0.00028737399341408087, + "loss": 7.4206, + "step": 10195 + }, + { + "epoch": 0.9513856489689279, + "grad_norm": 15.507091067096441, + "learning_rate": 0.0002873709558210171, + "loss": 7.212, + "step": 10196 + }, + { + "epoch": 0.9514789586638052, + "grad_norm": 9.221318956923051, + "learning_rate": 0.0002873679178786598, + "loss": 7.3598, + "step": 10197 + }, + { + "epoch": 0.9515722683586825, + "grad_norm": 12.167221617835207, + "learning_rate": 0.0002873648795870165, + "loss": 7.4723, + "step": 10198 + }, + { + "epoch": 0.9516655780535598, + "grad_norm": 56.730706928993094, + "learning_rate": 0.00028736184094609507, + "loss": 7.7081, + "step": 10199 + }, + { + "epoch": 0.9517588877484371, + "grad_norm": 1.2625376098786971, + "learning_rate": 0.0002873588019559031, + "loss": 7.5901, + "step": 10200 + }, + { + "epoch": 0.9518521974433144, + "grad_norm": 5.777678265262243, + "learning_rate": 0.00028735576261644844, + "loss": 7.2459, + "step": 10201 + }, + { + "epoch": 0.9519455071381917, + "grad_norm": 4.381704629230616, + "learning_rate": 0.0002873527229277387, + "loss": 7.7875, + "step": 10202 + }, + { + "epoch": 0.9520388168330689, + "grad_norm": 100.14148388557916, + "learning_rate": 0.0002873496828897817, + "loss": 7.286, + "step": 10203 + }, + { + "epoch": 0.9521321265279462, + "grad_norm": 6.928943705554423, + "learning_rate": 0.0002873466425025852, + "loss": 7.4851, + "step": 10204 + }, + { + "epoch": 0.9522254362228235, + "grad_norm": 3.96965461976183, + "learning_rate": 0.00028734360176615683, + "loss": 7.3689, + "step": 10205 + }, + { + "epoch": 0.9523187459177008, + "grad_norm": 97.4123081013418, + "learning_rate": 0.0002873405606805043, + "loss": 7.6813, + "step": 10206 + }, + { + "epoch": 0.9524120556125781, + "grad_norm": 1316739592.719593, + "learning_rate": 0.0002873375192456355, + "loss": 7.3086, + "step": 10207 + }, + { + "epoch": 0.9525053653074554, + "grad_norm": 17.457251283686187, + "learning_rate": 0.00028733447746155807, + "loss": 7.6837, + "step": 10208 + }, + { + "epoch": 0.9525986750023328, + "grad_norm": 9.752279560842945, + "learning_rate": 0.0002873314353282797, + "loss": 7.476, + "step": 10209 + }, + { + "epoch": 0.9526919846972101, + "grad_norm": 3.6780241395744158, + "learning_rate": 0.0002873283928458082, + "loss": 7.2379, + "step": 10210 + }, + { + "epoch": 0.9527852943920874, + "grad_norm": 7.720328575486066, + "learning_rate": 0.0002873253500141513, + "loss": 7.6725, + "step": 10211 + }, + { + "epoch": 0.9528786040869647, + "grad_norm": 47.50427645892767, + "learning_rate": 0.00028732230683331663, + "loss": 7.4195, + "step": 10212 + }, + { + "epoch": 0.952971913781842, + "grad_norm": 72.1872582003301, + "learning_rate": 0.00028731926330331206, + "loss": 7.7642, + "step": 10213 + }, + { + "epoch": 0.9530652234767192, + "grad_norm": 7.384524297035559, + "learning_rate": 0.0002873162194241453, + "loss": 7.8109, + "step": 10214 + }, + { + "epoch": 0.9531585331715965, + "grad_norm": 393108630.8051758, + "learning_rate": 0.000287313175195824, + "loss": 7.2316, + "step": 10215 + }, + { + "epoch": 0.9532518428664738, + "grad_norm": 61.33771399328055, + "learning_rate": 0.000287310130618356, + "loss": 7.4501, + "step": 10216 + }, + { + "epoch": 0.9533451525613511, + "grad_norm": 199.16988123893185, + "learning_rate": 0.00028730708569174904, + "loss": 7.6863, + "step": 10217 + }, + { + "epoch": 0.9534384622562284, + "grad_norm": 16.981397295107932, + "learning_rate": 0.00028730404041601087, + "loss": 7.5869, + "step": 10218 + }, + { + "epoch": 0.9535317719511057, + "grad_norm": 0.8481628960281232, + "learning_rate": 0.00028730099479114913, + "loss": 7.5265, + "step": 10219 + }, + { + "epoch": 0.953625081645983, + "grad_norm": 98.00146003076593, + "learning_rate": 0.00028729794881717164, + "loss": 7.4252, + "step": 10220 + }, + { + "epoch": 0.9537183913408603, + "grad_norm": 40.606377520274364, + "learning_rate": 0.0002872949024940862, + "loss": 7.589, + "step": 10221 + }, + { + "epoch": 0.9538117010357376, + "grad_norm": 19.73202626238522, + "learning_rate": 0.0002872918558219004, + "loss": 7.3589, + "step": 10222 + }, + { + "epoch": 0.953905010730615, + "grad_norm": 1.2940620161071308, + "learning_rate": 0.00028728880880062214, + "loss": 7.5426, + "step": 10223 + }, + { + "epoch": 0.9539983204254923, + "grad_norm": 88.2336752006098, + "learning_rate": 0.0002872857614302591, + "loss": 7.5566, + "step": 10224 + }, + { + "epoch": 0.9540916301203695, + "grad_norm": 3.4075319449792874, + "learning_rate": 0.00028728271371081905, + "loss": 7.4597, + "step": 10225 + }, + { + "epoch": 0.9541849398152468, + "grad_norm": 3.6643857341300037, + "learning_rate": 0.00028727966564230965, + "loss": 7.4712, + "step": 10226 + }, + { + "epoch": 0.9542782495101241, + "grad_norm": 2.783726049566188, + "learning_rate": 0.0002872766172247388, + "loss": 7.6932, + "step": 10227 + }, + { + "epoch": 0.9543715592050014, + "grad_norm": 7.595550857864356, + "learning_rate": 0.00028727356845811413, + "loss": 7.4599, + "step": 10228 + }, + { + "epoch": 0.9544648688998787, + "grad_norm": 4.539701633957516, + "learning_rate": 0.0002872705193424435, + "loss": 7.3495, + "step": 10229 + }, + { + "epoch": 0.954558178594756, + "grad_norm": 135.3321540560307, + "learning_rate": 0.00028726746987773455, + "loss": 7.4822, + "step": 10230 + }, + { + "epoch": 0.9546514882896333, + "grad_norm": 6.265953051315508, + "learning_rate": 0.0002872644200639951, + "loss": 7.1664, + "step": 10231 + }, + { + "epoch": 0.9547447979845106, + "grad_norm": 82.18847273259153, + "learning_rate": 0.0002872613699012329, + "loss": 7.827, + "step": 10232 + }, + { + "epoch": 0.9548381076793879, + "grad_norm": 5.650771533011011, + "learning_rate": 0.00028725831938945567, + "loss": 7.1358, + "step": 10233 + }, + { + "epoch": 0.9549314173742652, + "grad_norm": 5.4822646361185265, + "learning_rate": 0.00028725526852867124, + "loss": 7.5572, + "step": 10234 + }, + { + "epoch": 0.9550247270691424, + "grad_norm": 182.65653291372251, + "learning_rate": 0.00028725221731888727, + "loss": 7.4747, + "step": 10235 + }, + { + "epoch": 0.9551180367640197, + "grad_norm": 10.00899448259888, + "learning_rate": 0.0002872491657601116, + "loss": 7.269, + "step": 10236 + }, + { + "epoch": 0.955211346458897, + "grad_norm": 2.2567999926836335, + "learning_rate": 0.0002872461138523519, + "loss": 7.3344, + "step": 10237 + }, + { + "epoch": 0.9553046561537744, + "grad_norm": 5.89243218828461, + "learning_rate": 0.0002872430615956161, + "loss": 7.37, + "step": 10238 + }, + { + "epoch": 0.9553979658486517, + "grad_norm": 23938936396.61298, + "learning_rate": 0.00028724000898991176, + "loss": 7.6179, + "step": 10239 + }, + { + "epoch": 0.955491275543529, + "grad_norm": 43.92675516225117, + "learning_rate": 0.00028723695603524675, + "loss": 7.3227, + "step": 10240 + }, + { + "epoch": 0.9555845852384063, + "grad_norm": 1.1225104038023692, + "learning_rate": 0.0002872339027316288, + "loss": 7.1533, + "step": 10241 + }, + { + "epoch": 0.9556778949332836, + "grad_norm": 16.20890626168924, + "learning_rate": 0.0002872308490790657, + "loss": 7.4921, + "step": 10242 + }, + { + "epoch": 0.9557712046281609, + "grad_norm": 3.495434008039735, + "learning_rate": 0.00028722779507756523, + "loss": 7.5555, + "step": 10243 + }, + { + "epoch": 0.9558645143230382, + "grad_norm": 7.67331843137609, + "learning_rate": 0.0002872247407271351, + "loss": 7.5693, + "step": 10244 + }, + { + "epoch": 0.9559578240179155, + "grad_norm": 116933351761.40382, + "learning_rate": 0.00028722168602778313, + "loss": 7.3545, + "step": 10245 + }, + { + "epoch": 0.9560511337127927, + "grad_norm": 7.086294814730442, + "learning_rate": 0.000287218630979517, + "loss": 7.6723, + "step": 10246 + }, + { + "epoch": 0.95614444340767, + "grad_norm": 2.646176670341118, + "learning_rate": 0.0002872155755823446, + "loss": 7.6211, + "step": 10247 + }, + { + "epoch": 0.9562377531025473, + "grad_norm": 3.0435792644630384, + "learning_rate": 0.00028721251983627354, + "loss": 7.6945, + "step": 10248 + }, + { + "epoch": 0.9563310627974246, + "grad_norm": 4.0127968426167735, + "learning_rate": 0.0002872094637413118, + "loss": 7.6186, + "step": 10249 + }, + { + "epoch": 0.9564243724923019, + "grad_norm": 2.0781332986686794, + "learning_rate": 0.000287206407297467, + "loss": 7.6682, + "step": 10250 + }, + { + "epoch": 0.9565176821871793, + "grad_norm": 442.5248010141624, + "learning_rate": 0.00028720335050474695, + "loss": 7.8341, + "step": 10251 + }, + { + "epoch": 0.9566109918820566, + "grad_norm": 1.3360847845186106, + "learning_rate": 0.00028720029336315944, + "loss": 7.7954, + "step": 10252 + }, + { + "epoch": 0.9567043015769339, + "grad_norm": 1.2353601328917478, + "learning_rate": 0.0002871972358727122, + "loss": 7.5367, + "step": 10253 + }, + { + "epoch": 0.9567976112718112, + "grad_norm": 1.0958935716916178, + "learning_rate": 0.00028719417803341305, + "loss": 7.5449, + "step": 10254 + }, + { + "epoch": 0.9568909209666885, + "grad_norm": 1.5070993503488175, + "learning_rate": 0.0002871911198452697, + "loss": 7.611, + "step": 10255 + }, + { + "epoch": 0.9569842306615657, + "grad_norm": 0.8455311855270639, + "learning_rate": 0.00028718806130829, + "loss": 7.7776, + "step": 10256 + }, + { + "epoch": 0.957077540356443, + "grad_norm": 1.011115992488301, + "learning_rate": 0.0002871850024224817, + "loss": 7.833, + "step": 10257 + }, + { + "epoch": 0.9571708500513203, + "grad_norm": 0.5330267298926921, + "learning_rate": 0.0002871819431878526, + "loss": 7.4028, + "step": 10258 + }, + { + "epoch": 0.9572641597461976, + "grad_norm": 0.4719225195218522, + "learning_rate": 0.0002871788836044104, + "loss": 7.6308, + "step": 10259 + }, + { + "epoch": 0.9573574694410749, + "grad_norm": 0.525501400040564, + "learning_rate": 0.00028717582367216295, + "loss": 7.3943, + "step": 10260 + }, + { + "epoch": 0.9574507791359522, + "grad_norm": 1.1966514151436778, + "learning_rate": 0.00028717276339111804, + "loss": 7.9479, + "step": 10261 + }, + { + "epoch": 0.9575440888308295, + "grad_norm": 0.8782740472827727, + "learning_rate": 0.0002871697027612834, + "loss": 7.6039, + "step": 10262 + }, + { + "epoch": 0.9576373985257068, + "grad_norm": 0.7698452423903793, + "learning_rate": 0.0002871666417826669, + "loss": 7.4323, + "step": 10263 + }, + { + "epoch": 0.9577307082205841, + "grad_norm": 0.7997763324093355, + "learning_rate": 0.0002871635804552762, + "loss": 7.4849, + "step": 10264 + }, + { + "epoch": 0.9578240179154615, + "grad_norm": 0.5320669558180583, + "learning_rate": 0.00028716051877911917, + "loss": 7.5095, + "step": 10265 + }, + { + "epoch": 0.9579173276103388, + "grad_norm": 0.730067573822836, + "learning_rate": 0.00028715745675420354, + "loss": 7.3539, + "step": 10266 + }, + { + "epoch": 0.958010637305216, + "grad_norm": 0.542689227287125, + "learning_rate": 0.0002871543943805372, + "loss": 7.5238, + "step": 10267 + }, + { + "epoch": 0.9581039470000933, + "grad_norm": 0.7952294917953106, + "learning_rate": 0.00028715133165812775, + "loss": 7.7419, + "step": 10268 + }, + { + "epoch": 0.9581972566949706, + "grad_norm": 0.81720058492486, + "learning_rate": 0.00028714826858698315, + "loss": 7.4979, + "step": 10269 + }, + { + "epoch": 0.9582905663898479, + "grad_norm": 0.5554668468339148, + "learning_rate": 0.00028714520516711115, + "loss": 7.5848, + "step": 10270 + }, + { + "epoch": 0.9583838760847252, + "grad_norm": 0.47684799478256845, + "learning_rate": 0.0002871421413985195, + "loss": 7.4616, + "step": 10271 + }, + { + "epoch": 0.9584771857796025, + "grad_norm": 0.8504723262964151, + "learning_rate": 0.000287139077281216, + "loss": 7.7545, + "step": 10272 + }, + { + "epoch": 0.9585704954744798, + "grad_norm": 1.2174095275924794, + "learning_rate": 0.0002871360128152085, + "loss": 7.6507, + "step": 10273 + }, + { + "epoch": 0.9586638051693571, + "grad_norm": 0.7440886037260148, + "learning_rate": 0.0002871329480005047, + "loss": 7.4683, + "step": 10274 + }, + { + "epoch": 0.9587571148642344, + "grad_norm": 1.0739148226349726, + "learning_rate": 0.0002871298828371124, + "loss": 7.1933, + "step": 10275 + }, + { + "epoch": 0.9588504245591117, + "grad_norm": 0.5548269112712656, + "learning_rate": 0.0002871268173250395, + "loss": 7.3997, + "step": 10276 + }, + { + "epoch": 0.958943734253989, + "grad_norm": 0.5123955939929554, + "learning_rate": 0.00028712375146429365, + "loss": 7.4271, + "step": 10277 + }, + { + "epoch": 0.9590370439488662, + "grad_norm": 0.7582971434920355, + "learning_rate": 0.0002871206852548828, + "loss": 7.774, + "step": 10278 + }, + { + "epoch": 0.9591303536437435, + "grad_norm": 1.0045913157953, + "learning_rate": 0.00028711761869681463, + "loss": 7.6058, + "step": 10279 + }, + { + "epoch": 0.9592236633386209, + "grad_norm": 0.6650918416406796, + "learning_rate": 0.00028711455179009695, + "loss": 7.4441, + "step": 10280 + }, + { + "epoch": 0.9593169730334982, + "grad_norm": 0.5557058969923908, + "learning_rate": 0.0002871114845347376, + "loss": 7.6162, + "step": 10281 + }, + { + "epoch": 0.9594102827283755, + "grad_norm": 0.7526173549619264, + "learning_rate": 0.0002871084169307444, + "loss": 7.1998, + "step": 10282 + }, + { + "epoch": 0.9595035924232528, + "grad_norm": 0.7499809964989443, + "learning_rate": 0.00028710534897812505, + "loss": 7.4515, + "step": 10283 + }, + { + "epoch": 0.9595969021181301, + "grad_norm": 0.8295153545451164, + "learning_rate": 0.0002871022806768874, + "loss": 7.533, + "step": 10284 + }, + { + "epoch": 0.9596902118130074, + "grad_norm": 0.9333753983776804, + "learning_rate": 0.00028709921202703934, + "loss": 7.5372, + "step": 10285 + }, + { + "epoch": 0.9597835215078847, + "grad_norm": 0.7480151853151962, + "learning_rate": 0.0002870961430285885, + "loss": 7.6287, + "step": 10286 + }, + { + "epoch": 0.959876831202762, + "grad_norm": 1.4189787455639962, + "learning_rate": 0.00028709307368154284, + "loss": 7.7647, + "step": 10287 + }, + { + "epoch": 0.9599701408976392, + "grad_norm": 1.2261696906837443, + "learning_rate": 0.0002870900039859101, + "loss": 7.5661, + "step": 10288 + }, + { + "epoch": 0.9600634505925165, + "grad_norm": 0.6801673101982285, + "learning_rate": 0.0002870869339416981, + "loss": 7.457, + "step": 10289 + }, + { + "epoch": 0.9601567602873938, + "grad_norm": 1.056062309544597, + "learning_rate": 0.0002870838635489146, + "loss": 7.6923, + "step": 10290 + }, + { + "epoch": 0.9602500699822711, + "grad_norm": 0.8523225395113947, + "learning_rate": 0.0002870807928075675, + "loss": 7.1782, + "step": 10291 + }, + { + "epoch": 0.9603433796771484, + "grad_norm": 1.0545122005399024, + "learning_rate": 0.0002870777217176645, + "loss": 7.1857, + "step": 10292 + }, + { + "epoch": 0.9604366893720258, + "grad_norm": 8.537979271341063, + "learning_rate": 0.00028707465027921346, + "loss": 7.8405, + "step": 10293 + }, + { + "epoch": 0.9605299990669031, + "grad_norm": 0.5186280308047372, + "learning_rate": 0.00028707157849222213, + "loss": 7.4464, + "step": 10294 + }, + { + "epoch": 0.9606233087617804, + "grad_norm": 0.4766352046950889, + "learning_rate": 0.0002870685063566985, + "loss": 7.5358, + "step": 10295 + }, + { + "epoch": 0.9607166184566577, + "grad_norm": 0.5934582831648474, + "learning_rate": 0.0002870654338726502, + "loss": 7.5037, + "step": 10296 + }, + { + "epoch": 0.960809928151535, + "grad_norm": 0.6734391898574505, + "learning_rate": 0.00028706236104008513, + "loss": 7.4073, + "step": 10297 + }, + { + "epoch": 0.9609032378464123, + "grad_norm": 0.5441691182699258, + "learning_rate": 0.00028705928785901103, + "loss": 7.3769, + "step": 10298 + }, + { + "epoch": 0.9609965475412895, + "grad_norm": 0.4217506179835953, + "learning_rate": 0.00028705621432943576, + "loss": 7.408, + "step": 10299 + }, + { + "epoch": 0.9610898572361668, + "grad_norm": 0.4758732381716523, + "learning_rate": 0.0002870531404513672, + "loss": 7.2336, + "step": 10300 + }, + { + "epoch": 0.9611831669310441, + "grad_norm": 0.4856105212038111, + "learning_rate": 0.000287050066224813, + "loss": 7.4879, + "step": 10301 + }, + { + "epoch": 0.9612764766259214, + "grad_norm": 0.44441940646810996, + "learning_rate": 0.0002870469916497812, + "loss": 7.3204, + "step": 10302 + }, + { + "epoch": 0.9613697863207987, + "grad_norm": 0.5877985369801487, + "learning_rate": 0.0002870439167262794, + "loss": 7.6236, + "step": 10303 + }, + { + "epoch": 0.961463096015676, + "grad_norm": 1.2193269955617507, + "learning_rate": 0.0002870408414543156, + "loss": 7.0521, + "step": 10304 + }, + { + "epoch": 0.9615564057105533, + "grad_norm": 0.6019796997057911, + "learning_rate": 0.00028703776583389746, + "loss": 7.3554, + "step": 10305 + }, + { + "epoch": 0.9616497154054306, + "grad_norm": 0.9616180947507584, + "learning_rate": 0.0002870346898650329, + "loss": 7.7115, + "step": 10306 + }, + { + "epoch": 0.961743025100308, + "grad_norm": 0.5013092853661918, + "learning_rate": 0.00028703161354772975, + "loss": 7.3961, + "step": 10307 + }, + { + "epoch": 0.9618363347951853, + "grad_norm": 0.7690676093894582, + "learning_rate": 0.0002870285368819958, + "loss": 7.0821, + "step": 10308 + }, + { + "epoch": 0.9619296444900625, + "grad_norm": 0.9319781185578279, + "learning_rate": 0.0002870254598678388, + "loss": 7.2583, + "step": 10309 + }, + { + "epoch": 0.9620229541849398, + "grad_norm": 0.7011718291051101, + "learning_rate": 0.00028702238250526667, + "loss": 7.4635, + "step": 10310 + }, + { + "epoch": 0.9621162638798171, + "grad_norm": 1.367069835455137, + "learning_rate": 0.00028701930479428726, + "loss": 7.7561, + "step": 10311 + }, + { + "epoch": 0.9622095735746944, + "grad_norm": 0.8871230806903878, + "learning_rate": 0.0002870162267349083, + "loss": 7.5966, + "step": 10312 + }, + { + "epoch": 0.9623028832695717, + "grad_norm": 0.42830198734934893, + "learning_rate": 0.00028701314832713765, + "loss": 7.5531, + "step": 10313 + }, + { + "epoch": 0.962396192964449, + "grad_norm": 1.0774947215401371, + "learning_rate": 0.00028701006957098323, + "loss": 7.262, + "step": 10314 + }, + { + "epoch": 0.9624895026593263, + "grad_norm": 0.6906866670509584, + "learning_rate": 0.0002870069904664527, + "loss": 7.5565, + "step": 10315 + }, + { + "epoch": 0.9625828123542036, + "grad_norm": 0.6335177233471367, + "learning_rate": 0.000287003911013554, + "loss": 7.7109, + "step": 10316 + }, + { + "epoch": 0.9626761220490809, + "grad_norm": 0.4870152647989244, + "learning_rate": 0.000287000831212295, + "loss": 7.5882, + "step": 10317 + }, + { + "epoch": 0.9627694317439582, + "grad_norm": 0.4664769020482573, + "learning_rate": 0.0002869977510626834, + "loss": 7.5554, + "step": 10318 + }, + { + "epoch": 0.9628627414388355, + "grad_norm": 0.4677532256587781, + "learning_rate": 0.0002869946705647271, + "loss": 7.4653, + "step": 10319 + }, + { + "epoch": 0.9629560511337127, + "grad_norm": 0.44374641896096445, + "learning_rate": 0.000286991589718434, + "loss": 7.3709, + "step": 10320 + }, + { + "epoch": 0.96304936082859, + "grad_norm": 0.3764327606775496, + "learning_rate": 0.0002869885085238118, + "loss": 7.284, + "step": 10321 + }, + { + "epoch": 0.9631426705234674, + "grad_norm": 0.7391048398257208, + "learning_rate": 0.00028698542698086843, + "loss": 7.3933, + "step": 10322 + }, + { + "epoch": 0.9632359802183447, + "grad_norm": 0.5755852584951456, + "learning_rate": 0.0002869823450896117, + "loss": 6.9264, + "step": 10323 + }, + { + "epoch": 0.963329289913222, + "grad_norm": 0.8817215313893678, + "learning_rate": 0.0002869792628500494, + "loss": 7.1707, + "step": 10324 + }, + { + "epoch": 0.9634225996080993, + "grad_norm": 1.4188696407014825, + "learning_rate": 0.00028697618026218944, + "loss": 7.6733, + "step": 10325 + }, + { + "epoch": 0.9635159093029766, + "grad_norm": 0.5672866123160166, + "learning_rate": 0.00028697309732603963, + "loss": 7.2091, + "step": 10326 + }, + { + "epoch": 0.9636092189978539, + "grad_norm": 0.4600700582853362, + "learning_rate": 0.00028697001404160783, + "loss": 7.4274, + "step": 10327 + }, + { + "epoch": 0.9637025286927312, + "grad_norm": 0.45566151204582966, + "learning_rate": 0.0002869669304089018, + "loss": 7.5701, + "step": 10328 + }, + { + "epoch": 0.9637958383876085, + "grad_norm": 0.8956726311631199, + "learning_rate": 0.00028696384642792953, + "loss": 7.2847, + "step": 10329 + }, + { + "epoch": 0.9638891480824858, + "grad_norm": 0.9890737890000033, + "learning_rate": 0.0002869607620986987, + "loss": 7.2713, + "step": 10330 + }, + { + "epoch": 0.963982457777363, + "grad_norm": 0.5224595632340253, + "learning_rate": 0.0002869576774212172, + "loss": 7.4565, + "step": 10331 + }, + { + "epoch": 0.9640757674722403, + "grad_norm": 0.5635529404579604, + "learning_rate": 0.000286954592395493, + "loss": 7.5519, + "step": 10332 + }, + { + "epoch": 0.9641690771671176, + "grad_norm": 0.6934946242187435, + "learning_rate": 0.0002869515070215338, + "loss": 7.6451, + "step": 10333 + }, + { + "epoch": 0.964262386861995, + "grad_norm": 0.34499718664757295, + "learning_rate": 0.0002869484212993474, + "loss": 7.2052, + "step": 10334 + }, + { + "epoch": 0.9643556965568723, + "grad_norm": 0.3593179134484899, + "learning_rate": 0.0002869453352289418, + "loss": 7.2075, + "step": 10335 + }, + { + "epoch": 0.9644490062517496, + "grad_norm": 0.7463861038656011, + "learning_rate": 0.00028694224881032477, + "loss": 7.5496, + "step": 10336 + }, + { + "epoch": 0.9645423159466269, + "grad_norm": 0.48518469811666526, + "learning_rate": 0.0002869391620435042, + "loss": 7.2814, + "step": 10337 + }, + { + "epoch": 0.9646356256415042, + "grad_norm": 0.6303869253483935, + "learning_rate": 0.00028693607492848784, + "loss": 7.2132, + "step": 10338 + }, + { + "epoch": 0.9647289353363815, + "grad_norm": 0.46546540610477893, + "learning_rate": 0.0002869329874652837, + "loss": 7.3755, + "step": 10339 + }, + { + "epoch": 0.9648222450312588, + "grad_norm": 0.48628135470362, + "learning_rate": 0.00028692989965389945, + "loss": 7.539, + "step": 10340 + }, + { + "epoch": 0.964915554726136, + "grad_norm": 0.44538759614689377, + "learning_rate": 0.00028692681149434303, + "loss": 7.4891, + "step": 10341 + }, + { + "epoch": 0.9650088644210133, + "grad_norm": 0.41009801972548976, + "learning_rate": 0.00028692372298662236, + "loss": 7.2846, + "step": 10342 + }, + { + "epoch": 0.9651021741158906, + "grad_norm": 0.5823519954430384, + "learning_rate": 0.0002869206341307451, + "loss": 7.3277, + "step": 10343 + }, + { + "epoch": 0.9651954838107679, + "grad_norm": 0.6998926059747822, + "learning_rate": 0.00028691754492671934, + "loss": 7.5903, + "step": 10344 + }, + { + "epoch": 0.9652887935056452, + "grad_norm": 0.3839766138432854, + "learning_rate": 0.0002869144553745528, + "loss": 7.1582, + "step": 10345 + }, + { + "epoch": 0.9653821032005225, + "grad_norm": 0.5375542678805132, + "learning_rate": 0.00028691136547425334, + "loss": 7.7557, + "step": 10346 + }, + { + "epoch": 0.9654754128953998, + "grad_norm": 0.6348495936022913, + "learning_rate": 0.0002869082752258288, + "loss": 7.6478, + "step": 10347 + }, + { + "epoch": 0.9655687225902772, + "grad_norm": 0.5907128410269127, + "learning_rate": 0.0002869051846292871, + "loss": 7.5304, + "step": 10348 + }, + { + "epoch": 0.9656620322851545, + "grad_norm": 0.51117682698603, + "learning_rate": 0.00028690209368463607, + "loss": 7.4934, + "step": 10349 + }, + { + "epoch": 0.9657553419800318, + "grad_norm": 0.8464216812688194, + "learning_rate": 0.0002868990023918836, + "loss": 7.4082, + "step": 10350 + }, + { + "epoch": 0.9658486516749091, + "grad_norm": 0.48098223116112543, + "learning_rate": 0.00028689591075103753, + "loss": 7.4896, + "step": 10351 + }, + { + "epoch": 0.9659419613697863, + "grad_norm": 0.4165047277551135, + "learning_rate": 0.0002868928187621056, + "loss": 7.4476, + "step": 10352 + }, + { + "epoch": 0.9660352710646636, + "grad_norm": 0.38188629779858807, + "learning_rate": 0.0002868897264250959, + "loss": 7.4244, + "step": 10353 + }, + { + "epoch": 0.9661285807595409, + "grad_norm": 0.4751986308777311, + "learning_rate": 0.0002868866337400161, + "loss": 7.191, + "step": 10354 + }, + { + "epoch": 0.9662218904544182, + "grad_norm": 1.5620826269304078, + "learning_rate": 0.00028688354070687417, + "loss": 7.928, + "step": 10355 + }, + { + "epoch": 0.9663152001492955, + "grad_norm": 0.6312561977181675, + "learning_rate": 0.000286880447325678, + "loss": 7.3905, + "step": 10356 + }, + { + "epoch": 0.9664085098441728, + "grad_norm": 0.38862800646425333, + "learning_rate": 0.0002868773535964353, + "loss": 7.3383, + "step": 10357 + }, + { + "epoch": 0.9665018195390501, + "grad_norm": 0.5946896005581944, + "learning_rate": 0.00028687425951915406, + "loss": 7.7789, + "step": 10358 + }, + { + "epoch": 0.9665951292339274, + "grad_norm": 0.6479539855455961, + "learning_rate": 0.00028687116509384215, + "loss": 7.5029, + "step": 10359 + }, + { + "epoch": 0.9666884389288047, + "grad_norm": 1.3454200551436937, + "learning_rate": 0.0002868680703205074, + "loss": 7.2488, + "step": 10360 + }, + { + "epoch": 0.966781748623682, + "grad_norm": 0.46445809821106815, + "learning_rate": 0.0002868649751991577, + "loss": 7.7216, + "step": 10361 + }, + { + "epoch": 0.9668750583185592, + "grad_norm": 0.6670715111052767, + "learning_rate": 0.0002868618797298009, + "loss": 7.565, + "step": 10362 + }, + { + "epoch": 0.9669683680134366, + "grad_norm": 0.4829222078182261, + "learning_rate": 0.0002868587839124449, + "loss": 7.34, + "step": 10363 + }, + { + "epoch": 0.9670616777083139, + "grad_norm": 0.46473350832383337, + "learning_rate": 0.00028685568774709753, + "loss": 7.4423, + "step": 10364 + }, + { + "epoch": 0.9671549874031912, + "grad_norm": 0.47760516000986586, + "learning_rate": 0.00028685259123376674, + "loss": 7.4816, + "step": 10365 + }, + { + "epoch": 0.9672482970980685, + "grad_norm": 0.5050205755659098, + "learning_rate": 0.0002868494943724603, + "loss": 7.1708, + "step": 10366 + }, + { + "epoch": 0.9673416067929458, + "grad_norm": 0.7099108667355992, + "learning_rate": 0.0002868463971631861, + "loss": 7.7977, + "step": 10367 + }, + { + "epoch": 0.9674349164878231, + "grad_norm": 3.4767925535655184, + "learning_rate": 0.0002868432996059521, + "loss": 7.4752, + "step": 10368 + }, + { + "epoch": 0.9675282261827004, + "grad_norm": 0.6066404730623473, + "learning_rate": 0.0002868402017007661, + "loss": 7.3857, + "step": 10369 + }, + { + "epoch": 0.9676215358775777, + "grad_norm": 0.3386393055143861, + "learning_rate": 0.00028683710344763606, + "loss": 7.3885, + "step": 10370 + }, + { + "epoch": 0.967714845572455, + "grad_norm": 0.8198795550593996, + "learning_rate": 0.00028683400484656973, + "loss": 7.7962, + "step": 10371 + }, + { + "epoch": 0.9678081552673323, + "grad_norm": 3.2528617030112863, + "learning_rate": 0.00028683090589757507, + "loss": 7.5178, + "step": 10372 + }, + { + "epoch": 0.9679014649622095, + "grad_norm": 0.3468836944688066, + "learning_rate": 0.00028682780660065993, + "loss": 7.4889, + "step": 10373 + }, + { + "epoch": 0.9679947746570868, + "grad_norm": 0.4193960306277324, + "learning_rate": 0.0002868247069558323, + "loss": 7.3309, + "step": 10374 + }, + { + "epoch": 0.9680880843519641, + "grad_norm": 0.559326046822474, + "learning_rate": 0.0002868216069630999, + "loss": 6.9988, + "step": 10375 + }, + { + "epoch": 0.9681813940468414, + "grad_norm": 0.5852853139953966, + "learning_rate": 0.00028681850662247064, + "loss": 7.4058, + "step": 10376 + }, + { + "epoch": 0.9682747037417188, + "grad_norm": 1.097228872166925, + "learning_rate": 0.0002868154059339525, + "loss": 7.6481, + "step": 10377 + }, + { + "epoch": 0.9683680134365961, + "grad_norm": 0.4152887978377956, + "learning_rate": 0.00028681230489755334, + "loss": 7.1795, + "step": 10378 + }, + { + "epoch": 0.9684613231314734, + "grad_norm": 0.4907047634470481, + "learning_rate": 0.000286809203513281, + "loss": 7.3189, + "step": 10379 + }, + { + "epoch": 0.9685546328263507, + "grad_norm": 0.6967950397954339, + "learning_rate": 0.0002868061017811434, + "loss": 7.0872, + "step": 10380 + }, + { + "epoch": 0.968647942521228, + "grad_norm": 0.35994135535737537, + "learning_rate": 0.0002868029997011483, + "loss": 7.2724, + "step": 10381 + }, + { + "epoch": 0.9687412522161053, + "grad_norm": 0.4493751642539511, + "learning_rate": 0.0002867998972733038, + "loss": 7.166, + "step": 10382 + }, + { + "epoch": 0.9688345619109826, + "grad_norm": 0.5428354586049654, + "learning_rate": 0.0002867967944976177, + "loss": 7.6742, + "step": 10383 + }, + { + "epoch": 0.9689278716058598, + "grad_norm": 0.4378699041911766, + "learning_rate": 0.0002867936913740978, + "loss": 7.5189, + "step": 10384 + }, + { + "epoch": 0.9690211813007371, + "grad_norm": 0.37080065913315796, + "learning_rate": 0.00028679058790275207, + "loss": 7.3176, + "step": 10385 + }, + { + "epoch": 0.9691144909956144, + "grad_norm": 0.4504703040876968, + "learning_rate": 0.0002867874840835884, + "loss": 7.476, + "step": 10386 + }, + { + "epoch": 0.9692078006904917, + "grad_norm": 0.42340618606088226, + "learning_rate": 0.0002867843799166147, + "loss": 7.3004, + "step": 10387 + }, + { + "epoch": 0.969301110385369, + "grad_norm": 0.7155055213841518, + "learning_rate": 0.00028678127540183887, + "loss": 7.3751, + "step": 10388 + }, + { + "epoch": 0.9693944200802463, + "grad_norm": 0.5462372112799189, + "learning_rate": 0.0002867781705392687, + "loss": 7.2753, + "step": 10389 + }, + { + "epoch": 0.9694877297751237, + "grad_norm": 0.4421176875957764, + "learning_rate": 0.0002867750653289122, + "loss": 7.209, + "step": 10390 + }, + { + "epoch": 0.969581039470001, + "grad_norm": 0.6579495857803848, + "learning_rate": 0.00028677195977077724, + "loss": 7.4888, + "step": 10391 + }, + { + "epoch": 0.9696743491648783, + "grad_norm": 0.5649826044006772, + "learning_rate": 0.0002867688538648717, + "loss": 7.5822, + "step": 10392 + }, + { + "epoch": 0.9697676588597556, + "grad_norm": 0.7328122482078786, + "learning_rate": 0.0002867657476112035, + "loss": 7.234, + "step": 10393 + }, + { + "epoch": 0.9698609685546328, + "grad_norm": 0.5692716278985401, + "learning_rate": 0.00028676264100978044, + "loss": 7.3262, + "step": 10394 + }, + { + "epoch": 0.9699542782495101, + "grad_norm": 0.5912290031679112, + "learning_rate": 0.00028675953406061055, + "loss": 7.5677, + "step": 10395 + }, + { + "epoch": 0.9700475879443874, + "grad_norm": 0.612157874108606, + "learning_rate": 0.0002867564267637017, + "loss": 7.3916, + "step": 10396 + }, + { + "epoch": 0.9701408976392647, + "grad_norm": 0.7123617657767153, + "learning_rate": 0.0002867533191190617, + "loss": 7.2696, + "step": 10397 + }, + { + "epoch": 0.970234207334142, + "grad_norm": 0.43154957937950766, + "learning_rate": 0.00028675021112669854, + "loss": 6.9853, + "step": 10398 + }, + { + "epoch": 0.9703275170290193, + "grad_norm": 0.8609735485374653, + "learning_rate": 0.0002867471027866201, + "loss": 7.4058, + "step": 10399 + }, + { + "epoch": 0.9704208267238966, + "grad_norm": 0.809771256352344, + "learning_rate": 0.0002867439940988343, + "loss": 7.2852, + "step": 10400 + }, + { + "epoch": 0.9705141364187739, + "grad_norm": 0.9551669266078419, + "learning_rate": 0.00028674088506334906, + "loss": 7.2641, + "step": 10401 + }, + { + "epoch": 0.9706074461136512, + "grad_norm": 0.3505674311212633, + "learning_rate": 0.0002867377756801722, + "loss": 7.5, + "step": 10402 + }, + { + "epoch": 0.9707007558085285, + "grad_norm": 0.6895897275935655, + "learning_rate": 0.00028673466594931167, + "loss": 7.2853, + "step": 10403 + }, + { + "epoch": 0.9707940655034059, + "grad_norm": 0.484874825596613, + "learning_rate": 0.00028673155587077544, + "loss": 7.3664, + "step": 10404 + }, + { + "epoch": 0.970887375198283, + "grad_norm": 0.44285264940986424, + "learning_rate": 0.00028672844544457135, + "loss": 7.5508, + "step": 10405 + }, + { + "epoch": 0.9709806848931604, + "grad_norm": 0.6143631348794928, + "learning_rate": 0.00028672533467070737, + "loss": 7.2254, + "step": 10406 + }, + { + "epoch": 0.9710739945880377, + "grad_norm": 0.4193098734667313, + "learning_rate": 0.00028672222354919125, + "loss": 7.1675, + "step": 10407 + }, + { + "epoch": 0.971167304282915, + "grad_norm": 0.3914065512944027, + "learning_rate": 0.0002867191120800311, + "loss": 7.3248, + "step": 10408 + }, + { + "epoch": 0.9712606139777923, + "grad_norm": 0.5098051528668658, + "learning_rate": 0.0002867160002632347, + "loss": 7.2943, + "step": 10409 + }, + { + "epoch": 0.9713539236726696, + "grad_norm": 0.4668516178762218, + "learning_rate": 0.0002867128880988101, + "loss": 7.3095, + "step": 10410 + }, + { + "epoch": 0.9714472333675469, + "grad_norm": 0.44105004336105197, + "learning_rate": 0.00028670977558676507, + "loss": 7.364, + "step": 10411 + }, + { + "epoch": 0.9715405430624242, + "grad_norm": 0.4861162382330836, + "learning_rate": 0.00028670666272710753, + "loss": 7.6222, + "step": 10412 + }, + { + "epoch": 0.9716338527573015, + "grad_norm": 0.3643397595178301, + "learning_rate": 0.0002867035495198455, + "loss": 7.5258, + "step": 10413 + }, + { + "epoch": 0.9717271624521788, + "grad_norm": 0.4506257744497498, + "learning_rate": 0.0002867004359649868, + "loss": 7.6207, + "step": 10414 + }, + { + "epoch": 0.971820472147056, + "grad_norm": 1.471654448786277, + "learning_rate": 0.0002866973220625394, + "loss": 7.2126, + "step": 10415 + }, + { + "epoch": 0.9719137818419333, + "grad_norm": 0.7800982160182754, + "learning_rate": 0.0002866942078125112, + "loss": 7.2838, + "step": 10416 + }, + { + "epoch": 0.9720070915368106, + "grad_norm": 0.5900576977789763, + "learning_rate": 0.0002866910932149101, + "loss": 7.6905, + "step": 10417 + }, + { + "epoch": 0.972100401231688, + "grad_norm": 0.7405142241779432, + "learning_rate": 0.0002866879782697441, + "loss": 7.2644, + "step": 10418 + }, + { + "epoch": 0.9721937109265653, + "grad_norm": 0.9847407763398504, + "learning_rate": 0.00028668486297702106, + "loss": 7.5864, + "step": 10419 + }, + { + "epoch": 0.9722870206214426, + "grad_norm": 0.5049939780076385, + "learning_rate": 0.00028668174733674886, + "loss": 7.2336, + "step": 10420 + }, + { + "epoch": 0.9723803303163199, + "grad_norm": 0.8707016161404785, + "learning_rate": 0.0002866786313489355, + "loss": 7.5699, + "step": 10421 + }, + { + "epoch": 0.9724736400111972, + "grad_norm": 0.40849978945978643, + "learning_rate": 0.0002866755150135888, + "loss": 7.3254, + "step": 10422 + }, + { + "epoch": 0.9725669497060745, + "grad_norm": 0.348482341091202, + "learning_rate": 0.0002866723983307168, + "loss": 7.3318, + "step": 10423 + }, + { + "epoch": 0.9726602594009518, + "grad_norm": 0.8783531581532334, + "learning_rate": 0.0002866692813003274, + "loss": 7.3087, + "step": 10424 + }, + { + "epoch": 0.9727535690958291, + "grad_norm": 0.37901323058874387, + "learning_rate": 0.00028666616392242846, + "loss": 7.5867, + "step": 10425 + }, + { + "epoch": 0.9728468787907063, + "grad_norm": 0.8343308562210708, + "learning_rate": 0.00028666304619702796, + "loss": 7.3689, + "step": 10426 + }, + { + "epoch": 0.9729401884855836, + "grad_norm": 0.45687730121009956, + "learning_rate": 0.00028665992812413384, + "loss": 7.4074, + "step": 10427 + }, + { + "epoch": 0.9730334981804609, + "grad_norm": 0.40452820819056984, + "learning_rate": 0.00028665680970375396, + "loss": 7.2313, + "step": 10428 + }, + { + "epoch": 0.9731268078753382, + "grad_norm": 0.47563826030570855, + "learning_rate": 0.00028665369093589633, + "loss": 7.2272, + "step": 10429 + }, + { + "epoch": 0.9732201175702155, + "grad_norm": 0.5501469426981267, + "learning_rate": 0.0002866505718205688, + "loss": 7.4572, + "step": 10430 + }, + { + "epoch": 0.9733134272650928, + "grad_norm": 0.8437648184453248, + "learning_rate": 0.0002866474523577794, + "loss": 7.64, + "step": 10431 + }, + { + "epoch": 0.9734067369599702, + "grad_norm": 0.6653560688949047, + "learning_rate": 0.00028664433254753596, + "loss": 7.5726, + "step": 10432 + }, + { + "epoch": 0.9735000466548475, + "grad_norm": 0.8238022749832817, + "learning_rate": 0.0002866412123898465, + "loss": 7.3319, + "step": 10433 + }, + { + "epoch": 0.9735933563497248, + "grad_norm": 0.6375881578294733, + "learning_rate": 0.0002866380918847189, + "loss": 7.4329, + "step": 10434 + }, + { + "epoch": 0.9736866660446021, + "grad_norm": 0.7411945274368613, + "learning_rate": 0.0002866349710321611, + "loss": 7.2869, + "step": 10435 + }, + { + "epoch": 0.9737799757394794, + "grad_norm": 0.38375923808584833, + "learning_rate": 0.00028663184983218105, + "loss": 7.3604, + "step": 10436 + }, + { + "epoch": 0.9738732854343566, + "grad_norm": 0.4195154736034096, + "learning_rate": 0.0002866287282847867, + "loss": 7.3615, + "step": 10437 + }, + { + "epoch": 0.9739665951292339, + "grad_norm": 0.44835855596973784, + "learning_rate": 0.0002866256063899859, + "loss": 7.1367, + "step": 10438 + }, + { + "epoch": 0.9740599048241112, + "grad_norm": 0.8724487703283793, + "learning_rate": 0.00028662248414778674, + "loss": 7.5263, + "step": 10439 + }, + { + "epoch": 0.9741532145189885, + "grad_norm": 1.2024435868656906, + "learning_rate": 0.000286619361558197, + "loss": 7.6279, + "step": 10440 + }, + { + "epoch": 0.9742465242138658, + "grad_norm": 0.38290654328701834, + "learning_rate": 0.00028661623862122473, + "loss": 7.2469, + "step": 10441 + }, + { + "epoch": 0.9743398339087431, + "grad_norm": 0.594139987634075, + "learning_rate": 0.0002866131153368778, + "loss": 7.298, + "step": 10442 + }, + { + "epoch": 0.9744331436036204, + "grad_norm": 0.4233871090931817, + "learning_rate": 0.00028660999170516423, + "loss": 7.6582, + "step": 10443 + }, + { + "epoch": 0.9745264532984977, + "grad_norm": 0.6158473511253737, + "learning_rate": 0.0002866068677260919, + "loss": 7.3863, + "step": 10444 + }, + { + "epoch": 0.974619762993375, + "grad_norm": 0.5259267064733558, + "learning_rate": 0.0002866037433996688, + "loss": 7.4915, + "step": 10445 + }, + { + "epoch": 0.9747130726882524, + "grad_norm": 0.9584612446684034, + "learning_rate": 0.0002866006187259028, + "loss": 7.0629, + "step": 10446 + }, + { + "epoch": 0.9748063823831296, + "grad_norm": 1.078554566138606, + "learning_rate": 0.0002865974937048019, + "loss": 8.0099, + "step": 10447 + }, + { + "epoch": 0.9748996920780069, + "grad_norm": 0.957625738880558, + "learning_rate": 0.00028659436833637403, + "loss": 7.6429, + "step": 10448 + }, + { + "epoch": 0.9749930017728842, + "grad_norm": 0.4593775839637982, + "learning_rate": 0.0002865912426206272, + "loss": 7.3721, + "step": 10449 + }, + { + "epoch": 0.9750863114677615, + "grad_norm": 0.5128509102323946, + "learning_rate": 0.00028658811655756926, + "loss": 7.0807, + "step": 10450 + }, + { + "epoch": 0.9751796211626388, + "grad_norm": 0.34103235628280704, + "learning_rate": 0.0002865849901472082, + "loss": 7.3388, + "step": 10451 + }, + { + "epoch": 0.9752729308575161, + "grad_norm": 0.359736656211612, + "learning_rate": 0.0002865818633895519, + "loss": 7.2038, + "step": 10452 + }, + { + "epoch": 0.9753662405523934, + "grad_norm": 0.4479993005374458, + "learning_rate": 0.0002865787362846085, + "loss": 7.5895, + "step": 10453 + }, + { + "epoch": 0.9754595502472707, + "grad_norm": 1.2867733345975447, + "learning_rate": 0.0002865756088323858, + "loss": 7.1958, + "step": 10454 + }, + { + "epoch": 0.975552859942148, + "grad_norm": 0.47765211918374023, + "learning_rate": 0.0002865724810328917, + "loss": 7.3578, + "step": 10455 + }, + { + "epoch": 0.9756461696370253, + "grad_norm": 0.44563713751237194, + "learning_rate": 0.00028656935288613435, + "loss": 7.3534, + "step": 10456 + }, + { + "epoch": 0.9757394793319026, + "grad_norm": 0.31010379457847037, + "learning_rate": 0.0002865662243921215, + "loss": 7.2046, + "step": 10457 + }, + { + "epoch": 0.9758327890267798, + "grad_norm": 0.32755977992692015, + "learning_rate": 0.0002865630955508612, + "loss": 7.2203, + "step": 10458 + }, + { + "epoch": 0.9759260987216571, + "grad_norm": 0.6497476760500723, + "learning_rate": 0.0002865599663623615, + "loss": 7.3448, + "step": 10459 + }, + { + "epoch": 0.9760194084165345, + "grad_norm": 0.388584502995072, + "learning_rate": 0.00028655683682663017, + "loss": 7.5171, + "step": 10460 + }, + { + "epoch": 0.9761127181114118, + "grad_norm": 0.3025847750014307, + "learning_rate": 0.00028655370694367525, + "loss": 7.5195, + "step": 10461 + }, + { + "epoch": 0.9762060278062891, + "grad_norm": 0.4864558171750405, + "learning_rate": 0.00028655057671350475, + "loss": 7.3947, + "step": 10462 + }, + { + "epoch": 0.9762993375011664, + "grad_norm": 0.4382524694364541, + "learning_rate": 0.0002865474461361266, + "loss": 7.3745, + "step": 10463 + }, + { + "epoch": 0.9763926471960437, + "grad_norm": 0.6888992439542395, + "learning_rate": 0.00028654431521154866, + "loss": 7.079, + "step": 10464 + }, + { + "epoch": 0.976485956890921, + "grad_norm": 0.40701296396894693, + "learning_rate": 0.00028654118393977906, + "loss": 7.1676, + "step": 10465 + }, + { + "epoch": 0.9765792665857983, + "grad_norm": 0.48720679954601387, + "learning_rate": 0.00028653805232082565, + "loss": 7.3617, + "step": 10466 + }, + { + "epoch": 0.9766725762806756, + "grad_norm": 0.4777827316196949, + "learning_rate": 0.0002865349203546964, + "loss": 7.2226, + "step": 10467 + }, + { + "epoch": 0.9767658859755528, + "grad_norm": 0.7894621854530397, + "learning_rate": 0.00028653178804139925, + "loss": 7.29, + "step": 10468 + }, + { + "epoch": 0.9768591956704301, + "grad_norm": 0.5056426740775621, + "learning_rate": 0.0002865286553809423, + "loss": 7.3894, + "step": 10469 + }, + { + "epoch": 0.9769525053653074, + "grad_norm": 0.3461168519397622, + "learning_rate": 0.0002865255223733334, + "loss": 7.3936, + "step": 10470 + }, + { + "epoch": 0.9770458150601847, + "grad_norm": 0.7986213052430825, + "learning_rate": 0.00028652238901858047, + "loss": 7.1864, + "step": 10471 + }, + { + "epoch": 0.977139124755062, + "grad_norm": 0.8035052289887532, + "learning_rate": 0.0002865192553166916, + "loss": 7.167, + "step": 10472 + }, + { + "epoch": 0.9772324344499393, + "grad_norm": 0.33461794364097375, + "learning_rate": 0.00028651612126767473, + "loss": 7.4527, + "step": 10473 + }, + { + "epoch": 0.9773257441448167, + "grad_norm": 1.7321449448673312, + "learning_rate": 0.0002865129868715377, + "loss": 7.2999, + "step": 10474 + }, + { + "epoch": 0.977419053839694, + "grad_norm": 1.5381154787535414, + "learning_rate": 0.00028650985212828867, + "loss": 7.8049, + "step": 10475 + }, + { + "epoch": 0.9775123635345713, + "grad_norm": 0.6002465644541736, + "learning_rate": 0.00028650671703793555, + "loss": 7.2971, + "step": 10476 + }, + { + "epoch": 0.9776056732294486, + "grad_norm": 0.4356266891462889, + "learning_rate": 0.0002865035816004862, + "loss": 7.3474, + "step": 10477 + }, + { + "epoch": 0.9776989829243259, + "grad_norm": 0.3327336661222447, + "learning_rate": 0.00028650044581594874, + "loss": 7.3376, + "step": 10478 + }, + { + "epoch": 0.9777922926192031, + "grad_norm": 0.4498672331102421, + "learning_rate": 0.000286497309684331, + "loss": 7.357, + "step": 10479 + }, + { + "epoch": 0.9778856023140804, + "grad_norm": 0.8452909511774731, + "learning_rate": 0.0002864941732056411, + "loss": 7.2512, + "step": 10480 + }, + { + "epoch": 0.9779789120089577, + "grad_norm": 0.6654116426062209, + "learning_rate": 0.000286491036379887, + "loss": 7.3571, + "step": 10481 + }, + { + "epoch": 0.978072221703835, + "grad_norm": 0.430659702085164, + "learning_rate": 0.0002864878992070765, + "loss": 7.2405, + "step": 10482 + }, + { + "epoch": 0.9781655313987123, + "grad_norm": 0.5464288360523618, + "learning_rate": 0.0002864847616872178, + "loss": 7.3394, + "step": 10483 + }, + { + "epoch": 0.9782588410935896, + "grad_norm": 0.5119634408273017, + "learning_rate": 0.00028648162382031875, + "loss": 7.6969, + "step": 10484 + }, + { + "epoch": 0.9783521507884669, + "grad_norm": 0.6175678128450469, + "learning_rate": 0.00028647848560638735, + "loss": 7.7084, + "step": 10485 + }, + { + "epoch": 0.9784454604833442, + "grad_norm": 0.46850374319463356, + "learning_rate": 0.00028647534704543163, + "loss": 7.1981, + "step": 10486 + }, + { + "epoch": 0.9785387701782216, + "grad_norm": 0.6186384436350915, + "learning_rate": 0.0002864722081374595, + "loss": 7.5373, + "step": 10487 + }, + { + "epoch": 0.9786320798730989, + "grad_norm": 0.5059313966340887, + "learning_rate": 0.00028646906888247896, + "loss": 7.633, + "step": 10488 + }, + { + "epoch": 0.9787253895679762, + "grad_norm": 0.3165821115954377, + "learning_rate": 0.000286465929280498, + "loss": 7.451, + "step": 10489 + }, + { + "epoch": 0.9788186992628534, + "grad_norm": 0.5722424118759823, + "learning_rate": 0.00028646278933152464, + "loss": 7.2917, + "step": 10490 + }, + { + "epoch": 0.9789120089577307, + "grad_norm": 0.461628113087245, + "learning_rate": 0.0002864596490355668, + "loss": 7.4773, + "step": 10491 + }, + { + "epoch": 0.979005318652608, + "grad_norm": 0.4333418202589832, + "learning_rate": 0.0002864565083926325, + "loss": 7.4292, + "step": 10492 + }, + { + "epoch": 0.9790986283474853, + "grad_norm": 0.7187004358824772, + "learning_rate": 0.0002864533674027297, + "loss": 7.6617, + "step": 10493 + }, + { + "epoch": 0.9791919380423626, + "grad_norm": 0.4118048164294427, + "learning_rate": 0.00028645022606586644, + "loss": 7.2686, + "step": 10494 + }, + { + "epoch": 0.9792852477372399, + "grad_norm": 0.5703687569692623, + "learning_rate": 0.00028644708438205067, + "loss": 7.636, + "step": 10495 + }, + { + "epoch": 0.9793785574321172, + "grad_norm": 0.45896612586088675, + "learning_rate": 0.00028644394235129037, + "loss": 7.1197, + "step": 10496 + }, + { + "epoch": 0.9794718671269945, + "grad_norm": 0.8631105298274504, + "learning_rate": 0.00028644079997359356, + "loss": 7.7653, + "step": 10497 + }, + { + "epoch": 0.9795651768218718, + "grad_norm": 0.6146660299461731, + "learning_rate": 0.00028643765724896826, + "loss": 7.7688, + "step": 10498 + }, + { + "epoch": 0.9796584865167491, + "grad_norm": 0.8422142783570302, + "learning_rate": 0.0002864345141774223, + "loss": 7.2979, + "step": 10499 + }, + { + "epoch": 0.9797517962116263, + "grad_norm": 0.566540448782821, + "learning_rate": 0.00028643137075896386, + "loss": 7.6578, + "step": 10500 + }, + { + "epoch": 0.9798451059065036, + "grad_norm": 1.1096845587350082, + "learning_rate": 0.00028642822699360087, + "loss": 7.2162, + "step": 10501 + }, + { + "epoch": 0.979938415601381, + "grad_norm": 0.37879121686967004, + "learning_rate": 0.0002864250828813412, + "loss": 7.6686, + "step": 10502 + }, + { + "epoch": 0.9800317252962583, + "grad_norm": 0.33766794922390686, + "learning_rate": 0.0002864219384221931, + "loss": 7.3311, + "step": 10503 + }, + { + "epoch": 0.9801250349911356, + "grad_norm": 0.3643368138388572, + "learning_rate": 0.00028641879361616436, + "loss": 7.2306, + "step": 10504 + }, + { + "epoch": 0.9802183446860129, + "grad_norm": 0.6361006276102261, + "learning_rate": 0.000286415648463263, + "loss": 7.5972, + "step": 10505 + }, + { + "epoch": 0.9803116543808902, + "grad_norm": 0.9123114776177303, + "learning_rate": 0.0002864125029634971, + "loss": 7.4215, + "step": 10506 + }, + { + "epoch": 0.9804049640757675, + "grad_norm": 0.6104552544527488, + "learning_rate": 0.0002864093571168746, + "loss": 7.2217, + "step": 10507 + }, + { + "epoch": 0.9804982737706448, + "grad_norm": 0.6893600252813396, + "learning_rate": 0.00028640621092340353, + "loss": 7.7301, + "step": 10508 + }, + { + "epoch": 0.9805915834655221, + "grad_norm": 0.4035446722603314, + "learning_rate": 0.00028640306438309186, + "loss": 7.3917, + "step": 10509 + }, + { + "epoch": 0.9806848931603994, + "grad_norm": 1.1595446413749115, + "learning_rate": 0.0002863999174959476, + "loss": 7.2453, + "step": 10510 + }, + { + "epoch": 0.9807782028552766, + "grad_norm": 0.9078466587064283, + "learning_rate": 0.0002863967702619788, + "loss": 7.3029, + "step": 10511 + }, + { + "epoch": 0.9808715125501539, + "grad_norm": 0.4774902615763859, + "learning_rate": 0.00028639362268119336, + "loss": 7.5733, + "step": 10512 + }, + { + "epoch": 0.9809648222450312, + "grad_norm": 0.5749722406166118, + "learning_rate": 0.00028639047475359936, + "loss": 7.5274, + "step": 10513 + }, + { + "epoch": 0.9810581319399085, + "grad_norm": 0.2758446628841569, + "learning_rate": 0.00028638732647920476, + "loss": 7.2943, + "step": 10514 + }, + { + "epoch": 0.9811514416347858, + "grad_norm": 0.4609448056265888, + "learning_rate": 0.00028638417785801763, + "loss": 7.0616, + "step": 10515 + }, + { + "epoch": 0.9812447513296632, + "grad_norm": 1.4450737426042783, + "learning_rate": 0.0002863810288900459, + "loss": 7.6028, + "step": 10516 + }, + { + "epoch": 0.9813380610245405, + "grad_norm": 0.3678775843544648, + "learning_rate": 0.0002863778795752977, + "loss": 7.3053, + "step": 10517 + }, + { + "epoch": 0.9814313707194178, + "grad_norm": 0.4821469870938143, + "learning_rate": 0.0002863747299137808, + "loss": 7.104, + "step": 10518 + }, + { + "epoch": 0.9815246804142951, + "grad_norm": 0.3963066409917998, + "learning_rate": 0.00028637157990550345, + "loss": 7.6498, + "step": 10519 + }, + { + "epoch": 0.9816179901091724, + "grad_norm": 0.41825566232714395, + "learning_rate": 0.00028636842955047354, + "loss": 7.4475, + "step": 10520 + }, + { + "epoch": 0.9817112998040496, + "grad_norm": 0.6355667604363463, + "learning_rate": 0.00028636527884869914, + "loss": 7.641, + "step": 10521 + }, + { + "epoch": 0.9818046094989269, + "grad_norm": 0.4881365869965624, + "learning_rate": 0.0002863621278001882, + "loss": 7.7039, + "step": 10522 + }, + { + "epoch": 0.9818979191938042, + "grad_norm": 0.6788273192166132, + "learning_rate": 0.00028635897640494877, + "loss": 7.429, + "step": 10523 + }, + { + "epoch": 0.9819912288886815, + "grad_norm": 0.631015914311075, + "learning_rate": 0.00028635582466298885, + "loss": 7.7104, + "step": 10524 + }, + { + "epoch": 0.9820845385835588, + "grad_norm": 0.6836373567677021, + "learning_rate": 0.0002863526725743165, + "loss": 7.3882, + "step": 10525 + }, + { + "epoch": 0.9821778482784361, + "grad_norm": 0.5619768817057986, + "learning_rate": 0.00028634952013893966, + "loss": 7.4698, + "step": 10526 + }, + { + "epoch": 0.9822711579733134, + "grad_norm": 0.3821611908112673, + "learning_rate": 0.0002863463673568664, + "loss": 7.3174, + "step": 10527 + }, + { + "epoch": 0.9823644676681907, + "grad_norm": 0.3799020959227606, + "learning_rate": 0.0002863432142281047, + "loss": 7.1914, + "step": 10528 + }, + { + "epoch": 0.982457777363068, + "grad_norm": 2.645938719615252, + "learning_rate": 0.00028634006075266256, + "loss": 7.492, + "step": 10529 + }, + { + "epoch": 0.9825510870579454, + "grad_norm": 0.4886065862179878, + "learning_rate": 0.0002863369069305481, + "loss": 7.0522, + "step": 10530 + }, + { + "epoch": 0.9826443967528227, + "grad_norm": 1.0461641730035174, + "learning_rate": 0.00028633375276176926, + "loss": 7.4742, + "step": 10531 + }, + { + "epoch": 0.9827377064476999, + "grad_norm": 0.9986313000910643, + "learning_rate": 0.00028633059824633404, + "loss": 7.6009, + "step": 10532 + }, + { + "epoch": 0.9828310161425772, + "grad_norm": 0.4302357844558666, + "learning_rate": 0.0002863274433842505, + "loss": 7.521, + "step": 10533 + }, + { + "epoch": 0.9829243258374545, + "grad_norm": 0.7718063897638188, + "learning_rate": 0.0002863242881755267, + "loss": 7.3861, + "step": 10534 + }, + { + "epoch": 0.9830176355323318, + "grad_norm": 1.983642498368882, + "learning_rate": 0.0002863211326201705, + "loss": 7.302, + "step": 10535 + }, + { + "epoch": 0.9831109452272091, + "grad_norm": 0.6433357742319256, + "learning_rate": 0.0002863179767181902, + "loss": 7.2949, + "step": 10536 + }, + { + "epoch": 0.9832042549220864, + "grad_norm": 0.4221064000290723, + "learning_rate": 0.0002863148204695936, + "loss": 7.6636, + "step": 10537 + }, + { + "epoch": 0.9832975646169637, + "grad_norm": 0.39730795301462113, + "learning_rate": 0.00028631166387438876, + "loss": 7.566, + "step": 10538 + }, + { + "epoch": 0.983390874311841, + "grad_norm": 3.123358957480817, + "learning_rate": 0.00028630850693258373, + "loss": 7.6235, + "step": 10539 + }, + { + "epoch": 0.9834841840067183, + "grad_norm": 2.3440541118692595, + "learning_rate": 0.0002863053496441866, + "loss": 7.5679, + "step": 10540 + }, + { + "epoch": 0.9835774937015956, + "grad_norm": 0.3431387181226578, + "learning_rate": 0.0002863021920092053, + "loss": 7.6538, + "step": 10541 + }, + { + "epoch": 0.983670803396473, + "grad_norm": 10.054306623977629, + "learning_rate": 0.0002862990340276479, + "loss": 7.4673, + "step": 10542 + }, + { + "epoch": 0.9837641130913501, + "grad_norm": 4.06342177520499, + "learning_rate": 0.00028629587569952244, + "loss": 7.4617, + "step": 10543 + }, + { + "epoch": 0.9838574227862275, + "grad_norm": 17.581476114899125, + "learning_rate": 0.0002862927170248369, + "loss": 7.3119, + "step": 10544 + }, + { + "epoch": 0.9839507324811048, + "grad_norm": 0.2895280613229888, + "learning_rate": 0.00028628955800359945, + "loss": 7.6493, + "step": 10545 + }, + { + "epoch": 0.9840440421759821, + "grad_norm": 0.5569995533666051, + "learning_rate": 0.00028628639863581795, + "loss": 7.4736, + "step": 10546 + }, + { + "epoch": 0.9841373518708594, + "grad_norm": 0.7213967163571804, + "learning_rate": 0.00028628323892150055, + "loss": 7.2532, + "step": 10547 + }, + { + "epoch": 0.9842306615657367, + "grad_norm": 0.4703030083579695, + "learning_rate": 0.00028628007886065523, + "loss": 7.2971, + "step": 10548 + }, + { + "epoch": 0.984323971260614, + "grad_norm": 0.6637787394588267, + "learning_rate": 0.00028627691845329, + "loss": 7.3012, + "step": 10549 + }, + { + "epoch": 0.9844172809554913, + "grad_norm": 0.6296092785252886, + "learning_rate": 0.000286273757699413, + "loss": 7.2679, + "step": 10550 + }, + { + "epoch": 0.9845105906503686, + "grad_norm": 0.8506092910960177, + "learning_rate": 0.00028627059659903215, + "loss": 7.2079, + "step": 10551 + }, + { + "epoch": 0.9846039003452459, + "grad_norm": 0.6227158212055877, + "learning_rate": 0.00028626743515215556, + "loss": 7.4174, + "step": 10552 + }, + { + "epoch": 0.9846972100401231, + "grad_norm": 0.5792463633702988, + "learning_rate": 0.0002862642733587913, + "loss": 7.495, + "step": 10553 + }, + { + "epoch": 0.9847905197350004, + "grad_norm": 0.523125356796101, + "learning_rate": 0.0002862611112189473, + "loss": 7.4077, + "step": 10554 + }, + { + "epoch": 0.9848838294298777, + "grad_norm": 0.7431548185390796, + "learning_rate": 0.0002862579487326316, + "loss": 7.2975, + "step": 10555 + }, + { + "epoch": 0.984977139124755, + "grad_norm": 0.8406129734362012, + "learning_rate": 0.00028625478589985236, + "loss": 7.3655, + "step": 10556 + }, + { + "epoch": 0.9850704488196323, + "grad_norm": 0.7152836085700899, + "learning_rate": 0.0002862516227206176, + "loss": 7.5542, + "step": 10557 + }, + { + "epoch": 0.9851637585145097, + "grad_norm": 0.43640252482804515, + "learning_rate": 0.0002862484591949353, + "loss": 7.4153, + "step": 10558 + }, + { + "epoch": 0.985257068209387, + "grad_norm": 0.47680932012305555, + "learning_rate": 0.0002862452953228135, + "loss": 7.4222, + "step": 10559 + }, + { + "epoch": 0.9853503779042643, + "grad_norm": 0.5007796480052796, + "learning_rate": 0.0002862421311042603, + "loss": 7.0573, + "step": 10560 + }, + { + "epoch": 0.9854436875991416, + "grad_norm": 2.869503689824748, + "learning_rate": 0.0002862389665392837, + "loss": 7.4308, + "step": 10561 + }, + { + "epoch": 0.9855369972940189, + "grad_norm": 0.45614140943370934, + "learning_rate": 0.0002862358016278918, + "loss": 7.2681, + "step": 10562 + }, + { + "epoch": 0.9856303069888962, + "grad_norm": 0.3477396833263156, + "learning_rate": 0.00028623263637009256, + "loss": 7.4809, + "step": 10563 + }, + { + "epoch": 0.9857236166837734, + "grad_norm": 1.1370802617552358, + "learning_rate": 0.0002862294707658941, + "loss": 7.1536, + "step": 10564 + }, + { + "epoch": 0.9858169263786507, + "grad_norm": 0.502471458897535, + "learning_rate": 0.00028622630481530445, + "loss": 7.3197, + "step": 10565 + }, + { + "epoch": 0.985910236073528, + "grad_norm": 0.6217650558304612, + "learning_rate": 0.0002862231385183316, + "loss": 7.6368, + "step": 10566 + }, + { + "epoch": 0.9860035457684053, + "grad_norm": 1.7228327841858493, + "learning_rate": 0.0002862199718749838, + "loss": 7.598, + "step": 10567 + }, + { + "epoch": 0.9860968554632826, + "grad_norm": 0.393085881323536, + "learning_rate": 0.0002862168048852689, + "loss": 7.5326, + "step": 10568 + }, + { + "epoch": 0.9861901651581599, + "grad_norm": 0.5513905036493105, + "learning_rate": 0.000286213637549195, + "loss": 7.2714, + "step": 10569 + }, + { + "epoch": 0.9862834748530372, + "grad_norm": 0.4818824249775703, + "learning_rate": 0.0002862104698667702, + "loss": 7.4065, + "step": 10570 + }, + { + "epoch": 0.9863767845479146, + "grad_norm": 0.6538660429972069, + "learning_rate": 0.0002862073018380025, + "loss": 7.1084, + "step": 10571 + }, + { + "epoch": 0.9864700942427919, + "grad_norm": 0.4485248865172896, + "learning_rate": 0.00028620413346289994, + "loss": 7.1073, + "step": 10572 + }, + { + "epoch": 0.9865634039376692, + "grad_norm": 0.6080604096872925, + "learning_rate": 0.00028620096474147065, + "loss": 7.2457, + "step": 10573 + }, + { + "epoch": 0.9866567136325464, + "grad_norm": 0.6832462487835133, + "learning_rate": 0.0002861977956737226, + "loss": 7.0834, + "step": 10574 + }, + { + "epoch": 0.9867500233274237, + "grad_norm": 1.4548320791044502, + "learning_rate": 0.000286194626259664, + "loss": 7.7091, + "step": 10575 + }, + { + "epoch": 0.986843333022301, + "grad_norm": 167.268818041929, + "learning_rate": 0.0002861914564993028, + "loss": 7.2462, + "step": 10576 + }, + { + "epoch": 0.9869366427171783, + "grad_norm": 24.98796668386433, + "learning_rate": 0.000286188286392647, + "loss": 7.451, + "step": 10577 + }, + { + "epoch": 0.9870299524120556, + "grad_norm": 0.3214761923421279, + "learning_rate": 0.0002861851159397048, + "loss": 7.2061, + "step": 10578 + }, + { + "epoch": 0.9871232621069329, + "grad_norm": 1.362682941739675, + "learning_rate": 0.0002861819451404841, + "loss": 7.3906, + "step": 10579 + }, + { + "epoch": 0.9872165718018102, + "grad_norm": 1.882697027638673, + "learning_rate": 0.0002861787739949931, + "loss": 7.4621, + "step": 10580 + }, + { + "epoch": 0.9873098814966875, + "grad_norm": 1.606404775927638, + "learning_rate": 0.00028617560250323985, + "loss": 7.4486, + "step": 10581 + }, + { + "epoch": 0.9874031911915648, + "grad_norm": 1.4649110893841981, + "learning_rate": 0.0002861724306652324, + "loss": 7.239, + "step": 10582 + }, + { + "epoch": 0.9874965008864421, + "grad_norm": 0.6486165236662943, + "learning_rate": 0.0002861692584809787, + "loss": 7.5749, + "step": 10583 + }, + { + "epoch": 0.9875898105813194, + "grad_norm": 0.6234399511382377, + "learning_rate": 0.000286166085950487, + "loss": 7.3924, + "step": 10584 + }, + { + "epoch": 0.9876831202761966, + "grad_norm": 1.304014210673503, + "learning_rate": 0.00028616291307376527, + "loss": 7.3632, + "step": 10585 + }, + { + "epoch": 0.987776429971074, + "grad_norm": 0.8373930232475507, + "learning_rate": 0.0002861597398508215, + "loss": 7.2501, + "step": 10586 + }, + { + "epoch": 0.9878697396659513, + "grad_norm": 1.5230020417651529, + "learning_rate": 0.0002861565662816639, + "loss": 7.7682, + "step": 10587 + }, + { + "epoch": 0.9879630493608286, + "grad_norm": 0.44962579616691334, + "learning_rate": 0.0002861533923663005, + "loss": 7.2347, + "step": 10588 + }, + { + "epoch": 0.9880563590557059, + "grad_norm": 0.5690706175423781, + "learning_rate": 0.00028615021810473934, + "loss": 7.3224, + "step": 10589 + }, + { + "epoch": 0.9881496687505832, + "grad_norm": 0.4482617893524921, + "learning_rate": 0.0002861470434969885, + "loss": 7.7253, + "step": 10590 + }, + { + "epoch": 0.9882429784454605, + "grad_norm": 0.506865140673062, + "learning_rate": 0.00028614386854305604, + "loss": 7.3611, + "step": 10591 + }, + { + "epoch": 0.9883362881403378, + "grad_norm": 0.847346157442201, + "learning_rate": 0.00028614069324295013, + "loss": 7.1476, + "step": 10592 + }, + { + "epoch": 0.9884295978352151, + "grad_norm": 0.5458713697379306, + "learning_rate": 0.0002861375175966787, + "loss": 7.6643, + "step": 10593 + }, + { + "epoch": 0.9885229075300924, + "grad_norm": 1.0915371574985957, + "learning_rate": 0.0002861343416042499, + "loss": 7.5054, + "step": 10594 + }, + { + "epoch": 0.9886162172249697, + "grad_norm": 0.77051364492995, + "learning_rate": 0.00028613116526567177, + "loss": 7.1948, + "step": 10595 + }, + { + "epoch": 0.9887095269198469, + "grad_norm": 0.7991052451898665, + "learning_rate": 0.00028612798858095246, + "loss": 7.3053, + "step": 10596 + }, + { + "epoch": 0.9888028366147242, + "grad_norm": 1.001444878752248, + "learning_rate": 0.0002861248115501, + "loss": 7.6806, + "step": 10597 + }, + { + "epoch": 0.9888961463096015, + "grad_norm": 0.873335520115972, + "learning_rate": 0.0002861216341731224, + "loss": 7.657, + "step": 10598 + }, + { + "epoch": 0.9889894560044789, + "grad_norm": 0.4603097229690003, + "learning_rate": 0.00028611845645002787, + "loss": 7.3691, + "step": 10599 + }, + { + "epoch": 0.9890827656993562, + "grad_norm": 0.6427116108662027, + "learning_rate": 0.00028611527838082445, + "loss": 7.085, + "step": 10600 + }, + { + "epoch": 0.9891760753942335, + "grad_norm": 0.7510749817756809, + "learning_rate": 0.0002861120999655201, + "loss": 7.0799, + "step": 10601 + }, + { + "epoch": 0.9892693850891108, + "grad_norm": 0.37658921749864094, + "learning_rate": 0.00028610892120412305, + "loss": 7.4206, + "step": 10602 + }, + { + "epoch": 0.9893626947839881, + "grad_norm": 0.41656521472206115, + "learning_rate": 0.0002861057420966414, + "loss": 7.1411, + "step": 10603 + }, + { + "epoch": 0.9894560044788654, + "grad_norm": 0.6665449407386524, + "learning_rate": 0.0002861025626430831, + "loss": 7.394, + "step": 10604 + }, + { + "epoch": 0.9895493141737427, + "grad_norm": 0.4188648485366144, + "learning_rate": 0.0002860993828434563, + "loss": 7.3195, + "step": 10605 + }, + { + "epoch": 0.9896426238686199, + "grad_norm": 0.7880826392807564, + "learning_rate": 0.00028609620269776906, + "loss": 7.2293, + "step": 10606 + }, + { + "epoch": 0.9897359335634972, + "grad_norm": 0.5180654726602093, + "learning_rate": 0.00028609302220602956, + "loss": 7.1961, + "step": 10607 + }, + { + "epoch": 0.9898292432583745, + "grad_norm": 0.5586123241534781, + "learning_rate": 0.00028608984136824575, + "loss": 7.2529, + "step": 10608 + }, + { + "epoch": 0.9899225529532518, + "grad_norm": 0.7002731717931907, + "learning_rate": 0.0002860866601844258, + "loss": 7.2946, + "step": 10609 + }, + { + "epoch": 0.9900158626481291, + "grad_norm": 0.6122083706788946, + "learning_rate": 0.00028608347865457785, + "loss": 7.6735, + "step": 10610 + }, + { + "epoch": 0.9901091723430064, + "grad_norm": 0.3851931435592548, + "learning_rate": 0.00028608029677870987, + "loss": 7.2596, + "step": 10611 + }, + { + "epoch": 0.9902024820378837, + "grad_norm": 0.7001415784623173, + "learning_rate": 0.00028607711455683, + "loss": 7.7728, + "step": 10612 + }, + { + "epoch": 0.990295791732761, + "grad_norm": 0.40406369249066393, + "learning_rate": 0.00028607393198894633, + "loss": 7.2251, + "step": 10613 + }, + { + "epoch": 0.9903891014276384, + "grad_norm": 0.33070634731667325, + "learning_rate": 0.0002860707490750669, + "loss": 7.4129, + "step": 10614 + }, + { + "epoch": 0.9904824111225157, + "grad_norm": 1.2643972335253897, + "learning_rate": 0.00028606756581519996, + "loss": 7.5479, + "step": 10615 + }, + { + "epoch": 0.990575720817393, + "grad_norm": 0.574206179341873, + "learning_rate": 0.0002860643822093535, + "loss": 7.6374, + "step": 10616 + }, + { + "epoch": 0.9906690305122702, + "grad_norm": 0.32135649353487555, + "learning_rate": 0.00028606119825753566, + "loss": 7.5595, + "step": 10617 + }, + { + "epoch": 0.9907623402071475, + "grad_norm": 0.46834951763227606, + "learning_rate": 0.00028605801395975436, + "loss": 7.5657, + "step": 10618 + }, + { + "epoch": 0.9908556499020248, + "grad_norm": 3.3554420288831626, + "learning_rate": 0.000286054829316018, + "loss": 7.0884, + "step": 10619 + }, + { + "epoch": 0.9909489595969021, + "grad_norm": 0.48949399732153315, + "learning_rate": 0.0002860516443263344, + "loss": 7.5941, + "step": 10620 + }, + { + "epoch": 0.9910422692917794, + "grad_norm": 2.178232531367009, + "learning_rate": 0.0002860484589907118, + "loss": 7.3342, + "step": 10621 + }, + { + "epoch": 0.9911355789866567, + "grad_norm": 0.6228050622499768, + "learning_rate": 0.00028604527330915825, + "loss": 7.6341, + "step": 10622 + }, + { + "epoch": 0.991228888681534, + "grad_norm": 1.8829048228449388, + "learning_rate": 0.00028604208728168185, + "loss": 7.3187, + "step": 10623 + }, + { + "epoch": 0.9913221983764113, + "grad_norm": 0.8710300820673832, + "learning_rate": 0.0002860389009082908, + "loss": 7.5788, + "step": 10624 + }, + { + "epoch": 0.9914155080712886, + "grad_norm": 0.4501023955726129, + "learning_rate": 0.00028603571418899307, + "loss": 7.7444, + "step": 10625 + }, + { + "epoch": 0.991508817766166, + "grad_norm": 0.7733016840670496, + "learning_rate": 0.0002860325271237968, + "loss": 7.4336, + "step": 10626 + }, + { + "epoch": 0.9916021274610431, + "grad_norm": 0.9364433987385636, + "learning_rate": 0.00028602933971271015, + "loss": 7.4568, + "step": 10627 + }, + { + "epoch": 0.9916954371559205, + "grad_norm": 6.845740492664409, + "learning_rate": 0.00028602615195574116, + "loss": 7.1353, + "step": 10628 + }, + { + "epoch": 0.9917887468507978, + "grad_norm": 0.5008978292663122, + "learning_rate": 0.000286022963852898, + "loss": 7.4045, + "step": 10629 + }, + { + "epoch": 0.9918820565456751, + "grad_norm": 0.6189408591489344, + "learning_rate": 0.0002860197754041887, + "loss": 7.5114, + "step": 10630 + }, + { + "epoch": 0.9919753662405524, + "grad_norm": 1.1415086182568364, + "learning_rate": 0.0002860165866096214, + "loss": 7.8788, + "step": 10631 + }, + { + "epoch": 0.9920686759354297, + "grad_norm": 19.955460875508, + "learning_rate": 0.0002860133974692042, + "loss": 7.3363, + "step": 10632 + }, + { + "epoch": 0.992161985630307, + "grad_norm": 25.532648201999066, + "learning_rate": 0.00028601020798294527, + "loss": 7.5137, + "step": 10633 + }, + { + "epoch": 0.9922552953251843, + "grad_norm": 0.3598696155216064, + "learning_rate": 0.0002860070181508526, + "loss": 7.2514, + "step": 10634 + }, + { + "epoch": 0.9923486050200616, + "grad_norm": 127.6685253692124, + "learning_rate": 0.0002860038279729344, + "loss": 7.3707, + "step": 10635 + }, + { + "epoch": 0.9924419147149389, + "grad_norm": 0.5968361938640573, + "learning_rate": 0.0002860006374491988, + "loss": 7.774, + "step": 10636 + }, + { + "epoch": 0.9925352244098162, + "grad_norm": 1.371171534640991, + "learning_rate": 0.0002859974465796538, + "loss": 7.5124, + "step": 10637 + }, + { + "epoch": 0.9926285341046934, + "grad_norm": 1.9329136183607123, + "learning_rate": 0.00028599425536430763, + "loss": 7.3047, + "step": 10638 + }, + { + "epoch": 0.9927218437995707, + "grad_norm": 1.813765725890557, + "learning_rate": 0.0002859910638031683, + "loss": 7.2512, + "step": 10639 + }, + { + "epoch": 0.992815153494448, + "grad_norm": 1.8719923525882889, + "learning_rate": 0.00028598787189624403, + "loss": 7.2537, + "step": 10640 + }, + { + "epoch": 0.9929084631893254, + "grad_norm": 0.6880812929296818, + "learning_rate": 0.00028598467964354287, + "loss": 7.3422, + "step": 10641 + }, + { + "epoch": 0.9930017728842027, + "grad_norm": 0.6438630432344875, + "learning_rate": 0.00028598148704507293, + "loss": 7.6167, + "step": 10642 + }, + { + "epoch": 0.99309508257908, + "grad_norm": 1.2383657083018593, + "learning_rate": 0.0002859782941008424, + "loss": 7.2167, + "step": 10643 + }, + { + "epoch": 0.9931883922739573, + "grad_norm": 1.8027171906025494, + "learning_rate": 0.0002859751008108593, + "loss": 7.266, + "step": 10644 + }, + { + "epoch": 0.9932817019688346, + "grad_norm": 25.31384768331808, + "learning_rate": 0.00028597190717513177, + "loss": 7.8241, + "step": 10645 + }, + { + "epoch": 0.9933750116637119, + "grad_norm": 142.58839316302638, + "learning_rate": 0.000285968713193668, + "loss": 7.6029, + "step": 10646 + }, + { + "epoch": 0.9934683213585892, + "grad_norm": 0.43422867091123973, + "learning_rate": 0.00028596551886647605, + "loss": 7.3846, + "step": 10647 + }, + { + "epoch": 0.9935616310534665, + "grad_norm": 85.1353956932201, + "learning_rate": 0.0002859623241935641, + "loss": 7.3609, + "step": 10648 + }, + { + "epoch": 0.9936549407483437, + "grad_norm": 1.4617922619803456, + "learning_rate": 0.0002859591291749402, + "loss": 7.6915, + "step": 10649 + }, + { + "epoch": 0.993748250443221, + "grad_norm": 2.317796423795297, + "learning_rate": 0.0002859559338106125, + "loss": 7.2243, + "step": 10650 + }, + { + "epoch": 0.9938415601380983, + "grad_norm": 97.8042144459655, + "learning_rate": 0.0002859527381005892, + "loss": 7.8528, + "step": 10651 + }, + { + "epoch": 0.9939348698329756, + "grad_norm": 1.2400728476805059, + "learning_rate": 0.0002859495420448783, + "loss": 7.6554, + "step": 10652 + }, + { + "epoch": 0.9940281795278529, + "grad_norm": 1.0070758384038738, + "learning_rate": 0.000285946345643488, + "loss": 7.3254, + "step": 10653 + }, + { + "epoch": 0.9941214892227302, + "grad_norm": 0.5243012209974791, + "learning_rate": 0.0002859431488964264, + "loss": 7.4888, + "step": 10654 + }, + { + "epoch": 0.9942147989176076, + "grad_norm": 1.041880825114613, + "learning_rate": 0.00028593995180370164, + "loss": 7.6164, + "step": 10655 + }, + { + "epoch": 0.9943081086124849, + "grad_norm": 2.675380644582974, + "learning_rate": 0.00028593675436532187, + "loss": 7.548, + "step": 10656 + }, + { + "epoch": 0.9944014183073622, + "grad_norm": 1.6940743006171999, + "learning_rate": 0.0002859335565812952, + "loss": 7.4981, + "step": 10657 + }, + { + "epoch": 0.9944947280022395, + "grad_norm": 13.970012757878827, + "learning_rate": 0.00028593035845162976, + "loss": 7.2795, + "step": 10658 + }, + { + "epoch": 0.9945880376971167, + "grad_norm": 1.2717903521567406, + "learning_rate": 0.00028592715997633367, + "loss": 7.3362, + "step": 10659 + }, + { + "epoch": 0.994681347391994, + "grad_norm": 1.0302061251792278, + "learning_rate": 0.00028592396115541507, + "loss": 7.393, + "step": 10660 + }, + { + "epoch": 0.9947746570868713, + "grad_norm": 9.766904958074406, + "learning_rate": 0.0002859207619888821, + "loss": 7.5515, + "step": 10661 + }, + { + "epoch": 0.9948679667817486, + "grad_norm": 0.8846023205294605, + "learning_rate": 0.000285917562476743, + "loss": 7.115, + "step": 10662 + }, + { + "epoch": 0.9949612764766259, + "grad_norm": 0.9469695954625302, + "learning_rate": 0.00028591436261900564, + "loss": 7.3822, + "step": 10663 + }, + { + "epoch": 0.9950545861715032, + "grad_norm": 0.4655815175848858, + "learning_rate": 0.0002859111624156784, + "loss": 7.3574, + "step": 10664 + }, + { + "epoch": 0.9951478958663805, + "grad_norm": 0.7284138548270872, + "learning_rate": 0.00028590796186676935, + "loss": 7.7045, + "step": 10665 + }, + { + "epoch": 0.9952412055612578, + "grad_norm": 1.1238363386567203, + "learning_rate": 0.00028590476097228656, + "loss": 7.2867, + "step": 10666 + }, + { + "epoch": 0.9953345152561351, + "grad_norm": 0.7300449735673976, + "learning_rate": 0.0002859015597322383, + "loss": 7.4261, + "step": 10667 + }, + { + "epoch": 0.9954278249510125, + "grad_norm": 225.88387373963033, + "learning_rate": 0.00028589835814663256, + "loss": 7.6755, + "step": 10668 + }, + { + "epoch": 0.9955211346458898, + "grad_norm": 0.4379735606923022, + "learning_rate": 0.00028589515621547755, + "loss": 7.4243, + "step": 10669 + }, + { + "epoch": 0.995614444340767, + "grad_norm": 1.8633093037529795, + "learning_rate": 0.00028589195393878146, + "loss": 7.5229, + "step": 10670 + }, + { + "epoch": 0.9957077540356443, + "grad_norm": 0.7293987056589157, + "learning_rate": 0.0002858887513165523, + "loss": 7.173, + "step": 10671 + }, + { + "epoch": 0.9958010637305216, + "grad_norm": 1.550667122957005, + "learning_rate": 0.0002858855483487984, + "loss": 7.3955, + "step": 10672 + }, + { + "epoch": 0.9958943734253989, + "grad_norm": 1.4451278199918969, + "learning_rate": 0.00028588234503552776, + "loss": 7.3396, + "step": 10673 + }, + { + "epoch": 0.9959876831202762, + "grad_norm": 0.5837424768894254, + "learning_rate": 0.0002858791413767486, + "loss": 7.5796, + "step": 10674 + }, + { + "epoch": 0.9960809928151535, + "grad_norm": 0.8076988338706396, + "learning_rate": 0.0002858759373724689, + "loss": 7.0811, + "step": 10675 + }, + { + "epoch": 0.9961743025100308, + "grad_norm": 2.133606287378764, + "learning_rate": 0.00028587273302269707, + "loss": 7.3724, + "step": 10676 + }, + { + "epoch": 0.9962676122049081, + "grad_norm": 0.8919287365905556, + "learning_rate": 0.0002858695283274411, + "loss": 7.3808, + "step": 10677 + }, + { + "epoch": 0.9963609218997854, + "grad_norm": 1.559638101895735, + "learning_rate": 0.00028586632328670916, + "loss": 7.6435, + "step": 10678 + }, + { + "epoch": 0.9964542315946627, + "grad_norm": 48.32891193307166, + "learning_rate": 0.0002858631179005094, + "loss": 7.703, + "step": 10679 + }, + { + "epoch": 0.9965475412895399, + "grad_norm": 2.156648332641231, + "learning_rate": 0.00028585991216885, + "loss": 7.5622, + "step": 10680 + }, + { + "epoch": 0.9966408509844172, + "grad_norm": 0.9768654447718098, + "learning_rate": 0.00028585670609173914, + "loss": 7.7191, + "step": 10681 + }, + { + "epoch": 0.9967341606792945, + "grad_norm": 1.5599846544386833, + "learning_rate": 0.0002858534996691848, + "loss": 7.2102, + "step": 10682 + }, + { + "epoch": 0.9968274703741719, + "grad_norm": 1.5214219777381675, + "learning_rate": 0.00028585029290119536, + "loss": 7.1557, + "step": 10683 + }, + { + "epoch": 0.9969207800690492, + "grad_norm": 1.2442339945076977, + "learning_rate": 0.0002858470857877788, + "loss": 6.9503, + "step": 10684 + }, + { + "epoch": 0.9970140897639265, + "grad_norm": 1.2970561539830856, + "learning_rate": 0.00028584387832894336, + "loss": 7.3537, + "step": 10685 + }, + { + "epoch": 0.9971073994588038, + "grad_norm": 1.4004152969071257, + "learning_rate": 0.00028584067052469717, + "loss": 7.4684, + "step": 10686 + }, + { + "epoch": 0.9972007091536811, + "grad_norm": 1.7276707346141393, + "learning_rate": 0.0002858374623750484, + "loss": 7.5368, + "step": 10687 + }, + { + "epoch": 0.9972940188485584, + "grad_norm": 1.6328935763952341, + "learning_rate": 0.0002858342538800052, + "loss": 7.5446, + "step": 10688 + }, + { + "epoch": 0.9973873285434357, + "grad_norm": 1.0713994342913786, + "learning_rate": 0.00028583104503957576, + "loss": 7.5181, + "step": 10689 + }, + { + "epoch": 0.997480638238313, + "grad_norm": 0.5453315000514981, + "learning_rate": 0.00028582783585376814, + "loss": 7.6376, + "step": 10690 + }, + { + "epoch": 0.9975739479331902, + "grad_norm": 0.6753775557789283, + "learning_rate": 0.00028582462632259064, + "loss": 7.3911, + "step": 10691 + }, + { + "epoch": 0.9976672576280675, + "grad_norm": 50.69475177966831, + "learning_rate": 0.00028582141644605127, + "loss": 7.2218, + "step": 10692 + }, + { + "epoch": 0.9977605673229448, + "grad_norm": 90.9335439907186, + "learning_rate": 0.0002858182062241583, + "loss": 7.2841, + "step": 10693 + }, + { + "epoch": 0.9978538770178221, + "grad_norm": 1.457941249057375, + "learning_rate": 0.00028581499565691993, + "loss": 7.2368, + "step": 10694 + }, + { + "epoch": 0.9979471867126994, + "grad_norm": 1.734183876468408, + "learning_rate": 0.0002858117847443442, + "loss": 7.0921, + "step": 10695 + }, + { + "epoch": 0.9980404964075767, + "grad_norm": 1.6911425624422343, + "learning_rate": 0.00028580857348643935, + "loss": 7.4949, + "step": 10696 + }, + { + "epoch": 0.9981338061024541, + "grad_norm": 1.488257214556626, + "learning_rate": 0.0002858053618832135, + "loss": 7.3981, + "step": 10697 + }, + { + "epoch": 0.9982271157973314, + "grad_norm": 1.4238291630626914, + "learning_rate": 0.0002858021499346749, + "loss": 7.3184, + "step": 10698 + }, + { + "epoch": 0.9983204254922087, + "grad_norm": 1.2464989182721262, + "learning_rate": 0.0002857989376408316, + "loss": 7.111, + "step": 10699 + }, + { + "epoch": 0.998413735187086, + "grad_norm": 0.9430088042005976, + "learning_rate": 0.00028579572500169186, + "loss": 7.6251, + "step": 10700 + }, + { + "epoch": 0.9985070448819633, + "grad_norm": 2.3683001360868943, + "learning_rate": 0.0002857925120172638, + "loss": 7.5576, + "step": 10701 + }, + { + "epoch": 0.9986003545768405, + "grad_norm": 0.4822768087987962, + "learning_rate": 0.00028578929868755564, + "loss": 7.3604, + "step": 10702 + }, + { + "epoch": 0.9986936642717178, + "grad_norm": 1.0517486186451135, + "learning_rate": 0.00028578608501257543, + "loss": 7.5537, + "step": 10703 + }, + { + "epoch": 0.9987869739665951, + "grad_norm": 857.8320549172315, + "learning_rate": 0.00028578287099233153, + "loss": 7.1153, + "step": 10704 + }, + { + "epoch": 0.9988802836614724, + "grad_norm": 2.1044854376233437, + "learning_rate": 0.00028577965662683196, + "loss": 7.3544, + "step": 10705 + }, + { + "epoch": 0.9989735933563497, + "grad_norm": 2.5400843802240307, + "learning_rate": 0.00028577644191608495, + "loss": 7.4872, + "step": 10706 + }, + { + "epoch": 0.999066903051227, + "grad_norm": 2.08140530191053, + "learning_rate": 0.00028577322686009865, + "loss": 7.5123, + "step": 10707 + }, + { + "epoch": 0.9991602127461043, + "grad_norm": 2548.2803562284275, + "learning_rate": 0.00028577001145888134, + "loss": 7.3307, + "step": 10708 + }, + { + "epoch": 0.9992535224409816, + "grad_norm": 8340.480243271973, + "learning_rate": 0.00028576679571244105, + "loss": 7.0678, + "step": 10709 + }, + { + "epoch": 0.999346832135859, + "grad_norm": 0.8152206859869957, + "learning_rate": 0.000285763579620786, + "loss": 7.5475, + "step": 10710 + }, + { + "epoch": 0.9994401418307363, + "grad_norm": 1.7551490850859282, + "learning_rate": 0.00028576036318392437, + "loss": 7.0502, + "step": 10711 + }, + { + "epoch": 0.9995334515256135, + "grad_norm": 2.256440555843083, + "learning_rate": 0.00028575714640186435, + "loss": 7.5677, + "step": 10712 + }, + { + "epoch": 0.9996267612204908, + "grad_norm": 1.6301314047317383, + "learning_rate": 0.00028575392927461415, + "loss": 7.3527, + "step": 10713 + }, + { + "epoch": 0.9997200709153681, + "grad_norm": 3.0730007019923935, + "learning_rate": 0.00028575071180218195, + "loss": 7.5427, + "step": 10714 + }, + { + "epoch": 0.9998133806102454, + "grad_norm": 2.928528927893728, + "learning_rate": 0.0002857474939845759, + "loss": 7.5155, + "step": 10715 + }, + { + "epoch": 0.9999066903051227, + "grad_norm": 2187.370666760068, + "learning_rate": 0.0002857442758218041, + "loss": 7.5852, + "step": 10716 + }, + { + "epoch": 1.0, + "grad_norm": 9556.51593971104, + "learning_rate": 0.0002857410573138749, + "loss": 7.6014, + "step": 10717 + }, + { + "epoch": 1.0, + "eval_loss": 7.352176189422607, + "eval_runtime": 26.9523, + "eval_samples_per_second": 6.567, + "eval_steps_per_second": 6.567, + "step": 10717 + }, + { + "epoch": 1.0000933096948772, + "grad_norm": 1.4803808201918611, + "learning_rate": 0.00028573783846079634, + "loss": 7.5938, + "step": 10718 + }, + { + "epoch": 1.0001866193897546, + "grad_norm": 2.802278280656657, + "learning_rate": 0.0002857346192625767, + "loss": 7.5903, + "step": 10719 + }, + { + "epoch": 1.0002799290846318, + "grad_norm": 3.047289215264011, + "learning_rate": 0.00028573139971922414, + "loss": 7.4386, + "step": 10720 + }, + { + "epoch": 1.0003732387795092, + "grad_norm": 1.531393519241992, + "learning_rate": 0.0002857281798307468, + "loss": 7.4695, + "step": 10721 + }, + { + "epoch": 1.0004665484743864, + "grad_norm": 1.4635689215777097, + "learning_rate": 0.000285724959597153, + "loss": 7.5364, + "step": 10722 + }, + { + "epoch": 1.0005598581692638, + "grad_norm": 996989.5462137581, + "learning_rate": 0.0002857217390184507, + "loss": 7.3993, + "step": 10723 + }, + { + "epoch": 1.000653167864141, + "grad_norm": 103473.26738610632, + "learning_rate": 0.0002857185180946483, + "loss": 7.4427, + "step": 10724 + }, + { + "epoch": 1.0007464775590185, + "grad_norm": 3.16942470597472, + "learning_rate": 0.0002857152968257539, + "loss": 7.6166, + "step": 10725 + }, + { + "epoch": 1.0008397872538957, + "grad_norm": 3.1964374201629977, + "learning_rate": 0.0002857120752117757, + "loss": 8.2591, + "step": 10726 + }, + { + "epoch": 1.000933096948773, + "grad_norm": 783801.2166380131, + "learning_rate": 0.00028570885325272186, + "loss": 8.3891, + "step": 10727 + }, + { + "epoch": 1.0010264066436503, + "grad_norm": 28.263369097889807, + "learning_rate": 0.0002857056309486007, + "loss": 8.4263, + "step": 10728 + }, + { + "epoch": 1.0011197163385275, + "grad_norm": 39.677382280445435, + "learning_rate": 0.00028570240829942024, + "loss": 8.5336, + "step": 10729 + }, + { + "epoch": 1.001213026033405, + "grad_norm": 38.26827040357237, + "learning_rate": 0.0002856991853051887, + "loss": 8.2294, + "step": 10730 + }, + { + "epoch": 1.001306335728282, + "grad_norm": 115.57291435609356, + "learning_rate": 0.00028569596196591446, + "loss": 7.9849, + "step": 10731 + }, + { + "epoch": 1.0013996454231595, + "grad_norm": 268024.2009859327, + "learning_rate": 0.0002856927382816055, + "loss": 7.9726, + "step": 10732 + }, + { + "epoch": 1.0014929551180367, + "grad_norm": 2.637491959091861, + "learning_rate": 0.0002856895142522701, + "loss": 7.9478, + "step": 10733 + }, + { + "epoch": 1.0015862648129141, + "grad_norm": 4.228814291416715, + "learning_rate": 0.00028568628987791654, + "loss": 7.6922, + "step": 10734 + }, + { + "epoch": 1.0016795745077913, + "grad_norm": 4.273562941717757, + "learning_rate": 0.0002856830651585529, + "loss": 7.8223, + "step": 10735 + }, + { + "epoch": 1.0017728842026687, + "grad_norm": 4.58673318587689, + "learning_rate": 0.00028567984009418736, + "loss": 7.6787, + "step": 10736 + }, + { + "epoch": 1.001866193897546, + "grad_norm": 4.217008500720973, + "learning_rate": 0.0002856766146848282, + "loss": 7.9955, + "step": 10737 + }, + { + "epoch": 1.0019595035924234, + "grad_norm": 2.464021036275249, + "learning_rate": 0.00028567338893048364, + "loss": 7.5921, + "step": 10738 + }, + { + "epoch": 1.0020528132873006, + "grad_norm": 1.2215309254536932, + "learning_rate": 0.0002856701628311618, + "loss": 7.5531, + "step": 10739 + }, + { + "epoch": 1.0021461229821778, + "grad_norm": 0.7954275714724217, + "learning_rate": 0.000285666936386871, + "loss": 7.2259, + "step": 10740 + }, + { + "epoch": 1.0022394326770552, + "grad_norm": 0.6518883896424101, + "learning_rate": 0.0002856637095976193, + "loss": 7.467, + "step": 10741 + }, + { + "epoch": 1.0023327423719324, + "grad_norm": 1.8777768554095597, + "learning_rate": 0.00028566048246341495, + "loss": 7.3474, + "step": 10742 + }, + { + "epoch": 1.0024260520668098, + "grad_norm": 1.7415042776871845, + "learning_rate": 0.0002856572549842662, + "loss": 7.6975, + "step": 10743 + }, + { + "epoch": 1.002519361761687, + "grad_norm": 2.159239484702046, + "learning_rate": 0.00028565402716018126, + "loss": 7.5428, + "step": 10744 + }, + { + "epoch": 1.0026126714565644, + "grad_norm": 3.06684558454932, + "learning_rate": 0.00028565079899116834, + "loss": 7.6594, + "step": 10745 + }, + { + "epoch": 1.0027059811514416, + "grad_norm": 2.642897144955155, + "learning_rate": 0.00028564757047723553, + "loss": 7.393, + "step": 10746 + }, + { + "epoch": 1.002799290846319, + "grad_norm": 3.10995254075954, + "learning_rate": 0.00028564434161839117, + "loss": 7.3062, + "step": 10747 + }, + { + "epoch": 1.0028926005411962, + "grad_norm": 2.330482434580132, + "learning_rate": 0.0002856411124146435, + "loss": 7.6593, + "step": 10748 + }, + { + "epoch": 1.0029859102360734, + "grad_norm": 0.8369461758943134, + "learning_rate": 0.00028563788286600057, + "loss": 7.5545, + "step": 10749 + }, + { + "epoch": 1.0030792199309508, + "grad_norm": 0.5950121464750983, + "learning_rate": 0.00028563465297247074, + "loss": 7.3872, + "step": 10750 + }, + { + "epoch": 1.003172529625828, + "grad_norm": 3.580267351209202, + "learning_rate": 0.0002856314227340621, + "loss": 7.8289, + "step": 10751 + }, + { + "epoch": 1.0032658393207055, + "grad_norm": 3.5253893938879757, + "learning_rate": 0.000285628192150783, + "loss": 7.7919, + "step": 10752 + }, + { + "epoch": 1.0033591490155827, + "grad_norm": 2.2955305545595777, + "learning_rate": 0.0002856249612226415, + "loss": 7.3735, + "step": 10753 + }, + { + "epoch": 1.00345245871046, + "grad_norm": 0.9270553393270854, + "learning_rate": 0.000285621729949646, + "loss": 7.2877, + "step": 10754 + }, + { + "epoch": 1.0035457684053373, + "grad_norm": 1.3495996881412446, + "learning_rate": 0.00028561849833180455, + "loss": 7.463, + "step": 10755 + }, + { + "epoch": 1.0036390781002147, + "grad_norm": 0.7622835178907269, + "learning_rate": 0.0002856152663691255, + "loss": 7.3187, + "step": 10756 + }, + { + "epoch": 1.0037323877950919, + "grad_norm": 0.6788360497431695, + "learning_rate": 0.0002856120340616169, + "loss": 7.5257, + "step": 10757 + }, + { + "epoch": 1.0038256974899693, + "grad_norm": 0.8199609371454137, + "learning_rate": 0.00028560880140928715, + "loss": 7.6647, + "step": 10758 + }, + { + "epoch": 1.0039190071848465, + "grad_norm": 0.9348119590982811, + "learning_rate": 0.00028560556841214437, + "loss": 7.5803, + "step": 10759 + }, + { + "epoch": 1.0040123168797237, + "grad_norm": 1.61608956391667, + "learning_rate": 0.0002856023350701968, + "loss": 7.539, + "step": 10760 + }, + { + "epoch": 1.0041056265746011, + "grad_norm": 2.159904445304428, + "learning_rate": 0.00028559910138345256, + "loss": 7.2233, + "step": 10761 + }, + { + "epoch": 1.0041989362694783, + "grad_norm": 1.3943760970184638, + "learning_rate": 0.0002855958673519201, + "loss": 7.4351, + "step": 10762 + }, + { + "epoch": 1.0042922459643557, + "grad_norm": 0.9870565022193482, + "learning_rate": 0.00028559263297560746, + "loss": 7.4559, + "step": 10763 + }, + { + "epoch": 1.004385555659233, + "grad_norm": 0.8361787989672, + "learning_rate": 0.00028558939825452294, + "loss": 7.7811, + "step": 10764 + }, + { + "epoch": 1.0044788653541104, + "grad_norm": 0.7765630803409989, + "learning_rate": 0.00028558616318867476, + "loss": 7.6153, + "step": 10765 + }, + { + "epoch": 1.0045721750489875, + "grad_norm": 1.5358774412066052, + "learning_rate": 0.0002855829277780711, + "loss": 7.742, + "step": 10766 + }, + { + "epoch": 1.004665484743865, + "grad_norm": 1.6287441267788034, + "learning_rate": 0.0002855796920227202, + "loss": 7.5868, + "step": 10767 + }, + { + "epoch": 1.0047587944387422, + "grad_norm": 1.2273375084759084, + "learning_rate": 0.00028557645592263033, + "loss": 7.5445, + "step": 10768 + }, + { + "epoch": 1.0048521041336196, + "grad_norm": 0.6034445652126826, + "learning_rate": 0.0002855732194778097, + "loss": 7.3914, + "step": 10769 + }, + { + "epoch": 1.0049454138284968, + "grad_norm": 0.8395671430508409, + "learning_rate": 0.00028556998268826654, + "loss": 7.5937, + "step": 10770 + }, + { + "epoch": 1.005038723523374, + "grad_norm": 0.9986044401906704, + "learning_rate": 0.00028556674555400906, + "loss": 7.8786, + "step": 10771 + }, + { + "epoch": 1.0051320332182514, + "grad_norm": 0.7713042926719917, + "learning_rate": 0.0002855635080750455, + "loss": 7.6051, + "step": 10772 + }, + { + "epoch": 1.0052253429131286, + "grad_norm": 0.6339656414240445, + "learning_rate": 0.0002855602702513841, + "loss": 7.7101, + "step": 10773 + }, + { + "epoch": 1.005318652608006, + "grad_norm": 2.315650947501002, + "learning_rate": 0.0002855570320830331, + "loss": 7.3912, + "step": 10774 + }, + { + "epoch": 1.0054119623028832, + "grad_norm": 0.9980472748170048, + "learning_rate": 0.0002855537935700007, + "loss": 7.4553, + "step": 10775 + }, + { + "epoch": 1.0055052719977606, + "grad_norm": 0.9698224914322675, + "learning_rate": 0.00028555055471229517, + "loss": 7.4445, + "step": 10776 + }, + { + "epoch": 1.0055985816926378, + "grad_norm": 0.7162115292001089, + "learning_rate": 0.00028554731550992475, + "loss": 7.5013, + "step": 10777 + }, + { + "epoch": 1.0056918913875152, + "grad_norm": 0.6448561493035562, + "learning_rate": 0.00028554407596289764, + "loss": 7.253, + "step": 10778 + }, + { + "epoch": 1.0057852010823924, + "grad_norm": 0.6554761622359307, + "learning_rate": 0.0002855408360712221, + "loss": 7.5214, + "step": 10779 + }, + { + "epoch": 1.0058785107772699, + "grad_norm": 2.352638641214763, + "learning_rate": 0.0002855375958349063, + "loss": 7.8655, + "step": 10780 + }, + { + "epoch": 1.005971820472147, + "grad_norm": 1.3073075008435495, + "learning_rate": 0.0002855343552539586, + "loss": 7.4336, + "step": 10781 + }, + { + "epoch": 1.0060651301670243, + "grad_norm": 1.1438553690339166, + "learning_rate": 0.0002855311143283872, + "loss": 7.5756, + "step": 10782 + }, + { + "epoch": 1.0061584398619017, + "grad_norm": 0.8986013907739673, + "learning_rate": 0.0002855278730582003, + "loss": 7.4428, + "step": 10783 + }, + { + "epoch": 1.0062517495567789, + "grad_norm": 0.5391170130151243, + "learning_rate": 0.0002855246314434062, + "loss": 7.5274, + "step": 10784 + }, + { + "epoch": 1.0063450592516563, + "grad_norm": 1.2190590804148345, + "learning_rate": 0.0002855213894840131, + "loss": 7.28, + "step": 10785 + }, + { + "epoch": 1.0064383689465335, + "grad_norm": 0.9709008479467118, + "learning_rate": 0.0002855181471800292, + "loss": 7.5327, + "step": 10786 + }, + { + "epoch": 1.006531678641411, + "grad_norm": 0.5492935455760292, + "learning_rate": 0.0002855149045314628, + "loss": 7.6114, + "step": 10787 + }, + { + "epoch": 1.006624988336288, + "grad_norm": 1.3062125391944324, + "learning_rate": 0.0002855116615383222, + "loss": 7.3241, + "step": 10788 + }, + { + "epoch": 1.0067182980311655, + "grad_norm": 0.6432287200960713, + "learning_rate": 0.00028550841820061554, + "loss": 7.4723, + "step": 10789 + }, + { + "epoch": 1.0068116077260427, + "grad_norm": 1.2228617832164599, + "learning_rate": 0.00028550517451835113, + "loss": 7.8101, + "step": 10790 + }, + { + "epoch": 1.0069049174209201, + "grad_norm": 0.4828706711781116, + "learning_rate": 0.0002855019304915372, + "loss": 7.3531, + "step": 10791 + }, + { + "epoch": 1.0069982271157973, + "grad_norm": 0.5031012770230282, + "learning_rate": 0.000285498686120182, + "loss": 7.2407, + "step": 10792 + }, + { + "epoch": 1.0070915368106745, + "grad_norm": 0.40981665803283707, + "learning_rate": 0.0002854954414042938, + "loss": 7.2608, + "step": 10793 + }, + { + "epoch": 1.007184846505552, + "grad_norm": 0.5938742140635345, + "learning_rate": 0.00028549219634388075, + "loss": 7.4224, + "step": 10794 + }, + { + "epoch": 1.0072781562004292, + "grad_norm": 0.5777920160421117, + "learning_rate": 0.00028548895093895126, + "loss": 7.3428, + "step": 10795 + }, + { + "epoch": 1.0073714658953066, + "grad_norm": 0.897681542276873, + "learning_rate": 0.0002854857051895135, + "loss": 7.4015, + "step": 10796 + }, + { + "epoch": 1.0074647755901838, + "grad_norm": 1.3704686555919106, + "learning_rate": 0.00028548245909557567, + "loss": 8.2424, + "step": 10797 + }, + { + "epoch": 1.0075580852850612, + "grad_norm": 0.7246717595672969, + "learning_rate": 0.0002854792126571461, + "loss": 7.6462, + "step": 10798 + }, + { + "epoch": 1.0076513949799384, + "grad_norm": 0.5041776276106821, + "learning_rate": 0.00028547596587423307, + "loss": 7.6463, + "step": 10799 + }, + { + "epoch": 1.0077447046748158, + "grad_norm": 1.049834335163842, + "learning_rate": 0.00028547271874684475, + "loss": 7.4148, + "step": 10800 + }, + { + "epoch": 1.007838014369693, + "grad_norm": 1.6215236930367545, + "learning_rate": 0.0002854694712749894, + "loss": 7.2355, + "step": 10801 + }, + { + "epoch": 1.0079313240645702, + "grad_norm": 1.2691888345105784, + "learning_rate": 0.0002854662234586754, + "loss": 7.2664, + "step": 10802 + }, + { + "epoch": 1.0080246337594476, + "grad_norm": 0.9282112956269796, + "learning_rate": 0.0002854629752979109, + "loss": 7.3673, + "step": 10803 + }, + { + "epoch": 1.0081179434543248, + "grad_norm": 0.7899451454518432, + "learning_rate": 0.00028545972679270415, + "loss": 7.6057, + "step": 10804 + }, + { + "epoch": 1.0082112531492022, + "grad_norm": 0.5705319860115114, + "learning_rate": 0.00028545647794306345, + "loss": 7.3019, + "step": 10805 + }, + { + "epoch": 1.0083045628440794, + "grad_norm": 2.2309163150508864, + "learning_rate": 0.00028545322874899703, + "loss": 8.1722, + "step": 10806 + }, + { + "epoch": 1.0083978725389569, + "grad_norm": 1.14546751900516, + "learning_rate": 0.0002854499792105132, + "loss": 7.483, + "step": 10807 + }, + { + "epoch": 1.008491182233834, + "grad_norm": 0.8163722486894929, + "learning_rate": 0.0002854467293276202, + "loss": 7.4415, + "step": 10808 + }, + { + "epoch": 1.0085844919287115, + "grad_norm": 0.43262201556001395, + "learning_rate": 0.0002854434791003263, + "loss": 7.3318, + "step": 10809 + }, + { + "epoch": 1.0086778016235887, + "grad_norm": 0.7709442929304645, + "learning_rate": 0.00028544022852863973, + "loss": 7.2535, + "step": 10810 + }, + { + "epoch": 1.008771111318466, + "grad_norm": 0.5457331027719329, + "learning_rate": 0.0002854369776125688, + "loss": 7.6935, + "step": 10811 + }, + { + "epoch": 1.0088644210133433, + "grad_norm": 0.5371805897710399, + "learning_rate": 0.0002854337263521217, + "loss": 7.7708, + "step": 10812 + }, + { + "epoch": 1.0089577307082205, + "grad_norm": 2.185024330146065, + "learning_rate": 0.00028543047474730677, + "loss": 7.2355, + "step": 10813 + }, + { + "epoch": 1.009051040403098, + "grad_norm": 1.261313860569478, + "learning_rate": 0.0002854272227981323, + "loss": 7.2635, + "step": 10814 + }, + { + "epoch": 1.009144350097975, + "grad_norm": 1.0771160986022925, + "learning_rate": 0.00028542397050460646, + "loss": 7.2554, + "step": 10815 + }, + { + "epoch": 1.0092376597928525, + "grad_norm": 1.321204958014481, + "learning_rate": 0.00028542071786673767, + "loss": 7.9708, + "step": 10816 + }, + { + "epoch": 1.0093309694877297, + "grad_norm": 0.6621584213352806, + "learning_rate": 0.000285417464884534, + "loss": 7.4664, + "step": 10817 + }, + { + "epoch": 1.0094242791826071, + "grad_norm": 0.6572436056789596, + "learning_rate": 0.0002854142115580039, + "loss": 7.422, + "step": 10818 + }, + { + "epoch": 1.0095175888774843, + "grad_norm": 1.2384792422944477, + "learning_rate": 0.0002854109578871556, + "loss": 7.8093, + "step": 10819 + }, + { + "epoch": 1.0096108985723617, + "grad_norm": 0.8696924932463037, + "learning_rate": 0.0002854077038719973, + "loss": 7.6674, + "step": 10820 + }, + { + "epoch": 1.009704208267239, + "grad_norm": 0.6895450680750893, + "learning_rate": 0.00028540444951253733, + "loss": 7.3579, + "step": 10821 + }, + { + "epoch": 1.0097975179621164, + "grad_norm": 0.5056459537612498, + "learning_rate": 0.000285401194808784, + "loss": 7.6236, + "step": 10822 + }, + { + "epoch": 1.0098908276569936, + "grad_norm": 0.47631688875050193, + "learning_rate": 0.00028539793976074546, + "loss": 7.7094, + "step": 10823 + }, + { + "epoch": 1.0099841373518708, + "grad_norm": 1.7172400790749747, + "learning_rate": 0.00028539468436843005, + "loss": 7.1597, + "step": 10824 + }, + { + "epoch": 1.0100774470467482, + "grad_norm": 0.9412276236563394, + "learning_rate": 0.00028539142863184613, + "loss": 7.4859, + "step": 10825 + }, + { + "epoch": 1.0101707567416254, + "grad_norm": 1.1657434371295623, + "learning_rate": 0.00028538817255100194, + "loss": 7.1887, + "step": 10826 + }, + { + "epoch": 1.0102640664365028, + "grad_norm": 0.45840729651250384, + "learning_rate": 0.0002853849161259057, + "loss": 7.4686, + "step": 10827 + }, + { + "epoch": 1.01035737613138, + "grad_norm": 0.6245630221228273, + "learning_rate": 0.0002853816593565657, + "loss": 7.2624, + "step": 10828 + }, + { + "epoch": 1.0104506858262574, + "grad_norm": 2.1857611230686076, + "learning_rate": 0.00028537840224299025, + "loss": 7.7256, + "step": 10829 + }, + { + "epoch": 1.0105439955211346, + "grad_norm": 1.5222765934738194, + "learning_rate": 0.00028537514478518757, + "loss": 7.62, + "step": 10830 + }, + { + "epoch": 1.010637305216012, + "grad_norm": 1.6670207670623514, + "learning_rate": 0.0002853718869831661, + "loss": 7.741, + "step": 10831 + }, + { + "epoch": 1.0107306149108892, + "grad_norm": 0.9551741295474273, + "learning_rate": 0.00028536862883693395, + "loss": 7.6316, + "step": 10832 + }, + { + "epoch": 1.0108239246057666, + "grad_norm": 0.905490906750529, + "learning_rate": 0.0002853653703464995, + "loss": 7.4829, + "step": 10833 + }, + { + "epoch": 1.0109172343006438, + "grad_norm": 1.6447437365005662, + "learning_rate": 0.000285362111511871, + "loss": 7.2726, + "step": 10834 + }, + { + "epoch": 1.011010543995521, + "grad_norm": 1.551310097341681, + "learning_rate": 0.0002853588523330568, + "loss": 7.4675, + "step": 10835 + }, + { + "epoch": 1.0111038536903985, + "grad_norm": 1.1144357751173206, + "learning_rate": 0.00028535559281006514, + "loss": 7.4926, + "step": 10836 + }, + { + "epoch": 1.0111971633852757, + "grad_norm": 0.7468497383128376, + "learning_rate": 0.0002853523329429042, + "loss": 7.6178, + "step": 10837 + }, + { + "epoch": 1.011290473080153, + "grad_norm": 0.8096497347232336, + "learning_rate": 0.0002853490727315825, + "loss": 7.3291, + "step": 10838 + }, + { + "epoch": 1.0113837827750303, + "grad_norm": 0.6065147725494352, + "learning_rate": 0.0002853458121761081, + "loss": 7.0906, + "step": 10839 + }, + { + "epoch": 1.0114770924699077, + "grad_norm": 1.8122612526185107, + "learning_rate": 0.0002853425512764894, + "loss": 7.6825, + "step": 10840 + }, + { + "epoch": 1.0115704021647849, + "grad_norm": 0.5877418289106674, + "learning_rate": 0.00028533929003273477, + "loss": 7.1968, + "step": 10841 + }, + { + "epoch": 1.0116637118596623, + "grad_norm": 0.9211275724978683, + "learning_rate": 0.0002853360284448523, + "loss": 7.5691, + "step": 10842 + }, + { + "epoch": 1.0117570215545395, + "grad_norm": 0.9077531234811155, + "learning_rate": 0.00028533276651285044, + "loss": 7.4117, + "step": 10843 + }, + { + "epoch": 1.011850331249417, + "grad_norm": 0.6486706334208552, + "learning_rate": 0.0002853295042367375, + "loss": 7.4905, + "step": 10844 + }, + { + "epoch": 1.0119436409442941, + "grad_norm": 1.327877946686201, + "learning_rate": 0.00028532624161652163, + "loss": 7.1721, + "step": 10845 + }, + { + "epoch": 1.0120369506391713, + "grad_norm": 0.6428308436685658, + "learning_rate": 0.00028532297865221126, + "loss": 7.5501, + "step": 10846 + }, + { + "epoch": 1.0121302603340487, + "grad_norm": 1.3058910634756202, + "learning_rate": 0.00028531971534381464, + "loss": 7.2136, + "step": 10847 + }, + { + "epoch": 1.012223570028926, + "grad_norm": 0.5000228209517522, + "learning_rate": 0.0002853164516913401, + "loss": 7.5027, + "step": 10848 + }, + { + "epoch": 1.0123168797238034, + "grad_norm": 0.498161553600216, + "learning_rate": 0.0002853131876947959, + "loss": 7.5784, + "step": 10849 + }, + { + "epoch": 1.0124101894186806, + "grad_norm": 0.7715374974128861, + "learning_rate": 0.0002853099233541903, + "loss": 7.4997, + "step": 10850 + }, + { + "epoch": 1.012503499113558, + "grad_norm": 0.3319815778588858, + "learning_rate": 0.00028530665866953166, + "loss": 7.1923, + "step": 10851 + }, + { + "epoch": 1.0125968088084352, + "grad_norm": 0.517535490305854, + "learning_rate": 0.00028530339364082826, + "loss": 7.4477, + "step": 10852 + }, + { + "epoch": 1.0126901185033126, + "grad_norm": 0.7040403182071572, + "learning_rate": 0.0002853001282680885, + "loss": 7.1544, + "step": 10853 + }, + { + "epoch": 1.0127834281981898, + "grad_norm": 1.529873127111403, + "learning_rate": 0.00028529686255132045, + "loss": 7.9747, + "step": 10854 + }, + { + "epoch": 1.012876737893067, + "grad_norm": 0.46135703949290113, + "learning_rate": 0.00028529359649053263, + "loss": 7.3241, + "step": 10855 + }, + { + "epoch": 1.0129700475879444, + "grad_norm": 0.7376635036011437, + "learning_rate": 0.00028529033008573327, + "loss": 7.1765, + "step": 10856 + }, + { + "epoch": 1.0130633572828216, + "grad_norm": 0.3654531432281618, + "learning_rate": 0.00028528706333693065, + "loss": 7.5077, + "step": 10857 + }, + { + "epoch": 1.013156666977699, + "grad_norm": 0.4772187891766016, + "learning_rate": 0.0002852837962441332, + "loss": 7.5075, + "step": 10858 + }, + { + "epoch": 1.0132499766725762, + "grad_norm": 0.3226825587349275, + "learning_rate": 0.00028528052880734904, + "loss": 7.3889, + "step": 10859 + }, + { + "epoch": 1.0133432863674536, + "grad_norm": 0.3714538922312381, + "learning_rate": 0.0002852772610265866, + "loss": 7.3958, + "step": 10860 + }, + { + "epoch": 1.0134365960623308, + "grad_norm": 0.6957012621928443, + "learning_rate": 0.0002852739929018541, + "loss": 7.5872, + "step": 10861 + }, + { + "epoch": 1.0135299057572082, + "grad_norm": 0.8045404397135821, + "learning_rate": 0.00028527072443315996, + "loss": 7.5712, + "step": 10862 + }, + { + "epoch": 1.0136232154520854, + "grad_norm": 0.7954715509072315, + "learning_rate": 0.0002852674556205124, + "loss": 7.2922, + "step": 10863 + }, + { + "epoch": 1.0137165251469629, + "grad_norm": 0.4777944777876853, + "learning_rate": 0.00028526418646391985, + "loss": 7.5959, + "step": 10864 + }, + { + "epoch": 1.01380983484184, + "grad_norm": 0.8296153417721878, + "learning_rate": 0.0002852609169633905, + "loss": 7.4167, + "step": 10865 + }, + { + "epoch": 1.0139031445367173, + "grad_norm": 0.4033001559338575, + "learning_rate": 0.0002852576471189327, + "loss": 7.4596, + "step": 10866 + }, + { + "epoch": 1.0139964542315947, + "grad_norm": 0.4252004280464613, + "learning_rate": 0.0002852543769305547, + "loss": 7.6163, + "step": 10867 + }, + { + "epoch": 1.0140897639264719, + "grad_norm": 0.603319038584652, + "learning_rate": 0.00028525110639826496, + "loss": 7.6557, + "step": 10868 + }, + { + "epoch": 1.0141830736213493, + "grad_norm": 1.295357519570936, + "learning_rate": 0.0002852478355220717, + "loss": 7.636, + "step": 10869 + }, + { + "epoch": 1.0142763833162265, + "grad_norm": 0.7425344223844484, + "learning_rate": 0.0002852445643019833, + "loss": 7.4916, + "step": 10870 + }, + { + "epoch": 1.014369693011104, + "grad_norm": 0.5242471668482309, + "learning_rate": 0.00028524129273800803, + "loss": 7.3788, + "step": 10871 + }, + { + "epoch": 1.014463002705981, + "grad_norm": 0.8016505373837998, + "learning_rate": 0.00028523802083015417, + "loss": 7.2329, + "step": 10872 + }, + { + "epoch": 1.0145563124008585, + "grad_norm": 0.8493979533574211, + "learning_rate": 0.0002852347485784301, + "loss": 7.7905, + "step": 10873 + }, + { + "epoch": 1.0146496220957357, + "grad_norm": 0.9664774977838588, + "learning_rate": 0.0002852314759828441, + "loss": 7.7483, + "step": 10874 + }, + { + "epoch": 1.0147429317906131, + "grad_norm": 0.3416864638552463, + "learning_rate": 0.0002852282030434046, + "loss": 7.4365, + "step": 10875 + }, + { + "epoch": 1.0148362414854903, + "grad_norm": 0.636595723651732, + "learning_rate": 0.00028522492976011974, + "loss": 7.1813, + "step": 10876 + }, + { + "epoch": 1.0149295511803675, + "grad_norm": 0.4367052012122534, + "learning_rate": 0.0002852216561329979, + "loss": 7.349, + "step": 10877 + }, + { + "epoch": 1.015022860875245, + "grad_norm": 1.0356479466198487, + "learning_rate": 0.0002852183821620476, + "loss": 7.1521, + "step": 10878 + }, + { + "epoch": 1.0151161705701222, + "grad_norm": 1.0562951543163153, + "learning_rate": 0.0002852151078472769, + "loss": 7.109, + "step": 10879 + }, + { + "epoch": 1.0152094802649996, + "grad_norm": 0.47837584727353266, + "learning_rate": 0.00028521183318869426, + "loss": 7.3823, + "step": 10880 + }, + { + "epoch": 1.0153027899598768, + "grad_norm": 1.6791851441245853, + "learning_rate": 0.000285208558186308, + "loss": 7.3501, + "step": 10881 + }, + { + "epoch": 1.0153960996547542, + "grad_norm": 1.5098958589743403, + "learning_rate": 0.00028520528284012645, + "loss": 7.6921, + "step": 10882 + }, + { + "epoch": 1.0154894093496314, + "grad_norm": 1.6594187308387827, + "learning_rate": 0.0002852020071501579, + "loss": 7.8049, + "step": 10883 + }, + { + "epoch": 1.0155827190445088, + "grad_norm": 0.6823807003301186, + "learning_rate": 0.0002851987311164106, + "loss": 7.4597, + "step": 10884 + }, + { + "epoch": 1.015676028739386, + "grad_norm": 0.9739282027638034, + "learning_rate": 0.0002851954547388931, + "loss": 7.3495, + "step": 10885 + }, + { + "epoch": 1.0157693384342634, + "grad_norm": 1.5917747760331715, + "learning_rate": 0.00028519217801761354, + "loss": 7.4744, + "step": 10886 + }, + { + "epoch": 1.0158626481291406, + "grad_norm": 1.865706132241857, + "learning_rate": 0.00028518890095258036, + "loss": 7.6117, + "step": 10887 + }, + { + "epoch": 1.0159559578240178, + "grad_norm": 0.4836279952395352, + "learning_rate": 0.0002851856235438018, + "loss": 7.7532, + "step": 10888 + }, + { + "epoch": 1.0160492675188952, + "grad_norm": 0.5413044730962029, + "learning_rate": 0.00028518234579128627, + "loss": 7.406, + "step": 10889 + }, + { + "epoch": 1.0161425772137724, + "grad_norm": 0.3976410761402715, + "learning_rate": 0.00028517906769504205, + "loss": 7.4015, + "step": 10890 + }, + { + "epoch": 1.0162358869086499, + "grad_norm": 0.6723668236691259, + "learning_rate": 0.00028517578925507756, + "loss": 7.598, + "step": 10891 + }, + { + "epoch": 1.016329196603527, + "grad_norm": 0.47453651730277613, + "learning_rate": 0.00028517251047140103, + "loss": 7.4614, + "step": 10892 + }, + { + "epoch": 1.0164225062984045, + "grad_norm": 0.552708328176834, + "learning_rate": 0.00028516923134402086, + "loss": 7.0813, + "step": 10893 + }, + { + "epoch": 1.0165158159932817, + "grad_norm": 0.3812437660065808, + "learning_rate": 0.0002851659518729454, + "loss": 7.3288, + "step": 10894 + }, + { + "epoch": 1.016609125688159, + "grad_norm": 0.8504275123696717, + "learning_rate": 0.00028516267205818297, + "loss": 7.5243, + "step": 10895 + }, + { + "epoch": 1.0167024353830363, + "grad_norm": 0.7660967601425503, + "learning_rate": 0.0002851593918997419, + "loss": 7.5727, + "step": 10896 + }, + { + "epoch": 1.0167957450779137, + "grad_norm": 0.31714641299755, + "learning_rate": 0.0002851561113976305, + "loss": 7.4132, + "step": 10897 + }, + { + "epoch": 1.016889054772791, + "grad_norm": 0.4592946057545183, + "learning_rate": 0.00028515283055185716, + "loss": 7.507, + "step": 10898 + }, + { + "epoch": 1.016982364467668, + "grad_norm": 1.0044550539516002, + "learning_rate": 0.0002851495493624302, + "loss": 7.2698, + "step": 10899 + }, + { + "epoch": 1.0170756741625455, + "grad_norm": 0.7536851062879699, + "learning_rate": 0.000285146267829358, + "loss": 7.4964, + "step": 10900 + }, + { + "epoch": 1.0171689838574227, + "grad_norm": 1.1862507952168386, + "learning_rate": 0.0002851429859526488, + "loss": 7.1707, + "step": 10901 + }, + { + "epoch": 1.0172622935523001, + "grad_norm": 0.6355921101739463, + "learning_rate": 0.0002851397037323111, + "loss": 7.4613, + "step": 10902 + }, + { + "epoch": 1.0173556032471773, + "grad_norm": 1.5858652611186879, + "learning_rate": 0.00028513642116835314, + "loss": 7.7517, + "step": 10903 + }, + { + "epoch": 1.0174489129420548, + "grad_norm": 1.715518242470943, + "learning_rate": 0.00028513313826078327, + "loss": 7.3191, + "step": 10904 + }, + { + "epoch": 1.017542222636932, + "grad_norm": 0.40909401070078044, + "learning_rate": 0.0002851298550096099, + "loss": 7.4573, + "step": 10905 + }, + { + "epoch": 1.0176355323318094, + "grad_norm": 0.6007981626650732, + "learning_rate": 0.0002851265714148413, + "loss": 7.3371, + "step": 10906 + }, + { + "epoch": 1.0177288420266866, + "grad_norm": 0.765154223171731, + "learning_rate": 0.00028512328747648584, + "loss": 7.6497, + "step": 10907 + }, + { + "epoch": 1.0178221517215638, + "grad_norm": 0.30623765846315515, + "learning_rate": 0.0002851200031945519, + "loss": 7.4614, + "step": 10908 + }, + { + "epoch": 1.0179154614164412, + "grad_norm": 5.056333170091622, + "learning_rate": 0.0002851167185690479, + "loss": 7.3445, + "step": 10909 + }, + { + "epoch": 1.0180087711113184, + "grad_norm": 0.5501063783353114, + "learning_rate": 0.000285113433599982, + "loss": 7.4043, + "step": 10910 + }, + { + "epoch": 1.0181020808061958, + "grad_norm": 0.5440521538192455, + "learning_rate": 0.00028511014828736274, + "loss": 7.4066, + "step": 10911 + }, + { + "epoch": 1.018195390501073, + "grad_norm": 0.44074498062262973, + "learning_rate": 0.00028510686263119833, + "loss": 7.5172, + "step": 10912 + }, + { + "epoch": 1.0182887001959504, + "grad_norm": 0.5936433337240997, + "learning_rate": 0.0002851035766314972, + "loss": 7.433, + "step": 10913 + }, + { + "epoch": 1.0183820098908276, + "grad_norm": 46.44328787243848, + "learning_rate": 0.00028510029028826775, + "loss": 7.501, + "step": 10914 + }, + { + "epoch": 1.018475319585705, + "grad_norm": 1.3722191151715, + "learning_rate": 0.0002850970036015182, + "loss": 7.8267, + "step": 10915 + }, + { + "epoch": 1.0185686292805822, + "grad_norm": 4.498862801949422, + "learning_rate": 0.0002850937165712571, + "loss": 7.2856, + "step": 10916 + }, + { + "epoch": 1.0186619389754596, + "grad_norm": 0.8066335275142075, + "learning_rate": 0.0002850904291974926, + "loss": 7.39, + "step": 10917 + }, + { + "epoch": 1.0187552486703368, + "grad_norm": 1.0732228277096323, + "learning_rate": 0.0002850871414802332, + "loss": 7.6313, + "step": 10918 + }, + { + "epoch": 1.018848558365214, + "grad_norm": 1.2062999764971711, + "learning_rate": 0.0002850838534194872, + "loss": 7.1102, + "step": 10919 + }, + { + "epoch": 1.0189418680600915, + "grad_norm": 1.0240756517734755, + "learning_rate": 0.000285080565015263, + "loss": 7.8427, + "step": 10920 + }, + { + "epoch": 1.0190351777549687, + "grad_norm": 0.9293641286377664, + "learning_rate": 0.0002850772762675689, + "loss": 7.799, + "step": 10921 + }, + { + "epoch": 1.019128487449846, + "grad_norm": 0.47840454411289285, + "learning_rate": 0.0002850739871764133, + "loss": 7.4533, + "step": 10922 + }, + { + "epoch": 1.0192217971447233, + "grad_norm": 0.7688835509630435, + "learning_rate": 0.00028507069774180455, + "loss": 7.5858, + "step": 10923 + }, + { + "epoch": 1.0193151068396007, + "grad_norm": 0.7025215161427208, + "learning_rate": 0.00028506740796375107, + "loss": 7.3351, + "step": 10924 + }, + { + "epoch": 1.019408416534478, + "grad_norm": 0.9123486587059051, + "learning_rate": 0.00028506411784226114, + "loss": 7.2399, + "step": 10925 + }, + { + "epoch": 1.0195017262293553, + "grad_norm": 0.7117490294834681, + "learning_rate": 0.0002850608273773432, + "loss": 7.2064, + "step": 10926 + }, + { + "epoch": 1.0195950359242325, + "grad_norm": 0.5118656001223385, + "learning_rate": 0.0002850575365690056, + "loss": 7.2298, + "step": 10927 + }, + { + "epoch": 1.01968834561911, + "grad_norm": 0.8910117028915822, + "learning_rate": 0.00028505424541725664, + "loss": 7.4182, + "step": 10928 + }, + { + "epoch": 1.0197816553139871, + "grad_norm": 0.9242300087149503, + "learning_rate": 0.0002850509539221048, + "loss": 7.2304, + "step": 10929 + }, + { + "epoch": 1.0198749650088643, + "grad_norm": 1.17771604201579, + "learning_rate": 0.0002850476620835583, + "loss": 7.3685, + "step": 10930 + }, + { + "epoch": 1.0199682747037417, + "grad_norm": 0.795644376153982, + "learning_rate": 0.0002850443699016257, + "loss": 7.337, + "step": 10931 + }, + { + "epoch": 1.020061584398619, + "grad_norm": 0.487858122294225, + "learning_rate": 0.00028504107737631525, + "loss": 7.1358, + "step": 10932 + }, + { + "epoch": 1.0201548940934964, + "grad_norm": 0.4755778692287078, + "learning_rate": 0.00028503778450763526, + "loss": 7.3285, + "step": 10933 + }, + { + "epoch": 1.0202482037883736, + "grad_norm": 4.547166593016324, + "learning_rate": 0.00028503449129559425, + "loss": 7.7222, + "step": 10934 + }, + { + "epoch": 1.020341513483251, + "grad_norm": 0.8314661700094433, + "learning_rate": 0.00028503119774020056, + "loss": 7.3793, + "step": 10935 + }, + { + "epoch": 1.0204348231781282, + "grad_norm": 0.978875015529093, + "learning_rate": 0.0002850279038414625, + "loss": 7.3744, + "step": 10936 + }, + { + "epoch": 1.0205281328730056, + "grad_norm": 0.534165494382852, + "learning_rate": 0.0002850246095993884, + "loss": 7.7081, + "step": 10937 + }, + { + "epoch": 1.0206214425678828, + "grad_norm": 0.7409393777869934, + "learning_rate": 0.00028502131501398684, + "loss": 7.1803, + "step": 10938 + }, + { + "epoch": 1.0207147522627602, + "grad_norm": 0.7375684018105748, + "learning_rate": 0.00028501802008526597, + "loss": 7.734, + "step": 10939 + }, + { + "epoch": 1.0208080619576374, + "grad_norm": 0.3203244750244165, + "learning_rate": 0.0002850147248132343, + "loss": 7.2531, + "step": 10940 + }, + { + "epoch": 1.0209013716525146, + "grad_norm": 0.7437056384244988, + "learning_rate": 0.00028501142919790026, + "loss": 7.4475, + "step": 10941 + }, + { + "epoch": 1.020994681347392, + "grad_norm": 1.1935446361834843, + "learning_rate": 0.0002850081332392721, + "loss": 7.8091, + "step": 10942 + }, + { + "epoch": 1.0210879910422692, + "grad_norm": 0.5102319472078105, + "learning_rate": 0.00028500483693735823, + "loss": 7.2437, + "step": 10943 + }, + { + "epoch": 1.0211813007371466, + "grad_norm": 0.4771077797170412, + "learning_rate": 0.000285001540292167, + "loss": 7.5716, + "step": 10944 + }, + { + "epoch": 1.0212746104320238, + "grad_norm": 0.5556754966925967, + "learning_rate": 0.00028499824330370695, + "loss": 7.4579, + "step": 10945 + }, + { + "epoch": 1.0213679201269013, + "grad_norm": 0.45365939496298785, + "learning_rate": 0.00028499494597198635, + "loss": 7.5154, + "step": 10946 + }, + { + "epoch": 1.0214612298217784, + "grad_norm": 10.717000774440494, + "learning_rate": 0.0002849916482970135, + "loss": 7.4531, + "step": 10947 + }, + { + "epoch": 1.0215545395166559, + "grad_norm": 0.38464087306865624, + "learning_rate": 0.0002849883502787969, + "loss": 7.531, + "step": 10948 + }, + { + "epoch": 1.021647849211533, + "grad_norm": 0.5313064880304209, + "learning_rate": 0.000284985051917345, + "loss": 7.6012, + "step": 10949 + }, + { + "epoch": 1.0217411589064105, + "grad_norm": 0.9619082559487608, + "learning_rate": 0.000284981753212666, + "loss": 7.7086, + "step": 10950 + }, + { + "epoch": 1.0218344686012877, + "grad_norm": 0.5202950019251141, + "learning_rate": 0.0002849784541647684, + "loss": 7.2293, + "step": 10951 + }, + { + "epoch": 1.0219277782961649, + "grad_norm": 0.6395764889177457, + "learning_rate": 0.00028497515477366066, + "loss": 7.599, + "step": 10952 + }, + { + "epoch": 1.0220210879910423, + "grad_norm": 0.46231411195079586, + "learning_rate": 0.00028497185503935103, + "loss": 7.3085, + "step": 10953 + }, + { + "epoch": 1.0221143976859195, + "grad_norm": 0.49488618964905057, + "learning_rate": 0.00028496855496184793, + "loss": 7.5245, + "step": 10954 + }, + { + "epoch": 1.022207707380797, + "grad_norm": 1.0955253022148217, + "learning_rate": 0.0002849652545411598, + "loss": 7.0837, + "step": 10955 + }, + { + "epoch": 1.0223010170756741, + "grad_norm": 0.6115219894722664, + "learning_rate": 0.00028496195377729496, + "loss": 7.4757, + "step": 10956 + }, + { + "epoch": 1.0223943267705515, + "grad_norm": 0.7019995128743984, + "learning_rate": 0.00028495865267026194, + "loss": 7.4534, + "step": 10957 + }, + { + "epoch": 1.0224876364654287, + "grad_norm": 0.6774900317396761, + "learning_rate": 0.00028495535122006896, + "loss": 7.4391, + "step": 10958 + }, + { + "epoch": 1.0225809461603061, + "grad_norm": 0.6447838009627034, + "learning_rate": 0.00028495204942672453, + "loss": 7.5428, + "step": 10959 + }, + { + "epoch": 1.0226742558551833, + "grad_norm": 0.4861957471583644, + "learning_rate": 0.000284948747290237, + "loss": 7.3049, + "step": 10960 + }, + { + "epoch": 1.0227675655500605, + "grad_norm": 0.4556124202708736, + "learning_rate": 0.0002849454448106148, + "loss": 7.4914, + "step": 10961 + }, + { + "epoch": 1.022860875244938, + "grad_norm": 1.4461810962194488, + "learning_rate": 0.0002849421419878663, + "loss": 7.1156, + "step": 10962 + }, + { + "epoch": 1.0229541849398152, + "grad_norm": 0.6012387085399796, + "learning_rate": 0.00028493883882199997, + "loss": 7.436, + "step": 10963 + }, + { + "epoch": 1.0230474946346926, + "grad_norm": 0.3465367602073489, + "learning_rate": 0.00028493553531302405, + "loss": 7.4471, + "step": 10964 + }, + { + "epoch": 1.0231408043295698, + "grad_norm": 0.4026153785488766, + "learning_rate": 0.00028493223146094704, + "loss": 7.4849, + "step": 10965 + }, + { + "epoch": 1.0232341140244472, + "grad_norm": 0.6779823664061386, + "learning_rate": 0.0002849289272657774, + "loss": 7.5542, + "step": 10966 + }, + { + "epoch": 1.0233274237193244, + "grad_norm": 0.46533456267796675, + "learning_rate": 0.0002849256227275234, + "loss": 7.4964, + "step": 10967 + }, + { + "epoch": 1.0234207334142018, + "grad_norm": 5.175360509094901, + "learning_rate": 0.00028492231784619355, + "loss": 7.4345, + "step": 10968 + }, + { + "epoch": 1.023514043109079, + "grad_norm": 0.4705522404158976, + "learning_rate": 0.0002849190126217962, + "loss": 7.4016, + "step": 10969 + }, + { + "epoch": 1.0236073528039564, + "grad_norm": 0.33819609132649214, + "learning_rate": 0.0002849157070543397, + "loss": 7.5292, + "step": 10970 + }, + { + "epoch": 1.0237006624988336, + "grad_norm": 0.5305667390931634, + "learning_rate": 0.0002849124011438327, + "loss": 7.4557, + "step": 10971 + }, + { + "epoch": 1.0237939721937108, + "grad_norm": 24.078310210947514, + "learning_rate": 0.00028490909489028326, + "loss": 7.3706, + "step": 10972 + }, + { + "epoch": 1.0238872818885882, + "grad_norm": 0.41030007356098563, + "learning_rate": 0.0002849057882937001, + "loss": 7.6283, + "step": 10973 + }, + { + "epoch": 1.0239805915834654, + "grad_norm": 0.5832195836764349, + "learning_rate": 0.0002849024813540913, + "loss": 7.4753, + "step": 10974 + }, + { + "epoch": 1.0240739012783429, + "grad_norm": 39.16283767723657, + "learning_rate": 0.0002848991740714656, + "loss": 7.2059, + "step": 10975 + }, + { + "epoch": 1.02416721097322, + "grad_norm": 0.7899974198233393, + "learning_rate": 0.00028489586644583117, + "loss": 7.2714, + "step": 10976 + }, + { + "epoch": 1.0242605206680975, + "grad_norm": 60.92708452946225, + "learning_rate": 0.0002848925584771966, + "loss": 7.7247, + "step": 10977 + }, + { + "epoch": 1.0243538303629747, + "grad_norm": 2.6108047809560198, + "learning_rate": 0.00028488925016557015, + "loss": 7.6597, + "step": 10978 + }, + { + "epoch": 1.024447140057852, + "grad_norm": 6.6176766265007, + "learning_rate": 0.00028488594151096025, + "loss": 7.4703, + "step": 10979 + }, + { + "epoch": 1.0245404497527293, + "grad_norm": 45.423918771400125, + "learning_rate": 0.00028488263251337547, + "loss": 7.523, + "step": 10980 + }, + { + "epoch": 1.0246337594476067, + "grad_norm": 1.1082552359956022, + "learning_rate": 0.000284879323172824, + "loss": 7.3495, + "step": 10981 + }, + { + "epoch": 1.024727069142484, + "grad_norm": 77.61518159997685, + "learning_rate": 0.00028487601348931447, + "loss": 7.2387, + "step": 10982 + }, + { + "epoch": 1.024820378837361, + "grad_norm": 283.1832069330669, + "learning_rate": 0.0002848727034628551, + "loss": 7.5031, + "step": 10983 + }, + { + "epoch": 1.0249136885322385, + "grad_norm": 3.4599020031015932, + "learning_rate": 0.0002848693930934545, + "loss": 7.4012, + "step": 10984 + }, + { + "epoch": 1.0250069982271157, + "grad_norm": 3.447953861556216, + "learning_rate": 0.00028486608238112085, + "loss": 7.5364, + "step": 10985 + }, + { + "epoch": 1.0251003079219931, + "grad_norm": 70408.34187542708, + "learning_rate": 0.00028486277132586276, + "loss": 7.7867, + "step": 10986 + }, + { + "epoch": 1.0251936176168703, + "grad_norm": 0.8201004282644636, + "learning_rate": 0.00028485945992768864, + "loss": 7.6907, + "step": 10987 + }, + { + "epoch": 1.0252869273117478, + "grad_norm": 2.5804447341977554, + "learning_rate": 0.0002848561481866068, + "loss": 7.4191, + "step": 10988 + }, + { + "epoch": 1.025380237006625, + "grad_norm": 4.563368123908733, + "learning_rate": 0.00028485283610262576, + "loss": 7.5595, + "step": 10989 + }, + { + "epoch": 1.0254735467015024, + "grad_norm": 5.2448871219754984, + "learning_rate": 0.0002848495236757539, + "loss": 7.5701, + "step": 10990 + }, + { + "epoch": 1.0255668563963796, + "grad_norm": 4.755867442476126, + "learning_rate": 0.0002848462109059996, + "loss": 7.64, + "step": 10991 + }, + { + "epoch": 1.025660166091257, + "grad_norm": 3.86569460555232, + "learning_rate": 0.0002848428977933714, + "loss": 7.7955, + "step": 10992 + }, + { + "epoch": 1.0257534757861342, + "grad_norm": 3.4050308800995133, + "learning_rate": 0.0002848395843378777, + "loss": 8.1828, + "step": 10993 + }, + { + "epoch": 1.0258467854810114, + "grad_norm": 1.2641397196387356, + "learning_rate": 0.0002848362705395267, + "loss": 7.4158, + "step": 10994 + }, + { + "epoch": 1.0259400951758888, + "grad_norm": 1.6230734997995988, + "learning_rate": 0.0002848329563983271, + "loss": 7.8173, + "step": 10995 + }, + { + "epoch": 1.026033404870766, + "grad_norm": 2.376077416734107, + "learning_rate": 0.0002848296419142873, + "loss": 7.8774, + "step": 10996 + }, + { + "epoch": 1.0261267145656434, + "grad_norm": 130235.90745123479, + "learning_rate": 0.0002848263270874156, + "loss": 7.5001, + "step": 10997 + }, + { + "epoch": 1.0262200242605206, + "grad_norm": 4.659398052430684, + "learning_rate": 0.00028482301191772047, + "loss": 8.8607, + "step": 10998 + }, + { + "epoch": 1.026313333955398, + "grad_norm": 4.944439643025263, + "learning_rate": 0.0002848196964052104, + "loss": 8.7666, + "step": 10999 + }, + { + "epoch": 1.0264066436502752, + "grad_norm": 34.18590828708026, + "learning_rate": 0.00028481638054989375, + "loss": 8.7211, + "step": 11000 + }, + { + "epoch": 1.0264999533451526, + "grad_norm": 36.8553301010203, + "learning_rate": 0.000284813064351779, + "loss": 8.8162, + "step": 11001 + }, + { + "epoch": 1.0265932630400298, + "grad_norm": 4.73489866681289, + "learning_rate": 0.00028480974781087455, + "loss": 8.7646, + "step": 11002 + }, + { + "epoch": 1.0266865727349073, + "grad_norm": 38189265690.36528, + "learning_rate": 0.0002848064309271888, + "loss": 8.7375, + "step": 11003 + }, + { + "epoch": 1.0267798824297845, + "grad_norm": 5.616996028427735, + "learning_rate": 0.0002848031137007303, + "loss": 8.628, + "step": 11004 + }, + { + "epoch": 1.0268731921246617, + "grad_norm": 540.0123369987648, + "learning_rate": 0.00028479979613150737, + "loss": 8.2187, + "step": 11005 + }, + { + "epoch": 1.026966501819539, + "grad_norm": 21.05916477325329, + "learning_rate": 0.0002847964782195285, + "loss": 8.0616, + "step": 11006 + }, + { + "epoch": 1.0270598115144163, + "grad_norm": 3.5262072356813396, + "learning_rate": 0.00028479315996480213, + "loss": 8.0453, + "step": 11007 + }, + { + "epoch": 1.0271531212092937, + "grad_norm": 5.130763819102432, + "learning_rate": 0.00028478984136733667, + "loss": 7.8511, + "step": 11008 + }, + { + "epoch": 1.027246430904171, + "grad_norm": 22.377882175387626, + "learning_rate": 0.0002847865224271406, + "loss": 7.6507, + "step": 11009 + }, + { + "epoch": 1.0273397405990483, + "grad_norm": 4.20853607586797, + "learning_rate": 0.0002847832031442223, + "loss": 8.0562, + "step": 11010 + }, + { + "epoch": 1.0274330502939255, + "grad_norm": 2.647363375275303e+20, + "learning_rate": 0.00028477988351859024, + "loss": 7.7192, + "step": 11011 + }, + { + "epoch": 1.027526359988803, + "grad_norm": 93.56646856978922, + "learning_rate": 0.00028477656355025284, + "loss": 7.5986, + "step": 11012 + }, + { + "epoch": 1.0276196696836801, + "grad_norm": 3.6039792333554144, + "learning_rate": 0.0002847732432392186, + "loss": 7.8017, + "step": 11013 + }, + { + "epoch": 1.0277129793785573, + "grad_norm": 131546296754.9355, + "learning_rate": 0.00028476992258549594, + "loss": 7.3717, + "step": 11014 + }, + { + "epoch": 1.0278062890734347, + "grad_norm": 37.6447055040627, + "learning_rate": 0.00028476660158909326, + "loss": 7.8966, + "step": 11015 + }, + { + "epoch": 1.027899598768312, + "grad_norm": 14.56484076484779, + "learning_rate": 0.000284763280250019, + "loss": 7.8688, + "step": 11016 + }, + { + "epoch": 1.0279929084631894, + "grad_norm": 16.164649284786037, + "learning_rate": 0.0002847599585682817, + "loss": 7.8808, + "step": 11017 + }, + { + "epoch": 1.0280862181580666, + "grad_norm": 101058176578.17421, + "learning_rate": 0.0002847566365438897, + "loss": 7.5259, + "step": 11018 + }, + { + "epoch": 1.028179527852944, + "grad_norm": 4.133741011228652, + "learning_rate": 0.00028475331417685153, + "loss": 7.5926, + "step": 11019 + }, + { + "epoch": 1.0282728375478212, + "grad_norm": 6.014511990753035, + "learning_rate": 0.00028474999146717555, + "loss": 7.3374, + "step": 11020 + }, + { + "epoch": 1.0283661472426986, + "grad_norm": 9.406804427058905, + "learning_rate": 0.0002847466684148703, + "loss": 7.5746, + "step": 11021 + }, + { + "epoch": 1.0284594569375758, + "grad_norm": 6.741531174164687, + "learning_rate": 0.00028474334501994415, + "loss": 7.8594, + "step": 11022 + }, + { + "epoch": 1.0285527666324532, + "grad_norm": 6.634989795492397, + "learning_rate": 0.00028474002128240565, + "loss": 8.0798, + "step": 11023 + }, + { + "epoch": 1.0286460763273304, + "grad_norm": 20.343629919862764, + "learning_rate": 0.00028473669720226316, + "loss": 7.5714, + "step": 11024 + }, + { + "epoch": 1.0287393860222076, + "grad_norm": 16.675485184759292, + "learning_rate": 0.00028473337277952514, + "loss": 7.5431, + "step": 11025 + }, + { + "epoch": 1.028832695717085, + "grad_norm": 12.160351877037119, + "learning_rate": 0.00028473004801420006, + "loss": 7.1714, + "step": 11026 + }, + { + "epoch": 1.0289260054119622, + "grad_norm": 10.810169210769956, + "learning_rate": 0.0002847267229062964, + "loss": 7.6113, + "step": 11027 + }, + { + "epoch": 1.0290193151068396, + "grad_norm": 10.16183056428939, + "learning_rate": 0.0002847233974558226, + "loss": 7.428, + "step": 11028 + }, + { + "epoch": 1.0291126248017168, + "grad_norm": 3.673230940285084, + "learning_rate": 0.00028472007166278706, + "loss": 7.3661, + "step": 11029 + }, + { + "epoch": 1.0292059344965943, + "grad_norm": 70426807615.26584, + "learning_rate": 0.00028471674552719837, + "loss": 7.8353, + "step": 11030 + }, + { + "epoch": 1.0292992441914715, + "grad_norm": 16.88523032425697, + "learning_rate": 0.0002847134190490648, + "loss": 7.1042, + "step": 11031 + }, + { + "epoch": 1.0293925538863489, + "grad_norm": 35.187483730029555, + "learning_rate": 0.000284710092228395, + "loss": 7.5404, + "step": 11032 + }, + { + "epoch": 1.029485863581226, + "grad_norm": 5.683582757538082, + "learning_rate": 0.00028470676506519725, + "loss": 7.4131, + "step": 11033 + }, + { + "epoch": 1.0295791732761035, + "grad_norm": 125.6338747279121, + "learning_rate": 0.00028470343755948016, + "loss": 7.7689, + "step": 11034 + }, + { + "epoch": 1.0296724829709807, + "grad_norm": 4.194980989367159, + "learning_rate": 0.00028470010971125216, + "loss": 7.7512, + "step": 11035 + }, + { + "epoch": 1.0297657926658579, + "grad_norm": 2.3646341363516976, + "learning_rate": 0.0002846967815205216, + "loss": 7.4384, + "step": 11036 + }, + { + "epoch": 1.0298591023607353, + "grad_norm": 3.2696970590001815, + "learning_rate": 0.00028469345298729707, + "loss": 7.3998, + "step": 11037 + }, + { + "epoch": 1.0299524120556125, + "grad_norm": 3.7524634404438353, + "learning_rate": 0.000284690124111587, + "loss": 7.3253, + "step": 11038 + }, + { + "epoch": 1.03004572175049, + "grad_norm": 18.02638763760882, + "learning_rate": 0.00028468679489339983, + "loss": 7.3845, + "step": 11039 + }, + { + "epoch": 1.0301390314453671, + "grad_norm": 32593893359.143997, + "learning_rate": 0.000284683465332744, + "loss": 7.4471, + "step": 11040 + }, + { + "epoch": 1.0302323411402445, + "grad_norm": 2.438533760898592, + "learning_rate": 0.00028468013542962806, + "loss": 7.4136, + "step": 11041 + }, + { + "epoch": 1.0303256508351217, + "grad_norm": 23901568704.050964, + "learning_rate": 0.0002846768051840604, + "loss": 7.2256, + "step": 11042 + }, + { + "epoch": 1.0304189605299992, + "grad_norm": 4.423117546408746, + "learning_rate": 0.00028467347459604957, + "loss": 7.32, + "step": 11043 + }, + { + "epoch": 1.0305122702248763, + "grad_norm": 777317042345.1957, + "learning_rate": 0.00028467014366560393, + "loss": 7.3329, + "step": 11044 + }, + { + "epoch": 1.0306055799197538, + "grad_norm": 17.6657194673875, + "learning_rate": 0.00028466681239273207, + "loss": 7.5397, + "step": 11045 + }, + { + "epoch": 1.030698889614631, + "grad_norm": 2.3674829265739565, + "learning_rate": 0.0002846634807774423, + "loss": 7.5078, + "step": 11046 + }, + { + "epoch": 1.0307921993095082, + "grad_norm": 3.3637450345420126, + "learning_rate": 0.0002846601488197433, + "loss": 7.4852, + "step": 11047 + }, + { + "epoch": 1.0308855090043856, + "grad_norm": 3.0142344454620833, + "learning_rate": 0.0002846568165196434, + "loss": 7.333, + "step": 11048 + }, + { + "epoch": 1.0309788186992628, + "grad_norm": 3.302577458325463, + "learning_rate": 0.000284653483877151, + "loss": 7.5928, + "step": 11049 + }, + { + "epoch": 1.0310721283941402, + "grad_norm": 16.06327826967714, + "learning_rate": 0.0002846501508922748, + "loss": 7.8184, + "step": 11050 + }, + { + "epoch": 1.0311654380890174, + "grad_norm": 7.920465253235936, + "learning_rate": 0.0002846468175650231, + "loss": 7.5047, + "step": 11051 + }, + { + "epoch": 1.0312587477838948, + "grad_norm": 11.101536548203958, + "learning_rate": 0.0002846434838954045, + "loss": 7.3032, + "step": 11052 + }, + { + "epoch": 1.031352057478772, + "grad_norm": 24752006444309.285, + "learning_rate": 0.00028464014988342735, + "loss": 7.1578, + "step": 11053 + }, + { + "epoch": 1.0314453671736494, + "grad_norm": 27389666101236.35, + "learning_rate": 0.00028463681552910015, + "loss": 7.3176, + "step": 11054 + }, + { + "epoch": 1.0315386768685266, + "grad_norm": 16.518725550662513, + "learning_rate": 0.00028463348083243144, + "loss": 7.4982, + "step": 11055 + }, + { + "epoch": 1.0316319865634038, + "grad_norm": 9.365273156780605, + "learning_rate": 0.00028463014579342965, + "loss": 7.658, + "step": 11056 + }, + { + "epoch": 1.0317252962582812, + "grad_norm": 4023839124383.831, + "learning_rate": 0.0002846268104121033, + "loss": 7.057, + "step": 11057 + }, + { + "epoch": 1.0318186059531584, + "grad_norm": 1668.6380604989297, + "learning_rate": 0.00028462347468846086, + "loss": 7.5585, + "step": 11058 + }, + { + "epoch": 1.0319119156480359, + "grad_norm": 14.552774668157403, + "learning_rate": 0.0002846201386225108, + "loss": 7.0892, + "step": 11059 + }, + { + "epoch": 1.032005225342913, + "grad_norm": 1.5351343678768372, + "learning_rate": 0.0002846168022142616, + "loss": 7.3686, + "step": 11060 + }, + { + "epoch": 1.0320985350377905, + "grad_norm": 1.318705199225753, + "learning_rate": 0.00028461346546372173, + "loss": 7.8342, + "step": 11061 + }, + { + "epoch": 1.0321918447326677, + "grad_norm": 1.978010126026454, + "learning_rate": 0.0002846101283708997, + "loss": 7.5068, + "step": 11062 + }, + { + "epoch": 1.032285154427545, + "grad_norm": 1.482679617583253, + "learning_rate": 0.000284606790935804, + "loss": 7.0381, + "step": 11063 + }, + { + "epoch": 1.0323784641224223, + "grad_norm": 345732894743528.9, + "learning_rate": 0.00028460345315844314, + "loss": 7.1749, + "step": 11064 + }, + { + "epoch": 1.0324717738172997, + "grad_norm": 3367350090150691.5, + "learning_rate": 0.00028460011503882545, + "loss": 7.411, + "step": 11065 + }, + { + "epoch": 1.032565083512177, + "grad_norm": 4.01823027771347, + "learning_rate": 0.00028459677657695966, + "loss": 7.5601, + "step": 11066 + }, + { + "epoch": 1.032658393207054, + "grad_norm": 2825598093335083.0, + "learning_rate": 0.00028459343777285413, + "loss": 7.4687, + "step": 11067 + }, + { + "epoch": 1.0327517029019315, + "grad_norm": 285.23536525770527, + "learning_rate": 0.0002845900986265173, + "loss": 7.4425, + "step": 11068 + }, + { + "epoch": 1.0328450125968087, + "grad_norm": 2.6737677705224825, + "learning_rate": 0.00028458675913795777, + "loss": 7.6252, + "step": 11069 + }, + { + "epoch": 1.0329383222916861, + "grad_norm": 2.921297820318726, + "learning_rate": 0.00028458341930718395, + "loss": 7.4737, + "step": 11070 + }, + { + "epoch": 1.0330316319865633, + "grad_norm": 19.82570160833123, + "learning_rate": 0.00028458007913420433, + "loss": 7.6337, + "step": 11071 + }, + { + "epoch": 1.0331249416814408, + "grad_norm": 1.8149808619869159, + "learning_rate": 0.00028457673861902747, + "loss": 7.4097, + "step": 11072 + }, + { + "epoch": 1.033218251376318, + "grad_norm": 1.504920196192636, + "learning_rate": 0.0002845733977616618, + "loss": 7.372, + "step": 11073 + }, + { + "epoch": 1.0333115610711954, + "grad_norm": 3000807407101531.0, + "learning_rate": 0.0002845700565621159, + "loss": 7.5552, + "step": 11074 + }, + { + "epoch": 1.0334048707660726, + "grad_norm": 41.48443526076107, + "learning_rate": 0.0002845667150203982, + "loss": 7.2905, + "step": 11075 + }, + { + "epoch": 1.03349818046095, + "grad_norm": 7.492335238223979, + "learning_rate": 0.0002845633731365172, + "loss": 7.5808, + "step": 11076 + }, + { + "epoch": 1.0335914901558272, + "grad_norm": 3733622224864669.5, + "learning_rate": 0.0002845600309104813, + "loss": 7.5941, + "step": 11077 + }, + { + "epoch": 1.0336847998507044, + "grad_norm": 6491209270066161.0, + "learning_rate": 0.0002845566883422992, + "loss": 7.4593, + "step": 11078 + }, + { + "epoch": 1.0337781095455818, + "grad_norm": 7.632646843813299, + "learning_rate": 0.0002845533454319793, + "loss": 7.4313, + "step": 11079 + }, + { + "epoch": 1.033871419240459, + "grad_norm": 1.6230402320356605, + "learning_rate": 0.0002845500021795301, + "loss": 7.2698, + "step": 11080 + }, + { + "epoch": 1.0339647289353364, + "grad_norm": 3.0225501083668593, + "learning_rate": 0.0002845466585849601, + "loss": 7.4105, + "step": 11081 + }, + { + "epoch": 1.0340580386302136, + "grad_norm": 92.03690203307208, + "learning_rate": 0.00028454331464827776, + "loss": 7.4268, + "step": 11082 + }, + { + "epoch": 1.034151348325091, + "grad_norm": 12.316798269825355, + "learning_rate": 0.0002845399703694917, + "loss": 7.3885, + "step": 11083 + }, + { + "epoch": 1.0342446580199682, + "grad_norm": 32.50960230050998, + "learning_rate": 0.00028453662574861026, + "loss": 7.4497, + "step": 11084 + }, + { + "epoch": 1.0343379677148457, + "grad_norm": 1.7325376141435964, + "learning_rate": 0.0002845332807856421, + "loss": 7.3485, + "step": 11085 + }, + { + "epoch": 1.0344312774097228, + "grad_norm": 5.529164345258361, + "learning_rate": 0.0002845299354805956, + "loss": 7.3318, + "step": 11086 + }, + { + "epoch": 1.0345245871046003, + "grad_norm": 11.423479476157933, + "learning_rate": 0.0002845265898334794, + "loss": 7.5353, + "step": 11087 + }, + { + "epoch": 1.0346178967994775, + "grad_norm": 3.3633843577315465, + "learning_rate": 0.00028452324384430186, + "loss": 7.3138, + "step": 11088 + }, + { + "epoch": 1.0347112064943547, + "grad_norm": 14.954791256587287, + "learning_rate": 0.0002845198975130716, + "loss": 7.577, + "step": 11089 + }, + { + "epoch": 1.034804516189232, + "grad_norm": 1019305125386305.0, + "learning_rate": 0.00028451655083979707, + "loss": 7.3036, + "step": 11090 + }, + { + "epoch": 1.0348978258841093, + "grad_norm": 17.97498580820468, + "learning_rate": 0.0002845132038244868, + "loss": 7.1455, + "step": 11091 + }, + { + "epoch": 1.0349911355789867, + "grad_norm": 50.194781299517416, + "learning_rate": 0.00028450985646714933, + "loss": 7.6142, + "step": 11092 + }, + { + "epoch": 1.035084445273864, + "grad_norm": 3202138484492623.5, + "learning_rate": 0.00028450650876779316, + "loss": 7.4278, + "step": 11093 + }, + { + "epoch": 1.0351777549687413, + "grad_norm": 10.531561353793075, + "learning_rate": 0.0002845031607264267, + "loss": 7.5561, + "step": 11094 + }, + { + "epoch": 1.0352710646636185, + "grad_norm": 10.158211484822523, + "learning_rate": 0.0002844998123430586, + "loss": 7.3853, + "step": 11095 + }, + { + "epoch": 1.035364374358496, + "grad_norm": 78595346643901.02, + "learning_rate": 0.0002844964636176973, + "loss": 7.3938, + "step": 11096 + }, + { + "epoch": 1.0354576840533731, + "grad_norm": 6.37962020430375, + "learning_rate": 0.00028449311455035133, + "loss": 7.5956, + "step": 11097 + }, + { + "epoch": 1.0355509937482505, + "grad_norm": 10.452373714299801, + "learning_rate": 0.0002844897651410292, + "loss": 7.489, + "step": 11098 + }, + { + "epoch": 1.0356443034431277, + "grad_norm": 9.547577235252337, + "learning_rate": 0.00028448641538973945, + "loss": 7.2651, + "step": 11099 + }, + { + "epoch": 1.035737613138005, + "grad_norm": 20.158246172745148, + "learning_rate": 0.00028448306529649063, + "loss": 7.6181, + "step": 11100 + }, + { + "epoch": 1.0358309228328824, + "grad_norm": 7.889445715778189, + "learning_rate": 0.00028447971486129117, + "loss": 7.4659, + "step": 11101 + }, + { + "epoch": 1.0359242325277596, + "grad_norm": 20.738291553386265, + "learning_rate": 0.00028447636408414963, + "loss": 7.3864, + "step": 11102 + }, + { + "epoch": 1.036017542222637, + "grad_norm": 6.866872184653054, + "learning_rate": 0.00028447301296507453, + "loss": 7.3565, + "step": 11103 + }, + { + "epoch": 1.0361108519175142, + "grad_norm": 0.9527900360000803, + "learning_rate": 0.0002844696615040744, + "loss": 7.4161, + "step": 11104 + }, + { + "epoch": 1.0362041616123916, + "grad_norm": 1.2704780951514416, + "learning_rate": 0.0002844663097011577, + "loss": 7.0761, + "step": 11105 + }, + { + "epoch": 1.0362974713072688, + "grad_norm": 3935070268705880.0, + "learning_rate": 0.0002844629575563331, + "loss": 7.5699, + "step": 11106 + }, + { + "epoch": 1.0363907810021462, + "grad_norm": 48.18614520512238, + "learning_rate": 0.000284459605069609, + "loss": 7.4726, + "step": 11107 + }, + { + "epoch": 1.0364840906970234, + "grad_norm": 1579564077913796.0, + "learning_rate": 0.00028445625224099387, + "loss": 7.5579, + "step": 11108 + }, + { + "epoch": 1.0365774003919008, + "grad_norm": 3386613945383490.0, + "learning_rate": 0.0002844528990704964, + "loss": 7.5624, + "step": 11109 + }, + { + "epoch": 1.036670710086778, + "grad_norm": 11.583233926770278, + "learning_rate": 0.00028444954555812503, + "loss": 6.9223, + "step": 11110 + }, + { + "epoch": 1.0367640197816552, + "grad_norm": 2.7420310483799377, + "learning_rate": 0.00028444619170388827, + "loss": 6.9543, + "step": 11111 + }, + { + "epoch": 1.0368573294765326, + "grad_norm": 25.548107279768793, + "learning_rate": 0.0002844428375077947, + "loss": 7.274, + "step": 11112 + }, + { + "epoch": 1.0369506391714098, + "grad_norm": 698.1575063950785, + "learning_rate": 0.0002844394829698528, + "loss": 7.2391, + "step": 11113 + }, + { + "epoch": 1.0370439488662873, + "grad_norm": 3.66789906175197, + "learning_rate": 0.0002844361280900711, + "loss": 7.385, + "step": 11114 + }, + { + "epoch": 1.0371372585611645, + "grad_norm": 71.71770549365301, + "learning_rate": 0.0002844327728684582, + "loss": 7.1067, + "step": 11115 + }, + { + "epoch": 1.0372305682560419, + "grad_norm": 8.938188948060503, + "learning_rate": 0.00028442941730502257, + "loss": 7.5681, + "step": 11116 + }, + { + "epoch": 1.037323877950919, + "grad_norm": 236.5931942855532, + "learning_rate": 0.0002844260613997727, + "loss": 7.5476, + "step": 11117 + }, + { + "epoch": 1.0374171876457965, + "grad_norm": 2.0139849568483212, + "learning_rate": 0.0002844227051527172, + "loss": 7.2729, + "step": 11118 + }, + { + "epoch": 1.0375104973406737, + "grad_norm": 5.949881277381566, + "learning_rate": 0.00028441934856386457, + "loss": 7.299, + "step": 11119 + }, + { + "epoch": 1.037603807035551, + "grad_norm": 14.148332724044003, + "learning_rate": 0.0002844159916332234, + "loss": 7.5434, + "step": 11120 + }, + { + "epoch": 1.0376971167304283, + "grad_norm": 16.51864794174984, + "learning_rate": 0.00028441263436080217, + "loss": 7.3389, + "step": 11121 + }, + { + "epoch": 1.0377904264253055, + "grad_norm": 21.935932683295697, + "learning_rate": 0.00028440927674660936, + "loss": 7.2859, + "step": 11122 + }, + { + "epoch": 1.037883736120183, + "grad_norm": 4.517682487320262, + "learning_rate": 0.0002844059187906537, + "loss": 7.3365, + "step": 11123 + }, + { + "epoch": 1.0379770458150601, + "grad_norm": 1.9145536853419018, + "learning_rate": 0.0002844025604929435, + "loss": 6.9412, + "step": 11124 + }, + { + "epoch": 1.0380703555099375, + "grad_norm": 1.8862930894547084e+16, + "learning_rate": 0.0002843992018534874, + "loss": 7.2985, + "step": 11125 + }, + { + "epoch": 1.0381636652048147, + "grad_norm": 22.03861182161612, + "learning_rate": 0.000284395842872294, + "loss": 7.3559, + "step": 11126 + }, + { + "epoch": 1.0382569748996922, + "grad_norm": 23.656172744383557, + "learning_rate": 0.0002843924835493718, + "loss": 7.1085, + "step": 11127 + }, + { + "epoch": 1.0383502845945694, + "grad_norm": 21.29635871076018, + "learning_rate": 0.0002843891238847292, + "loss": 7.2689, + "step": 11128 + }, + { + "epoch": 1.0384435942894468, + "grad_norm": 10.092556763418752, + "learning_rate": 0.00028438576387837495, + "loss": 7.3553, + "step": 11129 + }, + { + "epoch": 1.038536903984324, + "grad_norm": 13.312839462426622, + "learning_rate": 0.0002843824035303175, + "loss": 7.4625, + "step": 11130 + }, + { + "epoch": 1.0386302136792012, + "grad_norm": 1.4705040733163457, + "learning_rate": 0.00028437904284056543, + "loss": 7.3385, + "step": 11131 + }, + { + "epoch": 1.0387235233740786, + "grad_norm": 17.618768766746726, + "learning_rate": 0.00028437568180912724, + "loss": 7.3308, + "step": 11132 + }, + { + "epoch": 1.0388168330689558, + "grad_norm": 3.648486593683664, + "learning_rate": 0.0002843723204360115, + "loss": 7.4378, + "step": 11133 + }, + { + "epoch": 1.0389101427638332, + "grad_norm": 4583267994786554.0, + "learning_rate": 0.00028436895872122677, + "loss": 7.3602, + "step": 11134 + }, + { + "epoch": 1.0390034524587104, + "grad_norm": 11.1115191622848, + "learning_rate": 0.00028436559666478157, + "loss": 7.1848, + "step": 11135 + }, + { + "epoch": 1.0390967621535878, + "grad_norm": 8.130604424236843, + "learning_rate": 0.0002843622342666844, + "loss": 7.3219, + "step": 11136 + }, + { + "epoch": 1.039190071848465, + "grad_norm": 3.8132983549785133, + "learning_rate": 0.00028435887152694395, + "loss": 7.2581, + "step": 11137 + }, + { + "epoch": 1.0392833815433424, + "grad_norm": 5.023515537560185, + "learning_rate": 0.00028435550844556866, + "loss": 7.2652, + "step": 11138 + }, + { + "epoch": 1.0393766912382196, + "grad_norm": 3.7732845792879792, + "learning_rate": 0.0002843521450225671, + "loss": 7.5464, + "step": 11139 + }, + { + "epoch": 1.039470000933097, + "grad_norm": 2.120242571357445, + "learning_rate": 0.00028434878125794784, + "loss": 7.7072, + "step": 11140 + }, + { + "epoch": 1.0395633106279742, + "grad_norm": 5.194764261097633e+16, + "learning_rate": 0.00028434541715171945, + "loss": 7.3053, + "step": 11141 + }, + { + "epoch": 1.0396566203228514, + "grad_norm": 3.062493271890554, + "learning_rate": 0.00028434205270389044, + "loss": 7.1888, + "step": 11142 + }, + { + "epoch": 1.0397499300177289, + "grad_norm": 4.145676656724811, + "learning_rate": 0.0002843386879144694, + "loss": 7.2188, + "step": 11143 + }, + { + "epoch": 1.039843239712606, + "grad_norm": 4.7540636803771354, + "learning_rate": 0.0002843353227834648, + "loss": 7.3115, + "step": 11144 + }, + { + "epoch": 1.0399365494074835, + "grad_norm": 6.506872091344302, + "learning_rate": 0.0002843319573108853, + "loss": 7.3806, + "step": 11145 + }, + { + "epoch": 1.0400298591023607, + "grad_norm": 4.409043797746182, + "learning_rate": 0.00028432859149673953, + "loss": 7.3165, + "step": 11146 + }, + { + "epoch": 1.040123168797238, + "grad_norm": 59.8420632178534, + "learning_rate": 0.00028432522534103586, + "loss": 7.3734, + "step": 11147 + }, + { + "epoch": 1.0402164784921153, + "grad_norm": 5.869077992763716, + "learning_rate": 0.0002843218588437829, + "loss": 7.3518, + "step": 11148 + }, + { + "epoch": 1.0403097881869927, + "grad_norm": 0.8710149347647598, + "learning_rate": 0.0002843184920049893, + "loss": 7.2105, + "step": 11149 + }, + { + "epoch": 1.04040309788187, + "grad_norm": 3.860452990885591, + "learning_rate": 0.00028431512482466356, + "loss": 7.2193, + "step": 11150 + }, + { + "epoch": 1.0404964075767473, + "grad_norm": 1.5170386718538516, + "learning_rate": 0.0002843117573028142, + "loss": 7.6741, + "step": 11151 + }, + { + "epoch": 1.0405897172716245, + "grad_norm": 22.563613750735794, + "learning_rate": 0.00028430838943944986, + "loss": 7.2353, + "step": 11152 + }, + { + "epoch": 1.0406830269665017, + "grad_norm": 3.1704244596046474, + "learning_rate": 0.00028430502123457903, + "loss": 7.6188, + "step": 11153 + }, + { + "epoch": 1.0407763366613791, + "grad_norm": 1.0967327058946053, + "learning_rate": 0.0002843016526882104, + "loss": 7.2222, + "step": 11154 + }, + { + "epoch": 1.0408696463562563, + "grad_norm": 14.245793859602092, + "learning_rate": 0.00028429828380035235, + "loss": 7.4132, + "step": 11155 + }, + { + "epoch": 1.0409629560511338, + "grad_norm": 15.224484177235663, + "learning_rate": 0.0002842949145710136, + "loss": 7.6385, + "step": 11156 + }, + { + "epoch": 1.041056265746011, + "grad_norm": 20.601848601406928, + "learning_rate": 0.0002842915450002026, + "loss": 7.6906, + "step": 11157 + }, + { + "epoch": 1.0411495754408884, + "grad_norm": 40.50381832346185, + "learning_rate": 0.0002842881750879281, + "loss": 7.3514, + "step": 11158 + }, + { + "epoch": 1.0412428851357656, + "grad_norm": 1.5776298336591457, + "learning_rate": 0.00028428480483419846, + "loss": 7.3837, + "step": 11159 + }, + { + "epoch": 1.041336194830643, + "grad_norm": 1.2887759665891239, + "learning_rate": 0.0002842814342390224, + "loss": 7.2647, + "step": 11160 + }, + { + "epoch": 1.0414295045255202, + "grad_norm": 2.747159078737377, + "learning_rate": 0.00028427806330240835, + "loss": 7.642, + "step": 11161 + }, + { + "epoch": 1.0415228142203974, + "grad_norm": 1.2989429269374342, + "learning_rate": 0.000284274692024365, + "loss": 7.2026, + "step": 11162 + }, + { + "epoch": 1.0416161239152748, + "grad_norm": 3.5991732263199744e+17, + "learning_rate": 0.0002842713204049009, + "loss": 7.3274, + "step": 11163 + }, + { + "epoch": 1.041709433610152, + "grad_norm": 3.9614088625176237e+17, + "learning_rate": 0.0002842679484440246, + "loss": 7.5768, + "step": 11164 + }, + { + "epoch": 1.0418027433050294, + "grad_norm": 82.73545537381563, + "learning_rate": 0.00028426457614174467, + "loss": 7.7461, + "step": 11165 + }, + { + "epoch": 1.0418960529999066, + "grad_norm": 14.910822522491511, + "learning_rate": 0.0002842612034980697, + "loss": 6.9785, + "step": 11166 + }, + { + "epoch": 1.041989362694784, + "grad_norm": 3.4602392860004966e+17, + "learning_rate": 0.0002842578305130083, + "loss": 7.4124, + "step": 11167 + }, + { + "epoch": 1.0420826723896612, + "grad_norm": 2.154299810215291, + "learning_rate": 0.00028425445718656894, + "loss": 7.0266, + "step": 11168 + }, + { + "epoch": 1.0421759820845387, + "grad_norm": 3.3173481113626497, + "learning_rate": 0.0002842510835187603, + "loss": 7.2079, + "step": 11169 + }, + { + "epoch": 1.0422692917794159, + "grad_norm": 9.49868559003836e+17, + "learning_rate": 0.0002842477095095909, + "loss": 7.4567, + "step": 11170 + }, + { + "epoch": 1.0423626014742933, + "grad_norm": 9.827462593578403, + "learning_rate": 0.00028424433515906935, + "loss": 7.175, + "step": 11171 + }, + { + "epoch": 1.0424559111691705, + "grad_norm": 2.481057317433606e+17, + "learning_rate": 0.0002842409604672043, + "loss": 7.0111, + "step": 11172 + }, + { + "epoch": 1.0425492208640477, + "grad_norm": 64.3113392746851, + "learning_rate": 0.0002842375854340042, + "loss": 7.5464, + "step": 11173 + }, + { + "epoch": 1.042642530558925, + "grad_norm": 45.90147145300769, + "learning_rate": 0.00028423421005947764, + "loss": 7.3873, + "step": 11174 + }, + { + "epoch": 1.0427358402538023, + "grad_norm": 9.245030711072737e+17, + "learning_rate": 0.00028423083434363325, + "loss": 7.4997, + "step": 11175 + }, + { + "epoch": 1.0428291499486797, + "grad_norm": 59.848985457486236, + "learning_rate": 0.00028422745828647964, + "loss": 7.1585, + "step": 11176 + }, + { + "epoch": 1.042922459643557, + "grad_norm": 3.07297793977432e+17, + "learning_rate": 0.00028422408188802535, + "loss": 7.1861, + "step": 11177 + }, + { + "epoch": 1.0430157693384343, + "grad_norm": 4.090905900715477, + "learning_rate": 0.000284220705148279, + "loss": 7.2118, + "step": 11178 + }, + { + "epoch": 1.0431090790333115, + "grad_norm": 0.7950426715126008, + "learning_rate": 0.0002842173280672491, + "loss": 7.4423, + "step": 11179 + }, + { + "epoch": 1.043202388728189, + "grad_norm": 16.980022065755833, + "learning_rate": 0.00028421395064494434, + "loss": 7.0634, + "step": 11180 + }, + { + "epoch": 1.0432956984230661, + "grad_norm": 1.6813724206258656, + "learning_rate": 0.00028421057288137327, + "loss": 7.498, + "step": 11181 + }, + { + "epoch": 1.0433890081179436, + "grad_norm": 5.972112489316037, + "learning_rate": 0.0002842071947765445, + "loss": 7.4387, + "step": 11182 + }, + { + "epoch": 1.0434823178128207, + "grad_norm": 25.014013499994075, + "learning_rate": 0.0002842038163304665, + "loss": 7.3563, + "step": 11183 + }, + { + "epoch": 1.043575627507698, + "grad_norm": 4.029524119620668, + "learning_rate": 0.000284200437543148, + "loss": 7.2962, + "step": 11184 + }, + { + "epoch": 1.0436689372025754, + "grad_norm": 39.504337427396685, + "learning_rate": 0.00028419705841459757, + "loss": 7.4151, + "step": 11185 + }, + { + "epoch": 1.0437622468974526, + "grad_norm": 43.779081735371335, + "learning_rate": 0.0002841936789448237, + "loss": 7.4935, + "step": 11186 + }, + { + "epoch": 1.04385555659233, + "grad_norm": 8.872327865977614e+17, + "learning_rate": 0.00028419029913383514, + "loss": 7.4152, + "step": 11187 + }, + { + "epoch": 1.0439488662872072, + "grad_norm": 45.96778778290228, + "learning_rate": 0.0002841869189816403, + "loss": 7.3632, + "step": 11188 + }, + { + "epoch": 1.0440421759820846, + "grad_norm": 1.85279369669131, + "learning_rate": 0.00028418353848824794, + "loss": 7.5149, + "step": 11189 + }, + { + "epoch": 1.0441354856769618, + "grad_norm": 1.4993733581987825, + "learning_rate": 0.00028418015765366663, + "loss": 7.3637, + "step": 11190 + }, + { + "epoch": 1.0442287953718392, + "grad_norm": 862.5903275835823, + "learning_rate": 0.00028417677647790484, + "loss": 7.2872, + "step": 11191 + }, + { + "epoch": 1.0443221050667164, + "grad_norm": 21315020549782.79, + "learning_rate": 0.00028417339496097124, + "loss": 7.1731, + "step": 11192 + }, + { + "epoch": 1.0444154147615938, + "grad_norm": 2.3285823541967814e+17, + "learning_rate": 0.00028417001310287456, + "loss": 7.5685, + "step": 11193 + }, + { + "epoch": 1.044508724456471, + "grad_norm": 8.008101717050831, + "learning_rate": 0.00028416663090362316, + "loss": 7.2514, + "step": 11194 + }, + { + "epoch": 1.0446020341513482, + "grad_norm": 1.8121429363869739, + "learning_rate": 0.00028416324836322586, + "loss": 7.2424, + "step": 11195 + }, + { + "epoch": 1.0446953438462256, + "grad_norm": 5.171015377496352, + "learning_rate": 0.0002841598654816911, + "loss": 7.7913, + "step": 11196 + }, + { + "epoch": 1.0447886535411028, + "grad_norm": 3.6309980943176883, + "learning_rate": 0.00028415648225902756, + "loss": 7.4007, + "step": 11197 + }, + { + "epoch": 1.0448819632359803, + "grad_norm": 3.828491473393545e+16, + "learning_rate": 0.0002841530986952438, + "loss": 7.3045, + "step": 11198 + }, + { + "epoch": 1.0449752729308575, + "grad_norm": 1.8255929844713332, + "learning_rate": 0.00028414971479034847, + "loss": 7.1341, + "step": 11199 + }, + { + "epoch": 1.0450685826257349, + "grad_norm": 1.9539859035949965, + "learning_rate": 0.0002841463305443502, + "loss": 7.1721, + "step": 11200 + }, + { + "epoch": 1.045161892320612, + "grad_norm": 5.9577943049297275, + "learning_rate": 0.00028414294595725744, + "loss": 7.761, + "step": 11201 + }, + { + "epoch": 1.0452552020154895, + "grad_norm": 15.347730354917259, + "learning_rate": 0.00028413956102907897, + "loss": 7.3043, + "step": 11202 + }, + { + "epoch": 1.0453485117103667, + "grad_norm": 52.9888301512829, + "learning_rate": 0.0002841361757598233, + "loss": 7.3564, + "step": 11203 + }, + { + "epoch": 1.045441821405244, + "grad_norm": 15.030365873885895, + "learning_rate": 0.00028413279014949914, + "loss": 7.365, + "step": 11204 + }, + { + "epoch": 1.0455351311001213, + "grad_norm": 6.833657863143583, + "learning_rate": 0.00028412940419811496, + "loss": 7.7807, + "step": 11205 + }, + { + "epoch": 1.0456284407949985, + "grad_norm": 29.369184693703037, + "learning_rate": 0.0002841260179056795, + "loss": 7.6159, + "step": 11206 + }, + { + "epoch": 1.045721750489876, + "grad_norm": 25.041317557228673, + "learning_rate": 0.0002841226312722013, + "loss": 7.539, + "step": 11207 + }, + { + "epoch": 1.0458150601847531, + "grad_norm": 1.4270973121684627, + "learning_rate": 0.0002841192442976889, + "loss": 7.4143, + "step": 11208 + }, + { + "epoch": 1.0459083698796305, + "grad_norm": 1.861146336195246, + "learning_rate": 0.00028411585698215103, + "loss": 7.2068, + "step": 11209 + }, + { + "epoch": 1.0460016795745077, + "grad_norm": 6.182122617824058, + "learning_rate": 0.0002841124693255963, + "loss": 7.5583, + "step": 11210 + }, + { + "epoch": 1.0460949892693852, + "grad_norm": 2.1522032881344253, + "learning_rate": 0.00028410908132803323, + "loss": 7.3661, + "step": 11211 + }, + { + "epoch": 1.0461882989642624, + "grad_norm": 27.441736307041595, + "learning_rate": 0.00028410569298947054, + "loss": 7.6874, + "step": 11212 + }, + { + "epoch": 1.0462816086591398, + "grad_norm": 1.1811792377275296, + "learning_rate": 0.0002841023043099168, + "loss": 7.5213, + "step": 11213 + }, + { + "epoch": 1.046374918354017, + "grad_norm": 1.2083455343315048, + "learning_rate": 0.0002840989152893806, + "loss": 7.3648, + "step": 11214 + }, + { + "epoch": 1.0464682280488944, + "grad_norm": 5.899426334812458, + "learning_rate": 0.00028409552592787053, + "loss": 7.4267, + "step": 11215 + }, + { + "epoch": 1.0465615377437716, + "grad_norm": 3.5961292026797937, + "learning_rate": 0.0002840921362253954, + "loss": 7.3335, + "step": 11216 + }, + { + "epoch": 1.0466548474386488, + "grad_norm": 1.5808525239199671, + "learning_rate": 0.00028408874618196354, + "loss": 7.6704, + "step": 11217 + }, + { + "epoch": 1.0467481571335262, + "grad_norm": 3.916662887915295e+17, + "learning_rate": 0.0002840853557975838, + "loss": 7.185, + "step": 11218 + }, + { + "epoch": 1.0468414668284034, + "grad_norm": 42.314249684216826, + "learning_rate": 0.0002840819650722647, + "loss": 7.5935, + "step": 11219 + }, + { + "epoch": 1.0469347765232808, + "grad_norm": 1.994198857038346, + "learning_rate": 0.0002840785740060149, + "loss": 7.5826, + "step": 11220 + }, + { + "epoch": 1.047028086218158, + "grad_norm": 3.2947122695932412e+16, + "learning_rate": 0.00028407518259884295, + "loss": 7.4219, + "step": 11221 + }, + { + "epoch": 1.0471213959130354, + "grad_norm": 2.747748603989745, + "learning_rate": 0.0002840717908507576, + "loss": 7.2705, + "step": 11222 + }, + { + "epoch": 1.0472147056079126, + "grad_norm": 2.5189443610686115e+17, + "learning_rate": 0.0002840683987617673, + "loss": 7.1483, + "step": 11223 + }, + { + "epoch": 1.04730801530279, + "grad_norm": 3.6474159349865882, + "learning_rate": 0.0002840650063318809, + "loss": 7.0914, + "step": 11224 + }, + { + "epoch": 1.0474013249976672, + "grad_norm": 2.0142115859946058e+17, + "learning_rate": 0.0002840616135611068, + "loss": 7.3251, + "step": 11225 + }, + { + "epoch": 1.0474946346925447, + "grad_norm": 9.860693621788784e+16, + "learning_rate": 0.0002840582204494538, + "loss": 7.3714, + "step": 11226 + }, + { + "epoch": 1.0475879443874219, + "grad_norm": 1.3024323803430364, + "learning_rate": 0.0002840548269969304, + "loss": 7.3194, + "step": 11227 + }, + { + "epoch": 1.047681254082299, + "grad_norm": 1.8484612193169156, + "learning_rate": 0.0002840514332035453, + "loss": 7.5414, + "step": 11228 + }, + { + "epoch": 1.0477745637771765, + "grad_norm": 3.047383472341888e+17, + "learning_rate": 0.00028404803906930716, + "loss": 7.1356, + "step": 11229 + }, + { + "epoch": 1.0478678734720537, + "grad_norm": 6.00605659402999, + "learning_rate": 0.0002840446445942245, + "loss": 7.5017, + "step": 11230 + }, + { + "epoch": 1.047961183166931, + "grad_norm": 2.457640263140126, + "learning_rate": 0.00028404124977830607, + "loss": 7.5356, + "step": 11231 + }, + { + "epoch": 1.0480544928618083, + "grad_norm": 1.8089929407303216, + "learning_rate": 0.00028403785462156043, + "loss": 7.4129, + "step": 11232 + }, + { + "epoch": 1.0481478025566857, + "grad_norm": 3.535113201507669e+18, + "learning_rate": 0.0002840344591239962, + "loss": 7.687, + "step": 11233 + }, + { + "epoch": 1.048241112251563, + "grad_norm": 8.573975421556642e+16, + "learning_rate": 0.0002840310632856221, + "loss": 7.8759, + "step": 11234 + }, + { + "epoch": 1.0483344219464403, + "grad_norm": 4.559809836973902, + "learning_rate": 0.00028402766710644665, + "loss": 6.9495, + "step": 11235 + }, + { + "epoch": 1.0484277316413175, + "grad_norm": 13.7495675488182, + "learning_rate": 0.0002840242705864786, + "loss": 7.3819, + "step": 11236 + }, + { + "epoch": 1.0485210413361947, + "grad_norm": 1.6580929106992537, + "learning_rate": 0.0002840208737257265, + "loss": 7.8531, + "step": 11237 + }, + { + "epoch": 1.0486143510310721, + "grad_norm": 8.89978607804388, + "learning_rate": 0.000284017476524199, + "loss": 7.4737, + "step": 11238 + }, + { + "epoch": 1.0487076607259493, + "grad_norm": 1.9081281967730985, + "learning_rate": 0.0002840140789819048, + "loss": 7.2269, + "step": 11239 + }, + { + "epoch": 1.0488009704208268, + "grad_norm": 1.4831176666237313, + "learning_rate": 0.00028401068109885246, + "loss": 7.5176, + "step": 11240 + }, + { + "epoch": 1.048894280115704, + "grad_norm": 2.099654193985655, + "learning_rate": 0.00028400728287505066, + "loss": 7.2552, + "step": 11241 + }, + { + "epoch": 1.0489875898105814, + "grad_norm": 7.256892524621543, + "learning_rate": 0.000284003884310508, + "loss": 7.4033, + "step": 11242 + }, + { + "epoch": 1.0490808995054586, + "grad_norm": 1.7002467780833723, + "learning_rate": 0.0002840004854052332, + "loss": 7.5252, + "step": 11243 + }, + { + "epoch": 1.049174209200336, + "grad_norm": 3.280594679901517, + "learning_rate": 0.0002839970861592349, + "loss": 7.5415, + "step": 11244 + }, + { + "epoch": 1.0492675188952132, + "grad_norm": 3.266272794970478e+17, + "learning_rate": 0.00028399368657252167, + "loss": 7.4741, + "step": 11245 + }, + { + "epoch": 1.0493608285900906, + "grad_norm": 1.7443974950065952, + "learning_rate": 0.0002839902866451022, + "loss": 7.284, + "step": 11246 + }, + { + "epoch": 1.0494541382849678, + "grad_norm": 1.522786796939001e+18, + "learning_rate": 0.00028398688637698506, + "loss": 7.4828, + "step": 11247 + }, + { + "epoch": 1.049547447979845, + "grad_norm": 4.995633011811303e+18, + "learning_rate": 0.000283983485768179, + "loss": 7.3362, + "step": 11248 + }, + { + "epoch": 1.0496407576747224, + "grad_norm": 0.7390429854567049, + "learning_rate": 0.0002839800848186926, + "loss": 7.685, + "step": 11249 + }, + { + "epoch": 1.0497340673695996, + "grad_norm": 1.6631132607729886, + "learning_rate": 0.0002839766835285346, + "loss": 7.2391, + "step": 11250 + }, + { + "epoch": 1.049827377064477, + "grad_norm": 1.183928995814157, + "learning_rate": 0.00028397328189771353, + "loss": 7.434, + "step": 11251 + }, + { + "epoch": 1.0499206867593542, + "grad_norm": 0.9396142183598933, + "learning_rate": 0.00028396987992623807, + "loss": 7.4167, + "step": 11252 + }, + { + "epoch": 1.0500139964542317, + "grad_norm": 0.911517570598566, + "learning_rate": 0.00028396647761411696, + "loss": 7.2169, + "step": 11253 + }, + { + "epoch": 1.0501073061491089, + "grad_norm": 0.8711133699877042, + "learning_rate": 0.0002839630749613587, + "loss": 7.3525, + "step": 11254 + }, + { + "epoch": 1.0502006158439863, + "grad_norm": 2.140471427266697, + "learning_rate": 0.00028395967196797207, + "loss": 7.9954, + "step": 11255 + }, + { + "epoch": 1.0502939255388635, + "grad_norm": 18445228835688.1, + "learning_rate": 0.00028395626863396563, + "loss": 7.6728, + "step": 11256 + }, + { + "epoch": 1.050387235233741, + "grad_norm": 1.1698502251560465, + "learning_rate": 0.00028395286495934815, + "loss": 7.7655, + "step": 11257 + }, + { + "epoch": 1.050480544928618, + "grad_norm": 2572.031904813032, + "learning_rate": 0.0002839494609441282, + "loss": 7.5744, + "step": 11258 + }, + { + "epoch": 1.0505738546234953, + "grad_norm": 0.5985171557098893, + "learning_rate": 0.0002839460565883144, + "loss": 7.6852, + "step": 11259 + }, + { + "epoch": 1.0506671643183727, + "grad_norm": 84.68603516538711, + "learning_rate": 0.00028394265189191553, + "loss": 7.7065, + "step": 11260 + }, + { + "epoch": 1.05076047401325, + "grad_norm": 1.8582252685171172, + "learning_rate": 0.0002839392468549401, + "loss": 7.287, + "step": 11261 + }, + { + "epoch": 1.0508537837081273, + "grad_norm": 2.169336885822457, + "learning_rate": 0.00028393584147739685, + "loss": 7.1508, + "step": 11262 + }, + { + "epoch": 1.0509470934030045, + "grad_norm": 40.058518114944924, + "learning_rate": 0.00028393243575929446, + "loss": 7.257, + "step": 11263 + }, + { + "epoch": 1.051040403097882, + "grad_norm": 0.5025656727048501, + "learning_rate": 0.00028392902970064156, + "loss": 7.3495, + "step": 11264 + }, + { + "epoch": 1.0511337127927591, + "grad_norm": 0.5844424072136997, + "learning_rate": 0.0002839256233014468, + "loss": 7.1349, + "step": 11265 + }, + { + "epoch": 1.0512270224876366, + "grad_norm": 30.89010444102825, + "learning_rate": 0.00028392221656171884, + "loss": 7.3263, + "step": 11266 + }, + { + "epoch": 1.0513203321825138, + "grad_norm": 1.762657840129399, + "learning_rate": 0.00028391880948146637, + "loss": 7.4983, + "step": 11267 + }, + { + "epoch": 1.051413641877391, + "grad_norm": 1.789298026869448, + "learning_rate": 0.00028391540206069803, + "loss": 7.441, + "step": 11268 + }, + { + "epoch": 1.0515069515722684, + "grad_norm": 0.7865369150963571, + "learning_rate": 0.0002839119942994225, + "loss": 7.1829, + "step": 11269 + }, + { + "epoch": 1.0516002612671456, + "grad_norm": 0.7585974399987027, + "learning_rate": 0.0002839085861976484, + "loss": 7.4685, + "step": 11270 + }, + { + "epoch": 1.051693570962023, + "grad_norm": 0.3846780757569239, + "learning_rate": 0.00028390517775538445, + "loss": 7.3253, + "step": 11271 + }, + { + "epoch": 1.0517868806569002, + "grad_norm": 0.6778058512220361, + "learning_rate": 0.00028390176897263934, + "loss": 7.415, + "step": 11272 + }, + { + "epoch": 1.0518801903517776, + "grad_norm": 0.9170520836998723, + "learning_rate": 0.0002838983598494217, + "loss": 7.3512, + "step": 11273 + }, + { + "epoch": 1.0519735000466548, + "grad_norm": 1.1983488909299016, + "learning_rate": 0.00028389495038574013, + "loss": 7.272, + "step": 11274 + }, + { + "epoch": 1.0520668097415322, + "grad_norm": 165.35670246928782, + "learning_rate": 0.0002838915405816034, + "loss": 7.3791, + "step": 11275 + }, + { + "epoch": 1.0521601194364094, + "grad_norm": 0.718183774792662, + "learning_rate": 0.0002838881304370201, + "loss": 7.5747, + "step": 11276 + }, + { + "epoch": 1.0522534291312868, + "grad_norm": 0.4576102735038305, + "learning_rate": 0.000283884719951999, + "loss": 7.5227, + "step": 11277 + }, + { + "epoch": 1.052346738826164, + "grad_norm": 0.6766083885828104, + "learning_rate": 0.00028388130912654873, + "loss": 7.6293, + "step": 11278 + }, + { + "epoch": 1.0524400485210412, + "grad_norm": 0.3979062647814298, + "learning_rate": 0.00028387789796067785, + "loss": 7.2811, + "step": 11279 + }, + { + "epoch": 1.0525333582159186, + "grad_norm": 1.0046685130629096, + "learning_rate": 0.00028387448645439525, + "loss": 7.6506, + "step": 11280 + }, + { + "epoch": 1.0526266679107958, + "grad_norm": 0.42477530788519163, + "learning_rate": 0.00028387107460770944, + "loss": 7.4, + "step": 11281 + }, + { + "epoch": 1.0527199776056733, + "grad_norm": 0.4809530593180969, + "learning_rate": 0.00028386766242062913, + "loss": 7.3418, + "step": 11282 + }, + { + "epoch": 1.0528132873005505, + "grad_norm": 0.5046109122913359, + "learning_rate": 0.000283864249893163, + "loss": 7.3677, + "step": 11283 + }, + { + "epoch": 1.0529065969954279, + "grad_norm": 0.7009680190995891, + "learning_rate": 0.00028386083702531975, + "loss": 7.6332, + "step": 11284 + }, + { + "epoch": 1.052999906690305, + "grad_norm": 0.33546353569458853, + "learning_rate": 0.0002838574238171081, + "loss": 7.4251, + "step": 11285 + }, + { + "epoch": 1.0530932163851825, + "grad_norm": 0.5731051009254031, + "learning_rate": 0.0002838540102685366, + "loss": 7.5108, + "step": 11286 + }, + { + "epoch": 1.0531865260800597, + "grad_norm": 0.571693871943108, + "learning_rate": 0.00028385059637961403, + "loss": 7.5836, + "step": 11287 + }, + { + "epoch": 1.0532798357749371, + "grad_norm": 0.9084823570855461, + "learning_rate": 0.00028384718215034905, + "loss": 7.1567, + "step": 11288 + }, + { + "epoch": 1.0533731454698143, + "grad_norm": 0.5683774918490698, + "learning_rate": 0.00028384376758075036, + "loss": 7.3586, + "step": 11289 + }, + { + "epoch": 1.0534664551646915, + "grad_norm": 0.523887969331527, + "learning_rate": 0.00028384035267082656, + "loss": 7.6779, + "step": 11290 + }, + { + "epoch": 1.053559764859569, + "grad_norm": 0.3973349765996098, + "learning_rate": 0.0002838369374205864, + "loss": 7.3854, + "step": 11291 + }, + { + "epoch": 1.0536530745544461, + "grad_norm": 0.45759185486440673, + "learning_rate": 0.00028383352183003854, + "loss": 7.3054, + "step": 11292 + }, + { + "epoch": 1.0537463842493235, + "grad_norm": 0.5072521329184791, + "learning_rate": 0.0002838301058991917, + "loss": 7.5523, + "step": 11293 + }, + { + "epoch": 1.0538396939442007, + "grad_norm": 0.4440025906107925, + "learning_rate": 0.0002838266896280546, + "loss": 7.3847, + "step": 11294 + }, + { + "epoch": 1.0539330036390782, + "grad_norm": 0.5238727662522767, + "learning_rate": 0.00028382327301663577, + "loss": 7.3565, + "step": 11295 + }, + { + "epoch": 1.0540263133339554, + "grad_norm": 0.6334888003261594, + "learning_rate": 0.00028381985606494407, + "loss": 7.6028, + "step": 11296 + }, + { + "epoch": 1.0541196230288328, + "grad_norm": 0.4480626101999424, + "learning_rate": 0.0002838164387729881, + "loss": 7.3916, + "step": 11297 + }, + { + "epoch": 1.05421293272371, + "grad_norm": 0.520393386491374, + "learning_rate": 0.0002838130211407765, + "loss": 7.5443, + "step": 11298 + }, + { + "epoch": 1.0543062424185874, + "grad_norm": 0.9763728398171737, + "learning_rate": 0.00028380960316831813, + "loss": 7.0678, + "step": 11299 + }, + { + "epoch": 1.0543995521134646, + "grad_norm": 0.4351947915682174, + "learning_rate": 0.00028380618485562153, + "loss": 7.6178, + "step": 11300 + }, + { + "epoch": 1.0544928618083418, + "grad_norm": 0.6901475580411174, + "learning_rate": 0.0002838027662026954, + "loss": 7.1093, + "step": 11301 + }, + { + "epoch": 1.0545861715032192, + "grad_norm": 0.4572240547532971, + "learning_rate": 0.00028379934720954855, + "loss": 7.6449, + "step": 11302 + }, + { + "epoch": 1.0546794811980964, + "grad_norm": 0.4867373030800969, + "learning_rate": 0.00028379592787618955, + "loss": 7.1154, + "step": 11303 + }, + { + "epoch": 1.0547727908929738, + "grad_norm": 407.89164007098697, + "learning_rate": 0.0002837925082026271, + "loss": 7.543, + "step": 11304 + }, + { + "epoch": 1.054866100587851, + "grad_norm": 0.6480732665178713, + "learning_rate": 0.00028378908818887, + "loss": 7.3944, + "step": 11305 + }, + { + "epoch": 1.0549594102827284, + "grad_norm": 121.57085134967039, + "learning_rate": 0.00028378566783492684, + "loss": 7.3703, + "step": 11306 + }, + { + "epoch": 1.0550527199776056, + "grad_norm": 1.108195384536269, + "learning_rate": 0.0002837822471408064, + "loss": 7.8418, + "step": 11307 + }, + { + "epoch": 1.055146029672483, + "grad_norm": 536.5561338391979, + "learning_rate": 0.0002837788261065173, + "loss": 7.0245, + "step": 11308 + }, + { + "epoch": 1.0552393393673603, + "grad_norm": 149.5484868271839, + "learning_rate": 0.0002837754047320683, + "loss": 7.4045, + "step": 11309 + }, + { + "epoch": 1.0553326490622377, + "grad_norm": 187.97143926373855, + "learning_rate": 0.000283771983017468, + "loss": 7.4151, + "step": 11310 + }, + { + "epoch": 1.0554259587571149, + "grad_norm": 1227.5127127882647, + "learning_rate": 0.0002837685609627252, + "loss": 7.4804, + "step": 11311 + }, + { + "epoch": 1.055519268451992, + "grad_norm": 0.39065088243792645, + "learning_rate": 0.00028376513856784866, + "loss": 7.2905, + "step": 11312 + }, + { + "epoch": 1.0556125781468695, + "grad_norm": 0.5097425523894531, + "learning_rate": 0.0002837617158328469, + "loss": 7.3367, + "step": 11313 + }, + { + "epoch": 1.0557058878417467, + "grad_norm": 33.14702283959881, + "learning_rate": 0.00028375829275772876, + "loss": 7.3702, + "step": 11314 + }, + { + "epoch": 1.055799197536624, + "grad_norm": 65.88110428590973, + "learning_rate": 0.0002837548693425029, + "loss": 7.533, + "step": 11315 + }, + { + "epoch": 1.0558925072315013, + "grad_norm": 292.7867433134499, + "learning_rate": 0.00028375144558717805, + "loss": 7.267, + "step": 11316 + }, + { + "epoch": 1.0559858169263787, + "grad_norm": 0.37654029207027406, + "learning_rate": 0.00028374802149176286, + "loss": 7.417, + "step": 11317 + }, + { + "epoch": 1.056079126621256, + "grad_norm": 0.3678645469210769, + "learning_rate": 0.0002837445970562661, + "loss": 7.2449, + "step": 11318 + }, + { + "epoch": 1.0561724363161333, + "grad_norm": 0.39336286830414746, + "learning_rate": 0.00028374117228069644, + "loss": 7.3927, + "step": 11319 + }, + { + "epoch": 1.0562657460110105, + "grad_norm": 0.41241195229494887, + "learning_rate": 0.00028373774716506256, + "loss": 7.2218, + "step": 11320 + }, + { + "epoch": 1.056359055705888, + "grad_norm": 0.6565135313501842, + "learning_rate": 0.00028373432170937323, + "loss": 7.5556, + "step": 11321 + }, + { + "epoch": 1.0564523654007651, + "grad_norm": 277.039968190204, + "learning_rate": 0.00028373089591363716, + "loss": 8.0872, + "step": 11322 + }, + { + "epoch": 1.0565456750956423, + "grad_norm": 0.5913774675692376, + "learning_rate": 0.00028372746977786304, + "loss": 7.6134, + "step": 11323 + }, + { + "epoch": 1.0566389847905198, + "grad_norm": 0.685902001994002, + "learning_rate": 0.00028372404330205954, + "loss": 7.592, + "step": 11324 + }, + { + "epoch": 1.056732294485397, + "grad_norm": 0.41960356547542926, + "learning_rate": 0.0002837206164862354, + "loss": 7.7104, + "step": 11325 + }, + { + "epoch": 1.0568256041802744, + "grad_norm": 1.213798826216418, + "learning_rate": 0.00028371718933039935, + "loss": 7.2058, + "step": 11326 + }, + { + "epoch": 1.0569189138751516, + "grad_norm": 0.7758912967042499, + "learning_rate": 0.0002837137618345601, + "loss": 7.4116, + "step": 11327 + }, + { + "epoch": 1.057012223570029, + "grad_norm": 1041.4018516650513, + "learning_rate": 0.00028371033399872636, + "loss": 7.3466, + "step": 11328 + }, + { + "epoch": 1.0571055332649062, + "grad_norm": 0.5587881097119346, + "learning_rate": 0.0002837069058229069, + "loss": 7.5971, + "step": 11329 + }, + { + "epoch": 1.0571988429597836, + "grad_norm": 418.505948672861, + "learning_rate": 0.00028370347730711034, + "loss": 7.2913, + "step": 11330 + }, + { + "epoch": 1.0572921526546608, + "grad_norm": 0.9792484477283167, + "learning_rate": 0.00028370004845134543, + "loss": 7.5677, + "step": 11331 + }, + { + "epoch": 1.0573854623495382, + "grad_norm": 0.7987419936569967, + "learning_rate": 0.0002836966192556209, + "loss": 7.5156, + "step": 11332 + }, + { + "epoch": 1.0574787720444154, + "grad_norm": 0.45906356135053633, + "learning_rate": 0.00028369318971994555, + "loss": 7.4326, + "step": 11333 + }, + { + "epoch": 1.0575720817392926, + "grad_norm": 0.6671610123613679, + "learning_rate": 0.000283689759844328, + "loss": 7.4343, + "step": 11334 + }, + { + "epoch": 1.05766539143417, + "grad_norm": 1386.6692636238595, + "learning_rate": 0.00028368632962877686, + "loss": 7.3091, + "step": 11335 + }, + { + "epoch": 1.0577587011290472, + "grad_norm": 0.6917057872444796, + "learning_rate": 0.0002836828990733011, + "loss": 7.524, + "step": 11336 + }, + { + "epoch": 1.0578520108239247, + "grad_norm": 0.46751180234640943, + "learning_rate": 0.0002836794681779093, + "loss": 7.7266, + "step": 11337 + }, + { + "epoch": 1.0579453205188019, + "grad_norm": 1232.3279764281558, + "learning_rate": 0.0002836760369426102, + "loss": 7.3626, + "step": 11338 + }, + { + "epoch": 1.0580386302136793, + "grad_norm": 0.7223670409454688, + "learning_rate": 0.00028367260536741256, + "loss": 7.2988, + "step": 11339 + }, + { + "epoch": 1.0581319399085565, + "grad_norm": 0.43755659164303623, + "learning_rate": 0.00028366917345232516, + "loss": 7.6474, + "step": 11340 + }, + { + "epoch": 1.058225249603434, + "grad_norm": 0.4107836955890602, + "learning_rate": 0.00028366574119735653, + "loss": 7.4194, + "step": 11341 + }, + { + "epoch": 1.058318559298311, + "grad_norm": 0.42714336311281836, + "learning_rate": 0.0002836623086025155, + "loss": 7.1368, + "step": 11342 + }, + { + "epoch": 1.0584118689931883, + "grad_norm": 0.4843803910386245, + "learning_rate": 0.00028365887566781094, + "loss": 7.2937, + "step": 11343 + }, + { + "epoch": 1.0585051786880657, + "grad_norm": 0.40810026637209973, + "learning_rate": 0.0002836554423932513, + "loss": 7.2166, + "step": 11344 + }, + { + "epoch": 1.058598488382943, + "grad_norm": 0.6104986681388546, + "learning_rate": 0.00028365200877884556, + "loss": 7.3562, + "step": 11345 + }, + { + "epoch": 1.0586917980778203, + "grad_norm": 0.9630676565467744, + "learning_rate": 0.00028364857482460237, + "loss": 7.1149, + "step": 11346 + }, + { + "epoch": 1.0587851077726975, + "grad_norm": 0.6412937882120998, + "learning_rate": 0.00028364514053053037, + "loss": 7.202, + "step": 11347 + }, + { + "epoch": 1.058878417467575, + "grad_norm": 0.5049655646098777, + "learning_rate": 0.0002836417058966384, + "loss": 7.3431, + "step": 11348 + }, + { + "epoch": 1.0589717271624521, + "grad_norm": 4726.361865967511, + "learning_rate": 0.0002836382709229352, + "loss": 7.4947, + "step": 11349 + }, + { + "epoch": 1.0590650368573296, + "grad_norm": 48.240580991000115, + "learning_rate": 0.00028363483560942936, + "loss": 7.2314, + "step": 11350 + }, + { + "epoch": 1.0591583465522068, + "grad_norm": 0.7995182448002803, + "learning_rate": 0.0002836313999561298, + "loss": 7.3557, + "step": 11351 + }, + { + "epoch": 1.0592516562470842, + "grad_norm": 0.3578103266898468, + "learning_rate": 0.0002836279639630452, + "loss": 7.5885, + "step": 11352 + }, + { + "epoch": 1.0593449659419614, + "grad_norm": 0.875207159949979, + "learning_rate": 0.00028362452763018423, + "loss": 7.2943, + "step": 11353 + }, + { + "epoch": 1.0594382756368386, + "grad_norm": 770.374545361485, + "learning_rate": 0.00028362109095755563, + "loss": 7.3683, + "step": 11354 + }, + { + "epoch": 1.059531585331716, + "grad_norm": 218.12410165317687, + "learning_rate": 0.0002836176539451682, + "loss": 7.7086, + "step": 11355 + }, + { + "epoch": 1.0596248950265932, + "grad_norm": 5573.421370876728, + "learning_rate": 0.0002836142165930307, + "loss": 7.2178, + "step": 11356 + }, + { + "epoch": 1.0597182047214706, + "grad_norm": 1774.5676305674353, + "learning_rate": 0.00028361077890115175, + "loss": 7.1248, + "step": 11357 + }, + { + "epoch": 1.0598115144163478, + "grad_norm": 0.866891325118189, + "learning_rate": 0.00028360734086954023, + "loss": 7.6493, + "step": 11358 + }, + { + "epoch": 1.0599048241112252, + "grad_norm": 0.6189809768665031, + "learning_rate": 0.00028360390249820486, + "loss": 7.5707, + "step": 11359 + }, + { + "epoch": 1.0599981338061024, + "grad_norm": 0.42587432463993613, + "learning_rate": 0.00028360046378715425, + "loss": 7.3131, + "step": 11360 + }, + { + "epoch": 1.0600914435009798, + "grad_norm": 0.3291951743088087, + "learning_rate": 0.00028359702473639725, + "loss": 7.4783, + "step": 11361 + }, + { + "epoch": 1.060184753195857, + "grad_norm": 1.256793024163186, + "learning_rate": 0.0002835935853459426, + "loss": 7.1008, + "step": 11362 + }, + { + "epoch": 1.0602780628907345, + "grad_norm": 0.5977138653359575, + "learning_rate": 0.00028359014561579906, + "loss": 7.3826, + "step": 11363 + }, + { + "epoch": 1.0603713725856116, + "grad_norm": 0.47277449296545526, + "learning_rate": 0.00028358670554597533, + "loss": 7.2742, + "step": 11364 + }, + { + "epoch": 1.0604646822804888, + "grad_norm": 0.35643070103365415, + "learning_rate": 0.0002835832651364802, + "loss": 7.2699, + "step": 11365 + }, + { + "epoch": 1.0605579919753663, + "grad_norm": 0.9334777242973318, + "learning_rate": 0.0002835798243873224, + "loss": 7.595, + "step": 11366 + }, + { + "epoch": 1.0606513016702435, + "grad_norm": 0.6912051971318898, + "learning_rate": 0.00028357638329851067, + "loss": 7.4264, + "step": 11367 + }, + { + "epoch": 1.0607446113651209, + "grad_norm": 0.7678563191968143, + "learning_rate": 0.00028357294187005374, + "loss": 7.4971, + "step": 11368 + }, + { + "epoch": 1.060837921059998, + "grad_norm": 0.5411056508841479, + "learning_rate": 0.00028356950010196045, + "loss": 7.4999, + "step": 11369 + }, + { + "epoch": 1.0609312307548755, + "grad_norm": 2937.7898694911064, + "learning_rate": 0.00028356605799423936, + "loss": 7.3353, + "step": 11370 + }, + { + "epoch": 1.0610245404497527, + "grad_norm": 0.612871623415193, + "learning_rate": 0.00028356261554689943, + "loss": 7.5737, + "step": 11371 + }, + { + "epoch": 1.0611178501446301, + "grad_norm": 0.7923002979277893, + "learning_rate": 0.00028355917275994937, + "loss": 7.2136, + "step": 11372 + }, + { + "epoch": 1.0612111598395073, + "grad_norm": 0.7349435304629365, + "learning_rate": 0.00028355572963339787, + "loss": 7.4868, + "step": 11373 + }, + { + "epoch": 1.0613044695343845, + "grad_norm": 0.40242061533986184, + "learning_rate": 0.0002835522861672537, + "loss": 7.6797, + "step": 11374 + }, + { + "epoch": 1.061397779229262, + "grad_norm": 0.32607772264018897, + "learning_rate": 0.0002835488423615256, + "loss": 7.2552, + "step": 11375 + }, + { + "epoch": 1.0614910889241391, + "grad_norm": 0.5311321968926938, + "learning_rate": 0.00028354539821622243, + "loss": 7.4289, + "step": 11376 + }, + { + "epoch": 1.0615843986190165, + "grad_norm": 0.8207435746903863, + "learning_rate": 0.0002835419537313528, + "loss": 7.5788, + "step": 11377 + }, + { + "epoch": 1.0616777083138937, + "grad_norm": 0.8233074049453812, + "learning_rate": 0.00028353850890692556, + "loss": 7.5121, + "step": 11378 + }, + { + "epoch": 1.0617710180087712, + "grad_norm": 0.391177440674553, + "learning_rate": 0.00028353506374294946, + "loss": 7.4592, + "step": 11379 + }, + { + "epoch": 1.0618643277036484, + "grad_norm": 0.44810348379042547, + "learning_rate": 0.00028353161823943323, + "loss": 7.3109, + "step": 11380 + }, + { + "epoch": 1.0619576373985258, + "grad_norm": 0.5792598442690766, + "learning_rate": 0.0002835281723963857, + "loss": 7.7855, + "step": 11381 + }, + { + "epoch": 1.062050947093403, + "grad_norm": 0.5830857019245811, + "learning_rate": 0.0002835247262138155, + "loss": 7.5316, + "step": 11382 + }, + { + "epoch": 1.0621442567882804, + "grad_norm": 1.0501903742468717, + "learning_rate": 0.0002835212796917315, + "loss": 7.3676, + "step": 11383 + }, + { + "epoch": 1.0622375664831576, + "grad_norm": 1.1839371953066715, + "learning_rate": 0.00028351783283014247, + "loss": 7.3843, + "step": 11384 + }, + { + "epoch": 1.0623308761780348, + "grad_norm": 0.6670953180110591, + "learning_rate": 0.0002835143856290571, + "loss": 7.2956, + "step": 11385 + }, + { + "epoch": 1.0624241858729122, + "grad_norm": 0.4001822369856271, + "learning_rate": 0.0002835109380884842, + "loss": 7.5393, + "step": 11386 + }, + { + "epoch": 1.0625174955677894, + "grad_norm": 0.9037531914129122, + "learning_rate": 0.0002835074902084326, + "loss": 7.4386, + "step": 11387 + }, + { + "epoch": 1.0626108052626668, + "grad_norm": 1.1976213343608662, + "learning_rate": 0.00028350404198891095, + "loss": 7.6399, + "step": 11388 + }, + { + "epoch": 1.062704114957544, + "grad_norm": 1.2469289875117155, + "learning_rate": 0.0002835005934299281, + "loss": 7.6046, + "step": 11389 + }, + { + "epoch": 1.0627974246524214, + "grad_norm": 3045.6541994498057, + "learning_rate": 0.00028349714453149274, + "loss": 7.2084, + "step": 11390 + }, + { + "epoch": 1.0628907343472986, + "grad_norm": 0.34214569956823854, + "learning_rate": 0.0002834936952936137, + "loss": 7.4727, + "step": 11391 + }, + { + "epoch": 1.062984044042176, + "grad_norm": 5945.055497395789, + "learning_rate": 0.00028349024571629975, + "loss": 7.5787, + "step": 11392 + }, + { + "epoch": 1.0630773537370533, + "grad_norm": 0.8531798333878929, + "learning_rate": 0.00028348679579955966, + "loss": 7.3409, + "step": 11393 + }, + { + "epoch": 1.0631706634319307, + "grad_norm": 1.1778493532738177, + "learning_rate": 0.00028348334554340217, + "loss": 7.3114, + "step": 11394 + }, + { + "epoch": 1.0632639731268079, + "grad_norm": 1465.058792650634, + "learning_rate": 0.0002834798949478361, + "loss": 7.5722, + "step": 11395 + }, + { + "epoch": 1.063357282821685, + "grad_norm": 0.5678647138726942, + "learning_rate": 0.00028347644401287015, + "loss": 7.2527, + "step": 11396 + }, + { + "epoch": 1.0634505925165625, + "grad_norm": 0.5409315904393764, + "learning_rate": 0.00028347299273851314, + "loss": 7.4629, + "step": 11397 + }, + { + "epoch": 1.0635439022114397, + "grad_norm": 0.42812462288660696, + "learning_rate": 0.0002834695411247739, + "loss": 7.1228, + "step": 11398 + }, + { + "epoch": 1.063637211906317, + "grad_norm": 0.7169953726870529, + "learning_rate": 0.0002834660891716611, + "loss": 7.2395, + "step": 11399 + }, + { + "epoch": 1.0637305216011943, + "grad_norm": 0.8951995881610746, + "learning_rate": 0.0002834626368791836, + "loss": 7.602, + "step": 11400 + }, + { + "epoch": 1.0638238312960717, + "grad_norm": 0.38994078854378594, + "learning_rate": 0.0002834591842473502, + "loss": 7.2424, + "step": 11401 + }, + { + "epoch": 1.063917140990949, + "grad_norm": 0.26739473292547516, + "learning_rate": 0.0002834557312761695, + "loss": 7.3198, + "step": 11402 + }, + { + "epoch": 1.0640104506858263, + "grad_norm": 0.838103098750199, + "learning_rate": 0.0002834522779656505, + "loss": 7.8931, + "step": 11403 + }, + { + "epoch": 1.0641037603807035, + "grad_norm": 0.5098996054940024, + "learning_rate": 0.00028344882431580187, + "loss": 7.4382, + "step": 11404 + }, + { + "epoch": 1.064197070075581, + "grad_norm": 0.40201181106638695, + "learning_rate": 0.00028344537032663243, + "loss": 7.4864, + "step": 11405 + }, + { + "epoch": 1.0642903797704582, + "grad_norm": 0.6293711291073173, + "learning_rate": 0.0002834419159981509, + "loss": 7.5396, + "step": 11406 + }, + { + "epoch": 1.0643836894653353, + "grad_norm": 0.3750920157828543, + "learning_rate": 0.00028343846133036617, + "loss": 7.6915, + "step": 11407 + }, + { + "epoch": 1.0644769991602128, + "grad_norm": 1.265960710861353, + "learning_rate": 0.0002834350063232869, + "loss": 7.1242, + "step": 11408 + }, + { + "epoch": 1.06457030885509, + "grad_norm": 0.49613545038901796, + "learning_rate": 0.00028343155097692197, + "loss": 7.3062, + "step": 11409 + }, + { + "epoch": 1.0646636185499674, + "grad_norm": 0.6391748615642278, + "learning_rate": 0.0002834280952912801, + "loss": 7.4931, + "step": 11410 + }, + { + "epoch": 1.0647569282448446, + "grad_norm": 0.45026110045881496, + "learning_rate": 0.00028342463926637015, + "loss": 7.1245, + "step": 11411 + }, + { + "epoch": 1.064850237939722, + "grad_norm": 1.517810453576211, + "learning_rate": 0.00028342118290220077, + "loss": 7.6919, + "step": 11412 + }, + { + "epoch": 1.0649435476345992, + "grad_norm": 0.3214104545152216, + "learning_rate": 0.00028341772619878096, + "loss": 7.2215, + "step": 11413 + }, + { + "epoch": 1.0650368573294766, + "grad_norm": 1654.4812605302955, + "learning_rate": 0.00028341426915611933, + "loss": 7.5692, + "step": 11414 + }, + { + "epoch": 1.0651301670243538, + "grad_norm": 0.34794718339688846, + "learning_rate": 0.00028341081177422477, + "loss": 7.2738, + "step": 11415 + }, + { + "epoch": 1.0652234767192312, + "grad_norm": 0.3909303310577919, + "learning_rate": 0.00028340735405310596, + "loss": 7.469, + "step": 11416 + }, + { + "epoch": 1.0653167864141084, + "grad_norm": 0.5512084799211902, + "learning_rate": 0.0002834038959927719, + "loss": 7.4248, + "step": 11417 + }, + { + "epoch": 1.0654100961089856, + "grad_norm": 0.7354136495220637, + "learning_rate": 0.00028340043759323113, + "loss": 7.6174, + "step": 11418 + }, + { + "epoch": 1.065503405803863, + "grad_norm": 0.5826383807005732, + "learning_rate": 0.0002833969788544926, + "loss": 7.621, + "step": 11419 + }, + { + "epoch": 1.0655967154987402, + "grad_norm": 1.4996664986728017, + "learning_rate": 0.0002833935197765651, + "loss": 6.9715, + "step": 11420 + }, + { + "epoch": 1.0656900251936177, + "grad_norm": 0.5663971720861277, + "learning_rate": 0.00028339006035945735, + "loss": 7.6189, + "step": 11421 + }, + { + "epoch": 1.0657833348884949, + "grad_norm": 0.32989297516995725, + "learning_rate": 0.0002833866006031782, + "loss": 7.346, + "step": 11422 + }, + { + "epoch": 1.0658766445833723, + "grad_norm": 0.7551114604804962, + "learning_rate": 0.0002833831405077364, + "loss": 7.6005, + "step": 11423 + }, + { + "epoch": 1.0659699542782495, + "grad_norm": 6991.304508478121, + "learning_rate": 0.00028337968007314087, + "loss": 7.3052, + "step": 11424 + }, + { + "epoch": 1.066063263973127, + "grad_norm": 0.5802039109098437, + "learning_rate": 0.0002833762192994003, + "loss": 7.5706, + "step": 11425 + }, + { + "epoch": 1.066156573668004, + "grad_norm": 0.38192517484805677, + "learning_rate": 0.00028337275818652343, + "loss": 7.4685, + "step": 11426 + }, + { + "epoch": 1.0662498833628815, + "grad_norm": 24796.121726299258, + "learning_rate": 0.00028336929673451923, + "loss": 7.4215, + "step": 11427 + }, + { + "epoch": 1.0663431930577587, + "grad_norm": 0.4746556319018849, + "learning_rate": 0.00028336583494339637, + "loss": 7.2212, + "step": 11428 + }, + { + "epoch": 1.066436502752636, + "grad_norm": 0.6132562518515674, + "learning_rate": 0.00028336237281316375, + "loss": 7.304, + "step": 11429 + }, + { + "epoch": 1.0665298124475133, + "grad_norm": 0.38394317176946596, + "learning_rate": 0.0002833589103438301, + "loss": 7.3589, + "step": 11430 + }, + { + "epoch": 1.0666231221423905, + "grad_norm": 0.41156843101820856, + "learning_rate": 0.00028335544753540424, + "loss": 7.352, + "step": 11431 + }, + { + "epoch": 1.066716431837268, + "grad_norm": 0.44626778335628897, + "learning_rate": 0.000283351984387895, + "loss": 7.2673, + "step": 11432 + }, + { + "epoch": 1.0668097415321451, + "grad_norm": 0.3905678729238197, + "learning_rate": 0.0002833485209013111, + "loss": 7.3216, + "step": 11433 + }, + { + "epoch": 1.0669030512270226, + "grad_norm": 0.32766988374149236, + "learning_rate": 0.0002833450570756614, + "loss": 7.1491, + "step": 11434 + }, + { + "epoch": 1.0669963609218998, + "grad_norm": 0.31272653430811465, + "learning_rate": 0.0002833415929109548, + "loss": 7.2829, + "step": 11435 + }, + { + "epoch": 1.0670896706167772, + "grad_norm": 0.28083237304109765, + "learning_rate": 0.0002833381284072, + "loss": 7.3778, + "step": 11436 + }, + { + "epoch": 1.0671829803116544, + "grad_norm": 8210.259762122523, + "learning_rate": 0.00028333466356440584, + "loss": 7.414, + "step": 11437 + }, + { + "epoch": 1.0672762900065318, + "grad_norm": 0.4306715371620423, + "learning_rate": 0.00028333119838258114, + "loss": 7.3334, + "step": 11438 + }, + { + "epoch": 1.067369599701409, + "grad_norm": 0.29082791950305653, + "learning_rate": 0.00028332773286173467, + "loss": 7.379, + "step": 11439 + }, + { + "epoch": 1.0674629093962862, + "grad_norm": 1.7103009471414257, + "learning_rate": 0.0002833242670018753, + "loss": 7.3123, + "step": 11440 + }, + { + "epoch": 1.0675562190911636, + "grad_norm": 0.49100539074195637, + "learning_rate": 0.0002833208008030118, + "loss": 7.3981, + "step": 11441 + }, + { + "epoch": 1.0676495287860408, + "grad_norm": 13462.40093218502, + "learning_rate": 0.00028331733426515304, + "loss": 7.4344, + "step": 11442 + }, + { + "epoch": 1.0677428384809182, + "grad_norm": 0.6467239094089396, + "learning_rate": 0.0002833138673883077, + "loss": 7.5521, + "step": 11443 + }, + { + "epoch": 1.0678361481757954, + "grad_norm": 0.6103251689406441, + "learning_rate": 0.0002833104001724847, + "loss": 7.5295, + "step": 11444 + }, + { + "epoch": 1.0679294578706728, + "grad_norm": 0.7441434009512462, + "learning_rate": 0.0002833069326176929, + "loss": 7.2203, + "step": 11445 + }, + { + "epoch": 1.06802276756555, + "grad_norm": 0.3930717816830154, + "learning_rate": 0.000283303464723941, + "loss": 7.4642, + "step": 11446 + }, + { + "epoch": 1.0681160772604275, + "grad_norm": 0.4539163232661355, + "learning_rate": 0.00028329999649123793, + "loss": 7.4464, + "step": 11447 + }, + { + "epoch": 1.0682093869553047, + "grad_norm": 0.37830341279968605, + "learning_rate": 0.0002832965279195924, + "loss": 7.1882, + "step": 11448 + }, + { + "epoch": 1.068302696650182, + "grad_norm": 207287.4498161669, + "learning_rate": 0.00028329305900901333, + "loss": 7.321, + "step": 11449 + }, + { + "epoch": 1.0683960063450593, + "grad_norm": 1.0079976778658406, + "learning_rate": 0.0002832895897595095, + "loss": 7.7008, + "step": 11450 + }, + { + "epoch": 1.0684893160399365, + "grad_norm": 0.6075910559842008, + "learning_rate": 0.0002832861201710897, + "loss": 7.7276, + "step": 11451 + }, + { + "epoch": 1.0685826257348139, + "grad_norm": 0.7377309733544204, + "learning_rate": 0.0002832826502437628, + "loss": 7.6791, + "step": 11452 + }, + { + "epoch": 1.068675935429691, + "grad_norm": 0.4272851057977076, + "learning_rate": 0.00028327917997753754, + "loss": 7.4978, + "step": 11453 + }, + { + "epoch": 1.0687692451245685, + "grad_norm": 0.9025731958449699, + "learning_rate": 0.0002832757093724229, + "loss": 7.4722, + "step": 11454 + }, + { + "epoch": 1.0688625548194457, + "grad_norm": 0.790213082918749, + "learning_rate": 0.00028327223842842755, + "loss": 7.2177, + "step": 11455 + }, + { + "epoch": 1.0689558645143231, + "grad_norm": 0.3098607904016466, + "learning_rate": 0.0002832687671455604, + "loss": 7.4578, + "step": 11456 + }, + { + "epoch": 1.0690491742092003, + "grad_norm": 0.49798677866786906, + "learning_rate": 0.0002832652955238302, + "loss": 7.3267, + "step": 11457 + }, + { + "epoch": 1.0691424839040777, + "grad_norm": 0.3550004107493543, + "learning_rate": 0.0002832618235632459, + "loss": 7.2757, + "step": 11458 + }, + { + "epoch": 1.069235793598955, + "grad_norm": 0.4841950813360875, + "learning_rate": 0.00028325835126381626, + "loss": 6.9457, + "step": 11459 + }, + { + "epoch": 1.0693291032938321, + "grad_norm": 137459.76622869787, + "learning_rate": 0.00028325487862555, + "loss": 7.4056, + "step": 11460 + }, + { + "epoch": 1.0694224129887095, + "grad_norm": 0.8719558647401883, + "learning_rate": 0.00028325140564845617, + "loss": 7.215, + "step": 11461 + }, + { + "epoch": 1.0695157226835867, + "grad_norm": 0.9998635300497937, + "learning_rate": 0.00028324793233254345, + "loss": 7.4232, + "step": 11462 + }, + { + "epoch": 1.0696090323784642, + "grad_norm": 0.6666398809999462, + "learning_rate": 0.0002832444586778207, + "loss": 7.4774, + "step": 11463 + }, + { + "epoch": 1.0697023420733414, + "grad_norm": 0.6581323747641873, + "learning_rate": 0.0002832409846842968, + "loss": 7.2674, + "step": 11464 + }, + { + "epoch": 1.0697956517682188, + "grad_norm": 0.7012603555456511, + "learning_rate": 0.0002832375103519805, + "loss": 7.2989, + "step": 11465 + }, + { + "epoch": 1.069888961463096, + "grad_norm": 39313.60092331595, + "learning_rate": 0.0002832340356808807, + "loss": 6.958, + "step": 11466 + }, + { + "epoch": 1.0699822711579734, + "grad_norm": 1.1558799370983965, + "learning_rate": 0.0002832305606710062, + "loss": 7.2682, + "step": 11467 + }, + { + "epoch": 1.0700755808528506, + "grad_norm": 1.0651487200062215, + "learning_rate": 0.00028322708532236586, + "loss": 7.1842, + "step": 11468 + }, + { + "epoch": 1.0701688905477278, + "grad_norm": 1.160693461377095, + "learning_rate": 0.00028322360963496847, + "loss": 6.8643, + "step": 11469 + }, + { + "epoch": 1.0702622002426052, + "grad_norm": 1.177238895911839, + "learning_rate": 0.00028322013360882294, + "loss": 7.6707, + "step": 11470 + }, + { + "epoch": 1.0703555099374824, + "grad_norm": 60509.71950946754, + "learning_rate": 0.00028321665724393813, + "loss": 7.5193, + "step": 11471 + }, + { + "epoch": 1.0704488196323598, + "grad_norm": 0.6390749778631499, + "learning_rate": 0.00028321318054032275, + "loss": 7.341, + "step": 11472 + }, + { + "epoch": 1.070542129327237, + "grad_norm": 0.3815157014803972, + "learning_rate": 0.0002832097034979857, + "loss": 7.109, + "step": 11473 + }, + { + "epoch": 1.0706354390221144, + "grad_norm": 49226.74452838205, + "learning_rate": 0.00028320622611693587, + "loss": 7.4431, + "step": 11474 + }, + { + "epoch": 1.0707287487169916, + "grad_norm": 203167.00112909405, + "learning_rate": 0.0002832027483971821, + "loss": 7.4548, + "step": 11475 + }, + { + "epoch": 1.070822058411869, + "grad_norm": 0.3759769522533808, + "learning_rate": 0.0002831992703387331, + "loss": 7.5466, + "step": 11476 + }, + { + "epoch": 1.0709153681067463, + "grad_norm": 0.964667017099464, + "learning_rate": 0.0002831957919415979, + "loss": 7.1594, + "step": 11477 + }, + { + "epoch": 1.0710086778016237, + "grad_norm": 0.8814992468623303, + "learning_rate": 0.00028319231320578524, + "loss": 7.7916, + "step": 11478 + }, + { + "epoch": 1.0711019874965009, + "grad_norm": 0.4842528795690518, + "learning_rate": 0.00028318883413130393, + "loss": 7.5411, + "step": 11479 + }, + { + "epoch": 1.071195297191378, + "grad_norm": 6145300.5773566235, + "learning_rate": 0.0002831853547181629, + "loss": 7.3496, + "step": 11480 + }, + { + "epoch": 1.0712886068862555, + "grad_norm": 596185.7592821388, + "learning_rate": 0.000283181874966371, + "loss": 7.2551, + "step": 11481 + }, + { + "epoch": 1.0713819165811327, + "grad_norm": 0.7047463919092644, + "learning_rate": 0.00028317839487593694, + "loss": 7.4834, + "step": 11482 + }, + { + "epoch": 1.07147522627601, + "grad_norm": 0.7482772877018344, + "learning_rate": 0.00028317491444686976, + "loss": 7.8553, + "step": 11483 + }, + { + "epoch": 1.0715685359708873, + "grad_norm": 0.3915930806615736, + "learning_rate": 0.0002831714336791782, + "loss": 7.2517, + "step": 11484 + }, + { + "epoch": 1.0716618456657647, + "grad_norm": 0.523055875309799, + "learning_rate": 0.00028316795257287114, + "loss": 7.3724, + "step": 11485 + }, + { + "epoch": 1.071755155360642, + "grad_norm": 0.6138469619109427, + "learning_rate": 0.0002831644711279574, + "loss": 7.2787, + "step": 11486 + }, + { + "epoch": 1.0718484650555193, + "grad_norm": 1.2378443380487951, + "learning_rate": 0.0002831609893444459, + "loss": 7.6018, + "step": 11487 + }, + { + "epoch": 1.0719417747503965, + "grad_norm": 36387773.33458182, + "learning_rate": 0.0002831575072223455, + "loss": 7.0395, + "step": 11488 + }, + { + "epoch": 1.072035084445274, + "grad_norm": 0.8308667648577105, + "learning_rate": 0.0002831540247616649, + "loss": 7.2695, + "step": 11489 + }, + { + "epoch": 1.0721283941401512, + "grad_norm": 35318164.87022998, + "learning_rate": 0.00028315054196241307, + "loss": 7.2112, + "step": 11490 + }, + { + "epoch": 1.0722217038350284, + "grad_norm": 0.5719292015439589, + "learning_rate": 0.00028314705882459887, + "loss": 7.621, + "step": 11491 + }, + { + "epoch": 1.0723150135299058, + "grad_norm": 0.5218263713669787, + "learning_rate": 0.00028314357534823116, + "loss": 7.2244, + "step": 11492 + }, + { + "epoch": 1.072408323224783, + "grad_norm": 0.42220725729901804, + "learning_rate": 0.00028314009153331877, + "loss": 7.6122, + "step": 11493 + }, + { + "epoch": 1.0725016329196604, + "grad_norm": 0.961109936798534, + "learning_rate": 0.00028313660737987054, + "loss": 7.4516, + "step": 11494 + }, + { + "epoch": 1.0725949426145376, + "grad_norm": 0.913851474823166, + "learning_rate": 0.00028313312288789536, + "loss": 7.1719, + "step": 11495 + }, + { + "epoch": 1.072688252309415, + "grad_norm": 0.4513494284185814, + "learning_rate": 0.0002831296380574021, + "loss": 7.6038, + "step": 11496 + }, + { + "epoch": 1.0727815620042922, + "grad_norm": 0.6672108192055134, + "learning_rate": 0.0002831261528883996, + "loss": 7.3727, + "step": 11497 + }, + { + "epoch": 1.0728748716991696, + "grad_norm": 550276809.0604986, + "learning_rate": 0.0002831226673808967, + "loss": 7.3348, + "step": 11498 + }, + { + "epoch": 1.0729681813940468, + "grad_norm": 0.6904678632467831, + "learning_rate": 0.00028311918153490237, + "loss": 7.1329, + "step": 11499 + }, + { + "epoch": 1.0730614910889242, + "grad_norm": 413689382.4248695, + "learning_rate": 0.00028311569535042537, + "loss": 7.3771, + "step": 11500 + }, + { + "epoch": 1.0731548007838014, + "grad_norm": 0.8711769665097885, + "learning_rate": 0.00028311220882747454, + "loss": 7.6173, + "step": 11501 + }, + { + "epoch": 1.0732481104786786, + "grad_norm": 201164913.59140673, + "learning_rate": 0.00028310872196605884, + "loss": 7.6519, + "step": 11502 + }, + { + "epoch": 1.073341420173556, + "grad_norm": 0.7207238054369779, + "learning_rate": 0.00028310523476618703, + "loss": 7.3872, + "step": 11503 + }, + { + "epoch": 1.0734347298684332, + "grad_norm": 0.5007804233814067, + "learning_rate": 0.0002831017472278681, + "loss": 7.2964, + "step": 11504 + }, + { + "epoch": 1.0735280395633107, + "grad_norm": 1.031568472231363, + "learning_rate": 0.00028309825935111086, + "loss": 7.0521, + "step": 11505 + }, + { + "epoch": 1.0736213492581879, + "grad_norm": 1.6497557050771834, + "learning_rate": 0.0002830947711359241, + "loss": 7.1919, + "step": 11506 + }, + { + "epoch": 1.0737146589530653, + "grad_norm": 94930562.22761399, + "learning_rate": 0.00028309128258231683, + "loss": 7.5996, + "step": 11507 + }, + { + "epoch": 1.0738079686479425, + "grad_norm": 0.5581904448531738, + "learning_rate": 0.00028308779369029785, + "loss": 7.4888, + "step": 11508 + }, + { + "epoch": 1.07390127834282, + "grad_norm": 0.4994359187596234, + "learning_rate": 0.000283084304459876, + "loss": 7.4563, + "step": 11509 + }, + { + "epoch": 1.073994588037697, + "grad_norm": 0.4182790053149311, + "learning_rate": 0.0002830808148910602, + "loss": 7.3582, + "step": 11510 + }, + { + "epoch": 1.0740878977325745, + "grad_norm": 0.8361271426387998, + "learning_rate": 0.00028307732498385934, + "loss": 7.6423, + "step": 11511 + }, + { + "epoch": 1.0741812074274517, + "grad_norm": 0.7717624897318156, + "learning_rate": 0.0002830738347382822, + "loss": 7.6467, + "step": 11512 + }, + { + "epoch": 1.074274517122329, + "grad_norm": 0.6539023342667468, + "learning_rate": 0.00028307034415433774, + "loss": 7.3466, + "step": 11513 + }, + { + "epoch": 1.0743678268172063, + "grad_norm": 0.4756382739585385, + "learning_rate": 0.00028306685323203483, + "loss": 7.592, + "step": 11514 + }, + { + "epoch": 1.0744611365120835, + "grad_norm": 846705203.080527, + "learning_rate": 0.00028306336197138235, + "loss": 7.4118, + "step": 11515 + }, + { + "epoch": 1.074554446206961, + "grad_norm": 773786428.802688, + "learning_rate": 0.00028305987037238907, + "loss": 7.1808, + "step": 11516 + }, + { + "epoch": 1.0746477559018381, + "grad_norm": 88696169.50381264, + "learning_rate": 0.00028305637843506406, + "loss": 7.1575, + "step": 11517 + }, + { + "epoch": 1.0747410655967156, + "grad_norm": 0.6695345287615521, + "learning_rate": 0.00028305288615941603, + "loss": 7.514, + "step": 11518 + }, + { + "epoch": 1.0748343752915928, + "grad_norm": 0.5553367490816359, + "learning_rate": 0.00028304939354545394, + "loss": 7.0606, + "step": 11519 + }, + { + "epoch": 1.0749276849864702, + "grad_norm": 0.937388070026132, + "learning_rate": 0.0002830459005931866, + "loss": 7.5567, + "step": 11520 + }, + { + "epoch": 1.0750209946813474, + "grad_norm": 0.926745265503638, + "learning_rate": 0.00028304240730262306, + "loss": 7.4015, + "step": 11521 + }, + { + "epoch": 1.0751143043762248, + "grad_norm": 0.8523202412641754, + "learning_rate": 0.000283038913673772, + "loss": 7.3424, + "step": 11522 + }, + { + "epoch": 1.075207614071102, + "grad_norm": 0.7190519893437436, + "learning_rate": 0.00028303541970664244, + "loss": 7.4548, + "step": 11523 + }, + { + "epoch": 1.0753009237659792, + "grad_norm": 0.9318791309577954, + "learning_rate": 0.0002830319254012432, + "loss": 7.792, + "step": 11524 + }, + { + "epoch": 1.0753942334608566, + "grad_norm": 1235451466.9661775, + "learning_rate": 0.0002830284307575832, + "loss": 7.3177, + "step": 11525 + }, + { + "epoch": 1.0754875431557338, + "grad_norm": 0.9431182569896419, + "learning_rate": 0.0002830249357756713, + "loss": 7.2228, + "step": 11526 + }, + { + "epoch": 1.0755808528506112, + "grad_norm": 1.148484326490705, + "learning_rate": 0.00028302144045551635, + "loss": 7.0096, + "step": 11527 + }, + { + "epoch": 1.0756741625454884, + "grad_norm": 0.8326539693244345, + "learning_rate": 0.0002830179447971273, + "loss": 7.3006, + "step": 11528 + }, + { + "epoch": 1.0757674722403658, + "grad_norm": 0.8795590747748832, + "learning_rate": 0.000283014448800513, + "loss": 7.6737, + "step": 11529 + }, + { + "epoch": 1.075860781935243, + "grad_norm": 0.9839941093636377, + "learning_rate": 0.00028301095246568243, + "loss": 7.3018, + "step": 11530 + }, + { + "epoch": 1.0759540916301205, + "grad_norm": 1.1798562194466282, + "learning_rate": 0.00028300745579264435, + "loss": 7.4194, + "step": 11531 + }, + { + "epoch": 1.0760474013249977, + "grad_norm": 0.6863913264495071, + "learning_rate": 0.0002830039587814077, + "loss": 7.1581, + "step": 11532 + }, + { + "epoch": 1.076140711019875, + "grad_norm": 1.0341747958162337, + "learning_rate": 0.00028300046143198143, + "loss": 7.5686, + "step": 11533 + }, + { + "epoch": 1.0762340207147523, + "grad_norm": 0.7400497003349547, + "learning_rate": 0.0002829969637443744, + "loss": 7.5109, + "step": 11534 + }, + { + "epoch": 1.0763273304096295, + "grad_norm": 1.2694284609765958, + "learning_rate": 0.0002829934657185954, + "loss": 7.1472, + "step": 11535 + }, + { + "epoch": 1.0764206401045069, + "grad_norm": 0.5946776166812731, + "learning_rate": 0.0002829899673546535, + "loss": 7.5404, + "step": 11536 + }, + { + "epoch": 1.076513949799384, + "grad_norm": 0.8354552307736282, + "learning_rate": 0.00028298646865255745, + "loss": 7.4758, + "step": 11537 + }, + { + "epoch": 1.0766072594942615, + "grad_norm": 1.320868434398483, + "learning_rate": 0.00028298296961231625, + "loss": 7.4744, + "step": 11538 + }, + { + "epoch": 1.0767005691891387, + "grad_norm": 0.5253229120747107, + "learning_rate": 0.0002829794702339387, + "loss": 7.429, + "step": 11539 + }, + { + "epoch": 1.0767938788840161, + "grad_norm": 1.1156796195657164, + "learning_rate": 0.00028297597051743376, + "loss": 7.0164, + "step": 11540 + }, + { + "epoch": 1.0768871885788933, + "grad_norm": 0.9259301083194945, + "learning_rate": 0.0002829724704628103, + "loss": 7.5807, + "step": 11541 + }, + { + "epoch": 1.0769804982737707, + "grad_norm": 0.7023912905883856, + "learning_rate": 0.0002829689700700773, + "loss": 7.0963, + "step": 11542 + }, + { + "epoch": 1.077073807968648, + "grad_norm": 0.5694908644371319, + "learning_rate": 0.00028296546933924356, + "loss": 7.2561, + "step": 11543 + }, + { + "epoch": 1.0771671176635254, + "grad_norm": 2.5645757676374084, + "learning_rate": 0.000282961968270318, + "loss": 7.5859, + "step": 11544 + }, + { + "epoch": 1.0772604273584026, + "grad_norm": 1.334463767900791, + "learning_rate": 0.0002829584668633096, + "loss": 7.5813, + "step": 11545 + }, + { + "epoch": 1.0773537370532797, + "grad_norm": 0.8514154487177015, + "learning_rate": 0.0002829549651182272, + "loss": 7.4942, + "step": 11546 + }, + { + "epoch": 1.0774470467481572, + "grad_norm": 0.7520142161369696, + "learning_rate": 0.0002829514630350796, + "loss": 7.6329, + "step": 11547 + }, + { + "epoch": 1.0775403564430344, + "grad_norm": 1.05136239208301, + "learning_rate": 0.00028294796061387585, + "loss": 7.2819, + "step": 11548 + }, + { + "epoch": 1.0776336661379118, + "grad_norm": 1.243463816921631, + "learning_rate": 0.0002829444578546249, + "loss": 7.176, + "step": 11549 + }, + { + "epoch": 1.077726975832789, + "grad_norm": 1.5455122546236453, + "learning_rate": 0.00028294095475733555, + "loss": 7.1575, + "step": 11550 + }, + { + "epoch": 1.0778202855276664, + "grad_norm": 0.9095925745277132, + "learning_rate": 0.0002829374513220167, + "loss": 7.2866, + "step": 11551 + }, + { + "epoch": 1.0779135952225436, + "grad_norm": 0.5848657733640065, + "learning_rate": 0.00028293394754867735, + "loss": 7.5451, + "step": 11552 + }, + { + "epoch": 1.078006904917421, + "grad_norm": 9713739.182221167, + "learning_rate": 0.0002829304434373262, + "loss": 7.3088, + "step": 11553 + }, + { + "epoch": 1.0781002146122982, + "grad_norm": 0.934839673968972, + "learning_rate": 0.00028292693898797247, + "loss": 7.2696, + "step": 11554 + }, + { + "epoch": 1.0781935243071756, + "grad_norm": 1.0741929715408038, + "learning_rate": 0.00028292343420062486, + "loss": 7.2992, + "step": 11555 + }, + { + "epoch": 1.0782868340020528, + "grad_norm": 1.1323023627119662, + "learning_rate": 0.0002829199290752923, + "loss": 7.384, + "step": 11556 + }, + { + "epoch": 1.07838014369693, + "grad_norm": 1.5180359945039044, + "learning_rate": 0.0002829164236119838, + "loss": 7.481, + "step": 11557 + }, + { + "epoch": 1.0784734533918074, + "grad_norm": 0.6700216491670442, + "learning_rate": 0.00028291291781070815, + "loss": 7.5006, + "step": 11558 + }, + { + "epoch": 1.0785667630866846, + "grad_norm": 1.0288250815143472, + "learning_rate": 0.0002829094116714743, + "loss": 7.3039, + "step": 11559 + }, + { + "epoch": 1.078660072781562, + "grad_norm": 0.7244964532511767, + "learning_rate": 0.00028290590519429123, + "loss": 7.5367, + "step": 11560 + }, + { + "epoch": 1.0787533824764393, + "grad_norm": 1.550584234357481, + "learning_rate": 0.0002829023983791678, + "loss": 7.2968, + "step": 11561 + }, + { + "epoch": 1.0788466921713167, + "grad_norm": 1.3345379242993225, + "learning_rate": 0.00028289889122611294, + "loss": 7.2715, + "step": 11562 + }, + { + "epoch": 1.0789400018661939, + "grad_norm": 2236051.1904399097, + "learning_rate": 0.0002828953837351356, + "loss": 7.2779, + "step": 11563 + }, + { + "epoch": 1.0790333115610713, + "grad_norm": 0.5366712468745535, + "learning_rate": 0.00028289187590624463, + "loss": 7.2636, + "step": 11564 + }, + { + "epoch": 1.0791266212559485, + "grad_norm": 0.9841599823047859, + "learning_rate": 0.000282888367739449, + "loss": 7.2693, + "step": 11565 + }, + { + "epoch": 1.0792199309508257, + "grad_norm": 24716142.19044787, + "learning_rate": 0.0002828848592347576, + "loss": 7.3204, + "step": 11566 + }, + { + "epoch": 1.079313240645703, + "grad_norm": 1.5519247468472162, + "learning_rate": 0.0002828813503921794, + "loss": 7.6889, + "step": 11567 + }, + { + "epoch": 1.0794065503405803, + "grad_norm": 1.1962817488857003, + "learning_rate": 0.0002828778412117233, + "loss": 7.3342, + "step": 11568 + }, + { + "epoch": 1.0794998600354577, + "grad_norm": 0.7517563992287043, + "learning_rate": 0.0002828743316933981, + "loss": 7.3861, + "step": 11569 + }, + { + "epoch": 1.079593169730335, + "grad_norm": 0.8300381064240926, + "learning_rate": 0.00028287082183721293, + "loss": 7.7146, + "step": 11570 + }, + { + "epoch": 1.0796864794252123, + "grad_norm": 0.7785359971816384, + "learning_rate": 0.00028286731164317656, + "loss": 7.2835, + "step": 11571 + }, + { + "epoch": 1.0797797891200895, + "grad_norm": 1.1640736528180566, + "learning_rate": 0.00028286380111129807, + "loss": 7.3411, + "step": 11572 + }, + { + "epoch": 1.079873098814967, + "grad_norm": 0.9614299671134992, + "learning_rate": 0.0002828602902415862, + "loss": 7.5815, + "step": 11573 + }, + { + "epoch": 1.0799664085098442, + "grad_norm": 10596699.075855153, + "learning_rate": 0.00028285677903405, + "loss": 7.3186, + "step": 11574 + }, + { + "epoch": 1.0800597182047214, + "grad_norm": 1.0024030226743201, + "learning_rate": 0.00028285326748869836, + "loss": 7.5052, + "step": 11575 + }, + { + "epoch": 1.0801530278995988, + "grad_norm": 0.576622342088471, + "learning_rate": 0.0002828497556055402, + "loss": 7.477, + "step": 11576 + }, + { + "epoch": 1.080246337594476, + "grad_norm": 0.9086497857801753, + "learning_rate": 0.0002828462433845845, + "loss": 7.2139, + "step": 11577 + }, + { + "epoch": 1.0803396472893534, + "grad_norm": 0.7443691045725938, + "learning_rate": 0.00028284273082584017, + "loss": 7.6082, + "step": 11578 + }, + { + "epoch": 1.0804329569842306, + "grad_norm": 0.4335675242259084, + "learning_rate": 0.0002828392179293161, + "loss": 7.3788, + "step": 11579 + }, + { + "epoch": 1.080526266679108, + "grad_norm": 0.48846602464964456, + "learning_rate": 0.00028283570469502125, + "loss": 7.1855, + "step": 11580 + }, + { + "epoch": 1.0806195763739852, + "grad_norm": 0.7317602259276739, + "learning_rate": 0.0002828321911229645, + "loss": 7.6302, + "step": 11581 + }, + { + "epoch": 1.0807128860688626, + "grad_norm": 5957151.665522472, + "learning_rate": 0.00028282867721315496, + "loss": 7.2767, + "step": 11582 + }, + { + "epoch": 1.0808061957637398, + "grad_norm": 0.6110452469949255, + "learning_rate": 0.0002828251629656013, + "loss": 7.4999, + "step": 11583 + }, + { + "epoch": 1.0808995054586172, + "grad_norm": 0.6789877073829798, + "learning_rate": 0.0002828216483803127, + "loss": 7.0598, + "step": 11584 + }, + { + "epoch": 1.0809928151534944, + "grad_norm": 2142667.813902723, + "learning_rate": 0.00028281813345729795, + "loss": 6.9958, + "step": 11585 + }, + { + "epoch": 1.0810861248483716, + "grad_norm": 0.9805319388585689, + "learning_rate": 0.00028281461819656603, + "loss": 7.5999, + "step": 11586 + }, + { + "epoch": 1.081179434543249, + "grad_norm": 0.42485347777830895, + "learning_rate": 0.00028281110259812585, + "loss": 7.1441, + "step": 11587 + }, + { + "epoch": 1.0812727442381262, + "grad_norm": 0.8801385347920995, + "learning_rate": 0.00028280758666198645, + "loss": 7.5935, + "step": 11588 + }, + { + "epoch": 1.0813660539330037, + "grad_norm": 4.6351444055263515, + "learning_rate": 0.00028280407038815667, + "loss": 7.1515, + "step": 11589 + }, + { + "epoch": 1.0814593636278809, + "grad_norm": 0.4879754379284064, + "learning_rate": 0.00028280055377664546, + "loss": 7.3088, + "step": 11590 + }, + { + "epoch": 1.0815526733227583, + "grad_norm": 144319.92000872947, + "learning_rate": 0.0002827970368274618, + "loss": 7.5422, + "step": 11591 + }, + { + "epoch": 1.0816459830176355, + "grad_norm": 0.6482245787114939, + "learning_rate": 0.00028279351954061456, + "loss": 7.372, + "step": 11592 + }, + { + "epoch": 1.081739292712513, + "grad_norm": 6093797.154574395, + "learning_rate": 0.0002827900019161128, + "loss": 7.0202, + "step": 11593 + }, + { + "epoch": 1.08183260240739, + "grad_norm": 0.5679462262598743, + "learning_rate": 0.0002827864839539653, + "loss": 7.3227, + "step": 11594 + }, + { + "epoch": 1.0819259121022675, + "grad_norm": 0.6078554575884701, + "learning_rate": 0.0002827829656541812, + "loss": 7.7097, + "step": 11595 + }, + { + "epoch": 1.0820192217971447, + "grad_norm": 0.48659836103089105, + "learning_rate": 0.0002827794470167694, + "loss": 7.2355, + "step": 11596 + }, + { + "epoch": 1.082112531492022, + "grad_norm": 3576704.068196052, + "learning_rate": 0.0002827759280417387, + "loss": 7.2517, + "step": 11597 + }, + { + "epoch": 1.0822058411868993, + "grad_norm": 0.8114847451789948, + "learning_rate": 0.0002827724087290982, + "loss": 7.3311, + "step": 11598 + }, + { + "epoch": 1.0822991508817765, + "grad_norm": 0.5900723460634778, + "learning_rate": 0.0002827688890788568, + "loss": 7.2108, + "step": 11599 + }, + { + "epoch": 1.082392460576654, + "grad_norm": 0.8688379459572326, + "learning_rate": 0.0002827653690910234, + "loss": 7.5676, + "step": 11600 + }, + { + "epoch": 1.0824857702715311, + "grad_norm": 2554488.7031856566, + "learning_rate": 0.00028276184876560703, + "loss": 7.4325, + "step": 11601 + }, + { + "epoch": 1.0825790799664086, + "grad_norm": 0.6423384764853908, + "learning_rate": 0.00028275832810261656, + "loss": 7.365, + "step": 11602 + }, + { + "epoch": 1.0826723896612858, + "grad_norm": 2354990.3473268147, + "learning_rate": 0.00028275480710206104, + "loss": 7.0939, + "step": 11603 + }, + { + "epoch": 1.0827656993561632, + "grad_norm": 0.8323163940497198, + "learning_rate": 0.0002827512857639493, + "loss": 7.2548, + "step": 11604 + }, + { + "epoch": 1.0828590090510404, + "grad_norm": 0.8491357840311189, + "learning_rate": 0.00028274776408829044, + "loss": 7.3616, + "step": 11605 + }, + { + "epoch": 1.0829523187459178, + "grad_norm": 0.590012696063715, + "learning_rate": 0.0002827442420750933, + "loss": 7.321, + "step": 11606 + }, + { + "epoch": 1.083045628440795, + "grad_norm": 0.8639275188454113, + "learning_rate": 0.0002827407197243669, + "loss": 7.7402, + "step": 11607 + }, + { + "epoch": 1.0831389381356722, + "grad_norm": 0.4275906390353007, + "learning_rate": 0.00028273719703612014, + "loss": 7.3465, + "step": 11608 + }, + { + "epoch": 1.0832322478305496, + "grad_norm": 0.3957264971964282, + "learning_rate": 0.00028273367401036203, + "loss": 7.3749, + "step": 11609 + }, + { + "epoch": 1.0833255575254268, + "grad_norm": 0.7608015391618769, + "learning_rate": 0.0002827301506471015, + "loss": 7.6022, + "step": 11610 + }, + { + "epoch": 1.0834188672203042, + "grad_norm": 0.47755553143456686, + "learning_rate": 0.0002827266269463475, + "loss": 7.3682, + "step": 11611 + }, + { + "epoch": 1.0835121769151814, + "grad_norm": 1.1403135426650495, + "learning_rate": 0.000282723102908109, + "loss": 7.2, + "step": 11612 + }, + { + "epoch": 1.0836054866100588, + "grad_norm": 1011597.0340158086, + "learning_rate": 0.00028271957853239497, + "loss": 7.2699, + "step": 11613 + }, + { + "epoch": 1.083698796304936, + "grad_norm": 1320730.1036556503, + "learning_rate": 0.00028271605381921435, + "loss": 7.1666, + "step": 11614 + }, + { + "epoch": 1.0837921059998135, + "grad_norm": 1.3213188387034098, + "learning_rate": 0.00028271252876857614, + "loss": 7.637, + "step": 11615 + }, + { + "epoch": 1.0838854156946907, + "grad_norm": 0.7505351520542491, + "learning_rate": 0.0002827090033804893, + "loss": 7.3332, + "step": 11616 + }, + { + "epoch": 1.083978725389568, + "grad_norm": 23900141.488641176, + "learning_rate": 0.00028270547765496274, + "loss": 7.1932, + "step": 11617 + }, + { + "epoch": 1.0840720350844453, + "grad_norm": 704767.628214178, + "learning_rate": 0.0002827019515920055, + "loss": 7.3002, + "step": 11618 + }, + { + "epoch": 1.0841653447793225, + "grad_norm": 0.5706731239672669, + "learning_rate": 0.0002826984251916265, + "loss": 7.6091, + "step": 11619 + }, + { + "epoch": 1.0842586544742, + "grad_norm": 0.37809320965047977, + "learning_rate": 0.0002826948984538347, + "loss": 7.4805, + "step": 11620 + }, + { + "epoch": 1.084351964169077, + "grad_norm": 0.5909654089877615, + "learning_rate": 0.00028269137137863906, + "loss": 7.5248, + "step": 11621 + }, + { + "epoch": 1.0844452738639545, + "grad_norm": 4562661.399846689, + "learning_rate": 0.00028268784396604863, + "loss": 7.1906, + "step": 11622 + }, + { + "epoch": 1.0845385835588317, + "grad_norm": 0.9348017655404491, + "learning_rate": 0.00028268431621607227, + "loss": 7.0832, + "step": 11623 + }, + { + "epoch": 1.0846318932537091, + "grad_norm": 1.1028255068164834, + "learning_rate": 0.000282680788128719, + "loss": 7.3206, + "step": 11624 + }, + { + "epoch": 1.0847252029485863, + "grad_norm": 0.48121255696450493, + "learning_rate": 0.0002826772597039978, + "loss": 7.6609, + "step": 11625 + }, + { + "epoch": 1.0848185126434637, + "grad_norm": 0.48551187060011575, + "learning_rate": 0.0002826737309419176, + "loss": 7.1997, + "step": 11626 + }, + { + "epoch": 1.084911822338341, + "grad_norm": 0.9395806838743674, + "learning_rate": 0.00028267020184248744, + "loss": 7.4355, + "step": 11627 + }, + { + "epoch": 1.0850051320332184, + "grad_norm": 1.245750478350209, + "learning_rate": 0.0002826666724057162, + "loss": 7.3616, + "step": 11628 + }, + { + "epoch": 1.0850984417280956, + "grad_norm": 2519421.6218213956, + "learning_rate": 0.000282663142631613, + "loss": 6.8168, + "step": 11629 + }, + { + "epoch": 1.0851917514229728, + "grad_norm": 0.5823401527536766, + "learning_rate": 0.0002826596125201867, + "loss": 7.294, + "step": 11630 + }, + { + "epoch": 1.0852850611178502, + "grad_norm": 0.45207294068441406, + "learning_rate": 0.00028265608207144626, + "loss": 7.4087, + "step": 11631 + }, + { + "epoch": 1.0853783708127274, + "grad_norm": 0.7460026193795423, + "learning_rate": 0.00028265255128540073, + "loss": 7.2427, + "step": 11632 + }, + { + "epoch": 1.0854716805076048, + "grad_norm": 1.1795919254341312, + "learning_rate": 0.00028264902016205903, + "loss": 7.0221, + "step": 11633 + }, + { + "epoch": 1.085564990202482, + "grad_norm": 0.8708332730732935, + "learning_rate": 0.0002826454887014302, + "loss": 7.2879, + "step": 11634 + }, + { + "epoch": 1.0856582998973594, + "grad_norm": 1.0478150436638207, + "learning_rate": 0.0002826419569035232, + "loss": 7.7422, + "step": 11635 + }, + { + "epoch": 1.0857516095922366, + "grad_norm": 0.4990519486951501, + "learning_rate": 0.00028263842476834695, + "loss": 7.2484, + "step": 11636 + }, + { + "epoch": 1.085844919287114, + "grad_norm": 0.398370330723795, + "learning_rate": 0.0002826348922959105, + "loss": 7.3878, + "step": 11637 + }, + { + "epoch": 1.0859382289819912, + "grad_norm": 8893142.150337206, + "learning_rate": 0.0002826313594862228, + "loss": 7.1189, + "step": 11638 + }, + { + "epoch": 1.0860315386768686, + "grad_norm": 153600.45054796996, + "learning_rate": 0.0002826278263392928, + "loss": 7.5867, + "step": 11639 + }, + { + "epoch": 1.0861248483717458, + "grad_norm": 1708765.6382781507, + "learning_rate": 0.0002826242928551296, + "loss": 7.4659, + "step": 11640 + }, + { + "epoch": 1.086218158066623, + "grad_norm": 0.42667531613659604, + "learning_rate": 0.0002826207590337421, + "loss": 7.3835, + "step": 11641 + }, + { + "epoch": 1.0863114677615004, + "grad_norm": 4704788.6751879, + "learning_rate": 0.00028261722487513927, + "loss": 7.3449, + "step": 11642 + }, + { + "epoch": 1.0864047774563776, + "grad_norm": 1.1340474392171787, + "learning_rate": 0.00028261369037933007, + "loss": 7.0389, + "step": 11643 + }, + { + "epoch": 1.086498087151255, + "grad_norm": 0.4077150584231385, + "learning_rate": 0.0002826101555463236, + "loss": 7.3686, + "step": 11644 + }, + { + "epoch": 1.0865913968461323, + "grad_norm": 0.3042935562197383, + "learning_rate": 0.0002826066203761288, + "loss": 7.4287, + "step": 11645 + }, + { + "epoch": 1.0866847065410097, + "grad_norm": 0.4436493749312439, + "learning_rate": 0.0002826030848687546, + "loss": 7.1778, + "step": 11646 + }, + { + "epoch": 1.0867780162358869, + "grad_norm": 0.5481102922880653, + "learning_rate": 0.00028259954902421007, + "loss": 7.3219, + "step": 11647 + }, + { + "epoch": 1.0868713259307643, + "grad_norm": 0.5485953293852461, + "learning_rate": 0.0002825960128425041, + "loss": 7.1704, + "step": 11648 + }, + { + "epoch": 1.0869646356256415, + "grad_norm": 0.44220633259459097, + "learning_rate": 0.00028259247632364586, + "loss": 7.4654, + "step": 11649 + }, + { + "epoch": 1.087057945320519, + "grad_norm": 0.3144432798264761, + "learning_rate": 0.00028258893946764415, + "loss": 7.3562, + "step": 11650 + }, + { + "epoch": 1.0871512550153961, + "grad_norm": 0.45142305671306293, + "learning_rate": 0.00028258540227450805, + "loss": 7.2544, + "step": 11651 + }, + { + "epoch": 1.0872445647102733, + "grad_norm": 1488865.207763434, + "learning_rate": 0.0002825818647442466, + "loss": 7.0672, + "step": 11652 + }, + { + "epoch": 1.0873378744051507, + "grad_norm": 0.5818261527499606, + "learning_rate": 0.00028257832687686863, + "loss": 7.3683, + "step": 11653 + }, + { + "epoch": 1.087431184100028, + "grad_norm": 0.39578826287846625, + "learning_rate": 0.0002825747886723833, + "loss": 7.334, + "step": 11654 + }, + { + "epoch": 1.0875244937949053, + "grad_norm": 0.8418495220481732, + "learning_rate": 0.0002825712501307996, + "loss": 7.5305, + "step": 11655 + }, + { + "epoch": 1.0876178034897825, + "grad_norm": 0.7691188345960134, + "learning_rate": 0.0002825677112521265, + "loss": 7.0068, + "step": 11656 + }, + { + "epoch": 1.08771111318466, + "grad_norm": 8675842.827500343, + "learning_rate": 0.00028256417203637285, + "loss": 6.9383, + "step": 11657 + }, + { + "epoch": 1.0878044228795372, + "grad_norm": 0.5816505486512994, + "learning_rate": 0.0002825606324835479, + "loss": 7.345, + "step": 11658 + }, + { + "epoch": 1.0878977325744146, + "grad_norm": 0.7611537333622427, + "learning_rate": 0.0002825570925936605, + "loss": 7.6507, + "step": 11659 + }, + { + "epoch": 1.0879910422692918, + "grad_norm": 0.824991885393391, + "learning_rate": 0.0002825535523667197, + "loss": 7.4977, + "step": 11660 + }, + { + "epoch": 1.0880843519641692, + "grad_norm": 0.42066031271872906, + "learning_rate": 0.0002825500118027344, + "loss": 7.2572, + "step": 11661 + }, + { + "epoch": 1.0881776616590464, + "grad_norm": 759493.4003019998, + "learning_rate": 0.0002825464709017138, + "loss": 7.228, + "step": 11662 + }, + { + "epoch": 1.0882709713539236, + "grad_norm": 0.3963258154847118, + "learning_rate": 0.0002825429296636667, + "loss": 7.2346, + "step": 11663 + }, + { + "epoch": 1.088364281048801, + "grad_norm": 299828.6855628243, + "learning_rate": 0.00028253938808860216, + "loss": 7.7241, + "step": 11664 + }, + { + "epoch": 1.0884575907436782, + "grad_norm": 1.4461120898636113, + "learning_rate": 0.00028253584617652934, + "loss": 7.1324, + "step": 11665 + }, + { + "epoch": 1.0885509004385556, + "grad_norm": 4.8399525591690535, + "learning_rate": 0.00028253230392745704, + "loss": 7.1201, + "step": 11666 + }, + { + "epoch": 1.0886442101334328, + "grad_norm": 0.9968352385326854, + "learning_rate": 0.0002825287613413943, + "loss": 7.521, + "step": 11667 + }, + { + "epoch": 1.0887375198283102, + "grad_norm": 0.8929487784629087, + "learning_rate": 0.0002825252184183503, + "loss": 7.508, + "step": 11668 + }, + { + "epoch": 1.0888308295231874, + "grad_norm": 0.9467795154309858, + "learning_rate": 0.00028252167515833385, + "loss": 7.6019, + "step": 11669 + }, + { + "epoch": 1.0889241392180649, + "grad_norm": 0.6991830906462617, + "learning_rate": 0.00028251813156135404, + "loss": 7.4989, + "step": 11670 + }, + { + "epoch": 1.089017448912942, + "grad_norm": 1.065949378441074, + "learning_rate": 0.0002825145876274199, + "loss": 7.7545, + "step": 11671 + }, + { + "epoch": 1.0891107586078193, + "grad_norm": 0.8192600012784352, + "learning_rate": 0.0002825110433565404, + "loss": 7.2889, + "step": 11672 + }, + { + "epoch": 1.0892040683026967, + "grad_norm": 0.8671222064220738, + "learning_rate": 0.0002825074987487246, + "loss": 7.4777, + "step": 11673 + }, + { + "epoch": 1.0892973779975739, + "grad_norm": 0.97545409224739, + "learning_rate": 0.00028250395380398146, + "loss": 7.2739, + "step": 11674 + }, + { + "epoch": 1.0893906876924513, + "grad_norm": 214217.913701958, + "learning_rate": 0.00028250040852232, + "loss": 7.3731, + "step": 11675 + }, + { + "epoch": 1.0894839973873285, + "grad_norm": 0.4791484794353646, + "learning_rate": 0.00028249686290374924, + "loss": 7.4086, + "step": 11676 + }, + { + "epoch": 1.089577307082206, + "grad_norm": 0.39843726691106357, + "learning_rate": 0.00028249331694827826, + "loss": 7.2338, + "step": 11677 + }, + { + "epoch": 1.089670616777083, + "grad_norm": 0.4711177297382127, + "learning_rate": 0.000282489770655916, + "loss": 7.2584, + "step": 11678 + }, + { + "epoch": 1.0897639264719605, + "grad_norm": 1.417527562906898, + "learning_rate": 0.0002824862240266715, + "loss": 7.6692, + "step": 11679 + }, + { + "epoch": 1.0898572361668377, + "grad_norm": 0.6621498366962433, + "learning_rate": 0.00028248267706055375, + "loss": 7.3134, + "step": 11680 + }, + { + "epoch": 1.089950545861715, + "grad_norm": 0.7094698999286624, + "learning_rate": 0.00028247912975757185, + "loss": 7.6444, + "step": 11681 + }, + { + "epoch": 1.0900438555565923, + "grad_norm": 148632.35902613742, + "learning_rate": 0.00028247558211773473, + "loss": 7.2029, + "step": 11682 + }, + { + "epoch": 1.0901371652514695, + "grad_norm": 8065484.94202261, + "learning_rate": 0.00028247203414105145, + "loss": 7.161, + "step": 11683 + }, + { + "epoch": 1.090230474946347, + "grad_norm": 1.6555921263790327, + "learning_rate": 0.00028246848582753106, + "loss": 7.2045, + "step": 11684 + }, + { + "epoch": 1.0903237846412241, + "grad_norm": 1.5562640107764927, + "learning_rate": 0.0002824649371771825, + "loss": 7.369, + "step": 11685 + }, + { + "epoch": 1.0904170943361016, + "grad_norm": 1.2850343069677443, + "learning_rate": 0.0002824613881900149, + "loss": 7.2743, + "step": 11686 + }, + { + "epoch": 1.0905104040309788, + "grad_norm": 1.039962703547439, + "learning_rate": 0.00028245783886603714, + "loss": 7.1908, + "step": 11687 + }, + { + "epoch": 1.0906037137258562, + "grad_norm": 0.5421585114391007, + "learning_rate": 0.0002824542892052584, + "loss": 7.4248, + "step": 11688 + }, + { + "epoch": 1.0906970234207334, + "grad_norm": 0.40800287859889517, + "learning_rate": 0.00028245073920768764, + "loss": 6.9658, + "step": 11689 + }, + { + "epoch": 1.0907903331156108, + "grad_norm": 7809468.214278053, + "learning_rate": 0.00028244718887333387, + "loss": 7.414, + "step": 11690 + }, + { + "epoch": 1.090883642810488, + "grad_norm": 2.0686688179206953, + "learning_rate": 0.0002824436382022061, + "loss": 7.6253, + "step": 11691 + }, + { + "epoch": 1.0909769525053652, + "grad_norm": 1882460.3386743674, + "learning_rate": 0.0002824400871943134, + "loss": 7.3065, + "step": 11692 + }, + { + "epoch": 1.0910702622002426, + "grad_norm": 1.6657723074936646, + "learning_rate": 0.0002824365358496649, + "loss": 7.4723, + "step": 11693 + }, + { + "epoch": 1.0911635718951198, + "grad_norm": 0.3455595264906353, + "learning_rate": 0.0002824329841682694, + "loss": 7.5036, + "step": 11694 + }, + { + "epoch": 1.0912568815899972, + "grad_norm": 633808.0740027981, + "learning_rate": 0.00028242943215013605, + "loss": 7.1858, + "step": 11695 + }, + { + "epoch": 1.0913501912848744, + "grad_norm": 1.1864863382009836, + "learning_rate": 0.0002824258797952739, + "loss": 7.3793, + "step": 11696 + }, + { + "epoch": 1.0914435009797518, + "grad_norm": 1.3289130522012296, + "learning_rate": 0.000282422327103692, + "loss": 7.2265, + "step": 11697 + }, + { + "epoch": 1.091536810674629, + "grad_norm": 1.190907211862365, + "learning_rate": 0.00028241877407539937, + "loss": 7.2642, + "step": 11698 + }, + { + "epoch": 1.0916301203695065, + "grad_norm": 0.8988336582384737, + "learning_rate": 0.000282415220710405, + "loss": 7.4281, + "step": 11699 + }, + { + "epoch": 1.0917234300643837, + "grad_norm": 0.4398455985023934, + "learning_rate": 0.0002824116670087179, + "loss": 7.3471, + "step": 11700 + }, + { + "epoch": 1.091816739759261, + "grad_norm": 0.29323212173648083, + "learning_rate": 0.00028240811297034716, + "loss": 7.1839, + "step": 11701 + }, + { + "epoch": 1.0919100494541383, + "grad_norm": 1.366662851957211, + "learning_rate": 0.00028240455859530184, + "loss": 7.6869, + "step": 11702 + }, + { + "epoch": 1.0920033591490155, + "grad_norm": 1.0695599989353988, + "learning_rate": 0.0002824010038835909, + "loss": 7.2887, + "step": 11703 + }, + { + "epoch": 1.092096668843893, + "grad_norm": 0.4535551865223657, + "learning_rate": 0.0002823974488352235, + "loss": 7.2389, + "step": 11704 + }, + { + "epoch": 1.09218997853877, + "grad_norm": 0.8146714867045628, + "learning_rate": 0.0002823938934502086, + "loss": 7.3082, + "step": 11705 + }, + { + "epoch": 1.0922832882336475, + "grad_norm": 1.3229012553534947, + "learning_rate": 0.00028239033772855523, + "loss": 7.6818, + "step": 11706 + }, + { + "epoch": 1.0923765979285247, + "grad_norm": 0.3206448918471898, + "learning_rate": 0.0002823867816702724, + "loss": 7.2544, + "step": 11707 + }, + { + "epoch": 1.0924699076234021, + "grad_norm": 0.4151803700876731, + "learning_rate": 0.00028238322527536925, + "loss": 7.5395, + "step": 11708 + }, + { + "epoch": 1.0925632173182793, + "grad_norm": 1.0188098496621838, + "learning_rate": 0.0002823796685438548, + "loss": 7.2748, + "step": 11709 + }, + { + "epoch": 1.0926565270131567, + "grad_norm": 0.5424060741017003, + "learning_rate": 0.000282376111475738, + "loss": 7.4597, + "step": 11710 + }, + { + "epoch": 1.092749836708034, + "grad_norm": 1.5214327826977478, + "learning_rate": 0.00028237255407102796, + "loss": 6.9857, + "step": 11711 + }, + { + "epoch": 1.0928431464029114, + "grad_norm": 0.45796979873501875, + "learning_rate": 0.0002823689963297338, + "loss": 7.6138, + "step": 11712 + }, + { + "epoch": 1.0929364560977886, + "grad_norm": 3794358.1008764035, + "learning_rate": 0.00028236543825186446, + "loss": 7.0849, + "step": 11713 + }, + { + "epoch": 1.0930297657926658, + "grad_norm": 1.0129961563298229, + "learning_rate": 0.000282361879837429, + "loss": 7.5352, + "step": 11714 + }, + { + "epoch": 1.0931230754875432, + "grad_norm": 0.8120117108982979, + "learning_rate": 0.0002823583210864365, + "loss": 7.5588, + "step": 11715 + }, + { + "epoch": 1.0932163851824204, + "grad_norm": 2100218.9807569273, + "learning_rate": 0.000282354761998896, + "loss": 7.1989, + "step": 11716 + }, + { + "epoch": 1.0933096948772978, + "grad_norm": 1.076066651947059, + "learning_rate": 0.00028235120257481653, + "loss": 7.5611, + "step": 11717 + }, + { + "epoch": 1.093403004572175, + "grad_norm": 0.46737682970807776, + "learning_rate": 0.0002823476428142072, + "loss": 7.397, + "step": 11718 + }, + { + "epoch": 1.0934963142670524, + "grad_norm": 1.8256005973701432, + "learning_rate": 0.000282344082717077, + "loss": 7.0422, + "step": 11719 + }, + { + "epoch": 1.0935896239619296, + "grad_norm": 0.532905208596371, + "learning_rate": 0.00028234052228343494, + "loss": 7.4832, + "step": 11720 + }, + { + "epoch": 1.093682933656807, + "grad_norm": 0.7660368122799668, + "learning_rate": 0.0002823369615132902, + "loss": 7.4469, + "step": 11721 + }, + { + "epoch": 1.0937762433516842, + "grad_norm": 0.6708112457531459, + "learning_rate": 0.0002823334004066518, + "loss": 7.2295, + "step": 11722 + }, + { + "epoch": 1.0938695530465616, + "grad_norm": 0.3501584682718951, + "learning_rate": 0.0002823298389635287, + "loss": 7.424, + "step": 11723 + }, + { + "epoch": 1.0939628627414388, + "grad_norm": 0.5371148902272671, + "learning_rate": 0.00028232627718393006, + "loss": 7.4651, + "step": 11724 + }, + { + "epoch": 1.094056172436316, + "grad_norm": 0.33038090516039326, + "learning_rate": 0.00028232271506786484, + "loss": 7.3468, + "step": 11725 + }, + { + "epoch": 1.0941494821311935, + "grad_norm": 0.6264017103527122, + "learning_rate": 0.00028231915261534214, + "loss": 7.3537, + "step": 11726 + }, + { + "epoch": 1.0942427918260706, + "grad_norm": 4364596.092121613, + "learning_rate": 0.0002823155898263711, + "loss": 7.4161, + "step": 11727 + }, + { + "epoch": 1.094336101520948, + "grad_norm": 0.3259881481207755, + "learning_rate": 0.00028231202670096073, + "loss": 7.4112, + "step": 11728 + }, + { + "epoch": 1.0944294112158253, + "grad_norm": 0.4124234801161287, + "learning_rate": 0.00028230846323911996, + "loss": 7.3316, + "step": 11729 + }, + { + "epoch": 1.0945227209107027, + "grad_norm": 0.5674648101200285, + "learning_rate": 0.000282304899440858, + "loss": 7.7009, + "step": 11730 + }, + { + "epoch": 1.0946160306055799, + "grad_norm": 0.41605431808734494, + "learning_rate": 0.00028230133530618396, + "loss": 7.6238, + "step": 11731 + }, + { + "epoch": 1.0947093403004573, + "grad_norm": 3304532.6839249176, + "learning_rate": 0.0002822977708351067, + "loss": 7.045, + "step": 11732 + }, + { + "epoch": 1.0948026499953345, + "grad_norm": 3490860.0820264868, + "learning_rate": 0.0002822942060276355, + "loss": 7.157, + "step": 11733 + }, + { + "epoch": 1.094895959690212, + "grad_norm": 0.804197720745047, + "learning_rate": 0.00028229064088377924, + "loss": 7.2627, + "step": 11734 + }, + { + "epoch": 1.0949892693850891, + "grad_norm": 0.5381908489638889, + "learning_rate": 0.0002822870754035471, + "loss": 7.5307, + "step": 11735 + }, + { + "epoch": 1.0950825790799663, + "grad_norm": 0.7307756533603049, + "learning_rate": 0.00028228350958694814, + "loss": 7.126, + "step": 11736 + }, + { + "epoch": 1.0951758887748437, + "grad_norm": 2.288946692810891, + "learning_rate": 0.00028227994343399133, + "loss": 7.4474, + "step": 11737 + }, + { + "epoch": 1.095269198469721, + "grad_norm": 1.0479321281298526, + "learning_rate": 0.0002822763769446859, + "loss": 7.0197, + "step": 11738 + }, + { + "epoch": 1.0953625081645983, + "grad_norm": 15955949.402614119, + "learning_rate": 0.0002822728101190407, + "loss": 7.5236, + "step": 11739 + }, + { + "epoch": 1.0954558178594755, + "grad_norm": 0.3959398889684834, + "learning_rate": 0.00028226924295706503, + "loss": 7.0871, + "step": 11740 + }, + { + "epoch": 1.095549127554353, + "grad_norm": 0.7605071844546715, + "learning_rate": 0.0002822656754587678, + "loss": 7.1759, + "step": 11741 + }, + { + "epoch": 1.0956424372492302, + "grad_norm": 0.8067572074575939, + "learning_rate": 0.0002822621076241582, + "loss": 7.3172, + "step": 11742 + }, + { + "epoch": 1.0957357469441076, + "grad_norm": 1.4102318640667966, + "learning_rate": 0.0002822585394532452, + "loss": 7.2207, + "step": 11743 + }, + { + "epoch": 1.0958290566389848, + "grad_norm": 0.947422430710515, + "learning_rate": 0.00028225497094603794, + "loss": 7.3792, + "step": 11744 + }, + { + "epoch": 1.0959223663338622, + "grad_norm": 0.7327814290148592, + "learning_rate": 0.00028225140210254545, + "loss": 7.3743, + "step": 11745 + }, + { + "epoch": 1.0960156760287394, + "grad_norm": 1.9340195889050285, + "learning_rate": 0.0002822478329227768, + "loss": 7.3963, + "step": 11746 + }, + { + "epoch": 1.0961089857236166, + "grad_norm": 0.5259376137588533, + "learning_rate": 0.0002822442634067411, + "loss": 7.1479, + "step": 11747 + }, + { + "epoch": 1.096202295418494, + "grad_norm": 0.48955931359063976, + "learning_rate": 0.0002822406935544474, + "loss": 7.1135, + "step": 11748 + }, + { + "epoch": 1.0962956051133712, + "grad_norm": 1.7353157457683959, + "learning_rate": 0.0002822371233659048, + "loss": 7.6777, + "step": 11749 + }, + { + "epoch": 1.0963889148082486, + "grad_norm": 27.269382421553455, + "learning_rate": 0.0002822335528411224, + "loss": 7.6541, + "step": 11750 + }, + { + "epoch": 1.0964822245031258, + "grad_norm": 5019880.100210852, + "learning_rate": 0.0002822299819801092, + "loss": 7.4925, + "step": 11751 + }, + { + "epoch": 1.0965755341980032, + "grad_norm": 2.7368658228335954, + "learning_rate": 0.0002822264107828743, + "loss": 7.5044, + "step": 11752 + }, + { + "epoch": 1.0966688438928804, + "grad_norm": 1.5253846189560085, + "learning_rate": 0.0002822228392494269, + "loss": 7.1519, + "step": 11753 + }, + { + "epoch": 1.0967621535877579, + "grad_norm": 10.35292413389061, + "learning_rate": 0.00028221926737977593, + "loss": 7.3826, + "step": 11754 + }, + { + "epoch": 1.096855463282635, + "grad_norm": 2.0364934230452527, + "learning_rate": 0.0002822156951739305, + "loss": 7.227, + "step": 11755 + }, + { + "epoch": 1.0969487729775125, + "grad_norm": 2922471.861954854, + "learning_rate": 0.00028221212263189975, + "loss": 7.4187, + "step": 11756 + }, + { + "epoch": 1.0970420826723897, + "grad_norm": 1.3103119464123956, + "learning_rate": 0.00028220854975369276, + "loss": 7.2587, + "step": 11757 + }, + { + "epoch": 1.0971353923672669, + "grad_norm": 2.163869871500414, + "learning_rate": 0.0002822049765393186, + "loss": 7.3206, + "step": 11758 + }, + { + "epoch": 1.0972287020621443, + "grad_norm": 7.7457614199401945, + "learning_rate": 0.00028220140298878627, + "loss": 7.4192, + "step": 11759 + }, + { + "epoch": 1.0973220117570215, + "grad_norm": 0.6416084953116236, + "learning_rate": 0.00028219782910210493, + "loss": 7.3269, + "step": 11760 + }, + { + "epoch": 1.097415321451899, + "grad_norm": 2972186.6538806115, + "learning_rate": 0.0002821942548792838, + "loss": 7.4028, + "step": 11761 + }, + { + "epoch": 1.097508631146776, + "grad_norm": 10.53207348241349, + "learning_rate": 0.00028219068032033173, + "loss": 7.1767, + "step": 11762 + }, + { + "epoch": 1.0976019408416535, + "grad_norm": 1.4482700695978212, + "learning_rate": 0.0002821871054252579, + "loss": 7.3227, + "step": 11763 + }, + { + "epoch": 1.0976952505365307, + "grad_norm": 1.6677951701959872, + "learning_rate": 0.0002821835301940715, + "loss": 7.4361, + "step": 11764 + }, + { + "epoch": 1.0977885602314081, + "grad_norm": 0.8135285766923069, + "learning_rate": 0.0002821799546267815, + "loss": 7.2763, + "step": 11765 + }, + { + "epoch": 1.0978818699262853, + "grad_norm": 1.0968387891941012, + "learning_rate": 0.000282176378723397, + "loss": 7.3146, + "step": 11766 + }, + { + "epoch": 1.0979751796211628, + "grad_norm": 1.1852168990102119, + "learning_rate": 0.0002821728024839272, + "loss": 7.3955, + "step": 11767 + }, + { + "epoch": 1.09806848931604, + "grad_norm": 0.5835284280842586, + "learning_rate": 0.00028216922590838106, + "loss": 7.1994, + "step": 11768 + }, + { + "epoch": 1.0981617990109172, + "grad_norm": 0.7218622615641171, + "learning_rate": 0.00028216564899676775, + "loss": 7.0832, + "step": 11769 + }, + { + "epoch": 1.0982551087057946, + "grad_norm": 0.7680515667047874, + "learning_rate": 0.00028216207174909633, + "loss": 7.1729, + "step": 11770 + }, + { + "epoch": 1.0983484184006718, + "grad_norm": 0.6378052032809342, + "learning_rate": 0.0002821584941653759, + "loss": 7.3266, + "step": 11771 + }, + { + "epoch": 1.0984417280955492, + "grad_norm": 62735.69560838258, + "learning_rate": 0.00028215491624561563, + "loss": 7.5839, + "step": 11772 + }, + { + "epoch": 1.0985350377904264, + "grad_norm": 0.6072464767266009, + "learning_rate": 0.00028215133798982446, + "loss": 7.5737, + "step": 11773 + }, + { + "epoch": 1.0986283474853038, + "grad_norm": 0.5353719148184137, + "learning_rate": 0.00028214775939801166, + "loss": 7.2633, + "step": 11774 + }, + { + "epoch": 1.098721657180181, + "grad_norm": 1.0187404197832386, + "learning_rate": 0.00028214418047018623, + "loss": 7.6607, + "step": 11775 + }, + { + "epoch": 1.0988149668750584, + "grad_norm": 0.6690950518905547, + "learning_rate": 0.0002821406012063573, + "loss": 7.0139, + "step": 11776 + }, + { + "epoch": 1.0989082765699356, + "grad_norm": 0.665029023421294, + "learning_rate": 0.0002821370216065339, + "loss": 6.922, + "step": 11777 + }, + { + "epoch": 1.0990015862648128, + "grad_norm": 2.236763194685972, + "learning_rate": 0.0002821334416707253, + "loss": 7.4071, + "step": 11778 + }, + { + "epoch": 1.0990948959596902, + "grad_norm": 1.3255494218708062, + "learning_rate": 0.0002821298613989404, + "loss": 7.1803, + "step": 11779 + }, + { + "epoch": 1.0991882056545674, + "grad_norm": 0.5312022691331717, + "learning_rate": 0.0002821262807911885, + "loss": 7.4419, + "step": 11780 + }, + { + "epoch": 1.0992815153494448, + "grad_norm": 0.7633052943025208, + "learning_rate": 0.0002821226998474785, + "loss": 7.6271, + "step": 11781 + }, + { + "epoch": 1.099374825044322, + "grad_norm": 0.5849818305819436, + "learning_rate": 0.0002821191185678197, + "loss": 7.5769, + "step": 11782 + }, + { + "epoch": 1.0994681347391995, + "grad_norm": 0.8812324313404798, + "learning_rate": 0.00028211553695222107, + "loss": 7.1472, + "step": 11783 + }, + { + "epoch": 1.0995614444340767, + "grad_norm": 0.5364890287402249, + "learning_rate": 0.0002821119550006918, + "loss": 7.3886, + "step": 11784 + }, + { + "epoch": 1.099654754128954, + "grad_norm": 0.9297082428064636, + "learning_rate": 0.00028210837271324096, + "loss": 7.3082, + "step": 11785 + }, + { + "epoch": 1.0997480638238313, + "grad_norm": 0.8832767285580024, + "learning_rate": 0.0002821047900898776, + "loss": 7.1782, + "step": 11786 + }, + { + "epoch": 1.0998413735187085, + "grad_norm": 0.42790631397325685, + "learning_rate": 0.00028210120713061094, + "loss": 7.1851, + "step": 11787 + }, + { + "epoch": 1.099934683213586, + "grad_norm": 1.3331982766663768, + "learning_rate": 0.00028209762383545, + "loss": 7.4514, + "step": 11788 + }, + { + "epoch": 1.100027992908463, + "grad_norm": 3.090506807996481, + "learning_rate": 0.00028209404020440403, + "loss": 7.2847, + "step": 11789 + }, + { + "epoch": 1.1001213026033405, + "grad_norm": 1.2438458308582105, + "learning_rate": 0.00028209045623748196, + "loss": 7.6274, + "step": 11790 + }, + { + "epoch": 1.1002146122982177, + "grad_norm": 0.7034388893829282, + "learning_rate": 0.00028208687193469304, + "loss": 7.3468, + "step": 11791 + }, + { + "epoch": 1.1003079219930951, + "grad_norm": 0.4949368111408212, + "learning_rate": 0.0002820832872960463, + "loss": 7.0978, + "step": 11792 + }, + { + "epoch": 1.1004012316879723, + "grad_norm": 0.5762947311438635, + "learning_rate": 0.00028207970232155087, + "loss": 7.4316, + "step": 11793 + }, + { + "epoch": 1.1004945413828497, + "grad_norm": 0.6150419713126419, + "learning_rate": 0.0002820761170112159, + "loss": 7.1739, + "step": 11794 + }, + { + "epoch": 1.100587851077727, + "grad_norm": 16548331.945359293, + "learning_rate": 0.0002820725313650505, + "loss": 7.2696, + "step": 11795 + }, + { + "epoch": 1.1006811607726044, + "grad_norm": 0.5007567821078579, + "learning_rate": 0.0002820689453830638, + "loss": 7.2812, + "step": 11796 + }, + { + "epoch": 1.1007744704674816, + "grad_norm": 1.5435358019804004, + "learning_rate": 0.00028206535906526484, + "loss": 7.3223, + "step": 11797 + }, + { + "epoch": 1.1008677801623588, + "grad_norm": 0.601954683074911, + "learning_rate": 0.0002820617724116628, + "loss": 7.0733, + "step": 11798 + }, + { + "epoch": 1.1009610898572362, + "grad_norm": 1.3742791442300095, + "learning_rate": 0.0002820581854222668, + "loss": 7.7651, + "step": 11799 + }, + { + "epoch": 1.1010543995521134, + "grad_norm": 0.760997942487573, + "learning_rate": 0.00028205459809708597, + "loss": 7.1538, + "step": 11800 + }, + { + "epoch": 1.1011477092469908, + "grad_norm": 0.4518675404485758, + "learning_rate": 0.0002820510104361294, + "loss": 7.2435, + "step": 11801 + }, + { + "epoch": 1.101241018941868, + "grad_norm": 1.242562665595598, + "learning_rate": 0.00028204742243940627, + "loss": 7.6158, + "step": 11802 + }, + { + "epoch": 1.1013343286367454, + "grad_norm": 0.5639271367381569, + "learning_rate": 0.00028204383410692556, + "loss": 7.6743, + "step": 11803 + }, + { + "epoch": 1.1014276383316226, + "grad_norm": 0.8546394052887674, + "learning_rate": 0.0002820402454386966, + "loss": 7.4542, + "step": 11804 + }, + { + "epoch": 1.1015209480265, + "grad_norm": 1.5515043366724868, + "learning_rate": 0.00028203665643472835, + "loss": 7.1469, + "step": 11805 + }, + { + "epoch": 1.1016142577213772, + "grad_norm": 0.8882068434592509, + "learning_rate": 0.00028203306709503, + "loss": 7.2766, + "step": 11806 + }, + { + "epoch": 1.1017075674162546, + "grad_norm": 757711.5781185642, + "learning_rate": 0.0002820294774196107, + "loss": 7.1345, + "step": 11807 + }, + { + "epoch": 1.1018008771111318, + "grad_norm": 0.6894408275529073, + "learning_rate": 0.0002820258874084795, + "loss": 7.4399, + "step": 11808 + }, + { + "epoch": 1.101894186806009, + "grad_norm": 2.0111249099200816, + "learning_rate": 0.0002820222970616456, + "loss": 7.8491, + "step": 11809 + }, + { + "epoch": 1.1019874965008865, + "grad_norm": 7744327.076737431, + "learning_rate": 0.00028201870637911816, + "loss": 7.3227, + "step": 11810 + }, + { + "epoch": 1.1020808061957637, + "grad_norm": 0.7144677953925475, + "learning_rate": 0.0002820151153609062, + "loss": 7.536, + "step": 11811 + }, + { + "epoch": 1.102174115890641, + "grad_norm": 0.6838391330716068, + "learning_rate": 0.00028201152400701885, + "loss": 7.303, + "step": 11812 + }, + { + "epoch": 1.1022674255855183, + "grad_norm": 0.4108841331138272, + "learning_rate": 0.0002820079323174654, + "loss": 7.4088, + "step": 11813 + }, + { + "epoch": 1.1023607352803957, + "grad_norm": 0.5842168316173939, + "learning_rate": 0.00028200434029225486, + "loss": 7.3681, + "step": 11814 + }, + { + "epoch": 1.1024540449752729, + "grad_norm": 0.40268442687252565, + "learning_rate": 0.00028200074793139637, + "loss": 7.7252, + "step": 11815 + }, + { + "epoch": 1.1025473546701503, + "grad_norm": 1.0107449034616656, + "learning_rate": 0.0002819971552348991, + "loss": 7.1953, + "step": 11816 + }, + { + "epoch": 1.1026406643650275, + "grad_norm": 0.38483170090305413, + "learning_rate": 0.0002819935622027721, + "loss": 7.5606, + "step": 11817 + }, + { + "epoch": 1.102733974059905, + "grad_norm": 0.8181904409320836, + "learning_rate": 0.00028198996883502464, + "loss": 7.0352, + "step": 11818 + }, + { + "epoch": 1.1028272837547821, + "grad_norm": 0.5060048450842146, + "learning_rate": 0.00028198637513166577, + "loss": 7.2011, + "step": 11819 + }, + { + "epoch": 1.1029205934496593, + "grad_norm": 0.3834550119381296, + "learning_rate": 0.00028198278109270463, + "loss": 7.3266, + "step": 11820 + }, + { + "epoch": 1.1030139031445367, + "grad_norm": 11201903.2959106, + "learning_rate": 0.0002819791867181504, + "loss": 7.4227, + "step": 11821 + }, + { + "epoch": 1.103107212839414, + "grad_norm": 0.7213023470933029, + "learning_rate": 0.0002819755920080122, + "loss": 7.2442, + "step": 11822 + }, + { + "epoch": 1.1032005225342914, + "grad_norm": 0.6842628707559485, + "learning_rate": 0.00028197199696229907, + "loss": 7.3597, + "step": 11823 + }, + { + "epoch": 1.1032938322291685, + "grad_norm": 1.6753641043586918, + "learning_rate": 0.00028196840158102034, + "loss": 7.5335, + "step": 11824 + }, + { + "epoch": 1.103387141924046, + "grad_norm": 0.977226192193109, + "learning_rate": 0.00028196480586418506, + "loss": 7.2298, + "step": 11825 + }, + { + "epoch": 1.1034804516189232, + "grad_norm": 97636998.74352242, + "learning_rate": 0.0002819612098118023, + "loss": 7.6469, + "step": 11826 + }, + { + "epoch": 1.1035737613138006, + "grad_norm": 1.5081520725064221, + "learning_rate": 0.00028195761342388135, + "loss": 7.3004, + "step": 11827 + }, + { + "epoch": 1.1036670710086778, + "grad_norm": 7.497979843176153, + "learning_rate": 0.00028195401670043124, + "loss": 7.2998, + "step": 11828 + }, + { + "epoch": 1.1037603807035552, + "grad_norm": 5.279893049712221, + "learning_rate": 0.00028195041964146117, + "loss": 7.4012, + "step": 11829 + }, + { + "epoch": 1.1038536903984324, + "grad_norm": 2.3640366623804314, + "learning_rate": 0.00028194682224698023, + "loss": 8.0065, + "step": 11830 + }, + { + "epoch": 1.1039470000933096, + "grad_norm": 2.2075073722888248, + "learning_rate": 0.00028194322451699765, + "loss": 7.8999, + "step": 11831 + }, + { + "epoch": 1.104040309788187, + "grad_norm": 72.92753103710729, + "learning_rate": 0.0002819396264515225, + "loss": 8.3555, + "step": 11832 + }, + { + "epoch": 1.1041336194830642, + "grad_norm": 7.23361557127511e+16, + "learning_rate": 0.000281936028050564, + "loss": 8.3606, + "step": 11833 + }, + { + "epoch": 1.1042269291779416, + "grad_norm": 0.921063397284451, + "learning_rate": 0.0002819324293141312, + "loss": 7.3817, + "step": 11834 + }, + { + "epoch": 1.1043202388728188, + "grad_norm": 0.7198896048574891, + "learning_rate": 0.0002819288302422334, + "loss": 7.2554, + "step": 11835 + }, + { + "epoch": 1.1044135485676962, + "grad_norm": 0.5097145279299029, + "learning_rate": 0.0002819252308348796, + "loss": 7.6567, + "step": 11836 + }, + { + "epoch": 1.1045068582625734, + "grad_norm": 0.6042700930562643, + "learning_rate": 0.000281921631092079, + "loss": 7.4413, + "step": 11837 + }, + { + "epoch": 1.1046001679574509, + "grad_norm": 0.48298253528770696, + "learning_rate": 0.00028191803101384084, + "loss": 7.5782, + "step": 11838 + }, + { + "epoch": 1.104693477652328, + "grad_norm": 41830415.30666542, + "learning_rate": 0.0002819144306001742, + "loss": 7.1048, + "step": 11839 + }, + { + "epoch": 1.1047867873472055, + "grad_norm": 1.0660414241137113, + "learning_rate": 0.0002819108298510882, + "loss": 7.298, + "step": 11840 + }, + { + "epoch": 1.1048800970420827, + "grad_norm": 0.48333452025891904, + "learning_rate": 0.000281907228766592, + "loss": 7.3875, + "step": 11841 + }, + { + "epoch": 1.1049734067369599, + "grad_norm": 18.61252885104377, + "learning_rate": 0.0002819036273466949, + "loss": 7.5848, + "step": 11842 + }, + { + "epoch": 1.1050667164318373, + "grad_norm": 6.350711421057989, + "learning_rate": 0.00028190002559140586, + "loss": 7.4439, + "step": 11843 + }, + { + "epoch": 1.1051600261267145, + "grad_norm": 7.620056782624381, + "learning_rate": 0.00028189642350073417, + "loss": 7.6853, + "step": 11844 + }, + { + "epoch": 1.105253335821592, + "grad_norm": 1.5048084576356398, + "learning_rate": 0.0002818928210746889, + "loss": 7.4333, + "step": 11845 + }, + { + "epoch": 1.105346645516469, + "grad_norm": 94.13819271925546, + "learning_rate": 0.0002818892183132793, + "loss": 7.608, + "step": 11846 + }, + { + "epoch": 1.1054399552113465, + "grad_norm": 2.0185126291076347, + "learning_rate": 0.00028188561521651445, + "loss": 7.354, + "step": 11847 + }, + { + "epoch": 1.1055332649062237, + "grad_norm": 20.904717697393977, + "learning_rate": 0.0002818820117844036, + "loss": 7.4475, + "step": 11848 + }, + { + "epoch": 1.1056265746011011, + "grad_norm": 5061566935116603.0, + "learning_rate": 0.00028187840801695583, + "loss": 7.1139, + "step": 11849 + }, + { + "epoch": 1.1057198842959783, + "grad_norm": 5.938812007216753, + "learning_rate": 0.0002818748039141803, + "loss": 7.1922, + "step": 11850 + }, + { + "epoch": 1.1058131939908558, + "grad_norm": 2.147086464453925, + "learning_rate": 0.00028187119947608626, + "loss": 7.6, + "step": 11851 + }, + { + "epoch": 1.105906503685733, + "grad_norm": 91.343482459616, + "learning_rate": 0.00028186759470268276, + "loss": 7.8485, + "step": 11852 + }, + { + "epoch": 1.1059998133806102, + "grad_norm": 3.2305147833935655, + "learning_rate": 0.00028186398959397906, + "loss": 7.4353, + "step": 11853 + }, + { + "epoch": 1.1060931230754876, + "grad_norm": 1.7675319797709146e+18, + "learning_rate": 0.0002818603841499843, + "loss": 7.5368, + "step": 11854 + }, + { + "epoch": 1.1061864327703648, + "grad_norm": 1.6020140859111518, + "learning_rate": 0.0002818567783707077, + "loss": 7.3114, + "step": 11855 + }, + { + "epoch": 1.1062797424652422, + "grad_norm": 1.8833462313481295, + "learning_rate": 0.0002818531722561583, + "loss": 7.6246, + "step": 11856 + }, + { + "epoch": 1.1063730521601194, + "grad_norm": 2.3141950512857457, + "learning_rate": 0.0002818495658063454, + "loss": 7.1566, + "step": 11857 + }, + { + "epoch": 1.1064663618549968, + "grad_norm": 1.4427444367420883, + "learning_rate": 0.00028184595902127805, + "loss": 7.3051, + "step": 11858 + }, + { + "epoch": 1.106559671549874, + "grad_norm": 344707.3417906869, + "learning_rate": 0.0002818423519009655, + "loss": 7.7386, + "step": 11859 + }, + { + "epoch": 1.1066529812447514, + "grad_norm": 0.9729305198226321, + "learning_rate": 0.00028183874444541695, + "loss": 7.4818, + "step": 11860 + }, + { + "epoch": 1.1067462909396286, + "grad_norm": 0.4715060517917214, + "learning_rate": 0.00028183513665464146, + "loss": 7.3914, + "step": 11861 + }, + { + "epoch": 1.106839600634506, + "grad_norm": 0.5369383675785695, + "learning_rate": 0.0002818315285286483, + "loss": 7.0911, + "step": 11862 + }, + { + "epoch": 1.1069329103293832, + "grad_norm": 2.264998710279352, + "learning_rate": 0.0002818279200674466, + "loss": 7.8831, + "step": 11863 + }, + { + "epoch": 1.1070262200242604, + "grad_norm": 2.38376958003357, + "learning_rate": 0.0002818243112710455, + "loss": 7.6994, + "step": 11864 + }, + { + "epoch": 1.1071195297191379, + "grad_norm": 250.48566095220164, + "learning_rate": 0.00028182070213945433, + "loss": 7.5738, + "step": 11865 + }, + { + "epoch": 1.107212839414015, + "grad_norm": 5142.341347534478, + "learning_rate": 0.0002818170926726821, + "loss": 7.3495, + "step": 11866 + }, + { + "epoch": 1.1073061491088925, + "grad_norm": 7709.671590534539, + "learning_rate": 0.0002818134828707381, + "loss": 7.3023, + "step": 11867 + }, + { + "epoch": 1.1073994588037697, + "grad_norm": 142.66815784706694, + "learning_rate": 0.0002818098727336314, + "loss": 7.3698, + "step": 11868 + }, + { + "epoch": 1.107492768498647, + "grad_norm": 1.109471839188316, + "learning_rate": 0.0002818062622613713, + "loss": 7.3729, + "step": 11869 + }, + { + "epoch": 1.1075860781935243, + "grad_norm": 1.336494596813412, + "learning_rate": 0.0002818026514539669, + "loss": 7.1608, + "step": 11870 + }, + { + "epoch": 1.1076793878884017, + "grad_norm": 0.8315224427636734, + "learning_rate": 0.0002817990403114274, + "loss": 7.5711, + "step": 11871 + }, + { + "epoch": 1.107772697583279, + "grad_norm": 1.0458669287888624, + "learning_rate": 0.0002817954288337619, + "loss": 7.2105, + "step": 11872 + }, + { + "epoch": 1.1078660072781563, + "grad_norm": 1852.6871269245, + "learning_rate": 0.0002817918170209797, + "loss": 7.4171, + "step": 11873 + }, + { + "epoch": 1.1079593169730335, + "grad_norm": 0.8580522383679206, + "learning_rate": 0.00028178820487309, + "loss": 6.9689, + "step": 11874 + }, + { + "epoch": 1.1080526266679107, + "grad_norm": 0.7253086394389685, + "learning_rate": 0.00028178459239010196, + "loss": 7.2672, + "step": 11875 + }, + { + "epoch": 1.1081459363627881, + "grad_norm": 1.0324339102637963, + "learning_rate": 0.00028178097957202465, + "loss": 7.2588, + "step": 11876 + }, + { + "epoch": 1.1082392460576653, + "grad_norm": 1.2134509466460943, + "learning_rate": 0.0002817773664188674, + "loss": 7.4572, + "step": 11877 + }, + { + "epoch": 1.1083325557525427, + "grad_norm": 0.6345050789204888, + "learning_rate": 0.00028177375293063936, + "loss": 7.3113, + "step": 11878 + }, + { + "epoch": 1.10842586544742, + "grad_norm": 0.5968638153516193, + "learning_rate": 0.00028177013910734964, + "loss": 7.2891, + "step": 11879 + }, + { + "epoch": 1.1085191751422974, + "grad_norm": 0.9910263222583351, + "learning_rate": 0.0002817665249490076, + "loss": 7.5961, + "step": 11880 + }, + { + "epoch": 1.1086124848371746, + "grad_norm": 0.6763790172102432, + "learning_rate": 0.00028176291045562215, + "loss": 7.5726, + "step": 11881 + }, + { + "epoch": 1.108705794532052, + "grad_norm": 1.344169743405826, + "learning_rate": 0.00028175929562720275, + "loss": 7.3528, + "step": 11882 + }, + { + "epoch": 1.1087991042269292, + "grad_norm": 0.7924445218367816, + "learning_rate": 0.0002817556804637585, + "loss": 7.4797, + "step": 11883 + }, + { + "epoch": 1.1088924139218064, + "grad_norm": 0.8067284095234147, + "learning_rate": 0.0002817520649652985, + "loss": 7.3949, + "step": 11884 + }, + { + "epoch": 1.1089857236166838, + "grad_norm": 0.6693986791817206, + "learning_rate": 0.00028174844913183216, + "loss": 7.4232, + "step": 11885 + }, + { + "epoch": 1.109079033311561, + "grad_norm": 19.087095511847515, + "learning_rate": 0.0002817448329633685, + "loss": 7.3107, + "step": 11886 + }, + { + "epoch": 1.1091723430064384, + "grad_norm": 0.7535255397528341, + "learning_rate": 0.0002817412164599167, + "loss": 7.2011, + "step": 11887 + }, + { + "epoch": 1.1092656527013156, + "grad_norm": 1.1575314159046746, + "learning_rate": 0.00028173759962148604, + "loss": 7.5151, + "step": 11888 + }, + { + "epoch": 1.109358962396193, + "grad_norm": 1.0742604764780823, + "learning_rate": 0.0002817339824480857, + "loss": 7.5433, + "step": 11889 + }, + { + "epoch": 1.1094522720910702, + "grad_norm": 0.9292127406654297, + "learning_rate": 0.0002817303649397248, + "loss": 7.3807, + "step": 11890 + }, + { + "epoch": 1.1095455817859476, + "grad_norm": 0.5549803860121638, + "learning_rate": 0.0002817267470964127, + "loss": 7.3492, + "step": 11891 + }, + { + "epoch": 1.1096388914808248, + "grad_norm": 228.30367980066478, + "learning_rate": 0.00028172312891815843, + "loss": 7.0861, + "step": 11892 + }, + { + "epoch": 1.109732201175702, + "grad_norm": 1.195838467158462, + "learning_rate": 0.0002817195104049713, + "loss": 7.4835, + "step": 11893 + }, + { + "epoch": 1.1098255108705795, + "grad_norm": 0.654845950267625, + "learning_rate": 0.00028171589155686044, + "loss": 7.2279, + "step": 11894 + }, + { + "epoch": 1.1099188205654567, + "grad_norm": 0.5550718174639383, + "learning_rate": 0.0002817122723738351, + "loss": 7.368, + "step": 11895 + }, + { + "epoch": 1.110012130260334, + "grad_norm": 0.651902576754259, + "learning_rate": 0.0002817086528559045, + "loss": 7.4884, + "step": 11896 + }, + { + "epoch": 1.1101054399552113, + "grad_norm": 0.45688425274481437, + "learning_rate": 0.0002817050330030778, + "loss": 7.1504, + "step": 11897 + }, + { + "epoch": 1.1101987496500887, + "grad_norm": 0.9305530589419764, + "learning_rate": 0.0002817014128153642, + "loss": 7.3789, + "step": 11898 + }, + { + "epoch": 1.1102920593449659, + "grad_norm": 78.41926063823921, + "learning_rate": 0.0002816977922927729, + "loss": 7.4805, + "step": 11899 + }, + { + "epoch": 1.1103853690398433, + "grad_norm": 0.7271594782926445, + "learning_rate": 0.0002816941714353132, + "loss": 7.3889, + "step": 11900 + }, + { + "epoch": 1.1104786787347205, + "grad_norm": 0.9987879675147376, + "learning_rate": 0.00028169055024299415, + "loss": 7.259, + "step": 11901 + }, + { + "epoch": 1.110571988429598, + "grad_norm": 28.1956776631296, + "learning_rate": 0.00028168692871582507, + "loss": 7.4737, + "step": 11902 + }, + { + "epoch": 1.1106652981244751, + "grad_norm": 223.04033645818166, + "learning_rate": 0.0002816833068538151, + "loss": 7.45, + "step": 11903 + }, + { + "epoch": 1.1107586078193523, + "grad_norm": 0.5988158807463204, + "learning_rate": 0.0002816796846569736, + "loss": 7.4829, + "step": 11904 + }, + { + "epoch": 1.1108519175142297, + "grad_norm": 0.46047977660092526, + "learning_rate": 0.0002816760621253095, + "loss": 7.1561, + "step": 11905 + }, + { + "epoch": 1.110945227209107, + "grad_norm": 0.7458415973948209, + "learning_rate": 0.00028167243925883233, + "loss": 7.2379, + "step": 11906 + }, + { + "epoch": 1.1110385369039844, + "grad_norm": 1.4689812900746648, + "learning_rate": 0.00028166881605755107, + "loss": 7.588, + "step": 11907 + }, + { + "epoch": 1.1111318465988616, + "grad_norm": 0.4361139035706718, + "learning_rate": 0.00028166519252147506, + "loss": 7.2158, + "step": 11908 + }, + { + "epoch": 1.111225156293739, + "grad_norm": 0.5064452664970626, + "learning_rate": 0.0002816615686506134, + "loss": 7.3498, + "step": 11909 + }, + { + "epoch": 1.1113184659886162, + "grad_norm": 0.363880674809077, + "learning_rate": 0.00028165794444497545, + "loss": 7.513, + "step": 11910 + }, + { + "epoch": 1.1114117756834936, + "grad_norm": 0.6819297658274254, + "learning_rate": 0.00028165431990457033, + "loss": 7.5825, + "step": 11911 + }, + { + "epoch": 1.1115050853783708, + "grad_norm": 0.8725231026604113, + "learning_rate": 0.0002816506950294073, + "loss": 7.3061, + "step": 11912 + }, + { + "epoch": 1.1115983950732482, + "grad_norm": 22.413479954148336, + "learning_rate": 0.00028164706981949545, + "loss": 7.5333, + "step": 11913 + }, + { + "epoch": 1.1116917047681254, + "grad_norm": 0.796972397902011, + "learning_rate": 0.0002816434442748442, + "loss": 7.3456, + "step": 11914 + }, + { + "epoch": 1.1117850144630026, + "grad_norm": 1.3787936159206147, + "learning_rate": 0.0002816398183954626, + "loss": 7.058, + "step": 11915 + }, + { + "epoch": 1.11187832415788, + "grad_norm": 20.765928854073124, + "learning_rate": 0.00028163619218135995, + "loss": 7.3415, + "step": 11916 + }, + { + "epoch": 1.1119716338527572, + "grad_norm": 15.705449528941395, + "learning_rate": 0.0002816325656325455, + "loss": 7.4831, + "step": 11917 + }, + { + "epoch": 1.1120649435476346, + "grad_norm": 1.3513587653787842, + "learning_rate": 0.00028162893874902835, + "loss": 7.3896, + "step": 11918 + }, + { + "epoch": 1.1121582532425118, + "grad_norm": 8.100805318168876, + "learning_rate": 0.0002816253115308179, + "loss": 7.4878, + "step": 11919 + }, + { + "epoch": 1.1122515629373892, + "grad_norm": 1.3345705645848702, + "learning_rate": 0.00028162168397792315, + "loss": 7.5438, + "step": 11920 + }, + { + "epoch": 1.1123448726322664, + "grad_norm": 0.6447982718851302, + "learning_rate": 0.0002816180560903535, + "loss": 7.2833, + "step": 11921 + }, + { + "epoch": 1.1124381823271439, + "grad_norm": 0.6466479122486888, + "learning_rate": 0.0002816144278681182, + "loss": 7.3076, + "step": 11922 + }, + { + "epoch": 1.112531492022021, + "grad_norm": 0.4543788578307695, + "learning_rate": 0.0002816107993112263, + "loss": 7.3091, + "step": 11923 + }, + { + "epoch": 1.1126248017168985, + "grad_norm": 34.48826405309278, + "learning_rate": 0.0002816071704196872, + "loss": 7.6103, + "step": 11924 + }, + { + "epoch": 1.1127181114117757, + "grad_norm": 1.0292762393916186, + "learning_rate": 0.00028160354119351, + "loss": 7.1644, + "step": 11925 + }, + { + "epoch": 1.1128114211066529, + "grad_norm": 31.345032212627725, + "learning_rate": 0.000281599911632704, + "loss": 7.1582, + "step": 11926 + }, + { + "epoch": 1.1129047308015303, + "grad_norm": 1.035502171351819, + "learning_rate": 0.00028159628173727837, + "loss": 7.1106, + "step": 11927 + }, + { + "epoch": 1.1129980404964075, + "grad_norm": 0.7847128941779318, + "learning_rate": 0.00028159265150724247, + "loss": 7.5369, + "step": 11928 + }, + { + "epoch": 1.113091350191285, + "grad_norm": 0.6960445620068187, + "learning_rate": 0.0002815890209426053, + "loss": 7.5795, + "step": 11929 + }, + { + "epoch": 1.113184659886162, + "grad_norm": 0.5373966995107257, + "learning_rate": 0.0002815853900433763, + "loss": 7.1041, + "step": 11930 + }, + { + "epoch": 1.1132779695810395, + "grad_norm": 0.9291377925792462, + "learning_rate": 0.00028158175880956465, + "loss": 7.3943, + "step": 11931 + }, + { + "epoch": 1.1133712792759167, + "grad_norm": 63.8328341645541, + "learning_rate": 0.0002815781272411795, + "loss": 7.5087, + "step": 11932 + }, + { + "epoch": 1.1134645889707941, + "grad_norm": 80.8778264339381, + "learning_rate": 0.0002815744953382302, + "loss": 7.398, + "step": 11933 + }, + { + "epoch": 1.1135578986656713, + "grad_norm": 106.66850914883864, + "learning_rate": 0.00028157086310072596, + "loss": 7.3096, + "step": 11934 + }, + { + "epoch": 1.1136512083605488, + "grad_norm": 0.9989907854361512, + "learning_rate": 0.000281567230528676, + "loss": 7.5333, + "step": 11935 + }, + { + "epoch": 1.113744518055426, + "grad_norm": 1.0589808053543204, + "learning_rate": 0.00028156359762208943, + "loss": 7.2973, + "step": 11936 + }, + { + "epoch": 1.1138378277503032, + "grad_norm": 2.6652718582589525, + "learning_rate": 0.0002815599643809757, + "loss": 7.6432, + "step": 11937 + }, + { + "epoch": 1.1139311374451806, + "grad_norm": 2.0205695093617884, + "learning_rate": 0.0002815563308053439, + "loss": 7.6497, + "step": 11938 + }, + { + "epoch": 1.1140244471400578, + "grad_norm": 1.0468835916001205, + "learning_rate": 0.00028155269689520335, + "loss": 7.3003, + "step": 11939 + }, + { + "epoch": 1.1141177568349352, + "grad_norm": 2.0214792259192405, + "learning_rate": 0.00028154906265056326, + "loss": 7.1281, + "step": 11940 + }, + { + "epoch": 1.1142110665298124, + "grad_norm": 1.8357666243059891, + "learning_rate": 0.0002815454280714329, + "loss": 7.3246, + "step": 11941 + }, + { + "epoch": 1.1143043762246898, + "grad_norm": 0.8612385683107077, + "learning_rate": 0.00028154179315782144, + "loss": 7.4486, + "step": 11942 + }, + { + "epoch": 1.114397685919567, + "grad_norm": 109.32010231400189, + "learning_rate": 0.0002815381579097382, + "loss": 7.4623, + "step": 11943 + }, + { + "epoch": 1.1144909956144444, + "grad_norm": 0.8189663517182633, + "learning_rate": 0.00028153452232719235, + "loss": 7.4463, + "step": 11944 + }, + { + "epoch": 1.1145843053093216, + "grad_norm": 0.9525244931333597, + "learning_rate": 0.0002815308864101932, + "loss": 7.1309, + "step": 11945 + }, + { + "epoch": 1.114677615004199, + "grad_norm": 623.5025345077657, + "learning_rate": 0.00028152725015874995, + "loss": 7.3008, + "step": 11946 + }, + { + "epoch": 1.1147709246990762, + "grad_norm": 0.917084960183923, + "learning_rate": 0.00028152361357287187, + "loss": 6.9677, + "step": 11947 + }, + { + "epoch": 1.1148642343939534, + "grad_norm": 1.553129741886875, + "learning_rate": 0.0002815199766525682, + "loss": 7.4106, + "step": 11948 + }, + { + "epoch": 1.1149575440888309, + "grad_norm": 0.6585426775732212, + "learning_rate": 0.0002815163393978482, + "loss": 7.5474, + "step": 11949 + }, + { + "epoch": 1.115050853783708, + "grad_norm": 0.5998887487165396, + "learning_rate": 0.0002815127018087211, + "loss": 7.5187, + "step": 11950 + }, + { + "epoch": 1.1151441634785855, + "grad_norm": 0.5304159324742777, + "learning_rate": 0.00028150906388519617, + "loss": 7.6474, + "step": 11951 + }, + { + "epoch": 1.1152374731734627, + "grad_norm": 0.6996980444103585, + "learning_rate": 0.00028150542562728264, + "loss": 7.4278, + "step": 11952 + }, + { + "epoch": 1.11533078286834, + "grad_norm": 2.8231050449114767, + "learning_rate": 0.0002815017870349897, + "loss": 7.1999, + "step": 11953 + }, + { + "epoch": 1.1154240925632173, + "grad_norm": 1.5788167678748648, + "learning_rate": 0.0002814981481083268, + "loss": 7.3953, + "step": 11954 + }, + { + "epoch": 1.1155174022580947, + "grad_norm": 0.7440103975092297, + "learning_rate": 0.000281494508847303, + "loss": 7.3653, + "step": 11955 + }, + { + "epoch": 1.115610711952972, + "grad_norm": 0.5616293929028496, + "learning_rate": 0.00028149086925192756, + "loss": 7.3427, + "step": 11956 + }, + { + "epoch": 1.1157040216478493, + "grad_norm": 1.5448807485204101, + "learning_rate": 0.0002814872293222099, + "loss": 7.1083, + "step": 11957 + }, + { + "epoch": 1.1157973313427265, + "grad_norm": 1.4237000850359594, + "learning_rate": 0.0002814835890581591, + "loss": 7.5521, + "step": 11958 + }, + { + "epoch": 1.1158906410376037, + "grad_norm": 1.2227896894770285, + "learning_rate": 0.00028147994845978446, + "loss": 7.253, + "step": 11959 + }, + { + "epoch": 1.1159839507324811, + "grad_norm": 1.283675934803775, + "learning_rate": 0.0002814763075270952, + "loss": 7.3911, + "step": 11960 + }, + { + "epoch": 1.1160772604273583, + "grad_norm": 1.2209993001635848, + "learning_rate": 0.0002814726662601008, + "loss": 7.6146, + "step": 11961 + }, + { + "epoch": 1.1161705701222358, + "grad_norm": 0.7366360632877744, + "learning_rate": 0.00028146902465881027, + "loss": 7.0693, + "step": 11962 + }, + { + "epoch": 1.116263879817113, + "grad_norm": 1.271190854669663, + "learning_rate": 0.00028146538272323294, + "loss": 7.2022, + "step": 11963 + }, + { + "epoch": 1.1163571895119904, + "grad_norm": 0.733757811310173, + "learning_rate": 0.00028146174045337813, + "loss": 7.3156, + "step": 11964 + }, + { + "epoch": 1.1164504992068676, + "grad_norm": 0.9805739396126075, + "learning_rate": 0.000281458097849255, + "loss": 7.3082, + "step": 11965 + }, + { + "epoch": 1.116543808901745, + "grad_norm": 283.93611115691107, + "learning_rate": 0.0002814544549108729, + "loss": 7.4813, + "step": 11966 + }, + { + "epoch": 1.1166371185966222, + "grad_norm": 0.6316719854098917, + "learning_rate": 0.00028145081163824103, + "loss": 7.24, + "step": 11967 + }, + { + "epoch": 1.1167304282914996, + "grad_norm": 0.4167502130915844, + "learning_rate": 0.00028144716803136876, + "loss": 7.2969, + "step": 11968 + }, + { + "epoch": 1.1168237379863768, + "grad_norm": 0.7213308374448139, + "learning_rate": 0.0002814435240902652, + "loss": 7.3964, + "step": 11969 + }, + { + "epoch": 1.116917047681254, + "grad_norm": 1.1658797024269116, + "learning_rate": 0.00028143987981493973, + "loss": 7.6855, + "step": 11970 + }, + { + "epoch": 1.1170103573761314, + "grad_norm": 0.613656379362064, + "learning_rate": 0.0002814362352054016, + "loss": 7.445, + "step": 11971 + }, + { + "epoch": 1.1171036670710086, + "grad_norm": 0.5341078965004956, + "learning_rate": 0.00028143259026166, + "loss": 7.514, + "step": 11972 + }, + { + "epoch": 1.117196976765886, + "grad_norm": 1.134700484323489, + "learning_rate": 0.00028142894498372426, + "loss": 8.1449, + "step": 11973 + }, + { + "epoch": 1.1172902864607632, + "grad_norm": 10.922862121237795, + "learning_rate": 0.0002814252993716037, + "loss": 7.3264, + "step": 11974 + }, + { + "epoch": 1.1173835961556406, + "grad_norm": 3.2976822860394335, + "learning_rate": 0.00028142165342530753, + "loss": 7.2083, + "step": 11975 + }, + { + "epoch": 1.1174769058505178, + "grad_norm": 2.2732982381039415, + "learning_rate": 0.000281418007144845, + "loss": 7.2141, + "step": 11976 + }, + { + "epoch": 1.1175702155453953, + "grad_norm": 0.6577323264250136, + "learning_rate": 0.00028141436053022537, + "loss": 7.4061, + "step": 11977 + }, + { + "epoch": 1.1176635252402725, + "grad_norm": 0.6810005125240175, + "learning_rate": 0.00028141071358145803, + "loss": 7.4256, + "step": 11978 + }, + { + "epoch": 1.1177568349351499, + "grad_norm": 1.151050943329049, + "learning_rate": 0.0002814070662985521, + "loss": 7.7884, + "step": 11979 + }, + { + "epoch": 1.117850144630027, + "grad_norm": 0.8570745492389557, + "learning_rate": 0.000281403418681517, + "loss": 7.8021, + "step": 11980 + }, + { + "epoch": 1.1179434543249043, + "grad_norm": 0.9979174119703409, + "learning_rate": 0.0002813997707303619, + "loss": 7.2024, + "step": 11981 + }, + { + "epoch": 1.1180367640197817, + "grad_norm": 1.4492985555857745, + "learning_rate": 0.00028139612244509606, + "loss": 7.3414, + "step": 11982 + }, + { + "epoch": 1.118130073714659, + "grad_norm": 1.0598115627287312, + "learning_rate": 0.00028139247382572886, + "loss": 7.3602, + "step": 11983 + }, + { + "epoch": 1.1182233834095363, + "grad_norm": 1.4113146705113426, + "learning_rate": 0.0002813888248722695, + "loss": 7.5201, + "step": 11984 + }, + { + "epoch": 1.1183166931044135, + "grad_norm": 0.5893168597075901, + "learning_rate": 0.0002813851755847273, + "loss": 7.4594, + "step": 11985 + }, + { + "epoch": 1.118410002799291, + "grad_norm": 0.41618756480802854, + "learning_rate": 0.0002813815259631115, + "loss": 7.4505, + "step": 11986 + }, + { + "epoch": 1.1185033124941681, + "grad_norm": 0.9056212047268758, + "learning_rate": 0.00028137787600743133, + "loss": 7.5423, + "step": 11987 + }, + { + "epoch": 1.1185966221890455, + "grad_norm": 4.189034936496491, + "learning_rate": 0.00028137422571769625, + "loss": 7.3399, + "step": 11988 + }, + { + "epoch": 1.1186899318839227, + "grad_norm": 1.8628682878626173, + "learning_rate": 0.0002813705750939154, + "loss": 7.0568, + "step": 11989 + }, + { + "epoch": 1.1187832415788, + "grad_norm": 0.5708010833110385, + "learning_rate": 0.00028136692413609807, + "loss": 7.596, + "step": 11990 + }, + { + "epoch": 1.1188765512736774, + "grad_norm": 0.7611167752763403, + "learning_rate": 0.00028136327284425356, + "loss": 7.2869, + "step": 11991 + }, + { + "epoch": 1.1189698609685546, + "grad_norm": 0.5733074575600496, + "learning_rate": 0.00028135962121839114, + "loss": 7.5164, + "step": 11992 + }, + { + "epoch": 1.119063170663432, + "grad_norm": 0.5871533851677738, + "learning_rate": 0.00028135596925852013, + "loss": 7.3333, + "step": 11993 + }, + { + "epoch": 1.1191564803583092, + "grad_norm": 2.406698624013438, + "learning_rate": 0.00028135231696464985, + "loss": 7.4272, + "step": 11994 + }, + { + "epoch": 1.1192497900531866, + "grad_norm": 0.7440067387112169, + "learning_rate": 0.00028134866433678947, + "loss": 7.2633, + "step": 11995 + }, + { + "epoch": 1.1193430997480638, + "grad_norm": 0.9759155143339977, + "learning_rate": 0.00028134501137494836, + "loss": 7.4488, + "step": 11996 + }, + { + "epoch": 1.1194364094429412, + "grad_norm": 0.6021522646984397, + "learning_rate": 0.00028134135807913584, + "loss": 7.2315, + "step": 11997 + }, + { + "epoch": 1.1195297191378184, + "grad_norm": 0.8052651034949049, + "learning_rate": 0.00028133770444936114, + "loss": 7.6155, + "step": 11998 + }, + { + "epoch": 1.1196230288326956, + "grad_norm": 1.3593514546777135, + "learning_rate": 0.0002813340504856335, + "loss": 7.2299, + "step": 11999 + }, + { + "epoch": 1.119716338527573, + "grad_norm": 1.1324963321717394, + "learning_rate": 0.0002813303961879623, + "loss": 7.418, + "step": 12000 + }, + { + "epoch": 1.1198096482224502, + "grad_norm": 0.8954693183682342, + "learning_rate": 0.0002813267415563569, + "loss": 7.3027, + "step": 12001 + }, + { + "epoch": 1.1199029579173276, + "grad_norm": 3.418009269242068, + "learning_rate": 0.0002813230865908264, + "loss": 7.1437, + "step": 12002 + }, + { + "epoch": 1.1199962676122048, + "grad_norm": 0.542219853825802, + "learning_rate": 0.0002813194312913802, + "loss": 7.5514, + "step": 12003 + }, + { + "epoch": 1.1200895773070823, + "grad_norm": 0.7682320842003054, + "learning_rate": 0.00028131577565802763, + "loss": 7.076, + "step": 12004 + }, + { + "epoch": 1.1201828870019594, + "grad_norm": 0.9442917021950674, + "learning_rate": 0.0002813121196907779, + "loss": 7.7067, + "step": 12005 + }, + { + "epoch": 1.1202761966968369, + "grad_norm": 0.5142840597036533, + "learning_rate": 0.0002813084633896403, + "loss": 7.1283, + "step": 12006 + }, + { + "epoch": 1.120369506391714, + "grad_norm": 0.754556603811678, + "learning_rate": 0.0002813048067546243, + "loss": 7.1343, + "step": 12007 + }, + { + "epoch": 1.1204628160865915, + "grad_norm": 0.9971340743576921, + "learning_rate": 0.000281301149785739, + "loss": 7.4881, + "step": 12008 + }, + { + "epoch": 1.1205561257814687, + "grad_norm": 0.628480895776023, + "learning_rate": 0.00028129749248299374, + "loss": 7.2516, + "step": 12009 + }, + { + "epoch": 1.1206494354763459, + "grad_norm": 5.325540814647718, + "learning_rate": 0.0002812938348463979, + "loss": 7.5856, + "step": 12010 + }, + { + "epoch": 1.1207427451712233, + "grad_norm": 0.9431552446053871, + "learning_rate": 0.0002812901768759607, + "loss": 7.3123, + "step": 12011 + }, + { + "epoch": 1.1208360548661005, + "grad_norm": 0.5822338908393198, + "learning_rate": 0.00028128651857169156, + "loss": 7.6366, + "step": 12012 + }, + { + "epoch": 1.120929364560978, + "grad_norm": 1.0365618122141549, + "learning_rate": 0.0002812828599335996, + "loss": 7.4046, + "step": 12013 + }, + { + "epoch": 1.1210226742558551, + "grad_norm": 0.7670370522588927, + "learning_rate": 0.0002812792009616942, + "loss": 7.4483, + "step": 12014 + }, + { + "epoch": 1.1211159839507325, + "grad_norm": 1.0805038498022301, + "learning_rate": 0.00028127554165598476, + "loss": 7.3077, + "step": 12015 + }, + { + "epoch": 1.1212092936456097, + "grad_norm": 0.7333271135229472, + "learning_rate": 0.00028127188201648045, + "loss": 7.4096, + "step": 12016 + }, + { + "epoch": 1.1213026033404871, + "grad_norm": 3.3007067168230306, + "learning_rate": 0.0002812682220431906, + "loss": 7.661, + "step": 12017 + }, + { + "epoch": 1.1213959130353643, + "grad_norm": 0.5116286221965334, + "learning_rate": 0.0002812645617361246, + "loss": 7.1856, + "step": 12018 + }, + { + "epoch": 1.1214892227302418, + "grad_norm": 0.9634638517315083, + "learning_rate": 0.00028126090109529167, + "loss": 7.271, + "step": 12019 + }, + { + "epoch": 1.121582532425119, + "grad_norm": 1.0032084062037832, + "learning_rate": 0.0002812572401207012, + "loss": 7.4842, + "step": 12020 + }, + { + "epoch": 1.1216758421199962, + "grad_norm": 1.0241366588038576, + "learning_rate": 0.0002812535788123624, + "loss": 7.4727, + "step": 12021 + }, + { + "epoch": 1.1217691518148736, + "grad_norm": 7.1732916286370685, + "learning_rate": 0.0002812499171702846, + "loss": 7.2848, + "step": 12022 + }, + { + "epoch": 1.1218624615097508, + "grad_norm": 0.7905464034110337, + "learning_rate": 0.00028124625519447723, + "loss": 7.4008, + "step": 12023 + }, + { + "epoch": 1.1219557712046282, + "grad_norm": 2.379851340943655, + "learning_rate": 0.0002812425928849495, + "loss": 7.0311, + "step": 12024 + }, + { + "epoch": 1.1220490808995054, + "grad_norm": 0.5533413421741128, + "learning_rate": 0.00028123893024171065, + "loss": 7.5856, + "step": 12025 + }, + { + "epoch": 1.1221423905943828, + "grad_norm": 0.5118671161998647, + "learning_rate": 0.00028123526726477017, + "loss": 7.3741, + "step": 12026 + }, + { + "epoch": 1.12223570028926, + "grad_norm": 0.7169082072967391, + "learning_rate": 0.0002812316039541372, + "loss": 7.3388, + "step": 12027 + }, + { + "epoch": 1.1223290099841374, + "grad_norm": 0.8023062662967624, + "learning_rate": 0.0002812279403098212, + "loss": 7.3698, + "step": 12028 + }, + { + "epoch": 1.1224223196790146, + "grad_norm": 0.965796092526486, + "learning_rate": 0.0002812242763318314, + "loss": 7.4239, + "step": 12029 + }, + { + "epoch": 1.122515629373892, + "grad_norm": 0.9191697754280256, + "learning_rate": 0.0002812206120201771, + "loss": 7.0437, + "step": 12030 + }, + { + "epoch": 1.1226089390687692, + "grad_norm": 3.5629792015181256, + "learning_rate": 0.0002812169473748677, + "loss": 7.48, + "step": 12031 + }, + { + "epoch": 1.1227022487636464, + "grad_norm": 0.8087472912361356, + "learning_rate": 0.0002812132823959125, + "loss": 7.0537, + "step": 12032 + }, + { + "epoch": 1.1227955584585239, + "grad_norm": 0.8559180637449314, + "learning_rate": 0.0002812096170833207, + "loss": 7.8543, + "step": 12033 + }, + { + "epoch": 1.122888868153401, + "grad_norm": 0.9827588359605014, + "learning_rate": 0.00028120595143710177, + "loss": 7.376, + "step": 12034 + }, + { + "epoch": 1.1229821778482785, + "grad_norm": 1.186702780544387, + "learning_rate": 0.00028120228545726496, + "loss": 7.2161, + "step": 12035 + }, + { + "epoch": 1.1230754875431557, + "grad_norm": 0.5166145796026077, + "learning_rate": 0.0002811986191438196, + "loss": 7.5245, + "step": 12036 + }, + { + "epoch": 1.123168797238033, + "grad_norm": 0.5870387208713485, + "learning_rate": 0.000281194952496775, + "loss": 7.4798, + "step": 12037 + }, + { + "epoch": 1.1232621069329103, + "grad_norm": 6.72821114617016, + "learning_rate": 0.00028119128551614054, + "loss": 7.4403, + "step": 12038 + }, + { + "epoch": 1.1233554166277877, + "grad_norm": 0.5695508756942843, + "learning_rate": 0.0002811876182019255, + "loss": 7.4587, + "step": 12039 + }, + { + "epoch": 1.123448726322665, + "grad_norm": 0.7593143710043331, + "learning_rate": 0.00028118395055413915, + "loss": 7.4427, + "step": 12040 + }, + { + "epoch": 1.1235420360175423, + "grad_norm": 1.6103481343913622, + "learning_rate": 0.00028118028257279086, + "loss": 7.5451, + "step": 12041 + }, + { + "epoch": 1.1236353457124195, + "grad_norm": 0.4983555736504685, + "learning_rate": 0.00028117661425789006, + "loss": 7.446, + "step": 12042 + }, + { + "epoch": 1.1237286554072967, + "grad_norm": 0.7512101520839145, + "learning_rate": 0.00028117294560944594, + "loss": 7.4713, + "step": 12043 + }, + { + "epoch": 1.1238219651021741, + "grad_norm": 1.5160618107243553, + "learning_rate": 0.0002811692766274679, + "loss": 7.2622, + "step": 12044 + }, + { + "epoch": 1.1239152747970513, + "grad_norm": 1.0748339234621094, + "learning_rate": 0.00028116560731196525, + "loss": 7.2577, + "step": 12045 + }, + { + "epoch": 1.1240085844919288, + "grad_norm": 1.3445676506564324, + "learning_rate": 0.0002811619376629473, + "loss": 7.0228, + "step": 12046 + }, + { + "epoch": 1.124101894186806, + "grad_norm": 0.7010554926478249, + "learning_rate": 0.00028115826768042334, + "loss": 7.5565, + "step": 12047 + }, + { + "epoch": 1.1241952038816834, + "grad_norm": 0.5900991457636405, + "learning_rate": 0.00028115459736440283, + "loss": 7.1172, + "step": 12048 + }, + { + "epoch": 1.1242885135765606, + "grad_norm": 1.528838744144142, + "learning_rate": 0.000281150926714895, + "loss": 7.5835, + "step": 12049 + }, + { + "epoch": 1.124381823271438, + "grad_norm": 1.2874060979595567, + "learning_rate": 0.0002811472557319092, + "loss": 7.5734, + "step": 12050 + }, + { + "epoch": 1.1244751329663152, + "grad_norm": 1.730752918387253, + "learning_rate": 0.00028114358441545486, + "loss": 7.6221, + "step": 12051 + }, + { + "epoch": 1.1245684426611926, + "grad_norm": 0.7277010058894947, + "learning_rate": 0.0002811399127655411, + "loss": 7.3966, + "step": 12052 + }, + { + "epoch": 1.1246617523560698, + "grad_norm": 0.6338875934127649, + "learning_rate": 0.0002811362407821775, + "loss": 7.4866, + "step": 12053 + }, + { + "epoch": 1.124755062050947, + "grad_norm": 1.0805327862370642, + "learning_rate": 0.00028113256846537326, + "loss": 7.421, + "step": 12054 + }, + { + "epoch": 1.1248483717458244, + "grad_norm": 0.6265432996677809, + "learning_rate": 0.0002811288958151377, + "loss": 7.6526, + "step": 12055 + }, + { + "epoch": 1.1249416814407016, + "grad_norm": 1.3579037360733601, + "learning_rate": 0.0002811252228314802, + "loss": 7.3231, + "step": 12056 + }, + { + "epoch": 1.125034991135579, + "grad_norm": 0.46802125669755396, + "learning_rate": 0.0002811215495144101, + "loss": 7.6156, + "step": 12057 + }, + { + "epoch": 1.1251283008304562, + "grad_norm": 1.2345533870287935, + "learning_rate": 0.0002811178758639368, + "loss": 7.3223, + "step": 12058 + }, + { + "epoch": 1.1252216105253336, + "grad_norm": 0.5414256096862298, + "learning_rate": 0.0002811142018800695, + "loss": 7.4508, + "step": 12059 + }, + { + "epoch": 1.1253149202202108, + "grad_norm": 12.75868684609207, + "learning_rate": 0.0002811105275628177, + "loss": 7.278, + "step": 12060 + }, + { + "epoch": 1.1254082299150883, + "grad_norm": 1.2011814992066963, + "learning_rate": 0.0002811068529121906, + "loss": 7.3452, + "step": 12061 + }, + { + "epoch": 1.1255015396099655, + "grad_norm": 1.333456854021496, + "learning_rate": 0.0002811031779281977, + "loss": 7.4025, + "step": 12062 + }, + { + "epoch": 1.1255948493048429, + "grad_norm": 1.2524024227923731, + "learning_rate": 0.0002810995026108482, + "loss": 7.5015, + "step": 12063 + }, + { + "epoch": 1.12568815899972, + "grad_norm": 0.6074299605541731, + "learning_rate": 0.00028109582696015154, + "loss": 7.1627, + "step": 12064 + }, + { + "epoch": 1.1257814686945973, + "grad_norm": 1.6530739532960284, + "learning_rate": 0.00028109215097611696, + "loss": 7.3627, + "step": 12065 + }, + { + "epoch": 1.1258747783894747, + "grad_norm": 1.4356154536340473, + "learning_rate": 0.0002810884746587539, + "loss": 7.5513, + "step": 12066 + }, + { + "epoch": 1.125968088084352, + "grad_norm": 1.2203359312477289, + "learning_rate": 0.0002810847980080717, + "loss": 7.3313, + "step": 12067 + }, + { + "epoch": 1.1260613977792293, + "grad_norm": 1.0541642285324462, + "learning_rate": 0.0002810811210240796, + "loss": 7.7766, + "step": 12068 + }, + { + "epoch": 1.1261547074741065, + "grad_norm": 1.6212465680213175, + "learning_rate": 0.00028107744370678715, + "loss": 7.325, + "step": 12069 + }, + { + "epoch": 1.126248017168984, + "grad_norm": 34.18015125125485, + "learning_rate": 0.00028107376605620356, + "loss": 7.3918, + "step": 12070 + }, + { + "epoch": 1.1263413268638611, + "grad_norm": 140.13456882009123, + "learning_rate": 0.0002810700880723382, + "loss": 7.3084, + "step": 12071 + }, + { + "epoch": 1.1264346365587385, + "grad_norm": 0.659722319156637, + "learning_rate": 0.00028106640975520043, + "loss": 7.3531, + "step": 12072 + }, + { + "epoch": 1.1265279462536157, + "grad_norm": 1.1832668591537998, + "learning_rate": 0.0002810627311047996, + "loss": 7.1595, + "step": 12073 + }, + { + "epoch": 1.1266212559484932, + "grad_norm": 1.7825649387250457, + "learning_rate": 0.0002810590521211451, + "loss": 7.4822, + "step": 12074 + }, + { + "epoch": 1.1267145656433704, + "grad_norm": 1.465813955519427, + "learning_rate": 0.0002810553728042462, + "loss": 7.2095, + "step": 12075 + }, + { + "epoch": 1.1268078753382476, + "grad_norm": 1.7015691247191056, + "learning_rate": 0.00028105169315411233, + "loss": 7.4008, + "step": 12076 + }, + { + "epoch": 1.126901185033125, + "grad_norm": 1.110445081642826, + "learning_rate": 0.0002810480131707528, + "loss": 7.4646, + "step": 12077 + }, + { + "epoch": 1.1269944947280022, + "grad_norm": 134.44178068335142, + "learning_rate": 0.000281044332854177, + "loss": 7.2937, + "step": 12078 + }, + { + "epoch": 1.1270878044228796, + "grad_norm": 1.3557076010650544, + "learning_rate": 0.0002810406522043943, + "loss": 7.432, + "step": 12079 + }, + { + "epoch": 1.1271811141177568, + "grad_norm": 1.235657805789845, + "learning_rate": 0.0002810369712214141, + "loss": 7.3899, + "step": 12080 + }, + { + "epoch": 1.1272744238126342, + "grad_norm": 0.5072961958784069, + "learning_rate": 0.00028103328990524557, + "loss": 7.5904, + "step": 12081 + }, + { + "epoch": 1.1273677335075114, + "grad_norm": 27.54899110294348, + "learning_rate": 0.00028102960825589825, + "loss": 7.828, + "step": 12082 + }, + { + "epoch": 1.1274610432023888, + "grad_norm": 1.2130592342733688, + "learning_rate": 0.00028102592627338147, + "loss": 7.3774, + "step": 12083 + }, + { + "epoch": 1.127554352897266, + "grad_norm": 368.54382546131063, + "learning_rate": 0.0002810222439577046, + "loss": 7.3064, + "step": 12084 + }, + { + "epoch": 1.1276476625921434, + "grad_norm": 0.590680000181988, + "learning_rate": 0.0002810185613088769, + "loss": 7.1727, + "step": 12085 + }, + { + "epoch": 1.1277409722870206, + "grad_norm": 0.8660883625175728, + "learning_rate": 0.00028101487832690784, + "loss": 7.3121, + "step": 12086 + }, + { + "epoch": 1.1278342819818978, + "grad_norm": 1.1057154461323733, + "learning_rate": 0.0002810111950118068, + "loss": 6.9703, + "step": 12087 + }, + { + "epoch": 1.1279275916767753, + "grad_norm": 1.9717672377314452, + "learning_rate": 0.00028100751136358304, + "loss": 7.4653, + "step": 12088 + }, + { + "epoch": 1.1280209013716525, + "grad_norm": 0.8870283989726067, + "learning_rate": 0.00028100382738224604, + "loss": 6.9598, + "step": 12089 + }, + { + "epoch": 1.1281142110665299, + "grad_norm": 1.2542954453249737, + "learning_rate": 0.00028100014306780505, + "loss": 7.5154, + "step": 12090 + }, + { + "epoch": 1.128207520761407, + "grad_norm": 1.8580680972459132, + "learning_rate": 0.00028099645842026956, + "loss": 7.3905, + "step": 12091 + }, + { + "epoch": 1.1283008304562845, + "grad_norm": 1.6690331405386374, + "learning_rate": 0.00028099277343964884, + "loss": 7.1809, + "step": 12092 + }, + { + "epoch": 1.1283941401511617, + "grad_norm": 1.1270089793633566, + "learning_rate": 0.0002809890881259523, + "loss": 7.3809, + "step": 12093 + }, + { + "epoch": 1.1284874498460389, + "grad_norm": 1.665758812563307, + "learning_rate": 0.00028098540247918933, + "loss": 7.0047, + "step": 12094 + }, + { + "epoch": 1.1285807595409163, + "grad_norm": 177.13805812432932, + "learning_rate": 0.0002809817164993693, + "loss": 7.4184, + "step": 12095 + }, + { + "epoch": 1.1286740692357937, + "grad_norm": 1.446334318047456, + "learning_rate": 0.0002809780301865015, + "loss": 7.8602, + "step": 12096 + }, + { + "epoch": 1.128767378930671, + "grad_norm": 0.7018549638295589, + "learning_rate": 0.00028097434354059546, + "loss": 7.1465, + "step": 12097 + }, + { + "epoch": 1.1288606886255481, + "grad_norm": 0.7222729349682582, + "learning_rate": 0.00028097065656166036, + "loss": 7.3103, + "step": 12098 + }, + { + "epoch": 1.1289539983204255, + "grad_norm": 0.7137576665118548, + "learning_rate": 0.00028096696924970573, + "loss": 7.2349, + "step": 12099 + }, + { + "epoch": 1.1290473080153027, + "grad_norm": 1.2213703825052604, + "learning_rate": 0.0002809632816047409, + "loss": 7.4827, + "step": 12100 + }, + { + "epoch": 1.1291406177101802, + "grad_norm": 1.3143258835421852, + "learning_rate": 0.00028095959362677524, + "loss": 7.2516, + "step": 12101 + }, + { + "epoch": 1.1292339274050573, + "grad_norm": 0.8008657261774996, + "learning_rate": 0.00028095590531581813, + "loss": 7.4933, + "step": 12102 + }, + { + "epoch": 1.1293272370999348, + "grad_norm": 1.9781558530497974, + "learning_rate": 0.0002809522166718789, + "loss": 7.2292, + "step": 12103 + }, + { + "epoch": 1.129420546794812, + "grad_norm": 299.99989716969543, + "learning_rate": 0.00028094852769496704, + "loss": 7.3656, + "step": 12104 + }, + { + "epoch": 1.1295138564896892, + "grad_norm": 2.0755020765189687, + "learning_rate": 0.0002809448383850918, + "loss": 7.6955, + "step": 12105 + }, + { + "epoch": 1.1296071661845666, + "grad_norm": 1.2688409013827198, + "learning_rate": 0.00028094114874226264, + "loss": 7.3207, + "step": 12106 + }, + { + "epoch": 1.1297004758794438, + "grad_norm": 1.285690100093422, + "learning_rate": 0.000280937458766489, + "loss": 7.3812, + "step": 12107 + }, + { + "epoch": 1.1297937855743212, + "grad_norm": 1.54763177258218, + "learning_rate": 0.00028093376845778013, + "loss": 7.2609, + "step": 12108 + }, + { + "epoch": 1.1298870952691984, + "grad_norm": 1.0788506132448303, + "learning_rate": 0.00028093007781614544, + "loss": 7.549, + "step": 12109 + }, + { + "epoch": 1.1299804049640758, + "grad_norm": 1.3078840893589814, + "learning_rate": 0.0002809263868415944, + "loss": 7.7982, + "step": 12110 + }, + { + "epoch": 1.130073714658953, + "grad_norm": 256.17624882663176, + "learning_rate": 0.0002809226955341363, + "loss": 7.5528, + "step": 12111 + }, + { + "epoch": 1.1301670243538304, + "grad_norm": 480.04577826184686, + "learning_rate": 0.00028091900389378056, + "loss": 7.0927, + "step": 12112 + }, + { + "epoch": 1.1302603340487076, + "grad_norm": 0.5583796608601702, + "learning_rate": 0.0002809153119205366, + "loss": 7.4146, + "step": 12113 + }, + { + "epoch": 1.130353643743585, + "grad_norm": 1.3222755317600245, + "learning_rate": 0.0002809116196144138, + "loss": 7.7221, + "step": 12114 + }, + { + "epoch": 1.1304469534384622, + "grad_norm": 459.61203348540204, + "learning_rate": 0.0002809079269754215, + "loss": 7.3387, + "step": 12115 + }, + { + "epoch": 1.1305402631333394, + "grad_norm": 1.1164172714049823, + "learning_rate": 0.0002809042340035692, + "loss": 7.2556, + "step": 12116 + }, + { + "epoch": 1.1306335728282169, + "grad_norm": 1.2489025142934906, + "learning_rate": 0.0002809005406988661, + "loss": 7.2919, + "step": 12117 + }, + { + "epoch": 1.130726882523094, + "grad_norm": 1.7240079716629202, + "learning_rate": 0.00028089684706132174, + "loss": 7.5525, + "step": 12118 + }, + { + "epoch": 1.1308201922179715, + "grad_norm": 1.7876888419332586, + "learning_rate": 0.0002808931530909455, + "loss": 7.2665, + "step": 12119 + }, + { + "epoch": 1.1309135019128487, + "grad_norm": 1.9153691862766922, + "learning_rate": 0.00028088945878774666, + "loss": 7.5062, + "step": 12120 + }, + { + "epoch": 1.131006811607726, + "grad_norm": 1.443234149658264, + "learning_rate": 0.00028088576415173476, + "loss": 7.2799, + "step": 12121 + }, + { + "epoch": 1.1311001213026033, + "grad_norm": 1.591071687950381, + "learning_rate": 0.00028088206918291914, + "loss": 7.5707, + "step": 12122 + }, + { + "epoch": 1.1311934309974807, + "grad_norm": 1.3775341253456301, + "learning_rate": 0.0002808783738813092, + "loss": 7.7849, + "step": 12123 + }, + { + "epoch": 1.131286740692358, + "grad_norm": 0.9707487390926454, + "learning_rate": 0.0002808746782469143, + "loss": 7.5134, + "step": 12124 + }, + { + "epoch": 1.1313800503872353, + "grad_norm": 1.416407415641972, + "learning_rate": 0.0002808709822797439, + "loss": 7.2927, + "step": 12125 + }, + { + "epoch": 1.1314733600821125, + "grad_norm": 1.1718666680471677, + "learning_rate": 0.0002808672859798073, + "loss": 7.1383, + "step": 12126 + }, + { + "epoch": 1.1315666697769897, + "grad_norm": 0.6909491057724106, + "learning_rate": 0.00028086358934711393, + "loss": 7.5269, + "step": 12127 + }, + { + "epoch": 1.1316599794718671, + "grad_norm": 0.7374215672023617, + "learning_rate": 0.0002808598923816733, + "loss": 7.2771, + "step": 12128 + }, + { + "epoch": 1.1317532891667443, + "grad_norm": 1.5472328902365116, + "learning_rate": 0.00028085619508349467, + "loss": 7.5598, + "step": 12129 + }, + { + "epoch": 1.1318465988616218, + "grad_norm": 1.9244045943610422, + "learning_rate": 0.00028085249745258753, + "loss": 7.7724, + "step": 12130 + }, + { + "epoch": 1.131939908556499, + "grad_norm": 1.1254169034614234, + "learning_rate": 0.00028084879948896125, + "loss": 7.2615, + "step": 12131 + }, + { + "epoch": 1.1320332182513764, + "grad_norm": 1.3806023090038297, + "learning_rate": 0.00028084510119262524, + "loss": 7.2712, + "step": 12132 + }, + { + "epoch": 1.1321265279462536, + "grad_norm": 1.9529551337659674, + "learning_rate": 0.0002808414025635889, + "loss": 7.2775, + "step": 12133 + }, + { + "epoch": 1.132219837641131, + "grad_norm": 1.1305646208575462, + "learning_rate": 0.0002808377036018616, + "loss": 7.5433, + "step": 12134 + }, + { + "epoch": 1.1323131473360082, + "grad_norm": 1.007357279261822, + "learning_rate": 0.00028083400430745277, + "loss": 7.4203, + "step": 12135 + }, + { + "epoch": 1.1324064570308856, + "grad_norm": 0.898436504342245, + "learning_rate": 0.0002808303046803719, + "loss": 7.4056, + "step": 12136 + }, + { + "epoch": 1.1324997667257628, + "grad_norm": 1.0096954570663985, + "learning_rate": 0.00028082660472062825, + "loss": 7.2227, + "step": 12137 + }, + { + "epoch": 1.13259307642064, + "grad_norm": 0.7475413801908072, + "learning_rate": 0.00028082290442823127, + "loss": 7.4295, + "step": 12138 + }, + { + "epoch": 1.1326863861155174, + "grad_norm": 0.8652441283232936, + "learning_rate": 0.0002808192038031905, + "loss": 7.3704, + "step": 12139 + }, + { + "epoch": 1.1327796958103946, + "grad_norm": 1.0605325234433818, + "learning_rate": 0.00028081550284551513, + "loss": 7.4169, + "step": 12140 + }, + { + "epoch": 1.132873005505272, + "grad_norm": 1.5909411440028338, + "learning_rate": 0.0002808118015552148, + "loss": 7.8583, + "step": 12141 + }, + { + "epoch": 1.1329663152001492, + "grad_norm": 0.9186543253317316, + "learning_rate": 0.0002808080999322987, + "loss": 7.4826, + "step": 12142 + }, + { + "epoch": 1.1330596248950267, + "grad_norm": 0.7068415284420967, + "learning_rate": 0.0002808043979767764, + "loss": 7.5488, + "step": 12143 + }, + { + "epoch": 1.1331529345899038, + "grad_norm": 0.782503210859021, + "learning_rate": 0.0002808006956886573, + "loss": 7.5246, + "step": 12144 + }, + { + "epoch": 1.1332462442847813, + "grad_norm": 1.015823267739436, + "learning_rate": 0.0002807969930679507, + "loss": 7.5594, + "step": 12145 + }, + { + "epoch": 1.1333395539796585, + "grad_norm": 0.7739867687410898, + "learning_rate": 0.0002807932901146661, + "loss": 7.7349, + "step": 12146 + }, + { + "epoch": 1.1334328636745359, + "grad_norm": 0.9544169857130296, + "learning_rate": 0.00028078958682881296, + "loss": 7.6104, + "step": 12147 + }, + { + "epoch": 1.133526173369413, + "grad_norm": 1.739768013975375, + "learning_rate": 0.0002807858832104006, + "loss": 7.4965, + "step": 12148 + }, + { + "epoch": 1.1336194830642903, + "grad_norm": 1.316687890962112, + "learning_rate": 0.00028078217925943855, + "loss": 7.5948, + "step": 12149 + }, + { + "epoch": 1.1337127927591677, + "grad_norm": 1.1179720639007127, + "learning_rate": 0.0002807784749759361, + "loss": 7.2414, + "step": 12150 + }, + { + "epoch": 1.133806102454045, + "grad_norm": 1.5859679810700535, + "learning_rate": 0.00028077477035990273, + "loss": 7.3508, + "step": 12151 + }, + { + "epoch": 1.1338994121489223, + "grad_norm": 1.8022774160120019, + "learning_rate": 0.0002807710654113479, + "loss": 7.1798, + "step": 12152 + }, + { + "epoch": 1.1339927218437995, + "grad_norm": 1.1312766317032428, + "learning_rate": 0.0002807673601302809, + "loss": 7.6259, + "step": 12153 + }, + { + "epoch": 1.134086031538677, + "grad_norm": 1.4087881957063995, + "learning_rate": 0.00028076365451671135, + "loss": 7.2053, + "step": 12154 + }, + { + "epoch": 1.1341793412335541, + "grad_norm": 0.774246790833765, + "learning_rate": 0.00028075994857064846, + "loss": 7.3246, + "step": 12155 + }, + { + "epoch": 1.1342726509284315, + "grad_norm": 1.1042116292363626, + "learning_rate": 0.00028075624229210186, + "loss": 7.3419, + "step": 12156 + }, + { + "epoch": 1.1343659606233087, + "grad_norm": 1.0320381283367634, + "learning_rate": 0.0002807525356810808, + "loss": 7.5745, + "step": 12157 + }, + { + "epoch": 1.1344592703181862, + "grad_norm": 1.722344370933971, + "learning_rate": 0.00028074882873759476, + "loss": 7.4813, + "step": 12158 + }, + { + "epoch": 1.1345525800130634, + "grad_norm": 1.3148894132059548, + "learning_rate": 0.0002807451214616532, + "loss": 7.4887, + "step": 12159 + }, + { + "epoch": 1.1346458897079406, + "grad_norm": 1.5101178927061543, + "learning_rate": 0.00028074141385326556, + "loss": 7.2723, + "step": 12160 + }, + { + "epoch": 1.134739199402818, + "grad_norm": 1.1747358969821415, + "learning_rate": 0.00028073770591244116, + "loss": 7.7313, + "step": 12161 + }, + { + "epoch": 1.1348325090976952, + "grad_norm": 1.353607256892915, + "learning_rate": 0.0002807339976391896, + "loss": 7.6901, + "step": 12162 + }, + { + "epoch": 1.1349258187925726, + "grad_norm": 0.8903085665175793, + "learning_rate": 0.0002807302890335201, + "loss": 7.2348, + "step": 12163 + }, + { + "epoch": 1.1350191284874498, + "grad_norm": 1.124610776792605, + "learning_rate": 0.00028072658009544224, + "loss": 7.4946, + "step": 12164 + }, + { + "epoch": 1.1351124381823272, + "grad_norm": 0.9974069892562467, + "learning_rate": 0.00028072287082496543, + "loss": 7.7267, + "step": 12165 + }, + { + "epoch": 1.1352057478772044, + "grad_norm": 1.7800033270958264, + "learning_rate": 0.0002807191612220991, + "loss": 7.4529, + "step": 12166 + }, + { + "epoch": 1.1352990575720818, + "grad_norm": 1.4759168641702396, + "learning_rate": 0.0002807154512868526, + "loss": 7.7103, + "step": 12167 + }, + { + "epoch": 1.135392367266959, + "grad_norm": 0.7884341396194794, + "learning_rate": 0.0002807117410192355, + "loss": 7.545, + "step": 12168 + }, + { + "epoch": 1.1354856769618364, + "grad_norm": 0.7769280324328437, + "learning_rate": 0.00028070803041925713, + "loss": 7.5605, + "step": 12169 + }, + { + "epoch": 1.1355789866567136, + "grad_norm": 0.9911600161570197, + "learning_rate": 0.000280704319486927, + "loss": 7.641, + "step": 12170 + }, + { + "epoch": 1.1356722963515908, + "grad_norm": 0.7476656057386359, + "learning_rate": 0.0002807006082222544, + "loss": 7.6413, + "step": 12171 + }, + { + "epoch": 1.1357656060464683, + "grad_norm": 1.3288232862826463, + "learning_rate": 0.0002806968966252489, + "loss": 7.1688, + "step": 12172 + }, + { + "epoch": 1.1358589157413455, + "grad_norm": 1.1223128599851588, + "learning_rate": 0.00028069318469592, + "loss": 7.3559, + "step": 12173 + }, + { + "epoch": 1.1359522254362229, + "grad_norm": 0.9108316327475535, + "learning_rate": 0.00028068947243427696, + "loss": 7.5085, + "step": 12174 + }, + { + "epoch": 1.1360455351311, + "grad_norm": 0.8371122989293073, + "learning_rate": 0.0002806857598403293, + "loss": 7.3976, + "step": 12175 + }, + { + "epoch": 1.1361388448259775, + "grad_norm": 1.1486318877255275, + "learning_rate": 0.00028068204691408653, + "loss": 7.5877, + "step": 12176 + }, + { + "epoch": 1.1362321545208547, + "grad_norm": 0.6992356467452137, + "learning_rate": 0.000280678333655558, + "loss": 7.3053, + "step": 12177 + }, + { + "epoch": 1.136325464215732, + "grad_norm": 0.6850078127617276, + "learning_rate": 0.00028067462006475314, + "loss": 7.3936, + "step": 12178 + }, + { + "epoch": 1.1364187739106093, + "grad_norm": 0.8236011615859412, + "learning_rate": 0.00028067090614168145, + "loss": 7.5532, + "step": 12179 + }, + { + "epoch": 1.1365120836054867, + "grad_norm": 1.2898345452149387, + "learning_rate": 0.00028066719188635236, + "loss": 7.1503, + "step": 12180 + }, + { + "epoch": 1.136605393300364, + "grad_norm": 2.2726544075392283, + "learning_rate": 0.0002806634772987753, + "loss": 7.2587, + "step": 12181 + }, + { + "epoch": 1.1366987029952411, + "grad_norm": 1.0267426779212394, + "learning_rate": 0.0002806597623789597, + "loss": 7.772, + "step": 12182 + }, + { + "epoch": 1.1367920126901185, + "grad_norm": 0.9351770142811456, + "learning_rate": 0.0002806560471269151, + "loss": 7.568, + "step": 12183 + }, + { + "epoch": 1.1368853223849957, + "grad_norm": 0.7490113157994043, + "learning_rate": 0.00028065233154265086, + "loss": 7.4822, + "step": 12184 + }, + { + "epoch": 1.1369786320798732, + "grad_norm": 0.8522133455052123, + "learning_rate": 0.0002806486156261764, + "loss": 7.1555, + "step": 12185 + }, + { + "epoch": 1.1370719417747503, + "grad_norm": 0.5349520684643869, + "learning_rate": 0.0002806448993775012, + "loss": 7.4849, + "step": 12186 + }, + { + "epoch": 1.1371652514696278, + "grad_norm": 0.5198125951573748, + "learning_rate": 0.0002806411827966348, + "loss": 7.573, + "step": 12187 + }, + { + "epoch": 1.137258561164505, + "grad_norm": 0.6541158418855554, + "learning_rate": 0.00028063746588358646, + "loss": 7.6607, + "step": 12188 + }, + { + "epoch": 1.1373518708593824, + "grad_norm": 1.5160159053345132, + "learning_rate": 0.00028063374863836584, + "loss": 7.3174, + "step": 12189 + }, + { + "epoch": 1.1374451805542596, + "grad_norm": 1.2885461581501974, + "learning_rate": 0.0002806300310609822, + "loss": 7.2883, + "step": 12190 + }, + { + "epoch": 1.137538490249137, + "grad_norm": 1.0389002262520797, + "learning_rate": 0.00028062631315144517, + "loss": 7.1665, + "step": 12191 + }, + { + "epoch": 1.1376317999440142, + "grad_norm": 1.122194598024653, + "learning_rate": 0.0002806225949097641, + "loss": 7.5019, + "step": 12192 + }, + { + "epoch": 1.1377251096388914, + "grad_norm": 1.4759362577753166, + "learning_rate": 0.0002806188763359484, + "loss": 7.5513, + "step": 12193 + }, + { + "epoch": 1.1378184193337688, + "grad_norm": 1.3751545268745304, + "learning_rate": 0.00028061515743000773, + "loss": 7.5317, + "step": 12194 + }, + { + "epoch": 1.137911729028646, + "grad_norm": 1.0487787678399936, + "learning_rate": 0.0002806114381919513, + "loss": 7.6432, + "step": 12195 + }, + { + "epoch": 1.1380050387235234, + "grad_norm": 1.0244140349623507, + "learning_rate": 0.0002806077186217887, + "loss": 7.2065, + "step": 12196 + }, + { + "epoch": 1.1380983484184006, + "grad_norm": 0.8078918203411869, + "learning_rate": 0.00028060399871952937, + "loss": 7.4435, + "step": 12197 + }, + { + "epoch": 1.138191658113278, + "grad_norm": 0.6602033930698844, + "learning_rate": 0.0002806002784851827, + "loss": 7.6127, + "step": 12198 + }, + { + "epoch": 1.1382849678081552, + "grad_norm": 0.5800913085060426, + "learning_rate": 0.00028059655791875827, + "loss": 7.6583, + "step": 12199 + }, + { + "epoch": 1.1383782775030324, + "grad_norm": 1.989740282220613, + "learning_rate": 0.00028059283702026546, + "loss": 7.7041, + "step": 12200 + }, + { + "epoch": 1.1384715871979099, + "grad_norm": 0.7107433583385656, + "learning_rate": 0.00028058911578971373, + "loss": 7.3933, + "step": 12201 + }, + { + "epoch": 1.1385648968927873, + "grad_norm": 0.5532087204404477, + "learning_rate": 0.0002805853942271126, + "loss": 7.5095, + "step": 12202 + }, + { + "epoch": 1.1386582065876645, + "grad_norm": 0.8507979982605051, + "learning_rate": 0.0002805816723324715, + "loss": 7.4567, + "step": 12203 + }, + { + "epoch": 1.1387515162825417, + "grad_norm": 0.7298463951916766, + "learning_rate": 0.0002805779501057998, + "loss": 7.0524, + "step": 12204 + }, + { + "epoch": 1.138844825977419, + "grad_norm": 1.2040681138804792, + "learning_rate": 0.00028057422754710715, + "loss": 7.3814, + "step": 12205 + }, + { + "epoch": 1.1389381356722963, + "grad_norm": 1.8609570454817064, + "learning_rate": 0.00028057050465640294, + "loss": 7.6481, + "step": 12206 + }, + { + "epoch": 1.1390314453671737, + "grad_norm": 1.1378512880242218, + "learning_rate": 0.0002805667814336965, + "loss": 7.6228, + "step": 12207 + }, + { + "epoch": 1.139124755062051, + "grad_norm": 0.5296269927971585, + "learning_rate": 0.00028056305787899744, + "loss": 7.3527, + "step": 12208 + }, + { + "epoch": 1.1392180647569283, + "grad_norm": 0.6463205427115298, + "learning_rate": 0.0002805593339923153, + "loss": 7.5982, + "step": 12209 + }, + { + "epoch": 1.1393113744518055, + "grad_norm": 1.4239357081944015, + "learning_rate": 0.0002805556097736594, + "loss": 7.2721, + "step": 12210 + }, + { + "epoch": 1.1394046841466827, + "grad_norm": 1.0477585088935775, + "learning_rate": 0.0002805518852230392, + "loss": 7.7524, + "step": 12211 + }, + { + "epoch": 1.1394979938415601, + "grad_norm": 1.4176177574746873, + "learning_rate": 0.00028054816034046425, + "loss": 7.2724, + "step": 12212 + }, + { + "epoch": 1.1395913035364373, + "grad_norm": 0.6674717064962159, + "learning_rate": 0.00028054443512594404, + "loss": 7.5381, + "step": 12213 + }, + { + "epoch": 1.1396846132313148, + "grad_norm": 0.8506675368717315, + "learning_rate": 0.00028054070957948796, + "loss": 7.3445, + "step": 12214 + }, + { + "epoch": 1.139777922926192, + "grad_norm": 1.1471575279327917, + "learning_rate": 0.00028053698370110556, + "loss": 7.5383, + "step": 12215 + }, + { + "epoch": 1.1398712326210694, + "grad_norm": 1.1294507317538187, + "learning_rate": 0.00028053325749080626, + "loss": 7.3537, + "step": 12216 + }, + { + "epoch": 1.1399645423159466, + "grad_norm": 1.527555299747464, + "learning_rate": 0.00028052953094859955, + "loss": 7.6038, + "step": 12217 + }, + { + "epoch": 1.140057852010824, + "grad_norm": 0.6676892768140061, + "learning_rate": 0.0002805258040744949, + "loss": 7.4269, + "step": 12218 + }, + { + "epoch": 1.1401511617057012, + "grad_norm": 0.6333892445411533, + "learning_rate": 0.0002805220768685018, + "loss": 7.5378, + "step": 12219 + }, + { + "epoch": 1.1402444714005786, + "grad_norm": 1.7269037853203903, + "learning_rate": 0.00028051834933062974, + "loss": 7.5973, + "step": 12220 + }, + { + "epoch": 1.1403377810954558, + "grad_norm": 1.145912403857521, + "learning_rate": 0.00028051462146088817, + "loss": 7.4282, + "step": 12221 + }, + { + "epoch": 1.140431090790333, + "grad_norm": 1.2453350001747843, + "learning_rate": 0.0002805108932592866, + "loss": 7.2807, + "step": 12222 + }, + { + "epoch": 1.1405244004852104, + "grad_norm": 0.6912155740444621, + "learning_rate": 0.0002805071647258344, + "loss": 7.4355, + "step": 12223 + }, + { + "epoch": 1.1406177101800876, + "grad_norm": 0.956914182952502, + "learning_rate": 0.0002805034358605412, + "loss": 6.9831, + "step": 12224 + }, + { + "epoch": 1.140711019874965, + "grad_norm": 1.0115230897158756, + "learning_rate": 0.00028049970666341645, + "loss": 7.263, + "step": 12225 + }, + { + "epoch": 1.1408043295698422, + "grad_norm": 1.8078197596746288, + "learning_rate": 0.0002804959771344696, + "loss": 7.4658, + "step": 12226 + }, + { + "epoch": 1.1408976392647197, + "grad_norm": 1.4313172132838654, + "learning_rate": 0.0002804922472737101, + "loss": 7.3924, + "step": 12227 + }, + { + "epoch": 1.1409909489595969, + "grad_norm": 0.8953388274823582, + "learning_rate": 0.0002804885170811475, + "loss": 7.1589, + "step": 12228 + }, + { + "epoch": 1.1410842586544743, + "grad_norm": 0.6947541252619401, + "learning_rate": 0.0002804847865567913, + "loss": 7.5083, + "step": 12229 + }, + { + "epoch": 1.1411775683493515, + "grad_norm": 0.7829372468932799, + "learning_rate": 0.0002804810557006508, + "loss": 7.3962, + "step": 12230 + }, + { + "epoch": 1.1412708780442289, + "grad_norm": 0.43889625120440134, + "learning_rate": 0.0002804773245127357, + "loss": 7.6431, + "step": 12231 + }, + { + "epoch": 1.141364187739106, + "grad_norm": 0.7252165208876965, + "learning_rate": 0.0002804735929930554, + "loss": 7.5877, + "step": 12232 + }, + { + "epoch": 1.1414574974339833, + "grad_norm": 1.3181908725601226, + "learning_rate": 0.0002804698611416195, + "loss": 7.3576, + "step": 12233 + }, + { + "epoch": 1.1415508071288607, + "grad_norm": 0.8246693269670646, + "learning_rate": 0.00028046612895843724, + "loss": 7.4126, + "step": 12234 + }, + { + "epoch": 1.141644116823738, + "grad_norm": 0.883102090856077, + "learning_rate": 0.00028046239644351835, + "loss": 7.1581, + "step": 12235 + }, + { + "epoch": 1.1417374265186153, + "grad_norm": 1.0490719538765245, + "learning_rate": 0.00028045866359687225, + "loss": 7.5975, + "step": 12236 + }, + { + "epoch": 1.1418307362134925, + "grad_norm": 0.7043863576376316, + "learning_rate": 0.0002804549304185084, + "loss": 7.2585, + "step": 12237 + }, + { + "epoch": 1.14192404590837, + "grad_norm": 1.6693707361679306, + "learning_rate": 0.0002804511969084362, + "loss": 7.4309, + "step": 12238 + }, + { + "epoch": 1.1420173556032471, + "grad_norm": 2.2928071659358795, + "learning_rate": 0.0002804474630666654, + "loss": 7.7208, + "step": 12239 + }, + { + "epoch": 1.1421106652981245, + "grad_norm": 1.3512516538701005, + "learning_rate": 0.0002804437288932053, + "loss": 7.6083, + "step": 12240 + }, + { + "epoch": 1.1422039749930017, + "grad_norm": 0.5053996957110332, + "learning_rate": 0.00028043999438806533, + "loss": 7.4637, + "step": 12241 + }, + { + "epoch": 1.1422972846878792, + "grad_norm": 1.689723221502233, + "learning_rate": 0.00028043625955125517, + "loss": 7.3607, + "step": 12242 + }, + { + "epoch": 1.1423905943827564, + "grad_norm": 2.337498382210506, + "learning_rate": 0.00028043252438278427, + "loss": 7.3143, + "step": 12243 + }, + { + "epoch": 1.1424839040776336, + "grad_norm": 1.9419126798624673, + "learning_rate": 0.0002804287888826621, + "loss": 7.264, + "step": 12244 + }, + { + "epoch": 1.142577213772511, + "grad_norm": 1.2083370167634595, + "learning_rate": 0.0002804250530508981, + "loss": 7.4127, + "step": 12245 + }, + { + "epoch": 1.1426705234673882, + "grad_norm": 0.8604182465021465, + "learning_rate": 0.00028042131688750185, + "loss": 7.149, + "step": 12246 + }, + { + "epoch": 1.1427638331622656, + "grad_norm": 0.8415060579262317, + "learning_rate": 0.00028041758039248286, + "loss": 7.3755, + "step": 12247 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 1.5434549806677191, + "learning_rate": 0.00028041384356585055, + "loss": 7.3739, + "step": 12248 + }, + { + "epoch": 1.1429504525520202, + "grad_norm": 2.367907725326918, + "learning_rate": 0.0002804101064076145, + "loss": 7.6008, + "step": 12249 + }, + { + "epoch": 1.1430437622468974, + "grad_norm": 1.152327662163432, + "learning_rate": 0.00028040636891778415, + "loss": 7.2022, + "step": 12250 + }, + { + "epoch": 1.1431370719417748, + "grad_norm": 0.8544657897036925, + "learning_rate": 0.00028040263109636903, + "loss": 7.3222, + "step": 12251 + }, + { + "epoch": 1.143230381636652, + "grad_norm": 0.7591833148468158, + "learning_rate": 0.0002803988929433787, + "loss": 7.5161, + "step": 12252 + }, + { + "epoch": 1.1433236913315294, + "grad_norm": 0.8340442639038558, + "learning_rate": 0.0002803951544588226, + "loss": 7.2733, + "step": 12253 + }, + { + "epoch": 1.1434170010264066, + "grad_norm": 0.7707150882839124, + "learning_rate": 0.0002803914156427102, + "loss": 7.6089, + "step": 12254 + }, + { + "epoch": 1.1435103107212838, + "grad_norm": 3.240545771824521, + "learning_rate": 0.00028038767649505106, + "loss": 7.5897, + "step": 12255 + }, + { + "epoch": 1.1436036204161613, + "grad_norm": 1.4322117322855683, + "learning_rate": 0.0002803839370158547, + "loss": 7.3398, + "step": 12256 + }, + { + "epoch": 1.1436969301110385, + "grad_norm": 1.7568090562894205, + "learning_rate": 0.00028038019720513064, + "loss": 7.0634, + "step": 12257 + }, + { + "epoch": 1.1437902398059159, + "grad_norm": 0.7175153185002445, + "learning_rate": 0.00028037645706288834, + "loss": 7.3889, + "step": 12258 + }, + { + "epoch": 1.143883549500793, + "grad_norm": 1.287839189788165, + "learning_rate": 0.0002803727165891373, + "loss": 7.5128, + "step": 12259 + }, + { + "epoch": 1.1439768591956705, + "grad_norm": 1.8605554645998537, + "learning_rate": 0.0002803689757838871, + "loss": 7.4791, + "step": 12260 + }, + { + "epoch": 1.1440701688905477, + "grad_norm": 1.3597284706529258, + "learning_rate": 0.0002803652346471472, + "loss": 7.2736, + "step": 12261 + }, + { + "epoch": 1.144163478585425, + "grad_norm": 1.08513151705518, + "learning_rate": 0.0002803614931789271, + "loss": 7.2406, + "step": 12262 + }, + { + "epoch": 1.1442567882803023, + "grad_norm": 1.2135542101257153, + "learning_rate": 0.00028035775137923635, + "loss": 7.5336, + "step": 12263 + }, + { + "epoch": 1.1443500979751797, + "grad_norm": 0.8810395108233103, + "learning_rate": 0.00028035400924808447, + "loss": 7.4616, + "step": 12264 + }, + { + "epoch": 1.144443407670057, + "grad_norm": 0.704598774212961, + "learning_rate": 0.0002803502667854809, + "loss": 7.3676, + "step": 12265 + }, + { + "epoch": 1.1445367173649341, + "grad_norm": 1.1695670846841535, + "learning_rate": 0.0002803465239914353, + "loss": 7.1852, + "step": 12266 + }, + { + "epoch": 1.1446300270598115, + "grad_norm": 0.9139302829979286, + "learning_rate": 0.00028034278086595707, + "loss": 7.2436, + "step": 12267 + }, + { + "epoch": 1.1447233367546887, + "grad_norm": 0.821550398305045, + "learning_rate": 0.0002803390374090558, + "loss": 7.2902, + "step": 12268 + }, + { + "epoch": 1.1448166464495662, + "grad_norm": 0.7721314898825885, + "learning_rate": 0.0002803352936207409, + "loss": 7.5866, + "step": 12269 + }, + { + "epoch": 1.1449099561444434, + "grad_norm": 1.3366222396459413, + "learning_rate": 0.0002803315495010219, + "loss": 7.1701, + "step": 12270 + }, + { + "epoch": 1.1450032658393208, + "grad_norm": 1.092643095473237, + "learning_rate": 0.00028032780504990855, + "loss": 7.4535, + "step": 12271 + }, + { + "epoch": 1.145096575534198, + "grad_norm": 3.1560277926197555, + "learning_rate": 0.00028032406026741, + "loss": 7.4624, + "step": 12272 + }, + { + "epoch": 1.1451898852290754, + "grad_norm": 0.9114747166347991, + "learning_rate": 0.0002803203151535361, + "loss": 7.4294, + "step": 12273 + }, + { + "epoch": 1.1452831949239526, + "grad_norm": 0.7254019882397578, + "learning_rate": 0.00028031656970829624, + "loss": 7.2673, + "step": 12274 + }, + { + "epoch": 1.14537650461883, + "grad_norm": 0.5726014995469294, + "learning_rate": 0.0002803128239316999, + "loss": 7.2716, + "step": 12275 + }, + { + "epoch": 1.1454698143137072, + "grad_norm": 1.431705789333713, + "learning_rate": 0.00028030907782375673, + "loss": 7.4153, + "step": 12276 + }, + { + "epoch": 1.1455631240085844, + "grad_norm": 1.3653273801746528, + "learning_rate": 0.0002803053313844761, + "loss": 7.4146, + "step": 12277 + }, + { + "epoch": 1.1456564337034618, + "grad_norm": 0.6652452022124252, + "learning_rate": 0.0002803015846138676, + "loss": 7.4523, + "step": 12278 + }, + { + "epoch": 1.145749743398339, + "grad_norm": 0.7663248101388972, + "learning_rate": 0.0002802978375119408, + "loss": 8.107, + "step": 12279 + }, + { + "epoch": 1.1458430530932164, + "grad_norm": 1.456093618424906, + "learning_rate": 0.0002802940900787052, + "loss": 7.2734, + "step": 12280 + }, + { + "epoch": 1.1459363627880936, + "grad_norm": 1.0476807799662955, + "learning_rate": 0.00028029034231417027, + "loss": 7.3728, + "step": 12281 + }, + { + "epoch": 1.146029672482971, + "grad_norm": 0.5677782478530496, + "learning_rate": 0.0002802865942183457, + "loss": 7.3572, + "step": 12282 + }, + { + "epoch": 1.1461229821778482, + "grad_norm": 0.47496998771281435, + "learning_rate": 0.0002802828457912408, + "loss": 7.3878, + "step": 12283 + }, + { + "epoch": 1.1462162918727257, + "grad_norm": 0.9548823551726601, + "learning_rate": 0.0002802790970328653, + "loss": 7.7481, + "step": 12284 + }, + { + "epoch": 1.1463096015676029, + "grad_norm": 0.8078710309116771, + "learning_rate": 0.0002802753479432286, + "loss": 7.3201, + "step": 12285 + }, + { + "epoch": 1.1464029112624803, + "grad_norm": 0.7270414764516163, + "learning_rate": 0.0002802715985223403, + "loss": 7.3211, + "step": 12286 + }, + { + "epoch": 1.1464962209573575, + "grad_norm": 0.39830873798650995, + "learning_rate": 0.00028026784877020994, + "loss": 7.2527, + "step": 12287 + }, + { + "epoch": 1.1465895306522347, + "grad_norm": 0.45889109509029913, + "learning_rate": 0.000280264098686847, + "loss": 7.4122, + "step": 12288 + }, + { + "epoch": 1.146682840347112, + "grad_norm": 0.7740313839665507, + "learning_rate": 0.000280260348272261, + "loss": 7.535, + "step": 12289 + }, + { + "epoch": 1.1467761500419893, + "grad_norm": 0.6018716620348564, + "learning_rate": 0.0002802565975264616, + "loss": 7.6871, + "step": 12290 + }, + { + "epoch": 1.1468694597368667, + "grad_norm": 1.5360792632545388, + "learning_rate": 0.0002802528464494582, + "loss": 7.1487, + "step": 12291 + }, + { + "epoch": 1.146962769431744, + "grad_norm": 0.7636726588572382, + "learning_rate": 0.00028024909504126044, + "loss": 7.2877, + "step": 12292 + }, + { + "epoch": 1.1470560791266213, + "grad_norm": 0.6170474074752276, + "learning_rate": 0.0002802453433018778, + "loss": 7.414, + "step": 12293 + }, + { + "epoch": 1.1471493888214985, + "grad_norm": 0.5475608932543287, + "learning_rate": 0.00028024159123131985, + "loss": 7.1701, + "step": 12294 + }, + { + "epoch": 1.147242698516376, + "grad_norm": 1.1108812907464944, + "learning_rate": 0.00028023783882959603, + "loss": 7.2847, + "step": 12295 + }, + { + "epoch": 1.1473360082112531, + "grad_norm": 0.7541988794219705, + "learning_rate": 0.00028023408609671605, + "loss": 7.2607, + "step": 12296 + }, + { + "epoch": 1.1474293179061306, + "grad_norm": 1.3879389889336555, + "learning_rate": 0.0002802303330326894, + "loss": 7.4043, + "step": 12297 + }, + { + "epoch": 1.1475226276010078, + "grad_norm": 1.2405614504522968, + "learning_rate": 0.0002802265796375255, + "loss": 7.5095, + "step": 12298 + }, + { + "epoch": 1.147615937295885, + "grad_norm": 0.6318916312843275, + "learning_rate": 0.000280222825911234, + "loss": 7.7379, + "step": 12299 + }, + { + "epoch": 1.1477092469907624, + "grad_norm": 1.7595309240331833, + "learning_rate": 0.00028021907185382444, + "loss": 7.4978, + "step": 12300 + }, + { + "epoch": 1.1478025566856396, + "grad_norm": 0.9145979539567862, + "learning_rate": 0.0002802153174653064, + "loss": 7.5396, + "step": 12301 + }, + { + "epoch": 1.147895866380517, + "grad_norm": 1.9884506632032264, + "learning_rate": 0.0002802115627456893, + "loss": 7.3552, + "step": 12302 + }, + { + "epoch": 1.1479891760753942, + "grad_norm": 1.515199096843502, + "learning_rate": 0.0002802078076949828, + "loss": 7.2945, + "step": 12303 + }, + { + "epoch": 1.1480824857702716, + "grad_norm": 0.8510026014263951, + "learning_rate": 0.00028020405231319646, + "loss": 7.3528, + "step": 12304 + }, + { + "epoch": 1.1481757954651488, + "grad_norm": 0.7512216743459025, + "learning_rate": 0.0002802002966003397, + "loss": 7.4426, + "step": 12305 + }, + { + "epoch": 1.148269105160026, + "grad_norm": 5.258464377138957, + "learning_rate": 0.00028019654055642224, + "loss": 7.3475, + "step": 12306 + }, + { + "epoch": 1.1483624148549034, + "grad_norm": 1.1850090333610577, + "learning_rate": 0.0002801927841814535, + "loss": 7.3058, + "step": 12307 + }, + { + "epoch": 1.1484557245497808, + "grad_norm": 0.8763948566670172, + "learning_rate": 0.00028018902747544306, + "loss": 7.3406, + "step": 12308 + }, + { + "epoch": 1.148549034244658, + "grad_norm": 27.737341329634557, + "learning_rate": 0.00028018527043840053, + "loss": 7.5116, + "step": 12309 + }, + { + "epoch": 1.1486423439395352, + "grad_norm": 0.7172816340002629, + "learning_rate": 0.0002801815130703354, + "loss": 7.3357, + "step": 12310 + }, + { + "epoch": 1.1487356536344127, + "grad_norm": 1.4065848348343097, + "learning_rate": 0.00028017775537125724, + "loss": 7.2874, + "step": 12311 + }, + { + "epoch": 1.1488289633292899, + "grad_norm": 1.320674843074636, + "learning_rate": 0.0002801739973411756, + "loss": 7.4053, + "step": 12312 + }, + { + "epoch": 1.1489222730241673, + "grad_norm": 1.632297258954267, + "learning_rate": 0.0002801702389801001, + "loss": 7.3192, + "step": 12313 + }, + { + "epoch": 1.1490155827190445, + "grad_norm": 1.574277386687579, + "learning_rate": 0.00028016648028804023, + "loss": 7.0161, + "step": 12314 + }, + { + "epoch": 1.149108892413922, + "grad_norm": 165.62518638083037, + "learning_rate": 0.0002801627212650055, + "loss": 7.6034, + "step": 12315 + }, + { + "epoch": 1.149202202108799, + "grad_norm": 1.337987453841056, + "learning_rate": 0.0002801589619110055, + "loss": 7.4714, + "step": 12316 + }, + { + "epoch": 1.1492955118036763, + "grad_norm": 0.9577223843734498, + "learning_rate": 0.0002801552022260499, + "loss": 7.3592, + "step": 12317 + }, + { + "epoch": 1.1493888214985537, + "grad_norm": 0.779795446925973, + "learning_rate": 0.0002801514422101482, + "loss": 7.0994, + "step": 12318 + }, + { + "epoch": 1.149482131193431, + "grad_norm": 1.1006534268851929, + "learning_rate": 0.00028014768186330986, + "loss": 7.6082, + "step": 12319 + }, + { + "epoch": 1.1495754408883083, + "grad_norm": 1.469977943959968, + "learning_rate": 0.00028014392118554457, + "loss": 7.8829, + "step": 12320 + }, + { + "epoch": 1.1496687505831855, + "grad_norm": 0.5702184278356117, + "learning_rate": 0.00028014016017686175, + "loss": 7.3802, + "step": 12321 + }, + { + "epoch": 1.149762060278063, + "grad_norm": 1.1822873546618786, + "learning_rate": 0.00028013639883727115, + "loss": 7.4801, + "step": 12322 + }, + { + "epoch": 1.1498553699729401, + "grad_norm": 0.6310091340564062, + "learning_rate": 0.00028013263716678217, + "loss": 7.9332, + "step": 12323 + }, + { + "epoch": 1.1499486796678176, + "grad_norm": 1.6670779682178518, + "learning_rate": 0.0002801288751654045, + "loss": 7.178, + "step": 12324 + }, + { + "epoch": 1.1500419893626947, + "grad_norm": 1.4066178254081956, + "learning_rate": 0.0002801251128331476, + "loss": 7.2894, + "step": 12325 + }, + { + "epoch": 1.1501352990575722, + "grad_norm": 0.864088339086439, + "learning_rate": 0.0002801213501700211, + "loss": 7.3185, + "step": 12326 + }, + { + "epoch": 1.1502286087524494, + "grad_norm": 0.6982685070349792, + "learning_rate": 0.0002801175871760346, + "loss": 7.3931, + "step": 12327 + }, + { + "epoch": 1.1503219184473266, + "grad_norm": 1.235108856138424, + "learning_rate": 0.00028011382385119757, + "loss": 7.2777, + "step": 12328 + }, + { + "epoch": 1.150415228142204, + "grad_norm": 0.7610337321909254, + "learning_rate": 0.0002801100601955196, + "loss": 7.234, + "step": 12329 + }, + { + "epoch": 1.1505085378370812, + "grad_norm": 0.6251909379922769, + "learning_rate": 0.00028010629620901035, + "loss": 7.1228, + "step": 12330 + }, + { + "epoch": 1.1506018475319586, + "grad_norm": 2.4305754324548685, + "learning_rate": 0.0002801025318916793, + "loss": 7.2942, + "step": 12331 + }, + { + "epoch": 1.1506951572268358, + "grad_norm": 0.9929904726363055, + "learning_rate": 0.00028009876724353607, + "loss": 7.7378, + "step": 12332 + }, + { + "epoch": 1.1507884669217132, + "grad_norm": 0.925449190600393, + "learning_rate": 0.00028009500226459016, + "loss": 7.5958, + "step": 12333 + }, + { + "epoch": 1.1508817766165904, + "grad_norm": 1.7404087738147238, + "learning_rate": 0.00028009123695485125, + "loss": 7.2979, + "step": 12334 + }, + { + "epoch": 1.1509750863114678, + "grad_norm": 1.9421776658033794, + "learning_rate": 0.00028008747131432886, + "loss": 7.2741, + "step": 12335 + }, + { + "epoch": 1.151068396006345, + "grad_norm": 0.8573847784654637, + "learning_rate": 0.00028008370534303255, + "loss": 7.3765, + "step": 12336 + }, + { + "epoch": 1.1511617057012224, + "grad_norm": 0.7703532072717684, + "learning_rate": 0.00028007993904097185, + "loss": 7.3711, + "step": 12337 + }, + { + "epoch": 1.1512550153960996, + "grad_norm": 396.10880239797183, + "learning_rate": 0.00028007617240815647, + "loss": 7.4227, + "step": 12338 + }, + { + "epoch": 1.1513483250909768, + "grad_norm": 0.5849075561505159, + "learning_rate": 0.00028007240544459594, + "loss": 7.2183, + "step": 12339 + }, + { + "epoch": 1.1514416347858543, + "grad_norm": 0.6038236820203718, + "learning_rate": 0.00028006863815029976, + "loss": 7.363, + "step": 12340 + }, + { + "epoch": 1.1515349444807315, + "grad_norm": 0.7818411392781494, + "learning_rate": 0.0002800648705252776, + "loss": 7.0542, + "step": 12341 + }, + { + "epoch": 1.1516282541756089, + "grad_norm": 0.8059750195946803, + "learning_rate": 0.000280061102569539, + "loss": 7.2124, + "step": 12342 + }, + { + "epoch": 1.151721563870486, + "grad_norm": 3880.7036203091607, + "learning_rate": 0.0002800573342830935, + "loss": 7.618, + "step": 12343 + }, + { + "epoch": 1.1518148735653635, + "grad_norm": 0.7839394577632118, + "learning_rate": 0.00028005356566595076, + "loss": 7.7195, + "step": 12344 + }, + { + "epoch": 1.1519081832602407, + "grad_norm": 0.5550352040090931, + "learning_rate": 0.0002800497967181203, + "loss": 7.0509, + "step": 12345 + }, + { + "epoch": 1.1520014929551181, + "grad_norm": 0.9135180784738735, + "learning_rate": 0.0002800460274396117, + "loss": 7.2195, + "step": 12346 + }, + { + "epoch": 1.1520948026499953, + "grad_norm": 1.5080679505651178, + "learning_rate": 0.00028004225783043467, + "loss": 7.4037, + "step": 12347 + }, + { + "epoch": 1.1521881123448727, + "grad_norm": 1.6243734420488196, + "learning_rate": 0.0002800384878905986, + "loss": 7.4973, + "step": 12348 + }, + { + "epoch": 1.15228142203975, + "grad_norm": 0.6369908947961901, + "learning_rate": 0.00028003471762011326, + "loss": 7.0773, + "step": 12349 + }, + { + "epoch": 1.1523747317346271, + "grad_norm": 1.1892253327633975, + "learning_rate": 0.0002800309470189881, + "loss": 7.6426, + "step": 12350 + }, + { + "epoch": 1.1524680414295045, + "grad_norm": 1.249213837551147, + "learning_rate": 0.00028002717608723276, + "loss": 7.0348, + "step": 12351 + }, + { + "epoch": 1.1525613511243817, + "grad_norm": 98.43890240669413, + "learning_rate": 0.00028002340482485686, + "loss": 7.3887, + "step": 12352 + }, + { + "epoch": 1.1526546608192592, + "grad_norm": 380.45892421724477, + "learning_rate": 0.00028001963323187, + "loss": 7.3537, + "step": 12353 + }, + { + "epoch": 1.1527479705141364, + "grad_norm": 1.1039625324633995, + "learning_rate": 0.00028001586130828164, + "loss": 7.3133, + "step": 12354 + }, + { + "epoch": 1.1528412802090138, + "grad_norm": 5137.293514598377, + "learning_rate": 0.0002800120890541015, + "loss": 7.605, + "step": 12355 + }, + { + "epoch": 1.152934589903891, + "grad_norm": 1.9859117489803184, + "learning_rate": 0.0002800083164693391, + "loss": 7.6008, + "step": 12356 + }, + { + "epoch": 1.1530278995987684, + "grad_norm": 1.339326645858661, + "learning_rate": 0.0002800045435540041, + "loss": 7.3501, + "step": 12357 + }, + { + "epoch": 1.1531212092936456, + "grad_norm": 2.0812140446529748, + "learning_rate": 0.000280000770308106, + "loss": 7.4658, + "step": 12358 + }, + { + "epoch": 1.153214518988523, + "grad_norm": 0.9995371769306347, + "learning_rate": 0.0002799969967316545, + "loss": 7.3196, + "step": 12359 + }, + { + "epoch": 1.1533078286834002, + "grad_norm": 0.9563305706250463, + "learning_rate": 0.00027999322282465914, + "loss": 7.2544, + "step": 12360 + }, + { + "epoch": 1.1534011383782774, + "grad_norm": 3.7543780351395313, + "learning_rate": 0.0002799894485871295, + "loss": 7.579, + "step": 12361 + }, + { + "epoch": 1.1534944480731548, + "grad_norm": 1.497670847637091, + "learning_rate": 0.0002799856740190752, + "loss": 7.8017, + "step": 12362 + }, + { + "epoch": 1.153587757768032, + "grad_norm": 2.0258040652215463, + "learning_rate": 0.00027998189912050586, + "loss": 7.4551, + "step": 12363 + }, + { + "epoch": 1.1536810674629094, + "grad_norm": 1.9102130383942977, + "learning_rate": 0.00027997812389143103, + "loss": 7.2366, + "step": 12364 + }, + { + "epoch": 1.1537743771577866, + "grad_norm": 0.908840328924378, + "learning_rate": 0.00027997434833186037, + "loss": 7.2465, + "step": 12365 + }, + { + "epoch": 1.153867686852664, + "grad_norm": 0.968364677567398, + "learning_rate": 0.0002799705724418034, + "loss": 7.3004, + "step": 12366 + }, + { + "epoch": 1.1539609965475413, + "grad_norm": 1.63443589555764, + "learning_rate": 0.0002799667962212698, + "loss": 7.4174, + "step": 12367 + }, + { + "epoch": 1.1540543062424187, + "grad_norm": 1.5797180891046758, + "learning_rate": 0.0002799630196702691, + "loss": 7.4061, + "step": 12368 + }, + { + "epoch": 1.1541476159372959, + "grad_norm": 2.040263347790303, + "learning_rate": 0.00027995924278881095, + "loss": 7.5586, + "step": 12369 + }, + { + "epoch": 1.1542409256321733, + "grad_norm": 0.9355644595749887, + "learning_rate": 0.00027995546557690496, + "loss": 7.2759, + "step": 12370 + }, + { + "epoch": 1.1543342353270505, + "grad_norm": 0.5148401742512454, + "learning_rate": 0.00027995168803456066, + "loss": 7.4339, + "step": 12371 + }, + { + "epoch": 1.1544275450219277, + "grad_norm": 0.8036799431887695, + "learning_rate": 0.0002799479101617878, + "loss": 7.4347, + "step": 12372 + }, + { + "epoch": 1.154520854716805, + "grad_norm": 0.5340613186049774, + "learning_rate": 0.00027994413195859587, + "loss": 7.8841, + "step": 12373 + }, + { + "epoch": 1.1546141644116823, + "grad_norm": 1.9027326967702485, + "learning_rate": 0.00027994035342499446, + "loss": 7.1126, + "step": 12374 + }, + { + "epoch": 1.1547074741065597, + "grad_norm": 1.8763854332518328, + "learning_rate": 0.0002799365745609933, + "loss": 7.4647, + "step": 12375 + }, + { + "epoch": 1.154800783801437, + "grad_norm": 0.8888621672513441, + "learning_rate": 0.00027993279536660183, + "loss": 7.6147, + "step": 12376 + }, + { + "epoch": 1.1548940934963143, + "grad_norm": 0.7845085685750591, + "learning_rate": 0.0002799290158418298, + "loss": 7.5243, + "step": 12377 + }, + { + "epoch": 1.1549874031911915, + "grad_norm": 0.7456740067451277, + "learning_rate": 0.0002799252359866867, + "loss": 7.558, + "step": 12378 + }, + { + "epoch": 1.155080712886069, + "grad_norm": 1.2904393262712492, + "learning_rate": 0.0002799214558011823, + "loss": 7.4989, + "step": 12379 + }, + { + "epoch": 1.1551740225809461, + "grad_norm": 1.415709586137028, + "learning_rate": 0.0002799176752853261, + "loss": 7.3801, + "step": 12380 + }, + { + "epoch": 1.1552673322758236, + "grad_norm": 2.054787325266845, + "learning_rate": 0.00027991389443912776, + "loss": 7.8474, + "step": 12381 + }, + { + "epoch": 1.1553606419707008, + "grad_norm": 1.9859338399318631, + "learning_rate": 0.00027991011326259686, + "loss": 7.6173, + "step": 12382 + }, + { + "epoch": 1.155453951665578, + "grad_norm": 1.5508466659431313, + "learning_rate": 0.000279906331755743, + "loss": 7.7393, + "step": 12383 + }, + { + "epoch": 1.1555472613604554, + "grad_norm": 0.6709279864536445, + "learning_rate": 0.0002799025499185758, + "loss": 7.2454, + "step": 12384 + }, + { + "epoch": 1.1556405710553326, + "grad_norm": 1.4458935508929458, + "learning_rate": 0.00027989876775110493, + "loss": 7.2741, + "step": 12385 + }, + { + "epoch": 1.15573388075021, + "grad_norm": 0.6793133782977074, + "learning_rate": 0.00027989498525334, + "loss": 7.6028, + "step": 12386 + }, + { + "epoch": 1.1558271904450872, + "grad_norm": 1.4234502870811732, + "learning_rate": 0.00027989120242529055, + "loss": 7.4838, + "step": 12387 + }, + { + "epoch": 1.1559205001399646, + "grad_norm": 1.5347295513518475, + "learning_rate": 0.00027988741926696624, + "loss": 7.3332, + "step": 12388 + }, + { + "epoch": 1.1560138098348418, + "grad_norm": 1.251822341640764, + "learning_rate": 0.00027988363577837675, + "loss": 7.296, + "step": 12389 + }, + { + "epoch": 1.1561071195297192, + "grad_norm": 0.6691212775633718, + "learning_rate": 0.00027987985195953164, + "loss": 7.2773, + "step": 12390 + }, + { + "epoch": 1.1562004292245964, + "grad_norm": 0.6227871755353377, + "learning_rate": 0.0002798760678104405, + "loss": 7.3711, + "step": 12391 + }, + { + "epoch": 1.1562937389194738, + "grad_norm": 0.7461428725334206, + "learning_rate": 0.00027987228333111304, + "loss": 7.3321, + "step": 12392 + }, + { + "epoch": 1.156387048614351, + "grad_norm": 0.7634630433808992, + "learning_rate": 0.0002798684985215588, + "loss": 6.9864, + "step": 12393 + }, + { + "epoch": 1.1564803583092282, + "grad_norm": 0.6987651815975353, + "learning_rate": 0.00027986471338178746, + "loss": 7.1334, + "step": 12394 + }, + { + "epoch": 1.1565736680041057, + "grad_norm": 1.121486022204983, + "learning_rate": 0.0002798609279118086, + "loss": 7.3261, + "step": 12395 + }, + { + "epoch": 1.1566669776989829, + "grad_norm": 0.8198935688907204, + "learning_rate": 0.00027985714211163187, + "loss": 7.3559, + "step": 12396 + }, + { + "epoch": 1.1567602873938603, + "grad_norm": 0.8552749720661615, + "learning_rate": 0.00027985335598126686, + "loss": 7.1843, + "step": 12397 + }, + { + "epoch": 1.1568535970887375, + "grad_norm": 0.8292916902687241, + "learning_rate": 0.0002798495695207233, + "loss": 7.3804, + "step": 12398 + }, + { + "epoch": 1.156946906783615, + "grad_norm": 1.1281770631189412, + "learning_rate": 0.0002798457827300107, + "loss": 7.079, + "step": 12399 + }, + { + "epoch": 1.157040216478492, + "grad_norm": 1.1614721854844916, + "learning_rate": 0.0002798419956091388, + "loss": 7.4563, + "step": 12400 + }, + { + "epoch": 1.1571335261733695, + "grad_norm": 0.7489251315040089, + "learning_rate": 0.0002798382081581171, + "loss": 7.2414, + "step": 12401 + }, + { + "epoch": 1.1572268358682467, + "grad_norm": 1.2196119470937186, + "learning_rate": 0.0002798344203769553, + "loss": 7.391, + "step": 12402 + }, + { + "epoch": 1.1573201455631241, + "grad_norm": 0.7641726246529302, + "learning_rate": 0.00027983063226566307, + "loss": 7.1172, + "step": 12403 + }, + { + "epoch": 1.1574134552580013, + "grad_norm": 1.0118319755120069, + "learning_rate": 0.00027982684382425, + "loss": 7.4826, + "step": 12404 + }, + { + "epoch": 1.1575067649528785, + "grad_norm": 0.8351314093497042, + "learning_rate": 0.00027982305505272573, + "loss": 6.916, + "step": 12405 + }, + { + "epoch": 1.157600074647756, + "grad_norm": 0.9768735041895954, + "learning_rate": 0.00027981926595109985, + "loss": 7.5046, + "step": 12406 + }, + { + "epoch": 1.1576933843426331, + "grad_norm": 0.7542826736888518, + "learning_rate": 0.00027981547651938205, + "loss": 7.2467, + "step": 12407 + }, + { + "epoch": 1.1577866940375106, + "grad_norm": 0.8643920624937566, + "learning_rate": 0.00027981168675758194, + "loss": 7.6114, + "step": 12408 + }, + { + "epoch": 1.1578800037323878, + "grad_norm": 0.6500914532934733, + "learning_rate": 0.0002798078966657092, + "loss": 7.4874, + "step": 12409 + }, + { + "epoch": 1.1579733134272652, + "grad_norm": 0.6306447667585058, + "learning_rate": 0.00027980410624377336, + "loss": 7.6724, + "step": 12410 + }, + { + "epoch": 1.1580666231221424, + "grad_norm": 1.0112468897273517, + "learning_rate": 0.00027980031549178414, + "loss": 7.2334, + "step": 12411 + }, + { + "epoch": 1.1581599328170196, + "grad_norm": 0.6697413119379609, + "learning_rate": 0.0002797965244097512, + "loss": 7.5646, + "step": 12412 + }, + { + "epoch": 1.158253242511897, + "grad_norm": 0.6774258861040207, + "learning_rate": 0.00027979273299768415, + "loss": 7.5924, + "step": 12413 + }, + { + "epoch": 1.1583465522067744, + "grad_norm": 1.3668359534077232, + "learning_rate": 0.00027978894125559267, + "loss": 7.1308, + "step": 12414 + }, + { + "epoch": 1.1584398619016516, + "grad_norm": 0.607806321475461, + "learning_rate": 0.00027978514918348626, + "loss": 7.4055, + "step": 12415 + }, + { + "epoch": 1.1585331715965288, + "grad_norm": 0.59337598030813, + "learning_rate": 0.00027978135678137473, + "loss": 7.3154, + "step": 12416 + }, + { + "epoch": 1.1586264812914062, + "grad_norm": 0.9254762034291004, + "learning_rate": 0.0002797775640492676, + "loss": 7.3556, + "step": 12417 + }, + { + "epoch": 1.1587197909862834, + "grad_norm": 1.5315843151873505, + "learning_rate": 0.00027977377098717464, + "loss": 7.6434, + "step": 12418 + }, + { + "epoch": 1.1588131006811608, + "grad_norm": 0.6766112581248809, + "learning_rate": 0.00027976997759510537, + "loss": 7.2441, + "step": 12419 + }, + { + "epoch": 1.158906410376038, + "grad_norm": 0.6336297847972457, + "learning_rate": 0.0002797661838730695, + "loss": 7.568, + "step": 12420 + }, + { + "epoch": 1.1589997200709155, + "grad_norm": 0.6707900535679716, + "learning_rate": 0.00027976238982107665, + "loss": 7.2078, + "step": 12421 + }, + { + "epoch": 1.1590930297657926, + "grad_norm": 1.1744175049916072, + "learning_rate": 0.00027975859543913654, + "loss": 7.3504, + "step": 12422 + }, + { + "epoch": 1.1591863394606698, + "grad_norm": 1.6910716480293193, + "learning_rate": 0.0002797548007272587, + "loss": 7.3302, + "step": 12423 + }, + { + "epoch": 1.1592796491555473, + "grad_norm": 1.11047821920488, + "learning_rate": 0.0002797510056854529, + "loss": 7.2648, + "step": 12424 + }, + { + "epoch": 1.1593729588504245, + "grad_norm": 0.839378704811701, + "learning_rate": 0.0002797472103137286, + "loss": 7.4408, + "step": 12425 + }, + { + "epoch": 1.1594662685453019, + "grad_norm": 1.0890276313836642, + "learning_rate": 0.0002797434146120957, + "loss": 7.6568, + "step": 12426 + }, + { + "epoch": 1.159559578240179, + "grad_norm": 1.4888492465059333, + "learning_rate": 0.00027973961858056366, + "loss": 7.5688, + "step": 12427 + }, + { + "epoch": 1.1596528879350565, + "grad_norm": 0.6303150033591044, + "learning_rate": 0.0002797358222191422, + "loss": 7.144, + "step": 12428 + }, + { + "epoch": 1.1597461976299337, + "grad_norm": 1.1221356667635058, + "learning_rate": 0.00027973202552784106, + "loss": 7.2233, + "step": 12429 + }, + { + "epoch": 1.1598395073248111, + "grad_norm": 1.322413873471309, + "learning_rate": 0.0002797282285066697, + "loss": 7.2987, + "step": 12430 + }, + { + "epoch": 1.1599328170196883, + "grad_norm": 0.8315201770333402, + "learning_rate": 0.00027972443115563796, + "loss": 7.4137, + "step": 12431 + }, + { + "epoch": 1.1600261267145657, + "grad_norm": 0.8678950982783064, + "learning_rate": 0.0002797206334747554, + "loss": 7.551, + "step": 12432 + }, + { + "epoch": 1.160119436409443, + "grad_norm": 0.6462751281868255, + "learning_rate": 0.0002797168354640317, + "loss": 7.0503, + "step": 12433 + }, + { + "epoch": 1.1602127461043201, + "grad_norm": 0.5780696333040951, + "learning_rate": 0.0002797130371234765, + "loss": 7.1655, + "step": 12434 + }, + { + "epoch": 1.1603060557991975, + "grad_norm": 0.7489772059048879, + "learning_rate": 0.0002797092384530995, + "loss": 7.5905, + "step": 12435 + }, + { + "epoch": 1.1603993654940747, + "grad_norm": 1.01626013697974, + "learning_rate": 0.00027970543945291025, + "loss": 7.1876, + "step": 12436 + }, + { + "epoch": 1.1604926751889522, + "grad_norm": 0.9285558671535673, + "learning_rate": 0.0002797016401229186, + "loss": 6.9926, + "step": 12437 + }, + { + "epoch": 1.1605859848838294, + "grad_norm": 0.7252363404836267, + "learning_rate": 0.00027969784046313396, + "loss": 7.4157, + "step": 12438 + }, + { + "epoch": 1.1606792945787068, + "grad_norm": 1.180743683949947, + "learning_rate": 0.00027969404047356627, + "loss": 7.6841, + "step": 12439 + }, + { + "epoch": 1.160772604273584, + "grad_norm": 0.718763976684914, + "learning_rate": 0.000279690240154225, + "loss": 7.4042, + "step": 12440 + }, + { + "epoch": 1.1608659139684614, + "grad_norm": 0.4032361837875767, + "learning_rate": 0.0002796864395051198, + "loss": 7.2308, + "step": 12441 + }, + { + "epoch": 1.1609592236633386, + "grad_norm": 0.5681710981648891, + "learning_rate": 0.00027968263852626047, + "loss": 7.3995, + "step": 12442 + }, + { + "epoch": 1.161052533358216, + "grad_norm": 0.7500431525144825, + "learning_rate": 0.0002796788372176566, + "loss": 7.4917, + "step": 12443 + }, + { + "epoch": 1.1611458430530932, + "grad_norm": 0.639854961742372, + "learning_rate": 0.0002796750355793179, + "loss": 7.377, + "step": 12444 + }, + { + "epoch": 1.1612391527479704, + "grad_norm": 0.7656849921162918, + "learning_rate": 0.00027967123361125393, + "loss": 7.1785, + "step": 12445 + }, + { + "epoch": 1.1613324624428478, + "grad_norm": 0.8717874547052991, + "learning_rate": 0.00027966743131347446, + "loss": 7.1068, + "step": 12446 + }, + { + "epoch": 1.161425772137725, + "grad_norm": 1.1130873805387576, + "learning_rate": 0.0002796636286859891, + "loss": 7.4124, + "step": 12447 + }, + { + "epoch": 1.1615190818326024, + "grad_norm": 0.6912560798914335, + "learning_rate": 0.0002796598257288075, + "loss": 7.2081, + "step": 12448 + }, + { + "epoch": 1.1616123915274796, + "grad_norm": 0.7532754020382588, + "learning_rate": 0.00027965602244193945, + "loss": 7.3208, + "step": 12449 + }, + { + "epoch": 1.161705701222357, + "grad_norm": 1.037725531916096, + "learning_rate": 0.00027965221882539457, + "loss": 7.6833, + "step": 12450 + }, + { + "epoch": 1.1617990109172343, + "grad_norm": 0.5612489057813459, + "learning_rate": 0.0002796484148791824, + "loss": 7.1515, + "step": 12451 + }, + { + "epoch": 1.1618923206121117, + "grad_norm": 0.9380456765640734, + "learning_rate": 0.00027964461060331277, + "loss": 7.15, + "step": 12452 + }, + { + "epoch": 1.1619856303069889, + "grad_norm": 1.0458369411280397, + "learning_rate": 0.0002796408059977953, + "loss": 7.5241, + "step": 12453 + }, + { + "epoch": 1.1620789400018663, + "grad_norm": 0.8775138452538986, + "learning_rate": 0.0002796370010626396, + "loss": 7.0192, + "step": 12454 + }, + { + "epoch": 1.1621722496967435, + "grad_norm": 0.784021611900741, + "learning_rate": 0.0002796331957978555, + "loss": 7.4687, + "step": 12455 + }, + { + "epoch": 1.1622655593916207, + "grad_norm": 0.7862709584638057, + "learning_rate": 0.00027962939020345255, + "loss": 7.0054, + "step": 12456 + }, + { + "epoch": 1.162358869086498, + "grad_norm": 0.9259342579575885, + "learning_rate": 0.0002796255842794405, + "loss": 7.2094, + "step": 12457 + }, + { + "epoch": 1.1624521787813753, + "grad_norm": 1.2920109175631143, + "learning_rate": 0.0002796217780258289, + "loss": 7.5192, + "step": 12458 + }, + { + "epoch": 1.1625454884762527, + "grad_norm": 0.8949922955378508, + "learning_rate": 0.0002796179714426276, + "loss": 7.5087, + "step": 12459 + }, + { + "epoch": 1.16263879817113, + "grad_norm": 0.7200497763321224, + "learning_rate": 0.00027961416452984615, + "loss": 7.3414, + "step": 12460 + }, + { + "epoch": 1.1627321078660073, + "grad_norm": 1.6765740852859086, + "learning_rate": 0.00027961035728749426, + "loss": 7.1611, + "step": 12461 + }, + { + "epoch": 1.1628254175608845, + "grad_norm": 1.4051677526914215, + "learning_rate": 0.00027960654971558165, + "loss": 7.1746, + "step": 12462 + }, + { + "epoch": 1.162918727255762, + "grad_norm": 0.46516600025789495, + "learning_rate": 0.000279602741814118, + "loss": 7.5026, + "step": 12463 + }, + { + "epoch": 1.1630120369506391, + "grad_norm": 0.47920701959022255, + "learning_rate": 0.0002795989335831129, + "loss": 7.3676, + "step": 12464 + }, + { + "epoch": 1.1631053466455166, + "grad_norm": 0.7520189924428384, + "learning_rate": 0.00027959512502257616, + "loss": 7.0433, + "step": 12465 + }, + { + "epoch": 1.1631986563403938, + "grad_norm": 0.8448691846974027, + "learning_rate": 0.0002795913161325174, + "loss": 7.4796, + "step": 12466 + }, + { + "epoch": 1.163291966035271, + "grad_norm": 1.6980983487179095, + "learning_rate": 0.0002795875069129463, + "loss": 7.765, + "step": 12467 + }, + { + "epoch": 1.1633852757301484, + "grad_norm": 0.6753686419948658, + "learning_rate": 0.0002795836973638725, + "loss": 7.416, + "step": 12468 + }, + { + "epoch": 1.1634785854250256, + "grad_norm": 0.7171273175233892, + "learning_rate": 0.0002795798874853058, + "loss": 7.5699, + "step": 12469 + }, + { + "epoch": 1.163571895119903, + "grad_norm": 0.7127917648726174, + "learning_rate": 0.00027957607727725585, + "loss": 7.3335, + "step": 12470 + }, + { + "epoch": 1.1636652048147802, + "grad_norm": 2.1083578697658347, + "learning_rate": 0.00027957226673973226, + "loss": 7.1585, + "step": 12471 + }, + { + "epoch": 1.1637585145096576, + "grad_norm": 0.7632486231140228, + "learning_rate": 0.00027956845587274483, + "loss": 7.4333, + "step": 12472 + }, + { + "epoch": 1.1638518242045348, + "grad_norm": 0.6222072262591444, + "learning_rate": 0.0002795646446763032, + "loss": 7.1997, + "step": 12473 + }, + { + "epoch": 1.1639451338994122, + "grad_norm": 0.9198331273665443, + "learning_rate": 0.000279560833150417, + "loss": 7.2972, + "step": 12474 + }, + { + "epoch": 1.1640384435942894, + "grad_norm": 1.264551700995873, + "learning_rate": 0.00027955702129509605, + "loss": 7.4607, + "step": 12475 + }, + { + "epoch": 1.1641317532891668, + "grad_norm": 0.958305124968841, + "learning_rate": 0.00027955320911034995, + "loss": 7.3038, + "step": 12476 + }, + { + "epoch": 1.164225062984044, + "grad_norm": 1.010769872107273, + "learning_rate": 0.0002795493965961884, + "loss": 7.5711, + "step": 12477 + }, + { + "epoch": 1.1643183726789212, + "grad_norm": 0.5011704635457489, + "learning_rate": 0.0002795455837526211, + "loss": 7.4723, + "step": 12478 + }, + { + "epoch": 1.1644116823737987, + "grad_norm": 0.6698416096993207, + "learning_rate": 0.00027954177057965777, + "loss": 7.3636, + "step": 12479 + }, + { + "epoch": 1.1645049920686759, + "grad_norm": 1.127897683143253, + "learning_rate": 0.00027953795707730814, + "loss": 7.421, + "step": 12480 + }, + { + "epoch": 1.1645983017635533, + "grad_norm": 0.9892449362963479, + "learning_rate": 0.0002795341432455818, + "loss": 7.6701, + "step": 12481 + }, + { + "epoch": 1.1646916114584305, + "grad_norm": 0.7587306794169568, + "learning_rate": 0.0002795303290844885, + "loss": 7.6491, + "step": 12482 + }, + { + "epoch": 1.164784921153308, + "grad_norm": 0.5718558853413411, + "learning_rate": 0.00027952651459403795, + "loss": 7.6332, + "step": 12483 + }, + { + "epoch": 1.164878230848185, + "grad_norm": 0.8874305457232937, + "learning_rate": 0.00027952269977423984, + "loss": 7.4049, + "step": 12484 + }, + { + "epoch": 1.1649715405430625, + "grad_norm": 0.5102541853436819, + "learning_rate": 0.0002795188846251039, + "loss": 7.5674, + "step": 12485 + }, + { + "epoch": 1.1650648502379397, + "grad_norm": 1.0023626478402912, + "learning_rate": 0.0002795150691466398, + "loss": 7.5282, + "step": 12486 + }, + { + "epoch": 1.1651581599328171, + "grad_norm": 0.7450182280660504, + "learning_rate": 0.0002795112533388572, + "loss": 7.4511, + "step": 12487 + }, + { + "epoch": 1.1652514696276943, + "grad_norm": 0.7578492295737732, + "learning_rate": 0.00027950743720176586, + "loss": 7.0885, + "step": 12488 + }, + { + "epoch": 1.1653447793225715, + "grad_norm": 0.5699711498049909, + "learning_rate": 0.0002795036207353755, + "loss": 7.1344, + "step": 12489 + }, + { + "epoch": 1.165438089017449, + "grad_norm": 0.7507421919102673, + "learning_rate": 0.0002794998039396958, + "loss": 7.1838, + "step": 12490 + }, + { + "epoch": 1.1655313987123261, + "grad_norm": 0.5011385548193401, + "learning_rate": 0.0002794959868147364, + "loss": 7.3183, + "step": 12491 + }, + { + "epoch": 1.1656247084072036, + "grad_norm": 0.7382324966727913, + "learning_rate": 0.0002794921693605071, + "loss": 7.6043, + "step": 12492 + }, + { + "epoch": 1.1657180181020808, + "grad_norm": 0.41250535515498793, + "learning_rate": 0.0002794883515770176, + "loss": 7.1891, + "step": 12493 + }, + { + "epoch": 1.1658113277969582, + "grad_norm": 0.9879445402439417, + "learning_rate": 0.00027948453346427756, + "loss": 7.3296, + "step": 12494 + }, + { + "epoch": 1.1659046374918354, + "grad_norm": 0.5670261524220072, + "learning_rate": 0.00027948071502229666, + "loss": 7.2516, + "step": 12495 + }, + { + "epoch": 1.1659979471867128, + "grad_norm": 0.5535261825014504, + "learning_rate": 0.0002794768962510847, + "loss": 7.5734, + "step": 12496 + }, + { + "epoch": 1.16609125688159, + "grad_norm": 0.7777326995162495, + "learning_rate": 0.0002794730771506514, + "loss": 7.5622, + "step": 12497 + }, + { + "epoch": 1.1661845665764674, + "grad_norm": 1.542133386469771, + "learning_rate": 0.00027946925772100634, + "loss": 7.2899, + "step": 12498 + }, + { + "epoch": 1.1662778762713446, + "grad_norm": 1.0348172771929447, + "learning_rate": 0.00027946543796215933, + "loss": 7.5424, + "step": 12499 + }, + { + "epoch": 1.1663711859662218, + "grad_norm": 0.5691485031421685, + "learning_rate": 0.0002794616178741201, + "loss": 7.5941, + "step": 12500 + }, + { + "epoch": 1.1664644956610992, + "grad_norm": 1.3930613785223325, + "learning_rate": 0.00027945779745689825, + "loss": 6.9546, + "step": 12501 + }, + { + "epoch": 1.1665578053559764, + "grad_norm": 1.5760071313091166, + "learning_rate": 0.0002794539767105036, + "loss": 7.5962, + "step": 12502 + }, + { + "epoch": 1.1666511150508538, + "grad_norm": 0.7732016566252853, + "learning_rate": 0.00027945015563494584, + "loss": 7.1818, + "step": 12503 + }, + { + "epoch": 1.166744424745731, + "grad_norm": 0.6064374642666096, + "learning_rate": 0.0002794463342302347, + "loss": 7.2301, + "step": 12504 + }, + { + "epoch": 1.1668377344406085, + "grad_norm": 0.6657380605990905, + "learning_rate": 0.00027944251249637983, + "loss": 7.6383, + "step": 12505 + }, + { + "epoch": 1.1669310441354857, + "grad_norm": 0.9345659449944723, + "learning_rate": 0.00027943869043339103, + "loss": 7.2815, + "step": 12506 + }, + { + "epoch": 1.167024353830363, + "grad_norm": 1.3634621851812538, + "learning_rate": 0.00027943486804127797, + "loss": 7.3826, + "step": 12507 + }, + { + "epoch": 1.1671176635252403, + "grad_norm": 1.0524962741032846, + "learning_rate": 0.0002794310453200504, + "loss": 7.3164, + "step": 12508 + }, + { + "epoch": 1.1672109732201177, + "grad_norm": 0.4591879756412026, + "learning_rate": 0.00027942722226971805, + "loss": 7.589, + "step": 12509 + }, + { + "epoch": 1.1673042829149949, + "grad_norm": 0.848269058519062, + "learning_rate": 0.0002794233988902906, + "loss": 7.3208, + "step": 12510 + }, + { + "epoch": 1.167397592609872, + "grad_norm": 0.7607731660538746, + "learning_rate": 0.0002794195751817777, + "loss": 7.2712, + "step": 12511 + }, + { + "epoch": 1.1674909023047495, + "grad_norm": 1.307299674538746, + "learning_rate": 0.0002794157511441892, + "loss": 7.7142, + "step": 12512 + }, + { + "epoch": 1.1675842119996267, + "grad_norm": 1.2715353954758302, + "learning_rate": 0.00027941192677753484, + "loss": 7.4414, + "step": 12513 + }, + { + "epoch": 1.1676775216945041, + "grad_norm": 0.8814161221339437, + "learning_rate": 0.0002794081020818242, + "loss": 7.6587, + "step": 12514 + }, + { + "epoch": 1.1677708313893813, + "grad_norm": 0.5202580744045624, + "learning_rate": 0.00027940427705706715, + "loss": 7.5806, + "step": 12515 + }, + { + "epoch": 1.1678641410842587, + "grad_norm": 0.654580837143717, + "learning_rate": 0.0002794004517032734, + "loss": 7.7708, + "step": 12516 + }, + { + "epoch": 1.167957450779136, + "grad_norm": 1.6533976132257036, + "learning_rate": 0.0002793966260204525, + "loss": 7.4295, + "step": 12517 + }, + { + "epoch": 1.1680507604740131, + "grad_norm": 1.2267435872486594, + "learning_rate": 0.0002793928000086144, + "loss": 7.3884, + "step": 12518 + }, + { + "epoch": 1.1681440701688905, + "grad_norm": 1.123869842475801, + "learning_rate": 0.0002793889736677687, + "loss": 7.5297, + "step": 12519 + }, + { + "epoch": 1.168237379863768, + "grad_norm": 0.5930870294911438, + "learning_rate": 0.0002793851469979252, + "loss": 7.4272, + "step": 12520 + }, + { + "epoch": 1.1683306895586452, + "grad_norm": 0.5337767834118359, + "learning_rate": 0.0002793813199990936, + "loss": 7.3334, + "step": 12521 + }, + { + "epoch": 1.1684239992535224, + "grad_norm": 0.7359023938732265, + "learning_rate": 0.0002793774926712836, + "loss": 7.0964, + "step": 12522 + }, + { + "epoch": 1.1685173089483998, + "grad_norm": 0.9346865282116922, + "learning_rate": 0.000279373665014505, + "loss": 7.2881, + "step": 12523 + }, + { + "epoch": 1.168610618643277, + "grad_norm": 1.0532119189227578, + "learning_rate": 0.0002793698370287675, + "loss": 7.1409, + "step": 12524 + }, + { + "epoch": 1.1687039283381544, + "grad_norm": 2.026933626143491, + "learning_rate": 0.00027936600871408075, + "loss": 7.4501, + "step": 12525 + }, + { + "epoch": 1.1687972380330316, + "grad_norm": 0.9546571901887196, + "learning_rate": 0.0002793621800704546, + "loss": 7.1591, + "step": 12526 + }, + { + "epoch": 1.168890547727909, + "grad_norm": 0.6644051215896866, + "learning_rate": 0.00027935835109789875, + "loss": 7.2926, + "step": 12527 + }, + { + "epoch": 1.1689838574227862, + "grad_norm": 0.8993090799753565, + "learning_rate": 0.00027935452179642296, + "loss": 7.1845, + "step": 12528 + }, + { + "epoch": 1.1690771671176634, + "grad_norm": 0.9127336086131865, + "learning_rate": 0.0002793506921660369, + "loss": 7.2921, + "step": 12529 + }, + { + "epoch": 1.1691704768125408, + "grad_norm": 0.9515173661294999, + "learning_rate": 0.00027934686220675036, + "loss": 7.3577, + "step": 12530 + }, + { + "epoch": 1.169263786507418, + "grad_norm": 1.4516322855718966, + "learning_rate": 0.0002793430319185731, + "loss": 7.2595, + "step": 12531 + }, + { + "epoch": 1.1693570962022954, + "grad_norm": 0.7684245502582472, + "learning_rate": 0.0002793392013015147, + "loss": 7.3072, + "step": 12532 + }, + { + "epoch": 1.1694504058971726, + "grad_norm": 0.5409091606034044, + "learning_rate": 0.00027933537035558514, + "loss": 7.2456, + "step": 12533 + }, + { + "epoch": 1.16954371559205, + "grad_norm": 0.6588814814333827, + "learning_rate": 0.000279331539080794, + "loss": 7.133, + "step": 12534 + }, + { + "epoch": 1.1696370252869273, + "grad_norm": 1.6022404751870736, + "learning_rate": 0.0002793277074771511, + "loss": 7.3074, + "step": 12535 + }, + { + "epoch": 1.1697303349818047, + "grad_norm": 0.6914115565689957, + "learning_rate": 0.0002793238755446661, + "loss": 7.0645, + "step": 12536 + }, + { + "epoch": 1.1698236446766819, + "grad_norm": 0.9341460667469746, + "learning_rate": 0.0002793200432833488, + "loss": 7.6718, + "step": 12537 + }, + { + "epoch": 1.1699169543715593, + "grad_norm": 0.6574597097474957, + "learning_rate": 0.000279316210693209, + "loss": 7.4764, + "step": 12538 + }, + { + "epoch": 1.1700102640664365, + "grad_norm": 0.935982386381892, + "learning_rate": 0.00027931237777425636, + "loss": 7.3672, + "step": 12539 + }, + { + "epoch": 1.1701035737613137, + "grad_norm": 1.3349498658548262, + "learning_rate": 0.00027930854452650063, + "loss": 7.2385, + "step": 12540 + }, + { + "epoch": 1.170196883456191, + "grad_norm": 1.2847292677666382, + "learning_rate": 0.0002793047109499516, + "loss": 7.2303, + "step": 12541 + }, + { + "epoch": 1.1702901931510683, + "grad_norm": 1.1950762673901516, + "learning_rate": 0.00027930087704461896, + "loss": 7.4608, + "step": 12542 + }, + { + "epoch": 1.1703835028459457, + "grad_norm": 0.5511190247845744, + "learning_rate": 0.0002792970428105125, + "loss": 7.384, + "step": 12543 + }, + { + "epoch": 1.170476812540823, + "grad_norm": 0.7700084669546944, + "learning_rate": 0.00027929320824764196, + "loss": 7.3972, + "step": 12544 + }, + { + "epoch": 1.1705701222357003, + "grad_norm": 1.013464849454938, + "learning_rate": 0.0002792893733560171, + "loss": 7.4537, + "step": 12545 + }, + { + "epoch": 1.1706634319305775, + "grad_norm": 0.6380175595409869, + "learning_rate": 0.0002792855381356477, + "loss": 7.1958, + "step": 12546 + }, + { + "epoch": 1.170756741625455, + "grad_norm": 0.7313789334369636, + "learning_rate": 0.0002792817025865434, + "loss": 7.5535, + "step": 12547 + }, + { + "epoch": 1.1708500513203322, + "grad_norm": 0.691521624379683, + "learning_rate": 0.00027927786670871405, + "loss": 7.21, + "step": 12548 + }, + { + "epoch": 1.1709433610152096, + "grad_norm": 0.4681943465323291, + "learning_rate": 0.0002792740305021694, + "loss": 7.4997, + "step": 12549 + }, + { + "epoch": 1.1710366707100868, + "grad_norm": 1.2232214183324501, + "learning_rate": 0.0002792701939669191, + "loss": 7.2378, + "step": 12550 + }, + { + "epoch": 1.171129980404964, + "grad_norm": 1.1660405085684107, + "learning_rate": 0.0002792663571029731, + "loss": 7.6089, + "step": 12551 + }, + { + "epoch": 1.1712232900998414, + "grad_norm": 0.6027704221014889, + "learning_rate": 0.00027926251991034097, + "loss": 7.2656, + "step": 12552 + }, + { + "epoch": 1.1713165997947186, + "grad_norm": 0.5576383353818539, + "learning_rate": 0.0002792586823890326, + "loss": 7.2781, + "step": 12553 + }, + { + "epoch": 1.171409909489596, + "grad_norm": 0.5435293903049381, + "learning_rate": 0.0002792548445390577, + "loss": 7.1851, + "step": 12554 + }, + { + "epoch": 1.1715032191844732, + "grad_norm": 0.6003128495446187, + "learning_rate": 0.00027925100636042597, + "loss": 7.4677, + "step": 12555 + }, + { + "epoch": 1.1715965288793506, + "grad_norm": 0.39801563287707137, + "learning_rate": 0.0002792471678531472, + "loss": 7.2219, + "step": 12556 + }, + { + "epoch": 1.1716898385742278, + "grad_norm": 0.49019181600588685, + "learning_rate": 0.00027924332901723125, + "loss": 7.4316, + "step": 12557 + }, + { + "epoch": 1.1717831482691052, + "grad_norm": 0.6051507940972133, + "learning_rate": 0.00027923948985268774, + "loss": 7.5312, + "step": 12558 + }, + { + "epoch": 1.1718764579639824, + "grad_norm": 0.7185019014855287, + "learning_rate": 0.0002792356503595265, + "loss": 7.5198, + "step": 12559 + }, + { + "epoch": 1.1719697676588599, + "grad_norm": 0.9120358236975743, + "learning_rate": 0.00027923181053775726, + "loss": 7.5679, + "step": 12560 + }, + { + "epoch": 1.172063077353737, + "grad_norm": 0.9195702373825693, + "learning_rate": 0.00027922797038738986, + "loss": 7.1962, + "step": 12561 + }, + { + "epoch": 1.1721563870486142, + "grad_norm": 0.9680969390727402, + "learning_rate": 0.00027922412990843395, + "loss": 7.3838, + "step": 12562 + }, + { + "epoch": 1.1722496967434917, + "grad_norm": 0.5706626464045172, + "learning_rate": 0.0002792202891008994, + "loss": 7.552, + "step": 12563 + }, + { + "epoch": 1.1723430064383689, + "grad_norm": 0.5262475999824943, + "learning_rate": 0.00027921644796479595, + "loss": 7.2591, + "step": 12564 + }, + { + "epoch": 1.1724363161332463, + "grad_norm": 0.8543572348518382, + "learning_rate": 0.00027921260650013336, + "loss": 7.0189, + "step": 12565 + }, + { + "epoch": 1.1725296258281235, + "grad_norm": 0.6530417455017791, + "learning_rate": 0.0002792087647069213, + "loss": 7.3597, + "step": 12566 + }, + { + "epoch": 1.172622935523001, + "grad_norm": 0.7086135726246754, + "learning_rate": 0.0002792049225851697, + "loss": 7.3911, + "step": 12567 + }, + { + "epoch": 1.172716245217878, + "grad_norm": 1.282200550803369, + "learning_rate": 0.0002792010801348882, + "loss": 7.039, + "step": 12568 + }, + { + "epoch": 1.1728095549127555, + "grad_norm": 0.6328546252329033, + "learning_rate": 0.0002791972373560867, + "loss": 7.3254, + "step": 12569 + }, + { + "epoch": 1.1729028646076327, + "grad_norm": 0.775028905403637, + "learning_rate": 0.0002791933942487748, + "loss": 7.2879, + "step": 12570 + }, + { + "epoch": 1.1729961743025101, + "grad_norm": 0.7909880179821663, + "learning_rate": 0.00027918955081296246, + "loss": 7.3225, + "step": 12571 + }, + { + "epoch": 1.1730894839973873, + "grad_norm": 0.5470940290875457, + "learning_rate": 0.0002791857070486593, + "loss": 7.4034, + "step": 12572 + }, + { + "epoch": 1.1731827936922645, + "grad_norm": 0.48432367547283944, + "learning_rate": 0.00027918186295587514, + "loss": 7.7242, + "step": 12573 + }, + { + "epoch": 1.173276103387142, + "grad_norm": 1.4208061325188057, + "learning_rate": 0.00027917801853461985, + "loss": 7.24, + "step": 12574 + }, + { + "epoch": 1.1733694130820191, + "grad_norm": 0.9002390669249144, + "learning_rate": 0.0002791741737849031, + "loss": 7.5621, + "step": 12575 + }, + { + "epoch": 1.1734627227768966, + "grad_norm": 0.7819516332761266, + "learning_rate": 0.00027917032870673465, + "loss": 7.4155, + "step": 12576 + }, + { + "epoch": 1.1735560324717738, + "grad_norm": 1.0585185503542562, + "learning_rate": 0.00027916648330012433, + "loss": 7.0689, + "step": 12577 + }, + { + "epoch": 1.1736493421666512, + "grad_norm": 0.8025643002415778, + "learning_rate": 0.0002791626375650819, + "loss": 7.3294, + "step": 12578 + }, + { + "epoch": 1.1737426518615284, + "grad_norm": 1.0764324369373075, + "learning_rate": 0.0002791587915016172, + "loss": 7.4121, + "step": 12579 + }, + { + "epoch": 1.1738359615564058, + "grad_norm": 1.2782945549211135, + "learning_rate": 0.0002791549451097399, + "loss": 7.3182, + "step": 12580 + }, + { + "epoch": 1.173929271251283, + "grad_norm": 1.3135868074044301, + "learning_rate": 0.0002791510983894598, + "loss": 7.6121, + "step": 12581 + }, + { + "epoch": 1.1740225809461604, + "grad_norm": 0.6832811680608933, + "learning_rate": 0.00027914725134078677, + "loss": 7.0118, + "step": 12582 + }, + { + "epoch": 1.1741158906410376, + "grad_norm": 0.6930652155242972, + "learning_rate": 0.00027914340396373056, + "loss": 7.2584, + "step": 12583 + }, + { + "epoch": 1.1742092003359148, + "grad_norm": 0.9280403540384847, + "learning_rate": 0.0002791395562583009, + "loss": 7.2247, + "step": 12584 + }, + { + "epoch": 1.1743025100307922, + "grad_norm": 0.5515271223941325, + "learning_rate": 0.00027913570822450754, + "loss": 7.6217, + "step": 12585 + }, + { + "epoch": 1.1743958197256694, + "grad_norm": 0.5535150735248034, + "learning_rate": 0.0002791318598623604, + "loss": 7.4764, + "step": 12586 + }, + { + "epoch": 1.1744891294205468, + "grad_norm": 0.5022220965712079, + "learning_rate": 0.00027912801117186916, + "loss": 7.4695, + "step": 12587 + }, + { + "epoch": 1.174582439115424, + "grad_norm": 0.47577245084605346, + "learning_rate": 0.00027912416215304366, + "loss": 7.1621, + "step": 12588 + }, + { + "epoch": 1.1746757488103015, + "grad_norm": 0.8917782147857491, + "learning_rate": 0.00027912031280589365, + "loss": 7.2426, + "step": 12589 + }, + { + "epoch": 1.1747690585051787, + "grad_norm": 1.0550608482803054, + "learning_rate": 0.0002791164631304289, + "loss": 7.5994, + "step": 12590 + }, + { + "epoch": 1.174862368200056, + "grad_norm": 0.6858573764538456, + "learning_rate": 0.00027911261312665934, + "loss": 7.2651, + "step": 12591 + }, + { + "epoch": 1.1749556778949333, + "grad_norm": 0.5267050022784037, + "learning_rate": 0.00027910876279459455, + "loss": 7.3549, + "step": 12592 + }, + { + "epoch": 1.1750489875898107, + "grad_norm": 0.8417506692926968, + "learning_rate": 0.00027910491213424445, + "loss": 7.2021, + "step": 12593 + }, + { + "epoch": 1.1751422972846879, + "grad_norm": 0.6236056875806307, + "learning_rate": 0.0002791010611456188, + "loss": 7.3487, + "step": 12594 + }, + { + "epoch": 1.175235606979565, + "grad_norm": 0.578764337186833, + "learning_rate": 0.00027909720982872743, + "loss": 7.3347, + "step": 12595 + }, + { + "epoch": 1.1753289166744425, + "grad_norm": 0.4037621037799104, + "learning_rate": 0.00027909335818358, + "loss": 7.4551, + "step": 12596 + }, + { + "epoch": 1.1754222263693197, + "grad_norm": 0.40275961056166293, + "learning_rate": 0.0002790895062101865, + "loss": 7.3879, + "step": 12597 + }, + { + "epoch": 1.1755155360641971, + "grad_norm": 0.5324751620865591, + "learning_rate": 0.00027908565390855663, + "loss": 7.224, + "step": 12598 + }, + { + "epoch": 1.1756088457590743, + "grad_norm": 0.6654110790999036, + "learning_rate": 0.0002790818012787001, + "loss": 7.4441, + "step": 12599 + }, + { + "epoch": 1.1757021554539517, + "grad_norm": 0.49513654998986917, + "learning_rate": 0.0002790779483206268, + "loss": 7.2822, + "step": 12600 + }, + { + "epoch": 1.175795465148829, + "grad_norm": 0.7769704140228424, + "learning_rate": 0.0002790740950343466, + "loss": 7.1627, + "step": 12601 + }, + { + "epoch": 1.1758887748437064, + "grad_norm": 0.48159522580903086, + "learning_rate": 0.0002790702414198691, + "loss": 7.2728, + "step": 12602 + }, + { + "epoch": 1.1759820845385835, + "grad_norm": 0.7096223270076144, + "learning_rate": 0.00027906638747720425, + "loss": 7.1354, + "step": 12603 + }, + { + "epoch": 1.176075394233461, + "grad_norm": 0.8973310479113098, + "learning_rate": 0.0002790625332063619, + "loss": 7.3503, + "step": 12604 + }, + { + "epoch": 1.1761687039283382, + "grad_norm": 0.5657473946107733, + "learning_rate": 0.0002790586786073516, + "loss": 6.9973, + "step": 12605 + }, + { + "epoch": 1.1762620136232154, + "grad_norm": 0.8741729029655282, + "learning_rate": 0.00027905482368018346, + "loss": 7.0824, + "step": 12606 + }, + { + "epoch": 1.1763553233180928, + "grad_norm": 1.5452510750651771, + "learning_rate": 0.000279050968424867, + "loss": 7.5054, + "step": 12607 + }, + { + "epoch": 1.17644863301297, + "grad_norm": 0.9947494200674578, + "learning_rate": 0.0002790471128414122, + "loss": 7.4599, + "step": 12608 + }, + { + "epoch": 1.1765419427078474, + "grad_norm": 0.6915347670645406, + "learning_rate": 0.00027904325692982884, + "loss": 7.284, + "step": 12609 + }, + { + "epoch": 1.1766352524027246, + "grad_norm": 2.5371025271917382, + "learning_rate": 0.0002790394006901267, + "loss": 7.345, + "step": 12610 + }, + { + "epoch": 1.176728562097602, + "grad_norm": 1.6829186792105226, + "learning_rate": 0.0002790355441223156, + "loss": 7.1918, + "step": 12611 + }, + { + "epoch": 1.1768218717924792, + "grad_norm": 0.9694607265676088, + "learning_rate": 0.00027903168722640535, + "loss": 7.3129, + "step": 12612 + }, + { + "epoch": 1.1769151814873566, + "grad_norm": 0.7579331713027422, + "learning_rate": 0.00027902783000240567, + "loss": 7.5007, + "step": 12613 + }, + { + "epoch": 1.1770084911822338, + "grad_norm": 0.5081418128768154, + "learning_rate": 0.0002790239724503265, + "loss": 7.2641, + "step": 12614 + }, + { + "epoch": 1.1771018008771112, + "grad_norm": 1.1121130675961821, + "learning_rate": 0.00027902011457017756, + "loss": 7.5625, + "step": 12615 + }, + { + "epoch": 1.1771951105719884, + "grad_norm": 2.101938550397325, + "learning_rate": 0.0002790162563619687, + "loss": 7.1633, + "step": 12616 + }, + { + "epoch": 1.1772884202668656, + "grad_norm": 0.9511942844779573, + "learning_rate": 0.00027901239782570974, + "loss": 6.9389, + "step": 12617 + }, + { + "epoch": 1.177381729961743, + "grad_norm": 0.830740459710017, + "learning_rate": 0.00027900853896141046, + "loss": 7.124, + "step": 12618 + }, + { + "epoch": 1.1774750396566203, + "grad_norm": 1.7939837497431932, + "learning_rate": 0.0002790046797690806, + "loss": 7.4577, + "step": 12619 + }, + { + "epoch": 1.1775683493514977, + "grad_norm": 1.7232456181964895, + "learning_rate": 0.00027900082024873016, + "loss": 7.2109, + "step": 12620 + }, + { + "epoch": 1.1776616590463749, + "grad_norm": 2.1007778257711878, + "learning_rate": 0.00027899696040036877, + "loss": 7.4166, + "step": 12621 + }, + { + "epoch": 1.1777549687412523, + "grad_norm": 0.7564101405761336, + "learning_rate": 0.0002789931002240064, + "loss": 7.2047, + "step": 12622 + }, + { + "epoch": 1.1778482784361295, + "grad_norm": 0.9650346381843559, + "learning_rate": 0.0002789892397196527, + "loss": 7.3082, + "step": 12623 + }, + { + "epoch": 1.1779415881310067, + "grad_norm": 0.8949284526842689, + "learning_rate": 0.0002789853788873176, + "loss": 7.6864, + "step": 12624 + }, + { + "epoch": 1.178034897825884, + "grad_norm": 0.7693968059871471, + "learning_rate": 0.0002789815177270109, + "loss": 7.5087, + "step": 12625 + }, + { + "epoch": 1.1781282075207615, + "grad_norm": 0.572454848565171, + "learning_rate": 0.0002789776562387424, + "loss": 7.6227, + "step": 12626 + }, + { + "epoch": 1.1782215172156387, + "grad_norm": 1.5253974240945976, + "learning_rate": 0.000278973794422522, + "loss": 7.3207, + "step": 12627 + }, + { + "epoch": 1.178314826910516, + "grad_norm": 0.4401439839468494, + "learning_rate": 0.00027896993227835933, + "loss": 7.1816, + "step": 12628 + }, + { + "epoch": 1.1784081366053933, + "grad_norm": 10.494234802956319, + "learning_rate": 0.0002789660698062644, + "loss": 7.2562, + "step": 12629 + }, + { + "epoch": 1.1785014463002705, + "grad_norm": 0.7753578074740077, + "learning_rate": 0.0002789622070062469, + "loss": 7.5642, + "step": 12630 + }, + { + "epoch": 1.178594755995148, + "grad_norm": 0.9554670804548453, + "learning_rate": 0.00027895834387831677, + "loss": 7.2407, + "step": 12631 + }, + { + "epoch": 1.1786880656900252, + "grad_norm": 0.6081131089334726, + "learning_rate": 0.00027895448042248374, + "loss": 7.515, + "step": 12632 + }, + { + "epoch": 1.1787813753849026, + "grad_norm": 0.698459456314701, + "learning_rate": 0.00027895061663875767, + "loss": 7.5426, + "step": 12633 + }, + { + "epoch": 1.1788746850797798, + "grad_norm": 0.8478584535290506, + "learning_rate": 0.0002789467525271484, + "loss": 7.296, + "step": 12634 + }, + { + "epoch": 1.178967994774657, + "grad_norm": 7.4770553007312515, + "learning_rate": 0.0002789428880876657, + "loss": 7.141, + "step": 12635 + }, + { + "epoch": 1.1790613044695344, + "grad_norm": 0.7367657839955877, + "learning_rate": 0.0002789390233203195, + "loss": 7.1634, + "step": 12636 + }, + { + "epoch": 1.1791546141644116, + "grad_norm": 1.5539583461685462, + "learning_rate": 0.0002789351582251195, + "loss": 7.3137, + "step": 12637 + }, + { + "epoch": 1.179247923859289, + "grad_norm": 1.7848939530364014, + "learning_rate": 0.00027893129280207556, + "loss": 7.3312, + "step": 12638 + }, + { + "epoch": 1.1793412335541662, + "grad_norm": 1.5137131164180877, + "learning_rate": 0.00027892742705119757, + "loss": 7.5283, + "step": 12639 + }, + { + "epoch": 1.1794345432490436, + "grad_norm": 10.611554045093529, + "learning_rate": 0.00027892356097249533, + "loss": 7.6129, + "step": 12640 + }, + { + "epoch": 1.1795278529439208, + "grad_norm": 1.1861199494829435, + "learning_rate": 0.00027891969456597866, + "loss": 7.3613, + "step": 12641 + }, + { + "epoch": 1.1796211626387982, + "grad_norm": 1.3960684176649416, + "learning_rate": 0.00027891582783165736, + "loss": 7.5173, + "step": 12642 + }, + { + "epoch": 1.1797144723336754, + "grad_norm": 27.938427495875395, + "learning_rate": 0.00027891196076954136, + "loss": 7.6814, + "step": 12643 + }, + { + "epoch": 1.1798077820285529, + "grad_norm": 1.8671916111212028, + "learning_rate": 0.00027890809337964036, + "loss": 7.4772, + "step": 12644 + }, + { + "epoch": 1.17990109172343, + "grad_norm": 1.385810559306563, + "learning_rate": 0.00027890422566196433, + "loss": 7.4513, + "step": 12645 + }, + { + "epoch": 1.1799944014183072, + "grad_norm": 2.5511651008000933, + "learning_rate": 0.00027890035761652304, + "loss": 7.3376, + "step": 12646 + }, + { + "epoch": 1.1800877111131847, + "grad_norm": 0.951601789589158, + "learning_rate": 0.00027889648924332624, + "loss": 7.8665, + "step": 12647 + }, + { + "epoch": 1.1801810208080619, + "grad_norm": 0.5798967067240398, + "learning_rate": 0.0002788926205423839, + "loss": 7.1946, + "step": 12648 + }, + { + "epoch": 1.1802743305029393, + "grad_norm": 133.4146410294161, + "learning_rate": 0.00027888875151370584, + "loss": 7.2764, + "step": 12649 + }, + { + "epoch": 1.1803676401978165, + "grad_norm": 0.7033483012000649, + "learning_rate": 0.0002788848821573018, + "loss": 7.2965, + "step": 12650 + }, + { + "epoch": 1.180460949892694, + "grad_norm": 0.5340036188174395, + "learning_rate": 0.00027888101247318175, + "loss": 7.3438, + "step": 12651 + }, + { + "epoch": 1.180554259587571, + "grad_norm": 1407.8169998512108, + "learning_rate": 0.00027887714246135545, + "loss": 7.4594, + "step": 12652 + }, + { + "epoch": 1.1806475692824485, + "grad_norm": 1.202099595531799, + "learning_rate": 0.00027887327212183273, + "loss": 7.2547, + "step": 12653 + }, + { + "epoch": 1.1807408789773257, + "grad_norm": 0.9502639131268128, + "learning_rate": 0.00027886940145462345, + "loss": 7.3182, + "step": 12654 + }, + { + "epoch": 1.1808341886722031, + "grad_norm": 0.6149680767718221, + "learning_rate": 0.0002788655304597375, + "loss": 7.4154, + "step": 12655 + }, + { + "epoch": 1.1809274983670803, + "grad_norm": 532.7442867876829, + "learning_rate": 0.0002788616591371846, + "loss": 7.4291, + "step": 12656 + }, + { + "epoch": 1.1810208080619575, + "grad_norm": 0.7538240925062141, + "learning_rate": 0.0002788577874869747, + "loss": 7.1892, + "step": 12657 + }, + { + "epoch": 1.181114117756835, + "grad_norm": 0.5993292875094174, + "learning_rate": 0.00027885391550911765, + "loss": 7.4604, + "step": 12658 + }, + { + "epoch": 1.1812074274517121, + "grad_norm": 0.7097053882846553, + "learning_rate": 0.00027885004320362327, + "loss": 7.0859, + "step": 12659 + }, + { + "epoch": 1.1813007371465896, + "grad_norm": 0.9287307565891127, + "learning_rate": 0.0002788461705705014, + "loss": 7.4007, + "step": 12660 + }, + { + "epoch": 1.1813940468414668, + "grad_norm": 0.8205864445742792, + "learning_rate": 0.0002788422976097618, + "loss": 7.4386, + "step": 12661 + }, + { + "epoch": 1.1814873565363442, + "grad_norm": 0.8835145131181115, + "learning_rate": 0.0002788384243214145, + "loss": 7.5571, + "step": 12662 + }, + { + "epoch": 1.1815806662312214, + "grad_norm": 0.6854134487007664, + "learning_rate": 0.0002788345507054692, + "loss": 7.244, + "step": 12663 + }, + { + "epoch": 1.1816739759260988, + "grad_norm": 0.585692011236828, + "learning_rate": 0.0002788306767619358, + "loss": 7.4044, + "step": 12664 + }, + { + "epoch": 1.181767285620976, + "grad_norm": 0.775426063775142, + "learning_rate": 0.00027882680249082415, + "loss": 7.383, + "step": 12665 + }, + { + "epoch": 1.1818605953158534, + "grad_norm": 0.689446938598983, + "learning_rate": 0.00027882292789214416, + "loss": 7.3477, + "step": 12666 + }, + { + "epoch": 1.1819539050107306, + "grad_norm": 0.441173271205877, + "learning_rate": 0.00027881905296590556, + "loss": 7.3713, + "step": 12667 + }, + { + "epoch": 1.1820472147056078, + "grad_norm": 1.3532830895476744, + "learning_rate": 0.0002788151777121183, + "loss": 7.2736, + "step": 12668 + }, + { + "epoch": 1.1821405244004852, + "grad_norm": 1.1752220081192921, + "learning_rate": 0.0002788113021307922, + "loss": 7.2741, + "step": 12669 + }, + { + "epoch": 1.1822338340953624, + "grad_norm": 294.6364437575276, + "learning_rate": 0.0002788074262219371, + "loss": 7.2417, + "step": 12670 + }, + { + "epoch": 1.1823271437902398, + "grad_norm": 0.8179128659190187, + "learning_rate": 0.0002788035499855629, + "loss": 7.2293, + "step": 12671 + }, + { + "epoch": 1.182420453485117, + "grad_norm": 0.6261082768290925, + "learning_rate": 0.0002787996734216794, + "loss": 7.2132, + "step": 12672 + }, + { + "epoch": 1.1825137631799945, + "grad_norm": 1.0718789091640475, + "learning_rate": 0.00027879579653029645, + "loss": 7.4133, + "step": 12673 + }, + { + "epoch": 1.1826070728748717, + "grad_norm": 1.5264872824005187, + "learning_rate": 0.000278791919311424, + "loss": 7.3699, + "step": 12674 + }, + { + "epoch": 1.182700382569749, + "grad_norm": 1.2705603003576251, + "learning_rate": 0.00027878804176507184, + "loss": 7.4511, + "step": 12675 + }, + { + "epoch": 1.1827936922646263, + "grad_norm": 0.6189822464200454, + "learning_rate": 0.0002787841638912498, + "loss": 7.2659, + "step": 12676 + }, + { + "epoch": 1.1828870019595037, + "grad_norm": 1.5072633557267896, + "learning_rate": 0.00027878028568996784, + "loss": 7.0688, + "step": 12677 + }, + { + "epoch": 1.182980311654381, + "grad_norm": 1.0507971641931506, + "learning_rate": 0.0002787764071612358, + "loss": 7.3832, + "step": 12678 + }, + { + "epoch": 1.183073621349258, + "grad_norm": 1.2840326963124753, + "learning_rate": 0.00027877252830506337, + "loss": 7.2045, + "step": 12679 + }, + { + "epoch": 1.1831669310441355, + "grad_norm": 1.5593171044851768, + "learning_rate": 0.00027876864912146056, + "loss": 7.038, + "step": 12680 + }, + { + "epoch": 1.1832602407390127, + "grad_norm": 1.296810069153796, + "learning_rate": 0.0002787647696104373, + "loss": 7.1013, + "step": 12681 + }, + { + "epoch": 1.1833535504338901, + "grad_norm": 0.7834701966787766, + "learning_rate": 0.00027876088977200333, + "loss": 7.1749, + "step": 12682 + }, + { + "epoch": 1.1834468601287673, + "grad_norm": 1.6478262141981892, + "learning_rate": 0.0002787570096061686, + "loss": 7.6084, + "step": 12683 + }, + { + "epoch": 1.1835401698236447, + "grad_norm": 1.0374706876991096, + "learning_rate": 0.0002787531291129429, + "loss": 7.093, + "step": 12684 + }, + { + "epoch": 1.183633479518522, + "grad_norm": 1.4150677003182768, + "learning_rate": 0.00027874924829233614, + "loss": 7.3916, + "step": 12685 + }, + { + "epoch": 1.1837267892133994, + "grad_norm": 0.8236653200857869, + "learning_rate": 0.00027874536714435816, + "loss": 7.0763, + "step": 12686 + }, + { + "epoch": 1.1838200989082766, + "grad_norm": 0.50028759854579, + "learning_rate": 0.0002787414856690189, + "loss": 7.1568, + "step": 12687 + }, + { + "epoch": 1.183913408603154, + "grad_norm": 0.8231774616409929, + "learning_rate": 0.00027873760386632813, + "loss": 7.6472, + "step": 12688 + }, + { + "epoch": 1.1840067182980312, + "grad_norm": 0.5910165027368465, + "learning_rate": 0.0002787337217362958, + "loss": 7.4323, + "step": 12689 + }, + { + "epoch": 1.1841000279929084, + "grad_norm": 0.6549671799058956, + "learning_rate": 0.00027872983927893175, + "loss": 7.5351, + "step": 12690 + }, + { + "epoch": 1.1841933376877858, + "grad_norm": 1.5973341279016537, + "learning_rate": 0.0002787259564942458, + "loss": 7.3337, + "step": 12691 + }, + { + "epoch": 1.184286647382663, + "grad_norm": 1.0807722438127574, + "learning_rate": 0.0002787220733822479, + "loss": 7.5158, + "step": 12692 + }, + { + "epoch": 1.1843799570775404, + "grad_norm": 0.9739742107551729, + "learning_rate": 0.00027871818994294794, + "loss": 7.6136, + "step": 12693 + }, + { + "epoch": 1.1844732667724176, + "grad_norm": 1.0144867417986463, + "learning_rate": 0.0002787143061763557, + "loss": 7.2264, + "step": 12694 + }, + { + "epoch": 1.184566576467295, + "grad_norm": 0.7114344640234833, + "learning_rate": 0.0002787104220824811, + "loss": 7.2305, + "step": 12695 + }, + { + "epoch": 1.1846598861621722, + "grad_norm": 0.8957120492506987, + "learning_rate": 0.00027870653766133406, + "loss": 7.3693, + "step": 12696 + }, + { + "epoch": 1.1847531958570496, + "grad_norm": 1.112697285440588, + "learning_rate": 0.0002787026529129244, + "loss": 7.1413, + "step": 12697 + }, + { + "epoch": 1.1848465055519268, + "grad_norm": 1.2607898415574106, + "learning_rate": 0.00027869876783726205, + "loss": 7.0568, + "step": 12698 + }, + { + "epoch": 1.1849398152468043, + "grad_norm": 1.169357069988362, + "learning_rate": 0.0002786948824343568, + "loss": 7.2179, + "step": 12699 + }, + { + "epoch": 1.1850331249416814, + "grad_norm": 1.4903630291295618, + "learning_rate": 0.0002786909967042187, + "loss": 7.3282, + "step": 12700 + }, + { + "epoch": 1.1851264346365586, + "grad_norm": 0.4840906889493692, + "learning_rate": 0.00027868711064685736, + "loss": 6.9031, + "step": 12701 + }, + { + "epoch": 1.185219744331436, + "grad_norm": 0.7236062534235511, + "learning_rate": 0.0002786832242622829, + "loss": 7.2461, + "step": 12702 + }, + { + "epoch": 1.1853130540263133, + "grad_norm": 0.9852828032002904, + "learning_rate": 0.0002786793375505051, + "loss": 7.4794, + "step": 12703 + }, + { + "epoch": 1.1854063637211907, + "grad_norm": 0.8131876341744123, + "learning_rate": 0.0002786754505115339, + "loss": 7.2611, + "step": 12704 + }, + { + "epoch": 1.1854996734160679, + "grad_norm": 1.1086924667257732, + "learning_rate": 0.00027867156314537915, + "loss": 7.1916, + "step": 12705 + }, + { + "epoch": 1.1855929831109453, + "grad_norm": 0.8860971252015788, + "learning_rate": 0.0002786676754520507, + "loss": 7.5278, + "step": 12706 + }, + { + "epoch": 1.1856862928058225, + "grad_norm": 0.4622974222591064, + "learning_rate": 0.00027866378743155845, + "loss": 7.306, + "step": 12707 + }, + { + "epoch": 1.1857796025007, + "grad_norm": 0.8038713403577077, + "learning_rate": 0.0002786598990839123, + "loss": 7.0793, + "step": 12708 + }, + { + "epoch": 1.1858729121955771, + "grad_norm": 1790.1458570439772, + "learning_rate": 0.0002786560104091222, + "loss": 7.2809, + "step": 12709 + }, + { + "epoch": 1.1859662218904545, + "grad_norm": 0.7019834714593578, + "learning_rate": 0.0002786521214071979, + "loss": 7.5735, + "step": 12710 + }, + { + "epoch": 1.1860595315853317, + "grad_norm": 1.3125425073809929, + "learning_rate": 0.0002786482320781494, + "loss": 7.0957, + "step": 12711 + }, + { + "epoch": 1.186152841280209, + "grad_norm": 1.2394982142935491, + "learning_rate": 0.0002786443424219866, + "loss": 7.4235, + "step": 12712 + }, + { + "epoch": 1.1862461509750863, + "grad_norm": 1.5636606349684175, + "learning_rate": 0.0002786404524387193, + "loss": 7.4906, + "step": 12713 + }, + { + "epoch": 1.1863394606699635, + "grad_norm": 1.9546364134869458, + "learning_rate": 0.00027863656212835745, + "loss": 7.1608, + "step": 12714 + }, + { + "epoch": 1.186432770364841, + "grad_norm": 0.9735334229104761, + "learning_rate": 0.0002786326714909109, + "loss": 7.2123, + "step": 12715 + }, + { + "epoch": 1.1865260800597182, + "grad_norm": 0.5883359671906789, + "learning_rate": 0.00027862878052638956, + "loss": 7.3392, + "step": 12716 + }, + { + "epoch": 1.1866193897545956, + "grad_norm": 0.5220601307062642, + "learning_rate": 0.00027862488923480335, + "loss": 7.2564, + "step": 12717 + }, + { + "epoch": 1.1867126994494728, + "grad_norm": 0.6589678864159223, + "learning_rate": 0.00027862099761616214, + "loss": 7.3537, + "step": 12718 + }, + { + "epoch": 1.1868060091443502, + "grad_norm": 0.7193558021691029, + "learning_rate": 0.00027861710567047583, + "loss": 7.311, + "step": 12719 + }, + { + "epoch": 1.1868993188392274, + "grad_norm": 294.2118766811714, + "learning_rate": 0.00027861321339775437, + "loss": 7.198, + "step": 12720 + }, + { + "epoch": 1.1869926285341048, + "grad_norm": 0.5827060695271642, + "learning_rate": 0.00027860932079800754, + "loss": 7.1749, + "step": 12721 + }, + { + "epoch": 1.187085938228982, + "grad_norm": 0.969240470412824, + "learning_rate": 0.00027860542787124534, + "loss": 7.3166, + "step": 12722 + }, + { + "epoch": 1.1871792479238592, + "grad_norm": 1238.707520467493, + "learning_rate": 0.0002786015346174777, + "loss": 7.2213, + "step": 12723 + }, + { + "epoch": 1.1872725576187366, + "grad_norm": 0.5666001499397557, + "learning_rate": 0.00027859764103671433, + "loss": 7.3183, + "step": 12724 + }, + { + "epoch": 1.1873658673136138, + "grad_norm": 0.7629931523279688, + "learning_rate": 0.0002785937471289653, + "loss": 7.0545, + "step": 12725 + }, + { + "epoch": 1.1874591770084912, + "grad_norm": 149.00928963263362, + "learning_rate": 0.0002785898528942405, + "loss": 7.3918, + "step": 12726 + }, + { + "epoch": 1.1875524867033684, + "grad_norm": 0.5222428063349861, + "learning_rate": 0.00027858595833254974, + "loss": 7.3316, + "step": 12727 + }, + { + "epoch": 1.1876457963982459, + "grad_norm": 0.6352840883761172, + "learning_rate": 0.000278582063443903, + "loss": 7.2651, + "step": 12728 + }, + { + "epoch": 1.187739106093123, + "grad_norm": 0.4314815504634931, + "learning_rate": 0.00027857816822831016, + "loss": 7.3746, + "step": 12729 + }, + { + "epoch": 1.1878324157880003, + "grad_norm": 0.62183960818478, + "learning_rate": 0.0002785742726857811, + "loss": 6.8573, + "step": 12730 + }, + { + "epoch": 1.1879257254828777, + "grad_norm": 6.083875179695188, + "learning_rate": 0.0002785703768163258, + "loss": 7.3865, + "step": 12731 + }, + { + "epoch": 1.188019035177755, + "grad_norm": 0.8323144047417923, + "learning_rate": 0.0002785664806199541, + "loss": 7.5295, + "step": 12732 + }, + { + "epoch": 1.1881123448726323, + "grad_norm": 0.7215277084193409, + "learning_rate": 0.0002785625840966759, + "loss": 7.1728, + "step": 12733 + }, + { + "epoch": 1.1882056545675095, + "grad_norm": 16.311410487769084, + "learning_rate": 0.0002785586872465012, + "loss": 7.3458, + "step": 12734 + }, + { + "epoch": 1.188298964262387, + "grad_norm": 10.618657551509889, + "learning_rate": 0.00027855479006943975, + "loss": 7.2511, + "step": 12735 + }, + { + "epoch": 1.188392273957264, + "grad_norm": 0.5417305038541632, + "learning_rate": 0.0002785508925655016, + "loss": 7.4821, + "step": 12736 + }, + { + "epoch": 1.1884855836521415, + "grad_norm": 1.078820495347698, + "learning_rate": 0.0002785469947346966, + "loss": 7.8905, + "step": 12737 + }, + { + "epoch": 1.1885788933470187, + "grad_norm": 0.5705628325034832, + "learning_rate": 0.00027854309657703464, + "loss": 7.2633, + "step": 12738 + }, + { + "epoch": 1.1886722030418961, + "grad_norm": 0.6544223330416796, + "learning_rate": 0.00027853919809252574, + "loss": 7.3569, + "step": 12739 + }, + { + "epoch": 1.1887655127367733, + "grad_norm": 1.6718526674521295, + "learning_rate": 0.0002785352992811797, + "loss": 6.9891, + "step": 12740 + }, + { + "epoch": 1.1888588224316505, + "grad_norm": 0.5873820804016804, + "learning_rate": 0.00027853140014300645, + "loss": 7.367, + "step": 12741 + }, + { + "epoch": 1.188952132126528, + "grad_norm": 0.5890062727670419, + "learning_rate": 0.00027852750067801596, + "loss": 7.1143, + "step": 12742 + }, + { + "epoch": 1.1890454418214054, + "grad_norm": 1.2604566212060286, + "learning_rate": 0.00027852360088621804, + "loss": 7.6435, + "step": 12743 + }, + { + "epoch": 1.1891387515162826, + "grad_norm": 5.316763949637701, + "learning_rate": 0.0002785197007676227, + "loss": 7.2174, + "step": 12744 + }, + { + "epoch": 1.1892320612111598, + "grad_norm": 0.732865489906286, + "learning_rate": 0.00027851580032223986, + "loss": 7.4938, + "step": 12745 + }, + { + "epoch": 1.1893253709060372, + "grad_norm": 0.6547770362209216, + "learning_rate": 0.0002785118995500794, + "loss": 7.2335, + "step": 12746 + }, + { + "epoch": 1.1894186806009144, + "grad_norm": 0.5753672380762478, + "learning_rate": 0.00027850799845115123, + "loss": 7.5351, + "step": 12747 + }, + { + "epoch": 1.1895119902957918, + "grad_norm": 1.6127507220795718, + "learning_rate": 0.0002785040970254653, + "loss": 7.0328, + "step": 12748 + }, + { + "epoch": 1.189605299990669, + "grad_norm": 1.280922896870439, + "learning_rate": 0.0002785001952730315, + "loss": 7.2696, + "step": 12749 + }, + { + "epoch": 1.1896986096855464, + "grad_norm": 1.198279688393912, + "learning_rate": 0.00027849629319385985, + "loss": 7.5205, + "step": 12750 + }, + { + "epoch": 1.1897919193804236, + "grad_norm": 4.50927163315053, + "learning_rate": 0.00027849239078796007, + "loss": 7.5345, + "step": 12751 + }, + { + "epoch": 1.1898852290753008, + "grad_norm": 0.6213285386132478, + "learning_rate": 0.00027848848805534226, + "loss": 7.2685, + "step": 12752 + }, + { + "epoch": 1.1899785387701782, + "grad_norm": 0.7289815849432397, + "learning_rate": 0.0002784845849960163, + "loss": 7.5273, + "step": 12753 + }, + { + "epoch": 1.1900718484650554, + "grad_norm": 2.887185159707122, + "learning_rate": 0.0002784806816099921, + "loss": 7.7914, + "step": 12754 + }, + { + "epoch": 1.1901651581599328, + "grad_norm": 0.41659719960813457, + "learning_rate": 0.00027847677789727957, + "loss": 7.4072, + "step": 12755 + }, + { + "epoch": 1.19025846785481, + "grad_norm": 0.5366219494927639, + "learning_rate": 0.0002784728738578886, + "loss": 7.5277, + "step": 12756 + }, + { + "epoch": 1.1903517775496875, + "grad_norm": 1.3791755945558437, + "learning_rate": 0.00027846896949182924, + "loss": 7.1019, + "step": 12757 + }, + { + "epoch": 1.1904450872445647, + "grad_norm": 0.654762141157755, + "learning_rate": 0.0002784650647991113, + "loss": 7.2658, + "step": 12758 + }, + { + "epoch": 1.190538396939442, + "grad_norm": 0.695353090126851, + "learning_rate": 0.0002784611597797448, + "loss": 7.1672, + "step": 12759 + }, + { + "epoch": 1.1906317066343193, + "grad_norm": 1.1662815097641572, + "learning_rate": 0.0002784572544337396, + "loss": 7.3608, + "step": 12760 + }, + { + "epoch": 1.1907250163291967, + "grad_norm": 1.0773586110772975, + "learning_rate": 0.0002784533487611057, + "loss": 7.0385, + "step": 12761 + }, + { + "epoch": 1.190818326024074, + "grad_norm": 1.3245775012650056, + "learning_rate": 0.0002784494427618529, + "loss": 7.3562, + "step": 12762 + }, + { + "epoch": 1.190911635718951, + "grad_norm": 4.896394625135838, + "learning_rate": 0.00027844553643599127, + "loss": 7.0918, + "step": 12763 + }, + { + "epoch": 1.1910049454138285, + "grad_norm": 1.4649455245376213, + "learning_rate": 0.0002784416297835307, + "loss": 7.6696, + "step": 12764 + }, + { + "epoch": 1.1910982551087057, + "grad_norm": 1.1318169171174328, + "learning_rate": 0.00027843772280448107, + "loss": 7.1233, + "step": 12765 + }, + { + "epoch": 1.1911915648035831, + "grad_norm": 1.7703121393408183, + "learning_rate": 0.00027843381549885233, + "loss": 7.03, + "step": 12766 + }, + { + "epoch": 1.1912848744984603, + "grad_norm": 1.4962975757990407, + "learning_rate": 0.00027842990786665453, + "loss": 7.559, + "step": 12767 + }, + { + "epoch": 1.1913781841933377, + "grad_norm": 0.8695945135808729, + "learning_rate": 0.0002784259999078975, + "loss": 7.3557, + "step": 12768 + }, + { + "epoch": 1.191471493888215, + "grad_norm": 0.5979812077925275, + "learning_rate": 0.00027842209162259114, + "loss": 7.3309, + "step": 12769 + }, + { + "epoch": 1.1915648035830924, + "grad_norm": 12.388838227443918, + "learning_rate": 0.0002784181830107455, + "loss": 7.2666, + "step": 12770 + }, + { + "epoch": 1.1916581132779696, + "grad_norm": 0.7122386955081643, + "learning_rate": 0.0002784142740723704, + "loss": 7.195, + "step": 12771 + }, + { + "epoch": 1.191751422972847, + "grad_norm": 1.6107611414651815, + "learning_rate": 0.0002784103648074759, + "loss": 7.2535, + "step": 12772 + }, + { + "epoch": 1.1918447326677242, + "grad_norm": 2.6025841916348167, + "learning_rate": 0.00027840645521607185, + "loss": 7.4722, + "step": 12773 + }, + { + "epoch": 1.1919380423626014, + "grad_norm": 0.802523182922073, + "learning_rate": 0.00027840254529816824, + "loss": 7.2616, + "step": 12774 + }, + { + "epoch": 1.1920313520574788, + "grad_norm": 2.7445394434723576, + "learning_rate": 0.000278398635053775, + "loss": 7.3223, + "step": 12775 + }, + { + "epoch": 1.192124661752356, + "grad_norm": 0.42115259775417674, + "learning_rate": 0.000278394724482902, + "loss": 7.362, + "step": 12776 + }, + { + "epoch": 1.1922179714472334, + "grad_norm": 1.0758363692072854, + "learning_rate": 0.00027839081358555927, + "loss": 7.6366, + "step": 12777 + }, + { + "epoch": 1.1923112811421106, + "grad_norm": 0.6836632292810001, + "learning_rate": 0.00027838690236175676, + "loss": 7.4561, + "step": 12778 + }, + { + "epoch": 1.192404590836988, + "grad_norm": 1.145989231727276, + "learning_rate": 0.00027838299081150444, + "loss": 7.2182, + "step": 12779 + }, + { + "epoch": 1.1924979005318652, + "grad_norm": 1.033614309770679, + "learning_rate": 0.0002783790789348121, + "loss": 7.4075, + "step": 12780 + }, + { + "epoch": 1.1925912102267426, + "grad_norm": 2.0770259581574693, + "learning_rate": 0.00027837516673168985, + "loss": 7.0739, + "step": 12781 + }, + { + "epoch": 1.1926845199216198, + "grad_norm": 0.8917440682903208, + "learning_rate": 0.0002783712542021476, + "loss": 7.0222, + "step": 12782 + }, + { + "epoch": 1.1927778296164973, + "grad_norm": 1.3025104647561139, + "learning_rate": 0.0002783673413461952, + "loss": 7.5262, + "step": 12783 + }, + { + "epoch": 1.1928711393113745, + "grad_norm": 1.4196356548209925, + "learning_rate": 0.00027836342816384276, + "loss": 7.2365, + "step": 12784 + }, + { + "epoch": 1.1929644490062516, + "grad_norm": 4.315534212576336, + "learning_rate": 0.0002783595146551001, + "loss": 7.2524, + "step": 12785 + }, + { + "epoch": 1.193057758701129, + "grad_norm": 0.7895929863473261, + "learning_rate": 0.00027835560081997725, + "loss": 7.1149, + "step": 12786 + }, + { + "epoch": 1.1931510683960063, + "grad_norm": 1.1471955871700643, + "learning_rate": 0.00027835168665848407, + "loss": 7.5715, + "step": 12787 + }, + { + "epoch": 1.1932443780908837, + "grad_norm": 2.6485134060218685, + "learning_rate": 0.00027834777217063063, + "loss": 7.3235, + "step": 12788 + }, + { + "epoch": 1.1933376877857609, + "grad_norm": 0.6023912428035225, + "learning_rate": 0.00027834385735642675, + "loss": 7.4563, + "step": 12789 + }, + { + "epoch": 1.1934309974806383, + "grad_norm": 0.48842218304947527, + "learning_rate": 0.0002783399422158825, + "loss": 7.7007, + "step": 12790 + }, + { + "epoch": 1.1935243071755155, + "grad_norm": 0.6553361219774136, + "learning_rate": 0.00027833602674900786, + "loss": 7.3639, + "step": 12791 + }, + { + "epoch": 1.193617616870393, + "grad_norm": 0.9906069713221998, + "learning_rate": 0.00027833211095581263, + "loss": 7.3337, + "step": 12792 + }, + { + "epoch": 1.1937109265652701, + "grad_norm": 0.4999036311000929, + "learning_rate": 0.0002783281948363069, + "loss": 7.5551, + "step": 12793 + }, + { + "epoch": 1.1938042362601475, + "grad_norm": 7.451677410192652, + "learning_rate": 0.0002783242783905006, + "loss": 7.4254, + "step": 12794 + }, + { + "epoch": 1.1938975459550247, + "grad_norm": 0.7093383043725215, + "learning_rate": 0.0002783203616184036, + "loss": 7.0995, + "step": 12795 + }, + { + "epoch": 1.193990855649902, + "grad_norm": 1.4219547850919867, + "learning_rate": 0.000278316444520026, + "loss": 7.6109, + "step": 12796 + }, + { + "epoch": 1.1940841653447793, + "grad_norm": 0.7873105353341939, + "learning_rate": 0.00027831252709537767, + "loss": 7.2032, + "step": 12797 + }, + { + "epoch": 1.1941774750396565, + "grad_norm": 5.33802034269553, + "learning_rate": 0.00027830860934446855, + "loss": 7.1339, + "step": 12798 + }, + { + "epoch": 1.194270784734534, + "grad_norm": 0.9023500290710029, + "learning_rate": 0.00027830469126730875, + "loss": 7.2952, + "step": 12799 + }, + { + "epoch": 1.1943640944294112, + "grad_norm": 0.7918107377489937, + "learning_rate": 0.00027830077286390806, + "loss": 7.4526, + "step": 12800 + }, + { + "epoch": 1.1944574041242886, + "grad_norm": 0.5402972528730157, + "learning_rate": 0.0002782968541342765, + "loss": 7.4741, + "step": 12801 + }, + { + "epoch": 1.1945507138191658, + "grad_norm": 1.7356975177233376, + "learning_rate": 0.00027829293507842406, + "loss": 6.9247, + "step": 12802 + }, + { + "epoch": 1.1946440235140432, + "grad_norm": 1.4246577440651378, + "learning_rate": 0.0002782890156963607, + "loss": 7.1121, + "step": 12803 + }, + { + "epoch": 1.1947373332089204, + "grad_norm": 0.49617952009346866, + "learning_rate": 0.00027828509598809635, + "loss": 7.2927, + "step": 12804 + }, + { + "epoch": 1.1948306429037978, + "grad_norm": 1.446862869958162, + "learning_rate": 0.000278281175953641, + "loss": 7.6111, + "step": 12805 + }, + { + "epoch": 1.194923952598675, + "grad_norm": 1.080414549866529, + "learning_rate": 0.00027827725559300467, + "loss": 7.5759, + "step": 12806 + }, + { + "epoch": 1.1950172622935522, + "grad_norm": 0.5939715340559049, + "learning_rate": 0.0002782733349061972, + "loss": 7.2199, + "step": 12807 + }, + { + "epoch": 1.1951105719884296, + "grad_norm": 2.735215699516736, + "learning_rate": 0.0002782694138932287, + "loss": 6.9337, + "step": 12808 + }, + { + "epoch": 1.1952038816833068, + "grad_norm": 0.7786142807739422, + "learning_rate": 0.000278265492554109, + "loss": 7.6181, + "step": 12809 + }, + { + "epoch": 1.1952971913781842, + "grad_norm": 0.5150240438706768, + "learning_rate": 0.00027826157088884826, + "loss": 7.1962, + "step": 12810 + }, + { + "epoch": 1.1953905010730614, + "grad_norm": 0.6334822940346992, + "learning_rate": 0.00027825764889745624, + "loss": 7.4566, + "step": 12811 + }, + { + "epoch": 1.1954838107679389, + "grad_norm": 11.026089669492888, + "learning_rate": 0.0002782537265799431, + "loss": 7.4502, + "step": 12812 + }, + { + "epoch": 1.195577120462816, + "grad_norm": 0.47182502680563426, + "learning_rate": 0.00027824980393631866, + "loss": 7.3723, + "step": 12813 + }, + { + "epoch": 1.1956704301576935, + "grad_norm": 1.659319428205098, + "learning_rate": 0.00027824588096659296, + "loss": 7.123, + "step": 12814 + }, + { + "epoch": 1.1957637398525707, + "grad_norm": 43.90049222356606, + "learning_rate": 0.000278241957670776, + "loss": 7.5254, + "step": 12815 + }, + { + "epoch": 1.195857049547448, + "grad_norm": 2.8620910384117937, + "learning_rate": 0.00027823803404887775, + "loss": 7.2883, + "step": 12816 + }, + { + "epoch": 1.1959503592423253, + "grad_norm": 1.270914925067544, + "learning_rate": 0.00027823411010090813, + "loss": 7.3897, + "step": 12817 + }, + { + "epoch": 1.1960436689372025, + "grad_norm": 0.7067551943820293, + "learning_rate": 0.00027823018582687713, + "loss": 7.6012, + "step": 12818 + }, + { + "epoch": 1.19613697863208, + "grad_norm": 0.7610930711454557, + "learning_rate": 0.0002782262612267948, + "loss": 7.3296, + "step": 12819 + }, + { + "epoch": 1.196230288326957, + "grad_norm": 0.7687618844389655, + "learning_rate": 0.000278222336300671, + "loss": 7.0924, + "step": 12820 + }, + { + "epoch": 1.1963235980218345, + "grad_norm": 0.6497369382887807, + "learning_rate": 0.00027821841104851586, + "loss": 7.1933, + "step": 12821 + }, + { + "epoch": 1.1964169077167117, + "grad_norm": 1.007353783679734, + "learning_rate": 0.00027821448547033927, + "loss": 7.4457, + "step": 12822 + }, + { + "epoch": 1.1965102174115891, + "grad_norm": 1.1684580047129023, + "learning_rate": 0.0002782105595661512, + "loss": 7.3375, + "step": 12823 + }, + { + "epoch": 1.1966035271064663, + "grad_norm": 0.6540787763964945, + "learning_rate": 0.0002782066333359617, + "loss": 7.3184, + "step": 12824 + }, + { + "epoch": 1.1966968368013438, + "grad_norm": 0.5798435896069464, + "learning_rate": 0.00027820270677978066, + "loss": 7.2486, + "step": 12825 + }, + { + "epoch": 1.196790146496221, + "grad_norm": 0.481588148646849, + "learning_rate": 0.0002781987798976181, + "loss": 7.4369, + "step": 12826 + }, + { + "epoch": 1.1968834561910984, + "grad_norm": 1.146000553928056, + "learning_rate": 0.00027819485268948407, + "loss": 7.094, + "step": 12827 + }, + { + "epoch": 1.1969767658859756, + "grad_norm": 1.092130232343049, + "learning_rate": 0.0002781909251553885, + "loss": 7.6181, + "step": 12828 + }, + { + "epoch": 1.1970700755808528, + "grad_norm": 0.7451106738437308, + "learning_rate": 0.00027818699729534134, + "loss": 7.1801, + "step": 12829 + }, + { + "epoch": 1.1971633852757302, + "grad_norm": 0.8534185124501633, + "learning_rate": 0.0002781830691093526, + "loss": 7.4206, + "step": 12830 + }, + { + "epoch": 1.1972566949706074, + "grad_norm": 0.5508155482710466, + "learning_rate": 0.00027817914059743233, + "loss": 7.5076, + "step": 12831 + }, + { + "epoch": 1.1973500046654848, + "grad_norm": 0.5286329384195299, + "learning_rate": 0.00027817521175959047, + "loss": 7.2909, + "step": 12832 + }, + { + "epoch": 1.197443314360362, + "grad_norm": 0.6226920957912723, + "learning_rate": 0.00027817128259583705, + "loss": 7.225, + "step": 12833 + }, + { + "epoch": 1.1975366240552394, + "grad_norm": 2.8570878822956596, + "learning_rate": 0.000278167353106182, + "loss": 7.2746, + "step": 12834 + }, + { + "epoch": 1.1976299337501166, + "grad_norm": 0.7844576166691726, + "learning_rate": 0.0002781634232906353, + "loss": 7.2362, + "step": 12835 + }, + { + "epoch": 1.1977232434449938, + "grad_norm": 0.8342353705134441, + "learning_rate": 0.00027815949314920695, + "loss": 7.0772, + "step": 12836 + }, + { + "epoch": 1.1978165531398712, + "grad_norm": 2.326925605742136, + "learning_rate": 0.00027815556268190707, + "loss": 7.3456, + "step": 12837 + }, + { + "epoch": 1.1979098628347487, + "grad_norm": 0.6757202410833067, + "learning_rate": 0.0002781516318887455, + "loss": 7.5199, + "step": 12838 + }, + { + "epoch": 1.1980031725296258, + "grad_norm": 0.6659399863668104, + "learning_rate": 0.00027814770076973235, + "loss": 7.2191, + "step": 12839 + }, + { + "epoch": 1.198096482224503, + "grad_norm": 0.7344549313602052, + "learning_rate": 0.00027814376932487747, + "loss": 7.4103, + "step": 12840 + }, + { + "epoch": 1.1981897919193805, + "grad_norm": 1.0394879377175206, + "learning_rate": 0.000278139837554191, + "loss": 7.1094, + "step": 12841 + }, + { + "epoch": 1.1982831016142577, + "grad_norm": 1.1492342862752374, + "learning_rate": 0.00027813590545768284, + "loss": 7.0627, + "step": 12842 + }, + { + "epoch": 1.198376411309135, + "grad_norm": 0.6690432212225094, + "learning_rate": 0.0002781319730353631, + "loss": 7.2871, + "step": 12843 + }, + { + "epoch": 1.1984697210040123, + "grad_norm": 1.1659032751731648, + "learning_rate": 0.00027812804028724163, + "loss": 6.8851, + "step": 12844 + }, + { + "epoch": 1.1985630306988897, + "grad_norm": 0.7165220796369639, + "learning_rate": 0.0002781241072133286, + "loss": 7.2964, + "step": 12845 + }, + { + "epoch": 1.198656340393767, + "grad_norm": 1.6096745941136235, + "learning_rate": 0.0002781201738136338, + "loss": 7.6242, + "step": 12846 + }, + { + "epoch": 1.198749650088644, + "grad_norm": 1.5675602151881507, + "learning_rate": 0.00027811624008816746, + "loss": 7.5935, + "step": 12847 + }, + { + "epoch": 1.1988429597835215, + "grad_norm": 0.578274727700213, + "learning_rate": 0.0002781123060369394, + "loss": 6.9859, + "step": 12848 + }, + { + "epoch": 1.198936269478399, + "grad_norm": 0.7662522314913311, + "learning_rate": 0.0002781083716599597, + "loss": 7.3032, + "step": 12849 + }, + { + "epoch": 1.1990295791732761, + "grad_norm": 1.041014446894582, + "learning_rate": 0.0002781044369572384, + "loss": 7.3775, + "step": 12850 + }, + { + "epoch": 1.1991228888681533, + "grad_norm": 1.6116686067696044, + "learning_rate": 0.0002781005019287854, + "loss": 6.9536, + "step": 12851 + }, + { + "epoch": 1.1992161985630307, + "grad_norm": 0.4910419266634712, + "learning_rate": 0.0002780965665746109, + "loss": 7.4546, + "step": 12852 + }, + { + "epoch": 1.199309508257908, + "grad_norm": 0.5873842043590934, + "learning_rate": 0.0002780926308947246, + "loss": 7.5583, + "step": 12853 + }, + { + "epoch": 1.1994028179527854, + "grad_norm": 3.925154902687311, + "learning_rate": 0.0002780886948891368, + "loss": 7.168, + "step": 12854 + }, + { + "epoch": 1.1994961276476626, + "grad_norm": 0.7356754578194283, + "learning_rate": 0.00027808475855785737, + "loss": 7.3343, + "step": 12855 + }, + { + "epoch": 1.19958943734254, + "grad_norm": 0.8691814405144357, + "learning_rate": 0.00027808082190089635, + "loss": 7.1456, + "step": 12856 + }, + { + "epoch": 1.1996827470374172, + "grad_norm": 0.5914328564245055, + "learning_rate": 0.0002780768849182637, + "loss": 7.1613, + "step": 12857 + }, + { + "epoch": 1.1997760567322944, + "grad_norm": 0.5985436752305283, + "learning_rate": 0.0002780729476099695, + "loss": 7.3256, + "step": 12858 + }, + { + "epoch": 1.1998693664271718, + "grad_norm": 0.8533048354821874, + "learning_rate": 0.00027806900997602373, + "loss": 7.5418, + "step": 12859 + }, + { + "epoch": 1.199962676122049, + "grad_norm": 0.45548259595860996, + "learning_rate": 0.00027806507201643635, + "loss": 7.2435, + "step": 12860 + }, + { + "epoch": 1.2000559858169264, + "grad_norm": 7.835178077258, + "learning_rate": 0.0002780611337312175, + "loss": 7.378, + "step": 12861 + }, + { + "epoch": 1.2001492955118036, + "grad_norm": 1.0798273481124807, + "learning_rate": 0.0002780571951203771, + "loss": 7.4315, + "step": 12862 + }, + { + "epoch": 1.200242605206681, + "grad_norm": 1.3865056594518574, + "learning_rate": 0.0002780532561839252, + "loss": 7.2072, + "step": 12863 + }, + { + "epoch": 1.2003359149015582, + "grad_norm": 0.7761057943982681, + "learning_rate": 0.0002780493169218718, + "loss": 7.4271, + "step": 12864 + }, + { + "epoch": 1.2004292245964356, + "grad_norm": 0.6001343843614838, + "learning_rate": 0.0002780453773342269, + "loss": 7.2717, + "step": 12865 + }, + { + "epoch": 1.2005225342913128, + "grad_norm": 0.8546489494992479, + "learning_rate": 0.00027804143742100055, + "loss": 7.4907, + "step": 12866 + }, + { + "epoch": 1.2006158439861903, + "grad_norm": 0.7313867691508343, + "learning_rate": 0.0002780374971822027, + "loss": 7.1809, + "step": 12867 + }, + { + "epoch": 1.2007091536810675, + "grad_norm": 1.1927548656760396, + "learning_rate": 0.0002780335566178435, + "loss": 7.6313, + "step": 12868 + }, + { + "epoch": 1.2008024633759447, + "grad_norm": 1.00497635640804, + "learning_rate": 0.0002780296157279329, + "loss": 7.1923, + "step": 12869 + }, + { + "epoch": 1.200895773070822, + "grad_norm": 0.6143191926499774, + "learning_rate": 0.00027802567451248086, + "loss": 7.1911, + "step": 12870 + }, + { + "epoch": 1.2009890827656993, + "grad_norm": 1.1095551308653715, + "learning_rate": 0.00027802173297149745, + "loss": 7.1522, + "step": 12871 + }, + { + "epoch": 1.2010823924605767, + "grad_norm": 1.1375969754733684, + "learning_rate": 0.0002780177911049927, + "loss": 7.2824, + "step": 12872 + }, + { + "epoch": 1.2011757021554539, + "grad_norm": 0.8110266211487269, + "learning_rate": 0.0002780138489129766, + "loss": 7.263, + "step": 12873 + }, + { + "epoch": 1.2012690118503313, + "grad_norm": 1.2472968165090395, + "learning_rate": 0.0002780099063954593, + "loss": 7.2264, + "step": 12874 + }, + { + "epoch": 1.2013623215452085, + "grad_norm": 0.8683406649968305, + "learning_rate": 0.00027800596355245067, + "loss": 7.2456, + "step": 12875 + }, + { + "epoch": 1.201455631240086, + "grad_norm": 1.1283370487652011, + "learning_rate": 0.00027800202038396075, + "loss": 7.5662, + "step": 12876 + }, + { + "epoch": 1.2015489409349631, + "grad_norm": 1.9409512623786982, + "learning_rate": 0.0002779980768899997, + "loss": 7.2043, + "step": 12877 + }, + { + "epoch": 1.2016422506298405, + "grad_norm": 0.6942057679201988, + "learning_rate": 0.0002779941330705774, + "loss": 7.5928, + "step": 12878 + }, + { + "epoch": 1.2017355603247177, + "grad_norm": 0.9159463794426915, + "learning_rate": 0.0002779901889257039, + "loss": 7.5824, + "step": 12879 + }, + { + "epoch": 1.201828870019595, + "grad_norm": 0.7083378106429844, + "learning_rate": 0.0002779862444553893, + "loss": 7.7336, + "step": 12880 + }, + { + "epoch": 1.2019221797144723, + "grad_norm": 2.502774526123938, + "learning_rate": 0.00027798229965964364, + "loss": 6.9757, + "step": 12881 + }, + { + "epoch": 1.2020154894093495, + "grad_norm": 0.7291031489290144, + "learning_rate": 0.0002779783545384768, + "loss": 7.471, + "step": 12882 + }, + { + "epoch": 1.202108799104227, + "grad_norm": 0.5037759838475597, + "learning_rate": 0.000277974409091899, + "loss": 7.4507, + "step": 12883 + }, + { + "epoch": 1.2022021087991042, + "grad_norm": 1.2211363104987136, + "learning_rate": 0.0002779704633199202, + "loss": 7.3959, + "step": 12884 + }, + { + "epoch": 1.2022954184939816, + "grad_norm": 2.02251135824868, + "learning_rate": 0.00027796651722255037, + "loss": 7.5724, + "step": 12885 + }, + { + "epoch": 1.2023887281888588, + "grad_norm": 2.1999281905506183, + "learning_rate": 0.0002779625707997996, + "loss": 7.2485, + "step": 12886 + }, + { + "epoch": 1.2024820378837362, + "grad_norm": 0.9981907343449974, + "learning_rate": 0.0002779586240516779, + "loss": 7.311, + "step": 12887 + }, + { + "epoch": 1.2025753475786134, + "grad_norm": 1.090484565132033, + "learning_rate": 0.00027795467697819536, + "loss": 7.1715, + "step": 12888 + }, + { + "epoch": 1.2026686572734908, + "grad_norm": 0.6093868865163027, + "learning_rate": 0.00027795072957936194, + "loss": 7.6623, + "step": 12889 + }, + { + "epoch": 1.202761966968368, + "grad_norm": 0.9596643170337311, + "learning_rate": 0.0002779467818551878, + "loss": 7.552, + "step": 12890 + }, + { + "epoch": 1.2028552766632452, + "grad_norm": 0.6869877789141648, + "learning_rate": 0.0002779428338056828, + "loss": 7.2584, + "step": 12891 + }, + { + "epoch": 1.2029485863581226, + "grad_norm": 2.25830062339931, + "learning_rate": 0.0002779388854308571, + "loss": 7.0429, + "step": 12892 + }, + { + "epoch": 1.2030418960529998, + "grad_norm": 0.4785207754678896, + "learning_rate": 0.00027793493673072074, + "loss": 7.4183, + "step": 12893 + }, + { + "epoch": 1.2031352057478772, + "grad_norm": 0.4618434840431172, + "learning_rate": 0.0002779309877052837, + "loss": 7.1659, + "step": 12894 + }, + { + "epoch": 1.2032285154427544, + "grad_norm": 0.7421630199352324, + "learning_rate": 0.00027792703835455605, + "loss": 7.1804, + "step": 12895 + }, + { + "epoch": 1.2033218251376319, + "grad_norm": 0.8557012755036395, + "learning_rate": 0.00027792308867854786, + "loss": 7.5729, + "step": 12896 + }, + { + "epoch": 1.203415134832509, + "grad_norm": 0.4982112750421555, + "learning_rate": 0.0002779191386772691, + "loss": 7.0633, + "step": 12897 + }, + { + "epoch": 1.2035084445273865, + "grad_norm": 0.927370459029162, + "learning_rate": 0.00027791518835073, + "loss": 7.1398, + "step": 12898 + }, + { + "epoch": 1.2036017542222637, + "grad_norm": 2.1474717968003096, + "learning_rate": 0.0002779112376989403, + "loss": 7.3481, + "step": 12899 + }, + { + "epoch": 1.203695063917141, + "grad_norm": 1.826668043704172, + "learning_rate": 0.00027790728672191035, + "loss": 7.0813, + "step": 12900 + }, + { + "epoch": 1.2037883736120183, + "grad_norm": 1.2931655019236343, + "learning_rate": 0.0002779033354196499, + "loss": 7.5007, + "step": 12901 + }, + { + "epoch": 1.2038816833068955, + "grad_norm": 0.9250258991455566, + "learning_rate": 0.00027789938379216934, + "loss": 7.5922, + "step": 12902 + }, + { + "epoch": 1.203974993001773, + "grad_norm": 0.4831793153831592, + "learning_rate": 0.0002778954318394784, + "loss": 7.303, + "step": 12903 + }, + { + "epoch": 1.20406830269665, + "grad_norm": 1.6511489328534465, + "learning_rate": 0.0002778914795615873, + "loss": 7.1734, + "step": 12904 + }, + { + "epoch": 1.2041616123915275, + "grad_norm": 1.4898230946629825, + "learning_rate": 0.00027788752695850606, + "loss": 7.5314, + "step": 12905 + }, + { + "epoch": 1.2042549220864047, + "grad_norm": 1.325927959230392, + "learning_rate": 0.00027788357403024476, + "loss": 7.3036, + "step": 12906 + }, + { + "epoch": 1.2043482317812821, + "grad_norm": 0.717430951915679, + "learning_rate": 0.00027787962077681334, + "loss": 7.1946, + "step": 12907 + }, + { + "epoch": 1.2044415414761593, + "grad_norm": 0.6426477518492519, + "learning_rate": 0.000277875667198222, + "loss": 7.4694, + "step": 12908 + }, + { + "epoch": 1.2045348511710368, + "grad_norm": 0.4224559320289919, + "learning_rate": 0.0002778717132944806, + "loss": 7.3639, + "step": 12909 + }, + { + "epoch": 1.204628160865914, + "grad_norm": 0.845026666316682, + "learning_rate": 0.00027786775906559944, + "loss": 7.2683, + "step": 12910 + }, + { + "epoch": 1.2047214705607914, + "grad_norm": 0.9284147236006124, + "learning_rate": 0.00027786380451158836, + "loss": 7.524, + "step": 12911 + }, + { + "epoch": 1.2048147802556686, + "grad_norm": 0.5741019348144105, + "learning_rate": 0.0002778598496324576, + "loss": 7.3108, + "step": 12912 + }, + { + "epoch": 1.2049080899505458, + "grad_norm": 0.8177336895994922, + "learning_rate": 0.000277855894428217, + "loss": 7.1903, + "step": 12913 + }, + { + "epoch": 1.2050013996454232, + "grad_norm": 0.6105605097959689, + "learning_rate": 0.0002778519388988768, + "loss": 7.3669, + "step": 12914 + }, + { + "epoch": 1.2050947093403004, + "grad_norm": 0.935966513085295, + "learning_rate": 0.00027784798304444696, + "loss": 7.2045, + "step": 12915 + }, + { + "epoch": 1.2051880190351778, + "grad_norm": 0.5746537922062234, + "learning_rate": 0.0002778440268649376, + "loss": 7.3433, + "step": 12916 + }, + { + "epoch": 1.205281328730055, + "grad_norm": 0.76902682636001, + "learning_rate": 0.00027784007036035877, + "loss": 7.4601, + "step": 12917 + }, + { + "epoch": 1.2053746384249324, + "grad_norm": 0.6683893366887335, + "learning_rate": 0.00027783611353072047, + "loss": 7.5143, + "step": 12918 + }, + { + "epoch": 1.2054679481198096, + "grad_norm": 0.8858862090818435, + "learning_rate": 0.00027783215637603283, + "loss": 7.3869, + "step": 12919 + }, + { + "epoch": 1.205561257814687, + "grad_norm": 1.6312200605228115, + "learning_rate": 0.0002778281988963058, + "loss": 7.4361, + "step": 12920 + }, + { + "epoch": 1.2056545675095642, + "grad_norm": 0.8790471683694544, + "learning_rate": 0.00027782424109154964, + "loss": 7.1078, + "step": 12921 + }, + { + "epoch": 1.2057478772044417, + "grad_norm": 0.5303956769437719, + "learning_rate": 0.00027782028296177426, + "loss": 7.4506, + "step": 12922 + }, + { + "epoch": 1.2058411868993189, + "grad_norm": 1.2732992179235574, + "learning_rate": 0.00027781632450698976, + "loss": 7.4359, + "step": 12923 + }, + { + "epoch": 1.205934496594196, + "grad_norm": 1.2986663286098847, + "learning_rate": 0.00027781236572720617, + "loss": 7.453, + "step": 12924 + }, + { + "epoch": 1.2060278062890735, + "grad_norm": 1.1075037943910235, + "learning_rate": 0.0002778084066224336, + "loss": 7.3073, + "step": 12925 + }, + { + "epoch": 1.2061211159839507, + "grad_norm": 1.1243258934149132, + "learning_rate": 0.0002778044471926822, + "loss": 7.1268, + "step": 12926 + }, + { + "epoch": 1.206214425678828, + "grad_norm": 1.1511523455430257, + "learning_rate": 0.0002778004874379618, + "loss": 6.9258, + "step": 12927 + }, + { + "epoch": 1.2063077353737053, + "grad_norm": 0.7021014397802045, + "learning_rate": 0.0002777965273582827, + "loss": 7.4159, + "step": 12928 + }, + { + "epoch": 1.2064010450685827, + "grad_norm": 1.0853751816637047, + "learning_rate": 0.00027779256695365493, + "loss": 7.0355, + "step": 12929 + }, + { + "epoch": 1.20649435476346, + "grad_norm": 0.5339443454124764, + "learning_rate": 0.00027778860622408846, + "loss": 7.2851, + "step": 12930 + }, + { + "epoch": 1.2065876644583373, + "grad_norm": 0.8092821115586813, + "learning_rate": 0.00027778464516959347, + "loss": 6.8483, + "step": 12931 + }, + { + "epoch": 1.2066809741532145, + "grad_norm": 0.7634278120163566, + "learning_rate": 0.00027778068379017996, + "loss": 7.0579, + "step": 12932 + }, + { + "epoch": 1.206774283848092, + "grad_norm": 1.5757406342930422, + "learning_rate": 0.000277776722085858, + "loss": 7.3096, + "step": 12933 + }, + { + "epoch": 1.2068675935429691, + "grad_norm": 0.4436589086901424, + "learning_rate": 0.00027777276005663766, + "loss": 7.0492, + "step": 12934 + }, + { + "epoch": 1.2069609032378463, + "grad_norm": 0.4362568429177662, + "learning_rate": 0.0002777687977025291, + "loss": 7.3047, + "step": 12935 + }, + { + "epoch": 1.2070542129327237, + "grad_norm": 1.2345663237469513, + "learning_rate": 0.0002777648350235423, + "loss": 7.4214, + "step": 12936 + }, + { + "epoch": 1.207147522627601, + "grad_norm": 1.3403534255275684, + "learning_rate": 0.0002777608720196874, + "loss": 7.1145, + "step": 12937 + }, + { + "epoch": 1.2072408323224784, + "grad_norm": 0.8402461859337711, + "learning_rate": 0.00027775690869097445, + "loss": 7.1782, + "step": 12938 + }, + { + "epoch": 1.2073341420173556, + "grad_norm": 0.46052161481050174, + "learning_rate": 0.0002777529450374135, + "loss": 7.1657, + "step": 12939 + }, + { + "epoch": 1.207427451712233, + "grad_norm": 1.0605022217604667, + "learning_rate": 0.00027774898105901465, + "loss": 7.3826, + "step": 12940 + }, + { + "epoch": 1.2075207614071102, + "grad_norm": 1.1421779882488197, + "learning_rate": 0.000277745016755788, + "loss": 7.39, + "step": 12941 + }, + { + "epoch": 1.2076140711019874, + "grad_norm": 0.8993283534754066, + "learning_rate": 0.00027774105212774355, + "loss": 7.2231, + "step": 12942 + }, + { + "epoch": 1.2077073807968648, + "grad_norm": 0.4323448234012849, + "learning_rate": 0.00027773708717489156, + "loss": 7.2794, + "step": 12943 + }, + { + "epoch": 1.2078006904917422, + "grad_norm": 0.6661290286437569, + "learning_rate": 0.00027773312189724187, + "loss": 7.4223, + "step": 12944 + }, + { + "epoch": 1.2078940001866194, + "grad_norm": 0.460045398383688, + "learning_rate": 0.0002777291562948048, + "loss": 7.7911, + "step": 12945 + }, + { + "epoch": 1.2079873098814966, + "grad_norm": 1.4028167378903689, + "learning_rate": 0.0002777251903675903, + "loss": 7.3293, + "step": 12946 + }, + { + "epoch": 1.208080619576374, + "grad_norm": 1.013527962335739, + "learning_rate": 0.00027772122411560843, + "loss": 7.2561, + "step": 12947 + }, + { + "epoch": 1.2081739292712512, + "grad_norm": 1.5697038693914342, + "learning_rate": 0.0002777172575388693, + "loss": 7.4929, + "step": 12948 + }, + { + "epoch": 1.2082672389661286, + "grad_norm": 0.6845700872336815, + "learning_rate": 0.0002777132906373831, + "loss": 7.3269, + "step": 12949 + }, + { + "epoch": 1.2083605486610058, + "grad_norm": 0.7633545711265789, + "learning_rate": 0.0002777093234111598, + "loss": 7.2169, + "step": 12950 + }, + { + "epoch": 1.2084538583558833, + "grad_norm": 1.1424349138039462, + "learning_rate": 0.0002777053558602095, + "loss": 7.4006, + "step": 12951 + }, + { + "epoch": 1.2085471680507605, + "grad_norm": 0.6606602481215437, + "learning_rate": 0.0002777013879845423, + "loss": 7.3657, + "step": 12952 + }, + { + "epoch": 1.2086404777456377, + "grad_norm": 0.9850385709397781, + "learning_rate": 0.00027769741978416834, + "loss": 7.6021, + "step": 12953 + }, + { + "epoch": 1.208733787440515, + "grad_norm": 0.5507333327969307, + "learning_rate": 0.00027769345125909766, + "loss": 7.3395, + "step": 12954 + }, + { + "epoch": 1.2088270971353925, + "grad_norm": 0.8915821465632713, + "learning_rate": 0.00027768948240934036, + "loss": 7.39, + "step": 12955 + }, + { + "epoch": 1.2089204068302697, + "grad_norm": 8.293323883288357, + "learning_rate": 0.0002776855132349065, + "loss": 7.3543, + "step": 12956 + }, + { + "epoch": 1.2090137165251469, + "grad_norm": 0.9767306276127846, + "learning_rate": 0.0002776815437358062, + "loss": 7.5575, + "step": 12957 + }, + { + "epoch": 1.2091070262200243, + "grad_norm": 0.7531542170842442, + "learning_rate": 0.0002776775739120496, + "loss": 7.4308, + "step": 12958 + }, + { + "epoch": 1.2092003359149015, + "grad_norm": 1.198939965116937, + "learning_rate": 0.00027767360376364675, + "loss": 7.3978, + "step": 12959 + }, + { + "epoch": 1.209293645609779, + "grad_norm": 1.2620436601929756, + "learning_rate": 0.0002776696332906077, + "loss": 7.5032, + "step": 12960 + }, + { + "epoch": 1.2093869553046561, + "grad_norm": 1.0071233530374766, + "learning_rate": 0.00027766566249294263, + "loss": 7.2835, + "step": 12961 + }, + { + "epoch": 1.2094802649995335, + "grad_norm": 0.5121777594720766, + "learning_rate": 0.0002776616913706616, + "loss": 7.2475, + "step": 12962 + }, + { + "epoch": 1.2095735746944107, + "grad_norm": 0.5774900397175156, + "learning_rate": 0.0002776577199237747, + "loss": 7.4651, + "step": 12963 + }, + { + "epoch": 1.209666884389288, + "grad_norm": 12.845710485350489, + "learning_rate": 0.00027765374815229196, + "loss": 7.0019, + "step": 12964 + }, + { + "epoch": 1.2097601940841654, + "grad_norm": 1.5429315634430576, + "learning_rate": 0.00027764977605622366, + "loss": 6.9733, + "step": 12965 + }, + { + "epoch": 1.2098535037790425, + "grad_norm": 6.554954431659358, + "learning_rate": 0.0002776458036355797, + "loss": 7.2551, + "step": 12966 + }, + { + "epoch": 1.20994681347392, + "grad_norm": 1.552376973042127, + "learning_rate": 0.00027764183089037036, + "loss": 7.6326, + "step": 12967 + }, + { + "epoch": 1.2100401231687972, + "grad_norm": 1.1131296528050736, + "learning_rate": 0.0002776378578206056, + "loss": 7.4123, + "step": 12968 + }, + { + "epoch": 1.2101334328636746, + "grad_norm": 0.5548368128841903, + "learning_rate": 0.0002776338844262956, + "loss": 7.3635, + "step": 12969 + }, + { + "epoch": 1.2102267425585518, + "grad_norm": 1.4641161923751465, + "learning_rate": 0.0002776299107074504, + "loss": 7.4363, + "step": 12970 + }, + { + "epoch": 1.2103200522534292, + "grad_norm": 1.417650129869161, + "learning_rate": 0.00027762593666408016, + "loss": 7.2179, + "step": 12971 + }, + { + "epoch": 1.2104133619483064, + "grad_norm": 0.8040953921474939, + "learning_rate": 0.00027762196229619496, + "loss": 7.5092, + "step": 12972 + }, + { + "epoch": 1.2105066716431838, + "grad_norm": 0.7522018243187019, + "learning_rate": 0.000277617987603805, + "loss": 6.9751, + "step": 12973 + }, + { + "epoch": 1.210599981338061, + "grad_norm": 2.0472252573373195, + "learning_rate": 0.0002776140125869202, + "loss": 7.004, + "step": 12974 + }, + { + "epoch": 1.2106932910329382, + "grad_norm": 1.891762083672078, + "learning_rate": 0.0002776100372455508, + "loss": 7.3268, + "step": 12975 + }, + { + "epoch": 1.2107866007278156, + "grad_norm": 2.1933593200443373, + "learning_rate": 0.0002776060615797069, + "loss": 7.3937, + "step": 12976 + }, + { + "epoch": 1.2108799104226928, + "grad_norm": 1.9962290786969534, + "learning_rate": 0.0002776020855893986, + "loss": 7.4616, + "step": 12977 + }, + { + "epoch": 1.2109732201175702, + "grad_norm": 0.873407178321675, + "learning_rate": 0.0002775981092746359, + "loss": 7.2017, + "step": 12978 + }, + { + "epoch": 1.2110665298124474, + "grad_norm": 0.9400054006031966, + "learning_rate": 0.0002775941326354291, + "loss": 7.2445, + "step": 12979 + }, + { + "epoch": 1.2111598395073249, + "grad_norm": 0.8203781651189977, + "learning_rate": 0.00027759015567178815, + "loss": 7.364, + "step": 12980 + }, + { + "epoch": 1.211253149202202, + "grad_norm": 1.0314952122292849, + "learning_rate": 0.0002775861783837233, + "loss": 7.4269, + "step": 12981 + }, + { + "epoch": 1.2113464588970795, + "grad_norm": 0.44118005632089086, + "learning_rate": 0.00027758220077124453, + "loss": 7.5073, + "step": 12982 + }, + { + "epoch": 1.2114397685919567, + "grad_norm": 0.4753319908655554, + "learning_rate": 0.0002775782228343621, + "loss": 7.2397, + "step": 12983 + }, + { + "epoch": 1.211533078286834, + "grad_norm": 0.9061058473470701, + "learning_rate": 0.0002775742445730859, + "loss": 7.446, + "step": 12984 + }, + { + "epoch": 1.2116263879817113, + "grad_norm": 1.1741810324096675, + "learning_rate": 0.0002775702659874263, + "loss": 7.4298, + "step": 12985 + }, + { + "epoch": 1.2117196976765885, + "grad_norm": 3.657272700998098, + "learning_rate": 0.0002775662870773933, + "loss": 7.2324, + "step": 12986 + }, + { + "epoch": 1.211813007371466, + "grad_norm": 1.4621326053755075, + "learning_rate": 0.00027756230784299694, + "loss": 7.6708, + "step": 12987 + }, + { + "epoch": 1.211906317066343, + "grad_norm": 1.0961836655943384, + "learning_rate": 0.0002775583282842475, + "loss": 7.13, + "step": 12988 + }, + { + "epoch": 1.2119996267612205, + "grad_norm": 0.632345886724395, + "learning_rate": 0.000277554348401155, + "loss": 7.6038, + "step": 12989 + }, + { + "epoch": 1.2120929364560977, + "grad_norm": 0.7262068614147359, + "learning_rate": 0.00027755036819372957, + "loss": 7.4637, + "step": 12990 + }, + { + "epoch": 1.2121862461509751, + "grad_norm": 1.1995557798159229, + "learning_rate": 0.0002775463876619813, + "loss": 7.2574, + "step": 12991 + }, + { + "epoch": 1.2122795558458523, + "grad_norm": 1.5341986714361013, + "learning_rate": 0.00027754240680592036, + "loss": 7.4209, + "step": 12992 + }, + { + "epoch": 1.2123728655407298, + "grad_norm": 2.866180577893761, + "learning_rate": 0.0002775384256255569, + "loss": 6.9503, + "step": 12993 + }, + { + "epoch": 1.212466175235607, + "grad_norm": 0.7612567391995922, + "learning_rate": 0.000277534444120901, + "loss": 7.152, + "step": 12994 + }, + { + "epoch": 1.2125594849304844, + "grad_norm": 5.411813714019693, + "learning_rate": 0.00027753046229196275, + "loss": 7.2214, + "step": 12995 + }, + { + "epoch": 1.2126527946253616, + "grad_norm": 0.9172767113510384, + "learning_rate": 0.0002775264801387523, + "loss": 6.9806, + "step": 12996 + }, + { + "epoch": 1.2127461043202388, + "grad_norm": 9.937104604251642, + "learning_rate": 0.00027752249766127985, + "loss": 6.9387, + "step": 12997 + }, + { + "epoch": 1.2128394140151162, + "grad_norm": 0.6133729205770452, + "learning_rate": 0.0002775185148595554, + "loss": 6.9878, + "step": 12998 + }, + { + "epoch": 1.2129327237099934, + "grad_norm": 0.5086480357028206, + "learning_rate": 0.00027751453173358915, + "loss": 7.4495, + "step": 12999 + }, + { + "epoch": 1.2130260334048708, + "grad_norm": 1.5617444989137712, + "learning_rate": 0.00027751054828339125, + "loss": 7.0286, + "step": 13000 + }, + { + "epoch": 1.213119343099748, + "grad_norm": 1.5857143087085102, + "learning_rate": 0.0002775065645089718, + "loss": 7.1896, + "step": 13001 + }, + { + "epoch": 1.2132126527946254, + "grad_norm": 0.8212847484224162, + "learning_rate": 0.00027750258041034094, + "loss": 7.2409, + "step": 13002 + }, + { + "epoch": 1.2133059624895026, + "grad_norm": 0.3986641124900962, + "learning_rate": 0.00027749859598750874, + "loss": 7.378, + "step": 13003 + }, + { + "epoch": 1.21339927218438, + "grad_norm": 0.7537920042076233, + "learning_rate": 0.0002774946112404854, + "loss": 7.3857, + "step": 13004 + }, + { + "epoch": 1.2134925818792572, + "grad_norm": 0.6368917271539227, + "learning_rate": 0.000277490626169281, + "loss": 7.0986, + "step": 13005 + }, + { + "epoch": 1.2135858915741347, + "grad_norm": 4.524163823885629, + "learning_rate": 0.00027748664077390574, + "loss": 7.4038, + "step": 13006 + }, + { + "epoch": 1.2136792012690119, + "grad_norm": 1.6600706965881025, + "learning_rate": 0.00027748265505436975, + "loss": 7.1991, + "step": 13007 + }, + { + "epoch": 1.213772510963889, + "grad_norm": 2.619702664296748, + "learning_rate": 0.00027747866901068306, + "loss": 7.4707, + "step": 13008 + }, + { + "epoch": 1.2138658206587665, + "grad_norm": 5.808985992852247, + "learning_rate": 0.0002774746826428559, + "loss": 7.1664, + "step": 13009 + }, + { + "epoch": 1.2139591303536437, + "grad_norm": 1.4459063069803175, + "learning_rate": 0.0002774706959508984, + "loss": 7.335, + "step": 13010 + }, + { + "epoch": 1.214052440048521, + "grad_norm": 1.6012823053244805, + "learning_rate": 0.00027746670893482065, + "loss": 7.3109, + "step": 13011 + }, + { + "epoch": 1.2141457497433983, + "grad_norm": 1.2587793253650836, + "learning_rate": 0.00027746272159463286, + "loss": 7.1016, + "step": 13012 + }, + { + "epoch": 1.2142390594382757, + "grad_norm": 1.4093035154866713, + "learning_rate": 0.00027745873393034506, + "loss": 7.1069, + "step": 13013 + }, + { + "epoch": 1.214332369133153, + "grad_norm": 2.367517725372835, + "learning_rate": 0.00027745474594196754, + "loss": 7.419, + "step": 13014 + }, + { + "epoch": 1.2144256788280303, + "grad_norm": 1.5371287757990086, + "learning_rate": 0.00027745075762951034, + "loss": 7.0682, + "step": 13015 + }, + { + "epoch": 1.2145189885229075, + "grad_norm": 0.9842849420090929, + "learning_rate": 0.00027744676899298353, + "loss": 7.3462, + "step": 13016 + }, + { + "epoch": 1.214612298217785, + "grad_norm": 1.1892869995750919, + "learning_rate": 0.00027744278003239743, + "loss": 7.3218, + "step": 13017 + }, + { + "epoch": 1.2147056079126621, + "grad_norm": 1.3635756015698093, + "learning_rate": 0.00027743879074776205, + "loss": 7.3427, + "step": 13018 + }, + { + "epoch": 1.2147989176075393, + "grad_norm": 1.6418945760574644, + "learning_rate": 0.0002774348011390876, + "loss": 7.3208, + "step": 13019 + }, + { + "epoch": 1.2148922273024167, + "grad_norm": 1.4189941840153197, + "learning_rate": 0.0002774308112063842, + "loss": 7.2805, + "step": 13020 + }, + { + "epoch": 1.214985536997294, + "grad_norm": 0.8753913332279671, + "learning_rate": 0.000277426820949662, + "loss": 7.6376, + "step": 13021 + }, + { + "epoch": 1.2150788466921714, + "grad_norm": 0.8051501916380063, + "learning_rate": 0.0002774228303689311, + "loss": 7.3968, + "step": 13022 + }, + { + "epoch": 1.2151721563870486, + "grad_norm": 1.0976673517655855, + "learning_rate": 0.0002774188394642017, + "loss": 7.4886, + "step": 13023 + }, + { + "epoch": 1.215265466081926, + "grad_norm": 0.6586722358499029, + "learning_rate": 0.00027741484823548394, + "loss": 7.1886, + "step": 13024 + }, + { + "epoch": 1.2153587757768032, + "grad_norm": 0.6488497747790064, + "learning_rate": 0.000277410856682788, + "loss": 7.4384, + "step": 13025 + }, + { + "epoch": 1.2154520854716806, + "grad_norm": 0.7192426448190116, + "learning_rate": 0.00027740686480612396, + "loss": 7.5195, + "step": 13026 + }, + { + "epoch": 1.2155453951665578, + "grad_norm": 0.6345949282606713, + "learning_rate": 0.000277402872605502, + "loss": 7.4653, + "step": 13027 + }, + { + "epoch": 1.2156387048614352, + "grad_norm": 0.7677574940108034, + "learning_rate": 0.0002773988800809323, + "loss": 7.2557, + "step": 13028 + }, + { + "epoch": 1.2157320145563124, + "grad_norm": 1.0131671761490744, + "learning_rate": 0.00027739488723242493, + "loss": 7.0352, + "step": 13029 + }, + { + "epoch": 1.2158253242511896, + "grad_norm": 0.6692536690562737, + "learning_rate": 0.00027739089405999013, + "loss": 7.3404, + "step": 13030 + }, + { + "epoch": 1.215918633946067, + "grad_norm": 1.0708193911464863, + "learning_rate": 0.00027738690056363806, + "loss": 7.225, + "step": 13031 + }, + { + "epoch": 1.2160119436409442, + "grad_norm": 1.4411908047723963, + "learning_rate": 0.0002773829067433788, + "loss": 7.3016, + "step": 13032 + }, + { + "epoch": 1.2161052533358216, + "grad_norm": 1.0170260643001412, + "learning_rate": 0.00027737891259922254, + "loss": 7.1919, + "step": 13033 + }, + { + "epoch": 1.2161985630306988, + "grad_norm": 0.7149774717240162, + "learning_rate": 0.0002773749181311795, + "loss": 7.2586, + "step": 13034 + }, + { + "epoch": 1.2162918727255763, + "grad_norm": 0.7250262659982715, + "learning_rate": 0.00027737092333925963, + "loss": 7.1588, + "step": 13035 + }, + { + "epoch": 1.2163851824204535, + "grad_norm": 0.5514713790948266, + "learning_rate": 0.0002773669282234733, + "loss": 7.5653, + "step": 13036 + }, + { + "epoch": 1.2164784921153309, + "grad_norm": 1.97602275821295, + "learning_rate": 0.00027736293278383066, + "loss": 7.2009, + "step": 13037 + }, + { + "epoch": 1.216571801810208, + "grad_norm": 0.697633712755864, + "learning_rate": 0.00027735893702034175, + "loss": 7.4406, + "step": 13038 + }, + { + "epoch": 1.2166651115050855, + "grad_norm": 1.0392020197282894, + "learning_rate": 0.0002773549409330169, + "loss": 7.1387, + "step": 13039 + }, + { + "epoch": 1.2167584211999627, + "grad_norm": 0.5758487824551376, + "learning_rate": 0.000277350944521866, + "loss": 7.1611, + "step": 13040 + }, + { + "epoch": 1.21685173089484, + "grad_norm": 0.5271594946278434, + "learning_rate": 0.00027734694778689946, + "loss": 7.167, + "step": 13041 + }, + { + "epoch": 1.2169450405897173, + "grad_norm": 0.6281067934784079, + "learning_rate": 0.00027734295072812733, + "loss": 7.0318, + "step": 13042 + }, + { + "epoch": 1.2170383502845945, + "grad_norm": 0.9018440815740454, + "learning_rate": 0.0002773389533455598, + "loss": 7.3929, + "step": 13043 + }, + { + "epoch": 1.217131659979472, + "grad_norm": 0.5804727050709695, + "learning_rate": 0.00027733495563920705, + "loss": 6.8866, + "step": 13044 + }, + { + "epoch": 1.2172249696743491, + "grad_norm": 0.48433899172121253, + "learning_rate": 0.0002773309576090792, + "loss": 7.3489, + "step": 13045 + }, + { + "epoch": 1.2173182793692265, + "grad_norm": 0.5204186343724767, + "learning_rate": 0.00027732695925518645, + "loss": 7.1438, + "step": 13046 + }, + { + "epoch": 1.2174115890641037, + "grad_norm": 1.238893382013043, + "learning_rate": 0.000277322960577539, + "loss": 7.4307, + "step": 13047 + }, + { + "epoch": 1.217504898758981, + "grad_norm": 0.9819302218291812, + "learning_rate": 0.0002773189615761469, + "loss": 7.1067, + "step": 13048 + }, + { + "epoch": 1.2175982084538584, + "grad_norm": 1.255819874052761, + "learning_rate": 0.0002773149622510205, + "loss": 7.1554, + "step": 13049 + }, + { + "epoch": 1.2176915181487358, + "grad_norm": 0.5964967332971718, + "learning_rate": 0.0002773109626021698, + "loss": 7.3934, + "step": 13050 + }, + { + "epoch": 1.217784827843613, + "grad_norm": 0.4497449548220234, + "learning_rate": 0.000277306962629605, + "loss": 7.2953, + "step": 13051 + }, + { + "epoch": 1.2178781375384902, + "grad_norm": 0.5002727618502967, + "learning_rate": 0.0002773029623333364, + "loss": 7.3253, + "step": 13052 + }, + { + "epoch": 1.2179714472333676, + "grad_norm": 0.7654576018541265, + "learning_rate": 0.000277298961713374, + "loss": 7.3391, + "step": 13053 + }, + { + "epoch": 1.2180647569282448, + "grad_norm": 0.4398072987285431, + "learning_rate": 0.0002772949607697281, + "loss": 7.1612, + "step": 13054 + }, + { + "epoch": 1.2181580666231222, + "grad_norm": 0.545277417757962, + "learning_rate": 0.00027729095950240876, + "loss": 7.264, + "step": 13055 + }, + { + "epoch": 1.2182513763179994, + "grad_norm": 0.8769596423520808, + "learning_rate": 0.00027728695791142625, + "loss": 7.5618, + "step": 13056 + }, + { + "epoch": 1.2183446860128768, + "grad_norm": 0.5420850004367125, + "learning_rate": 0.0002772829559967907, + "loss": 7.5951, + "step": 13057 + }, + { + "epoch": 1.218437995707754, + "grad_norm": 0.8950514614608754, + "learning_rate": 0.0002772789537585123, + "loss": 7.248, + "step": 13058 + }, + { + "epoch": 1.2185313054026312, + "grad_norm": 0.49614266817597524, + "learning_rate": 0.00027727495119660124, + "loss": 7.3218, + "step": 13059 + }, + { + "epoch": 1.2186246150975086, + "grad_norm": 0.41198564101291946, + "learning_rate": 0.00027727094831106767, + "loss": 7.383, + "step": 13060 + }, + { + "epoch": 1.218717924792386, + "grad_norm": 0.6491015337797131, + "learning_rate": 0.00027726694510192177, + "loss": 7.1375, + "step": 13061 + }, + { + "epoch": 1.2188112344872633, + "grad_norm": 0.6564753968252951, + "learning_rate": 0.0002772629415691738, + "loss": 7.0535, + "step": 13062 + }, + { + "epoch": 1.2189045441821404, + "grad_norm": 1.346772862756632, + "learning_rate": 0.00027725893771283377, + "loss": 7.4629, + "step": 13063 + }, + { + "epoch": 1.2189978538770179, + "grad_norm": 0.5844491314123362, + "learning_rate": 0.000277254933532912, + "loss": 7.2233, + "step": 13064 + }, + { + "epoch": 1.219091163571895, + "grad_norm": 1.0344650932835955, + "learning_rate": 0.0002772509290294186, + "loss": 6.9495, + "step": 13065 + }, + { + "epoch": 1.2191844732667725, + "grad_norm": 0.4019269279615775, + "learning_rate": 0.0002772469242023638, + "loss": 7.3491, + "step": 13066 + }, + { + "epoch": 1.2192777829616497, + "grad_norm": 0.8415697202719578, + "learning_rate": 0.0002772429190517578, + "loss": 6.8856, + "step": 13067 + }, + { + "epoch": 1.219371092656527, + "grad_norm": 0.46435297542996806, + "learning_rate": 0.0002772389135776107, + "loss": 7.1682, + "step": 13068 + }, + { + "epoch": 1.2194644023514043, + "grad_norm": 0.9084105253731317, + "learning_rate": 0.0002772349077799327, + "loss": 7.1879, + "step": 13069 + }, + { + "epoch": 1.2195577120462815, + "grad_norm": 1.32830945630015, + "learning_rate": 0.00027723090165873417, + "loss": 7.3466, + "step": 13070 + }, + { + "epoch": 1.219651021741159, + "grad_norm": 0.4732356460004761, + "learning_rate": 0.000277226895214025, + "loss": 7.2577, + "step": 13071 + }, + { + "epoch": 1.2197443314360361, + "grad_norm": 1.1538490356366502, + "learning_rate": 0.0002772228884458156, + "loss": 7.2074, + "step": 13072 + }, + { + "epoch": 1.2198376411309135, + "grad_norm": 0.7592227077921551, + "learning_rate": 0.000277218881354116, + "loss": 7.4307, + "step": 13073 + }, + { + "epoch": 1.2199309508257907, + "grad_norm": 1.346398779699257, + "learning_rate": 0.0002772148739389366, + "loss": 7.0937, + "step": 13074 + }, + { + "epoch": 1.2200242605206681, + "grad_norm": 0.5539235597628988, + "learning_rate": 0.00027721086620028734, + "loss": 7.2878, + "step": 13075 + }, + { + "epoch": 1.2201175702155453, + "grad_norm": 0.5232633656713662, + "learning_rate": 0.0002772068581381786, + "loss": 7.0519, + "step": 13076 + }, + { + "epoch": 1.2202108799104228, + "grad_norm": 2.6344743483787925, + "learning_rate": 0.0002772028497526204, + "loss": 7.268, + "step": 13077 + }, + { + "epoch": 1.2203041896053, + "grad_norm": 1.1236029154067273, + "learning_rate": 0.00027719884104362313, + "loss": 7.2657, + "step": 13078 + }, + { + "epoch": 1.2203974993001774, + "grad_norm": 0.7658826390085526, + "learning_rate": 0.0002771948320111969, + "loss": 7.3698, + "step": 13079 + }, + { + "epoch": 1.2204908089950546, + "grad_norm": 0.4208561207040724, + "learning_rate": 0.00027719082265535185, + "loss": 7.372, + "step": 13080 + }, + { + "epoch": 1.2205841186899318, + "grad_norm": 1.1567983230403354, + "learning_rate": 0.0002771868129760982, + "loss": 6.9042, + "step": 13081 + }, + { + "epoch": 1.2206774283848092, + "grad_norm": 0.40146846795367686, + "learning_rate": 0.00027718280297344615, + "loss": 7.2374, + "step": 13082 + }, + { + "epoch": 1.2207707380796864, + "grad_norm": 0.7333391104025536, + "learning_rate": 0.000277178792647406, + "loss": 7.0383, + "step": 13083 + }, + { + "epoch": 1.2208640477745638, + "grad_norm": 1.606621179021968, + "learning_rate": 0.0002771747819979877, + "loss": 7.5819, + "step": 13084 + }, + { + "epoch": 1.220957357469441, + "grad_norm": 1.1400218409769036, + "learning_rate": 0.0002771707710252017, + "loss": 7.1043, + "step": 13085 + }, + { + "epoch": 1.2210506671643184, + "grad_norm": 1.4019602919298884, + "learning_rate": 0.0002771667597290581, + "loss": 7.8936, + "step": 13086 + }, + { + "epoch": 1.2211439768591956, + "grad_norm": 1.2513778558672106, + "learning_rate": 0.00027716274810956715, + "loss": 7.1431, + "step": 13087 + }, + { + "epoch": 1.221237286554073, + "grad_norm": 1.8117919211658562, + "learning_rate": 0.0002771587361667389, + "loss": 7.4841, + "step": 13088 + }, + { + "epoch": 1.2213305962489502, + "grad_norm": 1.7313022104165212, + "learning_rate": 0.0002771547239005837, + "loss": 7.0791, + "step": 13089 + }, + { + "epoch": 1.2214239059438277, + "grad_norm": 1.0739047410260345, + "learning_rate": 0.0002771507113111117, + "loss": 7.87, + "step": 13090 + }, + { + "epoch": 1.2215172156387049, + "grad_norm": 0.6793627355215353, + "learning_rate": 0.0002771466983983331, + "loss": 7.4134, + "step": 13091 + }, + { + "epoch": 1.221610525333582, + "grad_norm": 0.9104697837155192, + "learning_rate": 0.0002771426851622581, + "loss": 7.3958, + "step": 13092 + }, + { + "epoch": 1.2217038350284595, + "grad_norm": 0.5390413494403, + "learning_rate": 0.0002771386716028969, + "loss": 6.8688, + "step": 13093 + }, + { + "epoch": 1.2217971447233367, + "grad_norm": 0.640153758284687, + "learning_rate": 0.00027713465772025977, + "loss": 7.3156, + "step": 13094 + }, + { + "epoch": 1.221890454418214, + "grad_norm": 0.6265100938262825, + "learning_rate": 0.0002771306435143568, + "loss": 7.0566, + "step": 13095 + }, + { + "epoch": 1.2219837641130913, + "grad_norm": 0.4116087194840479, + "learning_rate": 0.00027712662898519833, + "loss": 7.5212, + "step": 13096 + }, + { + "epoch": 1.2220770738079687, + "grad_norm": 0.5476357687012731, + "learning_rate": 0.00027712261413279446, + "loss": 7.2711, + "step": 13097 + }, + { + "epoch": 1.222170383502846, + "grad_norm": 0.8429881527804461, + "learning_rate": 0.00027711859895715544, + "loss": 6.8722, + "step": 13098 + }, + { + "epoch": 1.2222636931977233, + "grad_norm": 0.8382503930036808, + "learning_rate": 0.00027711458345829146, + "loss": 7.2736, + "step": 13099 + }, + { + "epoch": 1.2223570028926005, + "grad_norm": 1.1993979969147879, + "learning_rate": 0.0002771105676362128, + "loss": 7.3357, + "step": 13100 + }, + { + "epoch": 1.222450312587478, + "grad_norm": 0.524745759549784, + "learning_rate": 0.0002771065514909296, + "loss": 7.1707, + "step": 13101 + }, + { + "epoch": 1.2225436222823551, + "grad_norm": 0.5932769521504561, + "learning_rate": 0.0002771025350224521, + "loss": 7.6088, + "step": 13102 + }, + { + "epoch": 1.2226369319772323, + "grad_norm": 1.0078589109905216, + "learning_rate": 0.0002770985182307905, + "loss": 7.5888, + "step": 13103 + }, + { + "epoch": 1.2227302416721098, + "grad_norm": 0.5230082980005849, + "learning_rate": 0.00027709450111595497, + "loss": 7.4364, + "step": 13104 + }, + { + "epoch": 1.222823551366987, + "grad_norm": 0.6339107658416138, + "learning_rate": 0.00027709048367795583, + "loss": 7.5043, + "step": 13105 + }, + { + "epoch": 1.2229168610618644, + "grad_norm": 0.742280503754259, + "learning_rate": 0.0002770864659168032, + "loss": 7.3559, + "step": 13106 + }, + { + "epoch": 1.2230101707567416, + "grad_norm": 0.6184348097195823, + "learning_rate": 0.00027708244783250735, + "loss": 7.0969, + "step": 13107 + }, + { + "epoch": 1.223103480451619, + "grad_norm": 0.4504642024059277, + "learning_rate": 0.0002770784294250785, + "loss": 7.4155, + "step": 13108 + }, + { + "epoch": 1.2231967901464962, + "grad_norm": 0.8250914998670802, + "learning_rate": 0.0002770744106945268, + "loss": 6.924, + "step": 13109 + }, + { + "epoch": 1.2232900998413736, + "grad_norm": 2.193698072823557, + "learning_rate": 0.00027707039164086256, + "loss": 7.2523, + "step": 13110 + }, + { + "epoch": 1.2233834095362508, + "grad_norm": 1.1639956116204253, + "learning_rate": 0.000277066372264096, + "loss": 7.0155, + "step": 13111 + }, + { + "epoch": 1.2234767192311282, + "grad_norm": 1.090402651042926, + "learning_rate": 0.0002770623525642372, + "loss": 7.2037, + "step": 13112 + }, + { + "epoch": 1.2235700289260054, + "grad_norm": 0.6580549944151441, + "learning_rate": 0.0002770583325412965, + "loss": 6.9525, + "step": 13113 + }, + { + "epoch": 1.2236633386208826, + "grad_norm": 0.9322628075614833, + "learning_rate": 0.0002770543121952841, + "loss": 7.2241, + "step": 13114 + }, + { + "epoch": 1.22375664831576, + "grad_norm": 1.3880334883270335, + "learning_rate": 0.0002770502915262102, + "loss": 7.333, + "step": 13115 + }, + { + "epoch": 1.2238499580106372, + "grad_norm": 0.5313112315008235, + "learning_rate": 0.0002770462705340851, + "loss": 7.4754, + "step": 13116 + }, + { + "epoch": 1.2239432677055146, + "grad_norm": 0.5216839756871419, + "learning_rate": 0.0002770422492189189, + "loss": 7.114, + "step": 13117 + }, + { + "epoch": 1.2240365774003918, + "grad_norm": 1.420248172011949, + "learning_rate": 0.00027703822758072197, + "loss": 6.8505, + "step": 13118 + }, + { + "epoch": 1.2241298870952693, + "grad_norm": 0.695377589645076, + "learning_rate": 0.0002770342056195044, + "loss": 7.1165, + "step": 13119 + }, + { + "epoch": 1.2242231967901465, + "grad_norm": 0.9338542037254274, + "learning_rate": 0.0002770301833352765, + "loss": 7.223, + "step": 13120 + }, + { + "epoch": 1.2243165064850239, + "grad_norm": 1.247945239125434, + "learning_rate": 0.0002770261607280485, + "loss": 7.1886, + "step": 13121 + }, + { + "epoch": 1.224409816179901, + "grad_norm": 0.6274395356758411, + "learning_rate": 0.0002770221377978305, + "loss": 7.0913, + "step": 13122 + }, + { + "epoch": 1.2245031258747785, + "grad_norm": 0.4362082480342837, + "learning_rate": 0.0002770181145446329, + "loss": 7.2984, + "step": 13123 + }, + { + "epoch": 1.2245964355696557, + "grad_norm": 0.9891029373403938, + "learning_rate": 0.0002770140909684659, + "loss": 7.1745, + "step": 13124 + }, + { + "epoch": 1.224689745264533, + "grad_norm": 0.6553286697425439, + "learning_rate": 0.00027701006706933967, + "loss": 7.5502, + "step": 13125 + }, + { + "epoch": 1.2247830549594103, + "grad_norm": 0.7712112488227812, + "learning_rate": 0.0002770060428472644, + "loss": 7.2838, + "step": 13126 + }, + { + "epoch": 1.2248763646542875, + "grad_norm": 0.602823566440981, + "learning_rate": 0.00027700201830225044, + "loss": 7.2357, + "step": 13127 + }, + { + "epoch": 1.224969674349165, + "grad_norm": 0.936757927611563, + "learning_rate": 0.000276997993434308, + "loss": 7.0975, + "step": 13128 + }, + { + "epoch": 1.2250629840440421, + "grad_norm": 1.732773463656141, + "learning_rate": 0.0002769939682434472, + "loss": 7.491, + "step": 13129 + }, + { + "epoch": 1.2251562937389195, + "grad_norm": 2.9987882597362217, + "learning_rate": 0.0002769899427296784, + "loss": 7.1824, + "step": 13130 + }, + { + "epoch": 1.2252496034337967, + "grad_norm": 0.45324450111242454, + "learning_rate": 0.0002769859168930118, + "loss": 7.5481, + "step": 13131 + }, + { + "epoch": 1.2253429131286742, + "grad_norm": 1.394425311586783, + "learning_rate": 0.00027698189073345757, + "loss": 7.5153, + "step": 13132 + }, + { + "epoch": 1.2254362228235514, + "grad_norm": 1.7826238407028046, + "learning_rate": 0.0002769778642510261, + "loss": 7.4213, + "step": 13133 + }, + { + "epoch": 1.2255295325184288, + "grad_norm": 1.2517766478001886, + "learning_rate": 0.0002769738374457275, + "loss": 7.3359, + "step": 13134 + }, + { + "epoch": 1.225622842213306, + "grad_norm": 1.0627571273028482, + "learning_rate": 0.000276969810317572, + "loss": 7.5581, + "step": 13135 + }, + { + "epoch": 1.2257161519081832, + "grad_norm": 1.5639249627372924, + "learning_rate": 0.0002769657828665699, + "loss": 7.4034, + "step": 13136 + }, + { + "epoch": 1.2258094616030606, + "grad_norm": 2.3044086450633436, + "learning_rate": 0.00027696175509273144, + "loss": 7.3026, + "step": 13137 + }, + { + "epoch": 1.2259027712979378, + "grad_norm": 0.9713095879219333, + "learning_rate": 0.0002769577269960669, + "loss": 7.291, + "step": 13138 + }, + { + "epoch": 1.2259960809928152, + "grad_norm": 0.9848853915978603, + "learning_rate": 0.00027695369857658636, + "loss": 7.1829, + "step": 13139 + }, + { + "epoch": 1.2260893906876924, + "grad_norm": 0.6143806597981534, + "learning_rate": 0.00027694966983430024, + "loss": 7.341, + "step": 13140 + }, + { + "epoch": 1.2261827003825698, + "grad_norm": 2.60679230530309, + "learning_rate": 0.0002769456407692187, + "loss": 6.9772, + "step": 13141 + }, + { + "epoch": 1.226276010077447, + "grad_norm": 1.8575817860982264, + "learning_rate": 0.000276941611381352, + "loss": 6.9985, + "step": 13142 + }, + { + "epoch": 1.2263693197723244, + "grad_norm": 0.5073633141511502, + "learning_rate": 0.00027693758167071034, + "loss": 6.9467, + "step": 13143 + }, + { + "epoch": 1.2264626294672016, + "grad_norm": 2.4366256389150274, + "learning_rate": 0.0002769335516373041, + "loss": 7.1478, + "step": 13144 + }, + { + "epoch": 1.226555939162079, + "grad_norm": 2.976556584979188, + "learning_rate": 0.0002769295212811434, + "loss": 7.1113, + "step": 13145 + }, + { + "epoch": 1.2266492488569563, + "grad_norm": 1.6253609421941764, + "learning_rate": 0.00027692549060223854, + "loss": 7.0947, + "step": 13146 + }, + { + "epoch": 1.2267425585518335, + "grad_norm": 1.0306260226780117, + "learning_rate": 0.0002769214596005997, + "loss": 7.0997, + "step": 13147 + }, + { + "epoch": 1.2268358682467109, + "grad_norm": 1.9955320929914413, + "learning_rate": 0.00027691742827623723, + "loss": 7.1226, + "step": 13148 + }, + { + "epoch": 1.226929177941588, + "grad_norm": 2.0529915080103627, + "learning_rate": 0.00027691339662916137, + "loss": 7.159, + "step": 13149 + }, + { + "epoch": 1.2270224876364655, + "grad_norm": 1.1709082693389654, + "learning_rate": 0.00027690936465938227, + "loss": 7.2924, + "step": 13150 + }, + { + "epoch": 1.2271157973313427, + "grad_norm": 0.4123908710407463, + "learning_rate": 0.0002769053323669103, + "loss": 7.53, + "step": 13151 + }, + { + "epoch": 1.22720910702622, + "grad_norm": 2.922068260728162, + "learning_rate": 0.0002769012997517556, + "loss": 7.4444, + "step": 13152 + }, + { + "epoch": 1.2273024167210973, + "grad_norm": 0.9584859783243843, + "learning_rate": 0.0002768972668139285, + "loss": 7.1191, + "step": 13153 + }, + { + "epoch": 1.2273957264159745, + "grad_norm": 1.3891668057403246, + "learning_rate": 0.0002768932335534393, + "loss": 7.6432, + "step": 13154 + }, + { + "epoch": 1.227489036110852, + "grad_norm": 0.7321672507271597, + "learning_rate": 0.0002768891999702982, + "loss": 7.3496, + "step": 13155 + }, + { + "epoch": 1.2275823458057293, + "grad_norm": 1.1718372425719523, + "learning_rate": 0.00027688516606451537, + "loss": 7.3053, + "step": 13156 + }, + { + "epoch": 1.2276756555006065, + "grad_norm": 1.7723645379384718, + "learning_rate": 0.00027688113183610115, + "loss": 7.2044, + "step": 13157 + }, + { + "epoch": 1.2277689651954837, + "grad_norm": 0.4744126655515647, + "learning_rate": 0.0002768770972850659, + "loss": 7.2688, + "step": 13158 + }, + { + "epoch": 1.2278622748903611, + "grad_norm": 0.7271650170863928, + "learning_rate": 0.0002768730624114197, + "loss": 7.2806, + "step": 13159 + }, + { + "epoch": 1.2279555845852383, + "grad_norm": 1.7762337943399518, + "learning_rate": 0.0002768690272151729, + "loss": 6.9458, + "step": 13160 + }, + { + "epoch": 1.2280488942801158, + "grad_norm": 8.635622904539021, + "learning_rate": 0.0002768649916963357, + "loss": 7.2455, + "step": 13161 + }, + { + "epoch": 1.228142203974993, + "grad_norm": 0.9308641127339528, + "learning_rate": 0.00027686095585491847, + "loss": 7.4632, + "step": 13162 + }, + { + "epoch": 1.2282355136698704, + "grad_norm": 1.2994320283866183, + "learning_rate": 0.0002768569196909314, + "loss": 7.0241, + "step": 13163 + }, + { + "epoch": 1.2283288233647476, + "grad_norm": 2.0157541493607543, + "learning_rate": 0.0002768528832043847, + "loss": 7.1699, + "step": 13164 + }, + { + "epoch": 1.2284221330596248, + "grad_norm": 1.4162863001057084, + "learning_rate": 0.00027684884639528877, + "loss": 7.0781, + "step": 13165 + }, + { + "epoch": 1.2285154427545022, + "grad_norm": 2.511346076366144, + "learning_rate": 0.00027684480926365375, + "loss": 7.3723, + "step": 13166 + }, + { + "epoch": 1.2286087524493796, + "grad_norm": 0.4879536732479861, + "learning_rate": 0.00027684077180949, + "loss": 7.3671, + "step": 13167 + }, + { + "epoch": 1.2287020621442568, + "grad_norm": 1.0158612376811342, + "learning_rate": 0.0002768367340328077, + "loss": 7.1371, + "step": 13168 + }, + { + "epoch": 1.228795371839134, + "grad_norm": 0.8307106994441409, + "learning_rate": 0.0002768326959336172, + "loss": 6.9213, + "step": 13169 + }, + { + "epoch": 1.2288886815340114, + "grad_norm": 1.0962119239574113, + "learning_rate": 0.00027682865751192865, + "loss": 7.3853, + "step": 13170 + }, + { + "epoch": 1.2289819912288886, + "grad_norm": 0.8408885369371741, + "learning_rate": 0.0002768246187677525, + "loss": 6.7321, + "step": 13171 + }, + { + "epoch": 1.229075300923766, + "grad_norm": 0.6691826114955635, + "learning_rate": 0.0002768205797010988, + "loss": 7.0278, + "step": 13172 + }, + { + "epoch": 1.2291686106186432, + "grad_norm": 0.7071120150478057, + "learning_rate": 0.000276816540311978, + "loss": 7.3579, + "step": 13173 + }, + { + "epoch": 1.2292619203135207, + "grad_norm": 0.8765914994420907, + "learning_rate": 0.0002768125006004003, + "loss": 7.5727, + "step": 13174 + }, + { + "epoch": 1.2293552300083979, + "grad_norm": 0.5636490484420206, + "learning_rate": 0.00027680846056637597, + "loss": 7.4006, + "step": 13175 + }, + { + "epoch": 1.229448539703275, + "grad_norm": 2.2923221280995967, + "learning_rate": 0.0002768044202099153, + "loss": 7.1099, + "step": 13176 + }, + { + "epoch": 1.2295418493981525, + "grad_norm": 0.4974726164781193, + "learning_rate": 0.0002768003795310285, + "loss": 7.1472, + "step": 13177 + }, + { + "epoch": 1.2296351590930297, + "grad_norm": 0.4532589630836238, + "learning_rate": 0.00027679633852972595, + "loss": 7.2991, + "step": 13178 + }, + { + "epoch": 1.229728468787907, + "grad_norm": 0.6691834960075673, + "learning_rate": 0.0002767922972060178, + "loss": 7.4777, + "step": 13179 + }, + { + "epoch": 1.2298217784827843, + "grad_norm": 0.41081046227242474, + "learning_rate": 0.00027678825555991446, + "loss": 7.5204, + "step": 13180 + }, + { + "epoch": 1.2299150881776617, + "grad_norm": 0.3740317118780991, + "learning_rate": 0.00027678421359142615, + "loss": 7.3089, + "step": 13181 + }, + { + "epoch": 1.230008397872539, + "grad_norm": 0.4614990097721663, + "learning_rate": 0.00027678017130056316, + "loss": 7.3662, + "step": 13182 + }, + { + "epoch": 1.2301017075674163, + "grad_norm": 0.5194770668902066, + "learning_rate": 0.00027677612868733564, + "loss": 7.1512, + "step": 13183 + }, + { + "epoch": 1.2301950172622935, + "grad_norm": 0.6026761711845552, + "learning_rate": 0.00027677208575175406, + "loss": 7.3178, + "step": 13184 + }, + { + "epoch": 1.230288326957171, + "grad_norm": 0.8158187315993866, + "learning_rate": 0.00027676804249382865, + "loss": 7.3, + "step": 13185 + }, + { + "epoch": 1.2303816366520481, + "grad_norm": 1.3649125547191878, + "learning_rate": 0.0002767639989135696, + "loss": 7.2413, + "step": 13186 + }, + { + "epoch": 1.2304749463469253, + "grad_norm": 0.8119401113646543, + "learning_rate": 0.0002767599550109873, + "loss": 7.5059, + "step": 13187 + }, + { + "epoch": 1.2305682560418028, + "grad_norm": 1.021556456103901, + "learning_rate": 0.0002767559107860919, + "loss": 7.061, + "step": 13188 + }, + { + "epoch": 1.23066156573668, + "grad_norm": 1.0351715845034415, + "learning_rate": 0.0002767518662388938, + "loss": 7.3899, + "step": 13189 + }, + { + "epoch": 1.2307548754315574, + "grad_norm": 1.3494009746910254, + "learning_rate": 0.0002767478213694033, + "loss": 7.294, + "step": 13190 + }, + { + "epoch": 1.2308481851264346, + "grad_norm": 0.8688198887203993, + "learning_rate": 0.0002767437761776306, + "loss": 7.1065, + "step": 13191 + }, + { + "epoch": 1.230941494821312, + "grad_norm": 1.4470655618809845, + "learning_rate": 0.000276739730663586, + "loss": 7.4811, + "step": 13192 + }, + { + "epoch": 1.2310348045161892, + "grad_norm": 1.2494957864195528, + "learning_rate": 0.00027673568482727986, + "loss": 7.0692, + "step": 13193 + }, + { + "epoch": 1.2311281142110666, + "grad_norm": 0.8015953445508304, + "learning_rate": 0.0002767316386687224, + "loss": 7.3545, + "step": 13194 + }, + { + "epoch": 1.2312214239059438, + "grad_norm": 0.5803447926832893, + "learning_rate": 0.00027672759218792396, + "loss": 7.1636, + "step": 13195 + }, + { + "epoch": 1.2313147336008212, + "grad_norm": 2.0501390628945293, + "learning_rate": 0.00027672354538489473, + "loss": 7.2738, + "step": 13196 + }, + { + "epoch": 1.2314080432956984, + "grad_norm": 1.753527361357042, + "learning_rate": 0.00027671949825964505, + "loss": 7.1558, + "step": 13197 + }, + { + "epoch": 1.2315013529905756, + "grad_norm": 0.5064079744637034, + "learning_rate": 0.0002767154508121853, + "loss": 7.3328, + "step": 13198 + }, + { + "epoch": 1.231594662685453, + "grad_norm": 1.497811048032103, + "learning_rate": 0.00027671140304252564, + "loss": 7.1547, + "step": 13199 + }, + { + "epoch": 1.2316879723803302, + "grad_norm": 1.2386359476337865, + "learning_rate": 0.0002767073549506764, + "loss": 6.9995, + "step": 13200 + }, + { + "epoch": 1.2317812820752077, + "grad_norm": 8.119418553822266, + "learning_rate": 0.0002767033065366479, + "loss": 7.3607, + "step": 13201 + }, + { + "epoch": 1.2318745917700848, + "grad_norm": 1.2785742005168874, + "learning_rate": 0.00027669925780045046, + "loss": 7.2012, + "step": 13202 + }, + { + "epoch": 1.2319679014649623, + "grad_norm": 0.5496433151077826, + "learning_rate": 0.00027669520874209434, + "loss": 7.2356, + "step": 13203 + }, + { + "epoch": 1.2320612111598395, + "grad_norm": 0.573196187744085, + "learning_rate": 0.00027669115936158976, + "loss": 7.421, + "step": 13204 + }, + { + "epoch": 1.2321545208547169, + "grad_norm": 0.4986988243720936, + "learning_rate": 0.0002766871096589472, + "loss": 7.1334, + "step": 13205 + }, + { + "epoch": 1.232247830549594, + "grad_norm": 0.7382475560405019, + "learning_rate": 0.0002766830596341768, + "loss": 7.3759, + "step": 13206 + }, + { + "epoch": 1.2323411402444715, + "grad_norm": 0.7420926667891894, + "learning_rate": 0.00027667900928728887, + "loss": 6.7541, + "step": 13207 + }, + { + "epoch": 1.2324344499393487, + "grad_norm": 0.5149954855447308, + "learning_rate": 0.0002766749586182937, + "loss": 6.8518, + "step": 13208 + }, + { + "epoch": 1.232527759634226, + "grad_norm": 0.730343628552881, + "learning_rate": 0.00027667090762720177, + "loss": 6.9987, + "step": 13209 + }, + { + "epoch": 1.2326210693291033, + "grad_norm": 0.7910910773323503, + "learning_rate": 0.0002766668563140231, + "loss": 7.2601, + "step": 13210 + }, + { + "epoch": 1.2327143790239805, + "grad_norm": 0.4983875583619315, + "learning_rate": 0.0002766628046787682, + "loss": 7.2568, + "step": 13211 + }, + { + "epoch": 1.232807688718858, + "grad_norm": 6.650162100452149, + "learning_rate": 0.0002766587527214474, + "loss": 7.206, + "step": 13212 + }, + { + "epoch": 1.2329009984137351, + "grad_norm": 1.7034795093018404, + "learning_rate": 0.0002766547004420708, + "loss": 6.8359, + "step": 13213 + }, + { + "epoch": 1.2329943081086125, + "grad_norm": 0.6212382841007406, + "learning_rate": 0.0002766506478406489, + "loss": 7.2372, + "step": 13214 + }, + { + "epoch": 1.2330876178034897, + "grad_norm": 0.8642963593467365, + "learning_rate": 0.0002766465949171918, + "loss": 7.024, + "step": 13215 + }, + { + "epoch": 1.2331809274983672, + "grad_norm": 0.5169172799902538, + "learning_rate": 0.00027664254167171, + "loss": 7.0962, + "step": 13216 + }, + { + "epoch": 1.2332742371932444, + "grad_norm": 1.0331123024002742, + "learning_rate": 0.00027663848810421373, + "loss": 7.3688, + "step": 13217 + }, + { + "epoch": 1.2333675468881218, + "grad_norm": 1.623351202915333, + "learning_rate": 0.0002766344342147133, + "loss": 7.4799, + "step": 13218 + }, + { + "epoch": 1.233460856582999, + "grad_norm": 0.5577278094615353, + "learning_rate": 0.00027663038000321904, + "loss": 7.1448, + "step": 13219 + }, + { + "epoch": 1.2335541662778762, + "grad_norm": 0.6807613724776153, + "learning_rate": 0.00027662632546974114, + "loss": 7.2386, + "step": 13220 + }, + { + "epoch": 1.2336474759727536, + "grad_norm": 0.5464911852767105, + "learning_rate": 0.00027662227061429006, + "loss": 7.1975, + "step": 13221 + }, + { + "epoch": 1.2337407856676308, + "grad_norm": 0.5563797920261683, + "learning_rate": 0.0002766182154368761, + "loss": 7.0829, + "step": 13222 + }, + { + "epoch": 1.2338340953625082, + "grad_norm": 1.2257000819580377, + "learning_rate": 0.0002766141599375095, + "loss": 7.1138, + "step": 13223 + }, + { + "epoch": 1.2339274050573854, + "grad_norm": 1.6499359624374708, + "learning_rate": 0.0002766101041162006, + "loss": 7.3472, + "step": 13224 + }, + { + "epoch": 1.2340207147522628, + "grad_norm": 0.35674621401355405, + "learning_rate": 0.0002766060479729597, + "loss": 7.1924, + "step": 13225 + }, + { + "epoch": 1.23411402444714, + "grad_norm": 16.69959138919734, + "learning_rate": 0.00027660199150779705, + "loss": 7.1573, + "step": 13226 + }, + { + "epoch": 1.2342073341420174, + "grad_norm": 2.4780929730924153, + "learning_rate": 0.0002765979347207231, + "loss": 7.3162, + "step": 13227 + }, + { + "epoch": 1.2343006438368946, + "grad_norm": 2.6304575450609424, + "learning_rate": 0.00027659387761174814, + "loss": 7.1872, + "step": 13228 + }, + { + "epoch": 1.234393953531772, + "grad_norm": 1.7052871747112748, + "learning_rate": 0.00027658982018088235, + "loss": 7.2968, + "step": 13229 + }, + { + "epoch": 1.2344872632266493, + "grad_norm": 0.9232160130537741, + "learning_rate": 0.00027658576242813623, + "loss": 7.2222, + "step": 13230 + }, + { + "epoch": 1.2345805729215265, + "grad_norm": 2.4497095940991884, + "learning_rate": 0.00027658170435352, + "loss": 7.1659, + "step": 13231 + }, + { + "epoch": 1.2346738826164039, + "grad_norm": 1.6145623125686142, + "learning_rate": 0.000276577645957044, + "loss": 7.0106, + "step": 13232 + }, + { + "epoch": 1.234767192311281, + "grad_norm": 1.8350721686838383, + "learning_rate": 0.0002765735872387185, + "loss": 7.4855, + "step": 13233 + }, + { + "epoch": 1.2348605020061585, + "grad_norm": 0.6048186944706078, + "learning_rate": 0.0002765695281985539, + "loss": 7.0742, + "step": 13234 + }, + { + "epoch": 1.2349538117010357, + "grad_norm": 9052.945205684271, + "learning_rate": 0.00027656546883656044, + "loss": 7.279, + "step": 13235 + }, + { + "epoch": 1.235047121395913, + "grad_norm": 1.5845659080176966, + "learning_rate": 0.00027656140915274856, + "loss": 7.0656, + "step": 13236 + }, + { + "epoch": 1.2351404310907903, + "grad_norm": 475.25353618736926, + "learning_rate": 0.00027655734914712845, + "loss": 7.1254, + "step": 13237 + }, + { + "epoch": 1.2352337407856677, + "grad_norm": 65.04833476180794, + "learning_rate": 0.0002765532888197105, + "loss": 7.1382, + "step": 13238 + }, + { + "epoch": 1.235327050480545, + "grad_norm": 1.095069503799221, + "learning_rate": 0.00027654922817050507, + "loss": 7.0258, + "step": 13239 + }, + { + "epoch": 1.2354203601754223, + "grad_norm": 1.2972398126801272, + "learning_rate": 0.00027654516719952235, + "loss": 6.9968, + "step": 13240 + }, + { + "epoch": 1.2355136698702995, + "grad_norm": 0.6145645758700076, + "learning_rate": 0.00027654110590677277, + "loss": 6.9096, + "step": 13241 + }, + { + "epoch": 1.2356069795651767, + "grad_norm": 0.7521263940087728, + "learning_rate": 0.00027653704429226667, + "loss": 7.2533, + "step": 13242 + }, + { + "epoch": 1.2357002892600542, + "grad_norm": 1.1598088061564114, + "learning_rate": 0.00027653298235601437, + "loss": 7.2129, + "step": 13243 + }, + { + "epoch": 1.2357935989549313, + "grad_norm": 1.6377087495940894, + "learning_rate": 0.0002765289200980262, + "loss": 7.4161, + "step": 13244 + }, + { + "epoch": 1.2358869086498088, + "grad_norm": 1.7380729697095734, + "learning_rate": 0.00027652485751831243, + "loss": 7.1355, + "step": 13245 + }, + { + "epoch": 1.235980218344686, + "grad_norm": 1.2498342082240952, + "learning_rate": 0.0002765207946168834, + "loss": 7.146, + "step": 13246 + }, + { + "epoch": 1.2360735280395634, + "grad_norm": 6.786972454883416, + "learning_rate": 0.0002765167313937495, + "loss": 7.2861, + "step": 13247 + }, + { + "epoch": 1.2361668377344406, + "grad_norm": 1.3836316477894652, + "learning_rate": 0.000276512667848921, + "loss": 7.4259, + "step": 13248 + }, + { + "epoch": 1.236260147429318, + "grad_norm": 0.9099059356890887, + "learning_rate": 0.0002765086039824083, + "loss": 6.9386, + "step": 13249 + }, + { + "epoch": 1.2363534571241952, + "grad_norm": 1.2102669305173235, + "learning_rate": 0.00027650453979422164, + "loss": 7.2257, + "step": 13250 + }, + { + "epoch": 1.2364467668190726, + "grad_norm": 0.9798907493134192, + "learning_rate": 0.0002765004752843715, + "loss": 7.0992, + "step": 13251 + }, + { + "epoch": 1.2365400765139498, + "grad_norm": 1.0942574255406388, + "learning_rate": 0.00027649641045286803, + "loss": 7.3746, + "step": 13252 + }, + { + "epoch": 1.236633386208827, + "grad_norm": 0.4657713807907068, + "learning_rate": 0.0002764923452997217, + "loss": 7.4019, + "step": 13253 + }, + { + "epoch": 1.2367266959037044, + "grad_norm": 5.1310180982875835, + "learning_rate": 0.0002764882798249428, + "loss": 7.0749, + "step": 13254 + }, + { + "epoch": 1.2368200055985816, + "grad_norm": 0.8017496214214933, + "learning_rate": 0.0002764842140285417, + "loss": 7.3068, + "step": 13255 + }, + { + "epoch": 1.236913315293459, + "grad_norm": 0.7762597617308349, + "learning_rate": 0.00027648014791052867, + "loss": 7.4345, + "step": 13256 + }, + { + "epoch": 1.2370066249883362, + "grad_norm": 0.518728965500599, + "learning_rate": 0.0002764760814709141, + "loss": 7.2387, + "step": 13257 + }, + { + "epoch": 1.2370999346832137, + "grad_norm": 20.099650149374984, + "learning_rate": 0.0002764720147097083, + "loss": 7.2794, + "step": 13258 + }, + { + "epoch": 1.2371932443780909, + "grad_norm": 0.4962504096559652, + "learning_rate": 0.00027646794762692167, + "loss": 7.095, + "step": 13259 + }, + { + "epoch": 1.237286554072968, + "grad_norm": 1.0037729840124758, + "learning_rate": 0.0002764638802225645, + "loss": 7.8601, + "step": 13260 + }, + { + "epoch": 1.2373798637678455, + "grad_norm": 0.9687122820025081, + "learning_rate": 0.00027645981249664717, + "loss": 7.1757, + "step": 13261 + }, + { + "epoch": 1.237473173462723, + "grad_norm": 1.5958527180915687, + "learning_rate": 0.00027645574444918, + "loss": 7.0354, + "step": 13262 + }, + { + "epoch": 1.2375664831576, + "grad_norm": 0.5665728351132888, + "learning_rate": 0.0002764516760801733, + "loss": 7.6147, + "step": 13263 + }, + { + "epoch": 1.2376597928524773, + "grad_norm": 0.443833913698907, + "learning_rate": 0.0002764476073896374, + "loss": 7.341, + "step": 13264 + }, + { + "epoch": 1.2377531025473547, + "grad_norm": 0.6727624987440394, + "learning_rate": 0.0002764435383775827, + "loss": 7.2468, + "step": 13265 + }, + { + "epoch": 1.237846412242232, + "grad_norm": 0.7626429789809497, + "learning_rate": 0.0002764394690440196, + "loss": 7.027, + "step": 13266 + }, + { + "epoch": 1.2379397219371093, + "grad_norm": 1.5300927016469863, + "learning_rate": 0.0002764353993889584, + "loss": 7.0793, + "step": 13267 + }, + { + "epoch": 1.2380330316319865, + "grad_norm": 1.7262219620056936, + "learning_rate": 0.00027643132941240935, + "loss": 7.6478, + "step": 13268 + }, + { + "epoch": 1.238126341326864, + "grad_norm": 0.4886986463773983, + "learning_rate": 0.00027642725911438293, + "loss": 7.2608, + "step": 13269 + }, + { + "epoch": 1.2382196510217411, + "grad_norm": 0.5858466413480222, + "learning_rate": 0.00027642318849488944, + "loss": 7.4952, + "step": 13270 + }, + { + "epoch": 1.2383129607166183, + "grad_norm": 1.5351208557262987, + "learning_rate": 0.0002764191175539392, + "loss": 7.2395, + "step": 13271 + }, + { + "epoch": 1.2384062704114958, + "grad_norm": 2.0889742582147948, + "learning_rate": 0.00027641504629154266, + "loss": 6.9767, + "step": 13272 + }, + { + "epoch": 1.2384995801063732, + "grad_norm": 0.6347135893080422, + "learning_rate": 0.00027641097470771004, + "loss": 7.2158, + "step": 13273 + }, + { + "epoch": 1.2385928898012504, + "grad_norm": 0.9433304752726486, + "learning_rate": 0.00027640690280245176, + "loss": 7.1357, + "step": 13274 + }, + { + "epoch": 1.2386861994961276, + "grad_norm": 1.6505362211115626, + "learning_rate": 0.0002764028305757782, + "loss": 7.057, + "step": 13275 + }, + { + "epoch": 1.238779509191005, + "grad_norm": 1.986580666286219, + "learning_rate": 0.00027639875802769966, + "loss": 7.2695, + "step": 13276 + }, + { + "epoch": 1.2388728188858822, + "grad_norm": 1.0183464425358852, + "learning_rate": 0.00027639468515822655, + "loss": 7.2924, + "step": 13277 + }, + { + "epoch": 1.2389661285807596, + "grad_norm": 1.1118622544575603, + "learning_rate": 0.0002763906119673692, + "loss": 6.9366, + "step": 13278 + }, + { + "epoch": 1.2390594382756368, + "grad_norm": 0.905563936817155, + "learning_rate": 0.00027638653845513795, + "loss": 7.3888, + "step": 13279 + }, + { + "epoch": 1.2391527479705142, + "grad_norm": 1.2595625112202984, + "learning_rate": 0.00027638246462154315, + "loss": 7.3175, + "step": 13280 + }, + { + "epoch": 1.2392460576653914, + "grad_norm": 1.1096858543410553, + "learning_rate": 0.0002763783904665952, + "loss": 7.2095, + "step": 13281 + }, + { + "epoch": 1.2393393673602686, + "grad_norm": 0.6432490671603185, + "learning_rate": 0.0002763743159903044, + "loss": 7.1089, + "step": 13282 + }, + { + "epoch": 1.239432677055146, + "grad_norm": 0.91342525405109, + "learning_rate": 0.0002763702411926812, + "loss": 7.3014, + "step": 13283 + }, + { + "epoch": 1.2395259867500232, + "grad_norm": 1.3844630534556992, + "learning_rate": 0.00027636616607373593, + "loss": 7.2424, + "step": 13284 + }, + { + "epoch": 1.2396192964449007, + "grad_norm": 0.7446776214154778, + "learning_rate": 0.00027636209063347885, + "loss": 7.0973, + "step": 13285 + }, + { + "epoch": 1.2397126061397779, + "grad_norm": 0.43945636368458574, + "learning_rate": 0.00027635801487192047, + "loss": 6.8561, + "step": 13286 + }, + { + "epoch": 1.2398059158346553, + "grad_norm": 0.6743148702391131, + "learning_rate": 0.0002763539387890711, + "loss": 7.5245, + "step": 13287 + }, + { + "epoch": 1.2398992255295325, + "grad_norm": 0.9319440161521085, + "learning_rate": 0.000276349862384941, + "loss": 6.8695, + "step": 13288 + }, + { + "epoch": 1.2399925352244099, + "grad_norm": 0.6369276502273964, + "learning_rate": 0.00027634578565954073, + "loss": 7.1574, + "step": 13289 + }, + { + "epoch": 1.240085844919287, + "grad_norm": 4.161461199476157, + "learning_rate": 0.0002763417086128805, + "loss": 7.4137, + "step": 13290 + }, + { + "epoch": 1.2401791546141645, + "grad_norm": 0.5515515562594243, + "learning_rate": 0.0002763376312449707, + "loss": 6.9705, + "step": 13291 + }, + { + "epoch": 1.2402724643090417, + "grad_norm": 0.7003453895367455, + "learning_rate": 0.0002763335535558218, + "loss": 7.3215, + "step": 13292 + }, + { + "epoch": 1.240365774003919, + "grad_norm": 0.9600321850540758, + "learning_rate": 0.0002763294755454441, + "loss": 7.4737, + "step": 13293 + }, + { + "epoch": 1.2404590836987963, + "grad_norm": 0.7405898851281475, + "learning_rate": 0.0002763253972138479, + "loss": 6.9782, + "step": 13294 + }, + { + "epoch": 1.2405523933936735, + "grad_norm": 0.5460943914839623, + "learning_rate": 0.00027632131856104365, + "loss": 6.9149, + "step": 13295 + }, + { + "epoch": 1.240645703088551, + "grad_norm": 0.48237350504731497, + "learning_rate": 0.0002763172395870417, + "loss": 7.1642, + "step": 13296 + }, + { + "epoch": 1.2407390127834281, + "grad_norm": 0.5695702521557905, + "learning_rate": 0.00027631316029185247, + "loss": 7.0723, + "step": 13297 + }, + { + "epoch": 1.2408323224783055, + "grad_norm": 14.88258173312266, + "learning_rate": 0.0002763090806754863, + "loss": 7.1292, + "step": 13298 + }, + { + "epoch": 1.2409256321731827, + "grad_norm": 0.8610094229143939, + "learning_rate": 0.0002763050007379535, + "loss": 7.4069, + "step": 13299 + }, + { + "epoch": 1.2410189418680602, + "grad_norm": 2.133643310819571, + "learning_rate": 0.00027630092047926453, + "loss": 7.3627, + "step": 13300 + }, + { + "epoch": 1.2411122515629374, + "grad_norm": 1.1240696651810906, + "learning_rate": 0.0002762968398994297, + "loss": 7.4488, + "step": 13301 + }, + { + "epoch": 1.2412055612578148, + "grad_norm": 0.7335692541304918, + "learning_rate": 0.00027629275899845944, + "loss": 7.4943, + "step": 13302 + }, + { + "epoch": 1.241298870952692, + "grad_norm": 0.6904090560534591, + "learning_rate": 0.00027628867777636413, + "loss": 7.2647, + "step": 13303 + }, + { + "epoch": 1.2413921806475692, + "grad_norm": 0.5637003227655787, + "learning_rate": 0.00027628459623315406, + "loss": 6.9385, + "step": 13304 + }, + { + "epoch": 1.2414854903424466, + "grad_norm": 2.2528159317344314, + "learning_rate": 0.00027628051436883973, + "loss": 7.434, + "step": 13305 + }, + { + "epoch": 1.2415788000373238, + "grad_norm": 18.49117053878907, + "learning_rate": 0.00027627643218343145, + "loss": 7.3673, + "step": 13306 + }, + { + "epoch": 1.2416721097322012, + "grad_norm": 0.49473433242081366, + "learning_rate": 0.0002762723496769396, + "loss": 7.5092, + "step": 13307 + }, + { + "epoch": 1.2417654194270784, + "grad_norm": 1.6264312783408021, + "learning_rate": 0.00027626826684937455, + "loss": 7.1096, + "step": 13308 + }, + { + "epoch": 1.2418587291219558, + "grad_norm": 1.112919902955485, + "learning_rate": 0.0002762641837007467, + "loss": 7.6032, + "step": 13309 + }, + { + "epoch": 1.241952038816833, + "grad_norm": 0.5162820095869395, + "learning_rate": 0.00027626010023106646, + "loss": 7.4382, + "step": 13310 + }, + { + "epoch": 1.2420453485117104, + "grad_norm": 0.4448852162588405, + "learning_rate": 0.0002762560164403442, + "loss": 7.1348, + "step": 13311 + }, + { + "epoch": 1.2421386582065876, + "grad_norm": 0.7871509157318763, + "learning_rate": 0.0002762519323285903, + "loss": 7.3656, + "step": 13312 + }, + { + "epoch": 1.242231967901465, + "grad_norm": 0.7048949510027754, + "learning_rate": 0.00027624784789581506, + "loss": 7.3763, + "step": 13313 + }, + { + "epoch": 1.2423252775963423, + "grad_norm": 0.565477614338373, + "learning_rate": 0.00027624376314202897, + "loss": 7.1706, + "step": 13314 + }, + { + "epoch": 1.2424185872912195, + "grad_norm": 14.653413950971434, + "learning_rate": 0.0002762396780672424, + "loss": 7.1217, + "step": 13315 + }, + { + "epoch": 1.2425118969860969, + "grad_norm": 7.782631647471216, + "learning_rate": 0.0002762355926714657, + "loss": 7.1889, + "step": 13316 + }, + { + "epoch": 1.242605206680974, + "grad_norm": 0.5207191634109938, + "learning_rate": 0.00027623150695470927, + "loss": 7.2203, + "step": 13317 + }, + { + "epoch": 1.2426985163758515, + "grad_norm": 1.130638413352322, + "learning_rate": 0.0002762274209169836, + "loss": 7.145, + "step": 13318 + }, + { + "epoch": 1.2427918260707287, + "grad_norm": 1.8628695184808741, + "learning_rate": 0.0002762233345582989, + "loss": 7.2307, + "step": 13319 + }, + { + "epoch": 1.242885135765606, + "grad_norm": 0.7559166006751653, + "learning_rate": 0.0002762192478786657, + "loss": 7.2448, + "step": 13320 + }, + { + "epoch": 1.2429784454604833, + "grad_norm": 3.5149379767101867, + "learning_rate": 0.00027621516087809433, + "loss": 7.5693, + "step": 13321 + }, + { + "epoch": 1.2430717551553607, + "grad_norm": 2.2803200896719633, + "learning_rate": 0.00027621107355659517, + "loss": 6.9733, + "step": 13322 + }, + { + "epoch": 1.243165064850238, + "grad_norm": 1.5409657954276044, + "learning_rate": 0.00027620698591417867, + "loss": 7.1608, + "step": 13323 + }, + { + "epoch": 1.2432583745451153, + "grad_norm": 14.16558574296756, + "learning_rate": 0.00027620289795085513, + "loss": 7.2423, + "step": 13324 + }, + { + "epoch": 1.2433516842399925, + "grad_norm": 1.2847530231289073, + "learning_rate": 0.0002761988096666351, + "loss": 7.2137, + "step": 13325 + }, + { + "epoch": 1.2434449939348697, + "grad_norm": 121.80998023474923, + "learning_rate": 0.0002761947210615288, + "loss": 7.0117, + "step": 13326 + }, + { + "epoch": 1.2435383036297472, + "grad_norm": 9.74123014976626, + "learning_rate": 0.0002761906321355467, + "loss": 7.1117, + "step": 13327 + }, + { + "epoch": 1.2436316133246244, + "grad_norm": 46.04951734807476, + "learning_rate": 0.00027618654288869926, + "loss": 7.5153, + "step": 13328 + }, + { + "epoch": 1.2437249230195018, + "grad_norm": 41.50902118439595, + "learning_rate": 0.0002761824533209968, + "loss": 7.1756, + "step": 13329 + }, + { + "epoch": 1.243818232714379, + "grad_norm": 0.6948840613350484, + "learning_rate": 0.00027617836343244974, + "loss": 6.8812, + "step": 13330 + }, + { + "epoch": 1.2439115424092564, + "grad_norm": 17.146352189019755, + "learning_rate": 0.0002761742732230685, + "loss": 7.3056, + "step": 13331 + }, + { + "epoch": 1.2440048521041336, + "grad_norm": 2.131008305230156, + "learning_rate": 0.0002761701826928634, + "loss": 7.7762, + "step": 13332 + }, + { + "epoch": 1.244098161799011, + "grad_norm": 0.5929506589545696, + "learning_rate": 0.00027616609184184495, + "loss": 7.5736, + "step": 13333 + }, + { + "epoch": 1.2441914714938882, + "grad_norm": 1.9346081850849282, + "learning_rate": 0.00027616200067002343, + "loss": 7.2183, + "step": 13334 + }, + { + "epoch": 1.2442847811887656, + "grad_norm": 1.8955772480804967, + "learning_rate": 0.00027615790917740943, + "loss": 7.2011, + "step": 13335 + }, + { + "epoch": 1.2443780908836428, + "grad_norm": 1.5192343292207842, + "learning_rate": 0.0002761538173640132, + "loss": 7.1721, + "step": 13336 + }, + { + "epoch": 1.24447140057852, + "grad_norm": 0.46111637649843895, + "learning_rate": 0.0002761497252298451, + "loss": 7.0894, + "step": 13337 + }, + { + "epoch": 1.2445647102733974, + "grad_norm": 1.9519110387615128, + "learning_rate": 0.0002761456327749157, + "loss": 7.3901, + "step": 13338 + }, + { + "epoch": 1.2446580199682746, + "grad_norm": 1.2274700434822008, + "learning_rate": 0.00027614153999923525, + "loss": 7.0077, + "step": 13339 + }, + { + "epoch": 1.244751329663152, + "grad_norm": 0.9724144560979181, + "learning_rate": 0.0002761374469028143, + "loss": 7.0741, + "step": 13340 + }, + { + "epoch": 1.2448446393580292, + "grad_norm": 0.6400189349781439, + "learning_rate": 0.0002761333534856632, + "loss": 7.2578, + "step": 13341 + }, + { + "epoch": 1.2449379490529067, + "grad_norm": 0.47341532903159966, + "learning_rate": 0.0002761292597477923, + "loss": 7.2903, + "step": 13342 + }, + { + "epoch": 1.2450312587477839, + "grad_norm": 0.5454566069571788, + "learning_rate": 0.0002761251656892121, + "loss": 7.1758, + "step": 13343 + }, + { + "epoch": 1.2451245684426613, + "grad_norm": 0.7967012183706805, + "learning_rate": 0.0002761210713099329, + "loss": 6.9848, + "step": 13344 + }, + { + "epoch": 1.2452178781375385, + "grad_norm": 0.6568466115315522, + "learning_rate": 0.00027611697660996523, + "loss": 7.0962, + "step": 13345 + }, + { + "epoch": 1.245311187832416, + "grad_norm": 0.5693134371419337, + "learning_rate": 0.0002761128815893194, + "loss": 7.2736, + "step": 13346 + }, + { + "epoch": 1.245404497527293, + "grad_norm": 75.40382072085482, + "learning_rate": 0.0002761087862480059, + "loss": 7.1096, + "step": 13347 + }, + { + "epoch": 1.2454978072221703, + "grad_norm": 0.8518621619641502, + "learning_rate": 0.0002761046905860351, + "loss": 7.1268, + "step": 13348 + }, + { + "epoch": 1.2455911169170477, + "grad_norm": 0.6237529836305753, + "learning_rate": 0.00027610059460341744, + "loss": 7.0568, + "step": 13349 + }, + { + "epoch": 1.245684426611925, + "grad_norm": 0.5837297922498708, + "learning_rate": 0.0002760964983001633, + "loss": 7.149, + "step": 13350 + }, + { + "epoch": 1.2457777363068023, + "grad_norm": 1.4609160786290873, + "learning_rate": 0.00027609240167628314, + "loss": 7.3402, + "step": 13351 + }, + { + "epoch": 1.2458710460016795, + "grad_norm": 0.4889594499502122, + "learning_rate": 0.00027608830473178735, + "loss": 7.055, + "step": 13352 + }, + { + "epoch": 1.245964355696557, + "grad_norm": 4.084927051105707, + "learning_rate": 0.00027608420746668636, + "loss": 7.5064, + "step": 13353 + }, + { + "epoch": 1.2460576653914341, + "grad_norm": 0.8576691611086755, + "learning_rate": 0.00027608010988099054, + "loss": 7.1876, + "step": 13354 + }, + { + "epoch": 1.2461509750863116, + "grad_norm": 1.2473640948132236, + "learning_rate": 0.00027607601197471033, + "loss": 7.1953, + "step": 13355 + }, + { + "epoch": 1.2462442847811888, + "grad_norm": 0.4810850392306904, + "learning_rate": 0.00027607191374785626, + "loss": 7.0732, + "step": 13356 + }, + { + "epoch": 1.2463375944760662, + "grad_norm": 0.41929486275085354, + "learning_rate": 0.00027606781520043863, + "loss": 6.7624, + "step": 13357 + }, + { + "epoch": 1.2464309041709434, + "grad_norm": 1.2727361101170003, + "learning_rate": 0.00027606371633246785, + "loss": 7.0044, + "step": 13358 + }, + { + "epoch": 1.2465242138658206, + "grad_norm": 394.50331272733877, + "learning_rate": 0.0002760596171439544, + "loss": 7.1952, + "step": 13359 + }, + { + "epoch": 1.246617523560698, + "grad_norm": 0.6741400701303782, + "learning_rate": 0.00027605551763490866, + "loss": 7.3935, + "step": 13360 + }, + { + "epoch": 1.2467108332555752, + "grad_norm": 190.306620596892, + "learning_rate": 0.00027605141780534114, + "loss": 6.9323, + "step": 13361 + }, + { + "epoch": 1.2468041429504526, + "grad_norm": 589.2790033450149, + "learning_rate": 0.00027604731765526214, + "loss": 7.3388, + "step": 13362 + }, + { + "epoch": 1.2468974526453298, + "grad_norm": 0.603212469627927, + "learning_rate": 0.0002760432171846822, + "loss": 7.3003, + "step": 13363 + }, + { + "epoch": 1.2469907623402072, + "grad_norm": 0.6851315804402586, + "learning_rate": 0.0002760391163936117, + "loss": 6.907, + "step": 13364 + }, + { + "epoch": 1.2470840720350844, + "grad_norm": 1.3023542294462873, + "learning_rate": 0.000276035015282061, + "loss": 7.1527, + "step": 13365 + }, + { + "epoch": 1.2471773817299616, + "grad_norm": 0.4088449646777404, + "learning_rate": 0.00027603091385004064, + "loss": 7.0099, + "step": 13366 + }, + { + "epoch": 1.247270691424839, + "grad_norm": 0.5829277910194672, + "learning_rate": 0.000276026812097561, + "loss": 7.1794, + "step": 13367 + }, + { + "epoch": 1.2473640011197165, + "grad_norm": 0.9745520041567876, + "learning_rate": 0.00027602271002463247, + "loss": 7.3025, + "step": 13368 + }, + { + "epoch": 1.2474573108145937, + "grad_norm": 1.0074248766779859, + "learning_rate": 0.00027601860763126556, + "loss": 7.2923, + "step": 13369 + }, + { + "epoch": 1.2475506205094709, + "grad_norm": 0.5438628980750946, + "learning_rate": 0.00027601450491747067, + "loss": 7.4119, + "step": 13370 + }, + { + "epoch": 1.2476439302043483, + "grad_norm": 0.45362768778736456, + "learning_rate": 0.00027601040188325815, + "loss": 7.0426, + "step": 13371 + }, + { + "epoch": 1.2477372398992255, + "grad_norm": 2.099129119644494, + "learning_rate": 0.0002760062985286386, + "loss": 7.2752, + "step": 13372 + }, + { + "epoch": 1.247830549594103, + "grad_norm": 1.0541800276485576, + "learning_rate": 0.0002760021948536223, + "loss": 7.2321, + "step": 13373 + }, + { + "epoch": 1.24792385928898, + "grad_norm": 0.42366839421542773, + "learning_rate": 0.0002759980908582198, + "loss": 7.2604, + "step": 13374 + }, + { + "epoch": 1.2480171689838575, + "grad_norm": 0.8887716797333279, + "learning_rate": 0.00027599398654244137, + "loss": 7.3228, + "step": 13375 + }, + { + "epoch": 1.2481104786787347, + "grad_norm": 34906.473022978906, + "learning_rate": 0.0002759898819062976, + "loss": 7.0635, + "step": 13376 + }, + { + "epoch": 1.248203788373612, + "grad_norm": 0.6035671127070594, + "learning_rate": 0.0002759857769497989, + "loss": 6.8642, + "step": 13377 + }, + { + "epoch": 1.2482970980684893, + "grad_norm": 0.8111943583076268, + "learning_rate": 0.0002759816716729557, + "loss": 7.2215, + "step": 13378 + }, + { + "epoch": 1.2483904077633667, + "grad_norm": 0.5025675021524059, + "learning_rate": 0.00027597756607577843, + "loss": 7.3629, + "step": 13379 + }, + { + "epoch": 1.248483717458244, + "grad_norm": 0.3767179879766741, + "learning_rate": 0.00027597346015827747, + "loss": 7.1157, + "step": 13380 + }, + { + "epoch": 1.2485770271531211, + "grad_norm": 0.4051887715080138, + "learning_rate": 0.0002759693539204634, + "loss": 7.0466, + "step": 13381 + }, + { + "epoch": 1.2486703368479986, + "grad_norm": 1.6116904785977286, + "learning_rate": 0.0002759652473623465, + "loss": 7.4676, + "step": 13382 + }, + { + "epoch": 1.2487636465428757, + "grad_norm": 1.1473078517600124, + "learning_rate": 0.00027596114048393733, + "loss": 7.1722, + "step": 13383 + }, + { + "epoch": 1.2488569562377532, + "grad_norm": 0.8406147526239606, + "learning_rate": 0.0002759570332852463, + "loss": 7.3475, + "step": 13384 + }, + { + "epoch": 1.2489502659326304, + "grad_norm": 0.5000750437152768, + "learning_rate": 0.0002759529257662838, + "loss": 7.2843, + "step": 13385 + }, + { + "epoch": 1.2490435756275078, + "grad_norm": 1.0871334874855336, + "learning_rate": 0.0002759488179270604, + "loss": 7.547, + "step": 13386 + }, + { + "epoch": 1.249136885322385, + "grad_norm": 1.4798865714849507, + "learning_rate": 0.0002759447097675864, + "loss": 7.1882, + "step": 13387 + }, + { + "epoch": 1.2492301950172622, + "grad_norm": 2.1951902991187278, + "learning_rate": 0.0002759406012878724, + "loss": 6.8533, + "step": 13388 + }, + { + "epoch": 1.2493235047121396, + "grad_norm": 1.2525180044030548, + "learning_rate": 0.00027593649248792865, + "loss": 7.2265, + "step": 13389 + }, + { + "epoch": 1.2494168144070168, + "grad_norm": 0.4519365579651193, + "learning_rate": 0.0002759323833677657, + "loss": 7.0705, + "step": 13390 + }, + { + "epoch": 1.2495101241018942, + "grad_norm": 1.7946816875493667, + "learning_rate": 0.00027592827392739405, + "loss": 7.2894, + "step": 13391 + }, + { + "epoch": 1.2496034337967714, + "grad_norm": 0.6875848940530327, + "learning_rate": 0.0002759241641668241, + "loss": 6.95, + "step": 13392 + }, + { + "epoch": 1.2496967434916488, + "grad_norm": 0.7077481278094476, + "learning_rate": 0.00027592005408606633, + "loss": 7.3681, + "step": 13393 + }, + { + "epoch": 1.249790053186526, + "grad_norm": 12.52597298004467, + "learning_rate": 0.0002759159436851311, + "loss": 7.2955, + "step": 13394 + }, + { + "epoch": 1.2498833628814034, + "grad_norm": 1917.027925796602, + "learning_rate": 0.00027591183296402896, + "loss": 7.3514, + "step": 13395 + }, + { + "epoch": 1.2499766725762806, + "grad_norm": 0.5932482945048979, + "learning_rate": 0.0002759077219227703, + "loss": 7.1967, + "step": 13396 + }, + { + "epoch": 1.250069982271158, + "grad_norm": 0.7109776703264062, + "learning_rate": 0.00027590361056136566, + "loss": 7.1226, + "step": 13397 + }, + { + "epoch": 1.2501632919660353, + "grad_norm": 2839.1608794004355, + "learning_rate": 0.0002758994988798254, + "loss": 7.4636, + "step": 13398 + }, + { + "epoch": 1.2502566016609125, + "grad_norm": 1.0079238575392395, + "learning_rate": 0.00027589538687815995, + "loss": 7.2017, + "step": 13399 + }, + { + "epoch": 1.2503499113557899, + "grad_norm": 1.2641842752515016, + "learning_rate": 0.0002758912745563799, + "loss": 7.2232, + "step": 13400 + }, + { + "epoch": 1.2504432210506673, + "grad_norm": 58096.20790252881, + "learning_rate": 0.00027588716191449564, + "loss": 6.8729, + "step": 13401 + }, + { + "epoch": 1.2505365307455445, + "grad_norm": 0.4710397107179496, + "learning_rate": 0.00027588304895251754, + "loss": 6.7013, + "step": 13402 + }, + { + "epoch": 1.2506298404404217, + "grad_norm": 2.0804392499618776, + "learning_rate": 0.0002758789356704562, + "loss": 7.2265, + "step": 13403 + }, + { + "epoch": 1.250723150135299, + "grad_norm": 1.7944000607705746, + "learning_rate": 0.000275874822068322, + "loss": 7.2009, + "step": 13404 + }, + { + "epoch": 1.2508164598301763, + "grad_norm": 0.444283592006741, + "learning_rate": 0.00027587070814612545, + "loss": 7.155, + "step": 13405 + }, + { + "epoch": 1.2509097695250537, + "grad_norm": 0.7447447908507108, + "learning_rate": 0.0002758665939038769, + "loss": 7.684, + "step": 13406 + }, + { + "epoch": 1.251003079219931, + "grad_norm": 2.9402132874346156, + "learning_rate": 0.0002758624793415869, + "loss": 7.0889, + "step": 13407 + }, + { + "epoch": 1.2510963889148083, + "grad_norm": 2.3129672493595463, + "learning_rate": 0.00027585836445926596, + "loss": 6.8019, + "step": 13408 + }, + { + "epoch": 1.2511896986096855, + "grad_norm": 2.188422278765815, + "learning_rate": 0.0002758542492569244, + "loss": 7.1563, + "step": 13409 + }, + { + "epoch": 1.2512830083045627, + "grad_norm": 4.987062516402727, + "learning_rate": 0.00027585013373457285, + "loss": 7.7757, + "step": 13410 + }, + { + "epoch": 1.2513763179994402, + "grad_norm": 2.5573450688928077, + "learning_rate": 0.00027584601789222167, + "loss": 6.9889, + "step": 13411 + }, + { + "epoch": 1.2514696276943176, + "grad_norm": 2.0792566787395685, + "learning_rate": 0.0002758419017298813, + "loss": 6.9893, + "step": 13412 + }, + { + "epoch": 1.2515629373891948, + "grad_norm": 2.9832601020388654, + "learning_rate": 0.00027583778524756225, + "loss": 7.3755, + "step": 13413 + }, + { + "epoch": 1.251656247084072, + "grad_norm": 3.1020058430018396, + "learning_rate": 0.000275833668445275, + "loss": 7.0828, + "step": 13414 + }, + { + "epoch": 1.2517495567789494, + "grad_norm": 0.5829147057821438, + "learning_rate": 0.00027582955132303004, + "loss": 7.4291, + "step": 13415 + }, + { + "epoch": 1.2518428664738266, + "grad_norm": 0.7964060828667732, + "learning_rate": 0.0002758254338808378, + "loss": 6.9069, + "step": 13416 + }, + { + "epoch": 1.251936176168704, + "grad_norm": 2.3621916300622074, + "learning_rate": 0.0002758213161187087, + "loss": 7.2952, + "step": 13417 + }, + { + "epoch": 1.2520294858635812, + "grad_norm": 1.905987643090331, + "learning_rate": 0.0002758171980366533, + "loss": 7.2279, + "step": 13418 + }, + { + "epoch": 1.2521227955584586, + "grad_norm": 0.3991059220631928, + "learning_rate": 0.00027581307963468206, + "loss": 7.2396, + "step": 13419 + }, + { + "epoch": 1.2522161052533358, + "grad_norm": 1.3625861840812648, + "learning_rate": 0.0002758089609128054, + "loss": 7.1826, + "step": 13420 + }, + { + "epoch": 1.252309414948213, + "grad_norm": 6064384032.250671, + "learning_rate": 0.0002758048418710338, + "loss": 7.04, + "step": 13421 + }, + { + "epoch": 1.2524027246430904, + "grad_norm": 1.3487435464194242, + "learning_rate": 0.0002758007225093778, + "loss": 7.0176, + "step": 13422 + }, + { + "epoch": 1.2524960343379676, + "grad_norm": 0.7529802457582809, + "learning_rate": 0.0002757966028278478, + "loss": 7.0486, + "step": 13423 + }, + { + "epoch": 1.252589344032845, + "grad_norm": 339635.55649780534, + "learning_rate": 0.0002757924828264543, + "loss": 7.2974, + "step": 13424 + }, + { + "epoch": 1.2526826537277223, + "grad_norm": 1.3530240560898916, + "learning_rate": 0.0002757883625052078, + "loss": 6.8058, + "step": 13425 + }, + { + "epoch": 1.2527759634225997, + "grad_norm": 1.6784783571723707, + "learning_rate": 0.0002757842418641187, + "loss": 7.3538, + "step": 13426 + }, + { + "epoch": 1.2528692731174769, + "grad_norm": 0.5520781037912483, + "learning_rate": 0.00027578012090319755, + "loss": 6.8585, + "step": 13427 + }, + { + "epoch": 1.2529625828123543, + "grad_norm": 2137172.2635268643, + "learning_rate": 0.0002757759996224549, + "loss": 7.2064, + "step": 13428 + }, + { + "epoch": 1.2530558925072315, + "grad_norm": 1.4342681526362868, + "learning_rate": 0.000275771878021901, + "loss": 7.1887, + "step": 13429 + }, + { + "epoch": 1.253149202202109, + "grad_norm": 1708959.7757582332, + "learning_rate": 0.0002757677561015466, + "loss": 7.6962, + "step": 13430 + }, + { + "epoch": 1.253242511896986, + "grad_norm": 1.0223596621481181, + "learning_rate": 0.00027576363386140195, + "loss": 7.0052, + "step": 13431 + }, + { + "epoch": 1.2533358215918633, + "grad_norm": 1.5393795427581798, + "learning_rate": 0.0002757595113014777, + "loss": 7.4412, + "step": 13432 + }, + { + "epoch": 1.2534291312867407, + "grad_norm": 1.2963519652757098, + "learning_rate": 0.00027575538842178424, + "loss": 7.4059, + "step": 13433 + }, + { + "epoch": 1.253522440981618, + "grad_norm": 1.262093963466984, + "learning_rate": 0.0002757512652223321, + "loss": 7.3952, + "step": 13434 + }, + { + "epoch": 1.2536157506764953, + "grad_norm": 75009664.25211051, + "learning_rate": 0.0002757471417031317, + "loss": 7.2054, + "step": 13435 + }, + { + "epoch": 1.2537090603713725, + "grad_norm": 0.8372887042556008, + "learning_rate": 0.0002757430178641936, + "loss": 7.0623, + "step": 13436 + }, + { + "epoch": 1.25380237006625, + "grad_norm": 1.574706271410251, + "learning_rate": 0.00027573889370552824, + "loss": 7.0148, + "step": 13437 + }, + { + "epoch": 1.2538956797611271, + "grad_norm": 1.0258561785800002, + "learning_rate": 0.00027573476922714615, + "loss": 7.6408, + "step": 13438 + }, + { + "epoch": 1.2539889894560043, + "grad_norm": 1.5682629600298668, + "learning_rate": 0.00027573064442905777, + "loss": 7.5292, + "step": 13439 + }, + { + "epoch": 1.2540822991508818, + "grad_norm": 1.211751858834535, + "learning_rate": 0.00027572651931127357, + "loss": 7.4862, + "step": 13440 + }, + { + "epoch": 1.2541756088457592, + "grad_norm": 1.9507594968086448, + "learning_rate": 0.0002757223938738041, + "loss": 7.9604, + "step": 13441 + }, + { + "epoch": 1.2542689185406364, + "grad_norm": 1.4549020628049827, + "learning_rate": 0.00027571826811665983, + "loss": 7.8148, + "step": 13442 + }, + { + "epoch": 1.2543622282355136, + "grad_norm": 1.471434992349311, + "learning_rate": 0.0002757141420398512, + "loss": 7.8678, + "step": 13443 + }, + { + "epoch": 1.254455537930391, + "grad_norm": 2.7904665224711285, + "learning_rate": 0.00027571001564338885, + "loss": 7.6766, + "step": 13444 + }, + { + "epoch": 1.2545488476252682, + "grad_norm": 2.686021127561946, + "learning_rate": 0.0002757058889272831, + "loss": 7.7802, + "step": 13445 + }, + { + "epoch": 1.2546421573201456, + "grad_norm": 3.099985862861729, + "learning_rate": 0.00027570176189154453, + "loss": 7.9909, + "step": 13446 + }, + { + "epoch": 1.2547354670150228, + "grad_norm": 2.955734671424297, + "learning_rate": 0.00027569763453618357, + "loss": 7.8301, + "step": 13447 + }, + { + "epoch": 1.2548287767099002, + "grad_norm": 2.7690689137617572, + "learning_rate": 0.0002756935068612108, + "loss": 7.559, + "step": 13448 + }, + { + "epoch": 1.2549220864047774, + "grad_norm": 2.103333802815604, + "learning_rate": 0.00027568937886663665, + "loss": 7.478, + "step": 13449 + }, + { + "epoch": 1.2550153960996546, + "grad_norm": 2.021887631210614, + "learning_rate": 0.00027568525055247163, + "loss": 7.5838, + "step": 13450 + }, + { + "epoch": 1.255108705794532, + "grad_norm": 1.125811627532231, + "learning_rate": 0.0002756811219187263, + "loss": 7.271, + "step": 13451 + }, + { + "epoch": 1.2552020154894095, + "grad_norm": 0.7548985283040428, + "learning_rate": 0.0002756769929654111, + "loss": 7.2581, + "step": 13452 + }, + { + "epoch": 1.2552953251842867, + "grad_norm": 1.3164224906749828, + "learning_rate": 0.0002756728636925365, + "loss": 7.5275, + "step": 13453 + }, + { + "epoch": 1.2553886348791639, + "grad_norm": 1.0014715591785186, + "learning_rate": 0.00027566873410011307, + "loss": 7.3564, + "step": 13454 + }, + { + "epoch": 1.2554819445740413, + "grad_norm": 1.8008128902873808, + "learning_rate": 0.0002756646041881513, + "loss": 7.7284, + "step": 13455 + }, + { + "epoch": 1.2555752542689185, + "grad_norm": 15817.85651731191, + "learning_rate": 0.00027566047395666157, + "loss": 7.523, + "step": 13456 + }, + { + "epoch": 1.255668563963796, + "grad_norm": 1.7140742440133225, + "learning_rate": 0.00027565634340565454, + "loss": 7.8208, + "step": 13457 + }, + { + "epoch": 1.255761873658673, + "grad_norm": 12175.457638017835, + "learning_rate": 0.0002756522125351407, + "loss": 7.3584, + "step": 13458 + }, + { + "epoch": 1.2558551833535505, + "grad_norm": 1.7697041118617451, + "learning_rate": 0.0002756480813451304, + "loss": 7.3618, + "step": 13459 + }, + { + "epoch": 1.2559484930484277, + "grad_norm": 1.4014524660876588, + "learning_rate": 0.0002756439498356343, + "loss": 7.3756, + "step": 13460 + }, + { + "epoch": 1.256041802743305, + "grad_norm": 1.3149172718667326, + "learning_rate": 0.00027563981800666283, + "loss": 7.1298, + "step": 13461 + }, + { + "epoch": 1.2561351124381823, + "grad_norm": 1.1396027680616267, + "learning_rate": 0.0002756356858582266, + "loss": 6.9121, + "step": 13462 + }, + { + "epoch": 1.2562284221330597, + "grad_norm": 1.7328690182785782, + "learning_rate": 0.000275631553390336, + "loss": 7.2972, + "step": 13463 + }, + { + "epoch": 1.256321731827937, + "grad_norm": 2.9988680064112967, + "learning_rate": 0.0002756274206030015, + "loss": 7.6997, + "step": 13464 + }, + { + "epoch": 1.2564150415228141, + "grad_norm": 2.836093991281392, + "learning_rate": 0.00027562328749623376, + "loss": 7.7655, + "step": 13465 + }, + { + "epoch": 1.2565083512176916, + "grad_norm": 1.3771551057583107, + "learning_rate": 0.0002756191540700432, + "loss": 7.2326, + "step": 13466 + }, + { + "epoch": 1.2566016609125688, + "grad_norm": 0.7903399310499223, + "learning_rate": 0.0002756150203244403, + "loss": 7.268, + "step": 13467 + }, + { + "epoch": 1.2566949706074462, + "grad_norm": 1.1581867654022242, + "learning_rate": 0.00027561088625943566, + "loss": 7.5746, + "step": 13468 + }, + { + "epoch": 1.2567882803023234, + "grad_norm": 2379.6041544784507, + "learning_rate": 0.00027560675187503973, + "loss": 7.5965, + "step": 13469 + }, + { + "epoch": 1.2568815899972008, + "grad_norm": 2.3911740335447194, + "learning_rate": 0.000275602617171263, + "loss": 7.1254, + "step": 13470 + }, + { + "epoch": 1.256974899692078, + "grad_norm": 1.8603654655170736, + "learning_rate": 0.0002755984821481161, + "loss": 7.3588, + "step": 13471 + }, + { + "epoch": 1.2570682093869552, + "grad_norm": 1.3947828292788942, + "learning_rate": 0.0002755943468056094, + "loss": 7.5097, + "step": 13472 + }, + { + "epoch": 1.2571615190818326, + "grad_norm": 0.677774803576026, + "learning_rate": 0.0002755902111437535, + "loss": 7.4304, + "step": 13473 + }, + { + "epoch": 1.25725482877671, + "grad_norm": 0.829002741014473, + "learning_rate": 0.0002755860751625589, + "loss": 7.2361, + "step": 13474 + }, + { + "epoch": 1.2573481384715872, + "grad_norm": 1.0562931475044564, + "learning_rate": 0.00027558193886203613, + "loss": 7.3137, + "step": 13475 + }, + { + "epoch": 1.2574414481664644, + "grad_norm": 9524.549436056524, + "learning_rate": 0.00027557780224219563, + "loss": 7.0774, + "step": 13476 + }, + { + "epoch": 1.2575347578613418, + "grad_norm": 0.9152566895894759, + "learning_rate": 0.000275573665303048, + "loss": 7.1576, + "step": 13477 + }, + { + "epoch": 1.257628067556219, + "grad_norm": 0.9365088143466782, + "learning_rate": 0.0002755695280446038, + "loss": 7.4067, + "step": 13478 + }, + { + "epoch": 1.2577213772510965, + "grad_norm": 15835.172721620833, + "learning_rate": 0.00027556539046687343, + "loss": 7.7638, + "step": 13479 + }, + { + "epoch": 1.2578146869459736, + "grad_norm": 0.7155830303547647, + "learning_rate": 0.0002755612525698675, + "loss": 7.3292, + "step": 13480 + }, + { + "epoch": 1.257907996640851, + "grad_norm": 1.2366548327955615, + "learning_rate": 0.0002755571143535964, + "loss": 7.2229, + "step": 13481 + }, + { + "epoch": 1.2580013063357283, + "grad_norm": 0.9812630243153927, + "learning_rate": 0.00027555297581807086, + "loss": 7.3797, + "step": 13482 + }, + { + "epoch": 1.2580946160306055, + "grad_norm": 16382.906615385318, + "learning_rate": 0.00027554883696330126, + "loss": 7.4468, + "step": 13483 + }, + { + "epoch": 1.2581879257254829, + "grad_norm": 0.7553657598191907, + "learning_rate": 0.00027554469778929815, + "loss": 7.3246, + "step": 13484 + }, + { + "epoch": 1.2582812354203603, + "grad_norm": 7814.165357336207, + "learning_rate": 0.0002755405582960721, + "loss": 7.3595, + "step": 13485 + }, + { + "epoch": 1.2583745451152375, + "grad_norm": 0.6410144593954379, + "learning_rate": 0.00027553641848363354, + "loss": 7.5078, + "step": 13486 + }, + { + "epoch": 1.2584678548101147, + "grad_norm": 0.7662016845441434, + "learning_rate": 0.0002755322783519931, + "loss": 7.3394, + "step": 13487 + }, + { + "epoch": 1.2585611645049921, + "grad_norm": 1.5995641386186283, + "learning_rate": 0.00027552813790116123, + "loss": 7.6422, + "step": 13488 + }, + { + "epoch": 1.2586544741998693, + "grad_norm": 0.727038426024476, + "learning_rate": 0.00027552399713114855, + "loss": 7.2421, + "step": 13489 + }, + { + "epoch": 1.2587477838947467, + "grad_norm": 1.0093503429392872, + "learning_rate": 0.00027551985604196545, + "loss": 7.504, + "step": 13490 + }, + { + "epoch": 1.258841093589624, + "grad_norm": 1.2921847785044813, + "learning_rate": 0.00027551571463362256, + "loss": 7.0611, + "step": 13491 + }, + { + "epoch": 1.2589344032845013, + "grad_norm": 1.322607548711072, + "learning_rate": 0.0002755115729061304, + "loss": 6.985, + "step": 13492 + }, + { + "epoch": 1.2590277129793785, + "grad_norm": 944.9833012268662, + "learning_rate": 0.00027550743085949953, + "loss": 7.1608, + "step": 13493 + }, + { + "epoch": 1.2591210226742557, + "grad_norm": 0.8349340288345372, + "learning_rate": 0.00027550328849374037, + "loss": 7.1421, + "step": 13494 + }, + { + "epoch": 1.2592143323691332, + "grad_norm": 1.1757951279593823, + "learning_rate": 0.0002754991458088636, + "loss": 7.1761, + "step": 13495 + }, + { + "epoch": 1.2593076420640106, + "grad_norm": 7203.678200325735, + "learning_rate": 0.0002754950028048796, + "loss": 7.5903, + "step": 13496 + }, + { + "epoch": 1.2594009517588878, + "grad_norm": 0.8852489160307339, + "learning_rate": 0.000275490859481799, + "loss": 7.3957, + "step": 13497 + }, + { + "epoch": 1.259494261453765, + "grad_norm": 0.6945806751463307, + "learning_rate": 0.00027548671583963233, + "loss": 7.2458, + "step": 13498 + }, + { + "epoch": 1.2595875711486424, + "grad_norm": 21526.23476729495, + "learning_rate": 0.00027548257187839015, + "loss": 7.1829, + "step": 13499 + }, + { + "epoch": 1.2596808808435196, + "grad_norm": 1.909563649168383, + "learning_rate": 0.0002754784275980829, + "loss": 6.9408, + "step": 13500 + }, + { + "epoch": 1.259774190538397, + "grad_norm": 23435.67679018488, + "learning_rate": 0.00027547428299872124, + "loss": 7.3646, + "step": 13501 + }, + { + "epoch": 1.2598675002332742, + "grad_norm": 919.1136650584461, + "learning_rate": 0.0002754701380803156, + "loss": 7.546, + "step": 13502 + }, + { + "epoch": 1.2599608099281516, + "grad_norm": 0.986157252629566, + "learning_rate": 0.00027546599284287656, + "loss": 7.3316, + "step": 13503 + }, + { + "epoch": 1.2600541196230288, + "grad_norm": 0.8384179299562728, + "learning_rate": 0.0002754618472864147, + "loss": 7.169, + "step": 13504 + }, + { + "epoch": 1.260147429317906, + "grad_norm": 0.9513595675491658, + "learning_rate": 0.0002754577014109405, + "loss": 7.2937, + "step": 13505 + }, + { + "epoch": 1.2602407390127834, + "grad_norm": 2582.1732288680705, + "learning_rate": 0.0002754535552164645, + "loss": 7.462, + "step": 13506 + }, + { + "epoch": 1.2603340487076609, + "grad_norm": 0.5074896908759795, + "learning_rate": 0.0002754494087029973, + "loss": 7.4387, + "step": 13507 + }, + { + "epoch": 1.260427358402538, + "grad_norm": 4566.281315815561, + "learning_rate": 0.00027544526187054947, + "loss": 7.1734, + "step": 13508 + }, + { + "epoch": 1.2605206680974153, + "grad_norm": 13669.17708403601, + "learning_rate": 0.0002754411147191314, + "loss": 6.9437, + "step": 13509 + }, + { + "epoch": 1.2606139777922927, + "grad_norm": 0.6231803009074752, + "learning_rate": 0.0002754369672487538, + "loss": 7.1399, + "step": 13510 + }, + { + "epoch": 1.2607072874871699, + "grad_norm": 0.7892354513921139, + "learning_rate": 0.00027543281945942717, + "loss": 7.0587, + "step": 13511 + }, + { + "epoch": 1.2608005971820473, + "grad_norm": 1.323526622821073, + "learning_rate": 0.000275428671351162, + "loss": 7.2904, + "step": 13512 + }, + { + "epoch": 1.2608939068769245, + "grad_norm": 2.318524760493183, + "learning_rate": 0.00027542452292396885, + "loss": 7.6632, + "step": 13513 + }, + { + "epoch": 1.260987216571802, + "grad_norm": 1.0597042157637946, + "learning_rate": 0.0002754203741778583, + "loss": 7.3265, + "step": 13514 + }, + { + "epoch": 1.261080526266679, + "grad_norm": 0.5281075214538592, + "learning_rate": 0.0002754162251128409, + "loss": 7.3631, + "step": 13515 + }, + { + "epoch": 1.2611738359615563, + "grad_norm": 1.0633989872712846, + "learning_rate": 0.00027541207572892725, + "loss": 7.2547, + "step": 13516 + }, + { + "epoch": 1.2612671456564337, + "grad_norm": 0.9284564424663913, + "learning_rate": 0.00027540792602612777, + "loss": 7.4247, + "step": 13517 + }, + { + "epoch": 1.2613604553513111, + "grad_norm": 0.7323204480202401, + "learning_rate": 0.0002754037760044531, + "loss": 7.592, + "step": 13518 + }, + { + "epoch": 1.2614537650461883, + "grad_norm": 1.3599087751981491, + "learning_rate": 0.0002753996256639138, + "loss": 7.2669, + "step": 13519 + }, + { + "epoch": 1.2615470747410655, + "grad_norm": 0.6683074433445502, + "learning_rate": 0.00027539547500452037, + "loss": 7.5721, + "step": 13520 + }, + { + "epoch": 1.261640384435943, + "grad_norm": 6830.535214222685, + "learning_rate": 0.0002753913240262834, + "loss": 7.2842, + "step": 13521 + }, + { + "epoch": 1.2617336941308201, + "grad_norm": 0.45858386365113074, + "learning_rate": 0.0002753871727292134, + "loss": 6.9431, + "step": 13522 + }, + { + "epoch": 1.2618270038256976, + "grad_norm": 1.323913388174521, + "learning_rate": 0.000275383021113321, + "loss": 7.4806, + "step": 13523 + }, + { + "epoch": 1.2619203135205748, + "grad_norm": 2268.9340446629462, + "learning_rate": 0.0002753788691786167, + "loss": 7.5303, + "step": 13524 + }, + { + "epoch": 1.2620136232154522, + "grad_norm": 0.9906120321947494, + "learning_rate": 0.0002753747169251111, + "loss": 6.8757, + "step": 13525 + }, + { + "epoch": 1.2621069329103294, + "grad_norm": 0.5461963742886771, + "learning_rate": 0.00027537056435281474, + "loss": 7.4571, + "step": 13526 + }, + { + "epoch": 1.2622002426052066, + "grad_norm": 2816.207987339482, + "learning_rate": 0.00027536641146173814, + "loss": 7.1961, + "step": 13527 + }, + { + "epoch": 1.262293552300084, + "grad_norm": 0.9959158291816037, + "learning_rate": 0.0002753622582518919, + "loss": 7.241, + "step": 13528 + }, + { + "epoch": 1.2623868619949612, + "grad_norm": 0.6011913466652457, + "learning_rate": 0.0002753581047232866, + "loss": 7.3071, + "step": 13529 + }, + { + "epoch": 1.2624801716898386, + "grad_norm": 0.4940473960160411, + "learning_rate": 0.0002753539508759328, + "loss": 7.314, + "step": 13530 + }, + { + "epoch": 1.2625734813847158, + "grad_norm": 12501.22506669108, + "learning_rate": 0.00027534979670984093, + "loss": 7.3098, + "step": 13531 + }, + { + "epoch": 1.2626667910795932, + "grad_norm": 0.5180164151001939, + "learning_rate": 0.0002753456422250218, + "loss": 6.9517, + "step": 13532 + }, + { + "epoch": 1.2627601007744704, + "grad_norm": 0.6212056710721359, + "learning_rate": 0.0002753414874214857, + "loss": 7.0419, + "step": 13533 + }, + { + "epoch": 1.2628534104693478, + "grad_norm": 0.7265406939784932, + "learning_rate": 0.0002753373322992433, + "loss": 7.0125, + "step": 13534 + }, + { + "epoch": 1.262946720164225, + "grad_norm": 20801.40432960036, + "learning_rate": 0.0002753331768583053, + "loss": 7.5152, + "step": 13535 + }, + { + "epoch": 1.2630400298591025, + "grad_norm": 0.7678307155420196, + "learning_rate": 0.00027532902109868215, + "loss": 7.4687, + "step": 13536 + }, + { + "epoch": 1.2631333395539797, + "grad_norm": 0.5316020263356585, + "learning_rate": 0.0002753248650203844, + "loss": 7.5952, + "step": 13537 + }, + { + "epoch": 1.2632266492488569, + "grad_norm": 1.846622278766457, + "learning_rate": 0.00027532070862342265, + "loss": 7.0307, + "step": 13538 + }, + { + "epoch": 1.2633199589437343, + "grad_norm": 1.3373833478075028, + "learning_rate": 0.00027531655190780744, + "loss": 7.2121, + "step": 13539 + }, + { + "epoch": 1.2634132686386115, + "grad_norm": 1.0135769261674596, + "learning_rate": 0.0002753123948735494, + "loss": 7.0116, + "step": 13540 + }, + { + "epoch": 1.263506578333489, + "grad_norm": 10422.897495154488, + "learning_rate": 0.000275308237520659, + "loss": 7.2992, + "step": 13541 + }, + { + "epoch": 1.263599888028366, + "grad_norm": 14082.385201911584, + "learning_rate": 0.0002753040798491469, + "loss": 7.0635, + "step": 13542 + }, + { + "epoch": 1.2636931977232435, + "grad_norm": 1.2826595782594357, + "learning_rate": 0.0002752999218590237, + "loss": 7.3545, + "step": 13543 + }, + { + "epoch": 1.2637865074181207, + "grad_norm": 1.0997381387160288, + "learning_rate": 0.0002752957635502998, + "loss": 7.4561, + "step": 13544 + }, + { + "epoch": 1.263879817112998, + "grad_norm": 1081.7397738977195, + "learning_rate": 0.00027529160492298593, + "loss": 7.5164, + "step": 13545 + }, + { + "epoch": 1.2639731268078753, + "grad_norm": 0.7799232055967995, + "learning_rate": 0.0002752874459770927, + "loss": 7.4561, + "step": 13546 + }, + { + "epoch": 1.2640664365027527, + "grad_norm": 17848.363272508526, + "learning_rate": 0.00027528328671263055, + "loss": 7.0488, + "step": 13547 + }, + { + "epoch": 1.26415974619763, + "grad_norm": 0.7663466390350206, + "learning_rate": 0.0002752791271296101, + "loss": 7.5874, + "step": 13548 + }, + { + "epoch": 1.2642530558925071, + "grad_norm": 1.0482526848042444, + "learning_rate": 0.00027527496722804196, + "loss": 7.1676, + "step": 13549 + }, + { + "epoch": 1.2643463655873846, + "grad_norm": 0.5382899758476418, + "learning_rate": 0.0002752708070079367, + "loss": 7.1633, + "step": 13550 + }, + { + "epoch": 1.2644396752822618, + "grad_norm": 1.8022206081068255, + "learning_rate": 0.0002752666464693048, + "loss": 7.4032, + "step": 13551 + }, + { + "epoch": 1.2645329849771392, + "grad_norm": 1.3583088866553341, + "learning_rate": 0.00027526248561215706, + "loss": 7.1938, + "step": 13552 + }, + { + "epoch": 1.2646262946720164, + "grad_norm": 1.5895603144607278, + "learning_rate": 0.0002752583244365038, + "loss": 7.4499, + "step": 13553 + }, + { + "epoch": 1.2647196043668938, + "grad_norm": 0.6807238257535716, + "learning_rate": 0.0002752541629423558, + "loss": 7.3386, + "step": 13554 + }, + { + "epoch": 1.264812914061771, + "grad_norm": 10979.487469609936, + "learning_rate": 0.00027525000112972353, + "loss": 7.1696, + "step": 13555 + }, + { + "epoch": 1.2649062237566482, + "grad_norm": 1.9449230918280844, + "learning_rate": 0.0002752458389986177, + "loss": 6.9646, + "step": 13556 + }, + { + "epoch": 1.2649995334515256, + "grad_norm": 0.7375378520085525, + "learning_rate": 0.0002752416765490487, + "loss": 7.498, + "step": 13557 + }, + { + "epoch": 1.265092843146403, + "grad_norm": 5857.382885219051, + "learning_rate": 0.0002752375137810272, + "loss": 7.4396, + "step": 13558 + }, + { + "epoch": 1.2651861528412802, + "grad_norm": 1.2201522260078572, + "learning_rate": 0.00027523335069456383, + "loss": 6.7474, + "step": 13559 + }, + { + "epoch": 1.2652794625361574, + "grad_norm": 11927.900205316862, + "learning_rate": 0.0002752291872896692, + "loss": 7.1444, + "step": 13560 + }, + { + "epoch": 1.2653727722310348, + "grad_norm": 1.7617179118362025, + "learning_rate": 0.0002752250235663538, + "loss": 7.3208, + "step": 13561 + }, + { + "epoch": 1.265466081925912, + "grad_norm": 0.9252746305233229, + "learning_rate": 0.0002752208595246283, + "loss": 7.0245, + "step": 13562 + }, + { + "epoch": 1.2655593916207895, + "grad_norm": 0.9841188312847851, + "learning_rate": 0.00027521669516450317, + "loss": 7.4666, + "step": 13563 + }, + { + "epoch": 1.2656527013156667, + "grad_norm": 13119.443471531526, + "learning_rate": 0.0002752125304859891, + "loss": 7.1134, + "step": 13564 + }, + { + "epoch": 1.265746011010544, + "grad_norm": 0.5864891790117465, + "learning_rate": 0.0002752083654890967, + "loss": 7.491, + "step": 13565 + }, + { + "epoch": 1.2658393207054213, + "grad_norm": 0.9009366193896575, + "learning_rate": 0.0002752042001738365, + "loss": 7.4831, + "step": 13566 + }, + { + "epoch": 1.2659326304002985, + "grad_norm": 1.28353438470786, + "learning_rate": 0.00027520003454021907, + "loss": 7.3343, + "step": 13567 + }, + { + "epoch": 1.2660259400951759, + "grad_norm": 1.3860937644069273, + "learning_rate": 0.0002751958685882551, + "loss": 7.0803, + "step": 13568 + }, + { + "epoch": 1.2661192497900533, + "grad_norm": 6368.874799192281, + "learning_rate": 0.0002751917023179551, + "loss": 7.1026, + "step": 13569 + }, + { + "epoch": 1.2662125594849305, + "grad_norm": 1.384500564290571, + "learning_rate": 0.00027518753572932964, + "loss": 7.0821, + "step": 13570 + }, + { + "epoch": 1.2663058691798077, + "grad_norm": 7830.142088304513, + "learning_rate": 0.00027518336882238943, + "loss": 7.7359, + "step": 13571 + }, + { + "epoch": 1.2663991788746851, + "grad_norm": 0.41896814093097917, + "learning_rate": 0.00027517920159714497, + "loss": 6.8895, + "step": 13572 + }, + { + "epoch": 1.2664924885695623, + "grad_norm": 10003.866963517325, + "learning_rate": 0.00027517503405360684, + "loss": 7.2448, + "step": 13573 + }, + { + "epoch": 1.2665857982644397, + "grad_norm": 23000.47959388155, + "learning_rate": 0.00027517086619178565, + "loss": 7.0533, + "step": 13574 + }, + { + "epoch": 1.266679107959317, + "grad_norm": 11913.666886644663, + "learning_rate": 0.00027516669801169215, + "loss": 7.5901, + "step": 13575 + }, + { + "epoch": 1.2667724176541943, + "grad_norm": 1.1856073943812797, + "learning_rate": 0.00027516252951333673, + "loss": 7.2408, + "step": 13576 + }, + { + "epoch": 1.2668657273490715, + "grad_norm": 0.8185988949907433, + "learning_rate": 0.0002751583606967301, + "loss": 7.2854, + "step": 13577 + }, + { + "epoch": 1.2669590370439487, + "grad_norm": 0.5155299948869639, + "learning_rate": 0.0002751541915618828, + "loss": 7.6318, + "step": 13578 + }, + { + "epoch": 1.2670523467388262, + "grad_norm": 0.8173129041788245, + "learning_rate": 0.0002751500221088055, + "loss": 6.9628, + "step": 13579 + }, + { + "epoch": 1.2671456564337036, + "grad_norm": 0.38888745349431664, + "learning_rate": 0.0002751458523375087, + "loss": 7.1435, + "step": 13580 + }, + { + "epoch": 1.2672389661285808, + "grad_norm": 0.4282495831660789, + "learning_rate": 0.00027514168224800317, + "loss": 6.9245, + "step": 13581 + }, + { + "epoch": 1.267332275823458, + "grad_norm": 1.2122100050299063, + "learning_rate": 0.00027513751184029937, + "loss": 7.2547, + "step": 13582 + }, + { + "epoch": 1.2674255855183354, + "grad_norm": 1.3416027312016978, + "learning_rate": 0.00027513334111440793, + "loss": 7.1425, + "step": 13583 + }, + { + "epoch": 1.2675188952132126, + "grad_norm": 0.46178387634117174, + "learning_rate": 0.00027512917007033947, + "loss": 7.1598, + "step": 13584 + }, + { + "epoch": 1.26761220490809, + "grad_norm": 20834.860567936754, + "learning_rate": 0.0002751249987081046, + "loss": 7.3239, + "step": 13585 + }, + { + "epoch": 1.2677055146029672, + "grad_norm": 0.8061150365055717, + "learning_rate": 0.00027512082702771395, + "loss": 7.1687, + "step": 13586 + }, + { + "epoch": 1.2677988242978446, + "grad_norm": 1.5232238259474282, + "learning_rate": 0.0002751166550291781, + "loss": 6.8336, + "step": 13587 + }, + { + "epoch": 1.2678921339927218, + "grad_norm": 0.4920161727471738, + "learning_rate": 0.00027511248271250765, + "loss": 7.1718, + "step": 13588 + }, + { + "epoch": 1.267985443687599, + "grad_norm": 9026.09841120776, + "learning_rate": 0.00027510831007771316, + "loss": 7.3598, + "step": 13589 + }, + { + "epoch": 1.2680787533824764, + "grad_norm": 14522.765327981691, + "learning_rate": 0.0002751041371248054, + "loss": 7.0297, + "step": 13590 + }, + { + "epoch": 1.2681720630773539, + "grad_norm": 0.7162384994876851, + "learning_rate": 0.00027509996385379474, + "loss": 7.1787, + "step": 13591 + }, + { + "epoch": 1.268265372772231, + "grad_norm": 0.5518187196020413, + "learning_rate": 0.00027509579026469206, + "loss": 6.9154, + "step": 13592 + }, + { + "epoch": 1.2683586824671083, + "grad_norm": 0.5110244648355319, + "learning_rate": 0.00027509161635750777, + "loss": 7.1978, + "step": 13593 + }, + { + "epoch": 1.2684519921619857, + "grad_norm": 0.5052259135338125, + "learning_rate": 0.00027508744213225255, + "loss": 7.1097, + "step": 13594 + }, + { + "epoch": 1.2685453018568629, + "grad_norm": 0.45431177855318194, + "learning_rate": 0.0002750832675889371, + "loss": 7.1504, + "step": 13595 + }, + { + "epoch": 1.2686386115517403, + "grad_norm": 0.46502887498504747, + "learning_rate": 0.0002750790927275719, + "loss": 7.3088, + "step": 13596 + }, + { + "epoch": 1.2687319212466175, + "grad_norm": 42470.90317686658, + "learning_rate": 0.0002750749175481676, + "loss": 7.0935, + "step": 13597 + }, + { + "epoch": 1.268825230941495, + "grad_norm": 0.47424148474638345, + "learning_rate": 0.00027507074205073486, + "loss": 7.0653, + "step": 13598 + }, + { + "epoch": 1.268918540636372, + "grad_norm": 0.6704528616268616, + "learning_rate": 0.00027506656623528427, + "loss": 7.1635, + "step": 13599 + }, + { + "epoch": 1.2690118503312493, + "grad_norm": 1.2356641909473423, + "learning_rate": 0.00027506239010182646, + "loss": 6.8617, + "step": 13600 + }, + { + "epoch": 1.2691051600261267, + "grad_norm": 68224.32365106304, + "learning_rate": 0.000275058213650372, + "loss": 7.4653, + "step": 13601 + }, + { + "epoch": 1.2691984697210041, + "grad_norm": 30331.729999051287, + "learning_rate": 0.0002750540368809316, + "loss": 7.1395, + "step": 13602 + }, + { + "epoch": 1.2692917794158813, + "grad_norm": 0.6113932839914036, + "learning_rate": 0.00027504985979351576, + "loss": 7.3283, + "step": 13603 + }, + { + "epoch": 1.2693850891107585, + "grad_norm": 0.5449618476686253, + "learning_rate": 0.0002750456823881352, + "loss": 7.3606, + "step": 13604 + }, + { + "epoch": 1.269478398805636, + "grad_norm": 0.5062228324842667, + "learning_rate": 0.0002750415046648005, + "loss": 7.4245, + "step": 13605 + }, + { + "epoch": 1.2695717085005132, + "grad_norm": 101985.56822094436, + "learning_rate": 0.00027503732662352234, + "loss": 7.0105, + "step": 13606 + }, + { + "epoch": 1.2696650181953906, + "grad_norm": 0.7712903351078735, + "learning_rate": 0.00027503314826431125, + "loss": 7.5618, + "step": 13607 + }, + { + "epoch": 1.2697583278902678, + "grad_norm": 0.7384376902746357, + "learning_rate": 0.00027502896958717793, + "loss": 7.0675, + "step": 13608 + }, + { + "epoch": 1.2698516375851452, + "grad_norm": 0.4764782048463415, + "learning_rate": 0.000275024790592133, + "loss": 7.0779, + "step": 13609 + }, + { + "epoch": 1.2699449472800224, + "grad_norm": 0.9767189223906196, + "learning_rate": 0.000275020611279187, + "loss": 7.5321, + "step": 13610 + }, + { + "epoch": 1.2700382569748996, + "grad_norm": 0.5161684934488172, + "learning_rate": 0.00027501643164835063, + "loss": 7.2136, + "step": 13611 + }, + { + "epoch": 1.270131566669777, + "grad_norm": 40196.3212571733, + "learning_rate": 0.00027501225169963454, + "loss": 7.2914, + "step": 13612 + }, + { + "epoch": 1.2702248763646544, + "grad_norm": 10768.40005289182, + "learning_rate": 0.0002750080714330493, + "loss": 7.319, + "step": 13613 + }, + { + "epoch": 1.2703181860595316, + "grad_norm": 0.560853724497804, + "learning_rate": 0.00027500389084860556, + "loss": 7.468, + "step": 13614 + }, + { + "epoch": 1.2704114957544088, + "grad_norm": 0.6378334904275228, + "learning_rate": 0.000274999709946314, + "loss": 7.0247, + "step": 13615 + }, + { + "epoch": 1.2705048054492862, + "grad_norm": 0.599898505215461, + "learning_rate": 0.0002749955287261851, + "loss": 7.086, + "step": 13616 + }, + { + "epoch": 1.2705981151441634, + "grad_norm": 49962.33383427006, + "learning_rate": 0.0002749913471882297, + "loss": 7.0346, + "step": 13617 + }, + { + "epoch": 1.2706914248390409, + "grad_norm": 1.5949358928732493, + "learning_rate": 0.0002749871653324583, + "loss": 7.5046, + "step": 13618 + }, + { + "epoch": 1.270784734533918, + "grad_norm": 0.580209331331494, + "learning_rate": 0.00027498298315888157, + "loss": 7.4167, + "step": 13619 + }, + { + "epoch": 1.2708780442287955, + "grad_norm": 0.8490063740521646, + "learning_rate": 0.00027497880066751015, + "loss": 7.2846, + "step": 13620 + }, + { + "epoch": 1.2709713539236727, + "grad_norm": 1.302737305866936, + "learning_rate": 0.00027497461785835463, + "loss": 7.2961, + "step": 13621 + }, + { + "epoch": 1.2710646636185499, + "grad_norm": 1.14495411770218, + "learning_rate": 0.00027497043473142564, + "loss": 7.1866, + "step": 13622 + }, + { + "epoch": 1.2711579733134273, + "grad_norm": 0.435656798299205, + "learning_rate": 0.0002749662512867339, + "loss": 7.2818, + "step": 13623 + }, + { + "epoch": 1.2712512830083047, + "grad_norm": 30953.777802828856, + "learning_rate": 0.00027496206752429005, + "loss": 7.2032, + "step": 13624 + }, + { + "epoch": 1.271344592703182, + "grad_norm": 1.7065746612013377, + "learning_rate": 0.00027495788344410465, + "loss": 7.3024, + "step": 13625 + }, + { + "epoch": 1.271437902398059, + "grad_norm": 0.9154638831575082, + "learning_rate": 0.0002749536990461883, + "loss": 7.2776, + "step": 13626 + }, + { + "epoch": 1.2715312120929365, + "grad_norm": 0.5898064923421953, + "learning_rate": 0.00027494951433055175, + "loss": 7.302, + "step": 13627 + }, + { + "epoch": 1.2716245217878137, + "grad_norm": 1.2742300789181913, + "learning_rate": 0.0002749453292972056, + "loss": 7.3886, + "step": 13628 + }, + { + "epoch": 1.2717178314826911, + "grad_norm": 1.5303557100153171, + "learning_rate": 0.00027494114394616056, + "loss": 7.2335, + "step": 13629 + }, + { + "epoch": 1.2718111411775683, + "grad_norm": 20776.7451825347, + "learning_rate": 0.00027493695827742707, + "loss": 6.9906, + "step": 13630 + }, + { + "epoch": 1.2719044508724457, + "grad_norm": 0.9119471969779768, + "learning_rate": 0.000274932772291016, + "loss": 7.2054, + "step": 13631 + }, + { + "epoch": 1.271997760567323, + "grad_norm": 1.7042696905079924, + "learning_rate": 0.0002749285859869378, + "loss": 7.1884, + "step": 13632 + }, + { + "epoch": 1.2720910702622001, + "grad_norm": 2.1821520513246124, + "learning_rate": 0.00027492439936520335, + "loss": 7.6072, + "step": 13633 + }, + { + "epoch": 1.2721843799570776, + "grad_norm": 0.7549059112769877, + "learning_rate": 0.0002749202124258231, + "loss": 7.3491, + "step": 13634 + }, + { + "epoch": 1.2722776896519548, + "grad_norm": 1.2278761632910993, + "learning_rate": 0.0002749160251688077, + "loss": 7.2741, + "step": 13635 + }, + { + "epoch": 1.2723709993468322, + "grad_norm": 1.4977636461272599, + "learning_rate": 0.00027491183759416793, + "loss": 7.4681, + "step": 13636 + }, + { + "epoch": 1.2724643090417094, + "grad_norm": 9749.615639784754, + "learning_rate": 0.0002749076497019143, + "loss": 7.2214, + "step": 13637 + }, + { + "epoch": 1.2725576187365868, + "grad_norm": 0.5966625718741302, + "learning_rate": 0.0002749034614920575, + "loss": 7.3914, + "step": 13638 + }, + { + "epoch": 1.272650928431464, + "grad_norm": 1.2373192449292485, + "learning_rate": 0.00027489927296460827, + "loss": 7.3194, + "step": 13639 + }, + { + "epoch": 1.2727442381263414, + "grad_norm": 1.4272372982287358, + "learning_rate": 0.0002748950841195771, + "loss": 7.4536, + "step": 13640 + }, + { + "epoch": 1.2728375478212186, + "grad_norm": 0.40683086567593507, + "learning_rate": 0.0002748908949569748, + "loss": 7.0214, + "step": 13641 + }, + { + "epoch": 1.272930857516096, + "grad_norm": 0.8516240907116122, + "learning_rate": 0.00027488670547681196, + "loss": 7.6962, + "step": 13642 + }, + { + "epoch": 1.2730241672109732, + "grad_norm": 59904.378744966205, + "learning_rate": 0.0002748825156790992, + "loss": 7.1074, + "step": 13643 + }, + { + "epoch": 1.2731174769058504, + "grad_norm": 0.6090728422253653, + "learning_rate": 0.00027487832556384714, + "loss": 7.5268, + "step": 13644 + }, + { + "epoch": 1.2732107866007278, + "grad_norm": 0.7441946188986911, + "learning_rate": 0.00027487413513106653, + "loss": 7.3115, + "step": 13645 + }, + { + "epoch": 1.273304096295605, + "grad_norm": 0.4763283645949777, + "learning_rate": 0.000274869944380768, + "loss": 7.364, + "step": 13646 + }, + { + "epoch": 1.2733974059904825, + "grad_norm": 0.45706287609021734, + "learning_rate": 0.0002748657533129622, + "loss": 7.065, + "step": 13647 + }, + { + "epoch": 1.2734907156853597, + "grad_norm": 40670.13536296274, + "learning_rate": 0.0002748615619276598, + "loss": 7.3596, + "step": 13648 + }, + { + "epoch": 1.273584025380237, + "grad_norm": 1.6331606496910889, + "learning_rate": 0.0002748573702248713, + "loss": 7.4585, + "step": 13649 + }, + { + "epoch": 1.2736773350751143, + "grad_norm": 0.5874927050499552, + "learning_rate": 0.0002748531782046076, + "loss": 7.0402, + "step": 13650 + }, + { + "epoch": 1.2737706447699915, + "grad_norm": 0.6020038092811901, + "learning_rate": 0.00027484898586687924, + "loss": 7.4345, + "step": 13651 + }, + { + "epoch": 1.2738639544648689, + "grad_norm": 1.6604002796459612, + "learning_rate": 0.0002748447932116969, + "loss": 6.8592, + "step": 13652 + }, + { + "epoch": 1.2739572641597463, + "grad_norm": 0.6225274412810138, + "learning_rate": 0.0002748406002390712, + "loss": 7.3567, + "step": 13653 + }, + { + "epoch": 1.2740505738546235, + "grad_norm": 0.5223538176577782, + "learning_rate": 0.0002748364069490129, + "loss": 6.9247, + "step": 13654 + }, + { + "epoch": 1.2741438835495007, + "grad_norm": 115199.78761344921, + "learning_rate": 0.0002748322133415325, + "loss": 7.9254, + "step": 13655 + }, + { + "epoch": 1.2742371932443781, + "grad_norm": 0.4169251573672673, + "learning_rate": 0.00027482801941664084, + "loss": 6.998, + "step": 13656 + }, + { + "epoch": 1.2743305029392553, + "grad_norm": 0.508319827302103, + "learning_rate": 0.0002748238251743485, + "loss": 7.3796, + "step": 13657 + }, + { + "epoch": 1.2744238126341327, + "grad_norm": 1.2042929013473969, + "learning_rate": 0.0002748196306146661, + "loss": 7.2885, + "step": 13658 + }, + { + "epoch": 1.27451712232901, + "grad_norm": 1.329686826400617, + "learning_rate": 0.0002748154357376044, + "loss": 7.1617, + "step": 13659 + }, + { + "epoch": 1.2746104320238874, + "grad_norm": 24403.49418927844, + "learning_rate": 0.00027481124054317394, + "loss": 7.6341, + "step": 13660 + }, + { + "epoch": 1.2747037417187645, + "grad_norm": 0.5790824328473685, + "learning_rate": 0.00027480704503138554, + "loss": 7.1176, + "step": 13661 + }, + { + "epoch": 1.2747970514136417, + "grad_norm": 0.8210290795072615, + "learning_rate": 0.0002748028492022498, + "loss": 7.3949, + "step": 13662 + }, + { + "epoch": 1.2748903611085192, + "grad_norm": 0.5390805184938073, + "learning_rate": 0.00027479865305577735, + "loss": 7.114, + "step": 13663 + }, + { + "epoch": 1.2749836708033966, + "grad_norm": 0.45412600759765875, + "learning_rate": 0.00027479445659197893, + "loss": 7.0491, + "step": 13664 + }, + { + "epoch": 1.2750769804982738, + "grad_norm": 1.0093927352163305, + "learning_rate": 0.0002747902598108651, + "loss": 7.481, + "step": 13665 + }, + { + "epoch": 1.275170290193151, + "grad_norm": 786230.011693575, + "learning_rate": 0.0002747860627124467, + "loss": 7.2777, + "step": 13666 + }, + { + "epoch": 1.2752635998880284, + "grad_norm": 0.6692974998590829, + "learning_rate": 0.0002747818652967342, + "loss": 7.3181, + "step": 13667 + }, + { + "epoch": 1.2753569095829056, + "grad_norm": 243858.40404249943, + "learning_rate": 0.0002747776675637385, + "loss": 7.5125, + "step": 13668 + }, + { + "epoch": 1.275450219277783, + "grad_norm": 0.5462794012802862, + "learning_rate": 0.00027477346951347006, + "loss": 7.3761, + "step": 13669 + }, + { + "epoch": 1.2755435289726602, + "grad_norm": 1.1637888140301111, + "learning_rate": 0.0002747692711459397, + "loss": 7.1105, + "step": 13670 + }, + { + "epoch": 1.2756368386675376, + "grad_norm": 0.7174028686317077, + "learning_rate": 0.000274765072461158, + "loss": 7.117, + "step": 13671 + }, + { + "epoch": 1.2757301483624148, + "grad_norm": 0.38476392120027875, + "learning_rate": 0.0002747608734591357, + "loss": 6.9355, + "step": 13672 + }, + { + "epoch": 1.275823458057292, + "grad_norm": 1.6923360549207198, + "learning_rate": 0.0002747566741398835, + "loss": 7.2503, + "step": 13673 + }, + { + "epoch": 1.2759167677521694, + "grad_norm": 2.180238693671447, + "learning_rate": 0.00027475247450341195, + "loss": 7.3451, + "step": 13674 + }, + { + "epoch": 1.2760100774470469, + "grad_norm": 0.8447181996876877, + "learning_rate": 0.0002747482745497319, + "loss": 7.3786, + "step": 13675 + }, + { + "epoch": 1.276103387141924, + "grad_norm": 0.8950372769635293, + "learning_rate": 0.00027474407427885384, + "loss": 7.2781, + "step": 13676 + }, + { + "epoch": 1.2761966968368013, + "grad_norm": 2.0374702026601867, + "learning_rate": 0.00027473987369078863, + "loss": 7.2797, + "step": 13677 + }, + { + "epoch": 1.2762900065316787, + "grad_norm": 188171.53070544483, + "learning_rate": 0.00027473567278554686, + "loss": 7.4593, + "step": 13678 + }, + { + "epoch": 1.2763833162265559, + "grad_norm": 34805.30448043394, + "learning_rate": 0.0002747314715631392, + "loss": 7.2595, + "step": 13679 + }, + { + "epoch": 1.2764766259214333, + "grad_norm": 1.3837679871909712, + "learning_rate": 0.00027472727002357636, + "loss": 7.4052, + "step": 13680 + }, + { + "epoch": 1.2765699356163105, + "grad_norm": 0.8238321412715451, + "learning_rate": 0.00027472306816686906, + "loss": 7.6563, + "step": 13681 + }, + { + "epoch": 1.276663245311188, + "grad_norm": 0.7838879385790252, + "learning_rate": 0.0002747188659930279, + "loss": 6.8986, + "step": 13682 + }, + { + "epoch": 1.276756555006065, + "grad_norm": 1.1391205338079993, + "learning_rate": 0.00027471466350206363, + "loss": 7.3099, + "step": 13683 + }, + { + "epoch": 1.2768498647009423, + "grad_norm": 1.2990312248711258, + "learning_rate": 0.00027471046069398687, + "loss": 7.2608, + "step": 13684 + }, + { + "epoch": 1.2769431743958197, + "grad_norm": 1.204766871588129, + "learning_rate": 0.0002747062575688084, + "loss": 7.2046, + "step": 13685 + }, + { + "epoch": 1.2770364840906971, + "grad_norm": 0.9936201519697084, + "learning_rate": 0.00027470205412653885, + "loss": 7.3699, + "step": 13686 + }, + { + "epoch": 1.2771297937855743, + "grad_norm": 0.7786560730329636, + "learning_rate": 0.0002746978503671889, + "loss": 7.449, + "step": 13687 + }, + { + "epoch": 1.2772231034804515, + "grad_norm": 0.8218179147301412, + "learning_rate": 0.00027469364629076923, + "loss": 7.4384, + "step": 13688 + }, + { + "epoch": 1.277316413175329, + "grad_norm": 0.6319986588261449, + "learning_rate": 0.00027468944189729057, + "loss": 7.5894, + "step": 13689 + }, + { + "epoch": 1.2774097228702062, + "grad_norm": 0.7495498907201626, + "learning_rate": 0.00027468523718676365, + "loss": 7.483, + "step": 13690 + }, + { + "epoch": 1.2775030325650836, + "grad_norm": 0.7079243689303748, + "learning_rate": 0.00027468103215919906, + "loss": 7.6428, + "step": 13691 + }, + { + "epoch": 1.2775963422599608, + "grad_norm": 1.1882430792125807, + "learning_rate": 0.00027467682681460757, + "loss": 7.0305, + "step": 13692 + }, + { + "epoch": 1.2776896519548382, + "grad_norm": 0.8796424522941438, + "learning_rate": 0.0002746726211529998, + "loss": 7.7189, + "step": 13693 + }, + { + "epoch": 1.2777829616497154, + "grad_norm": 1.3394325822722504, + "learning_rate": 0.0002746684151743865, + "loss": 7.4325, + "step": 13694 + }, + { + "epoch": 1.2778762713445926, + "grad_norm": 0.6492128683536358, + "learning_rate": 0.00027466420887877835, + "loss": 7.1156, + "step": 13695 + }, + { + "epoch": 1.27796958103947, + "grad_norm": 1.1767928592417654, + "learning_rate": 0.000274660002266186, + "loss": 7.7381, + "step": 13696 + }, + { + "epoch": 1.2780628907343474, + "grad_norm": 0.9270817687882827, + "learning_rate": 0.0002746557953366203, + "loss": 7.4429, + "step": 13697 + }, + { + "epoch": 1.2781562004292246, + "grad_norm": 1.3491192201671445, + "learning_rate": 0.00027465158809009176, + "loss": 7.4469, + "step": 13698 + }, + { + "epoch": 1.2782495101241018, + "grad_norm": 1.607531966623349, + "learning_rate": 0.0002746473805266112, + "loss": 7.3087, + "step": 13699 + }, + { + "epoch": 1.2783428198189792, + "grad_norm": 1.0565772492681569, + "learning_rate": 0.0002746431726461892, + "loss": 7.4672, + "step": 13700 + }, + { + "epoch": 1.2784361295138564, + "grad_norm": 0.8190280015952218, + "learning_rate": 0.0002746389644488366, + "loss": 7.3845, + "step": 13701 + }, + { + "epoch": 1.2785294392087339, + "grad_norm": 129440.45927825154, + "learning_rate": 0.000274634755934564, + "loss": 7.1228, + "step": 13702 + }, + { + "epoch": 1.278622748903611, + "grad_norm": 0.6988887733371083, + "learning_rate": 0.0002746305471033822, + "loss": 7.427, + "step": 13703 + }, + { + "epoch": 1.2787160585984885, + "grad_norm": 1.269174193800774, + "learning_rate": 0.00027462633795530175, + "loss": 7.2134, + "step": 13704 + }, + { + "epoch": 1.2788093682933657, + "grad_norm": 1.034341874712579, + "learning_rate": 0.00027462212849033347, + "loss": 7.4631, + "step": 13705 + }, + { + "epoch": 1.2789026779882429, + "grad_norm": 1.1988418690280858, + "learning_rate": 0.00027461791870848803, + "loss": 7.6445, + "step": 13706 + }, + { + "epoch": 1.2789959876831203, + "grad_norm": 1.4049662349305543, + "learning_rate": 0.00027461370860977616, + "loss": 7.2721, + "step": 13707 + }, + { + "epoch": 1.2790892973779977, + "grad_norm": 0.812036658775282, + "learning_rate": 0.00027460949819420853, + "loss": 7.2649, + "step": 13708 + }, + { + "epoch": 1.279182607072875, + "grad_norm": 10427.201473598896, + "learning_rate": 0.0002746052874617958, + "loss": 7.6503, + "step": 13709 + }, + { + "epoch": 1.279275916767752, + "grad_norm": 0.6300413953873928, + "learning_rate": 0.0002746010764125489, + "loss": 7.0696, + "step": 13710 + }, + { + "epoch": 1.2793692264626295, + "grad_norm": 0.6906236942587536, + "learning_rate": 0.0002745968650464782, + "loss": 7.3381, + "step": 13711 + }, + { + "epoch": 1.2794625361575067, + "grad_norm": 0.9316805736822655, + "learning_rate": 0.00027459265336359466, + "loss": 7.4525, + "step": 13712 + }, + { + "epoch": 1.2795558458523841, + "grad_norm": 0.9141085591317266, + "learning_rate": 0.00027458844136390894, + "loss": 7.4707, + "step": 13713 + }, + { + "epoch": 1.2796491555472613, + "grad_norm": 0.96241449584265, + "learning_rate": 0.0002745842290474316, + "loss": 7.4846, + "step": 13714 + }, + { + "epoch": 1.2797424652421387, + "grad_norm": 1.072911690917532, + "learning_rate": 0.0002745800164141736, + "loss": 7.2856, + "step": 13715 + }, + { + "epoch": 1.279835774937016, + "grad_norm": 1.0231891578048538, + "learning_rate": 0.0002745758034641454, + "loss": 7.3785, + "step": 13716 + }, + { + "epoch": 1.2799290846318931, + "grad_norm": 0.7345603141427182, + "learning_rate": 0.00027457159019735793, + "loss": 7.1999, + "step": 13717 + }, + { + "epoch": 1.2800223943267706, + "grad_norm": 1.1302601975677615, + "learning_rate": 0.0002745673766138218, + "loss": 7.3282, + "step": 13718 + }, + { + "epoch": 1.280115704021648, + "grad_norm": 1.62957127740643, + "learning_rate": 0.0002745631627135477, + "loss": 7.3627, + "step": 13719 + }, + { + "epoch": 1.2802090137165252, + "grad_norm": 0.8760044583481257, + "learning_rate": 0.0002745589484965464, + "loss": 7.0598, + "step": 13720 + }, + { + "epoch": 1.2803023234114024, + "grad_norm": 0.864293485775182, + "learning_rate": 0.00027455473396282854, + "loss": 7.2862, + "step": 13721 + }, + { + "epoch": 1.2803956331062798, + "grad_norm": 0.6695006720585426, + "learning_rate": 0.00027455051911240495, + "loss": 7.2959, + "step": 13722 + }, + { + "epoch": 1.280488942801157, + "grad_norm": 1.051447769907228, + "learning_rate": 0.00027454630394528623, + "loss": 7.2036, + "step": 13723 + }, + { + "epoch": 1.2805822524960344, + "grad_norm": 0.6487767660457744, + "learning_rate": 0.0002745420884614832, + "loss": 7.2193, + "step": 13724 + }, + { + "epoch": 1.2806755621909116, + "grad_norm": 0.6976036006908272, + "learning_rate": 0.0002745378726610065, + "loss": 6.9642, + "step": 13725 + }, + { + "epoch": 1.280768871885789, + "grad_norm": 0.876022388461773, + "learning_rate": 0.0002745336565438669, + "loss": 7.2111, + "step": 13726 + }, + { + "epoch": 1.2808621815806662, + "grad_norm": 0.6693150226534516, + "learning_rate": 0.0002745294401100751, + "loss": 7.0939, + "step": 13727 + }, + { + "epoch": 1.2809554912755434, + "grad_norm": 1.0434459348767993, + "learning_rate": 0.0002745252233596418, + "loss": 7.3546, + "step": 13728 + }, + { + "epoch": 1.2810488009704208, + "grad_norm": 0.9429479897834387, + "learning_rate": 0.00027452100629257774, + "loss": 7.0128, + "step": 13729 + }, + { + "epoch": 1.2811421106652983, + "grad_norm": 1.0757109704693575, + "learning_rate": 0.00027451678890889366, + "loss": 7.2336, + "step": 13730 + }, + { + "epoch": 1.2812354203601755, + "grad_norm": 0.8552160226568088, + "learning_rate": 0.0002745125712086003, + "loss": 7.3029, + "step": 13731 + }, + { + "epoch": 1.2813287300550527, + "grad_norm": 0.6518721348997502, + "learning_rate": 0.0002745083531917083, + "loss": 7.1796, + "step": 13732 + }, + { + "epoch": 1.28142203974993, + "grad_norm": 6463.966622294714, + "learning_rate": 0.0002745041348582285, + "loss": 7.5326, + "step": 13733 + }, + { + "epoch": 1.2815153494448073, + "grad_norm": 25839.05032032639, + "learning_rate": 0.0002744999162081715, + "loss": 7.1044, + "step": 13734 + }, + { + "epoch": 1.2816086591396847, + "grad_norm": 1.1051762295896745, + "learning_rate": 0.00027449569724154817, + "loss": 7.1568, + "step": 13735 + }, + { + "epoch": 1.2817019688345619, + "grad_norm": 1.0048498224193103, + "learning_rate": 0.0002744914779583691, + "loss": 7.4021, + "step": 13736 + }, + { + "epoch": 1.2817952785294393, + "grad_norm": 0.794666989653789, + "learning_rate": 0.0002744872583586451, + "loss": 7.2492, + "step": 13737 + }, + { + "epoch": 1.2818885882243165, + "grad_norm": 62440.20137601622, + "learning_rate": 0.00027448303844238687, + "loss": 7.1407, + "step": 13738 + }, + { + "epoch": 1.2819818979191937, + "grad_norm": 0.5924865616367564, + "learning_rate": 0.0002744788182096052, + "loss": 7.3728, + "step": 13739 + }, + { + "epoch": 1.2820752076140711, + "grad_norm": 0.6275334173987853, + "learning_rate": 0.00027447459766031065, + "loss": 7.0218, + "step": 13740 + }, + { + "epoch": 1.2821685173089483, + "grad_norm": 1.2067395626175665, + "learning_rate": 0.0002744703767945142, + "loss": 7.5177, + "step": 13741 + }, + { + "epoch": 1.2822618270038257, + "grad_norm": 11027.569719520565, + "learning_rate": 0.00027446615561222635, + "loss": 7.0348, + "step": 13742 + }, + { + "epoch": 1.282355136698703, + "grad_norm": 270721.5308803159, + "learning_rate": 0.00027446193411345796, + "loss": 7.2661, + "step": 13743 + }, + { + "epoch": 1.2824484463935804, + "grad_norm": 128466.85494099818, + "learning_rate": 0.0002744577122982198, + "loss": 7.2223, + "step": 13744 + }, + { + "epoch": 1.2825417560884576, + "grad_norm": 114098.21913148412, + "learning_rate": 0.0002744534901665225, + "loss": 7.1009, + "step": 13745 + }, + { + "epoch": 1.282635065783335, + "grad_norm": 2.3497508907372042, + "learning_rate": 0.00027444926771837683, + "loss": 6.9477, + "step": 13746 + }, + { + "epoch": 1.2827283754782122, + "grad_norm": 0.9564796699109172, + "learning_rate": 0.00027444504495379357, + "loss": 7.4762, + "step": 13747 + }, + { + "epoch": 1.2828216851730896, + "grad_norm": 17044.336321470546, + "learning_rate": 0.0002744408218727834, + "loss": 7.2186, + "step": 13748 + }, + { + "epoch": 1.2829149948679668, + "grad_norm": 0.8453258034081146, + "learning_rate": 0.0002744365984753571, + "loss": 7.0594, + "step": 13749 + }, + { + "epoch": 1.283008304562844, + "grad_norm": 1.4781833952793644, + "learning_rate": 0.0002744323747615254, + "loss": 7.3887, + "step": 13750 + }, + { + "epoch": 1.2831016142577214, + "grad_norm": 59509.342497149875, + "learning_rate": 0.000274428150731299, + "loss": 7.3543, + "step": 13751 + }, + { + "epoch": 1.2831949239525986, + "grad_norm": 1.2085057879168113, + "learning_rate": 0.0002744239263846886, + "loss": 7.1898, + "step": 13752 + }, + { + "epoch": 1.283288233647476, + "grad_norm": 0.7445073178726244, + "learning_rate": 0.00027441970172170513, + "loss": 7.2048, + "step": 13753 + }, + { + "epoch": 1.2833815433423532, + "grad_norm": 50302.445616651894, + "learning_rate": 0.0002744154767423592, + "loss": 7.2595, + "step": 13754 + }, + { + "epoch": 1.2834748530372306, + "grad_norm": 2.2302915782873285, + "learning_rate": 0.0002744112514466616, + "loss": 7.8764, + "step": 13755 + }, + { + "epoch": 1.2835681627321078, + "grad_norm": 1.3224274405398215, + "learning_rate": 0.0002744070258346229, + "loss": 7.3681, + "step": 13756 + }, + { + "epoch": 1.283661472426985, + "grad_norm": 125734.66780674963, + "learning_rate": 0.00027440279990625406, + "loss": 7.3942, + "step": 13757 + }, + { + "epoch": 1.2837547821218624, + "grad_norm": 0.7638045924146816, + "learning_rate": 0.0002743985736615658, + "loss": 7.4247, + "step": 13758 + }, + { + "epoch": 1.2838480918167399, + "grad_norm": 1230123.0867023352, + "learning_rate": 0.0002743943471005688, + "loss": 7.2753, + "step": 13759 + }, + { + "epoch": 1.283941401511617, + "grad_norm": 2.300686378898132, + "learning_rate": 0.0002743901202232738, + "loss": 7.4934, + "step": 13760 + }, + { + "epoch": 1.2840347112064943, + "grad_norm": 3.149676207725008, + "learning_rate": 0.0002743858930296916, + "loss": 6.994, + "step": 13761 + }, + { + "epoch": 1.2841280209013717, + "grad_norm": 2.1901023926491705, + "learning_rate": 0.00027438166551983285, + "loss": 7.4065, + "step": 13762 + }, + { + "epoch": 1.2842213305962489, + "grad_norm": 1.6247986884307015, + "learning_rate": 0.0002743774376937084, + "loss": 7.5457, + "step": 13763 + }, + { + "epoch": 1.2843146402911263, + "grad_norm": 1.7390809943431234, + "learning_rate": 0.000274373209551329, + "loss": 7.6401, + "step": 13764 + }, + { + "epoch": 1.2844079499860035, + "grad_norm": 2.2692816938235807, + "learning_rate": 0.0002743689810927053, + "loss": 7.6905, + "step": 13765 + }, + { + "epoch": 1.284501259680881, + "grad_norm": 1.3990972769353964, + "learning_rate": 0.0002743647523178482, + "loss": 7.6131, + "step": 13766 + }, + { + "epoch": 1.284594569375758, + "grad_norm": 1.858843410218534, + "learning_rate": 0.00027436052322676836, + "loss": 7.9837, + "step": 13767 + }, + { + "epoch": 1.2846878790706353, + "grad_norm": 1.9786999267209848, + "learning_rate": 0.0002743562938194765, + "loss": 7.542, + "step": 13768 + }, + { + "epoch": 1.2847811887655127, + "grad_norm": 4.000175793759647, + "learning_rate": 0.00027435206409598346, + "loss": 7.6142, + "step": 13769 + }, + { + "epoch": 1.2848744984603901, + "grad_norm": 2.248916797357584, + "learning_rate": 0.00027434783405629997, + "loss": 7.6898, + "step": 13770 + }, + { + "epoch": 1.2849678081552673, + "grad_norm": 2.0565193495699132, + "learning_rate": 0.0002743436037004368, + "loss": 7.3118, + "step": 13771 + }, + { + "epoch": 1.2850611178501445, + "grad_norm": 1.0923067406340041, + "learning_rate": 0.00027433937302840457, + "loss": 7.8643, + "step": 13772 + }, + { + "epoch": 1.285154427545022, + "grad_norm": 1.040647585937581, + "learning_rate": 0.00027433514204021423, + "loss": 7.1244, + "step": 13773 + }, + { + "epoch": 1.2852477372398992, + "grad_norm": 1.970898989850542, + "learning_rate": 0.00027433091073587645, + "loss": 7.3346, + "step": 13774 + }, + { + "epoch": 1.2853410469347766, + "grad_norm": 1.9021587629259227, + "learning_rate": 0.00027432667911540196, + "loss": 7.7331, + "step": 13775 + }, + { + "epoch": 1.2854343566296538, + "grad_norm": 1.2237749534421094, + "learning_rate": 0.0002743224471788016, + "loss": 7.4549, + "step": 13776 + }, + { + "epoch": 1.2855276663245312, + "grad_norm": 1.2260177175249354, + "learning_rate": 0.00027431821492608605, + "loss": 7.5278, + "step": 13777 + }, + { + "epoch": 1.2856209760194084, + "grad_norm": 1.5572526701076432, + "learning_rate": 0.0002743139823572661, + "loss": 7.6556, + "step": 13778 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 13.24812611342915, + "learning_rate": 0.00027430974947235254, + "loss": 7.5729, + "step": 13779 + }, + { + "epoch": 1.285807595409163, + "grad_norm": 1.247779282840491, + "learning_rate": 0.0002743055162713561, + "loss": 7.2792, + "step": 13780 + }, + { + "epoch": 1.2859009051040404, + "grad_norm": 3.150699641883662, + "learning_rate": 0.00027430128275428756, + "loss": 7.4371, + "step": 13781 + }, + { + "epoch": 1.2859942147989176, + "grad_norm": 0.797281863601806, + "learning_rate": 0.00027429704892115766, + "loss": 7.4055, + "step": 13782 + }, + { + "epoch": 1.2860875244937948, + "grad_norm": 495.1311365018171, + "learning_rate": 0.0002742928147719772, + "loss": 7.1621, + "step": 13783 + }, + { + "epoch": 1.2861808341886722, + "grad_norm": 1.101483105458101, + "learning_rate": 0.0002742885803067569, + "loss": 7.2745, + "step": 13784 + }, + { + "epoch": 1.2862741438835494, + "grad_norm": 1.265860679760088, + "learning_rate": 0.0002742843455255076, + "loss": 7.4101, + "step": 13785 + }, + { + "epoch": 1.2863674535784269, + "grad_norm": 1.373856639646187, + "learning_rate": 0.00027428011042824, + "loss": 7.5335, + "step": 13786 + }, + { + "epoch": 1.286460763273304, + "grad_norm": 0.7941294296064689, + "learning_rate": 0.00027427587501496493, + "loss": 7.2195, + "step": 13787 + }, + { + "epoch": 1.2865540729681815, + "grad_norm": 1.7593270971393884, + "learning_rate": 0.0002742716392856931, + "loss": 7.066, + "step": 13788 + }, + { + "epoch": 1.2866473826630587, + "grad_norm": 1.0595116925963577, + "learning_rate": 0.0002742674032404353, + "loss": 7.3637, + "step": 13789 + }, + { + "epoch": 1.2867406923579359, + "grad_norm": 0.8530008220026734, + "learning_rate": 0.0002742631668792023, + "loss": 7.3418, + "step": 13790 + }, + { + "epoch": 1.2868340020528133, + "grad_norm": 0.6491403377335263, + "learning_rate": 0.0002742589302020049, + "loss": 7.3488, + "step": 13791 + }, + { + "epoch": 1.2869273117476907, + "grad_norm": 79.67925670769748, + "learning_rate": 0.00027425469320885385, + "loss": 7.2162, + "step": 13792 + }, + { + "epoch": 1.287020621442568, + "grad_norm": 1.1872243640004607, + "learning_rate": 0.00027425045589975986, + "loss": 7.1294, + "step": 13793 + }, + { + "epoch": 1.287113931137445, + "grad_norm": 3.3330241548199955, + "learning_rate": 0.0002742462182747338, + "loss": 7.1593, + "step": 13794 + }, + { + "epoch": 1.2872072408323225, + "grad_norm": 1.1980732669055736, + "learning_rate": 0.0002742419803337864, + "loss": 7.4961, + "step": 13795 + }, + { + "epoch": 1.2873005505271997, + "grad_norm": 0.7275059960424264, + "learning_rate": 0.0002742377420769285, + "loss": 7.1422, + "step": 13796 + }, + { + "epoch": 1.2873938602220771, + "grad_norm": 1.5337562752087985, + "learning_rate": 0.0002742335035041708, + "loss": 7.272, + "step": 13797 + }, + { + "epoch": 1.2874871699169543, + "grad_norm": 1.0468757784724727, + "learning_rate": 0.00027422926461552407, + "loss": 7.0384, + "step": 13798 + }, + { + "epoch": 1.2875804796118318, + "grad_norm": 1.4207254389360375, + "learning_rate": 0.00027422502541099916, + "loss": 7.1422, + "step": 13799 + }, + { + "epoch": 1.287673789306709, + "grad_norm": 0.6849846761612417, + "learning_rate": 0.00027422078589060676, + "loss": 7.08, + "step": 13800 + }, + { + "epoch": 1.2877670990015861, + "grad_norm": 263.6343108937467, + "learning_rate": 0.00027421654605435774, + "loss": 7.2197, + "step": 13801 + }, + { + "epoch": 1.2878604086964636, + "grad_norm": 1.7109687437855616, + "learning_rate": 0.0002742123059022628, + "loss": 7.4127, + "step": 13802 + }, + { + "epoch": 1.287953718391341, + "grad_norm": 76.95992721649618, + "learning_rate": 0.0002742080654343328, + "loss": 7.0336, + "step": 13803 + }, + { + "epoch": 1.2880470280862182, + "grad_norm": 0.5434592270997705, + "learning_rate": 0.00027420382465057845, + "loss": 7.0883, + "step": 13804 + }, + { + "epoch": 1.2881403377810954, + "grad_norm": 0.5546916497215458, + "learning_rate": 0.0002741995835510106, + "loss": 7.2116, + "step": 13805 + }, + { + "epoch": 1.2882336474759728, + "grad_norm": 0.9749414847985506, + "learning_rate": 0.00027419534213564, + "loss": 7.3095, + "step": 13806 + }, + { + "epoch": 1.28832695717085, + "grad_norm": 1.0201279004323354, + "learning_rate": 0.00027419110040447745, + "loss": 7.3916, + "step": 13807 + }, + { + "epoch": 1.2884202668657274, + "grad_norm": 1.2533866682862165, + "learning_rate": 0.0002741868583575337, + "loss": 6.9766, + "step": 13808 + }, + { + "epoch": 1.2885135765606046, + "grad_norm": 0.8270535659065577, + "learning_rate": 0.00027418261599481954, + "loss": 7.4445, + "step": 13809 + }, + { + "epoch": 1.288606886255482, + "grad_norm": 417.1263835813288, + "learning_rate": 0.00027417837331634575, + "loss": 7.1951, + "step": 13810 + }, + { + "epoch": 1.2887001959503592, + "grad_norm": 258.2486105587728, + "learning_rate": 0.0002741741303221232, + "loss": 7.0814, + "step": 13811 + }, + { + "epoch": 1.2887935056452364, + "grad_norm": 1.2993828937966432, + "learning_rate": 0.00027416988701216256, + "loss": 7.1992, + "step": 13812 + }, + { + "epoch": 1.2888868153401138, + "grad_norm": 5.184370462915222, + "learning_rate": 0.00027416564338647473, + "loss": 7.4793, + "step": 13813 + }, + { + "epoch": 1.2889801250349913, + "grad_norm": 1.534455835894567, + "learning_rate": 0.00027416139944507037, + "loss": 7.4873, + "step": 13814 + }, + { + "epoch": 1.2890734347298685, + "grad_norm": 1.17886762301033, + "learning_rate": 0.00027415715518796043, + "loss": 7.385, + "step": 13815 + }, + { + "epoch": 1.2891667444247457, + "grad_norm": 1.914554129563003, + "learning_rate": 0.0002741529106151556, + "loss": 7.4131, + "step": 13816 + }, + { + "epoch": 1.289260054119623, + "grad_norm": 1.1612545627615316, + "learning_rate": 0.0002741486657266667, + "loss": 7.5158, + "step": 13817 + }, + { + "epoch": 1.2893533638145003, + "grad_norm": 81.83296139034739, + "learning_rate": 0.00027414442052250455, + "loss": 7.3149, + "step": 13818 + }, + { + "epoch": 1.2894466735093777, + "grad_norm": 850.7537752131549, + "learning_rate": 0.00027414017500267986, + "loss": 7.1405, + "step": 13819 + }, + { + "epoch": 1.289539983204255, + "grad_norm": 2.1185619198023176, + "learning_rate": 0.0002741359291672035, + "loss": 7.3766, + "step": 13820 + }, + { + "epoch": 1.2896332928991323, + "grad_norm": 1.5048669672817803, + "learning_rate": 0.00027413168301608623, + "loss": 7.1662, + "step": 13821 + }, + { + "epoch": 1.2897266025940095, + "grad_norm": 0.8856168880283097, + "learning_rate": 0.0002741274365493389, + "loss": 7.2457, + "step": 13822 + }, + { + "epoch": 1.2898199122888867, + "grad_norm": 0.5174709165331554, + "learning_rate": 0.0002741231897669722, + "loss": 7.4365, + "step": 13823 + }, + { + "epoch": 1.2899132219837641, + "grad_norm": 2068.7990278694824, + "learning_rate": 0.00027411894266899707, + "loss": 6.9975, + "step": 13824 + }, + { + "epoch": 1.2900065316786415, + "grad_norm": 2.032887995314816, + "learning_rate": 0.00027411469525542414, + "loss": 6.9612, + "step": 13825 + }, + { + "epoch": 1.2900998413735187, + "grad_norm": 0.5977228988281779, + "learning_rate": 0.0002741104475262644, + "loss": 7.1803, + "step": 13826 + }, + { + "epoch": 1.290193151068396, + "grad_norm": 1.2883997641848837, + "learning_rate": 0.0002741061994815285, + "loss": 7.1445, + "step": 13827 + }, + { + "epoch": 1.2902864607632734, + "grad_norm": 2.431697362857664, + "learning_rate": 0.00027410195112122736, + "loss": 7.5732, + "step": 13828 + }, + { + "epoch": 1.2903797704581506, + "grad_norm": 0.6137395668611003, + "learning_rate": 0.0002740977024453717, + "loss": 6.8221, + "step": 13829 + }, + { + "epoch": 1.290473080153028, + "grad_norm": 21.40872286881604, + "learning_rate": 0.0002740934534539723, + "loss": 7.4707, + "step": 13830 + }, + { + "epoch": 1.2905663898479052, + "grad_norm": 0.9166161070537717, + "learning_rate": 0.00027408920414704, + "loss": 7.295, + "step": 13831 + }, + { + "epoch": 1.2906596995427826, + "grad_norm": 171.40642729736214, + "learning_rate": 0.00027408495452458563, + "loss": 7.1375, + "step": 13832 + }, + { + "epoch": 1.2907530092376598, + "grad_norm": 1.2205434005638722, + "learning_rate": 0.00027408070458662, + "loss": 7.1637, + "step": 13833 + }, + { + "epoch": 1.290846318932537, + "grad_norm": 4344.006689415585, + "learning_rate": 0.0002740764543331539, + "loss": 6.8663, + "step": 13834 + }, + { + "epoch": 1.2909396286274144, + "grad_norm": 2.360742575966881, + "learning_rate": 0.0002740722037641981, + "loss": 7.481, + "step": 13835 + }, + { + "epoch": 1.2910329383222918, + "grad_norm": 273.6676340412437, + "learning_rate": 0.00027406795287976345, + "loss": 7.5483, + "step": 13836 + }, + { + "epoch": 1.291126248017169, + "grad_norm": 0.8870060816686377, + "learning_rate": 0.00027406370167986076, + "loss": 7.1637, + "step": 13837 + }, + { + "epoch": 1.2912195577120462, + "grad_norm": 0.5000700551116531, + "learning_rate": 0.00027405945016450084, + "loss": 7.4149, + "step": 13838 + }, + { + "epoch": 1.2913128674069236, + "grad_norm": 6710.162698143738, + "learning_rate": 0.00027405519833369446, + "loss": 7.2516, + "step": 13839 + }, + { + "epoch": 1.2914061771018008, + "grad_norm": 2.2008053676342096, + "learning_rate": 0.0002740509461874524, + "loss": 7.1087, + "step": 13840 + }, + { + "epoch": 1.2914994867966783, + "grad_norm": 1.9511718294592393, + "learning_rate": 0.00027404669372578565, + "loss": 7.0013, + "step": 13841 + }, + { + "epoch": 1.2915927964915555, + "grad_norm": 0.5165999883845553, + "learning_rate": 0.0002740424409487048, + "loss": 7.3456, + "step": 13842 + }, + { + "epoch": 1.2916861061864329, + "grad_norm": 1.2776697757448832, + "learning_rate": 0.00027403818785622086, + "loss": 7.1126, + "step": 13843 + }, + { + "epoch": 1.29177941588131, + "grad_norm": 2.4298336442618664, + "learning_rate": 0.0002740339344483445, + "loss": 7.2997, + "step": 13844 + }, + { + "epoch": 1.2918727255761873, + "grad_norm": 1.9061952992729154, + "learning_rate": 0.00027402968072508656, + "loss": 7.5468, + "step": 13845 + }, + { + "epoch": 1.2919660352710647, + "grad_norm": 0.8385528312941032, + "learning_rate": 0.00027402542668645793, + "loss": 7.207, + "step": 13846 + }, + { + "epoch": 1.2920593449659419, + "grad_norm": 2.0834268315934876, + "learning_rate": 0.00027402117233246936, + "loss": 7.2118, + "step": 13847 + }, + { + "epoch": 1.2921526546608193, + "grad_norm": 2.270271535383351, + "learning_rate": 0.0002740169176631317, + "loss": 7.2173, + "step": 13848 + }, + { + "epoch": 1.2922459643556965, + "grad_norm": 2012.7280538671687, + "learning_rate": 0.00027401266267845575, + "loss": 6.8935, + "step": 13849 + }, + { + "epoch": 1.292339274050574, + "grad_norm": 0.5418190592390021, + "learning_rate": 0.0002740084073784523, + "loss": 7.1695, + "step": 13850 + }, + { + "epoch": 1.2924325837454511, + "grad_norm": 1.5697428267622917, + "learning_rate": 0.00027400415176313223, + "loss": 7.3271, + "step": 13851 + }, + { + "epoch": 1.2925258934403285, + "grad_norm": 2.310912594593254, + "learning_rate": 0.00027399989583250635, + "loss": 7.054, + "step": 13852 + }, + { + "epoch": 1.2926192031352057, + "grad_norm": 2.236732728131448, + "learning_rate": 0.00027399563958658543, + "loss": 7.3652, + "step": 13853 + }, + { + "epoch": 1.2927125128300831, + "grad_norm": 17.445452831559148, + "learning_rate": 0.00027399138302538036, + "loss": 6.9968, + "step": 13854 + }, + { + "epoch": 1.2928058225249603, + "grad_norm": 0.5684280760855993, + "learning_rate": 0.00027398712614890193, + "loss": 7.3303, + "step": 13855 + }, + { + "epoch": 1.2928991322198375, + "grad_norm": 1.5932872543671848, + "learning_rate": 0.0002739828689571609, + "loss": 7.2074, + "step": 13856 + }, + { + "epoch": 1.292992441914715, + "grad_norm": 1.3383469378933162, + "learning_rate": 0.00027397861145016825, + "loss": 7.3222, + "step": 13857 + }, + { + "epoch": 1.2930857516095922, + "grad_norm": 0.6126800211542595, + "learning_rate": 0.00027397435362793473, + "loss": 7.4406, + "step": 13858 + }, + { + "epoch": 1.2931790613044696, + "grad_norm": 1.2097232756741567, + "learning_rate": 0.00027397009549047106, + "loss": 7.2228, + "step": 13859 + }, + { + "epoch": 1.2932723709993468, + "grad_norm": 0.6658255764713632, + "learning_rate": 0.00027396583703778824, + "loss": 7.0715, + "step": 13860 + }, + { + "epoch": 1.2933656806942242, + "grad_norm": 1.3965196285353099, + "learning_rate": 0.00027396157826989695, + "loss": 7.466, + "step": 13861 + }, + { + "epoch": 1.2934589903891014, + "grad_norm": 1.0865901691413835, + "learning_rate": 0.00027395731918680813, + "loss": 7.2375, + "step": 13862 + }, + { + "epoch": 1.2935523000839786, + "grad_norm": 0.8920790541413692, + "learning_rate": 0.0002739530597885326, + "loss": 7.1524, + "step": 13863 + }, + { + "epoch": 1.293645609778856, + "grad_norm": 0.6521477022963329, + "learning_rate": 0.0002739488000750811, + "loss": 7.0041, + "step": 13864 + }, + { + "epoch": 1.2937389194737334, + "grad_norm": 1.1111152357418985, + "learning_rate": 0.0002739445400464645, + "loss": 7.0098, + "step": 13865 + }, + { + "epoch": 1.2938322291686106, + "grad_norm": 0.6785886585530292, + "learning_rate": 0.0002739402797026937, + "loss": 7.595, + "step": 13866 + }, + { + "epoch": 1.2939255388634878, + "grad_norm": 0.7229502818876269, + "learning_rate": 0.0002739360190437795, + "loss": 7.378, + "step": 13867 + }, + { + "epoch": 1.2940188485583652, + "grad_norm": 0.5327457808926985, + "learning_rate": 0.00027393175806973263, + "loss": 7.4793, + "step": 13868 + }, + { + "epoch": 1.2941121582532424, + "grad_norm": 0.7231218972261028, + "learning_rate": 0.0002739274967805641, + "loss": 7.3315, + "step": 13869 + }, + { + "epoch": 1.2942054679481199, + "grad_norm": 0.6308497929928863, + "learning_rate": 0.00027392323517628456, + "loss": 7.2297, + "step": 13870 + }, + { + "epoch": 1.294298777642997, + "grad_norm": 0.9151411411994306, + "learning_rate": 0.00027391897325690503, + "loss": 7.3686, + "step": 13871 + }, + { + "epoch": 1.2943920873378745, + "grad_norm": 0.8343743961706979, + "learning_rate": 0.00027391471102243623, + "loss": 7.4329, + "step": 13872 + }, + { + "epoch": 1.2944853970327517, + "grad_norm": 0.5397713262768785, + "learning_rate": 0.000273910448472889, + "loss": 7.2321, + "step": 13873 + }, + { + "epoch": 1.2945787067276289, + "grad_norm": 0.8584081048257027, + "learning_rate": 0.0002739061856082742, + "loss": 7.1417, + "step": 13874 + }, + { + "epoch": 1.2946720164225063, + "grad_norm": 0.6279011095576984, + "learning_rate": 0.00027390192242860274, + "loss": 6.938, + "step": 13875 + }, + { + "epoch": 1.2947653261173837, + "grad_norm": 0.9637379507158361, + "learning_rate": 0.00027389765893388537, + "loss": 7.2546, + "step": 13876 + }, + { + "epoch": 1.294858635812261, + "grad_norm": 0.5771323325784642, + "learning_rate": 0.00027389339512413294, + "loss": 7.0727, + "step": 13877 + }, + { + "epoch": 1.294951945507138, + "grad_norm": 269.31147245157274, + "learning_rate": 0.0002738891309993563, + "loss": 7.2283, + "step": 13878 + }, + { + "epoch": 1.2950452552020155, + "grad_norm": 0.523920947482217, + "learning_rate": 0.0002738848665595663, + "loss": 7.1799, + "step": 13879 + }, + { + "epoch": 1.2951385648968927, + "grad_norm": 287.7768114800966, + "learning_rate": 0.0002738806018047738, + "loss": 7.2815, + "step": 13880 + }, + { + "epoch": 1.2952318745917701, + "grad_norm": 1136.8939210738367, + "learning_rate": 0.0002738763367349896, + "loss": 7.1077, + "step": 13881 + }, + { + "epoch": 1.2953251842866473, + "grad_norm": 0.6789725638130343, + "learning_rate": 0.0002738720713502246, + "loss": 7.0425, + "step": 13882 + }, + { + "epoch": 1.2954184939815248, + "grad_norm": 1.4791585697260368, + "learning_rate": 0.0002738678056504896, + "loss": 7.1976, + "step": 13883 + }, + { + "epoch": 1.295511803676402, + "grad_norm": 1.9233339445315711, + "learning_rate": 0.00027386353963579547, + "loss": 7.5003, + "step": 13884 + }, + { + "epoch": 1.2956051133712791, + "grad_norm": 0.6567829131778083, + "learning_rate": 0.00027385927330615304, + "loss": 7.2515, + "step": 13885 + }, + { + "epoch": 1.2956984230661566, + "grad_norm": 677.7378990427063, + "learning_rate": 0.0002738550066615732, + "loss": 7.2526, + "step": 13886 + }, + { + "epoch": 1.295791732761034, + "grad_norm": 0.9691885453046876, + "learning_rate": 0.0002738507397020667, + "loss": 7.5359, + "step": 13887 + }, + { + "epoch": 1.2958850424559112, + "grad_norm": 1.4761187189575689, + "learning_rate": 0.0002738464724276445, + "loss": 7.2283, + "step": 13888 + }, + { + "epoch": 1.2959783521507884, + "grad_norm": 10247.469818170839, + "learning_rate": 0.00027384220483831737, + "loss": 6.9703, + "step": 13889 + }, + { + "epoch": 1.2960716618456658, + "grad_norm": 2612.0968906994494, + "learning_rate": 0.00027383793693409625, + "loss": 7.4627, + "step": 13890 + }, + { + "epoch": 1.296164971540543, + "grad_norm": 2.2825144958097883, + "learning_rate": 0.00027383366871499193, + "loss": 7.1591, + "step": 13891 + }, + { + "epoch": 1.2962582812354204, + "grad_norm": 2.550698743235504, + "learning_rate": 0.00027382940018101523, + "loss": 7.6315, + "step": 13892 + }, + { + "epoch": 1.2963515909302976, + "grad_norm": 0.7619544171238362, + "learning_rate": 0.00027382513133217707, + "loss": 7.295, + "step": 13893 + }, + { + "epoch": 1.296444900625175, + "grad_norm": 1.2551985889939785, + "learning_rate": 0.00027382086216848827, + "loss": 7.2368, + "step": 13894 + }, + { + "epoch": 1.2965382103200522, + "grad_norm": 2.0526721418853136, + "learning_rate": 0.00027381659268995974, + "loss": 7.2988, + "step": 13895 + }, + { + "epoch": 1.2966315200149294, + "grad_norm": 30142.58717262528, + "learning_rate": 0.00027381232289660227, + "loss": 7.266, + "step": 13896 + }, + { + "epoch": 1.2967248297098068, + "grad_norm": 1.9621765281277415, + "learning_rate": 0.00027380805278842674, + "loss": 7.5611, + "step": 13897 + }, + { + "epoch": 1.2968181394046843, + "grad_norm": 1.9188625243037887, + "learning_rate": 0.000273803782365444, + "loss": 7.6284, + "step": 13898 + }, + { + "epoch": 1.2969114490995615, + "grad_norm": 1.967685431639745, + "learning_rate": 0.0002737995116276649, + "loss": 7.6792, + "step": 13899 + }, + { + "epoch": 1.2970047587944387, + "grad_norm": 46.08018031430607, + "learning_rate": 0.00027379524057510033, + "loss": 7.9212, + "step": 13900 + }, + { + "epoch": 1.297098068489316, + "grad_norm": 3.204447531275735, + "learning_rate": 0.00027379096920776113, + "loss": 8.0536, + "step": 13901 + }, + { + "epoch": 1.2971913781841933, + "grad_norm": 3.951216587295475, + "learning_rate": 0.00027378669752565815, + "loss": 8.027, + "step": 13902 + }, + { + "epoch": 1.2972846878790707, + "grad_norm": 7.192780460018762, + "learning_rate": 0.00027378242552880233, + "loss": 7.9573, + "step": 13903 + }, + { + "epoch": 1.297377997573948, + "grad_norm": 3.8940717239107947, + "learning_rate": 0.00027377815321720435, + "loss": 8.4535, + "step": 13904 + }, + { + "epoch": 1.2974713072688253, + "grad_norm": 32999217.674910408, + "learning_rate": 0.0002737738805908753, + "loss": 8.0819, + "step": 13905 + }, + { + "epoch": 1.2975646169637025, + "grad_norm": 1390936.0282712884, + "learning_rate": 0.0002737696076498259, + "loss": 7.8817, + "step": 13906 + }, + { + "epoch": 1.2976579266585797, + "grad_norm": 2.394141132866095, + "learning_rate": 0.00027376533439406705, + "loss": 8.0507, + "step": 13907 + }, + { + "epoch": 1.2977512363534571, + "grad_norm": 2.4589365896066315, + "learning_rate": 0.00027376106082360963, + "loss": 7.8907, + "step": 13908 + }, + { + "epoch": 1.2978445460483345, + "grad_norm": 2.839779411948985, + "learning_rate": 0.0002737567869384645, + "loss": 7.8391, + "step": 13909 + }, + { + "epoch": 1.2979378557432117, + "grad_norm": 2.5173552177498455, + "learning_rate": 0.0002737525127386425, + "loss": 7.8372, + "step": 13910 + }, + { + "epoch": 1.298031165438089, + "grad_norm": 1647415.614135875, + "learning_rate": 0.0002737482382241545, + "loss": 7.8095, + "step": 13911 + }, + { + "epoch": 1.2981244751329664, + "grad_norm": 1.5357210417212988, + "learning_rate": 0.0002737439633950114, + "loss": 8.226, + "step": 13912 + }, + { + "epoch": 1.2982177848278436, + "grad_norm": 3.433921181388235, + "learning_rate": 0.0002737396882512241, + "loss": 7.3608, + "step": 13913 + }, + { + "epoch": 1.298311094522721, + "grad_norm": 2.487230080236592, + "learning_rate": 0.00027373541279280335, + "loss": 7.5128, + "step": 13914 + }, + { + "epoch": 1.2984044042175982, + "grad_norm": 6327176559057.495, + "learning_rate": 0.00027373113701976016, + "loss": 7.3594, + "step": 13915 + }, + { + "epoch": 1.2984977139124756, + "grad_norm": 21.497794232880302, + "learning_rate": 0.0002737268609321053, + "loss": 8.1908, + "step": 13916 + }, + { + "epoch": 1.2985910236073528, + "grad_norm": 1.5947213998218275e+17, + "learning_rate": 0.00027372258452984977, + "loss": 7.8587, + "step": 13917 + }, + { + "epoch": 1.29868433330223, + "grad_norm": 35.539541877676236, + "learning_rate": 0.00027371830781300425, + "loss": 8.4119, + "step": 13918 + }, + { + "epoch": 1.2987776429971074, + "grad_norm": 48.62664454952073, + "learning_rate": 0.00027371403078157975, + "loss": 7.7594, + "step": 13919 + }, + { + "epoch": 1.2988709526919848, + "grad_norm": 1.04056005985663e+20, + "learning_rate": 0.0002737097534355871, + "loss": 7.9701, + "step": 13920 + }, + { + "epoch": 1.298964262386862, + "grad_norm": 141.34567646146147, + "learning_rate": 0.00027370547577503727, + "loss": 8.5729, + "step": 13921 + }, + { + "epoch": 1.2990575720817392, + "grad_norm": 19.982966682394476, + "learning_rate": 0.000273701197799941, + "loss": 8.1997, + "step": 13922 + }, + { + "epoch": 1.2991508817766166, + "grad_norm": 10.054543267796872, + "learning_rate": 0.00027369691951030923, + "loss": 7.301, + "step": 13923 + }, + { + "epoch": 1.2992441914714938, + "grad_norm": 14.009113250931104, + "learning_rate": 0.0002736926409061528, + "loss": 7.7951, + "step": 13924 + }, + { + "epoch": 1.2993375011663713, + "grad_norm": 5.329323503435961, + "learning_rate": 0.0002736883619874827, + "loss": 7.6421, + "step": 13925 + }, + { + "epoch": 1.2994308108612485, + "grad_norm": 15252787776.740665, + "learning_rate": 0.0002736840827543097, + "loss": 7.4596, + "step": 13926 + }, + { + "epoch": 1.2995241205561259, + "grad_norm": 13.593673651091128, + "learning_rate": 0.00027367980320664473, + "loss": 7.5257, + "step": 13927 + }, + { + "epoch": 1.299617430251003, + "grad_norm": 13.781290245090347, + "learning_rate": 0.0002736755233444986, + "loss": 7.8276, + "step": 13928 + }, + { + "epoch": 1.2997107399458803, + "grad_norm": 1.7332197330818706, + "learning_rate": 0.00027367124316788233, + "loss": 7.3996, + "step": 13929 + }, + { + "epoch": 1.2998040496407577, + "grad_norm": 7.0955409174591155, + "learning_rate": 0.0002736669626768067, + "loss": 7.6685, + "step": 13930 + }, + { + "epoch": 1.299897359335635, + "grad_norm": 11176695.24917679, + "learning_rate": 0.0002736626818712826, + "loss": 7.9502, + "step": 13931 + }, + { + "epoch": 1.2999906690305123, + "grad_norm": 23.89561527756385, + "learning_rate": 0.00027365840075132094, + "loss": 7.9468, + "step": 13932 + }, + { + "epoch": 1.3000839787253895, + "grad_norm": 10335603.346187403, + "learning_rate": 0.00027365411931693255, + "loss": 7.9622, + "step": 13933 + }, + { + "epoch": 1.300177288420267, + "grad_norm": 1639059.3627934973, + "learning_rate": 0.00027364983756812846, + "loss": 8.0583, + "step": 13934 + }, + { + "epoch": 1.3002705981151441, + "grad_norm": 8.591898665468468, + "learning_rate": 0.0002736455555049194, + "loss": 7.7253, + "step": 13935 + }, + { + "epoch": 1.3003639078100215, + "grad_norm": 4.672372163270257, + "learning_rate": 0.0002736412731273163, + "loss": 7.6889, + "step": 13936 + }, + { + "epoch": 1.3004572175048987, + "grad_norm": 81.23647920266346, + "learning_rate": 0.0002736369904353301, + "loss": 7.8101, + "step": 13937 + }, + { + "epoch": 1.3005505271997762, + "grad_norm": 2.106381107964375, + "learning_rate": 0.0002736327074289717, + "loss": 7.9438, + "step": 13938 + }, + { + "epoch": 1.3006438368946533, + "grad_norm": 49.9464446916306, + "learning_rate": 0.00027362842410825187, + "loss": 7.7205, + "step": 13939 + }, + { + "epoch": 1.3007371465895305, + "grad_norm": 3.512166961271646, + "learning_rate": 0.00027362414047318157, + "loss": 7.6125, + "step": 13940 + }, + { + "epoch": 1.300830456284408, + "grad_norm": 20480913.433027405, + "learning_rate": 0.0002736198565237718, + "loss": 7.575, + "step": 13941 + }, + { + "epoch": 1.3009237659792854, + "grad_norm": 10.92921122926707, + "learning_rate": 0.00027361557226003325, + "loss": 7.524, + "step": 13942 + }, + { + "epoch": 1.3010170756741626, + "grad_norm": 3.9272046091653574, + "learning_rate": 0.000273611287681977, + "loss": 7.4069, + "step": 13943 + }, + { + "epoch": 1.3011103853690398, + "grad_norm": 2.943936852195291, + "learning_rate": 0.0002736070027896138, + "loss": 7.4771, + "step": 13944 + }, + { + "epoch": 1.3012036950639172, + "grad_norm": 8.402812470988042, + "learning_rate": 0.0002736027175829546, + "loss": 7.2919, + "step": 13945 + }, + { + "epoch": 1.3012970047587944, + "grad_norm": 2.0102838609023825, + "learning_rate": 0.00027359843206201036, + "loss": 7.462, + "step": 13946 + }, + { + "epoch": 1.3013903144536718, + "grad_norm": 3.50851673242046, + "learning_rate": 0.00027359414622679185, + "loss": 7.3416, + "step": 13947 + }, + { + "epoch": 1.301483624148549, + "grad_norm": 9.92444413557254, + "learning_rate": 0.0002735898600773101, + "loss": 7.4716, + "step": 13948 + }, + { + "epoch": 1.3015769338434264, + "grad_norm": 18.562301026248456, + "learning_rate": 0.0002735855736135759, + "loss": 7.5267, + "step": 13949 + }, + { + "epoch": 1.3016702435383036, + "grad_norm": 39182739.48974481, + "learning_rate": 0.00027358128683560026, + "loss": 7.3616, + "step": 13950 + }, + { + "epoch": 1.3017635532331808, + "grad_norm": 59.396114319264946, + "learning_rate": 0.00027357699974339396, + "loss": 7.9322, + "step": 13951 + }, + { + "epoch": 1.3018568629280582, + "grad_norm": 8.271395390150944, + "learning_rate": 0.000273572712336968, + "loss": 7.0411, + "step": 13952 + }, + { + "epoch": 1.3019501726229354, + "grad_norm": 3.1101704908970564, + "learning_rate": 0.00027356842461633316, + "loss": 7.5645, + "step": 13953 + }, + { + "epoch": 1.3020434823178129, + "grad_norm": 15.202485186582106, + "learning_rate": 0.0002735641365815005, + "loss": 7.1769, + "step": 13954 + }, + { + "epoch": 1.30213679201269, + "grad_norm": 23.05828768118742, + "learning_rate": 0.0002735598482324808, + "loss": 7.383, + "step": 13955 + }, + { + "epoch": 1.3022301017075675, + "grad_norm": 140737797.48162672, + "learning_rate": 0.00027355555956928504, + "loss": 7.3412, + "step": 13956 + }, + { + "epoch": 1.3023234114024447, + "grad_norm": 25.294963624862884, + "learning_rate": 0.00027355127059192405, + "loss": 7.5918, + "step": 13957 + }, + { + "epoch": 1.302416721097322, + "grad_norm": 14.764387464599071, + "learning_rate": 0.00027354698130040877, + "loss": 7.5334, + "step": 13958 + }, + { + "epoch": 1.3025100307921993, + "grad_norm": 2.1882244990609587, + "learning_rate": 0.0002735426916947502, + "loss": 7.4536, + "step": 13959 + }, + { + "epoch": 1.3026033404870767, + "grad_norm": 11.04040413871395, + "learning_rate": 0.00027353840177495904, + "loss": 7.5219, + "step": 13960 + }, + { + "epoch": 1.302696650181954, + "grad_norm": 10.164002522412096, + "learning_rate": 0.0002735341115410464, + "loss": 7.3905, + "step": 13961 + }, + { + "epoch": 1.302789959876831, + "grad_norm": 32.77850240558585, + "learning_rate": 0.00027352982099302313, + "loss": 7.218, + "step": 13962 + }, + { + "epoch": 1.3028832695717085, + "grad_norm": 40801914.03370397, + "learning_rate": 0.00027352553013090006, + "loss": 7.5462, + "step": 13963 + }, + { + "epoch": 1.3029765792665857, + "grad_norm": 28.47618491734742, + "learning_rate": 0.00027352123895468816, + "loss": 7.5606, + "step": 13964 + }, + { + "epoch": 1.3030698889614631, + "grad_norm": 5850879.16915251, + "learning_rate": 0.00027351694746439835, + "loss": 7.6598, + "step": 13965 + }, + { + "epoch": 1.3031631986563403, + "grad_norm": 6.199171359016265, + "learning_rate": 0.0002735126556600415, + "loss": 7.7996, + "step": 13966 + }, + { + "epoch": 1.3032565083512178, + "grad_norm": 406106948.7801449, + "learning_rate": 0.0002735083635416286, + "loss": 7.6713, + "step": 13967 + }, + { + "epoch": 1.303349818046095, + "grad_norm": 60107586.27139578, + "learning_rate": 0.0002735040711091705, + "loss": 7.4972, + "step": 13968 + }, + { + "epoch": 1.3034431277409722, + "grad_norm": 11.126521324590254, + "learning_rate": 0.00027349977836267815, + "loss": 7.3616, + "step": 13969 + }, + { + "epoch": 1.3035364374358496, + "grad_norm": 11.681006451436549, + "learning_rate": 0.00027349548530216245, + "loss": 7.6027, + "step": 13970 + }, + { + "epoch": 1.303629747130727, + "grad_norm": 25.64929802709814, + "learning_rate": 0.00027349119192763427, + "loss": 7.2274, + "step": 13971 + }, + { + "epoch": 1.3037230568256042, + "grad_norm": 288105514.77055514, + "learning_rate": 0.00027348689823910465, + "loss": 7.415, + "step": 13972 + }, + { + "epoch": 1.3038163665204814, + "grad_norm": 9.3229801538386, + "learning_rate": 0.00027348260423658435, + "loss": 7.2055, + "step": 13973 + }, + { + "epoch": 1.3039096762153588, + "grad_norm": 9.703283586414372, + "learning_rate": 0.0002734783099200844, + "loss": 7.4375, + "step": 13974 + }, + { + "epoch": 1.304002985910236, + "grad_norm": 12.502223602401168, + "learning_rate": 0.0002734740152896156, + "loss": 7.4529, + "step": 13975 + }, + { + "epoch": 1.3040962956051134, + "grad_norm": 4.899781093979017, + "learning_rate": 0.0002734697203451891, + "loss": 7.4694, + "step": 13976 + }, + { + "epoch": 1.3041896052999906, + "grad_norm": 86738064.73066266, + "learning_rate": 0.0002734654250868156, + "loss": 7.436, + "step": 13977 + }, + { + "epoch": 1.304282914994868, + "grad_norm": 6.505409144007154, + "learning_rate": 0.00027346112951450607, + "loss": 7.9205, + "step": 13978 + }, + { + "epoch": 1.3043762246897452, + "grad_norm": 4.666194219406075, + "learning_rate": 0.00027345683362827155, + "loss": 7.6115, + "step": 13979 + }, + { + "epoch": 1.3044695343846224, + "grad_norm": 3.197392988316702, + "learning_rate": 0.00027345253742812277, + "loss": 7.5536, + "step": 13980 + }, + { + "epoch": 1.3045628440794999, + "grad_norm": 5.599433234615926, + "learning_rate": 0.00027344824091407084, + "loss": 7.7728, + "step": 13981 + }, + { + "epoch": 1.3046561537743773, + "grad_norm": 7.62068241275796, + "learning_rate": 0.00027344394408612655, + "loss": 7.682, + "step": 13982 + }, + { + "epoch": 1.3047494634692545, + "grad_norm": 38.08244994074797, + "learning_rate": 0.0002734396469443009, + "loss": 7.3651, + "step": 13983 + }, + { + "epoch": 1.3048427731641317, + "grad_norm": 4.573705971859537, + "learning_rate": 0.00027343534948860476, + "loss": 7.3851, + "step": 13984 + }, + { + "epoch": 1.304936082859009, + "grad_norm": 173520139.6461901, + "learning_rate": 0.0002734310517190491, + "loss": 7.4187, + "step": 13985 + }, + { + "epoch": 1.3050293925538863, + "grad_norm": 3.1562637820325117, + "learning_rate": 0.0002734267536356449, + "loss": 7.3075, + "step": 13986 + }, + { + "epoch": 1.3051227022487637, + "grad_norm": 5.402904273231936, + "learning_rate": 0.00027342245523840296, + "loss": 7.6354, + "step": 13987 + }, + { + "epoch": 1.305216011943641, + "grad_norm": 11.391106635184272, + "learning_rate": 0.00027341815652733427, + "loss": 7.466, + "step": 13988 + }, + { + "epoch": 1.3053093216385183, + "grad_norm": 41.15443367552767, + "learning_rate": 0.00027341385750244976, + "loss": 7.7823, + "step": 13989 + }, + { + "epoch": 1.3054026313333955, + "grad_norm": 25.04787866802731, + "learning_rate": 0.00027340955816376045, + "loss": 7.5562, + "step": 13990 + }, + { + "epoch": 1.3054959410282727, + "grad_norm": 7.399980177708553, + "learning_rate": 0.0002734052585112771, + "loss": 7.0779, + "step": 13991 + }, + { + "epoch": 1.3055892507231501, + "grad_norm": 2.026561450438201, + "learning_rate": 0.0002734009585450108, + "loss": 7.3867, + "step": 13992 + }, + { + "epoch": 1.3056825604180275, + "grad_norm": 3.165840017915457, + "learning_rate": 0.0002733966582649724, + "loss": 7.4139, + "step": 13993 + }, + { + "epoch": 1.3057758701129047, + "grad_norm": 89.8633172826018, + "learning_rate": 0.00027339235767117286, + "loss": 7.59, + "step": 13994 + }, + { + "epoch": 1.305869179807782, + "grad_norm": 5.339070232815387, + "learning_rate": 0.0002733880567636231, + "loss": 7.1617, + "step": 13995 + }, + { + "epoch": 1.3059624895026594, + "grad_norm": 73270045.61655864, + "learning_rate": 0.000273383755542334, + "loss": 7.5893, + "step": 13996 + }, + { + "epoch": 1.3060557991975366, + "grad_norm": 1.8436581738421023, + "learning_rate": 0.00027337945400731663, + "loss": 7.5355, + "step": 13997 + }, + { + "epoch": 1.306149108892414, + "grad_norm": 1860637023.3323803, + "learning_rate": 0.0002733751521585818, + "loss": 7.5674, + "step": 13998 + }, + { + "epoch": 1.3062424185872912, + "grad_norm": 5.430996033647926, + "learning_rate": 0.0002733708499961406, + "loss": 7.2669, + "step": 13999 + }, + { + "epoch": 1.3063357282821686, + "grad_norm": 2.3632925015330137, + "learning_rate": 0.00027336654752000377, + "loss": 7.2739, + "step": 14000 + }, + { + "epoch": 1.3064290379770458, + "grad_norm": 5.6215884258044255, + "learning_rate": 0.00027336224473018244, + "loss": 7.528, + "step": 14001 + }, + { + "epoch": 1.306522347671923, + "grad_norm": 5.513550472909965, + "learning_rate": 0.0002733579416266874, + "loss": 7.5669, + "step": 14002 + }, + { + "epoch": 1.3066156573668004, + "grad_norm": 33.62117543071894, + "learning_rate": 0.0002733536382095297, + "loss": 7.4057, + "step": 14003 + }, + { + "epoch": 1.3067089670616778, + "grad_norm": 18.11041021789002, + "learning_rate": 0.00027334933447872015, + "loss": 7.4077, + "step": 14004 + }, + { + "epoch": 1.306802276756555, + "grad_norm": 3.798028669161409, + "learning_rate": 0.00027334503043426983, + "loss": 7.5696, + "step": 14005 + }, + { + "epoch": 1.3068955864514322, + "grad_norm": 311165740.022232, + "learning_rate": 0.0002733407260761897, + "loss": 7.7627, + "step": 14006 + }, + { + "epoch": 1.3069888961463096, + "grad_norm": 9.54174918569499, + "learning_rate": 0.00027333642140449057, + "loss": 7.6746, + "step": 14007 + }, + { + "epoch": 1.3070822058411868, + "grad_norm": 25.28068946728804, + "learning_rate": 0.0002733321164191835, + "loss": 7.3753, + "step": 14008 + }, + { + "epoch": 1.3071755155360643, + "grad_norm": 5.496964857454618, + "learning_rate": 0.00027332781112027936, + "loss": 7.457, + "step": 14009 + }, + { + "epoch": 1.3072688252309415, + "grad_norm": 44.430522951865136, + "learning_rate": 0.0002733235055077891, + "loss": 7.8261, + "step": 14010 + }, + { + "epoch": 1.3073621349258189, + "grad_norm": 5.408249142166034, + "learning_rate": 0.00027331919958172376, + "loss": 7.3533, + "step": 14011 + }, + { + "epoch": 1.307455444620696, + "grad_norm": 1.796723026813122, + "learning_rate": 0.00027331489334209423, + "loss": 7.4927, + "step": 14012 + }, + { + "epoch": 1.3075487543155733, + "grad_norm": 9.877296941525412, + "learning_rate": 0.0002733105867889114, + "loss": 7.2035, + "step": 14013 + }, + { + "epoch": 1.3076420640104507, + "grad_norm": 16.333473656283356, + "learning_rate": 0.0002733062799221863, + "loss": 7.1737, + "step": 14014 + }, + { + "epoch": 1.307735373705328, + "grad_norm": 738869510.7050507, + "learning_rate": 0.00027330197274192986, + "loss": 7.3207, + "step": 14015 + }, + { + "epoch": 1.3078286834002053, + "grad_norm": 239609931.83864057, + "learning_rate": 0.000273297665248153, + "loss": 7.3617, + "step": 14016 + }, + { + "epoch": 1.3079219930950825, + "grad_norm": 7.347806893291803, + "learning_rate": 0.0002732933574408667, + "loss": 7.1884, + "step": 14017 + }, + { + "epoch": 1.30801530278996, + "grad_norm": 8.97985022916208, + "learning_rate": 0.00027328904932008195, + "loss": 7.4856, + "step": 14018 + }, + { + "epoch": 1.3081086124848371, + "grad_norm": 5.0360811314244796, + "learning_rate": 0.0002732847408858097, + "loss": 7.3161, + "step": 14019 + }, + { + "epoch": 1.3082019221797145, + "grad_norm": 12.494453368132172, + "learning_rate": 0.0002732804321380608, + "loss": 7.2781, + "step": 14020 + }, + { + "epoch": 1.3082952318745917, + "grad_norm": 3.956716040011058, + "learning_rate": 0.0002732761230768463, + "loss": 7.4018, + "step": 14021 + }, + { + "epoch": 1.3083885415694692, + "grad_norm": 241374703.89860103, + "learning_rate": 0.00027327181370217715, + "loss": 7.4461, + "step": 14022 + }, + { + "epoch": 1.3084818512643464, + "grad_norm": 2.9358915060023345, + "learning_rate": 0.00027326750401406426, + "loss": 7.2705, + "step": 14023 + }, + { + "epoch": 1.3085751609592235, + "grad_norm": 28.656127665371532, + "learning_rate": 0.00027326319401251866, + "loss": 7.9754, + "step": 14024 + }, + { + "epoch": 1.308668470654101, + "grad_norm": 4.114361417856466, + "learning_rate": 0.0002732588836975513, + "loss": 7.7426, + "step": 14025 + }, + { + "epoch": 1.3087617803489784, + "grad_norm": 6.482810465460488, + "learning_rate": 0.00027325457306917307, + "loss": 7.4353, + "step": 14026 + }, + { + "epoch": 1.3088550900438556, + "grad_norm": 17.70440838123489, + "learning_rate": 0.00027325026212739493, + "loss": 7.347, + "step": 14027 + }, + { + "epoch": 1.3089483997387328, + "grad_norm": 1579204.5055499005, + "learning_rate": 0.00027324595087222794, + "loss": 7.1149, + "step": 14028 + }, + { + "epoch": 1.3090417094336102, + "grad_norm": 83792822.1995651, + "learning_rate": 0.000273241639303683, + "loss": 7.1115, + "step": 14029 + }, + { + "epoch": 1.3091350191284874, + "grad_norm": 74.76541912464779, + "learning_rate": 0.0002732373274217711, + "loss": 7.6077, + "step": 14030 + }, + { + "epoch": 1.3092283288233648, + "grad_norm": 50.171038600874546, + "learning_rate": 0.0002732330152265031, + "loss": 7.0798, + "step": 14031 + }, + { + "epoch": 1.309321638518242, + "grad_norm": 1.5889669929477768, + "learning_rate": 0.00027322870271789015, + "loss": 7.8659, + "step": 14032 + }, + { + "epoch": 1.3094149482131194, + "grad_norm": 4.394592583341649, + "learning_rate": 0.00027322438989594303, + "loss": 7.5674, + "step": 14033 + }, + { + "epoch": 1.3095082579079966, + "grad_norm": 4.339083148740127, + "learning_rate": 0.0002732200767606728, + "loss": 7.458, + "step": 14034 + }, + { + "epoch": 1.3096015676028738, + "grad_norm": 38.63616611442617, + "learning_rate": 0.00027321576331209045, + "loss": 7.2912, + "step": 14035 + }, + { + "epoch": 1.3096948772977512, + "grad_norm": 23.831360325697585, + "learning_rate": 0.0002732114495502069, + "loss": 7.4072, + "step": 14036 + }, + { + "epoch": 1.3097881869926287, + "grad_norm": 17.284511632243568, + "learning_rate": 0.0002732071354750331, + "loss": 7.3185, + "step": 14037 + }, + { + "epoch": 1.3098814966875059, + "grad_norm": 69.44892460806894, + "learning_rate": 0.0002732028210865801, + "loss": 7.5906, + "step": 14038 + }, + { + "epoch": 1.309974806382383, + "grad_norm": 1.9773158176543573, + "learning_rate": 0.0002731985063848588, + "loss": 7.1586, + "step": 14039 + }, + { + "epoch": 1.3100681160772605, + "grad_norm": 1615041819.6775692, + "learning_rate": 0.00027319419136988016, + "loss": 7.4872, + "step": 14040 + }, + { + "epoch": 1.3101614257721377, + "grad_norm": 7.759357544116935, + "learning_rate": 0.00027318987604165525, + "loss": 7.2062, + "step": 14041 + }, + { + "epoch": 1.310254735467015, + "grad_norm": 1.4605672024661545, + "learning_rate": 0.00027318556040019495, + "loss": 7.3828, + "step": 14042 + }, + { + "epoch": 1.3103480451618923, + "grad_norm": 25.056633479167598, + "learning_rate": 0.0002731812444455102, + "loss": 7.2273, + "step": 14043 + }, + { + "epoch": 1.3104413548567697, + "grad_norm": 75.12834322372727, + "learning_rate": 0.00027317692817761216, + "loss": 7.3561, + "step": 14044 + }, + { + "epoch": 1.310534664551647, + "grad_norm": 4.8531149296125955, + "learning_rate": 0.0002731726115965116, + "loss": 7.6707, + "step": 14045 + }, + { + "epoch": 1.310627974246524, + "grad_norm": 102.89252828157227, + "learning_rate": 0.00027316829470221956, + "loss": 7.658, + "step": 14046 + }, + { + "epoch": 1.3107212839414015, + "grad_norm": 15608548.742789973, + "learning_rate": 0.0002731639774947471, + "loss": 7.2888, + "step": 14047 + }, + { + "epoch": 1.310814593636279, + "grad_norm": 2.6330056756276394, + "learning_rate": 0.000273159659974105, + "loss": 7.1582, + "step": 14048 + }, + { + "epoch": 1.3109079033311561, + "grad_norm": 888956309.8008814, + "learning_rate": 0.0002731553421403045, + "loss": 7.4481, + "step": 14049 + }, + { + "epoch": 1.3110012130260333, + "grad_norm": 2.9274351283444457, + "learning_rate": 0.00027315102399335636, + "loss": 7.3479, + "step": 14050 + }, + { + "epoch": 1.3110945227209108, + "grad_norm": 3.851743178403459, + "learning_rate": 0.0002731467055332717, + "loss": 7.2629, + "step": 14051 + }, + { + "epoch": 1.311187832415788, + "grad_norm": 46.25831698106332, + "learning_rate": 0.0002731423867600614, + "loss": 7.6457, + "step": 14052 + }, + { + "epoch": 1.3112811421106654, + "grad_norm": 4.252795164632862, + "learning_rate": 0.00027313806767373654, + "loss": 7.553, + "step": 14053 + }, + { + "epoch": 1.3113744518055426, + "grad_norm": 7.462694190035827, + "learning_rate": 0.000273133748274308, + "loss": 7.8426, + "step": 14054 + }, + { + "epoch": 1.31146776150042, + "grad_norm": 12.776413096515277, + "learning_rate": 0.00027312942856178685, + "loss": 7.6868, + "step": 14055 + }, + { + "epoch": 1.3115610711952972, + "grad_norm": 8698961910.518835, + "learning_rate": 0.0002731251085361841, + "loss": 7.3255, + "step": 14056 + }, + { + "epoch": 1.3116543808901744, + "grad_norm": 10.880102778209698, + "learning_rate": 0.00027312078819751053, + "loss": 7.7451, + "step": 14057 + }, + { + "epoch": 1.3117476905850518, + "grad_norm": 8614700931.147144, + "learning_rate": 0.0002731164675457774, + "loss": 7.4194, + "step": 14058 + }, + { + "epoch": 1.311841000279929, + "grad_norm": 3765173654.7791553, + "learning_rate": 0.00027311214658099544, + "loss": 7.2287, + "step": 14059 + }, + { + "epoch": 1.3119343099748064, + "grad_norm": 32.04949751722749, + "learning_rate": 0.00027310782530317587, + "loss": 7.2182, + "step": 14060 + }, + { + "epoch": 1.3120276196696836, + "grad_norm": 50166064091.050125, + "learning_rate": 0.00027310350371232946, + "loss": 7.5471, + "step": 14061 + }, + { + "epoch": 1.312120929364561, + "grad_norm": 1.6803244080229789, + "learning_rate": 0.0002730991818084674, + "loss": 7.469, + "step": 14062 + }, + { + "epoch": 1.3122142390594382, + "grad_norm": 6.8395549707997505, + "learning_rate": 0.0002730948595916005, + "loss": 7.3577, + "step": 14063 + }, + { + "epoch": 1.3123075487543157, + "grad_norm": 27.426071579294085, + "learning_rate": 0.0002730905370617399, + "loss": 7.5568, + "step": 14064 + }, + { + "epoch": 1.3124008584491929, + "grad_norm": 2.3819995847102557, + "learning_rate": 0.00027308621421889655, + "loss": 7.4232, + "step": 14065 + }, + { + "epoch": 1.3124941681440703, + "grad_norm": 3.6264883338909684, + "learning_rate": 0.00027308189106308134, + "loss": 7.4585, + "step": 14066 + }, + { + "epoch": 1.3125874778389475, + "grad_norm": 3.31312297819218, + "learning_rate": 0.00027307756759430537, + "loss": 7.4246, + "step": 14067 + }, + { + "epoch": 1.3126807875338247, + "grad_norm": 6.183414639660565, + "learning_rate": 0.00027307324381257964, + "loss": 7.3918, + "step": 14068 + }, + { + "epoch": 1.312774097228702, + "grad_norm": 6.25292664928929, + "learning_rate": 0.0002730689197179151, + "loss": 7.5036, + "step": 14069 + }, + { + "epoch": 1.3128674069235793, + "grad_norm": 5.392791090421399, + "learning_rate": 0.0002730645953103227, + "loss": 7.7801, + "step": 14070 + }, + { + "epoch": 1.3129607166184567, + "grad_norm": 3.3095954895590056, + "learning_rate": 0.0002730602705898135, + "loss": 7.5002, + "step": 14071 + }, + { + "epoch": 1.313054026313334, + "grad_norm": 2.561903273750669, + "learning_rate": 0.0002730559455563985, + "loss": 7.7931, + "step": 14072 + }, + { + "epoch": 1.3131473360082113, + "grad_norm": 11556213784.02851, + "learning_rate": 0.0002730516202100887, + "loss": 7.2777, + "step": 14073 + }, + { + "epoch": 1.3132406457030885, + "grad_norm": 2.4575960242039705, + "learning_rate": 0.00027304729455089507, + "loss": 7.5389, + "step": 14074 + }, + { + "epoch": 1.3133339553979657, + "grad_norm": 6.503001504330484, + "learning_rate": 0.0002730429685788286, + "loss": 7.3759, + "step": 14075 + }, + { + "epoch": 1.3134272650928431, + "grad_norm": 18668745344.602108, + "learning_rate": 0.00027303864229390033, + "loss": 7.3641, + "step": 14076 + }, + { + "epoch": 1.3135205747877206, + "grad_norm": 394172748946.83026, + "learning_rate": 0.0002730343156961213, + "loss": 7.3287, + "step": 14077 + }, + { + "epoch": 1.3136138844825977, + "grad_norm": 9.054699481303343, + "learning_rate": 0.00027302998878550236, + "loss": 7.0849, + "step": 14078 + }, + { + "epoch": 1.313707194177475, + "grad_norm": 20397340538.928497, + "learning_rate": 0.0002730256615620546, + "loss": 7.4237, + "step": 14079 + }, + { + "epoch": 1.3138005038723524, + "grad_norm": 23.539196553789935, + "learning_rate": 0.00027302133402578904, + "loss": 7.6783, + "step": 14080 + }, + { + "epoch": 1.3138938135672296, + "grad_norm": 25.297642878684563, + "learning_rate": 0.0002730170061767167, + "loss": 7.4941, + "step": 14081 + }, + { + "epoch": 1.313987123262107, + "grad_norm": 28.985150035156156, + "learning_rate": 0.0002730126780148486, + "loss": 7.4671, + "step": 14082 + }, + { + "epoch": 1.3140804329569842, + "grad_norm": 26.370173007693904, + "learning_rate": 0.0002730083495401956, + "loss": 7.7158, + "step": 14083 + }, + { + "epoch": 1.3141737426518616, + "grad_norm": 3.8424567041966404, + "learning_rate": 0.00027300402075276884, + "loss": 7.4309, + "step": 14084 + }, + { + "epoch": 1.3142670523467388, + "grad_norm": 33.331710258380646, + "learning_rate": 0.0002729996916525793, + "loss": 7.7376, + "step": 14085 + }, + { + "epoch": 1.314360362041616, + "grad_norm": 74.52287750891351, + "learning_rate": 0.0002729953622396379, + "loss": 7.3939, + "step": 14086 + }, + { + "epoch": 1.3144536717364934, + "grad_norm": 2.309937997199816, + "learning_rate": 0.0002729910325139558, + "loss": 7.7023, + "step": 14087 + }, + { + "epoch": 1.3145469814313708, + "grad_norm": 5.977600084117167, + "learning_rate": 0.000272986702475544, + "loss": 7.3416, + "step": 14088 + }, + { + "epoch": 1.314640291126248, + "grad_norm": 13.478957480819446, + "learning_rate": 0.0002729823721244134, + "loss": 7.3071, + "step": 14089 + }, + { + "epoch": 1.3147336008211252, + "grad_norm": 18.801905488937955, + "learning_rate": 0.000272978041460575, + "loss": 7.4144, + "step": 14090 + }, + { + "epoch": 1.3148269105160026, + "grad_norm": 20.2727106285855, + "learning_rate": 0.0002729737104840399, + "loss": 7.5658, + "step": 14091 + }, + { + "epoch": 1.3149202202108798, + "grad_norm": 266.14217718141737, + "learning_rate": 0.00027296937919481907, + "loss": 7.3195, + "step": 14092 + }, + { + "epoch": 1.3150135299057573, + "grad_norm": 3.4749905957586993, + "learning_rate": 0.0002729650475929236, + "loss": 7.8436, + "step": 14093 + }, + { + "epoch": 1.3151068396006345, + "grad_norm": 69844833057.3367, + "learning_rate": 0.0002729607156783644, + "loss": 7.2886, + "step": 14094 + }, + { + "epoch": 1.3152001492955119, + "grad_norm": 158.96456059234285, + "learning_rate": 0.0002729563834511525, + "loss": 7.614, + "step": 14095 + }, + { + "epoch": 1.315293458990389, + "grad_norm": 5.388565870524617, + "learning_rate": 0.00027295205091129896, + "loss": 7.4146, + "step": 14096 + }, + { + "epoch": 1.3153867686852663, + "grad_norm": 7.515513381990784, + "learning_rate": 0.0002729477180588148, + "loss": 7.3764, + "step": 14097 + }, + { + "epoch": 1.3154800783801437, + "grad_norm": 17.689190686690154, + "learning_rate": 0.00027294338489371097, + "loss": 7.3901, + "step": 14098 + }, + { + "epoch": 1.315573388075021, + "grad_norm": 97.30803724017308, + "learning_rate": 0.00027293905141599856, + "loss": 7.5711, + "step": 14099 + }, + { + "epoch": 1.3156666977698983, + "grad_norm": 45.20651943439454, + "learning_rate": 0.00027293471762568854, + "loss": 7.257, + "step": 14100 + }, + { + "epoch": 1.3157600074647755, + "grad_norm": 20.070984834376187, + "learning_rate": 0.0002729303835227919, + "loss": 7.4235, + "step": 14101 + }, + { + "epoch": 1.315853317159653, + "grad_norm": 208.6294663502922, + "learning_rate": 0.0002729260491073198, + "loss": 7.4818, + "step": 14102 + }, + { + "epoch": 1.3159466268545301, + "grad_norm": 94.11946856504714, + "learning_rate": 0.0002729217143792832, + "loss": 7.5569, + "step": 14103 + }, + { + "epoch": 1.3160399365494075, + "grad_norm": 3.2033894671473613, + "learning_rate": 0.000272917379338693, + "loss": 7.635, + "step": 14104 + }, + { + "epoch": 1.3161332462442847, + "grad_norm": 16.48409073043041, + "learning_rate": 0.0002729130439855603, + "loss": 7.5264, + "step": 14105 + }, + { + "epoch": 1.3162265559391622, + "grad_norm": 18.32025838301078, + "learning_rate": 0.0002729087083198962, + "loss": 7.4308, + "step": 14106 + }, + { + "epoch": 1.3163198656340394, + "grad_norm": 693.0102937172923, + "learning_rate": 0.00027290437234171164, + "loss": 7.4972, + "step": 14107 + }, + { + "epoch": 1.3164131753289166, + "grad_norm": 3.916513532668238, + "learning_rate": 0.0002729000360510177, + "loss": 7.1758, + "step": 14108 + }, + { + "epoch": 1.316506485023794, + "grad_norm": 9933759727.616112, + "learning_rate": 0.00027289569944782534, + "loss": 7.5641, + "step": 14109 + }, + { + "epoch": 1.3165997947186714, + "grad_norm": 4.50074407748358, + "learning_rate": 0.0002728913625321456, + "loss": 7.4765, + "step": 14110 + }, + { + "epoch": 1.3166931044135486, + "grad_norm": 15.905942966860575, + "learning_rate": 0.00027288702530398957, + "loss": 7.6909, + "step": 14111 + }, + { + "epoch": 1.3167864141084258, + "grad_norm": 4.052432896033361, + "learning_rate": 0.00027288268776336824, + "loss": 7.6858, + "step": 14112 + }, + { + "epoch": 1.3168797238033032, + "grad_norm": 3.15270666009407, + "learning_rate": 0.00027287834991029263, + "loss": 7.8556, + "step": 14113 + }, + { + "epoch": 1.3169730334981804, + "grad_norm": 2.7052537366209854, + "learning_rate": 0.00027287401174477376, + "loss": 7.5358, + "step": 14114 + }, + { + "epoch": 1.3170663431930578, + "grad_norm": 14.438413269584064, + "learning_rate": 0.00027286967326682263, + "loss": 7.3124, + "step": 14115 + }, + { + "epoch": 1.317159652887935, + "grad_norm": 2.74795306405636, + "learning_rate": 0.0002728653344764504, + "loss": 7.3682, + "step": 14116 + }, + { + "epoch": 1.3172529625828124, + "grad_norm": 238.34254534990723, + "learning_rate": 0.00027286099537366797, + "loss": 7.6556, + "step": 14117 + }, + { + "epoch": 1.3173462722776896, + "grad_norm": 17.514604312020996, + "learning_rate": 0.00027285665595848645, + "loss": 7.7367, + "step": 14118 + }, + { + "epoch": 1.3174395819725668, + "grad_norm": 4.678809181886464, + "learning_rate": 0.00027285231623091686, + "loss": 7.2688, + "step": 14119 + }, + { + "epoch": 1.3175328916674443, + "grad_norm": 9.4533202917884, + "learning_rate": 0.00027284797619097023, + "loss": 7.3807, + "step": 14120 + }, + { + "epoch": 1.3176262013623217, + "grad_norm": 11.381979409417042, + "learning_rate": 0.00027284363583865754, + "loss": 7.3331, + "step": 14121 + }, + { + "epoch": 1.3177195110571989, + "grad_norm": 2545186579798.765, + "learning_rate": 0.0002728392951739899, + "loss": 7.4309, + "step": 14122 + }, + { + "epoch": 1.317812820752076, + "grad_norm": 6.338888880596563, + "learning_rate": 0.0002728349541969783, + "loss": 7.5684, + "step": 14123 + }, + { + "epoch": 1.3179061304469535, + "grad_norm": 5.5805125612179864, + "learning_rate": 0.0002728306129076338, + "loss": 7.5224, + "step": 14124 + }, + { + "epoch": 1.3179994401418307, + "grad_norm": 12.684740713436637, + "learning_rate": 0.00027282627130596743, + "loss": 7.6147, + "step": 14125 + }, + { + "epoch": 1.318092749836708, + "grad_norm": 1383492221597.513, + "learning_rate": 0.00027282192939199026, + "loss": 7.1197, + "step": 14126 + }, + { + "epoch": 1.3181860595315853, + "grad_norm": 2.994282304000605, + "learning_rate": 0.00027281758716571325, + "loss": 7.2566, + "step": 14127 + }, + { + "epoch": 1.3182793692264627, + "grad_norm": 9.210701882856515, + "learning_rate": 0.0002728132446271476, + "loss": 7.3487, + "step": 14128 + }, + { + "epoch": 1.31837267892134, + "grad_norm": 10.034269833614509, + "learning_rate": 0.00027280890177630416, + "loss": 7.1864, + "step": 14129 + }, + { + "epoch": 1.318465988616217, + "grad_norm": 5.347781681652247, + "learning_rate": 0.0002728045586131941, + "loss": 7.4007, + "step": 14130 + }, + { + "epoch": 1.3185592983110945, + "grad_norm": 43.78750818876405, + "learning_rate": 0.0002728002151378284, + "loss": 7.2838, + "step": 14131 + }, + { + "epoch": 1.318652608005972, + "grad_norm": 5.706494594630492, + "learning_rate": 0.0002727958713502182, + "loss": 7.2565, + "step": 14132 + }, + { + "epoch": 1.3187459177008491, + "grad_norm": 2.0550029608479865, + "learning_rate": 0.0002727915272503744, + "loss": 7.3129, + "step": 14133 + }, + { + "epoch": 1.3188392273957263, + "grad_norm": 26.323541470668417, + "learning_rate": 0.0002727871828383081, + "loss": 7.3391, + "step": 14134 + }, + { + "epoch": 1.3189325370906038, + "grad_norm": 705644248953.2972, + "learning_rate": 0.00027278283811403044, + "loss": 7.0983, + "step": 14135 + }, + { + "epoch": 1.319025846785481, + "grad_norm": 62.57284134499972, + "learning_rate": 0.00027277849307755236, + "loss": 7.4279, + "step": 14136 + }, + { + "epoch": 1.3191191564803584, + "grad_norm": 14.23555851712534, + "learning_rate": 0.00027277414772888494, + "loss": 7.6269, + "step": 14137 + }, + { + "epoch": 1.3192124661752356, + "grad_norm": 5.293715579543331, + "learning_rate": 0.0002727698020680392, + "loss": 7.7451, + "step": 14138 + }, + { + "epoch": 1.319305775870113, + "grad_norm": 182334968179.47873, + "learning_rate": 0.0002727654560950263, + "loss": 7.6128, + "step": 14139 + }, + { + "epoch": 1.3193990855649902, + "grad_norm": 1.8899673712417364, + "learning_rate": 0.0002727611098098571, + "loss": 7.7711, + "step": 14140 + }, + { + "epoch": 1.3194923952598674, + "grad_norm": 670067201556.5002, + "learning_rate": 0.00027275676321254284, + "loss": 7.2678, + "step": 14141 + }, + { + "epoch": 1.3195857049547448, + "grad_norm": 1603266550775.63, + "learning_rate": 0.0002727524163030945, + "loss": 7.2951, + "step": 14142 + }, + { + "epoch": 1.3196790146496222, + "grad_norm": 34454196491.20335, + "learning_rate": 0.0002727480690815231, + "loss": 7.2273, + "step": 14143 + }, + { + "epoch": 1.3197723243444994, + "grad_norm": 1496960500256.5, + "learning_rate": 0.00027274372154783975, + "loss": 7.465, + "step": 14144 + }, + { + "epoch": 1.3198656340393766, + "grad_norm": 7.469129247791531, + "learning_rate": 0.0002727393737020555, + "loss": 7.3902, + "step": 14145 + }, + { + "epoch": 1.319958943734254, + "grad_norm": 10408952402204.875, + "learning_rate": 0.00027273502554418127, + "loss": 7.5573, + "step": 14146 + }, + { + "epoch": 1.3200522534291312, + "grad_norm": 3746684198217.049, + "learning_rate": 0.00027273067707422833, + "loss": 7.5562, + "step": 14147 + }, + { + "epoch": 1.3201455631240087, + "grad_norm": 1.7396152710244346, + "learning_rate": 0.0002727263282922076, + "loss": 7.5347, + "step": 14148 + }, + { + "epoch": 1.3202388728188859, + "grad_norm": 2.076730033828725, + "learning_rate": 0.00027272197919813015, + "loss": 7.3009, + "step": 14149 + }, + { + "epoch": 1.3203321825137633, + "grad_norm": 238.2867451903606, + "learning_rate": 0.0002727176297920071, + "loss": 7.1843, + "step": 14150 + }, + { + "epoch": 1.3204254922086405, + "grad_norm": 2.9462343318452064, + "learning_rate": 0.0002727132800738494, + "loss": 7.3026, + "step": 14151 + }, + { + "epoch": 1.3205188019035177, + "grad_norm": 3.9324781887296196, + "learning_rate": 0.00027270893004366825, + "loss": 7.6689, + "step": 14152 + }, + { + "epoch": 1.320612111598395, + "grad_norm": 9222173898841.29, + "learning_rate": 0.0002727045797014746, + "loss": 7.068, + "step": 14153 + }, + { + "epoch": 1.3207054212932725, + "grad_norm": 73649029517477.25, + "learning_rate": 0.0002727002290472796, + "loss": 7.6615, + "step": 14154 + }, + { + "epoch": 1.3207987309881497, + "grad_norm": 41071305321208.266, + "learning_rate": 0.0002726958780810942, + "loss": 7.5447, + "step": 14155 + }, + { + "epoch": 1.320892040683027, + "grad_norm": 6.3497241060137615, + "learning_rate": 0.00027269152680292955, + "loss": 7.3382, + "step": 14156 + }, + { + "epoch": 1.3209853503779043, + "grad_norm": 6.580807325534571, + "learning_rate": 0.0002726871752127967, + "loss": 7.4161, + "step": 14157 + }, + { + "epoch": 1.3210786600727815, + "grad_norm": 1355776863888.0176, + "learning_rate": 0.0002726828233107067, + "loss": 7.3703, + "step": 14158 + }, + { + "epoch": 1.321171969767659, + "grad_norm": 5.289247720375481, + "learning_rate": 0.0002726784710966707, + "loss": 7.6427, + "step": 14159 + }, + { + "epoch": 1.3212652794625361, + "grad_norm": 25990396978032.812, + "learning_rate": 0.0002726741185706995, + "loss": 7.2441, + "step": 14160 + }, + { + "epoch": 1.3213585891574136, + "grad_norm": 58989518593826.164, + "learning_rate": 0.00027266976573280454, + "loss": 7.6971, + "step": 14161 + }, + { + "epoch": 1.3214518988522908, + "grad_norm": 32.23312297763156, + "learning_rate": 0.00027266541258299664, + "loss": 7.2742, + "step": 14162 + }, + { + "epoch": 1.321545208547168, + "grad_norm": 5.553443887075868, + "learning_rate": 0.0002726610591212869, + "loss": 7.4632, + "step": 14163 + }, + { + "epoch": 1.3216385182420454, + "grad_norm": 14.723671329214568, + "learning_rate": 0.0002726567053476865, + "loss": 7.2623, + "step": 14164 + }, + { + "epoch": 1.3217318279369226, + "grad_norm": 89598906860508.08, + "learning_rate": 0.00027265235126220635, + "loss": 7.4394, + "step": 14165 + }, + { + "epoch": 1.3218251376318, + "grad_norm": 15.699991098467258, + "learning_rate": 0.00027264799686485764, + "loss": 7.4118, + "step": 14166 + }, + { + "epoch": 1.3219184473266772, + "grad_norm": 15.923211656412892, + "learning_rate": 0.00027264364215565134, + "loss": 7.602, + "step": 14167 + }, + { + "epoch": 1.3220117570215546, + "grad_norm": 3.9445858894359946, + "learning_rate": 0.0002726392871345987, + "loss": 7.7582, + "step": 14168 + }, + { + "epoch": 1.3221050667164318, + "grad_norm": 2.746982155975558, + "learning_rate": 0.0002726349318017106, + "loss": 7.3957, + "step": 14169 + }, + { + "epoch": 1.3221983764113092, + "grad_norm": 82.46182142072632, + "learning_rate": 0.00027263057615699825, + "loss": 7.361, + "step": 14170 + }, + { + "epoch": 1.3222916861061864, + "grad_norm": 45.43304493932595, + "learning_rate": 0.00027262622020047264, + "loss": 7.2182, + "step": 14171 + }, + { + "epoch": 1.3223849958010638, + "grad_norm": 3.538637323705143, + "learning_rate": 0.00027262186393214486, + "loss": 7.4297, + "step": 14172 + }, + { + "epoch": 1.322478305495941, + "grad_norm": 21487168406948.516, + "learning_rate": 0.000272617507352026, + "loss": 7.5959, + "step": 14173 + }, + { + "epoch": 1.3225716151908182, + "grad_norm": 2.4207595772446235, + "learning_rate": 0.00027261315046012716, + "loss": 7.5058, + "step": 14174 + }, + { + "epoch": 1.3226649248856956, + "grad_norm": 4.0385227613463295, + "learning_rate": 0.0002726087932564594, + "loss": 7.4663, + "step": 14175 + }, + { + "epoch": 1.3227582345805728, + "grad_norm": 109611752897387.38, + "learning_rate": 0.0002726044357410338, + "loss": 7.6557, + "step": 14176 + }, + { + "epoch": 1.3228515442754503, + "grad_norm": 170469883945.35596, + "learning_rate": 0.00027260007791386146, + "loss": 7.8831, + "step": 14177 + }, + { + "epoch": 1.3229448539703275, + "grad_norm": 5.5169970834494295, + "learning_rate": 0.0002725957197749534, + "loss": 7.2623, + "step": 14178 + }, + { + "epoch": 1.3230381636652049, + "grad_norm": 2.459375501922747, + "learning_rate": 0.00027259136132432076, + "loss": 7.7682, + "step": 14179 + }, + { + "epoch": 1.323131473360082, + "grad_norm": 22.163648853696323, + "learning_rate": 0.00027258700256197464, + "loss": 7.7485, + "step": 14180 + }, + { + "epoch": 1.3232247830549593, + "grad_norm": 5.482074886238612, + "learning_rate": 0.00027258264348792603, + "loss": 7.4875, + "step": 14181 + }, + { + "epoch": 1.3233180927498367, + "grad_norm": 2083512.521934494, + "learning_rate": 0.00027257828410218605, + "loss": 7.4832, + "step": 14182 + }, + { + "epoch": 1.3234114024447141, + "grad_norm": 3.347821353599674, + "learning_rate": 0.00027257392440476586, + "loss": 7.1731, + "step": 14183 + }, + { + "epoch": 1.3235047121395913, + "grad_norm": 13.539400242871794, + "learning_rate": 0.0002725695643956765, + "loss": 7.2265, + "step": 14184 + }, + { + "epoch": 1.3235980218344685, + "grad_norm": 1.4066148285587015, + "learning_rate": 0.000272565204074929, + "loss": 7.4555, + "step": 14185 + }, + { + "epoch": 1.323691331529346, + "grad_norm": 12.246160203372908, + "learning_rate": 0.0002725608434425345, + "loss": 7.393, + "step": 14186 + }, + { + "epoch": 1.3237846412242231, + "grad_norm": 3.3094996816941165, + "learning_rate": 0.00027255648249850407, + "loss": 7.4278, + "step": 14187 + }, + { + "epoch": 1.3238779509191005, + "grad_norm": 3.2886525351517673, + "learning_rate": 0.00027255212124284884, + "loss": 7.6782, + "step": 14188 + }, + { + "epoch": 1.3239712606139777, + "grad_norm": 343614.60338278004, + "learning_rate": 0.0002725477596755798, + "loss": 7.5898, + "step": 14189 + }, + { + "epoch": 1.3240645703088552, + "grad_norm": 1.3043851743054982, + "learning_rate": 0.0002725433977967082, + "loss": 7.6497, + "step": 14190 + }, + { + "epoch": 1.3241578800037324, + "grad_norm": 8.002314818278128, + "learning_rate": 0.00027253903560624503, + "loss": 7.6009, + "step": 14191 + }, + { + "epoch": 1.3242511896986096, + "grad_norm": 3.777053320444151, + "learning_rate": 0.00027253467310420134, + "loss": 7.4604, + "step": 14192 + }, + { + "epoch": 1.324344499393487, + "grad_norm": 1.4336906697718914, + "learning_rate": 0.0002725303102905883, + "loss": 7.6459, + "step": 14193 + }, + { + "epoch": 1.3244378090883644, + "grad_norm": 10.824794942124521, + "learning_rate": 0.0002725259471654169, + "loss": 7.1659, + "step": 14194 + }, + { + "epoch": 1.3245311187832416, + "grad_norm": 3.3086460568583806, + "learning_rate": 0.0002725215837286985, + "loss": 7.1251, + "step": 14195 + }, + { + "epoch": 1.3246244284781188, + "grad_norm": 6.083166869238951, + "learning_rate": 0.00027251721998044386, + "loss": 7.5891, + "step": 14196 + }, + { + "epoch": 1.3247177381729962, + "grad_norm": 3.2928805983894893, + "learning_rate": 0.00027251285592066424, + "loss": 7.5021, + "step": 14197 + }, + { + "epoch": 1.3248110478678734, + "grad_norm": 2.9275453332185863, + "learning_rate": 0.0002725084915493707, + "loss": 7.3964, + "step": 14198 + }, + { + "epoch": 1.3249043575627508, + "grad_norm": 21.360161882379824, + "learning_rate": 0.0002725041268665744, + "loss": 7.4363, + "step": 14199 + }, + { + "epoch": 1.324997667257628, + "grad_norm": 49024989.30404082, + "learning_rate": 0.0002724997618722864, + "loss": 7.2091, + "step": 14200 + }, + { + "epoch": 1.3250909769525054, + "grad_norm": 14662867353.68514, + "learning_rate": 0.00027249539656651773, + "loss": 7.3061, + "step": 14201 + }, + { + "epoch": 1.3251842866473826, + "grad_norm": 2.8561628842921545, + "learning_rate": 0.00027249103094927963, + "loss": 7.4883, + "step": 14202 + }, + { + "epoch": 1.3252775963422598, + "grad_norm": 24.814879874128682, + "learning_rate": 0.00027248666502058304, + "loss": 7.2599, + "step": 14203 + }, + { + "epoch": 1.3253709060371373, + "grad_norm": 5.779180802529334, + "learning_rate": 0.0002724822987804392, + "loss": 7.4485, + "step": 14204 + }, + { + "epoch": 1.3254642157320147, + "grad_norm": 32152113.20612714, + "learning_rate": 0.0002724779322288592, + "loss": 7.3978, + "step": 14205 + }, + { + "epoch": 1.3255575254268919, + "grad_norm": 1.1274367412779889, + "learning_rate": 0.00027247356536585403, + "loss": 7.3518, + "step": 14206 + }, + { + "epoch": 1.325650835121769, + "grad_norm": 3.1730498689387203, + "learning_rate": 0.00027246919819143484, + "loss": 7.3833, + "step": 14207 + }, + { + "epoch": 1.3257441448166465, + "grad_norm": 2.6475368746931394, + "learning_rate": 0.0002724648307056128, + "loss": 7.3988, + "step": 14208 + }, + { + "epoch": 1.3258374545115237, + "grad_norm": 3.9648114581779605, + "learning_rate": 0.000272460462908399, + "loss": 7.6966, + "step": 14209 + }, + { + "epoch": 1.325930764206401, + "grad_norm": 2.085074918185172, + "learning_rate": 0.0002724560947998045, + "loss": 6.9548, + "step": 14210 + }, + { + "epoch": 1.3260240739012783, + "grad_norm": 4.1706590002053545, + "learning_rate": 0.0002724517263798404, + "loss": 7.251, + "step": 14211 + }, + { + "epoch": 1.3261173835961557, + "grad_norm": 3.2969177441608437, + "learning_rate": 0.00027244735764851783, + "loss": 6.9103, + "step": 14212 + }, + { + "epoch": 1.326210693291033, + "grad_norm": 10183904.946690533, + "learning_rate": 0.0002724429886058479, + "loss": 7.2388, + "step": 14213 + }, + { + "epoch": 1.3263040029859101, + "grad_norm": 3.040903182099313, + "learning_rate": 0.00027243861925184176, + "loss": 7.4271, + "step": 14214 + }, + { + "epoch": 1.3263973126807875, + "grad_norm": 1.171032283161757, + "learning_rate": 0.0002724342495865104, + "loss": 7.2784, + "step": 14215 + }, + { + "epoch": 1.326490622375665, + "grad_norm": 2.074663532008117, + "learning_rate": 0.0002724298796098651, + "loss": 7.5748, + "step": 14216 + }, + { + "epoch": 1.3265839320705421, + "grad_norm": 1.4000427132452629, + "learning_rate": 0.0002724255093219169, + "loss": 7.3187, + "step": 14217 + }, + { + "epoch": 1.3266772417654193, + "grad_norm": 1.5750024254615544, + "learning_rate": 0.00027242113872267686, + "loss": 7.6408, + "step": 14218 + }, + { + "epoch": 1.3267705514602968, + "grad_norm": 0.778828033284523, + "learning_rate": 0.0002724167678121561, + "loss": 7.1395, + "step": 14219 + }, + { + "epoch": 1.326863861155174, + "grad_norm": 3.3424653696502795, + "learning_rate": 0.0002724123965903657, + "loss": 7.5676, + "step": 14220 + }, + { + "epoch": 1.3269571708500514, + "grad_norm": 1.7186356085294856, + "learning_rate": 0.000272408025057317, + "loss": 7.3027, + "step": 14221 + }, + { + "epoch": 1.3270504805449286, + "grad_norm": 1.9000938286794078, + "learning_rate": 0.00027240365321302084, + "loss": 7.4465, + "step": 14222 + }, + { + "epoch": 1.327143790239806, + "grad_norm": 2.016015264898921, + "learning_rate": 0.0002723992810574884, + "loss": 7.0707, + "step": 14223 + }, + { + "epoch": 1.3272370999346832, + "grad_norm": 2.4002879874571184, + "learning_rate": 0.00027239490859073094, + "loss": 7.5059, + "step": 14224 + }, + { + "epoch": 1.3273304096295604, + "grad_norm": 6.201992666742441, + "learning_rate": 0.00027239053581275946, + "loss": 7.4126, + "step": 14225 + }, + { + "epoch": 1.3274237193244378, + "grad_norm": 3.0801426711760227, + "learning_rate": 0.00027238616272358515, + "loss": 7.0246, + "step": 14226 + }, + { + "epoch": 1.3275170290193152, + "grad_norm": 2.7923251042889534, + "learning_rate": 0.000272381789323219, + "loss": 7.1279, + "step": 14227 + }, + { + "epoch": 1.3276103387141924, + "grad_norm": 5984076.634781805, + "learning_rate": 0.00027237741561167227, + "loss": 7.5071, + "step": 14228 + }, + { + "epoch": 1.3277036484090696, + "grad_norm": 1.4114504234178327, + "learning_rate": 0.000272373041588956, + "loss": 7.3108, + "step": 14229 + }, + { + "epoch": 1.327796958103947, + "grad_norm": 1.1535451437446527, + "learning_rate": 0.0002723686672550813, + "loss": 7.4419, + "step": 14230 + }, + { + "epoch": 1.3278902677988242, + "grad_norm": 1.472082784758688, + "learning_rate": 0.0002723642926100594, + "loss": 7.3834, + "step": 14231 + }, + { + "epoch": 1.3279835774937017, + "grad_norm": 2.653445639098968, + "learning_rate": 0.00027235991765390134, + "loss": 7.2362, + "step": 14232 + }, + { + "epoch": 1.3280768871885789, + "grad_norm": 1.3388951323837128, + "learning_rate": 0.0002723555423866182, + "loss": 7.2734, + "step": 14233 + }, + { + "epoch": 1.3281701968834563, + "grad_norm": 8451536.35162776, + "learning_rate": 0.0002723511668082212, + "loss": 7.366, + "step": 14234 + }, + { + "epoch": 1.3282635065783335, + "grad_norm": 31.34625714234923, + "learning_rate": 0.0002723467909187214, + "loss": 7.2103, + "step": 14235 + }, + { + "epoch": 1.3283568162732107, + "grad_norm": 2506589.866772443, + "learning_rate": 0.0002723424147181299, + "loss": 7.1466, + "step": 14236 + }, + { + "epoch": 1.328450125968088, + "grad_norm": 1.1611780853732223, + "learning_rate": 0.000272338038206458, + "loss": 7.7806, + "step": 14237 + }, + { + "epoch": 1.3285434356629655, + "grad_norm": 0.872835675139506, + "learning_rate": 0.00027233366138371666, + "loss": 7.1943, + "step": 14238 + }, + { + "epoch": 1.3286367453578427, + "grad_norm": 1.102820864614374, + "learning_rate": 0.00027232928424991704, + "loss": 7.232, + "step": 14239 + }, + { + "epoch": 1.32873005505272, + "grad_norm": 1.1222911309293297, + "learning_rate": 0.0002723249068050703, + "loss": 7.1977, + "step": 14240 + }, + { + "epoch": 1.3288233647475973, + "grad_norm": 13039167.68677062, + "learning_rate": 0.00027232052904918754, + "loss": 7.1924, + "step": 14241 + }, + { + "epoch": 1.3289166744424745, + "grad_norm": 0.7180762499929246, + "learning_rate": 0.00027231615098227994, + "loss": 7.3411, + "step": 14242 + }, + { + "epoch": 1.329009984137352, + "grad_norm": 1.8430451012082973, + "learning_rate": 0.0002723117726043586, + "loss": 7.1989, + "step": 14243 + }, + { + "epoch": 1.3291032938322291, + "grad_norm": 17.954314256293237, + "learning_rate": 0.0002723073939154346, + "loss": 7.2229, + "step": 14244 + }, + { + "epoch": 1.3291966035271066, + "grad_norm": 0.9779786036814961, + "learning_rate": 0.0002723030149155192, + "loss": 7.031, + "step": 14245 + }, + { + "epoch": 1.3292899132219838, + "grad_norm": 8.427726680247925, + "learning_rate": 0.00027229863560462343, + "loss": 7.2291, + "step": 14246 + }, + { + "epoch": 1.329383222916861, + "grad_norm": 1.418890871827864, + "learning_rate": 0.00027229425598275845, + "loss": 7.4138, + "step": 14247 + }, + { + "epoch": 1.3294765326117384, + "grad_norm": 4.114133352629835, + "learning_rate": 0.00027228987604993544, + "loss": 7.8287, + "step": 14248 + }, + { + "epoch": 1.3295698423066158, + "grad_norm": 11.190645576230008, + "learning_rate": 0.0002722854958061655, + "loss": 7.6086, + "step": 14249 + }, + { + "epoch": 1.329663152001493, + "grad_norm": 0.6953613252531888, + "learning_rate": 0.00027228111525145967, + "loss": 7.3632, + "step": 14250 + }, + { + "epoch": 1.3297564616963702, + "grad_norm": 4.815231562242804, + "learning_rate": 0.0002722767343858293, + "loss": 7.3311, + "step": 14251 + }, + { + "epoch": 1.3298497713912476, + "grad_norm": 5757756.896964231, + "learning_rate": 0.00027227235320928533, + "loss": 7.3387, + "step": 14252 + }, + { + "epoch": 1.3299430810861248, + "grad_norm": 3.0224097820349427, + "learning_rate": 0.000272267971721839, + "loss": 7.2272, + "step": 14253 + }, + { + "epoch": 1.3300363907810022, + "grad_norm": 2.192839090166145, + "learning_rate": 0.0002722635899235015, + "loss": 7.4737, + "step": 14254 + }, + { + "epoch": 1.3301297004758794, + "grad_norm": 3.108326843927497, + "learning_rate": 0.00027225920781428386, + "loss": 7.323, + "step": 14255 + }, + { + "epoch": 1.3302230101707568, + "grad_norm": 2.9728242466033525, + "learning_rate": 0.00027225482539419726, + "loss": 7.2148, + "step": 14256 + }, + { + "epoch": 1.330316319865634, + "grad_norm": 1.8100054637005996, + "learning_rate": 0.00027225044266325287, + "loss": 7.6005, + "step": 14257 + }, + { + "epoch": 1.3304096295605112, + "grad_norm": 1.0133527830000966, + "learning_rate": 0.00027224605962146187, + "loss": 7.1561, + "step": 14258 + }, + { + "epoch": 1.3305029392553887, + "grad_norm": 3.97474673428616, + "learning_rate": 0.00027224167626883525, + "loss": 7.2857, + "step": 14259 + }, + { + "epoch": 1.330596248950266, + "grad_norm": 1656351.8889941024, + "learning_rate": 0.00027223729260538436, + "loss": 7.0634, + "step": 14260 + }, + { + "epoch": 1.3306895586451433, + "grad_norm": 1761813.9348251587, + "learning_rate": 0.0002722329086311202, + "loss": 7.2319, + "step": 14261 + }, + { + "epoch": 1.3307828683400205, + "grad_norm": 19024502.225789536, + "learning_rate": 0.00027222852434605394, + "loss": 7.5461, + "step": 14262 + }, + { + "epoch": 1.3308761780348979, + "grad_norm": 5.683425459258375, + "learning_rate": 0.00027222413975019677, + "loss": 7.4001, + "step": 14263 + }, + { + "epoch": 1.330969487729775, + "grad_norm": 3.382906905848986, + "learning_rate": 0.0002722197548435598, + "loss": 7.3579, + "step": 14264 + }, + { + "epoch": 1.3310627974246525, + "grad_norm": 1.0715456549720357, + "learning_rate": 0.0002722153696261542, + "loss": 7.2201, + "step": 14265 + }, + { + "epoch": 1.3311561071195297, + "grad_norm": 6.227316178507144, + "learning_rate": 0.00027221098409799114, + "loss": 7.3935, + "step": 14266 + }, + { + "epoch": 1.3312494168144071, + "grad_norm": 7.905971934113639, + "learning_rate": 0.0002722065982590817, + "loss": 7.5345, + "step": 14267 + }, + { + "epoch": 1.3313427265092843, + "grad_norm": 4069432.382685637, + "learning_rate": 0.00027220221210943714, + "loss": 7.216, + "step": 14268 + }, + { + "epoch": 1.3314360362041615, + "grad_norm": 2.003677476565067, + "learning_rate": 0.00027219782564906855, + "loss": 7.4562, + "step": 14269 + }, + { + "epoch": 1.331529345899039, + "grad_norm": 3.3129093284158926, + "learning_rate": 0.00027219343887798707, + "loss": 7.4819, + "step": 14270 + }, + { + "epoch": 1.3316226555939161, + "grad_norm": 3.5231919301312824, + "learning_rate": 0.0002721890517962039, + "loss": 7.2956, + "step": 14271 + }, + { + "epoch": 1.3317159652887935, + "grad_norm": 207394.3645259338, + "learning_rate": 0.0002721846644037301, + "loss": 7.4597, + "step": 14272 + }, + { + "epoch": 1.3318092749836707, + "grad_norm": 1.8317935934278529, + "learning_rate": 0.0002721802767005769, + "loss": 7.5654, + "step": 14273 + }, + { + "epoch": 1.3319025846785482, + "grad_norm": 3.5366389335105635, + "learning_rate": 0.0002721758886867555, + "loss": 7.0179, + "step": 14274 + }, + { + "epoch": 1.3319958943734254, + "grad_norm": 1.8641326844877097, + "learning_rate": 0.000272171500362277, + "loss": 7.4342, + "step": 14275 + }, + { + "epoch": 1.3320892040683028, + "grad_norm": 7069120.321277854, + "learning_rate": 0.00027216711172715257, + "loss": 7.3663, + "step": 14276 + }, + { + "epoch": 1.33218251376318, + "grad_norm": 1.91026198724244, + "learning_rate": 0.00027216272278139333, + "loss": 7.2304, + "step": 14277 + }, + { + "epoch": 1.3322758234580574, + "grad_norm": 879638.6518759665, + "learning_rate": 0.00027215833352501044, + "loss": 7.0905, + "step": 14278 + }, + { + "epoch": 1.3323691331529346, + "grad_norm": 1.0057266620087781, + "learning_rate": 0.00027215394395801513, + "loss": 7.1375, + "step": 14279 + }, + { + "epoch": 1.3324624428478118, + "grad_norm": 0.7322420905972338, + "learning_rate": 0.0002721495540804186, + "loss": 7.2567, + "step": 14280 + }, + { + "epoch": 1.3325557525426892, + "grad_norm": 3.0803396022419296, + "learning_rate": 0.0002721451638922318, + "loss": 7.3921, + "step": 14281 + }, + { + "epoch": 1.3326490622375664, + "grad_norm": 1.889727960850556, + "learning_rate": 0.0002721407733934661, + "loss": 7.6844, + "step": 14282 + }, + { + "epoch": 1.3327423719324438, + "grad_norm": 2568034.1363001857, + "learning_rate": 0.00027213638258413264, + "loss": 6.912, + "step": 14283 + }, + { + "epoch": 1.332835681627321, + "grad_norm": 5.901337933385075, + "learning_rate": 0.0002721319914642425, + "loss": 7.5422, + "step": 14284 + }, + { + "epoch": 1.3329289913221984, + "grad_norm": 1.8253637989837896, + "learning_rate": 0.00027212760003380687, + "loss": 7.0912, + "step": 14285 + }, + { + "epoch": 1.3330223010170756, + "grad_norm": 2628195.5272671618, + "learning_rate": 0.0002721232082928369, + "loss": 7.341, + "step": 14286 + }, + { + "epoch": 1.3331156107119528, + "grad_norm": 5303388.33055223, + "learning_rate": 0.00027211881624134384, + "loss": 7.3433, + "step": 14287 + }, + { + "epoch": 1.3332089204068303, + "grad_norm": 0.45609494811535056, + "learning_rate": 0.0002721144238793388, + "loss": 7.4433, + "step": 14288 + }, + { + "epoch": 1.3333022301017077, + "grad_norm": 4484627.403452659, + "learning_rate": 0.00027211003120683296, + "loss": 7.4303, + "step": 14289 + }, + { + "epoch": 1.3333955397965849, + "grad_norm": 3.780971113270737, + "learning_rate": 0.00027210563822383746, + "loss": 7.3245, + "step": 14290 + }, + { + "epoch": 1.333488849491462, + "grad_norm": 1406933.8174391766, + "learning_rate": 0.0002721012449303635, + "loss": 7.0213, + "step": 14291 + }, + { + "epoch": 1.3335821591863395, + "grad_norm": 1.5517195357207794, + "learning_rate": 0.00027209685132642227, + "loss": 7.6365, + "step": 14292 + }, + { + "epoch": 1.3336754688812167, + "grad_norm": 1.4543545246456593, + "learning_rate": 0.00027209245741202485, + "loss": 7.1192, + "step": 14293 + }, + { + "epoch": 1.333768778576094, + "grad_norm": 2.162079732383758, + "learning_rate": 0.0002720880631871825, + "loss": 6.9637, + "step": 14294 + }, + { + "epoch": 1.3338620882709713, + "grad_norm": 1.2640627754477372, + "learning_rate": 0.00027208366865190636, + "loss": 7.1892, + "step": 14295 + }, + { + "epoch": 1.3339553979658487, + "grad_norm": 1.023150123337407, + "learning_rate": 0.0002720792738062077, + "loss": 7.3909, + "step": 14296 + }, + { + "epoch": 1.334048707660726, + "grad_norm": 2.5385605862939182, + "learning_rate": 0.0002720748786500975, + "loss": 7.1348, + "step": 14297 + }, + { + "epoch": 1.3341420173556031, + "grad_norm": 2222041.8673488507, + "learning_rate": 0.0002720704831835871, + "loss": 7.4585, + "step": 14298 + }, + { + "epoch": 1.3342353270504805, + "grad_norm": 5.47860550966205, + "learning_rate": 0.00027206608740668763, + "loss": 7.2481, + "step": 14299 + }, + { + "epoch": 1.334328636745358, + "grad_norm": 0.597804235565174, + "learning_rate": 0.00027206169131941023, + "loss": 7.4413, + "step": 14300 + }, + { + "epoch": 1.3344219464402352, + "grad_norm": 3.6126088928394418, + "learning_rate": 0.00027205729492176613, + "loss": 7.2391, + "step": 14301 + }, + { + "epoch": 1.3345152561351123, + "grad_norm": 1.513187996875183, + "learning_rate": 0.00027205289821376644, + "loss": 7.2826, + "step": 14302 + }, + { + "epoch": 1.3346085658299898, + "grad_norm": 1.0050845098707617, + "learning_rate": 0.00027204850119542245, + "loss": 7.2696, + "step": 14303 + }, + { + "epoch": 1.334701875524867, + "grad_norm": 4.286014797669688, + "learning_rate": 0.0002720441038667452, + "loss": 7.3843, + "step": 14304 + }, + { + "epoch": 1.3347951852197444, + "grad_norm": 1.1979858110479435, + "learning_rate": 0.000272039706227746, + "loss": 7.4124, + "step": 14305 + }, + { + "epoch": 1.3348884949146216, + "grad_norm": 4360245.798201383, + "learning_rate": 0.000272035308278436, + "loss": 7.453, + "step": 14306 + }, + { + "epoch": 1.334981804609499, + "grad_norm": 1.225561520278517, + "learning_rate": 0.00027203091001882634, + "loss": 7.0424, + "step": 14307 + }, + { + "epoch": 1.3350751143043762, + "grad_norm": 1.241725900532311, + "learning_rate": 0.00027202651144892816, + "loss": 7.2612, + "step": 14308 + }, + { + "epoch": 1.3351684239992534, + "grad_norm": 0.6146510398458848, + "learning_rate": 0.0002720221125687528, + "loss": 7.3271, + "step": 14309 + }, + { + "epoch": 1.3352617336941308, + "grad_norm": 1.2822930841491826, + "learning_rate": 0.0002720177133783113, + "loss": 7.0642, + "step": 14310 + }, + { + "epoch": 1.3353550433890082, + "grad_norm": 1.8710215052919243, + "learning_rate": 0.0002720133138776149, + "loss": 7.3718, + "step": 14311 + }, + { + "epoch": 1.3354483530838854, + "grad_norm": 1.1376080116525251, + "learning_rate": 0.0002720089140666748, + "loss": 7.2377, + "step": 14312 + }, + { + "epoch": 1.3355416627787626, + "grad_norm": 3.038003437952227, + "learning_rate": 0.00027200451394550216, + "loss": 7.5523, + "step": 14313 + }, + { + "epoch": 1.33563497247364, + "grad_norm": 3.534529620413748, + "learning_rate": 0.0002720001135141082, + "loss": 7.5246, + "step": 14314 + }, + { + "epoch": 1.3357282821685172, + "grad_norm": 28.269838512613692, + "learning_rate": 0.00027199571277250413, + "loss": 7.6907, + "step": 14315 + }, + { + "epoch": 1.3358215918633947, + "grad_norm": 1.5765888890710895, + "learning_rate": 0.000271991311720701, + "loss": 7.1532, + "step": 14316 + }, + { + "epoch": 1.3359149015582719, + "grad_norm": 1.7185764686897067, + "learning_rate": 0.0002719869103587102, + "loss": 7.4783, + "step": 14317 + }, + { + "epoch": 1.3360082112531493, + "grad_norm": 1.110461141173316, + "learning_rate": 0.00027198250868654274, + "loss": 7.3352, + "step": 14318 + }, + { + "epoch": 1.3361015209480265, + "grad_norm": 0.8656024677427756, + "learning_rate": 0.0002719781067042099, + "loss": 7.298, + "step": 14319 + }, + { + "epoch": 1.3361948306429037, + "grad_norm": 2.0098835368973065, + "learning_rate": 0.0002719737044117229, + "loss": 7.5799, + "step": 14320 + }, + { + "epoch": 1.336288140337781, + "grad_norm": 4.654438551828846, + "learning_rate": 0.0002719693018090929, + "loss": 7.1459, + "step": 14321 + }, + { + "epoch": 1.3363814500326585, + "grad_norm": 13349328.976231234, + "learning_rate": 0.00027196489889633115, + "loss": 7.3244, + "step": 14322 + }, + { + "epoch": 1.3364747597275357, + "grad_norm": 0.797256693513477, + "learning_rate": 0.0002719604956734487, + "loss": 7.2757, + "step": 14323 + }, + { + "epoch": 1.336568069422413, + "grad_norm": 1.5326824985936631, + "learning_rate": 0.0002719560921404569, + "loss": 7.5358, + "step": 14324 + }, + { + "epoch": 1.3366613791172903, + "grad_norm": 2.2813354489596662, + "learning_rate": 0.00027195168829736684, + "loss": 7.251, + "step": 14325 + }, + { + "epoch": 1.3367546888121675, + "grad_norm": 0.9253825745651341, + "learning_rate": 0.00027194728414418976, + "loss": 7.4075, + "step": 14326 + }, + { + "epoch": 1.336847998507045, + "grad_norm": 37699267.81747466, + "learning_rate": 0.0002719428796809369, + "loss": 7.5491, + "step": 14327 + }, + { + "epoch": 1.3369413082019221, + "grad_norm": 0.7806841498146339, + "learning_rate": 0.0002719384749076194, + "loss": 7.4139, + "step": 14328 + }, + { + "epoch": 1.3370346178967996, + "grad_norm": 902287.5598119495, + "learning_rate": 0.0002719340698242484, + "loss": 7.1866, + "step": 14329 + }, + { + "epoch": 1.3371279275916768, + "grad_norm": 2.262044050959454, + "learning_rate": 0.0002719296644308353, + "loss": 7.5356, + "step": 14330 + }, + { + "epoch": 1.337221237286554, + "grad_norm": 3978415.3420868465, + "learning_rate": 0.00027192525872739114, + "loss": 7.3849, + "step": 14331 + }, + { + "epoch": 1.3373145469814314, + "grad_norm": 2367162.1427844395, + "learning_rate": 0.0002719208527139271, + "loss": 7.202, + "step": 14332 + }, + { + "epoch": 1.3374078566763088, + "grad_norm": 4.7813854141159675, + "learning_rate": 0.0002719164463904546, + "loss": 7.244, + "step": 14333 + }, + { + "epoch": 1.337501166371186, + "grad_norm": 691108.9280108213, + "learning_rate": 0.00027191203975698456, + "loss": 7.0416, + "step": 14334 + }, + { + "epoch": 1.3375944760660632, + "grad_norm": 441737.8146542359, + "learning_rate": 0.00027190763281352836, + "loss": 7.1028, + "step": 14335 + }, + { + "epoch": 1.3376877857609406, + "grad_norm": 1011170.0107087668, + "learning_rate": 0.00027190322556009717, + "loss": 7.2876, + "step": 14336 + }, + { + "epoch": 1.3377810954558178, + "grad_norm": 1.7563720615000282, + "learning_rate": 0.0002718988179967022, + "loss": 7.5047, + "step": 14337 + }, + { + "epoch": 1.3378744051506952, + "grad_norm": 2.5061617064041983, + "learning_rate": 0.00027189441012335456, + "loss": 7.5057, + "step": 14338 + }, + { + "epoch": 1.3379677148455724, + "grad_norm": 2081123.49027807, + "learning_rate": 0.0002718900019400656, + "loss": 7.4321, + "step": 14339 + }, + { + "epoch": 1.3380610245404498, + "grad_norm": 1.585301232189917, + "learning_rate": 0.0002718855934468465, + "loss": 7.5846, + "step": 14340 + }, + { + "epoch": 1.338154334235327, + "grad_norm": 2.6787931993164213, + "learning_rate": 0.0002718811846437084, + "loss": 7.4103, + "step": 14341 + }, + { + "epoch": 1.3382476439302042, + "grad_norm": 2.3891422567968013, + "learning_rate": 0.0002718767755306626, + "loss": 7.1925, + "step": 14342 + }, + { + "epoch": 1.3383409536250817, + "grad_norm": 1.5731319055929418, + "learning_rate": 0.0002718723661077202, + "loss": 7.4744, + "step": 14343 + }, + { + "epoch": 1.338434263319959, + "grad_norm": 1.7966283811458437, + "learning_rate": 0.0002718679563748925, + "loss": 7.2052, + "step": 14344 + }, + { + "epoch": 1.3385275730148363, + "grad_norm": 2.4929378723455446, + "learning_rate": 0.0002718635463321907, + "loss": 7.0265, + "step": 14345 + }, + { + "epoch": 1.3386208827097135, + "grad_norm": 2.0239890947099064, + "learning_rate": 0.00027185913597962594, + "loss": 7.8006, + "step": 14346 + }, + { + "epoch": 1.3387141924045909, + "grad_norm": 1274233.7677924128, + "learning_rate": 0.0002718547253172096, + "loss": 7.0923, + "step": 14347 + }, + { + "epoch": 1.338807502099468, + "grad_norm": 0.7407639739263525, + "learning_rate": 0.0002718503143449527, + "loss": 7.2245, + "step": 14348 + }, + { + "epoch": 1.3389008117943455, + "grad_norm": 1.5443947571870906, + "learning_rate": 0.0002718459030628666, + "loss": 7.0404, + "step": 14349 + }, + { + "epoch": 1.3389941214892227, + "grad_norm": 0.4814225130708615, + "learning_rate": 0.00027184149147096244, + "loss": 7.135, + "step": 14350 + }, + { + "epoch": 1.3390874311841001, + "grad_norm": 7034644.918391806, + "learning_rate": 0.00027183707956925145, + "loss": 7.5054, + "step": 14351 + }, + { + "epoch": 1.3391807408789773, + "grad_norm": 34.27978410426591, + "learning_rate": 0.00027183266735774487, + "loss": 7.5064, + "step": 14352 + }, + { + "epoch": 1.3392740505738545, + "grad_norm": 0.9497488819410425, + "learning_rate": 0.0002718282548364539, + "loss": 7.585, + "step": 14353 + }, + { + "epoch": 1.339367360268732, + "grad_norm": 2.290464445283399, + "learning_rate": 0.00027182384200538976, + "loss": 7.405, + "step": 14354 + }, + { + "epoch": 1.3394606699636094, + "grad_norm": 9339221.808775626, + "learning_rate": 0.00027181942886456373, + "loss": 7.3791, + "step": 14355 + }, + { + "epoch": 1.3395539796584865, + "grad_norm": 4.242488905092728, + "learning_rate": 0.0002718150154139869, + "loss": 7.0597, + "step": 14356 + }, + { + "epoch": 1.3396472893533637, + "grad_norm": 16665053.998527676, + "learning_rate": 0.0002718106016536706, + "loss": 7.6345, + "step": 14357 + }, + { + "epoch": 1.3397405990482412, + "grad_norm": 2.6053456635646417, + "learning_rate": 0.000271806187583626, + "loss": 7.1562, + "step": 14358 + }, + { + "epoch": 1.3398339087431184, + "grad_norm": 1.4015623300364295, + "learning_rate": 0.00027180177320386444, + "loss": 7.1023, + "step": 14359 + }, + { + "epoch": 1.3399272184379958, + "grad_norm": 1.5058969466232166, + "learning_rate": 0.000271797358514397, + "loss": 7.1322, + "step": 14360 + }, + { + "epoch": 1.340020528132873, + "grad_norm": 7320250.44881784, + "learning_rate": 0.0002717929435152349, + "loss": 7.0575, + "step": 14361 + }, + { + "epoch": 1.3401138378277504, + "grad_norm": 1378824.2961604432, + "learning_rate": 0.00027178852820638946, + "loss": 7.1489, + "step": 14362 + }, + { + "epoch": 1.3402071475226276, + "grad_norm": 3.1118252718108197, + "learning_rate": 0.00027178411258787187, + "loss": 7.4726, + "step": 14363 + }, + { + "epoch": 1.3403004572175048, + "grad_norm": 3886098.855392116, + "learning_rate": 0.00027177969665969335, + "loss": 7.0702, + "step": 14364 + }, + { + "epoch": 1.3403937669123822, + "grad_norm": 3.158749435281805, + "learning_rate": 0.0002717752804218652, + "loss": 7.377, + "step": 14365 + }, + { + "epoch": 1.3404870766072596, + "grad_norm": 1.7057855341922477, + "learning_rate": 0.0002717708638743985, + "loss": 7.0825, + "step": 14366 + }, + { + "epoch": 1.3405803863021368, + "grad_norm": 1.1754974616914953, + "learning_rate": 0.0002717664470173046, + "loss": 7.2855, + "step": 14367 + }, + { + "epoch": 1.340673695997014, + "grad_norm": 401812.48136835353, + "learning_rate": 0.0002717620298505947, + "loss": 7.0529, + "step": 14368 + }, + { + "epoch": 1.3407670056918914, + "grad_norm": 1.8609312977391876, + "learning_rate": 0.00027175761237428, + "loss": 7.1088, + "step": 14369 + }, + { + "epoch": 1.3408603153867686, + "grad_norm": 13975045.628458414, + "learning_rate": 0.00027175319458837174, + "loss": 7.4908, + "step": 14370 + }, + { + "epoch": 1.340953625081646, + "grad_norm": 3618415.1936261407, + "learning_rate": 0.0002717487764928812, + "loss": 7.1941, + "step": 14371 + }, + { + "epoch": 1.3410469347765233, + "grad_norm": 0.9785644724811324, + "learning_rate": 0.00027174435808781963, + "loss": 7.0545, + "step": 14372 + }, + { + "epoch": 1.3411402444714007, + "grad_norm": 1.1280377567552955, + "learning_rate": 0.00027173993937319816, + "loss": 7.4457, + "step": 14373 + }, + { + "epoch": 1.3412335541662779, + "grad_norm": 11444381.620917268, + "learning_rate": 0.00027173552034902813, + "loss": 7.4259, + "step": 14374 + }, + { + "epoch": 1.341326863861155, + "grad_norm": 0.803539760900714, + "learning_rate": 0.0002717311010153207, + "loss": 7.34, + "step": 14375 + }, + { + "epoch": 1.3414201735560325, + "grad_norm": 1.0962051359112346, + "learning_rate": 0.00027172668137208714, + "loss": 7.1984, + "step": 14376 + }, + { + "epoch": 1.3415134832509097, + "grad_norm": 1.2366595576295776, + "learning_rate": 0.0002717222614193387, + "loss": 7.3761, + "step": 14377 + }, + { + "epoch": 1.341606792945787, + "grad_norm": 0.7995294521042197, + "learning_rate": 0.0002717178411570866, + "loss": 7.3757, + "step": 14378 + }, + { + "epoch": 1.3417001026406643, + "grad_norm": 8.713379887984095, + "learning_rate": 0.00027171342058534214, + "loss": 7.3408, + "step": 14379 + }, + { + "epoch": 1.3417934123355417, + "grad_norm": 0.9123147732913196, + "learning_rate": 0.0002717089997041164, + "loss": 7.0486, + "step": 14380 + }, + { + "epoch": 1.341886722030419, + "grad_norm": 1127882.079498568, + "learning_rate": 0.0002717045785134208, + "loss": 7.2939, + "step": 14381 + }, + { + "epoch": 1.3419800317252963, + "grad_norm": 0.6104276643646647, + "learning_rate": 0.0002717001570132665, + "loss": 7.223, + "step": 14382 + }, + { + "epoch": 1.3420733414201735, + "grad_norm": 1981493.1362776733, + "learning_rate": 0.00027169573520366473, + "loss": 6.9937, + "step": 14383 + }, + { + "epoch": 1.342166651115051, + "grad_norm": 1.1625514896208893, + "learning_rate": 0.0002716913130846268, + "loss": 7.1609, + "step": 14384 + }, + { + "epoch": 1.3422599608099282, + "grad_norm": 1.6981610223130605, + "learning_rate": 0.0002716868906561639, + "loss": 7.5996, + "step": 14385 + }, + { + "epoch": 1.3423532705048054, + "grad_norm": 5.290597859342242, + "learning_rate": 0.0002716824679182872, + "loss": 7.322, + "step": 14386 + }, + { + "epoch": 1.3424465801996828, + "grad_norm": 3.325437824116329, + "learning_rate": 0.00027167804487100814, + "loss": 7.3622, + "step": 14387 + }, + { + "epoch": 1.34253988989456, + "grad_norm": 5.455674231757806, + "learning_rate": 0.00027167362151433777, + "loss": 7.5907, + "step": 14388 + }, + { + "epoch": 1.3426331995894374, + "grad_norm": 1.2117157549521065, + "learning_rate": 0.00027166919784828744, + "loss": 7.1137, + "step": 14389 + }, + { + "epoch": 1.3427265092843146, + "grad_norm": 4190717.50538543, + "learning_rate": 0.0002716647738728684, + "loss": 7.5078, + "step": 14390 + }, + { + "epoch": 1.342819818979192, + "grad_norm": 1.4604480766294676, + "learning_rate": 0.0002716603495880919, + "loss": 7.2696, + "step": 14391 + }, + { + "epoch": 1.3429131286740692, + "grad_norm": 1.354433110721614, + "learning_rate": 0.00027165592499396916, + "loss": 7.4547, + "step": 14392 + }, + { + "epoch": 1.3430064383689464, + "grad_norm": 1.2050705436203446, + "learning_rate": 0.00027165150009051135, + "loss": 7.3879, + "step": 14393 + }, + { + "epoch": 1.3430997480638238, + "grad_norm": 0.8803895566258461, + "learning_rate": 0.0002716470748777299, + "loss": 7.3948, + "step": 14394 + }, + { + "epoch": 1.3431930577587012, + "grad_norm": 8.449966489345258, + "learning_rate": 0.000271642649355636, + "loss": 7.2793, + "step": 14395 + }, + { + "epoch": 1.3432863674535784, + "grad_norm": 2.7263255477014297, + "learning_rate": 0.00027163822352424083, + "loss": 7.2292, + "step": 14396 + }, + { + "epoch": 1.3433796771484556, + "grad_norm": 1.9316189342691426, + "learning_rate": 0.0002716337973835557, + "loss": 7.4409, + "step": 14397 + }, + { + "epoch": 1.343472986843333, + "grad_norm": 10838336.229677403, + "learning_rate": 0.00027162937093359185, + "loss": 7.4946, + "step": 14398 + }, + { + "epoch": 1.3435662965382102, + "grad_norm": 1688680.5774498687, + "learning_rate": 0.00027162494417436054, + "loss": 7.1777, + "step": 14399 + }, + { + "epoch": 1.3436596062330877, + "grad_norm": 627995.093281793, + "learning_rate": 0.000271620517105873, + "loss": 7.1029, + "step": 14400 + }, + { + "epoch": 1.3437529159279649, + "grad_norm": 2.4741926664260614, + "learning_rate": 0.0002716160897281406, + "loss": 7.3058, + "step": 14401 + }, + { + "epoch": 1.3438462256228423, + "grad_norm": 1.7639708099758238, + "learning_rate": 0.0002716116620411744, + "loss": 7.5267, + "step": 14402 + }, + { + "epoch": 1.3439395353177195, + "grad_norm": 3.925491407634801, + "learning_rate": 0.0002716072340449858, + "loss": 7.2987, + "step": 14403 + }, + { + "epoch": 1.3440328450125967, + "grad_norm": 0.6173055739854882, + "learning_rate": 0.000271602805739586, + "loss": 7.3883, + "step": 14404 + }, + { + "epoch": 1.344126154707474, + "grad_norm": 4.719340396774906, + "learning_rate": 0.00027159837712498636, + "loss": 7.4015, + "step": 14405 + }, + { + "epoch": 1.3442194644023515, + "grad_norm": 1.532492216030264, + "learning_rate": 0.00027159394820119805, + "loss": 7.4263, + "step": 14406 + }, + { + "epoch": 1.3443127740972287, + "grad_norm": 3.1330485299469673, + "learning_rate": 0.0002715895189682323, + "loss": 7.1574, + "step": 14407 + }, + { + "epoch": 1.344406083792106, + "grad_norm": 1.9966505187079324, + "learning_rate": 0.00027158508942610045, + "loss": 7.6196, + "step": 14408 + }, + { + "epoch": 1.3444993934869833, + "grad_norm": 0.9464619402649124, + "learning_rate": 0.00027158065957481376, + "loss": 7.3651, + "step": 14409 + }, + { + "epoch": 1.3445927031818605, + "grad_norm": 1.376182271305656, + "learning_rate": 0.0002715762294143834, + "loss": 7.2577, + "step": 14410 + }, + { + "epoch": 1.344686012876738, + "grad_norm": 1.2655763126280655, + "learning_rate": 0.0002715717989448208, + "loss": 7.1731, + "step": 14411 + }, + { + "epoch": 1.3447793225716151, + "grad_norm": 2446151.3994309106, + "learning_rate": 0.0002715673681661371, + "loss": 7.0752, + "step": 14412 + }, + { + "epoch": 1.3448726322664926, + "grad_norm": 378526.9267621749, + "learning_rate": 0.0002715629370783435, + "loss": 7.1652, + "step": 14413 + }, + { + "epoch": 1.3449659419613698, + "grad_norm": 1.8586926561715358, + "learning_rate": 0.00027155850568145144, + "loss": 7.4601, + "step": 14414 + }, + { + "epoch": 1.345059251656247, + "grad_norm": 6.341467078654402, + "learning_rate": 0.0002715540739754721, + "loss": 7.175, + "step": 14415 + }, + { + "epoch": 1.3451525613511244, + "grad_norm": 3.8826638932999047, + "learning_rate": 0.00027154964196041674, + "loss": 7.461, + "step": 14416 + }, + { + "epoch": 1.3452458710460018, + "grad_norm": 3.3523343058650794, + "learning_rate": 0.00027154520963629665, + "loss": 7.2929, + "step": 14417 + }, + { + "epoch": 1.345339180740879, + "grad_norm": 1583216.0008737433, + "learning_rate": 0.0002715407770031231, + "loss": 7.3615, + "step": 14418 + }, + { + "epoch": 1.3454324904357562, + "grad_norm": 3.4291637731086326, + "learning_rate": 0.0002715363440609074, + "loss": 7.4431, + "step": 14419 + }, + { + "epoch": 1.3455258001306336, + "grad_norm": 3.9859652719462364, + "learning_rate": 0.00027153191080966075, + "loss": 7.7532, + "step": 14420 + }, + { + "epoch": 1.3456191098255108, + "grad_norm": 17251866.47020556, + "learning_rate": 0.00027152747724939444, + "loss": 7.3655, + "step": 14421 + }, + { + "epoch": 1.3457124195203882, + "grad_norm": 1.021151362965028, + "learning_rate": 0.0002715230433801197, + "loss": 7.5339, + "step": 14422 + }, + { + "epoch": 1.3458057292152654, + "grad_norm": 1.908980678817447, + "learning_rate": 0.00027151860920184793, + "loss": 7.7066, + "step": 14423 + }, + { + "epoch": 1.3458990389101428, + "grad_norm": 1.3721610527804744, + "learning_rate": 0.00027151417471459033, + "loss": 7.1289, + "step": 14424 + }, + { + "epoch": 1.34599234860502, + "grad_norm": 0.8199016403913658, + "learning_rate": 0.00027150973991835816, + "loss": 7.3557, + "step": 14425 + }, + { + "epoch": 1.3460856582998972, + "grad_norm": 5.490242224919861, + "learning_rate": 0.00027150530481316273, + "loss": 7.6447, + "step": 14426 + }, + { + "epoch": 1.3461789679947747, + "grad_norm": 0.7004814468016707, + "learning_rate": 0.0002715008693990153, + "loss": 7.1527, + "step": 14427 + }, + { + "epoch": 1.346272277689652, + "grad_norm": 3941867.8233742057, + "learning_rate": 0.0002714964336759271, + "loss": 7.4575, + "step": 14428 + }, + { + "epoch": 1.3463655873845293, + "grad_norm": 70022065.96781544, + "learning_rate": 0.00027149199764390955, + "loss": 7.4104, + "step": 14429 + }, + { + "epoch": 1.3464588970794065, + "grad_norm": 7629723.256398451, + "learning_rate": 0.0002714875613029738, + "loss": 7.4414, + "step": 14430 + }, + { + "epoch": 1.3465522067742839, + "grad_norm": 242015.5527310473, + "learning_rate": 0.0002714831246531312, + "loss": 7.3922, + "step": 14431 + }, + { + "epoch": 1.346645516469161, + "grad_norm": 4665425.083984418, + "learning_rate": 0.0002714786876943929, + "loss": 7.4965, + "step": 14432 + }, + { + "epoch": 1.3467388261640385, + "grad_norm": 1274793.9407325704, + "learning_rate": 0.0002714742504267703, + "loss": 6.9397, + "step": 14433 + }, + { + "epoch": 1.3468321358589157, + "grad_norm": 879785.7633028816, + "learning_rate": 0.00027146981285027476, + "loss": 7.2069, + "step": 14434 + }, + { + "epoch": 1.3469254455537931, + "grad_norm": 0.9091970304672676, + "learning_rate": 0.0002714653749649174, + "loss": 7.552, + "step": 14435 + }, + { + "epoch": 1.3470187552486703, + "grad_norm": 1.5865138259629066, + "learning_rate": 0.00027146093677070957, + "loss": 7.5657, + "step": 14436 + }, + { + "epoch": 1.3471120649435475, + "grad_norm": 1.322205681309051, + "learning_rate": 0.0002714564982676626, + "loss": 7.5786, + "step": 14437 + }, + { + "epoch": 1.347205374638425, + "grad_norm": 0.8690611605839673, + "learning_rate": 0.0002714520594557877, + "loss": 6.8859, + "step": 14438 + }, + { + "epoch": 1.3472986843333024, + "grad_norm": 2.7934088590913677, + "learning_rate": 0.0002714476203350962, + "loss": 6.9411, + "step": 14439 + }, + { + "epoch": 1.3473919940281796, + "grad_norm": 136072.25050281116, + "learning_rate": 0.0002714431809055994, + "loss": 7.3652, + "step": 14440 + }, + { + "epoch": 1.3474853037230567, + "grad_norm": 1.5778699465515167, + "learning_rate": 0.0002714387411673085, + "loss": 7.3406, + "step": 14441 + }, + { + "epoch": 1.3475786134179342, + "grad_norm": 29442.104466350032, + "learning_rate": 0.0002714343011202349, + "loss": 7.2343, + "step": 14442 + }, + { + "epoch": 1.3476719231128114, + "grad_norm": 1.121278147423616, + "learning_rate": 0.00027142986076438984, + "loss": 7.2567, + "step": 14443 + }, + { + "epoch": 1.3477652328076888, + "grad_norm": 21.0956680434591, + "learning_rate": 0.0002714254200997846, + "loss": 7.3112, + "step": 14444 + }, + { + "epoch": 1.347858542502566, + "grad_norm": 38459176.240013465, + "learning_rate": 0.0002714209791264305, + "loss": 7.6216, + "step": 14445 + }, + { + "epoch": 1.3479518521974434, + "grad_norm": 1.930599810882331, + "learning_rate": 0.00027141653784433885, + "loss": 7.269, + "step": 14446 + }, + { + "epoch": 1.3480451618923206, + "grad_norm": 0.6052669377808281, + "learning_rate": 0.00027141209625352084, + "loss": 7.1729, + "step": 14447 + }, + { + "epoch": 1.3481384715871978, + "grad_norm": 0.9385905045499378, + "learning_rate": 0.00027140765435398786, + "loss": 7.162, + "step": 14448 + }, + { + "epoch": 1.3482317812820752, + "grad_norm": 3.043685700747291, + "learning_rate": 0.0002714032121457512, + "loss": 7.3304, + "step": 14449 + }, + { + "epoch": 1.3483250909769526, + "grad_norm": 10.237072562297053, + "learning_rate": 0.0002713987696288221, + "loss": 7.3612, + "step": 14450 + }, + { + "epoch": 1.3484184006718298, + "grad_norm": 5.349412672816224, + "learning_rate": 0.00027139432680321193, + "loss": 7.3663, + "step": 14451 + }, + { + "epoch": 1.348511710366707, + "grad_norm": 1.4708008455717214, + "learning_rate": 0.00027138988366893195, + "loss": 7.4645, + "step": 14452 + }, + { + "epoch": 1.3486050200615844, + "grad_norm": 3.527420181318543, + "learning_rate": 0.0002713854402259934, + "loss": 7.3527, + "step": 14453 + }, + { + "epoch": 1.3486983297564616, + "grad_norm": 9007131.501724722, + "learning_rate": 0.00027138099647440765, + "loss": 7.4738, + "step": 14454 + }, + { + "epoch": 1.348791639451339, + "grad_norm": 4575487.579685483, + "learning_rate": 0.00027137655241418605, + "loss": 7.1078, + "step": 14455 + }, + { + "epoch": 1.3488849491462163, + "grad_norm": 0.9354929667451646, + "learning_rate": 0.00027137210804533974, + "loss": 7.3683, + "step": 14456 + }, + { + "epoch": 1.3489782588410937, + "grad_norm": 2.7615060813995242, + "learning_rate": 0.00027136766336788016, + "loss": 7.0812, + "step": 14457 + }, + { + "epoch": 1.3490715685359709, + "grad_norm": 1.1353338502224557, + "learning_rate": 0.00027136321838181854, + "loss": 7.3535, + "step": 14458 + }, + { + "epoch": 1.349164878230848, + "grad_norm": 2.093392696310895, + "learning_rate": 0.0002713587730871662, + "loss": 7.0456, + "step": 14459 + }, + { + "epoch": 1.3492581879257255, + "grad_norm": 2158898.2306692195, + "learning_rate": 0.00027135432748393447, + "loss": 7.2482, + "step": 14460 + }, + { + "epoch": 1.349351497620603, + "grad_norm": 0.8860412083671295, + "learning_rate": 0.0002713498815721347, + "loss": 7.3753, + "step": 14461 + }, + { + "epoch": 1.34944480731548, + "grad_norm": 330388504.9032578, + "learning_rate": 0.000271345435351778, + "loss": 7.4288, + "step": 14462 + }, + { + "epoch": 1.3495381170103573, + "grad_norm": 20618212.43216618, + "learning_rate": 0.00027134098882287586, + "loss": 7.7256, + "step": 14463 + }, + { + "epoch": 1.3496314267052347, + "grad_norm": 0.8392269458838236, + "learning_rate": 0.00027133654198543955, + "loss": 7.185, + "step": 14464 + }, + { + "epoch": 1.349724736400112, + "grad_norm": 2.0582411353081578, + "learning_rate": 0.00027133209483948034, + "loss": 7.0085, + "step": 14465 + }, + { + "epoch": 1.3498180460949893, + "grad_norm": 25043601.46929551, + "learning_rate": 0.00027132764738500956, + "loss": 7.137, + "step": 14466 + }, + { + "epoch": 1.3499113557898665, + "grad_norm": 2.434803310272472, + "learning_rate": 0.0002713231996220385, + "loss": 7.3908, + "step": 14467 + }, + { + "epoch": 1.350004665484744, + "grad_norm": 1.4828122140468023, + "learning_rate": 0.00027131875155057844, + "loss": 7.3629, + "step": 14468 + }, + { + "epoch": 1.3500979751796212, + "grad_norm": 1.0660348050186854, + "learning_rate": 0.0002713143031706408, + "loss": 7.3614, + "step": 14469 + }, + { + "epoch": 1.3501912848744984, + "grad_norm": 0.971134232583957, + "learning_rate": 0.0002713098544822368, + "loss": 7.3958, + "step": 14470 + }, + { + "epoch": 1.3502845945693758, + "grad_norm": 6569699.110457683, + "learning_rate": 0.00027130540548537776, + "loss": 7.2967, + "step": 14471 + }, + { + "epoch": 1.3503779042642532, + "grad_norm": 2.0669088125936814, + "learning_rate": 0.00027130095618007504, + "loss": 7.3557, + "step": 14472 + }, + { + "epoch": 1.3504712139591304, + "grad_norm": 0.8613711423472921, + "learning_rate": 0.0002712965065663399, + "loss": 7.0417, + "step": 14473 + }, + { + "epoch": 1.3505645236540076, + "grad_norm": 0.8864443114041995, + "learning_rate": 0.0002712920566441837, + "loss": 7.3466, + "step": 14474 + }, + { + "epoch": 1.350657833348885, + "grad_norm": 1.0549845375513425, + "learning_rate": 0.0002712876064136177, + "loss": 7.4116, + "step": 14475 + }, + { + "epoch": 1.3507511430437622, + "grad_norm": 1.4171454171787385, + "learning_rate": 0.00027128315587465323, + "loss": 7.4115, + "step": 14476 + }, + { + "epoch": 1.3508444527386396, + "grad_norm": 3.9578832682457117, + "learning_rate": 0.0002712787050273017, + "loss": 7.0162, + "step": 14477 + }, + { + "epoch": 1.3509377624335168, + "grad_norm": 11.108329931072088, + "learning_rate": 0.00027127425387157427, + "loss": 7.5181, + "step": 14478 + }, + { + "epoch": 1.3510310721283942, + "grad_norm": 0.8083553389441489, + "learning_rate": 0.00027126980240748234, + "loss": 7.2824, + "step": 14479 + }, + { + "epoch": 1.3511243818232714, + "grad_norm": 0.9476741832082949, + "learning_rate": 0.00027126535063503725, + "loss": 7.2245, + "step": 14480 + }, + { + "epoch": 1.3512176915181486, + "grad_norm": 6.14971926017721, + "learning_rate": 0.00027126089855425027, + "loss": 7.6367, + "step": 14481 + }, + { + "epoch": 1.351311001213026, + "grad_norm": 1.073094997968149, + "learning_rate": 0.00027125644616513277, + "loss": 7.1954, + "step": 14482 + }, + { + "epoch": 1.3514043109079033, + "grad_norm": 1.5125439622105787, + "learning_rate": 0.00027125199346769604, + "loss": 7.0177, + "step": 14483 + }, + { + "epoch": 1.3514976206027807, + "grad_norm": 1.479595147066538, + "learning_rate": 0.0002712475404619514, + "loss": 7.2338, + "step": 14484 + }, + { + "epoch": 1.3515909302976579, + "grad_norm": 0.7336670067369679, + "learning_rate": 0.0002712430871479102, + "loss": 6.8332, + "step": 14485 + }, + { + "epoch": 1.3516842399925353, + "grad_norm": 0.9542141643077815, + "learning_rate": 0.00027123863352558375, + "loss": 7.0478, + "step": 14486 + }, + { + "epoch": 1.3517775496874125, + "grad_norm": 357438857.4141056, + "learning_rate": 0.00027123417959498333, + "loss": 7.3232, + "step": 14487 + }, + { + "epoch": 1.35187085938229, + "grad_norm": 0.5765690071466187, + "learning_rate": 0.00027122972535612033, + "loss": 7.1517, + "step": 14488 + }, + { + "epoch": 1.351964169077167, + "grad_norm": 5.035822252716512, + "learning_rate": 0.00027122527080900607, + "loss": 7.3291, + "step": 14489 + }, + { + "epoch": 1.3520574787720445, + "grad_norm": 1.1867611080310876, + "learning_rate": 0.0002712208159536518, + "loss": 7.2782, + "step": 14490 + }, + { + "epoch": 1.3521507884669217, + "grad_norm": 2.0795305848936354, + "learning_rate": 0.00027121636079006895, + "loss": 7.256, + "step": 14491 + }, + { + "epoch": 1.352244098161799, + "grad_norm": 15.947587799752027, + "learning_rate": 0.0002712119053182688, + "loss": 7.2645, + "step": 14492 + }, + { + "epoch": 1.3523374078566763, + "grad_norm": 1.2577367034297793, + "learning_rate": 0.00027120744953826266, + "loss": 7.2006, + "step": 14493 + }, + { + "epoch": 1.3524307175515535, + "grad_norm": 16.79610535848763, + "learning_rate": 0.00027120299345006186, + "loss": 7.4813, + "step": 14494 + }, + { + "epoch": 1.352524027246431, + "grad_norm": 0.5963139033713925, + "learning_rate": 0.0002711985370536778, + "loss": 7.2969, + "step": 14495 + }, + { + "epoch": 1.3526173369413081, + "grad_norm": 3.7971222266005604, + "learning_rate": 0.0002711940803491217, + "loss": 7.4671, + "step": 14496 + }, + { + "epoch": 1.3527106466361856, + "grad_norm": 60610188.32298637, + "learning_rate": 0.00027118962333640503, + "loss": 7.1096, + "step": 14497 + }, + { + "epoch": 1.3528039563310628, + "grad_norm": 1.2406270549460754, + "learning_rate": 0.000271185166015539, + "loss": 7.2989, + "step": 14498 + }, + { + "epoch": 1.35289726602594, + "grad_norm": 0.7608334376885826, + "learning_rate": 0.00027118070838653497, + "loss": 7.2295, + "step": 14499 + }, + { + "epoch": 1.3529905757208174, + "grad_norm": 0.5698324168694674, + "learning_rate": 0.00027117625044940436, + "loss": 7.0793, + "step": 14500 + }, + { + "epoch": 1.3530838854156948, + "grad_norm": 1.205017149723702, + "learning_rate": 0.0002711717922041584, + "loss": 7.3933, + "step": 14501 + }, + { + "epoch": 1.353177195110572, + "grad_norm": 1.2556687534999087, + "learning_rate": 0.0002711673336508084, + "loss": 7.1423, + "step": 14502 + }, + { + "epoch": 1.3532705048054492, + "grad_norm": 12390373.601449136, + "learning_rate": 0.0002711628747893659, + "loss": 7.2279, + "step": 14503 + }, + { + "epoch": 1.3533638145003266, + "grad_norm": 1.0161244423446867, + "learning_rate": 0.000271158415619842, + "loss": 7.6238, + "step": 14504 + }, + { + "epoch": 1.3534571241952038, + "grad_norm": 0.6547735211266658, + "learning_rate": 0.0002711539561422482, + "loss": 7.3459, + "step": 14505 + }, + { + "epoch": 1.3535504338900812, + "grad_norm": 2.4288745590811334, + "learning_rate": 0.0002711494963565958, + "loss": 7.3838, + "step": 14506 + }, + { + "epoch": 1.3536437435849584, + "grad_norm": 0.7154480506526317, + "learning_rate": 0.000271145036262896, + "loss": 7.0652, + "step": 14507 + }, + { + "epoch": 1.3537370532798358, + "grad_norm": 1.2303207316066656, + "learning_rate": 0.00027114057586116036, + "loss": 7.0255, + "step": 14508 + }, + { + "epoch": 1.353830362974713, + "grad_norm": 5.0881841916534025, + "learning_rate": 0.0002711361151514001, + "loss": 6.9726, + "step": 14509 + }, + { + "epoch": 1.3539236726695902, + "grad_norm": 1.4328865957221784, + "learning_rate": 0.0002711316541336266, + "loss": 7.4785, + "step": 14510 + }, + { + "epoch": 1.3540169823644677, + "grad_norm": 0.5952633738121569, + "learning_rate": 0.0002711271928078511, + "loss": 7.1894, + "step": 14511 + }, + { + "epoch": 1.354110292059345, + "grad_norm": 25.315546755935944, + "learning_rate": 0.0002711227311740851, + "loss": 7.4747, + "step": 14512 + }, + { + "epoch": 1.3542036017542223, + "grad_norm": 991435.0930996195, + "learning_rate": 0.00027111826923233986, + "loss": 7.167, + "step": 14513 + }, + { + "epoch": 1.3542969114490995, + "grad_norm": 1.0357350875159945, + "learning_rate": 0.0002711138069826267, + "loss": 6.9587, + "step": 14514 + }, + { + "epoch": 1.354390221143977, + "grad_norm": 0.5420617524480117, + "learning_rate": 0.0002711093444249571, + "loss": 7.3716, + "step": 14515 + }, + { + "epoch": 1.354483530838854, + "grad_norm": 1.68464348084113, + "learning_rate": 0.0002711048815593422, + "loss": 7.4982, + "step": 14516 + }, + { + "epoch": 1.3545768405337315, + "grad_norm": 1.004824186040295, + "learning_rate": 0.00027110041838579354, + "loss": 7.098, + "step": 14517 + }, + { + "epoch": 1.3546701502286087, + "grad_norm": 2028635.8406058827, + "learning_rate": 0.0002710959549043224, + "loss": 7.2256, + "step": 14518 + }, + { + "epoch": 1.3547634599234861, + "grad_norm": 4.165647442368823, + "learning_rate": 0.00027109149111494005, + "loss": 7.1308, + "step": 14519 + }, + { + "epoch": 1.3548567696183633, + "grad_norm": 1.1318306392938668, + "learning_rate": 0.0002710870270176579, + "loss": 7.4409, + "step": 14520 + }, + { + "epoch": 1.3549500793132405, + "grad_norm": 1.0209149069413848, + "learning_rate": 0.0002710825626124874, + "loss": 7.3714, + "step": 14521 + }, + { + "epoch": 1.355043389008118, + "grad_norm": 2.5145406996400967, + "learning_rate": 0.00027107809789943976, + "loss": 7.1966, + "step": 14522 + }, + { + "epoch": 1.3551366987029954, + "grad_norm": 0.6870376227839339, + "learning_rate": 0.00027107363287852637, + "loss": 7.1969, + "step": 14523 + }, + { + "epoch": 1.3552300083978726, + "grad_norm": 376146.0870544588, + "learning_rate": 0.0002710691675497586, + "loss": 7.1321, + "step": 14524 + }, + { + "epoch": 1.3553233180927498, + "grad_norm": 1.9538610320598138, + "learning_rate": 0.0002710647019131478, + "loss": 7.3264, + "step": 14525 + }, + { + "epoch": 1.3554166277876272, + "grad_norm": 1.0865890890419998, + "learning_rate": 0.00027106023596870535, + "loss": 7.2353, + "step": 14526 + }, + { + "epoch": 1.3555099374825044, + "grad_norm": 1.4374917992356866, + "learning_rate": 0.0002710557697164426, + "loss": 7.2816, + "step": 14527 + }, + { + "epoch": 1.3556032471773818, + "grad_norm": 0.7374933086916327, + "learning_rate": 0.0002710513031563708, + "loss": 7.4366, + "step": 14528 + }, + { + "epoch": 1.355696556872259, + "grad_norm": 0.8451819379170168, + "learning_rate": 0.00027104683628850144, + "loss": 6.967, + "step": 14529 + }, + { + "epoch": 1.3557898665671364, + "grad_norm": 0.5064985601909363, + "learning_rate": 0.00027104236911284585, + "loss": 7.1444, + "step": 14530 + }, + { + "epoch": 1.3558831762620136, + "grad_norm": 0.6794321952170207, + "learning_rate": 0.00027103790162941535, + "loss": 7.3639, + "step": 14531 + }, + { + "epoch": 1.3559764859568908, + "grad_norm": 0.9886959163706329, + "learning_rate": 0.0002710334338382213, + "loss": 7.5049, + "step": 14532 + }, + { + "epoch": 1.3560697956517682, + "grad_norm": 1.2785237941007717, + "learning_rate": 0.00027102896573927515, + "loss": 7.6451, + "step": 14533 + }, + { + "epoch": 1.3561631053466456, + "grad_norm": 0.5821682313318599, + "learning_rate": 0.00027102449733258817, + "loss": 7.1688, + "step": 14534 + }, + { + "epoch": 1.3562564150415228, + "grad_norm": 2.0824193667519673, + "learning_rate": 0.0002710200286181717, + "loss": 7.0782, + "step": 14535 + }, + { + "epoch": 1.3563497247364, + "grad_norm": 3.1218070780407894, + "learning_rate": 0.0002710155595960372, + "loss": 7.4146, + "step": 14536 + }, + { + "epoch": 1.3564430344312775, + "grad_norm": 0.7399496738001827, + "learning_rate": 0.0002710110902661959, + "loss": 7.4589, + "step": 14537 + }, + { + "epoch": 1.3565363441261546, + "grad_norm": 5.2345775314267575, + "learning_rate": 0.0002710066206286593, + "loss": 7.0329, + "step": 14538 + }, + { + "epoch": 1.356629653821032, + "grad_norm": 1.4042357173016962, + "learning_rate": 0.0002710021506834387, + "loss": 7.2513, + "step": 14539 + }, + { + "epoch": 1.3567229635159093, + "grad_norm": 0.906297812586711, + "learning_rate": 0.0002709976804305455, + "loss": 7.2186, + "step": 14540 + }, + { + "epoch": 1.3568162732107867, + "grad_norm": 0.9074921734700987, + "learning_rate": 0.000270993209869991, + "loss": 7.0691, + "step": 14541 + }, + { + "epoch": 1.3569095829056639, + "grad_norm": 1.466371051663933, + "learning_rate": 0.00027098873900178663, + "loss": 7.284, + "step": 14542 + }, + { + "epoch": 1.357002892600541, + "grad_norm": 4.3314419552425525, + "learning_rate": 0.00027098426782594376, + "loss": 7.3336, + "step": 14543 + }, + { + "epoch": 1.3570962022954185, + "grad_norm": 857833.9922742342, + "learning_rate": 0.0002709797963424737, + "loss": 6.8406, + "step": 14544 + }, + { + "epoch": 1.357189511990296, + "grad_norm": 1646737.9450557223, + "learning_rate": 0.00027097532455138784, + "loss": 7.3683, + "step": 14545 + }, + { + "epoch": 1.3572828216851731, + "grad_norm": 0.8388550345006347, + "learning_rate": 0.0002709708524526976, + "loss": 7.3737, + "step": 14546 + }, + { + "epoch": 1.3573761313800503, + "grad_norm": 573816.407227761, + "learning_rate": 0.0002709663800464143, + "loss": 7.2218, + "step": 14547 + }, + { + "epoch": 1.3574694410749277, + "grad_norm": 0.7647999280008345, + "learning_rate": 0.00027096190733254933, + "loss": 7.3091, + "step": 14548 + }, + { + "epoch": 1.357562750769805, + "grad_norm": 1393048.241593593, + "learning_rate": 0.00027095743431111404, + "loss": 7.2952, + "step": 14549 + }, + { + "epoch": 1.3576560604646823, + "grad_norm": 107550.62254516782, + "learning_rate": 0.00027095296098211987, + "loss": 7.3049, + "step": 14550 + }, + { + "epoch": 1.3577493701595595, + "grad_norm": 1.2610448050357748, + "learning_rate": 0.00027094848734557814, + "loss": 7.4544, + "step": 14551 + }, + { + "epoch": 1.357842679854437, + "grad_norm": 1.0061576398990597, + "learning_rate": 0.0002709440134015002, + "loss": 7.009, + "step": 14552 + }, + { + "epoch": 1.3579359895493142, + "grad_norm": 1.1322018438153771, + "learning_rate": 0.0002709395391498975, + "loss": 7.3101, + "step": 14553 + }, + { + "epoch": 1.3580292992441914, + "grad_norm": 3.096457669808368, + "learning_rate": 0.0002709350645907813, + "loss": 7.6264, + "step": 14554 + }, + { + "epoch": 1.3581226089390688, + "grad_norm": 3.4428529490651854, + "learning_rate": 0.0002709305897241632, + "loss": 7.186, + "step": 14555 + }, + { + "epoch": 1.3582159186339462, + "grad_norm": 0.6840642608321332, + "learning_rate": 0.0002709261145500543, + "loss": 7.4164, + "step": 14556 + }, + { + "epoch": 1.3583092283288234, + "grad_norm": 0.539195416530845, + "learning_rate": 0.00027092163906846615, + "loss": 7.1233, + "step": 14557 + }, + { + "epoch": 1.3584025380237006, + "grad_norm": 0.7436155731965602, + "learning_rate": 0.0002709171632794101, + "loss": 7.3635, + "step": 14558 + }, + { + "epoch": 1.358495847718578, + "grad_norm": 0.8153306712201885, + "learning_rate": 0.0002709126871828975, + "loss": 7.1913, + "step": 14559 + }, + { + "epoch": 1.3585891574134552, + "grad_norm": 0.7787600105722998, + "learning_rate": 0.0002709082107789398, + "loss": 7.1954, + "step": 14560 + }, + { + "epoch": 1.3586824671083326, + "grad_norm": 4.105731318482086, + "learning_rate": 0.0002709037340675483, + "loss": 7.1278, + "step": 14561 + }, + { + "epoch": 1.3587757768032098, + "grad_norm": 45508.97521285984, + "learning_rate": 0.00027089925704873447, + "loss": 7.0393, + "step": 14562 + }, + { + "epoch": 1.3588690864980872, + "grad_norm": 0.9641242537094988, + "learning_rate": 0.0002708947797225096, + "loss": 7.3817, + "step": 14563 + }, + { + "epoch": 1.3589623961929644, + "grad_norm": 1.3070943649923645, + "learning_rate": 0.0002708903020888851, + "loss": 7.3731, + "step": 14564 + }, + { + "epoch": 1.3590557058878416, + "grad_norm": 3.9425270923790854, + "learning_rate": 0.00027088582414787235, + "loss": 7.3135, + "step": 14565 + }, + { + "epoch": 1.359149015582719, + "grad_norm": 1.2634450288990238, + "learning_rate": 0.00027088134589948283, + "loss": 7.1829, + "step": 14566 + }, + { + "epoch": 1.3592423252775965, + "grad_norm": 1.2397545827693612, + "learning_rate": 0.0002708768673437278, + "loss": 7.3585, + "step": 14567 + }, + { + "epoch": 1.3593356349724737, + "grad_norm": 0.6455837879341415, + "learning_rate": 0.00027087238848061873, + "loss": 7.0677, + "step": 14568 + }, + { + "epoch": 1.3594289446673509, + "grad_norm": 0.44416800082073493, + "learning_rate": 0.000270867909310167, + "loss": 7.318, + "step": 14569 + }, + { + "epoch": 1.3595222543622283, + "grad_norm": 0.4995826605698118, + "learning_rate": 0.00027086342983238397, + "loss": 7.2426, + "step": 14570 + }, + { + "epoch": 1.3596155640571055, + "grad_norm": 1.6890669320800655, + "learning_rate": 0.00027085895004728104, + "loss": 7.1548, + "step": 14571 + }, + { + "epoch": 1.359708873751983, + "grad_norm": 1.0332724893115857, + "learning_rate": 0.00027085446995486957, + "loss": 7.5346, + "step": 14572 + }, + { + "epoch": 1.35980218344686, + "grad_norm": 0.9386654407513009, + "learning_rate": 0.000270849989555161, + "loss": 6.9508, + "step": 14573 + }, + { + "epoch": 1.3598954931417375, + "grad_norm": 0.8446846185316207, + "learning_rate": 0.0002708455088481667, + "loss": 7.567, + "step": 14574 + }, + { + "epoch": 1.3599888028366147, + "grad_norm": 2.2577910950267466, + "learning_rate": 0.0002708410278338981, + "loss": 7.1588, + "step": 14575 + }, + { + "epoch": 1.360082112531492, + "grad_norm": 1.2011516012371357, + "learning_rate": 0.0002708365465123665, + "loss": 7.4246, + "step": 14576 + }, + { + "epoch": 1.3601754222263693, + "grad_norm": 1.3126367013238944, + "learning_rate": 0.00027083206488358335, + "loss": 7.3124, + "step": 14577 + }, + { + "epoch": 1.3602687319212468, + "grad_norm": 1.3227722604627692, + "learning_rate": 0.00027082758294756014, + "loss": 7.4572, + "step": 14578 + }, + { + "epoch": 1.360362041616124, + "grad_norm": 8.607114511877873, + "learning_rate": 0.0002708231007043081, + "loss": 7.3737, + "step": 14579 + }, + { + "epoch": 1.3604553513110011, + "grad_norm": 364664.3579202545, + "learning_rate": 0.00027081861815383875, + "loss": 7.0714, + "step": 14580 + }, + { + "epoch": 1.3605486610058786, + "grad_norm": 581020.4104468221, + "learning_rate": 0.0002708141352961634, + "loss": 7.2658, + "step": 14581 + }, + { + "epoch": 1.3606419707007558, + "grad_norm": 2.3593196981319347, + "learning_rate": 0.0002708096521312936, + "loss": 7.7954, + "step": 14582 + }, + { + "epoch": 1.3607352803956332, + "grad_norm": 1.012502817550142, + "learning_rate": 0.00027080516865924054, + "loss": 7.2111, + "step": 14583 + }, + { + "epoch": 1.3608285900905104, + "grad_norm": 934996.9679419587, + "learning_rate": 0.0002708006848800157, + "loss": 7.5247, + "step": 14584 + }, + { + "epoch": 1.3609218997853878, + "grad_norm": 1.4791798508472023, + "learning_rate": 0.00027079620079363054, + "loss": 7.0431, + "step": 14585 + }, + { + "epoch": 1.361015209480265, + "grad_norm": 1260618.5039081573, + "learning_rate": 0.00027079171640009646, + "loss": 7.1574, + "step": 14586 + }, + { + "epoch": 1.3611085191751422, + "grad_norm": 6.787009202899945, + "learning_rate": 0.0002707872316994248, + "loss": 7.1622, + "step": 14587 + }, + { + "epoch": 1.3612018288700196, + "grad_norm": 1.5707765169686936, + "learning_rate": 0.000270782746691627, + "loss": 7.614, + "step": 14588 + }, + { + "epoch": 1.3612951385648968, + "grad_norm": 4158204.8212628467, + "learning_rate": 0.00027077826137671444, + "loss": 7.1766, + "step": 14589 + }, + { + "epoch": 1.3613884482597742, + "grad_norm": 1.0845380799895392, + "learning_rate": 0.0002707737757546985, + "loss": 7.4667, + "step": 14590 + }, + { + "epoch": 1.3614817579546514, + "grad_norm": 1.949088427939837, + "learning_rate": 0.00027076928982559074, + "loss": 7.3508, + "step": 14591 + }, + { + "epoch": 1.3615750676495288, + "grad_norm": 1.0409473343577096, + "learning_rate": 0.0002707648035894023, + "loss": 7.3781, + "step": 14592 + }, + { + "epoch": 1.361668377344406, + "grad_norm": 0.7908517887844965, + "learning_rate": 0.00027076031704614486, + "loss": 7.0584, + "step": 14593 + }, + { + "epoch": 1.3617616870392835, + "grad_norm": 1.166853290516007, + "learning_rate": 0.00027075583019582964, + "loss": 7.4777, + "step": 14594 + }, + { + "epoch": 1.3618549967341607, + "grad_norm": 0.8692153855974948, + "learning_rate": 0.0002707513430384681, + "loss": 7.4285, + "step": 14595 + }, + { + "epoch": 1.361948306429038, + "grad_norm": 0.7991246424281407, + "learning_rate": 0.00027074685557407175, + "loss": 7.2612, + "step": 14596 + }, + { + "epoch": 1.3620416161239153, + "grad_norm": 1.072910223080049, + "learning_rate": 0.0002707423678026518, + "loss": 7.0584, + "step": 14597 + }, + { + "epoch": 1.3621349258187925, + "grad_norm": 0.5846204563619729, + "learning_rate": 0.00027073787972421986, + "loss": 7.2705, + "step": 14598 + }, + { + "epoch": 1.36222823551367, + "grad_norm": 1.4256696606637451, + "learning_rate": 0.0002707333913387872, + "loss": 7.4063, + "step": 14599 + }, + { + "epoch": 1.362321545208547, + "grad_norm": 0.8725235432742241, + "learning_rate": 0.0002707289026463654, + "loss": 7.5375, + "step": 14600 + }, + { + "epoch": 1.3624148549034245, + "grad_norm": 1.2842795988311333, + "learning_rate": 0.00027072441364696566, + "loss": 7.4226, + "step": 14601 + }, + { + "epoch": 1.3625081645983017, + "grad_norm": 0.6842943909877114, + "learning_rate": 0.0002707199243405995, + "loss": 7.1439, + "step": 14602 + }, + { + "epoch": 1.3626014742931791, + "grad_norm": 0.7459601014352713, + "learning_rate": 0.00027071543472727836, + "loss": 7.3688, + "step": 14603 + }, + { + "epoch": 1.3626947839880563, + "grad_norm": 37403393.22872428, + "learning_rate": 0.0002707109448070136, + "loss": 7.3773, + "step": 14604 + }, + { + "epoch": 1.3627880936829335, + "grad_norm": 3.538630356184024, + "learning_rate": 0.0002707064545798167, + "loss": 7.4345, + "step": 14605 + }, + { + "epoch": 1.362881403377811, + "grad_norm": 39338458.73218387, + "learning_rate": 0.000270701964045699, + "loss": 7.1279, + "step": 14606 + }, + { + "epoch": 1.3629747130726884, + "grad_norm": 1.2330969847488542, + "learning_rate": 0.00027069747320467203, + "loss": 7.274, + "step": 14607 + }, + { + "epoch": 1.3630680227675656, + "grad_norm": 0.8340704801925486, + "learning_rate": 0.00027069298205674707, + "loss": 7.5483, + "step": 14608 + }, + { + "epoch": 1.3631613324624428, + "grad_norm": 2081070564.628099, + "learning_rate": 0.00027068849060193564, + "loss": 7.1953, + "step": 14609 + }, + { + "epoch": 1.3632546421573202, + "grad_norm": 936694392.5833592, + "learning_rate": 0.00027068399884024914, + "loss": 7.3138, + "step": 14610 + }, + { + "epoch": 1.3633479518521974, + "grad_norm": 10.29491182972417, + "learning_rate": 0.00027067950677169895, + "loss": 7.4607, + "step": 14611 + }, + { + "epoch": 1.3634412615470748, + "grad_norm": 1.3179855535182567, + "learning_rate": 0.00027067501439629655, + "loss": 7.0073, + "step": 14612 + }, + { + "epoch": 1.363534571241952, + "grad_norm": 1.3259402843046413, + "learning_rate": 0.00027067052171405334, + "loss": 7.4246, + "step": 14613 + }, + { + "epoch": 1.3636278809368294, + "grad_norm": 1.855763725123983, + "learning_rate": 0.0002706660287249807, + "loss": 7.339, + "step": 14614 + }, + { + "epoch": 1.3637211906317066, + "grad_norm": 1.5087495994472062, + "learning_rate": 0.0002706615354290901, + "loss": 7.3653, + "step": 14615 + }, + { + "epoch": 1.3638145003265838, + "grad_norm": 1.3360705722150072, + "learning_rate": 0.000270657041826393, + "loss": 7.2135, + "step": 14616 + }, + { + "epoch": 1.3639078100214612, + "grad_norm": 13334645779.697035, + "learning_rate": 0.00027065254791690073, + "loss": 7.2666, + "step": 14617 + }, + { + "epoch": 1.3640011197163386, + "grad_norm": 11.825588870019635, + "learning_rate": 0.0002706480537006248, + "loss": 7.443, + "step": 14618 + }, + { + "epoch": 1.3640944294112158, + "grad_norm": 2.403720837904089, + "learning_rate": 0.00027064355917757663, + "loss": 7.4603, + "step": 14619 + }, + { + "epoch": 1.364187739106093, + "grad_norm": 7.444844840754046, + "learning_rate": 0.0002706390643477676, + "loss": 7.3461, + "step": 14620 + }, + { + "epoch": 1.3642810488009705, + "grad_norm": 6.811031857457987, + "learning_rate": 0.00027063456921120916, + "loss": 7.1656, + "step": 14621 + }, + { + "epoch": 1.3643743584958477, + "grad_norm": 476208721713.4976, + "learning_rate": 0.00027063007376791273, + "loss": 7.2864, + "step": 14622 + }, + { + "epoch": 1.364467668190725, + "grad_norm": 10.100315759042438, + "learning_rate": 0.0002706255780178898, + "loss": 7.3958, + "step": 14623 + }, + { + "epoch": 1.3645609778856023, + "grad_norm": 2.2593171217805987, + "learning_rate": 0.0002706210819611517, + "loss": 7.3115, + "step": 14624 + }, + { + "epoch": 1.3646542875804797, + "grad_norm": 2.08158173880243, + "learning_rate": 0.00027061658559770997, + "loss": 7.4312, + "step": 14625 + }, + { + "epoch": 1.3647475972753569, + "grad_norm": 13.949826884164569, + "learning_rate": 0.000270612088927576, + "loss": 7.4251, + "step": 14626 + }, + { + "epoch": 1.364840906970234, + "grad_norm": 2.666812641050037, + "learning_rate": 0.00027060759195076116, + "loss": 7.3462, + "step": 14627 + }, + { + "epoch": 1.3649342166651115, + "grad_norm": 74255441.92696503, + "learning_rate": 0.000270603094667277, + "loss": 7.4384, + "step": 14628 + }, + { + "epoch": 1.365027526359989, + "grad_norm": 0.9321868584112446, + "learning_rate": 0.00027059859707713485, + "loss": 7.1953, + "step": 14629 + }, + { + "epoch": 1.3651208360548661, + "grad_norm": 2.500707576519595, + "learning_rate": 0.0002705940991803462, + "loss": 7.3343, + "step": 14630 + }, + { + "epoch": 1.3652141457497433, + "grad_norm": 2.696867774255158, + "learning_rate": 0.00027058960097692253, + "loss": 7.4139, + "step": 14631 + }, + { + "epoch": 1.3653074554446207, + "grad_norm": 2.1753159796127015, + "learning_rate": 0.00027058510246687515, + "loss": 7.6543, + "step": 14632 + }, + { + "epoch": 1.365400765139498, + "grad_norm": 1.4099086734936472, + "learning_rate": 0.0002705806036502156, + "loss": 7.0581, + "step": 14633 + }, + { + "epoch": 1.3654940748343753, + "grad_norm": 1.6868615934383837, + "learning_rate": 0.0002705761045269553, + "loss": 7.5188, + "step": 14634 + }, + { + "epoch": 1.3655873845292525, + "grad_norm": 0.9994225880399354, + "learning_rate": 0.00027057160509710566, + "loss": 7.2956, + "step": 14635 + }, + { + "epoch": 1.36568069422413, + "grad_norm": 9572353.364995403, + "learning_rate": 0.0002705671053606782, + "loss": 7.4247, + "step": 14636 + }, + { + "epoch": 1.3657740039190072, + "grad_norm": 2.8503472733144415, + "learning_rate": 0.0002705626053176843, + "loss": 7.4356, + "step": 14637 + }, + { + "epoch": 1.3658673136138844, + "grad_norm": 1.0234538060310223, + "learning_rate": 0.0002705581049681354, + "loss": 7.4579, + "step": 14638 + }, + { + "epoch": 1.3659606233087618, + "grad_norm": 2.5187135459362042, + "learning_rate": 0.0002705536043120429, + "loss": 7.3338, + "step": 14639 + }, + { + "epoch": 1.3660539330036392, + "grad_norm": 1.8318147705572003, + "learning_rate": 0.0002705491033494183, + "loss": 7.4645, + "step": 14640 + }, + { + "epoch": 1.3661472426985164, + "grad_norm": 1.2155497678263867, + "learning_rate": 0.00027054460208027307, + "loss": 7.8533, + "step": 14641 + }, + { + "epoch": 1.3662405523933936, + "grad_norm": 4020583.073984929, + "learning_rate": 0.0002705401005046187, + "loss": 7.4209, + "step": 14642 + }, + { + "epoch": 1.366333862088271, + "grad_norm": 3.6495626890765545, + "learning_rate": 0.0002705355986224664, + "loss": 7.0061, + "step": 14643 + }, + { + "epoch": 1.3664271717831482, + "grad_norm": 2728463.5437136446, + "learning_rate": 0.0002705310964338279, + "loss": 7.2612, + "step": 14644 + }, + { + "epoch": 1.3665204814780256, + "grad_norm": 0.8242497387217379, + "learning_rate": 0.0002705265939387145, + "loss": 7.3891, + "step": 14645 + }, + { + "epoch": 1.3666137911729028, + "grad_norm": 2.459978190579512, + "learning_rate": 0.0002705220911371376, + "loss": 7.8152, + "step": 14646 + }, + { + "epoch": 1.3667071008677802, + "grad_norm": 1.2339847487485587, + "learning_rate": 0.0002705175880291088, + "loss": 7.5279, + "step": 14647 + }, + { + "epoch": 1.3668004105626574, + "grad_norm": 1.268222120712858, + "learning_rate": 0.00027051308461463946, + "loss": 7.1928, + "step": 14648 + }, + { + "epoch": 1.3668937202575346, + "grad_norm": 2408485.4374116585, + "learning_rate": 0.0002705085808937411, + "loss": 7.5666, + "step": 14649 + }, + { + "epoch": 1.366987029952412, + "grad_norm": 2101242.5202654316, + "learning_rate": 0.00027050407686642503, + "loss": 7.0977, + "step": 14650 + }, + { + "epoch": 1.3670803396472895, + "grad_norm": 1.289288458834017, + "learning_rate": 0.0002704995725327028, + "loss": 7.2467, + "step": 14651 + }, + { + "epoch": 1.3671736493421667, + "grad_norm": 0.6067001956129934, + "learning_rate": 0.00027049506789258586, + "loss": 7.3915, + "step": 14652 + }, + { + "epoch": 1.3672669590370439, + "grad_norm": 1.0310849592189466, + "learning_rate": 0.00027049056294608566, + "loss": 7.1214, + "step": 14653 + }, + { + "epoch": 1.3673602687319213, + "grad_norm": 4.390931846722339, + "learning_rate": 0.00027048605769321366, + "loss": 7.166, + "step": 14654 + }, + { + "epoch": 1.3674535784267985, + "grad_norm": 0.5551342818872992, + "learning_rate": 0.0002704815521339813, + "loss": 7.4713, + "step": 14655 + }, + { + "epoch": 1.367546888121676, + "grad_norm": 0.8582633770638572, + "learning_rate": 0.00027047704626840006, + "loss": 7.3036, + "step": 14656 + }, + { + "epoch": 1.367640197816553, + "grad_norm": 0.8415631947403114, + "learning_rate": 0.00027047254009648133, + "loss": 7.0295, + "step": 14657 + }, + { + "epoch": 1.3677335075114305, + "grad_norm": 0.8369102140835202, + "learning_rate": 0.00027046803361823667, + "loss": 7.1452, + "step": 14658 + }, + { + "epoch": 1.3678268172063077, + "grad_norm": 314411.2505709921, + "learning_rate": 0.00027046352683367746, + "loss": 7.2256, + "step": 14659 + }, + { + "epoch": 1.367920126901185, + "grad_norm": 1632253.9281927778, + "learning_rate": 0.00027045901974281515, + "loss": 7.2142, + "step": 14660 + }, + { + "epoch": 1.3680134365960623, + "grad_norm": 1.2955669415230227, + "learning_rate": 0.0002704545123456613, + "loss": 7.2487, + "step": 14661 + }, + { + "epoch": 1.3681067462909398, + "grad_norm": 0.8517653220527862, + "learning_rate": 0.0002704500046422273, + "loss": 7.1385, + "step": 14662 + }, + { + "epoch": 1.368200055985817, + "grad_norm": 0.6279116177160604, + "learning_rate": 0.00027044549663252456, + "loss": 7.4527, + "step": 14663 + }, + { + "epoch": 1.3682933656806942, + "grad_norm": 273165.8630255272, + "learning_rate": 0.0002704409883165647, + "loss": 7.3642, + "step": 14664 + }, + { + "epoch": 1.3683866753755716, + "grad_norm": 1768686.6282720368, + "learning_rate": 0.000270436479694359, + "loss": 7.2351, + "step": 14665 + }, + { + "epoch": 1.3684799850704488, + "grad_norm": 0.49742762618266684, + "learning_rate": 0.00027043197076591905, + "loss": 7.3651, + "step": 14666 + }, + { + "epoch": 1.3685732947653262, + "grad_norm": 1.3697128660096745, + "learning_rate": 0.00027042746153125625, + "loss": 7.4958, + "step": 14667 + }, + { + "epoch": 1.3686666044602034, + "grad_norm": 0.7919575284696715, + "learning_rate": 0.00027042295199038207, + "loss": 7.2213, + "step": 14668 + }, + { + "epoch": 1.3687599141550808, + "grad_norm": 1.1850690750245834, + "learning_rate": 0.000270418442143308, + "loss": 7.3226, + "step": 14669 + }, + { + "epoch": 1.368853223849958, + "grad_norm": 0.6337992829042915, + "learning_rate": 0.0002704139319900455, + "loss": 7.2327, + "step": 14670 + }, + { + "epoch": 1.3689465335448352, + "grad_norm": 919279.0273753605, + "learning_rate": 0.0002704094215306061, + "loss": 7.1269, + "step": 14671 + }, + { + "epoch": 1.3690398432397126, + "grad_norm": 0.7493081726490851, + "learning_rate": 0.00027040491076500114, + "loss": 7.2124, + "step": 14672 + }, + { + "epoch": 1.36913315293459, + "grad_norm": 927415.2022876701, + "learning_rate": 0.0002704003996932422, + "loss": 7.3389, + "step": 14673 + }, + { + "epoch": 1.3692264626294672, + "grad_norm": 0.8632680210465467, + "learning_rate": 0.0002703958883153407, + "loss": 7.297, + "step": 14674 + }, + { + "epoch": 1.3693197723243444, + "grad_norm": 1.165339740089475, + "learning_rate": 0.0002703913766313081, + "loss": 7.3788, + "step": 14675 + }, + { + "epoch": 1.3694130820192219, + "grad_norm": 2.624874380225296, + "learning_rate": 0.00027038686464115594, + "loss": 7.4149, + "step": 14676 + }, + { + "epoch": 1.369506391714099, + "grad_norm": 0.7056049779720424, + "learning_rate": 0.0002703823523448956, + "loss": 7.269, + "step": 14677 + }, + { + "epoch": 1.3695997014089765, + "grad_norm": 240886.91007356418, + "learning_rate": 0.0002703778397425386, + "loss": 7.4263, + "step": 14678 + }, + { + "epoch": 1.3696930111038537, + "grad_norm": 3279387.1828233977, + "learning_rate": 0.00027037332683409643, + "loss": 7.6812, + "step": 14679 + }, + { + "epoch": 1.369786320798731, + "grad_norm": 1.35905325906379, + "learning_rate": 0.00027036881361958056, + "loss": 7.3517, + "step": 14680 + }, + { + "epoch": 1.3698796304936083, + "grad_norm": 0.9994349003590934, + "learning_rate": 0.00027036430009900244, + "loss": 7.4107, + "step": 14681 + }, + { + "epoch": 1.3699729401884855, + "grad_norm": 523693.43396088685, + "learning_rate": 0.00027035978627237356, + "loss": 7.6711, + "step": 14682 + }, + { + "epoch": 1.370066249883363, + "grad_norm": 158702.1472116652, + "learning_rate": 0.00027035527213970537, + "loss": 6.9321, + "step": 14683 + }, + { + "epoch": 1.3701595595782403, + "grad_norm": 0.8970001388913695, + "learning_rate": 0.0002703507577010094, + "loss": 7.6567, + "step": 14684 + }, + { + "epoch": 1.3702528692731175, + "grad_norm": 400449.9821134203, + "learning_rate": 0.0002703462429562971, + "loss": 7.323, + "step": 14685 + }, + { + "epoch": 1.3703461789679947, + "grad_norm": 0.7548469953968987, + "learning_rate": 0.00027034172790557995, + "loss": 7.2041, + "step": 14686 + }, + { + "epoch": 1.3704394886628721, + "grad_norm": 1.5896540335859501, + "learning_rate": 0.00027033721254886945, + "loss": 7.443, + "step": 14687 + }, + { + "epoch": 1.3705327983577493, + "grad_norm": 1.2090930281699166, + "learning_rate": 0.00027033269688617705, + "loss": 7.5352, + "step": 14688 + }, + { + "epoch": 1.3706261080526267, + "grad_norm": 1.6616777024326603, + "learning_rate": 0.0002703281809175142, + "loss": 7.7704, + "step": 14689 + }, + { + "epoch": 1.370719417747504, + "grad_norm": 0.9976257883651642, + "learning_rate": 0.00027032366464289253, + "loss": 7.1375, + "step": 14690 + }, + { + "epoch": 1.3708127274423814, + "grad_norm": 0.8746133788560756, + "learning_rate": 0.00027031914806232335, + "loss": 7.4646, + "step": 14691 + }, + { + "epoch": 1.3709060371372586, + "grad_norm": 1.2001994899610302, + "learning_rate": 0.0002703146311758183, + "loss": 7.1873, + "step": 14692 + }, + { + "epoch": 1.3709993468321358, + "grad_norm": 0.6610764449122942, + "learning_rate": 0.00027031011398338865, + "loss": 7.5576, + "step": 14693 + }, + { + "epoch": 1.3710926565270132, + "grad_norm": 2.058670199018461, + "learning_rate": 0.0002703055964850461, + "loss": 7.1909, + "step": 14694 + }, + { + "epoch": 1.3711859662218904, + "grad_norm": 0.7049176323137397, + "learning_rate": 0.0002703010786808021, + "loss": 7.4403, + "step": 14695 + }, + { + "epoch": 1.3712792759167678, + "grad_norm": 1.4942842754265564, + "learning_rate": 0.00027029656057066796, + "loss": 7.4848, + "step": 14696 + }, + { + "epoch": 1.371372585611645, + "grad_norm": 2.2273188245374995, + "learning_rate": 0.0002702920421546554, + "loss": 7.0683, + "step": 14697 + }, + { + "epoch": 1.3714658953065224, + "grad_norm": 0.7782300793747605, + "learning_rate": 0.00027028752343277577, + "loss": 7.3472, + "step": 14698 + }, + { + "epoch": 1.3715592050013996, + "grad_norm": 1.5025033850142413, + "learning_rate": 0.00027028300440504064, + "loss": 7.436, + "step": 14699 + }, + { + "epoch": 1.371652514696277, + "grad_norm": 965593.7642733589, + "learning_rate": 0.00027027848507146145, + "loss": 7.2724, + "step": 14700 + }, + { + "epoch": 1.3717458243911542, + "grad_norm": 2375172.829246295, + "learning_rate": 0.0002702739654320497, + "loss": 7.4422, + "step": 14701 + }, + { + "epoch": 1.3718391340860316, + "grad_norm": 1.5552061270301543, + "learning_rate": 0.0002702694454868168, + "loss": 7.1953, + "step": 14702 + }, + { + "epoch": 1.3719324437809088, + "grad_norm": 3045153.2140669785, + "learning_rate": 0.0002702649252357745, + "loss": 7.189, + "step": 14703 + }, + { + "epoch": 1.372025753475786, + "grad_norm": 1.82901999087045, + "learning_rate": 0.00027026040467893395, + "loss": 7.3271, + "step": 14704 + }, + { + "epoch": 1.3721190631706635, + "grad_norm": 531607.9751012461, + "learning_rate": 0.0002702558838163069, + "loss": 7.0671, + "step": 14705 + }, + { + "epoch": 1.3722123728655407, + "grad_norm": 0.7159981789699528, + "learning_rate": 0.00027025136264790476, + "loss": 7.2939, + "step": 14706 + }, + { + "epoch": 1.372305682560418, + "grad_norm": 0.7134823643242815, + "learning_rate": 0.000270246841173739, + "loss": 7.52, + "step": 14707 + }, + { + "epoch": 1.3723989922552953, + "grad_norm": 1182625.010429252, + "learning_rate": 0.00027024231939382114, + "loss": 7.1225, + "step": 14708 + }, + { + "epoch": 1.3724923019501727, + "grad_norm": 0.9737618821863386, + "learning_rate": 0.0002702377973081627, + "loss": 7.1499, + "step": 14709 + }, + { + "epoch": 1.3725856116450499, + "grad_norm": 0.9776060372250736, + "learning_rate": 0.00027023327491677515, + "loss": 7.3127, + "step": 14710 + }, + { + "epoch": 1.372678921339927, + "grad_norm": 681310.9248809823, + "learning_rate": 0.00027022875221967, + "loss": 7.0999, + "step": 14711 + }, + { + "epoch": 1.3727722310348045, + "grad_norm": 1.4454942704567033, + "learning_rate": 0.00027022422921685877, + "loss": 6.9165, + "step": 14712 + }, + { + "epoch": 1.372865540729682, + "grad_norm": 1.7064031987842043, + "learning_rate": 0.0002702197059083529, + "loss": 7.4135, + "step": 14713 + }, + { + "epoch": 1.3729588504245591, + "grad_norm": 1.0809794690705332, + "learning_rate": 0.00027021518229416395, + "loss": 7.2795, + "step": 14714 + }, + { + "epoch": 1.3730521601194363, + "grad_norm": 0.5713964889898118, + "learning_rate": 0.0002702106583743034, + "loss": 7.1241, + "step": 14715 + }, + { + "epoch": 1.3731454698143137, + "grad_norm": 1060951.6639340941, + "learning_rate": 0.00027020613414878273, + "loss": 7.2663, + "step": 14716 + }, + { + "epoch": 1.373238779509191, + "grad_norm": 0.9007183436849822, + "learning_rate": 0.00027020160961761347, + "loss": 7.4594, + "step": 14717 + }, + { + "epoch": 1.3733320892040684, + "grad_norm": 0.6346915536051433, + "learning_rate": 0.0002701970847808072, + "loss": 7.4669, + "step": 14718 + }, + { + "epoch": 1.3734253988989455, + "grad_norm": 2.0859157391266296, + "learning_rate": 0.00027019255963837524, + "loss": 7.3479, + "step": 14719 + }, + { + "epoch": 1.373518708593823, + "grad_norm": 1.4198957140959843, + "learning_rate": 0.00027018803419032926, + "loss": 7.1649, + "step": 14720 + }, + { + "epoch": 1.3736120182887002, + "grad_norm": 2.8039752619529756, + "learning_rate": 0.0002701835084366807, + "loss": 7.5238, + "step": 14721 + }, + { + "epoch": 1.3737053279835774, + "grad_norm": 0.8281680378685182, + "learning_rate": 0.0002701789823774411, + "loss": 7.2095, + "step": 14722 + }, + { + "epoch": 1.3737986376784548, + "grad_norm": 1.927315564623898, + "learning_rate": 0.00027017445601262196, + "loss": 7.2556, + "step": 14723 + }, + { + "epoch": 1.3738919473733322, + "grad_norm": 3960339.039283106, + "learning_rate": 0.00027016992934223474, + "loss": 7.4788, + "step": 14724 + }, + { + "epoch": 1.3739852570682094, + "grad_norm": 2.642461761531895, + "learning_rate": 0.000270165402366291, + "loss": 7.4019, + "step": 14725 + }, + { + "epoch": 1.3740785667630866, + "grad_norm": 1.1842074319727929, + "learning_rate": 0.00027016087508480223, + "loss": 7.184, + "step": 14726 + }, + { + "epoch": 1.374171876457964, + "grad_norm": 1.1552681672452263, + "learning_rate": 0.0002701563474977799, + "loss": 6.9783, + "step": 14727 + }, + { + "epoch": 1.3742651861528412, + "grad_norm": 1.4268613418205367, + "learning_rate": 0.00027015181960523567, + "loss": 7.4292, + "step": 14728 + }, + { + "epoch": 1.3743584958477186, + "grad_norm": 3.220087070449642, + "learning_rate": 0.0002701472914071809, + "loss": 7.2202, + "step": 14729 + }, + { + "epoch": 1.3744518055425958, + "grad_norm": 0.5727636127245422, + "learning_rate": 0.00027014276290362717, + "loss": 7.4576, + "step": 14730 + }, + { + "epoch": 1.3745451152374732, + "grad_norm": 1.9864902230864665, + "learning_rate": 0.00027013823409458595, + "loss": 7.74, + "step": 14731 + }, + { + "epoch": 1.3746384249323504, + "grad_norm": 0.9957747552731979, + "learning_rate": 0.0002701337049800689, + "loss": 7.3517, + "step": 14732 + }, + { + "epoch": 1.3747317346272276, + "grad_norm": 1.4746989062138811, + "learning_rate": 0.0002701291755600873, + "loss": 7.3565, + "step": 14733 + }, + { + "epoch": 1.374825044322105, + "grad_norm": 2.8863064972398664, + "learning_rate": 0.00027012464583465285, + "loss": 7.1298, + "step": 14734 + }, + { + "epoch": 1.3749183540169825, + "grad_norm": 1.3299103929899652, + "learning_rate": 0.000270120115803777, + "loss": 7.2508, + "step": 14735 + }, + { + "epoch": 1.3750116637118597, + "grad_norm": 7.632760600623467, + "learning_rate": 0.00027011558546747124, + "loss": 7.2357, + "step": 14736 + }, + { + "epoch": 1.3751049734067369, + "grad_norm": 0.8511535621928926, + "learning_rate": 0.00027011105482574715, + "loss": 7.1782, + "step": 14737 + }, + { + "epoch": 1.3751982831016143, + "grad_norm": 0.7144238119513513, + "learning_rate": 0.00027010652387861626, + "loss": 7.3179, + "step": 14738 + }, + { + "epoch": 1.3752915927964915, + "grad_norm": 1.4328991605435115, + "learning_rate": 0.00027010199262609, + "loss": 7.3054, + "step": 14739 + }, + { + "epoch": 1.375384902491369, + "grad_norm": 2.3843561416211854, + "learning_rate": 0.00027009746106818, + "loss": 7.6982, + "step": 14740 + }, + { + "epoch": 1.375478212186246, + "grad_norm": 895550.0225030849, + "learning_rate": 0.0002700929292048977, + "loss": 7.1895, + "step": 14741 + }, + { + "epoch": 1.3755715218811235, + "grad_norm": 2.3857648137776635, + "learning_rate": 0.0002700883970362547, + "loss": 7.4645, + "step": 14742 + }, + { + "epoch": 1.3756648315760007, + "grad_norm": 331967.9737085852, + "learning_rate": 0.00027008386456226245, + "loss": 7.0466, + "step": 14743 + }, + { + "epoch": 1.375758141270878, + "grad_norm": 0.708448165454539, + "learning_rate": 0.0002700793317829325, + "loss": 7.5073, + "step": 14744 + }, + { + "epoch": 1.3758514509657553, + "grad_norm": 0.8314028278289796, + "learning_rate": 0.0002700747986982764, + "loss": 7.379, + "step": 14745 + }, + { + "epoch": 1.3759447606606328, + "grad_norm": 1.4741991815086313, + "learning_rate": 0.0002700702653083056, + "loss": 7.1177, + "step": 14746 + }, + { + "epoch": 1.37603807035551, + "grad_norm": 1.0808812124242995, + "learning_rate": 0.0002700657316130317, + "loss": 7.3126, + "step": 14747 + }, + { + "epoch": 1.3761313800503872, + "grad_norm": 1517039.3596138866, + "learning_rate": 0.00027006119761246626, + "loss": 7.0531, + "step": 14748 + }, + { + "epoch": 1.3762246897452646, + "grad_norm": 0.578038053269596, + "learning_rate": 0.00027005666330662075, + "loss": 7.372, + "step": 14749 + }, + { + "epoch": 1.3763179994401418, + "grad_norm": 0.43743667057824626, + "learning_rate": 0.0002700521286955067, + "loss": 7.2535, + "step": 14750 + }, + { + "epoch": 1.3764113091350192, + "grad_norm": 1404997.9495770298, + "learning_rate": 0.0002700475937791356, + "loss": 7.3553, + "step": 14751 + }, + { + "epoch": 1.3765046188298964, + "grad_norm": 1.5422055702778716, + "learning_rate": 0.00027004305855751905, + "loss": 7.5777, + "step": 14752 + }, + { + "epoch": 1.3765979285247738, + "grad_norm": 0.8927707885156303, + "learning_rate": 0.0002700385230306686, + "loss": 7.2428, + "step": 14753 + }, + { + "epoch": 1.376691238219651, + "grad_norm": 0.951397291885697, + "learning_rate": 0.00027003398719859575, + "loss": 7.1426, + "step": 14754 + }, + { + "epoch": 1.3767845479145282, + "grad_norm": 1320185.4770153626, + "learning_rate": 0.00027002945106131197, + "loss": 7.4958, + "step": 14755 + }, + { + "epoch": 1.3768778576094056, + "grad_norm": 0.6776049450718393, + "learning_rate": 0.0002700249146188289, + "loss": 7.1035, + "step": 14756 + }, + { + "epoch": 1.376971167304283, + "grad_norm": 0.5853777733416423, + "learning_rate": 0.000270020377871158, + "loss": 7.212, + "step": 14757 + }, + { + "epoch": 1.3770644769991602, + "grad_norm": 608305.0453253873, + "learning_rate": 0.00027001584081831083, + "loss": 7.2181, + "step": 14758 + }, + { + "epoch": 1.3771577866940374, + "grad_norm": 1.696199724983566, + "learning_rate": 0.00027001130346029895, + "loss": 7.5489, + "step": 14759 + }, + { + "epoch": 1.3772510963889149, + "grad_norm": 5.545598885758082, + "learning_rate": 0.00027000676579713387, + "loss": 7.1952, + "step": 14760 + }, + { + "epoch": 1.377344406083792, + "grad_norm": 13.069788336775447, + "learning_rate": 0.0002700022278288271, + "loss": 7.5409, + "step": 14761 + }, + { + "epoch": 1.3774377157786695, + "grad_norm": 0.8028452721183539, + "learning_rate": 0.00026999768955539023, + "loss": 7.0876, + "step": 14762 + }, + { + "epoch": 1.3775310254735467, + "grad_norm": 1.0493328431004514, + "learning_rate": 0.0002699931509768348, + "loss": 7.282, + "step": 14763 + }, + { + "epoch": 1.377624335168424, + "grad_norm": 0.5290734179894545, + "learning_rate": 0.00026998861209317235, + "loss": 7.2622, + "step": 14764 + }, + { + "epoch": 1.3777176448633013, + "grad_norm": 0.9707595380393873, + "learning_rate": 0.00026998407290441435, + "loss": 7.2818, + "step": 14765 + }, + { + "epoch": 1.3778109545581785, + "grad_norm": 1.7289606625142864, + "learning_rate": 0.00026997953341057244, + "loss": 7.5928, + "step": 14766 + }, + { + "epoch": 1.377904264253056, + "grad_norm": 1.135570214747258, + "learning_rate": 0.0002699749936116581, + "loss": 7.6048, + "step": 14767 + }, + { + "epoch": 1.3779975739479333, + "grad_norm": 1.4880106192579259, + "learning_rate": 0.00026997045350768287, + "loss": 7.1335, + "step": 14768 + }, + { + "epoch": 1.3780908836428105, + "grad_norm": 1.6012020483527523, + "learning_rate": 0.0002699659130986583, + "loss": 7.0935, + "step": 14769 + }, + { + "epoch": 1.3781841933376877, + "grad_norm": 1.0659836216122438, + "learning_rate": 0.000269961372384596, + "loss": 7.0954, + "step": 14770 + }, + { + "epoch": 1.3782775030325651, + "grad_norm": 511577.2787149725, + "learning_rate": 0.0002699568313655075, + "loss": 7.0852, + "step": 14771 + }, + { + "epoch": 1.3783708127274423, + "grad_norm": 0.7336516981553716, + "learning_rate": 0.0002699522900414042, + "loss": 7.1137, + "step": 14772 + }, + { + "epoch": 1.3784641224223197, + "grad_norm": 0.9047101917620428, + "learning_rate": 0.00026994774841229783, + "loss": 7.156, + "step": 14773 + }, + { + "epoch": 1.378557432117197, + "grad_norm": 448683.3913587402, + "learning_rate": 0.00026994320647819985, + "loss": 7.189, + "step": 14774 + }, + { + "epoch": 1.3786507418120744, + "grad_norm": 4.509777204039186, + "learning_rate": 0.0002699386642391218, + "loss": 7.0835, + "step": 14775 + }, + { + "epoch": 1.3787440515069516, + "grad_norm": 1371745.418906374, + "learning_rate": 0.00026993412169507534, + "loss": 6.9787, + "step": 14776 + }, + { + "epoch": 1.3788373612018288, + "grad_norm": 1.1445055818587715, + "learning_rate": 0.00026992957884607185, + "loss": 7.2202, + "step": 14777 + }, + { + "epoch": 1.3789306708967062, + "grad_norm": 903362.2088857449, + "learning_rate": 0.000269925035692123, + "loss": 7.3316, + "step": 14778 + }, + { + "epoch": 1.3790239805915836, + "grad_norm": 0.5263549034011574, + "learning_rate": 0.0002699204922332403, + "loss": 7.2795, + "step": 14779 + }, + { + "epoch": 1.3791172902864608, + "grad_norm": 469852.0007509432, + "learning_rate": 0.00026991594846943534, + "loss": 7.3656, + "step": 14780 + }, + { + "epoch": 1.379210599981338, + "grad_norm": 1.2891487424640273, + "learning_rate": 0.00026991140440071963, + "loss": 7.4201, + "step": 14781 + }, + { + "epoch": 1.3793039096762154, + "grad_norm": 0.5705409312937247, + "learning_rate": 0.0002699068600271047, + "loss": 7.5584, + "step": 14782 + }, + { + "epoch": 1.3793972193710926, + "grad_norm": 1.2002900652495156, + "learning_rate": 0.0002699023153486022, + "loss": 7.3328, + "step": 14783 + }, + { + "epoch": 1.37949052906597, + "grad_norm": 267143.8832889657, + "learning_rate": 0.0002698977703652236, + "loss": 7.003, + "step": 14784 + }, + { + "epoch": 1.3795838387608472, + "grad_norm": 1.1329411360515074, + "learning_rate": 0.0002698932250769805, + "loss": 7.2718, + "step": 14785 + }, + { + "epoch": 1.3796771484557246, + "grad_norm": 0.8054368744706428, + "learning_rate": 0.0002698886794838844, + "loss": 7.2763, + "step": 14786 + }, + { + "epoch": 1.3797704581506018, + "grad_norm": 1.8344178561840496, + "learning_rate": 0.00026988413358594696, + "loss": 7.7453, + "step": 14787 + }, + { + "epoch": 1.379863767845479, + "grad_norm": 3.187619748728234, + "learning_rate": 0.00026987958738317963, + "loss": 6.909, + "step": 14788 + }, + { + "epoch": 1.3799570775403565, + "grad_norm": 0.8996557620508533, + "learning_rate": 0.000269875040875594, + "loss": 7.1775, + "step": 14789 + }, + { + "epoch": 1.3800503872352339, + "grad_norm": 1.7246656221462986, + "learning_rate": 0.00026987049406320177, + "loss": 7.4327, + "step": 14790 + }, + { + "epoch": 1.380143696930111, + "grad_norm": 1.3495752930026812, + "learning_rate": 0.00026986594694601427, + "loss": 7.2984, + "step": 14791 + }, + { + "epoch": 1.3802370066249883, + "grad_norm": 0.6145054569067835, + "learning_rate": 0.0002698613995240432, + "loss": 7.4089, + "step": 14792 + }, + { + "epoch": 1.3803303163198657, + "grad_norm": 1.1852172273545294, + "learning_rate": 0.0002698568517973002, + "loss": 7.177, + "step": 14793 + }, + { + "epoch": 1.3804236260147429, + "grad_norm": 0.74619942773641, + "learning_rate": 0.00026985230376579654, + "loss": 7.2632, + "step": 14794 + }, + { + "epoch": 1.3805169357096203, + "grad_norm": 0.921724857921365, + "learning_rate": 0.00026984775542954415, + "loss": 7.1494, + "step": 14795 + }, + { + "epoch": 1.3806102454044975, + "grad_norm": 1.2115702401058186, + "learning_rate": 0.0002698432067885543, + "loss": 7.4371, + "step": 14796 + }, + { + "epoch": 1.380703555099375, + "grad_norm": 0.8950125646269187, + "learning_rate": 0.00026983865784283874, + "loss": 7.095, + "step": 14797 + }, + { + "epoch": 1.3807968647942521, + "grad_norm": 1.3884972635745068, + "learning_rate": 0.00026983410859240893, + "loss": 7.582, + "step": 14798 + }, + { + "epoch": 1.3808901744891293, + "grad_norm": 1.6949168721068808, + "learning_rate": 0.0002698295590372765, + "loss": 7.4223, + "step": 14799 + }, + { + "epoch": 1.3809834841840067, + "grad_norm": 0.5458233340250163, + "learning_rate": 0.000269825009177453, + "loss": 7.5162, + "step": 14800 + }, + { + "epoch": 1.381076793878884, + "grad_norm": 1.5734892957959556, + "learning_rate": 0.00026982045901295003, + "loss": 6.8513, + "step": 14801 + }, + { + "epoch": 1.3811701035737614, + "grad_norm": 1.2301577024893087, + "learning_rate": 0.0002698159085437791, + "loss": 7.4577, + "step": 14802 + }, + { + "epoch": 1.3812634132686386, + "grad_norm": 293861.74783117, + "learning_rate": 0.00026981135776995175, + "loss": 7.3362, + "step": 14803 + }, + { + "epoch": 1.381356722963516, + "grad_norm": 174719.07497541158, + "learning_rate": 0.00026980680669147966, + "loss": 6.9863, + "step": 14804 + }, + { + "epoch": 1.3814500326583932, + "grad_norm": 0.5209867018100324, + "learning_rate": 0.00026980225530837435, + "loss": 6.9664, + "step": 14805 + }, + { + "epoch": 1.3815433423532706, + "grad_norm": 1.2533147007487608, + "learning_rate": 0.0002697977036206474, + "loss": 7.2779, + "step": 14806 + }, + { + "epoch": 1.3816366520481478, + "grad_norm": 4.229410592235248, + "learning_rate": 0.00026979315162831033, + "loss": 7.2181, + "step": 14807 + }, + { + "epoch": 1.3817299617430252, + "grad_norm": 1.3059567047997294, + "learning_rate": 0.0002697885993313748, + "loss": 7.6457, + "step": 14808 + }, + { + "epoch": 1.3818232714379024, + "grad_norm": 1.300822063712285, + "learning_rate": 0.0002697840467298524, + "loss": 7.2472, + "step": 14809 + }, + { + "epoch": 1.3819165811327796, + "grad_norm": 1.0230516522788906, + "learning_rate": 0.0002697794938237545, + "loss": 7.0547, + "step": 14810 + }, + { + "epoch": 1.382009890827657, + "grad_norm": 1.516834557824769, + "learning_rate": 0.00026977494061309297, + "loss": 6.965, + "step": 14811 + }, + { + "epoch": 1.3821032005225342, + "grad_norm": 1.755072341242734, + "learning_rate": 0.0002697703870978792, + "loss": 7.1102, + "step": 14812 + }, + { + "epoch": 1.3821965102174116, + "grad_norm": 1.2361153599412313, + "learning_rate": 0.00026976583327812477, + "loss": 7.6254, + "step": 14813 + }, + { + "epoch": 1.3822898199122888, + "grad_norm": 0.8470397084519855, + "learning_rate": 0.00026976127915384135, + "loss": 7.4052, + "step": 14814 + }, + { + "epoch": 1.3823831296071662, + "grad_norm": 1.8632313857223113, + "learning_rate": 0.00026975672472504046, + "loss": 7.2773, + "step": 14815 + }, + { + "epoch": 1.3824764393020434, + "grad_norm": 5.21996375750262, + "learning_rate": 0.00026975216999173374, + "loss": 7.0745, + "step": 14816 + }, + { + "epoch": 1.3825697489969206, + "grad_norm": 0.5745668076254421, + "learning_rate": 0.00026974761495393264, + "loss": 7.2207, + "step": 14817 + }, + { + "epoch": 1.382663058691798, + "grad_norm": 1.1680433207378789, + "learning_rate": 0.00026974305961164886, + "loss": 7.5391, + "step": 14818 + }, + { + "epoch": 1.3827563683866755, + "grad_norm": 0.5552270098549038, + "learning_rate": 0.000269738503964894, + "loss": 7.4122, + "step": 14819 + }, + { + "epoch": 1.3828496780815527, + "grad_norm": 683849.3084264975, + "learning_rate": 0.0002697339480136796, + "loss": 7.5408, + "step": 14820 + }, + { + "epoch": 1.3829429877764299, + "grad_norm": 1.2184115302428613, + "learning_rate": 0.00026972939175801716, + "loss": 7.5731, + "step": 14821 + }, + { + "epoch": 1.3830362974713073, + "grad_norm": 0.8343783758360459, + "learning_rate": 0.00026972483519791836, + "loss": 6.9675, + "step": 14822 + }, + { + "epoch": 1.3831296071661845, + "grad_norm": 1.2573339949596676, + "learning_rate": 0.0002697202783333948, + "loss": 7.2726, + "step": 14823 + }, + { + "epoch": 1.383222916861062, + "grad_norm": 0.899648755783172, + "learning_rate": 0.00026971572116445803, + "loss": 7.4654, + "step": 14824 + }, + { + "epoch": 1.383316226555939, + "grad_norm": 0.8268769860190669, + "learning_rate": 0.00026971116369111964, + "loss": 7.0358, + "step": 14825 + }, + { + "epoch": 1.3834095362508165, + "grad_norm": 1.003568035670626, + "learning_rate": 0.0002697066059133912, + "loss": 6.993, + "step": 14826 + }, + { + "epoch": 1.3835028459456937, + "grad_norm": 1.3698724051007545, + "learning_rate": 0.0002697020478312843, + "loss": 7.125, + "step": 14827 + }, + { + "epoch": 1.383596155640571, + "grad_norm": 1.1768875518334911, + "learning_rate": 0.00026969748944481064, + "loss": 7.5786, + "step": 14828 + }, + { + "epoch": 1.3836894653354483, + "grad_norm": 1.2410172717578833, + "learning_rate": 0.0002696929307539816, + "loss": 7.2206, + "step": 14829 + }, + { + "epoch": 1.3837827750303258, + "grad_norm": 0.9310410937848075, + "learning_rate": 0.00026968837175880904, + "loss": 7.4401, + "step": 14830 + }, + { + "epoch": 1.383876084725203, + "grad_norm": 0.9598546110567513, + "learning_rate": 0.0002696838124593043, + "loss": 7.2149, + "step": 14831 + }, + { + "epoch": 1.3839693944200802, + "grad_norm": 0.8900804534403575, + "learning_rate": 0.0002696792528554791, + "loss": 7.6019, + "step": 14832 + }, + { + "epoch": 1.3840627041149576, + "grad_norm": 0.5503295198775383, + "learning_rate": 0.00026967469294734503, + "loss": 7.221, + "step": 14833 + }, + { + "epoch": 1.3841560138098348, + "grad_norm": 1.089002852302207, + "learning_rate": 0.0002696701327349136, + "loss": 7.308, + "step": 14834 + }, + { + "epoch": 1.3842493235047122, + "grad_norm": 0.875839914194472, + "learning_rate": 0.0002696655722181966, + "loss": 7.445, + "step": 14835 + }, + { + "epoch": 1.3843426331995894, + "grad_norm": 1.133949895225627, + "learning_rate": 0.0002696610113972054, + "loss": 7.3025, + "step": 14836 + }, + { + "epoch": 1.3844359428944668, + "grad_norm": 996949.5482606681, + "learning_rate": 0.00026965645027195173, + "loss": 7.1355, + "step": 14837 + }, + { + "epoch": 1.384529252589344, + "grad_norm": 0.5881480768887803, + "learning_rate": 0.00026965188884244716, + "loss": 7.1302, + "step": 14838 + }, + { + "epoch": 1.3846225622842212, + "grad_norm": 1.5507959495617336, + "learning_rate": 0.0002696473271087032, + "loss": 7.4263, + "step": 14839 + }, + { + "epoch": 1.3847158719790986, + "grad_norm": 0.7865402117396373, + "learning_rate": 0.00026964276507073165, + "loss": 7.2603, + "step": 14840 + }, + { + "epoch": 1.384809181673976, + "grad_norm": 45557.53460578604, + "learning_rate": 0.0002696382027285439, + "loss": 7.1251, + "step": 14841 + }, + { + "epoch": 1.3849024913688532, + "grad_norm": 1.2513812291185964, + "learning_rate": 0.0002696336400821517, + "loss": 7.2299, + "step": 14842 + }, + { + "epoch": 1.3849958010637304, + "grad_norm": 524778.7797781149, + "learning_rate": 0.0002696290771315665, + "loss": 7.4291, + "step": 14843 + }, + { + "epoch": 1.3850891107586079, + "grad_norm": 0.8176577780004454, + "learning_rate": 0.00026962451387680005, + "loss": 7.5634, + "step": 14844 + }, + { + "epoch": 1.385182420453485, + "grad_norm": 1.2399137099499504, + "learning_rate": 0.0002696199503178639, + "loss": 7.2034, + "step": 14845 + }, + { + "epoch": 1.3852757301483625, + "grad_norm": 1.3497011799064995, + "learning_rate": 0.0002696153864547697, + "loss": 7.1993, + "step": 14846 + }, + { + "epoch": 1.3853690398432397, + "grad_norm": 0.6675286754600271, + "learning_rate": 0.0002696108222875289, + "loss": 7.3495, + "step": 14847 + }, + { + "epoch": 1.385462349538117, + "grad_norm": 0.6530342770906333, + "learning_rate": 0.0002696062578161533, + "loss": 7.5454, + "step": 14848 + }, + { + "epoch": 1.3855556592329943, + "grad_norm": 0.7940838893274875, + "learning_rate": 0.0002696016930406544, + "loss": 7.3501, + "step": 14849 + }, + { + "epoch": 1.3856489689278715, + "grad_norm": 106308.35013022555, + "learning_rate": 0.0002695971279610438, + "loss": 7.4527, + "step": 14850 + }, + { + "epoch": 1.385742278622749, + "grad_norm": 1.0542347312356148, + "learning_rate": 0.0002695925625773331, + "loss": 7.1001, + "step": 14851 + }, + { + "epoch": 1.3858355883176263, + "grad_norm": 0.8438143059524829, + "learning_rate": 0.000269587996889534, + "loss": 6.9073, + "step": 14852 + }, + { + "epoch": 1.3859288980125035, + "grad_norm": 1.1770580117629212, + "learning_rate": 0.000269583430897658, + "loss": 7.3291, + "step": 14853 + }, + { + "epoch": 1.3860222077073807, + "grad_norm": 0.6608749349692261, + "learning_rate": 0.0002695788646017168, + "loss": 7.0552, + "step": 14854 + }, + { + "epoch": 1.3861155174022581, + "grad_norm": 239926.136620618, + "learning_rate": 0.00026957429800172195, + "loss": 7.2421, + "step": 14855 + }, + { + "epoch": 1.3862088270971353, + "grad_norm": 0.8546286325181485, + "learning_rate": 0.00026956973109768504, + "loss": 7.2361, + "step": 14856 + }, + { + "epoch": 1.3863021367920128, + "grad_norm": 0.8096643668492258, + "learning_rate": 0.0002695651638896178, + "loss": 7.5179, + "step": 14857 + }, + { + "epoch": 1.38639544648689, + "grad_norm": 1.004029577682257, + "learning_rate": 0.00026956059637753174, + "loss": 6.9421, + "step": 14858 + }, + { + "epoch": 1.3864887561817674, + "grad_norm": 1.9451186958876905, + "learning_rate": 0.00026955602856143843, + "loss": 7.2224, + "step": 14859 + }, + { + "epoch": 1.3865820658766446, + "grad_norm": 0.696108341267721, + "learning_rate": 0.00026955146044134963, + "loss": 7.2438, + "step": 14860 + }, + { + "epoch": 1.3866753755715218, + "grad_norm": 0.7297998914524209, + "learning_rate": 0.0002695468920172769, + "loss": 7.3186, + "step": 14861 + }, + { + "epoch": 1.3867686852663992, + "grad_norm": 0.5538017451526083, + "learning_rate": 0.0002695423232892318, + "loss": 7.0799, + "step": 14862 + }, + { + "epoch": 1.3868619949612766, + "grad_norm": 0.5724524223023869, + "learning_rate": 0.000269537754257226, + "loss": 7.1597, + "step": 14863 + }, + { + "epoch": 1.3869553046561538, + "grad_norm": 13615.290642074666, + "learning_rate": 0.00026953318492127103, + "loss": 7.4037, + "step": 14864 + }, + { + "epoch": 1.387048614351031, + "grad_norm": 1.3038017023473696, + "learning_rate": 0.0002695286152813787, + "loss": 7.2184, + "step": 14865 + }, + { + "epoch": 1.3871419240459084, + "grad_norm": 26389.672164864656, + "learning_rate": 0.0002695240453375604, + "loss": 6.9811, + "step": 14866 + }, + { + "epoch": 1.3872352337407856, + "grad_norm": 1.8498993229993403, + "learning_rate": 0.0002695194750898279, + "loss": 7.2034, + "step": 14867 + }, + { + "epoch": 1.387328543435663, + "grad_norm": 187401.70766240786, + "learning_rate": 0.00026951490453819277, + "loss": 7.3224, + "step": 14868 + }, + { + "epoch": 1.3874218531305402, + "grad_norm": 0.7264962864739732, + "learning_rate": 0.00026951033368266666, + "loss": 7.2373, + "step": 14869 + }, + { + "epoch": 1.3875151628254176, + "grad_norm": 0.6538032785714349, + "learning_rate": 0.0002695057625232612, + "loss": 7.1677, + "step": 14870 + }, + { + "epoch": 1.3876084725202948, + "grad_norm": 1.7165394882692833, + "learning_rate": 0.00026950119105998793, + "loss": 7.4104, + "step": 14871 + }, + { + "epoch": 1.387701782215172, + "grad_norm": 0.7322316445104958, + "learning_rate": 0.0002694966192928585, + "loss": 7.2966, + "step": 14872 + }, + { + "epoch": 1.3877950919100495, + "grad_norm": 413610.5293501136, + "learning_rate": 0.00026949204722188463, + "loss": 7.1974, + "step": 14873 + }, + { + "epoch": 1.3878884016049269, + "grad_norm": 1431823.795931878, + "learning_rate": 0.0002694874748470779, + "loss": 7.4309, + "step": 14874 + }, + { + "epoch": 1.387981711299804, + "grad_norm": 0.7374114814550794, + "learning_rate": 0.00026948290216844984, + "loss": 7.2476, + "step": 14875 + }, + { + "epoch": 1.3880750209946813, + "grad_norm": 1729957.5981201767, + "learning_rate": 0.0002694783291860122, + "loss": 7.3069, + "step": 14876 + }, + { + "epoch": 1.3881683306895587, + "grad_norm": 1.1872518945308945, + "learning_rate": 0.0002694737558997766, + "loss": 7.5742, + "step": 14877 + }, + { + "epoch": 1.388261640384436, + "grad_norm": 5.405895268370206, + "learning_rate": 0.0002694691823097546, + "loss": 6.992, + "step": 14878 + }, + { + "epoch": 1.3883549500793133, + "grad_norm": 0.6451589559360068, + "learning_rate": 0.00026946460841595777, + "loss": 7.1743, + "step": 14879 + }, + { + "epoch": 1.3884482597741905, + "grad_norm": 1.0429772479843686, + "learning_rate": 0.00026946003421839793, + "loss": 7.4526, + "step": 14880 + }, + { + "epoch": 1.388541569469068, + "grad_norm": 0.7334120749112252, + "learning_rate": 0.0002694554597170866, + "loss": 7.4059, + "step": 14881 + }, + { + "epoch": 1.3886348791639451, + "grad_norm": 0.8156966798971056, + "learning_rate": 0.0002694508849120354, + "loss": 7.4144, + "step": 14882 + }, + { + "epoch": 1.3887281888588223, + "grad_norm": 3.234898283450268, + "learning_rate": 0.000269446309803256, + "loss": 7.1405, + "step": 14883 + }, + { + "epoch": 1.3888214985536997, + "grad_norm": 6619718.066673744, + "learning_rate": 0.00026944173439075994, + "loss": 7.1766, + "step": 14884 + }, + { + "epoch": 1.3889148082485772, + "grad_norm": 1.1003507756055866, + "learning_rate": 0.00026943715867455904, + "loss": 7.2537, + "step": 14885 + }, + { + "epoch": 1.3890081179434544, + "grad_norm": 9980892.781891994, + "learning_rate": 0.00026943258265466476, + "loss": 7.5348, + "step": 14886 + }, + { + "epoch": 1.3891014276383316, + "grad_norm": 191525751.3090215, + "learning_rate": 0.0002694280063310888, + "loss": 7.2586, + "step": 14887 + }, + { + "epoch": 1.389194737333209, + "grad_norm": 0.6726391087534896, + "learning_rate": 0.00026942342970384285, + "loss": 7.2689, + "step": 14888 + }, + { + "epoch": 1.3892880470280862, + "grad_norm": 1.7578593682821888, + "learning_rate": 0.0002694188527729385, + "loss": 7.301, + "step": 14889 + }, + { + "epoch": 1.3893813567229636, + "grad_norm": 2.2863229738869806, + "learning_rate": 0.0002694142755383873, + "loss": 7.4823, + "step": 14890 + }, + { + "epoch": 1.3894746664178408, + "grad_norm": 1.3376737756622232, + "learning_rate": 0.00026940969800020095, + "loss": 7.4013, + "step": 14891 + }, + { + "epoch": 1.3895679761127182, + "grad_norm": 1.4688076587544037, + "learning_rate": 0.0002694051201583912, + "loss": 7.1901, + "step": 14892 + }, + { + "epoch": 1.3896612858075954, + "grad_norm": 2.145118633941029, + "learning_rate": 0.0002694005420129696, + "loss": 7.5756, + "step": 14893 + }, + { + "epoch": 1.3897545955024726, + "grad_norm": 1.0599335943597548, + "learning_rate": 0.0002693959635639477, + "loss": 7.2267, + "step": 14894 + }, + { + "epoch": 1.38984790519735, + "grad_norm": 150982051663450.72, + "learning_rate": 0.0002693913848113374, + "loss": 7.4329, + "step": 14895 + }, + { + "epoch": 1.3899412148922274, + "grad_norm": 1585779113307.8328, + "learning_rate": 0.00026938680575515003, + "loss": 7.3142, + "step": 14896 + }, + { + "epoch": 1.3900345245871046, + "grad_norm": 9.845702488645, + "learning_rate": 0.0002693822263953974, + "loss": 7.4767, + "step": 14897 + }, + { + "epoch": 1.3901278342819818, + "grad_norm": 16.648273775413713, + "learning_rate": 0.00026937764673209117, + "loss": 7.23, + "step": 14898 + }, + { + "epoch": 1.3902211439768593, + "grad_norm": 2.0898604359181547, + "learning_rate": 0.0002693730667652429, + "loss": 7.3341, + "step": 14899 + }, + { + "epoch": 1.3903144536717364, + "grad_norm": 6.216414188693971, + "learning_rate": 0.00026936848649486433, + "loss": 7.865, + "step": 14900 + }, + { + "epoch": 1.3904077633666139, + "grad_norm": 25.68268131355326, + "learning_rate": 0.00026936390592096704, + "loss": 7.5024, + "step": 14901 + }, + { + "epoch": 1.390501073061491, + "grad_norm": 3.4843852168417535, + "learning_rate": 0.0002693593250435627, + "loss": 7.5664, + "step": 14902 + }, + { + "epoch": 1.3905943827563685, + "grad_norm": 2.8502225293139585, + "learning_rate": 0.0002693547438626629, + "loss": 7.7595, + "step": 14903 + }, + { + "epoch": 1.3906876924512457, + "grad_norm": 99.43113607389749, + "learning_rate": 0.00026935016237827946, + "loss": 7.5522, + "step": 14904 + }, + { + "epoch": 1.3907810021461229, + "grad_norm": 3950349.541019334, + "learning_rate": 0.0002693455805904238, + "loss": 7.952, + "step": 14905 + }, + { + "epoch": 1.3908743118410003, + "grad_norm": 41.372259740144145, + "learning_rate": 0.00026934099849910774, + "loss": 7.6053, + "step": 14906 + }, + { + "epoch": 1.3909676215358775, + "grad_norm": 2.762421261488444, + "learning_rate": 0.00026933641610434283, + "loss": 7.6338, + "step": 14907 + }, + { + "epoch": 1.391060931230755, + "grad_norm": 2.2338000732611447, + "learning_rate": 0.0002693318334061408, + "loss": 7.4358, + "step": 14908 + }, + { + "epoch": 1.3911542409256321, + "grad_norm": 5.8043940899269595, + "learning_rate": 0.00026932725040451324, + "loss": 7.7421, + "step": 14909 + }, + { + "epoch": 1.3912475506205095, + "grad_norm": 1.647912567473909, + "learning_rate": 0.0002693226670994718, + "loss": 7.7404, + "step": 14910 + }, + { + "epoch": 1.3913408603153867, + "grad_norm": 3.110895066854765, + "learning_rate": 0.0002693180834910282, + "loss": 7.6311, + "step": 14911 + }, + { + "epoch": 1.3914341700102641, + "grad_norm": 1.801674990622472, + "learning_rate": 0.0002693134995791941, + "loss": 8.0369, + "step": 14912 + }, + { + "epoch": 1.3915274797051413, + "grad_norm": 1.3922974577402962, + "learning_rate": 0.00026930891536398107, + "loss": 7.5766, + "step": 14913 + }, + { + "epoch": 1.3916207894000188, + "grad_norm": 2.089480379683506, + "learning_rate": 0.0002693043308454008, + "loss": 7.5782, + "step": 14914 + }, + { + "epoch": 1.391714099094896, + "grad_norm": 1.3539989660592864, + "learning_rate": 0.00026929974602346503, + "loss": 7.6527, + "step": 14915 + }, + { + "epoch": 1.3918074087897732, + "grad_norm": 1.9621047210355183, + "learning_rate": 0.00026929516089818525, + "loss": 7.591, + "step": 14916 + }, + { + "epoch": 1.3919007184846506, + "grad_norm": 1.5227494216508501, + "learning_rate": 0.0002692905754695733, + "loss": 7.5475, + "step": 14917 + }, + { + "epoch": 1.3919940281795278, + "grad_norm": 1.6416749743622046, + "learning_rate": 0.00026928598973764066, + "loss": 7.6177, + "step": 14918 + }, + { + "epoch": 1.3920873378744052, + "grad_norm": 1.4599848255441619, + "learning_rate": 0.00026928140370239916, + "loss": 7.542, + "step": 14919 + }, + { + "epoch": 1.3921806475692824, + "grad_norm": 1.778205508365361, + "learning_rate": 0.00026927681736386034, + "loss": 8.1197, + "step": 14920 + }, + { + "epoch": 1.3922739572641598, + "grad_norm": 2.057679730970667, + "learning_rate": 0.0002692722307220359, + "loss": 7.5751, + "step": 14921 + }, + { + "epoch": 1.392367266959037, + "grad_norm": 1.3305411743469697, + "learning_rate": 0.0002692676437769376, + "loss": 7.4892, + "step": 14922 + }, + { + "epoch": 1.3924605766539142, + "grad_norm": 2.3681816659326347, + "learning_rate": 0.0002692630565285769, + "loss": 7.3289, + "step": 14923 + }, + { + "epoch": 1.3925538863487916, + "grad_norm": 1.5135784138219626, + "learning_rate": 0.00026925846897696564, + "loss": 7.6909, + "step": 14924 + }, + { + "epoch": 1.392647196043669, + "grad_norm": 1.6184471225918682, + "learning_rate": 0.0002692538811221154, + "loss": 7.5127, + "step": 14925 + }, + { + "epoch": 1.3927405057385462, + "grad_norm": 2.4177743754723657, + "learning_rate": 0.00026924929296403785, + "loss": 7.2522, + "step": 14926 + }, + { + "epoch": 1.3928338154334234, + "grad_norm": 1.1341823798441772, + "learning_rate": 0.0002692447045027447, + "loss": 7.3242, + "step": 14927 + }, + { + "epoch": 1.3929271251283009, + "grad_norm": 1.3715572195975274, + "learning_rate": 0.00026924011573824756, + "loss": 7.5923, + "step": 14928 + }, + { + "epoch": 1.393020434823178, + "grad_norm": 4.274212159420057, + "learning_rate": 0.0002692355266705581, + "loss": 7.8082, + "step": 14929 + }, + { + "epoch": 1.3931137445180555, + "grad_norm": 2.131786335584775, + "learning_rate": 0.0002692309372996881, + "loss": 7.6828, + "step": 14930 + }, + { + "epoch": 1.3932070542129327, + "grad_norm": 1.35956267925933, + "learning_rate": 0.0002692263476256491, + "loss": 7.5088, + "step": 14931 + }, + { + "epoch": 1.39330036390781, + "grad_norm": 1.375933387688229, + "learning_rate": 0.0002692217576484528, + "loss": 7.7655, + "step": 14932 + }, + { + "epoch": 1.3933936736026873, + "grad_norm": 1.6883715582830998, + "learning_rate": 0.00026921716736811094, + "loss": 7.6192, + "step": 14933 + }, + { + "epoch": 1.3934869832975645, + "grad_norm": 2.997937139215863, + "learning_rate": 0.0002692125767846351, + "loss": 7.4293, + "step": 14934 + }, + { + "epoch": 1.393580292992442, + "grad_norm": 2.2728651330972087, + "learning_rate": 0.00026920798589803697, + "loss": 7.7734, + "step": 14935 + }, + { + "epoch": 1.3936736026873193, + "grad_norm": 2.367068705748412, + "learning_rate": 0.00026920339470832823, + "loss": 7.6101, + "step": 14936 + }, + { + "epoch": 1.3937669123821965, + "grad_norm": 1.38986411408972, + "learning_rate": 0.00026919880321552063, + "loss": 7.6833, + "step": 14937 + }, + { + "epoch": 1.3938602220770737, + "grad_norm": 2.5388183701855227, + "learning_rate": 0.00026919421141962573, + "loss": 7.8317, + "step": 14938 + }, + { + "epoch": 1.3939535317719511, + "grad_norm": 2.4710380241683425, + "learning_rate": 0.00026918961932065533, + "loss": 7.8014, + "step": 14939 + }, + { + "epoch": 1.3940468414668283, + "grad_norm": 1.790418587782552, + "learning_rate": 0.00026918502691862094, + "loss": 7.631, + "step": 14940 + }, + { + "epoch": 1.3941401511617058, + "grad_norm": 1.1019281277538924, + "learning_rate": 0.0002691804342135344, + "loss": 7.5114, + "step": 14941 + }, + { + "epoch": 1.394233460856583, + "grad_norm": 1.476659172517915, + "learning_rate": 0.00026917584120540733, + "loss": 7.4968, + "step": 14942 + }, + { + "epoch": 1.3943267705514604, + "grad_norm": 1.4299413304026958, + "learning_rate": 0.00026917124789425137, + "loss": 7.5603, + "step": 14943 + }, + { + "epoch": 1.3944200802463376, + "grad_norm": 1.5046289724093758, + "learning_rate": 0.0002691666542800782, + "loss": 7.7461, + "step": 14944 + }, + { + "epoch": 1.3945133899412148, + "grad_norm": 1.2530376687396916, + "learning_rate": 0.00026916206036289956, + "loss": 7.7424, + "step": 14945 + }, + { + "epoch": 1.3946066996360922, + "grad_norm": 1.768353882283856, + "learning_rate": 0.00026915746614272713, + "loss": 7.6153, + "step": 14946 + }, + { + "epoch": 1.3947000093309696, + "grad_norm": 1.2577678225854925, + "learning_rate": 0.0002691528716195725, + "loss": 7.8169, + "step": 14947 + }, + { + "epoch": 1.3947933190258468, + "grad_norm": 1.1406099889062649, + "learning_rate": 0.0002691482767934474, + "loss": 8.0171, + "step": 14948 + }, + { + "epoch": 1.394886628720724, + "grad_norm": 1.372268994641854, + "learning_rate": 0.0002691436816643636, + "loss": 7.4504, + "step": 14949 + }, + { + "epoch": 1.3949799384156014, + "grad_norm": 2.201562095437211, + "learning_rate": 0.00026913908623233263, + "loss": 7.6315, + "step": 14950 + }, + { + "epoch": 1.3950732481104786, + "grad_norm": 1.2041447114740542, + "learning_rate": 0.00026913449049736633, + "loss": 7.8198, + "step": 14951 + }, + { + "epoch": 1.395166557805356, + "grad_norm": 1.0932048289045047, + "learning_rate": 0.00026912989445947627, + "loss": 7.2952, + "step": 14952 + }, + { + "epoch": 1.3952598675002332, + "grad_norm": 2.384619162534799, + "learning_rate": 0.0002691252981186742, + "loss": 8.0312, + "step": 14953 + }, + { + "epoch": 1.3953531771951106, + "grad_norm": 1.074611109583373, + "learning_rate": 0.0002691207014749718, + "loss": 7.5901, + "step": 14954 + }, + { + "epoch": 1.3954464868899878, + "grad_norm": 0.9832890892881401, + "learning_rate": 0.0002691161045283807, + "loss": 7.541, + "step": 14955 + }, + { + "epoch": 1.395539796584865, + "grad_norm": 1.696839221652246, + "learning_rate": 0.0002691115072789126, + "loss": 7.7312, + "step": 14956 + }, + { + "epoch": 1.3956331062797425, + "grad_norm": 1.6158776759152127, + "learning_rate": 0.0002691069097265793, + "loss": 7.8559, + "step": 14957 + }, + { + "epoch": 1.3957264159746199, + "grad_norm": 1.6196290623331735, + "learning_rate": 0.0002691023118713924, + "loss": 7.5718, + "step": 14958 + }, + { + "epoch": 1.395819725669497, + "grad_norm": 1.5384048765228322, + "learning_rate": 0.00026909771371336357, + "loss": 7.6541, + "step": 14959 + }, + { + "epoch": 1.3959130353643743, + "grad_norm": 2.9165520056461594, + "learning_rate": 0.00026909311525250457, + "loss": 7.5077, + "step": 14960 + }, + { + "epoch": 1.3960063450592517, + "grad_norm": 1.4931044411273504, + "learning_rate": 0.00026908851648882703, + "loss": 7.6539, + "step": 14961 + }, + { + "epoch": 1.396099654754129, + "grad_norm": 1.623478226385565, + "learning_rate": 0.0002690839174223426, + "loss": 7.7774, + "step": 14962 + }, + { + "epoch": 1.3961929644490063, + "grad_norm": 1.4793360330407703, + "learning_rate": 0.00026907931805306314, + "loss": 7.307, + "step": 14963 + }, + { + "epoch": 1.3962862741438835, + "grad_norm": 1.491586894678801, + "learning_rate": 0.00026907471838100025, + "loss": 7.564, + "step": 14964 + }, + { + "epoch": 1.396379583838761, + "grad_norm": 1.255971637388843, + "learning_rate": 0.00026907011840616557, + "loss": 7.4756, + "step": 14965 + }, + { + "epoch": 1.3964728935336381, + "grad_norm": 1.488019673344016, + "learning_rate": 0.0002690655181285709, + "loss": 7.5749, + "step": 14966 + }, + { + "epoch": 1.3965662032285153, + "grad_norm": 3.36119063324342, + "learning_rate": 0.00026906091754822783, + "loss": 7.9552, + "step": 14967 + }, + { + "epoch": 1.3966595129233927, + "grad_norm": 0.882980993662301, + "learning_rate": 0.00026905631666514814, + "loss": 7.4887, + "step": 14968 + }, + { + "epoch": 1.3967528226182702, + "grad_norm": 1.2153918910974495, + "learning_rate": 0.0002690517154793435, + "loss": 7.4036, + "step": 14969 + }, + { + "epoch": 1.3968461323131474, + "grad_norm": 1.3660233217784536, + "learning_rate": 0.00026904711399082564, + "loss": 7.5003, + "step": 14970 + }, + { + "epoch": 1.3969394420080246, + "grad_norm": 1.5168875800970594, + "learning_rate": 0.0002690425121996062, + "loss": 8.1765, + "step": 14971 + }, + { + "epoch": 1.397032751702902, + "grad_norm": 1.3878087373262236, + "learning_rate": 0.00026903791010569693, + "loss": 7.5898, + "step": 14972 + }, + { + "epoch": 1.3971260613977792, + "grad_norm": 1.5366857275610284, + "learning_rate": 0.00026903330770910956, + "loss": 7.3246, + "step": 14973 + }, + { + "epoch": 1.3972193710926566, + "grad_norm": 1.576081603705434, + "learning_rate": 0.00026902870500985566, + "loss": 7.3908, + "step": 14974 + }, + { + "epoch": 1.3973126807875338, + "grad_norm": 1.1924079524556157, + "learning_rate": 0.00026902410200794704, + "loss": 7.5225, + "step": 14975 + }, + { + "epoch": 1.3974059904824112, + "grad_norm": 2.250668930469763, + "learning_rate": 0.00026901949870339543, + "loss": 7.7423, + "step": 14976 + }, + { + "epoch": 1.3974993001772884, + "grad_norm": 1.4804267654484573, + "learning_rate": 0.0002690148950962125, + "loss": 7.7353, + "step": 14977 + }, + { + "epoch": 1.3975926098721656, + "grad_norm": 1.1778165989060487, + "learning_rate": 0.0002690102911864099, + "loss": 7.1952, + "step": 14978 + }, + { + "epoch": 1.397685919567043, + "grad_norm": 2.0599857317993737, + "learning_rate": 0.0002690056869739994, + "loss": 7.9854, + "step": 14979 + }, + { + "epoch": 1.3977792292619204, + "grad_norm": 2.117068816683423, + "learning_rate": 0.0002690010824589927, + "loss": 7.3131, + "step": 14980 + }, + { + "epoch": 1.3978725389567976, + "grad_norm": 1.539431462567992, + "learning_rate": 0.0002689964776414015, + "loss": 7.9649, + "step": 14981 + }, + { + "epoch": 1.3979658486516748, + "grad_norm": 1.3605262866093126, + "learning_rate": 0.0002689918725212375, + "loss": 7.9807, + "step": 14982 + }, + { + "epoch": 1.3980591583465523, + "grad_norm": 1.2874010084498384, + "learning_rate": 0.0002689872670985124, + "loss": 7.3155, + "step": 14983 + }, + { + "epoch": 1.3981524680414295, + "grad_norm": 1.2777593463664205, + "learning_rate": 0.00026898266137323793, + "loss": 7.8538, + "step": 14984 + }, + { + "epoch": 1.3982457777363069, + "grad_norm": 1.188613095872729, + "learning_rate": 0.00026897805534542585, + "loss": 7.7249, + "step": 14985 + }, + { + "epoch": 1.398339087431184, + "grad_norm": 1.216276007870364, + "learning_rate": 0.00026897344901508774, + "loss": 7.6826, + "step": 14986 + }, + { + "epoch": 1.3984323971260615, + "grad_norm": 0.8954089219058423, + "learning_rate": 0.00026896884238223544, + "loss": 7.6434, + "step": 14987 + }, + { + "epoch": 1.3985257068209387, + "grad_norm": 1.669518961694563, + "learning_rate": 0.00026896423544688064, + "loss": 7.6284, + "step": 14988 + }, + { + "epoch": 1.3986190165158159, + "grad_norm": 1.1371649371994914, + "learning_rate": 0.00026895962820903496, + "loss": 7.7069, + "step": 14989 + }, + { + "epoch": 1.3987123262106933, + "grad_norm": 1.9854884293110222, + "learning_rate": 0.0002689550206687102, + "loss": 7.4616, + "step": 14990 + }, + { + "epoch": 1.3988056359055707, + "grad_norm": 1.3909566765235286, + "learning_rate": 0.0002689504128259181, + "loss": 7.5981, + "step": 14991 + }, + { + "epoch": 1.398898945600448, + "grad_norm": 1.224565800573562, + "learning_rate": 0.0002689458046806703, + "loss": 7.4227, + "step": 14992 + }, + { + "epoch": 1.3989922552953251, + "grad_norm": 1.3474548392409231, + "learning_rate": 0.00026894119623297857, + "loss": 7.6517, + "step": 14993 + }, + { + "epoch": 1.3990855649902025, + "grad_norm": 1.7985885321790043, + "learning_rate": 0.00026893658748285463, + "loss": 7.641, + "step": 14994 + }, + { + "epoch": 1.3991788746850797, + "grad_norm": 1.6021205439297832, + "learning_rate": 0.0002689319784303101, + "loss": 7.3951, + "step": 14995 + }, + { + "epoch": 1.3992721843799572, + "grad_norm": 1.1705715361678528, + "learning_rate": 0.00026892736907535683, + "loss": 7.5132, + "step": 14996 + }, + { + "epoch": 1.3993654940748343, + "grad_norm": 0.7218282810217278, + "learning_rate": 0.0002689227594180065, + "loss": 7.5221, + "step": 14997 + }, + { + "epoch": 1.3994588037697118, + "grad_norm": 1.310779607315762, + "learning_rate": 0.00026891814945827084, + "loss": 7.8251, + "step": 14998 + }, + { + "epoch": 1.399552113464589, + "grad_norm": 1.6911094573103604, + "learning_rate": 0.00026891353919616153, + "loss": 7.4413, + "step": 14999 + }, + { + "epoch": 1.3996454231594662, + "grad_norm": 1.881277282596227, + "learning_rate": 0.0002689089286316903, + "loss": 7.4982, + "step": 15000 + }, + { + "epoch": 1.3997387328543436, + "grad_norm": 1.2324761538988334, + "learning_rate": 0.00026890431776486893, + "loss": 7.4829, + "step": 15001 + }, + { + "epoch": 1.399832042549221, + "grad_norm": 1.5477930317461543, + "learning_rate": 0.0002688997065957091, + "loss": 7.4416, + "step": 15002 + }, + { + "epoch": 1.3999253522440982, + "grad_norm": 0.8545039845804342, + "learning_rate": 0.00026889509512422247, + "loss": 7.4599, + "step": 15003 + }, + { + "epoch": 1.4000186619389754, + "grad_norm": 0.8235974645321157, + "learning_rate": 0.00026889048335042087, + "loss": 7.3166, + "step": 15004 + }, + { + "epoch": 1.4001119716338528, + "grad_norm": 1.2708921027055042, + "learning_rate": 0.000268885871274316, + "loss": 7.4858, + "step": 15005 + }, + { + "epoch": 1.40020528132873, + "grad_norm": 1.3733462310035838, + "learning_rate": 0.00026888125889591957, + "loss": 7.4512, + "step": 15006 + }, + { + "epoch": 1.4002985910236074, + "grad_norm": 0.8240013827174735, + "learning_rate": 0.0002688766462152433, + "loss": 7.4963, + "step": 15007 + }, + { + "epoch": 1.4003919007184846, + "grad_norm": 0.8440416002481167, + "learning_rate": 0.00026887203323229894, + "loss": 7.5284, + "step": 15008 + }, + { + "epoch": 1.400485210413362, + "grad_norm": 1.3110559141860962, + "learning_rate": 0.0002688674199470982, + "loss": 7.3583, + "step": 15009 + }, + { + "epoch": 1.4005785201082392, + "grad_norm": 0.9853050867549276, + "learning_rate": 0.00026886280635965285, + "loss": 7.5084, + "step": 15010 + }, + { + "epoch": 1.4006718298031164, + "grad_norm": 1.0603564535117327, + "learning_rate": 0.00026885819246997455, + "loss": 7.5745, + "step": 15011 + }, + { + "epoch": 1.4007651394979939, + "grad_norm": 1.1850096593582986, + "learning_rate": 0.0002688535782780751, + "loss": 7.7268, + "step": 15012 + }, + { + "epoch": 1.400858449192871, + "grad_norm": 1.1727822029315051, + "learning_rate": 0.00026884896378396623, + "loss": 7.671, + "step": 15013 + }, + { + "epoch": 1.4009517588877485, + "grad_norm": 1.5564637511060906, + "learning_rate": 0.0002688443489876596, + "loss": 7.5287, + "step": 15014 + }, + { + "epoch": 1.4010450685826257, + "grad_norm": 1.1011069795059747, + "learning_rate": 0.00026883973388916704, + "loss": 7.6199, + "step": 15015 + }, + { + "epoch": 1.401138378277503, + "grad_norm": 0.8415492696015674, + "learning_rate": 0.00026883511848850024, + "loss": 7.2714, + "step": 15016 + }, + { + "epoch": 1.4012316879723803, + "grad_norm": 1.4114890958997728, + "learning_rate": 0.0002688305027856709, + "loss": 7.766, + "step": 15017 + }, + { + "epoch": 1.4013249976672577, + "grad_norm": 1.1265408769400458, + "learning_rate": 0.00026882588678069086, + "loss": 7.5002, + "step": 15018 + }, + { + "epoch": 1.401418307362135, + "grad_norm": 1.1766551936585015, + "learning_rate": 0.00026882127047357174, + "loss": 7.4616, + "step": 15019 + }, + { + "epoch": 1.4015116170570123, + "grad_norm": 0.9234422383224299, + "learning_rate": 0.0002688166538643253, + "loss": 7.5209, + "step": 15020 + }, + { + "epoch": 1.4016049267518895, + "grad_norm": 0.9719607301578679, + "learning_rate": 0.00026881203695296337, + "loss": 7.6819, + "step": 15021 + }, + { + "epoch": 1.4016982364467667, + "grad_norm": 0.961405989661481, + "learning_rate": 0.00026880741973949753, + "loss": 7.4852, + "step": 15022 + }, + { + "epoch": 1.4017915461416441, + "grad_norm": 1.131548798692975, + "learning_rate": 0.0002688028022239397, + "loss": 7.5601, + "step": 15023 + }, + { + "epoch": 1.4018848558365213, + "grad_norm": 1.695580962230383, + "learning_rate": 0.0002687981844063015, + "loss": 7.2695, + "step": 15024 + }, + { + "epoch": 1.4019781655313988, + "grad_norm": 1.3235078731793033, + "learning_rate": 0.00026879356628659474, + "loss": 7.7655, + "step": 15025 + }, + { + "epoch": 1.402071475226276, + "grad_norm": 1.4047917594739259, + "learning_rate": 0.0002687889478648311, + "loss": 7.6954, + "step": 15026 + }, + { + "epoch": 1.4021647849211534, + "grad_norm": 1.5518798831272473, + "learning_rate": 0.0002687843291410224, + "loss": 7.0494, + "step": 15027 + }, + { + "epoch": 1.4022580946160306, + "grad_norm": 0.9839605212260607, + "learning_rate": 0.0002687797101151803, + "loss": 7.0955, + "step": 15028 + }, + { + "epoch": 1.4023514043109078, + "grad_norm": 1.2447344455913676, + "learning_rate": 0.00026877509078731654, + "loss": 7.4508, + "step": 15029 + }, + { + "epoch": 1.4024447140057852, + "grad_norm": 0.8915747796107982, + "learning_rate": 0.00026877047115744297, + "loss": 7.4663, + "step": 15030 + }, + { + "epoch": 1.4025380237006626, + "grad_norm": 0.775802719428734, + "learning_rate": 0.0002687658512255712, + "loss": 7.4203, + "step": 15031 + }, + { + "epoch": 1.4026313333955398, + "grad_norm": 1.5594799549329375, + "learning_rate": 0.0002687612309917131, + "loss": 7.8417, + "step": 15032 + }, + { + "epoch": 1.402724643090417, + "grad_norm": 0.9820041780713784, + "learning_rate": 0.0002687566104558804, + "loss": 7.3714, + "step": 15033 + }, + { + "epoch": 1.4028179527852944, + "grad_norm": 0.9711919968985904, + "learning_rate": 0.0002687519896180848, + "loss": 7.765, + "step": 15034 + }, + { + "epoch": 1.4029112624801716, + "grad_norm": 1.1071434309399064, + "learning_rate": 0.00026874736847833807, + "loss": 7.5576, + "step": 15035 + }, + { + "epoch": 1.403004572175049, + "grad_norm": 1.1242775341762627, + "learning_rate": 0.0002687427470366519, + "loss": 7.6659, + "step": 15036 + }, + { + "epoch": 1.4030978818699262, + "grad_norm": 1.2826295541960269, + "learning_rate": 0.0002687381252930382, + "loss": 7.3979, + "step": 15037 + }, + { + "epoch": 1.4031911915648037, + "grad_norm": 0.9405745613661083, + "learning_rate": 0.00026873350324750856, + "loss": 7.4302, + "step": 15038 + }, + { + "epoch": 1.4032845012596808, + "grad_norm": 0.8271276599125812, + "learning_rate": 0.0002687288809000748, + "loss": 7.3301, + "step": 15039 + }, + { + "epoch": 1.403377810954558, + "grad_norm": 1.8545512125863177, + "learning_rate": 0.0002687242582507486, + "loss": 7.7545, + "step": 15040 + }, + { + "epoch": 1.4034711206494355, + "grad_norm": 1.2145741752675392, + "learning_rate": 0.00026871963529954187, + "loss": 7.5353, + "step": 15041 + }, + { + "epoch": 1.4035644303443129, + "grad_norm": 1.4465478299532881, + "learning_rate": 0.0002687150120464663, + "loss": 7.5819, + "step": 15042 + }, + { + "epoch": 1.40365774003919, + "grad_norm": 0.9760663023372003, + "learning_rate": 0.00026871038849153356, + "loss": 7.2166, + "step": 15043 + }, + { + "epoch": 1.4037510497340673, + "grad_norm": 1.5284675576540443, + "learning_rate": 0.00026870576463475547, + "loss": 7.2119, + "step": 15044 + }, + { + "epoch": 1.4038443594289447, + "grad_norm": 0.9715665392979437, + "learning_rate": 0.0002687011404761438, + "loss": 7.2892, + "step": 15045 + }, + { + "epoch": 1.403937669123822, + "grad_norm": 0.9889460836729108, + "learning_rate": 0.00026869651601571027, + "loss": 7.6006, + "step": 15046 + }, + { + "epoch": 1.4040309788186993, + "grad_norm": 1.0005518796469626, + "learning_rate": 0.00026869189125346664, + "loss": 7.4644, + "step": 15047 + }, + { + "epoch": 1.4041242885135765, + "grad_norm": 1.7059224809422335, + "learning_rate": 0.00026868726618942476, + "loss": 7.772, + "step": 15048 + }, + { + "epoch": 1.404217598208454, + "grad_norm": 0.7068725420725414, + "learning_rate": 0.00026868264082359624, + "loss": 7.3728, + "step": 15049 + }, + { + "epoch": 1.4043109079033311, + "grad_norm": 0.8771883126607581, + "learning_rate": 0.000268678015155993, + "loss": 7.1843, + "step": 15050 + }, + { + "epoch": 1.4044042175982083, + "grad_norm": 0.6728520314727212, + "learning_rate": 0.0002686733891866267, + "loss": 7.3228, + "step": 15051 + }, + { + "epoch": 1.4044975272930857, + "grad_norm": 0.9052295710286639, + "learning_rate": 0.0002686687629155091, + "loss": 7.3678, + "step": 15052 + }, + { + "epoch": 1.4045908369879632, + "grad_norm": 0.8800853151939847, + "learning_rate": 0.000268664136342652, + "loss": 7.5571, + "step": 15053 + }, + { + "epoch": 1.4046841466828404, + "grad_norm": 0.6859911376154757, + "learning_rate": 0.00026865950946806716, + "loss": 7.3626, + "step": 15054 + }, + { + "epoch": 1.4047774563777176, + "grad_norm": 0.7795205916699435, + "learning_rate": 0.0002686548822917663, + "loss": 7.4729, + "step": 15055 + }, + { + "epoch": 1.404870766072595, + "grad_norm": 0.6872327813293554, + "learning_rate": 0.00026865025481376123, + "loss": 7.4752, + "step": 15056 + }, + { + "epoch": 1.4049640757674722, + "grad_norm": 0.601247902568076, + "learning_rate": 0.00026864562703406375, + "loss": 7.4623, + "step": 15057 + }, + { + "epoch": 1.4050573854623496, + "grad_norm": 1.0922615982937227, + "learning_rate": 0.00026864099895268555, + "loss": 7.4567, + "step": 15058 + }, + { + "epoch": 1.4051506951572268, + "grad_norm": 0.8029619402311697, + "learning_rate": 0.00026863637056963845, + "loss": 7.7348, + "step": 15059 + }, + { + "epoch": 1.4052440048521042, + "grad_norm": 1.1493891009987203, + "learning_rate": 0.0002686317418849342, + "loss": 7.5906, + "step": 15060 + }, + { + "epoch": 1.4053373145469814, + "grad_norm": 1.3855728347306733, + "learning_rate": 0.00026862711289858454, + "loss": 7.3748, + "step": 15061 + }, + { + "epoch": 1.4054306242418586, + "grad_norm": 1.0181238000459103, + "learning_rate": 0.00026862248361060137, + "loss": 7.693, + "step": 15062 + }, + { + "epoch": 1.405523933936736, + "grad_norm": 0.6121347118610126, + "learning_rate": 0.00026861785402099625, + "loss": 7.4372, + "step": 15063 + }, + { + "epoch": 1.4056172436316134, + "grad_norm": 0.9958430774161985, + "learning_rate": 0.0002686132241297811, + "loss": 7.5237, + "step": 15064 + }, + { + "epoch": 1.4057105533264906, + "grad_norm": 0.8150881062812589, + "learning_rate": 0.0002686085939369677, + "loss": 7.4075, + "step": 15065 + }, + { + "epoch": 1.4058038630213678, + "grad_norm": 1.8964583440260712, + "learning_rate": 0.0002686039634425677, + "loss": 7.3912, + "step": 15066 + }, + { + "epoch": 1.4058971727162453, + "grad_norm": 0.8580216335475693, + "learning_rate": 0.00026859933264659306, + "loss": 7.3345, + "step": 15067 + }, + { + "epoch": 1.4059904824111225, + "grad_norm": 1.18664348867949, + "learning_rate": 0.0002685947015490554, + "loss": 7.5906, + "step": 15068 + }, + { + "epoch": 1.4060837921059999, + "grad_norm": 0.9399311239959631, + "learning_rate": 0.0002685900701499665, + "loss": 7.5094, + "step": 15069 + }, + { + "epoch": 1.406177101800877, + "grad_norm": 0.7105957898884134, + "learning_rate": 0.00026858543844933826, + "loss": 7.6717, + "step": 15070 + }, + { + "epoch": 1.4062704114957545, + "grad_norm": 1.1179162999216947, + "learning_rate": 0.0002685808064471823, + "loss": 7.6143, + "step": 15071 + }, + { + "epoch": 1.4063637211906317, + "grad_norm": 2.5545074760781263, + "learning_rate": 0.0002685761741435105, + "loss": 7.1696, + "step": 15072 + }, + { + "epoch": 1.4064570308855089, + "grad_norm": 1.3291440876646303, + "learning_rate": 0.00026857154153833463, + "loss": 7.4886, + "step": 15073 + }, + { + "epoch": 1.4065503405803863, + "grad_norm": 0.6800121244279751, + "learning_rate": 0.00026856690863166647, + "loss": 7.4936, + "step": 15074 + }, + { + "epoch": 1.4066436502752637, + "grad_norm": 0.849897747828518, + "learning_rate": 0.00026856227542351773, + "loss": 7.508, + "step": 15075 + }, + { + "epoch": 1.406736959970141, + "grad_norm": 1.4056325789551611, + "learning_rate": 0.00026855764191390023, + "loss": 7.3825, + "step": 15076 + }, + { + "epoch": 1.4068302696650181, + "grad_norm": 1.17353606640526, + "learning_rate": 0.00026855300810282585, + "loss": 7.4127, + "step": 15077 + }, + { + "epoch": 1.4069235793598955, + "grad_norm": 2.040894871282554, + "learning_rate": 0.00026854837399030624, + "loss": 7.5724, + "step": 15078 + }, + { + "epoch": 1.4070168890547727, + "grad_norm": 1.095188630592224, + "learning_rate": 0.0002685437395763532, + "loss": 7.6817, + "step": 15079 + }, + { + "epoch": 1.4071101987496502, + "grad_norm": 1.1700254323075032, + "learning_rate": 0.00026853910486097866, + "loss": 7.1872, + "step": 15080 + }, + { + "epoch": 1.4072035084445274, + "grad_norm": 0.7926136149283556, + "learning_rate": 0.00026853446984419416, + "loss": 7.5281, + "step": 15081 + }, + { + "epoch": 1.4072968181394048, + "grad_norm": 0.8726130549933144, + "learning_rate": 0.0002685298345260117, + "loss": 7.4931, + "step": 15082 + }, + { + "epoch": 1.407390127834282, + "grad_norm": 1.4985307939018337, + "learning_rate": 0.00026852519890644295, + "loss": 7.2442, + "step": 15083 + }, + { + "epoch": 1.4074834375291592, + "grad_norm": 1.3630068758065637, + "learning_rate": 0.00026852056298549976, + "loss": 7.0636, + "step": 15084 + }, + { + "epoch": 1.4075767472240366, + "grad_norm": 1.0628953760141095, + "learning_rate": 0.00026851592676319384, + "loss": 7.7377, + "step": 15085 + }, + { + "epoch": 1.407670056918914, + "grad_norm": 0.6416343807917168, + "learning_rate": 0.0002685112902395371, + "loss": 7.265, + "step": 15086 + }, + { + "epoch": 1.4077633666137912, + "grad_norm": 1.9734834669519734, + "learning_rate": 0.0002685066534145411, + "loss": 7.6471, + "step": 15087 + }, + { + "epoch": 1.4078566763086684, + "grad_norm": 0.5950632762627656, + "learning_rate": 0.0002685020162882179, + "loss": 7.2756, + "step": 15088 + }, + { + "epoch": 1.4079499860035458, + "grad_norm": 0.661548966645936, + "learning_rate": 0.0002684973788605792, + "loss": 7.2657, + "step": 15089 + }, + { + "epoch": 1.408043295698423, + "grad_norm": 0.6836224643344321, + "learning_rate": 0.0002684927411316367, + "loss": 7.3723, + "step": 15090 + }, + { + "epoch": 1.4081366053933004, + "grad_norm": 1.030128853120538, + "learning_rate": 0.0002684881031014023, + "loss": 7.3929, + "step": 15091 + }, + { + "epoch": 1.4082299150881776, + "grad_norm": 0.7329820748617732, + "learning_rate": 0.0002684834647698877, + "loss": 7.4894, + "step": 15092 + }, + { + "epoch": 1.408323224783055, + "grad_norm": 1.5907709667570487, + "learning_rate": 0.0002684788261371048, + "loss": 7.0208, + "step": 15093 + }, + { + "epoch": 1.4084165344779322, + "grad_norm": 1.17563576628604, + "learning_rate": 0.0002684741872030653, + "loss": 7.0134, + "step": 15094 + }, + { + "epoch": 1.4085098441728094, + "grad_norm": 3.622125580003619, + "learning_rate": 0.0002684695479677811, + "loss": 7.6896, + "step": 15095 + }, + { + "epoch": 1.4086031538676869, + "grad_norm": 0.9166715981589921, + "learning_rate": 0.0002684649084312639, + "loss": 7.3424, + "step": 15096 + }, + { + "epoch": 1.4086964635625643, + "grad_norm": 1.7621034427283342, + "learning_rate": 0.00026846026859352556, + "loss": 7.5661, + "step": 15097 + }, + { + "epoch": 1.4087897732574415, + "grad_norm": 0.46044321251753295, + "learning_rate": 0.0002684556284545778, + "loss": 7.2546, + "step": 15098 + }, + { + "epoch": 1.4088830829523187, + "grad_norm": 0.625044222165227, + "learning_rate": 0.00026845098801443247, + "loss": 7.3987, + "step": 15099 + }, + { + "epoch": 1.408976392647196, + "grad_norm": 0.6685562394151662, + "learning_rate": 0.00026844634727310137, + "loss": 7.6855, + "step": 15100 + }, + { + "epoch": 1.4090697023420733, + "grad_norm": 1.185545719679374, + "learning_rate": 0.00026844170623059626, + "loss": 7.3799, + "step": 15101 + }, + { + "epoch": 1.4091630120369507, + "grad_norm": 0.9195749427567226, + "learning_rate": 0.000268437064886929, + "loss": 7.5448, + "step": 15102 + }, + { + "epoch": 1.409256321731828, + "grad_norm": 2.16208159667159, + "learning_rate": 0.0002684324232421114, + "loss": 7.3982, + "step": 15103 + }, + { + "epoch": 1.4093496314267053, + "grad_norm": 1.3297831911811457, + "learning_rate": 0.0002684277812961552, + "loss": 7.275, + "step": 15104 + }, + { + "epoch": 1.4094429411215825, + "grad_norm": 0.9459625864590392, + "learning_rate": 0.0002684231390490723, + "loss": 7.209, + "step": 15105 + }, + { + "epoch": 1.4095362508164597, + "grad_norm": 0.5627976091376601, + "learning_rate": 0.0002684184965008743, + "loss": 7.2689, + "step": 15106 + }, + { + "epoch": 1.4096295605113371, + "grad_norm": 1.0055242041170491, + "learning_rate": 0.00026841385365157326, + "loss": 7.3558, + "step": 15107 + }, + { + "epoch": 1.4097228702062146, + "grad_norm": 0.9269480710856617, + "learning_rate": 0.0002684092105011809, + "loss": 7.3459, + "step": 15108 + }, + { + "epoch": 1.4098161799010918, + "grad_norm": 1.2134636153583631, + "learning_rate": 0.00026840456704970887, + "loss": 7.3739, + "step": 15109 + }, + { + "epoch": 1.409909489595969, + "grad_norm": 1.0933680329926503, + "learning_rate": 0.00026839992329716914, + "loss": 7.5977, + "step": 15110 + }, + { + "epoch": 1.4100027992908464, + "grad_norm": 0.6223296264318199, + "learning_rate": 0.0002683952792435735, + "loss": 7.7118, + "step": 15111 + }, + { + "epoch": 1.4100961089857236, + "grad_norm": 0.8957781059326458, + "learning_rate": 0.0002683906348889337, + "loss": 7.5324, + "step": 15112 + }, + { + "epoch": 1.410189418680601, + "grad_norm": 2.0273877944852297, + "learning_rate": 0.00026838599023326165, + "loss": 7.1019, + "step": 15113 + }, + { + "epoch": 1.4102827283754782, + "grad_norm": 1.7133477342355206, + "learning_rate": 0.00026838134527656905, + "loss": 7.2134, + "step": 15114 + }, + { + "epoch": 1.4103760380703556, + "grad_norm": 0.8770107513649387, + "learning_rate": 0.00026837670001886776, + "loss": 7.3992, + "step": 15115 + }, + { + "epoch": 1.4104693477652328, + "grad_norm": 0.676469667381757, + "learning_rate": 0.00026837205446016957, + "loss": 7.3039, + "step": 15116 + }, + { + "epoch": 1.41056265746011, + "grad_norm": 1.641300721011105, + "learning_rate": 0.00026836740860048633, + "loss": 7.7474, + "step": 15117 + }, + { + "epoch": 1.4106559671549874, + "grad_norm": 1.1328883145440112, + "learning_rate": 0.0002683627624398299, + "loss": 7.1124, + "step": 15118 + }, + { + "epoch": 1.4107492768498646, + "grad_norm": 7.297525709764867, + "learning_rate": 0.0002683581159782119, + "loss": 7.3104, + "step": 15119 + }, + { + "epoch": 1.410842586544742, + "grad_norm": 0.7294442865096638, + "learning_rate": 0.00026835346921564435, + "loss": 7.4665, + "step": 15120 + }, + { + "epoch": 1.4109358962396192, + "grad_norm": 1.4302787230957206, + "learning_rate": 0.00026834882215213897, + "loss": 7.3059, + "step": 15121 + }, + { + "epoch": 1.4110292059344967, + "grad_norm": 0.9030050625352862, + "learning_rate": 0.0002683441747877076, + "loss": 7.5258, + "step": 15122 + }, + { + "epoch": 1.4111225156293739, + "grad_norm": 2.2783170172635363, + "learning_rate": 0.000268339527122362, + "loss": 7.1499, + "step": 15123 + }, + { + "epoch": 1.4112158253242513, + "grad_norm": 1.9105635055379395, + "learning_rate": 0.00026833487915611413, + "loss": 7.3121, + "step": 15124 + }, + { + "epoch": 1.4113091350191285, + "grad_norm": 0.9149386690545838, + "learning_rate": 0.00026833023088897564, + "loss": 7.4957, + "step": 15125 + }, + { + "epoch": 1.4114024447140059, + "grad_norm": 1.3297429040718372, + "learning_rate": 0.00026832558232095847, + "loss": 6.9998, + "step": 15126 + }, + { + "epoch": 1.411495754408883, + "grad_norm": 0.7404428502063458, + "learning_rate": 0.0002683209334520744, + "loss": 7.1885, + "step": 15127 + }, + { + "epoch": 1.4115890641037603, + "grad_norm": 1.506376355889491, + "learning_rate": 0.0002683162842823352, + "loss": 7.6351, + "step": 15128 + }, + { + "epoch": 1.4116823737986377, + "grad_norm": 1.363621767190332, + "learning_rate": 0.0002683116348117527, + "loss": 7.3498, + "step": 15129 + }, + { + "epoch": 1.411775683493515, + "grad_norm": 0.7936294955786846, + "learning_rate": 0.00026830698504033885, + "loss": 7.1419, + "step": 15130 + }, + { + "epoch": 1.4118689931883923, + "grad_norm": 1.158527217265912, + "learning_rate": 0.00026830233496810533, + "loss": 7.4132, + "step": 15131 + }, + { + "epoch": 1.4119623028832695, + "grad_norm": 1.2175503505528151, + "learning_rate": 0.00026829768459506404, + "loss": 7.567, + "step": 15132 + }, + { + "epoch": 1.412055612578147, + "grad_norm": 0.6247699779702469, + "learning_rate": 0.0002682930339212268, + "loss": 7.1366, + "step": 15133 + }, + { + "epoch": 1.4121489222730241, + "grad_norm": 0.5140418344734189, + "learning_rate": 0.0002682883829466054, + "loss": 7.3468, + "step": 15134 + }, + { + "epoch": 1.4122422319679013, + "grad_norm": 0.8990790336476377, + "learning_rate": 0.00026828373167121165, + "loss": 7.3722, + "step": 15135 + }, + { + "epoch": 1.4123355416627787, + "grad_norm": 1.3863174804369012, + "learning_rate": 0.00026827908009505744, + "loss": 7.1663, + "step": 15136 + }, + { + "epoch": 1.4124288513576562, + "grad_norm": 0.5909041067006644, + "learning_rate": 0.00026827442821815455, + "loss": 7.5711, + "step": 15137 + }, + { + "epoch": 1.4125221610525334, + "grad_norm": 0.874972841055502, + "learning_rate": 0.00026826977604051485, + "loss": 7.3523, + "step": 15138 + }, + { + "epoch": 1.4126154707474106, + "grad_norm": 0.7409981332920104, + "learning_rate": 0.0002682651235621501, + "loss": 7.2345, + "step": 15139 + }, + { + "epoch": 1.412708780442288, + "grad_norm": 0.5921225339330054, + "learning_rate": 0.0002682604707830722, + "loss": 7.0611, + "step": 15140 + }, + { + "epoch": 1.4128020901371652, + "grad_norm": 1.0712216408338244, + "learning_rate": 0.000268255817703293, + "loss": 7.5441, + "step": 15141 + }, + { + "epoch": 1.4128953998320426, + "grad_norm": 0.8253221586801455, + "learning_rate": 0.0002682511643228242, + "loss": 7.4217, + "step": 15142 + }, + { + "epoch": 1.4129887095269198, + "grad_norm": 3.4791857957190904, + "learning_rate": 0.00026824651064167774, + "loss": 7.5704, + "step": 15143 + }, + { + "epoch": 1.4130820192217972, + "grad_norm": 1.0807311728513045, + "learning_rate": 0.00026824185665986545, + "loss": 7.2947, + "step": 15144 + }, + { + "epoch": 1.4131753289166744, + "grad_norm": 1.2365349199722684, + "learning_rate": 0.0002682372023773992, + "loss": 7.1828, + "step": 15145 + }, + { + "epoch": 1.4132686386115516, + "grad_norm": 1.015065552795994, + "learning_rate": 0.00026823254779429064, + "loss": 7.5799, + "step": 15146 + }, + { + "epoch": 1.413361948306429, + "grad_norm": 0.9881971904358234, + "learning_rate": 0.0002682278929105518, + "loss": 7.4343, + "step": 15147 + }, + { + "epoch": 1.4134552580013064, + "grad_norm": 0.952159491379894, + "learning_rate": 0.0002682232377261945, + "loss": 7.3122, + "step": 15148 + }, + { + "epoch": 1.4135485676961836, + "grad_norm": 0.6641591010268088, + "learning_rate": 0.00026821858224123046, + "loss": 7.4671, + "step": 15149 + }, + { + "epoch": 1.4136418773910608, + "grad_norm": 0.5474726697310143, + "learning_rate": 0.0002682139264556716, + "loss": 7.1877, + "step": 15150 + }, + { + "epoch": 1.4137351870859383, + "grad_norm": 1.9555912928297046, + "learning_rate": 0.00026820927036952976, + "loss": 7.8418, + "step": 15151 + }, + { + "epoch": 1.4138284967808155, + "grad_norm": 1.958474622565688, + "learning_rate": 0.0002682046139828167, + "loss": 7.6556, + "step": 15152 + }, + { + "epoch": 1.4139218064756929, + "grad_norm": 0.9269964269248283, + "learning_rate": 0.00026819995729554443, + "loss": 7.4347, + "step": 15153 + }, + { + "epoch": 1.41401511617057, + "grad_norm": 0.5744127675140218, + "learning_rate": 0.0002681953003077246, + "loss": 7.3997, + "step": 15154 + }, + { + "epoch": 1.4141084258654475, + "grad_norm": 0.6448923089738001, + "learning_rate": 0.00026819064301936916, + "loss": 7.6401, + "step": 15155 + }, + { + "epoch": 1.4142017355603247, + "grad_norm": 1.1718162304180972, + "learning_rate": 0.0002681859854304899, + "loss": 7.4114, + "step": 15156 + }, + { + "epoch": 1.4142950452552019, + "grad_norm": 1.3787539238106232, + "learning_rate": 0.0002681813275410987, + "loss": 7.5356, + "step": 15157 + }, + { + "epoch": 1.4143883549500793, + "grad_norm": 0.7988437387862577, + "learning_rate": 0.0002681766693512074, + "loss": 7.7071, + "step": 15158 + }, + { + "epoch": 1.4144816646449567, + "grad_norm": 0.8098483298612195, + "learning_rate": 0.00026817201086082787, + "loss": 7.573, + "step": 15159 + }, + { + "epoch": 1.414574974339834, + "grad_norm": 0.9091602847875605, + "learning_rate": 0.0002681673520699719, + "loss": 7.5207, + "step": 15160 + }, + { + "epoch": 1.4146682840347111, + "grad_norm": 0.5670360486865414, + "learning_rate": 0.0002681626929786513, + "loss": 7.5505, + "step": 15161 + }, + { + "epoch": 1.4147615937295885, + "grad_norm": 0.5813132929898869, + "learning_rate": 0.0002681580335868781, + "loss": 7.3983, + "step": 15162 + }, + { + "epoch": 1.4148549034244657, + "grad_norm": 0.5518325319889986, + "learning_rate": 0.00026815337389466385, + "loss": 7.2423, + "step": 15163 + }, + { + "epoch": 1.4149482131193432, + "grad_norm": 0.48155601242874757, + "learning_rate": 0.00026814871390202073, + "loss": 7.0159, + "step": 15164 + }, + { + "epoch": 1.4150415228142204, + "grad_norm": 1.292404391343241, + "learning_rate": 0.0002681440536089603, + "loss": 7.2986, + "step": 15165 + }, + { + "epoch": 1.4151348325090978, + "grad_norm": 1.6768099983353153, + "learning_rate": 0.00026813939301549464, + "loss": 7.4662, + "step": 15166 + }, + { + "epoch": 1.415228142203975, + "grad_norm": 1.1757706300876731, + "learning_rate": 0.00026813473212163546, + "loss": 7.2451, + "step": 15167 + }, + { + "epoch": 1.4153214518988522, + "grad_norm": 1.0943054657739328, + "learning_rate": 0.00026813007092739465, + "loss": 7.1267, + "step": 15168 + }, + { + "epoch": 1.4154147615937296, + "grad_norm": 0.45610385817410987, + "learning_rate": 0.0002681254094327841, + "loss": 7.3151, + "step": 15169 + }, + { + "epoch": 1.415508071288607, + "grad_norm": 0.8017385195128778, + "learning_rate": 0.00026812074763781554, + "loss": 7.5033, + "step": 15170 + }, + { + "epoch": 1.4156013809834842, + "grad_norm": 1.253917618076876, + "learning_rate": 0.00026811608554250097, + "loss": 7.2239, + "step": 15171 + }, + { + "epoch": 1.4156946906783614, + "grad_norm": 0.7560781821241336, + "learning_rate": 0.0002681114231468521, + "loss": 7.7189, + "step": 15172 + }, + { + "epoch": 1.4157880003732388, + "grad_norm": 4.876247364948269, + "learning_rate": 0.000268106760450881, + "loss": 7.4985, + "step": 15173 + }, + { + "epoch": 1.415881310068116, + "grad_norm": 0.9777437150762465, + "learning_rate": 0.0002681020974545993, + "loss": 7.7412, + "step": 15174 + }, + { + "epoch": 1.4159746197629934, + "grad_norm": 1.4087007411933175, + "learning_rate": 0.000268097434158019, + "loss": 7.5426, + "step": 15175 + }, + { + "epoch": 1.4160679294578706, + "grad_norm": 1.3451377257913082, + "learning_rate": 0.0002680927705611519, + "loss": 7.5572, + "step": 15176 + }, + { + "epoch": 1.416161239152748, + "grad_norm": 1.1104931991007458, + "learning_rate": 0.00026808810666400986, + "loss": 7.1885, + "step": 15177 + }, + { + "epoch": 1.4162545488476252, + "grad_norm": 0.967730066735719, + "learning_rate": 0.0002680834424666047, + "loss": 7.2361, + "step": 15178 + }, + { + "epoch": 1.4163478585425024, + "grad_norm": 0.35751781131716426, + "learning_rate": 0.00026807877796894837, + "loss": 7.0713, + "step": 15179 + }, + { + "epoch": 1.4164411682373799, + "grad_norm": 1.4770080037532034, + "learning_rate": 0.0002680741131710527, + "loss": 7.5122, + "step": 15180 + }, + { + "epoch": 1.4165344779322573, + "grad_norm": 3.259389825347411, + "learning_rate": 0.0002680694480729295, + "loss": 7.4503, + "step": 15181 + }, + { + "epoch": 1.4166277876271345, + "grad_norm": 2.616541372593606, + "learning_rate": 0.0002680647826745907, + "loss": 7.8573, + "step": 15182 + }, + { + "epoch": 1.4167210973220117, + "grad_norm": 0.46385177814732087, + "learning_rate": 0.00026806011697604813, + "loss": 7.1604, + "step": 15183 + }, + { + "epoch": 1.416814407016889, + "grad_norm": 3.9642404358663614, + "learning_rate": 0.0002680554509773136, + "loss": 7.2436, + "step": 15184 + }, + { + "epoch": 1.4169077167117663, + "grad_norm": 0.9501144053916221, + "learning_rate": 0.00026805078467839906, + "loss": 7.5155, + "step": 15185 + }, + { + "epoch": 1.4170010264066437, + "grad_norm": 0.5186507419455548, + "learning_rate": 0.00026804611807931637, + "loss": 7.2543, + "step": 15186 + }, + { + "epoch": 1.417094336101521, + "grad_norm": 0.7761441888543367, + "learning_rate": 0.00026804145118007735, + "loss": 7.3699, + "step": 15187 + }, + { + "epoch": 1.4171876457963983, + "grad_norm": 2.1605578964927306, + "learning_rate": 0.00026803678398069393, + "loss": 7.7056, + "step": 15188 + }, + { + "epoch": 1.4172809554912755, + "grad_norm": 0.7144114713915266, + "learning_rate": 0.00026803211648117786, + "loss": 7.4391, + "step": 15189 + }, + { + "epoch": 1.4173742651861527, + "grad_norm": 3.8390712284473114, + "learning_rate": 0.00026802744868154114, + "loss": 7.2502, + "step": 15190 + }, + { + "epoch": 1.4174675748810301, + "grad_norm": 1.5472349185858514, + "learning_rate": 0.0002680227805817955, + "loss": 7.4045, + "step": 15191 + }, + { + "epoch": 1.4175608845759076, + "grad_norm": 1.5858462094693735, + "learning_rate": 0.00026801811218195293, + "loss": 7.4327, + "step": 15192 + }, + { + "epoch": 1.4176541942707848, + "grad_norm": 1.3384716189606736, + "learning_rate": 0.0002680134434820253, + "loss": 7.3677, + "step": 15193 + }, + { + "epoch": 1.417747503965662, + "grad_norm": 0.7150979001112276, + "learning_rate": 0.0002680087744820244, + "loss": 7.4778, + "step": 15194 + }, + { + "epoch": 1.4178408136605394, + "grad_norm": 0.7095759529411582, + "learning_rate": 0.0002680041051819622, + "loss": 7.4567, + "step": 15195 + }, + { + "epoch": 1.4179341233554166, + "grad_norm": 0.8200239048526239, + "learning_rate": 0.0002679994355818505, + "loss": 7.3964, + "step": 15196 + }, + { + "epoch": 1.418027433050294, + "grad_norm": 8.457922576332493, + "learning_rate": 0.0002679947656817011, + "loss": 7.4322, + "step": 15197 + }, + { + "epoch": 1.4181207427451712, + "grad_norm": 0.5289297356602795, + "learning_rate": 0.0002679900954815261, + "loss": 7.2335, + "step": 15198 + }, + { + "epoch": 1.4182140524400486, + "grad_norm": 1.0030546214524236, + "learning_rate": 0.0002679854249813371, + "loss": 7.3833, + "step": 15199 + }, + { + "epoch": 1.4183073621349258, + "grad_norm": 1.5916367721209954, + "learning_rate": 0.00026798075418114616, + "loss": 7.217, + "step": 15200 + }, + { + "epoch": 1.418400671829803, + "grad_norm": 13.255297968302488, + "learning_rate": 0.0002679760830809652, + "loss": 7.2856, + "step": 15201 + }, + { + "epoch": 1.4184939815246804, + "grad_norm": 0.5855184175314408, + "learning_rate": 0.0002679714116808059, + "loss": 7.4026, + "step": 15202 + }, + { + "epoch": 1.4185872912195578, + "grad_norm": 0.5663082302891745, + "learning_rate": 0.00026796673998068026, + "loss": 7.3135, + "step": 15203 + }, + { + "epoch": 1.418680600914435, + "grad_norm": 1.2795805006114023, + "learning_rate": 0.0002679620679806002, + "loss": 7.2034, + "step": 15204 + }, + { + "epoch": 1.4187739106093122, + "grad_norm": 10.397642446375636, + "learning_rate": 0.00026795739568057753, + "loss": 7.3311, + "step": 15205 + }, + { + "epoch": 1.4188672203041897, + "grad_norm": 1.4726801533353733, + "learning_rate": 0.0002679527230806241, + "loss": 7.2677, + "step": 15206 + }, + { + "epoch": 1.4189605299990669, + "grad_norm": 0.4597746425181471, + "learning_rate": 0.00026794805018075186, + "loss": 7.1263, + "step": 15207 + }, + { + "epoch": 1.4190538396939443, + "grad_norm": 0.882734419708679, + "learning_rate": 0.0002679433769809727, + "loss": 7.5588, + "step": 15208 + }, + { + "epoch": 1.4191471493888215, + "grad_norm": 0.37208845765771764, + "learning_rate": 0.0002679387034812984, + "loss": 7.3732, + "step": 15209 + }, + { + "epoch": 1.419240459083699, + "grad_norm": 0.8456712311320046, + "learning_rate": 0.00026793402968174095, + "loss": 7.5152, + "step": 15210 + }, + { + "epoch": 1.419333768778576, + "grad_norm": 0.8281531590524635, + "learning_rate": 0.0002679293555823122, + "loss": 7.5611, + "step": 15211 + }, + { + "epoch": 1.4194270784734533, + "grad_norm": 13.166671951755752, + "learning_rate": 0.00026792468118302405, + "loss": 7.6114, + "step": 15212 + }, + { + "epoch": 1.4195203881683307, + "grad_norm": 6.05537864745246, + "learning_rate": 0.00026792000648388835, + "loss": 7.4298, + "step": 15213 + }, + { + "epoch": 1.4196136978632081, + "grad_norm": 2.5341124633492518, + "learning_rate": 0.000267915331484917, + "loss": 7.2055, + "step": 15214 + }, + { + "epoch": 1.4197070075580853, + "grad_norm": 24.22240178550437, + "learning_rate": 0.0002679106561861219, + "loss": 7.1848, + "step": 15215 + }, + { + "epoch": 1.4198003172529625, + "grad_norm": 1.2405966039463197, + "learning_rate": 0.0002679059805875149, + "loss": 7.417, + "step": 15216 + }, + { + "epoch": 1.41989362694784, + "grad_norm": 0.4219515127919764, + "learning_rate": 0.00026790130468910795, + "loss": 7.4531, + "step": 15217 + }, + { + "epoch": 1.4199869366427171, + "grad_norm": 1.745407855383271, + "learning_rate": 0.0002678966284909129, + "loss": 7.7836, + "step": 15218 + }, + { + "epoch": 1.4200802463375946, + "grad_norm": 14.165488909164788, + "learning_rate": 0.00026789195199294165, + "loss": 7.8204, + "step": 15219 + }, + { + "epoch": 1.4201735560324718, + "grad_norm": 0.6982768708190841, + "learning_rate": 0.0002678872751952061, + "loss": 7.0406, + "step": 15220 + }, + { + "epoch": 1.4202668657273492, + "grad_norm": 0.7766386239338621, + "learning_rate": 0.0002678825980977181, + "loss": 7.2293, + "step": 15221 + }, + { + "epoch": 1.4203601754222264, + "grad_norm": 0.7848699487316869, + "learning_rate": 0.00026787792070048956, + "loss": 7.2564, + "step": 15222 + }, + { + "epoch": 1.4204534851171036, + "grad_norm": 0.7764005623930453, + "learning_rate": 0.00026787324300353246, + "loss": 7.4275, + "step": 15223 + }, + { + "epoch": 1.420546794811981, + "grad_norm": 0.46448644710963216, + "learning_rate": 0.00026786856500685856, + "loss": 7.161, + "step": 15224 + }, + { + "epoch": 1.4206401045068582, + "grad_norm": 0.8350198212780454, + "learning_rate": 0.0002678638867104798, + "loss": 7.3917, + "step": 15225 + }, + { + "epoch": 1.4207334142017356, + "grad_norm": 0.4498866313023291, + "learning_rate": 0.0002678592081144081, + "loss": 7.3002, + "step": 15226 + }, + { + "epoch": 1.4208267238966128, + "grad_norm": 0.4283771018963302, + "learning_rate": 0.0002678545292186554, + "loss": 7.0537, + "step": 15227 + }, + { + "epoch": 1.4209200335914902, + "grad_norm": 0.8846366238805722, + "learning_rate": 0.00026784985002323346, + "loss": 7.339, + "step": 15228 + }, + { + "epoch": 1.4210133432863674, + "grad_norm": 69.91023066050168, + "learning_rate": 0.0002678451705281544, + "loss": 7.3879, + "step": 15229 + }, + { + "epoch": 1.4211066529812448, + "grad_norm": 1.3300851208495013, + "learning_rate": 0.0002678404907334298, + "loss": 7.3015, + "step": 15230 + }, + { + "epoch": 1.421199962676122, + "grad_norm": 34.43893649074221, + "learning_rate": 0.00026783581063907183, + "loss": 7.5092, + "step": 15231 + }, + { + "epoch": 1.4212932723709994, + "grad_norm": 1.2123355627623464, + "learning_rate": 0.00026783113024509233, + "loss": 7.188, + "step": 15232 + }, + { + "epoch": 1.4213865820658766, + "grad_norm": 0.9783461781737006, + "learning_rate": 0.0002678264495515031, + "loss": 7.1184, + "step": 15233 + }, + { + "epoch": 1.4214798917607538, + "grad_norm": 0.4965738153913095, + "learning_rate": 0.0002678217685583162, + "loss": 7.3349, + "step": 15234 + }, + { + "epoch": 1.4215732014556313, + "grad_norm": 0.659728184699589, + "learning_rate": 0.00026781708726554337, + "loss": 7.5838, + "step": 15235 + }, + { + "epoch": 1.4216665111505085, + "grad_norm": 1.0265600223220723, + "learning_rate": 0.00026781240567319656, + "loss": 7.5944, + "step": 15236 + }, + { + "epoch": 1.4217598208453859, + "grad_norm": 1.3081002537057769, + "learning_rate": 0.0002678077237812878, + "loss": 7.5741, + "step": 15237 + }, + { + "epoch": 1.421853130540263, + "grad_norm": 86.37734299754348, + "learning_rate": 0.0002678030415898288, + "loss": 7.5602, + "step": 15238 + }, + { + "epoch": 1.4219464402351405, + "grad_norm": 2.259117247538242, + "learning_rate": 0.0002677983590988316, + "loss": 7.4384, + "step": 15239 + }, + { + "epoch": 1.4220397499300177, + "grad_norm": 2.152509531862572, + "learning_rate": 0.0002677936763083081, + "loss": 7.3994, + "step": 15240 + }, + { + "epoch": 1.422133059624895, + "grad_norm": 2.1912056901108197, + "learning_rate": 0.00026778899321827014, + "loss": 7.3524, + "step": 15241 + }, + { + "epoch": 1.4222263693197723, + "grad_norm": 1.7618322365309937, + "learning_rate": 0.00026778430982872964, + "loss": 7.3959, + "step": 15242 + }, + { + "epoch": 1.4223196790146497, + "grad_norm": 0.9206190407229323, + "learning_rate": 0.0002677796261396986, + "loss": 7.3836, + "step": 15243 + }, + { + "epoch": 1.422412988709527, + "grad_norm": 1.2278732944349962, + "learning_rate": 0.0002677749421511888, + "loss": 7.5051, + "step": 15244 + }, + { + "epoch": 1.4225062984044041, + "grad_norm": 1.2978769642833308, + "learning_rate": 0.0002677702578632122, + "loss": 7.3307, + "step": 15245 + }, + { + "epoch": 1.4225996080992815, + "grad_norm": 1.6903832086445385, + "learning_rate": 0.0002677655732757808, + "loss": 7.4706, + "step": 15246 + }, + { + "epoch": 1.4226929177941587, + "grad_norm": 1.2092999634077288, + "learning_rate": 0.00026776088838890636, + "loss": 7.4301, + "step": 15247 + }, + { + "epoch": 1.4227862274890362, + "grad_norm": 1.730940320294507, + "learning_rate": 0.00026775620320260087, + "loss": 7.7055, + "step": 15248 + }, + { + "epoch": 1.4228795371839134, + "grad_norm": 1.017737492911472, + "learning_rate": 0.00026775151771687625, + "loss": 7.5368, + "step": 15249 + }, + { + "epoch": 1.4229728468787908, + "grad_norm": 0.7661712319537055, + "learning_rate": 0.0002677468319317444, + "loss": 7.6341, + "step": 15250 + }, + { + "epoch": 1.423066156573668, + "grad_norm": 1.7057213051150373, + "learning_rate": 0.00026774214584721724, + "loss": 7.6631, + "step": 15251 + }, + { + "epoch": 1.4231594662685452, + "grad_norm": 1.9256945390355324, + "learning_rate": 0.0002677374594633067, + "loss": 7.1958, + "step": 15252 + }, + { + "epoch": 1.4232527759634226, + "grad_norm": 1.9555218937552927, + "learning_rate": 0.00026773277278002467, + "loss": 7.409, + "step": 15253 + }, + { + "epoch": 1.4233460856583, + "grad_norm": 129.45446992138963, + "learning_rate": 0.00026772808579738304, + "loss": 7.4768, + "step": 15254 + }, + { + "epoch": 1.4234393953531772, + "grad_norm": 0.6564927889161095, + "learning_rate": 0.0002677233985153938, + "loss": 7.4537, + "step": 15255 + }, + { + "epoch": 1.4235327050480544, + "grad_norm": 0.984915623324112, + "learning_rate": 0.0002677187109340688, + "loss": 7.5285, + "step": 15256 + }, + { + "epoch": 1.4236260147429318, + "grad_norm": 104.71016912463303, + "learning_rate": 0.00026771402305342, + "loss": 7.4189, + "step": 15257 + }, + { + "epoch": 1.423719324437809, + "grad_norm": 125.41027034871595, + "learning_rate": 0.0002677093348734593, + "loss": 7.4783, + "step": 15258 + }, + { + "epoch": 1.4238126341326864, + "grad_norm": 2.3761298993024105, + "learning_rate": 0.00026770464639419865, + "loss": 7.2287, + "step": 15259 + }, + { + "epoch": 1.4239059438275636, + "grad_norm": 3.4598482904728565, + "learning_rate": 0.0002676999576156499, + "loss": 7.7178, + "step": 15260 + }, + { + "epoch": 1.423999253522441, + "grad_norm": 3.214785086716908, + "learning_rate": 0.0002676952685378251, + "loss": 7.6909, + "step": 15261 + }, + { + "epoch": 1.4240925632173183, + "grad_norm": 1.271003599801515, + "learning_rate": 0.00026769057916073603, + "loss": 7.4061, + "step": 15262 + }, + { + "epoch": 1.4241858729121954, + "grad_norm": 1.4305176588284758, + "learning_rate": 0.0002676858894843947, + "loss": 7.2037, + "step": 15263 + }, + { + "epoch": 1.4242791826070729, + "grad_norm": 0.9459747393886397, + "learning_rate": 0.00026768119950881304, + "loss": 7.3805, + "step": 15264 + }, + { + "epoch": 1.4243724923019503, + "grad_norm": 370.7699671157763, + "learning_rate": 0.00026767650923400295, + "loss": 7.0306, + "step": 15265 + }, + { + "epoch": 1.4244658019968275, + "grad_norm": 2.97810738903012, + "learning_rate": 0.00026767181865997627, + "loss": 7.4423, + "step": 15266 + }, + { + "epoch": 1.4245591116917047, + "grad_norm": 3.568376905401368, + "learning_rate": 0.00026766712778674513, + "loss": 7.4888, + "step": 15267 + }, + { + "epoch": 1.424652421386582, + "grad_norm": 4.137525836067539, + "learning_rate": 0.00026766243661432125, + "loss": 7.3277, + "step": 15268 + }, + { + "epoch": 1.4247457310814593, + "grad_norm": 2.8985716573802143, + "learning_rate": 0.0002676577451427167, + "loss": 7.8016, + "step": 15269 + }, + { + "epoch": 1.4248390407763367, + "grad_norm": 507.96228243594373, + "learning_rate": 0.00026765305337194333, + "loss": 7.3614, + "step": 15270 + }, + { + "epoch": 1.424932350471214, + "grad_norm": 2.377728747282306, + "learning_rate": 0.00026764836130201315, + "loss": 7.348, + "step": 15271 + }, + { + "epoch": 1.4250256601660913, + "grad_norm": 0.6099166504307186, + "learning_rate": 0.000267643668932938, + "loss": 7.4444, + "step": 15272 + }, + { + "epoch": 1.4251189698609685, + "grad_norm": 0.9120728664025747, + "learning_rate": 0.0002676389762647298, + "loss": 7.2457, + "step": 15273 + }, + { + "epoch": 1.4252122795558457, + "grad_norm": 2.178497624330217, + "learning_rate": 0.00026763428329740056, + "loss": 7.1888, + "step": 15274 + }, + { + "epoch": 1.4253055892507231, + "grad_norm": 4.028267417834977, + "learning_rate": 0.0002676295900309622, + "loss": 7.698, + "step": 15275 + }, + { + "epoch": 1.4253988989456006, + "grad_norm": 3.8378837139957156, + "learning_rate": 0.0002676248964654267, + "loss": 7.5456, + "step": 15276 + }, + { + "epoch": 1.4254922086404778, + "grad_norm": 3.8531963112107213, + "learning_rate": 0.00026762020260080586, + "loss": 7.666, + "step": 15277 + }, + { + "epoch": 1.425585518335355, + "grad_norm": 1.2631489208642601, + "learning_rate": 0.00026761550843711166, + "loss": 6.9139, + "step": 15278 + }, + { + "epoch": 1.4256788280302324, + "grad_norm": 1.081684999616807, + "learning_rate": 0.0002676108139743561, + "loss": 7.2608, + "step": 15279 + }, + { + "epoch": 1.4257721377251096, + "grad_norm": 1.5499474779813895, + "learning_rate": 0.00026760611921255106, + "loss": 7.7049, + "step": 15280 + }, + { + "epoch": 1.425865447419987, + "grad_norm": 0.6488849648526293, + "learning_rate": 0.00026760142415170856, + "loss": 7.5062, + "step": 15281 + }, + { + "epoch": 1.4259587571148642, + "grad_norm": 452.8233235263888, + "learning_rate": 0.0002675967287918404, + "loss": 7.4838, + "step": 15282 + }, + { + "epoch": 1.4260520668097416, + "grad_norm": 1.085866363808226, + "learning_rate": 0.0002675920331329586, + "loss": 7.2748, + "step": 15283 + }, + { + "epoch": 1.4261453765046188, + "grad_norm": 427.1246160650314, + "learning_rate": 0.00026758733717507514, + "loss": 7.4188, + "step": 15284 + }, + { + "epoch": 1.426238686199496, + "grad_norm": 1.036988223146332, + "learning_rate": 0.00026758264091820185, + "loss": 7.2511, + "step": 15285 + }, + { + "epoch": 1.4263319958943734, + "grad_norm": 1.054861188209064, + "learning_rate": 0.00026757794436235076, + "loss": 7.3738, + "step": 15286 + }, + { + "epoch": 1.4264253055892508, + "grad_norm": 2.4182579377276863, + "learning_rate": 0.00026757324750753384, + "loss": 7.2315, + "step": 15287 + }, + { + "epoch": 1.426518615284128, + "grad_norm": 530.8281582756533, + "learning_rate": 0.0002675685503537629, + "loss": 7.2606, + "step": 15288 + }, + { + "epoch": 1.4266119249790052, + "grad_norm": 0.7213527942576884, + "learning_rate": 0.00026756385290105004, + "loss": 7.4256, + "step": 15289 + }, + { + "epoch": 1.4267052346738827, + "grad_norm": 0.923621640552148, + "learning_rate": 0.00026755915514940705, + "loss": 7.3357, + "step": 15290 + }, + { + "epoch": 1.4267985443687599, + "grad_norm": 1.1781828833410914, + "learning_rate": 0.000267554457098846, + "loss": 7.1531, + "step": 15291 + }, + { + "epoch": 1.4268918540636373, + "grad_norm": 540.1027935777223, + "learning_rate": 0.0002675497587493788, + "loss": 7.6406, + "step": 15292 + }, + { + "epoch": 1.4269851637585145, + "grad_norm": 1.148302189677573, + "learning_rate": 0.00026754506010101737, + "loss": 7.6516, + "step": 15293 + }, + { + "epoch": 1.427078473453392, + "grad_norm": 1.8637247956630416, + "learning_rate": 0.00026754036115377364, + "loss": 7.1248, + "step": 15294 + }, + { + "epoch": 1.427171783148269, + "grad_norm": 2.5838051585320323, + "learning_rate": 0.0002675356619076596, + "loss": 7.1804, + "step": 15295 + }, + { + "epoch": 1.4272650928431463, + "grad_norm": 3.5764923520889353, + "learning_rate": 0.00026753096236268724, + "loss": 7.0314, + "step": 15296 + }, + { + "epoch": 1.4273584025380237, + "grad_norm": 1228.9615765416604, + "learning_rate": 0.0002675262625188684, + "loss": 7.2992, + "step": 15297 + }, + { + "epoch": 1.4274517122329011, + "grad_norm": 0.8977056336226712, + "learning_rate": 0.0002675215623762151, + "loss": 7.5964, + "step": 15298 + }, + { + "epoch": 1.4275450219277783, + "grad_norm": 453.8644595014385, + "learning_rate": 0.0002675168619347393, + "loss": 7.2743, + "step": 15299 + }, + { + "epoch": 1.4276383316226555, + "grad_norm": 0.8261572563950087, + "learning_rate": 0.0002675121611944529, + "loss": 7.3374, + "step": 15300 + }, + { + "epoch": 1.427731641317533, + "grad_norm": 5.054262594712578, + "learning_rate": 0.0002675074601553679, + "loss": 7.1313, + "step": 15301 + }, + { + "epoch": 1.4278249510124101, + "grad_norm": 455.4110067574421, + "learning_rate": 0.0002675027588174962, + "loss": 7.0714, + "step": 15302 + }, + { + "epoch": 1.4279182607072876, + "grad_norm": 2.4603595629173483, + "learning_rate": 0.00026749805718084986, + "loss": 7.3631, + "step": 15303 + }, + { + "epoch": 1.4280115704021648, + "grad_norm": 1.6255631259890755, + "learning_rate": 0.00026749335524544075, + "loss": 7.3221, + "step": 15304 + }, + { + "epoch": 1.4281048800970422, + "grad_norm": 846.7287830908567, + "learning_rate": 0.0002674886530112808, + "loss": 7.4022, + "step": 15305 + }, + { + "epoch": 1.4281981897919194, + "grad_norm": 1.4156658653199121, + "learning_rate": 0.00026748395047838204, + "loss": 7.3459, + "step": 15306 + }, + { + "epoch": 1.4282914994867966, + "grad_norm": 1.0841595754536462, + "learning_rate": 0.00026747924764675645, + "loss": 7.4148, + "step": 15307 + }, + { + "epoch": 1.428384809181674, + "grad_norm": 0.7921233399507306, + "learning_rate": 0.0002674745445164158, + "loss": 7.1649, + "step": 15308 + }, + { + "epoch": 1.4284781188765514, + "grad_norm": 1.3447103970832572, + "learning_rate": 0.0002674698410873723, + "loss": 7.4718, + "step": 15309 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 473.48338921068546, + "learning_rate": 0.0002674651373596377, + "loss": 7.2217, + "step": 15310 + }, + { + "epoch": 1.4286647382663058, + "grad_norm": 1.4003550065015415, + "learning_rate": 0.00026746043333322414, + "loss": 7.3967, + "step": 15311 + }, + { + "epoch": 1.4287580479611832, + "grad_norm": 0.9817305079701022, + "learning_rate": 0.00026745572900814345, + "loss": 7.4644, + "step": 15312 + }, + { + "epoch": 1.4288513576560604, + "grad_norm": 1.6216607551146391, + "learning_rate": 0.0002674510243844076, + "loss": 7.6054, + "step": 15313 + }, + { + "epoch": 1.4289446673509378, + "grad_norm": 2.052807119086103, + "learning_rate": 0.0002674463194620286, + "loss": 7.7496, + "step": 15314 + }, + { + "epoch": 1.429037977045815, + "grad_norm": 0.6725688355345754, + "learning_rate": 0.0002674416142410185, + "loss": 7.3896, + "step": 15315 + }, + { + "epoch": 1.4291312867406925, + "grad_norm": 1.0444878179485877, + "learning_rate": 0.00026743690872138906, + "loss": 7.3754, + "step": 15316 + }, + { + "epoch": 1.4292245964355696, + "grad_norm": 0.7053420145278061, + "learning_rate": 0.0002674322029031524, + "loss": 7.784, + "step": 15317 + }, + { + "epoch": 1.4293179061304468, + "grad_norm": 1.2999152566540075, + "learning_rate": 0.00026742749678632035, + "loss": 7.3303, + "step": 15318 + }, + { + "epoch": 1.4294112158253243, + "grad_norm": 1.311087562704459, + "learning_rate": 0.000267422790370905, + "loss": 7.654, + "step": 15319 + }, + { + "epoch": 1.4295045255202017, + "grad_norm": 1.875954203732037, + "learning_rate": 0.00026741808365691837, + "loss": 7.3041, + "step": 15320 + }, + { + "epoch": 1.4295978352150789, + "grad_norm": 66.96675448816693, + "learning_rate": 0.0002674133766443722, + "loss": 7.6542, + "step": 15321 + }, + { + "epoch": 1.429691144909956, + "grad_norm": 0.785731266439399, + "learning_rate": 0.0002674086693332787, + "loss": 7.2325, + "step": 15322 + }, + { + "epoch": 1.4297844546048335, + "grad_norm": 1221.6029630563237, + "learning_rate": 0.00026740396172364965, + "loss": 7.1852, + "step": 15323 + }, + { + "epoch": 1.4298777642997107, + "grad_norm": 13.383823171375017, + "learning_rate": 0.0002673992538154972, + "loss": 7.2581, + "step": 15324 + }, + { + "epoch": 1.4299710739945881, + "grad_norm": 191.42465918905665, + "learning_rate": 0.00026739454560883313, + "loss": 7.031, + "step": 15325 + }, + { + "epoch": 1.4300643836894653, + "grad_norm": 0.8721737161149962, + "learning_rate": 0.00026738983710366957, + "loss": 7.6822, + "step": 15326 + }, + { + "epoch": 1.4301576933843427, + "grad_norm": 1.3802106238234795, + "learning_rate": 0.00026738512830001837, + "loss": 7.5398, + "step": 15327 + }, + { + "epoch": 1.43025100307922, + "grad_norm": 1.9107959918054913, + "learning_rate": 0.0002673804191978916, + "loss": 7.0762, + "step": 15328 + }, + { + "epoch": 1.4303443127740971, + "grad_norm": 0.7575531132253595, + "learning_rate": 0.00026737570979730116, + "loss": 7.6963, + "step": 15329 + }, + { + "epoch": 1.4304376224689745, + "grad_norm": 481.7421641845423, + "learning_rate": 0.0002673710000982591, + "loss": 7.2784, + "step": 15330 + }, + { + "epoch": 1.4305309321638517, + "grad_norm": 2.7699043399131633, + "learning_rate": 0.00026736629010077733, + "loss": 7.2503, + "step": 15331 + }, + { + "epoch": 1.4306242418587292, + "grad_norm": 3.6352672043824836, + "learning_rate": 0.00026736157980486783, + "loss": 7.0988, + "step": 15332 + }, + { + "epoch": 1.4307175515536064, + "grad_norm": 3.2213180099714323, + "learning_rate": 0.00026735686921054263, + "loss": 7.3853, + "step": 15333 + }, + { + "epoch": 1.4308108612484838, + "grad_norm": 3.573647140202747, + "learning_rate": 0.00026735215831781366, + "loss": 7.0803, + "step": 15334 + }, + { + "epoch": 1.430904170943361, + "grad_norm": 5306.671806640336, + "learning_rate": 0.0002673474471266929, + "loss": 7.4927, + "step": 15335 + }, + { + "epoch": 1.4309974806382384, + "grad_norm": 1.4549129718497722, + "learning_rate": 0.00026734273563719244, + "loss": 7.7636, + "step": 15336 + }, + { + "epoch": 1.4310907903331156, + "grad_norm": 2.7301699820698664, + "learning_rate": 0.00026733802384932404, + "loss": 7.2972, + "step": 15337 + }, + { + "epoch": 1.431184100027993, + "grad_norm": 1.7838957392616523, + "learning_rate": 0.00026733331176309984, + "loss": 7.1596, + "step": 15338 + }, + { + "epoch": 1.4312774097228702, + "grad_norm": 0.7435919809815259, + "learning_rate": 0.00026732859937853183, + "loss": 7.3843, + "step": 15339 + }, + { + "epoch": 1.4313707194177474, + "grad_norm": 1.065707920399353, + "learning_rate": 0.00026732388669563193, + "loss": 7.4203, + "step": 15340 + }, + { + "epoch": 1.4314640291126248, + "grad_norm": 675.6499365494103, + "learning_rate": 0.00026731917371441207, + "loss": 7.3809, + "step": 15341 + }, + { + "epoch": 1.431557338807502, + "grad_norm": 3.3736015065614997, + "learning_rate": 0.0002673144604348844, + "loss": 7.3847, + "step": 15342 + }, + { + "epoch": 1.4316506485023794, + "grad_norm": 5.250973548539789, + "learning_rate": 0.00026730974685706076, + "loss": 7.8828, + "step": 15343 + }, + { + "epoch": 1.4317439581972566, + "grad_norm": 4.425097492055055, + "learning_rate": 0.00026730503298095316, + "loss": 7.6122, + "step": 15344 + }, + { + "epoch": 1.431837267892134, + "grad_norm": 5.706987942046339, + "learning_rate": 0.00026730031880657363, + "loss": 7.6005, + "step": 15345 + }, + { + "epoch": 1.4319305775870113, + "grad_norm": 4.321082860379556, + "learning_rate": 0.00026729560433393415, + "loss": 7.4188, + "step": 15346 + }, + { + "epoch": 1.4320238872818885, + "grad_norm": 7.207123510003575, + "learning_rate": 0.0002672908895630467, + "loss": 7.8612, + "step": 15347 + }, + { + "epoch": 1.4321171969767659, + "grad_norm": 195.45680039572616, + "learning_rate": 0.00026728617449392326, + "loss": 7.0509, + "step": 15348 + }, + { + "epoch": 1.4322105066716433, + "grad_norm": 3.066095460160473, + "learning_rate": 0.0002672814591265758, + "loss": 7.6249, + "step": 15349 + }, + { + "epoch": 1.4323038163665205, + "grad_norm": 1347.1745716951978, + "learning_rate": 0.0002672767434610164, + "loss": 7.6385, + "step": 15350 + }, + { + "epoch": 1.4323971260613977, + "grad_norm": 1.802549746635601, + "learning_rate": 0.00026727202749725684, + "loss": 7.4508, + "step": 15351 + }, + { + "epoch": 1.432490435756275, + "grad_norm": 1.7871948135881832, + "learning_rate": 0.00026726731123530935, + "loss": 7.3613, + "step": 15352 + }, + { + "epoch": 1.4325837454511523, + "grad_norm": 3.654761739446903, + "learning_rate": 0.0002672625946751858, + "loss": 7.7853, + "step": 15353 + }, + { + "epoch": 1.4326770551460297, + "grad_norm": 950.3713975913578, + "learning_rate": 0.0002672578778168983, + "loss": 7.7894, + "step": 15354 + }, + { + "epoch": 1.432770364840907, + "grad_norm": 1.6388905558755793, + "learning_rate": 0.00026725316066045863, + "loss": 7.6424, + "step": 15355 + }, + { + "epoch": 1.4328636745357843, + "grad_norm": 0.9804459628990392, + "learning_rate": 0.00026724844320587894, + "loss": 7.2124, + "step": 15356 + }, + { + "epoch": 1.4329569842306615, + "grad_norm": 866.2727914646612, + "learning_rate": 0.0002672437254531712, + "loss": 7.6159, + "step": 15357 + }, + { + "epoch": 1.4330502939255387, + "grad_norm": 2.34188694102967, + "learning_rate": 0.0002672390074023474, + "loss": 7.5768, + "step": 15358 + }, + { + "epoch": 1.4331436036204162, + "grad_norm": 2.082562811072106, + "learning_rate": 0.0002672342890534195, + "loss": 7.6517, + "step": 15359 + }, + { + "epoch": 1.4332369133152936, + "grad_norm": 1463.0039015963175, + "learning_rate": 0.0002672295704063996, + "loss": 7.5246, + "step": 15360 + }, + { + "epoch": 1.4333302230101708, + "grad_norm": 2.17556090495033, + "learning_rate": 0.00026722485146129957, + "loss": 8.0188, + "step": 15361 + }, + { + "epoch": 1.433423532705048, + "grad_norm": 1.679066135744164, + "learning_rate": 0.00026722013221813144, + "loss": 7.5198, + "step": 15362 + }, + { + "epoch": 1.4335168423999254, + "grad_norm": 1.1030760957935566, + "learning_rate": 0.00026721541267690725, + "loss": 7.5031, + "step": 15363 + }, + { + "epoch": 1.4336101520948026, + "grad_norm": 1.2013962699405836, + "learning_rate": 0.000267210692837639, + "loss": 7.241, + "step": 15364 + }, + { + "epoch": 1.43370346178968, + "grad_norm": 3.332261999910455, + "learning_rate": 0.00026720597270033875, + "loss": 7.6764, + "step": 15365 + }, + { + "epoch": 1.4337967714845572, + "grad_norm": 3.840633746446201, + "learning_rate": 0.0002672012522650184, + "loss": 7.5307, + "step": 15366 + }, + { + "epoch": 1.4338900811794346, + "grad_norm": 1.5902963389233182, + "learning_rate": 0.00026719653153168995, + "loss": 7.3311, + "step": 15367 + }, + { + "epoch": 1.4339833908743118, + "grad_norm": 1.1456136777992643, + "learning_rate": 0.0002671918105003654, + "loss": 7.3215, + "step": 15368 + }, + { + "epoch": 1.434076700569189, + "grad_norm": 916.313358263479, + "learning_rate": 0.0002671870891710568, + "loss": 7.475, + "step": 15369 + }, + { + "epoch": 1.4341700102640664, + "grad_norm": 2458.3632007456745, + "learning_rate": 0.0002671823675437762, + "loss": 7.5914, + "step": 15370 + }, + { + "epoch": 1.4342633199589438, + "grad_norm": 30.30107016507332, + "learning_rate": 0.00026717764561853553, + "loss": 7.236, + "step": 15371 + }, + { + "epoch": 1.434356629653821, + "grad_norm": 1.7014586153117046, + "learning_rate": 0.0002671729233953469, + "loss": 7.536, + "step": 15372 + }, + { + "epoch": 1.4344499393486982, + "grad_norm": 1.1103596448917077, + "learning_rate": 0.0002671682008742221, + "loss": 7.6506, + "step": 15373 + }, + { + "epoch": 1.4345432490435757, + "grad_norm": 1.057295534212403, + "learning_rate": 0.0002671634780551733, + "loss": 7.4899, + "step": 15374 + }, + { + "epoch": 1.4346365587384529, + "grad_norm": 1.08524273832631, + "learning_rate": 0.0002671587549382125, + "loss": 7.4588, + "step": 15375 + }, + { + "epoch": 1.4347298684333303, + "grad_norm": 2.8655065496309016, + "learning_rate": 0.00026715403152335177, + "loss": 7.5397, + "step": 15376 + }, + { + "epoch": 1.4348231781282075, + "grad_norm": 2.7304957404269223, + "learning_rate": 0.0002671493078106029, + "loss": 7.488, + "step": 15377 + }, + { + "epoch": 1.434916487823085, + "grad_norm": 1.430126217826853, + "learning_rate": 0.00026714458379997817, + "loss": 7.3771, + "step": 15378 + }, + { + "epoch": 1.435009797517962, + "grad_norm": 2.7825297547560295, + "learning_rate": 0.0002671398594914894, + "loss": 7.2652, + "step": 15379 + }, + { + "epoch": 1.4351031072128393, + "grad_norm": 3.000898438881924, + "learning_rate": 0.0002671351348851487, + "loss": 7.5459, + "step": 15380 + }, + { + "epoch": 1.4351964169077167, + "grad_norm": 1.3349937699159524, + "learning_rate": 0.000267130409980968, + "loss": 7.4593, + "step": 15381 + }, + { + "epoch": 1.4352897266025941, + "grad_norm": 3.174965337934764, + "learning_rate": 0.0002671256847789594, + "loss": 7.6051, + "step": 15382 + }, + { + "epoch": 1.4353830362974713, + "grad_norm": 1.3238252956387142, + "learning_rate": 0.00026712095927913484, + "loss": 7.2977, + "step": 15383 + }, + { + "epoch": 1.4354763459923485, + "grad_norm": 11789.539930433308, + "learning_rate": 0.0002671162334815064, + "loss": 7.3083, + "step": 15384 + }, + { + "epoch": 1.435569655687226, + "grad_norm": 0.9103498347911937, + "learning_rate": 0.0002671115073860861, + "loss": 7.4394, + "step": 15385 + }, + { + "epoch": 1.4356629653821031, + "grad_norm": 0.7979180410183371, + "learning_rate": 0.0002671067809928859, + "loss": 7.4293, + "step": 15386 + }, + { + "epoch": 1.4357562750769806, + "grad_norm": 6.240038033592746, + "learning_rate": 0.0002671020543019178, + "loss": 8.05, + "step": 15387 + }, + { + "epoch": 1.4358495847718578, + "grad_norm": 8.877835189176594, + "learning_rate": 0.00026709732731319396, + "loss": 7.5144, + "step": 15388 + }, + { + "epoch": 1.4359428944667352, + "grad_norm": 1.082447625185748, + "learning_rate": 0.00026709260002672625, + "loss": 7.3666, + "step": 15389 + }, + { + "epoch": 1.4360362041616124, + "grad_norm": 3.2572947710998914, + "learning_rate": 0.00026708787244252675, + "loss": 7.4947, + "step": 15390 + }, + { + "epoch": 1.4361295138564896, + "grad_norm": 4.396649426136806, + "learning_rate": 0.00026708314456060745, + "loss": 7.1233, + "step": 15391 + }, + { + "epoch": 1.436222823551367, + "grad_norm": 2.0939743781966897, + "learning_rate": 0.00026707841638098044, + "loss": 7.6123, + "step": 15392 + }, + { + "epoch": 1.4363161332462444, + "grad_norm": 2.2902494381703193, + "learning_rate": 0.00026707368790365764, + "loss": 7.4895, + "step": 15393 + }, + { + "epoch": 1.4364094429411216, + "grad_norm": 2.2517368951769994, + "learning_rate": 0.00026706895912865115, + "loss": 7.3277, + "step": 15394 + }, + { + "epoch": 1.4365027526359988, + "grad_norm": 2.775230244375862, + "learning_rate": 0.000267064230055973, + "loss": 7.3444, + "step": 15395 + }, + { + "epoch": 1.4365960623308762, + "grad_norm": 131963.66611232507, + "learning_rate": 0.0002670595006856352, + "loss": 7.5041, + "step": 15396 + }, + { + "epoch": 1.4366893720257534, + "grad_norm": 7.6296163843154545, + "learning_rate": 0.0002670547710176497, + "loss": 7.4091, + "step": 15397 + }, + { + "epoch": 1.4367826817206308, + "grad_norm": 4.056360297365554, + "learning_rate": 0.00026705004105202865, + "loss": 7.8092, + "step": 15398 + }, + { + "epoch": 1.436875991415508, + "grad_norm": 4.3462569700384925, + "learning_rate": 0.000267045310788784, + "loss": 7.555, + "step": 15399 + }, + { + "epoch": 1.4369693011103855, + "grad_norm": 4.2018791362319075, + "learning_rate": 0.00026704058022792784, + "loss": 7.5179, + "step": 15400 + }, + { + "epoch": 1.4370626108052627, + "grad_norm": 3.282299962498552, + "learning_rate": 0.00026703584936947216, + "loss": 7.3921, + "step": 15401 + }, + { + "epoch": 1.4371559205001398, + "grad_norm": 1613.69273685378, + "learning_rate": 0.0002670311182134289, + "loss": 7.3342, + "step": 15402 + }, + { + "epoch": 1.4372492301950173, + "grad_norm": 0.9123321855486766, + "learning_rate": 0.0002670263867598102, + "loss": 7.3666, + "step": 15403 + }, + { + "epoch": 1.4373425398898947, + "grad_norm": 1192.238068562422, + "learning_rate": 0.0002670216550086281, + "loss": 7.1555, + "step": 15404 + }, + { + "epoch": 1.4374358495847719, + "grad_norm": 833.6089765488249, + "learning_rate": 0.00026701692295989457, + "loss": 7.4474, + "step": 15405 + }, + { + "epoch": 1.437529159279649, + "grad_norm": 1516.4144835986324, + "learning_rate": 0.00026701219061362163, + "loss": 7.1409, + "step": 15406 + }, + { + "epoch": 1.4376224689745265, + "grad_norm": 1.6005681760925417, + "learning_rate": 0.00026700745796982145, + "loss": 7.5551, + "step": 15407 + }, + { + "epoch": 1.4377157786694037, + "grad_norm": 0.738836175046198, + "learning_rate": 0.00026700272502850584, + "loss": 7.1177, + "step": 15408 + }, + { + "epoch": 1.4378090883642811, + "grad_norm": 46.22346502146631, + "learning_rate": 0.0002669979917896871, + "loss": 7.1269, + "step": 15409 + }, + { + "epoch": 1.4379023980591583, + "grad_norm": 1325.0730442466577, + "learning_rate": 0.000266993258253377, + "loss": 7.3615, + "step": 15410 + }, + { + "epoch": 1.4379957077540357, + "grad_norm": 0.7960560863159676, + "learning_rate": 0.00026698852441958774, + "loss": 7.5388, + "step": 15411 + }, + { + "epoch": 1.438089017448913, + "grad_norm": 0.7039935359209883, + "learning_rate": 0.00026698379028833135, + "loss": 7.3557, + "step": 15412 + }, + { + "epoch": 1.4381823271437901, + "grad_norm": 0.6560347661481826, + "learning_rate": 0.0002669790558596198, + "loss": 7.6492, + "step": 15413 + }, + { + "epoch": 1.4382756368386675, + "grad_norm": 0.6208873166735864, + "learning_rate": 0.0002669743211334652, + "loss": 7.5187, + "step": 15414 + }, + { + "epoch": 1.438368946533545, + "grad_norm": 0.7702664049524185, + "learning_rate": 0.00026696958610987953, + "loss": 7.5057, + "step": 15415 + }, + { + "epoch": 1.4384622562284222, + "grad_norm": 1488.985191074875, + "learning_rate": 0.00026696485078887486, + "loss": 7.8755, + "step": 15416 + }, + { + "epoch": 1.4385555659232994, + "grad_norm": 0.7079849262182563, + "learning_rate": 0.0002669601151704632, + "loss": 7.3171, + "step": 15417 + }, + { + "epoch": 1.4386488756181768, + "grad_norm": 0.7500316525829019, + "learning_rate": 0.0002669553792546566, + "loss": 7.1713, + "step": 15418 + }, + { + "epoch": 1.438742185313054, + "grad_norm": 1.1822556696684208, + "learning_rate": 0.00026695064304146714, + "loss": 7.3937, + "step": 15419 + }, + { + "epoch": 1.4388354950079314, + "grad_norm": 2.6635098000981707, + "learning_rate": 0.00026694590653090686, + "loss": 7.5746, + "step": 15420 + }, + { + "epoch": 1.4389288047028086, + "grad_norm": 1.983331337691846, + "learning_rate": 0.00026694116972298776, + "loss": 7.5432, + "step": 15421 + }, + { + "epoch": 1.439022114397686, + "grad_norm": 2.0463737630503283, + "learning_rate": 0.0002669364326177219, + "loss": 7.2862, + "step": 15422 + }, + { + "epoch": 1.4391154240925632, + "grad_norm": 220.57069152990934, + "learning_rate": 0.00026693169521512135, + "loss": 7.4024, + "step": 15423 + }, + { + "epoch": 1.4392087337874404, + "grad_norm": 1.3112054495063112, + "learning_rate": 0.0002669269575151981, + "loss": 7.3372, + "step": 15424 + }, + { + "epoch": 1.4393020434823178, + "grad_norm": 1.4480563038797343, + "learning_rate": 0.00026692221951796425, + "loss": 7.5221, + "step": 15425 + }, + { + "epoch": 1.4393953531771952, + "grad_norm": 0.9291465097747058, + "learning_rate": 0.00026691748122343185, + "loss": 7.5037, + "step": 15426 + }, + { + "epoch": 1.4394886628720724, + "grad_norm": 0.6301766679330104, + "learning_rate": 0.0002669127426316129, + "loss": 7.6613, + "step": 15427 + }, + { + "epoch": 1.4395819725669496, + "grad_norm": 718.4568416144799, + "learning_rate": 0.00026690800374251953, + "loss": 6.9806, + "step": 15428 + }, + { + "epoch": 1.439675282261827, + "grad_norm": 1.4770483516300217, + "learning_rate": 0.0002669032645561637, + "loss": 7.4819, + "step": 15429 + }, + { + "epoch": 1.4397685919567043, + "grad_norm": 180.97327709257178, + "learning_rate": 0.00026689852507255743, + "loss": 7.5365, + "step": 15430 + }, + { + "epoch": 1.4398619016515817, + "grad_norm": 2.4866401456388063, + "learning_rate": 0.00026689378529171293, + "loss": 7.3164, + "step": 15431 + }, + { + "epoch": 1.4399552113464589, + "grad_norm": 1.7425238677526347, + "learning_rate": 0.00026688904521364216, + "loss": 7.5763, + "step": 15432 + }, + { + "epoch": 1.4400485210413363, + "grad_norm": 1.4962501925320681, + "learning_rate": 0.00026688430483835714, + "loss": 7.5493, + "step": 15433 + }, + { + "epoch": 1.4401418307362135, + "grad_norm": 148.87546365164766, + "learning_rate": 0.00026687956416587, + "loss": 7.2626, + "step": 15434 + }, + { + "epoch": 1.4402351404310907, + "grad_norm": 1.4145647132035668, + "learning_rate": 0.00026687482319619264, + "loss": 7.2587, + "step": 15435 + }, + { + "epoch": 1.440328450125968, + "grad_norm": 0.9554447421499944, + "learning_rate": 0.00026687008192933734, + "loss": 7.3759, + "step": 15436 + }, + { + "epoch": 1.4404217598208453, + "grad_norm": 0.7475939894623596, + "learning_rate": 0.000266865340365316, + "loss": 7.3539, + "step": 15437 + }, + { + "epoch": 1.4405150695157227, + "grad_norm": 0.9251368408729516, + "learning_rate": 0.00026686059850414073, + "loss": 7.3004, + "step": 15438 + }, + { + "epoch": 1.4406083792106, + "grad_norm": 1.0725068720039133, + "learning_rate": 0.0002668558563458236, + "loss": 7.2801, + "step": 15439 + }, + { + "epoch": 1.4407016889054773, + "grad_norm": 1.2774426670984402, + "learning_rate": 0.0002668511138903766, + "loss": 7.3006, + "step": 15440 + }, + { + "epoch": 1.4407949986003545, + "grad_norm": 0.8258898977303728, + "learning_rate": 0.00026684637113781183, + "loss": 7.1828, + "step": 15441 + }, + { + "epoch": 1.440888308295232, + "grad_norm": 460.15484028803354, + "learning_rate": 0.0002668416280881413, + "loss": 7.2348, + "step": 15442 + }, + { + "epoch": 1.4409816179901092, + "grad_norm": 0.9236704511904347, + "learning_rate": 0.0002668368847413772, + "loss": 7.1621, + "step": 15443 + }, + { + "epoch": 1.4410749276849866, + "grad_norm": 1.0030358546331206, + "learning_rate": 0.0002668321410975315, + "loss": 7.3624, + "step": 15444 + }, + { + "epoch": 1.4411682373798638, + "grad_norm": 0.7946857846820788, + "learning_rate": 0.00026682739715661627, + "loss": 7.2798, + "step": 15445 + }, + { + "epoch": 1.441261547074741, + "grad_norm": 1.585447489355427, + "learning_rate": 0.0002668226529186436, + "loss": 7.7287, + "step": 15446 + }, + { + "epoch": 1.4413548567696184, + "grad_norm": 211.31038057066445, + "learning_rate": 0.00026681790838362546, + "loss": 7.344, + "step": 15447 + }, + { + "epoch": 1.4414481664644956, + "grad_norm": 0.5186396976954226, + "learning_rate": 0.00026681316355157405, + "loss": 7.4542, + "step": 15448 + }, + { + "epoch": 1.441541476159373, + "grad_norm": 1.1022009446663306, + "learning_rate": 0.00026680841842250134, + "loss": 7.3281, + "step": 15449 + }, + { + "epoch": 1.4416347858542502, + "grad_norm": 1.198777163609606, + "learning_rate": 0.0002668036729964194, + "loss": 7.2939, + "step": 15450 + }, + { + "epoch": 1.4417280955491276, + "grad_norm": 1.1796036361794489, + "learning_rate": 0.00026679892727334036, + "loss": 7.3285, + "step": 15451 + }, + { + "epoch": 1.4418214052440048, + "grad_norm": 1.2750210079498567, + "learning_rate": 0.00026679418125327627, + "loss": 7.2224, + "step": 15452 + }, + { + "epoch": 1.441914714938882, + "grad_norm": 0.6180020209691867, + "learning_rate": 0.0002667894349362391, + "loss": 7.4157, + "step": 15453 + }, + { + "epoch": 1.4420080246337594, + "grad_norm": 0.5982803519326431, + "learning_rate": 0.00026678468832224105, + "loss": 7.4128, + "step": 15454 + }, + { + "epoch": 1.4421013343286369, + "grad_norm": 0.379125777284488, + "learning_rate": 0.0002667799414112941, + "loss": 7.1807, + "step": 15455 + }, + { + "epoch": 1.442194644023514, + "grad_norm": 180.04115371070327, + "learning_rate": 0.00026677519420341037, + "loss": 7.297, + "step": 15456 + }, + { + "epoch": 1.4422879537183912, + "grad_norm": 1.141404136942847, + "learning_rate": 0.0002667704466986019, + "loss": 7.4623, + "step": 15457 + }, + { + "epoch": 1.4423812634132687, + "grad_norm": 1.2806414251505558, + "learning_rate": 0.00026676569889688077, + "loss": 7.337, + "step": 15458 + }, + { + "epoch": 1.4424745731081459, + "grad_norm": 0.7950939578190199, + "learning_rate": 0.00026676095079825903, + "loss": 7.1064, + "step": 15459 + }, + { + "epoch": 1.4425678828030233, + "grad_norm": 1.082788272340195, + "learning_rate": 0.00026675620240274885, + "loss": 7.3081, + "step": 15460 + }, + { + "epoch": 1.4426611924979005, + "grad_norm": 0.6814825476953245, + "learning_rate": 0.0002667514537103622, + "loss": 7.2774, + "step": 15461 + }, + { + "epoch": 1.442754502192778, + "grad_norm": 0.725034484539482, + "learning_rate": 0.00026674670472111116, + "loss": 7.3846, + "step": 15462 + }, + { + "epoch": 1.442847811887655, + "grad_norm": 0.7450888136485873, + "learning_rate": 0.0002667419554350079, + "loss": 7.1623, + "step": 15463 + }, + { + "epoch": 1.4429411215825323, + "grad_norm": 0.5484109905841279, + "learning_rate": 0.00026673720585206436, + "loss": 7.6397, + "step": 15464 + }, + { + "epoch": 1.4430344312774097, + "grad_norm": 0.8521355084962675, + "learning_rate": 0.00026673245597229274, + "loss": 7.2479, + "step": 15465 + }, + { + "epoch": 1.4431277409722871, + "grad_norm": 0.8034674179978112, + "learning_rate": 0.000266727705795705, + "loss": 7.2627, + "step": 15466 + }, + { + "epoch": 1.4432210506671643, + "grad_norm": 0.5291836393139618, + "learning_rate": 0.00026672295532231333, + "loss": 7.4439, + "step": 15467 + }, + { + "epoch": 1.4433143603620415, + "grad_norm": 0.6835582664785144, + "learning_rate": 0.00026671820455212973, + "loss": 7.5146, + "step": 15468 + }, + { + "epoch": 1.443407670056919, + "grad_norm": 0.6383295758411586, + "learning_rate": 0.0002667134534851663, + "loss": 7.4424, + "step": 15469 + }, + { + "epoch": 1.4435009797517961, + "grad_norm": 0.45391235523124235, + "learning_rate": 0.00026670870212143515, + "loss": 7.2912, + "step": 15470 + }, + { + "epoch": 1.4435942894466736, + "grad_norm": 0.701266427322403, + "learning_rate": 0.00026670395046094836, + "loss": 7.0006, + "step": 15471 + }, + { + "epoch": 1.4436875991415508, + "grad_norm": 0.656316848319876, + "learning_rate": 0.000266699198503718, + "loss": 7.4083, + "step": 15472 + }, + { + "epoch": 1.4437809088364282, + "grad_norm": 0.9926647341996239, + "learning_rate": 0.0002666944462497561, + "loss": 7.1633, + "step": 15473 + }, + { + "epoch": 1.4438742185313054, + "grad_norm": 0.6852695516994926, + "learning_rate": 0.0002666896936990748, + "loss": 7.0687, + "step": 15474 + }, + { + "epoch": 1.4439675282261826, + "grad_norm": 0.6440455926238767, + "learning_rate": 0.0002666849408516862, + "loss": 7.0848, + "step": 15475 + }, + { + "epoch": 1.44406083792106, + "grad_norm": 1.5870797795687555, + "learning_rate": 0.0002666801877076024, + "loss": 7.3506, + "step": 15476 + }, + { + "epoch": 1.4441541476159374, + "grad_norm": 507.23773760823275, + "learning_rate": 0.0002666754342668354, + "loss": 7.239, + "step": 15477 + }, + { + "epoch": 1.4442474573108146, + "grad_norm": 0.5043349732444068, + "learning_rate": 0.00026667068052939737, + "loss": 7.3533, + "step": 15478 + }, + { + "epoch": 1.4443407670056918, + "grad_norm": 1.0824885112215499, + "learning_rate": 0.0002666659264953003, + "loss": 7.487, + "step": 15479 + }, + { + "epoch": 1.4444340767005692, + "grad_norm": 373.6378516118049, + "learning_rate": 0.00026666117216455633, + "loss": 7.4345, + "step": 15480 + }, + { + "epoch": 1.4445273863954464, + "grad_norm": 1.6727389632754672, + "learning_rate": 0.00026665641753717757, + "loss": 7.3519, + "step": 15481 + }, + { + "epoch": 1.4446206960903238, + "grad_norm": 0.7508102922841228, + "learning_rate": 0.0002666516626131761, + "loss": 7.6075, + "step": 15482 + }, + { + "epoch": 1.444714005785201, + "grad_norm": 91.57916359157318, + "learning_rate": 0.0002666469073925641, + "loss": 7.2553, + "step": 15483 + }, + { + "epoch": 1.4448073154800785, + "grad_norm": 0.7962953505741319, + "learning_rate": 0.0002666421518753535, + "loss": 7.4271, + "step": 15484 + }, + { + "epoch": 1.4449006251749557, + "grad_norm": 345.1844797974457, + "learning_rate": 0.0002666373960615564, + "loss": 6.9646, + "step": 15485 + }, + { + "epoch": 1.4449939348698329, + "grad_norm": 647.5724550850513, + "learning_rate": 0.000266632639951185, + "loss": 7.5295, + "step": 15486 + }, + { + "epoch": 1.4450872445647103, + "grad_norm": 1.930242939209243, + "learning_rate": 0.00026662788354425135, + "loss": 7.2996, + "step": 15487 + }, + { + "epoch": 1.4451805542595877, + "grad_norm": 3.5747308456025384, + "learning_rate": 0.00026662312684076756, + "loss": 7.6865, + "step": 15488 + }, + { + "epoch": 1.4452738639544649, + "grad_norm": 2.8829436272522186, + "learning_rate": 0.0002666183698407457, + "loss": 7.7962, + "step": 15489 + }, + { + "epoch": 1.445367173649342, + "grad_norm": 2.5589227218991573, + "learning_rate": 0.00026661361254419784, + "loss": 7.4482, + "step": 15490 + }, + { + "epoch": 1.4454604833442195, + "grad_norm": 7.8613240682104655, + "learning_rate": 0.00026660885495113614, + "loss": 7.3835, + "step": 15491 + }, + { + "epoch": 1.4455537930390967, + "grad_norm": 0.7692885169015803, + "learning_rate": 0.0002666040970615726, + "loss": 7.831, + "step": 15492 + }, + { + "epoch": 1.4456471027339741, + "grad_norm": 2.005967267076743, + "learning_rate": 0.00026659933887551946, + "loss": 7.5781, + "step": 15493 + }, + { + "epoch": 1.4457404124288513, + "grad_norm": 1.1351950957130643, + "learning_rate": 0.0002665945803929887, + "loss": 7.3981, + "step": 15494 + }, + { + "epoch": 1.4458337221237287, + "grad_norm": 1.3377749701575132, + "learning_rate": 0.0002665898216139924, + "loss": 7.6547, + "step": 15495 + }, + { + "epoch": 1.445927031818606, + "grad_norm": 1.1857391995011985, + "learning_rate": 0.0002665850625385428, + "loss": 7.3594, + "step": 15496 + }, + { + "epoch": 1.4460203415134831, + "grad_norm": 1.6308297093876811, + "learning_rate": 0.0002665803031666519, + "loss": 7.5836, + "step": 15497 + }, + { + "epoch": 1.4461136512083606, + "grad_norm": 1.392311511414085, + "learning_rate": 0.00026657554349833186, + "loss": 7.4312, + "step": 15498 + }, + { + "epoch": 1.446206960903238, + "grad_norm": 1.2152259498231415, + "learning_rate": 0.0002665707835335947, + "loss": 7.5197, + "step": 15499 + }, + { + "epoch": 1.4463002705981152, + "grad_norm": 0.7505287258916676, + "learning_rate": 0.0002665660232724526, + "loss": 7.4206, + "step": 15500 + }, + { + "epoch": 1.4463935802929924, + "grad_norm": 0.8637056423204463, + "learning_rate": 0.00026656126271491765, + "loss": 7.551, + "step": 15501 + }, + { + "epoch": 1.4464868899878698, + "grad_norm": 0.9137096802135235, + "learning_rate": 0.00026655650186100185, + "loss": 7.5275, + "step": 15502 + }, + { + "epoch": 1.446580199682747, + "grad_norm": 1.0637888637152435, + "learning_rate": 0.00026655174071071744, + "loss": 7.4109, + "step": 15503 + }, + { + "epoch": 1.4466735093776244, + "grad_norm": 0.8003674854998503, + "learning_rate": 0.0002665469792640765, + "loss": 7.5034, + "step": 15504 + }, + { + "epoch": 1.4467668190725016, + "grad_norm": 1.2029530895683445, + "learning_rate": 0.00026654221752109113, + "loss": 7.7461, + "step": 15505 + }, + { + "epoch": 1.446860128767379, + "grad_norm": 0.7461225796134248, + "learning_rate": 0.00026653745548177335, + "loss": 7.401, + "step": 15506 + }, + { + "epoch": 1.4469534384622562, + "grad_norm": 1.2267368573353388, + "learning_rate": 0.0002665326931461354, + "loss": 7.4301, + "step": 15507 + }, + { + "epoch": 1.4470467481571334, + "grad_norm": 1.1381537979706118, + "learning_rate": 0.0002665279305141893, + "loss": 7.7899, + "step": 15508 + }, + { + "epoch": 1.4471400578520108, + "grad_norm": 0.7735253957086711, + "learning_rate": 0.0002665231675859472, + "loss": 7.4134, + "step": 15509 + }, + { + "epoch": 1.4472333675468882, + "grad_norm": 0.692688345483737, + "learning_rate": 0.00026651840436142123, + "loss": 7.486, + "step": 15510 + }, + { + "epoch": 1.4473266772417654, + "grad_norm": 0.6919732129732751, + "learning_rate": 0.0002665136408406235, + "loss": 7.6283, + "step": 15511 + }, + { + "epoch": 1.4474199869366426, + "grad_norm": 1.2825189448213377, + "learning_rate": 0.000266508877023566, + "loss": 7.578, + "step": 15512 + }, + { + "epoch": 1.44751329663152, + "grad_norm": 0.7473379844621837, + "learning_rate": 0.000266504112910261, + "loss": 7.5294, + "step": 15513 + }, + { + "epoch": 1.4476066063263973, + "grad_norm": 0.9387335724990248, + "learning_rate": 0.00026649934850072053, + "loss": 7.8248, + "step": 15514 + }, + { + "epoch": 1.4476999160212747, + "grad_norm": 1.2220064198293006, + "learning_rate": 0.0002664945837949567, + "loss": 7.2569, + "step": 15515 + }, + { + "epoch": 1.4477932257161519, + "grad_norm": 0.7811099973293743, + "learning_rate": 0.0002664898187929817, + "loss": 7.6876, + "step": 15516 + }, + { + "epoch": 1.4478865354110293, + "grad_norm": 1.125392789122443, + "learning_rate": 0.00026648505349480755, + "loss": 7.2612, + "step": 15517 + }, + { + "epoch": 1.4479798451059065, + "grad_norm": 0.7602806803992546, + "learning_rate": 0.00026648028790044647, + "loss": 7.2122, + "step": 15518 + }, + { + "epoch": 1.4480731548007837, + "grad_norm": 1.0814106335063982, + "learning_rate": 0.0002664755220099105, + "loss": 7.3936, + "step": 15519 + }, + { + "epoch": 1.448166464495661, + "grad_norm": 1.8535612875031549, + "learning_rate": 0.00026647075582321173, + "loss": 7.4423, + "step": 15520 + }, + { + "epoch": 1.4482597741905385, + "grad_norm": 1.4539503578348383, + "learning_rate": 0.0002664659893403624, + "loss": 7.4283, + "step": 15521 + }, + { + "epoch": 1.4483530838854157, + "grad_norm": 1.3554712435658323, + "learning_rate": 0.00026646122256137456, + "loss": 7.5082, + "step": 15522 + }, + { + "epoch": 1.448446393580293, + "grad_norm": 1.2261252315021012, + "learning_rate": 0.0002664564554862603, + "loss": 7.4586, + "step": 15523 + }, + { + "epoch": 1.4485397032751703, + "grad_norm": 1.0040836775751245, + "learning_rate": 0.00026645168811503177, + "loss": 7.3042, + "step": 15524 + }, + { + "epoch": 1.4486330129700475, + "grad_norm": 0.9574889786570067, + "learning_rate": 0.0002664469204477011, + "loss": 7.5661, + "step": 15525 + }, + { + "epoch": 1.448726322664925, + "grad_norm": 1.4957433321250462, + "learning_rate": 0.0002664421524842804, + "loss": 7.2569, + "step": 15526 + }, + { + "epoch": 1.4488196323598022, + "grad_norm": 0.988501481401285, + "learning_rate": 0.00026643738422478183, + "loss": 7.5099, + "step": 15527 + }, + { + "epoch": 1.4489129420546796, + "grad_norm": 0.6971747057172429, + "learning_rate": 0.0002664326156692174, + "loss": 7.4061, + "step": 15528 + }, + { + "epoch": 1.4490062517495568, + "grad_norm": 0.9739220522032561, + "learning_rate": 0.00026642784681759946, + "loss": 7.3237, + "step": 15529 + }, + { + "epoch": 1.449099561444434, + "grad_norm": 0.7533374680070705, + "learning_rate": 0.0002664230776699398, + "loss": 7.3893, + "step": 15530 + }, + { + "epoch": 1.4491928711393114, + "grad_norm": 1.199401130223618, + "learning_rate": 0.0002664183082262509, + "loss": 7.4902, + "step": 15531 + }, + { + "epoch": 1.4492861808341888, + "grad_norm": 0.7857112595553857, + "learning_rate": 0.0002664135384865447, + "loss": 7.3511, + "step": 15532 + }, + { + "epoch": 1.449379490529066, + "grad_norm": 1.4284454262557016, + "learning_rate": 0.0002664087684508333, + "loss": 7.643, + "step": 15533 + }, + { + "epoch": 1.4494728002239432, + "grad_norm": 0.8753452327523968, + "learning_rate": 0.0002664039981191289, + "loss": 7.5071, + "step": 15534 + }, + { + "epoch": 1.4495661099188206, + "grad_norm": 1.0332648609280228, + "learning_rate": 0.00026639922749144366, + "loss": 7.4114, + "step": 15535 + }, + { + "epoch": 1.4496594196136978, + "grad_norm": 0.8300572179708864, + "learning_rate": 0.00026639445656778964, + "loss": 7.3989, + "step": 15536 + }, + { + "epoch": 1.4497527293085752, + "grad_norm": 0.9759640645962522, + "learning_rate": 0.000266389685348179, + "loss": 7.4457, + "step": 15537 + }, + { + "epoch": 1.4498460390034524, + "grad_norm": 1.2312648219419948, + "learning_rate": 0.0002663849138326239, + "loss": 7.317, + "step": 15538 + }, + { + "epoch": 1.4499393486983299, + "grad_norm": 0.8494854682442472, + "learning_rate": 0.0002663801420211363, + "loss": 7.4671, + "step": 15539 + }, + { + "epoch": 1.450032658393207, + "grad_norm": 0.9113270086454853, + "learning_rate": 0.0002663753699137286, + "loss": 7.5751, + "step": 15540 + }, + { + "epoch": 1.4501259680880842, + "grad_norm": 0.8533721300390497, + "learning_rate": 0.00026637059751041277, + "loss": 7.5235, + "step": 15541 + }, + { + "epoch": 1.4502192777829617, + "grad_norm": 0.631214141646683, + "learning_rate": 0.000266365824811201, + "loss": 7.3713, + "step": 15542 + }, + { + "epoch": 1.4503125874778389, + "grad_norm": 1.2287106616844576, + "learning_rate": 0.00026636105181610545, + "loss": 7.5562, + "step": 15543 + }, + { + "epoch": 1.4504058971727163, + "grad_norm": 1.1429172723062142, + "learning_rate": 0.0002663562785251381, + "loss": 7.6674, + "step": 15544 + }, + { + "epoch": 1.4504992068675935, + "grad_norm": 1.0019634847102126, + "learning_rate": 0.0002663515049383113, + "loss": 7.0355, + "step": 15545 + }, + { + "epoch": 1.450592516562471, + "grad_norm": 0.643922016001472, + "learning_rate": 0.000266346731055637, + "loss": 7.5477, + "step": 15546 + }, + { + "epoch": 1.450685826257348, + "grad_norm": 0.6676651248092854, + "learning_rate": 0.0002663419568771275, + "loss": 7.6335, + "step": 15547 + }, + { + "epoch": 1.4507791359522255, + "grad_norm": 0.8205554368034389, + "learning_rate": 0.00026633718240279486, + "loss": 7.5718, + "step": 15548 + }, + { + "epoch": 1.4508724456471027, + "grad_norm": 0.7879823496630294, + "learning_rate": 0.0002663324076326512, + "loss": 7.7955, + "step": 15549 + }, + { + "epoch": 1.4509657553419801, + "grad_norm": 1.011850207226443, + "learning_rate": 0.00026632763256670866, + "loss": 7.5094, + "step": 15550 + }, + { + "epoch": 1.4510590650368573, + "grad_norm": 0.7617937824492241, + "learning_rate": 0.0002663228572049795, + "loss": 7.694, + "step": 15551 + }, + { + "epoch": 1.4511523747317345, + "grad_norm": 0.8788717315771563, + "learning_rate": 0.00026631808154747573, + "loss": 7.6813, + "step": 15552 + }, + { + "epoch": 1.451245684426612, + "grad_norm": 1.034800945420104, + "learning_rate": 0.0002663133055942095, + "loss": 7.3151, + "step": 15553 + }, + { + "epoch": 1.4513389941214891, + "grad_norm": 0.8011143047688416, + "learning_rate": 0.00026630852934519304, + "loss": 7.5126, + "step": 15554 + }, + { + "epoch": 1.4514323038163666, + "grad_norm": 1.2915140640534357, + "learning_rate": 0.0002663037528004384, + "loss": 8.0839, + "step": 15555 + }, + { + "epoch": 1.4515256135112438, + "grad_norm": 0.9552929190673288, + "learning_rate": 0.0002662989759599578, + "loss": 7.548, + "step": 15556 + }, + { + "epoch": 1.4516189232061212, + "grad_norm": 1.6178480785074152, + "learning_rate": 0.0002662941988237633, + "loss": 7.7372, + "step": 15557 + }, + { + "epoch": 1.4517122329009984, + "grad_norm": 1.2679949281160483, + "learning_rate": 0.00026628942139186716, + "loss": 7.1905, + "step": 15558 + }, + { + "epoch": 1.4518055425958756, + "grad_norm": 0.8661024907275388, + "learning_rate": 0.0002662846436642815, + "loss": 7.4856, + "step": 15559 + }, + { + "epoch": 1.451898852290753, + "grad_norm": 0.7456356729030326, + "learning_rate": 0.00026627986564101836, + "loss": 7.4756, + "step": 15560 + }, + { + "epoch": 1.4519921619856304, + "grad_norm": 0.6819077965412098, + "learning_rate": 0.00026627508732208994, + "loss": 7.3231, + "step": 15561 + }, + { + "epoch": 1.4520854716805076, + "grad_norm": 0.8557566938403776, + "learning_rate": 0.00026627030870750854, + "loss": 7.5283, + "step": 15562 + }, + { + "epoch": 1.4521787813753848, + "grad_norm": 0.9492767580274659, + "learning_rate": 0.0002662655297972861, + "loss": 7.4917, + "step": 15563 + }, + { + "epoch": 1.4522720910702622, + "grad_norm": 0.5169149593491458, + "learning_rate": 0.00026626075059143486, + "loss": 7.3492, + "step": 15564 + }, + { + "epoch": 1.4523654007651394, + "grad_norm": 0.8833399010507846, + "learning_rate": 0.000266255971089967, + "loss": 7.2502, + "step": 15565 + }, + { + "epoch": 1.4524587104600168, + "grad_norm": 0.605912002131334, + "learning_rate": 0.0002662511912928946, + "loss": 7.3534, + "step": 15566 + }, + { + "epoch": 1.452552020154894, + "grad_norm": 0.6853137204058072, + "learning_rate": 0.0002662464112002299, + "loss": 7.2834, + "step": 15567 + }, + { + "epoch": 1.4526453298497715, + "grad_norm": 1.0111518886711417, + "learning_rate": 0.000266241630811985, + "loss": 7.7834, + "step": 15568 + }, + { + "epoch": 1.4527386395446487, + "grad_norm": 0.7052118999324881, + "learning_rate": 0.00026623685012817206, + "loss": 7.4785, + "step": 15569 + }, + { + "epoch": 1.4528319492395259, + "grad_norm": 0.9375654066718999, + "learning_rate": 0.00026623206914880326, + "loss": 7.5018, + "step": 15570 + }, + { + "epoch": 1.4529252589344033, + "grad_norm": 0.5895660864249386, + "learning_rate": 0.0002662272878738907, + "loss": 7.3958, + "step": 15571 + }, + { + "epoch": 1.4530185686292807, + "grad_norm": 0.8943036067411769, + "learning_rate": 0.0002662225063034466, + "loss": 7.6266, + "step": 15572 + }, + { + "epoch": 1.453111878324158, + "grad_norm": 0.7074541726830863, + "learning_rate": 0.0002662177244374831, + "loss": 7.4542, + "step": 15573 + }, + { + "epoch": 1.453205188019035, + "grad_norm": 0.695499475114715, + "learning_rate": 0.0002662129422760123, + "loss": 7.4547, + "step": 15574 + }, + { + "epoch": 1.4532984977139125, + "grad_norm": 0.7709462744116773, + "learning_rate": 0.0002662081598190465, + "loss": 7.3936, + "step": 15575 + }, + { + "epoch": 1.4533918074087897, + "grad_norm": 0.8160540553569771, + "learning_rate": 0.00026620337706659774, + "loss": 7.3868, + "step": 15576 + }, + { + "epoch": 1.4534851171036671, + "grad_norm": 0.9578517515213318, + "learning_rate": 0.0002661985940186782, + "loss": 7.6816, + "step": 15577 + }, + { + "epoch": 1.4535784267985443, + "grad_norm": 0.48971031508396035, + "learning_rate": 0.0002661938106753, + "loss": 7.2758, + "step": 15578 + }, + { + "epoch": 1.4536717364934217, + "grad_norm": 1.9152711541684966, + "learning_rate": 0.00026618902703647544, + "loss": 7.7414, + "step": 15579 + }, + { + "epoch": 1.453765046188299, + "grad_norm": 0.5268776224775565, + "learning_rate": 0.0002661842431022166, + "loss": 7.0973, + "step": 15580 + }, + { + "epoch": 1.4538583558831761, + "grad_norm": 0.5683251751187366, + "learning_rate": 0.00026617945887253563, + "loss": 7.1878, + "step": 15581 + }, + { + "epoch": 1.4539516655780536, + "grad_norm": 0.8304567755179083, + "learning_rate": 0.0002661746743474447, + "loss": 7.5248, + "step": 15582 + }, + { + "epoch": 1.454044975272931, + "grad_norm": 0.7622594850218974, + "learning_rate": 0.000266169889526956, + "loss": 7.2836, + "step": 15583 + }, + { + "epoch": 1.4541382849678082, + "grad_norm": 0.5374607821098571, + "learning_rate": 0.0002661651044110817, + "loss": 7.4602, + "step": 15584 + }, + { + "epoch": 1.4542315946626854, + "grad_norm": 1.0122841219503103, + "learning_rate": 0.0002661603189998339, + "loss": 7.3633, + "step": 15585 + }, + { + "epoch": 1.4543249043575628, + "grad_norm": 0.7265006663663727, + "learning_rate": 0.00026615553329322486, + "loss": 7.3008, + "step": 15586 + }, + { + "epoch": 1.45441821405244, + "grad_norm": 0.5577037693717563, + "learning_rate": 0.00026615074729126673, + "loss": 7.215, + "step": 15587 + }, + { + "epoch": 1.4545115237473174, + "grad_norm": 0.8405413412271541, + "learning_rate": 0.0002661459609939716, + "loss": 7.2641, + "step": 15588 + }, + { + "epoch": 1.4546048334421946, + "grad_norm": 2.102959454225036, + "learning_rate": 0.00026614117440135166, + "loss": 7.8175, + "step": 15589 + }, + { + "epoch": 1.454698143137072, + "grad_norm": 0.6285841542363871, + "learning_rate": 0.0002661363875134192, + "loss": 7.1773, + "step": 15590 + }, + { + "epoch": 1.4547914528319492, + "grad_norm": 0.9220488097593889, + "learning_rate": 0.0002661316003301863, + "loss": 7.0819, + "step": 15591 + }, + { + "epoch": 1.4548847625268264, + "grad_norm": 0.6430180692573036, + "learning_rate": 0.00026612681285166515, + "loss": 7.1896, + "step": 15592 + }, + { + "epoch": 1.4549780722217038, + "grad_norm": 0.673722397264029, + "learning_rate": 0.0002661220250778678, + "loss": 7.2275, + "step": 15593 + }, + { + "epoch": 1.4550713819165813, + "grad_norm": 0.6219420725457707, + "learning_rate": 0.0002661172370088067, + "loss": 7.6095, + "step": 15594 + }, + { + "epoch": 1.4551646916114584, + "grad_norm": 0.8805808702958592, + "learning_rate": 0.0002661124486444937, + "loss": 7.2089, + "step": 15595 + }, + { + "epoch": 1.4552580013063356, + "grad_norm": 0.8895543789121333, + "learning_rate": 0.0002661076599849413, + "loss": 7.6935, + "step": 15596 + }, + { + "epoch": 1.455351311001213, + "grad_norm": 0.8615207914995113, + "learning_rate": 0.0002661028710301614, + "loss": 7.1714, + "step": 15597 + }, + { + "epoch": 1.4554446206960903, + "grad_norm": 0.6215564360971481, + "learning_rate": 0.00026609808178016634, + "loss": 7.4656, + "step": 15598 + }, + { + "epoch": 1.4555379303909677, + "grad_norm": 0.45553588576600257, + "learning_rate": 0.00026609329223496823, + "loss": 7.1606, + "step": 15599 + }, + { + "epoch": 1.4556312400858449, + "grad_norm": 0.4866525229506166, + "learning_rate": 0.0002660885023945793, + "loss": 7.0909, + "step": 15600 + }, + { + "epoch": 1.4557245497807223, + "grad_norm": 1.2449375553713888, + "learning_rate": 0.00026608371225901166, + "loss": 7.3669, + "step": 15601 + }, + { + "epoch": 1.4558178594755995, + "grad_norm": 0.9708747769120472, + "learning_rate": 0.00026607892182827755, + "loss": 7.3193, + "step": 15602 + }, + { + "epoch": 1.4559111691704767, + "grad_norm": 0.8985105624659749, + "learning_rate": 0.00026607413110238913, + "loss": 7.561, + "step": 15603 + }, + { + "epoch": 1.4560044788653541, + "grad_norm": 0.7262724635985717, + "learning_rate": 0.00026606934008135854, + "loss": 7.3504, + "step": 15604 + }, + { + "epoch": 1.4560977885602315, + "grad_norm": 0.523651509320514, + "learning_rate": 0.00026606454876519804, + "loss": 7.6333, + "step": 15605 + }, + { + "epoch": 1.4561910982551087, + "grad_norm": 2.188735591736607, + "learning_rate": 0.0002660597571539197, + "loss": 7.2543, + "step": 15606 + }, + { + "epoch": 1.456284407949986, + "grad_norm": 2.3125568492267288, + "learning_rate": 0.00026605496524753587, + "loss": 7.2598, + "step": 15607 + }, + { + "epoch": 1.4563777176448633, + "grad_norm": 0.5845970025357045, + "learning_rate": 0.0002660501730460586, + "loss": 7.0184, + "step": 15608 + }, + { + "epoch": 1.4564710273397405, + "grad_norm": 2.861099987714141, + "learning_rate": 0.00026604538054950013, + "loss": 7.8377, + "step": 15609 + }, + { + "epoch": 1.456564337034618, + "grad_norm": 2.5733163355610813, + "learning_rate": 0.00026604058775787264, + "loss": 7.5162, + "step": 15610 + }, + { + "epoch": 1.4566576467294952, + "grad_norm": 2.5440896799717994, + "learning_rate": 0.00026603579467118825, + "loss": 7.7399, + "step": 15611 + }, + { + "epoch": 1.4567509564243726, + "grad_norm": 1.2554622733611618, + "learning_rate": 0.0002660310012894593, + "loss": 7.5437, + "step": 15612 + }, + { + "epoch": 1.4568442661192498, + "grad_norm": 0.7910918944721149, + "learning_rate": 0.0002660262076126978, + "loss": 7.6431, + "step": 15613 + }, + { + "epoch": 1.456937575814127, + "grad_norm": 1.7163022771596643, + "learning_rate": 0.00026602141364091614, + "loss": 7.2642, + "step": 15614 + }, + { + "epoch": 1.4570308855090044, + "grad_norm": 1.1320301265104964, + "learning_rate": 0.0002660166193741263, + "loss": 7.6081, + "step": 15615 + }, + { + "epoch": 1.4571241952038818, + "grad_norm": 1.899688216249439, + "learning_rate": 0.0002660118248123406, + "loss": 7.4706, + "step": 15616 + }, + { + "epoch": 1.457217504898759, + "grad_norm": 2.045477421813437, + "learning_rate": 0.00026600702995557113, + "loss": 7.0763, + "step": 15617 + }, + { + "epoch": 1.4573108145936362, + "grad_norm": 0.5643196556355806, + "learning_rate": 0.0002660022348038302, + "loss": 7.4085, + "step": 15618 + }, + { + "epoch": 1.4574041242885136, + "grad_norm": 0.6385919328697497, + "learning_rate": 0.0002659974393571299, + "loss": 7.1646, + "step": 15619 + }, + { + "epoch": 1.4574974339833908, + "grad_norm": 30.39980817309505, + "learning_rate": 0.0002659926436154825, + "loss": 7.1616, + "step": 15620 + }, + { + "epoch": 1.4575907436782682, + "grad_norm": 1.1478827982663835, + "learning_rate": 0.00026598784757890016, + "loss": 7.3513, + "step": 15621 + }, + { + "epoch": 1.4576840533731454, + "grad_norm": 0.6280551473518226, + "learning_rate": 0.0002659830512473951, + "loss": 7.2244, + "step": 15622 + }, + { + "epoch": 1.4577773630680229, + "grad_norm": 0.48771065893410187, + "learning_rate": 0.00026597825462097957, + "loss": 7.4844, + "step": 15623 + }, + { + "epoch": 1.4578706727629, + "grad_norm": 19.047422957117202, + "learning_rate": 0.00026597345769966557, + "loss": 7.0171, + "step": 15624 + }, + { + "epoch": 1.4579639824577773, + "grad_norm": 1.0977298424397692, + "learning_rate": 0.00026596866048346544, + "loss": 7.7454, + "step": 15625 + }, + { + "epoch": 1.4580572921526547, + "grad_norm": 213.4210021663757, + "learning_rate": 0.00026596386297239144, + "loss": 7.4631, + "step": 15626 + }, + { + "epoch": 1.458150601847532, + "grad_norm": 2.2488154356991483, + "learning_rate": 0.00026595906516645564, + "loss": 7.1673, + "step": 15627 + }, + { + "epoch": 1.4582439115424093, + "grad_norm": 3.500146558736176, + "learning_rate": 0.00026595426706567025, + "loss": 7.5297, + "step": 15628 + }, + { + "epoch": 1.4583372212372865, + "grad_norm": 3.7233236617808037, + "learning_rate": 0.00026594946867004757, + "loss": 7.3136, + "step": 15629 + }, + { + "epoch": 1.458430530932164, + "grad_norm": 4.171573979094739, + "learning_rate": 0.0002659446699795997, + "loss": 7.6134, + "step": 15630 + }, + { + "epoch": 1.458523840627041, + "grad_norm": 4.018171833195917, + "learning_rate": 0.00026593987099433886, + "loss": 7.6415, + "step": 15631 + }, + { + "epoch": 1.4586171503219185, + "grad_norm": 1302.7708476192183, + "learning_rate": 0.00026593507171427733, + "loss": 7.9065, + "step": 15632 + }, + { + "epoch": 1.4587104600167957, + "grad_norm": 2.2086667405933844, + "learning_rate": 0.0002659302721394272, + "loss": 7.5382, + "step": 15633 + }, + { + "epoch": 1.4588037697116731, + "grad_norm": 510.6598871830504, + "learning_rate": 0.00026592547226980077, + "loss": 7.5957, + "step": 15634 + }, + { + "epoch": 1.4588970794065503, + "grad_norm": 1.6837275918175458, + "learning_rate": 0.0002659206721054102, + "loss": 7.8568, + "step": 15635 + }, + { + "epoch": 1.4589903891014275, + "grad_norm": 1.5721493019555106, + "learning_rate": 0.0002659158716462677, + "loss": 7.7099, + "step": 15636 + }, + { + "epoch": 1.459083698796305, + "grad_norm": 95.42188243901055, + "learning_rate": 0.0002659110708923854, + "loss": 7.8661, + "step": 15637 + }, + { + "epoch": 1.4591770084911824, + "grad_norm": 2.5766043016521403, + "learning_rate": 0.0002659062698437757, + "loss": 7.673, + "step": 15638 + }, + { + "epoch": 1.4592703181860596, + "grad_norm": 2.5793601626164038, + "learning_rate": 0.0002659014685004506, + "loss": 7.655, + "step": 15639 + }, + { + "epoch": 1.4593636278809368, + "grad_norm": 2.1809542268846416, + "learning_rate": 0.00026589666686242244, + "loss": 7.6063, + "step": 15640 + }, + { + "epoch": 1.4594569375758142, + "grad_norm": 1.29502955038545, + "learning_rate": 0.0002658918649297034, + "loss": 7.2978, + "step": 15641 + }, + { + "epoch": 1.4595502472706914, + "grad_norm": 0.5377446215743581, + "learning_rate": 0.00026588706270230567, + "loss": 7.3698, + "step": 15642 + }, + { + "epoch": 1.4596435569655688, + "grad_norm": 1.7423591055140488, + "learning_rate": 0.00026588226018024145, + "loss": 7.5166, + "step": 15643 + }, + { + "epoch": 1.459736866660446, + "grad_norm": 2.1032409736386923, + "learning_rate": 0.00026587745736352295, + "loss": 7.4503, + "step": 15644 + }, + { + "epoch": 1.4598301763553234, + "grad_norm": 2.1207838581290335, + "learning_rate": 0.00026587265425216243, + "loss": 7.1799, + "step": 15645 + }, + { + "epoch": 1.4599234860502006, + "grad_norm": 2.058816173044046, + "learning_rate": 0.0002658678508461721, + "loss": 7.7107, + "step": 15646 + }, + { + "epoch": 1.4600167957450778, + "grad_norm": 0.9249551698969889, + "learning_rate": 0.00026586304714556406, + "loss": 7.2029, + "step": 15647 + }, + { + "epoch": 1.4601101054399552, + "grad_norm": 2.6656717603662465, + "learning_rate": 0.0002658582431503507, + "loss": 7.2302, + "step": 15648 + }, + { + "epoch": 1.4602034151348324, + "grad_norm": 2.0275942079893587, + "learning_rate": 0.00026585343886054413, + "loss": 7.6947, + "step": 15649 + }, + { + "epoch": 1.4602967248297098, + "grad_norm": 1.35297081812959, + "learning_rate": 0.00026584863427615656, + "loss": 7.5171, + "step": 15650 + }, + { + "epoch": 1.460390034524587, + "grad_norm": 2.311707730715343, + "learning_rate": 0.0002658438293972002, + "loss": 7.2324, + "step": 15651 + }, + { + "epoch": 1.4604833442194645, + "grad_norm": 2.1167563169586807, + "learning_rate": 0.0002658390242236874, + "loss": 7.396, + "step": 15652 + }, + { + "epoch": 1.4605766539143417, + "grad_norm": 1.4264240803261885, + "learning_rate": 0.0002658342187556302, + "loss": 7.5259, + "step": 15653 + }, + { + "epoch": 1.460669963609219, + "grad_norm": 1.0379551750513758, + "learning_rate": 0.0002658294129930409, + "loss": 7.3551, + "step": 15654 + }, + { + "epoch": 1.4607632733040963, + "grad_norm": 0.6370242368186957, + "learning_rate": 0.0002658246069359317, + "loss": 7.3123, + "step": 15655 + }, + { + "epoch": 1.4608565829989737, + "grad_norm": 0.8558597200506657, + "learning_rate": 0.00026581980058431486, + "loss": 7.4821, + "step": 15656 + }, + { + "epoch": 1.460949892693851, + "grad_norm": 1.3576293569693803, + "learning_rate": 0.0002658149939382026, + "loss": 7.5038, + "step": 15657 + }, + { + "epoch": 1.461043202388728, + "grad_norm": 3.975170797644362, + "learning_rate": 0.0002658101869976071, + "loss": 7.7737, + "step": 15658 + }, + { + "epoch": 1.4611365120836055, + "grad_norm": 1.6879855763698457, + "learning_rate": 0.00026580537976254057, + "loss": 7.6903, + "step": 15659 + }, + { + "epoch": 1.4612298217784827, + "grad_norm": 0.8333665720472706, + "learning_rate": 0.00026580057223301527, + "loss": 7.248, + "step": 15660 + }, + { + "epoch": 1.4613231314733601, + "grad_norm": 0.40876167883378345, + "learning_rate": 0.0002657957644090434, + "loss": 7.4369, + "step": 15661 + }, + { + "epoch": 1.4614164411682373, + "grad_norm": 1.0407259752720246, + "learning_rate": 0.00026579095629063724, + "loss": 7.2229, + "step": 15662 + }, + { + "epoch": 1.4615097508631147, + "grad_norm": 1.239037519520222, + "learning_rate": 0.00026578614787780896, + "loss": 7.261, + "step": 15663 + }, + { + "epoch": 1.461603060557992, + "grad_norm": 1.1083994727044364, + "learning_rate": 0.00026578133917057084, + "loss": 7.2801, + "step": 15664 + }, + { + "epoch": 1.4616963702528691, + "grad_norm": 0.46239039399988646, + "learning_rate": 0.000265776530168935, + "loss": 7.8075, + "step": 15665 + }, + { + "epoch": 1.4617896799477466, + "grad_norm": 0.7360560968217601, + "learning_rate": 0.00026577172087291387, + "loss": 7.3909, + "step": 15666 + }, + { + "epoch": 1.461882989642624, + "grad_norm": 0.6940568827543899, + "learning_rate": 0.00026576691128251946, + "loss": 7.6263, + "step": 15667 + }, + { + "epoch": 1.4619762993375012, + "grad_norm": 1.3362288336804602, + "learning_rate": 0.00026576210139776405, + "loss": 7.4438, + "step": 15668 + }, + { + "epoch": 1.4620696090323784, + "grad_norm": 1.088388363402078, + "learning_rate": 0.00026575729121865996, + "loss": 7.235, + "step": 15669 + }, + { + "epoch": 1.4621629187272558, + "grad_norm": 0.9376557479764902, + "learning_rate": 0.0002657524807452193, + "loss": 7.019, + "step": 15670 + }, + { + "epoch": 1.462256228422133, + "grad_norm": 0.6973170364150518, + "learning_rate": 0.00026574766997745447, + "loss": 7.4361, + "step": 15671 + }, + { + "epoch": 1.4623495381170104, + "grad_norm": 0.8685880197549566, + "learning_rate": 0.00026574285891537753, + "loss": 7.3752, + "step": 15672 + }, + { + "epoch": 1.4624428478118876, + "grad_norm": 0.8830584114599929, + "learning_rate": 0.00026573804755900085, + "loss": 7.4364, + "step": 15673 + }, + { + "epoch": 1.462536157506765, + "grad_norm": 2.943047512631545, + "learning_rate": 0.0002657332359083366, + "loss": 7.4969, + "step": 15674 + }, + { + "epoch": 1.4626294672016422, + "grad_norm": 0.5389120987393569, + "learning_rate": 0.00026572842396339696, + "loss": 7.3355, + "step": 15675 + }, + { + "epoch": 1.4627227768965194, + "grad_norm": 0.4504370154390781, + "learning_rate": 0.0002657236117241942, + "loss": 7.3444, + "step": 15676 + }, + { + "epoch": 1.4628160865913968, + "grad_norm": 0.4138880307508355, + "learning_rate": 0.00026571879919074057, + "loss": 7.3537, + "step": 15677 + }, + { + "epoch": 1.4629093962862743, + "grad_norm": 0.6947006792194774, + "learning_rate": 0.0002657139863630484, + "loss": 7.4123, + "step": 15678 + }, + { + "epoch": 1.4630027059811515, + "grad_norm": 0.5267168159344539, + "learning_rate": 0.0002657091732411298, + "loss": 7.5999, + "step": 15679 + }, + { + "epoch": 1.4630960156760286, + "grad_norm": 1.3484196416408298, + "learning_rate": 0.000265704359824997, + "loss": 7.3166, + "step": 15680 + }, + { + "epoch": 1.463189325370906, + "grad_norm": 0.8708369336131654, + "learning_rate": 0.0002656995461146624, + "loss": 7.7781, + "step": 15681 + }, + { + "epoch": 1.4632826350657833, + "grad_norm": 0.44218956560306993, + "learning_rate": 0.00026569473211013805, + "loss": 7.4548, + "step": 15682 + }, + { + "epoch": 1.4633759447606607, + "grad_norm": 0.5536061778349307, + "learning_rate": 0.0002656899178114362, + "loss": 7.5468, + "step": 15683 + }, + { + "epoch": 1.4634692544555379, + "grad_norm": 0.9108399862984018, + "learning_rate": 0.00026568510321856926, + "loss": 7.1476, + "step": 15684 + }, + { + "epoch": 1.4635625641504153, + "grad_norm": 0.791170378459658, + "learning_rate": 0.00026568028833154935, + "loss": 7.5358, + "step": 15685 + }, + { + "epoch": 1.4636558738452925, + "grad_norm": 1.0873917326673588, + "learning_rate": 0.0002656754731503887, + "loss": 7.6117, + "step": 15686 + }, + { + "epoch": 1.4637491835401697, + "grad_norm": 0.755920439972365, + "learning_rate": 0.0002656706576750996, + "loss": 7.5359, + "step": 15687 + }, + { + "epoch": 1.4638424932350471, + "grad_norm": 1.0111893655612836, + "learning_rate": 0.0002656658419056943, + "loss": 7.7423, + "step": 15688 + }, + { + "epoch": 1.4639358029299245, + "grad_norm": 0.7011406253015653, + "learning_rate": 0.00026566102584218504, + "loss": 7.3109, + "step": 15689 + }, + { + "epoch": 1.4640291126248017, + "grad_norm": 1.3913271008947559, + "learning_rate": 0.00026565620948458405, + "loss": 7.1489, + "step": 15690 + }, + { + "epoch": 1.464122422319679, + "grad_norm": 0.4562729623064325, + "learning_rate": 0.0002656513928329035, + "loss": 7.5877, + "step": 15691 + }, + { + "epoch": 1.4642157320145563, + "grad_norm": 0.5749119005160226, + "learning_rate": 0.00026564657588715586, + "loss": 7.7172, + "step": 15692 + }, + { + "epoch": 1.4643090417094335, + "grad_norm": 0.8674381170052415, + "learning_rate": 0.0002656417586473531, + "loss": 7.1533, + "step": 15693 + }, + { + "epoch": 1.464402351404311, + "grad_norm": 0.895653036369705, + "learning_rate": 0.0002656369411135077, + "loss": 7.7503, + "step": 15694 + }, + { + "epoch": 1.4644956610991882, + "grad_norm": 0.4564761128174814, + "learning_rate": 0.0002656321232856317, + "loss": 7.6102, + "step": 15695 + }, + { + "epoch": 1.4645889707940656, + "grad_norm": 0.4895553367688128, + "learning_rate": 0.0002656273051637376, + "loss": 7.4963, + "step": 15696 + }, + { + "epoch": 1.4646822804889428, + "grad_norm": 0.8649484910402534, + "learning_rate": 0.00026562248674783743, + "loss": 7.6611, + "step": 15697 + }, + { + "epoch": 1.46477559018382, + "grad_norm": 0.9390665721096885, + "learning_rate": 0.0002656176680379436, + "loss": 7.2082, + "step": 15698 + }, + { + "epoch": 1.4648688998786974, + "grad_norm": 0.8341998639060816, + "learning_rate": 0.0002656128490340682, + "loss": 7.8298, + "step": 15699 + }, + { + "epoch": 1.4649622095735748, + "grad_norm": 0.7790919062782325, + "learning_rate": 0.0002656080297362236, + "loss": 7.4131, + "step": 15700 + }, + { + "epoch": 1.465055519268452, + "grad_norm": 3.6561262168208977, + "learning_rate": 0.00026560321014442204, + "loss": 7.4897, + "step": 15701 + }, + { + "epoch": 1.4651488289633292, + "grad_norm": 0.5487888263782152, + "learning_rate": 0.0002655983902586757, + "loss": 7.3665, + "step": 15702 + }, + { + "epoch": 1.4652421386582066, + "grad_norm": 0.7565880115944187, + "learning_rate": 0.000265593570078997, + "loss": 7.2196, + "step": 15703 + }, + { + "epoch": 1.4653354483530838, + "grad_norm": 0.8558913595704063, + "learning_rate": 0.000265588749605398, + "loss": 7.388, + "step": 15704 + }, + { + "epoch": 1.4654287580479612, + "grad_norm": 0.4199629366432174, + "learning_rate": 0.0002655839288378911, + "loss": 7.1197, + "step": 15705 + }, + { + "epoch": 1.4655220677428384, + "grad_norm": 1.57738439803148, + "learning_rate": 0.00026557910777648847, + "loss": 7.6503, + "step": 15706 + }, + { + "epoch": 1.4656153774377159, + "grad_norm": 0.41612003872564457, + "learning_rate": 0.0002655742864212025, + "loss": 7.1401, + "step": 15707 + }, + { + "epoch": 1.465708687132593, + "grad_norm": 1.3755162949595454, + "learning_rate": 0.0002655694647720453, + "loss": 7.9245, + "step": 15708 + }, + { + "epoch": 1.4658019968274703, + "grad_norm": 1.1818819450603264, + "learning_rate": 0.00026556464282902914, + "loss": 7.2542, + "step": 15709 + }, + { + "epoch": 1.4658953065223477, + "grad_norm": 0.7976000650157105, + "learning_rate": 0.00026555982059216633, + "loss": 7.5147, + "step": 15710 + }, + { + "epoch": 1.465988616217225, + "grad_norm": 0.8976838375864403, + "learning_rate": 0.00026555499806146923, + "loss": 7.613, + "step": 15711 + }, + { + "epoch": 1.4660819259121023, + "grad_norm": 1.7219632799197653, + "learning_rate": 0.0002655501752369499, + "loss": 7.394, + "step": 15712 + }, + { + "epoch": 1.4661752356069795, + "grad_norm": 0.5486793908353605, + "learning_rate": 0.0002655453521186207, + "loss": 7.5071, + "step": 15713 + }, + { + "epoch": 1.466268545301857, + "grad_norm": 0.6555299034347537, + "learning_rate": 0.000265540528706494, + "loss": 7.4865, + "step": 15714 + }, + { + "epoch": 1.466361854996734, + "grad_norm": 0.45373179106798367, + "learning_rate": 0.0002655357050005818, + "loss": 7.5041, + "step": 15715 + }, + { + "epoch": 1.4664551646916115, + "grad_norm": 0.4678517794932807, + "learning_rate": 0.0002655308810008966, + "loss": 7.4739, + "step": 15716 + }, + { + "epoch": 1.4665484743864887, + "grad_norm": 0.7003679246418738, + "learning_rate": 0.00026552605670745067, + "loss": 7.4452, + "step": 15717 + }, + { + "epoch": 1.4666417840813661, + "grad_norm": 0.6812787957546618, + "learning_rate": 0.0002655212321202561, + "loss": 7.4407, + "step": 15718 + }, + { + "epoch": 1.4667350937762433, + "grad_norm": 0.6467186304469731, + "learning_rate": 0.0002655164072393253, + "loss": 7.4428, + "step": 15719 + }, + { + "epoch": 1.4668284034711205, + "grad_norm": 1.1007436828291732, + "learning_rate": 0.0002655115820646705, + "loss": 6.9856, + "step": 15720 + }, + { + "epoch": 1.466921713165998, + "grad_norm": 2.1844614842088883, + "learning_rate": 0.00026550675659630393, + "loss": 7.3089, + "step": 15721 + }, + { + "epoch": 1.4670150228608754, + "grad_norm": 1.3245444866593665, + "learning_rate": 0.00026550193083423794, + "loss": 7.4436, + "step": 15722 + }, + { + "epoch": 1.4671083325557526, + "grad_norm": 0.425230187831861, + "learning_rate": 0.0002654971047784847, + "loss": 7.1226, + "step": 15723 + }, + { + "epoch": 1.4672016422506298, + "grad_norm": 1.5003171206440973, + "learning_rate": 0.0002654922784290566, + "loss": 7.5475, + "step": 15724 + }, + { + "epoch": 1.4672949519455072, + "grad_norm": 1.2874084881361934, + "learning_rate": 0.0002654874517859658, + "loss": 7.7414, + "step": 15725 + }, + { + "epoch": 1.4673882616403844, + "grad_norm": 0.4278632743031211, + "learning_rate": 0.0002654826248492246, + "loss": 7.4032, + "step": 15726 + }, + { + "epoch": 1.4674815713352618, + "grad_norm": 1.2389242179478492, + "learning_rate": 0.00026547779761884534, + "loss": 7.2605, + "step": 15727 + }, + { + "epoch": 1.467574881030139, + "grad_norm": 1.3681294625815148, + "learning_rate": 0.00026547297009484023, + "loss": 7.2056, + "step": 15728 + }, + { + "epoch": 1.4676681907250164, + "grad_norm": 0.6961559831223223, + "learning_rate": 0.00026546814227722157, + "loss": 7.8781, + "step": 15729 + }, + { + "epoch": 1.4677615004198936, + "grad_norm": 0.5704262559502206, + "learning_rate": 0.0002654633141660016, + "loss": 7.5263, + "step": 15730 + }, + { + "epoch": 1.4678548101147708, + "grad_norm": 0.7301629741878606, + "learning_rate": 0.0002654584857611927, + "loss": 7.331, + "step": 15731 + }, + { + "epoch": 1.4679481198096482, + "grad_norm": 0.33906822056928476, + "learning_rate": 0.000265453657062807, + "loss": 7.4378, + "step": 15732 + }, + { + "epoch": 1.4680414295045257, + "grad_norm": 0.378026527926077, + "learning_rate": 0.0002654488280708569, + "loss": 7.3322, + "step": 15733 + }, + { + "epoch": 1.4681347391994028, + "grad_norm": 1.0351280368660163, + "learning_rate": 0.0002654439987853546, + "loss": 7.52, + "step": 15734 + }, + { + "epoch": 1.46822804889428, + "grad_norm": 0.7975975791370856, + "learning_rate": 0.0002654391692063124, + "loss": 7.4612, + "step": 15735 + }, + { + "epoch": 1.4683213585891575, + "grad_norm": 0.4571267756383851, + "learning_rate": 0.0002654343393337426, + "loss": 7.214, + "step": 15736 + }, + { + "epoch": 1.4684146682840347, + "grad_norm": 0.502338456731565, + "learning_rate": 0.00026542950916765744, + "loss": 7.2919, + "step": 15737 + }, + { + "epoch": 1.468507977978912, + "grad_norm": 0.5251304219681873, + "learning_rate": 0.00026542467870806924, + "loss": 7.3784, + "step": 15738 + }, + { + "epoch": 1.4686012876737893, + "grad_norm": 1.0586748012250475, + "learning_rate": 0.0002654198479549903, + "loss": 7.5612, + "step": 15739 + }, + { + "epoch": 1.4686945973686667, + "grad_norm": 1.025571536188624, + "learning_rate": 0.0002654150169084329, + "loss": 7.1768, + "step": 15740 + }, + { + "epoch": 1.468787907063544, + "grad_norm": 0.48359153259052123, + "learning_rate": 0.0002654101855684092, + "loss": 7.5855, + "step": 15741 + }, + { + "epoch": 1.468881216758421, + "grad_norm": 0.9220134360847733, + "learning_rate": 0.00026540535393493173, + "loss": 7.2866, + "step": 15742 + }, + { + "epoch": 1.4689745264532985, + "grad_norm": 0.5308053005114038, + "learning_rate": 0.0002654005220080125, + "loss": 7.2839, + "step": 15743 + }, + { + "epoch": 1.469067836148176, + "grad_norm": 0.6194959572255004, + "learning_rate": 0.00026539568978766403, + "loss": 7.0864, + "step": 15744 + }, + { + "epoch": 1.4691611458430531, + "grad_norm": 1.7479870968121933, + "learning_rate": 0.0002653908572738984, + "loss": 7.6514, + "step": 15745 + }, + { + "epoch": 1.4692544555379303, + "grad_norm": 0.679356101440002, + "learning_rate": 0.00026538602446672806, + "loss": 7.2916, + "step": 15746 + }, + { + "epoch": 1.4693477652328077, + "grad_norm": 0.8588680235515916, + "learning_rate": 0.00026538119136616524, + "loss": 7.3256, + "step": 15747 + }, + { + "epoch": 1.469441074927685, + "grad_norm": 0.6470977312996745, + "learning_rate": 0.00026537635797222223, + "loss": 7.3418, + "step": 15748 + }, + { + "epoch": 1.4695343846225624, + "grad_norm": 0.9200575232785492, + "learning_rate": 0.0002653715242849113, + "loss": 7.528, + "step": 15749 + }, + { + "epoch": 1.4696276943174396, + "grad_norm": 1.2058620218544311, + "learning_rate": 0.0002653666903042448, + "loss": 6.9608, + "step": 15750 + }, + { + "epoch": 1.469721004012317, + "grad_norm": 0.48786747750006026, + "learning_rate": 0.0002653618560302349, + "loss": 7.4581, + "step": 15751 + }, + { + "epoch": 1.4698143137071942, + "grad_norm": 0.356598857594891, + "learning_rate": 0.00026535702146289404, + "loss": 7.4306, + "step": 15752 + }, + { + "epoch": 1.4699076234020714, + "grad_norm": 0.5379208065238242, + "learning_rate": 0.0002653521866022345, + "loss": 7.4235, + "step": 15753 + }, + { + "epoch": 1.4700009330969488, + "grad_norm": 0.4758766972114266, + "learning_rate": 0.0002653473514482684, + "loss": 7.5531, + "step": 15754 + }, + { + "epoch": 1.470094242791826, + "grad_norm": 0.532133213420691, + "learning_rate": 0.0002653425160010082, + "loss": 7.5015, + "step": 15755 + }, + { + "epoch": 1.4701875524867034, + "grad_norm": 0.910043863282198, + "learning_rate": 0.00026533768026046616, + "loss": 7.4077, + "step": 15756 + }, + { + "epoch": 1.4702808621815806, + "grad_norm": 0.47912515925963367, + "learning_rate": 0.0002653328442266545, + "loss": 7.4183, + "step": 15757 + }, + { + "epoch": 1.470374171876458, + "grad_norm": 0.33463991318557, + "learning_rate": 0.0002653280078995857, + "loss": 7.353, + "step": 15758 + }, + { + "epoch": 1.4704674815713352, + "grad_norm": 0.43503569639490314, + "learning_rate": 0.00026532317127927183, + "loss": 7.2266, + "step": 15759 + }, + { + "epoch": 1.4705607912662126, + "grad_norm": 1.1547679015937566, + "learning_rate": 0.0002653183343657254, + "loss": 7.3171, + "step": 15760 + }, + { + "epoch": 1.4706541009610898, + "grad_norm": 16.94226702353015, + "learning_rate": 0.0002653134971589585, + "loss": 7.3029, + "step": 15761 + }, + { + "epoch": 1.4707474106559673, + "grad_norm": 1.0961230046073218, + "learning_rate": 0.0002653086596589836, + "loss": 6.9292, + "step": 15762 + }, + { + "epoch": 1.4708407203508445, + "grad_norm": 0.4297643358966801, + "learning_rate": 0.0002653038218658129, + "loss": 7.3637, + "step": 15763 + }, + { + "epoch": 1.4709340300457217, + "grad_norm": 2.786395500114627, + "learning_rate": 0.00026529898377945874, + "loss": 7.6073, + "step": 15764 + }, + { + "epoch": 1.471027339740599, + "grad_norm": 0.7684589916036075, + "learning_rate": 0.0002652941453999335, + "loss": 7.339, + "step": 15765 + }, + { + "epoch": 1.4711206494354763, + "grad_norm": 0.48304003304839976, + "learning_rate": 0.0002652893067272493, + "loss": 7.4725, + "step": 15766 + }, + { + "epoch": 1.4712139591303537, + "grad_norm": 1.1392291576354876, + "learning_rate": 0.00026528446776141854, + "loss": 7.1174, + "step": 15767 + }, + { + "epoch": 1.4713072688252309, + "grad_norm": 0.8775226025288981, + "learning_rate": 0.00026527962850245363, + "loss": 7.655, + "step": 15768 + }, + { + "epoch": 1.4714005785201083, + "grad_norm": 0.41995154415408165, + "learning_rate": 0.00026527478895036665, + "loss": 7.3433, + "step": 15769 + }, + { + "epoch": 1.4714938882149855, + "grad_norm": 9.119273941187249, + "learning_rate": 0.0002652699491051701, + "loss": 7.318, + "step": 15770 + }, + { + "epoch": 1.4715871979098627, + "grad_norm": 0.9336382589366967, + "learning_rate": 0.0002652651089668762, + "loss": 7.2081, + "step": 15771 + }, + { + "epoch": 1.4716805076047401, + "grad_norm": 2.0750579050948375, + "learning_rate": 0.0002652602685354973, + "loss": 7.6774, + "step": 15772 + }, + { + "epoch": 1.4717738172996175, + "grad_norm": 1.0075634120101626, + "learning_rate": 0.00026525542781104564, + "loss": 7.1283, + "step": 15773 + }, + { + "epoch": 1.4718671269944947, + "grad_norm": 1.4757224354822696, + "learning_rate": 0.00026525058679353355, + "loss": 7.4029, + "step": 15774 + }, + { + "epoch": 1.471960436689372, + "grad_norm": 0.9415023675105272, + "learning_rate": 0.0002652457454829735, + "loss": 7.4076, + "step": 15775 + }, + { + "epoch": 1.4720537463842494, + "grad_norm": 1.8289138830355443, + "learning_rate": 0.00026524090387937746, + "loss": 7.5285, + "step": 15776 + }, + { + "epoch": 1.4721470560791265, + "grad_norm": 0.526444386413661, + "learning_rate": 0.00026523606198275803, + "loss": 7.5405, + "step": 15777 + }, + { + "epoch": 1.472240365774004, + "grad_norm": 0.5662218495944342, + "learning_rate": 0.00026523121979312743, + "loss": 7.5335, + "step": 15778 + }, + { + "epoch": 1.4723336754688812, + "grad_norm": 1.4880696770443418, + "learning_rate": 0.000265226377310498, + "loss": 7.1676, + "step": 15779 + }, + { + "epoch": 1.4724269851637586, + "grad_norm": 0.5401439062476324, + "learning_rate": 0.00026522153453488196, + "loss": 7.3895, + "step": 15780 + }, + { + "epoch": 1.4725202948586358, + "grad_norm": 0.6924438448699326, + "learning_rate": 0.00026521669146629175, + "loss": 7.5363, + "step": 15781 + }, + { + "epoch": 1.472613604553513, + "grad_norm": 0.4061340037385991, + "learning_rate": 0.0002652118481047396, + "loss": 7.4215, + "step": 15782 + }, + { + "epoch": 1.4727069142483904, + "grad_norm": 1.2760352994433939, + "learning_rate": 0.0002652070044502378, + "loss": 7.0765, + "step": 15783 + }, + { + "epoch": 1.4728002239432678, + "grad_norm": 0.8951940143532099, + "learning_rate": 0.0002652021605027988, + "loss": 7.4823, + "step": 15784 + }, + { + "epoch": 1.472893533638145, + "grad_norm": 0.8522514735420341, + "learning_rate": 0.00026519731626243475, + "loss": 7.808, + "step": 15785 + }, + { + "epoch": 1.4729868433330222, + "grad_norm": 0.9998962457460993, + "learning_rate": 0.0002651924717291581, + "loss": 7.4878, + "step": 15786 + }, + { + "epoch": 1.4730801530278996, + "grad_norm": 0.9026023864056547, + "learning_rate": 0.00026518762690298114, + "loss": 7.465, + "step": 15787 + }, + { + "epoch": 1.4731734627227768, + "grad_norm": 1.1652106975888614, + "learning_rate": 0.0002651827817839161, + "loss": 7.3307, + "step": 15788 + }, + { + "epoch": 1.4732667724176542, + "grad_norm": 1.442771728267393, + "learning_rate": 0.0002651779363719754, + "loss": 7.2706, + "step": 15789 + }, + { + "epoch": 1.4733600821125314, + "grad_norm": 0.9809803827019454, + "learning_rate": 0.00026517309066717137, + "loss": 7.0431, + "step": 15790 + }, + { + "epoch": 1.4734533918074089, + "grad_norm": 2.1041562935689875, + "learning_rate": 0.00026516824466951623, + "loss": 7.4356, + "step": 15791 + }, + { + "epoch": 1.473546701502286, + "grad_norm": 2.7155572887792196, + "learning_rate": 0.0002651633983790223, + "loss": 7.5013, + "step": 15792 + }, + { + "epoch": 1.4736400111971633, + "grad_norm": 2.188612851343245, + "learning_rate": 0.00026515855179570205, + "loss": 7.2987, + "step": 15793 + }, + { + "epoch": 1.4737333208920407, + "grad_norm": 0.5297328728598552, + "learning_rate": 0.0002651537049195677, + "loss": 7.0028, + "step": 15794 + }, + { + "epoch": 1.473826630586918, + "grad_norm": 0.5671141309362986, + "learning_rate": 0.0002651488577506316, + "loss": 7.6331, + "step": 15795 + }, + { + "epoch": 1.4739199402817953, + "grad_norm": 1.3670221575682835, + "learning_rate": 0.00026514401028890605, + "loss": 7.3568, + "step": 15796 + }, + { + "epoch": 1.4740132499766725, + "grad_norm": 3.9002253212823086, + "learning_rate": 0.00026513916253440343, + "loss": 7.4119, + "step": 15797 + }, + { + "epoch": 1.47410655967155, + "grad_norm": 1.6564365620831427, + "learning_rate": 0.00026513431448713597, + "loss": 7.3166, + "step": 15798 + }, + { + "epoch": 1.474199869366427, + "grad_norm": 1.2546120056113759, + "learning_rate": 0.00026512946614711604, + "loss": 7.3328, + "step": 15799 + }, + { + "epoch": 1.4742931790613045, + "grad_norm": 1.4333698529814756, + "learning_rate": 0.00026512461751435605, + "loss": 7.7319, + "step": 15800 + }, + { + "epoch": 1.4743864887561817, + "grad_norm": 0.5212594779230427, + "learning_rate": 0.0002651197685888682, + "loss": 7.207, + "step": 15801 + }, + { + "epoch": 1.4744797984510591, + "grad_norm": 1.8101444875096004, + "learning_rate": 0.00026511491937066494, + "loss": 7.4845, + "step": 15802 + }, + { + "epoch": 1.4745731081459363, + "grad_norm": 0.5688455104568764, + "learning_rate": 0.0002651100698597585, + "loss": 7.0839, + "step": 15803 + }, + { + "epoch": 1.4746664178408135, + "grad_norm": 0.636563233957353, + "learning_rate": 0.0002651052200561612, + "loss": 7.4949, + "step": 15804 + }, + { + "epoch": 1.474759727535691, + "grad_norm": 0.7524026119920169, + "learning_rate": 0.0002651003699598855, + "loss": 7.3136, + "step": 15805 + }, + { + "epoch": 1.4748530372305684, + "grad_norm": 0.8519662680535152, + "learning_rate": 0.0002650955195709436, + "loss": 7.4009, + "step": 15806 + }, + { + "epoch": 1.4749463469254456, + "grad_norm": 2.7765281057530027, + "learning_rate": 0.00026509066888934793, + "loss": 7.1087, + "step": 15807 + }, + { + "epoch": 1.4750396566203228, + "grad_norm": 0.5535675452325273, + "learning_rate": 0.0002650858179151108, + "loss": 7.5571, + "step": 15808 + }, + { + "epoch": 1.4751329663152002, + "grad_norm": 1.2452344448123862, + "learning_rate": 0.00026508096664824447, + "loss": 7.6137, + "step": 15809 + }, + { + "epoch": 1.4752262760100774, + "grad_norm": 0.7838979204491031, + "learning_rate": 0.0002650761150887614, + "loss": 7.2832, + "step": 15810 + }, + { + "epoch": 1.4753195857049548, + "grad_norm": 0.9981800509997412, + "learning_rate": 0.00026507126323667375, + "loss": 7.6279, + "step": 15811 + }, + { + "epoch": 1.475412895399832, + "grad_norm": 0.7541526690440291, + "learning_rate": 0.00026506641109199405, + "loss": 7.4448, + "step": 15812 + }, + { + "epoch": 1.4755062050947094, + "grad_norm": 0.4904041872581074, + "learning_rate": 0.0002650615586547345, + "loss": 7.4397, + "step": 15813 + }, + { + "epoch": 1.4755995147895866, + "grad_norm": 0.4122284894323416, + "learning_rate": 0.0002650567059249075, + "loss": 7.4584, + "step": 15814 + }, + { + "epoch": 1.4756928244844638, + "grad_norm": 1.1186529796235427, + "learning_rate": 0.00026505185290252544, + "loss": 7.0594, + "step": 15815 + }, + { + "epoch": 1.4757861341793412, + "grad_norm": 2.199491476422041, + "learning_rate": 0.0002650469995876006, + "loss": 7.4264, + "step": 15816 + }, + { + "epoch": 1.4758794438742187, + "grad_norm": 0.35969638400275916, + "learning_rate": 0.00026504214598014524, + "loss": 7.3196, + "step": 15817 + }, + { + "epoch": 1.4759727535690959, + "grad_norm": 1.0268305241659363, + "learning_rate": 0.00026503729208017177, + "loss": 7.7139, + "step": 15818 + }, + { + "epoch": 1.476066063263973, + "grad_norm": 0.8920424745296075, + "learning_rate": 0.00026503243788769264, + "loss": 7.3678, + "step": 15819 + }, + { + "epoch": 1.4761593729588505, + "grad_norm": 0.8341872355962091, + "learning_rate": 0.00026502758340272, + "loss": 7.5143, + "step": 15820 + }, + { + "epoch": 1.4762526826537277, + "grad_norm": 1.0847787181209634, + "learning_rate": 0.00026502272862526636, + "loss": 7.3609, + "step": 15821 + }, + { + "epoch": 1.476345992348605, + "grad_norm": 0.9624040876569133, + "learning_rate": 0.00026501787355534393, + "loss": 7.52, + "step": 15822 + }, + { + "epoch": 1.4764393020434823, + "grad_norm": 1.454790369890776, + "learning_rate": 0.0002650130181929652, + "loss": 7.2061, + "step": 15823 + }, + { + "epoch": 1.4765326117383597, + "grad_norm": 1.059518580586072, + "learning_rate": 0.00026500816253814235, + "loss": 7.496, + "step": 15824 + }, + { + "epoch": 1.476625921433237, + "grad_norm": 0.7948075676205768, + "learning_rate": 0.00026500330659088786, + "loss": 7.4958, + "step": 15825 + }, + { + "epoch": 1.476719231128114, + "grad_norm": 0.8750692061368625, + "learning_rate": 0.000264998450351214, + "loss": 7.3935, + "step": 15826 + }, + { + "epoch": 1.4768125408229915, + "grad_norm": 0.3565669871313042, + "learning_rate": 0.0002649935938191332, + "loss": 6.9321, + "step": 15827 + }, + { + "epoch": 1.476905850517869, + "grad_norm": 1.4099036313802251, + "learning_rate": 0.0002649887369946577, + "loss": 7.3877, + "step": 15828 + }, + { + "epoch": 1.4769991602127461, + "grad_norm": 0.8975248674191892, + "learning_rate": 0.00026498387987779995, + "loss": 7.2917, + "step": 15829 + }, + { + "epoch": 1.4770924699076233, + "grad_norm": 1.8994961380366828, + "learning_rate": 0.0002649790224685722, + "loss": 7.4065, + "step": 15830 + }, + { + "epoch": 1.4771857796025007, + "grad_norm": 5.0365130896026535, + "learning_rate": 0.0002649741647669869, + "loss": 7.3958, + "step": 15831 + }, + { + "epoch": 1.477279089297378, + "grad_norm": 1.2698917329723136, + "learning_rate": 0.00026496930677305633, + "loss": 7.4089, + "step": 15832 + }, + { + "epoch": 1.4773723989922554, + "grad_norm": 7.755946814427133, + "learning_rate": 0.00026496444848679286, + "loss": 7.3649, + "step": 15833 + }, + { + "epoch": 1.4774657086871326, + "grad_norm": 2.4898104054071135, + "learning_rate": 0.00026495958990820887, + "loss": 7.4773, + "step": 15834 + }, + { + "epoch": 1.47755901838201, + "grad_norm": 4.803653738811581, + "learning_rate": 0.0002649547310373167, + "loss": 7.6812, + "step": 15835 + }, + { + "epoch": 1.4776523280768872, + "grad_norm": 5.166328625199777, + "learning_rate": 0.0002649498718741287, + "loss": 7.242, + "step": 15836 + }, + { + "epoch": 1.4777456377717644, + "grad_norm": 5.227745071868056, + "learning_rate": 0.00026494501241865724, + "loss": 7.5679, + "step": 15837 + }, + { + "epoch": 1.4778389474666418, + "grad_norm": 3.3326093367023395, + "learning_rate": 0.0002649401526709146, + "loss": 7.4752, + "step": 15838 + }, + { + "epoch": 1.4779322571615192, + "grad_norm": 0.9574686361114635, + "learning_rate": 0.00026493529263091326, + "loss": 7.0379, + "step": 15839 + }, + { + "epoch": 1.4780255668563964, + "grad_norm": 0.6844596834892217, + "learning_rate": 0.00026493043229866553, + "loss": 7.4152, + "step": 15840 + }, + { + "epoch": 1.4781188765512736, + "grad_norm": 1.9186380536957184, + "learning_rate": 0.00026492557167418374, + "loss": 7.336, + "step": 15841 + }, + { + "epoch": 1.478212186246151, + "grad_norm": 2.833747336717596, + "learning_rate": 0.00026492071075748025, + "loss": 7.2356, + "step": 15842 + }, + { + "epoch": 1.4783054959410282, + "grad_norm": 2.021707092520098, + "learning_rate": 0.00026491584954856746, + "loss": 7.6322, + "step": 15843 + }, + { + "epoch": 1.4783988056359056, + "grad_norm": 2.286081495563118, + "learning_rate": 0.00026491098804745764, + "loss": 7.4905, + "step": 15844 + }, + { + "epoch": 1.4784921153307828, + "grad_norm": 0.9942386464698469, + "learning_rate": 0.0002649061262541633, + "loss": 7.7946, + "step": 15845 + }, + { + "epoch": 1.4785854250256603, + "grad_norm": 0.838869011654183, + "learning_rate": 0.00026490126416869663, + "loss": 7.7514, + "step": 15846 + }, + { + "epoch": 1.4786787347205375, + "grad_norm": 1.0842123772933934, + "learning_rate": 0.0002648964017910702, + "loss": 7.0306, + "step": 15847 + }, + { + "epoch": 1.4787720444154147, + "grad_norm": 2.0692575173061027, + "learning_rate": 0.0002648915391212961, + "loss": 7.9358, + "step": 15848 + }, + { + "epoch": 1.478865354110292, + "grad_norm": 0.662579294933199, + "learning_rate": 0.000264886676159387, + "loss": 7.434, + "step": 15849 + }, + { + "epoch": 1.4789586638051695, + "grad_norm": 0.5919473507983576, + "learning_rate": 0.000264881812905355, + "loss": 7.4559, + "step": 15850 + }, + { + "epoch": 1.4790519735000467, + "grad_norm": 0.9811109111494012, + "learning_rate": 0.00026487694935921265, + "loss": 7.4618, + "step": 15851 + }, + { + "epoch": 1.4791452831949239, + "grad_norm": 1.6319882717508123, + "learning_rate": 0.00026487208552097223, + "loss": 7.3144, + "step": 15852 + }, + { + "epoch": 1.4792385928898013, + "grad_norm": 1.407997795868063, + "learning_rate": 0.0002648672213906461, + "loss": 7.3665, + "step": 15853 + }, + { + "epoch": 1.4793319025846785, + "grad_norm": 1.2728711627568075, + "learning_rate": 0.00026486235696824667, + "loss": 7.2734, + "step": 15854 + }, + { + "epoch": 1.479425212279556, + "grad_norm": 0.9080320368276859, + "learning_rate": 0.0002648574922537863, + "loss": 7.8423, + "step": 15855 + }, + { + "epoch": 1.4795185219744331, + "grad_norm": 0.5943731957239031, + "learning_rate": 0.00026485262724727733, + "loss": 7.3615, + "step": 15856 + }, + { + "epoch": 1.4796118316693105, + "grad_norm": 0.39406015316021203, + "learning_rate": 0.0002648477619487322, + "loss": 7.3289, + "step": 15857 + }, + { + "epoch": 1.4797051413641877, + "grad_norm": 1.182486121612705, + "learning_rate": 0.00026484289635816315, + "loss": 7.4184, + "step": 15858 + }, + { + "epoch": 1.479798451059065, + "grad_norm": 0.6025067815671138, + "learning_rate": 0.0002648380304755827, + "loss": 7.4455, + "step": 15859 + }, + { + "epoch": 1.4798917607539424, + "grad_norm": 0.7987225748905676, + "learning_rate": 0.00026483316430100316, + "loss": 7.3023, + "step": 15860 + }, + { + "epoch": 1.4799850704488196, + "grad_norm": 0.8002845636997442, + "learning_rate": 0.0002648282978344369, + "loss": 7.4351, + "step": 15861 + }, + { + "epoch": 1.480078380143697, + "grad_norm": 0.47831434504941744, + "learning_rate": 0.00026482343107589625, + "loss": 7.2196, + "step": 15862 + }, + { + "epoch": 1.4801716898385742, + "grad_norm": 0.5348751662122027, + "learning_rate": 0.0002648185640253936, + "loss": 7.5121, + "step": 15863 + }, + { + "epoch": 1.4802649995334516, + "grad_norm": 0.42138457090086867, + "learning_rate": 0.0002648136966829414, + "loss": 7.3671, + "step": 15864 + }, + { + "epoch": 1.4803583092283288, + "grad_norm": 0.5875389794815012, + "learning_rate": 0.000264808829048552, + "loss": 7.3842, + "step": 15865 + }, + { + "epoch": 1.4804516189232062, + "grad_norm": 0.44682479577429884, + "learning_rate": 0.00026480396112223777, + "loss": 7.3725, + "step": 15866 + }, + { + "epoch": 1.4805449286180834, + "grad_norm": 0.6198608151060676, + "learning_rate": 0.000264799092904011, + "loss": 7.4756, + "step": 15867 + }, + { + "epoch": 1.4806382383129608, + "grad_norm": 0.6312764288032264, + "learning_rate": 0.00026479422439388417, + "loss": 7.4411, + "step": 15868 + }, + { + "epoch": 1.480731548007838, + "grad_norm": 0.4607058132786432, + "learning_rate": 0.00026478935559186967, + "loss": 7.4803, + "step": 15869 + }, + { + "epoch": 1.4808248577027152, + "grad_norm": 0.5742293538408318, + "learning_rate": 0.0002647844864979798, + "loss": 7.4553, + "step": 15870 + }, + { + "epoch": 1.4809181673975926, + "grad_norm": 0.4753862411235515, + "learning_rate": 0.000264779617112227, + "loss": 7.4233, + "step": 15871 + }, + { + "epoch": 1.4810114770924698, + "grad_norm": 0.3644513469441941, + "learning_rate": 0.00026477474743462364, + "loss": 7.3659, + "step": 15872 + }, + { + "epoch": 1.4811047867873472, + "grad_norm": 0.5638654350317576, + "learning_rate": 0.0002647698774651821, + "loss": 7.2228, + "step": 15873 + }, + { + "epoch": 1.4811980964822244, + "grad_norm": 0.688313705039264, + "learning_rate": 0.00026476500720391474, + "loss": 7.2456, + "step": 15874 + }, + { + "epoch": 1.4812914061771019, + "grad_norm": 0.39741052713852737, + "learning_rate": 0.000264760136650834, + "loss": 7.1241, + "step": 15875 + }, + { + "epoch": 1.481384715871979, + "grad_norm": 1.4042872257784451, + "learning_rate": 0.00026475526580595215, + "loss": 7.56, + "step": 15876 + }, + { + "epoch": 1.4814780255668563, + "grad_norm": 0.33693281176653284, + "learning_rate": 0.0002647503946692817, + "loss": 7.3604, + "step": 15877 + }, + { + "epoch": 1.4815713352617337, + "grad_norm": 1.0097714802968534, + "learning_rate": 0.00026474552324083503, + "loss": 7.2749, + "step": 15878 + }, + { + "epoch": 1.481664644956611, + "grad_norm": 2.3702827833437508, + "learning_rate": 0.00026474065152062447, + "loss": 7.1648, + "step": 15879 + }, + { + "epoch": 1.4817579546514883, + "grad_norm": 0.9253444488644208, + "learning_rate": 0.00026473577950866236, + "loss": 7.3498, + "step": 15880 + }, + { + "epoch": 1.4818512643463655, + "grad_norm": 0.4819873511817221, + "learning_rate": 0.0002647309072049612, + "loss": 7.3696, + "step": 15881 + }, + { + "epoch": 1.481944574041243, + "grad_norm": 0.9119834061524145, + "learning_rate": 0.0002647260346095333, + "loss": 7.1597, + "step": 15882 + }, + { + "epoch": 1.48203788373612, + "grad_norm": 1.0707054024350888, + "learning_rate": 0.0002647211617223911, + "loss": 7.4191, + "step": 15883 + }, + { + "epoch": 1.4821311934309975, + "grad_norm": 0.44693227434875527, + "learning_rate": 0.00026471628854354693, + "loss": 7.04, + "step": 15884 + }, + { + "epoch": 1.4822245031258747, + "grad_norm": 0.7120676197180914, + "learning_rate": 0.0002647114150730133, + "loss": 7.3377, + "step": 15885 + }, + { + "epoch": 1.4823178128207521, + "grad_norm": 0.40334172271407126, + "learning_rate": 0.0002647065413108024, + "loss": 7.1019, + "step": 15886 + }, + { + "epoch": 1.4824111225156293, + "grad_norm": 0.3730426044829188, + "learning_rate": 0.00026470166725692685, + "loss": 7.2661, + "step": 15887 + }, + { + "epoch": 1.4825044322105065, + "grad_norm": 0.648868992081023, + "learning_rate": 0.0002646967929113989, + "loss": 7.5637, + "step": 15888 + }, + { + "epoch": 1.482597741905384, + "grad_norm": 0.46805273212432785, + "learning_rate": 0.000264691918274231, + "loss": 7.3241, + "step": 15889 + }, + { + "epoch": 1.4826910516002614, + "grad_norm": 0.7633758866040615, + "learning_rate": 0.00026468704334543547, + "loss": 7.1316, + "step": 15890 + }, + { + "epoch": 1.4827843612951386, + "grad_norm": 0.518761798800433, + "learning_rate": 0.00026468216812502483, + "loss": 7.498, + "step": 15891 + }, + { + "epoch": 1.4828776709900158, + "grad_norm": 1.2468581993874195, + "learning_rate": 0.0002646772926130114, + "loss": 7.3418, + "step": 15892 + }, + { + "epoch": 1.4829709806848932, + "grad_norm": 0.47725389331299495, + "learning_rate": 0.00026467241680940755, + "loss": 7.4036, + "step": 15893 + }, + { + "epoch": 1.4830642903797704, + "grad_norm": 0.4227340433042832, + "learning_rate": 0.00026466754071422573, + "loss": 7.3608, + "step": 15894 + }, + { + "epoch": 1.4831576000746478, + "grad_norm": 0.7176583002287511, + "learning_rate": 0.0002646626643274783, + "loss": 7.2875, + "step": 15895 + }, + { + "epoch": 1.483250909769525, + "grad_norm": 0.6324399305760035, + "learning_rate": 0.0002646577876491777, + "loss": 7.4098, + "step": 15896 + }, + { + "epoch": 1.4833442194644024, + "grad_norm": 0.5141024453985941, + "learning_rate": 0.0002646529106793363, + "loss": 7.3956, + "step": 15897 + }, + { + "epoch": 1.4834375291592796, + "grad_norm": 1.3653201029425976, + "learning_rate": 0.0002646480334179665, + "loss": 7.1309, + "step": 15898 + }, + { + "epoch": 1.4835308388541568, + "grad_norm": 0.7313292029939614, + "learning_rate": 0.0002646431558650808, + "loss": 7.1702, + "step": 15899 + }, + { + "epoch": 1.4836241485490342, + "grad_norm": 0.6099687332712819, + "learning_rate": 0.0002646382780206914, + "loss": 6.982, + "step": 15900 + }, + { + "epoch": 1.4837174582439117, + "grad_norm": 0.4340355469293642, + "learning_rate": 0.0002646333998848109, + "loss": 6.9457, + "step": 15901 + }, + { + "epoch": 1.4838107679387889, + "grad_norm": 0.9998035406559157, + "learning_rate": 0.00026462852145745153, + "loss": 7.1508, + "step": 15902 + }, + { + "epoch": 1.483904077633666, + "grad_norm": 1.0161246780502957, + "learning_rate": 0.0002646236427386259, + "loss": 7.3401, + "step": 15903 + }, + { + "epoch": 1.4839973873285435, + "grad_norm": 0.7035249813824075, + "learning_rate": 0.00026461876372834624, + "loss": 7.5289, + "step": 15904 + }, + { + "epoch": 1.4840906970234207, + "grad_norm": 0.628796074849582, + "learning_rate": 0.00026461388442662495, + "loss": 7.4486, + "step": 15905 + }, + { + "epoch": 1.484184006718298, + "grad_norm": 1.4974626052760283, + "learning_rate": 0.0002646090048334746, + "loss": 7.3233, + "step": 15906 + }, + { + "epoch": 1.4842773164131753, + "grad_norm": 0.9947601850829613, + "learning_rate": 0.00026460412494890744, + "loss": 7.4762, + "step": 15907 + }, + { + "epoch": 1.4843706261080527, + "grad_norm": 1.4778497431762028, + "learning_rate": 0.000264599244772936, + "loss": 7.4161, + "step": 15908 + }, + { + "epoch": 1.48446393580293, + "grad_norm": 0.400986209731515, + "learning_rate": 0.00026459436430557263, + "loss": 7.3726, + "step": 15909 + }, + { + "epoch": 1.484557245497807, + "grad_norm": 0.7606967406590394, + "learning_rate": 0.00026458948354682966, + "loss": 7.3155, + "step": 15910 + }, + { + "epoch": 1.4846505551926845, + "grad_norm": 1.5953695779133414, + "learning_rate": 0.0002645846024967196, + "loss": 7.4813, + "step": 15911 + }, + { + "epoch": 1.484743864887562, + "grad_norm": 16.003673100038778, + "learning_rate": 0.0002645797211552549, + "loss": 7.3119, + "step": 15912 + }, + { + "epoch": 1.4848371745824391, + "grad_norm": 1.8360489611591726, + "learning_rate": 0.0002645748395224479, + "loss": 6.996, + "step": 15913 + }, + { + "epoch": 1.4849304842773163, + "grad_norm": 0.903249285367473, + "learning_rate": 0.0002645699575983109, + "loss": 7.2128, + "step": 15914 + }, + { + "epoch": 1.4850237939721938, + "grad_norm": 0.497591464758365, + "learning_rate": 0.00026456507538285656, + "loss": 7.3232, + "step": 15915 + }, + { + "epoch": 1.485117103667071, + "grad_norm": 1.0305405189386367, + "learning_rate": 0.0002645601928760972, + "loss": 7.4578, + "step": 15916 + }, + { + "epoch": 1.4852104133619484, + "grad_norm": 0.7965399878073431, + "learning_rate": 0.0002645553100780451, + "loss": 7.138, + "step": 15917 + }, + { + "epoch": 1.4853037230568256, + "grad_norm": 1.6308817300546765, + "learning_rate": 0.0002645504269887128, + "loss": 7.7575, + "step": 15918 + }, + { + "epoch": 1.485397032751703, + "grad_norm": 0.5209176220255199, + "learning_rate": 0.0002645455436081127, + "loss": 7.2047, + "step": 15919 + }, + { + "epoch": 1.4854903424465802, + "grad_norm": 0.5154502891482887, + "learning_rate": 0.00026454065993625724, + "loss": 7.5957, + "step": 15920 + }, + { + "epoch": 1.4855836521414574, + "grad_norm": 1.089104760445749, + "learning_rate": 0.00026453577597315887, + "loss": 7.3288, + "step": 15921 + }, + { + "epoch": 1.4856769618363348, + "grad_norm": 2.4944425680818325, + "learning_rate": 0.0002645308917188298, + "loss": 7.2306, + "step": 15922 + }, + { + "epoch": 1.4857702715312122, + "grad_norm": 1.9071208795882149, + "learning_rate": 0.00026452600717328275, + "loss": 7.219, + "step": 15923 + }, + { + "epoch": 1.4858635812260894, + "grad_norm": 1.127358104384513, + "learning_rate": 0.0002645211223365299, + "loss": 7.3753, + "step": 15924 + }, + { + "epoch": 1.4859568909209666, + "grad_norm": 0.3734966854841106, + "learning_rate": 0.0002645162372085838, + "loss": 7.522, + "step": 15925 + }, + { + "epoch": 1.486050200615844, + "grad_norm": 0.7305188692274199, + "learning_rate": 0.0002645113517894568, + "loss": 7.4047, + "step": 15926 + }, + { + "epoch": 1.4861435103107212, + "grad_norm": 1.242990063183601, + "learning_rate": 0.00026450646607916134, + "loss": 7.58, + "step": 15927 + }, + { + "epoch": 1.4862368200055986, + "grad_norm": 0.39521412712025006, + "learning_rate": 0.00026450158007770987, + "loss": 7.1606, + "step": 15928 + }, + { + "epoch": 1.4863301297004758, + "grad_norm": 0.6315075398340451, + "learning_rate": 0.00026449669378511483, + "loss": 7.0771, + "step": 15929 + }, + { + "epoch": 1.4864234393953533, + "grad_norm": 0.3710188383882604, + "learning_rate": 0.0002644918072013886, + "loss": 7.4513, + "step": 15930 + }, + { + "epoch": 1.4865167490902305, + "grad_norm": 0.5390407573458911, + "learning_rate": 0.00026448692032654363, + "loss": 7.2467, + "step": 15931 + }, + { + "epoch": 1.4866100587851077, + "grad_norm": 0.4465144375310577, + "learning_rate": 0.0002644820331605923, + "loss": 7.2793, + "step": 15932 + }, + { + "epoch": 1.486703368479985, + "grad_norm": 0.9343208393924161, + "learning_rate": 0.0002644771457035471, + "loss": 7.3655, + "step": 15933 + }, + { + "epoch": 1.4867966781748625, + "grad_norm": 0.802823250636209, + "learning_rate": 0.0002644722579554204, + "loss": 7.3893, + "step": 15934 + }, + { + "epoch": 1.4868899878697397, + "grad_norm": 148.80358569743703, + "learning_rate": 0.0002644673699162247, + "loss": 7.314, + "step": 15935 + }, + { + "epoch": 1.486983297564617, + "grad_norm": 4.887021329395989, + "learning_rate": 0.00026446248158597236, + "loss": 7.3914, + "step": 15936 + }, + { + "epoch": 1.4870766072594943, + "grad_norm": 8.06389099177399, + "learning_rate": 0.00026445759296467585, + "loss": 8.0532, + "step": 15937 + }, + { + "epoch": 1.4871699169543715, + "grad_norm": 8.378961067757068, + "learning_rate": 0.00026445270405234754, + "loss": 7.8366, + "step": 15938 + }, + { + "epoch": 1.487263226649249, + "grad_norm": 7.981299817539035, + "learning_rate": 0.00026444781484899997, + "loss": 7.7424, + "step": 15939 + }, + { + "epoch": 1.4873565363441261, + "grad_norm": 6.472579130139631, + "learning_rate": 0.0002644429253546455, + "loss": 7.742, + "step": 15940 + }, + { + "epoch": 1.4874498460390035, + "grad_norm": 4.393783970306386, + "learning_rate": 0.0002644380355692965, + "loss": 7.5362, + "step": 15941 + }, + { + "epoch": 1.4875431557338807, + "grad_norm": 2.1365509788212633, + "learning_rate": 0.0002644331454929655, + "loss": 7.4497, + "step": 15942 + }, + { + "epoch": 1.487636465428758, + "grad_norm": 1.3686235450366782, + "learning_rate": 0.000264428255125665, + "loss": 7.2989, + "step": 15943 + }, + { + "epoch": 1.4877297751236354, + "grad_norm": 2.27073878067845, + "learning_rate": 0.0002644233644674072, + "loss": 7.863, + "step": 15944 + }, + { + "epoch": 1.4878230848185128, + "grad_norm": 3.6867667630462355, + "learning_rate": 0.0002644184735182048, + "loss": 7.4944, + "step": 15945 + }, + { + "epoch": 1.48791639451339, + "grad_norm": 4.802432027692855, + "learning_rate": 0.00026441358227807007, + "loss": 7.428, + "step": 15946 + }, + { + "epoch": 1.4880097042082672, + "grad_norm": 3.811457543839863, + "learning_rate": 0.0002644086907470155, + "loss": 7.9227, + "step": 15947 + }, + { + "epoch": 1.4881030139031446, + "grad_norm": 3.913232567080012, + "learning_rate": 0.0002644037989250535, + "loss": 7.7678, + "step": 15948 + }, + { + "epoch": 1.4881963235980218, + "grad_norm": 3.8960986244824074, + "learning_rate": 0.00026439890681219657, + "loss": 7.3903, + "step": 15949 + }, + { + "epoch": 1.4882896332928992, + "grad_norm": 1.379277319410213, + "learning_rate": 0.0002643940144084571, + "loss": 7.8728, + "step": 15950 + }, + { + "epoch": 1.4883829429877764, + "grad_norm": 2.724207899183623, + "learning_rate": 0.0002643891217138475, + "loss": 7.7773, + "step": 15951 + }, + { + "epoch": 1.4884762526826538, + "grad_norm": 2.433968499493872, + "learning_rate": 0.00026438422872838025, + "loss": 7.6194, + "step": 15952 + }, + { + "epoch": 1.488569562377531, + "grad_norm": 1.2094089614966135, + "learning_rate": 0.00026437933545206784, + "loss": 7.2815, + "step": 15953 + }, + { + "epoch": 1.4886628720724082, + "grad_norm": 1.6889570335000144, + "learning_rate": 0.0002643744418849226, + "loss": 7.4406, + "step": 15954 + }, + { + "epoch": 1.4887561817672856, + "grad_norm": 1.024111564137258, + "learning_rate": 0.00026436954802695707, + "loss": 7.3447, + "step": 15955 + }, + { + "epoch": 1.488849491462163, + "grad_norm": 0.9796845796230019, + "learning_rate": 0.00026436465387818365, + "loss": 7.6209, + "step": 15956 + }, + { + "epoch": 1.4889428011570403, + "grad_norm": 0.6613682463430564, + "learning_rate": 0.0002643597594386148, + "loss": 7.7164, + "step": 15957 + }, + { + "epoch": 1.4890361108519174, + "grad_norm": 1.6768883175512654, + "learning_rate": 0.0002643548647082629, + "loss": 7.4228, + "step": 15958 + }, + { + "epoch": 1.4891294205467949, + "grad_norm": 1.3782790073788207, + "learning_rate": 0.0002643499696871406, + "loss": 7.6599, + "step": 15959 + }, + { + "epoch": 1.489222730241672, + "grad_norm": 1.9923502835972655, + "learning_rate": 0.00026434507437526005, + "loss": 7.3597, + "step": 15960 + }, + { + "epoch": 1.4893160399365495, + "grad_norm": 0.7781564913421363, + "learning_rate": 0.00026434017877263386, + "loss": 7.7312, + "step": 15961 + }, + { + "epoch": 1.4894093496314267, + "grad_norm": 1.0328761647725135, + "learning_rate": 0.00026433528287927457, + "loss": 7.5407, + "step": 15962 + }, + { + "epoch": 1.489502659326304, + "grad_norm": 0.7649033239882702, + "learning_rate": 0.00026433038669519443, + "loss": 7.4901, + "step": 15963 + }, + { + "epoch": 1.4895959690211813, + "grad_norm": 1.0119915093911376, + "learning_rate": 0.00026432549022040603, + "loss": 6.9588, + "step": 15964 + }, + { + "epoch": 1.4896892787160585, + "grad_norm": 1.0764807811465777, + "learning_rate": 0.0002643205934549217, + "loss": 7.5077, + "step": 15965 + }, + { + "epoch": 1.489782588410936, + "grad_norm": 1.1390382207385408, + "learning_rate": 0.00026431569639875403, + "loss": 7.1602, + "step": 15966 + }, + { + "epoch": 1.4898758981058131, + "grad_norm": 2.073205199639433, + "learning_rate": 0.0002643107990519154, + "loss": 7.3676, + "step": 15967 + }, + { + "epoch": 1.4899692078006905, + "grad_norm": 1.8927279881500794, + "learning_rate": 0.0002643059014144183, + "loss": 7.1173, + "step": 15968 + }, + { + "epoch": 1.4900625174955677, + "grad_norm": 1.8663537808871686, + "learning_rate": 0.0002643010034862751, + "loss": 7.462, + "step": 15969 + }, + { + "epoch": 1.4901558271904451, + "grad_norm": 0.9008419662088057, + "learning_rate": 0.00026429610526749835, + "loss": 7.3529, + "step": 15970 + }, + { + "epoch": 1.4902491368853223, + "grad_norm": 0.8597965765107702, + "learning_rate": 0.00026429120675810044, + "loss": 7.1536, + "step": 15971 + }, + { + "epoch": 1.4903424465801998, + "grad_norm": 1.8284240521201207, + "learning_rate": 0.00026428630795809384, + "loss": 7.1057, + "step": 15972 + }, + { + "epoch": 1.490435756275077, + "grad_norm": 1.2444770258279618, + "learning_rate": 0.000264281408867491, + "loss": 7.4726, + "step": 15973 + }, + { + "epoch": 1.4905290659699544, + "grad_norm": 1.5710055341828906, + "learning_rate": 0.0002642765094863044, + "loss": 7.3371, + "step": 15974 + }, + { + "epoch": 1.4906223756648316, + "grad_norm": 1.6473145938579876, + "learning_rate": 0.0002642716098145465, + "loss": 7.2252, + "step": 15975 + }, + { + "epoch": 1.4907156853597088, + "grad_norm": 0.5084324029693147, + "learning_rate": 0.00026426670985222977, + "loss": 7.6039, + "step": 15976 + }, + { + "epoch": 1.4908089950545862, + "grad_norm": 1.2919158211520754, + "learning_rate": 0.0002642618095993666, + "loss": 7.8218, + "step": 15977 + }, + { + "epoch": 1.4909023047494634, + "grad_norm": 1.100330617712844, + "learning_rate": 0.00026425690905596954, + "loss": 7.6385, + "step": 15978 + }, + { + "epoch": 1.4909956144443408, + "grad_norm": 0.6598936314928977, + "learning_rate": 0.00026425200822205095, + "loss": 7.4128, + "step": 15979 + }, + { + "epoch": 1.491088924139218, + "grad_norm": 0.5810751996184186, + "learning_rate": 0.0002642471070976234, + "loss": 7.1924, + "step": 15980 + }, + { + "epoch": 1.4911822338340954, + "grad_norm": 0.904332797199028, + "learning_rate": 0.00026424220568269926, + "loss": 6.9504, + "step": 15981 + }, + { + "epoch": 1.4912755435289726, + "grad_norm": 0.6719241985643453, + "learning_rate": 0.00026423730397729105, + "loss": 7.3118, + "step": 15982 + }, + { + "epoch": 1.4913688532238498, + "grad_norm": 0.612441220094776, + "learning_rate": 0.0002642324019814112, + "loss": 7.12, + "step": 15983 + }, + { + "epoch": 1.4914621629187272, + "grad_norm": 0.5414992930058226, + "learning_rate": 0.00026422749969507223, + "loss": 7.4105, + "step": 15984 + }, + { + "epoch": 1.4915554726136047, + "grad_norm": 0.945327669789658, + "learning_rate": 0.0002642225971182865, + "loss": 7.0236, + "step": 15985 + }, + { + "epoch": 1.4916487823084819, + "grad_norm": 0.6195737801413247, + "learning_rate": 0.0002642176942510666, + "loss": 7.4148, + "step": 15986 + }, + { + "epoch": 1.491742092003359, + "grad_norm": 0.6558440069720652, + "learning_rate": 0.0002642127910934249, + "loss": 7.7222, + "step": 15987 + }, + { + "epoch": 1.4918354016982365, + "grad_norm": 0.699166281245818, + "learning_rate": 0.0002642078876453739, + "loss": 7.3859, + "step": 15988 + }, + { + "epoch": 1.4919287113931137, + "grad_norm": 1.0439937088045579, + "learning_rate": 0.00026420298390692615, + "loss": 7.413, + "step": 15989 + }, + { + "epoch": 1.492022021087991, + "grad_norm": 1.120406025980284, + "learning_rate": 0.00026419807987809393, + "loss": 7.2739, + "step": 15990 + }, + { + "epoch": 1.4921153307828683, + "grad_norm": 1.9524898930436951, + "learning_rate": 0.0002641931755588899, + "loss": 7.0219, + "step": 15991 + }, + { + "epoch": 1.4922086404777457, + "grad_norm": 0.6665988857460715, + "learning_rate": 0.0002641882709493264, + "loss": 7.6283, + "step": 15992 + }, + { + "epoch": 1.492301950172623, + "grad_norm": 10.359273612691366, + "learning_rate": 0.000264183366049416, + "loss": 7.5811, + "step": 15993 + }, + { + "epoch": 1.4923952598675, + "grad_norm": 1.3385707470705417, + "learning_rate": 0.0002641784608591711, + "loss": 7.5442, + "step": 15994 + }, + { + "epoch": 1.4924885695623775, + "grad_norm": 0.8882320064922556, + "learning_rate": 0.0002641735553786042, + "loss": 7.269, + "step": 15995 + }, + { + "epoch": 1.492581879257255, + "grad_norm": 0.4963662549275527, + "learning_rate": 0.00026416864960772775, + "loss": 7.2902, + "step": 15996 + }, + { + "epoch": 1.4926751889521321, + "grad_norm": 0.5010089532199018, + "learning_rate": 0.0002641637435465543, + "loss": 7.195, + "step": 15997 + }, + { + "epoch": 1.4927684986470093, + "grad_norm": 0.3560879160205173, + "learning_rate": 0.0002641588371950962, + "loss": 7.3855, + "step": 15998 + }, + { + "epoch": 1.4928618083418868, + "grad_norm": 0.6831666008067464, + "learning_rate": 0.000264153930553366, + "loss": 7.6859, + "step": 15999 + }, + { + "epoch": 1.492955118036764, + "grad_norm": 0.37087949129962633, + "learning_rate": 0.0002641490236213762, + "loss": 7.4632, + "step": 16000 + }, + { + "epoch": 1.4930484277316414, + "grad_norm": 1.2676699704194483, + "learning_rate": 0.00026414411639913925, + "loss": 7.0714, + "step": 16001 + }, + { + "epoch": 1.4931417374265186, + "grad_norm": 0.5753133821261559, + "learning_rate": 0.0002641392088866676, + "loss": 7.271, + "step": 16002 + }, + { + "epoch": 1.493235047121396, + "grad_norm": 0.3683298413738802, + "learning_rate": 0.0002641343010839738, + "loss": 7.3678, + "step": 16003 + }, + { + "epoch": 1.4933283568162732, + "grad_norm": 0.6093683292704326, + "learning_rate": 0.00026412939299107023, + "loss": 7.3518, + "step": 16004 + }, + { + "epoch": 1.4934216665111504, + "grad_norm": 0.8121210410917752, + "learning_rate": 0.00026412448460796944, + "loss": 7.4888, + "step": 16005 + }, + { + "epoch": 1.4935149762060278, + "grad_norm": 3.210698179510486, + "learning_rate": 0.0002641195759346839, + "loss": 7.0262, + "step": 16006 + }, + { + "epoch": 1.4936082859009052, + "grad_norm": 0.6248857964866962, + "learning_rate": 0.00026411466697122605, + "loss": 7.6768, + "step": 16007 + }, + { + "epoch": 1.4937015955957824, + "grad_norm": 0.871213922145211, + "learning_rate": 0.00026410975771760844, + "loss": 7.2987, + "step": 16008 + }, + { + "epoch": 1.4937949052906596, + "grad_norm": 1.0007541060705154, + "learning_rate": 0.0002641048481738435, + "loss": 7.2818, + "step": 16009 + }, + { + "epoch": 1.493888214985537, + "grad_norm": 0.5984183871225233, + "learning_rate": 0.00026409993833994373, + "loss": 7.31, + "step": 16010 + }, + { + "epoch": 1.4939815246804142, + "grad_norm": 0.4955193983322633, + "learning_rate": 0.0002640950282159216, + "loss": 7.3212, + "step": 16011 + }, + { + "epoch": 1.4940748343752916, + "grad_norm": 0.6068306314768648, + "learning_rate": 0.00026409011780178967, + "loss": 6.8844, + "step": 16012 + }, + { + "epoch": 1.4941681440701688, + "grad_norm": 0.73797359967715, + "learning_rate": 0.0002640852070975603, + "loss": 7.3199, + "step": 16013 + }, + { + "epoch": 1.4942614537650463, + "grad_norm": 0.5102189979380426, + "learning_rate": 0.00026408029610324607, + "loss": 7.1128, + "step": 16014 + }, + { + "epoch": 1.4943547634599235, + "grad_norm": 0.6815136339281175, + "learning_rate": 0.0002640753848188594, + "loss": 7.3007, + "step": 16015 + }, + { + "epoch": 1.4944480731548007, + "grad_norm": 1.0865262703591094, + "learning_rate": 0.0002640704732444129, + "loss": 7.7436, + "step": 16016 + }, + { + "epoch": 1.494541382849678, + "grad_norm": 0.639757355011183, + "learning_rate": 0.0002640655613799189, + "loss": 7.3583, + "step": 16017 + }, + { + "epoch": 1.4946346925445555, + "grad_norm": 1.178139302749864, + "learning_rate": 0.00026406064922539007, + "loss": 7.2638, + "step": 16018 + }, + { + "epoch": 1.4947280022394327, + "grad_norm": 1.844644002014902, + "learning_rate": 0.0002640557367808387, + "loss": 7.0081, + "step": 16019 + }, + { + "epoch": 1.49482131193431, + "grad_norm": 0.793479550558659, + "learning_rate": 0.00026405082404627744, + "loss": 7.266, + "step": 16020 + }, + { + "epoch": 1.4949146216291873, + "grad_norm": 0.4124689844824099, + "learning_rate": 0.00026404591102171865, + "loss": 7.2272, + "step": 16021 + }, + { + "epoch": 1.4950079313240645, + "grad_norm": 1.4993893532913871, + "learning_rate": 0.00026404099770717496, + "loss": 7.4214, + "step": 16022 + }, + { + "epoch": 1.495101241018942, + "grad_norm": 1.252817938247658, + "learning_rate": 0.0002640360841026587, + "loss": 7.094, + "step": 16023 + }, + { + "epoch": 1.4951945507138191, + "grad_norm": 1.4679941959810223, + "learning_rate": 0.0002640311702081826, + "loss": 7.2904, + "step": 16024 + }, + { + "epoch": 1.4952878604086965, + "grad_norm": 0.6559729117224834, + "learning_rate": 0.0002640262560237589, + "loss": 7.1352, + "step": 16025 + }, + { + "epoch": 1.4953811701035737, + "grad_norm": 0.31799786827248044, + "learning_rate": 0.0002640213415494002, + "loss": 6.914, + "step": 16026 + }, + { + "epoch": 1.495474479798451, + "grad_norm": 0.8547540550070261, + "learning_rate": 0.000264016426785119, + "loss": 7.2516, + "step": 16027 + }, + { + "epoch": 1.4955677894933284, + "grad_norm": 0.43603426458054517, + "learning_rate": 0.0002640115117309279, + "loss": 7.3679, + "step": 16028 + }, + { + "epoch": 1.4956610991882058, + "grad_norm": 0.6561030647486403, + "learning_rate": 0.0002640065963868392, + "loss": 7.1319, + "step": 16029 + }, + { + "epoch": 1.495754408883083, + "grad_norm": 0.7592439714775154, + "learning_rate": 0.00026400168075286557, + "loss": 7.3661, + "step": 16030 + }, + { + "epoch": 1.4958477185779602, + "grad_norm": 1.2276927745185813, + "learning_rate": 0.0002639967648290194, + "loss": 6.8779, + "step": 16031 + }, + { + "epoch": 1.4959410282728376, + "grad_norm": 0.7914146115956949, + "learning_rate": 0.00026399184861531326, + "loss": 7.2472, + "step": 16032 + }, + { + "epoch": 1.4960343379677148, + "grad_norm": 0.6528443696538427, + "learning_rate": 0.00026398693211175954, + "loss": 7.0797, + "step": 16033 + }, + { + "epoch": 1.4961276476625922, + "grad_norm": 1.0418290433912067, + "learning_rate": 0.0002639820153183709, + "loss": 7.1767, + "step": 16034 + }, + { + "epoch": 1.4962209573574694, + "grad_norm": 0.9416133659101731, + "learning_rate": 0.0002639770982351597, + "loss": 7.377, + "step": 16035 + }, + { + "epoch": 1.4963142670523468, + "grad_norm": 1.0023001396013742, + "learning_rate": 0.00026397218086213854, + "loss": 7.5582, + "step": 16036 + }, + { + "epoch": 1.496407576747224, + "grad_norm": 0.522467010317882, + "learning_rate": 0.00026396726319931986, + "loss": 7.4323, + "step": 16037 + }, + { + "epoch": 1.4965008864421012, + "grad_norm": 0.7958251075413528, + "learning_rate": 0.00026396234524671616, + "loss": 7.5058, + "step": 16038 + }, + { + "epoch": 1.4965941961369786, + "grad_norm": 2.132899979141355, + "learning_rate": 0.00026395742700434003, + "loss": 7.0535, + "step": 16039 + }, + { + "epoch": 1.496687505831856, + "grad_norm": 1.0942058475548009, + "learning_rate": 0.0002639525084722039, + "loss": 7.0931, + "step": 16040 + }, + { + "epoch": 1.4967808155267333, + "grad_norm": 0.6524967621663866, + "learning_rate": 0.0002639475896503203, + "loss": 7.4145, + "step": 16041 + }, + { + "epoch": 1.4968741252216105, + "grad_norm": 1.0613121221265145, + "learning_rate": 0.00026394267053870174, + "loss": 7.1162, + "step": 16042 + }, + { + "epoch": 1.4969674349164879, + "grad_norm": 1.1396342112272921, + "learning_rate": 0.0002639377511373607, + "loss": 6.7797, + "step": 16043 + }, + { + "epoch": 1.497060744611365, + "grad_norm": 1.1651440165767757, + "learning_rate": 0.0002639328314463098, + "loss": 7.259, + "step": 16044 + }, + { + "epoch": 1.4971540543062425, + "grad_norm": 0.5321829166354559, + "learning_rate": 0.00026392791146556135, + "loss": 7.2469, + "step": 16045 + }, + { + "epoch": 1.4972473640011197, + "grad_norm": 1.2532368654453354, + "learning_rate": 0.00026392299119512797, + "loss": 6.8277, + "step": 16046 + }, + { + "epoch": 1.497340673695997, + "grad_norm": 0.8104709345858487, + "learning_rate": 0.0002639180706350222, + "loss": 7.0474, + "step": 16047 + }, + { + "epoch": 1.4974339833908743, + "grad_norm": 3.0944082357202043, + "learning_rate": 0.0002639131497852565, + "loss": 7.2274, + "step": 16048 + }, + { + "epoch": 1.4975272930857515, + "grad_norm": 1.0836112567206246, + "learning_rate": 0.00026390822864584345, + "loss": 7.3422, + "step": 16049 + }, + { + "epoch": 1.497620602780629, + "grad_norm": 1.5468065070483834, + "learning_rate": 0.0002639033072167955, + "loss": 7.2973, + "step": 16050 + }, + { + "epoch": 1.4977139124755063, + "grad_norm": 1.783288825457733, + "learning_rate": 0.00026389838549812516, + "loss": 6.9485, + "step": 16051 + }, + { + "epoch": 1.4978072221703835, + "grad_norm": 1.0048971443031267, + "learning_rate": 0.000263893463489845, + "loss": 7.0717, + "step": 16052 + }, + { + "epoch": 1.4979005318652607, + "grad_norm": 1.17632632103523, + "learning_rate": 0.0002638885411919674, + "loss": 7.409, + "step": 16053 + }, + { + "epoch": 1.4979938415601382, + "grad_norm": 2.1704453047080268, + "learning_rate": 0.00026388361860450515, + "loss": 7.7508, + "step": 16054 + }, + { + "epoch": 1.4980871512550153, + "grad_norm": 1.413298305457329, + "learning_rate": 0.0002638786957274704, + "loss": 7.3013, + "step": 16055 + }, + { + "epoch": 1.4981804609498928, + "grad_norm": 0.7141179185562275, + "learning_rate": 0.00026387377256087603, + "loss": 7.3411, + "step": 16056 + }, + { + "epoch": 1.49827377064477, + "grad_norm": 0.35687347257183394, + "learning_rate": 0.0002638688491047343, + "loss": 7.1259, + "step": 16057 + }, + { + "epoch": 1.4983670803396474, + "grad_norm": 0.7751584696586841, + "learning_rate": 0.0002638639253590578, + "loss": 7.1489, + "step": 16058 + }, + { + "epoch": 1.4984603900345246, + "grad_norm": 1.0101525232289386, + "learning_rate": 0.0002638590013238591, + "loss": 7.0756, + "step": 16059 + }, + { + "epoch": 1.4985536997294018, + "grad_norm": 0.9934701955389494, + "learning_rate": 0.0002638540769991507, + "loss": 7.1861, + "step": 16060 + }, + { + "epoch": 1.4986470094242792, + "grad_norm": 0.9969436810398039, + "learning_rate": 0.0002638491523849451, + "loss": 7.0301, + "step": 16061 + }, + { + "epoch": 1.4987403191191566, + "grad_norm": 1.2195836227694485, + "learning_rate": 0.0002638442274812548, + "loss": 7.5252, + "step": 16062 + }, + { + "epoch": 1.4988336288140338, + "grad_norm": 22.23120307386295, + "learning_rate": 0.0002638393022880924, + "loss": 7.2536, + "step": 16063 + }, + { + "epoch": 1.498926938508911, + "grad_norm": 2.453274956035572, + "learning_rate": 0.00026383437680547035, + "loss": 7.1511, + "step": 16064 + }, + { + "epoch": 1.4990202482037884, + "grad_norm": 3.82716400391399, + "learning_rate": 0.0002638294510334012, + "loss": 7.0808, + "step": 16065 + }, + { + "epoch": 1.4991135578986656, + "grad_norm": 574.8758409716219, + "learning_rate": 0.00026382452497189746, + "loss": 7.5038, + "step": 16066 + }, + { + "epoch": 1.499206867593543, + "grad_norm": 2.0372712113255784, + "learning_rate": 0.00026381959862097174, + "loss": 7.4031, + "step": 16067 + }, + { + "epoch": 1.4993001772884202, + "grad_norm": 442.16460820182556, + "learning_rate": 0.0002638146719806364, + "loss": 7.2118, + "step": 16068 + }, + { + "epoch": 1.4993934869832977, + "grad_norm": 1.4961086212134547, + "learning_rate": 0.0002638097450509041, + "loss": 7.2576, + "step": 16069 + }, + { + "epoch": 1.4994867966781749, + "grad_norm": 2.7854175058411768, + "learning_rate": 0.00026380481783178734, + "loss": 7.1506, + "step": 16070 + }, + { + "epoch": 1.499580106373052, + "grad_norm": 3.8006859668957538, + "learning_rate": 0.00026379989032329865, + "loss": 7.3904, + "step": 16071 + }, + { + "epoch": 1.4996734160679295, + "grad_norm": 4.293676594398734, + "learning_rate": 0.00026379496252545054, + "loss": 7.6192, + "step": 16072 + }, + { + "epoch": 1.4997667257628067, + "grad_norm": 4.163739101620246, + "learning_rate": 0.00026379003443825554, + "loss": 7.6039, + "step": 16073 + }, + { + "epoch": 1.499860035457684, + "grad_norm": 442.17905356509846, + "learning_rate": 0.0002637851060617262, + "loss": 7.4848, + "step": 16074 + }, + { + "epoch": 1.4999533451525613, + "grad_norm": 1060.043279486798, + "learning_rate": 0.00026378017739587503, + "loss": 7.3678, + "step": 16075 + }, + { + "epoch": 1.5000466548474387, + "grad_norm": 0.9481624053922905, + "learning_rate": 0.0002637752484407146, + "loss": 7.461, + "step": 16076 + }, + { + "epoch": 1.500139964542316, + "grad_norm": 262.05257383219254, + "learning_rate": 0.0002637703191962574, + "loss": 7.4531, + "step": 16077 + }, + { + "epoch": 1.500233274237193, + "grad_norm": 3.14204334644795, + "learning_rate": 0.00026376538966251596, + "loss": 6.9356, + "step": 16078 + }, + { + "epoch": 1.5003265839320705, + "grad_norm": 3.2112826706582065, + "learning_rate": 0.00026376045983950283, + "loss": 7.4396, + "step": 16079 + }, + { + "epoch": 1.500419893626948, + "grad_norm": 3.9898576801113323, + "learning_rate": 0.00026375552972723057, + "loss": 7.3831, + "step": 16080 + }, + { + "epoch": 1.5005132033218251, + "grad_norm": 4.034948207872827, + "learning_rate": 0.0002637505993257117, + "loss": 7.3417, + "step": 16081 + }, + { + "epoch": 1.5006065130167023, + "grad_norm": 1181.8218426828769, + "learning_rate": 0.0002637456686349587, + "loss": 7.2631, + "step": 16082 + }, + { + "epoch": 1.5006998227115798, + "grad_norm": 795.7279684287407, + "learning_rate": 0.0002637407376549843, + "loss": 7.3028, + "step": 16083 + }, + { + "epoch": 1.5007931324064572, + "grad_norm": 2.385963298727781, + "learning_rate": 0.00026373580638580077, + "loss": 7.2233, + "step": 16084 + }, + { + "epoch": 1.5008864421013344, + "grad_norm": 1.83271722911614, + "learning_rate": 0.0002637308748274208, + "loss": 7.2699, + "step": 16085 + }, + { + "epoch": 1.5009797517962116, + "grad_norm": 0.7865981636226185, + "learning_rate": 0.00026372594297985693, + "loss": 7.3292, + "step": 16086 + }, + { + "epoch": 1.501073061491089, + "grad_norm": 3440.10151990139, + "learning_rate": 0.00026372101084312167, + "loss": 7.4015, + "step": 16087 + }, + { + "epoch": 1.5011663711859662, + "grad_norm": 1.1466981347839922, + "learning_rate": 0.0002637160784172275, + "loss": 7.6232, + "step": 16088 + }, + { + "epoch": 1.5012596808808434, + "grad_norm": 1.163497196326042, + "learning_rate": 0.0002637111457021871, + "loss": 7.5276, + "step": 16089 + }, + { + "epoch": 1.5013529905757208, + "grad_norm": 1.0005297953119074, + "learning_rate": 0.0002637062126980129, + "loss": 7.4851, + "step": 16090 + }, + { + "epoch": 1.5014463002705982, + "grad_norm": 1.0156770172160583, + "learning_rate": 0.0002637012794047175, + "loss": 7.2481, + "step": 16091 + }, + { + "epoch": 1.5015396099654754, + "grad_norm": 6125989.136249568, + "learning_rate": 0.00026369634582231344, + "loss": 7.5959, + "step": 16092 + }, + { + "epoch": 1.5016329196603526, + "grad_norm": 1.0551422186553716, + "learning_rate": 0.00026369141195081323, + "loss": 7.3675, + "step": 16093 + }, + { + "epoch": 1.50172622935523, + "grad_norm": 1.9675907949205411, + "learning_rate": 0.0002636864777902295, + "loss": 7.1798, + "step": 16094 + }, + { + "epoch": 1.5018195390501075, + "grad_norm": 4.24946128338369, + "learning_rate": 0.00026368154334057467, + "loss": 8.0445, + "step": 16095 + }, + { + "epoch": 1.5019128487449847, + "grad_norm": 2.2660955297247476, + "learning_rate": 0.0002636766086018613, + "loss": 7.5377, + "step": 16096 + }, + { + "epoch": 1.5020061584398618, + "grad_norm": 2.5223607390691365, + "learning_rate": 0.0002636716735741021, + "loss": 7.3111, + "step": 16097 + }, + { + "epoch": 1.5020994681347393, + "grad_norm": 2.1802304761956615, + "learning_rate": 0.0002636667382573094, + "loss": 7.4892, + "step": 16098 + }, + { + "epoch": 1.5021927778296165, + "grad_norm": 2.0078838707507116, + "learning_rate": 0.00026366180265149594, + "loss": 7.2674, + "step": 16099 + }, + { + "epoch": 1.5022860875244937, + "grad_norm": 1.887557531798618, + "learning_rate": 0.0002636568667566742, + "loss": 7.6931, + "step": 16100 + }, + { + "epoch": 1.502379397219371, + "grad_norm": 1127.034836952036, + "learning_rate": 0.0002636519305728566, + "loss": 7.6116, + "step": 16101 + }, + { + "epoch": 1.5024727069142485, + "grad_norm": 1.3441905456229584, + "learning_rate": 0.0002636469941000559, + "loss": 7.6361, + "step": 16102 + }, + { + "epoch": 1.5025660166091257, + "grad_norm": 0.7213790056980074, + "learning_rate": 0.00026364205733828453, + "loss": 7.2922, + "step": 16103 + }, + { + "epoch": 1.502659326304003, + "grad_norm": 4885.135028135581, + "learning_rate": 0.0002636371202875551, + "loss": 7.0445, + "step": 16104 + }, + { + "epoch": 1.5027526359988803, + "grad_norm": 1.0775483339576153, + "learning_rate": 0.0002636321829478801, + "loss": 7.2073, + "step": 16105 + }, + { + "epoch": 1.5028459456937577, + "grad_norm": 1.001817445755618, + "learning_rate": 0.0002636272453192721, + "loss": 7.4139, + "step": 16106 + }, + { + "epoch": 1.502939255388635, + "grad_norm": 1.1494331920975487, + "learning_rate": 0.00026362230740174374, + "loss": 7.2106, + "step": 16107 + }, + { + "epoch": 1.5030325650835121, + "grad_norm": 1.1708516065166397, + "learning_rate": 0.00026361736919530747, + "loss": 7.251, + "step": 16108 + }, + { + "epoch": 1.5031258747783895, + "grad_norm": 1.327183747952327, + "learning_rate": 0.00026361243069997594, + "loss": 7.3076, + "step": 16109 + }, + { + "epoch": 1.5032191844732667, + "grad_norm": 2061.6821932100142, + "learning_rate": 0.00026360749191576155, + "loss": 7.4837, + "step": 16110 + }, + { + "epoch": 1.503312494168144, + "grad_norm": 0.7459517660121163, + "learning_rate": 0.00026360255284267706, + "loss": 7.2702, + "step": 16111 + }, + { + "epoch": 1.5034058038630214, + "grad_norm": 0.5158025261908648, + "learning_rate": 0.00026359761348073487, + "loss": 7.3116, + "step": 16112 + }, + { + "epoch": 1.5034991135578988, + "grad_norm": 0.5679447432516563, + "learning_rate": 0.0002635926738299476, + "loss": 7.0936, + "step": 16113 + }, + { + "epoch": 1.503592423252776, + "grad_norm": 3373.549045989867, + "learning_rate": 0.00026358773389032787, + "loss": 7.6141, + "step": 16114 + }, + { + "epoch": 1.5036857329476532, + "grad_norm": 0.631542545681325, + "learning_rate": 0.0002635827936618881, + "loss": 7.2811, + "step": 16115 + }, + { + "epoch": 1.5037790426425306, + "grad_norm": 0.4961058412619321, + "learning_rate": 0.00026357785314464095, + "loss": 6.983, + "step": 16116 + }, + { + "epoch": 1.503872352337408, + "grad_norm": 0.6074432531267557, + "learning_rate": 0.000263572912338599, + "loss": 7.2961, + "step": 16117 + }, + { + "epoch": 1.503965662032285, + "grad_norm": 0.4095719349943269, + "learning_rate": 0.0002635679712437748, + "loss": 7.0997, + "step": 16118 + }, + { + "epoch": 1.5040589717271624, + "grad_norm": 0.9389664393319335, + "learning_rate": 0.0002635630298601808, + "loss": 7.304, + "step": 16119 + }, + { + "epoch": 1.5041522814220398, + "grad_norm": 1636.7697234975449, + "learning_rate": 0.0002635580881878297, + "loss": 7.324, + "step": 16120 + }, + { + "epoch": 1.504245591116917, + "grad_norm": 0.9610655147974656, + "learning_rate": 0.000263553146226734, + "loss": 7.2877, + "step": 16121 + }, + { + "epoch": 1.5043389008117942, + "grad_norm": 0.47958268878527904, + "learning_rate": 0.00026354820397690633, + "loss": 6.9352, + "step": 16122 + }, + { + "epoch": 1.5044322105066716, + "grad_norm": 1.3256246978815018, + "learning_rate": 0.0002635432614383592, + "loss": 7.5941, + "step": 16123 + }, + { + "epoch": 1.504525520201549, + "grad_norm": 10201.276807881452, + "learning_rate": 0.0002635383186111052, + "loss": 7.147, + "step": 16124 + }, + { + "epoch": 1.5046188298964263, + "grad_norm": 0.42811456314738683, + "learning_rate": 0.00026353337549515687, + "loss": 6.9926, + "step": 16125 + }, + { + "epoch": 1.5047121395913035, + "grad_norm": 0.8148180087422993, + "learning_rate": 0.0002635284320905268, + "loss": 7.55, + "step": 16126 + }, + { + "epoch": 1.5048054492861809, + "grad_norm": 1.0437970471403557, + "learning_rate": 0.00026352348839722754, + "loss": 7.5459, + "step": 16127 + }, + { + "epoch": 1.5048987589810583, + "grad_norm": 0.826263853900897, + "learning_rate": 0.0002635185444152717, + "loss": 6.7499, + "step": 16128 + }, + { + "epoch": 1.5049920686759353, + "grad_norm": 8437.350425599629, + "learning_rate": 0.0002635136001446718, + "loss": 6.9118, + "step": 16129 + }, + { + "epoch": 1.5050853783708127, + "grad_norm": 0.5498903104506899, + "learning_rate": 0.00026350865558544045, + "loss": 7.1619, + "step": 16130 + }, + { + "epoch": 1.50517868806569, + "grad_norm": 0.5486033663048457, + "learning_rate": 0.00026350371073759026, + "loss": 7.4976, + "step": 16131 + }, + { + "epoch": 1.5052719977605673, + "grad_norm": 0.8134382358647781, + "learning_rate": 0.0002634987656011337, + "loss": 7.0731, + "step": 16132 + }, + { + "epoch": 1.5053653074554445, + "grad_norm": 4475.905293649513, + "learning_rate": 0.0002634938201760834, + "loss": 6.8865, + "step": 16133 + }, + { + "epoch": 1.505458617150322, + "grad_norm": 9550.089683888893, + "learning_rate": 0.000263488874462452, + "loss": 6.8778, + "step": 16134 + }, + { + "epoch": 1.5055519268451993, + "grad_norm": 0.5044471490339497, + "learning_rate": 0.0002634839284602519, + "loss": 7.3494, + "step": 16135 + }, + { + "epoch": 1.5056452365400765, + "grad_norm": 0.555491723379868, + "learning_rate": 0.00026347898216949584, + "loss": 7.3563, + "step": 16136 + }, + { + "epoch": 1.5057385462349537, + "grad_norm": 0.5370481043200792, + "learning_rate": 0.00026347403559019635, + "loss": 7.3551, + "step": 16137 + }, + { + "epoch": 1.5058318559298312, + "grad_norm": 0.4102115963756626, + "learning_rate": 0.00026346908872236605, + "loss": 7.2527, + "step": 16138 + }, + { + "epoch": 1.5059251656247086, + "grad_norm": 5574.170206723464, + "learning_rate": 0.0002634641415660174, + "loss": 7.2774, + "step": 16139 + }, + { + "epoch": 1.5060184753195855, + "grad_norm": 7766.758539588873, + "learning_rate": 0.00026345919412116306, + "loss": 7.1701, + "step": 16140 + }, + { + "epoch": 1.506111785014463, + "grad_norm": 0.5018076155210149, + "learning_rate": 0.00026345424638781563, + "loss": 7.4042, + "step": 16141 + }, + { + "epoch": 1.5062050947093404, + "grad_norm": 3069.5530737412846, + "learning_rate": 0.00026344929836598764, + "loss": 7.1751, + "step": 16142 + }, + { + "epoch": 1.5062984044042176, + "grad_norm": 498.11053802624093, + "learning_rate": 0.0002634443500556917, + "loss": 7.0775, + "step": 16143 + }, + { + "epoch": 1.5063917140990948, + "grad_norm": 0.6754662144200151, + "learning_rate": 0.00026343940145694036, + "loss": 7.326, + "step": 16144 + }, + { + "epoch": 1.5064850237939722, + "grad_norm": 1.5009623648645247, + "learning_rate": 0.0002634344525697462, + "loss": 6.8375, + "step": 16145 + }, + { + "epoch": 1.5065783334888496, + "grad_norm": 0.7364838624488188, + "learning_rate": 0.0002634295033941219, + "loss": 7.2581, + "step": 16146 + }, + { + "epoch": 1.5066716431837268, + "grad_norm": 3582.759982758322, + "learning_rate": 0.0002634245539300799, + "loss": 7.1779, + "step": 16147 + }, + { + "epoch": 1.506764952878604, + "grad_norm": 0.5014287193131068, + "learning_rate": 0.0002634196041776329, + "loss": 7.0555, + "step": 16148 + }, + { + "epoch": 1.5068582625734814, + "grad_norm": 0.4376288084948978, + "learning_rate": 0.00026341465413679346, + "loss": 7.378, + "step": 16149 + }, + { + "epoch": 1.5069515722683586, + "grad_norm": 0.34590099708774497, + "learning_rate": 0.00026340970380757416, + "loss": 7.2149, + "step": 16150 + }, + { + "epoch": 1.5070448819632358, + "grad_norm": 0.6349320116265793, + "learning_rate": 0.0002634047531899875, + "loss": 7.2487, + "step": 16151 + }, + { + "epoch": 1.5071381916581132, + "grad_norm": 2847.5928226708666, + "learning_rate": 0.0002633998022840462, + "loss": 7.1886, + "step": 16152 + }, + { + "epoch": 1.5072315013529907, + "grad_norm": 0.9100612988596005, + "learning_rate": 0.0002633948510897628, + "loss": 7.315, + "step": 16153 + }, + { + "epoch": 1.5073248110478679, + "grad_norm": 2.17869860674605, + "learning_rate": 0.00026338989960714984, + "loss": 7.5397, + "step": 16154 + }, + { + "epoch": 1.507418120742745, + "grad_norm": 0.9714653560656653, + "learning_rate": 0.00026338494783622, + "loss": 7.2323, + "step": 16155 + }, + { + "epoch": 1.5075114304376225, + "grad_norm": 0.682241602147058, + "learning_rate": 0.0002633799957769858, + "loss": 7.0842, + "step": 16156 + }, + { + "epoch": 1.5076047401325, + "grad_norm": 0.5500488191854352, + "learning_rate": 0.0002633750434294599, + "loss": 7.3462, + "step": 16157 + }, + { + "epoch": 1.507698049827377, + "grad_norm": 6667.680898589121, + "learning_rate": 0.0002633700907936548, + "loss": 7.4885, + "step": 16158 + }, + { + "epoch": 1.5077913595222543, + "grad_norm": 1.117282999108137, + "learning_rate": 0.00026336513786958315, + "loss": 6.9586, + "step": 16159 + }, + { + "epoch": 1.5078846692171317, + "grad_norm": 1.0392603908066853, + "learning_rate": 0.00026336018465725756, + "loss": 7.0465, + "step": 16160 + }, + { + "epoch": 1.507977978912009, + "grad_norm": 0.7908361595713966, + "learning_rate": 0.0002633552311566906, + "loss": 7.0435, + "step": 16161 + }, + { + "epoch": 1.508071288606886, + "grad_norm": 28541.52379147602, + "learning_rate": 0.0002633502773678948, + "loss": 7.0157, + "step": 16162 + }, + { + "epoch": 1.5081645983017635, + "grad_norm": 0.9592588472820012, + "learning_rate": 0.0002633453232908829, + "loss": 7.3945, + "step": 16163 + }, + { + "epoch": 1.508257907996641, + "grad_norm": 0.8075933245152241, + "learning_rate": 0.00026334036892566734, + "loss": 7.0593, + "step": 16164 + }, + { + "epoch": 1.5083512176915181, + "grad_norm": 1.3357692218334989, + "learning_rate": 0.0002633354142722609, + "loss": 7.227, + "step": 16165 + }, + { + "epoch": 1.5084445273863953, + "grad_norm": 1.6433687822427505, + "learning_rate": 0.000263330459330676, + "loss": 7.2835, + "step": 16166 + }, + { + "epoch": 1.5085378370812728, + "grad_norm": 1.407111505038818, + "learning_rate": 0.0002633255041009253, + "loss": 7.468, + "step": 16167 + }, + { + "epoch": 1.5086311467761502, + "grad_norm": 6684.80640205004, + "learning_rate": 0.0002633205485830215, + "loss": 7.2445, + "step": 16168 + }, + { + "epoch": 1.5087244564710274, + "grad_norm": 0.35570277708572445, + "learning_rate": 0.00026331559277697705, + "loss": 7.37, + "step": 16169 + }, + { + "epoch": 1.5088177661659046, + "grad_norm": 1.0205154482153211, + "learning_rate": 0.0002633106366828046, + "loss": 6.9954, + "step": 16170 + }, + { + "epoch": 1.508911075860782, + "grad_norm": 1.5125245564702663, + "learning_rate": 0.0002633056803005168, + "loss": 6.8569, + "step": 16171 + }, + { + "epoch": 1.5090043855556592, + "grad_norm": 0.9551699487698323, + "learning_rate": 0.0002633007236301262, + "loss": 7.2357, + "step": 16172 + }, + { + "epoch": 1.5090976952505364, + "grad_norm": 0.8072302662735825, + "learning_rate": 0.0002632957666716454, + "loss": 7.2448, + "step": 16173 + }, + { + "epoch": 1.5091910049454138, + "grad_norm": 0.7519480554062425, + "learning_rate": 0.00026329080942508713, + "loss": 7.2015, + "step": 16174 + }, + { + "epoch": 1.5092843146402912, + "grad_norm": 4869.816227358041, + "learning_rate": 0.00026328585189046375, + "loss": 7.2331, + "step": 16175 + }, + { + "epoch": 1.5093776243351684, + "grad_norm": 9803.040427522346, + "learning_rate": 0.0002632808940677881, + "loss": 7.2647, + "step": 16176 + }, + { + "epoch": 1.5094709340300456, + "grad_norm": 0.585657929338047, + "learning_rate": 0.00026327593595707263, + "loss": 7.3525, + "step": 16177 + }, + { + "epoch": 1.509564243724923, + "grad_norm": 1.0888567448637632, + "learning_rate": 0.00026327097755833004, + "loss": 7.3709, + "step": 16178 + }, + { + "epoch": 1.5096575534198005, + "grad_norm": 0.41260748908461414, + "learning_rate": 0.00026326601887157293, + "loss": 6.8229, + "step": 16179 + }, + { + "epoch": 1.5097508631146777, + "grad_norm": 0.7624224848890157, + "learning_rate": 0.0002632610598968139, + "loss": 7.2172, + "step": 16180 + }, + { + "epoch": 1.5098441728095549, + "grad_norm": 0.8755754249012462, + "learning_rate": 0.0002632561006340655, + "loss": 7.4357, + "step": 16181 + }, + { + "epoch": 1.5099374825044323, + "grad_norm": 0.8920006602437001, + "learning_rate": 0.00026325114108334037, + "loss": 6.7714, + "step": 16182 + }, + { + "epoch": 1.5100307921993095, + "grad_norm": 1.0554170827065295, + "learning_rate": 0.00026324618124465114, + "loss": 6.8445, + "step": 16183 + }, + { + "epoch": 1.5101241018941867, + "grad_norm": 0.48841945099270684, + "learning_rate": 0.00026324122111801045, + "loss": 7.3165, + "step": 16184 + }, + { + "epoch": 1.510217411589064, + "grad_norm": 12790.78242572778, + "learning_rate": 0.00026323626070343093, + "loss": 6.8918, + "step": 16185 + }, + { + "epoch": 1.5103107212839415, + "grad_norm": 0.36819702704187507, + "learning_rate": 0.00026323130000092504, + "loss": 7.3075, + "step": 16186 + }, + { + "epoch": 1.5104040309788187, + "grad_norm": 0.4958686567936898, + "learning_rate": 0.0002632263390105055, + "loss": 7.1789, + "step": 16187 + }, + { + "epoch": 1.510497340673696, + "grad_norm": 0.5273293879718236, + "learning_rate": 0.000263221377732185, + "loss": 7.4407, + "step": 16188 + }, + { + "epoch": 1.5105906503685733, + "grad_norm": 0.4224699207509297, + "learning_rate": 0.000263216416165976, + "loss": 7.2926, + "step": 16189 + }, + { + "epoch": 1.5106839600634507, + "grad_norm": 0.4960753540147342, + "learning_rate": 0.00026321145431189123, + "loss": 7.2442, + "step": 16190 + }, + { + "epoch": 1.510777269758328, + "grad_norm": 1.9739082200445837, + "learning_rate": 0.0002632064921699433, + "loss": 7.1188, + "step": 16191 + }, + { + "epoch": 1.5108705794532051, + "grad_norm": 0.4332789696847822, + "learning_rate": 0.0002632015297401447, + "loss": 6.9726, + "step": 16192 + }, + { + "epoch": 1.5109638891480826, + "grad_norm": 25464.45050249471, + "learning_rate": 0.0002631965670225082, + "loss": 6.983, + "step": 16193 + }, + { + "epoch": 1.5110571988429597, + "grad_norm": 0.5875049979037692, + "learning_rate": 0.00026319160401704636, + "loss": 7.1432, + "step": 16194 + }, + { + "epoch": 1.511150508537837, + "grad_norm": 14671.662309900466, + "learning_rate": 0.00026318664072377183, + "loss": 7.2652, + "step": 16195 + }, + { + "epoch": 1.5112438182327144, + "grad_norm": 0.7614974041655951, + "learning_rate": 0.00026318167714269714, + "loss": 7.327, + "step": 16196 + }, + { + "epoch": 1.5113371279275918, + "grad_norm": 1.9505058191377962, + "learning_rate": 0.000263176713273835, + "loss": 7.1607, + "step": 16197 + }, + { + "epoch": 1.511430437622469, + "grad_norm": 2.0403167586202295, + "learning_rate": 0.000263171749117198, + "loss": 7.2012, + "step": 16198 + }, + { + "epoch": 1.5115237473173462, + "grad_norm": 1.2903420562801824, + "learning_rate": 0.0002631667846727988, + "loss": 7.4173, + "step": 16199 + }, + { + "epoch": 1.5116170570122236, + "grad_norm": 1.932628285989399, + "learning_rate": 0.00026316181994065, + "loss": 7.519, + "step": 16200 + }, + { + "epoch": 1.511710366707101, + "grad_norm": 0.9714352481725032, + "learning_rate": 0.00026315685492076413, + "loss": 7.2118, + "step": 16201 + }, + { + "epoch": 1.5118036764019782, + "grad_norm": 632432.6384679837, + "learning_rate": 0.0002631518896131539, + "loss": 6.9314, + "step": 16202 + }, + { + "epoch": 1.5118969860968554, + "grad_norm": 1.423881367049929, + "learning_rate": 0.000263146924017832, + "loss": 7.5372, + "step": 16203 + }, + { + "epoch": 1.5119902957917328, + "grad_norm": 93665.33930805055, + "learning_rate": 0.000263141958134811, + "loss": 7.3107, + "step": 16204 + }, + { + "epoch": 1.51208360548661, + "grad_norm": 2.1367789958499253, + "learning_rate": 0.0002631369919641035, + "loss": 7.5671, + "step": 16205 + }, + { + "epoch": 1.5121769151814872, + "grad_norm": 2.3062275957655256, + "learning_rate": 0.00026313202550572206, + "loss": 7.4413, + "step": 16206 + }, + { + "epoch": 1.5122702248763646, + "grad_norm": 28282955.477526046, + "learning_rate": 0.00026312705875967946, + "loss": 7.1497, + "step": 16207 + }, + { + "epoch": 1.512363534571242, + "grad_norm": 3076626.776890124, + "learning_rate": 0.00026312209172598826, + "loss": 7.3648, + "step": 16208 + }, + { + "epoch": 1.5124568442661193, + "grad_norm": 2.9618475248791385, + "learning_rate": 0.00026311712440466105, + "loss": 7.3971, + "step": 16209 + }, + { + "epoch": 1.5125501539609965, + "grad_norm": 3.226124484431347, + "learning_rate": 0.00026311215679571055, + "loss": 7.5832, + "step": 16210 + }, + { + "epoch": 1.5126434636558739, + "grad_norm": 330283.34812720446, + "learning_rate": 0.00026310718889914936, + "loss": 7.0411, + "step": 16211 + }, + { + "epoch": 1.5127367733507513, + "grad_norm": 1.7965874162631585, + "learning_rate": 0.00026310222071499004, + "loss": 7.2985, + "step": 16212 + }, + { + "epoch": 1.5128300830456285, + "grad_norm": 1.0335158285486055, + "learning_rate": 0.0002630972522432453, + "loss": 7.2839, + "step": 16213 + }, + { + "epoch": 1.5129233927405057, + "grad_norm": 4245.0947071093215, + "learning_rate": 0.0002630922834839277, + "loss": 7.598, + "step": 16214 + }, + { + "epoch": 1.513016702435383, + "grad_norm": 1.640494303289742, + "learning_rate": 0.00026308731443705003, + "loss": 7.2339, + "step": 16215 + }, + { + "epoch": 1.5131100121302603, + "grad_norm": 1.133524019059987, + "learning_rate": 0.00026308234510262474, + "loss": 7.474, + "step": 16216 + }, + { + "epoch": 1.5132033218251375, + "grad_norm": 0.6522863149073206, + "learning_rate": 0.00026307737548066453, + "loss": 7.5483, + "step": 16217 + }, + { + "epoch": 1.513296631520015, + "grad_norm": 0.6977546489049267, + "learning_rate": 0.0002630724055711821, + "loss": 7.0883, + "step": 16218 + }, + { + "epoch": 1.5133899412148923, + "grad_norm": 1.402913478784909, + "learning_rate": 0.00026306743537419, + "loss": 7.4696, + "step": 16219 + }, + { + "epoch": 1.5134832509097695, + "grad_norm": 1.0767782050061168, + "learning_rate": 0.00026306246488970095, + "loss": 7.3425, + "step": 16220 + }, + { + "epoch": 1.5135765606046467, + "grad_norm": 1.12880996195069, + "learning_rate": 0.0002630574941177275, + "loss": 7.1677, + "step": 16221 + }, + { + "epoch": 1.5136698702995242, + "grad_norm": 1.3418708131662127, + "learning_rate": 0.00026305252305828234, + "loss": 7.6364, + "step": 16222 + }, + { + "epoch": 1.5137631799944016, + "grad_norm": 1.4599702563399617, + "learning_rate": 0.0002630475517113781, + "loss": 7.6197, + "step": 16223 + }, + { + "epoch": 1.5138564896892786, + "grad_norm": 13.827137578238982, + "learning_rate": 0.00026304258007702743, + "loss": 7.4998, + "step": 16224 + }, + { + "epoch": 1.513949799384156, + "grad_norm": 1.042671274227086, + "learning_rate": 0.000263037608155243, + "loss": 7.2738, + "step": 16225 + }, + { + "epoch": 1.5140431090790334, + "grad_norm": 1.4726121214096894, + "learning_rate": 0.0002630326359460374, + "loss": 7.3036, + "step": 16226 + }, + { + "epoch": 1.5141364187739106, + "grad_norm": 1.899892111945821, + "learning_rate": 0.0002630276634494232, + "loss": 7.2916, + "step": 16227 + }, + { + "epoch": 1.5142297284687878, + "grad_norm": 1.9037877517049122, + "learning_rate": 0.0002630226906654133, + "loss": 7.2384, + "step": 16228 + }, + { + "epoch": 1.5143230381636652, + "grad_norm": 1.3420369734299373, + "learning_rate": 0.00026301771759402006, + "loss": 7.5816, + "step": 16229 + }, + { + "epoch": 1.5144163478585426, + "grad_norm": 1.0962067208741628, + "learning_rate": 0.00026301274423525624, + "loss": 7.4959, + "step": 16230 + }, + { + "epoch": 1.5145096575534198, + "grad_norm": 0.9221948620142054, + "learning_rate": 0.00026300777058913455, + "loss": 7.325, + "step": 16231 + }, + { + "epoch": 1.514602967248297, + "grad_norm": 0.8385323327317277, + "learning_rate": 0.0002630027966556675, + "loss": 7.3253, + "step": 16232 + }, + { + "epoch": 1.5146962769431744, + "grad_norm": 0.7481583252238295, + "learning_rate": 0.0002629978224348678, + "loss": 7.2892, + "step": 16233 + }, + { + "epoch": 1.5147895866380519, + "grad_norm": 9411.269123582555, + "learning_rate": 0.0002629928479267482, + "loss": 7.0999, + "step": 16234 + }, + { + "epoch": 1.5148828963329288, + "grad_norm": 1.6978200029636408, + "learning_rate": 0.0002629878731313212, + "loss": 7.6485, + "step": 16235 + }, + { + "epoch": 1.5149762060278062, + "grad_norm": 1.5776526274929905, + "learning_rate": 0.00026298289804859955, + "loss": 7.4751, + "step": 16236 + }, + { + "epoch": 1.5150695157226837, + "grad_norm": 1.2636629293310173, + "learning_rate": 0.00026297792267859587, + "loss": 7.4439, + "step": 16237 + }, + { + "epoch": 1.5151628254175609, + "grad_norm": 1.0401295013446423, + "learning_rate": 0.0002629729470213228, + "loss": 7.2838, + "step": 16238 + }, + { + "epoch": 1.515256135112438, + "grad_norm": 1.5652516274363875, + "learning_rate": 0.0002629679710767929, + "loss": 7.1786, + "step": 16239 + }, + { + "epoch": 1.5153494448073155, + "grad_norm": 0.9813702005627657, + "learning_rate": 0.00026296299484501897, + "loss": 7.4361, + "step": 16240 + }, + { + "epoch": 1.515442754502193, + "grad_norm": 0.7520071380021405, + "learning_rate": 0.0002629580183260136, + "loss": 7.1489, + "step": 16241 + }, + { + "epoch": 1.51553606419707, + "grad_norm": 0.6467105000882283, + "learning_rate": 0.00026295304151978947, + "loss": 7.4222, + "step": 16242 + }, + { + "epoch": 1.5156293738919473, + "grad_norm": 0.5901351260068501, + "learning_rate": 0.0002629480644263592, + "loss": 7.1671, + "step": 16243 + }, + { + "epoch": 1.5157226835868247, + "grad_norm": 0.5704934982687762, + "learning_rate": 0.00026294308704573544, + "loss": 7.1299, + "step": 16244 + }, + { + "epoch": 1.5158159932817021, + "grad_norm": 0.9418523000308396, + "learning_rate": 0.0002629381093779309, + "loss": 7.4766, + "step": 16245 + }, + { + "epoch": 1.515909302976579, + "grad_norm": 0.8525820969198505, + "learning_rate": 0.0002629331314229582, + "loss": 7.4139, + "step": 16246 + }, + { + "epoch": 1.5160026126714565, + "grad_norm": 0.8486676941450624, + "learning_rate": 0.00026292815318083, + "loss": 7.4481, + "step": 16247 + }, + { + "epoch": 1.516095922366334, + "grad_norm": 3047.7131285917726, + "learning_rate": 0.0002629231746515589, + "loss": 7.3362, + "step": 16248 + }, + { + "epoch": 1.5161892320612111, + "grad_norm": 0.43326344121382715, + "learning_rate": 0.00026291819583515767, + "loss": 7.4618, + "step": 16249 + }, + { + "epoch": 1.5162825417560883, + "grad_norm": 1.3169893604398053, + "learning_rate": 0.0002629132167316389, + "loss": 7.0385, + "step": 16250 + }, + { + "epoch": 1.5163758514509658, + "grad_norm": 0.5450492023015207, + "learning_rate": 0.0002629082373410153, + "loss": 7.4051, + "step": 16251 + }, + { + "epoch": 1.5164691611458432, + "grad_norm": 4639.5416543516785, + "learning_rate": 0.0002629032576632995, + "loss": 7.1687, + "step": 16252 + }, + { + "epoch": 1.5165624708407204, + "grad_norm": 0.726946348573651, + "learning_rate": 0.00026289827769850413, + "loss": 7.0984, + "step": 16253 + }, + { + "epoch": 1.5166557805355976, + "grad_norm": 0.7705025201526007, + "learning_rate": 0.00026289329744664185, + "loss": 7.3407, + "step": 16254 + }, + { + "epoch": 1.516749090230475, + "grad_norm": 1.2535732846802021, + "learning_rate": 0.00026288831690772545, + "loss": 7.4719, + "step": 16255 + }, + { + "epoch": 1.5168423999253522, + "grad_norm": 0.6875548454666826, + "learning_rate": 0.00026288333608176745, + "loss": 7.2171, + "step": 16256 + }, + { + "epoch": 1.5169357096202294, + "grad_norm": 0.5817269528866733, + "learning_rate": 0.0002628783549687806, + "loss": 7.3901, + "step": 16257 + }, + { + "epoch": 1.5170290193151068, + "grad_norm": 0.5428688295874968, + "learning_rate": 0.0002628733735687774, + "loss": 7.38, + "step": 16258 + }, + { + "epoch": 1.5171223290099842, + "grad_norm": 0.5339842962985127, + "learning_rate": 0.00026286839188177077, + "loss": 7.1702, + "step": 16259 + }, + { + "epoch": 1.5172156387048614, + "grad_norm": 0.5971345566090993, + "learning_rate": 0.0002628634099077732, + "loss": 7.3345, + "step": 16260 + }, + { + "epoch": 1.5173089483997386, + "grad_norm": 0.6500487099051482, + "learning_rate": 0.0002628584276467974, + "loss": 7.1315, + "step": 16261 + }, + { + "epoch": 1.517402258094616, + "grad_norm": 0.6511958592910735, + "learning_rate": 0.0002628534450988561, + "loss": 7.1333, + "step": 16262 + }, + { + "epoch": 1.5174955677894935, + "grad_norm": 0.4642588858470191, + "learning_rate": 0.00026284846226396196, + "loss": 7.1474, + "step": 16263 + }, + { + "epoch": 1.5175888774843707, + "grad_norm": 0.7797135414663089, + "learning_rate": 0.0002628434791421275, + "loss": 7.3682, + "step": 16264 + }, + { + "epoch": 1.5176821871792479, + "grad_norm": 384.7367796324855, + "learning_rate": 0.00026283849573336554, + "loss": 7.0293, + "step": 16265 + }, + { + "epoch": 1.5177754968741253, + "grad_norm": 1.0183633500387383, + "learning_rate": 0.00026283351203768875, + "loss": 7.3841, + "step": 16266 + }, + { + "epoch": 1.5178688065690025, + "grad_norm": 0.566079632604464, + "learning_rate": 0.00026282852805510973, + "loss": 7.2312, + "step": 16267 + }, + { + "epoch": 1.5179621162638797, + "grad_norm": 0.5075298067942902, + "learning_rate": 0.0002628235437856412, + "loss": 7.1601, + "step": 16268 + }, + { + "epoch": 1.518055425958757, + "grad_norm": 0.5882062892587053, + "learning_rate": 0.0002628185592292958, + "loss": 7.3966, + "step": 16269 + }, + { + "epoch": 1.5181487356536345, + "grad_norm": 1.137775254693175, + "learning_rate": 0.0002628135743860862, + "loss": 7.1229, + "step": 16270 + }, + { + "epoch": 1.5182420453485117, + "grad_norm": 0.9061455190915659, + "learning_rate": 0.00026280858925602517, + "loss": 7.1953, + "step": 16271 + }, + { + "epoch": 1.518335355043389, + "grad_norm": 0.8358462932758048, + "learning_rate": 0.0002628036038391253, + "loss": 7.2556, + "step": 16272 + }, + { + "epoch": 1.5184286647382663, + "grad_norm": 0.7086415206125902, + "learning_rate": 0.0002627986181353992, + "loss": 7.1331, + "step": 16273 + }, + { + "epoch": 1.5185219744331437, + "grad_norm": 0.8955573801053762, + "learning_rate": 0.00026279363214485967, + "loss": 7.8181, + "step": 16274 + }, + { + "epoch": 1.518615284128021, + "grad_norm": 0.7458431298625775, + "learning_rate": 0.00026278864586751937, + "loss": 7.3327, + "step": 16275 + }, + { + "epoch": 1.5187085938228981, + "grad_norm": 0.8524887755287456, + "learning_rate": 0.0002627836593033909, + "loss": 7.4662, + "step": 16276 + }, + { + "epoch": 1.5188019035177756, + "grad_norm": 633.0017197468424, + "learning_rate": 0.00026277867245248706, + "loss": 7.1922, + "step": 16277 + }, + { + "epoch": 1.5188952132126528, + "grad_norm": 0.7476311253936441, + "learning_rate": 0.00026277368531482046, + "loss": 7.4075, + "step": 16278 + }, + { + "epoch": 1.51898852290753, + "grad_norm": 0.8993663147635175, + "learning_rate": 0.00026276869789040375, + "loss": 7.074, + "step": 16279 + }, + { + "epoch": 1.5190818326024074, + "grad_norm": 0.6993200309055955, + "learning_rate": 0.00026276371017924967, + "loss": 7.2618, + "step": 16280 + }, + { + "epoch": 1.5191751422972848, + "grad_norm": 0.6637967611138317, + "learning_rate": 0.00026275872218137084, + "loss": 7.4655, + "step": 16281 + }, + { + "epoch": 1.519268451992162, + "grad_norm": 0.7080183087212957, + "learning_rate": 0.00026275373389678, + "loss": 7.11, + "step": 16282 + }, + { + "epoch": 1.5193617616870392, + "grad_norm": 0.8063125101222103, + "learning_rate": 0.00026274874532548984, + "loss": 7.2827, + "step": 16283 + }, + { + "epoch": 1.5194550713819166, + "grad_norm": 0.7259047138428464, + "learning_rate": 0.000262743756467513, + "loss": 7.483, + "step": 16284 + }, + { + "epoch": 1.519548381076794, + "grad_norm": 2884.088034727922, + "learning_rate": 0.0002627387673228622, + "loss": 7.2106, + "step": 16285 + }, + { + "epoch": 1.5196416907716712, + "grad_norm": 0.4085634271739678, + "learning_rate": 0.0002627337778915501, + "loss": 7.0572, + "step": 16286 + }, + { + "epoch": 1.5197350004665484, + "grad_norm": 0.9164035869859201, + "learning_rate": 0.0002627287881735893, + "loss": 7.4461, + "step": 16287 + }, + { + "epoch": 1.5198283101614258, + "grad_norm": 0.5254159180839328, + "learning_rate": 0.00026272379816899273, + "loss": 6.9874, + "step": 16288 + }, + { + "epoch": 1.519921619856303, + "grad_norm": 11004.684867402992, + "learning_rate": 0.00026271880787777287, + "loss": 7.1897, + "step": 16289 + }, + { + "epoch": 1.5200149295511802, + "grad_norm": 0.8039803777314146, + "learning_rate": 0.0002627138172999425, + "loss": 7.0593, + "step": 16290 + }, + { + "epoch": 1.5201082392460576, + "grad_norm": 0.5408078682194433, + "learning_rate": 0.0002627088264355142, + "loss": 6.9369, + "step": 16291 + }, + { + "epoch": 1.520201548940935, + "grad_norm": 0.854244816320851, + "learning_rate": 0.00026270383528450084, + "loss": 7.3146, + "step": 16292 + }, + { + "epoch": 1.5202948586358123, + "grad_norm": 0.5749157960210134, + "learning_rate": 0.000262698843846915, + "loss": 7.1791, + "step": 16293 + }, + { + "epoch": 1.5203881683306895, + "grad_norm": 0.6617366072952809, + "learning_rate": 0.0002626938521227693, + "loss": 7.3173, + "step": 16294 + }, + { + "epoch": 1.5204814780255669, + "grad_norm": 0.927087646783536, + "learning_rate": 0.00026268886011207663, + "loss": 7.4705, + "step": 16295 + }, + { + "epoch": 1.5205747877204443, + "grad_norm": 0.9304665907814633, + "learning_rate": 0.0002626838678148495, + "loss": 7.0311, + "step": 16296 + }, + { + "epoch": 1.5206680974153215, + "grad_norm": 1.0059190428652764, + "learning_rate": 0.00026267887523110067, + "loss": 7.2088, + "step": 16297 + }, + { + "epoch": 1.5207614071101987, + "grad_norm": 0.8516414468736353, + "learning_rate": 0.00026267388236084286, + "loss": 7.7094, + "step": 16298 + }, + { + "epoch": 1.5208547168050761, + "grad_norm": 5855.726378158584, + "learning_rate": 0.00026266888920408875, + "loss": 6.9538, + "step": 16299 + }, + { + "epoch": 1.5209480264999533, + "grad_norm": 0.7528311619543643, + "learning_rate": 0.000262663895760851, + "loss": 7.4395, + "step": 16300 + }, + { + "epoch": 1.5210413361948305, + "grad_norm": 0.6487741811116471, + "learning_rate": 0.0002626589020311424, + "loss": 7.2837, + "step": 16301 + }, + { + "epoch": 1.521134645889708, + "grad_norm": 0.6270314331454894, + "learning_rate": 0.0002626539080149755, + "loss": 7.3259, + "step": 16302 + }, + { + "epoch": 1.5212279555845853, + "grad_norm": 0.4865678428927158, + "learning_rate": 0.00026264891371236313, + "loss": 7.2089, + "step": 16303 + }, + { + "epoch": 1.5213212652794625, + "grad_norm": 2791.637955266909, + "learning_rate": 0.00026264391912331794, + "loss": 7.4578, + "step": 16304 + }, + { + "epoch": 1.5214145749743397, + "grad_norm": 0.7563672693689387, + "learning_rate": 0.0002626389242478526, + "loss": 7.2434, + "step": 16305 + }, + { + "epoch": 1.5215078846692172, + "grad_norm": 0.9056363893264645, + "learning_rate": 0.0002626339290859799, + "loss": 7.2493, + "step": 16306 + }, + { + "epoch": 1.5216011943640946, + "grad_norm": 0.6413907150167499, + "learning_rate": 0.00026262893363771244, + "loss": 7.2586, + "step": 16307 + }, + { + "epoch": 1.5216945040589718, + "grad_norm": 1.6906102013626867, + "learning_rate": 0.00026262393790306296, + "loss": 6.7856, + "step": 16308 + }, + { + "epoch": 1.521787813753849, + "grad_norm": 5203.769432970127, + "learning_rate": 0.0002626189418820442, + "loss": 7.1537, + "step": 16309 + }, + { + "epoch": 1.5218811234487264, + "grad_norm": 0.8270304169047588, + "learning_rate": 0.00026261394557466883, + "loss": 6.6914, + "step": 16310 + }, + { + "epoch": 1.5219744331436036, + "grad_norm": 1.1447083555128645, + "learning_rate": 0.0002626089489809495, + "loss": 7.1296, + "step": 16311 + }, + { + "epoch": 1.5220677428384808, + "grad_norm": 2.005897958387972, + "learning_rate": 0.000262603952100899, + "loss": 7.3576, + "step": 16312 + }, + { + "epoch": 1.5221610525333582, + "grad_norm": 1.6587242959900694, + "learning_rate": 0.00026259895493453006, + "loss": 7.069, + "step": 16313 + }, + { + "epoch": 1.5222543622282356, + "grad_norm": 4118.155968255352, + "learning_rate": 0.00026259395748185523, + "loss": 7.4139, + "step": 16314 + }, + { + "epoch": 1.5223476719231128, + "grad_norm": 1.051179021811217, + "learning_rate": 0.0002625889597428874, + "loss": 7.2333, + "step": 16315 + }, + { + "epoch": 1.52244098161799, + "grad_norm": 3497.078826909028, + "learning_rate": 0.00026258396171763916, + "loss": 7.2352, + "step": 16316 + }, + { + "epoch": 1.5225342913128674, + "grad_norm": 0.5436192257348618, + "learning_rate": 0.0002625789634061233, + "loss": 7.3978, + "step": 16317 + }, + { + "epoch": 1.5226276010077449, + "grad_norm": 1.0045457070300692, + "learning_rate": 0.00026257396480835245, + "loss": 7.3031, + "step": 16318 + }, + { + "epoch": 1.522720910702622, + "grad_norm": 1.6949793874826218, + "learning_rate": 0.00026256896592433934, + "loss": 7.1958, + "step": 16319 + }, + { + "epoch": 1.5228142203974993, + "grad_norm": 2288.733841639692, + "learning_rate": 0.0002625639667540967, + "loss": 7.3087, + "step": 16320 + }, + { + "epoch": 1.5229075300923767, + "grad_norm": 2.9473954764440027, + "learning_rate": 0.00026255896729763725, + "loss": 7.3222, + "step": 16321 + }, + { + "epoch": 1.5230008397872539, + "grad_norm": 24218.51807369293, + "learning_rate": 0.00026255396755497365, + "loss": 7.0095, + "step": 16322 + }, + { + "epoch": 1.523094149482131, + "grad_norm": 0.7275056291803285, + "learning_rate": 0.0002625489675261187, + "loss": 7.3285, + "step": 16323 + }, + { + "epoch": 1.5231874591770085, + "grad_norm": 1.1122796581470638, + "learning_rate": 0.0002625439672110851, + "loss": 7.5296, + "step": 16324 + }, + { + "epoch": 1.523280768871886, + "grad_norm": 0.9151046396630591, + "learning_rate": 0.00026253896660988544, + "loss": 7.1909, + "step": 16325 + }, + { + "epoch": 1.523374078566763, + "grad_norm": 0.8553905745824005, + "learning_rate": 0.0002625339657225326, + "loss": 7.082, + "step": 16326 + }, + { + "epoch": 1.5234673882616403, + "grad_norm": 12897.066482001728, + "learning_rate": 0.00026252896454903915, + "loss": 6.9991, + "step": 16327 + }, + { + "epoch": 1.5235606979565177, + "grad_norm": 0.7351410815443354, + "learning_rate": 0.00026252396308941797, + "loss": 7.2388, + "step": 16328 + }, + { + "epoch": 1.5236540076513951, + "grad_norm": 0.5444205005332887, + "learning_rate": 0.0002625189613436816, + "loss": 7.2952, + "step": 16329 + }, + { + "epoch": 1.5237473173462721, + "grad_norm": 1.1451327781582996, + "learning_rate": 0.0002625139593118428, + "loss": 6.9581, + "step": 16330 + }, + { + "epoch": 1.5238406270411495, + "grad_norm": 1.0514252448436046, + "learning_rate": 0.00026250895699391445, + "loss": 7.0855, + "step": 16331 + }, + { + "epoch": 1.523933936736027, + "grad_norm": 6010.047269594356, + "learning_rate": 0.0002625039543899091, + "loss": 7.1402, + "step": 16332 + }, + { + "epoch": 1.5240272464309041, + "grad_norm": 6804.068906519587, + "learning_rate": 0.0002624989514998395, + "loss": 7.1874, + "step": 16333 + }, + { + "epoch": 1.5241205561257813, + "grad_norm": 0.6114500607873279, + "learning_rate": 0.0002624939483237184, + "loss": 7.2814, + "step": 16334 + }, + { + "epoch": 1.5242138658206588, + "grad_norm": 0.772487206987658, + "learning_rate": 0.0002624889448615586, + "loss": 7.2437, + "step": 16335 + }, + { + "epoch": 1.5243071755155362, + "grad_norm": 1.345867836318267, + "learning_rate": 0.0002624839411133726, + "loss": 7.3267, + "step": 16336 + }, + { + "epoch": 1.5244004852104134, + "grad_norm": 1.4423827778649876, + "learning_rate": 0.00026247893707917333, + "loss": 7.466, + "step": 16337 + }, + { + "epoch": 1.5244937949052906, + "grad_norm": 1.3224627895789907, + "learning_rate": 0.0002624739327589734, + "loss": 7.3416, + "step": 16338 + }, + { + "epoch": 1.524587104600168, + "grad_norm": 0.47140180673651877, + "learning_rate": 0.0002624689281527856, + "loss": 7.2589, + "step": 16339 + }, + { + "epoch": 1.5246804142950454, + "grad_norm": 0.7148726639179099, + "learning_rate": 0.00026246392326062263, + "loss": 7.0536, + "step": 16340 + }, + { + "epoch": 1.5247737239899224, + "grad_norm": 0.8521446014765005, + "learning_rate": 0.0002624589180824972, + "loss": 7.2619, + "step": 16341 + }, + { + "epoch": 1.5248670336847998, + "grad_norm": 0.909398302324022, + "learning_rate": 0.0002624539126184221, + "loss": 7.2453, + "step": 16342 + }, + { + "epoch": 1.5249603433796772, + "grad_norm": 1.1067663822552085, + "learning_rate": 0.00026244890686841, + "loss": 7.1311, + "step": 16343 + }, + { + "epoch": 1.5250536530745544, + "grad_norm": 15402.256287576061, + "learning_rate": 0.0002624439008324736, + "loss": 6.883, + "step": 16344 + }, + { + "epoch": 1.5251469627694316, + "grad_norm": 19725.87315604553, + "learning_rate": 0.00026243889451062576, + "loss": 7.25, + "step": 16345 + }, + { + "epoch": 1.525240272464309, + "grad_norm": 4563.546249139728, + "learning_rate": 0.000262433887902879, + "loss": 7.2833, + "step": 16346 + }, + { + "epoch": 1.5253335821591865, + "grad_norm": 0.9120681937088663, + "learning_rate": 0.0002624288810092463, + "loss": 7.4068, + "step": 16347 + }, + { + "epoch": 1.5254268918540637, + "grad_norm": 17687.48254827049, + "learning_rate": 0.0002624238738297401, + "loss": 7.239, + "step": 16348 + }, + { + "epoch": 1.5255202015489409, + "grad_norm": 1.2668923996266073, + "learning_rate": 0.0002624188663643734, + "loss": 7.5446, + "step": 16349 + }, + { + "epoch": 1.5256135112438183, + "grad_norm": 1.485252239980676, + "learning_rate": 0.0002624138586131588, + "loss": 7.7388, + "step": 16350 + }, + { + "epoch": 1.5257068209386957, + "grad_norm": 0.4336153167707099, + "learning_rate": 0.00026240885057610906, + "loss": 7.3055, + "step": 16351 + }, + { + "epoch": 1.5258001306335727, + "grad_norm": 14156.25397233732, + "learning_rate": 0.0002624038422532369, + "loss": 7.177, + "step": 16352 + }, + { + "epoch": 1.52589344032845, + "grad_norm": 0.9576640173163534, + "learning_rate": 0.0002623988336445551, + "loss": 7.1988, + "step": 16353 + }, + { + "epoch": 1.5259867500233275, + "grad_norm": 1.0638520218027165, + "learning_rate": 0.0002623938247500763, + "loss": 7.2302, + "step": 16354 + }, + { + "epoch": 1.5260800597182047, + "grad_norm": 0.7309346914979169, + "learning_rate": 0.00026238881556981335, + "loss": 7.2642, + "step": 16355 + }, + { + "epoch": 1.526173369413082, + "grad_norm": 0.8680670273607807, + "learning_rate": 0.00026238380610377887, + "loss": 7.1525, + "step": 16356 + }, + { + "epoch": 1.5262666791079593, + "grad_norm": 33077.144693521004, + "learning_rate": 0.0002623787963519857, + "loss": 7.1713, + "step": 16357 + }, + { + "epoch": 1.5263599888028367, + "grad_norm": 915.7334132163286, + "learning_rate": 0.00026237378631444654, + "loss": 7.39, + "step": 16358 + }, + { + "epoch": 1.526453298497714, + "grad_norm": 1.4023925469313976, + "learning_rate": 0.00026236877599117417, + "loss": 7.5859, + "step": 16359 + }, + { + "epoch": 1.5265466081925911, + "grad_norm": 1.2938255705091715, + "learning_rate": 0.00026236376538218124, + "loss": 7.3793, + "step": 16360 + }, + { + "epoch": 1.5266399178874686, + "grad_norm": 0.6515537127522082, + "learning_rate": 0.00026235875448748055, + "loss": 7.3561, + "step": 16361 + }, + { + "epoch": 1.5267332275823458, + "grad_norm": 126330.43227230878, + "learning_rate": 0.00026235374330708483, + "loss": 7.0955, + "step": 16362 + }, + { + "epoch": 1.526826537277223, + "grad_norm": 2432.132090728891, + "learning_rate": 0.0002623487318410068, + "loss": 7.2419, + "step": 16363 + }, + { + "epoch": 1.5269198469721004, + "grad_norm": 5222.496995344906, + "learning_rate": 0.0002623437200892593, + "loss": 7.2631, + "step": 16364 + }, + { + "epoch": 1.5270131566669778, + "grad_norm": 1.0051080363858298, + "learning_rate": 0.0002623387080518549, + "loss": 7.3124, + "step": 16365 + }, + { + "epoch": 1.527106466361855, + "grad_norm": 1.2966306657203654, + "learning_rate": 0.0002623336957288065, + "loss": 6.9359, + "step": 16366 + }, + { + "epoch": 1.5271997760567322, + "grad_norm": 1.091288650424133, + "learning_rate": 0.0002623286831201268, + "loss": 7.2973, + "step": 16367 + }, + { + "epoch": 1.5272930857516096, + "grad_norm": 0.7779977037648624, + "learning_rate": 0.00026232367022582847, + "loss": 7.0721, + "step": 16368 + }, + { + "epoch": 1.527386395446487, + "grad_norm": 0.9136707170378597, + "learning_rate": 0.0002623186570459244, + "loss": 7.5906, + "step": 16369 + }, + { + "epoch": 1.5274797051413642, + "grad_norm": 0.5258940139580136, + "learning_rate": 0.0002623136435804272, + "loss": 7.2792, + "step": 16370 + }, + { + "epoch": 1.5275730148362414, + "grad_norm": 0.4827045209384097, + "learning_rate": 0.0002623086298293497, + "loss": 7.3768, + "step": 16371 + }, + { + "epoch": 1.5276663245311188, + "grad_norm": 1.2040992133135946, + "learning_rate": 0.0002623036157927046, + "loss": 7.4451, + "step": 16372 + }, + { + "epoch": 1.527759634225996, + "grad_norm": 0.5979311514529853, + "learning_rate": 0.0002622986014705047, + "loss": 7.4662, + "step": 16373 + }, + { + "epoch": 1.5278529439208732, + "grad_norm": 0.765712264998691, + "learning_rate": 0.0002622935868627627, + "loss": 7.4233, + "step": 16374 + }, + { + "epoch": 1.5279462536157506, + "grad_norm": 0.9315235331399812, + "learning_rate": 0.0002622885719694914, + "loss": 6.9201, + "step": 16375 + }, + { + "epoch": 1.528039563310628, + "grad_norm": 0.7185918523148164, + "learning_rate": 0.0002622835567907035, + "loss": 7.1202, + "step": 16376 + }, + { + "epoch": 1.5281328730055053, + "grad_norm": 18867782.87383623, + "learning_rate": 0.00026227854132641177, + "loss": 7.4185, + "step": 16377 + }, + { + "epoch": 1.5282261827003825, + "grad_norm": 0.9007677165130762, + "learning_rate": 0.00026227352557662897, + "loss": 7.3192, + "step": 16378 + }, + { + "epoch": 1.5283194923952599, + "grad_norm": 0.980692244389684, + "learning_rate": 0.00026226850954136783, + "loss": 7.0629, + "step": 16379 + }, + { + "epoch": 1.5284128020901373, + "grad_norm": 0.8190562402424627, + "learning_rate": 0.00026226349322064113, + "loss": 7.712, + "step": 16380 + }, + { + "epoch": 1.5285061117850145, + "grad_norm": 2.0429133342567076, + "learning_rate": 0.0002622584766144617, + "loss": 7.4068, + "step": 16381 + }, + { + "epoch": 1.5285994214798917, + "grad_norm": 27496450610259.465, + "learning_rate": 0.00026225345972284215, + "loss": 7.5867, + "step": 16382 + }, + { + "epoch": 1.5286927311747691, + "grad_norm": 52998611241.6585, + "learning_rate": 0.00026224844254579534, + "loss": 7.2929, + "step": 16383 + }, + { + "epoch": 1.5287860408696463, + "grad_norm": 301456411.647234, + "learning_rate": 0.00026224342508333393, + "loss": 7.3343, + "step": 16384 + }, + { + "epoch": 1.5288793505645235, + "grad_norm": 36930437.443193346, + "learning_rate": 0.00026223840733547083, + "loss": 7.5653, + "step": 16385 + }, + { + "epoch": 1.528972660259401, + "grad_norm": 0.7852744850464802, + "learning_rate": 0.00026223338930221867, + "loss": 7.0368, + "step": 16386 + }, + { + "epoch": 1.5290659699542783, + "grad_norm": 1.3172326007631379, + "learning_rate": 0.0002622283709835902, + "loss": 7.4835, + "step": 16387 + }, + { + "epoch": 1.5291592796491555, + "grad_norm": 1730206.4211920865, + "learning_rate": 0.0002622233523795983, + "loss": 7.5579, + "step": 16388 + }, + { + "epoch": 1.5292525893440327, + "grad_norm": 1.1057197702226615, + "learning_rate": 0.0002622183334902556, + "loss": 7.3918, + "step": 16389 + }, + { + "epoch": 1.5293458990389102, + "grad_norm": 1046354.1154909378, + "learning_rate": 0.00026221331431557494, + "loss": 7.7616, + "step": 16390 + }, + { + "epoch": 1.5294392087337876, + "grad_norm": 0.9711970809298656, + "learning_rate": 0.00026220829485556905, + "loss": 7.55, + "step": 16391 + }, + { + "epoch": 1.5295325184286648, + "grad_norm": 0.6514499736923384, + "learning_rate": 0.0002622032751102507, + "loss": 7.3239, + "step": 16392 + }, + { + "epoch": 1.529625828123542, + "grad_norm": 1.2581595152654572, + "learning_rate": 0.00026219825507963265, + "loss": 7.5182, + "step": 16393 + }, + { + "epoch": 1.5297191378184194, + "grad_norm": 111167.73563303574, + "learning_rate": 0.00026219323476372775, + "loss": 7.4468, + "step": 16394 + }, + { + "epoch": 1.5298124475132966, + "grad_norm": 95914.74663465905, + "learning_rate": 0.00026218821416254865, + "loss": 7.6343, + "step": 16395 + }, + { + "epoch": 1.5299057572081738, + "grad_norm": 1.4897208783877092, + "learning_rate": 0.0002621831932761081, + "loss": 7.0706, + "step": 16396 + }, + { + "epoch": 1.5299990669030512, + "grad_norm": 1.4102837766599838, + "learning_rate": 0.000262178172104419, + "loss": 7.1796, + "step": 16397 + }, + { + "epoch": 1.5300923765979286, + "grad_norm": 1.2738845530177179, + "learning_rate": 0.0002621731506474939, + "loss": 7.0342, + "step": 16398 + }, + { + "epoch": 1.5301856862928058, + "grad_norm": 0.6574933361556189, + "learning_rate": 0.0002621681289053459, + "loss": 7.3559, + "step": 16399 + }, + { + "epoch": 1.530278995987683, + "grad_norm": 1.1704907003817477, + "learning_rate": 0.0002621631068779874, + "loss": 7.4693, + "step": 16400 + }, + { + "epoch": 1.5303723056825604, + "grad_norm": 0.5469983391308709, + "learning_rate": 0.00026215808456543145, + "loss": 7.3396, + "step": 16401 + }, + { + "epoch": 1.5304656153774379, + "grad_norm": 184305.71458137772, + "learning_rate": 0.0002621530619676907, + "loss": 6.9889, + "step": 16402 + }, + { + "epoch": 1.530558925072315, + "grad_norm": 0.6879196620573784, + "learning_rate": 0.0002621480390847779, + "loss": 7.2229, + "step": 16403 + }, + { + "epoch": 1.5306522347671923, + "grad_norm": 51193.851727682115, + "learning_rate": 0.0002621430159167059, + "loss": 6.8707, + "step": 16404 + }, + { + "epoch": 1.5307455444620697, + "grad_norm": 0.40761668120662803, + "learning_rate": 0.0002621379924634874, + "loss": 7.4089, + "step": 16405 + }, + { + "epoch": 1.5308388541569469, + "grad_norm": 0.6599754364404657, + "learning_rate": 0.0002621329687251352, + "loss": 7.3433, + "step": 16406 + }, + { + "epoch": 1.530932163851824, + "grad_norm": 1.454869105837144, + "learning_rate": 0.0002621279447016621, + "loss": 6.9694, + "step": 16407 + }, + { + "epoch": 1.5310254735467015, + "grad_norm": 1.0794108436643999, + "learning_rate": 0.00026212292039308086, + "loss": 6.9136, + "step": 16408 + }, + { + "epoch": 1.531118783241579, + "grad_norm": 1.0786689705740125, + "learning_rate": 0.0002621178957994042, + "loss": 7.7667, + "step": 16409 + }, + { + "epoch": 1.531212092936456, + "grad_norm": 0.8713755442668636, + "learning_rate": 0.00026211287092064496, + "loss": 7.233, + "step": 16410 + }, + { + "epoch": 1.5313054026313333, + "grad_norm": 1.5117728816304397, + "learning_rate": 0.00026210784575681596, + "loss": 7.5418, + "step": 16411 + }, + { + "epoch": 1.5313987123262107, + "grad_norm": 1.1755475274648814, + "learning_rate": 0.0002621028203079298, + "loss": 7.6222, + "step": 16412 + }, + { + "epoch": 1.5314920220210881, + "grad_norm": 23134.230143923916, + "learning_rate": 0.00026209779457399946, + "loss": 7.4745, + "step": 16413 + }, + { + "epoch": 1.5315853317159653, + "grad_norm": 0.9400537577075023, + "learning_rate": 0.00026209276855503766, + "loss": 7.1922, + "step": 16414 + }, + { + "epoch": 1.5316786414108425, + "grad_norm": 1.098010268264913, + "learning_rate": 0.0002620877422510571, + "loss": 7.6685, + "step": 16415 + }, + { + "epoch": 1.53177195110572, + "grad_norm": 0.5043291681254236, + "learning_rate": 0.00026208271566207064, + "loss": 7.3368, + "step": 16416 + }, + { + "epoch": 1.5318652608005972, + "grad_norm": 0.6513322288706758, + "learning_rate": 0.00026207768878809095, + "loss": 7.3819, + "step": 16417 + }, + { + "epoch": 1.5319585704954743, + "grad_norm": 1.135871278737652, + "learning_rate": 0.000262072661629131, + "loss": 7.4842, + "step": 16418 + }, + { + "epoch": 1.5320518801903518, + "grad_norm": 0.5849859790139047, + "learning_rate": 0.0002620676341852035, + "loss": 7.2893, + "step": 16419 + }, + { + "epoch": 1.5321451898852292, + "grad_norm": 0.8184636491483885, + "learning_rate": 0.00026206260645632106, + "loss": 7.4606, + "step": 16420 + }, + { + "epoch": 1.5322384995801064, + "grad_norm": 30641.338757656897, + "learning_rate": 0.0002620575784424967, + "loss": 7.2638, + "step": 16421 + }, + { + "epoch": 1.5323318092749836, + "grad_norm": 1.2217665487423093, + "learning_rate": 0.00026205255014374315, + "loss": 6.9403, + "step": 16422 + }, + { + "epoch": 1.532425118969861, + "grad_norm": 1.3393816743653786, + "learning_rate": 0.0002620475215600731, + "loss": 7.0565, + "step": 16423 + }, + { + "epoch": 1.5325184286647384, + "grad_norm": 0.7555159822951587, + "learning_rate": 0.0002620424926914994, + "loss": 7.437, + "step": 16424 + }, + { + "epoch": 1.5326117383596156, + "grad_norm": 1.1117568165386553, + "learning_rate": 0.0002620374635380348, + "loss": 7.1932, + "step": 16425 + }, + { + "epoch": 1.5327050480544928, + "grad_norm": 0.6060232761779023, + "learning_rate": 0.00026203243409969216, + "loss": 7.3845, + "step": 16426 + }, + { + "epoch": 1.5327983577493702, + "grad_norm": 42582.98794380268, + "learning_rate": 0.0002620274043764842, + "loss": 7.4541, + "step": 16427 + }, + { + "epoch": 1.5328916674442474, + "grad_norm": 0.6361498793472894, + "learning_rate": 0.0002620223743684238, + "loss": 7.2489, + "step": 16428 + }, + { + "epoch": 1.5329849771391246, + "grad_norm": 0.4997150360333852, + "learning_rate": 0.0002620173440755236, + "loss": 7.116, + "step": 16429 + }, + { + "epoch": 1.533078286834002, + "grad_norm": 34060.232934504354, + "learning_rate": 0.0002620123134977965, + "loss": 7.1486, + "step": 16430 + }, + { + "epoch": 1.5331715965288795, + "grad_norm": 38297.86985892429, + "learning_rate": 0.0002620072826352553, + "loss": 7.0833, + "step": 16431 + }, + { + "epoch": 1.5332649062237567, + "grad_norm": 0.75187164409599, + "learning_rate": 0.00026200225148791275, + "loss": 7.5199, + "step": 16432 + }, + { + "epoch": 1.5333582159186339, + "grad_norm": 0.6154336250264454, + "learning_rate": 0.0002619972200557816, + "loss": 7.2237, + "step": 16433 + }, + { + "epoch": 1.5334515256135113, + "grad_norm": 0.527872855046982, + "learning_rate": 0.0002619921883388747, + "loss": 7.2046, + "step": 16434 + }, + { + "epoch": 1.5335448353083887, + "grad_norm": 54689.03936606905, + "learning_rate": 0.00026198715633720494, + "loss": 7.382, + "step": 16435 + }, + { + "epoch": 1.5336381450032657, + "grad_norm": 0.5799709039188888, + "learning_rate": 0.0002619821240507849, + "loss": 7.5001, + "step": 16436 + }, + { + "epoch": 1.533731454698143, + "grad_norm": 0.7477280693668081, + "learning_rate": 0.0002619770914796276, + "loss": 7.1321, + "step": 16437 + }, + { + "epoch": 1.5338247643930205, + "grad_norm": 0.44140864433908755, + "learning_rate": 0.00026197205862374563, + "loss": 7.3047, + "step": 16438 + }, + { + "epoch": 1.5339180740878977, + "grad_norm": 0.6662000441873951, + "learning_rate": 0.00026196702548315193, + "loss": 7.4354, + "step": 16439 + }, + { + "epoch": 1.534011383782775, + "grad_norm": 0.6675155626107037, + "learning_rate": 0.00026196199205785926, + "loss": 7.1911, + "step": 16440 + }, + { + "epoch": 1.5341046934776523, + "grad_norm": 0.5201540797892976, + "learning_rate": 0.0002619569583478804, + "loss": 7.3572, + "step": 16441 + }, + { + "epoch": 1.5341980031725297, + "grad_norm": 0.8341401440087948, + "learning_rate": 0.00026195192435322814, + "loss": 7.1211, + "step": 16442 + }, + { + "epoch": 1.534291312867407, + "grad_norm": 0.5535112975475638, + "learning_rate": 0.0002619468900739153, + "loss": 7.3455, + "step": 16443 + }, + { + "epoch": 1.5343846225622841, + "grad_norm": 0.7025652201961705, + "learning_rate": 0.00026194185550995467, + "loss": 7.1766, + "step": 16444 + }, + { + "epoch": 1.5344779322571616, + "grad_norm": 5.34421182849196, + "learning_rate": 0.00026193682066135907, + "loss": 7.2009, + "step": 16445 + }, + { + "epoch": 1.534571241952039, + "grad_norm": 0.38823576417590816, + "learning_rate": 0.0002619317855281413, + "loss": 6.9158, + "step": 16446 + }, + { + "epoch": 1.534664551646916, + "grad_norm": 0.6296681487824131, + "learning_rate": 0.0002619267501103142, + "loss": 7.1485, + "step": 16447 + }, + { + "epoch": 1.5347578613417934, + "grad_norm": 1.2492916584880813, + "learning_rate": 0.0002619217144078905, + "loss": 7.3064, + "step": 16448 + }, + { + "epoch": 1.5348511710366708, + "grad_norm": 1.4585507612621345, + "learning_rate": 0.000261916678420883, + "loss": 7.5823, + "step": 16449 + }, + { + "epoch": 1.534944480731548, + "grad_norm": 1.4797296616354467, + "learning_rate": 0.0002619116421493045, + "loss": 7.512, + "step": 16450 + }, + { + "epoch": 1.5350377904264252, + "grad_norm": 0.8172289340115253, + "learning_rate": 0.00026190660559316796, + "loss": 6.7718, + "step": 16451 + }, + { + "epoch": 1.5351311001213026, + "grad_norm": 0.7420654718374597, + "learning_rate": 0.000261901568752486, + "loss": 6.9267, + "step": 16452 + }, + { + "epoch": 1.53522440981618, + "grad_norm": 5666.385940314541, + "learning_rate": 0.00026189653162727157, + "loss": 7.2354, + "step": 16453 + }, + { + "epoch": 1.5353177195110572, + "grad_norm": 0.5850393975760253, + "learning_rate": 0.00026189149421753733, + "loss": 7.018, + "step": 16454 + }, + { + "epoch": 1.5354110292059344, + "grad_norm": 0.6987932931182882, + "learning_rate": 0.0002618864565232962, + "loss": 7.37, + "step": 16455 + }, + { + "epoch": 1.5355043389008118, + "grad_norm": 0.38967580133081625, + "learning_rate": 0.00026188141854456093, + "loss": 7.2575, + "step": 16456 + }, + { + "epoch": 1.5355976485956893, + "grad_norm": 0.486708850948999, + "learning_rate": 0.00026187638028134436, + "loss": 7.1903, + "step": 16457 + }, + { + "epoch": 1.5356909582905662, + "grad_norm": 0.7400697153843242, + "learning_rate": 0.0002618713417336593, + "loss": 7.103, + "step": 16458 + }, + { + "epoch": 1.5357842679854437, + "grad_norm": 0.41571857409295276, + "learning_rate": 0.00026186630290151857, + "loss": 7.1707, + "step": 16459 + }, + { + "epoch": 1.535877577680321, + "grad_norm": 0.8133866049276415, + "learning_rate": 0.00026186126378493493, + "loss": 7.5226, + "step": 16460 + }, + { + "epoch": 1.5359708873751983, + "grad_norm": 0.5639646177386132, + "learning_rate": 0.00026185622438392126, + "loss": 7.0324, + "step": 16461 + }, + { + "epoch": 1.5360641970700755, + "grad_norm": 34657.49303428043, + "learning_rate": 0.00026185118469849035, + "loss": 6.8598, + "step": 16462 + }, + { + "epoch": 1.5361575067649529, + "grad_norm": 0.792180709828231, + "learning_rate": 0.000261846144728655, + "loss": 6.8596, + "step": 16463 + }, + { + "epoch": 1.5362508164598303, + "grad_norm": 0.32993267784046004, + "learning_rate": 0.00026184110447442804, + "loss": 7.1897, + "step": 16464 + }, + { + "epoch": 1.5363441261547075, + "grad_norm": 1.5743666354207957, + "learning_rate": 0.0002618360639358223, + "loss": 7.7167, + "step": 16465 + }, + { + "epoch": 1.5364374358495847, + "grad_norm": 0.28805531596523415, + "learning_rate": 0.0002618310231128506, + "loss": 7.039, + "step": 16466 + }, + { + "epoch": 1.5365307455444621, + "grad_norm": 0.48593768104315377, + "learning_rate": 0.00026182598200552565, + "loss": 7.1203, + "step": 16467 + }, + { + "epoch": 1.5366240552393393, + "grad_norm": 0.7207295818250438, + "learning_rate": 0.0002618209406138604, + "loss": 7.2251, + "step": 16468 + }, + { + "epoch": 1.5367173649342165, + "grad_norm": 0.9438563963192738, + "learning_rate": 0.0002618158989378676, + "loss": 7.3454, + "step": 16469 + }, + { + "epoch": 1.536810674629094, + "grad_norm": 51012.511550856194, + "learning_rate": 0.0002618108569775601, + "loss": 7.1687, + "step": 16470 + }, + { + "epoch": 1.5369039843239714, + "grad_norm": 0.5267859679522703, + "learning_rate": 0.00026180581473295075, + "loss": 7.1919, + "step": 16471 + }, + { + "epoch": 1.5369972940188485, + "grad_norm": 0.551484871000734, + "learning_rate": 0.0002618007722040523, + "loss": 7.3631, + "step": 16472 + }, + { + "epoch": 1.5370906037137257, + "grad_norm": 0.3743971451434857, + "learning_rate": 0.0002617957293908776, + "loss": 7.2808, + "step": 16473 + }, + { + "epoch": 1.5371839134086032, + "grad_norm": 0.4855936616084931, + "learning_rate": 0.0002617906862934395, + "loss": 7.2253, + "step": 16474 + }, + { + "epoch": 1.5372772231034806, + "grad_norm": 66900.32202518941, + "learning_rate": 0.00026178564291175075, + "loss": 7.233, + "step": 16475 + }, + { + "epoch": 1.5373705327983578, + "grad_norm": 0.31171044258055913, + "learning_rate": 0.00026178059924582427, + "loss": 7.1092, + "step": 16476 + }, + { + "epoch": 1.537463842493235, + "grad_norm": 0.7230935767553829, + "learning_rate": 0.0002617755552956728, + "loss": 7.286, + "step": 16477 + }, + { + "epoch": 1.5375571521881124, + "grad_norm": 0.5737229859880973, + "learning_rate": 0.00026177051106130925, + "loss": 7.3281, + "step": 16478 + }, + { + "epoch": 1.5376504618829896, + "grad_norm": 6207.8879177515655, + "learning_rate": 0.0002617654665427464, + "loss": 7.2927, + "step": 16479 + }, + { + "epoch": 1.5377437715778668, + "grad_norm": 0.5192397384572162, + "learning_rate": 0.000261760421739997, + "loss": 7.4148, + "step": 16480 + }, + { + "epoch": 1.5378370812727442, + "grad_norm": 0.7875451776478655, + "learning_rate": 0.00026175537665307404, + "loss": 7.4462, + "step": 16481 + }, + { + "epoch": 1.5379303909676216, + "grad_norm": 0.37814994777179506, + "learning_rate": 0.0002617503312819902, + "loss": 7.5934, + "step": 16482 + }, + { + "epoch": 1.5380237006624988, + "grad_norm": 1.9901459683558822, + "learning_rate": 0.0002617452856267584, + "loss": 6.7967, + "step": 16483 + }, + { + "epoch": 1.538117010357376, + "grad_norm": 1.3005524675562044, + "learning_rate": 0.00026174023968739146, + "loss": 7.0316, + "step": 16484 + }, + { + "epoch": 1.5382103200522534, + "grad_norm": 1.35414155209088, + "learning_rate": 0.0002617351934639022, + "loss": 6.7545, + "step": 16485 + }, + { + "epoch": 1.5383036297471309, + "grad_norm": 1.2641795211209998, + "learning_rate": 0.0002617301469563034, + "loss": 6.8078, + "step": 16486 + }, + { + "epoch": 1.538396939442008, + "grad_norm": 1.2997410093183506, + "learning_rate": 0.0002617251001646079, + "loss": 7.3019, + "step": 16487 + }, + { + "epoch": 1.5384902491368853, + "grad_norm": 1.0017076296315255, + "learning_rate": 0.00026172005308882864, + "loss": 7.0939, + "step": 16488 + }, + { + "epoch": 1.5385835588317627, + "grad_norm": 55291.674744571166, + "learning_rate": 0.00026171500572897836, + "loss": 7.5563, + "step": 16489 + }, + { + "epoch": 1.5386768685266399, + "grad_norm": 1.732270587443669, + "learning_rate": 0.0002617099580850699, + "loss": 7.4572, + "step": 16490 + }, + { + "epoch": 1.538770178221517, + "grad_norm": 81697.6524771528, + "learning_rate": 0.0002617049101571161, + "loss": 7.33, + "step": 16491 + }, + { + "epoch": 1.5388634879163945, + "grad_norm": 32409.102594671167, + "learning_rate": 0.0002616998619451298, + "loss": 7.148, + "step": 16492 + }, + { + "epoch": 1.538956797611272, + "grad_norm": 0.6308278719621477, + "learning_rate": 0.0002616948134491239, + "loss": 7.053, + "step": 16493 + }, + { + "epoch": 1.539050107306149, + "grad_norm": 0.740279915938104, + "learning_rate": 0.0002616897646691111, + "loss": 7.2657, + "step": 16494 + }, + { + "epoch": 1.5391434170010263, + "grad_norm": 0.8033749368690523, + "learning_rate": 0.00026168471560510435, + "loss": 7.2455, + "step": 16495 + }, + { + "epoch": 1.5392367266959037, + "grad_norm": 0.8981822911443551, + "learning_rate": 0.0002616796662571164, + "loss": 7.3275, + "step": 16496 + }, + { + "epoch": 1.5393300363907811, + "grad_norm": 0.6018236900857413, + "learning_rate": 0.0002616746166251602, + "loss": 7.3686, + "step": 16497 + }, + { + "epoch": 1.5394233460856583, + "grad_norm": 0.7968013662422754, + "learning_rate": 0.0002616695667092485, + "loss": 7.2791, + "step": 16498 + }, + { + "epoch": 1.5395166557805355, + "grad_norm": 0.720744193221168, + "learning_rate": 0.0002616645165093942, + "loss": 7.2516, + "step": 16499 + }, + { + "epoch": 1.539609965475413, + "grad_norm": 0.4168957613497468, + "learning_rate": 0.0002616594660256101, + "loss": 7.2892, + "step": 16500 + }, + { + "epoch": 1.5397032751702902, + "grad_norm": 0.5595212966720386, + "learning_rate": 0.000261654415257909, + "loss": 7.3285, + "step": 16501 + }, + { + "epoch": 1.5397965848651674, + "grad_norm": 0.6827982017002674, + "learning_rate": 0.0002616493642063039, + "loss": 7.1414, + "step": 16502 + }, + { + "epoch": 1.5398898945600448, + "grad_norm": 0.7309082028663711, + "learning_rate": 0.0002616443128708075, + "loss": 7.2016, + "step": 16503 + }, + { + "epoch": 1.5399832042549222, + "grad_norm": 0.9062180654095461, + "learning_rate": 0.00026163926125143263, + "loss": 7.2501, + "step": 16504 + }, + { + "epoch": 1.5400765139497994, + "grad_norm": 0.9933912654267909, + "learning_rate": 0.0002616342093481923, + "loss": 7.2028, + "step": 16505 + }, + { + "epoch": 1.5401698236446766, + "grad_norm": 0.5671140471522206, + "learning_rate": 0.0002616291571610991, + "loss": 7.3675, + "step": 16506 + }, + { + "epoch": 1.540263133339554, + "grad_norm": 0.7343768913716959, + "learning_rate": 0.0002616241046901661, + "loss": 6.9536, + "step": 16507 + }, + { + "epoch": 1.5403564430344314, + "grad_norm": 0.9575108672754425, + "learning_rate": 0.00026161905193540605, + "loss": 6.9753, + "step": 16508 + }, + { + "epoch": 1.5404497527293086, + "grad_norm": 14732.349350127568, + "learning_rate": 0.00026161399889683184, + "loss": 7.0215, + "step": 16509 + }, + { + "epoch": 1.5405430624241858, + "grad_norm": 1.1629762823451568, + "learning_rate": 0.0002616089455744563, + "loss": 6.6997, + "step": 16510 + }, + { + "epoch": 1.5406363721190632, + "grad_norm": 0.8046860678546002, + "learning_rate": 0.00026160389196829226, + "loss": 6.8117, + "step": 16511 + }, + { + "epoch": 1.5407296818139404, + "grad_norm": 1.4512149097467761, + "learning_rate": 0.0002615988380783525, + "loss": 7.1634, + "step": 16512 + }, + { + "epoch": 1.5408229915088176, + "grad_norm": 0.7868192179202489, + "learning_rate": 0.0002615937839046501, + "loss": 7.0316, + "step": 16513 + }, + { + "epoch": 1.540916301203695, + "grad_norm": 0.7992450018206119, + "learning_rate": 0.0002615887294471976, + "loss": 6.9968, + "step": 16514 + }, + { + "epoch": 1.5410096108985725, + "grad_norm": 0.9516383414862357, + "learning_rate": 0.00026158367470600815, + "loss": 7.1526, + "step": 16515 + }, + { + "epoch": 1.5411029205934497, + "grad_norm": 0.6970028101841566, + "learning_rate": 0.0002615786196810944, + "loss": 7.0825, + "step": 16516 + }, + { + "epoch": 1.5411962302883269, + "grad_norm": 1.1255554749322487, + "learning_rate": 0.0002615735643724693, + "loss": 7.3171, + "step": 16517 + }, + { + "epoch": 1.5412895399832043, + "grad_norm": 0.5534570639478059, + "learning_rate": 0.0002615685087801457, + "loss": 7.1262, + "step": 16518 + }, + { + "epoch": 1.5413828496780817, + "grad_norm": 0.9730632449175266, + "learning_rate": 0.0002615634529041364, + "loss": 7.463, + "step": 16519 + }, + { + "epoch": 1.541476159372959, + "grad_norm": 46570.63523247384, + "learning_rate": 0.0002615583967444543, + "loss": 7.4061, + "step": 16520 + }, + { + "epoch": 1.541569469067836, + "grad_norm": 1.4369994579828127, + "learning_rate": 0.00026155334030111225, + "loss": 6.8602, + "step": 16521 + }, + { + "epoch": 1.5416627787627135, + "grad_norm": 0.48919219073301606, + "learning_rate": 0.00026154828357412307, + "loss": 7.2967, + "step": 16522 + }, + { + "epoch": 1.5417560884575907, + "grad_norm": 0.45879319867775237, + "learning_rate": 0.00026154322656349966, + "loss": 7.207, + "step": 16523 + }, + { + "epoch": 1.541849398152468, + "grad_norm": 0.8798988685829202, + "learning_rate": 0.0002615381692692549, + "loss": 7.0605, + "step": 16524 + }, + { + "epoch": 1.5419427078473453, + "grad_norm": 0.6964814796345766, + "learning_rate": 0.00026153311169140156, + "loss": 7.2202, + "step": 16525 + }, + { + "epoch": 1.5420360175422227, + "grad_norm": 1.150033610799782, + "learning_rate": 0.00026152805382995256, + "loss": 7.2108, + "step": 16526 + }, + { + "epoch": 1.5421293272371, + "grad_norm": 32914.85519871511, + "learning_rate": 0.00026152299568492083, + "loss": 7.4479, + "step": 16527 + }, + { + "epoch": 1.5422226369319771, + "grad_norm": 0.4722858386536951, + "learning_rate": 0.00026151793725631906, + "loss": 7.1237, + "step": 16528 + }, + { + "epoch": 1.5423159466268546, + "grad_norm": 0.488601481192932, + "learning_rate": 0.00026151287854416026, + "loss": 7.0774, + "step": 16529 + }, + { + "epoch": 1.542409256321732, + "grad_norm": 43050.24176094842, + "learning_rate": 0.0002615078195484572, + "loss": 7.2881, + "step": 16530 + }, + { + "epoch": 1.5425025660166092, + "grad_norm": 29905.736340637, + "learning_rate": 0.0002615027602692229, + "loss": 7.3175, + "step": 16531 + }, + { + "epoch": 1.5425958757114864, + "grad_norm": 0.6670644874689987, + "learning_rate": 0.00026149770070646997, + "loss": 7.4591, + "step": 16532 + }, + { + "epoch": 1.5426891854063638, + "grad_norm": 120130.33302947364, + "learning_rate": 0.0002614926408602115, + "loss": 6.945, + "step": 16533 + }, + { + "epoch": 1.542782495101241, + "grad_norm": 0.9684206696511751, + "learning_rate": 0.0002614875807304603, + "loss": 7.2254, + "step": 16534 + }, + { + "epoch": 1.5428758047961182, + "grad_norm": 0.43983254632672864, + "learning_rate": 0.0002614825203172291, + "loss": 7.4198, + "step": 16535 + }, + { + "epoch": 1.5429691144909956, + "grad_norm": 0.47446217630505594, + "learning_rate": 0.00026147745962053096, + "loss": 7.1517, + "step": 16536 + }, + { + "epoch": 1.543062424185873, + "grad_norm": 0.883178481586874, + "learning_rate": 0.00026147239864037863, + "loss": 7.5407, + "step": 16537 + }, + { + "epoch": 1.5431557338807502, + "grad_norm": 0.5405428202647915, + "learning_rate": 0.00026146733737678497, + "loss": 7.2042, + "step": 16538 + }, + { + "epoch": 1.5432490435756274, + "grad_norm": 0.6011853836582643, + "learning_rate": 0.00026146227582976296, + "loss": 7.2923, + "step": 16539 + }, + { + "epoch": 1.5433423532705048, + "grad_norm": 0.7694602066082753, + "learning_rate": 0.00026145721399932535, + "loss": 6.9936, + "step": 16540 + }, + { + "epoch": 1.5434356629653823, + "grad_norm": 0.7872780176081295, + "learning_rate": 0.0002614521518854851, + "loss": 7.1388, + "step": 16541 + }, + { + "epoch": 1.5435289726602592, + "grad_norm": 0.5450239292352699, + "learning_rate": 0.000261447089488255, + "loss": 7.0389, + "step": 16542 + }, + { + "epoch": 1.5436222823551367, + "grad_norm": 36977.80688197563, + "learning_rate": 0.00026144202680764803, + "loss": 7.4484, + "step": 16543 + }, + { + "epoch": 1.543715592050014, + "grad_norm": 0.9311010302776103, + "learning_rate": 0.00026143696384367694, + "loss": 7.4314, + "step": 16544 + }, + { + "epoch": 1.5438089017448913, + "grad_norm": 1.137000838796892, + "learning_rate": 0.00026143190059635466, + "loss": 6.8024, + "step": 16545 + }, + { + "epoch": 1.5439022114397685, + "grad_norm": 211107.56771670672, + "learning_rate": 0.0002614268370656941, + "loss": 6.819, + "step": 16546 + }, + { + "epoch": 1.5439955211346459, + "grad_norm": 0.6695190338682789, + "learning_rate": 0.0002614217732517081, + "loss": 7.1068, + "step": 16547 + }, + { + "epoch": 1.5440888308295233, + "grad_norm": 252689.81407056033, + "learning_rate": 0.0002614167091544095, + "loss": 7.2995, + "step": 16548 + }, + { + "epoch": 1.5441821405244005, + "grad_norm": 1.1989971603337852, + "learning_rate": 0.00026141164477381124, + "loss": 7.4279, + "step": 16549 + }, + { + "epoch": 1.5442754502192777, + "grad_norm": 0.5864216988483069, + "learning_rate": 0.0002614065801099262, + "loss": 7.2342, + "step": 16550 + }, + { + "epoch": 1.5443687599141551, + "grad_norm": 290322.15060020273, + "learning_rate": 0.00026140151516276713, + "loss": 7.2389, + "step": 16551 + }, + { + "epoch": 1.5444620696090325, + "grad_norm": 0.59915841636788, + "learning_rate": 0.00026139644993234713, + "loss": 7.294, + "step": 16552 + }, + { + "epoch": 1.5445553793039095, + "grad_norm": 0.6291873493001718, + "learning_rate": 0.0002613913844186789, + "loss": 6.9716, + "step": 16553 + }, + { + "epoch": 1.544648688998787, + "grad_norm": 0.778881508398364, + "learning_rate": 0.0002613863186217753, + "loss": 7.3417, + "step": 16554 + }, + { + "epoch": 1.5447419986936644, + "grad_norm": 1.1719472865966074, + "learning_rate": 0.00026138125254164936, + "loss": 7.2652, + "step": 16555 + }, + { + "epoch": 1.5448353083885416, + "grad_norm": 0.7351521763116734, + "learning_rate": 0.0002613761861783139, + "loss": 7.0077, + "step": 16556 + }, + { + "epoch": 1.5449286180834187, + "grad_norm": 69289.3014037794, + "learning_rate": 0.00026137111953178176, + "loss": 7.1047, + "step": 16557 + }, + { + "epoch": 1.5450219277782962, + "grad_norm": 125281.1486204584, + "learning_rate": 0.0002613660526020659, + "loss": 7.4485, + "step": 16558 + }, + { + "epoch": 1.5451152374731736, + "grad_norm": 0.7614920570250258, + "learning_rate": 0.0002613609853891791, + "loss": 7.333, + "step": 16559 + }, + { + "epoch": 1.5452085471680508, + "grad_norm": 0.8095682797597106, + "learning_rate": 0.0002613559178931344, + "loss": 7.1955, + "step": 16560 + }, + { + "epoch": 1.545301856862928, + "grad_norm": 1.1967558305223311, + "learning_rate": 0.0002613508501139445, + "loss": 7.2788, + "step": 16561 + }, + { + "epoch": 1.5453951665578054, + "grad_norm": 0.7145591242789106, + "learning_rate": 0.0002613457820516223, + "loss": 7.3845, + "step": 16562 + }, + { + "epoch": 1.5454884762526828, + "grad_norm": 0.9439957534051626, + "learning_rate": 0.0002613407137061809, + "loss": 7.1758, + "step": 16563 + }, + { + "epoch": 1.5455817859475598, + "grad_norm": 0.689604602071575, + "learning_rate": 0.000261335645077633, + "loss": 7.3729, + "step": 16564 + }, + { + "epoch": 1.5456750956424372, + "grad_norm": 0.5563548708862602, + "learning_rate": 0.00026133057616599154, + "loss": 7.3138, + "step": 16565 + }, + { + "epoch": 1.5457684053373146, + "grad_norm": 0.7579881535520681, + "learning_rate": 0.00026132550697126935, + "loss": 7.1913, + "step": 16566 + }, + { + "epoch": 1.5458617150321918, + "grad_norm": 391643.0381385069, + "learning_rate": 0.00026132043749347943, + "loss": 6.8651, + "step": 16567 + }, + { + "epoch": 1.545955024727069, + "grad_norm": 0.49135663501285526, + "learning_rate": 0.00026131536773263463, + "loss": 7.1669, + "step": 16568 + }, + { + "epoch": 1.5460483344219464, + "grad_norm": 0.5832965804416032, + "learning_rate": 0.00026131029768874773, + "loss": 7.0807, + "step": 16569 + }, + { + "epoch": 1.5461416441168239, + "grad_norm": 0.913756219091315, + "learning_rate": 0.0002613052273618318, + "loss": 6.9977, + "step": 16570 + }, + { + "epoch": 1.546234953811701, + "grad_norm": 95608.14688443468, + "learning_rate": 0.0002613001567518996, + "loss": 7.2286, + "step": 16571 + }, + { + "epoch": 1.5463282635065783, + "grad_norm": 1.1688135426996276, + "learning_rate": 0.00026129508585896413, + "loss": 7.3467, + "step": 16572 + }, + { + "epoch": 1.5464215732014557, + "grad_norm": 0.9932990324417006, + "learning_rate": 0.00026129001468303817, + "loss": 7.4056, + "step": 16573 + }, + { + "epoch": 1.5465148828963329, + "grad_norm": 0.525485625999682, + "learning_rate": 0.0002612849432241347, + "loss": 7.2555, + "step": 16574 + }, + { + "epoch": 1.54660819259121, + "grad_norm": 0.6531576910402485, + "learning_rate": 0.00026127987148226656, + "loss": 7.397, + "step": 16575 + }, + { + "epoch": 1.5467015022860875, + "grad_norm": 0.601795715286192, + "learning_rate": 0.00026127479945744673, + "loss": 7.3214, + "step": 16576 + }, + { + "epoch": 1.546794811980965, + "grad_norm": 1.168207219513387, + "learning_rate": 0.00026126972714968796, + "loss": 6.8494, + "step": 16577 + }, + { + "epoch": 1.546888121675842, + "grad_norm": 1.040819344476261, + "learning_rate": 0.0002612646545590033, + "loss": 7.254, + "step": 16578 + }, + { + "epoch": 1.5469814313707193, + "grad_norm": 1.1929131058034903, + "learning_rate": 0.00026125958168540553, + "loss": 7.0513, + "step": 16579 + }, + { + "epoch": 1.5470747410655967, + "grad_norm": 0.9602680139135776, + "learning_rate": 0.00026125450852890767, + "loss": 7.2131, + "step": 16580 + }, + { + "epoch": 1.5471680507604741, + "grad_norm": 1.0822151685424681, + "learning_rate": 0.0002612494350895224, + "loss": 7.2749, + "step": 16581 + }, + { + "epoch": 1.5472613604553513, + "grad_norm": 0.3798324930642519, + "learning_rate": 0.000261244361367263, + "loss": 6.7561, + "step": 16582 + }, + { + "epoch": 1.5473546701502285, + "grad_norm": 1.0235330444654127, + "learning_rate": 0.00026123928736214197, + "loss": 7.1334, + "step": 16583 + }, + { + "epoch": 1.547447979845106, + "grad_norm": 1.3185258021133994, + "learning_rate": 0.00026123421307417245, + "loss": 7.3097, + "step": 16584 + }, + { + "epoch": 1.5475412895399832, + "grad_norm": 0.40174820756244, + "learning_rate": 0.0002612291385033673, + "loss": 7.0576, + "step": 16585 + }, + { + "epoch": 1.5476345992348604, + "grad_norm": 386.80394351829034, + "learning_rate": 0.00026122406364973935, + "loss": 6.9911, + "step": 16586 + }, + { + "epoch": 1.5477279089297378, + "grad_norm": 1.4209959021217704, + "learning_rate": 0.00026121898851330156, + "loss": 6.8208, + "step": 16587 + }, + { + "epoch": 1.5478212186246152, + "grad_norm": 1.6906341081917524, + "learning_rate": 0.0002612139130940668, + "loss": 6.6521, + "step": 16588 + }, + { + "epoch": 1.5479145283194924, + "grad_norm": 0.6061821278487914, + "learning_rate": 0.00026120883739204803, + "loss": 6.8689, + "step": 16589 + }, + { + "epoch": 1.5480078380143696, + "grad_norm": 0.6018955848143284, + "learning_rate": 0.0002612037614072582, + "loss": 7.1194, + "step": 16590 + }, + { + "epoch": 1.548101147709247, + "grad_norm": 0.903169806836696, + "learning_rate": 0.00026119868513971, + "loss": 7.1222, + "step": 16591 + }, + { + "epoch": 1.5481944574041244, + "grad_norm": 0.8292954854307635, + "learning_rate": 0.0002611936085894166, + "loss": 7.0399, + "step": 16592 + }, + { + "epoch": 1.5482877670990016, + "grad_norm": 0.7084817498504925, + "learning_rate": 0.00026118853175639075, + "loss": 6.8627, + "step": 16593 + }, + { + "epoch": 1.5483810767938788, + "grad_norm": 0.8701817607515434, + "learning_rate": 0.0002611834546406454, + "loss": 7.0669, + "step": 16594 + }, + { + "epoch": 1.5484743864887562, + "grad_norm": 1.1455437802415323, + "learning_rate": 0.00026117837724219347, + "loss": 7.169, + "step": 16595 + }, + { + "epoch": 1.5485676961836334, + "grad_norm": 0.7341888726801608, + "learning_rate": 0.00026117329956104786, + "loss": 7.1793, + "step": 16596 + }, + { + "epoch": 1.5486610058785106, + "grad_norm": 0.4535484548511301, + "learning_rate": 0.00026116822159722147, + "loss": 6.9422, + "step": 16597 + }, + { + "epoch": 1.548754315573388, + "grad_norm": 0.6992839711513618, + "learning_rate": 0.00026116314335072725, + "loss": 6.9926, + "step": 16598 + }, + { + "epoch": 1.5488476252682655, + "grad_norm": 0.585422754421392, + "learning_rate": 0.00026115806482157806, + "loss": 7.369, + "step": 16599 + }, + { + "epoch": 1.5489409349631427, + "grad_norm": 0.7476761533876054, + "learning_rate": 0.00026115298600978685, + "loss": 7.2015, + "step": 16600 + }, + { + "epoch": 1.5490342446580199, + "grad_norm": 2171.3188895371095, + "learning_rate": 0.0002611479069153665, + "loss": 7.1362, + "step": 16601 + }, + { + "epoch": 1.5491275543528973, + "grad_norm": 0.5472655871367845, + "learning_rate": 0.00026114282753833, + "loss": 7.3363, + "step": 16602 + }, + { + "epoch": 1.5492208640477747, + "grad_norm": 0.5453898054030994, + "learning_rate": 0.0002611377478786902, + "loss": 7.3297, + "step": 16603 + }, + { + "epoch": 1.549314173742652, + "grad_norm": 0.46970685425425257, + "learning_rate": 0.00026113266793645996, + "loss": 7.4221, + "step": 16604 + }, + { + "epoch": 1.549407483437529, + "grad_norm": 0.44570989572151876, + "learning_rate": 0.0002611275877116523, + "loss": 7.3119, + "step": 16605 + }, + { + "epoch": 1.5495007931324065, + "grad_norm": 0.4817063767102508, + "learning_rate": 0.0002611225072042801, + "loss": 7.3055, + "step": 16606 + }, + { + "epoch": 1.5495941028272837, + "grad_norm": 0.4605547199226983, + "learning_rate": 0.00026111742641435635, + "loss": 7.2776, + "step": 16607 + }, + { + "epoch": 1.549687412522161, + "grad_norm": 0.9175343114360577, + "learning_rate": 0.0002611123453418938, + "loss": 6.7889, + "step": 16608 + }, + { + "epoch": 1.5497807222170383, + "grad_norm": 0.47446630666154577, + "learning_rate": 0.00026110726398690553, + "loss": 7.2197, + "step": 16609 + }, + { + "epoch": 1.5498740319119158, + "grad_norm": 0.43413701845105146, + "learning_rate": 0.00026110218234940433, + "loss": 6.9231, + "step": 16610 + }, + { + "epoch": 1.549967341606793, + "grad_norm": 1.4054688580719799, + "learning_rate": 0.00026109710042940325, + "loss": 7.4567, + "step": 16611 + }, + { + "epoch": 1.5500606513016701, + "grad_norm": 0.8609227395406005, + "learning_rate": 0.0002610920182269152, + "loss": 7.1303, + "step": 16612 + }, + { + "epoch": 1.5501539609965476, + "grad_norm": 0.6118045554807398, + "learning_rate": 0.00026108693574195294, + "loss": 7.2334, + "step": 16613 + }, + { + "epoch": 1.550247270691425, + "grad_norm": 0.49970538269394205, + "learning_rate": 0.00026108185297452963, + "loss": 7.1203, + "step": 16614 + }, + { + "epoch": 1.5503405803863022, + "grad_norm": 0.44900941829662117, + "learning_rate": 0.000261076769924658, + "loss": 7.2982, + "step": 16615 + }, + { + "epoch": 1.5504338900811794, + "grad_norm": 0.6337846089005011, + "learning_rate": 0.00026107168659235104, + "loss": 7.1885, + "step": 16616 + }, + { + "epoch": 1.5505271997760568, + "grad_norm": 1.3141915315068793, + "learning_rate": 0.00026106660297762173, + "loss": 6.9961, + "step": 16617 + }, + { + "epoch": 1.550620509470934, + "grad_norm": 0.9078946392488624, + "learning_rate": 0.00026106151908048296, + "loss": 6.9613, + "step": 16618 + }, + { + "epoch": 1.5507138191658112, + "grad_norm": 0.9247254111523142, + "learning_rate": 0.0002610564349009476, + "loss": 7.5441, + "step": 16619 + }, + { + "epoch": 1.5508071288606886, + "grad_norm": 0.652128372158796, + "learning_rate": 0.0002610513504390286, + "loss": 7.2075, + "step": 16620 + }, + { + "epoch": 1.550900438555566, + "grad_norm": 1.3964185718135222, + "learning_rate": 0.0002610462656947389, + "loss": 7.3101, + "step": 16621 + }, + { + "epoch": 1.5509937482504432, + "grad_norm": 1.4301365073344696, + "learning_rate": 0.00026104118066809154, + "loss": 7.5732, + "step": 16622 + }, + { + "epoch": 1.5510870579453204, + "grad_norm": 0.5671121829656786, + "learning_rate": 0.0002610360953590993, + "loss": 7.3332, + "step": 16623 + }, + { + "epoch": 1.5511803676401978, + "grad_norm": 0.6935232947037122, + "learning_rate": 0.00026103100976777513, + "loss": 7.3952, + "step": 16624 + }, + { + "epoch": 1.5512736773350753, + "grad_norm": 149.34922804162093, + "learning_rate": 0.00026102592389413206, + "loss": 6.973, + "step": 16625 + }, + { + "epoch": 1.5513669870299525, + "grad_norm": 1.5219304687210367, + "learning_rate": 0.0002610208377381829, + "loss": 7.261, + "step": 16626 + }, + { + "epoch": 1.5514602967248297, + "grad_norm": 1.8563243412149546, + "learning_rate": 0.00026101575129994063, + "loss": 7.0152, + "step": 16627 + }, + { + "epoch": 1.551553606419707, + "grad_norm": 0.7745569734322296, + "learning_rate": 0.00026101066457941824, + "loss": 7.2409, + "step": 16628 + }, + { + "epoch": 1.5516469161145843, + "grad_norm": 0.4132848891606645, + "learning_rate": 0.0002610055775766286, + "loss": 7.2075, + "step": 16629 + }, + { + "epoch": 1.5517402258094615, + "grad_norm": 510.4038156003212, + "learning_rate": 0.0002610004902915846, + "loss": 7.3518, + "step": 16630 + }, + { + "epoch": 1.551833535504339, + "grad_norm": 1.5228966686228167, + "learning_rate": 0.0002609954027242993, + "loss": 7.2765, + "step": 16631 + }, + { + "epoch": 1.5519268451992163, + "grad_norm": 1.5091994289793091, + "learning_rate": 0.00026099031487478554, + "loss": 7.2855, + "step": 16632 + }, + { + "epoch": 1.5520201548940935, + "grad_norm": 0.957208238868731, + "learning_rate": 0.0002609852267430563, + "loss": 7.0477, + "step": 16633 + }, + { + "epoch": 1.5521134645889707, + "grad_norm": 0.5656735426470946, + "learning_rate": 0.0002609801383291245, + "loss": 6.8896, + "step": 16634 + }, + { + "epoch": 1.5522067742838481, + "grad_norm": 413.7169937732294, + "learning_rate": 0.00026097504963300304, + "loss": 6.906, + "step": 16635 + }, + { + "epoch": 1.5523000839787255, + "grad_norm": 0.46379378708323116, + "learning_rate": 0.00026096996065470493, + "loss": 7.0618, + "step": 16636 + }, + { + "epoch": 1.5523933936736027, + "grad_norm": 0.9273669408385249, + "learning_rate": 0.00026096487139424314, + "loss": 6.7758, + "step": 16637 + }, + { + "epoch": 1.55248670336848, + "grad_norm": 0.43063510600494187, + "learning_rate": 0.0002609597818516305, + "loss": 7.207, + "step": 16638 + }, + { + "epoch": 1.5525800130633574, + "grad_norm": 0.9281238130710099, + "learning_rate": 0.00026095469202687996, + "loss": 7.418, + "step": 16639 + }, + { + "epoch": 1.5526733227582346, + "grad_norm": 287.98061097085923, + "learning_rate": 0.0002609496019200046, + "loss": 7.4136, + "step": 16640 + }, + { + "epoch": 1.5527666324531118, + "grad_norm": 0.6456768438121376, + "learning_rate": 0.0002609445115310172, + "loss": 7.2941, + "step": 16641 + }, + { + "epoch": 1.5528599421479892, + "grad_norm": 0.5080714620521324, + "learning_rate": 0.0002609394208599308, + "loss": 7.2701, + "step": 16642 + }, + { + "epoch": 1.5529532518428666, + "grad_norm": 290.3949616194086, + "learning_rate": 0.00026093432990675826, + "loss": 7.3541, + "step": 16643 + }, + { + "epoch": 1.5530465615377438, + "grad_norm": 0.8729332879442794, + "learning_rate": 0.0002609292386715127, + "loss": 7.3304, + "step": 16644 + }, + { + "epoch": 1.553139871232621, + "grad_norm": 1.4604337284160296, + "learning_rate": 0.00026092414715420684, + "loss": 7.0987, + "step": 16645 + }, + { + "epoch": 1.5532331809274984, + "grad_norm": 1.0209215884921286, + "learning_rate": 0.00026091905535485376, + "loss": 7.226, + "step": 16646 + }, + { + "epoch": 1.5533264906223758, + "grad_norm": 0.9954355369811765, + "learning_rate": 0.0002609139632734664, + "loss": 7.0835, + "step": 16647 + }, + { + "epoch": 1.5534198003172528, + "grad_norm": 0.5755708740003411, + "learning_rate": 0.0002609088709100577, + "loss": 7.254, + "step": 16648 + }, + { + "epoch": 1.5535131100121302, + "grad_norm": 1.326753453840121, + "learning_rate": 0.00026090377826464055, + "loss": 7.2761, + "step": 16649 + }, + { + "epoch": 1.5536064197070076, + "grad_norm": 1.2156952224646789, + "learning_rate": 0.00026089868533722796, + "loss": 7.213, + "step": 16650 + }, + { + "epoch": 1.5536997294018848, + "grad_norm": 1.14048985433173, + "learning_rate": 0.0002608935921278329, + "loss": 7.0612, + "step": 16651 + }, + { + "epoch": 1.553793039096762, + "grad_norm": 1.5614817441814388, + "learning_rate": 0.00026088849863646824, + "loss": 7.4454, + "step": 16652 + }, + { + "epoch": 1.5538863487916394, + "grad_norm": 1.221356463635011, + "learning_rate": 0.00026088340486314697, + "loss": 7.2506, + "step": 16653 + }, + { + "epoch": 1.5539796584865169, + "grad_norm": 245.66495565086544, + "learning_rate": 0.00026087831080788207, + "loss": 7.083, + "step": 16654 + }, + { + "epoch": 1.554072968181394, + "grad_norm": 0.4888727015095919, + "learning_rate": 0.00026087321647068644, + "loss": 7.2019, + "step": 16655 + }, + { + "epoch": 1.5541662778762713, + "grad_norm": 1.1129410474027284, + "learning_rate": 0.0002608681218515731, + "loss": 7.212, + "step": 16656 + }, + { + "epoch": 1.5542595875711487, + "grad_norm": 1.1295232314395376, + "learning_rate": 0.000260863026950555, + "loss": 7.2084, + "step": 16657 + }, + { + "epoch": 1.554352897266026, + "grad_norm": 50.590360963399995, + "learning_rate": 0.000260857931767645, + "loss": 7.0528, + "step": 16658 + }, + { + "epoch": 1.554446206960903, + "grad_norm": 0.7105451336390304, + "learning_rate": 0.0002608528363028561, + "loss": 7.1178, + "step": 16659 + }, + { + "epoch": 1.5545395166557805, + "grad_norm": 314.75990961055993, + "learning_rate": 0.0002608477405562013, + "loss": 7.441, + "step": 16660 + }, + { + "epoch": 1.554632826350658, + "grad_norm": 0.502553512696387, + "learning_rate": 0.00026084264452769354, + "loss": 7.2434, + "step": 16661 + }, + { + "epoch": 1.5547261360455351, + "grad_norm": 1.0554613597860785, + "learning_rate": 0.0002608375482173458, + "loss": 7.0399, + "step": 16662 + }, + { + "epoch": 1.5548194457404123, + "grad_norm": 1.321982730140511, + "learning_rate": 0.000260832451625171, + "loss": 7.3815, + "step": 16663 + }, + { + "epoch": 1.5549127554352897, + "grad_norm": 71.70070282725369, + "learning_rate": 0.00026082735475118203, + "loss": 7.2513, + "step": 16664 + }, + { + "epoch": 1.5550060651301671, + "grad_norm": 0.718713067152668, + "learning_rate": 0.000260822257595392, + "loss": 7.2677, + "step": 16665 + }, + { + "epoch": 1.5550993748250443, + "grad_norm": 57.51487935148746, + "learning_rate": 0.00026081716015781376, + "loss": 6.9011, + "step": 16666 + }, + { + "epoch": 1.5551926845199215, + "grad_norm": 332.5123013765548, + "learning_rate": 0.0002608120624384603, + "loss": 7.0297, + "step": 16667 + }, + { + "epoch": 1.555285994214799, + "grad_norm": 0.6759014697015954, + "learning_rate": 0.00026080696443734464, + "loss": 7.1984, + "step": 16668 + }, + { + "epoch": 1.5553793039096764, + "grad_norm": 1.0594735152314, + "learning_rate": 0.0002608018661544797, + "loss": 6.9149, + "step": 16669 + }, + { + "epoch": 1.5554726136045534, + "grad_norm": 0.4617738632247037, + "learning_rate": 0.0002607967675898784, + "loss": 7.4509, + "step": 16670 + }, + { + "epoch": 1.5555659232994308, + "grad_norm": 1.6128232246595189, + "learning_rate": 0.0002607916687435537, + "loss": 6.9001, + "step": 16671 + }, + { + "epoch": 1.5556592329943082, + "grad_norm": 0.42445016378002093, + "learning_rate": 0.0002607865696155187, + "loss": 7.0836, + "step": 16672 + }, + { + "epoch": 1.5557525426891854, + "grad_norm": 1.456322347561504, + "learning_rate": 0.0002607814702057862, + "loss": 7.4656, + "step": 16673 + }, + { + "epoch": 1.5558458523840626, + "grad_norm": 0.7870086590700581, + "learning_rate": 0.00026077637051436924, + "loss": 7.0573, + "step": 16674 + }, + { + "epoch": 1.55593916207894, + "grad_norm": 0.5195115370548578, + "learning_rate": 0.0002607712705412808, + "loss": 7.0844, + "step": 16675 + }, + { + "epoch": 1.5560324717738174, + "grad_norm": 494.29849200875736, + "learning_rate": 0.00026076617028653385, + "loss": 6.8779, + "step": 16676 + }, + { + "epoch": 1.5561257814686946, + "grad_norm": 0.37510341327452135, + "learning_rate": 0.0002607610697501413, + "loss": 7.2241, + "step": 16677 + }, + { + "epoch": 1.5562190911635718, + "grad_norm": 0.5707037897886049, + "learning_rate": 0.00026075596893211623, + "loss": 7.2243, + "step": 16678 + }, + { + "epoch": 1.5563124008584492, + "grad_norm": 0.6373445568985378, + "learning_rate": 0.00026075086783247143, + "loss": 7.2851, + "step": 16679 + }, + { + "epoch": 1.5564057105533264, + "grad_norm": 0.3735154675016685, + "learning_rate": 0.0002607457664512201, + "loss": 7.0233, + "step": 16680 + }, + { + "epoch": 1.5564990202482036, + "grad_norm": 489.3240877993943, + "learning_rate": 0.00026074066478837504, + "loss": 7.1211, + "step": 16681 + }, + { + "epoch": 1.556592329943081, + "grad_norm": 0.7396134395341927, + "learning_rate": 0.00026073556284394924, + "loss": 7.2557, + "step": 16682 + }, + { + "epoch": 1.5566856396379585, + "grad_norm": 0.8273401078795651, + "learning_rate": 0.00026073046061795576, + "loss": 7.3093, + "step": 16683 + }, + { + "epoch": 1.5567789493328357, + "grad_norm": 0.4886760369127813, + "learning_rate": 0.0002607253581104075, + "loss": 6.7928, + "step": 16684 + }, + { + "epoch": 1.5568722590277129, + "grad_norm": 1.301523326846929, + "learning_rate": 0.0002607202553213174, + "loss": 7.3988, + "step": 16685 + }, + { + "epoch": 1.5569655687225903, + "grad_norm": 0.6223478382371911, + "learning_rate": 0.00026071515225069855, + "loss": 7.0463, + "step": 16686 + }, + { + "epoch": 1.5570588784174677, + "grad_norm": 0.7910857361533723, + "learning_rate": 0.00026071004889856384, + "loss": 7.2922, + "step": 16687 + }, + { + "epoch": 1.557152188112345, + "grad_norm": 4731.742056943809, + "learning_rate": 0.0002607049452649263, + "loss": 7.0403, + "step": 16688 + }, + { + "epoch": 1.557245497807222, + "grad_norm": 0.5409973524350171, + "learning_rate": 0.0002606998413497989, + "loss": 6.9079, + "step": 16689 + }, + { + "epoch": 1.5573388075020995, + "grad_norm": 0.3438525905287098, + "learning_rate": 0.0002606947371531945, + "loss": 7.1625, + "step": 16690 + }, + { + "epoch": 1.5574321171969767, + "grad_norm": 0.9012745860976712, + "learning_rate": 0.00026068963267512625, + "loss": 7.3273, + "step": 16691 + }, + { + "epoch": 1.557525426891854, + "grad_norm": 0.41137812803576707, + "learning_rate": 0.000260684527915607, + "loss": 7.1448, + "step": 16692 + }, + { + "epoch": 1.5576187365867313, + "grad_norm": 0.7327616300810054, + "learning_rate": 0.0002606794228746498, + "loss": 7.5492, + "step": 16693 + }, + { + "epoch": 1.5577120462816088, + "grad_norm": 1.4870753171920845, + "learning_rate": 0.00026067431755226765, + "loss": 7.4844, + "step": 16694 + }, + { + "epoch": 1.557805355976486, + "grad_norm": 0.9426648024050621, + "learning_rate": 0.00026066921194847347, + "loss": 7.0634, + "step": 16695 + }, + { + "epoch": 1.5578986656713631, + "grad_norm": 1.6519179628244025, + "learning_rate": 0.00026066410606328025, + "loss": 6.7162, + "step": 16696 + }, + { + "epoch": 1.5579919753662406, + "grad_norm": 1.0193926075050261, + "learning_rate": 0.000260658999896701, + "loss": 6.9343, + "step": 16697 + }, + { + "epoch": 1.558085285061118, + "grad_norm": 0.4057654707683559, + "learning_rate": 0.0002606538934487487, + "loss": 7.1785, + "step": 16698 + }, + { + "epoch": 1.5581785947559952, + "grad_norm": 6952.114954368244, + "learning_rate": 0.0002606487867194364, + "loss": 7.1964, + "step": 16699 + }, + { + "epoch": 1.5582719044508724, + "grad_norm": 1.6996776623962877, + "learning_rate": 0.00026064367970877686, + "loss": 7.5552, + "step": 16700 + }, + { + "epoch": 1.5583652141457498, + "grad_norm": 0.6842989911134061, + "learning_rate": 0.0002606385724167833, + "loss": 7.1401, + "step": 16701 + }, + { + "epoch": 1.558458523840627, + "grad_norm": 1.3842324360281781, + "learning_rate": 0.00026063346484346865, + "loss": 7.5155, + "step": 16702 + }, + { + "epoch": 1.5585518335355042, + "grad_norm": 0.4017549212391455, + "learning_rate": 0.0002606283569888458, + "loss": 7.1013, + "step": 16703 + }, + { + "epoch": 1.5586451432303816, + "grad_norm": 0.5535674143656226, + "learning_rate": 0.0002606232488529279, + "loss": 7.3929, + "step": 16704 + }, + { + "epoch": 1.558738452925259, + "grad_norm": 0.8281489021092273, + "learning_rate": 0.0002606181404357278, + "loss": 7.1101, + "step": 16705 + }, + { + "epoch": 1.5588317626201362, + "grad_norm": 0.8495826560457893, + "learning_rate": 0.00026061303173725847, + "loss": 7.2586, + "step": 16706 + }, + { + "epoch": 1.5589250723150134, + "grad_norm": 15383.404389627382, + "learning_rate": 0.00026060792275753303, + "loss": 6.8197, + "step": 16707 + }, + { + "epoch": 1.5590183820098908, + "grad_norm": 1.0668889456010184, + "learning_rate": 0.00026060281349656446, + "loss": 7.0134, + "step": 16708 + }, + { + "epoch": 1.5591116917047683, + "grad_norm": 0.5976595342520175, + "learning_rate": 0.0002605977039543656, + "loss": 7.2983, + "step": 16709 + }, + { + "epoch": 1.5592050013996455, + "grad_norm": 0.5628040653477067, + "learning_rate": 0.0002605925941309496, + "loss": 7.1665, + "step": 16710 + }, + { + "epoch": 1.5592983110945227, + "grad_norm": 0.45260248767046, + "learning_rate": 0.0002605874840263294, + "loss": 6.8083, + "step": 16711 + }, + { + "epoch": 1.5593916207894, + "grad_norm": 1.1260225649607485, + "learning_rate": 0.0002605823736405179, + "loss": 7.1484, + "step": 16712 + }, + { + "epoch": 1.5594849304842773, + "grad_norm": 0.6135706508789643, + "learning_rate": 0.00026057726297352827, + "loss": 7.0406, + "step": 16713 + }, + { + "epoch": 1.5595782401791545, + "grad_norm": 0.7125709950976342, + "learning_rate": 0.00026057215202537345, + "loss": 7.0643, + "step": 16714 + }, + { + "epoch": 1.559671549874032, + "grad_norm": 1.0024868412356196, + "learning_rate": 0.0002605670407960663, + "loss": 7.2604, + "step": 16715 + }, + { + "epoch": 1.5597648595689093, + "grad_norm": 1.043786488171046, + "learning_rate": 0.00026056192928561995, + "loss": 7.4033, + "step": 16716 + }, + { + "epoch": 1.5598581692637865, + "grad_norm": 49177.26617614398, + "learning_rate": 0.00026055681749404734, + "loss": 7.5247, + "step": 16717 + }, + { + "epoch": 1.5599514789586637, + "grad_norm": 1.893388678519434, + "learning_rate": 0.00026055170542136155, + "loss": 6.8226, + "step": 16718 + }, + { + "epoch": 1.5600447886535411, + "grad_norm": 1.1288139826844754, + "learning_rate": 0.00026054659306757553, + "loss": 7.0153, + "step": 16719 + }, + { + "epoch": 1.5601380983484185, + "grad_norm": 9791.883737143531, + "learning_rate": 0.00026054148043270223, + "loss": 7.1771, + "step": 16720 + }, + { + "epoch": 1.5602314080432957, + "grad_norm": 0.6943946852791728, + "learning_rate": 0.0002605363675167547, + "loss": 7.2281, + "step": 16721 + }, + { + "epoch": 1.560324717738173, + "grad_norm": 0.6225171495760443, + "learning_rate": 0.0002605312543197459, + "loss": 7.4099, + "step": 16722 + }, + { + "epoch": 1.5604180274330504, + "grad_norm": 0.5281179588005256, + "learning_rate": 0.00026052614084168886, + "loss": 6.9563, + "step": 16723 + }, + { + "epoch": 1.5605113371279276, + "grad_norm": 1.0817646938554708, + "learning_rate": 0.00026052102708259663, + "loss": 7.1708, + "step": 16724 + }, + { + "epoch": 1.5606046468228048, + "grad_norm": 0.8885981351661233, + "learning_rate": 0.0002605159130424822, + "loss": 7.2337, + "step": 16725 + }, + { + "epoch": 1.5606979565176822, + "grad_norm": 0.5611729700402258, + "learning_rate": 0.00026051079872135846, + "loss": 6.9736, + "step": 16726 + }, + { + "epoch": 1.5607912662125596, + "grad_norm": 59820.602615569835, + "learning_rate": 0.00026050568411923854, + "loss": 6.9416, + "step": 16727 + }, + { + "epoch": 1.5608845759074368, + "grad_norm": 0.6638208535278229, + "learning_rate": 0.0002605005692361354, + "loss": 7.5108, + "step": 16728 + }, + { + "epoch": 1.560977885602314, + "grad_norm": 0.8718364505435027, + "learning_rate": 0.00026049545407206203, + "loss": 7.3011, + "step": 16729 + }, + { + "epoch": 1.5610711952971914, + "grad_norm": 1.0111416664525348, + "learning_rate": 0.0002604903386270314, + "loss": 7.2132, + "step": 16730 + }, + { + "epoch": 1.5611645049920688, + "grad_norm": 0.9059727104102706, + "learning_rate": 0.00026048522290105663, + "loss": 7.1281, + "step": 16731 + }, + { + "epoch": 1.561257814686946, + "grad_norm": 51381.46460304227, + "learning_rate": 0.00026048010689415065, + "loss": 7.2043, + "step": 16732 + }, + { + "epoch": 1.5613511243818232, + "grad_norm": 1.1092798303785862, + "learning_rate": 0.00026047499060632653, + "loss": 7.2808, + "step": 16733 + }, + { + "epoch": 1.5614444340767006, + "grad_norm": 0.8019701979314892, + "learning_rate": 0.00026046987403759717, + "loss": 7.0859, + "step": 16734 + }, + { + "epoch": 1.5615377437715778, + "grad_norm": 528458.5315741159, + "learning_rate": 0.0002604647571879757, + "loss": 7.3182, + "step": 16735 + }, + { + "epoch": 1.561631053466455, + "grad_norm": 0.571486164269485, + "learning_rate": 0.000260459640057475, + "loss": 7.3169, + "step": 16736 + }, + { + "epoch": 1.5617243631613325, + "grad_norm": 599367.3477064228, + "learning_rate": 0.0002604545226461082, + "loss": 7.2782, + "step": 16737 + }, + { + "epoch": 1.5618176728562099, + "grad_norm": 0.8282538495741062, + "learning_rate": 0.0002604494049538883, + "loss": 7.2947, + "step": 16738 + }, + { + "epoch": 1.561910982551087, + "grad_norm": 0.6968387689918166, + "learning_rate": 0.0002604442869808283, + "loss": 7.2817, + "step": 16739 + }, + { + "epoch": 1.5620042922459643, + "grad_norm": 0.7470579115537515, + "learning_rate": 0.00026043916872694106, + "loss": 7.1606, + "step": 16740 + }, + { + "epoch": 1.5620976019408417, + "grad_norm": 0.7727010297866848, + "learning_rate": 0.00026043405019223983, + "loss": 7.0471, + "step": 16741 + }, + { + "epoch": 1.562190911635719, + "grad_norm": 0.7276389967326881, + "learning_rate": 0.0002604289313767375, + "loss": 7.111, + "step": 16742 + }, + { + "epoch": 1.5622842213305963, + "grad_norm": 2944357.036600255, + "learning_rate": 0.00026042381228044714, + "loss": 7.2462, + "step": 16743 + }, + { + "epoch": 1.5623775310254735, + "grad_norm": 0.7523499855471357, + "learning_rate": 0.0002604186929033817, + "loss": 7.2746, + "step": 16744 + }, + { + "epoch": 1.562470840720351, + "grad_norm": 2.0408220886537354, + "learning_rate": 0.00026041357324555424, + "loss": 7.2907, + "step": 16745 + }, + { + "epoch": 1.5625641504152281, + "grad_norm": 0.8677054629397013, + "learning_rate": 0.00026040845330697773, + "loss": 7.3118, + "step": 16746 + }, + { + "epoch": 1.5626574601101053, + "grad_norm": 0.7785899826989385, + "learning_rate": 0.0002604033330876653, + "loss": 7.0489, + "step": 16747 + }, + { + "epoch": 1.5627507698049827, + "grad_norm": 1.0117108628684035, + "learning_rate": 0.0002603982125876299, + "loss": 7.2955, + "step": 16748 + }, + { + "epoch": 1.5628440794998602, + "grad_norm": 0.584719788474701, + "learning_rate": 0.00026039309180688445, + "loss": 7.1052, + "step": 16749 + }, + { + "epoch": 1.5629373891947373, + "grad_norm": 0.5884260654266125, + "learning_rate": 0.00026038797074544215, + "loss": 7.1089, + "step": 16750 + }, + { + "epoch": 1.5630306988896145, + "grad_norm": 4.717448421911947, + "learning_rate": 0.00026038284940331594, + "loss": 7.4213, + "step": 16751 + }, + { + "epoch": 1.563124008584492, + "grad_norm": 189265668.53608724, + "learning_rate": 0.00026037772778051885, + "loss": 7.3065, + "step": 16752 + }, + { + "epoch": 1.5632173182793694, + "grad_norm": 0.5859974015940115, + "learning_rate": 0.00026037260587706383, + "loss": 7.1037, + "step": 16753 + }, + { + "epoch": 1.5633106279742464, + "grad_norm": 5002191037.2668085, + "learning_rate": 0.000260367483692964, + "loss": 7.2034, + "step": 16754 + }, + { + "epoch": 1.5634039376691238, + "grad_norm": 0.9050294486202543, + "learning_rate": 0.00026036236122823237, + "loss": 7.6662, + "step": 16755 + }, + { + "epoch": 1.5634972473640012, + "grad_norm": 1.20321745545845, + "learning_rate": 0.00026035723848288195, + "loss": 7.2328, + "step": 16756 + }, + { + "epoch": 1.5635905570588784, + "grad_norm": 1.2111351289288694, + "learning_rate": 0.0002603521154569258, + "loss": 7.3343, + "step": 16757 + }, + { + "epoch": 1.5636838667537556, + "grad_norm": 69081152.47557528, + "learning_rate": 0.00026034699215037683, + "loss": 7.1667, + "step": 16758 + }, + { + "epoch": 1.563777176448633, + "grad_norm": 1.0672294448486097, + "learning_rate": 0.00026034186856324816, + "loss": 7.6318, + "step": 16759 + }, + { + "epoch": 1.5638704861435104, + "grad_norm": 1.868799044818581, + "learning_rate": 0.0002603367446955528, + "loss": 6.9974, + "step": 16760 + }, + { + "epoch": 1.5639637958383876, + "grad_norm": 9297219.403094118, + "learning_rate": 0.00026033162054730385, + "loss": 7.0291, + "step": 16761 + }, + { + "epoch": 1.5640571055332648, + "grad_norm": 12087202.50970664, + "learning_rate": 0.0002603264961185142, + "loss": 7.3348, + "step": 16762 + }, + { + "epoch": 1.5641504152281422, + "grad_norm": 0.8376557322737843, + "learning_rate": 0.00026032137140919695, + "loss": 7.3459, + "step": 16763 + }, + { + "epoch": 1.5642437249230197, + "grad_norm": 0.930691420674608, + "learning_rate": 0.0002603162464193652, + "loss": 7.2688, + "step": 16764 + }, + { + "epoch": 1.5643370346178966, + "grad_norm": 1.0040222898125142, + "learning_rate": 0.00026031112114903183, + "loss": 7.1974, + "step": 16765 + }, + { + "epoch": 1.564430344312774, + "grad_norm": 0.5579477673019047, + "learning_rate": 0.00026030599559821, + "loss": 7.1261, + "step": 16766 + }, + { + "epoch": 1.5645236540076515, + "grad_norm": 0.5948637063003076, + "learning_rate": 0.0002603008697669127, + "loss": 6.9651, + "step": 16767 + }, + { + "epoch": 1.5646169637025287, + "grad_norm": 0.7104887812708858, + "learning_rate": 0.00026029574365515293, + "loss": 7.3891, + "step": 16768 + }, + { + "epoch": 1.5647102733974059, + "grad_norm": 0.9266511609920675, + "learning_rate": 0.0002602906172629438, + "loss": 7.1758, + "step": 16769 + }, + { + "epoch": 1.5648035830922833, + "grad_norm": 0.9066847630022435, + "learning_rate": 0.00026028549059029833, + "loss": 7.6089, + "step": 16770 + }, + { + "epoch": 1.5648968927871607, + "grad_norm": 4239470.832911612, + "learning_rate": 0.0002602803636372294, + "loss": 7.3035, + "step": 16771 + }, + { + "epoch": 1.564990202482038, + "grad_norm": 0.6469961784249779, + "learning_rate": 0.0002602752364037503, + "loss": 7.4159, + "step": 16772 + }, + { + "epoch": 1.565083512176915, + "grad_norm": 18617620.356847715, + "learning_rate": 0.0002602701088898739, + "loss": 7.599, + "step": 16773 + }, + { + "epoch": 1.5651768218717925, + "grad_norm": 1.0942227014320471, + "learning_rate": 0.0002602649810956133, + "loss": 7.0911, + "step": 16774 + }, + { + "epoch": 1.56527013156667, + "grad_norm": 0.7537236989597303, + "learning_rate": 0.0002602598530209814, + "loss": 7.2628, + "step": 16775 + }, + { + "epoch": 1.565363441261547, + "grad_norm": 0.7411595605679309, + "learning_rate": 0.00026025472466599146, + "loss": 7.2565, + "step": 16776 + }, + { + "epoch": 1.5654567509564243, + "grad_norm": 1.0232710803060632, + "learning_rate": 0.0002602495960306564, + "loss": 7.2285, + "step": 16777 + }, + { + "epoch": 1.5655500606513018, + "grad_norm": 108144029.09715647, + "learning_rate": 0.0002602444671149893, + "loss": 7.1709, + "step": 16778 + }, + { + "epoch": 1.565643370346179, + "grad_norm": 0.8929900869856262, + "learning_rate": 0.0002602393379190031, + "loss": 6.805, + "step": 16779 + }, + { + "epoch": 1.5657366800410562, + "grad_norm": 1.0935968282776425, + "learning_rate": 0.00026023420844271104, + "loss": 7.0517, + "step": 16780 + }, + { + "epoch": 1.5658299897359336, + "grad_norm": 9914694.957902139, + "learning_rate": 0.0002602290786861259, + "loss": 6.9088, + "step": 16781 + }, + { + "epoch": 1.565923299430811, + "grad_norm": 1.1131000073871926, + "learning_rate": 0.00026022394864926094, + "loss": 7.1751, + "step": 16782 + }, + { + "epoch": 1.5660166091256882, + "grad_norm": 1.6574561920425421, + "learning_rate": 0.00026021881833212916, + "loss": 7.5453, + "step": 16783 + }, + { + "epoch": 1.5661099188205654, + "grad_norm": 1.091881279264921, + "learning_rate": 0.00026021368773474353, + "loss": 7.4557, + "step": 16784 + }, + { + "epoch": 1.5662032285154428, + "grad_norm": 96254.52921923305, + "learning_rate": 0.0002602085568571171, + "loss": 7.0631, + "step": 16785 + }, + { + "epoch": 1.56629653821032, + "grad_norm": 0.9503424552622908, + "learning_rate": 0.00026020342569926303, + "loss": 7.1869, + "step": 16786 + }, + { + "epoch": 1.5663898479051972, + "grad_norm": 0.5488323690827142, + "learning_rate": 0.00026019829426119425, + "loss": 7.393, + "step": 16787 + }, + { + "epoch": 1.5664831576000746, + "grad_norm": 0.5257457058014814, + "learning_rate": 0.0002601931625429239, + "loss": 6.9967, + "step": 16788 + }, + { + "epoch": 1.566576467294952, + "grad_norm": 0.9089431415484499, + "learning_rate": 0.0002601880305444649, + "loss": 7.4281, + "step": 16789 + }, + { + "epoch": 1.5666697769898292, + "grad_norm": 0.47590882251450256, + "learning_rate": 0.00026018289826583045, + "loss": 7.2549, + "step": 16790 + }, + { + "epoch": 1.5667630866847064, + "grad_norm": 0.44184322851353625, + "learning_rate": 0.00026017776570703353, + "loss": 7.0843, + "step": 16791 + }, + { + "epoch": 1.5668563963795838, + "grad_norm": 0.7320274394306191, + "learning_rate": 0.0002601726328680872, + "loss": 7.3863, + "step": 16792 + }, + { + "epoch": 1.5669497060744613, + "grad_norm": 2163.225477420149, + "learning_rate": 0.0002601674997490044, + "loss": 7.2192, + "step": 16793 + }, + { + "epoch": 1.5670430157693385, + "grad_norm": 1.0733823779953358, + "learning_rate": 0.00026016236634979835, + "loss": 6.8504, + "step": 16794 + }, + { + "epoch": 1.5671363254642157, + "grad_norm": 0.6005356669309859, + "learning_rate": 0.000260157232670482, + "loss": 7.3239, + "step": 16795 + }, + { + "epoch": 1.567229635159093, + "grad_norm": 0.7369298622850379, + "learning_rate": 0.0002601520987110685, + "loss": 7.229, + "step": 16796 + }, + { + "epoch": 1.5673229448539703, + "grad_norm": 0.6508798453680305, + "learning_rate": 0.0002601469644715708, + "loss": 7.2226, + "step": 16797 + }, + { + "epoch": 1.5674162545488475, + "grad_norm": 0.8449888246962886, + "learning_rate": 0.000260141829952002, + "loss": 7.052, + "step": 16798 + }, + { + "epoch": 1.567509564243725, + "grad_norm": 1.0203831905401604, + "learning_rate": 0.0002601366951523752, + "loss": 7.0565, + "step": 16799 + }, + { + "epoch": 1.5676028739386023, + "grad_norm": 0.44558468310154753, + "learning_rate": 0.0002601315600727033, + "loss": 7.2728, + "step": 16800 + }, + { + "epoch": 1.5676961836334795, + "grad_norm": 0.49226834155302185, + "learning_rate": 0.0002601264247129996, + "loss": 7.3043, + "step": 16801 + }, + { + "epoch": 1.5677894933283567, + "grad_norm": 0.3307626648361073, + "learning_rate": 0.00026012128907327697, + "loss": 7.2703, + "step": 16802 + }, + { + "epoch": 1.5678828030232341, + "grad_norm": 0.38798666960607303, + "learning_rate": 0.00026011615315354845, + "loss": 7.1356, + "step": 16803 + }, + { + "epoch": 1.5679761127181115, + "grad_norm": 2727.9992027920307, + "learning_rate": 0.0002601110169538273, + "loss": 6.8677, + "step": 16804 + }, + { + "epoch": 1.5680694224129887, + "grad_norm": 0.3717694273002119, + "learning_rate": 0.00026010588047412634, + "loss": 6.8405, + "step": 16805 + }, + { + "epoch": 1.568162732107866, + "grad_norm": 1.1581789113979795, + "learning_rate": 0.0002601007437144588, + "loss": 7.1613, + "step": 16806 + }, + { + "epoch": 1.5682560418027434, + "grad_norm": 1.1842135484646468, + "learning_rate": 0.00026009560667483765, + "loss": 7.2542, + "step": 16807 + }, + { + "epoch": 1.5683493514976206, + "grad_norm": 2.3535126592878144, + "learning_rate": 0.00026009046935527604, + "loss": 7.0543, + "step": 16808 + }, + { + "epoch": 1.5684426611924978, + "grad_norm": 0.5258975832931085, + "learning_rate": 0.0002600853317557869, + "loss": 6.7492, + "step": 16809 + }, + { + "epoch": 1.5685359708873752, + "grad_norm": 1.2248981683876876, + "learning_rate": 0.0002600801938763834, + "loss": 7.3118, + "step": 16810 + }, + { + "epoch": 1.5686292805822526, + "grad_norm": 0.4433421925048343, + "learning_rate": 0.00026007505571707865, + "loss": 7.1235, + "step": 16811 + }, + { + "epoch": 1.5687225902771298, + "grad_norm": 0.5018838401510024, + "learning_rate": 0.0002600699172778855, + "loss": 7.2428, + "step": 16812 + }, + { + "epoch": 1.568815899972007, + "grad_norm": 0.4522803851124701, + "learning_rate": 0.0002600647785588173, + "loss": 7.2822, + "step": 16813 + }, + { + "epoch": 1.5689092096668844, + "grad_norm": 1036.944459471823, + "learning_rate": 0.00026005963955988686, + "loss": 7.2638, + "step": 16814 + }, + { + "epoch": 1.5690025193617618, + "grad_norm": 0.8521541203538777, + "learning_rate": 0.0002600545002811074, + "loss": 7.2321, + "step": 16815 + }, + { + "epoch": 1.569095829056639, + "grad_norm": 1.3229810180488712, + "learning_rate": 0.00026004936072249196, + "loss": 6.9916, + "step": 16816 + }, + { + "epoch": 1.5691891387515162, + "grad_norm": 1.2184511148703134, + "learning_rate": 0.0002600442208840536, + "loss": 7.0036, + "step": 16817 + }, + { + "epoch": 1.5692824484463936, + "grad_norm": 0.9183459316805466, + "learning_rate": 0.00026003908076580533, + "loss": 6.9369, + "step": 16818 + }, + { + "epoch": 1.5693757581412708, + "grad_norm": 0.4341469699156569, + "learning_rate": 0.00026003394036776034, + "loss": 7.1852, + "step": 16819 + }, + { + "epoch": 1.569469067836148, + "grad_norm": 0.7480776426762599, + "learning_rate": 0.00026002879968993155, + "loss": 7.3397, + "step": 16820 + }, + { + "epoch": 1.5695623775310255, + "grad_norm": 0.8950108207489708, + "learning_rate": 0.0002600236587323322, + "loss": 7.3097, + "step": 16821 + }, + { + "epoch": 1.5696556872259029, + "grad_norm": 0.8411161907889878, + "learning_rate": 0.0002600185174949753, + "loss": 7.3821, + "step": 16822 + }, + { + "epoch": 1.56974899692078, + "grad_norm": 985.9978413071757, + "learning_rate": 0.0002600133759778738, + "loss": 7.1469, + "step": 16823 + }, + { + "epoch": 1.5698423066156573, + "grad_norm": 807.1208482374536, + "learning_rate": 0.00026000823418104093, + "loss": 7.0942, + "step": 16824 + }, + { + "epoch": 1.5699356163105347, + "grad_norm": 1070.0632282229901, + "learning_rate": 0.0002600030921044897, + "loss": 7.3392, + "step": 16825 + }, + { + "epoch": 1.570028926005412, + "grad_norm": 0.5131025317490265, + "learning_rate": 0.00025999794974823317, + "loss": 7.1156, + "step": 16826 + }, + { + "epoch": 1.5701222357002893, + "grad_norm": 0.6176853370954023, + "learning_rate": 0.00025999280711228446, + "loss": 7.3647, + "step": 16827 + }, + { + "epoch": 1.5702155453951665, + "grad_norm": 1917.8724763056186, + "learning_rate": 0.00025998766419665665, + "loss": 7.1228, + "step": 16828 + }, + { + "epoch": 1.570308855090044, + "grad_norm": 0.39737469319188573, + "learning_rate": 0.00025998252100136273, + "loss": 7.226, + "step": 16829 + }, + { + "epoch": 1.5704021647849211, + "grad_norm": 0.3724534802736518, + "learning_rate": 0.0002599773775264159, + "loss": 7.1985, + "step": 16830 + }, + { + "epoch": 1.5704954744797983, + "grad_norm": 0.4265907075989638, + "learning_rate": 0.0002599722337718291, + "loss": 7.0696, + "step": 16831 + }, + { + "epoch": 1.5705887841746757, + "grad_norm": 0.5055461157961302, + "learning_rate": 0.00025996708973761556, + "loss": 7.2747, + "step": 16832 + }, + { + "epoch": 1.5706820938695532, + "grad_norm": 0.49883066711880664, + "learning_rate": 0.00025996194542378826, + "loss": 7.1549, + "step": 16833 + }, + { + "epoch": 1.5707754035644304, + "grad_norm": 0.3538759219581775, + "learning_rate": 0.00025995680083036035, + "loss": 7.1478, + "step": 16834 + }, + { + "epoch": 1.5708687132593075, + "grad_norm": 561.0144792716508, + "learning_rate": 0.0002599516559573448, + "loss": 7.1551, + "step": 16835 + }, + { + "epoch": 1.570962022954185, + "grad_norm": 0.6141016318817499, + "learning_rate": 0.00025994651080475477, + "loss": 7.3609, + "step": 16836 + }, + { + "epoch": 1.5710553326490624, + "grad_norm": 0.3416321701961804, + "learning_rate": 0.00025994136537260334, + "loss": 7.2224, + "step": 16837 + }, + { + "epoch": 1.5711486423439396, + "grad_norm": 0.3809056386495044, + "learning_rate": 0.0002599362196609036, + "loss": 7.0642, + "step": 16838 + }, + { + "epoch": 1.5712419520388168, + "grad_norm": 1028.2867068230066, + "learning_rate": 0.0002599310736696686, + "loss": 7.302, + "step": 16839 + }, + { + "epoch": 1.5713352617336942, + "grad_norm": 759.956882027214, + "learning_rate": 0.00025992592739891145, + "loss": 7.5176, + "step": 16840 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 0.36585566018166077, + "learning_rate": 0.00025992078084864526, + "loss": 7.1367, + "step": 16841 + }, + { + "epoch": 1.5715218811234486, + "grad_norm": 0.5279215628468107, + "learning_rate": 0.000259915634018883, + "loss": 7.1272, + "step": 16842 + }, + { + "epoch": 1.571615190818326, + "grad_norm": 1489.4810863096407, + "learning_rate": 0.00025991048690963796, + "loss": 7.4578, + "step": 16843 + }, + { + "epoch": 1.5717085005132034, + "grad_norm": 0.5730072016309268, + "learning_rate": 0.000259905339520923, + "loss": 7.2288, + "step": 16844 + }, + { + "epoch": 1.5718018102080806, + "grad_norm": 0.633364697740283, + "learning_rate": 0.0002599001918527513, + "loss": 7.0919, + "step": 16845 + }, + { + "epoch": 1.5718951199029578, + "grad_norm": 0.8251863266590805, + "learning_rate": 0.0002598950439051361, + "loss": 7.0311, + "step": 16846 + }, + { + "epoch": 1.5719884295978352, + "grad_norm": 0.40735451828082536, + "learning_rate": 0.0002598898956780903, + "loss": 7.2835, + "step": 16847 + }, + { + "epoch": 1.5720817392927127, + "grad_norm": 1.0218314040130383, + "learning_rate": 0.00025988474717162697, + "loss": 6.7961, + "step": 16848 + }, + { + "epoch": 1.5721750489875899, + "grad_norm": 11991.561913289323, + "learning_rate": 0.0002598795983857594, + "loss": 7.1547, + "step": 16849 + }, + { + "epoch": 1.572268358682467, + "grad_norm": 0.35900227166090914, + "learning_rate": 0.0002598744493205005, + "loss": 6.9845, + "step": 16850 + }, + { + "epoch": 1.5723616683773445, + "grad_norm": 5870.080467112211, + "learning_rate": 0.0002598692999758634, + "loss": 7.0743, + "step": 16851 + }, + { + "epoch": 1.5724549780722217, + "grad_norm": 0.4003879909503354, + "learning_rate": 0.0002598641503518613, + "loss": 7.0258, + "step": 16852 + }, + { + "epoch": 1.5725482877670989, + "grad_norm": 0.5537292166101475, + "learning_rate": 0.0002598590004485071, + "loss": 7.0815, + "step": 16853 + }, + { + "epoch": 1.5726415974619763, + "grad_norm": 0.4137100844702762, + "learning_rate": 0.0002598538502658141, + "loss": 7.0883, + "step": 16854 + }, + { + "epoch": 1.5727349071568537, + "grad_norm": 0.3309435171749321, + "learning_rate": 0.00025984869980379523, + "loss": 6.9012, + "step": 16855 + }, + { + "epoch": 1.572828216851731, + "grad_norm": 0.3636252628112206, + "learning_rate": 0.0002598435490624636, + "loss": 7.107, + "step": 16856 + }, + { + "epoch": 1.572921526546608, + "grad_norm": 0.491294014672967, + "learning_rate": 0.00025983839804183254, + "loss": 7.1968, + "step": 16857 + }, + { + "epoch": 1.5730148362414855, + "grad_norm": 0.8570661570117956, + "learning_rate": 0.0002598332467419149, + "loss": 7.4229, + "step": 16858 + }, + { + "epoch": 1.573108145936363, + "grad_norm": 0.4465134821977658, + "learning_rate": 0.00025982809516272376, + "loss": 7.2126, + "step": 16859 + }, + { + "epoch": 1.57320145563124, + "grad_norm": 13746.792121243783, + "learning_rate": 0.00025982294330427236, + "loss": 7.0139, + "step": 16860 + }, + { + "epoch": 1.5732947653261173, + "grad_norm": 0.32652364940447254, + "learning_rate": 0.0002598177911665738, + "loss": 7.2652, + "step": 16861 + }, + { + "epoch": 1.5733880750209948, + "grad_norm": 27124.739310717498, + "learning_rate": 0.00025981263874964107, + "loss": 7.1117, + "step": 16862 + }, + { + "epoch": 1.573481384715872, + "grad_norm": 0.4024934495014868, + "learning_rate": 0.0002598074860534873, + "loss": 7.083, + "step": 16863 + }, + { + "epoch": 1.5735746944107492, + "grad_norm": 15685.602294359118, + "learning_rate": 0.0002598023330781257, + "loss": 7.127, + "step": 16864 + }, + { + "epoch": 1.5736680041056266, + "grad_norm": 20080.970000247766, + "learning_rate": 0.0002597971798235693, + "loss": 7.3144, + "step": 16865 + }, + { + "epoch": 1.573761313800504, + "grad_norm": 1.3896119616596834, + "learning_rate": 0.0002597920262898311, + "loss": 7.2831, + "step": 16866 + }, + { + "epoch": 1.5738546234953812, + "grad_norm": 0.6304454541732382, + "learning_rate": 0.00025978687247692433, + "loss": 7.1257, + "step": 16867 + }, + { + "epoch": 1.5739479331902584, + "grad_norm": 0.3661522152895918, + "learning_rate": 0.0002597817183848621, + "loss": 7.0605, + "step": 16868 + }, + { + "epoch": 1.5740412428851358, + "grad_norm": 0.7395176824737749, + "learning_rate": 0.0002597765640136575, + "loss": 7.5721, + "step": 16869 + }, + { + "epoch": 1.5741345525800132, + "grad_norm": 0.9507891422542554, + "learning_rate": 0.00025977140936332357, + "loss": 7.0003, + "step": 16870 + }, + { + "epoch": 1.5742278622748902, + "grad_norm": 0.4763338520444181, + "learning_rate": 0.00025976625443387347, + "loss": 7.5656, + "step": 16871 + }, + { + "epoch": 1.5743211719697676, + "grad_norm": 1.2879692003574406, + "learning_rate": 0.0002597610992253203, + "loss": 7.258, + "step": 16872 + }, + { + "epoch": 1.574414481664645, + "grad_norm": 0.8870210633097747, + "learning_rate": 0.00025975594373767717, + "loss": 7.2758, + "step": 16873 + }, + { + "epoch": 1.5745077913595222, + "grad_norm": 0.5805040200613244, + "learning_rate": 0.00025975078797095725, + "loss": 7.3599, + "step": 16874 + }, + { + "epoch": 1.5746011010543994, + "grad_norm": 0.5549084780280698, + "learning_rate": 0.0002597456319251735, + "loss": 7.3475, + "step": 16875 + }, + { + "epoch": 1.5746944107492769, + "grad_norm": 0.44493939323778253, + "learning_rate": 0.0002597404756003391, + "loss": 7.1231, + "step": 16876 + }, + { + "epoch": 1.5747877204441543, + "grad_norm": 0.4603520298859112, + "learning_rate": 0.00025973531899646724, + "loss": 7.2363, + "step": 16877 + }, + { + "epoch": 1.5748810301390315, + "grad_norm": 0.6260224356230257, + "learning_rate": 0.000259730162113571, + "loss": 6.9819, + "step": 16878 + }, + { + "epoch": 1.5749743398339087, + "grad_norm": 0.5833226006619407, + "learning_rate": 0.0002597250049516634, + "loss": 7.0957, + "step": 16879 + }, + { + "epoch": 1.575067649528786, + "grad_norm": 1.4947221198598721, + "learning_rate": 0.00025971984751075764, + "loss": 7.3274, + "step": 16880 + }, + { + "epoch": 1.5751609592236635, + "grad_norm": 0.9584231036366224, + "learning_rate": 0.00025971468979086676, + "loss": 7.4767, + "step": 16881 + }, + { + "epoch": 1.5752542689185405, + "grad_norm": 0.4026823593663827, + "learning_rate": 0.000259709531792004, + "loss": 7.2978, + "step": 16882 + }, + { + "epoch": 1.575347578613418, + "grad_norm": 800.320493092309, + "learning_rate": 0.00025970437351418234, + "loss": 7.4618, + "step": 16883 + }, + { + "epoch": 1.5754408883082953, + "grad_norm": 1.0179383905008774, + "learning_rate": 0.000259699214957415, + "loss": 7.2841, + "step": 16884 + }, + { + "epoch": 1.5755341980031725, + "grad_norm": 5062.185481830424, + "learning_rate": 0.000259694056121715, + "loss": 7.015, + "step": 16885 + }, + { + "epoch": 1.5756275076980497, + "grad_norm": 1.0179503829380652, + "learning_rate": 0.00025968889700709555, + "loss": 7.3217, + "step": 16886 + }, + { + "epoch": 1.5757208173929271, + "grad_norm": 2048.5654514814278, + "learning_rate": 0.00025968373761356975, + "loss": 7.1497, + "step": 16887 + }, + { + "epoch": 1.5758141270878046, + "grad_norm": 0.52190459461442, + "learning_rate": 0.0002596785779411506, + "loss": 7.4277, + "step": 16888 + }, + { + "epoch": 1.5759074367826817, + "grad_norm": 0.8151056949151518, + "learning_rate": 0.0002596734179898514, + "loss": 7.4135, + "step": 16889 + }, + { + "epoch": 1.576000746477559, + "grad_norm": 0.7092552238466808, + "learning_rate": 0.00025966825775968513, + "loss": 7.2497, + "step": 16890 + }, + { + "epoch": 1.5760940561724364, + "grad_norm": 0.4256110626627345, + "learning_rate": 0.000259663097250665, + "loss": 7.0805, + "step": 16891 + }, + { + "epoch": 1.5761873658673136, + "grad_norm": 0.3676679980679618, + "learning_rate": 0.0002596579364628041, + "loss": 7.0795, + "step": 16892 + }, + { + "epoch": 1.5762806755621908, + "grad_norm": 0.837715795188979, + "learning_rate": 0.00025965277539611556, + "loss": 7.4216, + "step": 16893 + }, + { + "epoch": 1.5763739852570682, + "grad_norm": 0.5393625711357833, + "learning_rate": 0.0002596476140506125, + "loss": 7.0402, + "step": 16894 + }, + { + "epoch": 1.5764672949519456, + "grad_norm": 6584.053169281572, + "learning_rate": 0.00025964245242630804, + "loss": 7.107, + "step": 16895 + }, + { + "epoch": 1.5765606046468228, + "grad_norm": 1530.9992473303234, + "learning_rate": 0.00025963729052321525, + "loss": 6.9601, + "step": 16896 + }, + { + "epoch": 1.5766539143417, + "grad_norm": 3519.5892097329624, + "learning_rate": 0.00025963212834134735, + "loss": 7.0712, + "step": 16897 + }, + { + "epoch": 1.5767472240365774, + "grad_norm": 0.4911155067391073, + "learning_rate": 0.0002596269658807174, + "loss": 7.0631, + "step": 16898 + }, + { + "epoch": 1.5768405337314548, + "grad_norm": 0.3472417586241549, + "learning_rate": 0.00025962180314133856, + "loss": 7.0613, + "step": 16899 + }, + { + "epoch": 1.576933843426332, + "grad_norm": 0.5609842328151786, + "learning_rate": 0.00025961664012322396, + "loss": 6.9016, + "step": 16900 + }, + { + "epoch": 1.5770271531212092, + "grad_norm": 0.6355017146349916, + "learning_rate": 0.00025961147682638673, + "loss": 7.1406, + "step": 16901 + }, + { + "epoch": 1.5771204628160866, + "grad_norm": 0.9822547659142594, + "learning_rate": 0.00025960631325083993, + "loss": 7.2021, + "step": 16902 + }, + { + "epoch": 1.5772137725109638, + "grad_norm": 5320.367475669207, + "learning_rate": 0.00025960114939659675, + "loss": 7.0899, + "step": 16903 + }, + { + "epoch": 1.577307082205841, + "grad_norm": 0.3754388711583873, + "learning_rate": 0.00025959598526367034, + "loss": 7.1308, + "step": 16904 + }, + { + "epoch": 1.5774003919007185, + "grad_norm": 0.364671785296578, + "learning_rate": 0.0002595908208520738, + "loss": 7.2158, + "step": 16905 + }, + { + "epoch": 1.5774937015955959, + "grad_norm": 0.7908182667729459, + "learning_rate": 0.0002595856561618203, + "loss": 7.1559, + "step": 16906 + }, + { + "epoch": 1.577587011290473, + "grad_norm": 0.5498184931349841, + "learning_rate": 0.0002595804911929229, + "loss": 7.2475, + "step": 16907 + }, + { + "epoch": 1.5776803209853503, + "grad_norm": 0.7935367321783432, + "learning_rate": 0.00025957532594539476, + "loss": 7.0511, + "step": 16908 + }, + { + "epoch": 1.5777736306802277, + "grad_norm": 0.8611053476594217, + "learning_rate": 0.00025957016041924905, + "loss": 6.846, + "step": 16909 + }, + { + "epoch": 1.577866940375105, + "grad_norm": 0.3703929573678821, + "learning_rate": 0.00025956499461449883, + "loss": 7.2432, + "step": 16910 + }, + { + "epoch": 1.5779602500699823, + "grad_norm": 0.4680023899537645, + "learning_rate": 0.00025955982853115734, + "loss": 7.1038, + "step": 16911 + }, + { + "epoch": 1.5780535597648595, + "grad_norm": 0.969009608092109, + "learning_rate": 0.00025955466216923763, + "loss": 7.4925, + "step": 16912 + }, + { + "epoch": 1.578146869459737, + "grad_norm": 0.8668638385358131, + "learning_rate": 0.0002595494955287529, + "loss": 7.3379, + "step": 16913 + }, + { + "epoch": 1.5782401791546141, + "grad_norm": 0.7131682165857713, + "learning_rate": 0.0002595443286097162, + "loss": 7.2652, + "step": 16914 + }, + { + "epoch": 1.5783334888494913, + "grad_norm": 0.39680981598748544, + "learning_rate": 0.00025953916141214074, + "loss": 7.3297, + "step": 16915 + }, + { + "epoch": 1.5784267985443687, + "grad_norm": 0.46886953018511457, + "learning_rate": 0.00025953399393603964, + "loss": 7.2054, + "step": 16916 + }, + { + "epoch": 1.5785201082392462, + "grad_norm": 0.522531885678284, + "learning_rate": 0.0002595288261814261, + "loss": 7.1725, + "step": 16917 + }, + { + "epoch": 1.5786134179341234, + "grad_norm": 0.6401591151832752, + "learning_rate": 0.0002595236581483132, + "loss": 7.2586, + "step": 16918 + }, + { + "epoch": 1.5787067276290006, + "grad_norm": 0.35098110508588753, + "learning_rate": 0.000259518489836714, + "loss": 7.2202, + "step": 16919 + }, + { + "epoch": 1.578800037323878, + "grad_norm": 0.45301811174971635, + "learning_rate": 0.00025951332124664176, + "loss": 7.1224, + "step": 16920 + }, + { + "epoch": 1.5788933470187554, + "grad_norm": 28154.717079369926, + "learning_rate": 0.00025950815237810957, + "loss": 7.3022, + "step": 16921 + }, + { + "epoch": 1.5789866567136326, + "grad_norm": 0.6725213514490604, + "learning_rate": 0.00025950298323113057, + "loss": 7.2647, + "step": 16922 + }, + { + "epoch": 1.5790799664085098, + "grad_norm": 0.4381973964170018, + "learning_rate": 0.000259497813805718, + "loss": 7.1401, + "step": 16923 + }, + { + "epoch": 1.5791732761033872, + "grad_norm": 0.5343967045109258, + "learning_rate": 0.00025949264410188484, + "loss": 6.984, + "step": 16924 + }, + { + "epoch": 1.5792665857982644, + "grad_norm": 0.406962618568954, + "learning_rate": 0.00025948747411964435, + "loss": 7.1192, + "step": 16925 + }, + { + "epoch": 1.5793598954931416, + "grad_norm": 0.5705917508664844, + "learning_rate": 0.0002594823038590097, + "loss": 7.0569, + "step": 16926 + }, + { + "epoch": 1.579453205188019, + "grad_norm": 1.1609194820550206, + "learning_rate": 0.00025947713331999386, + "loss": 7.5926, + "step": 16927 + }, + { + "epoch": 1.5795465148828964, + "grad_norm": 0.4982832115218714, + "learning_rate": 0.0002594719625026102, + "loss": 7.1219, + "step": 16928 + }, + { + "epoch": 1.5796398245777736, + "grad_norm": 26133.44370822461, + "learning_rate": 0.0002594667914068717, + "loss": 7.0918, + "step": 16929 + }, + { + "epoch": 1.5797331342726508, + "grad_norm": 0.41876571107800703, + "learning_rate": 0.0002594616200327916, + "loss": 7.1474, + "step": 16930 + }, + { + "epoch": 1.5798264439675282, + "grad_norm": 0.5291701036766003, + "learning_rate": 0.0002594564483803831, + "loss": 7.1587, + "step": 16931 + }, + { + "epoch": 1.5799197536624057, + "grad_norm": 0.79981924127033, + "learning_rate": 0.00025945127644965923, + "loss": 7.3551, + "step": 16932 + }, + { + "epoch": 1.5800130633572829, + "grad_norm": 0.421384285559996, + "learning_rate": 0.00025944610424063314, + "loss": 7.0995, + "step": 16933 + }, + { + "epoch": 1.58010637305216, + "grad_norm": 0.3805488207744036, + "learning_rate": 0.0002594409317533181, + "loss": 7.2102, + "step": 16934 + }, + { + "epoch": 1.5801996827470375, + "grad_norm": 0.3392577468128072, + "learning_rate": 0.00025943575898772713, + "loss": 7.1497, + "step": 16935 + }, + { + "epoch": 1.5802929924419147, + "grad_norm": 0.34812530770970684, + "learning_rate": 0.00025943058594387345, + "loss": 7.1498, + "step": 16936 + }, + { + "epoch": 1.5803863021367919, + "grad_norm": 0.6413387632473602, + "learning_rate": 0.0002594254126217702, + "loss": 6.7088, + "step": 16937 + }, + { + "epoch": 1.5804796118316693, + "grad_norm": 0.9853970786521404, + "learning_rate": 0.0002594202390214306, + "loss": 7.4068, + "step": 16938 + }, + { + "epoch": 1.5805729215265467, + "grad_norm": 0.5050590297951858, + "learning_rate": 0.00025941506514286764, + "loss": 7.1612, + "step": 16939 + }, + { + "epoch": 1.580666231221424, + "grad_norm": 0.6591286995139638, + "learning_rate": 0.0002594098909860947, + "loss": 6.9203, + "step": 16940 + }, + { + "epoch": 1.580759540916301, + "grad_norm": 0.39107438851013615, + "learning_rate": 0.0002594047165511247, + "loss": 7.1682, + "step": 16941 + }, + { + "epoch": 1.5808528506111785, + "grad_norm": 0.4273133145116845, + "learning_rate": 0.000259399541837971, + "loss": 7.2324, + "step": 16942 + }, + { + "epoch": 1.580946160306056, + "grad_norm": 0.48769505807087626, + "learning_rate": 0.0002593943668466466, + "loss": 7.2937, + "step": 16943 + }, + { + "epoch": 1.5810394700009331, + "grad_norm": 0.6367154825138566, + "learning_rate": 0.0002593891915771648, + "loss": 7.1117, + "step": 16944 + }, + { + "epoch": 1.5811327796958103, + "grad_norm": 1.2006442413157676, + "learning_rate": 0.0002593840160295386, + "loss": 6.8241, + "step": 16945 + }, + { + "epoch": 1.5812260893906878, + "grad_norm": 0.4718729199178382, + "learning_rate": 0.0002593788402037814, + "loss": 7.213, + "step": 16946 + }, + { + "epoch": 1.581319399085565, + "grad_norm": 0.8202583143623164, + "learning_rate": 0.00025937366409990606, + "loss": 6.9638, + "step": 16947 + }, + { + "epoch": 1.5814127087804422, + "grad_norm": 0.9232851471199646, + "learning_rate": 0.000259368487717926, + "loss": 6.7115, + "step": 16948 + }, + { + "epoch": 1.5815060184753196, + "grad_norm": 33114.230213560884, + "learning_rate": 0.0002593633110578542, + "loss": 7.2574, + "step": 16949 + }, + { + "epoch": 1.581599328170197, + "grad_norm": 0.880690178751606, + "learning_rate": 0.0002593581341197039, + "loss": 7.1292, + "step": 16950 + }, + { + "epoch": 1.5816926378650742, + "grad_norm": 0.7729233882351212, + "learning_rate": 0.0002593529569034883, + "loss": 7.0902, + "step": 16951 + }, + { + "epoch": 1.5817859475599514, + "grad_norm": 0.899574694899492, + "learning_rate": 0.00025934777940922054, + "loss": 7.2081, + "step": 16952 + }, + { + "epoch": 1.5818792572548288, + "grad_norm": 0.9139327270491012, + "learning_rate": 0.0002593426016369137, + "loss": 7.3151, + "step": 16953 + }, + { + "epoch": 1.5819725669497062, + "grad_norm": 0.3857152280990628, + "learning_rate": 0.00025933742358658107, + "loss": 7.1419, + "step": 16954 + }, + { + "epoch": 1.5820658766445834, + "grad_norm": 974.2444158434885, + "learning_rate": 0.00025933224525823573, + "loss": 6.8858, + "step": 16955 + }, + { + "epoch": 1.5821591863394606, + "grad_norm": 0.5668604637612831, + "learning_rate": 0.0002593270666518909, + "loss": 7.0998, + "step": 16956 + }, + { + "epoch": 1.582252496034338, + "grad_norm": 0.4978439963975818, + "learning_rate": 0.0002593218877675597, + "loss": 7.2087, + "step": 16957 + }, + { + "epoch": 1.5823458057292152, + "grad_norm": 0.5559147645039687, + "learning_rate": 0.0002593167086052553, + "loss": 7.1669, + "step": 16958 + }, + { + "epoch": 1.5824391154240924, + "grad_norm": 1873.2575933156604, + "learning_rate": 0.00025931152916499093, + "loss": 7.0015, + "step": 16959 + }, + { + "epoch": 1.5825324251189699, + "grad_norm": 0.3657318815585661, + "learning_rate": 0.00025930634944677973, + "loss": 7.1169, + "step": 16960 + }, + { + "epoch": 1.5826257348138473, + "grad_norm": 0.5089157995404651, + "learning_rate": 0.0002593011694506348, + "loss": 7.0072, + "step": 16961 + }, + { + "epoch": 1.5827190445087245, + "grad_norm": 0.875156489847483, + "learning_rate": 0.0002592959891765695, + "loss": 7.1774, + "step": 16962 + }, + { + "epoch": 1.5828123542036017, + "grad_norm": 0.6188422538730811, + "learning_rate": 0.00025929080862459677, + "loss": 7.0294, + "step": 16963 + }, + { + "epoch": 1.582905663898479, + "grad_norm": 0.4668299524692236, + "learning_rate": 0.0002592856277947299, + "loss": 6.9759, + "step": 16964 + }, + { + "epoch": 1.5829989735933565, + "grad_norm": 0.48235757612263636, + "learning_rate": 0.0002592804466869821, + "loss": 6.9214, + "step": 16965 + }, + { + "epoch": 1.5830922832882335, + "grad_norm": 0.5348838783723596, + "learning_rate": 0.00025927526530136643, + "loss": 7.23, + "step": 16966 + }, + { + "epoch": 1.583185592983111, + "grad_norm": 0.9043822054346442, + "learning_rate": 0.0002592700836378961, + "loss": 6.8805, + "step": 16967 + }, + { + "epoch": 1.5832789026779883, + "grad_norm": 12052.79218251226, + "learning_rate": 0.0002592649016965844, + "loss": 6.9868, + "step": 16968 + }, + { + "epoch": 1.5833722123728655, + "grad_norm": 3432.5691059277433, + "learning_rate": 0.00025925971947744444, + "loss": 7.187, + "step": 16969 + }, + { + "epoch": 1.5834655220677427, + "grad_norm": 0.6906226550762479, + "learning_rate": 0.0002592545369804893, + "loss": 7.1446, + "step": 16970 + }, + { + "epoch": 1.5835588317626201, + "grad_norm": 18398.58555687221, + "learning_rate": 0.00025924935420573226, + "loss": 6.8507, + "step": 16971 + }, + { + "epoch": 1.5836521414574976, + "grad_norm": 0.7023266519648202, + "learning_rate": 0.0002592441711531865, + "loss": 7.0752, + "step": 16972 + }, + { + "epoch": 1.5837454511523748, + "grad_norm": 2863.058277974018, + "learning_rate": 0.0002592389878228652, + "loss": 7.1176, + "step": 16973 + }, + { + "epoch": 1.583838760847252, + "grad_norm": 0.755589109768997, + "learning_rate": 0.00025923380421478146, + "loss": 7.4317, + "step": 16974 + }, + { + "epoch": 1.5839320705421294, + "grad_norm": 0.7663539713674086, + "learning_rate": 0.0002592286203289485, + "loss": 6.9295, + "step": 16975 + }, + { + "epoch": 1.5840253802370068, + "grad_norm": 0.928749288504896, + "learning_rate": 0.0002592234361653795, + "loss": 7.028, + "step": 16976 + }, + { + "epoch": 1.5841186899318838, + "grad_norm": 4225.72463037347, + "learning_rate": 0.0002592182517240877, + "loss": 7.1252, + "step": 16977 + }, + { + "epoch": 1.5842119996267612, + "grad_norm": 0.7263292073487044, + "learning_rate": 0.00025921306700508627, + "loss": 7.0722, + "step": 16978 + }, + { + "epoch": 1.5843053093216386, + "grad_norm": 0.9147847702807475, + "learning_rate": 0.0002592078820083883, + "loss": 7.2235, + "step": 16979 + }, + { + "epoch": 1.5843986190165158, + "grad_norm": 1.1793194991843374, + "learning_rate": 0.0002592026967340071, + "loss": 7.3958, + "step": 16980 + }, + { + "epoch": 1.584491928711393, + "grad_norm": 0.5694945448971274, + "learning_rate": 0.0002591975111819558, + "loss": 7.0976, + "step": 16981 + }, + { + "epoch": 1.5845852384062704, + "grad_norm": 0.5682974219449839, + "learning_rate": 0.00025919232535224747, + "loss": 7.3118, + "step": 16982 + }, + { + "epoch": 1.5846785481011478, + "grad_norm": 0.37779429422797456, + "learning_rate": 0.0002591871392448955, + "loss": 7.3749, + "step": 16983 + }, + { + "epoch": 1.584771857796025, + "grad_norm": 0.5874328757865885, + "learning_rate": 0.00025918195285991286, + "loss": 7.3199, + "step": 16984 + }, + { + "epoch": 1.5848651674909022, + "grad_norm": 0.8845658353435277, + "learning_rate": 0.00025917676619731297, + "loss": 7.2072, + "step": 16985 + }, + { + "epoch": 1.5849584771857796, + "grad_norm": 2.058992264859325, + "learning_rate": 0.00025917157925710885, + "loss": 6.6802, + "step": 16986 + }, + { + "epoch": 1.585051786880657, + "grad_norm": 0.5129548646006211, + "learning_rate": 0.0002591663920393138, + "loss": 7.4283, + "step": 16987 + }, + { + "epoch": 1.585145096575534, + "grad_norm": 6914.503827606332, + "learning_rate": 0.0002591612045439409, + "loss": 6.8632, + "step": 16988 + }, + { + "epoch": 1.5852384062704115, + "grad_norm": 0.42438567660690685, + "learning_rate": 0.0002591560167710034, + "loss": 6.989, + "step": 16989 + }, + { + "epoch": 1.5853317159652889, + "grad_norm": 0.8455350091199011, + "learning_rate": 0.0002591508287205145, + "loss": 7.1322, + "step": 16990 + }, + { + "epoch": 1.585425025660166, + "grad_norm": 0.7411815508383129, + "learning_rate": 0.00025914564039248737, + "loss": 6.7646, + "step": 16991 + }, + { + "epoch": 1.5855183353550433, + "grad_norm": 0.6869688239895633, + "learning_rate": 0.0002591404517869352, + "loss": 6.6074, + "step": 16992 + }, + { + "epoch": 1.5856116450499207, + "grad_norm": 1.3417967155957806, + "learning_rate": 0.00025913526290387116, + "loss": 7.3133, + "step": 16993 + }, + { + "epoch": 1.5857049547447981, + "grad_norm": 0.5331680598792654, + "learning_rate": 0.00025913007374330855, + "loss": 7.06, + "step": 16994 + }, + { + "epoch": 1.5857982644396753, + "grad_norm": 0.5920163149123171, + "learning_rate": 0.00025912488430526043, + "loss": 7.148, + "step": 16995 + }, + { + "epoch": 1.5858915741345525, + "grad_norm": 0.6413453710444116, + "learning_rate": 0.0002591196945897401, + "loss": 7.4221, + "step": 16996 + }, + { + "epoch": 1.58598488382943, + "grad_norm": 0.7078845231238048, + "learning_rate": 0.00025911450459676066, + "loss": 7.2163, + "step": 16997 + }, + { + "epoch": 1.5860781935243071, + "grad_norm": 0.7109867787639939, + "learning_rate": 0.00025910931432633535, + "loss": 7.0973, + "step": 16998 + }, + { + "epoch": 1.5861715032191843, + "grad_norm": 17104.53460280784, + "learning_rate": 0.00025910412377847745, + "loss": 7.0127, + "step": 16999 + }, + { + "epoch": 1.5862648129140617, + "grad_norm": 0.5290955021978464, + "learning_rate": 0.0002590989329532, + "loss": 7.1585, + "step": 17000 + }, + { + "epoch": 1.5863581226089392, + "grad_norm": 0.5279968176760452, + "learning_rate": 0.0002590937418505163, + "loss": 7.1881, + "step": 17001 + }, + { + "epoch": 1.5864514323038164, + "grad_norm": 0.6989514865184838, + "learning_rate": 0.00025908855047043956, + "loss": 7.0696, + "step": 17002 + }, + { + "epoch": 1.5865447419986936, + "grad_norm": 0.9258014816295848, + "learning_rate": 0.000259083358812983, + "loss": 7.2235, + "step": 17003 + }, + { + "epoch": 1.586638051693571, + "grad_norm": 12587.894518142306, + "learning_rate": 0.0002590781668781597, + "loss": 7.2779, + "step": 17004 + }, + { + "epoch": 1.5867313613884484, + "grad_norm": 42668.06643017791, + "learning_rate": 0.0002590729746659829, + "loss": 7.1411, + "step": 17005 + }, + { + "epoch": 1.5868246710833256, + "grad_norm": 1.3034770363910544, + "learning_rate": 0.0002590677821764659, + "loss": 7.0059, + "step": 17006 + }, + { + "epoch": 1.5869179807782028, + "grad_norm": 12724.436488789106, + "learning_rate": 0.0002590625894096218, + "loss": 7.3439, + "step": 17007 + }, + { + "epoch": 1.5870112904730802, + "grad_norm": 1.6486156361038087, + "learning_rate": 0.00025905739636546387, + "loss": 7.385, + "step": 17008 + }, + { + "epoch": 1.5871046001679574, + "grad_norm": 1.8851917164124636, + "learning_rate": 0.0002590522030440053, + "loss": 7.2692, + "step": 17009 + }, + { + "epoch": 1.5871979098628346, + "grad_norm": 2.6874556148207542, + "learning_rate": 0.00025904700944525923, + "loss": 6.9279, + "step": 17010 + }, + { + "epoch": 1.587291219557712, + "grad_norm": 1.6657891751564982, + "learning_rate": 0.00025904181556923894, + "loss": 6.9754, + "step": 17011 + }, + { + "epoch": 1.5873845292525894, + "grad_norm": 21043.002737597657, + "learning_rate": 0.0002590366214159576, + "loss": 6.8571, + "step": 17012 + }, + { + "epoch": 1.5874778389474666, + "grad_norm": 1.1529032808563164, + "learning_rate": 0.00025903142698542847, + "loss": 7.3321, + "step": 17013 + }, + { + "epoch": 1.5875711486423438, + "grad_norm": 1.677457976439617, + "learning_rate": 0.0002590262322776647, + "loss": 7.0518, + "step": 17014 + }, + { + "epoch": 1.5876644583372213, + "grad_norm": 44602.65509172718, + "learning_rate": 0.00025902103729267954, + "loss": 7.385, + "step": 17015 + }, + { + "epoch": 1.5877577680320987, + "grad_norm": 2.4777126769564424, + "learning_rate": 0.0002590158420304861, + "loss": 6.9046, + "step": 17016 + }, + { + "epoch": 1.5878510777269759, + "grad_norm": 2.874439015190204, + "learning_rate": 0.00025901064649109775, + "loss": 7.5284, + "step": 17017 + }, + { + "epoch": 1.587944387421853, + "grad_norm": 2.5027202284645194, + "learning_rate": 0.00025900545067452757, + "loss": 7.5593, + "step": 17018 + }, + { + "epoch": 1.5880376971167305, + "grad_norm": 1.5713252182341033, + "learning_rate": 0.0002590002545807888, + "loss": 7.296, + "step": 17019 + }, + { + "epoch": 1.5881310068116077, + "grad_norm": 0.8105855676361602, + "learning_rate": 0.00025899505820989473, + "loss": 7.2168, + "step": 17020 + }, + { + "epoch": 1.5882243165064849, + "grad_norm": 0.7635555067921962, + "learning_rate": 0.0002589898615618585, + "loss": 7.0993, + "step": 17021 + }, + { + "epoch": 1.5883176262013623, + "grad_norm": 1.0257205673641037, + "learning_rate": 0.00025898466463669336, + "loss": 7.182, + "step": 17022 + }, + { + "epoch": 1.5884109358962397, + "grad_norm": 1.714892597840781, + "learning_rate": 0.0002589794674344124, + "loss": 6.9809, + "step": 17023 + }, + { + "epoch": 1.588504245591117, + "grad_norm": 1.1219176749285602, + "learning_rate": 0.000258974269955029, + "loss": 7.1196, + "step": 17024 + }, + { + "epoch": 1.5885975552859941, + "grad_norm": 377.6156170197042, + "learning_rate": 0.0002589690721985564, + "loss": 7.3526, + "step": 17025 + }, + { + "epoch": 1.5886908649808715, + "grad_norm": 2208.8096234075924, + "learning_rate": 0.00025896387416500755, + "loss": 7.247, + "step": 17026 + }, + { + "epoch": 1.588784174675749, + "grad_norm": 0.9734359712646246, + "learning_rate": 0.00025895867585439594, + "loss": 6.8473, + "step": 17027 + }, + { + "epoch": 1.5888774843706261, + "grad_norm": 0.44927991531135547, + "learning_rate": 0.00025895347726673473, + "loss": 7.2875, + "step": 17028 + }, + { + "epoch": 1.5889707940655033, + "grad_norm": 0.7881784193949843, + "learning_rate": 0.00025894827840203705, + "loss": 7.3423, + "step": 17029 + }, + { + "epoch": 1.5890641037603808, + "grad_norm": 0.6792252818144391, + "learning_rate": 0.00025894307926031617, + "loss": 7.3006, + "step": 17030 + }, + { + "epoch": 1.589157413455258, + "grad_norm": 0.9878445489775272, + "learning_rate": 0.0002589378798415853, + "loss": 7.5852, + "step": 17031 + }, + { + "epoch": 1.5892507231501352, + "grad_norm": 1.2093410478307967, + "learning_rate": 0.00025893268014585773, + "loss": 6.8478, + "step": 17032 + }, + { + "epoch": 1.5893440328450126, + "grad_norm": 671.1249402226491, + "learning_rate": 0.0002589274801731466, + "loss": 7.22, + "step": 17033 + }, + { + "epoch": 1.58943734253989, + "grad_norm": 1.101565904333559, + "learning_rate": 0.00025892227992346513, + "loss": 7.1128, + "step": 17034 + }, + { + "epoch": 1.5895306522347672, + "grad_norm": 1.5670342914910456, + "learning_rate": 0.0002589170793968266, + "loss": 6.8396, + "step": 17035 + }, + { + "epoch": 1.5896239619296444, + "grad_norm": 0.827888506721869, + "learning_rate": 0.0002589118785932442, + "loss": 7.4004, + "step": 17036 + }, + { + "epoch": 1.5897172716245218, + "grad_norm": 89.37429113142288, + "learning_rate": 0.0002589066775127311, + "loss": 7.078, + "step": 17037 + }, + { + "epoch": 1.5898105813193992, + "grad_norm": 0.5079794102336825, + "learning_rate": 0.0002589014761553006, + "loss": 7.2428, + "step": 17038 + }, + { + "epoch": 1.5899038910142764, + "grad_norm": 0.7716640457822231, + "learning_rate": 0.00025889627452096596, + "loss": 6.9461, + "step": 17039 + }, + { + "epoch": 1.5899972007091536, + "grad_norm": 0.5068588790095087, + "learning_rate": 0.0002588910726097403, + "loss": 7.0887, + "step": 17040 + }, + { + "epoch": 1.590090510404031, + "grad_norm": 0.8517770086183026, + "learning_rate": 0.00025888587042163693, + "loss": 7.3105, + "step": 17041 + }, + { + "epoch": 1.5901838200989082, + "grad_norm": 1.1095914083655383, + "learning_rate": 0.000258880667956669, + "loss": 7.227, + "step": 17042 + }, + { + "epoch": 1.5902771297937854, + "grad_norm": 3670.4003066300365, + "learning_rate": 0.00025887546521484976, + "loss": 7.3083, + "step": 17043 + }, + { + "epoch": 1.5903704394886629, + "grad_norm": 1.0137096729098602, + "learning_rate": 0.00025887026219619257, + "loss": 7.2807, + "step": 17044 + }, + { + "epoch": 1.5904637491835403, + "grad_norm": 0.38365630824840175, + "learning_rate": 0.0002588650589007105, + "loss": 7.1163, + "step": 17045 + }, + { + "epoch": 1.5905570588784175, + "grad_norm": 0.593827251988666, + "learning_rate": 0.00025885985532841677, + "loss": 7.3598, + "step": 17046 + }, + { + "epoch": 1.5906503685732947, + "grad_norm": 503.48041788647595, + "learning_rate": 0.0002588546514793248, + "loss": 7.4715, + "step": 17047 + }, + { + "epoch": 1.590743678268172, + "grad_norm": 310.9621410007667, + "learning_rate": 0.0002588494473534476, + "loss": 7.1783, + "step": 17048 + }, + { + "epoch": 1.5908369879630495, + "grad_norm": 6174.769735866221, + "learning_rate": 0.0002588442429507986, + "loss": 7.1226, + "step": 17049 + }, + { + "epoch": 1.5909302976579267, + "grad_norm": 2.3754334234047403, + "learning_rate": 0.00025883903827139085, + "loss": 6.7272, + "step": 17050 + }, + { + "epoch": 1.591023607352804, + "grad_norm": 1.6749901579362843, + "learning_rate": 0.0002588338333152376, + "loss": 6.8993, + "step": 17051 + }, + { + "epoch": 1.5911169170476813, + "grad_norm": 0.46690214701945354, + "learning_rate": 0.00025882862808235225, + "loss": 7.1967, + "step": 17052 + }, + { + "epoch": 1.5912102267425585, + "grad_norm": 0.731884032460504, + "learning_rate": 0.00025882342257274794, + "loss": 7.2213, + "step": 17053 + }, + { + "epoch": 1.5913035364374357, + "grad_norm": 1.1377639173412175, + "learning_rate": 0.00025881821678643783, + "loss": 6.9492, + "step": 17054 + }, + { + "epoch": 1.5913968461323131, + "grad_norm": 1.1546966520672808, + "learning_rate": 0.00025881301072343537, + "loss": 7.0219, + "step": 17055 + }, + { + "epoch": 1.5914901558271906, + "grad_norm": 2.1417422783087967, + "learning_rate": 0.0002588078043837535, + "loss": 7.2667, + "step": 17056 + }, + { + "epoch": 1.5915834655220678, + "grad_norm": 1.5972917161250713, + "learning_rate": 0.00025880259776740565, + "loss": 7.3147, + "step": 17057 + }, + { + "epoch": 1.591676775216945, + "grad_norm": 7028.167759302155, + "learning_rate": 0.00025879739087440506, + "loss": 6.8314, + "step": 17058 + }, + { + "epoch": 1.5917700849118224, + "grad_norm": 0.8808362234532485, + "learning_rate": 0.0002587921837047649, + "loss": 7.1682, + "step": 17059 + }, + { + "epoch": 1.5918633946066998, + "grad_norm": 0.42135220355262976, + "learning_rate": 0.0002587869762584985, + "loss": 6.8905, + "step": 17060 + }, + { + "epoch": 1.591956704301577, + "grad_norm": 23755.54278068137, + "learning_rate": 0.000258781768535619, + "loss": 7.0983, + "step": 17061 + }, + { + "epoch": 1.5920500139964542, + "grad_norm": 0.7817197532550415, + "learning_rate": 0.00025877656053613975, + "loss": 7.2812, + "step": 17062 + }, + { + "epoch": 1.5921433236913316, + "grad_norm": 6978.541727482819, + "learning_rate": 0.00025877135226007385, + "loss": 7.2231, + "step": 17063 + }, + { + "epoch": 1.5922366333862088, + "grad_norm": 0.8025305110875905, + "learning_rate": 0.00025876614370743463, + "loss": 7.2487, + "step": 17064 + }, + { + "epoch": 1.592329943081086, + "grad_norm": 1.0747661878849126, + "learning_rate": 0.00025876093487823534, + "loss": 7.3924, + "step": 17065 + }, + { + "epoch": 1.5924232527759634, + "grad_norm": 0.7985365277761878, + "learning_rate": 0.00025875572577248925, + "loss": 7.3478, + "step": 17066 + }, + { + "epoch": 1.5925165624708408, + "grad_norm": 727.0879552842113, + "learning_rate": 0.00025875051639020954, + "loss": 7.6163, + "step": 17067 + }, + { + "epoch": 1.592609872165718, + "grad_norm": 0.8604948189652359, + "learning_rate": 0.0002587453067314094, + "loss": 7.1968, + "step": 17068 + }, + { + "epoch": 1.5927031818605952, + "grad_norm": 0.7307709943148465, + "learning_rate": 0.0002587400967961023, + "loss": 7.5142, + "step": 17069 + }, + { + "epoch": 1.5927964915554726, + "grad_norm": 1.0425560673553065, + "learning_rate": 0.00025873488658430125, + "loss": 7.1518, + "step": 17070 + }, + { + "epoch": 1.59288980125035, + "grad_norm": 0.878987424324742, + "learning_rate": 0.00025872967609601963, + "loss": 7.2601, + "step": 17071 + }, + { + "epoch": 1.592983110945227, + "grad_norm": 0.6247350170725654, + "learning_rate": 0.00025872446533127064, + "loss": 7.4244, + "step": 17072 + }, + { + "epoch": 1.5930764206401045, + "grad_norm": 1.2923067217931667, + "learning_rate": 0.00025871925429006755, + "loss": 6.9098, + "step": 17073 + }, + { + "epoch": 1.5931697303349819, + "grad_norm": 5524.290618309842, + "learning_rate": 0.0002587140429724236, + "loss": 7.2597, + "step": 17074 + }, + { + "epoch": 1.593263040029859, + "grad_norm": 0.7978400591344506, + "learning_rate": 0.00025870883137835203, + "loss": 7.2399, + "step": 17075 + }, + { + "epoch": 1.5933563497247363, + "grad_norm": 1.2161930595651962, + "learning_rate": 0.0002587036195078661, + "loss": 7.0918, + "step": 17076 + }, + { + "epoch": 1.5934496594196137, + "grad_norm": 0.9460235622591947, + "learning_rate": 0.00025869840736097906, + "loss": 7.2775, + "step": 17077 + }, + { + "epoch": 1.5935429691144911, + "grad_norm": 0.6603178280159532, + "learning_rate": 0.0002586931949377042, + "loss": 7.3822, + "step": 17078 + }, + { + "epoch": 1.5936362788093683, + "grad_norm": 657.8331371595278, + "learning_rate": 0.0002586879822380547, + "loss": 7.0584, + "step": 17079 + }, + { + "epoch": 1.5937295885042455, + "grad_norm": 39425.536671575166, + "learning_rate": 0.00025868276926204386, + "loss": 7.0734, + "step": 17080 + }, + { + "epoch": 1.593822898199123, + "grad_norm": 1.0514584563985616, + "learning_rate": 0.000258677556009685, + "loss": 7.1534, + "step": 17081 + }, + { + "epoch": 1.5939162078940003, + "grad_norm": 0.8106959221011163, + "learning_rate": 0.00025867234248099126, + "loss": 7.0613, + "step": 17082 + }, + { + "epoch": 1.5940095175888773, + "grad_norm": 1.0654671887333045, + "learning_rate": 0.00025866712867597593, + "loss": 7.171, + "step": 17083 + }, + { + "epoch": 1.5941028272837547, + "grad_norm": 1.3661455357939818, + "learning_rate": 0.0002586619145946523, + "loss": 6.9018, + "step": 17084 + }, + { + "epoch": 1.5941961369786322, + "grad_norm": 1.728926801829054, + "learning_rate": 0.0002586567002370336, + "loss": 7.2556, + "step": 17085 + }, + { + "epoch": 1.5942894466735094, + "grad_norm": 1.6509604837710286, + "learning_rate": 0.0002586514856031331, + "loss": 7.4552, + "step": 17086 + }, + { + "epoch": 1.5943827563683866, + "grad_norm": 2.184730333474797, + "learning_rate": 0.00025864627069296405, + "loss": 7.5762, + "step": 17087 + }, + { + "epoch": 1.594476066063264, + "grad_norm": 1.4534995531517878, + "learning_rate": 0.0002586410555065398, + "loss": 7.4236, + "step": 17088 + }, + { + "epoch": 1.5945693757581414, + "grad_norm": 1.3006495620058947, + "learning_rate": 0.0002586358400438734, + "loss": 7.145, + "step": 17089 + }, + { + "epoch": 1.5946626854530186, + "grad_norm": 1.2907591326112313, + "learning_rate": 0.0002586306243049783, + "loss": 7.5612, + "step": 17090 + }, + { + "epoch": 1.5947559951478958, + "grad_norm": 1.0503450478595828, + "learning_rate": 0.0002586254082898677, + "loss": 7.3636, + "step": 17091 + }, + { + "epoch": 1.5948493048427732, + "grad_norm": 1.0914825788306566, + "learning_rate": 0.00025862019199855485, + "loss": 7.4515, + "step": 17092 + }, + { + "epoch": 1.5949426145376506, + "grad_norm": 1.033857205824216, + "learning_rate": 0.00025861497543105304, + "loss": 7.2515, + "step": 17093 + }, + { + "epoch": 1.5950359242325276, + "grad_norm": 0.8456093276690897, + "learning_rate": 0.0002586097585873755, + "loss": 7.4119, + "step": 17094 + }, + { + "epoch": 1.595129233927405, + "grad_norm": 1430247.2008690326, + "learning_rate": 0.00025860454146753553, + "loss": 7.1654, + "step": 17095 + }, + { + "epoch": 1.5952225436222824, + "grad_norm": 1.4116722592749993, + "learning_rate": 0.0002585993240715464, + "loss": 7.4713, + "step": 17096 + }, + { + "epoch": 1.5953158533171596, + "grad_norm": 0.9468264590022184, + "learning_rate": 0.00025859410639942135, + "loss": 6.8212, + "step": 17097 + }, + { + "epoch": 1.5954091630120368, + "grad_norm": 0.9514925039333103, + "learning_rate": 0.0002585888884511736, + "loss": 7.0198, + "step": 17098 + }, + { + "epoch": 1.5955024727069143, + "grad_norm": 0.781422414179383, + "learning_rate": 0.0002585836702268166, + "loss": 6.9949, + "step": 17099 + }, + { + "epoch": 1.5955957824017917, + "grad_norm": 0.969407154243033, + "learning_rate": 0.0002585784517263634, + "loss": 7.6278, + "step": 17100 + }, + { + "epoch": 1.5956890920966689, + "grad_norm": 0.9658092458399138, + "learning_rate": 0.00025857323294982735, + "loss": 7.6925, + "step": 17101 + }, + { + "epoch": 1.595782401791546, + "grad_norm": 4144.698370417937, + "learning_rate": 0.0002585680138972218, + "loss": 7.4597, + "step": 17102 + }, + { + "epoch": 1.5958757114864235, + "grad_norm": 0.8832711525099589, + "learning_rate": 0.00025856279456855987, + "loss": 7.5921, + "step": 17103 + }, + { + "epoch": 1.5959690211813007, + "grad_norm": 1.6898332732618344, + "learning_rate": 0.0002585575749638549, + "loss": 6.9869, + "step": 17104 + }, + { + "epoch": 1.5960623308761779, + "grad_norm": 0.970697084363762, + "learning_rate": 0.00025855235508312024, + "loss": 7.3379, + "step": 17105 + }, + { + "epoch": 1.5961556405710553, + "grad_norm": 0.6236200008359616, + "learning_rate": 0.00025854713492636915, + "loss": 7.3519, + "step": 17106 + }, + { + "epoch": 1.5962489502659327, + "grad_norm": 3714.738572044299, + "learning_rate": 0.0002585419144936148, + "loss": 7.1556, + "step": 17107 + }, + { + "epoch": 1.59634225996081, + "grad_norm": 0.5384087655294704, + "learning_rate": 0.00025853669378487045, + "loss": 7.1849, + "step": 17108 + }, + { + "epoch": 1.5964355696556871, + "grad_norm": 0.5496063454339243, + "learning_rate": 0.0002585314728001495, + "loss": 7.0381, + "step": 17109 + }, + { + "epoch": 1.5965288793505645, + "grad_norm": 0.8711992253637756, + "learning_rate": 0.0002585262515394651, + "loss": 7.1098, + "step": 17110 + }, + { + "epoch": 1.596622189045442, + "grad_norm": 171.07909908699258, + "learning_rate": 0.0002585210300028306, + "loss": 7.1993, + "step": 17111 + }, + { + "epoch": 1.5967154987403192, + "grad_norm": 266.07811774459634, + "learning_rate": 0.00025851580819025936, + "loss": 6.9483, + "step": 17112 + }, + { + "epoch": 1.5968088084351963, + "grad_norm": 0.8381052890079888, + "learning_rate": 0.00025851058610176455, + "loss": 7.272, + "step": 17113 + }, + { + "epoch": 1.5969021181300738, + "grad_norm": 519.9898915169256, + "learning_rate": 0.00025850536373735937, + "loss": 6.9499, + "step": 17114 + }, + { + "epoch": 1.596995427824951, + "grad_norm": 0.5942234975061889, + "learning_rate": 0.00025850014109705726, + "loss": 7.2849, + "step": 17115 + }, + { + "epoch": 1.5970887375198282, + "grad_norm": 0.8501976530489415, + "learning_rate": 0.0002584949181808714, + "loss": 7.2945, + "step": 17116 + }, + { + "epoch": 1.5971820472147056, + "grad_norm": 1.6999857779358418, + "learning_rate": 0.0002584896949888151, + "loss": 6.806, + "step": 17117 + }, + { + "epoch": 1.597275356909583, + "grad_norm": 1.2404297386437417, + "learning_rate": 0.0002584844715209017, + "loss": 7.0058, + "step": 17118 + }, + { + "epoch": 1.5973686666044602, + "grad_norm": 0.44390370844661603, + "learning_rate": 0.00025847924777714433, + "loss": 7.4518, + "step": 17119 + }, + { + "epoch": 1.5974619762993374, + "grad_norm": 0.4945531203646248, + "learning_rate": 0.00025847402375755643, + "loss": 7.3181, + "step": 17120 + }, + { + "epoch": 1.5975552859942148, + "grad_norm": 1045.1449714991722, + "learning_rate": 0.00025846879946215124, + "loss": 7.1033, + "step": 17121 + }, + { + "epoch": 1.5976485956890922, + "grad_norm": 0.9890700400181989, + "learning_rate": 0.000258463574890942, + "loss": 7.5026, + "step": 17122 + }, + { + "epoch": 1.5977419053839694, + "grad_norm": 1.1072443513462438, + "learning_rate": 0.000258458350043942, + "loss": 7.5197, + "step": 17123 + }, + { + "epoch": 1.5978352150788466, + "grad_norm": 0.5677722488260738, + "learning_rate": 0.00025845312492116457, + "loss": 7.321, + "step": 17124 + }, + { + "epoch": 1.597928524773724, + "grad_norm": 0.9205477825584281, + "learning_rate": 0.0002584478995226229, + "loss": 7.454, + "step": 17125 + }, + { + "epoch": 1.5980218344686012, + "grad_norm": 2150.294628550382, + "learning_rate": 0.00025844267384833044, + "loss": 7.2345, + "step": 17126 + }, + { + "epoch": 1.5981151441634784, + "grad_norm": 0.4900125808743811, + "learning_rate": 0.00025843744789830035, + "loss": 7.3916, + "step": 17127 + }, + { + "epoch": 1.5982084538583559, + "grad_norm": 663.5133817414387, + "learning_rate": 0.00025843222167254594, + "loss": 7.1948, + "step": 17128 + }, + { + "epoch": 1.5983017635532333, + "grad_norm": 331.57203146758275, + "learning_rate": 0.00025842699517108055, + "loss": 7.1944, + "step": 17129 + }, + { + "epoch": 1.5983950732481105, + "grad_norm": 1.157284708825876, + "learning_rate": 0.00025842176839391734, + "loss": 6.8695, + "step": 17130 + }, + { + "epoch": 1.5984883829429877, + "grad_norm": 0.7223708933650075, + "learning_rate": 0.00025841654134106976, + "loss": 6.9278, + "step": 17131 + }, + { + "epoch": 1.598581692637865, + "grad_norm": 0.33756926160664513, + "learning_rate": 0.00025841131401255104, + "loss": 7.0291, + "step": 17132 + }, + { + "epoch": 1.5986750023327425, + "grad_norm": 0.872691530549649, + "learning_rate": 0.0002584060864083744, + "loss": 7.0842, + "step": 17133 + }, + { + "epoch": 1.5987683120276197, + "grad_norm": 1.4599618418220028, + "learning_rate": 0.00025840085852855325, + "loss": 7.3315, + "step": 17134 + }, + { + "epoch": 1.598861621722497, + "grad_norm": 0.7564652263908898, + "learning_rate": 0.0002583956303731008, + "loss": 6.9994, + "step": 17135 + }, + { + "epoch": 1.5989549314173743, + "grad_norm": 1.1028345243274444, + "learning_rate": 0.00025839040194203044, + "loss": 7.2078, + "step": 17136 + }, + { + "epoch": 1.5990482411122515, + "grad_norm": 0.9775420784848716, + "learning_rate": 0.00025838517323535534, + "loss": 7.2829, + "step": 17137 + }, + { + "epoch": 1.5991415508071287, + "grad_norm": 0.29797398464875147, + "learning_rate": 0.0002583799442530888, + "loss": 6.7862, + "step": 17138 + }, + { + "epoch": 1.5992348605020061, + "grad_norm": 0.8094536111694657, + "learning_rate": 0.0002583747149952442, + "loss": 7.1636, + "step": 17139 + }, + { + "epoch": 1.5993281701968836, + "grad_norm": 0.5271177800093553, + "learning_rate": 0.00025836948546183484, + "loss": 7.0142, + "step": 17140 + }, + { + "epoch": 1.5994214798917608, + "grad_norm": 1281.9196287319714, + "learning_rate": 0.00025836425565287395, + "loss": 7.2854, + "step": 17141 + }, + { + "epoch": 1.599514789586638, + "grad_norm": 0.9457293165073082, + "learning_rate": 0.0002583590255683749, + "loss": 6.8639, + "step": 17142 + }, + { + "epoch": 1.5996080992815154, + "grad_norm": 1.202618874900934, + "learning_rate": 0.0002583537952083509, + "loss": 6.8358, + "step": 17143 + }, + { + "epoch": 1.5997014089763928, + "grad_norm": 1163.581912678478, + "learning_rate": 0.0002583485645728153, + "loss": 7.0553, + "step": 17144 + }, + { + "epoch": 1.59979471867127, + "grad_norm": 0.4998799565158151, + "learning_rate": 0.0002583433336617814, + "loss": 7.2982, + "step": 17145 + }, + { + "epoch": 1.5998880283661472, + "grad_norm": 0.38099188585927923, + "learning_rate": 0.0002583381024752625, + "loss": 7.1169, + "step": 17146 + }, + { + "epoch": 1.5999813380610246, + "grad_norm": 0.5114110716405543, + "learning_rate": 0.0002583328710132719, + "loss": 7.0552, + "step": 17147 + }, + { + "epoch": 1.6000746477559018, + "grad_norm": 0.5255008469141911, + "learning_rate": 0.0002583276392758229, + "loss": 6.9448, + "step": 17148 + }, + { + "epoch": 1.600167957450779, + "grad_norm": 83.32053516849282, + "learning_rate": 0.00025832240726292874, + "loss": 7.3893, + "step": 17149 + }, + { + "epoch": 1.6002612671456564, + "grad_norm": 0.3951092223455399, + "learning_rate": 0.0002583171749746029, + "loss": 7.1146, + "step": 17150 + }, + { + "epoch": 1.6003545768405338, + "grad_norm": 0.4349987423344366, + "learning_rate": 0.0002583119424108585, + "loss": 6.8868, + "step": 17151 + }, + { + "epoch": 1.600447886535411, + "grad_norm": 0.39090482336731885, + "learning_rate": 0.00025830670957170893, + "loss": 7.2169, + "step": 17152 + }, + { + "epoch": 1.6005411962302882, + "grad_norm": 0.5073764583603673, + "learning_rate": 0.00025830147645716745, + "loss": 7.3838, + "step": 17153 + }, + { + "epoch": 1.6006345059251657, + "grad_norm": 0.4985316951410261, + "learning_rate": 0.00025829624306724746, + "loss": 7.0145, + "step": 17154 + }, + { + "epoch": 1.600727815620043, + "grad_norm": 0.42071963172613197, + "learning_rate": 0.00025829100940196213, + "loss": 7.1852, + "step": 17155 + }, + { + "epoch": 1.6008211253149203, + "grad_norm": 0.2873322880659821, + "learning_rate": 0.0002582857754613249, + "loss": 6.9941, + "step": 17156 + }, + { + "epoch": 1.6009144350097975, + "grad_norm": 0.5324362626345547, + "learning_rate": 0.000258280541245349, + "loss": 7.2601, + "step": 17157 + }, + { + "epoch": 1.6010077447046749, + "grad_norm": 0.8276928319033304, + "learning_rate": 0.00025827530675404776, + "loss": 7.2835, + "step": 17158 + }, + { + "epoch": 1.601101054399552, + "grad_norm": 627.4245543378815, + "learning_rate": 0.00025827007198743447, + "loss": 7.314, + "step": 17159 + }, + { + "epoch": 1.6011943640944293, + "grad_norm": 0.5912386134909995, + "learning_rate": 0.0002582648369455225, + "loss": 7.1985, + "step": 17160 + }, + { + "epoch": 1.6012876737893067, + "grad_norm": 0.7117237091862817, + "learning_rate": 0.0002582596016283251, + "loss": 7.0473, + "step": 17161 + }, + { + "epoch": 1.6013809834841841, + "grad_norm": 0.28826514678193826, + "learning_rate": 0.00025825436603585557, + "loss": 6.906, + "step": 17162 + }, + { + "epoch": 1.6014742931790613, + "grad_norm": 1.535149861551744, + "learning_rate": 0.0002582491301681273, + "loss": 7.4888, + "step": 17163 + }, + { + "epoch": 1.6015676028739385, + "grad_norm": 0.3341418426280334, + "learning_rate": 0.00025824389402515356, + "loss": 6.7488, + "step": 17164 + }, + { + "epoch": 1.601660912568816, + "grad_norm": 0.8053323820697923, + "learning_rate": 0.0002582386576069476, + "loss": 6.8575, + "step": 17165 + }, + { + "epoch": 1.6017542222636934, + "grad_norm": 0.7623482387175071, + "learning_rate": 0.00025823342091352286, + "loss": 7.1151, + "step": 17166 + }, + { + "epoch": 1.6018475319585705, + "grad_norm": 0.7492016898518107, + "learning_rate": 0.00025822818394489257, + "loss": 7.163, + "step": 17167 + }, + { + "epoch": 1.6019408416534477, + "grad_norm": 0.753268073756084, + "learning_rate": 0.0002582229467010701, + "loss": 7.0439, + "step": 17168 + }, + { + "epoch": 1.6020341513483252, + "grad_norm": 0.41643220847199175, + "learning_rate": 0.0002582177091820687, + "loss": 7.2023, + "step": 17169 + }, + { + "epoch": 1.6021274610432024, + "grad_norm": 0.3876134864122157, + "learning_rate": 0.0002582124713879017, + "loss": 7.3118, + "step": 17170 + }, + { + "epoch": 1.6022207707380796, + "grad_norm": 0.5145487282413738, + "learning_rate": 0.00025820723331858243, + "loss": 7.1059, + "step": 17171 + }, + { + "epoch": 1.602314080432957, + "grad_norm": 31.123588211048137, + "learning_rate": 0.0002582019949741243, + "loss": 7.1861, + "step": 17172 + }, + { + "epoch": 1.6024073901278344, + "grad_norm": 0.5689456055349934, + "learning_rate": 0.0002581967563545405, + "loss": 7.4215, + "step": 17173 + }, + { + "epoch": 1.6025006998227116, + "grad_norm": 0.570922819389437, + "learning_rate": 0.00025819151745984435, + "loss": 7.0069, + "step": 17174 + }, + { + "epoch": 1.6025940095175888, + "grad_norm": 185.1430614135734, + "learning_rate": 0.0002581862782900493, + "loss": 7.2195, + "step": 17175 + }, + { + "epoch": 1.6026873192124662, + "grad_norm": 0.5622704020698123, + "learning_rate": 0.0002581810388451685, + "loss": 6.8559, + "step": 17176 + }, + { + "epoch": 1.6027806289073436, + "grad_norm": 0.6120495931881439, + "learning_rate": 0.00025817579912521546, + "loss": 7.0642, + "step": 17177 + }, + { + "epoch": 1.6028739386022206, + "grad_norm": 0.9618457850263381, + "learning_rate": 0.0002581705591302034, + "loss": 7.1697, + "step": 17178 + }, + { + "epoch": 1.602967248297098, + "grad_norm": 1.6057023738655063, + "learning_rate": 0.0002581653188601456, + "loss": 7.3993, + "step": 17179 + }, + { + "epoch": 1.6030605579919754, + "grad_norm": 447.8744724896487, + "learning_rate": 0.0002581600783150555, + "loss": 7.2336, + "step": 17180 + }, + { + "epoch": 1.6031538676868526, + "grad_norm": 0.35714453958176196, + "learning_rate": 0.0002581548374949463, + "loss": 7.1547, + "step": 17181 + }, + { + "epoch": 1.6032471773817298, + "grad_norm": 0.8158252236617001, + "learning_rate": 0.00025814959639983143, + "loss": 7.1992, + "step": 17182 + }, + { + "epoch": 1.6033404870766073, + "grad_norm": 1.3475093462592198, + "learning_rate": 0.0002581443550297241, + "loss": 7.0293, + "step": 17183 + }, + { + "epoch": 1.6034337967714847, + "grad_norm": 1.9425336730692655, + "learning_rate": 0.0002581391133846378, + "loss": 6.786, + "step": 17184 + }, + { + "epoch": 1.6035271064663619, + "grad_norm": 1.2826111100188908, + "learning_rate": 0.00025813387146458573, + "loss": 7.1769, + "step": 17185 + }, + { + "epoch": 1.603620416161239, + "grad_norm": 0.5593474725937255, + "learning_rate": 0.0002581286292695812, + "loss": 7.5158, + "step": 17186 + }, + { + "epoch": 1.6037137258561165, + "grad_norm": 1.0230390670324054, + "learning_rate": 0.00025812338679963764, + "loss": 6.7385, + "step": 17187 + }, + { + "epoch": 1.603807035550994, + "grad_norm": 0.6886637038849002, + "learning_rate": 0.0002581181440547684, + "loss": 7.4045, + "step": 17188 + }, + { + "epoch": 1.6039003452458709, + "grad_norm": 0.4359822288749005, + "learning_rate": 0.00025811290103498665, + "loss": 6.8439, + "step": 17189 + }, + { + "epoch": 1.6039936549407483, + "grad_norm": 0.5876984239867705, + "learning_rate": 0.00025810765774030584, + "loss": 7.0597, + "step": 17190 + }, + { + "epoch": 1.6040869646356257, + "grad_norm": 33.278833447111126, + "learning_rate": 0.00025810241417073933, + "loss": 7.4432, + "step": 17191 + }, + { + "epoch": 1.604180274330503, + "grad_norm": 0.5293839150220516, + "learning_rate": 0.00025809717032630037, + "loss": 7.089, + "step": 17192 + }, + { + "epoch": 1.6042735840253801, + "grad_norm": 0.8177560074490047, + "learning_rate": 0.0002580919262070023, + "loss": 7.1611, + "step": 17193 + }, + { + "epoch": 1.6043668937202575, + "grad_norm": 65.67818578826545, + "learning_rate": 0.00025808668181285854, + "loss": 7.1186, + "step": 17194 + }, + { + "epoch": 1.604460203415135, + "grad_norm": 0.7941006529074386, + "learning_rate": 0.0002580814371438823, + "loss": 7.2483, + "step": 17195 + }, + { + "epoch": 1.6045535131100122, + "grad_norm": 0.6924993279434624, + "learning_rate": 0.00025807619220008705, + "loss": 7.2072, + "step": 17196 + }, + { + "epoch": 1.6046468228048894, + "grad_norm": 0.3374589049201639, + "learning_rate": 0.000258070946981486, + "loss": 7.2261, + "step": 17197 + }, + { + "epoch": 1.6047401324997668, + "grad_norm": 0.3111479101839009, + "learning_rate": 0.0002580657014880926, + "loss": 7.3029, + "step": 17198 + }, + { + "epoch": 1.6048334421946442, + "grad_norm": 0.5204684263858143, + "learning_rate": 0.0002580604557199201, + "loss": 7.216, + "step": 17199 + }, + { + "epoch": 1.6049267518895212, + "grad_norm": 0.4501837087893665, + "learning_rate": 0.00025805520967698187, + "loss": 7.1158, + "step": 17200 + }, + { + "epoch": 1.6050200615843986, + "grad_norm": 0.7780292561235818, + "learning_rate": 0.00025804996335929126, + "loss": 7.3073, + "step": 17201 + }, + { + "epoch": 1.605113371279276, + "grad_norm": 0.652089932471637, + "learning_rate": 0.00025804471676686155, + "loss": 7.096, + "step": 17202 + }, + { + "epoch": 1.6052066809741532, + "grad_norm": 0.28796792305116475, + "learning_rate": 0.0002580394698997062, + "loss": 6.7715, + "step": 17203 + }, + { + "epoch": 1.6052999906690304, + "grad_norm": 1.0749783313359709, + "learning_rate": 0.00025803422275783846, + "loss": 7.478, + "step": 17204 + }, + { + "epoch": 1.6053933003639078, + "grad_norm": 0.7362639166753093, + "learning_rate": 0.00025802897534127166, + "loss": 7.0862, + "step": 17205 + }, + { + "epoch": 1.6054866100587852, + "grad_norm": 0.32873575575877645, + "learning_rate": 0.0002580237276500192, + "loss": 7.1014, + "step": 17206 + }, + { + "epoch": 1.6055799197536624, + "grad_norm": 0.38999954996270353, + "learning_rate": 0.0002580184796840944, + "loss": 7.1885, + "step": 17207 + }, + { + "epoch": 1.6056732294485396, + "grad_norm": 0.35749919465502605, + "learning_rate": 0.00025801323144351065, + "loss": 7.235, + "step": 17208 + }, + { + "epoch": 1.605766539143417, + "grad_norm": 0.45680440061257194, + "learning_rate": 0.0002580079829282812, + "loss": 7.3347, + "step": 17209 + }, + { + "epoch": 1.6058598488382942, + "grad_norm": 1672.691566202278, + "learning_rate": 0.00025800273413841945, + "loss": 6.9777, + "step": 17210 + }, + { + "epoch": 1.6059531585331714, + "grad_norm": 0.556237383635953, + "learning_rate": 0.00025799748507393874, + "loss": 7.1509, + "step": 17211 + }, + { + "epoch": 1.6060464682280489, + "grad_norm": 1.317575768686887, + "learning_rate": 0.0002579922357348524, + "loss": 7.3232, + "step": 17212 + }, + { + "epoch": 1.6061397779229263, + "grad_norm": 0.8754649815507731, + "learning_rate": 0.00025798698612117385, + "loss": 6.7541, + "step": 17213 + }, + { + "epoch": 1.6062330876178035, + "grad_norm": 1.1365745230915996, + "learning_rate": 0.00025798173623291633, + "loss": 7.127, + "step": 17214 + }, + { + "epoch": 1.6063263973126807, + "grad_norm": 1.1202602000900241, + "learning_rate": 0.00025797648607009323, + "loss": 6.9057, + "step": 17215 + }, + { + "epoch": 1.606419707007558, + "grad_norm": 0.6904017859773125, + "learning_rate": 0.00025797123563271794, + "loss": 7.2308, + "step": 17216 + }, + { + "epoch": 1.6065130167024355, + "grad_norm": 0.6299596335805917, + "learning_rate": 0.00025796598492080374, + "loss": 7.1731, + "step": 17217 + }, + { + "epoch": 1.6066063263973127, + "grad_norm": 0.3310672559995358, + "learning_rate": 0.0002579607339343641, + "loss": 6.9703, + "step": 17218 + }, + { + "epoch": 1.60669963609219, + "grad_norm": 0.5227412478833584, + "learning_rate": 0.00025795548267341223, + "loss": 7.4032, + "step": 17219 + }, + { + "epoch": 1.6067929457870673, + "grad_norm": 120.57873036270085, + "learning_rate": 0.0002579502311379616, + "loss": 7.3969, + "step": 17220 + }, + { + "epoch": 1.6068862554819445, + "grad_norm": 0.6807713487617868, + "learning_rate": 0.00025794497932802544, + "loss": 7.424, + "step": 17221 + }, + { + "epoch": 1.6069795651768217, + "grad_norm": 0.8085377758652639, + "learning_rate": 0.0002579397272436172, + "loss": 6.9488, + "step": 17222 + }, + { + "epoch": 1.6070728748716991, + "grad_norm": 0.37841894325802494, + "learning_rate": 0.0002579344748847502, + "loss": 7.0994, + "step": 17223 + }, + { + "epoch": 1.6071661845665766, + "grad_norm": 0.4306352844943846, + "learning_rate": 0.00025792922225143785, + "loss": 7.1867, + "step": 17224 + }, + { + "epoch": 1.6072594942614538, + "grad_norm": 0.483980306891543, + "learning_rate": 0.00025792396934369344, + "loss": 7.2912, + "step": 17225 + }, + { + "epoch": 1.607352803956331, + "grad_norm": 0.8379292705535671, + "learning_rate": 0.00025791871616153035, + "loss": 7.3125, + "step": 17226 + }, + { + "epoch": 1.6074461136512084, + "grad_norm": 0.5332039445388359, + "learning_rate": 0.00025791346270496193, + "loss": 7.0425, + "step": 17227 + }, + { + "epoch": 1.6075394233460858, + "grad_norm": 38.67777780023808, + "learning_rate": 0.0002579082089740015, + "loss": 7.0544, + "step": 17228 + }, + { + "epoch": 1.607632733040963, + "grad_norm": 0.5857333754021488, + "learning_rate": 0.00025790295496866254, + "loss": 7.605, + "step": 17229 + }, + { + "epoch": 1.6077260427358402, + "grad_norm": 0.8368643565512954, + "learning_rate": 0.00025789770068895823, + "loss": 7.2635, + "step": 17230 + }, + { + "epoch": 1.6078193524307176, + "grad_norm": 1.5992871485487163, + "learning_rate": 0.00025789244613490204, + "loss": 7.1429, + "step": 17231 + }, + { + "epoch": 1.6079126621255948, + "grad_norm": 0.8253967805566155, + "learning_rate": 0.00025788719130650736, + "loss": 7.3319, + "step": 17232 + }, + { + "epoch": 1.608005971820472, + "grad_norm": 181.824603448523, + "learning_rate": 0.0002578819362037875, + "loss": 7.4246, + "step": 17233 + }, + { + "epoch": 1.6080992815153494, + "grad_norm": 156.01992359956003, + "learning_rate": 0.0002578766808267559, + "loss": 7.0755, + "step": 17234 + }, + { + "epoch": 1.6081925912102268, + "grad_norm": 1.4039815085298406, + "learning_rate": 0.00025787142517542575, + "loss": 7.3629, + "step": 17235 + }, + { + "epoch": 1.608285900905104, + "grad_norm": 1.4566071336419544, + "learning_rate": 0.00025786616924981055, + "loss": 7.2486, + "step": 17236 + }, + { + "epoch": 1.6083792105999812, + "grad_norm": 1.232899765774251, + "learning_rate": 0.0002578609130499237, + "loss": 7.1894, + "step": 17237 + }, + { + "epoch": 1.6084725202948587, + "grad_norm": 0.7768823034851585, + "learning_rate": 0.0002578556565757784, + "loss": 7.4627, + "step": 17238 + }, + { + "epoch": 1.608565829989736, + "grad_norm": 55.60189325315874, + "learning_rate": 0.00025785039982738816, + "loss": 7.0942, + "step": 17239 + }, + { + "epoch": 1.6086591396846133, + "grad_norm": 0.903987273060778, + "learning_rate": 0.0002578451428047663, + "loss": 7.3153, + "step": 17240 + }, + { + "epoch": 1.6087524493794905, + "grad_norm": 0.841783497471255, + "learning_rate": 0.0002578398855079262, + "loss": 7.576, + "step": 17241 + }, + { + "epoch": 1.6088457590743679, + "grad_norm": 1.1169007155005937, + "learning_rate": 0.00025783462793688123, + "loss": 7.2305, + "step": 17242 + }, + { + "epoch": 1.608939068769245, + "grad_norm": 1.352614160762428, + "learning_rate": 0.0002578293700916447, + "loss": 7.1279, + "step": 17243 + }, + { + "epoch": 1.6090323784641223, + "grad_norm": 1.6577670531890667, + "learning_rate": 0.0002578241119722301, + "loss": 6.9241, + "step": 17244 + }, + { + "epoch": 1.6091256881589997, + "grad_norm": 820.4180600693817, + "learning_rate": 0.0002578188535786506, + "loss": 7.259, + "step": 17245 + }, + { + "epoch": 1.6092189978538771, + "grad_norm": 686.6867622612708, + "learning_rate": 0.0002578135949109198, + "loss": 6.8412, + "step": 17246 + }, + { + "epoch": 1.6093123075487543, + "grad_norm": 2066.3059644784084, + "learning_rate": 0.0002578083359690509, + "loss": 7.1581, + "step": 17247 + }, + { + "epoch": 1.6094056172436315, + "grad_norm": 3.4596541466345956, + "learning_rate": 0.0002578030767530574, + "loss": 7.4041, + "step": 17248 + }, + { + "epoch": 1.609498926938509, + "grad_norm": 3.782990365400122, + "learning_rate": 0.0002577978172629526, + "loss": 7.3491, + "step": 17249 + }, + { + "epoch": 1.6095922366333864, + "grad_norm": 4.051055798147615, + "learning_rate": 0.0002577925574987498, + "loss": 7.2659, + "step": 17250 + }, + { + "epoch": 1.6096855463282636, + "grad_norm": 4.391108612294648, + "learning_rate": 0.00025778729746046256, + "loss": 7.4417, + "step": 17251 + }, + { + "epoch": 1.6097788560231407, + "grad_norm": 4.305197044222497, + "learning_rate": 0.00025778203714810406, + "loss": 7.4397, + "step": 17252 + }, + { + "epoch": 1.6098721657180182, + "grad_norm": 4.177102391455901, + "learning_rate": 0.0002577767765616878, + "loss": 7.5337, + "step": 17253 + }, + { + "epoch": 1.6099654754128954, + "grad_norm": 3.3541946495998283, + "learning_rate": 0.0002577715157012272, + "loss": 7.4966, + "step": 17254 + }, + { + "epoch": 1.6100587851077726, + "grad_norm": 8068.832651229854, + "learning_rate": 0.0002577662545667354, + "loss": 7.5164, + "step": 17255 + }, + { + "epoch": 1.61015209480265, + "grad_norm": 2.2545052344316066, + "learning_rate": 0.00025776099315822607, + "loss": 7.2943, + "step": 17256 + }, + { + "epoch": 1.6102454044975274, + "grad_norm": 1.7542350671314408, + "learning_rate": 0.00025775573147571244, + "loss": 7.3113, + "step": 17257 + }, + { + "epoch": 1.6103387141924046, + "grad_norm": 0.7405306304633179, + "learning_rate": 0.00025775046951920786, + "loss": 7.0761, + "step": 17258 + }, + { + "epoch": 1.6104320238872818, + "grad_norm": 0.8192672654929584, + "learning_rate": 0.00025774520728872577, + "loss": 6.8787, + "step": 17259 + }, + { + "epoch": 1.6105253335821592, + "grad_norm": 1.3481966292077505, + "learning_rate": 0.00025773994478427956, + "loss": 7.4945, + "step": 17260 + }, + { + "epoch": 1.6106186432770366, + "grad_norm": 1.4176286738530204, + "learning_rate": 0.00025773468200588256, + "loss": 7.4614, + "step": 17261 + }, + { + "epoch": 1.6107119529719138, + "grad_norm": 1.7952063479169267, + "learning_rate": 0.0002577294189535482, + "loss": 7.3811, + "step": 17262 + }, + { + "epoch": 1.610805262666791, + "grad_norm": 44867.12866341667, + "learning_rate": 0.0002577241556272898, + "loss": 7.4362, + "step": 17263 + }, + { + "epoch": 1.6108985723616684, + "grad_norm": 47333.04850269657, + "learning_rate": 0.0002577188920271208, + "loss": 7.3833, + "step": 17264 + }, + { + "epoch": 1.6109918820565456, + "grad_norm": 2.5281352764247043, + "learning_rate": 0.00025771362815305455, + "loss": 7.8409, + "step": 17265 + }, + { + "epoch": 1.6110851917514228, + "grad_norm": 3.5531935619206787, + "learning_rate": 0.0002577083640051045, + "loss": 7.7826, + "step": 17266 + }, + { + "epoch": 1.6111785014463003, + "grad_norm": 1332898.798791065, + "learning_rate": 0.00025770309958328397, + "loss": 8.1705, + "step": 17267 + }, + { + "epoch": 1.6112718111411777, + "grad_norm": 5.120182469024426, + "learning_rate": 0.00025769783488760635, + "loss": 8.1873, + "step": 17268 + }, + { + "epoch": 1.6113651208360549, + "grad_norm": 5.307986556630015, + "learning_rate": 0.000257692569918085, + "loss": 8.4341, + "step": 17269 + }, + { + "epoch": 1.611458430530932, + "grad_norm": 4.1929537672373565, + "learning_rate": 0.0002576873046747334, + "loss": 8.1289, + "step": 17270 + }, + { + "epoch": 1.6115517402258095, + "grad_norm": 8523230.98847675, + "learning_rate": 0.00025768203915756485, + "loss": 8.2307, + "step": 17271 + }, + { + "epoch": 1.611645049920687, + "grad_norm": 2.694371919477234, + "learning_rate": 0.0002576767733665928, + "loss": 7.8088, + "step": 17272 + }, + { + "epoch": 1.611738359615564, + "grad_norm": 3.745426504459223, + "learning_rate": 0.0002576715073018306, + "loss": 7.9388, + "step": 17273 + }, + { + "epoch": 1.6118316693104413, + "grad_norm": 5.564470942693257, + "learning_rate": 0.0002576662409632916, + "loss": 7.9279, + "step": 17274 + }, + { + "epoch": 1.6119249790053187, + "grad_norm": 5.959268824561959, + "learning_rate": 0.0002576609743509893, + "loss": 7.9803, + "step": 17275 + }, + { + "epoch": 1.612018288700196, + "grad_norm": 1492241.3030737974, + "learning_rate": 0.0002576557074649371, + "loss": 7.8313, + "step": 17276 + }, + { + "epoch": 1.6121115983950731, + "grad_norm": 939723.8564391461, + "learning_rate": 0.00025765044030514825, + "loss": 7.5801, + "step": 17277 + }, + { + "epoch": 1.6122049080899505, + "grad_norm": 5.101686196091136, + "learning_rate": 0.0002576451728716362, + "loss": 7.6834, + "step": 17278 + }, + { + "epoch": 1.612298217784828, + "grad_norm": 3.3406636329662773, + "learning_rate": 0.0002576399051644144, + "loss": 7.8902, + "step": 17279 + }, + { + "epoch": 1.6123915274797052, + "grad_norm": 3.854319388929477, + "learning_rate": 0.0002576346371834962, + "loss": 7.9329, + "step": 17280 + }, + { + "epoch": 1.6124848371745824, + "grad_norm": 3.1984040288604114, + "learning_rate": 0.00025762936892889497, + "loss": 7.5559, + "step": 17281 + }, + { + "epoch": 1.6125781468694598, + "grad_norm": 2.1147178054935027, + "learning_rate": 0.00025762410040062415, + "loss": 7.6721, + "step": 17282 + }, + { + "epoch": 1.6126714565643372, + "grad_norm": 3.3245083612410036, + "learning_rate": 0.0002576188315986971, + "loss": 7.7167, + "step": 17283 + }, + { + "epoch": 1.6127647662592142, + "grad_norm": 1179209.9765439075, + "learning_rate": 0.0002576135625231273, + "loss": 7.6284, + "step": 17284 + }, + { + "epoch": 1.6128580759540916, + "grad_norm": 1.767760644519601, + "learning_rate": 0.00025760829317392805, + "loss": 7.28, + "step": 17285 + }, + { + "epoch": 1.612951385648969, + "grad_norm": 1.5183591024578846, + "learning_rate": 0.0002576030235511128, + "loss": 7.3035, + "step": 17286 + }, + { + "epoch": 1.6130446953438462, + "grad_norm": 2.1061650106277705, + "learning_rate": 0.00025759775365469495, + "loss": 7.2577, + "step": 17287 + }, + { + "epoch": 1.6131380050387234, + "grad_norm": 3.173615727362893, + "learning_rate": 0.00025759248348468784, + "loss": 7.6071, + "step": 17288 + }, + { + "epoch": 1.6132313147336008, + "grad_norm": 3.223630281264364, + "learning_rate": 0.000257587213041105, + "loss": 7.1262, + "step": 17289 + }, + { + "epoch": 1.6133246244284782, + "grad_norm": 4.479512117988542, + "learning_rate": 0.00025758194232395964, + "loss": 7.7222, + "step": 17290 + }, + { + "epoch": 1.6134179341233554, + "grad_norm": 4.346633400877487, + "learning_rate": 0.0002575766713332653, + "loss": 7.0498, + "step": 17291 + }, + { + "epoch": 1.6135112438182326, + "grad_norm": 3.889587587615171, + "learning_rate": 0.0002575714000690354, + "loss": 7.3569, + "step": 17292 + }, + { + "epoch": 1.61360455351311, + "grad_norm": 3.150794388818338, + "learning_rate": 0.00025756612853128325, + "loss": 7.29, + "step": 17293 + }, + { + "epoch": 1.6136978632079875, + "grad_norm": 2.82902379793243, + "learning_rate": 0.00025756085672002234, + "loss": 7.4958, + "step": 17294 + }, + { + "epoch": 1.6137911729028644, + "grad_norm": 0.6751436569720076, + "learning_rate": 0.000257555584635266, + "loss": 7.1293, + "step": 17295 + }, + { + "epoch": 1.6138844825977419, + "grad_norm": 1.1293066391855173, + "learning_rate": 0.00025755031227702764, + "loss": 7.013, + "step": 17296 + }, + { + "epoch": 1.6139777922926193, + "grad_norm": 1.1993508623415547, + "learning_rate": 0.00025754503964532075, + "loss": 7.3072, + "step": 17297 + }, + { + "epoch": 1.6140711019874965, + "grad_norm": 228641.36571309826, + "learning_rate": 0.00025753976674015866, + "loss": 7.4789, + "step": 17298 + }, + { + "epoch": 1.6141644116823737, + "grad_norm": 2.1155432858434726, + "learning_rate": 0.0002575344935615548, + "loss": 7.2841, + "step": 17299 + }, + { + "epoch": 1.614257721377251, + "grad_norm": 166105.46712540963, + "learning_rate": 0.0002575292201095226, + "loss": 7.2321, + "step": 17300 + }, + { + "epoch": 1.6143510310721285, + "grad_norm": 2.2400647697541136, + "learning_rate": 0.0002575239463840754, + "loss": 7.2268, + "step": 17301 + }, + { + "epoch": 1.6144443407670057, + "grad_norm": 1.4274299213749748, + "learning_rate": 0.0002575186723852267, + "loss": 7.5115, + "step": 17302 + }, + { + "epoch": 1.614537650461883, + "grad_norm": 1.5299508368272254, + "learning_rate": 0.00025751339811298985, + "loss": 7.3353, + "step": 17303 + }, + { + "epoch": 1.6146309601567603, + "grad_norm": 1.267286567116696, + "learning_rate": 0.00025750812356737827, + "loss": 7.323, + "step": 17304 + }, + { + "epoch": 1.6147242698516378, + "grad_norm": 0.6120783826573003, + "learning_rate": 0.0002575028487484054, + "loss": 7.2481, + "step": 17305 + }, + { + "epoch": 1.6148175795465147, + "grad_norm": 134622.50442686732, + "learning_rate": 0.0002574975736560846, + "loss": 7.2185, + "step": 17306 + }, + { + "epoch": 1.6149108892413921, + "grad_norm": 1.5171815850399895, + "learning_rate": 0.0002574922982904293, + "loss": 7.4412, + "step": 17307 + }, + { + "epoch": 1.6150041989362696, + "grad_norm": 1.3732066880632354, + "learning_rate": 0.00025748702265145296, + "loss": 7.247, + "step": 17308 + }, + { + "epoch": 1.6150975086311468, + "grad_norm": 1.6540474229404707, + "learning_rate": 0.0002574817467391689, + "loss": 7.3892, + "step": 17309 + }, + { + "epoch": 1.615190818326024, + "grad_norm": 1.2088464662399652, + "learning_rate": 0.00025747647055359064, + "loss": 7.1746, + "step": 17310 + }, + { + "epoch": 1.6152841280209014, + "grad_norm": 80673.27311174558, + "learning_rate": 0.0002574711940947316, + "loss": 6.8969, + "step": 17311 + }, + { + "epoch": 1.6153774377157788, + "grad_norm": 1.182609973976432, + "learning_rate": 0.0002574659173626051, + "loss": 7.3156, + "step": 17312 + }, + { + "epoch": 1.615470747410656, + "grad_norm": 11766.652903696799, + "learning_rate": 0.0002574606403572246, + "loss": 7.2678, + "step": 17313 + }, + { + "epoch": 1.6155640571055332, + "grad_norm": 1.0607722805031528, + "learning_rate": 0.00025745536307860355, + "loss": 7.5278, + "step": 17314 + }, + { + "epoch": 1.6156573668004106, + "grad_norm": 0.5380967563731391, + "learning_rate": 0.0002574500855267553, + "loss": 7.2916, + "step": 17315 + }, + { + "epoch": 1.6157506764952878, + "grad_norm": 0.9859937283510716, + "learning_rate": 0.00025744480770169335, + "loss": 7.212, + "step": 17316 + }, + { + "epoch": 1.615843986190165, + "grad_norm": 1.3591336092602802, + "learning_rate": 0.0002574395296034311, + "loss": 7.1535, + "step": 17317 + }, + { + "epoch": 1.6159372958850424, + "grad_norm": 0.971906712149362, + "learning_rate": 0.0002574342512319819, + "loss": 7.502, + "step": 17318 + }, + { + "epoch": 1.6160306055799198, + "grad_norm": 1.8484804215708168, + "learning_rate": 0.00025742897258735926, + "loss": 6.9242, + "step": 17319 + }, + { + "epoch": 1.616123915274797, + "grad_norm": 22821.841761461463, + "learning_rate": 0.0002574236936695766, + "loss": 7.3562, + "step": 17320 + }, + { + "epoch": 1.6162172249696742, + "grad_norm": 1.3548253624907711, + "learning_rate": 0.00025741841447864724, + "loss": 7.1398, + "step": 17321 + }, + { + "epoch": 1.6163105346645517, + "grad_norm": 0.7623110152752103, + "learning_rate": 0.00025741313501458465, + "loss": 7.2825, + "step": 17322 + }, + { + "epoch": 1.616403844359429, + "grad_norm": 0.8596935588621689, + "learning_rate": 0.00025740785527740236, + "loss": 6.8856, + "step": 17323 + }, + { + "epoch": 1.6164971540543063, + "grad_norm": 0.9609654841074778, + "learning_rate": 0.0002574025752671137, + "loss": 7.5348, + "step": 17324 + }, + { + "epoch": 1.6165904637491835, + "grad_norm": 0.6441137852899206, + "learning_rate": 0.000257397294983732, + "loss": 6.9074, + "step": 17325 + }, + { + "epoch": 1.616683773444061, + "grad_norm": 1.1369506902855995, + "learning_rate": 0.0002573920144272709, + "loss": 7.3271, + "step": 17326 + }, + { + "epoch": 1.616777083138938, + "grad_norm": 1252.4712821100077, + "learning_rate": 0.00025738673359774366, + "loss": 7.2722, + "step": 17327 + }, + { + "epoch": 1.6168703928338153, + "grad_norm": 121971.84839528146, + "learning_rate": 0.00025738145249516384, + "loss": 7.3257, + "step": 17328 + }, + { + "epoch": 1.6169637025286927, + "grad_norm": 0.9725640138600652, + "learning_rate": 0.0002573761711195447, + "loss": 6.9841, + "step": 17329 + }, + { + "epoch": 1.6170570122235701, + "grad_norm": 0.848694863865009, + "learning_rate": 0.00025737088947089986, + "loss": 7.3077, + "step": 17330 + }, + { + "epoch": 1.6171503219184473, + "grad_norm": 0.6978174306492679, + "learning_rate": 0.0002573656075492426, + "loss": 7.0206, + "step": 17331 + }, + { + "epoch": 1.6172436316133245, + "grad_norm": 0.6335804558634888, + "learning_rate": 0.0002573603253545864, + "loss": 7.3608, + "step": 17332 + }, + { + "epoch": 1.617336941308202, + "grad_norm": 0.5988017325118117, + "learning_rate": 0.0002573550428869447, + "loss": 7.3694, + "step": 17333 + }, + { + "epoch": 1.6174302510030794, + "grad_norm": 0.6242950135980918, + "learning_rate": 0.00025734976014633095, + "loss": 7.3935, + "step": 17334 + }, + { + "epoch": 1.6175235606979566, + "grad_norm": 0.9312307616176626, + "learning_rate": 0.00025734447713275855, + "loss": 7.1916, + "step": 17335 + }, + { + "epoch": 1.6176168703928338, + "grad_norm": 4434.759824912878, + "learning_rate": 0.0002573391938462409, + "loss": 6.8995, + "step": 17336 + }, + { + "epoch": 1.6177101800877112, + "grad_norm": 0.9705609660917752, + "learning_rate": 0.0002573339102867915, + "loss": 7.4944, + "step": 17337 + }, + { + "epoch": 1.6178034897825884, + "grad_norm": 0.9483046122968116, + "learning_rate": 0.0002573286264544238, + "loss": 7.4045, + "step": 17338 + }, + { + "epoch": 1.6178967994774656, + "grad_norm": 0.5720111550891389, + "learning_rate": 0.0002573233423491512, + "loss": 7.1689, + "step": 17339 + }, + { + "epoch": 1.617990109172343, + "grad_norm": 0.6243941839746642, + "learning_rate": 0.0002573180579709871, + "loss": 7.2776, + "step": 17340 + }, + { + "epoch": 1.6180834188672204, + "grad_norm": 0.8384098703168672, + "learning_rate": 0.00025731277331994496, + "loss": 7.0117, + "step": 17341 + }, + { + "epoch": 1.6181767285620976, + "grad_norm": 655.6853875506863, + "learning_rate": 0.0002573074883960382, + "loss": 7.3643, + "step": 17342 + }, + { + "epoch": 1.6182700382569748, + "grad_norm": 0.7235921214727594, + "learning_rate": 0.0002573022031992803, + "loss": 7.1595, + "step": 17343 + }, + { + "epoch": 1.6183633479518522, + "grad_norm": 0.6627822097084832, + "learning_rate": 0.00025729691772968474, + "loss": 7.1439, + "step": 17344 + }, + { + "epoch": 1.6184566576467296, + "grad_norm": 0.8303863264341989, + "learning_rate": 0.00025729163198726486, + "loss": 6.981, + "step": 17345 + }, + { + "epoch": 1.6185499673416068, + "grad_norm": 0.7959192980187725, + "learning_rate": 0.00025728634597203413, + "loss": 7.238, + "step": 17346 + }, + { + "epoch": 1.618643277036484, + "grad_norm": 48160.360785395176, + "learning_rate": 0.00025728105968400605, + "loss": 7.0039, + "step": 17347 + }, + { + "epoch": 1.6187365867313614, + "grad_norm": 1.0468506347662159, + "learning_rate": 0.000257275773123194, + "loss": 7.4981, + "step": 17348 + }, + { + "epoch": 1.6188298964262386, + "grad_norm": 0.5029422337737958, + "learning_rate": 0.0002572704862896114, + "loss": 7.1774, + "step": 17349 + }, + { + "epoch": 1.6189232061211158, + "grad_norm": 0.9200454313959626, + "learning_rate": 0.0002572651991832717, + "loss": 7.4734, + "step": 17350 + }, + { + "epoch": 1.6190165158159933, + "grad_norm": 0.7500095176418825, + "learning_rate": 0.00025725991180418844, + "loss": 7.2968, + "step": 17351 + }, + { + "epoch": 1.6191098255108707, + "grad_norm": 1.2858831963589514, + "learning_rate": 0.00025725462415237497, + "loss": 7.1582, + "step": 17352 + }, + { + "epoch": 1.6192031352057479, + "grad_norm": 0.7983637246600402, + "learning_rate": 0.0002572493362278448, + "loss": 7.2449, + "step": 17353 + }, + { + "epoch": 1.619296444900625, + "grad_norm": 0.7541899168068337, + "learning_rate": 0.0002572440480306113, + "loss": 7.1843, + "step": 17354 + }, + { + "epoch": 1.6193897545955025, + "grad_norm": 1.0466756174695087, + "learning_rate": 0.00025723875956068797, + "loss": 7.3475, + "step": 17355 + }, + { + "epoch": 1.61948306429038, + "grad_norm": 0.591260581769181, + "learning_rate": 0.00025723347081808823, + "loss": 7.1343, + "step": 17356 + }, + { + "epoch": 1.6195763739852571, + "grad_norm": 0.5796420164803323, + "learning_rate": 0.00025722818180282553, + "loss": 7.2136, + "step": 17357 + }, + { + "epoch": 1.6196696836801343, + "grad_norm": 1.0710632122682946, + "learning_rate": 0.00025722289251491336, + "loss": 7.2248, + "step": 17358 + }, + { + "epoch": 1.6197629933750117, + "grad_norm": 0.8413566194021792, + "learning_rate": 0.00025721760295436505, + "loss": 7.1877, + "step": 17359 + }, + { + "epoch": 1.619856303069889, + "grad_norm": 0.7619808639161014, + "learning_rate": 0.0002572123131211942, + "loss": 7.1063, + "step": 17360 + }, + { + "epoch": 1.6199496127647661, + "grad_norm": 9257.124858343921, + "learning_rate": 0.0002572070230154142, + "loss": 7.0489, + "step": 17361 + }, + { + "epoch": 1.6200429224596435, + "grad_norm": 0.6862367075304183, + "learning_rate": 0.0002572017326370385, + "loss": 7.0165, + "step": 17362 + }, + { + "epoch": 1.620136232154521, + "grad_norm": 0.5886884354657081, + "learning_rate": 0.00025719644198608056, + "loss": 7.1796, + "step": 17363 + }, + { + "epoch": 1.6202295418493982, + "grad_norm": 0.797113957341652, + "learning_rate": 0.0002571911510625538, + "loss": 7.3209, + "step": 17364 + }, + { + "epoch": 1.6203228515442754, + "grad_norm": 2521.7803425213915, + "learning_rate": 0.0002571858598664717, + "loss": 6.9984, + "step": 17365 + }, + { + "epoch": 1.6204161612391528, + "grad_norm": 0.8750766399773314, + "learning_rate": 0.0002571805683978477, + "loss": 7.4809, + "step": 17366 + }, + { + "epoch": 1.6205094709340302, + "grad_norm": 0.6651016510982093, + "learning_rate": 0.0002571752766566953, + "loss": 7.0028, + "step": 17367 + }, + { + "epoch": 1.6206027806289074, + "grad_norm": 31999.49674950897, + "learning_rate": 0.0002571699846430279, + "loss": 6.9938, + "step": 17368 + }, + { + "epoch": 1.6206960903237846, + "grad_norm": 0.6089017282986462, + "learning_rate": 0.00025716469235685896, + "loss": 6.9788, + "step": 17369 + }, + { + "epoch": 1.620789400018662, + "grad_norm": 0.6749993360893618, + "learning_rate": 0.00025715939979820195, + "loss": 6.8939, + "step": 17370 + }, + { + "epoch": 1.6208827097135392, + "grad_norm": 5667.272195767727, + "learning_rate": 0.00025715410696707034, + "loss": 7.5537, + "step": 17371 + }, + { + "epoch": 1.6209760194084164, + "grad_norm": 0.958669717395925, + "learning_rate": 0.00025714881386347755, + "loss": 7.2796, + "step": 17372 + }, + { + "epoch": 1.6210693291032938, + "grad_norm": 15730.06723483266, + "learning_rate": 0.0002571435204874371, + "loss": 7.3538, + "step": 17373 + }, + { + "epoch": 1.6211626387981712, + "grad_norm": 0.6682663671364204, + "learning_rate": 0.0002571382268389624, + "loss": 6.8278, + "step": 17374 + }, + { + "epoch": 1.6212559484930484, + "grad_norm": 1.0656595263634565, + "learning_rate": 0.00025713293291806694, + "loss": 7.116, + "step": 17375 + }, + { + "epoch": 1.6213492581879256, + "grad_norm": 0.7074908537352863, + "learning_rate": 0.00025712763872476417, + "loss": 7.299, + "step": 17376 + }, + { + "epoch": 1.621442567882803, + "grad_norm": 0.8940931568661792, + "learning_rate": 0.0002571223442590675, + "loss": 7.5594, + "step": 17377 + }, + { + "epoch": 1.6215358775776805, + "grad_norm": 0.7733747483124731, + "learning_rate": 0.0002571170495209905, + "loss": 7.3199, + "step": 17378 + }, + { + "epoch": 1.6216291872725577, + "grad_norm": 0.9507117126398327, + "learning_rate": 0.00025711175451054657, + "loss": 6.7565, + "step": 17379 + }, + { + "epoch": 1.6217224969674349, + "grad_norm": 0.614612069682539, + "learning_rate": 0.00025710645922774914, + "loss": 7.0599, + "step": 17380 + }, + { + "epoch": 1.6218158066623123, + "grad_norm": 0.6871308886976094, + "learning_rate": 0.00025710116367261174, + "loss": 7.1837, + "step": 17381 + }, + { + "epoch": 1.6219091163571895, + "grad_norm": 1.0849526131637728, + "learning_rate": 0.0002570958678451478, + "loss": 7.6046, + "step": 17382 + }, + { + "epoch": 1.6220024260520667, + "grad_norm": 0.7585062734851142, + "learning_rate": 0.0002570905717453708, + "loss": 7.2782, + "step": 17383 + }, + { + "epoch": 1.622095735746944, + "grad_norm": 0.7549609744458967, + "learning_rate": 0.00025708527537329415, + "loss": 6.8511, + "step": 17384 + }, + { + "epoch": 1.6221890454418215, + "grad_norm": 50518.61205962032, + "learning_rate": 0.0002570799787289314, + "loss": 7.3463, + "step": 17385 + }, + { + "epoch": 1.6222823551366987, + "grad_norm": 0.6944744780163129, + "learning_rate": 0.000257074681812296, + "loss": 7.2503, + "step": 17386 + }, + { + "epoch": 1.622375664831576, + "grad_norm": 1.4556580971184345, + "learning_rate": 0.0002570693846234014, + "loss": 6.8675, + "step": 17387 + }, + { + "epoch": 1.6224689745264533, + "grad_norm": 0.7624643689810092, + "learning_rate": 0.00025706408716226107, + "loss": 7.4495, + "step": 17388 + }, + { + "epoch": 1.6225622842213308, + "grad_norm": 0.5262681754496839, + "learning_rate": 0.0002570587894288885, + "loss": 7.1746, + "step": 17389 + }, + { + "epoch": 1.6226555939162077, + "grad_norm": 0.7153581775204112, + "learning_rate": 0.00025705349142329705, + "loss": 7.4514, + "step": 17390 + }, + { + "epoch": 1.6227489036110851, + "grad_norm": 0.6876545407091919, + "learning_rate": 0.00025704819314550036, + "loss": 7.4519, + "step": 17391 + }, + { + "epoch": 1.6228422133059626, + "grad_norm": 0.6612152192448275, + "learning_rate": 0.0002570428945955118, + "loss": 7.3303, + "step": 17392 + }, + { + "epoch": 1.6229355230008398, + "grad_norm": 16912.50370919575, + "learning_rate": 0.0002570375957733449, + "loss": 7.6015, + "step": 17393 + }, + { + "epoch": 1.623028832695717, + "grad_norm": 0.8272875421105095, + "learning_rate": 0.0002570322966790131, + "loss": 7.1497, + "step": 17394 + }, + { + "epoch": 1.6231221423905944, + "grad_norm": 0.7231915787665073, + "learning_rate": 0.00025702699731252986, + "loss": 7.414, + "step": 17395 + }, + { + "epoch": 1.6232154520854718, + "grad_norm": 0.678391097484089, + "learning_rate": 0.0002570216976739086, + "loss": 7.5017, + "step": 17396 + }, + { + "epoch": 1.623308761780349, + "grad_norm": 0.7329989306412441, + "learning_rate": 0.00025701639776316295, + "loss": 7.1864, + "step": 17397 + }, + { + "epoch": 1.6234020714752262, + "grad_norm": 2169.42850548253, + "learning_rate": 0.00025701109758030627, + "loss": 7.4959, + "step": 17398 + }, + { + "epoch": 1.6234953811701036, + "grad_norm": 3955.326066084483, + "learning_rate": 0.0002570057971253521, + "loss": 7.0575, + "step": 17399 + }, + { + "epoch": 1.623588690864981, + "grad_norm": 0.9669996506677719, + "learning_rate": 0.00025700049639831385, + "loss": 6.9123, + "step": 17400 + }, + { + "epoch": 1.623682000559858, + "grad_norm": 0.5231667279646762, + "learning_rate": 0.00025699519539920505, + "loss": 7.2145, + "step": 17401 + }, + { + "epoch": 1.6237753102547354, + "grad_norm": 0.7467282715283615, + "learning_rate": 0.00025698989412803913, + "loss": 7.1953, + "step": 17402 + }, + { + "epoch": 1.6238686199496128, + "grad_norm": 0.5792363128107976, + "learning_rate": 0.0002569845925848297, + "loss": 7.3056, + "step": 17403 + }, + { + "epoch": 1.62396192964449, + "grad_norm": 0.44326793580709545, + "learning_rate": 0.00025697929076959005, + "loss": 7.2033, + "step": 17404 + }, + { + "epoch": 1.6240552393393672, + "grad_norm": 0.560631040601986, + "learning_rate": 0.0002569739886823337, + "loss": 7.3307, + "step": 17405 + }, + { + "epoch": 1.6241485490342447, + "grad_norm": 0.7525088845620732, + "learning_rate": 0.0002569686863230743, + "loss": 7.1701, + "step": 17406 + }, + { + "epoch": 1.624241858729122, + "grad_norm": 0.9210398303562168, + "learning_rate": 0.0002569633836918252, + "loss": 7.293, + "step": 17407 + }, + { + "epoch": 1.6243351684239993, + "grad_norm": 0.7613568357523653, + "learning_rate": 0.0002569580807885998, + "loss": 7.2071, + "step": 17408 + }, + { + "epoch": 1.6244284781188765, + "grad_norm": 0.7846483143942249, + "learning_rate": 0.00025695277761341183, + "loss": 7.2556, + "step": 17409 + }, + { + "epoch": 1.624521787813754, + "grad_norm": 0.6565860700979234, + "learning_rate": 0.00025694747416627454, + "loss": 7.2104, + "step": 17410 + }, + { + "epoch": 1.6246150975086313, + "grad_norm": 0.7933605310250258, + "learning_rate": 0.00025694217044720157, + "loss": 7.4384, + "step": 17411 + }, + { + "epoch": 1.6247084072035083, + "grad_norm": 0.5396756474715843, + "learning_rate": 0.0002569368664562063, + "loss": 7.2196, + "step": 17412 + }, + { + "epoch": 1.6248017168983857, + "grad_norm": 0.6435480925120461, + "learning_rate": 0.00025693156219330223, + "loss": 7.166, + "step": 17413 + }, + { + "epoch": 1.6248950265932631, + "grad_norm": 0.6215831722568238, + "learning_rate": 0.0002569262576585029, + "loss": 7.0684, + "step": 17414 + }, + { + "epoch": 1.6249883362881403, + "grad_norm": 0.6546629208023256, + "learning_rate": 0.00025692095285182175, + "loss": 7.2539, + "step": 17415 + }, + { + "epoch": 1.6250816459830175, + "grad_norm": 0.8102619752545828, + "learning_rate": 0.00025691564777327233, + "loss": 7.5383, + "step": 17416 + }, + { + "epoch": 1.625174955677895, + "grad_norm": 0.6180240264110981, + "learning_rate": 0.000256910342422868, + "loss": 7.1092, + "step": 17417 + }, + { + "epoch": 1.6252682653727724, + "grad_norm": 0.6041476961558635, + "learning_rate": 0.0002569050368006224, + "loss": 7.3324, + "step": 17418 + }, + { + "epoch": 1.6253615750676496, + "grad_norm": 0.6624773651693775, + "learning_rate": 0.000256899730906549, + "loss": 7.3907, + "step": 17419 + }, + { + "epoch": 1.6254548847625268, + "grad_norm": 219437.9274894567, + "learning_rate": 0.0002568944247406612, + "loss": 7.2932, + "step": 17420 + }, + { + "epoch": 1.6255481944574042, + "grad_norm": 1.1445565703054374, + "learning_rate": 0.00025688911830297256, + "loss": 7.0698, + "step": 17421 + }, + { + "epoch": 1.6256415041522814, + "grad_norm": 0.8982901788715998, + "learning_rate": 0.0002568838115934965, + "loss": 7.1137, + "step": 17422 + }, + { + "epoch": 1.6257348138471586, + "grad_norm": 0.5935446324056722, + "learning_rate": 0.0002568785046122466, + "loss": 7.3718, + "step": 17423 + }, + { + "epoch": 1.625828123542036, + "grad_norm": 0.8606359946741777, + "learning_rate": 0.0002568731973592364, + "loss": 7.3803, + "step": 17424 + }, + { + "epoch": 1.6259214332369134, + "grad_norm": 0.8020491850598579, + "learning_rate": 0.0002568678898344792, + "loss": 7.3128, + "step": 17425 + }, + { + "epoch": 1.6260147429317906, + "grad_norm": 0.7397545462121805, + "learning_rate": 0.00025686258203798867, + "loss": 7.0985, + "step": 17426 + }, + { + "epoch": 1.6261080526266678, + "grad_norm": 99303195.78713717, + "learning_rate": 0.0002568572739697782, + "loss": 7.4021, + "step": 17427 + }, + { + "epoch": 1.6262013623215452, + "grad_norm": 0.8467844565832554, + "learning_rate": 0.00025685196562986144, + "loss": 7.343, + "step": 17428 + }, + { + "epoch": 1.6262946720164226, + "grad_norm": 0.8234084873998608, + "learning_rate": 0.00025684665701825173, + "loss": 7.3645, + "step": 17429 + }, + { + "epoch": 1.6263879817112998, + "grad_norm": 0.9103768025271479, + "learning_rate": 0.0002568413481349626, + "loss": 7.5183, + "step": 17430 + }, + { + "epoch": 1.626481291406177, + "grad_norm": 0.7897556792772911, + "learning_rate": 0.0002568360389800076, + "loss": 7.4888, + "step": 17431 + }, + { + "epoch": 1.6265746011010545, + "grad_norm": 1.1217201644326067, + "learning_rate": 0.00025683072955340016, + "loss": 7.206, + "step": 17432 + }, + { + "epoch": 1.6266679107959316, + "grad_norm": 1.0558900694109463, + "learning_rate": 0.00025682541985515387, + "loss": 6.9388, + "step": 17433 + }, + { + "epoch": 1.6267612204908088, + "grad_norm": 0.7225981618412638, + "learning_rate": 0.00025682010988528213, + "loss": 7.2373, + "step": 17434 + }, + { + "epoch": 1.6268545301856863, + "grad_norm": 2.0195095634281914, + "learning_rate": 0.00025681479964379856, + "loss": 7.6779, + "step": 17435 + }, + { + "epoch": 1.6269478398805637, + "grad_norm": 1.2084329841137185, + "learning_rate": 0.00025680948913071655, + "loss": 7.4928, + "step": 17436 + }, + { + "epoch": 1.6270411495754409, + "grad_norm": 1.2361721839455106, + "learning_rate": 0.00025680417834604964, + "loss": 7.3744, + "step": 17437 + }, + { + "epoch": 1.627134459270318, + "grad_norm": 1.2548475422603833, + "learning_rate": 0.00025679886728981134, + "loss": 7.2269, + "step": 17438 + }, + { + "epoch": 1.6272277689651955, + "grad_norm": 0.9108719202477354, + "learning_rate": 0.0002567935559620152, + "loss": 7.3199, + "step": 17439 + }, + { + "epoch": 1.627321078660073, + "grad_norm": 57183177.04084389, + "learning_rate": 0.0002567882443626746, + "loss": 7.5489, + "step": 17440 + }, + { + "epoch": 1.6274143883549501, + "grad_norm": 1.1236309821445916, + "learning_rate": 0.0002567829324918032, + "loss": 7.3984, + "step": 17441 + }, + { + "epoch": 1.6275076980498273, + "grad_norm": 1.0642067810282902, + "learning_rate": 0.00025677762034941444, + "loss": 7.3024, + "step": 17442 + }, + { + "epoch": 1.6276010077447047, + "grad_norm": 1.1734992503409065, + "learning_rate": 0.00025677230793552175, + "loss": 7.4192, + "step": 17443 + }, + { + "epoch": 1.627694317439582, + "grad_norm": 1.2527696365490122, + "learning_rate": 0.00025676699525013883, + "loss": 7.2715, + "step": 17444 + }, + { + "epoch": 1.6277876271344591, + "grad_norm": 1.7970498820195175, + "learning_rate": 0.00025676168229327894, + "loss": 7.1265, + "step": 17445 + }, + { + "epoch": 1.6278809368293365, + "grad_norm": 1.666362040662685, + "learning_rate": 0.00025675636906495575, + "loss": 7.251, + "step": 17446 + }, + { + "epoch": 1.627974246524214, + "grad_norm": 1.5731177418760607, + "learning_rate": 0.0002567510555651828, + "loss": 7.2622, + "step": 17447 + }, + { + "epoch": 1.6280675562190912, + "grad_norm": 1.286262774183314, + "learning_rate": 0.0002567457417939735, + "loss": 7.3407, + "step": 17448 + }, + { + "epoch": 1.6281608659139684, + "grad_norm": 0.8686532592536486, + "learning_rate": 0.00025674042775134144, + "loss": 7.386, + "step": 17449 + }, + { + "epoch": 1.6282541756088458, + "grad_norm": 0.91336733243087, + "learning_rate": 0.0002567351134373, + "loss": 7.1621, + "step": 17450 + }, + { + "epoch": 1.6283474853037232, + "grad_norm": 1.3285494571429668, + "learning_rate": 0.00025672979885186283, + "loss": 6.9955, + "step": 17451 + }, + { + "epoch": 1.6284407949986004, + "grad_norm": 1.5681538550175351, + "learning_rate": 0.00025672448399504343, + "loss": 7.6982, + "step": 17452 + }, + { + "epoch": 1.6285341046934776, + "grad_norm": 1194644578.8719597, + "learning_rate": 0.00025671916886685527, + "loss": 7.3509, + "step": 17453 + }, + { + "epoch": 1.628627414388355, + "grad_norm": 1.4618784556532696, + "learning_rate": 0.0002567138534673118, + "loss": 7.2481, + "step": 17454 + }, + { + "epoch": 1.6287207240832322, + "grad_norm": 1.6590399740282704, + "learning_rate": 0.00025670853779642666, + "loss": 7.5232, + "step": 17455 + }, + { + "epoch": 1.6288140337781094, + "grad_norm": 1.8970064476503707, + "learning_rate": 0.00025670322185421333, + "loss": 7.2364, + "step": 17456 + }, + { + "epoch": 1.6289073434729868, + "grad_norm": 1.7152771124279733, + "learning_rate": 0.0002566979056406853, + "loss": 7.5623, + "step": 17457 + }, + { + "epoch": 1.6290006531678642, + "grad_norm": 2.1042867337455955, + "learning_rate": 0.00025669258915585614, + "loss": 7.0947, + "step": 17458 + }, + { + "epoch": 1.6290939628627414, + "grad_norm": 1.4239759805924304, + "learning_rate": 0.0002566872723997393, + "loss": 7.3997, + "step": 17459 + }, + { + "epoch": 1.6291872725576186, + "grad_norm": 1.1315904066987537, + "learning_rate": 0.0002566819553723483, + "loss": 7.5159, + "step": 17460 + }, + { + "epoch": 1.629280582252496, + "grad_norm": 1.1225055962630495, + "learning_rate": 0.00025667663807369675, + "loss": 7.201, + "step": 17461 + }, + { + "epoch": 1.6293738919473735, + "grad_norm": 1.3249812473631553, + "learning_rate": 0.00025667132050379807, + "loss": 7.3937, + "step": 17462 + }, + { + "epoch": 1.6294672016422507, + "grad_norm": 1.4573248071009464, + "learning_rate": 0.00025666600266266584, + "loss": 7.087, + "step": 17463 + }, + { + "epoch": 1.6295605113371279, + "grad_norm": 1.0192095743524876, + "learning_rate": 0.00025666068455031354, + "loss": 7.4035, + "step": 17464 + }, + { + "epoch": 1.6296538210320053, + "grad_norm": 1.594875766783479, + "learning_rate": 0.0002566553661667547, + "loss": 6.9953, + "step": 17465 + }, + { + "epoch": 1.6297471307268825, + "grad_norm": 1.775236609479821, + "learning_rate": 0.00025665004751200294, + "loss": 7.3043, + "step": 17466 + }, + { + "epoch": 1.6298404404217597, + "grad_norm": 1.0405544918039151, + "learning_rate": 0.0002566447285860716, + "loss": 7.2561, + "step": 17467 + }, + { + "epoch": 1.629933750116637, + "grad_norm": 1.7239746117045143, + "learning_rate": 0.00025663940938897435, + "loss": 7.7383, + "step": 17468 + }, + { + "epoch": 1.6300270598115145, + "grad_norm": 0.9831840592075357, + "learning_rate": 0.0002566340899207247, + "loss": 7.2966, + "step": 17469 + }, + { + "epoch": 1.6301203695063917, + "grad_norm": 11440833432.168493, + "learning_rate": 0.00025662877018133614, + "loss": 7.1013, + "step": 17470 + }, + { + "epoch": 1.630213679201269, + "grad_norm": 0.7987181574916153, + "learning_rate": 0.00025662345017082215, + "loss": 7.3032, + "step": 17471 + }, + { + "epoch": 1.6303069888961463, + "grad_norm": 1.0337058525537288, + "learning_rate": 0.00025661812988919636, + "loss": 7.0699, + "step": 17472 + }, + { + "epoch": 1.6304002985910238, + "grad_norm": 1.2799795801144627, + "learning_rate": 0.0002566128093364722, + "loss": 7.5464, + "step": 17473 + }, + { + "epoch": 1.630493608285901, + "grad_norm": 1.3851633929682823, + "learning_rate": 0.00025660748851266327, + "loss": 7.3917, + "step": 17474 + }, + { + "epoch": 1.6305869179807781, + "grad_norm": 1.6610249132651207, + "learning_rate": 0.0002566021674177831, + "loss": 7.406, + "step": 17475 + }, + { + "epoch": 1.6306802276756556, + "grad_norm": 0.9950755127269404, + "learning_rate": 0.00025659684605184517, + "loss": 7.2189, + "step": 17476 + }, + { + "epoch": 1.6307735373705328, + "grad_norm": 646477308453.4524, + "learning_rate": 0.00025659152441486307, + "loss": 7.4104, + "step": 17477 + }, + { + "epoch": 1.63086684706541, + "grad_norm": 18479063340.450653, + "learning_rate": 0.00025658620250685026, + "loss": 7.2173, + "step": 17478 + }, + { + "epoch": 1.6309601567602874, + "grad_norm": 137883062795.83316, + "learning_rate": 0.0002565808803278203, + "loss": 7.3543, + "step": 17479 + }, + { + "epoch": 1.6310534664551648, + "grad_norm": 64387385246.6122, + "learning_rate": 0.0002565755578777868, + "loss": 7.1783, + "step": 17480 + }, + { + "epoch": 1.631146776150042, + "grad_norm": 26600739307.688725, + "learning_rate": 0.00025657023515676317, + "loss": 7.4172, + "step": 17481 + }, + { + "epoch": 1.6312400858449192, + "grad_norm": 57301362845.014595, + "learning_rate": 0.000256564912164763, + "loss": 7.4444, + "step": 17482 + }, + { + "epoch": 1.6313333955397966, + "grad_norm": 1.9214345912390243, + "learning_rate": 0.00025655958890179984, + "loss": 7.4135, + "step": 17483 + }, + { + "epoch": 1.631426705234674, + "grad_norm": 1.714430776663373, + "learning_rate": 0.0002565542653678872, + "loss": 7.3994, + "step": 17484 + }, + { + "epoch": 1.6315200149295512, + "grad_norm": 1.3741670572678988, + "learning_rate": 0.0002565489415630387, + "loss": 7.189, + "step": 17485 + }, + { + "epoch": 1.6316133246244284, + "grad_norm": 1.59090522315763, + "learning_rate": 0.0002565436174872677, + "loss": 6.7807, + "step": 17486 + }, + { + "epoch": 1.6317066343193058, + "grad_norm": 0.8356641363281769, + "learning_rate": 0.00025653829314058787, + "loss": 7.4033, + "step": 17487 + }, + { + "epoch": 1.631799944014183, + "grad_norm": 2079342766.212205, + "learning_rate": 0.0002565329685230128, + "loss": 7.3181, + "step": 17488 + }, + { + "epoch": 1.6318932537090602, + "grad_norm": 2915514261.0877676, + "learning_rate": 0.0002565276436345558, + "loss": 7.4668, + "step": 17489 + }, + { + "epoch": 1.6319865634039377, + "grad_norm": 1.5164283201098623, + "learning_rate": 0.0002565223184752307, + "loss": 7.3628, + "step": 17490 + }, + { + "epoch": 1.632079873098815, + "grad_norm": 2.4176083445646372, + "learning_rate": 0.0002565169930450508, + "loss": 7.3618, + "step": 17491 + }, + { + "epoch": 1.6321731827936923, + "grad_norm": 1.3327833849049056, + "learning_rate": 0.00025651166734402977, + "loss": 7.6993, + "step": 17492 + }, + { + "epoch": 1.6322664924885695, + "grad_norm": 1.8517041743197786, + "learning_rate": 0.00025650634137218116, + "loss": 7.1829, + "step": 17493 + }, + { + "epoch": 1.632359802183447, + "grad_norm": 104519767.17651325, + "learning_rate": 0.0002565010151295184, + "loss": 7.4363, + "step": 17494 + }, + { + "epoch": 1.6324531118783243, + "grad_norm": 1.139986582075247, + "learning_rate": 0.00025649568861605516, + "loss": 7.5623, + "step": 17495 + }, + { + "epoch": 1.6325464215732013, + "grad_norm": 1.1462798690765839, + "learning_rate": 0.0002564903618318049, + "loss": 7.392, + "step": 17496 + }, + { + "epoch": 1.6326397312680787, + "grad_norm": 1.3259751460550588, + "learning_rate": 0.0002564850347767812, + "loss": 7.5081, + "step": 17497 + }, + { + "epoch": 1.6327330409629561, + "grad_norm": 2.2483545601556902, + "learning_rate": 0.0002564797074509976, + "loss": 7.6829, + "step": 17498 + }, + { + "epoch": 1.6328263506578333, + "grad_norm": 1.5831712719025903, + "learning_rate": 0.00025647437985446773, + "loss": 7.3178, + "step": 17499 + }, + { + "epoch": 1.6329196603527105, + "grad_norm": 2.085034662696508, + "learning_rate": 0.00025646905198720496, + "loss": 7.4167, + "step": 17500 + }, + { + "epoch": 1.633012970047588, + "grad_norm": 1.3879253850193733, + "learning_rate": 0.000256463723849223, + "loss": 7.4654, + "step": 17501 + }, + { + "epoch": 1.6331062797424654, + "grad_norm": 1.1629582897777346, + "learning_rate": 0.00025645839544053523, + "loss": 7.1972, + "step": 17502 + }, + { + "epoch": 1.6331995894373426, + "grad_norm": 0.8664589592481302, + "learning_rate": 0.0002564530667611553, + "loss": 7.365, + "step": 17503 + }, + { + "epoch": 1.6332928991322198, + "grad_norm": 4790292040.2614565, + "learning_rate": 0.00025644773781109687, + "loss": 7.093, + "step": 17504 + }, + { + "epoch": 1.6333862088270972, + "grad_norm": 1.1595130058835454, + "learning_rate": 0.00025644240859037324, + "loss": 7.3717, + "step": 17505 + }, + { + "epoch": 1.6334795185219746, + "grad_norm": 1.8022054497472588, + "learning_rate": 0.00025643707909899817, + "loss": 7.2236, + "step": 17506 + }, + { + "epoch": 1.6335728282168516, + "grad_norm": 307377113.5803041, + "learning_rate": 0.0002564317493369851, + "loss": 7.1804, + "step": 17507 + }, + { + "epoch": 1.633666137911729, + "grad_norm": 0.8865934686956387, + "learning_rate": 0.0002564264193043477, + "loss": 7.5883, + "step": 17508 + }, + { + "epoch": 1.6337594476066064, + "grad_norm": 0.8416033420593215, + "learning_rate": 0.00025642108900109935, + "loss": 7.4632, + "step": 17509 + }, + { + "epoch": 1.6338527573014836, + "grad_norm": 0.882954976435722, + "learning_rate": 0.00025641575842725374, + "loss": 7.1977, + "step": 17510 + }, + { + "epoch": 1.6339460669963608, + "grad_norm": 1.1841081865570493, + "learning_rate": 0.00025641042758282434, + "loss": 7.4211, + "step": 17511 + }, + { + "epoch": 1.6340393766912382, + "grad_norm": 1619304303.6703308, + "learning_rate": 0.0002564050964678248, + "loss": 7.345, + "step": 17512 + }, + { + "epoch": 1.6341326863861156, + "grad_norm": 2.4474213396207043, + "learning_rate": 0.00025639976508226857, + "loss": 7.5952, + "step": 17513 + }, + { + "epoch": 1.6342259960809928, + "grad_norm": 1.819504619899295, + "learning_rate": 0.0002563944334261693, + "loss": 7.3995, + "step": 17514 + }, + { + "epoch": 1.63431930577587, + "grad_norm": 0.9549429126973346, + "learning_rate": 0.0002563891014995405, + "loss": 7.3637, + "step": 17515 + }, + { + "epoch": 1.6344126154707475, + "grad_norm": 1.1448240281228768, + "learning_rate": 0.00025638376930239566, + "loss": 7.7274, + "step": 17516 + }, + { + "epoch": 1.6345059251656249, + "grad_norm": 0.9665213966247113, + "learning_rate": 0.00025637843683474847, + "loss": 7.4829, + "step": 17517 + }, + { + "epoch": 1.6345992348605018, + "grad_norm": 1.8158187559024328, + "learning_rate": 0.0002563731040966124, + "loss": 6.9813, + "step": 17518 + }, + { + "epoch": 1.6346925445553793, + "grad_norm": 1.1622303547303143, + "learning_rate": 0.00025636777108800103, + "loss": 7.3485, + "step": 17519 + }, + { + "epoch": 1.6347858542502567, + "grad_norm": 1.0602816537361197, + "learning_rate": 0.000256362437808928, + "loss": 7.4418, + "step": 17520 + }, + { + "epoch": 1.6348791639451339, + "grad_norm": 1.1635492616733514, + "learning_rate": 0.0002563571042594067, + "loss": 7.2706, + "step": 17521 + }, + { + "epoch": 1.634972473640011, + "grad_norm": 1.2521740681703752, + "learning_rate": 0.00025635177043945083, + "loss": 7.5549, + "step": 17522 + }, + { + "epoch": 1.6350657833348885, + "grad_norm": 1.1747345637775832, + "learning_rate": 0.0002563464363490739, + "loss": 7.1055, + "step": 17523 + }, + { + "epoch": 1.635159093029766, + "grad_norm": 2.4083886277939226, + "learning_rate": 0.00025634110198828955, + "loss": 7.5626, + "step": 17524 + }, + { + "epoch": 1.6352524027246431, + "grad_norm": 1.2361219989280874, + "learning_rate": 0.00025633576735711123, + "loss": 7.4418, + "step": 17525 + }, + { + "epoch": 1.6353457124195203, + "grad_norm": 1.7648831879075884, + "learning_rate": 0.0002563304324555525, + "loss": 7.3812, + "step": 17526 + }, + { + "epoch": 1.6354390221143977, + "grad_norm": 1.5753367550497699, + "learning_rate": 0.000256325097283627, + "loss": 7.1844, + "step": 17527 + }, + { + "epoch": 1.635532331809275, + "grad_norm": 1.1998344934869172, + "learning_rate": 0.00025631976184134833, + "loss": 7.518, + "step": 17528 + }, + { + "epoch": 1.6356256415041521, + "grad_norm": 1.3550785493508961, + "learning_rate": 0.00025631442612873, + "loss": 7.3537, + "step": 17529 + }, + { + "epoch": 1.6357189511990295, + "grad_norm": 1.8335454125450905, + "learning_rate": 0.0002563090901457855, + "loss": 7.2966, + "step": 17530 + }, + { + "epoch": 1.635812260893907, + "grad_norm": 76361882.40162987, + "learning_rate": 0.0002563037538925285, + "loss": 7.55, + "step": 17531 + }, + { + "epoch": 1.6359055705887842, + "grad_norm": 159476652.4925672, + "learning_rate": 0.0002562984173689726, + "loss": 7.2869, + "step": 17532 + }, + { + "epoch": 1.6359988802836614, + "grad_norm": 1.5085260624093537, + "learning_rate": 0.0002562930805751313, + "loss": 7.2451, + "step": 17533 + }, + { + "epoch": 1.6360921899785388, + "grad_norm": 1.069453266786531, + "learning_rate": 0.0002562877435110181, + "loss": 7.6596, + "step": 17534 + }, + { + "epoch": 1.6361854996734162, + "grad_norm": 5704731.487485493, + "learning_rate": 0.00025628240617664677, + "loss": 7.211, + "step": 17535 + }, + { + "epoch": 1.6362788093682934, + "grad_norm": 1.1801728652430195, + "learning_rate": 0.0002562770685720307, + "loss": 7.2991, + "step": 17536 + }, + { + "epoch": 1.6363721190631706, + "grad_norm": 1.1442226409156804, + "learning_rate": 0.00025627173069718356, + "loss": 7.1225, + "step": 17537 + }, + { + "epoch": 1.636465428758048, + "grad_norm": 1.4197456138566438, + "learning_rate": 0.00025626639255211885, + "loss": 7.1702, + "step": 17538 + }, + { + "epoch": 1.6365587384529252, + "grad_norm": 1.4288710458932925, + "learning_rate": 0.0002562610541368502, + "loss": 7.5438, + "step": 17539 + }, + { + "epoch": 1.6366520481478024, + "grad_norm": 428159.8222496852, + "learning_rate": 0.00025625571545139116, + "loss": 7.1971, + "step": 17540 + }, + { + "epoch": 1.6367453578426798, + "grad_norm": 1.0744300729498475, + "learning_rate": 0.00025625037649575533, + "loss": 7.6457, + "step": 17541 + }, + { + "epoch": 1.6368386675375572, + "grad_norm": 1.2021274188626523, + "learning_rate": 0.0002562450372699562, + "loss": 7.159, + "step": 17542 + }, + { + "epoch": 1.6369319772324344, + "grad_norm": 1.1851544966980379, + "learning_rate": 0.0002562396977740075, + "loss": 7.5416, + "step": 17543 + }, + { + "epoch": 1.6370252869273116, + "grad_norm": 1.0644127602259585, + "learning_rate": 0.00025623435800792266, + "loss": 7.3369, + "step": 17544 + }, + { + "epoch": 1.637118596622189, + "grad_norm": 1.4854025228852072, + "learning_rate": 0.00025622901797171533, + "loss": 7.2062, + "step": 17545 + }, + { + "epoch": 1.6372119063170665, + "grad_norm": 0.9221220670638203, + "learning_rate": 0.0002562236776653991, + "loss": 7.4053, + "step": 17546 + }, + { + "epoch": 1.6373052160119437, + "grad_norm": 1.7267747555017443, + "learning_rate": 0.0002562183370889875, + "loss": 7.0683, + "step": 17547 + }, + { + "epoch": 1.6373985257068209, + "grad_norm": 1.3726032542884643, + "learning_rate": 0.0002562129962424941, + "loss": 7.403, + "step": 17548 + }, + { + "epoch": 1.6374918354016983, + "grad_norm": 1.0433158744490407, + "learning_rate": 0.00025620765512593256, + "loss": 7.4645, + "step": 17549 + }, + { + "epoch": 1.6375851450965755, + "grad_norm": 0.6833093940088424, + "learning_rate": 0.0002562023137393164, + "loss": 7.1082, + "step": 17550 + }, + { + "epoch": 1.6376784547914527, + "grad_norm": 0.8194732440059614, + "learning_rate": 0.0002561969720826592, + "loss": 7.1966, + "step": 17551 + }, + { + "epoch": 1.63777176448633, + "grad_norm": 1.0943616679493684, + "learning_rate": 0.0002561916301559746, + "loss": 7.3697, + "step": 17552 + }, + { + "epoch": 1.6378650741812075, + "grad_norm": 0.882270164563497, + "learning_rate": 0.00025618628795927615, + "loss": 7.4193, + "step": 17553 + }, + { + "epoch": 1.6379583838760847, + "grad_norm": 0.7655801549077027, + "learning_rate": 0.00025618094549257736, + "loss": 7.3139, + "step": 17554 + }, + { + "epoch": 1.638051693570962, + "grad_norm": 0.7769764366657039, + "learning_rate": 0.0002561756027558919, + "loss": 7.3366, + "step": 17555 + }, + { + "epoch": 1.6381450032658393, + "grad_norm": 1.1354712300032117, + "learning_rate": 0.00025617025974923336, + "loss": 7.1862, + "step": 17556 + }, + { + "epoch": 1.6382383129607168, + "grad_norm": 0.7376561939163865, + "learning_rate": 0.0002561649164726153, + "loss": 7.4116, + "step": 17557 + }, + { + "epoch": 1.638331622655594, + "grad_norm": 1.267430065796723, + "learning_rate": 0.00025615957292605127, + "loss": 7.1732, + "step": 17558 + }, + { + "epoch": 1.6384249323504712, + "grad_norm": 0.9597523283499807, + "learning_rate": 0.0002561542291095549, + "loss": 7.4509, + "step": 17559 + }, + { + "epoch": 1.6385182420453486, + "grad_norm": 0.8884636308618528, + "learning_rate": 0.00025614888502313983, + "loss": 7.375, + "step": 17560 + }, + { + "epoch": 1.6386115517402258, + "grad_norm": 0.7697890643366865, + "learning_rate": 0.0002561435406668195, + "loss": 7.3292, + "step": 17561 + }, + { + "epoch": 1.638704861435103, + "grad_norm": 1.3773723095172747, + "learning_rate": 0.0002561381960406076, + "loss": 7.5088, + "step": 17562 + }, + { + "epoch": 1.6387981711299804, + "grad_norm": 1.1468076805070482, + "learning_rate": 0.0002561328511445178, + "loss": 7.1698, + "step": 17563 + }, + { + "epoch": 1.6388914808248578, + "grad_norm": 527536.7342950824, + "learning_rate": 0.0002561275059785635, + "loss": 7.4078, + "step": 17564 + }, + { + "epoch": 1.638984790519735, + "grad_norm": 0.8288627330212042, + "learning_rate": 0.00025612216054275846, + "loss": 7.1137, + "step": 17565 + }, + { + "epoch": 1.6390781002146122, + "grad_norm": 1.0936192643821383, + "learning_rate": 0.00025611681483711616, + "loss": 7.4119, + "step": 17566 + }, + { + "epoch": 1.6391714099094896, + "grad_norm": 48839.47914150913, + "learning_rate": 0.00025611146886165024, + "loss": 7.3366, + "step": 17567 + }, + { + "epoch": 1.639264719604367, + "grad_norm": 92114.86149766482, + "learning_rate": 0.00025610612261637433, + "loss": 7.2355, + "step": 17568 + }, + { + "epoch": 1.6393580292992442, + "grad_norm": 0.7916069998955079, + "learning_rate": 0.0002561007761013019, + "loss": 7.0022, + "step": 17569 + }, + { + "epoch": 1.6394513389941214, + "grad_norm": 1.1914431649875747, + "learning_rate": 0.0002560954293164466, + "loss": 7.4068, + "step": 17570 + }, + { + "epoch": 1.6395446486889989, + "grad_norm": 1.1610455064823435, + "learning_rate": 0.0002560900822618222, + "loss": 7.4578, + "step": 17571 + }, + { + "epoch": 1.639637958383876, + "grad_norm": 0.984856255758308, + "learning_rate": 0.000256084734937442, + "loss": 7.2651, + "step": 17572 + }, + { + "epoch": 1.6397312680787532, + "grad_norm": 0.9138821765695654, + "learning_rate": 0.00025607938734331984, + "loss": 7.2368, + "step": 17573 + }, + { + "epoch": 1.6398245777736307, + "grad_norm": 1.6138746956150098, + "learning_rate": 0.0002560740394794692, + "loss": 7.0805, + "step": 17574 + }, + { + "epoch": 1.639917887468508, + "grad_norm": 0.7017607206411941, + "learning_rate": 0.0002560686913459037, + "loss": 7.6831, + "step": 17575 + }, + { + "epoch": 1.6400111971633853, + "grad_norm": 1.2297870514798253, + "learning_rate": 0.0002560633429426369, + "loss": 7.5156, + "step": 17576 + }, + { + "epoch": 1.6401045068582625, + "grad_norm": 1.2424442810585115, + "learning_rate": 0.00025605799426968245, + "loss": 7.0063, + "step": 17577 + }, + { + "epoch": 1.64019781655314, + "grad_norm": 1.1169554528079528, + "learning_rate": 0.00025605264532705394, + "loss": 7.542, + "step": 17578 + }, + { + "epoch": 1.6402911262480173, + "grad_norm": 1.1378958389048117, + "learning_rate": 0.000256047296114765, + "loss": 6.9173, + "step": 17579 + }, + { + "epoch": 1.6403844359428945, + "grad_norm": 65517.674932088994, + "learning_rate": 0.0002560419466328292, + "loss": 7.1682, + "step": 17580 + }, + { + "epoch": 1.6404777456377717, + "grad_norm": 1.776700758286685, + "learning_rate": 0.0002560365968812601, + "loss": 7.2048, + "step": 17581 + }, + { + "epoch": 1.6405710553326491, + "grad_norm": 1.2776628034670081, + "learning_rate": 0.00025603124686007133, + "loss": 7.4435, + "step": 17582 + }, + { + "epoch": 1.6406643650275263, + "grad_norm": 1.050375856980552, + "learning_rate": 0.0002560258965692766, + "loss": 7.3035, + "step": 17583 + }, + { + "epoch": 1.6407576747224035, + "grad_norm": 78089.10649574657, + "learning_rate": 0.00025602054600888934, + "loss": 7.1243, + "step": 17584 + }, + { + "epoch": 1.640850984417281, + "grad_norm": 253446.3987048877, + "learning_rate": 0.00025601519517892327, + "loss": 7.3654, + "step": 17585 + }, + { + "epoch": 1.6409442941121584, + "grad_norm": 384788.3947072901, + "learning_rate": 0.00025600984407939196, + "loss": 7.493, + "step": 17586 + }, + { + "epoch": 1.6410376038070356, + "grad_norm": 1.043367355947255, + "learning_rate": 0.000256004492710309, + "loss": 7.1959, + "step": 17587 + }, + { + "epoch": 1.6411309135019128, + "grad_norm": 472260.76066806173, + "learning_rate": 0.00025599914107168805, + "loss": 7.1034, + "step": 17588 + }, + { + "epoch": 1.6412242231967902, + "grad_norm": 0.8797910379928556, + "learning_rate": 0.0002559937891635426, + "loss": 7.6831, + "step": 17589 + }, + { + "epoch": 1.6413175328916676, + "grad_norm": 1.1594465436522743, + "learning_rate": 0.00025598843698588646, + "loss": 7.2943, + "step": 17590 + }, + { + "epoch": 1.6414108425865448, + "grad_norm": 99071.79757273782, + "learning_rate": 0.00025598308453873305, + "loss": 7.3197, + "step": 17591 + }, + { + "epoch": 1.641504152281422, + "grad_norm": 374554.7825009841, + "learning_rate": 0.0002559777318220961, + "loss": 7.3706, + "step": 17592 + }, + { + "epoch": 1.6415974619762994, + "grad_norm": 1.2682042614193838, + "learning_rate": 0.0002559723788359891, + "loss": 7.2129, + "step": 17593 + }, + { + "epoch": 1.6416907716711766, + "grad_norm": 0.8866651927007285, + "learning_rate": 0.0002559670255804258, + "loss": 7.1008, + "step": 17594 + }, + { + "epoch": 1.6417840813660538, + "grad_norm": 0.9708028692363815, + "learning_rate": 0.0002559616720554197, + "loss": 6.8387, + "step": 17595 + }, + { + "epoch": 1.6418773910609312, + "grad_norm": 0.7363969146429838, + "learning_rate": 0.00025595631826098446, + "loss": 7.3161, + "step": 17596 + }, + { + "epoch": 1.6419707007558086, + "grad_norm": 1.70361380530857, + "learning_rate": 0.0002559509641971337, + "loss": 7.403, + "step": 17597 + }, + { + "epoch": 1.6420640104506858, + "grad_norm": 1.7048664785613477, + "learning_rate": 0.000255945609863881, + "loss": 7.7682, + "step": 17598 + }, + { + "epoch": 1.642157320145563, + "grad_norm": 0.8173606644607705, + "learning_rate": 0.00025594025526124006, + "loss": 6.8763, + "step": 17599 + }, + { + "epoch": 1.6422506298404405, + "grad_norm": 0.7454504712010255, + "learning_rate": 0.00025593490038922437, + "loss": 7.43, + "step": 17600 + }, + { + "epoch": 1.6423439395353179, + "grad_norm": 0.8885519487600739, + "learning_rate": 0.00025592954524784764, + "loss": 7.5301, + "step": 17601 + }, + { + "epoch": 1.6424372492301949, + "grad_norm": 1.263616626540881, + "learning_rate": 0.00025592418983712344, + "loss": 7.7349, + "step": 17602 + }, + { + "epoch": 1.6425305589250723, + "grad_norm": 1.0266255069420038, + "learning_rate": 0.00025591883415706536, + "loss": 7.3148, + "step": 17603 + }, + { + "epoch": 1.6426238686199497, + "grad_norm": 1.167484231108126, + "learning_rate": 0.00025591347820768713, + "loss": 7.176, + "step": 17604 + }, + { + "epoch": 1.6427171783148269, + "grad_norm": 0.997269810076388, + "learning_rate": 0.0002559081219890023, + "loss": 7.2153, + "step": 17605 + }, + { + "epoch": 1.642810488009704, + "grad_norm": 1.2322419021428332, + "learning_rate": 0.00025590276550102446, + "loss": 6.9242, + "step": 17606 + }, + { + "epoch": 1.6429037977045815, + "grad_norm": 0.8670647415750684, + "learning_rate": 0.00025589740874376723, + "loss": 6.8475, + "step": 17607 + }, + { + "epoch": 1.642997107399459, + "grad_norm": 0.9418982454915007, + "learning_rate": 0.0002558920517172443, + "loss": 7.3556, + "step": 17608 + }, + { + "epoch": 1.6430904170943361, + "grad_norm": 1.0581675420690113, + "learning_rate": 0.00025588669442146926, + "loss": 7.21, + "step": 17609 + }, + { + "epoch": 1.6431837267892133, + "grad_norm": 1.0476362679257423, + "learning_rate": 0.0002558813368564557, + "loss": 7.3517, + "step": 17610 + }, + { + "epoch": 1.6432770364840907, + "grad_norm": 0.9141290550686131, + "learning_rate": 0.00025587597902221727, + "loss": 7.2126, + "step": 17611 + }, + { + "epoch": 1.6433703461789682, + "grad_norm": 0.7182929439452388, + "learning_rate": 0.0002558706209187676, + "loss": 7.4873, + "step": 17612 + }, + { + "epoch": 1.6434636558738451, + "grad_norm": 0.8883116082809691, + "learning_rate": 0.0002558652625461203, + "loss": 7.2127, + "step": 17613 + }, + { + "epoch": 1.6435569655687225, + "grad_norm": 1.1920618265761638, + "learning_rate": 0.0002558599039042889, + "loss": 7.1756, + "step": 17614 + }, + { + "epoch": 1.6436502752636, + "grad_norm": 0.8441634029006047, + "learning_rate": 0.00025585454499328723, + "loss": 7.4348, + "step": 17615 + }, + { + "epoch": 1.6437435849584772, + "grad_norm": 0.9745069282760238, + "learning_rate": 0.0002558491858131288, + "loss": 7.3805, + "step": 17616 + }, + { + "epoch": 1.6438368946533544, + "grad_norm": 1.0494096030826017, + "learning_rate": 0.00025584382636382726, + "loss": 7.1377, + "step": 17617 + }, + { + "epoch": 1.6439302043482318, + "grad_norm": 0.8934590235260701, + "learning_rate": 0.00025583846664539614, + "loss": 7.2972, + "step": 17618 + }, + { + "epoch": 1.6440235140431092, + "grad_norm": 1.6306927920041394, + "learning_rate": 0.00025583310665784923, + "loss": 7.1062, + "step": 17619 + }, + { + "epoch": 1.6441168237379864, + "grad_norm": 1.1372798970615072, + "learning_rate": 0.0002558277464012001, + "loss": 7.5263, + "step": 17620 + }, + { + "epoch": 1.6442101334328636, + "grad_norm": 1.3199985333542839, + "learning_rate": 0.0002558223858754623, + "loss": 7.3176, + "step": 17621 + }, + { + "epoch": 1.644303443127741, + "grad_norm": 0.7932837206090242, + "learning_rate": 0.0002558170250806495, + "loss": 7.0151, + "step": 17622 + }, + { + "epoch": 1.6443967528226184, + "grad_norm": 151680.07091007294, + "learning_rate": 0.00025581166401677537, + "loss": 6.9749, + "step": 17623 + }, + { + "epoch": 1.6444900625174954, + "grad_norm": 1.0038003776858924, + "learning_rate": 0.0002558063026838536, + "loss": 7.2884, + "step": 17624 + }, + { + "epoch": 1.6445833722123728, + "grad_norm": 0.945354913956891, + "learning_rate": 0.00025580094108189767, + "loss": 6.8929, + "step": 17625 + }, + { + "epoch": 1.6446766819072502, + "grad_norm": 0.6823769039731666, + "learning_rate": 0.00025579557921092126, + "loss": 7.3036, + "step": 17626 + }, + { + "epoch": 1.6447699916021274, + "grad_norm": 0.9480178634230257, + "learning_rate": 0.0002557902170709381, + "loss": 7.09, + "step": 17627 + }, + { + "epoch": 1.6448633012970046, + "grad_norm": 1.0866685463513286, + "learning_rate": 0.0002557848546619617, + "loss": 6.696, + "step": 17628 + }, + { + "epoch": 1.644956610991882, + "grad_norm": 1.1281287924488144, + "learning_rate": 0.00025577949198400576, + "loss": 7.2567, + "step": 17629 + }, + { + "epoch": 1.6450499206867595, + "grad_norm": 136965.23243614656, + "learning_rate": 0.000255774129037084, + "loss": 7.1899, + "step": 17630 + }, + { + "epoch": 1.6451432303816367, + "grad_norm": 1.162179013913418, + "learning_rate": 0.0002557687658212098, + "loss": 7.5201, + "step": 17631 + }, + { + "epoch": 1.6452365400765139, + "grad_norm": 0.7338731781272128, + "learning_rate": 0.00025576340233639705, + "loss": 7.1233, + "step": 17632 + }, + { + "epoch": 1.6453298497713913, + "grad_norm": 0.8279080984087422, + "learning_rate": 0.0002557580385826593, + "loss": 7.0526, + "step": 17633 + }, + { + "epoch": 1.6454231594662685, + "grad_norm": 1.1789472045980633, + "learning_rate": 0.00025575267456001016, + "loss": 7.0513, + "step": 17634 + }, + { + "epoch": 1.6455164691611457, + "grad_norm": 0.8585960612130172, + "learning_rate": 0.00025574731026846334, + "loss": 7.1411, + "step": 17635 + }, + { + "epoch": 1.645609778856023, + "grad_norm": 0.9624545898207125, + "learning_rate": 0.00025574194570803237, + "loss": 7.3749, + "step": 17636 + }, + { + "epoch": 1.6457030885509005, + "grad_norm": 760821.5277873783, + "learning_rate": 0.000255736580878731, + "loss": 7.1208, + "step": 17637 + }, + { + "epoch": 1.6457963982457777, + "grad_norm": 0.7100098941295249, + "learning_rate": 0.00025573121578057286, + "loss": 6.9374, + "step": 17638 + }, + { + "epoch": 1.645889707940655, + "grad_norm": 0.9164362747508927, + "learning_rate": 0.0002557258504135715, + "loss": 7.2435, + "step": 17639 + }, + { + "epoch": 1.6459830176355323, + "grad_norm": 0.6207755785019797, + "learning_rate": 0.0002557204847777406, + "loss": 7.1982, + "step": 17640 + }, + { + "epoch": 1.6460763273304098, + "grad_norm": 196508.09858082532, + "learning_rate": 0.0002557151188730939, + "loss": 7.1682, + "step": 17641 + }, + { + "epoch": 1.646169637025287, + "grad_norm": 0.7088940084328272, + "learning_rate": 0.0002557097526996449, + "loss": 7.0468, + "step": 17642 + }, + { + "epoch": 1.6462629467201642, + "grad_norm": 0.9347823361733205, + "learning_rate": 0.00025570438625740734, + "loss": 7.199, + "step": 17643 + }, + { + "epoch": 1.6463562564150416, + "grad_norm": 1.3110820041146813, + "learning_rate": 0.00025569901954639485, + "loss": 7.3955, + "step": 17644 + }, + { + "epoch": 1.6464495661099188, + "grad_norm": 0.6562839441806219, + "learning_rate": 0.000255693652566621, + "loss": 7.0996, + "step": 17645 + }, + { + "epoch": 1.646542875804796, + "grad_norm": 0.6375791402559572, + "learning_rate": 0.0002556882853180996, + "loss": 7.2079, + "step": 17646 + }, + { + "epoch": 1.6466361854996734, + "grad_norm": 0.5879691104061988, + "learning_rate": 0.00025568291780084414, + "loss": 7.1491, + "step": 17647 + }, + { + "epoch": 1.6467294951945508, + "grad_norm": 0.5721251818340984, + "learning_rate": 0.00025567755001486834, + "loss": 7.1888, + "step": 17648 + }, + { + "epoch": 1.646822804889428, + "grad_norm": 347195.4254096301, + "learning_rate": 0.0002556721819601858, + "loss": 7.1657, + "step": 17649 + }, + { + "epoch": 1.6469161145843052, + "grad_norm": 1.6265988304304821, + "learning_rate": 0.00025566681363681026, + "loss": 7.5082, + "step": 17650 + }, + { + "epoch": 1.6470094242791826, + "grad_norm": 0.8147271023973154, + "learning_rate": 0.0002556614450447553, + "loss": 6.836, + "step": 17651 + }, + { + "epoch": 1.64710273397406, + "grad_norm": 1.2864220562819473, + "learning_rate": 0.00025565607618403455, + "loss": 6.7892, + "step": 17652 + }, + { + "epoch": 1.6471960436689372, + "grad_norm": 0.8133532802969826, + "learning_rate": 0.00025565070705466173, + "loss": 6.9659, + "step": 17653 + }, + { + "epoch": 1.6472893533638144, + "grad_norm": 1.470803237093435, + "learning_rate": 0.00025564533765665045, + "loss": 7.0155, + "step": 17654 + }, + { + "epoch": 1.6473826630586919, + "grad_norm": 1.127470313702874, + "learning_rate": 0.0002556399679900143, + "loss": 7.2513, + "step": 17655 + }, + { + "epoch": 1.647475972753569, + "grad_norm": 1.0140832540488094, + "learning_rate": 0.00025563459805476714, + "loss": 7.0346, + "step": 17656 + }, + { + "epoch": 1.6475692824484462, + "grad_norm": 1.172365035993298, + "learning_rate": 0.0002556292278509224, + "loss": 6.7497, + "step": 17657 + }, + { + "epoch": 1.6476625921433237, + "grad_norm": 85749.44276381304, + "learning_rate": 0.00025562385737849385, + "loss": 7.4928, + "step": 17658 + }, + { + "epoch": 1.647755901838201, + "grad_norm": 0.6391213586718476, + "learning_rate": 0.0002556184866374951, + "loss": 7.1492, + "step": 17659 + }, + { + "epoch": 1.6478492115330783, + "grad_norm": 1.14151971801151, + "learning_rate": 0.00025561311562793986, + "loss": 7.3298, + "step": 17660 + }, + { + "epoch": 1.6479425212279555, + "grad_norm": 1.0038548499447777, + "learning_rate": 0.00025560774434984167, + "loss": 7.3083, + "step": 17661 + }, + { + "epoch": 1.648035830922833, + "grad_norm": 1.0488581286078769, + "learning_rate": 0.00025560237280321436, + "loss": 7.1179, + "step": 17662 + }, + { + "epoch": 1.6481291406177103, + "grad_norm": 0.9782218809367728, + "learning_rate": 0.0002555970009880715, + "loss": 7.2681, + "step": 17663 + }, + { + "epoch": 1.6482224503125875, + "grad_norm": 1.2510479840677524, + "learning_rate": 0.0002555916289044267, + "loss": 7.3254, + "step": 17664 + }, + { + "epoch": 1.6483157600074647, + "grad_norm": 0.556053919614734, + "learning_rate": 0.00025558625655229365, + "loss": 6.8287, + "step": 17665 + }, + { + "epoch": 1.6484090697023421, + "grad_norm": 149204.52118867403, + "learning_rate": 0.00025558088393168603, + "loss": 6.8363, + "step": 17666 + }, + { + "epoch": 1.6485023793972193, + "grad_norm": 118852.25263388902, + "learning_rate": 0.00025557551104261753, + "loss": 6.9044, + "step": 17667 + }, + { + "epoch": 1.6485956890920965, + "grad_norm": 85422.12805866206, + "learning_rate": 0.0002555701378851018, + "loss": 6.8205, + "step": 17668 + }, + { + "epoch": 1.648688998786974, + "grad_norm": 139717.90498712237, + "learning_rate": 0.00025556476445915246, + "loss": 7.3206, + "step": 17669 + }, + { + "epoch": 1.6487823084818514, + "grad_norm": 0.6513097391903481, + "learning_rate": 0.00025555939076478315, + "loss": 7.128, + "step": 17670 + }, + { + "epoch": 1.6488756181767286, + "grad_norm": 514852.2303147736, + "learning_rate": 0.0002555540168020076, + "loss": 7.1091, + "step": 17671 + }, + { + "epoch": 1.6489689278716058, + "grad_norm": 1.5184676158662145, + "learning_rate": 0.0002555486425708395, + "loss": 7.1489, + "step": 17672 + }, + { + "epoch": 1.6490622375664832, + "grad_norm": 1.490256731113538, + "learning_rate": 0.00025554326807129243, + "loss": 6.7975, + "step": 17673 + }, + { + "epoch": 1.6491555472613606, + "grad_norm": 1193549.007961243, + "learning_rate": 0.00025553789330338007, + "loss": 6.9845, + "step": 17674 + }, + { + "epoch": 1.6492488569562378, + "grad_norm": 0.9185498749196354, + "learning_rate": 0.00025553251826711617, + "loss": 7.0599, + "step": 17675 + }, + { + "epoch": 1.649342166651115, + "grad_norm": 1674562.0214482294, + "learning_rate": 0.0002555271429625143, + "loss": 6.9994, + "step": 17676 + }, + { + "epoch": 1.6494354763459924, + "grad_norm": 11940.926843768266, + "learning_rate": 0.00025552176738958816, + "loss": 7.1737, + "step": 17677 + }, + { + "epoch": 1.6495287860408696, + "grad_norm": 1453384.7237918659, + "learning_rate": 0.0002555163915483515, + "loss": 7.0921, + "step": 17678 + }, + { + "epoch": 1.6496220957357468, + "grad_norm": 209768.57910803228, + "learning_rate": 0.0002555110154388178, + "loss": 7.5023, + "step": 17679 + }, + { + "epoch": 1.6497154054306242, + "grad_norm": 0.9139368824665468, + "learning_rate": 0.0002555056390610009, + "loss": 6.905, + "step": 17680 + }, + { + "epoch": 1.6498087151255016, + "grad_norm": 0.6340513944440007, + "learning_rate": 0.00025550026241491443, + "loss": 7.3307, + "step": 17681 + }, + { + "epoch": 1.6499020248203788, + "grad_norm": 0.8374285104878592, + "learning_rate": 0.000255494885500572, + "loss": 7.6087, + "step": 17682 + }, + { + "epoch": 1.649995334515256, + "grad_norm": 685470.9682801878, + "learning_rate": 0.00025548950831798733, + "loss": 7.381, + "step": 17683 + }, + { + "epoch": 1.6500886442101335, + "grad_norm": 0.9686684029532924, + "learning_rate": 0.0002554841308671741, + "loss": 7.4634, + "step": 17684 + }, + { + "epoch": 1.6501819539050109, + "grad_norm": 0.9526713620532726, + "learning_rate": 0.00025547875314814596, + "loss": 7.3316, + "step": 17685 + }, + { + "epoch": 1.650275263599888, + "grad_norm": 1.0040369674169787, + "learning_rate": 0.0002554733751609166, + "loss": 7.3804, + "step": 17686 + }, + { + "epoch": 1.6503685732947653, + "grad_norm": 122189.11373995604, + "learning_rate": 0.0002554679969054997, + "loss": 7.3003, + "step": 17687 + }, + { + "epoch": 1.6504618829896427, + "grad_norm": 159040.8000929575, + "learning_rate": 0.00025546261838190897, + "loss": 7.4761, + "step": 17688 + }, + { + "epoch": 1.65055519268452, + "grad_norm": 697850.4268658475, + "learning_rate": 0.00025545723959015794, + "loss": 7.3473, + "step": 17689 + }, + { + "epoch": 1.650648502379397, + "grad_norm": 1.4009850202158634, + "learning_rate": 0.00025545186053026046, + "loss": 7.3608, + "step": 17690 + }, + { + "epoch": 1.6507418120742745, + "grad_norm": 466977.1941505323, + "learning_rate": 0.00025544648120223013, + "loss": 7.2886, + "step": 17691 + }, + { + "epoch": 1.650835121769152, + "grad_norm": 2.063531810133267, + "learning_rate": 0.0002554411016060806, + "loss": 7.1909, + "step": 17692 + }, + { + "epoch": 1.6509284314640291, + "grad_norm": 2.2490055110638387, + "learning_rate": 0.00025543572174182563, + "loss": 7.5817, + "step": 17693 + }, + { + "epoch": 1.6510217411589063, + "grad_norm": 2.4208414450632345, + "learning_rate": 0.00025543034160947884, + "loss": 7.3561, + "step": 17694 + }, + { + "epoch": 1.6511150508537837, + "grad_norm": 237116.68529827052, + "learning_rate": 0.0002554249612090539, + "loss": 7.4718, + "step": 17695 + }, + { + "epoch": 1.6512083605486612, + "grad_norm": 64677.96225567096, + "learning_rate": 0.0002554195805405645, + "loss": 7.311, + "step": 17696 + }, + { + "epoch": 1.6513016702435384, + "grad_norm": 2.8464144223141767, + "learning_rate": 0.0002554141996040244, + "loss": 7.269, + "step": 17697 + }, + { + "epoch": 1.6513949799384156, + "grad_norm": 4.176516161009186, + "learning_rate": 0.00025540881839944716, + "loss": 7.8835, + "step": 17698 + }, + { + "epoch": 1.651488289633293, + "grad_norm": 3.2230081511430844, + "learning_rate": 0.00025540343692684656, + "loss": 7.7112, + "step": 17699 + }, + { + "epoch": 1.6515815993281702, + "grad_norm": 24125.329831173724, + "learning_rate": 0.0002553980551862362, + "loss": 7.1883, + "step": 17700 + }, + { + "epoch": 1.6516749090230474, + "grad_norm": 0.7050332429166883, + "learning_rate": 0.0002553926731776298, + "loss": 7.3924, + "step": 17701 + }, + { + "epoch": 1.6517682187179248, + "grad_norm": 15920.062670607982, + "learning_rate": 0.0002553872909010411, + "loss": 7.2486, + "step": 17702 + }, + { + "epoch": 1.6518615284128022, + "grad_norm": 1.3152540495230842, + "learning_rate": 0.0002553819083564837, + "loss": 7.6131, + "step": 17703 + }, + { + "epoch": 1.6519548381076794, + "grad_norm": 2.123492166305262, + "learning_rate": 0.0002553765255439713, + "loss": 7.5274, + "step": 17704 + }, + { + "epoch": 1.6520481478025566, + "grad_norm": 3.059830708939258, + "learning_rate": 0.0002553711424635177, + "loss": 6.9946, + "step": 17705 + }, + { + "epoch": 1.652141457497434, + "grad_norm": 2.4512271061491524, + "learning_rate": 0.00025536575911513644, + "loss": 7.4119, + "step": 17706 + }, + { + "epoch": 1.6522347671923114, + "grad_norm": 1.4238952386994796, + "learning_rate": 0.00025536037549884127, + "loss": 7.9165, + "step": 17707 + }, + { + "epoch": 1.6523280768871884, + "grad_norm": 1.857426621207711, + "learning_rate": 0.00025535499161464587, + "loss": 7.1235, + "step": 17708 + }, + { + "epoch": 1.6524213865820658, + "grad_norm": 6269.523008494343, + "learning_rate": 0.00025534960746256396, + "loss": 7.3379, + "step": 17709 + }, + { + "epoch": 1.6525146962769433, + "grad_norm": 1719.6202562652609, + "learning_rate": 0.0002553442230426092, + "loss": 7.3171, + "step": 17710 + }, + { + "epoch": 1.6526080059718204, + "grad_norm": 0.9399240318556349, + "learning_rate": 0.0002553388383547952, + "loss": 7.252, + "step": 17711 + }, + { + "epoch": 1.6527013156666976, + "grad_norm": 1.5048738049965422, + "learning_rate": 0.00025533345339913584, + "loss": 7.426, + "step": 17712 + }, + { + "epoch": 1.652794625361575, + "grad_norm": 1.805779748014464, + "learning_rate": 0.0002553280681756447, + "loss": 7.8256, + "step": 17713 + }, + { + "epoch": 1.6528879350564525, + "grad_norm": 0.6244818863150177, + "learning_rate": 0.00025532268268433543, + "loss": 7.3274, + "step": 17714 + }, + { + "epoch": 1.6529812447513297, + "grad_norm": 0.9437481257611906, + "learning_rate": 0.00025531729692522185, + "loss": 7.1676, + "step": 17715 + }, + { + "epoch": 1.6530745544462069, + "grad_norm": 0.8377938738245895, + "learning_rate": 0.00025531191089831753, + "loss": 7.479, + "step": 17716 + }, + { + "epoch": 1.6531678641410843, + "grad_norm": 1.3667880033706998, + "learning_rate": 0.0002553065246036362, + "loss": 7.2885, + "step": 17717 + }, + { + "epoch": 1.6532611738359617, + "grad_norm": 1.0655164495383695, + "learning_rate": 0.0002553011380411916, + "loss": 7.5204, + "step": 17718 + }, + { + "epoch": 1.6533544835308387, + "grad_norm": 0.9617108098224161, + "learning_rate": 0.00025529575121099743, + "loss": 7.3943, + "step": 17719 + }, + { + "epoch": 1.6534477932257161, + "grad_norm": 1.1716007486109172, + "learning_rate": 0.00025529036411306726, + "loss": 7.5785, + "step": 17720 + }, + { + "epoch": 1.6535411029205935, + "grad_norm": 1.1347577786907554, + "learning_rate": 0.000255284976747415, + "loss": 7.629, + "step": 17721 + }, + { + "epoch": 1.6536344126154707, + "grad_norm": 1.100677542251725, + "learning_rate": 0.00025527958911405416, + "loss": 7.4721, + "step": 17722 + }, + { + "epoch": 1.653727722310348, + "grad_norm": 1.032068531152387, + "learning_rate": 0.0002552742012129985, + "loss": 7.1895, + "step": 17723 + }, + { + "epoch": 1.6538210320052253, + "grad_norm": 464.16678886822666, + "learning_rate": 0.0002552688130442618, + "loss": 7.4778, + "step": 17724 + }, + { + "epoch": 1.6539143417001028, + "grad_norm": 0.8594042010989394, + "learning_rate": 0.00025526342460785765, + "loss": 7.3619, + "step": 17725 + }, + { + "epoch": 1.65400765139498, + "grad_norm": 1.401671817636308, + "learning_rate": 0.0002552580359037998, + "loss": 7.6188, + "step": 17726 + }, + { + "epoch": 1.6541009610898572, + "grad_norm": 2036.7669527334422, + "learning_rate": 0.00025525264693210194, + "loss": 6.9804, + "step": 17727 + }, + { + "epoch": 1.6541942707847346, + "grad_norm": 0.7272482778609936, + "learning_rate": 0.00025524725769277777, + "loss": 7.4057, + "step": 17728 + }, + { + "epoch": 1.654287580479612, + "grad_norm": 0.9876750542341646, + "learning_rate": 0.000255241868185841, + "loss": 7.7715, + "step": 17729 + }, + { + "epoch": 1.654380890174489, + "grad_norm": 1.1485263368872427, + "learning_rate": 0.0002552364784113053, + "loss": 7.3976, + "step": 17730 + }, + { + "epoch": 1.6544741998693664, + "grad_norm": 311.5103864775103, + "learning_rate": 0.0002552310883691845, + "loss": 7.2039, + "step": 17731 + }, + { + "epoch": 1.6545675095642438, + "grad_norm": 1.0076987958906591, + "learning_rate": 0.00025522569805949214, + "loss": 7.377, + "step": 17732 + }, + { + "epoch": 1.654660819259121, + "grad_norm": 1.2578179418092676, + "learning_rate": 0.00025522030748224205, + "loss": 7.3869, + "step": 17733 + }, + { + "epoch": 1.6547541289539982, + "grad_norm": 1.021893668893194, + "learning_rate": 0.00025521491663744783, + "loss": 7.3525, + "step": 17734 + }, + { + "epoch": 1.6548474386488756, + "grad_norm": 1.2192401986309656, + "learning_rate": 0.0002552095255251233, + "loss": 7.3606, + "step": 17735 + }, + { + "epoch": 1.654940748343753, + "grad_norm": 1623.9023880280429, + "learning_rate": 0.00025520413414528205, + "loss": 7.2208, + "step": 17736 + }, + { + "epoch": 1.6550340580386302, + "grad_norm": 0.6774628116501357, + "learning_rate": 0.0002551987424979379, + "loss": 7.4156, + "step": 17737 + }, + { + "epoch": 1.6551273677335074, + "grad_norm": 0.7400395261882837, + "learning_rate": 0.0002551933505831045, + "loss": 6.9886, + "step": 17738 + }, + { + "epoch": 1.6552206774283849, + "grad_norm": 1.178132649144788, + "learning_rate": 0.00025518795840079556, + "loss": 7.4839, + "step": 17739 + }, + { + "epoch": 1.655313987123262, + "grad_norm": 1.4676234860361257, + "learning_rate": 0.0002551825659510248, + "loss": 7.4831, + "step": 17740 + }, + { + "epoch": 1.6554072968181393, + "grad_norm": 1.0589982276051435, + "learning_rate": 0.000255177173233806, + "loss": 7.2598, + "step": 17741 + }, + { + "epoch": 1.6555006065130167, + "grad_norm": 0.7936533213205939, + "learning_rate": 0.00025517178024915267, + "loss": 7.2196, + "step": 17742 + }, + { + "epoch": 1.655593916207894, + "grad_norm": 1.425502982548135, + "learning_rate": 0.00025516638699707873, + "loss": 7.5955, + "step": 17743 + }, + { + "epoch": 1.6556872259027713, + "grad_norm": 1.2811207907643647, + "learning_rate": 0.00025516099347759787, + "loss": 7.7704, + "step": 17744 + }, + { + "epoch": 1.6557805355976485, + "grad_norm": 0.6062089701578371, + "learning_rate": 0.0002551555996907237, + "loss": 7.5086, + "step": 17745 + }, + { + "epoch": 1.655873845292526, + "grad_norm": 1.3901512441144082, + "learning_rate": 0.00025515020563647, + "loss": 7.2035, + "step": 17746 + }, + { + "epoch": 1.6559671549874033, + "grad_norm": 0.9537137691812098, + "learning_rate": 0.00025514481131485043, + "loss": 7.325, + "step": 17747 + }, + { + "epoch": 1.6560604646822805, + "grad_norm": 0.6805395917264849, + "learning_rate": 0.0002551394167258788, + "loss": 7.6981, + "step": 17748 + }, + { + "epoch": 1.6561537743771577, + "grad_norm": 1.4501371740498357, + "learning_rate": 0.0002551340218695688, + "loss": 7.0261, + "step": 17749 + }, + { + "epoch": 1.6562470840720351, + "grad_norm": 0.6299502760099709, + "learning_rate": 0.00025512862674593404, + "loss": 7.3307, + "step": 17750 + }, + { + "epoch": 1.6563403937669123, + "grad_norm": 0.7256937800888505, + "learning_rate": 0.0002551232313549884, + "loss": 7.1081, + "step": 17751 + }, + { + "epoch": 1.6564337034617895, + "grad_norm": 1.0289312696394717, + "learning_rate": 0.0002551178356967455, + "loss": 7.3638, + "step": 17752 + }, + { + "epoch": 1.656527013156667, + "grad_norm": 29.707313236903257, + "learning_rate": 0.00025511243977121905, + "loss": 7.423, + "step": 17753 + }, + { + "epoch": 1.6566203228515444, + "grad_norm": 28.958545877207353, + "learning_rate": 0.0002551070435784229, + "loss": 7.3765, + "step": 17754 + }, + { + "epoch": 1.6567136325464216, + "grad_norm": 1.0532879831695094, + "learning_rate": 0.00025510164711837054, + "loss": 7.4017, + "step": 17755 + }, + { + "epoch": 1.6568069422412988, + "grad_norm": 543.1535824671122, + "learning_rate": 0.0002550962503910759, + "loss": 7.3838, + "step": 17756 + }, + { + "epoch": 1.6569002519361762, + "grad_norm": 1.3789992109787255, + "learning_rate": 0.00025509085339655267, + "loss": 6.9683, + "step": 17757 + }, + { + "epoch": 1.6569935616310536, + "grad_norm": 0.7102694785319438, + "learning_rate": 0.00025508545613481444, + "loss": 7.4324, + "step": 17758 + }, + { + "epoch": 1.6570868713259308, + "grad_norm": 0.921880837197722, + "learning_rate": 0.00025508005860587507, + "loss": 7.4338, + "step": 17759 + }, + { + "epoch": 1.657180181020808, + "grad_norm": 1.101582783336625, + "learning_rate": 0.0002550746608097482, + "loss": 7.5186, + "step": 17760 + }, + { + "epoch": 1.6572734907156854, + "grad_norm": 112.80001956214846, + "learning_rate": 0.00025506926274644767, + "loss": 7.4263, + "step": 17761 + }, + { + "epoch": 1.6573668004105626, + "grad_norm": 0.7460105553059386, + "learning_rate": 0.00025506386441598707, + "loss": 7.268, + "step": 17762 + }, + { + "epoch": 1.6574601101054398, + "grad_norm": 3085.014858693204, + "learning_rate": 0.0002550584658183802, + "loss": 7.3661, + "step": 17763 + }, + { + "epoch": 1.6575534198003172, + "grad_norm": 1.2156514051187066, + "learning_rate": 0.00025505306695364075, + "loss": 7.3883, + "step": 17764 + }, + { + "epoch": 1.6576467294951946, + "grad_norm": 0.5925864428525053, + "learning_rate": 0.0002550476678217825, + "loss": 7.1552, + "step": 17765 + }, + { + "epoch": 1.6577400391900718, + "grad_norm": 0.5945055173220838, + "learning_rate": 0.0002550422684228191, + "loss": 7.3919, + "step": 17766 + }, + { + "epoch": 1.657833348884949, + "grad_norm": 0.7457886046689544, + "learning_rate": 0.00025503686875676443, + "loss": 7.4306, + "step": 17767 + }, + { + "epoch": 1.6579266585798265, + "grad_norm": 1.1018746426037667, + "learning_rate": 0.000255031468823632, + "loss": 7.1155, + "step": 17768 + }, + { + "epoch": 1.6580199682747039, + "grad_norm": 3134.3627506774897, + "learning_rate": 0.0002550260686234357, + "loss": 7.1512, + "step": 17769 + }, + { + "epoch": 1.658113277969581, + "grad_norm": 0.8948972304748198, + "learning_rate": 0.00025502066815618924, + "loss": 7.2433, + "step": 17770 + }, + { + "epoch": 1.6582065876644583, + "grad_norm": 0.8359027557359796, + "learning_rate": 0.0002550152674219063, + "loss": 7.3059, + "step": 17771 + }, + { + "epoch": 1.6582998973593357, + "grad_norm": 1.234605836669003, + "learning_rate": 0.0002550098664206007, + "loss": 6.9244, + "step": 17772 + }, + { + "epoch": 1.658393207054213, + "grad_norm": 1.196509122782248, + "learning_rate": 0.0002550044651522861, + "loss": 7.6887, + "step": 17773 + }, + { + "epoch": 1.65848651674909, + "grad_norm": 0.8764902373845477, + "learning_rate": 0.0002549990636169762, + "loss": 7.1946, + "step": 17774 + }, + { + "epoch": 1.6585798264439675, + "grad_norm": 0.6439025701124541, + "learning_rate": 0.00025499366181468483, + "loss": 7.2526, + "step": 17775 + }, + { + "epoch": 1.658673136138845, + "grad_norm": 0.6571920379916475, + "learning_rate": 0.0002549882597454256, + "loss": 6.9591, + "step": 17776 + }, + { + "epoch": 1.6587664458337221, + "grad_norm": 0.8022070714323396, + "learning_rate": 0.00025498285740921243, + "loss": 7.4365, + "step": 17777 + }, + { + "epoch": 1.6588597555285993, + "grad_norm": 0.5202907312112081, + "learning_rate": 0.0002549774548060589, + "loss": 7.1484, + "step": 17778 + }, + { + "epoch": 1.6589530652234767, + "grad_norm": 0.5680460915368047, + "learning_rate": 0.0002549720519359788, + "loss": 6.9263, + "step": 17779 + }, + { + "epoch": 1.6590463749183542, + "grad_norm": 0.5827717421524966, + "learning_rate": 0.00025496664879898586, + "loss": 7.1244, + "step": 17780 + }, + { + "epoch": 1.6591396846132314, + "grad_norm": 0.8300656369255184, + "learning_rate": 0.00025496124539509384, + "loss": 6.996, + "step": 17781 + }, + { + "epoch": 1.6592329943081086, + "grad_norm": 0.926714790290796, + "learning_rate": 0.0002549558417243165, + "loss": 6.6961, + "step": 17782 + }, + { + "epoch": 1.659326304002986, + "grad_norm": 1.6974422886148104, + "learning_rate": 0.0002549504377866675, + "loss": 7.6195, + "step": 17783 + }, + { + "epoch": 1.6594196136978632, + "grad_norm": 3875.675490673994, + "learning_rate": 0.0002549450335821606, + "loss": 7.1395, + "step": 17784 + }, + { + "epoch": 1.6595129233927404, + "grad_norm": 0.7566908061062989, + "learning_rate": 0.0002549396291108096, + "loss": 7.1912, + "step": 17785 + }, + { + "epoch": 1.6596062330876178, + "grad_norm": 0.9534602485285831, + "learning_rate": 0.00025493422437262823, + "loss": 7.4954, + "step": 17786 + }, + { + "epoch": 1.6596995427824952, + "grad_norm": 0.6746979613680038, + "learning_rate": 0.0002549288193676302, + "loss": 6.8497, + "step": 17787 + }, + { + "epoch": 1.6597928524773724, + "grad_norm": 7074.24451148287, + "learning_rate": 0.00025492341409582924, + "loss": 7.1899, + "step": 17788 + }, + { + "epoch": 1.6598861621722496, + "grad_norm": 0.60984565156276, + "learning_rate": 0.0002549180085572391, + "loss": 7.482, + "step": 17789 + }, + { + "epoch": 1.659979471867127, + "grad_norm": 732.65025761247, + "learning_rate": 0.00025491260275187363, + "loss": 7.4927, + "step": 17790 + }, + { + "epoch": 1.6600727815620044, + "grad_norm": 0.8933007656844216, + "learning_rate": 0.0002549071966797464, + "loss": 7.3084, + "step": 17791 + }, + { + "epoch": 1.6601660912568816, + "grad_norm": 3244.9185306615555, + "learning_rate": 0.0002549017903408713, + "loss": 7.2626, + "step": 17792 + }, + { + "epoch": 1.6602594009517588, + "grad_norm": 0.894481319271565, + "learning_rate": 0.000254896383735262, + "loss": 7.2058, + "step": 17793 + }, + { + "epoch": 1.6603527106466363, + "grad_norm": 0.7925624766701417, + "learning_rate": 0.00025489097686293225, + "loss": 7.6438, + "step": 17794 + }, + { + "epoch": 1.6604460203415135, + "grad_norm": 0.7230693627412712, + "learning_rate": 0.0002548855697238958, + "loss": 7.2815, + "step": 17795 + }, + { + "epoch": 1.6605393300363906, + "grad_norm": 0.6205544080194271, + "learning_rate": 0.0002548801623181665, + "loss": 7.2702, + "step": 17796 + }, + { + "epoch": 1.660632639731268, + "grad_norm": 8446.040659489341, + "learning_rate": 0.000254874754645758, + "loss": 7.3727, + "step": 17797 + }, + { + "epoch": 1.6607259494261455, + "grad_norm": 0.7846909675621047, + "learning_rate": 0.000254869346706684, + "loss": 7.3732, + "step": 17798 + }, + { + "epoch": 1.6608192591210227, + "grad_norm": 0.7852285705230113, + "learning_rate": 0.00025486393850095835, + "loss": 7.4004, + "step": 17799 + }, + { + "epoch": 1.6609125688158999, + "grad_norm": 1.2857772886529488, + "learning_rate": 0.00025485853002859475, + "loss": 7.0899, + "step": 17800 + }, + { + "epoch": 1.6610058785107773, + "grad_norm": 1.2222953627021003, + "learning_rate": 0.000254853121289607, + "loss": 7.0888, + "step": 17801 + }, + { + "epoch": 1.6610991882056547, + "grad_norm": 0.7745981834669442, + "learning_rate": 0.0002548477122840088, + "loss": 7.4964, + "step": 17802 + }, + { + "epoch": 1.661192497900532, + "grad_norm": 0.6750818867612256, + "learning_rate": 0.00025484230301181396, + "loss": 7.3737, + "step": 17803 + }, + { + "epoch": 1.6612858075954091, + "grad_norm": 0.5786542524488366, + "learning_rate": 0.00025483689347303615, + "loss": 7.3752, + "step": 17804 + }, + { + "epoch": 1.6613791172902865, + "grad_norm": 0.6961382702943327, + "learning_rate": 0.0002548314836676892, + "loss": 7.3909, + "step": 17805 + }, + { + "epoch": 1.6614724269851637, + "grad_norm": 0.6385968054784172, + "learning_rate": 0.00025482607359578684, + "loss": 7.4735, + "step": 17806 + }, + { + "epoch": 1.661565736680041, + "grad_norm": 1.0636990472032817, + "learning_rate": 0.0002548206632573428, + "loss": 7.3019, + "step": 17807 + }, + { + "epoch": 1.6616590463749183, + "grad_norm": 0.531498970420333, + "learning_rate": 0.0002548152526523709, + "loss": 7.3181, + "step": 17808 + }, + { + "epoch": 1.6617523560697958, + "grad_norm": 1.1123267504608496, + "learning_rate": 0.00025480984178088484, + "loss": 6.9413, + "step": 17809 + }, + { + "epoch": 1.661845665764673, + "grad_norm": 0.7439041345085918, + "learning_rate": 0.00025480443064289837, + "loss": 7.4021, + "step": 17810 + }, + { + "epoch": 1.6619389754595502, + "grad_norm": 0.4207164293932034, + "learning_rate": 0.00025479901923842536, + "loss": 7.1071, + "step": 17811 + }, + { + "epoch": 1.6620322851544276, + "grad_norm": 0.46737680396118364, + "learning_rate": 0.00025479360756747947, + "loss": 7.0255, + "step": 17812 + }, + { + "epoch": 1.662125594849305, + "grad_norm": 0.9798699326677139, + "learning_rate": 0.0002547881956300744, + "loss": 7.3055, + "step": 17813 + }, + { + "epoch": 1.662218904544182, + "grad_norm": 7707.237119439084, + "learning_rate": 0.00025478278342622407, + "loss": 7.5244, + "step": 17814 + }, + { + "epoch": 1.6623122142390594, + "grad_norm": 0.9781222921621604, + "learning_rate": 0.00025477737095594216, + "loss": 7.5901, + "step": 17815 + }, + { + "epoch": 1.6624055239339368, + "grad_norm": 18570.984120169054, + "learning_rate": 0.0002547719582192424, + "loss": 7.2458, + "step": 17816 + }, + { + "epoch": 1.662498833628814, + "grad_norm": 1.4562103435232259, + "learning_rate": 0.0002547665452161386, + "loss": 6.922, + "step": 17817 + }, + { + "epoch": 1.6625921433236912, + "grad_norm": 2.131594306924448, + "learning_rate": 0.00025476113194664447, + "loss": 7.0119, + "step": 17818 + }, + { + "epoch": 1.6626854530185686, + "grad_norm": 0.646078363449735, + "learning_rate": 0.00025475571841077386, + "loss": 7.3715, + "step": 17819 + }, + { + "epoch": 1.662778762713446, + "grad_norm": 0.6640713138680167, + "learning_rate": 0.00025475030460854047, + "loss": 7.232, + "step": 17820 + }, + { + "epoch": 1.6628720724083232, + "grad_norm": 0.6429613368862809, + "learning_rate": 0.0002547448905399581, + "loss": 7.2061, + "step": 17821 + }, + { + "epoch": 1.6629653821032004, + "grad_norm": 0.5124456791291521, + "learning_rate": 0.0002547394762050405, + "loss": 7.2893, + "step": 17822 + }, + { + "epoch": 1.6630586917980779, + "grad_norm": 0.4750003658547273, + "learning_rate": 0.0002547340616038014, + "loss": 7.3242, + "step": 17823 + }, + { + "epoch": 1.6631520014929553, + "grad_norm": 0.9264481008166753, + "learning_rate": 0.00025472864673625463, + "loss": 7.0709, + "step": 17824 + }, + { + "epoch": 1.6632453111878323, + "grad_norm": 0.5174982168931964, + "learning_rate": 0.00025472323160241394, + "loss": 7.3828, + "step": 17825 + }, + { + "epoch": 1.6633386208827097, + "grad_norm": 0.7927469809511158, + "learning_rate": 0.0002547178162022931, + "loss": 7.107, + "step": 17826 + }, + { + "epoch": 1.663431930577587, + "grad_norm": 397389.48272661277, + "learning_rate": 0.0002547124005359058, + "loss": 6.8968, + "step": 17827 + }, + { + "epoch": 1.6635252402724643, + "grad_norm": 0.5399246193161348, + "learning_rate": 0.000254706984603266, + "loss": 7.3124, + "step": 17828 + }, + { + "epoch": 1.6636185499673415, + "grad_norm": 0.9064005453946256, + "learning_rate": 0.0002547015684043873, + "loss": 7.5729, + "step": 17829 + }, + { + "epoch": 1.663711859662219, + "grad_norm": 462062.4755448851, + "learning_rate": 0.00025469615193928353, + "loss": 7.6007, + "step": 17830 + }, + { + "epoch": 1.6638051693570963, + "grad_norm": 1.133091808055586, + "learning_rate": 0.00025469073520796847, + "loss": 7.1253, + "step": 17831 + }, + { + "epoch": 1.6638984790519735, + "grad_norm": 0.5656669085783463, + "learning_rate": 0.00025468531821045586, + "loss": 7.4691, + "step": 17832 + }, + { + "epoch": 1.6639917887468507, + "grad_norm": 0.9048265120019109, + "learning_rate": 0.0002546799009467595, + "loss": 6.9735, + "step": 17833 + }, + { + "epoch": 1.6640850984417281, + "grad_norm": 0.6348469052588558, + "learning_rate": 0.0002546744834168932, + "loss": 7.2115, + "step": 17834 + }, + { + "epoch": 1.6641784081366056, + "grad_norm": 289981.77983740036, + "learning_rate": 0.0002546690656208706, + "loss": 7.2721, + "step": 17835 + }, + { + "epoch": 1.6642717178314825, + "grad_norm": 0.7705937849631922, + "learning_rate": 0.0002546636475587057, + "loss": 7.1457, + "step": 17836 + }, + { + "epoch": 1.66436502752636, + "grad_norm": 619573.1873519081, + "learning_rate": 0.00025465822923041205, + "loss": 7.2158, + "step": 17837 + }, + { + "epoch": 1.6644583372212374, + "grad_norm": 394752.98503274855, + "learning_rate": 0.0002546528106360036, + "loss": 7.223, + "step": 17838 + }, + { + "epoch": 1.6645516469161146, + "grad_norm": 0.7771043574955157, + "learning_rate": 0.000254647391775494, + "loss": 7.3239, + "step": 17839 + }, + { + "epoch": 1.6646449566109918, + "grad_norm": 0.683470594956667, + "learning_rate": 0.0002546419726488971, + "loss": 7.3304, + "step": 17840 + }, + { + "epoch": 1.6647382663058692, + "grad_norm": 0.718893853989626, + "learning_rate": 0.0002546365532562266, + "loss": 7.4513, + "step": 17841 + }, + { + "epoch": 1.6648315760007466, + "grad_norm": 1.042439501964709, + "learning_rate": 0.0002546311335974964, + "loss": 7.3373, + "step": 17842 + }, + { + "epoch": 1.6649248856956238, + "grad_norm": 1.0946701786962707, + "learning_rate": 0.0002546257136727202, + "loss": 7.3508, + "step": 17843 + }, + { + "epoch": 1.665018195390501, + "grad_norm": 1.5230709604725559, + "learning_rate": 0.00025462029348191184, + "loss": 7.4301, + "step": 17844 + }, + { + "epoch": 1.6651115050853784, + "grad_norm": 1.759423159281563, + "learning_rate": 0.000254614873025085, + "loss": 7.5347, + "step": 17845 + }, + { + "epoch": 1.6652048147802556, + "grad_norm": 1.160393780387674, + "learning_rate": 0.0002546094523022536, + "loss": 7.1614, + "step": 17846 + }, + { + "epoch": 1.6652981244751328, + "grad_norm": 1.4654610097539607, + "learning_rate": 0.0002546040313134313, + "loss": 7.2321, + "step": 17847 + }, + { + "epoch": 1.6653914341700102, + "grad_norm": 1.0334220684044149, + "learning_rate": 0.000254598610058632, + "loss": 7.0551, + "step": 17848 + }, + { + "epoch": 1.6654847438648877, + "grad_norm": 0.784474331257311, + "learning_rate": 0.0002545931885378693, + "loss": 7.1065, + "step": 17849 + }, + { + "epoch": 1.6655780535597648, + "grad_norm": 1.1273020638518365, + "learning_rate": 0.0002545877667511572, + "loss": 7.3585, + "step": 17850 + }, + { + "epoch": 1.665671363254642, + "grad_norm": 0.5099025410394884, + "learning_rate": 0.0002545823446985094, + "loss": 6.9712, + "step": 17851 + }, + { + "epoch": 1.6657646729495195, + "grad_norm": 0.6256241177068679, + "learning_rate": 0.00025457692237993964, + "loss": 6.9351, + "step": 17852 + }, + { + "epoch": 1.6658579826443969, + "grad_norm": 0.5534070978628505, + "learning_rate": 0.00025457149979546173, + "loss": 7.029, + "step": 17853 + }, + { + "epoch": 1.665951292339274, + "grad_norm": 0.6541451189769911, + "learning_rate": 0.00025456607694508953, + "loss": 6.9741, + "step": 17854 + }, + { + "epoch": 1.6660446020341513, + "grad_norm": 591205.5296328983, + "learning_rate": 0.0002545606538288367, + "loss": 7.3154, + "step": 17855 + }, + { + "epoch": 1.6661379117290287, + "grad_norm": 238017.70192456077, + "learning_rate": 0.0002545552304467171, + "loss": 7.3599, + "step": 17856 + }, + { + "epoch": 1.666231221423906, + "grad_norm": 0.7931297082579596, + "learning_rate": 0.0002545498067987446, + "loss": 7.1719, + "step": 17857 + }, + { + "epoch": 1.666324531118783, + "grad_norm": 0.5589601749350513, + "learning_rate": 0.00025454438288493285, + "loss": 7.2158, + "step": 17858 + }, + { + "epoch": 1.6664178408136605, + "grad_norm": 389913.0373022871, + "learning_rate": 0.0002545389587052957, + "loss": 7.2552, + "step": 17859 + }, + { + "epoch": 1.666511150508538, + "grad_norm": 130193.60923284617, + "learning_rate": 0.00025453353425984696, + "loss": 7.2215, + "step": 17860 + }, + { + "epoch": 1.6666044602034151, + "grad_norm": 0.7105793389703171, + "learning_rate": 0.00025452810954860046, + "loss": 7.0753, + "step": 17861 + }, + { + "epoch": 1.6666977698982923, + "grad_norm": 702230.6582126506, + "learning_rate": 0.0002545226845715698, + "loss": 7.1698, + "step": 17862 + }, + { + "epoch": 1.6667910795931697, + "grad_norm": 1.0376368630820667, + "learning_rate": 0.0002545172593287691, + "loss": 7.5589, + "step": 17863 + }, + { + "epoch": 1.6668843892880472, + "grad_norm": 0.5341678518081004, + "learning_rate": 0.00025451183382021184, + "loss": 6.8724, + "step": 17864 + }, + { + "epoch": 1.6669776989829244, + "grad_norm": 279048.68478739803, + "learning_rate": 0.00025450640804591194, + "loss": 7.4886, + "step": 17865 + }, + { + "epoch": 1.6670710086778016, + "grad_norm": 0.8133818753007193, + "learning_rate": 0.00025450098200588325, + "loss": 7.2502, + "step": 17866 + }, + { + "epoch": 1.667164318372679, + "grad_norm": 1.039979857980929, + "learning_rate": 0.00025449555570013953, + "loss": 7.5284, + "step": 17867 + }, + { + "epoch": 1.6672576280675562, + "grad_norm": 0.5607211764809504, + "learning_rate": 0.0002544901291286945, + "loss": 7.2729, + "step": 17868 + }, + { + "epoch": 1.6673509377624334, + "grad_norm": 0.7394512100202515, + "learning_rate": 0.0002544847022915621, + "loss": 7.262, + "step": 17869 + }, + { + "epoch": 1.6674442474573108, + "grad_norm": 1.1881977948976374, + "learning_rate": 0.00025447927518875606, + "loss": 7.2016, + "step": 17870 + }, + { + "epoch": 1.6675375571521882, + "grad_norm": 1.203093856826606, + "learning_rate": 0.0002544738478202901, + "loss": 7.0435, + "step": 17871 + }, + { + "epoch": 1.6676308668470654, + "grad_norm": 1.1305679260626287, + "learning_rate": 0.00025446842018617816, + "loss": 7.257, + "step": 17872 + }, + { + "epoch": 1.6677241765419426, + "grad_norm": 0.5573443503062577, + "learning_rate": 0.0002544629922864339, + "loss": 7.4383, + "step": 17873 + }, + { + "epoch": 1.66781748623682, + "grad_norm": 0.8221130351455952, + "learning_rate": 0.00025445756412107123, + "loss": 7.1026, + "step": 17874 + }, + { + "epoch": 1.6679107959316974, + "grad_norm": 128722.16483467778, + "learning_rate": 0.0002544521356901039, + "loss": 7.3385, + "step": 17875 + }, + { + "epoch": 1.6680041056265746, + "grad_norm": 464394.3306742779, + "learning_rate": 0.0002544467069935458, + "loss": 7.0712, + "step": 17876 + }, + { + "epoch": 1.6680974153214518, + "grad_norm": 1.3972072493115177, + "learning_rate": 0.0002544412780314106, + "loss": 7.1491, + "step": 17877 + }, + { + "epoch": 1.6681907250163293, + "grad_norm": 0.9690230173266369, + "learning_rate": 0.00025443584880371217, + "loss": 7.2953, + "step": 17878 + }, + { + "epoch": 1.6682840347112065, + "grad_norm": 1.2298519588934187, + "learning_rate": 0.0002544304193104644, + "loss": 7.3768, + "step": 17879 + }, + { + "epoch": 1.6683773444060837, + "grad_norm": 1.1457575049891568, + "learning_rate": 0.0002544249895516809, + "loss": 7.4581, + "step": 17880 + }, + { + "epoch": 1.668470654100961, + "grad_norm": 0.8277552145483908, + "learning_rate": 0.0002544195595273756, + "loss": 7.3898, + "step": 17881 + }, + { + "epoch": 1.6685639637958385, + "grad_norm": 1.5485234159215124, + "learning_rate": 0.00025441412923756234, + "loss": 7.2102, + "step": 17882 + }, + { + "epoch": 1.6686572734907157, + "grad_norm": 0.6233856875472763, + "learning_rate": 0.0002544086986822548, + "loss": 6.9173, + "step": 17883 + }, + { + "epoch": 1.6687505831855929, + "grad_norm": 0.9572984043942119, + "learning_rate": 0.0002544032678614669, + "loss": 7.2339, + "step": 17884 + }, + { + "epoch": 1.6688438928804703, + "grad_norm": 0.8783483839511866, + "learning_rate": 0.00025439783677521245, + "loss": 7.3507, + "step": 17885 + }, + { + "epoch": 1.6689372025753477, + "grad_norm": 29523.79473695212, + "learning_rate": 0.0002543924054235052, + "loss": 7.29, + "step": 17886 + }, + { + "epoch": 1.669030512270225, + "grad_norm": 1.390776123229596, + "learning_rate": 0.000254386973806359, + "loss": 7.605, + "step": 17887 + }, + { + "epoch": 1.6691238219651021, + "grad_norm": 0.47797201085353236, + "learning_rate": 0.00025438154192378767, + "loss": 7.2676, + "step": 17888 + }, + { + "epoch": 1.6692171316599795, + "grad_norm": 8186.196295458399, + "learning_rate": 0.00025437610977580493, + "loss": 7.3521, + "step": 17889 + }, + { + "epoch": 1.6693104413548567, + "grad_norm": 1.0881673378840004, + "learning_rate": 0.00025437067736242474, + "loss": 7.1292, + "step": 17890 + }, + { + "epoch": 1.669403751049734, + "grad_norm": 0.5847559826774066, + "learning_rate": 0.00025436524468366075, + "loss": 7.436, + "step": 17891 + }, + { + "epoch": 1.6694970607446113, + "grad_norm": 1.1231801940884485, + "learning_rate": 0.00025435981173952695, + "loss": 7.1527, + "step": 17892 + }, + { + "epoch": 1.6695903704394888, + "grad_norm": 752.747691691514, + "learning_rate": 0.000254354378530037, + "loss": 7.3883, + "step": 17893 + }, + { + "epoch": 1.669683680134366, + "grad_norm": 1.1356479861762898, + "learning_rate": 0.00025434894505520477, + "loss": 7.3967, + "step": 17894 + }, + { + "epoch": 1.6697769898292432, + "grad_norm": 0.6870530508666451, + "learning_rate": 0.00025434351131504415, + "loss": 7.4992, + "step": 17895 + }, + { + "epoch": 1.6698702995241206, + "grad_norm": 0.8763320478261121, + "learning_rate": 0.0002543380773095688, + "loss": 7.2338, + "step": 17896 + }, + { + "epoch": 1.669963609218998, + "grad_norm": 0.619104419654637, + "learning_rate": 0.0002543326430387927, + "loss": 7.2447, + "step": 17897 + }, + { + "epoch": 1.6700569189138752, + "grad_norm": 37729.73770120095, + "learning_rate": 0.00025432720850272956, + "loss": 7.2378, + "step": 17898 + }, + { + "epoch": 1.6701502286087524, + "grad_norm": 0.7278782464625525, + "learning_rate": 0.0002543217737013932, + "loss": 7.3048, + "step": 17899 + }, + { + "epoch": 1.6702435383036298, + "grad_norm": 0.5675034010209064, + "learning_rate": 0.00025431633863479754, + "loss": 7.5137, + "step": 17900 + }, + { + "epoch": 1.670336847998507, + "grad_norm": 0.6382417599811859, + "learning_rate": 0.00025431090330295636, + "loss": 7.303, + "step": 17901 + }, + { + "epoch": 1.6704301576933842, + "grad_norm": 32380.76753394904, + "learning_rate": 0.0002543054677058834, + "loss": 7.2842, + "step": 17902 + }, + { + "epoch": 1.6705234673882616, + "grad_norm": 0.49595229707194766, + "learning_rate": 0.0002543000318435925, + "loss": 7.2485, + "step": 17903 + }, + { + "epoch": 1.670616777083139, + "grad_norm": 0.44289172244838687, + "learning_rate": 0.00025429459571609756, + "loss": 7.1457, + "step": 17904 + }, + { + "epoch": 1.6707100867780162, + "grad_norm": 1.1148343818331157, + "learning_rate": 0.00025428915932341236, + "loss": 7.2734, + "step": 17905 + }, + { + "epoch": 1.6708033964728934, + "grad_norm": 1.0409131108542282, + "learning_rate": 0.00025428372266555067, + "loss": 7.2706, + "step": 17906 + }, + { + "epoch": 1.6708967061677709, + "grad_norm": 0.5640524586217116, + "learning_rate": 0.0002542782857425264, + "loss": 7.0122, + "step": 17907 + }, + { + "epoch": 1.6709900158626483, + "grad_norm": 0.9976033459327984, + "learning_rate": 0.00025427284855435335, + "loss": 7.211, + "step": 17908 + }, + { + "epoch": 1.6710833255575255, + "grad_norm": 1.1056197201597067, + "learning_rate": 0.0002542674111010453, + "loss": 7.3531, + "step": 17909 + }, + { + "epoch": 1.6711766352524027, + "grad_norm": 0.8475640365813696, + "learning_rate": 0.00025426197338261614, + "loss": 7.4544, + "step": 17910 + }, + { + "epoch": 1.67126994494728, + "grad_norm": 0.6166363876877424, + "learning_rate": 0.0002542565353990797, + "loss": 7.2329, + "step": 17911 + }, + { + "epoch": 1.6713632546421573, + "grad_norm": 0.6497384254146423, + "learning_rate": 0.0002542510971504497, + "loss": 7.3503, + "step": 17912 + }, + { + "epoch": 1.6714565643370345, + "grad_norm": 0.933905193527794, + "learning_rate": 0.0002542456586367401, + "loss": 6.7664, + "step": 17913 + }, + { + "epoch": 1.671549874031912, + "grad_norm": 71.88725525664562, + "learning_rate": 0.0002542402198579646, + "loss": 7.5409, + "step": 17914 + }, + { + "epoch": 1.6716431837267893, + "grad_norm": 0.5053340039211404, + "learning_rate": 0.00025423478081413713, + "loss": 7.0869, + "step": 17915 + }, + { + "epoch": 1.6717364934216665, + "grad_norm": 1.1153635302240659, + "learning_rate": 0.00025422934150527146, + "loss": 7.4189, + "step": 17916 + }, + { + "epoch": 1.6718298031165437, + "grad_norm": 0.46709350361577573, + "learning_rate": 0.00025422390193138156, + "loss": 7.1554, + "step": 17917 + }, + { + "epoch": 1.6719231128114211, + "grad_norm": 1.2972518799571569, + "learning_rate": 0.00025421846209248104, + "loss": 7.1896, + "step": 17918 + }, + { + "epoch": 1.6720164225062986, + "grad_norm": 0.7337531723409644, + "learning_rate": 0.00025421302198858385, + "loss": 6.8443, + "step": 17919 + }, + { + "epoch": 1.6721097322011755, + "grad_norm": 0.5380901185438441, + "learning_rate": 0.0002542075816197039, + "loss": 7.3869, + "step": 17920 + }, + { + "epoch": 1.672203041896053, + "grad_norm": 0.5607273351569542, + "learning_rate": 0.00025420214098585486, + "loss": 7.2637, + "step": 17921 + }, + { + "epoch": 1.6722963515909304, + "grad_norm": 0.6755578969332006, + "learning_rate": 0.00025419670008705066, + "loss": 7.0565, + "step": 17922 + }, + { + "epoch": 1.6723896612858076, + "grad_norm": 10828.779357850679, + "learning_rate": 0.0002541912589233052, + "loss": 7.8124, + "step": 17923 + }, + { + "epoch": 1.6724829709806848, + "grad_norm": 0.7160655920426936, + "learning_rate": 0.00025418581749463213, + "loss": 7.0665, + "step": 17924 + }, + { + "epoch": 1.6725762806755622, + "grad_norm": 1.9299958541990825, + "learning_rate": 0.0002541803758010454, + "loss": 6.9143, + "step": 17925 + }, + { + "epoch": 1.6726695903704396, + "grad_norm": 0.9346047428780172, + "learning_rate": 0.00025417493384255886, + "loss": 7.2737, + "step": 17926 + }, + { + "epoch": 1.6727629000653168, + "grad_norm": 0.6857071369983951, + "learning_rate": 0.00025416949161918633, + "loss": 7.5524, + "step": 17927 + }, + { + "epoch": 1.672856209760194, + "grad_norm": 0.6913816115930804, + "learning_rate": 0.0002541640491309416, + "loss": 7.5401, + "step": 17928 + }, + { + "epoch": 1.6729495194550714, + "grad_norm": 1503.726415729503, + "learning_rate": 0.00025415860637783855, + "loss": 7.4881, + "step": 17929 + }, + { + "epoch": 1.6730428291499488, + "grad_norm": 1.088199237146402, + "learning_rate": 0.00025415316335989106, + "loss": 7.01, + "step": 17930 + }, + { + "epoch": 1.6731361388448258, + "grad_norm": 13979.146680836875, + "learning_rate": 0.0002541477200771129, + "loss": 7.2579, + "step": 17931 + }, + { + "epoch": 1.6732294485397032, + "grad_norm": 1.0016164222654398, + "learning_rate": 0.00025414227652951793, + "loss": 7.0816, + "step": 17932 + }, + { + "epoch": 1.6733227582345807, + "grad_norm": 14474.324587484398, + "learning_rate": 0.00025413683271712005, + "loss": 7.0933, + "step": 17933 + }, + { + "epoch": 1.6734160679294579, + "grad_norm": 0.5259970888724016, + "learning_rate": 0.000254131388639933, + "loss": 7.1581, + "step": 17934 + }, + { + "epoch": 1.673509377624335, + "grad_norm": 0.7809553010898688, + "learning_rate": 0.00025412594429797067, + "loss": 7.2606, + "step": 17935 + }, + { + "epoch": 1.6736026873192125, + "grad_norm": 0.6328762031420239, + "learning_rate": 0.00025412049969124697, + "loss": 6.867, + "step": 17936 + }, + { + "epoch": 1.6736959970140899, + "grad_norm": 1.3690124668228854, + "learning_rate": 0.00025411505481977563, + "loss": 7.357, + "step": 17937 + }, + { + "epoch": 1.673789306708967, + "grad_norm": 1.3032712304444656, + "learning_rate": 0.00025410960968357053, + "loss": 7.532, + "step": 17938 + }, + { + "epoch": 1.6738826164038443, + "grad_norm": 0.5633052432317734, + "learning_rate": 0.0002541041642826456, + "loss": 6.9195, + "step": 17939 + }, + { + "epoch": 1.6739759260987217, + "grad_norm": 99.02766680198339, + "learning_rate": 0.0002540987186170146, + "loss": 7.3389, + "step": 17940 + }, + { + "epoch": 1.6740692357935991, + "grad_norm": 0.5190608798320941, + "learning_rate": 0.00025409327268669136, + "loss": 7.713, + "step": 17941 + }, + { + "epoch": 1.674162545488476, + "grad_norm": 1.0340806475809505, + "learning_rate": 0.00025408782649168977, + "loss": 7.3031, + "step": 17942 + }, + { + "epoch": 1.6742558551833535, + "grad_norm": 1.4474324631185627, + "learning_rate": 0.0002540823800320237, + "loss": 6.9692, + "step": 17943 + }, + { + "epoch": 1.674349164878231, + "grad_norm": 1.2879468077959535, + "learning_rate": 0.00025407693330770694, + "loss": 6.8384, + "step": 17944 + }, + { + "epoch": 1.6744424745731081, + "grad_norm": 0.43547763875306955, + "learning_rate": 0.00025407148631875336, + "loss": 7.1332, + "step": 17945 + }, + { + "epoch": 1.6745357842679853, + "grad_norm": 0.6563123392428071, + "learning_rate": 0.0002540660390651768, + "loss": 6.8371, + "step": 17946 + }, + { + "epoch": 1.6746290939628627, + "grad_norm": 1.7925095002307674, + "learning_rate": 0.00025406059154699115, + "loss": 7.6898, + "step": 17947 + }, + { + "epoch": 1.6747224036577402, + "grad_norm": 1.0066413135777033, + "learning_rate": 0.0002540551437642103, + "loss": 7.1096, + "step": 17948 + }, + { + "epoch": 1.6748157133526174, + "grad_norm": 0.8050682399779012, + "learning_rate": 0.000254049695716848, + "loss": 6.9111, + "step": 17949 + }, + { + "epoch": 1.6749090230474946, + "grad_norm": 0.6980002318002183, + "learning_rate": 0.00025404424740491815, + "loss": 7.1659, + "step": 17950 + }, + { + "epoch": 1.675002332742372, + "grad_norm": 0.5685619290179531, + "learning_rate": 0.00025403879882843455, + "loss": 6.8437, + "step": 17951 + }, + { + "epoch": 1.6750956424372492, + "grad_norm": 9171.648912334775, + "learning_rate": 0.0002540333499874112, + "loss": 7.2061, + "step": 17952 + }, + { + "epoch": 1.6751889521321264, + "grad_norm": 10628.74911372367, + "learning_rate": 0.00025402790088186177, + "loss": 7.2349, + "step": 17953 + }, + { + "epoch": 1.6752822618270038, + "grad_norm": 0.8047499976023619, + "learning_rate": 0.0002540224515118002, + "loss": 7.1985, + "step": 17954 + }, + { + "epoch": 1.6753755715218812, + "grad_norm": 0.6571982796029509, + "learning_rate": 0.00025401700187724043, + "loss": 7.4501, + "step": 17955 + }, + { + "epoch": 1.6754688812167584, + "grad_norm": 6302.986363143088, + "learning_rate": 0.0002540115519781962, + "loss": 7.3888, + "step": 17956 + }, + { + "epoch": 1.6755621909116356, + "grad_norm": 7507.598444864547, + "learning_rate": 0.0002540061018146814, + "loss": 7.3536, + "step": 17957 + }, + { + "epoch": 1.675655500606513, + "grad_norm": 0.5343401185066636, + "learning_rate": 0.00025400065138670986, + "loss": 7.4722, + "step": 17958 + }, + { + "epoch": 1.6757488103013904, + "grad_norm": 0.6975236419716927, + "learning_rate": 0.0002539952006942955, + "loss": 7.2539, + "step": 17959 + }, + { + "epoch": 1.6758421199962676, + "grad_norm": 0.6642021085360247, + "learning_rate": 0.00025398974973745217, + "loss": 7.4709, + "step": 17960 + }, + { + "epoch": 1.6759354296911448, + "grad_norm": 0.6779524274370129, + "learning_rate": 0.00025398429851619366, + "loss": 7.3948, + "step": 17961 + }, + { + "epoch": 1.6760287393860223, + "grad_norm": 0.5915913046862484, + "learning_rate": 0.0002539788470305339, + "loss": 7.2978, + "step": 17962 + }, + { + "epoch": 1.6761220490808995, + "grad_norm": 0.666948489286204, + "learning_rate": 0.00025397339528048673, + "loss": 7.1978, + "step": 17963 + }, + { + "epoch": 1.6762153587757767, + "grad_norm": 0.47734449680006996, + "learning_rate": 0.0002539679432660661, + "loss": 7.0356, + "step": 17964 + }, + { + "epoch": 1.676308668470654, + "grad_norm": 1.2327238538531788, + "learning_rate": 0.00025396249098728565, + "loss": 7.3857, + "step": 17965 + }, + { + "epoch": 1.6764019781655315, + "grad_norm": 0.5040586242915825, + "learning_rate": 0.0002539570384441595, + "loss": 6.9871, + "step": 17966 + }, + { + "epoch": 1.6764952878604087, + "grad_norm": 1.7189105798371114, + "learning_rate": 0.00025395158563670133, + "loss": 7.469, + "step": 17967 + }, + { + "epoch": 1.6765885975552859, + "grad_norm": 10753.632987583547, + "learning_rate": 0.0002539461325649251, + "loss": 7.0567, + "step": 17968 + }, + { + "epoch": 1.6766819072501633, + "grad_norm": 0.7558715048389854, + "learning_rate": 0.0002539406792288446, + "loss": 7.3146, + "step": 17969 + }, + { + "epoch": 1.6767752169450407, + "grad_norm": 0.40287220207480606, + "learning_rate": 0.00025393522562847375, + "loss": 7.2503, + "step": 17970 + }, + { + "epoch": 1.676868526639918, + "grad_norm": 0.8309895037760389, + "learning_rate": 0.00025392977176382647, + "loss": 7.1447, + "step": 17971 + }, + { + "epoch": 1.6769618363347951, + "grad_norm": 1.1990503614689325, + "learning_rate": 0.0002539243176349165, + "loss": 6.9911, + "step": 17972 + }, + { + "epoch": 1.6770551460296725, + "grad_norm": 0.6915549681316264, + "learning_rate": 0.0002539188632417578, + "loss": 7.2868, + "step": 17973 + }, + { + "epoch": 1.6771484557245497, + "grad_norm": 0.6162191897342236, + "learning_rate": 0.00025391340858436424, + "loss": 7.1744, + "step": 17974 + }, + { + "epoch": 1.677241765419427, + "grad_norm": 0.6980477597298455, + "learning_rate": 0.0002539079536627496, + "loss": 7.1785, + "step": 17975 + }, + { + "epoch": 1.6773350751143044, + "grad_norm": 20176.04523701207, + "learning_rate": 0.0002539024984769279, + "loss": 7.1615, + "step": 17976 + }, + { + "epoch": 1.6774283848091818, + "grad_norm": 0.6275554508081286, + "learning_rate": 0.00025389704302691287, + "loss": 7.4107, + "step": 17977 + }, + { + "epoch": 1.677521694504059, + "grad_norm": 1.1146200991471882, + "learning_rate": 0.00025389158731271844, + "loss": 7.3538, + "step": 17978 + }, + { + "epoch": 1.6776150041989362, + "grad_norm": 30014.220026171966, + "learning_rate": 0.00025388613133435847, + "loss": 7.3173, + "step": 17979 + }, + { + "epoch": 1.6777083138938136, + "grad_norm": 0.7106950585150139, + "learning_rate": 0.0002538806750918469, + "loss": 7.437, + "step": 17980 + }, + { + "epoch": 1.677801623588691, + "grad_norm": 0.6507020695204264, + "learning_rate": 0.00025387521858519745, + "loss": 7.1718, + "step": 17981 + }, + { + "epoch": 1.6778949332835682, + "grad_norm": 6490.691704591935, + "learning_rate": 0.00025386976181442414, + "loss": 7.0891, + "step": 17982 + }, + { + "epoch": 1.6779882429784454, + "grad_norm": 0.778131032287386, + "learning_rate": 0.0002538643047795408, + "loss": 7.5139, + "step": 17983 + }, + { + "epoch": 1.6780815526733228, + "grad_norm": 0.5215633669818457, + "learning_rate": 0.00025385884748056124, + "loss": 7.0525, + "step": 17984 + }, + { + "epoch": 1.6781748623682, + "grad_norm": 0.6671513009691425, + "learning_rate": 0.0002538533899174995, + "loss": 7.1795, + "step": 17985 + }, + { + "epoch": 1.6782681720630772, + "grad_norm": 0.6114932436371091, + "learning_rate": 0.00025384793209036927, + "loss": 7.2697, + "step": 17986 + }, + { + "epoch": 1.6783614817579546, + "grad_norm": 0.9767294030100254, + "learning_rate": 0.0002538424739991846, + "loss": 7.1785, + "step": 17987 + }, + { + "epoch": 1.678454791452832, + "grad_norm": 6476.688103336683, + "learning_rate": 0.0002538370156439592, + "loss": 7.2618, + "step": 17988 + }, + { + "epoch": 1.6785481011477092, + "grad_norm": 72904.62761986189, + "learning_rate": 0.00025383155702470706, + "loss": 7.2637, + "step": 17989 + }, + { + "epoch": 1.6786414108425864, + "grad_norm": 0.7367437438054507, + "learning_rate": 0.000253826098141442, + "loss": 7.2772, + "step": 17990 + }, + { + "epoch": 1.6787347205374639, + "grad_norm": 0.7195243535828595, + "learning_rate": 0.0002538206389941779, + "loss": 7.4026, + "step": 17991 + }, + { + "epoch": 1.6788280302323413, + "grad_norm": 0.8243373463276326, + "learning_rate": 0.0002538151795829288, + "loss": 7.3732, + "step": 17992 + }, + { + "epoch": 1.6789213399272185, + "grad_norm": 1.3468724630292972, + "learning_rate": 0.0002538097199077083, + "loss": 6.9934, + "step": 17993 + }, + { + "epoch": 1.6790146496220957, + "grad_norm": 1.0788072467120065, + "learning_rate": 0.0002538042599685305, + "loss": 7.2827, + "step": 17994 + }, + { + "epoch": 1.679107959316973, + "grad_norm": 0.7464911082409417, + "learning_rate": 0.00025379879976540924, + "loss": 7.4614, + "step": 17995 + }, + { + "epoch": 1.6792012690118503, + "grad_norm": 1.098164668952953, + "learning_rate": 0.0002537933392983584, + "loss": 7.1894, + "step": 17996 + }, + { + "epoch": 1.6792945787067275, + "grad_norm": 0.8369656092492055, + "learning_rate": 0.00025378787856739174, + "loss": 7.6538, + "step": 17997 + }, + { + "epoch": 1.679387888401605, + "grad_norm": 1.1951695979597725, + "learning_rate": 0.0002537824175725233, + "loss": 7.7472, + "step": 17998 + }, + { + "epoch": 1.6794811980964823, + "grad_norm": 0.6980272316102218, + "learning_rate": 0.00025377695631376694, + "loss": 7.5157, + "step": 17999 + }, + { + "epoch": 1.6795745077913595, + "grad_norm": 1.33958807510046, + "learning_rate": 0.00025377149479113647, + "loss": 6.9938, + "step": 18000 + }, + { + "epoch": 1.6796678174862367, + "grad_norm": 0.6392180400679575, + "learning_rate": 0.0002537660330046459, + "loss": 7.5461, + "step": 18001 + }, + { + "epoch": 1.6797611271811141, + "grad_norm": 0.8778439948660994, + "learning_rate": 0.00025376057095430896, + "loss": 7.4255, + "step": 18002 + }, + { + "epoch": 1.6798544368759916, + "grad_norm": 202740.60634344764, + "learning_rate": 0.0002537551086401397, + "loss": 7.0807, + "step": 18003 + }, + { + "epoch": 1.6799477465708688, + "grad_norm": 1.5705233429522918, + "learning_rate": 0.00025374964606215186, + "loss": 7.3932, + "step": 18004 + }, + { + "epoch": 1.680041056265746, + "grad_norm": 29858.97109867952, + "learning_rate": 0.00025374418322035943, + "loss": 7.334, + "step": 18005 + }, + { + "epoch": 1.6801343659606234, + "grad_norm": 1.480631279010039, + "learning_rate": 0.00025373872011477634, + "loss": 7.4135, + "step": 18006 + }, + { + "epoch": 1.6802276756555006, + "grad_norm": 1.1834543453651316, + "learning_rate": 0.00025373325674541634, + "loss": 7.4307, + "step": 18007 + }, + { + "epoch": 1.6803209853503778, + "grad_norm": 1.323056274467929, + "learning_rate": 0.00025372779311229343, + "loss": 7.5322, + "step": 18008 + }, + { + "epoch": 1.6804142950452552, + "grad_norm": 1.7009465653131441, + "learning_rate": 0.0002537223292154214, + "loss": 7.1068, + "step": 18009 + }, + { + "epoch": 1.6805076047401326, + "grad_norm": 0.8242631142175993, + "learning_rate": 0.0002537168650548143, + "loss": 7.5052, + "step": 18010 + }, + { + "epoch": 1.6806009144350098, + "grad_norm": 0.9331614494227672, + "learning_rate": 0.0002537114006304859, + "loss": 7.4031, + "step": 18011 + }, + { + "epoch": 1.680694224129887, + "grad_norm": 0.8029846864831873, + "learning_rate": 0.0002537059359424502, + "loss": 7.1291, + "step": 18012 + }, + { + "epoch": 1.6807875338247644, + "grad_norm": 1.8452649832703856, + "learning_rate": 0.0002537004709907209, + "loss": 7.4401, + "step": 18013 + }, + { + "epoch": 1.6808808435196418, + "grad_norm": 1.661229915199752, + "learning_rate": 0.0002536950057753121, + "loss": 7.3485, + "step": 18014 + }, + { + "epoch": 1.680974153214519, + "grad_norm": 2.103082700283801, + "learning_rate": 0.00025368954029623763, + "loss": 7.6355, + "step": 18015 + }, + { + "epoch": 1.6810674629093962, + "grad_norm": 3.7873828818601534, + "learning_rate": 0.0002536840745535113, + "loss": 7.5523, + "step": 18016 + }, + { + "epoch": 1.6811607726042737, + "grad_norm": 1.0002126947922325, + "learning_rate": 0.00025367860854714716, + "loss": 7.2422, + "step": 18017 + }, + { + "epoch": 1.6812540822991509, + "grad_norm": 11993.012302783647, + "learning_rate": 0.000253673142277159, + "loss": 7.4595, + "step": 18018 + }, + { + "epoch": 1.681347391994028, + "grad_norm": 1.259572767835827, + "learning_rate": 0.0002536676757435607, + "loss": 7.5692, + "step": 18019 + }, + { + "epoch": 1.6814407016889055, + "grad_norm": 0.9920964124570509, + "learning_rate": 0.00025366220894636627, + "loss": 7.6383, + "step": 18020 + }, + { + "epoch": 1.681534011383783, + "grad_norm": 2.2323514003069733, + "learning_rate": 0.0002536567418855895, + "loss": 7.1368, + "step": 18021 + }, + { + "epoch": 1.68162732107866, + "grad_norm": 1.449912040166969, + "learning_rate": 0.0002536512745612444, + "loss": 7.3643, + "step": 18022 + }, + { + "epoch": 1.6817206307735373, + "grad_norm": 1.310676153998569, + "learning_rate": 0.0002536458069733448, + "loss": 7.524, + "step": 18023 + }, + { + "epoch": 1.6818139404684147, + "grad_norm": 1.024298042476568, + "learning_rate": 0.00025364033912190457, + "loss": 7.5852, + "step": 18024 + }, + { + "epoch": 1.6819072501632921, + "grad_norm": 1.1014669470462284, + "learning_rate": 0.00025363487100693767, + "loss": 7.2311, + "step": 18025 + }, + { + "epoch": 1.682000559858169, + "grad_norm": 259.8793991411631, + "learning_rate": 0.00025362940262845803, + "loss": 7.3992, + "step": 18026 + }, + { + "epoch": 1.6820938695530465, + "grad_norm": 0.8315102129386911, + "learning_rate": 0.00025362393398647954, + "loss": 7.5231, + "step": 18027 + }, + { + "epoch": 1.682187179247924, + "grad_norm": 0.8231366338159772, + "learning_rate": 0.000253618465081016, + "loss": 7.2802, + "step": 18028 + }, + { + "epoch": 1.6822804889428011, + "grad_norm": 0.8171589350914177, + "learning_rate": 0.00025361299591208144, + "loss": 7.3598, + "step": 18029 + }, + { + "epoch": 1.6823737986376783, + "grad_norm": 1.0050037543930193, + "learning_rate": 0.00025360752647968975, + "loss": 7.4366, + "step": 18030 + }, + { + "epoch": 1.6824671083325557, + "grad_norm": 0.7088511207493862, + "learning_rate": 0.00025360205678385476, + "loss": 7.3892, + "step": 18031 + }, + { + "epoch": 1.6825604180274332, + "grad_norm": 1.0978351546370646, + "learning_rate": 0.00025359658682459047, + "loss": 7.5849, + "step": 18032 + }, + { + "epoch": 1.6826537277223104, + "grad_norm": 0.8744760903104647, + "learning_rate": 0.00025359111660191075, + "loss": 7.4654, + "step": 18033 + }, + { + "epoch": 1.6827470374171876, + "grad_norm": 1.1920429517588753, + "learning_rate": 0.0002535856461158294, + "loss": 7.4771, + "step": 18034 + }, + { + "epoch": 1.682840347112065, + "grad_norm": 1.147437774865186, + "learning_rate": 0.0002535801753663606, + "loss": 7.3201, + "step": 18035 + }, + { + "epoch": 1.6829336568069424, + "grad_norm": 0.7607458422620291, + "learning_rate": 0.00025357470435351796, + "loss": 7.1545, + "step": 18036 + }, + { + "epoch": 1.6830269665018194, + "grad_norm": 1.875239788077411, + "learning_rate": 0.0002535692330773156, + "loss": 7.564, + "step": 18037 + }, + { + "epoch": 1.6831202761966968, + "grad_norm": 0.6950008394819897, + "learning_rate": 0.00025356376153776736, + "loss": 6.9149, + "step": 18038 + }, + { + "epoch": 1.6832135858915742, + "grad_norm": 0.6980460621681269, + "learning_rate": 0.00025355828973488715, + "loss": 7.0706, + "step": 18039 + }, + { + "epoch": 1.6833068955864514, + "grad_norm": 0.8736814457215284, + "learning_rate": 0.0002535528176686889, + "loss": 7.2404, + "step": 18040 + }, + { + "epoch": 1.6834002052813286, + "grad_norm": 0.7036940544355351, + "learning_rate": 0.0002535473453391864, + "loss": 7.175, + "step": 18041 + }, + { + "epoch": 1.683493514976206, + "grad_norm": 26.589458032856946, + "learning_rate": 0.0002535418727463938, + "loss": 7.5884, + "step": 18042 + }, + { + "epoch": 1.6835868246710834, + "grad_norm": 2098.7217980214646, + "learning_rate": 0.00025353639989032485, + "loss": 7.3094, + "step": 18043 + }, + { + "epoch": 1.6836801343659606, + "grad_norm": 0.697160495574867, + "learning_rate": 0.00025353092677099346, + "loss": 7.4201, + "step": 18044 + }, + { + "epoch": 1.6837734440608378, + "grad_norm": 3335.376449530762, + "learning_rate": 0.00025352545338841367, + "loss": 7.1922, + "step": 18045 + }, + { + "epoch": 1.6838667537557153, + "grad_norm": 0.7356592556224266, + "learning_rate": 0.0002535199797425993, + "loss": 7.3628, + "step": 18046 + }, + { + "epoch": 1.6839600634505927, + "grad_norm": 0.8649891954999209, + "learning_rate": 0.00025351450583356426, + "loss": 7.1644, + "step": 18047 + }, + { + "epoch": 1.6840533731454697, + "grad_norm": 1.9971940702822946, + "learning_rate": 0.00025350903166132245, + "loss": 7.7034, + "step": 18048 + }, + { + "epoch": 1.684146682840347, + "grad_norm": 0.7247112532124279, + "learning_rate": 0.0002535035572258879, + "loss": 6.9366, + "step": 18049 + }, + { + "epoch": 1.6842399925352245, + "grad_norm": 1.3950056321733089, + "learning_rate": 0.00025349808252727446, + "loss": 7.3312, + "step": 18050 + }, + { + "epoch": 1.6843333022301017, + "grad_norm": 1.207675207293704, + "learning_rate": 0.00025349260756549603, + "loss": 7.4731, + "step": 18051 + }, + { + "epoch": 1.684426611924979, + "grad_norm": 1.1353172126322257, + "learning_rate": 0.0002534871323405666, + "loss": 7.3799, + "step": 18052 + }, + { + "epoch": 1.6845199216198563, + "grad_norm": 0.4936244227620769, + "learning_rate": 0.00025348165685249997, + "loss": 6.9234, + "step": 18053 + }, + { + "epoch": 1.6846132313147337, + "grad_norm": 0.4734114611588019, + "learning_rate": 0.0002534761811013102, + "loss": 7.1423, + "step": 18054 + }, + { + "epoch": 1.684706541009611, + "grad_norm": 0.7623514731909944, + "learning_rate": 0.0002534707050870111, + "loss": 7.4519, + "step": 18055 + }, + { + "epoch": 1.6847998507044881, + "grad_norm": 0.45059690560467786, + "learning_rate": 0.00025346522880961673, + "loss": 7.223, + "step": 18056 + }, + { + "epoch": 1.6848931603993655, + "grad_norm": 0.4461629178390795, + "learning_rate": 0.0002534597522691409, + "loss": 7.4004, + "step": 18057 + }, + { + "epoch": 1.6849864700942427, + "grad_norm": 0.5068666188759327, + "learning_rate": 0.00025345427546559756, + "loss": 7.0816, + "step": 18058 + }, + { + "epoch": 1.68507977978912, + "grad_norm": 0.43651013095409175, + "learning_rate": 0.00025344879839900064, + "loss": 7.3139, + "step": 18059 + }, + { + "epoch": 1.6851730894839974, + "grad_norm": 1.1501214793647623, + "learning_rate": 0.000253443321069364, + "loss": 7.0161, + "step": 18060 + }, + { + "epoch": 1.6852663991788748, + "grad_norm": 0.6091302272592626, + "learning_rate": 0.00025343784347670175, + "loss": 7.1776, + "step": 18061 + }, + { + "epoch": 1.685359708873752, + "grad_norm": 0.7862412333847116, + "learning_rate": 0.00025343236562102764, + "loss": 7.3222, + "step": 18062 + }, + { + "epoch": 1.6854530185686292, + "grad_norm": 0.42640973646962743, + "learning_rate": 0.0002534268875023557, + "loss": 7.0599, + "step": 18063 + }, + { + "epoch": 1.6855463282635066, + "grad_norm": 0.5853111402243851, + "learning_rate": 0.0002534214091206998, + "loss": 7.4396, + "step": 18064 + }, + { + "epoch": 1.685639637958384, + "grad_norm": 37889.203982312116, + "learning_rate": 0.0002534159304760739, + "loss": 6.9844, + "step": 18065 + }, + { + "epoch": 1.6857329476532612, + "grad_norm": 0.7457231405948984, + "learning_rate": 0.00025341045156849194, + "loss": 7.0349, + "step": 18066 + }, + { + "epoch": 1.6858262573481384, + "grad_norm": 0.43876434892740107, + "learning_rate": 0.0002534049723979678, + "loss": 7.0345, + "step": 18067 + }, + { + "epoch": 1.6859195670430158, + "grad_norm": 0.5531428023065144, + "learning_rate": 0.00025339949296451547, + "loss": 7.034, + "step": 18068 + }, + { + "epoch": 1.686012876737893, + "grad_norm": 1.0263198444898802, + "learning_rate": 0.00025339401326814883, + "loss": 7.4746, + "step": 18069 + }, + { + "epoch": 1.6861061864327702, + "grad_norm": 0.5026600741110574, + "learning_rate": 0.00025338853330888187, + "loss": 6.8729, + "step": 18070 + }, + { + "epoch": 1.6861994961276476, + "grad_norm": 1.0586671887388666, + "learning_rate": 0.00025338305308672846, + "loss": 7.4956, + "step": 18071 + }, + { + "epoch": 1.686292805822525, + "grad_norm": 0.9475417796566938, + "learning_rate": 0.0002533775726017026, + "loss": 7.1855, + "step": 18072 + }, + { + "epoch": 1.6863861155174023, + "grad_norm": 1.1783808344176654, + "learning_rate": 0.0002533720918538182, + "loss": 7.3726, + "step": 18073 + }, + { + "epoch": 1.6864794252122794, + "grad_norm": 0.6518937435472499, + "learning_rate": 0.0002533666108430892, + "loss": 7.0107, + "step": 18074 + }, + { + "epoch": 1.6865727349071569, + "grad_norm": 0.5591149200874488, + "learning_rate": 0.0002533611295695295, + "loss": 7.0374, + "step": 18075 + }, + { + "epoch": 1.6866660446020343, + "grad_norm": 0.6526462024644814, + "learning_rate": 0.0002533556480331531, + "loss": 7.195, + "step": 18076 + }, + { + "epoch": 1.6867593542969115, + "grad_norm": 460.63513463206556, + "learning_rate": 0.00025335016623397386, + "loss": 7.3565, + "step": 18077 + }, + { + "epoch": 1.6868526639917887, + "grad_norm": 0.5683628383822952, + "learning_rate": 0.0002533446841720058, + "loss": 7.3419, + "step": 18078 + }, + { + "epoch": 1.686945973686666, + "grad_norm": 0.4703859648837409, + "learning_rate": 0.0002533392018472628, + "loss": 7.2408, + "step": 18079 + }, + { + "epoch": 1.6870392833815433, + "grad_norm": 0.9287390376119948, + "learning_rate": 0.0002533337192597588, + "loss": 7.2377, + "step": 18080 + }, + { + "epoch": 1.6871325930764205, + "grad_norm": 0.8072218064897461, + "learning_rate": 0.0002533282364095078, + "loss": 7.6025, + "step": 18081 + }, + { + "epoch": 1.687225902771298, + "grad_norm": 2.0141619253935596, + "learning_rate": 0.0002533227532965237, + "loss": 7.174, + "step": 18082 + }, + { + "epoch": 1.6873192124661753, + "grad_norm": 1.3998143955081639, + "learning_rate": 0.0002533172699208204, + "loss": 7.2665, + "step": 18083 + }, + { + "epoch": 1.6874125221610525, + "grad_norm": 1.1309099975885601, + "learning_rate": 0.00025331178628241194, + "loss": 7.3579, + "step": 18084 + }, + { + "epoch": 1.6875058318559297, + "grad_norm": 1.1476501934413748, + "learning_rate": 0.00025330630238131217, + "loss": 7.2951, + "step": 18085 + }, + { + "epoch": 1.6875991415508071, + "grad_norm": 1.0491635586459485, + "learning_rate": 0.0002533008182175351, + "loss": 7.0574, + "step": 18086 + }, + { + "epoch": 1.6876924512456846, + "grad_norm": 0.4347756237613803, + "learning_rate": 0.00025329533379109465, + "loss": 7.3642, + "step": 18087 + }, + { + "epoch": 1.6877857609405618, + "grad_norm": 0.4328721734259831, + "learning_rate": 0.00025328984910200477, + "loss": 7.2446, + "step": 18088 + }, + { + "epoch": 1.687879070635439, + "grad_norm": 3645.370718565347, + "learning_rate": 0.0002532843641502794, + "loss": 7.2408, + "step": 18089 + }, + { + "epoch": 1.6879723803303164, + "grad_norm": 0.4627540438691221, + "learning_rate": 0.0002532788789359325, + "loss": 7.234, + "step": 18090 + }, + { + "epoch": 1.6880656900251936, + "grad_norm": 19272.3765657209, + "learning_rate": 0.00025327339345897795, + "loss": 7.3396, + "step": 18091 + }, + { + "epoch": 1.6881589997200708, + "grad_norm": 3538.7851878073816, + "learning_rate": 0.0002532679077194298, + "loss": 6.9592, + "step": 18092 + }, + { + "epoch": 1.6882523094149482, + "grad_norm": 0.4826220236716845, + "learning_rate": 0.00025326242171730195, + "loss": 7.0853, + "step": 18093 + }, + { + "epoch": 1.6883456191098256, + "grad_norm": 0.614134691400847, + "learning_rate": 0.0002532569354526083, + "loss": 6.9584, + "step": 18094 + }, + { + "epoch": 1.6884389288047028, + "grad_norm": 0.5169902879341448, + "learning_rate": 0.0002532514489253629, + "loss": 7.0884, + "step": 18095 + }, + { + "epoch": 1.68853223849958, + "grad_norm": 1.669185458297572, + "learning_rate": 0.00025324596213557963, + "loss": 7.7049, + "step": 18096 + }, + { + "epoch": 1.6886255481944574, + "grad_norm": 0.8421781525674629, + "learning_rate": 0.0002532404750832725, + "loss": 7.1684, + "step": 18097 + }, + { + "epoch": 1.6887188578893348, + "grad_norm": 1.0311857283656174, + "learning_rate": 0.0002532349877684554, + "loss": 7.4726, + "step": 18098 + }, + { + "epoch": 1.688812167584212, + "grad_norm": 0.8656147546673212, + "learning_rate": 0.00025322950019114225, + "loss": 7.3832, + "step": 18099 + }, + { + "epoch": 1.6889054772790892, + "grad_norm": 3126.4287447905967, + "learning_rate": 0.00025322401235134713, + "loss": 7.4093, + "step": 18100 + }, + { + "epoch": 1.6889987869739667, + "grad_norm": 5562.29176532806, + "learning_rate": 0.00025321852424908386, + "loss": 7.287, + "step": 18101 + }, + { + "epoch": 1.6890920966688439, + "grad_norm": 1.1962635114283775, + "learning_rate": 0.0002532130358843665, + "loss": 7.0544, + "step": 18102 + }, + { + "epoch": 1.689185406363721, + "grad_norm": 0.8414300692930126, + "learning_rate": 0.000253207547257209, + "loss": 7.191, + "step": 18103 + }, + { + "epoch": 1.6892787160585985, + "grad_norm": 0.766132057459042, + "learning_rate": 0.0002532020583676252, + "loss": 7.3351, + "step": 18104 + }, + { + "epoch": 1.689372025753476, + "grad_norm": 0.5818387405837872, + "learning_rate": 0.0002531965692156292, + "loss": 7.3188, + "step": 18105 + }, + { + "epoch": 1.689465335448353, + "grad_norm": 0.6932843929195371, + "learning_rate": 0.0002531910798012349, + "loss": 7.1963, + "step": 18106 + }, + { + "epoch": 1.6895586451432303, + "grad_norm": 1.098752619327632, + "learning_rate": 0.0002531855901244562, + "loss": 7.0624, + "step": 18107 + }, + { + "epoch": 1.6896519548381077, + "grad_norm": 0.8706459519130414, + "learning_rate": 0.00025318010018530714, + "loss": 7.6551, + "step": 18108 + }, + { + "epoch": 1.6897452645329851, + "grad_norm": 1.5785371661243837, + "learning_rate": 0.0002531746099838017, + "loss": 7.2441, + "step": 18109 + }, + { + "epoch": 1.6898385742278623, + "grad_norm": 1.5082055228265616, + "learning_rate": 0.00025316911951995364, + "loss": 7.4913, + "step": 18110 + }, + { + "epoch": 1.6899318839227395, + "grad_norm": 1.2938943002966727, + "learning_rate": 0.0002531636287937772, + "loss": 7.6639, + "step": 18111 + }, + { + "epoch": 1.690025193617617, + "grad_norm": 2.436023400633954, + "learning_rate": 0.0002531581378052862, + "loss": 7.1154, + "step": 18112 + }, + { + "epoch": 1.6901185033124941, + "grad_norm": 1.3316828864833492, + "learning_rate": 0.00025315264655449455, + "loss": 7.4878, + "step": 18113 + }, + { + "epoch": 1.6902118130073713, + "grad_norm": 1.795159702213673, + "learning_rate": 0.00025314715504141634, + "loss": 7.2854, + "step": 18114 + }, + { + "epoch": 1.6903051227022488, + "grad_norm": 1.393844172573371, + "learning_rate": 0.00025314166326606546, + "loss": 7.2153, + "step": 18115 + }, + { + "epoch": 1.6903984323971262, + "grad_norm": 0.8295771614517342, + "learning_rate": 0.0002531361712284559, + "loss": 7.3105, + "step": 18116 + }, + { + "epoch": 1.6904917420920034, + "grad_norm": 1.1647453874645906, + "learning_rate": 0.00025313067892860155, + "loss": 7.2305, + "step": 18117 + }, + { + "epoch": 1.6905850517868806, + "grad_norm": 1.413798330056186, + "learning_rate": 0.00025312518636651644, + "loss": 7.2957, + "step": 18118 + }, + { + "epoch": 1.690678361481758, + "grad_norm": 2.00068811340592, + "learning_rate": 0.00025311969354221454, + "loss": 7.6529, + "step": 18119 + }, + { + "epoch": 1.6907716711766354, + "grad_norm": 1.384509514139745, + "learning_rate": 0.0002531142004557098, + "loss": 7.3723, + "step": 18120 + }, + { + "epoch": 1.6908649808715126, + "grad_norm": 1.038738975755746, + "learning_rate": 0.0002531087071070162, + "loss": 7.4536, + "step": 18121 + }, + { + "epoch": 1.6909582905663898, + "grad_norm": 0.8418421910032858, + "learning_rate": 0.0002531032134961477, + "loss": 7.4778, + "step": 18122 + }, + { + "epoch": 1.6910516002612672, + "grad_norm": 1.0218191524147344, + "learning_rate": 0.0002530977196231183, + "loss": 7.5347, + "step": 18123 + }, + { + "epoch": 1.6911449099561444, + "grad_norm": 1.0294855716790052, + "learning_rate": 0.00025309222548794197, + "loss": 7.1861, + "step": 18124 + }, + { + "epoch": 1.6912382196510216, + "grad_norm": 1.0184953589403334, + "learning_rate": 0.00025308673109063255, + "loss": 7.4874, + "step": 18125 + }, + { + "epoch": 1.691331529345899, + "grad_norm": 0.5916065795266772, + "learning_rate": 0.00025308123643120415, + "loss": 7.0461, + "step": 18126 + }, + { + "epoch": 1.6914248390407765, + "grad_norm": 0.762152910266669, + "learning_rate": 0.00025307574150967073, + "loss": 7.2858, + "step": 18127 + }, + { + "epoch": 1.6915181487356536, + "grad_norm": 1.0151315236781153, + "learning_rate": 0.0002530702463260462, + "loss": 7.3926, + "step": 18128 + }, + { + "epoch": 1.6916114584305308, + "grad_norm": 0.8270136166800011, + "learning_rate": 0.00025306475088034456, + "loss": 7.2303, + "step": 18129 + }, + { + "epoch": 1.6917047681254083, + "grad_norm": 1.2544856857962645, + "learning_rate": 0.0002530592551725798, + "loss": 7.4899, + "step": 18130 + }, + { + "epoch": 1.6917980778202857, + "grad_norm": 0.9754971225985336, + "learning_rate": 0.0002530537592027659, + "loss": 7.3513, + "step": 18131 + }, + { + "epoch": 1.6918913875151627, + "grad_norm": 1.0698925854015444, + "learning_rate": 0.00025304826297091676, + "loss": 7.6722, + "step": 18132 + }, + { + "epoch": 1.69198469721004, + "grad_norm": 1.0241059053965447, + "learning_rate": 0.00025304276647704647, + "loss": 7.246, + "step": 18133 + }, + { + "epoch": 1.6920780069049175, + "grad_norm": 0.9939947378756578, + "learning_rate": 0.0002530372697211689, + "loss": 7.427, + "step": 18134 + }, + { + "epoch": 1.6921713165997947, + "grad_norm": 1.202366827870444, + "learning_rate": 0.0002530317727032981, + "loss": 7.1818, + "step": 18135 + }, + { + "epoch": 1.692264626294672, + "grad_norm": 0.8551407975559614, + "learning_rate": 0.00025302627542344805, + "loss": 7.3915, + "step": 18136 + }, + { + "epoch": 1.6923579359895493, + "grad_norm": 441385.0795609836, + "learning_rate": 0.0002530207778816327, + "loss": 7.2847, + "step": 18137 + }, + { + "epoch": 1.6924512456844267, + "grad_norm": 0.9424981768940338, + "learning_rate": 0.000253015280077866, + "loss": 7.2282, + "step": 18138 + }, + { + "epoch": 1.692544555379304, + "grad_norm": 1.21513130253891, + "learning_rate": 0.00025300978201216195, + "loss": 7.4613, + "step": 18139 + }, + { + "epoch": 1.6926378650741811, + "grad_norm": 1.334267784705976, + "learning_rate": 0.00025300428368453455, + "loss": 7.36, + "step": 18140 + }, + { + "epoch": 1.6927311747690585, + "grad_norm": 1.2382991768439295, + "learning_rate": 0.00025299878509499774, + "loss": 7.3618, + "step": 18141 + }, + { + "epoch": 1.692824484463936, + "grad_norm": 0.7193096547422457, + "learning_rate": 0.0002529932862435656, + "loss": 7.2728, + "step": 18142 + }, + { + "epoch": 1.692917794158813, + "grad_norm": 0.8404923792804025, + "learning_rate": 0.000252987787130252, + "loss": 7.0823, + "step": 18143 + }, + { + "epoch": 1.6930111038536904, + "grad_norm": 0.8875472785214346, + "learning_rate": 0.0002529822877550709, + "loss": 7.3402, + "step": 18144 + }, + { + "epoch": 1.6931044135485678, + "grad_norm": 1.052970184805687, + "learning_rate": 0.0002529767881180365, + "loss": 6.9833, + "step": 18145 + }, + { + "epoch": 1.693197723243445, + "grad_norm": 1.0827513089033112, + "learning_rate": 0.0002529712882191625, + "loss": 7.0495, + "step": 18146 + }, + { + "epoch": 1.6932910329383222, + "grad_norm": 1378.1725409361127, + "learning_rate": 0.00025296578805846306, + "loss": 6.9792, + "step": 18147 + }, + { + "epoch": 1.6933843426331996, + "grad_norm": 0.9288996349050279, + "learning_rate": 0.00025296028763595216, + "loss": 7.1507, + "step": 18148 + }, + { + "epoch": 1.693477652328077, + "grad_norm": 1.2916605955684826, + "learning_rate": 0.0002529547869516437, + "loss": 7.1107, + "step": 18149 + }, + { + "epoch": 1.6935709620229542, + "grad_norm": 1.9785409612105873, + "learning_rate": 0.00025294928600555174, + "loss": 7.4331, + "step": 18150 + }, + { + "epoch": 1.6936642717178314, + "grad_norm": 1.8562675794672054, + "learning_rate": 0.0002529437847976902, + "loss": 7.3487, + "step": 18151 + }, + { + "epoch": 1.6937575814127088, + "grad_norm": 1.9062811959590016, + "learning_rate": 0.00025293828332807316, + "loss": 7.5082, + "step": 18152 + }, + { + "epoch": 1.6938508911075862, + "grad_norm": 1.6207834340248808, + "learning_rate": 0.0002529327815967146, + "loss": 7.5578, + "step": 18153 + }, + { + "epoch": 1.6939442008024632, + "grad_norm": 0.6679429750025755, + "learning_rate": 0.00025292727960362835, + "loss": 7.145, + "step": 18154 + }, + { + "epoch": 1.6940375104973406, + "grad_norm": 1.0064669562756257, + "learning_rate": 0.00025292177734882856, + "loss": 7.1115, + "step": 18155 + }, + { + "epoch": 1.694130820192218, + "grad_norm": 0.7102893447814497, + "learning_rate": 0.00025291627483232926, + "loss": 7.7494, + "step": 18156 + }, + { + "epoch": 1.6942241298870953, + "grad_norm": 1.1837842972277643, + "learning_rate": 0.00025291077205414426, + "loss": 7.4578, + "step": 18157 + }, + { + "epoch": 1.6943174395819725, + "grad_norm": 1.2320719114067336, + "learning_rate": 0.00025290526901428767, + "loss": 7.4567, + "step": 18158 + }, + { + "epoch": 1.6944107492768499, + "grad_norm": 4.970655999053291, + "learning_rate": 0.00025289976571277347, + "loss": 7.2669, + "step": 18159 + }, + { + "epoch": 1.6945040589717273, + "grad_norm": 0.8583113928338153, + "learning_rate": 0.00025289426214961564, + "loss": 7.7507, + "step": 18160 + }, + { + "epoch": 1.6945973686666045, + "grad_norm": 1.0977677309975076, + "learning_rate": 0.00025288875832482817, + "loss": 7.0746, + "step": 18161 + }, + { + "epoch": 1.6946906783614817, + "grad_norm": 0.9927426765294484, + "learning_rate": 0.0002528832542384251, + "loss": 6.9831, + "step": 18162 + }, + { + "epoch": 1.694783988056359, + "grad_norm": 0.6618183859459841, + "learning_rate": 0.00025287774989042036, + "loss": 7.2997, + "step": 18163 + }, + { + "epoch": 1.6948772977512363, + "grad_norm": 1.1592720128173575, + "learning_rate": 0.000252872245280828, + "loss": 7.1625, + "step": 18164 + }, + { + "epoch": 1.6949706074461135, + "grad_norm": 0.527443773653759, + "learning_rate": 0.000252866740409662, + "loss": 6.9867, + "step": 18165 + }, + { + "epoch": 1.695063917140991, + "grad_norm": 1.2807468978387422, + "learning_rate": 0.00025286123527693634, + "loss": 7.3052, + "step": 18166 + }, + { + "epoch": 1.6951572268358683, + "grad_norm": 0.9175142381477351, + "learning_rate": 0.000252855729882665, + "loss": 7.3762, + "step": 18167 + }, + { + "epoch": 1.6952505365307455, + "grad_norm": 0.7059227928000142, + "learning_rate": 0.00025285022422686206, + "loss": 7.6488, + "step": 18168 + }, + { + "epoch": 1.6953438462256227, + "grad_norm": 0.9096248636759658, + "learning_rate": 0.00025284471830954147, + "loss": 7.6401, + "step": 18169 + }, + { + "epoch": 1.6954371559205001, + "grad_norm": 1598.7023355942076, + "learning_rate": 0.00025283921213071716, + "loss": 7.1286, + "step": 18170 + }, + { + "epoch": 1.6955304656153776, + "grad_norm": 1350.6544082074615, + "learning_rate": 0.00025283370569040325, + "loss": 7.367, + "step": 18171 + }, + { + "epoch": 1.6956237753102548, + "grad_norm": 1423.0259358368037, + "learning_rate": 0.0002528281989886137, + "loss": 7.5103, + "step": 18172 + }, + { + "epoch": 1.695717085005132, + "grad_norm": 1.372977312645762, + "learning_rate": 0.0002528226920253624, + "loss": 7.0541, + "step": 18173 + }, + { + "epoch": 1.6958103947000094, + "grad_norm": 0.5316027593996776, + "learning_rate": 0.00025281718480066356, + "loss": 7.1505, + "step": 18174 + }, + { + "epoch": 1.6959037043948866, + "grad_norm": 0.6124917680442536, + "learning_rate": 0.00025281167731453104, + "loss": 7.2978, + "step": 18175 + }, + { + "epoch": 1.6959970140897638, + "grad_norm": 2.0464153712901405, + "learning_rate": 0.0002528061695669789, + "loss": 7.948, + "step": 18176 + }, + { + "epoch": 1.6960903237846412, + "grad_norm": 1.0804332801181735, + "learning_rate": 0.00025280066155802106, + "loss": 7.3555, + "step": 18177 + }, + { + "epoch": 1.6961836334795186, + "grad_norm": 1.0181827925654732, + "learning_rate": 0.0002527951532876716, + "loss": 7.4096, + "step": 18178 + }, + { + "epoch": 1.6962769431743958, + "grad_norm": 0.426196117385857, + "learning_rate": 0.00025278964475594455, + "loss": 7.4234, + "step": 18179 + }, + { + "epoch": 1.696370252869273, + "grad_norm": 0.842496286581183, + "learning_rate": 0.0002527841359628539, + "loss": 7.1524, + "step": 18180 + }, + { + "epoch": 1.6964635625641504, + "grad_norm": 1378.3839344582234, + "learning_rate": 0.00025277862690841363, + "loss": 7.1872, + "step": 18181 + }, + { + "epoch": 1.6965568722590278, + "grad_norm": 0.9844474903570094, + "learning_rate": 0.00025277311759263775, + "loss": 7.4348, + "step": 18182 + }, + { + "epoch": 1.696650181953905, + "grad_norm": 0.9156956665849276, + "learning_rate": 0.0002527676080155403, + "loss": 7.5981, + "step": 18183 + }, + { + "epoch": 1.6967434916487822, + "grad_norm": 0.7680758489277657, + "learning_rate": 0.0002527620981771352, + "loss": 7.696, + "step": 18184 + }, + { + "epoch": 1.6968368013436597, + "grad_norm": 1.0772174295095138, + "learning_rate": 0.00025275658807743655, + "loss": 7.3423, + "step": 18185 + }, + { + "epoch": 1.6969301110385369, + "grad_norm": 1.3474460498333172, + "learning_rate": 0.0002527510777164584, + "loss": 7.3251, + "step": 18186 + }, + { + "epoch": 1.697023420733414, + "grad_norm": 0.9463993269786742, + "learning_rate": 0.0002527455670942146, + "loss": 7.347, + "step": 18187 + }, + { + "epoch": 1.6971167304282915, + "grad_norm": 0.7873144138577461, + "learning_rate": 0.0002527400562107193, + "loss": 7.2994, + "step": 18188 + }, + { + "epoch": 1.697210040123169, + "grad_norm": 0.49669580095049853, + "learning_rate": 0.0002527345450659865, + "loss": 7.335, + "step": 18189 + }, + { + "epoch": 1.697303349818046, + "grad_norm": 0.5451289799137752, + "learning_rate": 0.00025272903366003015, + "loss": 7.1335, + "step": 18190 + }, + { + "epoch": 1.6973966595129233, + "grad_norm": 606.5761540225146, + "learning_rate": 0.0002527235219928643, + "loss": 7.3131, + "step": 18191 + }, + { + "epoch": 1.6974899692078007, + "grad_norm": 1110.0168628670897, + "learning_rate": 0.0002527180100645029, + "loss": 7.2681, + "step": 18192 + }, + { + "epoch": 1.6975832789026781, + "grad_norm": 1.4911726806256194, + "learning_rate": 0.0002527124978749601, + "loss": 7.3731, + "step": 18193 + }, + { + "epoch": 1.6976765885975553, + "grad_norm": 1.1590216001037636, + "learning_rate": 0.0002527069854242499, + "loss": 7.1935, + "step": 18194 + }, + { + "epoch": 1.6977698982924325, + "grad_norm": 1.7922321281412403, + "learning_rate": 0.00025270147271238614, + "loss": 7.7008, + "step": 18195 + }, + { + "epoch": 1.69786320798731, + "grad_norm": 0.6170032541571803, + "learning_rate": 0.00025269595973938306, + "loss": 7.3613, + "step": 18196 + }, + { + "epoch": 1.6979565176821871, + "grad_norm": 0.6271777137558359, + "learning_rate": 0.0002526904465052545, + "loss": 7.0081, + "step": 18197 + }, + { + "epoch": 1.6980498273770643, + "grad_norm": 0.5622231805395813, + "learning_rate": 0.0002526849330100146, + "loss": 7.3731, + "step": 18198 + }, + { + "epoch": 1.6981431370719418, + "grad_norm": 0.5613902393630829, + "learning_rate": 0.0002526794192536773, + "loss": 7.4293, + "step": 18199 + }, + { + "epoch": 1.6982364467668192, + "grad_norm": 0.8188232274100404, + "learning_rate": 0.0002526739052362567, + "loss": 7.104, + "step": 18200 + }, + { + "epoch": 1.6983297564616964, + "grad_norm": 0.5528687523749404, + "learning_rate": 0.00025266839095776676, + "loss": 7.3536, + "step": 18201 + }, + { + "epoch": 1.6984230661565736, + "grad_norm": 0.9036494140486002, + "learning_rate": 0.0002526628764182215, + "loss": 7.1638, + "step": 18202 + }, + { + "epoch": 1.698516375851451, + "grad_norm": 0.7619712580258916, + "learning_rate": 0.00025265736161763497, + "loss": 7.1715, + "step": 18203 + }, + { + "epoch": 1.6986096855463284, + "grad_norm": 0.9662438485178704, + "learning_rate": 0.00025265184655602114, + "loss": 7.3805, + "step": 18204 + }, + { + "epoch": 1.6987029952412056, + "grad_norm": 1.0624047673507802, + "learning_rate": 0.0002526463312333941, + "loss": 7.355, + "step": 18205 + }, + { + "epoch": 1.6987963049360828, + "grad_norm": 1.4193874893261171, + "learning_rate": 0.0002526408156497678, + "loss": 7.6195, + "step": 18206 + }, + { + "epoch": 1.6988896146309602, + "grad_norm": 1.1853208185048845, + "learning_rate": 0.0002526352998051563, + "loss": 7.3272, + "step": 18207 + }, + { + "epoch": 1.6989829243258374, + "grad_norm": 0.5305654697216146, + "learning_rate": 0.00025262978369957375, + "loss": 7.2311, + "step": 18208 + }, + { + "epoch": 1.6990762340207146, + "grad_norm": 0.7433889332092813, + "learning_rate": 0.000252624267333034, + "loss": 7.06, + "step": 18209 + }, + { + "epoch": 1.699169543715592, + "grad_norm": 560.8211964002028, + "learning_rate": 0.00025261875070555113, + "loss": 6.9647, + "step": 18210 + }, + { + "epoch": 1.6992628534104695, + "grad_norm": 0.8377449128781721, + "learning_rate": 0.00025261323381713915, + "loss": 7.4202, + "step": 18211 + }, + { + "epoch": 1.6993561631053467, + "grad_norm": 1.5109587105953937, + "learning_rate": 0.00025260771666781213, + "loss": 7.1547, + "step": 18212 + }, + { + "epoch": 1.6994494728002238, + "grad_norm": 0.8235931331393818, + "learning_rate": 0.0002526021992575841, + "loss": 7.4168, + "step": 18213 + }, + { + "epoch": 1.6995427824951013, + "grad_norm": 1.2198642556024943, + "learning_rate": 0.00025259668158646913, + "loss": 7.2364, + "step": 18214 + }, + { + "epoch": 1.6996360921899787, + "grad_norm": 273.3632312430823, + "learning_rate": 0.0002525911636544811, + "loss": 6.8935, + "step": 18215 + }, + { + "epoch": 1.6997294018848559, + "grad_norm": 0.6021422107073142, + "learning_rate": 0.00025258564546163417, + "loss": 7.1561, + "step": 18216 + }, + { + "epoch": 1.699822711579733, + "grad_norm": 1.0664354672463194, + "learning_rate": 0.00025258012700794233, + "loss": 7.329, + "step": 18217 + }, + { + "epoch": 1.6999160212746105, + "grad_norm": 1.1482885267943501, + "learning_rate": 0.0002525746082934195, + "loss": 7.2607, + "step": 18218 + }, + { + "epoch": 1.7000093309694877, + "grad_norm": 1176.0635343608997, + "learning_rate": 0.00025256908931808, + "loss": 7.188, + "step": 18219 + }, + { + "epoch": 1.700102640664365, + "grad_norm": 1.0139422336146044, + "learning_rate": 0.00025256357008193763, + "loss": 7.1407, + "step": 18220 + }, + { + "epoch": 1.7001959503592423, + "grad_norm": 0.45966924657117275, + "learning_rate": 0.00025255805058500646, + "loss": 7.0979, + "step": 18221 + }, + { + "epoch": 1.7002892600541197, + "grad_norm": 262.0371509644504, + "learning_rate": 0.00025255253082730054, + "loss": 7.3563, + "step": 18222 + }, + { + "epoch": 1.700382569748997, + "grad_norm": 1.1443394364811448, + "learning_rate": 0.0002525470108088339, + "loss": 7.0092, + "step": 18223 + }, + { + "epoch": 1.7004758794438741, + "grad_norm": 0.6592506495499517, + "learning_rate": 0.0002525414905296206, + "loss": 7.4244, + "step": 18224 + }, + { + "epoch": 1.7005691891387515, + "grad_norm": 0.6159595826468904, + "learning_rate": 0.0002525359699896747, + "loss": 7.6369, + "step": 18225 + }, + { + "epoch": 1.700662498833629, + "grad_norm": 0.9546403744825164, + "learning_rate": 0.0002525304491890102, + "loss": 7.4044, + "step": 18226 + }, + { + "epoch": 1.7007558085285062, + "grad_norm": 2.748976003558889, + "learning_rate": 0.0002525249281276411, + "loss": 7.2724, + "step": 18227 + }, + { + "epoch": 1.7008491182233834, + "grad_norm": 1.3673399381973017, + "learning_rate": 0.0002525194068055815, + "loss": 7.285, + "step": 18228 + }, + { + "epoch": 1.7009424279182608, + "grad_norm": 1.197993897337591, + "learning_rate": 0.0002525138852228454, + "loss": 7.229, + "step": 18229 + }, + { + "epoch": 1.701035737613138, + "grad_norm": 0.6962096107749957, + "learning_rate": 0.0002525083633794469, + "loss": 7.4079, + "step": 18230 + }, + { + "epoch": 1.7011290473080152, + "grad_norm": 0.5167972502591565, + "learning_rate": 0.00025250284127539997, + "loss": 7.3238, + "step": 18231 + }, + { + "epoch": 1.7012223570028926, + "grad_norm": 520.4603705346597, + "learning_rate": 0.00025249731891071865, + "loss": 7.3921, + "step": 18232 + }, + { + "epoch": 1.70131566669777, + "grad_norm": 1.0005973999545716, + "learning_rate": 0.0002524917962854171, + "loss": 7.5671, + "step": 18233 + }, + { + "epoch": 1.7014089763926472, + "grad_norm": 0.684514430571277, + "learning_rate": 0.00025248627339950915, + "loss": 7.1726, + "step": 18234 + }, + { + "epoch": 1.7015022860875244, + "grad_norm": 0.6724015433850401, + "learning_rate": 0.00025248075025300905, + "loss": 7.2113, + "step": 18235 + }, + { + "epoch": 1.7015955957824018, + "grad_norm": 158.19134570402449, + "learning_rate": 0.00025247522684593076, + "loss": 7.4333, + "step": 18236 + }, + { + "epoch": 1.7016889054772792, + "grad_norm": 0.5454551086325287, + "learning_rate": 0.0002524697031782883, + "loss": 7.2249, + "step": 18237 + }, + { + "epoch": 1.7017822151721562, + "grad_norm": 0.6115858766987692, + "learning_rate": 0.0002524641792500957, + "loss": 7.0558, + "step": 18238 + }, + { + "epoch": 1.7018755248670336, + "grad_norm": 0.8616401840010577, + "learning_rate": 0.0002524586550613671, + "loss": 7.398, + "step": 18239 + }, + { + "epoch": 1.701968834561911, + "grad_norm": 1.4942239427246347, + "learning_rate": 0.0002524531306121165, + "loss": 7.6864, + "step": 18240 + }, + { + "epoch": 1.7020621442567883, + "grad_norm": 0.5102288489299014, + "learning_rate": 0.0002524476059023579, + "loss": 6.872, + "step": 18241 + }, + { + "epoch": 1.7021554539516655, + "grad_norm": 0.5910431639747978, + "learning_rate": 0.0002524420809321054, + "loss": 7.0024, + "step": 18242 + }, + { + "epoch": 1.7022487636465429, + "grad_norm": 0.5606527641039913, + "learning_rate": 0.00025243655570137297, + "loss": 6.933, + "step": 18243 + }, + { + "epoch": 1.7023420733414203, + "grad_norm": 0.6509389259453493, + "learning_rate": 0.0002524310302101748, + "loss": 7.2499, + "step": 18244 + }, + { + "epoch": 1.7024353830362975, + "grad_norm": 1.0378057163914085, + "learning_rate": 0.00025242550445852484, + "loss": 7.4422, + "step": 18245 + }, + { + "epoch": 1.7025286927311747, + "grad_norm": 648.2335238222071, + "learning_rate": 0.00025241997844643714, + "loss": 7.1164, + "step": 18246 + }, + { + "epoch": 1.702622002426052, + "grad_norm": 1.1212228285445303, + "learning_rate": 0.0002524144521739258, + "loss": 7.7848, + "step": 18247 + }, + { + "epoch": 1.7027153121209295, + "grad_norm": 0.6002295847114256, + "learning_rate": 0.00025240892564100484, + "loss": 7.1015, + "step": 18248 + }, + { + "epoch": 1.7028086218158065, + "grad_norm": 2124.686665862442, + "learning_rate": 0.0002524033988476884, + "loss": 7.2184, + "step": 18249 + }, + { + "epoch": 1.702901931510684, + "grad_norm": 0.808061192757872, + "learning_rate": 0.0002523978717939903, + "loss": 7.3187, + "step": 18250 + }, + { + "epoch": 1.7029952412055613, + "grad_norm": 0.6733260881443764, + "learning_rate": 0.00025239234447992483, + "loss": 7.2464, + "step": 18251 + }, + { + "epoch": 1.7030885509004385, + "grad_norm": 0.8146039650381732, + "learning_rate": 0.000252386816905506, + "loss": 7.0663, + "step": 18252 + }, + { + "epoch": 1.7031818605953157, + "grad_norm": 1.1037311873361262, + "learning_rate": 0.0002523812890707478, + "loss": 7.86, + "step": 18253 + }, + { + "epoch": 1.7032751702901932, + "grad_norm": 0.8543945370744044, + "learning_rate": 0.0002523757609756642, + "loss": 7.073, + "step": 18254 + }, + { + "epoch": 1.7033684799850706, + "grad_norm": 0.7447489373105731, + "learning_rate": 0.0002523702326202695, + "loss": 7.3425, + "step": 18255 + }, + { + "epoch": 1.7034617896799478, + "grad_norm": 0.8867723629404377, + "learning_rate": 0.00025236470400457755, + "loss": 7.4335, + "step": 18256 + }, + { + "epoch": 1.703555099374825, + "grad_norm": 0.584363900684414, + "learning_rate": 0.0002523591751286025, + "loss": 7.1682, + "step": 18257 + }, + { + "epoch": 1.7036484090697024, + "grad_norm": 0.8822799710642621, + "learning_rate": 0.0002523536459923584, + "loss": 7.3424, + "step": 18258 + }, + { + "epoch": 1.7037417187645798, + "grad_norm": 0.9009497109256295, + "learning_rate": 0.0002523481165958593, + "loss": 6.899, + "step": 18259 + }, + { + "epoch": 1.7038350284594568, + "grad_norm": 0.9863606676907364, + "learning_rate": 0.00025234258693911923, + "loss": 6.8627, + "step": 18260 + }, + { + "epoch": 1.7039283381543342, + "grad_norm": 1.1190484017961413, + "learning_rate": 0.0002523370570221523, + "loss": 6.969, + "step": 18261 + }, + { + "epoch": 1.7040216478492116, + "grad_norm": 0.9684776282983837, + "learning_rate": 0.0002523315268449725, + "loss": 7.2197, + "step": 18262 + }, + { + "epoch": 1.7041149575440888, + "grad_norm": 1.1813588772495098, + "learning_rate": 0.00025232599640759404, + "loss": 7.0235, + "step": 18263 + }, + { + "epoch": 1.704208267238966, + "grad_norm": 2648.6636058536574, + "learning_rate": 0.00025232046571003076, + "loss": 6.9923, + "step": 18264 + }, + { + "epoch": 1.7043015769338434, + "grad_norm": 0.7805977002041571, + "learning_rate": 0.00025231493475229695, + "loss": 7.6345, + "step": 18265 + }, + { + "epoch": 1.7043948866287209, + "grad_norm": 0.957307013910371, + "learning_rate": 0.0002523094035344065, + "loss": 6.7576, + "step": 18266 + }, + { + "epoch": 1.704488196323598, + "grad_norm": 0.8760047386757539, + "learning_rate": 0.00025230387205637356, + "loss": 7.3727, + "step": 18267 + }, + { + "epoch": 1.7045815060184752, + "grad_norm": 1089.0753816585018, + "learning_rate": 0.0002522983403182122, + "loss": 7.4366, + "step": 18268 + }, + { + "epoch": 1.7046748157133527, + "grad_norm": 1.1161603560971938, + "learning_rate": 0.0002522928083199364, + "loss": 7.58, + "step": 18269 + }, + { + "epoch": 1.7047681254082299, + "grad_norm": 1.2230912688873168, + "learning_rate": 0.00025228727606156035, + "loss": 7.2273, + "step": 18270 + }, + { + "epoch": 1.704861435103107, + "grad_norm": 1.0002054430515908, + "learning_rate": 0.00025228174354309806, + "loss": 7.3444, + "step": 18271 + }, + { + "epoch": 1.7049547447979845, + "grad_norm": 600.0939184444488, + "learning_rate": 0.00025227621076456363, + "loss": 7.2414, + "step": 18272 + }, + { + "epoch": 1.705048054492862, + "grad_norm": 0.7799058900138893, + "learning_rate": 0.000252270677725971, + "loss": 7.5228, + "step": 18273 + }, + { + "epoch": 1.705141364187739, + "grad_norm": 0.9340536141063991, + "learning_rate": 0.0002522651444273344, + "loss": 7.563, + "step": 18274 + }, + { + "epoch": 1.7052346738826163, + "grad_norm": 1.8665786402988762, + "learning_rate": 0.00025225961086866784, + "loss": 7.7944, + "step": 18275 + }, + { + "epoch": 1.7053279835774937, + "grad_norm": 1.1553334697549964, + "learning_rate": 0.0002522540770499853, + "loss": 7.4569, + "step": 18276 + }, + { + "epoch": 1.7054212932723711, + "grad_norm": 591.4803599848035, + "learning_rate": 0.000252248542971301, + "loss": 7.2173, + "step": 18277 + }, + { + "epoch": 1.7055146029672483, + "grad_norm": 0.5866844811014267, + "learning_rate": 0.00025224300863262893, + "loss": 7.059, + "step": 18278 + }, + { + "epoch": 1.7056079126621255, + "grad_norm": 0.6983082724440117, + "learning_rate": 0.00025223747403398317, + "loss": 7.3826, + "step": 18279 + }, + { + "epoch": 1.705701222357003, + "grad_norm": 0.5660916824221579, + "learning_rate": 0.00025223193917537777, + "loss": 7.1943, + "step": 18280 + }, + { + "epoch": 1.7057945320518801, + "grad_norm": 0.5787029332504939, + "learning_rate": 0.0002522264040568269, + "loss": 7.313, + "step": 18281 + }, + { + "epoch": 1.7058878417467573, + "grad_norm": 0.5838654761687244, + "learning_rate": 0.0002522208686783445, + "loss": 7.3308, + "step": 18282 + }, + { + "epoch": 1.7059811514416348, + "grad_norm": 2698.8252562215607, + "learning_rate": 0.0002522153330399447, + "loss": 7.174, + "step": 18283 + }, + { + "epoch": 1.7060744611365122, + "grad_norm": 1.062367713819649, + "learning_rate": 0.0002522097971416417, + "loss": 7.1222, + "step": 18284 + }, + { + "epoch": 1.7061677708313894, + "grad_norm": 1.2303845065371963, + "learning_rate": 0.00025220426098344933, + "loss": 7.2591, + "step": 18285 + }, + { + "epoch": 1.7062610805262666, + "grad_norm": 0.8076686056679694, + "learning_rate": 0.00025219872456538184, + "loss": 7.3112, + "step": 18286 + }, + { + "epoch": 1.706354390221144, + "grad_norm": 0.7829491362669114, + "learning_rate": 0.0002521931878874533, + "loss": 7.3954, + "step": 18287 + }, + { + "epoch": 1.7064476999160214, + "grad_norm": 1.0539749026872678, + "learning_rate": 0.00025218765094967775, + "loss": 7.0899, + "step": 18288 + }, + { + "epoch": 1.7065410096108986, + "grad_norm": 0.8344728820667603, + "learning_rate": 0.0002521821137520692, + "loss": 7.5608, + "step": 18289 + }, + { + "epoch": 1.7066343193057758, + "grad_norm": 0.7456830680848308, + "learning_rate": 0.00025217657629464195, + "loss": 7.2752, + "step": 18290 + }, + { + "epoch": 1.7067276290006532, + "grad_norm": 0.7648580417896639, + "learning_rate": 0.0002521710385774098, + "loss": 7.3649, + "step": 18291 + }, + { + "epoch": 1.7068209386955304, + "grad_norm": 0.8448786154309401, + "learning_rate": 0.00025216550060038704, + "loss": 7.601, + "step": 18292 + }, + { + "epoch": 1.7069142483904076, + "grad_norm": 0.743339748175449, + "learning_rate": 0.00025215996236358765, + "loss": 7.4412, + "step": 18293 + }, + { + "epoch": 1.707007558085285, + "grad_norm": 0.8607229126863916, + "learning_rate": 0.0002521544238670257, + "loss": 7.3207, + "step": 18294 + }, + { + "epoch": 1.7071008677801625, + "grad_norm": 0.9410186214218333, + "learning_rate": 0.00025214888511071535, + "loss": 6.9106, + "step": 18295 + }, + { + "epoch": 1.7071941774750397, + "grad_norm": 1.024240800716699, + "learning_rate": 0.00025214334609467064, + "loss": 7.4383, + "step": 18296 + }, + { + "epoch": 1.7072874871699169, + "grad_norm": 1.4007187173806768, + "learning_rate": 0.0002521378068189057, + "loss": 7.494, + "step": 18297 + }, + { + "epoch": 1.7073807968647943, + "grad_norm": 1.141672037381532, + "learning_rate": 0.0002521322672834345, + "loss": 7.0265, + "step": 18298 + }, + { + "epoch": 1.7074741065596717, + "grad_norm": 7953.805384172909, + "learning_rate": 0.00025212672748827126, + "loss": 7.6875, + "step": 18299 + }, + { + "epoch": 1.7075674162545489, + "grad_norm": 0.8531732896394983, + "learning_rate": 0.00025212118743342995, + "loss": 6.9733, + "step": 18300 + }, + { + "epoch": 1.707660725949426, + "grad_norm": 1.0075288048641036, + "learning_rate": 0.0002521156471189247, + "loss": 7.4612, + "step": 18301 + }, + { + "epoch": 1.7077540356443035, + "grad_norm": 8700.513850042524, + "learning_rate": 0.00025211010654476964, + "loss": 7.0099, + "step": 18302 + }, + { + "epoch": 1.7078473453391807, + "grad_norm": 3153.8820527160815, + "learning_rate": 0.0002521045657109789, + "loss": 7.4897, + "step": 18303 + }, + { + "epoch": 1.707940655034058, + "grad_norm": 1.2581392793996202, + "learning_rate": 0.0002520990246175664, + "loss": 7.0334, + "step": 18304 + }, + { + "epoch": 1.7080339647289353, + "grad_norm": 450.4642899709077, + "learning_rate": 0.00025209348326454633, + "loss": 7.3387, + "step": 18305 + }, + { + "epoch": 1.7081272744238127, + "grad_norm": 1.0069359091665946, + "learning_rate": 0.0002520879416519328, + "loss": 7.373, + "step": 18306 + }, + { + "epoch": 1.70822058411869, + "grad_norm": 1.037493832915976, + "learning_rate": 0.00025208239977973985, + "loss": 7.3983, + "step": 18307 + }, + { + "epoch": 1.7083138938135671, + "grad_norm": 0.9372055903776694, + "learning_rate": 0.0002520768576479816, + "loss": 6.9971, + "step": 18308 + }, + { + "epoch": 1.7084072035084445, + "grad_norm": 1.5791174239866694, + "learning_rate": 0.00025207131525667215, + "loss": 7.5673, + "step": 18309 + }, + { + "epoch": 1.708500513203322, + "grad_norm": 1.316256503354723, + "learning_rate": 0.0002520657726058256, + "loss": 7.8434, + "step": 18310 + }, + { + "epoch": 1.7085938228981992, + "grad_norm": 0.9660791910190523, + "learning_rate": 0.000252060229695456, + "loss": 7.4699, + "step": 18311 + }, + { + "epoch": 1.7086871325930764, + "grad_norm": 1.2258526003184163, + "learning_rate": 0.00025205468652557747, + "loss": 7.474, + "step": 18312 + }, + { + "epoch": 1.7087804422879538, + "grad_norm": 1.323897909664205, + "learning_rate": 0.00025204914309620417, + "loss": 7.3192, + "step": 18313 + }, + { + "epoch": 1.708873751982831, + "grad_norm": 30555.25550453592, + "learning_rate": 0.00025204359940735006, + "loss": 7.5901, + "step": 18314 + }, + { + "epoch": 1.7089670616777082, + "grad_norm": 153362.4999806971, + "learning_rate": 0.0002520380554590293, + "loss": 7.351, + "step": 18315 + }, + { + "epoch": 1.7090603713725856, + "grad_norm": 1.1659208219029953, + "learning_rate": 0.00025203251125125597, + "loss": 7.529, + "step": 18316 + }, + { + "epoch": 1.709153681067463, + "grad_norm": 19779.824970926227, + "learning_rate": 0.00025202696678404423, + "loss": 7.9291, + "step": 18317 + }, + { + "epoch": 1.7092469907623402, + "grad_norm": 7106.970349093363, + "learning_rate": 0.0002520214220574081, + "loss": 7.4307, + "step": 18318 + }, + { + "epoch": 1.7093403004572174, + "grad_norm": 1.6440998458688783, + "learning_rate": 0.0002520158770713618, + "loss": 7.6333, + "step": 18319 + }, + { + "epoch": 1.7094336101520948, + "grad_norm": 0.870135060997217, + "learning_rate": 0.0002520103318259192, + "loss": 7.0756, + "step": 18320 + }, + { + "epoch": 1.7095269198469722, + "grad_norm": 0.971384371351223, + "learning_rate": 0.00025200478632109467, + "loss": 7.4961, + "step": 18321 + }, + { + "epoch": 1.7096202295418494, + "grad_norm": 1.187838573386111, + "learning_rate": 0.0002519992405569022, + "loss": 7.2309, + "step": 18322 + }, + { + "epoch": 1.7097135392367266, + "grad_norm": 1.6496651676452363, + "learning_rate": 0.0002519936945333558, + "loss": 7.2085, + "step": 18323 + }, + { + "epoch": 1.709806848931604, + "grad_norm": 1.1781877458354526, + "learning_rate": 0.0002519881482504696, + "loss": 7.2018, + "step": 18324 + }, + { + "epoch": 1.7099001586264813, + "grad_norm": 0.6687084433520456, + "learning_rate": 0.00025198260170825786, + "loss": 7.3898, + "step": 18325 + }, + { + "epoch": 1.7099934683213585, + "grad_norm": 1.3047316804122104, + "learning_rate": 0.0002519770549067345, + "loss": 7.5466, + "step": 18326 + }, + { + "epoch": 1.7100867780162359, + "grad_norm": 0.8015013391061341, + "learning_rate": 0.00025197150784591374, + "loss": 7.4768, + "step": 18327 + }, + { + "epoch": 1.7101800877111133, + "grad_norm": 0.8042385000931379, + "learning_rate": 0.00025196596052580965, + "loss": 7.1732, + "step": 18328 + }, + { + "epoch": 1.7102733974059905, + "grad_norm": 0.8523329671752885, + "learning_rate": 0.00025196041294643634, + "loss": 7.4905, + "step": 18329 + }, + { + "epoch": 1.7103667071008677, + "grad_norm": 0.7577722446670558, + "learning_rate": 0.0002519548651078078, + "loss": 7.1977, + "step": 18330 + }, + { + "epoch": 1.710460016795745, + "grad_norm": 0.9169307153945649, + "learning_rate": 0.00025194931700993833, + "loss": 7.553, + "step": 18331 + }, + { + "epoch": 1.7105533264906225, + "grad_norm": 0.6626512387109286, + "learning_rate": 0.0002519437686528419, + "loss": 7.3182, + "step": 18332 + }, + { + "epoch": 1.7106466361854997, + "grad_norm": 4846.254983169434, + "learning_rate": 0.0002519382200365327, + "loss": 7.0927, + "step": 18333 + }, + { + "epoch": 1.710739945880377, + "grad_norm": 0.9286014018638233, + "learning_rate": 0.00025193267116102483, + "loss": 7.5499, + "step": 18334 + }, + { + "epoch": 1.7108332555752543, + "grad_norm": 1.1921466334263846, + "learning_rate": 0.0002519271220263323, + "loss": 7.1818, + "step": 18335 + }, + { + "epoch": 1.7109265652701315, + "grad_norm": 0.96278473213607, + "learning_rate": 0.00025192157263246933, + "loss": 7.5732, + "step": 18336 + }, + { + "epoch": 1.7110198749650087, + "grad_norm": 0.6816309394768792, + "learning_rate": 0.00025191602297944997, + "loss": 7.3949, + "step": 18337 + }, + { + "epoch": 1.7111131846598862, + "grad_norm": 0.9062064569129616, + "learning_rate": 0.0002519104730672884, + "loss": 7.3056, + "step": 18338 + }, + { + "epoch": 1.7112064943547636, + "grad_norm": 2.346041917201016, + "learning_rate": 0.0002519049228959986, + "loss": 7.3238, + "step": 18339 + }, + { + "epoch": 1.7112998040496408, + "grad_norm": 1.1282342930377103, + "learning_rate": 0.00025189937246559486, + "loss": 7.2543, + "step": 18340 + }, + { + "epoch": 1.711393113744518, + "grad_norm": 1.3048815825527977, + "learning_rate": 0.0002518938217760912, + "loss": 7.0875, + "step": 18341 + }, + { + "epoch": 1.7114864234393954, + "grad_norm": 1.973094814016034, + "learning_rate": 0.00025188827082750167, + "loss": 7.4471, + "step": 18342 + }, + { + "epoch": 1.7115797331342728, + "grad_norm": 0.9472890162026892, + "learning_rate": 0.0002518827196198405, + "loss": 7.211, + "step": 18343 + }, + { + "epoch": 1.7116730428291498, + "grad_norm": 273.1083082622386, + "learning_rate": 0.0002518771681531217, + "loss": 7.3426, + "step": 18344 + }, + { + "epoch": 1.7117663525240272, + "grad_norm": 1.766869801142552, + "learning_rate": 0.00025187161642735943, + "loss": 7.4579, + "step": 18345 + }, + { + "epoch": 1.7118596622189046, + "grad_norm": 1.4838822144901027, + "learning_rate": 0.0002518660644425679, + "loss": 7.385, + "step": 18346 + }, + { + "epoch": 1.7119529719137818, + "grad_norm": 1.1973519132317647, + "learning_rate": 0.00025186051219876107, + "loss": 7.2551, + "step": 18347 + }, + { + "epoch": 1.712046281608659, + "grad_norm": 0.787311673230825, + "learning_rate": 0.00025185495969595316, + "loss": 7.2989, + "step": 18348 + }, + { + "epoch": 1.7121395913035364, + "grad_norm": 1.8840211607790704, + "learning_rate": 0.0002518494069341583, + "loss": 7.2352, + "step": 18349 + }, + { + "epoch": 1.7122329009984139, + "grad_norm": 0.822093145804154, + "learning_rate": 0.0002518438539133905, + "loss": 7.2497, + "step": 18350 + }, + { + "epoch": 1.712326210693291, + "grad_norm": 1.3325165850023721, + "learning_rate": 0.000251838300633664, + "loss": 7.0026, + "step": 18351 + }, + { + "epoch": 1.7124195203881682, + "grad_norm": 1.1389103316757583, + "learning_rate": 0.0002518327470949928, + "loss": 7.4805, + "step": 18352 + }, + { + "epoch": 1.7125128300830457, + "grad_norm": 0.876432499878781, + "learning_rate": 0.00025182719329739115, + "loss": 7.0878, + "step": 18353 + }, + { + "epoch": 1.712606139777923, + "grad_norm": 248.62388043914757, + "learning_rate": 0.0002518216392408731, + "loss": 7.5853, + "step": 18354 + }, + { + "epoch": 1.7126994494728, + "grad_norm": 1.246165014197901, + "learning_rate": 0.0002518160849254528, + "loss": 7.0704, + "step": 18355 + }, + { + "epoch": 1.7127927591676775, + "grad_norm": 0.9547709351781944, + "learning_rate": 0.00025181053035114434, + "loss": 7.6478, + "step": 18356 + }, + { + "epoch": 1.712886068862555, + "grad_norm": 0.9547076917057139, + "learning_rate": 0.0002518049755179619, + "loss": 7.604, + "step": 18357 + }, + { + "epoch": 1.712979378557432, + "grad_norm": 1.2754529049764325, + "learning_rate": 0.0002517994204259194, + "loss": 7.4615, + "step": 18358 + }, + { + "epoch": 1.7130726882523093, + "grad_norm": 1.587535616119354, + "learning_rate": 0.0002517938650750313, + "loss": 7.4514, + "step": 18359 + }, + { + "epoch": 1.7131659979471867, + "grad_norm": 1.5985943694182323, + "learning_rate": 0.0002517883094653115, + "loss": 7.8199, + "step": 18360 + }, + { + "epoch": 1.7132593076420641, + "grad_norm": 340.5678443293705, + "learning_rate": 0.00025178275359677416, + "loss": 7.1667, + "step": 18361 + }, + { + "epoch": 1.7133526173369413, + "grad_norm": 121.66967639095559, + "learning_rate": 0.0002517771974694335, + "loss": 7.8806, + "step": 18362 + }, + { + "epoch": 1.7134459270318185, + "grad_norm": 2.722340069092929, + "learning_rate": 0.0002517716410833035, + "loss": 7.7494, + "step": 18363 + }, + { + "epoch": 1.713539236726696, + "grad_norm": 1.0209308411047868, + "learning_rate": 0.0002517660844383984, + "loss": 7.5111, + "step": 18364 + }, + { + "epoch": 1.7136325464215734, + "grad_norm": 1.4919661676467546, + "learning_rate": 0.0002517605275347323, + "loss": 7.3157, + "step": 18365 + }, + { + "epoch": 1.7137258561164503, + "grad_norm": 245.90589612536226, + "learning_rate": 0.0002517549703723193, + "loss": 7.5074, + "step": 18366 + }, + { + "epoch": 1.7138191658113278, + "grad_norm": 2.0944858636350854, + "learning_rate": 0.00025174941295117353, + "loss": 7.2755, + "step": 18367 + }, + { + "epoch": 1.7139124755062052, + "grad_norm": 1.4869813438440709, + "learning_rate": 0.00025174385527130914, + "loss": 7.3029, + "step": 18368 + }, + { + "epoch": 1.7140057852010824, + "grad_norm": 232.0975846080976, + "learning_rate": 0.0002517382973327403, + "loss": 7.6733, + "step": 18369 + }, + { + "epoch": 1.7140990948959596, + "grad_norm": 1.0990227390416794, + "learning_rate": 0.00025173273913548113, + "loss": 7.2793, + "step": 18370 + }, + { + "epoch": 1.714192404590837, + "grad_norm": 6.587019979962056, + "learning_rate": 0.0002517271806795457, + "loss": 7.5624, + "step": 18371 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 1.0662633148155194, + "learning_rate": 0.0002517216219649482, + "loss": 7.1438, + "step": 18372 + }, + { + "epoch": 1.7143790239805916, + "grad_norm": 2.229152408838462, + "learning_rate": 0.0002517160629917027, + "loss": 7.4166, + "step": 18373 + }, + { + "epoch": 1.7144723336754688, + "grad_norm": 178.32663997657477, + "learning_rate": 0.00025171050375982345, + "loss": 7.347, + "step": 18374 + }, + { + "epoch": 1.7145656433703462, + "grad_norm": 0.9781836390012321, + "learning_rate": 0.0002517049442693245, + "loss": 7.304, + "step": 18375 + }, + { + "epoch": 1.7146589530652234, + "grad_norm": 1.2820493800576622, + "learning_rate": 0.00025169938452022, + "loss": 7.5145, + "step": 18376 + }, + { + "epoch": 1.7147522627601006, + "grad_norm": 1.0653467657474114, + "learning_rate": 0.00025169382451252406, + "loss": 7.2333, + "step": 18377 + }, + { + "epoch": 1.714845572454978, + "grad_norm": 1.3767594021144143, + "learning_rate": 0.0002516882642462509, + "loss": 7.4357, + "step": 18378 + }, + { + "epoch": 1.7149388821498555, + "grad_norm": 0.9051276339139497, + "learning_rate": 0.0002516827037214146, + "loss": 7.3311, + "step": 18379 + }, + { + "epoch": 1.7150321918447327, + "grad_norm": 1.3734001167102279, + "learning_rate": 0.00025167714293802926, + "loss": 7.5478, + "step": 18380 + }, + { + "epoch": 1.7151255015396099, + "grad_norm": 107.65400618782729, + "learning_rate": 0.00025167158189610907, + "loss": 7.4119, + "step": 18381 + }, + { + "epoch": 1.7152188112344873, + "grad_norm": 1.0920961756630856, + "learning_rate": 0.00025166602059566824, + "loss": 7.2826, + "step": 18382 + }, + { + "epoch": 1.7153121209293647, + "grad_norm": 0.8227294957015451, + "learning_rate": 0.00025166045903672076, + "loss": 7.369, + "step": 18383 + }, + { + "epoch": 1.715405430624242, + "grad_norm": 0.9478764405012673, + "learning_rate": 0.0002516548972192809, + "loss": 7.4397, + "step": 18384 + }, + { + "epoch": 1.715498740319119, + "grad_norm": 1.7219586543077494, + "learning_rate": 0.0002516493351433627, + "loss": 7.6111, + "step": 18385 + }, + { + "epoch": 1.7155920500139965, + "grad_norm": 4.655851493136943, + "learning_rate": 0.00025164377280898035, + "loss": 7.404, + "step": 18386 + }, + { + "epoch": 1.7156853597088737, + "grad_norm": 149.09388246964278, + "learning_rate": 0.000251638210216148, + "loss": 7.2012, + "step": 18387 + }, + { + "epoch": 1.715778669403751, + "grad_norm": 0.9237496843465915, + "learning_rate": 0.0002516326473648798, + "loss": 7.3846, + "step": 18388 + }, + { + "epoch": 1.7158719790986283, + "grad_norm": 483.71140178691286, + "learning_rate": 0.0002516270842551899, + "loss": 7.2777, + "step": 18389 + }, + { + "epoch": 1.7159652887935057, + "grad_norm": 0.7493626489766014, + "learning_rate": 0.00025162152088709235, + "loss": 7.1188, + "step": 18390 + }, + { + "epoch": 1.716058598488383, + "grad_norm": 2.778505287427693, + "learning_rate": 0.00025161595726060147, + "loss": 7.6555, + "step": 18391 + }, + { + "epoch": 1.7161519081832601, + "grad_norm": 64.56292696436103, + "learning_rate": 0.0002516103933757313, + "loss": 7.4718, + "step": 18392 + }, + { + "epoch": 1.7162452178781376, + "grad_norm": 0.7169264210095431, + "learning_rate": 0.000251604829232496, + "loss": 7.5106, + "step": 18393 + }, + { + "epoch": 1.716338527573015, + "grad_norm": 2.016788078200999, + "learning_rate": 0.00025159926483090967, + "loss": 7.0546, + "step": 18394 + }, + { + "epoch": 1.7164318372678922, + "grad_norm": 192.8493075708025, + "learning_rate": 0.00025159370017098655, + "loss": 7.4064, + "step": 18395 + }, + { + "epoch": 1.7165251469627694, + "grad_norm": 2.3585915755543896, + "learning_rate": 0.0002515881352527407, + "loss": 7.7841, + "step": 18396 + }, + { + "epoch": 1.7166184566576468, + "grad_norm": 2.516693961545421, + "learning_rate": 0.0002515825700761864, + "loss": 7.0278, + "step": 18397 + }, + { + "epoch": 1.716711766352524, + "grad_norm": 0.6267272528420521, + "learning_rate": 0.0002515770046413376, + "loss": 7.3598, + "step": 18398 + }, + { + "epoch": 1.7168050760474012, + "grad_norm": 40.54788809821075, + "learning_rate": 0.00025157143894820864, + "loss": 7.3781, + "step": 18399 + }, + { + "epoch": 1.7168983857422786, + "grad_norm": 0.9740320670261267, + "learning_rate": 0.0002515658729968136, + "loss": 7.471, + "step": 18400 + }, + { + "epoch": 1.716991695437156, + "grad_norm": 4.953504543822543, + "learning_rate": 0.0002515603067871666, + "loss": 7.301, + "step": 18401 + }, + { + "epoch": 1.7170850051320332, + "grad_norm": 0.7768625387557424, + "learning_rate": 0.0002515547403192818, + "loss": 7.044, + "step": 18402 + }, + { + "epoch": 1.7171783148269104, + "grad_norm": 1.037543569476739, + "learning_rate": 0.0002515491735931734, + "loss": 7.2028, + "step": 18403 + }, + { + "epoch": 1.7172716245217878, + "grad_norm": 6.430089382937791, + "learning_rate": 0.0002515436066088556, + "loss": 7.6987, + "step": 18404 + }, + { + "epoch": 1.7173649342166653, + "grad_norm": 0.7632485421715531, + "learning_rate": 0.0002515380393663424, + "loss": 7.2007, + "step": 18405 + }, + { + "epoch": 1.7174582439115424, + "grad_norm": 508.6946131261352, + "learning_rate": 0.00025153247186564807, + "loss": 7.2847, + "step": 18406 + }, + { + "epoch": 1.7175515536064196, + "grad_norm": 1.141973469221454, + "learning_rate": 0.0002515269041067868, + "loss": 7.6082, + "step": 18407 + }, + { + "epoch": 1.717644863301297, + "grad_norm": 203.4137092516782, + "learning_rate": 0.0002515213360897726, + "loss": 7.3401, + "step": 18408 + }, + { + "epoch": 1.7177381729961743, + "grad_norm": 144.9504210407182, + "learning_rate": 0.0002515157678146198, + "loss": 7.5026, + "step": 18409 + }, + { + "epoch": 1.7178314826910515, + "grad_norm": 1.259789373518455, + "learning_rate": 0.00025151019928134244, + "loss": 7.2967, + "step": 18410 + }, + { + "epoch": 1.7179247923859289, + "grad_norm": 265.95308039607784, + "learning_rate": 0.0002515046304899546, + "loss": 7.519, + "step": 18411 + }, + { + "epoch": 1.7180181020808063, + "grad_norm": 0.8610181770923514, + "learning_rate": 0.0002514990614404707, + "loss": 7.4649, + "step": 18412 + }, + { + "epoch": 1.7181114117756835, + "grad_norm": 1.1841464852880386, + "learning_rate": 0.0002514934921329047, + "loss": 7.4976, + "step": 18413 + }, + { + "epoch": 1.7182047214705607, + "grad_norm": 0.5549418264401687, + "learning_rate": 0.00025148792256727084, + "loss": 7.397, + "step": 18414 + }, + { + "epoch": 1.7182980311654381, + "grad_norm": 0.9593797017660506, + "learning_rate": 0.00025148235274358323, + "loss": 7.2945, + "step": 18415 + }, + { + "epoch": 1.7183913408603155, + "grad_norm": 0.9278154143032679, + "learning_rate": 0.00025147678266185605, + "loss": 7.5561, + "step": 18416 + }, + { + "epoch": 1.7184846505551927, + "grad_norm": 1.205716215555942, + "learning_rate": 0.00025147121232210344, + "loss": 7.3767, + "step": 18417 + }, + { + "epoch": 1.71857796025007, + "grad_norm": 1.371696154939736, + "learning_rate": 0.00025146564172433966, + "loss": 7.2034, + "step": 18418 + }, + { + "epoch": 1.7186712699449473, + "grad_norm": 0.7811862805562855, + "learning_rate": 0.0002514600708685788, + "loss": 7.3913, + "step": 18419 + }, + { + "epoch": 1.7187645796398245, + "grad_norm": 2.063788282671407, + "learning_rate": 0.00025145449975483503, + "loss": 7.2528, + "step": 18420 + }, + { + "epoch": 1.7188578893347017, + "grad_norm": 0.6349761958695096, + "learning_rate": 0.0002514489283831225, + "loss": 7.4054, + "step": 18421 + }, + { + "epoch": 1.7189511990295792, + "grad_norm": 1.0129033133203957, + "learning_rate": 0.0002514433567534554, + "loss": 7.424, + "step": 18422 + }, + { + "epoch": 1.7190445087244566, + "grad_norm": 0.6105392761258118, + "learning_rate": 0.0002514377848658479, + "loss": 7.2574, + "step": 18423 + }, + { + "epoch": 1.7191378184193338, + "grad_norm": 1.1276772399186374, + "learning_rate": 0.0002514322127203142, + "loss": 7.4355, + "step": 18424 + }, + { + "epoch": 1.719231128114211, + "grad_norm": 0.8194562271267978, + "learning_rate": 0.0002514266403168684, + "loss": 7.1329, + "step": 18425 + }, + { + "epoch": 1.7193244378090884, + "grad_norm": 1.1092400122346662, + "learning_rate": 0.0002514210676555247, + "loss": 7.2459, + "step": 18426 + }, + { + "epoch": 1.7194177475039658, + "grad_norm": 0.7825727462830441, + "learning_rate": 0.00025141549473629724, + "loss": 7.0909, + "step": 18427 + }, + { + "epoch": 1.719511057198843, + "grad_norm": 0.669472712455952, + "learning_rate": 0.0002514099215592002, + "loss": 7.4101, + "step": 18428 + }, + { + "epoch": 1.7196043668937202, + "grad_norm": 1.0035520306743633, + "learning_rate": 0.00025140434812424785, + "loss": 7.3171, + "step": 18429 + }, + { + "epoch": 1.7196976765885976, + "grad_norm": 1179.6237004854615, + "learning_rate": 0.0002513987744314542, + "loss": 7.3121, + "step": 18430 + }, + { + "epoch": 1.7197909862834748, + "grad_norm": 0.610574070643053, + "learning_rate": 0.0002513932004808336, + "loss": 7.0847, + "step": 18431 + }, + { + "epoch": 1.719884295978352, + "grad_norm": 1.9456982532544225, + "learning_rate": 0.00025138762627240004, + "loss": 7.0618, + "step": 18432 + }, + { + "epoch": 1.7199776056732294, + "grad_norm": 1.9157688961574852, + "learning_rate": 0.0002513820518061678, + "loss": 7.3666, + "step": 18433 + }, + { + "epoch": 1.7200709153681069, + "grad_norm": 1.5029661647755916, + "learning_rate": 0.000251376477082151, + "loss": 7.0702, + "step": 18434 + }, + { + "epoch": 1.720164225062984, + "grad_norm": 1.3901634481360041, + "learning_rate": 0.0002513709021003639, + "loss": 7.1186, + "step": 18435 + }, + { + "epoch": 1.7202575347578613, + "grad_norm": 1.7463231813652165, + "learning_rate": 0.0002513653268608206, + "loss": 7.5279, + "step": 18436 + }, + { + "epoch": 1.7203508444527387, + "grad_norm": 699.1730845365106, + "learning_rate": 0.00025135975136353534, + "loss": 7.129, + "step": 18437 + }, + { + "epoch": 1.720444154147616, + "grad_norm": 620.6749673894745, + "learning_rate": 0.0002513541756085222, + "loss": 7.5595, + "step": 18438 + }, + { + "epoch": 1.7205374638424933, + "grad_norm": 0.9645939991849025, + "learning_rate": 0.00025134859959579547, + "loss": 6.9986, + "step": 18439 + }, + { + "epoch": 1.7206307735373705, + "grad_norm": 1.3341378766978629, + "learning_rate": 0.00025134302332536926, + "loss": 7.3732, + "step": 18440 + }, + { + "epoch": 1.720724083232248, + "grad_norm": 1.3249484263375924, + "learning_rate": 0.00025133744679725774, + "loss": 7.5422, + "step": 18441 + }, + { + "epoch": 1.720817392927125, + "grad_norm": 1.0153534673105287, + "learning_rate": 0.00025133187001147517, + "loss": 7.2976, + "step": 18442 + }, + { + "epoch": 1.7209107026220023, + "grad_norm": 685.6633119879425, + "learning_rate": 0.0002513262929680356, + "loss": 7.196, + "step": 18443 + }, + { + "epoch": 1.7210040123168797, + "grad_norm": 1.7753538334183092, + "learning_rate": 0.0002513207156669533, + "loss": 7.6456, + "step": 18444 + }, + { + "epoch": 1.7210973220117571, + "grad_norm": 173.1069403826171, + "learning_rate": 0.0002513151381082424, + "loss": 7.2081, + "step": 18445 + }, + { + "epoch": 1.7211906317066343, + "grad_norm": 2.5339456044756874, + "learning_rate": 0.0002513095602919172, + "loss": 7.5284, + "step": 18446 + }, + { + "epoch": 1.7212839414015115, + "grad_norm": 1.6773130490562655, + "learning_rate": 0.0002513039822179917, + "loss": 7.6082, + "step": 18447 + }, + { + "epoch": 1.721377251096389, + "grad_norm": 0.9374025387509873, + "learning_rate": 0.00025129840388648026, + "loss": 7.4438, + "step": 18448 + }, + { + "epoch": 1.7214705607912664, + "grad_norm": 1.5956432590510625, + "learning_rate": 0.00025129282529739696, + "loss": 7.2875, + "step": 18449 + }, + { + "epoch": 1.7215638704861433, + "grad_norm": 0.9672720487733791, + "learning_rate": 0.000251287246450756, + "loss": 7.4494, + "step": 18450 + }, + { + "epoch": 1.7216571801810208, + "grad_norm": 1.0471430752406314, + "learning_rate": 0.00025128166734657157, + "loss": 7.6357, + "step": 18451 + }, + { + "epoch": 1.7217504898758982, + "grad_norm": 1.5320456109700353, + "learning_rate": 0.0002512760879848579, + "loss": 7.5343, + "step": 18452 + }, + { + "epoch": 1.7218437995707754, + "grad_norm": 1.1028223487783544, + "learning_rate": 0.00025127050836562916, + "loss": 7.7085, + "step": 18453 + }, + { + "epoch": 1.7219371092656526, + "grad_norm": 1.2807316843143888, + "learning_rate": 0.0002512649284888994, + "loss": 6.952, + "step": 18454 + }, + { + "epoch": 1.72203041896053, + "grad_norm": 1.40136402363457, + "learning_rate": 0.00025125934835468303, + "loss": 7.5738, + "step": 18455 + }, + { + "epoch": 1.7221237286554074, + "grad_norm": 1.758490181910611, + "learning_rate": 0.0002512537679629941, + "loss": 7.2575, + "step": 18456 + }, + { + "epoch": 1.7222170383502846, + "grad_norm": 1.745448512212177, + "learning_rate": 0.00025124818731384686, + "loss": 7.8322, + "step": 18457 + }, + { + "epoch": 1.7223103480451618, + "grad_norm": 1.272930148922478, + "learning_rate": 0.0002512426064072554, + "loss": 7.5931, + "step": 18458 + }, + { + "epoch": 1.7224036577400392, + "grad_norm": 3.4906030275262476, + "learning_rate": 0.000251237025243234, + "loss": 7.9582, + "step": 18459 + }, + { + "epoch": 1.7224969674349166, + "grad_norm": 3.110820580453957, + "learning_rate": 0.0002512314438217969, + "loss": 7.4787, + "step": 18460 + }, + { + "epoch": 1.7225902771297936, + "grad_norm": 1.6595102413320226, + "learning_rate": 0.00025122586214295815, + "loss": 7.3286, + "step": 18461 + }, + { + "epoch": 1.722683586824671, + "grad_norm": 0.79982106654451, + "learning_rate": 0.0002512202802067321, + "loss": 7.4824, + "step": 18462 + }, + { + "epoch": 1.7227768965195485, + "grad_norm": 1.3137716367495313, + "learning_rate": 0.0002512146980131328, + "loss": 7.4343, + "step": 18463 + }, + { + "epoch": 1.7228702062144257, + "grad_norm": 1.1516461895441246, + "learning_rate": 0.00025120911556217453, + "loss": 6.9332, + "step": 18464 + }, + { + "epoch": 1.7229635159093029, + "grad_norm": 2.276259160387455, + "learning_rate": 0.0002512035328538714, + "loss": 7.1512, + "step": 18465 + }, + { + "epoch": 1.7230568256041803, + "grad_norm": 1.517524908384821, + "learning_rate": 0.0002511979498882377, + "loss": 7.4237, + "step": 18466 + }, + { + "epoch": 1.7231501352990577, + "grad_norm": 0.9195893310173351, + "learning_rate": 0.0002511923666652876, + "loss": 7.1203, + "step": 18467 + }, + { + "epoch": 1.723243444993935, + "grad_norm": 1.2942890852531046, + "learning_rate": 0.0002511867831850353, + "loss": 7.6517, + "step": 18468 + }, + { + "epoch": 1.723336754688812, + "grad_norm": 1.2136384360819603, + "learning_rate": 0.000251181199447495, + "loss": 7.4115, + "step": 18469 + }, + { + "epoch": 1.7234300643836895, + "grad_norm": 1.6884488884066078, + "learning_rate": 0.0002511756154526808, + "loss": 7.3088, + "step": 18470 + }, + { + "epoch": 1.723523374078567, + "grad_norm": 1.715294570498409, + "learning_rate": 0.00025117003120060704, + "loss": 7.4234, + "step": 18471 + }, + { + "epoch": 1.723616683773444, + "grad_norm": 1.264185692297411, + "learning_rate": 0.00025116444669128786, + "loss": 7.3658, + "step": 18472 + }, + { + "epoch": 1.7237099934683213, + "grad_norm": 1.524664873162205, + "learning_rate": 0.00025115886192473744, + "loss": 7.7711, + "step": 18473 + }, + { + "epoch": 1.7238033031631987, + "grad_norm": 1.181068685741605, + "learning_rate": 0.00025115327690097003, + "loss": 7.4796, + "step": 18474 + }, + { + "epoch": 1.723896612858076, + "grad_norm": 143.68193579396743, + "learning_rate": 0.00025114769161999984, + "loss": 7.649, + "step": 18475 + }, + { + "epoch": 1.7239899225529531, + "grad_norm": 2.2119204794757157, + "learning_rate": 0.00025114210608184095, + "loss": 7.7961, + "step": 18476 + }, + { + "epoch": 1.7240832322478306, + "grad_norm": 9.551592025587592, + "learning_rate": 0.0002511365202865076, + "loss": 7.6122, + "step": 18477 + }, + { + "epoch": 1.724176541942708, + "grad_norm": 1.203889979914528, + "learning_rate": 0.00025113093423401417, + "loss": 7.4946, + "step": 18478 + }, + { + "epoch": 1.7242698516375852, + "grad_norm": 2.032955192303217, + "learning_rate": 0.00025112534792437463, + "loss": 7.5804, + "step": 18479 + }, + { + "epoch": 1.7243631613324624, + "grad_norm": 203.69504579476308, + "learning_rate": 0.0002511197613576033, + "loss": 7.3993, + "step": 18480 + }, + { + "epoch": 1.7244564710273398, + "grad_norm": 11.089936573123444, + "learning_rate": 0.00025111417453371445, + "loss": 7.183, + "step": 18481 + }, + { + "epoch": 1.724549780722217, + "grad_norm": 1.1121505667867768, + "learning_rate": 0.00025110858745272215, + "loss": 7.584, + "step": 18482 + }, + { + "epoch": 1.7246430904170942, + "grad_norm": 0.8360716509993285, + "learning_rate": 0.00025110300011464064, + "loss": 7.252, + "step": 18483 + }, + { + "epoch": 1.7247364001119716, + "grad_norm": 2.2925848903568498, + "learning_rate": 0.00025109741251948416, + "loss": 7.6261, + "step": 18484 + }, + { + "epoch": 1.724829709806849, + "grad_norm": 47.051508630548064, + "learning_rate": 0.000251091824667267, + "loss": 7.1083, + "step": 18485 + }, + { + "epoch": 1.7249230195017262, + "grad_norm": 3.6764471626432726, + "learning_rate": 0.00025108623655800314, + "loss": 7.4979, + "step": 18486 + }, + { + "epoch": 1.7250163291966034, + "grad_norm": 1.2982448242734599, + "learning_rate": 0.000251080648191707, + "loss": 7.2438, + "step": 18487 + }, + { + "epoch": 1.7251096388914808, + "grad_norm": 2.153810244262497, + "learning_rate": 0.00025107505956839266, + "loss": 7.5783, + "step": 18488 + }, + { + "epoch": 1.7252029485863583, + "grad_norm": 0.7313630916303049, + "learning_rate": 0.00025106947068807443, + "loss": 7.5103, + "step": 18489 + }, + { + "epoch": 1.7252962582812355, + "grad_norm": 62.59637148748498, + "learning_rate": 0.00025106388155076646, + "loss": 7.3074, + "step": 18490 + }, + { + "epoch": 1.7253895679761126, + "grad_norm": 4.393569849333793, + "learning_rate": 0.00025105829215648305, + "loss": 7.4801, + "step": 18491 + }, + { + "epoch": 1.72548287767099, + "grad_norm": 2.45609884637284, + "learning_rate": 0.00025105270250523826, + "loss": 7.707, + "step": 18492 + }, + { + "epoch": 1.7255761873658673, + "grad_norm": 1.7752597002191826, + "learning_rate": 0.00025104711259704633, + "loss": 7.2408, + "step": 18493 + }, + { + "epoch": 1.7256694970607445, + "grad_norm": 1.2026537993299948, + "learning_rate": 0.00025104152243192164, + "loss": 7.5602, + "step": 18494 + }, + { + "epoch": 1.7257628067556219, + "grad_norm": 0.9390964929332506, + "learning_rate": 0.0002510359320098782, + "loss": 7.1349, + "step": 18495 + }, + { + "epoch": 1.7258561164504993, + "grad_norm": 13.967080359824175, + "learning_rate": 0.0002510303413309304, + "loss": 7.0765, + "step": 18496 + }, + { + "epoch": 1.7259494261453765, + "grad_norm": 4.385012599217538, + "learning_rate": 0.0002510247503950923, + "loss": 7.399, + "step": 18497 + }, + { + "epoch": 1.7260427358402537, + "grad_norm": 1.9709094385625874, + "learning_rate": 0.00025101915920237817, + "loss": 7.4, + "step": 18498 + }, + { + "epoch": 1.7261360455351311, + "grad_norm": 2.6092564851431685, + "learning_rate": 0.00025101356775280225, + "loss": 7.4656, + "step": 18499 + }, + { + "epoch": 1.7262293552300085, + "grad_norm": 3.330187177750256, + "learning_rate": 0.0002510079760463788, + "loss": 7.738, + "step": 18500 + }, + { + "epoch": 1.7263226649248857, + "grad_norm": 5.489578649414339, + "learning_rate": 0.000251002384083122, + "loss": 7.1712, + "step": 18501 + }, + { + "epoch": 1.726415974619763, + "grad_norm": 1.2393616223075195, + "learning_rate": 0.00025099679186304597, + "loss": 7.378, + "step": 18502 + }, + { + "epoch": 1.7265092843146403, + "grad_norm": 1.184287610318724, + "learning_rate": 0.00025099119938616507, + "loss": 7.3595, + "step": 18503 + }, + { + "epoch": 1.7266025940095175, + "grad_norm": 10.977834070058197, + "learning_rate": 0.0002509856066524935, + "loss": 7.4898, + "step": 18504 + }, + { + "epoch": 1.7266959037043947, + "grad_norm": 0.8669857642137887, + "learning_rate": 0.0002509800136620454, + "loss": 7.3009, + "step": 18505 + }, + { + "epoch": 1.7267892133992722, + "grad_norm": 3.7720806459710077, + "learning_rate": 0.000250974420414835, + "loss": 7.5743, + "step": 18506 + }, + { + "epoch": 1.7268825230941496, + "grad_norm": 252.60941510094838, + "learning_rate": 0.00025096882691087665, + "loss": 7.3576, + "step": 18507 + }, + { + "epoch": 1.7269758327890268, + "grad_norm": 0.7226698842860569, + "learning_rate": 0.0002509632331501844, + "loss": 7.4983, + "step": 18508 + }, + { + "epoch": 1.727069142483904, + "grad_norm": 17.852050622301416, + "learning_rate": 0.0002509576391327726, + "loss": 7.2208, + "step": 18509 + }, + { + "epoch": 1.7271624521787814, + "grad_norm": 1.1515585081374178, + "learning_rate": 0.00025095204485865545, + "loss": 7.4359, + "step": 18510 + }, + { + "epoch": 1.7272557618736588, + "grad_norm": 17.154015820059566, + "learning_rate": 0.0002509464503278471, + "loss": 7.4948, + "step": 18511 + }, + { + "epoch": 1.727349071568536, + "grad_norm": 0.8495963645912299, + "learning_rate": 0.00025094085554036187, + "loss": 7.4592, + "step": 18512 + }, + { + "epoch": 1.7274423812634132, + "grad_norm": 8.66164675095027, + "learning_rate": 0.00025093526049621394, + "loss": 7.4953, + "step": 18513 + }, + { + "epoch": 1.7275356909582906, + "grad_norm": 1.013938472661541, + "learning_rate": 0.0002509296651954175, + "loss": 7.5855, + "step": 18514 + }, + { + "epoch": 1.7276290006531678, + "grad_norm": 27.462842366970364, + "learning_rate": 0.00025092406963798686, + "loss": 7.0639, + "step": 18515 + }, + { + "epoch": 1.727722310348045, + "grad_norm": 2.557975481776854, + "learning_rate": 0.00025091847382393624, + "loss": 7.5384, + "step": 18516 + }, + { + "epoch": 1.7278156200429224, + "grad_norm": 14.008777385236366, + "learning_rate": 0.0002509128777532798, + "loss": 7.4638, + "step": 18517 + }, + { + "epoch": 1.7279089297377999, + "grad_norm": 2.036047669826469, + "learning_rate": 0.00025090728142603176, + "loss": 7.3379, + "step": 18518 + }, + { + "epoch": 1.728002239432677, + "grad_norm": 4.18591683744732, + "learning_rate": 0.0002509016848422065, + "loss": 7.2073, + "step": 18519 + }, + { + "epoch": 1.7280955491275543, + "grad_norm": 1.8821942649296153, + "learning_rate": 0.00025089608800181804, + "loss": 7.2771, + "step": 18520 + }, + { + "epoch": 1.7281888588224317, + "grad_norm": 0.9907679837030474, + "learning_rate": 0.00025089049090488074, + "loss": 7.4504, + "step": 18521 + }, + { + "epoch": 1.728282168517309, + "grad_norm": 1.3529827920972657, + "learning_rate": 0.0002508848935514088, + "loss": 7.7548, + "step": 18522 + }, + { + "epoch": 1.7283754782121863, + "grad_norm": 0.6968335899693732, + "learning_rate": 0.00025087929594141647, + "loss": 7.3279, + "step": 18523 + }, + { + "epoch": 1.7284687879070635, + "grad_norm": 2.7243872486807064, + "learning_rate": 0.000250873698074918, + "loss": 7.5249, + "step": 18524 + }, + { + "epoch": 1.728562097601941, + "grad_norm": 3.5859056610000133, + "learning_rate": 0.0002508680999519276, + "loss": 7.2477, + "step": 18525 + }, + { + "epoch": 1.728655407296818, + "grad_norm": 2.365679554700764, + "learning_rate": 0.0002508625015724595, + "loss": 7.4196, + "step": 18526 + }, + { + "epoch": 1.7287487169916953, + "grad_norm": 5.371246817417516, + "learning_rate": 0.00025085690293652794, + "loss": 7.6604, + "step": 18527 + }, + { + "epoch": 1.7288420266865727, + "grad_norm": 1.850738937022474, + "learning_rate": 0.0002508513040441471, + "loss": 7.4411, + "step": 18528 + }, + { + "epoch": 1.7289353363814501, + "grad_norm": 5.926752747776836, + "learning_rate": 0.0002508457048953313, + "loss": 7.6026, + "step": 18529 + }, + { + "epoch": 1.7290286460763273, + "grad_norm": 81.16174126246644, + "learning_rate": 0.00025084010549009475, + "loss": 7.2382, + "step": 18530 + }, + { + "epoch": 1.7291219557712045, + "grad_norm": 40.88548040127798, + "learning_rate": 0.0002508345058284517, + "loss": 7.0556, + "step": 18531 + }, + { + "epoch": 1.729215265466082, + "grad_norm": 0.7777780122371383, + "learning_rate": 0.0002508289059104163, + "loss": 7.3877, + "step": 18532 + }, + { + "epoch": 1.7293085751609594, + "grad_norm": 2.580972729799117, + "learning_rate": 0.00025082330573600293, + "loss": 7.4422, + "step": 18533 + }, + { + "epoch": 1.7294018848558366, + "grad_norm": 0.6132249576319745, + "learning_rate": 0.00025081770530522576, + "loss": 7.08, + "step": 18534 + }, + { + "epoch": 1.7294951945507138, + "grad_norm": 13.161666771499005, + "learning_rate": 0.000250812104618099, + "loss": 7.4093, + "step": 18535 + }, + { + "epoch": 1.7295885042455912, + "grad_norm": 1.2027426341766958, + "learning_rate": 0.0002508065036746369, + "loss": 7.329, + "step": 18536 + }, + { + "epoch": 1.7296818139404684, + "grad_norm": 1.2867191402430953, + "learning_rate": 0.00025080090247485377, + "loss": 7.2996, + "step": 18537 + }, + { + "epoch": 1.7297751236353456, + "grad_norm": 1.7186874216117953, + "learning_rate": 0.0002507953010187638, + "loss": 7.5915, + "step": 18538 + }, + { + "epoch": 1.729868433330223, + "grad_norm": 0.6706505276990756, + "learning_rate": 0.0002507896993063812, + "loss": 7.1137, + "step": 18539 + }, + { + "epoch": 1.7299617430251004, + "grad_norm": 0.8720185942348471, + "learning_rate": 0.00025078409733772023, + "loss": 7.3284, + "step": 18540 + }, + { + "epoch": 1.7300550527199776, + "grad_norm": 1.5382393272144461, + "learning_rate": 0.0002507784951127952, + "loss": 7.5858, + "step": 18541 + }, + { + "epoch": 1.7301483624148548, + "grad_norm": 0.6521169911182016, + "learning_rate": 0.0002507728926316204, + "loss": 7.2575, + "step": 18542 + }, + { + "epoch": 1.7302416721097322, + "grad_norm": 1.8965799742150737, + "learning_rate": 0.0002507672898942098, + "loss": 7.057, + "step": 18543 + }, + { + "epoch": 1.7303349818046097, + "grad_norm": 250.33375884389793, + "learning_rate": 0.000250761686900578, + "loss": 7.2205, + "step": 18544 + }, + { + "epoch": 1.7304282914994868, + "grad_norm": 0.625811946864666, + "learning_rate": 0.000250756083650739, + "loss": 7.2056, + "step": 18545 + }, + { + "epoch": 1.730521601194364, + "grad_norm": 161.43444483456426, + "learning_rate": 0.0002507504801447071, + "loss": 7.4091, + "step": 18546 + }, + { + "epoch": 1.7306149108892415, + "grad_norm": 1.238877758879101, + "learning_rate": 0.0002507448763824966, + "loss": 6.836, + "step": 18547 + }, + { + "epoch": 1.7307082205841187, + "grad_norm": 3.195058805986994, + "learning_rate": 0.00025073927236412175, + "loss": 7.4009, + "step": 18548 + }, + { + "epoch": 1.7308015302789959, + "grad_norm": 2.9836843621475735, + "learning_rate": 0.0002507336680895967, + "loss": 7.43, + "step": 18549 + }, + { + "epoch": 1.7308948399738733, + "grad_norm": 0.9630837295244448, + "learning_rate": 0.0002507280635589358, + "loss": 7.1481, + "step": 18550 + }, + { + "epoch": 1.7309881496687507, + "grad_norm": 0.806584496973107, + "learning_rate": 0.00025072245877215334, + "loss": 7.3031, + "step": 18551 + }, + { + "epoch": 1.731081459363628, + "grad_norm": 3.484639629184481, + "learning_rate": 0.00025071685372926345, + "loss": 7.4745, + "step": 18552 + }, + { + "epoch": 1.731174769058505, + "grad_norm": 3.131624645812031, + "learning_rate": 0.0002507112484302804, + "loss": 7.2247, + "step": 18553 + }, + { + "epoch": 1.7312680787533825, + "grad_norm": 2.5815752038963713, + "learning_rate": 0.00025070564287521855, + "loss": 7.2412, + "step": 18554 + }, + { + "epoch": 1.73136138844826, + "grad_norm": 160.15516905023722, + "learning_rate": 0.000250700037064092, + "loss": 7.5145, + "step": 18555 + }, + { + "epoch": 1.731454698143137, + "grad_norm": 0.7727946516754868, + "learning_rate": 0.0002506944309969151, + "loss": 7.4294, + "step": 18556 + }, + { + "epoch": 1.7315480078380143, + "grad_norm": 13.746079722731839, + "learning_rate": 0.0002506888246737022, + "loss": 7.1354, + "step": 18557 + }, + { + "epoch": 1.7316413175328917, + "grad_norm": 1.3041237429519688, + "learning_rate": 0.00025068321809446737, + "loss": 7.1096, + "step": 18558 + }, + { + "epoch": 1.731734627227769, + "grad_norm": 0.9869067809242716, + "learning_rate": 0.0002506776112592249, + "loss": 6.9609, + "step": 18559 + }, + { + "epoch": 1.7318279369226461, + "grad_norm": 0.7132454061638576, + "learning_rate": 0.00025067200416798916, + "loss": 7.0669, + "step": 18560 + }, + { + "epoch": 1.7319212466175236, + "grad_norm": 1548.8039060616388, + "learning_rate": 0.0002506663968207743, + "loss": 7.235, + "step": 18561 + }, + { + "epoch": 1.732014556312401, + "grad_norm": 2952.4804075567395, + "learning_rate": 0.0002506607892175946, + "loss": 7.2908, + "step": 18562 + }, + { + "epoch": 1.7321078660072782, + "grad_norm": 0.9831345653967799, + "learning_rate": 0.00025065518135846434, + "loss": 7.2687, + "step": 18563 + }, + { + "epoch": 1.7322011757021554, + "grad_norm": 2.6807028924798297, + "learning_rate": 0.00025064957324339777, + "loss": 7.5512, + "step": 18564 + }, + { + "epoch": 1.7322944853970328, + "grad_norm": 1.3028322606646514, + "learning_rate": 0.0002506439648724091, + "loss": 7.4819, + "step": 18565 + }, + { + "epoch": 1.7323877950919102, + "grad_norm": 1.3188289362022825, + "learning_rate": 0.0002506383562455127, + "loss": 7.6003, + "step": 18566 + }, + { + "epoch": 1.7324811047867872, + "grad_norm": 2.141578818347158, + "learning_rate": 0.0002506327473627228, + "loss": 7.3972, + "step": 18567 + }, + { + "epoch": 1.7325744144816646, + "grad_norm": 2.7549695240915364, + "learning_rate": 0.0002506271382240536, + "loss": 7.1576, + "step": 18568 + }, + { + "epoch": 1.732667724176542, + "grad_norm": 2.2581201970817677, + "learning_rate": 0.0002506215288295194, + "loss": 7.3827, + "step": 18569 + }, + { + "epoch": 1.7327610338714192, + "grad_norm": 1.882187017569094, + "learning_rate": 0.0002506159191791345, + "loss": 7.3365, + "step": 18570 + }, + { + "epoch": 1.7328543435662964, + "grad_norm": 1.210450081455713, + "learning_rate": 0.000250610309272913, + "loss": 7.5552, + "step": 18571 + }, + { + "epoch": 1.7329476532611738, + "grad_norm": 1.2412497788031767, + "learning_rate": 0.0002506046991108694, + "loss": 7.1341, + "step": 18572 + }, + { + "epoch": 1.7330409629560513, + "grad_norm": 1.8414566484974804, + "learning_rate": 0.00025059908869301783, + "loss": 7.6639, + "step": 18573 + }, + { + "epoch": 1.7331342726509285, + "grad_norm": 1.241019058704636, + "learning_rate": 0.00025059347801937253, + "loss": 7.8904, + "step": 18574 + }, + { + "epoch": 1.7332275823458057, + "grad_norm": 0.7643000366136489, + "learning_rate": 0.00025058786708994783, + "loss": 7.4758, + "step": 18575 + }, + { + "epoch": 1.733320892040683, + "grad_norm": 1.1187375835921798, + "learning_rate": 0.000250582255904758, + "loss": 7.369, + "step": 18576 + }, + { + "epoch": 1.7334142017355605, + "grad_norm": 1.3357555895136728, + "learning_rate": 0.0002505766444638173, + "loss": 7.5281, + "step": 18577 + }, + { + "epoch": 1.7335075114304375, + "grad_norm": 0.8311927876408647, + "learning_rate": 0.00025057103276713996, + "loss": 7.5467, + "step": 18578 + }, + { + "epoch": 1.7336008211253149, + "grad_norm": 1.5388809634492235, + "learning_rate": 0.00025056542081474027, + "loss": 7.5228, + "step": 18579 + }, + { + "epoch": 1.7336941308201923, + "grad_norm": 22545.351754673586, + "learning_rate": 0.0002505598086066325, + "loss": 7.5289, + "step": 18580 + }, + { + "epoch": 1.7337874405150695, + "grad_norm": 3829.9355542196913, + "learning_rate": 0.000250554196142831, + "loss": 7.4882, + "step": 18581 + }, + { + "epoch": 1.7338807502099467, + "grad_norm": 889.1431097511223, + "learning_rate": 0.00025054858342334985, + "loss": 7.4501, + "step": 18582 + }, + { + "epoch": 1.7339740599048241, + "grad_norm": 0.8803597928862029, + "learning_rate": 0.0002505429704482035, + "loss": 7.4023, + "step": 18583 + }, + { + "epoch": 1.7340673695997015, + "grad_norm": 1.7999114847729811, + "learning_rate": 0.00025053735721740616, + "loss": 7.5303, + "step": 18584 + }, + { + "epoch": 1.7341606792945787, + "grad_norm": 1.473728108284756, + "learning_rate": 0.0002505317437309721, + "loss": 7.4593, + "step": 18585 + }, + { + "epoch": 1.734253988989456, + "grad_norm": 0.6955068851267617, + "learning_rate": 0.0002505261299889156, + "loss": 7.1501, + "step": 18586 + }, + { + "epoch": 1.7343472986843333, + "grad_norm": 2188.375577858101, + "learning_rate": 0.0002505205159912509, + "loss": 7.2744, + "step": 18587 + }, + { + "epoch": 1.7344406083792105, + "grad_norm": 2.1486024104831936, + "learning_rate": 0.00025051490173799235, + "loss": 7.3847, + "step": 18588 + }, + { + "epoch": 1.7345339180740877, + "grad_norm": 0.9998020162537673, + "learning_rate": 0.00025050928722915414, + "loss": 7.526, + "step": 18589 + }, + { + "epoch": 1.7346272277689652, + "grad_norm": 0.6679815296026844, + "learning_rate": 0.0002505036724647506, + "loss": 7.2233, + "step": 18590 + }, + { + "epoch": 1.7347205374638426, + "grad_norm": 0.9953611714451999, + "learning_rate": 0.00025049805744479603, + "loss": 7.489, + "step": 18591 + }, + { + "epoch": 1.7348138471587198, + "grad_norm": 0.8466079394523769, + "learning_rate": 0.00025049244216930464, + "loss": 7.28, + "step": 18592 + }, + { + "epoch": 1.734907156853597, + "grad_norm": 1.2021375814705233, + "learning_rate": 0.0002504868266382907, + "loss": 7.4142, + "step": 18593 + }, + { + "epoch": 1.7350004665484744, + "grad_norm": 3479.8336063440042, + "learning_rate": 0.00025048121085176854, + "loss": 7.3459, + "step": 18594 + }, + { + "epoch": 1.7350937762433518, + "grad_norm": 1.1336181792043794, + "learning_rate": 0.00025047559480975245, + "loss": 7.4841, + "step": 18595 + }, + { + "epoch": 1.735187085938229, + "grad_norm": 8529.898004587267, + "learning_rate": 0.00025046997851225664, + "loss": 7.5411, + "step": 18596 + }, + { + "epoch": 1.7352803956331062, + "grad_norm": 1.7565630181462792, + "learning_rate": 0.0002504643619592955, + "loss": 7.4674, + "step": 18597 + }, + { + "epoch": 1.7353737053279836, + "grad_norm": 1.8674678561940297, + "learning_rate": 0.0002504587451508832, + "loss": 7.7296, + "step": 18598 + }, + { + "epoch": 1.7354670150228608, + "grad_norm": 1.281823460700265, + "learning_rate": 0.0002504531280870341, + "loss": 7.493, + "step": 18599 + }, + { + "epoch": 1.735560324717738, + "grad_norm": 1.3356010314481446, + "learning_rate": 0.00025044751076776245, + "loss": 7.5201, + "step": 18600 + }, + { + "epoch": 1.7356536344126154, + "grad_norm": 0.9913542900244838, + "learning_rate": 0.0002504418931930825, + "loss": 7.3535, + "step": 18601 + }, + { + "epoch": 1.7357469441074929, + "grad_norm": 0.8521302042769497, + "learning_rate": 0.0002504362753630086, + "loss": 7.2456, + "step": 18602 + }, + { + "epoch": 1.73584025380237, + "grad_norm": 1.2009321710075571, + "learning_rate": 0.00025043065727755503, + "loss": 7.2515, + "step": 18603 + }, + { + "epoch": 1.7359335634972473, + "grad_norm": 1.7838125975552606, + "learning_rate": 0.00025042503893673597, + "loss": 7.334, + "step": 18604 + }, + { + "epoch": 1.7360268731921247, + "grad_norm": 0.950750695280381, + "learning_rate": 0.00025041942034056584, + "loss": 7.4956, + "step": 18605 + }, + { + "epoch": 1.736120182887002, + "grad_norm": 1.2783149755861636, + "learning_rate": 0.0002504138014890588, + "loss": 7.3516, + "step": 18606 + }, + { + "epoch": 1.7362134925818793, + "grad_norm": 90.95292927595807, + "learning_rate": 0.00025040818238222936, + "loss": 7.2606, + "step": 18607 + }, + { + "epoch": 1.7363068022767565, + "grad_norm": 1.0190989690518075, + "learning_rate": 0.00025040256302009153, + "loss": 7.3563, + "step": 18608 + }, + { + "epoch": 1.736400111971634, + "grad_norm": 1.5377066460695819, + "learning_rate": 0.00025039694340265976, + "loss": 7.4078, + "step": 18609 + }, + { + "epoch": 1.736493421666511, + "grad_norm": 1.3169287819094317, + "learning_rate": 0.0002503913235299483, + "loss": 7.3332, + "step": 18610 + }, + { + "epoch": 1.7365867313613883, + "grad_norm": 0.9094648808751842, + "learning_rate": 0.0002503857034019715, + "loss": 7.2316, + "step": 18611 + }, + { + "epoch": 1.7366800410562657, + "grad_norm": 0.8500503208132089, + "learning_rate": 0.00025038008301874343, + "loss": 7.1177, + "step": 18612 + }, + { + "epoch": 1.7367733507511431, + "grad_norm": 57060.16921152684, + "learning_rate": 0.0002503744623802787, + "loss": 7.3687, + "step": 18613 + }, + { + "epoch": 1.7368666604460203, + "grad_norm": 296709.25475633115, + "learning_rate": 0.0002503688414865914, + "loss": 7.1771, + "step": 18614 + }, + { + "epoch": 1.7369599701408975, + "grad_norm": 11.11759896160142, + "learning_rate": 0.00025036322033769587, + "loss": 7.4393, + "step": 18615 + }, + { + "epoch": 1.737053279835775, + "grad_norm": 0.853291700718647, + "learning_rate": 0.00025035759893360637, + "loss": 6.9434, + "step": 18616 + }, + { + "epoch": 1.7371465895306524, + "grad_norm": 0.8081743664539632, + "learning_rate": 0.00025035197727433725, + "loss": 7.4951, + "step": 18617 + }, + { + "epoch": 1.7372398992255296, + "grad_norm": 1.001726689509418, + "learning_rate": 0.0002503463553599028, + "loss": 7.6598, + "step": 18618 + }, + { + "epoch": 1.7373332089204068, + "grad_norm": 1.126120884642138, + "learning_rate": 0.00025034073319031726, + "loss": 7.2751, + "step": 18619 + }, + { + "epoch": 1.7374265186152842, + "grad_norm": 0.8475552032211563, + "learning_rate": 0.00025033511076559496, + "loss": 7.4257, + "step": 18620 + }, + { + "epoch": 1.7375198283101614, + "grad_norm": 173084.07796635508, + "learning_rate": 0.0002503294880857502, + "loss": 6.9554, + "step": 18621 + }, + { + "epoch": 1.7376131380050386, + "grad_norm": 127374.92041254933, + "learning_rate": 0.0002503238651507973, + "loss": 7.2375, + "step": 18622 + }, + { + "epoch": 1.737706447699916, + "grad_norm": 77365.09562748813, + "learning_rate": 0.00025031824196075045, + "loss": 7.5574, + "step": 18623 + }, + { + "epoch": 1.7377997573947934, + "grad_norm": 1.5586229394933218, + "learning_rate": 0.0002503126185156241, + "loss": 7.4978, + "step": 18624 + }, + { + "epoch": 1.7378930670896706, + "grad_norm": 1.8676893320864503, + "learning_rate": 0.00025030699481543245, + "loss": 7.1147, + "step": 18625 + }, + { + "epoch": 1.7379863767845478, + "grad_norm": 1.5247574109900168, + "learning_rate": 0.0002503013708601898, + "loss": 7.642, + "step": 18626 + }, + { + "epoch": 1.7380796864794252, + "grad_norm": 0.9096599632395858, + "learning_rate": 0.0002502957466499105, + "loss": 7.3628, + "step": 18627 + }, + { + "epoch": 1.7381729961743027, + "grad_norm": 1.126429568235573, + "learning_rate": 0.00025029012218460886, + "loss": 7.3385, + "step": 18628 + }, + { + "epoch": 1.7382663058691799, + "grad_norm": 1.1599912321487613, + "learning_rate": 0.0002502844974642991, + "loss": 7.7105, + "step": 18629 + }, + { + "epoch": 1.738359615564057, + "grad_norm": 1.0995055288005835, + "learning_rate": 0.00025027887248899565, + "loss": 7.2088, + "step": 18630 + }, + { + "epoch": 1.7384529252589345, + "grad_norm": 38237.182991919966, + "learning_rate": 0.0002502732472587126, + "loss": 7.5574, + "step": 18631 + }, + { + "epoch": 1.7385462349538117, + "grad_norm": 47702.32904642522, + "learning_rate": 0.0002502676217734645, + "loss": 7.6273, + "step": 18632 + }, + { + "epoch": 1.7386395446486889, + "grad_norm": 0.8593509419792213, + "learning_rate": 0.00025026199603326546, + "loss": 7.4945, + "step": 18633 + }, + { + "epoch": 1.7387328543435663, + "grad_norm": 1.336229354894196, + "learning_rate": 0.0002502563700381299, + "loss": 7.8941, + "step": 18634 + }, + { + "epoch": 1.7388261640384437, + "grad_norm": 1.2170316379194548, + "learning_rate": 0.0002502507437880721, + "loss": 7.2459, + "step": 18635 + }, + { + "epoch": 1.738919473733321, + "grad_norm": 2.7658415303234736, + "learning_rate": 0.0002502451172831063, + "loss": 7.297, + "step": 18636 + }, + { + "epoch": 1.739012783428198, + "grad_norm": 0.9281448939731342, + "learning_rate": 0.00025023949052324694, + "loss": 7.2881, + "step": 18637 + }, + { + "epoch": 1.7391060931230755, + "grad_norm": 3.58737165057138, + "learning_rate": 0.00025023386350850823, + "loss": 7.3572, + "step": 18638 + }, + { + "epoch": 1.739199402817953, + "grad_norm": 3.4782074044360836, + "learning_rate": 0.00025022823623890443, + "loss": 7.5229, + "step": 18639 + }, + { + "epoch": 1.7392927125128301, + "grad_norm": 0.9397059119743888, + "learning_rate": 0.00025022260871444997, + "loss": 7.4567, + "step": 18640 + }, + { + "epoch": 1.7393860222077073, + "grad_norm": 1.3013431345896154, + "learning_rate": 0.00025021698093515915, + "loss": 7.6045, + "step": 18641 + }, + { + "epoch": 1.7394793319025847, + "grad_norm": 28831.51864965522, + "learning_rate": 0.00025021135290104613, + "loss": 7.1454, + "step": 18642 + }, + { + "epoch": 1.739572641597462, + "grad_norm": 1.7689149034373985, + "learning_rate": 0.0002502057246121254, + "loss": 7.603, + "step": 18643 + }, + { + "epoch": 1.7396659512923391, + "grad_norm": 0.8490019801727984, + "learning_rate": 0.00025020009606841116, + "loss": 7.4715, + "step": 18644 + }, + { + "epoch": 1.7397592609872166, + "grad_norm": 1.5410460460522337, + "learning_rate": 0.00025019446726991777, + "loss": 7.0848, + "step": 18645 + }, + { + "epoch": 1.739852570682094, + "grad_norm": 1.4779882597680627, + "learning_rate": 0.00025018883821665953, + "loss": 7.3061, + "step": 18646 + }, + { + "epoch": 1.7399458803769712, + "grad_norm": 1.0773734683485852, + "learning_rate": 0.00025018320890865075, + "loss": 7.4925, + "step": 18647 + }, + { + "epoch": 1.7400391900718484, + "grad_norm": 1.8171582186503947, + "learning_rate": 0.00025017757934590575, + "loss": 7.4491, + "step": 18648 + }, + { + "epoch": 1.7401324997667258, + "grad_norm": 7.805319503627295, + "learning_rate": 0.00025017194952843883, + "loss": 7.4444, + "step": 18649 + }, + { + "epoch": 1.7402258094616032, + "grad_norm": 1.9841097457536265, + "learning_rate": 0.00025016631945626436, + "loss": 7.5378, + "step": 18650 + }, + { + "epoch": 1.7403191191564804, + "grad_norm": 252768.82683501908, + "learning_rate": 0.00025016068912939654, + "loss": 7.2872, + "step": 18651 + }, + { + "epoch": 1.7404124288513576, + "grad_norm": 1.4266897496324102, + "learning_rate": 0.0002501550585478498, + "loss": 7.4537, + "step": 18652 + }, + { + "epoch": 1.740505738546235, + "grad_norm": 0.8950140958346349, + "learning_rate": 0.00025014942771163837, + "loss": 7.5817, + "step": 18653 + }, + { + "epoch": 1.7405990482411122, + "grad_norm": 0.8274603294404382, + "learning_rate": 0.0002501437966207767, + "loss": 7.3359, + "step": 18654 + }, + { + "epoch": 1.7406923579359894, + "grad_norm": 0.7858699965933259, + "learning_rate": 0.00025013816527527893, + "loss": 7.4446, + "step": 18655 + }, + { + "epoch": 1.7407856676308668, + "grad_norm": 0.8557118796127425, + "learning_rate": 0.00025013253367515953, + "loss": 7.2803, + "step": 18656 + }, + { + "epoch": 1.7408789773257443, + "grad_norm": 1.2052130597744155, + "learning_rate": 0.0002501269018204327, + "loss": 7.5826, + "step": 18657 + }, + { + "epoch": 1.7409722870206215, + "grad_norm": 5140149.412366255, + "learning_rate": 0.0002501212697111129, + "loss": 7.6373, + "step": 18658 + }, + { + "epoch": 1.7410655967154987, + "grad_norm": 8.362754077094843, + "learning_rate": 0.0002501156373472143, + "loss": 7.1196, + "step": 18659 + }, + { + "epoch": 1.741158906410376, + "grad_norm": 0.8089614510018589, + "learning_rate": 0.00025011000472875136, + "loss": 7.173, + "step": 18660 + }, + { + "epoch": 1.7412522161052535, + "grad_norm": 1.5294528292950733, + "learning_rate": 0.0002501043718557382, + "loss": 7.3901, + "step": 18661 + }, + { + "epoch": 1.7413455258001305, + "grad_norm": 1.0700280728233704, + "learning_rate": 0.00025009873872818944, + "loss": 7.4218, + "step": 18662 + }, + { + "epoch": 1.7414388354950079, + "grad_norm": 46474271.57566102, + "learning_rate": 0.00025009310534611915, + "loss": 7.3381, + "step": 18663 + }, + { + "epoch": 1.7415321451898853, + "grad_norm": 0.7830182627393232, + "learning_rate": 0.0002500874717095417, + "loss": 7.3943, + "step": 18664 + }, + { + "epoch": 1.7416254548847625, + "grad_norm": 22578160.331123423, + "learning_rate": 0.00025008183781847153, + "loss": 7.1919, + "step": 18665 + }, + { + "epoch": 1.7417187645796397, + "grad_norm": 45.727775090021126, + "learning_rate": 0.0002500762036729229, + "loss": 7.3975, + "step": 18666 + }, + { + "epoch": 1.7418120742745171, + "grad_norm": 4.317970876643696, + "learning_rate": 0.0002500705692729101, + "loss": 7.4139, + "step": 18667 + }, + { + "epoch": 1.7419053839693945, + "grad_norm": 1.6368682483771637, + "learning_rate": 0.00025006493461844745, + "loss": 7.0632, + "step": 18668 + }, + { + "epoch": 1.7419986936642717, + "grad_norm": 106557293.50716706, + "learning_rate": 0.0002500592997095494, + "loss": 7.1394, + "step": 18669 + }, + { + "epoch": 1.742092003359149, + "grad_norm": 1.2158482568650777, + "learning_rate": 0.00025005366454623014, + "loss": 7.1164, + "step": 18670 + }, + { + "epoch": 1.7421853130540264, + "grad_norm": 325076743.3012512, + "learning_rate": 0.000250048029128504, + "loss": 7.368, + "step": 18671 + }, + { + "epoch": 1.7422786227489038, + "grad_norm": 1.1471391363372179, + "learning_rate": 0.00025004239345638544, + "loss": 6.9223, + "step": 18672 + }, + { + "epoch": 1.7423719324437807, + "grad_norm": 3.3856701451520204, + "learning_rate": 0.0002500367575298887, + "loss": 7.7569, + "step": 18673 + }, + { + "epoch": 1.7424652421386582, + "grad_norm": 27767106.922774713, + "learning_rate": 0.00025003112134902805, + "loss": 7.7738, + "step": 18674 + }, + { + "epoch": 1.7425585518335356, + "grad_norm": 1759828.5760094044, + "learning_rate": 0.00025002548491381794, + "loss": 7.5202, + "step": 18675 + }, + { + "epoch": 1.7426518615284128, + "grad_norm": 1.6087406530327728, + "learning_rate": 0.00025001984822427265, + "loss": 7.4851, + "step": 18676 + }, + { + "epoch": 1.74274517122329, + "grad_norm": 0.9916483252987892, + "learning_rate": 0.00025001421128040654, + "loss": 7.1285, + "step": 18677 + }, + { + "epoch": 1.7428384809181674, + "grad_norm": 6167108.879674371, + "learning_rate": 0.00025000857408223387, + "loss": 7.5644, + "step": 18678 + }, + { + "epoch": 1.7429317906130448, + "grad_norm": 77466756.56986819, + "learning_rate": 0.00025000293662976906, + "loss": 7.3347, + "step": 18679 + }, + { + "epoch": 1.743025100307922, + "grad_norm": 1.151957143991566, + "learning_rate": 0.00024999729892302637, + "loss": 7.7163, + "step": 18680 + }, + { + "epoch": 1.7431184100027992, + "grad_norm": 0.9011198820614883, + "learning_rate": 0.00024999166096202017, + "loss": 7.3137, + "step": 18681 + }, + { + "epoch": 1.7432117196976766, + "grad_norm": 1.8720744214225162, + "learning_rate": 0.0002499860227467648, + "loss": 7.4442, + "step": 18682 + }, + { + "epoch": 1.743305029392554, + "grad_norm": 2.8131086398215928, + "learning_rate": 0.0002499803842772746, + "loss": 7.5758, + "step": 18683 + }, + { + "epoch": 1.743398339087431, + "grad_norm": 1.6702555180308147, + "learning_rate": 0.0002499747455535639, + "loss": 7.0434, + "step": 18684 + }, + { + "epoch": 1.7434916487823084, + "grad_norm": 7.418651193903657, + "learning_rate": 0.0002499691065756471, + "loss": 7.5937, + "step": 18685 + }, + { + "epoch": 1.7435849584771859, + "grad_norm": 25225545.48427501, + "learning_rate": 0.0002499634673435384, + "loss": 7.6084, + "step": 18686 + }, + { + "epoch": 1.743678268172063, + "grad_norm": 1.1286180222553164, + "learning_rate": 0.00024995782785725224, + "loss": 7.2692, + "step": 18687 + }, + { + "epoch": 1.7437715778669403, + "grad_norm": 1.3419609349339912, + "learning_rate": 0.00024995218811680285, + "loss": 7.4518, + "step": 18688 + }, + { + "epoch": 1.7438648875618177, + "grad_norm": 1.247440758728598, + "learning_rate": 0.00024994654812220476, + "loss": 7.392, + "step": 18689 + }, + { + "epoch": 1.743958197256695, + "grad_norm": 1.2613965863168328, + "learning_rate": 0.0002499409078734722, + "loss": 7.4182, + "step": 18690 + }, + { + "epoch": 1.7440515069515723, + "grad_norm": 1.8799019357549376, + "learning_rate": 0.0002499352673706194, + "loss": 7.359, + "step": 18691 + }, + { + "epoch": 1.7441448166464495, + "grad_norm": 3792542.644149064, + "learning_rate": 0.00024992962661366094, + "loss": 7.493, + "step": 18692 + }, + { + "epoch": 1.744238126341327, + "grad_norm": 1.794077968505422, + "learning_rate": 0.000249923985602611, + "loss": 7.1319, + "step": 18693 + }, + { + "epoch": 1.744331436036204, + "grad_norm": 0.7887434895925145, + "learning_rate": 0.00024991834433748394, + "loss": 7.4485, + "step": 18694 + }, + { + "epoch": 1.7444247457310813, + "grad_norm": 1.1332040251419184, + "learning_rate": 0.0002499127028182942, + "loss": 7.2596, + "step": 18695 + }, + { + "epoch": 1.7445180554259587, + "grad_norm": 1.260837948392975, + "learning_rate": 0.000249907061045056, + "loss": 7.5878, + "step": 18696 + }, + { + "epoch": 1.7446113651208361, + "grad_norm": 1.0940042389171118, + "learning_rate": 0.00024990141901778376, + "loss": 7.146, + "step": 18697 + }, + { + "epoch": 1.7447046748157133, + "grad_norm": 1.0935105936640934, + "learning_rate": 0.0002498957767364918, + "loss": 7.1996, + "step": 18698 + }, + { + "epoch": 1.7447979845105905, + "grad_norm": 273738480.79946655, + "learning_rate": 0.00024989013420119444, + "loss": 6.9992, + "step": 18699 + }, + { + "epoch": 1.744891294205468, + "grad_norm": 293550951.49823976, + "learning_rate": 0.0002498844914119061, + "loss": 7.0568, + "step": 18700 + }, + { + "epoch": 1.7449846039003454, + "grad_norm": 0.9550845009932933, + "learning_rate": 0.00024987884836864105, + "loss": 7.1538, + "step": 18701 + }, + { + "epoch": 1.7450779135952226, + "grad_norm": 1.4298503381422216, + "learning_rate": 0.0002498732050714137, + "loss": 7.4388, + "step": 18702 + }, + { + "epoch": 1.7451712232900998, + "grad_norm": 1.081050682005134, + "learning_rate": 0.00024986756152023837, + "loss": 7.2387, + "step": 18703 + }, + { + "epoch": 1.7452645329849772, + "grad_norm": 1.2700671882510033, + "learning_rate": 0.0002498619177151294, + "loss": 7.3637, + "step": 18704 + }, + { + "epoch": 1.7453578426798544, + "grad_norm": 1.2798539896774481, + "learning_rate": 0.0002498562736561012, + "loss": 7.4256, + "step": 18705 + }, + { + "epoch": 1.7454511523747316, + "grad_norm": 4.597439836151166, + "learning_rate": 0.000249850629343168, + "loss": 7.4136, + "step": 18706 + }, + { + "epoch": 1.745544462069609, + "grad_norm": 1.0695735462833251, + "learning_rate": 0.0002498449847763443, + "loss": 7.529, + "step": 18707 + }, + { + "epoch": 1.7456377717644864, + "grad_norm": 1.1705843221809558, + "learning_rate": 0.00024983933995564433, + "loss": 7.4583, + "step": 18708 + }, + { + "epoch": 1.7457310814593636, + "grad_norm": 1.0323897697451212, + "learning_rate": 0.00024983369488108256, + "loss": 7.8244, + "step": 18709 + }, + { + "epoch": 1.7458243911542408, + "grad_norm": 2.644730851981709, + "learning_rate": 0.00024982804955267317, + "loss": 7.2346, + "step": 18710 + }, + { + "epoch": 1.7459177008491182, + "grad_norm": 1.6714747438752164, + "learning_rate": 0.0002498224039704307, + "loss": 7.5245, + "step": 18711 + }, + { + "epoch": 1.7460110105439957, + "grad_norm": 2.189637329529558, + "learning_rate": 0.0002498167581343694, + "loss": 7.3375, + "step": 18712 + }, + { + "epoch": 1.7461043202388729, + "grad_norm": 6434479392.848952, + "learning_rate": 0.0002498111120445037, + "loss": 7.1237, + "step": 18713 + }, + { + "epoch": 1.74619762993375, + "grad_norm": 1.0029255363758818, + "learning_rate": 0.0002498054657008478, + "loss": 7.6099, + "step": 18714 + }, + { + "epoch": 1.7462909396286275, + "grad_norm": 3.389506354727014, + "learning_rate": 0.00024979981910341626, + "loss": 7.5954, + "step": 18715 + }, + { + "epoch": 1.7463842493235047, + "grad_norm": 9.837291731588008, + "learning_rate": 0.0002497941722522233, + "loss": 7.3983, + "step": 18716 + }, + { + "epoch": 1.7464775590183819, + "grad_norm": 4.8764585448949544, + "learning_rate": 0.0002497885251472834, + "loss": 7.499, + "step": 18717 + }, + { + "epoch": 1.7465708687132593, + "grad_norm": 1.2948534423184257, + "learning_rate": 0.0002497828777886107, + "loss": 7.1099, + "step": 18718 + }, + { + "epoch": 1.7466641784081367, + "grad_norm": 3.657343369583419, + "learning_rate": 0.0002497772301762198, + "loss": 7.5603, + "step": 18719 + }, + { + "epoch": 1.746757488103014, + "grad_norm": 4.227786036672882, + "learning_rate": 0.0002497715823101249, + "loss": 7.3956, + "step": 18720 + }, + { + "epoch": 1.746850797797891, + "grad_norm": 45935934429.84873, + "learning_rate": 0.0002497659341903405, + "loss": 7.4395, + "step": 18721 + }, + { + "epoch": 1.7469441074927685, + "grad_norm": 1.2209106362810616, + "learning_rate": 0.0002497602858168808, + "loss": 7.2554, + "step": 18722 + }, + { + "epoch": 1.747037417187646, + "grad_norm": 23.55949947034483, + "learning_rate": 0.0002497546371897603, + "loss": 7.5926, + "step": 18723 + }, + { + "epoch": 1.7471307268825231, + "grad_norm": 1.361880567559838, + "learning_rate": 0.0002497489883089933, + "loss": 7.3767, + "step": 18724 + }, + { + "epoch": 1.7472240365774003, + "grad_norm": 6.033019331539659, + "learning_rate": 0.00024974333917459414, + "loss": 7.6481, + "step": 18725 + }, + { + "epoch": 1.7473173462722777, + "grad_norm": 2.6145303783388814, + "learning_rate": 0.00024973768978657723, + "loss": 7.2313, + "step": 18726 + }, + { + "epoch": 1.747410655967155, + "grad_norm": 1.62629851160519, + "learning_rate": 0.000249732040144957, + "loss": 7.5487, + "step": 18727 + }, + { + "epoch": 1.7475039656620321, + "grad_norm": 1.075425211762846, + "learning_rate": 0.0002497263902497476, + "loss": 7.3208, + "step": 18728 + }, + { + "epoch": 1.7475972753569096, + "grad_norm": 5.532357172477872, + "learning_rate": 0.0002497207401009636, + "loss": 7.1391, + "step": 18729 + }, + { + "epoch": 1.747690585051787, + "grad_norm": 391471326285.2101, + "learning_rate": 0.00024971508969861926, + "loss": 7.3854, + "step": 18730 + }, + { + "epoch": 1.7477838947466642, + "grad_norm": 2.8832682492919144, + "learning_rate": 0.000249709439042729, + "loss": 7.5346, + "step": 18731 + }, + { + "epoch": 1.7478772044415414, + "grad_norm": 7953477688289.048, + "learning_rate": 0.00024970378813330725, + "loss": 7.2758, + "step": 18732 + }, + { + "epoch": 1.7479705141364188, + "grad_norm": 4.406851957908951, + "learning_rate": 0.00024969813697036823, + "loss": 7.5341, + "step": 18733 + }, + { + "epoch": 1.7480638238312962, + "grad_norm": 2.0540167116159505, + "learning_rate": 0.0002496924855539263, + "loss": 7.3835, + "step": 18734 + }, + { + "epoch": 1.7481571335261734, + "grad_norm": 1.0746306021687022, + "learning_rate": 0.00024968683388399603, + "loss": 7.2247, + "step": 18735 + }, + { + "epoch": 1.7482504432210506, + "grad_norm": 1.7414030620994598, + "learning_rate": 0.00024968118196059164, + "loss": 7.7175, + "step": 18736 + }, + { + "epoch": 1.748343752915928, + "grad_norm": 1.106911007948159, + "learning_rate": 0.00024967552978372756, + "loss": 7.5398, + "step": 18737 + }, + { + "epoch": 1.7484370626108052, + "grad_norm": 5.528054512156648, + "learning_rate": 0.0002496698773534181, + "loss": 7.7265, + "step": 18738 + }, + { + "epoch": 1.7485303723056824, + "grad_norm": 1.2587067634905114, + "learning_rate": 0.0002496642246696777, + "loss": 7.4897, + "step": 18739 + }, + { + "epoch": 1.7486236820005598, + "grad_norm": 21093848022477.516, + "learning_rate": 0.00024965857173252063, + "loss": 7.4756, + "step": 18740 + }, + { + "epoch": 1.7487169916954373, + "grad_norm": 122074625909964.69, + "learning_rate": 0.00024965291854196135, + "loss": 7.4913, + "step": 18741 + }, + { + "epoch": 1.7488103013903145, + "grad_norm": 3.9966253949576602, + "learning_rate": 0.00024964726509801426, + "loss": 7.4756, + "step": 18742 + }, + { + "epoch": 1.7489036110851917, + "grad_norm": 3.15318983840079, + "learning_rate": 0.0002496416114006937, + "loss": 8.0225, + "step": 18743 + }, + { + "epoch": 1.748996920780069, + "grad_norm": 1.6617420344434033, + "learning_rate": 0.00024963595745001397, + "loss": 7.4363, + "step": 18744 + }, + { + "epoch": 1.7490902304749465, + "grad_norm": 11.593318906661251, + "learning_rate": 0.0002496303032459896, + "loss": 7.327, + "step": 18745 + }, + { + "epoch": 1.7491835401698237, + "grad_norm": 1.5341864801034775, + "learning_rate": 0.0002496246487886348, + "loss": 7.5052, + "step": 18746 + }, + { + "epoch": 1.749276849864701, + "grad_norm": 2.2781816894881994, + "learning_rate": 0.00024961899407796405, + "loss": 7.3453, + "step": 18747 + }, + { + "epoch": 1.7493701595595783, + "grad_norm": 1.4030075990897133, + "learning_rate": 0.00024961333911399175, + "loss": 7.0404, + "step": 18748 + }, + { + "epoch": 1.7494634692544555, + "grad_norm": 21.685497177395703, + "learning_rate": 0.0002496076838967322, + "loss": 7.6508, + "step": 18749 + }, + { + "epoch": 1.7495567789493327, + "grad_norm": 1.4041369526533483, + "learning_rate": 0.0002496020284261998, + "loss": 7.6827, + "step": 18750 + }, + { + "epoch": 1.7496500886442101, + "grad_norm": 3.021611008320089, + "learning_rate": 0.00024959637270240896, + "loss": 7.5196, + "step": 18751 + }, + { + "epoch": 1.7497433983390875, + "grad_norm": 1.8622276022020892, + "learning_rate": 0.0002495907167253741, + "loss": 7.0222, + "step": 18752 + }, + { + "epoch": 1.7498367080339647, + "grad_norm": 1.8454109437657973, + "learning_rate": 0.0002495850604951095, + "loss": 7.5398, + "step": 18753 + }, + { + "epoch": 1.749930017728842, + "grad_norm": 1.6676727335285384, + "learning_rate": 0.0002495794040116296, + "loss": 7.5472, + "step": 18754 + }, + { + "epoch": 1.7500233274237194, + "grad_norm": 4.630808323624849, + "learning_rate": 0.00024957374727494874, + "loss": 7.6888, + "step": 18755 + }, + { + "epoch": 1.7501166371185968, + "grad_norm": 2.2478111262690277, + "learning_rate": 0.0002495680902850814, + "loss": 7.3494, + "step": 18756 + }, + { + "epoch": 1.750209946813474, + "grad_norm": 2.7241517997462563e+17, + "learning_rate": 0.00024956243304204183, + "loss": 7.7381, + "step": 18757 + }, + { + "epoch": 1.7503032565083512, + "grad_norm": 2.0395233289438055, + "learning_rate": 0.00024955677554584457, + "loss": 7.498, + "step": 18758 + }, + { + "epoch": 1.7503965662032286, + "grad_norm": 1.128035115325038, + "learning_rate": 0.00024955111779650387, + "loss": 7.4507, + "step": 18759 + }, + { + "epoch": 1.7504898758981058, + "grad_norm": 1.64026522702963, + "learning_rate": 0.00024954545979403417, + "loss": 7.5331, + "step": 18760 + }, + { + "epoch": 1.750583185592983, + "grad_norm": 3.0253120424675233, + "learning_rate": 0.00024953980153844983, + "loss": 7.6919, + "step": 18761 + }, + { + "epoch": 1.7506764952878604, + "grad_norm": 80.10835120349515, + "learning_rate": 0.0002495341430297653, + "loss": 7.5125, + "step": 18762 + }, + { + "epoch": 1.7507698049827378, + "grad_norm": 1.4870291329369971, + "learning_rate": 0.0002495284842679949, + "loss": 7.4508, + "step": 18763 + }, + { + "epoch": 1.750863114677615, + "grad_norm": 1.6080900081970988, + "learning_rate": 0.00024952282525315305, + "loss": 7.0637, + "step": 18764 + }, + { + "epoch": 1.7509564243724922, + "grad_norm": 2.0047172258873083e+18, + "learning_rate": 0.0002495171659852542, + "loss": 7.3157, + "step": 18765 + }, + { + "epoch": 1.7510497340673696, + "grad_norm": 1.0836868226066703, + "learning_rate": 0.00024951150646431255, + "loss": 7.5133, + "step": 18766 + }, + { + "epoch": 1.751143043762247, + "grad_norm": 1.7741603103131525, + "learning_rate": 0.0002495058466903427, + "loss": 7.5134, + "step": 18767 + }, + { + "epoch": 1.7512363534571243, + "grad_norm": 1.401289421718429, + "learning_rate": 0.00024950018666335897, + "loss": 7.4747, + "step": 18768 + }, + { + "epoch": 1.7513296631520014, + "grad_norm": 1.5335433578346247, + "learning_rate": 0.0002494945263833757, + "loss": 7.376, + "step": 18769 + }, + { + "epoch": 1.7514229728468789, + "grad_norm": 3.1985785094220303e+18, + "learning_rate": 0.00024948886585040734, + "loss": 7.2327, + "step": 18770 + }, + { + "epoch": 1.751516282541756, + "grad_norm": 1.4598406355430555, + "learning_rate": 0.0002494832050644683, + "loss": 7.2708, + "step": 18771 + }, + { + "epoch": 1.7516095922366333, + "grad_norm": 2.2666348122889146, + "learning_rate": 0.00024947754402557286, + "loss": 7.7088, + "step": 18772 + }, + { + "epoch": 1.7517029019315107, + "grad_norm": 8.610408208461648, + "learning_rate": 0.0002494718827337355, + "loss": 7.3983, + "step": 18773 + }, + { + "epoch": 1.751796211626388, + "grad_norm": 1.6043898607963547, + "learning_rate": 0.0002494662211889707, + "loss": 7.328, + "step": 18774 + }, + { + "epoch": 1.7518895213212653, + "grad_norm": 9.068553955846014, + "learning_rate": 0.0002494605593912927, + "loss": 7.5451, + "step": 18775 + }, + { + "epoch": 1.7519828310161425, + "grad_norm": 3.4266479413935355e+18, + "learning_rate": 0.0002494548973407159, + "loss": 7.4276, + "step": 18776 + }, + { + "epoch": 1.75207614071102, + "grad_norm": 2.0684674092783273, + "learning_rate": 0.00024944923503725485, + "loss": 7.5923, + "step": 18777 + }, + { + "epoch": 1.7521694504058973, + "grad_norm": 1.5139656486242274, + "learning_rate": 0.00024944357248092385, + "loss": 7.5263, + "step": 18778 + }, + { + "epoch": 1.7522627601007743, + "grad_norm": 1.4451206436869755, + "learning_rate": 0.0002494379096717373, + "loss": 7.6892, + "step": 18779 + }, + { + "epoch": 1.7523560697956517, + "grad_norm": 1.0069915941648228e+19, + "learning_rate": 0.00024943224660970955, + "loss": 7.5126, + "step": 18780 + }, + { + "epoch": 1.7524493794905291, + "grad_norm": 1.6544097402837366, + "learning_rate": 0.0002494265832948551, + "loss": 7.5852, + "step": 18781 + }, + { + "epoch": 1.7525426891854063, + "grad_norm": 1.1022292030476695, + "learning_rate": 0.00024942091972718827, + "loss": 7.4173, + "step": 18782 + }, + { + "epoch": 1.7526359988802835, + "grad_norm": 1.1487138872007048, + "learning_rate": 0.0002494152559067235, + "loss": 7.5968, + "step": 18783 + }, + { + "epoch": 1.752729308575161, + "grad_norm": 0.9709484582623835, + "learning_rate": 0.00024940959183347516, + "loss": 7.4222, + "step": 18784 + }, + { + "epoch": 1.7528226182700384, + "grad_norm": 2.709796167869314e+16, + "learning_rate": 0.0002494039275074577, + "loss": 7.3395, + "step": 18785 + }, + { + "epoch": 1.7529159279649156, + "grad_norm": 2.079770488429865, + "learning_rate": 0.0002493982629286855, + "loss": 7.6399, + "step": 18786 + }, + { + "epoch": 1.7530092376597928, + "grad_norm": 1.401316543213753, + "learning_rate": 0.000249392598097173, + "loss": 7.0959, + "step": 18787 + }, + { + "epoch": 1.7531025473546702, + "grad_norm": 0.8876088105203215, + "learning_rate": 0.00024938693301293453, + "loss": 7.503, + "step": 18788 + }, + { + "epoch": 1.7531958570495476, + "grad_norm": 3.0141866135632016, + "learning_rate": 0.0002493812676759845, + "loss": 7.6431, + "step": 18789 + }, + { + "epoch": 1.7532891667444246, + "grad_norm": 1.7767834545830057, + "learning_rate": 0.00024937560208633736, + "loss": 7.9464, + "step": 18790 + }, + { + "epoch": 1.753382476439302, + "grad_norm": 1.0487160882942328, + "learning_rate": 0.00024936993624400755, + "loss": 7.5907, + "step": 18791 + }, + { + "epoch": 1.7534757861341794, + "grad_norm": 1.5908122114885193, + "learning_rate": 0.00024936427014900936, + "loss": 7.334, + "step": 18792 + }, + { + "epoch": 1.7535690958290566, + "grad_norm": 1.2387196101429079, + "learning_rate": 0.0002493586038013573, + "loss": 7.4041, + "step": 18793 + }, + { + "epoch": 1.7536624055239338, + "grad_norm": 9.771714554289656e+18, + "learning_rate": 0.0002493529372010657, + "loss": 7.4482, + "step": 18794 + }, + { + "epoch": 1.7537557152188112, + "grad_norm": 3.540743958842889e+18, + "learning_rate": 0.00024934727034814906, + "loss": 7.5323, + "step": 18795 + }, + { + "epoch": 1.7538490249136887, + "grad_norm": 1.8888522609532163, + "learning_rate": 0.00024934160324262175, + "loss": 8.315, + "step": 18796 + }, + { + "epoch": 1.7539423346085659, + "grad_norm": 2.448132627136411, + "learning_rate": 0.00024933593588449815, + "loss": 7.5854, + "step": 18797 + }, + { + "epoch": 1.754035644303443, + "grad_norm": 1.0446936060440104, + "learning_rate": 0.0002493302682737927, + "loss": 7.4168, + "step": 18798 + }, + { + "epoch": 1.7541289539983205, + "grad_norm": 1.699650320138488, + "learning_rate": 0.0002493246004105198, + "loss": 7.3493, + "step": 18799 + }, + { + "epoch": 1.7542222636931977, + "grad_norm": 1.0854689222777132, + "learning_rate": 0.00024931893229469387, + "loss": 7.6964, + "step": 18800 + }, + { + "epoch": 1.7543155733880749, + "grad_norm": 2.493823209634368e+18, + "learning_rate": 0.0002493132639263293, + "loss": 7.5297, + "step": 18801 + }, + { + "epoch": 1.7544088830829523, + "grad_norm": 1.505863500053801, + "learning_rate": 0.0002493075953054405, + "loss": 7.5131, + "step": 18802 + }, + { + "epoch": 1.7545021927778297, + "grad_norm": 1.3645020218723174, + "learning_rate": 0.0002493019264320419, + "loss": 7.738, + "step": 18803 + }, + { + "epoch": 1.754595502472707, + "grad_norm": 5.114952540705346e+18, + "learning_rate": 0.00024929625730614794, + "loss": 7.5369, + "step": 18804 + }, + { + "epoch": 1.754688812167584, + "grad_norm": 1.3810344162788911e+19, + "learning_rate": 0.00024929058792777303, + "loss": 7.3361, + "step": 18805 + }, + { + "epoch": 1.7547821218624615, + "grad_norm": 1.7183561351950971, + "learning_rate": 0.00024928491829693156, + "loss": 7.2542, + "step": 18806 + }, + { + "epoch": 1.754875431557339, + "grad_norm": 5.080561938483393e+18, + "learning_rate": 0.0002492792484136379, + "loss": 7.4955, + "step": 18807 + }, + { + "epoch": 1.7549687412522161, + "grad_norm": 1.5949190806509477, + "learning_rate": 0.00024927357827790657, + "loss": 7.2414, + "step": 18808 + }, + { + "epoch": 1.7550620509470933, + "grad_norm": 3.6702403162264154e+17, + "learning_rate": 0.00024926790788975194, + "loss": 7.0597, + "step": 18809 + }, + { + "epoch": 1.7551553606419708, + "grad_norm": 2.3061645536369147e+18, + "learning_rate": 0.00024926223724918843, + "loss": 7.5345, + "step": 18810 + }, + { + "epoch": 1.755248670336848, + "grad_norm": 65.8055126205384, + "learning_rate": 0.0002492565663562304, + "loss": 7.3196, + "step": 18811 + }, + { + "epoch": 1.7553419800317251, + "grad_norm": 1.6158759859983647, + "learning_rate": 0.0002492508952108924, + "loss": 7.5828, + "step": 18812 + }, + { + "epoch": 1.7554352897266026, + "grad_norm": 2.1813054954690207e+18, + "learning_rate": 0.00024924522381318873, + "loss": 7.5026, + "step": 18813 + }, + { + "epoch": 1.75552859942148, + "grad_norm": 1.3232979731424495e+20, + "learning_rate": 0.00024923955216313385, + "loss": 7.3921, + "step": 18814 + }, + { + "epoch": 1.7556219091163572, + "grad_norm": 1.6633341387867013, + "learning_rate": 0.0002492338802607422, + "loss": 7.6002, + "step": 18815 + }, + { + "epoch": 1.7557152188112344, + "grad_norm": 1.5143566586761565, + "learning_rate": 0.00024922820810602817, + "loss": 7.6471, + "step": 18816 + }, + { + "epoch": 1.7558085285061118, + "grad_norm": 1.9589967171246818, + "learning_rate": 0.0002492225356990062, + "loss": 7.5115, + "step": 18817 + }, + { + "epoch": 1.7559018382009892, + "grad_norm": 2.4907374797709743, + "learning_rate": 0.00024921686303969073, + "loss": 7.1031, + "step": 18818 + }, + { + "epoch": 1.7559951478958664, + "grad_norm": 2.1332624959299586, + "learning_rate": 0.0002492111901280962, + "loss": 7.3592, + "step": 18819 + }, + { + "epoch": 1.7560884575907436, + "grad_norm": 1.4221069188981523, + "learning_rate": 0.00024920551696423695, + "loss": 7.828, + "step": 18820 + }, + { + "epoch": 1.756181767285621, + "grad_norm": 1.468655128003582, + "learning_rate": 0.0002491998435481275, + "loss": 7.5777, + "step": 18821 + }, + { + "epoch": 1.7562750769804982, + "grad_norm": 1.1202866387939114, + "learning_rate": 0.00024919416987978217, + "loss": 7.784, + "step": 18822 + }, + { + "epoch": 1.7563683866753754, + "grad_norm": 5.1666105826361475e+19, + "learning_rate": 0.00024918849595921544, + "loss": 7.4254, + "step": 18823 + }, + { + "epoch": 1.7564616963702528, + "grad_norm": 1.3654201394246577e+21, + "learning_rate": 0.0002491828217864418, + "loss": 7.377, + "step": 18824 + }, + { + "epoch": 1.7565550060651303, + "grad_norm": 5.554819819851378e+20, + "learning_rate": 0.00024917714736147567, + "loss": 7.6704, + "step": 18825 + }, + { + "epoch": 1.7566483157600075, + "grad_norm": 0.9734674493456645, + "learning_rate": 0.0002491714726843313, + "loss": 7.2855, + "step": 18826 + }, + { + "epoch": 1.7567416254548847, + "grad_norm": 1.1419949215914948e+21, + "learning_rate": 0.00024916579775502337, + "loss": 7.3071, + "step": 18827 + }, + { + "epoch": 1.756834935149762, + "grad_norm": 1.682286312002196, + "learning_rate": 0.00024916012257356616, + "loss": 7.415, + "step": 18828 + }, + { + "epoch": 1.7569282448446395, + "grad_norm": 1.0602577040860388, + "learning_rate": 0.0002491544471399741, + "loss": 7.4296, + "step": 18829 + }, + { + "epoch": 1.7570215545395167, + "grad_norm": 1.3351873988777534, + "learning_rate": 0.0002491487714542617, + "loss": 7.4312, + "step": 18830 + }, + { + "epoch": 1.757114864234394, + "grad_norm": 0.8933150904099875, + "learning_rate": 0.00024914309551644334, + "loss": 7.6053, + "step": 18831 + }, + { + "epoch": 1.7572081739292713, + "grad_norm": 1.2447767940341483, + "learning_rate": 0.00024913741932653337, + "loss": 7.4344, + "step": 18832 + }, + { + "epoch": 1.7573014836241485, + "grad_norm": 1.069940546559419e+19, + "learning_rate": 0.0002491317428845464, + "loss": 7.3998, + "step": 18833 + }, + { + "epoch": 1.7573947933190257, + "grad_norm": 1.1745385988003547, + "learning_rate": 0.0002491260661904968, + "loss": 7.1264, + "step": 18834 + }, + { + "epoch": 1.7574881030139031, + "grad_norm": 2534.9038673222362, + "learning_rate": 0.0002491203892443989, + "loss": 7.6315, + "step": 18835 + }, + { + "epoch": 1.7575814127087805, + "grad_norm": 2.8196432841026273e+18, + "learning_rate": 0.0002491147120462672, + "loss": 7.4217, + "step": 18836 + }, + { + "epoch": 1.7576747224036577, + "grad_norm": 1.369807731394126, + "learning_rate": 0.00024910903459611624, + "loss": 7.4881, + "step": 18837 + }, + { + "epoch": 1.757768032098535, + "grad_norm": 1.959877959317174, + "learning_rate": 0.00024910335689396027, + "loss": 7.4117, + "step": 18838 + }, + { + "epoch": 1.7578613417934124, + "grad_norm": 4.56664119059963e+19, + "learning_rate": 0.00024909767893981387, + "loss": 8.0123, + "step": 18839 + }, + { + "epoch": 1.7579546514882898, + "grad_norm": 0.9247181298126528, + "learning_rate": 0.00024909200073369145, + "loss": 7.5096, + "step": 18840 + }, + { + "epoch": 1.758047961183167, + "grad_norm": 6.346771782827434e+19, + "learning_rate": 0.0002490863222756074, + "loss": 7.5664, + "step": 18841 + }, + { + "epoch": 1.7581412708780442, + "grad_norm": 1.0133691071605293, + "learning_rate": 0.00024908064356557615, + "loss": 7.5915, + "step": 18842 + }, + { + "epoch": 1.7582345805729216, + "grad_norm": 1.0840241106892938, + "learning_rate": 0.0002490749646036122, + "loss": 7.4964, + "step": 18843 + }, + { + "epoch": 1.7583278902677988, + "grad_norm": 1.4205343141378264, + "learning_rate": 0.00024906928538972995, + "loss": 7.5496, + "step": 18844 + }, + { + "epoch": 1.758421199962676, + "grad_norm": 1.459816497296699e+19, + "learning_rate": 0.0002490636059239439, + "loss": 7.4118, + "step": 18845 + }, + { + "epoch": 1.7585145096575534, + "grad_norm": 1.0561398562573516, + "learning_rate": 0.0002490579262062684, + "loss": 7.3327, + "step": 18846 + }, + { + "epoch": 1.7586078193524308, + "grad_norm": 1.136544745991548, + "learning_rate": 0.00024905224623671796, + "loss": 7.2804, + "step": 18847 + }, + { + "epoch": 1.758701129047308, + "grad_norm": 2.2105388517492734, + "learning_rate": 0.00024904656601530694, + "loss": 7.3598, + "step": 18848 + }, + { + "epoch": 1.7587944387421852, + "grad_norm": 3.2116269823802936e+18, + "learning_rate": 0.00024904088554204986, + "loss": 7.4706, + "step": 18849 + }, + { + "epoch": 1.7588877484370626, + "grad_norm": 1.982259799944509, + "learning_rate": 0.00024903520481696123, + "loss": 7.6293, + "step": 18850 + }, + { + "epoch": 1.75898105813194, + "grad_norm": 2.5864573867949314, + "learning_rate": 0.0002490295238400553, + "loss": 7.1664, + "step": 18851 + }, + { + "epoch": 1.7590743678268173, + "grad_norm": 3.227265226586335e+19, + "learning_rate": 0.0002490238426113467, + "loss": 7.4835, + "step": 18852 + }, + { + "epoch": 1.7591676775216945, + "grad_norm": 2.1876695933335815, + "learning_rate": 0.0002490181611308498, + "loss": 7.291, + "step": 18853 + }, + { + "epoch": 1.7592609872165719, + "grad_norm": 1.9102384952253126, + "learning_rate": 0.00024901247939857906, + "loss": 7.3431, + "step": 18854 + }, + { + "epoch": 1.759354296911449, + "grad_norm": 8.580548809284378, + "learning_rate": 0.00024900679741454886, + "loss": 7.5512, + "step": 18855 + }, + { + "epoch": 1.7594476066063263, + "grad_norm": 1.2295495832739747, + "learning_rate": 0.00024900111517877374, + "loss": 7.6285, + "step": 18856 + }, + { + "epoch": 1.7595409163012037, + "grad_norm": 0.999280399967947, + "learning_rate": 0.0002489954326912681, + "loss": 7.3951, + "step": 18857 + }, + { + "epoch": 1.759634225996081, + "grad_norm": 1.2119056300545998, + "learning_rate": 0.0002489897499520464, + "loss": 7.1456, + "step": 18858 + }, + { + "epoch": 1.7597275356909583, + "grad_norm": 0.8823595079733447, + "learning_rate": 0.00024898406696112306, + "loss": 7.4023, + "step": 18859 + }, + { + "epoch": 1.7598208453858355, + "grad_norm": 1.2324192301033292, + "learning_rate": 0.00024897838371851254, + "loss": 7.5322, + "step": 18860 + }, + { + "epoch": 1.759914155080713, + "grad_norm": 0.982913877617632, + "learning_rate": 0.00024897270022422943, + "loss": 7.4789, + "step": 18861 + }, + { + "epoch": 1.7600074647755903, + "grad_norm": 24.724960715892003, + "learning_rate": 0.000248967016478288, + "loss": 7.619, + "step": 18862 + }, + { + "epoch": 1.7601007744704675, + "grad_norm": 1.2971727166733202, + "learning_rate": 0.00024896133248070265, + "loss": 7.2559, + "step": 18863 + }, + { + "epoch": 1.7601940841653447, + "grad_norm": 1.0849140938389183, + "learning_rate": 0.0002489556482314881, + "loss": 7.5222, + "step": 18864 + }, + { + "epoch": 1.7602873938602221, + "grad_norm": 0.9256296184264999, + "learning_rate": 0.0002489499637306586, + "loss": 7.5504, + "step": 18865 + }, + { + "epoch": 1.7603807035550993, + "grad_norm": 0.8135813809397954, + "learning_rate": 0.00024894427897822867, + "loss": 7.4931, + "step": 18866 + }, + { + "epoch": 1.7604740132499765, + "grad_norm": 1.1803678548449519, + "learning_rate": 0.0002489385939742127, + "loss": 7.6259, + "step": 18867 + }, + { + "epoch": 1.760567322944854, + "grad_norm": 0.9856902891964394, + "learning_rate": 0.0002489329087186252, + "loss": 7.2918, + "step": 18868 + }, + { + "epoch": 1.7606606326397314, + "grad_norm": 1.0098002322085495, + "learning_rate": 0.00024892722321148063, + "loss": 7.3423, + "step": 18869 + }, + { + "epoch": 1.7607539423346086, + "grad_norm": 1.2794496152747776, + "learning_rate": 0.00024892153745279346, + "loss": 7.7949, + "step": 18870 + }, + { + "epoch": 1.7608472520294858, + "grad_norm": 1.1053866192628456, + "learning_rate": 0.00024891585144257806, + "loss": 7.152, + "step": 18871 + }, + { + "epoch": 1.7609405617243632, + "grad_norm": 1.2318256178540299, + "learning_rate": 0.00024891016518084906, + "loss": 7.5796, + "step": 18872 + }, + { + "epoch": 1.7610338714192406, + "grad_norm": 7.507544141232784, + "learning_rate": 0.0002489044786676207, + "loss": 7.5201, + "step": 18873 + }, + { + "epoch": 1.7611271811141178, + "grad_norm": 1.5237799599066066, + "learning_rate": 0.00024889879190290767, + "loss": 7.5248, + "step": 18874 + }, + { + "epoch": 1.761220490808995, + "grad_norm": 0.7703658153914797, + "learning_rate": 0.00024889310488672423, + "loss": 7.3672, + "step": 18875 + }, + { + "epoch": 1.7613138005038724, + "grad_norm": 0.9316765081631088, + "learning_rate": 0.00024888741761908487, + "loss": 7.5331, + "step": 18876 + }, + { + "epoch": 1.7614071101987496, + "grad_norm": 1.0867807788236388, + "learning_rate": 0.0002488817301000041, + "loss": 7.3578, + "step": 18877 + }, + { + "epoch": 1.7615004198936268, + "grad_norm": 0.9103711327324647, + "learning_rate": 0.0002488760423294965, + "loss": 7.3073, + "step": 18878 + }, + { + "epoch": 1.7615937295885042, + "grad_norm": 0.8402090268239037, + "learning_rate": 0.00024887035430757633, + "loss": 7.3152, + "step": 18879 + }, + { + "epoch": 1.7616870392833817, + "grad_norm": 1.1849891761838798, + "learning_rate": 0.00024886466603425814, + "loss": 7.4454, + "step": 18880 + }, + { + "epoch": 1.7617803489782589, + "grad_norm": 0.7641494307930702, + "learning_rate": 0.00024885897750955645, + "loss": 7.1826, + "step": 18881 + }, + { + "epoch": 1.761873658673136, + "grad_norm": 1.433519483311087, + "learning_rate": 0.0002488532887334856, + "loss": 7.6515, + "step": 18882 + }, + { + "epoch": 1.7619669683680135, + "grad_norm": 4.111627544161348, + "learning_rate": 0.00024884759970606016, + "loss": 7.5244, + "step": 18883 + }, + { + "epoch": 1.762060278062891, + "grad_norm": 2.5759386947922215e+20, + "learning_rate": 0.0002488419104272945, + "loss": 7.7533, + "step": 18884 + }, + { + "epoch": 1.7621535877577679, + "grad_norm": 2.1332678569464832e+21, + "learning_rate": 0.00024883622089720315, + "loss": 7.3833, + "step": 18885 + }, + { + "epoch": 1.7622468974526453, + "grad_norm": 1.0209987697051757, + "learning_rate": 0.0002488305311158006, + "loss": 7.4908, + "step": 18886 + }, + { + "epoch": 1.7623402071475227, + "grad_norm": 0.8176880164161165, + "learning_rate": 0.00024882484108310125, + "loss": 7.5469, + "step": 18887 + }, + { + "epoch": 1.7624335168424, + "grad_norm": 0.9064205711269898, + "learning_rate": 0.00024881915079911967, + "loss": 7.6145, + "step": 18888 + }, + { + "epoch": 1.762526826537277, + "grad_norm": 1.8645156003050505, + "learning_rate": 0.00024881346026387017, + "loss": 7.5487, + "step": 18889 + }, + { + "epoch": 1.7626201362321545, + "grad_norm": 1.6006630828084334, + "learning_rate": 0.00024880776947736744, + "loss": 7.2482, + "step": 18890 + }, + { + "epoch": 1.762713445927032, + "grad_norm": 0.9878370201568772, + "learning_rate": 0.0002488020784396257, + "loss": 7.3502, + "step": 18891 + }, + { + "epoch": 1.7628067556219091, + "grad_norm": 1.034803423682231, + "learning_rate": 0.0002487963871506596, + "loss": 7.218, + "step": 18892 + }, + { + "epoch": 1.7629000653167863, + "grad_norm": 3.987477059388236, + "learning_rate": 0.00024879069561048353, + "loss": 7.4146, + "step": 18893 + }, + { + "epoch": 1.7629933750116638, + "grad_norm": 5.26964247414472e+18, + "learning_rate": 0.00024878500381911203, + "loss": 7.4103, + "step": 18894 + }, + { + "epoch": 1.7630866847065412, + "grad_norm": 1.5821246941313138, + "learning_rate": 0.00024877931177655946, + "loss": 7.4196, + "step": 18895 + }, + { + "epoch": 1.7631799944014181, + "grad_norm": 1.0576909255099725, + "learning_rate": 0.0002487736194828404, + "loss": 7.3087, + "step": 18896 + }, + { + "epoch": 1.7632733040962956, + "grad_norm": 1.1215698420861535, + "learning_rate": 0.00024876792693796925, + "loss": 7.5532, + "step": 18897 + }, + { + "epoch": 1.763366613791173, + "grad_norm": 1.380719247890003, + "learning_rate": 0.0002487622341419605, + "loss": 7.6609, + "step": 18898 + }, + { + "epoch": 1.7634599234860502, + "grad_norm": 3.1841017392944386, + "learning_rate": 0.00024875654109482876, + "loss": 7.6722, + "step": 18899 + }, + { + "epoch": 1.7635532331809274, + "grad_norm": 3.2558833037512556, + "learning_rate": 0.0002487508477965883, + "loss": 6.8967, + "step": 18900 + }, + { + "epoch": 1.7636465428758048, + "grad_norm": 1.4810974200540796, + "learning_rate": 0.0002487451542472537, + "loss": 7.2748, + "step": 18901 + }, + { + "epoch": 1.7637398525706822, + "grad_norm": 1.3624712328921134, + "learning_rate": 0.0002487394604468394, + "loss": 7.2358, + "step": 18902 + }, + { + "epoch": 1.7638331622655594, + "grad_norm": 1.1610996594092255, + "learning_rate": 0.00024873376639535994, + "loss": 7.7352, + "step": 18903 + }, + { + "epoch": 1.7639264719604366, + "grad_norm": 1.0962514608946075, + "learning_rate": 0.00024872807209282973, + "loss": 7.5928, + "step": 18904 + }, + { + "epoch": 1.764019781655314, + "grad_norm": 6.538936135018972e+18, + "learning_rate": 0.00024872237753926336, + "loss": 7.7191, + "step": 18905 + }, + { + "epoch": 1.7641130913501912, + "grad_norm": 1.006616504448871, + "learning_rate": 0.0002487166827346751, + "loss": 7.4053, + "step": 18906 + }, + { + "epoch": 1.7642064010450684, + "grad_norm": 1.1519963501540955, + "learning_rate": 0.0002487109876790796, + "loss": 7.6159, + "step": 18907 + }, + { + "epoch": 1.7642997107399458, + "grad_norm": 0.9233269010287238, + "learning_rate": 0.0002487052923724914, + "loss": 7.5317, + "step": 18908 + }, + { + "epoch": 1.7643930204348233, + "grad_norm": 1.0774950403853825, + "learning_rate": 0.00024869959681492475, + "loss": 7.5519, + "step": 18909 + }, + { + "epoch": 1.7644863301297005, + "grad_norm": 0.904078532484951, + "learning_rate": 0.00024869390100639436, + "loss": 7.3042, + "step": 18910 + }, + { + "epoch": 1.7645796398245777, + "grad_norm": 0.9675075584340715, + "learning_rate": 0.00024868820494691453, + "loss": 7.5253, + "step": 18911 + }, + { + "epoch": 1.764672949519455, + "grad_norm": 1.053460809986153, + "learning_rate": 0.0002486825086364999, + "loss": 7.2295, + "step": 18912 + }, + { + "epoch": 1.7647662592143325, + "grad_norm": 1.9164104989069608, + "learning_rate": 0.00024867681207516485, + "loss": 7.7913, + "step": 18913 + }, + { + "epoch": 1.7648595689092097, + "grad_norm": 1.3083341873122873, + "learning_rate": 0.00024867111526292383, + "loss": 7.7106, + "step": 18914 + }, + { + "epoch": 1.764952878604087, + "grad_norm": 1.5829017403186338, + "learning_rate": 0.0002486654181997915, + "loss": 7.6147, + "step": 18915 + }, + { + "epoch": 1.7650461882989643, + "grad_norm": 4.943191902910106e+19, + "learning_rate": 0.0002486597208857822, + "loss": 7.0427, + "step": 18916 + }, + { + "epoch": 1.7651394979938415, + "grad_norm": 1.0711798437399689, + "learning_rate": 0.0002486540233209104, + "loss": 7.6387, + "step": 18917 + }, + { + "epoch": 1.7652328076887187, + "grad_norm": 0.8832293406441788, + "learning_rate": 0.00024864832550519075, + "loss": 7.5658, + "step": 18918 + }, + { + "epoch": 1.7653261173835961, + "grad_norm": 0.9887009790705189, + "learning_rate": 0.00024864262743863754, + "loss": 7.3842, + "step": 18919 + }, + { + "epoch": 1.7654194270784735, + "grad_norm": 1.1044341228743737, + "learning_rate": 0.0002486369291212654, + "loss": 7.3811, + "step": 18920 + }, + { + "epoch": 1.7655127367733507, + "grad_norm": 1.1958688328789728, + "learning_rate": 0.00024863123055308874, + "loss": 7.9088, + "step": 18921 + }, + { + "epoch": 1.765606046468228, + "grad_norm": 1.0905766227244882, + "learning_rate": 0.0002486255317341221, + "loss": 7.4428, + "step": 18922 + }, + { + "epoch": 1.7656993561631054, + "grad_norm": 1.1050227539677293, + "learning_rate": 0.00024861983266437994, + "loss": 7.4317, + "step": 18923 + }, + { + "epoch": 1.7657926658579828, + "grad_norm": 7.404572684550674e+20, + "learning_rate": 0.00024861413334387677, + "loss": 7.0244, + "step": 18924 + }, + { + "epoch": 1.76588597555286, + "grad_norm": 2.220220608247342e+21, + "learning_rate": 0.00024860843377262707, + "loss": 7.4532, + "step": 18925 + }, + { + "epoch": 1.7659792852477372, + "grad_norm": 1.0922518123134297, + "learning_rate": 0.00024860273395064527, + "loss": 7.0086, + "step": 18926 + }, + { + "epoch": 1.7660725949426146, + "grad_norm": 1.2974736614660143, + "learning_rate": 0.000248597033877946, + "loss": 7.4759, + "step": 18927 + }, + { + "epoch": 1.7661659046374918, + "grad_norm": 1.1355421638891479, + "learning_rate": 0.0002485913335545436, + "loss": 7.5585, + "step": 18928 + }, + { + "epoch": 1.766259214332369, + "grad_norm": 2.340223783097003, + "learning_rate": 0.0002485856329804527, + "loss": 8.2361, + "step": 18929 + }, + { + "epoch": 1.7663525240272464, + "grad_norm": 0.8993335777624253, + "learning_rate": 0.0002485799321556878, + "loss": 7.6565, + "step": 18930 + }, + { + "epoch": 1.7664458337221238, + "grad_norm": 7.980682300988454e+19, + "learning_rate": 0.00024857423108026326, + "loss": 7.3825, + "step": 18931 + }, + { + "epoch": 1.766539143417001, + "grad_norm": 1.0318769323524362, + "learning_rate": 0.0002485685297541936, + "loss": 7.5646, + "step": 18932 + }, + { + "epoch": 1.7666324531118782, + "grad_norm": 5.17774217165222e+20, + "learning_rate": 0.00024856282817749346, + "loss": 7.2828, + "step": 18933 + }, + { + "epoch": 1.7667257628067556, + "grad_norm": 0.9116368470886145, + "learning_rate": 0.00024855712635017723, + "loss": 7.6675, + "step": 18934 + }, + { + "epoch": 1.766819072501633, + "grad_norm": 0.865121529921138, + "learning_rate": 0.0002485514242722594, + "loss": 7.6325, + "step": 18935 + }, + { + "epoch": 1.7669123821965103, + "grad_norm": 0.840183001712361, + "learning_rate": 0.0002485457219437545, + "loss": 7.4827, + "step": 18936 + }, + { + "epoch": 1.7670056918913875, + "grad_norm": 1.834317208581265e+20, + "learning_rate": 0.00024854001936467696, + "loss": 7.332, + "step": 18937 + }, + { + "epoch": 1.7670990015862649, + "grad_norm": 0.8950786959022206, + "learning_rate": 0.00024853431653504147, + "loss": 7.4242, + "step": 18938 + }, + { + "epoch": 1.767192311281142, + "grad_norm": 1.3750816621858715, + "learning_rate": 0.0002485286134548623, + "loss": 7.5231, + "step": 18939 + }, + { + "epoch": 1.7672856209760193, + "grad_norm": 0.9544896269636526, + "learning_rate": 0.0002485229101241541, + "loss": 7.3002, + "step": 18940 + }, + { + "epoch": 1.7673789306708967, + "grad_norm": 1.0579716194562815e+21, + "learning_rate": 0.00024851720654293127, + "loss": 7.4502, + "step": 18941 + }, + { + "epoch": 1.767472240365774, + "grad_norm": 1.6908161863980584, + "learning_rate": 0.00024851150271120836, + "loss": 7.6493, + "step": 18942 + }, + { + "epoch": 1.7675655500606513, + "grad_norm": 0.9242720775339567, + "learning_rate": 0.00024850579862899995, + "loss": 7.081, + "step": 18943 + }, + { + "epoch": 1.7676588597555285, + "grad_norm": 1.0100612409419987, + "learning_rate": 0.00024850009429632045, + "loss": 7.6129, + "step": 18944 + }, + { + "epoch": 1.767752169450406, + "grad_norm": 1.2562811192947345e+20, + "learning_rate": 0.0002484943897131844, + "loss": 7.0885, + "step": 18945 + }, + { + "epoch": 1.7678454791452833, + "grad_norm": 1.307755789823911, + "learning_rate": 0.00024848868487960626, + "loss": 7.4628, + "step": 18946 + }, + { + "epoch": 1.7679387888401605, + "grad_norm": 1.0198933085829445, + "learning_rate": 0.00024848297979560057, + "loss": 7.4124, + "step": 18947 + }, + { + "epoch": 1.7680320985350377, + "grad_norm": 1.2054293566181258, + "learning_rate": 0.00024847727446118185, + "loss": 7.1701, + "step": 18948 + }, + { + "epoch": 1.7681254082299152, + "grad_norm": 2.8069179118641877e+19, + "learning_rate": 0.00024847156887636463, + "loss": 7.4985, + "step": 18949 + }, + { + "epoch": 1.7682187179247923, + "grad_norm": 1.416212950141182, + "learning_rate": 0.0002484658630411633, + "loss": 7.541, + "step": 18950 + }, + { + "epoch": 1.7683120276196695, + "grad_norm": 0.7983727485928532, + "learning_rate": 0.0002484601569555925, + "loss": 7.4488, + "step": 18951 + }, + { + "epoch": 1.768405337314547, + "grad_norm": 0.9592043909709721, + "learning_rate": 0.0002484544506196667, + "loss": 7.4887, + "step": 18952 + }, + { + "epoch": 1.7684986470094244, + "grad_norm": 2.2228790972170835, + "learning_rate": 0.00024844874403340034, + "loss": 7.5521, + "step": 18953 + }, + { + "epoch": 1.7685919567043016, + "grad_norm": 0.7822943832996614, + "learning_rate": 0.000248443037196808, + "loss": 7.3568, + "step": 18954 + }, + { + "epoch": 1.7686852663991788, + "grad_norm": 0.822341769583994, + "learning_rate": 0.0002484373301099042, + "loss": 7.3236, + "step": 18955 + }, + { + "epoch": 1.7687785760940562, + "grad_norm": 3.7771921984669308, + "learning_rate": 0.00024843162277270344, + "loss": 7.4014, + "step": 18956 + }, + { + "epoch": 1.7688718857889336, + "grad_norm": 1.1044333037280536, + "learning_rate": 0.0002484259151852202, + "loss": 7.3074, + "step": 18957 + }, + { + "epoch": 1.7689651954838108, + "grad_norm": 1.066657885910877, + "learning_rate": 0.000248420207347469, + "loss": 7.5542, + "step": 18958 + }, + { + "epoch": 1.769058505178688, + "grad_norm": 1.760347072369164e+21, + "learning_rate": 0.0002484144992594644, + "loss": 7.5065, + "step": 18959 + }, + { + "epoch": 1.7691518148735654, + "grad_norm": 0.8925102931480533, + "learning_rate": 0.0002484087909212209, + "loss": 7.4941, + "step": 18960 + }, + { + "epoch": 1.7692451245684426, + "grad_norm": 0.8286141826744504, + "learning_rate": 0.00024840308233275293, + "loss": 7.7571, + "step": 18961 + }, + { + "epoch": 1.7693384342633198, + "grad_norm": 1.5326103586898354, + "learning_rate": 0.0002483973734940751, + "loss": 7.1403, + "step": 18962 + }, + { + "epoch": 1.7694317439581972, + "grad_norm": 17.044389636359313, + "learning_rate": 0.0002483916644052019, + "loss": 7.384, + "step": 18963 + }, + { + "epoch": 1.7695250536530747, + "grad_norm": 3.1956124382833386, + "learning_rate": 0.00024838595506614786, + "loss": 7.6039, + "step": 18964 + }, + { + "epoch": 1.7696183633479519, + "grad_norm": 0.8988594848859862, + "learning_rate": 0.00024838024547692744, + "loss": 7.5447, + "step": 18965 + }, + { + "epoch": 1.769711673042829, + "grad_norm": 1.1102904848543693, + "learning_rate": 0.00024837453563755524, + "loss": 7.6648, + "step": 18966 + }, + { + "epoch": 1.7698049827377065, + "grad_norm": 0.9016436330472136, + "learning_rate": 0.00024836882554804573, + "loss": 7.3858, + "step": 18967 + }, + { + "epoch": 1.769898292432584, + "grad_norm": 1.2039576063367727, + "learning_rate": 0.00024836311520841343, + "loss": 7.7552, + "step": 18968 + }, + { + "epoch": 1.769991602127461, + "grad_norm": 1.0097898133859275, + "learning_rate": 0.0002483574046186728, + "loss": 7.3294, + "step": 18969 + }, + { + "epoch": 1.7700849118223383, + "grad_norm": 0.9224101469518345, + "learning_rate": 0.00024835169377883853, + "loss": 7.484, + "step": 18970 + }, + { + "epoch": 1.7701782215172157, + "grad_norm": 2.695894155765869, + "learning_rate": 0.000248345982688925, + "loss": 7.1296, + "step": 18971 + }, + { + "epoch": 1.770271531212093, + "grad_norm": 1.9494706391822012, + "learning_rate": 0.00024834027134894673, + "loss": 7.1749, + "step": 18972 + }, + { + "epoch": 1.77036484090697, + "grad_norm": 1.0233167241814698, + "learning_rate": 0.0002483345597589183, + "loss": 7.3162, + "step": 18973 + }, + { + "epoch": 1.7704581506018475, + "grad_norm": 8.601676247448581e+22, + "learning_rate": 0.00024832884791885424, + "loss": 7.0693, + "step": 18974 + }, + { + "epoch": 1.770551460296725, + "grad_norm": 11.575309446791698, + "learning_rate": 0.0002483231358287691, + "loss": 6.9985, + "step": 18975 + }, + { + "epoch": 1.7706447699916021, + "grad_norm": 2.6747806016348588e+23, + "learning_rate": 0.0002483174234886772, + "loss": 7.5095, + "step": 18976 + }, + { + "epoch": 1.7707380796864793, + "grad_norm": 2.678412520347125, + "learning_rate": 0.00024831171089859333, + "loss": 7.1498, + "step": 18977 + }, + { + "epoch": 1.7708313893813568, + "grad_norm": 1.3079325247516442, + "learning_rate": 0.00024830599805853186, + "loss": 7.5499, + "step": 18978 + }, + { + "epoch": 1.7709246990762342, + "grad_norm": 5.048255422706714e+20, + "learning_rate": 0.00024830028496850736, + "loss": 7.6003, + "step": 18979 + }, + { + "epoch": 1.7710180087711114, + "grad_norm": 1.5330330589822638, + "learning_rate": 0.00024829457162853436, + "loss": 7.0302, + "step": 18980 + }, + { + "epoch": 1.7711113184659886, + "grad_norm": 1.111868481146791, + "learning_rate": 0.0002482888580386274, + "loss": 7.3593, + "step": 18981 + }, + { + "epoch": 1.771204628160866, + "grad_norm": 1.078084979341044, + "learning_rate": 0.0002482831441988009, + "loss": 7.3476, + "step": 18982 + }, + { + "epoch": 1.7712979378557432, + "grad_norm": 1.3483238059834843, + "learning_rate": 0.0002482774301090696, + "loss": 7.2266, + "step": 18983 + }, + { + "epoch": 1.7713912475506204, + "grad_norm": 3.694688405782563e+20, + "learning_rate": 0.0002482717157694479, + "loss": 7.2902, + "step": 18984 + }, + { + "epoch": 1.7714845572454978, + "grad_norm": 1.7605014017623153, + "learning_rate": 0.00024826600117995026, + "loss": 7.6352, + "step": 18985 + }, + { + "epoch": 1.7715778669403752, + "grad_norm": 1.6675612210047892, + "learning_rate": 0.0002482602863405913, + "loss": 7.546, + "step": 18986 + }, + { + "epoch": 1.7716711766352524, + "grad_norm": 1.295525233299987, + "learning_rate": 0.00024825457125138555, + "loss": 7.4413, + "step": 18987 + }, + { + "epoch": 1.7717644863301296, + "grad_norm": 1.2079665422526022, + "learning_rate": 0.0002482488559123475, + "loss": 7.6844, + "step": 18988 + }, + { + "epoch": 1.771857796025007, + "grad_norm": 0.7787044380221436, + "learning_rate": 0.0002482431403234918, + "loss": 7.328, + "step": 18989 + }, + { + "epoch": 1.7719511057198845, + "grad_norm": 0.9261922843019644, + "learning_rate": 0.00024823742448483287, + "loss": 7.5236, + "step": 18990 + }, + { + "epoch": 1.7720444154147614, + "grad_norm": 0.8334695812171252, + "learning_rate": 0.00024823170839638524, + "loss": 7.4467, + "step": 18991 + }, + { + "epoch": 1.7721377251096389, + "grad_norm": 2.8235342723794248e+22, + "learning_rate": 0.00024822599205816346, + "loss": 7.1937, + "step": 18992 + }, + { + "epoch": 1.7722310348045163, + "grad_norm": 0.8874181053791339, + "learning_rate": 0.0002482202754701821, + "loss": 7.2831, + "step": 18993 + }, + { + "epoch": 1.7723243444993935, + "grad_norm": 1.6005744857696642, + "learning_rate": 0.0002482145586324557, + "loss": 7.8902, + "step": 18994 + }, + { + "epoch": 1.7724176541942707, + "grad_norm": 0.9292210486328152, + "learning_rate": 0.0002482088415449987, + "loss": 7.5224, + "step": 18995 + }, + { + "epoch": 1.772510963889148, + "grad_norm": 4.605956167788891e+21, + "learning_rate": 0.0002482031242078258, + "loss": 7.6189, + "step": 18996 + }, + { + "epoch": 1.7726042735840255, + "grad_norm": 1.0680183679886885, + "learning_rate": 0.0002481974066209514, + "loss": 7.128, + "step": 18997 + }, + { + "epoch": 1.7726975832789027, + "grad_norm": 1.3248823427360683, + "learning_rate": 0.0002481916887843901, + "loss": 7.6261, + "step": 18998 + }, + { + "epoch": 1.77279089297378, + "grad_norm": 1.2212738320785343, + "learning_rate": 0.0002481859706981564, + "loss": 7.7427, + "step": 18999 + }, + { + "epoch": 1.7728842026686573, + "grad_norm": 0.8581875875186521, + "learning_rate": 0.00024818025236226485, + "loss": 7.4896, + "step": 19000 + }, + { + "epoch": 1.7729775123635347, + "grad_norm": 1.2654664190651985, + "learning_rate": 0.00024817453377673, + "loss": 7.3028, + "step": 19001 + }, + { + "epoch": 1.7730708220584117, + "grad_norm": 0.8253028868362963, + "learning_rate": 0.0002481688149415664, + "loss": 7.4867, + "step": 19002 + }, + { + "epoch": 1.7731641317532891, + "grad_norm": 3.7924992194193075e+22, + "learning_rate": 0.00024816309585678863, + "loss": 7.3971, + "step": 19003 + }, + { + "epoch": 1.7732574414481665, + "grad_norm": 1.0423404428514094, + "learning_rate": 0.0002481573765224111, + "loss": 7.1988, + "step": 19004 + }, + { + "epoch": 1.7733507511430437, + "grad_norm": 3.4881704990511733e+22, + "learning_rate": 0.00024815165693844847, + "loss": 7.2336, + "step": 19005 + }, + { + "epoch": 1.773444060837921, + "grad_norm": 1.3962991737008825, + "learning_rate": 0.0002481459371049153, + "loss": 7.5088, + "step": 19006 + }, + { + "epoch": 1.7735373705327984, + "grad_norm": 1.5578826973117867, + "learning_rate": 0.000248140217021826, + "loss": 7.4807, + "step": 19007 + }, + { + "epoch": 1.7736306802276758, + "grad_norm": 1.9730064168053936, + "learning_rate": 0.0002481344966891953, + "loss": 7.5121, + "step": 19008 + }, + { + "epoch": 1.773723989922553, + "grad_norm": 1.2331096708120617, + "learning_rate": 0.0002481287761070376, + "loss": 7.4631, + "step": 19009 + }, + { + "epoch": 1.7738172996174302, + "grad_norm": 0.9901719951410979, + "learning_rate": 0.00024812305527536745, + "loss": 7.4936, + "step": 19010 + }, + { + "epoch": 1.7739106093123076, + "grad_norm": 0.9484428367664292, + "learning_rate": 0.00024811733419419945, + "loss": 7.5568, + "step": 19011 + }, + { + "epoch": 1.7740039190071848, + "grad_norm": 1.406634902436136, + "learning_rate": 0.0002481116128635482, + "loss": 7.1316, + "step": 19012 + }, + { + "epoch": 1.774097228702062, + "grad_norm": 1.1300079041470195, + "learning_rate": 0.0002481058912834281, + "loss": 7.6884, + "step": 19013 + }, + { + "epoch": 1.7741905383969394, + "grad_norm": 0.9041166853872442, + "learning_rate": 0.00024810016945385385, + "loss": 7.616, + "step": 19014 + }, + { + "epoch": 1.7742838480918168, + "grad_norm": 1.0154373500747118, + "learning_rate": 0.0002480944473748399, + "loss": 7.5276, + "step": 19015 + }, + { + "epoch": 1.774377157786694, + "grad_norm": 1.0106668708239332, + "learning_rate": 0.0002480887250464008, + "loss": 7.4622, + "step": 19016 + }, + { + "epoch": 1.7744704674815712, + "grad_norm": 0.899765275838592, + "learning_rate": 0.00024808300246855115, + "loss": 7.5536, + "step": 19017 + }, + { + "epoch": 1.7745637771764486, + "grad_norm": 9.166683703726134, + "learning_rate": 0.0002480772796413055, + "loss": 6.9535, + "step": 19018 + }, + { + "epoch": 1.774657086871326, + "grad_norm": 1.8716875613851058, + "learning_rate": 0.00024807155656467834, + "loss": 7.7764, + "step": 19019 + }, + { + "epoch": 1.7747503965662033, + "grad_norm": 1.6187885473815258, + "learning_rate": 0.00024806583323868427, + "loss": 7.7371, + "step": 19020 + }, + { + "epoch": 1.7748437062610805, + "grad_norm": 1.3154802945233375, + "learning_rate": 0.00024806010966333785, + "loss": 7.5645, + "step": 19021 + }, + { + "epoch": 1.7749370159559579, + "grad_norm": 0.8862612066356055, + "learning_rate": 0.0002480543858386536, + "loss": 7.2765, + "step": 19022 + }, + { + "epoch": 1.775030325650835, + "grad_norm": 1.668666237927937, + "learning_rate": 0.00024804866176464613, + "loss": 7.6822, + "step": 19023 + }, + { + "epoch": 1.7751236353457123, + "grad_norm": 0.9391554635729235, + "learning_rate": 0.00024804293744132993, + "loss": 7.5717, + "step": 19024 + }, + { + "epoch": 1.7752169450405897, + "grad_norm": 1.4187566821904332, + "learning_rate": 0.0002480372128687196, + "loss": 7.0862, + "step": 19025 + }, + { + "epoch": 1.775310254735467, + "grad_norm": 0.8624018440425985, + "learning_rate": 0.0002480314880468297, + "loss": 7.5298, + "step": 19026 + }, + { + "epoch": 1.7754035644303443, + "grad_norm": 0.947325036276611, + "learning_rate": 0.0002480257629756747, + "loss": 7.372, + "step": 19027 + }, + { + "epoch": 1.7754968741252215, + "grad_norm": 0.9633285277710676, + "learning_rate": 0.00024802003765526926, + "loss": 7.4813, + "step": 19028 + }, + { + "epoch": 1.775590183820099, + "grad_norm": 1.3799183627360982, + "learning_rate": 0.0002480143120856279, + "loss": 7.0601, + "step": 19029 + }, + { + "epoch": 1.7756834935149763, + "grad_norm": 0.9564881766906544, + "learning_rate": 0.00024800858626676514, + "loss": 7.5792, + "step": 19030 + }, + { + "epoch": 1.7757768032098535, + "grad_norm": 3.626218993189484e+21, + "learning_rate": 0.0002480028601986957, + "loss": 7.3102, + "step": 19031 + }, + { + "epoch": 1.7758701129047307, + "grad_norm": 0.9968939678294169, + "learning_rate": 0.0002479971338814339, + "loss": 7.5396, + "step": 19032 + }, + { + "epoch": 1.7759634225996082, + "grad_norm": 1.0762333845169343, + "learning_rate": 0.00024799140731499443, + "loss": 7.6431, + "step": 19033 + }, + { + "epoch": 1.7760567322944854, + "grad_norm": 1.5379625577714777, + "learning_rate": 0.00024798568049939185, + "loss": 6.9305, + "step": 19034 + }, + { + "epoch": 1.7761500419893625, + "grad_norm": 0.9908393680991672, + "learning_rate": 0.0002479799534346407, + "loss": 7.7475, + "step": 19035 + }, + { + "epoch": 1.77624335168424, + "grad_norm": 0.8593428310357705, + "learning_rate": 0.00024797422612075556, + "loss": 7.583, + "step": 19036 + }, + { + "epoch": 1.7763366613791174, + "grad_norm": 8.88993716926113e+22, + "learning_rate": 0.00024796849855775096, + "loss": 7.4717, + "step": 19037 + }, + { + "epoch": 1.7764299710739946, + "grad_norm": 5.500084378063632e+21, + "learning_rate": 0.0002479627707456416, + "loss": 7.3645, + "step": 19038 + }, + { + "epoch": 1.7765232807688718, + "grad_norm": 1.0309170513856898, + "learning_rate": 0.0002479570426844418, + "loss": 7.458, + "step": 19039 + }, + { + "epoch": 1.7766165904637492, + "grad_norm": 0.9979119870753893, + "learning_rate": 0.00024795131437416635, + "loss": 7.333, + "step": 19040 + }, + { + "epoch": 1.7767099001586266, + "grad_norm": 1.1154011409860538, + "learning_rate": 0.00024794558581482965, + "loss": 7.5239, + "step": 19041 + }, + { + "epoch": 1.7768032098535038, + "grad_norm": 4.941109042943061e+23, + "learning_rate": 0.0002479398570064464, + "loss": 7.6823, + "step": 19042 + }, + { + "epoch": 1.776896519548381, + "grad_norm": 1.323185683462415, + "learning_rate": 0.00024793412794903104, + "loss": 7.7384, + "step": 19043 + }, + { + "epoch": 1.7769898292432584, + "grad_norm": 4.639659647164013e+23, + "learning_rate": 0.0002479283986425982, + "loss": 7.4713, + "step": 19044 + }, + { + "epoch": 1.7770831389381356, + "grad_norm": 1.022673585190104, + "learning_rate": 0.00024792266908716255, + "loss": 7.5286, + "step": 19045 + }, + { + "epoch": 1.7771764486330128, + "grad_norm": 2.827453436883232, + "learning_rate": 0.0002479169392827385, + "loss": 7.6664, + "step": 19046 + }, + { + "epoch": 1.7772697583278902, + "grad_norm": 1.7793198888900448, + "learning_rate": 0.00024791120922934063, + "loss": 7.3161, + "step": 19047 + }, + { + "epoch": 1.7773630680227677, + "grad_norm": 6.150597234588658e+23, + "learning_rate": 0.0002479054789269836, + "loss": 7.3292, + "step": 19048 + }, + { + "epoch": 1.7774563777176449, + "grad_norm": 1.4330492754979858e+23, + "learning_rate": 0.000247899748375682, + "loss": 7.2352, + "step": 19049 + }, + { + "epoch": 1.777549687412522, + "grad_norm": 320.22202947707717, + "learning_rate": 0.00024789401757545023, + "loss": 7.1699, + "step": 19050 + }, + { + "epoch": 1.7776429971073995, + "grad_norm": 0.9082817545139383, + "learning_rate": 0.00024788828652630305, + "loss": 7.4886, + "step": 19051 + }, + { + "epoch": 1.777736306802277, + "grad_norm": 2.002140327331555, + "learning_rate": 0.0002478825552282549, + "loss": 7.4104, + "step": 19052 + }, + { + "epoch": 1.777829616497154, + "grad_norm": 1.0940211908436615, + "learning_rate": 0.0002478768236813205, + "loss": 7.3746, + "step": 19053 + }, + { + "epoch": 1.7779229261920313, + "grad_norm": 1.5836045173936784, + "learning_rate": 0.00024787109188551427, + "loss": 7.8056, + "step": 19054 + }, + { + "epoch": 1.7780162358869087, + "grad_norm": 1.5253484141036806, + "learning_rate": 0.0002478653598408508, + "loss": 7.7218, + "step": 19055 + }, + { + "epoch": 1.778109545581786, + "grad_norm": 6.55019767286944e+23, + "learning_rate": 0.0002478596275473448, + "loss": 7.0553, + "step": 19056 + }, + { + "epoch": 1.778202855276663, + "grad_norm": 2.401935915402837, + "learning_rate": 0.0002478538950050107, + "loss": 7.7304, + "step": 19057 + }, + { + "epoch": 1.7782961649715405, + "grad_norm": 0.9609892543627706, + "learning_rate": 0.0002478481622138631, + "loss": 7.5772, + "step": 19058 + }, + { + "epoch": 1.778389474666418, + "grad_norm": 48.217262545264205, + "learning_rate": 0.0002478424291739167, + "loss": 7.2424, + "step": 19059 + }, + { + "epoch": 1.7784827843612951, + "grad_norm": 1.0082110612507664, + "learning_rate": 0.00024783669588518593, + "loss": 7.6785, + "step": 19060 + }, + { + "epoch": 1.7785760940561723, + "grad_norm": 1.5851002588640422, + "learning_rate": 0.0002478309623476854, + "loss": 7.2345, + "step": 19061 + }, + { + "epoch": 1.7786694037510498, + "grad_norm": 3.2072360613266064, + "learning_rate": 0.00024782522856142973, + "loss": 7.6399, + "step": 19062 + }, + { + "epoch": 1.7787627134459272, + "grad_norm": 1.2783718865905815, + "learning_rate": 0.0002478194945264335, + "loss": 7.6297, + "step": 19063 + }, + { + "epoch": 1.7788560231408044, + "grad_norm": 3.57878750605945e+22, + "learning_rate": 0.0002478137602427113, + "loss": 7.6129, + "step": 19064 + }, + { + "epoch": 1.7789493328356816, + "grad_norm": 1.1851864754519241, + "learning_rate": 0.0002478080257102776, + "loss": 7.4054, + "step": 19065 + }, + { + "epoch": 1.779042642530559, + "grad_norm": 1.2612316456060153, + "learning_rate": 0.00024780229092914713, + "loss": 7.1375, + "step": 19066 + }, + { + "epoch": 1.7791359522254362, + "grad_norm": 3.1976282891609426e+23, + "learning_rate": 0.0002477965558993344, + "loss": 7.1332, + "step": 19067 + }, + { + "epoch": 1.7792292619203134, + "grad_norm": 1.115190066716505e+23, + "learning_rate": 0.000247790820620854, + "loss": 7.6073, + "step": 19068 + }, + { + "epoch": 1.7793225716151908, + "grad_norm": 3.109449830777312, + "learning_rate": 0.0002477850850937204, + "loss": 7.5159, + "step": 19069 + }, + { + "epoch": 1.7794158813100682, + "grad_norm": 1.0149125447683738, + "learning_rate": 0.0002477793493179485, + "loss": 7.1411, + "step": 19070 + }, + { + "epoch": 1.7795091910049454, + "grad_norm": 1.153481595381054, + "learning_rate": 0.0002477736132935525, + "loss": 7.5065, + "step": 19071 + }, + { + "epoch": 1.7796025006998226, + "grad_norm": 2.5826446985044557e+23, + "learning_rate": 0.00024776787702054725, + "loss": 7.6291, + "step": 19072 + }, + { + "epoch": 1.7796958103947, + "grad_norm": 1.3963246197113945, + "learning_rate": 0.0002477621404989472, + "loss": 7.5878, + "step": 19073 + }, + { + "epoch": 1.7797891200895775, + "grad_norm": 1.774154071103055, + "learning_rate": 0.00024775640372876707, + "loss": 7.1166, + "step": 19074 + }, + { + "epoch": 1.7798824297844547, + "grad_norm": 7.396201552524732e+23, + "learning_rate": 0.00024775066671002126, + "loss": 7.3484, + "step": 19075 + }, + { + "epoch": 1.7799757394793319, + "grad_norm": 7.426584596739533e+23, + "learning_rate": 0.00024774492944272454, + "loss": 7.5867, + "step": 19076 + }, + { + "epoch": 1.7800690491742093, + "grad_norm": 1.286415071390443, + "learning_rate": 0.0002477391919268914, + "loss": 7.415, + "step": 19077 + }, + { + "epoch": 1.7801623588690865, + "grad_norm": 1.1519512444054163, + "learning_rate": 0.0002477334541625364, + "loss": 7.341, + "step": 19078 + }, + { + "epoch": 1.7802556685639637, + "grad_norm": 1.1278573966955778, + "learning_rate": 0.0002477277161496742, + "loss": 7.5516, + "step": 19079 + }, + { + "epoch": 1.780348978258841, + "grad_norm": 1.4213487554659808e+25, + "learning_rate": 0.0002477219778883194, + "loss": 7.5895, + "step": 19080 + }, + { + "epoch": 1.7804422879537185, + "grad_norm": 1.6872377227034507, + "learning_rate": 0.0002477162393784865, + "loss": 7.7627, + "step": 19081 + }, + { + "epoch": 1.7805355976485957, + "grad_norm": 1.3364307602510477, + "learning_rate": 0.00024771050062019027, + "loss": 7.4979, + "step": 19082 + }, + { + "epoch": 1.780628907343473, + "grad_norm": 1.0263034543097302, + "learning_rate": 0.00024770476161344507, + "loss": 7.4978, + "step": 19083 + }, + { + "epoch": 1.7807222170383503, + "grad_norm": 1.2401385873461775e+24, + "learning_rate": 0.0002476990223582656, + "loss": 7.4717, + "step": 19084 + }, + { + "epoch": 1.7808155267332277, + "grad_norm": 1.2623822699111553, + "learning_rate": 0.0002476932828546665, + "loss": 7.9955, + "step": 19085 + }, + { + "epoch": 1.780908836428105, + "grad_norm": 4.198995636706608, + "learning_rate": 0.0002476875431026623, + "loss": 7.9815, + "step": 19086 + }, + { + "epoch": 1.7810021461229821, + "grad_norm": 2.0699486607933653e+24, + "learning_rate": 0.00024768180310226766, + "loss": 7.6658, + "step": 19087 + }, + { + "epoch": 1.7810954558178596, + "grad_norm": 1.1530673928900184, + "learning_rate": 0.00024767606285349704, + "loss": 7.7268, + "step": 19088 + }, + { + "epoch": 1.7811887655127367, + "grad_norm": 1.7685502411911815, + "learning_rate": 0.0002476703223563652, + "loss": 7.5614, + "step": 19089 + }, + { + "epoch": 1.781282075207614, + "grad_norm": 1.2266719995128788, + "learning_rate": 0.00024766458161088666, + "loss": 7.4791, + "step": 19090 + }, + { + "epoch": 1.7813753849024914, + "grad_norm": 7.280834706035832e+24, + "learning_rate": 0.000247658840617076, + "loss": 7.5548, + "step": 19091 + }, + { + "epoch": 1.7814686945973688, + "grad_norm": 1.4388773046641936, + "learning_rate": 0.00024765309937494785, + "loss": 7.6108, + "step": 19092 + }, + { + "epoch": 1.781562004292246, + "grad_norm": 1.5460813061787717, + "learning_rate": 0.00024764735788451676, + "loss": 7.5943, + "step": 19093 + }, + { + "epoch": 1.7816553139871232, + "grad_norm": 1.888305356358371, + "learning_rate": 0.0002476416161457974, + "loss": 7.6099, + "step": 19094 + }, + { + "epoch": 1.7817486236820006, + "grad_norm": 1.4772871727066375, + "learning_rate": 0.0002476358741588043, + "loss": 7.7197, + "step": 19095 + }, + { + "epoch": 1.781841933376878, + "grad_norm": 1.7726347491984153, + "learning_rate": 0.00024763013192355215, + "loss": 7.9987, + "step": 19096 + }, + { + "epoch": 1.781935243071755, + "grad_norm": 2.7705260061997547e+26, + "learning_rate": 0.00024762438944005545, + "loss": 7.4496, + "step": 19097 + }, + { + "epoch": 1.7820285527666324, + "grad_norm": 1.274590348160889, + "learning_rate": 0.0002476186467083289, + "loss": 7.4648, + "step": 19098 + }, + { + "epoch": 1.7821218624615098, + "grad_norm": 1.4875922158842, + "learning_rate": 0.00024761290372838697, + "loss": 7.853, + "step": 19099 + }, + { + "epoch": 1.782215172156387, + "grad_norm": 1.6076443395777935e+27, + "learning_rate": 0.00024760716050024434, + "loss": 7.3213, + "step": 19100 + }, + { + "epoch": 1.7823084818512642, + "grad_norm": 2.4161615786213466, + "learning_rate": 0.0002476014170239157, + "loss": 7.4026, + "step": 19101 + }, + { + "epoch": 1.7824017915461416, + "grad_norm": 1.1196677733877227, + "learning_rate": 0.0002475956732994155, + "loss": 7.591, + "step": 19102 + }, + { + "epoch": 1.782495101241019, + "grad_norm": 14.297719685466705, + "learning_rate": 0.00024758992932675844, + "loss": 7.3255, + "step": 19103 + }, + { + "epoch": 1.7825884109358963, + "grad_norm": 1.2363080172954104, + "learning_rate": 0.00024758418510595906, + "loss": 7.6419, + "step": 19104 + }, + { + "epoch": 1.7826817206307735, + "grad_norm": 1.387178388425751, + "learning_rate": 0.000247578440637032, + "loss": 7.3854, + "step": 19105 + }, + { + "epoch": 1.7827750303256509, + "grad_norm": 1.196735552840646, + "learning_rate": 0.00024757269591999193, + "loss": 7.2566, + "step": 19106 + }, + { + "epoch": 1.7828683400205283, + "grad_norm": 1.497410954744657, + "learning_rate": 0.0002475669509548534, + "loss": 7.4793, + "step": 19107 + }, + { + "epoch": 1.7829616497154053, + "grad_norm": 1.9787876693017044, + "learning_rate": 0.00024756120574163096, + "loss": 7.5833, + "step": 19108 + }, + { + "epoch": 1.7830549594102827, + "grad_norm": 1.5795597437353064, + "learning_rate": 0.0002475554602803393, + "loss": 7.7214, + "step": 19109 + }, + { + "epoch": 1.78314826910516, + "grad_norm": 1.726687396748399, + "learning_rate": 0.000247549714570993, + "loss": 7.7336, + "step": 19110 + }, + { + "epoch": 1.7832415788000373, + "grad_norm": 3.174839185870439e+24, + "learning_rate": 0.00024754396861360667, + "loss": 7.4818, + "step": 19111 + }, + { + "epoch": 1.7833348884949145, + "grad_norm": 2.811128558763366e+25, + "learning_rate": 0.00024753822240819494, + "loss": 7.4061, + "step": 19112 + }, + { + "epoch": 1.783428198189792, + "grad_norm": 2.3076654895340147, + "learning_rate": 0.00024753247595477235, + "loss": 7.2108, + "step": 19113 + }, + { + "epoch": 1.7835215078846693, + "grad_norm": 2.3034452873762457e+23, + "learning_rate": 0.00024752672925335364, + "loss": 7.6922, + "step": 19114 + }, + { + "epoch": 1.7836148175795465, + "grad_norm": 1.38116911235703, + "learning_rate": 0.0002475209823039533, + "loss": 7.5161, + "step": 19115 + }, + { + "epoch": 1.7837081272744237, + "grad_norm": 2.9192549068721122e+26, + "learning_rate": 0.000247515235106586, + "loss": 7.1449, + "step": 19116 + }, + { + "epoch": 1.7838014369693012, + "grad_norm": 1.199055215736986, + "learning_rate": 0.0002475094876612664, + "loss": 7.2271, + "step": 19117 + }, + { + "epoch": 1.7838947466641784, + "grad_norm": 2.0674903074370348, + "learning_rate": 0.00024750373996800895, + "loss": 7.9479, + "step": 19118 + }, + { + "epoch": 1.7839880563590556, + "grad_norm": 8.16726547745407e+24, + "learning_rate": 0.0002474979920268284, + "loss": 7.5551, + "step": 19119 + }, + { + "epoch": 1.784081366053933, + "grad_norm": 2.5238819403551562, + "learning_rate": 0.00024749224383773937, + "loss": 7.6268, + "step": 19120 + }, + { + "epoch": 1.7841746757488104, + "grad_norm": 2.2876848746259433, + "learning_rate": 0.00024748649540075644, + "loss": 7.7387, + "step": 19121 + }, + { + "epoch": 1.7842679854436876, + "grad_norm": 2.7869167663837184e+26, + "learning_rate": 0.00024748074671589426, + "loss": 7.151, + "step": 19122 + }, + { + "epoch": 1.7843612951385648, + "grad_norm": 4.4782482222857956e+23, + "learning_rate": 0.00024747499778316736, + "loss": 7.7514, + "step": 19123 + }, + { + "epoch": 1.7844546048334422, + "grad_norm": 2.6329822679408794, + "learning_rate": 0.0002474692486025904, + "loss": 7.7906, + "step": 19124 + }, + { + "epoch": 1.7845479145283196, + "grad_norm": 1.1720372592425896, + "learning_rate": 0.00024746349917417813, + "loss": 7.5065, + "step": 19125 + }, + { + "epoch": 1.7846412242231968, + "grad_norm": 3.24279597028213, + "learning_rate": 0.0002474577494979449, + "loss": 7.1909, + "step": 19126 + }, + { + "epoch": 1.784734533918074, + "grad_norm": 1.9807787751434274, + "learning_rate": 0.00024745199957390557, + "loss": 7.562, + "step": 19127 + }, + { + "epoch": 1.7848278436129514, + "grad_norm": 1.5248598293149151, + "learning_rate": 0.0002474462494020747, + "loss": 7.4086, + "step": 19128 + }, + { + "epoch": 1.7849211533078286, + "grad_norm": 1.506807110236928, + "learning_rate": 0.0002474404989824668, + "loss": 7.2286, + "step": 19129 + }, + { + "epoch": 1.7850144630027058, + "grad_norm": 1.2817663053500983, + "learning_rate": 0.0002474347483150966, + "loss": 7.6642, + "step": 19130 + }, + { + "epoch": 1.7851077726975833, + "grad_norm": 1.1988976342537283, + "learning_rate": 0.00024742899739997875, + "loss": 7.5792, + "step": 19131 + }, + { + "epoch": 1.7852010823924607, + "grad_norm": 1.973440312679031e+23, + "learning_rate": 0.0002474232462371278, + "loss": 7.6688, + "step": 19132 + }, + { + "epoch": 1.7852943920873379, + "grad_norm": 1.2769250723944134, + "learning_rate": 0.00024741749482655836, + "loss": 7.592, + "step": 19133 + }, + { + "epoch": 1.785387701782215, + "grad_norm": 1.240384417452536, + "learning_rate": 0.0002474117431682851, + "loss": 7.1376, + "step": 19134 + }, + { + "epoch": 1.7854810114770925, + "grad_norm": 1.458613158370884, + "learning_rate": 0.0002474059912623226, + "loss": 7.4011, + "step": 19135 + }, + { + "epoch": 1.78557432117197, + "grad_norm": 1.4201750724523399, + "learning_rate": 0.0002474002391086856, + "loss": 7.4334, + "step": 19136 + }, + { + "epoch": 1.785667630866847, + "grad_norm": 1.1011705859014134, + "learning_rate": 0.0002473944867073886, + "loss": 7.6404, + "step": 19137 + }, + { + "epoch": 1.7857609405617243, + "grad_norm": 1.2532313725518631, + "learning_rate": 0.00024738873405844627, + "loss": 7.3134, + "step": 19138 + }, + { + "epoch": 1.7858542502566017, + "grad_norm": 1.4728022878558198, + "learning_rate": 0.0002473829811618732, + "loss": 7.8477, + "step": 19139 + }, + { + "epoch": 1.785947559951479, + "grad_norm": 1.452673956992455, + "learning_rate": 0.00024737722801768407, + "loss": 7.5405, + "step": 19140 + }, + { + "epoch": 1.786040869646356, + "grad_norm": 1.1135298027755227, + "learning_rate": 0.0002473714746258936, + "loss": 7.4913, + "step": 19141 + }, + { + "epoch": 1.7861341793412335, + "grad_norm": 1.4483498494444733, + "learning_rate": 0.0002473657209865162, + "loss": 7.639, + "step": 19142 + }, + { + "epoch": 1.786227489036111, + "grad_norm": 1.7798683984521242, + "learning_rate": 0.0002473599670995666, + "loss": 7.1186, + "step": 19143 + }, + { + "epoch": 1.7863207987309881, + "grad_norm": 1.4705820322554384, + "learning_rate": 0.0002473542129650595, + "loss": 7.2923, + "step": 19144 + }, + { + "epoch": 1.7864141084258653, + "grad_norm": 1.0490674361655932, + "learning_rate": 0.00024734845858300945, + "loss": 7.348, + "step": 19145 + }, + { + "epoch": 1.7865074181207428, + "grad_norm": 1.2876962211017715, + "learning_rate": 0.00024734270395343114, + "loss": 7.6685, + "step": 19146 + }, + { + "epoch": 1.7866007278156202, + "grad_norm": 1.5329977117851092, + "learning_rate": 0.0002473369490763392, + "loss": 7.6572, + "step": 19147 + }, + { + "epoch": 1.7866940375104974, + "grad_norm": 2.022560563408488, + "learning_rate": 0.00024733119395174816, + "loss": 7.6677, + "step": 19148 + }, + { + "epoch": 1.7867873472053746, + "grad_norm": 4.5440518991870084e+24, + "learning_rate": 0.00024732543857967275, + "loss": 7.333, + "step": 19149 + }, + { + "epoch": 1.786880656900252, + "grad_norm": 1.4055085172223247, + "learning_rate": 0.00024731968296012755, + "loss": 7.2418, + "step": 19150 + }, + { + "epoch": 1.7869739665951292, + "grad_norm": 1.0880589702654588e+23, + "learning_rate": 0.0002473139270931273, + "loss": 7.3631, + "step": 19151 + }, + { + "epoch": 1.7870672762900064, + "grad_norm": 1.3946645772457438, + "learning_rate": 0.00024730817097868646, + "loss": 7.0881, + "step": 19152 + }, + { + "epoch": 1.7871605859848838, + "grad_norm": 1.106826417653464, + "learning_rate": 0.0002473024146168198, + "loss": 7.4092, + "step": 19153 + }, + { + "epoch": 1.7872538956797612, + "grad_norm": 2.1579983408607326, + "learning_rate": 0.00024729665800754194, + "loss": 7.7537, + "step": 19154 + }, + { + "epoch": 1.7873472053746384, + "grad_norm": 1.698121585011032, + "learning_rate": 0.0002472909011508675, + "loss": 7.7611, + "step": 19155 + }, + { + "epoch": 1.7874405150695156, + "grad_norm": 8.955670773283603e+25, + "learning_rate": 0.00024728514404681113, + "loss": 7.5506, + "step": 19156 + }, + { + "epoch": 1.787533824764393, + "grad_norm": 1.4171598687215325, + "learning_rate": 0.00024727938669538745, + "loss": 7.7195, + "step": 19157 + }, + { + "epoch": 1.7876271344592705, + "grad_norm": 1.0950242566865551, + "learning_rate": 0.0002472736290966111, + "loss": 7.4072, + "step": 19158 + }, + { + "epoch": 1.7877204441541477, + "grad_norm": 1.1581916342990355, + "learning_rate": 0.00024726787125049673, + "loss": 7.6671, + "step": 19159 + }, + { + "epoch": 1.7878137538490249, + "grad_norm": 2.1213297234695002e+24, + "learning_rate": 0.000247262113157059, + "loss": 7.5829, + "step": 19160 + }, + { + "epoch": 1.7879070635439023, + "grad_norm": 1.5379519095291025, + "learning_rate": 0.0002472563548163125, + "loss": 7.6283, + "step": 19161 + }, + { + "epoch": 1.7880003732387795, + "grad_norm": 1.6395254392820036, + "learning_rate": 0.0002472505962282719, + "loss": 7.2491, + "step": 19162 + }, + { + "epoch": 1.7880936829336567, + "grad_norm": 1.6147751731601467, + "learning_rate": 0.0002472448373929518, + "loss": 7.3708, + "step": 19163 + }, + { + "epoch": 1.788186992628534, + "grad_norm": 1.4643267613692688, + "learning_rate": 0.00024723907831036693, + "loss": 7.328, + "step": 19164 + }, + { + "epoch": 1.7882803023234115, + "grad_norm": 2.164729442194239, + "learning_rate": 0.00024723331898053184, + "loss": 7.6254, + "step": 19165 + }, + { + "epoch": 1.7883736120182887, + "grad_norm": 1.8213862845311797, + "learning_rate": 0.0002472275594034613, + "loss": 7.0068, + "step": 19166 + }, + { + "epoch": 1.788466921713166, + "grad_norm": 1.3681900661413988, + "learning_rate": 0.00024722179957916985, + "loss": 7.6218, + "step": 19167 + }, + { + "epoch": 1.7885602314080433, + "grad_norm": 2.9426000044030298e+23, + "learning_rate": 0.00024721603950767217, + "loss": 7.5873, + "step": 19168 + }, + { + "epoch": 1.7886535411029207, + "grad_norm": 6.125934368967049e+24, + "learning_rate": 0.00024721027918898285, + "loss": 7.5425, + "step": 19169 + }, + { + "epoch": 1.788746850797798, + "grad_norm": 1.115980818849678, + "learning_rate": 0.0002472045186231166, + "loss": 7.0613, + "step": 19170 + }, + { + "epoch": 1.7888401604926751, + "grad_norm": 1.6956758065464623, + "learning_rate": 0.0002471987578100881, + "loss": 7.561, + "step": 19171 + }, + { + "epoch": 1.7889334701875526, + "grad_norm": 1.250329874739411, + "learning_rate": 0.00024719299674991197, + "loss": 7.164, + "step": 19172 + }, + { + "epoch": 1.7890267798824298, + "grad_norm": 1.1024295202880015, + "learning_rate": 0.00024718723544260275, + "loss": 7.4027, + "step": 19173 + }, + { + "epoch": 1.789120089577307, + "grad_norm": 1.539021750558509, + "learning_rate": 0.00024718147388817525, + "loss": 7.7435, + "step": 19174 + }, + { + "epoch": 1.7892133992721844, + "grad_norm": 2.173432529799017, + "learning_rate": 0.000247175712086644, + "loss": 6.8699, + "step": 19175 + }, + { + "epoch": 1.7893067089670618, + "grad_norm": 2.264949661159258, + "learning_rate": 0.0002471699500380237, + "loss": 7.5349, + "step": 19176 + }, + { + "epoch": 1.789400018661939, + "grad_norm": 1.1678562261349943, + "learning_rate": 0.00024716418774232903, + "loss": 7.451, + "step": 19177 + }, + { + "epoch": 1.7894933283568162, + "grad_norm": 1.3730736339044198, + "learning_rate": 0.0002471584251995746, + "loss": 7.7138, + "step": 19178 + }, + { + "epoch": 1.7895866380516936, + "grad_norm": 1.8374017473827932e+25, + "learning_rate": 0.00024715266240977514, + "loss": 7.5015, + "step": 19179 + }, + { + "epoch": 1.789679947746571, + "grad_norm": 1.3905844108121181, + "learning_rate": 0.00024714689937294516, + "loss": 7.7299, + "step": 19180 + }, + { + "epoch": 1.7897732574414482, + "grad_norm": 1.5294099063759519, + "learning_rate": 0.0002471411360890994, + "loss": 7.2331, + "step": 19181 + }, + { + "epoch": 1.7898665671363254, + "grad_norm": 1.266838754491407, + "learning_rate": 0.00024713537255825253, + "loss": 7.8089, + "step": 19182 + }, + { + "epoch": 1.7899598768312028, + "grad_norm": 8.221992604640791e+22, + "learning_rate": 0.00024712960878041915, + "loss": 7.5321, + "step": 19183 + }, + { + "epoch": 1.79005318652608, + "grad_norm": 6.765593351727352e+24, + "learning_rate": 0.00024712384475561396, + "loss": 7.3779, + "step": 19184 + }, + { + "epoch": 1.7901464962209572, + "grad_norm": 1.118496572364423, + "learning_rate": 0.0002471180804838516, + "loss": 7.5154, + "step": 19185 + }, + { + "epoch": 1.7902398059158346, + "grad_norm": 1.639938905156231e+25, + "learning_rate": 0.00024711231596514675, + "loss": 7.5503, + "step": 19186 + }, + { + "epoch": 1.790333115610712, + "grad_norm": 1.3779315917575832, + "learning_rate": 0.00024710655119951407, + "loss": 7.6485, + "step": 19187 + }, + { + "epoch": 1.7904264253055893, + "grad_norm": 1.1531946824456163, + "learning_rate": 0.0002471007861869681, + "loss": 7.5202, + "step": 19188 + }, + { + "epoch": 1.7905197350004665, + "grad_norm": 1.596532181658686, + "learning_rate": 0.0002470950209275237, + "loss": 7.7995, + "step": 19189 + }, + { + "epoch": 1.7906130446953439, + "grad_norm": 1.216163154477628, + "learning_rate": 0.00024708925542119535, + "loss": 7.4939, + "step": 19190 + }, + { + "epoch": 1.7907063543902213, + "grad_norm": 1.3673477586895708, + "learning_rate": 0.0002470834896679979, + "loss": 7.8508, + "step": 19191 + }, + { + "epoch": 1.7907996640850985, + "grad_norm": 1.3292569427771264, + "learning_rate": 0.0002470777236679458, + "loss": 7.8346, + "step": 19192 + }, + { + "epoch": 1.7908929737799757, + "grad_norm": 1.8648240424449651, + "learning_rate": 0.00024707195742105375, + "loss": 7.7025, + "step": 19193 + }, + { + "epoch": 1.7909862834748531, + "grad_norm": 1.2657703358993733, + "learning_rate": 0.00024706619092733654, + "loss": 7.5959, + "step": 19194 + }, + { + "epoch": 1.7910795931697303, + "grad_norm": 1.5108125878580043, + "learning_rate": 0.00024706042418680873, + "loss": 7.5063, + "step": 19195 + }, + { + "epoch": 1.7911729028646075, + "grad_norm": 4.236537948582111e+22, + "learning_rate": 0.000247054657199485, + "loss": 7.5542, + "step": 19196 + }, + { + "epoch": 1.791266212559485, + "grad_norm": 6.086830508064458e+23, + "learning_rate": 0.0002470488899653801, + "loss": 7.8006, + "step": 19197 + }, + { + "epoch": 1.7913595222543623, + "grad_norm": 4.455912260802715e+23, + "learning_rate": 0.00024704312248450855, + "loss": 7.4869, + "step": 19198 + }, + { + "epoch": 1.7914528319492395, + "grad_norm": 1.1179503482668247, + "learning_rate": 0.00024703735475688514, + "loss": 7.3783, + "step": 19199 + }, + { + "epoch": 1.7915461416441167, + "grad_norm": 1.1831402293779034, + "learning_rate": 0.00024703158678252447, + "loss": 7.5833, + "step": 19200 + }, + { + "epoch": 1.7916394513389942, + "grad_norm": 1.364227227691228e+23, + "learning_rate": 0.00024702581856144123, + "loss": 7.1952, + "step": 19201 + }, + { + "epoch": 1.7917327610338716, + "grad_norm": 1.9856243482784068, + "learning_rate": 0.00024702005009365, + "loss": 7.9706, + "step": 19202 + }, + { + "epoch": 1.7918260707287486, + "grad_norm": 1.2678008928777702, + "learning_rate": 0.00024701428137916556, + "loss": 7.5908, + "step": 19203 + }, + { + "epoch": 1.791919380423626, + "grad_norm": 1.1823151366124771, + "learning_rate": 0.00024700851241800256, + "loss": 7.6087, + "step": 19204 + }, + { + "epoch": 1.7920126901185034, + "grad_norm": 3.94200710219497e+23, + "learning_rate": 0.00024700274321017566, + "loss": 7.6154, + "step": 19205 + }, + { + "epoch": 1.7921059998133806, + "grad_norm": 1.212037918980366, + "learning_rate": 0.0002469969737556995, + "loss": 7.509, + "step": 19206 + }, + { + "epoch": 1.7921993095082578, + "grad_norm": 1.258548209090449, + "learning_rate": 0.00024699120405458877, + "loss": 7.6405, + "step": 19207 + }, + { + "epoch": 1.7922926192031352, + "grad_norm": 1.3187573298754227, + "learning_rate": 0.0002469854341068581, + "loss": 7.7395, + "step": 19208 + }, + { + "epoch": 1.7923859288980126, + "grad_norm": 2.809802017134122e+23, + "learning_rate": 0.0002469796639125222, + "loss": 7.2861, + "step": 19209 + }, + { + "epoch": 1.7924792385928898, + "grad_norm": 5.223731920788293e+24, + "learning_rate": 0.0002469738934715958, + "loss": 7.4247, + "step": 19210 + }, + { + "epoch": 1.792572548287767, + "grad_norm": 1.3407494938731908e+24, + "learning_rate": 0.0002469681227840935, + "loss": 7.5765, + "step": 19211 + }, + { + "epoch": 1.7926658579826444, + "grad_norm": 1.3547097496144294, + "learning_rate": 0.00024696235185003, + "loss": 7.5458, + "step": 19212 + }, + { + "epoch": 1.7927591676775219, + "grad_norm": 1.322934304491459, + "learning_rate": 0.00024695658066941994, + "loss": 6.9772, + "step": 19213 + }, + { + "epoch": 1.7928524773723988, + "grad_norm": 1.6721822762629865, + "learning_rate": 0.000246950809242278, + "loss": 7.681, + "step": 19214 + }, + { + "epoch": 1.7929457870672763, + "grad_norm": 1.6019104308381884, + "learning_rate": 0.0002469450375686189, + "loss": 7.4115, + "step": 19215 + }, + { + "epoch": 1.7930390967621537, + "grad_norm": 2.220978438538615, + "learning_rate": 0.0002469392656484573, + "loss": 7.6794, + "step": 19216 + }, + { + "epoch": 1.7931324064570309, + "grad_norm": 1.0979201965066665, + "learning_rate": 0.00024693349348180784, + "loss": 7.2669, + "step": 19217 + }, + { + "epoch": 1.793225716151908, + "grad_norm": 1.266093454341223, + "learning_rate": 0.0002469277210686852, + "loss": 7.5827, + "step": 19218 + }, + { + "epoch": 1.7933190258467855, + "grad_norm": 2.674612492424119e+25, + "learning_rate": 0.00024692194840910406, + "loss": 7.3686, + "step": 19219 + }, + { + "epoch": 1.793412335541663, + "grad_norm": 1.2798609006208197, + "learning_rate": 0.0002469161755030792, + "loss": 7.6834, + "step": 19220 + }, + { + "epoch": 1.79350564523654, + "grad_norm": 1.3073963893670828, + "learning_rate": 0.0002469104023506251, + "loss": 7.4631, + "step": 19221 + }, + { + "epoch": 1.7935989549314173, + "grad_norm": 2.038969384888391, + "learning_rate": 0.00024690462895175666, + "loss": 7.2997, + "step": 19222 + }, + { + "epoch": 1.7936922646262947, + "grad_norm": 1.623136273906067, + "learning_rate": 0.00024689885530648846, + "loss": 7.5878, + "step": 19223 + }, + { + "epoch": 1.793785574321172, + "grad_norm": 1.4559298060758938, + "learning_rate": 0.0002468930814148351, + "loss": 7.5028, + "step": 19224 + }, + { + "epoch": 1.7938788840160491, + "grad_norm": 1.8253953860818898e+25, + "learning_rate": 0.00024688730727681136, + "loss": 7.6929, + "step": 19225 + }, + { + "epoch": 1.7939721937109265, + "grad_norm": 7.907481487295172e+25, + "learning_rate": 0.0002468815328924319, + "loss": 7.4467, + "step": 19226 + }, + { + "epoch": 1.794065503405804, + "grad_norm": 1.5143678322701004, + "learning_rate": 0.0002468757582617114, + "loss": 7.525, + "step": 19227 + }, + { + "epoch": 1.7941588131006811, + "grad_norm": 1.3516631580839882, + "learning_rate": 0.00024686998338466454, + "loss": 7.3883, + "step": 19228 + }, + { + "epoch": 1.7942521227955583, + "grad_norm": 1.1175485334693054, + "learning_rate": 0.00024686420826130605, + "loss": 7.4953, + "step": 19229 + }, + { + "epoch": 1.7943454324904358, + "grad_norm": 1.3392095731624345, + "learning_rate": 0.0002468584328916505, + "loss": 7.5121, + "step": 19230 + }, + { + "epoch": 1.7944387421853132, + "grad_norm": 1.3240865161464677, + "learning_rate": 0.0002468526572757127, + "loss": 7.4067, + "step": 19231 + }, + { + "epoch": 1.7945320518801904, + "grad_norm": 1.3622126792050644, + "learning_rate": 0.00024684688141350723, + "loss": 7.3046, + "step": 19232 + }, + { + "epoch": 1.7946253615750676, + "grad_norm": 1.1924715277103304, + "learning_rate": 0.0002468411053050489, + "loss": 7.6387, + "step": 19233 + }, + { + "epoch": 1.794718671269945, + "grad_norm": 1.5828477063368118, + "learning_rate": 0.00024683532895035225, + "loss": 7.7685, + "step": 19234 + }, + { + "epoch": 1.7948119809648222, + "grad_norm": 1.4238098389711231, + "learning_rate": 0.0002468295523494321, + "loss": 7.5104, + "step": 19235 + }, + { + "epoch": 1.7949052906596994, + "grad_norm": 1.2506341012567745, + "learning_rate": 0.00024682377550230307, + "loss": 7.4076, + "step": 19236 + }, + { + "epoch": 1.7949986003545768, + "grad_norm": 1.417250803544641, + "learning_rate": 0.0002468179984089798, + "loss": 7.8271, + "step": 19237 + }, + { + "epoch": 1.7950919100494542, + "grad_norm": 1.201718598664745, + "learning_rate": 0.00024681222106947713, + "loss": 7.716, + "step": 19238 + }, + { + "epoch": 1.7951852197443314, + "grad_norm": 1.3004361510773967, + "learning_rate": 0.00024680644348380965, + "loss": 7.764, + "step": 19239 + }, + { + "epoch": 1.7952785294392086, + "grad_norm": 1.2548123822923702, + "learning_rate": 0.000246800665651992, + "loss": 7.5984, + "step": 19240 + }, + { + "epoch": 1.795371839134086, + "grad_norm": 3.873628329908659, + "learning_rate": 0.00024679488757403895, + "loss": 7.3778, + "step": 19241 + }, + { + "epoch": 1.7954651488289635, + "grad_norm": 3.122757545366675e+28, + "learning_rate": 0.0002467891092499652, + "loss": 7.5046, + "step": 19242 + }, + { + "epoch": 1.7955584585238407, + "grad_norm": 1.3904174834609455, + "learning_rate": 0.0002467833306797854, + "loss": 7.6696, + "step": 19243 + }, + { + "epoch": 1.7956517682187179, + "grad_norm": 1.588029519306095, + "learning_rate": 0.0002467775518635143, + "loss": 7.8198, + "step": 19244 + }, + { + "epoch": 1.7957450779135953, + "grad_norm": 6.1863960211847935e+28, + "learning_rate": 0.0002467717728011665, + "loss": 7.1687, + "step": 19245 + }, + { + "epoch": 1.7958383876084725, + "grad_norm": 1.2728050776436854, + "learning_rate": 0.00024676599349275673, + "loss": 7.4942, + "step": 19246 + }, + { + "epoch": 1.7959316973033497, + "grad_norm": 1.4393364057817397, + "learning_rate": 0.00024676021393829975, + "loss": 7.1855, + "step": 19247 + }, + { + "epoch": 1.796025006998227, + "grad_norm": 1.208645272264734, + "learning_rate": 0.0002467544341378102, + "loss": 7.5039, + "step": 19248 + }, + { + "epoch": 1.7961183166931045, + "grad_norm": 1.1153279741288338, + "learning_rate": 0.0002467486540913028, + "loss": 7.5595, + "step": 19249 + }, + { + "epoch": 1.7962116263879817, + "grad_norm": 1.411463943122864, + "learning_rate": 0.0002467428737987922, + "loss": 7.7682, + "step": 19250 + }, + { + "epoch": 1.796304936082859, + "grad_norm": 1.8239096231701528, + "learning_rate": 0.0002467370932602931, + "loss": 7.134, + "step": 19251 + }, + { + "epoch": 1.7963982457777363, + "grad_norm": 1.2497376840325993, + "learning_rate": 0.0002467313124758203, + "loss": 7.4087, + "step": 19252 + }, + { + "epoch": 1.7964915554726137, + "grad_norm": 5.736489821634249e+27, + "learning_rate": 0.0002467255314453884, + "loss": 7.0961, + "step": 19253 + }, + { + "epoch": 1.796584865167491, + "grad_norm": 1.106439107993079, + "learning_rate": 0.0002467197501690121, + "loss": 7.0391, + "step": 19254 + }, + { + "epoch": 1.7966781748623681, + "grad_norm": 1.9963478986603418, + "learning_rate": 0.00024671396864670616, + "loss": 7.868, + "step": 19255 + }, + { + "epoch": 1.7967714845572456, + "grad_norm": 2.0963205717634343, + "learning_rate": 0.0002467081868784853, + "loss": 7.6495, + "step": 19256 + }, + { + "epoch": 1.7968647942521228, + "grad_norm": 2.1196156777546036, + "learning_rate": 0.00024670240486436405, + "loss": 7.5996, + "step": 19257 + }, + { + "epoch": 1.796958103947, + "grad_norm": 1.3448042304014902, + "learning_rate": 0.0002466966226043573, + "loss": 7.4061, + "step": 19258 + }, + { + "epoch": 1.7970514136418774, + "grad_norm": 1.5459383265934539, + "learning_rate": 0.00024669084009847964, + "loss": 7.5767, + "step": 19259 + }, + { + "epoch": 1.7971447233367548, + "grad_norm": 2.333429717751965e+28, + "learning_rate": 0.00024668505734674586, + "loss": 7.3915, + "step": 19260 + }, + { + "epoch": 1.797238033031632, + "grad_norm": 1.5487083946963227, + "learning_rate": 0.0002466792743491706, + "loss": 7.2937, + "step": 19261 + }, + { + "epoch": 1.7973313427265092, + "grad_norm": 6.801472573447625e+26, + "learning_rate": 0.0002466734911057685, + "loss": 7.3588, + "step": 19262 + }, + { + "epoch": 1.7974246524213866, + "grad_norm": 1.4992721805145932, + "learning_rate": 0.0002466677076165545, + "loss": 7.9104, + "step": 19263 + }, + { + "epoch": 1.797517962116264, + "grad_norm": 1.2056654413933636, + "learning_rate": 0.00024666192388154306, + "loss": 7.7674, + "step": 19264 + }, + { + "epoch": 1.7976112718111412, + "grad_norm": 1.15790195059911, + "learning_rate": 0.000246656139900749, + "loss": 7.483, + "step": 19265 + }, + { + "epoch": 1.7977045815060184, + "grad_norm": 2.100400122239767, + "learning_rate": 0.00024665035567418705, + "loss": 8.1268, + "step": 19266 + }, + { + "epoch": 1.7977978912008958, + "grad_norm": 1.1347428255200125, + "learning_rate": 0.00024664457120187185, + "loss": 7.293, + "step": 19267 + }, + { + "epoch": 1.797891200895773, + "grad_norm": 1.6417272323907357, + "learning_rate": 0.0002466387864838181, + "loss": 7.9525, + "step": 19268 + }, + { + "epoch": 1.7979845105906502, + "grad_norm": 1.2623968799994177e+27, + "learning_rate": 0.00024663300152004056, + "loss": 7.4772, + "step": 19269 + }, + { + "epoch": 1.7980778202855277, + "grad_norm": 1.2414503848953606, + "learning_rate": 0.0002466272163105539, + "loss": 7.1607, + "step": 19270 + }, + { + "epoch": 1.798171129980405, + "grad_norm": 1.0811780587291746, + "learning_rate": 0.0002466214308553729, + "loss": 7.449, + "step": 19271 + }, + { + "epoch": 1.7982644396752823, + "grad_norm": 1.223976931025057, + "learning_rate": 0.0002466156451545122, + "loss": 7.7489, + "step": 19272 + }, + { + "epoch": 1.7983577493701595, + "grad_norm": 1.324812410689537, + "learning_rate": 0.00024660985920798656, + "loss": 7.5613, + "step": 19273 + }, + { + "epoch": 1.7984510590650369, + "grad_norm": 1.564778582996272e+28, + "learning_rate": 0.0002466040730158106, + "loss": 7.7964, + "step": 19274 + }, + { + "epoch": 1.7985443687599143, + "grad_norm": 2.9439990356289102e+29, + "learning_rate": 0.0002465982865779992, + "loss": 7.2931, + "step": 19275 + }, + { + "epoch": 1.7986376784547915, + "grad_norm": 1.3811732638407335, + "learning_rate": 0.0002465924998945669, + "loss": 7.5851, + "step": 19276 + }, + { + "epoch": 1.7987309881496687, + "grad_norm": 1.36406541550489, + "learning_rate": 0.0002465867129655285, + "loss": 7.2077, + "step": 19277 + }, + { + "epoch": 1.7988242978445461, + "grad_norm": 1.2281851609837482, + "learning_rate": 0.0002465809257908987, + "loss": 7.6278, + "step": 19278 + }, + { + "epoch": 1.7989176075394233, + "grad_norm": 1.0974883675475882, + "learning_rate": 0.0002465751383706922, + "loss": 7.6009, + "step": 19279 + }, + { + "epoch": 1.7990109172343005, + "grad_norm": 2.654774280656783e+29, + "learning_rate": 0.00024656935070492374, + "loss": 7.1384, + "step": 19280 + }, + { + "epoch": 1.799104226929178, + "grad_norm": 1.3923080533112209, + "learning_rate": 0.000246563562793608, + "loss": 7.7703, + "step": 19281 + }, + { + "epoch": 1.7991975366240553, + "grad_norm": 1.3851727322516298, + "learning_rate": 0.0002465577746367598, + "loss": 7.7784, + "step": 19282 + }, + { + "epoch": 1.7992908463189325, + "grad_norm": 1.0375142242859408, + "learning_rate": 0.0002465519862343937, + "loss": 7.294, + "step": 19283 + }, + { + "epoch": 1.7993841560138097, + "grad_norm": 1.1587929113332711, + "learning_rate": 0.00024654619758652456, + "loss": 7.2097, + "step": 19284 + }, + { + "epoch": 1.7994774657086872, + "grad_norm": 3.219729823799131, + "learning_rate": 0.000246540408693167, + "loss": 7.304, + "step": 19285 + }, + { + "epoch": 1.7995707754035646, + "grad_norm": 3.101733834417969, + "learning_rate": 0.0002465346195543358, + "loss": 7.2674, + "step": 19286 + }, + { + "epoch": 1.7996640850984418, + "grad_norm": 1.9076539188355404, + "learning_rate": 0.00024652883017004564, + "loss": 7.4418, + "step": 19287 + }, + { + "epoch": 1.799757394793319, + "grad_norm": 3.635898990788345e+29, + "learning_rate": 0.00024652304054031125, + "loss": 7.3834, + "step": 19288 + }, + { + "epoch": 1.7998507044881964, + "grad_norm": 1.5879940876970708, + "learning_rate": 0.0002465172506651474, + "loss": 7.6013, + "step": 19289 + }, + { + "epoch": 1.7999440141830736, + "grad_norm": 1.3270752382413713, + "learning_rate": 0.0002465114605445688, + "loss": 7.6726, + "step": 19290 + }, + { + "epoch": 1.8000373238779508, + "grad_norm": 1.1094801108557542, + "learning_rate": 0.0002465056701785901, + "loss": 7.4122, + "step": 19291 + }, + { + "epoch": 1.8001306335728282, + "grad_norm": 2.989708915684183e+30, + "learning_rate": 0.00024649987956722603, + "loss": 7.0909, + "step": 19292 + }, + { + "epoch": 1.8002239432677056, + "grad_norm": 7.312023418110143, + "learning_rate": 0.0002464940887104914, + "loss": 7.3747, + "step": 19293 + }, + { + "epoch": 1.8003172529625828, + "grad_norm": 1.4812955911164716, + "learning_rate": 0.00024648829760840087, + "loss": 7.8884, + "step": 19294 + }, + { + "epoch": 1.80041056265746, + "grad_norm": 1.384436431278942, + "learning_rate": 0.00024648250626096914, + "loss": 7.3742, + "step": 19295 + }, + { + "epoch": 1.8005038723523374, + "grad_norm": 1.3450192369389742, + "learning_rate": 0.0002464767146682111, + "loss": 7.5412, + "step": 19296 + }, + { + "epoch": 1.8005971820472149, + "grad_norm": 5.320475930592794e+29, + "learning_rate": 0.00024647092283014123, + "loss": 7.1496, + "step": 19297 + }, + { + "epoch": 1.800690491742092, + "grad_norm": 1.1713004528047868, + "learning_rate": 0.00024646513074677443, + "loss": 7.4987, + "step": 19298 + }, + { + "epoch": 1.8007838014369693, + "grad_norm": 8.576249773110467e+26, + "learning_rate": 0.00024645933841812537, + "loss": 7.0319, + "step": 19299 + }, + { + "epoch": 1.8008771111318467, + "grad_norm": 1.3533460317303938e+28, + "learning_rate": 0.0002464535458442088, + "loss": 7.0444, + "step": 19300 + }, + { + "epoch": 1.8009704208267239, + "grad_norm": 1.3473972142705465e+27, + "learning_rate": 0.00024644775302503936, + "loss": 7.8429, + "step": 19301 + }, + { + "epoch": 1.801063730521601, + "grad_norm": 5.487342680040157, + "learning_rate": 0.00024644195996063195, + "loss": 7.5053, + "step": 19302 + }, + { + "epoch": 1.8011570402164785, + "grad_norm": 2.128165308682169, + "learning_rate": 0.0002464361666510012, + "loss": 7.8396, + "step": 19303 + }, + { + "epoch": 1.801250349911356, + "grad_norm": 1.6584680611728033, + "learning_rate": 0.0002464303730961618, + "loss": 7.8107, + "step": 19304 + }, + { + "epoch": 1.801343659606233, + "grad_norm": 1.6990044863750364e+29, + "learning_rate": 0.00024642457929612856, + "loss": 7.4917, + "step": 19305 + }, + { + "epoch": 1.8014369693011103, + "grad_norm": 4.474472529069353e+28, + "learning_rate": 0.00024641878525091614, + "loss": 7.6357, + "step": 19306 + }, + { + "epoch": 1.8015302789959877, + "grad_norm": 2.34838074268072, + "learning_rate": 0.0002464129909605393, + "loss": 7.226, + "step": 19307 + }, + { + "epoch": 1.8016235886908651, + "grad_norm": 1.5180750531717297, + "learning_rate": 0.00024640719642501284, + "loss": 7.3794, + "step": 19308 + }, + { + "epoch": 1.8017168983857421, + "grad_norm": 1.2400946315321828, + "learning_rate": 0.0002464014016443514, + "loss": 7.6514, + "step": 19309 + }, + { + "epoch": 1.8018102080806195, + "grad_norm": 1.4454093852875542e+28, + "learning_rate": 0.0002463956066185698, + "loss": 7.1668, + "step": 19310 + }, + { + "epoch": 1.801903517775497, + "grad_norm": 1.6995906341955078, + "learning_rate": 0.00024638981134768263, + "loss": 7.6779, + "step": 19311 + }, + { + "epoch": 1.8019968274703742, + "grad_norm": 1.4167543056527694, + "learning_rate": 0.0002463840158317048, + "loss": 7.706, + "step": 19312 + }, + { + "epoch": 1.8020901371652513, + "grad_norm": 1.0015117277204464, + "learning_rate": 0.000246378220070651, + "loss": 7.3634, + "step": 19313 + }, + { + "epoch": 1.8021834468601288, + "grad_norm": 1.1950122318358323, + "learning_rate": 0.00024637242406453584, + "loss": 7.3871, + "step": 19314 + }, + { + "epoch": 1.8022767565550062, + "grad_norm": 1.7218697399941907, + "learning_rate": 0.00024636662781337424, + "loss": 7.5188, + "step": 19315 + }, + { + "epoch": 1.8023700662498834, + "grad_norm": 2.4107959677221573, + "learning_rate": 0.0002463608313171808, + "loss": 7.6778, + "step": 19316 + }, + { + "epoch": 1.8024633759447606, + "grad_norm": 1.419332034808865, + "learning_rate": 0.00024635503457597026, + "loss": 7.2798, + "step": 19317 + }, + { + "epoch": 1.802556685639638, + "grad_norm": 1.444806935312906, + "learning_rate": 0.00024634923758975753, + "loss": 7.6573, + "step": 19318 + }, + { + "epoch": 1.8026499953345154, + "grad_norm": 1.2116633090051572, + "learning_rate": 0.0002463434403585571, + "loss": 7.5693, + "step": 19319 + }, + { + "epoch": 1.8027433050293924, + "grad_norm": 1.2243244740127839, + "learning_rate": 0.00024633764288238395, + "loss": 7.7573, + "step": 19320 + }, + { + "epoch": 1.8028366147242698, + "grad_norm": 1.3395939899629108, + "learning_rate": 0.0002463318451612527, + "loss": 7.7253, + "step": 19321 + }, + { + "epoch": 1.8029299244191472, + "grad_norm": 1.1028824407047846, + "learning_rate": 0.00024632604719517805, + "loss": 7.5565, + "step": 19322 + }, + { + "epoch": 1.8030232341140244, + "grad_norm": 1.2465662585782142, + "learning_rate": 0.0002463202489841748, + "loss": 7.7966, + "step": 19323 + }, + { + "epoch": 1.8031165438089016, + "grad_norm": 1.617226701539834e+28, + "learning_rate": 0.00024631445052825774, + "loss": 7.3458, + "step": 19324 + }, + { + "epoch": 1.803209853503779, + "grad_norm": 1.0718653626301307, + "learning_rate": 0.0002463086518274415, + "loss": 7.8661, + "step": 19325 + }, + { + "epoch": 1.8033031631986565, + "grad_norm": 1.0048972736761526, + "learning_rate": 0.0002463028528817409, + "loss": 7.5761, + "step": 19326 + }, + { + "epoch": 1.8033964728935337, + "grad_norm": 1.1450932106593752, + "learning_rate": 0.00024629705369117073, + "loss": 7.4363, + "step": 19327 + }, + { + "epoch": 1.8034897825884109, + "grad_norm": 1.1478572758414018, + "learning_rate": 0.0002462912542557456, + "loss": 7.2316, + "step": 19328 + }, + { + "epoch": 1.8035830922832883, + "grad_norm": 1.4209886949972774e+27, + "learning_rate": 0.0002462854545754804, + "loss": 7.4432, + "step": 19329 + }, + { + "epoch": 1.8036764019781655, + "grad_norm": 9.421812846060002e+27, + "learning_rate": 0.0002462796546503898, + "loss": 7.627, + "step": 19330 + }, + { + "epoch": 1.8037697116730427, + "grad_norm": 5.683298021702422e+28, + "learning_rate": 0.0002462738544804885, + "loss": 7.7522, + "step": 19331 + }, + { + "epoch": 1.80386302136792, + "grad_norm": 1.8247093329724475, + "learning_rate": 0.0002462680540657914, + "loss": 7.8996, + "step": 19332 + }, + { + "epoch": 1.8039563310627975, + "grad_norm": 1.3659019571551311e+29, + "learning_rate": 0.0002462622534063131, + "loss": 7.6632, + "step": 19333 + }, + { + "epoch": 1.8040496407576747, + "grad_norm": 1.1780361029080337, + "learning_rate": 0.0002462564525020684, + "loss": 7.3167, + "step": 19334 + }, + { + "epoch": 1.804142950452552, + "grad_norm": 1.1797257118273825, + "learning_rate": 0.00024625065135307206, + "loss": 7.3303, + "step": 19335 + }, + { + "epoch": 1.8042362601474293, + "grad_norm": 1.4259347864289753, + "learning_rate": 0.0002462448499593388, + "loss": 7.4796, + "step": 19336 + }, + { + "epoch": 1.8043295698423067, + "grad_norm": 1.2648066649485992e+30, + "learning_rate": 0.00024623904832088347, + "loss": 7.4832, + "step": 19337 + }, + { + "epoch": 1.804422879537184, + "grad_norm": 1.1451902901935738, + "learning_rate": 0.0002462332464377207, + "loss": 7.8146, + "step": 19338 + }, + { + "epoch": 1.8045161892320611, + "grad_norm": 1.2215343759975534, + "learning_rate": 0.0002462274443098653, + "loss": 7.3218, + "step": 19339 + }, + { + "epoch": 1.8046094989269386, + "grad_norm": 145.8156812648974, + "learning_rate": 0.000246221641937332, + "loss": 7.4752, + "step": 19340 + }, + { + "epoch": 1.8047028086218158, + "grad_norm": 1.8278929983434367, + "learning_rate": 0.0002462158393201355, + "loss": 8.038, + "step": 19341 + }, + { + "epoch": 1.804796118316693, + "grad_norm": 1.208045815000966, + "learning_rate": 0.0002462100364582908, + "loss": 7.2713, + "step": 19342 + }, + { + "epoch": 1.8048894280115704, + "grad_norm": 1.3511054598120602, + "learning_rate": 0.00024620423335181235, + "loss": 7.3178, + "step": 19343 + }, + { + "epoch": 1.8049827377064478, + "grad_norm": 2.1849037236966775, + "learning_rate": 0.00024619843000071504, + "loss": 7.5674, + "step": 19344 + }, + { + "epoch": 1.805076047401325, + "grad_norm": 1.4418572865345916, + "learning_rate": 0.00024619262640501364, + "loss": 7.6298, + "step": 19345 + }, + { + "epoch": 1.8051693570962022, + "grad_norm": 1.4058235198343239, + "learning_rate": 0.0002461868225647229, + "loss": 7.5249, + "step": 19346 + }, + { + "epoch": 1.8052626667910796, + "grad_norm": 1.087192452709216, + "learning_rate": 0.0002461810184798575, + "loss": 7.4853, + "step": 19347 + }, + { + "epoch": 1.805355976485957, + "grad_norm": 1.2786329548447575, + "learning_rate": 0.0002461752141504323, + "loss": 7.8636, + "step": 19348 + }, + { + "epoch": 1.8054492861808342, + "grad_norm": 1.0435855392278734, + "learning_rate": 0.00024616940957646207, + "loss": 7.5906, + "step": 19349 + }, + { + "epoch": 1.8055425958757114, + "grad_norm": 1.4417123255022917, + "learning_rate": 0.00024616360475796146, + "loss": 7.5948, + "step": 19350 + }, + { + "epoch": 1.8056359055705888, + "grad_norm": 2.5512647331981606, + "learning_rate": 0.0002461577996949453, + "loss": 7.5817, + "step": 19351 + }, + { + "epoch": 1.805729215265466, + "grad_norm": 1.4752934872303152, + "learning_rate": 0.0002461519943874283, + "loss": 7.6996, + "step": 19352 + }, + { + "epoch": 1.8058225249603432, + "grad_norm": 1.3333055884368898, + "learning_rate": 0.0002461461888354253, + "loss": 7.6108, + "step": 19353 + }, + { + "epoch": 1.8059158346552207, + "grad_norm": 3.4946656108333243e+28, + "learning_rate": 0.0002461403830389511, + "loss": 7.6131, + "step": 19354 + }, + { + "epoch": 1.806009144350098, + "grad_norm": 1.2346564863170058, + "learning_rate": 0.0002461345769980203, + "loss": 7.546, + "step": 19355 + }, + { + "epoch": 1.8061024540449753, + "grad_norm": 1.0388537803566584, + "learning_rate": 0.0002461287707126477, + "loss": 7.345, + "step": 19356 + }, + { + "epoch": 1.8061957637398525, + "grad_norm": 1.0104931649301097, + "learning_rate": 0.0002461229641828482, + "loss": 7.394, + "step": 19357 + }, + { + "epoch": 1.8062890734347299, + "grad_norm": 1.3764489188913214, + "learning_rate": 0.00024611715740863643, + "loss": 7.4682, + "step": 19358 + }, + { + "epoch": 1.8063823831296073, + "grad_norm": 3.479407992068411, + "learning_rate": 0.0002461113503900272, + "loss": 7.4744, + "step": 19359 + }, + { + "epoch": 1.8064756928244845, + "grad_norm": 0.9679393685703503, + "learning_rate": 0.0002461055431270353, + "loss": 7.5406, + "step": 19360 + }, + { + "epoch": 1.8065690025193617, + "grad_norm": 0.9353037139309366, + "learning_rate": 0.0002460997356196755, + "loss": 7.5689, + "step": 19361 + }, + { + "epoch": 1.8066623122142391, + "grad_norm": 1.1882153639852884, + "learning_rate": 0.0002460939278679625, + "loss": 7.9043, + "step": 19362 + }, + { + "epoch": 1.8067556219091163, + "grad_norm": 1.3625713624345623, + "learning_rate": 0.00024608811987191106, + "loss": 7.4531, + "step": 19363 + }, + { + "epoch": 1.8068489316039935, + "grad_norm": 6.694706641656487e+27, + "learning_rate": 0.00024608231163153607, + "loss": 7.3522, + "step": 19364 + }, + { + "epoch": 1.806942241298871, + "grad_norm": 1.1282665805176384, + "learning_rate": 0.0002460765031468522, + "loss": 7.5664, + "step": 19365 + }, + { + "epoch": 1.8070355509937484, + "grad_norm": 1.2725953978435562, + "learning_rate": 0.0002460706944178742, + "loss": 7.6307, + "step": 19366 + }, + { + "epoch": 1.8071288606886255, + "grad_norm": 1.1978282883644513, + "learning_rate": 0.000246064885444617, + "loss": 7.2025, + "step": 19367 + }, + { + "epoch": 1.8072221703835027, + "grad_norm": 1.6381538166924385, + "learning_rate": 0.00024605907622709514, + "loss": 7.5877, + "step": 19368 + }, + { + "epoch": 1.8073154800783802, + "grad_norm": 1.080764358621147, + "learning_rate": 0.0002460532667653235, + "loss": 7.3691, + "step": 19369 + }, + { + "epoch": 1.8074087897732576, + "grad_norm": 1.7149546770511934, + "learning_rate": 0.0002460474570593169, + "loss": 7.5819, + "step": 19370 + }, + { + "epoch": 1.8075020994681348, + "grad_norm": 1.1816890014603094, + "learning_rate": 0.00024604164710909007, + "loss": 7.4762, + "step": 19371 + }, + { + "epoch": 1.807595409163012, + "grad_norm": 1.5615350485189927, + "learning_rate": 0.00024603583691465777, + "loss": 7.72, + "step": 19372 + }, + { + "epoch": 1.8076887188578894, + "grad_norm": 6.539814551862903e+29, + "learning_rate": 0.0002460300264760348, + "loss": 7.4863, + "step": 19373 + }, + { + "epoch": 1.8077820285527666, + "grad_norm": 1.2765349543158544, + "learning_rate": 0.0002460242157932359, + "loss": 7.491, + "step": 19374 + }, + { + "epoch": 1.8078753382476438, + "grad_norm": 1.1857825778046094, + "learning_rate": 0.0002460184048662759, + "loss": 7.2952, + "step": 19375 + }, + { + "epoch": 1.8079686479425212, + "grad_norm": 1.413528051169489, + "learning_rate": 0.0002460125936951695, + "loss": 7.4548, + "step": 19376 + }, + { + "epoch": 1.8080619576373986, + "grad_norm": 1.229185676136717e+29, + "learning_rate": 0.00024600678227993154, + "loss": 7.6639, + "step": 19377 + }, + { + "epoch": 1.8081552673322758, + "grad_norm": 1.5300624045634328e+29, + "learning_rate": 0.0002460009706205767, + "loss": 7.3359, + "step": 19378 + }, + { + "epoch": 1.808248577027153, + "grad_norm": 1.4563528072165797e+28, + "learning_rate": 0.0002459951587171199, + "loss": 7.2026, + "step": 19379 + }, + { + "epoch": 1.8083418867220304, + "grad_norm": 1.602162498351922, + "learning_rate": 0.00024598934656957583, + "loss": 7.9157, + "step": 19380 + }, + { + "epoch": 1.8084351964169079, + "grad_norm": 1.3469235758399434, + "learning_rate": 0.00024598353417795926, + "loss": 7.5873, + "step": 19381 + }, + { + "epoch": 1.808528506111785, + "grad_norm": 1.712187800665376, + "learning_rate": 0.00024597772154228505, + "loss": 7.6159, + "step": 19382 + }, + { + "epoch": 1.8086218158066623, + "grad_norm": 8.18202320438251, + "learning_rate": 0.0002459719086625679, + "loss": 7.8826, + "step": 19383 + }, + { + "epoch": 1.8087151255015397, + "grad_norm": 1.1448424420564875e+28, + "learning_rate": 0.00024596609553882264, + "loss": 7.5024, + "step": 19384 + }, + { + "epoch": 1.8088084351964169, + "grad_norm": 1.3036767509452778, + "learning_rate": 0.000245960282171064, + "loss": 7.7165, + "step": 19385 + }, + { + "epoch": 1.808901744891294, + "grad_norm": 1.0877120452242035, + "learning_rate": 0.0002459544685593068, + "loss": 7.5158, + "step": 19386 + }, + { + "epoch": 1.8089950545861715, + "grad_norm": 8.451841344949111e+27, + "learning_rate": 0.0002459486547035658, + "loss": 7.5665, + "step": 19387 + }, + { + "epoch": 1.809088364281049, + "grad_norm": 1.0650848373230488, + "learning_rate": 0.00024594284060385575, + "loss": 7.4468, + "step": 19388 + }, + { + "epoch": 1.809181673975926, + "grad_norm": 1.364895800167917, + "learning_rate": 0.00024593702626019156, + "loss": 7.4597, + "step": 19389 + }, + { + "epoch": 1.8092749836708033, + "grad_norm": 0.9718594073101383, + "learning_rate": 0.0002459312116725879, + "loss": 7.5139, + "step": 19390 + }, + { + "epoch": 1.8093682933656807, + "grad_norm": 1.1459981701981434, + "learning_rate": 0.0002459253968410595, + "loss": 7.5726, + "step": 19391 + }, + { + "epoch": 1.8094616030605581, + "grad_norm": 1.3238706416898427, + "learning_rate": 0.00024591958176562134, + "loss": 7.4229, + "step": 19392 + }, + { + "epoch": 1.8095549127554353, + "grad_norm": 2.2487462936138747, + "learning_rate": 0.00024591376644628806, + "loss": 8.1125, + "step": 19393 + }, + { + "epoch": 1.8096482224503125, + "grad_norm": 1.4406921297770935e+27, + "learning_rate": 0.00024590795088307444, + "loss": 7.594, + "step": 19394 + }, + { + "epoch": 1.80974153214519, + "grad_norm": 3.569258330985541e+28, + "learning_rate": 0.0002459021350759954, + "loss": 7.4444, + "step": 19395 + }, + { + "epoch": 1.8098348418400672, + "grad_norm": 1.070600522033454, + "learning_rate": 0.0002458963190250656, + "loss": 7.5904, + "step": 19396 + }, + { + "epoch": 1.8099281515349444, + "grad_norm": 2.5533663906140602, + "learning_rate": 0.00024589050273029985, + "loss": 7.2315, + "step": 19397 + }, + { + "epoch": 1.8100214612298218, + "grad_norm": 1.9579402109726936, + "learning_rate": 0.00024588468619171297, + "loss": 7.2634, + "step": 19398 + }, + { + "epoch": 1.8101147709246992, + "grad_norm": 7.651493380358178e+27, + "learning_rate": 0.00024587886940931976, + "loss": 7.3902, + "step": 19399 + }, + { + "epoch": 1.8102080806195764, + "grad_norm": 7.740418323435717e+27, + "learning_rate": 0.00024587305238313496, + "loss": 7.3016, + "step": 19400 + }, + { + "epoch": 1.8103013903144536, + "grad_norm": 1.9454358714318343, + "learning_rate": 0.0002458672351131734, + "loss": 7.6367, + "step": 19401 + }, + { + "epoch": 1.810394700009331, + "grad_norm": 16.583233869462518, + "learning_rate": 0.00024586141759944986, + "loss": 7.5792, + "step": 19402 + }, + { + "epoch": 1.8104880097042084, + "grad_norm": 1.3688714378962556, + "learning_rate": 0.0002458555998419791, + "loss": 7.7898, + "step": 19403 + }, + { + "epoch": 1.8105813193990856, + "grad_norm": 1.0934281146576246, + "learning_rate": 0.00024584978184077596, + "loss": 7.198, + "step": 19404 + }, + { + "epoch": 1.8106746290939628, + "grad_norm": 1.2861692588343088, + "learning_rate": 0.0002458439635958552, + "loss": 7.7494, + "step": 19405 + }, + { + "epoch": 1.8107679387888402, + "grad_norm": 1.431527348311823, + "learning_rate": 0.0002458381451072317, + "loss": 7.4063, + "step": 19406 + }, + { + "epoch": 1.8108612484837174, + "grad_norm": 7.306201855186834e+27, + "learning_rate": 0.0002458323263749201, + "loss": 7.3038, + "step": 19407 + }, + { + "epoch": 1.8109545581785946, + "grad_norm": 1.116047336925069, + "learning_rate": 0.00024582650739893535, + "loss": 7.4191, + "step": 19408 + }, + { + "epoch": 1.811047867873472, + "grad_norm": 4.058397740841262e+28, + "learning_rate": 0.00024582068817929214, + "loss": 7.5762, + "step": 19409 + }, + { + "epoch": 1.8111411775683495, + "grad_norm": 1.2464868519330317, + "learning_rate": 0.0002458148687160053, + "loss": 7.6472, + "step": 19410 + }, + { + "epoch": 1.8112344872632267, + "grad_norm": 1.7011344565027917, + "learning_rate": 0.0002458090490090897, + "loss": 7.8183, + "step": 19411 + }, + { + "epoch": 1.8113277969581039, + "grad_norm": 8.11083726588095e+29, + "learning_rate": 0.00024580322905855994, + "loss": 7.6498, + "step": 19412 + }, + { + "epoch": 1.8114211066529813, + "grad_norm": 1.2698646857314704, + "learning_rate": 0.00024579740886443105, + "loss": 7.7708, + "step": 19413 + }, + { + "epoch": 1.8115144163478587, + "grad_norm": 1.3901728490014007, + "learning_rate": 0.0002457915884267177, + "loss": 7.3245, + "step": 19414 + }, + { + "epoch": 1.8116077260427357, + "grad_norm": 1.3176946728245773, + "learning_rate": 0.00024578576774543476, + "loss": 7.4507, + "step": 19415 + }, + { + "epoch": 1.811701035737613, + "grad_norm": 1.2138087014418637, + "learning_rate": 0.00024577994682059687, + "loss": 7.4725, + "step": 19416 + }, + { + "epoch": 1.8117943454324905, + "grad_norm": 1.5491699778868933, + "learning_rate": 0.000245774125652219, + "loss": 7.1658, + "step": 19417 + }, + { + "epoch": 1.8118876551273677, + "grad_norm": 1.2237273660458146, + "learning_rate": 0.00024576830424031593, + "loss": 7.4836, + "step": 19418 + }, + { + "epoch": 1.811980964822245, + "grad_norm": 1.2298354400105962, + "learning_rate": 0.00024576248258490244, + "loss": 7.3325, + "step": 19419 + }, + { + "epoch": 1.8120742745171223, + "grad_norm": 2.4145833274067234, + "learning_rate": 0.0002457566606859933, + "loss": 7.5523, + "step": 19420 + }, + { + "epoch": 1.8121675842119997, + "grad_norm": 4.530159915707522e+28, + "learning_rate": 0.00024575083854360335, + "loss": 7.4557, + "step": 19421 + }, + { + "epoch": 1.812260893906877, + "grad_norm": 1.577126408617131, + "learning_rate": 0.00024574501615774735, + "loss": 7.5989, + "step": 19422 + }, + { + "epoch": 1.8123542036017541, + "grad_norm": 1.3052092679992005, + "learning_rate": 0.00024573919352844015, + "loss": 7.5773, + "step": 19423 + }, + { + "epoch": 1.8124475132966316, + "grad_norm": 1.5366552066079282, + "learning_rate": 0.0002457333706556966, + "loss": 7.2321, + "step": 19424 + }, + { + "epoch": 1.812540822991509, + "grad_norm": 1.324535890490403, + "learning_rate": 0.0002457275475395314, + "loss": 7.2198, + "step": 19425 + }, + { + "epoch": 1.812634132686386, + "grad_norm": 1.5108488803599063, + "learning_rate": 0.00024572172417995934, + "loss": 7.6094, + "step": 19426 + }, + { + "epoch": 1.8127274423812634, + "grad_norm": 1.0975730934001235, + "learning_rate": 0.0002457159005769954, + "loss": 7.4054, + "step": 19427 + }, + { + "epoch": 1.8128207520761408, + "grad_norm": 1.357981698785597, + "learning_rate": 0.00024571007673065417, + "loss": 7.4457, + "step": 19428 + }, + { + "epoch": 1.812914061771018, + "grad_norm": 1.734541895544104, + "learning_rate": 0.00024570425264095066, + "loss": 7.8529, + "step": 19429 + }, + { + "epoch": 1.8130073714658952, + "grad_norm": 1.1366977330894759, + "learning_rate": 0.00024569842830789954, + "loss": 7.5788, + "step": 19430 + }, + { + "epoch": 1.8131006811607726, + "grad_norm": 1.754925430148025, + "learning_rate": 0.00024569260373151567, + "loss": 7.3104, + "step": 19431 + }, + { + "epoch": 1.81319399085565, + "grad_norm": 1.3208794575566192, + "learning_rate": 0.00024568677891181385, + "loss": 7.4666, + "step": 19432 + }, + { + "epoch": 1.8132873005505272, + "grad_norm": 1.1747221430411492, + "learning_rate": 0.0002456809538488089, + "loss": 7.5761, + "step": 19433 + }, + { + "epoch": 1.8133806102454044, + "grad_norm": 1.222645345966021, + "learning_rate": 0.00024567512854251566, + "loss": 7.1517, + "step": 19434 + }, + { + "epoch": 1.8134739199402818, + "grad_norm": 2.2757967787189566, + "learning_rate": 0.00024566930299294883, + "loss": 7.7925, + "step": 19435 + }, + { + "epoch": 1.813567229635159, + "grad_norm": 1.1961922500739188, + "learning_rate": 0.00024566347720012335, + "loss": 7.4077, + "step": 19436 + }, + { + "epoch": 1.8136605393300362, + "grad_norm": 1.2922782909576864, + "learning_rate": 0.00024565765116405403, + "loss": 7.5382, + "step": 19437 + }, + { + "epoch": 1.8137538490249137, + "grad_norm": 1.2251039744531838, + "learning_rate": 0.00024565182488475555, + "loss": 7.7283, + "step": 19438 + }, + { + "epoch": 1.813847158719791, + "grad_norm": 1.587092716317918, + "learning_rate": 0.0002456459983622429, + "loss": 7.2578, + "step": 19439 + }, + { + "epoch": 1.8139404684146683, + "grad_norm": 1.1615967984432993, + "learning_rate": 0.00024564017159653075, + "loss": 7.5445, + "step": 19440 + }, + { + "epoch": 1.8140337781095455, + "grad_norm": 1.0359996894798118, + "learning_rate": 0.000245634344587634, + "loss": 7.5113, + "step": 19441 + }, + { + "epoch": 1.8141270878044229, + "grad_norm": 1.1518437511819604, + "learning_rate": 0.0002456285173355674, + "loss": 7.7372, + "step": 19442 + }, + { + "epoch": 1.8142203974993003, + "grad_norm": 1.5664971786780666, + "learning_rate": 0.00024562268984034584, + "loss": 7.6866, + "step": 19443 + }, + { + "epoch": 1.8143137071941775, + "grad_norm": 1.0599251419666842, + "learning_rate": 0.00024561686210198413, + "loss": 7.6292, + "step": 19444 + }, + { + "epoch": 1.8144070168890547, + "grad_norm": 2.2402374583409834, + "learning_rate": 0.000245611034120497, + "loss": 7.4127, + "step": 19445 + }, + { + "epoch": 1.8145003265839321, + "grad_norm": 1.3473516643767942, + "learning_rate": 0.00024560520589589943, + "loss": 7.5495, + "step": 19446 + }, + { + "epoch": 1.8145936362788093, + "grad_norm": 2.5074454783845996, + "learning_rate": 0.00024559937742820603, + "loss": 7.5099, + "step": 19447 + }, + { + "epoch": 1.8146869459736865, + "grad_norm": 1.8175530196522514e+28, + "learning_rate": 0.0002455935487174318, + "loss": 7.7231, + "step": 19448 + }, + { + "epoch": 1.814780255668564, + "grad_norm": 6.8143323951828e+29, + "learning_rate": 0.00024558771976359147, + "loss": 7.5432, + "step": 19449 + }, + { + "epoch": 1.8148735653634414, + "grad_norm": 18.600348276838677, + "learning_rate": 0.0002455818905666999, + "loss": 7.5454, + "step": 19450 + }, + { + "epoch": 1.8149668750583186, + "grad_norm": 5.3220605858246975e+29, + "learning_rate": 0.00024557606112677184, + "loss": 7.7201, + "step": 19451 + }, + { + "epoch": 1.8150601847531957, + "grad_norm": 1.932751630713463, + "learning_rate": 0.0002455702314438223, + "loss": 7.8714, + "step": 19452 + }, + { + "epoch": 1.8151534944480732, + "grad_norm": 1.1210852290830928, + "learning_rate": 0.0002455644015178658, + "loss": 7.5521, + "step": 19453 + }, + { + "epoch": 1.8152468041429506, + "grad_norm": 1.3832172717005902, + "learning_rate": 0.0002455585713489175, + "loss": 7.579, + "step": 19454 + }, + { + "epoch": 1.8153401138378278, + "grad_norm": 2.195840934112683, + "learning_rate": 0.00024555274093699195, + "loss": 7.3705, + "step": 19455 + }, + { + "epoch": 1.815433423532705, + "grad_norm": 1.4032733608657828, + "learning_rate": 0.0002455469102821041, + "loss": 7.5724, + "step": 19456 + }, + { + "epoch": 1.8155267332275824, + "grad_norm": 1.5135702137476195, + "learning_rate": 0.0002455410793842688, + "loss": 7.5328, + "step": 19457 + }, + { + "epoch": 1.8156200429224596, + "grad_norm": 1.0389896956878457, + "learning_rate": 0.00024553524824350075, + "loss": 7.5029, + "step": 19458 + }, + { + "epoch": 1.8157133526173368, + "grad_norm": 1.118291385495628, + "learning_rate": 0.000245529416859815, + "loss": 7.6951, + "step": 19459 + }, + { + "epoch": 1.8158066623122142, + "grad_norm": 1.0982524008299079e+29, + "learning_rate": 0.0002455235852332261, + "loss": 7.5475, + "step": 19460 + }, + { + "epoch": 1.8158999720070916, + "grad_norm": 1.6237331687835521, + "learning_rate": 0.0002455177533637491, + "loss": 7.6478, + "step": 19461 + }, + { + "epoch": 1.8159932817019688, + "grad_norm": 1.0605739861471406, + "learning_rate": 0.00024551192125139875, + "loss": 7.5711, + "step": 19462 + }, + { + "epoch": 1.816086591396846, + "grad_norm": 1.836329433339159, + "learning_rate": 0.0002455060888961898, + "loss": 7.8017, + "step": 19463 + }, + { + "epoch": 1.8161799010917234, + "grad_norm": 1.0582691710566423, + "learning_rate": 0.0002455002562981373, + "loss": 7.2888, + "step": 19464 + }, + { + "epoch": 1.8162732107866009, + "grad_norm": 1.2806067649472843, + "learning_rate": 0.0002454944234572558, + "loss": 7.7499, + "step": 19465 + }, + { + "epoch": 1.816366520481478, + "grad_norm": 1.2216080693931308, + "learning_rate": 0.00024548859037356035, + "loss": 7.515, + "step": 19466 + }, + { + "epoch": 1.8164598301763553, + "grad_norm": 1.363783063754581, + "learning_rate": 0.00024548275704706565, + "loss": 7.4699, + "step": 19467 + }, + { + "epoch": 1.8165531398712327, + "grad_norm": 1.5281964482727972, + "learning_rate": 0.00024547692347778665, + "loss": 7.3685, + "step": 19468 + }, + { + "epoch": 1.8166464495661099, + "grad_norm": 1.1136505168208843, + "learning_rate": 0.00024547108966573803, + "loss": 7.8939, + "step": 19469 + }, + { + "epoch": 1.816739759260987, + "grad_norm": 1.4065320226845575, + "learning_rate": 0.00024546525561093476, + "loss": 7.4214, + "step": 19470 + }, + { + "epoch": 1.8168330689558645, + "grad_norm": 1.3361812236303237, + "learning_rate": 0.0002454594213133916, + "loss": 7.3855, + "step": 19471 + }, + { + "epoch": 1.816926378650742, + "grad_norm": 1.7852270638611445, + "learning_rate": 0.00024545358677312344, + "loss": 7.2301, + "step": 19472 + }, + { + "epoch": 1.817019688345619, + "grad_norm": 1.0985995349103292, + "learning_rate": 0.00024544775199014505, + "loss": 7.5173, + "step": 19473 + }, + { + "epoch": 1.8171129980404963, + "grad_norm": 1.9704053691417625, + "learning_rate": 0.00024544191696447136, + "loss": 7.7864, + "step": 19474 + }, + { + "epoch": 1.8172063077353737, + "grad_norm": 1.3816062513361227, + "learning_rate": 0.00024543608169611713, + "loss": 7.6511, + "step": 19475 + }, + { + "epoch": 1.8172996174302511, + "grad_norm": 3.942735892383477e+29, + "learning_rate": 0.00024543024618509714, + "loss": 7.4526, + "step": 19476 + }, + { + "epoch": 1.8173929271251283, + "grad_norm": 1.1077456848326357, + "learning_rate": 0.0002454244104314263, + "loss": 7.671, + "step": 19477 + }, + { + "epoch": 1.8174862368200055, + "grad_norm": 1.208827412626917, + "learning_rate": 0.0002454185744351196, + "loss": 7.3282, + "step": 19478 + }, + { + "epoch": 1.817579546514883, + "grad_norm": 1.0399956542889493e+29, + "learning_rate": 0.0002454127381961916, + "loss": 7.466, + "step": 19479 + }, + { + "epoch": 1.8176728562097602, + "grad_norm": 1.3543559731814634, + "learning_rate": 0.0002454069017146573, + "loss": 7.5619, + "step": 19480 + }, + { + "epoch": 1.8177661659046374, + "grad_norm": 1.383421045968113e+29, + "learning_rate": 0.00024540106499053153, + "loss": 7.6844, + "step": 19481 + }, + { + "epoch": 1.8178594755995148, + "grad_norm": 1.7830441956281746, + "learning_rate": 0.00024539522802382907, + "loss": 7.1752, + "step": 19482 + }, + { + "epoch": 1.8179527852943922, + "grad_norm": 1.9942442365218136, + "learning_rate": 0.00024538939081456484, + "loss": 7.1425, + "step": 19483 + }, + { + "epoch": 1.8180460949892694, + "grad_norm": 1.2208320402945105, + "learning_rate": 0.00024538355336275363, + "loss": 7.5875, + "step": 19484 + }, + { + "epoch": 1.8181394046841466, + "grad_norm": 4.000640369780133e+28, + "learning_rate": 0.0002453777156684103, + "loss": 7.4262, + "step": 19485 + }, + { + "epoch": 1.818232714379024, + "grad_norm": 2.206102199449117, + "learning_rate": 0.0002453718777315497, + "loss": 7.6285, + "step": 19486 + }, + { + "epoch": 1.8183260240739014, + "grad_norm": 2.325546325890558, + "learning_rate": 0.0002453660395521867, + "loss": 7.4362, + "step": 19487 + }, + { + "epoch": 1.8184193337687786, + "grad_norm": 2.715177702502797e+27, + "learning_rate": 0.00024536020113033604, + "loss": 7.3896, + "step": 19488 + }, + { + "epoch": 1.8185126434636558, + "grad_norm": 9.719548843904782e+28, + "learning_rate": 0.00024535436246601264, + "loss": 8.0711, + "step": 19489 + }, + { + "epoch": 1.8186059531585332, + "grad_norm": 1.0684715731135113, + "learning_rate": 0.00024534852355923145, + "loss": 7.4126, + "step": 19490 + }, + { + "epoch": 1.8186992628534104, + "grad_norm": 1.1864662244472646, + "learning_rate": 0.0002453426844100071, + "loss": 7.4169, + "step": 19491 + }, + { + "epoch": 1.8187925725482876, + "grad_norm": 1.6478936392325485, + "learning_rate": 0.0002453368450183545, + "loss": 7.8144, + "step": 19492 + }, + { + "epoch": 1.818885882243165, + "grad_norm": 1.333579364560579, + "learning_rate": 0.0002453310053842887, + "loss": 7.4072, + "step": 19493 + }, + { + "epoch": 1.8189791919380425, + "grad_norm": 2.206024891123169, + "learning_rate": 0.0002453251655078243, + "loss": 7.465, + "step": 19494 + }, + { + "epoch": 1.8190725016329197, + "grad_norm": 1.9165274348456758, + "learning_rate": 0.0002453193253889762, + "loss": 7.5221, + "step": 19495 + }, + { + "epoch": 1.8191658113277969, + "grad_norm": 1.3225024081375425, + "learning_rate": 0.00024531348502775937, + "loss": 7.3327, + "step": 19496 + }, + { + "epoch": 1.8192591210226743, + "grad_norm": 1.3538335455845847, + "learning_rate": 0.0002453076444241886, + "loss": 7.8113, + "step": 19497 + }, + { + "epoch": 1.8193524307175517, + "grad_norm": 1.5486714641023887, + "learning_rate": 0.00024530180357827864, + "loss": 7.7609, + "step": 19498 + }, + { + "epoch": 1.819445740412429, + "grad_norm": 1.1840838030177223, + "learning_rate": 0.00024529596249004445, + "loss": 7.687, + "step": 19499 + }, + { + "epoch": 1.819539050107306, + "grad_norm": 1.3432027040436318, + "learning_rate": 0.0002452901211595009, + "loss": 7.2387, + "step": 19500 + }, + { + "epoch": 1.8196323598021835, + "grad_norm": 1.4773391448457782, + "learning_rate": 0.00024528427958666276, + "loss": 7.6616, + "step": 19501 + }, + { + "epoch": 1.8197256694970607, + "grad_norm": 1.1245357797504545, + "learning_rate": 0.00024527843777154496, + "loss": 7.4586, + "step": 19502 + }, + { + "epoch": 1.819818979191938, + "grad_norm": 1.1619736443331652, + "learning_rate": 0.0002452725957141623, + "loss": 7.6253, + "step": 19503 + }, + { + "epoch": 1.8199122888868153, + "grad_norm": 1.0945534933740215, + "learning_rate": 0.0002452667534145296, + "loss": 7.7465, + "step": 19504 + }, + { + "epoch": 1.8200055985816928, + "grad_norm": 1.138493512686159, + "learning_rate": 0.00024526091087266177, + "loss": 7.4329, + "step": 19505 + }, + { + "epoch": 1.82009890827657, + "grad_norm": 213.18559213645955, + "learning_rate": 0.0002452550680885738, + "loss": 7.4946, + "step": 19506 + }, + { + "epoch": 1.8201922179714471, + "grad_norm": 1.4334763700632107e+29, + "learning_rate": 0.00024524922506228024, + "loss": 7.7271, + "step": 19507 + }, + { + "epoch": 1.8202855276663246, + "grad_norm": 1.641091650813525, + "learning_rate": 0.00024524338179379615, + "loss": 7.8607, + "step": 19508 + }, + { + "epoch": 1.820378837361202, + "grad_norm": 1.092418429758597, + "learning_rate": 0.0002452375382831364, + "loss": 7.6294, + "step": 19509 + }, + { + "epoch": 1.8204721470560792, + "grad_norm": 1.447659277559818e+30, + "learning_rate": 0.0002452316945303158, + "loss": 7.2658, + "step": 19510 + }, + { + "epoch": 1.8205654567509564, + "grad_norm": 5.710938789939517e+28, + "learning_rate": 0.0002452258505353492, + "loss": 7.5246, + "step": 19511 + }, + { + "epoch": 1.8206587664458338, + "grad_norm": 1.059531586501253, + "learning_rate": 0.00024522000629825146, + "loss": 7.6619, + "step": 19512 + }, + { + "epoch": 1.820752076140711, + "grad_norm": 1.4987171846529568, + "learning_rate": 0.0002452141618190374, + "loss": 7.4789, + "step": 19513 + }, + { + "epoch": 1.8208453858355882, + "grad_norm": 1.218371264559813, + "learning_rate": 0.000245208317097722, + "loss": 7.3782, + "step": 19514 + }, + { + "epoch": 1.8209386955304656, + "grad_norm": 1.0580365014339666e+30, + "learning_rate": 0.00024520247213432006, + "loss": 7.2976, + "step": 19515 + }, + { + "epoch": 1.821032005225343, + "grad_norm": 1.2085553227688586, + "learning_rate": 0.00024519662692884644, + "loss": 7.5575, + "step": 19516 + }, + { + "epoch": 1.8211253149202202, + "grad_norm": 1.3273139373080347, + "learning_rate": 0.00024519078148131597, + "loss": 7.519, + "step": 19517 + }, + { + "epoch": 1.8212186246150974, + "grad_norm": 1.319971803179766, + "learning_rate": 0.00024518493579174354, + "loss": 7.4547, + "step": 19518 + }, + { + "epoch": 1.8213119343099748, + "grad_norm": 1.3697242655581434, + "learning_rate": 0.000245179089860144, + "loss": 7.2578, + "step": 19519 + }, + { + "epoch": 1.8214052440048523, + "grad_norm": 1.2870657919727294e+28, + "learning_rate": 0.0002451732436865323, + "loss": 7.8993, + "step": 19520 + }, + { + "epoch": 1.8214985536997292, + "grad_norm": 6.215103502379399e+28, + "learning_rate": 0.0002451673972709231, + "loss": 7.5646, + "step": 19521 + }, + { + "epoch": 1.8215918633946067, + "grad_norm": 1.0358400843660855, + "learning_rate": 0.0002451615506133315, + "loss": 7.4799, + "step": 19522 + }, + { + "epoch": 1.821685173089484, + "grad_norm": 1.677386835885518, + "learning_rate": 0.00024515570371377225, + "loss": 7.4486, + "step": 19523 + }, + { + "epoch": 1.8217784827843613, + "grad_norm": 2.2430949816931824, + "learning_rate": 0.00024514985657226023, + "loss": 7.3737, + "step": 19524 + }, + { + "epoch": 1.8218717924792385, + "grad_norm": 1.633067558721352, + "learning_rate": 0.0002451440091888103, + "loss": 7.4796, + "step": 19525 + }, + { + "epoch": 1.821965102174116, + "grad_norm": 1.3719397693076223, + "learning_rate": 0.0002451381615634373, + "loss": 7.3827, + "step": 19526 + }, + { + "epoch": 1.8220584118689933, + "grad_norm": 1.6554609550552442, + "learning_rate": 0.00024513231369615625, + "loss": 7.8299, + "step": 19527 + }, + { + "epoch": 1.8221517215638705, + "grad_norm": 1.2616634466153314, + "learning_rate": 0.0002451264655869818, + "loss": 7.5065, + "step": 19528 + }, + { + "epoch": 1.8222450312587477, + "grad_norm": 1.1783083316553844, + "learning_rate": 0.0002451206172359289, + "loss": 7.565, + "step": 19529 + }, + { + "epoch": 1.8223383409536251, + "grad_norm": 1.0981189774646727, + "learning_rate": 0.00024511476864301257, + "loss": 7.5591, + "step": 19530 + }, + { + "epoch": 1.8224316506485025, + "grad_norm": 1.3388095932885595, + "learning_rate": 0.00024510891980824754, + "loss": 7.8629, + "step": 19531 + }, + { + "epoch": 1.8225249603433795, + "grad_norm": 4.717345403843925, + "learning_rate": 0.0002451030707316486, + "loss": 7.5698, + "step": 19532 + }, + { + "epoch": 1.822618270038257, + "grad_norm": 1.0057640733916633, + "learning_rate": 0.00024509722141323085, + "loss": 7.4505, + "step": 19533 + }, + { + "epoch": 1.8227115797331344, + "grad_norm": 1.0472702470760686, + "learning_rate": 0.0002450913718530089, + "loss": 7.5234, + "step": 19534 + }, + { + "epoch": 1.8228048894280116, + "grad_norm": 3.6390362293416745e+27, + "learning_rate": 0.00024508552205099784, + "loss": 7.6983, + "step": 19535 + }, + { + "epoch": 1.8228981991228888, + "grad_norm": 1.478078643942912, + "learning_rate": 0.00024507967200721245, + "loss": 7.1794, + "step": 19536 + }, + { + "epoch": 1.8229915088177662, + "grad_norm": 1.0315068132271287e+29, + "learning_rate": 0.00024507382172166766, + "loss": 7.436, + "step": 19537 + }, + { + "epoch": 1.8230848185126436, + "grad_norm": 1.2035735339493991, + "learning_rate": 0.00024506797119437825, + "loss": 7.5564, + "step": 19538 + }, + { + "epoch": 1.8231781282075208, + "grad_norm": 1.091100317744135, + "learning_rate": 0.0002450621204253592, + "loss": 7.3422, + "step": 19539 + }, + { + "epoch": 1.823271437902398, + "grad_norm": 1.2540689257755568, + "learning_rate": 0.00024505626941462527, + "loss": 7.589, + "step": 19540 + }, + { + "epoch": 1.8233647475972754, + "grad_norm": 3.6010596132423234, + "learning_rate": 0.0002450504181621914, + "loss": 7.4702, + "step": 19541 + }, + { + "epoch": 1.8234580572921526, + "grad_norm": 8.541370540399729e+28, + "learning_rate": 0.0002450445666680725, + "loss": 7.5497, + "step": 19542 + }, + { + "epoch": 1.8235513669870298, + "grad_norm": 6.023932805403601e+27, + "learning_rate": 0.00024503871493228345, + "loss": 7.3267, + "step": 19543 + }, + { + "epoch": 1.8236446766819072, + "grad_norm": 1.0605435180037293, + "learning_rate": 0.0002450328629548391, + "loss": 7.4676, + "step": 19544 + }, + { + "epoch": 1.8237379863767846, + "grad_norm": 1.3712509938983766, + "learning_rate": 0.0002450270107357543, + "loss": 7.7845, + "step": 19545 + }, + { + "epoch": 1.8238312960716618, + "grad_norm": 1.0782916982799533, + "learning_rate": 0.000245021158275044, + "loss": 7.4643, + "step": 19546 + }, + { + "epoch": 1.823924605766539, + "grad_norm": 1.164177616472607, + "learning_rate": 0.000245015305572723, + "loss": 7.4957, + "step": 19547 + }, + { + "epoch": 1.8240179154614165, + "grad_norm": 2.0423430070753286, + "learning_rate": 0.0002450094526288062, + "loss": 7.1277, + "step": 19548 + }, + { + "epoch": 1.8241112251562939, + "grad_norm": 4.03171491654582, + "learning_rate": 0.00024500359944330856, + "loss": 7.43, + "step": 19549 + }, + { + "epoch": 1.824204534851171, + "grad_norm": 2.8360410838014444e+26, + "learning_rate": 0.0002449977460162449, + "loss": 7.3733, + "step": 19550 + }, + { + "epoch": 1.8242978445460483, + "grad_norm": 1.8773799943402494, + "learning_rate": 0.0002449918923476301, + "loss": 7.628, + "step": 19551 + }, + { + "epoch": 1.8243911542409257, + "grad_norm": 1.3391437863210693, + "learning_rate": 0.0002449860384374791, + "loss": 7.4141, + "step": 19552 + }, + { + "epoch": 1.8244844639358029, + "grad_norm": 1.3112640973755505, + "learning_rate": 0.00024498018428580667, + "loss": 7.2136, + "step": 19553 + }, + { + "epoch": 1.82457777363068, + "grad_norm": 1.2332555004210415, + "learning_rate": 0.0002449743298926278, + "loss": 7.714, + "step": 19554 + }, + { + "epoch": 1.8246710833255575, + "grad_norm": 1.6208720174031483, + "learning_rate": 0.00024496847525795737, + "loss": 7.5754, + "step": 19555 + }, + { + "epoch": 1.824764393020435, + "grad_norm": 1.9128186795360942, + "learning_rate": 0.0002449626203818102, + "loss": 7.4016, + "step": 19556 + }, + { + "epoch": 1.8248577027153121, + "grad_norm": 5.635231435583367e+27, + "learning_rate": 0.00024495676526420124, + "loss": 7.1384, + "step": 19557 + }, + { + "epoch": 1.8249510124101893, + "grad_norm": 1.078798665493993, + "learning_rate": 0.00024495090990514533, + "loss": 7.6691, + "step": 19558 + }, + { + "epoch": 1.8250443221050667, + "grad_norm": 10.028315552153442, + "learning_rate": 0.0002449450543046574, + "loss": 7.7979, + "step": 19559 + }, + { + "epoch": 1.8251376317999441, + "grad_norm": 6.919213218008662e+26, + "learning_rate": 0.00024493919846275226, + "loss": 7.8391, + "step": 19560 + }, + { + "epoch": 1.8252309414948213, + "grad_norm": 1.2655086733362395, + "learning_rate": 0.00024493334237944497, + "loss": 7.4064, + "step": 19561 + }, + { + "epoch": 1.8253242511896985, + "grad_norm": 1.143895937785576, + "learning_rate": 0.00024492748605475025, + "loss": 7.4945, + "step": 19562 + }, + { + "epoch": 1.825417560884576, + "grad_norm": 1.1328629595004358, + "learning_rate": 0.00024492162948868307, + "loss": 7.5831, + "step": 19563 + }, + { + "epoch": 1.8255108705794532, + "grad_norm": 1.1745452932872307, + "learning_rate": 0.0002449157726812583, + "loss": 7.6944, + "step": 19564 + }, + { + "epoch": 1.8256041802743304, + "grad_norm": 1.2768443624416346, + "learning_rate": 0.0002449099156324908, + "loss": 7.4106, + "step": 19565 + }, + { + "epoch": 1.8256974899692078, + "grad_norm": 3.8047215015557576, + "learning_rate": 0.00024490405834239553, + "loss": 7.4678, + "step": 19566 + }, + { + "epoch": 1.8257907996640852, + "grad_norm": 4.6877217843827595e+26, + "learning_rate": 0.0002448982008109873, + "loss": 7.5166, + "step": 19567 + }, + { + "epoch": 1.8258841093589624, + "grad_norm": 1.5516310214314755, + "learning_rate": 0.0002448923430382812, + "loss": 7.7883, + "step": 19568 + }, + { + "epoch": 1.8259774190538396, + "grad_norm": 1.1805468043210208, + "learning_rate": 0.0002448864850242919, + "loss": 7.563, + "step": 19569 + }, + { + "epoch": 1.826070728748717, + "grad_norm": 2.9966869241866768e+28, + "learning_rate": 0.00024488062676903435, + "loss": 7.5118, + "step": 19570 + }, + { + "epoch": 1.8261640384435944, + "grad_norm": 1.162672952598183, + "learning_rate": 0.00024487476827252347, + "loss": 7.8481, + "step": 19571 + }, + { + "epoch": 1.8262573481384716, + "grad_norm": 1.6049790085102889, + "learning_rate": 0.0002448689095347742, + "loss": 7.1833, + "step": 19572 + }, + { + "epoch": 1.8263506578333488, + "grad_norm": 9.08674323075494e+27, + "learning_rate": 0.00024486305055580136, + "loss": 7.5578, + "step": 19573 + }, + { + "epoch": 1.8264439675282262, + "grad_norm": 1.632923816766603, + "learning_rate": 0.0002448571913356199, + "loss": 7.603, + "step": 19574 + }, + { + "epoch": 1.8265372772231034, + "grad_norm": 1.9146278685418863, + "learning_rate": 0.0002448513318742447, + "loss": 8.1558, + "step": 19575 + }, + { + "epoch": 1.8266305869179806, + "grad_norm": 1.1543296826554195, + "learning_rate": 0.0002448454721716906, + "loss": 7.3454, + "step": 19576 + }, + { + "epoch": 1.826723896612858, + "grad_norm": 1.0942081562884065, + "learning_rate": 0.00024483961222797264, + "loss": 7.5229, + "step": 19577 + }, + { + "epoch": 1.8268172063077355, + "grad_norm": 1.3145659506128873, + "learning_rate": 0.0002448337520431056, + "loss": 7.697, + "step": 19578 + }, + { + "epoch": 1.8269105160026127, + "grad_norm": 1.1772074300147053, + "learning_rate": 0.00024482789161710445, + "loss": 7.5694, + "step": 19579 + }, + { + "epoch": 1.8270038256974899, + "grad_norm": 8.298847575624211e+27, + "learning_rate": 0.00024482203094998407, + "loss": 7.7469, + "step": 19580 + }, + { + "epoch": 1.8270971353923673, + "grad_norm": 1.2904252736336366, + "learning_rate": 0.0002448161700417593, + "loss": 7.2913, + "step": 19581 + }, + { + "epoch": 1.8271904450872447, + "grad_norm": 2.198305490179948, + "learning_rate": 0.00024481030889244513, + "loss": 7.9958, + "step": 19582 + }, + { + "epoch": 1.827283754782122, + "grad_norm": 1.4768682935779774e+29, + "learning_rate": 0.00024480444750205646, + "loss": 7.9177, + "step": 19583 + }, + { + "epoch": 1.827377064476999, + "grad_norm": 3.473103642213918e+29, + "learning_rate": 0.0002447985858706081, + "loss": 7.6804, + "step": 19584 + }, + { + "epoch": 1.8274703741718765, + "grad_norm": 1.210816803335173, + "learning_rate": 0.00024479272399811506, + "loss": 7.7055, + "step": 19585 + }, + { + "epoch": 1.8275636838667537, + "grad_norm": 1.6181031160944097, + "learning_rate": 0.00024478686188459216, + "loss": 7.5289, + "step": 19586 + }, + { + "epoch": 1.827656993561631, + "grad_norm": 1.8003239490830618, + "learning_rate": 0.00024478099953005437, + "loss": 7.5249, + "step": 19587 + }, + { + "epoch": 1.8277503032565083, + "grad_norm": 1.6079706284559643, + "learning_rate": 0.00024477513693451665, + "loss": 7.6131, + "step": 19588 + }, + { + "epoch": 1.8278436129513858, + "grad_norm": 3.4386379901801334e+29, + "learning_rate": 0.0002447692740979937, + "loss": 7.1972, + "step": 19589 + }, + { + "epoch": 1.827936922646263, + "grad_norm": 1.042451714846036e+28, + "learning_rate": 0.0002447634110205007, + "loss": 7.4871, + "step": 19590 + }, + { + "epoch": 1.8280302323411401, + "grad_norm": 1.368320577046171, + "learning_rate": 0.0002447575477020523, + "loss": 7.5243, + "step": 19591 + }, + { + "epoch": 1.8281235420360176, + "grad_norm": 3.454139138718535e+29, + "learning_rate": 0.0002447516841426635, + "loss": 7.465, + "step": 19592 + }, + { + "epoch": 1.828216851730895, + "grad_norm": 1.454579814055695, + "learning_rate": 0.0002447458203423494, + "loss": 7.3961, + "step": 19593 + }, + { + "epoch": 1.8283101614257722, + "grad_norm": 3.53911946730088, + "learning_rate": 0.0002447399563011246, + "loss": 7.3065, + "step": 19594 + }, + { + "epoch": 1.8284034711206494, + "grad_norm": 1.1514715144024763, + "learning_rate": 0.00024473409201900416, + "loss": 7.353, + "step": 19595 + }, + { + "epoch": 1.8284967808155268, + "grad_norm": 1.510550587627835, + "learning_rate": 0.00024472822749600304, + "loss": 7.501, + "step": 19596 + }, + { + "epoch": 1.828590090510404, + "grad_norm": 1.6314990908877633, + "learning_rate": 0.00024472236273213605, + "loss": 7.6343, + "step": 19597 + }, + { + "epoch": 1.8286834002052812, + "grad_norm": 1.5703896816992928, + "learning_rate": 0.0002447164977274182, + "loss": 7.6191, + "step": 19598 + }, + { + "epoch": 1.8287767099001586, + "grad_norm": 1.1983689861401485, + "learning_rate": 0.00024471063248186433, + "loss": 7.5638, + "step": 19599 + }, + { + "epoch": 1.828870019595036, + "grad_norm": 1.238942809110007, + "learning_rate": 0.0002447047669954894, + "loss": 7.5206, + "step": 19600 + }, + { + "epoch": 1.8289633292899132, + "grad_norm": 1.7402505412531575, + "learning_rate": 0.0002446989012683083, + "loss": 7.328, + "step": 19601 + }, + { + "epoch": 1.8290566389847904, + "grad_norm": 1.8664326544513792, + "learning_rate": 0.00024469303530033593, + "loss": 7.1663, + "step": 19602 + }, + { + "epoch": 1.8291499486796678, + "grad_norm": 1.1765731386458205, + "learning_rate": 0.00024468716909158725, + "loss": 7.594, + "step": 19603 + }, + { + "epoch": 1.8292432583745453, + "grad_norm": 1.340812226728864, + "learning_rate": 0.0002446813026420771, + "loss": 7.317, + "step": 19604 + }, + { + "epoch": 1.8293365680694225, + "grad_norm": 3.1831192419054515e+28, + "learning_rate": 0.00024467543595182046, + "loss": 7.873, + "step": 19605 + }, + { + "epoch": 1.8294298777642997, + "grad_norm": 1.0477663684885585, + "learning_rate": 0.0002446695690208323, + "loss": 7.5015, + "step": 19606 + }, + { + "epoch": 1.829523187459177, + "grad_norm": 1.0328346309888234, + "learning_rate": 0.00024466370184912737, + "loss": 7.4812, + "step": 19607 + }, + { + "epoch": 1.8296164971540543, + "grad_norm": 1.4117285090832906e+27, + "learning_rate": 0.0002446578344367207, + "loss": 7.2026, + "step": 19608 + }, + { + "epoch": 1.8297098068489315, + "grad_norm": 1.5320317441445432, + "learning_rate": 0.0002446519667836272, + "loss": 7.1819, + "step": 19609 + }, + { + "epoch": 1.829803116543809, + "grad_norm": 1.046198380949973, + "learning_rate": 0.0002446460988898618, + "loss": 7.5018, + "step": 19610 + }, + { + "epoch": 1.8298964262386863, + "grad_norm": 1.663579241002805, + "learning_rate": 0.00024464023075543946, + "loss": 7.7865, + "step": 19611 + }, + { + "epoch": 1.8299897359335635, + "grad_norm": 1.940582395267547e+29, + "learning_rate": 0.0002446343623803749, + "loss": 7.6387, + "step": 19612 + }, + { + "epoch": 1.8300830456284407, + "grad_norm": 1.423905693079025, + "learning_rate": 0.0002446284937646833, + "loss": 7.7924, + "step": 19613 + }, + { + "epoch": 1.8301763553233181, + "grad_norm": 5.715802739478119e+28, + "learning_rate": 0.0002446226249083795, + "loss": 7.5582, + "step": 19614 + }, + { + "epoch": 1.8302696650181955, + "grad_norm": 1.3610738191711065, + "learning_rate": 0.0002446167558114784, + "loss": 7.6217, + "step": 19615 + }, + { + "epoch": 1.8303629747130727, + "grad_norm": 1.2668416318106974, + "learning_rate": 0.00024461088647399483, + "loss": 7.4795, + "step": 19616 + }, + { + "epoch": 1.83045628440795, + "grad_norm": 2.292847077369985e+29, + "learning_rate": 0.00024460501689594385, + "loss": 7.5155, + "step": 19617 + }, + { + "epoch": 1.8305495941028274, + "grad_norm": 1.551900119521838, + "learning_rate": 0.00024459914707734034, + "loss": 7.6321, + "step": 19618 + }, + { + "epoch": 1.8306429037977046, + "grad_norm": 1.9398097918663586e+29, + "learning_rate": 0.0002445932770181992, + "loss": 7.3116, + "step": 19619 + }, + { + "epoch": 1.8307362134925818, + "grad_norm": 1.3050583199720103, + "learning_rate": 0.00024458740671853536, + "loss": 7.6361, + "step": 19620 + }, + { + "epoch": 1.8308295231874592, + "grad_norm": 1.3195747400175408, + "learning_rate": 0.0002445815361783638, + "loss": 7.1934, + "step": 19621 + }, + { + "epoch": 1.8309228328823366, + "grad_norm": 7.309068008320863e+29, + "learning_rate": 0.0002445756653976994, + "loss": 7.1683, + "step": 19622 + }, + { + "epoch": 1.8310161425772138, + "grad_norm": 1.644347501959748, + "learning_rate": 0.00024456979437655714, + "loss": 7.5014, + "step": 19623 + }, + { + "epoch": 1.831109452272091, + "grad_norm": 2.031994299863128, + "learning_rate": 0.0002445639231149519, + "loss": 7.3413, + "step": 19624 + }, + { + "epoch": 1.8312027619669684, + "grad_norm": 3.372617557665357e+27, + "learning_rate": 0.00024455805161289855, + "loss": 7.4918, + "step": 19625 + }, + { + "epoch": 1.8312960716618458, + "grad_norm": 1.1094618944866246, + "learning_rate": 0.0002445521798704121, + "loss": 7.362, + "step": 19626 + }, + { + "epoch": 1.8313893813567228, + "grad_norm": 1.0811882778323236, + "learning_rate": 0.0002445463078875075, + "loss": 7.3264, + "step": 19627 + }, + { + "epoch": 1.8314826910516002, + "grad_norm": 33.524430512057535, + "learning_rate": 0.0002445404356641996, + "loss": 7.5909, + "step": 19628 + }, + { + "epoch": 1.8315760007464776, + "grad_norm": 1.1573682427895735, + "learning_rate": 0.0002445345632005034, + "loss": 7.3813, + "step": 19629 + }, + { + "epoch": 1.8316693104413548, + "grad_norm": 1.1624863159514438, + "learning_rate": 0.00024452869049643383, + "loss": 7.6577, + "step": 19630 + }, + { + "epoch": 1.831762620136232, + "grad_norm": 1.1146136557090593, + "learning_rate": 0.0002445228175520058, + "loss": 7.5972, + "step": 19631 + }, + { + "epoch": 1.8318559298311095, + "grad_norm": 1.9905947283339707, + "learning_rate": 0.0002445169443672342, + "loss": 7.489, + "step": 19632 + }, + { + "epoch": 1.8319492395259869, + "grad_norm": 1.0666253290361243e+29, + "learning_rate": 0.00024451107094213405, + "loss": 7.4539, + "step": 19633 + }, + { + "epoch": 1.832042549220864, + "grad_norm": 1.75763657881175, + "learning_rate": 0.00024450519727672025, + "loss": 7.2986, + "step": 19634 + }, + { + "epoch": 1.8321358589157413, + "grad_norm": 1.1646323929353126, + "learning_rate": 0.00024449932337100767, + "loss": 7.4712, + "step": 19635 + }, + { + "epoch": 1.8322291686106187, + "grad_norm": 8.954777725895871e+27, + "learning_rate": 0.0002444934492250114, + "loss": 7.5214, + "step": 19636 + }, + { + "epoch": 1.832322478305496, + "grad_norm": 6.004306168654234e+27, + "learning_rate": 0.0002444875748387462, + "loss": 7.2485, + "step": 19637 + }, + { + "epoch": 1.832415788000373, + "grad_norm": 1.1972410327156022, + "learning_rate": 0.00024448170021222713, + "loss": 7.472, + "step": 19638 + }, + { + "epoch": 1.8325090976952505, + "grad_norm": 1.7272432197562255, + "learning_rate": 0.000244475825345469, + "loss": 7.9183, + "step": 19639 + }, + { + "epoch": 1.832602407390128, + "grad_norm": 1.66845972005862, + "learning_rate": 0.00024446995023848694, + "loss": 7.615, + "step": 19640 + }, + { + "epoch": 1.8326957170850051, + "grad_norm": 1.3610276711583777, + "learning_rate": 0.0002444640748912957, + "loss": 7.6835, + "step": 19641 + }, + { + "epoch": 1.8327890267798823, + "grad_norm": 2.9265439272729523, + "learning_rate": 0.0002444581993039103, + "loss": 7.7114, + "step": 19642 + }, + { + "epoch": 1.8328823364747597, + "grad_norm": 1.6950197088141192, + "learning_rate": 0.00024445232347634574, + "loss": 7.2765, + "step": 19643 + }, + { + "epoch": 1.8329756461696372, + "grad_norm": 1.5311046010304452, + "learning_rate": 0.0002444464474086169, + "loss": 7.3292, + "step": 19644 + }, + { + "epoch": 1.8330689558645143, + "grad_norm": 1.1570842420683034, + "learning_rate": 0.0002444405711007387, + "loss": 7.4877, + "step": 19645 + }, + { + "epoch": 1.8331622655593915, + "grad_norm": 1.1987691019038509, + "learning_rate": 0.0002444346945527261, + "loss": 7.3889, + "step": 19646 + }, + { + "epoch": 1.833255575254269, + "grad_norm": 1.4920098988658346, + "learning_rate": 0.0002444288177645941, + "loss": 7.1744, + "step": 19647 + }, + { + "epoch": 1.8333488849491462, + "grad_norm": 1.0857680097272582, + "learning_rate": 0.00024442294073635746, + "loss": 7.3556, + "step": 19648 + }, + { + "epoch": 1.8334421946440234, + "grad_norm": 2.431307832149268, + "learning_rate": 0.0002444170634680313, + "loss": 7.9602, + "step": 19649 + }, + { + "epoch": 1.8335355043389008, + "grad_norm": 1.9143838408687732, + "learning_rate": 0.0002444111859596306, + "loss": 7.6436, + "step": 19650 + }, + { + "epoch": 1.8336288140337782, + "grad_norm": 1.6289805216454174, + "learning_rate": 0.00024440530821117013, + "loss": 7.8052, + "step": 19651 + }, + { + "epoch": 1.8337221237286554, + "grad_norm": 1.207138242916154, + "learning_rate": 0.000244399430222665, + "loss": 7.8635, + "step": 19652 + }, + { + "epoch": 1.8338154334235326, + "grad_norm": 1.3408034290385231, + "learning_rate": 0.00024439355199413, + "loss": 7.7343, + "step": 19653 + }, + { + "epoch": 1.83390874311841, + "grad_norm": 2.4448175711912126, + "learning_rate": 0.00024438767352558024, + "loss": 7.5718, + "step": 19654 + }, + { + "epoch": 1.8340020528132874, + "grad_norm": 2.0185218094177544, + "learning_rate": 0.0002443817948170305, + "loss": 7.6375, + "step": 19655 + }, + { + "epoch": 1.8340953625081646, + "grad_norm": 1.3611914380669514e+29, + "learning_rate": 0.00024437591586849586, + "loss": 7.6933, + "step": 19656 + }, + { + "epoch": 1.8341886722030418, + "grad_norm": 1.4957974035701835, + "learning_rate": 0.00024437003667999123, + "loss": 7.34, + "step": 19657 + }, + { + "epoch": 1.8342819818979192, + "grad_norm": 1.2986247435524623, + "learning_rate": 0.0002443641572515315, + "loss": 7.3869, + "step": 19658 + }, + { + "epoch": 1.8343752915927964, + "grad_norm": 4.598343151138309, + "learning_rate": 0.0002443582775831318, + "loss": 7.7933, + "step": 19659 + }, + { + "epoch": 1.8344686012876736, + "grad_norm": 1.6202066140833447, + "learning_rate": 0.00024435239767480686, + "loss": 7.5286, + "step": 19660 + }, + { + "epoch": 1.834561910982551, + "grad_norm": 1.0349761449346362, + "learning_rate": 0.00024434651752657167, + "loss": 7.4134, + "step": 19661 + }, + { + "epoch": 1.8346552206774285, + "grad_norm": 2.2378332476407072, + "learning_rate": 0.0002443406371384413, + "loss": 7.5909, + "step": 19662 + }, + { + "epoch": 1.8347485303723057, + "grad_norm": 1.40307316386881, + "learning_rate": 0.00024433475651043066, + "loss": 7.4223, + "step": 19663 + }, + { + "epoch": 1.8348418400671829, + "grad_norm": 1.1288993765472322, + "learning_rate": 0.00024432887564255465, + "loss": 7.6565, + "step": 19664 + }, + { + "epoch": 1.8349351497620603, + "grad_norm": 1.0924098693715623, + "learning_rate": 0.00024432299453482824, + "loss": 7.6087, + "step": 19665 + }, + { + "epoch": 1.8350284594569377, + "grad_norm": 1.6935484839911933e+29, + "learning_rate": 0.0002443171131872664, + "loss": 7.3129, + "step": 19666 + }, + { + "epoch": 1.835121769151815, + "grad_norm": 1.7155792574559873e+29, + "learning_rate": 0.00024431123159988406, + "loss": 7.4263, + "step": 19667 + }, + { + "epoch": 1.835215078846692, + "grad_norm": 3.4408004338819556, + "learning_rate": 0.0002443053497726962, + "loss": 7.2085, + "step": 19668 + }, + { + "epoch": 1.8353083885415695, + "grad_norm": 2.3222004991206617, + "learning_rate": 0.0002442994677057178, + "loss": 7.2193, + "step": 19669 + }, + { + "epoch": 1.8354016982364467, + "grad_norm": 1.025042285579749, + "learning_rate": 0.00024429358539896376, + "loss": 7.4627, + "step": 19670 + }, + { + "epoch": 1.835495007931324, + "grad_norm": 1.2810734775160744, + "learning_rate": 0.0002442877028524491, + "loss": 7.442, + "step": 19671 + }, + { + "epoch": 1.8355883176262013, + "grad_norm": 1.1234906338724215, + "learning_rate": 0.00024428182006618873, + "loss": 7.3356, + "step": 19672 + }, + { + "epoch": 1.8356816273210788, + "grad_norm": 1.3265652318803216, + "learning_rate": 0.00024427593704019757, + "loss": 7.2048, + "step": 19673 + }, + { + "epoch": 1.835774937015956, + "grad_norm": 1.8558684354494432, + "learning_rate": 0.00024427005377449066, + "loss": 7.5003, + "step": 19674 + }, + { + "epoch": 1.8358682467108332, + "grad_norm": 1.3591365221469388e+31, + "learning_rate": 0.00024426417026908296, + "loss": 7.457, + "step": 19675 + }, + { + "epoch": 1.8359615564057106, + "grad_norm": 6.391208323773621, + "learning_rate": 0.0002442582865239894, + "loss": 7.3233, + "step": 19676 + }, + { + "epoch": 1.836054866100588, + "grad_norm": 1.3478150981732793, + "learning_rate": 0.0002442524025392249, + "loss": 7.6082, + "step": 19677 + }, + { + "epoch": 1.8361481757954652, + "grad_norm": 1.1923940465732246, + "learning_rate": 0.0002442465183148044, + "loss": 7.61, + "step": 19678 + }, + { + "epoch": 1.8362414854903424, + "grad_norm": 8.741192701841392e+31, + "learning_rate": 0.00024424063385074306, + "loss": 7.1949, + "step": 19679 + }, + { + "epoch": 1.8363347951852198, + "grad_norm": 3.4592179448767255e+30, + "learning_rate": 0.0002442347491470556, + "loss": 8.0054, + "step": 19680 + }, + { + "epoch": 1.836428104880097, + "grad_norm": 1.7999876916448847e+30, + "learning_rate": 0.00024422886420375715, + "loss": 7.6507, + "step": 19681 + }, + { + "epoch": 1.8365214145749742, + "grad_norm": 3.043607335813526, + "learning_rate": 0.0002442229790208626, + "loss": 7.216, + "step": 19682 + }, + { + "epoch": 1.8366147242698516, + "grad_norm": 1.6017622586711455, + "learning_rate": 0.0002442170935983869, + "loss": 7.6237, + "step": 19683 + }, + { + "epoch": 1.836708033964729, + "grad_norm": 1.6298370050158795, + "learning_rate": 0.00024421120793634505, + "loss": 7.4066, + "step": 19684 + }, + { + "epoch": 1.8368013436596062, + "grad_norm": 1.4565455211888102e+30, + "learning_rate": 0.00024420532203475205, + "loss": 7.6351, + "step": 19685 + }, + { + "epoch": 1.8368946533544834, + "grad_norm": 1.0787553558699022e+29, + "learning_rate": 0.0002441994358936227, + "loss": 7.5655, + "step": 19686 + }, + { + "epoch": 1.8369879630493609, + "grad_norm": 1.363020438161352, + "learning_rate": 0.0002441935495129722, + "loss": 7.182, + "step": 19687 + }, + { + "epoch": 1.8370812727442383, + "grad_norm": 1.1367559092133157, + "learning_rate": 0.0002441876628928154, + "loss": 7.0913, + "step": 19688 + }, + { + "epoch": 1.8371745824391155, + "grad_norm": 2.28738359180456, + "learning_rate": 0.00024418177603316725, + "loss": 7.5175, + "step": 19689 + }, + { + "epoch": 1.8372678921339927, + "grad_norm": 1.7356622753638626, + "learning_rate": 0.0002441758889340427, + "loss": 7.5813, + "step": 19690 + }, + { + "epoch": 1.83736120182887, + "grad_norm": 1.9965554369527913e+31, + "learning_rate": 0.00024417000159545683, + "loss": 7.2037, + "step": 19691 + }, + { + "epoch": 1.8374545115237473, + "grad_norm": 1.0523345649994695, + "learning_rate": 0.0002441641140174245, + "loss": 7.4384, + "step": 19692 + }, + { + "epoch": 1.8375478212186245, + "grad_norm": 1.5058292783973391e+31, + "learning_rate": 0.00024415822619996074, + "loss": 7.4206, + "step": 19693 + }, + { + "epoch": 1.837641130913502, + "grad_norm": 1.098561668042537, + "learning_rate": 0.00024415233814308046, + "loss": 7.5302, + "step": 19694 + }, + { + "epoch": 1.8377344406083793, + "grad_norm": 1.0871895634806157, + "learning_rate": 0.00024414644984679874, + "loss": 7.5078, + "step": 19695 + }, + { + "epoch": 1.8378277503032565, + "grad_norm": 1.1091200018133012, + "learning_rate": 0.00024414056131113045, + "loss": 7.5749, + "step": 19696 + }, + { + "epoch": 1.8379210599981337, + "grad_norm": 1.0750238357006389e+30, + "learning_rate": 0.0002441346725360906, + "loss": 7.7431, + "step": 19697 + }, + { + "epoch": 1.8380143696930111, + "grad_norm": 2.127878137568466, + "learning_rate": 0.00024412878352169417, + "loss": 7.1707, + "step": 19698 + }, + { + "epoch": 1.8381076793878885, + "grad_norm": 1.7165931357810544, + "learning_rate": 0.00024412289426795615, + "loss": 7.1565, + "step": 19699 + }, + { + "epoch": 1.8382009890827657, + "grad_norm": 1.175619777414762, + "learning_rate": 0.00024411700477489147, + "loss": 7.4091, + "step": 19700 + }, + { + "epoch": 1.838294298777643, + "grad_norm": 1.5644407470539687e+30, + "learning_rate": 0.00024411111504251513, + "loss": 7.543, + "step": 19701 + }, + { + "epoch": 1.8383876084725204, + "grad_norm": 1.1772620060455459, + "learning_rate": 0.00024410522507084206, + "loss": 7.4187, + "step": 19702 + }, + { + "epoch": 1.8384809181673976, + "grad_norm": 2.4894178511358307e+30, + "learning_rate": 0.00024409933485988733, + "loss": 7.3652, + "step": 19703 + }, + { + "epoch": 1.8385742278622748, + "grad_norm": 1.7548385992951212, + "learning_rate": 0.00024409344440966583, + "loss": 7.6428, + "step": 19704 + }, + { + "epoch": 1.8386675375571522, + "grad_norm": 1.578755290144139, + "learning_rate": 0.00024408755372019262, + "loss": 7.5319, + "step": 19705 + }, + { + "epoch": 1.8387608472520296, + "grad_norm": 1.4040695743706375, + "learning_rate": 0.00024408166279148258, + "loss": 7.6686, + "step": 19706 + }, + { + "epoch": 1.8388541569469068, + "grad_norm": 1.3302022438769041, + "learning_rate": 0.00024407577162355076, + "loss": 7.3816, + "step": 19707 + }, + { + "epoch": 1.838947466641784, + "grad_norm": 2.226515448554052e+30, + "learning_rate": 0.00024406988021641212, + "loss": 7.6168, + "step": 19708 + }, + { + "epoch": 1.8390407763366614, + "grad_norm": 1.329013743373552, + "learning_rate": 0.00024406398857008165, + "loss": 7.8383, + "step": 19709 + }, + { + "epoch": 1.8391340860315388, + "grad_norm": 1.6006236142540087, + "learning_rate": 0.00024405809668457434, + "loss": 7.551, + "step": 19710 + }, + { + "epoch": 1.839227395726416, + "grad_norm": 1.0890109006294246, + "learning_rate": 0.0002440522045599051, + "loss": 7.589, + "step": 19711 + }, + { + "epoch": 1.8393207054212932, + "grad_norm": 3.6619122798142924e+30, + "learning_rate": 0.000244046312196089, + "loss": 7.7002, + "step": 19712 + }, + { + "epoch": 1.8394140151161706, + "grad_norm": 1.432162913869358, + "learning_rate": 0.00024404041959314097, + "loss": 7.6082, + "step": 19713 + }, + { + "epoch": 1.8395073248110478, + "grad_norm": 1.6797850634115243, + "learning_rate": 0.000244034526751076, + "loss": 7.2161, + "step": 19714 + }, + { + "epoch": 1.839600634505925, + "grad_norm": 1.0750697027599498, + "learning_rate": 0.00024402863366990914, + "loss": 7.2436, + "step": 19715 + }, + { + "epoch": 1.8396939442008025, + "grad_norm": 1.6354196820685876, + "learning_rate": 0.00024402274034965527, + "loss": 7.3973, + "step": 19716 + }, + { + "epoch": 1.8397872538956799, + "grad_norm": 1.2909057738378722, + "learning_rate": 0.0002440168467903294, + "loss": 7.4149, + "step": 19717 + }, + { + "epoch": 1.839880563590557, + "grad_norm": 1.7283404317645954, + "learning_rate": 0.0002440109529919466, + "loss": 7.4824, + "step": 19718 + }, + { + "epoch": 1.8399738732854343, + "grad_norm": 1.45550503099199, + "learning_rate": 0.00024400505895452179, + "loss": 7.3061, + "step": 19719 + }, + { + "epoch": 1.8400671829803117, + "grad_norm": 51.72994203419789, + "learning_rate": 0.00024399916467806993, + "loss": 7.4332, + "step": 19720 + }, + { + "epoch": 1.840160492675189, + "grad_norm": 1.5175747460997517, + "learning_rate": 0.00024399327016260604, + "loss": 7.7019, + "step": 19721 + }, + { + "epoch": 1.8402538023700663, + "grad_norm": 1.102138487560938, + "learning_rate": 0.00024398737540814512, + "loss": 7.3758, + "step": 19722 + }, + { + "epoch": 1.8403471120649435, + "grad_norm": 0.9858275889536382, + "learning_rate": 0.0002439814804147021, + "loss": 7.5021, + "step": 19723 + }, + { + "epoch": 1.840440421759821, + "grad_norm": 1.5496119151341823, + "learning_rate": 0.0002439755851822921, + "loss": 7.8601, + "step": 19724 + }, + { + "epoch": 1.8405337314546981, + "grad_norm": 1.0502078736392162, + "learning_rate": 0.00024396968971092994, + "loss": 7.8013, + "step": 19725 + }, + { + "epoch": 1.8406270411495753, + "grad_norm": 1.8611430077287041, + "learning_rate": 0.00024396379400063074, + "loss": 7.3998, + "step": 19726 + }, + { + "epoch": 1.8407203508444527, + "grad_norm": 1.4080916822941116, + "learning_rate": 0.00024395789805140944, + "loss": 7.4502, + "step": 19727 + }, + { + "epoch": 1.8408136605393302, + "grad_norm": 1.0996409346845528, + "learning_rate": 0.00024395200186328104, + "loss": 7.5438, + "step": 19728 + }, + { + "epoch": 1.8409069702342074, + "grad_norm": 1.5715159977349298, + "learning_rate": 0.0002439461054362605, + "loss": 7.362, + "step": 19729 + }, + { + "epoch": 1.8410002799290845, + "grad_norm": 1.02242876385566, + "learning_rate": 0.0002439402087703629, + "loss": 7.4002, + "step": 19730 + }, + { + "epoch": 1.841093589623962, + "grad_norm": 0.944571216401312, + "learning_rate": 0.00024393431186560312, + "loss": 7.0943, + "step": 19731 + }, + { + "epoch": 1.8411868993188394, + "grad_norm": 1.5411680421821865, + "learning_rate": 0.0002439284147219963, + "loss": 7.4577, + "step": 19732 + }, + { + "epoch": 1.8412802090137164, + "grad_norm": 1.9868694540722642, + "learning_rate": 0.00024392251733955722, + "loss": 7.6755, + "step": 19733 + }, + { + "epoch": 1.8413735187085938, + "grad_norm": 3.2438642193597564, + "learning_rate": 0.00024391661971830106, + "loss": 6.808, + "step": 19734 + }, + { + "epoch": 1.8414668284034712, + "grad_norm": 1.482417944502236, + "learning_rate": 0.00024391072185824276, + "loss": 7.7282, + "step": 19735 + }, + { + "epoch": 1.8415601380983484, + "grad_norm": 1.189432224887968, + "learning_rate": 0.0002439048237593973, + "loss": 7.5442, + "step": 19736 + }, + { + "epoch": 1.8416534477932256, + "grad_norm": 1.0174144489983832, + "learning_rate": 0.00024389892542177972, + "loss": 7.3378, + "step": 19737 + }, + { + "epoch": 1.841746757488103, + "grad_norm": 1.61652513638486e+29, + "learning_rate": 0.00024389302684540496, + "loss": 7.4432, + "step": 19738 + }, + { + "epoch": 1.8418400671829804, + "grad_norm": 1.494326834399637e+29, + "learning_rate": 0.00024388712803028804, + "loss": 7.3144, + "step": 19739 + }, + { + "epoch": 1.8419333768778576, + "grad_norm": 2.410856805233088, + "learning_rate": 0.00024388122897644397, + "loss": 7.4441, + "step": 19740 + }, + { + "epoch": 1.8420266865727348, + "grad_norm": 1.4227874507734428, + "learning_rate": 0.00024387532968388778, + "loss": 7.959, + "step": 19741 + }, + { + "epoch": 1.8421199962676122, + "grad_norm": 1.1250071478248609, + "learning_rate": 0.0002438694301526344, + "loss": 7.3449, + "step": 19742 + }, + { + "epoch": 1.8422133059624897, + "grad_norm": 1.3847414654015517, + "learning_rate": 0.00024386353038269887, + "loss": 7.4468, + "step": 19743 + }, + { + "epoch": 1.8423066156573666, + "grad_norm": 0.9822912560413265, + "learning_rate": 0.0002438576303740962, + "loss": 7.7473, + "step": 19744 + }, + { + "epoch": 1.842399925352244, + "grad_norm": 0.9854668300783496, + "learning_rate": 0.00024385173012684137, + "loss": 7.4945, + "step": 19745 + }, + { + "epoch": 1.8424932350471215, + "grad_norm": 2.1638624692706855e+30, + "learning_rate": 0.0002438458296409494, + "loss": 7.0397, + "step": 19746 + }, + { + "epoch": 1.8425865447419987, + "grad_norm": 1.1314525908369006, + "learning_rate": 0.00024383992891643526, + "loss": 7.4559, + "step": 19747 + }, + { + "epoch": 1.8426798544368759, + "grad_norm": 1.6210383245079083, + "learning_rate": 0.000243834027953314, + "loss": 7.7675, + "step": 19748 + }, + { + "epoch": 1.8427731641317533, + "grad_norm": 1.4646565752273164, + "learning_rate": 0.00024382812675160056, + "loss": 7.756, + "step": 19749 + }, + { + "epoch": 1.8428664738266307, + "grad_norm": 1.2194271069321942, + "learning_rate": 0.00024382222531131007, + "loss": 7.6374, + "step": 19750 + }, + { + "epoch": 1.842959783521508, + "grad_norm": 0.9690474807858915, + "learning_rate": 0.00024381632363245737, + "loss": 7.4375, + "step": 19751 + }, + { + "epoch": 1.843053093216385, + "grad_norm": 1.167438752808734, + "learning_rate": 0.00024381042171505758, + "loss": 7.9544, + "step": 19752 + }, + { + "epoch": 1.8431464029112625, + "grad_norm": 1.1169685018256272, + "learning_rate": 0.0002438045195591257, + "loss": 7.5387, + "step": 19753 + }, + { + "epoch": 1.8432397126061397, + "grad_norm": 1.3327884840220428, + "learning_rate": 0.0002437986171646767, + "loss": 7.6593, + "step": 19754 + }, + { + "epoch": 1.843333022301017, + "grad_norm": 1.5867944106787866, + "learning_rate": 0.00024379271453172556, + "loss": 7.5442, + "step": 19755 + }, + { + "epoch": 1.8434263319958943, + "grad_norm": 1.368508899024063, + "learning_rate": 0.0002437868116602874, + "loss": 7.4834, + "step": 19756 + }, + { + "epoch": 1.8435196416907718, + "grad_norm": 2.2969701547407e+29, + "learning_rate": 0.0002437809085503771, + "loss": 7.3815, + "step": 19757 + }, + { + "epoch": 1.843612951385649, + "grad_norm": 1.0472097331374481, + "learning_rate": 0.00024377500520200978, + "loss": 7.7815, + "step": 19758 + }, + { + "epoch": 1.8437062610805262, + "grad_norm": 0.994757284634812, + "learning_rate": 0.00024376910161520035, + "loss": 7.4044, + "step": 19759 + }, + { + "epoch": 1.8437995707754036, + "grad_norm": 2.044498711139391e+29, + "learning_rate": 0.0002437631977899639, + "loss": 7.6368, + "step": 19760 + }, + { + "epoch": 1.843892880470281, + "grad_norm": 1.3501808704941194, + "learning_rate": 0.00024375729372631542, + "loss": 7.6178, + "step": 19761 + }, + { + "epoch": 1.8439861901651582, + "grad_norm": 1.5701261756680593, + "learning_rate": 0.00024375138942426988, + "loss": 7.363, + "step": 19762 + }, + { + "epoch": 1.8440794998600354, + "grad_norm": 0.9343800406427932, + "learning_rate": 0.00024374548488384235, + "loss": 7.2823, + "step": 19763 + }, + { + "epoch": 1.8441728095549128, + "grad_norm": 0.8919150893801217, + "learning_rate": 0.00024373958010504783, + "loss": 7.2004, + "step": 19764 + }, + { + "epoch": 1.84426611924979, + "grad_norm": 2.3546108302515303, + "learning_rate": 0.00024373367508790132, + "loss": 7.4389, + "step": 19765 + }, + { + "epoch": 1.8443594289446672, + "grad_norm": 3.7321394949713107, + "learning_rate": 0.00024372776983241784, + "loss": 7.3528, + "step": 19766 + }, + { + "epoch": 1.8444527386395446, + "grad_norm": 1.5165079499127836, + "learning_rate": 0.00024372186433861245, + "loss": 7.5449, + "step": 19767 + }, + { + "epoch": 1.844546048334422, + "grad_norm": 1.8959261397726637, + "learning_rate": 0.00024371595860650007, + "loss": 7.0572, + "step": 19768 + }, + { + "epoch": 1.8446393580292992, + "grad_norm": 1.1592464837336622, + "learning_rate": 0.00024371005263609574, + "loss": 7.3138, + "step": 19769 + }, + { + "epoch": 1.8447326677241764, + "grad_norm": 1.872509719257903, + "learning_rate": 0.00024370414642741454, + "loss": 7.0014, + "step": 19770 + }, + { + "epoch": 1.8448259774190539, + "grad_norm": 4.7321348660618206e+28, + "learning_rate": 0.00024369823998047146, + "loss": 7.3852, + "step": 19771 + }, + { + "epoch": 1.8449192871139313, + "grad_norm": 1.1529629058283395, + "learning_rate": 0.0002436923332952815, + "loss": 7.3748, + "step": 19772 + }, + { + "epoch": 1.8450125968088085, + "grad_norm": 1.678093516931057, + "learning_rate": 0.0002436864263718597, + "loss": 7.7502, + "step": 19773 + }, + { + "epoch": 1.8451059065036857, + "grad_norm": 1.6014384264233525, + "learning_rate": 0.00024368051921022107, + "loss": 7.5809, + "step": 19774 + }, + { + "epoch": 1.845199216198563, + "grad_norm": 1.290748667133029, + "learning_rate": 0.00024367461181038064, + "loss": 7.6052, + "step": 19775 + }, + { + "epoch": 1.8452925258934403, + "grad_norm": 1.0923967436319706, + "learning_rate": 0.0002436687041723534, + "loss": 7.237, + "step": 19776 + }, + { + "epoch": 1.8453858355883175, + "grad_norm": 1.2490633425313333, + "learning_rate": 0.00024366279629615435, + "loss": 7.2128, + "step": 19777 + }, + { + "epoch": 1.845479145283195, + "grad_norm": 1.0599943996017527, + "learning_rate": 0.00024365688818179862, + "loss": 7.5274, + "step": 19778 + }, + { + "epoch": 1.8455724549780723, + "grad_norm": 0.9313815668833418, + "learning_rate": 0.00024365097982930115, + "loss": 7.4734, + "step": 19779 + }, + { + "epoch": 1.8456657646729495, + "grad_norm": 4.606360331818974e+27, + "learning_rate": 0.00024364507123867697, + "loss": 7.5756, + "step": 19780 + }, + { + "epoch": 1.8457590743678267, + "grad_norm": 5.272602830313571e+29, + "learning_rate": 0.00024363916240994116, + "loss": 7.0911, + "step": 19781 + }, + { + "epoch": 1.8458523840627041, + "grad_norm": 0.9147457197954845, + "learning_rate": 0.00024363325334310863, + "loss": 7.2853, + "step": 19782 + }, + { + "epoch": 1.8459456937575816, + "grad_norm": 1.4380552963281257, + "learning_rate": 0.00024362734403819448, + "loss": 7.5056, + "step": 19783 + }, + { + "epoch": 1.8460390034524587, + "grad_norm": 1.437060098772726, + "learning_rate": 0.00024362143449521375, + "loss": 7.6742, + "step": 19784 + }, + { + "epoch": 1.846132313147336, + "grad_norm": 1.0561614331911304, + "learning_rate": 0.00024361552471418145, + "loss": 7.0182, + "step": 19785 + }, + { + "epoch": 1.8462256228422134, + "grad_norm": 2.000719735584073, + "learning_rate": 0.0002436096146951126, + "loss": 7.3165, + "step": 19786 + }, + { + "epoch": 1.8463189325370906, + "grad_norm": 0.8104580503148959, + "learning_rate": 0.0002436037044380222, + "loss": 7.4935, + "step": 19787 + }, + { + "epoch": 1.8464122422319678, + "grad_norm": 0.9033040895908081, + "learning_rate": 0.00024359779394292537, + "loss": 7.5715, + "step": 19788 + }, + { + "epoch": 1.8465055519268452, + "grad_norm": 0.9980282771084618, + "learning_rate": 0.000243591883209837, + "loss": 7.5953, + "step": 19789 + }, + { + "epoch": 1.8465988616217226, + "grad_norm": 0.9542923664928953, + "learning_rate": 0.00024358597223877226, + "loss": 7.4848, + "step": 19790 + }, + { + "epoch": 1.8466921713165998, + "grad_norm": 0.8550233047432954, + "learning_rate": 0.00024358006102974609, + "loss": 7.6802, + "step": 19791 + }, + { + "epoch": 1.846785481011477, + "grad_norm": 6.649275883047771, + "learning_rate": 0.00024357414958277353, + "loss": 7.1893, + "step": 19792 + }, + { + "epoch": 1.8468787907063544, + "grad_norm": 1.2704283900818103e+30, + "learning_rate": 0.0002435682378978696, + "loss": 7.0418, + "step": 19793 + }, + { + "epoch": 1.8469721004012318, + "grad_norm": 0.8516461302043346, + "learning_rate": 0.00024356232597504943, + "loss": 7.4757, + "step": 19794 + }, + { + "epoch": 1.847065410096109, + "grad_norm": 0.8526940438462286, + "learning_rate": 0.000243556413814328, + "loss": 7.3835, + "step": 19795 + }, + { + "epoch": 1.8471587197909862, + "grad_norm": 1.6661147348049983, + "learning_rate": 0.0002435505014157202, + "loss": 7.9643, + "step": 19796 + }, + { + "epoch": 1.8472520294858636, + "grad_norm": 1.6260249955885842, + "learning_rate": 0.0002435445887792413, + "loss": 7.9038, + "step": 19797 + }, + { + "epoch": 1.8473453391807408, + "grad_norm": 1.1366689749587298, + "learning_rate": 0.00024353867590490617, + "loss": 7.5308, + "step": 19798 + }, + { + "epoch": 1.847438648875618, + "grad_norm": 3.123037196825537, + "learning_rate": 0.00024353276279272986, + "loss": 7.5265, + "step": 19799 + }, + { + "epoch": 1.8475319585704955, + "grad_norm": 0.9904184742834042, + "learning_rate": 0.00024352684944272754, + "loss": 7.5058, + "step": 19800 + }, + { + "epoch": 1.8476252682653729, + "grad_norm": 1.9255148748604825, + "learning_rate": 0.00024352093585491407, + "loss": 7.3745, + "step": 19801 + }, + { + "epoch": 1.84771857796025, + "grad_norm": 1.2496880441913456, + "learning_rate": 0.00024351502202930454, + "loss": 7.3609, + "step": 19802 + }, + { + "epoch": 1.8478118876551273, + "grad_norm": 1.319706616824068, + "learning_rate": 0.00024350910796591408, + "loss": 7.4661, + "step": 19803 + }, + { + "epoch": 1.8479051973500047, + "grad_norm": 1.1914895604869582, + "learning_rate": 0.00024350319366475765, + "loss": 7.4855, + "step": 19804 + }, + { + "epoch": 1.847998507044882, + "grad_norm": 1.4234179407696643e+29, + "learning_rate": 0.00024349727912585024, + "loss": 7.317, + "step": 19805 + }, + { + "epoch": 1.8480918167397593, + "grad_norm": 0.8309058590800965, + "learning_rate": 0.00024349136434920703, + "loss": 7.4836, + "step": 19806 + }, + { + "epoch": 1.8481851264346365, + "grad_norm": 1.0599936185351366, + "learning_rate": 0.0002434854493348429, + "loss": 7.58, + "step": 19807 + }, + { + "epoch": 1.848278436129514, + "grad_norm": 0.8624066050598643, + "learning_rate": 0.00024347953408277302, + "loss": 7.4156, + "step": 19808 + }, + { + "epoch": 1.8483717458243911, + "grad_norm": 4.3590902744602286e+27, + "learning_rate": 0.0002434736185930123, + "loss": 7.5138, + "step": 19809 + }, + { + "epoch": 1.8484650555192683, + "grad_norm": 0.7850220477457385, + "learning_rate": 0.00024346770286557594, + "loss": 7.4436, + "step": 19810 + }, + { + "epoch": 1.8485583652141457, + "grad_norm": 0.8324231108332666, + "learning_rate": 0.00024346178690047885, + "loss": 7.4996, + "step": 19811 + }, + { + "epoch": 1.8486516749090232, + "grad_norm": 0.8617022696806932, + "learning_rate": 0.0002434558706977361, + "loss": 7.4709, + "step": 19812 + }, + { + "epoch": 1.8487449846039004, + "grad_norm": 3.8698742624312e+28, + "learning_rate": 0.00024344995425736285, + "loss": 7.4587, + "step": 19813 + }, + { + "epoch": 1.8488382942987776, + "grad_norm": 1.4856864746387208, + "learning_rate": 0.000243444037579374, + "loss": 7.1488, + "step": 19814 + }, + { + "epoch": 1.848931603993655, + "grad_norm": 1.2770259930871561, + "learning_rate": 0.00024343812066378456, + "loss": 7.5869, + "step": 19815 + }, + { + "epoch": 1.8490249136885324, + "grad_norm": 1.3237444276662014, + "learning_rate": 0.00024343220351060975, + "loss": 7.5041, + "step": 19816 + }, + { + "epoch": 1.8491182233834096, + "grad_norm": 4.338782896429951e+28, + "learning_rate": 0.00024342628611986452, + "loss": 7.1835, + "step": 19817 + }, + { + "epoch": 1.8492115330782868, + "grad_norm": 1.3279164348869978, + "learning_rate": 0.00024342036849156387, + "loss": 7.655, + "step": 19818 + }, + { + "epoch": 1.8493048427731642, + "grad_norm": 41.58010953904975, + "learning_rate": 0.00024341445062572295, + "loss": 7.3924, + "step": 19819 + }, + { + "epoch": 1.8493981524680414, + "grad_norm": 1.2292754968538018, + "learning_rate": 0.00024340853252235675, + "loss": 7.1792, + "step": 19820 + }, + { + "epoch": 1.8494914621629186, + "grad_norm": 1.2752823727095202, + "learning_rate": 0.00024340261418148024, + "loss": 7.205, + "step": 19821 + }, + { + "epoch": 1.849584771857796, + "grad_norm": 0.8596619450001531, + "learning_rate": 0.00024339669560310864, + "loss": 7.483, + "step": 19822 + }, + { + "epoch": 1.8496780815526734, + "grad_norm": 0.8853281847932926, + "learning_rate": 0.00024339077678725687, + "loss": 7.437, + "step": 19823 + }, + { + "epoch": 1.8497713912475506, + "grad_norm": 0.7937619212220945, + "learning_rate": 0.00024338485773394005, + "loss": 7.4159, + "step": 19824 + }, + { + "epoch": 1.8498647009424278, + "grad_norm": 1.1169976921929994, + "learning_rate": 0.00024337893844317317, + "loss": 7.6143, + "step": 19825 + }, + { + "epoch": 1.8499580106373053, + "grad_norm": 0.8777231316754199, + "learning_rate": 0.0002433730189149713, + "loss": 7.3672, + "step": 19826 + }, + { + "epoch": 1.8500513203321827, + "grad_norm": 1.0300993611228566e+28, + "learning_rate": 0.00024336709914934954, + "loss": 7.3702, + "step": 19827 + }, + { + "epoch": 1.8501446300270599, + "grad_norm": 0.9253047945776895, + "learning_rate": 0.00024336117914632287, + "loss": 7.3338, + "step": 19828 + }, + { + "epoch": 1.850237939721937, + "grad_norm": 2.7203695069353593, + "learning_rate": 0.0002433552589059064, + "loss": 7.1399, + "step": 19829 + }, + { + "epoch": 1.8503312494168145, + "grad_norm": 1.1132181140034225, + "learning_rate": 0.00024334933842811514, + "loss": 7.0725, + "step": 19830 + }, + { + "epoch": 1.8504245591116917, + "grad_norm": 3.19154585413718e+28, + "learning_rate": 0.00024334341771296418, + "loss": 7.6529, + "step": 19831 + }, + { + "epoch": 1.8505178688065689, + "grad_norm": 1.1105982411505306, + "learning_rate": 0.00024333749676046855, + "loss": 7.5196, + "step": 19832 + }, + { + "epoch": 1.8506111785014463, + "grad_norm": 4.481713736154465e+28, + "learning_rate": 0.00024333157557064334, + "loss": 7.4398, + "step": 19833 + }, + { + "epoch": 1.8507044881963237, + "grad_norm": 0.9059056791604553, + "learning_rate": 0.00024332565414350354, + "loss": 7.2399, + "step": 19834 + }, + { + "epoch": 1.850797797891201, + "grad_norm": 0.8426556038991302, + "learning_rate": 0.00024331973247906428, + "loss": 7.2334, + "step": 19835 + }, + { + "epoch": 1.850891107586078, + "grad_norm": 8.801512449623156e+27, + "learning_rate": 0.00024331381057734057, + "loss": 7.471, + "step": 19836 + }, + { + "epoch": 1.8509844172809555, + "grad_norm": 1.2224268659877189, + "learning_rate": 0.0002433078884383475, + "loss": 7.1812, + "step": 19837 + }, + { + "epoch": 1.851077726975833, + "grad_norm": 1.028423209160276, + "learning_rate": 0.00024330196606210007, + "loss": 7.5137, + "step": 19838 + }, + { + "epoch": 1.85117103667071, + "grad_norm": 0.8368614461137904, + "learning_rate": 0.0002432960434486134, + "loss": 7.2695, + "step": 19839 + }, + { + "epoch": 1.8512643463655873, + "grad_norm": 1.2729453200162109, + "learning_rate": 0.00024329012059790252, + "loss": 7.0445, + "step": 19840 + }, + { + "epoch": 1.8513576560604648, + "grad_norm": 37.35896449452264, + "learning_rate": 0.00024328419750998251, + "loss": 7.4699, + "step": 19841 + }, + { + "epoch": 1.851450965755342, + "grad_norm": 1.1920087635474774, + "learning_rate": 0.00024327827418486842, + "loss": 7.614, + "step": 19842 + }, + { + "epoch": 1.8515442754502192, + "grad_norm": 0.8816622052209536, + "learning_rate": 0.0002432723506225753, + "loss": 7.1214, + "step": 19843 + }, + { + "epoch": 1.8516375851450966, + "grad_norm": 3.192822439548888e+27, + "learning_rate": 0.00024326642682311823, + "loss": 7.5534, + "step": 19844 + }, + { + "epoch": 1.851730894839974, + "grad_norm": 0.9001907639459069, + "learning_rate": 0.00024326050278651227, + "loss": 7.5829, + "step": 19845 + }, + { + "epoch": 1.8518242045348512, + "grad_norm": 0.9961754174430735, + "learning_rate": 0.00024325457851277248, + "loss": 7.2357, + "step": 19846 + }, + { + "epoch": 1.8519175142297284, + "grad_norm": 0.9625259022765658, + "learning_rate": 0.00024324865400191388, + "loss": 7.4584, + "step": 19847 + }, + { + "epoch": 1.8520108239246058, + "grad_norm": 7.518485969443869e+27, + "learning_rate": 0.00024324272925395162, + "loss": 7.3759, + "step": 19848 + }, + { + "epoch": 1.8521041336194832, + "grad_norm": 1.6658984654423965, + "learning_rate": 0.0002432368042689007, + "loss": 7.0845, + "step": 19849 + }, + { + "epoch": 1.8521974433143602, + "grad_norm": 0.889471421556261, + "learning_rate": 0.0002432308790467762, + "loss": 7.5517, + "step": 19850 + }, + { + "epoch": 1.8522907530092376, + "grad_norm": 1.4214881162959995e+28, + "learning_rate": 0.00024322495358759322, + "loss": 7.4906, + "step": 19851 + }, + { + "epoch": 1.852384062704115, + "grad_norm": 1.6403422417908406, + "learning_rate": 0.0002432190278913668, + "loss": 7.7975, + "step": 19852 + }, + { + "epoch": 1.8524773723989922, + "grad_norm": 1.650176842321812, + "learning_rate": 0.00024321310195811195, + "loss": 7.4445, + "step": 19853 + }, + { + "epoch": 1.8525706820938694, + "grad_norm": 1.529896387925945, + "learning_rate": 0.00024320717578784384, + "loss": 7.5993, + "step": 19854 + }, + { + "epoch": 1.8526639917887469, + "grad_norm": 0.9900913349488758, + "learning_rate": 0.00024320124938057746, + "loss": 7.4874, + "step": 19855 + }, + { + "epoch": 1.8527573014836243, + "grad_norm": 1.2473379712709418, + "learning_rate": 0.00024319532273632793, + "loss": 7.4836, + "step": 19856 + }, + { + "epoch": 1.8528506111785015, + "grad_norm": 0.8440959876910179, + "learning_rate": 0.00024318939585511032, + "loss": 7.6113, + "step": 19857 + }, + { + "epoch": 1.8529439208733787, + "grad_norm": 1.2158739818139335, + "learning_rate": 0.00024318346873693965, + "loss": 7.3765, + "step": 19858 + }, + { + "epoch": 1.853037230568256, + "grad_norm": 5.770501224692479e+26, + "learning_rate": 0.00024317754138183101, + "loss": 7.4765, + "step": 19859 + }, + { + "epoch": 1.8531305402631333, + "grad_norm": 5.5711604739399895e+26, + "learning_rate": 0.00024317161378979954, + "loss": 7.5354, + "step": 19860 + }, + { + "epoch": 1.8532238499580105, + "grad_norm": 1.6409868709059516, + "learning_rate": 0.0002431656859608602, + "loss": 7.0519, + "step": 19861 + }, + { + "epoch": 1.853317159652888, + "grad_norm": 0.9432311814728573, + "learning_rate": 0.00024315975789502818, + "loss": 7.6203, + "step": 19862 + }, + { + "epoch": 1.8534104693477653, + "grad_norm": 1.139690188128126e+28, + "learning_rate": 0.00024315382959231845, + "loss": 7.4656, + "step": 19863 + }, + { + "epoch": 1.8535037790426425, + "grad_norm": 0.7316973711727034, + "learning_rate": 0.0002431479010527461, + "loss": 7.315, + "step": 19864 + }, + { + "epoch": 1.8535970887375197, + "grad_norm": 1.236216243773934, + "learning_rate": 0.00024314197227632626, + "loss": 7.5215, + "step": 19865 + }, + { + "epoch": 1.8536903984323971, + "grad_norm": 1.5049613045460841, + "learning_rate": 0.00024313604326307397, + "loss": 7.3325, + "step": 19866 + }, + { + "epoch": 1.8537837081272746, + "grad_norm": 0.8637503720394002, + "learning_rate": 0.00024313011401300434, + "loss": 7.1947, + "step": 19867 + }, + { + "epoch": 1.8538770178221518, + "grad_norm": 0.8371866277787875, + "learning_rate": 0.00024312418452613238, + "loss": 7.3049, + "step": 19868 + }, + { + "epoch": 1.853970327517029, + "grad_norm": 0.8777547960511337, + "learning_rate": 0.00024311825480247321, + "loss": 7.5018, + "step": 19869 + }, + { + "epoch": 1.8540636372119064, + "grad_norm": 0.8928197144290887, + "learning_rate": 0.00024311232484204194, + "loss": 7.4319, + "step": 19870 + }, + { + "epoch": 1.8541569469067836, + "grad_norm": 2.881592127287001e+28, + "learning_rate": 0.00024310639464485354, + "loss": 7.4409, + "step": 19871 + }, + { + "epoch": 1.8542502566016608, + "grad_norm": 0.9471766648487154, + "learning_rate": 0.00024310046421092323, + "loss": 7.5971, + "step": 19872 + }, + { + "epoch": 1.8543435662965382, + "grad_norm": 1.7085054832272002, + "learning_rate": 0.000243094533540266, + "loss": 7.116, + "step": 19873 + }, + { + "epoch": 1.8544368759914156, + "grad_norm": 4.5443387442675087e+27, + "learning_rate": 0.0002430886026328969, + "loss": 7.1768, + "step": 19874 + }, + { + "epoch": 1.8545301856862928, + "grad_norm": 1.1068322005368807, + "learning_rate": 0.00024308267148883108, + "loss": 7.304, + "step": 19875 + }, + { + "epoch": 1.85462349538117, + "grad_norm": 3.682191929284487e+27, + "learning_rate": 0.00024307674010808365, + "loss": 7.5886, + "step": 19876 + }, + { + "epoch": 1.8547168050760474, + "grad_norm": 0.7779233486325439, + "learning_rate": 0.0002430708084906696, + "loss": 7.3203, + "step": 19877 + }, + { + "epoch": 1.8548101147709248, + "grad_norm": 1.2552395337502027, + "learning_rate": 0.00024306487663660402, + "loss": 7.2646, + "step": 19878 + }, + { + "epoch": 1.854903424465802, + "grad_norm": 2.0887337521420515e+27, + "learning_rate": 0.00024305894454590207, + "loss": 7.7732, + "step": 19879 + }, + { + "epoch": 1.8549967341606792, + "grad_norm": 3.861876718226601e+27, + "learning_rate": 0.0002430530122185788, + "loss": 7.4677, + "step": 19880 + }, + { + "epoch": 1.8550900438555566, + "grad_norm": 0.8130441610531961, + "learning_rate": 0.00024304707965464926, + "loss": 7.4147, + "step": 19881 + }, + { + "epoch": 1.8551833535504338, + "grad_norm": 0.9733842722323853, + "learning_rate": 0.00024304114685412854, + "loss": 7.4992, + "step": 19882 + }, + { + "epoch": 1.855276663245311, + "grad_norm": 0.8436197795971713, + "learning_rate": 0.00024303521381703177, + "loss": 7.3544, + "step": 19883 + }, + { + "epoch": 1.8553699729401885, + "grad_norm": 1.0625378906394578, + "learning_rate": 0.000243029280543374, + "loss": 7.6094, + "step": 19884 + }, + { + "epoch": 1.8554632826350659, + "grad_norm": 0.9853809843486034, + "learning_rate": 0.00024302334703317036, + "loss": 7.5361, + "step": 19885 + }, + { + "epoch": 1.855556592329943, + "grad_norm": 0.9698142368328928, + "learning_rate": 0.00024301741328643588, + "loss": 7.3644, + "step": 19886 + }, + { + "epoch": 1.8556499020248203, + "grad_norm": 0.9505141661958529, + "learning_rate": 0.00024301147930318566, + "loss": 7.453, + "step": 19887 + }, + { + "epoch": 1.8557432117196977, + "grad_norm": 1.1962289091597609, + "learning_rate": 0.00024300554508343482, + "loss": 7.6575, + "step": 19888 + }, + { + "epoch": 1.8558365214145751, + "grad_norm": 0.7351665704908457, + "learning_rate": 0.00024299961062719847, + "loss": 7.4012, + "step": 19889 + }, + { + "epoch": 1.8559298311094523, + "grad_norm": 1.1204197000567855, + "learning_rate": 0.00024299367593449156, + "loss": 7.6516, + "step": 19890 + }, + { + "epoch": 1.8560231408043295, + "grad_norm": 4.5010477838452256e+27, + "learning_rate": 0.00024298774100532934, + "loss": 7.1689, + "step": 19891 + }, + { + "epoch": 1.856116450499207, + "grad_norm": 1.0738581329505748, + "learning_rate": 0.0002429818058397268, + "loss": 7.7527, + "step": 19892 + }, + { + "epoch": 1.8562097601940841, + "grad_norm": 4.194550766395359e+27, + "learning_rate": 0.00024297587043769912, + "loss": 7.4709, + "step": 19893 + }, + { + "epoch": 1.8563030698889613, + "grad_norm": 1.2710816009155829e+29, + "learning_rate": 0.00024296993479926132, + "loss": 7.3902, + "step": 19894 + }, + { + "epoch": 1.8563963795838387, + "grad_norm": 1.1305831306671625, + "learning_rate": 0.00024296399892442854, + "loss": 7.4308, + "step": 19895 + }, + { + "epoch": 1.8564896892787162, + "grad_norm": 1.016423970322691, + "learning_rate": 0.00024295806281321582, + "loss": 7.4352, + "step": 19896 + }, + { + "epoch": 1.8565829989735934, + "grad_norm": 0.8131592435609909, + "learning_rate": 0.00024295212646563828, + "loss": 7.3681, + "step": 19897 + }, + { + "epoch": 1.8566763086684706, + "grad_norm": 0.7536200985984901, + "learning_rate": 0.00024294618988171106, + "loss": 7.4391, + "step": 19898 + }, + { + "epoch": 1.856769618363348, + "grad_norm": 1.0760550178282648, + "learning_rate": 0.00024294025306144916, + "loss": 7.1926, + "step": 19899 + }, + { + "epoch": 1.8568629280582254, + "grad_norm": 0.9681927452738982, + "learning_rate": 0.00024293431600486774, + "loss": 7.2991, + "step": 19900 + }, + { + "epoch": 1.8569562377531026, + "grad_norm": 11.563853325710156, + "learning_rate": 0.0002429283787119819, + "loss": 7.5161, + "step": 19901 + }, + { + "epoch": 1.8570495474479798, + "grad_norm": 1.2357485187856678, + "learning_rate": 0.0002429224411828067, + "loss": 7.5639, + "step": 19902 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 1.0212379979530135, + "learning_rate": 0.00024291650341735726, + "loss": 7.491, + "step": 19903 + }, + { + "epoch": 1.8572361668377344, + "grad_norm": 1.5333098055904418e+29, + "learning_rate": 0.0002429105654156487, + "loss": 7.0344, + "step": 19904 + }, + { + "epoch": 1.8573294765326116, + "grad_norm": 0.9589686646631345, + "learning_rate": 0.00024290462717769607, + "loss": 7.53, + "step": 19905 + }, + { + "epoch": 1.857422786227489, + "grad_norm": 0.8692545995197611, + "learning_rate": 0.00024289868870351448, + "loss": 7.3246, + "step": 19906 + }, + { + "epoch": 1.8575160959223664, + "grad_norm": 11.678501970487844, + "learning_rate": 0.0002428927499931191, + "loss": 7.8196, + "step": 19907 + }, + { + "epoch": 1.8576094056172436, + "grad_norm": 1.057583384760712, + "learning_rate": 0.0002428868110465249, + "loss": 7.3143, + "step": 19908 + }, + { + "epoch": 1.8577027153121208, + "grad_norm": 0.9517488622631433, + "learning_rate": 0.0002428808718637471, + "loss": 7.5543, + "step": 19909 + }, + { + "epoch": 1.8577960250069983, + "grad_norm": 0.8379686304350675, + "learning_rate": 0.00024287493244480075, + "loss": 7.4555, + "step": 19910 + }, + { + "epoch": 1.8578893347018757, + "grad_norm": 1.1563916837492263, + "learning_rate": 0.000242868992789701, + "loss": 7.1994, + "step": 19911 + }, + { + "epoch": 1.8579826443967529, + "grad_norm": 1.5660701039527554, + "learning_rate": 0.00024286305289846283, + "loss": 7.1551, + "step": 19912 + }, + { + "epoch": 1.85807595409163, + "grad_norm": 5.2905340539135e+27, + "learning_rate": 0.00024285711277110145, + "loss": 7.3863, + "step": 19913 + }, + { + "epoch": 1.8581692637865075, + "grad_norm": 1.8223011198108314, + "learning_rate": 0.000242851172407632, + "loss": 7.9276, + "step": 19914 + }, + { + "epoch": 1.8582625734813847, + "grad_norm": 1.0248747975413801, + "learning_rate": 0.00024284523180806945, + "loss": 7.3256, + "step": 19915 + }, + { + "epoch": 1.8583558831762619, + "grad_norm": 5.827001708618225e+28, + "learning_rate": 0.000242839290972429, + "loss": 7.373, + "step": 19916 + }, + { + "epoch": 1.8584491928711393, + "grad_norm": 1.382956242759397, + "learning_rate": 0.00024283334990072577, + "loss": 7.692, + "step": 19917 + }, + { + "epoch": 1.8585425025660167, + "grad_norm": 0.8429477943468418, + "learning_rate": 0.00024282740859297476, + "loss": 7.3809, + "step": 19918 + }, + { + "epoch": 1.858635812260894, + "grad_norm": 0.8061197416916464, + "learning_rate": 0.00024282146704919118, + "loss": 7.307, + "step": 19919 + }, + { + "epoch": 1.8587291219557711, + "grad_norm": 3.0563763790688298e+28, + "learning_rate": 0.0002428155252693901, + "loss": 7.4026, + "step": 19920 + }, + { + "epoch": 1.8588224316506485, + "grad_norm": 1.1243038557760245, + "learning_rate": 0.00024280958325358668, + "loss": 7.3208, + "step": 19921 + }, + { + "epoch": 1.858915741345526, + "grad_norm": 1.0578211115047766, + "learning_rate": 0.00024280364100179595, + "loss": 7.3788, + "step": 19922 + }, + { + "epoch": 1.8590090510404031, + "grad_norm": 1.111370098938385, + "learning_rate": 0.00024279769851403302, + "loss": 7.4045, + "step": 19923 + }, + { + "epoch": 1.8591023607352803, + "grad_norm": 0.8802776398906346, + "learning_rate": 0.00024279175579031308, + "loss": 7.4449, + "step": 19924 + }, + { + "epoch": 1.8591956704301578, + "grad_norm": 0.8341062196230639, + "learning_rate": 0.00024278581283065115, + "loss": 7.3763, + "step": 19925 + }, + { + "epoch": 1.859288980125035, + "grad_norm": 1.0073288942322394, + "learning_rate": 0.00024277986963506238, + "loss": 7.431, + "step": 19926 + }, + { + "epoch": 1.8593822898199122, + "grad_norm": 0.8233477121843419, + "learning_rate": 0.00024277392620356195, + "loss": 7.4549, + "step": 19927 + }, + { + "epoch": 1.8594755995147896, + "grad_norm": 2.1733607554062144e+29, + "learning_rate": 0.00024276798253616483, + "loss": 7.0429, + "step": 19928 + }, + { + "epoch": 1.859568909209667, + "grad_norm": 29.420908286615134, + "learning_rate": 0.00024276203863288627, + "loss": 7.2928, + "step": 19929 + }, + { + "epoch": 1.8596622189045442, + "grad_norm": 1.4314155815120868, + "learning_rate": 0.0002427560944937413, + "loss": 7.7512, + "step": 19930 + }, + { + "epoch": 1.8597555285994214, + "grad_norm": 1.4243950527178066, + "learning_rate": 0.00024275015011874506, + "loss": 7.5138, + "step": 19931 + }, + { + "epoch": 1.8598488382942988, + "grad_norm": 0.7903734724731838, + "learning_rate": 0.00024274420550791264, + "loss": 7.4457, + "step": 19932 + }, + { + "epoch": 1.8599421479891762, + "grad_norm": 1.3324308004503004e+28, + "learning_rate": 0.00024273826066125923, + "loss": 7.3195, + "step": 19933 + }, + { + "epoch": 1.8600354576840534, + "grad_norm": 1.0565600469868117, + "learning_rate": 0.00024273231557879983, + "loss": 7.3803, + "step": 19934 + }, + { + "epoch": 1.8601287673789306, + "grad_norm": 0.8305635000812549, + "learning_rate": 0.0002427263702605497, + "loss": 7.2693, + "step": 19935 + }, + { + "epoch": 1.860222077073808, + "grad_norm": 1.7057416764519522, + "learning_rate": 0.0002427204247065238, + "loss": 7.5471, + "step": 19936 + }, + { + "epoch": 1.8603153867686852, + "grad_norm": 3.0469140059943602, + "learning_rate": 0.00024271447891673732, + "loss": 7.4721, + "step": 19937 + }, + { + "epoch": 1.8604086964635624, + "grad_norm": 0.8458939533144761, + "learning_rate": 0.00024270853289120545, + "loss": 7.3751, + "step": 19938 + }, + { + "epoch": 1.8605020061584399, + "grad_norm": 1.0480246910523723, + "learning_rate": 0.00024270258662994318, + "loss": 7.4995, + "step": 19939 + }, + { + "epoch": 1.8605953158533173, + "grad_norm": 1.4305047353160611, + "learning_rate": 0.00024269664013296577, + "loss": 7.2394, + "step": 19940 + }, + { + "epoch": 1.8606886255481945, + "grad_norm": 5.127405128813576e+28, + "learning_rate": 0.00024269069340028822, + "loss": 7.9539, + "step": 19941 + }, + { + "epoch": 1.8607819352430717, + "grad_norm": 1.2890909124643113, + "learning_rate": 0.0002426847464319257, + "loss": 7.5077, + "step": 19942 + }, + { + "epoch": 1.860875244937949, + "grad_norm": 7.9765185367962665, + "learning_rate": 0.00024267879922789328, + "loss": 7.3828, + "step": 19943 + }, + { + "epoch": 1.8609685546328265, + "grad_norm": 3.373754697853699e+27, + "learning_rate": 0.00024267285178820622, + "loss": 7.5552, + "step": 19944 + }, + { + "epoch": 1.8610618643277035, + "grad_norm": 0.83782408717261, + "learning_rate": 0.00024266690411287948, + "loss": 7.3949, + "step": 19945 + }, + { + "epoch": 1.861155174022581, + "grad_norm": 12.84122630778091, + "learning_rate": 0.00024266095620192828, + "loss": 7.2426, + "step": 19946 + }, + { + "epoch": 1.8612484837174583, + "grad_norm": 1.0059452520867973, + "learning_rate": 0.0002426550080553677, + "loss": 7.445, + "step": 19947 + }, + { + "epoch": 1.8613417934123355, + "grad_norm": 2.1834685037958703, + "learning_rate": 0.00024264905967321288, + "loss": 7.5845, + "step": 19948 + }, + { + "epoch": 1.8614351031072127, + "grad_norm": 1.223066449570587, + "learning_rate": 0.00024264311105547893, + "loss": 7.4927, + "step": 19949 + }, + { + "epoch": 1.8615284128020901, + "grad_norm": 0.9286463241271131, + "learning_rate": 0.00024263716220218102, + "loss": 7.7971, + "step": 19950 + }, + { + "epoch": 1.8616217224969676, + "grad_norm": 1.0447069056126141, + "learning_rate": 0.00024263121311333426, + "loss": 7.6622, + "step": 19951 + }, + { + "epoch": 1.8617150321918448, + "grad_norm": 0.9154349721909723, + "learning_rate": 0.00024262526378895376, + "loss": 7.7981, + "step": 19952 + }, + { + "epoch": 1.861808341886722, + "grad_norm": 1.1369374063525115e+27, + "learning_rate": 0.0002426193142290546, + "loss": 7.6156, + "step": 19953 + }, + { + "epoch": 1.8619016515815994, + "grad_norm": 1.7039432477210055, + "learning_rate": 0.00024261336443365201, + "loss": 7.3606, + "step": 19954 + }, + { + "epoch": 1.8619949612764768, + "grad_norm": 1.6168155868650507, + "learning_rate": 0.00024260741440276107, + "loss": 7.3677, + "step": 19955 + }, + { + "epoch": 1.8620882709713538, + "grad_norm": 1.0697525570668283, + "learning_rate": 0.00024260146413639685, + "loss": 7.8411, + "step": 19956 + }, + { + "epoch": 1.8621815806662312, + "grad_norm": 1.1440292660711333, + "learning_rate": 0.00024259551363457461, + "loss": 7.1358, + "step": 19957 + }, + { + "epoch": 1.8622748903611086, + "grad_norm": 1.5552024037161325, + "learning_rate": 0.0002425895628973094, + "loss": 7.7393, + "step": 19958 + }, + { + "epoch": 1.8623682000559858, + "grad_norm": 1.0025249878465536, + "learning_rate": 0.0002425836119246163, + "loss": 7.1754, + "step": 19959 + }, + { + "epoch": 1.862461509750863, + "grad_norm": 1.034687139226302, + "learning_rate": 0.00024257766071651056, + "loss": 7.2688, + "step": 19960 + }, + { + "epoch": 1.8625548194457404, + "grad_norm": 1.5481461723499375, + "learning_rate": 0.00024257170927300724, + "loss": 7.5736, + "step": 19961 + }, + { + "epoch": 1.8626481291406178, + "grad_norm": 1.3705717018492138, + "learning_rate": 0.00024256575759412145, + "loss": 7.6347, + "step": 19962 + }, + { + "epoch": 1.862741438835495, + "grad_norm": 1.031647051961112, + "learning_rate": 0.00024255980567986838, + "loss": 7.6028, + "step": 19963 + }, + { + "epoch": 1.8628347485303722, + "grad_norm": 2.3592166513997843, + "learning_rate": 0.00024255385353026313, + "loss": 7.3229, + "step": 19964 + }, + { + "epoch": 1.8629280582252497, + "grad_norm": 1.0848718267637345, + "learning_rate": 0.00024254790114532085, + "loss": 7.433, + "step": 19965 + }, + { + "epoch": 1.8630213679201268, + "grad_norm": 1.9878228068925914, + "learning_rate": 0.0002425419485250567, + "loss": 7.0057, + "step": 19966 + }, + { + "epoch": 1.863114677615004, + "grad_norm": 1.23233851771383, + "learning_rate": 0.00024253599566948582, + "loss": 7.1386, + "step": 19967 + }, + { + "epoch": 1.8632079873098815, + "grad_norm": 4.696233165789268e+25, + "learning_rate": 0.00024253004257862327, + "loss": 7.4532, + "step": 19968 + }, + { + "epoch": 1.8633012970047589, + "grad_norm": 1.4570655136407761e+28, + "learning_rate": 0.0002425240892524842, + "loss": 7.5966, + "step": 19969 + }, + { + "epoch": 1.863394606699636, + "grad_norm": 1.5242884629101126e+28, + "learning_rate": 0.0002425181356910838, + "loss": 7.4701, + "step": 19970 + }, + { + "epoch": 1.8634879163945133, + "grad_norm": 1.6892380966335305, + "learning_rate": 0.00024251218189443721, + "loss": 7.5923, + "step": 19971 + }, + { + "epoch": 1.8635812260893907, + "grad_norm": 1.2077300175664083, + "learning_rate": 0.00024250622786255952, + "loss": 7.4361, + "step": 19972 + }, + { + "epoch": 1.8636745357842681, + "grad_norm": 1.4336907685064942e+28, + "learning_rate": 0.0002425002735954659, + "loss": 7.5869, + "step": 19973 + }, + { + "epoch": 1.8637678454791453, + "grad_norm": 1.4431460219464871, + "learning_rate": 0.0002424943190931715, + "loss": 7.4419, + "step": 19974 + }, + { + "epoch": 1.8638611551740225, + "grad_norm": 0.9006871716863166, + "learning_rate": 0.00024248836435569138, + "loss": 7.5472, + "step": 19975 + }, + { + "epoch": 1.8639544648689, + "grad_norm": 0.9814907764535545, + "learning_rate": 0.00024248240938304084, + "loss": 7.6949, + "step": 19976 + }, + { + "epoch": 1.8640477745637771, + "grad_norm": 1.1348568890360538, + "learning_rate": 0.00024247645417523492, + "loss": 7.4304, + "step": 19977 + }, + { + "epoch": 1.8641410842586543, + "grad_norm": 0.9056582950350334, + "learning_rate": 0.00024247049873228868, + "loss": 7.55, + "step": 19978 + }, + { + "epoch": 1.8642343939535317, + "grad_norm": 1.1521702443258919, + "learning_rate": 0.00024246454305421744, + "loss": 7.4663, + "step": 19979 + }, + { + "epoch": 1.8643277036484092, + "grad_norm": 5.921443888043403, + "learning_rate": 0.00024245858714103623, + "loss": 7.5188, + "step": 19980 + }, + { + "epoch": 1.8644210133432864, + "grad_norm": 0.8689032167477758, + "learning_rate": 0.0002424526309927602, + "loss": 7.752, + "step": 19981 + }, + { + "epoch": 1.8645143230381636, + "grad_norm": 1.56368964935604e+28, + "learning_rate": 0.00024244667460940456, + "loss": 7.3148, + "step": 19982 + }, + { + "epoch": 1.864607632733041, + "grad_norm": 1.0365130514901038, + "learning_rate": 0.0002424407179909844, + "loss": 7.3805, + "step": 19983 + }, + { + "epoch": 1.8647009424279184, + "grad_norm": 1.0379966639401523, + "learning_rate": 0.00024243476113751484, + "loss": 7.0871, + "step": 19984 + }, + { + "epoch": 1.8647942521227956, + "grad_norm": 1.0026472277142024, + "learning_rate": 0.0002424288040490111, + "loss": 7.4275, + "step": 19985 + }, + { + "epoch": 1.8648875618176728, + "grad_norm": 4.528218682924971e+27, + "learning_rate": 0.00024242284672548826, + "loss": 7.4568, + "step": 19986 + }, + { + "epoch": 1.8649808715125502, + "grad_norm": 1.4228655000631008, + "learning_rate": 0.0002424168891669615, + "loss": 7.6339, + "step": 19987 + }, + { + "epoch": 1.8650741812074274, + "grad_norm": 0.8832502964074441, + "learning_rate": 0.00024241093137344604, + "loss": 7.497, + "step": 19988 + }, + { + "epoch": 1.8651674909023046, + "grad_norm": 1.1489997052086522, + "learning_rate": 0.0002424049733449569, + "loss": 7.3305, + "step": 19989 + }, + { + "epoch": 1.865260800597182, + "grad_norm": 0.92727095157531, + "learning_rate": 0.0002423990150815093, + "loss": 7.1503, + "step": 19990 + }, + { + "epoch": 1.8653541102920594, + "grad_norm": 4.678106715444874e+28, + "learning_rate": 0.00024239305658311833, + "loss": 7.1578, + "step": 19991 + }, + { + "epoch": 1.8654474199869366, + "grad_norm": 1.3887000767204907, + "learning_rate": 0.0002423870978497992, + "loss": 7.4623, + "step": 19992 + }, + { + "epoch": 1.8655407296818138, + "grad_norm": 0.823927669754514, + "learning_rate": 0.0002423811388815671, + "loss": 7.4157, + "step": 19993 + }, + { + "epoch": 1.8656340393766913, + "grad_norm": 8.064787581549356e+25, + "learning_rate": 0.00024237517967843706, + "loss": 7.2241, + "step": 19994 + }, + { + "epoch": 1.8657273490715687, + "grad_norm": 10.112697863530771, + "learning_rate": 0.00024236922024042435, + "loss": 7.4038, + "step": 19995 + }, + { + "epoch": 1.8658206587664459, + "grad_norm": 0.9855646026558039, + "learning_rate": 0.00024236326056754405, + "loss": 7.4082, + "step": 19996 + }, + { + "epoch": 1.865913968461323, + "grad_norm": 0.9696434635510605, + "learning_rate": 0.00024235730065981135, + "loss": 7.1403, + "step": 19997 + }, + { + "epoch": 1.8660072781562005, + "grad_norm": 1.8557455772393716, + "learning_rate": 0.0002423513405172414, + "loss": 7.4973, + "step": 19998 + }, + { + "epoch": 1.8661005878510777, + "grad_norm": 289.9308856873929, + "learning_rate": 0.00024234538013984932, + "loss": 7.1933, + "step": 19999 + }, + { + "epoch": 1.8661938975459549, + "grad_norm": 2.2074161408262913, + "learning_rate": 0.0002423394195276503, + "loss": 7.605, + "step": 20000 + }, + { + "epoch": 1.8662872072408323, + "grad_norm": 1.497716827839457, + "learning_rate": 0.00024233345868065955, + "loss": 8.0006, + "step": 20001 + }, + { + "epoch": 1.8663805169357097, + "grad_norm": 1.0167878547533684, + "learning_rate": 0.00024232749759889208, + "loss": 7.4716, + "step": 20002 + }, + { + "epoch": 1.866473826630587, + "grad_norm": 1.3393000889827507, + "learning_rate": 0.00024232153628236315, + "loss": 7.4561, + "step": 20003 + }, + { + "epoch": 1.8665671363254641, + "grad_norm": 1.9693438690798968, + "learning_rate": 0.00024231557473108797, + "loss": 7.3979, + "step": 20004 + }, + { + "epoch": 1.8666604460203415, + "grad_norm": 1.4990667250881622, + "learning_rate": 0.00024230961294508156, + "loss": 7.3735, + "step": 20005 + }, + { + "epoch": 1.866753755715219, + "grad_norm": 5.422472261488245e+25, + "learning_rate": 0.00024230365092435917, + "loss": 7.1892, + "step": 20006 + }, + { + "epoch": 1.8668470654100962, + "grad_norm": 1.0809610810174632, + "learning_rate": 0.00024229768866893594, + "loss": 7.3351, + "step": 20007 + }, + { + "epoch": 1.8669403751049733, + "grad_norm": 0.90565042985596, + "learning_rate": 0.000242291726178827, + "loss": 7.5313, + "step": 20008 + }, + { + "epoch": 1.8670336847998508, + "grad_norm": 1.1353841965750457, + "learning_rate": 0.00024228576345404756, + "loss": 7.4222, + "step": 20009 + }, + { + "epoch": 1.867126994494728, + "grad_norm": 1.5372134986492425, + "learning_rate": 0.00024227980049461276, + "loss": 7.5933, + "step": 20010 + }, + { + "epoch": 1.8672203041896052, + "grad_norm": 1.7576251356245334, + "learning_rate": 0.00024227383730053779, + "loss": 7.6336, + "step": 20011 + }, + { + "epoch": 1.8673136138844826, + "grad_norm": 33.34328972565124, + "learning_rate": 0.00024226787387183774, + "loss": 7.527, + "step": 20012 + }, + { + "epoch": 1.86740692357936, + "grad_norm": 0.8794340207475637, + "learning_rate": 0.00024226191020852784, + "loss": 7.3419, + "step": 20013 + }, + { + "epoch": 1.8675002332742372, + "grad_norm": 4.892482401629611e+25, + "learning_rate": 0.00024225594631062324, + "loss": 7.5675, + "step": 20014 + }, + { + "epoch": 1.8675935429691144, + "grad_norm": 0.9436514939519924, + "learning_rate": 0.0002422499821781391, + "loss": 7.4589, + "step": 20015 + }, + { + "epoch": 1.8676868526639918, + "grad_norm": 0.9116200904373863, + "learning_rate": 0.00024224401781109057, + "loss": 7.7368, + "step": 20016 + }, + { + "epoch": 1.8677801623588692, + "grad_norm": 1.857120373998316e+27, + "learning_rate": 0.0002422380532094928, + "loss": 7.5699, + "step": 20017 + }, + { + "epoch": 1.8678734720537464, + "grad_norm": 1.9444882216083634, + "learning_rate": 0.00024223208837336106, + "loss": 7.4684, + "step": 20018 + }, + { + "epoch": 1.8679667817486236, + "grad_norm": 11.748005504177147, + "learning_rate": 0.00024222612330271036, + "loss": 7.5025, + "step": 20019 + }, + { + "epoch": 1.868060091443501, + "grad_norm": 0.9427617383371493, + "learning_rate": 0.00024222015799755598, + "loss": 7.5024, + "step": 20020 + }, + { + "epoch": 1.8681534011383782, + "grad_norm": 1.8403952441919078, + "learning_rate": 0.00024221419245791306, + "loss": 6.985, + "step": 20021 + }, + { + "epoch": 1.8682467108332554, + "grad_norm": 2.161266205200681e+28, + "learning_rate": 0.00024220822668379673, + "loss": 7.758, + "step": 20022 + }, + { + "epoch": 1.8683400205281329, + "grad_norm": 1.3204688308617618, + "learning_rate": 0.00024220226067522225, + "loss": 7.584, + "step": 20023 + }, + { + "epoch": 1.8684333302230103, + "grad_norm": 1.4515964637571157, + "learning_rate": 0.00024219629443220474, + "loss": 7.8026, + "step": 20024 + }, + { + "epoch": 1.8685266399178875, + "grad_norm": 1.1163008785368909, + "learning_rate": 0.0002421903279547593, + "loss": 7.5196, + "step": 20025 + }, + { + "epoch": 1.8686199496127647, + "grad_norm": 0.9273893712919817, + "learning_rate": 0.0002421843612429012, + "loss": 7.5243, + "step": 20026 + }, + { + "epoch": 1.868713259307642, + "grad_norm": 0.9600017684965986, + "learning_rate": 0.00024217839429664558, + "loss": 7.4268, + "step": 20027 + }, + { + "epoch": 1.8688065690025195, + "grad_norm": 1.0009610999889849, + "learning_rate": 0.0002421724271160076, + "loss": 7.6915, + "step": 20028 + }, + { + "epoch": 1.8688998786973967, + "grad_norm": 1.6381852977168985, + "learning_rate": 0.00024216645970100243, + "loss": 7.1407, + "step": 20029 + }, + { + "epoch": 1.868993188392274, + "grad_norm": 1.0625414062444933, + "learning_rate": 0.00024216049205164528, + "loss": 7.4314, + "step": 20030 + }, + { + "epoch": 1.8690864980871513, + "grad_norm": 1.0163055332112187, + "learning_rate": 0.0002421545241679513, + "loss": 7.5696, + "step": 20031 + }, + { + "epoch": 1.8691798077820285, + "grad_norm": 0.7661867968581261, + "learning_rate": 0.00024214855604993564, + "loss": 7.5439, + "step": 20032 + }, + { + "epoch": 1.8692731174769057, + "grad_norm": 0.8764927150124217, + "learning_rate": 0.0002421425876976135, + "loss": 7.7174, + "step": 20033 + }, + { + "epoch": 1.8693664271717831, + "grad_norm": 3.2946082755485754e+27, + "learning_rate": 0.00024213661911100007, + "loss": 7.4732, + "step": 20034 + }, + { + "epoch": 1.8694597368666606, + "grad_norm": 6.633732361296453, + "learning_rate": 0.00024213065029011048, + "loss": 7.4752, + "step": 20035 + }, + { + "epoch": 1.8695530465615378, + "grad_norm": 1.122733454190329, + "learning_rate": 0.00024212468123495996, + "loss": 7.3494, + "step": 20036 + }, + { + "epoch": 1.869646356256415, + "grad_norm": 1.2511739515038505, + "learning_rate": 0.0002421187119455637, + "loss": 7.5019, + "step": 20037 + }, + { + "epoch": 1.8697396659512924, + "grad_norm": 0.9632957354681773, + "learning_rate": 0.00024211274242193676, + "loss": 7.4642, + "step": 20038 + }, + { + "epoch": 1.8698329756461698, + "grad_norm": 8.871179776803015, + "learning_rate": 0.00024210677266409448, + "loss": 7.5942, + "step": 20039 + }, + { + "epoch": 1.869926285341047, + "grad_norm": 1.6226889307196315, + "learning_rate": 0.00024210080267205193, + "loss": 7.7143, + "step": 20040 + }, + { + "epoch": 1.8700195950359242, + "grad_norm": 0.8414427149893183, + "learning_rate": 0.00024209483244582434, + "loss": 7.5101, + "step": 20041 + }, + { + "epoch": 1.8701129047308016, + "grad_norm": 0.8882436656975707, + "learning_rate": 0.00024208886198542686, + "loss": 7.5969, + "step": 20042 + }, + { + "epoch": 1.8702062144256788, + "grad_norm": 4.138922602354526e+28, + "learning_rate": 0.00024208289129087465, + "loss": 7.6583, + "step": 20043 + }, + { + "epoch": 1.870299524120556, + "grad_norm": 3.546066004889115e+28, + "learning_rate": 0.00024207692036218293, + "loss": 7.4799, + "step": 20044 + }, + { + "epoch": 1.8703928338154334, + "grad_norm": 2.344828506591629, + "learning_rate": 0.00024207094919936694, + "loss": 7.0728, + "step": 20045 + }, + { + "epoch": 1.8704861435103108, + "grad_norm": 0.9483607055113666, + "learning_rate": 0.00024206497780244175, + "loss": 7.2938, + "step": 20046 + }, + { + "epoch": 1.870579453205188, + "grad_norm": 7.145329253348952e+27, + "learning_rate": 0.00024205900617142257, + "loss": 7.4989, + "step": 20047 + }, + { + "epoch": 1.8706727629000652, + "grad_norm": 2.6411265613349124e+28, + "learning_rate": 0.00024205303430632462, + "loss": 7.574, + "step": 20048 + }, + { + "epoch": 1.8707660725949427, + "grad_norm": 1.8206711157507143e+28, + "learning_rate": 0.0002420470622071631, + "loss": 7.2572, + "step": 20049 + }, + { + "epoch": 1.87085938228982, + "grad_norm": 0.9784697681269686, + "learning_rate": 0.00024204108987395316, + "loss": 7.1058, + "step": 20050 + }, + { + "epoch": 1.870952691984697, + "grad_norm": 1.533485477200865, + "learning_rate": 0.00024203511730670998, + "loss": 7.6136, + "step": 20051 + }, + { + "epoch": 1.8710460016795745, + "grad_norm": 1.5838196998301934, + "learning_rate": 0.00024202914450544878, + "loss": 7.7668, + "step": 20052 + }, + { + "epoch": 1.8711393113744519, + "grad_norm": 0.9691314922620181, + "learning_rate": 0.00024202317147018471, + "loss": 7.3947, + "step": 20053 + }, + { + "epoch": 1.871232621069329, + "grad_norm": 1.3186216452769084, + "learning_rate": 0.00024201719820093299, + "loss": 7.1519, + "step": 20054 + }, + { + "epoch": 1.8713259307642063, + "grad_norm": 0.9959667724518617, + "learning_rate": 0.00024201122469770877, + "loss": 7.4215, + "step": 20055 + }, + { + "epoch": 1.8714192404590837, + "grad_norm": 0.8598849569424799, + "learning_rate": 0.0002420052509605273, + "loss": 7.5106, + "step": 20056 + }, + { + "epoch": 1.8715125501539611, + "grad_norm": 1.4808327191512645, + "learning_rate": 0.00024199927698940367, + "loss": 7.5651, + "step": 20057 + }, + { + "epoch": 1.8716058598488383, + "grad_norm": 0.9453891234911384, + "learning_rate": 0.00024199330278435317, + "loss": 7.4398, + "step": 20058 + }, + { + "epoch": 1.8716991695437155, + "grad_norm": 1.0135427483563768, + "learning_rate": 0.00024198732834539096, + "loss": 7.6526, + "step": 20059 + }, + { + "epoch": 1.871792479238593, + "grad_norm": 3.285327282850865e+29, + "learning_rate": 0.00024198135367253218, + "loss": 7.2085, + "step": 20060 + }, + { + "epoch": 1.8718857889334704, + "grad_norm": 9.079941343766253e+29, + "learning_rate": 0.00024197537876579213, + "loss": 7.0224, + "step": 20061 + }, + { + "epoch": 1.8719790986283473, + "grad_norm": 1.8563024376782682, + "learning_rate": 0.0002419694036251859, + "loss": 7.8805, + "step": 20062 + }, + { + "epoch": 1.8720724083232247, + "grad_norm": 1.7544757726146776, + "learning_rate": 0.0002419634282507287, + "loss": 7.749, + "step": 20063 + }, + { + "epoch": 1.8721657180181022, + "grad_norm": 1.3955658117220435, + "learning_rate": 0.00024195745264243577, + "loss": 7.5773, + "step": 20064 + }, + { + "epoch": 1.8722590277129794, + "grad_norm": 9.483248881173811e+29, + "learning_rate": 0.00024195147680032228, + "loss": 7.2739, + "step": 20065 + }, + { + "epoch": 1.8723523374078566, + "grad_norm": 0.932225949058233, + "learning_rate": 0.00024194550072440343, + "loss": 7.3409, + "step": 20066 + }, + { + "epoch": 1.872445647102734, + "grad_norm": 1.0379002380477578, + "learning_rate": 0.00024193952441469438, + "loss": 7.3152, + "step": 20067 + }, + { + "epoch": 1.8725389567976114, + "grad_norm": 0.9962575187647875, + "learning_rate": 0.00024193354787121037, + "loss": 7.3818, + "step": 20068 + }, + { + "epoch": 1.8726322664924886, + "grad_norm": 0.9625682015653083, + "learning_rate": 0.00024192757109396654, + "loss": 7.4535, + "step": 20069 + }, + { + "epoch": 1.8727255761873658, + "grad_norm": 0.9247657958017348, + "learning_rate": 0.0002419215940829782, + "loss": 7.2594, + "step": 20070 + }, + { + "epoch": 1.8728188858822432, + "grad_norm": 1.2363670268821168e+29, + "learning_rate": 0.00024191561683826042, + "loss": 7.4125, + "step": 20071 + }, + { + "epoch": 1.8729121955771204, + "grad_norm": 1.3624257478839772, + "learning_rate": 0.00024190963935982846, + "loss": 7.0354, + "step": 20072 + }, + { + "epoch": 1.8730055052719976, + "grad_norm": 1.0850601789192424, + "learning_rate": 0.00024190366164769753, + "loss": 7.6025, + "step": 20073 + }, + { + "epoch": 1.873098814966875, + "grad_norm": 1.1217260566162846, + "learning_rate": 0.0002418976837018828, + "loss": 7.5242, + "step": 20074 + }, + { + "epoch": 1.8731921246617524, + "grad_norm": 1.144573031340817, + "learning_rate": 0.0002418917055223995, + "loss": 7.3704, + "step": 20075 + }, + { + "epoch": 1.8732854343566296, + "grad_norm": 0.7609400901892781, + "learning_rate": 0.0002418857271092628, + "loss": 7.2547, + "step": 20076 + }, + { + "epoch": 1.8733787440515068, + "grad_norm": 0.8166958847293054, + "learning_rate": 0.00024187974846248794, + "loss": 7.2067, + "step": 20077 + }, + { + "epoch": 1.8734720537463843, + "grad_norm": 3.971218755645011e+28, + "learning_rate": 0.00024187376958209006, + "loss": 7.4449, + "step": 20078 + }, + { + "epoch": 1.8735653634412617, + "grad_norm": 1.9774860201018057e+29, + "learning_rate": 0.00024186779046808443, + "loss": 7.5195, + "step": 20079 + }, + { + "epoch": 1.8736586731361389, + "grad_norm": 1.089385209103734, + "learning_rate": 0.00024186181112048621, + "loss": 7.4231, + "step": 20080 + }, + { + "epoch": 1.873751982831016, + "grad_norm": 6.300732306516557, + "learning_rate": 0.00024185583153931063, + "loss": 7.0255, + "step": 20081 + }, + { + "epoch": 1.8738452925258935, + "grad_norm": 1.7284818445592605, + "learning_rate": 0.00024184985172457282, + "loss": 7.6613, + "step": 20082 + }, + { + "epoch": 1.8739386022207707, + "grad_norm": 0.7848793643792861, + "learning_rate": 0.00024184387167628808, + "loss": 7.4189, + "step": 20083 + }, + { + "epoch": 1.8740319119156479, + "grad_norm": 0.9892824974616539, + "learning_rate": 0.00024183789139447162, + "loss": 7.5295, + "step": 20084 + }, + { + "epoch": 1.8741252216105253, + "grad_norm": 4.358275455574002e+29, + "learning_rate": 0.00024183191087913855, + "loss": 7.365, + "step": 20085 + }, + { + "epoch": 1.8742185313054027, + "grad_norm": 19.597824894401995, + "learning_rate": 0.00024182593013030414, + "loss": 7.8765, + "step": 20086 + }, + { + "epoch": 1.87431184100028, + "grad_norm": 0.9522551026897794, + "learning_rate": 0.00024181994914798365, + "loss": 7.4598, + "step": 20087 + }, + { + "epoch": 1.8744051506951571, + "grad_norm": 40.20924294748829, + "learning_rate": 0.00024181396793219214, + "loss": 7.3152, + "step": 20088 + }, + { + "epoch": 1.8744984603900345, + "grad_norm": 2.2561642081589642e+30, + "learning_rate": 0.00024180798648294497, + "loss": 7.263, + "step": 20089 + }, + { + "epoch": 1.874591770084912, + "grad_norm": 6.27069472729138e+28, + "learning_rate": 0.00024180200480025728, + "loss": 7.7861, + "step": 20090 + }, + { + "epoch": 1.8746850797797892, + "grad_norm": 0.8589745472274591, + "learning_rate": 0.00024179602288414427, + "loss": 7.525, + "step": 20091 + }, + { + "epoch": 1.8747783894746664, + "grad_norm": 1.2247064076441956, + "learning_rate": 0.00024179004073462116, + "loss": 7.529, + "step": 20092 + }, + { + "epoch": 1.8748716991695438, + "grad_norm": 0.9651007926886213, + "learning_rate": 0.0002417840583517032, + "loss": 7.3317, + "step": 20093 + }, + { + "epoch": 1.874965008864421, + "grad_norm": 2.329417556004702, + "learning_rate": 0.00024177807573540552, + "loss": 7.4953, + "step": 20094 + }, + { + "epoch": 1.8750583185592982, + "grad_norm": 0.9981215380896727, + "learning_rate": 0.00024177209288574344, + "loss": 7.5088, + "step": 20095 + }, + { + "epoch": 1.8751516282541756, + "grad_norm": 0.8281938758529264, + "learning_rate": 0.00024176610980273207, + "loss": 7.5249, + "step": 20096 + }, + { + "epoch": 1.875244937949053, + "grad_norm": 1.0230744876727471, + "learning_rate": 0.00024176012648638667, + "loss": 7.1906, + "step": 20097 + }, + { + "epoch": 1.8753382476439302, + "grad_norm": 1.293806977894407, + "learning_rate": 0.0002417541429367224, + "loss": 7.1644, + "step": 20098 + }, + { + "epoch": 1.8754315573388074, + "grad_norm": 2.8924201999567694, + "learning_rate": 0.00024174815915375462, + "loss": 7.2389, + "step": 20099 + }, + { + "epoch": 1.8755248670336848, + "grad_norm": 0.8683529713635822, + "learning_rate": 0.0002417421751374984, + "loss": 7.4401, + "step": 20100 + }, + { + "epoch": 1.8756181767285622, + "grad_norm": 0.9903977303790432, + "learning_rate": 0.000241736190887969, + "loss": 7.4489, + "step": 20101 + }, + { + "epoch": 1.8757114864234394, + "grad_norm": 1.3578678586856556, + "learning_rate": 0.0002417302064051817, + "loss": 7.66, + "step": 20102 + }, + { + "epoch": 1.8758047961183166, + "grad_norm": 0.9527956520558076, + "learning_rate": 0.0002417242216891516, + "loss": 7.4853, + "step": 20103 + }, + { + "epoch": 1.875898105813194, + "grad_norm": 0.9296510336153652, + "learning_rate": 0.00024171823673989394, + "loss": 7.5666, + "step": 20104 + }, + { + "epoch": 1.8759914155080712, + "grad_norm": 0.8950578625929273, + "learning_rate": 0.00024171225155742405, + "loss": 7.4657, + "step": 20105 + }, + { + "epoch": 1.8760847252029484, + "grad_norm": 0.9337991637722094, + "learning_rate": 0.00024170626614175705, + "loss": 7.5509, + "step": 20106 + }, + { + "epoch": 1.8761780348978259, + "grad_norm": 0.9859943971044481, + "learning_rate": 0.00024170028049290814, + "loss": 7.5948, + "step": 20107 + }, + { + "epoch": 1.8762713445927033, + "grad_norm": 1.177986951927201, + "learning_rate": 0.0002416942946108926, + "loss": 7.7332, + "step": 20108 + }, + { + "epoch": 1.8763646542875805, + "grad_norm": 0.8609175263984964, + "learning_rate": 0.0002416883084957257, + "loss": 7.4785, + "step": 20109 + }, + { + "epoch": 1.8764579639824577, + "grad_norm": 0.9005193718020388, + "learning_rate": 0.0002416823221474225, + "loss": 7.7024, + "step": 20110 + }, + { + "epoch": 1.876551273677335, + "grad_norm": 0.8117782415659113, + "learning_rate": 0.00024167633556599832, + "loss": 7.5828, + "step": 20111 + }, + { + "epoch": 1.8766445833722125, + "grad_norm": 1.0366153564979053, + "learning_rate": 0.00024167034875146844, + "loss": 7.4521, + "step": 20112 + }, + { + "epoch": 1.8767378930670897, + "grad_norm": 1.6373410425099986, + "learning_rate": 0.00024166436170384797, + "loss": 7.1, + "step": 20113 + }, + { + "epoch": 1.876831202761967, + "grad_norm": 0.8501558126117598, + "learning_rate": 0.00024165837442315213, + "loss": 7.5311, + "step": 20114 + }, + { + "epoch": 1.8769245124568443, + "grad_norm": 0.9892791171595741, + "learning_rate": 0.0002416523869093963, + "loss": 7.5143, + "step": 20115 + }, + { + "epoch": 1.8770178221517215, + "grad_norm": 2.581296115250796e+30, + "learning_rate": 0.00024164639916259557, + "loss": 7.4164, + "step": 20116 + }, + { + "epoch": 1.8771111318465987, + "grad_norm": 1.332636054704783, + "learning_rate": 0.0002416404111827652, + "loss": 7.3005, + "step": 20117 + }, + { + "epoch": 1.8772044415414761, + "grad_norm": 1.4523011735697646, + "learning_rate": 0.00024163442296992036, + "loss": 7.44, + "step": 20118 + }, + { + "epoch": 1.8772977512363536, + "grad_norm": 1.2077036510409405, + "learning_rate": 0.00024162843452407633, + "loss": 7.4767, + "step": 20119 + }, + { + "epoch": 1.8773910609312308, + "grad_norm": 0.8538622501702381, + "learning_rate": 0.00024162244584524835, + "loss": 7.5934, + "step": 20120 + }, + { + "epoch": 1.877484370626108, + "grad_norm": 1.5183362249968093, + "learning_rate": 0.00024161645693345166, + "loss": 7.0493, + "step": 20121 + }, + { + "epoch": 1.8775776803209854, + "grad_norm": 1.0902278408894355, + "learning_rate": 0.00024161046778870142, + "loss": 7.3389, + "step": 20122 + }, + { + "epoch": 1.8776709900158628, + "grad_norm": 1.158817013343055, + "learning_rate": 0.00024160447841101293, + "loss": 7.441, + "step": 20123 + }, + { + "epoch": 1.87776429971074, + "grad_norm": 1.301856931646877, + "learning_rate": 0.00024159848880040135, + "loss": 7.1393, + "step": 20124 + }, + { + "epoch": 1.8778576094056172, + "grad_norm": 1.1807137728460222, + "learning_rate": 0.000241592498956882, + "loss": 7.1868, + "step": 20125 + }, + { + "epoch": 1.8779509191004946, + "grad_norm": 7.720514286581386e+29, + "learning_rate": 0.00024158650888047003, + "loss": 7.6312, + "step": 20126 + }, + { + "epoch": 1.8780442287953718, + "grad_norm": 2.0431517191464359e+30, + "learning_rate": 0.00024158051857118068, + "loss": 7.373, + "step": 20127 + }, + { + "epoch": 1.878137538490249, + "grad_norm": 0.8340790118009289, + "learning_rate": 0.0002415745280290292, + "loss": 7.3584, + "step": 20128 + }, + { + "epoch": 1.8782308481851264, + "grad_norm": 4.251098107442563e+29, + "learning_rate": 0.00024156853725403087, + "loss": 7.6486, + "step": 20129 + }, + { + "epoch": 1.8783241578800038, + "grad_norm": 0.8472477647515563, + "learning_rate": 0.00024156254624620085, + "loss": 7.4205, + "step": 20130 + }, + { + "epoch": 1.878417467574881, + "grad_norm": 0.7474968193415482, + "learning_rate": 0.00024155655500555438, + "loss": 7.457, + "step": 20131 + }, + { + "epoch": 1.8785107772697582, + "grad_norm": 1.0520110653562518, + "learning_rate": 0.0002415505635321067, + "loss": 7.5245, + "step": 20132 + }, + { + "epoch": 1.8786040869646357, + "grad_norm": 2.3787275204499663e+28, + "learning_rate": 0.00024154457182587306, + "loss": 7.5916, + "step": 20133 + }, + { + "epoch": 1.878697396659513, + "grad_norm": 1.487930044813795, + "learning_rate": 0.00024153857988686874, + "loss": 7.0955, + "step": 20134 + }, + { + "epoch": 1.8787907063543903, + "grad_norm": 6.435016247134207e+26, + "learning_rate": 0.00024153258771510887, + "loss": 7.2812, + "step": 20135 + }, + { + "epoch": 1.8788840160492675, + "grad_norm": 3.814476810003419, + "learning_rate": 0.00024152659531060874, + "loss": 7.5463, + "step": 20136 + }, + { + "epoch": 1.8789773257441449, + "grad_norm": 1.1140836675527588, + "learning_rate": 0.00024152060267338367, + "loss": 7.3489, + "step": 20137 + }, + { + "epoch": 1.879070635439022, + "grad_norm": 1.4048876654688223, + "learning_rate": 0.00024151460980344872, + "loss": 7.4655, + "step": 20138 + }, + { + "epoch": 1.8791639451338993, + "grad_norm": 0.76973239351803, + "learning_rate": 0.0002415086167008193, + "loss": 7.2158, + "step": 20139 + }, + { + "epoch": 1.8792572548287767, + "grad_norm": 0.8876368563349061, + "learning_rate": 0.00024150262336551054, + "loss": 7.438, + "step": 20140 + }, + { + "epoch": 1.8793505645236541, + "grad_norm": 1.5323871309438192e+28, + "learning_rate": 0.0002414966297975377, + "loss": 7.4292, + "step": 20141 + }, + { + "epoch": 1.8794438742185313, + "grad_norm": 1.018634232473815, + "learning_rate": 0.00024149063599691607, + "loss": 7.3316, + "step": 20142 + }, + { + "epoch": 1.8795371839134085, + "grad_norm": 1.2764091785908034e+29, + "learning_rate": 0.0002414846419636608, + "loss": 7.6557, + "step": 20143 + }, + { + "epoch": 1.879630493608286, + "grad_norm": 1.32557501199327, + "learning_rate": 0.00024147864769778722, + "loss": 7.6335, + "step": 20144 + }, + { + "epoch": 1.8797238033031634, + "grad_norm": 0.8190178919994293, + "learning_rate": 0.00024147265319931054, + "loss": 7.494, + "step": 20145 + }, + { + "epoch": 1.8798171129980406, + "grad_norm": 6.253353932354008, + "learning_rate": 0.00024146665846824604, + "loss": 7.1309, + "step": 20146 + }, + { + "epoch": 1.8799104226929177, + "grad_norm": 4.4133230889803045e+27, + "learning_rate": 0.00024146066350460888, + "loss": 7.0512, + "step": 20147 + }, + { + "epoch": 1.8800037323877952, + "grad_norm": 1.2545118115394736, + "learning_rate": 0.00024145466830841433, + "loss": 7.4359, + "step": 20148 + }, + { + "epoch": 1.8800970420826724, + "grad_norm": 1.3001445745518196, + "learning_rate": 0.00024144867287967766, + "loss": 7.322, + "step": 20149 + }, + { + "epoch": 1.8801903517775496, + "grad_norm": 1.0271436844696145, + "learning_rate": 0.00024144267721841413, + "loss": 7.4602, + "step": 20150 + }, + { + "epoch": 1.880283661472427, + "grad_norm": 0.7677662010064743, + "learning_rate": 0.00024143668132463892, + "loss": 7.47, + "step": 20151 + }, + { + "epoch": 1.8803769711673044, + "grad_norm": 0.8997964350123937, + "learning_rate": 0.00024143068519836734, + "loss": 7.4264, + "step": 20152 + }, + { + "epoch": 1.8804702808621816, + "grad_norm": 1.0003363999916346, + "learning_rate": 0.00024142468883961462, + "loss": 7.4978, + "step": 20153 + }, + { + "epoch": 1.8805635905570588, + "grad_norm": 1.8476519405945806, + "learning_rate": 0.00024141869224839597, + "loss": 7.4235, + "step": 20154 + }, + { + "epoch": 1.8806569002519362, + "grad_norm": 2.2750833199113025e+28, + "learning_rate": 0.00024141269542472666, + "loss": 7.3674, + "step": 20155 + }, + { + "epoch": 1.8807502099468136, + "grad_norm": 2.017613602728966e+28, + "learning_rate": 0.00024140669836862196, + "loss": 7.5637, + "step": 20156 + }, + { + "epoch": 1.8808435196416906, + "grad_norm": 3.5408581773191163, + "learning_rate": 0.00024140070108009712, + "loss": 7.5706, + "step": 20157 + }, + { + "epoch": 1.880936829336568, + "grad_norm": 0.9698427100586717, + "learning_rate": 0.00024139470355916736, + "loss": 7.3773, + "step": 20158 + }, + { + "epoch": 1.8810301390314454, + "grad_norm": 1.072868447947038, + "learning_rate": 0.00024138870580584792, + "loss": 7.1931, + "step": 20159 + }, + { + "epoch": 1.8811234487263226, + "grad_norm": 0.8065623936507805, + "learning_rate": 0.00024138270782015408, + "loss": 7.3829, + "step": 20160 + }, + { + "epoch": 1.8812167584211998, + "grad_norm": 1.253087043049452, + "learning_rate": 0.0002413767096021011, + "loss": 7.5585, + "step": 20161 + }, + { + "epoch": 1.8813100681160773, + "grad_norm": 1.1041116844619916, + "learning_rate": 0.00024137071115170423, + "loss": 7.5777, + "step": 20162 + }, + { + "epoch": 1.8814033778109547, + "grad_norm": 0.8949766945891976, + "learning_rate": 0.00024136471246897869, + "loss": 7.4747, + "step": 20163 + }, + { + "epoch": 1.8814966875058319, + "grad_norm": 1.011793129681524, + "learning_rate": 0.00024135871355393976, + "loss": 7.1671, + "step": 20164 + }, + { + "epoch": 1.881589997200709, + "grad_norm": 1.1726539354422165, + "learning_rate": 0.00024135271440660266, + "loss": 7.3563, + "step": 20165 + }, + { + "epoch": 1.8816833068955865, + "grad_norm": 1.0493461970574742, + "learning_rate": 0.00024134671502698264, + "loss": 7.4676, + "step": 20166 + }, + { + "epoch": 1.881776616590464, + "grad_norm": 1.4156199601696637, + "learning_rate": 0.00024134071541509507, + "loss": 6.9186, + "step": 20167 + }, + { + "epoch": 1.8818699262853409, + "grad_norm": 4.8083453275715546e+29, + "learning_rate": 0.00024133471557095506, + "loss": 7.2698, + "step": 20168 + }, + { + "epoch": 1.8819632359802183, + "grad_norm": 1.3166067658884302, + "learning_rate": 0.00024132871549457796, + "loss": 7.4795, + "step": 20169 + }, + { + "epoch": 1.8820565456750957, + "grad_norm": 4.332540388715179e+29, + "learning_rate": 0.00024132271518597894, + "loss": 7.5157, + "step": 20170 + }, + { + "epoch": 1.882149855369973, + "grad_norm": 1.1198392763084004, + "learning_rate": 0.00024131671464517333, + "loss": 7.346, + "step": 20171 + }, + { + "epoch": 1.8822431650648501, + "grad_norm": 0.9416638444916294, + "learning_rate": 0.00024131071387217636, + "loss": 7.7113, + "step": 20172 + }, + { + "epoch": 1.8823364747597275, + "grad_norm": 1.1453543818107017, + "learning_rate": 0.0002413047128670033, + "loss": 7.9348, + "step": 20173 + }, + { + "epoch": 1.882429784454605, + "grad_norm": 0.9584342048613586, + "learning_rate": 0.0002412987116296694, + "loss": 7.7586, + "step": 20174 + }, + { + "epoch": 1.8825230941494822, + "grad_norm": 1.6506083959383147, + "learning_rate": 0.00024129271016018992, + "loss": 7.4467, + "step": 20175 + }, + { + "epoch": 1.8826164038443594, + "grad_norm": 1.3815259035909617, + "learning_rate": 0.00024128670845858014, + "loss": 7.2971, + "step": 20176 + }, + { + "epoch": 1.8827097135392368, + "grad_norm": 1.1602554952969957, + "learning_rate": 0.00024128070652485527, + "loss": 7.3171, + "step": 20177 + }, + { + "epoch": 1.882803023234114, + "grad_norm": 0.849736911786489, + "learning_rate": 0.00024127470435903064, + "loss": 7.4697, + "step": 20178 + }, + { + "epoch": 1.8828963329289912, + "grad_norm": 0.8732063289144194, + "learning_rate": 0.0002412687019611214, + "loss": 7.2128, + "step": 20179 + }, + { + "epoch": 1.8829896426238686, + "grad_norm": 1.3551513632802217, + "learning_rate": 0.00024126269933114297, + "loss": 7.6515, + "step": 20180 + }, + { + "epoch": 1.883082952318746, + "grad_norm": 1.1864964698112307e+30, + "learning_rate": 0.0002412566964691105, + "loss": 7.1585, + "step": 20181 + }, + { + "epoch": 1.8831762620136232, + "grad_norm": 1.026770723857829, + "learning_rate": 0.00024125069337503925, + "loss": 7.2977, + "step": 20182 + }, + { + "epoch": 1.8832695717085004, + "grad_norm": 0.9856404647895431, + "learning_rate": 0.00024124469004894458, + "loss": 7.035, + "step": 20183 + }, + { + "epoch": 1.8833628814033778, + "grad_norm": 9.442159999008416, + "learning_rate": 0.00024123868649084164, + "loss": 7.4726, + "step": 20184 + }, + { + "epoch": 1.8834561910982552, + "grad_norm": 1.0565231811729245, + "learning_rate": 0.00024123268270074573, + "loss": 7.5579, + "step": 20185 + }, + { + "epoch": 1.8835495007931324, + "grad_norm": 3.378835451686614e+30, + "learning_rate": 0.00024122667867867222, + "loss": 7.2494, + "step": 20186 + }, + { + "epoch": 1.8836428104880096, + "grad_norm": 1.4428746942384356, + "learning_rate": 0.00024122067442463623, + "loss": 7.3499, + "step": 20187 + }, + { + "epoch": 1.883736120182887, + "grad_norm": 2.763847523485962e+28, + "learning_rate": 0.00024121466993865307, + "loss": 7.4515, + "step": 20188 + }, + { + "epoch": 1.8838294298777642, + "grad_norm": 1.3769762287712919, + "learning_rate": 0.00024120866522073808, + "loss": 7.0823, + "step": 20189 + }, + { + "epoch": 1.8839227395726414, + "grad_norm": 7.398009382074968e+29, + "learning_rate": 0.00024120266027090643, + "loss": 7.6203, + "step": 20190 + }, + { + "epoch": 1.8840160492675189, + "grad_norm": 0.8748703190661, + "learning_rate": 0.00024119665508917347, + "loss": 7.4881, + "step": 20191 + }, + { + "epoch": 1.8841093589623963, + "grad_norm": 0.8642833052059651, + "learning_rate": 0.0002411906496755544, + "loss": 7.3871, + "step": 20192 + }, + { + "epoch": 1.8842026686572735, + "grad_norm": 1.393505359087013, + "learning_rate": 0.0002411846440300645, + "loss": 7.4989, + "step": 20193 + }, + { + "epoch": 1.8842959783521507, + "grad_norm": 1.202954947159881, + "learning_rate": 0.0002411786381527191, + "loss": 7.385, + "step": 20194 + }, + { + "epoch": 1.884389288047028, + "grad_norm": 5.534801757214567e+28, + "learning_rate": 0.0002411726320435334, + "loss": 7.6879, + "step": 20195 + }, + { + "epoch": 1.8844825977419055, + "grad_norm": 1.1902210172420242, + "learning_rate": 0.00024116662570252274, + "loss": 7.5528, + "step": 20196 + }, + { + "epoch": 1.8845759074367827, + "grad_norm": 1.0447305091969272, + "learning_rate": 0.00024116061912970234, + "loss": 7.5601, + "step": 20197 + }, + { + "epoch": 1.88466921713166, + "grad_norm": 1.252585320789569, + "learning_rate": 0.00024115461232508746, + "loss": 7.8116, + "step": 20198 + }, + { + "epoch": 1.8847625268265373, + "grad_norm": 1.1592168013386246, + "learning_rate": 0.0002411486052886934, + "loss": 7.6205, + "step": 20199 + }, + { + "epoch": 1.8848558365214145, + "grad_norm": 1.709897829792897, + "learning_rate": 0.0002411425980205355, + "loss": 7.2828, + "step": 20200 + }, + { + "epoch": 1.8849491462162917, + "grad_norm": 4.0572911536659056e+30, + "learning_rate": 0.00024113659052062893, + "loss": 7.1502, + "step": 20201 + }, + { + "epoch": 1.8850424559111691, + "grad_norm": 1.0863453298991559, + "learning_rate": 0.00024113058278898904, + "loss": 7.2399, + "step": 20202 + }, + { + "epoch": 1.8851357656060466, + "grad_norm": 1.1499809382779136, + "learning_rate": 0.00024112457482563105, + "loss": 7.1208, + "step": 20203 + }, + { + "epoch": 1.8852290753009238, + "grad_norm": 1.0395812573242713, + "learning_rate": 0.00024111856663057025, + "loss": 7.2273, + "step": 20204 + }, + { + "epoch": 1.885322384995801, + "grad_norm": 1.644215361865268, + "learning_rate": 0.0002411125582038219, + "loss": 7.331, + "step": 20205 + }, + { + "epoch": 1.8854156946906784, + "grad_norm": 2.650346282348203, + "learning_rate": 0.00024110654954540136, + "loss": 7.8354, + "step": 20206 + }, + { + "epoch": 1.8855090043855558, + "grad_norm": 1.6120495064728044, + "learning_rate": 0.00024110054065532385, + "loss": 7.1979, + "step": 20207 + }, + { + "epoch": 1.885602314080433, + "grad_norm": 2.4473776630578365, + "learning_rate": 0.00024109453153360464, + "loss": 7.6576, + "step": 20208 + }, + { + "epoch": 1.8856956237753102, + "grad_norm": 3.925036611245759e+25, + "learning_rate": 0.000241088522180259, + "loss": 7.4195, + "step": 20209 + }, + { + "epoch": 1.8857889334701876, + "grad_norm": 1.7751262456412584, + "learning_rate": 0.00024108251259530222, + "loss": 6.9767, + "step": 20210 + }, + { + "epoch": 1.8858822431650648, + "grad_norm": 1.8417412480536415, + "learning_rate": 0.0002410765027787496, + "loss": 6.9936, + "step": 20211 + }, + { + "epoch": 1.885975552859942, + "grad_norm": 9.693398811963264e+28, + "learning_rate": 0.00024107049273061645, + "loss": 7.2063, + "step": 20212 + }, + { + "epoch": 1.8860688625548194, + "grad_norm": 1.603836840800647e+28, + "learning_rate": 0.00024106448245091798, + "loss": 7.6605, + "step": 20213 + }, + { + "epoch": 1.8861621722496968, + "grad_norm": 1.1207507198996416, + "learning_rate": 0.00024105847193966955, + "loss": 7.2378, + "step": 20214 + }, + { + "epoch": 1.886255481944574, + "grad_norm": 1.4540134624911454, + "learning_rate": 0.00024105246119688633, + "loss": 7.7368, + "step": 20215 + }, + { + "epoch": 1.8863487916394512, + "grad_norm": 1.0536894216780537, + "learning_rate": 0.00024104645022258374, + "loss": 7.4924, + "step": 20216 + }, + { + "epoch": 1.8864421013343287, + "grad_norm": 1.0437796448403644e+29, + "learning_rate": 0.00024104043901677693, + "loss": 7.3281, + "step": 20217 + }, + { + "epoch": 1.886535411029206, + "grad_norm": 1.0135785622647107, + "learning_rate": 0.00024103442757948134, + "loss": 7.3285, + "step": 20218 + }, + { + "epoch": 1.8866287207240833, + "grad_norm": 0.9834440915226271, + "learning_rate": 0.00024102841591071208, + "loss": 7.5195, + "step": 20219 + }, + { + "epoch": 1.8867220304189605, + "grad_norm": 1.0059627869410475, + "learning_rate": 0.00024102240401048457, + "loss": 7.5179, + "step": 20220 + }, + { + "epoch": 1.886815340113838, + "grad_norm": 1.961329163011124e+29, + "learning_rate": 0.00024101639187881402, + "loss": 7.4253, + "step": 20221 + }, + { + "epoch": 1.886908649808715, + "grad_norm": 0.8671820806974054, + "learning_rate": 0.00024101037951571578, + "loss": 7.419, + "step": 20222 + }, + { + "epoch": 1.8870019595035923, + "grad_norm": 1.2664582124865813, + "learning_rate": 0.00024100436692120507, + "loss": 7.4348, + "step": 20223 + }, + { + "epoch": 1.8870952691984697, + "grad_norm": 1.02942312057361, + "learning_rate": 0.00024099835409529722, + "loss": 7.3101, + "step": 20224 + }, + { + "epoch": 1.8871885788933471, + "grad_norm": 1.2741066497630642, + "learning_rate": 0.00024099234103800753, + "loss": 7.6833, + "step": 20225 + }, + { + "epoch": 1.8872818885882243, + "grad_norm": 1.4005531564788483, + "learning_rate": 0.00024098632774935126, + "loss": 7.78, + "step": 20226 + }, + { + "epoch": 1.8873751982831015, + "grad_norm": 1.050037998342779, + "learning_rate": 0.0002409803142293437, + "loss": 7.2237, + "step": 20227 + }, + { + "epoch": 1.887468507977979, + "grad_norm": 2.4012481399518434e+29, + "learning_rate": 0.0002409743004780002, + "loss": 7.3102, + "step": 20228 + }, + { + "epoch": 1.8875618176728564, + "grad_norm": 1.4075807821509077, + "learning_rate": 0.00024096828649533593, + "loss": 7.2512, + "step": 20229 + }, + { + "epoch": 1.8876551273677336, + "grad_norm": 1.1664897756188888, + "learning_rate": 0.0002409622722813663, + "loss": 7.7759, + "step": 20230 + }, + { + "epoch": 1.8877484370626108, + "grad_norm": 0.8694815664508151, + "learning_rate": 0.00024095625783610657, + "loss": 7.5027, + "step": 20231 + }, + { + "epoch": 1.8878417467574882, + "grad_norm": 2.05863964654305, + "learning_rate": 0.00024095024315957196, + "loss": 7.3586, + "step": 20232 + }, + { + "epoch": 1.8879350564523654, + "grad_norm": 2.623125921634315, + "learning_rate": 0.00024094422825177787, + "loss": 7.4623, + "step": 20233 + }, + { + "epoch": 1.8880283661472426, + "grad_norm": 1.2326773567587064, + "learning_rate": 0.00024093821311273954, + "loss": 7.4118, + "step": 20234 + }, + { + "epoch": 1.88812167584212, + "grad_norm": 1.3644535378549498, + "learning_rate": 0.00024093219774247226, + "loss": 7.6621, + "step": 20235 + }, + { + "epoch": 1.8882149855369974, + "grad_norm": 1.3937214146307892, + "learning_rate": 0.00024092618214099137, + "loss": 7.6085, + "step": 20236 + }, + { + "epoch": 1.8883082952318746, + "grad_norm": 1.4561351236635882, + "learning_rate": 0.00024092016630831208, + "loss": 7.7465, + "step": 20237 + }, + { + "epoch": 1.8884016049267518, + "grad_norm": 1.0825040828453187, + "learning_rate": 0.0002409141502444498, + "loss": 7.5891, + "step": 20238 + }, + { + "epoch": 1.8884949146216292, + "grad_norm": 2.3174263270007223e+29, + "learning_rate": 0.0002409081339494197, + "loss": 7.3194, + "step": 20239 + }, + { + "epoch": 1.8885882243165066, + "grad_norm": 2.3702901111569815e+30, + "learning_rate": 0.0002409021174232372, + "loss": 7.3931, + "step": 20240 + }, + { + "epoch": 1.8886815340113838, + "grad_norm": 1.2752642395865317, + "learning_rate": 0.00024089610066591754, + "loss": 7.4457, + "step": 20241 + }, + { + "epoch": 1.888774843706261, + "grad_norm": 0.905018994925977, + "learning_rate": 0.00024089008367747598, + "loss": 7.4928, + "step": 20242 + }, + { + "epoch": 1.8888681534011384, + "grad_norm": 1.0055321018429677, + "learning_rate": 0.00024088406645792789, + "loss": 7.4321, + "step": 20243 + }, + { + "epoch": 1.8889614630960156, + "grad_norm": 1.303517298741234e+29, + "learning_rate": 0.00024087804900728855, + "loss": 7.5807, + "step": 20244 + }, + { + "epoch": 1.8890547727908928, + "grad_norm": 0.9962229228375039, + "learning_rate": 0.0002408720313255732, + "loss": 7.2057, + "step": 20245 + }, + { + "epoch": 1.8891480824857703, + "grad_norm": 1.0270925862099798, + "learning_rate": 0.00024086601341279727, + "loss": 7.3802, + "step": 20246 + }, + { + "epoch": 1.8892413921806477, + "grad_norm": 1.266477694165833, + "learning_rate": 0.00024085999526897594, + "loss": 7.2354, + "step": 20247 + }, + { + "epoch": 1.8893347018755249, + "grad_norm": 1.369135503346995, + "learning_rate": 0.00024085397689412453, + "loss": 7.5849, + "step": 20248 + }, + { + "epoch": 1.889428011570402, + "grad_norm": 1.4335591653266502, + "learning_rate": 0.00024084795828825843, + "loss": 7.5999, + "step": 20249 + }, + { + "epoch": 1.8895213212652795, + "grad_norm": 1.199472407874447, + "learning_rate": 0.00024084193945139284, + "loss": 7.5709, + "step": 20250 + }, + { + "epoch": 1.889614630960157, + "grad_norm": 1.1574613805716836, + "learning_rate": 0.00024083592038354312, + "loss": 7.6181, + "step": 20251 + }, + { + "epoch": 1.8897079406550341, + "grad_norm": 1.0792465416689967, + "learning_rate": 0.00024082990108472457, + "loss": 7.5438, + "step": 20252 + }, + { + "epoch": 1.8898012503499113, + "grad_norm": 0.94132516297601, + "learning_rate": 0.0002408238815549525, + "loss": 7.6765, + "step": 20253 + }, + { + "epoch": 1.8898945600447887, + "grad_norm": 2.954792852752717e+29, + "learning_rate": 0.00024081786179424214, + "loss": 7.6774, + "step": 20254 + }, + { + "epoch": 1.889987869739666, + "grad_norm": 1.3173166774297738, + "learning_rate": 0.00024081184180260894, + "loss": 7.4129, + "step": 20255 + }, + { + "epoch": 1.8900811794345431, + "grad_norm": 1.2215376765060921, + "learning_rate": 0.00024080582158006808, + "loss": 7.5774, + "step": 20256 + }, + { + "epoch": 1.8901744891294205, + "grad_norm": 0.9390488817299182, + "learning_rate": 0.00024079980112663493, + "loss": 7.6772, + "step": 20257 + }, + { + "epoch": 1.890267798824298, + "grad_norm": 2.733030281177554e+29, + "learning_rate": 0.00024079378044232475, + "loss": 7.2773, + "step": 20258 + }, + { + "epoch": 1.8903611085191752, + "grad_norm": 1.9111475674886598, + "learning_rate": 0.00024078775952715293, + "loss": 7.1591, + "step": 20259 + }, + { + "epoch": 1.8904544182140524, + "grad_norm": 5.211323129315742e+28, + "learning_rate": 0.0002407817383811347, + "loss": 7.5038, + "step": 20260 + }, + { + "epoch": 1.8905477279089298, + "grad_norm": 4.349295735286387e+27, + "learning_rate": 0.0002407757170042854, + "loss": 7.4118, + "step": 20261 + }, + { + "epoch": 1.8906410376038072, + "grad_norm": 1.9606173130724884, + "learning_rate": 0.00024076969539662039, + "loss": 7.7332, + "step": 20262 + }, + { + "epoch": 1.8907343472986842, + "grad_norm": 1.1342072890569708, + "learning_rate": 0.00024076367355815487, + "loss": 7.4073, + "step": 20263 + }, + { + "epoch": 1.8908276569935616, + "grad_norm": 0.9706899998755067, + "learning_rate": 0.00024075765148890425, + "loss": 7.4693, + "step": 20264 + }, + { + "epoch": 1.890920966688439, + "grad_norm": 1.384521275576778, + "learning_rate": 0.0002407516291888838, + "loss": 7.5198, + "step": 20265 + }, + { + "epoch": 1.8910142763833162, + "grad_norm": 1.2023876864019294, + "learning_rate": 0.0002407456066581088, + "loss": 7.5448, + "step": 20266 + }, + { + "epoch": 1.8911075860781934, + "grad_norm": 1.1793161431317023, + "learning_rate": 0.00024073958389659464, + "loss": 7.3692, + "step": 20267 + }, + { + "epoch": 1.8912008957730708, + "grad_norm": 1.7302774669711296, + "learning_rate": 0.00024073356090435663, + "loss": 7.3061, + "step": 20268 + }, + { + "epoch": 1.8912942054679482, + "grad_norm": 8.88283207326133, + "learning_rate": 0.00024072753768141002, + "loss": 7.385, + "step": 20269 + }, + { + "epoch": 1.8913875151628254, + "grad_norm": 4.9011205888967464e+29, + "learning_rate": 0.00024072151422777012, + "loss": 7.1482, + "step": 20270 + }, + { + "epoch": 1.8914808248577026, + "grad_norm": 1.042113950342542, + "learning_rate": 0.00024071549054345234, + "loss": 7.5808, + "step": 20271 + }, + { + "epoch": 1.89157413455258, + "grad_norm": 0.9423314293580501, + "learning_rate": 0.0002407094666284719, + "loss": 7.2708, + "step": 20272 + }, + { + "epoch": 1.8916674442474575, + "grad_norm": 1.2683736400410894, + "learning_rate": 0.00024070344248284416, + "loss": 7.5238, + "step": 20273 + }, + { + "epoch": 1.8917607539423344, + "grad_norm": 1.0997424799333813, + "learning_rate": 0.00024069741810658446, + "loss": 7.515, + "step": 20274 + }, + { + "epoch": 1.8918540636372119, + "grad_norm": 1.2291177043338202, + "learning_rate": 0.00024069139349970807, + "loss": 7.4661, + "step": 20275 + }, + { + "epoch": 1.8919473733320893, + "grad_norm": 1.0307949176303892, + "learning_rate": 0.00024068536866223036, + "loss": 7.39, + "step": 20276 + }, + { + "epoch": 1.8920406830269665, + "grad_norm": 4.691409031676732e+28, + "learning_rate": 0.00024067934359416656, + "loss": 7.5715, + "step": 20277 + }, + { + "epoch": 1.8921339927218437, + "grad_norm": 1.6024951611843234, + "learning_rate": 0.00024067331829553213, + "loss": 7.5361, + "step": 20278 + }, + { + "epoch": 1.892227302416721, + "grad_norm": 3.313155369259984e+26, + "learning_rate": 0.00024066729276634227, + "loss": 7.5897, + "step": 20279 + }, + { + "epoch": 1.8923206121115985, + "grad_norm": 1.2999228958170992, + "learning_rate": 0.00024066126700661235, + "loss": 7.2252, + "step": 20280 + }, + { + "epoch": 1.8924139218064757, + "grad_norm": 2.996077694969194e+28, + "learning_rate": 0.0002406552410163577, + "loss": 7.7801, + "step": 20281 + }, + { + "epoch": 1.892507231501353, + "grad_norm": 1.795077651252944, + "learning_rate": 0.00024064921479559356, + "loss": 7.4733, + "step": 20282 + }, + { + "epoch": 1.8926005411962303, + "grad_norm": 1.1648245882584793, + "learning_rate": 0.00024064318834433537, + "loss": 7.5985, + "step": 20283 + }, + { + "epoch": 1.8926938508911075, + "grad_norm": 1.0569278713773993, + "learning_rate": 0.0002406371616625984, + "loss": 7.9255, + "step": 20284 + }, + { + "epoch": 1.8927871605859847, + "grad_norm": 2.6002041510769534, + "learning_rate": 0.00024063113475039798, + "loss": 7.2533, + "step": 20285 + }, + { + "epoch": 1.8928804702808621, + "grad_norm": 1.9315440613748283, + "learning_rate": 0.0002406251076077494, + "loss": 7.2604, + "step": 20286 + }, + { + "epoch": 1.8929737799757396, + "grad_norm": 1.1275683752466237, + "learning_rate": 0.00024061908023466805, + "loss": 7.467, + "step": 20287 + }, + { + "epoch": 1.8930670896706168, + "grad_norm": 5.791177103264544e+30, + "learning_rate": 0.00024061305263116923, + "loss": 7.4445, + "step": 20288 + }, + { + "epoch": 1.893160399365494, + "grad_norm": 1.4106864285647742, + "learning_rate": 0.0002406070247972682, + "loss": 7.6156, + "step": 20289 + }, + { + "epoch": 1.8932537090603714, + "grad_norm": 1.0370348958828088, + "learning_rate": 0.0002406009967329804, + "loss": 7.2372, + "step": 20290 + }, + { + "epoch": 1.8933470187552488, + "grad_norm": 1.7621014942814563, + "learning_rate": 0.00024059496843832113, + "loss": 7.5241, + "step": 20291 + }, + { + "epoch": 1.893440328450126, + "grad_norm": 2.3955059231676183, + "learning_rate": 0.00024058893991330563, + "loss": 7.9838, + "step": 20292 + }, + { + "epoch": 1.8935336381450032, + "grad_norm": 1.2099775709120457, + "learning_rate": 0.0002405829111579493, + "loss": 7.574, + "step": 20293 + }, + { + "epoch": 1.8936269478398806, + "grad_norm": 1.5914163215465629, + "learning_rate": 0.00024057688217226747, + "loss": 7.2251, + "step": 20294 + }, + { + "epoch": 1.8937202575347578, + "grad_norm": 1.783752321948912, + "learning_rate": 0.00024057085295627547, + "loss": 7.6245, + "step": 20295 + }, + { + "epoch": 1.893813567229635, + "grad_norm": 2.0493614362104224e+30, + "learning_rate": 0.00024056482350998863, + "loss": 7.5569, + "step": 20296 + }, + { + "epoch": 1.8939068769245124, + "grad_norm": 1.3348641663297107, + "learning_rate": 0.00024055879383342224, + "loss": 7.5847, + "step": 20297 + }, + { + "epoch": 1.8940001866193898, + "grad_norm": 1.3015574399618075, + "learning_rate": 0.00024055276392659166, + "loss": 7.486, + "step": 20298 + }, + { + "epoch": 1.894093496314267, + "grad_norm": 1.0552749478484642, + "learning_rate": 0.00024054673378951226, + "loss": 7.51, + "step": 20299 + }, + { + "epoch": 1.8941868060091442, + "grad_norm": 1.5015079019067457, + "learning_rate": 0.0002405407034221993, + "loss": 7.2678, + "step": 20300 + }, + { + "epoch": 1.8942801157040217, + "grad_norm": 2.7502455003396373, + "learning_rate": 0.00024053467282466817, + "loss": 8.0456, + "step": 20301 + }, + { + "epoch": 1.894373425398899, + "grad_norm": 1.0903558087480356, + "learning_rate": 0.0002405286419969342, + "loss": 7.6263, + "step": 20302 + }, + { + "epoch": 1.8944667350937763, + "grad_norm": 1.5237064838605179, + "learning_rate": 0.0002405226109390127, + "loss": 7.539, + "step": 20303 + }, + { + "epoch": 1.8945600447886535, + "grad_norm": 1.7384027679163165, + "learning_rate": 0.000240516579650919, + "loss": 7.5182, + "step": 20304 + }, + { + "epoch": 1.894653354483531, + "grad_norm": 1.1978363086257204, + "learning_rate": 0.00024051054813266844, + "loss": 7.745, + "step": 20305 + }, + { + "epoch": 1.894746664178408, + "grad_norm": 2.0671754436877428, + "learning_rate": 0.00024050451638427643, + "loss": 6.9698, + "step": 20306 + }, + { + "epoch": 1.8948399738732853, + "grad_norm": 1.9849421191806424e+32, + "learning_rate": 0.0002404984844057582, + "loss": 7.6371, + "step": 20307 + }, + { + "epoch": 1.8949332835681627, + "grad_norm": 1.5427776684282262, + "learning_rate": 0.00024049245219712908, + "loss": 7.5289, + "step": 20308 + }, + { + "epoch": 1.8950265932630401, + "grad_norm": 1.3634923603224123, + "learning_rate": 0.00024048641975840454, + "loss": 7.2697, + "step": 20309 + }, + { + "epoch": 1.8951199029579173, + "grad_norm": 1.0420585199861114, + "learning_rate": 0.00024048038708959985, + "loss": 7.4875, + "step": 20310 + }, + { + "epoch": 1.8952132126527945, + "grad_norm": 0.9948251296992741, + "learning_rate": 0.00024047435419073025, + "loss": 7.4854, + "step": 20311 + }, + { + "epoch": 1.895306522347672, + "grad_norm": 1.9295458382322117, + "learning_rate": 0.00024046832106181123, + "loss": 7.4303, + "step": 20312 + }, + { + "epoch": 1.8953998320425494, + "grad_norm": 4.891544839634162, + "learning_rate": 0.00024046228770285808, + "loss": 7.6207, + "step": 20313 + }, + { + "epoch": 1.8954931417374266, + "grad_norm": 1.037747614175716, + "learning_rate": 0.00024045625411388608, + "loss": 7.2117, + "step": 20314 + }, + { + "epoch": 1.8955864514323038, + "grad_norm": 1.1464178979972535, + "learning_rate": 0.00024045022029491067, + "loss": 7.2987, + "step": 20315 + }, + { + "epoch": 1.8956797611271812, + "grad_norm": 2.2451993267252353, + "learning_rate": 0.0002404441862459471, + "loss": 8.2175, + "step": 20316 + }, + { + "epoch": 1.8957730708220584, + "grad_norm": 8.619445746951614e+28, + "learning_rate": 0.00024043815196701077, + "loss": 7.6159, + "step": 20317 + }, + { + "epoch": 1.8958663805169356, + "grad_norm": 6.836724509851581e+31, + "learning_rate": 0.00024043211745811702, + "loss": 7.4452, + "step": 20318 + }, + { + "epoch": 1.895959690211813, + "grad_norm": 1.537235914779161, + "learning_rate": 0.0002404260827192812, + "loss": 7.3327, + "step": 20319 + }, + { + "epoch": 1.8960529999066904, + "grad_norm": 1.5907850838387363, + "learning_rate": 0.0002404200477505186, + "loss": 7.3237, + "step": 20320 + }, + { + "epoch": 1.8961463096015676, + "grad_norm": 1.3352935106916626, + "learning_rate": 0.0002404140125518446, + "loss": 7.6547, + "step": 20321 + }, + { + "epoch": 1.8962396192964448, + "grad_norm": 1.6659890766485874, + "learning_rate": 0.00024040797712327462, + "loss": 7.3877, + "step": 20322 + }, + { + "epoch": 1.8963329289913222, + "grad_norm": 1.0616522296985833, + "learning_rate": 0.00024040194146482386, + "loss": 7.4641, + "step": 20323 + }, + { + "epoch": 1.8964262386861996, + "grad_norm": 1.2903252908561305, + "learning_rate": 0.00024039590557650776, + "loss": 7.4257, + "step": 20324 + }, + { + "epoch": 1.8965195483810768, + "grad_norm": 1.032772062356304, + "learning_rate": 0.00024038986945834167, + "loss": 7.371, + "step": 20325 + }, + { + "epoch": 1.896612858075954, + "grad_norm": 1.69349736336618e+29, + "learning_rate": 0.00024038383311034088, + "loss": 7.4527, + "step": 20326 + }, + { + "epoch": 1.8967061677708315, + "grad_norm": 1.2139755344270131, + "learning_rate": 0.0002403777965325208, + "loss": 7.4636, + "step": 20327 + }, + { + "epoch": 1.8967994774657086, + "grad_norm": 1.5433728247959257, + "learning_rate": 0.00024037175972489676, + "loss": 7.5754, + "step": 20328 + }, + { + "epoch": 1.8968927871605858, + "grad_norm": 1.2778557546875826, + "learning_rate": 0.0002403657226874841, + "loss": 7.7384, + "step": 20329 + }, + { + "epoch": 1.8969860968554633, + "grad_norm": 1.5934629645180076, + "learning_rate": 0.00024035968542029818, + "loss": 7.352, + "step": 20330 + }, + { + "epoch": 1.8970794065503407, + "grad_norm": 1.4937386743626735, + "learning_rate": 0.00024035364792335435, + "loss": 7.5903, + "step": 20331 + }, + { + "epoch": 1.8971727162452179, + "grad_norm": 1.4533340190101602, + "learning_rate": 0.00024034761019666793, + "loss": 7.8669, + "step": 20332 + }, + { + "epoch": 1.897266025940095, + "grad_norm": 1.4838107938579321, + "learning_rate": 0.00024034157224025432, + "loss": 7.5614, + "step": 20333 + }, + { + "epoch": 1.8973593356349725, + "grad_norm": 1.180706097816111, + "learning_rate": 0.00024033553405412885, + "loss": 7.4139, + "step": 20334 + }, + { + "epoch": 1.89745264532985, + "grad_norm": 1.3311036046742875, + "learning_rate": 0.00024032949563830688, + "loss": 7.6654, + "step": 20335 + }, + { + "epoch": 1.8975459550247271, + "grad_norm": 1.3672338928454144, + "learning_rate": 0.00024032345699280373, + "loss": 7.6193, + "step": 20336 + }, + { + "epoch": 1.8976392647196043, + "grad_norm": 1.1883681853927157, + "learning_rate": 0.00024031741811763484, + "loss": 7.8235, + "step": 20337 + }, + { + "epoch": 1.8977325744144817, + "grad_norm": 1.0555473270675606, + "learning_rate": 0.0002403113790128155, + "loss": 7.6164, + "step": 20338 + }, + { + "epoch": 1.897825884109359, + "grad_norm": 1.086370888467186, + "learning_rate": 0.00024030533967836102, + "loss": 7.5539, + "step": 20339 + }, + { + "epoch": 1.8979191938042361, + "grad_norm": 1.4405531559236504, + "learning_rate": 0.00024029930011428685, + "loss": 7.3585, + "step": 20340 + }, + { + "epoch": 1.8980125034991135, + "grad_norm": 1.2691561674149632, + "learning_rate": 0.00024029326032060832, + "loss": 7.6206, + "step": 20341 + }, + { + "epoch": 1.898105813193991, + "grad_norm": 1.066181512127178, + "learning_rate": 0.00024028722029734078, + "loss": 7.573, + "step": 20342 + }, + { + "epoch": 1.8981991228888682, + "grad_norm": 1.4794607750275013, + "learning_rate": 0.00024028118004449953, + "loss": 7.2994, + "step": 20343 + }, + { + "epoch": 1.8982924325837454, + "grad_norm": 3.9376680254812476e+26, + "learning_rate": 0.00024027513956210008, + "loss": 7.5747, + "step": 20344 + }, + { + "epoch": 1.8983857422786228, + "grad_norm": 2.5181318282675895, + "learning_rate": 0.00024026909885015754, + "loss": 7.5806, + "step": 20345 + }, + { + "epoch": 1.8984790519735002, + "grad_norm": 1.2220952973214272, + "learning_rate": 0.00024026305790868756, + "loss": 7.3621, + "step": 20346 + }, + { + "epoch": 1.8985723616683774, + "grad_norm": 1.320458903761583, + "learning_rate": 0.0002402570167377053, + "loss": 7.602, + "step": 20347 + }, + { + "epoch": 1.8986656713632546, + "grad_norm": 1.04571643349391, + "learning_rate": 0.00024025097533722622, + "loss": 7.3888, + "step": 20348 + }, + { + "epoch": 1.898758981058132, + "grad_norm": 7378.403996434562, + "learning_rate": 0.00024024493370726564, + "loss": 7.5338, + "step": 20349 + }, + { + "epoch": 1.8988522907530092, + "grad_norm": 2.731172581376461, + "learning_rate": 0.00024023889184783887, + "loss": 7.3242, + "step": 20350 + }, + { + "epoch": 1.8989456004478864, + "grad_norm": 3.08698410442247e+29, + "learning_rate": 0.00024023284975896138, + "loss": 7.2937, + "step": 20351 + }, + { + "epoch": 1.8990389101427638, + "grad_norm": 5.554247503016467e+30, + "learning_rate": 0.00024022680744064848, + "loss": 7.481, + "step": 20352 + }, + { + "epoch": 1.8991322198376412, + "grad_norm": 1.0482255814629762, + "learning_rate": 0.0002402207648929155, + "loss": 7.5563, + "step": 20353 + }, + { + "epoch": 1.8992255295325184, + "grad_norm": 1.13184156309085, + "learning_rate": 0.00024021472211577791, + "loss": 7.4473, + "step": 20354 + }, + { + "epoch": 1.8993188392273956, + "grad_norm": 1.1263486923860264, + "learning_rate": 0.00024020867910925093, + "loss": 7.2013, + "step": 20355 + }, + { + "epoch": 1.899412148922273, + "grad_norm": 1.0448272231160924, + "learning_rate": 0.00024020263587335003, + "loss": 7.1875, + "step": 20356 + }, + { + "epoch": 1.8995054586171505, + "grad_norm": 1.9308925657427933, + "learning_rate": 0.00024019659240809052, + "loss": 7.4032, + "step": 20357 + }, + { + "epoch": 1.8995987683120277, + "grad_norm": 1.8856387226905744, + "learning_rate": 0.00024019054871348783, + "loss": 7.5297, + "step": 20358 + }, + { + "epoch": 1.8996920780069049, + "grad_norm": 1.534307714428093, + "learning_rate": 0.0002401845047895573, + "loss": 7.4737, + "step": 20359 + }, + { + "epoch": 1.8997853877017823, + "grad_norm": 1.343430127461852, + "learning_rate": 0.00024017846063631424, + "loss": 7.7928, + "step": 20360 + }, + { + "epoch": 1.8998786973966595, + "grad_norm": 1.1367441184111113, + "learning_rate": 0.0002401724162537741, + "loss": 7.482, + "step": 20361 + }, + { + "epoch": 1.8999720070915367, + "grad_norm": 1.1372349479635686, + "learning_rate": 0.00024016637164195223, + "loss": 7.7974, + "step": 20362 + }, + { + "epoch": 1.900065316786414, + "grad_norm": 2.0434891489897273, + "learning_rate": 0.000240160326800864, + "loss": 7.2091, + "step": 20363 + }, + { + "epoch": 1.9001586264812915, + "grad_norm": 8.857177823670008e+28, + "learning_rate": 0.00024015428173052472, + "loss": 7.5742, + "step": 20364 + }, + { + "epoch": 1.9002519361761687, + "grad_norm": 4.576520977781446e+29, + "learning_rate": 0.0002401482364309498, + "loss": 7.4535, + "step": 20365 + }, + { + "epoch": 1.900345245871046, + "grad_norm": 1.8141753198574098e+29, + "learning_rate": 0.00024014219090215464, + "loss": 7.0859, + "step": 20366 + }, + { + "epoch": 1.9004385555659233, + "grad_norm": 1.1852558440060033, + "learning_rate": 0.00024013614514415457, + "loss": 7.5998, + "step": 20367 + }, + { + "epoch": 1.9005318652608008, + "grad_norm": 1.1770017697961646, + "learning_rate": 0.00024013009915696502, + "loss": 7.4585, + "step": 20368 + }, + { + "epoch": 1.9006251749556777, + "grad_norm": 1.7622278203911506, + "learning_rate": 0.00024012405294060132, + "loss": 7.761, + "step": 20369 + }, + { + "epoch": 1.9007184846505552, + "grad_norm": 1.4227144729808074, + "learning_rate": 0.00024011800649507888, + "loss": 7.5043, + "step": 20370 + }, + { + "epoch": 1.9008117943454326, + "grad_norm": 1.5791738210961426, + "learning_rate": 0.000240111959820413, + "loss": 7.4529, + "step": 20371 + }, + { + "epoch": 1.9009051040403098, + "grad_norm": 1.2900718245187677, + "learning_rate": 0.00024010591291661907, + "loss": 7.7478, + "step": 20372 + }, + { + "epoch": 1.900998413735187, + "grad_norm": 1.1676879040475212, + "learning_rate": 0.00024009986578371256, + "loss": 7.5676, + "step": 20373 + }, + { + "epoch": 1.9010917234300644, + "grad_norm": 1.2730893244483408e+29, + "learning_rate": 0.00024009381842170875, + "loss": 7.1692, + "step": 20374 + }, + { + "epoch": 1.9011850331249418, + "grad_norm": 1.31812124687259, + "learning_rate": 0.00024008777083062304, + "loss": 7.4068, + "step": 20375 + }, + { + "epoch": 1.901278342819819, + "grad_norm": 1.1353262448870962, + "learning_rate": 0.0002400817230104708, + "loss": 7.444, + "step": 20376 + }, + { + "epoch": 1.9013716525146962, + "grad_norm": 1.1913849713519344, + "learning_rate": 0.00024007567496126748, + "loss": 7.6652, + "step": 20377 + }, + { + "epoch": 1.9014649622095736, + "grad_norm": 2.1622612824154862e+30, + "learning_rate": 0.00024006962668302837, + "loss": 7.3549, + "step": 20378 + }, + { + "epoch": 1.901558271904451, + "grad_norm": 1.2413917965688133, + "learning_rate": 0.00024006357817576887, + "loss": 7.7674, + "step": 20379 + }, + { + "epoch": 1.901651581599328, + "grad_norm": 1.1789644029672957, + "learning_rate": 0.00024005752943950437, + "loss": 7.8096, + "step": 20380 + }, + { + "epoch": 1.9017448912942054, + "grad_norm": 1.307489380724284, + "learning_rate": 0.00024005148047425025, + "loss": 7.5401, + "step": 20381 + }, + { + "epoch": 1.9018382009890828, + "grad_norm": 3.097445416926433e+30, + "learning_rate": 0.0002400454312800219, + "loss": 7.4665, + "step": 20382 + }, + { + "epoch": 1.90193151068396, + "grad_norm": 2.722367095194682e+29, + "learning_rate": 0.00024003938185683467, + "loss": 7.3408, + "step": 20383 + }, + { + "epoch": 1.9020248203788372, + "grad_norm": 2.1034978360498853, + "learning_rate": 0.00024003333220470402, + "loss": 7.0555, + "step": 20384 + }, + { + "epoch": 1.9021181300737147, + "grad_norm": 1.320386995763106, + "learning_rate": 0.00024002728232364523, + "loss": 7.2661, + "step": 20385 + }, + { + "epoch": 1.902211439768592, + "grad_norm": 1.306995865799322, + "learning_rate": 0.00024002123221367373, + "loss": 7.6964, + "step": 20386 + }, + { + "epoch": 1.9023047494634693, + "grad_norm": 1.2668561838847499, + "learning_rate": 0.0002400151818748049, + "loss": 7.7606, + "step": 20387 + }, + { + "epoch": 1.9023980591583465, + "grad_norm": 1.6164939663432967, + "learning_rate": 0.00024000913130705414, + "loss": 7.5271, + "step": 20388 + }, + { + "epoch": 1.902491368853224, + "grad_norm": 1.514413693922827, + "learning_rate": 0.00024000308051043683, + "loss": 7.2733, + "step": 20389 + }, + { + "epoch": 1.902584678548101, + "grad_norm": 1.807924954735129, + "learning_rate": 0.00023999702948496833, + "loss": 7.4802, + "step": 20390 + }, + { + "epoch": 1.9026779882429783, + "grad_norm": 3.224210011652342e+29, + "learning_rate": 0.00023999097823066407, + "loss": 7.5753, + "step": 20391 + }, + { + "epoch": 1.9027712979378557, + "grad_norm": 1.255638761491778, + "learning_rate": 0.00023998492674753936, + "loss": 7.2965, + "step": 20392 + }, + { + "epoch": 1.9028646076327331, + "grad_norm": 2.0883191006872788, + "learning_rate": 0.00023997887503560968, + "loss": 7.7047, + "step": 20393 + }, + { + "epoch": 1.9029579173276103, + "grad_norm": 1.2590615471721929, + "learning_rate": 0.0002399728230948904, + "loss": 7.5245, + "step": 20394 + }, + { + "epoch": 1.9030512270224875, + "grad_norm": 1.69652158562427, + "learning_rate": 0.00023996677092539683, + "loss": 7.4758, + "step": 20395 + }, + { + "epoch": 1.903144536717365, + "grad_norm": 1.1114492861797867, + "learning_rate": 0.0002399607185271444, + "loss": 7.2345, + "step": 20396 + }, + { + "epoch": 1.9032378464122424, + "grad_norm": 2.3564878986686137, + "learning_rate": 0.00023995466590014854, + "loss": 8.1682, + "step": 20397 + }, + { + "epoch": 1.9033311561071196, + "grad_norm": 1.4007059318463366, + "learning_rate": 0.0002399486130444246, + "loss": 7.6836, + "step": 20398 + }, + { + "epoch": 1.9034244658019968, + "grad_norm": 1.1123197600870922, + "learning_rate": 0.00023994255995998798, + "loss": 7.6971, + "step": 20399 + }, + { + "epoch": 1.9035177754968742, + "grad_norm": 4.267456737913346e+31, + "learning_rate": 0.0002399365066468541, + "loss": 7.334, + "step": 20400 + }, + { + "epoch": 1.9036110851917514, + "grad_norm": 1.8149192407680037, + "learning_rate": 0.00023993045310503831, + "loss": 6.9994, + "step": 20401 + }, + { + "epoch": 1.9037043948866286, + "grad_norm": 1.0784711480453688, + "learning_rate": 0.00023992439933455598, + "loss": 7.4394, + "step": 20402 + }, + { + "epoch": 1.903797704581506, + "grad_norm": 1.2840868796426315, + "learning_rate": 0.0002399183453354226, + "loss": 7.3122, + "step": 20403 + }, + { + "epoch": 1.9038910142763834, + "grad_norm": 2.272775469139664e+30, + "learning_rate": 0.00023991229110765348, + "loss": 7.8809, + "step": 20404 + }, + { + "epoch": 1.9039843239712606, + "grad_norm": 2.63513282834589, + "learning_rate": 0.00023990623665126403, + "loss": 7.9167, + "step": 20405 + }, + { + "epoch": 1.9040776336661378, + "grad_norm": 1.2400289051879713e+30, + "learning_rate": 0.00023990018196626967, + "loss": 7.7303, + "step": 20406 + }, + { + "epoch": 1.9041709433610152, + "grad_norm": 1.1842392627611231, + "learning_rate": 0.00023989412705268575, + "loss": 7.3737, + "step": 20407 + }, + { + "epoch": 1.9042642530558926, + "grad_norm": 1.0447636011354162, + "learning_rate": 0.00023988807191052772, + "loss": 7.5643, + "step": 20408 + }, + { + "epoch": 1.9043575627507698, + "grad_norm": 8.324412505866213e+29, + "learning_rate": 0.00023988201653981093, + "loss": 7.453, + "step": 20409 + }, + { + "epoch": 1.904450872445647, + "grad_norm": 3.03533968671692e+29, + "learning_rate": 0.00023987596094055084, + "loss": 7.593, + "step": 20410 + }, + { + "epoch": 1.9045441821405245, + "grad_norm": 1.1798733780502777, + "learning_rate": 0.00023986990511276274, + "loss": 7.7162, + "step": 20411 + }, + { + "epoch": 1.9046374918354017, + "grad_norm": 2.0503551772672415, + "learning_rate": 0.00023986384905646215, + "loss": 7.4938, + "step": 20412 + }, + { + "epoch": 1.9047308015302788, + "grad_norm": 1.447172271006286, + "learning_rate": 0.00023985779277166437, + "loss": 7.9547, + "step": 20413 + }, + { + "epoch": 1.9048241112251563, + "grad_norm": 1.5236135709108394, + "learning_rate": 0.00023985173625838484, + "loss": 7.5829, + "step": 20414 + }, + { + "epoch": 1.9049174209200337, + "grad_norm": 2.063098454165015, + "learning_rate": 0.000239845679516639, + "loss": 7.2178, + "step": 20415 + }, + { + "epoch": 1.9050107306149109, + "grad_norm": 1.6170264585010083, + "learning_rate": 0.00023983962254644218, + "loss": 7.5021, + "step": 20416 + }, + { + "epoch": 1.905104040309788, + "grad_norm": 1.371346450551374, + "learning_rate": 0.00023983356534780983, + "loss": 7.6172, + "step": 20417 + }, + { + "epoch": 1.9051973500046655, + "grad_norm": 1.4787257280956672, + "learning_rate": 0.0002398275079207573, + "loss": 7.549, + "step": 20418 + }, + { + "epoch": 1.905290659699543, + "grad_norm": 2.111300003090644, + "learning_rate": 0.00023982145026530007, + "loss": 7.8553, + "step": 20419 + }, + { + "epoch": 1.9053839693944201, + "grad_norm": 1.2668831405155215e+31, + "learning_rate": 0.00023981539238145347, + "loss": 7.4211, + "step": 20420 + }, + { + "epoch": 1.9054772790892973, + "grad_norm": 15.646220690824856, + "learning_rate": 0.0002398093342692329, + "loss": 7.4056, + "step": 20421 + }, + { + "epoch": 1.9055705887841747, + "grad_norm": 1.8878467112974922, + "learning_rate": 0.0002398032759286539, + "loss": 7.5937, + "step": 20422 + }, + { + "epoch": 1.905663898479052, + "grad_norm": 1.3972206215869813, + "learning_rate": 0.00023979721735973168, + "loss": 7.5772, + "step": 20423 + }, + { + "epoch": 1.9057572081739291, + "grad_norm": 1.198434551675453, + "learning_rate": 0.00023979115856248177, + "loss": 7.9279, + "step": 20424 + }, + { + "epoch": 1.9058505178688065, + "grad_norm": 1.6460435991779825, + "learning_rate": 0.00023978509953691951, + "loss": 7.3858, + "step": 20425 + }, + { + "epoch": 1.905943827563684, + "grad_norm": 1.8315831742279298, + "learning_rate": 0.00023977904028306036, + "loss": 7.8528, + "step": 20426 + }, + { + "epoch": 1.9060371372585612, + "grad_norm": 1.5272480319371142, + "learning_rate": 0.00023977298080091972, + "loss": 7.4304, + "step": 20427 + }, + { + "epoch": 1.9061304469534384, + "grad_norm": 1.3069720551250283, + "learning_rate": 0.00023976692109051298, + "loss": 7.6673, + "step": 20428 + }, + { + "epoch": 1.9062237566483158, + "grad_norm": 1.3217774116407879, + "learning_rate": 0.00023976086115185558, + "loss": 7.375, + "step": 20429 + }, + { + "epoch": 1.9063170663431932, + "grad_norm": 1.3808587659300384, + "learning_rate": 0.0002397548009849628, + "loss": 7.7891, + "step": 20430 + }, + { + "epoch": 1.9064103760380704, + "grad_norm": 2.406495451748592, + "learning_rate": 0.00023974874058985026, + "loss": 7.0075, + "step": 20431 + }, + { + "epoch": 1.9065036857329476, + "grad_norm": 1.8078340849306531, + "learning_rate": 0.00023974267996653318, + "loss": 7.6442, + "step": 20432 + }, + { + "epoch": 1.906596995427825, + "grad_norm": 7.293158710524088e+29, + "learning_rate": 0.00023973661911502706, + "loss": 7.5364, + "step": 20433 + }, + { + "epoch": 1.9066903051227022, + "grad_norm": 1.4014852077970954, + "learning_rate": 0.00023973055803534735, + "loss": 7.5963, + "step": 20434 + }, + { + "epoch": 1.9067836148175794, + "grad_norm": 1.6808074166428226, + "learning_rate": 0.00023972449672750938, + "loss": 7.7547, + "step": 20435 + }, + { + "epoch": 1.9068769245124568, + "grad_norm": 2.1360694570600813, + "learning_rate": 0.00023971843519152855, + "loss": 7.6394, + "step": 20436 + }, + { + "epoch": 1.9069702342073342, + "grad_norm": 8.681828174029111e+29, + "learning_rate": 0.00023971237342742039, + "loss": 7.5511, + "step": 20437 + }, + { + "epoch": 1.9070635439022114, + "grad_norm": 1.2197960294206218, + "learning_rate": 0.00023970631143520023, + "loss": 7.5532, + "step": 20438 + }, + { + "epoch": 1.9071568535970886, + "grad_norm": 1.8123325719792176, + "learning_rate": 0.00023970024921488348, + "loss": 7.5206, + "step": 20439 + }, + { + "epoch": 1.907250163291966, + "grad_norm": 1.6868604443663537, + "learning_rate": 0.00023969418676648554, + "loss": 7.4999, + "step": 20440 + }, + { + "epoch": 1.9073434729868435, + "grad_norm": 1.334822020375359, + "learning_rate": 0.00023968812409002187, + "loss": 7.6546, + "step": 20441 + }, + { + "epoch": 1.9074367826817207, + "grad_norm": 1.3537972445973918, + "learning_rate": 0.00023968206118550785, + "loss": 7.5248, + "step": 20442 + }, + { + "epoch": 1.9075300923765979, + "grad_norm": 1.331848905403522, + "learning_rate": 0.00023967599805295895, + "loss": 7.4838, + "step": 20443 + }, + { + "epoch": 1.9076234020714753, + "grad_norm": 1.26368845671796, + "learning_rate": 0.00023966993469239051, + "loss": 7.6196, + "step": 20444 + }, + { + "epoch": 1.9077167117663525, + "grad_norm": 1.4551611880864911e+31, + "learning_rate": 0.000239663871103818, + "loss": 7.6131, + "step": 20445 + }, + { + "epoch": 1.9078100214612297, + "grad_norm": 6.579050353728999e+29, + "learning_rate": 0.00023965780728725685, + "loss": 7.701, + "step": 20446 + }, + { + "epoch": 1.907903331156107, + "grad_norm": 1.811656506379625, + "learning_rate": 0.00023965174324272248, + "loss": 7.9556, + "step": 20447 + }, + { + "epoch": 1.9079966408509845, + "grad_norm": 1.5155345215454705, + "learning_rate": 0.00023964567897023025, + "loss": 7.0474, + "step": 20448 + }, + { + "epoch": 1.9080899505458617, + "grad_norm": 1.4557619537813915, + "learning_rate": 0.00023963961446979554, + "loss": 7.3599, + "step": 20449 + }, + { + "epoch": 1.908183260240739, + "grad_norm": 3.267431242157691, + "learning_rate": 0.00023963354974143396, + "loss": 7.3639, + "step": 20450 + }, + { + "epoch": 1.9082765699356163, + "grad_norm": 1.745366675595124, + "learning_rate": 0.00023962748478516076, + "loss": 7.9569, + "step": 20451 + }, + { + "epoch": 1.9083698796304938, + "grad_norm": 1.2218928409288397, + "learning_rate": 0.0002396214196009914, + "loss": 7.6543, + "step": 20452 + }, + { + "epoch": 1.908463189325371, + "grad_norm": 1.4220575403602451, + "learning_rate": 0.00023961535418894138, + "loss": 7.5418, + "step": 20453 + }, + { + "epoch": 1.9085564990202482, + "grad_norm": 1.6102429599963486, + "learning_rate": 0.000239609288549026, + "loss": 7.2174, + "step": 20454 + }, + { + "epoch": 1.9086498087151256, + "grad_norm": 1.1799597356467593, + "learning_rate": 0.00023960322268126076, + "loss": 7.4982, + "step": 20455 + }, + { + "epoch": 1.9087431184100028, + "grad_norm": 1.2627599934761793, + "learning_rate": 0.00023959715658566105, + "loss": 7.2245, + "step": 20456 + }, + { + "epoch": 1.90883642810488, + "grad_norm": 1.1526833642918957, + "learning_rate": 0.00023959109026224236, + "loss": 7.6019, + "step": 20457 + }, + { + "epoch": 1.9089297377997574, + "grad_norm": 1.3787504969696809, + "learning_rate": 0.00023958502371102002, + "loss": 7.4954, + "step": 20458 + }, + { + "epoch": 1.9090230474946348, + "grad_norm": 7.889809030438366e+28, + "learning_rate": 0.00023957895693200952, + "loss": 7.1766, + "step": 20459 + }, + { + "epoch": 1.909116357189512, + "grad_norm": 1.2870356017808875, + "learning_rate": 0.0002395728899252263, + "loss": 7.5243, + "step": 20460 + }, + { + "epoch": 1.9092096668843892, + "grad_norm": 6.007274650565157e+30, + "learning_rate": 0.00023956682269068573, + "loss": 7.4146, + "step": 20461 + }, + { + "epoch": 1.9093029765792666, + "grad_norm": 9.14053362475543e+30, + "learning_rate": 0.00023956075522840322, + "loss": 7.6721, + "step": 20462 + }, + { + "epoch": 1.909396286274144, + "grad_norm": 1.2002797157855893, + "learning_rate": 0.00023955468753839428, + "loss": 7.3509, + "step": 20463 + }, + { + "epoch": 1.9094895959690212, + "grad_norm": 1.1992431123248855, + "learning_rate": 0.00023954861962067429, + "loss": 7.3702, + "step": 20464 + }, + { + "epoch": 1.9095829056638984, + "grad_norm": 8.49358137490953e+30, + "learning_rate": 0.0002395425514752587, + "loss": 7.9325, + "step": 20465 + }, + { + "epoch": 1.9096762153587759, + "grad_norm": 1.1348717108211124, + "learning_rate": 0.0002395364831021629, + "loss": 7.625, + "step": 20466 + }, + { + "epoch": 1.909769525053653, + "grad_norm": 1.0924238991809951, + "learning_rate": 0.0002395304145014024, + "loss": 7.4765, + "step": 20467 + }, + { + "epoch": 1.9098628347485302, + "grad_norm": 2.3217998527364225e+31, + "learning_rate": 0.0002395243456729925, + "loss": 7.4363, + "step": 20468 + }, + { + "epoch": 1.9099561444434077, + "grad_norm": 1.1788419845698233, + "learning_rate": 0.00023951827661694875, + "loss": 7.5584, + "step": 20469 + }, + { + "epoch": 1.910049454138285, + "grad_norm": 1.4509998684321512, + "learning_rate": 0.00023951220733328652, + "loss": 7.5919, + "step": 20470 + }, + { + "epoch": 1.9101427638331623, + "grad_norm": 1.371799850532158, + "learning_rate": 0.00023950613782202127, + "loss": 7.2394, + "step": 20471 + }, + { + "epoch": 1.9102360735280395, + "grad_norm": 1.4130457025582026, + "learning_rate": 0.00023950006808316844, + "loss": 7.5188, + "step": 20472 + }, + { + "epoch": 1.910329383222917, + "grad_norm": 1.4676673892753945e+31, + "learning_rate": 0.00023949399811674346, + "loss": 7.4874, + "step": 20473 + }, + { + "epoch": 1.9104226929177943, + "grad_norm": 1.3389011615818078, + "learning_rate": 0.00023948792792276172, + "loss": 7.4471, + "step": 20474 + }, + { + "epoch": 1.9105160026126713, + "grad_norm": 1.5585581521787706e+31, + "learning_rate": 0.00023948185750123868, + "loss": 7.6016, + "step": 20475 + }, + { + "epoch": 1.9106093123075487, + "grad_norm": 1.1464185688881852, + "learning_rate": 0.00023947578685218983, + "loss": 7.5036, + "step": 20476 + }, + { + "epoch": 1.9107026220024261, + "grad_norm": 1.6310117704583345, + "learning_rate": 0.00023946971597563053, + "loss": 7.1541, + "step": 20477 + }, + { + "epoch": 1.9107959316973033, + "grad_norm": 6.869936870233788e+30, + "learning_rate": 0.00023946364487157623, + "loss": 7.672, + "step": 20478 + }, + { + "epoch": 1.9108892413921805, + "grad_norm": 1.2725805360547037e+31, + "learning_rate": 0.0002394575735400424, + "loss": 7.4392, + "step": 20479 + }, + { + "epoch": 1.910982551087058, + "grad_norm": 2.3258616910678667, + "learning_rate": 0.00023945150198104447, + "loss": 7.4038, + "step": 20480 + }, + { + "epoch": 1.9110758607819354, + "grad_norm": 1.8096680955415947, + "learning_rate": 0.00023944543019459788, + "loss": 7.2573, + "step": 20481 + }, + { + "epoch": 1.9111691704768126, + "grad_norm": 1.8487883334162967, + "learning_rate": 0.00023943935818071803, + "loss": 7.1894, + "step": 20482 + }, + { + "epoch": 1.9112624801716898, + "grad_norm": 1.8144074899369041e+31, + "learning_rate": 0.00023943328593942039, + "loss": 7.8276, + "step": 20483 + }, + { + "epoch": 1.9113557898665672, + "grad_norm": 2.2744546230707385, + "learning_rate": 0.0002394272134707204, + "loss": 7.7057, + "step": 20484 + }, + { + "epoch": 1.9114490995614446, + "grad_norm": 1.5735110643552672, + "learning_rate": 0.00023942114077463348, + "loss": 7.5994, + "step": 20485 + }, + { + "epoch": 1.9115424092563216, + "grad_norm": 1.0634214963951563, + "learning_rate": 0.0002394150678511751, + "loss": 7.1464, + "step": 20486 + }, + { + "epoch": 1.911635718951199, + "grad_norm": 1.5468734885330602, + "learning_rate": 0.0002394089947003607, + "loss": 7.7557, + "step": 20487 + }, + { + "epoch": 1.9117290286460764, + "grad_norm": 1.613008298522374, + "learning_rate": 0.0002394029213222057, + "loss": 7.5226, + "step": 20488 + }, + { + "epoch": 1.9118223383409536, + "grad_norm": 1.1313772044089934, + "learning_rate": 0.00023939684771672555, + "loss": 7.642, + "step": 20489 + }, + { + "epoch": 1.9119156480358308, + "grad_norm": 1.6098816336798616, + "learning_rate": 0.0002393907738839357, + "loss": 7.2619, + "step": 20490 + }, + { + "epoch": 1.9120089577307082, + "grad_norm": 1.2498189179956871, + "learning_rate": 0.00023938469982385163, + "loss": 7.6288, + "step": 20491 + }, + { + "epoch": 1.9121022674255856, + "grad_norm": 1.8503390871299064, + "learning_rate": 0.00023937862553648868, + "loss": 7.2744, + "step": 20492 + }, + { + "epoch": 1.9121955771204628, + "grad_norm": 7.291870915344947e+31, + "learning_rate": 0.00023937255102186237, + "loss": 7.7776, + "step": 20493 + }, + { + "epoch": 1.91228888681534, + "grad_norm": 2.8538202666159013e+29, + "learning_rate": 0.00023936647627998816, + "loss": 7.1614, + "step": 20494 + }, + { + "epoch": 1.9123821965102175, + "grad_norm": 1.4846213990474852, + "learning_rate": 0.00023936040131088148, + "loss": 7.4073, + "step": 20495 + }, + { + "epoch": 1.9124755062050947, + "grad_norm": 2.2723380647700515, + "learning_rate": 0.00023935432611455777, + "loss": 7.5684, + "step": 20496 + }, + { + "epoch": 1.9125688158999719, + "grad_norm": 1.7120944095324445e+31, + "learning_rate": 0.00023934825069103245, + "loss": 7.6282, + "step": 20497 + }, + { + "epoch": 1.9126621255948493, + "grad_norm": 1.2922606202219888, + "learning_rate": 0.00023934217504032105, + "loss": 7.587, + "step": 20498 + }, + { + "epoch": 1.9127554352897267, + "grad_norm": 1.608224013065677, + "learning_rate": 0.0002393360991624389, + "loss": 7.5548, + "step": 20499 + }, + { + "epoch": 1.9128487449846039, + "grad_norm": 1.0963823626948208, + "learning_rate": 0.00023933002305740153, + "loss": 7.4304, + "step": 20500 + }, + { + "epoch": 1.912942054679481, + "grad_norm": 2.739124383682641e+29, + "learning_rate": 0.00023932394672522445, + "loss": 7.0745, + "step": 20501 + }, + { + "epoch": 1.9130353643743585, + "grad_norm": 1.242498882367742, + "learning_rate": 0.0002393178701659229, + "loss": 7.3382, + "step": 20502 + }, + { + "epoch": 1.913128674069236, + "grad_norm": 1.1324658708632396, + "learning_rate": 0.00023931179337951256, + "loss": 7.5117, + "step": 20503 + }, + { + "epoch": 1.9132219837641131, + "grad_norm": 1.6867552256545, + "learning_rate": 0.00023930571636600874, + "loss": 7.6729, + "step": 20504 + }, + { + "epoch": 1.9133152934589903, + "grad_norm": 1.7196376688627173, + "learning_rate": 0.00023929963912542698, + "loss": 7.5291, + "step": 20505 + }, + { + "epoch": 1.9134086031538677, + "grad_norm": 1.2914907588575129, + "learning_rate": 0.00023929356165778263, + "loss": 7.4534, + "step": 20506 + }, + { + "epoch": 1.913501912848745, + "grad_norm": 1.7448739309404966, + "learning_rate": 0.00023928748396309126, + "loss": 7.6626, + "step": 20507 + }, + { + "epoch": 1.9135952225436221, + "grad_norm": 1.8470316211278826, + "learning_rate": 0.00023928140604136827, + "loss": 7.8105, + "step": 20508 + }, + { + "epoch": 1.9136885322384996, + "grad_norm": 1.6379735994219544, + "learning_rate": 0.00023927532789262907, + "loss": 7.1785, + "step": 20509 + }, + { + "epoch": 1.913781841933377, + "grad_norm": 1.4278050149326402, + "learning_rate": 0.00023926924951688915, + "loss": 7.4725, + "step": 20510 + }, + { + "epoch": 1.9138751516282542, + "grad_norm": 1.6636196846117783, + "learning_rate": 0.000239263170914164, + "loss": 7.4566, + "step": 20511 + }, + { + "epoch": 1.9139684613231314, + "grad_norm": 1.2166378268247597, + "learning_rate": 0.00023925709208446903, + "loss": 7.6608, + "step": 20512 + }, + { + "epoch": 1.9140617710180088, + "grad_norm": 1.4563535907696155e+30, + "learning_rate": 0.00023925101302781977, + "loss": 7.4879, + "step": 20513 + }, + { + "epoch": 1.9141550807128862, + "grad_norm": 1.8698963318716963, + "learning_rate": 0.00023924493374423157, + "loss": 7.4926, + "step": 20514 + }, + { + "epoch": 1.9142483904077634, + "grad_norm": 1.3905854678631964, + "learning_rate": 0.00023923885423371996, + "loss": 7.3523, + "step": 20515 + }, + { + "epoch": 1.9143417001026406, + "grad_norm": 1.7161773862137813e+30, + "learning_rate": 0.00023923277449630035, + "loss": 7.6423, + "step": 20516 + }, + { + "epoch": 1.914435009797518, + "grad_norm": 1.2646212668526338, + "learning_rate": 0.00023922669453198823, + "loss": 7.3693, + "step": 20517 + }, + { + "epoch": 1.9145283194923952, + "grad_norm": 1.2595709413298128, + "learning_rate": 0.00023922061434079907, + "loss": 7.76, + "step": 20518 + }, + { + "epoch": 1.9146216291872724, + "grad_norm": 1.225476935648374, + "learning_rate": 0.00023921453392274832, + "loss": 7.5961, + "step": 20519 + }, + { + "epoch": 1.9147149388821498, + "grad_norm": 1.144006270096506, + "learning_rate": 0.00023920845327785146, + "loss": 7.5481, + "step": 20520 + }, + { + "epoch": 1.9148082485770272, + "grad_norm": 1.1694881438638678, + "learning_rate": 0.0002392023724061239, + "loss": 7.5032, + "step": 20521 + }, + { + "epoch": 1.9149015582719044, + "grad_norm": 5.378066501185592e+30, + "learning_rate": 0.00023919629130758112, + "loss": 7.4132, + "step": 20522 + }, + { + "epoch": 1.9149948679667816, + "grad_norm": 1.0980690838910752, + "learning_rate": 0.00023919020998223862, + "loss": 7.3061, + "step": 20523 + }, + { + "epoch": 1.915088177661659, + "grad_norm": 1.3552921309563333, + "learning_rate": 0.00023918412843011182, + "loss": 7.4812, + "step": 20524 + }, + { + "epoch": 1.9151814873565365, + "grad_norm": 10.521276804750725, + "learning_rate": 0.00023917804665121622, + "loss": 7.5935, + "step": 20525 + }, + { + "epoch": 1.9152747970514137, + "grad_norm": 4.112540834139594e+29, + "learning_rate": 0.00023917196464556725, + "loss": 7.7186, + "step": 20526 + }, + { + "epoch": 1.9153681067462909, + "grad_norm": 1.278755212662014, + "learning_rate": 0.00023916588241318037, + "loss": 7.6914, + "step": 20527 + }, + { + "epoch": 1.9154614164411683, + "grad_norm": 1.405344003280641, + "learning_rate": 0.00023915979995407108, + "loss": 7.5634, + "step": 20528 + }, + { + "epoch": 1.9155547261360455, + "grad_norm": 3.627741310919937e+30, + "learning_rate": 0.00023915371726825483, + "loss": 7.6014, + "step": 20529 + }, + { + "epoch": 1.9156480358309227, + "grad_norm": 1.6815140626201002, + "learning_rate": 0.00023914763435574712, + "loss": 7.2631, + "step": 20530 + }, + { + "epoch": 1.9157413455258, + "grad_norm": 1.2193009295468173, + "learning_rate": 0.0002391415512165633, + "loss": 7.3981, + "step": 20531 + }, + { + "epoch": 1.9158346552206775, + "grad_norm": 1.4169493956782209, + "learning_rate": 0.00023913546785071902, + "loss": 8.0034, + "step": 20532 + }, + { + "epoch": 1.9159279649155547, + "grad_norm": 1.3544885309417163, + "learning_rate": 0.00023912938425822957, + "loss": 7.3216, + "step": 20533 + }, + { + "epoch": 1.916021274610432, + "grad_norm": 1.1641834206094634, + "learning_rate": 0.00023912330043911054, + "loss": 7.4979, + "step": 20534 + }, + { + "epoch": 1.9161145843053093, + "grad_norm": 1.7667128522634965, + "learning_rate": 0.00023911721639337735, + "loss": 7.4899, + "step": 20535 + }, + { + "epoch": 1.9162078940001868, + "grad_norm": 1.3037668199805363, + "learning_rate": 0.00023911113212104546, + "loss": 7.7779, + "step": 20536 + }, + { + "epoch": 1.916301203695064, + "grad_norm": 1.0061847537310491e+31, + "learning_rate": 0.00023910504762213037, + "loss": 7.4654, + "step": 20537 + }, + { + "epoch": 1.9163945133899412, + "grad_norm": 1.0953484819804422, + "learning_rate": 0.00023909896289664757, + "loss": 7.5832, + "step": 20538 + }, + { + "epoch": 1.9164878230848186, + "grad_norm": 1.4197369459401334, + "learning_rate": 0.00023909287794461246, + "loss": 7.1109, + "step": 20539 + }, + { + "epoch": 1.9165811327796958, + "grad_norm": 1.552827822244079, + "learning_rate": 0.00023908679276604054, + "loss": 7.5166, + "step": 20540 + }, + { + "epoch": 1.916674442474573, + "grad_norm": 9.095106254073881e+30, + "learning_rate": 0.00023908070736094732, + "loss": 7.4571, + "step": 20541 + }, + { + "epoch": 1.9167677521694504, + "grad_norm": 1.656597551913908, + "learning_rate": 0.00023907462172934827, + "loss": 7.3076, + "step": 20542 + }, + { + "epoch": 1.9168610618643278, + "grad_norm": 1.7003565857092782e+30, + "learning_rate": 0.0002390685358712588, + "loss": 7.4784, + "step": 20543 + }, + { + "epoch": 1.916954371559205, + "grad_norm": 1.193145266271155, + "learning_rate": 0.00023906244978669447, + "loss": 7.5701, + "step": 20544 + }, + { + "epoch": 1.9170476812540822, + "grad_norm": 2.8320805928130115e+31, + "learning_rate": 0.0002390563634756707, + "loss": 7.3669, + "step": 20545 + }, + { + "epoch": 1.9171409909489596, + "grad_norm": 1.6344265929814743, + "learning_rate": 0.00023905027693820293, + "loss": 7.1432, + "step": 20546 + }, + { + "epoch": 1.917234300643837, + "grad_norm": 1.4144324909414459, + "learning_rate": 0.00023904419017430672, + "loss": 7.4495, + "step": 20547 + }, + { + "epoch": 1.9173276103387142, + "grad_norm": 2.111061318046938, + "learning_rate": 0.00023903810318399754, + "loss": 7.2698, + "step": 20548 + }, + { + "epoch": 1.9174209200335914, + "grad_norm": 9.085788376871152e+31, + "learning_rate": 0.0002390320159672908, + "loss": 7.6568, + "step": 20549 + }, + { + "epoch": 1.9175142297284689, + "grad_norm": 1.9562908935227425, + "learning_rate": 0.00023902592852420202, + "loss": 7.9107, + "step": 20550 + }, + { + "epoch": 1.917607539423346, + "grad_norm": 1.873985679375659, + "learning_rate": 0.00023901984085474668, + "loss": 7.5213, + "step": 20551 + }, + { + "epoch": 1.9177008491182232, + "grad_norm": 1.4809819934084776, + "learning_rate": 0.00023901375295894026, + "loss": 7.2652, + "step": 20552 + }, + { + "epoch": 1.9177941588131007, + "grad_norm": 1.7140955034762846, + "learning_rate": 0.00023900766483679823, + "loss": 7.3347, + "step": 20553 + }, + { + "epoch": 1.917887468507978, + "grad_norm": 1.9925883313740285, + "learning_rate": 0.00023900157648833607, + "loss": 7.0002, + "step": 20554 + }, + { + "epoch": 1.9179807782028553, + "grad_norm": 1.1357189457024213, + "learning_rate": 0.00023899548791356929, + "loss": 7.5221, + "step": 20555 + }, + { + "epoch": 1.9180740878977325, + "grad_norm": 1.4076483654605731, + "learning_rate": 0.00023898939911251333, + "loss": 7.5532, + "step": 20556 + }, + { + "epoch": 1.91816739759261, + "grad_norm": 4.7529442751619045e+29, + "learning_rate": 0.00023898331008518364, + "loss": 7.0048, + "step": 20557 + }, + { + "epoch": 1.9182607072874873, + "grad_norm": 2.1950587560168695, + "learning_rate": 0.00023897722083159582, + "loss": 7.8405, + "step": 20558 + }, + { + "epoch": 1.9183540169823645, + "grad_norm": 1.493014265192354, + "learning_rate": 0.00023897113135176528, + "loss": 7.6173, + "step": 20559 + }, + { + "epoch": 1.9184473266772417, + "grad_norm": 1.4762412125033202, + "learning_rate": 0.0002389650416457075, + "loss": 7.2034, + "step": 20560 + }, + { + "epoch": 1.9185406363721191, + "grad_norm": 1.7666829319559085, + "learning_rate": 0.00023895895171343795, + "loss": 7.5923, + "step": 20561 + }, + { + "epoch": 1.9186339460669963, + "grad_norm": 1.7823853493004065, + "learning_rate": 0.00023895286155497215, + "loss": 7.8057, + "step": 20562 + }, + { + "epoch": 1.9187272557618735, + "grad_norm": 2.6322664720424424e+31, + "learning_rate": 0.00023894677117032557, + "loss": 7.3549, + "step": 20563 + }, + { + "epoch": 1.918820565456751, + "grad_norm": 1.1234326642823274, + "learning_rate": 0.00023894068055951373, + "loss": 7.829, + "step": 20564 + }, + { + "epoch": 1.9189138751516284, + "grad_norm": 1.4769760217782149, + "learning_rate": 0.00023893458972255204, + "loss": 7.6736, + "step": 20565 + }, + { + "epoch": 1.9190071848465056, + "grad_norm": 1.299506382656787, + "learning_rate": 0.00023892849865945603, + "loss": 7.5685, + "step": 20566 + }, + { + "epoch": 1.9191004945413828, + "grad_norm": 1.1306173333556, + "learning_rate": 0.00023892240737024122, + "loss": 7.6203, + "step": 20567 + }, + { + "epoch": 1.9191938042362602, + "grad_norm": 1.1508086087622136, + "learning_rate": 0.00023891631585492304, + "loss": 7.5302, + "step": 20568 + }, + { + "epoch": 1.9192871139311376, + "grad_norm": 4.458168242344001e+31, + "learning_rate": 0.00023891022411351705, + "loss": 7.3413, + "step": 20569 + }, + { + "epoch": 1.9193804236260148, + "grad_norm": 1.6691228892846115, + "learning_rate": 0.00023890413214603867, + "loss": 7.2882, + "step": 20570 + }, + { + "epoch": 1.919473733320892, + "grad_norm": 1.3154666052179167, + "learning_rate": 0.0002388980399525034, + "loss": 7.6575, + "step": 20571 + }, + { + "epoch": 1.9195670430157694, + "grad_norm": 1.1364272660725838, + "learning_rate": 0.00023889194753292677, + "loss": 7.2951, + "step": 20572 + }, + { + "epoch": 1.9196603527106466, + "grad_norm": 1.45088752987347e+30, + "learning_rate": 0.00023888585488732428, + "loss": 7.3048, + "step": 20573 + }, + { + "epoch": 1.9197536624055238, + "grad_norm": 1.5431127760002055, + "learning_rate": 0.00023887976201571138, + "loss": 7.4608, + "step": 20574 + }, + { + "epoch": 1.9198469721004012, + "grad_norm": 1.913356270008289, + "learning_rate": 0.0002388736689181035, + "loss": 7.7204, + "step": 20575 + }, + { + "epoch": 1.9199402817952786, + "grad_norm": 2.935211440542473, + "learning_rate": 0.00023886757559451628, + "loss": 7.6612, + "step": 20576 + }, + { + "epoch": 1.9200335914901558, + "grad_norm": 2.1374484001313183e+32, + "learning_rate": 0.00023886148204496513, + "loss": 7.2786, + "step": 20577 + }, + { + "epoch": 1.920126901185033, + "grad_norm": 1.2466124120892161, + "learning_rate": 0.00023885538826946552, + "loss": 7.6269, + "step": 20578 + }, + { + "epoch": 1.9202202108799105, + "grad_norm": 1.5098007399982294, + "learning_rate": 0.00023884929426803303, + "loss": 7.7049, + "step": 20579 + }, + { + "epoch": 1.9203135205747879, + "grad_norm": 1.5657717866102063, + "learning_rate": 0.0002388432000406831, + "loss": 7.6217, + "step": 20580 + }, + { + "epoch": 1.9204068302696649, + "grad_norm": 1.5190323898877056e+32, + "learning_rate": 0.00023883710558743118, + "loss": 7.201, + "step": 20581 + }, + { + "epoch": 1.9205001399645423, + "grad_norm": 1.090909729823175, + "learning_rate": 0.00023883101090829283, + "loss": 7.6251, + "step": 20582 + }, + { + "epoch": 1.9205934496594197, + "grad_norm": 1.4302839095127737, + "learning_rate": 0.00023882491600328356, + "loss": 7.5777, + "step": 20583 + }, + { + "epoch": 1.920686759354297, + "grad_norm": 1.1769139395096146, + "learning_rate": 0.0002388188208724188, + "loss": 7.3055, + "step": 20584 + }, + { + "epoch": 1.920780069049174, + "grad_norm": 1.7648008146101202, + "learning_rate": 0.0002388127255157141, + "loss": 7.6182, + "step": 20585 + }, + { + "epoch": 1.9208733787440515, + "grad_norm": 1.1637234273737216, + "learning_rate": 0.000238806629933185, + "loss": 7.3086, + "step": 20586 + }, + { + "epoch": 1.920966688438929, + "grad_norm": 1.8115863363722646, + "learning_rate": 0.00023880053412484687, + "loss": 8.0685, + "step": 20587 + }, + { + "epoch": 1.9210599981338061, + "grad_norm": 1.46483871471359, + "learning_rate": 0.00023879443809071536, + "loss": 7.5361, + "step": 20588 + }, + { + "epoch": 1.9211533078286833, + "grad_norm": 1.48664408506257, + "learning_rate": 0.00023878834183080585, + "loss": 7.4298, + "step": 20589 + }, + { + "epoch": 1.9212466175235607, + "grad_norm": 1.2955396012350509, + "learning_rate": 0.00023878224534513388, + "loss": 7.638, + "step": 20590 + }, + { + "epoch": 1.9213399272184382, + "grad_norm": 1.1673296766807286, + "learning_rate": 0.00023877614863371501, + "loss": 7.2687, + "step": 20591 + }, + { + "epoch": 1.9214332369133151, + "grad_norm": 1.7179667832265102, + "learning_rate": 0.0002387700516965646, + "loss": 7.3559, + "step": 20592 + }, + { + "epoch": 1.9215265466081926, + "grad_norm": 7.536764065947197e+32, + "learning_rate": 0.00023876395453369833, + "loss": 7.4963, + "step": 20593 + }, + { + "epoch": 1.92161985630307, + "grad_norm": 1.919532552691484e+32, + "learning_rate": 0.00023875785714513162, + "loss": 7.1619, + "step": 20594 + }, + { + "epoch": 1.9217131659979472, + "grad_norm": 1.903451670432455, + "learning_rate": 0.0002387517595308799, + "loss": 7.5095, + "step": 20595 + }, + { + "epoch": 1.9218064756928244, + "grad_norm": 1.2326597305281313, + "learning_rate": 0.00023874566169095887, + "loss": 7.5501, + "step": 20596 + }, + { + "epoch": 1.9218997853877018, + "grad_norm": 3.9721429346108768e+31, + "learning_rate": 0.00023873956362538374, + "loss": 7.6496, + "step": 20597 + }, + { + "epoch": 1.9219930950825792, + "grad_norm": 1.4409055918174334, + "learning_rate": 0.00023873346533417032, + "loss": 7.3124, + "step": 20598 + }, + { + "epoch": 1.9220864047774564, + "grad_norm": 1.501720318735475, + "learning_rate": 0.00023872736681733396, + "loss": 7.1711, + "step": 20599 + }, + { + "epoch": 1.9221797144723336, + "grad_norm": 1.1728651465205573, + "learning_rate": 0.00023872126807489012, + "loss": 7.3111, + "step": 20600 + }, + { + "epoch": 1.922273024167211, + "grad_norm": 14.733104971746304, + "learning_rate": 0.0002387151691068544, + "loss": 7.4032, + "step": 20601 + }, + { + "epoch": 1.9223663338620882, + "grad_norm": 9.68805365949338e+33, + "learning_rate": 0.00023870906991324236, + "loss": 7.554, + "step": 20602 + }, + { + "epoch": 1.9224596435569654, + "grad_norm": 1.5725200526236947, + "learning_rate": 0.00023870297049406936, + "loss": 7.7105, + "step": 20603 + }, + { + "epoch": 1.9225529532518428, + "grad_norm": 4.6475704176353313e+33, + "learning_rate": 0.000238696870849351, + "loss": 7.8781, + "step": 20604 + }, + { + "epoch": 1.9226462629467203, + "grad_norm": 1.163072385717324, + "learning_rate": 0.0002386907709791028, + "loss": 7.1458, + "step": 20605 + }, + { + "epoch": 1.9227395726415974, + "grad_norm": 1.1065906067356868, + "learning_rate": 0.00023868467088334017, + "loss": 7.4663, + "step": 20606 + }, + { + "epoch": 1.9228328823364746, + "grad_norm": 1.6980465560952183, + "learning_rate": 0.00023867857056207876, + "loss": 7.3839, + "step": 20607 + }, + { + "epoch": 1.922926192031352, + "grad_norm": 1.5030518400913606, + "learning_rate": 0.000238672470015334, + "loss": 7.4768, + "step": 20608 + }, + { + "epoch": 1.9230195017262295, + "grad_norm": 1.508682296203855, + "learning_rate": 0.00023866636924312138, + "loss": 7.6716, + "step": 20609 + }, + { + "epoch": 1.9231128114211067, + "grad_norm": 1.4433576264558632, + "learning_rate": 0.0002386602682454565, + "loss": 7.3581, + "step": 20610 + }, + { + "epoch": 1.9232061211159839, + "grad_norm": 3.84275054207659, + "learning_rate": 0.00023865416702235478, + "loss": 7.1938, + "step": 20611 + }, + { + "epoch": 1.9232994308108613, + "grad_norm": 1.7480504701919921, + "learning_rate": 0.00023864806557383176, + "loss": 7.7845, + "step": 20612 + }, + { + "epoch": 1.9233927405057385, + "grad_norm": 2.1291227976879377, + "learning_rate": 0.00023864196389990303, + "loss": 7.5508, + "step": 20613 + }, + { + "epoch": 1.9234860502006157, + "grad_norm": 1.4656620062322234, + "learning_rate": 0.000238635862000584, + "loss": 7.5144, + "step": 20614 + }, + { + "epoch": 1.9235793598954931, + "grad_norm": 1.2073417600698024, + "learning_rate": 0.0002386297598758902, + "loss": 7.559, + "step": 20615 + }, + { + "epoch": 1.9236726695903705, + "grad_norm": 1.256598170738484, + "learning_rate": 0.00023862365752583728, + "loss": 7.441, + "step": 20616 + }, + { + "epoch": 1.9237659792852477, + "grad_norm": 1.5250060361596733, + "learning_rate": 0.0002386175549504406, + "loss": 7.3506, + "step": 20617 + }, + { + "epoch": 1.923859288980125, + "grad_norm": 1.1704977068238407, + "learning_rate": 0.00023861145214971568, + "loss": 7.7352, + "step": 20618 + }, + { + "epoch": 1.9239525986750023, + "grad_norm": 1.5803182591455567, + "learning_rate": 0.00023860534912367815, + "loss": 7.6309, + "step": 20619 + }, + { + "epoch": 1.9240459083698798, + "grad_norm": 1.3712582501818222, + "learning_rate": 0.00023859924587234344, + "loss": 7.556, + "step": 20620 + }, + { + "epoch": 1.924139218064757, + "grad_norm": 1.5765127789204385, + "learning_rate": 0.00023859314239572707, + "loss": 7.4073, + "step": 20621 + }, + { + "epoch": 1.9242325277596342, + "grad_norm": 1.468289397948624, + "learning_rate": 0.00023858703869384458, + "loss": 7.4747, + "step": 20622 + }, + { + "epoch": 1.9243258374545116, + "grad_norm": 1.2719207435202702, + "learning_rate": 0.00023858093476671153, + "loss": 7.3639, + "step": 20623 + }, + { + "epoch": 1.9244191471493888, + "grad_norm": 3.8567424895687966e+32, + "learning_rate": 0.0002385748306143434, + "loss": 7.4895, + "step": 20624 + }, + { + "epoch": 1.924512456844266, + "grad_norm": 1.2561867118980627, + "learning_rate": 0.00023856872623675567, + "loss": 7.4242, + "step": 20625 + }, + { + "epoch": 1.9246057665391434, + "grad_norm": 1.6441416979075556, + "learning_rate": 0.00023856262163396395, + "loss": 7.7847, + "step": 20626 + }, + { + "epoch": 1.9246990762340208, + "grad_norm": 1.6404371875021944, + "learning_rate": 0.00023855651680598365, + "loss": 7.5794, + "step": 20627 + }, + { + "epoch": 1.924792385928898, + "grad_norm": 1.0601412993256256, + "learning_rate": 0.00023855041175283045, + "loss": 7.3481, + "step": 20628 + }, + { + "epoch": 1.9248856956237752, + "grad_norm": 1.070758503079818, + "learning_rate": 0.00023854430647451974, + "loss": 7.6164, + "step": 20629 + }, + { + "epoch": 1.9249790053186526, + "grad_norm": 1.8513765108786362, + "learning_rate": 0.00023853820097106707, + "loss": 7.6137, + "step": 20630 + }, + { + "epoch": 1.92507231501353, + "grad_norm": 1.5981886705149853, + "learning_rate": 0.000238532095242488, + "loss": 7.2202, + "step": 20631 + }, + { + "epoch": 1.9251656247084072, + "grad_norm": 1.1270110596908847e+33, + "learning_rate": 0.00023852598928879808, + "loss": 7.6475, + "step": 20632 + }, + { + "epoch": 1.9252589344032844, + "grad_norm": 2.8942045523002624e+33, + "learning_rate": 0.00023851988311001274, + "loss": 7.031, + "step": 20633 + }, + { + "epoch": 1.9253522440981619, + "grad_norm": 1.1952937212651635, + "learning_rate": 0.00023851377670614754, + "loss": 7.2265, + "step": 20634 + }, + { + "epoch": 1.925445553793039, + "grad_norm": 1.306388427881138, + "learning_rate": 0.00023850767007721808, + "loss": 7.6178, + "step": 20635 + }, + { + "epoch": 1.9255388634879163, + "grad_norm": 3.1705469397544782e+32, + "learning_rate": 0.00023850156322323981, + "loss": 7.5015, + "step": 20636 + }, + { + "epoch": 1.9256321731827937, + "grad_norm": 8.773293823965877e+32, + "learning_rate": 0.00023849545614422824, + "loss": 7.5909, + "step": 20637 + }, + { + "epoch": 1.925725482877671, + "grad_norm": 1.549015437399827e+33, + "learning_rate": 0.000238489348840199, + "loss": 7.9526, + "step": 20638 + }, + { + "epoch": 1.9258187925725483, + "grad_norm": 1.465522028633259, + "learning_rate": 0.00023848324131116755, + "loss": 7.4676, + "step": 20639 + }, + { + "epoch": 1.9259121022674255, + "grad_norm": 1.1261165811876563, + "learning_rate": 0.0002384771335571494, + "loss": 7.5581, + "step": 20640 + }, + { + "epoch": 1.926005411962303, + "grad_norm": 1.3924565866748373, + "learning_rate": 0.00023847102557816017, + "loss": 7.5077, + "step": 20641 + }, + { + "epoch": 1.9260987216571803, + "grad_norm": 1.1995018797362915, + "learning_rate": 0.00023846491737421525, + "loss": 7.8099, + "step": 20642 + }, + { + "epoch": 1.9261920313520575, + "grad_norm": 1.5288630785151822, + "learning_rate": 0.0002384588089453303, + "loss": 7.5739, + "step": 20643 + }, + { + "epoch": 1.9262853410469347, + "grad_norm": 1.7950515671304646e+32, + "learning_rate": 0.00023845270029152074, + "loss": 7.1559, + "step": 20644 + }, + { + "epoch": 1.9263786507418121, + "grad_norm": 1.422267552550388, + "learning_rate": 0.00023844659141280226, + "loss": 7.3635, + "step": 20645 + }, + { + "epoch": 1.9264719604366893, + "grad_norm": 1.045761828965194, + "learning_rate": 0.00023844048230919023, + "loss": 7.4881, + "step": 20646 + }, + { + "epoch": 1.9265652701315665, + "grad_norm": 1.065333501859768, + "learning_rate": 0.00023843437298070024, + "loss": 7.3947, + "step": 20647 + }, + { + "epoch": 1.926658579826444, + "grad_norm": 1.2701529835852166, + "learning_rate": 0.0002384282634273479, + "loss": 7.5821, + "step": 20648 + }, + { + "epoch": 1.9267518895213214, + "grad_norm": 1.898399418380966, + "learning_rate": 0.00023842215364914865, + "loss": 7.3984, + "step": 20649 + }, + { + "epoch": 1.9268451992161986, + "grad_norm": 1.1933508948011853, + "learning_rate": 0.00023841604364611803, + "loss": 7.2162, + "step": 20650 + }, + { + "epoch": 1.9269385089110758, + "grad_norm": 1.3219128367416633, + "learning_rate": 0.00023840993341827163, + "loss": 7.3068, + "step": 20651 + }, + { + "epoch": 1.9270318186059532, + "grad_norm": 1.0928469513222685, + "learning_rate": 0.00023840382296562497, + "loss": 7.1569, + "step": 20652 + }, + { + "epoch": 1.9271251283008306, + "grad_norm": 1.3371297874481003, + "learning_rate": 0.00023839771228819354, + "loss": 7.5052, + "step": 20653 + }, + { + "epoch": 1.9272184379957078, + "grad_norm": 1.1988540525272438, + "learning_rate": 0.00023839160138599294, + "loss": 7.3116, + "step": 20654 + }, + { + "epoch": 1.927311747690585, + "grad_norm": 1.2957849618314958, + "learning_rate": 0.00023838549025903866, + "loss": 7.514, + "step": 20655 + }, + { + "epoch": 1.9274050573854624, + "grad_norm": 1.2366476452775674, + "learning_rate": 0.00023837937890734627, + "loss": 7.4428, + "step": 20656 + }, + { + "epoch": 1.9274983670803396, + "grad_norm": 8.747346581968408e+32, + "learning_rate": 0.00023837326733093132, + "loss": 7.385, + "step": 20657 + }, + { + "epoch": 1.9275916767752168, + "grad_norm": 5.331784706629121e+31, + "learning_rate": 0.00023836715552980932, + "loss": 7.3649, + "step": 20658 + }, + { + "epoch": 1.9276849864700942, + "grad_norm": 1.2021568751559086, + "learning_rate": 0.0002383610435039958, + "loss": 7.6651, + "step": 20659 + }, + { + "epoch": 1.9277782961649716, + "grad_norm": 1.6047368796817318, + "learning_rate": 0.0002383549312535063, + "loss": 7.4215, + "step": 20660 + }, + { + "epoch": 1.9278716058598488, + "grad_norm": 1.5728195324671923, + "learning_rate": 0.00023834881877835644, + "loss": 7.5983, + "step": 20661 + }, + { + "epoch": 1.927964915554726, + "grad_norm": 1.1846180305626681, + "learning_rate": 0.00023834270607856167, + "loss": 7.3575, + "step": 20662 + }, + { + "epoch": 1.9280582252496035, + "grad_norm": 29.111546090203706, + "learning_rate": 0.00023833659315413756, + "loss": 6.962, + "step": 20663 + }, + { + "epoch": 1.9281515349444809, + "grad_norm": 1.018469910664809e+32, + "learning_rate": 0.00023833048000509972, + "loss": 7.5929, + "step": 20664 + }, + { + "epoch": 1.928244844639358, + "grad_norm": 1.7367152200720009, + "learning_rate": 0.00023832436663146357, + "loss": 7.4769, + "step": 20665 + }, + { + "epoch": 1.9283381543342353, + "grad_norm": 1.6016287554121773, + "learning_rate": 0.00023831825303324472, + "loss": 7.1582, + "step": 20666 + }, + { + "epoch": 1.9284314640291127, + "grad_norm": 1.381904389899876, + "learning_rate": 0.00023831213921045874, + "loss": 7.6238, + "step": 20667 + }, + { + "epoch": 1.92852477372399, + "grad_norm": 1.1516029087999695, + "learning_rate": 0.00023830602516312113, + "loss": 7.2564, + "step": 20668 + }, + { + "epoch": 1.928618083418867, + "grad_norm": 3.7698916542443836e+32, + "learning_rate": 0.00023829991089124747, + "loss": 7.5824, + "step": 20669 + }, + { + "epoch": 1.9287113931137445, + "grad_norm": 4.32588711983188e+31, + "learning_rate": 0.00023829379639485332, + "loss": 7.471, + "step": 20670 + }, + { + "epoch": 1.928804702808622, + "grad_norm": 1.326911156330857, + "learning_rate": 0.00023828768167395419, + "loss": 7.7031, + "step": 20671 + }, + { + "epoch": 1.9288980125034991, + "grad_norm": 1.5195843869377592, + "learning_rate": 0.0002382815667285656, + "loss": 7.3869, + "step": 20672 + }, + { + "epoch": 1.9289913221983763, + "grad_norm": 1.2129269923303392, + "learning_rate": 0.00023827545155870317, + "loss": 7.4312, + "step": 20673 + }, + { + "epoch": 1.9290846318932537, + "grad_norm": 1.5528473013215185, + "learning_rate": 0.0002382693361643824, + "loss": 7.4636, + "step": 20674 + }, + { + "epoch": 1.9291779415881312, + "grad_norm": 1.8996995107379842, + "learning_rate": 0.00023826322054561883, + "loss": 7.0798, + "step": 20675 + }, + { + "epoch": 1.9292712512830084, + "grad_norm": 1.4760064603321832, + "learning_rate": 0.0002382571047024281, + "loss": 7.1691, + "step": 20676 + }, + { + "epoch": 1.9293645609778856, + "grad_norm": 1.2224720417355446, + "learning_rate": 0.00023825098863482566, + "loss": 7.5918, + "step": 20677 + }, + { + "epoch": 1.929457870672763, + "grad_norm": 1.5378921826958536, + "learning_rate": 0.00023824487234282707, + "loss": 7.5997, + "step": 20678 + }, + { + "epoch": 1.9295511803676402, + "grad_norm": 2.040499320015384, + "learning_rate": 0.00023823875582644797, + "loss": 7.9661, + "step": 20679 + }, + { + "epoch": 1.9296444900625174, + "grad_norm": 1.4341116955811033, + "learning_rate": 0.00023823263908570382, + "loss": 7.3251, + "step": 20680 + }, + { + "epoch": 1.9297377997573948, + "grad_norm": 1.3612962393062924, + "learning_rate": 0.00023822652212061017, + "loss": 7.3518, + "step": 20681 + }, + { + "epoch": 1.9298311094522722, + "grad_norm": 5.924799283897775, + "learning_rate": 0.00023822040493118267, + "loss": 7.7793, + "step": 20682 + }, + { + "epoch": 1.9299244191471494, + "grad_norm": 1.2196112787988118, + "learning_rate": 0.0002382142875174368, + "loss": 7.527, + "step": 20683 + }, + { + "epoch": 1.9300177288420266, + "grad_norm": 1.229491993046474, + "learning_rate": 0.00023820816987938812, + "loss": 7.8289, + "step": 20684 + }, + { + "epoch": 1.930111038536904, + "grad_norm": 1.2017987779753851, + "learning_rate": 0.00023820205201705217, + "loss": 7.5371, + "step": 20685 + }, + { + "epoch": 1.9302043482317814, + "grad_norm": 1.950034887572785, + "learning_rate": 0.00023819593393044458, + "loss": 7.2651, + "step": 20686 + }, + { + "epoch": 1.9302976579266584, + "grad_norm": 1.7776035685031775, + "learning_rate": 0.0002381898156195808, + "loss": 7.4869, + "step": 20687 + }, + { + "epoch": 1.9303909676215358, + "grad_norm": 2.216577911728309, + "learning_rate": 0.00023818369708447648, + "loss": 7.0873, + "step": 20688 + }, + { + "epoch": 1.9304842773164133, + "grad_norm": 2.005508812985953, + "learning_rate": 0.00023817757832514714, + "loss": 7.1482, + "step": 20689 + }, + { + "epoch": 1.9305775870112905, + "grad_norm": 1.3071841161296687, + "learning_rate": 0.0002381714593416083, + "loss": 7.3284, + "step": 20690 + }, + { + "epoch": 1.9306708967061676, + "grad_norm": 1.2604352398470648, + "learning_rate": 0.00023816534013387558, + "loss": 7.7107, + "step": 20691 + }, + { + "epoch": 1.930764206401045, + "grad_norm": 3.84188161978709e+30, + "learning_rate": 0.00023815922070196457, + "loss": 7.5517, + "step": 20692 + }, + { + "epoch": 1.9308575160959225, + "grad_norm": 3.1142281812652557e+32, + "learning_rate": 0.00023815310104589066, + "loss": 6.995, + "step": 20693 + }, + { + "epoch": 1.9309508257907997, + "grad_norm": 2.794058411453288, + "learning_rate": 0.0002381469811656696, + "loss": 7.9025, + "step": 20694 + }, + { + "epoch": 1.9310441354856769, + "grad_norm": 1.4158104072410898, + "learning_rate": 0.00023814086106131691, + "loss": 7.5857, + "step": 20695 + }, + { + "epoch": 1.9311374451805543, + "grad_norm": 1.1843656675147374e+31, + "learning_rate": 0.00023813474073284809, + "loss": 7.6565, + "step": 20696 + }, + { + "epoch": 1.9312307548754317, + "grad_norm": 0.9989963173872055, + "learning_rate": 0.00023812862018027868, + "loss": 7.6697, + "step": 20697 + }, + { + "epoch": 1.9313240645703087, + "grad_norm": 1.369747224931957e+33, + "learning_rate": 0.00023812249940362434, + "loss": 7.2905, + "step": 20698 + }, + { + "epoch": 1.9314173742651861, + "grad_norm": 2.1437568365211113, + "learning_rate": 0.0002381163784029006, + "loss": 7.0828, + "step": 20699 + }, + { + "epoch": 1.9315106839600635, + "grad_norm": 1.2896919130559628, + "learning_rate": 0.000238110257178123, + "loss": 7.4891, + "step": 20700 + }, + { + "epoch": 1.9316039936549407, + "grad_norm": 1.3407217004219762, + "learning_rate": 0.00023810413572930708, + "loss": 7.8256, + "step": 20701 + }, + { + "epoch": 1.931697303349818, + "grad_norm": 1.0734125343501535, + "learning_rate": 0.0002380980140564685, + "loss": 7.6442, + "step": 20702 + }, + { + "epoch": 1.9317906130446953, + "grad_norm": 1.0450792156469535, + "learning_rate": 0.00023809189215962272, + "loss": 7.3211, + "step": 20703 + }, + { + "epoch": 1.9318839227395728, + "grad_norm": 3.0890331882135575e+33, + "learning_rate": 0.00023808577003878536, + "loss": 7.262, + "step": 20704 + }, + { + "epoch": 1.93197723243445, + "grad_norm": 1.1402748492723602, + "learning_rate": 0.00023807964769397206, + "loss": 7.6117, + "step": 20705 + }, + { + "epoch": 1.9320705421293272, + "grad_norm": 1.4194077854204725, + "learning_rate": 0.0002380735251251982, + "loss": 7.1975, + "step": 20706 + }, + { + "epoch": 1.9321638518242046, + "grad_norm": 1.0283633914966952, + "learning_rate": 0.00023806740233247952, + "loss": 7.2334, + "step": 20707 + }, + { + "epoch": 1.9322571615190818, + "grad_norm": 8.285638347632446e+32, + "learning_rate": 0.00023806127931583147, + "loss": 7.766, + "step": 20708 + }, + { + "epoch": 1.932350471213959, + "grad_norm": 2.4245065367201254e+32, + "learning_rate": 0.00023805515607526972, + "loss": 7.1888, + "step": 20709 + }, + { + "epoch": 1.9324437809088364, + "grad_norm": 1.370379317054622, + "learning_rate": 0.00023804903261080977, + "loss": 7.4014, + "step": 20710 + }, + { + "epoch": 1.9325370906037138, + "grad_norm": 1.6681379535778746, + "learning_rate": 0.00023804290892246723, + "loss": 7.4726, + "step": 20711 + }, + { + "epoch": 1.932630400298591, + "grad_norm": 1.272123026622416, + "learning_rate": 0.00023803678501025765, + "loss": 7.4116, + "step": 20712 + }, + { + "epoch": 1.9327237099934682, + "grad_norm": 1.207122509696116, + "learning_rate": 0.00023803066087419656, + "loss": 7.5284, + "step": 20713 + }, + { + "epoch": 1.9328170196883456, + "grad_norm": 1.5337787448692182, + "learning_rate": 0.00023802453651429963, + "loss": 7.1514, + "step": 20714 + }, + { + "epoch": 1.932910329383223, + "grad_norm": 2.1498353128721703, + "learning_rate": 0.00023801841193058233, + "loss": 7.2449, + "step": 20715 + }, + { + "epoch": 1.9330036390781002, + "grad_norm": 94.5641112743108, + "learning_rate": 0.0002380122871230603, + "loss": 7.2644, + "step": 20716 + }, + { + "epoch": 1.9330969487729774, + "grad_norm": 1.1569517965607865, + "learning_rate": 0.00023800616209174912, + "loss": 7.3164, + "step": 20717 + }, + { + "epoch": 1.9331902584678549, + "grad_norm": 1.0082284452402126, + "learning_rate": 0.00023800003683666432, + "loss": 7.2186, + "step": 20718 + }, + { + "epoch": 1.933283568162732, + "grad_norm": 1.181661547724176, + "learning_rate": 0.00023799391135782152, + "loss": 7.3754, + "step": 20719 + }, + { + "epoch": 1.9333768778576093, + "grad_norm": 5.0228006686161646e+32, + "learning_rate": 0.0002379877856552362, + "loss": 7.7337, + "step": 20720 + }, + { + "epoch": 1.9334701875524867, + "grad_norm": 1.4937104490729716, + "learning_rate": 0.0002379816597289241, + "loss": 7.6322, + "step": 20721 + }, + { + "epoch": 1.933563497247364, + "grad_norm": 1.204644035931806, + "learning_rate": 0.00023797553357890062, + "loss": 7.6207, + "step": 20722 + }, + { + "epoch": 1.9336568069422413, + "grad_norm": 6.27728304216387e+33, + "learning_rate": 0.00023796940720518145, + "loss": 7.7351, + "step": 20723 + }, + { + "epoch": 1.9337501166371185, + "grad_norm": 1.4377439823541938, + "learning_rate": 0.00023796328060778215, + "loss": 7.5659, + "step": 20724 + }, + { + "epoch": 1.933843426331996, + "grad_norm": 1.0331716313843253, + "learning_rate": 0.00023795715378671826, + "loss": 7.5509, + "step": 20725 + }, + { + "epoch": 1.9339367360268733, + "grad_norm": 1.1711760787126189, + "learning_rate": 0.00023795102674200536, + "loss": 7.5167, + "step": 20726 + }, + { + "epoch": 1.9340300457217505, + "grad_norm": 1.109070095616235, + "learning_rate": 0.00023794489947365913, + "loss": 7.571, + "step": 20727 + }, + { + "epoch": 1.9341233554166277, + "grad_norm": 1.6542488793787298, + "learning_rate": 0.000237938771981695, + "loss": 7.3236, + "step": 20728 + }, + { + "epoch": 1.9342166651115051, + "grad_norm": 1.6464466068619403, + "learning_rate": 0.00023793264426612867, + "loss": 7.2404, + "step": 20729 + }, + { + "epoch": 1.9343099748063823, + "grad_norm": 1.2894446729262146, + "learning_rate": 0.00023792651632697563, + "loss": 7.5579, + "step": 20730 + }, + { + "epoch": 1.9344032845012595, + "grad_norm": 1.1276219614763148, + "learning_rate": 0.00023792038816425155, + "loss": 7.5513, + "step": 20731 + }, + { + "epoch": 1.934496594196137, + "grad_norm": 1.2736259776682555, + "learning_rate": 0.0002379142597779719, + "loss": 7.5724, + "step": 20732 + }, + { + "epoch": 1.9345899038910144, + "grad_norm": 2.0509548667655917, + "learning_rate": 0.0002379081311681524, + "loss": 7.8177, + "step": 20733 + }, + { + "epoch": 1.9346832135858916, + "grad_norm": 1.22734327833582, + "learning_rate": 0.00023790200233480855, + "loss": 7.303, + "step": 20734 + }, + { + "epoch": 1.9347765232807688, + "grad_norm": 1.3193792398918234, + "learning_rate": 0.00023789587327795593, + "loss": 7.6609, + "step": 20735 + }, + { + "epoch": 1.9348698329756462, + "grad_norm": 7.644841619292013, + "learning_rate": 0.00023788974399761013, + "loss": 7.5428, + "step": 20736 + }, + { + "epoch": 1.9349631426705236, + "grad_norm": 2.1158405397138282, + "learning_rate": 0.00023788361449378676, + "loss": 7.7616, + "step": 20737 + }, + { + "epoch": 1.9350564523654008, + "grad_norm": 1.2442011361056986, + "learning_rate": 0.0002378774847665014, + "loss": 7.5094, + "step": 20738 + }, + { + "epoch": 1.935149762060278, + "grad_norm": 1.2835072252774433, + "learning_rate": 0.0002378713548157696, + "loss": 7.3497, + "step": 20739 + }, + { + "epoch": 1.9352430717551554, + "grad_norm": 1.5665257928983372, + "learning_rate": 0.00023786522464160705, + "loss": 7.5307, + "step": 20740 + }, + { + "epoch": 1.9353363814500326, + "grad_norm": 1.3797806265441452, + "learning_rate": 0.00023785909424402918, + "loss": 7.4036, + "step": 20741 + }, + { + "epoch": 1.9354296911449098, + "grad_norm": 1.810097597268379, + "learning_rate": 0.00023785296362305167, + "loss": 7.2285, + "step": 20742 + }, + { + "epoch": 1.9355230008397872, + "grad_norm": 1.149519564943051, + "learning_rate": 0.0002378468327786901, + "loss": 7.3233, + "step": 20743 + }, + { + "epoch": 1.9356163105346647, + "grad_norm": 7.853389342246241e+32, + "learning_rate": 0.00023784070171096008, + "loss": 7.7995, + "step": 20744 + }, + { + "epoch": 1.9357096202295418, + "grad_norm": 1.3966995763470114, + "learning_rate": 0.00023783457041987716, + "loss": 7.4566, + "step": 20745 + }, + { + "epoch": 1.935802929924419, + "grad_norm": 1.4310527874830732, + "learning_rate": 0.00023782843890545694, + "loss": 7.3748, + "step": 20746 + }, + { + "epoch": 1.9358962396192965, + "grad_norm": 1.5995754207977415, + "learning_rate": 0.00023782230716771504, + "loss": 7.7374, + "step": 20747 + }, + { + "epoch": 1.9359895493141739, + "grad_norm": 1.2975378514046014e+34, + "learning_rate": 0.00023781617520666698, + "loss": 7.4153, + "step": 20748 + }, + { + "epoch": 1.936082859009051, + "grad_norm": 1.2731011060146693, + "learning_rate": 0.00023781004302232846, + "loss": 7.6174, + "step": 20749 + }, + { + "epoch": 1.9361761687039283, + "grad_norm": 1.5426108402098324, + "learning_rate": 0.00023780391061471495, + "loss": 7.8299, + "step": 20750 + }, + { + "epoch": 1.9362694783988057, + "grad_norm": 1.5568854708503517, + "learning_rate": 0.00023779777798384216, + "loss": 7.5581, + "step": 20751 + }, + { + "epoch": 1.936362788093683, + "grad_norm": 1.7323318863036692e+33, + "learning_rate": 0.00023779164512972563, + "loss": 7.5182, + "step": 20752 + }, + { + "epoch": 1.93645609778856, + "grad_norm": 1.0358879795760894, + "learning_rate": 0.0002377855120523809, + "loss": 7.4938, + "step": 20753 + }, + { + "epoch": 1.9365494074834375, + "grad_norm": 9.267411501341049e+32, + "learning_rate": 0.00023777937875182366, + "loss": 7.403, + "step": 20754 + }, + { + "epoch": 1.936642717178315, + "grad_norm": 4.44179181520117e+31, + "learning_rate": 0.00023777324522806945, + "loss": 7.7024, + "step": 20755 + }, + { + "epoch": 1.9367360268731921, + "grad_norm": 1.6772506551656667, + "learning_rate": 0.00023776711148113387, + "loss": 7.0867, + "step": 20756 + }, + { + "epoch": 1.9368293365680693, + "grad_norm": 1.02401866238141, + "learning_rate": 0.00023776097751103251, + "loss": 7.3899, + "step": 20757 + }, + { + "epoch": 1.9369226462629467, + "grad_norm": 1.5510273960776506e+34, + "learning_rate": 0.00023775484331778102, + "loss": 7.415, + "step": 20758 + }, + { + "epoch": 1.9370159559578242, + "grad_norm": 1.9754667565168293, + "learning_rate": 0.00023774870890139494, + "loss": 7.6401, + "step": 20759 + }, + { + "epoch": 1.9371092656527014, + "grad_norm": 1.3253192061615067, + "learning_rate": 0.0002377425742618899, + "loss": 7.3574, + "step": 20760 + }, + { + "epoch": 1.9372025753475786, + "grad_norm": 1.9676682272268908, + "learning_rate": 0.00023773643939928145, + "loss": 7.7227, + "step": 20761 + }, + { + "epoch": 1.937295885042456, + "grad_norm": 1.8984564979273349e+34, + "learning_rate": 0.00023773030431358528, + "loss": 7.6223, + "step": 20762 + }, + { + "epoch": 1.9373891947373332, + "grad_norm": 1.0109659575290573, + "learning_rate": 0.00023772416900481687, + "loss": 7.3463, + "step": 20763 + }, + { + "epoch": 1.9374825044322104, + "grad_norm": 3.7711805139495924e+32, + "learning_rate": 0.00023771803347299192, + "loss": 7.4141, + "step": 20764 + }, + { + "epoch": 1.9375758141270878, + "grad_norm": 1.7876580905413433, + "learning_rate": 0.00023771189771812598, + "loss": 7.1377, + "step": 20765 + }, + { + "epoch": 1.9376691238219652, + "grad_norm": 1.1425318264801425, + "learning_rate": 0.00023770576174023467, + "loss": 7.6635, + "step": 20766 + }, + { + "epoch": 1.9377624335168424, + "grad_norm": 1.4246332655206302, + "learning_rate": 0.00023769962553933356, + "loss": 7.2501, + "step": 20767 + }, + { + "epoch": 1.9378557432117196, + "grad_norm": 1.3833219122091067, + "learning_rate": 0.00023769348911543834, + "loss": 7.3669, + "step": 20768 + }, + { + "epoch": 1.937949052906597, + "grad_norm": 1.4295478641146666, + "learning_rate": 0.00023768735246856452, + "loss": 7.8217, + "step": 20769 + }, + { + "epoch": 1.9380423626014744, + "grad_norm": 1.437070955095567, + "learning_rate": 0.00023768121559872776, + "loss": 7.7512, + "step": 20770 + }, + { + "epoch": 1.9381356722963516, + "grad_norm": 0.9567751168918189, + "learning_rate": 0.00023767507850594363, + "loss": 7.3241, + "step": 20771 + }, + { + "epoch": 1.9382289819912288, + "grad_norm": 1.3354199950236765, + "learning_rate": 0.0002376689411902277, + "loss": 7.4323, + "step": 20772 + }, + { + "epoch": 1.9383222916861063, + "grad_norm": 0.9859926264415484, + "learning_rate": 0.0002376628036515957, + "loss": 7.1764, + "step": 20773 + }, + { + "epoch": 1.9384156013809835, + "grad_norm": 1.1146567883477838, + "learning_rate": 0.0002376566658900631, + "loss": 7.5752, + "step": 20774 + }, + { + "epoch": 1.9385089110758607, + "grad_norm": 1.2518471095243997, + "learning_rate": 0.00023765052790564566, + "loss": 7.2477, + "step": 20775 + }, + { + "epoch": 1.938602220770738, + "grad_norm": 1.0390883108366746, + "learning_rate": 0.00023764438969835875, + "loss": 7.4247, + "step": 20776 + }, + { + "epoch": 1.9386955304656155, + "grad_norm": 8.216222138033329e+32, + "learning_rate": 0.0002376382512682182, + "loss": 7.4562, + "step": 20777 + }, + { + "epoch": 1.9387888401604927, + "grad_norm": 1.0863029313469192, + "learning_rate": 0.00023763211261523953, + "loss": 7.421, + "step": 20778 + }, + { + "epoch": 1.9388821498553699, + "grad_norm": 1.1814206328357775, + "learning_rate": 0.00023762597373943838, + "loss": 7.6568, + "step": 20779 + }, + { + "epoch": 1.9389754595502473, + "grad_norm": 1.0779969572875403, + "learning_rate": 0.0002376198346408303, + "loss": 7.2145, + "step": 20780 + }, + { + "epoch": 1.9390687692451247, + "grad_norm": 1.1464677383753168, + "learning_rate": 0.00023761369531943093, + "loss": 7.5283, + "step": 20781 + }, + { + "epoch": 1.939162078940002, + "grad_norm": 2.2417759165982923e+34, + "learning_rate": 0.0002376075557752559, + "loss": 7.3883, + "step": 20782 + }, + { + "epoch": 1.9392553886348791, + "grad_norm": 1.0956945800615434, + "learning_rate": 0.0002376014160083208, + "loss": 7.6381, + "step": 20783 + }, + { + "epoch": 1.9393486983297565, + "grad_norm": 1.0813699898686795, + "learning_rate": 0.00023759527601864127, + "loss": 7.6137, + "step": 20784 + }, + { + "epoch": 1.9394420080246337, + "grad_norm": 1.2040769795926762, + "learning_rate": 0.00023758913580623284, + "loss": 7.5722, + "step": 20785 + }, + { + "epoch": 1.939535317719511, + "grad_norm": 1.7559161085302473, + "learning_rate": 0.00023758299537111126, + "loss": 7.2172, + "step": 20786 + }, + { + "epoch": 1.9396286274143884, + "grad_norm": 2.3174055413487875, + "learning_rate": 0.00023757685471329203, + "loss": 6.9001, + "step": 20787 + }, + { + "epoch": 1.9397219371092658, + "grad_norm": 1.041083670497917, + "learning_rate": 0.0002375707138327908, + "loss": 7.3387, + "step": 20788 + }, + { + "epoch": 1.939815246804143, + "grad_norm": 1.1670158047288937, + "learning_rate": 0.00023756457272962323, + "loss": 7.459, + "step": 20789 + }, + { + "epoch": 1.9399085564990202, + "grad_norm": 1.2303226531877518, + "learning_rate": 0.00023755843140380481, + "loss": 7.2827, + "step": 20790 + }, + { + "epoch": 1.9400018661938976, + "grad_norm": 1.3137791146547826, + "learning_rate": 0.00023755228985535126, + "loss": 7.2795, + "step": 20791 + }, + { + "epoch": 1.940095175888775, + "grad_norm": 2.508749195936759, + "learning_rate": 0.0002375461480842782, + "loss": 7.7952, + "step": 20792 + }, + { + "epoch": 1.940188485583652, + "grad_norm": 1.5969662245675327, + "learning_rate": 0.00023754000609060123, + "loss": 7.3809, + "step": 20793 + }, + { + "epoch": 1.9402817952785294, + "grad_norm": 5.599893312158256e+34, + "learning_rate": 0.0002375338638743359, + "loss": 7.2892, + "step": 20794 + }, + { + "epoch": 1.9403751049734068, + "grad_norm": 1.5964392490404224, + "learning_rate": 0.00023752772143549792, + "loss": 7.8672, + "step": 20795 + }, + { + "epoch": 1.940468414668284, + "grad_norm": 1.124598236955602, + "learning_rate": 0.0002375215787741029, + "loss": 7.5841, + "step": 20796 + }, + { + "epoch": 1.9405617243631612, + "grad_norm": 1.357969172991613, + "learning_rate": 0.0002375154358901664, + "loss": 7.4762, + "step": 20797 + }, + { + "epoch": 1.9406550340580386, + "grad_norm": 2.1745033668887466, + "learning_rate": 0.00023750929278370405, + "loss": 7.5081, + "step": 20798 + }, + { + "epoch": 1.940748343752916, + "grad_norm": 1.1981701488969343, + "learning_rate": 0.00023750314945473152, + "loss": 7.5769, + "step": 20799 + }, + { + "epoch": 1.9408416534477932, + "grad_norm": 1.6742019685934857, + "learning_rate": 0.0002374970059032644, + "loss": 7.4053, + "step": 20800 + }, + { + "epoch": 1.9409349631426704, + "grad_norm": 1.252432900655733, + "learning_rate": 0.00023749086212931827, + "loss": 7.4715, + "step": 20801 + }, + { + "epoch": 1.9410282728375479, + "grad_norm": 1.2345323838939535, + "learning_rate": 0.00023748471813290884, + "loss": 7.5888, + "step": 20802 + }, + { + "epoch": 1.9411215825324253, + "grad_norm": 1.6041075246474843, + "learning_rate": 0.00023747857391405168, + "loss": 6.9736, + "step": 20803 + }, + { + "epoch": 1.9412148922273023, + "grad_norm": 1.4460479798077188, + "learning_rate": 0.0002374724294727624, + "loss": 7.5146, + "step": 20804 + }, + { + "epoch": 1.9413082019221797, + "grad_norm": 1.6407870007353749, + "learning_rate": 0.00023746628480905666, + "loss": 7.5036, + "step": 20805 + }, + { + "epoch": 1.941401511617057, + "grad_norm": 1.782878452642162, + "learning_rate": 0.0002374601399229501, + "loss": 7.3999, + "step": 20806 + }, + { + "epoch": 1.9414948213119343, + "grad_norm": 1.927724490093629, + "learning_rate": 0.00023745399481445823, + "loss": 7.6053, + "step": 20807 + }, + { + "epoch": 1.9415881310068115, + "grad_norm": 1.0170839270728393, + "learning_rate": 0.0002374478494835968, + "loss": 7.0957, + "step": 20808 + }, + { + "epoch": 1.941681440701689, + "grad_norm": 1.6739144082743362, + "learning_rate": 0.00023744170393038142, + "loss": 7.6564, + "step": 20809 + }, + { + "epoch": 1.9417747503965663, + "grad_norm": 1.009667621080887, + "learning_rate": 0.00023743555815482767, + "loss": 7.4868, + "step": 20810 + }, + { + "epoch": 1.9418680600914435, + "grad_norm": 1.4401241884866007, + "learning_rate": 0.00023742941215695117, + "loss": 7.0371, + "step": 20811 + }, + { + "epoch": 1.9419613697863207, + "grad_norm": 1.0306951000107234, + "learning_rate": 0.0002374232659367676, + "loss": 7.6279, + "step": 20812 + }, + { + "epoch": 1.9420546794811981, + "grad_norm": 1.9199052219177501, + "learning_rate": 0.00023741711949429255, + "loss": 7.3845, + "step": 20813 + }, + { + "epoch": 1.9421479891760753, + "grad_norm": 1.0779587569105393, + "learning_rate": 0.00023741097282954166, + "loss": 7.5737, + "step": 20814 + }, + { + "epoch": 1.9422412988709525, + "grad_norm": 1.5775591235576825, + "learning_rate": 0.00023740482594253057, + "loss": 7.4008, + "step": 20815 + }, + { + "epoch": 1.94233460856583, + "grad_norm": 1.3776586418041068, + "learning_rate": 0.0002373986788332749, + "loss": 7.2173, + "step": 20816 + }, + { + "epoch": 1.9424279182607074, + "grad_norm": 1.3174768500891274, + "learning_rate": 0.00023739253150179029, + "loss": 7.4477, + "step": 20817 + }, + { + "epoch": 1.9425212279555846, + "grad_norm": 1.175864923442885, + "learning_rate": 0.0002373863839480923, + "loss": 7.5669, + "step": 20818 + }, + { + "epoch": 1.9426145376504618, + "grad_norm": 2.077814063588122, + "learning_rate": 0.0002373802361721967, + "loss": 8.1214, + "step": 20819 + }, + { + "epoch": 1.9427078473453392, + "grad_norm": 1.1576985019143458, + "learning_rate": 0.000237374088174119, + "loss": 7.706, + "step": 20820 + }, + { + "epoch": 1.9428011570402166, + "grad_norm": 1.0975202216478923, + "learning_rate": 0.00023736793995387487, + "loss": 7.5045, + "step": 20821 + }, + { + "epoch": 1.9428944667350938, + "grad_norm": 1.0907754131188945, + "learning_rate": 0.00023736179151147998, + "loss": 7.6312, + "step": 20822 + }, + { + "epoch": 1.942987776429971, + "grad_norm": 3.583473910974358e+33, + "learning_rate": 0.00023735564284694993, + "loss": 7.5618, + "step": 20823 + }, + { + "epoch": 1.9430810861248484, + "grad_norm": 1.0108048541259296, + "learning_rate": 0.00023734949396030035, + "loss": 7.8029, + "step": 20824 + }, + { + "epoch": 1.9431743958197256, + "grad_norm": 1.1811467434373637, + "learning_rate": 0.00023734334485154686, + "loss": 7.4527, + "step": 20825 + }, + { + "epoch": 1.9432677055146028, + "grad_norm": 1.6148509924782042, + "learning_rate": 0.0002373371955207051, + "loss": 8.0706, + "step": 20826 + }, + { + "epoch": 1.9433610152094802, + "grad_norm": 1.1318501994427785, + "learning_rate": 0.00023733104596779077, + "loss": 7.5271, + "step": 20827 + }, + { + "epoch": 1.9434543249043577, + "grad_norm": 1.4644908095304114, + "learning_rate": 0.00023732489619281947, + "loss": 7.3969, + "step": 20828 + }, + { + "epoch": 1.9435476345992349, + "grad_norm": 1.1292279572741255, + "learning_rate": 0.00023731874619580676, + "loss": 7.4481, + "step": 20829 + }, + { + "epoch": 1.943640944294112, + "grad_norm": 1.1097916154123817, + "learning_rate": 0.00023731259597676836, + "loss": 7.6004, + "step": 20830 + }, + { + "epoch": 1.9437342539889895, + "grad_norm": 5.676221180530516e+33, + "learning_rate": 0.00023730644553571995, + "loss": 7.3092, + "step": 20831 + }, + { + "epoch": 1.9438275636838669, + "grad_norm": 1.2372064916333425, + "learning_rate": 0.00023730029487267708, + "loss": 7.3664, + "step": 20832 + }, + { + "epoch": 1.943920873378744, + "grad_norm": 1.1608007106565987, + "learning_rate": 0.0002372941439876554, + "loss": 7.5848, + "step": 20833 + }, + { + "epoch": 1.9440141830736213, + "grad_norm": 1.2841234912351394, + "learning_rate": 0.00023728799288067062, + "loss": 7.1292, + "step": 20834 + }, + { + "epoch": 1.9441074927684987, + "grad_norm": 0.9301507470085464, + "learning_rate": 0.00023728184155173826, + "loss": 7.4342, + "step": 20835 + }, + { + "epoch": 1.944200802463376, + "grad_norm": 0.9806443926166333, + "learning_rate": 0.0002372756900008741, + "loss": 7.5202, + "step": 20836 + }, + { + "epoch": 1.944294112158253, + "grad_norm": 1.3623100080213029, + "learning_rate": 0.00023726953822809368, + "loss": 7.7655, + "step": 20837 + }, + { + "epoch": 1.9443874218531305, + "grad_norm": 1.1777905798484842, + "learning_rate": 0.00023726338623341267, + "loss": 7.4049, + "step": 20838 + }, + { + "epoch": 1.944480731548008, + "grad_norm": 0.9600215273482618, + "learning_rate": 0.00023725723401684672, + "loss": 7.6223, + "step": 20839 + }, + { + "epoch": 1.9445740412428851, + "grad_norm": 0.9095607104409744, + "learning_rate": 0.0002372510815784115, + "loss": 7.4272, + "step": 20840 + }, + { + "epoch": 1.9446673509377623, + "grad_norm": 0.9908324346046309, + "learning_rate": 0.0002372449289181226, + "loss": 7.4802, + "step": 20841 + }, + { + "epoch": 1.9447606606326397, + "grad_norm": 1.4489381607820833, + "learning_rate": 0.00023723877603599569, + "loss": 7.3295, + "step": 20842 + }, + { + "epoch": 1.9448539703275172, + "grad_norm": 1.8128442850230182, + "learning_rate": 0.00023723262293204642, + "loss": 7.4455, + "step": 20843 + }, + { + "epoch": 1.9449472800223944, + "grad_norm": 2.33554307352892e+33, + "learning_rate": 0.00023722646960629045, + "loss": 8.1148, + "step": 20844 + }, + { + "epoch": 1.9450405897172716, + "grad_norm": 0.9879979572767594, + "learning_rate": 0.00023722031605874334, + "loss": 7.5079, + "step": 20845 + }, + { + "epoch": 1.945133899412149, + "grad_norm": 1.1268836887639653, + "learning_rate": 0.00023721416228942086, + "loss": 7.8542, + "step": 20846 + }, + { + "epoch": 1.9452272091070262, + "grad_norm": 1.0262685372280511, + "learning_rate": 0.00023720800829833862, + "loss": 7.5781, + "step": 20847 + }, + { + "epoch": 1.9453205188019034, + "grad_norm": 1.9172012599094402e+32, + "learning_rate": 0.00023720185408551215, + "loss": 7.3571, + "step": 20848 + }, + { + "epoch": 1.9454138284967808, + "grad_norm": 6.198000240804341e+33, + "learning_rate": 0.0002371956996509573, + "loss": 7.527, + "step": 20849 + }, + { + "epoch": 1.9455071381916582, + "grad_norm": 1.835964408620919e+34, + "learning_rate": 0.00023718954499468958, + "loss": 7.3314, + "step": 20850 + }, + { + "epoch": 1.9456004478865354, + "grad_norm": 1.279261831817328, + "learning_rate": 0.00023718339011672465, + "loss": 7.2473, + "step": 20851 + }, + { + "epoch": 1.9456937575814126, + "grad_norm": 1.4990499027961317, + "learning_rate": 0.0002371772350170782, + "loss": 6.9839, + "step": 20852 + }, + { + "epoch": 1.94578706727629, + "grad_norm": 1.0232916301850234, + "learning_rate": 0.0002371710796957659, + "loss": 7.4519, + "step": 20853 + }, + { + "epoch": 1.9458803769711674, + "grad_norm": 5.0994756808855825e+31, + "learning_rate": 0.00023716492415280332, + "loss": 7.2638, + "step": 20854 + }, + { + "epoch": 1.9459736866660446, + "grad_norm": 1.0433959608026577, + "learning_rate": 0.00023715876838820618, + "loss": 7.2048, + "step": 20855 + }, + { + "epoch": 1.9460669963609218, + "grad_norm": 1.7610921908591854, + "learning_rate": 0.00023715261240199012, + "loss": 7.3808, + "step": 20856 + }, + { + "epoch": 1.9461603060557993, + "grad_norm": 4.7833258337455504e+32, + "learning_rate": 0.00023714645619417072, + "loss": 7.5981, + "step": 20857 + }, + { + "epoch": 1.9462536157506765, + "grad_norm": 1.7350813136063336e+33, + "learning_rate": 0.0002371402997647638, + "loss": 7.4557, + "step": 20858 + }, + { + "epoch": 1.9463469254455537, + "grad_norm": 1.0743526039193811, + "learning_rate": 0.00023713414311378483, + "loss": 7.5196, + "step": 20859 + }, + { + "epoch": 1.946440235140431, + "grad_norm": 1.0993656715805333, + "learning_rate": 0.00023712798624124956, + "loss": 7.5767, + "step": 20860 + }, + { + "epoch": 1.9465335448353085, + "grad_norm": 1.4903364847410536, + "learning_rate": 0.00023712182914717363, + "loss": 7.4638, + "step": 20861 + }, + { + "epoch": 1.9466268545301857, + "grad_norm": 3.5743030713207975e+32, + "learning_rate": 0.00023711567183157272, + "loss": 7.5205, + "step": 20862 + }, + { + "epoch": 1.9467201642250629, + "grad_norm": 1.3475281041451866e+33, + "learning_rate": 0.00023710951429446245, + "loss": 7.3997, + "step": 20863 + }, + { + "epoch": 1.9468134739199403, + "grad_norm": 0.935774710454487, + "learning_rate": 0.0002371033565358585, + "loss": 7.5263, + "step": 20864 + }, + { + "epoch": 1.9469067836148177, + "grad_norm": 1.050480180719331, + "learning_rate": 0.00023709719855577653, + "loss": 7.4441, + "step": 20865 + }, + { + "epoch": 1.947000093309695, + "grad_norm": 1.1117326494235626, + "learning_rate": 0.00023709104035423215, + "loss": 7.7081, + "step": 20866 + }, + { + "epoch": 1.9470934030045721, + "grad_norm": 1.1118698853878495, + "learning_rate": 0.00023708488193124104, + "loss": 7.5955, + "step": 20867 + }, + { + "epoch": 1.9471867126994495, + "grad_norm": 1.2389933183553172, + "learning_rate": 0.00023707872328681886, + "loss": 7.3843, + "step": 20868 + }, + { + "epoch": 1.9472800223943267, + "grad_norm": 3.150285195992002e+32, + "learning_rate": 0.00023707256442098134, + "loss": 7.3385, + "step": 20869 + }, + { + "epoch": 1.947373332089204, + "grad_norm": 1.1906692757792678, + "learning_rate": 0.00023706640533374405, + "loss": 7.4613, + "step": 20870 + }, + { + "epoch": 1.9474666417840814, + "grad_norm": 1.0569261596467718, + "learning_rate": 0.00023706024602512273, + "loss": 7.5673, + "step": 20871 + }, + { + "epoch": 1.9475599514789588, + "grad_norm": 1.134320272934771, + "learning_rate": 0.00023705408649513294, + "loss": 7.6167, + "step": 20872 + }, + { + "epoch": 1.947653261173836, + "grad_norm": 1.67042197870651, + "learning_rate": 0.00023704792674379037, + "loss": 7.2032, + "step": 20873 + }, + { + "epoch": 1.9477465708687132, + "grad_norm": 1.2527540726859294, + "learning_rate": 0.00023704176677111075, + "loss": 7.5604, + "step": 20874 + }, + { + "epoch": 1.9478398805635906, + "grad_norm": 3.572336626264687e+33, + "learning_rate": 0.0002370356065771097, + "loss": 7.9115, + "step": 20875 + }, + { + "epoch": 1.947933190258468, + "grad_norm": 1.563438844340305, + "learning_rate": 0.00023702944616180288, + "loss": 7.4668, + "step": 20876 + }, + { + "epoch": 1.9480264999533452, + "grad_norm": 8.770346304394318e+31, + "learning_rate": 0.00023702328552520596, + "loss": 7.6126, + "step": 20877 + }, + { + "epoch": 1.9481198096482224, + "grad_norm": 1.1766275351793156, + "learning_rate": 0.00023701712466733463, + "loss": 7.4338, + "step": 20878 + }, + { + "epoch": 1.9482131193430998, + "grad_norm": 1.2639195572160147, + "learning_rate": 0.0002370109635882045, + "loss": 7.2651, + "step": 20879 + }, + { + "epoch": 1.948306429037977, + "grad_norm": 2.2071525117571962e+33, + "learning_rate": 0.00023700480228783125, + "loss": 7.4624, + "step": 20880 + }, + { + "epoch": 1.9483997387328542, + "grad_norm": 1.000055813447652, + "learning_rate": 0.00023699864076623056, + "loss": 7.42, + "step": 20881 + }, + { + "epoch": 1.9484930484277316, + "grad_norm": 4.1737284269512414e+32, + "learning_rate": 0.0002369924790234181, + "loss": 7.3502, + "step": 20882 + }, + { + "epoch": 1.948586358122609, + "grad_norm": 1.335597332762344, + "learning_rate": 0.00023698631705940955, + "loss": 7.6224, + "step": 20883 + }, + { + "epoch": 1.9486796678174862, + "grad_norm": 1.3513949805588628, + "learning_rate": 0.00023698015487422057, + "loss": 7.0596, + "step": 20884 + }, + { + "epoch": 1.9487729775123634, + "grad_norm": 1.40748416933384, + "learning_rate": 0.00023697399246786676, + "loss": 7.5277, + "step": 20885 + }, + { + "epoch": 1.9488662872072409, + "grad_norm": 1.4756969484307967, + "learning_rate": 0.00023696782984036394, + "loss": 7.7284, + "step": 20886 + }, + { + "epoch": 1.9489595969021183, + "grad_norm": 1.1302515528092056, + "learning_rate": 0.00023696166699172766, + "loss": 7.4608, + "step": 20887 + }, + { + "epoch": 1.9490529065969955, + "grad_norm": 1.1914291162034938, + "learning_rate": 0.0002369555039219736, + "loss": 7.6681, + "step": 20888 + }, + { + "epoch": 1.9491462162918727, + "grad_norm": 25.787915193639854, + "learning_rate": 0.00023694934063111747, + "loss": 7.531, + "step": 20889 + }, + { + "epoch": 1.94923952598675, + "grad_norm": 9.243084751378342e+33, + "learning_rate": 0.00023694317711917494, + "loss": 7.3198, + "step": 20890 + }, + { + "epoch": 1.9493328356816273, + "grad_norm": 1.0042752041964262, + "learning_rate": 0.00023693701338616162, + "loss": 7.4281, + "step": 20891 + }, + { + "epoch": 1.9494261453765045, + "grad_norm": 1.99814311374497e+32, + "learning_rate": 0.00023693084943209326, + "loss": 7.0631, + "step": 20892 + }, + { + "epoch": 1.949519455071382, + "grad_norm": 1.0890987748235958, + "learning_rate": 0.00023692468525698547, + "loss": 7.4195, + "step": 20893 + }, + { + "epoch": 1.9496127647662593, + "grad_norm": 1.1123894558892955, + "learning_rate": 0.00023691852086085396, + "loss": 7.4667, + "step": 20894 + }, + { + "epoch": 1.9497060744611365, + "grad_norm": 1.3725162017841575, + "learning_rate": 0.00023691235624371442, + "loss": 7.4169, + "step": 20895 + }, + { + "epoch": 1.9497993841560137, + "grad_norm": 1.718057171329767, + "learning_rate": 0.00023690619140558252, + "loss": 7.5646, + "step": 20896 + }, + { + "epoch": 1.9498926938508911, + "grad_norm": 1.167632622248739, + "learning_rate": 0.0002369000263464739, + "loss": 7.6087, + "step": 20897 + }, + { + "epoch": 1.9499860035457686, + "grad_norm": 43.172208447657354, + "learning_rate": 0.00023689386106640422, + "loss": 7.5039, + "step": 20898 + }, + { + "epoch": 1.9500793132406455, + "grad_norm": 1.1110348826242875, + "learning_rate": 0.00023688769556538927, + "loss": 7.547, + "step": 20899 + }, + { + "epoch": 1.950172622935523, + "grad_norm": 1.2820246692649147, + "learning_rate": 0.0002368815298434446, + "loss": 7.8218, + "step": 20900 + }, + { + "epoch": 1.9502659326304004, + "grad_norm": 5.486672004113924e+32, + "learning_rate": 0.0002368753639005859, + "loss": 7.4561, + "step": 20901 + }, + { + "epoch": 1.9503592423252776, + "grad_norm": 1.0032311831138592, + "learning_rate": 0.0002368691977368289, + "loss": 7.6433, + "step": 20902 + }, + { + "epoch": 1.9504525520201548, + "grad_norm": 1.1043744734657839, + "learning_rate": 0.00023686303135218934, + "loss": 7.6762, + "step": 20903 + }, + { + "epoch": 1.9505458617150322, + "grad_norm": 5.118101975916346e+34, + "learning_rate": 0.0002368568647466827, + "loss": 7.7782, + "step": 20904 + }, + { + "epoch": 1.9506391714099096, + "grad_norm": 2.23082121256718, + "learning_rate": 0.00023685069792032487, + "loss": 7.648, + "step": 20905 + }, + { + "epoch": 1.9507324811047868, + "grad_norm": 3.012421486505558, + "learning_rate": 0.00023684453087313142, + "loss": 7.3862, + "step": 20906 + }, + { + "epoch": 1.950825790799664, + "grad_norm": 1.5689331531866137, + "learning_rate": 0.00023683836360511809, + "loss": 7.4599, + "step": 20907 + }, + { + "epoch": 1.9509191004945414, + "grad_norm": 1.4786854932759075, + "learning_rate": 0.00023683219611630044, + "loss": 7.2711, + "step": 20908 + }, + { + "epoch": 1.9510124101894188, + "grad_norm": 1.1247008421805327, + "learning_rate": 0.0002368260284066943, + "loss": 7.4657, + "step": 20909 + }, + { + "epoch": 1.9511057198842958, + "grad_norm": 1.9050346905406748, + "learning_rate": 0.00023681986047631527, + "loss": 7.745, + "step": 20910 + }, + { + "epoch": 1.9511990295791732, + "grad_norm": 2.017197267197598, + "learning_rate": 0.00023681369232517906, + "loss": 7.7162, + "step": 20911 + }, + { + "epoch": 1.9512923392740507, + "grad_norm": 1.9380007921181335, + "learning_rate": 0.00023680752395330133, + "loss": 7.4413, + "step": 20912 + }, + { + "epoch": 1.9513856489689279, + "grad_norm": 1.0695952064876125e+31, + "learning_rate": 0.00023680135536069782, + "loss": 7.2224, + "step": 20913 + }, + { + "epoch": 1.951478958663805, + "grad_norm": 1.9100599142918349, + "learning_rate": 0.0002367951865473841, + "loss": 7.8975, + "step": 20914 + }, + { + "epoch": 1.9515722683586825, + "grad_norm": 1.260731436930591, + "learning_rate": 0.00023678901751337603, + "loss": 7.4858, + "step": 20915 + }, + { + "epoch": 1.95166557805356, + "grad_norm": 1.227940764086474, + "learning_rate": 0.00023678284825868912, + "loss": 7.7629, + "step": 20916 + }, + { + "epoch": 1.951758887748437, + "grad_norm": 1.1173754130822056, + "learning_rate": 0.00023677667878333915, + "loss": 7.776, + "step": 20917 + }, + { + "epoch": 1.9518521974433143, + "grad_norm": 1.3333187393288695e+32, + "learning_rate": 0.00023677050908734183, + "loss": 7.5055, + "step": 20918 + }, + { + "epoch": 1.9519455071381917, + "grad_norm": 2.4260969498524925, + "learning_rate": 0.00023676433917071277, + "loss": 7.4196, + "step": 20919 + }, + { + "epoch": 1.952038816833069, + "grad_norm": 2.463924586258529, + "learning_rate": 0.0002367581690334677, + "loss": 7.232, + "step": 20920 + }, + { + "epoch": 1.952132126527946, + "grad_norm": 3.176272042532226, + "learning_rate": 0.0002367519986756223, + "loss": 7.9086, + "step": 20921 + }, + { + "epoch": 1.9522254362228235, + "grad_norm": 1.4184143325664431, + "learning_rate": 0.0002367458280971923, + "loss": 7.3975, + "step": 20922 + }, + { + "epoch": 1.952318745917701, + "grad_norm": 1.0391801870630646, + "learning_rate": 0.00023673965729819336, + "loss": 7.3962, + "step": 20923 + }, + { + "epoch": 1.9524120556125781, + "grad_norm": 1.1564802653070223, + "learning_rate": 0.00023673348627864112, + "loss": 7.6581, + "step": 20924 + }, + { + "epoch": 1.9525053653074553, + "grad_norm": 1.3092383757774626, + "learning_rate": 0.00023672731503855134, + "loss": 7.7442, + "step": 20925 + }, + { + "epoch": 1.9525986750023328, + "grad_norm": 1.327425989934848, + "learning_rate": 0.00023672114357793967, + "loss": 7.3947, + "step": 20926 + }, + { + "epoch": 1.9526919846972102, + "grad_norm": 1.6323124574442447, + "learning_rate": 0.00023671497189682185, + "loss": 7.45, + "step": 20927 + }, + { + "epoch": 1.9527852943920874, + "grad_norm": 1.177648490932018, + "learning_rate": 0.00023670879999521354, + "loss": 7.3516, + "step": 20928 + }, + { + "epoch": 1.9528786040869646, + "grad_norm": 1.3816063266663084e+31, + "learning_rate": 0.0002367026278731304, + "loss": 7.4839, + "step": 20929 + }, + { + "epoch": 1.952971913781842, + "grad_norm": 1.6921942525171891, + "learning_rate": 0.00023669645553058818, + "loss": 7.2834, + "step": 20930 + }, + { + "epoch": 1.9530652234767192, + "grad_norm": 1.022744115590551, + "learning_rate": 0.00023669028296760257, + "loss": 7.5202, + "step": 20931 + }, + { + "epoch": 1.9531585331715964, + "grad_norm": 1.1688204702518254, + "learning_rate": 0.00023668411018418925, + "loss": 7.3507, + "step": 20932 + }, + { + "epoch": 1.9532518428664738, + "grad_norm": 8.089300315695854e+33, + "learning_rate": 0.0002366779371803639, + "loss": 7.456, + "step": 20933 + }, + { + "epoch": 1.9533451525613512, + "grad_norm": 4.529694470719277e+32, + "learning_rate": 0.00023667176395614225, + "loss": 7.2852, + "step": 20934 + }, + { + "epoch": 1.9534384622562284, + "grad_norm": 1.4357491737449375, + "learning_rate": 0.00023666559051153995, + "loss": 7.7237, + "step": 20935 + }, + { + "epoch": 1.9535317719511056, + "grad_norm": 1.5131571010479805, + "learning_rate": 0.00023665941684657273, + "loss": 7.3116, + "step": 20936 + }, + { + "epoch": 1.953625081645983, + "grad_norm": 6.083049558742625e+32, + "learning_rate": 0.00023665324296125634, + "loss": 7.2038, + "step": 20937 + }, + { + "epoch": 1.9537183913408604, + "grad_norm": 1.077130347658474, + "learning_rate": 0.00023664706885560637, + "loss": 7.4017, + "step": 20938 + }, + { + "epoch": 1.9538117010357376, + "grad_norm": 1.3647455914650262, + "learning_rate": 0.00023664089452963855, + "loss": 7.2934, + "step": 20939 + }, + { + "epoch": 1.9539050107306148, + "grad_norm": 1.2339643357770367, + "learning_rate": 0.00023663471998336866, + "loss": 7.3816, + "step": 20940 + }, + { + "epoch": 1.9539983204254923, + "grad_norm": 1.9186847850737465, + "learning_rate": 0.0002366285452168123, + "loss": 7.5557, + "step": 20941 + }, + { + "epoch": 1.9540916301203695, + "grad_norm": 1.0892780361178311, + "learning_rate": 0.00023662237022998518, + "loss": 7.1023, + "step": 20942 + }, + { + "epoch": 1.9541849398152467, + "grad_norm": 2.0600471580217343, + "learning_rate": 0.00023661619502290309, + "loss": 7.6032, + "step": 20943 + }, + { + "epoch": 1.954278249510124, + "grad_norm": 1.661548184276727, + "learning_rate": 0.00023661001959558167, + "loss": 7.5996, + "step": 20944 + }, + { + "epoch": 1.9543715592050015, + "grad_norm": 1.2337940353375454, + "learning_rate": 0.0002366038439480366, + "loss": 7.6971, + "step": 20945 + }, + { + "epoch": 1.9544648688998787, + "grad_norm": 1.070530957531614, + "learning_rate": 0.0002365976680802836, + "loss": 7.4733, + "step": 20946 + }, + { + "epoch": 1.954558178594756, + "grad_norm": 1.394258279376721, + "learning_rate": 0.00023659149199233844, + "loss": 7.4173, + "step": 20947 + }, + { + "epoch": 1.9546514882896333, + "grad_norm": 2.2890311344985723, + "learning_rate": 0.0002365853156842167, + "loss": 7.1497, + "step": 20948 + }, + { + "epoch": 1.9547447979845107, + "grad_norm": 1.6108137393666802e+31, + "learning_rate": 0.00023657913915593417, + "loss": 7.5369, + "step": 20949 + }, + { + "epoch": 1.954838107679388, + "grad_norm": 1.658510012780904, + "learning_rate": 0.00023657296240750658, + "loss": 7.2766, + "step": 20950 + }, + { + "epoch": 1.9549314173742651, + "grad_norm": 1.8228818170544364e+32, + "learning_rate": 0.00023656678543894951, + "loss": 7.1545, + "step": 20951 + }, + { + "epoch": 1.9550247270691425, + "grad_norm": 1.4634652218772237, + "learning_rate": 0.0002365606082502788, + "loss": 7.4401, + "step": 20952 + }, + { + "epoch": 1.9551180367640197, + "grad_norm": 0.9913306037078171, + "learning_rate": 0.00023655443084151013, + "loss": 7.5024, + "step": 20953 + }, + { + "epoch": 1.955211346458897, + "grad_norm": 1.1777832894054894, + "learning_rate": 0.00023654825321265915, + "loss": 7.6165, + "step": 20954 + }, + { + "epoch": 1.9553046561537744, + "grad_norm": 1.2560118892603995, + "learning_rate": 0.00023654207536374157, + "loss": 7.4522, + "step": 20955 + }, + { + "epoch": 1.9553979658486518, + "grad_norm": 1.4305586949497642, + "learning_rate": 0.00023653589729477316, + "loss": 7.447, + "step": 20956 + }, + { + "epoch": 1.955491275543529, + "grad_norm": 1.1421685086052409, + "learning_rate": 0.00023652971900576963, + "loss": 7.1184, + "step": 20957 + }, + { + "epoch": 1.9555845852384062, + "grad_norm": 2.345583365566771e+32, + "learning_rate": 0.0002365235404967466, + "loss": 7.5773, + "step": 20958 + }, + { + "epoch": 1.9556778949332836, + "grad_norm": 1.5187837281536012, + "learning_rate": 0.00023651736176771984, + "loss": 7.5045, + "step": 20959 + }, + { + "epoch": 1.955771204628161, + "grad_norm": 1.3302362198817874, + "learning_rate": 0.0002365111828187051, + "loss": 7.7193, + "step": 20960 + }, + { + "epoch": 1.9558645143230382, + "grad_norm": 1.1599389497992552, + "learning_rate": 0.000236505003649718, + "loss": 7.4785, + "step": 20961 + }, + { + "epoch": 1.9559578240179154, + "grad_norm": 8.491357518514162e+30, + "learning_rate": 0.00023649882426077432, + "loss": 7.9448, + "step": 20962 + }, + { + "epoch": 1.9560511337127928, + "grad_norm": 1.0901698586855846, + "learning_rate": 0.0002364926446518898, + "loss": 7.5629, + "step": 20963 + }, + { + "epoch": 1.95614444340767, + "grad_norm": 2.708047310044087, + "learning_rate": 0.00023648646482308, + "loss": 7.1258, + "step": 20964 + }, + { + "epoch": 1.9562377531025472, + "grad_norm": 2.120053985162735, + "learning_rate": 0.00023648028477436082, + "loss": 7.1083, + "step": 20965 + }, + { + "epoch": 1.9563310627974246, + "grad_norm": 1.1263087964956162, + "learning_rate": 0.00023647410450574789, + "loss": 7.4176, + "step": 20966 + }, + { + "epoch": 1.956424372492302, + "grad_norm": 1.036732175487355, + "learning_rate": 0.0002364679240172569, + "loss": 7.3226, + "step": 20967 + }, + { + "epoch": 1.9565176821871793, + "grad_norm": 1.678304910437662, + "learning_rate": 0.00023646174330890362, + "loss": 7.5051, + "step": 20968 + }, + { + "epoch": 1.9566109918820564, + "grad_norm": 1.710773778545051, + "learning_rate": 0.00023645556238070372, + "loss": 7.1749, + "step": 20969 + }, + { + "epoch": 1.9567043015769339, + "grad_norm": 2.538387593934068, + "learning_rate": 0.00023644938123267294, + "loss": 7.4531, + "step": 20970 + }, + { + "epoch": 1.9567976112718113, + "grad_norm": 1.203714814615317, + "learning_rate": 0.00023644319986482697, + "loss": 7.3603, + "step": 20971 + }, + { + "epoch": 1.9568909209666885, + "grad_norm": 1.217428168216085, + "learning_rate": 0.0002364370182771816, + "loss": 7.5832, + "step": 20972 + }, + { + "epoch": 1.9569842306615657, + "grad_norm": 1.0308790862648112, + "learning_rate": 0.00023643083646975244, + "loss": 7.5634, + "step": 20973 + }, + { + "epoch": 1.957077540356443, + "grad_norm": 1.0456328780753712, + "learning_rate": 0.0002364246544425553, + "loss": 7.6474, + "step": 20974 + }, + { + "epoch": 1.9571708500513203, + "grad_norm": 1.5781661132580274, + "learning_rate": 0.00023641847219560584, + "loss": 7.6306, + "step": 20975 + }, + { + "epoch": 1.9572641597461975, + "grad_norm": 1.4965554287778797, + "learning_rate": 0.00023641228972891986, + "loss": 7.4145, + "step": 20976 + }, + { + "epoch": 1.957357469441075, + "grad_norm": 1.1769583406504966, + "learning_rate": 0.00023640610704251294, + "loss": 7.4692, + "step": 20977 + }, + { + "epoch": 1.9574507791359523, + "grad_norm": 2.9504648314612925, + "learning_rate": 0.00023639992413640093, + "loss": 6.9094, + "step": 20978 + }, + { + "epoch": 1.9575440888308295, + "grad_norm": 1.5478745751010123, + "learning_rate": 0.00023639374101059953, + "loss": 7.2918, + "step": 20979 + }, + { + "epoch": 1.9576373985257067, + "grad_norm": 4.551278646412208e+31, + "learning_rate": 0.00023638755766512438, + "loss": 7.2488, + "step": 20980 + }, + { + "epoch": 1.9577307082205841, + "grad_norm": 1.0715931029927361, + "learning_rate": 0.00023638137409999133, + "loss": 7.2205, + "step": 20981 + }, + { + "epoch": 1.9578240179154616, + "grad_norm": 1.190681008947762, + "learning_rate": 0.00023637519031521602, + "loss": 7.2785, + "step": 20982 + }, + { + "epoch": 1.9579173276103388, + "grad_norm": 2.260475055153987, + "learning_rate": 0.00023636900631081412, + "loss": 7.3623, + "step": 20983 + }, + { + "epoch": 1.958010637305216, + "grad_norm": 1.8850839632027236, + "learning_rate": 0.00023636282208680152, + "loss": 7.4467, + "step": 20984 + }, + { + "epoch": 1.9581039470000934, + "grad_norm": 1.5730726181454546, + "learning_rate": 0.00023635663764319374, + "loss": 7.1878, + "step": 20985 + }, + { + "epoch": 1.9581972566949706, + "grad_norm": 1.4504614080311158, + "learning_rate": 0.0002363504529800067, + "loss": 7.4109, + "step": 20986 + }, + { + "epoch": 1.9582905663898478, + "grad_norm": 1.7492902941497497, + "learning_rate": 0.000236344268097256, + "loss": 7.5526, + "step": 20987 + }, + { + "epoch": 1.9583838760847252, + "grad_norm": 1.0961747929255574, + "learning_rate": 0.00023633808299495742, + "loss": 7.4285, + "step": 20988 + }, + { + "epoch": 1.9584771857796026, + "grad_norm": 7.322451169813156e+32, + "learning_rate": 0.00023633189767312664, + "loss": 7.5103, + "step": 20989 + }, + { + "epoch": 1.9585704954744798, + "grad_norm": 4.308930499349762e+31, + "learning_rate": 0.00023632571213177942, + "loss": 7.5061, + "step": 20990 + }, + { + "epoch": 1.958663805169357, + "grad_norm": 7.10653069057815e+30, + "learning_rate": 0.00023631952637093151, + "loss": 7.2578, + "step": 20991 + }, + { + "epoch": 1.9587571148642344, + "grad_norm": 2.4226323921137606e+30, + "learning_rate": 0.0002363133403905986, + "loss": 7.6997, + "step": 20992 + }, + { + "epoch": 1.9588504245591118, + "grad_norm": 1.1504280242998177, + "learning_rate": 0.00023630715419079644, + "loss": 7.8467, + "step": 20993 + }, + { + "epoch": 1.958943734253989, + "grad_norm": 1.0823903708464535e+31, + "learning_rate": 0.00023630096777154075, + "loss": 7.4285, + "step": 20994 + }, + { + "epoch": 1.9590370439488662, + "grad_norm": 1.6610430037435642, + "learning_rate": 0.00023629478113284725, + "loss": 7.549, + "step": 20995 + }, + { + "epoch": 1.9591303536437437, + "grad_norm": 1.8871831442649842, + "learning_rate": 0.0002362885942747317, + "loss": 7.8402, + "step": 20996 + }, + { + "epoch": 1.9592236633386209, + "grad_norm": 1.192563160601927, + "learning_rate": 0.0002362824071972098, + "loss": 7.7437, + "step": 20997 + }, + { + "epoch": 1.959316973033498, + "grad_norm": 1.0800828694713172, + "learning_rate": 0.00023627621990029727, + "loss": 7.3067, + "step": 20998 + }, + { + "epoch": 1.9594102827283755, + "grad_norm": 1.1662930518575585, + "learning_rate": 0.00023627003238400996, + "loss": 7.444, + "step": 20999 + }, + { + "epoch": 1.959503592423253, + "grad_norm": 1.0521814616907528, + "learning_rate": 0.00023626384464836347, + "loss": 7.382, + "step": 21000 + }, + { + "epoch": 1.95959690211813, + "grad_norm": 1.416443026806098, + "learning_rate": 0.00023625765669337356, + "loss": 7.5536, + "step": 21001 + }, + { + "epoch": 1.9596902118130073, + "grad_norm": 1.6244063651944194, + "learning_rate": 0.00023625146851905597, + "loss": 7.544, + "step": 21002 + }, + { + "epoch": 1.9597835215078847, + "grad_norm": 2.126862393281754, + "learning_rate": 0.0002362452801254265, + "loss": 7.7148, + "step": 21003 + }, + { + "epoch": 1.9598768312027621, + "grad_norm": 1.496072682365826, + "learning_rate": 0.00023623909151250077, + "loss": 7.0728, + "step": 21004 + }, + { + "epoch": 1.959970140897639, + "grad_norm": 1.15386158984226, + "learning_rate": 0.0002362329026802946, + "loss": 7.633, + "step": 21005 + }, + { + "epoch": 1.9600634505925165, + "grad_norm": 9.549954967003507e+29, + "learning_rate": 0.00023622671362882368, + "loss": 7.4309, + "step": 21006 + }, + { + "epoch": 1.960156760287394, + "grad_norm": 1.092040375887177, + "learning_rate": 0.00023622052435810382, + "loss": 7.5433, + "step": 21007 + }, + { + "epoch": 1.9602500699822711, + "grad_norm": 1.0568842862268055, + "learning_rate": 0.00023621433486815065, + "loss": 7.6247, + "step": 21008 + }, + { + "epoch": 1.9603433796771483, + "grad_norm": 1.074678095113971, + "learning_rate": 0.00023620814515898004, + "loss": 7.7322, + "step": 21009 + }, + { + "epoch": 1.9604366893720258, + "grad_norm": 1.938001649841461, + "learning_rate": 0.0002362019552306076, + "loss": 7.3771, + "step": 21010 + }, + { + "epoch": 1.9605299990669032, + "grad_norm": 1.3371929110474023, + "learning_rate": 0.0002361957650830491, + "loss": 7.2376, + "step": 21011 + }, + { + "epoch": 1.9606233087617804, + "grad_norm": 1.5372079436901955, + "learning_rate": 0.00023618957471632034, + "loss": 7.3704, + "step": 21012 + }, + { + "epoch": 1.9607166184566576, + "grad_norm": 1.1504638903541886, + "learning_rate": 0.00023618338413043704, + "loss": 7.4555, + "step": 21013 + }, + { + "epoch": 1.960809928151535, + "grad_norm": 1.423048179076207, + "learning_rate": 0.00023617719332541488, + "loss": 7.7209, + "step": 21014 + }, + { + "epoch": 1.9609032378464124, + "grad_norm": 2.4006301974574344, + "learning_rate": 0.00023617100230126966, + "loss": 7.8865, + "step": 21015 + }, + { + "epoch": 1.9609965475412894, + "grad_norm": 6.05320291446166e+31, + "learning_rate": 0.00023616481105801713, + "loss": 7.8371, + "step": 21016 + }, + { + "epoch": 1.9610898572361668, + "grad_norm": 1.2533494272816659, + "learning_rate": 0.00023615861959567298, + "loss": 7.6247, + "step": 21017 + }, + { + "epoch": 1.9611831669310442, + "grad_norm": 1.0801570504305253, + "learning_rate": 0.000236152427914253, + "loss": 7.5618, + "step": 21018 + }, + { + "epoch": 1.9612764766259214, + "grad_norm": 2.099854036338738, + "learning_rate": 0.00023614623601377288, + "loss": 7.2058, + "step": 21019 + }, + { + "epoch": 1.9613697863207986, + "grad_norm": 2.798025485130558e+29, + "learning_rate": 0.00023614004389424842, + "loss": 7.3889, + "step": 21020 + }, + { + "epoch": 1.961463096015676, + "grad_norm": 2.0660950488228735, + "learning_rate": 0.0002361338515556954, + "loss": 7.346, + "step": 21021 + }, + { + "epoch": 1.9615564057105535, + "grad_norm": 1.1995887383914619, + "learning_rate": 0.00023612765899812943, + "loss": 7.3945, + "step": 21022 + }, + { + "epoch": 1.9616497154054306, + "grad_norm": 1.70069767951905, + "learning_rate": 0.00023612146622156637, + "loss": 7.5882, + "step": 21023 + }, + { + "epoch": 1.9617430251003078, + "grad_norm": 1.46929629859166e+31, + "learning_rate": 0.00023611527322602192, + "loss": 7.2968, + "step": 21024 + }, + { + "epoch": 1.9618363347951853, + "grad_norm": 1.0135831192796156, + "learning_rate": 0.00023610908001151187, + "loss": 7.5119, + "step": 21025 + }, + { + "epoch": 1.9619296444900625, + "grad_norm": 1.0720904182342585e+31, + "learning_rate": 0.00023610288657805194, + "loss": 7.453, + "step": 21026 + }, + { + "epoch": 1.9620229541849397, + "grad_norm": 0.9272911528696617, + "learning_rate": 0.00023609669292565784, + "loss": 7.3905, + "step": 21027 + }, + { + "epoch": 1.962116263879817, + "grad_norm": 0.9390161457495076, + "learning_rate": 0.00023609049905434535, + "loss": 7.213, + "step": 21028 + }, + { + "epoch": 1.9622095735746945, + "grad_norm": 0.9204450034369802, + "learning_rate": 0.00023608430496413027, + "loss": 7.3614, + "step": 21029 + }, + { + "epoch": 1.9623028832695717, + "grad_norm": 4.261756273830767e+30, + "learning_rate": 0.00023607811065502824, + "loss": 7.538, + "step": 21030 + }, + { + "epoch": 1.962396192964449, + "grad_norm": 1.1444851204690525, + "learning_rate": 0.00023607191612705513, + "loss": 7.5914, + "step": 21031 + }, + { + "epoch": 1.9624895026593263, + "grad_norm": 1.0491711997781858, + "learning_rate": 0.00023606572138022661, + "loss": 7.4726, + "step": 21032 + }, + { + "epoch": 1.9625828123542037, + "grad_norm": 1.3302563340870928, + "learning_rate": 0.00023605952641455843, + "loss": 7.4359, + "step": 21033 + }, + { + "epoch": 1.962676122049081, + "grad_norm": 1.0859599819235957, + "learning_rate": 0.00023605333123006642, + "loss": 7.4747, + "step": 21034 + }, + { + "epoch": 1.9627694317439581, + "grad_norm": 1.0926588468737437e+31, + "learning_rate": 0.00023604713582676626, + "loss": 7.4454, + "step": 21035 + }, + { + "epoch": 1.9628627414388355, + "grad_norm": 2.1124766801535526, + "learning_rate": 0.0002360409402046737, + "loss": 7.3329, + "step": 21036 + }, + { + "epoch": 1.9629560511337127, + "grad_norm": 1.4967411549490475, + "learning_rate": 0.00023603474436380452, + "loss": 7.8371, + "step": 21037 + }, + { + "epoch": 1.96304936082859, + "grad_norm": 6.641502650588449e+31, + "learning_rate": 0.00023602854830417447, + "loss": 7.5107, + "step": 21038 + }, + { + "epoch": 1.9631426705234674, + "grad_norm": 1.1330351307252019e+31, + "learning_rate": 0.00023602235202579928, + "loss": 7.5221, + "step": 21039 + }, + { + "epoch": 1.9632359802183448, + "grad_norm": 1.205696256240865, + "learning_rate": 0.0002360161555286948, + "loss": 7.4326, + "step": 21040 + }, + { + "epoch": 1.963329289913222, + "grad_norm": 1.74937740203575e+31, + "learning_rate": 0.0002360099588128767, + "loss": 7.2555, + "step": 21041 + }, + { + "epoch": 1.9634225996080992, + "grad_norm": 0.970943150948771, + "learning_rate": 0.00023600376187836073, + "loss": 7.4799, + "step": 21042 + }, + { + "epoch": 1.9635159093029766, + "grad_norm": 1.3987211583267145, + "learning_rate": 0.00023599756472516265, + "loss": 7.2318, + "step": 21043 + }, + { + "epoch": 1.963609218997854, + "grad_norm": 1.0678254465459838, + "learning_rate": 0.0002359913673532983, + "loss": 7.2833, + "step": 21044 + }, + { + "epoch": 1.9637025286927312, + "grad_norm": 2.8879345091590128e+31, + "learning_rate": 0.00023598516976278334, + "loss": 7.4609, + "step": 21045 + }, + { + "epoch": 1.9637958383876084, + "grad_norm": 1.08876153922978, + "learning_rate": 0.00023597897195363354, + "loss": 7.4478, + "step": 21046 + }, + { + "epoch": 1.9638891480824858, + "grad_norm": 0.9779372205639952, + "learning_rate": 0.0002359727739258647, + "loss": 7.2629, + "step": 21047 + }, + { + "epoch": 1.963982457777363, + "grad_norm": 9.58556166581395, + "learning_rate": 0.0002359665756794926, + "loss": 7.2458, + "step": 21048 + }, + { + "epoch": 1.9640757674722402, + "grad_norm": 1.2061068899379819, + "learning_rate": 0.00023596037721453292, + "loss": 7.4719, + "step": 21049 + }, + { + "epoch": 1.9641690771671176, + "grad_norm": 3.3508813267457536e+31, + "learning_rate": 0.00023595417853100145, + "loss": 7.6851, + "step": 21050 + }, + { + "epoch": 1.964262386861995, + "grad_norm": 1.1005563745850828, + "learning_rate": 0.000235947979628914, + "loss": 7.6552, + "step": 21051 + }, + { + "epoch": 1.9643556965568723, + "grad_norm": 1.3253317587300668, + "learning_rate": 0.00023594178050828624, + "loss": 7.7989, + "step": 21052 + }, + { + "epoch": 1.9644490062517495, + "grad_norm": 1.6827019537772452, + "learning_rate": 0.00023593558116913408, + "loss": 7.1847, + "step": 21053 + }, + { + "epoch": 1.9645423159466269, + "grad_norm": 5.591089283084628e+30, + "learning_rate": 0.00023592938161147314, + "loss": 7.279, + "step": 21054 + }, + { + "epoch": 1.9646356256415043, + "grad_norm": 1.2360185610568595, + "learning_rate": 0.00023592318183531926, + "loss": 7.1836, + "step": 21055 + }, + { + "epoch": 1.9647289353363815, + "grad_norm": 1.010810295823229, + "learning_rate": 0.00023591698184068818, + "loss": 7.2762, + "step": 21056 + }, + { + "epoch": 1.9648222450312587, + "grad_norm": 1.0283397422021088, + "learning_rate": 0.0002359107816275956, + "loss": 7.2092, + "step": 21057 + }, + { + "epoch": 1.964915554726136, + "grad_norm": 2.1013921592580258e+30, + "learning_rate": 0.00023590458119605742, + "loss": 7.5499, + "step": 21058 + }, + { + "epoch": 1.9650088644210133, + "grad_norm": 1.7031511971030011, + "learning_rate": 0.00023589838054608933, + "loss": 7.3835, + "step": 21059 + }, + { + "epoch": 1.9651021741158905, + "grad_norm": 7.203693884066477e+30, + "learning_rate": 0.00023589217967770712, + "loss": 7.0761, + "step": 21060 + }, + { + "epoch": 1.965195483810768, + "grad_norm": 1.4904261604698892, + "learning_rate": 0.0002358859785909265, + "loss": 7.5318, + "step": 21061 + }, + { + "epoch": 1.9652887935056453, + "grad_norm": 1.1038510277453049, + "learning_rate": 0.00023587977728576332, + "loss": 7.4256, + "step": 21062 + }, + { + "epoch": 1.9653821032005225, + "grad_norm": 3.136945871417609e+30, + "learning_rate": 0.00023587357576223327, + "loss": 7.4796, + "step": 21063 + }, + { + "epoch": 1.9654754128953997, + "grad_norm": 1.4294796791959157, + "learning_rate": 0.00023586737402035216, + "loss": 7.2649, + "step": 21064 + }, + { + "epoch": 1.9655687225902772, + "grad_norm": 2.149833193318968e+31, + "learning_rate": 0.00023586117206013575, + "loss": 7.6246, + "step": 21065 + }, + { + "epoch": 1.9656620322851546, + "grad_norm": 1.4364553921954542, + "learning_rate": 0.00023585496988159983, + "loss": 7.8839, + "step": 21066 + }, + { + "epoch": 1.9657553419800318, + "grad_norm": 4.816361694093504e+31, + "learning_rate": 0.00023584876748476016, + "loss": 7.6903, + "step": 21067 + }, + { + "epoch": 1.965848651674909, + "grad_norm": 1.3346865432429584, + "learning_rate": 0.00023584256486963246, + "loss": 7.4713, + "step": 21068 + }, + { + "epoch": 1.9659419613697864, + "grad_norm": 1.6390328838653527, + "learning_rate": 0.00023583636203623257, + "loss": 7.3478, + "step": 21069 + }, + { + "epoch": 1.9660352710646636, + "grad_norm": 1.2674721975280405, + "learning_rate": 0.0002358301589845763, + "loss": 7.686, + "step": 21070 + }, + { + "epoch": 1.9661285807595408, + "grad_norm": 1.3872722146505492, + "learning_rate": 0.00023582395571467927, + "loss": 7.2991, + "step": 21071 + }, + { + "epoch": 1.9662218904544182, + "grad_norm": 1.5228672464200272, + "learning_rate": 0.00023581775222655734, + "loss": 7.1438, + "step": 21072 + }, + { + "epoch": 1.9663152001492956, + "grad_norm": 1.0421420512246964, + "learning_rate": 0.00023581154852022636, + "loss": 7.4189, + "step": 21073 + }, + { + "epoch": 1.9664085098441728, + "grad_norm": 2.5512372588649828, + "learning_rate": 0.000235805344595702, + "loss": 7.8487, + "step": 21074 + }, + { + "epoch": 1.96650181953905, + "grad_norm": 1.4436329570915387, + "learning_rate": 0.00023579914045300006, + "loss": 7.3471, + "step": 21075 + }, + { + "epoch": 1.9665951292339274, + "grad_norm": 1.9027021623468237, + "learning_rate": 0.0002357929360921363, + "loss": 7.6904, + "step": 21076 + }, + { + "epoch": 1.9666884389288048, + "grad_norm": 1.084541326344638, + "learning_rate": 0.00023578673151312653, + "loss": 7.559, + "step": 21077 + }, + { + "epoch": 1.966781748623682, + "grad_norm": 1.0516381040094442, + "learning_rate": 0.00023578052671598655, + "loss": 7.2703, + "step": 21078 + }, + { + "epoch": 1.9668750583185592, + "grad_norm": 1.3391518152292217e+30, + "learning_rate": 0.00023577432170073206, + "loss": 7.2223, + "step": 21079 + }, + { + "epoch": 1.9669683680134367, + "grad_norm": 1.1989785013139993, + "learning_rate": 0.0002357681164673789, + "loss": 7.3908, + "step": 21080 + }, + { + "epoch": 1.9670616777083139, + "grad_norm": 9.369574445469383e+29, + "learning_rate": 0.00023576191101594283, + "loss": 7.4067, + "step": 21081 + }, + { + "epoch": 1.967154987403191, + "grad_norm": 1.7265095517597628, + "learning_rate": 0.0002357557053464396, + "loss": 7.9294, + "step": 21082 + }, + { + "epoch": 1.9672482970980685, + "grad_norm": 1.3756274462242453, + "learning_rate": 0.00023574949945888503, + "loss": 7.6752, + "step": 21083 + }, + { + "epoch": 1.967341606792946, + "grad_norm": 1.5010138084343416e+31, + "learning_rate": 0.00023574329335329487, + "loss": 7.3549, + "step": 21084 + }, + { + "epoch": 1.967434916487823, + "grad_norm": 4.7014267768590754e+29, + "learning_rate": 0.00023573708702968494, + "loss": 7.6369, + "step": 21085 + }, + { + "epoch": 1.9675282261827003, + "grad_norm": 2.825820052347792e+30, + "learning_rate": 0.00023573088048807098, + "loss": 7.4424, + "step": 21086 + }, + { + "epoch": 1.9676215358775777, + "grad_norm": 1.296943058268145, + "learning_rate": 0.00023572467372846877, + "loss": 7.5931, + "step": 21087 + }, + { + "epoch": 1.9677148455724551, + "grad_norm": 2.844501312708723e+30, + "learning_rate": 0.00023571846675089411, + "loss": 7.2885, + "step": 21088 + }, + { + "epoch": 1.9678081552673323, + "grad_norm": 0.9575876456926264, + "learning_rate": 0.0002357122595553628, + "loss": 7.2843, + "step": 21089 + }, + { + "epoch": 1.9679014649622095, + "grad_norm": 1.232946984516723, + "learning_rate": 0.0002357060521418906, + "loss": 7.7108, + "step": 21090 + }, + { + "epoch": 1.967994774657087, + "grad_norm": 1.084522009226788, + "learning_rate": 0.00023569984451049332, + "loss": 7.5998, + "step": 21091 + }, + { + "epoch": 1.9680880843519641, + "grad_norm": 1.5993985057207576, + "learning_rate": 0.0002356936366611867, + "loss": 7.2058, + "step": 21092 + }, + { + "epoch": 1.9681813940468413, + "grad_norm": 1.070804844949053, + "learning_rate": 0.00023568742859398655, + "loss": 7.4806, + "step": 21093 + }, + { + "epoch": 1.9682747037417188, + "grad_norm": 1.499715607712966, + "learning_rate": 0.00023568122030890866, + "loss": 7.0398, + "step": 21094 + }, + { + "epoch": 1.9683680134365962, + "grad_norm": 1.2080640058496663, + "learning_rate": 0.00023567501180596877, + "loss": 7.4786, + "step": 21095 + }, + { + "epoch": 1.9684613231314734, + "grad_norm": 0.9647152234216908, + "learning_rate": 0.0002356688030851827, + "loss": 7.0562, + "step": 21096 + }, + { + "epoch": 1.9685546328263506, + "grad_norm": 1.2393223043041766, + "learning_rate": 0.0002356625941465663, + "loss": 7.2566, + "step": 21097 + }, + { + "epoch": 1.968647942521228, + "grad_norm": 1.7930054842508227, + "learning_rate": 0.00023565638499013528, + "loss": 7.4698, + "step": 21098 + }, + { + "epoch": 1.9687412522161054, + "grad_norm": 1.4968656055609324, + "learning_rate": 0.00023565017561590543, + "loss": 7.6954, + "step": 21099 + }, + { + "epoch": 1.9688345619109826, + "grad_norm": 2.88319339609831e+28, + "learning_rate": 0.00023564396602389253, + "loss": 7.3674, + "step": 21100 + }, + { + "epoch": 1.9689278716058598, + "grad_norm": 1.2361237853932636, + "learning_rate": 0.00023563775621411246, + "loss": 7.5118, + "step": 21101 + }, + { + "epoch": 1.9690211813007372, + "grad_norm": 1.0256823626021996, + "learning_rate": 0.00023563154618658083, + "loss": 7.6865, + "step": 21102 + }, + { + "epoch": 1.9691144909956144, + "grad_norm": 1.2158092015086577, + "learning_rate": 0.0002356253359413137, + "loss": 7.279, + "step": 21103 + }, + { + "epoch": 1.9692078006904916, + "grad_norm": 1.4667785120151071, + "learning_rate": 0.0002356191254783266, + "loss": 7.2987, + "step": 21104 + }, + { + "epoch": 1.969301110385369, + "grad_norm": 1.0303664122332357, + "learning_rate": 0.00023561291479763544, + "loss": 7.5657, + "step": 21105 + }, + { + "epoch": 1.9693944200802465, + "grad_norm": 1.1052978143004772, + "learning_rate": 0.000235606703899256, + "loss": 7.5236, + "step": 21106 + }, + { + "epoch": 1.9694877297751237, + "grad_norm": 1.0372689196493563, + "learning_rate": 0.0002356004927832041, + "loss": 7.5386, + "step": 21107 + }, + { + "epoch": 1.9695810394700008, + "grad_norm": 2.0506825053918054, + "learning_rate": 0.00023559428144949549, + "loss": 6.8909, + "step": 21108 + }, + { + "epoch": 1.9696743491648783, + "grad_norm": 1.0183624567506362, + "learning_rate": 0.00023558806989814593, + "loss": 7.1954, + "step": 21109 + }, + { + "epoch": 1.9697676588597557, + "grad_norm": 1.7394718178195732, + "learning_rate": 0.00023558185812917135, + "loss": 7.4178, + "step": 21110 + }, + { + "epoch": 1.9698609685546327, + "grad_norm": 1.176575662815971, + "learning_rate": 0.00023557564614258737, + "loss": 7.4365, + "step": 21111 + }, + { + "epoch": 1.96995427824951, + "grad_norm": 1.3365451228540992, + "learning_rate": 0.00023556943393840987, + "loss": 7.4525, + "step": 21112 + }, + { + "epoch": 1.9700475879443875, + "grad_norm": 0.9916910680562545, + "learning_rate": 0.00023556322151665473, + "loss": 7.325, + "step": 21113 + }, + { + "epoch": 1.9701408976392647, + "grad_norm": 1.255017394433439, + "learning_rate": 0.00023555700887733762, + "loss": 7.4914, + "step": 21114 + }, + { + "epoch": 1.970234207334142, + "grad_norm": 7.5484641838366435e+31, + "learning_rate": 0.00023555079602047434, + "loss": 7.3235, + "step": 21115 + }, + { + "epoch": 1.9703275170290193, + "grad_norm": 3.0852962913635e+30, + "learning_rate": 0.00023554458294608078, + "loss": 7.5368, + "step": 21116 + }, + { + "epoch": 1.9704208267238967, + "grad_norm": 1.1406908738892163, + "learning_rate": 0.00023553836965417268, + "loss": 7.6443, + "step": 21117 + }, + { + "epoch": 1.970514136418774, + "grad_norm": 1.3101531757365656, + "learning_rate": 0.00023553215614476584, + "loss": 7.6196, + "step": 21118 + }, + { + "epoch": 1.9706074461136511, + "grad_norm": 20.109482757220665, + "learning_rate": 0.00023552594241787607, + "loss": 7.5937, + "step": 21119 + }, + { + "epoch": 1.9707007558085285, + "grad_norm": 0.9512733642990431, + "learning_rate": 0.00023551972847351914, + "loss": 7.6282, + "step": 21120 + }, + { + "epoch": 1.970794065503406, + "grad_norm": 76.10113447648492, + "learning_rate": 0.0002355135143117109, + "loss": 7.3497, + "step": 21121 + }, + { + "epoch": 1.970887375198283, + "grad_norm": 0.9627123689518942, + "learning_rate": 0.00023550729993246713, + "loss": 7.595, + "step": 21122 + }, + { + "epoch": 1.9709806848931604, + "grad_norm": 1.007479586193879, + "learning_rate": 0.00023550108533580358, + "loss": 7.504, + "step": 21123 + }, + { + "epoch": 1.9710739945880378, + "grad_norm": 1.3791604571785414, + "learning_rate": 0.00023549487052173612, + "loss": 7.9029, + "step": 21124 + }, + { + "epoch": 1.971167304282915, + "grad_norm": 2.4456806251987415, + "learning_rate": 0.00023548865549028052, + "loss": 7.403, + "step": 21125 + }, + { + "epoch": 1.9712606139777922, + "grad_norm": 1.1762498474464196, + "learning_rate": 0.00023548244024145263, + "loss": 7.4333, + "step": 21126 + }, + { + "epoch": 1.9713539236726696, + "grad_norm": 1.2408625026772664, + "learning_rate": 0.0002354762247752682, + "loss": 7.5808, + "step": 21127 + }, + { + "epoch": 1.971447233367547, + "grad_norm": 0.9696051207823693, + "learning_rate": 0.000235470009091743, + "loss": 7.5046, + "step": 21128 + }, + { + "epoch": 1.9715405430624242, + "grad_norm": 1.4192623650699474, + "learning_rate": 0.00023546379319089295, + "loss": 7.0927, + "step": 21129 + }, + { + "epoch": 1.9716338527573014, + "grad_norm": 1.6266299480598637e+32, + "learning_rate": 0.00023545757707273376, + "loss": 7.412, + "step": 21130 + }, + { + "epoch": 1.9717271624521788, + "grad_norm": 1.4038780943666852e+30, + "learning_rate": 0.0002354513607372813, + "loss": 7.6511, + "step": 21131 + }, + { + "epoch": 1.971820472147056, + "grad_norm": 1.0829571369431839, + "learning_rate": 0.0002354451441845513, + "loss": 7.4566, + "step": 21132 + }, + { + "epoch": 1.9719137818419332, + "grad_norm": 2.3861997822935256e+32, + "learning_rate": 0.00023543892741455962, + "loss": 7.675, + "step": 21133 + }, + { + "epoch": 1.9720070915368106, + "grad_norm": 1.6690851929599886, + "learning_rate": 0.00023543271042732208, + "loss": 7.3541, + "step": 21134 + }, + { + "epoch": 1.972100401231688, + "grad_norm": 1.2542325681025872, + "learning_rate": 0.00023542649322285444, + "loss": 7.7637, + "step": 21135 + }, + { + "epoch": 1.9721937109265653, + "grad_norm": 1.028710196407856, + "learning_rate": 0.00023542027580117253, + "loss": 7.6374, + "step": 21136 + }, + { + "epoch": 1.9722870206214425, + "grad_norm": 1.6713293781726455, + "learning_rate": 0.00023541405816229218, + "loss": 7.2196, + "step": 21137 + }, + { + "epoch": 1.9723803303163199, + "grad_norm": 1.0147497906010272, + "learning_rate": 0.00023540784030622915, + "loss": 7.5531, + "step": 21138 + }, + { + "epoch": 1.9724736400111973, + "grad_norm": 1.1007075969934674, + "learning_rate": 0.0002354016222329993, + "loss": 7.3123, + "step": 21139 + }, + { + "epoch": 1.9725669497060745, + "grad_norm": 1.0563999583353039, + "learning_rate": 0.00023539540394261843, + "loss": 7.5848, + "step": 21140 + }, + { + "epoch": 1.9726602594009517, + "grad_norm": 1.1223099347776755, + "learning_rate": 0.00023538918543510238, + "loss": 7.4686, + "step": 21141 + }, + { + "epoch": 1.972753569095829, + "grad_norm": 4.873542988706695e+30, + "learning_rate": 0.00023538296671046685, + "loss": 7.4604, + "step": 21142 + }, + { + "epoch": 1.9728468787907063, + "grad_norm": 1.0735332237977795, + "learning_rate": 0.00023537674776872775, + "loss": 7.7771, + "step": 21143 + }, + { + "epoch": 1.9729401884855835, + "grad_norm": 1.1150868418676627, + "learning_rate": 0.00023537052860990088, + "loss": 7.7163, + "step": 21144 + }, + { + "epoch": 1.973033498180461, + "grad_norm": 1.1403792187831034, + "learning_rate": 0.00023536430923400207, + "loss": 7.5618, + "step": 21145 + }, + { + "epoch": 1.9731268078753383, + "grad_norm": 9.461297671470193, + "learning_rate": 0.00023535808964104707, + "loss": 7.6443, + "step": 21146 + }, + { + "epoch": 1.9732201175702155, + "grad_norm": 0.9667033286801743, + "learning_rate": 0.00023535186983105177, + "loss": 7.4259, + "step": 21147 + }, + { + "epoch": 1.9733134272650927, + "grad_norm": 1.1458014415350253, + "learning_rate": 0.00023534564980403193, + "loss": 7.6516, + "step": 21148 + }, + { + "epoch": 1.9734067369599702, + "grad_norm": 0.9754479073606065, + "learning_rate": 0.00023533942956000336, + "loss": 7.4333, + "step": 21149 + }, + { + "epoch": 1.9735000466548476, + "grad_norm": 1.2129600602020614, + "learning_rate": 0.00023533320909898192, + "loss": 7.8025, + "step": 21150 + }, + { + "epoch": 1.9735933563497248, + "grad_norm": 2.97940932228518e+31, + "learning_rate": 0.00023532698842098344, + "loss": 7.4086, + "step": 21151 + }, + { + "epoch": 1.973686666044602, + "grad_norm": 1.05683288222358, + "learning_rate": 0.00023532076752602366, + "loss": 7.4035, + "step": 21152 + }, + { + "epoch": 1.9737799757394794, + "grad_norm": 1.106966458360318, + "learning_rate": 0.00023531454641411846, + "loss": 7.4073, + "step": 21153 + }, + { + "epoch": 1.9738732854343566, + "grad_norm": 1.5419674250912134, + "learning_rate": 0.00023530832508528366, + "loss": 7.8572, + "step": 21154 + }, + { + "epoch": 1.9739665951292338, + "grad_norm": 1.3295570504162595, + "learning_rate": 0.000235302103539535, + "loss": 7.5935, + "step": 21155 + }, + { + "epoch": 1.9740599048241112, + "grad_norm": 1.344407859090516, + "learning_rate": 0.00023529588177688838, + "loss": 7.5664, + "step": 21156 + }, + { + "epoch": 1.9741532145189886, + "grad_norm": 1.7340560735976132, + "learning_rate": 0.00023528965979735964, + "loss": 7.2295, + "step": 21157 + }, + { + "epoch": 1.9742465242138658, + "grad_norm": 0.9389208473829743, + "learning_rate": 0.00023528343760096453, + "loss": 7.4536, + "step": 21158 + }, + { + "epoch": 1.974339833908743, + "grad_norm": 1.4296017845229072, + "learning_rate": 0.0002352772151877189, + "loss": 7.5881, + "step": 21159 + }, + { + "epoch": 1.9744331436036204, + "grad_norm": 1.5958131465675092, + "learning_rate": 0.00023527099255763858, + "loss": 7.7475, + "step": 21160 + }, + { + "epoch": 1.9745264532984979, + "grad_norm": 1.2015736306871634, + "learning_rate": 0.00023526476971073938, + "loss": 7.5168, + "step": 21161 + }, + { + "epoch": 1.974619762993375, + "grad_norm": 1.2819797187785078, + "learning_rate": 0.00023525854664703714, + "loss": 7.1795, + "step": 21162 + }, + { + "epoch": 1.9747130726882522, + "grad_norm": 1.2096342447210506, + "learning_rate": 0.00023525232336654768, + "loss": 7.281, + "step": 21163 + }, + { + "epoch": 1.9748063823831297, + "grad_norm": 1.39307757150136, + "learning_rate": 0.0002352460998692868, + "loss": 7.2056, + "step": 21164 + }, + { + "epoch": 1.9748996920780069, + "grad_norm": 1.5723764401835691, + "learning_rate": 0.0002352398761552703, + "loss": 7.7823, + "step": 21165 + }, + { + "epoch": 1.974993001772884, + "grad_norm": 4.340319128948191e+31, + "learning_rate": 0.00023523365222451406, + "loss": 7.4692, + "step": 21166 + }, + { + "epoch": 1.9750863114677615, + "grad_norm": 1.27658625090308, + "learning_rate": 0.0002352274280770339, + "loss": 7.6998, + "step": 21167 + }, + { + "epoch": 1.975179621162639, + "grad_norm": 4.025619224294828e+33, + "learning_rate": 0.0002352212037128457, + "loss": 7.3109, + "step": 21168 + }, + { + "epoch": 1.975272930857516, + "grad_norm": 1.0905736495808955, + "learning_rate": 0.00023521497913196515, + "loss": 7.5965, + "step": 21169 + }, + { + "epoch": 1.9753662405523933, + "grad_norm": 1.107864840538593, + "learning_rate": 0.00023520875433440819, + "loss": 7.1305, + "step": 21170 + }, + { + "epoch": 1.9754595502472707, + "grad_norm": 1.5263828397795751, + "learning_rate": 0.00023520252932019055, + "loss": 7.4671, + "step": 21171 + }, + { + "epoch": 1.9755528599421481, + "grad_norm": 1.2592293552928882, + "learning_rate": 0.00023519630408932817, + "loss": 7.4902, + "step": 21172 + }, + { + "epoch": 1.9756461696370253, + "grad_norm": 0.9176764894758194, + "learning_rate": 0.00023519007864183683, + "loss": 7.4153, + "step": 21173 + }, + { + "epoch": 1.9757394793319025, + "grad_norm": 1.0622764447822137, + "learning_rate": 0.0002351838529777323, + "loss": 7.2776, + "step": 21174 + }, + { + "epoch": 1.97583278902678, + "grad_norm": 0.9869316712988926, + "learning_rate": 0.00023517762709703054, + "loss": 7.5055, + "step": 21175 + }, + { + "epoch": 1.9759260987216571, + "grad_norm": 3.914932140795626e+32, + "learning_rate": 0.00023517140099974727, + "loss": 7.7128, + "step": 21176 + }, + { + "epoch": 1.9760194084165343, + "grad_norm": 1.6296011914994477, + "learning_rate": 0.00023516517468589833, + "loss": 7.1127, + "step": 21177 + }, + { + "epoch": 1.9761127181114118, + "grad_norm": 0.954235462280417, + "learning_rate": 0.00023515894815549963, + "loss": 7.442, + "step": 21178 + }, + { + "epoch": 1.9762060278062892, + "grad_norm": 1.2069734221506694, + "learning_rate": 0.00023515272140856692, + "loss": 7.5708, + "step": 21179 + }, + { + "epoch": 1.9762993375011664, + "grad_norm": 1.0088371529916922, + "learning_rate": 0.00023514649444511608, + "loss": 7.5701, + "step": 21180 + }, + { + "epoch": 1.9763926471960436, + "grad_norm": 1.377448523656238, + "learning_rate": 0.00023514026726516288, + "loss": 7.8714, + "step": 21181 + }, + { + "epoch": 1.976485956890921, + "grad_norm": 0.9848169590196203, + "learning_rate": 0.00023513403986872327, + "loss": 7.4609, + "step": 21182 + }, + { + "epoch": 1.9765792665857984, + "grad_norm": 1.159413740981909, + "learning_rate": 0.000235127812255813, + "loss": 7.3253, + "step": 21183 + }, + { + "epoch": 1.9766725762806756, + "grad_norm": 5.0994925911894444e+33, + "learning_rate": 0.00023512158442644788, + "loss": 7.4128, + "step": 21184 + }, + { + "epoch": 1.9767658859755528, + "grad_norm": 1.130275411826269, + "learning_rate": 0.0002351153563806438, + "loss": 7.5344, + "step": 21185 + }, + { + "epoch": 1.9768591956704302, + "grad_norm": 1.189166292987583, + "learning_rate": 0.0002351091281184166, + "loss": 7.4846, + "step": 21186 + }, + { + "epoch": 1.9769525053653074, + "grad_norm": 1.3733911184105978e+32, + "learning_rate": 0.00023510289963978208, + "loss": 7.3901, + "step": 21187 + }, + { + "epoch": 1.9770458150601846, + "grad_norm": 0.9701855986093407, + "learning_rate": 0.0002350966709447561, + "loss": 7.5841, + "step": 21188 + }, + { + "epoch": 1.977139124755062, + "grad_norm": 0.9673178452279277, + "learning_rate": 0.00023509044203335453, + "loss": 7.5243, + "step": 21189 + }, + { + "epoch": 1.9772324344499395, + "grad_norm": 1.0593867718626273, + "learning_rate": 0.00023508421290559313, + "loss": 7.5755, + "step": 21190 + }, + { + "epoch": 1.9773257441448167, + "grad_norm": 0.9808045975472062, + "learning_rate": 0.00023507798356148777, + "loss": 7.5565, + "step": 21191 + }, + { + "epoch": 1.9774190538396939, + "grad_norm": 1.0033923943166727, + "learning_rate": 0.00023507175400105433, + "loss": 7.5044, + "step": 21192 + }, + { + "epoch": 1.9775123635345713, + "grad_norm": 2.2124253809515747e+34, + "learning_rate": 0.0002350655242243086, + "loss": 7.4058, + "step": 21193 + }, + { + "epoch": 1.9776056732294487, + "grad_norm": 1.5662860260076674, + "learning_rate": 0.0002350592942312664, + "loss": 7.1877, + "step": 21194 + }, + { + "epoch": 1.9776989829243259, + "grad_norm": 1.2357270464862278, + "learning_rate": 0.0002350530640219437, + "loss": 7.1314, + "step": 21195 + }, + { + "epoch": 1.977792292619203, + "grad_norm": 0.9146957538495171, + "learning_rate": 0.00023504683359635618, + "loss": 7.3703, + "step": 21196 + }, + { + "epoch": 1.9778856023140805, + "grad_norm": 1.4107686934637858, + "learning_rate": 0.0002350406029545198, + "loss": 7.5551, + "step": 21197 + }, + { + "epoch": 1.9779789120089577, + "grad_norm": 2.121867402482913, + "learning_rate": 0.00023503437209645034, + "loss": 7.592, + "step": 21198 + }, + { + "epoch": 1.978072221703835, + "grad_norm": 1.462512829189083, + "learning_rate": 0.00023502814102216367, + "loss": 7.6202, + "step": 21199 + }, + { + "epoch": 1.9781655313987123, + "grad_norm": 4.5183773466073744e+33, + "learning_rate": 0.00023502190973167565, + "loss": 7.1106, + "step": 21200 + }, + { + "epoch": 1.9782588410935897, + "grad_norm": 0.9223019088109826, + "learning_rate": 0.00023501567822500208, + "loss": 7.4305, + "step": 21201 + }, + { + "epoch": 1.978352150788467, + "grad_norm": 1.66667798245575, + "learning_rate": 0.00023500944650215879, + "loss": 7.6521, + "step": 21202 + }, + { + "epoch": 1.9784454604833441, + "grad_norm": 0.9488965431898759, + "learning_rate": 0.00023500321456316166, + "loss": 7.496, + "step": 21203 + }, + { + "epoch": 1.9785387701782216, + "grad_norm": 1.8338517763866602, + "learning_rate": 0.00023499698240802658, + "loss": 7.5182, + "step": 21204 + }, + { + "epoch": 1.978632079873099, + "grad_norm": 1.0444948752724579, + "learning_rate": 0.00023499075003676934, + "loss": 7.4665, + "step": 21205 + }, + { + "epoch": 1.9787253895679762, + "grad_norm": 1.1593968833413388, + "learning_rate": 0.00023498451744940576, + "loss": 7.5432, + "step": 21206 + }, + { + "epoch": 1.9788186992628534, + "grad_norm": 2.8353890839821954e+33, + "learning_rate": 0.00023497828464595175, + "loss": 7.4197, + "step": 21207 + }, + { + "epoch": 1.9789120089577308, + "grad_norm": 1.1792631057557263, + "learning_rate": 0.00023497205162642317, + "loss": 7.6298, + "step": 21208 + }, + { + "epoch": 1.979005318652608, + "grad_norm": 1.0982826139022308, + "learning_rate": 0.00023496581839083577, + "loss": 7.6866, + "step": 21209 + }, + { + "epoch": 1.9790986283474852, + "grad_norm": 1.1623994547403929, + "learning_rate": 0.00023495958493920547, + "loss": 7.4713, + "step": 21210 + }, + { + "epoch": 1.9791919380423626, + "grad_norm": 1.3701531937673537, + "learning_rate": 0.00023495335127154818, + "loss": 7.3114, + "step": 21211 + }, + { + "epoch": 1.97928524773724, + "grad_norm": 1.8355743175294208, + "learning_rate": 0.0002349471173878796, + "loss": 7.146, + "step": 21212 + }, + { + "epoch": 1.9793785574321172, + "grad_norm": 1.2160055302899204, + "learning_rate": 0.00023494088328821573, + "loss": 7.3741, + "step": 21213 + }, + { + "epoch": 1.9794718671269944, + "grad_norm": 2.319225789922903e+34, + "learning_rate": 0.00023493464897257233, + "loss": 7.594, + "step": 21214 + }, + { + "epoch": 1.9795651768218718, + "grad_norm": 2.591760608375607e+33, + "learning_rate": 0.0002349284144409653, + "loss": 7.296, + "step": 21215 + }, + { + "epoch": 1.9796584865167492, + "grad_norm": 1.2126156553395813, + "learning_rate": 0.00023492217969341042, + "loss": 7.2967, + "step": 21216 + }, + { + "epoch": 1.9797517962116262, + "grad_norm": 1.5851749998292655, + "learning_rate": 0.00023491594472992361, + "loss": 7.4326, + "step": 21217 + }, + { + "epoch": 1.9798451059065036, + "grad_norm": 0.9548383878019032, + "learning_rate": 0.00023490970955052068, + "loss": 7.4175, + "step": 21218 + }, + { + "epoch": 1.979938415601381, + "grad_norm": 1.0506340849796096, + "learning_rate": 0.0002349034741552176, + "loss": 7.44, + "step": 21219 + }, + { + "epoch": 1.9800317252962583, + "grad_norm": 1.2090003395624302, + "learning_rate": 0.00023489723854403007, + "loss": 7.5333, + "step": 21220 + }, + { + "epoch": 1.9801250349911355, + "grad_norm": 1.7667180383817092, + "learning_rate": 0.000234891002716974, + "loss": 7.7996, + "step": 21221 + }, + { + "epoch": 1.9802183446860129, + "grad_norm": 1.112793277980246, + "learning_rate": 0.00023488476667406528, + "loss": 7.4902, + "step": 21222 + }, + { + "epoch": 1.9803116543808903, + "grad_norm": 1.5122022958956882e+35, + "learning_rate": 0.00023487853041531974, + "loss": 7.071, + "step": 21223 + }, + { + "epoch": 1.9804049640757675, + "grad_norm": 7.929041261295541e+33, + "learning_rate": 0.00023487229394075329, + "loss": 7.8379, + "step": 21224 + }, + { + "epoch": 1.9804982737706447, + "grad_norm": 1.0897634934897928, + "learning_rate": 0.00023486605725038168, + "loss": 7.6557, + "step": 21225 + }, + { + "epoch": 1.980591583465522, + "grad_norm": 1.0807530609478009, + "learning_rate": 0.0002348598203442208, + "loss": 7.5299, + "step": 21226 + }, + { + "epoch": 1.9806848931603995, + "grad_norm": 1.3922251445584866, + "learning_rate": 0.00023485358322228658, + "loss": 7.6644, + "step": 21227 + }, + { + "epoch": 1.9807782028552765, + "grad_norm": 1.249674247414243, + "learning_rate": 0.00023484734588459483, + "loss": 7.4472, + "step": 21228 + }, + { + "epoch": 1.980871512550154, + "grad_norm": 6.871273447646889e+32, + "learning_rate": 0.0002348411083311614, + "loss": 7.5731, + "step": 21229 + }, + { + "epoch": 1.9809648222450313, + "grad_norm": 1.2965673903824297, + "learning_rate": 0.0002348348705620022, + "loss": 7.6256, + "step": 21230 + }, + { + "epoch": 1.9810581319399085, + "grad_norm": 1.1779036459376977, + "learning_rate": 0.000234828632577133, + "loss": 7.4193, + "step": 21231 + }, + { + "epoch": 1.9811514416347857, + "grad_norm": 1.0560303315989972, + "learning_rate": 0.0002348223943765697, + "loss": 7.4215, + "step": 21232 + }, + { + "epoch": 1.9812447513296632, + "grad_norm": 1.7573659835458872, + "learning_rate": 0.00023481615596032825, + "loss": 7.6804, + "step": 21233 + }, + { + "epoch": 1.9813380610245406, + "grad_norm": 1.4278969390885733e+32, + "learning_rate": 0.00023480991732842437, + "loss": 7.3124, + "step": 21234 + }, + { + "epoch": 1.9814313707194178, + "grad_norm": 7.706883491854047e+34, + "learning_rate": 0.00023480367848087406, + "loss": 7.1181, + "step": 21235 + }, + { + "epoch": 1.981524680414295, + "grad_norm": 0.9850813195268918, + "learning_rate": 0.00023479743941769312, + "loss": 7.1331, + "step": 21236 + }, + { + "epoch": 1.9816179901091724, + "grad_norm": 1.3521723654081316e+35, + "learning_rate": 0.0002347912001388973, + "loss": 7.6651, + "step": 21237 + }, + { + "epoch": 1.9817112998040496, + "grad_norm": 1.4090786745300852, + "learning_rate": 0.00023478496064450267, + "loss": 7.8289, + "step": 21238 + }, + { + "epoch": 1.9818046094989268, + "grad_norm": 0.97517257270897, + "learning_rate": 0.000234778720934525, + "loss": 7.45, + "step": 21239 + }, + { + "epoch": 1.9818979191938042, + "grad_norm": 5.440457079916007e+33, + "learning_rate": 0.0002347724810089801, + "loss": 7.5204, + "step": 21240 + }, + { + "epoch": 1.9819912288886816, + "grad_norm": 1.162767467014782, + "learning_rate": 0.00023476624086788396, + "loss": 7.3095, + "step": 21241 + }, + { + "epoch": 1.9820845385835588, + "grad_norm": 1.8536918225859234, + "learning_rate": 0.00023476000051125234, + "loss": 7.2342, + "step": 21242 + }, + { + "epoch": 1.982177848278436, + "grad_norm": 1.3209135639849192, + "learning_rate": 0.00023475375993910113, + "loss": 7.6733, + "step": 21243 + }, + { + "epoch": 1.9822711579733134, + "grad_norm": 1.9230120089822422, + "learning_rate": 0.00023474751915144625, + "loss": 7.3956, + "step": 21244 + }, + { + "epoch": 1.9823644676681909, + "grad_norm": 1.0336147814300591, + "learning_rate": 0.00023474127814830352, + "loss": 7.4222, + "step": 21245 + }, + { + "epoch": 1.982457777363068, + "grad_norm": 3.2064156126132927e+34, + "learning_rate": 0.00023473503692968885, + "loss": 7.5932, + "step": 21246 + }, + { + "epoch": 1.9825510870579452, + "grad_norm": 1.479992265928068, + "learning_rate": 0.00023472879549561805, + "loss": 7.1791, + "step": 21247 + }, + { + "epoch": 1.9826443967528227, + "grad_norm": 1.4483908005040982, + "learning_rate": 0.00023472255384610703, + "loss": 7.6498, + "step": 21248 + }, + { + "epoch": 1.9827377064476999, + "grad_norm": 5.565667836196863e+34, + "learning_rate": 0.00023471631198117166, + "loss": 7.6041, + "step": 21249 + }, + { + "epoch": 1.982831016142577, + "grad_norm": 1.8942302263344453, + "learning_rate": 0.00023471006990082776, + "loss": 7.755, + "step": 21250 + }, + { + "epoch": 1.9829243258374545, + "grad_norm": 1.440071845937623, + "learning_rate": 0.0002347038276050913, + "loss": 7.4843, + "step": 21251 + }, + { + "epoch": 1.983017635532332, + "grad_norm": 1.1299256777129647, + "learning_rate": 0.0002346975850939781, + "loss": 7.6078, + "step": 21252 + }, + { + "epoch": 1.983110945227209, + "grad_norm": 1.1674807902190734, + "learning_rate": 0.00023469134236750396, + "loss": 7.5153, + "step": 21253 + }, + { + "epoch": 1.9832042549220863, + "grad_norm": 1.089821896673279, + "learning_rate": 0.00023468509942568486, + "loss": 7.4221, + "step": 21254 + }, + { + "epoch": 1.9832975646169637, + "grad_norm": 1.5692940476888624, + "learning_rate": 0.00023467885626853667, + "loss": 7.3408, + "step": 21255 + }, + { + "epoch": 1.9833908743118411, + "grad_norm": 1.2552091986694112, + "learning_rate": 0.00023467261289607518, + "loss": 7.716, + "step": 21256 + }, + { + "epoch": 1.9834841840067183, + "grad_norm": 1.6639454530530976e+33, + "learning_rate": 0.00023466636930831634, + "loss": 7.6391, + "step": 21257 + }, + { + "epoch": 1.9835774937015955, + "grad_norm": 1.4178127383938317, + "learning_rate": 0.00023466012550527602, + "loss": 7.3893, + "step": 21258 + }, + { + "epoch": 1.983670803396473, + "grad_norm": 1.021475374603535e+35, + "learning_rate": 0.0002346538814869701, + "loss": 7.431, + "step": 21259 + }, + { + "epoch": 1.9837641130913501, + "grad_norm": 24.62166464522602, + "learning_rate": 0.0002346476372534144, + "loss": 7.4026, + "step": 21260 + }, + { + "epoch": 1.9838574227862273, + "grad_norm": 9.436486588317843e+33, + "learning_rate": 0.00023464139280462482, + "loss": 7.2073, + "step": 21261 + }, + { + "epoch": 1.9839507324811048, + "grad_norm": 4.648211428841282e+31, + "learning_rate": 0.00023463514814061725, + "loss": 7.417, + "step": 21262 + }, + { + "epoch": 1.9840440421759822, + "grad_norm": 1.4097600601606386, + "learning_rate": 0.00023462890326140758, + "loss": 7.6799, + "step": 21263 + }, + { + "epoch": 1.9841373518708594, + "grad_norm": 1.10153888103357, + "learning_rate": 0.0002346226581670117, + "loss": 7.2211, + "step": 21264 + }, + { + "epoch": 1.9842306615657366, + "grad_norm": 1.2073092622083834, + "learning_rate": 0.00023461641285744546, + "loss": 7.4175, + "step": 21265 + }, + { + "epoch": 1.984323971260614, + "grad_norm": 1.2153823653809583, + "learning_rate": 0.00023461016733272472, + "loss": 7.4341, + "step": 21266 + }, + { + "epoch": 1.9844172809554914, + "grad_norm": 1.613923850026275e+33, + "learning_rate": 0.00023460392159286542, + "loss": 7.7365, + "step": 21267 + }, + { + "epoch": 1.9845105906503686, + "grad_norm": 2.134968040218853, + "learning_rate": 0.0002345976756378834, + "loss": 7.7049, + "step": 21268 + }, + { + "epoch": 1.9846039003452458, + "grad_norm": 1.027576061680724, + "learning_rate": 0.00023459142946779453, + "loss": 7.4008, + "step": 21269 + }, + { + "epoch": 1.9846972100401232, + "grad_norm": 1.4684169989601863, + "learning_rate": 0.00023458518308261476, + "loss": 7.2564, + "step": 21270 + }, + { + "epoch": 1.9847905197350004, + "grad_norm": 1.1705642019974802, + "learning_rate": 0.0002345789364823599, + "loss": 7.7941, + "step": 21271 + }, + { + "epoch": 1.9848838294298776, + "grad_norm": 2.042519110519265, + "learning_rate": 0.0002345726896670458, + "loss": 8.0731, + "step": 21272 + }, + { + "epoch": 1.984977139124755, + "grad_norm": 1.9155148040475842, + "learning_rate": 0.00023456644263668847, + "loss": 7.4528, + "step": 21273 + }, + { + "epoch": 1.9850704488196325, + "grad_norm": 1.3015514032890447, + "learning_rate": 0.00023456019539130373, + "loss": 7.493, + "step": 21274 + }, + { + "epoch": 1.9851637585145097, + "grad_norm": 1.2971488480148148, + "learning_rate": 0.0002345539479309074, + "loss": 7.4222, + "step": 21275 + }, + { + "epoch": 1.9852570682093869, + "grad_norm": 3.6844060998869876e+32, + "learning_rate": 0.00023454770025551548, + "loss": 7.4119, + "step": 21276 + }, + { + "epoch": 1.9853503779042643, + "grad_norm": 1.1693357436318523, + "learning_rate": 0.0002345414523651438, + "loss": 7.9432, + "step": 21277 + }, + { + "epoch": 1.9854436875991417, + "grad_norm": 1.1926399600899202, + "learning_rate": 0.00023453520425980822, + "loss": 7.3522, + "step": 21278 + }, + { + "epoch": 1.985536997294019, + "grad_norm": 1.64341867333395, + "learning_rate": 0.0002345289559395247, + "loss": 7.6607, + "step": 21279 + }, + { + "epoch": 1.985630306988896, + "grad_norm": 4.2693595584839476e+33, + "learning_rate": 0.00023452270740430907, + "loss": 7.4324, + "step": 21280 + }, + { + "epoch": 1.9857236166837735, + "grad_norm": 1.47408093258348, + "learning_rate": 0.0002345164586541772, + "loss": 7.7981, + "step": 21281 + }, + { + "epoch": 1.9858169263786507, + "grad_norm": 9.323835584047805e+34, + "learning_rate": 0.0002345102096891451, + "loss": 7.3231, + "step": 21282 + }, + { + "epoch": 1.985910236073528, + "grad_norm": 1.1251386603108608, + "learning_rate": 0.0002345039605092285, + "loss": 7.3785, + "step": 21283 + }, + { + "epoch": 1.9860035457684053, + "grad_norm": 1.77328190416328, + "learning_rate": 0.00023449771111444333, + "loss": 7.1977, + "step": 21284 + }, + { + "epoch": 1.9860968554632827, + "grad_norm": 1.0960359457310054, + "learning_rate": 0.00023449146150480556, + "loss": 7.8059, + "step": 21285 + }, + { + "epoch": 1.98619016515816, + "grad_norm": 1.856138181258013, + "learning_rate": 0.00023448521168033102, + "loss": 7.191, + "step": 21286 + }, + { + "epoch": 1.9862834748530371, + "grad_norm": 1.4474098710368277, + "learning_rate": 0.00023447896164103561, + "loss": 7.5047, + "step": 21287 + }, + { + "epoch": 1.9863767845479146, + "grad_norm": 1.9021920969431676, + "learning_rate": 0.00023447271138693525, + "loss": 7.0562, + "step": 21288 + }, + { + "epoch": 1.986470094242792, + "grad_norm": 1.1959101384749264, + "learning_rate": 0.00023446646091804577, + "loss": 7.4111, + "step": 21289 + }, + { + "epoch": 1.9865634039376692, + "grad_norm": 1.1206780211354017, + "learning_rate": 0.00023446021023438318, + "loss": 7.5596, + "step": 21290 + }, + { + "epoch": 1.9866567136325464, + "grad_norm": 1.2219561398984045, + "learning_rate": 0.00023445395933596318, + "loss": 7.6195, + "step": 21291 + }, + { + "epoch": 1.9867500233274238, + "grad_norm": 1.1734106222010563, + "learning_rate": 0.00023444770822280184, + "loss": 7.4217, + "step": 21292 + }, + { + "epoch": 1.986843333022301, + "grad_norm": 1.43164732361708, + "learning_rate": 0.00023444145689491497, + "loss": 7.5259, + "step": 21293 + }, + { + "epoch": 1.9869366427171782, + "grad_norm": 1.097330141765051, + "learning_rate": 0.00023443520535231852, + "loss": 7.2247, + "step": 21294 + }, + { + "epoch": 1.9870299524120556, + "grad_norm": 1.3989324313149023, + "learning_rate": 0.00023442895359502834, + "loss": 7.5522, + "step": 21295 + }, + { + "epoch": 1.987123262106933, + "grad_norm": 1.4598581321955006, + "learning_rate": 0.00023442270162306035, + "loss": 7.305, + "step": 21296 + }, + { + "epoch": 1.9872165718018102, + "grad_norm": 1.11848618188275, + "learning_rate": 0.0002344164494364304, + "loss": 7.3462, + "step": 21297 + }, + { + "epoch": 1.9873098814966874, + "grad_norm": 1.301363243108865, + "learning_rate": 0.00023441019703515447, + "loss": 7.5853, + "step": 21298 + }, + { + "epoch": 1.9874031911915648, + "grad_norm": 1.0127812917813444, + "learning_rate": 0.00023440394441924837, + "loss": 7.3827, + "step": 21299 + }, + { + "epoch": 1.9874965008864423, + "grad_norm": 1.0962147466608305, + "learning_rate": 0.00023439769158872808, + "loss": 7.3059, + "step": 21300 + }, + { + "epoch": 1.9875898105813194, + "grad_norm": 1.586026466983478, + "learning_rate": 0.00023439143854360944, + "loss": 7.6591, + "step": 21301 + }, + { + "epoch": 1.9876831202761966, + "grad_norm": 1.1479538607750759, + "learning_rate": 0.0002343851852839084, + "loss": 7.6059, + "step": 21302 + }, + { + "epoch": 1.987776429971074, + "grad_norm": 1.5399509573587984, + "learning_rate": 0.00023437893180964077, + "loss": 7.1613, + "step": 21303 + }, + { + "epoch": 1.9878697396659513, + "grad_norm": 1.1647242550795431, + "learning_rate": 0.0002343726781208226, + "loss": 7.5818, + "step": 21304 + }, + { + "epoch": 1.9879630493608285, + "grad_norm": 1.7367426830190473, + "learning_rate": 0.0002343664242174696, + "loss": 7.4218, + "step": 21305 + }, + { + "epoch": 1.9880563590557059, + "grad_norm": 0.997709238068433, + "learning_rate": 0.00023436017009959782, + "loss": 7.4109, + "step": 21306 + }, + { + "epoch": 1.9881496687505833, + "grad_norm": 5.446434708681251e+34, + "learning_rate": 0.00023435391576722315, + "loss": 7.1436, + "step": 21307 + }, + { + "epoch": 1.9882429784454605, + "grad_norm": 1.2624508077774281, + "learning_rate": 0.00023434766122036143, + "loss": 7.5687, + "step": 21308 + }, + { + "epoch": 1.9883362881403377, + "grad_norm": 1.4298304498636756, + "learning_rate": 0.0002343414064590286, + "loss": 7.1226, + "step": 21309 + }, + { + "epoch": 1.9884295978352151, + "grad_norm": 1.0078396310348032, + "learning_rate": 0.00023433515148324054, + "loss": 7.3573, + "step": 21310 + }, + { + "epoch": 1.9885229075300925, + "grad_norm": 1.1415671888973415, + "learning_rate": 0.00023432889629301318, + "loss": 7.4307, + "step": 21311 + }, + { + "epoch": 1.9886162172249697, + "grad_norm": 1.5179715759448296, + "learning_rate": 0.00023432264088836242, + "loss": 7.4892, + "step": 21312 + }, + { + "epoch": 1.988709526919847, + "grad_norm": 1.2367143692035083, + "learning_rate": 0.00023431638526930416, + "loss": 7.6612, + "step": 21313 + }, + { + "epoch": 1.9888028366147243, + "grad_norm": 1.9773567749715895, + "learning_rate": 0.0002343101294358543, + "loss": 6.8388, + "step": 21314 + }, + { + "epoch": 1.9888961463096015, + "grad_norm": 0.976885021906079, + "learning_rate": 0.0002343038733880288, + "loss": 7.4141, + "step": 21315 + }, + { + "epoch": 1.9889894560044787, + "grad_norm": 1.0721691917411884, + "learning_rate": 0.00023429761712584347, + "loss": 7.335, + "step": 21316 + }, + { + "epoch": 1.9890827656993562, + "grad_norm": 1.1151559420446169, + "learning_rate": 0.0002342913606493143, + "loss": 7.5305, + "step": 21317 + }, + { + "epoch": 1.9891760753942336, + "grad_norm": 2.533666269175086, + "learning_rate": 0.00023428510395845717, + "loss": 7.3213, + "step": 21318 + }, + { + "epoch": 1.9892693850891108, + "grad_norm": 1.2199014185128325, + "learning_rate": 0.00023427884705328793, + "loss": 7.5626, + "step": 21319 + }, + { + "epoch": 1.989362694783988, + "grad_norm": 0.9737738130238123, + "learning_rate": 0.00023427258993382263, + "loss": 7.3769, + "step": 21320 + }, + { + "epoch": 1.9894560044788654, + "grad_norm": 1.1297906591017546, + "learning_rate": 0.00023426633260007705, + "loss": 7.3629, + "step": 21321 + }, + { + "epoch": 1.9895493141737428, + "grad_norm": 1.1603915157998936, + "learning_rate": 0.00023426007505206715, + "loss": 7.8045, + "step": 21322 + }, + { + "epoch": 1.9896426238686198, + "grad_norm": 1.3717807659845518, + "learning_rate": 0.00023425381728980888, + "loss": 7.3101, + "step": 21323 + }, + { + "epoch": 1.9897359335634972, + "grad_norm": 1.4844925510529081, + "learning_rate": 0.00023424755931331805, + "loss": 7.5432, + "step": 21324 + }, + { + "epoch": 1.9898292432583746, + "grad_norm": 1.1600245551766646, + "learning_rate": 0.00023424130112261065, + "loss": 7.3771, + "step": 21325 + }, + { + "epoch": 1.9899225529532518, + "grad_norm": 1.1596480421861706, + "learning_rate": 0.0002342350427177026, + "loss": 7.4667, + "step": 21326 + }, + { + "epoch": 1.990015862648129, + "grad_norm": 1.2133160805091887, + "learning_rate": 0.00023422878409860978, + "loss": 7.7987, + "step": 21327 + }, + { + "epoch": 1.9901091723430064, + "grad_norm": 1.2445610037483859, + "learning_rate": 0.00023422252526534808, + "loss": 7.7251, + "step": 21328 + }, + { + "epoch": 1.9902024820378839, + "grad_norm": 0.9879296016465361, + "learning_rate": 0.0002342162662179335, + "loss": 7.5173, + "step": 21329 + }, + { + "epoch": 1.990295791732761, + "grad_norm": 1.1718260456736382e+34, + "learning_rate": 0.00023421000695638186, + "loss": 7.5451, + "step": 21330 + }, + { + "epoch": 1.9903891014276383, + "grad_norm": 1.4417865194320505, + "learning_rate": 0.0002342037474807091, + "loss": 7.937, + "step": 21331 + }, + { + "epoch": 1.9904824111225157, + "grad_norm": 4.313766825306428e+35, + "learning_rate": 0.0002341974877909312, + "loss": 7.5256, + "step": 21332 + }, + { + "epoch": 1.990575720817393, + "grad_norm": 1.0022493504380623, + "learning_rate": 0.00023419122788706406, + "loss": 7.5274, + "step": 21333 + }, + { + "epoch": 1.99066903051227, + "grad_norm": 0.974255357296561, + "learning_rate": 0.0002341849677691235, + "loss": 7.5455, + "step": 21334 + }, + { + "epoch": 1.9907623402071475, + "grad_norm": 0.9387634641917394, + "learning_rate": 0.0002341787074371255, + "loss": 7.3904, + "step": 21335 + }, + { + "epoch": 1.990855649902025, + "grad_norm": 1.2562617839117578, + "learning_rate": 0.00023417244689108606, + "loss": 7.7761, + "step": 21336 + }, + { + "epoch": 1.990948959596902, + "grad_norm": 1.3540065930961374, + "learning_rate": 0.00023416618613102095, + "loss": 7.5036, + "step": 21337 + }, + { + "epoch": 1.9910422692917793, + "grad_norm": 1.8622960795342156, + "learning_rate": 0.00023415992515694615, + "loss": 7.6227, + "step": 21338 + }, + { + "epoch": 1.9911355789866567, + "grad_norm": 1.1892231838258078, + "learning_rate": 0.00023415366396887767, + "loss": 7.3739, + "step": 21339 + }, + { + "epoch": 1.9912288886815341, + "grad_norm": 1.1385873467903598, + "learning_rate": 0.00023414740256683128, + "loss": 7.4938, + "step": 21340 + }, + { + "epoch": 1.9913221983764113, + "grad_norm": 1.1545181281762051, + "learning_rate": 0.000234141140950823, + "loss": 7.4151, + "step": 21341 + }, + { + "epoch": 1.9914155080712885, + "grad_norm": 1.2723228822510897, + "learning_rate": 0.0002341348791208687, + "loss": 7.6482, + "step": 21342 + }, + { + "epoch": 1.991508817766166, + "grad_norm": 1.9270839704930636, + "learning_rate": 0.00023412861707698432, + "loss": 7.3928, + "step": 21343 + }, + { + "epoch": 1.9916021274610431, + "grad_norm": 9.744094376450513e+32, + "learning_rate": 0.00023412235481918583, + "loss": 7.645, + "step": 21344 + }, + { + "epoch": 1.9916954371559203, + "grad_norm": 1.1227575249014812, + "learning_rate": 0.0002341160923474891, + "loss": 7.8022, + "step": 21345 + }, + { + "epoch": 1.9917887468507978, + "grad_norm": 1.5728048247976878, + "learning_rate": 0.0002341098296619101, + "loss": 7.1812, + "step": 21346 + }, + { + "epoch": 1.9918820565456752, + "grad_norm": 1.5656233555683456, + "learning_rate": 0.00023410356676246462, + "loss": 7.1864, + "step": 21347 + }, + { + "epoch": 1.9919753662405524, + "grad_norm": 1.224078602388953, + "learning_rate": 0.00023409730364916878, + "loss": 7.3834, + "step": 21348 + }, + { + "epoch": 1.9920686759354296, + "grad_norm": 2.5253851919650767, + "learning_rate": 0.00023409104032203832, + "loss": 7.5951, + "step": 21349 + }, + { + "epoch": 1.992161985630307, + "grad_norm": 2.3425563489961257e+34, + "learning_rate": 0.00023408477678108928, + "loss": 7.8068, + "step": 21350 + }, + { + "epoch": 1.9922552953251844, + "grad_norm": 0.9947666190662797, + "learning_rate": 0.0002340785130263376, + "loss": 7.1806, + "step": 21351 + }, + { + "epoch": 1.9923486050200616, + "grad_norm": 6.214741307241045, + "learning_rate": 0.00023407224905779914, + "loss": 7.5706, + "step": 21352 + }, + { + "epoch": 1.9924419147149388, + "grad_norm": 0.9992393892958864, + "learning_rate": 0.00023406598487548985, + "loss": 7.46, + "step": 21353 + }, + { + "epoch": 1.9925352244098162, + "grad_norm": 1.410846061175616, + "learning_rate": 0.0002340597204794257, + "loss": 7.3475, + "step": 21354 + }, + { + "epoch": 1.9926285341046934, + "grad_norm": 1.3165600059643152, + "learning_rate": 0.00023405345586962253, + "loss": 7.8483, + "step": 21355 + }, + { + "epoch": 1.9927218437995706, + "grad_norm": 4.232039527546627e+32, + "learning_rate": 0.00023404719104609635, + "loss": 7.416, + "step": 21356 + }, + { + "epoch": 1.992815153494448, + "grad_norm": 1.1083987564459497, + "learning_rate": 0.00023404092600886304, + "loss": 7.5977, + "step": 21357 + }, + { + "epoch": 1.9929084631893255, + "grad_norm": 1.3705044436553755, + "learning_rate": 0.0002340346607579386, + "loss": 7.3976, + "step": 21358 + }, + { + "epoch": 1.9930017728842027, + "grad_norm": 9.348971507891591e+35, + "learning_rate": 0.00023402839529333884, + "loss": 7.1311, + "step": 21359 + }, + { + "epoch": 1.9930950825790799, + "grad_norm": 1.199305271890651, + "learning_rate": 0.0002340221296150798, + "loss": 7.6024, + "step": 21360 + }, + { + "epoch": 1.9931883922739573, + "grad_norm": 1.3425596808939095, + "learning_rate": 0.00023401586372317739, + "loss": 7.3128, + "step": 21361 + }, + { + "epoch": 1.9932817019688347, + "grad_norm": 14.436284514054002, + "learning_rate": 0.00023400959761764748, + "loss": 7.8151, + "step": 21362 + }, + { + "epoch": 1.993375011663712, + "grad_norm": 1.0891216374277046, + "learning_rate": 0.00023400333129850606, + "loss": 7.5101, + "step": 21363 + }, + { + "epoch": 1.993468321358589, + "grad_norm": 7.758969977435118e+31, + "learning_rate": 0.0002339970647657691, + "loss": 7.3415, + "step": 21364 + }, + { + "epoch": 1.9935616310534665, + "grad_norm": 1.3568664183162327, + "learning_rate": 0.00023399079801945244, + "loss": 7.6177, + "step": 21365 + }, + { + "epoch": 1.9936549407483437, + "grad_norm": 1.5408187338997763, + "learning_rate": 0.00023398453105957206, + "loss": 7.3729, + "step": 21366 + }, + { + "epoch": 1.993748250443221, + "grad_norm": 1.8196367309840402, + "learning_rate": 0.00023397826388614392, + "loss": 7.6157, + "step": 21367 + }, + { + "epoch": 1.9938415601380983, + "grad_norm": 1.68870667222378, + "learning_rate": 0.00023397199649918392, + "loss": 7.7952, + "step": 21368 + }, + { + "epoch": 1.9939348698329757, + "grad_norm": 1.1638725571293702e+33, + "learning_rate": 0.000233965728898708, + "loss": 7.5606, + "step": 21369 + }, + { + "epoch": 1.994028179527853, + "grad_norm": 9.170890986612078e+34, + "learning_rate": 0.0002339594610847321, + "loss": 7.356, + "step": 21370 + }, + { + "epoch": 1.9941214892227301, + "grad_norm": 1.2843975881477385, + "learning_rate": 0.00023395319305727217, + "loss": 7.7506, + "step": 21371 + }, + { + "epoch": 1.9942147989176076, + "grad_norm": 2.5373784434247426, + "learning_rate": 0.00023394692481634415, + "loss": 7.105, + "step": 21372 + }, + { + "epoch": 1.994308108612485, + "grad_norm": 2.041737898099027, + "learning_rate": 0.00023394065636196397, + "loss": 7.0725, + "step": 21373 + }, + { + "epoch": 1.9944014183073622, + "grad_norm": 1.2289882299819161, + "learning_rate": 0.00023393438769414755, + "loss": 7.666, + "step": 21374 + }, + { + "epoch": 1.9944947280022394, + "grad_norm": 1.0409566638748216, + "learning_rate": 0.00023392811881291082, + "loss": 7.4268, + "step": 21375 + }, + { + "epoch": 1.9945880376971168, + "grad_norm": 1.1935068101789068, + "learning_rate": 0.0002339218497182698, + "loss": 7.6475, + "step": 21376 + }, + { + "epoch": 1.994681347391994, + "grad_norm": 1.2934586521389584, + "learning_rate": 0.00023391558041024036, + "loss": 7.4752, + "step": 21377 + }, + { + "epoch": 1.9947746570868712, + "grad_norm": 1.8094289049309915, + "learning_rate": 0.00023390931088883844, + "loss": 7.728, + "step": 21378 + }, + { + "epoch": 1.9948679667817486, + "grad_norm": 1.5777691656965158, + "learning_rate": 0.00023390304115408, + "loss": 7.433, + "step": 21379 + }, + { + "epoch": 1.994961276476626, + "grad_norm": 1.3901250946844599, + "learning_rate": 0.00023389677120598098, + "loss": 7.1042, + "step": 21380 + }, + { + "epoch": 1.9950545861715032, + "grad_norm": 1.2130213344413194, + "learning_rate": 0.00023389050104455735, + "loss": 7.6925, + "step": 21381 + }, + { + "epoch": 1.9951478958663804, + "grad_norm": 1.0753621126569146, + "learning_rate": 0.000233884230669825, + "loss": 7.1336, + "step": 21382 + }, + { + "epoch": 1.9952412055612578, + "grad_norm": 1.4933040336060202, + "learning_rate": 0.0002338779600817999, + "loss": 7.5327, + "step": 21383 + }, + { + "epoch": 1.9953345152561353, + "grad_norm": 1.1308884736151499, + "learning_rate": 0.00023387168928049802, + "loss": 7.2978, + "step": 21384 + }, + { + "epoch": 1.9954278249510125, + "grad_norm": 5.0790132806352245e+31, + "learning_rate": 0.00023386541826593522, + "loss": 7.0381, + "step": 21385 + }, + { + "epoch": 1.9955211346458896, + "grad_norm": 1.0544272338824072, + "learning_rate": 0.00023385914703812755, + "loss": 7.2746, + "step": 21386 + }, + { + "epoch": 1.995614444340767, + "grad_norm": 1.1448101273039604, + "learning_rate": 0.00023385287559709092, + "loss": 7.5736, + "step": 21387 + }, + { + "epoch": 1.9957077540356443, + "grad_norm": 0.9924997317478741, + "learning_rate": 0.00023384660394284122, + "loss": 7.4058, + "step": 21388 + }, + { + "epoch": 1.9958010637305215, + "grad_norm": 1.0595970423389565, + "learning_rate": 0.00023384033207539444, + "loss": 7.3628, + "step": 21389 + }, + { + "epoch": 1.9958943734253989, + "grad_norm": 1.4442098209540504, + "learning_rate": 0.0002338340599947666, + "loss": 7.8753, + "step": 21390 + }, + { + "epoch": 1.9959876831202763, + "grad_norm": 1.6326395889447574, + "learning_rate": 0.00023382778770097352, + "loss": 7.9543, + "step": 21391 + }, + { + "epoch": 1.9960809928151535, + "grad_norm": 1.0893629346940417, + "learning_rate": 0.00023382151519403126, + "loss": 7.1626, + "step": 21392 + }, + { + "epoch": 1.9961743025100307, + "grad_norm": 0.9469373867620126, + "learning_rate": 0.0002338152424739557, + "loss": 7.5425, + "step": 21393 + }, + { + "epoch": 1.9962676122049081, + "grad_norm": 1.4167142478410961, + "learning_rate": 0.00023380896954076273, + "loss": 7.3881, + "step": 21394 + }, + { + "epoch": 1.9963609218997855, + "grad_norm": 1.2281345855367347, + "learning_rate": 0.00023380269639446845, + "loss": 7.7826, + "step": 21395 + }, + { + "epoch": 1.9964542315946627, + "grad_norm": 1.0501631236171927, + "learning_rate": 0.00023379642303508874, + "loss": 7.7211, + "step": 21396 + }, + { + "epoch": 1.99654754128954, + "grad_norm": 2.919445592658017e+33, + "learning_rate": 0.00023379014946263955, + "loss": 7.9129, + "step": 21397 + }, + { + "epoch": 1.9966408509844173, + "grad_norm": 1.0823474854032442, + "learning_rate": 0.00023378387567713675, + "loss": 7.3444, + "step": 21398 + }, + { + "epoch": 1.9967341606792945, + "grad_norm": 1.0373772772860306, + "learning_rate": 0.00023377760167859647, + "loss": 7.6403, + "step": 21399 + }, + { + "epoch": 1.9968274703741717, + "grad_norm": 1.0271195448338957, + "learning_rate": 0.0002337713274670345, + "loss": 7.6091, + "step": 21400 + }, + { + "epoch": 1.9969207800690492, + "grad_norm": 1.367070471481266, + "learning_rate": 0.00023376505304246694, + "loss": 7.5641, + "step": 21401 + }, + { + "epoch": 1.9970140897639266, + "grad_norm": 1.3299817205190172, + "learning_rate": 0.0002337587784049096, + "loss": 7.7943, + "step": 21402 + }, + { + "epoch": 1.9971073994588038, + "grad_norm": 1.1339515381017096, + "learning_rate": 0.00023375250355437853, + "loss": 7.5283, + "step": 21403 + }, + { + "epoch": 1.997200709153681, + "grad_norm": 7.450650104427387e+31, + "learning_rate": 0.0002337462284908896, + "loss": 7.4301, + "step": 21404 + }, + { + "epoch": 1.9972940188485584, + "grad_norm": 0.9728160291089042, + "learning_rate": 0.00023373995321445884, + "loss": 7.6728, + "step": 21405 + }, + { + "epoch": 1.9973873285434358, + "grad_norm": 1.1351870504284243, + "learning_rate": 0.00023373367772510224, + "loss": 7.5422, + "step": 21406 + }, + { + "epoch": 1.997480638238313, + "grad_norm": 1.8012957279594707e+31, + "learning_rate": 0.00023372740202283563, + "loss": 7.1411, + "step": 21407 + }, + { + "epoch": 1.9975739479331902, + "grad_norm": 3.331531210322919e+32, + "learning_rate": 0.00023372112610767505, + "loss": 7.283, + "step": 21408 + }, + { + "epoch": 1.9976672576280676, + "grad_norm": 1.0620522613121002, + "learning_rate": 0.00023371484997963653, + "loss": 7.6262, + "step": 21409 + }, + { + "epoch": 1.9977605673229448, + "grad_norm": 1.4620627914581812, + "learning_rate": 0.00023370857363873586, + "loss": 7.4012, + "step": 21410 + }, + { + "epoch": 1.997853877017822, + "grad_norm": 1.072072768092155, + "learning_rate": 0.0002337022970849891, + "loss": 7.3719, + "step": 21411 + }, + { + "epoch": 1.9979471867126994, + "grad_norm": 2.949931598851643e+34, + "learning_rate": 0.00023369602031841222, + "loss": 7.5499, + "step": 21412 + }, + { + "epoch": 1.9980404964075769, + "grad_norm": 1.5119673651871355, + "learning_rate": 0.0002336897433390211, + "loss": 7.5329, + "step": 21413 + }, + { + "epoch": 1.998133806102454, + "grad_norm": 1.3168266460753535, + "learning_rate": 0.00023368346614683177, + "loss": 7.576, + "step": 21414 + }, + { + "epoch": 1.9982271157973313, + "grad_norm": 1.0472903333604113, + "learning_rate": 0.0002336771887418602, + "loss": 7.453, + "step": 21415 + }, + { + "epoch": 1.9983204254922087, + "grad_norm": 1.1882129784785258, + "learning_rate": 0.0002336709111241223, + "loss": 7.5078, + "step": 21416 + }, + { + "epoch": 1.998413735187086, + "grad_norm": 1.5786290849726883, + "learning_rate": 0.00023366463329363405, + "loss": 7.725, + "step": 21417 + }, + { + "epoch": 1.9985070448819633, + "grad_norm": 1.9050069032102077, + "learning_rate": 0.00023365835525041148, + "loss": 7.3507, + "step": 21418 + }, + { + "epoch": 1.9986003545768405, + "grad_norm": 1.4699503315033724, + "learning_rate": 0.00023365207699447044, + "loss": 7.5747, + "step": 21419 + }, + { + "epoch": 1.998693664271718, + "grad_norm": 2.747802944933313, + "learning_rate": 0.00023364579852582697, + "loss": 6.9637, + "step": 21420 + }, + { + "epoch": 1.998786973966595, + "grad_norm": 4.9573030711432156e+32, + "learning_rate": 0.00023363951984449702, + "loss": 7.5151, + "step": 21421 + }, + { + "epoch": 1.9988802836614723, + "grad_norm": 1.6062664759655518, + "learning_rate": 0.00023363324095049647, + "loss": 7.8729, + "step": 21422 + }, + { + "epoch": 1.9989735933563497, + "grad_norm": 1.9317988267426056, + "learning_rate": 0.00023362696184384143, + "loss": 8.0422, + "step": 21423 + }, + { + "epoch": 1.9990669030512271, + "grad_norm": 1.1387226199191745, + "learning_rate": 0.00023362068252454781, + "loss": 7.3043, + "step": 21424 + }, + { + "epoch": 1.9991602127461043, + "grad_norm": 1.1087496564993302, + "learning_rate": 0.00023361440299263155, + "loss": 7.3239, + "step": 21425 + }, + { + "epoch": 1.9992535224409815, + "grad_norm": 1.2188287214197546, + "learning_rate": 0.0002336081232481086, + "loss": 7.3877, + "step": 21426 + }, + { + "epoch": 1.999346832135859, + "grad_norm": 1.1203531380091925, + "learning_rate": 0.000233601843290995, + "loss": 7.5976, + "step": 21427 + }, + { + "epoch": 1.9994401418307364, + "grad_norm": 1.3298968178519468, + "learning_rate": 0.00023359556312130668, + "loss": 7.7121, + "step": 21428 + }, + { + "epoch": 1.9995334515256133, + "grad_norm": 1.1981936696595845, + "learning_rate": 0.0002335892827390596, + "loss": 7.5374, + "step": 21429 + }, + { + "epoch": 1.9996267612204908, + "grad_norm": 1.468641752040821, + "learning_rate": 0.00023358300214426972, + "loss": 7.4079, + "step": 21430 + }, + { + "epoch": 1.9997200709153682, + "grad_norm": 6.656666154876437e+32, + "learning_rate": 0.00023357672133695305, + "loss": 7.4795, + "step": 21431 + }, + { + "epoch": 1.9998133806102454, + "grad_norm": 1.1762255363310254, + "learning_rate": 0.00023357044031712545, + "loss": 7.4734, + "step": 21432 + }, + { + "epoch": 1.9999066903051226, + "grad_norm": 1.6324139684108911, + "learning_rate": 0.00023356415908480307, + "loss": 7.253, + "step": 21433 + }, + { + "epoch": 2.0, + "grad_norm": 1.0805387942331297, + "learning_rate": 0.00023355787764000176, + "loss": 7.732, + "step": 21434 + }, + { + "epoch": 2.0, + "eval_loss": 7.387271881103516, + "eval_runtime": 26.0921, + "eval_samples_per_second": 6.784, + "eval_steps_per_second": 6.784, + "step": 21434 + }, + { + "epoch": 2.0000933096948774, + "grad_norm": 9.807793500076263e+29, + "learning_rate": 0.00023355159598273753, + "loss": 7.5948, + "step": 21435 + }, + { + "epoch": 2.0001866193897544, + "grad_norm": 2.0969906904898186, + "learning_rate": 0.00023354531411302632, + "loss": 7.4, + "step": 21436 + }, + { + "epoch": 2.000279929084632, + "grad_norm": 1.4311134095900182, + "learning_rate": 0.00023353903203088413, + "loss": 7.2254, + "step": 21437 + }, + { + "epoch": 2.0003732387795092, + "grad_norm": 1.0668816024666545, + "learning_rate": 0.00023353274973632695, + "loss": 7.2914, + "step": 21438 + }, + { + "epoch": 2.0004665484743867, + "grad_norm": 1.690724701922977, + "learning_rate": 0.00023352646722937069, + "loss": 7.4576, + "step": 21439 + }, + { + "epoch": 2.0005598581692636, + "grad_norm": 1.0145499939981855, + "learning_rate": 0.00023352018451003139, + "loss": 7.1302, + "step": 21440 + }, + { + "epoch": 2.000653167864141, + "grad_norm": 1.5655733309082145, + "learning_rate": 0.000233513901578325, + "loss": 7.3962, + "step": 21441 + }, + { + "epoch": 2.0007464775590185, + "grad_norm": 1.5394927658612685, + "learning_rate": 0.00023350761843426749, + "loss": 7.54, + "step": 21442 + }, + { + "epoch": 2.000839787253896, + "grad_norm": 1.274542328275053, + "learning_rate": 0.00023350133507787484, + "loss": 6.9216, + "step": 21443 + }, + { + "epoch": 2.000933096948773, + "grad_norm": 4.479050505302014e+33, + "learning_rate": 0.00023349505150916302, + "loss": 7.5238, + "step": 21444 + }, + { + "epoch": 2.0010264066436503, + "grad_norm": 1.3295180701258646, + "learning_rate": 0.00023348876772814805, + "loss": 7.7907, + "step": 21445 + }, + { + "epoch": 2.0011197163385277, + "grad_norm": 1.0478112326579985, + "learning_rate": 0.00023348248373484588, + "loss": 7.3321, + "step": 21446 + }, + { + "epoch": 2.0012130260334047, + "grad_norm": 1.1776686236360534, + "learning_rate": 0.00023347619952927246, + "loss": 7.3206, + "step": 21447 + }, + { + "epoch": 2.001306335728282, + "grad_norm": 1.4618655393638629, + "learning_rate": 0.0002334699151114438, + "loss": 7.3319, + "step": 21448 + }, + { + "epoch": 2.0013996454231595, + "grad_norm": 1.4170923749164384e+32, + "learning_rate": 0.00023346363048137584, + "loss": 7.4295, + "step": 21449 + }, + { + "epoch": 2.001492955118037, + "grad_norm": 1.6357516515787674e+33, + "learning_rate": 0.00023345734563908464, + "loss": 7.329, + "step": 21450 + }, + { + "epoch": 2.001586264812914, + "grad_norm": 1.030881791756821, + "learning_rate": 0.0002334510605845861, + "loss": 7.35, + "step": 21451 + }, + { + "epoch": 2.0016795745077913, + "grad_norm": 2.0094278516155746, + "learning_rate": 0.00023344477531789623, + "loss": 7.0142, + "step": 21452 + }, + { + "epoch": 2.0017728842026687, + "grad_norm": 1.1070580549413989, + "learning_rate": 0.00023343848983903103, + "loss": 7.3358, + "step": 21453 + }, + { + "epoch": 2.001866193897546, + "grad_norm": 1.35847393440413, + "learning_rate": 0.00023343220414800647, + "loss": 7.1798, + "step": 21454 + }, + { + "epoch": 2.001959503592423, + "grad_norm": 1.3281418162967003, + "learning_rate": 0.0002334259182448385, + "loss": 7.2102, + "step": 21455 + }, + { + "epoch": 2.0020528132873006, + "grad_norm": 2.529622468326313, + "learning_rate": 0.0002334196321295432, + "loss": 7.5539, + "step": 21456 + }, + { + "epoch": 2.002146122982178, + "grad_norm": 1.185732642917926, + "learning_rate": 0.0002334133458021364, + "loss": 7.207, + "step": 21457 + }, + { + "epoch": 2.002239432677055, + "grad_norm": 5.430958867698907e+31, + "learning_rate": 0.00023340705926263423, + "loss": 7.516, + "step": 21458 + }, + { + "epoch": 2.0023327423719324, + "grad_norm": 1.2204502953354859, + "learning_rate": 0.0002334007725110526, + "loss": 7.4403, + "step": 21459 + }, + { + "epoch": 2.00242605206681, + "grad_norm": 1.1100069971958955, + "learning_rate": 0.0002333944855474075, + "loss": 7.44, + "step": 21460 + }, + { + "epoch": 2.002519361761687, + "grad_norm": 1.0406746094044306, + "learning_rate": 0.00023338819837171495, + "loss": 7.8328, + "step": 21461 + }, + { + "epoch": 2.002612671456564, + "grad_norm": 1.5072131306237875, + "learning_rate": 0.00023338191098399091, + "loss": 7.3244, + "step": 21462 + }, + { + "epoch": 2.0027059811514416, + "grad_norm": 4.178977211850917, + "learning_rate": 0.0002333756233842513, + "loss": 7.196, + "step": 21463 + }, + { + "epoch": 2.002799290846319, + "grad_norm": 9.83388713159551e+35, + "learning_rate": 0.0002333693355725123, + "loss": 7.7777, + "step": 21464 + }, + { + "epoch": 2.0028926005411964, + "grad_norm": 1.2593890049004177, + "learning_rate": 0.0002333630475487897, + "loss": 7.4049, + "step": 21465 + }, + { + "epoch": 2.0029859102360734, + "grad_norm": 0.9730197792397041, + "learning_rate": 0.00023335675931309956, + "loss": 7.5617, + "step": 21466 + }, + { + "epoch": 2.003079219930951, + "grad_norm": 1.0447044741419815, + "learning_rate": 0.00023335047086545788, + "loss": 7.7002, + "step": 21467 + }, + { + "epoch": 2.0031725296258283, + "grad_norm": 1.2307679090470325, + "learning_rate": 0.00023334418220588067, + "loss": 7.5484, + "step": 21468 + }, + { + "epoch": 2.0032658393207052, + "grad_norm": 1.2487215067965054, + "learning_rate": 0.00023333789333438384, + "loss": 7.6914, + "step": 21469 + }, + { + "epoch": 2.0033591490155827, + "grad_norm": 2.106991958819849, + "learning_rate": 0.0002333316042509835, + "loss": 7.8202, + "step": 21470 + }, + { + "epoch": 2.00345245871046, + "grad_norm": 1.0301273004222655, + "learning_rate": 0.00023332531495569555, + "loss": 7.2887, + "step": 21471 + }, + { + "epoch": 2.0035457684053375, + "grad_norm": 1.6367001248874324, + "learning_rate": 0.000233319025448536, + "loss": 6.9122, + "step": 21472 + }, + { + "epoch": 2.0036390781002145, + "grad_norm": 1.0427761821351345, + "learning_rate": 0.00023331273572952085, + "loss": 7.3424, + "step": 21473 + }, + { + "epoch": 2.003732387795092, + "grad_norm": 1.0655875290607315, + "learning_rate": 0.0002333064457986661, + "loss": 7.3715, + "step": 21474 + }, + { + "epoch": 2.0038256974899693, + "grad_norm": 1.682870430850858, + "learning_rate": 0.0002333001556559877, + "loss": 7.0917, + "step": 21475 + }, + { + "epoch": 2.0039190071848467, + "grad_norm": 1.194026598114444, + "learning_rate": 0.0002332938653015017, + "loss": 7.2099, + "step": 21476 + }, + { + "epoch": 2.0040123168797237, + "grad_norm": 2.947400278954433e+34, + "learning_rate": 0.00023328757473522408, + "loss": 7.7928, + "step": 21477 + }, + { + "epoch": 2.004105626574601, + "grad_norm": 4.1266820388616325e+35, + "learning_rate": 0.00023328128395717085, + "loss": 7.8644, + "step": 21478 + }, + { + "epoch": 2.0041989362694785, + "grad_norm": 1.156855947181136, + "learning_rate": 0.00023327499296735794, + "loss": 7.4065, + "step": 21479 + }, + { + "epoch": 2.0042922459643555, + "grad_norm": 1.2601912116860228, + "learning_rate": 0.0002332687017658014, + "loss": 7.3483, + "step": 21480 + }, + { + "epoch": 2.004385555659233, + "grad_norm": 1.4515240103410416, + "learning_rate": 0.00023326241035251724, + "loss": 7.6512, + "step": 21481 + }, + { + "epoch": 2.0044788653541104, + "grad_norm": 1.0416820561078246, + "learning_rate": 0.00023325611872752143, + "loss": 7.1543, + "step": 21482 + }, + { + "epoch": 2.0045721750489878, + "grad_norm": 1.275725153592752, + "learning_rate": 0.00023324982689083, + "loss": 7.7925, + "step": 21483 + }, + { + "epoch": 2.0046654847438647, + "grad_norm": 1.8835045322823837e+33, + "learning_rate": 0.00023324353484245887, + "loss": 7.429, + "step": 21484 + }, + { + "epoch": 2.004758794438742, + "grad_norm": 7.574363383939662e+34, + "learning_rate": 0.0002332372425824241, + "loss": 7.4297, + "step": 21485 + }, + { + "epoch": 2.0048521041336196, + "grad_norm": 1.186513048481707, + "learning_rate": 0.0002332309501107417, + "loss": 7.4197, + "step": 21486 + }, + { + "epoch": 2.004945413828497, + "grad_norm": 1.9153384438309313, + "learning_rate": 0.00023322465742742763, + "loss": 7.3891, + "step": 21487 + }, + { + "epoch": 2.005038723523374, + "grad_norm": 6.632063763746117e+32, + "learning_rate": 0.00023321836453249793, + "loss": 7.3878, + "step": 21488 + }, + { + "epoch": 2.0051320332182514, + "grad_norm": 1.097521703030487, + "learning_rate": 0.00023321207142596855, + "loss": 7.4858, + "step": 21489 + }, + { + "epoch": 2.005225342913129, + "grad_norm": 1.1688977648900363, + "learning_rate": 0.00023320577810785552, + "loss": 7.5576, + "step": 21490 + }, + { + "epoch": 2.005318652608006, + "grad_norm": 0.9791510143749955, + "learning_rate": 0.00023319948457817486, + "loss": 7.2924, + "step": 21491 + }, + { + "epoch": 2.005411962302883, + "grad_norm": 1.220794293665533, + "learning_rate": 0.00023319319083694258, + "loss": 7.4226, + "step": 21492 + }, + { + "epoch": 2.0055052719977606, + "grad_norm": 1.6113256164808927, + "learning_rate": 0.00023318689688417465, + "loss": 7.6921, + "step": 21493 + }, + { + "epoch": 2.005598581692638, + "grad_norm": 1.4921160600033674, + "learning_rate": 0.0002331806027198871, + "loss": 7.7207, + "step": 21494 + }, + { + "epoch": 2.005691891387515, + "grad_norm": 6.060838390296084, + "learning_rate": 0.00023317430834409584, + "loss": 7.5819, + "step": 21495 + }, + { + "epoch": 2.0057852010823924, + "grad_norm": 1.2492655592262305, + "learning_rate": 0.00023316801375681702, + "loss": 7.2857, + "step": 21496 + }, + { + "epoch": 2.00587851077727, + "grad_norm": 1.559516553446045, + "learning_rate": 0.00023316171895806657, + "loss": 7.2418, + "step": 21497 + }, + { + "epoch": 2.005971820472147, + "grad_norm": 2.190206923925996, + "learning_rate": 0.00023315542394786043, + "loss": 7.2589, + "step": 21498 + }, + { + "epoch": 2.0060651301670243, + "grad_norm": 2.855038269063668e+34, + "learning_rate": 0.00023314912872621476, + "loss": 7.4443, + "step": 21499 + }, + { + "epoch": 2.0061584398619017, + "grad_norm": 7.145799526472889e+34, + "learning_rate": 0.00023314283329314544, + "loss": 7.7122, + "step": 21500 + }, + { + "epoch": 2.006251749556779, + "grad_norm": 3.955832245463258, + "learning_rate": 0.00023313653764866859, + "loss": 7.4251, + "step": 21501 + }, + { + "epoch": 2.006345059251656, + "grad_norm": 1.765797202019543, + "learning_rate": 0.0002331302417928001, + "loss": 7.8258, + "step": 21502 + }, + { + "epoch": 2.0064383689465335, + "grad_norm": 1.231735134719306, + "learning_rate": 0.00023312394572555604, + "loss": 7.5651, + "step": 21503 + }, + { + "epoch": 2.006531678641411, + "grad_norm": 1.1867572082404645, + "learning_rate": 0.00023311764944695238, + "loss": 7.2531, + "step": 21504 + }, + { + "epoch": 2.0066249883362883, + "grad_norm": 1.4893125315102784, + "learning_rate": 0.00023311135295700521, + "loss": 7.6666, + "step": 21505 + }, + { + "epoch": 2.0067182980311653, + "grad_norm": 1.1097556950495104, + "learning_rate": 0.00023310505625573046, + "loss": 7.2181, + "step": 21506 + }, + { + "epoch": 2.0068116077260427, + "grad_norm": 1.5420246674635816, + "learning_rate": 0.00023309875934314416, + "loss": 7.3292, + "step": 21507 + }, + { + "epoch": 2.00690491742092, + "grad_norm": 1.1730439907496883, + "learning_rate": 0.00023309246221926236, + "loss": 7.5647, + "step": 21508 + }, + { + "epoch": 2.006998227115797, + "grad_norm": 1.8926106150114963, + "learning_rate": 0.00023308616488410103, + "loss": 7.285, + "step": 21509 + }, + { + "epoch": 2.0070915368106745, + "grad_norm": 1.0792542736756952, + "learning_rate": 0.00023307986733767615, + "loss": 7.5348, + "step": 21510 + }, + { + "epoch": 2.007184846505552, + "grad_norm": 1.0725177718715732e+34, + "learning_rate": 0.00023307356958000386, + "loss": 7.4147, + "step": 21511 + }, + { + "epoch": 2.0072781562004294, + "grad_norm": 1.1847458292718072, + "learning_rate": 0.0002330672716111001, + "loss": 7.3863, + "step": 21512 + }, + { + "epoch": 2.0073714658953064, + "grad_norm": 1.0629937173834807, + "learning_rate": 0.00023306097343098078, + "loss": 7.5096, + "step": 21513 + }, + { + "epoch": 2.0074647755901838, + "grad_norm": 1.3244800241989509e+34, + "learning_rate": 0.0002330546750396621, + "loss": 7.3712, + "step": 21514 + }, + { + "epoch": 2.007558085285061, + "grad_norm": 1.1332001502314848, + "learning_rate": 0.00023304837643715992, + "loss": 7.6037, + "step": 21515 + }, + { + "epoch": 2.0076513949799386, + "grad_norm": 1.3544137311767073, + "learning_rate": 0.00023304207762349035, + "loss": 7.4224, + "step": 21516 + }, + { + "epoch": 2.0077447046748156, + "grad_norm": 1.1420641712911366, + "learning_rate": 0.00023303577859866932, + "loss": 7.6257, + "step": 21517 + }, + { + "epoch": 2.007838014369693, + "grad_norm": 2.13904983944687, + "learning_rate": 0.00023302947936271296, + "loss": 7.4803, + "step": 21518 + }, + { + "epoch": 2.0079313240645704, + "grad_norm": 1.202941531876748, + "learning_rate": 0.00023302317991563725, + "loss": 7.6245, + "step": 21519 + }, + { + "epoch": 2.0080246337594474, + "grad_norm": 1.080718131525596, + "learning_rate": 0.00023301688025745813, + "loss": 7.6199, + "step": 21520 + }, + { + "epoch": 2.008117943454325, + "grad_norm": 3.3753433344090877e+33, + "learning_rate": 0.00023301058038819168, + "loss": 7.1082, + "step": 21521 + }, + { + "epoch": 2.0082112531492022, + "grad_norm": 1.7356427270901658, + "learning_rate": 0.00023300428030785392, + "loss": 7.1207, + "step": 21522 + }, + { + "epoch": 2.0083045628440797, + "grad_norm": 1.1800802478502082, + "learning_rate": 0.00023299798001646087, + "loss": 7.5704, + "step": 21523 + }, + { + "epoch": 2.0083978725389566, + "grad_norm": 4.9032340051596375e+33, + "learning_rate": 0.00023299167951402856, + "loss": 7.5844, + "step": 21524 + }, + { + "epoch": 2.008491182233834, + "grad_norm": 5.1631099392620115e+33, + "learning_rate": 0.00023298537880057297, + "loss": 7.567, + "step": 21525 + }, + { + "epoch": 2.0085844919287115, + "grad_norm": 1.071990286946341, + "learning_rate": 0.00023297907787611015, + "loss": 7.5029, + "step": 21526 + }, + { + "epoch": 2.008677801623589, + "grad_norm": 1.0986472417968467, + "learning_rate": 0.00023297277674065613, + "loss": 7.5241, + "step": 21527 + }, + { + "epoch": 2.008771111318466, + "grad_norm": 1.0432228771634549, + "learning_rate": 0.00023296647539422692, + "loss": 7.4387, + "step": 21528 + }, + { + "epoch": 2.0088644210133433, + "grad_norm": 0.9765025504103485, + "learning_rate": 0.0002329601738368385, + "loss": 7.5475, + "step": 21529 + }, + { + "epoch": 2.0089577307082207, + "grad_norm": 6.648164621435972e+33, + "learning_rate": 0.000232953872068507, + "loss": 7.5944, + "step": 21530 + }, + { + "epoch": 2.0090510404030977, + "grad_norm": 1.2254613963684085, + "learning_rate": 0.0002329475700892483, + "loss": 7.5555, + "step": 21531 + }, + { + "epoch": 2.009144350097975, + "grad_norm": 3.3278216424414065e+33, + "learning_rate": 0.00023294126789907854, + "loss": 7.6123, + "step": 21532 + }, + { + "epoch": 2.0092376597928525, + "grad_norm": 1.1509781012694216, + "learning_rate": 0.0002329349654980137, + "loss": 7.3683, + "step": 21533 + }, + { + "epoch": 2.00933096948773, + "grad_norm": 1.3101762217560449, + "learning_rate": 0.0002329286628860698, + "loss": 7.1803, + "step": 21534 + }, + { + "epoch": 2.009424279182607, + "grad_norm": 1.1315874570398023, + "learning_rate": 0.00023292236006326286, + "loss": 7.6021, + "step": 21535 + }, + { + "epoch": 2.0095175888774843, + "grad_norm": 6.2753683461470124e+32, + "learning_rate": 0.00023291605702960899, + "loss": 7.7075, + "step": 21536 + }, + { + "epoch": 2.0096108985723617, + "grad_norm": 1.6931951688377512e+33, + "learning_rate": 0.00023290975378512408, + "loss": 7.5308, + "step": 21537 + }, + { + "epoch": 2.009704208267239, + "grad_norm": 1.7924297347282299, + "learning_rate": 0.00023290345032982426, + "loss": 7.7145, + "step": 21538 + }, + { + "epoch": 2.009797517962116, + "grad_norm": 1.752570649519296e+33, + "learning_rate": 0.00023289714666372552, + "loss": 7.4994, + "step": 21539 + }, + { + "epoch": 2.0098908276569936, + "grad_norm": 1.064390717879929, + "learning_rate": 0.0002328908427868439, + "loss": 7.5458, + "step": 21540 + }, + { + "epoch": 2.009984137351871, + "grad_norm": 2.7588248463986464, + "learning_rate": 0.0002328845386991954, + "loss": 6.9202, + "step": 21541 + }, + { + "epoch": 2.010077447046748, + "grad_norm": 1.6650615702847986, + "learning_rate": 0.00023287823440079608, + "loss": 7.0911, + "step": 21542 + }, + { + "epoch": 2.0101707567416254, + "grad_norm": 1.3568364280139926, + "learning_rate": 0.00023287192989166194, + "loss": 7.6693, + "step": 21543 + }, + { + "epoch": 2.010264066436503, + "grad_norm": 1.170270798743785, + "learning_rate": 0.00023286562517180906, + "loss": 7.1291, + "step": 21544 + }, + { + "epoch": 2.01035737613138, + "grad_norm": 1.29323199715832, + "learning_rate": 0.0002328593202412534, + "loss": 7.5601, + "step": 21545 + }, + { + "epoch": 2.010450685826257, + "grad_norm": 9.023533824372029, + "learning_rate": 0.00023285301510001107, + "loss": 7.469, + "step": 21546 + }, + { + "epoch": 2.0105439955211346, + "grad_norm": 1.9257506107030038, + "learning_rate": 0.00023284670974809806, + "loss": 7.4252, + "step": 21547 + }, + { + "epoch": 2.010637305216012, + "grad_norm": 1.914535518059261, + "learning_rate": 0.0002328404041855304, + "loss": 7.1953, + "step": 21548 + }, + { + "epoch": 2.0107306149108894, + "grad_norm": 1.7320612154096257e+34, + "learning_rate": 0.00023283409841232418, + "loss": 7.5232, + "step": 21549 + }, + { + "epoch": 2.0108239246057664, + "grad_norm": 1.0889038486613856, + "learning_rate": 0.0002328277924284953, + "loss": 7.6966, + "step": 21550 + }, + { + "epoch": 2.010917234300644, + "grad_norm": 1.1808687134359324, + "learning_rate": 0.00023282148623405993, + "loss": 7.7069, + "step": 21551 + }, + { + "epoch": 2.0110105439955213, + "grad_norm": 1.3451341170592856, + "learning_rate": 0.00023281517982903406, + "loss": 7.4553, + "step": 21552 + }, + { + "epoch": 2.0111038536903982, + "grad_norm": 1.0235385689098278, + "learning_rate": 0.0002328088732134337, + "loss": 7.7265, + "step": 21553 + }, + { + "epoch": 2.0111971633852757, + "grad_norm": 1.4063486238151395, + "learning_rate": 0.0002328025663872749, + "loss": 7.3802, + "step": 21554 + }, + { + "epoch": 2.011290473080153, + "grad_norm": 1.2193170907442836, + "learning_rate": 0.00023279625935057372, + "loss": 7.6512, + "step": 21555 + }, + { + "epoch": 2.0113837827750305, + "grad_norm": 1.728673118196538, + "learning_rate": 0.00023278995210334618, + "loss": 7.5331, + "step": 21556 + }, + { + "epoch": 2.0114770924699075, + "grad_norm": 1.1722145196888922, + "learning_rate": 0.0002327836446456083, + "loss": 7.5623, + "step": 21557 + }, + { + "epoch": 2.011570402164785, + "grad_norm": 1.0500666155189804, + "learning_rate": 0.0002327773369773761, + "loss": 7.4864, + "step": 21558 + }, + { + "epoch": 2.0116637118596623, + "grad_norm": 1.3410736788542565, + "learning_rate": 0.00023277102909866575, + "loss": 7.1999, + "step": 21559 + }, + { + "epoch": 2.0117570215545397, + "grad_norm": 0.9907558215621686, + "learning_rate": 0.00023276472100949313, + "loss": 7.2528, + "step": 21560 + }, + { + "epoch": 2.0118503312494167, + "grad_norm": 4.925012833178229e+34, + "learning_rate": 0.00023275841270987432, + "loss": 7.496, + "step": 21561 + }, + { + "epoch": 2.011943640944294, + "grad_norm": 4.3387479181579565e+35, + "learning_rate": 0.00023275210419982544, + "loss": 7.4597, + "step": 21562 + }, + { + "epoch": 2.0120369506391715, + "grad_norm": 1.0331886174365623, + "learning_rate": 0.00023274579547936242, + "loss": 7.0581, + "step": 21563 + }, + { + "epoch": 2.0121302603340485, + "grad_norm": 1.4488450805396809, + "learning_rate": 0.00023273948654850134, + "loss": 7.5462, + "step": 21564 + }, + { + "epoch": 2.012223570028926, + "grad_norm": 0.9552398981452668, + "learning_rate": 0.00023273317740725829, + "loss": 7.2095, + "step": 21565 + }, + { + "epoch": 2.0123168797238034, + "grad_norm": 1.0403802565589086, + "learning_rate": 0.00023272686805564933, + "loss": 7.553, + "step": 21566 + }, + { + "epoch": 2.0124101894186808, + "grad_norm": 1.565397980991706, + "learning_rate": 0.00023272055849369035, + "loss": 6.9862, + "step": 21567 + }, + { + "epoch": 2.0125034991135577, + "grad_norm": 1.2551133456849684, + "learning_rate": 0.00023271424872139756, + "loss": 7.7915, + "step": 21568 + }, + { + "epoch": 2.012596808808435, + "grad_norm": 1.1051405232951004, + "learning_rate": 0.00023270793873878692, + "loss": 7.5436, + "step": 21569 + }, + { + "epoch": 2.0126901185033126, + "grad_norm": 1.038436943865182, + "learning_rate": 0.00023270162854587448, + "loss": 7.4857, + "step": 21570 + }, + { + "epoch": 2.01278342819819, + "grad_norm": 1.194117091734381, + "learning_rate": 0.00023269531814267631, + "loss": 7.4469, + "step": 21571 + }, + { + "epoch": 2.012876737893067, + "grad_norm": 2.8808872111724754e+34, + "learning_rate": 0.00023268900752920844, + "loss": 7.5303, + "step": 21572 + }, + { + "epoch": 2.0129700475879444, + "grad_norm": 1.4185046310052363, + "learning_rate": 0.00023268269670548687, + "loss": 7.2333, + "step": 21573 + }, + { + "epoch": 2.013063357282822, + "grad_norm": 1.2559141229135562, + "learning_rate": 0.00023267638567152775, + "loss": 7.672, + "step": 21574 + }, + { + "epoch": 2.013156666977699, + "grad_norm": 1.222005102322775, + "learning_rate": 0.00023267007442734708, + "loss": 7.7856, + "step": 21575 + }, + { + "epoch": 2.013249976672576, + "grad_norm": 6.4767854072219695e+32, + "learning_rate": 0.00023266376297296086, + "loss": 7.6303, + "step": 21576 + }, + { + "epoch": 2.0133432863674536, + "grad_norm": 1.3770397472378921, + "learning_rate": 0.0002326574513083852, + "loss": 6.929, + "step": 21577 + }, + { + "epoch": 2.013436596062331, + "grad_norm": 1.3660040313313266, + "learning_rate": 0.00023265113943363614, + "loss": 7.6713, + "step": 21578 + }, + { + "epoch": 2.013529905757208, + "grad_norm": 1.8270148632176933e+34, + "learning_rate": 0.00023264482734872968, + "loss": 7.4649, + "step": 21579 + }, + { + "epoch": 2.0136232154520854, + "grad_norm": 1.4893253698505549, + "learning_rate": 0.00023263851505368193, + "loss": 7.7283, + "step": 21580 + }, + { + "epoch": 2.013716525146963, + "grad_norm": 1.547958044278866, + "learning_rate": 0.00023263220254850892, + "loss": 7.3343, + "step": 21581 + }, + { + "epoch": 2.0138098348418403, + "grad_norm": 1.4666725778664673, + "learning_rate": 0.00023262588983322666, + "loss": 7.5821, + "step": 21582 + }, + { + "epoch": 2.0139031445367173, + "grad_norm": 6.185713782219769e+33, + "learning_rate": 0.0002326195769078513, + "loss": 7.5529, + "step": 21583 + }, + { + "epoch": 2.0139964542315947, + "grad_norm": 1.2516006404459095, + "learning_rate": 0.0002326132637723988, + "loss": 7.2433, + "step": 21584 + }, + { + "epoch": 2.014089763926472, + "grad_norm": 1.2381285285364665, + "learning_rate": 0.00023260695042688525, + "loss": 7.5231, + "step": 21585 + }, + { + "epoch": 2.014183073621349, + "grad_norm": 1.0105380522288252, + "learning_rate": 0.00023260063687132666, + "loss": 7.5278, + "step": 21586 + }, + { + "epoch": 2.0142763833162265, + "grad_norm": 1.8707241783318538, + "learning_rate": 0.00023259432310573913, + "loss": 7.1256, + "step": 21587 + }, + { + "epoch": 2.014369693011104, + "grad_norm": 3.5030721811520644, + "learning_rate": 0.00023258800913013872, + "loss": 7.1564, + "step": 21588 + }, + { + "epoch": 2.0144630027059813, + "grad_norm": 2.1401281146779024e+34, + "learning_rate": 0.00023258169494454146, + "loss": 7.8799, + "step": 21589 + }, + { + "epoch": 2.0145563124008583, + "grad_norm": 0.9459280356650152, + "learning_rate": 0.0002325753805489634, + "loss": 7.4553, + "step": 21590 + }, + { + "epoch": 2.0146496220957357, + "grad_norm": 1.0521365839388959, + "learning_rate": 0.00023256906594342063, + "loss": 7.3485, + "step": 21591 + }, + { + "epoch": 2.014742931790613, + "grad_norm": 1.3197241528185486, + "learning_rate": 0.00023256275112792916, + "loss": 7.0468, + "step": 21592 + }, + { + "epoch": 2.01483624148549, + "grad_norm": 1.211696003425688e+34, + "learning_rate": 0.0002325564361025051, + "loss": 7.5915, + "step": 21593 + }, + { + "epoch": 2.0149295511803675, + "grad_norm": 1.4169517634525741, + "learning_rate": 0.00023255012086716446, + "loss": 7.7708, + "step": 21594 + }, + { + "epoch": 2.015022860875245, + "grad_norm": 1.0679478573940322, + "learning_rate": 0.00023254380542192331, + "loss": 6.9663, + "step": 21595 + }, + { + "epoch": 2.0151161705701224, + "grad_norm": 1.1607999843191406, + "learning_rate": 0.0002325374897667977, + "loss": 7.3923, + "step": 21596 + }, + { + "epoch": 2.0152094802649994, + "grad_norm": 1.1651161301561042, + "learning_rate": 0.00023253117390180378, + "loss": 7.5912, + "step": 21597 + }, + { + "epoch": 2.0153027899598768, + "grad_norm": 1.3946450367978884, + "learning_rate": 0.00023252485782695746, + "loss": 7.5646, + "step": 21598 + }, + { + "epoch": 2.015396099654754, + "grad_norm": 1.252737139639355, + "learning_rate": 0.0002325185415422749, + "loss": 7.4446, + "step": 21599 + }, + { + "epoch": 2.0154894093496316, + "grad_norm": 9.721783523741707e+33, + "learning_rate": 0.00023251222504777212, + "loss": 7.43, + "step": 21600 + }, + { + "epoch": 2.0155827190445086, + "grad_norm": 5.401371488367138e+34, + "learning_rate": 0.00023250590834346515, + "loss": 7.1592, + "step": 21601 + }, + { + "epoch": 2.015676028739386, + "grad_norm": 1.0414664744787125, + "learning_rate": 0.00023249959142937016, + "loss": 7.4718, + "step": 21602 + }, + { + "epoch": 2.0157693384342634, + "grad_norm": 1.456406078969631, + "learning_rate": 0.00023249327430550316, + "loss": 7.881, + "step": 21603 + }, + { + "epoch": 2.0158626481291404, + "grad_norm": 1.321499614049528, + "learning_rate": 0.00023248695697188014, + "loss": 7.5364, + "step": 21604 + }, + { + "epoch": 2.015955957824018, + "grad_norm": 1.2339834500744074, + "learning_rate": 0.0002324806394285172, + "loss": 7.7933, + "step": 21605 + }, + { + "epoch": 2.0160492675188952, + "grad_norm": 1.1748675000758468, + "learning_rate": 0.00023247432167543055, + "loss": 7.4979, + "step": 21606 + }, + { + "epoch": 2.0161425772137727, + "grad_norm": 3.8923132423124254, + "learning_rate": 0.00023246800371263604, + "loss": 7.7123, + "step": 21607 + }, + { + "epoch": 2.0162358869086496, + "grad_norm": 1.253607123361621e+34, + "learning_rate": 0.00023246168554014984, + "loss": 7.7708, + "step": 21608 + }, + { + "epoch": 2.016329196603527, + "grad_norm": 1.1807026416043132e+34, + "learning_rate": 0.000232455367157988, + "loss": 7.1643, + "step": 21609 + }, + { + "epoch": 2.0164225062984045, + "grad_norm": 3.293447249079901e+33, + "learning_rate": 0.0002324490485661666, + "loss": 7.4737, + "step": 21610 + }, + { + "epoch": 2.016515815993282, + "grad_norm": 2.047295637244775e+32, + "learning_rate": 0.00023244272976470168, + "loss": 7.5768, + "step": 21611 + }, + { + "epoch": 2.016609125688159, + "grad_norm": 1.5538666965819248, + "learning_rate": 0.00023243641075360933, + "loss": 7.6503, + "step": 21612 + }, + { + "epoch": 2.0167024353830363, + "grad_norm": 2.1998679069311833e+33, + "learning_rate": 0.00023243009153290563, + "loss": 7.4149, + "step": 21613 + }, + { + "epoch": 2.0167957450779137, + "grad_norm": 1.0388414505280712, + "learning_rate": 0.00023242377210260656, + "loss": 7.4925, + "step": 21614 + }, + { + "epoch": 2.0168890547727907, + "grad_norm": 1.6505886365677969, + "learning_rate": 0.0002324174524627283, + "loss": 7.6072, + "step": 21615 + }, + { + "epoch": 2.016982364467668, + "grad_norm": 0.9647189381933383, + "learning_rate": 0.00023241113261328686, + "loss": 7.441, + "step": 21616 + }, + { + "epoch": 2.0170756741625455, + "grad_norm": 0.9582446179020742, + "learning_rate": 0.0002324048125542983, + "loss": 7.3583, + "step": 21617 + }, + { + "epoch": 2.017168983857423, + "grad_norm": 1.1671395211469275, + "learning_rate": 0.00023239849228577872, + "loss": 7.3436, + "step": 21618 + }, + { + "epoch": 2.0172622935523, + "grad_norm": 1.0688475201407361, + "learning_rate": 0.0002323921718077442, + "loss": 7.3948, + "step": 21619 + }, + { + "epoch": 2.0173556032471773, + "grad_norm": 0.9716862977509556, + "learning_rate": 0.0002323858511202108, + "loss": 7.3112, + "step": 21620 + }, + { + "epoch": 2.0174489129420548, + "grad_norm": 1.1831319693186224, + "learning_rate": 0.00023237953022319457, + "loss": 7.219, + "step": 21621 + }, + { + "epoch": 2.017542222636932, + "grad_norm": 1.116123599665525, + "learning_rate": 0.0002323732091167116, + "loss": 7.5409, + "step": 21622 + }, + { + "epoch": 2.017635532331809, + "grad_norm": 1.8031344486409684, + "learning_rate": 0.00023236688780077798, + "loss": 8.1856, + "step": 21623 + }, + { + "epoch": 2.0177288420266866, + "grad_norm": 1.1143788925546134, + "learning_rate": 0.00023236056627540973, + "loss": 7.4158, + "step": 21624 + }, + { + "epoch": 2.017822151721564, + "grad_norm": 2.1239604272797354, + "learning_rate": 0.00023235424454062299, + "loss": 7.1447, + "step": 21625 + }, + { + "epoch": 2.017915461416441, + "grad_norm": 1.055208238777486, + "learning_rate": 0.00023234792259643376, + "loss": 7.4726, + "step": 21626 + }, + { + "epoch": 2.0180087711113184, + "grad_norm": 1.098001853244988, + "learning_rate": 0.00023234160044285817, + "loss": 7.4216, + "step": 21627 + }, + { + "epoch": 2.018102080806196, + "grad_norm": 0.9938710441045673, + "learning_rate": 0.0002323352780799123, + "loss": 7.4836, + "step": 21628 + }, + { + "epoch": 2.018195390501073, + "grad_norm": 0.9784684722583358, + "learning_rate": 0.00023232895550761223, + "loss": 7.4292, + "step": 21629 + }, + { + "epoch": 2.01828870019595, + "grad_norm": 1.0709999077635, + "learning_rate": 0.00023232263272597394, + "loss": 7.5972, + "step": 21630 + }, + { + "epoch": 2.0183820098908276, + "grad_norm": 1.72786255721799e+34, + "learning_rate": 0.00023231630973501364, + "loss": 7.4248, + "step": 21631 + }, + { + "epoch": 2.018475319585705, + "grad_norm": 1.8294826753485898, + "learning_rate": 0.00023230998653474733, + "loss": 7.2211, + "step": 21632 + }, + { + "epoch": 2.0185686292805824, + "grad_norm": 0.9909166826042946, + "learning_rate": 0.0002323036631251911, + "loss": 7.308, + "step": 21633 + }, + { + "epoch": 2.0186619389754594, + "grad_norm": 9.570167931852389e+35, + "learning_rate": 0.00023229733950636106, + "loss": 7.488, + "step": 21634 + }, + { + "epoch": 2.018755248670337, + "grad_norm": 1.0432674273477838, + "learning_rate": 0.00023229101567827327, + "loss": 7.5982, + "step": 21635 + }, + { + "epoch": 2.0188485583652143, + "grad_norm": 1.4886097191345242, + "learning_rate": 0.0002322846916409437, + "loss": 7.8117, + "step": 21636 + }, + { + "epoch": 2.0189418680600912, + "grad_norm": 1.2351135335969239, + "learning_rate": 0.0002322783673943887, + "loss": 7.6189, + "step": 21637 + }, + { + "epoch": 2.0190351777549687, + "grad_norm": 1.3277389792880292, + "learning_rate": 0.00023227204293862404, + "loss": 7.2924, + "step": 21638 + }, + { + "epoch": 2.019128487449846, + "grad_norm": 1.2122581183248216, + "learning_rate": 0.00023226571827366602, + "loss": 7.7567, + "step": 21639 + }, + { + "epoch": 2.0192217971447235, + "grad_norm": 2.0193439559711144, + "learning_rate": 0.00023225939339953063, + "loss": 7.0019, + "step": 21640 + }, + { + "epoch": 2.0193151068396005, + "grad_norm": 1.3760128275305699, + "learning_rate": 0.000232253068316234, + "loss": 7.7071, + "step": 21641 + }, + { + "epoch": 2.019408416534478, + "grad_norm": 8.269959087730522e+35, + "learning_rate": 0.00023224674302379212, + "loss": 7.8416, + "step": 21642 + }, + { + "epoch": 2.0195017262293553, + "grad_norm": 1.1043352394178452, + "learning_rate": 0.00023224041752222118, + "loss": 7.6497, + "step": 21643 + }, + { + "epoch": 2.0195950359242327, + "grad_norm": 1.339086993623729, + "learning_rate": 0.00023223409181153719, + "loss": 7.4835, + "step": 21644 + }, + { + "epoch": 2.0196883456191097, + "grad_norm": 1.1551196241628383, + "learning_rate": 0.00023222776589175624, + "loss": 7.6444, + "step": 21645 + }, + { + "epoch": 2.019781655313987, + "grad_norm": 1.1525557816121035, + "learning_rate": 0.00023222143976289451, + "loss": 7.5276, + "step": 21646 + }, + { + "epoch": 2.0198749650088645, + "grad_norm": 1.7312169903750527, + "learning_rate": 0.000232215113424968, + "loss": 7.15, + "step": 21647 + }, + { + "epoch": 2.0199682747037415, + "grad_norm": 3.1629679851236692e+35, + "learning_rate": 0.0002322087868779928, + "loss": 7.3712, + "step": 21648 + }, + { + "epoch": 2.020061584398619, + "grad_norm": 1.6265700409458046, + "learning_rate": 0.00023220246012198498, + "loss": 7.3276, + "step": 21649 + }, + { + "epoch": 2.0201548940934964, + "grad_norm": 1.1172676384285682, + "learning_rate": 0.0002321961331569607, + "loss": 7.3285, + "step": 21650 + }, + { + "epoch": 2.0202482037883738, + "grad_norm": 1.03902518298715, + "learning_rate": 0.00023218980598293596, + "loss": 7.1519, + "step": 21651 + }, + { + "epoch": 2.0203415134832508, + "grad_norm": 1.1695903537796886, + "learning_rate": 0.0002321834785999269, + "loss": 7.4617, + "step": 21652 + }, + { + "epoch": 2.020434823178128, + "grad_norm": 1.0797715050116186, + "learning_rate": 0.00023217715100794966, + "loss": 7.2733, + "step": 21653 + }, + { + "epoch": 2.0205281328730056, + "grad_norm": 1.0233570980904994, + "learning_rate": 0.0002321708232070202, + "loss": 7.2156, + "step": 21654 + }, + { + "epoch": 2.020621442567883, + "grad_norm": 94.86154497369004, + "learning_rate": 0.0002321644951971547, + "loss": 7.5793, + "step": 21655 + }, + { + "epoch": 2.02071475226276, + "grad_norm": 0.9885873394827714, + "learning_rate": 0.00023215816697836917, + "loss": 7.0048, + "step": 21656 + }, + { + "epoch": 2.0208080619576374, + "grad_norm": 1.5769527281173263, + "learning_rate": 0.00023215183855067982, + "loss": 7.6991, + "step": 21657 + }, + { + "epoch": 2.020901371652515, + "grad_norm": 9.39169777796855e+32, + "learning_rate": 0.0002321455099141027, + "loss": 7.4039, + "step": 21658 + }, + { + "epoch": 2.020994681347392, + "grad_norm": 1.1325656704087235e+34, + "learning_rate": 0.00023213918106865383, + "loss": 7.5044, + "step": 21659 + }, + { + "epoch": 2.021087991042269, + "grad_norm": 4.384718619362773e+34, + "learning_rate": 0.0002321328520143494, + "loss": 7.4236, + "step": 21660 + }, + { + "epoch": 2.0211813007371466, + "grad_norm": 1.2139793080693624, + "learning_rate": 0.00023212652275120542, + "loss": 7.4317, + "step": 21661 + }, + { + "epoch": 2.021274610432024, + "grad_norm": 3.188719429381502e+34, + "learning_rate": 0.000232120193279238, + "loss": 7.6898, + "step": 21662 + }, + { + "epoch": 2.021367920126901, + "grad_norm": 1.1697998714142321, + "learning_rate": 0.0002321138635984633, + "loss": 7.3601, + "step": 21663 + }, + { + "epoch": 2.0214612298217784, + "grad_norm": 7.314740091908123e+34, + "learning_rate": 0.0002321075337088973, + "loss": 7.3576, + "step": 21664 + }, + { + "epoch": 2.021554539516656, + "grad_norm": 1.0080949956748537, + "learning_rate": 0.00023210120361055628, + "loss": 7.5666, + "step": 21665 + }, + { + "epoch": 2.0216478492115333, + "grad_norm": 1.239446424336775, + "learning_rate": 0.00023209487330345615, + "loss": 7.8331, + "step": 21666 + }, + { + "epoch": 2.0217411589064103, + "grad_norm": 1.1961014500893241, + "learning_rate": 0.00023208854278761304, + "loss": 7.2256, + "step": 21667 + }, + { + "epoch": 2.0218344686012877, + "grad_norm": 0.9737271665268666, + "learning_rate": 0.00023208221206304314, + "loss": 7.5144, + "step": 21668 + }, + { + "epoch": 2.021927778296165, + "grad_norm": 1.520832624414542, + "learning_rate": 0.00023207588112976249, + "loss": 7.5872, + "step": 21669 + }, + { + "epoch": 2.022021087991042, + "grad_norm": 1.1609639533166092, + "learning_rate": 0.00023206954998778717, + "loss": 7.5606, + "step": 21670 + }, + { + "epoch": 2.0221143976859195, + "grad_norm": 1.9279185185294956, + "learning_rate": 0.0002320632186371333, + "loss": 7.2514, + "step": 21671 + }, + { + "epoch": 2.022207707380797, + "grad_norm": 9.402336609021913e+35, + "learning_rate": 0.00023205688707781698, + "loss": 7.3339, + "step": 21672 + }, + { + "epoch": 2.0223010170756743, + "grad_norm": 8.364452438105078e+32, + "learning_rate": 0.0002320505553098543, + "loss": 7.3056, + "step": 21673 + }, + { + "epoch": 2.0223943267705513, + "grad_norm": 1.3550333227904425, + "learning_rate": 0.00023204422333326132, + "loss": 7.6677, + "step": 21674 + }, + { + "epoch": 2.0224876364654287, + "grad_norm": 1.190731413885932, + "learning_rate": 0.00023203789114805424, + "loss": 7.3762, + "step": 21675 + }, + { + "epoch": 2.022580946160306, + "grad_norm": 4.276378884760201e+33, + "learning_rate": 0.0002320315587542491, + "loss": 7.3429, + "step": 21676 + }, + { + "epoch": 2.0226742558551836, + "grad_norm": 1.0238258337600754, + "learning_rate": 0.00023202522615186196, + "loss": 7.481, + "step": 21677 + }, + { + "epoch": 2.0227675655500605, + "grad_norm": 1.3171212721313459, + "learning_rate": 0.00023201889334090902, + "loss": 7.2326, + "step": 21678 + }, + { + "epoch": 2.022860875244938, + "grad_norm": 1.0303364070805477e+33, + "learning_rate": 0.0002320125603214063, + "loss": 7.3868, + "step": 21679 + }, + { + "epoch": 2.0229541849398154, + "grad_norm": 1.3466877196951454, + "learning_rate": 0.00023200622709336992, + "loss": 7.6347, + "step": 21680 + }, + { + "epoch": 2.0230474946346924, + "grad_norm": 6.708987729964136e+34, + "learning_rate": 0.00023199989365681601, + "loss": 7.7551, + "step": 21681 + }, + { + "epoch": 2.0231408043295698, + "grad_norm": 1.1511519308715432, + "learning_rate": 0.0002319935600117607, + "loss": 7.3385, + "step": 21682 + }, + { + "epoch": 2.023234114024447, + "grad_norm": 2.3166837504891415e+34, + "learning_rate": 0.00023198722615822, + "loss": 7.3421, + "step": 21683 + }, + { + "epoch": 2.0233274237193246, + "grad_norm": 2.4024866862931696e+34, + "learning_rate": 0.00023198089209621012, + "loss": 7.0326, + "step": 21684 + }, + { + "epoch": 2.0234207334142016, + "grad_norm": 1.674587585578945, + "learning_rate": 0.00023197455782574707, + "loss": 7.4145, + "step": 21685 + }, + { + "epoch": 2.023514043109079, + "grad_norm": 1.5853926208594058, + "learning_rate": 0.00023196822334684702, + "loss": 7.6106, + "step": 21686 + }, + { + "epoch": 2.0236073528039564, + "grad_norm": 1.1817755802741994, + "learning_rate": 0.00023196188865952605, + "loss": 7.1045, + "step": 21687 + }, + { + "epoch": 2.023700662498834, + "grad_norm": 1.3571282248326154e+34, + "learning_rate": 0.0002319555537638003, + "loss": 7.735, + "step": 21688 + }, + { + "epoch": 2.023793972193711, + "grad_norm": 1.0303681634516655, + "learning_rate": 0.00023194921865968583, + "loss": 7.4642, + "step": 21689 + }, + { + "epoch": 2.0238872818885882, + "grad_norm": 2.1855810307703085, + "learning_rate": 0.00023194288334719878, + "loss": 7.111, + "step": 21690 + }, + { + "epoch": 2.0239805915834657, + "grad_norm": 1.3606581998781673, + "learning_rate": 0.00023193654782635527, + "loss": 7.1832, + "step": 21691 + }, + { + "epoch": 2.0240739012783426, + "grad_norm": 1.3744016642103145, + "learning_rate": 0.00023193021209717135, + "loss": 7.6443, + "step": 21692 + }, + { + "epoch": 2.02416721097322, + "grad_norm": 1.0218480979870685, + "learning_rate": 0.0002319238761596632, + "loss": 7.4647, + "step": 21693 + }, + { + "epoch": 2.0242605206680975, + "grad_norm": 1.2599625834454853, + "learning_rate": 0.0002319175400138469, + "loss": 7.4475, + "step": 21694 + }, + { + "epoch": 2.024353830362975, + "grad_norm": 1.665603930367851, + "learning_rate": 0.00023191120365973855, + "loss": 7.5502, + "step": 21695 + }, + { + "epoch": 2.024447140057852, + "grad_norm": 5.974339830722579e+34, + "learning_rate": 0.00023190486709735427, + "loss": 7.2616, + "step": 21696 + }, + { + "epoch": 2.0245404497527293, + "grad_norm": 8.679090994261594e+32, + "learning_rate": 0.0002318985303267102, + "loss": 7.4729, + "step": 21697 + }, + { + "epoch": 2.0246337594476067, + "grad_norm": 1.176167129970534, + "learning_rate": 0.00023189219334782238, + "loss": 7.5937, + "step": 21698 + }, + { + "epoch": 2.024727069142484, + "grad_norm": 1.1250258507696371, + "learning_rate": 0.000231885856160707, + "loss": 7.4296, + "step": 21699 + }, + { + "epoch": 2.024820378837361, + "grad_norm": 1.3548318463029745, + "learning_rate": 0.00023187951876538015, + "loss": 7.7504, + "step": 21700 + }, + { + "epoch": 2.0249136885322385, + "grad_norm": 0.962610397620686, + "learning_rate": 0.00023187318116185796, + "loss": 7.4145, + "step": 21701 + }, + { + "epoch": 2.025006998227116, + "grad_norm": 1.1488414572992898, + "learning_rate": 0.00023186684335015645, + "loss": 7.5919, + "step": 21702 + }, + { + "epoch": 2.025100307921993, + "grad_norm": 2.2640760439575627, + "learning_rate": 0.0002318605053302919, + "loss": 7.0219, + "step": 21703 + }, + { + "epoch": 2.0251936176168703, + "grad_norm": 1.2993984603073792, + "learning_rate": 0.00023185416710228026, + "loss": 7.4394, + "step": 21704 + }, + { + "epoch": 2.0252869273117478, + "grad_norm": 1.1199281683130002, + "learning_rate": 0.00023184782866613772, + "loss": 7.4044, + "step": 21705 + }, + { + "epoch": 2.025380237006625, + "grad_norm": 1.0603975076794634, + "learning_rate": 0.00023184149002188043, + "loss": 7.5042, + "step": 21706 + }, + { + "epoch": 2.025473546701502, + "grad_norm": 1.5977968816006178, + "learning_rate": 0.0002318351511695245, + "loss": 7.6038, + "step": 21707 + }, + { + "epoch": 2.0255668563963796, + "grad_norm": 1.1243068946617993, + "learning_rate": 0.00023182881210908598, + "loss": 7.5793, + "step": 21708 + }, + { + "epoch": 2.025660166091257, + "grad_norm": 1.0721450319938112e+34, + "learning_rate": 0.00023182247284058105, + "loss": 7.5161, + "step": 21709 + }, + { + "epoch": 2.025753475786134, + "grad_norm": 4.0122509864343173e+34, + "learning_rate": 0.0002318161333640258, + "loss": 7.2507, + "step": 21710 + }, + { + "epoch": 2.0258467854810114, + "grad_norm": 1.2728816276539952, + "learning_rate": 0.00023180979367943633, + "loss": 7.5069, + "step": 21711 + }, + { + "epoch": 2.025940095175889, + "grad_norm": 2.0797580703067093, + "learning_rate": 0.00023180345378682882, + "loss": 7.5039, + "step": 21712 + }, + { + "epoch": 2.026033404870766, + "grad_norm": 2.7715439077772444, + "learning_rate": 0.00023179711368621938, + "loss": 7.6535, + "step": 21713 + }, + { + "epoch": 2.026126714565643, + "grad_norm": 1.922330554514852, + "learning_rate": 0.00023179077337762407, + "loss": 7.1332, + "step": 21714 + }, + { + "epoch": 2.0262200242605206, + "grad_norm": 1.2747391654774005, + "learning_rate": 0.00023178443286105908, + "loss": 7.3039, + "step": 21715 + }, + { + "epoch": 2.026313333955398, + "grad_norm": 1.1061048674558827, + "learning_rate": 0.00023177809213654048, + "loss": 7.3007, + "step": 21716 + }, + { + "epoch": 2.0264066436502755, + "grad_norm": 1.3560667980038756, + "learning_rate": 0.00023177175120408445, + "loss": 7.2638, + "step": 21717 + }, + { + "epoch": 2.0264999533451524, + "grad_norm": 1.2334288445317718, + "learning_rate": 0.00023176541006370704, + "loss": 7.7246, + "step": 21718 + }, + { + "epoch": 2.02659326304003, + "grad_norm": 8.072490431311606e+34, + "learning_rate": 0.00023175906871542444, + "loss": 7.3933, + "step": 21719 + }, + { + "epoch": 2.0266865727349073, + "grad_norm": 1.0240440578450194, + "learning_rate": 0.00023175272715925273, + "loss": 7.536, + "step": 21720 + }, + { + "epoch": 2.0267798824297842, + "grad_norm": 1.1077116123366386, + "learning_rate": 0.00023174638539520806, + "loss": 7.3922, + "step": 21721 + }, + { + "epoch": 2.0268731921246617, + "grad_norm": 1.1812389165949562, + "learning_rate": 0.00023174004342330656, + "loss": 7.6652, + "step": 21722 + }, + { + "epoch": 2.026966501819539, + "grad_norm": 1.090827247271858, + "learning_rate": 0.00023173370124356433, + "loss": 7.4433, + "step": 21723 + }, + { + "epoch": 2.0270598115144165, + "grad_norm": 1.9400597259898058, + "learning_rate": 0.00023172735885599747, + "loss": 7.6528, + "step": 21724 + }, + { + "epoch": 2.0271531212092935, + "grad_norm": 1.3535278339789996e+34, + "learning_rate": 0.00023172101626062222, + "loss": 7.3646, + "step": 21725 + }, + { + "epoch": 2.027246430904171, + "grad_norm": 1.245278555506947, + "learning_rate": 0.0002317146734574546, + "loss": 7.6805, + "step": 21726 + }, + { + "epoch": 2.0273397405990483, + "grad_norm": 4.915448341615833e+34, + "learning_rate": 0.00023170833044651072, + "loss": 7.5815, + "step": 21727 + }, + { + "epoch": 2.0274330502939257, + "grad_norm": 2.0120272422176566e+34, + "learning_rate": 0.0002317019872278068, + "loss": 7.5822, + "step": 21728 + }, + { + "epoch": 2.0275263599888027, + "grad_norm": 1.3534362407881046, + "learning_rate": 0.00023169564380135895, + "loss": 7.415, + "step": 21729 + }, + { + "epoch": 2.02761966968368, + "grad_norm": 1.3108650543507963, + "learning_rate": 0.00023168930016718327, + "loss": 7.4447, + "step": 21730 + }, + { + "epoch": 2.0277129793785575, + "grad_norm": 1.3603936630172764, + "learning_rate": 0.0002316829563252959, + "loss": 7.6576, + "step": 21731 + }, + { + "epoch": 2.0278062890734345, + "grad_norm": 1.4238828604343234, + "learning_rate": 0.00023167661227571295, + "loss": 7.2148, + "step": 21732 + }, + { + "epoch": 2.027899598768312, + "grad_norm": 1.63099201250494, + "learning_rate": 0.00023167026801845057, + "loss": 7.5348, + "step": 21733 + }, + { + "epoch": 2.0279929084631894, + "grad_norm": 1.923901932688108e+33, + "learning_rate": 0.00023166392355352492, + "loss": 7.1396, + "step": 21734 + }, + { + "epoch": 2.028086218158067, + "grad_norm": 1.1907035976910556, + "learning_rate": 0.00023165757888095207, + "loss": 7.3583, + "step": 21735 + }, + { + "epoch": 2.0281795278529438, + "grad_norm": 5.026380125528215e+34, + "learning_rate": 0.0002316512340007482, + "loss": 7.4498, + "step": 21736 + }, + { + "epoch": 2.028272837547821, + "grad_norm": 1.249904541353803, + "learning_rate": 0.00023164488891292944, + "loss": 7.648, + "step": 21737 + }, + { + "epoch": 2.0283661472426986, + "grad_norm": 1.3595886292470256, + "learning_rate": 0.00023163854361751192, + "loss": 7.3357, + "step": 21738 + }, + { + "epoch": 2.028459456937576, + "grad_norm": 0.990551247137328, + "learning_rate": 0.00023163219811451173, + "loss": 7.1934, + "step": 21739 + }, + { + "epoch": 2.028552766632453, + "grad_norm": 1.2715292248230698, + "learning_rate": 0.00023162585240394504, + "loss": 7.45, + "step": 21740 + }, + { + "epoch": 2.0286460763273304, + "grad_norm": 6.207008504295368, + "learning_rate": 0.00023161950648582802, + "loss": 7.5026, + "step": 21741 + }, + { + "epoch": 2.028739386022208, + "grad_norm": 146.6567553195216, + "learning_rate": 0.00023161316036017675, + "loss": 7.4843, + "step": 21742 + }, + { + "epoch": 2.028832695717085, + "grad_norm": 2.2754555627826076, + "learning_rate": 0.0002316068140270074, + "loss": 7.1274, + "step": 21743 + }, + { + "epoch": 2.028926005411962, + "grad_norm": 4.6374689342075226e+35, + "learning_rate": 0.0002316004674863361, + "loss": 7.317, + "step": 21744 + }, + { + "epoch": 2.0290193151068396, + "grad_norm": 1.4852586866582624, + "learning_rate": 0.00023159412073817895, + "loss": 7.7561, + "step": 21745 + }, + { + "epoch": 2.029112624801717, + "grad_norm": 3.18617345596418e+31, + "learning_rate": 0.00023158777378255214, + "loss": 7.4723, + "step": 21746 + }, + { + "epoch": 2.029205934496594, + "grad_norm": 1.1622677713072715, + "learning_rate": 0.0002315814266194718, + "loss": 7.4114, + "step": 21747 + }, + { + "epoch": 2.0292992441914715, + "grad_norm": 1.282091539691908, + "learning_rate": 0.00023157507924895408, + "loss": 7.3808, + "step": 21748 + }, + { + "epoch": 2.029392553886349, + "grad_norm": 1.1590456561706548, + "learning_rate": 0.00023156873167101504, + "loss": 7.3046, + "step": 21749 + }, + { + "epoch": 2.0294858635812263, + "grad_norm": 1.3351075365666842, + "learning_rate": 0.0002315623838856709, + "loss": 7.4969, + "step": 21750 + }, + { + "epoch": 2.0295791732761033, + "grad_norm": 1.6970402252668564, + "learning_rate": 0.00023155603589293782, + "loss": 7.6696, + "step": 21751 + }, + { + "epoch": 2.0296724829709807, + "grad_norm": 1.365399693635364, + "learning_rate": 0.00023154968769283185, + "loss": 7.1781, + "step": 21752 + }, + { + "epoch": 2.029765792665858, + "grad_norm": 3.329315474596191e+34, + "learning_rate": 0.0002315433392853692, + "loss": 7.0681, + "step": 21753 + }, + { + "epoch": 2.029859102360735, + "grad_norm": 1.1324513083019505, + "learning_rate": 0.00023153699067056602, + "loss": 7.5832, + "step": 21754 + }, + { + "epoch": 2.0299524120556125, + "grad_norm": 1.703454174458618e+34, + "learning_rate": 0.00023153064184843836, + "loss": 7.2386, + "step": 21755 + }, + { + "epoch": 2.03004572175049, + "grad_norm": 2.682103471624991e+33, + "learning_rate": 0.00023152429281900246, + "loss": 7.679, + "step": 21756 + }, + { + "epoch": 2.0301390314453673, + "grad_norm": 1.275377903957882, + "learning_rate": 0.00023151794358227445, + "loss": 7.6761, + "step": 21757 + }, + { + "epoch": 2.0302323411402443, + "grad_norm": 1.9187446950232754, + "learning_rate": 0.00023151159413827046, + "loss": 7.7344, + "step": 21758 + }, + { + "epoch": 2.0303256508351217, + "grad_norm": 1.3385533341121993, + "learning_rate": 0.0002315052444870066, + "loss": 7.4973, + "step": 21759 + }, + { + "epoch": 2.030418960529999, + "grad_norm": 1.1946597435263149, + "learning_rate": 0.00023149889462849906, + "loss": 7.6684, + "step": 21760 + }, + { + "epoch": 2.0305122702248766, + "grad_norm": 1.4494652725840296e+34, + "learning_rate": 0.00023149254456276397, + "loss": 7.5807, + "step": 21761 + }, + { + "epoch": 2.0306055799197535, + "grad_norm": 1.1737603856601013, + "learning_rate": 0.00023148619428981748, + "loss": 7.654, + "step": 21762 + }, + { + "epoch": 2.030698889614631, + "grad_norm": 2.4176643097838117, + "learning_rate": 0.00023147984380967576, + "loss": 7.4094, + "step": 21763 + }, + { + "epoch": 2.0307921993095084, + "grad_norm": 6.354031536599887e+33, + "learning_rate": 0.0002314734931223549, + "loss": 7.6358, + "step": 21764 + }, + { + "epoch": 2.0308855090043854, + "grad_norm": 1.1243054977816997, + "learning_rate": 0.0002314671422278711, + "loss": 7.819, + "step": 21765 + }, + { + "epoch": 2.0309788186992628, + "grad_norm": 1.5286789267160816, + "learning_rate": 0.0002314607911262405, + "loss": 7.349, + "step": 21766 + }, + { + "epoch": 2.03107212839414, + "grad_norm": 1.1857503961305613, + "learning_rate": 0.00023145443981747923, + "loss": 7.8187, + "step": 21767 + }, + { + "epoch": 2.0311654380890176, + "grad_norm": 1.1350991597253717, + "learning_rate": 0.00023144808830160343, + "loss": 7.3946, + "step": 21768 + }, + { + "epoch": 2.0312587477838946, + "grad_norm": 1.2862363204445335, + "learning_rate": 0.0002314417365786293, + "loss": 7.2, + "step": 21769 + }, + { + "epoch": 2.031352057478772, + "grad_norm": 1.0854721617777892, + "learning_rate": 0.00023143538464857292, + "loss": 7.1454, + "step": 21770 + }, + { + "epoch": 2.0314453671736494, + "grad_norm": 1.7108960900243413, + "learning_rate": 0.0002314290325114505, + "loss": 7.4907, + "step": 21771 + }, + { + "epoch": 2.031538676868527, + "grad_norm": 1.3861479715121519, + "learning_rate": 0.0002314226801672782, + "loss": 7.5094, + "step": 21772 + }, + { + "epoch": 2.031631986563404, + "grad_norm": 1.577313352944494, + "learning_rate": 0.0002314163276160721, + "loss": 7.3446, + "step": 21773 + }, + { + "epoch": 2.0317252962582812, + "grad_norm": 2.206600240712779, + "learning_rate": 0.00023140997485784838, + "loss": 7.4369, + "step": 21774 + }, + { + "epoch": 2.0318186059531587, + "grad_norm": 1.1951066279039393, + "learning_rate": 0.00023140362189262326, + "loss": 7.5664, + "step": 21775 + }, + { + "epoch": 2.0319119156480356, + "grad_norm": 1.2136762998857178, + "learning_rate": 0.0002313972687204128, + "loss": 7.1722, + "step": 21776 + }, + { + "epoch": 2.032005225342913, + "grad_norm": 4.0399565306122004e+34, + "learning_rate": 0.00023139091534123322, + "loss": 7.6248, + "step": 21777 + }, + { + "epoch": 2.0320985350377905, + "grad_norm": 2.000435334330901, + "learning_rate": 0.00023138456175510067, + "loss": 7.5094, + "step": 21778 + }, + { + "epoch": 2.032191844732668, + "grad_norm": 1.6703674861050586, + "learning_rate": 0.00023137820796203127, + "loss": 7.5982, + "step": 21779 + }, + { + "epoch": 2.032285154427545, + "grad_norm": 1.10806059795528, + "learning_rate": 0.00023137185396204115, + "loss": 7.565, + "step": 21780 + }, + { + "epoch": 2.0323784641224223, + "grad_norm": 1.4808280358475316, + "learning_rate": 0.00023136549975514659, + "loss": 7.424, + "step": 21781 + }, + { + "epoch": 2.0324717738172997, + "grad_norm": 1.5420790942812836, + "learning_rate": 0.0002313591453413636, + "loss": 7.5171, + "step": 21782 + }, + { + "epoch": 2.032565083512177, + "grad_norm": 1.306374627614002, + "learning_rate": 0.00023135279072070844, + "loss": 7.8839, + "step": 21783 + }, + { + "epoch": 2.032658393207054, + "grad_norm": 1.0892356305500372, + "learning_rate": 0.00023134643589319722, + "loss": 7.5609, + "step": 21784 + }, + { + "epoch": 2.0327517029019315, + "grad_norm": 1.7154544955481081, + "learning_rate": 0.00023134008085884614, + "loss": 7.6823, + "step": 21785 + }, + { + "epoch": 2.032845012596809, + "grad_norm": 1.0070368556216176, + "learning_rate": 0.00023133372561767133, + "loss": 7.528, + "step": 21786 + }, + { + "epoch": 2.032938322291686, + "grad_norm": 3.559796757057021, + "learning_rate": 0.0002313273701696889, + "loss": 7.6684, + "step": 21787 + }, + { + "epoch": 2.0330316319865633, + "grad_norm": 1.3526120297663438, + "learning_rate": 0.00023132101451491506, + "loss": 7.7951, + "step": 21788 + }, + { + "epoch": 2.0331249416814408, + "grad_norm": 1.7652142762783245, + "learning_rate": 0.000231314658653366, + "loss": 7.3951, + "step": 21789 + }, + { + "epoch": 2.033218251376318, + "grad_norm": 1.2308281418297171, + "learning_rate": 0.00023130830258505781, + "loss": 7.6696, + "step": 21790 + }, + { + "epoch": 2.033311561071195, + "grad_norm": 1.1737935807652795, + "learning_rate": 0.00023130194631000676, + "loss": 7.4974, + "step": 21791 + }, + { + "epoch": 2.0334048707660726, + "grad_norm": 8.518020942522509e+34, + "learning_rate": 0.00023129558982822892, + "loss": 7.7381, + "step": 21792 + }, + { + "epoch": 2.03349818046095, + "grad_norm": 2.7845482707899378e+35, + "learning_rate": 0.0002312892331397404, + "loss": 7.4417, + "step": 21793 + }, + { + "epoch": 2.0335914901558274, + "grad_norm": 1.2818065523641744, + "learning_rate": 0.00023128287624455754, + "loss": 7.5424, + "step": 21794 + }, + { + "epoch": 2.0336847998507044, + "grad_norm": 2.2726310618441074e+35, + "learning_rate": 0.00023127651914269635, + "loss": 7.364, + "step": 21795 + }, + { + "epoch": 2.033778109545582, + "grad_norm": 1.6261359897543697, + "learning_rate": 0.00023127016183417304, + "loss": 7.3937, + "step": 21796 + }, + { + "epoch": 2.033871419240459, + "grad_norm": 0.9886024211138346, + "learning_rate": 0.00023126380431900384, + "loss": 7.5321, + "step": 21797 + }, + { + "epoch": 2.033964728935336, + "grad_norm": 1.7474868842132316, + "learning_rate": 0.0002312574465972048, + "loss": 7.0807, + "step": 21798 + }, + { + "epoch": 2.0340580386302136, + "grad_norm": 1.210399610037464, + "learning_rate": 0.00023125108866879217, + "loss": 7.6314, + "step": 21799 + }, + { + "epoch": 2.034151348325091, + "grad_norm": 1.6048809282023684, + "learning_rate": 0.0002312447305337821, + "loss": 7.6169, + "step": 21800 + }, + { + "epoch": 2.0342446580199685, + "grad_norm": 1.237357782316655, + "learning_rate": 0.00023123837219219073, + "loss": 7.4293, + "step": 21801 + }, + { + "epoch": 2.0343379677148454, + "grad_norm": 1.3370066699633263, + "learning_rate": 0.0002312320136440342, + "loss": 7.5504, + "step": 21802 + }, + { + "epoch": 2.034431277409723, + "grad_norm": 5.666595963463717e+34, + "learning_rate": 0.0002312256548893288, + "loss": 7.3704, + "step": 21803 + }, + { + "epoch": 2.0345245871046003, + "grad_norm": 1.2172269342418285, + "learning_rate": 0.00023121929592809058, + "loss": 7.204, + "step": 21804 + }, + { + "epoch": 2.0346178967994772, + "grad_norm": 1.2856569039436372, + "learning_rate": 0.00023121293676033573, + "loss": 7.412, + "step": 21805 + }, + { + "epoch": 2.0347112064943547, + "grad_norm": 0.9811630911027979, + "learning_rate": 0.00023120657738608048, + "loss": 7.6805, + "step": 21806 + }, + { + "epoch": 2.034804516189232, + "grad_norm": 1.5486965437642681, + "learning_rate": 0.000231200217805341, + "loss": 7.3918, + "step": 21807 + }, + { + "epoch": 2.0348978258841095, + "grad_norm": 1.405738941286262, + "learning_rate": 0.00023119385801813333, + "loss": 7.2515, + "step": 21808 + }, + { + "epoch": 2.0349911355789865, + "grad_norm": 1.433927643875908, + "learning_rate": 0.00023118749802447376, + "loss": 7.5155, + "step": 21809 + }, + { + "epoch": 2.035084445273864, + "grad_norm": 1.1318212386766006e+36, + "learning_rate": 0.0002311811378243785, + "loss": 7.3955, + "step": 21810 + }, + { + "epoch": 2.0351777549687413, + "grad_norm": 1.044045298519184, + "learning_rate": 0.00023117477741786354, + "loss": 7.3373, + "step": 21811 + }, + { + "epoch": 2.0352710646636187, + "grad_norm": 2.4742111908229725, + "learning_rate": 0.00023116841680494523, + "loss": 7.5708, + "step": 21812 + }, + { + "epoch": 2.0353643743584957, + "grad_norm": 1.248110440720042, + "learning_rate": 0.00023116205598563966, + "loss": 7.4566, + "step": 21813 + }, + { + "epoch": 2.035457684053373, + "grad_norm": 1.9193227323169244, + "learning_rate": 0.00023115569495996304, + "loss": 7.0628, + "step": 21814 + }, + { + "epoch": 2.0355509937482505, + "grad_norm": 1.2325706674807042, + "learning_rate": 0.0002311493337279315, + "loss": 7.4449, + "step": 21815 + }, + { + "epoch": 2.0356443034431275, + "grad_norm": 1.0954215708387338, + "learning_rate": 0.00023114297228956127, + "loss": 7.4098, + "step": 21816 + }, + { + "epoch": 2.035737613138005, + "grad_norm": 1.4577333128649093, + "learning_rate": 0.00023113661064486852, + "loss": 7.5441, + "step": 21817 + }, + { + "epoch": 2.0358309228328824, + "grad_norm": 1.8066036693493857, + "learning_rate": 0.00023113024879386934, + "loss": 7.3737, + "step": 21818 + }, + { + "epoch": 2.03592423252776, + "grad_norm": 2.4733799063773714, + "learning_rate": 0.00023112388673658, + "loss": 7.8363, + "step": 21819 + }, + { + "epoch": 2.0360175422226368, + "grad_norm": 1.3633386327254273, + "learning_rate": 0.0002311175244730167, + "loss": 7.4169, + "step": 21820 + }, + { + "epoch": 2.036110851917514, + "grad_norm": 1.1240280844371917e+34, + "learning_rate": 0.00023111116200319546, + "loss": 7.3294, + "step": 21821 + }, + { + "epoch": 2.0362041616123916, + "grad_norm": 1.3350623263939978e+34, + "learning_rate": 0.00023110479932713263, + "loss": 7.2246, + "step": 21822 + }, + { + "epoch": 2.036297471307269, + "grad_norm": 1.26620169480067, + "learning_rate": 0.00023109843644484432, + "loss": 7.7969, + "step": 21823 + }, + { + "epoch": 2.036390781002146, + "grad_norm": 1.1421389280280738, + "learning_rate": 0.00023109207335634668, + "loss": 7.5906, + "step": 21824 + }, + { + "epoch": 2.0364840906970234, + "grad_norm": 1.4771805306793233, + "learning_rate": 0.00023108571006165594, + "loss": 7.6248, + "step": 21825 + }, + { + "epoch": 2.036577400391901, + "grad_norm": 1.5293883609988466, + "learning_rate": 0.00023107934656078826, + "loss": 7.4928, + "step": 21826 + }, + { + "epoch": 2.036670710086778, + "grad_norm": 1.1878407399136266, + "learning_rate": 0.0002310729828537598, + "loss": 7.4695, + "step": 21827 + }, + { + "epoch": 2.036764019781655, + "grad_norm": 2.33082562951218e+35, + "learning_rate": 0.0002310666189405868, + "loss": 7.4314, + "step": 21828 + }, + { + "epoch": 2.0368573294765326, + "grad_norm": 1.1665299095727042, + "learning_rate": 0.00023106025482128538, + "loss": 7.3687, + "step": 21829 + }, + { + "epoch": 2.03695063917141, + "grad_norm": 1.3363475373364886, + "learning_rate": 0.0002310538904958717, + "loss": 7.3089, + "step": 21830 + }, + { + "epoch": 2.037043948866287, + "grad_norm": 1.2251213179465388, + "learning_rate": 0.00023104752596436203, + "loss": 7.4906, + "step": 21831 + }, + { + "epoch": 2.0371372585611645, + "grad_norm": 1.5471219217214613, + "learning_rate": 0.00023104116122677252, + "loss": 7.6333, + "step": 21832 + }, + { + "epoch": 2.037230568256042, + "grad_norm": 1.3669035552247708, + "learning_rate": 0.00023103479628311933, + "loss": 7.1931, + "step": 21833 + }, + { + "epoch": 2.0373238779509193, + "grad_norm": 1.4507501306985622, + "learning_rate": 0.00023102843113341866, + "loss": 7.4933, + "step": 21834 + }, + { + "epoch": 2.0374171876457963, + "grad_norm": 1.161488092508703, + "learning_rate": 0.00023102206577768668, + "loss": 7.2581, + "step": 21835 + }, + { + "epoch": 2.0375104973406737, + "grad_norm": 1.31474507612349e+34, + "learning_rate": 0.0002310157002159396, + "loss": 7.0011, + "step": 21836 + }, + { + "epoch": 2.037603807035551, + "grad_norm": 1.3292409448796807, + "learning_rate": 0.0002310093344481936, + "loss": 7.1927, + "step": 21837 + }, + { + "epoch": 2.037697116730428, + "grad_norm": 1.0522340068004286, + "learning_rate": 0.00023100296847446483, + "loss": 7.4088, + "step": 21838 + }, + { + "epoch": 2.0377904264253055, + "grad_norm": 4.172977904574377, + "learning_rate": 0.00023099660229476953, + "loss": 7.5752, + "step": 21839 + }, + { + "epoch": 2.037883736120183, + "grad_norm": 1.162222981492262, + "learning_rate": 0.00023099023590912382, + "loss": 7.6761, + "step": 21840 + }, + { + "epoch": 2.0379770458150603, + "grad_norm": 1.163260692342313, + "learning_rate": 0.00023098386931754398, + "loss": 7.552, + "step": 21841 + }, + { + "epoch": 2.0380703555099373, + "grad_norm": 1.053420267894666, + "learning_rate": 0.00023097750252004613, + "loss": 7.2246, + "step": 21842 + }, + { + "epoch": 2.0381636652048147, + "grad_norm": 1.0212752526402171, + "learning_rate": 0.0002309711355166465, + "loss": 7.4802, + "step": 21843 + }, + { + "epoch": 2.038256974899692, + "grad_norm": 0.9827206316330769, + "learning_rate": 0.00023096476830736126, + "loss": 7.4633, + "step": 21844 + }, + { + "epoch": 2.0383502845945696, + "grad_norm": 1.11879321357775, + "learning_rate": 0.00023095840089220655, + "loss": 7.6326, + "step": 21845 + }, + { + "epoch": 2.0384435942894465, + "grad_norm": 1.4665942337335538, + "learning_rate": 0.00023095203327119866, + "loss": 7.7459, + "step": 21846 + }, + { + "epoch": 2.038536903984324, + "grad_norm": 1.528987354546384, + "learning_rate": 0.0002309456654443537, + "loss": 7.3371, + "step": 21847 + }, + { + "epoch": 2.0386302136792014, + "grad_norm": 3.996322344890074, + "learning_rate": 0.0002309392974116879, + "loss": 7.5707, + "step": 21848 + }, + { + "epoch": 2.0387235233740784, + "grad_norm": 1.1149791060620393, + "learning_rate": 0.00023093292917321744, + "loss": 7.6393, + "step": 21849 + }, + { + "epoch": 2.038816833068956, + "grad_norm": 1.1914732534347883, + "learning_rate": 0.0002309265607289585, + "loss": 7.6849, + "step": 21850 + }, + { + "epoch": 2.038910142763833, + "grad_norm": 1.0140737282304322, + "learning_rate": 0.00023092019207892733, + "loss": 7.4701, + "step": 21851 + }, + { + "epoch": 2.0390034524587106, + "grad_norm": 0.9659619982641878, + "learning_rate": 0.00023091382322314004, + "loss": 7.5001, + "step": 21852 + }, + { + "epoch": 2.0390967621535876, + "grad_norm": 1.1456991130983216, + "learning_rate": 0.0002309074541616129, + "loss": 7.4275, + "step": 21853 + }, + { + "epoch": 2.039190071848465, + "grad_norm": 0.9673404539944379, + "learning_rate": 0.00023090108489436204, + "loss": 7.4919, + "step": 21854 + }, + { + "epoch": 2.0392833815433424, + "grad_norm": 6.858822681260621e+33, + "learning_rate": 0.00023089471542140367, + "loss": 7.4996, + "step": 21855 + }, + { + "epoch": 2.03937669123822, + "grad_norm": 1.2658338390111028, + "learning_rate": 0.0002308883457427541, + "loss": 7.6245, + "step": 21856 + }, + { + "epoch": 2.039470000933097, + "grad_norm": 1.3911350893228993, + "learning_rate": 0.00023088197585842933, + "loss": 7.6879, + "step": 21857 + }, + { + "epoch": 2.0395633106279742, + "grad_norm": 2.6424106376581965e+33, + "learning_rate": 0.00023087560576844567, + "loss": 7.4501, + "step": 21858 + }, + { + "epoch": 2.0396566203228517, + "grad_norm": 1.4255051256821882, + "learning_rate": 0.0002308692354728193, + "loss": 7.7479, + "step": 21859 + }, + { + "epoch": 2.0397499300177286, + "grad_norm": 1.1343431295146686, + "learning_rate": 0.00023086286497156646, + "loss": 7.409, + "step": 21860 + }, + { + "epoch": 2.039843239712606, + "grad_norm": 1.0332366501529502, + "learning_rate": 0.00023085649426470326, + "loss": 7.6269, + "step": 21861 + }, + { + "epoch": 2.0399365494074835, + "grad_norm": 1.3942411564197148, + "learning_rate": 0.00023085012335224595, + "loss": 7.2244, + "step": 21862 + }, + { + "epoch": 2.040029859102361, + "grad_norm": 1.2458047429913117, + "learning_rate": 0.00023084375223421077, + "loss": 7.6888, + "step": 21863 + }, + { + "epoch": 2.040123168797238, + "grad_norm": 1.2072999725213058, + "learning_rate": 0.00023083738091061383, + "loss": 7.7255, + "step": 21864 + }, + { + "epoch": 2.0402164784921153, + "grad_norm": 1.28049493838998, + "learning_rate": 0.0002308310093814714, + "loss": 7.5744, + "step": 21865 + }, + { + "epoch": 2.0403097881869927, + "grad_norm": 3.546116833898525e+34, + "learning_rate": 0.00023082463764679967, + "loss": 7.6179, + "step": 21866 + }, + { + "epoch": 2.04040309788187, + "grad_norm": 1.16118870362874, + "learning_rate": 0.0002308182657066148, + "loss": 7.5255, + "step": 21867 + }, + { + "epoch": 2.040496407576747, + "grad_norm": 1.3981210762439458, + "learning_rate": 0.00023081189356093305, + "loss": 7.7582, + "step": 21868 + }, + { + "epoch": 2.0405897172716245, + "grad_norm": 0.9957899154704292, + "learning_rate": 0.00023080552120977057, + "loss": 7.4544, + "step": 21869 + }, + { + "epoch": 2.040683026966502, + "grad_norm": 1.078788255034793, + "learning_rate": 0.0002307991486531436, + "loss": 7.5123, + "step": 21870 + }, + { + "epoch": 2.040776336661379, + "grad_norm": 1.0947375156225634, + "learning_rate": 0.00023079277589106828, + "loss": 7.3071, + "step": 21871 + }, + { + "epoch": 2.0408696463562563, + "grad_norm": 1.6128812531385683, + "learning_rate": 0.00023078640292356094, + "loss": 7.5573, + "step": 21872 + }, + { + "epoch": 2.0409629560511338, + "grad_norm": 1.8346613639780491, + "learning_rate": 0.00023078002975063767, + "loss": 7.4001, + "step": 21873 + }, + { + "epoch": 2.041056265746011, + "grad_norm": 1.0395632287171994, + "learning_rate": 0.0002307736563723147, + "loss": 7.4863, + "step": 21874 + }, + { + "epoch": 2.041149575440888, + "grad_norm": 2.4372256030824786e+34, + "learning_rate": 0.00023076728278860825, + "loss": 7.6299, + "step": 21875 + }, + { + "epoch": 2.0412428851357656, + "grad_norm": 1.1141041608548463, + "learning_rate": 0.00023076090899953454, + "loss": 7.7343, + "step": 21876 + }, + { + "epoch": 2.041336194830643, + "grad_norm": 9.542919300558447e+33, + "learning_rate": 0.00023075453500510977, + "loss": 7.4777, + "step": 21877 + }, + { + "epoch": 2.0414295045255204, + "grad_norm": 1.4270694737640837, + "learning_rate": 0.00023074816080535012, + "loss": 7.4058, + "step": 21878 + }, + { + "epoch": 2.0415228142203974, + "grad_norm": 3.8323487888044704e+35, + "learning_rate": 0.00023074178640027185, + "loss": 7.715, + "step": 21879 + }, + { + "epoch": 2.041616123915275, + "grad_norm": 1.7593863035422337, + "learning_rate": 0.0002307354117898911, + "loss": 7.1968, + "step": 21880 + }, + { + "epoch": 2.0417094336101522, + "grad_norm": 7.331360267413075e+34, + "learning_rate": 0.00023072903697422413, + "loss": 7.5973, + "step": 21881 + }, + { + "epoch": 2.041802743305029, + "grad_norm": 1.0200913223757075, + "learning_rate": 0.00023072266195328712, + "loss": 7.5294, + "step": 21882 + }, + { + "epoch": 2.0418960529999066, + "grad_norm": 1.0042956808028711, + "learning_rate": 0.00023071628672709628, + "loss": 7.3018, + "step": 21883 + }, + { + "epoch": 2.041989362694784, + "grad_norm": 1.0512061020225323, + "learning_rate": 0.00023070991129566786, + "loss": 7.1932, + "step": 21884 + }, + { + "epoch": 2.0420826723896615, + "grad_norm": 1.390951630818656, + "learning_rate": 0.00023070353565901805, + "loss": 7.5126, + "step": 21885 + }, + { + "epoch": 2.0421759820845384, + "grad_norm": 1.2909782392622702, + "learning_rate": 0.00023069715981716303, + "loss": 7.7068, + "step": 21886 + }, + { + "epoch": 2.042269291779416, + "grad_norm": 1.0318110101404672, + "learning_rate": 0.000230690783770119, + "loss": 7.4051, + "step": 21887 + }, + { + "epoch": 2.0423626014742933, + "grad_norm": 1.4121999177846238, + "learning_rate": 0.0002306844075179023, + "loss": 7.463, + "step": 21888 + }, + { + "epoch": 2.0424559111691707, + "grad_norm": 1.2251179725030976, + "learning_rate": 0.00023067803106052902, + "loss": 7.0215, + "step": 21889 + }, + { + "epoch": 2.0425492208640477, + "grad_norm": 1.414449940872545, + "learning_rate": 0.0002306716543980154, + "loss": 7.6636, + "step": 21890 + }, + { + "epoch": 2.042642530558925, + "grad_norm": 1.494523527978028, + "learning_rate": 0.00023066527753037763, + "loss": 7.229, + "step": 21891 + }, + { + "epoch": 2.0427358402538025, + "grad_norm": 1.3621418703685295, + "learning_rate": 0.000230658900457632, + "loss": 7.1356, + "step": 21892 + }, + { + "epoch": 2.0428291499486795, + "grad_norm": 2.2260213228395104e+35, + "learning_rate": 0.00023065252317979465, + "loss": 7.4966, + "step": 21893 + }, + { + "epoch": 2.042922459643557, + "grad_norm": 3.041001872979987, + "learning_rate": 0.00023064614569688182, + "loss": 7.6348, + "step": 21894 + }, + { + "epoch": 2.0430157693384343, + "grad_norm": 1.555465170147468, + "learning_rate": 0.00023063976800890976, + "loss": 7.2402, + "step": 21895 + }, + { + "epoch": 2.0431090790333117, + "grad_norm": 0.9557382431213395, + "learning_rate": 0.00023063339011589464, + "loss": 7.4312, + "step": 21896 + }, + { + "epoch": 2.0432023887281887, + "grad_norm": 1.6018331070565466, + "learning_rate": 0.00023062701201785265, + "loss": 7.4567, + "step": 21897 + }, + { + "epoch": 2.043295698423066, + "grad_norm": 0.9926628785918666, + "learning_rate": 0.00023062063371480013, + "loss": 7.5234, + "step": 21898 + }, + { + "epoch": 2.0433890081179436, + "grad_norm": 1.054940834081315, + "learning_rate": 0.0002306142552067532, + "loss": 7.4881, + "step": 21899 + }, + { + "epoch": 2.043482317812821, + "grad_norm": 1.2066553340083774, + "learning_rate": 0.00023060787649372807, + "loss": 7.5742, + "step": 21900 + }, + { + "epoch": 2.043575627507698, + "grad_norm": 1.4582925322697362e+35, + "learning_rate": 0.00023060149757574098, + "loss": 7.7615, + "step": 21901 + }, + { + "epoch": 2.0436689372025754, + "grad_norm": 9.813752644179263e+34, + "learning_rate": 0.0002305951184528082, + "loss": 7.2772, + "step": 21902 + }, + { + "epoch": 2.043762246897453, + "grad_norm": 1.0608616311474934, + "learning_rate": 0.00023058873912494587, + "loss": 7.3756, + "step": 21903 + }, + { + "epoch": 2.0438555565923298, + "grad_norm": 1.3601602676421452e+35, + "learning_rate": 0.00023058235959217029, + "loss": 7.6706, + "step": 21904 + }, + { + "epoch": 2.043948866287207, + "grad_norm": 1.1086528211886013, + "learning_rate": 0.0002305759798544976, + "loss": 7.5583, + "step": 21905 + }, + { + "epoch": 2.0440421759820846, + "grad_norm": 1.168968233283476, + "learning_rate": 0.0002305695999119441, + "loss": 7.537, + "step": 21906 + }, + { + "epoch": 2.044135485676962, + "grad_norm": 1.6262606256705772, + "learning_rate": 0.0002305632197645259, + "loss": 7.9062, + "step": 21907 + }, + { + "epoch": 2.044228795371839, + "grad_norm": 1.0433502637260441, + "learning_rate": 0.00023055683941225938, + "loss": 7.4721, + "step": 21908 + }, + { + "epoch": 2.0443221050667164, + "grad_norm": 1.5160516113060165, + "learning_rate": 0.00023055045885516065, + "loss": 7.7911, + "step": 21909 + }, + { + "epoch": 2.044415414761594, + "grad_norm": 1.1902929557880737, + "learning_rate": 0.00023054407809324593, + "loss": 7.3657, + "step": 21910 + }, + { + "epoch": 2.0445087244564712, + "grad_norm": 1.2216356127938374e+34, + "learning_rate": 0.00023053769712653153, + "loss": 7.1491, + "step": 21911 + }, + { + "epoch": 2.0446020341513482, + "grad_norm": 1.5401424735104488, + "learning_rate": 0.00023053131595503359, + "loss": 7.677, + "step": 21912 + }, + { + "epoch": 2.0446953438462256, + "grad_norm": 1.0727298295977903, + "learning_rate": 0.00023052493457876837, + "loss": 7.2601, + "step": 21913 + }, + { + "epoch": 2.044788653541103, + "grad_norm": 1.532472071912472e+34, + "learning_rate": 0.00023051855299775214, + "loss": 7.6008, + "step": 21914 + }, + { + "epoch": 2.04488196323598, + "grad_norm": 3.659859133711932e+35, + "learning_rate": 0.00023051217121200103, + "loss": 7.42, + "step": 21915 + }, + { + "epoch": 2.0449752729308575, + "grad_norm": 1.0656073544999547, + "learning_rate": 0.00023050578922153132, + "loss": 7.5321, + "step": 21916 + }, + { + "epoch": 2.045068582625735, + "grad_norm": 1.314379965131961, + "learning_rate": 0.00023049940702635926, + "loss": 7.3641, + "step": 21917 + }, + { + "epoch": 2.0451618923206123, + "grad_norm": 1.2459564450471592, + "learning_rate": 0.00023049302462650102, + "loss": 7.8957, + "step": 21918 + }, + { + "epoch": 2.0452552020154893, + "grad_norm": 1.1700779022883854, + "learning_rate": 0.00023048664202197293, + "loss": 7.6887, + "step": 21919 + }, + { + "epoch": 2.0453485117103667, + "grad_norm": 1.4357783722641202, + "learning_rate": 0.0002304802592127911, + "loss": 7.4925, + "step": 21920 + }, + { + "epoch": 2.045441821405244, + "grad_norm": 1.1752064966951639e+33, + "learning_rate": 0.00023047387619897183, + "loss": 7.0879, + "step": 21921 + }, + { + "epoch": 2.045535131100121, + "grad_norm": 1.523695621488705, + "learning_rate": 0.00023046749298053128, + "loss": 7.61, + "step": 21922 + }, + { + "epoch": 2.0456284407949985, + "grad_norm": 1.06081688347115, + "learning_rate": 0.0002304611095574858, + "loss": 7.5697, + "step": 21923 + }, + { + "epoch": 2.045721750489876, + "grad_norm": 1.6212809696848904, + "learning_rate": 0.0002304547259298515, + "loss": 7.5867, + "step": 21924 + }, + { + "epoch": 2.0458150601847533, + "grad_norm": 1.5058753437473982, + "learning_rate": 0.0002304483420976447, + "loss": 7.612, + "step": 21925 + }, + { + "epoch": 2.0459083698796303, + "grad_norm": 0.9934195179868424, + "learning_rate": 0.00023044195806088162, + "loss": 7.0995, + "step": 21926 + }, + { + "epoch": 2.0460016795745077, + "grad_norm": 1.3620240678471993, + "learning_rate": 0.0002304355738195784, + "loss": 7.4046, + "step": 21927 + }, + { + "epoch": 2.046094989269385, + "grad_norm": 2.356377378697997e+34, + "learning_rate": 0.0002304291893737514, + "loss": 7.7455, + "step": 21928 + }, + { + "epoch": 2.0461882989642626, + "grad_norm": 1.9583018309678515, + "learning_rate": 0.00023042280472341677, + "loss": 7.9078, + "step": 21929 + }, + { + "epoch": 2.0462816086591396, + "grad_norm": 0.9754695485613, + "learning_rate": 0.0002304164198685908, + "loss": 7.4558, + "step": 21930 + }, + { + "epoch": 2.046374918354017, + "grad_norm": 1.2687371012932223, + "learning_rate": 0.00023041003480928963, + "loss": 7.4315, + "step": 21931 + }, + { + "epoch": 2.0464682280488944, + "grad_norm": 1.610367303959427, + "learning_rate": 0.0002304036495455296, + "loss": 7.6021, + "step": 21932 + }, + { + "epoch": 2.0465615377437714, + "grad_norm": 1.2854160458274329, + "learning_rate": 0.0002303972640773269, + "loss": 7.8692, + "step": 21933 + }, + { + "epoch": 2.046654847438649, + "grad_norm": 1.5365765136578788, + "learning_rate": 0.0002303908784046978, + "loss": 7.446, + "step": 21934 + }, + { + "epoch": 2.046748157133526, + "grad_norm": 1.5652023154337218, + "learning_rate": 0.00023038449252765848, + "loss": 7.707, + "step": 21935 + }, + { + "epoch": 2.0468414668284036, + "grad_norm": 1.4293095379058633, + "learning_rate": 0.0002303781064462252, + "loss": 7.6592, + "step": 21936 + }, + { + "epoch": 2.0469347765232806, + "grad_norm": 1.2847284696942736, + "learning_rate": 0.00023037172016041423, + "loss": 7.6332, + "step": 21937 + }, + { + "epoch": 2.047028086218158, + "grad_norm": 1.1285548016773181, + "learning_rate": 0.00023036533367024178, + "loss": 7.3795, + "step": 21938 + }, + { + "epoch": 2.0471213959130354, + "grad_norm": 1.5409315851868166, + "learning_rate": 0.00023035894697572408, + "loss": 7.8557, + "step": 21939 + }, + { + "epoch": 2.047214705607913, + "grad_norm": 6.79872004360373e+34, + "learning_rate": 0.00023035256007687742, + "loss": 7.7355, + "step": 21940 + }, + { + "epoch": 2.04730801530279, + "grad_norm": 2.5735832660071347e+33, + "learning_rate": 0.00023034617297371796, + "loss": 7.3632, + "step": 21941 + }, + { + "epoch": 2.0474013249976672, + "grad_norm": 1.1032372098433967, + "learning_rate": 0.00023033978566626198, + "loss": 7.6009, + "step": 21942 + }, + { + "epoch": 2.0474946346925447, + "grad_norm": 8.618316641484062e+33, + "learning_rate": 0.00023033339815452575, + "loss": 7.7442, + "step": 21943 + }, + { + "epoch": 2.0475879443874216, + "grad_norm": 1.1941471772542194, + "learning_rate": 0.00023032701043852548, + "loss": 7.5518, + "step": 21944 + }, + { + "epoch": 2.047681254082299, + "grad_norm": 1.1834097677175208, + "learning_rate": 0.00023032062251827738, + "loss": 7.3116, + "step": 21945 + }, + { + "epoch": 2.0477745637771765, + "grad_norm": 1.9530687914236022, + "learning_rate": 0.0002303142343937978, + "loss": 7.3798, + "step": 21946 + }, + { + "epoch": 2.047867873472054, + "grad_norm": 1.279512422796048, + "learning_rate": 0.0002303078460651028, + "loss": 7.6287, + "step": 21947 + }, + { + "epoch": 2.047961183166931, + "grad_norm": 0.9842256757115657, + "learning_rate": 0.00023030145753220885, + "loss": 7.4512, + "step": 21948 + }, + { + "epoch": 2.0480544928618083, + "grad_norm": 4.435490020266071e+33, + "learning_rate": 0.00023029506879513202, + "loss": 7.2435, + "step": 21949 + }, + { + "epoch": 2.0481478025566857, + "grad_norm": 1.105460548934572, + "learning_rate": 0.00023028867985388864, + "loss": 7.4673, + "step": 21950 + }, + { + "epoch": 2.048241112251563, + "grad_norm": 1.1681502890869777, + "learning_rate": 0.00023028229070849496, + "loss": 7.5221, + "step": 21951 + }, + { + "epoch": 2.04833442194644, + "grad_norm": 1.2627836885623986, + "learning_rate": 0.00023027590135896712, + "loss": 7.619, + "step": 21952 + }, + { + "epoch": 2.0484277316413175, + "grad_norm": 1.090801243715267, + "learning_rate": 0.00023026951180532153, + "loss": 7.4604, + "step": 21953 + }, + { + "epoch": 2.048521041336195, + "grad_norm": 1.1430777512947483e+34, + "learning_rate": 0.00023026312204757428, + "loss": 7.6122, + "step": 21954 + }, + { + "epoch": 2.048614351031072, + "grad_norm": 1.2961480193053234, + "learning_rate": 0.0002302567320857417, + "loss": 7.3076, + "step": 21955 + }, + { + "epoch": 2.0487076607259493, + "grad_norm": 1.2719947319310034, + "learning_rate": 0.00023025034191984003, + "loss": 7.7436, + "step": 21956 + }, + { + "epoch": 2.0488009704208268, + "grad_norm": 1.1162102937924456, + "learning_rate": 0.0002302439515498855, + "loss": 7.3808, + "step": 21957 + }, + { + "epoch": 2.048894280115704, + "grad_norm": 1.1781772392106025, + "learning_rate": 0.0002302375609758944, + "loss": 7.1343, + "step": 21958 + }, + { + "epoch": 2.048987589810581, + "grad_norm": 1.0492699780094425, + "learning_rate": 0.00023023117019788293, + "loss": 7.212, + "step": 21959 + }, + { + "epoch": 2.0490808995054586, + "grad_norm": 1.0329070632040556, + "learning_rate": 0.0002302247792158674, + "loss": 7.4814, + "step": 21960 + }, + { + "epoch": 2.049174209200336, + "grad_norm": 1.0375487810019706, + "learning_rate": 0.00023021838802986396, + "loss": 7.1827, + "step": 21961 + }, + { + "epoch": 2.0492675188952134, + "grad_norm": 5.287619544109276e+32, + "learning_rate": 0.00023021199663988889, + "loss": 7.3586, + "step": 21962 + }, + { + "epoch": 2.0493608285900904, + "grad_norm": 5.788350958496177e+33, + "learning_rate": 0.00023020560504595858, + "loss": 7.3681, + "step": 21963 + }, + { + "epoch": 2.049454138284968, + "grad_norm": 1.0138924847051438e+33, + "learning_rate": 0.00023019921324808913, + "loss": 7.4173, + "step": 21964 + }, + { + "epoch": 2.0495474479798452, + "grad_norm": 1.2851239753572334, + "learning_rate": 0.00023019282124629685, + "loss": 7.4004, + "step": 21965 + }, + { + "epoch": 2.049640757674722, + "grad_norm": 1.050846970450094, + "learning_rate": 0.00023018642904059795, + "loss": 7.4847, + "step": 21966 + }, + { + "epoch": 2.0497340673695996, + "grad_norm": 1.141428927053541, + "learning_rate": 0.00023018003663100874, + "loss": 7.248, + "step": 21967 + }, + { + "epoch": 2.049827377064477, + "grad_norm": 1.1000642380570473, + "learning_rate": 0.00023017364401754545, + "loss": 7.4131, + "step": 21968 + }, + { + "epoch": 2.0499206867593545, + "grad_norm": 1.1202122151520606, + "learning_rate": 0.00023016725120022428, + "loss": 7.1761, + "step": 21969 + }, + { + "epoch": 2.0500139964542314, + "grad_norm": 1.2146696065092772, + "learning_rate": 0.0002301608581790616, + "loss": 7.5077, + "step": 21970 + }, + { + "epoch": 2.050107306149109, + "grad_norm": 1.7660903570817423, + "learning_rate": 0.00023015446495407358, + "loss": 7.1916, + "step": 21971 + }, + { + "epoch": 2.0502006158439863, + "grad_norm": 1.5168946087754311, + "learning_rate": 0.0002301480715252765, + "loss": 7.7346, + "step": 21972 + }, + { + "epoch": 2.0502939255388637, + "grad_norm": 1.2490920225230546, + "learning_rate": 0.0002301416778926866, + "loss": 7.351, + "step": 21973 + }, + { + "epoch": 2.0503872352337407, + "grad_norm": 1.7174878457672647, + "learning_rate": 0.00023013528405632018, + "loss": 7.3711, + "step": 21974 + }, + { + "epoch": 2.050480544928618, + "grad_norm": 1.1633834770413007, + "learning_rate": 0.0002301288900161935, + "loss": 7.3626, + "step": 21975 + }, + { + "epoch": 2.0505738546234955, + "grad_norm": 1.6319989415494518, + "learning_rate": 0.0002301224957723227, + "loss": 7.5262, + "step": 21976 + }, + { + "epoch": 2.0506671643183725, + "grad_norm": 3.2987958147951655e+33, + "learning_rate": 0.0002301161013247242, + "loss": 7.5834, + "step": 21977 + }, + { + "epoch": 2.05076047401325, + "grad_norm": 1.361160162151967, + "learning_rate": 0.00023010970667341414, + "loss": 7.8339, + "step": 21978 + }, + { + "epoch": 2.0508537837081273, + "grad_norm": 1.5002848374921665, + "learning_rate": 0.00023010331181840885, + "loss": 7.6011, + "step": 21979 + }, + { + "epoch": 2.0509470934030047, + "grad_norm": 1.8421800074138224, + "learning_rate": 0.00023009691675972456, + "loss": 7.2517, + "step": 21980 + }, + { + "epoch": 2.0510404030978817, + "grad_norm": 1.8411923478511476, + "learning_rate": 0.0002300905214973775, + "loss": 7.4151, + "step": 21981 + }, + { + "epoch": 2.051133712792759, + "grad_norm": 1.508837717674631, + "learning_rate": 0.00023008412603138403, + "loss": 7.323, + "step": 21982 + }, + { + "epoch": 2.0512270224876366, + "grad_norm": 3.374167857360014e+33, + "learning_rate": 0.00023007773036176033, + "loss": 7.5808, + "step": 21983 + }, + { + "epoch": 2.051320332182514, + "grad_norm": 1.0401351892163715, + "learning_rate": 0.00023007133448852267, + "loss": 7.5812, + "step": 21984 + }, + { + "epoch": 2.051413641877391, + "grad_norm": 1.0171186299091446, + "learning_rate": 0.00023006493841168734, + "loss": 7.2773, + "step": 21985 + }, + { + "epoch": 2.0515069515722684, + "grad_norm": 2.2266975612409077, + "learning_rate": 0.00023005854213127062, + "loss": 7.6213, + "step": 21986 + }, + { + "epoch": 2.051600261267146, + "grad_norm": 1.3801192854573763, + "learning_rate": 0.0002300521456472887, + "loss": 7.481, + "step": 21987 + }, + { + "epoch": 2.0516935709620228, + "grad_norm": 1.3308748323651813, + "learning_rate": 0.00023004574895975786, + "loss": 7.2541, + "step": 21988 + }, + { + "epoch": 2.0517868806569, + "grad_norm": 1.2435702511636064, + "learning_rate": 0.00023003935206869445, + "loss": 7.0628, + "step": 21989 + }, + { + "epoch": 2.0518801903517776, + "grad_norm": 1.6050089599647421, + "learning_rate": 0.00023003295497411465, + "loss": 7.6208, + "step": 21990 + }, + { + "epoch": 2.051973500046655, + "grad_norm": 1.0673205622261797, + "learning_rate": 0.00023002655767603476, + "loss": 7.3547, + "step": 21991 + }, + { + "epoch": 2.052066809741532, + "grad_norm": 1.2700629372124408, + "learning_rate": 0.00023002016017447104, + "loss": 7.5418, + "step": 21992 + }, + { + "epoch": 2.0521601194364094, + "grad_norm": 1.2195434056251855, + "learning_rate": 0.00023001376246943977, + "loss": 7.6564, + "step": 21993 + }, + { + "epoch": 2.052253429131287, + "grad_norm": 2.505517220026386e+35, + "learning_rate": 0.00023000736456095717, + "loss": 7.7819, + "step": 21994 + }, + { + "epoch": 2.0523467388261643, + "grad_norm": 1.0390766854419042, + "learning_rate": 0.00023000096644903956, + "loss": 7.7213, + "step": 21995 + }, + { + "epoch": 2.0524400485210412, + "grad_norm": 1.251347528135862, + "learning_rate": 0.00022999456813370322, + "loss": 7.6512, + "step": 21996 + }, + { + "epoch": 2.0525333582159186, + "grad_norm": 1.5843009984364191, + "learning_rate": 0.00022998816961496433, + "loss": 7.2648, + "step": 21997 + }, + { + "epoch": 2.052626667910796, + "grad_norm": 0.954338060849937, + "learning_rate": 0.00022998177089283923, + "loss": 7.5725, + "step": 21998 + }, + { + "epoch": 2.052719977605673, + "grad_norm": 1.2360373689304591, + "learning_rate": 0.00022997537196734418, + "loss": 7.613, + "step": 21999 + }, + { + "epoch": 2.0528132873005505, + "grad_norm": 1.1120710544500223, + "learning_rate": 0.00022996897283849543, + "loss": 7.4271, + "step": 22000 + }, + { + "epoch": 2.052906596995428, + "grad_norm": 1.1048885789190986e+33, + "learning_rate": 0.00022996257350630934, + "loss": 7.67, + "step": 22001 + }, + { + "epoch": 2.0529999066903053, + "grad_norm": 1.0930129598968716, + "learning_rate": 0.0002299561739708021, + "loss": 7.3171, + "step": 22002 + }, + { + "epoch": 2.0530932163851823, + "grad_norm": 1.0781739450211567, + "learning_rate": 0.00022994977423198995, + "loss": 7.348, + "step": 22003 + }, + { + "epoch": 2.0531865260800597, + "grad_norm": 2.5036182785095034, + "learning_rate": 0.00022994337428988926, + "loss": 7.4041, + "step": 22004 + }, + { + "epoch": 2.053279835774937, + "grad_norm": 1.0746449194412755, + "learning_rate": 0.0002299369741445162, + "loss": 7.5053, + "step": 22005 + }, + { + "epoch": 2.0533731454698145, + "grad_norm": 2.311017475219262e+35, + "learning_rate": 0.0002299305737958871, + "loss": 7.6905, + "step": 22006 + }, + { + "epoch": 2.0534664551646915, + "grad_norm": 0.9839534192463937, + "learning_rate": 0.00022992417324401826, + "loss": 7.3056, + "step": 22007 + }, + { + "epoch": 2.053559764859569, + "grad_norm": 1.2207579234020327, + "learning_rate": 0.0002299177724889259, + "loss": 7.4443, + "step": 22008 + }, + { + "epoch": 2.0536530745544463, + "grad_norm": 1.3687990134277592, + "learning_rate": 0.00022991137153062632, + "loss": 7.7184, + "step": 22009 + }, + { + "epoch": 2.0537463842493233, + "grad_norm": 1.4383964892881513, + "learning_rate": 0.0002299049703691358, + "loss": 7.7392, + "step": 22010 + }, + { + "epoch": 2.0538396939442007, + "grad_norm": 1.1775457043040118, + "learning_rate": 0.0002298985690044706, + "loss": 7.3875, + "step": 22011 + }, + { + "epoch": 2.053933003639078, + "grad_norm": 9.967888396200651e+32, + "learning_rate": 0.000229892167436647, + "loss": 7.4985, + "step": 22012 + }, + { + "epoch": 2.0540263133339556, + "grad_norm": 3.2929862939812725e+33, + "learning_rate": 0.00022988576566568134, + "loss": 7.4881, + "step": 22013 + }, + { + "epoch": 2.0541196230288326, + "grad_norm": 1.1256213953570986, + "learning_rate": 0.00022987936369158978, + "loss": 7.731, + "step": 22014 + }, + { + "epoch": 2.05421293272371, + "grad_norm": 1.4706379949809176, + "learning_rate": 0.00022987296151438867, + "loss": 7.0149, + "step": 22015 + }, + { + "epoch": 2.0543062424185874, + "grad_norm": 1.1516712333723758, + "learning_rate": 0.00022986655913409431, + "loss": 7.7304, + "step": 22016 + }, + { + "epoch": 2.0543995521134644, + "grad_norm": 1.453158910676886, + "learning_rate": 0.0002298601565507229, + "loss": 7.6436, + "step": 22017 + }, + { + "epoch": 2.054492861808342, + "grad_norm": 1.9704376407735305, + "learning_rate": 0.00022985375376429078, + "loss": 7.8017, + "step": 22018 + }, + { + "epoch": 2.054586171503219, + "grad_norm": 1.573474776813053, + "learning_rate": 0.0002298473507748142, + "loss": 7.2674, + "step": 22019 + }, + { + "epoch": 2.0546794811980966, + "grad_norm": 1.2217842578119515, + "learning_rate": 0.00022984094758230953, + "loss": 7.6392, + "step": 22020 + }, + { + "epoch": 2.0547727908929736, + "grad_norm": 1.30224849485696, + "learning_rate": 0.00022983454418679298, + "loss": 7.5107, + "step": 22021 + }, + { + "epoch": 2.054866100587851, + "grad_norm": 1.1084055241697452e+34, + "learning_rate": 0.00022982814058828072, + "loss": 7.4523, + "step": 22022 + }, + { + "epoch": 2.0549594102827284, + "grad_norm": NaN, + "learning_rate": 0.00022982173678678927, + "loss": 7.7198, + "step": 22023 + }, + { + "epoch": 2.055052719977606, + "grad_norm": NaN, + "learning_rate": 0.00022981533278233475, + "loss": 0.0, + "step": 22024 + }, + { + "epoch": 2.055146029672483, + "grad_norm": NaN, + "learning_rate": 0.00022980892857493344, + "loss": 0.0, + "step": 22025 + }, + { + "epoch": 2.0552393393673603, + "grad_norm": NaN, + "learning_rate": 0.00022980252416460173, + "loss": 0.0, + "step": 22026 + }, + { + "epoch": 2.0553326490622377, + "grad_norm": NaN, + "learning_rate": 0.00022979611955135576, + "loss": 0.0, + "step": 22027 + }, + { + "epoch": 2.0554259587571146, + "grad_norm": NaN, + "learning_rate": 0.00022978971473521192, + "loss": 0.0, + "step": 22028 + }, + { + "epoch": 2.055519268451992, + "grad_norm": NaN, + "learning_rate": 0.00022978330971618657, + "loss": 0.0, + "step": 22029 + }, + { + "epoch": 2.0556125781468695, + "grad_norm": NaN, + "learning_rate": 0.00022977690449429582, + "loss": 0.0, + "step": 22030 + }, + { + "epoch": 2.055705887841747, + "grad_norm": NaN, + "learning_rate": 0.000229770499069556, + "loss": 0.0, + "step": 22031 + }, + { + "epoch": 2.055799197536624, + "grad_norm": NaN, + "learning_rate": 0.0002297640934419835, + "loss": 0.0, + "step": 22032 + }, + { + "epoch": 2.0558925072315013, + "grad_norm": NaN, + "learning_rate": 0.0002297576876115945, + "loss": 0.0, + "step": 22033 + }, + { + "epoch": 2.0559858169263787, + "grad_norm": NaN, + "learning_rate": 0.00022975128157840533, + "loss": 0.0, + "step": 22034 + }, + { + "epoch": 2.056079126621256, + "grad_norm": NaN, + "learning_rate": 0.00022974487534243226, + "loss": 0.0, + "step": 22035 + }, + { + "epoch": 2.056172436316133, + "grad_norm": NaN, + "learning_rate": 0.0002297384689036916, + "loss": 0.0, + "step": 22036 + }, + { + "epoch": 2.0562657460110105, + "grad_norm": NaN, + "learning_rate": 0.00022973206226219967, + "loss": 0.0, + "step": 22037 + }, + { + "epoch": 2.056359055705888, + "grad_norm": NaN, + "learning_rate": 0.00022972565541797265, + "loss": 0.0, + "step": 22038 + }, + { + "epoch": 2.056452365400765, + "grad_norm": NaN, + "learning_rate": 0.00022971924837102695, + "loss": 0.0, + "step": 22039 + }, + { + "epoch": 2.0565456750956423, + "grad_norm": NaN, + "learning_rate": 0.00022971284112137882, + "loss": 0.0, + "step": 22040 + }, + { + "epoch": 2.0566389847905198, + "grad_norm": NaN, + "learning_rate": 0.0002297064336690445, + "loss": 0.0, + "step": 22041 + }, + { + "epoch": 2.056732294485397, + "grad_norm": NaN, + "learning_rate": 0.00022970002601404045, + "loss": 0.0, + "step": 22042 + }, + { + "epoch": 2.056825604180274, + "grad_norm": NaN, + "learning_rate": 0.00022969361815638277, + "loss": 0.0, + "step": 22043 + }, + { + "epoch": 2.0569189138751516, + "grad_norm": NaN, + "learning_rate": 0.00022968721009608778, + "loss": 0.0, + "step": 22044 + }, + { + "epoch": 2.057012223570029, + "grad_norm": NaN, + "learning_rate": 0.0002296808018331719, + "loss": 0.0, + "step": 22045 + }, + { + "epoch": 2.0571055332649064, + "grad_norm": NaN, + "learning_rate": 0.00022967439336765127, + "loss": 0.0, + "step": 22046 + }, + { + "epoch": 2.0571988429597834, + "grad_norm": NaN, + "learning_rate": 0.00022966798469954228, + "loss": 0.0, + "step": 22047 + }, + { + "epoch": 2.057292152654661, + "grad_norm": NaN, + "learning_rate": 0.00022966157582886127, + "loss": 0.0, + "step": 22048 + }, + { + "epoch": 2.0573854623495382, + "grad_norm": NaN, + "learning_rate": 0.00022965516675562441, + "loss": 0.0, + "step": 22049 + }, + { + "epoch": 2.057478772044415, + "grad_norm": NaN, + "learning_rate": 0.00022964875747984805, + "loss": 0.0, + "step": 22050 + }, + { + "epoch": 2.0575720817392926, + "grad_norm": NaN, + "learning_rate": 0.00022964234800154854, + "loss": 0.0, + "step": 22051 + }, + { + "epoch": 2.05766539143417, + "grad_norm": NaN, + "learning_rate": 0.00022963593832074208, + "loss": 0.0, + "step": 22052 + }, + { + "epoch": 2.0577587011290475, + "grad_norm": NaN, + "learning_rate": 0.00022962952843744505, + "loss": 0.0, + "step": 22053 + }, + { + "epoch": 2.0578520108239244, + "grad_norm": NaN, + "learning_rate": 0.0002296231183516737, + "loss": 0.0, + "step": 22054 + }, + { + "epoch": 2.057945320518802, + "grad_norm": NaN, + "learning_rate": 0.0002296167080634444, + "loss": 0.0, + "step": 22055 + }, + { + "epoch": 2.0580386302136793, + "grad_norm": NaN, + "learning_rate": 0.0002296102975727733, + "loss": 0.0, + "step": 22056 + }, + { + "epoch": 2.0581319399085567, + "grad_norm": NaN, + "learning_rate": 0.0002296038868796768, + "loss": 0.0, + "step": 22057 + }, + { + "epoch": 2.0582252496034337, + "grad_norm": NaN, + "learning_rate": 0.0002295974759841713, + "loss": 0.0, + "step": 22058 + }, + { + "epoch": 2.058318559298311, + "grad_norm": NaN, + "learning_rate": 0.00022959106488627294, + "loss": 0.0, + "step": 22059 + }, + { + "epoch": 2.0584118689931885, + "grad_norm": NaN, + "learning_rate": 0.00022958465358599803, + "loss": 0.0, + "step": 22060 + }, + { + "epoch": 2.0585051786880655, + "grad_norm": NaN, + "learning_rate": 0.00022957824208336302, + "loss": 0.0, + "step": 22061 + }, + { + "epoch": 2.058598488382943, + "grad_norm": NaN, + "learning_rate": 0.00022957183037838406, + "loss": 0.0, + "step": 22062 + }, + { + "epoch": 2.0586917980778203, + "grad_norm": NaN, + "learning_rate": 0.00022956541847107747, + "loss": 0.0, + "step": 22063 + }, + { + "epoch": 2.0587851077726977, + "grad_norm": NaN, + "learning_rate": 0.00022955900636145962, + "loss": 0.0, + "step": 22064 + }, + { + "epoch": 2.0588784174675747, + "grad_norm": NaN, + "learning_rate": 0.00022955259404954683, + "loss": 0.0, + "step": 22065 + }, + { + "epoch": 2.058971727162452, + "grad_norm": NaN, + "learning_rate": 0.00022954618153535526, + "loss": 0.0, + "step": 22066 + }, + { + "epoch": 2.0590650368573296, + "grad_norm": NaN, + "learning_rate": 0.0002295397688189014, + "loss": 0.0, + "step": 22067 + }, + { + "epoch": 2.059158346552207, + "grad_norm": NaN, + "learning_rate": 0.00022953335590020148, + "loss": 0.0, + "step": 22068 + }, + { + "epoch": 2.059251656247084, + "grad_norm": NaN, + "learning_rate": 0.0002295269427792717, + "loss": 0.0, + "step": 22069 + }, + { + "epoch": 2.0593449659419614, + "grad_norm": NaN, + "learning_rate": 0.00022952052945612856, + "loss": 0.0, + "step": 22070 + }, + { + "epoch": 2.059438275636839, + "grad_norm": NaN, + "learning_rate": 0.00022951411593078822, + "loss": 0.0, + "step": 22071 + }, + { + "epoch": 2.0595315853317158, + "grad_norm": NaN, + "learning_rate": 0.00022950770220326702, + "loss": 0.0, + "step": 22072 + }, + { + "epoch": 2.059624895026593, + "grad_norm": NaN, + "learning_rate": 0.00022950128827358137, + "loss": 0.0, + "step": 22073 + }, + { + "epoch": 2.0597182047214706, + "grad_norm": NaN, + "learning_rate": 0.00022949487414174744, + "loss": 0.0, + "step": 22074 + }, + { + "epoch": 2.059811514416348, + "grad_norm": NaN, + "learning_rate": 0.00022948845980778153, + "loss": 0.0, + "step": 22075 + }, + { + "epoch": 2.059904824111225, + "grad_norm": NaN, + "learning_rate": 0.00022948204527170008, + "loss": 0.0, + "step": 22076 + }, + { + "epoch": 2.0599981338061024, + "grad_norm": NaN, + "learning_rate": 0.00022947563053351934, + "loss": 0.0, + "step": 22077 + }, + { + "epoch": 2.06009144350098, + "grad_norm": NaN, + "learning_rate": 0.0002294692155932556, + "loss": 0.0, + "step": 22078 + }, + { + "epoch": 2.0601847531958573, + "grad_norm": NaN, + "learning_rate": 0.00022946280045092518, + "loss": 0.0, + "step": 22079 + }, + { + "epoch": 2.0602780628907342, + "grad_norm": NaN, + "learning_rate": 0.00022945638510654443, + "loss": 0.0, + "step": 22080 + }, + { + "epoch": 2.0603713725856116, + "grad_norm": NaN, + "learning_rate": 0.0002294499695601296, + "loss": 0.0, + "step": 22081 + }, + { + "epoch": 2.060464682280489, + "grad_norm": NaN, + "learning_rate": 0.000229443553811697, + "loss": 0.0, + "step": 22082 + }, + { + "epoch": 2.060557991975366, + "grad_norm": NaN, + "learning_rate": 0.00022943713786126302, + "loss": 0.0, + "step": 22083 + }, + { + "epoch": 2.0606513016702435, + "grad_norm": NaN, + "learning_rate": 0.00022943072170884392, + "loss": 0.0, + "step": 22084 + }, + { + "epoch": 2.060744611365121, + "grad_norm": NaN, + "learning_rate": 0.00022942430535445599, + "loss": 0.0, + "step": 22085 + }, + { + "epoch": 2.0608379210599983, + "grad_norm": NaN, + "learning_rate": 0.00022941788879811563, + "loss": 0.0, + "step": 22086 + }, + { + "epoch": 2.0609312307548753, + "grad_norm": NaN, + "learning_rate": 0.0002294114720398391, + "loss": 0.0, + "step": 22087 + }, + { + "epoch": 2.0610245404497527, + "grad_norm": NaN, + "learning_rate": 0.0002294050550796427, + "loss": 0.0, + "step": 22088 + }, + { + "epoch": 2.06111785014463, + "grad_norm": NaN, + "learning_rate": 0.00022939863791754278, + "loss": 0.0, + "step": 22089 + }, + { + "epoch": 2.0612111598395075, + "grad_norm": NaN, + "learning_rate": 0.00022939222055355562, + "loss": 0.0, + "step": 22090 + }, + { + "epoch": 2.0613044695343845, + "grad_norm": NaN, + "learning_rate": 0.00022938580298769752, + "loss": 0.0, + "step": 22091 + }, + { + "epoch": 2.061397779229262, + "grad_norm": NaN, + "learning_rate": 0.00022937938521998494, + "loss": 0.0, + "step": 22092 + }, + { + "epoch": 2.0614910889241393, + "grad_norm": NaN, + "learning_rate": 0.00022937296725043406, + "loss": 0.0, + "step": 22093 + }, + { + "epoch": 2.0615843986190163, + "grad_norm": NaN, + "learning_rate": 0.0002293665490790612, + "loss": 0.0, + "step": 22094 + }, + { + "epoch": 2.0616777083138937, + "grad_norm": NaN, + "learning_rate": 0.00022936013070588276, + "loss": 0.0, + "step": 22095 + }, + { + "epoch": 2.061771018008771, + "grad_norm": NaN, + "learning_rate": 0.000229353712130915, + "loss": 0.0, + "step": 22096 + }, + { + "epoch": 2.0618643277036486, + "grad_norm": NaN, + "learning_rate": 0.00022934729335417423, + "loss": 0.0, + "step": 22097 + }, + { + "epoch": 2.0619576373985256, + "grad_norm": NaN, + "learning_rate": 0.00022934087437567684, + "loss": 0.0, + "step": 22098 + }, + { + "epoch": 2.062050947093403, + "grad_norm": NaN, + "learning_rate": 0.0002293344551954391, + "loss": 0.0, + "step": 22099 + }, + { + "epoch": 2.0621442567882804, + "grad_norm": NaN, + "learning_rate": 0.00022932803581347734, + "loss": 0.0, + "step": 22100 + }, + { + "epoch": 2.062237566483158, + "grad_norm": NaN, + "learning_rate": 0.00022932161622980782, + "loss": 0.0, + "step": 22101 + }, + { + "epoch": 2.062330876178035, + "grad_norm": NaN, + "learning_rate": 0.00022931519644444704, + "loss": 0.0, + "step": 22102 + }, + { + "epoch": 2.062424185872912, + "grad_norm": NaN, + "learning_rate": 0.00022930877645741112, + "loss": 0.0, + "step": 22103 + }, + { + "epoch": 2.0625174955677896, + "grad_norm": NaN, + "learning_rate": 0.0002293023562687165, + "loss": 0.0, + "step": 22104 + }, + { + "epoch": 2.0626108052626666, + "grad_norm": NaN, + "learning_rate": 0.00022929593587837953, + "loss": 0.0, + "step": 22105 + }, + { + "epoch": 2.062704114957544, + "grad_norm": NaN, + "learning_rate": 0.00022928951528641643, + "loss": 0.0, + "step": 22106 + }, + { + "epoch": 2.0627974246524214, + "grad_norm": NaN, + "learning_rate": 0.00022928309449284356, + "loss": 0.0, + "step": 22107 + }, + { + "epoch": 2.062890734347299, + "grad_norm": NaN, + "learning_rate": 0.0002292766734976773, + "loss": 0.0, + "step": 22108 + }, + { + "epoch": 2.062984044042176, + "grad_norm": NaN, + "learning_rate": 0.00022927025230093398, + "loss": 0.0, + "step": 22109 + }, + { + "epoch": 2.0630773537370533, + "grad_norm": NaN, + "learning_rate": 0.00022926383090262983, + "loss": 0.0, + "step": 22110 + }, + { + "epoch": 2.0631706634319307, + "grad_norm": NaN, + "learning_rate": 0.00022925740930278122, + "loss": 0.0, + "step": 22111 + }, + { + "epoch": 2.0632639731268076, + "grad_norm": NaN, + "learning_rate": 0.00022925098750140456, + "loss": 0.0, + "step": 22112 + }, + { + "epoch": 2.063357282821685, + "grad_norm": NaN, + "learning_rate": 0.0002292445654985161, + "loss": 0.0, + "step": 22113 + }, + { + "epoch": 2.0634505925165625, + "grad_norm": NaN, + "learning_rate": 0.00022923814329413216, + "loss": 0.0, + "step": 22114 + }, + { + "epoch": 2.06354390221144, + "grad_norm": NaN, + "learning_rate": 0.00022923172088826912, + "loss": 0.0, + "step": 22115 + }, + { + "epoch": 2.063637211906317, + "grad_norm": NaN, + "learning_rate": 0.00022922529828094321, + "loss": 0.0, + "step": 22116 + }, + { + "epoch": 2.0637305216011943, + "grad_norm": NaN, + "learning_rate": 0.00022921887547217091, + "loss": 0.0, + "step": 22117 + }, + { + "epoch": 2.0638238312960717, + "grad_norm": NaN, + "learning_rate": 0.00022921245246196849, + "loss": 0.0, + "step": 22118 + }, + { + "epoch": 2.063917140990949, + "grad_norm": NaN, + "learning_rate": 0.00022920602925035222, + "loss": 0.0, + "step": 22119 + }, + { + "epoch": 2.064010450685826, + "grad_norm": NaN, + "learning_rate": 0.00022919960583733852, + "loss": 0.0, + "step": 22120 + }, + { + "epoch": 2.0641037603807035, + "grad_norm": NaN, + "learning_rate": 0.00022919318222294367, + "loss": 0.0, + "step": 22121 + }, + { + "epoch": 2.064197070075581, + "grad_norm": NaN, + "learning_rate": 0.00022918675840718402, + "loss": 0.0, + "step": 22122 + }, + { + "epoch": 2.0642903797704584, + "grad_norm": NaN, + "learning_rate": 0.00022918033439007585, + "loss": 0.0, + "step": 22123 + }, + { + "epoch": 2.0643836894653353, + "grad_norm": NaN, + "learning_rate": 0.0002291739101716356, + "loss": 0.0, + "step": 22124 + }, + { + "epoch": 2.0644769991602128, + "grad_norm": NaN, + "learning_rate": 0.00022916748575187957, + "loss": 0.0, + "step": 22125 + }, + { + "epoch": 2.06457030885509, + "grad_norm": NaN, + "learning_rate": 0.000229161061130824, + "loss": 0.0, + "step": 22126 + }, + { + "epoch": 2.064663618549967, + "grad_norm": NaN, + "learning_rate": 0.00022915463630848536, + "loss": 0.0, + "step": 22127 + }, + { + "epoch": 2.0647569282448446, + "grad_norm": NaN, + "learning_rate": 0.0002291482112848799, + "loss": 0.0, + "step": 22128 + }, + { + "epoch": 2.064850237939722, + "grad_norm": NaN, + "learning_rate": 0.00022914178606002394, + "loss": 0.0, + "step": 22129 + }, + { + "epoch": 2.0649435476345994, + "grad_norm": NaN, + "learning_rate": 0.00022913536063393395, + "loss": 0.0, + "step": 22130 + }, + { + "epoch": 2.0650368573294764, + "grad_norm": NaN, + "learning_rate": 0.00022912893500662613, + "loss": 0.0, + "step": 22131 + }, + { + "epoch": 2.065130167024354, + "grad_norm": NaN, + "learning_rate": 0.00022912250917811689, + "loss": 0.0, + "step": 22132 + }, + { + "epoch": 2.0652234767192312, + "grad_norm": NaN, + "learning_rate": 0.00022911608314842252, + "loss": 0.0, + "step": 22133 + }, + { + "epoch": 2.065316786414108, + "grad_norm": NaN, + "learning_rate": 0.0002291096569175594, + "loss": 0.0, + "step": 22134 + }, + { + "epoch": 2.0654100961089856, + "grad_norm": NaN, + "learning_rate": 0.00022910323048554388, + "loss": 0.0, + "step": 22135 + }, + { + "epoch": 2.065503405803863, + "grad_norm": NaN, + "learning_rate": 0.0002290968038523922, + "loss": 0.0, + "step": 22136 + }, + { + "epoch": 2.0655967154987405, + "grad_norm": NaN, + "learning_rate": 0.0002290903770181209, + "loss": 0.0, + "step": 22137 + }, + { + "epoch": 2.0656900251936174, + "grad_norm": NaN, + "learning_rate": 0.00022908394998274608, + "loss": 0.0, + "step": 22138 + }, + { + "epoch": 2.065783334888495, + "grad_norm": NaN, + "learning_rate": 0.00022907752274628425, + "loss": 0.0, + "step": 22139 + }, + { + "epoch": 2.0658766445833723, + "grad_norm": NaN, + "learning_rate": 0.00022907109530875177, + "loss": 0.0, + "step": 22140 + }, + { + "epoch": 2.0659699542782497, + "grad_norm": NaN, + "learning_rate": 0.00022906466767016478, + "loss": 0.0, + "step": 22141 + }, + { + "epoch": 2.0660632639731267, + "grad_norm": NaN, + "learning_rate": 0.00022905823983053987, + "loss": 0.0, + "step": 22142 + }, + { + "epoch": 2.066156573668004, + "grad_norm": NaN, + "learning_rate": 0.00022905181178989325, + "loss": 0.0, + "step": 22143 + }, + { + "epoch": 2.0662498833628815, + "grad_norm": NaN, + "learning_rate": 0.00022904538354824128, + "loss": 0.0, + "step": 22144 + }, + { + "epoch": 2.0663431930577585, + "grad_norm": NaN, + "learning_rate": 0.0002290389551056003, + "loss": 0.0, + "step": 22145 + }, + { + "epoch": 2.066436502752636, + "grad_norm": NaN, + "learning_rate": 0.0002290325264619867, + "loss": 0.0, + "step": 22146 + }, + { + "epoch": 2.0665298124475133, + "grad_norm": NaN, + "learning_rate": 0.0002290260976174168, + "loss": 0.0, + "step": 22147 + }, + { + "epoch": 2.0666231221423907, + "grad_norm": NaN, + "learning_rate": 0.0002290196685719069, + "loss": 0.0, + "step": 22148 + }, + { + "epoch": 2.0667164318372677, + "grad_norm": NaN, + "learning_rate": 0.0002290132393254734, + "loss": 0.0, + "step": 22149 + }, + { + "epoch": 2.066809741532145, + "grad_norm": NaN, + "learning_rate": 0.00022900680987813268, + "loss": 0.0, + "step": 22150 + }, + { + "epoch": 2.0669030512270226, + "grad_norm": NaN, + "learning_rate": 0.00022900038022990096, + "loss": 0.0, + "step": 22151 + }, + { + "epoch": 2.0669963609219, + "grad_norm": NaN, + "learning_rate": 0.00022899395038079477, + "loss": 0.0, + "step": 22152 + }, + { + "epoch": 2.067089670616777, + "grad_norm": NaN, + "learning_rate": 0.00022898752033083034, + "loss": 0.0, + "step": 22153 + }, + { + "epoch": 2.0671829803116544, + "grad_norm": NaN, + "learning_rate": 0.00022898109008002402, + "loss": 0.0, + "step": 22154 + }, + { + "epoch": 2.067276290006532, + "grad_norm": NaN, + "learning_rate": 0.0002289746596283922, + "loss": 0.0, + "step": 22155 + }, + { + "epoch": 2.0673695997014088, + "grad_norm": NaN, + "learning_rate": 0.00022896822897595123, + "loss": 0.0, + "step": 22156 + }, + { + "epoch": 2.067462909396286, + "grad_norm": NaN, + "learning_rate": 0.0002289617981227174, + "loss": 0.0, + "step": 22157 + }, + { + "epoch": 2.0675562190911636, + "grad_norm": NaN, + "learning_rate": 0.00022895536706870714, + "loss": 0.0, + "step": 22158 + }, + { + "epoch": 2.067649528786041, + "grad_norm": NaN, + "learning_rate": 0.00022894893581393676, + "loss": 0.0, + "step": 22159 + }, + { + "epoch": 2.067742838480918, + "grad_norm": NaN, + "learning_rate": 0.0002289425043584226, + "loss": 0.0, + "step": 22160 + }, + { + "epoch": 2.0678361481757954, + "grad_norm": NaN, + "learning_rate": 0.00022893607270218106, + "loss": 0.0, + "step": 22161 + }, + { + "epoch": 2.067929457870673, + "grad_norm": NaN, + "learning_rate": 0.00022892964084522853, + "loss": 0.0, + "step": 22162 + }, + { + "epoch": 2.0680227675655503, + "grad_norm": NaN, + "learning_rate": 0.00022892320878758118, + "loss": 0.0, + "step": 22163 + }, + { + "epoch": 2.0681160772604272, + "grad_norm": NaN, + "learning_rate": 0.0002289167765292556, + "loss": 0.0, + "step": 22164 + }, + { + "epoch": 2.0682093869553047, + "grad_norm": NaN, + "learning_rate": 0.000228910344070268, + "loss": 0.0, + "step": 22165 + }, + { + "epoch": 2.068302696650182, + "grad_norm": NaN, + "learning_rate": 0.00022890391141063473, + "loss": 0.0, + "step": 22166 + }, + { + "epoch": 2.068396006345059, + "grad_norm": NaN, + "learning_rate": 0.0002288974785503722, + "loss": 0.0, + "step": 22167 + }, + { + "epoch": 2.0684893160399365, + "grad_norm": NaN, + "learning_rate": 0.0002288910454894968, + "loss": 0.0, + "step": 22168 + }, + { + "epoch": 2.068582625734814, + "grad_norm": NaN, + "learning_rate": 0.00022888461222802479, + "loss": 0.0, + "step": 22169 + }, + { + "epoch": 2.0686759354296913, + "grad_norm": NaN, + "learning_rate": 0.0002288781787659726, + "loss": 0.0, + "step": 22170 + }, + { + "epoch": 2.0687692451245683, + "grad_norm": NaN, + "learning_rate": 0.00022887174510335658, + "loss": 0.0, + "step": 22171 + }, + { + "epoch": 2.0688625548194457, + "grad_norm": NaN, + "learning_rate": 0.00022886531124019307, + "loss": 0.0, + "step": 22172 + }, + { + "epoch": 2.068955864514323, + "grad_norm": NaN, + "learning_rate": 0.00022885887717649837, + "loss": 0.0, + "step": 22173 + }, + { + "epoch": 2.0690491742092005, + "grad_norm": NaN, + "learning_rate": 0.00022885244291228897, + "loss": 0.0, + "step": 22174 + }, + { + "epoch": 2.0691424839040775, + "grad_norm": NaN, + "learning_rate": 0.0002288460084475812, + "loss": 0.0, + "step": 22175 + }, + { + "epoch": 2.069235793598955, + "grad_norm": NaN, + "learning_rate": 0.00022883957378239128, + "loss": 0.0, + "step": 22176 + }, + { + "epoch": 2.0693291032938324, + "grad_norm": NaN, + "learning_rate": 0.00022883313891673576, + "loss": 0.0, + "step": 22177 + }, + { + "epoch": 2.0694224129887093, + "grad_norm": NaN, + "learning_rate": 0.00022882670385063092, + "loss": 0.0, + "step": 22178 + }, + { + "epoch": 2.0695157226835867, + "grad_norm": NaN, + "learning_rate": 0.00022882026858409306, + "loss": 0.0, + "step": 22179 + }, + { + "epoch": 2.069609032378464, + "grad_norm": NaN, + "learning_rate": 0.00022881383311713866, + "loss": 0.0, + "step": 22180 + }, + { + "epoch": 2.0697023420733416, + "grad_norm": NaN, + "learning_rate": 0.00022880739744978406, + "loss": 0.0, + "step": 22181 + }, + { + "epoch": 2.0697956517682186, + "grad_norm": NaN, + "learning_rate": 0.00022880096158204552, + "loss": 0.0, + "step": 22182 + }, + { + "epoch": 2.069888961463096, + "grad_norm": NaN, + "learning_rate": 0.00022879452551393949, + "loss": 0.0, + "step": 22183 + }, + { + "epoch": 2.0699822711579734, + "grad_norm": NaN, + "learning_rate": 0.00022878808924548235, + "loss": 0.0, + "step": 22184 + }, + { + "epoch": 2.070075580852851, + "grad_norm": NaN, + "learning_rate": 0.0002287816527766904, + "loss": 0.0, + "step": 22185 + }, + { + "epoch": 2.070168890547728, + "grad_norm": NaN, + "learning_rate": 0.0002287752161075801, + "loss": 0.0, + "step": 22186 + }, + { + "epoch": 2.070262200242605, + "grad_norm": NaN, + "learning_rate": 0.00022876877923816774, + "loss": 0.0, + "step": 22187 + }, + { + "epoch": 2.0703555099374826, + "grad_norm": NaN, + "learning_rate": 0.0002287623421684697, + "loss": 0.0, + "step": 22188 + }, + { + "epoch": 2.0704488196323596, + "grad_norm": NaN, + "learning_rate": 0.00022875590489850232, + "loss": 0.0, + "step": 22189 + }, + { + "epoch": 2.070542129327237, + "grad_norm": NaN, + "learning_rate": 0.00022874946742828205, + "loss": 0.0, + "step": 22190 + }, + { + "epoch": 2.0706354390221144, + "grad_norm": NaN, + "learning_rate": 0.00022874302975782522, + "loss": 0.0, + "step": 22191 + }, + { + "epoch": 2.070728748716992, + "grad_norm": NaN, + "learning_rate": 0.00022873659188714813, + "loss": 0.0, + "step": 22192 + }, + { + "epoch": 2.070822058411869, + "grad_norm": NaN, + "learning_rate": 0.00022873015381626725, + "loss": 0.0, + "step": 22193 + }, + { + "epoch": 2.0709153681067463, + "grad_norm": NaN, + "learning_rate": 0.00022872371554519895, + "loss": 0.0, + "step": 22194 + }, + { + "epoch": 2.0710086778016237, + "grad_norm": NaN, + "learning_rate": 0.00022871727707395945, + "loss": 0.0, + "step": 22195 + }, + { + "epoch": 2.071101987496501, + "grad_norm": NaN, + "learning_rate": 0.00022871083840256533, + "loss": 0.0, + "step": 22196 + }, + { + "epoch": 2.071195297191378, + "grad_norm": NaN, + "learning_rate": 0.00022870439953103286, + "loss": 0.0, + "step": 22197 + }, + { + "epoch": 2.0712886068862555, + "grad_norm": NaN, + "learning_rate": 0.00022869796045937838, + "loss": 0.0, + "step": 22198 + }, + { + "epoch": 2.071381916581133, + "grad_norm": NaN, + "learning_rate": 0.00022869152118761831, + "loss": 0.0, + "step": 22199 + }, + { + "epoch": 2.07147522627601, + "grad_norm": NaN, + "learning_rate": 0.00022868508171576904, + "loss": 0.0, + "step": 22200 + }, + { + "epoch": 2.0715685359708873, + "grad_norm": NaN, + "learning_rate": 0.00022867864204384686, + "loss": 0.0, + "step": 22201 + }, + { + "epoch": 2.0716618456657647, + "grad_norm": NaN, + "learning_rate": 0.00022867220217186824, + "loss": 0.0, + "step": 22202 + }, + { + "epoch": 2.071755155360642, + "grad_norm": NaN, + "learning_rate": 0.0002286657620998495, + "loss": 0.0, + "step": 22203 + }, + { + "epoch": 2.071848465055519, + "grad_norm": NaN, + "learning_rate": 0.000228659321827807, + "loss": 0.0, + "step": 22204 + }, + { + "epoch": 2.0719417747503965, + "grad_norm": NaN, + "learning_rate": 0.00022865288135575715, + "loss": 0.0, + "step": 22205 + }, + { + "epoch": 2.072035084445274, + "grad_norm": NaN, + "learning_rate": 0.0002286464406837164, + "loss": 0.0, + "step": 22206 + }, + { + "epoch": 2.0721283941401514, + "grad_norm": NaN, + "learning_rate": 0.00022863999981170098, + "loss": 0.0, + "step": 22207 + }, + { + "epoch": 2.0722217038350284, + "grad_norm": NaN, + "learning_rate": 0.00022863355873972734, + "loss": 0.0, + "step": 22208 + }, + { + "epoch": 2.0723150135299058, + "grad_norm": NaN, + "learning_rate": 0.00022862711746781186, + "loss": 0.0, + "step": 22209 + }, + { + "epoch": 2.072408323224783, + "grad_norm": NaN, + "learning_rate": 0.00022862067599597094, + "loss": 0.0, + "step": 22210 + }, + { + "epoch": 2.07250163291966, + "grad_norm": NaN, + "learning_rate": 0.0002286142343242209, + "loss": 0.0, + "step": 22211 + }, + { + "epoch": 2.0725949426145376, + "grad_norm": NaN, + "learning_rate": 0.00022860779245257814, + "loss": 0.0, + "step": 22212 + }, + { + "epoch": 2.072688252309415, + "grad_norm": NaN, + "learning_rate": 0.0002286013503810591, + "loss": 0.0, + "step": 22213 + }, + { + "epoch": 2.0727815620042924, + "grad_norm": NaN, + "learning_rate": 0.00022859490810968005, + "loss": 0.0, + "step": 22214 + }, + { + "epoch": 2.0728748716991694, + "grad_norm": NaN, + "learning_rate": 0.00022858846563845743, + "loss": 0.0, + "step": 22215 + }, + { + "epoch": 2.072968181394047, + "grad_norm": NaN, + "learning_rate": 0.0002285820229674077, + "loss": 0.0, + "step": 22216 + }, + { + "epoch": 2.0730614910889242, + "grad_norm": NaN, + "learning_rate": 0.0002285755800965471, + "loss": 0.0, + "step": 22217 + }, + { + "epoch": 2.0731548007838017, + "grad_norm": NaN, + "learning_rate": 0.00022856913702589204, + "loss": 0.0, + "step": 22218 + }, + { + "epoch": 2.0732481104786786, + "grad_norm": NaN, + "learning_rate": 0.000228562693755459, + "loss": 0.0, + "step": 22219 + }, + { + "epoch": 2.073341420173556, + "grad_norm": NaN, + "learning_rate": 0.0002285562502852643, + "loss": 0.0, + "step": 22220 + }, + { + "epoch": 2.0734347298684335, + "grad_norm": NaN, + "learning_rate": 0.0002285498066153243, + "loss": 0.0, + "step": 22221 + }, + { + "epoch": 2.0735280395633104, + "grad_norm": NaN, + "learning_rate": 0.00022854336274565545, + "loss": 0.0, + "step": 22222 + }, + { + "epoch": 2.073621349258188, + "grad_norm": NaN, + "learning_rate": 0.00022853691867627402, + "loss": 0.0, + "step": 22223 + }, + { + "epoch": 2.0737146589530653, + "grad_norm": NaN, + "learning_rate": 0.0002285304744071965, + "loss": 0.0, + "step": 22224 + }, + { + "epoch": 2.0738079686479427, + "grad_norm": NaN, + "learning_rate": 0.00022852402993843928, + "loss": 0.0, + "step": 22225 + }, + { + "epoch": 2.0739012783428197, + "grad_norm": NaN, + "learning_rate": 0.0002285175852700187, + "loss": 0.0, + "step": 22226 + }, + { + "epoch": 2.073994588037697, + "grad_norm": NaN, + "learning_rate": 0.00022851114040195114, + "loss": 0.0, + "step": 22227 + }, + { + "epoch": 2.0740878977325745, + "grad_norm": NaN, + "learning_rate": 0.000228504695334253, + "loss": 0.0, + "step": 22228 + }, + { + "epoch": 2.0741812074274515, + "grad_norm": NaN, + "learning_rate": 0.00022849825006694073, + "loss": 0.0, + "step": 22229 + }, + { + "epoch": 2.074274517122329, + "grad_norm": NaN, + "learning_rate": 0.0002284918046000306, + "loss": 0.0, + "step": 22230 + }, + { + "epoch": 2.0743678268172063, + "grad_norm": NaN, + "learning_rate": 0.00022848535893353912, + "loss": 0.0, + "step": 22231 + }, + { + "epoch": 2.0744611365120837, + "grad_norm": NaN, + "learning_rate": 0.00022847891306748257, + "loss": 0.0, + "step": 22232 + }, + { + "epoch": 2.0745544462069607, + "grad_norm": NaN, + "learning_rate": 0.00022847246700187743, + "loss": 0.0, + "step": 22233 + }, + { + "epoch": 2.074647755901838, + "grad_norm": NaN, + "learning_rate": 0.00022846602073674002, + "loss": 0.0, + "step": 22234 + }, + { + "epoch": 2.0747410655967156, + "grad_norm": NaN, + "learning_rate": 0.0002284595742720868, + "loss": 0.0, + "step": 22235 + }, + { + "epoch": 2.074834375291593, + "grad_norm": NaN, + "learning_rate": 0.00022845312760793408, + "loss": 0.0, + "step": 22236 + }, + { + "epoch": 2.07492768498647, + "grad_norm": NaN, + "learning_rate": 0.0002284466807442983, + "loss": 0.0, + "step": 22237 + }, + { + "epoch": 2.0750209946813474, + "grad_norm": NaN, + "learning_rate": 0.0002284402336811959, + "loss": 0.0, + "step": 22238 + }, + { + "epoch": 2.075114304376225, + "grad_norm": NaN, + "learning_rate": 0.00022843378641864315, + "loss": 0.0, + "step": 22239 + }, + { + "epoch": 2.075207614071102, + "grad_norm": NaN, + "learning_rate": 0.0002284273389566565, + "loss": 0.0, + "step": 22240 + }, + { + "epoch": 2.075300923765979, + "grad_norm": NaN, + "learning_rate": 0.00022842089129525249, + "loss": 0.0, + "step": 22241 + }, + { + "epoch": 2.0753942334608566, + "grad_norm": NaN, + "learning_rate": 0.00022841444343444725, + "loss": 0.0, + "step": 22242 + }, + { + "epoch": 2.075487543155734, + "grad_norm": NaN, + "learning_rate": 0.0002284079953742574, + "loss": 0.0, + "step": 22243 + }, + { + "epoch": 2.075580852850611, + "grad_norm": NaN, + "learning_rate": 0.00022840154711469918, + "loss": 0.0, + "step": 22244 + }, + { + "epoch": 2.0756741625454884, + "grad_norm": NaN, + "learning_rate": 0.00022839509865578908, + "loss": 0.0, + "step": 22245 + }, + { + "epoch": 2.075767472240366, + "grad_norm": NaN, + "learning_rate": 0.00022838864999754344, + "loss": 0.0, + "step": 22246 + }, + { + "epoch": 2.0758607819352433, + "grad_norm": NaN, + "learning_rate": 0.00022838220113997873, + "loss": 0.0, + "step": 22247 + }, + { + "epoch": 2.0759540916301202, + "grad_norm": NaN, + "learning_rate": 0.0002283757520831113, + "loss": 0.0, + "step": 22248 + }, + { + "epoch": 2.0760474013249977, + "grad_norm": NaN, + "learning_rate": 0.00022836930282695752, + "loss": 0.0, + "step": 22249 + }, + { + "epoch": 2.076140711019875, + "grad_norm": NaN, + "learning_rate": 0.0002283628533715338, + "loss": 0.0, + "step": 22250 + }, + { + "epoch": 2.076234020714752, + "grad_norm": NaN, + "learning_rate": 0.00022835640371685658, + "loss": 0.0, + "step": 22251 + }, + { + "epoch": 2.0763273304096295, + "grad_norm": NaN, + "learning_rate": 0.00022834995386294222, + "loss": 0.0, + "step": 22252 + }, + { + "epoch": 2.076420640104507, + "grad_norm": NaN, + "learning_rate": 0.00022834350380980718, + "loss": 0.0, + "step": 22253 + }, + { + "epoch": 2.0765139497993843, + "grad_norm": NaN, + "learning_rate": 0.0002283370535574678, + "loss": 0.0, + "step": 22254 + }, + { + "epoch": 2.0766072594942613, + "grad_norm": NaN, + "learning_rate": 0.00022833060310594046, + "loss": 0.0, + "step": 22255 + }, + { + "epoch": 2.0767005691891387, + "grad_norm": NaN, + "learning_rate": 0.0002283241524552416, + "loss": 0.0, + "step": 22256 + }, + { + "epoch": 2.076793878884016, + "grad_norm": NaN, + "learning_rate": 0.0002283177016053877, + "loss": 0.0, + "step": 22257 + }, + { + "epoch": 2.0768871885788935, + "grad_norm": NaN, + "learning_rate": 0.000228311250556395, + "loss": 0.0, + "step": 22258 + }, + { + "epoch": 2.0769804982737705, + "grad_norm": NaN, + "learning_rate": 0.00022830479930828, + "loss": 0.0, + "step": 22259 + }, + { + "epoch": 2.077073807968648, + "grad_norm": NaN, + "learning_rate": 0.00022829834786105917, + "loss": 0.0, + "step": 22260 + }, + { + "epoch": 2.0771671176635254, + "grad_norm": NaN, + "learning_rate": 0.00022829189621474875, + "loss": 0.0, + "step": 22261 + }, + { + "epoch": 2.0772604273584023, + "grad_norm": NaN, + "learning_rate": 0.00022828544436936523, + "loss": 0.0, + "step": 22262 + }, + { + "epoch": 2.0773537370532797, + "grad_norm": NaN, + "learning_rate": 0.00022827899232492503, + "loss": 0.0, + "step": 22263 + }, + { + "epoch": 2.077447046748157, + "grad_norm": NaN, + "learning_rate": 0.00022827254008144459, + "loss": 0.0, + "step": 22264 + }, + { + "epoch": 2.0775403564430346, + "grad_norm": NaN, + "learning_rate": 0.00022826608763894024, + "loss": 0.0, + "step": 22265 + }, + { + "epoch": 2.0776336661379116, + "grad_norm": NaN, + "learning_rate": 0.0002282596349974284, + "loss": 0.0, + "step": 22266 + }, + { + "epoch": 2.077726975832789, + "grad_norm": NaN, + "learning_rate": 0.00022825318215692546, + "loss": 0.0, + "step": 22267 + }, + { + "epoch": 2.0778202855276664, + "grad_norm": NaN, + "learning_rate": 0.00022824672911744788, + "loss": 0.0, + "step": 22268 + }, + { + "epoch": 2.077913595222544, + "grad_norm": NaN, + "learning_rate": 0.00022824027587901207, + "loss": 0.0, + "step": 22269 + }, + { + "epoch": 2.078006904917421, + "grad_norm": NaN, + "learning_rate": 0.00022823382244163438, + "loss": 0.0, + "step": 22270 + }, + { + "epoch": 2.078100214612298, + "grad_norm": NaN, + "learning_rate": 0.00022822736880533128, + "loss": 0.0, + "step": 22271 + }, + { + "epoch": 2.0781935243071756, + "grad_norm": NaN, + "learning_rate": 0.00022822091497011915, + "loss": 0.0, + "step": 22272 + }, + { + "epoch": 2.0782868340020526, + "grad_norm": NaN, + "learning_rate": 0.00022821446093601442, + "loss": 0.0, + "step": 22273 + }, + { + "epoch": 2.07838014369693, + "grad_norm": NaN, + "learning_rate": 0.00022820800670303348, + "loss": 0.0, + "step": 22274 + }, + { + "epoch": 2.0784734533918074, + "grad_norm": NaN, + "learning_rate": 0.0002282015522711927, + "loss": 0.0, + "step": 22275 + }, + { + "epoch": 2.078566763086685, + "grad_norm": NaN, + "learning_rate": 0.00022819509764050857, + "loss": 0.0, + "step": 22276 + }, + { + "epoch": 2.078660072781562, + "grad_norm": NaN, + "learning_rate": 0.00022818864281099748, + "loss": 0.0, + "step": 22277 + }, + { + "epoch": 2.0787533824764393, + "grad_norm": NaN, + "learning_rate": 0.0002281821877826758, + "loss": 0.0, + "step": 22278 + }, + { + "epoch": 2.0788466921713167, + "grad_norm": NaN, + "learning_rate": 0.00022817573255556005, + "loss": 0.0, + "step": 22279 + }, + { + "epoch": 2.078940001866194, + "grad_norm": NaN, + "learning_rate": 0.0002281692771296665, + "loss": 0.0, + "step": 22280 + }, + { + "epoch": 2.079033311561071, + "grad_norm": NaN, + "learning_rate": 0.00022816282150501163, + "loss": 0.0, + "step": 22281 + }, + { + "epoch": 2.0791266212559485, + "grad_norm": NaN, + "learning_rate": 0.00022815636568161187, + "loss": 0.0, + "step": 22282 + }, + { + "epoch": 2.079219930950826, + "grad_norm": NaN, + "learning_rate": 0.00022814990965948364, + "loss": 0.0, + "step": 22283 + }, + { + "epoch": 2.079313240645703, + "grad_norm": NaN, + "learning_rate": 0.00022814345343864336, + "loss": 0.0, + "step": 22284 + }, + { + "epoch": 2.0794065503405803, + "grad_norm": NaN, + "learning_rate": 0.00022813699701910737, + "loss": 0.0, + "step": 22285 + }, + { + "epoch": 2.0794998600354577, + "grad_norm": NaN, + "learning_rate": 0.00022813054040089217, + "loss": 0.0, + "step": 22286 + }, + { + "epoch": 2.079593169730335, + "grad_norm": NaN, + "learning_rate": 0.00022812408358401418, + "loss": 0.0, + "step": 22287 + }, + { + "epoch": 2.079686479425212, + "grad_norm": NaN, + "learning_rate": 0.00022811762656848973, + "loss": 0.0, + "step": 22288 + }, + { + "epoch": 2.0797797891200895, + "grad_norm": NaN, + "learning_rate": 0.00022811116935433532, + "loss": 0.0, + "step": 22289 + }, + { + "epoch": 2.079873098814967, + "grad_norm": NaN, + "learning_rate": 0.00022810471194156736, + "loss": 0.0, + "step": 22290 + }, + { + "epoch": 2.0799664085098444, + "grad_norm": NaN, + "learning_rate": 0.00022809825433020225, + "loss": 0.0, + "step": 22291 + }, + { + "epoch": 2.0800597182047214, + "grad_norm": NaN, + "learning_rate": 0.0002280917965202564, + "loss": 0.0, + "step": 22292 + }, + { + "epoch": 2.0801530278995988, + "grad_norm": NaN, + "learning_rate": 0.00022808533851174624, + "loss": 0.0, + "step": 22293 + }, + { + "epoch": 2.080246337594476, + "grad_norm": NaN, + "learning_rate": 0.00022807888030468821, + "loss": 0.0, + "step": 22294 + }, + { + "epoch": 2.080339647289353, + "grad_norm": NaN, + "learning_rate": 0.0002280724218990987, + "loss": 0.0, + "step": 22295 + }, + { + "epoch": 2.0804329569842306, + "grad_norm": NaN, + "learning_rate": 0.00022806596329499418, + "loss": 0.0, + "step": 22296 + }, + { + "epoch": 2.080526266679108, + "grad_norm": NaN, + "learning_rate": 0.00022805950449239102, + "loss": 0.0, + "step": 22297 + }, + { + "epoch": 2.0806195763739854, + "grad_norm": NaN, + "learning_rate": 0.00022805304549130566, + "loss": 0.0, + "step": 22298 + }, + { + "epoch": 2.0807128860688624, + "grad_norm": NaN, + "learning_rate": 0.0002280465862917545, + "loss": 0.0, + "step": 22299 + }, + { + "epoch": 2.08080619576374, + "grad_norm": NaN, + "learning_rate": 0.00022804012689375401, + "loss": 0.0, + "step": 22300 + }, + { + "epoch": 2.0808995054586172, + "grad_norm": NaN, + "learning_rate": 0.00022803366729732063, + "loss": 0.0, + "step": 22301 + }, + { + "epoch": 2.0809928151534947, + "grad_norm": NaN, + "learning_rate": 0.0002280272075024707, + "loss": 0.0, + "step": 22302 + }, + { + "epoch": 2.0810861248483716, + "grad_norm": NaN, + "learning_rate": 0.00022802074750922073, + "loss": 0.0, + "step": 22303 + }, + { + "epoch": 2.081179434543249, + "grad_norm": NaN, + "learning_rate": 0.00022801428731758708, + "loss": 0.0, + "step": 22304 + }, + { + "epoch": 2.0812727442381265, + "grad_norm": NaN, + "learning_rate": 0.00022800782692758623, + "loss": 0.0, + "step": 22305 + }, + { + "epoch": 2.0813660539330034, + "grad_norm": NaN, + "learning_rate": 0.0002280013663392346, + "loss": 0.0, + "step": 22306 + }, + { + "epoch": 2.081459363627881, + "grad_norm": NaN, + "learning_rate": 0.00022799490555254855, + "loss": 0.0, + "step": 22307 + }, + { + "epoch": 2.0815526733227583, + "grad_norm": NaN, + "learning_rate": 0.0002279884445675446, + "loss": 0.0, + "step": 22308 + }, + { + "epoch": 2.0816459830176357, + "grad_norm": NaN, + "learning_rate": 0.00022798198338423912, + "loss": 0.0, + "step": 22309 + }, + { + "epoch": 2.0817392927125127, + "grad_norm": NaN, + "learning_rate": 0.0002279755220026486, + "loss": 0.0, + "step": 22310 + }, + { + "epoch": 2.08183260240739, + "grad_norm": NaN, + "learning_rate": 0.00022796906042278934, + "loss": 0.0, + "step": 22311 + }, + { + "epoch": 2.0819259121022675, + "grad_norm": NaN, + "learning_rate": 0.00022796259864467792, + "loss": 0.0, + "step": 22312 + }, + { + "epoch": 2.082019221797145, + "grad_norm": NaN, + "learning_rate": 0.00022795613666833068, + "loss": 0.0, + "step": 22313 + }, + { + "epoch": 2.082112531492022, + "grad_norm": NaN, + "learning_rate": 0.0002279496744937641, + "loss": 0.0, + "step": 22314 + }, + { + "epoch": 2.0822058411868993, + "grad_norm": NaN, + "learning_rate": 0.00022794321212099458, + "loss": 0.0, + "step": 22315 + }, + { + "epoch": 2.0822991508817768, + "grad_norm": NaN, + "learning_rate": 0.00022793674955003853, + "loss": 0.0, + "step": 22316 + }, + { + "epoch": 2.0823924605766537, + "grad_norm": NaN, + "learning_rate": 0.00022793028678091246, + "loss": 0.0, + "step": 22317 + }, + { + "epoch": 2.082485770271531, + "grad_norm": NaN, + "learning_rate": 0.00022792382381363273, + "loss": 0.0, + "step": 22318 + }, + { + "epoch": 2.0825790799664086, + "grad_norm": NaN, + "learning_rate": 0.0002279173606482158, + "loss": 0.0, + "step": 22319 + }, + { + "epoch": 2.082672389661286, + "grad_norm": NaN, + "learning_rate": 0.0002279108972846781, + "loss": 0.0, + "step": 22320 + }, + { + "epoch": 2.082765699356163, + "grad_norm": NaN, + "learning_rate": 0.00022790443372303608, + "loss": 0.0, + "step": 22321 + }, + { + "epoch": 2.0828590090510404, + "grad_norm": NaN, + "learning_rate": 0.00022789796996330614, + "loss": 0.0, + "step": 22322 + }, + { + "epoch": 2.082952318745918, + "grad_norm": NaN, + "learning_rate": 0.00022789150600550475, + "loss": 0.0, + "step": 22323 + }, + { + "epoch": 2.0830456284407948, + "grad_norm": NaN, + "learning_rate": 0.0002278850418496483, + "loss": 0.0, + "step": 22324 + }, + { + "epoch": 2.083138938135672, + "grad_norm": NaN, + "learning_rate": 0.00022787857749575333, + "loss": 0.0, + "step": 22325 + }, + { + "epoch": 2.0832322478305496, + "grad_norm": NaN, + "learning_rate": 0.00022787211294383617, + "loss": 0.0, + "step": 22326 + }, + { + "epoch": 2.083325557525427, + "grad_norm": NaN, + "learning_rate": 0.0002278656481939133, + "loss": 0.0, + "step": 22327 + }, + { + "epoch": 2.083418867220304, + "grad_norm": NaN, + "learning_rate": 0.00022785918324600112, + "loss": 0.0, + "step": 22328 + }, + { + "epoch": 2.0835121769151814, + "grad_norm": NaN, + "learning_rate": 0.0002278527181001161, + "loss": 0.0, + "step": 22329 + }, + { + "epoch": 2.083605486610059, + "grad_norm": NaN, + "learning_rate": 0.00022784625275627476, + "loss": 0.0, + "step": 22330 + }, + { + "epoch": 2.0836987963049363, + "grad_norm": NaN, + "learning_rate": 0.00022783978721449338, + "loss": 0.0, + "step": 22331 + }, + { + "epoch": 2.0837921059998132, + "grad_norm": NaN, + "learning_rate": 0.0002278333214747885, + "loss": 0.0, + "step": 22332 + }, + { + "epoch": 2.0838854156946907, + "grad_norm": NaN, + "learning_rate": 0.00022782685553717654, + "loss": 0.0, + "step": 22333 + }, + { + "epoch": 2.083978725389568, + "grad_norm": NaN, + "learning_rate": 0.00022782038940167396, + "loss": 0.0, + "step": 22334 + }, + { + "epoch": 2.0840720350844455, + "grad_norm": NaN, + "learning_rate": 0.00022781392306829717, + "loss": 0.0, + "step": 22335 + }, + { + "epoch": 2.0841653447793225, + "grad_norm": NaN, + "learning_rate": 0.0002278074565370626, + "loss": 0.0, + "step": 22336 + }, + { + "epoch": 2.0842586544742, + "grad_norm": NaN, + "learning_rate": 0.00022780098980798675, + "loss": 0.0, + "step": 22337 + }, + { + "epoch": 2.0843519641690773, + "grad_norm": NaN, + "learning_rate": 0.000227794522881086, + "loss": 0.0, + "step": 22338 + }, + { + "epoch": 2.0844452738639543, + "grad_norm": NaN, + "learning_rate": 0.00022778805575637685, + "loss": 0.0, + "step": 22339 + }, + { + "epoch": 2.0845385835588317, + "grad_norm": NaN, + "learning_rate": 0.00022778158843387572, + "loss": 0.0, + "step": 22340 + }, + { + "epoch": 2.084631893253709, + "grad_norm": NaN, + "learning_rate": 0.00022777512091359903, + "loss": 0.0, + "step": 22341 + }, + { + "epoch": 2.0847252029485865, + "grad_norm": NaN, + "learning_rate": 0.00022776865319556322, + "loss": 0.0, + "step": 22342 + }, + { + "epoch": 2.0848185126434635, + "grad_norm": NaN, + "learning_rate": 0.0002277621852797848, + "loss": 0.0, + "step": 22343 + }, + { + "epoch": 2.084911822338341, + "grad_norm": NaN, + "learning_rate": 0.0002277557171662802, + "loss": 0.0, + "step": 22344 + }, + { + "epoch": 2.0850051320332184, + "grad_norm": NaN, + "learning_rate": 0.00022774924885506583, + "loss": 0.0, + "step": 22345 + }, + { + "epoch": 2.0850984417280953, + "grad_norm": NaN, + "learning_rate": 0.00022774278034615812, + "loss": 0.0, + "step": 22346 + }, + { + "epoch": 2.0851917514229728, + "grad_norm": NaN, + "learning_rate": 0.00022773631163957352, + "loss": 0.0, + "step": 22347 + }, + { + "epoch": 2.08528506111785, + "grad_norm": NaN, + "learning_rate": 0.0002277298427353286, + "loss": 0.0, + "step": 22348 + }, + { + "epoch": 2.0853783708127276, + "grad_norm": NaN, + "learning_rate": 0.00022772337363343966, + "loss": 0.0, + "step": 22349 + }, + { + "epoch": 2.0854716805076046, + "grad_norm": NaN, + "learning_rate": 0.0002277169043339232, + "loss": 0.0, + "step": 22350 + }, + { + "epoch": 2.085564990202482, + "grad_norm": NaN, + "learning_rate": 0.0002277104348367957, + "loss": 0.0, + "step": 22351 + }, + { + "epoch": 2.0856582998973594, + "grad_norm": NaN, + "learning_rate": 0.00022770396514207357, + "loss": 0.0, + "step": 22352 + }, + { + "epoch": 2.085751609592237, + "grad_norm": NaN, + "learning_rate": 0.00022769749524977325, + "loss": 0.0, + "step": 22353 + }, + { + "epoch": 2.085844919287114, + "grad_norm": NaN, + "learning_rate": 0.00022769102515991124, + "loss": 0.0, + "step": 22354 + }, + { + "epoch": 2.085938228981991, + "grad_norm": NaN, + "learning_rate": 0.00022768455487250398, + "loss": 0.0, + "step": 22355 + }, + { + "epoch": 2.0860315386768686, + "grad_norm": NaN, + "learning_rate": 0.00022767808438756788, + "loss": 0.0, + "step": 22356 + }, + { + "epoch": 2.0861248483717456, + "grad_norm": NaN, + "learning_rate": 0.00022767161370511945, + "loss": 0.0, + "step": 22357 + }, + { + "epoch": 2.086218158066623, + "grad_norm": NaN, + "learning_rate": 0.00022766514282517508, + "loss": 0.0, + "step": 22358 + }, + { + "epoch": 2.0863114677615004, + "grad_norm": NaN, + "learning_rate": 0.0002276586717477513, + "loss": 0.0, + "step": 22359 + }, + { + "epoch": 2.086404777456378, + "grad_norm": NaN, + "learning_rate": 0.00022765220047286447, + "loss": 0.0, + "step": 22360 + }, + { + "epoch": 2.086498087151255, + "grad_norm": NaN, + "learning_rate": 0.00022764572900053112, + "loss": 0.0, + "step": 22361 + }, + { + "epoch": 2.0865913968461323, + "grad_norm": NaN, + "learning_rate": 0.00022763925733076772, + "loss": 0.0, + "step": 22362 + }, + { + "epoch": 2.0866847065410097, + "grad_norm": NaN, + "learning_rate": 0.00022763278546359067, + "loss": 0.0, + "step": 22363 + }, + { + "epoch": 2.086778016235887, + "grad_norm": NaN, + "learning_rate": 0.0002276263133990164, + "loss": 0.0, + "step": 22364 + }, + { + "epoch": 2.086871325930764, + "grad_norm": NaN, + "learning_rate": 0.00022761984113706145, + "loss": 0.0, + "step": 22365 + }, + { + "epoch": 2.0869646356256415, + "grad_norm": NaN, + "learning_rate": 0.00022761336867774222, + "loss": 0.0, + "step": 22366 + }, + { + "epoch": 2.087057945320519, + "grad_norm": NaN, + "learning_rate": 0.0002276068960210752, + "loss": 0.0, + "step": 22367 + }, + { + "epoch": 2.087151255015396, + "grad_norm": NaN, + "learning_rate": 0.0002276004231670768, + "loss": 0.0, + "step": 22368 + }, + { + "epoch": 2.0872445647102733, + "grad_norm": NaN, + "learning_rate": 0.00022759395011576355, + "loss": 0.0, + "step": 22369 + }, + { + "epoch": 2.0873378744051507, + "grad_norm": NaN, + "learning_rate": 0.00022758747686715185, + "loss": 0.0, + "step": 22370 + }, + { + "epoch": 2.087431184100028, + "grad_norm": NaN, + "learning_rate": 0.00022758100342125818, + "loss": 0.0, + "step": 22371 + }, + { + "epoch": 2.087524493794905, + "grad_norm": NaN, + "learning_rate": 0.00022757452977809897, + "loss": 0.0, + "step": 22372 + }, + { + "epoch": 2.0876178034897825, + "grad_norm": NaN, + "learning_rate": 0.00022756805593769077, + "loss": 0.0, + "step": 22373 + }, + { + "epoch": 2.08771111318466, + "grad_norm": NaN, + "learning_rate": 0.00022756158190004994, + "loss": 0.0, + "step": 22374 + }, + { + "epoch": 2.0878044228795374, + "grad_norm": NaN, + "learning_rate": 0.000227555107665193, + "loss": 0.0, + "step": 22375 + }, + { + "epoch": 2.0878977325744144, + "grad_norm": NaN, + "learning_rate": 0.0002275486332331364, + "loss": 0.0, + "step": 22376 + }, + { + "epoch": 2.0879910422692918, + "grad_norm": NaN, + "learning_rate": 0.0002275421586038966, + "loss": 0.0, + "step": 22377 + }, + { + "epoch": 2.088084351964169, + "grad_norm": NaN, + "learning_rate": 0.00022753568377749004, + "loss": 0.0, + "step": 22378 + }, + { + "epoch": 2.088177661659046, + "grad_norm": NaN, + "learning_rate": 0.00022752920875393316, + "loss": 0.0, + "step": 22379 + }, + { + "epoch": 2.0882709713539236, + "grad_norm": NaN, + "learning_rate": 0.00022752273353324252, + "loss": 0.0, + "step": 22380 + }, + { + "epoch": 2.088364281048801, + "grad_norm": NaN, + "learning_rate": 0.00022751625811543458, + "loss": 0.0, + "step": 22381 + }, + { + "epoch": 2.0884575907436784, + "grad_norm": NaN, + "learning_rate": 0.00022750978250052567, + "loss": 0.0, + "step": 22382 + }, + { + "epoch": 2.0885509004385554, + "grad_norm": NaN, + "learning_rate": 0.0002275033066885324, + "loss": 0.0, + "step": 22383 + }, + { + "epoch": 2.088644210133433, + "grad_norm": NaN, + "learning_rate": 0.00022749683067947116, + "loss": 0.0, + "step": 22384 + }, + { + "epoch": 2.0887375198283102, + "grad_norm": NaN, + "learning_rate": 0.00022749035447335844, + "loss": 0.0, + "step": 22385 + }, + { + "epoch": 2.0888308295231877, + "grad_norm": NaN, + "learning_rate": 0.0002274838780702107, + "loss": 0.0, + "step": 22386 + }, + { + "epoch": 2.0889241392180646, + "grad_norm": NaN, + "learning_rate": 0.00022747740147004443, + "loss": 0.0, + "step": 22387 + }, + { + "epoch": 2.089017448912942, + "grad_norm": NaN, + "learning_rate": 0.00022747092467287604, + "loss": 0.0, + "step": 22388 + }, + { + "epoch": 2.0891107586078195, + "grad_norm": NaN, + "learning_rate": 0.00022746444767872203, + "loss": 0.0, + "step": 22389 + }, + { + "epoch": 2.0892040683026964, + "grad_norm": NaN, + "learning_rate": 0.0002274579704875989, + "loss": 0.0, + "step": 22390 + }, + { + "epoch": 2.089297377997574, + "grad_norm": NaN, + "learning_rate": 0.00022745149309952314, + "loss": 0.0, + "step": 22391 + }, + { + "epoch": 2.0893906876924513, + "grad_norm": NaN, + "learning_rate": 0.00022744501551451112, + "loss": 0.0, + "step": 22392 + }, + { + "epoch": 2.0894839973873287, + "grad_norm": NaN, + "learning_rate": 0.00022743853773257936, + "loss": 0.0, + "step": 22393 + }, + { + "epoch": 2.0895773070822057, + "grad_norm": NaN, + "learning_rate": 0.00022743205975374437, + "loss": 0.0, + "step": 22394 + }, + { + "epoch": 2.089670616777083, + "grad_norm": NaN, + "learning_rate": 0.00022742558157802254, + "loss": 0.0, + "step": 22395 + }, + { + "epoch": 2.0897639264719605, + "grad_norm": NaN, + "learning_rate": 0.00022741910320543045, + "loss": 0.0, + "step": 22396 + }, + { + "epoch": 2.089857236166838, + "grad_norm": NaN, + "learning_rate": 0.00022741262463598454, + "loss": 0.0, + "step": 22397 + }, + { + "epoch": 2.089950545861715, + "grad_norm": NaN, + "learning_rate": 0.00022740614586970118, + "loss": 0.0, + "step": 22398 + }, + { + "epoch": 2.0900438555565923, + "grad_norm": NaN, + "learning_rate": 0.00022739966690659693, + "loss": 0.0, + "step": 22399 + }, + { + "epoch": 2.0901371652514698, + "grad_norm": NaN, + "learning_rate": 0.00022739318774668826, + "loss": 0.0, + "step": 22400 + }, + { + "epoch": 2.0902304749463467, + "grad_norm": NaN, + "learning_rate": 0.00022738670838999168, + "loss": 0.0, + "step": 22401 + }, + { + "epoch": 2.090323784641224, + "grad_norm": NaN, + "learning_rate": 0.00022738022883652358, + "loss": 0.0, + "step": 22402 + }, + { + "epoch": 2.0904170943361016, + "grad_norm": NaN, + "learning_rate": 0.00022737374908630054, + "loss": 0.0, + "step": 22403 + }, + { + "epoch": 2.090510404030979, + "grad_norm": NaN, + "learning_rate": 0.0002273672691393389, + "loss": 0.0, + "step": 22404 + }, + { + "epoch": 2.090603713725856, + "grad_norm": NaN, + "learning_rate": 0.00022736078899565526, + "loss": 0.0, + "step": 22405 + }, + { + "epoch": 2.0906970234207334, + "grad_norm": NaN, + "learning_rate": 0.00022735430865526604, + "loss": 0.0, + "step": 22406 + }, + { + "epoch": 2.090790333115611, + "grad_norm": NaN, + "learning_rate": 0.00022734782811818772, + "loss": 0.0, + "step": 22407 + }, + { + "epoch": 2.090883642810488, + "grad_norm": NaN, + "learning_rate": 0.00022734134738443675, + "loss": 0.0, + "step": 22408 + }, + { + "epoch": 2.090976952505365, + "grad_norm": NaN, + "learning_rate": 0.00022733486645402972, + "loss": 0.0, + "step": 22409 + }, + { + "epoch": 2.0910702622002426, + "grad_norm": NaN, + "learning_rate": 0.00022732838532698297, + "loss": 0.0, + "step": 22410 + }, + { + "epoch": 2.09116357189512, + "grad_norm": NaN, + "learning_rate": 0.00022732190400331307, + "loss": 0.0, + "step": 22411 + }, + { + "epoch": 2.091256881589997, + "grad_norm": NaN, + "learning_rate": 0.00022731542248303647, + "loss": 0.0, + "step": 22412 + }, + { + "epoch": 2.0913501912848744, + "grad_norm": NaN, + "learning_rate": 0.00022730894076616968, + "loss": 0.0, + "step": 22413 + }, + { + "epoch": 2.091443500979752, + "grad_norm": NaN, + "learning_rate": 0.0002273024588527291, + "loss": 0.0, + "step": 22414 + }, + { + "epoch": 2.0915368106746293, + "grad_norm": NaN, + "learning_rate": 0.0002272959767427313, + "loss": 0.0, + "step": 22415 + }, + { + "epoch": 2.0916301203695062, + "grad_norm": NaN, + "learning_rate": 0.0002272894944361927, + "loss": 0.0, + "step": 22416 + }, + { + "epoch": 2.0917234300643837, + "grad_norm": NaN, + "learning_rate": 0.00022728301193312983, + "loss": 0.0, + "step": 22417 + }, + { + "epoch": 2.091816739759261, + "grad_norm": NaN, + "learning_rate": 0.00022727652923355915, + "loss": 0.0, + "step": 22418 + }, + { + "epoch": 2.0919100494541385, + "grad_norm": NaN, + "learning_rate": 0.00022727004633749716, + "loss": 0.0, + "step": 22419 + }, + { + "epoch": 2.0920033591490155, + "grad_norm": NaN, + "learning_rate": 0.0002272635632449603, + "loss": 0.0, + "step": 22420 + }, + { + "epoch": 2.092096668843893, + "grad_norm": NaN, + "learning_rate": 0.00022725707995596516, + "loss": 0.0, + "step": 22421 + }, + { + "epoch": 2.0921899785387703, + "grad_norm": NaN, + "learning_rate": 0.0002272505964705281, + "loss": 0.0, + "step": 22422 + }, + { + "epoch": 2.0922832882336473, + "grad_norm": NaN, + "learning_rate": 0.00022724411278866562, + "loss": 0.0, + "step": 22423 + }, + { + "epoch": 2.0923765979285247, + "grad_norm": NaN, + "learning_rate": 0.00022723762891039428, + "loss": 0.0, + "step": 22424 + }, + { + "epoch": 2.092469907623402, + "grad_norm": NaN, + "learning_rate": 0.00022723114483573056, + "loss": 0.0, + "step": 22425 + }, + { + "epoch": 2.0925632173182795, + "grad_norm": NaN, + "learning_rate": 0.00022722466056469083, + "loss": 0.0, + "step": 22426 + }, + { + "epoch": 2.0926565270131565, + "grad_norm": NaN, + "learning_rate": 0.00022721817609729173, + "loss": 0.0, + "step": 22427 + }, + { + "epoch": 2.092749836708034, + "grad_norm": NaN, + "learning_rate": 0.0002272116914335497, + "loss": 0.0, + "step": 22428 + }, + { + "epoch": 2.0928431464029114, + "grad_norm": NaN, + "learning_rate": 0.00022720520657348116, + "loss": 0.0, + "step": 22429 + }, + { + "epoch": 2.0929364560977888, + "grad_norm": NaN, + "learning_rate": 0.00022719872151710264, + "loss": 0.0, + "step": 22430 + }, + { + "epoch": 2.0930297657926658, + "grad_norm": NaN, + "learning_rate": 0.0002271922362644307, + "loss": 0.0, + "step": 22431 + }, + { + "epoch": 2.093123075487543, + "grad_norm": NaN, + "learning_rate": 0.00022718575081548176, + "loss": 0.0, + "step": 22432 + }, + { + "epoch": 2.0932163851824206, + "grad_norm": NaN, + "learning_rate": 0.00022717926517027228, + "loss": 0.0, + "step": 22433 + }, + { + "epoch": 2.0933096948772976, + "grad_norm": NaN, + "learning_rate": 0.0002271727793288188, + "loss": 0.0, + "step": 22434 + }, + { + "epoch": 2.093403004572175, + "grad_norm": NaN, + "learning_rate": 0.00022716629329113782, + "loss": 0.0, + "step": 22435 + }, + { + "epoch": 2.0934963142670524, + "grad_norm": NaN, + "learning_rate": 0.0002271598070572458, + "loss": 0.0, + "step": 22436 + }, + { + "epoch": 2.09358962396193, + "grad_norm": NaN, + "learning_rate": 0.00022715332062715925, + "loss": 0.0, + "step": 22437 + }, + { + "epoch": 2.093682933656807, + "grad_norm": NaN, + "learning_rate": 0.00022714683400089467, + "loss": 0.0, + "step": 22438 + }, + { + "epoch": 2.093776243351684, + "grad_norm": NaN, + "learning_rate": 0.0002271403471784685, + "loss": 0.0, + "step": 22439 + }, + { + "epoch": 2.0938695530465616, + "grad_norm": NaN, + "learning_rate": 0.00022713386015989735, + "loss": 0.0, + "step": 22440 + }, + { + "epoch": 2.0939628627414386, + "grad_norm": NaN, + "learning_rate": 0.00022712737294519757, + "loss": 0.0, + "step": 22441 + }, + { + "epoch": 2.094056172436316, + "grad_norm": NaN, + "learning_rate": 0.0002271208855343858, + "loss": 0.0, + "step": 22442 + }, + { + "epoch": 2.0941494821311935, + "grad_norm": NaN, + "learning_rate": 0.0002271143979274784, + "loss": 0.0, + "step": 22443 + }, + { + "epoch": 2.094242791826071, + "grad_norm": NaN, + "learning_rate": 0.00022710791012449196, + "loss": 0.0, + "step": 22444 + }, + { + "epoch": 2.094336101520948, + "grad_norm": NaN, + "learning_rate": 0.00022710142212544295, + "loss": 0.0, + "step": 22445 + }, + { + "epoch": 2.0944294112158253, + "grad_norm": NaN, + "learning_rate": 0.00022709493393034783, + "loss": 0.0, + "step": 22446 + }, + { + "epoch": 2.0945227209107027, + "grad_norm": NaN, + "learning_rate": 0.00022708844553922318, + "loss": 0.0, + "step": 22447 + }, + { + "epoch": 2.09461603060558, + "grad_norm": NaN, + "learning_rate": 0.00022708195695208543, + "loss": 0.0, + "step": 22448 + }, + { + "epoch": 2.094709340300457, + "grad_norm": NaN, + "learning_rate": 0.00022707546816895107, + "loss": 0.0, + "step": 22449 + }, + { + "epoch": 2.0948026499953345, + "grad_norm": NaN, + "learning_rate": 0.0002270689791898367, + "loss": 0.0, + "step": 22450 + }, + { + "epoch": 2.094895959690212, + "grad_norm": NaN, + "learning_rate": 0.00022706249001475866, + "loss": 0.0, + "step": 22451 + }, + { + "epoch": 2.0949892693850893, + "grad_norm": NaN, + "learning_rate": 0.00022705600064373352, + "loss": 0.0, + "step": 22452 + }, + { + "epoch": 2.0950825790799663, + "grad_norm": NaN, + "learning_rate": 0.00022704951107677788, + "loss": 0.0, + "step": 22453 + }, + { + "epoch": 2.0951758887748437, + "grad_norm": NaN, + "learning_rate": 0.00022704302131390812, + "loss": 0.0, + "step": 22454 + }, + { + "epoch": 2.095269198469721, + "grad_norm": NaN, + "learning_rate": 0.00022703653135514075, + "loss": 0.0, + "step": 22455 + }, + { + "epoch": 2.095362508164598, + "grad_norm": NaN, + "learning_rate": 0.00022703004120049237, + "loss": 0.0, + "step": 22456 + }, + { + "epoch": 2.0954558178594755, + "grad_norm": NaN, + "learning_rate": 0.00022702355084997934, + "loss": 0.0, + "step": 22457 + }, + { + "epoch": 2.095549127554353, + "grad_norm": NaN, + "learning_rate": 0.0002270170603036183, + "loss": 0.0, + "step": 22458 + }, + { + "epoch": 2.0956424372492304, + "grad_norm": NaN, + "learning_rate": 0.00022701056956142565, + "loss": 0.0, + "step": 22459 + }, + { + "epoch": 2.0957357469441074, + "grad_norm": NaN, + "learning_rate": 0.00022700407862341794, + "loss": 0.0, + "step": 22460 + }, + { + "epoch": 2.0958290566389848, + "grad_norm": NaN, + "learning_rate": 0.00022699758748961167, + "loss": 0.0, + "step": 22461 + }, + { + "epoch": 2.095922366333862, + "grad_norm": NaN, + "learning_rate": 0.0002269910961600233, + "loss": 0.0, + "step": 22462 + }, + { + "epoch": 2.096015676028739, + "grad_norm": NaN, + "learning_rate": 0.00022698460463466946, + "loss": 0.0, + "step": 22463 + }, + { + "epoch": 2.0961089857236166, + "grad_norm": NaN, + "learning_rate": 0.0002269781129135665, + "loss": 0.0, + "step": 22464 + }, + { + "epoch": 2.096202295418494, + "grad_norm": NaN, + "learning_rate": 0.00022697162099673105, + "loss": 0.0, + "step": 22465 + }, + { + "epoch": 2.0962956051133714, + "grad_norm": NaN, + "learning_rate": 0.0002269651288841796, + "loss": 0.0, + "step": 22466 + }, + { + "epoch": 2.0963889148082484, + "grad_norm": NaN, + "learning_rate": 0.00022695863657592858, + "loss": 0.0, + "step": 22467 + }, + { + "epoch": 2.096482224503126, + "grad_norm": NaN, + "learning_rate": 0.0002269521440719945, + "loss": 0.0, + "step": 22468 + }, + { + "epoch": 2.0965755341980032, + "grad_norm": NaN, + "learning_rate": 0.000226945651372394, + "loss": 0.0, + "step": 22469 + }, + { + "epoch": 2.0966688438928807, + "grad_norm": NaN, + "learning_rate": 0.00022693915847714344, + "loss": 0.0, + "step": 22470 + }, + { + "epoch": 2.0967621535877576, + "grad_norm": NaN, + "learning_rate": 0.0002269326653862594, + "loss": 0.0, + "step": 22471 + }, + { + "epoch": 2.096855463282635, + "grad_norm": NaN, + "learning_rate": 0.00022692617209975842, + "loss": 0.0, + "step": 22472 + }, + { + "epoch": 2.0969487729775125, + "grad_norm": NaN, + "learning_rate": 0.00022691967861765696, + "loss": 0.0, + "step": 22473 + }, + { + "epoch": 2.0970420826723895, + "grad_norm": NaN, + "learning_rate": 0.00022691318493997146, + "loss": 0.0, + "step": 22474 + }, + { + "epoch": 2.097135392367267, + "grad_norm": NaN, + "learning_rate": 0.00022690669106671863, + "loss": 0.0, + "step": 22475 + }, + { + "epoch": 2.0972287020621443, + "grad_norm": NaN, + "learning_rate": 0.00022690019699791482, + "loss": 0.0, + "step": 22476 + }, + { + "epoch": 2.0973220117570217, + "grad_norm": NaN, + "learning_rate": 0.00022689370273357655, + "loss": 0.0, + "step": 22477 + }, + { + "epoch": 2.0974153214518987, + "grad_norm": NaN, + "learning_rate": 0.00022688720827372043, + "loss": 0.0, + "step": 22478 + }, + { + "epoch": 2.097508631146776, + "grad_norm": NaN, + "learning_rate": 0.0002268807136183629, + "loss": 0.0, + "step": 22479 + }, + { + "epoch": 2.0976019408416535, + "grad_norm": NaN, + "learning_rate": 0.0002268742187675205, + "loss": 0.0, + "step": 22480 + }, + { + "epoch": 2.097695250536531, + "grad_norm": NaN, + "learning_rate": 0.0002268677237212097, + "loss": 0.0, + "step": 22481 + }, + { + "epoch": 2.097788560231408, + "grad_norm": NaN, + "learning_rate": 0.00022686122847944708, + "loss": 0.0, + "step": 22482 + }, + { + "epoch": 2.0978818699262853, + "grad_norm": NaN, + "learning_rate": 0.0002268547330422491, + "loss": 0.0, + "step": 22483 + }, + { + "epoch": 2.0979751796211628, + "grad_norm": NaN, + "learning_rate": 0.00022684823740963224, + "loss": 0.0, + "step": 22484 + }, + { + "epoch": 2.0980684893160397, + "grad_norm": NaN, + "learning_rate": 0.0002268417415816132, + "loss": 0.0, + "step": 22485 + }, + { + "epoch": 2.098161799010917, + "grad_norm": NaN, + "learning_rate": 0.0002268352455582083, + "loss": 0.0, + "step": 22486 + }, + { + "epoch": 2.0982551087057946, + "grad_norm": NaN, + "learning_rate": 0.00022682874933943413, + "loss": 0.0, + "step": 22487 + }, + { + "epoch": 2.098348418400672, + "grad_norm": NaN, + "learning_rate": 0.00022682225292530725, + "loss": 0.0, + "step": 22488 + }, + { + "epoch": 2.098441728095549, + "grad_norm": NaN, + "learning_rate": 0.00022681575631584413, + "loss": 0.0, + "step": 22489 + }, + { + "epoch": 2.0985350377904264, + "grad_norm": NaN, + "learning_rate": 0.00022680925951106126, + "loss": 0.0, + "step": 22490 + }, + { + "epoch": 2.098628347485304, + "grad_norm": NaN, + "learning_rate": 0.00022680276251097522, + "loss": 0.0, + "step": 22491 + }, + { + "epoch": 2.098721657180181, + "grad_norm": NaN, + "learning_rate": 0.0002267962653156025, + "loss": 0.0, + "step": 22492 + }, + { + "epoch": 2.098814966875058, + "grad_norm": NaN, + "learning_rate": 0.0002267897679249596, + "loss": 0.0, + "step": 22493 + }, + { + "epoch": 2.0989082765699356, + "grad_norm": NaN, + "learning_rate": 0.00022678327033906313, + "loss": 0.0, + "step": 22494 + }, + { + "epoch": 2.099001586264813, + "grad_norm": NaN, + "learning_rate": 0.00022677677255792953, + "loss": 0.0, + "step": 22495 + }, + { + "epoch": 2.09909489595969, + "grad_norm": NaN, + "learning_rate": 0.00022677027458157525, + "loss": 0.0, + "step": 22496 + }, + { + "epoch": 2.0991882056545674, + "grad_norm": NaN, + "learning_rate": 0.000226763776410017, + "loss": 0.0, + "step": 22497 + }, + { + "epoch": 2.099281515349445, + "grad_norm": NaN, + "learning_rate": 0.00022675727804327125, + "loss": 0.0, + "step": 22498 + }, + { + "epoch": 2.0993748250443223, + "grad_norm": NaN, + "learning_rate": 0.00022675077948135436, + "loss": 0.0, + "step": 22499 + }, + { + "epoch": 2.0994681347391992, + "grad_norm": NaN, + "learning_rate": 0.00022674428072428304, + "loss": 0.0, + "step": 22500 + }, + { + "epoch": 2.0995614444340767, + "grad_norm": NaN, + "learning_rate": 0.00022673778177207372, + "loss": 0.0, + "step": 22501 + }, + { + "epoch": 2.099654754128954, + "grad_norm": NaN, + "learning_rate": 0.00022673128262474298, + "loss": 0.0, + "step": 22502 + }, + { + "epoch": 2.0997480638238315, + "grad_norm": NaN, + "learning_rate": 0.00022672478328230727, + "loss": 0.0, + "step": 22503 + }, + { + "epoch": 2.0998413735187085, + "grad_norm": NaN, + "learning_rate": 0.00022671828374478325, + "loss": 0.0, + "step": 22504 + }, + { + "epoch": 2.099934683213586, + "grad_norm": NaN, + "learning_rate": 0.0002267117840121873, + "loss": 0.0, + "step": 22505 + }, + { + "epoch": 2.1000279929084633, + "grad_norm": NaN, + "learning_rate": 0.00022670528408453598, + "loss": 0.0, + "step": 22506 + }, + { + "epoch": 2.1001213026033403, + "grad_norm": NaN, + "learning_rate": 0.00022669878396184588, + "loss": 0.0, + "step": 22507 + }, + { + "epoch": 2.1002146122982177, + "grad_norm": NaN, + "learning_rate": 0.0002266922836441335, + "loss": 0.0, + "step": 22508 + }, + { + "epoch": 2.100307921993095, + "grad_norm": NaN, + "learning_rate": 0.0002266857831314153, + "loss": 0.0, + "step": 22509 + }, + { + "epoch": 2.1004012316879725, + "grad_norm": NaN, + "learning_rate": 0.00022667928242370793, + "loss": 0.0, + "step": 22510 + }, + { + "epoch": 2.1004945413828495, + "grad_norm": NaN, + "learning_rate": 0.0002266727815210279, + "loss": 0.0, + "step": 22511 + }, + { + "epoch": 2.100587851077727, + "grad_norm": NaN, + "learning_rate": 0.0002266662804233916, + "loss": 0.0, + "step": 22512 + }, + { + "epoch": 2.1006811607726044, + "grad_norm": NaN, + "learning_rate": 0.00022665977913081573, + "loss": 0.0, + "step": 22513 + }, + { + "epoch": 2.100774470467482, + "grad_norm": NaN, + "learning_rate": 0.00022665327764331673, + "loss": 0.0, + "step": 22514 + }, + { + "epoch": 2.1008677801623588, + "grad_norm": NaN, + "learning_rate": 0.0002266467759609111, + "loss": 0.0, + "step": 22515 + }, + { + "epoch": 2.100961089857236, + "grad_norm": NaN, + "learning_rate": 0.00022664027408361554, + "loss": 0.0, + "step": 22516 + }, + { + "epoch": 2.1010543995521136, + "grad_norm": NaN, + "learning_rate": 0.0002266337720114464, + "loss": 0.0, + "step": 22517 + }, + { + "epoch": 2.1011477092469906, + "grad_norm": NaN, + "learning_rate": 0.0002266272697444202, + "loss": 0.0, + "step": 22518 + }, + { + "epoch": 2.101241018941868, + "grad_norm": NaN, + "learning_rate": 0.0002266207672825537, + "loss": 0.0, + "step": 22519 + }, + { + "epoch": 2.1013343286367454, + "grad_norm": NaN, + "learning_rate": 0.0002266142646258632, + "loss": 0.0, + "step": 22520 + }, + { + "epoch": 2.101427638331623, + "grad_norm": NaN, + "learning_rate": 0.0002266077617743653, + "loss": 0.0, + "step": 22521 + }, + { + "epoch": 2.1015209480265, + "grad_norm": NaN, + "learning_rate": 0.00022660125872807658, + "loss": 0.0, + "step": 22522 + }, + { + "epoch": 2.101614257721377, + "grad_norm": NaN, + "learning_rate": 0.00022659475548701357, + "loss": 0.0, + "step": 22523 + }, + { + "epoch": 2.1017075674162546, + "grad_norm": NaN, + "learning_rate": 0.00022658825205119275, + "loss": 0.0, + "step": 22524 + }, + { + "epoch": 2.101800877111132, + "grad_norm": NaN, + "learning_rate": 0.0002265817484206307, + "loss": 0.0, + "step": 22525 + }, + { + "epoch": 2.101894186806009, + "grad_norm": NaN, + "learning_rate": 0.00022657524459534402, + "loss": 0.0, + "step": 22526 + }, + { + "epoch": 2.1019874965008865, + "grad_norm": NaN, + "learning_rate": 0.0002265687405753491, + "loss": 0.0, + "step": 22527 + }, + { + "epoch": 2.102080806195764, + "grad_norm": NaN, + "learning_rate": 0.00022656223636066252, + "loss": 0.0, + "step": 22528 + }, + { + "epoch": 2.102174115890641, + "grad_norm": NaN, + "learning_rate": 0.00022655573195130095, + "loss": 0.0, + "step": 22529 + }, + { + "epoch": 2.1022674255855183, + "grad_norm": NaN, + "learning_rate": 0.0002265492273472808, + "loss": 0.0, + "step": 22530 + }, + { + "epoch": 2.1023607352803957, + "grad_norm": NaN, + "learning_rate": 0.0002265427225486186, + "loss": 0.0, + "step": 22531 + }, + { + "epoch": 2.102454044975273, + "grad_norm": NaN, + "learning_rate": 0.000226536217555331, + "loss": 0.0, + "step": 22532 + }, + { + "epoch": 2.10254735467015, + "grad_norm": NaN, + "learning_rate": 0.0002265297123674344, + "loss": 0.0, + "step": 22533 + }, + { + "epoch": 2.1026406643650275, + "grad_norm": NaN, + "learning_rate": 0.00022652320698494542, + "loss": 0.0, + "step": 22534 + }, + { + "epoch": 2.102733974059905, + "grad_norm": NaN, + "learning_rate": 0.00022651670140788063, + "loss": 0.0, + "step": 22535 + }, + { + "epoch": 2.102827283754782, + "grad_norm": NaN, + "learning_rate": 0.00022651019563625654, + "loss": 0.0, + "step": 22536 + }, + { + "epoch": 2.1029205934496593, + "grad_norm": NaN, + "learning_rate": 0.00022650368967008962, + "loss": 0.0, + "step": 22537 + }, + { + "epoch": 2.1030139031445367, + "grad_norm": NaN, + "learning_rate": 0.00022649718350939655, + "loss": 0.0, + "step": 22538 + }, + { + "epoch": 2.103107212839414, + "grad_norm": NaN, + "learning_rate": 0.00022649067715419376, + "loss": 0.0, + "step": 22539 + }, + { + "epoch": 2.103200522534291, + "grad_norm": NaN, + "learning_rate": 0.00022648417060449784, + "loss": 0.0, + "step": 22540 + }, + { + "epoch": 2.1032938322291685, + "grad_norm": NaN, + "learning_rate": 0.00022647766386032532, + "loss": 0.0, + "step": 22541 + }, + { + "epoch": 2.103387141924046, + "grad_norm": NaN, + "learning_rate": 0.0002264711569216928, + "loss": 0.0, + "step": 22542 + }, + { + "epoch": 2.1034804516189234, + "grad_norm": NaN, + "learning_rate": 0.00022646464978861677, + "loss": 0.0, + "step": 22543 + }, + { + "epoch": 2.1035737613138004, + "grad_norm": NaN, + "learning_rate": 0.00022645814246111375, + "loss": 0.0, + "step": 22544 + }, + { + "epoch": 2.103667071008678, + "grad_norm": NaN, + "learning_rate": 0.00022645163493920038, + "loss": 0.0, + "step": 22545 + }, + { + "epoch": 2.103760380703555, + "grad_norm": NaN, + "learning_rate": 0.0002264451272228931, + "loss": 0.0, + "step": 22546 + }, + { + "epoch": 2.1038536903984326, + "grad_norm": NaN, + "learning_rate": 0.00022643861931220848, + "loss": 0.0, + "step": 22547 + }, + { + "epoch": 2.1039470000933096, + "grad_norm": NaN, + "learning_rate": 0.00022643211120716318, + "loss": 0.0, + "step": 22548 + }, + { + "epoch": 2.104040309788187, + "grad_norm": NaN, + "learning_rate": 0.00022642560290777363, + "loss": 0.0, + "step": 22549 + }, + { + "epoch": 2.1041336194830644, + "grad_norm": NaN, + "learning_rate": 0.00022641909441405635, + "loss": 0.0, + "step": 22550 + }, + { + "epoch": 2.1042269291779414, + "grad_norm": NaN, + "learning_rate": 0.000226412585726028, + "loss": 0.0, + "step": 22551 + }, + { + "epoch": 2.104320238872819, + "grad_norm": NaN, + "learning_rate": 0.0002264060768437051, + "loss": 0.0, + "step": 22552 + }, + { + "epoch": 2.1044135485676962, + "grad_norm": NaN, + "learning_rate": 0.00022639956776710412, + "loss": 0.0, + "step": 22553 + }, + { + "epoch": 2.1045068582625737, + "grad_norm": NaN, + "learning_rate": 0.00022639305849624176, + "loss": 0.0, + "step": 22554 + }, + { + "epoch": 2.1046001679574506, + "grad_norm": NaN, + "learning_rate": 0.00022638654903113444, + "loss": 0.0, + "step": 22555 + }, + { + "epoch": 2.104693477652328, + "grad_norm": NaN, + "learning_rate": 0.0002263800393717987, + "loss": 0.0, + "step": 22556 + }, + { + "epoch": 2.1047867873472055, + "grad_norm": NaN, + "learning_rate": 0.00022637352951825117, + "loss": 0.0, + "step": 22557 + }, + { + "epoch": 2.1048800970420825, + "grad_norm": NaN, + "learning_rate": 0.00022636701947050843, + "loss": 0.0, + "step": 22558 + }, + { + "epoch": 2.10497340673696, + "grad_norm": NaN, + "learning_rate": 0.00022636050922858692, + "loss": 0.0, + "step": 22559 + }, + { + "epoch": 2.1050667164318373, + "grad_norm": NaN, + "learning_rate": 0.00022635399879250326, + "loss": 0.0, + "step": 22560 + }, + { + "epoch": 2.1051600261267147, + "grad_norm": NaN, + "learning_rate": 0.00022634748816227404, + "loss": 0.0, + "step": 22561 + }, + { + "epoch": 2.1052533358215917, + "grad_norm": NaN, + "learning_rate": 0.00022634097733791573, + "loss": 0.0, + "step": 22562 + }, + { + "epoch": 2.105346645516469, + "grad_norm": NaN, + "learning_rate": 0.00022633446631944496, + "loss": 0.0, + "step": 22563 + }, + { + "epoch": 2.1054399552113465, + "grad_norm": NaN, + "learning_rate": 0.00022632795510687826, + "loss": 0.0, + "step": 22564 + }, + { + "epoch": 2.105533264906224, + "grad_norm": NaN, + "learning_rate": 0.00022632144370023213, + "loss": 0.0, + "step": 22565 + }, + { + "epoch": 2.105626574601101, + "grad_norm": NaN, + "learning_rate": 0.00022631493209952322, + "loss": 0.0, + "step": 22566 + }, + { + "epoch": 2.1057198842959783, + "grad_norm": NaN, + "learning_rate": 0.00022630842030476808, + "loss": 0.0, + "step": 22567 + }, + { + "epoch": 2.1058131939908558, + "grad_norm": NaN, + "learning_rate": 0.00022630190831598312, + "loss": 0.0, + "step": 22568 + }, + { + "epoch": 2.1059065036857327, + "grad_norm": NaN, + "learning_rate": 0.0002262953961331851, + "loss": 0.0, + "step": 22569 + }, + { + "epoch": 2.10599981338061, + "grad_norm": NaN, + "learning_rate": 0.0002262888837563905, + "loss": 0.0, + "step": 22570 + }, + { + "epoch": 2.1060931230754876, + "grad_norm": NaN, + "learning_rate": 0.00022628237118561584, + "loss": 0.0, + "step": 22571 + }, + { + "epoch": 2.106186432770365, + "grad_norm": NaN, + "learning_rate": 0.00022627585842087766, + "loss": 0.0, + "step": 22572 + }, + { + "epoch": 2.106279742465242, + "grad_norm": NaN, + "learning_rate": 0.00022626934546219262, + "loss": 0.0, + "step": 22573 + }, + { + "epoch": 2.1063730521601194, + "grad_norm": NaN, + "learning_rate": 0.0002262628323095772, + "loss": 0.0, + "step": 22574 + }, + { + "epoch": 2.106466361854997, + "grad_norm": NaN, + "learning_rate": 0.000226256318963048, + "loss": 0.0, + "step": 22575 + }, + { + "epoch": 2.1065596715498742, + "grad_norm": NaN, + "learning_rate": 0.00022624980542262156, + "loss": 0.0, + "step": 22576 + }, + { + "epoch": 2.106652981244751, + "grad_norm": NaN, + "learning_rate": 0.0002262432916883145, + "loss": 0.0, + "step": 22577 + }, + { + "epoch": 2.1067462909396286, + "grad_norm": NaN, + "learning_rate": 0.00022623677776014323, + "loss": 0.0, + "step": 22578 + }, + { + "epoch": 2.106839600634506, + "grad_norm": NaN, + "learning_rate": 0.00022623026363812452, + "loss": 0.0, + "step": 22579 + }, + { + "epoch": 2.106932910329383, + "grad_norm": NaN, + "learning_rate": 0.00022622374932227483, + "loss": 0.0, + "step": 22580 + }, + { + "epoch": 2.1070262200242604, + "grad_norm": NaN, + "learning_rate": 0.00022621723481261065, + "loss": 0.0, + "step": 22581 + }, + { + "epoch": 2.107119529719138, + "grad_norm": NaN, + "learning_rate": 0.00022621072010914863, + "loss": 0.0, + "step": 22582 + }, + { + "epoch": 2.1072128394140153, + "grad_norm": NaN, + "learning_rate": 0.00022620420521190542, + "loss": 0.0, + "step": 22583 + }, + { + "epoch": 2.1073061491088922, + "grad_norm": NaN, + "learning_rate": 0.00022619769012089736, + "loss": 0.0, + "step": 22584 + }, + { + "epoch": 2.1073994588037697, + "grad_norm": NaN, + "learning_rate": 0.00022619117483614121, + "loss": 0.0, + "step": 22585 + }, + { + "epoch": 2.107492768498647, + "grad_norm": NaN, + "learning_rate": 0.00022618465935765348, + "loss": 0.0, + "step": 22586 + }, + { + "epoch": 2.1075860781935245, + "grad_norm": NaN, + "learning_rate": 0.00022617814368545072, + "loss": 0.0, + "step": 22587 + }, + { + "epoch": 2.1076793878884015, + "grad_norm": NaN, + "learning_rate": 0.00022617162781954947, + "loss": 0.0, + "step": 22588 + }, + { + "epoch": 2.107772697583279, + "grad_norm": NaN, + "learning_rate": 0.0002261651117599664, + "loss": 0.0, + "step": 22589 + }, + { + "epoch": 2.1078660072781563, + "grad_norm": NaN, + "learning_rate": 0.00022615859550671798, + "loss": 0.0, + "step": 22590 + }, + { + "epoch": 2.1079593169730333, + "grad_norm": NaN, + "learning_rate": 0.00022615207905982078, + "loss": 0.0, + "step": 22591 + }, + { + "epoch": 2.1080526266679107, + "grad_norm": NaN, + "learning_rate": 0.00022614556241929145, + "loss": 0.0, + "step": 22592 + }, + { + "epoch": 2.108145936362788, + "grad_norm": NaN, + "learning_rate": 0.0002261390455851465, + "loss": 0.0, + "step": 22593 + }, + { + "epoch": 2.1082392460576656, + "grad_norm": NaN, + "learning_rate": 0.00022613252855740247, + "loss": 0.0, + "step": 22594 + }, + { + "epoch": 2.1083325557525425, + "grad_norm": NaN, + "learning_rate": 0.00022612601133607604, + "loss": 0.0, + "step": 22595 + }, + { + "epoch": 2.10842586544742, + "grad_norm": NaN, + "learning_rate": 0.00022611949392118368, + "loss": 0.0, + "step": 22596 + }, + { + "epoch": 2.1085191751422974, + "grad_norm": NaN, + "learning_rate": 0.000226112976312742, + "loss": 0.0, + "step": 22597 + }, + { + "epoch": 2.108612484837175, + "grad_norm": NaN, + "learning_rate": 0.00022610645851076753, + "loss": 0.0, + "step": 22598 + }, + { + "epoch": 2.1087057945320518, + "grad_norm": NaN, + "learning_rate": 0.00022609994051527694, + "loss": 0.0, + "step": 22599 + }, + { + "epoch": 2.108799104226929, + "grad_norm": NaN, + "learning_rate": 0.0002260934223262867, + "loss": 0.0, + "step": 22600 + }, + { + "epoch": 2.1088924139218066, + "grad_norm": NaN, + "learning_rate": 0.00022608690394381343, + "loss": 0.0, + "step": 22601 + }, + { + "epoch": 2.1089857236166836, + "grad_norm": NaN, + "learning_rate": 0.00022608038536787374, + "loss": 0.0, + "step": 22602 + }, + { + "epoch": 2.109079033311561, + "grad_norm": NaN, + "learning_rate": 0.0002260738665984841, + "loss": 0.0, + "step": 22603 + }, + { + "epoch": 2.1091723430064384, + "grad_norm": NaN, + "learning_rate": 0.00022606734763566124, + "loss": 0.0, + "step": 22604 + }, + { + "epoch": 2.109265652701316, + "grad_norm": NaN, + "learning_rate": 0.0002260608284794216, + "loss": 0.0, + "step": 22605 + }, + { + "epoch": 2.109358962396193, + "grad_norm": NaN, + "learning_rate": 0.0002260543091297818, + "loss": 0.0, + "step": 22606 + }, + { + "epoch": 2.1094522720910702, + "grad_norm": NaN, + "learning_rate": 0.00022604778958675842, + "loss": 0.0, + "step": 22607 + }, + { + "epoch": 2.1095455817859476, + "grad_norm": NaN, + "learning_rate": 0.00022604126985036806, + "loss": 0.0, + "step": 22608 + }, + { + "epoch": 2.109638891480825, + "grad_norm": NaN, + "learning_rate": 0.00022603474992062724, + "loss": 0.0, + "step": 22609 + }, + { + "epoch": 2.109732201175702, + "grad_norm": NaN, + "learning_rate": 0.00022602822979755257, + "loss": 0.0, + "step": 22610 + }, + { + "epoch": 2.1098255108705795, + "grad_norm": NaN, + "learning_rate": 0.0002260217094811607, + "loss": 0.0, + "step": 22611 + }, + { + "epoch": 2.109918820565457, + "grad_norm": NaN, + "learning_rate": 0.00022601518897146807, + "loss": 0.0, + "step": 22612 + }, + { + "epoch": 2.110012130260334, + "grad_norm": NaN, + "learning_rate": 0.00022600866826849136, + "loss": 0.0, + "step": 22613 + }, + { + "epoch": 2.1101054399552113, + "grad_norm": NaN, + "learning_rate": 0.00022600214737224713, + "loss": 0.0, + "step": 22614 + }, + { + "epoch": 2.1101987496500887, + "grad_norm": NaN, + "learning_rate": 0.00022599562628275197, + "loss": 0.0, + "step": 22615 + }, + { + "epoch": 2.110292059344966, + "grad_norm": NaN, + "learning_rate": 0.0002259891050000224, + "loss": 0.0, + "step": 22616 + }, + { + "epoch": 2.110385369039843, + "grad_norm": NaN, + "learning_rate": 0.00022598258352407507, + "loss": 0.0, + "step": 22617 + }, + { + "epoch": 2.1104786787347205, + "grad_norm": NaN, + "learning_rate": 0.00022597606185492656, + "loss": 0.0, + "step": 22618 + }, + { + "epoch": 2.110571988429598, + "grad_norm": NaN, + "learning_rate": 0.00022596953999259338, + "loss": 0.0, + "step": 22619 + }, + { + "epoch": 2.1106652981244753, + "grad_norm": NaN, + "learning_rate": 0.00022596301793709216, + "loss": 0.0, + "step": 22620 + }, + { + "epoch": 2.1107586078193523, + "grad_norm": NaN, + "learning_rate": 0.00022595649568843954, + "loss": 0.0, + "step": 22621 + }, + { + "epoch": 2.1108519175142297, + "grad_norm": NaN, + "learning_rate": 0.000225949973246652, + "loss": 0.0, + "step": 22622 + }, + { + "epoch": 2.110945227209107, + "grad_norm": NaN, + "learning_rate": 0.0002259434506117462, + "loss": 0.0, + "step": 22623 + }, + { + "epoch": 2.111038536903984, + "grad_norm": NaN, + "learning_rate": 0.00022593692778373876, + "loss": 0.0, + "step": 22624 + }, + { + "epoch": 2.1111318465988616, + "grad_norm": NaN, + "learning_rate": 0.00022593040476264612, + "loss": 0.0, + "step": 22625 + }, + { + "epoch": 2.111225156293739, + "grad_norm": NaN, + "learning_rate": 0.00022592388154848498, + "loss": 0.0, + "step": 22626 + }, + { + "epoch": 2.1113184659886164, + "grad_norm": NaN, + "learning_rate": 0.0002259173581412719, + "loss": 0.0, + "step": 22627 + }, + { + "epoch": 2.1114117756834934, + "grad_norm": NaN, + "learning_rate": 0.00022591083454102346, + "loss": 0.0, + "step": 22628 + }, + { + "epoch": 2.111505085378371, + "grad_norm": NaN, + "learning_rate": 0.00022590431074775627, + "loss": 0.0, + "step": 22629 + }, + { + "epoch": 2.111598395073248, + "grad_norm": NaN, + "learning_rate": 0.00022589778676148693, + "loss": 0.0, + "step": 22630 + }, + { + "epoch": 2.1116917047681256, + "grad_norm": NaN, + "learning_rate": 0.0002258912625822319, + "loss": 0.0, + "step": 22631 + }, + { + "epoch": 2.1117850144630026, + "grad_norm": NaN, + "learning_rate": 0.00022588473821000797, + "loss": 0.0, + "step": 22632 + }, + { + "epoch": 2.11187832415788, + "grad_norm": NaN, + "learning_rate": 0.00022587821364483158, + "loss": 0.0, + "step": 22633 + }, + { + "epoch": 2.1119716338527574, + "grad_norm": NaN, + "learning_rate": 0.0002258716888867194, + "loss": 0.0, + "step": 22634 + }, + { + "epoch": 2.1120649435476344, + "grad_norm": NaN, + "learning_rate": 0.00022586516393568798, + "loss": 0.0, + "step": 22635 + }, + { + "epoch": 2.112158253242512, + "grad_norm": NaN, + "learning_rate": 0.0002258586387917539, + "loss": 0.0, + "step": 22636 + }, + { + "epoch": 2.1122515629373892, + "grad_norm": NaN, + "learning_rate": 0.0002258521134549338, + "loss": 0.0, + "step": 22637 + }, + { + "epoch": 2.1123448726322667, + "grad_norm": NaN, + "learning_rate": 0.0002258455879252442, + "loss": 0.0, + "step": 22638 + }, + { + "epoch": 2.1124381823271436, + "grad_norm": NaN, + "learning_rate": 0.00022583906220270181, + "loss": 0.0, + "step": 22639 + }, + { + "epoch": 2.112531492022021, + "grad_norm": NaN, + "learning_rate": 0.00022583253628732312, + "loss": 0.0, + "step": 22640 + }, + { + "epoch": 2.1126248017168985, + "grad_norm": NaN, + "learning_rate": 0.00022582601017912473, + "loss": 0.0, + "step": 22641 + }, + { + "epoch": 2.112718111411776, + "grad_norm": NaN, + "learning_rate": 0.00022581948387812327, + "loss": 0.0, + "step": 22642 + }, + { + "epoch": 2.112811421106653, + "grad_norm": NaN, + "learning_rate": 0.00022581295738433534, + "loss": 0.0, + "step": 22643 + }, + { + "epoch": 2.1129047308015303, + "grad_norm": NaN, + "learning_rate": 0.00022580643069777746, + "loss": 0.0, + "step": 22644 + }, + { + "epoch": 2.1129980404964077, + "grad_norm": NaN, + "learning_rate": 0.00022579990381846636, + "loss": 0.0, + "step": 22645 + }, + { + "epoch": 2.1130913501912847, + "grad_norm": NaN, + "learning_rate": 0.0002257933767464185, + "loss": 0.0, + "step": 22646 + }, + { + "epoch": 2.113184659886162, + "grad_norm": NaN, + "learning_rate": 0.00022578684948165056, + "loss": 0.0, + "step": 22647 + }, + { + "epoch": 2.1132779695810395, + "grad_norm": NaN, + "learning_rate": 0.00022578032202417907, + "loss": 0.0, + "step": 22648 + }, + { + "epoch": 2.113371279275917, + "grad_norm": NaN, + "learning_rate": 0.00022577379437402074, + "loss": 0.0, + "step": 22649 + }, + { + "epoch": 2.113464588970794, + "grad_norm": NaN, + "learning_rate": 0.00022576726653119206, + "loss": 0.0, + "step": 22650 + }, + { + "epoch": 2.1135578986656713, + "grad_norm": NaN, + "learning_rate": 0.0002257607384957097, + "loss": 0.0, + "step": 22651 + }, + { + "epoch": 2.1136512083605488, + "grad_norm": NaN, + "learning_rate": 0.00022575421026759017, + "loss": 0.0, + "step": 22652 + }, + { + "epoch": 2.1137445180554257, + "grad_norm": NaN, + "learning_rate": 0.00022574768184685013, + "loss": 0.0, + "step": 22653 + }, + { + "epoch": 2.113837827750303, + "grad_norm": NaN, + "learning_rate": 0.00022574115323350618, + "loss": 0.0, + "step": 22654 + }, + { + "epoch": 2.1139311374451806, + "grad_norm": NaN, + "learning_rate": 0.00022573462442757494, + "loss": 0.0, + "step": 22655 + }, + { + "epoch": 2.114024447140058, + "grad_norm": NaN, + "learning_rate": 0.00022572809542907297, + "loss": 0.0, + "step": 22656 + }, + { + "epoch": 2.114117756834935, + "grad_norm": NaN, + "learning_rate": 0.00022572156623801686, + "loss": 0.0, + "step": 22657 + }, + { + "epoch": 2.1142110665298124, + "grad_norm": NaN, + "learning_rate": 0.00022571503685442324, + "loss": 0.0, + "step": 22658 + }, + { + "epoch": 2.11430437622469, + "grad_norm": NaN, + "learning_rate": 0.00022570850727830877, + "loss": 0.0, + "step": 22659 + }, + { + "epoch": 2.1143976859195672, + "grad_norm": NaN, + "learning_rate": 0.00022570197750968995, + "loss": 0.0, + "step": 22660 + }, + { + "epoch": 2.114490995614444, + "grad_norm": NaN, + "learning_rate": 0.00022569544754858338, + "loss": 0.0, + "step": 22661 + }, + { + "epoch": 2.1145843053093216, + "grad_norm": NaN, + "learning_rate": 0.0002256889173950058, + "loss": 0.0, + "step": 22662 + }, + { + "epoch": 2.114677615004199, + "grad_norm": NaN, + "learning_rate": 0.00022568238704897365, + "loss": 0.0, + "step": 22663 + }, + { + "epoch": 2.1147709246990765, + "grad_norm": NaN, + "learning_rate": 0.00022567585651050366, + "loss": 0.0, + "step": 22664 + }, + { + "epoch": 2.1148642343939534, + "grad_norm": NaN, + "learning_rate": 0.0002256693257796124, + "loss": 0.0, + "step": 22665 + }, + { + "epoch": 2.114957544088831, + "grad_norm": NaN, + "learning_rate": 0.00022566279485631638, + "loss": 0.0, + "step": 22666 + }, + { + "epoch": 2.1150508537837083, + "grad_norm": NaN, + "learning_rate": 0.00022565626374063236, + "loss": 0.0, + "step": 22667 + }, + { + "epoch": 2.1151441634785852, + "grad_norm": NaN, + "learning_rate": 0.00022564973243257682, + "loss": 0.0, + "step": 22668 + }, + { + "epoch": 2.1152374731734627, + "grad_norm": NaN, + "learning_rate": 0.00022564320093216645, + "loss": 0.0, + "step": 22669 + }, + { + "epoch": 2.11533078286834, + "grad_norm": NaN, + "learning_rate": 0.0002256366692394178, + "loss": 0.0, + "step": 22670 + }, + { + "epoch": 2.1154240925632175, + "grad_norm": NaN, + "learning_rate": 0.00022563013735434753, + "loss": 0.0, + "step": 22671 + }, + { + "epoch": 2.1155174022580945, + "grad_norm": NaN, + "learning_rate": 0.0002256236052769722, + "loss": 0.0, + "step": 22672 + }, + { + "epoch": 2.115610711952972, + "grad_norm": NaN, + "learning_rate": 0.00022561707300730848, + "loss": 0.0, + "step": 22673 + }, + { + "epoch": 2.1157040216478493, + "grad_norm": NaN, + "learning_rate": 0.0002256105405453729, + "loss": 0.0, + "step": 22674 + }, + { + "epoch": 2.1157973313427263, + "grad_norm": NaN, + "learning_rate": 0.00022560400789118214, + "loss": 0.0, + "step": 22675 + }, + { + "epoch": 2.1158906410376037, + "grad_norm": NaN, + "learning_rate": 0.00022559747504475276, + "loss": 0.0, + "step": 22676 + }, + { + "epoch": 2.115983950732481, + "grad_norm": NaN, + "learning_rate": 0.00022559094200610145, + "loss": 0.0, + "step": 22677 + }, + { + "epoch": 2.1160772604273586, + "grad_norm": NaN, + "learning_rate": 0.00022558440877524472, + "loss": 0.0, + "step": 22678 + }, + { + "epoch": 2.1161705701222355, + "grad_norm": NaN, + "learning_rate": 0.00022557787535219923, + "loss": 0.0, + "step": 22679 + }, + { + "epoch": 2.116263879817113, + "grad_norm": NaN, + "learning_rate": 0.0002255713417369816, + "loss": 0.0, + "step": 22680 + }, + { + "epoch": 2.1163571895119904, + "grad_norm": NaN, + "learning_rate": 0.00022556480792960843, + "loss": 0.0, + "step": 22681 + }, + { + "epoch": 2.116450499206868, + "grad_norm": NaN, + "learning_rate": 0.00022555827393009635, + "loss": 0.0, + "step": 22682 + }, + { + "epoch": 2.1165438089017448, + "grad_norm": NaN, + "learning_rate": 0.00022555173973846194, + "loss": 0.0, + "step": 22683 + }, + { + "epoch": 2.116637118596622, + "grad_norm": NaN, + "learning_rate": 0.00022554520535472187, + "loss": 0.0, + "step": 22684 + }, + { + "epoch": 2.1167304282914996, + "grad_norm": NaN, + "learning_rate": 0.00022553867077889267, + "loss": 0.0, + "step": 22685 + }, + { + "epoch": 2.1168237379863766, + "grad_norm": NaN, + "learning_rate": 0.00022553213601099104, + "loss": 0.0, + "step": 22686 + }, + { + "epoch": 2.116917047681254, + "grad_norm": NaN, + "learning_rate": 0.00022552560105103356, + "loss": 0.0, + "step": 22687 + }, + { + "epoch": 2.1170103573761314, + "grad_norm": NaN, + "learning_rate": 0.00022551906589903683, + "loss": 0.0, + "step": 22688 + }, + { + "epoch": 2.117103667071009, + "grad_norm": NaN, + "learning_rate": 0.00022551253055501751, + "loss": 0.0, + "step": 22689 + }, + { + "epoch": 2.117196976765886, + "grad_norm": NaN, + "learning_rate": 0.0002255059950189922, + "loss": 0.0, + "step": 22690 + }, + { + "epoch": 2.1172902864607632, + "grad_norm": NaN, + "learning_rate": 0.0002254994592909775, + "loss": 0.0, + "step": 22691 + }, + { + "epoch": 2.1173835961556406, + "grad_norm": NaN, + "learning_rate": 0.00022549292337099, + "loss": 0.0, + "step": 22692 + }, + { + "epoch": 2.117476905850518, + "grad_norm": NaN, + "learning_rate": 0.0002254863872590464, + "loss": 0.0, + "step": 22693 + }, + { + "epoch": 2.117570215545395, + "grad_norm": NaN, + "learning_rate": 0.0002254798509551633, + "loss": 0.0, + "step": 22694 + }, + { + "epoch": 2.1176635252402725, + "grad_norm": NaN, + "learning_rate": 0.00022547331445935722, + "loss": 0.0, + "step": 22695 + }, + { + "epoch": 2.11775683493515, + "grad_norm": NaN, + "learning_rate": 0.00022546677777164493, + "loss": 0.0, + "step": 22696 + }, + { + "epoch": 2.117850144630027, + "grad_norm": NaN, + "learning_rate": 0.00022546024089204297, + "loss": 0.0, + "step": 22697 + }, + { + "epoch": 2.1179434543249043, + "grad_norm": NaN, + "learning_rate": 0.000225453703820568, + "loss": 0.0, + "step": 22698 + }, + { + "epoch": 2.1180367640197817, + "grad_norm": NaN, + "learning_rate": 0.00022544716655723652, + "loss": 0.0, + "step": 22699 + }, + { + "epoch": 2.118130073714659, + "grad_norm": NaN, + "learning_rate": 0.00022544062910206532, + "loss": 0.0, + "step": 22700 + }, + { + "epoch": 2.118223383409536, + "grad_norm": NaN, + "learning_rate": 0.00022543409145507086, + "loss": 0.0, + "step": 22701 + }, + { + "epoch": 2.1183166931044135, + "grad_norm": NaN, + "learning_rate": 0.00022542755361626993, + "loss": 0.0, + "step": 22702 + }, + { + "epoch": 2.118410002799291, + "grad_norm": NaN, + "learning_rate": 0.00022542101558567908, + "loss": 0.0, + "step": 22703 + }, + { + "epoch": 2.1185033124941683, + "grad_norm": NaN, + "learning_rate": 0.0002254144773633149, + "loss": 0.0, + "step": 22704 + }, + { + "epoch": 2.1185966221890453, + "grad_norm": NaN, + "learning_rate": 0.00022540793894919404, + "loss": 0.0, + "step": 22705 + }, + { + "epoch": 2.1186899318839227, + "grad_norm": NaN, + "learning_rate": 0.00022540140034333316, + "loss": 0.0, + "step": 22706 + }, + { + "epoch": 2.1187832415788, + "grad_norm": NaN, + "learning_rate": 0.00022539486154574885, + "loss": 0.0, + "step": 22707 + }, + { + "epoch": 2.118876551273677, + "grad_norm": NaN, + "learning_rate": 0.00022538832255645768, + "loss": 0.0, + "step": 22708 + }, + { + "epoch": 2.1189698609685546, + "grad_norm": NaN, + "learning_rate": 0.00022538178337547636, + "loss": 0.0, + "step": 22709 + }, + { + "epoch": 2.119063170663432, + "grad_norm": NaN, + "learning_rate": 0.00022537524400282152, + "loss": 0.0, + "step": 22710 + }, + { + "epoch": 2.1191564803583094, + "grad_norm": NaN, + "learning_rate": 0.0002253687044385098, + "loss": 0.0, + "step": 22711 + }, + { + "epoch": 2.1192497900531864, + "grad_norm": NaN, + "learning_rate": 0.00022536216468255774, + "loss": 0.0, + "step": 22712 + }, + { + "epoch": 2.119343099748064, + "grad_norm": NaN, + "learning_rate": 0.000225355624734982, + "loss": 0.0, + "step": 22713 + }, + { + "epoch": 2.119436409442941, + "grad_norm": NaN, + "learning_rate": 0.00022534908459579927, + "loss": 0.0, + "step": 22714 + }, + { + "epoch": 2.1195297191378186, + "grad_norm": NaN, + "learning_rate": 0.00022534254426502613, + "loss": 0.0, + "step": 22715 + }, + { + "epoch": 2.1196230288326956, + "grad_norm": NaN, + "learning_rate": 0.0002253360037426792, + "loss": 0.0, + "step": 22716 + }, + { + "epoch": 2.119716338527573, + "grad_norm": NaN, + "learning_rate": 0.00022532946302877512, + "loss": 0.0, + "step": 22717 + }, + { + "epoch": 2.1198096482224504, + "grad_norm": NaN, + "learning_rate": 0.0002253229221233306, + "loss": 0.0, + "step": 22718 + }, + { + "epoch": 2.1199029579173274, + "grad_norm": NaN, + "learning_rate": 0.0002253163810263621, + "loss": 0.0, + "step": 22719 + }, + { + "epoch": 2.119996267612205, + "grad_norm": NaN, + "learning_rate": 0.00022530983973788644, + "loss": 0.0, + "step": 22720 + }, + { + "epoch": 2.1200895773070823, + "grad_norm": NaN, + "learning_rate": 0.00022530329825792012, + "loss": 0.0, + "step": 22721 + }, + { + "epoch": 2.1201828870019597, + "grad_norm": NaN, + "learning_rate": 0.00022529675658647982, + "loss": 0.0, + "step": 22722 + }, + { + "epoch": 2.1202761966968366, + "grad_norm": NaN, + "learning_rate": 0.0002252902147235822, + "loss": 0.0, + "step": 22723 + }, + { + "epoch": 2.120369506391714, + "grad_norm": NaN, + "learning_rate": 0.0002252836726692438, + "loss": 0.0, + "step": 22724 + }, + { + "epoch": 2.1204628160865915, + "grad_norm": NaN, + "learning_rate": 0.00022527713042348141, + "loss": 0.0, + "step": 22725 + }, + { + "epoch": 2.120556125781469, + "grad_norm": NaN, + "learning_rate": 0.0002252705879863115, + "loss": 0.0, + "step": 22726 + }, + { + "epoch": 2.120649435476346, + "grad_norm": NaN, + "learning_rate": 0.00022526404535775084, + "loss": 0.0, + "step": 22727 + }, + { + "epoch": 2.1207427451712233, + "grad_norm": NaN, + "learning_rate": 0.000225257502537816, + "loss": 0.0, + "step": 22728 + }, + { + "epoch": 2.1208360548661007, + "grad_norm": NaN, + "learning_rate": 0.00022525095952652358, + "loss": 0.0, + "step": 22729 + }, + { + "epoch": 2.1209293645609777, + "grad_norm": NaN, + "learning_rate": 0.0002252444163238903, + "loss": 0.0, + "step": 22730 + }, + { + "epoch": 2.121022674255855, + "grad_norm": NaN, + "learning_rate": 0.00022523787292993275, + "loss": 0.0, + "step": 22731 + }, + { + "epoch": 2.1211159839507325, + "grad_norm": NaN, + "learning_rate": 0.00022523132934466756, + "loss": 0.0, + "step": 22732 + }, + { + "epoch": 2.12120929364561, + "grad_norm": NaN, + "learning_rate": 0.0002252247855681114, + "loss": 0.0, + "step": 22733 + }, + { + "epoch": 2.121302603340487, + "grad_norm": NaN, + "learning_rate": 0.00022521824160028092, + "loss": 0.0, + "step": 22734 + }, + { + "epoch": 2.1213959130353643, + "grad_norm": NaN, + "learning_rate": 0.0002252116974411927, + "loss": 0.0, + "step": 22735 + }, + { + "epoch": 2.1214892227302418, + "grad_norm": NaN, + "learning_rate": 0.00022520515309086341, + "loss": 0.0, + "step": 22736 + }, + { + "epoch": 2.121582532425119, + "grad_norm": NaN, + "learning_rate": 0.00022519860854930968, + "loss": 0.0, + "step": 22737 + }, + { + "epoch": 2.121675842119996, + "grad_norm": NaN, + "learning_rate": 0.00022519206381654822, + "loss": 0.0, + "step": 22738 + }, + { + "epoch": 2.1217691518148736, + "grad_norm": NaN, + "learning_rate": 0.00022518551889259556, + "loss": 0.0, + "step": 22739 + }, + { + "epoch": 2.121862461509751, + "grad_norm": NaN, + "learning_rate": 0.00022517897377746841, + "loss": 0.0, + "step": 22740 + }, + { + "epoch": 2.121955771204628, + "grad_norm": NaN, + "learning_rate": 0.0002251724284711834, + "loss": 0.0, + "step": 22741 + }, + { + "epoch": 2.1220490808995054, + "grad_norm": NaN, + "learning_rate": 0.00022516588297375717, + "loss": 0.0, + "step": 22742 + }, + { + "epoch": 2.122142390594383, + "grad_norm": NaN, + "learning_rate": 0.0002251593372852064, + "loss": 0.0, + "step": 22743 + }, + { + "epoch": 2.1222357002892602, + "grad_norm": NaN, + "learning_rate": 0.00022515279140554768, + "loss": 0.0, + "step": 22744 + }, + { + "epoch": 2.122329009984137, + "grad_norm": NaN, + "learning_rate": 0.00022514624533479766, + "loss": 0.0, + "step": 22745 + }, + { + "epoch": 2.1224223196790146, + "grad_norm": NaN, + "learning_rate": 0.000225139699072973, + "loss": 0.0, + "step": 22746 + }, + { + "epoch": 2.122515629373892, + "grad_norm": NaN, + "learning_rate": 0.00022513315262009034, + "loss": 0.0, + "step": 22747 + }, + { + "epoch": 2.122608939068769, + "grad_norm": NaN, + "learning_rate": 0.00022512660597616633, + "loss": 0.0, + "step": 22748 + }, + { + "epoch": 2.1227022487636464, + "grad_norm": NaN, + "learning_rate": 0.00022512005914121763, + "loss": 0.0, + "step": 22749 + }, + { + "epoch": 2.122795558458524, + "grad_norm": NaN, + "learning_rate": 0.00022511351211526085, + "loss": 0.0, + "step": 22750 + }, + { + "epoch": 2.1228888681534013, + "grad_norm": NaN, + "learning_rate": 0.00022510696489831265, + "loss": 0.0, + "step": 22751 + }, + { + "epoch": 2.1229821778482783, + "grad_norm": NaN, + "learning_rate": 0.00022510041749038975, + "loss": 0.0, + "step": 22752 + }, + { + "epoch": 2.1230754875431557, + "grad_norm": NaN, + "learning_rate": 0.00022509386989150867, + "loss": 0.0, + "step": 22753 + }, + { + "epoch": 2.123168797238033, + "grad_norm": NaN, + "learning_rate": 0.00022508732210168615, + "loss": 0.0, + "step": 22754 + }, + { + "epoch": 2.1232621069329105, + "grad_norm": NaN, + "learning_rate": 0.00022508077412093883, + "loss": 0.0, + "step": 22755 + }, + { + "epoch": 2.1233554166277875, + "grad_norm": NaN, + "learning_rate": 0.0002250742259492833, + "loss": 0.0, + "step": 22756 + }, + { + "epoch": 2.123448726322665, + "grad_norm": NaN, + "learning_rate": 0.00022506767758673628, + "loss": 0.0, + "step": 22757 + }, + { + "epoch": 2.1235420360175423, + "grad_norm": NaN, + "learning_rate": 0.00022506112903331442, + "loss": 0.0, + "step": 22758 + }, + { + "epoch": 2.1236353457124197, + "grad_norm": NaN, + "learning_rate": 0.0002250545802890343, + "loss": 0.0, + "step": 22759 + }, + { + "epoch": 2.1237286554072967, + "grad_norm": NaN, + "learning_rate": 0.00022504803135391262, + "loss": 0.0, + "step": 22760 + }, + { + "epoch": 2.123821965102174, + "grad_norm": NaN, + "learning_rate": 0.00022504148222796602, + "loss": 0.0, + "step": 22761 + }, + { + "epoch": 2.1239152747970516, + "grad_norm": NaN, + "learning_rate": 0.00022503493291121116, + "loss": 0.0, + "step": 22762 + }, + { + "epoch": 2.1240085844919285, + "grad_norm": NaN, + "learning_rate": 0.00022502838340366472, + "loss": 0.0, + "step": 22763 + }, + { + "epoch": 2.124101894186806, + "grad_norm": NaN, + "learning_rate": 0.00022502183370534329, + "loss": 0.0, + "step": 22764 + }, + { + "epoch": 2.1241952038816834, + "grad_norm": NaN, + "learning_rate": 0.00022501528381626358, + "loss": 0.0, + "step": 22765 + }, + { + "epoch": 2.124288513576561, + "grad_norm": NaN, + "learning_rate": 0.00022500873373644225, + "loss": 0.0, + "step": 22766 + }, + { + "epoch": 2.1243818232714378, + "grad_norm": NaN, + "learning_rate": 0.00022500218346589588, + "loss": 0.0, + "step": 22767 + }, + { + "epoch": 2.124475132966315, + "grad_norm": NaN, + "learning_rate": 0.00022499563300464119, + "loss": 0.0, + "step": 22768 + }, + { + "epoch": 2.1245684426611926, + "grad_norm": NaN, + "learning_rate": 0.00022498908235269485, + "loss": 0.0, + "step": 22769 + }, + { + "epoch": 2.1246617523560696, + "grad_norm": NaN, + "learning_rate": 0.00022498253151007346, + "loss": 0.0, + "step": 22770 + }, + { + "epoch": 2.124755062050947, + "grad_norm": NaN, + "learning_rate": 0.0002249759804767937, + "loss": 0.0, + "step": 22771 + }, + { + "epoch": 2.1248483717458244, + "grad_norm": NaN, + "learning_rate": 0.00022496942925287225, + "loss": 0.0, + "step": 22772 + }, + { + "epoch": 2.124941681440702, + "grad_norm": NaN, + "learning_rate": 0.00022496287783832576, + "loss": 0.0, + "step": 22773 + }, + { + "epoch": 2.125034991135579, + "grad_norm": NaN, + "learning_rate": 0.00022495632623317083, + "loss": 0.0, + "step": 22774 + }, + { + "epoch": 2.1251283008304562, + "grad_norm": NaN, + "learning_rate": 0.00022494977443742421, + "loss": 0.0, + "step": 22775 + }, + { + "epoch": 2.1252216105253336, + "grad_norm": NaN, + "learning_rate": 0.00022494322245110248, + "loss": 0.0, + "step": 22776 + }, + { + "epoch": 2.125314920220211, + "grad_norm": NaN, + "learning_rate": 0.00022493667027422234, + "loss": 0.0, + "step": 22777 + }, + { + "epoch": 2.125408229915088, + "grad_norm": NaN, + "learning_rate": 0.00022493011790680048, + "loss": 0.0, + "step": 22778 + }, + { + "epoch": 2.1255015396099655, + "grad_norm": NaN, + "learning_rate": 0.0002249235653488535, + "loss": 0.0, + "step": 22779 + }, + { + "epoch": 2.125594849304843, + "grad_norm": NaN, + "learning_rate": 0.0002249170126003981, + "loss": 0.0, + "step": 22780 + }, + { + "epoch": 2.1256881589997203, + "grad_norm": NaN, + "learning_rate": 0.00022491045966145092, + "loss": 0.0, + "step": 22781 + }, + { + "epoch": 2.1257814686945973, + "grad_norm": NaN, + "learning_rate": 0.00022490390653202865, + "loss": 0.0, + "step": 22782 + }, + { + "epoch": 2.1258747783894747, + "grad_norm": NaN, + "learning_rate": 0.0002248973532121479, + "loss": 0.0, + "step": 22783 + }, + { + "epoch": 2.125968088084352, + "grad_norm": NaN, + "learning_rate": 0.00022489079970182537, + "loss": 0.0, + "step": 22784 + }, + { + "epoch": 2.126061397779229, + "grad_norm": NaN, + "learning_rate": 0.00022488424600107775, + "loss": 0.0, + "step": 22785 + }, + { + "epoch": 2.1261547074741065, + "grad_norm": NaN, + "learning_rate": 0.00022487769210992166, + "loss": 0.0, + "step": 22786 + }, + { + "epoch": 2.126248017168984, + "grad_norm": NaN, + "learning_rate": 0.00022487113802837377, + "loss": 0.0, + "step": 22787 + }, + { + "epoch": 2.1263413268638613, + "grad_norm": NaN, + "learning_rate": 0.00022486458375645078, + "loss": 0.0, + "step": 22788 + }, + { + "epoch": 2.1264346365587383, + "grad_norm": NaN, + "learning_rate": 0.0002248580292941693, + "loss": 0.0, + "step": 22789 + }, + { + "epoch": 2.1265279462536157, + "grad_norm": NaN, + "learning_rate": 0.00022485147464154604, + "loss": 0.0, + "step": 22790 + }, + { + "epoch": 2.126621255948493, + "grad_norm": NaN, + "learning_rate": 0.00022484491979859764, + "loss": 0.0, + "step": 22791 + }, + { + "epoch": 2.12671456564337, + "grad_norm": NaN, + "learning_rate": 0.00022483836476534078, + "loss": 0.0, + "step": 22792 + }, + { + "epoch": 2.1268078753382476, + "grad_norm": NaN, + "learning_rate": 0.00022483180954179218, + "loss": 0.0, + "step": 22793 + }, + { + "epoch": 2.126901185033125, + "grad_norm": NaN, + "learning_rate": 0.0002248252541279684, + "loss": 0.0, + "step": 22794 + }, + { + "epoch": 2.1269944947280024, + "grad_norm": NaN, + "learning_rate": 0.0002248186985238862, + "loss": 0.0, + "step": 22795 + }, + { + "epoch": 2.1270878044228794, + "grad_norm": NaN, + "learning_rate": 0.00022481214272956218, + "loss": 0.0, + "step": 22796 + }, + { + "epoch": 2.127181114117757, + "grad_norm": NaN, + "learning_rate": 0.00022480558674501305, + "loss": 0.0, + "step": 22797 + }, + { + "epoch": 2.127274423812634, + "grad_norm": NaN, + "learning_rate": 0.00022479903057025545, + "loss": 0.0, + "step": 22798 + }, + { + "epoch": 2.1273677335075116, + "grad_norm": NaN, + "learning_rate": 0.0002247924742053061, + "loss": 0.0, + "step": 22799 + }, + { + "epoch": 2.1274610432023886, + "grad_norm": NaN, + "learning_rate": 0.00022478591765018164, + "loss": 0.0, + "step": 22800 + }, + { + "epoch": 2.127554352897266, + "grad_norm": NaN, + "learning_rate": 0.00022477936090489878, + "loss": 0.0, + "step": 22801 + }, + { + "epoch": 2.1276476625921434, + "grad_norm": NaN, + "learning_rate": 0.0002247728039694741, + "loss": 0.0, + "step": 22802 + }, + { + "epoch": 2.1277409722870204, + "grad_norm": NaN, + "learning_rate": 0.00022476624684392434, + "loss": 0.0, + "step": 22803 + }, + { + "epoch": 2.127834281981898, + "grad_norm": NaN, + "learning_rate": 0.00022475968952826616, + "loss": 0.0, + "step": 22804 + }, + { + "epoch": 2.1279275916767753, + "grad_norm": NaN, + "learning_rate": 0.00022475313202251625, + "loss": 0.0, + "step": 22805 + }, + { + "epoch": 2.1280209013716527, + "grad_norm": NaN, + "learning_rate": 0.00022474657432669125, + "loss": 0.0, + "step": 22806 + }, + { + "epoch": 2.1281142110665296, + "grad_norm": NaN, + "learning_rate": 0.00022474001644080788, + "loss": 0.0, + "step": 22807 + }, + { + "epoch": 2.128207520761407, + "grad_norm": NaN, + "learning_rate": 0.00022473345836488277, + "loss": 0.0, + "step": 22808 + }, + { + "epoch": 2.1283008304562845, + "grad_norm": NaN, + "learning_rate": 0.00022472690009893263, + "loss": 0.0, + "step": 22809 + }, + { + "epoch": 2.128394140151162, + "grad_norm": NaN, + "learning_rate": 0.00022472034164297411, + "loss": 0.0, + "step": 22810 + }, + { + "epoch": 2.128487449846039, + "grad_norm": NaN, + "learning_rate": 0.00022471378299702388, + "loss": 0.0, + "step": 22811 + }, + { + "epoch": 2.1285807595409163, + "grad_norm": NaN, + "learning_rate": 0.0002247072241610987, + "loss": 0.0, + "step": 22812 + }, + { + "epoch": 2.1286740692357937, + "grad_norm": NaN, + "learning_rate": 0.0002247006651352151, + "loss": 0.0, + "step": 22813 + }, + { + "epoch": 2.1287673789306707, + "grad_norm": NaN, + "learning_rate": 0.00022469410591938982, + "loss": 0.0, + "step": 22814 + }, + { + "epoch": 2.128860688625548, + "grad_norm": NaN, + "learning_rate": 0.0002246875465136396, + "loss": 0.0, + "step": 22815 + }, + { + "epoch": 2.1289539983204255, + "grad_norm": NaN, + "learning_rate": 0.0002246809869179811, + "loss": 0.0, + "step": 22816 + }, + { + "epoch": 2.129047308015303, + "grad_norm": NaN, + "learning_rate": 0.00022467442713243094, + "loss": 0.0, + "step": 22817 + }, + { + "epoch": 2.12914061771018, + "grad_norm": NaN, + "learning_rate": 0.0002246678671570058, + "loss": 0.0, + "step": 22818 + }, + { + "epoch": 2.1292339274050573, + "grad_norm": NaN, + "learning_rate": 0.00022466130699172243, + "loss": 0.0, + "step": 22819 + }, + { + "epoch": 2.1293272370999348, + "grad_norm": NaN, + "learning_rate": 0.00022465474663659746, + "loss": 0.0, + "step": 22820 + }, + { + "epoch": 2.129420546794812, + "grad_norm": NaN, + "learning_rate": 0.0002246481860916476, + "loss": 0.0, + "step": 22821 + }, + { + "epoch": 2.129513856489689, + "grad_norm": NaN, + "learning_rate": 0.0002246416253568895, + "loss": 0.0, + "step": 22822 + }, + { + "epoch": 2.1296071661845666, + "grad_norm": NaN, + "learning_rate": 0.00022463506443233988, + "loss": 0.0, + "step": 22823 + }, + { + "epoch": 2.129700475879444, + "grad_norm": NaN, + "learning_rate": 0.00022462850331801538, + "loss": 0.0, + "step": 22824 + }, + { + "epoch": 2.129793785574321, + "grad_norm": NaN, + "learning_rate": 0.0002246219420139327, + "loss": 0.0, + "step": 22825 + }, + { + "epoch": 2.1298870952691984, + "grad_norm": NaN, + "learning_rate": 0.00022461538052010854, + "loss": 0.0, + "step": 22826 + }, + { + "epoch": 2.129980404964076, + "grad_norm": NaN, + "learning_rate": 0.00022460881883655957, + "loss": 0.0, + "step": 22827 + }, + { + "epoch": 2.1300737146589532, + "grad_norm": NaN, + "learning_rate": 0.00022460225696330243, + "loss": 0.0, + "step": 22828 + }, + { + "epoch": 2.13016702435383, + "grad_norm": NaN, + "learning_rate": 0.0002245956949003539, + "loss": 0.0, + "step": 22829 + }, + { + "epoch": 2.1302603340487076, + "grad_norm": NaN, + "learning_rate": 0.0002245891326477306, + "loss": 0.0, + "step": 22830 + }, + { + "epoch": 2.130353643743585, + "grad_norm": NaN, + "learning_rate": 0.00022458257020544924, + "loss": 0.0, + "step": 22831 + }, + { + "epoch": 2.1304469534384625, + "grad_norm": NaN, + "learning_rate": 0.00022457600757352648, + "loss": 0.0, + "step": 22832 + }, + { + "epoch": 2.1305402631333394, + "grad_norm": NaN, + "learning_rate": 0.00022456944475197904, + "loss": 0.0, + "step": 22833 + }, + { + "epoch": 2.130633572828217, + "grad_norm": NaN, + "learning_rate": 0.0002245628817408236, + "loss": 0.0, + "step": 22834 + }, + { + "epoch": 2.1307268825230943, + "grad_norm": NaN, + "learning_rate": 0.00022455631854007677, + "loss": 0.0, + "step": 22835 + }, + { + "epoch": 2.1308201922179713, + "grad_norm": NaN, + "learning_rate": 0.00022454975514975538, + "loss": 0.0, + "step": 22836 + }, + { + "epoch": 2.1309135019128487, + "grad_norm": NaN, + "learning_rate": 0.00022454319156987602, + "loss": 0.0, + "step": 22837 + }, + { + "epoch": 2.131006811607726, + "grad_norm": NaN, + "learning_rate": 0.0002245366278004554, + "loss": 0.0, + "step": 22838 + }, + { + "epoch": 2.1311001213026035, + "grad_norm": NaN, + "learning_rate": 0.00022453006384151025, + "loss": 0.0, + "step": 22839 + }, + { + "epoch": 2.1311934309974805, + "grad_norm": NaN, + "learning_rate": 0.0002245234996930572, + "loss": 0.0, + "step": 22840 + }, + { + "epoch": 2.131286740692358, + "grad_norm": NaN, + "learning_rate": 0.00022451693535511296, + "loss": 0.0, + "step": 22841 + }, + { + "epoch": 2.1313800503872353, + "grad_norm": NaN, + "learning_rate": 0.00022451037082769427, + "loss": 0.0, + "step": 22842 + }, + { + "epoch": 2.1314733600821123, + "grad_norm": NaN, + "learning_rate": 0.0002245038061108177, + "loss": 0.0, + "step": 22843 + }, + { + "epoch": 2.1315666697769897, + "grad_norm": NaN, + "learning_rate": 0.00022449724120450005, + "loss": 0.0, + "step": 22844 + }, + { + "epoch": 2.131659979471867, + "grad_norm": NaN, + "learning_rate": 0.00022449067610875802, + "loss": 0.0, + "step": 22845 + }, + { + "epoch": 2.1317532891667446, + "grad_norm": NaN, + "learning_rate": 0.0002244841108236082, + "loss": 0.0, + "step": 22846 + }, + { + "epoch": 2.1318465988616215, + "grad_norm": NaN, + "learning_rate": 0.00022447754534906742, + "loss": 0.0, + "step": 22847 + }, + { + "epoch": 2.131939908556499, + "grad_norm": NaN, + "learning_rate": 0.00022447097968515228, + "loss": 0.0, + "step": 22848 + }, + { + "epoch": 2.1320332182513764, + "grad_norm": NaN, + "learning_rate": 0.0002244644138318795, + "loss": 0.0, + "step": 22849 + }, + { + "epoch": 2.132126527946254, + "grad_norm": NaN, + "learning_rate": 0.00022445784778926575, + "loss": 0.0, + "step": 22850 + }, + { + "epoch": 2.1322198376411308, + "grad_norm": NaN, + "learning_rate": 0.00022445128155732777, + "loss": 0.0, + "step": 22851 + }, + { + "epoch": 2.132313147336008, + "grad_norm": NaN, + "learning_rate": 0.0002244447151360822, + "loss": 0.0, + "step": 22852 + }, + { + "epoch": 2.1324064570308856, + "grad_norm": NaN, + "learning_rate": 0.00022443814852554582, + "loss": 0.0, + "step": 22853 + }, + { + "epoch": 2.132499766725763, + "grad_norm": NaN, + "learning_rate": 0.00022443158172573527, + "loss": 0.0, + "step": 22854 + }, + { + "epoch": 2.13259307642064, + "grad_norm": NaN, + "learning_rate": 0.00022442501473666726, + "loss": 0.0, + "step": 22855 + }, + { + "epoch": 2.1326863861155174, + "grad_norm": NaN, + "learning_rate": 0.00022441844755835848, + "loss": 0.0, + "step": 22856 + }, + { + "epoch": 2.132779695810395, + "grad_norm": NaN, + "learning_rate": 0.00022441188019082557, + "loss": 0.0, + "step": 22857 + }, + { + "epoch": 2.132873005505272, + "grad_norm": NaN, + "learning_rate": 0.0002244053126340854, + "loss": 0.0, + "step": 22858 + }, + { + "epoch": 2.1329663152001492, + "grad_norm": NaN, + "learning_rate": 0.0002243987448881545, + "loss": 0.0, + "step": 22859 + }, + { + "epoch": 2.1330596248950267, + "grad_norm": NaN, + "learning_rate": 0.00022439217695304961, + "loss": 0.0, + "step": 22860 + }, + { + "epoch": 2.133152934589904, + "grad_norm": NaN, + "learning_rate": 0.0002243856088287875, + "loss": 0.0, + "step": 22861 + }, + { + "epoch": 2.133246244284781, + "grad_norm": NaN, + "learning_rate": 0.00022437904051538478, + "loss": 0.0, + "step": 22862 + }, + { + "epoch": 2.1333395539796585, + "grad_norm": NaN, + "learning_rate": 0.00022437247201285824, + "loss": 0.0, + "step": 22863 + }, + { + "epoch": 2.133432863674536, + "grad_norm": NaN, + "learning_rate": 0.00022436590332122451, + "loss": 0.0, + "step": 22864 + }, + { + "epoch": 2.133526173369413, + "grad_norm": NaN, + "learning_rate": 0.00022435933444050026, + "loss": 0.0, + "step": 22865 + }, + { + "epoch": 2.1336194830642903, + "grad_norm": NaN, + "learning_rate": 0.00022435276537070235, + "loss": 0.0, + "step": 22866 + }, + { + "epoch": 2.1337127927591677, + "grad_norm": NaN, + "learning_rate": 0.00022434619611184733, + "loss": 0.0, + "step": 22867 + }, + { + "epoch": 2.133806102454045, + "grad_norm": NaN, + "learning_rate": 0.00022433962666395195, + "loss": 0.0, + "step": 22868 + }, + { + "epoch": 2.133899412148922, + "grad_norm": NaN, + "learning_rate": 0.00022433305702703293, + "loss": 0.0, + "step": 22869 + }, + { + "epoch": 2.1339927218437995, + "grad_norm": NaN, + "learning_rate": 0.00022432648720110697, + "loss": 0.0, + "step": 22870 + }, + { + "epoch": 2.134086031538677, + "grad_norm": NaN, + "learning_rate": 0.00022431991718619074, + "loss": 0.0, + "step": 22871 + }, + { + "epoch": 2.1341793412335544, + "grad_norm": NaN, + "learning_rate": 0.00022431334698230104, + "loss": 0.0, + "step": 22872 + }, + { + "epoch": 2.1342726509284313, + "grad_norm": NaN, + "learning_rate": 0.00022430677658945444, + "loss": 0.0, + "step": 22873 + }, + { + "epoch": 2.1343659606233087, + "grad_norm": NaN, + "learning_rate": 0.00022430020600766774, + "loss": 0.0, + "step": 22874 + }, + { + "epoch": 2.134459270318186, + "grad_norm": NaN, + "learning_rate": 0.00022429363523695766, + "loss": 0.0, + "step": 22875 + }, + { + "epoch": 2.1345525800130636, + "grad_norm": NaN, + "learning_rate": 0.00022428706427734082, + "loss": 0.0, + "step": 22876 + }, + { + "epoch": 2.1346458897079406, + "grad_norm": NaN, + "learning_rate": 0.00022428049312883403, + "loss": 0.0, + "step": 22877 + }, + { + "epoch": 2.134739199402818, + "grad_norm": NaN, + "learning_rate": 0.00022427392179145394, + "loss": 0.0, + "step": 22878 + }, + { + "epoch": 2.1348325090976954, + "grad_norm": NaN, + "learning_rate": 0.00022426735026521725, + "loss": 0.0, + "step": 22879 + }, + { + "epoch": 2.1349258187925724, + "grad_norm": NaN, + "learning_rate": 0.0002242607785501407, + "loss": 0.0, + "step": 22880 + }, + { + "epoch": 2.13501912848745, + "grad_norm": NaN, + "learning_rate": 0.000224254206646241, + "loss": 0.0, + "step": 22881 + }, + { + "epoch": 2.135112438182327, + "grad_norm": NaN, + "learning_rate": 0.00022424763455353478, + "loss": 0.0, + "step": 22882 + }, + { + "epoch": 2.1352057478772046, + "grad_norm": NaN, + "learning_rate": 0.00022424106227203892, + "loss": 0.0, + "step": 22883 + }, + { + "epoch": 2.1352990575720816, + "grad_norm": NaN, + "learning_rate": 0.00022423448980176997, + "loss": 0.0, + "step": 22884 + }, + { + "epoch": 2.135392367266959, + "grad_norm": NaN, + "learning_rate": 0.00022422791714274472, + "loss": 0.0, + "step": 22885 + }, + { + "epoch": 2.1354856769618364, + "grad_norm": NaN, + "learning_rate": 0.00022422134429497988, + "loss": 0.0, + "step": 22886 + }, + { + "epoch": 2.1355789866567134, + "grad_norm": NaN, + "learning_rate": 0.0002242147712584921, + "loss": 0.0, + "step": 22887 + }, + { + "epoch": 2.135672296351591, + "grad_norm": NaN, + "learning_rate": 0.0002242081980332982, + "loss": 0.0, + "step": 22888 + }, + { + "epoch": 2.1357656060464683, + "grad_norm": NaN, + "learning_rate": 0.00022420162461941477, + "loss": 0.0, + "step": 22889 + }, + { + "epoch": 2.1358589157413457, + "grad_norm": NaN, + "learning_rate": 0.00022419505101685863, + "loss": 0.0, + "step": 22890 + }, + { + "epoch": 2.1359522254362227, + "grad_norm": NaN, + "learning_rate": 0.00022418847722564645, + "loss": 0.0, + "step": 22891 + }, + { + "epoch": 2.1360455351311, + "grad_norm": NaN, + "learning_rate": 0.0002241819032457949, + "loss": 0.0, + "step": 22892 + }, + { + "epoch": 2.1361388448259775, + "grad_norm": NaN, + "learning_rate": 0.00022417532907732083, + "loss": 0.0, + "step": 22893 + }, + { + "epoch": 2.136232154520855, + "grad_norm": NaN, + "learning_rate": 0.00022416875472024082, + "loss": 0.0, + "step": 22894 + }, + { + "epoch": 2.136325464215732, + "grad_norm": NaN, + "learning_rate": 0.0002241621801745716, + "loss": 0.0, + "step": 22895 + }, + { + "epoch": 2.1364187739106093, + "grad_norm": NaN, + "learning_rate": 0.00022415560544033, + "loss": 0.0, + "step": 22896 + }, + { + "epoch": 2.1365120836054867, + "grad_norm": NaN, + "learning_rate": 0.00022414903051753263, + "loss": 0.0, + "step": 22897 + }, + { + "epoch": 2.136605393300364, + "grad_norm": NaN, + "learning_rate": 0.00022414245540619624, + "loss": 0.0, + "step": 22898 + }, + { + "epoch": 2.136698702995241, + "grad_norm": NaN, + "learning_rate": 0.00022413588010633756, + "loss": 0.0, + "step": 22899 + }, + { + "epoch": 2.1367920126901185, + "grad_norm": NaN, + "learning_rate": 0.00022412930461797328, + "loss": 0.0, + "step": 22900 + }, + { + "epoch": 2.136885322384996, + "grad_norm": NaN, + "learning_rate": 0.00022412272894112013, + "loss": 0.0, + "step": 22901 + }, + { + "epoch": 2.136978632079873, + "grad_norm": NaN, + "learning_rate": 0.00022411615307579489, + "loss": 0.0, + "step": 22902 + }, + { + "epoch": 2.1370719417747503, + "grad_norm": NaN, + "learning_rate": 0.0002241095770220142, + "loss": 0.0, + "step": 22903 + }, + { + "epoch": 2.1371652514696278, + "grad_norm": NaN, + "learning_rate": 0.00022410300077979473, + "loss": 0.0, + "step": 22904 + }, + { + "epoch": 2.137258561164505, + "grad_norm": NaN, + "learning_rate": 0.0002240964243491534, + "loss": 0.0, + "step": 22905 + }, + { + "epoch": 2.137351870859382, + "grad_norm": NaN, + "learning_rate": 0.00022408984773010675, + "loss": 0.0, + "step": 22906 + }, + { + "epoch": 2.1374451805542596, + "grad_norm": NaN, + "learning_rate": 0.00022408327092267156, + "loss": 0.0, + "step": 22907 + }, + { + "epoch": 2.137538490249137, + "grad_norm": NaN, + "learning_rate": 0.00022407669392686457, + "loss": 0.0, + "step": 22908 + }, + { + "epoch": 2.137631799944014, + "grad_norm": NaN, + "learning_rate": 0.0002240701167427025, + "loss": 0.0, + "step": 22909 + }, + { + "epoch": 2.1377251096388914, + "grad_norm": NaN, + "learning_rate": 0.00022406353937020208, + "loss": 0.0, + "step": 22910 + }, + { + "epoch": 2.137818419333769, + "grad_norm": NaN, + "learning_rate": 0.00022405696180937993, + "loss": 0.0, + "step": 22911 + }, + { + "epoch": 2.1379117290286462, + "grad_norm": NaN, + "learning_rate": 0.00022405038406025294, + "loss": 0.0, + "step": 22912 + }, + { + "epoch": 2.138005038723523, + "grad_norm": NaN, + "learning_rate": 0.00022404380612283778, + "loss": 0.0, + "step": 22913 + }, + { + "epoch": 2.1380983484184006, + "grad_norm": NaN, + "learning_rate": 0.0002240372279971511, + "loss": 0.0, + "step": 22914 + }, + { + "epoch": 2.138191658113278, + "grad_norm": NaN, + "learning_rate": 0.00022403064968320972, + "loss": 0.0, + "step": 22915 + }, + { + "epoch": 2.1382849678081555, + "grad_norm": NaN, + "learning_rate": 0.00022402407118103028, + "loss": 0.0, + "step": 22916 + }, + { + "epoch": 2.1383782775030324, + "grad_norm": NaN, + "learning_rate": 0.00022401749249062957, + "loss": 0.0, + "step": 22917 + }, + { + "epoch": 2.13847158719791, + "grad_norm": NaN, + "learning_rate": 0.00022401091361202435, + "loss": 0.0, + "step": 22918 + }, + { + "epoch": 2.1385648968927873, + "grad_norm": NaN, + "learning_rate": 0.00022400433454523126, + "loss": 0.0, + "step": 22919 + }, + { + "epoch": 2.1386582065876643, + "grad_norm": NaN, + "learning_rate": 0.00022399775529026705, + "loss": 0.0, + "step": 22920 + }, + { + "epoch": 2.1387515162825417, + "grad_norm": NaN, + "learning_rate": 0.00022399117584714852, + "loss": 0.0, + "step": 22921 + }, + { + "epoch": 2.138844825977419, + "grad_norm": NaN, + "learning_rate": 0.00022398459621589232, + "loss": 0.0, + "step": 22922 + }, + { + "epoch": 2.1389381356722965, + "grad_norm": NaN, + "learning_rate": 0.00022397801639651517, + "loss": 0.0, + "step": 22923 + }, + { + "epoch": 2.1390314453671735, + "grad_norm": NaN, + "learning_rate": 0.0002239714363890339, + "loss": 0.0, + "step": 22924 + }, + { + "epoch": 2.139124755062051, + "grad_norm": NaN, + "learning_rate": 0.0002239648561934652, + "loss": 0.0, + "step": 22925 + }, + { + "epoch": 2.1392180647569283, + "grad_norm": NaN, + "learning_rate": 0.00022395827580982565, + "loss": 0.0, + "step": 22926 + }, + { + "epoch": 2.1393113744518057, + "grad_norm": NaN, + "learning_rate": 0.00022395169523813224, + "loss": 0.0, + "step": 22927 + }, + { + "epoch": 2.1394046841466827, + "grad_norm": NaN, + "learning_rate": 0.00022394511447840156, + "loss": 0.0, + "step": 22928 + }, + { + "epoch": 2.13949799384156, + "grad_norm": NaN, + "learning_rate": 0.00022393853353065032, + "loss": 0.0, + "step": 22929 + }, + { + "epoch": 2.1395913035364376, + "grad_norm": NaN, + "learning_rate": 0.00022393195239489526, + "loss": 0.0, + "step": 22930 + }, + { + "epoch": 2.1396846132313145, + "grad_norm": NaN, + "learning_rate": 0.00022392537107115322, + "loss": 0.0, + "step": 22931 + }, + { + "epoch": 2.139777922926192, + "grad_norm": NaN, + "learning_rate": 0.00022391878955944084, + "loss": 0.0, + "step": 22932 + }, + { + "epoch": 2.1398712326210694, + "grad_norm": NaN, + "learning_rate": 0.00022391220785977485, + "loss": 0.0, + "step": 22933 + }, + { + "epoch": 2.139964542315947, + "grad_norm": NaN, + "learning_rate": 0.00022390562597217204, + "loss": 0.0, + "step": 22934 + }, + { + "epoch": 2.1400578520108238, + "grad_norm": NaN, + "learning_rate": 0.00022389904389664907, + "loss": 0.0, + "step": 22935 + }, + { + "epoch": 2.140151161705701, + "grad_norm": NaN, + "learning_rate": 0.00022389246163322273, + "loss": 0.0, + "step": 22936 + }, + { + "epoch": 2.1402444714005786, + "grad_norm": NaN, + "learning_rate": 0.0002238858791819098, + "loss": 0.0, + "step": 22937 + }, + { + "epoch": 2.1403377810954556, + "grad_norm": NaN, + "learning_rate": 0.00022387929654272694, + "loss": 0.0, + "step": 22938 + }, + { + "epoch": 2.140431090790333, + "grad_norm": NaN, + "learning_rate": 0.00022387271371569086, + "loss": 0.0, + "step": 22939 + }, + { + "epoch": 2.1405244004852104, + "grad_norm": NaN, + "learning_rate": 0.00022386613070081844, + "loss": 0.0, + "step": 22940 + }, + { + "epoch": 2.140617710180088, + "grad_norm": NaN, + "learning_rate": 0.0002238595474981263, + "loss": 0.0, + "step": 22941 + }, + { + "epoch": 2.140711019874965, + "grad_norm": NaN, + "learning_rate": 0.00022385296410763118, + "loss": 0.0, + "step": 22942 + }, + { + "epoch": 2.1408043295698422, + "grad_norm": NaN, + "learning_rate": 0.00022384638052934988, + "loss": 0.0, + "step": 22943 + }, + { + "epoch": 2.1408976392647197, + "grad_norm": NaN, + "learning_rate": 0.00022383979676329912, + "loss": 0.0, + "step": 22944 + }, + { + "epoch": 2.140990948959597, + "grad_norm": NaN, + "learning_rate": 0.00022383321280949555, + "loss": 0.0, + "step": 22945 + }, + { + "epoch": 2.141084258654474, + "grad_norm": NaN, + "learning_rate": 0.00022382662866795607, + "loss": 0.0, + "step": 22946 + }, + { + "epoch": 2.1411775683493515, + "grad_norm": NaN, + "learning_rate": 0.00022382004433869735, + "loss": 0.0, + "step": 22947 + }, + { + "epoch": 2.141270878044229, + "grad_norm": NaN, + "learning_rate": 0.00022381345982173606, + "loss": 0.0, + "step": 22948 + }, + { + "epoch": 2.1413641877391063, + "grad_norm": NaN, + "learning_rate": 0.00022380687511708904, + "loss": 0.0, + "step": 22949 + }, + { + "epoch": 2.1414574974339833, + "grad_norm": NaN, + "learning_rate": 0.00022380029022477303, + "loss": 0.0, + "step": 22950 + }, + { + "epoch": 2.1415508071288607, + "grad_norm": NaN, + "learning_rate": 0.0002237937051448047, + "loss": 0.0, + "step": 22951 + }, + { + "epoch": 2.141644116823738, + "grad_norm": NaN, + "learning_rate": 0.00022378711987720083, + "loss": 0.0, + "step": 22952 + }, + { + "epoch": 2.141737426518615, + "grad_norm": NaN, + "learning_rate": 0.00022378053442197822, + "loss": 0.0, + "step": 22953 + }, + { + "epoch": 2.1418307362134925, + "grad_norm": NaN, + "learning_rate": 0.00022377394877915355, + "loss": 0.0, + "step": 22954 + }, + { + "epoch": 2.14192404590837, + "grad_norm": NaN, + "learning_rate": 0.00022376736294874356, + "loss": 0.0, + "step": 22955 + }, + { + "epoch": 2.1420173556032474, + "grad_norm": NaN, + "learning_rate": 0.00022376077693076504, + "loss": 0.0, + "step": 22956 + }, + { + "epoch": 2.1421106652981243, + "grad_norm": NaN, + "learning_rate": 0.00022375419072523468, + "loss": 0.0, + "step": 22957 + }, + { + "epoch": 2.1422039749930017, + "grad_norm": NaN, + "learning_rate": 0.00022374760433216927, + "loss": 0.0, + "step": 22958 + }, + { + "epoch": 2.142297284687879, + "grad_norm": NaN, + "learning_rate": 0.00022374101775158559, + "loss": 0.0, + "step": 22959 + }, + { + "epoch": 2.142390594382756, + "grad_norm": NaN, + "learning_rate": 0.00022373443098350034, + "loss": 0.0, + "step": 22960 + }, + { + "epoch": 2.1424839040776336, + "grad_norm": NaN, + "learning_rate": 0.0002237278440279302, + "loss": 0.0, + "step": 22961 + }, + { + "epoch": 2.142577213772511, + "grad_norm": NaN, + "learning_rate": 0.0002237212568848921, + "loss": 0.0, + "step": 22962 + }, + { + "epoch": 2.1426705234673884, + "grad_norm": NaN, + "learning_rate": 0.0002237146695544026, + "loss": 0.0, + "step": 22963 + }, + { + "epoch": 2.1427638331622654, + "grad_norm": NaN, + "learning_rate": 0.00022370808203647853, + "loss": 0.0, + "step": 22964 + }, + { + "epoch": 2.142857142857143, + "grad_norm": NaN, + "learning_rate": 0.00022370149433113668, + "loss": 0.0, + "step": 22965 + }, + { + "epoch": 2.14295045255202, + "grad_norm": NaN, + "learning_rate": 0.00022369490643839375, + "loss": 0.0, + "step": 22966 + }, + { + "epoch": 2.1430437622468976, + "grad_norm": NaN, + "learning_rate": 0.0002236883183582665, + "loss": 0.0, + "step": 22967 + }, + { + "epoch": 2.1431370719417746, + "grad_norm": NaN, + "learning_rate": 0.00022368173009077168, + "loss": 0.0, + "step": 22968 + }, + { + "epoch": 2.143230381636652, + "grad_norm": NaN, + "learning_rate": 0.00022367514163592608, + "loss": 0.0, + "step": 22969 + }, + { + "epoch": 2.1433236913315294, + "grad_norm": NaN, + "learning_rate": 0.00022366855299374636, + "loss": 0.0, + "step": 22970 + }, + { + "epoch": 2.143417001026407, + "grad_norm": NaN, + "learning_rate": 0.0002236619641642494, + "loss": 0.0, + "step": 22971 + }, + { + "epoch": 2.143510310721284, + "grad_norm": NaN, + "learning_rate": 0.00022365537514745183, + "loss": 0.0, + "step": 22972 + }, + { + "epoch": 2.1436036204161613, + "grad_norm": NaN, + "learning_rate": 0.00022364878594337045, + "loss": 0.0, + "step": 22973 + }, + { + "epoch": 2.1436969301110387, + "grad_norm": NaN, + "learning_rate": 0.00022364219655202208, + "loss": 0.0, + "step": 22974 + }, + { + "epoch": 2.1437902398059157, + "grad_norm": NaN, + "learning_rate": 0.00022363560697342342, + "loss": 0.0, + "step": 22975 + }, + { + "epoch": 2.143883549500793, + "grad_norm": NaN, + "learning_rate": 0.00022362901720759118, + "loss": 0.0, + "step": 22976 + }, + { + "epoch": 2.1439768591956705, + "grad_norm": NaN, + "learning_rate": 0.00022362242725454217, + "loss": 0.0, + "step": 22977 + }, + { + "epoch": 2.144070168890548, + "grad_norm": NaN, + "learning_rate": 0.00022361583711429315, + "loss": 0.0, + "step": 22978 + }, + { + "epoch": 2.144163478585425, + "grad_norm": NaN, + "learning_rate": 0.00022360924678686085, + "loss": 0.0, + "step": 22979 + }, + { + "epoch": 2.1442567882803023, + "grad_norm": NaN, + "learning_rate": 0.000223602656272262, + "loss": 0.0, + "step": 22980 + }, + { + "epoch": 2.1443500979751797, + "grad_norm": NaN, + "learning_rate": 0.0002235960655705135, + "loss": 0.0, + "step": 22981 + }, + { + "epoch": 2.1444434076700567, + "grad_norm": NaN, + "learning_rate": 0.00022358947468163192, + "loss": 0.0, + "step": 22982 + }, + { + "epoch": 2.144536717364934, + "grad_norm": NaN, + "learning_rate": 0.0002235828836056341, + "loss": 0.0, + "step": 22983 + }, + { + "epoch": 2.1446300270598115, + "grad_norm": NaN, + "learning_rate": 0.00022357629234253689, + "loss": 0.0, + "step": 22984 + }, + { + "epoch": 2.144723336754689, + "grad_norm": NaN, + "learning_rate": 0.0002235697008923569, + "loss": 0.0, + "step": 22985 + }, + { + "epoch": 2.144816646449566, + "grad_norm": NaN, + "learning_rate": 0.00022356310925511095, + "loss": 0.0, + "step": 22986 + }, + { + "epoch": 2.1449099561444434, + "grad_norm": NaN, + "learning_rate": 0.0002235565174308158, + "loss": 0.0, + "step": 22987 + }, + { + "epoch": 2.1450032658393208, + "grad_norm": NaN, + "learning_rate": 0.00022354992541948827, + "loss": 0.0, + "step": 22988 + }, + { + "epoch": 2.145096575534198, + "grad_norm": NaN, + "learning_rate": 0.000223543333221145, + "loss": 0.0, + "step": 22989 + }, + { + "epoch": 2.145189885229075, + "grad_norm": NaN, + "learning_rate": 0.00022353674083580284, + "loss": 0.0, + "step": 22990 + }, + { + "epoch": 2.1452831949239526, + "grad_norm": NaN, + "learning_rate": 0.00022353014826347856, + "loss": 0.0, + "step": 22991 + }, + { + "epoch": 2.14537650461883, + "grad_norm": NaN, + "learning_rate": 0.00022352355550418887, + "loss": 0.0, + "step": 22992 + }, + { + "epoch": 2.1454698143137074, + "grad_norm": NaN, + "learning_rate": 0.00022351696255795057, + "loss": 0.0, + "step": 22993 + }, + { + "epoch": 2.1455631240085844, + "grad_norm": NaN, + "learning_rate": 0.0002235103694247804, + "loss": 0.0, + "step": 22994 + }, + { + "epoch": 2.145656433703462, + "grad_norm": NaN, + "learning_rate": 0.00022350377610469514, + "loss": 0.0, + "step": 22995 + }, + { + "epoch": 2.1457497433983392, + "grad_norm": NaN, + "learning_rate": 0.00022349718259771155, + "loss": 0.0, + "step": 22996 + }, + { + "epoch": 2.145843053093216, + "grad_norm": NaN, + "learning_rate": 0.0002234905889038464, + "loss": 0.0, + "step": 22997 + }, + { + "epoch": 2.1459363627880936, + "grad_norm": NaN, + "learning_rate": 0.00022348399502311646, + "loss": 0.0, + "step": 22998 + }, + { + "epoch": 2.146029672482971, + "grad_norm": NaN, + "learning_rate": 0.00022347740095553844, + "loss": 0.0, + "step": 22999 + }, + { + "epoch": 2.1461229821778485, + "grad_norm": NaN, + "learning_rate": 0.0002234708067011292, + "loss": 0.0, + "step": 23000 + }, + { + "epoch": 2.1462162918727254, + "grad_norm": NaN, + "learning_rate": 0.0002234642122599055, + "loss": 0.0, + "step": 23001 + }, + { + "epoch": 2.146309601567603, + "grad_norm": NaN, + "learning_rate": 0.00022345761763188396, + "loss": 0.0, + "step": 23002 + }, + { + "epoch": 2.1464029112624803, + "grad_norm": NaN, + "learning_rate": 0.0002234510228170815, + "loss": 0.0, + "step": 23003 + }, + { + "epoch": 2.1464962209573573, + "grad_norm": NaN, + "learning_rate": 0.00022344442781551492, + "loss": 0.0, + "step": 23004 + }, + { + "epoch": 2.1465895306522347, + "grad_norm": NaN, + "learning_rate": 0.00022343783262720085, + "loss": 0.0, + "step": 23005 + }, + { + "epoch": 2.146682840347112, + "grad_norm": NaN, + "learning_rate": 0.00022343123725215614, + "loss": 0.0, + "step": 23006 + }, + { + "epoch": 2.1467761500419895, + "grad_norm": NaN, + "learning_rate": 0.00022342464169039758, + "loss": 0.0, + "step": 23007 + }, + { + "epoch": 2.1468694597368665, + "grad_norm": NaN, + "learning_rate": 0.00022341804594194181, + "loss": 0.0, + "step": 23008 + }, + { + "epoch": 2.146962769431744, + "grad_norm": NaN, + "learning_rate": 0.0002234114500068058, + "loss": 0.0, + "step": 23009 + }, + { + "epoch": 2.1470560791266213, + "grad_norm": NaN, + "learning_rate": 0.00022340485388500618, + "loss": 0.0, + "step": 23010 + }, + { + "epoch": 2.1471493888214987, + "grad_norm": NaN, + "learning_rate": 0.00022339825757655972, + "loss": 0.0, + "step": 23011 + }, + { + "epoch": 2.1472426985163757, + "grad_norm": NaN, + "learning_rate": 0.0002233916610814833, + "loss": 0.0, + "step": 23012 + }, + { + "epoch": 2.147336008211253, + "grad_norm": NaN, + "learning_rate": 0.00022338506439979362, + "loss": 0.0, + "step": 23013 + }, + { + "epoch": 2.1474293179061306, + "grad_norm": NaN, + "learning_rate": 0.0002233784675315074, + "loss": 0.0, + "step": 23014 + }, + { + "epoch": 2.1475226276010075, + "grad_norm": NaN, + "learning_rate": 0.0002233718704766415, + "loss": 0.0, + "step": 23015 + }, + { + "epoch": 2.147615937295885, + "grad_norm": NaN, + "learning_rate": 0.00022336527323521272, + "loss": 0.0, + "step": 23016 + }, + { + "epoch": 2.1477092469907624, + "grad_norm": NaN, + "learning_rate": 0.00022335867580723772, + "loss": 0.0, + "step": 23017 + }, + { + "epoch": 2.14780255668564, + "grad_norm": NaN, + "learning_rate": 0.00022335207819273333, + "loss": 0.0, + "step": 23018 + }, + { + "epoch": 2.1478958663805168, + "grad_norm": NaN, + "learning_rate": 0.0002233454803917164, + "loss": 0.0, + "step": 23019 + }, + { + "epoch": 2.147989176075394, + "grad_norm": NaN, + "learning_rate": 0.0002233388824042036, + "loss": 0.0, + "step": 23020 + }, + { + "epoch": 2.1480824857702716, + "grad_norm": NaN, + "learning_rate": 0.00022333228423021174, + "loss": 0.0, + "step": 23021 + }, + { + "epoch": 2.148175795465149, + "grad_norm": NaN, + "learning_rate": 0.00022332568586975758, + "loss": 0.0, + "step": 23022 + }, + { + "epoch": 2.148269105160026, + "grad_norm": NaN, + "learning_rate": 0.000223319087322858, + "loss": 0.0, + "step": 23023 + }, + { + "epoch": 2.1483624148549034, + "grad_norm": NaN, + "learning_rate": 0.00022331248858952963, + "loss": 0.0, + "step": 23024 + }, + { + "epoch": 2.148455724549781, + "grad_norm": NaN, + "learning_rate": 0.00022330588966978934, + "loss": 0.0, + "step": 23025 + }, + { + "epoch": 2.148549034244658, + "grad_norm": NaN, + "learning_rate": 0.0002232992905636539, + "loss": 0.0, + "step": 23026 + }, + { + "epoch": 2.1486423439395352, + "grad_norm": NaN, + "learning_rate": 0.00022329269127114004, + "loss": 0.0, + "step": 23027 + }, + { + "epoch": 2.1487356536344127, + "grad_norm": NaN, + "learning_rate": 0.0002232860917922646, + "loss": 0.0, + "step": 23028 + }, + { + "epoch": 2.14882896332929, + "grad_norm": NaN, + "learning_rate": 0.00022327949212704439, + "loss": 0.0, + "step": 23029 + }, + { + "epoch": 2.148922273024167, + "grad_norm": NaN, + "learning_rate": 0.00022327289227549608, + "loss": 0.0, + "step": 23030 + }, + { + "epoch": 2.1490155827190445, + "grad_norm": NaN, + "learning_rate": 0.0002232662922376365, + "loss": 0.0, + "step": 23031 + }, + { + "epoch": 2.149108892413922, + "grad_norm": NaN, + "learning_rate": 0.00022325969201348247, + "loss": 0.0, + "step": 23032 + }, + { + "epoch": 2.1492022021087993, + "grad_norm": NaN, + "learning_rate": 0.00022325309160305075, + "loss": 0.0, + "step": 23033 + }, + { + "epoch": 2.1492955118036763, + "grad_norm": NaN, + "learning_rate": 0.00022324649100635805, + "loss": 0.0, + "step": 23034 + }, + { + "epoch": 2.1493888214985537, + "grad_norm": NaN, + "learning_rate": 0.00022323989022342131, + "loss": 0.0, + "step": 23035 + }, + { + "epoch": 2.149482131193431, + "grad_norm": NaN, + "learning_rate": 0.00022323328925425717, + "loss": 0.0, + "step": 23036 + }, + { + "epoch": 2.149575440888308, + "grad_norm": NaN, + "learning_rate": 0.00022322668809888249, + "loss": 0.0, + "step": 23037 + }, + { + "epoch": 2.1496687505831855, + "grad_norm": NaN, + "learning_rate": 0.00022322008675731405, + "loss": 0.0, + "step": 23038 + }, + { + "epoch": 2.149762060278063, + "grad_norm": NaN, + "learning_rate": 0.0002232134852295686, + "loss": 0.0, + "step": 23039 + }, + { + "epoch": 2.1498553699729404, + "grad_norm": NaN, + "learning_rate": 0.0002232068835156629, + "loss": 0.0, + "step": 23040 + }, + { + "epoch": 2.1499486796678173, + "grad_norm": NaN, + "learning_rate": 0.00022320028161561384, + "loss": 0.0, + "step": 23041 + }, + { + "epoch": 2.1500419893626947, + "grad_norm": NaN, + "learning_rate": 0.00022319367952943815, + "loss": 0.0, + "step": 23042 + }, + { + "epoch": 2.150135299057572, + "grad_norm": NaN, + "learning_rate": 0.00022318707725715257, + "loss": 0.0, + "step": 23043 + }, + { + "epoch": 2.1502286087524496, + "grad_norm": NaN, + "learning_rate": 0.000223180474798774, + "loss": 0.0, + "step": 23044 + }, + { + "epoch": 2.1503219184473266, + "grad_norm": NaN, + "learning_rate": 0.0002231738721543191, + "loss": 0.0, + "step": 23045 + }, + { + "epoch": 2.150415228142204, + "grad_norm": NaN, + "learning_rate": 0.0002231672693238047, + "loss": 0.0, + "step": 23046 + }, + { + "epoch": 2.1505085378370814, + "grad_norm": NaN, + "learning_rate": 0.00022316066630724766, + "loss": 0.0, + "step": 23047 + }, + { + "epoch": 2.1506018475319584, + "grad_norm": NaN, + "learning_rate": 0.00022315406310466473, + "loss": 0.0, + "step": 23048 + }, + { + "epoch": 2.150695157226836, + "grad_norm": NaN, + "learning_rate": 0.00022314745971607265, + "loss": 0.0, + "step": 23049 + }, + { + "epoch": 2.150788466921713, + "grad_norm": NaN, + "learning_rate": 0.00022314085614148822, + "loss": 0.0, + "step": 23050 + }, + { + "epoch": 2.1508817766165906, + "grad_norm": NaN, + "learning_rate": 0.00022313425238092833, + "loss": 0.0, + "step": 23051 + }, + { + "epoch": 2.1509750863114676, + "grad_norm": NaN, + "learning_rate": 0.00022312764843440968, + "loss": 0.0, + "step": 23052 + }, + { + "epoch": 2.151068396006345, + "grad_norm": NaN, + "learning_rate": 0.00022312104430194903, + "loss": 0.0, + "step": 23053 + }, + { + "epoch": 2.1511617057012224, + "grad_norm": NaN, + "learning_rate": 0.0002231144399835633, + "loss": 0.0, + "step": 23054 + }, + { + "epoch": 2.1512550153960994, + "grad_norm": NaN, + "learning_rate": 0.00022310783547926913, + "loss": 0.0, + "step": 23055 + }, + { + "epoch": 2.151348325090977, + "grad_norm": NaN, + "learning_rate": 0.00022310123078908345, + "loss": 0.0, + "step": 23056 + }, + { + "epoch": 2.1514416347858543, + "grad_norm": NaN, + "learning_rate": 0.000223094625913023, + "loss": 0.0, + "step": 23057 + }, + { + "epoch": 2.1515349444807317, + "grad_norm": NaN, + "learning_rate": 0.0002230880208511045, + "loss": 0.0, + "step": 23058 + }, + { + "epoch": 2.1516282541756087, + "grad_norm": NaN, + "learning_rate": 0.00022308141560334487, + "loss": 0.0, + "step": 23059 + }, + { + "epoch": 2.151721563870486, + "grad_norm": NaN, + "learning_rate": 0.00022307481016976084, + "loss": 0.0, + "step": 23060 + }, + { + "epoch": 2.1518148735653635, + "grad_norm": NaN, + "learning_rate": 0.0002230682045503692, + "loss": 0.0, + "step": 23061 + }, + { + "epoch": 2.151908183260241, + "grad_norm": NaN, + "learning_rate": 0.00022306159874518677, + "loss": 0.0, + "step": 23062 + }, + { + "epoch": 2.152001492955118, + "grad_norm": NaN, + "learning_rate": 0.00022305499275423034, + "loss": 0.0, + "step": 23063 + }, + { + "epoch": 2.1520948026499953, + "grad_norm": NaN, + "learning_rate": 0.0002230483865775167, + "loss": 0.0, + "step": 23064 + }, + { + "epoch": 2.1521881123448727, + "grad_norm": NaN, + "learning_rate": 0.00022304178021506266, + "loss": 0.0, + "step": 23065 + }, + { + "epoch": 2.15228142203975, + "grad_norm": NaN, + "learning_rate": 0.00022303517366688497, + "loss": 0.0, + "step": 23066 + }, + { + "epoch": 2.152374731734627, + "grad_norm": NaN, + "learning_rate": 0.00022302856693300052, + "loss": 0.0, + "step": 23067 + }, + { + "epoch": 2.1524680414295045, + "grad_norm": NaN, + "learning_rate": 0.00022302196001342602, + "loss": 0.0, + "step": 23068 + }, + { + "epoch": 2.152561351124382, + "grad_norm": NaN, + "learning_rate": 0.00022301535290817834, + "loss": 0.0, + "step": 23069 + }, + { + "epoch": 2.152654660819259, + "grad_norm": NaN, + "learning_rate": 0.00022300874561727424, + "loss": 0.0, + "step": 23070 + }, + { + "epoch": 2.1527479705141364, + "grad_norm": NaN, + "learning_rate": 0.0002230021381407305, + "loss": 0.0, + "step": 23071 + }, + { + "epoch": 2.1528412802090138, + "grad_norm": NaN, + "learning_rate": 0.00022299553047856396, + "loss": 0.0, + "step": 23072 + }, + { + "epoch": 2.152934589903891, + "grad_norm": NaN, + "learning_rate": 0.00022298892263079143, + "loss": 0.0, + "step": 23073 + }, + { + "epoch": 2.153027899598768, + "grad_norm": NaN, + "learning_rate": 0.00022298231459742966, + "loss": 0.0, + "step": 23074 + }, + { + "epoch": 2.1531212092936456, + "grad_norm": NaN, + "learning_rate": 0.0002229757063784955, + "loss": 0.0, + "step": 23075 + }, + { + "epoch": 2.153214518988523, + "grad_norm": NaN, + "learning_rate": 0.00022296909797400575, + "loss": 0.0, + "step": 23076 + }, + { + "epoch": 2.1533078286834, + "grad_norm": NaN, + "learning_rate": 0.00022296248938397716, + "loss": 0.0, + "step": 23077 + }, + { + "epoch": 2.1534011383782774, + "grad_norm": NaN, + "learning_rate": 0.0002229558806084266, + "loss": 0.0, + "step": 23078 + }, + { + "epoch": 2.153494448073155, + "grad_norm": NaN, + "learning_rate": 0.00022294927164737084, + "loss": 0.0, + "step": 23079 + }, + { + "epoch": 2.1535877577680322, + "grad_norm": NaN, + "learning_rate": 0.0002229426625008267, + "loss": 0.0, + "step": 23080 + }, + { + "epoch": 2.153681067462909, + "grad_norm": NaN, + "learning_rate": 0.00022293605316881093, + "loss": 0.0, + "step": 23081 + }, + { + "epoch": 2.1537743771577866, + "grad_norm": NaN, + "learning_rate": 0.00022292944365134043, + "loss": 0.0, + "step": 23082 + }, + { + "epoch": 2.153867686852664, + "grad_norm": NaN, + "learning_rate": 0.00022292283394843192, + "loss": 0.0, + "step": 23083 + }, + { + "epoch": 2.1539609965475415, + "grad_norm": NaN, + "learning_rate": 0.0002229162240601023, + "loss": 0.0, + "step": 23084 + }, + { + "epoch": 2.1540543062424184, + "grad_norm": NaN, + "learning_rate": 0.0002229096139863683, + "loss": 0.0, + "step": 23085 + }, + { + "epoch": 2.154147615937296, + "grad_norm": NaN, + "learning_rate": 0.00022290300372724675, + "loss": 0.0, + "step": 23086 + }, + { + "epoch": 2.1542409256321733, + "grad_norm": NaN, + "learning_rate": 0.00022289639328275441, + "loss": 0.0, + "step": 23087 + }, + { + "epoch": 2.1543342353270507, + "grad_norm": NaN, + "learning_rate": 0.00022288978265290818, + "loss": 0.0, + "step": 23088 + }, + { + "epoch": 2.1544275450219277, + "grad_norm": NaN, + "learning_rate": 0.00022288317183772485, + "loss": 0.0, + "step": 23089 + }, + { + "epoch": 2.154520854716805, + "grad_norm": NaN, + "learning_rate": 0.00022287656083722115, + "loss": 0.0, + "step": 23090 + }, + { + "epoch": 2.1546141644116825, + "grad_norm": NaN, + "learning_rate": 0.00022286994965141398, + "loss": 0.0, + "step": 23091 + }, + { + "epoch": 2.1547074741065595, + "grad_norm": NaN, + "learning_rate": 0.00022286333828032013, + "loss": 0.0, + "step": 23092 + }, + { + "epoch": 2.154800783801437, + "grad_norm": NaN, + "learning_rate": 0.00022285672672395633, + "loss": 0.0, + "step": 23093 + }, + { + "epoch": 2.1548940934963143, + "grad_norm": NaN, + "learning_rate": 0.00022285011498233945, + "loss": 0.0, + "step": 23094 + }, + { + "epoch": 2.1549874031911918, + "grad_norm": NaN, + "learning_rate": 0.0002228435030554864, + "loss": 0.0, + "step": 23095 + }, + { + "epoch": 2.1550807128860687, + "grad_norm": NaN, + "learning_rate": 0.00022283689094341385, + "loss": 0.0, + "step": 23096 + }, + { + "epoch": 2.155174022580946, + "grad_norm": NaN, + "learning_rate": 0.00022283027864613864, + "loss": 0.0, + "step": 23097 + }, + { + "epoch": 2.1552673322758236, + "grad_norm": NaN, + "learning_rate": 0.00022282366616367764, + "loss": 0.0, + "step": 23098 + }, + { + "epoch": 2.1553606419707005, + "grad_norm": NaN, + "learning_rate": 0.0002228170534960476, + "loss": 0.0, + "step": 23099 + }, + { + "epoch": 2.155453951665578, + "grad_norm": NaN, + "learning_rate": 0.00022281044064326535, + "loss": 0.0, + "step": 23100 + }, + { + "epoch": 2.1555472613604554, + "grad_norm": NaN, + "learning_rate": 0.00022280382760534777, + "loss": 0.0, + "step": 23101 + }, + { + "epoch": 2.155640571055333, + "grad_norm": NaN, + "learning_rate": 0.00022279721438231157, + "loss": 0.0, + "step": 23102 + }, + { + "epoch": 2.1557338807502098, + "grad_norm": NaN, + "learning_rate": 0.00022279060097417364, + "loss": 0.0, + "step": 23103 + }, + { + "epoch": 2.155827190445087, + "grad_norm": NaN, + "learning_rate": 0.0002227839873809508, + "loss": 0.0, + "step": 23104 + }, + { + "epoch": 2.1559205001399646, + "grad_norm": NaN, + "learning_rate": 0.0002227773736026598, + "loss": 0.0, + "step": 23105 + }, + { + "epoch": 2.156013809834842, + "grad_norm": NaN, + "learning_rate": 0.00022277075963931755, + "loss": 0.0, + "step": 23106 + }, + { + "epoch": 2.156107119529719, + "grad_norm": NaN, + "learning_rate": 0.00022276414549094074, + "loss": 0.0, + "step": 23107 + }, + { + "epoch": 2.1562004292245964, + "grad_norm": NaN, + "learning_rate": 0.00022275753115754635, + "loss": 0.0, + "step": 23108 + }, + { + "epoch": 2.156293738919474, + "grad_norm": NaN, + "learning_rate": 0.00022275091663915105, + "loss": 0.0, + "step": 23109 + }, + { + "epoch": 2.1563870486143513, + "grad_norm": NaN, + "learning_rate": 0.0002227443019357717, + "loss": 0.0, + "step": 23110 + }, + { + "epoch": 2.1564803583092282, + "grad_norm": NaN, + "learning_rate": 0.00022273768704742522, + "loss": 0.0, + "step": 23111 + }, + { + "epoch": 2.1565736680041057, + "grad_norm": NaN, + "learning_rate": 0.00022273107197412826, + "loss": 0.0, + "step": 23112 + }, + { + "epoch": 2.156666977698983, + "grad_norm": NaN, + "learning_rate": 0.00022272445671589776, + "loss": 0.0, + "step": 23113 + }, + { + "epoch": 2.15676028739386, + "grad_norm": NaN, + "learning_rate": 0.00022271784127275053, + "loss": 0.0, + "step": 23114 + }, + { + "epoch": 2.1568535970887375, + "grad_norm": NaN, + "learning_rate": 0.00022271122564470336, + "loss": 0.0, + "step": 23115 + }, + { + "epoch": 2.156946906783615, + "grad_norm": NaN, + "learning_rate": 0.00022270460983177308, + "loss": 0.0, + "step": 23116 + }, + { + "epoch": 2.1570402164784923, + "grad_norm": NaN, + "learning_rate": 0.00022269799383397654, + "loss": 0.0, + "step": 23117 + }, + { + "epoch": 2.1571335261733693, + "grad_norm": NaN, + "learning_rate": 0.0002226913776513305, + "loss": 0.0, + "step": 23118 + }, + { + "epoch": 2.1572268358682467, + "grad_norm": NaN, + "learning_rate": 0.00022268476128385183, + "loss": 0.0, + "step": 23119 + }, + { + "epoch": 2.157320145563124, + "grad_norm": NaN, + "learning_rate": 0.00022267814473155733, + "loss": 0.0, + "step": 23120 + }, + { + "epoch": 2.157413455258001, + "grad_norm": NaN, + "learning_rate": 0.00022267152799446382, + "loss": 0.0, + "step": 23121 + }, + { + "epoch": 2.1575067649528785, + "grad_norm": NaN, + "learning_rate": 0.00022266491107258816, + "loss": 0.0, + "step": 23122 + }, + { + "epoch": 2.157600074647756, + "grad_norm": NaN, + "learning_rate": 0.00022265829396594721, + "loss": 0.0, + "step": 23123 + }, + { + "epoch": 2.1576933843426334, + "grad_norm": NaN, + "learning_rate": 0.0002226516766745577, + "loss": 0.0, + "step": 23124 + }, + { + "epoch": 2.1577866940375103, + "grad_norm": NaN, + "learning_rate": 0.00022264505919843648, + "loss": 0.0, + "step": 23125 + }, + { + "epoch": 2.1578800037323878, + "grad_norm": NaN, + "learning_rate": 0.00022263844153760042, + "loss": 0.0, + "step": 23126 + }, + { + "epoch": 2.157973313427265, + "grad_norm": NaN, + "learning_rate": 0.0002226318236920663, + "loss": 0.0, + "step": 23127 + }, + { + "epoch": 2.1580666231221426, + "grad_norm": NaN, + "learning_rate": 0.00022262520566185097, + "loss": 0.0, + "step": 23128 + }, + { + "epoch": 2.1581599328170196, + "grad_norm": NaN, + "learning_rate": 0.00022261858744697125, + "loss": 0.0, + "step": 23129 + }, + { + "epoch": 2.158253242511897, + "grad_norm": NaN, + "learning_rate": 0.00022261196904744401, + "loss": 0.0, + "step": 23130 + }, + { + "epoch": 2.1583465522067744, + "grad_norm": NaN, + "learning_rate": 0.000222605350463286, + "loss": 0.0, + "step": 23131 + }, + { + "epoch": 2.1584398619016514, + "grad_norm": NaN, + "learning_rate": 0.0002225987316945141, + "loss": 0.0, + "step": 23132 + }, + { + "epoch": 2.158533171596529, + "grad_norm": NaN, + "learning_rate": 0.00022259211274114517, + "loss": 0.0, + "step": 23133 + }, + { + "epoch": 2.158626481291406, + "grad_norm": NaN, + "learning_rate": 0.00022258549360319594, + "loss": 0.0, + "step": 23134 + }, + { + "epoch": 2.1587197909862836, + "grad_norm": NaN, + "learning_rate": 0.00022257887428068333, + "loss": 0.0, + "step": 23135 + }, + { + "epoch": 2.1588131006811606, + "grad_norm": NaN, + "learning_rate": 0.00022257225477362414, + "loss": 0.0, + "step": 23136 + }, + { + "epoch": 2.158906410376038, + "grad_norm": NaN, + "learning_rate": 0.0002225656350820352, + "loss": 0.0, + "step": 23137 + }, + { + "epoch": 2.1589997200709155, + "grad_norm": NaN, + "learning_rate": 0.00022255901520593335, + "loss": 0.0, + "step": 23138 + }, + { + "epoch": 2.159093029765793, + "grad_norm": NaN, + "learning_rate": 0.00022255239514533542, + "loss": 0.0, + "step": 23139 + }, + { + "epoch": 2.15918633946067, + "grad_norm": NaN, + "learning_rate": 0.00022254577490025824, + "loss": 0.0, + "step": 23140 + }, + { + "epoch": 2.1592796491555473, + "grad_norm": NaN, + "learning_rate": 0.00022253915447071866, + "loss": 0.0, + "step": 23141 + }, + { + "epoch": 2.1593729588504247, + "grad_norm": NaN, + "learning_rate": 0.00022253253385673348, + "loss": 0.0, + "step": 23142 + }, + { + "epoch": 2.1594662685453017, + "grad_norm": NaN, + "learning_rate": 0.00022252591305831955, + "loss": 0.0, + "step": 23143 + }, + { + "epoch": 2.159559578240179, + "grad_norm": NaN, + "learning_rate": 0.00022251929207549372, + "loss": 0.0, + "step": 23144 + }, + { + "epoch": 2.1596528879350565, + "grad_norm": NaN, + "learning_rate": 0.0002225126709082728, + "loss": 0.0, + "step": 23145 + }, + { + "epoch": 2.159746197629934, + "grad_norm": NaN, + "learning_rate": 0.00022250604955667363, + "loss": 0.0, + "step": 23146 + }, + { + "epoch": 2.159839507324811, + "grad_norm": NaN, + "learning_rate": 0.00022249942802071306, + "loss": 0.0, + "step": 23147 + }, + { + "epoch": 2.1599328170196883, + "grad_norm": NaN, + "learning_rate": 0.00022249280630040792, + "loss": 0.0, + "step": 23148 + }, + { + "epoch": 2.1600261267145657, + "grad_norm": NaN, + "learning_rate": 0.00022248618439577505, + "loss": 0.0, + "step": 23149 + }, + { + "epoch": 2.1601194364094427, + "grad_norm": NaN, + "learning_rate": 0.0002224795623068313, + "loss": 0.0, + "step": 23150 + }, + { + "epoch": 2.16021274610432, + "grad_norm": NaN, + "learning_rate": 0.00022247294003359348, + "loss": 0.0, + "step": 23151 + }, + { + "epoch": 2.1603060557991975, + "grad_norm": NaN, + "learning_rate": 0.00022246631757607844, + "loss": 0.0, + "step": 23152 + }, + { + "epoch": 2.160399365494075, + "grad_norm": NaN, + "learning_rate": 0.00022245969493430302, + "loss": 0.0, + "step": 23153 + }, + { + "epoch": 2.160492675188952, + "grad_norm": NaN, + "learning_rate": 0.00022245307210828406, + "loss": 0.0, + "step": 23154 + }, + { + "epoch": 2.1605859848838294, + "grad_norm": NaN, + "learning_rate": 0.00022244644909803838, + "loss": 0.0, + "step": 23155 + }, + { + "epoch": 2.1606792945787068, + "grad_norm": NaN, + "learning_rate": 0.00022243982590358289, + "loss": 0.0, + "step": 23156 + }, + { + "epoch": 2.160772604273584, + "grad_norm": NaN, + "learning_rate": 0.00022243320252493434, + "loss": 0.0, + "step": 23157 + }, + { + "epoch": 2.160865913968461, + "grad_norm": NaN, + "learning_rate": 0.00022242657896210963, + "loss": 0.0, + "step": 23158 + }, + { + "epoch": 2.1609592236633386, + "grad_norm": NaN, + "learning_rate": 0.0002224199552151256, + "loss": 0.0, + "step": 23159 + }, + { + "epoch": 2.161052533358216, + "grad_norm": NaN, + "learning_rate": 0.00022241333128399904, + "loss": 0.0, + "step": 23160 + }, + { + "epoch": 2.1611458430530934, + "grad_norm": NaN, + "learning_rate": 0.00022240670716874684, + "loss": 0.0, + "step": 23161 + }, + { + "epoch": 2.1612391527479704, + "grad_norm": NaN, + "learning_rate": 0.00022240008286938582, + "loss": 0.0, + "step": 23162 + }, + { + "epoch": 2.161332462442848, + "grad_norm": NaN, + "learning_rate": 0.00022239345838593287, + "loss": 0.0, + "step": 23163 + }, + { + "epoch": 2.1614257721377252, + "grad_norm": NaN, + "learning_rate": 0.0002223868337184048, + "loss": 0.0, + "step": 23164 + }, + { + "epoch": 2.161519081832602, + "grad_norm": NaN, + "learning_rate": 0.00022238020886681842, + "loss": 0.0, + "step": 23165 + }, + { + "epoch": 2.1616123915274796, + "grad_norm": NaN, + "learning_rate": 0.0002223735838311906, + "loss": 0.0, + "step": 23166 + }, + { + "epoch": 2.161705701222357, + "grad_norm": NaN, + "learning_rate": 0.00022236695861153826, + "loss": 0.0, + "step": 23167 + }, + { + "epoch": 2.1617990109172345, + "grad_norm": NaN, + "learning_rate": 0.00022236033320787812, + "loss": 0.0, + "step": 23168 + }, + { + "epoch": 2.1618923206121115, + "grad_norm": NaN, + "learning_rate": 0.0002223537076202271, + "loss": 0.0, + "step": 23169 + }, + { + "epoch": 2.161985630306989, + "grad_norm": NaN, + "learning_rate": 0.00022234708184860205, + "loss": 0.0, + "step": 23170 + }, + { + "epoch": 2.1620789400018663, + "grad_norm": NaN, + "learning_rate": 0.0002223404558930198, + "loss": 0.0, + "step": 23171 + }, + { + "epoch": 2.1621722496967433, + "grad_norm": NaN, + "learning_rate": 0.0002223338297534972, + "loss": 0.0, + "step": 23172 + }, + { + "epoch": 2.1622655593916207, + "grad_norm": NaN, + "learning_rate": 0.0002223272034300511, + "loss": 0.0, + "step": 23173 + }, + { + "epoch": 2.162358869086498, + "grad_norm": NaN, + "learning_rate": 0.00022232057692269833, + "loss": 0.0, + "step": 23174 + }, + { + "epoch": 2.1624521787813755, + "grad_norm": NaN, + "learning_rate": 0.00022231395023145575, + "loss": 0.0, + "step": 23175 + }, + { + "epoch": 2.1625454884762525, + "grad_norm": NaN, + "learning_rate": 0.00022230732335634022, + "loss": 0.0, + "step": 23176 + }, + { + "epoch": 2.16263879817113, + "grad_norm": NaN, + "learning_rate": 0.00022230069629736865, + "loss": 0.0, + "step": 23177 + }, + { + "epoch": 2.1627321078660073, + "grad_norm": NaN, + "learning_rate": 0.00022229406905455775, + "loss": 0.0, + "step": 23178 + }, + { + "epoch": 2.1628254175608848, + "grad_norm": NaN, + "learning_rate": 0.00022228744162792446, + "loss": 0.0, + "step": 23179 + }, + { + "epoch": 2.1629187272557617, + "grad_norm": NaN, + "learning_rate": 0.00022228081401748563, + "loss": 0.0, + "step": 23180 + }, + { + "epoch": 2.163012036950639, + "grad_norm": NaN, + "learning_rate": 0.00022227418622325812, + "loss": 0.0, + "step": 23181 + }, + { + "epoch": 2.1631053466455166, + "grad_norm": NaN, + "learning_rate": 0.00022226755824525875, + "loss": 0.0, + "step": 23182 + }, + { + "epoch": 2.163198656340394, + "grad_norm": NaN, + "learning_rate": 0.00022226093008350436, + "loss": 0.0, + "step": 23183 + }, + { + "epoch": 2.163291966035271, + "grad_norm": NaN, + "learning_rate": 0.00022225430173801185, + "loss": 0.0, + "step": 23184 + }, + { + "epoch": 2.1633852757301484, + "grad_norm": NaN, + "learning_rate": 0.00022224767320879805, + "loss": 0.0, + "step": 23185 + }, + { + "epoch": 2.163478585425026, + "grad_norm": NaN, + "learning_rate": 0.00022224104449587983, + "loss": 0.0, + "step": 23186 + }, + { + "epoch": 2.1635718951199028, + "grad_norm": NaN, + "learning_rate": 0.00022223441559927404, + "loss": 0.0, + "step": 23187 + }, + { + "epoch": 2.16366520481478, + "grad_norm": NaN, + "learning_rate": 0.0002222277865189975, + "loss": 0.0, + "step": 23188 + }, + { + "epoch": 2.1637585145096576, + "grad_norm": NaN, + "learning_rate": 0.00022222115725506713, + "loss": 0.0, + "step": 23189 + }, + { + "epoch": 2.163851824204535, + "grad_norm": NaN, + "learning_rate": 0.00022221452780749973, + "loss": 0.0, + "step": 23190 + }, + { + "epoch": 2.163945133899412, + "grad_norm": NaN, + "learning_rate": 0.00022220789817631217, + "loss": 0.0, + "step": 23191 + }, + { + "epoch": 2.1640384435942894, + "grad_norm": NaN, + "learning_rate": 0.0002222012683615213, + "loss": 0.0, + "step": 23192 + }, + { + "epoch": 2.164131753289167, + "grad_norm": NaN, + "learning_rate": 0.00022219463836314402, + "loss": 0.0, + "step": 23193 + }, + { + "epoch": 2.164225062984044, + "grad_norm": NaN, + "learning_rate": 0.00022218800818119716, + "loss": 0.0, + "step": 23194 + }, + { + "epoch": 2.1643183726789212, + "grad_norm": NaN, + "learning_rate": 0.00022218137781569757, + "loss": 0.0, + "step": 23195 + }, + { + "epoch": 2.1644116823737987, + "grad_norm": NaN, + "learning_rate": 0.00022217474726666213, + "loss": 0.0, + "step": 23196 + }, + { + "epoch": 2.164504992068676, + "grad_norm": NaN, + "learning_rate": 0.00022216811653410769, + "loss": 0.0, + "step": 23197 + }, + { + "epoch": 2.164598301763553, + "grad_norm": NaN, + "learning_rate": 0.00022216148561805108, + "loss": 0.0, + "step": 23198 + }, + { + "epoch": 2.1646916114584305, + "grad_norm": NaN, + "learning_rate": 0.00022215485451850922, + "loss": 0.0, + "step": 23199 + }, + { + "epoch": 2.164784921153308, + "grad_norm": NaN, + "learning_rate": 0.0002221482232354989, + "loss": 0.0, + "step": 23200 + }, + { + "epoch": 2.1648782308481853, + "grad_norm": NaN, + "learning_rate": 0.00022214159176903704, + "loss": 0.0, + "step": 23201 + }, + { + "epoch": 2.1649715405430623, + "grad_norm": NaN, + "learning_rate": 0.0002221349601191405, + "loss": 0.0, + "step": 23202 + }, + { + "epoch": 2.1650648502379397, + "grad_norm": NaN, + "learning_rate": 0.00022212832828582608, + "loss": 0.0, + "step": 23203 + }, + { + "epoch": 2.165158159932817, + "grad_norm": NaN, + "learning_rate": 0.00022212169626911074, + "loss": 0.0, + "step": 23204 + }, + { + "epoch": 2.1652514696276945, + "grad_norm": NaN, + "learning_rate": 0.00022211506406901127, + "loss": 0.0, + "step": 23205 + }, + { + "epoch": 2.1653447793225715, + "grad_norm": NaN, + "learning_rate": 0.00022210843168554454, + "loss": 0.0, + "step": 23206 + }, + { + "epoch": 2.165438089017449, + "grad_norm": NaN, + "learning_rate": 0.00022210179911872743, + "loss": 0.0, + "step": 23207 + }, + { + "epoch": 2.1655313987123264, + "grad_norm": NaN, + "learning_rate": 0.00022209516636857683, + "loss": 0.0, + "step": 23208 + }, + { + "epoch": 2.1656247084072033, + "grad_norm": NaN, + "learning_rate": 0.00022208853343510954, + "loss": 0.0, + "step": 23209 + }, + { + "epoch": 2.1657180181020808, + "grad_norm": NaN, + "learning_rate": 0.00022208190031834248, + "loss": 0.0, + "step": 23210 + }, + { + "epoch": 2.165811327796958, + "grad_norm": NaN, + "learning_rate": 0.0002220752670182925, + "loss": 0.0, + "step": 23211 + }, + { + "epoch": 2.1659046374918356, + "grad_norm": NaN, + "learning_rate": 0.0002220686335349765, + "loss": 0.0, + "step": 23212 + }, + { + "epoch": 2.1659979471867126, + "grad_norm": NaN, + "learning_rate": 0.00022206199986841128, + "loss": 0.0, + "step": 23213 + }, + { + "epoch": 2.16609125688159, + "grad_norm": NaN, + "learning_rate": 0.00022205536601861372, + "loss": 0.0, + "step": 23214 + }, + { + "epoch": 2.1661845665764674, + "grad_norm": NaN, + "learning_rate": 0.00022204873198560077, + "loss": 0.0, + "step": 23215 + }, + { + "epoch": 2.1662778762713444, + "grad_norm": NaN, + "learning_rate": 0.00022204209776938917, + "loss": 0.0, + "step": 23216 + }, + { + "epoch": 2.166371185966222, + "grad_norm": NaN, + "learning_rate": 0.00022203546336999587, + "loss": 0.0, + "step": 23217 + }, + { + "epoch": 2.166464495661099, + "grad_norm": NaN, + "learning_rate": 0.00022202882878743778, + "loss": 0.0, + "step": 23218 + }, + { + "epoch": 2.1665578053559766, + "grad_norm": NaN, + "learning_rate": 0.00022202219402173167, + "loss": 0.0, + "step": 23219 + }, + { + "epoch": 2.1666511150508536, + "grad_norm": NaN, + "learning_rate": 0.00022201555907289445, + "loss": 0.0, + "step": 23220 + }, + { + "epoch": 2.166744424745731, + "grad_norm": NaN, + "learning_rate": 0.00022200892394094298, + "loss": 0.0, + "step": 23221 + }, + { + "epoch": 2.1668377344406085, + "grad_norm": NaN, + "learning_rate": 0.0002220022886258942, + "loss": 0.0, + "step": 23222 + }, + { + "epoch": 2.166931044135486, + "grad_norm": NaN, + "learning_rate": 0.00022199565312776488, + "loss": 0.0, + "step": 23223 + }, + { + "epoch": 2.167024353830363, + "grad_norm": NaN, + "learning_rate": 0.00022198901744657196, + "loss": 0.0, + "step": 23224 + }, + { + "epoch": 2.1671176635252403, + "grad_norm": NaN, + "learning_rate": 0.00022198238158233232, + "loss": 0.0, + "step": 23225 + }, + { + "epoch": 2.1672109732201177, + "grad_norm": NaN, + "learning_rate": 0.00022197574553506276, + "loss": 0.0, + "step": 23226 + }, + { + "epoch": 2.1673042829149947, + "grad_norm": NaN, + "learning_rate": 0.00022196910930478024, + "loss": 0.0, + "step": 23227 + }, + { + "epoch": 2.167397592609872, + "grad_norm": NaN, + "learning_rate": 0.00022196247289150157, + "loss": 0.0, + "step": 23228 + }, + { + "epoch": 2.1674909023047495, + "grad_norm": NaN, + "learning_rate": 0.00022195583629524367, + "loss": 0.0, + "step": 23229 + }, + { + "epoch": 2.167584211999627, + "grad_norm": NaN, + "learning_rate": 0.00022194919951602334, + "loss": 0.0, + "step": 23230 + }, + { + "epoch": 2.167677521694504, + "grad_norm": NaN, + "learning_rate": 0.00022194256255385757, + "loss": 0.0, + "step": 23231 + }, + { + "epoch": 2.1677708313893813, + "grad_norm": NaN, + "learning_rate": 0.00022193592540876314, + "loss": 0.0, + "step": 23232 + }, + { + "epoch": 2.1678641410842587, + "grad_norm": NaN, + "learning_rate": 0.00022192928808075697, + "loss": 0.0, + "step": 23233 + }, + { + "epoch": 2.167957450779136, + "grad_norm": NaN, + "learning_rate": 0.00022192265056985593, + "loss": 0.0, + "step": 23234 + }, + { + "epoch": 2.168050760474013, + "grad_norm": NaN, + "learning_rate": 0.0002219160128760769, + "loss": 0.0, + "step": 23235 + }, + { + "epoch": 2.1681440701688905, + "grad_norm": NaN, + "learning_rate": 0.00022190937499943676, + "loss": 0.0, + "step": 23236 + }, + { + "epoch": 2.168237379863768, + "grad_norm": NaN, + "learning_rate": 0.00022190273693995234, + "loss": 0.0, + "step": 23237 + }, + { + "epoch": 2.168330689558645, + "grad_norm": NaN, + "learning_rate": 0.0002218960986976406, + "loss": 0.0, + "step": 23238 + }, + { + "epoch": 2.1684239992535224, + "grad_norm": NaN, + "learning_rate": 0.00022188946027251836, + "loss": 0.0, + "step": 23239 + }, + { + "epoch": 2.1685173089484, + "grad_norm": NaN, + "learning_rate": 0.00022188282166460253, + "loss": 0.0, + "step": 23240 + }, + { + "epoch": 2.168610618643277, + "grad_norm": NaN, + "learning_rate": 0.00022187618287390995, + "loss": 0.0, + "step": 23241 + }, + { + "epoch": 2.168703928338154, + "grad_norm": NaN, + "learning_rate": 0.00022186954390045755, + "loss": 0.0, + "step": 23242 + }, + { + "epoch": 2.1687972380330316, + "grad_norm": NaN, + "learning_rate": 0.00022186290474426217, + "loss": 0.0, + "step": 23243 + }, + { + "epoch": 2.168890547727909, + "grad_norm": NaN, + "learning_rate": 0.00022185626540534073, + "loss": 0.0, + "step": 23244 + }, + { + "epoch": 2.1689838574227864, + "grad_norm": NaN, + "learning_rate": 0.0002218496258837101, + "loss": 0.0, + "step": 23245 + }, + { + "epoch": 2.1690771671176634, + "grad_norm": NaN, + "learning_rate": 0.00022184298617938716, + "loss": 0.0, + "step": 23246 + }, + { + "epoch": 2.169170476812541, + "grad_norm": NaN, + "learning_rate": 0.00022183634629238878, + "loss": 0.0, + "step": 23247 + }, + { + "epoch": 2.1692637865074182, + "grad_norm": NaN, + "learning_rate": 0.00022182970622273182, + "loss": 0.0, + "step": 23248 + }, + { + "epoch": 2.169357096202295, + "grad_norm": NaN, + "learning_rate": 0.00022182306597043322, + "loss": 0.0, + "step": 23249 + }, + { + "epoch": 2.1694504058971726, + "grad_norm": NaN, + "learning_rate": 0.00022181642553550983, + "loss": 0.0, + "step": 23250 + }, + { + "epoch": 2.16954371559205, + "grad_norm": NaN, + "learning_rate": 0.00022180978491797854, + "loss": 0.0, + "step": 23251 + }, + { + "epoch": 2.1696370252869275, + "grad_norm": NaN, + "learning_rate": 0.00022180314411785626, + "loss": 0.0, + "step": 23252 + }, + { + "epoch": 2.1697303349818045, + "grad_norm": NaN, + "learning_rate": 0.00022179650313515985, + "loss": 0.0, + "step": 23253 + }, + { + "epoch": 2.169823644676682, + "grad_norm": NaN, + "learning_rate": 0.0002217898619699062, + "loss": 0.0, + "step": 23254 + }, + { + "epoch": 2.1699169543715593, + "grad_norm": NaN, + "learning_rate": 0.00022178322062211216, + "loss": 0.0, + "step": 23255 + }, + { + "epoch": 2.1700102640664367, + "grad_norm": NaN, + "learning_rate": 0.0002217765790917947, + "loss": 0.0, + "step": 23256 + }, + { + "epoch": 2.1701035737613137, + "grad_norm": NaN, + "learning_rate": 0.00022176993737897064, + "loss": 0.0, + "step": 23257 + }, + { + "epoch": 2.170196883456191, + "grad_norm": NaN, + "learning_rate": 0.0002217632954836569, + "loss": 0.0, + "step": 23258 + }, + { + "epoch": 2.1702901931510685, + "grad_norm": NaN, + "learning_rate": 0.00022175665340587035, + "loss": 0.0, + "step": 23259 + }, + { + "epoch": 2.1703835028459455, + "grad_norm": NaN, + "learning_rate": 0.0002217500111456279, + "loss": 0.0, + "step": 23260 + }, + { + "epoch": 2.170476812540823, + "grad_norm": NaN, + "learning_rate": 0.0002217433687029464, + "loss": 0.0, + "step": 23261 + }, + { + "epoch": 2.1705701222357003, + "grad_norm": NaN, + "learning_rate": 0.00022173672607784275, + "loss": 0.0, + "step": 23262 + }, + { + "epoch": 2.1706634319305778, + "grad_norm": NaN, + "learning_rate": 0.0002217300832703339, + "loss": 0.0, + "step": 23263 + }, + { + "epoch": 2.1707567416254547, + "grad_norm": NaN, + "learning_rate": 0.00022172344028043668, + "loss": 0.0, + "step": 23264 + }, + { + "epoch": 2.170850051320332, + "grad_norm": NaN, + "learning_rate": 0.000221716797108168, + "loss": 0.0, + "step": 23265 + }, + { + "epoch": 2.1709433610152096, + "grad_norm": NaN, + "learning_rate": 0.00022171015375354477, + "loss": 0.0, + "step": 23266 + }, + { + "epoch": 2.1710366707100865, + "grad_norm": NaN, + "learning_rate": 0.00022170351021658381, + "loss": 0.0, + "step": 23267 + }, + { + "epoch": 2.171129980404964, + "grad_norm": NaN, + "learning_rate": 0.0002216968664973021, + "loss": 0.0, + "step": 23268 + }, + { + "epoch": 2.1712232900998414, + "grad_norm": NaN, + "learning_rate": 0.00022169022259571646, + "loss": 0.0, + "step": 23269 + }, + { + "epoch": 2.171316599794719, + "grad_norm": NaN, + "learning_rate": 0.00022168357851184387, + "loss": 0.0, + "step": 23270 + }, + { + "epoch": 2.171409909489596, + "grad_norm": NaN, + "learning_rate": 0.00022167693424570114, + "loss": 0.0, + "step": 23271 + }, + { + "epoch": 2.171503219184473, + "grad_norm": NaN, + "learning_rate": 0.00022167028979730522, + "loss": 0.0, + "step": 23272 + }, + { + "epoch": 2.1715965288793506, + "grad_norm": NaN, + "learning_rate": 0.000221663645166673, + "loss": 0.0, + "step": 23273 + }, + { + "epoch": 2.171689838574228, + "grad_norm": NaN, + "learning_rate": 0.0002216570003538213, + "loss": 0.0, + "step": 23274 + }, + { + "epoch": 2.171783148269105, + "grad_norm": NaN, + "learning_rate": 0.00022165035535876712, + "loss": 0.0, + "step": 23275 + }, + { + "epoch": 2.1718764579639824, + "grad_norm": NaN, + "learning_rate": 0.0002216437101815273, + "loss": 0.0, + "step": 23276 + }, + { + "epoch": 2.17196976765886, + "grad_norm": NaN, + "learning_rate": 0.00022163706482211873, + "loss": 0.0, + "step": 23277 + }, + { + "epoch": 2.1720630773537373, + "grad_norm": NaN, + "learning_rate": 0.00022163041928055836, + "loss": 0.0, + "step": 23278 + }, + { + "epoch": 2.1721563870486142, + "grad_norm": NaN, + "learning_rate": 0.00022162377355686302, + "loss": 0.0, + "step": 23279 + }, + { + "epoch": 2.1722496967434917, + "grad_norm": NaN, + "learning_rate": 0.00022161712765104966, + "loss": 0.0, + "step": 23280 + }, + { + "epoch": 2.172343006438369, + "grad_norm": NaN, + "learning_rate": 0.0002216104815631351, + "loss": 0.0, + "step": 23281 + }, + { + "epoch": 2.172436316133246, + "grad_norm": NaN, + "learning_rate": 0.0002216038352931364, + "loss": 0.0, + "step": 23282 + }, + { + "epoch": 2.1725296258281235, + "grad_norm": NaN, + "learning_rate": 0.0002215971888410703, + "loss": 0.0, + "step": 23283 + }, + { + "epoch": 2.172622935523001, + "grad_norm": NaN, + "learning_rate": 0.00022159054220695373, + "loss": 0.0, + "step": 23284 + }, + { + "epoch": 2.1727162452178783, + "grad_norm": NaN, + "learning_rate": 0.00022158389539080368, + "loss": 0.0, + "step": 23285 + }, + { + "epoch": 2.1728095549127553, + "grad_norm": NaN, + "learning_rate": 0.00022157724839263695, + "loss": 0.0, + "step": 23286 + }, + { + "epoch": 2.1729028646076327, + "grad_norm": NaN, + "learning_rate": 0.0002215706012124705, + "loss": 0.0, + "step": 23287 + }, + { + "epoch": 2.17299617430251, + "grad_norm": NaN, + "learning_rate": 0.00022156395385032122, + "loss": 0.0, + "step": 23288 + }, + { + "epoch": 2.173089483997387, + "grad_norm": NaN, + "learning_rate": 0.00022155730630620598, + "loss": 0.0, + "step": 23289 + }, + { + "epoch": 2.1731827936922645, + "grad_norm": NaN, + "learning_rate": 0.00022155065858014172, + "loss": 0.0, + "step": 23290 + }, + { + "epoch": 2.173276103387142, + "grad_norm": NaN, + "learning_rate": 0.00022154401067214533, + "loss": 0.0, + "step": 23291 + }, + { + "epoch": 2.1733694130820194, + "grad_norm": NaN, + "learning_rate": 0.00022153736258223372, + "loss": 0.0, + "step": 23292 + }, + { + "epoch": 2.1734627227768963, + "grad_norm": NaN, + "learning_rate": 0.00022153071431042375, + "loss": 0.0, + "step": 23293 + }, + { + "epoch": 2.1735560324717738, + "grad_norm": NaN, + "learning_rate": 0.00022152406585673238, + "loss": 0.0, + "step": 23294 + }, + { + "epoch": 2.173649342166651, + "grad_norm": NaN, + "learning_rate": 0.0002215174172211765, + "loss": 0.0, + "step": 23295 + }, + { + "epoch": 2.1737426518615286, + "grad_norm": NaN, + "learning_rate": 0.00022151076840377305, + "loss": 0.0, + "step": 23296 + }, + { + "epoch": 2.1738359615564056, + "grad_norm": NaN, + "learning_rate": 0.00022150411940453885, + "loss": 0.0, + "step": 23297 + }, + { + "epoch": 2.173929271251283, + "grad_norm": NaN, + "learning_rate": 0.00022149747022349087, + "loss": 0.0, + "step": 23298 + }, + { + "epoch": 2.1740225809461604, + "grad_norm": NaN, + "learning_rate": 0.00022149082086064603, + "loss": 0.0, + "step": 23299 + }, + { + "epoch": 2.174115890641038, + "grad_norm": NaN, + "learning_rate": 0.00022148417131602114, + "loss": 0.0, + "step": 23300 + }, + { + "epoch": 2.174209200335915, + "grad_norm": NaN, + "learning_rate": 0.00022147752158963323, + "loss": 0.0, + "step": 23301 + }, + { + "epoch": 2.1743025100307922, + "grad_norm": NaN, + "learning_rate": 0.00022147087168149915, + "loss": 0.0, + "step": 23302 + }, + { + "epoch": 2.1743958197256696, + "grad_norm": NaN, + "learning_rate": 0.0002214642215916358, + "loss": 0.0, + "step": 23303 + }, + { + "epoch": 2.1744891294205466, + "grad_norm": NaN, + "learning_rate": 0.00022145757132006015, + "loss": 0.0, + "step": 23304 + }, + { + "epoch": 2.174582439115424, + "grad_norm": NaN, + "learning_rate": 0.00022145092086678903, + "loss": 0.0, + "step": 23305 + }, + { + "epoch": 2.1746757488103015, + "grad_norm": NaN, + "learning_rate": 0.00022144427023183935, + "loss": 0.0, + "step": 23306 + }, + { + "epoch": 2.174769058505179, + "grad_norm": NaN, + "learning_rate": 0.00022143761941522812, + "loss": 0.0, + "step": 23307 + }, + { + "epoch": 2.174862368200056, + "grad_norm": NaN, + "learning_rate": 0.00022143096841697216, + "loss": 0.0, + "step": 23308 + }, + { + "epoch": 2.1749556778949333, + "grad_norm": NaN, + "learning_rate": 0.00022142431723708834, + "loss": 0.0, + "step": 23309 + }, + { + "epoch": 2.1750489875898107, + "grad_norm": NaN, + "learning_rate": 0.00022141766587559374, + "loss": 0.0, + "step": 23310 + }, + { + "epoch": 2.1751422972846877, + "grad_norm": NaN, + "learning_rate": 0.00022141101433250512, + "loss": 0.0, + "step": 23311 + }, + { + "epoch": 2.175235606979565, + "grad_norm": NaN, + "learning_rate": 0.00022140436260783945, + "loss": 0.0, + "step": 23312 + }, + { + "epoch": 2.1753289166744425, + "grad_norm": NaN, + "learning_rate": 0.00022139771070161366, + "loss": 0.0, + "step": 23313 + }, + { + "epoch": 2.17542222636932, + "grad_norm": NaN, + "learning_rate": 0.00022139105861384464, + "loss": 0.0, + "step": 23314 + }, + { + "epoch": 2.175515536064197, + "grad_norm": NaN, + "learning_rate": 0.00022138440634454927, + "loss": 0.0, + "step": 23315 + }, + { + "epoch": 2.1756088457590743, + "grad_norm": NaN, + "learning_rate": 0.00022137775389374453, + "loss": 0.0, + "step": 23316 + }, + { + "epoch": 2.1757021554539517, + "grad_norm": NaN, + "learning_rate": 0.00022137110126144727, + "loss": 0.0, + "step": 23317 + }, + { + "epoch": 2.175795465148829, + "grad_norm": NaN, + "learning_rate": 0.00022136444844767447, + "loss": 0.0, + "step": 23318 + }, + { + "epoch": 2.175888774843706, + "grad_norm": NaN, + "learning_rate": 0.00022135779545244297, + "loss": 0.0, + "step": 23319 + }, + { + "epoch": 2.1759820845385835, + "grad_norm": NaN, + "learning_rate": 0.0002213511422757698, + "loss": 0.0, + "step": 23320 + }, + { + "epoch": 2.176075394233461, + "grad_norm": NaN, + "learning_rate": 0.00022134448891767184, + "loss": 0.0, + "step": 23321 + }, + { + "epoch": 2.1761687039283384, + "grad_norm": NaN, + "learning_rate": 0.00022133783537816586, + "loss": 0.0, + "step": 23322 + }, + { + "epoch": 2.1762620136232154, + "grad_norm": NaN, + "learning_rate": 0.000221331181657269, + "loss": 0.0, + "step": 23323 + }, + { + "epoch": 2.176355323318093, + "grad_norm": NaN, + "learning_rate": 0.00022132452775499805, + "loss": 0.0, + "step": 23324 + }, + { + "epoch": 2.17644863301297, + "grad_norm": NaN, + "learning_rate": 0.00022131787367136988, + "loss": 0.0, + "step": 23325 + }, + { + "epoch": 2.176541942707847, + "grad_norm": NaN, + "learning_rate": 0.00022131121940640157, + "loss": 0.0, + "step": 23326 + }, + { + "epoch": 2.1766352524027246, + "grad_norm": NaN, + "learning_rate": 0.00022130456496010994, + "loss": 0.0, + "step": 23327 + }, + { + "epoch": 2.176728562097602, + "grad_norm": NaN, + "learning_rate": 0.00022129791033251187, + "loss": 0.0, + "step": 23328 + }, + { + "epoch": 2.1768218717924794, + "grad_norm": NaN, + "learning_rate": 0.00022129125552362443, + "loss": 0.0, + "step": 23329 + }, + { + "epoch": 2.1769151814873564, + "grad_norm": NaN, + "learning_rate": 0.00022128460053346438, + "loss": 0.0, + "step": 23330 + }, + { + "epoch": 2.177008491182234, + "grad_norm": NaN, + "learning_rate": 0.00022127794536204868, + "loss": 0.0, + "step": 23331 + }, + { + "epoch": 2.1771018008771112, + "grad_norm": NaN, + "learning_rate": 0.00022127129000939436, + "loss": 0.0, + "step": 23332 + }, + { + "epoch": 2.1771951105719882, + "grad_norm": NaN, + "learning_rate": 0.0002212646344755182, + "loss": 0.0, + "step": 23333 + }, + { + "epoch": 2.1772884202668656, + "grad_norm": NaN, + "learning_rate": 0.0002212579787604372, + "loss": 0.0, + "step": 23334 + }, + { + "epoch": 2.177381729961743, + "grad_norm": NaN, + "learning_rate": 0.00022125132286416826, + "loss": 0.0, + "step": 23335 + }, + { + "epoch": 2.1774750396566205, + "grad_norm": NaN, + "learning_rate": 0.00022124466678672831, + "loss": 0.0, + "step": 23336 + }, + { + "epoch": 2.1775683493514975, + "grad_norm": NaN, + "learning_rate": 0.0002212380105281343, + "loss": 0.0, + "step": 23337 + }, + { + "epoch": 2.177661659046375, + "grad_norm": NaN, + "learning_rate": 0.0002212313540884031, + "loss": 0.0, + "step": 23338 + }, + { + "epoch": 2.1777549687412523, + "grad_norm": NaN, + "learning_rate": 0.00022122469746755174, + "loss": 0.0, + "step": 23339 + }, + { + "epoch": 2.1778482784361297, + "grad_norm": NaN, + "learning_rate": 0.000221218040665597, + "loss": 0.0, + "step": 23340 + }, + { + "epoch": 2.1779415881310067, + "grad_norm": NaN, + "learning_rate": 0.00022121138368255585, + "loss": 0.0, + "step": 23341 + }, + { + "epoch": 2.178034897825884, + "grad_norm": NaN, + "learning_rate": 0.00022120472651844533, + "loss": 0.0, + "step": 23342 + }, + { + "epoch": 2.1781282075207615, + "grad_norm": NaN, + "learning_rate": 0.00022119806917328226, + "loss": 0.0, + "step": 23343 + }, + { + "epoch": 2.1782215172156385, + "grad_norm": NaN, + "learning_rate": 0.00022119141164708356, + "loss": 0.0, + "step": 23344 + }, + { + "epoch": 2.178314826910516, + "grad_norm": NaN, + "learning_rate": 0.00022118475393986624, + "loss": 0.0, + "step": 23345 + }, + { + "epoch": 2.1784081366053933, + "grad_norm": NaN, + "learning_rate": 0.00022117809605164712, + "loss": 0.0, + "step": 23346 + }, + { + "epoch": 2.1785014463002708, + "grad_norm": NaN, + "learning_rate": 0.0002211714379824432, + "loss": 0.0, + "step": 23347 + }, + { + "epoch": 2.1785947559951477, + "grad_norm": NaN, + "learning_rate": 0.0002211647797322714, + "loss": 0.0, + "step": 23348 + }, + { + "epoch": 2.178688065690025, + "grad_norm": NaN, + "learning_rate": 0.00022115812130114865, + "loss": 0.0, + "step": 23349 + }, + { + "epoch": 2.1787813753849026, + "grad_norm": NaN, + "learning_rate": 0.00022115146268909184, + "loss": 0.0, + "step": 23350 + }, + { + "epoch": 2.17887468507978, + "grad_norm": NaN, + "learning_rate": 0.00022114480389611804, + "loss": 0.0, + "step": 23351 + }, + { + "epoch": 2.178967994774657, + "grad_norm": NaN, + "learning_rate": 0.000221138144922244, + "loss": 0.0, + "step": 23352 + }, + { + "epoch": 2.1790613044695344, + "grad_norm": NaN, + "learning_rate": 0.0002211314857674867, + "loss": 0.0, + "step": 23353 + }, + { + "epoch": 2.179154614164412, + "grad_norm": NaN, + "learning_rate": 0.0002211248264318632, + "loss": 0.0, + "step": 23354 + }, + { + "epoch": 2.179247923859289, + "grad_norm": NaN, + "learning_rate": 0.00022111816691539027, + "loss": 0.0, + "step": 23355 + }, + { + "epoch": 2.179341233554166, + "grad_norm": NaN, + "learning_rate": 0.0002211115072180849, + "loss": 0.0, + "step": 23356 + }, + { + "epoch": 2.1794345432490436, + "grad_norm": NaN, + "learning_rate": 0.00022110484733996403, + "loss": 0.0, + "step": 23357 + }, + { + "epoch": 2.179527852943921, + "grad_norm": NaN, + "learning_rate": 0.00022109818728104468, + "loss": 0.0, + "step": 23358 + }, + { + "epoch": 2.179621162638798, + "grad_norm": NaN, + "learning_rate": 0.00022109152704134362, + "loss": 0.0, + "step": 23359 + }, + { + "epoch": 2.1797144723336754, + "grad_norm": NaN, + "learning_rate": 0.00022108486662087784, + "loss": 0.0, + "step": 23360 + }, + { + "epoch": 2.179807782028553, + "grad_norm": NaN, + "learning_rate": 0.0002210782060196644, + "loss": 0.0, + "step": 23361 + }, + { + "epoch": 2.17990109172343, + "grad_norm": NaN, + "learning_rate": 0.00022107154523772006, + "loss": 0.0, + "step": 23362 + }, + { + "epoch": 2.1799944014183072, + "grad_norm": NaN, + "learning_rate": 0.00022106488427506182, + "loss": 0.0, + "step": 23363 + }, + { + "epoch": 2.1800877111131847, + "grad_norm": NaN, + "learning_rate": 0.00022105822313170668, + "loss": 0.0, + "step": 23364 + }, + { + "epoch": 2.180181020808062, + "grad_norm": NaN, + "learning_rate": 0.00022105156180767153, + "loss": 0.0, + "step": 23365 + }, + { + "epoch": 2.180274330502939, + "grad_norm": NaN, + "learning_rate": 0.00022104490030297325, + "loss": 0.0, + "step": 23366 + }, + { + "epoch": 2.1803676401978165, + "grad_norm": NaN, + "learning_rate": 0.0002210382386176289, + "loss": 0.0, + "step": 23367 + }, + { + "epoch": 2.180460949892694, + "grad_norm": NaN, + "learning_rate": 0.00022103157675165534, + "loss": 0.0, + "step": 23368 + }, + { + "epoch": 2.1805542595875713, + "grad_norm": NaN, + "learning_rate": 0.00022102491470506948, + "loss": 0.0, + "step": 23369 + }, + { + "epoch": 2.1806475692824483, + "grad_norm": NaN, + "learning_rate": 0.00022101825247788833, + "loss": 0.0, + "step": 23370 + }, + { + "epoch": 2.1807408789773257, + "grad_norm": NaN, + "learning_rate": 0.0002210115900701288, + "loss": 0.0, + "step": 23371 + }, + { + "epoch": 2.180834188672203, + "grad_norm": NaN, + "learning_rate": 0.0002210049274818078, + "loss": 0.0, + "step": 23372 + }, + { + "epoch": 2.1809274983670806, + "grad_norm": NaN, + "learning_rate": 0.00022099826471294238, + "loss": 0.0, + "step": 23373 + }, + { + "epoch": 2.1810208080619575, + "grad_norm": NaN, + "learning_rate": 0.00022099160176354936, + "loss": 0.0, + "step": 23374 + }, + { + "epoch": 2.181114117756835, + "grad_norm": NaN, + "learning_rate": 0.00022098493863364566, + "loss": 0.0, + "step": 23375 + }, + { + "epoch": 2.1812074274517124, + "grad_norm": NaN, + "learning_rate": 0.00022097827532324834, + "loss": 0.0, + "step": 23376 + }, + { + "epoch": 2.1813007371465893, + "grad_norm": NaN, + "learning_rate": 0.00022097161183237432, + "loss": 0.0, + "step": 23377 + }, + { + "epoch": 2.1813940468414668, + "grad_norm": NaN, + "learning_rate": 0.00022096494816104055, + "loss": 0.0, + "step": 23378 + }, + { + "epoch": 2.181487356536344, + "grad_norm": NaN, + "learning_rate": 0.00022095828430926382, + "loss": 0.0, + "step": 23379 + }, + { + "epoch": 2.1815806662312216, + "grad_norm": NaN, + "learning_rate": 0.0002209516202770613, + "loss": 0.0, + "step": 23380 + }, + { + "epoch": 2.1816739759260986, + "grad_norm": NaN, + "learning_rate": 0.00022094495606444977, + "loss": 0.0, + "step": 23381 + }, + { + "epoch": 2.181767285620976, + "grad_norm": NaN, + "learning_rate": 0.00022093829167144622, + "loss": 0.0, + "step": 23382 + }, + { + "epoch": 2.1818605953158534, + "grad_norm": NaN, + "learning_rate": 0.00022093162709806765, + "loss": 0.0, + "step": 23383 + }, + { + "epoch": 2.1819539050107304, + "grad_norm": NaN, + "learning_rate": 0.00022092496234433093, + "loss": 0.0, + "step": 23384 + }, + { + "epoch": 2.182047214705608, + "grad_norm": NaN, + "learning_rate": 0.00022091829741025304, + "loss": 0.0, + "step": 23385 + }, + { + "epoch": 2.1821405244004852, + "grad_norm": NaN, + "learning_rate": 0.00022091163229585097, + "loss": 0.0, + "step": 23386 + }, + { + "epoch": 2.1822338340953626, + "grad_norm": NaN, + "learning_rate": 0.0002209049670011416, + "loss": 0.0, + "step": 23387 + }, + { + "epoch": 2.1823271437902396, + "grad_norm": NaN, + "learning_rate": 0.00022089830152614186, + "loss": 0.0, + "step": 23388 + }, + { + "epoch": 2.182420453485117, + "grad_norm": NaN, + "learning_rate": 0.00022089163587086884, + "loss": 0.0, + "step": 23389 + }, + { + "epoch": 2.1825137631799945, + "grad_norm": NaN, + "learning_rate": 0.00022088497003533932, + "loss": 0.0, + "step": 23390 + }, + { + "epoch": 2.182607072874872, + "grad_norm": NaN, + "learning_rate": 0.0002208783040195703, + "loss": 0.0, + "step": 23391 + }, + { + "epoch": 2.182700382569749, + "grad_norm": NaN, + "learning_rate": 0.0002208716378235788, + "loss": 0.0, + "step": 23392 + }, + { + "epoch": 2.1827936922646263, + "grad_norm": NaN, + "learning_rate": 0.0002208649714473817, + "loss": 0.0, + "step": 23393 + }, + { + "epoch": 2.1828870019595037, + "grad_norm": NaN, + "learning_rate": 0.00022085830489099593, + "loss": 0.0, + "step": 23394 + }, + { + "epoch": 2.182980311654381, + "grad_norm": NaN, + "learning_rate": 0.00022085163815443854, + "loss": 0.0, + "step": 23395 + }, + { + "epoch": 2.183073621349258, + "grad_norm": NaN, + "learning_rate": 0.00022084497123772644, + "loss": 0.0, + "step": 23396 + }, + { + "epoch": 2.1831669310441355, + "grad_norm": NaN, + "learning_rate": 0.00022083830414087647, + "loss": 0.0, + "step": 23397 + }, + { + "epoch": 2.183260240739013, + "grad_norm": NaN, + "learning_rate": 0.00022083163686390571, + "loss": 0.0, + "step": 23398 + }, + { + "epoch": 2.18335355043389, + "grad_norm": NaN, + "learning_rate": 0.00022082496940683115, + "loss": 0.0, + "step": 23399 + }, + { + "epoch": 2.1834468601287673, + "grad_norm": NaN, + "learning_rate": 0.00022081830176966962, + "loss": 0.0, + "step": 23400 + }, + { + "epoch": 2.1835401698236447, + "grad_norm": NaN, + "learning_rate": 0.00022081163395243808, + "loss": 0.0, + "step": 23401 + }, + { + "epoch": 2.183633479518522, + "grad_norm": NaN, + "learning_rate": 0.00022080496595515363, + "loss": 0.0, + "step": 23402 + }, + { + "epoch": 2.183726789213399, + "grad_norm": NaN, + "learning_rate": 0.0002207982977778331, + "loss": 0.0, + "step": 23403 + }, + { + "epoch": 2.1838200989082766, + "grad_norm": NaN, + "learning_rate": 0.00022079162942049337, + "loss": 0.0, + "step": 23404 + }, + { + "epoch": 2.183913408603154, + "grad_norm": NaN, + "learning_rate": 0.00022078496088315163, + "loss": 0.0, + "step": 23405 + }, + { + "epoch": 2.184006718298031, + "grad_norm": NaN, + "learning_rate": 0.00022077829216582463, + "loss": 0.0, + "step": 23406 + }, + { + "epoch": 2.1841000279929084, + "grad_norm": NaN, + "learning_rate": 0.00022077162326852938, + "loss": 0.0, + "step": 23407 + }, + { + "epoch": 2.184193337687786, + "grad_norm": NaN, + "learning_rate": 0.00022076495419128293, + "loss": 0.0, + "step": 23408 + }, + { + "epoch": 2.184286647382663, + "grad_norm": NaN, + "learning_rate": 0.00022075828493410213, + "loss": 0.0, + "step": 23409 + }, + { + "epoch": 2.18437995707754, + "grad_norm": NaN, + "learning_rate": 0.0002207516154970039, + "loss": 0.0, + "step": 23410 + }, + { + "epoch": 2.1844732667724176, + "grad_norm": NaN, + "learning_rate": 0.0002207449458800054, + "loss": 0.0, + "step": 23411 + }, + { + "epoch": 2.184566576467295, + "grad_norm": NaN, + "learning_rate": 0.0002207382760831234, + "loss": 0.0, + "step": 23412 + }, + { + "epoch": 2.1846598861621724, + "grad_norm": NaN, + "learning_rate": 0.00022073160610637489, + "loss": 0.0, + "step": 23413 + }, + { + "epoch": 2.1847531958570494, + "grad_norm": NaN, + "learning_rate": 0.00022072493594977685, + "loss": 0.0, + "step": 23414 + }, + { + "epoch": 2.184846505551927, + "grad_norm": NaN, + "learning_rate": 0.00022071826561334632, + "loss": 0.0, + "step": 23415 + }, + { + "epoch": 2.1849398152468043, + "grad_norm": NaN, + "learning_rate": 0.00022071159509710013, + "loss": 0.0, + "step": 23416 + }, + { + "epoch": 2.1850331249416817, + "grad_norm": NaN, + "learning_rate": 0.00022070492440105533, + "loss": 0.0, + "step": 23417 + }, + { + "epoch": 2.1851264346365586, + "grad_norm": NaN, + "learning_rate": 0.00022069825352522885, + "loss": 0.0, + "step": 23418 + }, + { + "epoch": 2.185219744331436, + "grad_norm": NaN, + "learning_rate": 0.0002206915824696376, + "loss": 0.0, + "step": 23419 + }, + { + "epoch": 2.1853130540263135, + "grad_norm": NaN, + "learning_rate": 0.00022068491123429865, + "loss": 0.0, + "step": 23420 + }, + { + "epoch": 2.1854063637211905, + "grad_norm": NaN, + "learning_rate": 0.00022067823981922893, + "loss": 0.0, + "step": 23421 + }, + { + "epoch": 2.185499673416068, + "grad_norm": NaN, + "learning_rate": 0.00022067156822444534, + "loss": 0.0, + "step": 23422 + }, + { + "epoch": 2.1855929831109453, + "grad_norm": NaN, + "learning_rate": 0.00022066489644996488, + "loss": 0.0, + "step": 23423 + }, + { + "epoch": 2.1856862928058227, + "grad_norm": NaN, + "learning_rate": 0.00022065822449580456, + "loss": 0.0, + "step": 23424 + }, + { + "epoch": 2.1857796025006997, + "grad_norm": NaN, + "learning_rate": 0.00022065155236198128, + "loss": 0.0, + "step": 23425 + }, + { + "epoch": 2.185872912195577, + "grad_norm": NaN, + "learning_rate": 0.000220644880048512, + "loss": 0.0, + "step": 23426 + }, + { + "epoch": 2.1859662218904545, + "grad_norm": NaN, + "learning_rate": 0.00022063820755541377, + "loss": 0.0, + "step": 23427 + }, + { + "epoch": 2.1860595315853315, + "grad_norm": NaN, + "learning_rate": 0.00022063153488270347, + "loss": 0.0, + "step": 23428 + }, + { + "epoch": 2.186152841280209, + "grad_norm": NaN, + "learning_rate": 0.0002206248620303981, + "loss": 0.0, + "step": 23429 + }, + { + "epoch": 2.1862461509750863, + "grad_norm": NaN, + "learning_rate": 0.00022061818899851465, + "loss": 0.0, + "step": 23430 + }, + { + "epoch": 2.1863394606699638, + "grad_norm": NaN, + "learning_rate": 0.00022061151578707007, + "loss": 0.0, + "step": 23431 + }, + { + "epoch": 2.1864327703648407, + "grad_norm": NaN, + "learning_rate": 0.00022060484239608128, + "loss": 0.0, + "step": 23432 + }, + { + "epoch": 2.186526080059718, + "grad_norm": NaN, + "learning_rate": 0.0002205981688255653, + "loss": 0.0, + "step": 23433 + }, + { + "epoch": 2.1866193897545956, + "grad_norm": NaN, + "learning_rate": 0.00022059149507553915, + "loss": 0.0, + "step": 23434 + }, + { + "epoch": 2.186712699449473, + "grad_norm": NaN, + "learning_rate": 0.00022058482114601968, + "loss": 0.0, + "step": 23435 + }, + { + "epoch": 2.18680600914435, + "grad_norm": NaN, + "learning_rate": 0.00022057814703702392, + "loss": 0.0, + "step": 23436 + }, + { + "epoch": 2.1868993188392274, + "grad_norm": NaN, + "learning_rate": 0.00022057147274856886, + "loss": 0.0, + "step": 23437 + }, + { + "epoch": 2.186992628534105, + "grad_norm": NaN, + "learning_rate": 0.00022056479828067144, + "loss": 0.0, + "step": 23438 + }, + { + "epoch": 2.187085938228982, + "grad_norm": NaN, + "learning_rate": 0.0002205581236333486, + "loss": 0.0, + "step": 23439 + }, + { + "epoch": 2.187179247923859, + "grad_norm": NaN, + "learning_rate": 0.00022055144880661745, + "loss": 0.0, + "step": 23440 + }, + { + "epoch": 2.1872725576187366, + "grad_norm": NaN, + "learning_rate": 0.00022054477380049477, + "loss": 0.0, + "step": 23441 + }, + { + "epoch": 2.187365867313614, + "grad_norm": NaN, + "learning_rate": 0.00022053809861499767, + "loss": 0.0, + "step": 23442 + }, + { + "epoch": 2.187459177008491, + "grad_norm": NaN, + "learning_rate": 0.00022053142325014312, + "loss": 0.0, + "step": 23443 + }, + { + "epoch": 2.1875524867033684, + "grad_norm": NaN, + "learning_rate": 0.00022052474770594796, + "loss": 0.0, + "step": 23444 + }, + { + "epoch": 2.187645796398246, + "grad_norm": NaN, + "learning_rate": 0.0002205180719824293, + "loss": 0.0, + "step": 23445 + }, + { + "epoch": 2.1877391060931233, + "grad_norm": NaN, + "learning_rate": 0.00022051139607960412, + "loss": 0.0, + "step": 23446 + }, + { + "epoch": 2.1878324157880003, + "grad_norm": NaN, + "learning_rate": 0.0002205047199974893, + "loss": 0.0, + "step": 23447 + }, + { + "epoch": 2.1879257254828777, + "grad_norm": NaN, + "learning_rate": 0.00022049804373610183, + "loss": 0.0, + "step": 23448 + }, + { + "epoch": 2.188019035177755, + "grad_norm": NaN, + "learning_rate": 0.0002204913672954588, + "loss": 0.0, + "step": 23449 + }, + { + "epoch": 2.188112344872632, + "grad_norm": NaN, + "learning_rate": 0.00022048469067557705, + "loss": 0.0, + "step": 23450 + }, + { + "epoch": 2.1882056545675095, + "grad_norm": NaN, + "learning_rate": 0.0002204780138764736, + "loss": 0.0, + "step": 23451 + }, + { + "epoch": 2.188298964262387, + "grad_norm": NaN, + "learning_rate": 0.00022047133689816545, + "loss": 0.0, + "step": 23452 + }, + { + "epoch": 2.1883922739572643, + "grad_norm": NaN, + "learning_rate": 0.00022046465974066962, + "loss": 0.0, + "step": 23453 + }, + { + "epoch": 2.1884855836521413, + "grad_norm": NaN, + "learning_rate": 0.00022045798240400295, + "loss": 0.0, + "step": 23454 + }, + { + "epoch": 2.1885788933470187, + "grad_norm": NaN, + "learning_rate": 0.00022045130488818252, + "loss": 0.0, + "step": 23455 + }, + { + "epoch": 2.188672203041896, + "grad_norm": NaN, + "learning_rate": 0.00022044462719322537, + "loss": 0.0, + "step": 23456 + }, + { + "epoch": 2.1887655127367736, + "grad_norm": NaN, + "learning_rate": 0.00022043794931914828, + "loss": 0.0, + "step": 23457 + }, + { + "epoch": 2.1888588224316505, + "grad_norm": NaN, + "learning_rate": 0.00022043127126596842, + "loss": 0.0, + "step": 23458 + }, + { + "epoch": 2.188952132126528, + "grad_norm": NaN, + "learning_rate": 0.00022042459303370273, + "loss": 0.0, + "step": 23459 + }, + { + "epoch": 2.1890454418214054, + "grad_norm": NaN, + "learning_rate": 0.00022041791462236806, + "loss": 0.0, + "step": 23460 + }, + { + "epoch": 2.1891387515162823, + "grad_norm": NaN, + "learning_rate": 0.00022041123603198159, + "loss": 0.0, + "step": 23461 + }, + { + "epoch": 2.1892320612111598, + "grad_norm": NaN, + "learning_rate": 0.00022040455726256018, + "loss": 0.0, + "step": 23462 + }, + { + "epoch": 2.189325370906037, + "grad_norm": NaN, + "learning_rate": 0.0002203978783141208, + "loss": 0.0, + "step": 23463 + }, + { + "epoch": 2.1894186806009146, + "grad_norm": NaN, + "learning_rate": 0.0002203911991866805, + "loss": 0.0, + "step": 23464 + }, + { + "epoch": 2.1895119902957916, + "grad_norm": NaN, + "learning_rate": 0.00022038451988025626, + "loss": 0.0, + "step": 23465 + }, + { + "epoch": 2.189605299990669, + "grad_norm": NaN, + "learning_rate": 0.00022037784039486498, + "loss": 0.0, + "step": 23466 + }, + { + "epoch": 2.1896986096855464, + "grad_norm": NaN, + "learning_rate": 0.00022037116073052372, + "loss": 0.0, + "step": 23467 + }, + { + "epoch": 2.189791919380424, + "grad_norm": NaN, + "learning_rate": 0.0002203644808872494, + "loss": 0.0, + "step": 23468 + }, + { + "epoch": 2.189885229075301, + "grad_norm": NaN, + "learning_rate": 0.00022035780086505913, + "loss": 0.0, + "step": 23469 + }, + { + "epoch": 2.1899785387701782, + "grad_norm": NaN, + "learning_rate": 0.0002203511206639698, + "loss": 0.0, + "step": 23470 + }, + { + "epoch": 2.1900718484650556, + "grad_norm": NaN, + "learning_rate": 0.00022034444028399834, + "loss": 0.0, + "step": 23471 + }, + { + "epoch": 2.1901651581599326, + "grad_norm": NaN, + "learning_rate": 0.00022033775972516188, + "loss": 0.0, + "step": 23472 + }, + { + "epoch": 2.19025846785481, + "grad_norm": NaN, + "learning_rate": 0.0002203310789874773, + "loss": 0.0, + "step": 23473 + }, + { + "epoch": 2.1903517775496875, + "grad_norm": NaN, + "learning_rate": 0.00022032439807096162, + "loss": 0.0, + "step": 23474 + }, + { + "epoch": 2.190445087244565, + "grad_norm": NaN, + "learning_rate": 0.00022031771697563184, + "loss": 0.0, + "step": 23475 + }, + { + "epoch": 2.190538396939442, + "grad_norm": NaN, + "learning_rate": 0.00022031103570150493, + "loss": 0.0, + "step": 23476 + }, + { + "epoch": 2.1906317066343193, + "grad_norm": NaN, + "learning_rate": 0.00022030435424859785, + "loss": 0.0, + "step": 23477 + }, + { + "epoch": 2.1907250163291967, + "grad_norm": NaN, + "learning_rate": 0.0002202976726169277, + "loss": 0.0, + "step": 23478 + }, + { + "epoch": 2.1908183260240737, + "grad_norm": NaN, + "learning_rate": 0.0002202909908065113, + "loss": 0.0, + "step": 23479 + }, + { + "epoch": 2.190911635718951, + "grad_norm": NaN, + "learning_rate": 0.0002202843088173658, + "loss": 0.0, + "step": 23480 + }, + { + "epoch": 2.1910049454138285, + "grad_norm": NaN, + "learning_rate": 0.0002202776266495081, + "loss": 0.0, + "step": 23481 + }, + { + "epoch": 2.191098255108706, + "grad_norm": NaN, + "learning_rate": 0.0002202709443029552, + "loss": 0.0, + "step": 23482 + }, + { + "epoch": 2.191191564803583, + "grad_norm": NaN, + "learning_rate": 0.00022026426177772414, + "loss": 0.0, + "step": 23483 + }, + { + "epoch": 2.1912848744984603, + "grad_norm": NaN, + "learning_rate": 0.00022025757907383187, + "loss": 0.0, + "step": 23484 + }, + { + "epoch": 2.1913781841933377, + "grad_norm": NaN, + "learning_rate": 0.00022025089619129534, + "loss": 0.0, + "step": 23485 + }, + { + "epoch": 2.191471493888215, + "grad_norm": NaN, + "learning_rate": 0.0002202442131301316, + "loss": 0.0, + "step": 23486 + }, + { + "epoch": 2.191564803583092, + "grad_norm": NaN, + "learning_rate": 0.00022023752989035767, + "loss": 0.0, + "step": 23487 + }, + { + "epoch": 2.1916581132779696, + "grad_norm": NaN, + "learning_rate": 0.0002202308464719905, + "loss": 0.0, + "step": 23488 + }, + { + "epoch": 2.191751422972847, + "grad_norm": NaN, + "learning_rate": 0.0002202241628750471, + "loss": 0.0, + "step": 23489 + }, + { + "epoch": 2.1918447326677244, + "grad_norm": NaN, + "learning_rate": 0.00022021747909954442, + "loss": 0.0, + "step": 23490 + }, + { + "epoch": 2.1919380423626014, + "grad_norm": NaN, + "learning_rate": 0.00022021079514549955, + "loss": 0.0, + "step": 23491 + }, + { + "epoch": 2.192031352057479, + "grad_norm": NaN, + "learning_rate": 0.00022020411101292934, + "loss": 0.0, + "step": 23492 + }, + { + "epoch": 2.192124661752356, + "grad_norm": NaN, + "learning_rate": 0.00022019742670185096, + "loss": 0.0, + "step": 23493 + }, + { + "epoch": 2.192217971447233, + "grad_norm": NaN, + "learning_rate": 0.0002201907422122813, + "loss": 0.0, + "step": 23494 + }, + { + "epoch": 2.1923112811421106, + "grad_norm": NaN, + "learning_rate": 0.0002201840575442373, + "loss": 0.0, + "step": 23495 + }, + { + "epoch": 2.192404590836988, + "grad_norm": NaN, + "learning_rate": 0.0002201773726977361, + "loss": 0.0, + "step": 23496 + }, + { + "epoch": 2.1924979005318654, + "grad_norm": NaN, + "learning_rate": 0.00022017068767279463, + "loss": 0.0, + "step": 23497 + }, + { + "epoch": 2.1925912102267424, + "grad_norm": NaN, + "learning_rate": 0.00022016400246942987, + "loss": 0.0, + "step": 23498 + }, + { + "epoch": 2.19268451992162, + "grad_norm": NaN, + "learning_rate": 0.00022015731708765882, + "loss": 0.0, + "step": 23499 + }, + { + "epoch": 2.1927778296164973, + "grad_norm": NaN, + "learning_rate": 0.00022015063152749855, + "loss": 0.0, + "step": 23500 + }, + { + "epoch": 2.1928711393113742, + "grad_norm": NaN, + "learning_rate": 0.00022014394578896593, + "loss": 0.0, + "step": 23501 + }, + { + "epoch": 2.1929644490062516, + "grad_norm": NaN, + "learning_rate": 0.00022013725987207806, + "loss": 0.0, + "step": 23502 + }, + { + "epoch": 2.193057758701129, + "grad_norm": NaN, + "learning_rate": 0.00022013057377685195, + "loss": 0.0, + "step": 23503 + }, + { + "epoch": 2.1931510683960065, + "grad_norm": NaN, + "learning_rate": 0.0002201238875033045, + "loss": 0.0, + "step": 23504 + }, + { + "epoch": 2.1932443780908835, + "grad_norm": NaN, + "learning_rate": 0.0002201172010514528, + "loss": 0.0, + "step": 23505 + }, + { + "epoch": 2.193337687785761, + "grad_norm": NaN, + "learning_rate": 0.00022011051442131387, + "loss": 0.0, + "step": 23506 + }, + { + "epoch": 2.1934309974806383, + "grad_norm": NaN, + "learning_rate": 0.00022010382761290464, + "loss": 0.0, + "step": 23507 + }, + { + "epoch": 2.1935243071755157, + "grad_norm": NaN, + "learning_rate": 0.00022009714062624214, + "loss": 0.0, + "step": 23508 + }, + { + "epoch": 2.1936176168703927, + "grad_norm": NaN, + "learning_rate": 0.0002200904534613434, + "loss": 0.0, + "step": 23509 + }, + { + "epoch": 2.19371092656527, + "grad_norm": NaN, + "learning_rate": 0.00022008376611822538, + "loss": 0.0, + "step": 23510 + }, + { + "epoch": 2.1938042362601475, + "grad_norm": NaN, + "learning_rate": 0.00022007707859690508, + "loss": 0.0, + "step": 23511 + }, + { + "epoch": 2.193897545955025, + "grad_norm": NaN, + "learning_rate": 0.00022007039089739957, + "loss": 0.0, + "step": 23512 + }, + { + "epoch": 2.193990855649902, + "grad_norm": NaN, + "learning_rate": 0.00022006370301972582, + "loss": 0.0, + "step": 23513 + }, + { + "epoch": 2.1940841653447793, + "grad_norm": NaN, + "learning_rate": 0.00022005701496390076, + "loss": 0.0, + "step": 23514 + }, + { + "epoch": 2.1941774750396568, + "grad_norm": NaN, + "learning_rate": 0.00022005032672994153, + "loss": 0.0, + "step": 23515 + }, + { + "epoch": 2.1942707847345337, + "grad_norm": NaN, + "learning_rate": 0.00022004363831786504, + "loss": 0.0, + "step": 23516 + }, + { + "epoch": 2.194364094429411, + "grad_norm": NaN, + "learning_rate": 0.00022003694972768832, + "loss": 0.0, + "step": 23517 + }, + { + "epoch": 2.1944574041242886, + "grad_norm": NaN, + "learning_rate": 0.0002200302609594284, + "loss": 0.0, + "step": 23518 + }, + { + "epoch": 2.194550713819166, + "grad_norm": NaN, + "learning_rate": 0.0002200235720131023, + "loss": 0.0, + "step": 23519 + }, + { + "epoch": 2.194644023514043, + "grad_norm": NaN, + "learning_rate": 0.00022001688288872693, + "loss": 0.0, + "step": 23520 + }, + { + "epoch": 2.1947373332089204, + "grad_norm": NaN, + "learning_rate": 0.0002200101935863194, + "loss": 0.0, + "step": 23521 + }, + { + "epoch": 2.194830642903798, + "grad_norm": NaN, + "learning_rate": 0.0002200035041058967, + "loss": 0.0, + "step": 23522 + }, + { + "epoch": 2.194923952598675, + "grad_norm": NaN, + "learning_rate": 0.0002199968144474758, + "loss": 0.0, + "step": 23523 + }, + { + "epoch": 2.195017262293552, + "grad_norm": NaN, + "learning_rate": 0.00021999012461107372, + "loss": 0.0, + "step": 23524 + }, + { + "epoch": 2.1951105719884296, + "grad_norm": NaN, + "learning_rate": 0.00021998343459670754, + "loss": 0.0, + "step": 23525 + }, + { + "epoch": 2.195203881683307, + "grad_norm": NaN, + "learning_rate": 0.0002199767444043942, + "loss": 0.0, + "step": 23526 + }, + { + "epoch": 2.195297191378184, + "grad_norm": NaN, + "learning_rate": 0.00021997005403415069, + "loss": 0.0, + "step": 23527 + }, + { + "epoch": 2.1953905010730614, + "grad_norm": NaN, + "learning_rate": 0.0002199633634859941, + "loss": 0.0, + "step": 23528 + }, + { + "epoch": 2.195483810767939, + "grad_norm": NaN, + "learning_rate": 0.0002199566727599414, + "loss": 0.0, + "step": 23529 + }, + { + "epoch": 2.1955771204628163, + "grad_norm": NaN, + "learning_rate": 0.0002199499818560096, + "loss": 0.0, + "step": 23530 + }, + { + "epoch": 2.1956704301576933, + "grad_norm": NaN, + "learning_rate": 0.0002199432907742157, + "loss": 0.0, + "step": 23531 + }, + { + "epoch": 2.1957637398525707, + "grad_norm": NaN, + "learning_rate": 0.00021993659951457674, + "loss": 0.0, + "step": 23532 + }, + { + "epoch": 2.195857049547448, + "grad_norm": NaN, + "learning_rate": 0.0002199299080771097, + "loss": 0.0, + "step": 23533 + }, + { + "epoch": 2.1959503592423255, + "grad_norm": NaN, + "learning_rate": 0.00021992321646183166, + "loss": 0.0, + "step": 23534 + }, + { + "epoch": 2.1960436689372025, + "grad_norm": NaN, + "learning_rate": 0.00021991652466875962, + "loss": 0.0, + "step": 23535 + }, + { + "epoch": 2.19613697863208, + "grad_norm": NaN, + "learning_rate": 0.0002199098326979105, + "loss": 0.0, + "step": 23536 + }, + { + "epoch": 2.1962302883269573, + "grad_norm": NaN, + "learning_rate": 0.0002199031405493014, + "loss": 0.0, + "step": 23537 + }, + { + "epoch": 2.1963235980218343, + "grad_norm": NaN, + "learning_rate": 0.00021989644822294934, + "loss": 0.0, + "step": 23538 + }, + { + "epoch": 2.1964169077167117, + "grad_norm": NaN, + "learning_rate": 0.0002198897557188713, + "loss": 0.0, + "step": 23539 + }, + { + "epoch": 2.196510217411589, + "grad_norm": NaN, + "learning_rate": 0.00021988306303708432, + "loss": 0.0, + "step": 23540 + }, + { + "epoch": 2.1966035271064666, + "grad_norm": NaN, + "learning_rate": 0.00021987637017760543, + "loss": 0.0, + "step": 23541 + }, + { + "epoch": 2.1966968368013435, + "grad_norm": NaN, + "learning_rate": 0.00021986967714045157, + "loss": 0.0, + "step": 23542 + }, + { + "epoch": 2.196790146496221, + "grad_norm": NaN, + "learning_rate": 0.00021986298392563986, + "loss": 0.0, + "step": 23543 + }, + { + "epoch": 2.1968834561910984, + "grad_norm": NaN, + "learning_rate": 0.0002198562905331873, + "loss": 0.0, + "step": 23544 + }, + { + "epoch": 2.1969767658859753, + "grad_norm": NaN, + "learning_rate": 0.00021984959696311085, + "loss": 0.0, + "step": 23545 + }, + { + "epoch": 2.1970700755808528, + "grad_norm": NaN, + "learning_rate": 0.00021984290321542757, + "loss": 0.0, + "step": 23546 + }, + { + "epoch": 2.19716338527573, + "grad_norm": NaN, + "learning_rate": 0.0002198362092901545, + "loss": 0.0, + "step": 23547 + }, + { + "epoch": 2.1972566949706076, + "grad_norm": NaN, + "learning_rate": 0.0002198295151873086, + "loss": 0.0, + "step": 23548 + }, + { + "epoch": 2.1973500046654846, + "grad_norm": NaN, + "learning_rate": 0.00021982282090690696, + "loss": 0.0, + "step": 23549 + }, + { + "epoch": 2.197443314360362, + "grad_norm": NaN, + "learning_rate": 0.00021981612644896654, + "loss": 0.0, + "step": 23550 + }, + { + "epoch": 2.1975366240552394, + "grad_norm": NaN, + "learning_rate": 0.00021980943181350442, + "loss": 0.0, + "step": 23551 + }, + { + "epoch": 2.197629933750117, + "grad_norm": NaN, + "learning_rate": 0.0002198027370005376, + "loss": 0.0, + "step": 23552 + }, + { + "epoch": 2.197723243444994, + "grad_norm": NaN, + "learning_rate": 0.00021979604201008307, + "loss": 0.0, + "step": 23553 + }, + { + "epoch": 2.1978165531398712, + "grad_norm": NaN, + "learning_rate": 0.00021978934684215788, + "loss": 0.0, + "step": 23554 + }, + { + "epoch": 2.1979098628347487, + "grad_norm": NaN, + "learning_rate": 0.0002197826514967791, + "loss": 0.0, + "step": 23555 + }, + { + "epoch": 2.1980031725296256, + "grad_norm": NaN, + "learning_rate": 0.00021977595597396363, + "loss": 0.0, + "step": 23556 + }, + { + "epoch": 2.198096482224503, + "grad_norm": NaN, + "learning_rate": 0.00021976926027372865, + "loss": 0.0, + "step": 23557 + }, + { + "epoch": 2.1981897919193805, + "grad_norm": NaN, + "learning_rate": 0.00021976256439609106, + "loss": 0.0, + "step": 23558 + }, + { + "epoch": 2.198283101614258, + "grad_norm": NaN, + "learning_rate": 0.00021975586834106794, + "loss": 0.0, + "step": 23559 + }, + { + "epoch": 2.198376411309135, + "grad_norm": NaN, + "learning_rate": 0.00021974917210867636, + "loss": 0.0, + "step": 23560 + }, + { + "epoch": 2.1984697210040123, + "grad_norm": NaN, + "learning_rate": 0.00021974247569893322, + "loss": 0.0, + "step": 23561 + }, + { + "epoch": 2.1985630306988897, + "grad_norm": NaN, + "learning_rate": 0.00021973577911185566, + "loss": 0.0, + "step": 23562 + }, + { + "epoch": 2.198656340393767, + "grad_norm": NaN, + "learning_rate": 0.00021972908234746066, + "loss": 0.0, + "step": 23563 + }, + { + "epoch": 2.198749650088644, + "grad_norm": NaN, + "learning_rate": 0.00021972238540576527, + "loss": 0.0, + "step": 23564 + }, + { + "epoch": 2.1988429597835215, + "grad_norm": NaN, + "learning_rate": 0.0002197156882867865, + "loss": 0.0, + "step": 23565 + }, + { + "epoch": 2.198936269478399, + "grad_norm": NaN, + "learning_rate": 0.0002197089909905414, + "loss": 0.0, + "step": 23566 + }, + { + "epoch": 2.199029579173276, + "grad_norm": NaN, + "learning_rate": 0.000219702293517047, + "loss": 0.0, + "step": 23567 + }, + { + "epoch": 2.1991228888681533, + "grad_norm": NaN, + "learning_rate": 0.0002196955958663203, + "loss": 0.0, + "step": 23568 + }, + { + "epoch": 2.1992161985630307, + "grad_norm": NaN, + "learning_rate": 0.0002196888980383783, + "loss": 0.0, + "step": 23569 + }, + { + "epoch": 2.199309508257908, + "grad_norm": NaN, + "learning_rate": 0.00021968220003323812, + "loss": 0.0, + "step": 23570 + }, + { + "epoch": 2.199402817952785, + "grad_norm": NaN, + "learning_rate": 0.00021967550185091677, + "loss": 0.0, + "step": 23571 + }, + { + "epoch": 2.1994961276476626, + "grad_norm": NaN, + "learning_rate": 0.00021966880349143122, + "loss": 0.0, + "step": 23572 + }, + { + "epoch": 2.19958943734254, + "grad_norm": NaN, + "learning_rate": 0.00021966210495479854, + "loss": 0.0, + "step": 23573 + }, + { + "epoch": 2.199682747037417, + "grad_norm": NaN, + "learning_rate": 0.00021965540624103573, + "loss": 0.0, + "step": 23574 + }, + { + "epoch": 2.1997760567322944, + "grad_norm": NaN, + "learning_rate": 0.00021964870735015994, + "loss": 0.0, + "step": 23575 + }, + { + "epoch": 2.199869366427172, + "grad_norm": NaN, + "learning_rate": 0.00021964200828218804, + "loss": 0.0, + "step": 23576 + }, + { + "epoch": 2.199962676122049, + "grad_norm": NaN, + "learning_rate": 0.00021963530903713722, + "loss": 0.0, + "step": 23577 + }, + { + "epoch": 2.200055985816926, + "grad_norm": NaN, + "learning_rate": 0.00021962860961502435, + "loss": 0.0, + "step": 23578 + }, + { + "epoch": 2.2001492955118036, + "grad_norm": NaN, + "learning_rate": 0.00021962191001586662, + "loss": 0.0, + "step": 23579 + }, + { + "epoch": 2.200242605206681, + "grad_norm": NaN, + "learning_rate": 0.00021961521023968096, + "loss": 0.0, + "step": 23580 + }, + { + "epoch": 2.2003359149015584, + "grad_norm": NaN, + "learning_rate": 0.00021960851028648447, + "loss": 0.0, + "step": 23581 + }, + { + "epoch": 2.2004292245964354, + "grad_norm": NaN, + "learning_rate": 0.00021960181015629414, + "loss": 0.0, + "step": 23582 + }, + { + "epoch": 2.200522534291313, + "grad_norm": NaN, + "learning_rate": 0.000219595109849127, + "loss": 0.0, + "step": 23583 + }, + { + "epoch": 2.2006158439861903, + "grad_norm": NaN, + "learning_rate": 0.00021958840936500015, + "loss": 0.0, + "step": 23584 + }, + { + "epoch": 2.2007091536810677, + "grad_norm": NaN, + "learning_rate": 0.00021958170870393058, + "loss": 0.0, + "step": 23585 + }, + { + "epoch": 2.2008024633759447, + "grad_norm": NaN, + "learning_rate": 0.00021957500786593532, + "loss": 0.0, + "step": 23586 + }, + { + "epoch": 2.200895773070822, + "grad_norm": NaN, + "learning_rate": 0.00021956830685103144, + "loss": 0.0, + "step": 23587 + }, + { + "epoch": 2.2009890827656995, + "grad_norm": NaN, + "learning_rate": 0.00021956160565923596, + "loss": 0.0, + "step": 23588 + }, + { + "epoch": 2.2010823924605765, + "grad_norm": NaN, + "learning_rate": 0.00021955490429056594, + "loss": 0.0, + "step": 23589 + }, + { + "epoch": 2.201175702155454, + "grad_norm": NaN, + "learning_rate": 0.0002195482027450384, + "loss": 0.0, + "step": 23590 + }, + { + "epoch": 2.2012690118503313, + "grad_norm": NaN, + "learning_rate": 0.00021954150102267034, + "loss": 0.0, + "step": 23591 + }, + { + "epoch": 2.2013623215452087, + "grad_norm": NaN, + "learning_rate": 0.00021953479912347886, + "loss": 0.0, + "step": 23592 + }, + { + "epoch": 2.2014556312400857, + "grad_norm": NaN, + "learning_rate": 0.00021952809704748103, + "loss": 0.0, + "step": 23593 + }, + { + "epoch": 2.201548940934963, + "grad_norm": NaN, + "learning_rate": 0.0002195213947946938, + "loss": 0.0, + "step": 23594 + }, + { + "epoch": 2.2016422506298405, + "grad_norm": NaN, + "learning_rate": 0.00021951469236513425, + "loss": 0.0, + "step": 23595 + }, + { + "epoch": 2.2017355603247175, + "grad_norm": NaN, + "learning_rate": 0.00021950798975881944, + "loss": 0.0, + "step": 23596 + }, + { + "epoch": 2.201828870019595, + "grad_norm": NaN, + "learning_rate": 0.0002195012869757664, + "loss": 0.0, + "step": 23597 + }, + { + "epoch": 2.2019221797144723, + "grad_norm": NaN, + "learning_rate": 0.00021949458401599218, + "loss": 0.0, + "step": 23598 + }, + { + "epoch": 2.2020154894093498, + "grad_norm": NaN, + "learning_rate": 0.00021948788087951384, + "loss": 0.0, + "step": 23599 + }, + { + "epoch": 2.2021087991042267, + "grad_norm": NaN, + "learning_rate": 0.00021948117756634837, + "loss": 0.0, + "step": 23600 + }, + { + "epoch": 2.202202108799104, + "grad_norm": NaN, + "learning_rate": 0.00021947447407651288, + "loss": 0.0, + "step": 23601 + }, + { + "epoch": 2.2022954184939816, + "grad_norm": NaN, + "learning_rate": 0.00021946777041002434, + "loss": 0.0, + "step": 23602 + }, + { + "epoch": 2.202388728188859, + "grad_norm": NaN, + "learning_rate": 0.00021946106656689987, + "loss": 0.0, + "step": 23603 + }, + { + "epoch": 2.202482037883736, + "grad_norm": NaN, + "learning_rate": 0.00021945436254715644, + "loss": 0.0, + "step": 23604 + }, + { + "epoch": 2.2025753475786134, + "grad_norm": NaN, + "learning_rate": 0.00021944765835081118, + "loss": 0.0, + "step": 23605 + }, + { + "epoch": 2.202668657273491, + "grad_norm": NaN, + "learning_rate": 0.0002194409539778811, + "loss": 0.0, + "step": 23606 + }, + { + "epoch": 2.2027619669683682, + "grad_norm": NaN, + "learning_rate": 0.00021943424942838323, + "loss": 0.0, + "step": 23607 + }, + { + "epoch": 2.202855276663245, + "grad_norm": NaN, + "learning_rate": 0.0002194275447023346, + "loss": 0.0, + "step": 23608 + }, + { + "epoch": 2.2029485863581226, + "grad_norm": NaN, + "learning_rate": 0.00021942083979975237, + "loss": 0.0, + "step": 23609 + }, + { + "epoch": 2.203041896053, + "grad_norm": NaN, + "learning_rate": 0.00021941413472065347, + "loss": 0.0, + "step": 23610 + }, + { + "epoch": 2.203135205747877, + "grad_norm": NaN, + "learning_rate": 0.00021940742946505496, + "loss": 0.0, + "step": 23611 + }, + { + "epoch": 2.2032285154427544, + "grad_norm": NaN, + "learning_rate": 0.00021940072403297395, + "loss": 0.0, + "step": 23612 + }, + { + "epoch": 2.203321825137632, + "grad_norm": NaN, + "learning_rate": 0.00021939401842442744, + "loss": 0.0, + "step": 23613 + }, + { + "epoch": 2.2034151348325093, + "grad_norm": NaN, + "learning_rate": 0.0002193873126394325, + "loss": 0.0, + "step": 23614 + }, + { + "epoch": 2.2035084445273863, + "grad_norm": NaN, + "learning_rate": 0.00021938060667800618, + "loss": 0.0, + "step": 23615 + }, + { + "epoch": 2.2036017542222637, + "grad_norm": NaN, + "learning_rate": 0.00021937390054016552, + "loss": 0.0, + "step": 23616 + }, + { + "epoch": 2.203695063917141, + "grad_norm": NaN, + "learning_rate": 0.0002193671942259276, + "loss": 0.0, + "step": 23617 + }, + { + "epoch": 2.203788373612018, + "grad_norm": NaN, + "learning_rate": 0.00021936048773530943, + "loss": 0.0, + "step": 23618 + }, + { + "epoch": 2.2038816833068955, + "grad_norm": NaN, + "learning_rate": 0.0002193537810683281, + "loss": 0.0, + "step": 23619 + }, + { + "epoch": 2.203974993001773, + "grad_norm": NaN, + "learning_rate": 0.00021934707422500066, + "loss": 0.0, + "step": 23620 + }, + { + "epoch": 2.2040683026966503, + "grad_norm": NaN, + "learning_rate": 0.00021934036720534415, + "loss": 0.0, + "step": 23621 + }, + { + "epoch": 2.2041616123915273, + "grad_norm": NaN, + "learning_rate": 0.00021933366000937557, + "loss": 0.0, + "step": 23622 + }, + { + "epoch": 2.2042549220864047, + "grad_norm": NaN, + "learning_rate": 0.00021932695263711212, + "loss": 0.0, + "step": 23623 + }, + { + "epoch": 2.204348231781282, + "grad_norm": NaN, + "learning_rate": 0.0002193202450885707, + "loss": 0.0, + "step": 23624 + }, + { + "epoch": 2.2044415414761596, + "grad_norm": NaN, + "learning_rate": 0.00021931353736376845, + "loss": 0.0, + "step": 23625 + }, + { + "epoch": 2.2045348511710365, + "grad_norm": NaN, + "learning_rate": 0.00021930682946272242, + "loss": 0.0, + "step": 23626 + }, + { + "epoch": 2.204628160865914, + "grad_norm": NaN, + "learning_rate": 0.00021930012138544964, + "loss": 0.0, + "step": 23627 + }, + { + "epoch": 2.2047214705607914, + "grad_norm": NaN, + "learning_rate": 0.00021929341313196716, + "loss": 0.0, + "step": 23628 + }, + { + "epoch": 2.204814780255669, + "grad_norm": NaN, + "learning_rate": 0.0002192867047022921, + "loss": 0.0, + "step": 23629 + }, + { + "epoch": 2.2049080899505458, + "grad_norm": NaN, + "learning_rate": 0.00021927999609644148, + "loss": 0.0, + "step": 23630 + }, + { + "epoch": 2.205001399645423, + "grad_norm": NaN, + "learning_rate": 0.00021927328731443228, + "loss": 0.0, + "step": 23631 + }, + { + "epoch": 2.2050947093403006, + "grad_norm": NaN, + "learning_rate": 0.00021926657835628167, + "loss": 0.0, + "step": 23632 + }, + { + "epoch": 2.2051880190351776, + "grad_norm": NaN, + "learning_rate": 0.0002192598692220067, + "loss": 0.0, + "step": 23633 + }, + { + "epoch": 2.205281328730055, + "grad_norm": NaN, + "learning_rate": 0.00021925315991162437, + "loss": 0.0, + "step": 23634 + }, + { + "epoch": 2.2053746384249324, + "grad_norm": NaN, + "learning_rate": 0.00021924645042515177, + "loss": 0.0, + "step": 23635 + }, + { + "epoch": 2.20546794811981, + "grad_norm": NaN, + "learning_rate": 0.00021923974076260597, + "loss": 0.0, + "step": 23636 + }, + { + "epoch": 2.205561257814687, + "grad_norm": NaN, + "learning_rate": 0.000219233030924004, + "loss": 0.0, + "step": 23637 + }, + { + "epoch": 2.2056545675095642, + "grad_norm": NaN, + "learning_rate": 0.00021922632090936293, + "loss": 0.0, + "step": 23638 + }, + { + "epoch": 2.2057478772044417, + "grad_norm": NaN, + "learning_rate": 0.00021921961071869988, + "loss": 0.0, + "step": 23639 + }, + { + "epoch": 2.2058411868993186, + "grad_norm": NaN, + "learning_rate": 0.00021921290035203182, + "loss": 0.0, + "step": 23640 + }, + { + "epoch": 2.205934496594196, + "grad_norm": NaN, + "learning_rate": 0.00021920618980937587, + "loss": 0.0, + "step": 23641 + }, + { + "epoch": 2.2060278062890735, + "grad_norm": NaN, + "learning_rate": 0.00021919947909074911, + "loss": 0.0, + "step": 23642 + }, + { + "epoch": 2.206121115983951, + "grad_norm": NaN, + "learning_rate": 0.00021919276819616854, + "loss": 0.0, + "step": 23643 + }, + { + "epoch": 2.206214425678828, + "grad_norm": NaN, + "learning_rate": 0.00021918605712565128, + "loss": 0.0, + "step": 23644 + }, + { + "epoch": 2.2063077353737053, + "grad_norm": NaN, + "learning_rate": 0.00021917934587921436, + "loss": 0.0, + "step": 23645 + }, + { + "epoch": 2.2064010450685827, + "grad_norm": NaN, + "learning_rate": 0.00021917263445687485, + "loss": 0.0, + "step": 23646 + }, + { + "epoch": 2.20649435476346, + "grad_norm": NaN, + "learning_rate": 0.00021916592285864983, + "loss": 0.0, + "step": 23647 + }, + { + "epoch": 2.206587664458337, + "grad_norm": NaN, + "learning_rate": 0.00021915921108455632, + "loss": 0.0, + "step": 23648 + }, + { + "epoch": 2.2066809741532145, + "grad_norm": NaN, + "learning_rate": 0.00021915249913461148, + "loss": 0.0, + "step": 23649 + }, + { + "epoch": 2.206774283848092, + "grad_norm": NaN, + "learning_rate": 0.0002191457870088323, + "loss": 0.0, + "step": 23650 + }, + { + "epoch": 2.206867593542969, + "grad_norm": NaN, + "learning_rate": 0.00021913907470723582, + "loss": 0.0, + "step": 23651 + }, + { + "epoch": 2.2069609032378463, + "grad_norm": NaN, + "learning_rate": 0.0002191323622298392, + "loss": 0.0, + "step": 23652 + }, + { + "epoch": 2.2070542129327237, + "grad_norm": NaN, + "learning_rate": 0.00021912564957665944, + "loss": 0.0, + "step": 23653 + }, + { + "epoch": 2.207147522627601, + "grad_norm": NaN, + "learning_rate": 0.00021911893674771368, + "loss": 0.0, + "step": 23654 + }, + { + "epoch": 2.207240832322478, + "grad_norm": NaN, + "learning_rate": 0.0002191122237430189, + "loss": 0.0, + "step": 23655 + }, + { + "epoch": 2.2073341420173556, + "grad_norm": NaN, + "learning_rate": 0.00021910551056259221, + "loss": 0.0, + "step": 23656 + }, + { + "epoch": 2.207427451712233, + "grad_norm": NaN, + "learning_rate": 0.00021909879720645068, + "loss": 0.0, + "step": 23657 + }, + { + "epoch": 2.2075207614071104, + "grad_norm": NaN, + "learning_rate": 0.00021909208367461134, + "loss": 0.0, + "step": 23658 + }, + { + "epoch": 2.2076140711019874, + "grad_norm": NaN, + "learning_rate": 0.00021908536996709133, + "loss": 0.0, + "step": 23659 + }, + { + "epoch": 2.207707380796865, + "grad_norm": NaN, + "learning_rate": 0.00021907865608390768, + "loss": 0.0, + "step": 23660 + }, + { + "epoch": 2.207800690491742, + "grad_norm": NaN, + "learning_rate": 0.00021907194202507748, + "loss": 0.0, + "step": 23661 + }, + { + "epoch": 2.207894000186619, + "grad_norm": NaN, + "learning_rate": 0.0002190652277906178, + "loss": 0.0, + "step": 23662 + }, + { + "epoch": 2.2079873098814966, + "grad_norm": NaN, + "learning_rate": 0.0002190585133805457, + "loss": 0.0, + "step": 23663 + }, + { + "epoch": 2.208080619576374, + "grad_norm": NaN, + "learning_rate": 0.00021905179879487825, + "loss": 0.0, + "step": 23664 + }, + { + "epoch": 2.2081739292712514, + "grad_norm": NaN, + "learning_rate": 0.00021904508403363252, + "loss": 0.0, + "step": 23665 + }, + { + "epoch": 2.2082672389661284, + "grad_norm": NaN, + "learning_rate": 0.00021903836909682563, + "loss": 0.0, + "step": 23666 + }, + { + "epoch": 2.208360548661006, + "grad_norm": NaN, + "learning_rate": 0.00021903165398447458, + "loss": 0.0, + "step": 23667 + }, + { + "epoch": 2.2084538583558833, + "grad_norm": NaN, + "learning_rate": 0.0002190249386965965, + "loss": 0.0, + "step": 23668 + }, + { + "epoch": 2.2085471680507607, + "grad_norm": NaN, + "learning_rate": 0.00021901822323320844, + "loss": 0.0, + "step": 23669 + }, + { + "epoch": 2.2086404777456377, + "grad_norm": NaN, + "learning_rate": 0.00021901150759432749, + "loss": 0.0, + "step": 23670 + }, + { + "epoch": 2.208733787440515, + "grad_norm": NaN, + "learning_rate": 0.00021900479177997074, + "loss": 0.0, + "step": 23671 + }, + { + "epoch": 2.2088270971353925, + "grad_norm": NaN, + "learning_rate": 0.00021899807579015522, + "loss": 0.0, + "step": 23672 + }, + { + "epoch": 2.2089204068302695, + "grad_norm": NaN, + "learning_rate": 0.00021899135962489803, + "loss": 0.0, + "step": 23673 + }, + { + "epoch": 2.209013716525147, + "grad_norm": NaN, + "learning_rate": 0.00021898464328421624, + "loss": 0.0, + "step": 23674 + }, + { + "epoch": 2.2091070262200243, + "grad_norm": NaN, + "learning_rate": 0.00021897792676812698, + "loss": 0.0, + "step": 23675 + }, + { + "epoch": 2.2092003359149017, + "grad_norm": NaN, + "learning_rate": 0.00021897121007664723, + "loss": 0.0, + "step": 23676 + }, + { + "epoch": 2.2092936456097787, + "grad_norm": NaN, + "learning_rate": 0.00021896449320979415, + "loss": 0.0, + "step": 23677 + }, + { + "epoch": 2.209386955304656, + "grad_norm": NaN, + "learning_rate": 0.0002189577761675848, + "loss": 0.0, + "step": 23678 + }, + { + "epoch": 2.2094802649995335, + "grad_norm": NaN, + "learning_rate": 0.00021895105895003623, + "loss": 0.0, + "step": 23679 + }, + { + "epoch": 2.209573574694411, + "grad_norm": NaN, + "learning_rate": 0.00021894434155716555, + "loss": 0.0, + "step": 23680 + }, + { + "epoch": 2.209666884389288, + "grad_norm": NaN, + "learning_rate": 0.00021893762398898984, + "loss": 0.0, + "step": 23681 + }, + { + "epoch": 2.2097601940841654, + "grad_norm": NaN, + "learning_rate": 0.0002189309062455262, + "loss": 0.0, + "step": 23682 + }, + { + "epoch": 2.2098535037790428, + "grad_norm": NaN, + "learning_rate": 0.00021892418832679164, + "loss": 0.0, + "step": 23683 + }, + { + "epoch": 2.2099468134739197, + "grad_norm": NaN, + "learning_rate": 0.0002189174702328033, + "loss": 0.0, + "step": 23684 + }, + { + "epoch": 2.210040123168797, + "grad_norm": NaN, + "learning_rate": 0.00021891075196357827, + "loss": 0.0, + "step": 23685 + }, + { + "epoch": 2.2101334328636746, + "grad_norm": NaN, + "learning_rate": 0.00021890403351913358, + "loss": 0.0, + "step": 23686 + }, + { + "epoch": 2.210226742558552, + "grad_norm": NaN, + "learning_rate": 0.00021889731489948636, + "loss": 0.0, + "step": 23687 + }, + { + "epoch": 2.210320052253429, + "grad_norm": NaN, + "learning_rate": 0.00021889059610465366, + "loss": 0.0, + "step": 23688 + }, + { + "epoch": 2.2104133619483064, + "grad_norm": NaN, + "learning_rate": 0.00021888387713465257, + "loss": 0.0, + "step": 23689 + }, + { + "epoch": 2.210506671643184, + "grad_norm": NaN, + "learning_rate": 0.00021887715798950022, + "loss": 0.0, + "step": 23690 + }, + { + "epoch": 2.210599981338061, + "grad_norm": NaN, + "learning_rate": 0.00021887043866921363, + "loss": 0.0, + "step": 23691 + }, + { + "epoch": 2.210693291032938, + "grad_norm": NaN, + "learning_rate": 0.00021886371917380994, + "loss": 0.0, + "step": 23692 + }, + { + "epoch": 2.2107866007278156, + "grad_norm": NaN, + "learning_rate": 0.00021885699950330623, + "loss": 0.0, + "step": 23693 + }, + { + "epoch": 2.210879910422693, + "grad_norm": NaN, + "learning_rate": 0.00021885027965771954, + "loss": 0.0, + "step": 23694 + }, + { + "epoch": 2.21097322011757, + "grad_norm": NaN, + "learning_rate": 0.00021884355963706697, + "loss": 0.0, + "step": 23695 + }, + { + "epoch": 2.2110665298124474, + "grad_norm": NaN, + "learning_rate": 0.00021883683944136565, + "loss": 0.0, + "step": 23696 + }, + { + "epoch": 2.211159839507325, + "grad_norm": NaN, + "learning_rate": 0.0002188301190706326, + "loss": 0.0, + "step": 23697 + }, + { + "epoch": 2.2112531492022023, + "grad_norm": NaN, + "learning_rate": 0.00021882339852488498, + "loss": 0.0, + "step": 23698 + }, + { + "epoch": 2.2113464588970793, + "grad_norm": NaN, + "learning_rate": 0.00021881667780413984, + "loss": 0.0, + "step": 23699 + }, + { + "epoch": 2.2114397685919567, + "grad_norm": NaN, + "learning_rate": 0.00021880995690841425, + "loss": 0.0, + "step": 23700 + }, + { + "epoch": 2.211533078286834, + "grad_norm": NaN, + "learning_rate": 0.00021880323583772538, + "loss": 0.0, + "step": 23701 + }, + { + "epoch": 2.2116263879817115, + "grad_norm": NaN, + "learning_rate": 0.00021879651459209018, + "loss": 0.0, + "step": 23702 + }, + { + "epoch": 2.2117196976765885, + "grad_norm": NaN, + "learning_rate": 0.00021878979317152588, + "loss": 0.0, + "step": 23703 + }, + { + "epoch": 2.211813007371466, + "grad_norm": NaN, + "learning_rate": 0.0002187830715760495, + "loss": 0.0, + "step": 23704 + }, + { + "epoch": 2.2119063170663433, + "grad_norm": NaN, + "learning_rate": 0.00021877634980567814, + "loss": 0.0, + "step": 23705 + }, + { + "epoch": 2.2119996267612203, + "grad_norm": NaN, + "learning_rate": 0.0002187696278604289, + "loss": 0.0, + "step": 23706 + }, + { + "epoch": 2.2120929364560977, + "grad_norm": NaN, + "learning_rate": 0.00021876290574031885, + "loss": 0.0, + "step": 23707 + }, + { + "epoch": 2.212186246150975, + "grad_norm": NaN, + "learning_rate": 0.00021875618344536512, + "loss": 0.0, + "step": 23708 + }, + { + "epoch": 2.2122795558458526, + "grad_norm": NaN, + "learning_rate": 0.00021874946097558476, + "loss": 0.0, + "step": 23709 + }, + { + "epoch": 2.2123728655407295, + "grad_norm": NaN, + "learning_rate": 0.00021874273833099494, + "loss": 0.0, + "step": 23710 + }, + { + "epoch": 2.212466175235607, + "grad_norm": NaN, + "learning_rate": 0.00021873601551161264, + "loss": 0.0, + "step": 23711 + }, + { + "epoch": 2.2125594849304844, + "grad_norm": NaN, + "learning_rate": 0.00021872929251745505, + "loss": 0.0, + "step": 23712 + }, + { + "epoch": 2.2126527946253614, + "grad_norm": NaN, + "learning_rate": 0.0002187225693485392, + "loss": 0.0, + "step": 23713 + }, + { + "epoch": 2.2127461043202388, + "grad_norm": NaN, + "learning_rate": 0.00021871584600488225, + "loss": 0.0, + "step": 23714 + }, + { + "epoch": 2.212839414015116, + "grad_norm": NaN, + "learning_rate": 0.0002187091224865012, + "loss": 0.0, + "step": 23715 + }, + { + "epoch": 2.2129327237099936, + "grad_norm": NaN, + "learning_rate": 0.00021870239879341328, + "loss": 0.0, + "step": 23716 + }, + { + "epoch": 2.2130260334048706, + "grad_norm": NaN, + "learning_rate": 0.00021869567492563546, + "loss": 0.0, + "step": 23717 + }, + { + "epoch": 2.213119343099748, + "grad_norm": NaN, + "learning_rate": 0.0002186889508831849, + "loss": 0.0, + "step": 23718 + }, + { + "epoch": 2.2132126527946254, + "grad_norm": NaN, + "learning_rate": 0.0002186822266660787, + "loss": 0.0, + "step": 23719 + }, + { + "epoch": 2.213305962489503, + "grad_norm": NaN, + "learning_rate": 0.0002186755022743339, + "loss": 0.0, + "step": 23720 + }, + { + "epoch": 2.21339927218438, + "grad_norm": NaN, + "learning_rate": 0.0002186687777079677, + "loss": 0.0, + "step": 23721 + }, + { + "epoch": 2.2134925818792572, + "grad_norm": NaN, + "learning_rate": 0.00021866205296699708, + "loss": 0.0, + "step": 23722 + }, + { + "epoch": 2.2135858915741347, + "grad_norm": NaN, + "learning_rate": 0.00021865532805143926, + "loss": 0.0, + "step": 23723 + }, + { + "epoch": 2.213679201269012, + "grad_norm": NaN, + "learning_rate": 0.00021864860296131122, + "loss": 0.0, + "step": 23724 + }, + { + "epoch": 2.213772510963889, + "grad_norm": NaN, + "learning_rate": 0.00021864187769663013, + "loss": 0.0, + "step": 23725 + }, + { + "epoch": 2.2138658206587665, + "grad_norm": NaN, + "learning_rate": 0.00021863515225741312, + "loss": 0.0, + "step": 23726 + }, + { + "epoch": 2.213959130353644, + "grad_norm": NaN, + "learning_rate": 0.00021862842664367718, + "loss": 0.0, + "step": 23727 + }, + { + "epoch": 2.214052440048521, + "grad_norm": NaN, + "learning_rate": 0.00021862170085543955, + "loss": 0.0, + "step": 23728 + }, + { + "epoch": 2.2141457497433983, + "grad_norm": NaN, + "learning_rate": 0.00021861497489271723, + "loss": 0.0, + "step": 23729 + }, + { + "epoch": 2.2142390594382757, + "grad_norm": NaN, + "learning_rate": 0.00021860824875552732, + "loss": 0.0, + "step": 23730 + }, + { + "epoch": 2.214332369133153, + "grad_norm": NaN, + "learning_rate": 0.00021860152244388705, + "loss": 0.0, + "step": 23731 + }, + { + "epoch": 2.21442567882803, + "grad_norm": NaN, + "learning_rate": 0.00021859479595781337, + "loss": 0.0, + "step": 23732 + }, + { + "epoch": 2.2145189885229075, + "grad_norm": NaN, + "learning_rate": 0.00021858806929732342, + "loss": 0.0, + "step": 23733 + }, + { + "epoch": 2.214612298217785, + "grad_norm": NaN, + "learning_rate": 0.00021858134246243436, + "loss": 0.0, + "step": 23734 + }, + { + "epoch": 2.214705607912662, + "grad_norm": NaN, + "learning_rate": 0.00021857461545316326, + "loss": 0.0, + "step": 23735 + }, + { + "epoch": 2.2147989176075393, + "grad_norm": NaN, + "learning_rate": 0.0002185678882695272, + "loss": 0.0, + "step": 23736 + }, + { + "epoch": 2.2148922273024167, + "grad_norm": NaN, + "learning_rate": 0.00021856116091154335, + "loss": 0.0, + "step": 23737 + }, + { + "epoch": 2.214985536997294, + "grad_norm": NaN, + "learning_rate": 0.00021855443337922875, + "loss": 0.0, + "step": 23738 + }, + { + "epoch": 2.215078846692171, + "grad_norm": NaN, + "learning_rate": 0.00021854770567260055, + "loss": 0.0, + "step": 23739 + }, + { + "epoch": 2.2151721563870486, + "grad_norm": NaN, + "learning_rate": 0.00021854097779167587, + "loss": 0.0, + "step": 23740 + }, + { + "epoch": 2.215265466081926, + "grad_norm": NaN, + "learning_rate": 0.00021853424973647173, + "loss": 0.0, + "step": 23741 + }, + { + "epoch": 2.2153587757768034, + "grad_norm": NaN, + "learning_rate": 0.0002185275215070053, + "loss": 0.0, + "step": 23742 + }, + { + "epoch": 2.2154520854716804, + "grad_norm": NaN, + "learning_rate": 0.0002185207931032937, + "loss": 0.0, + "step": 23743 + }, + { + "epoch": 2.215545395166558, + "grad_norm": NaN, + "learning_rate": 0.00021851406452535403, + "loss": 0.0, + "step": 23744 + }, + { + "epoch": 2.215638704861435, + "grad_norm": NaN, + "learning_rate": 0.00021850733577320335, + "loss": 0.0, + "step": 23745 + }, + { + "epoch": 2.2157320145563126, + "grad_norm": NaN, + "learning_rate": 0.0002185006068468588, + "loss": 0.0, + "step": 23746 + }, + { + "epoch": 2.2158253242511896, + "grad_norm": NaN, + "learning_rate": 0.00021849387774633755, + "loss": 0.0, + "step": 23747 + }, + { + "epoch": 2.215918633946067, + "grad_norm": NaN, + "learning_rate": 0.00021848714847165664, + "loss": 0.0, + "step": 23748 + }, + { + "epoch": 2.2160119436409444, + "grad_norm": NaN, + "learning_rate": 0.0002184804190228332, + "loss": 0.0, + "step": 23749 + }, + { + "epoch": 2.2161052533358214, + "grad_norm": NaN, + "learning_rate": 0.00021847368939988435, + "loss": 0.0, + "step": 23750 + }, + { + "epoch": 2.216198563030699, + "grad_norm": NaN, + "learning_rate": 0.00021846695960282716, + "loss": 0.0, + "step": 23751 + }, + { + "epoch": 2.2162918727255763, + "grad_norm": NaN, + "learning_rate": 0.00021846022963167876, + "loss": 0.0, + "step": 23752 + }, + { + "epoch": 2.2163851824204537, + "grad_norm": NaN, + "learning_rate": 0.0002184534994864564, + "loss": 0.0, + "step": 23753 + }, + { + "epoch": 2.2164784921153307, + "grad_norm": NaN, + "learning_rate": 0.00021844676916717696, + "loss": 0.0, + "step": 23754 + }, + { + "epoch": 2.216571801810208, + "grad_norm": NaN, + "learning_rate": 0.00021844003867385763, + "loss": 0.0, + "step": 23755 + }, + { + "epoch": 2.2166651115050855, + "grad_norm": NaN, + "learning_rate": 0.0002184333080065156, + "loss": 0.0, + "step": 23756 + }, + { + "epoch": 2.2167584211999625, + "grad_norm": NaN, + "learning_rate": 0.00021842657716516798, + "loss": 0.0, + "step": 23757 + }, + { + "epoch": 2.21685173089484, + "grad_norm": NaN, + "learning_rate": 0.00021841984614983176, + "loss": 0.0, + "step": 23758 + }, + { + "epoch": 2.2169450405897173, + "grad_norm": NaN, + "learning_rate": 0.00021841311496052424, + "loss": 0.0, + "step": 23759 + }, + { + "epoch": 2.2170383502845947, + "grad_norm": NaN, + "learning_rate": 0.00021840638359726236, + "loss": 0.0, + "step": 23760 + }, + { + "epoch": 2.2171316599794717, + "grad_norm": NaN, + "learning_rate": 0.00021839965206006334, + "loss": 0.0, + "step": 23761 + }, + { + "epoch": 2.217224969674349, + "grad_norm": NaN, + "learning_rate": 0.00021839292034894425, + "loss": 0.0, + "step": 23762 + }, + { + "epoch": 2.2173182793692265, + "grad_norm": NaN, + "learning_rate": 0.00021838618846392222, + "loss": 0.0, + "step": 23763 + }, + { + "epoch": 2.217411589064104, + "grad_norm": NaN, + "learning_rate": 0.00021837945640501439, + "loss": 0.0, + "step": 23764 + }, + { + "epoch": 2.217504898758981, + "grad_norm": NaN, + "learning_rate": 0.00021837272417223784, + "loss": 0.0, + "step": 23765 + }, + { + "epoch": 2.2175982084538584, + "grad_norm": NaN, + "learning_rate": 0.00021836599176560973, + "loss": 0.0, + "step": 23766 + }, + { + "epoch": 2.2176915181487358, + "grad_norm": NaN, + "learning_rate": 0.00021835925918514711, + "loss": 0.0, + "step": 23767 + }, + { + "epoch": 2.2177848278436127, + "grad_norm": NaN, + "learning_rate": 0.00021835252643086716, + "loss": 0.0, + "step": 23768 + }, + { + "epoch": 2.21787813753849, + "grad_norm": NaN, + "learning_rate": 0.00021834579350278702, + "loss": 0.0, + "step": 23769 + }, + { + "epoch": 2.2179714472333676, + "grad_norm": NaN, + "learning_rate": 0.00021833906040092375, + "loss": 0.0, + "step": 23770 + }, + { + "epoch": 2.218064756928245, + "grad_norm": NaN, + "learning_rate": 0.00021833232712529445, + "loss": 0.0, + "step": 23771 + }, + { + "epoch": 2.218158066623122, + "grad_norm": NaN, + "learning_rate": 0.00021832559367591638, + "loss": 0.0, + "step": 23772 + }, + { + "epoch": 2.2182513763179994, + "grad_norm": NaN, + "learning_rate": 0.00021831886005280649, + "loss": 0.0, + "step": 23773 + }, + { + "epoch": 2.218344686012877, + "grad_norm": NaN, + "learning_rate": 0.00021831212625598198, + "loss": 0.0, + "step": 23774 + }, + { + "epoch": 2.2184379957077542, + "grad_norm": NaN, + "learning_rate": 0.00021830539228546, + "loss": 0.0, + "step": 23775 + }, + { + "epoch": 2.218531305402631, + "grad_norm": NaN, + "learning_rate": 0.0002182986581412576, + "loss": 0.0, + "step": 23776 + }, + { + "epoch": 2.2186246150975086, + "grad_norm": NaN, + "learning_rate": 0.00021829192382339196, + "loss": 0.0, + "step": 23777 + }, + { + "epoch": 2.218717924792386, + "grad_norm": NaN, + "learning_rate": 0.0002182851893318802, + "loss": 0.0, + "step": 23778 + }, + { + "epoch": 2.218811234487263, + "grad_norm": NaN, + "learning_rate": 0.00021827845466673942, + "loss": 0.0, + "step": 23779 + }, + { + "epoch": 2.2189045441821404, + "grad_norm": NaN, + "learning_rate": 0.00021827171982798674, + "loss": 0.0, + "step": 23780 + }, + { + "epoch": 2.218997853877018, + "grad_norm": NaN, + "learning_rate": 0.00021826498481563935, + "loss": 0.0, + "step": 23781 + }, + { + "epoch": 2.2190911635718953, + "grad_norm": NaN, + "learning_rate": 0.00021825824962971432, + "loss": 0.0, + "step": 23782 + }, + { + "epoch": 2.2191844732667723, + "grad_norm": NaN, + "learning_rate": 0.00021825151427022873, + "loss": 0.0, + "step": 23783 + }, + { + "epoch": 2.2192777829616497, + "grad_norm": NaN, + "learning_rate": 0.00021824477873719979, + "loss": 0.0, + "step": 23784 + }, + { + "epoch": 2.219371092656527, + "grad_norm": NaN, + "learning_rate": 0.0002182380430306446, + "loss": 0.0, + "step": 23785 + }, + { + "epoch": 2.219464402351404, + "grad_norm": NaN, + "learning_rate": 0.00021823130715058027, + "loss": 0.0, + "step": 23786 + }, + { + "epoch": 2.2195577120462815, + "grad_norm": NaN, + "learning_rate": 0.00021822457109702388, + "loss": 0.0, + "step": 23787 + }, + { + "epoch": 2.219651021741159, + "grad_norm": NaN, + "learning_rate": 0.0002182178348699927, + "loss": 0.0, + "step": 23788 + }, + { + "epoch": 2.2197443314360363, + "grad_norm": NaN, + "learning_rate": 0.00021821109846950377, + "loss": 0.0, + "step": 23789 + }, + { + "epoch": 2.2198376411309133, + "grad_norm": NaN, + "learning_rate": 0.00021820436189557416, + "loss": 0.0, + "step": 23790 + }, + { + "epoch": 2.2199309508257907, + "grad_norm": NaN, + "learning_rate": 0.0002181976251482211, + "loss": 0.0, + "step": 23791 + }, + { + "epoch": 2.220024260520668, + "grad_norm": NaN, + "learning_rate": 0.0002181908882274617, + "loss": 0.0, + "step": 23792 + }, + { + "epoch": 2.2201175702155456, + "grad_norm": NaN, + "learning_rate": 0.00021818415113331301, + "loss": 0.0, + "step": 23793 + }, + { + "epoch": 2.2202108799104225, + "grad_norm": NaN, + "learning_rate": 0.00021817741386579229, + "loss": 0.0, + "step": 23794 + }, + { + "epoch": 2.2203041896053, + "grad_norm": NaN, + "learning_rate": 0.00021817067642491657, + "loss": 0.0, + "step": 23795 + }, + { + "epoch": 2.2203974993001774, + "grad_norm": NaN, + "learning_rate": 0.000218163938810703, + "loss": 0.0, + "step": 23796 + }, + { + "epoch": 2.220490808995055, + "grad_norm": NaN, + "learning_rate": 0.00021815720102316875, + "loss": 0.0, + "step": 23797 + }, + { + "epoch": 2.2205841186899318, + "grad_norm": NaN, + "learning_rate": 0.00021815046306233096, + "loss": 0.0, + "step": 23798 + }, + { + "epoch": 2.220677428384809, + "grad_norm": NaN, + "learning_rate": 0.00021814372492820662, + "loss": 0.0, + "step": 23799 + }, + { + "epoch": 2.2207707380796866, + "grad_norm": NaN, + "learning_rate": 0.00021813698662081308, + "loss": 0.0, + "step": 23800 + }, + { + "epoch": 2.2208640477745636, + "grad_norm": NaN, + "learning_rate": 0.00021813024814016732, + "loss": 0.0, + "step": 23801 + }, + { + "epoch": 2.220957357469441, + "grad_norm": NaN, + "learning_rate": 0.00021812350948628652, + "loss": 0.0, + "step": 23802 + }, + { + "epoch": 2.2210506671643184, + "grad_norm": NaN, + "learning_rate": 0.0002181167706591878, + "loss": 0.0, + "step": 23803 + }, + { + "epoch": 2.221143976859196, + "grad_norm": NaN, + "learning_rate": 0.00021811003165888839, + "loss": 0.0, + "step": 23804 + }, + { + "epoch": 2.221237286554073, + "grad_norm": NaN, + "learning_rate": 0.00021810329248540526, + "loss": 0.0, + "step": 23805 + }, + { + "epoch": 2.2213305962489502, + "grad_norm": NaN, + "learning_rate": 0.00021809655313875564, + "loss": 0.0, + "step": 23806 + }, + { + "epoch": 2.2214239059438277, + "grad_norm": NaN, + "learning_rate": 0.00021808981361895672, + "loss": 0.0, + "step": 23807 + }, + { + "epoch": 2.2215172156387046, + "grad_norm": NaN, + "learning_rate": 0.00021808307392602553, + "loss": 0.0, + "step": 23808 + }, + { + "epoch": 2.221610525333582, + "grad_norm": NaN, + "learning_rate": 0.0002180763340599792, + "loss": 0.0, + "step": 23809 + }, + { + "epoch": 2.2217038350284595, + "grad_norm": NaN, + "learning_rate": 0.000218069594020835, + "loss": 0.0, + "step": 23810 + }, + { + "epoch": 2.221797144723337, + "grad_norm": NaN, + "learning_rate": 0.00021806285380860998, + "loss": 0.0, + "step": 23811 + }, + { + "epoch": 2.221890454418214, + "grad_norm": NaN, + "learning_rate": 0.00021805611342332122, + "loss": 0.0, + "step": 23812 + }, + { + "epoch": 2.2219837641130913, + "grad_norm": NaN, + "learning_rate": 0.00021804937286498598, + "loss": 0.0, + "step": 23813 + }, + { + "epoch": 2.2220770738079687, + "grad_norm": NaN, + "learning_rate": 0.00021804263213362135, + "loss": 0.0, + "step": 23814 + }, + { + "epoch": 2.222170383502846, + "grad_norm": NaN, + "learning_rate": 0.0002180358912292444, + "loss": 0.0, + "step": 23815 + }, + { + "epoch": 2.222263693197723, + "grad_norm": NaN, + "learning_rate": 0.00021802915015187242, + "loss": 0.0, + "step": 23816 + }, + { + "epoch": 2.2223570028926005, + "grad_norm": NaN, + "learning_rate": 0.0002180224089015224, + "loss": 0.0, + "step": 23817 + }, + { + "epoch": 2.222450312587478, + "grad_norm": NaN, + "learning_rate": 0.00021801566747821152, + "loss": 0.0, + "step": 23818 + }, + { + "epoch": 2.2225436222823554, + "grad_norm": NaN, + "learning_rate": 0.00021800892588195702, + "loss": 0.0, + "step": 23819 + }, + { + "epoch": 2.2226369319772323, + "grad_norm": NaN, + "learning_rate": 0.00021800218411277593, + "loss": 0.0, + "step": 23820 + }, + { + "epoch": 2.2227302416721098, + "grad_norm": NaN, + "learning_rate": 0.0002179954421706854, + "loss": 0.0, + "step": 23821 + }, + { + "epoch": 2.222823551366987, + "grad_norm": NaN, + "learning_rate": 0.0002179887000557026, + "loss": 0.0, + "step": 23822 + }, + { + "epoch": 2.222916861061864, + "grad_norm": NaN, + "learning_rate": 0.00021798195776784476, + "loss": 0.0, + "step": 23823 + }, + { + "epoch": 2.2230101707567416, + "grad_norm": NaN, + "learning_rate": 0.00021797521530712884, + "loss": 0.0, + "step": 23824 + }, + { + "epoch": 2.223103480451619, + "grad_norm": NaN, + "learning_rate": 0.00021796847267357216, + "loss": 0.0, + "step": 23825 + }, + { + "epoch": 2.2231967901464964, + "grad_norm": NaN, + "learning_rate": 0.00021796172986719177, + "loss": 0.0, + "step": 23826 + }, + { + "epoch": 2.2232900998413734, + "grad_norm": NaN, + "learning_rate": 0.00021795498688800482, + "loss": 0.0, + "step": 23827 + }, + { + "epoch": 2.223383409536251, + "grad_norm": NaN, + "learning_rate": 0.00021794824373602844, + "loss": 0.0, + "step": 23828 + }, + { + "epoch": 2.223476719231128, + "grad_norm": NaN, + "learning_rate": 0.00021794150041127986, + "loss": 0.0, + "step": 23829 + }, + { + "epoch": 2.223570028926005, + "grad_norm": NaN, + "learning_rate": 0.00021793475691377615, + "loss": 0.0, + "step": 23830 + }, + { + "epoch": 2.2236633386208826, + "grad_norm": NaN, + "learning_rate": 0.0002179280132435344, + "loss": 0.0, + "step": 23831 + }, + { + "epoch": 2.22375664831576, + "grad_norm": NaN, + "learning_rate": 0.00021792126940057196, + "loss": 0.0, + "step": 23832 + }, + { + "epoch": 2.2238499580106375, + "grad_norm": NaN, + "learning_rate": 0.00021791452538490582, + "loss": 0.0, + "step": 23833 + }, + { + "epoch": 2.2239432677055144, + "grad_norm": NaN, + "learning_rate": 0.0002179077811965531, + "loss": 0.0, + "step": 23834 + }, + { + "epoch": 2.224036577400392, + "grad_norm": NaN, + "learning_rate": 0.00021790103683553107, + "loss": 0.0, + "step": 23835 + }, + { + "epoch": 2.2241298870952693, + "grad_norm": NaN, + "learning_rate": 0.0002178942923018568, + "loss": 0.0, + "step": 23836 + }, + { + "epoch": 2.2242231967901467, + "grad_norm": NaN, + "learning_rate": 0.00021788754759554738, + "loss": 0.0, + "step": 23837 + }, + { + "epoch": 2.2243165064850237, + "grad_norm": NaN, + "learning_rate": 0.00021788080271662014, + "loss": 0.0, + "step": 23838 + }, + { + "epoch": 2.224409816179901, + "grad_norm": NaN, + "learning_rate": 0.00021787405766509208, + "loss": 0.0, + "step": 23839 + }, + { + "epoch": 2.2245031258747785, + "grad_norm": NaN, + "learning_rate": 0.00021786731244098038, + "loss": 0.0, + "step": 23840 + }, + { + "epoch": 2.224596435569656, + "grad_norm": NaN, + "learning_rate": 0.00021786056704430225, + "loss": 0.0, + "step": 23841 + }, + { + "epoch": 2.224689745264533, + "grad_norm": NaN, + "learning_rate": 0.00021785382147507482, + "loss": 0.0, + "step": 23842 + }, + { + "epoch": 2.2247830549594103, + "grad_norm": NaN, + "learning_rate": 0.00021784707573331514, + "loss": 0.0, + "step": 23843 + }, + { + "epoch": 2.2248763646542877, + "grad_norm": NaN, + "learning_rate": 0.00021784032981904047, + "loss": 0.0, + "step": 23844 + }, + { + "epoch": 2.2249696743491647, + "grad_norm": NaN, + "learning_rate": 0.00021783358373226802, + "loss": 0.0, + "step": 23845 + }, + { + "epoch": 2.225062984044042, + "grad_norm": NaN, + "learning_rate": 0.00021782683747301475, + "loss": 0.0, + "step": 23846 + }, + { + "epoch": 2.2251562937389195, + "grad_norm": NaN, + "learning_rate": 0.00021782009104129798, + "loss": 0.0, + "step": 23847 + }, + { + "epoch": 2.225249603433797, + "grad_norm": NaN, + "learning_rate": 0.00021781334443713482, + "loss": 0.0, + "step": 23848 + }, + { + "epoch": 2.225342913128674, + "grad_norm": NaN, + "learning_rate": 0.00021780659766054242, + "loss": 0.0, + "step": 23849 + }, + { + "epoch": 2.2254362228235514, + "grad_norm": NaN, + "learning_rate": 0.00021779985071153786, + "loss": 0.0, + "step": 23850 + }, + { + "epoch": 2.2255295325184288, + "grad_norm": NaN, + "learning_rate": 0.00021779310359013843, + "loss": 0.0, + "step": 23851 + }, + { + "epoch": 2.2256228422133058, + "grad_norm": NaN, + "learning_rate": 0.00021778635629636115, + "loss": 0.0, + "step": 23852 + }, + { + "epoch": 2.225716151908183, + "grad_norm": NaN, + "learning_rate": 0.0002177796088302233, + "loss": 0.0, + "step": 23853 + }, + { + "epoch": 2.2258094616030606, + "grad_norm": NaN, + "learning_rate": 0.00021777286119174197, + "loss": 0.0, + "step": 23854 + }, + { + "epoch": 2.225902771297938, + "grad_norm": NaN, + "learning_rate": 0.00021776611338093435, + "loss": 0.0, + "step": 23855 + }, + { + "epoch": 2.225996080992815, + "grad_norm": NaN, + "learning_rate": 0.00021775936539781754, + "loss": 0.0, + "step": 23856 + }, + { + "epoch": 2.2260893906876924, + "grad_norm": NaN, + "learning_rate": 0.00021775261724240878, + "loss": 0.0, + "step": 23857 + }, + { + "epoch": 2.22618270038257, + "grad_norm": NaN, + "learning_rate": 0.00021774586891472515, + "loss": 0.0, + "step": 23858 + }, + { + "epoch": 2.2262760100774472, + "grad_norm": NaN, + "learning_rate": 0.00021773912041478385, + "loss": 0.0, + "step": 23859 + }, + { + "epoch": 2.226369319772324, + "grad_norm": NaN, + "learning_rate": 0.00021773237174260203, + "loss": 0.0, + "step": 23860 + }, + { + "epoch": 2.2264626294672016, + "grad_norm": NaN, + "learning_rate": 0.00021772562289819688, + "loss": 0.0, + "step": 23861 + }, + { + "epoch": 2.226555939162079, + "grad_norm": NaN, + "learning_rate": 0.00021771887388158547, + "loss": 0.0, + "step": 23862 + }, + { + "epoch": 2.226649248856956, + "grad_norm": NaN, + "learning_rate": 0.00021771212469278507, + "loss": 0.0, + "step": 23863 + }, + { + "epoch": 2.2267425585518335, + "grad_norm": NaN, + "learning_rate": 0.0002177053753318128, + "loss": 0.0, + "step": 23864 + }, + { + "epoch": 2.226835868246711, + "grad_norm": NaN, + "learning_rate": 0.00021769862579868577, + "loss": 0.0, + "step": 23865 + }, + { + "epoch": 2.2269291779415883, + "grad_norm": NaN, + "learning_rate": 0.00021769187609342123, + "loss": 0.0, + "step": 23866 + }, + { + "epoch": 2.2270224876364653, + "grad_norm": NaN, + "learning_rate": 0.00021768512621603633, + "loss": 0.0, + "step": 23867 + }, + { + "epoch": 2.2271157973313427, + "grad_norm": NaN, + "learning_rate": 0.00021767837616654815, + "loss": 0.0, + "step": 23868 + }, + { + "epoch": 2.22720910702622, + "grad_norm": NaN, + "learning_rate": 0.00021767162594497392, + "loss": 0.0, + "step": 23869 + }, + { + "epoch": 2.2273024167210975, + "grad_norm": NaN, + "learning_rate": 0.00021766487555133083, + "loss": 0.0, + "step": 23870 + }, + { + "epoch": 2.2273957264159745, + "grad_norm": NaN, + "learning_rate": 0.00021765812498563596, + "loss": 0.0, + "step": 23871 + }, + { + "epoch": 2.227489036110852, + "grad_norm": NaN, + "learning_rate": 0.0002176513742479065, + "loss": 0.0, + "step": 23872 + }, + { + "epoch": 2.2275823458057293, + "grad_norm": NaN, + "learning_rate": 0.0002176446233381597, + "loss": 0.0, + "step": 23873 + }, + { + "epoch": 2.2276756555006063, + "grad_norm": NaN, + "learning_rate": 0.00021763787225641265, + "loss": 0.0, + "step": 23874 + }, + { + "epoch": 2.2277689651954837, + "grad_norm": NaN, + "learning_rate": 0.00021763112100268243, + "loss": 0.0, + "step": 23875 + }, + { + "epoch": 2.227862274890361, + "grad_norm": NaN, + "learning_rate": 0.00021762436957698645, + "loss": 0.0, + "step": 23876 + }, + { + "epoch": 2.2279555845852386, + "grad_norm": NaN, + "learning_rate": 0.00021761761797934164, + "loss": 0.0, + "step": 23877 + }, + { + "epoch": 2.2280488942801155, + "grad_norm": NaN, + "learning_rate": 0.00021761086620976524, + "loss": 0.0, + "step": 23878 + }, + { + "epoch": 2.228142203974993, + "grad_norm": NaN, + "learning_rate": 0.00021760411426827446, + "loss": 0.0, + "step": 23879 + }, + { + "epoch": 2.2282355136698704, + "grad_norm": NaN, + "learning_rate": 0.0002175973621548865, + "loss": 0.0, + "step": 23880 + }, + { + "epoch": 2.228328823364748, + "grad_norm": NaN, + "learning_rate": 0.00021759060986961838, + "loss": 0.0, + "step": 23881 + }, + { + "epoch": 2.2284221330596248, + "grad_norm": NaN, + "learning_rate": 0.00021758385741248742, + "loss": 0.0, + "step": 23882 + }, + { + "epoch": 2.228515442754502, + "grad_norm": NaN, + "learning_rate": 0.00021757710478351072, + "loss": 0.0, + "step": 23883 + }, + { + "epoch": 2.2286087524493796, + "grad_norm": NaN, + "learning_rate": 0.00021757035198270545, + "loss": 0.0, + "step": 23884 + }, + { + "epoch": 2.2287020621442566, + "grad_norm": NaN, + "learning_rate": 0.00021756359901008878, + "loss": 0.0, + "step": 23885 + }, + { + "epoch": 2.228795371839134, + "grad_norm": NaN, + "learning_rate": 0.00021755684586567795, + "loss": 0.0, + "step": 23886 + }, + { + "epoch": 2.2288886815340114, + "grad_norm": NaN, + "learning_rate": 0.00021755009254949002, + "loss": 0.0, + "step": 23887 + }, + { + "epoch": 2.228981991228889, + "grad_norm": NaN, + "learning_rate": 0.0002175433390615422, + "loss": 0.0, + "step": 23888 + }, + { + "epoch": 2.229075300923766, + "grad_norm": NaN, + "learning_rate": 0.00021753658540185175, + "loss": 0.0, + "step": 23889 + }, + { + "epoch": 2.2291686106186432, + "grad_norm": NaN, + "learning_rate": 0.00021752983157043566, + "loss": 0.0, + "step": 23890 + }, + { + "epoch": 2.2292619203135207, + "grad_norm": NaN, + "learning_rate": 0.00021752307756731125, + "loss": 0.0, + "step": 23891 + }, + { + "epoch": 2.229355230008398, + "grad_norm": NaN, + "learning_rate": 0.00021751632339249576, + "loss": 0.0, + "step": 23892 + }, + { + "epoch": 2.229448539703275, + "grad_norm": NaN, + "learning_rate": 0.00021750956904600612, + "loss": 0.0, + "step": 23893 + }, + { + "epoch": 2.2295418493981525, + "grad_norm": NaN, + "learning_rate": 0.00021750281452785972, + "loss": 0.0, + "step": 23894 + }, + { + "epoch": 2.22963515909303, + "grad_norm": NaN, + "learning_rate": 0.00021749605983807367, + "loss": 0.0, + "step": 23895 + }, + { + "epoch": 2.229728468787907, + "grad_norm": NaN, + "learning_rate": 0.0002174893049766651, + "loss": 0.0, + "step": 23896 + }, + { + "epoch": 2.2298217784827843, + "grad_norm": NaN, + "learning_rate": 0.00021748254994365118, + "loss": 0.0, + "step": 23897 + }, + { + "epoch": 2.2299150881776617, + "grad_norm": NaN, + "learning_rate": 0.0002174757947390492, + "loss": 0.0, + "step": 23898 + }, + { + "epoch": 2.230008397872539, + "grad_norm": NaN, + "learning_rate": 0.00021746903936287624, + "loss": 0.0, + "step": 23899 + }, + { + "epoch": 2.230101707567416, + "grad_norm": NaN, + "learning_rate": 0.00021746228381514947, + "loss": 0.0, + "step": 23900 + }, + { + "epoch": 2.2301950172622935, + "grad_norm": NaN, + "learning_rate": 0.0002174555280958861, + "loss": 0.0, + "step": 23901 + }, + { + "epoch": 2.230288326957171, + "grad_norm": NaN, + "learning_rate": 0.00021744877220510335, + "loss": 0.0, + "step": 23902 + }, + { + "epoch": 2.230381636652048, + "grad_norm": NaN, + "learning_rate": 0.00021744201614281832, + "loss": 0.0, + "step": 23903 + }, + { + "epoch": 2.2304749463469253, + "grad_norm": NaN, + "learning_rate": 0.0002174352599090482, + "loss": 0.0, + "step": 23904 + }, + { + "epoch": 2.2305682560418028, + "grad_norm": NaN, + "learning_rate": 0.00021742850350381024, + "loss": 0.0, + "step": 23905 + }, + { + "epoch": 2.23066156573668, + "grad_norm": NaN, + "learning_rate": 0.0002174217469271215, + "loss": 0.0, + "step": 23906 + }, + { + "epoch": 2.230754875431557, + "grad_norm": NaN, + "learning_rate": 0.00021741499017899926, + "loss": 0.0, + "step": 23907 + }, + { + "epoch": 2.2308481851264346, + "grad_norm": NaN, + "learning_rate": 0.00021740823325946073, + "loss": 0.0, + "step": 23908 + }, + { + "epoch": 2.230941494821312, + "grad_norm": NaN, + "learning_rate": 0.00021740147616852295, + "loss": 0.0, + "step": 23909 + }, + { + "epoch": 2.2310348045161894, + "grad_norm": NaN, + "learning_rate": 0.00021739471890620322, + "loss": 0.0, + "step": 23910 + }, + { + "epoch": 2.2311281142110664, + "grad_norm": NaN, + "learning_rate": 0.0002173879614725187, + "loss": 0.0, + "step": 23911 + }, + { + "epoch": 2.231221423905944, + "grad_norm": NaN, + "learning_rate": 0.00021738120386748648, + "loss": 0.0, + "step": 23912 + }, + { + "epoch": 2.231314733600821, + "grad_norm": NaN, + "learning_rate": 0.0002173744460911239, + "loss": 0.0, + "step": 23913 + }, + { + "epoch": 2.2314080432956986, + "grad_norm": NaN, + "learning_rate": 0.00021736768814344803, + "loss": 0.0, + "step": 23914 + }, + { + "epoch": 2.2315013529905756, + "grad_norm": NaN, + "learning_rate": 0.00021736093002447604, + "loss": 0.0, + "step": 23915 + }, + { + "epoch": 2.231594662685453, + "grad_norm": NaN, + "learning_rate": 0.00021735417173422524, + "loss": 0.0, + "step": 23916 + }, + { + "epoch": 2.2316879723803305, + "grad_norm": NaN, + "learning_rate": 0.0002173474132727127, + "loss": 0.0, + "step": 23917 + }, + { + "epoch": 2.2317812820752074, + "grad_norm": NaN, + "learning_rate": 0.00021734065463995566, + "loss": 0.0, + "step": 23918 + }, + { + "epoch": 2.231874591770085, + "grad_norm": NaN, + "learning_rate": 0.00021733389583597124, + "loss": 0.0, + "step": 23919 + }, + { + "epoch": 2.2319679014649623, + "grad_norm": NaN, + "learning_rate": 0.0002173271368607767, + "loss": 0.0, + "step": 23920 + }, + { + "epoch": 2.2320612111598397, + "grad_norm": NaN, + "learning_rate": 0.00021732037771438918, + "loss": 0.0, + "step": 23921 + }, + { + "epoch": 2.2321545208547167, + "grad_norm": NaN, + "learning_rate": 0.00021731361839682586, + "loss": 0.0, + "step": 23922 + }, + { + "epoch": 2.232247830549594, + "grad_norm": NaN, + "learning_rate": 0.00021730685890810396, + "loss": 0.0, + "step": 23923 + }, + { + "epoch": 2.2323411402444715, + "grad_norm": NaN, + "learning_rate": 0.00021730009924824073, + "loss": 0.0, + "step": 23924 + }, + { + "epoch": 2.2324344499393485, + "grad_norm": NaN, + "learning_rate": 0.0002172933394172532, + "loss": 0.0, + "step": 23925 + }, + { + "epoch": 2.232527759634226, + "grad_norm": NaN, + "learning_rate": 0.00021728657941515867, + "loss": 0.0, + "step": 23926 + }, + { + "epoch": 2.2326210693291033, + "grad_norm": NaN, + "learning_rate": 0.00021727981924197427, + "loss": 0.0, + "step": 23927 + }, + { + "epoch": 2.2327143790239807, + "grad_norm": NaN, + "learning_rate": 0.00021727305889771722, + "loss": 0.0, + "step": 23928 + }, + { + "epoch": 2.2328076887188577, + "grad_norm": NaN, + "learning_rate": 0.00021726629838240472, + "loss": 0.0, + "step": 23929 + }, + { + "epoch": 2.232900998413735, + "grad_norm": NaN, + "learning_rate": 0.000217259537696054, + "loss": 0.0, + "step": 23930 + }, + { + "epoch": 2.2329943081086125, + "grad_norm": NaN, + "learning_rate": 0.00021725277683868217, + "loss": 0.0, + "step": 23931 + }, + { + "epoch": 2.23308761780349, + "grad_norm": NaN, + "learning_rate": 0.00021724601581030642, + "loss": 0.0, + "step": 23932 + }, + { + "epoch": 2.233180927498367, + "grad_norm": NaN, + "learning_rate": 0.000217239254610944, + "loss": 0.0, + "step": 23933 + }, + { + "epoch": 2.2332742371932444, + "grad_norm": NaN, + "learning_rate": 0.00021723249324061205, + "loss": 0.0, + "step": 23934 + }, + { + "epoch": 2.233367546888122, + "grad_norm": NaN, + "learning_rate": 0.0002172257316993278, + "loss": 0.0, + "step": 23935 + }, + { + "epoch": 2.233460856582999, + "grad_norm": NaN, + "learning_rate": 0.00021721896998710844, + "loss": 0.0, + "step": 23936 + }, + { + "epoch": 2.233554166277876, + "grad_norm": NaN, + "learning_rate": 0.00021721220810397111, + "loss": 0.0, + "step": 23937 + }, + { + "epoch": 2.2336474759727536, + "grad_norm": NaN, + "learning_rate": 0.0002172054460499331, + "loss": 0.0, + "step": 23938 + }, + { + "epoch": 2.233740785667631, + "grad_norm": NaN, + "learning_rate": 0.0002171986838250115, + "loss": 0.0, + "step": 23939 + }, + { + "epoch": 2.233834095362508, + "grad_norm": NaN, + "learning_rate": 0.0002171919214292236, + "loss": 0.0, + "step": 23940 + }, + { + "epoch": 2.2339274050573854, + "grad_norm": NaN, + "learning_rate": 0.0002171851588625865, + "loss": 0.0, + "step": 23941 + }, + { + "epoch": 2.234020714752263, + "grad_norm": NaN, + "learning_rate": 0.00021717839612511746, + "loss": 0.0, + "step": 23942 + }, + { + "epoch": 2.2341140244471402, + "grad_norm": NaN, + "learning_rate": 0.00021717163321683367, + "loss": 0.0, + "step": 23943 + }, + { + "epoch": 2.234207334142017, + "grad_norm": NaN, + "learning_rate": 0.00021716487013775232, + "loss": 0.0, + "step": 23944 + }, + { + "epoch": 2.2343006438368946, + "grad_norm": NaN, + "learning_rate": 0.00021715810688789056, + "loss": 0.0, + "step": 23945 + }, + { + "epoch": 2.234393953531772, + "grad_norm": NaN, + "learning_rate": 0.0002171513434672657, + "loss": 0.0, + "step": 23946 + }, + { + "epoch": 2.234487263226649, + "grad_norm": NaN, + "learning_rate": 0.0002171445798758948, + "loss": 0.0, + "step": 23947 + }, + { + "epoch": 2.2345805729215265, + "grad_norm": NaN, + "learning_rate": 0.00021713781611379515, + "loss": 0.0, + "step": 23948 + }, + { + "epoch": 2.234673882616404, + "grad_norm": NaN, + "learning_rate": 0.00021713105218098392, + "loss": 0.0, + "step": 23949 + }, + { + "epoch": 2.2347671923112813, + "grad_norm": NaN, + "learning_rate": 0.00021712428807747828, + "loss": 0.0, + "step": 23950 + }, + { + "epoch": 2.2348605020061583, + "grad_norm": NaN, + "learning_rate": 0.0002171175238032955, + "loss": 0.0, + "step": 23951 + }, + { + "epoch": 2.2349538117010357, + "grad_norm": NaN, + "learning_rate": 0.00021711075935845273, + "loss": 0.0, + "step": 23952 + }, + { + "epoch": 2.235047121395913, + "grad_norm": NaN, + "learning_rate": 0.00021710399474296718, + "loss": 0.0, + "step": 23953 + }, + { + "epoch": 2.2351404310907905, + "grad_norm": NaN, + "learning_rate": 0.00021709722995685602, + "loss": 0.0, + "step": 23954 + }, + { + "epoch": 2.2352337407856675, + "grad_norm": NaN, + "learning_rate": 0.0002170904650001365, + "loss": 0.0, + "step": 23955 + }, + { + "epoch": 2.235327050480545, + "grad_norm": NaN, + "learning_rate": 0.00021708369987282584, + "loss": 0.0, + "step": 23956 + }, + { + "epoch": 2.2354203601754223, + "grad_norm": NaN, + "learning_rate": 0.00021707693457494114, + "loss": 0.0, + "step": 23957 + }, + { + "epoch": 2.2355136698702998, + "grad_norm": NaN, + "learning_rate": 0.0002170701691064997, + "loss": 0.0, + "step": 23958 + }, + { + "epoch": 2.2356069795651767, + "grad_norm": NaN, + "learning_rate": 0.00021706340346751868, + "loss": 0.0, + "step": 23959 + }, + { + "epoch": 2.235700289260054, + "grad_norm": NaN, + "learning_rate": 0.0002170566376580153, + "loss": 0.0, + "step": 23960 + }, + { + "epoch": 2.2357935989549316, + "grad_norm": NaN, + "learning_rate": 0.00021704987167800671, + "loss": 0.0, + "step": 23961 + }, + { + "epoch": 2.2358869086498085, + "grad_norm": NaN, + "learning_rate": 0.00021704310552751025, + "loss": 0.0, + "step": 23962 + }, + { + "epoch": 2.235980218344686, + "grad_norm": NaN, + "learning_rate": 0.00021703633920654291, + "loss": 0.0, + "step": 23963 + }, + { + "epoch": 2.2360735280395634, + "grad_norm": NaN, + "learning_rate": 0.0002170295727151221, + "loss": 0.0, + "step": 23964 + }, + { + "epoch": 2.236166837734441, + "grad_norm": NaN, + "learning_rate": 0.00021702280605326494, + "loss": 0.0, + "step": 23965 + }, + { + "epoch": 2.236260147429318, + "grad_norm": NaN, + "learning_rate": 0.0002170160392209886, + "loss": 0.0, + "step": 23966 + }, + { + "epoch": 2.236353457124195, + "grad_norm": NaN, + "learning_rate": 0.00021700927221831036, + "loss": 0.0, + "step": 23967 + }, + { + "epoch": 2.2364467668190726, + "grad_norm": NaN, + "learning_rate": 0.00021700250504524737, + "loss": 0.0, + "step": 23968 + }, + { + "epoch": 2.2365400765139496, + "grad_norm": NaN, + "learning_rate": 0.00021699573770181686, + "loss": 0.0, + "step": 23969 + }, + { + "epoch": 2.236633386208827, + "grad_norm": NaN, + "learning_rate": 0.00021698897018803602, + "loss": 0.0, + "step": 23970 + }, + { + "epoch": 2.2367266959037044, + "grad_norm": NaN, + "learning_rate": 0.0002169822025039221, + "loss": 0.0, + "step": 23971 + }, + { + "epoch": 2.236820005598582, + "grad_norm": NaN, + "learning_rate": 0.00021697543464949226, + "loss": 0.0, + "step": 23972 + }, + { + "epoch": 2.236913315293459, + "grad_norm": NaN, + "learning_rate": 0.00021696866662476375, + "loss": 0.0, + "step": 23973 + }, + { + "epoch": 2.2370066249883362, + "grad_norm": NaN, + "learning_rate": 0.00021696189842975374, + "loss": 0.0, + "step": 23974 + }, + { + "epoch": 2.2370999346832137, + "grad_norm": NaN, + "learning_rate": 0.00021695513006447946, + "loss": 0.0, + "step": 23975 + }, + { + "epoch": 2.237193244378091, + "grad_norm": NaN, + "learning_rate": 0.00021694836152895812, + "loss": 0.0, + "step": 23976 + }, + { + "epoch": 2.237286554072968, + "grad_norm": NaN, + "learning_rate": 0.00021694159282320693, + "loss": 0.0, + "step": 23977 + }, + { + "epoch": 2.2373798637678455, + "grad_norm": NaN, + "learning_rate": 0.0002169348239472431, + "loss": 0.0, + "step": 23978 + }, + { + "epoch": 2.237473173462723, + "grad_norm": NaN, + "learning_rate": 0.00021692805490108384, + "loss": 0.0, + "step": 23979 + }, + { + "epoch": 2.2375664831576, + "grad_norm": NaN, + "learning_rate": 0.00021692128568474636, + "loss": 0.0, + "step": 23980 + }, + { + "epoch": 2.2376597928524773, + "grad_norm": NaN, + "learning_rate": 0.00021691451629824788, + "loss": 0.0, + "step": 23981 + }, + { + "epoch": 2.2377531025473547, + "grad_norm": NaN, + "learning_rate": 0.0002169077467416056, + "loss": 0.0, + "step": 23982 + }, + { + "epoch": 2.237846412242232, + "grad_norm": NaN, + "learning_rate": 0.00021690097701483675, + "loss": 0.0, + "step": 23983 + }, + { + "epoch": 2.237939721937109, + "grad_norm": NaN, + "learning_rate": 0.00021689420711795855, + "loss": 0.0, + "step": 23984 + }, + { + "epoch": 2.2380330316319865, + "grad_norm": NaN, + "learning_rate": 0.00021688743705098815, + "loss": 0.0, + "step": 23985 + }, + { + "epoch": 2.238126341326864, + "grad_norm": NaN, + "learning_rate": 0.00021688066681394285, + "loss": 0.0, + "step": 23986 + }, + { + "epoch": 2.2382196510217414, + "grad_norm": NaN, + "learning_rate": 0.0002168738964068398, + "loss": 0.0, + "step": 23987 + }, + { + "epoch": 2.2383129607166183, + "grad_norm": NaN, + "learning_rate": 0.00021686712582969625, + "loss": 0.0, + "step": 23988 + }, + { + "epoch": 2.2384062704114958, + "grad_norm": NaN, + "learning_rate": 0.00021686035508252939, + "loss": 0.0, + "step": 23989 + }, + { + "epoch": 2.238499580106373, + "grad_norm": NaN, + "learning_rate": 0.00021685358416535645, + "loss": 0.0, + "step": 23990 + }, + { + "epoch": 2.23859288980125, + "grad_norm": NaN, + "learning_rate": 0.0002168468130781947, + "loss": 0.0, + "step": 23991 + }, + { + "epoch": 2.2386861994961276, + "grad_norm": NaN, + "learning_rate": 0.00021684004182106125, + "loss": 0.0, + "step": 23992 + }, + { + "epoch": 2.238779509191005, + "grad_norm": NaN, + "learning_rate": 0.00021683327039397342, + "loss": 0.0, + "step": 23993 + }, + { + "epoch": 2.2388728188858824, + "grad_norm": NaN, + "learning_rate": 0.00021682649879694834, + "loss": 0.0, + "step": 23994 + }, + { + "epoch": 2.2389661285807594, + "grad_norm": NaN, + "learning_rate": 0.00021681972703000327, + "loss": 0.0, + "step": 23995 + }, + { + "epoch": 2.239059438275637, + "grad_norm": NaN, + "learning_rate": 0.00021681295509315545, + "loss": 0.0, + "step": 23996 + }, + { + "epoch": 2.2391527479705142, + "grad_norm": NaN, + "learning_rate": 0.00021680618298642207, + "loss": 0.0, + "step": 23997 + }, + { + "epoch": 2.239246057665391, + "grad_norm": NaN, + "learning_rate": 0.00021679941070982037, + "loss": 0.0, + "step": 23998 + }, + { + "epoch": 2.2393393673602686, + "grad_norm": NaN, + "learning_rate": 0.00021679263826336755, + "loss": 0.0, + "step": 23999 + }, + { + "epoch": 2.239432677055146, + "grad_norm": NaN, + "learning_rate": 0.00021678586564708084, + "loss": 0.0, + "step": 24000 + }, + { + "epoch": 2.2395259867500235, + "grad_norm": NaN, + "learning_rate": 0.00021677909286097746, + "loss": 0.0, + "step": 24001 + }, + { + "epoch": 2.2396192964449004, + "grad_norm": NaN, + "learning_rate": 0.0002167723199050746, + "loss": 0.0, + "step": 24002 + }, + { + "epoch": 2.239712606139778, + "grad_norm": NaN, + "learning_rate": 0.0002167655467793895, + "loss": 0.0, + "step": 24003 + }, + { + "epoch": 2.2398059158346553, + "grad_norm": NaN, + "learning_rate": 0.00021675877348393944, + "loss": 0.0, + "step": 24004 + }, + { + "epoch": 2.2398992255295327, + "grad_norm": NaN, + "learning_rate": 0.0002167520000187416, + "loss": 0.0, + "step": 24005 + }, + { + "epoch": 2.2399925352244097, + "grad_norm": NaN, + "learning_rate": 0.00021674522638381316, + "loss": 0.0, + "step": 24006 + }, + { + "epoch": 2.240085844919287, + "grad_norm": NaN, + "learning_rate": 0.00021673845257917136, + "loss": 0.0, + "step": 24007 + }, + { + "epoch": 2.2401791546141645, + "grad_norm": NaN, + "learning_rate": 0.0002167316786048335, + "loss": 0.0, + "step": 24008 + }, + { + "epoch": 2.240272464309042, + "grad_norm": NaN, + "learning_rate": 0.0002167249044608167, + "loss": 0.0, + "step": 24009 + }, + { + "epoch": 2.240365774003919, + "grad_norm": NaN, + "learning_rate": 0.00021671813014713826, + "loss": 0.0, + "step": 24010 + }, + { + "epoch": 2.2404590836987963, + "grad_norm": NaN, + "learning_rate": 0.00021671135566381537, + "loss": 0.0, + "step": 24011 + }, + { + "epoch": 2.2405523933936737, + "grad_norm": NaN, + "learning_rate": 0.00021670458101086528, + "loss": 0.0, + "step": 24012 + }, + { + "epoch": 2.2406457030885507, + "grad_norm": NaN, + "learning_rate": 0.0002166978061883052, + "loss": 0.0, + "step": 24013 + }, + { + "epoch": 2.240739012783428, + "grad_norm": NaN, + "learning_rate": 0.00021669103119615233, + "loss": 0.0, + "step": 24014 + }, + { + "epoch": 2.2408323224783055, + "grad_norm": NaN, + "learning_rate": 0.00021668425603442391, + "loss": 0.0, + "step": 24015 + }, + { + "epoch": 2.240925632173183, + "grad_norm": NaN, + "learning_rate": 0.00021667748070313723, + "loss": 0.0, + "step": 24016 + }, + { + "epoch": 2.24101894186806, + "grad_norm": NaN, + "learning_rate": 0.00021667070520230944, + "loss": 0.0, + "step": 24017 + }, + { + "epoch": 2.2411122515629374, + "grad_norm": NaN, + "learning_rate": 0.0002166639295319578, + "loss": 0.0, + "step": 24018 + }, + { + "epoch": 2.241205561257815, + "grad_norm": NaN, + "learning_rate": 0.00021665715369209953, + "loss": 0.0, + "step": 24019 + }, + { + "epoch": 2.2412988709526918, + "grad_norm": NaN, + "learning_rate": 0.00021665037768275186, + "loss": 0.0, + "step": 24020 + }, + { + "epoch": 2.241392180647569, + "grad_norm": NaN, + "learning_rate": 0.00021664360150393206, + "loss": 0.0, + "step": 24021 + }, + { + "epoch": 2.2414854903424466, + "grad_norm": NaN, + "learning_rate": 0.00021663682515565727, + "loss": 0.0, + "step": 24022 + }, + { + "epoch": 2.241578800037324, + "grad_norm": NaN, + "learning_rate": 0.00021663004863794482, + "loss": 0.0, + "step": 24023 + }, + { + "epoch": 2.241672109732201, + "grad_norm": NaN, + "learning_rate": 0.00021662327195081185, + "loss": 0.0, + "step": 24024 + }, + { + "epoch": 2.2417654194270784, + "grad_norm": NaN, + "learning_rate": 0.00021661649509427565, + "loss": 0.0, + "step": 24025 + }, + { + "epoch": 2.241858729121956, + "grad_norm": NaN, + "learning_rate": 0.0002166097180683535, + "loss": 0.0, + "step": 24026 + }, + { + "epoch": 2.2419520388168332, + "grad_norm": NaN, + "learning_rate": 0.00021660294087306247, + "loss": 0.0, + "step": 24027 + }, + { + "epoch": 2.2420453485117102, + "grad_norm": NaN, + "learning_rate": 0.00021659616350841996, + "loss": 0.0, + "step": 24028 + }, + { + "epoch": 2.2421386582065876, + "grad_norm": NaN, + "learning_rate": 0.00021658938597444308, + "loss": 0.0, + "step": 24029 + }, + { + "epoch": 2.242231967901465, + "grad_norm": NaN, + "learning_rate": 0.00021658260827114913, + "loss": 0.0, + "step": 24030 + }, + { + "epoch": 2.2423252775963425, + "grad_norm": NaN, + "learning_rate": 0.00021657583039855536, + "loss": 0.0, + "step": 24031 + }, + { + "epoch": 2.2424185872912195, + "grad_norm": NaN, + "learning_rate": 0.00021656905235667898, + "loss": 0.0, + "step": 24032 + }, + { + "epoch": 2.242511896986097, + "grad_norm": NaN, + "learning_rate": 0.00021656227414553717, + "loss": 0.0, + "step": 24033 + }, + { + "epoch": 2.2426052066809743, + "grad_norm": NaN, + "learning_rate": 0.00021655549576514723, + "loss": 0.0, + "step": 24034 + }, + { + "epoch": 2.2426985163758513, + "grad_norm": NaN, + "learning_rate": 0.0002165487172155264, + "loss": 0.0, + "step": 24035 + }, + { + "epoch": 2.2427918260707287, + "grad_norm": NaN, + "learning_rate": 0.00021654193849669187, + "loss": 0.0, + "step": 24036 + }, + { + "epoch": 2.242885135765606, + "grad_norm": NaN, + "learning_rate": 0.0002165351596086609, + "loss": 0.0, + "step": 24037 + }, + { + "epoch": 2.2429784454604835, + "grad_norm": NaN, + "learning_rate": 0.00021652838055145074, + "loss": 0.0, + "step": 24038 + }, + { + "epoch": 2.2430717551553605, + "grad_norm": NaN, + "learning_rate": 0.00021652160132507866, + "loss": 0.0, + "step": 24039 + }, + { + "epoch": 2.243165064850238, + "grad_norm": NaN, + "learning_rate": 0.0002165148219295618, + "loss": 0.0, + "step": 24040 + }, + { + "epoch": 2.2432583745451153, + "grad_norm": NaN, + "learning_rate": 0.00021650804236491745, + "loss": 0.0, + "step": 24041 + }, + { + "epoch": 2.2433516842399923, + "grad_norm": NaN, + "learning_rate": 0.00021650126263116288, + "loss": 0.0, + "step": 24042 + }, + { + "epoch": 2.2434449939348697, + "grad_norm": NaN, + "learning_rate": 0.00021649448272831527, + "loss": 0.0, + "step": 24043 + }, + { + "epoch": 2.243538303629747, + "grad_norm": NaN, + "learning_rate": 0.0002164877026563919, + "loss": 0.0, + "step": 24044 + }, + { + "epoch": 2.2436316133246246, + "grad_norm": NaN, + "learning_rate": 0.00021648092241540996, + "loss": 0.0, + "step": 24045 + }, + { + "epoch": 2.2437249230195015, + "grad_norm": NaN, + "learning_rate": 0.0002164741420053868, + "loss": 0.0, + "step": 24046 + }, + { + "epoch": 2.243818232714379, + "grad_norm": NaN, + "learning_rate": 0.00021646736142633957, + "loss": 0.0, + "step": 24047 + }, + { + "epoch": 2.2439115424092564, + "grad_norm": NaN, + "learning_rate": 0.00021646058067828548, + "loss": 0.0, + "step": 24048 + }, + { + "epoch": 2.244004852104134, + "grad_norm": NaN, + "learning_rate": 0.00021645379976124185, + "loss": 0.0, + "step": 24049 + }, + { + "epoch": 2.244098161799011, + "grad_norm": NaN, + "learning_rate": 0.0002164470186752259, + "loss": 0.0, + "step": 24050 + }, + { + "epoch": 2.244191471493888, + "grad_norm": NaN, + "learning_rate": 0.00021644023742025485, + "loss": 0.0, + "step": 24051 + }, + { + "epoch": 2.2442847811887656, + "grad_norm": NaN, + "learning_rate": 0.00021643345599634597, + "loss": 0.0, + "step": 24052 + }, + { + "epoch": 2.244378090883643, + "grad_norm": NaN, + "learning_rate": 0.0002164266744035165, + "loss": 0.0, + "step": 24053 + }, + { + "epoch": 2.24447140057852, + "grad_norm": NaN, + "learning_rate": 0.00021641989264178365, + "loss": 0.0, + "step": 24054 + }, + { + "epoch": 2.2445647102733974, + "grad_norm": NaN, + "learning_rate": 0.00021641311071116467, + "loss": 0.0, + "step": 24055 + }, + { + "epoch": 2.244658019968275, + "grad_norm": NaN, + "learning_rate": 0.00021640632861167687, + "loss": 0.0, + "step": 24056 + }, + { + "epoch": 2.244751329663152, + "grad_norm": NaN, + "learning_rate": 0.00021639954634333742, + "loss": 0.0, + "step": 24057 + }, + { + "epoch": 2.2448446393580292, + "grad_norm": NaN, + "learning_rate": 0.0002163927639061636, + "loss": 0.0, + "step": 24058 + }, + { + "epoch": 2.2449379490529067, + "grad_norm": NaN, + "learning_rate": 0.00021638598130017264, + "loss": 0.0, + "step": 24059 + }, + { + "epoch": 2.245031258747784, + "grad_norm": NaN, + "learning_rate": 0.0002163791985253818, + "loss": 0.0, + "step": 24060 + }, + { + "epoch": 2.245124568442661, + "grad_norm": NaN, + "learning_rate": 0.00021637241558180835, + "loss": 0.0, + "step": 24061 + }, + { + "epoch": 2.2452178781375385, + "grad_norm": NaN, + "learning_rate": 0.0002163656324694695, + "loss": 0.0, + "step": 24062 + }, + { + "epoch": 2.245311187832416, + "grad_norm": NaN, + "learning_rate": 0.00021635884918838247, + "loss": 0.0, + "step": 24063 + }, + { + "epoch": 2.245404497527293, + "grad_norm": NaN, + "learning_rate": 0.00021635206573856453, + "loss": 0.0, + "step": 24064 + }, + { + "epoch": 2.2454978072221703, + "grad_norm": NaN, + "learning_rate": 0.000216345282120033, + "loss": 0.0, + "step": 24065 + }, + { + "epoch": 2.2455911169170477, + "grad_norm": NaN, + "learning_rate": 0.00021633849833280504, + "loss": 0.0, + "step": 24066 + }, + { + "epoch": 2.245684426611925, + "grad_norm": NaN, + "learning_rate": 0.00021633171437689791, + "loss": 0.0, + "step": 24067 + }, + { + "epoch": 2.245777736306802, + "grad_norm": NaN, + "learning_rate": 0.00021632493025232894, + "loss": 0.0, + "step": 24068 + }, + { + "epoch": 2.2458710460016795, + "grad_norm": NaN, + "learning_rate": 0.00021631814595911525, + "loss": 0.0, + "step": 24069 + }, + { + "epoch": 2.245964355696557, + "grad_norm": NaN, + "learning_rate": 0.00021631136149727418, + "loss": 0.0, + "step": 24070 + }, + { + "epoch": 2.2460576653914344, + "grad_norm": NaN, + "learning_rate": 0.000216304576866823, + "loss": 0.0, + "step": 24071 + }, + { + "epoch": 2.2461509750863113, + "grad_norm": NaN, + "learning_rate": 0.00021629779206777888, + "loss": 0.0, + "step": 24072 + }, + { + "epoch": 2.2462442847811888, + "grad_norm": NaN, + "learning_rate": 0.00021629100710015913, + "loss": 0.0, + "step": 24073 + }, + { + "epoch": 2.246337594476066, + "grad_norm": NaN, + "learning_rate": 0.00021628422196398099, + "loss": 0.0, + "step": 24074 + }, + { + "epoch": 2.246430904170943, + "grad_norm": NaN, + "learning_rate": 0.00021627743665926172, + "loss": 0.0, + "step": 24075 + }, + { + "epoch": 2.2465242138658206, + "grad_norm": NaN, + "learning_rate": 0.0002162706511860185, + "loss": 0.0, + "step": 24076 + }, + { + "epoch": 2.246617523560698, + "grad_norm": NaN, + "learning_rate": 0.00021626386554426873, + "loss": 0.0, + "step": 24077 + }, + { + "epoch": 2.2467108332555754, + "grad_norm": NaN, + "learning_rate": 0.0002162570797340295, + "loss": 0.0, + "step": 24078 + }, + { + "epoch": 2.2468041429504524, + "grad_norm": NaN, + "learning_rate": 0.00021625029375531822, + "loss": 0.0, + "step": 24079 + }, + { + "epoch": 2.24689745264533, + "grad_norm": NaN, + "learning_rate": 0.00021624350760815201, + "loss": 0.0, + "step": 24080 + }, + { + "epoch": 2.2469907623402072, + "grad_norm": NaN, + "learning_rate": 0.0002162367212925482, + "loss": 0.0, + "step": 24081 + }, + { + "epoch": 2.2470840720350846, + "grad_norm": NaN, + "learning_rate": 0.00021622993480852404, + "loss": 0.0, + "step": 24082 + }, + { + "epoch": 2.2471773817299616, + "grad_norm": NaN, + "learning_rate": 0.00021622314815609677, + "loss": 0.0, + "step": 24083 + }, + { + "epoch": 2.247270691424839, + "grad_norm": NaN, + "learning_rate": 0.0002162163613352836, + "loss": 0.0, + "step": 24084 + }, + { + "epoch": 2.2473640011197165, + "grad_norm": NaN, + "learning_rate": 0.00021620957434610188, + "loss": 0.0, + "step": 24085 + }, + { + "epoch": 2.2474573108145934, + "grad_norm": NaN, + "learning_rate": 0.00021620278718856886, + "loss": 0.0, + "step": 24086 + }, + { + "epoch": 2.247550620509471, + "grad_norm": NaN, + "learning_rate": 0.00021619599986270172, + "loss": 0.0, + "step": 24087 + }, + { + "epoch": 2.2476439302043483, + "grad_norm": NaN, + "learning_rate": 0.00021618921236851778, + "loss": 0.0, + "step": 24088 + }, + { + "epoch": 2.2477372398992257, + "grad_norm": NaN, + "learning_rate": 0.00021618242470603424, + "loss": 0.0, + "step": 24089 + }, + { + "epoch": 2.2478305495941027, + "grad_norm": NaN, + "learning_rate": 0.00021617563687526843, + "loss": 0.0, + "step": 24090 + }, + { + "epoch": 2.24792385928898, + "grad_norm": NaN, + "learning_rate": 0.00021616884887623757, + "loss": 0.0, + "step": 24091 + }, + { + "epoch": 2.2480171689838575, + "grad_norm": NaN, + "learning_rate": 0.00021616206070895896, + "loss": 0.0, + "step": 24092 + }, + { + "epoch": 2.248110478678735, + "grad_norm": NaN, + "learning_rate": 0.0002161552723734498, + "loss": 0.0, + "step": 24093 + }, + { + "epoch": 2.248203788373612, + "grad_norm": NaN, + "learning_rate": 0.0002161484838697274, + "loss": 0.0, + "step": 24094 + }, + { + "epoch": 2.2482970980684893, + "grad_norm": NaN, + "learning_rate": 0.00021614169519780895, + "loss": 0.0, + "step": 24095 + }, + { + "epoch": 2.2483904077633667, + "grad_norm": NaN, + "learning_rate": 0.0002161349063577118, + "loss": 0.0, + "step": 24096 + }, + { + "epoch": 2.2484837174582437, + "grad_norm": NaN, + "learning_rate": 0.00021612811734945315, + "loss": 0.0, + "step": 24097 + }, + { + "epoch": 2.248577027153121, + "grad_norm": NaN, + "learning_rate": 0.00021612132817305034, + "loss": 0.0, + "step": 24098 + }, + { + "epoch": 2.2486703368479986, + "grad_norm": NaN, + "learning_rate": 0.0002161145388285205, + "loss": 0.0, + "step": 24099 + }, + { + "epoch": 2.248763646542876, + "grad_norm": NaN, + "learning_rate": 0.00021610774931588104, + "loss": 0.0, + "step": 24100 + }, + { + "epoch": 2.248856956237753, + "grad_norm": NaN, + "learning_rate": 0.00021610095963514914, + "loss": 0.0, + "step": 24101 + }, + { + "epoch": 2.2489502659326304, + "grad_norm": NaN, + "learning_rate": 0.00021609416978634208, + "loss": 0.0, + "step": 24102 + }, + { + "epoch": 2.249043575627508, + "grad_norm": NaN, + "learning_rate": 0.0002160873797694771, + "loss": 0.0, + "step": 24103 + }, + { + "epoch": 2.249136885322385, + "grad_norm": NaN, + "learning_rate": 0.00021608058958457152, + "loss": 0.0, + "step": 24104 + }, + { + "epoch": 2.249230195017262, + "grad_norm": NaN, + "learning_rate": 0.00021607379923164254, + "loss": 0.0, + "step": 24105 + }, + { + "epoch": 2.2493235047121396, + "grad_norm": NaN, + "learning_rate": 0.0002160670087107075, + "loss": 0.0, + "step": 24106 + }, + { + "epoch": 2.249416814407017, + "grad_norm": NaN, + "learning_rate": 0.00021606021802178364, + "loss": 0.0, + "step": 24107 + }, + { + "epoch": 2.249510124101894, + "grad_norm": NaN, + "learning_rate": 0.00021605342716488814, + "loss": 0.0, + "step": 24108 + }, + { + "epoch": 2.2496034337967714, + "grad_norm": NaN, + "learning_rate": 0.0002160466361400384, + "loss": 0.0, + "step": 24109 + }, + { + "epoch": 2.249696743491649, + "grad_norm": NaN, + "learning_rate": 0.00021603984494725163, + "loss": 0.0, + "step": 24110 + }, + { + "epoch": 2.2497900531865263, + "grad_norm": NaN, + "learning_rate": 0.00021603305358654506, + "loss": 0.0, + "step": 24111 + }, + { + "epoch": 2.2498833628814032, + "grad_norm": NaN, + "learning_rate": 0.00021602626205793603, + "loss": 0.0, + "step": 24112 + }, + { + "epoch": 2.2499766725762806, + "grad_norm": NaN, + "learning_rate": 0.00021601947036144176, + "loss": 0.0, + "step": 24113 + }, + { + "epoch": 2.250069982271158, + "grad_norm": NaN, + "learning_rate": 0.00021601267849707953, + "loss": 0.0, + "step": 24114 + }, + { + "epoch": 2.250163291966035, + "grad_norm": NaN, + "learning_rate": 0.00021600588646486662, + "loss": 0.0, + "step": 24115 + }, + { + "epoch": 2.2502566016609125, + "grad_norm": NaN, + "learning_rate": 0.0002159990942648203, + "loss": 0.0, + "step": 24116 + }, + { + "epoch": 2.25034991135579, + "grad_norm": NaN, + "learning_rate": 0.00021599230189695782, + "loss": 0.0, + "step": 24117 + }, + { + "epoch": 2.2504432210506673, + "grad_norm": NaN, + "learning_rate": 0.00021598550936129644, + "loss": 0.0, + "step": 24118 + }, + { + "epoch": 2.2505365307455443, + "grad_norm": NaN, + "learning_rate": 0.0002159787166578535, + "loss": 0.0, + "step": 24119 + }, + { + "epoch": 2.2506298404404217, + "grad_norm": NaN, + "learning_rate": 0.00021597192378664624, + "loss": 0.0, + "step": 24120 + }, + { + "epoch": 2.250723150135299, + "grad_norm": NaN, + "learning_rate": 0.00021596513074769184, + "loss": 0.0, + "step": 24121 + }, + { + "epoch": 2.2508164598301765, + "grad_norm": NaN, + "learning_rate": 0.0002159583375410077, + "loss": 0.0, + "step": 24122 + }, + { + "epoch": 2.2509097695250535, + "grad_norm": NaN, + "learning_rate": 0.00021595154416661108, + "loss": 0.0, + "step": 24123 + }, + { + "epoch": 2.251003079219931, + "grad_norm": NaN, + "learning_rate": 0.00021594475062451917, + "loss": 0.0, + "step": 24124 + }, + { + "epoch": 2.2510963889148083, + "grad_norm": NaN, + "learning_rate": 0.00021593795691474932, + "loss": 0.0, + "step": 24125 + }, + { + "epoch": 2.2511896986096858, + "grad_norm": NaN, + "learning_rate": 0.00021593116303731877, + "loss": 0.0, + "step": 24126 + }, + { + "epoch": 2.2512830083045627, + "grad_norm": NaN, + "learning_rate": 0.00021592436899224484, + "loss": 0.0, + "step": 24127 + }, + { + "epoch": 2.25137631799944, + "grad_norm": NaN, + "learning_rate": 0.00021591757477954472, + "loss": 0.0, + "step": 24128 + }, + { + "epoch": 2.2514696276943176, + "grad_norm": NaN, + "learning_rate": 0.00021591078039923575, + "loss": 0.0, + "step": 24129 + }, + { + "epoch": 2.2515629373891946, + "grad_norm": NaN, + "learning_rate": 0.00021590398585133517, + "loss": 0.0, + "step": 24130 + }, + { + "epoch": 2.251656247084072, + "grad_norm": NaN, + "learning_rate": 0.0002158971911358603, + "loss": 0.0, + "step": 24131 + }, + { + "epoch": 2.2517495567789494, + "grad_norm": NaN, + "learning_rate": 0.00021589039625282837, + "loss": 0.0, + "step": 24132 + }, + { + "epoch": 2.251842866473827, + "grad_norm": NaN, + "learning_rate": 0.00021588360120225667, + "loss": 0.0, + "step": 24133 + }, + { + "epoch": 2.251936176168704, + "grad_norm": NaN, + "learning_rate": 0.00021587680598416254, + "loss": 0.0, + "step": 24134 + }, + { + "epoch": 2.252029485863581, + "grad_norm": NaN, + "learning_rate": 0.00021587001059856318, + "loss": 0.0, + "step": 24135 + }, + { + "epoch": 2.2521227955584586, + "grad_norm": NaN, + "learning_rate": 0.0002158632150454759, + "loss": 0.0, + "step": 24136 + }, + { + "epoch": 2.2522161052533356, + "grad_norm": NaN, + "learning_rate": 0.00021585641932491794, + "loss": 0.0, + "step": 24137 + }, + { + "epoch": 2.252309414948213, + "grad_norm": NaN, + "learning_rate": 0.00021584962343690662, + "loss": 0.0, + "step": 24138 + }, + { + "epoch": 2.2524027246430904, + "grad_norm": NaN, + "learning_rate": 0.0002158428273814593, + "loss": 0.0, + "step": 24139 + }, + { + "epoch": 2.252496034337968, + "grad_norm": NaN, + "learning_rate": 0.0002158360311585931, + "loss": 0.0, + "step": 24140 + }, + { + "epoch": 2.252589344032845, + "grad_norm": NaN, + "learning_rate": 0.00021582923476832537, + "loss": 0.0, + "step": 24141 + }, + { + "epoch": 2.2526826537277223, + "grad_norm": NaN, + "learning_rate": 0.00021582243821067343, + "loss": 0.0, + "step": 24142 + }, + { + "epoch": 2.2527759634225997, + "grad_norm": NaN, + "learning_rate": 0.00021581564148565448, + "loss": 0.0, + "step": 24143 + }, + { + "epoch": 2.252869273117477, + "grad_norm": NaN, + "learning_rate": 0.00021580884459328588, + "loss": 0.0, + "step": 24144 + }, + { + "epoch": 2.252962582812354, + "grad_norm": NaN, + "learning_rate": 0.00021580204753358492, + "loss": 0.0, + "step": 24145 + }, + { + "epoch": 2.2530558925072315, + "grad_norm": NaN, + "learning_rate": 0.00021579525030656876, + "loss": 0.0, + "step": 24146 + }, + { + "epoch": 2.253149202202109, + "grad_norm": NaN, + "learning_rate": 0.00021578845291225486, + "loss": 0.0, + "step": 24147 + }, + { + "epoch": 2.2532425118969863, + "grad_norm": NaN, + "learning_rate": 0.00021578165535066034, + "loss": 0.0, + "step": 24148 + }, + { + "epoch": 2.2533358215918633, + "grad_norm": NaN, + "learning_rate": 0.0002157748576218026, + "loss": 0.0, + "step": 24149 + }, + { + "epoch": 2.2534291312867407, + "grad_norm": NaN, + "learning_rate": 0.00021576805972569883, + "loss": 0.0, + "step": 24150 + }, + { + "epoch": 2.253522440981618, + "grad_norm": NaN, + "learning_rate": 0.0002157612616623664, + "loss": 0.0, + "step": 24151 + }, + { + "epoch": 2.253615750676495, + "grad_norm": NaN, + "learning_rate": 0.0002157544634318226, + "loss": 0.0, + "step": 24152 + }, + { + "epoch": 2.2537090603713725, + "grad_norm": NaN, + "learning_rate": 0.00021574766503408462, + "loss": 0.0, + "step": 24153 + }, + { + "epoch": 2.25380237006625, + "grad_norm": NaN, + "learning_rate": 0.00021574086646916981, + "loss": 0.0, + "step": 24154 + }, + { + "epoch": 2.2538956797611274, + "grad_norm": NaN, + "learning_rate": 0.0002157340677370955, + "loss": 0.0, + "step": 24155 + }, + { + "epoch": 2.2539889894560043, + "grad_norm": NaN, + "learning_rate": 0.00021572726883787888, + "loss": 0.0, + "step": 24156 + }, + { + "epoch": 2.2540822991508818, + "grad_norm": NaN, + "learning_rate": 0.00021572046977153726, + "loss": 0.0, + "step": 24157 + }, + { + "epoch": 2.254175608845759, + "grad_norm": NaN, + "learning_rate": 0.00021571367053808803, + "loss": 0.0, + "step": 24158 + }, + { + "epoch": 2.254268918540636, + "grad_norm": NaN, + "learning_rate": 0.00021570687113754835, + "loss": 0.0, + "step": 24159 + }, + { + "epoch": 2.2543622282355136, + "grad_norm": NaN, + "learning_rate": 0.00021570007156993556, + "loss": 0.0, + "step": 24160 + }, + { + "epoch": 2.254455537930391, + "grad_norm": NaN, + "learning_rate": 0.00021569327183526698, + "loss": 0.0, + "step": 24161 + }, + { + "epoch": 2.2545488476252684, + "grad_norm": NaN, + "learning_rate": 0.00021568647193355984, + "loss": 0.0, + "step": 24162 + }, + { + "epoch": 2.2546421573201454, + "grad_norm": NaN, + "learning_rate": 0.00021567967186483146, + "loss": 0.0, + "step": 24163 + }, + { + "epoch": 2.254735467015023, + "grad_norm": NaN, + "learning_rate": 0.00021567287162909914, + "loss": 0.0, + "step": 24164 + }, + { + "epoch": 2.2548287767099002, + "grad_norm": NaN, + "learning_rate": 0.00021566607122638017, + "loss": 0.0, + "step": 24165 + }, + { + "epoch": 2.2549220864047776, + "grad_norm": NaN, + "learning_rate": 0.00021565927065669182, + "loss": 0.0, + "step": 24166 + }, + { + "epoch": 2.2550153960996546, + "grad_norm": NaN, + "learning_rate": 0.00021565246992005143, + "loss": 0.0, + "step": 24167 + }, + { + "epoch": 2.255108705794532, + "grad_norm": NaN, + "learning_rate": 0.0002156456690164762, + "loss": 0.0, + "step": 24168 + }, + { + "epoch": 2.2552020154894095, + "grad_norm": NaN, + "learning_rate": 0.00021563886794598352, + "loss": 0.0, + "step": 24169 + }, + { + "epoch": 2.255295325184287, + "grad_norm": NaN, + "learning_rate": 0.0002156320667085906, + "loss": 0.0, + "step": 24170 + }, + { + "epoch": 2.255388634879164, + "grad_norm": NaN, + "learning_rate": 0.00021562526530431484, + "loss": 0.0, + "step": 24171 + }, + { + "epoch": 2.2554819445740413, + "grad_norm": NaN, + "learning_rate": 0.00021561846373317338, + "loss": 0.0, + "step": 24172 + }, + { + "epoch": 2.2555752542689187, + "grad_norm": NaN, + "learning_rate": 0.00021561166199518365, + "loss": 0.0, + "step": 24173 + }, + { + "epoch": 2.2556685639637957, + "grad_norm": NaN, + "learning_rate": 0.00021560486009036294, + "loss": 0.0, + "step": 24174 + }, + { + "epoch": 2.255761873658673, + "grad_norm": NaN, + "learning_rate": 0.0002155980580187285, + "loss": 0.0, + "step": 24175 + }, + { + "epoch": 2.2558551833535505, + "grad_norm": NaN, + "learning_rate": 0.00021559125578029758, + "loss": 0.0, + "step": 24176 + }, + { + "epoch": 2.255948493048428, + "grad_norm": NaN, + "learning_rate": 0.00021558445337508756, + "loss": 0.0, + "step": 24177 + }, + { + "epoch": 2.256041802743305, + "grad_norm": NaN, + "learning_rate": 0.00021557765080311566, + "loss": 0.0, + "step": 24178 + }, + { + "epoch": 2.2561351124381823, + "grad_norm": NaN, + "learning_rate": 0.00021557084806439925, + "loss": 0.0, + "step": 24179 + }, + { + "epoch": 2.2562284221330597, + "grad_norm": NaN, + "learning_rate": 0.00021556404515895563, + "loss": 0.0, + "step": 24180 + }, + { + "epoch": 2.2563217318279367, + "grad_norm": NaN, + "learning_rate": 0.000215557242086802, + "loss": 0.0, + "step": 24181 + }, + { + "epoch": 2.256415041522814, + "grad_norm": NaN, + "learning_rate": 0.00021555043884795573, + "loss": 0.0, + "step": 24182 + }, + { + "epoch": 2.2565083512176916, + "grad_norm": NaN, + "learning_rate": 0.0002155436354424342, + "loss": 0.0, + "step": 24183 + }, + { + "epoch": 2.256601660912569, + "grad_norm": NaN, + "learning_rate": 0.00021553683187025454, + "loss": 0.0, + "step": 24184 + }, + { + "epoch": 2.256694970607446, + "grad_norm": NaN, + "learning_rate": 0.00021553002813143413, + "loss": 0.0, + "step": 24185 + }, + { + "epoch": 2.2567882803023234, + "grad_norm": NaN, + "learning_rate": 0.00021552322422599028, + "loss": 0.0, + "step": 24186 + }, + { + "epoch": 2.256881589997201, + "grad_norm": NaN, + "learning_rate": 0.0002155164201539403, + "loss": 0.0, + "step": 24187 + }, + { + "epoch": 2.2569748996920778, + "grad_norm": NaN, + "learning_rate": 0.00021550961591530144, + "loss": 0.0, + "step": 24188 + }, + { + "epoch": 2.257068209386955, + "grad_norm": NaN, + "learning_rate": 0.00021550281151009107, + "loss": 0.0, + "step": 24189 + }, + { + "epoch": 2.2571615190818326, + "grad_norm": NaN, + "learning_rate": 0.00021549600693832646, + "loss": 0.0, + "step": 24190 + }, + { + "epoch": 2.25725482877671, + "grad_norm": NaN, + "learning_rate": 0.00021548920220002484, + "loss": 0.0, + "step": 24191 + }, + { + "epoch": 2.2573481384715874, + "grad_norm": NaN, + "learning_rate": 0.00021548239729520357, + "loss": 0.0, + "step": 24192 + }, + { + "epoch": 2.2574414481664644, + "grad_norm": NaN, + "learning_rate": 0.00021547559222388006, + "loss": 0.0, + "step": 24193 + }, + { + "epoch": 2.257534757861342, + "grad_norm": NaN, + "learning_rate": 0.00021546878698607146, + "loss": 0.0, + "step": 24194 + }, + { + "epoch": 2.2576280675562193, + "grad_norm": NaN, + "learning_rate": 0.0002154619815817951, + "loss": 0.0, + "step": 24195 + }, + { + "epoch": 2.2577213772510962, + "grad_norm": NaN, + "learning_rate": 0.00021545517601106838, + "loss": 0.0, + "step": 24196 + }, + { + "epoch": 2.2578146869459736, + "grad_norm": NaN, + "learning_rate": 0.0002154483702739085, + "loss": 0.0, + "step": 24197 + }, + { + "epoch": 2.257907996640851, + "grad_norm": NaN, + "learning_rate": 0.00021544156437033277, + "loss": 0.0, + "step": 24198 + }, + { + "epoch": 2.2580013063357285, + "grad_norm": NaN, + "learning_rate": 0.00021543475830035857, + "loss": 0.0, + "step": 24199 + }, + { + "epoch": 2.2580946160306055, + "grad_norm": NaN, + "learning_rate": 0.00021542795206400318, + "loss": 0.0, + "step": 24200 + }, + { + "epoch": 2.258187925725483, + "grad_norm": NaN, + "learning_rate": 0.0002154211456612838, + "loss": 0.0, + "step": 24201 + }, + { + "epoch": 2.2582812354203603, + "grad_norm": NaN, + "learning_rate": 0.00021541433909221794, + "loss": 0.0, + "step": 24202 + }, + { + "epoch": 2.2583745451152373, + "grad_norm": NaN, + "learning_rate": 0.00021540753235682272, + "loss": 0.0, + "step": 24203 + }, + { + "epoch": 2.2584678548101147, + "grad_norm": NaN, + "learning_rate": 0.00021540072545511553, + "loss": 0.0, + "step": 24204 + }, + { + "epoch": 2.258561164504992, + "grad_norm": NaN, + "learning_rate": 0.00021539391838711367, + "loss": 0.0, + "step": 24205 + }, + { + "epoch": 2.2586544741998695, + "grad_norm": NaN, + "learning_rate": 0.00021538711115283448, + "loss": 0.0, + "step": 24206 + }, + { + "epoch": 2.2587477838947465, + "grad_norm": NaN, + "learning_rate": 0.00021538030375229517, + "loss": 0.0, + "step": 24207 + }, + { + "epoch": 2.258841093589624, + "grad_norm": NaN, + "learning_rate": 0.0002153734961855132, + "loss": 0.0, + "step": 24208 + }, + { + "epoch": 2.2589344032845013, + "grad_norm": NaN, + "learning_rate": 0.00021536668845250576, + "loss": 0.0, + "step": 24209 + }, + { + "epoch": 2.2590277129793783, + "grad_norm": NaN, + "learning_rate": 0.00021535988055329015, + "loss": 0.0, + "step": 24210 + }, + { + "epoch": 2.2591210226742557, + "grad_norm": NaN, + "learning_rate": 0.00021535307248788372, + "loss": 0.0, + "step": 24211 + }, + { + "epoch": 2.259214332369133, + "grad_norm": NaN, + "learning_rate": 0.00021534626425630385, + "loss": 0.0, + "step": 24212 + }, + { + "epoch": 2.2593076420640106, + "grad_norm": NaN, + "learning_rate": 0.00021533945585856776, + "loss": 0.0, + "step": 24213 + }, + { + "epoch": 2.2594009517588876, + "grad_norm": NaN, + "learning_rate": 0.00021533264729469277, + "loss": 0.0, + "step": 24214 + }, + { + "epoch": 2.259494261453765, + "grad_norm": NaN, + "learning_rate": 0.00021532583856469627, + "loss": 0.0, + "step": 24215 + }, + { + "epoch": 2.2595875711486424, + "grad_norm": NaN, + "learning_rate": 0.00021531902966859546, + "loss": 0.0, + "step": 24216 + }, + { + "epoch": 2.25968088084352, + "grad_norm": NaN, + "learning_rate": 0.0002153122206064077, + "loss": 0.0, + "step": 24217 + }, + { + "epoch": 2.259774190538397, + "grad_norm": NaN, + "learning_rate": 0.00021530541137815035, + "loss": 0.0, + "step": 24218 + }, + { + "epoch": 2.259867500233274, + "grad_norm": NaN, + "learning_rate": 0.00021529860198384068, + "loss": 0.0, + "step": 24219 + }, + { + "epoch": 2.2599608099281516, + "grad_norm": NaN, + "learning_rate": 0.00021529179242349596, + "loss": 0.0, + "step": 24220 + }, + { + "epoch": 2.260054119623029, + "grad_norm": NaN, + "learning_rate": 0.00021528498269713362, + "loss": 0.0, + "step": 24221 + }, + { + "epoch": 2.260147429317906, + "grad_norm": NaN, + "learning_rate": 0.00021527817280477085, + "loss": 0.0, + "step": 24222 + }, + { + "epoch": 2.2602407390127834, + "grad_norm": NaN, + "learning_rate": 0.00021527136274642503, + "loss": 0.0, + "step": 24223 + }, + { + "epoch": 2.260334048707661, + "grad_norm": NaN, + "learning_rate": 0.00021526455252211352, + "loss": 0.0, + "step": 24224 + }, + { + "epoch": 2.260427358402538, + "grad_norm": NaN, + "learning_rate": 0.00021525774213185353, + "loss": 0.0, + "step": 24225 + }, + { + "epoch": 2.2605206680974153, + "grad_norm": NaN, + "learning_rate": 0.00021525093157566242, + "loss": 0.0, + "step": 24226 + }, + { + "epoch": 2.2606139777922927, + "grad_norm": NaN, + "learning_rate": 0.0002152441208535576, + "loss": 0.0, + "step": 24227 + }, + { + "epoch": 2.26070728748717, + "grad_norm": NaN, + "learning_rate": 0.00021523730996555625, + "loss": 0.0, + "step": 24228 + }, + { + "epoch": 2.260800597182047, + "grad_norm": NaN, + "learning_rate": 0.0002152304989116757, + "loss": 0.0, + "step": 24229 + }, + { + "epoch": 2.2608939068769245, + "grad_norm": NaN, + "learning_rate": 0.00021522368769193338, + "loss": 0.0, + "step": 24230 + }, + { + "epoch": 2.260987216571802, + "grad_norm": NaN, + "learning_rate": 0.00021521687630634659, + "loss": 0.0, + "step": 24231 + }, + { + "epoch": 2.261080526266679, + "grad_norm": NaN, + "learning_rate": 0.0002152100647549325, + "loss": 0.0, + "step": 24232 + }, + { + "epoch": 2.2611738359615563, + "grad_norm": NaN, + "learning_rate": 0.00021520325303770856, + "loss": 0.0, + "step": 24233 + }, + { + "epoch": 2.2612671456564337, + "grad_norm": NaN, + "learning_rate": 0.0002151964411546921, + "loss": 0.0, + "step": 24234 + }, + { + "epoch": 2.261360455351311, + "grad_norm": NaN, + "learning_rate": 0.0002151896291059004, + "loss": 0.0, + "step": 24235 + }, + { + "epoch": 2.261453765046188, + "grad_norm": NaN, + "learning_rate": 0.00021518281689135075, + "loss": 0.0, + "step": 24236 + }, + { + "epoch": 2.2615470747410655, + "grad_norm": NaN, + "learning_rate": 0.00021517600451106053, + "loss": 0.0, + "step": 24237 + }, + { + "epoch": 2.261640384435943, + "grad_norm": NaN, + "learning_rate": 0.00021516919196504702, + "loss": 0.0, + "step": 24238 + }, + { + "epoch": 2.2617336941308204, + "grad_norm": NaN, + "learning_rate": 0.00021516237925332755, + "loss": 0.0, + "step": 24239 + }, + { + "epoch": 2.2618270038256973, + "grad_norm": NaN, + "learning_rate": 0.0002151555663759195, + "loss": 0.0, + "step": 24240 + }, + { + "epoch": 2.2619203135205748, + "grad_norm": NaN, + "learning_rate": 0.00021514875333284009, + "loss": 0.0, + "step": 24241 + }, + { + "epoch": 2.262013623215452, + "grad_norm": NaN, + "learning_rate": 0.0002151419401241067, + "loss": 0.0, + "step": 24242 + }, + { + "epoch": 2.2621069329103296, + "grad_norm": NaN, + "learning_rate": 0.00021513512674973672, + "loss": 0.0, + "step": 24243 + }, + { + "epoch": 2.2622002426052066, + "grad_norm": NaN, + "learning_rate": 0.00021512831320974737, + "loss": 0.0, + "step": 24244 + }, + { + "epoch": 2.262293552300084, + "grad_norm": NaN, + "learning_rate": 0.00021512149950415595, + "loss": 0.0, + "step": 24245 + }, + { + "epoch": 2.2623868619949614, + "grad_norm": NaN, + "learning_rate": 0.00021511468563297994, + "loss": 0.0, + "step": 24246 + }, + { + "epoch": 2.2624801716898384, + "grad_norm": NaN, + "learning_rate": 0.00021510787159623654, + "loss": 0.0, + "step": 24247 + }, + { + "epoch": 2.262573481384716, + "grad_norm": NaN, + "learning_rate": 0.00021510105739394307, + "loss": 0.0, + "step": 24248 + }, + { + "epoch": 2.2626667910795932, + "grad_norm": NaN, + "learning_rate": 0.00021509424302611693, + "loss": 0.0, + "step": 24249 + }, + { + "epoch": 2.2627601007744707, + "grad_norm": NaN, + "learning_rate": 0.00021508742849277545, + "loss": 0.0, + "step": 24250 + }, + { + "epoch": 2.2628534104693476, + "grad_norm": NaN, + "learning_rate": 0.00021508061379393582, + "loss": 0.0, + "step": 24251 + }, + { + "epoch": 2.262946720164225, + "grad_norm": NaN, + "learning_rate": 0.00021507379892961556, + "loss": 0.0, + "step": 24252 + }, + { + "epoch": 2.2630400298591025, + "grad_norm": NaN, + "learning_rate": 0.0002150669838998319, + "loss": 0.0, + "step": 24253 + }, + { + "epoch": 2.2631333395539794, + "grad_norm": NaN, + "learning_rate": 0.00021506016870460212, + "loss": 0.0, + "step": 24254 + }, + { + "epoch": 2.263226649248857, + "grad_norm": NaN, + "learning_rate": 0.0002150533533439436, + "loss": 0.0, + "step": 24255 + }, + { + "epoch": 2.2633199589437343, + "grad_norm": NaN, + "learning_rate": 0.00021504653781787375, + "loss": 0.0, + "step": 24256 + }, + { + "epoch": 2.2634132686386117, + "grad_norm": NaN, + "learning_rate": 0.0002150397221264098, + "loss": 0.0, + "step": 24257 + }, + { + "epoch": 2.2635065783334887, + "grad_norm": NaN, + "learning_rate": 0.000215032906269569, + "loss": 0.0, + "step": 24258 + }, + { + "epoch": 2.263599888028366, + "grad_norm": NaN, + "learning_rate": 0.0002150260902473689, + "loss": 0.0, + "step": 24259 + }, + { + "epoch": 2.2636931977232435, + "grad_norm": NaN, + "learning_rate": 0.0002150192740598267, + "loss": 0.0, + "step": 24260 + }, + { + "epoch": 2.263786507418121, + "grad_norm": NaN, + "learning_rate": 0.0002150124577069597, + "loss": 0.0, + "step": 24261 + }, + { + "epoch": 2.263879817112998, + "grad_norm": NaN, + "learning_rate": 0.00021500564118878533, + "loss": 0.0, + "step": 24262 + }, + { + "epoch": 2.2639731268078753, + "grad_norm": NaN, + "learning_rate": 0.00021499882450532083, + "loss": 0.0, + "step": 24263 + }, + { + "epoch": 2.2640664365027527, + "grad_norm": NaN, + "learning_rate": 0.00021499200765658357, + "loss": 0.0, + "step": 24264 + }, + { + "epoch": 2.26415974619763, + "grad_norm": NaN, + "learning_rate": 0.0002149851906425909, + "loss": 0.0, + "step": 24265 + }, + { + "epoch": 2.264253055892507, + "grad_norm": NaN, + "learning_rate": 0.00021497837346336017, + "loss": 0.0, + "step": 24266 + }, + { + "epoch": 2.2643463655873846, + "grad_norm": NaN, + "learning_rate": 0.00021497155611890863, + "loss": 0.0, + "step": 24267 + }, + { + "epoch": 2.264439675282262, + "grad_norm": NaN, + "learning_rate": 0.00021496473860925367, + "loss": 0.0, + "step": 24268 + }, + { + "epoch": 2.264532984977139, + "grad_norm": NaN, + "learning_rate": 0.0002149579209344127, + "loss": 0.0, + "step": 24269 + }, + { + "epoch": 2.2646262946720164, + "grad_norm": NaN, + "learning_rate": 0.00021495110309440287, + "loss": 0.0, + "step": 24270 + }, + { + "epoch": 2.264719604366894, + "grad_norm": NaN, + "learning_rate": 0.0002149442850892417, + "loss": 0.0, + "step": 24271 + }, + { + "epoch": 2.264812914061771, + "grad_norm": NaN, + "learning_rate": 0.00021493746691894644, + "loss": 0.0, + "step": 24272 + }, + { + "epoch": 2.264906223756648, + "grad_norm": NaN, + "learning_rate": 0.00021493064858353443, + "loss": 0.0, + "step": 24273 + }, + { + "epoch": 2.2649995334515256, + "grad_norm": NaN, + "learning_rate": 0.00021492383008302302, + "loss": 0.0, + "step": 24274 + }, + { + "epoch": 2.265092843146403, + "grad_norm": NaN, + "learning_rate": 0.00021491701141742954, + "loss": 0.0, + "step": 24275 + }, + { + "epoch": 2.26518615284128, + "grad_norm": NaN, + "learning_rate": 0.00021491019258677136, + "loss": 0.0, + "step": 24276 + }, + { + "epoch": 2.2652794625361574, + "grad_norm": NaN, + "learning_rate": 0.00021490337359106573, + "loss": 0.0, + "step": 24277 + }, + { + "epoch": 2.265372772231035, + "grad_norm": NaN, + "learning_rate": 0.00021489655443033008, + "loss": 0.0, + "step": 24278 + }, + { + "epoch": 2.2654660819259123, + "grad_norm": NaN, + "learning_rate": 0.0002148897351045817, + "loss": 0.0, + "step": 24279 + }, + { + "epoch": 2.2655593916207892, + "grad_norm": NaN, + "learning_rate": 0.00021488291561383793, + "loss": 0.0, + "step": 24280 + }, + { + "epoch": 2.2656527013156667, + "grad_norm": NaN, + "learning_rate": 0.0002148760959581162, + "loss": 0.0, + "step": 24281 + }, + { + "epoch": 2.265746011010544, + "grad_norm": NaN, + "learning_rate": 0.0002148692761374337, + "loss": 0.0, + "step": 24282 + }, + { + "epoch": 2.265839320705421, + "grad_norm": NaN, + "learning_rate": 0.00021486245615180783, + "loss": 0.0, + "step": 24283 + }, + { + "epoch": 2.2659326304002985, + "grad_norm": NaN, + "learning_rate": 0.00021485563600125602, + "loss": 0.0, + "step": 24284 + }, + { + "epoch": 2.266025940095176, + "grad_norm": NaN, + "learning_rate": 0.00021484881568579553, + "loss": 0.0, + "step": 24285 + }, + { + "epoch": 2.2661192497900533, + "grad_norm": NaN, + "learning_rate": 0.00021484199520544367, + "loss": 0.0, + "step": 24286 + }, + { + "epoch": 2.2662125594849307, + "grad_norm": NaN, + "learning_rate": 0.00021483517456021781, + "loss": 0.0, + "step": 24287 + }, + { + "epoch": 2.2663058691798077, + "grad_norm": NaN, + "learning_rate": 0.0002148283537501354, + "loss": 0.0, + "step": 24288 + }, + { + "epoch": 2.266399178874685, + "grad_norm": NaN, + "learning_rate": 0.0002148215327752136, + "loss": 0.0, + "step": 24289 + }, + { + "epoch": 2.2664924885695625, + "grad_norm": NaN, + "learning_rate": 0.00021481471163546984, + "loss": 0.0, + "step": 24290 + }, + { + "epoch": 2.2665857982644395, + "grad_norm": NaN, + "learning_rate": 0.00021480789033092153, + "loss": 0.0, + "step": 24291 + }, + { + "epoch": 2.266679107959317, + "grad_norm": NaN, + "learning_rate": 0.0002148010688615859, + "loss": 0.0, + "step": 24292 + }, + { + "epoch": 2.2667724176541943, + "grad_norm": NaN, + "learning_rate": 0.00021479424722748038, + "loss": 0.0, + "step": 24293 + }, + { + "epoch": 2.2668657273490718, + "grad_norm": NaN, + "learning_rate": 0.00021478742542862232, + "loss": 0.0, + "step": 24294 + }, + { + "epoch": 2.2669590370439487, + "grad_norm": NaN, + "learning_rate": 0.00021478060346502895, + "loss": 0.0, + "step": 24295 + }, + { + "epoch": 2.267052346738826, + "grad_norm": NaN, + "learning_rate": 0.0002147737813367177, + "loss": 0.0, + "step": 24296 + }, + { + "epoch": 2.2671456564337036, + "grad_norm": NaN, + "learning_rate": 0.000214766959043706, + "loss": 0.0, + "step": 24297 + }, + { + "epoch": 2.2672389661285806, + "grad_norm": NaN, + "learning_rate": 0.00021476013658601104, + "loss": 0.0, + "step": 24298 + }, + { + "epoch": 2.267332275823458, + "grad_norm": NaN, + "learning_rate": 0.0002147533139636502, + "loss": 0.0, + "step": 24299 + }, + { + "epoch": 2.2674255855183354, + "grad_norm": NaN, + "learning_rate": 0.00021474649117664095, + "loss": 0.0, + "step": 24300 + }, + { + "epoch": 2.267518895213213, + "grad_norm": NaN, + "learning_rate": 0.0002147396682250005, + "loss": 0.0, + "step": 24301 + }, + { + "epoch": 2.26761220490809, + "grad_norm": NaN, + "learning_rate": 0.00021473284510874625, + "loss": 0.0, + "step": 24302 + }, + { + "epoch": 2.267705514602967, + "grad_norm": NaN, + "learning_rate": 0.00021472602182789557, + "loss": 0.0, + "step": 24303 + }, + { + "epoch": 2.2677988242978446, + "grad_norm": NaN, + "learning_rate": 0.0002147191983824658, + "loss": 0.0, + "step": 24304 + }, + { + "epoch": 2.2678921339927216, + "grad_norm": NaN, + "learning_rate": 0.00021471237477247422, + "loss": 0.0, + "step": 24305 + }, + { + "epoch": 2.267985443687599, + "grad_norm": NaN, + "learning_rate": 0.0002147055509979383, + "loss": 0.0, + "step": 24306 + }, + { + "epoch": 2.2680787533824764, + "grad_norm": NaN, + "learning_rate": 0.00021469872705887535, + "loss": 0.0, + "step": 24307 + }, + { + "epoch": 2.268172063077354, + "grad_norm": NaN, + "learning_rate": 0.00021469190295530258, + "loss": 0.0, + "step": 24308 + }, + { + "epoch": 2.2682653727722313, + "grad_norm": NaN, + "learning_rate": 0.00021468507868723757, + "loss": 0.0, + "step": 24309 + }, + { + "epoch": 2.2683586824671083, + "grad_norm": NaN, + "learning_rate": 0.00021467825425469756, + "loss": 0.0, + "step": 24310 + }, + { + "epoch": 2.2684519921619857, + "grad_norm": NaN, + "learning_rate": 0.00021467142965769987, + "loss": 0.0, + "step": 24311 + }, + { + "epoch": 2.268545301856863, + "grad_norm": NaN, + "learning_rate": 0.00021466460489626192, + "loss": 0.0, + "step": 24312 + }, + { + "epoch": 2.26863861155174, + "grad_norm": NaN, + "learning_rate": 0.00021465777997040108, + "loss": 0.0, + "step": 24313 + }, + { + "epoch": 2.2687319212466175, + "grad_norm": NaN, + "learning_rate": 0.00021465095488013456, + "loss": 0.0, + "step": 24314 + }, + { + "epoch": 2.268825230941495, + "grad_norm": NaN, + "learning_rate": 0.00021464412962547987, + "loss": 0.0, + "step": 24315 + }, + { + "epoch": 2.2689185406363723, + "grad_norm": NaN, + "learning_rate": 0.00021463730420645432, + "loss": 0.0, + "step": 24316 + }, + { + "epoch": 2.2690118503312493, + "grad_norm": NaN, + "learning_rate": 0.0002146304786230752, + "loss": 0.0, + "step": 24317 + }, + { + "epoch": 2.2691051600261267, + "grad_norm": NaN, + "learning_rate": 0.00021462365287535994, + "loss": 0.0, + "step": 24318 + }, + { + "epoch": 2.269198469721004, + "grad_norm": NaN, + "learning_rate": 0.0002146168269633259, + "loss": 0.0, + "step": 24319 + }, + { + "epoch": 2.269291779415881, + "grad_norm": NaN, + "learning_rate": 0.0002146100008869904, + "loss": 0.0, + "step": 24320 + }, + { + "epoch": 2.2693850891107585, + "grad_norm": NaN, + "learning_rate": 0.00021460317464637078, + "loss": 0.0, + "step": 24321 + }, + { + "epoch": 2.269478398805636, + "grad_norm": NaN, + "learning_rate": 0.00021459634824148448, + "loss": 0.0, + "step": 24322 + }, + { + "epoch": 2.2695717085005134, + "grad_norm": NaN, + "learning_rate": 0.00021458952167234874, + "loss": 0.0, + "step": 24323 + }, + { + "epoch": 2.2696650181953903, + "grad_norm": NaN, + "learning_rate": 0.00021458269493898098, + "loss": 0.0, + "step": 24324 + }, + { + "epoch": 2.2697583278902678, + "grad_norm": NaN, + "learning_rate": 0.00021457586804139858, + "loss": 0.0, + "step": 24325 + }, + { + "epoch": 2.269851637585145, + "grad_norm": NaN, + "learning_rate": 0.00021456904097961888, + "loss": 0.0, + "step": 24326 + }, + { + "epoch": 2.269944947280022, + "grad_norm": NaN, + "learning_rate": 0.0002145622137536592, + "loss": 0.0, + "step": 24327 + }, + { + "epoch": 2.2700382569748996, + "grad_norm": NaN, + "learning_rate": 0.00021455538636353698, + "loss": 0.0, + "step": 24328 + }, + { + "epoch": 2.270131566669777, + "grad_norm": NaN, + "learning_rate": 0.0002145485588092696, + "loss": 0.0, + "step": 24329 + }, + { + "epoch": 2.2702248763646544, + "grad_norm": NaN, + "learning_rate": 0.0002145417310908742, + "loss": 0.0, + "step": 24330 + }, + { + "epoch": 2.2703181860595314, + "grad_norm": NaN, + "learning_rate": 0.0002145349032083684, + "loss": 0.0, + "step": 24331 + }, + { + "epoch": 2.270411495754409, + "grad_norm": NaN, + "learning_rate": 0.00021452807516176946, + "loss": 0.0, + "step": 24332 + }, + { + "epoch": 2.2705048054492862, + "grad_norm": NaN, + "learning_rate": 0.00021452124695109465, + "loss": 0.0, + "step": 24333 + }, + { + "epoch": 2.2705981151441637, + "grad_norm": NaN, + "learning_rate": 0.0002145144185763615, + "loss": 0.0, + "step": 24334 + }, + { + "epoch": 2.2706914248390406, + "grad_norm": NaN, + "learning_rate": 0.0002145075900375873, + "loss": 0.0, + "step": 24335 + }, + { + "epoch": 2.270784734533918, + "grad_norm": NaN, + "learning_rate": 0.00021450076133478936, + "loss": 0.0, + "step": 24336 + }, + { + "epoch": 2.2708780442287955, + "grad_norm": NaN, + "learning_rate": 0.00021449393246798513, + "loss": 0.0, + "step": 24337 + }, + { + "epoch": 2.270971353923673, + "grad_norm": NaN, + "learning_rate": 0.00021448710343719194, + "loss": 0.0, + "step": 24338 + }, + { + "epoch": 2.27106466361855, + "grad_norm": NaN, + "learning_rate": 0.0002144802742424271, + "loss": 0.0, + "step": 24339 + }, + { + "epoch": 2.2711579733134273, + "grad_norm": NaN, + "learning_rate": 0.00021447344488370804, + "loss": 0.0, + "step": 24340 + }, + { + "epoch": 2.2712512830083047, + "grad_norm": NaN, + "learning_rate": 0.00021446661536105217, + "loss": 0.0, + "step": 24341 + }, + { + "epoch": 2.2713445927031817, + "grad_norm": NaN, + "learning_rate": 0.0002144597856744767, + "loss": 0.0, + "step": 24342 + }, + { + "epoch": 2.271437902398059, + "grad_norm": NaN, + "learning_rate": 0.00021445295582399912, + "loss": 0.0, + "step": 24343 + }, + { + "epoch": 2.2715312120929365, + "grad_norm": NaN, + "learning_rate": 0.0002144461258096368, + "loss": 0.0, + "step": 24344 + }, + { + "epoch": 2.271624521787814, + "grad_norm": NaN, + "learning_rate": 0.00021443929563140705, + "loss": 0.0, + "step": 24345 + }, + { + "epoch": 2.271717831482691, + "grad_norm": NaN, + "learning_rate": 0.00021443246528932722, + "loss": 0.0, + "step": 24346 + }, + { + "epoch": 2.2718111411775683, + "grad_norm": NaN, + "learning_rate": 0.0002144256347834148, + "loss": 0.0, + "step": 24347 + }, + { + "epoch": 2.2719044508724457, + "grad_norm": NaN, + "learning_rate": 0.00021441880411368703, + "loss": 0.0, + "step": 24348 + }, + { + "epoch": 2.2719977605673227, + "grad_norm": NaN, + "learning_rate": 0.0002144119732801613, + "loss": 0.0, + "step": 24349 + }, + { + "epoch": 2.2720910702622, + "grad_norm": NaN, + "learning_rate": 0.00021440514228285502, + "loss": 0.0, + "step": 24350 + }, + { + "epoch": 2.2721843799570776, + "grad_norm": NaN, + "learning_rate": 0.0002143983111217856, + "loss": 0.0, + "step": 24351 + }, + { + "epoch": 2.272277689651955, + "grad_norm": NaN, + "learning_rate": 0.00021439147979697025, + "loss": 0.0, + "step": 24352 + }, + { + "epoch": 2.272370999346832, + "grad_norm": NaN, + "learning_rate": 0.0002143846483084265, + "loss": 0.0, + "step": 24353 + }, + { + "epoch": 2.2724643090417094, + "grad_norm": NaN, + "learning_rate": 0.0002143778166561717, + "loss": 0.0, + "step": 24354 + }, + { + "epoch": 2.272557618736587, + "grad_norm": NaN, + "learning_rate": 0.0002143709848402231, + "loss": 0.0, + "step": 24355 + }, + { + "epoch": 2.272650928431464, + "grad_norm": NaN, + "learning_rate": 0.00021436415286059816, + "loss": 0.0, + "step": 24356 + }, + { + "epoch": 2.272744238126341, + "grad_norm": NaN, + "learning_rate": 0.0002143573207173143, + "loss": 0.0, + "step": 24357 + }, + { + "epoch": 2.2728375478212186, + "grad_norm": NaN, + "learning_rate": 0.0002143504884103888, + "loss": 0.0, + "step": 24358 + }, + { + "epoch": 2.272930857516096, + "grad_norm": NaN, + "learning_rate": 0.0002143436559398391, + "loss": 0.0, + "step": 24359 + }, + { + "epoch": 2.2730241672109734, + "grad_norm": NaN, + "learning_rate": 0.00021433682330568253, + "loss": 0.0, + "step": 24360 + }, + { + "epoch": 2.2731174769058504, + "grad_norm": NaN, + "learning_rate": 0.00021432999050793641, + "loss": 0.0, + "step": 24361 + }, + { + "epoch": 2.273210786600728, + "grad_norm": NaN, + "learning_rate": 0.00021432315754661826, + "loss": 0.0, + "step": 24362 + }, + { + "epoch": 2.2733040962956053, + "grad_norm": NaN, + "learning_rate": 0.00021431632442174537, + "loss": 0.0, + "step": 24363 + }, + { + "epoch": 2.2733974059904822, + "grad_norm": NaN, + "learning_rate": 0.00021430949113333508, + "loss": 0.0, + "step": 24364 + }, + { + "epoch": 2.2734907156853597, + "grad_norm": NaN, + "learning_rate": 0.00021430265768140484, + "loss": 0.0, + "step": 24365 + }, + { + "epoch": 2.273584025380237, + "grad_norm": NaN, + "learning_rate": 0.000214295824065972, + "loss": 0.0, + "step": 24366 + }, + { + "epoch": 2.2736773350751145, + "grad_norm": NaN, + "learning_rate": 0.00021428899028705392, + "loss": 0.0, + "step": 24367 + }, + { + "epoch": 2.2737706447699915, + "grad_norm": NaN, + "learning_rate": 0.00021428215634466792, + "loss": 0.0, + "step": 24368 + }, + { + "epoch": 2.273863954464869, + "grad_norm": NaN, + "learning_rate": 0.00021427532223883147, + "loss": 0.0, + "step": 24369 + }, + { + "epoch": 2.2739572641597463, + "grad_norm": NaN, + "learning_rate": 0.00021426848796956196, + "loss": 0.0, + "step": 24370 + }, + { + "epoch": 2.2740505738546233, + "grad_norm": NaN, + "learning_rate": 0.00021426165353687665, + "loss": 0.0, + "step": 24371 + }, + { + "epoch": 2.2741438835495007, + "grad_norm": NaN, + "learning_rate": 0.00021425481894079304, + "loss": 0.0, + "step": 24372 + }, + { + "epoch": 2.274237193244378, + "grad_norm": NaN, + "learning_rate": 0.00021424798418132849, + "loss": 0.0, + "step": 24373 + }, + { + "epoch": 2.2743305029392555, + "grad_norm": NaN, + "learning_rate": 0.00021424114925850028, + "loss": 0.0, + "step": 24374 + }, + { + "epoch": 2.2744238126341325, + "grad_norm": NaN, + "learning_rate": 0.00021423431417232587, + "loss": 0.0, + "step": 24375 + }, + { + "epoch": 2.27451712232901, + "grad_norm": NaN, + "learning_rate": 0.00021422747892282267, + "loss": 0.0, + "step": 24376 + }, + { + "epoch": 2.2746104320238874, + "grad_norm": NaN, + "learning_rate": 0.000214220643510008, + "loss": 0.0, + "step": 24377 + }, + { + "epoch": 2.2747037417187648, + "grad_norm": NaN, + "learning_rate": 0.00021421380793389922, + "loss": 0.0, + "step": 24378 + }, + { + "epoch": 2.2747970514136417, + "grad_norm": NaN, + "learning_rate": 0.00021420697219451376, + "loss": 0.0, + "step": 24379 + }, + { + "epoch": 2.274890361108519, + "grad_norm": NaN, + "learning_rate": 0.000214200136291869, + "loss": 0.0, + "step": 24380 + }, + { + "epoch": 2.2749836708033966, + "grad_norm": NaN, + "learning_rate": 0.00021419330022598234, + "loss": 0.0, + "step": 24381 + }, + { + "epoch": 2.275076980498274, + "grad_norm": NaN, + "learning_rate": 0.00021418646399687106, + "loss": 0.0, + "step": 24382 + }, + { + "epoch": 2.275170290193151, + "grad_norm": NaN, + "learning_rate": 0.0002141796276045527, + "loss": 0.0, + "step": 24383 + }, + { + "epoch": 2.2752635998880284, + "grad_norm": NaN, + "learning_rate": 0.00021417279104904451, + "loss": 0.0, + "step": 24384 + }, + { + "epoch": 2.275356909582906, + "grad_norm": NaN, + "learning_rate": 0.00021416595433036394, + "loss": 0.0, + "step": 24385 + }, + { + "epoch": 2.275450219277783, + "grad_norm": NaN, + "learning_rate": 0.00021415911744852834, + "loss": 0.0, + "step": 24386 + }, + { + "epoch": 2.27554352897266, + "grad_norm": NaN, + "learning_rate": 0.0002141522804035551, + "loss": 0.0, + "step": 24387 + }, + { + "epoch": 2.2756368386675376, + "grad_norm": NaN, + "learning_rate": 0.00021414544319546164, + "loss": 0.0, + "step": 24388 + }, + { + "epoch": 2.275730148362415, + "grad_norm": NaN, + "learning_rate": 0.00021413860582426535, + "loss": 0.0, + "step": 24389 + }, + { + "epoch": 2.275823458057292, + "grad_norm": NaN, + "learning_rate": 0.00021413176828998347, + "loss": 0.0, + "step": 24390 + }, + { + "epoch": 2.2759167677521694, + "grad_norm": NaN, + "learning_rate": 0.00021412493059263363, + "loss": 0.0, + "step": 24391 + }, + { + "epoch": 2.276010077447047, + "grad_norm": NaN, + "learning_rate": 0.00021411809273223306, + "loss": 0.0, + "step": 24392 + }, + { + "epoch": 2.276103387141924, + "grad_norm": NaN, + "learning_rate": 0.00021411125470879912, + "loss": 0.0, + "step": 24393 + }, + { + "epoch": 2.2761966968368013, + "grad_norm": NaN, + "learning_rate": 0.00021410441652234925, + "loss": 0.0, + "step": 24394 + }, + { + "epoch": 2.2762900065316787, + "grad_norm": NaN, + "learning_rate": 0.0002140975781729009, + "loss": 0.0, + "step": 24395 + }, + { + "epoch": 2.276383316226556, + "grad_norm": NaN, + "learning_rate": 0.00021409073966047133, + "loss": 0.0, + "step": 24396 + }, + { + "epoch": 2.276476625921433, + "grad_norm": NaN, + "learning_rate": 0.00021408390098507803, + "loss": 0.0, + "step": 24397 + }, + { + "epoch": 2.2765699356163105, + "grad_norm": NaN, + "learning_rate": 0.00021407706214673832, + "loss": 0.0, + "step": 24398 + }, + { + "epoch": 2.276663245311188, + "grad_norm": NaN, + "learning_rate": 0.00021407022314546965, + "loss": 0.0, + "step": 24399 + }, + { + "epoch": 2.276756555006065, + "grad_norm": NaN, + "learning_rate": 0.0002140633839812894, + "loss": 0.0, + "step": 24400 + }, + { + "epoch": 2.2768498647009423, + "grad_norm": NaN, + "learning_rate": 0.00021405654465421492, + "loss": 0.0, + "step": 24401 + }, + { + "epoch": 2.2769431743958197, + "grad_norm": NaN, + "learning_rate": 0.00021404970516426363, + "loss": 0.0, + "step": 24402 + }, + { + "epoch": 2.277036484090697, + "grad_norm": NaN, + "learning_rate": 0.0002140428655114529, + "loss": 0.0, + "step": 24403 + }, + { + "epoch": 2.2771297937855746, + "grad_norm": NaN, + "learning_rate": 0.0002140360256958001, + "loss": 0.0, + "step": 24404 + }, + { + "epoch": 2.2772231034804515, + "grad_norm": NaN, + "learning_rate": 0.00021402918571732273, + "loss": 0.0, + "step": 24405 + }, + { + "epoch": 2.277316413175329, + "grad_norm": NaN, + "learning_rate": 0.00021402234557603805, + "loss": 0.0, + "step": 24406 + }, + { + "epoch": 2.2774097228702064, + "grad_norm": NaN, + "learning_rate": 0.00021401550527196355, + "loss": 0.0, + "step": 24407 + }, + { + "epoch": 2.2775030325650834, + "grad_norm": NaN, + "learning_rate": 0.00021400866480511657, + "loss": 0.0, + "step": 24408 + }, + { + "epoch": 2.2775963422599608, + "grad_norm": NaN, + "learning_rate": 0.0002140018241755145, + "loss": 0.0, + "step": 24409 + }, + { + "epoch": 2.277689651954838, + "grad_norm": NaN, + "learning_rate": 0.0002139949833831748, + "loss": 0.0, + "step": 24410 + }, + { + "epoch": 2.2777829616497156, + "grad_norm": NaN, + "learning_rate": 0.0002139881424281148, + "loss": 0.0, + "step": 24411 + }, + { + "epoch": 2.2778762713445926, + "grad_norm": NaN, + "learning_rate": 0.00021398130131035183, + "loss": 0.0, + "step": 24412 + }, + { + "epoch": 2.27796958103947, + "grad_norm": NaN, + "learning_rate": 0.00021397446002990343, + "loss": 0.0, + "step": 24413 + }, + { + "epoch": 2.2780628907343474, + "grad_norm": NaN, + "learning_rate": 0.00021396761858678693, + "loss": 0.0, + "step": 24414 + }, + { + "epoch": 2.2781562004292244, + "grad_norm": NaN, + "learning_rate": 0.0002139607769810197, + "loss": 0.0, + "step": 24415 + }, + { + "epoch": 2.278249510124102, + "grad_norm": NaN, + "learning_rate": 0.00021395393521261917, + "loss": 0.0, + "step": 24416 + }, + { + "epoch": 2.2783428198189792, + "grad_norm": NaN, + "learning_rate": 0.00021394709328160271, + "loss": 0.0, + "step": 24417 + }, + { + "epoch": 2.2784361295138567, + "grad_norm": NaN, + "learning_rate": 0.00021394025118798776, + "loss": 0.0, + "step": 24418 + }, + { + "epoch": 2.2785294392087336, + "grad_norm": NaN, + "learning_rate": 0.0002139334089317917, + "loss": 0.0, + "step": 24419 + }, + { + "epoch": 2.278622748903611, + "grad_norm": NaN, + "learning_rate": 0.0002139265665130319, + "loss": 0.0, + "step": 24420 + }, + { + "epoch": 2.2787160585984885, + "grad_norm": NaN, + "learning_rate": 0.00021391972393172576, + "loss": 0.0, + "step": 24421 + }, + { + "epoch": 2.2788093682933654, + "grad_norm": NaN, + "learning_rate": 0.00021391288118789074, + "loss": 0.0, + "step": 24422 + }, + { + "epoch": 2.278902677988243, + "grad_norm": NaN, + "learning_rate": 0.00021390603828154418, + "loss": 0.0, + "step": 24423 + }, + { + "epoch": 2.2789959876831203, + "grad_norm": NaN, + "learning_rate": 0.00021389919521270346, + "loss": 0.0, + "step": 24424 + }, + { + "epoch": 2.2790892973779977, + "grad_norm": NaN, + "learning_rate": 0.00021389235198138605, + "loss": 0.0, + "step": 24425 + }, + { + "epoch": 2.2791826070728747, + "grad_norm": NaN, + "learning_rate": 0.00021388550858760934, + "loss": 0.0, + "step": 24426 + }, + { + "epoch": 2.279275916767752, + "grad_norm": NaN, + "learning_rate": 0.00021387866503139066, + "loss": 0.0, + "step": 24427 + }, + { + "epoch": 2.2793692264626295, + "grad_norm": NaN, + "learning_rate": 0.00021387182131274747, + "loss": 0.0, + "step": 24428 + }, + { + "epoch": 2.279462536157507, + "grad_norm": NaN, + "learning_rate": 0.00021386497743169716, + "loss": 0.0, + "step": 24429 + }, + { + "epoch": 2.279555845852384, + "grad_norm": NaN, + "learning_rate": 0.00021385813338825713, + "loss": 0.0, + "step": 24430 + }, + { + "epoch": 2.2796491555472613, + "grad_norm": NaN, + "learning_rate": 0.00021385128918244482, + "loss": 0.0, + "step": 24431 + }, + { + "epoch": 2.2797424652421387, + "grad_norm": NaN, + "learning_rate": 0.00021384444481427751, + "loss": 0.0, + "step": 24432 + }, + { + "epoch": 2.279835774937016, + "grad_norm": NaN, + "learning_rate": 0.00021383760028377278, + "loss": 0.0, + "step": 24433 + }, + { + "epoch": 2.279929084631893, + "grad_norm": NaN, + "learning_rate": 0.0002138307555909479, + "loss": 0.0, + "step": 24434 + }, + { + "epoch": 2.2800223943267706, + "grad_norm": NaN, + "learning_rate": 0.00021382391073582028, + "loss": 0.0, + "step": 24435 + }, + { + "epoch": 2.280115704021648, + "grad_norm": NaN, + "learning_rate": 0.0002138170657184074, + "loss": 0.0, + "step": 24436 + }, + { + "epoch": 2.280209013716525, + "grad_norm": NaN, + "learning_rate": 0.00021381022053872662, + "loss": 0.0, + "step": 24437 + }, + { + "epoch": 2.2803023234114024, + "grad_norm": NaN, + "learning_rate": 0.0002138033751967954, + "loss": 0.0, + "step": 24438 + }, + { + "epoch": 2.28039563310628, + "grad_norm": NaN, + "learning_rate": 0.00021379652969263105, + "loss": 0.0, + "step": 24439 + }, + { + "epoch": 2.280488942801157, + "grad_norm": NaN, + "learning_rate": 0.00021378968402625101, + "loss": 0.0, + "step": 24440 + }, + { + "epoch": 2.280582252496034, + "grad_norm": NaN, + "learning_rate": 0.0002137828381976727, + "loss": 0.0, + "step": 24441 + }, + { + "epoch": 2.2806755621909116, + "grad_norm": NaN, + "learning_rate": 0.00021377599220691356, + "loss": 0.0, + "step": 24442 + }, + { + "epoch": 2.280768871885789, + "grad_norm": NaN, + "learning_rate": 0.00021376914605399094, + "loss": 0.0, + "step": 24443 + }, + { + "epoch": 2.280862181580666, + "grad_norm": NaN, + "learning_rate": 0.00021376229973892229, + "loss": 0.0, + "step": 24444 + }, + { + "epoch": 2.2809554912755434, + "grad_norm": NaN, + "learning_rate": 0.00021375545326172496, + "loss": 0.0, + "step": 24445 + }, + { + "epoch": 2.281048800970421, + "grad_norm": NaN, + "learning_rate": 0.00021374860662241643, + "loss": 0.0, + "step": 24446 + }, + { + "epoch": 2.2811421106652983, + "grad_norm": NaN, + "learning_rate": 0.00021374175982101406, + "loss": 0.0, + "step": 24447 + }, + { + "epoch": 2.2812354203601752, + "grad_norm": NaN, + "learning_rate": 0.00021373491285753531, + "loss": 0.0, + "step": 24448 + }, + { + "epoch": 2.2813287300550527, + "grad_norm": NaN, + "learning_rate": 0.00021372806573199753, + "loss": 0.0, + "step": 24449 + }, + { + "epoch": 2.28142203974993, + "grad_norm": NaN, + "learning_rate": 0.00021372121844441818, + "loss": 0.0, + "step": 24450 + }, + { + "epoch": 2.2815153494448075, + "grad_norm": NaN, + "learning_rate": 0.00021371437099481462, + "loss": 0.0, + "step": 24451 + }, + { + "epoch": 2.2816086591396845, + "grad_norm": NaN, + "learning_rate": 0.0002137075233832043, + "loss": 0.0, + "step": 24452 + }, + { + "epoch": 2.281701968834562, + "grad_norm": NaN, + "learning_rate": 0.00021370067560960463, + "loss": 0.0, + "step": 24453 + }, + { + "epoch": 2.2817952785294393, + "grad_norm": NaN, + "learning_rate": 0.000213693827674033, + "loss": 0.0, + "step": 24454 + }, + { + "epoch": 2.2818885882243167, + "grad_norm": NaN, + "learning_rate": 0.0002136869795765068, + "loss": 0.0, + "step": 24455 + }, + { + "epoch": 2.2819818979191937, + "grad_norm": NaN, + "learning_rate": 0.00021368013131704353, + "loss": 0.0, + "step": 24456 + }, + { + "epoch": 2.282075207614071, + "grad_norm": NaN, + "learning_rate": 0.0002136732828956605, + "loss": 0.0, + "step": 24457 + }, + { + "epoch": 2.2821685173089485, + "grad_norm": NaN, + "learning_rate": 0.0002136664343123752, + "loss": 0.0, + "step": 24458 + }, + { + "epoch": 2.2822618270038255, + "grad_norm": NaN, + "learning_rate": 0.00021365958556720502, + "loss": 0.0, + "step": 24459 + }, + { + "epoch": 2.282355136698703, + "grad_norm": NaN, + "learning_rate": 0.00021365273666016738, + "loss": 0.0, + "step": 24460 + }, + { + "epoch": 2.2824484463935804, + "grad_norm": NaN, + "learning_rate": 0.00021364588759127968, + "loss": 0.0, + "step": 24461 + }, + { + "epoch": 2.2825417560884578, + "grad_norm": NaN, + "learning_rate": 0.0002136390383605593, + "loss": 0.0, + "step": 24462 + }, + { + "epoch": 2.2826350657833347, + "grad_norm": NaN, + "learning_rate": 0.00021363218896802377, + "loss": 0.0, + "step": 24463 + }, + { + "epoch": 2.282728375478212, + "grad_norm": NaN, + "learning_rate": 0.00021362533941369038, + "loss": 0.0, + "step": 24464 + }, + { + "epoch": 2.2828216851730896, + "grad_norm": NaN, + "learning_rate": 0.00021361848969757665, + "loss": 0.0, + "step": 24465 + }, + { + "epoch": 2.2829149948679666, + "grad_norm": NaN, + "learning_rate": 0.00021361163981969988, + "loss": 0.0, + "step": 24466 + }, + { + "epoch": 2.283008304562844, + "grad_norm": NaN, + "learning_rate": 0.0002136047897800776, + "loss": 0.0, + "step": 24467 + }, + { + "epoch": 2.2831016142577214, + "grad_norm": NaN, + "learning_rate": 0.00021359793957872713, + "loss": 0.0, + "step": 24468 + }, + { + "epoch": 2.283194923952599, + "grad_norm": NaN, + "learning_rate": 0.000213591089215666, + "loss": 0.0, + "step": 24469 + }, + { + "epoch": 2.283288233647476, + "grad_norm": NaN, + "learning_rate": 0.00021358423869091153, + "loss": 0.0, + "step": 24470 + }, + { + "epoch": 2.283381543342353, + "grad_norm": NaN, + "learning_rate": 0.0002135773880044812, + "loss": 0.0, + "step": 24471 + }, + { + "epoch": 2.2834748530372306, + "grad_norm": NaN, + "learning_rate": 0.00021357053715639237, + "loss": 0.0, + "step": 24472 + }, + { + "epoch": 2.283568162732108, + "grad_norm": NaN, + "learning_rate": 0.00021356368614666252, + "loss": 0.0, + "step": 24473 + }, + { + "epoch": 2.283661472426985, + "grad_norm": NaN, + "learning_rate": 0.00021355683497530905, + "loss": 0.0, + "step": 24474 + }, + { + "epoch": 2.2837547821218624, + "grad_norm": NaN, + "learning_rate": 0.00021354998364234937, + "loss": 0.0, + "step": 24475 + }, + { + "epoch": 2.28384809181674, + "grad_norm": NaN, + "learning_rate": 0.0002135431321478009, + "loss": 0.0, + "step": 24476 + }, + { + "epoch": 2.2839414015116173, + "grad_norm": NaN, + "learning_rate": 0.00021353628049168108, + "loss": 0.0, + "step": 24477 + }, + { + "epoch": 2.2840347112064943, + "grad_norm": NaN, + "learning_rate": 0.0002135294286740073, + "loss": 0.0, + "step": 24478 + }, + { + "epoch": 2.2841280209013717, + "grad_norm": NaN, + "learning_rate": 0.000213522576694797, + "loss": 0.0, + "step": 24479 + }, + { + "epoch": 2.284221330596249, + "grad_norm": NaN, + "learning_rate": 0.00021351572455406763, + "loss": 0.0, + "step": 24480 + }, + { + "epoch": 2.284314640291126, + "grad_norm": NaN, + "learning_rate": 0.0002135088722518366, + "loss": 0.0, + "step": 24481 + }, + { + "epoch": 2.2844079499860035, + "grad_norm": NaN, + "learning_rate": 0.00021350201978812127, + "loss": 0.0, + "step": 24482 + }, + { + "epoch": 2.284501259680881, + "grad_norm": NaN, + "learning_rate": 0.00021349516716293913, + "loss": 0.0, + "step": 24483 + }, + { + "epoch": 2.2845945693757583, + "grad_norm": NaN, + "learning_rate": 0.00021348831437630763, + "loss": 0.0, + "step": 24484 + }, + { + "epoch": 2.2846878790706353, + "grad_norm": NaN, + "learning_rate": 0.0002134814614282441, + "loss": 0.0, + "step": 24485 + }, + { + "epoch": 2.2847811887655127, + "grad_norm": NaN, + "learning_rate": 0.00021347460831876603, + "loss": 0.0, + "step": 24486 + }, + { + "epoch": 2.28487449846039, + "grad_norm": NaN, + "learning_rate": 0.00021346775504789084, + "loss": 0.0, + "step": 24487 + }, + { + "epoch": 2.284967808155267, + "grad_norm": NaN, + "learning_rate": 0.00021346090161563594, + "loss": 0.0, + "step": 24488 + }, + { + "epoch": 2.2850611178501445, + "grad_norm": NaN, + "learning_rate": 0.00021345404802201878, + "loss": 0.0, + "step": 24489 + }, + { + "epoch": 2.285154427545022, + "grad_norm": NaN, + "learning_rate": 0.00021344719426705678, + "loss": 0.0, + "step": 24490 + }, + { + "epoch": 2.2852477372398994, + "grad_norm": NaN, + "learning_rate": 0.00021344034035076736, + "loss": 0.0, + "step": 24491 + }, + { + "epoch": 2.2853410469347764, + "grad_norm": NaN, + "learning_rate": 0.00021343348627316793, + "loss": 0.0, + "step": 24492 + }, + { + "epoch": 2.2854343566296538, + "grad_norm": NaN, + "learning_rate": 0.00021342663203427595, + "loss": 0.0, + "step": 24493 + }, + { + "epoch": 2.285527666324531, + "grad_norm": NaN, + "learning_rate": 0.00021341977763410884, + "loss": 0.0, + "step": 24494 + }, + { + "epoch": 2.285620976019408, + "grad_norm": NaN, + "learning_rate": 0.00021341292307268397, + "loss": 0.0, + "step": 24495 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": NaN, + "learning_rate": 0.00021340606835001886, + "loss": 0.0, + "step": 24496 + }, + { + "epoch": 2.285807595409163, + "grad_norm": NaN, + "learning_rate": 0.0002133992134661309, + "loss": 0.0, + "step": 24497 + }, + { + "epoch": 2.2859009051040404, + "grad_norm": NaN, + "learning_rate": 0.00021339235842103753, + "loss": 0.0, + "step": 24498 + }, + { + "epoch": 2.285994214798918, + "grad_norm": NaN, + "learning_rate": 0.00021338550321475615, + "loss": 0.0, + "step": 24499 + }, + { + "epoch": 2.286087524493795, + "grad_norm": NaN, + "learning_rate": 0.00021337864784730422, + "loss": 0.0, + "step": 24500 + }, + { + "epoch": 2.2861808341886722, + "grad_norm": NaN, + "learning_rate": 0.00021337179231869916, + "loss": 0.0, + "step": 24501 + }, + { + "epoch": 2.2862741438835497, + "grad_norm": NaN, + "learning_rate": 0.0002133649366289584, + "loss": 0.0, + "step": 24502 + }, + { + "epoch": 2.2863674535784266, + "grad_norm": NaN, + "learning_rate": 0.0002133580807780994, + "loss": 0.0, + "step": 24503 + }, + { + "epoch": 2.286460763273304, + "grad_norm": NaN, + "learning_rate": 0.00021335122476613954, + "loss": 0.0, + "step": 24504 + }, + { + "epoch": 2.2865540729681815, + "grad_norm": NaN, + "learning_rate": 0.0002133443685930963, + "loss": 0.0, + "step": 24505 + }, + { + "epoch": 2.286647382663059, + "grad_norm": NaN, + "learning_rate": 0.00021333751225898707, + "loss": 0.0, + "step": 24506 + }, + { + "epoch": 2.286740692357936, + "grad_norm": NaN, + "learning_rate": 0.00021333065576382935, + "loss": 0.0, + "step": 24507 + }, + { + "epoch": 2.2868340020528133, + "grad_norm": NaN, + "learning_rate": 0.00021332379910764048, + "loss": 0.0, + "step": 24508 + }, + { + "epoch": 2.2869273117476907, + "grad_norm": NaN, + "learning_rate": 0.000213316942290438, + "loss": 0.0, + "step": 24509 + }, + { + "epoch": 2.2870206214425677, + "grad_norm": NaN, + "learning_rate": 0.00021331008531223926, + "loss": 0.0, + "step": 24510 + }, + { + "epoch": 2.287113931137445, + "grad_norm": NaN, + "learning_rate": 0.00021330322817306174, + "loss": 0.0, + "step": 24511 + }, + { + "epoch": 2.2872072408323225, + "grad_norm": NaN, + "learning_rate": 0.00021329637087292284, + "loss": 0.0, + "step": 24512 + }, + { + "epoch": 2.2873005505272, + "grad_norm": NaN, + "learning_rate": 0.00021328951341184003, + "loss": 0.0, + "step": 24513 + }, + { + "epoch": 2.287393860222077, + "grad_norm": NaN, + "learning_rate": 0.00021328265578983075, + "loss": 0.0, + "step": 24514 + }, + { + "epoch": 2.2874871699169543, + "grad_norm": NaN, + "learning_rate": 0.00021327579800691242, + "loss": 0.0, + "step": 24515 + }, + { + "epoch": 2.2875804796118318, + "grad_norm": NaN, + "learning_rate": 0.0002132689400631025, + "loss": 0.0, + "step": 24516 + }, + { + "epoch": 2.2876737893067087, + "grad_norm": NaN, + "learning_rate": 0.00021326208195841833, + "loss": 0.0, + "step": 24517 + }, + { + "epoch": 2.287767099001586, + "grad_norm": NaN, + "learning_rate": 0.00021325522369287748, + "loss": 0.0, + "step": 24518 + }, + { + "epoch": 2.2878604086964636, + "grad_norm": NaN, + "learning_rate": 0.00021324836526649733, + "loss": 0.0, + "step": 24519 + }, + { + "epoch": 2.287953718391341, + "grad_norm": NaN, + "learning_rate": 0.00021324150667929534, + "loss": 0.0, + "step": 24520 + }, + { + "epoch": 2.2880470280862184, + "grad_norm": NaN, + "learning_rate": 0.0002132346479312889, + "loss": 0.0, + "step": 24521 + }, + { + "epoch": 2.2881403377810954, + "grad_norm": NaN, + "learning_rate": 0.0002132277890224955, + "loss": 0.0, + "step": 24522 + }, + { + "epoch": 2.288233647475973, + "grad_norm": NaN, + "learning_rate": 0.00021322092995293255, + "loss": 0.0, + "step": 24523 + }, + { + "epoch": 2.28832695717085, + "grad_norm": NaN, + "learning_rate": 0.0002132140707226175, + "loss": 0.0, + "step": 24524 + }, + { + "epoch": 2.288420266865727, + "grad_norm": NaN, + "learning_rate": 0.0002132072113315678, + "loss": 0.0, + "step": 24525 + }, + { + "epoch": 2.2885135765606046, + "grad_norm": NaN, + "learning_rate": 0.0002132003517798009, + "loss": 0.0, + "step": 24526 + }, + { + "epoch": 2.288606886255482, + "grad_norm": NaN, + "learning_rate": 0.0002131934920673342, + "loss": 0.0, + "step": 24527 + }, + { + "epoch": 2.2887001959503595, + "grad_norm": NaN, + "learning_rate": 0.0002131866321941852, + "loss": 0.0, + "step": 24528 + }, + { + "epoch": 2.2887935056452364, + "grad_norm": NaN, + "learning_rate": 0.0002131797721603713, + "loss": 0.0, + "step": 24529 + }, + { + "epoch": 2.288886815340114, + "grad_norm": NaN, + "learning_rate": 0.00021317291196590996, + "loss": 0.0, + "step": 24530 + }, + { + "epoch": 2.2889801250349913, + "grad_norm": NaN, + "learning_rate": 0.00021316605161081858, + "loss": 0.0, + "step": 24531 + }, + { + "epoch": 2.2890734347298682, + "grad_norm": NaN, + "learning_rate": 0.00021315919109511468, + "loss": 0.0, + "step": 24532 + }, + { + "epoch": 2.2891667444247457, + "grad_norm": NaN, + "learning_rate": 0.00021315233041881563, + "loss": 0.0, + "step": 24533 + }, + { + "epoch": 2.289260054119623, + "grad_norm": NaN, + "learning_rate": 0.00021314546958193896, + "loss": 0.0, + "step": 24534 + }, + { + "epoch": 2.2893533638145005, + "grad_norm": NaN, + "learning_rate": 0.00021313860858450203, + "loss": 0.0, + "step": 24535 + }, + { + "epoch": 2.2894466735093775, + "grad_norm": NaN, + "learning_rate": 0.00021313174742652234, + "loss": 0.0, + "step": 24536 + }, + { + "epoch": 2.289539983204255, + "grad_norm": NaN, + "learning_rate": 0.0002131248861080173, + "loss": 0.0, + "step": 24537 + }, + { + "epoch": 2.2896332928991323, + "grad_norm": NaN, + "learning_rate": 0.0002131180246290044, + "loss": 0.0, + "step": 24538 + }, + { + "epoch": 2.2897266025940093, + "grad_norm": NaN, + "learning_rate": 0.00021311116298950103, + "loss": 0.0, + "step": 24539 + }, + { + "epoch": 2.2898199122888867, + "grad_norm": NaN, + "learning_rate": 0.0002131043011895247, + "loss": 0.0, + "step": 24540 + }, + { + "epoch": 2.289913221983764, + "grad_norm": NaN, + "learning_rate": 0.0002130974392290928, + "loss": 0.0, + "step": 24541 + }, + { + "epoch": 2.2900065316786415, + "grad_norm": NaN, + "learning_rate": 0.00021309057710822282, + "loss": 0.0, + "step": 24542 + }, + { + "epoch": 2.2900998413735185, + "grad_norm": NaN, + "learning_rate": 0.00021308371482693216, + "loss": 0.0, + "step": 24543 + }, + { + "epoch": 2.290193151068396, + "grad_norm": NaN, + "learning_rate": 0.00021307685238523832, + "loss": 0.0, + "step": 24544 + }, + { + "epoch": 2.2902864607632734, + "grad_norm": NaN, + "learning_rate": 0.00021306998978315873, + "loss": 0.0, + "step": 24545 + }, + { + "epoch": 2.2903797704581508, + "grad_norm": NaN, + "learning_rate": 0.00021306312702071083, + "loss": 0.0, + "step": 24546 + }, + { + "epoch": 2.2904730801530278, + "grad_norm": NaN, + "learning_rate": 0.00021305626409791212, + "loss": 0.0, + "step": 24547 + }, + { + "epoch": 2.290566389847905, + "grad_norm": NaN, + "learning_rate": 0.00021304940101477994, + "loss": 0.0, + "step": 24548 + }, + { + "epoch": 2.2906596995427826, + "grad_norm": NaN, + "learning_rate": 0.00021304253777133184, + "loss": 0.0, + "step": 24549 + }, + { + "epoch": 2.29075300923766, + "grad_norm": NaN, + "learning_rate": 0.00021303567436758527, + "loss": 0.0, + "step": 24550 + }, + { + "epoch": 2.290846318932537, + "grad_norm": NaN, + "learning_rate": 0.00021302881080355766, + "loss": 0.0, + "step": 24551 + }, + { + "epoch": 2.2909396286274144, + "grad_norm": NaN, + "learning_rate": 0.0002130219470792664, + "loss": 0.0, + "step": 24552 + }, + { + "epoch": 2.291032938322292, + "grad_norm": NaN, + "learning_rate": 0.000213015083194729, + "loss": 0.0, + "step": 24553 + }, + { + "epoch": 2.291126248017169, + "grad_norm": NaN, + "learning_rate": 0.00021300821914996292, + "loss": 0.0, + "step": 24554 + }, + { + "epoch": 2.291219557712046, + "grad_norm": NaN, + "learning_rate": 0.00021300135494498564, + "loss": 0.0, + "step": 24555 + }, + { + "epoch": 2.2913128674069236, + "grad_norm": NaN, + "learning_rate": 0.00021299449057981458, + "loss": 0.0, + "step": 24556 + }, + { + "epoch": 2.291406177101801, + "grad_norm": NaN, + "learning_rate": 0.00021298762605446715, + "loss": 0.0, + "step": 24557 + }, + { + "epoch": 2.291499486796678, + "grad_norm": NaN, + "learning_rate": 0.00021298076136896086, + "loss": 0.0, + "step": 24558 + }, + { + "epoch": 2.2915927964915555, + "grad_norm": NaN, + "learning_rate": 0.00021297389652331312, + "loss": 0.0, + "step": 24559 + }, + { + "epoch": 2.291686106186433, + "grad_norm": NaN, + "learning_rate": 0.00021296703151754147, + "loss": 0.0, + "step": 24560 + }, + { + "epoch": 2.29177941588131, + "grad_norm": NaN, + "learning_rate": 0.00021296016635166326, + "loss": 0.0, + "step": 24561 + }, + { + "epoch": 2.2918727255761873, + "grad_norm": NaN, + "learning_rate": 0.00021295330102569603, + "loss": 0.0, + "step": 24562 + }, + { + "epoch": 2.2919660352710647, + "grad_norm": NaN, + "learning_rate": 0.0002129464355396572, + "loss": 0.0, + "step": 24563 + }, + { + "epoch": 2.292059344965942, + "grad_norm": NaN, + "learning_rate": 0.0002129395698935642, + "loss": 0.0, + "step": 24564 + }, + { + "epoch": 2.292152654660819, + "grad_norm": NaN, + "learning_rate": 0.00021293270408743456, + "loss": 0.0, + "step": 24565 + }, + { + "epoch": 2.2922459643556965, + "grad_norm": NaN, + "learning_rate": 0.00021292583812128568, + "loss": 0.0, + "step": 24566 + }, + { + "epoch": 2.292339274050574, + "grad_norm": NaN, + "learning_rate": 0.00021291897199513502, + "loss": 0.0, + "step": 24567 + }, + { + "epoch": 2.2924325837454513, + "grad_norm": NaN, + "learning_rate": 0.0002129121057090001, + "loss": 0.0, + "step": 24568 + }, + { + "epoch": 2.2925258934403283, + "grad_norm": NaN, + "learning_rate": 0.00021290523926289828, + "loss": 0.0, + "step": 24569 + }, + { + "epoch": 2.2926192031352057, + "grad_norm": NaN, + "learning_rate": 0.00021289837265684708, + "loss": 0.0, + "step": 24570 + }, + { + "epoch": 2.292712512830083, + "grad_norm": NaN, + "learning_rate": 0.00021289150589086397, + "loss": 0.0, + "step": 24571 + }, + { + "epoch": 2.2928058225249606, + "grad_norm": NaN, + "learning_rate": 0.00021288463896496635, + "loss": 0.0, + "step": 24572 + }, + { + "epoch": 2.2928991322198375, + "grad_norm": NaN, + "learning_rate": 0.00021287777187917173, + "loss": 0.0, + "step": 24573 + }, + { + "epoch": 2.292992441914715, + "grad_norm": NaN, + "learning_rate": 0.0002128709046334976, + "loss": 0.0, + "step": 24574 + }, + { + "epoch": 2.2930857516095924, + "grad_norm": NaN, + "learning_rate": 0.00021286403722796136, + "loss": 0.0, + "step": 24575 + }, + { + "epoch": 2.2931790613044694, + "grad_norm": NaN, + "learning_rate": 0.00021285716966258048, + "loss": 0.0, + "step": 24576 + }, + { + "epoch": 2.2932723709993468, + "grad_norm": NaN, + "learning_rate": 0.00021285030193737248, + "loss": 0.0, + "step": 24577 + }, + { + "epoch": 2.293365680694224, + "grad_norm": NaN, + "learning_rate": 0.00021284343405235474, + "loss": 0.0, + "step": 24578 + }, + { + "epoch": 2.2934589903891016, + "grad_norm": NaN, + "learning_rate": 0.0002128365660075448, + "loss": 0.0, + "step": 24579 + }, + { + "epoch": 2.2935523000839786, + "grad_norm": NaN, + "learning_rate": 0.00021282969780296007, + "loss": 0.0, + "step": 24580 + }, + { + "epoch": 2.293645609778856, + "grad_norm": NaN, + "learning_rate": 0.00021282282943861798, + "loss": 0.0, + "step": 24581 + }, + { + "epoch": 2.2937389194737334, + "grad_norm": NaN, + "learning_rate": 0.0002128159609145361, + "loss": 0.0, + "step": 24582 + }, + { + "epoch": 2.2938322291686104, + "grad_norm": NaN, + "learning_rate": 0.00021280909223073184, + "loss": 0.0, + "step": 24583 + }, + { + "epoch": 2.293925538863488, + "grad_norm": NaN, + "learning_rate": 0.00021280222338722262, + "loss": 0.0, + "step": 24584 + }, + { + "epoch": 2.2940188485583652, + "grad_norm": NaN, + "learning_rate": 0.00021279535438402603, + "loss": 0.0, + "step": 24585 + }, + { + "epoch": 2.2941121582532427, + "grad_norm": NaN, + "learning_rate": 0.0002127884852211594, + "loss": 0.0, + "step": 24586 + }, + { + "epoch": 2.2942054679481196, + "grad_norm": NaN, + "learning_rate": 0.00021278161589864022, + "loss": 0.0, + "step": 24587 + }, + { + "epoch": 2.294298777642997, + "grad_norm": NaN, + "learning_rate": 0.00021277474641648604, + "loss": 0.0, + "step": 24588 + }, + { + "epoch": 2.2943920873378745, + "grad_norm": NaN, + "learning_rate": 0.00021276787677471426, + "loss": 0.0, + "step": 24589 + }, + { + "epoch": 2.294485397032752, + "grad_norm": NaN, + "learning_rate": 0.00021276100697334237, + "loss": 0.0, + "step": 24590 + }, + { + "epoch": 2.294578706727629, + "grad_norm": NaN, + "learning_rate": 0.00021275413701238782, + "loss": 0.0, + "step": 24591 + }, + { + "epoch": 2.2946720164225063, + "grad_norm": NaN, + "learning_rate": 0.0002127472668918681, + "loss": 0.0, + "step": 24592 + }, + { + "epoch": 2.2947653261173837, + "grad_norm": NaN, + "learning_rate": 0.00021274039661180067, + "loss": 0.0, + "step": 24593 + }, + { + "epoch": 2.294858635812261, + "grad_norm": NaN, + "learning_rate": 0.00021273352617220297, + "loss": 0.0, + "step": 24594 + }, + { + "epoch": 2.294951945507138, + "grad_norm": NaN, + "learning_rate": 0.00021272665557309253, + "loss": 0.0, + "step": 24595 + }, + { + "epoch": 2.2950452552020155, + "grad_norm": NaN, + "learning_rate": 0.00021271978481448672, + "loss": 0.0, + "step": 24596 + }, + { + "epoch": 2.295138564896893, + "grad_norm": NaN, + "learning_rate": 0.00021271291389640318, + "loss": 0.0, + "step": 24597 + }, + { + "epoch": 2.29523187459177, + "grad_norm": NaN, + "learning_rate": 0.0002127060428188592, + "loss": 0.0, + "step": 24598 + }, + { + "epoch": 2.2953251842866473, + "grad_norm": NaN, + "learning_rate": 0.00021269917158187233, + "loss": 0.0, + "step": 24599 + }, + { + "epoch": 2.2954184939815248, + "grad_norm": NaN, + "learning_rate": 0.00021269230018546007, + "loss": 0.0, + "step": 24600 + }, + { + "epoch": 2.295511803676402, + "grad_norm": NaN, + "learning_rate": 0.00021268542862963987, + "loss": 0.0, + "step": 24601 + }, + { + "epoch": 2.295605113371279, + "grad_norm": NaN, + "learning_rate": 0.00021267855691442918, + "loss": 0.0, + "step": 24602 + }, + { + "epoch": 2.2956984230661566, + "grad_norm": NaN, + "learning_rate": 0.00021267168503984543, + "loss": 0.0, + "step": 24603 + }, + { + "epoch": 2.295791732761034, + "grad_norm": NaN, + "learning_rate": 0.00021266481300590624, + "loss": 0.0, + "step": 24604 + }, + { + "epoch": 2.295885042455911, + "grad_norm": NaN, + "learning_rate": 0.0002126579408126289, + "loss": 0.0, + "step": 24605 + }, + { + "epoch": 2.2959783521507884, + "grad_norm": NaN, + "learning_rate": 0.00021265106846003103, + "loss": 0.0, + "step": 24606 + }, + { + "epoch": 2.296071661845666, + "grad_norm": NaN, + "learning_rate": 0.0002126441959481301, + "loss": 0.0, + "step": 24607 + }, + { + "epoch": 2.296164971540543, + "grad_norm": NaN, + "learning_rate": 0.00021263732327694347, + "loss": 0.0, + "step": 24608 + }, + { + "epoch": 2.29625828123542, + "grad_norm": NaN, + "learning_rate": 0.00021263045044648865, + "loss": 0.0, + "step": 24609 + }, + { + "epoch": 2.2963515909302976, + "grad_norm": NaN, + "learning_rate": 0.0002126235774567832, + "loss": 0.0, + "step": 24610 + }, + { + "epoch": 2.296444900625175, + "grad_norm": NaN, + "learning_rate": 0.00021261670430784457, + "loss": 0.0, + "step": 24611 + }, + { + "epoch": 2.296538210320052, + "grad_norm": NaN, + "learning_rate": 0.0002126098309996902, + "loss": 0.0, + "step": 24612 + }, + { + "epoch": 2.2966315200149294, + "grad_norm": NaN, + "learning_rate": 0.00021260295753233754, + "loss": 0.0, + "step": 24613 + }, + { + "epoch": 2.296724829709807, + "grad_norm": NaN, + "learning_rate": 0.00021259608390580413, + "loss": 0.0, + "step": 24614 + }, + { + "epoch": 2.2968181394046843, + "grad_norm": NaN, + "learning_rate": 0.00021258921012010738, + "loss": 0.0, + "step": 24615 + }, + { + "epoch": 2.2969114490995617, + "grad_norm": NaN, + "learning_rate": 0.00021258233617526485, + "loss": 0.0, + "step": 24616 + }, + { + "epoch": 2.2970047587944387, + "grad_norm": NaN, + "learning_rate": 0.000212575462071294, + "loss": 0.0, + "step": 24617 + }, + { + "epoch": 2.297098068489316, + "grad_norm": NaN, + "learning_rate": 0.00021256858780821224, + "loss": 0.0, + "step": 24618 + }, + { + "epoch": 2.2971913781841935, + "grad_norm": NaN, + "learning_rate": 0.0002125617133860371, + "loss": 0.0, + "step": 24619 + }, + { + "epoch": 2.2972846878790705, + "grad_norm": NaN, + "learning_rate": 0.00021255483880478604, + "loss": 0.0, + "step": 24620 + }, + { + "epoch": 2.297377997573948, + "grad_norm": NaN, + "learning_rate": 0.00021254796406447657, + "loss": 0.0, + "step": 24621 + }, + { + "epoch": 2.2974713072688253, + "grad_norm": NaN, + "learning_rate": 0.00021254108916512614, + "loss": 0.0, + "step": 24622 + }, + { + "epoch": 2.2975646169637027, + "grad_norm": NaN, + "learning_rate": 0.0002125342141067523, + "loss": 0.0, + "step": 24623 + }, + { + "epoch": 2.2976579266585797, + "grad_norm": NaN, + "learning_rate": 0.00021252733888937246, + "loss": 0.0, + "step": 24624 + }, + { + "epoch": 2.297751236353457, + "grad_norm": NaN, + "learning_rate": 0.00021252046351300408, + "loss": 0.0, + "step": 24625 + }, + { + "epoch": 2.2978445460483345, + "grad_norm": NaN, + "learning_rate": 0.00021251358797766474, + "loss": 0.0, + "step": 24626 + }, + { + "epoch": 2.2979378557432115, + "grad_norm": NaN, + "learning_rate": 0.00021250671228337183, + "loss": 0.0, + "step": 24627 + }, + { + "epoch": 2.298031165438089, + "grad_norm": NaN, + "learning_rate": 0.00021249983643014284, + "loss": 0.0, + "step": 24628 + }, + { + "epoch": 2.2981244751329664, + "grad_norm": NaN, + "learning_rate": 0.0002124929604179953, + "loss": 0.0, + "step": 24629 + }, + { + "epoch": 2.298217784827844, + "grad_norm": NaN, + "learning_rate": 0.0002124860842469467, + "loss": 0.0, + "step": 24630 + }, + { + "epoch": 2.2983110945227208, + "grad_norm": NaN, + "learning_rate": 0.00021247920791701443, + "loss": 0.0, + "step": 24631 + }, + { + "epoch": 2.298404404217598, + "grad_norm": NaN, + "learning_rate": 0.00021247233142821615, + "loss": 0.0, + "step": 24632 + }, + { + "epoch": 2.2984977139124756, + "grad_norm": NaN, + "learning_rate": 0.00021246545478056917, + "loss": 0.0, + "step": 24633 + }, + { + "epoch": 2.2985910236073526, + "grad_norm": NaN, + "learning_rate": 0.00021245857797409104, + "loss": 0.0, + "step": 24634 + }, + { + "epoch": 2.29868433330223, + "grad_norm": NaN, + "learning_rate": 0.00021245170100879924, + "loss": 0.0, + "step": 24635 + }, + { + "epoch": 2.2987776429971074, + "grad_norm": NaN, + "learning_rate": 0.00021244482388471132, + "loss": 0.0, + "step": 24636 + }, + { + "epoch": 2.298870952691985, + "grad_norm": NaN, + "learning_rate": 0.00021243794660184465, + "loss": 0.0, + "step": 24637 + }, + { + "epoch": 2.298964262386862, + "grad_norm": NaN, + "learning_rate": 0.00021243106916021676, + "loss": 0.0, + "step": 24638 + }, + { + "epoch": 2.299057572081739, + "grad_norm": NaN, + "learning_rate": 0.00021242419155984525, + "loss": 0.0, + "step": 24639 + }, + { + "epoch": 2.2991508817766166, + "grad_norm": NaN, + "learning_rate": 0.00021241731380074744, + "loss": 0.0, + "step": 24640 + }, + { + "epoch": 2.299244191471494, + "grad_norm": NaN, + "learning_rate": 0.0002124104358829409, + "loss": 0.0, + "step": 24641 + }, + { + "epoch": 2.299337501166371, + "grad_norm": NaN, + "learning_rate": 0.0002124035578064431, + "loss": 0.0, + "step": 24642 + }, + { + "epoch": 2.2994308108612485, + "grad_norm": NaN, + "learning_rate": 0.00021239667957127157, + "loss": 0.0, + "step": 24643 + }, + { + "epoch": 2.299524120556126, + "grad_norm": NaN, + "learning_rate": 0.00021238980117744373, + "loss": 0.0, + "step": 24644 + }, + { + "epoch": 2.2996174302510033, + "grad_norm": NaN, + "learning_rate": 0.00021238292262497712, + "loss": 0.0, + "step": 24645 + }, + { + "epoch": 2.2997107399458803, + "grad_norm": NaN, + "learning_rate": 0.00021237604391388925, + "loss": 0.0, + "step": 24646 + }, + { + "epoch": 2.2998040496407577, + "grad_norm": NaN, + "learning_rate": 0.0002123691650441975, + "loss": 0.0, + "step": 24647 + }, + { + "epoch": 2.299897359335635, + "grad_norm": NaN, + "learning_rate": 0.00021236228601591954, + "loss": 0.0, + "step": 24648 + }, + { + "epoch": 2.299990669030512, + "grad_norm": NaN, + "learning_rate": 0.00021235540682907272, + "loss": 0.0, + "step": 24649 + }, + { + "epoch": 2.3000839787253895, + "grad_norm": NaN, + "learning_rate": 0.0002123485274836745, + "loss": 0.0, + "step": 24650 + }, + { + "epoch": 2.300177288420267, + "grad_norm": NaN, + "learning_rate": 0.00021234164797974254, + "loss": 0.0, + "step": 24651 + }, + { + "epoch": 2.3002705981151443, + "grad_norm": NaN, + "learning_rate": 0.0002123347683172942, + "loss": 0.0, + "step": 24652 + }, + { + "epoch": 2.3003639078100213, + "grad_norm": NaN, + "learning_rate": 0.000212327888496347, + "loss": 0.0, + "step": 24653 + }, + { + "epoch": 2.3004572175048987, + "grad_norm": NaN, + "learning_rate": 0.00021232100851691847, + "loss": 0.0, + "step": 24654 + }, + { + "epoch": 2.300550527199776, + "grad_norm": NaN, + "learning_rate": 0.00021231412837902604, + "loss": 0.0, + "step": 24655 + }, + { + "epoch": 2.300643836894653, + "grad_norm": NaN, + "learning_rate": 0.00021230724808268722, + "loss": 0.0, + "step": 24656 + }, + { + "epoch": 2.3007371465895305, + "grad_norm": NaN, + "learning_rate": 0.00021230036762791954, + "loss": 0.0, + "step": 24657 + }, + { + "epoch": 2.300830456284408, + "grad_norm": NaN, + "learning_rate": 0.00021229348701474058, + "loss": 0.0, + "step": 24658 + }, + { + "epoch": 2.3009237659792854, + "grad_norm": NaN, + "learning_rate": 0.00021228660624316763, + "loss": 0.0, + "step": 24659 + }, + { + "epoch": 2.3010170756741624, + "grad_norm": NaN, + "learning_rate": 0.0002122797253132183, + "loss": 0.0, + "step": 24660 + }, + { + "epoch": 2.30111038536904, + "grad_norm": NaN, + "learning_rate": 0.00021227284422491012, + "loss": 0.0, + "step": 24661 + }, + { + "epoch": 2.301203695063917, + "grad_norm": NaN, + "learning_rate": 0.00021226596297826053, + "loss": 0.0, + "step": 24662 + }, + { + "epoch": 2.3012970047587946, + "grad_norm": NaN, + "learning_rate": 0.00021225908157328696, + "loss": 0.0, + "step": 24663 + }, + { + "epoch": 2.3013903144536716, + "grad_norm": NaN, + "learning_rate": 0.00021225220001000713, + "loss": 0.0, + "step": 24664 + }, + { + "epoch": 2.301483624148549, + "grad_norm": NaN, + "learning_rate": 0.0002122453182884383, + "loss": 0.0, + "step": 24665 + }, + { + "epoch": 2.3015769338434264, + "grad_norm": NaN, + "learning_rate": 0.00021223843640859807, + "loss": 0.0, + "step": 24666 + }, + { + "epoch": 2.301670243538304, + "grad_norm": NaN, + "learning_rate": 0.00021223155437050396, + "loss": 0.0, + "step": 24667 + }, + { + "epoch": 2.301763553233181, + "grad_norm": NaN, + "learning_rate": 0.00021222467217417348, + "loss": 0.0, + "step": 24668 + }, + { + "epoch": 2.3018568629280582, + "grad_norm": NaN, + "learning_rate": 0.00021221778981962396, + "loss": 0.0, + "step": 24669 + }, + { + "epoch": 2.3019501726229357, + "grad_norm": NaN, + "learning_rate": 0.00021221090730687316, + "loss": 0.0, + "step": 24670 + }, + { + "epoch": 2.3020434823178126, + "grad_norm": NaN, + "learning_rate": 0.0002122040246359384, + "loss": 0.0, + "step": 24671 + }, + { + "epoch": 2.30213679201269, + "grad_norm": NaN, + "learning_rate": 0.0002121971418068372, + "loss": 0.0, + "step": 24672 + }, + { + "epoch": 2.3022301017075675, + "grad_norm": NaN, + "learning_rate": 0.0002121902588195872, + "loss": 0.0, + "step": 24673 + }, + { + "epoch": 2.302323411402445, + "grad_norm": NaN, + "learning_rate": 0.0002121833756742057, + "loss": 0.0, + "step": 24674 + }, + { + "epoch": 2.302416721097322, + "grad_norm": NaN, + "learning_rate": 0.0002121764923707103, + "loss": 0.0, + "step": 24675 + }, + { + "epoch": 2.3025100307921993, + "grad_norm": NaN, + "learning_rate": 0.00021216960890911853, + "loss": 0.0, + "step": 24676 + }, + { + "epoch": 2.3026033404870767, + "grad_norm": NaN, + "learning_rate": 0.00021216272528944784, + "loss": 0.0, + "step": 24677 + }, + { + "epoch": 2.3026966501819537, + "grad_norm": NaN, + "learning_rate": 0.00021215584151171573, + "loss": 0.0, + "step": 24678 + }, + { + "epoch": 2.302789959876831, + "grad_norm": NaN, + "learning_rate": 0.00021214895757593977, + "loss": 0.0, + "step": 24679 + }, + { + "epoch": 2.3028832695717085, + "grad_norm": NaN, + "learning_rate": 0.00021214207348213746, + "loss": 0.0, + "step": 24680 + }, + { + "epoch": 2.302976579266586, + "grad_norm": NaN, + "learning_rate": 0.00021213518923032617, + "loss": 0.0, + "step": 24681 + }, + { + "epoch": 2.303069888961463, + "grad_norm": NaN, + "learning_rate": 0.00021212830482052354, + "loss": 0.0, + "step": 24682 + }, + { + "epoch": 2.3031631986563403, + "grad_norm": NaN, + "learning_rate": 0.00021212142025274708, + "loss": 0.0, + "step": 24683 + }, + { + "epoch": 2.3032565083512178, + "grad_norm": NaN, + "learning_rate": 0.0002121145355270142, + "loss": 0.0, + "step": 24684 + }, + { + "epoch": 2.303349818046095, + "grad_norm": NaN, + "learning_rate": 0.0002121076506433424, + "loss": 0.0, + "step": 24685 + }, + { + "epoch": 2.303443127740972, + "grad_norm": NaN, + "learning_rate": 0.00021210076560174932, + "loss": 0.0, + "step": 24686 + }, + { + "epoch": 2.3035364374358496, + "grad_norm": NaN, + "learning_rate": 0.00021209388040225238, + "loss": 0.0, + "step": 24687 + }, + { + "epoch": 2.303629747130727, + "grad_norm": NaN, + "learning_rate": 0.00021208699504486907, + "loss": 0.0, + "step": 24688 + }, + { + "epoch": 2.3037230568256044, + "grad_norm": NaN, + "learning_rate": 0.00021208010952961697, + "loss": 0.0, + "step": 24689 + }, + { + "epoch": 2.3038163665204814, + "grad_norm": NaN, + "learning_rate": 0.00021207322385651355, + "loss": 0.0, + "step": 24690 + }, + { + "epoch": 2.303909676215359, + "grad_norm": NaN, + "learning_rate": 0.00021206633802557624, + "loss": 0.0, + "step": 24691 + }, + { + "epoch": 2.3040029859102362, + "grad_norm": NaN, + "learning_rate": 0.00021205945203682267, + "loss": 0.0, + "step": 24692 + }, + { + "epoch": 2.304096295605113, + "grad_norm": NaN, + "learning_rate": 0.00021205256589027025, + "loss": 0.0, + "step": 24693 + }, + { + "epoch": 2.3041896052999906, + "grad_norm": NaN, + "learning_rate": 0.00021204567958593658, + "loss": 0.0, + "step": 24694 + }, + { + "epoch": 2.304282914994868, + "grad_norm": NaN, + "learning_rate": 0.00021203879312383907, + "loss": 0.0, + "step": 24695 + }, + { + "epoch": 2.3043762246897455, + "grad_norm": NaN, + "learning_rate": 0.00021203190650399538, + "loss": 0.0, + "step": 24696 + }, + { + "epoch": 2.3044695343846224, + "grad_norm": NaN, + "learning_rate": 0.00021202501972642285, + "loss": 0.0, + "step": 24697 + }, + { + "epoch": 2.3045628440795, + "grad_norm": NaN, + "learning_rate": 0.00021201813279113908, + "loss": 0.0, + "step": 24698 + }, + { + "epoch": 2.3046561537743773, + "grad_norm": NaN, + "learning_rate": 0.0002120112456981616, + "loss": 0.0, + "step": 24699 + }, + { + "epoch": 2.3047494634692542, + "grad_norm": NaN, + "learning_rate": 0.00021200435844750785, + "loss": 0.0, + "step": 24700 + }, + { + "epoch": 2.3048427731641317, + "grad_norm": NaN, + "learning_rate": 0.0002119974710391954, + "loss": 0.0, + "step": 24701 + }, + { + "epoch": 2.304936082859009, + "grad_norm": NaN, + "learning_rate": 0.00021199058347324182, + "loss": 0.0, + "step": 24702 + }, + { + "epoch": 2.3050293925538865, + "grad_norm": NaN, + "learning_rate": 0.00021198369574966446, + "loss": 0.0, + "step": 24703 + }, + { + "epoch": 2.3051227022487635, + "grad_norm": NaN, + "learning_rate": 0.0002119768078684809, + "loss": 0.0, + "step": 24704 + }, + { + "epoch": 2.305216011943641, + "grad_norm": NaN, + "learning_rate": 0.00021196991982970874, + "loss": 0.0, + "step": 24705 + }, + { + "epoch": 2.3053093216385183, + "grad_norm": NaN, + "learning_rate": 0.00021196303163336543, + "loss": 0.0, + "step": 24706 + }, + { + "epoch": 2.3054026313333953, + "grad_norm": NaN, + "learning_rate": 0.0002119561432794684, + "loss": 0.0, + "step": 24707 + }, + { + "epoch": 2.3054959410282727, + "grad_norm": NaN, + "learning_rate": 0.00021194925476803538, + "loss": 0.0, + "step": 24708 + }, + { + "epoch": 2.30558925072315, + "grad_norm": NaN, + "learning_rate": 0.00021194236609908368, + "loss": 0.0, + "step": 24709 + }, + { + "epoch": 2.3056825604180275, + "grad_norm": NaN, + "learning_rate": 0.00021193547727263088, + "loss": 0.0, + "step": 24710 + }, + { + "epoch": 2.305775870112905, + "grad_norm": NaN, + "learning_rate": 0.00021192858828869457, + "loss": 0.0, + "step": 24711 + }, + { + "epoch": 2.305869179807782, + "grad_norm": NaN, + "learning_rate": 0.00021192169914729215, + "loss": 0.0, + "step": 24712 + }, + { + "epoch": 2.3059624895026594, + "grad_norm": NaN, + "learning_rate": 0.0002119148098484412, + "loss": 0.0, + "step": 24713 + }, + { + "epoch": 2.306055799197537, + "grad_norm": NaN, + "learning_rate": 0.00021190792039215922, + "loss": 0.0, + "step": 24714 + }, + { + "epoch": 2.3061491088924138, + "grad_norm": NaN, + "learning_rate": 0.00021190103077846378, + "loss": 0.0, + "step": 24715 + }, + { + "epoch": 2.306242418587291, + "grad_norm": NaN, + "learning_rate": 0.0002118941410073723, + "loss": 0.0, + "step": 24716 + }, + { + "epoch": 2.3063357282821686, + "grad_norm": NaN, + "learning_rate": 0.00021188725107890237, + "loss": 0.0, + "step": 24717 + }, + { + "epoch": 2.306429037977046, + "grad_norm": NaN, + "learning_rate": 0.00021188036099307157, + "loss": 0.0, + "step": 24718 + }, + { + "epoch": 2.306522347671923, + "grad_norm": NaN, + "learning_rate": 0.00021187347074989724, + "loss": 0.0, + "step": 24719 + }, + { + "epoch": 2.3066156573668004, + "grad_norm": NaN, + "learning_rate": 0.00021186658034939705, + "loss": 0.0, + "step": 24720 + }, + { + "epoch": 2.306708967061678, + "grad_norm": NaN, + "learning_rate": 0.00021185968979158846, + "loss": 0.0, + "step": 24721 + }, + { + "epoch": 2.306802276756555, + "grad_norm": NaN, + "learning_rate": 0.00021185279907648898, + "loss": 0.0, + "step": 24722 + }, + { + "epoch": 2.3068955864514322, + "grad_norm": NaN, + "learning_rate": 0.0002118459082041162, + "loss": 0.0, + "step": 24723 + }, + { + "epoch": 2.3069888961463096, + "grad_norm": NaN, + "learning_rate": 0.0002118390171744876, + "loss": 0.0, + "step": 24724 + }, + { + "epoch": 2.307082205841187, + "grad_norm": NaN, + "learning_rate": 0.00021183212598762066, + "loss": 0.0, + "step": 24725 + }, + { + "epoch": 2.307175515536064, + "grad_norm": NaN, + "learning_rate": 0.00021182523464353294, + "loss": 0.0, + "step": 24726 + }, + { + "epoch": 2.3072688252309415, + "grad_norm": NaN, + "learning_rate": 0.00021181834314224198, + "loss": 0.0, + "step": 24727 + }, + { + "epoch": 2.307362134925819, + "grad_norm": NaN, + "learning_rate": 0.00021181145148376529, + "loss": 0.0, + "step": 24728 + }, + { + "epoch": 2.307455444620696, + "grad_norm": NaN, + "learning_rate": 0.00021180455966812034, + "loss": 0.0, + "step": 24729 + }, + { + "epoch": 2.3075487543155733, + "grad_norm": NaN, + "learning_rate": 0.0002117976676953248, + "loss": 0.0, + "step": 24730 + }, + { + "epoch": 2.3076420640104507, + "grad_norm": NaN, + "learning_rate": 0.000211790775565396, + "loss": 0.0, + "step": 24731 + }, + { + "epoch": 2.307735373705328, + "grad_norm": NaN, + "learning_rate": 0.00021178388327835158, + "loss": 0.0, + "step": 24732 + }, + { + "epoch": 2.3078286834002055, + "grad_norm": NaN, + "learning_rate": 0.0002117769908342091, + "loss": 0.0, + "step": 24733 + }, + { + "epoch": 2.3079219930950825, + "grad_norm": NaN, + "learning_rate": 0.00021177009823298602, + "loss": 0.0, + "step": 24734 + }, + { + "epoch": 2.30801530278996, + "grad_norm": NaN, + "learning_rate": 0.00021176320547469982, + "loss": 0.0, + "step": 24735 + }, + { + "epoch": 2.3081086124848373, + "grad_norm": NaN, + "learning_rate": 0.0002117563125593681, + "loss": 0.0, + "step": 24736 + }, + { + "epoch": 2.3082019221797143, + "grad_norm": NaN, + "learning_rate": 0.00021174941948700842, + "loss": 0.0, + "step": 24737 + }, + { + "epoch": 2.3082952318745917, + "grad_norm": NaN, + "learning_rate": 0.00021174252625763822, + "loss": 0.0, + "step": 24738 + }, + { + "epoch": 2.308388541569469, + "grad_norm": NaN, + "learning_rate": 0.00021173563287127506, + "loss": 0.0, + "step": 24739 + }, + { + "epoch": 2.3084818512643466, + "grad_norm": NaN, + "learning_rate": 0.0002117287393279365, + "loss": 0.0, + "step": 24740 + }, + { + "epoch": 2.3085751609592235, + "grad_norm": NaN, + "learning_rate": 0.00021172184562764004, + "loss": 0.0, + "step": 24741 + }, + { + "epoch": 2.308668470654101, + "grad_norm": NaN, + "learning_rate": 0.00021171495177040316, + "loss": 0.0, + "step": 24742 + }, + { + "epoch": 2.3087617803489784, + "grad_norm": NaN, + "learning_rate": 0.0002117080577562435, + "loss": 0.0, + "step": 24743 + }, + { + "epoch": 2.3088550900438554, + "grad_norm": NaN, + "learning_rate": 0.00021170116358517848, + "loss": 0.0, + "step": 24744 + }, + { + "epoch": 2.308948399738733, + "grad_norm": NaN, + "learning_rate": 0.0002116942692572257, + "loss": 0.0, + "step": 24745 + }, + { + "epoch": 2.30904170943361, + "grad_norm": NaN, + "learning_rate": 0.00021168737477240274, + "loss": 0.0, + "step": 24746 + }, + { + "epoch": 2.3091350191284876, + "grad_norm": NaN, + "learning_rate": 0.00021168048013072698, + "loss": 0.0, + "step": 24747 + }, + { + "epoch": 2.3092283288233646, + "grad_norm": NaN, + "learning_rate": 0.000211673585332216, + "loss": 0.0, + "step": 24748 + }, + { + "epoch": 2.309321638518242, + "grad_norm": NaN, + "learning_rate": 0.00021166669037688744, + "loss": 0.0, + "step": 24749 + }, + { + "epoch": 2.3094149482131194, + "grad_norm": NaN, + "learning_rate": 0.00021165979526475872, + "loss": 0.0, + "step": 24750 + }, + { + "epoch": 2.3095082579079964, + "grad_norm": NaN, + "learning_rate": 0.00021165289999584738, + "loss": 0.0, + "step": 24751 + }, + { + "epoch": 2.309601567602874, + "grad_norm": NaN, + "learning_rate": 0.000211646004570171, + "loss": 0.0, + "step": 24752 + }, + { + "epoch": 2.3096948772977512, + "grad_norm": NaN, + "learning_rate": 0.00021163910898774712, + "loss": 0.0, + "step": 24753 + }, + { + "epoch": 2.3097881869926287, + "grad_norm": NaN, + "learning_rate": 0.0002116322132485932, + "loss": 0.0, + "step": 24754 + }, + { + "epoch": 2.3098814966875056, + "grad_norm": NaN, + "learning_rate": 0.00021162531735272681, + "loss": 0.0, + "step": 24755 + }, + { + "epoch": 2.309974806382383, + "grad_norm": NaN, + "learning_rate": 0.00021161842130016558, + "loss": 0.0, + "step": 24756 + }, + { + "epoch": 2.3100681160772605, + "grad_norm": NaN, + "learning_rate": 0.00021161152509092683, + "loss": 0.0, + "step": 24757 + }, + { + "epoch": 2.310161425772138, + "grad_norm": NaN, + "learning_rate": 0.0002116046287250283, + "loss": 0.0, + "step": 24758 + }, + { + "epoch": 2.310254735467015, + "grad_norm": NaN, + "learning_rate": 0.00021159773220248747, + "loss": 0.0, + "step": 24759 + }, + { + "epoch": 2.3103480451618923, + "grad_norm": NaN, + "learning_rate": 0.00021159083552332178, + "loss": 0.0, + "step": 24760 + }, + { + "epoch": 2.3104413548567697, + "grad_norm": NaN, + "learning_rate": 0.00021158393868754888, + "loss": 0.0, + "step": 24761 + }, + { + "epoch": 2.310534664551647, + "grad_norm": NaN, + "learning_rate": 0.00021157704169518627, + "loss": 0.0, + "step": 24762 + }, + { + "epoch": 2.310627974246524, + "grad_norm": NaN, + "learning_rate": 0.00021157014454625147, + "loss": 0.0, + "step": 24763 + }, + { + "epoch": 2.3107212839414015, + "grad_norm": NaN, + "learning_rate": 0.00021156324724076205, + "loss": 0.0, + "step": 24764 + }, + { + "epoch": 2.310814593636279, + "grad_norm": NaN, + "learning_rate": 0.00021155634977873555, + "loss": 0.0, + "step": 24765 + }, + { + "epoch": 2.310907903331156, + "grad_norm": NaN, + "learning_rate": 0.00021154945216018942, + "loss": 0.0, + "step": 24766 + }, + { + "epoch": 2.3110012130260333, + "grad_norm": NaN, + "learning_rate": 0.0002115425543851413, + "loss": 0.0, + "step": 24767 + }, + { + "epoch": 2.3110945227209108, + "grad_norm": NaN, + "learning_rate": 0.0002115356564536087, + "loss": 0.0, + "step": 24768 + }, + { + "epoch": 2.311187832415788, + "grad_norm": NaN, + "learning_rate": 0.00021152875836560916, + "loss": 0.0, + "step": 24769 + }, + { + "epoch": 2.311281142110665, + "grad_norm": NaN, + "learning_rate": 0.00021152186012116016, + "loss": 0.0, + "step": 24770 + }, + { + "epoch": 2.3113744518055426, + "grad_norm": NaN, + "learning_rate": 0.00021151496172027936, + "loss": 0.0, + "step": 24771 + }, + { + "epoch": 2.31146776150042, + "grad_norm": NaN, + "learning_rate": 0.00021150806316298424, + "loss": 0.0, + "step": 24772 + }, + { + "epoch": 2.311561071195297, + "grad_norm": NaN, + "learning_rate": 0.00021150116444929227, + "loss": 0.0, + "step": 24773 + }, + { + "epoch": 2.3116543808901744, + "grad_norm": NaN, + "learning_rate": 0.0002114942655792211, + "loss": 0.0, + "step": 24774 + }, + { + "epoch": 2.311747690585052, + "grad_norm": NaN, + "learning_rate": 0.00021148736655278823, + "loss": 0.0, + "step": 24775 + }, + { + "epoch": 2.3118410002799292, + "grad_norm": NaN, + "learning_rate": 0.00021148046737001116, + "loss": 0.0, + "step": 24776 + }, + { + "epoch": 2.311934309974806, + "grad_norm": NaN, + "learning_rate": 0.00021147356803090755, + "loss": 0.0, + "step": 24777 + }, + { + "epoch": 2.3120276196696836, + "grad_norm": NaN, + "learning_rate": 0.0002114666685354948, + "loss": 0.0, + "step": 24778 + }, + { + "epoch": 2.312120929364561, + "grad_norm": NaN, + "learning_rate": 0.00021145976888379054, + "loss": 0.0, + "step": 24779 + }, + { + "epoch": 2.3122142390594385, + "grad_norm": NaN, + "learning_rate": 0.00021145286907581226, + "loss": 0.0, + "step": 24780 + }, + { + "epoch": 2.3123075487543154, + "grad_norm": NaN, + "learning_rate": 0.00021144596911157763, + "loss": 0.0, + "step": 24781 + }, + { + "epoch": 2.312400858449193, + "grad_norm": NaN, + "learning_rate": 0.00021143906899110404, + "loss": 0.0, + "step": 24782 + }, + { + "epoch": 2.3124941681440703, + "grad_norm": NaN, + "learning_rate": 0.00021143216871440907, + "loss": 0.0, + "step": 24783 + }, + { + "epoch": 2.3125874778389477, + "grad_norm": NaN, + "learning_rate": 0.00021142526828151035, + "loss": 0.0, + "step": 24784 + }, + { + "epoch": 2.3126807875338247, + "grad_norm": NaN, + "learning_rate": 0.0002114183676924253, + "loss": 0.0, + "step": 24785 + }, + { + "epoch": 2.312774097228702, + "grad_norm": NaN, + "learning_rate": 0.0002114114669471716, + "loss": 0.0, + "step": 24786 + }, + { + "epoch": 2.3128674069235795, + "grad_norm": NaN, + "learning_rate": 0.00021140456604576673, + "loss": 0.0, + "step": 24787 + }, + { + "epoch": 2.3129607166184565, + "grad_norm": NaN, + "learning_rate": 0.00021139766498822822, + "loss": 0.0, + "step": 24788 + }, + { + "epoch": 2.313054026313334, + "grad_norm": NaN, + "learning_rate": 0.0002113907637745736, + "loss": 0.0, + "step": 24789 + }, + { + "epoch": 2.3131473360082113, + "grad_norm": NaN, + "learning_rate": 0.00021138386240482048, + "loss": 0.0, + "step": 24790 + }, + { + "epoch": 2.3132406457030887, + "grad_norm": NaN, + "learning_rate": 0.00021137696087898642, + "loss": 0.0, + "step": 24791 + }, + { + "epoch": 2.3133339553979657, + "grad_norm": NaN, + "learning_rate": 0.00021137005919708889, + "loss": 0.0, + "step": 24792 + }, + { + "epoch": 2.313427265092843, + "grad_norm": NaN, + "learning_rate": 0.0002113631573591455, + "loss": 0.0, + "step": 24793 + }, + { + "epoch": 2.3135205747877206, + "grad_norm": NaN, + "learning_rate": 0.00021135625536517378, + "loss": 0.0, + "step": 24794 + }, + { + "epoch": 2.3136138844825975, + "grad_norm": NaN, + "learning_rate": 0.0002113493532151912, + "loss": 0.0, + "step": 24795 + }, + { + "epoch": 2.313707194177475, + "grad_norm": NaN, + "learning_rate": 0.00021134245090921545, + "loss": 0.0, + "step": 24796 + }, + { + "epoch": 2.3138005038723524, + "grad_norm": NaN, + "learning_rate": 0.00021133554844726405, + "loss": 0.0, + "step": 24797 + }, + { + "epoch": 2.31389381356723, + "grad_norm": NaN, + "learning_rate": 0.00021132864582935448, + "loss": 0.0, + "step": 24798 + }, + { + "epoch": 2.3139871232621068, + "grad_norm": NaN, + "learning_rate": 0.0002113217430555043, + "loss": 0.0, + "step": 24799 + }, + { + "epoch": 2.314080432956984, + "grad_norm": NaN, + "learning_rate": 0.00021131484012573118, + "loss": 0.0, + "step": 24800 + }, + { + "epoch": 2.3141737426518616, + "grad_norm": NaN, + "learning_rate": 0.0002113079370400525, + "loss": 0.0, + "step": 24801 + }, + { + "epoch": 2.314267052346739, + "grad_norm": NaN, + "learning_rate": 0.0002113010337984859, + "loss": 0.0, + "step": 24802 + }, + { + "epoch": 2.314360362041616, + "grad_norm": NaN, + "learning_rate": 0.000211294130401049, + "loss": 0.0, + "step": 24803 + }, + { + "epoch": 2.3144536717364934, + "grad_norm": NaN, + "learning_rate": 0.00021128722684775922, + "loss": 0.0, + "step": 24804 + }, + { + "epoch": 2.314546981431371, + "grad_norm": NaN, + "learning_rate": 0.0002112803231386342, + "loss": 0.0, + "step": 24805 + }, + { + "epoch": 2.3146402911262483, + "grad_norm": NaN, + "learning_rate": 0.00021127341927369147, + "loss": 0.0, + "step": 24806 + }, + { + "epoch": 2.3147336008211252, + "grad_norm": NaN, + "learning_rate": 0.00021126651525294856, + "loss": 0.0, + "step": 24807 + }, + { + "epoch": 2.3148269105160026, + "grad_norm": NaN, + "learning_rate": 0.0002112596110764231, + "loss": 0.0, + "step": 24808 + }, + { + "epoch": 2.31492022021088, + "grad_norm": NaN, + "learning_rate": 0.00021125270674413255, + "loss": 0.0, + "step": 24809 + }, + { + "epoch": 2.315013529905757, + "grad_norm": NaN, + "learning_rate": 0.00021124580225609455, + "loss": 0.0, + "step": 24810 + }, + { + "epoch": 2.3151068396006345, + "grad_norm": NaN, + "learning_rate": 0.00021123889761232657, + "loss": 0.0, + "step": 24811 + }, + { + "epoch": 2.315200149295512, + "grad_norm": NaN, + "learning_rate": 0.00021123199281284624, + "loss": 0.0, + "step": 24812 + }, + { + "epoch": 2.3152934589903893, + "grad_norm": NaN, + "learning_rate": 0.00021122508785767111, + "loss": 0.0, + "step": 24813 + }, + { + "epoch": 2.3153867686852663, + "grad_norm": NaN, + "learning_rate": 0.0002112181827468187, + "loss": 0.0, + "step": 24814 + }, + { + "epoch": 2.3154800783801437, + "grad_norm": NaN, + "learning_rate": 0.00021121127748030658, + "loss": 0.0, + "step": 24815 + }, + { + "epoch": 2.315573388075021, + "grad_norm": NaN, + "learning_rate": 0.00021120437205815235, + "loss": 0.0, + "step": 24816 + }, + { + "epoch": 2.315666697769898, + "grad_norm": NaN, + "learning_rate": 0.00021119746648037347, + "loss": 0.0, + "step": 24817 + }, + { + "epoch": 2.3157600074647755, + "grad_norm": NaN, + "learning_rate": 0.00021119056074698762, + "loss": 0.0, + "step": 24818 + }, + { + "epoch": 2.315853317159653, + "grad_norm": NaN, + "learning_rate": 0.00021118365485801229, + "loss": 0.0, + "step": 24819 + }, + { + "epoch": 2.3159466268545303, + "grad_norm": NaN, + "learning_rate": 0.00021117674881346497, + "loss": 0.0, + "step": 24820 + }, + { + "epoch": 2.3160399365494073, + "grad_norm": NaN, + "learning_rate": 0.00021116984261336337, + "loss": 0.0, + "step": 24821 + }, + { + "epoch": 2.3161332462442847, + "grad_norm": NaN, + "learning_rate": 0.000211162936257725, + "loss": 0.0, + "step": 24822 + }, + { + "epoch": 2.316226555939162, + "grad_norm": NaN, + "learning_rate": 0.00021115602974656735, + "loss": 0.0, + "step": 24823 + }, + { + "epoch": 2.316319865634039, + "grad_norm": NaN, + "learning_rate": 0.00021114912307990808, + "loss": 0.0, + "step": 24824 + }, + { + "epoch": 2.3164131753289166, + "grad_norm": NaN, + "learning_rate": 0.00021114221625776467, + "loss": 0.0, + "step": 24825 + }, + { + "epoch": 2.316506485023794, + "grad_norm": NaN, + "learning_rate": 0.0002111353092801547, + "loss": 0.0, + "step": 24826 + }, + { + "epoch": 2.3165997947186714, + "grad_norm": NaN, + "learning_rate": 0.00021112840214709578, + "loss": 0.0, + "step": 24827 + }, + { + "epoch": 2.316693104413549, + "grad_norm": NaN, + "learning_rate": 0.00021112149485860544, + "loss": 0.0, + "step": 24828 + }, + { + "epoch": 2.316786414108426, + "grad_norm": NaN, + "learning_rate": 0.00021111458741470123, + "loss": 0.0, + "step": 24829 + }, + { + "epoch": 2.316879723803303, + "grad_norm": NaN, + "learning_rate": 0.00021110767981540073, + "loss": 0.0, + "step": 24830 + }, + { + "epoch": 2.3169730334981806, + "grad_norm": NaN, + "learning_rate": 0.0002111007720607215, + "loss": 0.0, + "step": 24831 + }, + { + "epoch": 2.3170663431930576, + "grad_norm": NaN, + "learning_rate": 0.0002110938641506811, + "loss": 0.0, + "step": 24832 + }, + { + "epoch": 2.317159652887935, + "grad_norm": NaN, + "learning_rate": 0.00021108695608529715, + "loss": 0.0, + "step": 24833 + }, + { + "epoch": 2.3172529625828124, + "grad_norm": NaN, + "learning_rate": 0.00021108004786458712, + "loss": 0.0, + "step": 24834 + }, + { + "epoch": 2.31734627227769, + "grad_norm": NaN, + "learning_rate": 0.00021107313948856863, + "loss": 0.0, + "step": 24835 + }, + { + "epoch": 2.317439581972567, + "grad_norm": NaN, + "learning_rate": 0.00021106623095725925, + "loss": 0.0, + "step": 24836 + }, + { + "epoch": 2.3175328916674443, + "grad_norm": NaN, + "learning_rate": 0.00021105932227067652, + "loss": 0.0, + "step": 24837 + }, + { + "epoch": 2.3176262013623217, + "grad_norm": NaN, + "learning_rate": 0.00021105241342883802, + "loss": 0.0, + "step": 24838 + }, + { + "epoch": 2.3177195110571986, + "grad_norm": NaN, + "learning_rate": 0.00021104550443176132, + "loss": 0.0, + "step": 24839 + }, + { + "epoch": 2.317812820752076, + "grad_norm": NaN, + "learning_rate": 0.00021103859527946397, + "loss": 0.0, + "step": 24840 + }, + { + "epoch": 2.3179061304469535, + "grad_norm": NaN, + "learning_rate": 0.0002110316859719636, + "loss": 0.0, + "step": 24841 + }, + { + "epoch": 2.317999440141831, + "grad_norm": NaN, + "learning_rate": 0.0002110247765092777, + "loss": 0.0, + "step": 24842 + }, + { + "epoch": 2.318092749836708, + "grad_norm": NaN, + "learning_rate": 0.00021101786689142385, + "loss": 0.0, + "step": 24843 + }, + { + "epoch": 2.3181860595315853, + "grad_norm": NaN, + "learning_rate": 0.00021101095711841965, + "loss": 0.0, + "step": 24844 + }, + { + "epoch": 2.3182793692264627, + "grad_norm": NaN, + "learning_rate": 0.00021100404719028267, + "loss": 0.0, + "step": 24845 + }, + { + "epoch": 2.3183726789213397, + "grad_norm": NaN, + "learning_rate": 0.00021099713710703047, + "loss": 0.0, + "step": 24846 + }, + { + "epoch": 2.318465988616217, + "grad_norm": NaN, + "learning_rate": 0.00021099022686868065, + "loss": 0.0, + "step": 24847 + }, + { + "epoch": 2.3185592983110945, + "grad_norm": NaN, + "learning_rate": 0.0002109833164752507, + "loss": 0.0, + "step": 24848 + }, + { + "epoch": 2.318652608005972, + "grad_norm": NaN, + "learning_rate": 0.00021097640592675825, + "loss": 0.0, + "step": 24849 + }, + { + "epoch": 2.318745917700849, + "grad_norm": NaN, + "learning_rate": 0.00021096949522322086, + "loss": 0.0, + "step": 24850 + }, + { + "epoch": 2.3188392273957263, + "grad_norm": NaN, + "learning_rate": 0.00021096258436465611, + "loss": 0.0, + "step": 24851 + }, + { + "epoch": 2.3189325370906038, + "grad_norm": NaN, + "learning_rate": 0.00021095567335108155, + "loss": 0.0, + "step": 24852 + }, + { + "epoch": 2.319025846785481, + "grad_norm": NaN, + "learning_rate": 0.00021094876218251483, + "loss": 0.0, + "step": 24853 + }, + { + "epoch": 2.319119156480358, + "grad_norm": NaN, + "learning_rate": 0.00021094185085897342, + "loss": 0.0, + "step": 24854 + }, + { + "epoch": 2.3192124661752356, + "grad_norm": NaN, + "learning_rate": 0.0002109349393804749, + "loss": 0.0, + "step": 24855 + }, + { + "epoch": 2.319305775870113, + "grad_norm": NaN, + "learning_rate": 0.0002109280277470369, + "loss": 0.0, + "step": 24856 + }, + { + "epoch": 2.3193990855649904, + "grad_norm": NaN, + "learning_rate": 0.000210921115958677, + "loss": 0.0, + "step": 24857 + }, + { + "epoch": 2.3194923952598674, + "grad_norm": NaN, + "learning_rate": 0.00021091420401541275, + "loss": 0.0, + "step": 24858 + }, + { + "epoch": 2.319585704954745, + "grad_norm": NaN, + "learning_rate": 0.00021090729191726172, + "loss": 0.0, + "step": 24859 + }, + { + "epoch": 2.3196790146496222, + "grad_norm": NaN, + "learning_rate": 0.0002109003796642415, + "loss": 0.0, + "step": 24860 + }, + { + "epoch": 2.319772324344499, + "grad_norm": NaN, + "learning_rate": 0.0002108934672563696, + "loss": 0.0, + "step": 24861 + }, + { + "epoch": 2.3198656340393766, + "grad_norm": NaN, + "learning_rate": 0.00021088655469366372, + "loss": 0.0, + "step": 24862 + }, + { + "epoch": 2.319958943734254, + "grad_norm": NaN, + "learning_rate": 0.0002108796419761413, + "loss": 0.0, + "step": 24863 + }, + { + "epoch": 2.3200522534291315, + "grad_norm": NaN, + "learning_rate": 0.00021087272910382, + "loss": 0.0, + "step": 24864 + }, + { + "epoch": 2.3201455631240084, + "grad_norm": NaN, + "learning_rate": 0.0002108658160767174, + "loss": 0.0, + "step": 24865 + }, + { + "epoch": 2.320238872818886, + "grad_norm": NaN, + "learning_rate": 0.00021085890289485108, + "loss": 0.0, + "step": 24866 + }, + { + "epoch": 2.3203321825137633, + "grad_norm": NaN, + "learning_rate": 0.00021085198955823856, + "loss": 0.0, + "step": 24867 + }, + { + "epoch": 2.3204254922086403, + "grad_norm": NaN, + "learning_rate": 0.0002108450760668975, + "loss": 0.0, + "step": 24868 + }, + { + "epoch": 2.3205188019035177, + "grad_norm": NaN, + "learning_rate": 0.0002108381624208454, + "loss": 0.0, + "step": 24869 + }, + { + "epoch": 2.320612111598395, + "grad_norm": NaN, + "learning_rate": 0.00021083124862009985, + "loss": 0.0, + "step": 24870 + }, + { + "epoch": 2.3207054212932725, + "grad_norm": NaN, + "learning_rate": 0.0002108243346646785, + "loss": 0.0, + "step": 24871 + }, + { + "epoch": 2.3207987309881495, + "grad_norm": NaN, + "learning_rate": 0.00021081742055459887, + "loss": 0.0, + "step": 24872 + }, + { + "epoch": 2.320892040683027, + "grad_norm": NaN, + "learning_rate": 0.00021081050628987856, + "loss": 0.0, + "step": 24873 + }, + { + "epoch": 2.3209853503779043, + "grad_norm": NaN, + "learning_rate": 0.00021080359187053516, + "loss": 0.0, + "step": 24874 + }, + { + "epoch": 2.3210786600727817, + "grad_norm": NaN, + "learning_rate": 0.0002107966772965862, + "loss": 0.0, + "step": 24875 + }, + { + "epoch": 2.3211719697676587, + "grad_norm": NaN, + "learning_rate": 0.0002107897625680493, + "loss": 0.0, + "step": 24876 + }, + { + "epoch": 2.321265279462536, + "grad_norm": NaN, + "learning_rate": 0.00021078284768494206, + "loss": 0.0, + "step": 24877 + }, + { + "epoch": 2.3213585891574136, + "grad_norm": NaN, + "learning_rate": 0.00021077593264728206, + "loss": 0.0, + "step": 24878 + }, + { + "epoch": 2.321451898852291, + "grad_norm": NaN, + "learning_rate": 0.00021076901745508686, + "loss": 0.0, + "step": 24879 + }, + { + "epoch": 2.321545208547168, + "grad_norm": NaN, + "learning_rate": 0.00021076210210837403, + "loss": 0.0, + "step": 24880 + }, + { + "epoch": 2.3216385182420454, + "grad_norm": NaN, + "learning_rate": 0.00021075518660716117, + "loss": 0.0, + "step": 24881 + }, + { + "epoch": 2.321731827936923, + "grad_norm": NaN, + "learning_rate": 0.00021074827095146588, + "loss": 0.0, + "step": 24882 + }, + { + "epoch": 2.3218251376317998, + "grad_norm": NaN, + "learning_rate": 0.00021074135514130573, + "loss": 0.0, + "step": 24883 + }, + { + "epoch": 2.321918447326677, + "grad_norm": NaN, + "learning_rate": 0.0002107344391766983, + "loss": 0.0, + "step": 24884 + }, + { + "epoch": 2.3220117570215546, + "grad_norm": NaN, + "learning_rate": 0.0002107275230576612, + "loss": 0.0, + "step": 24885 + }, + { + "epoch": 2.322105066716432, + "grad_norm": NaN, + "learning_rate": 0.000210720606784212, + "loss": 0.0, + "step": 24886 + }, + { + "epoch": 2.322198376411309, + "grad_norm": NaN, + "learning_rate": 0.00021071369035636825, + "loss": 0.0, + "step": 24887 + }, + { + "epoch": 2.3222916861061864, + "grad_norm": NaN, + "learning_rate": 0.00021070677377414763, + "loss": 0.0, + "step": 24888 + }, + { + "epoch": 2.322384995801064, + "grad_norm": NaN, + "learning_rate": 0.0002106998570375676, + "loss": 0.0, + "step": 24889 + }, + { + "epoch": 2.322478305495941, + "grad_norm": NaN, + "learning_rate": 0.00021069294014664584, + "loss": 0.0, + "step": 24890 + }, + { + "epoch": 2.3225716151908182, + "grad_norm": NaN, + "learning_rate": 0.00021068602310139994, + "loss": 0.0, + "step": 24891 + }, + { + "epoch": 2.3226649248856956, + "grad_norm": NaN, + "learning_rate": 0.00021067910590184742, + "loss": 0.0, + "step": 24892 + }, + { + "epoch": 2.322758234580573, + "grad_norm": NaN, + "learning_rate": 0.00021067218854800592, + "loss": 0.0, + "step": 24893 + }, + { + "epoch": 2.32285154427545, + "grad_norm": NaN, + "learning_rate": 0.000210665271039893, + "loss": 0.0, + "step": 24894 + }, + { + "epoch": 2.3229448539703275, + "grad_norm": NaN, + "learning_rate": 0.00021065835337752628, + "loss": 0.0, + "step": 24895 + }, + { + "epoch": 2.323038163665205, + "grad_norm": NaN, + "learning_rate": 0.00021065143556092336, + "loss": 0.0, + "step": 24896 + }, + { + "epoch": 2.3231314733600823, + "grad_norm": NaN, + "learning_rate": 0.00021064451759010176, + "loss": 0.0, + "step": 24897 + }, + { + "epoch": 2.3232247830549593, + "grad_norm": NaN, + "learning_rate": 0.00021063759946507915, + "loss": 0.0, + "step": 24898 + }, + { + "epoch": 2.3233180927498367, + "grad_norm": NaN, + "learning_rate": 0.00021063068118587307, + "loss": 0.0, + "step": 24899 + }, + { + "epoch": 2.323411402444714, + "grad_norm": NaN, + "learning_rate": 0.00021062376275250114, + "loss": 0.0, + "step": 24900 + }, + { + "epoch": 2.3235047121395915, + "grad_norm": NaN, + "learning_rate": 0.00021061684416498095, + "loss": 0.0, + "step": 24901 + }, + { + "epoch": 2.3235980218344685, + "grad_norm": NaN, + "learning_rate": 0.00021060992542333006, + "loss": 0.0, + "step": 24902 + }, + { + "epoch": 2.323691331529346, + "grad_norm": NaN, + "learning_rate": 0.0002106030065275661, + "loss": 0.0, + "step": 24903 + }, + { + "epoch": 2.3237846412242233, + "grad_norm": NaN, + "learning_rate": 0.00021059608747770665, + "loss": 0.0, + "step": 24904 + }, + { + "epoch": 2.3238779509191003, + "grad_norm": NaN, + "learning_rate": 0.00021058916827376928, + "loss": 0.0, + "step": 24905 + }, + { + "epoch": 2.3239712606139777, + "grad_norm": NaN, + "learning_rate": 0.0002105822489157716, + "loss": 0.0, + "step": 24906 + }, + { + "epoch": 2.324064570308855, + "grad_norm": NaN, + "learning_rate": 0.00021057532940373124, + "loss": 0.0, + "step": 24907 + }, + { + "epoch": 2.3241578800037326, + "grad_norm": NaN, + "learning_rate": 0.00021056840973766573, + "loss": 0.0, + "step": 24908 + }, + { + "epoch": 2.3242511896986096, + "grad_norm": NaN, + "learning_rate": 0.00021056148991759272, + "loss": 0.0, + "step": 24909 + }, + { + "epoch": 2.324344499393487, + "grad_norm": NaN, + "learning_rate": 0.00021055456994352974, + "loss": 0.0, + "step": 24910 + }, + { + "epoch": 2.3244378090883644, + "grad_norm": NaN, + "learning_rate": 0.00021054764981549444, + "loss": 0.0, + "step": 24911 + }, + { + "epoch": 2.3245311187832414, + "grad_norm": NaN, + "learning_rate": 0.0002105407295335044, + "loss": 0.0, + "step": 24912 + }, + { + "epoch": 2.324624428478119, + "grad_norm": NaN, + "learning_rate": 0.00021053380909757727, + "loss": 0.0, + "step": 24913 + }, + { + "epoch": 2.324717738172996, + "grad_norm": NaN, + "learning_rate": 0.00021052688850773055, + "loss": 0.0, + "step": 24914 + }, + { + "epoch": 2.3248110478678736, + "grad_norm": NaN, + "learning_rate": 0.0002105199677639819, + "loss": 0.0, + "step": 24915 + }, + { + "epoch": 2.3249043575627506, + "grad_norm": NaN, + "learning_rate": 0.00021051304686634886, + "loss": 0.0, + "step": 24916 + }, + { + "epoch": 2.324997667257628, + "grad_norm": NaN, + "learning_rate": 0.00021050612581484908, + "loss": 0.0, + "step": 24917 + }, + { + "epoch": 2.3250909769525054, + "grad_norm": NaN, + "learning_rate": 0.0002104992046095002, + "loss": 0.0, + "step": 24918 + }, + { + "epoch": 2.3251842866473824, + "grad_norm": NaN, + "learning_rate": 0.00021049228325031971, + "loss": 0.0, + "step": 24919 + }, + { + "epoch": 2.32527759634226, + "grad_norm": NaN, + "learning_rate": 0.00021048536173732526, + "loss": 0.0, + "step": 24920 + }, + { + "epoch": 2.3253709060371373, + "grad_norm": NaN, + "learning_rate": 0.00021047844007053448, + "loss": 0.0, + "step": 24921 + }, + { + "epoch": 2.3254642157320147, + "grad_norm": NaN, + "learning_rate": 0.00021047151824996494, + "loss": 0.0, + "step": 24922 + }, + { + "epoch": 2.325557525426892, + "grad_norm": NaN, + "learning_rate": 0.00021046459627563422, + "loss": 0.0, + "step": 24923 + }, + { + "epoch": 2.325650835121769, + "grad_norm": NaN, + "learning_rate": 0.00021045767414755993, + "loss": 0.0, + "step": 24924 + }, + { + "epoch": 2.3257441448166465, + "grad_norm": NaN, + "learning_rate": 0.00021045075186575975, + "loss": 0.0, + "step": 24925 + }, + { + "epoch": 2.325837454511524, + "grad_norm": NaN, + "learning_rate": 0.00021044382943025114, + "loss": 0.0, + "step": 24926 + }, + { + "epoch": 2.325930764206401, + "grad_norm": NaN, + "learning_rate": 0.0002104369068410518, + "loss": 0.0, + "step": 24927 + }, + { + "epoch": 2.3260240739012783, + "grad_norm": NaN, + "learning_rate": 0.00021042998409817937, + "loss": 0.0, + "step": 24928 + }, + { + "epoch": 2.3261173835961557, + "grad_norm": NaN, + "learning_rate": 0.00021042306120165133, + "loss": 0.0, + "step": 24929 + }, + { + "epoch": 2.326210693291033, + "grad_norm": NaN, + "learning_rate": 0.00021041613815148534, + "loss": 0.0, + "step": 24930 + }, + { + "epoch": 2.32630400298591, + "grad_norm": NaN, + "learning_rate": 0.00021040921494769901, + "loss": 0.0, + "step": 24931 + }, + { + "epoch": 2.3263973126807875, + "grad_norm": NaN, + "learning_rate": 0.00021040229159030997, + "loss": 0.0, + "step": 24932 + }, + { + "epoch": 2.326490622375665, + "grad_norm": NaN, + "learning_rate": 0.00021039536807933578, + "loss": 0.0, + "step": 24933 + }, + { + "epoch": 2.326583932070542, + "grad_norm": NaN, + "learning_rate": 0.00021038844441479405, + "loss": 0.0, + "step": 24934 + }, + { + "epoch": 2.3266772417654193, + "grad_norm": NaN, + "learning_rate": 0.0002103815205967024, + "loss": 0.0, + "step": 24935 + }, + { + "epoch": 2.3267705514602968, + "grad_norm": NaN, + "learning_rate": 0.00021037459662507843, + "loss": 0.0, + "step": 24936 + }, + { + "epoch": 2.326863861155174, + "grad_norm": NaN, + "learning_rate": 0.00021036767249993976, + "loss": 0.0, + "step": 24937 + }, + { + "epoch": 2.326957170850051, + "grad_norm": NaN, + "learning_rate": 0.00021036074822130394, + "loss": 0.0, + "step": 24938 + }, + { + "epoch": 2.3270504805449286, + "grad_norm": NaN, + "learning_rate": 0.00021035382378918865, + "loss": 0.0, + "step": 24939 + }, + { + "epoch": 2.327143790239806, + "grad_norm": NaN, + "learning_rate": 0.00021034689920361148, + "loss": 0.0, + "step": 24940 + }, + { + "epoch": 2.327237099934683, + "grad_norm": NaN, + "learning_rate": 0.00021033997446458997, + "loss": 0.0, + "step": 24941 + }, + { + "epoch": 2.3273304096295604, + "grad_norm": NaN, + "learning_rate": 0.00021033304957214182, + "loss": 0.0, + "step": 24942 + }, + { + "epoch": 2.327423719324438, + "grad_norm": NaN, + "learning_rate": 0.0002103261245262846, + "loss": 0.0, + "step": 24943 + }, + { + "epoch": 2.3275170290193152, + "grad_norm": NaN, + "learning_rate": 0.00021031919932703588, + "loss": 0.0, + "step": 24944 + }, + { + "epoch": 2.3276103387141927, + "grad_norm": NaN, + "learning_rate": 0.00021031227397441335, + "loss": 0.0, + "step": 24945 + }, + { + "epoch": 2.3277036484090696, + "grad_norm": NaN, + "learning_rate": 0.00021030534846843454, + "loss": 0.0, + "step": 24946 + }, + { + "epoch": 2.327796958103947, + "grad_norm": NaN, + "learning_rate": 0.00021029842280911708, + "loss": 0.0, + "step": 24947 + }, + { + "epoch": 2.3278902677988245, + "grad_norm": NaN, + "learning_rate": 0.00021029149699647862, + "loss": 0.0, + "step": 24948 + }, + { + "epoch": 2.3279835774937014, + "grad_norm": NaN, + "learning_rate": 0.00021028457103053672, + "loss": 0.0, + "step": 24949 + }, + { + "epoch": 2.328076887188579, + "grad_norm": NaN, + "learning_rate": 0.00021027764491130904, + "loss": 0.0, + "step": 24950 + }, + { + "epoch": 2.3281701968834563, + "grad_norm": NaN, + "learning_rate": 0.00021027071863881318, + "loss": 0.0, + "step": 24951 + }, + { + "epoch": 2.3282635065783337, + "grad_norm": NaN, + "learning_rate": 0.00021026379221306666, + "loss": 0.0, + "step": 24952 + }, + { + "epoch": 2.3283568162732107, + "grad_norm": NaN, + "learning_rate": 0.00021025686563408725, + "loss": 0.0, + "step": 24953 + }, + { + "epoch": 2.328450125968088, + "grad_norm": NaN, + "learning_rate": 0.00021024993890189243, + "loss": 0.0, + "step": 24954 + }, + { + "epoch": 2.3285434356629655, + "grad_norm": NaN, + "learning_rate": 0.0002102430120164999, + "loss": 0.0, + "step": 24955 + }, + { + "epoch": 2.3286367453578425, + "grad_norm": NaN, + "learning_rate": 0.0002102360849779272, + "loss": 0.0, + "step": 24956 + }, + { + "epoch": 2.32873005505272, + "grad_norm": NaN, + "learning_rate": 0.00021022915778619197, + "loss": 0.0, + "step": 24957 + }, + { + "epoch": 2.3288233647475973, + "grad_norm": NaN, + "learning_rate": 0.0002102222304413119, + "loss": 0.0, + "step": 24958 + }, + { + "epoch": 2.3289166744424747, + "grad_norm": NaN, + "learning_rate": 0.00021021530294330448, + "loss": 0.0, + "step": 24959 + }, + { + "epoch": 2.3290099841373517, + "grad_norm": NaN, + "learning_rate": 0.00021020837529218739, + "loss": 0.0, + "step": 24960 + }, + { + "epoch": 2.329103293832229, + "grad_norm": NaN, + "learning_rate": 0.00021020144748797826, + "loss": 0.0, + "step": 24961 + }, + { + "epoch": 2.3291966035271066, + "grad_norm": NaN, + "learning_rate": 0.00021019451953069464, + "loss": 0.0, + "step": 24962 + }, + { + "epoch": 2.3292899132219835, + "grad_norm": NaN, + "learning_rate": 0.00021018759142035423, + "loss": 0.0, + "step": 24963 + }, + { + "epoch": 2.329383222916861, + "grad_norm": NaN, + "learning_rate": 0.0002101806631569746, + "loss": 0.0, + "step": 24964 + }, + { + "epoch": 2.3294765326117384, + "grad_norm": NaN, + "learning_rate": 0.00021017373474057336, + "loss": 0.0, + "step": 24965 + }, + { + "epoch": 2.329569842306616, + "grad_norm": NaN, + "learning_rate": 0.00021016680617116812, + "loss": 0.0, + "step": 24966 + }, + { + "epoch": 2.3296631520014928, + "grad_norm": NaN, + "learning_rate": 0.00021015987744877652, + "loss": 0.0, + "step": 24967 + }, + { + "epoch": 2.32975646169637, + "grad_norm": NaN, + "learning_rate": 0.00021015294857341618, + "loss": 0.0, + "step": 24968 + }, + { + "epoch": 2.3298497713912476, + "grad_norm": NaN, + "learning_rate": 0.00021014601954510471, + "loss": 0.0, + "step": 24969 + }, + { + "epoch": 2.329943081086125, + "grad_norm": NaN, + "learning_rate": 0.00021013909036385973, + "loss": 0.0, + "step": 24970 + }, + { + "epoch": 2.330036390781002, + "grad_norm": NaN, + "learning_rate": 0.00021013216102969888, + "loss": 0.0, + "step": 24971 + }, + { + "epoch": 2.3301297004758794, + "grad_norm": NaN, + "learning_rate": 0.00021012523154263971, + "loss": 0.0, + "step": 24972 + }, + { + "epoch": 2.330223010170757, + "grad_norm": NaN, + "learning_rate": 0.0002101183019026999, + "loss": 0.0, + "step": 24973 + }, + { + "epoch": 2.3303163198656343, + "grad_norm": NaN, + "learning_rate": 0.0002101113721098971, + "loss": 0.0, + "step": 24974 + }, + { + "epoch": 2.3304096295605112, + "grad_norm": NaN, + "learning_rate": 0.00021010444216424887, + "loss": 0.0, + "step": 24975 + }, + { + "epoch": 2.3305029392553887, + "grad_norm": NaN, + "learning_rate": 0.00021009751206577283, + "loss": 0.0, + "step": 24976 + }, + { + "epoch": 2.330596248950266, + "grad_norm": NaN, + "learning_rate": 0.0002100905818144866, + "loss": 0.0, + "step": 24977 + }, + { + "epoch": 2.330689558645143, + "grad_norm": NaN, + "learning_rate": 0.00021008365141040786, + "loss": 0.0, + "step": 24978 + }, + { + "epoch": 2.3307828683400205, + "grad_norm": NaN, + "learning_rate": 0.0002100767208535542, + "loss": 0.0, + "step": 24979 + }, + { + "epoch": 2.330876178034898, + "grad_norm": NaN, + "learning_rate": 0.00021006979014394315, + "loss": 0.0, + "step": 24980 + }, + { + "epoch": 2.3309694877297753, + "grad_norm": NaN, + "learning_rate": 0.00021006285928159253, + "loss": 0.0, + "step": 24981 + }, + { + "epoch": 2.3310627974246523, + "grad_norm": NaN, + "learning_rate": 0.00021005592826651978, + "loss": 0.0, + "step": 24982 + }, + { + "epoch": 2.3311561071195297, + "grad_norm": NaN, + "learning_rate": 0.00021004899709874258, + "loss": 0.0, + "step": 24983 + }, + { + "epoch": 2.331249416814407, + "grad_norm": NaN, + "learning_rate": 0.00021004206577827862, + "loss": 0.0, + "step": 24984 + }, + { + "epoch": 2.331342726509284, + "grad_norm": NaN, + "learning_rate": 0.00021003513430514547, + "loss": 0.0, + "step": 24985 + }, + { + "epoch": 2.3314360362041615, + "grad_norm": NaN, + "learning_rate": 0.00021002820267936072, + "loss": 0.0, + "step": 24986 + }, + { + "epoch": 2.331529345899039, + "grad_norm": NaN, + "learning_rate": 0.00021002127090094205, + "loss": 0.0, + "step": 24987 + }, + { + "epoch": 2.3316226555939163, + "grad_norm": NaN, + "learning_rate": 0.00021001433896990706, + "loss": 0.0, + "step": 24988 + }, + { + "epoch": 2.3317159652887933, + "grad_norm": NaN, + "learning_rate": 0.0002100074068862734, + "loss": 0.0, + "step": 24989 + }, + { + "epoch": 2.3318092749836707, + "grad_norm": NaN, + "learning_rate": 0.0002100004746500587, + "loss": 0.0, + "step": 24990 + }, + { + "epoch": 2.331902584678548, + "grad_norm": NaN, + "learning_rate": 0.00020999354226128054, + "loss": 0.0, + "step": 24991 + }, + { + "epoch": 2.3319958943734256, + "grad_norm": NaN, + "learning_rate": 0.00020998660971995652, + "loss": 0.0, + "step": 24992 + }, + { + "epoch": 2.3320892040683026, + "grad_norm": NaN, + "learning_rate": 0.0002099796770261044, + "loss": 0.0, + "step": 24993 + }, + { + "epoch": 2.33218251376318, + "grad_norm": NaN, + "learning_rate": 0.00020997274417974176, + "loss": 0.0, + "step": 24994 + }, + { + "epoch": 2.3322758234580574, + "grad_norm": NaN, + "learning_rate": 0.0002099658111808861, + "loss": 0.0, + "step": 24995 + }, + { + "epoch": 2.332369133152935, + "grad_norm": NaN, + "learning_rate": 0.0002099588780295552, + "loss": 0.0, + "step": 24996 + }, + { + "epoch": 2.332462442847812, + "grad_norm": NaN, + "learning_rate": 0.00020995194472576663, + "loss": 0.0, + "step": 24997 + }, + { + "epoch": 2.332555752542689, + "grad_norm": NaN, + "learning_rate": 0.00020994501126953802, + "loss": 0.0, + "step": 24998 + }, + { + "epoch": 2.3326490622375666, + "grad_norm": NaN, + "learning_rate": 0.00020993807766088698, + "loss": 0.0, + "step": 24999 + }, + { + "epoch": 2.3327423719324436, + "grad_norm": NaN, + "learning_rate": 0.00020993114389983116, + "loss": 0.0, + "step": 25000 + }, + { + "epoch": 2.332835681627321, + "grad_norm": NaN, + "learning_rate": 0.00020992420998638825, + "loss": 0.0, + "step": 25001 + }, + { + "epoch": 2.3329289913221984, + "grad_norm": NaN, + "learning_rate": 0.00020991727592057578, + "loss": 0.0, + "step": 25002 + }, + { + "epoch": 2.333022301017076, + "grad_norm": NaN, + "learning_rate": 0.0002099103417024114, + "loss": 0.0, + "step": 25003 + }, + { + "epoch": 2.333115610711953, + "grad_norm": NaN, + "learning_rate": 0.00020990340733191283, + "loss": 0.0, + "step": 25004 + }, + { + "epoch": 2.3332089204068303, + "grad_norm": NaN, + "learning_rate": 0.00020989647280909762, + "loss": 0.0, + "step": 25005 + }, + { + "epoch": 2.3333022301017077, + "grad_norm": NaN, + "learning_rate": 0.00020988953813398341, + "loss": 0.0, + "step": 25006 + }, + { + "epoch": 2.3333955397965847, + "grad_norm": NaN, + "learning_rate": 0.00020988260330658784, + "loss": 0.0, + "step": 25007 + }, + { + "epoch": 2.333488849491462, + "grad_norm": NaN, + "learning_rate": 0.00020987566832692857, + "loss": 0.0, + "step": 25008 + }, + { + "epoch": 2.3335821591863395, + "grad_norm": NaN, + "learning_rate": 0.00020986873319502319, + "loss": 0.0, + "step": 25009 + }, + { + "epoch": 2.333675468881217, + "grad_norm": NaN, + "learning_rate": 0.0002098617979108894, + "loss": 0.0, + "step": 25010 + }, + { + "epoch": 2.333768778576094, + "grad_norm": NaN, + "learning_rate": 0.00020985486247454473, + "loss": 0.0, + "step": 25011 + }, + { + "epoch": 2.3338620882709713, + "grad_norm": NaN, + "learning_rate": 0.00020984792688600692, + "loss": 0.0, + "step": 25012 + }, + { + "epoch": 2.3339553979658487, + "grad_norm": NaN, + "learning_rate": 0.00020984099114529353, + "loss": 0.0, + "step": 25013 + }, + { + "epoch": 2.334048707660726, + "grad_norm": NaN, + "learning_rate": 0.0002098340552524222, + "loss": 0.0, + "step": 25014 + }, + { + "epoch": 2.334142017355603, + "grad_norm": NaN, + "learning_rate": 0.00020982711920741068, + "loss": 0.0, + "step": 25015 + }, + { + "epoch": 2.3342353270504805, + "grad_norm": NaN, + "learning_rate": 0.00020982018301027644, + "loss": 0.0, + "step": 25016 + }, + { + "epoch": 2.334328636745358, + "grad_norm": NaN, + "learning_rate": 0.00020981324666103724, + "loss": 0.0, + "step": 25017 + }, + { + "epoch": 2.3344219464402354, + "grad_norm": NaN, + "learning_rate": 0.00020980631015971065, + "loss": 0.0, + "step": 25018 + }, + { + "epoch": 2.3345152561351123, + "grad_norm": NaN, + "learning_rate": 0.00020979937350631433, + "loss": 0.0, + "step": 25019 + }, + { + "epoch": 2.3346085658299898, + "grad_norm": NaN, + "learning_rate": 0.00020979243670086594, + "loss": 0.0, + "step": 25020 + }, + { + "epoch": 2.334701875524867, + "grad_norm": NaN, + "learning_rate": 0.00020978549974338308, + "loss": 0.0, + "step": 25021 + }, + { + "epoch": 2.334795185219744, + "grad_norm": NaN, + "learning_rate": 0.0002097785626338834, + "loss": 0.0, + "step": 25022 + }, + { + "epoch": 2.3348884949146216, + "grad_norm": NaN, + "learning_rate": 0.00020977162537238455, + "loss": 0.0, + "step": 25023 + }, + { + "epoch": 2.334981804609499, + "grad_norm": NaN, + "learning_rate": 0.00020976468795890412, + "loss": 0.0, + "step": 25024 + }, + { + "epoch": 2.3350751143043764, + "grad_norm": NaN, + "learning_rate": 0.00020975775039345986, + "loss": 0.0, + "step": 25025 + }, + { + "epoch": 2.3351684239992534, + "grad_norm": NaN, + "learning_rate": 0.0002097508126760693, + "loss": 0.0, + "step": 25026 + }, + { + "epoch": 2.335261733694131, + "grad_norm": NaN, + "learning_rate": 0.00020974387480675013, + "loss": 0.0, + "step": 25027 + }, + { + "epoch": 2.3353550433890082, + "grad_norm": NaN, + "learning_rate": 0.00020973693678552, + "loss": 0.0, + "step": 25028 + }, + { + "epoch": 2.335448353083885, + "grad_norm": NaN, + "learning_rate": 0.0002097299986123965, + "loss": 0.0, + "step": 25029 + }, + { + "epoch": 2.3355416627787626, + "grad_norm": NaN, + "learning_rate": 0.0002097230602873973, + "loss": 0.0, + "step": 25030 + }, + { + "epoch": 2.33563497247364, + "grad_norm": NaN, + "learning_rate": 0.0002097161218105401, + "loss": 0.0, + "step": 25031 + }, + { + "epoch": 2.3357282821685175, + "grad_norm": NaN, + "learning_rate": 0.00020970918318184248, + "loss": 0.0, + "step": 25032 + }, + { + "epoch": 2.3358215918633944, + "grad_norm": NaN, + "learning_rate": 0.00020970224440132205, + "loss": 0.0, + "step": 25033 + }, + { + "epoch": 2.335914901558272, + "grad_norm": NaN, + "learning_rate": 0.00020969530546899656, + "loss": 0.0, + "step": 25034 + }, + { + "epoch": 2.3360082112531493, + "grad_norm": NaN, + "learning_rate": 0.00020968836638488356, + "loss": 0.0, + "step": 25035 + }, + { + "epoch": 2.3361015209480263, + "grad_norm": NaN, + "learning_rate": 0.00020968142714900071, + "loss": 0.0, + "step": 25036 + }, + { + "epoch": 2.3361948306429037, + "grad_norm": NaN, + "learning_rate": 0.0002096744877613657, + "loss": 0.0, + "step": 25037 + }, + { + "epoch": 2.336288140337781, + "grad_norm": NaN, + "learning_rate": 0.00020966754822199612, + "loss": 0.0, + "step": 25038 + }, + { + "epoch": 2.3363814500326585, + "grad_norm": NaN, + "learning_rate": 0.00020966060853090966, + "loss": 0.0, + "step": 25039 + }, + { + "epoch": 2.336474759727536, + "grad_norm": NaN, + "learning_rate": 0.00020965366868812394, + "loss": 0.0, + "step": 25040 + }, + { + "epoch": 2.336568069422413, + "grad_norm": NaN, + "learning_rate": 0.0002096467286936566, + "loss": 0.0, + "step": 25041 + }, + { + "epoch": 2.3366613791172903, + "grad_norm": NaN, + "learning_rate": 0.0002096397885475253, + "loss": 0.0, + "step": 25042 + }, + { + "epoch": 2.3367546888121677, + "grad_norm": NaN, + "learning_rate": 0.0002096328482497477, + "loss": 0.0, + "step": 25043 + }, + { + "epoch": 2.3368479985070447, + "grad_norm": NaN, + "learning_rate": 0.0002096259078003414, + "loss": 0.0, + "step": 25044 + }, + { + "epoch": 2.336941308201922, + "grad_norm": NaN, + "learning_rate": 0.00020961896719932411, + "loss": 0.0, + "step": 25045 + }, + { + "epoch": 2.3370346178967996, + "grad_norm": NaN, + "learning_rate": 0.00020961202644671344, + "loss": 0.0, + "step": 25046 + }, + { + "epoch": 2.337127927591677, + "grad_norm": NaN, + "learning_rate": 0.00020960508554252705, + "loss": 0.0, + "step": 25047 + }, + { + "epoch": 2.337221237286554, + "grad_norm": NaN, + "learning_rate": 0.00020959814448678256, + "loss": 0.0, + "step": 25048 + }, + { + "epoch": 2.3373145469814314, + "grad_norm": NaN, + "learning_rate": 0.00020959120327949767, + "loss": 0.0, + "step": 25049 + }, + { + "epoch": 2.337407856676309, + "grad_norm": NaN, + "learning_rate": 0.00020958426192069, + "loss": 0.0, + "step": 25050 + }, + { + "epoch": 2.3375011663711858, + "grad_norm": NaN, + "learning_rate": 0.0002095773204103772, + "loss": 0.0, + "step": 25051 + }, + { + "epoch": 2.337594476066063, + "grad_norm": NaN, + "learning_rate": 0.0002095703787485769, + "loss": 0.0, + "step": 25052 + }, + { + "epoch": 2.3376877857609406, + "grad_norm": NaN, + "learning_rate": 0.00020956343693530677, + "loss": 0.0, + "step": 25053 + }, + { + "epoch": 2.337781095455818, + "grad_norm": NaN, + "learning_rate": 0.00020955649497058452, + "loss": 0.0, + "step": 25054 + }, + { + "epoch": 2.337874405150695, + "grad_norm": NaN, + "learning_rate": 0.00020954955285442764, + "loss": 0.0, + "step": 25055 + }, + { + "epoch": 2.3379677148455724, + "grad_norm": NaN, + "learning_rate": 0.000209542610586854, + "loss": 0.0, + "step": 25056 + }, + { + "epoch": 2.33806102454045, + "grad_norm": NaN, + "learning_rate": 0.0002095356681678811, + "loss": 0.0, + "step": 25057 + }, + { + "epoch": 2.338154334235327, + "grad_norm": NaN, + "learning_rate": 0.0002095287255975266, + "loss": 0.0, + "step": 25058 + }, + { + "epoch": 2.3382476439302042, + "grad_norm": NaN, + "learning_rate": 0.00020952178287580824, + "loss": 0.0, + "step": 25059 + }, + { + "epoch": 2.3383409536250817, + "grad_norm": NaN, + "learning_rate": 0.00020951484000274358, + "loss": 0.0, + "step": 25060 + }, + { + "epoch": 2.338434263319959, + "grad_norm": NaN, + "learning_rate": 0.00020950789697835034, + "loss": 0.0, + "step": 25061 + }, + { + "epoch": 2.338527573014836, + "grad_norm": NaN, + "learning_rate": 0.0002095009538026461, + "loss": 0.0, + "step": 25062 + }, + { + "epoch": 2.3386208827097135, + "grad_norm": NaN, + "learning_rate": 0.0002094940104756486, + "loss": 0.0, + "step": 25063 + }, + { + "epoch": 2.338714192404591, + "grad_norm": NaN, + "learning_rate": 0.00020948706699737543, + "loss": 0.0, + "step": 25064 + }, + { + "epoch": 2.3388075020994683, + "grad_norm": NaN, + "learning_rate": 0.00020948012336784427, + "loss": 0.0, + "step": 25065 + }, + { + "epoch": 2.3389008117943453, + "grad_norm": NaN, + "learning_rate": 0.00020947317958707278, + "loss": 0.0, + "step": 25066 + }, + { + "epoch": 2.3389941214892227, + "grad_norm": NaN, + "learning_rate": 0.00020946623565507862, + "loss": 0.0, + "step": 25067 + }, + { + "epoch": 2.3390874311841, + "grad_norm": NaN, + "learning_rate": 0.0002094592915718794, + "loss": 0.0, + "step": 25068 + }, + { + "epoch": 2.3391807408789775, + "grad_norm": NaN, + "learning_rate": 0.0002094523473374929, + "loss": 0.0, + "step": 25069 + }, + { + "epoch": 2.3392740505738545, + "grad_norm": NaN, + "learning_rate": 0.00020944540295193663, + "loss": 0.0, + "step": 25070 + }, + { + "epoch": 2.339367360268732, + "grad_norm": NaN, + "learning_rate": 0.0002094384584152283, + "loss": 0.0, + "step": 25071 + }, + { + "epoch": 2.3394606699636094, + "grad_norm": NaN, + "learning_rate": 0.0002094315137273856, + "loss": 0.0, + "step": 25072 + }, + { + "epoch": 2.3395539796584863, + "grad_norm": NaN, + "learning_rate": 0.00020942456888842616, + "loss": 0.0, + "step": 25073 + }, + { + "epoch": 2.3396472893533637, + "grad_norm": NaN, + "learning_rate": 0.00020941762389836764, + "loss": 0.0, + "step": 25074 + }, + { + "epoch": 2.339740599048241, + "grad_norm": NaN, + "learning_rate": 0.0002094106787572277, + "loss": 0.0, + "step": 25075 + }, + { + "epoch": 2.3398339087431186, + "grad_norm": NaN, + "learning_rate": 0.00020940373346502403, + "loss": 0.0, + "step": 25076 + }, + { + "epoch": 2.3399272184379956, + "grad_norm": NaN, + "learning_rate": 0.00020939678802177423, + "loss": 0.0, + "step": 25077 + }, + { + "epoch": 2.340020528132873, + "grad_norm": NaN, + "learning_rate": 0.00020938984242749604, + "loss": 0.0, + "step": 25078 + }, + { + "epoch": 2.3401138378277504, + "grad_norm": NaN, + "learning_rate": 0.00020938289668220704, + "loss": 0.0, + "step": 25079 + }, + { + "epoch": 2.3402071475226274, + "grad_norm": NaN, + "learning_rate": 0.0002093759507859249, + "loss": 0.0, + "step": 25080 + }, + { + "epoch": 2.340300457217505, + "grad_norm": NaN, + "learning_rate": 0.00020936900473866735, + "loss": 0.0, + "step": 25081 + }, + { + "epoch": 2.340393766912382, + "grad_norm": NaN, + "learning_rate": 0.000209362058540452, + "loss": 0.0, + "step": 25082 + }, + { + "epoch": 2.3404870766072596, + "grad_norm": NaN, + "learning_rate": 0.0002093551121912965, + "loss": 0.0, + "step": 25083 + }, + { + "epoch": 2.3405803863021366, + "grad_norm": NaN, + "learning_rate": 0.0002093481656912185, + "loss": 0.0, + "step": 25084 + }, + { + "epoch": 2.340673695997014, + "grad_norm": NaN, + "learning_rate": 0.00020934121904023575, + "loss": 0.0, + "step": 25085 + }, + { + "epoch": 2.3407670056918914, + "grad_norm": NaN, + "learning_rate": 0.00020933427223836584, + "loss": 0.0, + "step": 25086 + }, + { + "epoch": 2.340860315386769, + "grad_norm": NaN, + "learning_rate": 0.00020932732528562646, + "loss": 0.0, + "step": 25087 + }, + { + "epoch": 2.340953625081646, + "grad_norm": NaN, + "learning_rate": 0.00020932037818203526, + "loss": 0.0, + "step": 25088 + }, + { + "epoch": 2.3410469347765233, + "grad_norm": NaN, + "learning_rate": 0.00020931343092760993, + "loss": 0.0, + "step": 25089 + }, + { + "epoch": 2.3411402444714007, + "grad_norm": NaN, + "learning_rate": 0.00020930648352236806, + "loss": 0.0, + "step": 25090 + }, + { + "epoch": 2.341233554166278, + "grad_norm": NaN, + "learning_rate": 0.00020929953596632742, + "loss": 0.0, + "step": 25091 + }, + { + "epoch": 2.341326863861155, + "grad_norm": NaN, + "learning_rate": 0.00020929258825950564, + "loss": 0.0, + "step": 25092 + }, + { + "epoch": 2.3414201735560325, + "grad_norm": NaN, + "learning_rate": 0.0002092856404019203, + "loss": 0.0, + "step": 25093 + }, + { + "epoch": 2.34151348325091, + "grad_norm": NaN, + "learning_rate": 0.00020927869239358924, + "loss": 0.0, + "step": 25094 + }, + { + "epoch": 2.341606792945787, + "grad_norm": NaN, + "learning_rate": 0.00020927174423452993, + "loss": 0.0, + "step": 25095 + }, + { + "epoch": 2.3417001026406643, + "grad_norm": NaN, + "learning_rate": 0.00020926479592476015, + "loss": 0.0, + "step": 25096 + }, + { + "epoch": 2.3417934123355417, + "grad_norm": NaN, + "learning_rate": 0.0002092578474642976, + "loss": 0.0, + "step": 25097 + }, + { + "epoch": 2.341886722030419, + "grad_norm": NaN, + "learning_rate": 0.0002092508988531599, + "loss": 0.0, + "step": 25098 + }, + { + "epoch": 2.341980031725296, + "grad_norm": NaN, + "learning_rate": 0.00020924395009136462, + "loss": 0.0, + "step": 25099 + }, + { + "epoch": 2.3420733414201735, + "grad_norm": NaN, + "learning_rate": 0.00020923700117892962, + "loss": 0.0, + "step": 25100 + }, + { + "epoch": 2.342166651115051, + "grad_norm": NaN, + "learning_rate": 0.00020923005211587244, + "loss": 0.0, + "step": 25101 + }, + { + "epoch": 2.342259960809928, + "grad_norm": NaN, + "learning_rate": 0.00020922310290221076, + "loss": 0.0, + "step": 25102 + }, + { + "epoch": 2.3423532705048054, + "grad_norm": NaN, + "learning_rate": 0.00020921615353796226, + "loss": 0.0, + "step": 25103 + }, + { + "epoch": 2.3424465801996828, + "grad_norm": NaN, + "learning_rate": 0.0002092092040231447, + "loss": 0.0, + "step": 25104 + }, + { + "epoch": 2.34253988989456, + "grad_norm": NaN, + "learning_rate": 0.0002092022543577756, + "loss": 0.0, + "step": 25105 + }, + { + "epoch": 2.342633199589437, + "grad_norm": NaN, + "learning_rate": 0.00020919530454187275, + "loss": 0.0, + "step": 25106 + }, + { + "epoch": 2.3427265092843146, + "grad_norm": NaN, + "learning_rate": 0.00020918835457545379, + "loss": 0.0, + "step": 25107 + }, + { + "epoch": 2.342819818979192, + "grad_norm": NaN, + "learning_rate": 0.00020918140445853634, + "loss": 0.0, + "step": 25108 + }, + { + "epoch": 2.3429131286740694, + "grad_norm": NaN, + "learning_rate": 0.00020917445419113808, + "loss": 0.0, + "step": 25109 + }, + { + "epoch": 2.3430064383689464, + "grad_norm": NaN, + "learning_rate": 0.00020916750377327677, + "loss": 0.0, + "step": 25110 + }, + { + "epoch": 2.343099748063824, + "grad_norm": NaN, + "learning_rate": 0.00020916055320497002, + "loss": 0.0, + "step": 25111 + }, + { + "epoch": 2.3431930577587012, + "grad_norm": NaN, + "learning_rate": 0.00020915360248623544, + "loss": 0.0, + "step": 25112 + }, + { + "epoch": 2.3432863674535787, + "grad_norm": NaN, + "learning_rate": 0.00020914665161709086, + "loss": 0.0, + "step": 25113 + }, + { + "epoch": 2.3433796771484556, + "grad_norm": NaN, + "learning_rate": 0.00020913970059755383, + "loss": 0.0, + "step": 25114 + }, + { + "epoch": 2.343472986843333, + "grad_norm": NaN, + "learning_rate": 0.00020913274942764202, + "loss": 0.0, + "step": 25115 + }, + { + "epoch": 2.3435662965382105, + "grad_norm": NaN, + "learning_rate": 0.00020912579810737321, + "loss": 0.0, + "step": 25116 + }, + { + "epoch": 2.3436596062330874, + "grad_norm": NaN, + "learning_rate": 0.00020911884663676498, + "loss": 0.0, + "step": 25117 + }, + { + "epoch": 2.343752915927965, + "grad_norm": NaN, + "learning_rate": 0.00020911189501583502, + "loss": 0.0, + "step": 25118 + }, + { + "epoch": 2.3438462256228423, + "grad_norm": NaN, + "learning_rate": 0.00020910494324460105, + "loss": 0.0, + "step": 25119 + }, + { + "epoch": 2.3439395353177197, + "grad_norm": NaN, + "learning_rate": 0.00020909799132308075, + "loss": 0.0, + "step": 25120 + }, + { + "epoch": 2.3440328450125967, + "grad_norm": NaN, + "learning_rate": 0.00020909103925129165, + "loss": 0.0, + "step": 25121 + }, + { + "epoch": 2.344126154707474, + "grad_norm": NaN, + "learning_rate": 0.00020908408702925162, + "loss": 0.0, + "step": 25122 + }, + { + "epoch": 2.3442194644023515, + "grad_norm": NaN, + "learning_rate": 0.00020907713465697828, + "loss": 0.0, + "step": 25123 + }, + { + "epoch": 2.3443127740972285, + "grad_norm": NaN, + "learning_rate": 0.00020907018213448921, + "loss": 0.0, + "step": 25124 + }, + { + "epoch": 2.344406083792106, + "grad_norm": NaN, + "learning_rate": 0.0002090632294618022, + "loss": 0.0, + "step": 25125 + }, + { + "epoch": 2.3444993934869833, + "grad_norm": NaN, + "learning_rate": 0.00020905627663893493, + "loss": 0.0, + "step": 25126 + }, + { + "epoch": 2.3445927031818607, + "grad_norm": NaN, + "learning_rate": 0.000209049323665905, + "loss": 0.0, + "step": 25127 + }, + { + "epoch": 2.3446860128767377, + "grad_norm": NaN, + "learning_rate": 0.00020904237054273015, + "loss": 0.0, + "step": 25128 + }, + { + "epoch": 2.344779322571615, + "grad_norm": NaN, + "learning_rate": 0.0002090354172694281, + "loss": 0.0, + "step": 25129 + }, + { + "epoch": 2.3448726322664926, + "grad_norm": NaN, + "learning_rate": 0.00020902846384601638, + "loss": 0.0, + "step": 25130 + }, + { + "epoch": 2.3449659419613695, + "grad_norm": NaN, + "learning_rate": 0.0002090215102725128, + "loss": 0.0, + "step": 25131 + }, + { + "epoch": 2.345059251656247, + "grad_norm": NaN, + "learning_rate": 0.00020901455654893502, + "loss": 0.0, + "step": 25132 + }, + { + "epoch": 2.3451525613511244, + "grad_norm": NaN, + "learning_rate": 0.00020900760267530066, + "loss": 0.0, + "step": 25133 + }, + { + "epoch": 2.345245871046002, + "grad_norm": NaN, + "learning_rate": 0.00020900064865162743, + "loss": 0.0, + "step": 25134 + }, + { + "epoch": 2.345339180740879, + "grad_norm": NaN, + "learning_rate": 0.0002089936944779331, + "loss": 0.0, + "step": 25135 + }, + { + "epoch": 2.345432490435756, + "grad_norm": NaN, + "learning_rate": 0.00020898674015423525, + "loss": 0.0, + "step": 25136 + }, + { + "epoch": 2.3455258001306336, + "grad_norm": NaN, + "learning_rate": 0.00020897978568055155, + "loss": 0.0, + "step": 25137 + }, + { + "epoch": 2.345619109825511, + "grad_norm": NaN, + "learning_rate": 0.0002089728310568998, + "loss": 0.0, + "step": 25138 + }, + { + "epoch": 2.345712419520388, + "grad_norm": NaN, + "learning_rate": 0.00020896587628329758, + "loss": 0.0, + "step": 25139 + }, + { + "epoch": 2.3458057292152654, + "grad_norm": NaN, + "learning_rate": 0.00020895892135976255, + "loss": 0.0, + "step": 25140 + }, + { + "epoch": 2.345899038910143, + "grad_norm": NaN, + "learning_rate": 0.0002089519662863125, + "loss": 0.0, + "step": 25141 + }, + { + "epoch": 2.3459923486050203, + "grad_norm": NaN, + "learning_rate": 0.00020894501106296506, + "loss": 0.0, + "step": 25142 + }, + { + "epoch": 2.3460856582998972, + "grad_norm": NaN, + "learning_rate": 0.00020893805568973788, + "loss": 0.0, + "step": 25143 + }, + { + "epoch": 2.3461789679947747, + "grad_norm": NaN, + "learning_rate": 0.00020893110016664875, + "loss": 0.0, + "step": 25144 + }, + { + "epoch": 2.346272277689652, + "grad_norm": NaN, + "learning_rate": 0.00020892414449371529, + "loss": 0.0, + "step": 25145 + }, + { + "epoch": 2.346365587384529, + "grad_norm": NaN, + "learning_rate": 0.00020891718867095509, + "loss": 0.0, + "step": 25146 + }, + { + "epoch": 2.3464588970794065, + "grad_norm": NaN, + "learning_rate": 0.000208910232698386, + "loss": 0.0, + "step": 25147 + }, + { + "epoch": 2.346552206774284, + "grad_norm": NaN, + "learning_rate": 0.00020890327657602565, + "loss": 0.0, + "step": 25148 + }, + { + "epoch": 2.3466455164691613, + "grad_norm": NaN, + "learning_rate": 0.00020889632030389167, + "loss": 0.0, + "step": 25149 + }, + { + "epoch": 2.3467388261640383, + "grad_norm": NaN, + "learning_rate": 0.00020888936388200184, + "loss": 0.0, + "step": 25150 + }, + { + "epoch": 2.3468321358589157, + "grad_norm": NaN, + "learning_rate": 0.0002088824073103738, + "loss": 0.0, + "step": 25151 + }, + { + "epoch": 2.346925445553793, + "grad_norm": NaN, + "learning_rate": 0.0002088754505890252, + "loss": 0.0, + "step": 25152 + }, + { + "epoch": 2.34701875524867, + "grad_norm": NaN, + "learning_rate": 0.00020886849371797378, + "loss": 0.0, + "step": 25153 + }, + { + "epoch": 2.3471120649435475, + "grad_norm": NaN, + "learning_rate": 0.0002088615366972373, + "loss": 0.0, + "step": 25154 + }, + { + "epoch": 2.347205374638425, + "grad_norm": NaN, + "learning_rate": 0.00020885457952683328, + "loss": 0.0, + "step": 25155 + }, + { + "epoch": 2.3472986843333024, + "grad_norm": NaN, + "learning_rate": 0.0002088476222067795, + "loss": 0.0, + "step": 25156 + }, + { + "epoch": 2.3473919940281798, + "grad_norm": NaN, + "learning_rate": 0.00020884066473709367, + "loss": 0.0, + "step": 25157 + }, + { + "epoch": 2.3474853037230567, + "grad_norm": NaN, + "learning_rate": 0.00020883370711779348, + "loss": 0.0, + "step": 25158 + }, + { + "epoch": 2.347578613417934, + "grad_norm": NaN, + "learning_rate": 0.00020882674934889653, + "loss": 0.0, + "step": 25159 + }, + { + "epoch": 2.3476719231128116, + "grad_norm": NaN, + "learning_rate": 0.00020881979143042064, + "loss": 0.0, + "step": 25160 + }, + { + "epoch": 2.3477652328076886, + "grad_norm": NaN, + "learning_rate": 0.00020881283336238347, + "loss": 0.0, + "step": 25161 + }, + { + "epoch": 2.347858542502566, + "grad_norm": NaN, + "learning_rate": 0.00020880587514480263, + "loss": 0.0, + "step": 25162 + }, + { + "epoch": 2.3479518521974434, + "grad_norm": NaN, + "learning_rate": 0.0002087989167776959, + "loss": 0.0, + "step": 25163 + }, + { + "epoch": 2.348045161892321, + "grad_norm": NaN, + "learning_rate": 0.00020879195826108092, + "loss": 0.0, + "step": 25164 + }, + { + "epoch": 2.348138471587198, + "grad_norm": NaN, + "learning_rate": 0.0002087849995949754, + "loss": 0.0, + "step": 25165 + }, + { + "epoch": 2.348231781282075, + "grad_norm": NaN, + "learning_rate": 0.00020877804077939704, + "loss": 0.0, + "step": 25166 + }, + { + "epoch": 2.3483250909769526, + "grad_norm": NaN, + "learning_rate": 0.0002087710818143636, + "loss": 0.0, + "step": 25167 + }, + { + "epoch": 2.3484184006718296, + "grad_norm": NaN, + "learning_rate": 0.0002087641226998926, + "loss": 0.0, + "step": 25168 + }, + { + "epoch": 2.348511710366707, + "grad_norm": NaN, + "learning_rate": 0.0002087571634360019, + "loss": 0.0, + "step": 25169 + }, + { + "epoch": 2.3486050200615844, + "grad_norm": NaN, + "learning_rate": 0.0002087502040227092, + "loss": 0.0, + "step": 25170 + }, + { + "epoch": 2.348698329756462, + "grad_norm": NaN, + "learning_rate": 0.00020874324446003205, + "loss": 0.0, + "step": 25171 + }, + { + "epoch": 2.348791639451339, + "grad_norm": NaN, + "learning_rate": 0.00020873628474798824, + "loss": 0.0, + "step": 25172 + }, + { + "epoch": 2.3488849491462163, + "grad_norm": NaN, + "learning_rate": 0.0002087293248865955, + "loss": 0.0, + "step": 25173 + }, + { + "epoch": 2.3489782588410937, + "grad_norm": NaN, + "learning_rate": 0.00020872236487587143, + "loss": 0.0, + "step": 25174 + }, + { + "epoch": 2.3490715685359707, + "grad_norm": NaN, + "learning_rate": 0.00020871540471583379, + "loss": 0.0, + "step": 25175 + }, + { + "epoch": 2.349164878230848, + "grad_norm": NaN, + "learning_rate": 0.00020870844440650034, + "loss": 0.0, + "step": 25176 + }, + { + "epoch": 2.3492581879257255, + "grad_norm": NaN, + "learning_rate": 0.00020870148394788865, + "loss": 0.0, + "step": 25177 + }, + { + "epoch": 2.349351497620603, + "grad_norm": NaN, + "learning_rate": 0.00020869452334001644, + "loss": 0.0, + "step": 25178 + }, + { + "epoch": 2.34944480731548, + "grad_norm": NaN, + "learning_rate": 0.00020868756258290148, + "loss": 0.0, + "step": 25179 + }, + { + "epoch": 2.3495381170103573, + "grad_norm": NaN, + "learning_rate": 0.00020868060167656145, + "loss": 0.0, + "step": 25180 + }, + { + "epoch": 2.3496314267052347, + "grad_norm": NaN, + "learning_rate": 0.00020867364062101396, + "loss": 0.0, + "step": 25181 + }, + { + "epoch": 2.349724736400112, + "grad_norm": NaN, + "learning_rate": 0.00020866667941627686, + "loss": 0.0, + "step": 25182 + }, + { + "epoch": 2.349818046094989, + "grad_norm": NaN, + "learning_rate": 0.00020865971806236782, + "loss": 0.0, + "step": 25183 + }, + { + "epoch": 2.3499113557898665, + "grad_norm": NaN, + "learning_rate": 0.00020865275655930436, + "loss": 0.0, + "step": 25184 + }, + { + "epoch": 2.350004665484744, + "grad_norm": NaN, + "learning_rate": 0.0002086457949071044, + "loss": 0.0, + "step": 25185 + }, + { + "epoch": 2.3500979751796214, + "grad_norm": NaN, + "learning_rate": 0.00020863883310578553, + "loss": 0.0, + "step": 25186 + }, + { + "epoch": 2.3501912848744984, + "grad_norm": NaN, + "learning_rate": 0.00020863187115536546, + "loss": 0.0, + "step": 25187 + }, + { + "epoch": 2.3502845945693758, + "grad_norm": NaN, + "learning_rate": 0.00020862490905586194, + "loss": 0.0, + "step": 25188 + }, + { + "epoch": 2.350377904264253, + "grad_norm": NaN, + "learning_rate": 0.00020861794680729267, + "loss": 0.0, + "step": 25189 + }, + { + "epoch": 2.35047121395913, + "grad_norm": NaN, + "learning_rate": 0.00020861098440967522, + "loss": 0.0, + "step": 25190 + }, + { + "epoch": 2.3505645236540076, + "grad_norm": NaN, + "learning_rate": 0.0002086040218630275, + "loss": 0.0, + "step": 25191 + }, + { + "epoch": 2.350657833348885, + "grad_norm": NaN, + "learning_rate": 0.00020859705916736711, + "loss": 0.0, + "step": 25192 + }, + { + "epoch": 2.3507511430437624, + "grad_norm": NaN, + "learning_rate": 0.0002085900963227117, + "loss": 0.0, + "step": 25193 + }, + { + "epoch": 2.3508444527386394, + "grad_norm": NaN, + "learning_rate": 0.00020858313332907906, + "loss": 0.0, + "step": 25194 + }, + { + "epoch": 2.350937762433517, + "grad_norm": NaN, + "learning_rate": 0.00020857617018648687, + "loss": 0.0, + "step": 25195 + }, + { + "epoch": 2.3510310721283942, + "grad_norm": NaN, + "learning_rate": 0.00020856920689495283, + "loss": 0.0, + "step": 25196 + }, + { + "epoch": 2.351124381823271, + "grad_norm": NaN, + "learning_rate": 0.0002085622434544946, + "loss": 0.0, + "step": 25197 + }, + { + "epoch": 2.3512176915181486, + "grad_norm": NaN, + "learning_rate": 0.00020855527986512996, + "loss": 0.0, + "step": 25198 + }, + { + "epoch": 2.351311001213026, + "grad_norm": NaN, + "learning_rate": 0.00020854831612687665, + "loss": 0.0, + "step": 25199 + }, + { + "epoch": 2.3514043109079035, + "grad_norm": NaN, + "learning_rate": 0.00020854135223975223, + "loss": 0.0, + "step": 25200 + }, + { + "epoch": 2.3514976206027804, + "grad_norm": NaN, + "learning_rate": 0.0002085343882037745, + "loss": 0.0, + "step": 25201 + }, + { + "epoch": 2.351590930297658, + "grad_norm": NaN, + "learning_rate": 0.00020852742401896126, + "loss": 0.0, + "step": 25202 + }, + { + "epoch": 2.3516842399925353, + "grad_norm": NaN, + "learning_rate": 0.00020852045968533, + "loss": 0.0, + "step": 25203 + }, + { + "epoch": 2.3517775496874127, + "grad_norm": NaN, + "learning_rate": 0.00020851349520289859, + "loss": 0.0, + "step": 25204 + }, + { + "epoch": 2.3518708593822897, + "grad_norm": NaN, + "learning_rate": 0.00020850653057168472, + "loss": 0.0, + "step": 25205 + }, + { + "epoch": 2.351964169077167, + "grad_norm": NaN, + "learning_rate": 0.000208499565791706, + "loss": 0.0, + "step": 25206 + }, + { + "epoch": 2.3520574787720445, + "grad_norm": NaN, + "learning_rate": 0.00020849260086298028, + "loss": 0.0, + "step": 25207 + }, + { + "epoch": 2.352150788466922, + "grad_norm": NaN, + "learning_rate": 0.00020848563578552523, + "loss": 0.0, + "step": 25208 + }, + { + "epoch": 2.352244098161799, + "grad_norm": NaN, + "learning_rate": 0.00020847867055935843, + "loss": 0.0, + "step": 25209 + }, + { + "epoch": 2.3523374078566763, + "grad_norm": NaN, + "learning_rate": 0.00020847170518449777, + "loss": 0.0, + "step": 25210 + }, + { + "epoch": 2.3524307175515538, + "grad_norm": NaN, + "learning_rate": 0.0002084647396609609, + "loss": 0.0, + "step": 25211 + }, + { + "epoch": 2.3525240272464307, + "grad_norm": NaN, + "learning_rate": 0.00020845777398876547, + "loss": 0.0, + "step": 25212 + }, + { + "epoch": 2.352617336941308, + "grad_norm": NaN, + "learning_rate": 0.00020845080816792922, + "loss": 0.0, + "step": 25213 + }, + { + "epoch": 2.3527106466361856, + "grad_norm": NaN, + "learning_rate": 0.00020844384219846992, + "loss": 0.0, + "step": 25214 + }, + { + "epoch": 2.352803956331063, + "grad_norm": NaN, + "learning_rate": 0.0002084368760804052, + "loss": 0.0, + "step": 25215 + }, + { + "epoch": 2.35289726602594, + "grad_norm": NaN, + "learning_rate": 0.00020842990981375287, + "loss": 0.0, + "step": 25216 + }, + { + "epoch": 2.3529905757208174, + "grad_norm": NaN, + "learning_rate": 0.00020842294339853056, + "loss": 0.0, + "step": 25217 + }, + { + "epoch": 2.353083885415695, + "grad_norm": NaN, + "learning_rate": 0.00020841597683475606, + "loss": 0.0, + "step": 25218 + }, + { + "epoch": 2.3531771951105718, + "grad_norm": NaN, + "learning_rate": 0.00020840901012244694, + "loss": 0.0, + "step": 25219 + }, + { + "epoch": 2.353270504805449, + "grad_norm": NaN, + "learning_rate": 0.00020840204326162104, + "loss": 0.0, + "step": 25220 + }, + { + "epoch": 2.3533638145003266, + "grad_norm": NaN, + "learning_rate": 0.00020839507625229612, + "loss": 0.0, + "step": 25221 + }, + { + "epoch": 2.353457124195204, + "grad_norm": NaN, + "learning_rate": 0.0002083881090944897, + "loss": 0.0, + "step": 25222 + }, + { + "epoch": 2.353550433890081, + "grad_norm": NaN, + "learning_rate": 0.0002083811417882197, + "loss": 0.0, + "step": 25223 + }, + { + "epoch": 2.3536437435849584, + "grad_norm": NaN, + "learning_rate": 0.00020837417433350374, + "loss": 0.0, + "step": 25224 + }, + { + "epoch": 2.353737053279836, + "grad_norm": NaN, + "learning_rate": 0.0002083672067303595, + "loss": 0.0, + "step": 25225 + }, + { + "epoch": 2.3538303629747133, + "grad_norm": NaN, + "learning_rate": 0.00020836023897880475, + "loss": 0.0, + "step": 25226 + }, + { + "epoch": 2.3539236726695902, + "grad_norm": NaN, + "learning_rate": 0.00020835327107885727, + "loss": 0.0, + "step": 25227 + }, + { + "epoch": 2.3540169823644677, + "grad_norm": NaN, + "learning_rate": 0.00020834630303053465, + "loss": 0.0, + "step": 25228 + }, + { + "epoch": 2.354110292059345, + "grad_norm": NaN, + "learning_rate": 0.00020833933483385467, + "loss": 0.0, + "step": 25229 + }, + { + "epoch": 2.3542036017542225, + "grad_norm": NaN, + "learning_rate": 0.00020833236648883509, + "loss": 0.0, + "step": 25230 + }, + { + "epoch": 2.3542969114490995, + "grad_norm": NaN, + "learning_rate": 0.00020832539799549348, + "loss": 0.0, + "step": 25231 + }, + { + "epoch": 2.354390221143977, + "grad_norm": NaN, + "learning_rate": 0.00020831842935384776, + "loss": 0.0, + "step": 25232 + }, + { + "epoch": 2.3544835308388543, + "grad_norm": NaN, + "learning_rate": 0.0002083114605639155, + "loss": 0.0, + "step": 25233 + }, + { + "epoch": 2.3545768405337313, + "grad_norm": NaN, + "learning_rate": 0.00020830449162571445, + "loss": 0.0, + "step": 25234 + }, + { + "epoch": 2.3546701502286087, + "grad_norm": NaN, + "learning_rate": 0.0002082975225392624, + "loss": 0.0, + "step": 25235 + }, + { + "epoch": 2.354763459923486, + "grad_norm": NaN, + "learning_rate": 0.000208290553304577, + "loss": 0.0, + "step": 25236 + }, + { + "epoch": 2.3548567696183635, + "grad_norm": NaN, + "learning_rate": 0.00020828358392167598, + "loss": 0.0, + "step": 25237 + }, + { + "epoch": 2.3549500793132405, + "grad_norm": NaN, + "learning_rate": 0.00020827661439057707, + "loss": 0.0, + "step": 25238 + }, + { + "epoch": 2.355043389008118, + "grad_norm": NaN, + "learning_rate": 0.000208269644711298, + "loss": 0.0, + "step": 25239 + }, + { + "epoch": 2.3551366987029954, + "grad_norm": NaN, + "learning_rate": 0.0002082626748838565, + "loss": 0.0, + "step": 25240 + }, + { + "epoch": 2.3552300083978723, + "grad_norm": NaN, + "learning_rate": 0.00020825570490827025, + "loss": 0.0, + "step": 25241 + }, + { + "epoch": 2.3553233180927498, + "grad_norm": NaN, + "learning_rate": 0.00020824873478455703, + "loss": 0.0, + "step": 25242 + }, + { + "epoch": 2.355416627787627, + "grad_norm": NaN, + "learning_rate": 0.00020824176451273452, + "loss": 0.0, + "step": 25243 + }, + { + "epoch": 2.3555099374825046, + "grad_norm": NaN, + "learning_rate": 0.00020823479409282043, + "loss": 0.0, + "step": 25244 + }, + { + "epoch": 2.3556032471773816, + "grad_norm": NaN, + "learning_rate": 0.0002082278235248325, + "loss": 0.0, + "step": 25245 + }, + { + "epoch": 2.355696556872259, + "grad_norm": NaN, + "learning_rate": 0.00020822085280878852, + "loss": 0.0, + "step": 25246 + }, + { + "epoch": 2.3557898665671364, + "grad_norm": NaN, + "learning_rate": 0.00020821388194470613, + "loss": 0.0, + "step": 25247 + }, + { + "epoch": 2.3558831762620134, + "grad_norm": NaN, + "learning_rate": 0.00020820691093260306, + "loss": 0.0, + "step": 25248 + }, + { + "epoch": 2.355976485956891, + "grad_norm": NaN, + "learning_rate": 0.0002081999397724971, + "loss": 0.0, + "step": 25249 + }, + { + "epoch": 2.356069795651768, + "grad_norm": NaN, + "learning_rate": 0.0002081929684644059, + "loss": 0.0, + "step": 25250 + }, + { + "epoch": 2.3561631053466456, + "grad_norm": NaN, + "learning_rate": 0.00020818599700834722, + "loss": 0.0, + "step": 25251 + }, + { + "epoch": 2.356256415041523, + "grad_norm": NaN, + "learning_rate": 0.0002081790254043388, + "loss": 0.0, + "step": 25252 + }, + { + "epoch": 2.3563497247364, + "grad_norm": NaN, + "learning_rate": 0.00020817205365239832, + "loss": 0.0, + "step": 25253 + }, + { + "epoch": 2.3564430344312775, + "grad_norm": NaN, + "learning_rate": 0.00020816508175254357, + "loss": 0.0, + "step": 25254 + }, + { + "epoch": 2.356536344126155, + "grad_norm": NaN, + "learning_rate": 0.00020815810970479224, + "loss": 0.0, + "step": 25255 + }, + { + "epoch": 2.356629653821032, + "grad_norm": NaN, + "learning_rate": 0.00020815113750916208, + "loss": 0.0, + "step": 25256 + }, + { + "epoch": 2.3567229635159093, + "grad_norm": NaN, + "learning_rate": 0.00020814416516567078, + "loss": 0.0, + "step": 25257 + }, + { + "epoch": 2.3568162732107867, + "grad_norm": NaN, + "learning_rate": 0.0002081371926743361, + "loss": 0.0, + "step": 25258 + }, + { + "epoch": 2.356909582905664, + "grad_norm": NaN, + "learning_rate": 0.00020813022003517576, + "loss": 0.0, + "step": 25259 + }, + { + "epoch": 2.357002892600541, + "grad_norm": NaN, + "learning_rate": 0.0002081232472482075, + "loss": 0.0, + "step": 25260 + }, + { + "epoch": 2.3570962022954185, + "grad_norm": NaN, + "learning_rate": 0.00020811627431344904, + "loss": 0.0, + "step": 25261 + }, + { + "epoch": 2.357189511990296, + "grad_norm": NaN, + "learning_rate": 0.0002081093012309181, + "loss": 0.0, + "step": 25262 + }, + { + "epoch": 2.357282821685173, + "grad_norm": NaN, + "learning_rate": 0.0002081023280006324, + "loss": 0.0, + "step": 25263 + }, + { + "epoch": 2.3573761313800503, + "grad_norm": NaN, + "learning_rate": 0.00020809535462260973, + "loss": 0.0, + "step": 25264 + }, + { + "epoch": 2.3574694410749277, + "grad_norm": NaN, + "learning_rate": 0.00020808838109686779, + "loss": 0.0, + "step": 25265 + }, + { + "epoch": 2.357562750769805, + "grad_norm": NaN, + "learning_rate": 0.00020808140742342425, + "loss": 0.0, + "step": 25266 + }, + { + "epoch": 2.357656060464682, + "grad_norm": NaN, + "learning_rate": 0.00020807443360229693, + "loss": 0.0, + "step": 25267 + }, + { + "epoch": 2.3577493701595595, + "grad_norm": NaN, + "learning_rate": 0.00020806745963350355, + "loss": 0.0, + "step": 25268 + }, + { + "epoch": 2.357842679854437, + "grad_norm": NaN, + "learning_rate": 0.00020806048551706176, + "loss": 0.0, + "step": 25269 + }, + { + "epoch": 2.357935989549314, + "grad_norm": NaN, + "learning_rate": 0.00020805351125298942, + "loss": 0.0, + "step": 25270 + }, + { + "epoch": 2.3580292992441914, + "grad_norm": NaN, + "learning_rate": 0.00020804653684130416, + "loss": 0.0, + "step": 25271 + }, + { + "epoch": 2.3581226089390688, + "grad_norm": NaN, + "learning_rate": 0.00020803956228202376, + "loss": 0.0, + "step": 25272 + }, + { + "epoch": 2.358215918633946, + "grad_norm": NaN, + "learning_rate": 0.00020803258757516593, + "loss": 0.0, + "step": 25273 + }, + { + "epoch": 2.358309228328823, + "grad_norm": NaN, + "learning_rate": 0.00020802561272074843, + "loss": 0.0, + "step": 25274 + }, + { + "epoch": 2.3584025380237006, + "grad_norm": NaN, + "learning_rate": 0.000208018637718789, + "loss": 0.0, + "step": 25275 + }, + { + "epoch": 2.358495847718578, + "grad_norm": NaN, + "learning_rate": 0.00020801166256930535, + "loss": 0.0, + "step": 25276 + }, + { + "epoch": 2.3585891574134554, + "grad_norm": NaN, + "learning_rate": 0.0002080046872723152, + "loss": 0.0, + "step": 25277 + }, + { + "epoch": 2.3586824671083324, + "grad_norm": NaN, + "learning_rate": 0.00020799771182783633, + "loss": 0.0, + "step": 25278 + }, + { + "epoch": 2.35877577680321, + "grad_norm": NaN, + "learning_rate": 0.00020799073623588648, + "loss": 0.0, + "step": 25279 + }, + { + "epoch": 2.3588690864980872, + "grad_norm": NaN, + "learning_rate": 0.00020798376049648338, + "loss": 0.0, + "step": 25280 + }, + { + "epoch": 2.3589623961929647, + "grad_norm": NaN, + "learning_rate": 0.0002079767846096447, + "loss": 0.0, + "step": 25281 + }, + { + "epoch": 2.3590557058878416, + "grad_norm": NaN, + "learning_rate": 0.00020796980857538827, + "loss": 0.0, + "step": 25282 + }, + { + "epoch": 2.359149015582719, + "grad_norm": NaN, + "learning_rate": 0.00020796283239373178, + "loss": 0.0, + "step": 25283 + }, + { + "epoch": 2.3592423252775965, + "grad_norm": NaN, + "learning_rate": 0.00020795585606469297, + "loss": 0.0, + "step": 25284 + }, + { + "epoch": 2.3593356349724734, + "grad_norm": NaN, + "learning_rate": 0.00020794887958828958, + "loss": 0.0, + "step": 25285 + }, + { + "epoch": 2.359428944667351, + "grad_norm": NaN, + "learning_rate": 0.00020794190296453938, + "loss": 0.0, + "step": 25286 + }, + { + "epoch": 2.3595222543622283, + "grad_norm": NaN, + "learning_rate": 0.0002079349261934601, + "loss": 0.0, + "step": 25287 + }, + { + "epoch": 2.3596155640571057, + "grad_norm": NaN, + "learning_rate": 0.0002079279492750694, + "loss": 0.0, + "step": 25288 + }, + { + "epoch": 2.3597088737519827, + "grad_norm": NaN, + "learning_rate": 0.00020792097220938513, + "loss": 0.0, + "step": 25289 + }, + { + "epoch": 2.35980218344686, + "grad_norm": NaN, + "learning_rate": 0.00020791399499642496, + "loss": 0.0, + "step": 25290 + }, + { + "epoch": 2.3598954931417375, + "grad_norm": NaN, + "learning_rate": 0.00020790701763620666, + "loss": 0.0, + "step": 25291 + }, + { + "epoch": 2.3599888028366145, + "grad_norm": NaN, + "learning_rate": 0.000207900040128748, + "loss": 0.0, + "step": 25292 + }, + { + "epoch": 2.360082112531492, + "grad_norm": NaN, + "learning_rate": 0.00020789306247406665, + "loss": 0.0, + "step": 25293 + }, + { + "epoch": 2.3601754222263693, + "grad_norm": NaN, + "learning_rate": 0.0002078860846721804, + "loss": 0.0, + "step": 25294 + }, + { + "epoch": 2.3602687319212468, + "grad_norm": NaN, + "learning_rate": 0.00020787910672310698, + "loss": 0.0, + "step": 25295 + }, + { + "epoch": 2.3603620416161237, + "grad_norm": NaN, + "learning_rate": 0.00020787212862686418, + "loss": 0.0, + "step": 25296 + }, + { + "epoch": 2.360455351311001, + "grad_norm": NaN, + "learning_rate": 0.00020786515038346965, + "loss": 0.0, + "step": 25297 + }, + { + "epoch": 2.3605486610058786, + "grad_norm": NaN, + "learning_rate": 0.0002078581719929412, + "loss": 0.0, + "step": 25298 + }, + { + "epoch": 2.360641970700756, + "grad_norm": NaN, + "learning_rate": 0.00020785119345529657, + "loss": 0.0, + "step": 25299 + }, + { + "epoch": 2.360735280395633, + "grad_norm": NaN, + "learning_rate": 0.00020784421477055344, + "loss": 0.0, + "step": 25300 + }, + { + "epoch": 2.3608285900905104, + "grad_norm": NaN, + "learning_rate": 0.00020783723593872968, + "loss": 0.0, + "step": 25301 + }, + { + "epoch": 2.360921899785388, + "grad_norm": NaN, + "learning_rate": 0.0002078302569598429, + "loss": 0.0, + "step": 25302 + }, + { + "epoch": 2.361015209480265, + "grad_norm": NaN, + "learning_rate": 0.00020782327783391093, + "loss": 0.0, + "step": 25303 + }, + { + "epoch": 2.361108519175142, + "grad_norm": NaN, + "learning_rate": 0.0002078162985609515, + "loss": 0.0, + "step": 25304 + }, + { + "epoch": 2.3612018288700196, + "grad_norm": NaN, + "learning_rate": 0.00020780931914098237, + "loss": 0.0, + "step": 25305 + }, + { + "epoch": 2.361295138564897, + "grad_norm": NaN, + "learning_rate": 0.00020780233957402126, + "loss": 0.0, + "step": 25306 + }, + { + "epoch": 2.361388448259774, + "grad_norm": NaN, + "learning_rate": 0.0002077953598600859, + "loss": 0.0, + "step": 25307 + }, + { + "epoch": 2.3614817579546514, + "grad_norm": NaN, + "learning_rate": 0.00020778837999919404, + "loss": 0.0, + "step": 25308 + }, + { + "epoch": 2.361575067649529, + "grad_norm": NaN, + "learning_rate": 0.00020778139999136347, + "loss": 0.0, + "step": 25309 + }, + { + "epoch": 2.3616683773444063, + "grad_norm": NaN, + "learning_rate": 0.0002077744198366119, + "loss": 0.0, + "step": 25310 + }, + { + "epoch": 2.3617616870392832, + "grad_norm": NaN, + "learning_rate": 0.00020776743953495716, + "loss": 0.0, + "step": 25311 + }, + { + "epoch": 2.3618549967341607, + "grad_norm": NaN, + "learning_rate": 0.00020776045908641686, + "loss": 0.0, + "step": 25312 + }, + { + "epoch": 2.361948306429038, + "grad_norm": NaN, + "learning_rate": 0.00020775347849100886, + "loss": 0.0, + "step": 25313 + }, + { + "epoch": 2.362041616123915, + "grad_norm": NaN, + "learning_rate": 0.00020774649774875088, + "loss": 0.0, + "step": 25314 + }, + { + "epoch": 2.3621349258187925, + "grad_norm": NaN, + "learning_rate": 0.00020773951685966062, + "loss": 0.0, + "step": 25315 + }, + { + "epoch": 2.36222823551367, + "grad_norm": NaN, + "learning_rate": 0.00020773253582375588, + "loss": 0.0, + "step": 25316 + }, + { + "epoch": 2.3623215452085473, + "grad_norm": NaN, + "learning_rate": 0.00020772555464105443, + "loss": 0.0, + "step": 25317 + }, + { + "epoch": 2.3624148549034243, + "grad_norm": NaN, + "learning_rate": 0.000207718573311574, + "loss": 0.0, + "step": 25318 + }, + { + "epoch": 2.3625081645983017, + "grad_norm": NaN, + "learning_rate": 0.0002077115918353323, + "loss": 0.0, + "step": 25319 + }, + { + "epoch": 2.362601474293179, + "grad_norm": NaN, + "learning_rate": 0.00020770461021234717, + "loss": 0.0, + "step": 25320 + }, + { + "epoch": 2.3626947839880565, + "grad_norm": NaN, + "learning_rate": 0.00020769762844263627, + "loss": 0.0, + "step": 25321 + }, + { + "epoch": 2.3627880936829335, + "grad_norm": NaN, + "learning_rate": 0.0002076906465262174, + "loss": 0.0, + "step": 25322 + }, + { + "epoch": 2.362881403377811, + "grad_norm": NaN, + "learning_rate": 0.00020768366446310832, + "loss": 0.0, + "step": 25323 + }, + { + "epoch": 2.3629747130726884, + "grad_norm": NaN, + "learning_rate": 0.00020767668225332673, + "loss": 0.0, + "step": 25324 + }, + { + "epoch": 2.363068022767566, + "grad_norm": NaN, + "learning_rate": 0.00020766969989689042, + "loss": 0.0, + "step": 25325 + }, + { + "epoch": 2.3631613324624428, + "grad_norm": NaN, + "learning_rate": 0.00020766271739381718, + "loss": 0.0, + "step": 25326 + }, + { + "epoch": 2.36325464215732, + "grad_norm": NaN, + "learning_rate": 0.00020765573474412473, + "loss": 0.0, + "step": 25327 + }, + { + "epoch": 2.3633479518521976, + "grad_norm": NaN, + "learning_rate": 0.00020764875194783083, + "loss": 0.0, + "step": 25328 + }, + { + "epoch": 2.3634412615470746, + "grad_norm": NaN, + "learning_rate": 0.00020764176900495322, + "loss": 0.0, + "step": 25329 + }, + { + "epoch": 2.363534571241952, + "grad_norm": NaN, + "learning_rate": 0.00020763478591550967, + "loss": 0.0, + "step": 25330 + }, + { + "epoch": 2.3636278809368294, + "grad_norm": NaN, + "learning_rate": 0.0002076278026795179, + "loss": 0.0, + "step": 25331 + }, + { + "epoch": 2.363721190631707, + "grad_norm": NaN, + "learning_rate": 0.00020762081929699572, + "loss": 0.0, + "step": 25332 + }, + { + "epoch": 2.363814500326584, + "grad_norm": NaN, + "learning_rate": 0.00020761383576796088, + "loss": 0.0, + "step": 25333 + }, + { + "epoch": 2.363907810021461, + "grad_norm": NaN, + "learning_rate": 0.00020760685209243108, + "loss": 0.0, + "step": 25334 + }, + { + "epoch": 2.3640011197163386, + "grad_norm": NaN, + "learning_rate": 0.0002075998682704242, + "loss": 0.0, + "step": 25335 + }, + { + "epoch": 2.3640944294112156, + "grad_norm": NaN, + "learning_rate": 0.00020759288430195785, + "loss": 0.0, + "step": 25336 + }, + { + "epoch": 2.364187739106093, + "grad_norm": NaN, + "learning_rate": 0.00020758590018704986, + "loss": 0.0, + "step": 25337 + }, + { + "epoch": 2.3642810488009705, + "grad_norm": NaN, + "learning_rate": 0.00020757891592571798, + "loss": 0.0, + "step": 25338 + }, + { + "epoch": 2.364374358495848, + "grad_norm": NaN, + "learning_rate": 0.00020757193151798, + "loss": 0.0, + "step": 25339 + }, + { + "epoch": 2.364467668190725, + "grad_norm": NaN, + "learning_rate": 0.00020756494696385367, + "loss": 0.0, + "step": 25340 + }, + { + "epoch": 2.3645609778856023, + "grad_norm": NaN, + "learning_rate": 0.00020755796226335668, + "loss": 0.0, + "step": 25341 + }, + { + "epoch": 2.3646542875804797, + "grad_norm": NaN, + "learning_rate": 0.00020755097741650687, + "loss": 0.0, + "step": 25342 + }, + { + "epoch": 2.3647475972753567, + "grad_norm": NaN, + "learning_rate": 0.000207543992423322, + "loss": 0.0, + "step": 25343 + }, + { + "epoch": 2.364840906970234, + "grad_norm": NaN, + "learning_rate": 0.00020753700728381972, + "loss": 0.0, + "step": 25344 + }, + { + "epoch": 2.3649342166651115, + "grad_norm": NaN, + "learning_rate": 0.00020753002199801795, + "loss": 0.0, + "step": 25345 + }, + { + "epoch": 2.365027526359989, + "grad_norm": NaN, + "learning_rate": 0.00020752303656593435, + "loss": 0.0, + "step": 25346 + }, + { + "epoch": 2.3651208360548663, + "grad_norm": NaN, + "learning_rate": 0.0002075160509875867, + "loss": 0.0, + "step": 25347 + }, + { + "epoch": 2.3652141457497433, + "grad_norm": NaN, + "learning_rate": 0.00020750906526299277, + "loss": 0.0, + "step": 25348 + }, + { + "epoch": 2.3653074554446207, + "grad_norm": NaN, + "learning_rate": 0.00020750207939217036, + "loss": 0.0, + "step": 25349 + }, + { + "epoch": 2.365400765139498, + "grad_norm": NaN, + "learning_rate": 0.00020749509337513716, + "loss": 0.0, + "step": 25350 + }, + { + "epoch": 2.365494074834375, + "grad_norm": NaN, + "learning_rate": 0.00020748810721191098, + "loss": 0.0, + "step": 25351 + }, + { + "epoch": 2.3655873845292525, + "grad_norm": NaN, + "learning_rate": 0.00020748112090250955, + "loss": 0.0, + "step": 25352 + }, + { + "epoch": 2.36568069422413, + "grad_norm": NaN, + "learning_rate": 0.00020747413444695067, + "loss": 0.0, + "step": 25353 + }, + { + "epoch": 2.3657740039190074, + "grad_norm": NaN, + "learning_rate": 0.0002074671478452521, + "loss": 0.0, + "step": 25354 + }, + { + "epoch": 2.3658673136138844, + "grad_norm": NaN, + "learning_rate": 0.0002074601610974316, + "loss": 0.0, + "step": 25355 + }, + { + "epoch": 2.365960623308762, + "grad_norm": NaN, + "learning_rate": 0.00020745317420350692, + "loss": 0.0, + "step": 25356 + }, + { + "epoch": 2.366053933003639, + "grad_norm": NaN, + "learning_rate": 0.00020744618716349585, + "loss": 0.0, + "step": 25357 + }, + { + "epoch": 2.366147242698516, + "grad_norm": NaN, + "learning_rate": 0.00020743919997741613, + "loss": 0.0, + "step": 25358 + }, + { + "epoch": 2.3662405523933936, + "grad_norm": NaN, + "learning_rate": 0.00020743221264528554, + "loss": 0.0, + "step": 25359 + }, + { + "epoch": 2.366333862088271, + "grad_norm": NaN, + "learning_rate": 0.00020742522516712183, + "loss": 0.0, + "step": 25360 + }, + { + "epoch": 2.3664271717831484, + "grad_norm": NaN, + "learning_rate": 0.00020741823754294282, + "loss": 0.0, + "step": 25361 + }, + { + "epoch": 2.3665204814780254, + "grad_norm": NaN, + "learning_rate": 0.0002074112497727662, + "loss": 0.0, + "step": 25362 + }, + { + "epoch": 2.366613791172903, + "grad_norm": NaN, + "learning_rate": 0.0002074042618566098, + "loss": 0.0, + "step": 25363 + }, + { + "epoch": 2.3667071008677802, + "grad_norm": NaN, + "learning_rate": 0.00020739727379449137, + "loss": 0.0, + "step": 25364 + }, + { + "epoch": 2.366800410562657, + "grad_norm": NaN, + "learning_rate": 0.0002073902855864287, + "loss": 0.0, + "step": 25365 + }, + { + "epoch": 2.3668937202575346, + "grad_norm": NaN, + "learning_rate": 0.00020738329723243948, + "loss": 0.0, + "step": 25366 + }, + { + "epoch": 2.366987029952412, + "grad_norm": NaN, + "learning_rate": 0.00020737630873254157, + "loss": 0.0, + "step": 25367 + }, + { + "epoch": 2.3670803396472895, + "grad_norm": NaN, + "learning_rate": 0.00020736932008675268, + "loss": 0.0, + "step": 25368 + }, + { + "epoch": 2.367173649342167, + "grad_norm": NaN, + "learning_rate": 0.00020736233129509064, + "loss": 0.0, + "step": 25369 + }, + { + "epoch": 2.367266959037044, + "grad_norm": NaN, + "learning_rate": 0.00020735534235757314, + "loss": 0.0, + "step": 25370 + }, + { + "epoch": 2.3673602687319213, + "grad_norm": NaN, + "learning_rate": 0.00020734835327421803, + "loss": 0.0, + "step": 25371 + }, + { + "epoch": 2.3674535784267987, + "grad_norm": NaN, + "learning_rate": 0.00020734136404504302, + "loss": 0.0, + "step": 25372 + }, + { + "epoch": 2.3675468881216757, + "grad_norm": NaN, + "learning_rate": 0.00020733437467006593, + "loss": 0.0, + "step": 25373 + }, + { + "epoch": 2.367640197816553, + "grad_norm": NaN, + "learning_rate": 0.0002073273851493045, + "loss": 0.0, + "step": 25374 + }, + { + "epoch": 2.3677335075114305, + "grad_norm": NaN, + "learning_rate": 0.00020732039548277645, + "loss": 0.0, + "step": 25375 + }, + { + "epoch": 2.367826817206308, + "grad_norm": NaN, + "learning_rate": 0.00020731340567049967, + "loss": 0.0, + "step": 25376 + }, + { + "epoch": 2.367920126901185, + "grad_norm": NaN, + "learning_rate": 0.0002073064157124919, + "loss": 0.0, + "step": 25377 + }, + { + "epoch": 2.3680134365960623, + "grad_norm": NaN, + "learning_rate": 0.0002072994256087709, + "loss": 0.0, + "step": 25378 + }, + { + "epoch": 2.3681067462909398, + "grad_norm": NaN, + "learning_rate": 0.00020729243535935437, + "loss": 0.0, + "step": 25379 + }, + { + "epoch": 2.3682000559858167, + "grad_norm": NaN, + "learning_rate": 0.00020728544496426019, + "loss": 0.0, + "step": 25380 + }, + { + "epoch": 2.368293365680694, + "grad_norm": NaN, + "learning_rate": 0.00020727845442350607, + "loss": 0.0, + "step": 25381 + }, + { + "epoch": 2.3683866753755716, + "grad_norm": NaN, + "learning_rate": 0.00020727146373710983, + "loss": 0.0, + "step": 25382 + }, + { + "epoch": 2.368479985070449, + "grad_norm": NaN, + "learning_rate": 0.0002072644729050892, + "loss": 0.0, + "step": 25383 + }, + { + "epoch": 2.368573294765326, + "grad_norm": NaN, + "learning_rate": 0.000207257481927462, + "loss": 0.0, + "step": 25384 + }, + { + "epoch": 2.3686666044602034, + "grad_norm": NaN, + "learning_rate": 0.00020725049080424596, + "loss": 0.0, + "step": 25385 + }, + { + "epoch": 2.368759914155081, + "grad_norm": NaN, + "learning_rate": 0.0002072434995354589, + "loss": 0.0, + "step": 25386 + }, + { + "epoch": 2.368853223849958, + "grad_norm": NaN, + "learning_rate": 0.00020723650812111855, + "loss": 0.0, + "step": 25387 + }, + { + "epoch": 2.368946533544835, + "grad_norm": NaN, + "learning_rate": 0.00020722951656124275, + "loss": 0.0, + "step": 25388 + }, + { + "epoch": 2.3690398432397126, + "grad_norm": NaN, + "learning_rate": 0.00020722252485584923, + "loss": 0.0, + "step": 25389 + }, + { + "epoch": 2.36913315293459, + "grad_norm": NaN, + "learning_rate": 0.00020721553300495578, + "loss": 0.0, + "step": 25390 + }, + { + "epoch": 2.369226462629467, + "grad_norm": NaN, + "learning_rate": 0.0002072085410085802, + "loss": 0.0, + "step": 25391 + }, + { + "epoch": 2.3693197723243444, + "grad_norm": NaN, + "learning_rate": 0.0002072015488667402, + "loss": 0.0, + "step": 25392 + }, + { + "epoch": 2.369413082019222, + "grad_norm": NaN, + "learning_rate": 0.00020719455657945362, + "loss": 0.0, + "step": 25393 + }, + { + "epoch": 2.3695063917140993, + "grad_norm": NaN, + "learning_rate": 0.00020718756414673826, + "loss": 0.0, + "step": 25394 + }, + { + "epoch": 2.3695997014089762, + "grad_norm": NaN, + "learning_rate": 0.0002071805715686118, + "loss": 0.0, + "step": 25395 + }, + { + "epoch": 2.3696930111038537, + "grad_norm": NaN, + "learning_rate": 0.00020717357884509217, + "loss": 0.0, + "step": 25396 + }, + { + "epoch": 2.369786320798731, + "grad_norm": NaN, + "learning_rate": 0.00020716658597619698, + "loss": 0.0, + "step": 25397 + }, + { + "epoch": 2.3698796304936085, + "grad_norm": NaN, + "learning_rate": 0.00020715959296194417, + "loss": 0.0, + "step": 25398 + }, + { + "epoch": 2.3699729401884855, + "grad_norm": NaN, + "learning_rate": 0.0002071525998023514, + "loss": 0.0, + "step": 25399 + }, + { + "epoch": 2.370066249883363, + "grad_norm": NaN, + "learning_rate": 0.0002071456064974365, + "loss": 0.0, + "step": 25400 + }, + { + "epoch": 2.3701595595782403, + "grad_norm": NaN, + "learning_rate": 0.00020713861304721726, + "loss": 0.0, + "step": 25401 + }, + { + "epoch": 2.3702528692731173, + "grad_norm": NaN, + "learning_rate": 0.00020713161945171146, + "loss": 0.0, + "step": 25402 + }, + { + "epoch": 2.3703461789679947, + "grad_norm": NaN, + "learning_rate": 0.00020712462571093683, + "loss": 0.0, + "step": 25403 + }, + { + "epoch": 2.370439488662872, + "grad_norm": NaN, + "learning_rate": 0.00020711763182491125, + "loss": 0.0, + "step": 25404 + }, + { + "epoch": 2.3705327983577495, + "grad_norm": NaN, + "learning_rate": 0.00020711063779365243, + "loss": 0.0, + "step": 25405 + }, + { + "epoch": 2.3706261080526265, + "grad_norm": NaN, + "learning_rate": 0.0002071036436171782, + "loss": 0.0, + "step": 25406 + }, + { + "epoch": 2.370719417747504, + "grad_norm": NaN, + "learning_rate": 0.00020709664929550631, + "loss": 0.0, + "step": 25407 + }, + { + "epoch": 2.3708127274423814, + "grad_norm": NaN, + "learning_rate": 0.00020708965482865455, + "loss": 0.0, + "step": 25408 + }, + { + "epoch": 2.3709060371372583, + "grad_norm": NaN, + "learning_rate": 0.0002070826602166407, + "loss": 0.0, + "step": 25409 + }, + { + "epoch": 2.3709993468321358, + "grad_norm": NaN, + "learning_rate": 0.00020707566545948258, + "loss": 0.0, + "step": 25410 + }, + { + "epoch": 2.371092656527013, + "grad_norm": NaN, + "learning_rate": 0.0002070686705571979, + "loss": 0.0, + "step": 25411 + }, + { + "epoch": 2.3711859662218906, + "grad_norm": NaN, + "learning_rate": 0.00020706167550980453, + "loss": 0.0, + "step": 25412 + }, + { + "epoch": 2.3712792759167676, + "grad_norm": NaN, + "learning_rate": 0.00020705468031732022, + "loss": 0.0, + "step": 25413 + }, + { + "epoch": 2.371372585611645, + "grad_norm": NaN, + "learning_rate": 0.0002070476849797628, + "loss": 0.0, + "step": 25414 + }, + { + "epoch": 2.3714658953065224, + "grad_norm": NaN, + "learning_rate": 0.00020704068949714994, + "loss": 0.0, + "step": 25415 + }, + { + "epoch": 2.3715592050014, + "grad_norm": NaN, + "learning_rate": 0.00020703369386949953, + "loss": 0.0, + "step": 25416 + }, + { + "epoch": 2.371652514696277, + "grad_norm": NaN, + "learning_rate": 0.00020702669809682936, + "loss": 0.0, + "step": 25417 + }, + { + "epoch": 2.3717458243911542, + "grad_norm": NaN, + "learning_rate": 0.00020701970217915715, + "loss": 0.0, + "step": 25418 + }, + { + "epoch": 2.3718391340860316, + "grad_norm": NaN, + "learning_rate": 0.0002070127061165007, + "loss": 0.0, + "step": 25419 + }, + { + "epoch": 2.371932443780909, + "grad_norm": NaN, + "learning_rate": 0.00020700570990887793, + "loss": 0.0, + "step": 25420 + }, + { + "epoch": 2.372025753475786, + "grad_norm": NaN, + "learning_rate": 0.00020699871355630645, + "loss": 0.0, + "step": 25421 + }, + { + "epoch": 2.3721190631706635, + "grad_norm": NaN, + "learning_rate": 0.00020699171705880414, + "loss": 0.0, + "step": 25422 + }, + { + "epoch": 2.372212372865541, + "grad_norm": NaN, + "learning_rate": 0.0002069847204163888, + "loss": 0.0, + "step": 25423 + }, + { + "epoch": 2.372305682560418, + "grad_norm": NaN, + "learning_rate": 0.00020697772362907813, + "loss": 0.0, + "step": 25424 + }, + { + "epoch": 2.3723989922552953, + "grad_norm": NaN, + "learning_rate": 0.00020697072669689006, + "loss": 0.0, + "step": 25425 + }, + { + "epoch": 2.3724923019501727, + "grad_norm": NaN, + "learning_rate": 0.00020696372961984228, + "loss": 0.0, + "step": 25426 + }, + { + "epoch": 2.37258561164505, + "grad_norm": NaN, + "learning_rate": 0.00020695673239795262, + "loss": 0.0, + "step": 25427 + }, + { + "epoch": 2.372678921339927, + "grad_norm": NaN, + "learning_rate": 0.00020694973503123884, + "loss": 0.0, + "step": 25428 + }, + { + "epoch": 2.3727722310348045, + "grad_norm": NaN, + "learning_rate": 0.00020694273751971874, + "loss": 0.0, + "step": 25429 + }, + { + "epoch": 2.372865540729682, + "grad_norm": NaN, + "learning_rate": 0.00020693573986341017, + "loss": 0.0, + "step": 25430 + }, + { + "epoch": 2.372958850424559, + "grad_norm": NaN, + "learning_rate": 0.00020692874206233088, + "loss": 0.0, + "step": 25431 + }, + { + "epoch": 2.3730521601194363, + "grad_norm": NaN, + "learning_rate": 0.00020692174411649864, + "loss": 0.0, + "step": 25432 + }, + { + "epoch": 2.3731454698143137, + "grad_norm": NaN, + "learning_rate": 0.00020691474602593126, + "loss": 0.0, + "step": 25433 + }, + { + "epoch": 2.373238779509191, + "grad_norm": NaN, + "learning_rate": 0.00020690774779064656, + "loss": 0.0, + "step": 25434 + }, + { + "epoch": 2.373332089204068, + "grad_norm": NaN, + "learning_rate": 0.0002069007494106623, + "loss": 0.0, + "step": 25435 + }, + { + "epoch": 2.3734253988989455, + "grad_norm": NaN, + "learning_rate": 0.00020689375088599627, + "loss": 0.0, + "step": 25436 + }, + { + "epoch": 2.373518708593823, + "grad_norm": NaN, + "learning_rate": 0.0002068867522166663, + "loss": 0.0, + "step": 25437 + }, + { + "epoch": 2.3736120182887004, + "grad_norm": NaN, + "learning_rate": 0.00020687975340269018, + "loss": 0.0, + "step": 25438 + }, + { + "epoch": 2.3737053279835774, + "grad_norm": NaN, + "learning_rate": 0.0002068727544440857, + "loss": 0.0, + "step": 25439 + }, + { + "epoch": 2.373798637678455, + "grad_norm": NaN, + "learning_rate": 0.00020686575534087067, + "loss": 0.0, + "step": 25440 + }, + { + "epoch": 2.373891947373332, + "grad_norm": NaN, + "learning_rate": 0.00020685875609306278, + "loss": 0.0, + "step": 25441 + }, + { + "epoch": 2.3739852570682096, + "grad_norm": NaN, + "learning_rate": 0.00020685175670068004, + "loss": 0.0, + "step": 25442 + }, + { + "epoch": 2.3740785667630866, + "grad_norm": NaN, + "learning_rate": 0.00020684475716374005, + "loss": 0.0, + "step": 25443 + }, + { + "epoch": 2.374171876457964, + "grad_norm": NaN, + "learning_rate": 0.00020683775748226067, + "loss": 0.0, + "step": 25444 + }, + { + "epoch": 2.3742651861528414, + "grad_norm": NaN, + "learning_rate": 0.00020683075765625978, + "loss": 0.0, + "step": 25445 + }, + { + "epoch": 2.3743584958477184, + "grad_norm": NaN, + "learning_rate": 0.00020682375768575503, + "loss": 0.0, + "step": 25446 + }, + { + "epoch": 2.374451805542596, + "grad_norm": NaN, + "learning_rate": 0.00020681675757076436, + "loss": 0.0, + "step": 25447 + }, + { + "epoch": 2.3745451152374732, + "grad_norm": NaN, + "learning_rate": 0.00020680975731130545, + "loss": 0.0, + "step": 25448 + }, + { + "epoch": 2.3746384249323507, + "grad_norm": NaN, + "learning_rate": 0.0002068027569073962, + "loss": 0.0, + "step": 25449 + }, + { + "epoch": 2.3747317346272276, + "grad_norm": NaN, + "learning_rate": 0.00020679575635905434, + "loss": 0.0, + "step": 25450 + }, + { + "epoch": 2.374825044322105, + "grad_norm": NaN, + "learning_rate": 0.00020678875566629772, + "loss": 0.0, + "step": 25451 + }, + { + "epoch": 2.3749183540169825, + "grad_norm": NaN, + "learning_rate": 0.00020678175482914408, + "loss": 0.0, + "step": 25452 + }, + { + "epoch": 2.3750116637118595, + "grad_norm": NaN, + "learning_rate": 0.0002067747538476113, + "loss": 0.0, + "step": 25453 + }, + { + "epoch": 2.375104973406737, + "grad_norm": NaN, + "learning_rate": 0.0002067677527217171, + "loss": 0.0, + "step": 25454 + }, + { + "epoch": 2.3751982831016143, + "grad_norm": NaN, + "learning_rate": 0.0002067607514514793, + "loss": 0.0, + "step": 25455 + }, + { + "epoch": 2.3752915927964917, + "grad_norm": NaN, + "learning_rate": 0.0002067537500369158, + "loss": 0.0, + "step": 25456 + }, + { + "epoch": 2.3753849024913687, + "grad_norm": NaN, + "learning_rate": 0.00020674674847804425, + "loss": 0.0, + "step": 25457 + }, + { + "epoch": 2.375478212186246, + "grad_norm": NaN, + "learning_rate": 0.00020673974677488257, + "loss": 0.0, + "step": 25458 + }, + { + "epoch": 2.3755715218811235, + "grad_norm": NaN, + "learning_rate": 0.00020673274492744853, + "loss": 0.0, + "step": 25459 + }, + { + "epoch": 2.3756648315760005, + "grad_norm": NaN, + "learning_rate": 0.00020672574293575987, + "loss": 0.0, + "step": 25460 + }, + { + "epoch": 2.375758141270878, + "grad_norm": NaN, + "learning_rate": 0.00020671874079983453, + "loss": 0.0, + "step": 25461 + }, + { + "epoch": 2.3758514509657553, + "grad_norm": NaN, + "learning_rate": 0.00020671173851969017, + "loss": 0.0, + "step": 25462 + }, + { + "epoch": 2.3759447606606328, + "grad_norm": NaN, + "learning_rate": 0.00020670473609534464, + "loss": 0.0, + "step": 25463 + }, + { + "epoch": 2.37603807035551, + "grad_norm": NaN, + "learning_rate": 0.00020669773352681583, + "loss": 0.0, + "step": 25464 + }, + { + "epoch": 2.376131380050387, + "grad_norm": NaN, + "learning_rate": 0.00020669073081412147, + "loss": 0.0, + "step": 25465 + }, + { + "epoch": 2.3762246897452646, + "grad_norm": NaN, + "learning_rate": 0.00020668372795727935, + "loss": 0.0, + "step": 25466 + }, + { + "epoch": 2.376317999440142, + "grad_norm": NaN, + "learning_rate": 0.0002066767249563073, + "loss": 0.0, + "step": 25467 + }, + { + "epoch": 2.376411309135019, + "grad_norm": NaN, + "learning_rate": 0.00020666972181122316, + "loss": 0.0, + "step": 25468 + }, + { + "epoch": 2.3765046188298964, + "grad_norm": NaN, + "learning_rate": 0.00020666271852204469, + "loss": 0.0, + "step": 25469 + }, + { + "epoch": 2.376597928524774, + "grad_norm": NaN, + "learning_rate": 0.00020665571508878967, + "loss": 0.0, + "step": 25470 + }, + { + "epoch": 2.3766912382196512, + "grad_norm": NaN, + "learning_rate": 0.000206648711511476, + "loss": 0.0, + "step": 25471 + }, + { + "epoch": 2.376784547914528, + "grad_norm": NaN, + "learning_rate": 0.00020664170779012138, + "loss": 0.0, + "step": 25472 + }, + { + "epoch": 2.3768778576094056, + "grad_norm": NaN, + "learning_rate": 0.00020663470392474372, + "loss": 0.0, + "step": 25473 + }, + { + "epoch": 2.376971167304283, + "grad_norm": NaN, + "learning_rate": 0.0002066276999153608, + "loss": 0.0, + "step": 25474 + }, + { + "epoch": 2.37706447699916, + "grad_norm": NaN, + "learning_rate": 0.00020662069576199036, + "loss": 0.0, + "step": 25475 + }, + { + "epoch": 2.3771577866940374, + "grad_norm": NaN, + "learning_rate": 0.00020661369146465026, + "loss": 0.0, + "step": 25476 + }, + { + "epoch": 2.377251096388915, + "grad_norm": NaN, + "learning_rate": 0.0002066066870233584, + "loss": 0.0, + "step": 25477 + }, + { + "epoch": 2.3773444060837923, + "grad_norm": NaN, + "learning_rate": 0.00020659968243813244, + "loss": 0.0, + "step": 25478 + }, + { + "epoch": 2.3774377157786692, + "grad_norm": NaN, + "learning_rate": 0.0002065926777089902, + "loss": 0.0, + "step": 25479 + }, + { + "epoch": 2.3775310254735467, + "grad_norm": NaN, + "learning_rate": 0.00020658567283594968, + "loss": 0.0, + "step": 25480 + }, + { + "epoch": 2.377624335168424, + "grad_norm": NaN, + "learning_rate": 0.00020657866781902847, + "loss": 0.0, + "step": 25481 + }, + { + "epoch": 2.377717644863301, + "grad_norm": NaN, + "learning_rate": 0.00020657166265824442, + "loss": 0.0, + "step": 25482 + }, + { + "epoch": 2.3778109545581785, + "grad_norm": NaN, + "learning_rate": 0.0002065646573536155, + "loss": 0.0, + "step": 25483 + }, + { + "epoch": 2.377904264253056, + "grad_norm": NaN, + "learning_rate": 0.0002065576519051594, + "loss": 0.0, + "step": 25484 + }, + { + "epoch": 2.3779975739479333, + "grad_norm": NaN, + "learning_rate": 0.00020655064631289386, + "loss": 0.0, + "step": 25485 + }, + { + "epoch": 2.3780908836428107, + "grad_norm": NaN, + "learning_rate": 0.00020654364057683683, + "loss": 0.0, + "step": 25486 + }, + { + "epoch": 2.3781841933376877, + "grad_norm": NaN, + "learning_rate": 0.00020653663469700608, + "loss": 0.0, + "step": 25487 + }, + { + "epoch": 2.378277503032565, + "grad_norm": NaN, + "learning_rate": 0.0002065296286734194, + "loss": 0.0, + "step": 25488 + }, + { + "epoch": 2.3783708127274426, + "grad_norm": NaN, + "learning_rate": 0.00020652262250609463, + "loss": 0.0, + "step": 25489 + }, + { + "epoch": 2.3784641224223195, + "grad_norm": NaN, + "learning_rate": 0.00020651561619504957, + "loss": 0.0, + "step": 25490 + }, + { + "epoch": 2.378557432117197, + "grad_norm": NaN, + "learning_rate": 0.00020650860974030203, + "loss": 0.0, + "step": 25491 + }, + { + "epoch": 2.3786507418120744, + "grad_norm": NaN, + "learning_rate": 0.0002065016031418698, + "loss": 0.0, + "step": 25492 + }, + { + "epoch": 2.378744051506952, + "grad_norm": NaN, + "learning_rate": 0.0002064945963997708, + "loss": 0.0, + "step": 25493 + }, + { + "epoch": 2.3788373612018288, + "grad_norm": NaN, + "learning_rate": 0.00020648758951402274, + "loss": 0.0, + "step": 25494 + }, + { + "epoch": 2.378930670896706, + "grad_norm": NaN, + "learning_rate": 0.00020648058248464343, + "loss": 0.0, + "step": 25495 + }, + { + "epoch": 2.3790239805915836, + "grad_norm": NaN, + "learning_rate": 0.00020647357531165082, + "loss": 0.0, + "step": 25496 + }, + { + "epoch": 2.3791172902864606, + "grad_norm": NaN, + "learning_rate": 0.0002064665679950626, + "loss": 0.0, + "step": 25497 + }, + { + "epoch": 2.379210599981338, + "grad_norm": NaN, + "learning_rate": 0.00020645956053489658, + "loss": 0.0, + "step": 25498 + }, + { + "epoch": 2.3793039096762154, + "grad_norm": NaN, + "learning_rate": 0.0002064525529311707, + "loss": 0.0, + "step": 25499 + }, + { + "epoch": 2.379397219371093, + "grad_norm": NaN, + "learning_rate": 0.00020644554518390264, + "loss": 0.0, + "step": 25500 + }, + { + "epoch": 2.37949052906597, + "grad_norm": NaN, + "learning_rate": 0.0002064385372931103, + "loss": 0.0, + "step": 25501 + }, + { + "epoch": 2.3795838387608472, + "grad_norm": NaN, + "learning_rate": 0.00020643152925881148, + "loss": 0.0, + "step": 25502 + }, + { + "epoch": 2.3796771484557246, + "grad_norm": NaN, + "learning_rate": 0.000206424521081024, + "loss": 0.0, + "step": 25503 + }, + { + "epoch": 2.3797704581506016, + "grad_norm": NaN, + "learning_rate": 0.00020641751275976566, + "loss": 0.0, + "step": 25504 + }, + { + "epoch": 2.379863767845479, + "grad_norm": NaN, + "learning_rate": 0.00020641050429505433, + "loss": 0.0, + "step": 25505 + }, + { + "epoch": 2.3799570775403565, + "grad_norm": NaN, + "learning_rate": 0.00020640349568690778, + "loss": 0.0, + "step": 25506 + }, + { + "epoch": 2.380050387235234, + "grad_norm": NaN, + "learning_rate": 0.00020639648693534382, + "loss": 0.0, + "step": 25507 + }, + { + "epoch": 2.380143696930111, + "grad_norm": NaN, + "learning_rate": 0.00020638947804038033, + "loss": 0.0, + "step": 25508 + }, + { + "epoch": 2.3802370066249883, + "grad_norm": NaN, + "learning_rate": 0.00020638246900203512, + "loss": 0.0, + "step": 25509 + }, + { + "epoch": 2.3803303163198657, + "grad_norm": NaN, + "learning_rate": 0.00020637545982032596, + "loss": 0.0, + "step": 25510 + }, + { + "epoch": 2.380423626014743, + "grad_norm": NaN, + "learning_rate": 0.0002063684504952707, + "loss": 0.0, + "step": 25511 + }, + { + "epoch": 2.38051693570962, + "grad_norm": NaN, + "learning_rate": 0.0002063614410268872, + "loss": 0.0, + "step": 25512 + }, + { + "epoch": 2.3806102454044975, + "grad_norm": NaN, + "learning_rate": 0.00020635443141519324, + "loss": 0.0, + "step": 25513 + }, + { + "epoch": 2.380703555099375, + "grad_norm": NaN, + "learning_rate": 0.0002063474216602066, + "loss": 0.0, + "step": 25514 + }, + { + "epoch": 2.3807968647942523, + "grad_norm": NaN, + "learning_rate": 0.00020634041176194524, + "loss": 0.0, + "step": 25515 + }, + { + "epoch": 2.3808901744891293, + "grad_norm": NaN, + "learning_rate": 0.0002063334017204269, + "loss": 0.0, + "step": 25516 + }, + { + "epoch": 2.3809834841840067, + "grad_norm": NaN, + "learning_rate": 0.00020632639153566934, + "loss": 0.0, + "step": 25517 + }, + { + "epoch": 2.381076793878884, + "grad_norm": NaN, + "learning_rate": 0.00020631938120769052, + "loss": 0.0, + "step": 25518 + }, + { + "epoch": 2.381170103573761, + "grad_norm": NaN, + "learning_rate": 0.00020631237073650814, + "loss": 0.0, + "step": 25519 + }, + { + "epoch": 2.3812634132686386, + "grad_norm": NaN, + "learning_rate": 0.0002063053601221401, + "loss": 0.0, + "step": 25520 + }, + { + "epoch": 2.381356722963516, + "grad_norm": NaN, + "learning_rate": 0.00020629834936460422, + "loss": 0.0, + "step": 25521 + }, + { + "epoch": 2.3814500326583934, + "grad_norm": NaN, + "learning_rate": 0.00020629133846391832, + "loss": 0.0, + "step": 25522 + }, + { + "epoch": 2.3815433423532704, + "grad_norm": NaN, + "learning_rate": 0.00020628432742010017, + "loss": 0.0, + "step": 25523 + }, + { + "epoch": 2.381636652048148, + "grad_norm": NaN, + "learning_rate": 0.0002062773162331677, + "loss": 0.0, + "step": 25524 + }, + { + "epoch": 2.381729961743025, + "grad_norm": NaN, + "learning_rate": 0.0002062703049031387, + "loss": 0.0, + "step": 25525 + }, + { + "epoch": 2.381823271437902, + "grad_norm": NaN, + "learning_rate": 0.0002062632934300309, + "loss": 0.0, + "step": 25526 + }, + { + "epoch": 2.3819165811327796, + "grad_norm": NaN, + "learning_rate": 0.0002062562818138623, + "loss": 0.0, + "step": 25527 + }, + { + "epoch": 2.382009890827657, + "grad_norm": NaN, + "learning_rate": 0.0002062492700546506, + "loss": 0.0, + "step": 25528 + }, + { + "epoch": 2.3821032005225344, + "grad_norm": NaN, + "learning_rate": 0.00020624225815241365, + "loss": 0.0, + "step": 25529 + }, + { + "epoch": 2.3821965102174114, + "grad_norm": NaN, + "learning_rate": 0.00020623524610716937, + "loss": 0.0, + "step": 25530 + }, + { + "epoch": 2.382289819912289, + "grad_norm": NaN, + "learning_rate": 0.00020622823391893547, + "loss": 0.0, + "step": 25531 + }, + { + "epoch": 2.3823831296071662, + "grad_norm": NaN, + "learning_rate": 0.00020622122158772983, + "loss": 0.0, + "step": 25532 + }, + { + "epoch": 2.3824764393020437, + "grad_norm": NaN, + "learning_rate": 0.0002062142091135702, + "loss": 0.0, + "step": 25533 + }, + { + "epoch": 2.3825697489969206, + "grad_norm": NaN, + "learning_rate": 0.00020620719649647464, + "loss": 0.0, + "step": 25534 + }, + { + "epoch": 2.382663058691798, + "grad_norm": NaN, + "learning_rate": 0.00020620018373646072, + "loss": 0.0, + "step": 25535 + }, + { + "epoch": 2.3827563683866755, + "grad_norm": NaN, + "learning_rate": 0.0002061931708335464, + "loss": 0.0, + "step": 25536 + }, + { + "epoch": 2.382849678081553, + "grad_norm": NaN, + "learning_rate": 0.00020618615778774952, + "loss": 0.0, + "step": 25537 + }, + { + "epoch": 2.38294298777643, + "grad_norm": NaN, + "learning_rate": 0.00020617914459908788, + "loss": 0.0, + "step": 25538 + }, + { + "epoch": 2.3830362974713073, + "grad_norm": NaN, + "learning_rate": 0.0002061721312675793, + "loss": 0.0, + "step": 25539 + }, + { + "epoch": 2.3831296071661847, + "grad_norm": NaN, + "learning_rate": 0.00020616511779324162, + "loss": 0.0, + "step": 25540 + }, + { + "epoch": 2.3832229168610617, + "grad_norm": NaN, + "learning_rate": 0.0002061581041760927, + "loss": 0.0, + "step": 25541 + }, + { + "epoch": 2.383316226555939, + "grad_norm": NaN, + "learning_rate": 0.00020615109041615033, + "loss": 0.0, + "step": 25542 + }, + { + "epoch": 2.3834095362508165, + "grad_norm": NaN, + "learning_rate": 0.0002061440765134324, + "loss": 0.0, + "step": 25543 + }, + { + "epoch": 2.383502845945694, + "grad_norm": NaN, + "learning_rate": 0.0002061370624679567, + "loss": 0.0, + "step": 25544 + }, + { + "epoch": 2.383596155640571, + "grad_norm": NaN, + "learning_rate": 0.0002061300482797411, + "loss": 0.0, + "step": 25545 + }, + { + "epoch": 2.3836894653354483, + "grad_norm": NaN, + "learning_rate": 0.0002061230339488034, + "loss": 0.0, + "step": 25546 + }, + { + "epoch": 2.3837827750303258, + "grad_norm": NaN, + "learning_rate": 0.00020611601947516145, + "loss": 0.0, + "step": 25547 + }, + { + "epoch": 2.3838760847252027, + "grad_norm": NaN, + "learning_rate": 0.00020610900485883306, + "loss": 0.0, + "step": 25548 + }, + { + "epoch": 2.38396939442008, + "grad_norm": NaN, + "learning_rate": 0.0002061019900998361, + "loss": 0.0, + "step": 25549 + }, + { + "epoch": 2.3840627041149576, + "grad_norm": NaN, + "learning_rate": 0.0002060949751981885, + "loss": 0.0, + "step": 25550 + }, + { + "epoch": 2.384156013809835, + "grad_norm": NaN, + "learning_rate": 0.00020608796015390786, + "loss": 0.0, + "step": 25551 + }, + { + "epoch": 2.384249323504712, + "grad_norm": NaN, + "learning_rate": 0.0002060809449670122, + "loss": 0.0, + "step": 25552 + }, + { + "epoch": 2.3843426331995894, + "grad_norm": NaN, + "learning_rate": 0.00020607392963751934, + "loss": 0.0, + "step": 25553 + }, + { + "epoch": 2.384435942894467, + "grad_norm": NaN, + "learning_rate": 0.00020606691416544704, + "loss": 0.0, + "step": 25554 + }, + { + "epoch": 2.384529252589344, + "grad_norm": NaN, + "learning_rate": 0.0002060598985508132, + "loss": 0.0, + "step": 25555 + }, + { + "epoch": 2.384622562284221, + "grad_norm": NaN, + "learning_rate": 0.0002060528827936357, + "loss": 0.0, + "step": 25556 + }, + { + "epoch": 2.3847158719790986, + "grad_norm": NaN, + "learning_rate": 0.00020604586689393223, + "loss": 0.0, + "step": 25557 + }, + { + "epoch": 2.384809181673976, + "grad_norm": NaN, + "learning_rate": 0.00020603885085172073, + "loss": 0.0, + "step": 25558 + }, + { + "epoch": 2.3849024913688535, + "grad_norm": NaN, + "learning_rate": 0.00020603183466701911, + "loss": 0.0, + "step": 25559 + }, + { + "epoch": 2.3849958010637304, + "grad_norm": NaN, + "learning_rate": 0.0002060248183398451, + "loss": 0.0, + "step": 25560 + }, + { + "epoch": 2.385089110758608, + "grad_norm": NaN, + "learning_rate": 0.00020601780187021654, + "loss": 0.0, + "step": 25561 + }, + { + "epoch": 2.3851824204534853, + "grad_norm": NaN, + "learning_rate": 0.00020601078525815135, + "loss": 0.0, + "step": 25562 + }, + { + "epoch": 2.3852757301483622, + "grad_norm": NaN, + "learning_rate": 0.0002060037685036673, + "loss": 0.0, + "step": 25563 + }, + { + "epoch": 2.3853690398432397, + "grad_norm": NaN, + "learning_rate": 0.00020599675160678222, + "loss": 0.0, + "step": 25564 + }, + { + "epoch": 2.385462349538117, + "grad_norm": NaN, + "learning_rate": 0.00020598973456751406, + "loss": 0.0, + "step": 25565 + }, + { + "epoch": 2.3855556592329945, + "grad_norm": NaN, + "learning_rate": 0.00020598271738588055, + "loss": 0.0, + "step": 25566 + }, + { + "epoch": 2.3856489689278715, + "grad_norm": NaN, + "learning_rate": 0.00020597570006189957, + "loss": 0.0, + "step": 25567 + }, + { + "epoch": 2.385742278622749, + "grad_norm": NaN, + "learning_rate": 0.00020596868259558894, + "loss": 0.0, + "step": 25568 + }, + { + "epoch": 2.3858355883176263, + "grad_norm": NaN, + "learning_rate": 0.0002059616649869666, + "loss": 0.0, + "step": 25569 + }, + { + "epoch": 2.3859288980125033, + "grad_norm": NaN, + "learning_rate": 0.00020595464723605026, + "loss": 0.0, + "step": 25570 + }, + { + "epoch": 2.3860222077073807, + "grad_norm": NaN, + "learning_rate": 0.00020594762934285786, + "loss": 0.0, + "step": 25571 + }, + { + "epoch": 2.386115517402258, + "grad_norm": NaN, + "learning_rate": 0.0002059406113074072, + "loss": 0.0, + "step": 25572 + }, + { + "epoch": 2.3862088270971356, + "grad_norm": NaN, + "learning_rate": 0.0002059335931297161, + "loss": 0.0, + "step": 25573 + }, + { + "epoch": 2.3863021367920125, + "grad_norm": NaN, + "learning_rate": 0.0002059265748098025, + "loss": 0.0, + "step": 25574 + }, + { + "epoch": 2.38639544648689, + "grad_norm": NaN, + "learning_rate": 0.0002059195563476842, + "loss": 0.0, + "step": 25575 + }, + { + "epoch": 2.3864887561817674, + "grad_norm": NaN, + "learning_rate": 0.000205912537743379, + "loss": 0.0, + "step": 25576 + }, + { + "epoch": 2.3865820658766443, + "grad_norm": NaN, + "learning_rate": 0.00020590551899690477, + "loss": 0.0, + "step": 25577 + }, + { + "epoch": 2.3866753755715218, + "grad_norm": NaN, + "learning_rate": 0.0002058985001082794, + "loss": 0.0, + "step": 25578 + }, + { + "epoch": 2.386768685266399, + "grad_norm": NaN, + "learning_rate": 0.00020589148107752069, + "loss": 0.0, + "step": 25579 + }, + { + "epoch": 2.3868619949612766, + "grad_norm": NaN, + "learning_rate": 0.00020588446190464646, + "loss": 0.0, + "step": 25580 + }, + { + "epoch": 2.386955304656154, + "grad_norm": NaN, + "learning_rate": 0.00020587744258967463, + "loss": 0.0, + "step": 25581 + }, + { + "epoch": 2.387048614351031, + "grad_norm": NaN, + "learning_rate": 0.00020587042313262302, + "loss": 0.0, + "step": 25582 + }, + { + "epoch": 2.3871419240459084, + "grad_norm": NaN, + "learning_rate": 0.00020586340353350945, + "loss": 0.0, + "step": 25583 + }, + { + "epoch": 2.387235233740786, + "grad_norm": NaN, + "learning_rate": 0.00020585638379235182, + "loss": 0.0, + "step": 25584 + }, + { + "epoch": 2.387328543435663, + "grad_norm": NaN, + "learning_rate": 0.00020584936390916796, + "loss": 0.0, + "step": 25585 + }, + { + "epoch": 2.3874218531305402, + "grad_norm": NaN, + "learning_rate": 0.00020584234388397566, + "loss": 0.0, + "step": 25586 + }, + { + "epoch": 2.3875151628254176, + "grad_norm": NaN, + "learning_rate": 0.00020583532371679284, + "loss": 0.0, + "step": 25587 + }, + { + "epoch": 2.387608472520295, + "grad_norm": NaN, + "learning_rate": 0.00020582830340763736, + "loss": 0.0, + "step": 25588 + }, + { + "epoch": 2.387701782215172, + "grad_norm": NaN, + "learning_rate": 0.00020582128295652702, + "loss": 0.0, + "step": 25589 + }, + { + "epoch": 2.3877950919100495, + "grad_norm": NaN, + "learning_rate": 0.0002058142623634797, + "loss": 0.0, + "step": 25590 + }, + { + "epoch": 2.387888401604927, + "grad_norm": NaN, + "learning_rate": 0.00020580724162851328, + "loss": 0.0, + "step": 25591 + }, + { + "epoch": 2.387981711299804, + "grad_norm": NaN, + "learning_rate": 0.0002058002207516455, + "loss": 0.0, + "step": 25592 + }, + { + "epoch": 2.3880750209946813, + "grad_norm": NaN, + "learning_rate": 0.00020579319973289435, + "loss": 0.0, + "step": 25593 + }, + { + "epoch": 2.3881683306895587, + "grad_norm": NaN, + "learning_rate": 0.0002057861785722776, + "loss": 0.0, + "step": 25594 + }, + { + "epoch": 2.388261640384436, + "grad_norm": NaN, + "learning_rate": 0.0002057791572698131, + "loss": 0.0, + "step": 25595 + }, + { + "epoch": 2.388354950079313, + "grad_norm": NaN, + "learning_rate": 0.00020577213582551875, + "loss": 0.0, + "step": 25596 + }, + { + "epoch": 2.3884482597741905, + "grad_norm": NaN, + "learning_rate": 0.0002057651142394124, + "loss": 0.0, + "step": 25597 + }, + { + "epoch": 2.388541569469068, + "grad_norm": NaN, + "learning_rate": 0.00020575809251151183, + "loss": 0.0, + "step": 25598 + }, + { + "epoch": 2.388634879163945, + "grad_norm": NaN, + "learning_rate": 0.00020575107064183497, + "loss": 0.0, + "step": 25599 + }, + { + "epoch": 2.3887281888588223, + "grad_norm": NaN, + "learning_rate": 0.0002057440486303997, + "loss": 0.0, + "step": 25600 + }, + { + "epoch": 2.3888214985536997, + "grad_norm": NaN, + "learning_rate": 0.00020573702647722376, + "loss": 0.0, + "step": 25601 + }, + { + "epoch": 2.388914808248577, + "grad_norm": NaN, + "learning_rate": 0.00020573000418232508, + "loss": 0.0, + "step": 25602 + }, + { + "epoch": 2.389008117943454, + "grad_norm": NaN, + "learning_rate": 0.00020572298174572156, + "loss": 0.0, + "step": 25603 + }, + { + "epoch": 2.3891014276383316, + "grad_norm": NaN, + "learning_rate": 0.00020571595916743095, + "loss": 0.0, + "step": 25604 + }, + { + "epoch": 2.389194737333209, + "grad_norm": NaN, + "learning_rate": 0.0002057089364474712, + "loss": 0.0, + "step": 25605 + }, + { + "epoch": 2.3892880470280864, + "grad_norm": NaN, + "learning_rate": 0.00020570191358586007, + "loss": 0.0, + "step": 25606 + }, + { + "epoch": 2.3893813567229634, + "grad_norm": NaN, + "learning_rate": 0.00020569489058261555, + "loss": 0.0, + "step": 25607 + }, + { + "epoch": 2.389474666417841, + "grad_norm": NaN, + "learning_rate": 0.00020568786743775536, + "loss": 0.0, + "step": 25608 + }, + { + "epoch": 2.389567976112718, + "grad_norm": NaN, + "learning_rate": 0.00020568084415129744, + "loss": 0.0, + "step": 25609 + }, + { + "epoch": 2.3896612858075956, + "grad_norm": NaN, + "learning_rate": 0.00020567382072325967, + "loss": 0.0, + "step": 25610 + }, + { + "epoch": 2.3897545955024726, + "grad_norm": NaN, + "learning_rate": 0.00020566679715365978, + "loss": 0.0, + "step": 25611 + }, + { + "epoch": 2.38984790519735, + "grad_norm": NaN, + "learning_rate": 0.00020565977344251575, + "loss": 0.0, + "step": 25612 + }, + { + "epoch": 2.3899412148922274, + "grad_norm": NaN, + "learning_rate": 0.00020565274958984545, + "loss": 0.0, + "step": 25613 + }, + { + "epoch": 2.3900345245871044, + "grad_norm": NaN, + "learning_rate": 0.00020564572559566663, + "loss": 0.0, + "step": 25614 + }, + { + "epoch": 2.390127834281982, + "grad_norm": NaN, + "learning_rate": 0.00020563870145999722, + "loss": 0.0, + "step": 25615 + }, + { + "epoch": 2.3902211439768593, + "grad_norm": NaN, + "learning_rate": 0.00020563167718285516, + "loss": 0.0, + "step": 25616 + }, + { + "epoch": 2.3903144536717367, + "grad_norm": NaN, + "learning_rate": 0.0002056246527642581, + "loss": 0.0, + "step": 25617 + }, + { + "epoch": 2.3904077633666136, + "grad_norm": NaN, + "learning_rate": 0.00020561762820422408, + "loss": 0.0, + "step": 25618 + }, + { + "epoch": 2.390501073061491, + "grad_norm": NaN, + "learning_rate": 0.00020561060350277096, + "loss": 0.0, + "step": 25619 + }, + { + "epoch": 2.3905943827563685, + "grad_norm": NaN, + "learning_rate": 0.00020560357865991645, + "loss": 0.0, + "step": 25620 + }, + { + "epoch": 2.3906876924512455, + "grad_norm": NaN, + "learning_rate": 0.00020559655367567858, + "loss": 0.0, + "step": 25621 + }, + { + "epoch": 2.390781002146123, + "grad_norm": NaN, + "learning_rate": 0.00020558952855007513, + "loss": 0.0, + "step": 25622 + }, + { + "epoch": 2.3908743118410003, + "grad_norm": NaN, + "learning_rate": 0.00020558250328312397, + "loss": 0.0, + "step": 25623 + }, + { + "epoch": 2.3909676215358777, + "grad_norm": NaN, + "learning_rate": 0.0002055754778748429, + "loss": 0.0, + "step": 25624 + }, + { + "epoch": 2.3910609312307547, + "grad_norm": NaN, + "learning_rate": 0.00020556845232524998, + "loss": 0.0, + "step": 25625 + }, + { + "epoch": 2.391154240925632, + "grad_norm": NaN, + "learning_rate": 0.0002055614266343629, + "loss": 0.0, + "step": 25626 + }, + { + "epoch": 2.3912475506205095, + "grad_norm": NaN, + "learning_rate": 0.00020555440080219953, + "loss": 0.0, + "step": 25627 + }, + { + "epoch": 2.391340860315387, + "grad_norm": NaN, + "learning_rate": 0.0002055473748287778, + "loss": 0.0, + "step": 25628 + }, + { + "epoch": 2.391434170010264, + "grad_norm": NaN, + "learning_rate": 0.00020554034871411554, + "loss": 0.0, + "step": 25629 + }, + { + "epoch": 2.3915274797051413, + "grad_norm": NaN, + "learning_rate": 0.00020553332245823062, + "loss": 0.0, + "step": 25630 + }, + { + "epoch": 2.3916207894000188, + "grad_norm": NaN, + "learning_rate": 0.00020552629606114095, + "loss": 0.0, + "step": 25631 + }, + { + "epoch": 2.391714099094896, + "grad_norm": NaN, + "learning_rate": 0.00020551926952286435, + "loss": 0.0, + "step": 25632 + }, + { + "epoch": 2.391807408789773, + "grad_norm": NaN, + "learning_rate": 0.00020551224284341864, + "loss": 0.0, + "step": 25633 + }, + { + "epoch": 2.3919007184846506, + "grad_norm": NaN, + "learning_rate": 0.0002055052160228218, + "loss": 0.0, + "step": 25634 + }, + { + "epoch": 2.391994028179528, + "grad_norm": NaN, + "learning_rate": 0.00020549818906109163, + "loss": 0.0, + "step": 25635 + }, + { + "epoch": 2.392087337874405, + "grad_norm": NaN, + "learning_rate": 0.00020549116195824594, + "loss": 0.0, + "step": 25636 + }, + { + "epoch": 2.3921806475692824, + "grad_norm": NaN, + "learning_rate": 0.00020548413471430273, + "loss": 0.0, + "step": 25637 + }, + { + "epoch": 2.39227395726416, + "grad_norm": NaN, + "learning_rate": 0.00020547710732927982, + "loss": 0.0, + "step": 25638 + }, + { + "epoch": 2.3923672669590372, + "grad_norm": NaN, + "learning_rate": 0.000205470079803195, + "loss": 0.0, + "step": 25639 + }, + { + "epoch": 2.392460576653914, + "grad_norm": NaN, + "learning_rate": 0.0002054630521360662, + "loss": 0.0, + "step": 25640 + }, + { + "epoch": 2.3925538863487916, + "grad_norm": NaN, + "learning_rate": 0.00020545602432791134, + "loss": 0.0, + "step": 25641 + }, + { + "epoch": 2.392647196043669, + "grad_norm": NaN, + "learning_rate": 0.00020544899637874817, + "loss": 0.0, + "step": 25642 + }, + { + "epoch": 2.392740505738546, + "grad_norm": NaN, + "learning_rate": 0.00020544196828859468, + "loss": 0.0, + "step": 25643 + }, + { + "epoch": 2.3928338154334234, + "grad_norm": NaN, + "learning_rate": 0.00020543494005746866, + "loss": 0.0, + "step": 25644 + }, + { + "epoch": 2.392927125128301, + "grad_norm": NaN, + "learning_rate": 0.00020542791168538807, + "loss": 0.0, + "step": 25645 + }, + { + "epoch": 2.3930204348231783, + "grad_norm": NaN, + "learning_rate": 0.00020542088317237065, + "loss": 0.0, + "step": 25646 + }, + { + "epoch": 2.3931137445180553, + "grad_norm": NaN, + "learning_rate": 0.00020541385451843433, + "loss": 0.0, + "step": 25647 + }, + { + "epoch": 2.3932070542129327, + "grad_norm": NaN, + "learning_rate": 0.00020540682572359707, + "loss": 0.0, + "step": 25648 + }, + { + "epoch": 2.39330036390781, + "grad_norm": NaN, + "learning_rate": 0.00020539979678787658, + "loss": 0.0, + "step": 25649 + }, + { + "epoch": 2.3933936736026875, + "grad_norm": NaN, + "learning_rate": 0.00020539276771129088, + "loss": 0.0, + "step": 25650 + }, + { + "epoch": 2.3934869832975645, + "grad_norm": NaN, + "learning_rate": 0.00020538573849385777, + "loss": 0.0, + "step": 25651 + }, + { + "epoch": 2.393580292992442, + "grad_norm": NaN, + "learning_rate": 0.00020537870913559506, + "loss": 0.0, + "step": 25652 + }, + { + "epoch": 2.3936736026873193, + "grad_norm": NaN, + "learning_rate": 0.0002053716796365208, + "loss": 0.0, + "step": 25653 + }, + { + "epoch": 2.3937669123821967, + "grad_norm": NaN, + "learning_rate": 0.00020536464999665273, + "loss": 0.0, + "step": 25654 + }, + { + "epoch": 2.3938602220770737, + "grad_norm": NaN, + "learning_rate": 0.0002053576202160087, + "loss": 0.0, + "step": 25655 + }, + { + "epoch": 2.393953531771951, + "grad_norm": NaN, + "learning_rate": 0.0002053505902946067, + "loss": 0.0, + "step": 25656 + }, + { + "epoch": 2.3940468414668286, + "grad_norm": NaN, + "learning_rate": 0.00020534356023246456, + "loss": 0.0, + "step": 25657 + }, + { + "epoch": 2.3941401511617055, + "grad_norm": NaN, + "learning_rate": 0.00020533653002960007, + "loss": 0.0, + "step": 25658 + }, + { + "epoch": 2.394233460856583, + "grad_norm": NaN, + "learning_rate": 0.00020532949968603124, + "loss": 0.0, + "step": 25659 + }, + { + "epoch": 2.3943267705514604, + "grad_norm": NaN, + "learning_rate": 0.00020532246920177587, + "loss": 0.0, + "step": 25660 + }, + { + "epoch": 2.394420080246338, + "grad_norm": NaN, + "learning_rate": 0.00020531543857685183, + "loss": 0.0, + "step": 25661 + }, + { + "epoch": 2.3945133899412148, + "grad_norm": NaN, + "learning_rate": 0.00020530840781127698, + "loss": 0.0, + "step": 25662 + }, + { + "epoch": 2.394606699636092, + "grad_norm": NaN, + "learning_rate": 0.0002053013769050693, + "loss": 0.0, + "step": 25663 + }, + { + "epoch": 2.3947000093309696, + "grad_norm": NaN, + "learning_rate": 0.00020529434585824655, + "loss": 0.0, + "step": 25664 + }, + { + "epoch": 2.3947933190258466, + "grad_norm": NaN, + "learning_rate": 0.0002052873146708267, + "loss": 0.0, + "step": 25665 + }, + { + "epoch": 2.394886628720724, + "grad_norm": NaN, + "learning_rate": 0.00020528028334282757, + "loss": 0.0, + "step": 25666 + }, + { + "epoch": 2.3949799384156014, + "grad_norm": NaN, + "learning_rate": 0.0002052732518742671, + "loss": 0.0, + "step": 25667 + }, + { + "epoch": 2.395073248110479, + "grad_norm": NaN, + "learning_rate": 0.000205266220265163, + "loss": 0.0, + "step": 25668 + }, + { + "epoch": 2.395166557805356, + "grad_norm": NaN, + "learning_rate": 0.00020525918851553338, + "loss": 0.0, + "step": 25669 + }, + { + "epoch": 2.3952598675002332, + "grad_norm": NaN, + "learning_rate": 0.00020525215662539598, + "loss": 0.0, + "step": 25670 + }, + { + "epoch": 2.3953531771951106, + "grad_norm": NaN, + "learning_rate": 0.0002052451245947687, + "loss": 0.0, + "step": 25671 + }, + { + "epoch": 2.3954464868899876, + "grad_norm": NaN, + "learning_rate": 0.00020523809242366947, + "loss": 0.0, + "step": 25672 + }, + { + "epoch": 2.395539796584865, + "grad_norm": NaN, + "learning_rate": 0.00020523106011211613, + "loss": 0.0, + "step": 25673 + }, + { + "epoch": 2.3956331062797425, + "grad_norm": NaN, + "learning_rate": 0.00020522402766012652, + "loss": 0.0, + "step": 25674 + }, + { + "epoch": 2.39572641597462, + "grad_norm": NaN, + "learning_rate": 0.00020521699506771855, + "loss": 0.0, + "step": 25675 + }, + { + "epoch": 2.3958197256694973, + "grad_norm": NaN, + "learning_rate": 0.00020520996233491023, + "loss": 0.0, + "step": 25676 + }, + { + "epoch": 2.3959130353643743, + "grad_norm": NaN, + "learning_rate": 0.00020520292946171924, + "loss": 0.0, + "step": 25677 + }, + { + "epoch": 2.3960063450592517, + "grad_norm": NaN, + "learning_rate": 0.00020519589644816356, + "loss": 0.0, + "step": 25678 + }, + { + "epoch": 2.396099654754129, + "grad_norm": NaN, + "learning_rate": 0.00020518886329426108, + "loss": 0.0, + "step": 25679 + }, + { + "epoch": 2.396192964449006, + "grad_norm": NaN, + "learning_rate": 0.00020518183000002964, + "loss": 0.0, + "step": 25680 + }, + { + "epoch": 2.3962862741438835, + "grad_norm": NaN, + "learning_rate": 0.0002051747965654872, + "loss": 0.0, + "step": 25681 + }, + { + "epoch": 2.396379583838761, + "grad_norm": NaN, + "learning_rate": 0.00020516776299065159, + "loss": 0.0, + "step": 25682 + }, + { + "epoch": 2.3964728935336383, + "grad_norm": NaN, + "learning_rate": 0.00020516072927554067, + "loss": 0.0, + "step": 25683 + }, + { + "epoch": 2.3965662032285153, + "grad_norm": NaN, + "learning_rate": 0.00020515369542017237, + "loss": 0.0, + "step": 25684 + }, + { + "epoch": 2.3966595129233927, + "grad_norm": NaN, + "learning_rate": 0.00020514666142456455, + "loss": 0.0, + "step": 25685 + }, + { + "epoch": 2.39675282261827, + "grad_norm": NaN, + "learning_rate": 0.00020513962728873513, + "loss": 0.0, + "step": 25686 + }, + { + "epoch": 2.396846132313147, + "grad_norm": NaN, + "learning_rate": 0.00020513259301270192, + "loss": 0.0, + "step": 25687 + }, + { + "epoch": 2.3969394420080246, + "grad_norm": NaN, + "learning_rate": 0.00020512555859648294, + "loss": 0.0, + "step": 25688 + }, + { + "epoch": 2.397032751702902, + "grad_norm": NaN, + "learning_rate": 0.00020511852404009597, + "loss": 0.0, + "step": 25689 + }, + { + "epoch": 2.3971260613977794, + "grad_norm": NaN, + "learning_rate": 0.00020511148934355885, + "loss": 0.0, + "step": 25690 + }, + { + "epoch": 2.3972193710926564, + "grad_norm": NaN, + "learning_rate": 0.0002051044545068896, + "loss": 0.0, + "step": 25691 + }, + { + "epoch": 2.397312680787534, + "grad_norm": NaN, + "learning_rate": 0.00020509741953010608, + "loss": 0.0, + "step": 25692 + }, + { + "epoch": 2.397405990482411, + "grad_norm": NaN, + "learning_rate": 0.00020509038441322605, + "loss": 0.0, + "step": 25693 + }, + { + "epoch": 2.397499300177288, + "grad_norm": NaN, + "learning_rate": 0.00020508334915626757, + "loss": 0.0, + "step": 25694 + }, + { + "epoch": 2.3975926098721656, + "grad_norm": NaN, + "learning_rate": 0.0002050763137592484, + "loss": 0.0, + "step": 25695 + }, + { + "epoch": 2.397685919567043, + "grad_norm": NaN, + "learning_rate": 0.0002050692782221865, + "loss": 0.0, + "step": 25696 + }, + { + "epoch": 2.3977792292619204, + "grad_norm": NaN, + "learning_rate": 0.00020506224254509972, + "loss": 0.0, + "step": 25697 + }, + { + "epoch": 2.397872538956798, + "grad_norm": NaN, + "learning_rate": 0.00020505520672800602, + "loss": 0.0, + "step": 25698 + }, + { + "epoch": 2.397965848651675, + "grad_norm": NaN, + "learning_rate": 0.00020504817077092317, + "loss": 0.0, + "step": 25699 + }, + { + "epoch": 2.3980591583465523, + "grad_norm": NaN, + "learning_rate": 0.00020504113467386913, + "loss": 0.0, + "step": 25700 + }, + { + "epoch": 2.3981524680414297, + "grad_norm": NaN, + "learning_rate": 0.00020503409843686184, + "loss": 0.0, + "step": 25701 + }, + { + "epoch": 2.3982457777363066, + "grad_norm": NaN, + "learning_rate": 0.00020502706205991912, + "loss": 0.0, + "step": 25702 + }, + { + "epoch": 2.398339087431184, + "grad_norm": NaN, + "learning_rate": 0.0002050200255430589, + "loss": 0.0, + "step": 25703 + }, + { + "epoch": 2.3984323971260615, + "grad_norm": NaN, + "learning_rate": 0.00020501298888629901, + "loss": 0.0, + "step": 25704 + }, + { + "epoch": 2.398525706820939, + "grad_norm": NaN, + "learning_rate": 0.0002050059520896574, + "loss": 0.0, + "step": 25705 + }, + { + "epoch": 2.398619016515816, + "grad_norm": NaN, + "learning_rate": 0.00020499891515315195, + "loss": 0.0, + "step": 25706 + }, + { + "epoch": 2.3987123262106933, + "grad_norm": NaN, + "learning_rate": 0.00020499187807680058, + "loss": 0.0, + "step": 25707 + }, + { + "epoch": 2.3988056359055707, + "grad_norm": NaN, + "learning_rate": 0.0002049848408606211, + "loss": 0.0, + "step": 25708 + }, + { + "epoch": 2.3988989456004477, + "grad_norm": NaN, + "learning_rate": 0.00020497780350463152, + "loss": 0.0, + "step": 25709 + }, + { + "epoch": 2.398992255295325, + "grad_norm": NaN, + "learning_rate": 0.00020497076600884965, + "loss": 0.0, + "step": 25710 + }, + { + "epoch": 2.3990855649902025, + "grad_norm": NaN, + "learning_rate": 0.00020496372837329336, + "loss": 0.0, + "step": 25711 + }, + { + "epoch": 2.39917887468508, + "grad_norm": NaN, + "learning_rate": 0.00020495669059798064, + "loss": 0.0, + "step": 25712 + }, + { + "epoch": 2.399272184379957, + "grad_norm": NaN, + "learning_rate": 0.00020494965268292933, + "loss": 0.0, + "step": 25713 + }, + { + "epoch": 2.3993654940748343, + "grad_norm": NaN, + "learning_rate": 0.00020494261462815735, + "loss": 0.0, + "step": 25714 + }, + { + "epoch": 2.3994588037697118, + "grad_norm": NaN, + "learning_rate": 0.00020493557643368253, + "loss": 0.0, + "step": 25715 + }, + { + "epoch": 2.3995521134645887, + "grad_norm": NaN, + "learning_rate": 0.0002049285380995228, + "loss": 0.0, + "step": 25716 + }, + { + "epoch": 2.399645423159466, + "grad_norm": NaN, + "learning_rate": 0.00020492149962569615, + "loss": 0.0, + "step": 25717 + }, + { + "epoch": 2.3997387328543436, + "grad_norm": NaN, + "learning_rate": 0.00020491446101222028, + "loss": 0.0, + "step": 25718 + }, + { + "epoch": 2.399832042549221, + "grad_norm": NaN, + "learning_rate": 0.00020490742225911328, + "loss": 0.0, + "step": 25719 + }, + { + "epoch": 2.399925352244098, + "grad_norm": NaN, + "learning_rate": 0.00020490038336639296, + "loss": 0.0, + "step": 25720 + }, + { + "epoch": 2.4000186619389754, + "grad_norm": NaN, + "learning_rate": 0.0002048933443340772, + "loss": 0.0, + "step": 25721 + }, + { + "epoch": 2.400111971633853, + "grad_norm": NaN, + "learning_rate": 0.00020488630516218395, + "loss": 0.0, + "step": 25722 + }, + { + "epoch": 2.4002052813287302, + "grad_norm": NaN, + "learning_rate": 0.0002048792658507311, + "loss": 0.0, + "step": 25723 + }, + { + "epoch": 2.400298591023607, + "grad_norm": NaN, + "learning_rate": 0.0002048722263997365, + "loss": 0.0, + "step": 25724 + }, + { + "epoch": 2.4003919007184846, + "grad_norm": NaN, + "learning_rate": 0.00020486518680921812, + "loss": 0.0, + "step": 25725 + }, + { + "epoch": 2.400485210413362, + "grad_norm": NaN, + "learning_rate": 0.0002048581470791938, + "loss": 0.0, + "step": 25726 + }, + { + "epoch": 2.4005785201082395, + "grad_norm": NaN, + "learning_rate": 0.0002048511072096815, + "loss": 0.0, + "step": 25727 + }, + { + "epoch": 2.4006718298031164, + "grad_norm": NaN, + "learning_rate": 0.00020484406720069903, + "loss": 0.0, + "step": 25728 + }, + { + "epoch": 2.400765139497994, + "grad_norm": NaN, + "learning_rate": 0.00020483702705226435, + "loss": 0.0, + "step": 25729 + }, + { + "epoch": 2.4008584491928713, + "grad_norm": NaN, + "learning_rate": 0.00020482998676439536, + "loss": 0.0, + "step": 25730 + }, + { + "epoch": 2.4009517588877483, + "grad_norm": NaN, + "learning_rate": 0.00020482294633710995, + "loss": 0.0, + "step": 25731 + }, + { + "epoch": 2.4010450685826257, + "grad_norm": NaN, + "learning_rate": 0.00020481590577042604, + "loss": 0.0, + "step": 25732 + }, + { + "epoch": 2.401138378277503, + "grad_norm": NaN, + "learning_rate": 0.00020480886506436154, + "loss": 0.0, + "step": 25733 + }, + { + "epoch": 2.4012316879723805, + "grad_norm": NaN, + "learning_rate": 0.0002048018242189343, + "loss": 0.0, + "step": 25734 + }, + { + "epoch": 2.4013249976672575, + "grad_norm": NaN, + "learning_rate": 0.00020479478323416226, + "loss": 0.0, + "step": 25735 + }, + { + "epoch": 2.401418307362135, + "grad_norm": NaN, + "learning_rate": 0.00020478774211006332, + "loss": 0.0, + "step": 25736 + }, + { + "epoch": 2.4015116170570123, + "grad_norm": NaN, + "learning_rate": 0.0002047807008466554, + "loss": 0.0, + "step": 25737 + }, + { + "epoch": 2.4016049267518893, + "grad_norm": NaN, + "learning_rate": 0.00020477365944395634, + "loss": 0.0, + "step": 25738 + }, + { + "epoch": 2.4016982364467667, + "grad_norm": NaN, + "learning_rate": 0.0002047666179019841, + "loss": 0.0, + "step": 25739 + }, + { + "epoch": 2.401791546141644, + "grad_norm": NaN, + "learning_rate": 0.00020475957622075657, + "loss": 0.0, + "step": 25740 + }, + { + "epoch": 2.4018848558365216, + "grad_norm": NaN, + "learning_rate": 0.0002047525344002917, + "loss": 0.0, + "step": 25741 + }, + { + "epoch": 2.4019781655313985, + "grad_norm": NaN, + "learning_rate": 0.00020474549244060733, + "loss": 0.0, + "step": 25742 + }, + { + "epoch": 2.402071475226276, + "grad_norm": NaN, + "learning_rate": 0.00020473845034172137, + "loss": 0.0, + "step": 25743 + }, + { + "epoch": 2.4021647849211534, + "grad_norm": NaN, + "learning_rate": 0.0002047314081036518, + "loss": 0.0, + "step": 25744 + }, + { + "epoch": 2.402258094616031, + "grad_norm": NaN, + "learning_rate": 0.00020472436572641642, + "loss": 0.0, + "step": 25745 + }, + { + "epoch": 2.4023514043109078, + "grad_norm": NaN, + "learning_rate": 0.00020471732321003324, + "loss": 0.0, + "step": 25746 + }, + { + "epoch": 2.402444714005785, + "grad_norm": NaN, + "learning_rate": 0.00020471028055452006, + "loss": 0.0, + "step": 25747 + }, + { + "epoch": 2.4025380237006626, + "grad_norm": NaN, + "learning_rate": 0.00020470323775989487, + "loss": 0.0, + "step": 25748 + }, + { + "epoch": 2.40263133339554, + "grad_norm": NaN, + "learning_rate": 0.00020469619482617554, + "loss": 0.0, + "step": 25749 + }, + { + "epoch": 2.402724643090417, + "grad_norm": NaN, + "learning_rate": 0.00020468915175338002, + "loss": 0.0, + "step": 25750 + }, + { + "epoch": 2.4028179527852944, + "grad_norm": NaN, + "learning_rate": 0.00020468210854152614, + "loss": 0.0, + "step": 25751 + }, + { + "epoch": 2.402911262480172, + "grad_norm": NaN, + "learning_rate": 0.0002046750651906319, + "loss": 0.0, + "step": 25752 + }, + { + "epoch": 2.403004572175049, + "grad_norm": NaN, + "learning_rate": 0.00020466802170071514, + "loss": 0.0, + "step": 25753 + }, + { + "epoch": 2.4030978818699262, + "grad_norm": NaN, + "learning_rate": 0.00020466097807179379, + "loss": 0.0, + "step": 25754 + }, + { + "epoch": 2.4031911915648037, + "grad_norm": NaN, + "learning_rate": 0.00020465393430388576, + "loss": 0.0, + "step": 25755 + }, + { + "epoch": 2.403284501259681, + "grad_norm": NaN, + "learning_rate": 0.00020464689039700898, + "loss": 0.0, + "step": 25756 + }, + { + "epoch": 2.403377810954558, + "grad_norm": NaN, + "learning_rate": 0.00020463984635118136, + "loss": 0.0, + "step": 25757 + }, + { + "epoch": 2.4034711206494355, + "grad_norm": NaN, + "learning_rate": 0.00020463280216642077, + "loss": 0.0, + "step": 25758 + }, + { + "epoch": 2.403564430344313, + "grad_norm": NaN, + "learning_rate": 0.00020462575784274516, + "loss": 0.0, + "step": 25759 + }, + { + "epoch": 2.40365774003919, + "grad_norm": NaN, + "learning_rate": 0.00020461871338017242, + "loss": 0.0, + "step": 25760 + }, + { + "epoch": 2.4037510497340673, + "grad_norm": NaN, + "learning_rate": 0.00020461166877872047, + "loss": 0.0, + "step": 25761 + }, + { + "epoch": 2.4038443594289447, + "grad_norm": NaN, + "learning_rate": 0.00020460462403840723, + "loss": 0.0, + "step": 25762 + }, + { + "epoch": 2.403937669123822, + "grad_norm": NaN, + "learning_rate": 0.0002045975791592506, + "loss": 0.0, + "step": 25763 + }, + { + "epoch": 2.404030978818699, + "grad_norm": NaN, + "learning_rate": 0.0002045905341412685, + "loss": 0.0, + "step": 25764 + }, + { + "epoch": 2.4041242885135765, + "grad_norm": NaN, + "learning_rate": 0.00020458348898447887, + "loss": 0.0, + "step": 25765 + }, + { + "epoch": 2.404217598208454, + "grad_norm": NaN, + "learning_rate": 0.00020457644368889957, + "loss": 0.0, + "step": 25766 + }, + { + "epoch": 2.404310907903331, + "grad_norm": NaN, + "learning_rate": 0.00020456939825454852, + "loss": 0.0, + "step": 25767 + }, + { + "epoch": 2.4044042175982083, + "grad_norm": NaN, + "learning_rate": 0.00020456235268144368, + "loss": 0.0, + "step": 25768 + }, + { + "epoch": 2.4044975272930857, + "grad_norm": NaN, + "learning_rate": 0.00020455530696960293, + "loss": 0.0, + "step": 25769 + }, + { + "epoch": 2.404590836987963, + "grad_norm": NaN, + "learning_rate": 0.00020454826111904425, + "loss": 0.0, + "step": 25770 + }, + { + "epoch": 2.4046841466828406, + "grad_norm": NaN, + "learning_rate": 0.00020454121512978544, + "loss": 0.0, + "step": 25771 + }, + { + "epoch": 2.4047774563777176, + "grad_norm": NaN, + "learning_rate": 0.00020453416900184448, + "loss": 0.0, + "step": 25772 + }, + { + "epoch": 2.404870766072595, + "grad_norm": NaN, + "learning_rate": 0.0002045271227352393, + "loss": 0.0, + "step": 25773 + }, + { + "epoch": 2.4049640757674724, + "grad_norm": NaN, + "learning_rate": 0.0002045200763299878, + "loss": 0.0, + "step": 25774 + }, + { + "epoch": 2.4050573854623494, + "grad_norm": NaN, + "learning_rate": 0.00020451302978610788, + "loss": 0.0, + "step": 25775 + }, + { + "epoch": 2.405150695157227, + "grad_norm": NaN, + "learning_rate": 0.00020450598310361748, + "loss": 0.0, + "step": 25776 + }, + { + "epoch": 2.405244004852104, + "grad_norm": NaN, + "learning_rate": 0.00020449893628253453, + "loss": 0.0, + "step": 25777 + }, + { + "epoch": 2.4053373145469816, + "grad_norm": NaN, + "learning_rate": 0.0002044918893228769, + "loss": 0.0, + "step": 25778 + }, + { + "epoch": 2.4054306242418586, + "grad_norm": NaN, + "learning_rate": 0.00020448484222466255, + "loss": 0.0, + "step": 25779 + }, + { + "epoch": 2.405523933936736, + "grad_norm": NaN, + "learning_rate": 0.0002044777949879094, + "loss": 0.0, + "step": 25780 + }, + { + "epoch": 2.4056172436316134, + "grad_norm": NaN, + "learning_rate": 0.00020447074761263532, + "loss": 0.0, + "step": 25781 + }, + { + "epoch": 2.4057105533264904, + "grad_norm": NaN, + "learning_rate": 0.00020446370009885828, + "loss": 0.0, + "step": 25782 + }, + { + "epoch": 2.405803863021368, + "grad_norm": NaN, + "learning_rate": 0.00020445665244659618, + "loss": 0.0, + "step": 25783 + }, + { + "epoch": 2.4058971727162453, + "grad_norm": NaN, + "learning_rate": 0.00020444960465586695, + "loss": 0.0, + "step": 25784 + }, + { + "epoch": 2.4059904824111227, + "grad_norm": NaN, + "learning_rate": 0.0002044425567266885, + "loss": 0.0, + "step": 25785 + }, + { + "epoch": 2.4060837921059997, + "grad_norm": NaN, + "learning_rate": 0.00020443550865907877, + "loss": 0.0, + "step": 25786 + }, + { + "epoch": 2.406177101800877, + "grad_norm": NaN, + "learning_rate": 0.00020442846045305566, + "loss": 0.0, + "step": 25787 + }, + { + "epoch": 2.4062704114957545, + "grad_norm": NaN, + "learning_rate": 0.0002044214121086371, + "loss": 0.0, + "step": 25788 + }, + { + "epoch": 2.4063637211906315, + "grad_norm": NaN, + "learning_rate": 0.00020441436362584098, + "loss": 0.0, + "step": 25789 + }, + { + "epoch": 2.406457030885509, + "grad_norm": NaN, + "learning_rate": 0.0002044073150046853, + "loss": 0.0, + "step": 25790 + }, + { + "epoch": 2.4065503405803863, + "grad_norm": NaN, + "learning_rate": 0.0002044002662451879, + "loss": 0.0, + "step": 25791 + }, + { + "epoch": 2.4066436502752637, + "grad_norm": NaN, + "learning_rate": 0.00020439321734736678, + "loss": 0.0, + "step": 25792 + }, + { + "epoch": 2.406736959970141, + "grad_norm": NaN, + "learning_rate": 0.00020438616831123976, + "loss": 0.0, + "step": 25793 + }, + { + "epoch": 2.406830269665018, + "grad_norm": NaN, + "learning_rate": 0.00020437911913682487, + "loss": 0.0, + "step": 25794 + }, + { + "epoch": 2.4069235793598955, + "grad_norm": NaN, + "learning_rate": 0.00020437206982413996, + "loss": 0.0, + "step": 25795 + }, + { + "epoch": 2.407016889054773, + "grad_norm": NaN, + "learning_rate": 0.00020436502037320302, + "loss": 0.0, + "step": 25796 + }, + { + "epoch": 2.40711019874965, + "grad_norm": NaN, + "learning_rate": 0.0002043579707840319, + "loss": 0.0, + "step": 25797 + }, + { + "epoch": 2.4072035084445274, + "grad_norm": NaN, + "learning_rate": 0.0002043509210566446, + "loss": 0.0, + "step": 25798 + }, + { + "epoch": 2.4072968181394048, + "grad_norm": NaN, + "learning_rate": 0.00020434387119105894, + "loss": 0.0, + "step": 25799 + }, + { + "epoch": 2.407390127834282, + "grad_norm": NaN, + "learning_rate": 0.00020433682118729297, + "loss": 0.0, + "step": 25800 + }, + { + "epoch": 2.407483437529159, + "grad_norm": NaN, + "learning_rate": 0.00020432977104536455, + "loss": 0.0, + "step": 25801 + }, + { + "epoch": 2.4075767472240366, + "grad_norm": NaN, + "learning_rate": 0.00020432272076529158, + "loss": 0.0, + "step": 25802 + }, + { + "epoch": 2.407670056918914, + "grad_norm": NaN, + "learning_rate": 0.00020431567034709207, + "loss": 0.0, + "step": 25803 + }, + { + "epoch": 2.407763366613791, + "grad_norm": NaN, + "learning_rate": 0.00020430861979078388, + "loss": 0.0, + "step": 25804 + }, + { + "epoch": 2.4078566763086684, + "grad_norm": NaN, + "learning_rate": 0.00020430156909638495, + "loss": 0.0, + "step": 25805 + }, + { + "epoch": 2.407949986003546, + "grad_norm": NaN, + "learning_rate": 0.00020429451826391325, + "loss": 0.0, + "step": 25806 + }, + { + "epoch": 2.4080432956984232, + "grad_norm": NaN, + "learning_rate": 0.00020428746729338662, + "loss": 0.0, + "step": 25807 + }, + { + "epoch": 2.4081366053933, + "grad_norm": NaN, + "learning_rate": 0.0002042804161848231, + "loss": 0.0, + "step": 25808 + }, + { + "epoch": 2.4082299150881776, + "grad_norm": NaN, + "learning_rate": 0.0002042733649382405, + "loss": 0.0, + "step": 25809 + }, + { + "epoch": 2.408323224783055, + "grad_norm": NaN, + "learning_rate": 0.00020426631355365687, + "loss": 0.0, + "step": 25810 + }, + { + "epoch": 2.408416534477932, + "grad_norm": NaN, + "learning_rate": 0.00020425926203109006, + "loss": 0.0, + "step": 25811 + }, + { + "epoch": 2.4085098441728094, + "grad_norm": NaN, + "learning_rate": 0.00020425221037055798, + "loss": 0.0, + "step": 25812 + }, + { + "epoch": 2.408603153867687, + "grad_norm": NaN, + "learning_rate": 0.00020424515857207862, + "loss": 0.0, + "step": 25813 + }, + { + "epoch": 2.4086964635625643, + "grad_norm": NaN, + "learning_rate": 0.00020423810663566994, + "loss": 0.0, + "step": 25814 + }, + { + "epoch": 2.4087897732574413, + "grad_norm": NaN, + "learning_rate": 0.00020423105456134977, + "loss": 0.0, + "step": 25815 + }, + { + "epoch": 2.4088830829523187, + "grad_norm": NaN, + "learning_rate": 0.0002042240023491361, + "loss": 0.0, + "step": 25816 + }, + { + "epoch": 2.408976392647196, + "grad_norm": NaN, + "learning_rate": 0.0002042169499990469, + "loss": 0.0, + "step": 25817 + }, + { + "epoch": 2.4090697023420735, + "grad_norm": NaN, + "learning_rate": 0.00020420989751109998, + "loss": 0.0, + "step": 25818 + }, + { + "epoch": 2.4091630120369505, + "grad_norm": NaN, + "learning_rate": 0.0002042028448853134, + "loss": 0.0, + "step": 25819 + }, + { + "epoch": 2.409256321731828, + "grad_norm": NaN, + "learning_rate": 0.000204195792121705, + "loss": 0.0, + "step": 25820 + }, + { + "epoch": 2.4093496314267053, + "grad_norm": NaN, + "learning_rate": 0.0002041887392202928, + "loss": 0.0, + "step": 25821 + }, + { + "epoch": 2.4094429411215827, + "grad_norm": NaN, + "learning_rate": 0.00020418168618109467, + "loss": 0.0, + "step": 25822 + }, + { + "epoch": 2.4095362508164597, + "grad_norm": NaN, + "learning_rate": 0.00020417463300412855, + "loss": 0.0, + "step": 25823 + }, + { + "epoch": 2.409629560511337, + "grad_norm": NaN, + "learning_rate": 0.0002041675796894124, + "loss": 0.0, + "step": 25824 + }, + { + "epoch": 2.4097228702062146, + "grad_norm": NaN, + "learning_rate": 0.00020416052623696416, + "loss": 0.0, + "step": 25825 + }, + { + "epoch": 2.4098161799010915, + "grad_norm": NaN, + "learning_rate": 0.00020415347264680168, + "loss": 0.0, + "step": 25826 + }, + { + "epoch": 2.409909489595969, + "grad_norm": NaN, + "learning_rate": 0.000204146418918943, + "loss": 0.0, + "step": 25827 + }, + { + "epoch": 2.4100027992908464, + "grad_norm": NaN, + "learning_rate": 0.00020413936505340603, + "loss": 0.0, + "step": 25828 + }, + { + "epoch": 2.410096108985724, + "grad_norm": NaN, + "learning_rate": 0.0002041323110502087, + "loss": 0.0, + "step": 25829 + }, + { + "epoch": 2.4101894186806008, + "grad_norm": NaN, + "learning_rate": 0.0002041252569093689, + "loss": 0.0, + "step": 25830 + }, + { + "epoch": 2.410282728375478, + "grad_norm": NaN, + "learning_rate": 0.00020411820263090464, + "loss": 0.0, + "step": 25831 + }, + { + "epoch": 2.4103760380703556, + "grad_norm": NaN, + "learning_rate": 0.0002041111482148338, + "loss": 0.0, + "step": 25832 + }, + { + "epoch": 2.4104693477652326, + "grad_norm": NaN, + "learning_rate": 0.00020410409366117434, + "loss": 0.0, + "step": 25833 + }, + { + "epoch": 2.41056265746011, + "grad_norm": NaN, + "learning_rate": 0.0002040970389699442, + "loss": 0.0, + "step": 25834 + }, + { + "epoch": 2.4106559671549874, + "grad_norm": NaN, + "learning_rate": 0.00020408998414116132, + "loss": 0.0, + "step": 25835 + }, + { + "epoch": 2.410749276849865, + "grad_norm": NaN, + "learning_rate": 0.00020408292917484364, + "loss": 0.0, + "step": 25836 + }, + { + "epoch": 2.410842586544742, + "grad_norm": NaN, + "learning_rate": 0.00020407587407100908, + "loss": 0.0, + "step": 25837 + }, + { + "epoch": 2.4109358962396192, + "grad_norm": NaN, + "learning_rate": 0.00020406881882967557, + "loss": 0.0, + "step": 25838 + }, + { + "epoch": 2.4110292059344967, + "grad_norm": NaN, + "learning_rate": 0.0002040617634508611, + "loss": 0.0, + "step": 25839 + }, + { + "epoch": 2.411122515629374, + "grad_norm": NaN, + "learning_rate": 0.00020405470793458358, + "loss": 0.0, + "step": 25840 + }, + { + "epoch": 2.411215825324251, + "grad_norm": NaN, + "learning_rate": 0.00020404765228086092, + "loss": 0.0, + "step": 25841 + }, + { + "epoch": 2.4113091350191285, + "grad_norm": NaN, + "learning_rate": 0.00020404059648971112, + "loss": 0.0, + "step": 25842 + }, + { + "epoch": 2.411402444714006, + "grad_norm": NaN, + "learning_rate": 0.00020403354056115208, + "loss": 0.0, + "step": 25843 + }, + { + "epoch": 2.4114957544088833, + "grad_norm": NaN, + "learning_rate": 0.00020402648449520177, + "loss": 0.0, + "step": 25844 + }, + { + "epoch": 2.4115890641037603, + "grad_norm": NaN, + "learning_rate": 0.0002040194282918781, + "loss": 0.0, + "step": 25845 + }, + { + "epoch": 2.4116823737986377, + "grad_norm": NaN, + "learning_rate": 0.000204012371951199, + "loss": 0.0, + "step": 25846 + }, + { + "epoch": 2.411775683493515, + "grad_norm": NaN, + "learning_rate": 0.00020400531547318249, + "loss": 0.0, + "step": 25847 + }, + { + "epoch": 2.411868993188392, + "grad_norm": NaN, + "learning_rate": 0.00020399825885784642, + "loss": 0.0, + "step": 25848 + }, + { + "epoch": 2.4119623028832695, + "grad_norm": NaN, + "learning_rate": 0.00020399120210520878, + "loss": 0.0, + "step": 25849 + }, + { + "epoch": 2.412055612578147, + "grad_norm": NaN, + "learning_rate": 0.0002039841452152875, + "loss": 0.0, + "step": 25850 + }, + { + "epoch": 2.4121489222730244, + "grad_norm": NaN, + "learning_rate": 0.00020397708818810056, + "loss": 0.0, + "step": 25851 + }, + { + "epoch": 2.4122422319679013, + "grad_norm": NaN, + "learning_rate": 0.00020397003102366588, + "loss": 0.0, + "step": 25852 + }, + { + "epoch": 2.4123355416627787, + "grad_norm": NaN, + "learning_rate": 0.00020396297372200135, + "loss": 0.0, + "step": 25853 + }, + { + "epoch": 2.412428851357656, + "grad_norm": NaN, + "learning_rate": 0.00020395591628312497, + "loss": 0.0, + "step": 25854 + }, + { + "epoch": 2.412522161052533, + "grad_norm": NaN, + "learning_rate": 0.00020394885870705473, + "loss": 0.0, + "step": 25855 + }, + { + "epoch": 2.4126154707474106, + "grad_norm": NaN, + "learning_rate": 0.00020394180099380846, + "loss": 0.0, + "step": 25856 + }, + { + "epoch": 2.412708780442288, + "grad_norm": NaN, + "learning_rate": 0.0002039347431434042, + "loss": 0.0, + "step": 25857 + }, + { + "epoch": 2.4128020901371654, + "grad_norm": NaN, + "learning_rate": 0.00020392768515585988, + "loss": 0.0, + "step": 25858 + }, + { + "epoch": 2.4128953998320424, + "grad_norm": NaN, + "learning_rate": 0.00020392062703119342, + "loss": 0.0, + "step": 25859 + }, + { + "epoch": 2.41298870952692, + "grad_norm": NaN, + "learning_rate": 0.00020391356876942277, + "loss": 0.0, + "step": 25860 + }, + { + "epoch": 2.413082019221797, + "grad_norm": NaN, + "learning_rate": 0.00020390651037056586, + "loss": 0.0, + "step": 25861 + }, + { + "epoch": 2.4131753289166746, + "grad_norm": NaN, + "learning_rate": 0.00020389945183464067, + "loss": 0.0, + "step": 25862 + }, + { + "epoch": 2.4132686386115516, + "grad_norm": NaN, + "learning_rate": 0.00020389239316166518, + "loss": 0.0, + "step": 25863 + }, + { + "epoch": 2.413361948306429, + "grad_norm": NaN, + "learning_rate": 0.00020388533435165726, + "loss": 0.0, + "step": 25864 + }, + { + "epoch": 2.4134552580013064, + "grad_norm": NaN, + "learning_rate": 0.0002038782754046349, + "loss": 0.0, + "step": 25865 + }, + { + "epoch": 2.413548567696184, + "grad_norm": NaN, + "learning_rate": 0.00020387121632061607, + "loss": 0.0, + "step": 25866 + }, + { + "epoch": 2.413641877391061, + "grad_norm": NaN, + "learning_rate": 0.00020386415709961869, + "loss": 0.0, + "step": 25867 + }, + { + "epoch": 2.4137351870859383, + "grad_norm": NaN, + "learning_rate": 0.00020385709774166068, + "loss": 0.0, + "step": 25868 + }, + { + "epoch": 2.4138284967808157, + "grad_norm": NaN, + "learning_rate": 0.00020385003824676008, + "loss": 0.0, + "step": 25869 + }, + { + "epoch": 2.4139218064756927, + "grad_norm": NaN, + "learning_rate": 0.00020384297861493475, + "loss": 0.0, + "step": 25870 + }, + { + "epoch": 2.41401511617057, + "grad_norm": NaN, + "learning_rate": 0.00020383591884620269, + "loss": 0.0, + "step": 25871 + }, + { + "epoch": 2.4141084258654475, + "grad_norm": NaN, + "learning_rate": 0.00020382885894058183, + "loss": 0.0, + "step": 25872 + }, + { + "epoch": 2.414201735560325, + "grad_norm": NaN, + "learning_rate": 0.00020382179889809015, + "loss": 0.0, + "step": 25873 + }, + { + "epoch": 2.414295045255202, + "grad_norm": NaN, + "learning_rate": 0.00020381473871874551, + "loss": 0.0, + "step": 25874 + }, + { + "epoch": 2.4143883549500793, + "grad_norm": NaN, + "learning_rate": 0.000203807678402566, + "loss": 0.0, + "step": 25875 + }, + { + "epoch": 2.4144816646449567, + "grad_norm": NaN, + "learning_rate": 0.00020380061794956947, + "loss": 0.0, + "step": 25876 + }, + { + "epoch": 2.4145749743398337, + "grad_norm": NaN, + "learning_rate": 0.0002037935573597739, + "loss": 0.0, + "step": 25877 + }, + { + "epoch": 2.414668284034711, + "grad_norm": NaN, + "learning_rate": 0.00020378649663319726, + "loss": 0.0, + "step": 25878 + }, + { + "epoch": 2.4147615937295885, + "grad_norm": NaN, + "learning_rate": 0.00020377943576985752, + "loss": 0.0, + "step": 25879 + }, + { + "epoch": 2.414854903424466, + "grad_norm": NaN, + "learning_rate": 0.00020377237476977255, + "loss": 0.0, + "step": 25880 + }, + { + "epoch": 2.414948213119343, + "grad_norm": NaN, + "learning_rate": 0.00020376531363296038, + "loss": 0.0, + "step": 25881 + }, + { + "epoch": 2.4150415228142204, + "grad_norm": NaN, + "learning_rate": 0.00020375825235943897, + "loss": 0.0, + "step": 25882 + }, + { + "epoch": 2.4151348325090978, + "grad_norm": NaN, + "learning_rate": 0.00020375119094922622, + "loss": 0.0, + "step": 25883 + }, + { + "epoch": 2.4152281422039747, + "grad_norm": NaN, + "learning_rate": 0.0002037441294023401, + "loss": 0.0, + "step": 25884 + }, + { + "epoch": 2.415321451898852, + "grad_norm": NaN, + "learning_rate": 0.00020373706771879862, + "loss": 0.0, + "step": 25885 + }, + { + "epoch": 2.4154147615937296, + "grad_norm": NaN, + "learning_rate": 0.00020373000589861968, + "loss": 0.0, + "step": 25886 + }, + { + "epoch": 2.415508071288607, + "grad_norm": NaN, + "learning_rate": 0.0002037229439418212, + "loss": 0.0, + "step": 25887 + }, + { + "epoch": 2.4156013809834844, + "grad_norm": NaN, + "learning_rate": 0.00020371588184842128, + "loss": 0.0, + "step": 25888 + }, + { + "epoch": 2.4156946906783614, + "grad_norm": NaN, + "learning_rate": 0.00020370881961843774, + "loss": 0.0, + "step": 25889 + }, + { + "epoch": 2.415788000373239, + "grad_norm": NaN, + "learning_rate": 0.00020370175725188853, + "loss": 0.0, + "step": 25890 + }, + { + "epoch": 2.4158813100681162, + "grad_norm": NaN, + "learning_rate": 0.00020369469474879176, + "loss": 0.0, + "step": 25891 + }, + { + "epoch": 2.415974619762993, + "grad_norm": NaN, + "learning_rate": 0.00020368763210916523, + "loss": 0.0, + "step": 25892 + }, + { + "epoch": 2.4160679294578706, + "grad_norm": NaN, + "learning_rate": 0.00020368056933302696, + "loss": 0.0, + "step": 25893 + }, + { + "epoch": 2.416161239152748, + "grad_norm": NaN, + "learning_rate": 0.0002036735064203949, + "loss": 0.0, + "step": 25894 + }, + { + "epoch": 2.4162545488476255, + "grad_norm": NaN, + "learning_rate": 0.00020366644337128703, + "loss": 0.0, + "step": 25895 + }, + { + "epoch": 2.4163478585425024, + "grad_norm": NaN, + "learning_rate": 0.00020365938018572125, + "loss": 0.0, + "step": 25896 + }, + { + "epoch": 2.41644116823738, + "grad_norm": NaN, + "learning_rate": 0.00020365231686371563, + "loss": 0.0, + "step": 25897 + }, + { + "epoch": 2.4165344779322573, + "grad_norm": NaN, + "learning_rate": 0.00020364525340528798, + "loss": 0.0, + "step": 25898 + }, + { + "epoch": 2.4166277876271343, + "grad_norm": NaN, + "learning_rate": 0.0002036381898104564, + "loss": 0.0, + "step": 25899 + }, + { + "epoch": 2.4167210973220117, + "grad_norm": NaN, + "learning_rate": 0.00020363112607923878, + "loss": 0.0, + "step": 25900 + }, + { + "epoch": 2.416814407016889, + "grad_norm": NaN, + "learning_rate": 0.0002036240622116531, + "loss": 0.0, + "step": 25901 + }, + { + "epoch": 2.4169077167117665, + "grad_norm": NaN, + "learning_rate": 0.0002036169982077173, + "loss": 0.0, + "step": 25902 + }, + { + "epoch": 2.4170010264066435, + "grad_norm": NaN, + "learning_rate": 0.00020360993406744933, + "loss": 0.0, + "step": 25903 + }, + { + "epoch": 2.417094336101521, + "grad_norm": NaN, + "learning_rate": 0.00020360286979086723, + "loss": 0.0, + "step": 25904 + }, + { + "epoch": 2.4171876457963983, + "grad_norm": NaN, + "learning_rate": 0.00020359580537798887, + "loss": 0.0, + "step": 25905 + }, + { + "epoch": 2.4172809554912753, + "grad_norm": NaN, + "learning_rate": 0.00020358874082883224, + "loss": 0.0, + "step": 25906 + }, + { + "epoch": 2.4173742651861527, + "grad_norm": NaN, + "learning_rate": 0.00020358167614341541, + "loss": 0.0, + "step": 25907 + }, + { + "epoch": 2.41746757488103, + "grad_norm": NaN, + "learning_rate": 0.0002035746113217562, + "loss": 0.0, + "step": 25908 + }, + { + "epoch": 2.4175608845759076, + "grad_norm": NaN, + "learning_rate": 0.00020356754636387253, + "loss": 0.0, + "step": 25909 + }, + { + "epoch": 2.417654194270785, + "grad_norm": NaN, + "learning_rate": 0.00020356048126978258, + "loss": 0.0, + "step": 25910 + }, + { + "epoch": 2.417747503965662, + "grad_norm": NaN, + "learning_rate": 0.00020355341603950415, + "loss": 0.0, + "step": 25911 + }, + { + "epoch": 2.4178408136605394, + "grad_norm": NaN, + "learning_rate": 0.00020354635067305522, + "loss": 0.0, + "step": 25912 + }, + { + "epoch": 2.417934123355417, + "grad_norm": NaN, + "learning_rate": 0.00020353928517045382, + "loss": 0.0, + "step": 25913 + }, + { + "epoch": 2.4180274330502938, + "grad_norm": NaN, + "learning_rate": 0.0002035322195317179, + "loss": 0.0, + "step": 25914 + }, + { + "epoch": 2.418120742745171, + "grad_norm": NaN, + "learning_rate": 0.00020352515375686537, + "loss": 0.0, + "step": 25915 + }, + { + "epoch": 2.4182140524400486, + "grad_norm": NaN, + "learning_rate": 0.00020351808784591422, + "loss": 0.0, + "step": 25916 + }, + { + "epoch": 2.418307362134926, + "grad_norm": NaN, + "learning_rate": 0.0002035110217988824, + "loss": 0.0, + "step": 25917 + }, + { + "epoch": 2.418400671829803, + "grad_norm": NaN, + "learning_rate": 0.00020350395561578795, + "loss": 0.0, + "step": 25918 + }, + { + "epoch": 2.4184939815246804, + "grad_norm": NaN, + "learning_rate": 0.00020349688929664878, + "loss": 0.0, + "step": 25919 + }, + { + "epoch": 2.418587291219558, + "grad_norm": NaN, + "learning_rate": 0.00020348982284148286, + "loss": 0.0, + "step": 25920 + }, + { + "epoch": 2.418680600914435, + "grad_norm": NaN, + "learning_rate": 0.00020348275625030817, + "loss": 0.0, + "step": 25921 + }, + { + "epoch": 2.4187739106093122, + "grad_norm": NaN, + "learning_rate": 0.00020347568952314263, + "loss": 0.0, + "step": 25922 + }, + { + "epoch": 2.4188672203041897, + "grad_norm": NaN, + "learning_rate": 0.00020346862266000434, + "loss": 0.0, + "step": 25923 + }, + { + "epoch": 2.418960529999067, + "grad_norm": NaN, + "learning_rate": 0.00020346155566091116, + "loss": 0.0, + "step": 25924 + }, + { + "epoch": 2.419053839693944, + "grad_norm": NaN, + "learning_rate": 0.000203454488525881, + "loss": 0.0, + "step": 25925 + }, + { + "epoch": 2.4191471493888215, + "grad_norm": NaN, + "learning_rate": 0.000203447421254932, + "loss": 0.0, + "step": 25926 + }, + { + "epoch": 2.419240459083699, + "grad_norm": NaN, + "learning_rate": 0.00020344035384808198, + "loss": 0.0, + "step": 25927 + }, + { + "epoch": 2.419333768778576, + "grad_norm": NaN, + "learning_rate": 0.00020343328630534902, + "loss": 0.0, + "step": 25928 + }, + { + "epoch": 2.4194270784734533, + "grad_norm": NaN, + "learning_rate": 0.00020342621862675105, + "loss": 0.0, + "step": 25929 + }, + { + "epoch": 2.4195203881683307, + "grad_norm": NaN, + "learning_rate": 0.00020341915081230596, + "loss": 0.0, + "step": 25930 + }, + { + "epoch": 2.419613697863208, + "grad_norm": NaN, + "learning_rate": 0.0002034120828620318, + "loss": 0.0, + "step": 25931 + }, + { + "epoch": 2.419707007558085, + "grad_norm": NaN, + "learning_rate": 0.0002034050147759466, + "loss": 0.0, + "step": 25932 + }, + { + "epoch": 2.4198003172529625, + "grad_norm": NaN, + "learning_rate": 0.00020339794655406824, + "loss": 0.0, + "step": 25933 + }, + { + "epoch": 2.41989362694784, + "grad_norm": NaN, + "learning_rate": 0.0002033908781964147, + "loss": 0.0, + "step": 25934 + }, + { + "epoch": 2.4199869366427174, + "grad_norm": NaN, + "learning_rate": 0.000203383809703004, + "loss": 0.0, + "step": 25935 + }, + { + "epoch": 2.4200802463375943, + "grad_norm": NaN, + "learning_rate": 0.0002033767410738541, + "loss": 0.0, + "step": 25936 + }, + { + "epoch": 2.4201735560324718, + "grad_norm": NaN, + "learning_rate": 0.0002033696723089829, + "loss": 0.0, + "step": 25937 + }, + { + "epoch": 2.420266865727349, + "grad_norm": NaN, + "learning_rate": 0.00020336260340840848, + "loss": 0.0, + "step": 25938 + }, + { + "epoch": 2.4203601754222266, + "grad_norm": NaN, + "learning_rate": 0.00020335553437214872, + "loss": 0.0, + "step": 25939 + }, + { + "epoch": 2.4204534851171036, + "grad_norm": NaN, + "learning_rate": 0.0002033484652002217, + "loss": 0.0, + "step": 25940 + }, + { + "epoch": 2.420546794811981, + "grad_norm": NaN, + "learning_rate": 0.00020334139589264528, + "loss": 0.0, + "step": 25941 + }, + { + "epoch": 2.4206401045068584, + "grad_norm": NaN, + "learning_rate": 0.00020333432644943755, + "loss": 0.0, + "step": 25942 + }, + { + "epoch": 2.4207334142017354, + "grad_norm": NaN, + "learning_rate": 0.00020332725687061638, + "loss": 0.0, + "step": 25943 + }, + { + "epoch": 2.420826723896613, + "grad_norm": NaN, + "learning_rate": 0.00020332018715619977, + "loss": 0.0, + "step": 25944 + }, + { + "epoch": 2.42092003359149, + "grad_norm": NaN, + "learning_rate": 0.00020331311730620582, + "loss": 0.0, + "step": 25945 + }, + { + "epoch": 2.4210133432863676, + "grad_norm": NaN, + "learning_rate": 0.0002033060473206523, + "loss": 0.0, + "step": 25946 + }, + { + "epoch": 2.4211066529812446, + "grad_norm": NaN, + "learning_rate": 0.00020329897719955733, + "loss": 0.0, + "step": 25947 + }, + { + "epoch": 2.421199962676122, + "grad_norm": NaN, + "learning_rate": 0.00020329190694293886, + "loss": 0.0, + "step": 25948 + }, + { + "epoch": 2.4212932723709994, + "grad_norm": NaN, + "learning_rate": 0.0002032848365508149, + "loss": 0.0, + "step": 25949 + }, + { + "epoch": 2.4213865820658764, + "grad_norm": NaN, + "learning_rate": 0.00020327776602320328, + "loss": 0.0, + "step": 25950 + }, + { + "epoch": 2.421479891760754, + "grad_norm": NaN, + "learning_rate": 0.00020327069536012215, + "loss": 0.0, + "step": 25951 + }, + { + "epoch": 2.4215732014556313, + "grad_norm": NaN, + "learning_rate": 0.0002032636245615894, + "loss": 0.0, + "step": 25952 + }, + { + "epoch": 2.4216665111505087, + "grad_norm": NaN, + "learning_rate": 0.00020325655362762304, + "loss": 0.0, + "step": 25953 + }, + { + "epoch": 2.4217598208453857, + "grad_norm": NaN, + "learning_rate": 0.00020324948255824108, + "loss": 0.0, + "step": 25954 + }, + { + "epoch": 2.421853130540263, + "grad_norm": NaN, + "learning_rate": 0.00020324241135346137, + "loss": 0.0, + "step": 25955 + }, + { + "epoch": 2.4219464402351405, + "grad_norm": NaN, + "learning_rate": 0.00020323534001330203, + "loss": 0.0, + "step": 25956 + }, + { + "epoch": 2.422039749930018, + "grad_norm": NaN, + "learning_rate": 0.000203228268537781, + "loss": 0.0, + "step": 25957 + }, + { + "epoch": 2.422133059624895, + "grad_norm": NaN, + "learning_rate": 0.00020322119692691627, + "loss": 0.0, + "step": 25958 + }, + { + "epoch": 2.4222263693197723, + "grad_norm": NaN, + "learning_rate": 0.00020321412518072577, + "loss": 0.0, + "step": 25959 + }, + { + "epoch": 2.4223196790146497, + "grad_norm": NaN, + "learning_rate": 0.00020320705329922745, + "loss": 0.0, + "step": 25960 + }, + { + "epoch": 2.422412988709527, + "grad_norm": NaN, + "learning_rate": 0.0002031999812824395, + "loss": 0.0, + "step": 25961 + }, + { + "epoch": 2.422506298404404, + "grad_norm": NaN, + "learning_rate": 0.00020319290913037964, + "loss": 0.0, + "step": 25962 + }, + { + "epoch": 2.4225996080992815, + "grad_norm": NaN, + "learning_rate": 0.000203185836843066, + "loss": 0.0, + "step": 25963 + }, + { + "epoch": 2.422692917794159, + "grad_norm": NaN, + "learning_rate": 0.00020317876442051654, + "loss": 0.0, + "step": 25964 + }, + { + "epoch": 2.422786227489036, + "grad_norm": NaN, + "learning_rate": 0.00020317169186274923, + "loss": 0.0, + "step": 25965 + }, + { + "epoch": 2.4228795371839134, + "grad_norm": NaN, + "learning_rate": 0.00020316461916978202, + "loss": 0.0, + "step": 25966 + }, + { + "epoch": 2.4229728468787908, + "grad_norm": NaN, + "learning_rate": 0.000203157546341633, + "loss": 0.0, + "step": 25967 + }, + { + "epoch": 2.423066156573668, + "grad_norm": NaN, + "learning_rate": 0.00020315047337832005, + "loss": 0.0, + "step": 25968 + }, + { + "epoch": 2.423159466268545, + "grad_norm": NaN, + "learning_rate": 0.0002031434002798612, + "loss": 0.0, + "step": 25969 + }, + { + "epoch": 2.4232527759634226, + "grad_norm": NaN, + "learning_rate": 0.00020313632704627442, + "loss": 0.0, + "step": 25970 + }, + { + "epoch": 2.4233460856583, + "grad_norm": NaN, + "learning_rate": 0.00020312925367757772, + "loss": 0.0, + "step": 25971 + }, + { + "epoch": 2.423439395353177, + "grad_norm": NaN, + "learning_rate": 0.00020312218017378903, + "loss": 0.0, + "step": 25972 + }, + { + "epoch": 2.4235327050480544, + "grad_norm": NaN, + "learning_rate": 0.0002031151065349264, + "loss": 0.0, + "step": 25973 + }, + { + "epoch": 2.423626014742932, + "grad_norm": NaN, + "learning_rate": 0.0002031080327610078, + "loss": 0.0, + "step": 25974 + }, + { + "epoch": 2.4237193244378092, + "grad_norm": NaN, + "learning_rate": 0.00020310095885205115, + "loss": 0.0, + "step": 25975 + }, + { + "epoch": 2.423812634132686, + "grad_norm": NaN, + "learning_rate": 0.00020309388480807455, + "loss": 0.0, + "step": 25976 + }, + { + "epoch": 2.4239059438275636, + "grad_norm": NaN, + "learning_rate": 0.00020308681062909589, + "loss": 0.0, + "step": 25977 + }, + { + "epoch": 2.423999253522441, + "grad_norm": NaN, + "learning_rate": 0.00020307973631513319, + "loss": 0.0, + "step": 25978 + }, + { + "epoch": 2.424092563217318, + "grad_norm": NaN, + "learning_rate": 0.00020307266186620444, + "loss": 0.0, + "step": 25979 + }, + { + "epoch": 2.4241858729121954, + "grad_norm": NaN, + "learning_rate": 0.0002030655872823277, + "loss": 0.0, + "step": 25980 + }, + { + "epoch": 2.424279182607073, + "grad_norm": NaN, + "learning_rate": 0.00020305851256352084, + "loss": 0.0, + "step": 25981 + }, + { + "epoch": 2.4243724923019503, + "grad_norm": NaN, + "learning_rate": 0.00020305143770980188, + "loss": 0.0, + "step": 25982 + }, + { + "epoch": 2.4244658019968277, + "grad_norm": NaN, + "learning_rate": 0.00020304436272118888, + "loss": 0.0, + "step": 25983 + }, + { + "epoch": 2.4245591116917047, + "grad_norm": NaN, + "learning_rate": 0.00020303728759769977, + "loss": 0.0, + "step": 25984 + }, + { + "epoch": 2.424652421386582, + "grad_norm": NaN, + "learning_rate": 0.0002030302123393525, + "loss": 0.0, + "step": 25985 + }, + { + "epoch": 2.4247457310814595, + "grad_norm": NaN, + "learning_rate": 0.00020302313694616517, + "loss": 0.0, + "step": 25986 + }, + { + "epoch": 2.4248390407763365, + "grad_norm": NaN, + "learning_rate": 0.00020301606141815567, + "loss": 0.0, + "step": 25987 + }, + { + "epoch": 2.424932350471214, + "grad_norm": NaN, + "learning_rate": 0.000203008985755342, + "loss": 0.0, + "step": 25988 + }, + { + "epoch": 2.4250256601660913, + "grad_norm": NaN, + "learning_rate": 0.00020300190995774226, + "loss": 0.0, + "step": 25989 + }, + { + "epoch": 2.4251189698609688, + "grad_norm": NaN, + "learning_rate": 0.00020299483402537433, + "loss": 0.0, + "step": 25990 + }, + { + "epoch": 2.4252122795558457, + "grad_norm": NaN, + "learning_rate": 0.0002029877579582562, + "loss": 0.0, + "step": 25991 + }, + { + "epoch": 2.425305589250723, + "grad_norm": NaN, + "learning_rate": 0.00020298068175640593, + "loss": 0.0, + "step": 25992 + }, + { + "epoch": 2.4253988989456006, + "grad_norm": NaN, + "learning_rate": 0.0002029736054198415, + "loss": 0.0, + "step": 25993 + }, + { + "epoch": 2.4254922086404775, + "grad_norm": NaN, + "learning_rate": 0.00020296652894858083, + "loss": 0.0, + "step": 25994 + }, + { + "epoch": 2.425585518335355, + "grad_norm": NaN, + "learning_rate": 0.00020295945234264203, + "loss": 0.0, + "step": 25995 + }, + { + "epoch": 2.4256788280302324, + "grad_norm": NaN, + "learning_rate": 0.000202952375602043, + "loss": 0.0, + "step": 25996 + }, + { + "epoch": 2.42577213772511, + "grad_norm": NaN, + "learning_rate": 0.00020294529872680173, + "loss": 0.0, + "step": 25997 + }, + { + "epoch": 2.4258654474199868, + "grad_norm": NaN, + "learning_rate": 0.0002029382217169363, + "loss": 0.0, + "step": 25998 + }, + { + "epoch": 2.425958757114864, + "grad_norm": NaN, + "learning_rate": 0.00020293114457246465, + "loss": 0.0, + "step": 25999 + }, + { + "epoch": 2.4260520668097416, + "grad_norm": NaN, + "learning_rate": 0.00020292406729340473, + "loss": 0.0, + "step": 26000 + }, + { + "epoch": 2.4261453765046186, + "grad_norm": NaN, + "learning_rate": 0.00020291698987977462, + "loss": 0.0, + "step": 26001 + }, + { + "epoch": 2.426238686199496, + "grad_norm": NaN, + "learning_rate": 0.0002029099123315923, + "loss": 0.0, + "step": 26002 + }, + { + "epoch": 2.4263319958943734, + "grad_norm": NaN, + "learning_rate": 0.00020290283464887573, + "loss": 0.0, + "step": 26003 + }, + { + "epoch": 2.426425305589251, + "grad_norm": NaN, + "learning_rate": 0.00020289575683164288, + "loss": 0.0, + "step": 26004 + }, + { + "epoch": 2.4265186152841283, + "grad_norm": NaN, + "learning_rate": 0.00020288867887991187, + "loss": 0.0, + "step": 26005 + }, + { + "epoch": 2.4266119249790052, + "grad_norm": NaN, + "learning_rate": 0.00020288160079370054, + "loss": 0.0, + "step": 26006 + }, + { + "epoch": 2.4267052346738827, + "grad_norm": NaN, + "learning_rate": 0.000202874522573027, + "loss": 0.0, + "step": 26007 + }, + { + "epoch": 2.42679854436876, + "grad_norm": NaN, + "learning_rate": 0.00020286744421790922, + "loss": 0.0, + "step": 26008 + }, + { + "epoch": 2.426891854063637, + "grad_norm": NaN, + "learning_rate": 0.00020286036572836517, + "loss": 0.0, + "step": 26009 + }, + { + "epoch": 2.4269851637585145, + "grad_norm": NaN, + "learning_rate": 0.00020285328710441285, + "loss": 0.0, + "step": 26010 + }, + { + "epoch": 2.427078473453392, + "grad_norm": NaN, + "learning_rate": 0.00020284620834607037, + "loss": 0.0, + "step": 26011 + }, + { + "epoch": 2.4271717831482693, + "grad_norm": NaN, + "learning_rate": 0.00020283912945335555, + "loss": 0.0, + "step": 26012 + }, + { + "epoch": 2.4272650928431463, + "grad_norm": NaN, + "learning_rate": 0.00020283205042628645, + "loss": 0.0, + "step": 26013 + }, + { + "epoch": 2.4273584025380237, + "grad_norm": NaN, + "learning_rate": 0.00020282497126488115, + "loss": 0.0, + "step": 26014 + }, + { + "epoch": 2.427451712232901, + "grad_norm": NaN, + "learning_rate": 0.0002028178919691576, + "loss": 0.0, + "step": 26015 + }, + { + "epoch": 2.427545021927778, + "grad_norm": NaN, + "learning_rate": 0.00020281081253913376, + "loss": 0.0, + "step": 26016 + }, + { + "epoch": 2.4276383316226555, + "grad_norm": NaN, + "learning_rate": 0.00020280373297482771, + "loss": 0.0, + "step": 26017 + }, + { + "epoch": 2.427731641317533, + "grad_norm": NaN, + "learning_rate": 0.00020279665327625743, + "loss": 0.0, + "step": 26018 + }, + { + "epoch": 2.4278249510124104, + "grad_norm": NaN, + "learning_rate": 0.00020278957344344083, + "loss": 0.0, + "step": 26019 + }, + { + "epoch": 2.4279182607072873, + "grad_norm": NaN, + "learning_rate": 0.000202782493476396, + "loss": 0.0, + "step": 26020 + }, + { + "epoch": 2.4280115704021648, + "grad_norm": NaN, + "learning_rate": 0.00020277541337514098, + "loss": 0.0, + "step": 26021 + }, + { + "epoch": 2.428104880097042, + "grad_norm": NaN, + "learning_rate": 0.00020276833313969363, + "loss": 0.0, + "step": 26022 + }, + { + "epoch": 2.428198189791919, + "grad_norm": NaN, + "learning_rate": 0.00020276125277007208, + "loss": 0.0, + "step": 26023 + }, + { + "epoch": 2.4282914994867966, + "grad_norm": NaN, + "learning_rate": 0.00020275417226629433, + "loss": 0.0, + "step": 26024 + }, + { + "epoch": 2.428384809181674, + "grad_norm": NaN, + "learning_rate": 0.0002027470916283783, + "loss": 0.0, + "step": 26025 + }, + { + "epoch": 2.4284781188765514, + "grad_norm": NaN, + "learning_rate": 0.00020274001085634203, + "loss": 0.0, + "step": 26026 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": NaN, + "learning_rate": 0.0002027329299502036, + "loss": 0.0, + "step": 26027 + }, + { + "epoch": 2.428664738266306, + "grad_norm": NaN, + "learning_rate": 0.00020272584890998092, + "loss": 0.0, + "step": 26028 + }, + { + "epoch": 2.428758047961183, + "grad_norm": NaN, + "learning_rate": 0.00020271876773569198, + "loss": 0.0, + "step": 26029 + }, + { + "epoch": 2.4288513576560606, + "grad_norm": NaN, + "learning_rate": 0.00020271168642735488, + "loss": 0.0, + "step": 26030 + }, + { + "epoch": 2.4289446673509376, + "grad_norm": NaN, + "learning_rate": 0.00020270460498498757, + "loss": 0.0, + "step": 26031 + }, + { + "epoch": 2.429037977045815, + "grad_norm": NaN, + "learning_rate": 0.00020269752340860801, + "loss": 0.0, + "step": 26032 + }, + { + "epoch": 2.4291312867406925, + "grad_norm": NaN, + "learning_rate": 0.0002026904416982343, + "loss": 0.0, + "step": 26033 + }, + { + "epoch": 2.42922459643557, + "grad_norm": NaN, + "learning_rate": 0.00020268335985388445, + "loss": 0.0, + "step": 26034 + }, + { + "epoch": 2.429317906130447, + "grad_norm": NaN, + "learning_rate": 0.00020267627787557633, + "loss": 0.0, + "step": 26035 + }, + { + "epoch": 2.4294112158253243, + "grad_norm": NaN, + "learning_rate": 0.0002026691957633281, + "loss": 0.0, + "step": 26036 + }, + { + "epoch": 2.4295045255202017, + "grad_norm": NaN, + "learning_rate": 0.00020266211351715774, + "loss": 0.0, + "step": 26037 + }, + { + "epoch": 2.4295978352150787, + "grad_norm": NaN, + "learning_rate": 0.00020265503113708314, + "loss": 0.0, + "step": 26038 + }, + { + "epoch": 2.429691144909956, + "grad_norm": NaN, + "learning_rate": 0.0002026479486231224, + "loss": 0.0, + "step": 26039 + }, + { + "epoch": 2.4297844546048335, + "grad_norm": NaN, + "learning_rate": 0.0002026408659752936, + "loss": 0.0, + "step": 26040 + }, + { + "epoch": 2.429877764299711, + "grad_norm": NaN, + "learning_rate": 0.00020263378319361459, + "loss": 0.0, + "step": 26041 + }, + { + "epoch": 2.429971073994588, + "grad_norm": NaN, + "learning_rate": 0.00020262670027810348, + "loss": 0.0, + "step": 26042 + }, + { + "epoch": 2.4300643836894653, + "grad_norm": NaN, + "learning_rate": 0.00020261961722877833, + "loss": 0.0, + "step": 26043 + }, + { + "epoch": 2.4301576933843427, + "grad_norm": NaN, + "learning_rate": 0.000202612534045657, + "loss": 0.0, + "step": 26044 + }, + { + "epoch": 2.4302510030792197, + "grad_norm": NaN, + "learning_rate": 0.0002026054507287576, + "loss": 0.0, + "step": 26045 + }, + { + "epoch": 2.430344312774097, + "grad_norm": NaN, + "learning_rate": 0.00020259836727809817, + "loss": 0.0, + "step": 26046 + }, + { + "epoch": 2.4304376224689745, + "grad_norm": NaN, + "learning_rate": 0.00020259128369369659, + "loss": 0.0, + "step": 26047 + }, + { + "epoch": 2.430530932163852, + "grad_norm": NaN, + "learning_rate": 0.00020258419997557097, + "loss": 0.0, + "step": 26048 + }, + { + "epoch": 2.430624241858729, + "grad_norm": NaN, + "learning_rate": 0.0002025771161237394, + "loss": 0.0, + "step": 26049 + }, + { + "epoch": 2.4307175515536064, + "grad_norm": NaN, + "learning_rate": 0.00020257003213821972, + "loss": 0.0, + "step": 26050 + }, + { + "epoch": 2.430810861248484, + "grad_norm": NaN, + "learning_rate": 0.00020256294801902997, + "loss": 0.0, + "step": 26051 + }, + { + "epoch": 2.430904170943361, + "grad_norm": NaN, + "learning_rate": 0.0002025558637661883, + "loss": 0.0, + "step": 26052 + }, + { + "epoch": 2.430997480638238, + "grad_norm": NaN, + "learning_rate": 0.00020254877937971264, + "loss": 0.0, + "step": 26053 + }, + { + "epoch": 2.4310907903331156, + "grad_norm": NaN, + "learning_rate": 0.00020254169485962094, + "loss": 0.0, + "step": 26054 + }, + { + "epoch": 2.431184100027993, + "grad_norm": NaN, + "learning_rate": 0.0002025346102059313, + "loss": 0.0, + "step": 26055 + }, + { + "epoch": 2.4312774097228704, + "grad_norm": NaN, + "learning_rate": 0.00020252752541866174, + "loss": 0.0, + "step": 26056 + }, + { + "epoch": 2.4313707194177474, + "grad_norm": NaN, + "learning_rate": 0.0002025204404978302, + "loss": 0.0, + "step": 26057 + }, + { + "epoch": 2.431464029112625, + "grad_norm": NaN, + "learning_rate": 0.00020251335544345476, + "loss": 0.0, + "step": 26058 + }, + { + "epoch": 2.4315573388075022, + "grad_norm": NaN, + "learning_rate": 0.00020250627025555343, + "loss": 0.0, + "step": 26059 + }, + { + "epoch": 2.431650648502379, + "grad_norm": NaN, + "learning_rate": 0.00020249918493414414, + "loss": 0.0, + "step": 26060 + }, + { + "epoch": 2.4317439581972566, + "grad_norm": NaN, + "learning_rate": 0.00020249209947924502, + "loss": 0.0, + "step": 26061 + }, + { + "epoch": 2.431837267892134, + "grad_norm": NaN, + "learning_rate": 0.00020248501389087407, + "loss": 0.0, + "step": 26062 + }, + { + "epoch": 2.4319305775870115, + "grad_norm": NaN, + "learning_rate": 0.0002024779281690492, + "loss": 0.0, + "step": 26063 + }, + { + "epoch": 2.4320238872818885, + "grad_norm": NaN, + "learning_rate": 0.00020247084231378857, + "loss": 0.0, + "step": 26064 + }, + { + "epoch": 2.432117196976766, + "grad_norm": NaN, + "learning_rate": 0.00020246375632511013, + "loss": 0.0, + "step": 26065 + }, + { + "epoch": 2.4322105066716433, + "grad_norm": NaN, + "learning_rate": 0.00020245667020303185, + "loss": 0.0, + "step": 26066 + }, + { + "epoch": 2.4323038163665203, + "grad_norm": NaN, + "learning_rate": 0.00020244958394757183, + "loss": 0.0, + "step": 26067 + }, + { + "epoch": 2.4323971260613977, + "grad_norm": NaN, + "learning_rate": 0.00020244249755874808, + "loss": 0.0, + "step": 26068 + }, + { + "epoch": 2.432490435756275, + "grad_norm": NaN, + "learning_rate": 0.00020243541103657855, + "loss": 0.0, + "step": 26069 + }, + { + "epoch": 2.4325837454511525, + "grad_norm": NaN, + "learning_rate": 0.0002024283243810813, + "loss": 0.0, + "step": 26070 + }, + { + "epoch": 2.4326770551460295, + "grad_norm": NaN, + "learning_rate": 0.00020242123759227438, + "loss": 0.0, + "step": 26071 + }, + { + "epoch": 2.432770364840907, + "grad_norm": NaN, + "learning_rate": 0.00020241415067017576, + "loss": 0.0, + "step": 26072 + }, + { + "epoch": 2.4328636745357843, + "grad_norm": NaN, + "learning_rate": 0.0002024070636148035, + "loss": 0.0, + "step": 26073 + }, + { + "epoch": 2.4329569842306618, + "grad_norm": NaN, + "learning_rate": 0.0002023999764261756, + "loss": 0.0, + "step": 26074 + }, + { + "epoch": 2.4330502939255387, + "grad_norm": NaN, + "learning_rate": 0.00020239288910431006, + "loss": 0.0, + "step": 26075 + }, + { + "epoch": 2.433143603620416, + "grad_norm": NaN, + "learning_rate": 0.00020238580164922492, + "loss": 0.0, + "step": 26076 + }, + { + "epoch": 2.4332369133152936, + "grad_norm": NaN, + "learning_rate": 0.00020237871406093825, + "loss": 0.0, + "step": 26077 + }, + { + "epoch": 2.433330223010171, + "grad_norm": NaN, + "learning_rate": 0.00020237162633946802, + "loss": 0.0, + "step": 26078 + }, + { + "epoch": 2.433423532705048, + "grad_norm": NaN, + "learning_rate": 0.00020236453848483215, + "loss": 0.0, + "step": 26079 + }, + { + "epoch": 2.4335168423999254, + "grad_norm": NaN, + "learning_rate": 0.00020235745049704889, + "loss": 0.0, + "step": 26080 + }, + { + "epoch": 2.433610152094803, + "grad_norm": NaN, + "learning_rate": 0.0002023503623761361, + "loss": 0.0, + "step": 26081 + }, + { + "epoch": 2.43370346178968, + "grad_norm": NaN, + "learning_rate": 0.00020234327412211185, + "loss": 0.0, + "step": 26082 + }, + { + "epoch": 2.433796771484557, + "grad_norm": NaN, + "learning_rate": 0.00020233618573499418, + "loss": 0.0, + "step": 26083 + }, + { + "epoch": 2.4338900811794346, + "grad_norm": NaN, + "learning_rate": 0.00020232909721480108, + "loss": 0.0, + "step": 26084 + }, + { + "epoch": 2.433983390874312, + "grad_norm": NaN, + "learning_rate": 0.00020232200856155056, + "loss": 0.0, + "step": 26085 + }, + { + "epoch": 2.434076700569189, + "grad_norm": NaN, + "learning_rate": 0.0002023149197752607, + "loss": 0.0, + "step": 26086 + }, + { + "epoch": 2.4341700102640664, + "grad_norm": NaN, + "learning_rate": 0.0002023078308559495, + "loss": 0.0, + "step": 26087 + }, + { + "epoch": 2.434263319958944, + "grad_norm": NaN, + "learning_rate": 0.00020230074180363494, + "loss": 0.0, + "step": 26088 + }, + { + "epoch": 2.434356629653821, + "grad_norm": NaN, + "learning_rate": 0.00020229365261833511, + "loss": 0.0, + "step": 26089 + }, + { + "epoch": 2.4344499393486982, + "grad_norm": NaN, + "learning_rate": 0.00020228656330006803, + "loss": 0.0, + "step": 26090 + }, + { + "epoch": 2.4345432490435757, + "grad_norm": NaN, + "learning_rate": 0.0002022794738488517, + "loss": 0.0, + "step": 26091 + }, + { + "epoch": 2.434636558738453, + "grad_norm": NaN, + "learning_rate": 0.00020227238426470415, + "loss": 0.0, + "step": 26092 + }, + { + "epoch": 2.43472986843333, + "grad_norm": NaN, + "learning_rate": 0.00020226529454764339, + "loss": 0.0, + "step": 26093 + }, + { + "epoch": 2.4348231781282075, + "grad_norm": NaN, + "learning_rate": 0.00020225820469768754, + "loss": 0.0, + "step": 26094 + }, + { + "epoch": 2.434916487823085, + "grad_norm": NaN, + "learning_rate": 0.00020225111471485445, + "loss": 0.0, + "step": 26095 + }, + { + "epoch": 2.435009797517962, + "grad_norm": NaN, + "learning_rate": 0.00020224402459916233, + "loss": 0.0, + "step": 26096 + }, + { + "epoch": 2.4351031072128393, + "grad_norm": NaN, + "learning_rate": 0.00020223693435062913, + "loss": 0.0, + "step": 26097 + }, + { + "epoch": 2.4351964169077167, + "grad_norm": NaN, + "learning_rate": 0.0002022298439692728, + "loss": 0.0, + "step": 26098 + }, + { + "epoch": 2.435289726602594, + "grad_norm": NaN, + "learning_rate": 0.00020222275345511154, + "loss": 0.0, + "step": 26099 + }, + { + "epoch": 2.4353830362974715, + "grad_norm": NaN, + "learning_rate": 0.00020221566280816328, + "loss": 0.0, + "step": 26100 + }, + { + "epoch": 2.4354763459923485, + "grad_norm": NaN, + "learning_rate": 0.00020220857202844599, + "loss": 0.0, + "step": 26101 + }, + { + "epoch": 2.435569655687226, + "grad_norm": NaN, + "learning_rate": 0.00020220148111597784, + "loss": 0.0, + "step": 26102 + }, + { + "epoch": 2.4356629653821034, + "grad_norm": NaN, + "learning_rate": 0.00020219439007077678, + "loss": 0.0, + "step": 26103 + }, + { + "epoch": 2.4357562750769803, + "grad_norm": NaN, + "learning_rate": 0.00020218729889286076, + "loss": 0.0, + "step": 26104 + }, + { + "epoch": 2.4358495847718578, + "grad_norm": NaN, + "learning_rate": 0.00020218020758224796, + "loss": 0.0, + "step": 26105 + }, + { + "epoch": 2.435942894466735, + "grad_norm": NaN, + "learning_rate": 0.00020217311613895638, + "loss": 0.0, + "step": 26106 + }, + { + "epoch": 2.4360362041616126, + "grad_norm": NaN, + "learning_rate": 0.00020216602456300396, + "loss": 0.0, + "step": 26107 + }, + { + "epoch": 2.4361295138564896, + "grad_norm": NaN, + "learning_rate": 0.00020215893285440884, + "loss": 0.0, + "step": 26108 + }, + { + "epoch": 2.436222823551367, + "grad_norm": NaN, + "learning_rate": 0.00020215184101318896, + "loss": 0.0, + "step": 26109 + }, + { + "epoch": 2.4363161332462444, + "grad_norm": NaN, + "learning_rate": 0.00020214474903936243, + "loss": 0.0, + "step": 26110 + }, + { + "epoch": 2.4364094429411214, + "grad_norm": NaN, + "learning_rate": 0.00020213765693294727, + "loss": 0.0, + "step": 26111 + }, + { + "epoch": 2.436502752635999, + "grad_norm": NaN, + "learning_rate": 0.00020213056469396142, + "loss": 0.0, + "step": 26112 + }, + { + "epoch": 2.4365960623308762, + "grad_norm": NaN, + "learning_rate": 0.00020212347232242305, + "loss": 0.0, + "step": 26113 + }, + { + "epoch": 2.4366893720257536, + "grad_norm": NaN, + "learning_rate": 0.00020211637981835013, + "loss": 0.0, + "step": 26114 + }, + { + "epoch": 2.4367826817206306, + "grad_norm": NaN, + "learning_rate": 0.00020210928718176067, + "loss": 0.0, + "step": 26115 + }, + { + "epoch": 2.436875991415508, + "grad_norm": NaN, + "learning_rate": 0.00020210219441267274, + "loss": 0.0, + "step": 26116 + }, + { + "epoch": 2.4369693011103855, + "grad_norm": NaN, + "learning_rate": 0.00020209510151110433, + "loss": 0.0, + "step": 26117 + }, + { + "epoch": 2.4370626108052624, + "grad_norm": NaN, + "learning_rate": 0.00020208800847707354, + "loss": 0.0, + "step": 26118 + }, + { + "epoch": 2.43715592050014, + "grad_norm": NaN, + "learning_rate": 0.00020208091531059844, + "loss": 0.0, + "step": 26119 + }, + { + "epoch": 2.4372492301950173, + "grad_norm": NaN, + "learning_rate": 0.00020207382201169687, + "loss": 0.0, + "step": 26120 + }, + { + "epoch": 2.4373425398898947, + "grad_norm": NaN, + "learning_rate": 0.00020206672858038703, + "loss": 0.0, + "step": 26121 + }, + { + "epoch": 2.437435849584772, + "grad_norm": NaN, + "learning_rate": 0.000202059635016687, + "loss": 0.0, + "step": 26122 + }, + { + "epoch": 2.437529159279649, + "grad_norm": NaN, + "learning_rate": 0.00020205254132061464, + "loss": 0.0, + "step": 26123 + }, + { + "epoch": 2.4376224689745265, + "grad_norm": NaN, + "learning_rate": 0.00020204544749218814, + "loss": 0.0, + "step": 26124 + }, + { + "epoch": 2.437715778669404, + "grad_norm": NaN, + "learning_rate": 0.0002020383535314255, + "loss": 0.0, + "step": 26125 + }, + { + "epoch": 2.437809088364281, + "grad_norm": NaN, + "learning_rate": 0.0002020312594383447, + "loss": 0.0, + "step": 26126 + }, + { + "epoch": 2.4379023980591583, + "grad_norm": NaN, + "learning_rate": 0.00020202416521296386, + "loss": 0.0, + "step": 26127 + }, + { + "epoch": 2.4379957077540357, + "grad_norm": NaN, + "learning_rate": 0.00020201707085530093, + "loss": 0.0, + "step": 26128 + }, + { + "epoch": 2.438089017448913, + "grad_norm": NaN, + "learning_rate": 0.00020200997636537403, + "loss": 0.0, + "step": 26129 + }, + { + "epoch": 2.43818232714379, + "grad_norm": NaN, + "learning_rate": 0.0002020028817432012, + "loss": 0.0, + "step": 26130 + }, + { + "epoch": 2.4382756368386675, + "grad_norm": NaN, + "learning_rate": 0.0002019957869888004, + "loss": 0.0, + "step": 26131 + }, + { + "epoch": 2.438368946533545, + "grad_norm": NaN, + "learning_rate": 0.0002019886921021897, + "loss": 0.0, + "step": 26132 + }, + { + "epoch": 2.438462256228422, + "grad_norm": NaN, + "learning_rate": 0.00020198159708338718, + "loss": 0.0, + "step": 26133 + }, + { + "epoch": 2.4385555659232994, + "grad_norm": NaN, + "learning_rate": 0.00020197450193241084, + "loss": 0.0, + "step": 26134 + }, + { + "epoch": 2.438648875618177, + "grad_norm": NaN, + "learning_rate": 0.00020196740664927878, + "loss": 0.0, + "step": 26135 + }, + { + "epoch": 2.438742185313054, + "grad_norm": NaN, + "learning_rate": 0.00020196031123400894, + "loss": 0.0, + "step": 26136 + }, + { + "epoch": 2.438835495007931, + "grad_norm": NaN, + "learning_rate": 0.00020195321568661947, + "loss": 0.0, + "step": 26137 + }, + { + "epoch": 2.4389288047028086, + "grad_norm": NaN, + "learning_rate": 0.0002019461200071284, + "loss": 0.0, + "step": 26138 + }, + { + "epoch": 2.439022114397686, + "grad_norm": NaN, + "learning_rate": 0.00020193902419555365, + "loss": 0.0, + "step": 26139 + }, + { + "epoch": 2.439115424092563, + "grad_norm": NaN, + "learning_rate": 0.00020193192825191338, + "loss": 0.0, + "step": 26140 + }, + { + "epoch": 2.4392087337874404, + "grad_norm": NaN, + "learning_rate": 0.0002019248321762256, + "loss": 0.0, + "step": 26141 + }, + { + "epoch": 2.439302043482318, + "grad_norm": NaN, + "learning_rate": 0.00020191773596850833, + "loss": 0.0, + "step": 26142 + }, + { + "epoch": 2.4393953531771952, + "grad_norm": NaN, + "learning_rate": 0.00020191063962877968, + "loss": 0.0, + "step": 26143 + }, + { + "epoch": 2.4394886628720722, + "grad_norm": NaN, + "learning_rate": 0.00020190354315705766, + "loss": 0.0, + "step": 26144 + }, + { + "epoch": 2.4395819725669496, + "grad_norm": NaN, + "learning_rate": 0.00020189644655336023, + "loss": 0.0, + "step": 26145 + }, + { + "epoch": 2.439675282261827, + "grad_norm": NaN, + "learning_rate": 0.00020188934981770553, + "loss": 0.0, + "step": 26146 + }, + { + "epoch": 2.4397685919567045, + "grad_norm": NaN, + "learning_rate": 0.0002018822529501116, + "loss": 0.0, + "step": 26147 + }, + { + "epoch": 2.4398619016515815, + "grad_norm": NaN, + "learning_rate": 0.0002018751559505965, + "loss": 0.0, + "step": 26148 + }, + { + "epoch": 2.439955211346459, + "grad_norm": NaN, + "learning_rate": 0.00020186805881917822, + "loss": 0.0, + "step": 26149 + }, + { + "epoch": 2.4400485210413363, + "grad_norm": NaN, + "learning_rate": 0.00020186096155587482, + "loss": 0.0, + "step": 26150 + }, + { + "epoch": 2.4401418307362137, + "grad_norm": NaN, + "learning_rate": 0.00020185386416070436, + "loss": 0.0, + "step": 26151 + }, + { + "epoch": 2.4402351404310907, + "grad_norm": NaN, + "learning_rate": 0.0002018467666336849, + "loss": 0.0, + "step": 26152 + }, + { + "epoch": 2.440328450125968, + "grad_norm": NaN, + "learning_rate": 0.00020183966897483445, + "loss": 0.0, + "step": 26153 + }, + { + "epoch": 2.4404217598208455, + "grad_norm": NaN, + "learning_rate": 0.0002018325711841711, + "loss": 0.0, + "step": 26154 + }, + { + "epoch": 2.4405150695157225, + "grad_norm": NaN, + "learning_rate": 0.00020182547326171285, + "loss": 0.0, + "step": 26155 + }, + { + "epoch": 2.4406083792106, + "grad_norm": NaN, + "learning_rate": 0.0002018183752074778, + "loss": 0.0, + "step": 26156 + }, + { + "epoch": 2.4407016889054773, + "grad_norm": NaN, + "learning_rate": 0.00020181127702148397, + "loss": 0.0, + "step": 26157 + }, + { + "epoch": 2.4407949986003548, + "grad_norm": NaN, + "learning_rate": 0.0002018041787037494, + "loss": 0.0, + "step": 26158 + }, + { + "epoch": 2.4408883082952317, + "grad_norm": NaN, + "learning_rate": 0.00020179708025429212, + "loss": 0.0, + "step": 26159 + }, + { + "epoch": 2.440981617990109, + "grad_norm": NaN, + "learning_rate": 0.00020178998167313028, + "loss": 0.0, + "step": 26160 + }, + { + "epoch": 2.4410749276849866, + "grad_norm": NaN, + "learning_rate": 0.0002017828829602818, + "loss": 0.0, + "step": 26161 + }, + { + "epoch": 2.4411682373798635, + "grad_norm": NaN, + "learning_rate": 0.0002017757841157648, + "loss": 0.0, + "step": 26162 + }, + { + "epoch": 2.441261547074741, + "grad_norm": NaN, + "learning_rate": 0.00020176868513959736, + "loss": 0.0, + "step": 26163 + }, + { + "epoch": 2.4413548567696184, + "grad_norm": NaN, + "learning_rate": 0.0002017615860317974, + "loss": 0.0, + "step": 26164 + }, + { + "epoch": 2.441448166464496, + "grad_norm": NaN, + "learning_rate": 0.0002017544867923831, + "loss": 0.0, + "step": 26165 + }, + { + "epoch": 2.441541476159373, + "grad_norm": NaN, + "learning_rate": 0.00020174738742137252, + "loss": 0.0, + "step": 26166 + }, + { + "epoch": 2.44163478585425, + "grad_norm": NaN, + "learning_rate": 0.00020174028791878363, + "loss": 0.0, + "step": 26167 + }, + { + "epoch": 2.4417280955491276, + "grad_norm": NaN, + "learning_rate": 0.0002017331882846345, + "loss": 0.0, + "step": 26168 + }, + { + "epoch": 2.441821405244005, + "grad_norm": NaN, + "learning_rate": 0.0002017260885189432, + "loss": 0.0, + "step": 26169 + }, + { + "epoch": 2.441914714938882, + "grad_norm": NaN, + "learning_rate": 0.00020171898862172779, + "loss": 0.0, + "step": 26170 + }, + { + "epoch": 2.4420080246337594, + "grad_norm": NaN, + "learning_rate": 0.0002017118885930063, + "loss": 0.0, + "step": 26171 + }, + { + "epoch": 2.442101334328637, + "grad_norm": NaN, + "learning_rate": 0.00020170478843279678, + "loss": 0.0, + "step": 26172 + }, + { + "epoch": 2.4421946440235143, + "grad_norm": NaN, + "learning_rate": 0.00020169768814111736, + "loss": 0.0, + "step": 26173 + }, + { + "epoch": 2.4422879537183912, + "grad_norm": NaN, + "learning_rate": 0.00020169058771798596, + "loss": 0.0, + "step": 26174 + }, + { + "epoch": 2.4423812634132687, + "grad_norm": NaN, + "learning_rate": 0.00020168348716342074, + "loss": 0.0, + "step": 26175 + }, + { + "epoch": 2.442474573108146, + "grad_norm": NaN, + "learning_rate": 0.00020167638647743976, + "loss": 0.0, + "step": 26176 + }, + { + "epoch": 2.442567882803023, + "grad_norm": NaN, + "learning_rate": 0.00020166928566006098, + "loss": 0.0, + "step": 26177 + }, + { + "epoch": 2.4426611924979005, + "grad_norm": NaN, + "learning_rate": 0.0002016621847113025, + "loss": 0.0, + "step": 26178 + }, + { + "epoch": 2.442754502192778, + "grad_norm": NaN, + "learning_rate": 0.00020165508363118243, + "loss": 0.0, + "step": 26179 + }, + { + "epoch": 2.4428478118876553, + "grad_norm": NaN, + "learning_rate": 0.00020164798241971877, + "loss": 0.0, + "step": 26180 + }, + { + "epoch": 2.4429411215825323, + "grad_norm": NaN, + "learning_rate": 0.0002016408810769296, + "loss": 0.0, + "step": 26181 + }, + { + "epoch": 2.4430344312774097, + "grad_norm": NaN, + "learning_rate": 0.00020163377960283297, + "loss": 0.0, + "step": 26182 + }, + { + "epoch": 2.443127740972287, + "grad_norm": NaN, + "learning_rate": 0.00020162667799744692, + "loss": 0.0, + "step": 26183 + }, + { + "epoch": 2.443221050667164, + "grad_norm": NaN, + "learning_rate": 0.00020161957626078954, + "loss": 0.0, + "step": 26184 + }, + { + "epoch": 2.4433143603620415, + "grad_norm": NaN, + "learning_rate": 0.00020161247439287884, + "loss": 0.0, + "step": 26185 + }, + { + "epoch": 2.443407670056919, + "grad_norm": NaN, + "learning_rate": 0.00020160537239373292, + "loss": 0.0, + "step": 26186 + }, + { + "epoch": 2.4435009797517964, + "grad_norm": NaN, + "learning_rate": 0.00020159827026336983, + "loss": 0.0, + "step": 26187 + }, + { + "epoch": 2.4435942894466733, + "grad_norm": NaN, + "learning_rate": 0.00020159116800180763, + "loss": 0.0, + "step": 26188 + }, + { + "epoch": 2.4436875991415508, + "grad_norm": NaN, + "learning_rate": 0.00020158406560906434, + "loss": 0.0, + "step": 26189 + }, + { + "epoch": 2.443780908836428, + "grad_norm": NaN, + "learning_rate": 0.0002015769630851581, + "loss": 0.0, + "step": 26190 + }, + { + "epoch": 2.443874218531305, + "grad_norm": NaN, + "learning_rate": 0.00020156986043010685, + "loss": 0.0, + "step": 26191 + }, + { + "epoch": 2.4439675282261826, + "grad_norm": NaN, + "learning_rate": 0.00020156275764392876, + "loss": 0.0, + "step": 26192 + }, + { + "epoch": 2.44406083792106, + "grad_norm": NaN, + "learning_rate": 0.00020155565472664187, + "loss": 0.0, + "step": 26193 + }, + { + "epoch": 2.4441541476159374, + "grad_norm": NaN, + "learning_rate": 0.00020154855167826423, + "loss": 0.0, + "step": 26194 + }, + { + "epoch": 2.444247457310815, + "grad_norm": NaN, + "learning_rate": 0.00020154144849881387, + "loss": 0.0, + "step": 26195 + }, + { + "epoch": 2.444340767005692, + "grad_norm": NaN, + "learning_rate": 0.00020153434518830886, + "loss": 0.0, + "step": 26196 + }, + { + "epoch": 2.4444340767005692, + "grad_norm": NaN, + "learning_rate": 0.00020152724174676728, + "loss": 0.0, + "step": 26197 + }, + { + "epoch": 2.4445273863954466, + "grad_norm": NaN, + "learning_rate": 0.0002015201381742072, + "loss": 0.0, + "step": 26198 + }, + { + "epoch": 2.4446206960903236, + "grad_norm": NaN, + "learning_rate": 0.00020151303447064667, + "loss": 0.0, + "step": 26199 + }, + { + "epoch": 2.444714005785201, + "grad_norm": NaN, + "learning_rate": 0.00020150593063610376, + "loss": 0.0, + "step": 26200 + }, + { + "epoch": 2.4448073154800785, + "grad_norm": NaN, + "learning_rate": 0.0002014988266705965, + "loss": 0.0, + "step": 26201 + }, + { + "epoch": 2.444900625174956, + "grad_norm": NaN, + "learning_rate": 0.000201491722574143, + "loss": 0.0, + "step": 26202 + }, + { + "epoch": 2.444993934869833, + "grad_norm": NaN, + "learning_rate": 0.00020148461834676128, + "loss": 0.0, + "step": 26203 + }, + { + "epoch": 2.4450872445647103, + "grad_norm": NaN, + "learning_rate": 0.00020147751398846944, + "loss": 0.0, + "step": 26204 + }, + { + "epoch": 2.4451805542595877, + "grad_norm": NaN, + "learning_rate": 0.00020147040949928556, + "loss": 0.0, + "step": 26205 + }, + { + "epoch": 2.4452738639544647, + "grad_norm": NaN, + "learning_rate": 0.00020146330487922764, + "loss": 0.0, + "step": 26206 + }, + { + "epoch": 2.445367173649342, + "grad_norm": NaN, + "learning_rate": 0.00020145620012831377, + "loss": 0.0, + "step": 26207 + }, + { + "epoch": 2.4454604833442195, + "grad_norm": NaN, + "learning_rate": 0.00020144909524656208, + "loss": 0.0, + "step": 26208 + }, + { + "epoch": 2.445553793039097, + "grad_norm": NaN, + "learning_rate": 0.00020144199023399052, + "loss": 0.0, + "step": 26209 + }, + { + "epoch": 2.445647102733974, + "grad_norm": NaN, + "learning_rate": 0.00020143488509061724, + "loss": 0.0, + "step": 26210 + }, + { + "epoch": 2.4457404124288513, + "grad_norm": NaN, + "learning_rate": 0.0002014277798164603, + "loss": 0.0, + "step": 26211 + }, + { + "epoch": 2.4458337221237287, + "grad_norm": NaN, + "learning_rate": 0.00020142067441153774, + "loss": 0.0, + "step": 26212 + }, + { + "epoch": 2.4459270318186057, + "grad_norm": NaN, + "learning_rate": 0.00020141356887586761, + "loss": 0.0, + "step": 26213 + }, + { + "epoch": 2.446020341513483, + "grad_norm": NaN, + "learning_rate": 0.00020140646320946805, + "loss": 0.0, + "step": 26214 + }, + { + "epoch": 2.4461136512083606, + "grad_norm": NaN, + "learning_rate": 0.00020139935741235706, + "loss": 0.0, + "step": 26215 + }, + { + "epoch": 2.446206960903238, + "grad_norm": NaN, + "learning_rate": 0.00020139225148455272, + "loss": 0.0, + "step": 26216 + }, + { + "epoch": 2.4463002705981154, + "grad_norm": NaN, + "learning_rate": 0.0002013851454260731, + "loss": 0.0, + "step": 26217 + }, + { + "epoch": 2.4463935802929924, + "grad_norm": NaN, + "learning_rate": 0.00020137803923693631, + "loss": 0.0, + "step": 26218 + }, + { + "epoch": 2.44648688998787, + "grad_norm": NaN, + "learning_rate": 0.00020137093291716038, + "loss": 0.0, + "step": 26219 + }, + { + "epoch": 2.446580199682747, + "grad_norm": NaN, + "learning_rate": 0.00020136382646676336, + "loss": 0.0, + "step": 26220 + }, + { + "epoch": 2.446673509377624, + "grad_norm": NaN, + "learning_rate": 0.00020135671988576336, + "loss": 0.0, + "step": 26221 + }, + { + "epoch": 2.4467668190725016, + "grad_norm": NaN, + "learning_rate": 0.00020134961317417842, + "loss": 0.0, + "step": 26222 + }, + { + "epoch": 2.446860128767379, + "grad_norm": NaN, + "learning_rate": 0.00020134250633202665, + "loss": 0.0, + "step": 26223 + }, + { + "epoch": 2.4469534384622564, + "grad_norm": NaN, + "learning_rate": 0.00020133539935932607, + "loss": 0.0, + "step": 26224 + }, + { + "epoch": 2.4470467481571334, + "grad_norm": NaN, + "learning_rate": 0.0002013282922560948, + "loss": 0.0, + "step": 26225 + }, + { + "epoch": 2.447140057852011, + "grad_norm": NaN, + "learning_rate": 0.00020132118502235087, + "loss": 0.0, + "step": 26226 + }, + { + "epoch": 2.4472333675468882, + "grad_norm": NaN, + "learning_rate": 0.00020131407765811235, + "loss": 0.0, + "step": 26227 + }, + { + "epoch": 2.4473266772417652, + "grad_norm": NaN, + "learning_rate": 0.00020130697016339737, + "loss": 0.0, + "step": 26228 + }, + { + "epoch": 2.4474199869366426, + "grad_norm": NaN, + "learning_rate": 0.00020129986253822398, + "loss": 0.0, + "step": 26229 + }, + { + "epoch": 2.44751329663152, + "grad_norm": NaN, + "learning_rate": 0.0002012927547826102, + "loss": 0.0, + "step": 26230 + }, + { + "epoch": 2.4476066063263975, + "grad_norm": NaN, + "learning_rate": 0.00020128564689657414, + "loss": 0.0, + "step": 26231 + }, + { + "epoch": 2.4476999160212745, + "grad_norm": NaN, + "learning_rate": 0.00020127853888013387, + "loss": 0.0, + "step": 26232 + }, + { + "epoch": 2.447793225716152, + "grad_norm": NaN, + "learning_rate": 0.00020127143073330747, + "loss": 0.0, + "step": 26233 + }, + { + "epoch": 2.4478865354110293, + "grad_norm": NaN, + "learning_rate": 0.00020126432245611302, + "loss": 0.0, + "step": 26234 + }, + { + "epoch": 2.4479798451059063, + "grad_norm": NaN, + "learning_rate": 0.00020125721404856854, + "loss": 0.0, + "step": 26235 + }, + { + "epoch": 2.4480731548007837, + "grad_norm": NaN, + "learning_rate": 0.00020125010551069218, + "loss": 0.0, + "step": 26236 + }, + { + "epoch": 2.448166464495661, + "grad_norm": NaN, + "learning_rate": 0.00020124299684250199, + "loss": 0.0, + "step": 26237 + }, + { + "epoch": 2.4482597741905385, + "grad_norm": NaN, + "learning_rate": 0.00020123588804401603, + "loss": 0.0, + "step": 26238 + }, + { + "epoch": 2.4483530838854155, + "grad_norm": NaN, + "learning_rate": 0.0002012287791152524, + "loss": 0.0, + "step": 26239 + }, + { + "epoch": 2.448446393580293, + "grad_norm": NaN, + "learning_rate": 0.00020122167005622913, + "loss": 0.0, + "step": 26240 + }, + { + "epoch": 2.4485397032751703, + "grad_norm": NaN, + "learning_rate": 0.00020121456086696438, + "loss": 0.0, + "step": 26241 + }, + { + "epoch": 2.4486330129700478, + "grad_norm": NaN, + "learning_rate": 0.0002012074515474761, + "loss": 0.0, + "step": 26242 + }, + { + "epoch": 2.4487263226649247, + "grad_norm": NaN, + "learning_rate": 0.0002012003420977825, + "loss": 0.0, + "step": 26243 + }, + { + "epoch": 2.448819632359802, + "grad_norm": NaN, + "learning_rate": 0.00020119323251790156, + "loss": 0.0, + "step": 26244 + }, + { + "epoch": 2.4489129420546796, + "grad_norm": NaN, + "learning_rate": 0.0002011861228078514, + "loss": 0.0, + "step": 26245 + }, + { + "epoch": 2.449006251749557, + "grad_norm": NaN, + "learning_rate": 0.0002011790129676501, + "loss": 0.0, + "step": 26246 + }, + { + "epoch": 2.449099561444434, + "grad_norm": NaN, + "learning_rate": 0.00020117190299731574, + "loss": 0.0, + "step": 26247 + }, + { + "epoch": 2.4491928711393114, + "grad_norm": NaN, + "learning_rate": 0.00020116479289686637, + "loss": 0.0, + "step": 26248 + }, + { + "epoch": 2.449286180834189, + "grad_norm": NaN, + "learning_rate": 0.00020115768266632005, + "loss": 0.0, + "step": 26249 + }, + { + "epoch": 2.449379490529066, + "grad_norm": NaN, + "learning_rate": 0.00020115057230569498, + "loss": 0.0, + "step": 26250 + }, + { + "epoch": 2.449472800223943, + "grad_norm": NaN, + "learning_rate": 0.00020114346181500912, + "loss": 0.0, + "step": 26251 + }, + { + "epoch": 2.4495661099188206, + "grad_norm": NaN, + "learning_rate": 0.00020113635119428059, + "loss": 0.0, + "step": 26252 + }, + { + "epoch": 2.449659419613698, + "grad_norm": NaN, + "learning_rate": 0.00020112924044352743, + "loss": 0.0, + "step": 26253 + }, + { + "epoch": 2.449752729308575, + "grad_norm": NaN, + "learning_rate": 0.0002011221295627678, + "loss": 0.0, + "step": 26254 + }, + { + "epoch": 2.4498460390034524, + "grad_norm": NaN, + "learning_rate": 0.00020111501855201973, + "loss": 0.0, + "step": 26255 + }, + { + "epoch": 2.44993934869833, + "grad_norm": NaN, + "learning_rate": 0.0002011079074113013, + "loss": 0.0, + "step": 26256 + }, + { + "epoch": 2.450032658393207, + "grad_norm": NaN, + "learning_rate": 0.00020110079614063062, + "loss": 0.0, + "step": 26257 + }, + { + "epoch": 2.4501259680880842, + "grad_norm": NaN, + "learning_rate": 0.00020109368474002573, + "loss": 0.0, + "step": 26258 + }, + { + "epoch": 2.4502192777829617, + "grad_norm": NaN, + "learning_rate": 0.00020108657320950474, + "loss": 0.0, + "step": 26259 + }, + { + "epoch": 2.450312587477839, + "grad_norm": NaN, + "learning_rate": 0.00020107946154908574, + "loss": 0.0, + "step": 26260 + }, + { + "epoch": 2.450405897172716, + "grad_norm": NaN, + "learning_rate": 0.0002010723497587868, + "loss": 0.0, + "step": 26261 + }, + { + "epoch": 2.4504992068675935, + "grad_norm": NaN, + "learning_rate": 0.000201065237838626, + "loss": 0.0, + "step": 26262 + }, + { + "epoch": 2.450592516562471, + "grad_norm": NaN, + "learning_rate": 0.0002010581257886214, + "loss": 0.0, + "step": 26263 + }, + { + "epoch": 2.4506858262573483, + "grad_norm": NaN, + "learning_rate": 0.00020105101360879112, + "loss": 0.0, + "step": 26264 + }, + { + "epoch": 2.4507791359522253, + "grad_norm": NaN, + "learning_rate": 0.00020104390129915326, + "loss": 0.0, + "step": 26265 + }, + { + "epoch": 2.4508724456471027, + "grad_norm": NaN, + "learning_rate": 0.00020103678885972586, + "loss": 0.0, + "step": 26266 + }, + { + "epoch": 2.45096575534198, + "grad_norm": NaN, + "learning_rate": 0.00020102967629052703, + "loss": 0.0, + "step": 26267 + }, + { + "epoch": 2.4510590650368576, + "grad_norm": NaN, + "learning_rate": 0.00020102256359157486, + "loss": 0.0, + "step": 26268 + }, + { + "epoch": 2.4511523747317345, + "grad_norm": NaN, + "learning_rate": 0.0002010154507628874, + "loss": 0.0, + "step": 26269 + }, + { + "epoch": 2.451245684426612, + "grad_norm": NaN, + "learning_rate": 0.0002010083378044828, + "loss": 0.0, + "step": 26270 + }, + { + "epoch": 2.4513389941214894, + "grad_norm": NaN, + "learning_rate": 0.00020100122471637907, + "loss": 0.0, + "step": 26271 + }, + { + "epoch": 2.4514323038163663, + "grad_norm": NaN, + "learning_rate": 0.00020099411149859434, + "loss": 0.0, + "step": 26272 + }, + { + "epoch": 2.4515256135112438, + "grad_norm": NaN, + "learning_rate": 0.0002009869981511467, + "loss": 0.0, + "step": 26273 + }, + { + "epoch": 2.451618923206121, + "grad_norm": NaN, + "learning_rate": 0.00020097988467405423, + "loss": 0.0, + "step": 26274 + }, + { + "epoch": 2.4517122329009986, + "grad_norm": NaN, + "learning_rate": 0.00020097277106733498, + "loss": 0.0, + "step": 26275 + }, + { + "epoch": 2.4518055425958756, + "grad_norm": NaN, + "learning_rate": 0.00020096565733100711, + "loss": 0.0, + "step": 26276 + }, + { + "epoch": 2.451898852290753, + "grad_norm": NaN, + "learning_rate": 0.00020095854346508868, + "loss": 0.0, + "step": 26277 + }, + { + "epoch": 2.4519921619856304, + "grad_norm": NaN, + "learning_rate": 0.00020095142946959774, + "loss": 0.0, + "step": 26278 + }, + { + "epoch": 2.4520854716805074, + "grad_norm": NaN, + "learning_rate": 0.0002009443153445524, + "loss": 0.0, + "step": 26279 + }, + { + "epoch": 2.452178781375385, + "grad_norm": NaN, + "learning_rate": 0.00020093720108997078, + "loss": 0.0, + "step": 26280 + }, + { + "epoch": 2.4522720910702622, + "grad_norm": NaN, + "learning_rate": 0.00020093008670587096, + "loss": 0.0, + "step": 26281 + }, + { + "epoch": 2.4523654007651396, + "grad_norm": NaN, + "learning_rate": 0.00020092297219227097, + "loss": 0.0, + "step": 26282 + }, + { + "epoch": 2.4524587104600166, + "grad_norm": NaN, + "learning_rate": 0.000200915857549189, + "loss": 0.0, + "step": 26283 + }, + { + "epoch": 2.452552020154894, + "grad_norm": NaN, + "learning_rate": 0.00020090874277664303, + "loss": 0.0, + "step": 26284 + }, + { + "epoch": 2.4526453298497715, + "grad_norm": NaN, + "learning_rate": 0.00020090162787465124, + "loss": 0.0, + "step": 26285 + }, + { + "epoch": 2.452738639544649, + "grad_norm": NaN, + "learning_rate": 0.00020089451284323169, + "loss": 0.0, + "step": 26286 + }, + { + "epoch": 2.452831949239526, + "grad_norm": NaN, + "learning_rate": 0.00020088739768240246, + "loss": 0.0, + "step": 26287 + }, + { + "epoch": 2.4529252589344033, + "grad_norm": NaN, + "learning_rate": 0.00020088028239218165, + "loss": 0.0, + "step": 26288 + }, + { + "epoch": 2.4530185686292807, + "grad_norm": NaN, + "learning_rate": 0.00020087316697258735, + "loss": 0.0, + "step": 26289 + }, + { + "epoch": 2.453111878324158, + "grad_norm": NaN, + "learning_rate": 0.00020086605142363768, + "loss": 0.0, + "step": 26290 + }, + { + "epoch": 2.453205188019035, + "grad_norm": NaN, + "learning_rate": 0.0002008589357453507, + "loss": 0.0, + "step": 26291 + }, + { + "epoch": 2.4532984977139125, + "grad_norm": NaN, + "learning_rate": 0.00020085181993774448, + "loss": 0.0, + "step": 26292 + }, + { + "epoch": 2.45339180740879, + "grad_norm": NaN, + "learning_rate": 0.0002008447040008372, + "loss": 0.0, + "step": 26293 + }, + { + "epoch": 2.453485117103667, + "grad_norm": NaN, + "learning_rate": 0.00020083758793464686, + "loss": 0.0, + "step": 26294 + }, + { + "epoch": 2.4535784267985443, + "grad_norm": NaN, + "learning_rate": 0.00020083047173919161, + "loss": 0.0, + "step": 26295 + }, + { + "epoch": 2.4536717364934217, + "grad_norm": NaN, + "learning_rate": 0.00020082335541448952, + "loss": 0.0, + "step": 26296 + }, + { + "epoch": 2.453765046188299, + "grad_norm": NaN, + "learning_rate": 0.0002008162389605587, + "loss": 0.0, + "step": 26297 + }, + { + "epoch": 2.453858355883176, + "grad_norm": NaN, + "learning_rate": 0.00020080912237741724, + "loss": 0.0, + "step": 26298 + }, + { + "epoch": 2.4539516655780536, + "grad_norm": NaN, + "learning_rate": 0.0002008020056650832, + "loss": 0.0, + "step": 26299 + }, + { + "epoch": 2.454044975272931, + "grad_norm": NaN, + "learning_rate": 0.00020079488882357476, + "loss": 0.0, + "step": 26300 + }, + { + "epoch": 2.454138284967808, + "grad_norm": NaN, + "learning_rate": 0.00020078777185290992, + "loss": 0.0, + "step": 26301 + }, + { + "epoch": 2.4542315946626854, + "grad_norm": NaN, + "learning_rate": 0.00020078065475310686, + "loss": 0.0, + "step": 26302 + }, + { + "epoch": 2.454324904357563, + "grad_norm": NaN, + "learning_rate": 0.00020077353752418357, + "loss": 0.0, + "step": 26303 + }, + { + "epoch": 2.45441821405244, + "grad_norm": NaN, + "learning_rate": 0.0002007664201661583, + "loss": 0.0, + "step": 26304 + }, + { + "epoch": 2.454511523747317, + "grad_norm": NaN, + "learning_rate": 0.00020075930267904901, + "loss": 0.0, + "step": 26305 + }, + { + "epoch": 2.4546048334421946, + "grad_norm": NaN, + "learning_rate": 0.00020075218506287383, + "loss": 0.0, + "step": 26306 + }, + { + "epoch": 2.454698143137072, + "grad_norm": NaN, + "learning_rate": 0.00020074506731765091, + "loss": 0.0, + "step": 26307 + }, + { + "epoch": 2.454791452831949, + "grad_norm": NaN, + "learning_rate": 0.00020073794944339834, + "loss": 0.0, + "step": 26308 + }, + { + "epoch": 2.4548847625268264, + "grad_norm": NaN, + "learning_rate": 0.00020073083144013412, + "loss": 0.0, + "step": 26309 + }, + { + "epoch": 2.454978072221704, + "grad_norm": NaN, + "learning_rate": 0.0002007237133078765, + "loss": 0.0, + "step": 26310 + }, + { + "epoch": 2.4550713819165813, + "grad_norm": NaN, + "learning_rate": 0.00020071659504664345, + "loss": 0.0, + "step": 26311 + }, + { + "epoch": 2.4551646916114587, + "grad_norm": NaN, + "learning_rate": 0.00020070947665645316, + "loss": 0.0, + "step": 26312 + }, + { + "epoch": 2.4552580013063356, + "grad_norm": NaN, + "learning_rate": 0.00020070235813732367, + "loss": 0.0, + "step": 26313 + }, + { + "epoch": 2.455351311001213, + "grad_norm": NaN, + "learning_rate": 0.00020069523948927311, + "loss": 0.0, + "step": 26314 + }, + { + "epoch": 2.4554446206960905, + "grad_norm": NaN, + "learning_rate": 0.0002006881207123196, + "loss": 0.0, + "step": 26315 + }, + { + "epoch": 2.4555379303909675, + "grad_norm": NaN, + "learning_rate": 0.00020068100180648118, + "loss": 0.0, + "step": 26316 + }, + { + "epoch": 2.455631240085845, + "grad_norm": NaN, + "learning_rate": 0.00020067388277177596, + "loss": 0.0, + "step": 26317 + }, + { + "epoch": 2.4557245497807223, + "grad_norm": NaN, + "learning_rate": 0.00020066676360822212, + "loss": 0.0, + "step": 26318 + }, + { + "epoch": 2.4558178594755997, + "grad_norm": NaN, + "learning_rate": 0.00020065964431583767, + "loss": 0.0, + "step": 26319 + }, + { + "epoch": 2.4559111691704767, + "grad_norm": NaN, + "learning_rate": 0.00020065252489464078, + "loss": 0.0, + "step": 26320 + }, + { + "epoch": 2.456004478865354, + "grad_norm": NaN, + "learning_rate": 0.0002006454053446495, + "loss": 0.0, + "step": 26321 + }, + { + "epoch": 2.4560977885602315, + "grad_norm": NaN, + "learning_rate": 0.00020063828566588198, + "loss": 0.0, + "step": 26322 + }, + { + "epoch": 2.4561910982551085, + "grad_norm": NaN, + "learning_rate": 0.0002006311658583563, + "loss": 0.0, + "step": 26323 + }, + { + "epoch": 2.456284407949986, + "grad_norm": NaN, + "learning_rate": 0.00020062404592209053, + "loss": 0.0, + "step": 26324 + }, + { + "epoch": 2.4563777176448633, + "grad_norm": NaN, + "learning_rate": 0.00020061692585710282, + "loss": 0.0, + "step": 26325 + }, + { + "epoch": 2.4564710273397408, + "grad_norm": NaN, + "learning_rate": 0.0002006098056634113, + "loss": 0.0, + "step": 26326 + }, + { + "epoch": 2.4565643370346177, + "grad_norm": NaN, + "learning_rate": 0.000200602685341034, + "loss": 0.0, + "step": 26327 + }, + { + "epoch": 2.456657646729495, + "grad_norm": NaN, + "learning_rate": 0.00020059556488998906, + "loss": 0.0, + "step": 26328 + }, + { + "epoch": 2.4567509564243726, + "grad_norm": NaN, + "learning_rate": 0.00020058844431029458, + "loss": 0.0, + "step": 26329 + }, + { + "epoch": 2.4568442661192496, + "grad_norm": NaN, + "learning_rate": 0.00020058132360196868, + "loss": 0.0, + "step": 26330 + }, + { + "epoch": 2.456937575814127, + "grad_norm": NaN, + "learning_rate": 0.00020057420276502948, + "loss": 0.0, + "step": 26331 + }, + { + "epoch": 2.4570308855090044, + "grad_norm": NaN, + "learning_rate": 0.00020056708179949504, + "loss": 0.0, + "step": 26332 + }, + { + "epoch": 2.457124195203882, + "grad_norm": NaN, + "learning_rate": 0.00020055996070538344, + "loss": 0.0, + "step": 26333 + }, + { + "epoch": 2.4572175048987592, + "grad_norm": NaN, + "learning_rate": 0.00020055283948271295, + "loss": 0.0, + "step": 26334 + }, + { + "epoch": 2.457310814593636, + "grad_norm": NaN, + "learning_rate": 0.0002005457181315015, + "loss": 0.0, + "step": 26335 + }, + { + "epoch": 2.4574041242885136, + "grad_norm": NaN, + "learning_rate": 0.00020053859665176721, + "loss": 0.0, + "step": 26336 + }, + { + "epoch": 2.457497433983391, + "grad_norm": NaN, + "learning_rate": 0.00020053147504352833, + "loss": 0.0, + "step": 26337 + }, + { + "epoch": 2.457590743678268, + "grad_norm": NaN, + "learning_rate": 0.00020052435330680286, + "loss": 0.0, + "step": 26338 + }, + { + "epoch": 2.4576840533731454, + "grad_norm": NaN, + "learning_rate": 0.00020051723144160887, + "loss": 0.0, + "step": 26339 + }, + { + "epoch": 2.457777363068023, + "grad_norm": NaN, + "learning_rate": 0.0002005101094479646, + "loss": 0.0, + "step": 26340 + }, + { + "epoch": 2.4578706727629003, + "grad_norm": NaN, + "learning_rate": 0.00020050298732588804, + "loss": 0.0, + "step": 26341 + }, + { + "epoch": 2.4579639824577773, + "grad_norm": NaN, + "learning_rate": 0.00020049586507539735, + "loss": 0.0, + "step": 26342 + }, + { + "epoch": 2.4580572921526547, + "grad_norm": NaN, + "learning_rate": 0.00020048874269651065, + "loss": 0.0, + "step": 26343 + }, + { + "epoch": 2.458150601847532, + "grad_norm": NaN, + "learning_rate": 0.000200481620189246, + "loss": 0.0, + "step": 26344 + }, + { + "epoch": 2.458243911542409, + "grad_norm": NaN, + "learning_rate": 0.00020047449755362159, + "loss": 0.0, + "step": 26345 + }, + { + "epoch": 2.4583372212372865, + "grad_norm": NaN, + "learning_rate": 0.00020046737478965544, + "loss": 0.0, + "step": 26346 + }, + { + "epoch": 2.458430530932164, + "grad_norm": NaN, + "learning_rate": 0.00020046025189736572, + "loss": 0.0, + "step": 26347 + }, + { + "epoch": 2.4585238406270413, + "grad_norm": NaN, + "learning_rate": 0.00020045312887677054, + "loss": 0.0, + "step": 26348 + }, + { + "epoch": 2.4586171503219183, + "grad_norm": NaN, + "learning_rate": 0.00020044600572788796, + "loss": 0.0, + "step": 26349 + }, + { + "epoch": 2.4587104600167957, + "grad_norm": NaN, + "learning_rate": 0.00020043888245073615, + "loss": 0.0, + "step": 26350 + }, + { + "epoch": 2.458803769711673, + "grad_norm": NaN, + "learning_rate": 0.00020043175904533325, + "loss": 0.0, + "step": 26351 + }, + { + "epoch": 2.45889707940655, + "grad_norm": NaN, + "learning_rate": 0.00020042463551169724, + "loss": 0.0, + "step": 26352 + }, + { + "epoch": 2.4589903891014275, + "grad_norm": NaN, + "learning_rate": 0.0002004175118498464, + "loss": 0.0, + "step": 26353 + }, + { + "epoch": 2.459083698796305, + "grad_norm": NaN, + "learning_rate": 0.00020041038805979872, + "loss": 0.0, + "step": 26354 + }, + { + "epoch": 2.4591770084911824, + "grad_norm": NaN, + "learning_rate": 0.00020040326414157232, + "loss": 0.0, + "step": 26355 + }, + { + "epoch": 2.4592703181860593, + "grad_norm": NaN, + "learning_rate": 0.00020039614009518546, + "loss": 0.0, + "step": 26356 + }, + { + "epoch": 2.4593636278809368, + "grad_norm": NaN, + "learning_rate": 0.00020038901592065605, + "loss": 0.0, + "step": 26357 + }, + { + "epoch": 2.459456937575814, + "grad_norm": NaN, + "learning_rate": 0.0002003818916180023, + "loss": 0.0, + "step": 26358 + }, + { + "epoch": 2.4595502472706916, + "grad_norm": NaN, + "learning_rate": 0.00020037476718724238, + "loss": 0.0, + "step": 26359 + }, + { + "epoch": 2.4596435569655686, + "grad_norm": NaN, + "learning_rate": 0.0002003676426283943, + "loss": 0.0, + "step": 26360 + }, + { + "epoch": 2.459736866660446, + "grad_norm": NaN, + "learning_rate": 0.00020036051794147623, + "loss": 0.0, + "step": 26361 + }, + { + "epoch": 2.4598301763553234, + "grad_norm": NaN, + "learning_rate": 0.00020035339312650632, + "loss": 0.0, + "step": 26362 + }, + { + "epoch": 2.459923486050201, + "grad_norm": NaN, + "learning_rate": 0.00020034626818350265, + "loss": 0.0, + "step": 26363 + }, + { + "epoch": 2.460016795745078, + "grad_norm": NaN, + "learning_rate": 0.0002003391431124833, + "loss": 0.0, + "step": 26364 + }, + { + "epoch": 2.4601101054399552, + "grad_norm": NaN, + "learning_rate": 0.00020033201791346642, + "loss": 0.0, + "step": 26365 + }, + { + "epoch": 2.4602034151348326, + "grad_norm": NaN, + "learning_rate": 0.00020032489258647014, + "loss": 0.0, + "step": 26366 + }, + { + "epoch": 2.4602967248297096, + "grad_norm": NaN, + "learning_rate": 0.00020031776713151254, + "loss": 0.0, + "step": 26367 + }, + { + "epoch": 2.460390034524587, + "grad_norm": NaN, + "learning_rate": 0.00020031064154861175, + "loss": 0.0, + "step": 26368 + }, + { + "epoch": 2.4604833442194645, + "grad_norm": NaN, + "learning_rate": 0.000200303515837786, + "loss": 0.0, + "step": 26369 + }, + { + "epoch": 2.460576653914342, + "grad_norm": NaN, + "learning_rate": 0.0002002963899990532, + "loss": 0.0, + "step": 26370 + }, + { + "epoch": 2.460669963609219, + "grad_norm": NaN, + "learning_rate": 0.0002002892640324316, + "loss": 0.0, + "step": 26371 + }, + { + "epoch": 2.4607632733040963, + "grad_norm": NaN, + "learning_rate": 0.00020028213793793937, + "loss": 0.0, + "step": 26372 + }, + { + "epoch": 2.4608565829989737, + "grad_norm": NaN, + "learning_rate": 0.0002002750117155945, + "loss": 0.0, + "step": 26373 + }, + { + "epoch": 2.4609498926938507, + "grad_norm": NaN, + "learning_rate": 0.00020026788536541514, + "loss": 0.0, + "step": 26374 + }, + { + "epoch": 2.461043202388728, + "grad_norm": NaN, + "learning_rate": 0.00020026075888741952, + "loss": 0.0, + "step": 26375 + }, + { + "epoch": 2.4611365120836055, + "grad_norm": NaN, + "learning_rate": 0.0002002536322816256, + "loss": 0.0, + "step": 26376 + }, + { + "epoch": 2.461229821778483, + "grad_norm": NaN, + "learning_rate": 0.00020024650554805162, + "loss": 0.0, + "step": 26377 + }, + { + "epoch": 2.46132313147336, + "grad_norm": NaN, + "learning_rate": 0.0002002393786867157, + "loss": 0.0, + "step": 26378 + }, + { + "epoch": 2.4614164411682373, + "grad_norm": NaN, + "learning_rate": 0.00020023225169763585, + "loss": 0.0, + "step": 26379 + }, + { + "epoch": 2.4615097508631147, + "grad_norm": NaN, + "learning_rate": 0.00020022512458083027, + "loss": 0.0, + "step": 26380 + }, + { + "epoch": 2.461603060557992, + "grad_norm": NaN, + "learning_rate": 0.0002002179973363171, + "loss": 0.0, + "step": 26381 + }, + { + "epoch": 2.461696370252869, + "grad_norm": NaN, + "learning_rate": 0.00020021086996411446, + "loss": 0.0, + "step": 26382 + }, + { + "epoch": 2.4617896799477466, + "grad_norm": NaN, + "learning_rate": 0.00020020374246424038, + "loss": 0.0, + "step": 26383 + }, + { + "epoch": 2.461882989642624, + "grad_norm": NaN, + "learning_rate": 0.00020019661483671312, + "loss": 0.0, + "step": 26384 + }, + { + "epoch": 2.4619762993375014, + "grad_norm": NaN, + "learning_rate": 0.00020018948708155074, + "loss": 0.0, + "step": 26385 + }, + { + "epoch": 2.4620696090323784, + "grad_norm": NaN, + "learning_rate": 0.00020018235919877131, + "loss": 0.0, + "step": 26386 + }, + { + "epoch": 2.462162918727256, + "grad_norm": NaN, + "learning_rate": 0.00020017523118839303, + "loss": 0.0, + "step": 26387 + }, + { + "epoch": 2.462256228422133, + "grad_norm": NaN, + "learning_rate": 0.00020016810305043402, + "loss": 0.0, + "step": 26388 + }, + { + "epoch": 2.46234953811701, + "grad_norm": NaN, + "learning_rate": 0.00020016097478491238, + "loss": 0.0, + "step": 26389 + }, + { + "epoch": 2.4624428478118876, + "grad_norm": NaN, + "learning_rate": 0.0002001538463918462, + "loss": 0.0, + "step": 26390 + }, + { + "epoch": 2.462536157506765, + "grad_norm": NaN, + "learning_rate": 0.0002001467178712537, + "loss": 0.0, + "step": 26391 + }, + { + "epoch": 2.4626294672016424, + "grad_norm": NaN, + "learning_rate": 0.0002001395892231529, + "loss": 0.0, + "step": 26392 + }, + { + "epoch": 2.4627227768965194, + "grad_norm": NaN, + "learning_rate": 0.00020013246044756202, + "loss": 0.0, + "step": 26393 + }, + { + "epoch": 2.462816086591397, + "grad_norm": NaN, + "learning_rate": 0.00020012533154449916, + "loss": 0.0, + "step": 26394 + }, + { + "epoch": 2.4629093962862743, + "grad_norm": NaN, + "learning_rate": 0.00020011820251398236, + "loss": 0.0, + "step": 26395 + }, + { + "epoch": 2.4630027059811512, + "grad_norm": NaN, + "learning_rate": 0.00020011107335602985, + "loss": 0.0, + "step": 26396 + }, + { + "epoch": 2.4630960156760286, + "grad_norm": NaN, + "learning_rate": 0.00020010394407065973, + "loss": 0.0, + "step": 26397 + }, + { + "epoch": 2.463189325370906, + "grad_norm": NaN, + "learning_rate": 0.00020009681465789014, + "loss": 0.0, + "step": 26398 + }, + { + "epoch": 2.4632826350657835, + "grad_norm": NaN, + "learning_rate": 0.00020008968511773913, + "loss": 0.0, + "step": 26399 + }, + { + "epoch": 2.4633759447606605, + "grad_norm": NaN, + "learning_rate": 0.00020008255545022496, + "loss": 0.0, + "step": 26400 + }, + { + "epoch": 2.463469254455538, + "grad_norm": NaN, + "learning_rate": 0.00020007542565536566, + "loss": 0.0, + "step": 26401 + }, + { + "epoch": 2.4635625641504153, + "grad_norm": NaN, + "learning_rate": 0.00020006829573317937, + "loss": 0.0, + "step": 26402 + }, + { + "epoch": 2.4636558738452923, + "grad_norm": NaN, + "learning_rate": 0.00020006116568368424, + "loss": 0.0, + "step": 26403 + }, + { + "epoch": 2.4637491835401697, + "grad_norm": NaN, + "learning_rate": 0.0002000540355068984, + "loss": 0.0, + "step": 26404 + }, + { + "epoch": 2.463842493235047, + "grad_norm": NaN, + "learning_rate": 0.00020004690520283996, + "loss": 0.0, + "step": 26405 + }, + { + "epoch": 2.4639358029299245, + "grad_norm": NaN, + "learning_rate": 0.0002000397747715271, + "loss": 0.0, + "step": 26406 + }, + { + "epoch": 2.464029112624802, + "grad_norm": NaN, + "learning_rate": 0.0002000326442129779, + "loss": 0.0, + "step": 26407 + }, + { + "epoch": 2.464122422319679, + "grad_norm": NaN, + "learning_rate": 0.00020002551352721052, + "loss": 0.0, + "step": 26408 + }, + { + "epoch": 2.4642157320145563, + "grad_norm": NaN, + "learning_rate": 0.00020001838271424302, + "loss": 0.0, + "step": 26409 + }, + { + "epoch": 2.4643090417094338, + "grad_norm": NaN, + "learning_rate": 0.00020001125177409365, + "loss": 0.0, + "step": 26410 + }, + { + "epoch": 2.4644023514043107, + "grad_norm": NaN, + "learning_rate": 0.00020000412070678047, + "loss": 0.0, + "step": 26411 + }, + { + "epoch": 2.464495661099188, + "grad_norm": NaN, + "learning_rate": 0.00019999698951232158, + "loss": 0.0, + "step": 26412 + }, + { + "epoch": 2.4645889707940656, + "grad_norm": NaN, + "learning_rate": 0.00019998985819073524, + "loss": 0.0, + "step": 26413 + }, + { + "epoch": 2.464682280488943, + "grad_norm": NaN, + "learning_rate": 0.00019998272674203943, + "loss": 0.0, + "step": 26414 + }, + { + "epoch": 2.46477559018382, + "grad_norm": NaN, + "learning_rate": 0.00019997559516625235, + "loss": 0.0, + "step": 26415 + }, + { + "epoch": 2.4648688998786974, + "grad_norm": NaN, + "learning_rate": 0.0001999684634633922, + "loss": 0.0, + "step": 26416 + }, + { + "epoch": 2.464962209573575, + "grad_norm": NaN, + "learning_rate": 0.000199961331633477, + "loss": 0.0, + "step": 26417 + }, + { + "epoch": 2.465055519268452, + "grad_norm": NaN, + "learning_rate": 0.00019995419967652492, + "loss": 0.0, + "step": 26418 + }, + { + "epoch": 2.465148828963329, + "grad_norm": NaN, + "learning_rate": 0.00019994706759255415, + "loss": 0.0, + "step": 26419 + }, + { + "epoch": 2.4652421386582066, + "grad_norm": NaN, + "learning_rate": 0.00019993993538158276, + "loss": 0.0, + "step": 26420 + }, + { + "epoch": 2.465335448353084, + "grad_norm": NaN, + "learning_rate": 0.00019993280304362888, + "loss": 0.0, + "step": 26421 + }, + { + "epoch": 2.465428758047961, + "grad_norm": NaN, + "learning_rate": 0.00019992567057871075, + "loss": 0.0, + "step": 26422 + }, + { + "epoch": 2.4655220677428384, + "grad_norm": NaN, + "learning_rate": 0.00019991853798684633, + "loss": 0.0, + "step": 26423 + }, + { + "epoch": 2.465615377437716, + "grad_norm": NaN, + "learning_rate": 0.00019991140526805392, + "loss": 0.0, + "step": 26424 + }, + { + "epoch": 2.465708687132593, + "grad_norm": NaN, + "learning_rate": 0.00019990427242235157, + "loss": 0.0, + "step": 26425 + }, + { + "epoch": 2.4658019968274703, + "grad_norm": NaN, + "learning_rate": 0.00019989713944975747, + "loss": 0.0, + "step": 26426 + }, + { + "epoch": 2.4658953065223477, + "grad_norm": NaN, + "learning_rate": 0.00019989000635028964, + "loss": 0.0, + "step": 26427 + }, + { + "epoch": 2.465988616217225, + "grad_norm": NaN, + "learning_rate": 0.00019988287312396635, + "loss": 0.0, + "step": 26428 + }, + { + "epoch": 2.4660819259121025, + "grad_norm": NaN, + "learning_rate": 0.00019987573977080578, + "loss": 0.0, + "step": 26429 + }, + { + "epoch": 2.4661752356069795, + "grad_norm": NaN, + "learning_rate": 0.00019986860629082588, + "loss": 0.0, + "step": 26430 + }, + { + "epoch": 2.466268545301857, + "grad_norm": NaN, + "learning_rate": 0.00019986147268404486, + "loss": 0.0, + "step": 26431 + }, + { + "epoch": 2.4663618549967343, + "grad_norm": NaN, + "learning_rate": 0.00019985433895048095, + "loss": 0.0, + "step": 26432 + }, + { + "epoch": 2.4664551646916113, + "grad_norm": NaN, + "learning_rate": 0.00019984720509015223, + "loss": 0.0, + "step": 26433 + }, + { + "epoch": 2.4665484743864887, + "grad_norm": NaN, + "learning_rate": 0.00019984007110307678, + "loss": 0.0, + "step": 26434 + }, + { + "epoch": 2.466641784081366, + "grad_norm": NaN, + "learning_rate": 0.00019983293698927284, + "loss": 0.0, + "step": 26435 + }, + { + "epoch": 2.4667350937762436, + "grad_norm": NaN, + "learning_rate": 0.0001998258027487585, + "loss": 0.0, + "step": 26436 + }, + { + "epoch": 2.4668284034711205, + "grad_norm": NaN, + "learning_rate": 0.00019981866838155184, + "loss": 0.0, + "step": 26437 + }, + { + "epoch": 2.466921713165998, + "grad_norm": NaN, + "learning_rate": 0.00019981153388767115, + "loss": 0.0, + "step": 26438 + }, + { + "epoch": 2.4670150228608754, + "grad_norm": NaN, + "learning_rate": 0.00019980439926713445, + "loss": 0.0, + "step": 26439 + }, + { + "epoch": 2.4671083325557523, + "grad_norm": NaN, + "learning_rate": 0.00019979726451995988, + "loss": 0.0, + "step": 26440 + }, + { + "epoch": 2.4672016422506298, + "grad_norm": NaN, + "learning_rate": 0.00019979012964616567, + "loss": 0.0, + "step": 26441 + }, + { + "epoch": 2.467294951945507, + "grad_norm": NaN, + "learning_rate": 0.0001997829946457699, + "loss": 0.0, + "step": 26442 + }, + { + "epoch": 2.4673882616403846, + "grad_norm": NaN, + "learning_rate": 0.00019977585951879066, + "loss": 0.0, + "step": 26443 + }, + { + "epoch": 2.4674815713352616, + "grad_norm": NaN, + "learning_rate": 0.00019976872426524618, + "loss": 0.0, + "step": 26444 + }, + { + "epoch": 2.467574881030139, + "grad_norm": NaN, + "learning_rate": 0.00019976158888515466, + "loss": 0.0, + "step": 26445 + }, + { + "epoch": 2.4676681907250164, + "grad_norm": NaN, + "learning_rate": 0.00019975445337853408, + "loss": 0.0, + "step": 26446 + }, + { + "epoch": 2.4677615004198934, + "grad_norm": NaN, + "learning_rate": 0.00019974731774540267, + "loss": 0.0, + "step": 26447 + }, + { + "epoch": 2.467854810114771, + "grad_norm": NaN, + "learning_rate": 0.00019974018198577861, + "loss": 0.0, + "step": 26448 + }, + { + "epoch": 2.4679481198096482, + "grad_norm": NaN, + "learning_rate": 0.00019973304609967993, + "loss": 0.0, + "step": 26449 + }, + { + "epoch": 2.4680414295045257, + "grad_norm": NaN, + "learning_rate": 0.0001997259100871249, + "loss": 0.0, + "step": 26450 + }, + { + "epoch": 2.4681347391994026, + "grad_norm": NaN, + "learning_rate": 0.00019971877394813162, + "loss": 0.0, + "step": 26451 + }, + { + "epoch": 2.46822804889428, + "grad_norm": NaN, + "learning_rate": 0.0001997116376827182, + "loss": 0.0, + "step": 26452 + }, + { + "epoch": 2.4683213585891575, + "grad_norm": NaN, + "learning_rate": 0.0001997045012909028, + "loss": 0.0, + "step": 26453 + }, + { + "epoch": 2.468414668284035, + "grad_norm": NaN, + "learning_rate": 0.0001996973647727036, + "loss": 0.0, + "step": 26454 + }, + { + "epoch": 2.468507977978912, + "grad_norm": NaN, + "learning_rate": 0.00019969022812813873, + "loss": 0.0, + "step": 26455 + }, + { + "epoch": 2.4686012876737893, + "grad_norm": NaN, + "learning_rate": 0.0001996830913572263, + "loss": 0.0, + "step": 26456 + }, + { + "epoch": 2.4686945973686667, + "grad_norm": NaN, + "learning_rate": 0.00019967595445998453, + "loss": 0.0, + "step": 26457 + }, + { + "epoch": 2.468787907063544, + "grad_norm": NaN, + "learning_rate": 0.00019966881743643146, + "loss": 0.0, + "step": 26458 + }, + { + "epoch": 2.468881216758421, + "grad_norm": NaN, + "learning_rate": 0.00019966168028658532, + "loss": 0.0, + "step": 26459 + }, + { + "epoch": 2.4689745264532985, + "grad_norm": NaN, + "learning_rate": 0.00019965454301046424, + "loss": 0.0, + "step": 26460 + }, + { + "epoch": 2.469067836148176, + "grad_norm": NaN, + "learning_rate": 0.0001996474056080864, + "loss": 0.0, + "step": 26461 + }, + { + "epoch": 2.469161145843053, + "grad_norm": NaN, + "learning_rate": 0.00019964026807946987, + "loss": 0.0, + "step": 26462 + }, + { + "epoch": 2.4692544555379303, + "grad_norm": NaN, + "learning_rate": 0.00019963313042463285, + "loss": 0.0, + "step": 26463 + }, + { + "epoch": 2.4693477652328077, + "grad_norm": NaN, + "learning_rate": 0.00019962599264359353, + "loss": 0.0, + "step": 26464 + }, + { + "epoch": 2.469441074927685, + "grad_norm": NaN, + "learning_rate": 0.00019961885473636994, + "loss": 0.0, + "step": 26465 + }, + { + "epoch": 2.469534384622562, + "grad_norm": NaN, + "learning_rate": 0.00019961171670298034, + "loss": 0.0, + "step": 26466 + }, + { + "epoch": 2.4696276943174396, + "grad_norm": NaN, + "learning_rate": 0.00019960457854344287, + "loss": 0.0, + "step": 26467 + }, + { + "epoch": 2.469721004012317, + "grad_norm": NaN, + "learning_rate": 0.0001995974402577756, + "loss": 0.0, + "step": 26468 + }, + { + "epoch": 2.469814313707194, + "grad_norm": NaN, + "learning_rate": 0.00019959030184599672, + "loss": 0.0, + "step": 26469 + }, + { + "epoch": 2.4699076234020714, + "grad_norm": NaN, + "learning_rate": 0.00019958316330812446, + "loss": 0.0, + "step": 26470 + }, + { + "epoch": 2.470000933096949, + "grad_norm": NaN, + "learning_rate": 0.00019957602464417686, + "loss": 0.0, + "step": 26471 + }, + { + "epoch": 2.470094242791826, + "grad_norm": NaN, + "learning_rate": 0.0001995688858541721, + "loss": 0.0, + "step": 26472 + }, + { + "epoch": 2.470187552486703, + "grad_norm": NaN, + "learning_rate": 0.0001995617469381284, + "loss": 0.0, + "step": 26473 + }, + { + "epoch": 2.4702808621815806, + "grad_norm": NaN, + "learning_rate": 0.00019955460789606376, + "loss": 0.0, + "step": 26474 + }, + { + "epoch": 2.470374171876458, + "grad_norm": NaN, + "learning_rate": 0.00019954746872799653, + "loss": 0.0, + "step": 26475 + }, + { + "epoch": 2.4704674815713354, + "grad_norm": NaN, + "learning_rate": 0.00019954032943394475, + "loss": 0.0, + "step": 26476 + }, + { + "epoch": 2.4705607912662124, + "grad_norm": NaN, + "learning_rate": 0.00019953319001392658, + "loss": 0.0, + "step": 26477 + }, + { + "epoch": 2.47065410096109, + "grad_norm": NaN, + "learning_rate": 0.0001995260504679601, + "loss": 0.0, + "step": 26478 + }, + { + "epoch": 2.4707474106559673, + "grad_norm": NaN, + "learning_rate": 0.00019951891079606366, + "loss": 0.0, + "step": 26479 + }, + { + "epoch": 2.4708407203508447, + "grad_norm": NaN, + "learning_rate": 0.00019951177099825524, + "loss": 0.0, + "step": 26480 + }, + { + "epoch": 2.4709340300457217, + "grad_norm": NaN, + "learning_rate": 0.00019950463107455303, + "loss": 0.0, + "step": 26481 + }, + { + "epoch": 2.471027339740599, + "grad_norm": NaN, + "learning_rate": 0.00019949749102497524, + "loss": 0.0, + "step": 26482 + }, + { + "epoch": 2.4711206494354765, + "grad_norm": NaN, + "learning_rate": 0.00019949035084954004, + "loss": 0.0, + "step": 26483 + }, + { + "epoch": 2.4712139591303535, + "grad_norm": NaN, + "learning_rate": 0.00019948321054826548, + "loss": 0.0, + "step": 26484 + }, + { + "epoch": 2.471307268825231, + "grad_norm": NaN, + "learning_rate": 0.00019947607012116977, + "loss": 0.0, + "step": 26485 + }, + { + "epoch": 2.4714005785201083, + "grad_norm": NaN, + "learning_rate": 0.0001994689295682711, + "loss": 0.0, + "step": 26486 + }, + { + "epoch": 2.4714938882149857, + "grad_norm": NaN, + "learning_rate": 0.00019946178888958756, + "loss": 0.0, + "step": 26487 + }, + { + "epoch": 2.4715871979098627, + "grad_norm": NaN, + "learning_rate": 0.00019945464808513737, + "loss": 0.0, + "step": 26488 + }, + { + "epoch": 2.47168050760474, + "grad_norm": NaN, + "learning_rate": 0.00019944750715493868, + "loss": 0.0, + "step": 26489 + }, + { + "epoch": 2.4717738172996175, + "grad_norm": NaN, + "learning_rate": 0.00019944036609900956, + "loss": 0.0, + "step": 26490 + }, + { + "epoch": 2.4718671269944945, + "grad_norm": NaN, + "learning_rate": 0.00019943322491736829, + "loss": 0.0, + "step": 26491 + }, + { + "epoch": 2.471960436689372, + "grad_norm": NaN, + "learning_rate": 0.00019942608361003298, + "loss": 0.0, + "step": 26492 + }, + { + "epoch": 2.4720537463842494, + "grad_norm": NaN, + "learning_rate": 0.0001994189421770217, + "loss": 0.0, + "step": 26493 + }, + { + "epoch": 2.4721470560791268, + "grad_norm": NaN, + "learning_rate": 0.00019941180061835276, + "loss": 0.0, + "step": 26494 + }, + { + "epoch": 2.4722403657740037, + "grad_norm": NaN, + "learning_rate": 0.00019940465893404427, + "loss": 0.0, + "step": 26495 + }, + { + "epoch": 2.472333675468881, + "grad_norm": NaN, + "learning_rate": 0.0001993975171241143, + "loss": 0.0, + "step": 26496 + }, + { + "epoch": 2.4724269851637586, + "grad_norm": NaN, + "learning_rate": 0.0001993903751885811, + "loss": 0.0, + "step": 26497 + }, + { + "epoch": 2.472520294858636, + "grad_norm": NaN, + "learning_rate": 0.00019938323312746283, + "loss": 0.0, + "step": 26498 + }, + { + "epoch": 2.472613604553513, + "grad_norm": NaN, + "learning_rate": 0.0001993760909407776, + "loss": 0.0, + "step": 26499 + }, + { + "epoch": 2.4727069142483904, + "grad_norm": NaN, + "learning_rate": 0.00019936894862854362, + "loss": 0.0, + "step": 26500 + }, + { + "epoch": 2.472800223943268, + "grad_norm": NaN, + "learning_rate": 0.00019936180619077902, + "loss": 0.0, + "step": 26501 + }, + { + "epoch": 2.4728935336381452, + "grad_norm": NaN, + "learning_rate": 0.000199354663627502, + "loss": 0.0, + "step": 26502 + }, + { + "epoch": 2.472986843333022, + "grad_norm": NaN, + "learning_rate": 0.0001993475209387306, + "loss": 0.0, + "step": 26503 + }, + { + "epoch": 2.4730801530278996, + "grad_norm": NaN, + "learning_rate": 0.00019934037812448314, + "loss": 0.0, + "step": 26504 + }, + { + "epoch": 2.473173462722777, + "grad_norm": NaN, + "learning_rate": 0.0001993332351847777, + "loss": 0.0, + "step": 26505 + }, + { + "epoch": 2.473266772417654, + "grad_norm": NaN, + "learning_rate": 0.00019932609211963245, + "loss": 0.0, + "step": 26506 + }, + { + "epoch": 2.4733600821125314, + "grad_norm": NaN, + "learning_rate": 0.00019931894892906557, + "loss": 0.0, + "step": 26507 + }, + { + "epoch": 2.473453391807409, + "grad_norm": NaN, + "learning_rate": 0.00019931180561309523, + "loss": 0.0, + "step": 26508 + }, + { + "epoch": 2.4735467015022863, + "grad_norm": NaN, + "learning_rate": 0.0001993046621717395, + "loss": 0.0, + "step": 26509 + }, + { + "epoch": 2.4736400111971633, + "grad_norm": NaN, + "learning_rate": 0.0001992975186050167, + "loss": 0.0, + "step": 26510 + }, + { + "epoch": 2.4737333208920407, + "grad_norm": NaN, + "learning_rate": 0.0001992903749129449, + "loss": 0.0, + "step": 26511 + }, + { + "epoch": 2.473826630586918, + "grad_norm": NaN, + "learning_rate": 0.00019928323109554225, + "loss": 0.0, + "step": 26512 + }, + { + "epoch": 2.473919940281795, + "grad_norm": NaN, + "learning_rate": 0.00019927608715282694, + "loss": 0.0, + "step": 26513 + }, + { + "epoch": 2.4740132499766725, + "grad_norm": NaN, + "learning_rate": 0.00019926894308481717, + "loss": 0.0, + "step": 26514 + }, + { + "epoch": 2.47410655967155, + "grad_norm": NaN, + "learning_rate": 0.00019926179889153103, + "loss": 0.0, + "step": 26515 + }, + { + "epoch": 2.4741998693664273, + "grad_norm": NaN, + "learning_rate": 0.00019925465457298677, + "loss": 0.0, + "step": 26516 + }, + { + "epoch": 2.4742931790613043, + "grad_norm": NaN, + "learning_rate": 0.00019924751012920247, + "loss": 0.0, + "step": 26517 + }, + { + "epoch": 2.4743864887561817, + "grad_norm": NaN, + "learning_rate": 0.00019924036556019637, + "loss": 0.0, + "step": 26518 + }, + { + "epoch": 2.474479798451059, + "grad_norm": NaN, + "learning_rate": 0.0001992332208659866, + "loss": 0.0, + "step": 26519 + }, + { + "epoch": 2.474573108145936, + "grad_norm": NaN, + "learning_rate": 0.00019922607604659135, + "loss": 0.0, + "step": 26520 + }, + { + "epoch": 2.4746664178408135, + "grad_norm": NaN, + "learning_rate": 0.0001992189311020288, + "loss": 0.0, + "step": 26521 + }, + { + "epoch": 2.474759727535691, + "grad_norm": NaN, + "learning_rate": 0.000199211786032317, + "loss": 0.0, + "step": 26522 + }, + { + "epoch": 2.4748530372305684, + "grad_norm": NaN, + "learning_rate": 0.00019920464083747423, + "loss": 0.0, + "step": 26523 + }, + { + "epoch": 2.474946346925446, + "grad_norm": NaN, + "learning_rate": 0.0001991974955175187, + "loss": 0.0, + "step": 26524 + }, + { + "epoch": 2.4750396566203228, + "grad_norm": NaN, + "learning_rate": 0.00019919035007246843, + "loss": 0.0, + "step": 26525 + }, + { + "epoch": 2.4751329663152, + "grad_norm": NaN, + "learning_rate": 0.0001991832045023417, + "loss": 0.0, + "step": 26526 + }, + { + "epoch": 2.4752262760100776, + "grad_norm": NaN, + "learning_rate": 0.0001991760588071567, + "loss": 0.0, + "step": 26527 + }, + { + "epoch": 2.4753195857049546, + "grad_norm": NaN, + "learning_rate": 0.00019916891298693147, + "loss": 0.0, + "step": 26528 + }, + { + "epoch": 2.475412895399832, + "grad_norm": NaN, + "learning_rate": 0.00019916176704168432, + "loss": 0.0, + "step": 26529 + }, + { + "epoch": 2.4755062050947094, + "grad_norm": NaN, + "learning_rate": 0.00019915462097143334, + "loss": 0.0, + "step": 26530 + }, + { + "epoch": 2.475599514789587, + "grad_norm": NaN, + "learning_rate": 0.0001991474747761967, + "loss": 0.0, + "step": 26531 + }, + { + "epoch": 2.475692824484464, + "grad_norm": NaN, + "learning_rate": 0.0001991403284559926, + "loss": 0.0, + "step": 26532 + }, + { + "epoch": 2.4757861341793412, + "grad_norm": NaN, + "learning_rate": 0.00019913318201083923, + "loss": 0.0, + "step": 26533 + }, + { + "epoch": 2.4758794438742187, + "grad_norm": NaN, + "learning_rate": 0.00019912603544075468, + "loss": 0.0, + "step": 26534 + }, + { + "epoch": 2.4759727535690956, + "grad_norm": NaN, + "learning_rate": 0.00019911888874575722, + "loss": 0.0, + "step": 26535 + }, + { + "epoch": 2.476066063263973, + "grad_norm": NaN, + "learning_rate": 0.00019911174192586493, + "loss": 0.0, + "step": 26536 + }, + { + "epoch": 2.4761593729588505, + "grad_norm": NaN, + "learning_rate": 0.00019910459498109604, + "loss": 0.0, + "step": 26537 + }, + { + "epoch": 2.476252682653728, + "grad_norm": NaN, + "learning_rate": 0.00019909744791146876, + "loss": 0.0, + "step": 26538 + }, + { + "epoch": 2.476345992348605, + "grad_norm": NaN, + "learning_rate": 0.00019909030071700118, + "loss": 0.0, + "step": 26539 + }, + { + "epoch": 2.4764393020434823, + "grad_norm": NaN, + "learning_rate": 0.0001990831533977115, + "loss": 0.0, + "step": 26540 + }, + { + "epoch": 2.4765326117383597, + "grad_norm": NaN, + "learning_rate": 0.0001990760059536179, + "loss": 0.0, + "step": 26541 + }, + { + "epoch": 2.4766259214332367, + "grad_norm": NaN, + "learning_rate": 0.00019906885838473859, + "loss": 0.0, + "step": 26542 + }, + { + "epoch": 2.476719231128114, + "grad_norm": NaN, + "learning_rate": 0.0001990617106910917, + "loss": 0.0, + "step": 26543 + }, + { + "epoch": 2.4768125408229915, + "grad_norm": NaN, + "learning_rate": 0.00019905456287269536, + "loss": 0.0, + "step": 26544 + }, + { + "epoch": 2.476905850517869, + "grad_norm": NaN, + "learning_rate": 0.00019904741492956783, + "loss": 0.0, + "step": 26545 + }, + { + "epoch": 2.4769991602127464, + "grad_norm": NaN, + "learning_rate": 0.0001990402668617273, + "loss": 0.0, + "step": 26546 + }, + { + "epoch": 2.4770924699076233, + "grad_norm": NaN, + "learning_rate": 0.0001990331186691918, + "loss": 0.0, + "step": 26547 + }, + { + "epoch": 2.4771857796025007, + "grad_norm": NaN, + "learning_rate": 0.00019902597035197965, + "loss": 0.0, + "step": 26548 + }, + { + "epoch": 2.477279089297378, + "grad_norm": NaN, + "learning_rate": 0.00019901882191010903, + "loss": 0.0, + "step": 26549 + }, + { + "epoch": 2.477372398992255, + "grad_norm": NaN, + "learning_rate": 0.000199011673343598, + "loss": 0.0, + "step": 26550 + }, + { + "epoch": 2.4774657086871326, + "grad_norm": NaN, + "learning_rate": 0.0001990045246524648, + "loss": 0.0, + "step": 26551 + }, + { + "epoch": 2.47755901838201, + "grad_norm": NaN, + "learning_rate": 0.0001989973758367277, + "loss": 0.0, + "step": 26552 + }, + { + "epoch": 2.4776523280768874, + "grad_norm": NaN, + "learning_rate": 0.00019899022689640468, + "loss": 0.0, + "step": 26553 + }, + { + "epoch": 2.4777456377717644, + "grad_norm": NaN, + "learning_rate": 0.00019898307783151406, + "loss": 0.0, + "step": 26554 + }, + { + "epoch": 2.477838947466642, + "grad_norm": NaN, + "learning_rate": 0.000198975928642074, + "loss": 0.0, + "step": 26555 + }, + { + "epoch": 2.477932257161519, + "grad_norm": NaN, + "learning_rate": 0.00019896877932810264, + "loss": 0.0, + "step": 26556 + }, + { + "epoch": 2.478025566856396, + "grad_norm": NaN, + "learning_rate": 0.0001989616298896182, + "loss": 0.0, + "step": 26557 + }, + { + "epoch": 2.4781188765512736, + "grad_norm": NaN, + "learning_rate": 0.0001989544803266388, + "loss": 0.0, + "step": 26558 + }, + { + "epoch": 2.478212186246151, + "grad_norm": NaN, + "learning_rate": 0.00019894733063918274, + "loss": 0.0, + "step": 26559 + }, + { + "epoch": 2.4783054959410284, + "grad_norm": NaN, + "learning_rate": 0.00019894018082726805, + "loss": 0.0, + "step": 26560 + }, + { + "epoch": 2.4783988056359054, + "grad_norm": NaN, + "learning_rate": 0.00019893303089091297, + "loss": 0.0, + "step": 26561 + }, + { + "epoch": 2.478492115330783, + "grad_norm": NaN, + "learning_rate": 0.00019892588083013571, + "loss": 0.0, + "step": 26562 + }, + { + "epoch": 2.4785854250256603, + "grad_norm": NaN, + "learning_rate": 0.00019891873064495445, + "loss": 0.0, + "step": 26563 + }, + { + "epoch": 2.4786787347205372, + "grad_norm": NaN, + "learning_rate": 0.00019891158033538735, + "loss": 0.0, + "step": 26564 + }, + { + "epoch": 2.4787720444154147, + "grad_norm": NaN, + "learning_rate": 0.0001989044299014526, + "loss": 0.0, + "step": 26565 + }, + { + "epoch": 2.478865354110292, + "grad_norm": NaN, + "learning_rate": 0.00019889727934316827, + "loss": 0.0, + "step": 26566 + }, + { + "epoch": 2.4789586638051695, + "grad_norm": NaN, + "learning_rate": 0.00019889012866055274, + "loss": 0.0, + "step": 26567 + }, + { + "epoch": 2.4790519735000465, + "grad_norm": NaN, + "learning_rate": 0.00019888297785362408, + "loss": 0.0, + "step": 26568 + }, + { + "epoch": 2.479145283194924, + "grad_norm": NaN, + "learning_rate": 0.00019887582692240045, + "loss": 0.0, + "step": 26569 + }, + { + "epoch": 2.4792385928898013, + "grad_norm": NaN, + "learning_rate": 0.0001988686758669001, + "loss": 0.0, + "step": 26570 + }, + { + "epoch": 2.4793319025846787, + "grad_norm": NaN, + "learning_rate": 0.0001988615246871412, + "loss": 0.0, + "step": 26571 + }, + { + "epoch": 2.4794252122795557, + "grad_norm": NaN, + "learning_rate": 0.00019885437338314186, + "loss": 0.0, + "step": 26572 + }, + { + "epoch": 2.479518521974433, + "grad_norm": NaN, + "learning_rate": 0.00019884722195492036, + "loss": 0.0, + "step": 26573 + }, + { + "epoch": 2.4796118316693105, + "grad_norm": NaN, + "learning_rate": 0.00019884007040249483, + "loss": 0.0, + "step": 26574 + }, + { + "epoch": 2.479705141364188, + "grad_norm": NaN, + "learning_rate": 0.00019883291872588348, + "loss": 0.0, + "step": 26575 + }, + { + "epoch": 2.479798451059065, + "grad_norm": NaN, + "learning_rate": 0.0001988257669251045, + "loss": 0.0, + "step": 26576 + }, + { + "epoch": 2.4798917607539424, + "grad_norm": NaN, + "learning_rate": 0.00019881861500017605, + "loss": 0.0, + "step": 26577 + }, + { + "epoch": 2.4799850704488198, + "grad_norm": NaN, + "learning_rate": 0.0001988114629511163, + "loss": 0.0, + "step": 26578 + }, + { + "epoch": 2.4800783801436967, + "grad_norm": NaN, + "learning_rate": 0.00019880431077794344, + "loss": 0.0, + "step": 26579 + }, + { + "epoch": 2.480171689838574, + "grad_norm": NaN, + "learning_rate": 0.0001987971584806757, + "loss": 0.0, + "step": 26580 + }, + { + "epoch": 2.4802649995334516, + "grad_norm": NaN, + "learning_rate": 0.00019879000605933127, + "loss": 0.0, + "step": 26581 + }, + { + "epoch": 2.480358309228329, + "grad_norm": NaN, + "learning_rate": 0.00019878285351392828, + "loss": 0.0, + "step": 26582 + }, + { + "epoch": 2.480451618923206, + "grad_norm": NaN, + "learning_rate": 0.00019877570084448495, + "loss": 0.0, + "step": 26583 + }, + { + "epoch": 2.4805449286180834, + "grad_norm": NaN, + "learning_rate": 0.00019876854805101946, + "loss": 0.0, + "step": 26584 + }, + { + "epoch": 2.480638238312961, + "grad_norm": NaN, + "learning_rate": 0.00019876139513355, + "loss": 0.0, + "step": 26585 + }, + { + "epoch": 2.480731548007838, + "grad_norm": NaN, + "learning_rate": 0.00019875424209209473, + "loss": 0.0, + "step": 26586 + }, + { + "epoch": 2.480824857702715, + "grad_norm": NaN, + "learning_rate": 0.00019874708892667195, + "loss": 0.0, + "step": 26587 + }, + { + "epoch": 2.4809181673975926, + "grad_norm": NaN, + "learning_rate": 0.00019873993563729965, + "loss": 0.0, + "step": 26588 + }, + { + "epoch": 2.48101147709247, + "grad_norm": NaN, + "learning_rate": 0.0001987327822239962, + "loss": 0.0, + "step": 26589 + }, + { + "epoch": 2.481104786787347, + "grad_norm": NaN, + "learning_rate": 0.0001987256286867797, + "loss": 0.0, + "step": 26590 + }, + { + "epoch": 2.4811980964822244, + "grad_norm": NaN, + "learning_rate": 0.00019871847502566834, + "loss": 0.0, + "step": 26591 + }, + { + "epoch": 2.481291406177102, + "grad_norm": NaN, + "learning_rate": 0.00019871132124068034, + "loss": 0.0, + "step": 26592 + }, + { + "epoch": 2.4813847158719793, + "grad_norm": NaN, + "learning_rate": 0.0001987041673318339, + "loss": 0.0, + "step": 26593 + }, + { + "epoch": 2.4814780255668563, + "grad_norm": NaN, + "learning_rate": 0.00019869701329914717, + "loss": 0.0, + "step": 26594 + }, + { + "epoch": 2.4815713352617337, + "grad_norm": NaN, + "learning_rate": 0.00019868985914263837, + "loss": 0.0, + "step": 26595 + }, + { + "epoch": 2.481664644956611, + "grad_norm": NaN, + "learning_rate": 0.0001986827048623257, + "loss": 0.0, + "step": 26596 + }, + { + "epoch": 2.4817579546514885, + "grad_norm": NaN, + "learning_rate": 0.00019867555045822733, + "loss": 0.0, + "step": 26597 + }, + { + "epoch": 2.4818512643463655, + "grad_norm": NaN, + "learning_rate": 0.00019866839593036143, + "loss": 0.0, + "step": 26598 + }, + { + "epoch": 2.481944574041243, + "grad_norm": NaN, + "learning_rate": 0.0001986612412787462, + "loss": 0.0, + "step": 26599 + }, + { + "epoch": 2.4820378837361203, + "grad_norm": NaN, + "learning_rate": 0.0001986540865033999, + "loss": 0.0, + "step": 26600 + }, + { + "epoch": 2.4821311934309973, + "grad_norm": NaN, + "learning_rate": 0.00019864693160434062, + "loss": 0.0, + "step": 26601 + }, + { + "epoch": 2.4822245031258747, + "grad_norm": NaN, + "learning_rate": 0.00019863977658158665, + "loss": 0.0, + "step": 26602 + }, + { + "epoch": 2.482317812820752, + "grad_norm": NaN, + "learning_rate": 0.0001986326214351561, + "loss": 0.0, + "step": 26603 + }, + { + "epoch": 2.4824111225156296, + "grad_norm": NaN, + "learning_rate": 0.0001986254661650672, + "loss": 0.0, + "step": 26604 + }, + { + "epoch": 2.4825044322105065, + "grad_norm": NaN, + "learning_rate": 0.00019861831077133818, + "loss": 0.0, + "step": 26605 + }, + { + "epoch": 2.482597741905384, + "grad_norm": NaN, + "learning_rate": 0.00019861115525398717, + "loss": 0.0, + "step": 26606 + }, + { + "epoch": 2.4826910516002614, + "grad_norm": NaN, + "learning_rate": 0.0001986039996130324, + "loss": 0.0, + "step": 26607 + }, + { + "epoch": 2.4827843612951384, + "grad_norm": NaN, + "learning_rate": 0.00019859684384849205, + "loss": 0.0, + "step": 26608 + }, + { + "epoch": 2.4828776709900158, + "grad_norm": NaN, + "learning_rate": 0.00019858968796038434, + "loss": 0.0, + "step": 26609 + }, + { + "epoch": 2.482970980684893, + "grad_norm": NaN, + "learning_rate": 0.0001985825319487274, + "loss": 0.0, + "step": 26610 + }, + { + "epoch": 2.4830642903797706, + "grad_norm": NaN, + "learning_rate": 0.0001985753758135395, + "loss": 0.0, + "step": 26611 + }, + { + "epoch": 2.4831576000746476, + "grad_norm": NaN, + "learning_rate": 0.00019856821955483881, + "loss": 0.0, + "step": 26612 + }, + { + "epoch": 2.483250909769525, + "grad_norm": NaN, + "learning_rate": 0.00019856106317264356, + "loss": 0.0, + "step": 26613 + }, + { + "epoch": 2.4833442194644024, + "grad_norm": NaN, + "learning_rate": 0.00019855390666697186, + "loss": 0.0, + "step": 26614 + }, + { + "epoch": 2.4834375291592794, + "grad_norm": NaN, + "learning_rate": 0.00019854675003784197, + "loss": 0.0, + "step": 26615 + }, + { + "epoch": 2.483530838854157, + "grad_norm": NaN, + "learning_rate": 0.00019853959328527206, + "loss": 0.0, + "step": 26616 + }, + { + "epoch": 2.4836241485490342, + "grad_norm": NaN, + "learning_rate": 0.0001985324364092804, + "loss": 0.0, + "step": 26617 + }, + { + "epoch": 2.4837174582439117, + "grad_norm": NaN, + "learning_rate": 0.0001985252794098851, + "loss": 0.0, + "step": 26618 + }, + { + "epoch": 2.483810767938789, + "grad_norm": NaN, + "learning_rate": 0.00019851812228710436, + "loss": 0.0, + "step": 26619 + }, + { + "epoch": 2.483904077633666, + "grad_norm": NaN, + "learning_rate": 0.0001985109650409564, + "loss": 0.0, + "step": 26620 + }, + { + "epoch": 2.4839973873285435, + "grad_norm": NaN, + "learning_rate": 0.0001985038076714595, + "loss": 0.0, + "step": 26621 + }, + { + "epoch": 2.484090697023421, + "grad_norm": NaN, + "learning_rate": 0.0001984966501786317, + "loss": 0.0, + "step": 26622 + }, + { + "epoch": 2.484184006718298, + "grad_norm": NaN, + "learning_rate": 0.00019848949256249135, + "loss": 0.0, + "step": 26623 + }, + { + "epoch": 2.4842773164131753, + "grad_norm": NaN, + "learning_rate": 0.00019848233482305656, + "loss": 0.0, + "step": 26624 + }, + { + "epoch": 2.4843706261080527, + "grad_norm": NaN, + "learning_rate": 0.00019847517696034556, + "loss": 0.0, + "step": 26625 + }, + { + "epoch": 2.48446393580293, + "grad_norm": NaN, + "learning_rate": 0.00019846801897437652, + "loss": 0.0, + "step": 26626 + }, + { + "epoch": 2.484557245497807, + "grad_norm": NaN, + "learning_rate": 0.0001984608608651677, + "loss": 0.0, + "step": 26627 + }, + { + "epoch": 2.4846505551926845, + "grad_norm": NaN, + "learning_rate": 0.00019845370263273725, + "loss": 0.0, + "step": 26628 + }, + { + "epoch": 2.484743864887562, + "grad_norm": NaN, + "learning_rate": 0.00019844654427710336, + "loss": 0.0, + "step": 26629 + }, + { + "epoch": 2.484837174582439, + "grad_norm": NaN, + "learning_rate": 0.00019843938579828429, + "loss": 0.0, + "step": 26630 + }, + { + "epoch": 2.4849304842773163, + "grad_norm": NaN, + "learning_rate": 0.0001984322271962982, + "loss": 0.0, + "step": 26631 + }, + { + "epoch": 2.4850237939721938, + "grad_norm": NaN, + "learning_rate": 0.00019842506847116332, + "loss": 0.0, + "step": 26632 + }, + { + "epoch": 2.485117103667071, + "grad_norm": NaN, + "learning_rate": 0.00019841790962289783, + "loss": 0.0, + "step": 26633 + }, + { + "epoch": 2.485210413361948, + "grad_norm": NaN, + "learning_rate": 0.0001984107506515199, + "loss": 0.0, + "step": 26634 + }, + { + "epoch": 2.4853037230568256, + "grad_norm": NaN, + "learning_rate": 0.0001984035915570478, + "loss": 0.0, + "step": 26635 + }, + { + "epoch": 2.485397032751703, + "grad_norm": NaN, + "learning_rate": 0.00019839643233949972, + "loss": 0.0, + "step": 26636 + }, + { + "epoch": 2.48549034244658, + "grad_norm": NaN, + "learning_rate": 0.00019838927299889388, + "loss": 0.0, + "step": 26637 + }, + { + "epoch": 2.4855836521414574, + "grad_norm": NaN, + "learning_rate": 0.0001983821135352484, + "loss": 0.0, + "step": 26638 + }, + { + "epoch": 2.485676961836335, + "grad_norm": NaN, + "learning_rate": 0.00019837495394858153, + "loss": 0.0, + "step": 26639 + }, + { + "epoch": 2.485770271531212, + "grad_norm": NaN, + "learning_rate": 0.0001983677942389115, + "loss": 0.0, + "step": 26640 + }, + { + "epoch": 2.4858635812260896, + "grad_norm": NaN, + "learning_rate": 0.00019836063440625647, + "loss": 0.0, + "step": 26641 + }, + { + "epoch": 2.4859568909209666, + "grad_norm": NaN, + "learning_rate": 0.00019835347445063472, + "loss": 0.0, + "step": 26642 + }, + { + "epoch": 2.486050200615844, + "grad_norm": NaN, + "learning_rate": 0.00019834631437206437, + "loss": 0.0, + "step": 26643 + }, + { + "epoch": 2.4861435103107214, + "grad_norm": NaN, + "learning_rate": 0.0001983391541705637, + "loss": 0.0, + "step": 26644 + }, + { + "epoch": 2.4862368200055984, + "grad_norm": NaN, + "learning_rate": 0.00019833199384615083, + "loss": 0.0, + "step": 26645 + }, + { + "epoch": 2.486330129700476, + "grad_norm": NaN, + "learning_rate": 0.00019832483339884404, + "loss": 0.0, + "step": 26646 + }, + { + "epoch": 2.4864234393953533, + "grad_norm": NaN, + "learning_rate": 0.00019831767282866157, + "loss": 0.0, + "step": 26647 + }, + { + "epoch": 2.4865167490902307, + "grad_norm": NaN, + "learning_rate": 0.0001983105121356215, + "loss": 0.0, + "step": 26648 + }, + { + "epoch": 2.4866100587851077, + "grad_norm": NaN, + "learning_rate": 0.0001983033513197421, + "loss": 0.0, + "step": 26649 + }, + { + "epoch": 2.486703368479985, + "grad_norm": NaN, + "learning_rate": 0.00019829619038104162, + "loss": 0.0, + "step": 26650 + }, + { + "epoch": 2.4867966781748625, + "grad_norm": NaN, + "learning_rate": 0.00019828902931953824, + "loss": 0.0, + "step": 26651 + }, + { + "epoch": 2.4868899878697395, + "grad_norm": NaN, + "learning_rate": 0.00019828186813525015, + "loss": 0.0, + "step": 26652 + }, + { + "epoch": 2.486983297564617, + "grad_norm": NaN, + "learning_rate": 0.0001982747068281956, + "loss": 0.0, + "step": 26653 + }, + { + "epoch": 2.4870766072594943, + "grad_norm": NaN, + "learning_rate": 0.00019826754539839273, + "loss": 0.0, + "step": 26654 + }, + { + "epoch": 2.4871699169543717, + "grad_norm": NaN, + "learning_rate": 0.0001982603838458598, + "loss": 0.0, + "step": 26655 + }, + { + "epoch": 2.4872632266492487, + "grad_norm": NaN, + "learning_rate": 0.00019825322217061498, + "loss": 0.0, + "step": 26656 + }, + { + "epoch": 2.487356536344126, + "grad_norm": NaN, + "learning_rate": 0.00019824606037267657, + "loss": 0.0, + "step": 26657 + }, + { + "epoch": 2.4874498460390035, + "grad_norm": NaN, + "learning_rate": 0.0001982388984520627, + "loss": 0.0, + "step": 26658 + }, + { + "epoch": 2.4875431557338805, + "grad_norm": NaN, + "learning_rate": 0.00019823173640879158, + "loss": 0.0, + "step": 26659 + }, + { + "epoch": 2.487636465428758, + "grad_norm": NaN, + "learning_rate": 0.00019822457424288148, + "loss": 0.0, + "step": 26660 + }, + { + "epoch": 2.4877297751236354, + "grad_norm": NaN, + "learning_rate": 0.00019821741195435056, + "loss": 0.0, + "step": 26661 + }, + { + "epoch": 2.4878230848185128, + "grad_norm": NaN, + "learning_rate": 0.000198210249543217, + "loss": 0.0, + "step": 26662 + }, + { + "epoch": 2.4879163945133898, + "grad_norm": NaN, + "learning_rate": 0.0001982030870094991, + "loss": 0.0, + "step": 26663 + }, + { + "epoch": 2.488009704208267, + "grad_norm": NaN, + "learning_rate": 0.000198195924353215, + "loss": 0.0, + "step": 26664 + }, + { + "epoch": 2.4881030139031446, + "grad_norm": NaN, + "learning_rate": 0.00019818876157438295, + "loss": 0.0, + "step": 26665 + }, + { + "epoch": 2.488196323598022, + "grad_norm": NaN, + "learning_rate": 0.00019818159867302117, + "loss": 0.0, + "step": 26666 + }, + { + "epoch": 2.488289633292899, + "grad_norm": NaN, + "learning_rate": 0.00019817443564914785, + "loss": 0.0, + "step": 26667 + }, + { + "epoch": 2.4883829429877764, + "grad_norm": NaN, + "learning_rate": 0.0001981672725027812, + "loss": 0.0, + "step": 26668 + }, + { + "epoch": 2.488476252682654, + "grad_norm": NaN, + "learning_rate": 0.00019816010923393948, + "loss": 0.0, + "step": 26669 + }, + { + "epoch": 2.4885695623775312, + "grad_norm": NaN, + "learning_rate": 0.00019815294584264083, + "loss": 0.0, + "step": 26670 + }, + { + "epoch": 2.488662872072408, + "grad_norm": NaN, + "learning_rate": 0.0001981457823289035, + "loss": 0.0, + "step": 26671 + }, + { + "epoch": 2.4887561817672856, + "grad_norm": NaN, + "learning_rate": 0.00019813861869274574, + "loss": 0.0, + "step": 26672 + }, + { + "epoch": 2.488849491462163, + "grad_norm": NaN, + "learning_rate": 0.00019813145493418572, + "loss": 0.0, + "step": 26673 + }, + { + "epoch": 2.48894280115704, + "grad_norm": NaN, + "learning_rate": 0.00019812429105324164, + "loss": 0.0, + "step": 26674 + }, + { + "epoch": 2.4890361108519174, + "grad_norm": NaN, + "learning_rate": 0.00019811712704993176, + "loss": 0.0, + "step": 26675 + }, + { + "epoch": 2.489129420546795, + "grad_norm": NaN, + "learning_rate": 0.0001981099629242743, + "loss": 0.0, + "step": 26676 + }, + { + "epoch": 2.4892227302416723, + "grad_norm": NaN, + "learning_rate": 0.00019810279867628743, + "loss": 0.0, + "step": 26677 + }, + { + "epoch": 2.4893160399365493, + "grad_norm": NaN, + "learning_rate": 0.00019809563430598943, + "loss": 0.0, + "step": 26678 + }, + { + "epoch": 2.4894093496314267, + "grad_norm": NaN, + "learning_rate": 0.0001980884698133984, + "loss": 0.0, + "step": 26679 + }, + { + "epoch": 2.489502659326304, + "grad_norm": NaN, + "learning_rate": 0.00019808130519853272, + "loss": 0.0, + "step": 26680 + }, + { + "epoch": 2.489595969021181, + "grad_norm": NaN, + "learning_rate": 0.00019807414046141048, + "loss": 0.0, + "step": 26681 + }, + { + "epoch": 2.4896892787160585, + "grad_norm": NaN, + "learning_rate": 0.00019806697560204996, + "loss": 0.0, + "step": 26682 + }, + { + "epoch": 2.489782588410936, + "grad_norm": NaN, + "learning_rate": 0.00019805981062046937, + "loss": 0.0, + "step": 26683 + }, + { + "epoch": 2.4898758981058133, + "grad_norm": NaN, + "learning_rate": 0.00019805264551668686, + "loss": 0.0, + "step": 26684 + }, + { + "epoch": 2.4899692078006903, + "grad_norm": NaN, + "learning_rate": 0.0001980454802907208, + "loss": 0.0, + "step": 26685 + }, + { + "epoch": 2.4900625174955677, + "grad_norm": NaN, + "learning_rate": 0.00019803831494258922, + "loss": 0.0, + "step": 26686 + }, + { + "epoch": 2.490155827190445, + "grad_norm": NaN, + "learning_rate": 0.0001980311494723105, + "loss": 0.0, + "step": 26687 + }, + { + "epoch": 2.4902491368853226, + "grad_norm": NaN, + "learning_rate": 0.00019802398387990276, + "loss": 0.0, + "step": 26688 + }, + { + "epoch": 2.4903424465801995, + "grad_norm": NaN, + "learning_rate": 0.00019801681816538426, + "loss": 0.0, + "step": 26689 + }, + { + "epoch": 2.490435756275077, + "grad_norm": NaN, + "learning_rate": 0.00019800965232877321, + "loss": 0.0, + "step": 26690 + }, + { + "epoch": 2.4905290659699544, + "grad_norm": NaN, + "learning_rate": 0.0001980024863700879, + "loss": 0.0, + "step": 26691 + }, + { + "epoch": 2.490622375664832, + "grad_norm": NaN, + "learning_rate": 0.00019799532028934641, + "loss": 0.0, + "step": 26692 + }, + { + "epoch": 2.4907156853597088, + "grad_norm": NaN, + "learning_rate": 0.00019798815408656706, + "loss": 0.0, + "step": 26693 + }, + { + "epoch": 2.490808995054586, + "grad_norm": NaN, + "learning_rate": 0.00019798098776176805, + "loss": 0.0, + "step": 26694 + }, + { + "epoch": 2.4909023047494636, + "grad_norm": NaN, + "learning_rate": 0.00019797382131496758, + "loss": 0.0, + "step": 26695 + }, + { + "epoch": 2.4909956144443406, + "grad_norm": NaN, + "learning_rate": 0.00019796665474618394, + "loss": 0.0, + "step": 26696 + }, + { + "epoch": 2.491088924139218, + "grad_norm": NaN, + "learning_rate": 0.00019795948805543528, + "loss": 0.0, + "step": 26697 + }, + { + "epoch": 2.4911822338340954, + "grad_norm": NaN, + "learning_rate": 0.00019795232124273987, + "loss": 0.0, + "step": 26698 + }, + { + "epoch": 2.491275543528973, + "grad_norm": NaN, + "learning_rate": 0.0001979451543081159, + "loss": 0.0, + "step": 26699 + }, + { + "epoch": 2.49136885322385, + "grad_norm": NaN, + "learning_rate": 0.00019793798725158156, + "loss": 0.0, + "step": 26700 + }, + { + "epoch": 2.4914621629187272, + "grad_norm": NaN, + "learning_rate": 0.00019793082007315516, + "loss": 0.0, + "step": 26701 + }, + { + "epoch": 2.4915554726136047, + "grad_norm": NaN, + "learning_rate": 0.0001979236527728549, + "loss": 0.0, + "step": 26702 + }, + { + "epoch": 2.4916487823084816, + "grad_norm": NaN, + "learning_rate": 0.00019791648535069894, + "loss": 0.0, + "step": 26703 + }, + { + "epoch": 2.491742092003359, + "grad_norm": NaN, + "learning_rate": 0.00019790931780670559, + "loss": 0.0, + "step": 26704 + }, + { + "epoch": 2.4918354016982365, + "grad_norm": NaN, + "learning_rate": 0.000197902150140893, + "loss": 0.0, + "step": 26705 + }, + { + "epoch": 2.491928711393114, + "grad_norm": NaN, + "learning_rate": 0.00019789498235327945, + "loss": 0.0, + "step": 26706 + }, + { + "epoch": 2.492022021087991, + "grad_norm": NaN, + "learning_rate": 0.00019788781444388316, + "loss": 0.0, + "step": 26707 + }, + { + "epoch": 2.4921153307828683, + "grad_norm": NaN, + "learning_rate": 0.0001978806464127223, + "loss": 0.0, + "step": 26708 + }, + { + "epoch": 2.4922086404777457, + "grad_norm": NaN, + "learning_rate": 0.0001978734782598152, + "loss": 0.0, + "step": 26709 + }, + { + "epoch": 2.492301950172623, + "grad_norm": NaN, + "learning_rate": 0.00019786630998518001, + "loss": 0.0, + "step": 26710 + }, + { + "epoch": 2.4923952598675, + "grad_norm": NaN, + "learning_rate": 0.00019785914158883497, + "loss": 0.0, + "step": 26711 + }, + { + "epoch": 2.4924885695623775, + "grad_norm": NaN, + "learning_rate": 0.0001978519730707983, + "loss": 0.0, + "step": 26712 + }, + { + "epoch": 2.492581879257255, + "grad_norm": NaN, + "learning_rate": 0.00019784480443108825, + "loss": 0.0, + "step": 26713 + }, + { + "epoch": 2.4926751889521324, + "grad_norm": NaN, + "learning_rate": 0.00019783763566972302, + "loss": 0.0, + "step": 26714 + }, + { + "epoch": 2.4927684986470093, + "grad_norm": NaN, + "learning_rate": 0.00019783046678672084, + "loss": 0.0, + "step": 26715 + }, + { + "epoch": 2.4928618083418868, + "grad_norm": NaN, + "learning_rate": 0.00019782329778209997, + "loss": 0.0, + "step": 26716 + }, + { + "epoch": 2.492955118036764, + "grad_norm": NaN, + "learning_rate": 0.00019781612865587864, + "loss": 0.0, + "step": 26717 + }, + { + "epoch": 2.493048427731641, + "grad_norm": NaN, + "learning_rate": 0.00019780895940807504, + "loss": 0.0, + "step": 26718 + }, + { + "epoch": 2.4931417374265186, + "grad_norm": NaN, + "learning_rate": 0.00019780179003870739, + "loss": 0.0, + "step": 26719 + }, + { + "epoch": 2.493235047121396, + "grad_norm": NaN, + "learning_rate": 0.000197794620547794, + "loss": 0.0, + "step": 26720 + }, + { + "epoch": 2.4933283568162734, + "grad_norm": NaN, + "learning_rate": 0.00019778745093535302, + "loss": 0.0, + "step": 26721 + }, + { + "epoch": 2.4934216665111504, + "grad_norm": NaN, + "learning_rate": 0.0001977802812014027, + "loss": 0.0, + "step": 26722 + }, + { + "epoch": 2.493514976206028, + "grad_norm": NaN, + "learning_rate": 0.00019777311134596133, + "loss": 0.0, + "step": 26723 + }, + { + "epoch": 2.493608285900905, + "grad_norm": NaN, + "learning_rate": 0.00019776594136904703, + "loss": 0.0, + "step": 26724 + }, + { + "epoch": 2.493701595595782, + "grad_norm": NaN, + "learning_rate": 0.0001977587712706781, + "loss": 0.0, + "step": 26725 + }, + { + "epoch": 2.4937949052906596, + "grad_norm": NaN, + "learning_rate": 0.0001977516010508728, + "loss": 0.0, + "step": 26726 + }, + { + "epoch": 2.493888214985537, + "grad_norm": NaN, + "learning_rate": 0.00019774443070964928, + "loss": 0.0, + "step": 26727 + }, + { + "epoch": 2.4939815246804145, + "grad_norm": NaN, + "learning_rate": 0.00019773726024702583, + "loss": 0.0, + "step": 26728 + }, + { + "epoch": 2.4940748343752914, + "grad_norm": NaN, + "learning_rate": 0.00019773008966302065, + "loss": 0.0, + "step": 26729 + }, + { + "epoch": 2.494168144070169, + "grad_norm": NaN, + "learning_rate": 0.000197722918957652, + "loss": 0.0, + "step": 26730 + }, + { + "epoch": 2.4942614537650463, + "grad_norm": NaN, + "learning_rate": 0.0001977157481309381, + "loss": 0.0, + "step": 26731 + }, + { + "epoch": 2.4943547634599232, + "grad_norm": NaN, + "learning_rate": 0.00019770857718289726, + "loss": 0.0, + "step": 26732 + }, + { + "epoch": 2.4944480731548007, + "grad_norm": NaN, + "learning_rate": 0.00019770140611354755, + "loss": 0.0, + "step": 26733 + }, + { + "epoch": 2.494541382849678, + "grad_norm": NaN, + "learning_rate": 0.00019769423492290735, + "loss": 0.0, + "step": 26734 + }, + { + "epoch": 2.4946346925445555, + "grad_norm": NaN, + "learning_rate": 0.00019768706361099477, + "loss": 0.0, + "step": 26735 + }, + { + "epoch": 2.494728002239433, + "grad_norm": NaN, + "learning_rate": 0.00019767989217782817, + "loss": 0.0, + "step": 26736 + }, + { + "epoch": 2.49482131193431, + "grad_norm": NaN, + "learning_rate": 0.00019767272062342573, + "loss": 0.0, + "step": 26737 + }, + { + "epoch": 2.4949146216291873, + "grad_norm": NaN, + "learning_rate": 0.00019766554894780564, + "loss": 0.0, + "step": 26738 + }, + { + "epoch": 2.4950079313240647, + "grad_norm": NaN, + "learning_rate": 0.00019765837715098622, + "loss": 0.0, + "step": 26739 + }, + { + "epoch": 2.4951012410189417, + "grad_norm": NaN, + "learning_rate": 0.00019765120523298568, + "loss": 0.0, + "step": 26740 + }, + { + "epoch": 2.495194550713819, + "grad_norm": NaN, + "learning_rate": 0.00019764403319382214, + "loss": 0.0, + "step": 26741 + }, + { + "epoch": 2.4952878604086965, + "grad_norm": NaN, + "learning_rate": 0.00019763686103351405, + "loss": 0.0, + "step": 26742 + }, + { + "epoch": 2.495381170103574, + "grad_norm": NaN, + "learning_rate": 0.0001976296887520795, + "loss": 0.0, + "step": 26743 + }, + { + "epoch": 2.495474479798451, + "grad_norm": NaN, + "learning_rate": 0.00019762251634953675, + "loss": 0.0, + "step": 26744 + }, + { + "epoch": 2.4955677894933284, + "grad_norm": NaN, + "learning_rate": 0.00019761534382590405, + "loss": 0.0, + "step": 26745 + }, + { + "epoch": 2.495661099188206, + "grad_norm": NaN, + "learning_rate": 0.00019760817118119964, + "loss": 0.0, + "step": 26746 + }, + { + "epoch": 2.4957544088830828, + "grad_norm": NaN, + "learning_rate": 0.00019760099841544174, + "loss": 0.0, + "step": 26747 + }, + { + "epoch": 2.49584771857796, + "grad_norm": NaN, + "learning_rate": 0.0001975938255286486, + "loss": 0.0, + "step": 26748 + }, + { + "epoch": 2.4959410282728376, + "grad_norm": NaN, + "learning_rate": 0.00019758665252083847, + "loss": 0.0, + "step": 26749 + }, + { + "epoch": 2.496034337967715, + "grad_norm": NaN, + "learning_rate": 0.0001975794793920296, + "loss": 0.0, + "step": 26750 + }, + { + "epoch": 2.496127647662592, + "grad_norm": NaN, + "learning_rate": 0.00019757230614224016, + "loss": 0.0, + "step": 26751 + }, + { + "epoch": 2.4962209573574694, + "grad_norm": NaN, + "learning_rate": 0.0001975651327714885, + "loss": 0.0, + "step": 26752 + }, + { + "epoch": 2.496314267052347, + "grad_norm": NaN, + "learning_rate": 0.00019755795927979276, + "loss": 0.0, + "step": 26753 + }, + { + "epoch": 2.496407576747224, + "grad_norm": NaN, + "learning_rate": 0.0001975507856671712, + "loss": 0.0, + "step": 26754 + }, + { + "epoch": 2.496500886442101, + "grad_norm": NaN, + "learning_rate": 0.00019754361193364213, + "loss": 0.0, + "step": 26755 + }, + { + "epoch": 2.4965941961369786, + "grad_norm": NaN, + "learning_rate": 0.00019753643807922367, + "loss": 0.0, + "step": 26756 + }, + { + "epoch": 2.496687505831856, + "grad_norm": NaN, + "learning_rate": 0.00019752926410393411, + "loss": 0.0, + "step": 26757 + }, + { + "epoch": 2.4967808155267335, + "grad_norm": NaN, + "learning_rate": 0.00019752209000779185, + "loss": 0.0, + "step": 26758 + }, + { + "epoch": 2.4968741252216105, + "grad_norm": NaN, + "learning_rate": 0.00019751491579081488, + "loss": 0.0, + "step": 26759 + }, + { + "epoch": 2.496967434916488, + "grad_norm": NaN, + "learning_rate": 0.00019750774145302154, + "loss": 0.0, + "step": 26760 + }, + { + "epoch": 2.4970607446113653, + "grad_norm": NaN, + "learning_rate": 0.00019750056699443014, + "loss": 0.0, + "step": 26761 + }, + { + "epoch": 2.4971540543062423, + "grad_norm": NaN, + "learning_rate": 0.0001974933924150589, + "loss": 0.0, + "step": 26762 + }, + { + "epoch": 2.4972473640011197, + "grad_norm": NaN, + "learning_rate": 0.00019748621771492595, + "loss": 0.0, + "step": 26763 + }, + { + "epoch": 2.497340673695997, + "grad_norm": NaN, + "learning_rate": 0.00019747904289404967, + "loss": 0.0, + "step": 26764 + }, + { + "epoch": 2.4974339833908745, + "grad_norm": NaN, + "learning_rate": 0.00019747186795244823, + "loss": 0.0, + "step": 26765 + }, + { + "epoch": 2.4975272930857515, + "grad_norm": NaN, + "learning_rate": 0.0001974646928901399, + "loss": 0.0, + "step": 26766 + }, + { + "epoch": 2.497620602780629, + "grad_norm": NaN, + "learning_rate": 0.0001974575177071429, + "loss": 0.0, + "step": 26767 + }, + { + "epoch": 2.4977139124755063, + "grad_norm": NaN, + "learning_rate": 0.00019745034240347553, + "loss": 0.0, + "step": 26768 + }, + { + "epoch": 2.4978072221703833, + "grad_norm": NaN, + "learning_rate": 0.00019744316697915595, + "loss": 0.0, + "step": 26769 + }, + { + "epoch": 2.4979005318652607, + "grad_norm": NaN, + "learning_rate": 0.00019743599143420246, + "loss": 0.0, + "step": 26770 + }, + { + "epoch": 2.497993841560138, + "grad_norm": NaN, + "learning_rate": 0.0001974288157686333, + "loss": 0.0, + "step": 26771 + }, + { + "epoch": 2.4980871512550156, + "grad_norm": NaN, + "learning_rate": 0.0001974216399824667, + "loss": 0.0, + "step": 26772 + }, + { + "epoch": 2.4981804609498925, + "grad_norm": NaN, + "learning_rate": 0.00019741446407572094, + "loss": 0.0, + "step": 26773 + }, + { + "epoch": 2.49827377064477, + "grad_norm": NaN, + "learning_rate": 0.00019740728804841422, + "loss": 0.0, + "step": 26774 + }, + { + "epoch": 2.4983670803396474, + "grad_norm": NaN, + "learning_rate": 0.00019740011190056485, + "loss": 0.0, + "step": 26775 + }, + { + "epoch": 2.4984603900345244, + "grad_norm": NaN, + "learning_rate": 0.00019739293563219096, + "loss": 0.0, + "step": 26776 + }, + { + "epoch": 2.498553699729402, + "grad_norm": NaN, + "learning_rate": 0.00019738575924331094, + "loss": 0.0, + "step": 26777 + }, + { + "epoch": 2.498647009424279, + "grad_norm": NaN, + "learning_rate": 0.00019737858273394295, + "loss": 0.0, + "step": 26778 + }, + { + "epoch": 2.4987403191191566, + "grad_norm": NaN, + "learning_rate": 0.00019737140610410524, + "loss": 0.0, + "step": 26779 + }, + { + "epoch": 2.4988336288140336, + "grad_norm": NaN, + "learning_rate": 0.00019736422935381613, + "loss": 0.0, + "step": 26780 + }, + { + "epoch": 2.498926938508911, + "grad_norm": NaN, + "learning_rate": 0.00019735705248309375, + "loss": 0.0, + "step": 26781 + }, + { + "epoch": 2.4990202482037884, + "grad_norm": NaN, + "learning_rate": 0.00019734987549195642, + "loss": 0.0, + "step": 26782 + }, + { + "epoch": 2.499113557898666, + "grad_norm": NaN, + "learning_rate": 0.00019734269838042246, + "loss": 0.0, + "step": 26783 + }, + { + "epoch": 2.499206867593543, + "grad_norm": NaN, + "learning_rate": 0.00019733552114851, + "loss": 0.0, + "step": 26784 + }, + { + "epoch": 2.4993001772884202, + "grad_norm": NaN, + "learning_rate": 0.00019732834379623725, + "loss": 0.0, + "step": 26785 + }, + { + "epoch": 2.4993934869832977, + "grad_norm": NaN, + "learning_rate": 0.00019732116632362266, + "loss": 0.0, + "step": 26786 + }, + { + "epoch": 2.499486796678175, + "grad_norm": NaN, + "learning_rate": 0.0001973139887306843, + "loss": 0.0, + "step": 26787 + }, + { + "epoch": 2.499580106373052, + "grad_norm": NaN, + "learning_rate": 0.00019730681101744047, + "loss": 0.0, + "step": 26788 + }, + { + "epoch": 2.4996734160679295, + "grad_norm": NaN, + "learning_rate": 0.0001972996331839095, + "loss": 0.0, + "step": 26789 + }, + { + "epoch": 2.499766725762807, + "grad_norm": NaN, + "learning_rate": 0.00019729245523010953, + "loss": 0.0, + "step": 26790 + }, + { + "epoch": 2.499860035457684, + "grad_norm": NaN, + "learning_rate": 0.00019728527715605888, + "loss": 0.0, + "step": 26791 + }, + { + "epoch": 2.4999533451525613, + "grad_norm": NaN, + "learning_rate": 0.00019727809896177574, + "loss": 0.0, + "step": 26792 + }, + { + "epoch": 2.5000466548474387, + "grad_norm": NaN, + "learning_rate": 0.00019727092064727842, + "loss": 0.0, + "step": 26793 + }, + { + "epoch": 2.500139964542316, + "grad_norm": NaN, + "learning_rate": 0.00019726374221258516, + "loss": 0.0, + "step": 26794 + }, + { + "epoch": 2.500233274237193, + "grad_norm": NaN, + "learning_rate": 0.00019725656365771415, + "loss": 0.0, + "step": 26795 + }, + { + "epoch": 2.5003265839320705, + "grad_norm": NaN, + "learning_rate": 0.00019724938498268376, + "loss": 0.0, + "step": 26796 + }, + { + "epoch": 2.500419893626948, + "grad_norm": NaN, + "learning_rate": 0.00019724220618751217, + "loss": 0.0, + "step": 26797 + }, + { + "epoch": 2.500513203321825, + "grad_norm": NaN, + "learning_rate": 0.00019723502727221758, + "loss": 0.0, + "step": 26798 + }, + { + "epoch": 2.5006065130167023, + "grad_norm": NaN, + "learning_rate": 0.00019722784823681842, + "loss": 0.0, + "step": 26799 + }, + { + "epoch": 2.5006998227115798, + "grad_norm": NaN, + "learning_rate": 0.00019722066908133275, + "loss": 0.0, + "step": 26800 + }, + { + "epoch": 2.500793132406457, + "grad_norm": NaN, + "learning_rate": 0.0001972134898057789, + "loss": 0.0, + "step": 26801 + }, + { + "epoch": 2.5008864421013346, + "grad_norm": NaN, + "learning_rate": 0.0001972063104101752, + "loss": 0.0, + "step": 26802 + }, + { + "epoch": 2.5009797517962116, + "grad_norm": NaN, + "learning_rate": 0.0001971991308945398, + "loss": 0.0, + "step": 26803 + }, + { + "epoch": 2.501073061491089, + "grad_norm": NaN, + "learning_rate": 0.00019719195125889096, + "loss": 0.0, + "step": 26804 + }, + { + "epoch": 2.501166371185966, + "grad_norm": NaN, + "learning_rate": 0.000197184771503247, + "loss": 0.0, + "step": 26805 + }, + { + "epoch": 2.5012596808808434, + "grad_norm": NaN, + "learning_rate": 0.00019717759162762616, + "loss": 0.0, + "step": 26806 + }, + { + "epoch": 2.501352990575721, + "grad_norm": NaN, + "learning_rate": 0.0001971704116320466, + "loss": 0.0, + "step": 26807 + }, + { + "epoch": 2.501446300270598, + "grad_norm": NaN, + "learning_rate": 0.00019716323151652674, + "loss": 0.0, + "step": 26808 + }, + { + "epoch": 2.5015396099654756, + "grad_norm": NaN, + "learning_rate": 0.00019715605128108475, + "loss": 0.0, + "step": 26809 + }, + { + "epoch": 2.5016329196603526, + "grad_norm": NaN, + "learning_rate": 0.00019714887092573883, + "loss": 0.0, + "step": 26810 + }, + { + "epoch": 2.50172622935523, + "grad_norm": NaN, + "learning_rate": 0.00019714169045050736, + "loss": 0.0, + "step": 26811 + }, + { + "epoch": 2.5018195390501075, + "grad_norm": NaN, + "learning_rate": 0.0001971345098554085, + "loss": 0.0, + "step": 26812 + }, + { + "epoch": 2.5019128487449844, + "grad_norm": NaN, + "learning_rate": 0.00019712732914046056, + "loss": 0.0, + "step": 26813 + }, + { + "epoch": 2.502006158439862, + "grad_norm": NaN, + "learning_rate": 0.00019712014830568174, + "loss": 0.0, + "step": 26814 + }, + { + "epoch": 2.5020994681347393, + "grad_norm": NaN, + "learning_rate": 0.0001971129673510904, + "loss": 0.0, + "step": 26815 + }, + { + "epoch": 2.5021927778296167, + "grad_norm": NaN, + "learning_rate": 0.00019710578627670473, + "loss": 0.0, + "step": 26816 + }, + { + "epoch": 2.5022860875244937, + "grad_norm": NaN, + "learning_rate": 0.00019709860508254295, + "loss": 0.0, + "step": 26817 + }, + { + "epoch": 2.502379397219371, + "grad_norm": NaN, + "learning_rate": 0.00019709142376862342, + "loss": 0.0, + "step": 26818 + }, + { + "epoch": 2.5024727069142485, + "grad_norm": NaN, + "learning_rate": 0.00019708424233496434, + "loss": 0.0, + "step": 26819 + }, + { + "epoch": 2.5025660166091255, + "grad_norm": NaN, + "learning_rate": 0.00019707706078158393, + "loss": 0.0, + "step": 26820 + }, + { + "epoch": 2.502659326304003, + "grad_norm": NaN, + "learning_rate": 0.0001970698791085006, + "loss": 0.0, + "step": 26821 + }, + { + "epoch": 2.5027526359988803, + "grad_norm": NaN, + "learning_rate": 0.00019706269731573244, + "loss": 0.0, + "step": 26822 + }, + { + "epoch": 2.5028459456937577, + "grad_norm": NaN, + "learning_rate": 0.00019705551540329778, + "loss": 0.0, + "step": 26823 + }, + { + "epoch": 2.502939255388635, + "grad_norm": NaN, + "learning_rate": 0.00019704833337121493, + "loss": 0.0, + "step": 26824 + }, + { + "epoch": 2.503032565083512, + "grad_norm": NaN, + "learning_rate": 0.00019704115121950205, + "loss": 0.0, + "step": 26825 + }, + { + "epoch": 2.5031258747783895, + "grad_norm": NaN, + "learning_rate": 0.00019703396894817744, + "loss": 0.0, + "step": 26826 + }, + { + "epoch": 2.5032191844732665, + "grad_norm": NaN, + "learning_rate": 0.00019702678655725948, + "loss": 0.0, + "step": 26827 + }, + { + "epoch": 2.503312494168144, + "grad_norm": NaN, + "learning_rate": 0.00019701960404676627, + "loss": 0.0, + "step": 26828 + }, + { + "epoch": 2.5034058038630214, + "grad_norm": NaN, + "learning_rate": 0.00019701242141671607, + "loss": 0.0, + "step": 26829 + }, + { + "epoch": 2.503499113557899, + "grad_norm": NaN, + "learning_rate": 0.00019700523866712734, + "loss": 0.0, + "step": 26830 + }, + { + "epoch": 2.503592423252776, + "grad_norm": NaN, + "learning_rate": 0.00019699805579801813, + "loss": 0.0, + "step": 26831 + }, + { + "epoch": 2.503685732947653, + "grad_norm": NaN, + "learning_rate": 0.00019699087280940675, + "loss": 0.0, + "step": 26832 + }, + { + "epoch": 2.5037790426425306, + "grad_norm": NaN, + "learning_rate": 0.00019698368970131155, + "loss": 0.0, + "step": 26833 + }, + { + "epoch": 2.503872352337408, + "grad_norm": NaN, + "learning_rate": 0.0001969765064737508, + "loss": 0.0, + "step": 26834 + }, + { + "epoch": 2.503965662032285, + "grad_norm": NaN, + "learning_rate": 0.00019696932312674267, + "loss": 0.0, + "step": 26835 + }, + { + "epoch": 2.5040589717271624, + "grad_norm": NaN, + "learning_rate": 0.00019696213966030542, + "loss": 0.0, + "step": 26836 + }, + { + "epoch": 2.50415228142204, + "grad_norm": NaN, + "learning_rate": 0.0001969549560744574, + "loss": 0.0, + "step": 26837 + }, + { + "epoch": 2.5042455911169172, + "grad_norm": NaN, + "learning_rate": 0.00019694777236921684, + "loss": 0.0, + "step": 26838 + }, + { + "epoch": 2.504338900811794, + "grad_norm": NaN, + "learning_rate": 0.00019694058854460196, + "loss": 0.0, + "step": 26839 + }, + { + "epoch": 2.5044322105066716, + "grad_norm": NaN, + "learning_rate": 0.00019693340460063112, + "loss": 0.0, + "step": 26840 + }, + { + "epoch": 2.504525520201549, + "grad_norm": NaN, + "learning_rate": 0.00019692622053732253, + "loss": 0.0, + "step": 26841 + }, + { + "epoch": 2.504618829896426, + "grad_norm": NaN, + "learning_rate": 0.0001969190363546944, + "loss": 0.0, + "step": 26842 + }, + { + "epoch": 2.5047121395913035, + "grad_norm": NaN, + "learning_rate": 0.00019691185205276512, + "loss": 0.0, + "step": 26843 + }, + { + "epoch": 2.504805449286181, + "grad_norm": NaN, + "learning_rate": 0.00019690466763155288, + "loss": 0.0, + "step": 26844 + }, + { + "epoch": 2.5048987589810583, + "grad_norm": NaN, + "learning_rate": 0.00019689748309107593, + "loss": 0.0, + "step": 26845 + }, + { + "epoch": 2.5049920686759353, + "grad_norm": NaN, + "learning_rate": 0.00019689029843135265, + "loss": 0.0, + "step": 26846 + }, + { + "epoch": 2.5050853783708127, + "grad_norm": NaN, + "learning_rate": 0.0001968831136524012, + "loss": 0.0, + "step": 26847 + }, + { + "epoch": 2.50517868806569, + "grad_norm": NaN, + "learning_rate": 0.00019687592875423983, + "loss": 0.0, + "step": 26848 + }, + { + "epoch": 2.505271997760567, + "grad_norm": NaN, + "learning_rate": 0.0001968687437368869, + "loss": 0.0, + "step": 26849 + }, + { + "epoch": 2.5053653074554445, + "grad_norm": NaN, + "learning_rate": 0.00019686155860036065, + "loss": 0.0, + "step": 26850 + }, + { + "epoch": 2.505458617150322, + "grad_norm": NaN, + "learning_rate": 0.00019685437334467928, + "loss": 0.0, + "step": 26851 + }, + { + "epoch": 2.5055519268451993, + "grad_norm": NaN, + "learning_rate": 0.0001968471879698612, + "loss": 0.0, + "step": 26852 + }, + { + "epoch": 2.5056452365400768, + "grad_norm": NaN, + "learning_rate": 0.00019684000247592454, + "loss": 0.0, + "step": 26853 + }, + { + "epoch": 2.5057385462349537, + "grad_norm": NaN, + "learning_rate": 0.00019683281686288764, + "loss": 0.0, + "step": 26854 + }, + { + "epoch": 2.505831855929831, + "grad_norm": NaN, + "learning_rate": 0.00019682563113076877, + "loss": 0.0, + "step": 26855 + }, + { + "epoch": 2.5059251656247086, + "grad_norm": NaN, + "learning_rate": 0.0001968184452795862, + "loss": 0.0, + "step": 26856 + }, + { + "epoch": 2.5060184753195855, + "grad_norm": NaN, + "learning_rate": 0.00019681125930935822, + "loss": 0.0, + "step": 26857 + }, + { + "epoch": 2.506111785014463, + "grad_norm": NaN, + "learning_rate": 0.00019680407322010296, + "loss": 0.0, + "step": 26858 + }, + { + "epoch": 2.5062050947093404, + "grad_norm": NaN, + "learning_rate": 0.0001967968870118389, + "loss": 0.0, + "step": 26859 + }, + { + "epoch": 2.506298404404218, + "grad_norm": NaN, + "learning_rate": 0.00019678970068458425, + "loss": 0.0, + "step": 26860 + }, + { + "epoch": 2.506391714099095, + "grad_norm": NaN, + "learning_rate": 0.00019678251423835714, + "loss": 0.0, + "step": 26861 + }, + { + "epoch": 2.506485023793972, + "grad_norm": NaN, + "learning_rate": 0.00019677532767317605, + "loss": 0.0, + "step": 26862 + }, + { + "epoch": 2.5065783334888496, + "grad_norm": NaN, + "learning_rate": 0.0001967681409890591, + "loss": 0.0, + "step": 26863 + }, + { + "epoch": 2.5066716431837266, + "grad_norm": NaN, + "learning_rate": 0.00019676095418602463, + "loss": 0.0, + "step": 26864 + }, + { + "epoch": 2.506764952878604, + "grad_norm": NaN, + "learning_rate": 0.00019675376726409094, + "loss": 0.0, + "step": 26865 + }, + { + "epoch": 2.5068582625734814, + "grad_norm": NaN, + "learning_rate": 0.00019674658022327628, + "loss": 0.0, + "step": 26866 + }, + { + "epoch": 2.506951572268359, + "grad_norm": NaN, + "learning_rate": 0.00019673939306359881, + "loss": 0.0, + "step": 26867 + }, + { + "epoch": 2.507044881963236, + "grad_norm": NaN, + "learning_rate": 0.00019673220578507702, + "loss": 0.0, + "step": 26868 + }, + { + "epoch": 2.5071381916581132, + "grad_norm": NaN, + "learning_rate": 0.00019672501838772903, + "loss": 0.0, + "step": 26869 + }, + { + "epoch": 2.5072315013529907, + "grad_norm": NaN, + "learning_rate": 0.00019671783087157316, + "loss": 0.0, + "step": 26870 + }, + { + "epoch": 2.5073248110478676, + "grad_norm": NaN, + "learning_rate": 0.00019671064323662765, + "loss": 0.0, + "step": 26871 + }, + { + "epoch": 2.507418120742745, + "grad_norm": NaN, + "learning_rate": 0.0001967034554829109, + "loss": 0.0, + "step": 26872 + }, + { + "epoch": 2.5075114304376225, + "grad_norm": NaN, + "learning_rate": 0.00019669626761044102, + "loss": 0.0, + "step": 26873 + }, + { + "epoch": 2.5076047401325, + "grad_norm": NaN, + "learning_rate": 0.0001966890796192364, + "loss": 0.0, + "step": 26874 + }, + { + "epoch": 2.5076980498273773, + "grad_norm": NaN, + "learning_rate": 0.00019668189150931527, + "loss": 0.0, + "step": 26875 + }, + { + "epoch": 2.5077913595222543, + "grad_norm": NaN, + "learning_rate": 0.00019667470328069592, + "loss": 0.0, + "step": 26876 + }, + { + "epoch": 2.5078846692171317, + "grad_norm": NaN, + "learning_rate": 0.0001966675149333966, + "loss": 0.0, + "step": 26877 + }, + { + "epoch": 2.5079779789120087, + "grad_norm": NaN, + "learning_rate": 0.0001966603264674357, + "loss": 0.0, + "step": 26878 + }, + { + "epoch": 2.508071288606886, + "grad_norm": NaN, + "learning_rate": 0.00019665313788283135, + "loss": 0.0, + "step": 26879 + }, + { + "epoch": 2.5081645983017635, + "grad_norm": NaN, + "learning_rate": 0.00019664594917960185, + "loss": 0.0, + "step": 26880 + }, + { + "epoch": 2.508257907996641, + "grad_norm": NaN, + "learning_rate": 0.00019663876035776558, + "loss": 0.0, + "step": 26881 + }, + { + "epoch": 2.5083512176915184, + "grad_norm": NaN, + "learning_rate": 0.00019663157141734073, + "loss": 0.0, + "step": 26882 + }, + { + "epoch": 2.5084445273863953, + "grad_norm": NaN, + "learning_rate": 0.0001966243823583456, + "loss": 0.0, + "step": 26883 + }, + { + "epoch": 2.5085378370812728, + "grad_norm": NaN, + "learning_rate": 0.0001966171931807985, + "loss": 0.0, + "step": 26884 + }, + { + "epoch": 2.50863114677615, + "grad_norm": NaN, + "learning_rate": 0.0001966100038847177, + "loss": 0.0, + "step": 26885 + }, + { + "epoch": 2.508724456471027, + "grad_norm": NaN, + "learning_rate": 0.0001966028144701214, + "loss": 0.0, + "step": 26886 + }, + { + "epoch": 2.5088177661659046, + "grad_norm": NaN, + "learning_rate": 0.000196595624937028, + "loss": 0.0, + "step": 26887 + }, + { + "epoch": 2.508911075860782, + "grad_norm": NaN, + "learning_rate": 0.00019658843528545575, + "loss": 0.0, + "step": 26888 + }, + { + "epoch": 2.5090043855556594, + "grad_norm": NaN, + "learning_rate": 0.00019658124551542286, + "loss": 0.0, + "step": 26889 + }, + { + "epoch": 2.5090976952505364, + "grad_norm": NaN, + "learning_rate": 0.00019657405562694766, + "loss": 0.0, + "step": 26890 + }, + { + "epoch": 2.509191004945414, + "grad_norm": NaN, + "learning_rate": 0.0001965668656200485, + "loss": 0.0, + "step": 26891 + }, + { + "epoch": 2.5092843146402912, + "grad_norm": NaN, + "learning_rate": 0.00019655967549474351, + "loss": 0.0, + "step": 26892 + }, + { + "epoch": 2.509377624335168, + "grad_norm": NaN, + "learning_rate": 0.00019655248525105108, + "loss": 0.0, + "step": 26893 + }, + { + "epoch": 2.5094709340300456, + "grad_norm": NaN, + "learning_rate": 0.0001965452948889895, + "loss": 0.0, + "step": 26894 + }, + { + "epoch": 2.509564243724923, + "grad_norm": NaN, + "learning_rate": 0.00019653810440857703, + "loss": 0.0, + "step": 26895 + }, + { + "epoch": 2.5096575534198005, + "grad_norm": NaN, + "learning_rate": 0.00019653091380983188, + "loss": 0.0, + "step": 26896 + }, + { + "epoch": 2.509750863114678, + "grad_norm": NaN, + "learning_rate": 0.0001965237230927725, + "loss": 0.0, + "step": 26897 + }, + { + "epoch": 2.509844172809555, + "grad_norm": NaN, + "learning_rate": 0.00019651653225741697, + "loss": 0.0, + "step": 26898 + }, + { + "epoch": 2.5099374825044323, + "grad_norm": NaN, + "learning_rate": 0.00019650934130378375, + "loss": 0.0, + "step": 26899 + }, + { + "epoch": 2.5100307921993092, + "grad_norm": NaN, + "learning_rate": 0.00019650215023189104, + "loss": 0.0, + "step": 26900 + }, + { + "epoch": 2.5101241018941867, + "grad_norm": NaN, + "learning_rate": 0.00019649495904175713, + "loss": 0.0, + "step": 26901 + }, + { + "epoch": 2.510217411589064, + "grad_norm": NaN, + "learning_rate": 0.00019648776773340028, + "loss": 0.0, + "step": 26902 + }, + { + "epoch": 2.5103107212839415, + "grad_norm": NaN, + "learning_rate": 0.00019648057630683886, + "loss": 0.0, + "step": 26903 + }, + { + "epoch": 2.510404030978819, + "grad_norm": NaN, + "learning_rate": 0.00019647338476209107, + "loss": 0.0, + "step": 26904 + }, + { + "epoch": 2.510497340673696, + "grad_norm": NaN, + "learning_rate": 0.00019646619309917523, + "loss": 0.0, + "step": 26905 + }, + { + "epoch": 2.5105906503685733, + "grad_norm": NaN, + "learning_rate": 0.00019645900131810964, + "loss": 0.0, + "step": 26906 + }, + { + "epoch": 2.5106839600634507, + "grad_norm": NaN, + "learning_rate": 0.00019645180941891256, + "loss": 0.0, + "step": 26907 + }, + { + "epoch": 2.5107772697583277, + "grad_norm": NaN, + "learning_rate": 0.00019644461740160227, + "loss": 0.0, + "step": 26908 + }, + { + "epoch": 2.510870579453205, + "grad_norm": NaN, + "learning_rate": 0.0001964374252661971, + "loss": 0.0, + "step": 26909 + }, + { + "epoch": 2.5109638891480826, + "grad_norm": NaN, + "learning_rate": 0.0001964302330127153, + "loss": 0.0, + "step": 26910 + }, + { + "epoch": 2.51105719884296, + "grad_norm": NaN, + "learning_rate": 0.00019642304064117517, + "loss": 0.0, + "step": 26911 + }, + { + "epoch": 2.511150508537837, + "grad_norm": NaN, + "learning_rate": 0.000196415848151595, + "loss": 0.0, + "step": 26912 + }, + { + "epoch": 2.5112438182327144, + "grad_norm": NaN, + "learning_rate": 0.00019640865554399313, + "loss": 0.0, + "step": 26913 + }, + { + "epoch": 2.511337127927592, + "grad_norm": NaN, + "learning_rate": 0.00019640146281838772, + "loss": 0.0, + "step": 26914 + }, + { + "epoch": 2.5114304376224688, + "grad_norm": NaN, + "learning_rate": 0.00019639426997479717, + "loss": 0.0, + "step": 26915 + }, + { + "epoch": 2.511523747317346, + "grad_norm": NaN, + "learning_rate": 0.00019638707701323977, + "loss": 0.0, + "step": 26916 + }, + { + "epoch": 2.5116170570122236, + "grad_norm": NaN, + "learning_rate": 0.00019637988393373372, + "loss": 0.0, + "step": 26917 + }, + { + "epoch": 2.511710366707101, + "grad_norm": NaN, + "learning_rate": 0.00019637269073629737, + "loss": 0.0, + "step": 26918 + }, + { + "epoch": 2.5118036764019784, + "grad_norm": NaN, + "learning_rate": 0.00019636549742094907, + "loss": 0.0, + "step": 26919 + }, + { + "epoch": 2.5118969860968554, + "grad_norm": NaN, + "learning_rate": 0.00019635830398770693, + "loss": 0.0, + "step": 26920 + }, + { + "epoch": 2.511990295791733, + "grad_norm": NaN, + "learning_rate": 0.00019635111043658942, + "loss": 0.0, + "step": 26921 + }, + { + "epoch": 2.51208360548661, + "grad_norm": NaN, + "learning_rate": 0.0001963439167676148, + "loss": 0.0, + "step": 26922 + }, + { + "epoch": 2.5121769151814872, + "grad_norm": NaN, + "learning_rate": 0.00019633672298080128, + "loss": 0.0, + "step": 26923 + }, + { + "epoch": 2.5122702248763646, + "grad_norm": NaN, + "learning_rate": 0.0001963295290761672, + "loss": 0.0, + "step": 26924 + }, + { + "epoch": 2.512363534571242, + "grad_norm": NaN, + "learning_rate": 0.00019632233505373085, + "loss": 0.0, + "step": 26925 + }, + { + "epoch": 2.5124568442661195, + "grad_norm": NaN, + "learning_rate": 0.00019631514091351056, + "loss": 0.0, + "step": 26926 + }, + { + "epoch": 2.5125501539609965, + "grad_norm": NaN, + "learning_rate": 0.00019630794665552453, + "loss": 0.0, + "step": 26927 + }, + { + "epoch": 2.512643463655874, + "grad_norm": NaN, + "learning_rate": 0.00019630075227979114, + "loss": 0.0, + "step": 26928 + }, + { + "epoch": 2.5127367733507513, + "grad_norm": NaN, + "learning_rate": 0.00019629355778632868, + "loss": 0.0, + "step": 26929 + }, + { + "epoch": 2.5128300830456283, + "grad_norm": NaN, + "learning_rate": 0.00019628636317515537, + "loss": 0.0, + "step": 26930 + }, + { + "epoch": 2.5129233927405057, + "grad_norm": NaN, + "learning_rate": 0.00019627916844628955, + "loss": 0.0, + "step": 26931 + }, + { + "epoch": 2.513016702435383, + "grad_norm": NaN, + "learning_rate": 0.00019627197359974957, + "loss": 0.0, + "step": 26932 + }, + { + "epoch": 2.5131100121302605, + "grad_norm": NaN, + "learning_rate": 0.0001962647786355536, + "loss": 0.0, + "step": 26933 + }, + { + "epoch": 2.5132033218251375, + "grad_norm": NaN, + "learning_rate": 0.00019625758355372005, + "loss": 0.0, + "step": 26934 + }, + { + "epoch": 2.513296631520015, + "grad_norm": NaN, + "learning_rate": 0.00019625038835426717, + "loss": 0.0, + "step": 26935 + }, + { + "epoch": 2.5133899412148923, + "grad_norm": NaN, + "learning_rate": 0.0001962431930372132, + "loss": 0.0, + "step": 26936 + }, + { + "epoch": 2.5134832509097693, + "grad_norm": NaN, + "learning_rate": 0.00019623599760257654, + "loss": 0.0, + "step": 26937 + }, + { + "epoch": 2.5135765606046467, + "grad_norm": NaN, + "learning_rate": 0.00019622880205037542, + "loss": 0.0, + "step": 26938 + }, + { + "epoch": 2.513669870299524, + "grad_norm": NaN, + "learning_rate": 0.00019622160638062811, + "loss": 0.0, + "step": 26939 + }, + { + "epoch": 2.5137631799944016, + "grad_norm": NaN, + "learning_rate": 0.00019621441059335297, + "loss": 0.0, + "step": 26940 + }, + { + "epoch": 2.5138564896892786, + "grad_norm": NaN, + "learning_rate": 0.00019620721468856833, + "loss": 0.0, + "step": 26941 + }, + { + "epoch": 2.513949799384156, + "grad_norm": NaN, + "learning_rate": 0.00019620001866629236, + "loss": 0.0, + "step": 26942 + }, + { + "epoch": 2.5140431090790334, + "grad_norm": NaN, + "learning_rate": 0.00019619282252654344, + "loss": 0.0, + "step": 26943 + }, + { + "epoch": 2.5141364187739104, + "grad_norm": NaN, + "learning_rate": 0.0001961856262693399, + "loss": 0.0, + "step": 26944 + }, + { + "epoch": 2.514229728468788, + "grad_norm": NaN, + "learning_rate": 0.00019617842989469995, + "loss": 0.0, + "step": 26945 + }, + { + "epoch": 2.514323038163665, + "grad_norm": NaN, + "learning_rate": 0.00019617123340264194, + "loss": 0.0, + "step": 26946 + }, + { + "epoch": 2.5144163478585426, + "grad_norm": NaN, + "learning_rate": 0.00019616403679318414, + "loss": 0.0, + "step": 26947 + }, + { + "epoch": 2.51450965755342, + "grad_norm": NaN, + "learning_rate": 0.00019615684006634495, + "loss": 0.0, + "step": 26948 + }, + { + "epoch": 2.514602967248297, + "grad_norm": NaN, + "learning_rate": 0.0001961496432221425, + "loss": 0.0, + "step": 26949 + }, + { + "epoch": 2.5146962769431744, + "grad_norm": NaN, + "learning_rate": 0.00019614244626059522, + "loss": 0.0, + "step": 26950 + }, + { + "epoch": 2.514789586638052, + "grad_norm": NaN, + "learning_rate": 0.00019613524918172138, + "loss": 0.0, + "step": 26951 + }, + { + "epoch": 2.514882896332929, + "grad_norm": NaN, + "learning_rate": 0.00019612805198553922, + "loss": 0.0, + "step": 26952 + }, + { + "epoch": 2.5149762060278062, + "grad_norm": NaN, + "learning_rate": 0.00019612085467206713, + "loss": 0.0, + "step": 26953 + }, + { + "epoch": 2.5150695157226837, + "grad_norm": NaN, + "learning_rate": 0.00019611365724132336, + "loss": 0.0, + "step": 26954 + }, + { + "epoch": 2.515162825417561, + "grad_norm": NaN, + "learning_rate": 0.00019610645969332617, + "loss": 0.0, + "step": 26955 + }, + { + "epoch": 2.515256135112438, + "grad_norm": NaN, + "learning_rate": 0.00019609926202809395, + "loss": 0.0, + "step": 26956 + }, + { + "epoch": 2.5153494448073155, + "grad_norm": NaN, + "learning_rate": 0.00019609206424564502, + "loss": 0.0, + "step": 26957 + }, + { + "epoch": 2.515442754502193, + "grad_norm": NaN, + "learning_rate": 0.00019608486634599753, + "loss": 0.0, + "step": 26958 + }, + { + "epoch": 2.51553606419707, + "grad_norm": NaN, + "learning_rate": 0.00019607766832916988, + "loss": 0.0, + "step": 26959 + }, + { + "epoch": 2.5156293738919473, + "grad_norm": NaN, + "learning_rate": 0.00019607047019518046, + "loss": 0.0, + "step": 26960 + }, + { + "epoch": 2.5157226835868247, + "grad_norm": NaN, + "learning_rate": 0.0001960632719440474, + "loss": 0.0, + "step": 26961 + }, + { + "epoch": 2.515815993281702, + "grad_norm": NaN, + "learning_rate": 0.0001960560735757891, + "loss": 0.0, + "step": 26962 + }, + { + "epoch": 2.515909302976579, + "grad_norm": NaN, + "learning_rate": 0.0001960488750904239, + "loss": 0.0, + "step": 26963 + }, + { + "epoch": 2.5160026126714565, + "grad_norm": NaN, + "learning_rate": 0.00019604167648796996, + "loss": 0.0, + "step": 26964 + }, + { + "epoch": 2.516095922366334, + "grad_norm": NaN, + "learning_rate": 0.00019603447776844575, + "loss": 0.0, + "step": 26965 + }, + { + "epoch": 2.516189232061211, + "grad_norm": NaN, + "learning_rate": 0.00019602727893186945, + "loss": 0.0, + "step": 26966 + }, + { + "epoch": 2.5162825417560883, + "grad_norm": NaN, + "learning_rate": 0.00019602007997825946, + "loss": 0.0, + "step": 26967 + }, + { + "epoch": 2.5163758514509658, + "grad_norm": NaN, + "learning_rate": 0.000196012880907634, + "loss": 0.0, + "step": 26968 + }, + { + "epoch": 2.516469161145843, + "grad_norm": NaN, + "learning_rate": 0.00019600568172001143, + "loss": 0.0, + "step": 26969 + }, + { + "epoch": 2.5165624708407206, + "grad_norm": NaN, + "learning_rate": 0.00019599848241541004, + "loss": 0.0, + "step": 26970 + }, + { + "epoch": 2.5166557805355976, + "grad_norm": NaN, + "learning_rate": 0.00019599128299384815, + "loss": 0.0, + "step": 26971 + }, + { + "epoch": 2.516749090230475, + "grad_norm": NaN, + "learning_rate": 0.00019598408345534402, + "loss": 0.0, + "step": 26972 + }, + { + "epoch": 2.516842399925352, + "grad_norm": NaN, + "learning_rate": 0.00019597688379991603, + "loss": 0.0, + "step": 26973 + }, + { + "epoch": 2.5169357096202294, + "grad_norm": NaN, + "learning_rate": 0.00019596968402758237, + "loss": 0.0, + "step": 26974 + }, + { + "epoch": 2.517029019315107, + "grad_norm": NaN, + "learning_rate": 0.00019596248413836147, + "loss": 0.0, + "step": 26975 + }, + { + "epoch": 2.5171223290099842, + "grad_norm": NaN, + "learning_rate": 0.00019595528413227163, + "loss": 0.0, + "step": 26976 + }, + { + "epoch": 2.5172156387048616, + "grad_norm": NaN, + "learning_rate": 0.00019594808400933103, + "loss": 0.0, + "step": 26977 + }, + { + "epoch": 2.5173089483997386, + "grad_norm": NaN, + "learning_rate": 0.00019594088376955813, + "loss": 0.0, + "step": 26978 + }, + { + "epoch": 2.517402258094616, + "grad_norm": NaN, + "learning_rate": 0.00019593368341297115, + "loss": 0.0, + "step": 26979 + }, + { + "epoch": 2.5174955677894935, + "grad_norm": NaN, + "learning_rate": 0.00019592648293958838, + "loss": 0.0, + "step": 26980 + }, + { + "epoch": 2.5175888774843704, + "grad_norm": NaN, + "learning_rate": 0.00019591928234942823, + "loss": 0.0, + "step": 26981 + }, + { + "epoch": 2.517682187179248, + "grad_norm": NaN, + "learning_rate": 0.00019591208164250888, + "loss": 0.0, + "step": 26982 + }, + { + "epoch": 2.5177754968741253, + "grad_norm": NaN, + "learning_rate": 0.00019590488081884877, + "loss": 0.0, + "step": 26983 + }, + { + "epoch": 2.5178688065690027, + "grad_norm": NaN, + "learning_rate": 0.00019589767987846613, + "loss": 0.0, + "step": 26984 + }, + { + "epoch": 2.5179621162638797, + "grad_norm": NaN, + "learning_rate": 0.00019589047882137927, + "loss": 0.0, + "step": 26985 + }, + { + "epoch": 2.518055425958757, + "grad_norm": NaN, + "learning_rate": 0.0001958832776476065, + "loss": 0.0, + "step": 26986 + }, + { + "epoch": 2.5181487356536345, + "grad_norm": NaN, + "learning_rate": 0.0001958760763571662, + "loss": 0.0, + "step": 26987 + }, + { + "epoch": 2.5182420453485115, + "grad_norm": NaN, + "learning_rate": 0.0001958688749500766, + "loss": 0.0, + "step": 26988 + }, + { + "epoch": 2.518335355043389, + "grad_norm": NaN, + "learning_rate": 0.00019586167342635602, + "loss": 0.0, + "step": 26989 + }, + { + "epoch": 2.5184286647382663, + "grad_norm": NaN, + "learning_rate": 0.0001958544717860228, + "loss": 0.0, + "step": 26990 + }, + { + "epoch": 2.5185219744331437, + "grad_norm": NaN, + "learning_rate": 0.00019584727002909527, + "loss": 0.0, + "step": 26991 + }, + { + "epoch": 2.518615284128021, + "grad_norm": NaN, + "learning_rate": 0.0001958400681555917, + "loss": 0.0, + "step": 26992 + }, + { + "epoch": 2.518708593822898, + "grad_norm": NaN, + "learning_rate": 0.0001958328661655304, + "loss": 0.0, + "step": 26993 + }, + { + "epoch": 2.5188019035177756, + "grad_norm": NaN, + "learning_rate": 0.0001958256640589297, + "loss": 0.0, + "step": 26994 + }, + { + "epoch": 2.5188952132126525, + "grad_norm": NaN, + "learning_rate": 0.00019581846183580796, + "loss": 0.0, + "step": 26995 + }, + { + "epoch": 2.51898852290753, + "grad_norm": NaN, + "learning_rate": 0.00019581125949618336, + "loss": 0.0, + "step": 26996 + }, + { + "epoch": 2.5190818326024074, + "grad_norm": NaN, + "learning_rate": 0.00019580405704007432, + "loss": 0.0, + "step": 26997 + }, + { + "epoch": 2.519175142297285, + "grad_norm": NaN, + "learning_rate": 0.0001957968544674992, + "loss": 0.0, + "step": 26998 + }, + { + "epoch": 2.519268451992162, + "grad_norm": NaN, + "learning_rate": 0.00019578965177847617, + "loss": 0.0, + "step": 26999 + }, + { + "epoch": 2.519361761687039, + "grad_norm": NaN, + "learning_rate": 0.00019578244897302364, + "loss": 0.0, + "step": 27000 + }, + { + "epoch": 2.5194550713819166, + "grad_norm": NaN, + "learning_rate": 0.0001957752460511599, + "loss": 0.0, + "step": 27001 + }, + { + "epoch": 2.519548381076794, + "grad_norm": NaN, + "learning_rate": 0.00019576804301290327, + "loss": 0.0, + "step": 27002 + }, + { + "epoch": 2.519641690771671, + "grad_norm": NaN, + "learning_rate": 0.00019576083985827208, + "loss": 0.0, + "step": 27003 + }, + { + "epoch": 2.5197350004665484, + "grad_norm": NaN, + "learning_rate": 0.0001957536365872846, + "loss": 0.0, + "step": 27004 + }, + { + "epoch": 2.519828310161426, + "grad_norm": NaN, + "learning_rate": 0.00019574643319995923, + "loss": 0.0, + "step": 27005 + }, + { + "epoch": 2.5199216198563033, + "grad_norm": NaN, + "learning_rate": 0.00019573922969631417, + "loss": 0.0, + "step": 27006 + }, + { + "epoch": 2.5200149295511802, + "grad_norm": NaN, + "learning_rate": 0.00019573202607636782, + "loss": 0.0, + "step": 27007 + }, + { + "epoch": 2.5201082392460576, + "grad_norm": NaN, + "learning_rate": 0.0001957248223401385, + "loss": 0.0, + "step": 27008 + }, + { + "epoch": 2.520201548940935, + "grad_norm": NaN, + "learning_rate": 0.00019571761848764446, + "loss": 0.0, + "step": 27009 + }, + { + "epoch": 2.520294858635812, + "grad_norm": NaN, + "learning_rate": 0.00019571041451890407, + "loss": 0.0, + "step": 27010 + }, + { + "epoch": 2.5203881683306895, + "grad_norm": NaN, + "learning_rate": 0.00019570321043393564, + "loss": 0.0, + "step": 27011 + }, + { + "epoch": 2.520481478025567, + "grad_norm": NaN, + "learning_rate": 0.0001956960062327575, + "loss": 0.0, + "step": 27012 + }, + { + "epoch": 2.5205747877204443, + "grad_norm": NaN, + "learning_rate": 0.00019568880191538795, + "loss": 0.0, + "step": 27013 + }, + { + "epoch": 2.5206680974153217, + "grad_norm": NaN, + "learning_rate": 0.00019568159748184532, + "loss": 0.0, + "step": 27014 + }, + { + "epoch": 2.5207614071101987, + "grad_norm": NaN, + "learning_rate": 0.00019567439293214785, + "loss": 0.0, + "step": 27015 + }, + { + "epoch": 2.520854716805076, + "grad_norm": NaN, + "learning_rate": 0.000195667188266314, + "loss": 0.0, + "step": 27016 + }, + { + "epoch": 2.520948026499953, + "grad_norm": NaN, + "learning_rate": 0.00019565998348436203, + "loss": 0.0, + "step": 27017 + }, + { + "epoch": 2.5210413361948305, + "grad_norm": NaN, + "learning_rate": 0.0001956527785863102, + "loss": 0.0, + "step": 27018 + }, + { + "epoch": 2.521134645889708, + "grad_norm": NaN, + "learning_rate": 0.00019564557357217688, + "loss": 0.0, + "step": 27019 + }, + { + "epoch": 2.5212279555845853, + "grad_norm": NaN, + "learning_rate": 0.0001956383684419804, + "loss": 0.0, + "step": 27020 + }, + { + "epoch": 2.5213212652794628, + "grad_norm": NaN, + "learning_rate": 0.00019563116319573908, + "loss": 0.0, + "step": 27021 + }, + { + "epoch": 2.5214145749743397, + "grad_norm": NaN, + "learning_rate": 0.00019562395783347122, + "loss": 0.0, + "step": 27022 + }, + { + "epoch": 2.521507884669217, + "grad_norm": NaN, + "learning_rate": 0.00019561675235519514, + "loss": 0.0, + "step": 27023 + }, + { + "epoch": 2.5216011943640946, + "grad_norm": NaN, + "learning_rate": 0.0001956095467609292, + "loss": 0.0, + "step": 27024 + }, + { + "epoch": 2.5216945040589716, + "grad_norm": NaN, + "learning_rate": 0.00019560234105069166, + "loss": 0.0, + "step": 27025 + }, + { + "epoch": 2.521787813753849, + "grad_norm": NaN, + "learning_rate": 0.00019559513522450088, + "loss": 0.0, + "step": 27026 + }, + { + "epoch": 2.5218811234487264, + "grad_norm": NaN, + "learning_rate": 0.0001955879292823752, + "loss": 0.0, + "step": 27027 + }, + { + "epoch": 2.521974433143604, + "grad_norm": NaN, + "learning_rate": 0.00019558072322433287, + "loss": 0.0, + "step": 27028 + }, + { + "epoch": 2.522067742838481, + "grad_norm": NaN, + "learning_rate": 0.00019557351705039232, + "loss": 0.0, + "step": 27029 + }, + { + "epoch": 2.522161052533358, + "grad_norm": NaN, + "learning_rate": 0.00019556631076057178, + "loss": 0.0, + "step": 27030 + }, + { + "epoch": 2.5222543622282356, + "grad_norm": NaN, + "learning_rate": 0.00019555910435488963, + "loss": 0.0, + "step": 27031 + }, + { + "epoch": 2.5223476719231126, + "grad_norm": NaN, + "learning_rate": 0.00019555189783336416, + "loss": 0.0, + "step": 27032 + }, + { + "epoch": 2.52244098161799, + "grad_norm": NaN, + "learning_rate": 0.00019554469119601372, + "loss": 0.0, + "step": 27033 + }, + { + "epoch": 2.5225342913128674, + "grad_norm": NaN, + "learning_rate": 0.00019553748444285662, + "loss": 0.0, + "step": 27034 + }, + { + "epoch": 2.522627601007745, + "grad_norm": NaN, + "learning_rate": 0.0001955302775739112, + "loss": 0.0, + "step": 27035 + }, + { + "epoch": 2.5227209107026223, + "grad_norm": NaN, + "learning_rate": 0.00019552307058919575, + "loss": 0.0, + "step": 27036 + }, + { + "epoch": 2.5228142203974993, + "grad_norm": NaN, + "learning_rate": 0.00019551586348872857, + "loss": 0.0, + "step": 27037 + }, + { + "epoch": 2.5229075300923767, + "grad_norm": NaN, + "learning_rate": 0.0001955086562725281, + "loss": 0.0, + "step": 27038 + }, + { + "epoch": 2.5230008397872536, + "grad_norm": NaN, + "learning_rate": 0.0001955014489406126, + "loss": 0.0, + "step": 27039 + }, + { + "epoch": 2.523094149482131, + "grad_norm": NaN, + "learning_rate": 0.00019549424149300037, + "loss": 0.0, + "step": 27040 + }, + { + "epoch": 2.5231874591770085, + "grad_norm": NaN, + "learning_rate": 0.00019548703392970976, + "loss": 0.0, + "step": 27041 + }, + { + "epoch": 2.523280768871886, + "grad_norm": NaN, + "learning_rate": 0.0001954798262507591, + "loss": 0.0, + "step": 27042 + }, + { + "epoch": 2.5233740785667633, + "grad_norm": NaN, + "learning_rate": 0.0001954726184561667, + "loss": 0.0, + "step": 27043 + }, + { + "epoch": 2.5234673882616403, + "grad_norm": NaN, + "learning_rate": 0.0001954654105459509, + "loss": 0.0, + "step": 27044 + }, + { + "epoch": 2.5235606979565177, + "grad_norm": NaN, + "learning_rate": 0.00019545820252013006, + "loss": 0.0, + "step": 27045 + }, + { + "epoch": 2.523654007651395, + "grad_norm": NaN, + "learning_rate": 0.00019545099437872246, + "loss": 0.0, + "step": 27046 + }, + { + "epoch": 2.523747317346272, + "grad_norm": NaN, + "learning_rate": 0.00019544378612174645, + "loss": 0.0, + "step": 27047 + }, + { + "epoch": 2.5238406270411495, + "grad_norm": NaN, + "learning_rate": 0.00019543657774922035, + "loss": 0.0, + "step": 27048 + }, + { + "epoch": 2.523933936736027, + "grad_norm": NaN, + "learning_rate": 0.00019542936926116248, + "loss": 0.0, + "step": 27049 + }, + { + "epoch": 2.5240272464309044, + "grad_norm": NaN, + "learning_rate": 0.0001954221606575912, + "loss": 0.0, + "step": 27050 + }, + { + "epoch": 2.5241205561257813, + "grad_norm": NaN, + "learning_rate": 0.00019541495193852482, + "loss": 0.0, + "step": 27051 + }, + { + "epoch": 2.5242138658206588, + "grad_norm": NaN, + "learning_rate": 0.00019540774310398165, + "loss": 0.0, + "step": 27052 + }, + { + "epoch": 2.524307175515536, + "grad_norm": NaN, + "learning_rate": 0.00019540053415398006, + "loss": 0.0, + "step": 27053 + }, + { + "epoch": 2.524400485210413, + "grad_norm": NaN, + "learning_rate": 0.00019539332508853834, + "loss": 0.0, + "step": 27054 + }, + { + "epoch": 2.5244937949052906, + "grad_norm": NaN, + "learning_rate": 0.00019538611590767487, + "loss": 0.0, + "step": 27055 + }, + { + "epoch": 2.524587104600168, + "grad_norm": NaN, + "learning_rate": 0.00019537890661140792, + "loss": 0.0, + "step": 27056 + }, + { + "epoch": 2.5246804142950454, + "grad_norm": NaN, + "learning_rate": 0.0001953716971997559, + "loss": 0.0, + "step": 27057 + }, + { + "epoch": 2.5247737239899224, + "grad_norm": NaN, + "learning_rate": 0.00019536448767273703, + "loss": 0.0, + "step": 27058 + }, + { + "epoch": 2.5248670336848, + "grad_norm": NaN, + "learning_rate": 0.00019535727803036976, + "loss": 0.0, + "step": 27059 + }, + { + "epoch": 2.5249603433796772, + "grad_norm": NaN, + "learning_rate": 0.00019535006827267232, + "loss": 0.0, + "step": 27060 + }, + { + "epoch": 2.525053653074554, + "grad_norm": NaN, + "learning_rate": 0.00019534285839966316, + "loss": 0.0, + "step": 27061 + }, + { + "epoch": 2.5251469627694316, + "grad_norm": NaN, + "learning_rate": 0.00019533564841136048, + "loss": 0.0, + "step": 27062 + }, + { + "epoch": 2.525240272464309, + "grad_norm": NaN, + "learning_rate": 0.00019532843830778268, + "loss": 0.0, + "step": 27063 + }, + { + "epoch": 2.5253335821591865, + "grad_norm": NaN, + "learning_rate": 0.00019532122808894813, + "loss": 0.0, + "step": 27064 + }, + { + "epoch": 2.525426891854064, + "grad_norm": NaN, + "learning_rate": 0.00019531401775487508, + "loss": 0.0, + "step": 27065 + }, + { + "epoch": 2.525520201548941, + "grad_norm": NaN, + "learning_rate": 0.0001953068073055819, + "loss": 0.0, + "step": 27066 + }, + { + "epoch": 2.5256135112438183, + "grad_norm": NaN, + "learning_rate": 0.00019529959674108695, + "loss": 0.0, + "step": 27067 + }, + { + "epoch": 2.5257068209386957, + "grad_norm": NaN, + "learning_rate": 0.00019529238606140858, + "loss": 0.0, + "step": 27068 + }, + { + "epoch": 2.5258001306335727, + "grad_norm": NaN, + "learning_rate": 0.00019528517526656503, + "loss": 0.0, + "step": 27069 + }, + { + "epoch": 2.52589344032845, + "grad_norm": NaN, + "learning_rate": 0.0001952779643565747, + "loss": 0.0, + "step": 27070 + }, + { + "epoch": 2.5259867500233275, + "grad_norm": NaN, + "learning_rate": 0.00019527075333145594, + "loss": 0.0, + "step": 27071 + }, + { + "epoch": 2.526080059718205, + "grad_norm": NaN, + "learning_rate": 0.00019526354219122707, + "loss": 0.0, + "step": 27072 + }, + { + "epoch": 2.526173369413082, + "grad_norm": NaN, + "learning_rate": 0.0001952563309359064, + "loss": 0.0, + "step": 27073 + }, + { + "epoch": 2.5262666791079593, + "grad_norm": NaN, + "learning_rate": 0.00019524911956551232, + "loss": 0.0, + "step": 27074 + }, + { + "epoch": 2.5263599888028367, + "grad_norm": NaN, + "learning_rate": 0.00019524190808006313, + "loss": 0.0, + "step": 27075 + }, + { + "epoch": 2.5264532984977137, + "grad_norm": NaN, + "learning_rate": 0.00019523469647957713, + "loss": 0.0, + "step": 27076 + }, + { + "epoch": 2.526546608192591, + "grad_norm": NaN, + "learning_rate": 0.00019522748476407273, + "loss": 0.0, + "step": 27077 + }, + { + "epoch": 2.5266399178874686, + "grad_norm": NaN, + "learning_rate": 0.0001952202729335682, + "loss": 0.0, + "step": 27078 + }, + { + "epoch": 2.526733227582346, + "grad_norm": NaN, + "learning_rate": 0.00019521306098808192, + "loss": 0.0, + "step": 27079 + }, + { + "epoch": 2.526826537277223, + "grad_norm": NaN, + "learning_rate": 0.00019520584892763224, + "loss": 0.0, + "step": 27080 + }, + { + "epoch": 2.5269198469721004, + "grad_norm": NaN, + "learning_rate": 0.00019519863675223747, + "loss": 0.0, + "step": 27081 + }, + { + "epoch": 2.527013156666978, + "grad_norm": NaN, + "learning_rate": 0.00019519142446191597, + "loss": 0.0, + "step": 27082 + }, + { + "epoch": 2.5271064663618548, + "grad_norm": NaN, + "learning_rate": 0.00019518421205668603, + "loss": 0.0, + "step": 27083 + }, + { + "epoch": 2.527199776056732, + "grad_norm": NaN, + "learning_rate": 0.000195176999536566, + "loss": 0.0, + "step": 27084 + }, + { + "epoch": 2.5272930857516096, + "grad_norm": NaN, + "learning_rate": 0.00019516978690157434, + "loss": 0.0, + "step": 27085 + }, + { + "epoch": 2.527386395446487, + "grad_norm": NaN, + "learning_rate": 0.00019516257415172922, + "loss": 0.0, + "step": 27086 + }, + { + "epoch": 2.5274797051413644, + "grad_norm": NaN, + "learning_rate": 0.00019515536128704905, + "loss": 0.0, + "step": 27087 + }, + { + "epoch": 2.5275730148362414, + "grad_norm": NaN, + "learning_rate": 0.0001951481483075522, + "loss": 0.0, + "step": 27088 + }, + { + "epoch": 2.527666324531119, + "grad_norm": NaN, + "learning_rate": 0.00019514093521325697, + "loss": 0.0, + "step": 27089 + }, + { + "epoch": 2.527759634225996, + "grad_norm": NaN, + "learning_rate": 0.00019513372200418173, + "loss": 0.0, + "step": 27090 + }, + { + "epoch": 2.5278529439208732, + "grad_norm": NaN, + "learning_rate": 0.00019512650868034476, + "loss": 0.0, + "step": 27091 + }, + { + "epoch": 2.5279462536157506, + "grad_norm": NaN, + "learning_rate": 0.0001951192952417645, + "loss": 0.0, + "step": 27092 + }, + { + "epoch": 2.528039563310628, + "grad_norm": NaN, + "learning_rate": 0.00019511208168845922, + "loss": 0.0, + "step": 27093 + }, + { + "epoch": 2.5281328730055055, + "grad_norm": NaN, + "learning_rate": 0.00019510486802044725, + "loss": 0.0, + "step": 27094 + }, + { + "epoch": 2.5282261827003825, + "grad_norm": NaN, + "learning_rate": 0.000195097654237747, + "loss": 0.0, + "step": 27095 + }, + { + "epoch": 2.52831949239526, + "grad_norm": NaN, + "learning_rate": 0.00019509044034037674, + "loss": 0.0, + "step": 27096 + }, + { + "epoch": 2.5284128020901373, + "grad_norm": NaN, + "learning_rate": 0.0001950832263283549, + "loss": 0.0, + "step": 27097 + }, + { + "epoch": 2.5285061117850143, + "grad_norm": NaN, + "learning_rate": 0.0001950760122016997, + "loss": 0.0, + "step": 27098 + }, + { + "epoch": 2.5285994214798917, + "grad_norm": NaN, + "learning_rate": 0.0001950687979604296, + "loss": 0.0, + "step": 27099 + }, + { + "epoch": 2.528692731174769, + "grad_norm": NaN, + "learning_rate": 0.0001950615836045629, + "loss": 0.0, + "step": 27100 + }, + { + "epoch": 2.5287860408696465, + "grad_norm": NaN, + "learning_rate": 0.00019505436913411792, + "loss": 0.0, + "step": 27101 + }, + { + "epoch": 2.5288793505645235, + "grad_norm": NaN, + "learning_rate": 0.00019504715454911304, + "loss": 0.0, + "step": 27102 + }, + { + "epoch": 2.528972660259401, + "grad_norm": NaN, + "learning_rate": 0.0001950399398495666, + "loss": 0.0, + "step": 27103 + }, + { + "epoch": 2.5290659699542783, + "grad_norm": NaN, + "learning_rate": 0.00019503272503549688, + "loss": 0.0, + "step": 27104 + }, + { + "epoch": 2.5291592796491553, + "grad_norm": NaN, + "learning_rate": 0.00019502551010692232, + "loss": 0.0, + "step": 27105 + }, + { + "epoch": 2.5292525893440327, + "grad_norm": NaN, + "learning_rate": 0.0001950182950638612, + "loss": 0.0, + "step": 27106 + }, + { + "epoch": 2.52934589903891, + "grad_norm": NaN, + "learning_rate": 0.00019501107990633194, + "loss": 0.0, + "step": 27107 + }, + { + "epoch": 2.5294392087337876, + "grad_norm": NaN, + "learning_rate": 0.00019500386463435282, + "loss": 0.0, + "step": 27108 + }, + { + "epoch": 2.529532518428665, + "grad_norm": NaN, + "learning_rate": 0.0001949966492479422, + "loss": 0.0, + "step": 27109 + }, + { + "epoch": 2.529625828123542, + "grad_norm": NaN, + "learning_rate": 0.0001949894337471184, + "loss": 0.0, + "step": 27110 + }, + { + "epoch": 2.5297191378184194, + "grad_norm": NaN, + "learning_rate": 0.00019498221813189984, + "loss": 0.0, + "step": 27111 + }, + { + "epoch": 2.5298124475132964, + "grad_norm": NaN, + "learning_rate": 0.00019497500240230482, + "loss": 0.0, + "step": 27112 + }, + { + "epoch": 2.529905757208174, + "grad_norm": NaN, + "learning_rate": 0.00019496778655835165, + "loss": 0.0, + "step": 27113 + }, + { + "epoch": 2.529999066903051, + "grad_norm": NaN, + "learning_rate": 0.00019496057060005876, + "loss": 0.0, + "step": 27114 + }, + { + "epoch": 2.5300923765979286, + "grad_norm": NaN, + "learning_rate": 0.00019495335452744445, + "loss": 0.0, + "step": 27115 + }, + { + "epoch": 2.530185686292806, + "grad_norm": NaN, + "learning_rate": 0.00019494613834052704, + "loss": 0.0, + "step": 27116 + }, + { + "epoch": 2.530278995987683, + "grad_norm": NaN, + "learning_rate": 0.00019493892203932493, + "loss": 0.0, + "step": 27117 + }, + { + "epoch": 2.5303723056825604, + "grad_norm": NaN, + "learning_rate": 0.00019493170562385649, + "loss": 0.0, + "step": 27118 + }, + { + "epoch": 2.530465615377438, + "grad_norm": NaN, + "learning_rate": 0.00019492448909414002, + "loss": 0.0, + "step": 27119 + }, + { + "epoch": 2.530558925072315, + "grad_norm": NaN, + "learning_rate": 0.00019491727245019385, + "loss": 0.0, + "step": 27120 + }, + { + "epoch": 2.5306522347671923, + "grad_norm": NaN, + "learning_rate": 0.0001949100556920364, + "loss": 0.0, + "step": 27121 + }, + { + "epoch": 2.5307455444620697, + "grad_norm": NaN, + "learning_rate": 0.00019490283881968596, + "loss": 0.0, + "step": 27122 + }, + { + "epoch": 2.530838854156947, + "grad_norm": NaN, + "learning_rate": 0.0001948956218331609, + "loss": 0.0, + "step": 27123 + }, + { + "epoch": 2.530932163851824, + "grad_norm": NaN, + "learning_rate": 0.0001948884047324796, + "loss": 0.0, + "step": 27124 + }, + { + "epoch": 2.5310254735467015, + "grad_norm": NaN, + "learning_rate": 0.00019488118751766036, + "loss": 0.0, + "step": 27125 + }, + { + "epoch": 2.531118783241579, + "grad_norm": NaN, + "learning_rate": 0.00019487397018872155, + "loss": 0.0, + "step": 27126 + }, + { + "epoch": 2.531212092936456, + "grad_norm": NaN, + "learning_rate": 0.00019486675274568154, + "loss": 0.0, + "step": 27127 + }, + { + "epoch": 2.5313054026313333, + "grad_norm": NaN, + "learning_rate": 0.00019485953518855864, + "loss": 0.0, + "step": 27128 + }, + { + "epoch": 2.5313987123262107, + "grad_norm": NaN, + "learning_rate": 0.00019485231751737125, + "loss": 0.0, + "step": 27129 + }, + { + "epoch": 2.531492022021088, + "grad_norm": NaN, + "learning_rate": 0.0001948450997321377, + "loss": 0.0, + "step": 27130 + }, + { + "epoch": 2.5315853317159656, + "grad_norm": NaN, + "learning_rate": 0.00019483788183287634, + "loss": 0.0, + "step": 27131 + }, + { + "epoch": 2.5316786414108425, + "grad_norm": NaN, + "learning_rate": 0.00019483066381960557, + "loss": 0.0, + "step": 27132 + }, + { + "epoch": 2.53177195110572, + "grad_norm": NaN, + "learning_rate": 0.00019482344569234366, + "loss": 0.0, + "step": 27133 + }, + { + "epoch": 2.531865260800597, + "grad_norm": NaN, + "learning_rate": 0.00019481622745110903, + "loss": 0.0, + "step": 27134 + }, + { + "epoch": 2.5319585704954743, + "grad_norm": NaN, + "learning_rate": 0.00019480900909591998, + "loss": 0.0, + "step": 27135 + }, + { + "epoch": 2.5320518801903518, + "grad_norm": NaN, + "learning_rate": 0.0001948017906267949, + "loss": 0.0, + "step": 27136 + }, + { + "epoch": 2.532145189885229, + "grad_norm": NaN, + "learning_rate": 0.00019479457204375214, + "loss": 0.0, + "step": 27137 + }, + { + "epoch": 2.5322384995801066, + "grad_norm": NaN, + "learning_rate": 0.00019478735334681007, + "loss": 0.0, + "step": 27138 + }, + { + "epoch": 2.5323318092749836, + "grad_norm": NaN, + "learning_rate": 0.000194780134535987, + "loss": 0.0, + "step": 27139 + }, + { + "epoch": 2.532425118969861, + "grad_norm": NaN, + "learning_rate": 0.00019477291561130133, + "loss": 0.0, + "step": 27140 + }, + { + "epoch": 2.5325184286647384, + "grad_norm": NaN, + "learning_rate": 0.00019476569657277142, + "loss": 0.0, + "step": 27141 + }, + { + "epoch": 2.5326117383596154, + "grad_norm": NaN, + "learning_rate": 0.00019475847742041557, + "loss": 0.0, + "step": 27142 + }, + { + "epoch": 2.532705048054493, + "grad_norm": NaN, + "learning_rate": 0.00019475125815425216, + "loss": 0.0, + "step": 27143 + }, + { + "epoch": 2.5327983577493702, + "grad_norm": NaN, + "learning_rate": 0.00019474403877429958, + "loss": 0.0, + "step": 27144 + }, + { + "epoch": 2.5328916674442477, + "grad_norm": NaN, + "learning_rate": 0.00019473681928057617, + "loss": 0.0, + "step": 27145 + }, + { + "epoch": 2.5329849771391246, + "grad_norm": NaN, + "learning_rate": 0.00019472959967310028, + "loss": 0.0, + "step": 27146 + }, + { + "epoch": 2.533078286834002, + "grad_norm": NaN, + "learning_rate": 0.00019472237995189027, + "loss": 0.0, + "step": 27147 + }, + { + "epoch": 2.5331715965288795, + "grad_norm": NaN, + "learning_rate": 0.00019471516011696448, + "loss": 0.0, + "step": 27148 + }, + { + "epoch": 2.5332649062237564, + "grad_norm": NaN, + "learning_rate": 0.00019470794016834128, + "loss": 0.0, + "step": 27149 + }, + { + "epoch": 2.533358215918634, + "grad_norm": NaN, + "learning_rate": 0.00019470072010603905, + "loss": 0.0, + "step": 27150 + }, + { + "epoch": 2.5334515256135113, + "grad_norm": NaN, + "learning_rate": 0.0001946934999300761, + "loss": 0.0, + "step": 27151 + }, + { + "epoch": 2.5335448353083887, + "grad_norm": NaN, + "learning_rate": 0.00019468627964047087, + "loss": 0.0, + "step": 27152 + }, + { + "epoch": 2.5336381450032657, + "grad_norm": NaN, + "learning_rate": 0.00019467905923724164, + "loss": 0.0, + "step": 27153 + }, + { + "epoch": 2.533731454698143, + "grad_norm": NaN, + "learning_rate": 0.00019467183872040679, + "loss": 0.0, + "step": 27154 + }, + { + "epoch": 2.5338247643930205, + "grad_norm": NaN, + "learning_rate": 0.00019466461808998468, + "loss": 0.0, + "step": 27155 + }, + { + "epoch": 2.5339180740878975, + "grad_norm": NaN, + "learning_rate": 0.0001946573973459937, + "loss": 0.0, + "step": 27156 + }, + { + "epoch": 2.534011383782775, + "grad_norm": NaN, + "learning_rate": 0.00019465017648845218, + "loss": 0.0, + "step": 27157 + }, + { + "epoch": 2.5341046934776523, + "grad_norm": NaN, + "learning_rate": 0.00019464295551737846, + "loss": 0.0, + "step": 27158 + }, + { + "epoch": 2.5341980031725297, + "grad_norm": NaN, + "learning_rate": 0.00019463573443279095, + "loss": 0.0, + "step": 27159 + }, + { + "epoch": 2.534291312867407, + "grad_norm": NaN, + "learning_rate": 0.000194628513234708, + "loss": 0.0, + "step": 27160 + }, + { + "epoch": 2.534384622562284, + "grad_norm": NaN, + "learning_rate": 0.00019462129192314795, + "loss": 0.0, + "step": 27161 + }, + { + "epoch": 2.5344779322571616, + "grad_norm": NaN, + "learning_rate": 0.00019461407049812916, + "loss": 0.0, + "step": 27162 + }, + { + "epoch": 2.534571241952039, + "grad_norm": NaN, + "learning_rate": 0.00019460684895967003, + "loss": 0.0, + "step": 27163 + }, + { + "epoch": 2.534664551646916, + "grad_norm": NaN, + "learning_rate": 0.0001945996273077889, + "loss": 0.0, + "step": 27164 + }, + { + "epoch": 2.5347578613417934, + "grad_norm": NaN, + "learning_rate": 0.00019459240554250412, + "loss": 0.0, + "step": 27165 + }, + { + "epoch": 2.534851171036671, + "grad_norm": NaN, + "learning_rate": 0.00019458518366383405, + "loss": 0.0, + "step": 27166 + }, + { + "epoch": 2.534944480731548, + "grad_norm": NaN, + "learning_rate": 0.00019457796167179708, + "loss": 0.0, + "step": 27167 + }, + { + "epoch": 2.535037790426425, + "grad_norm": NaN, + "learning_rate": 0.0001945707395664115, + "loss": 0.0, + "step": 27168 + }, + { + "epoch": 2.5351311001213026, + "grad_norm": NaN, + "learning_rate": 0.0001945635173476958, + "loss": 0.0, + "step": 27169 + }, + { + "epoch": 2.53522440981618, + "grad_norm": NaN, + "learning_rate": 0.00019455629501566828, + "loss": 0.0, + "step": 27170 + }, + { + "epoch": 2.535317719511057, + "grad_norm": NaN, + "learning_rate": 0.00019454907257034727, + "loss": 0.0, + "step": 27171 + }, + { + "epoch": 2.5354110292059344, + "grad_norm": NaN, + "learning_rate": 0.00019454185001175118, + "loss": 0.0, + "step": 27172 + }, + { + "epoch": 2.535504338900812, + "grad_norm": NaN, + "learning_rate": 0.00019453462733989832, + "loss": 0.0, + "step": 27173 + }, + { + "epoch": 2.5355976485956893, + "grad_norm": NaN, + "learning_rate": 0.00019452740455480716, + "loss": 0.0, + "step": 27174 + }, + { + "epoch": 2.5356909582905662, + "grad_norm": NaN, + "learning_rate": 0.00019452018165649594, + "loss": 0.0, + "step": 27175 + }, + { + "epoch": 2.5357842679854437, + "grad_norm": NaN, + "learning_rate": 0.00019451295864498315, + "loss": 0.0, + "step": 27176 + }, + { + "epoch": 2.535877577680321, + "grad_norm": NaN, + "learning_rate": 0.00019450573552028705, + "loss": 0.0, + "step": 27177 + }, + { + "epoch": 2.535970887375198, + "grad_norm": NaN, + "learning_rate": 0.00019449851228242604, + "loss": 0.0, + "step": 27178 + }, + { + "epoch": 2.5360641970700755, + "grad_norm": NaN, + "learning_rate": 0.0001944912889314185, + "loss": 0.0, + "step": 27179 + }, + { + "epoch": 2.536157506764953, + "grad_norm": NaN, + "learning_rate": 0.00019448406546728283, + "loss": 0.0, + "step": 27180 + }, + { + "epoch": 2.5362508164598303, + "grad_norm": NaN, + "learning_rate": 0.0001944768418900373, + "loss": 0.0, + "step": 27181 + }, + { + "epoch": 2.5363441261547077, + "grad_norm": NaN, + "learning_rate": 0.00019446961819970037, + "loss": 0.0, + "step": 27182 + }, + { + "epoch": 2.5364374358495847, + "grad_norm": NaN, + "learning_rate": 0.00019446239439629038, + "loss": 0.0, + "step": 27183 + }, + { + "epoch": 2.536530745544462, + "grad_norm": NaN, + "learning_rate": 0.00019445517047982563, + "loss": 0.0, + "step": 27184 + }, + { + "epoch": 2.536624055239339, + "grad_norm": NaN, + "learning_rate": 0.00019444794645032465, + "loss": 0.0, + "step": 27185 + }, + { + "epoch": 2.5367173649342165, + "grad_norm": NaN, + "learning_rate": 0.00019444072230780565, + "loss": 0.0, + "step": 27186 + }, + { + "epoch": 2.536810674629094, + "grad_norm": NaN, + "learning_rate": 0.00019443349805228706, + "loss": 0.0, + "step": 27187 + }, + { + "epoch": 2.5369039843239714, + "grad_norm": NaN, + "learning_rate": 0.00019442627368378726, + "loss": 0.0, + "step": 27188 + }, + { + "epoch": 2.5369972940188488, + "grad_norm": NaN, + "learning_rate": 0.00019441904920232462, + "loss": 0.0, + "step": 27189 + }, + { + "epoch": 2.5370906037137257, + "grad_norm": NaN, + "learning_rate": 0.00019441182460791745, + "loss": 0.0, + "step": 27190 + }, + { + "epoch": 2.537183913408603, + "grad_norm": NaN, + "learning_rate": 0.0001944045999005842, + "loss": 0.0, + "step": 27191 + }, + { + "epoch": 2.5372772231034806, + "grad_norm": NaN, + "learning_rate": 0.00019439737508034318, + "loss": 0.0, + "step": 27192 + }, + { + "epoch": 2.5373705327983576, + "grad_norm": NaN, + "learning_rate": 0.00019439015014721282, + "loss": 0.0, + "step": 27193 + }, + { + "epoch": 2.537463842493235, + "grad_norm": NaN, + "learning_rate": 0.00019438292510121145, + "loss": 0.0, + "step": 27194 + }, + { + "epoch": 2.5375571521881124, + "grad_norm": NaN, + "learning_rate": 0.00019437569994235743, + "loss": 0.0, + "step": 27195 + }, + { + "epoch": 2.53765046188299, + "grad_norm": NaN, + "learning_rate": 0.0001943684746706692, + "loss": 0.0, + "step": 27196 + }, + { + "epoch": 2.537743771577867, + "grad_norm": NaN, + "learning_rate": 0.000194361249286165, + "loss": 0.0, + "step": 27197 + }, + { + "epoch": 2.537837081272744, + "grad_norm": NaN, + "learning_rate": 0.00019435402378886334, + "loss": 0.0, + "step": 27198 + }, + { + "epoch": 2.5379303909676216, + "grad_norm": NaN, + "learning_rate": 0.0001943467981787825, + "loss": 0.0, + "step": 27199 + }, + { + "epoch": 2.5380237006624986, + "grad_norm": NaN, + "learning_rate": 0.00019433957245594094, + "loss": 0.0, + "step": 27200 + }, + { + "epoch": 2.538117010357376, + "grad_norm": NaN, + "learning_rate": 0.00019433234662035692, + "loss": 0.0, + "step": 27201 + }, + { + "epoch": 2.5382103200522534, + "grad_norm": NaN, + "learning_rate": 0.00019432512067204893, + "loss": 0.0, + "step": 27202 + }, + { + "epoch": 2.538303629747131, + "grad_norm": NaN, + "learning_rate": 0.00019431789461103524, + "loss": 0.0, + "step": 27203 + }, + { + "epoch": 2.5383969394420083, + "grad_norm": NaN, + "learning_rate": 0.00019431066843733432, + "loss": 0.0, + "step": 27204 + }, + { + "epoch": 2.5384902491368853, + "grad_norm": NaN, + "learning_rate": 0.00019430344215096447, + "loss": 0.0, + "step": 27205 + }, + { + "epoch": 2.5385835588317627, + "grad_norm": NaN, + "learning_rate": 0.00019429621575194406, + "loss": 0.0, + "step": 27206 + }, + { + "epoch": 2.5386768685266397, + "grad_norm": NaN, + "learning_rate": 0.00019428898924029158, + "loss": 0.0, + "step": 27207 + }, + { + "epoch": 2.538770178221517, + "grad_norm": NaN, + "learning_rate": 0.00019428176261602524, + "loss": 0.0, + "step": 27208 + }, + { + "epoch": 2.5388634879163945, + "grad_norm": NaN, + "learning_rate": 0.0001942745358791635, + "loss": 0.0, + "step": 27209 + }, + { + "epoch": 2.538956797611272, + "grad_norm": NaN, + "learning_rate": 0.0001942673090297248, + "loss": 0.0, + "step": 27210 + }, + { + "epoch": 2.5390501073061493, + "grad_norm": NaN, + "learning_rate": 0.0001942600820677274, + "loss": 0.0, + "step": 27211 + }, + { + "epoch": 2.5391434170010263, + "grad_norm": NaN, + "learning_rate": 0.00019425285499318967, + "loss": 0.0, + "step": 27212 + }, + { + "epoch": 2.5392367266959037, + "grad_norm": NaN, + "learning_rate": 0.00019424562780613012, + "loss": 0.0, + "step": 27213 + }, + { + "epoch": 2.539330036390781, + "grad_norm": NaN, + "learning_rate": 0.00019423840050656702, + "loss": 0.0, + "step": 27214 + }, + { + "epoch": 2.539423346085658, + "grad_norm": NaN, + "learning_rate": 0.00019423117309451876, + "loss": 0.0, + "step": 27215 + }, + { + "epoch": 2.5395166557805355, + "grad_norm": NaN, + "learning_rate": 0.00019422394557000377, + "loss": 0.0, + "step": 27216 + }, + { + "epoch": 2.539609965475413, + "grad_norm": NaN, + "learning_rate": 0.0001942167179330403, + "loss": 0.0, + "step": 27217 + }, + { + "epoch": 2.5397032751702904, + "grad_norm": NaN, + "learning_rate": 0.00019420949018364688, + "loss": 0.0, + "step": 27218 + }, + { + "epoch": 2.5397965848651674, + "grad_norm": NaN, + "learning_rate": 0.0001942022623218418, + "loss": 0.0, + "step": 27219 + }, + { + "epoch": 2.5398898945600448, + "grad_norm": NaN, + "learning_rate": 0.00019419503434764347, + "loss": 0.0, + "step": 27220 + }, + { + "epoch": 2.539983204254922, + "grad_norm": NaN, + "learning_rate": 0.00019418780626107027, + "loss": 0.0, + "step": 27221 + }, + { + "epoch": 2.540076513949799, + "grad_norm": NaN, + "learning_rate": 0.00019418057806214054, + "loss": 0.0, + "step": 27222 + }, + { + "epoch": 2.5401698236446766, + "grad_norm": NaN, + "learning_rate": 0.00019417334975087273, + "loss": 0.0, + "step": 27223 + }, + { + "epoch": 2.540263133339554, + "grad_norm": NaN, + "learning_rate": 0.00019416612132728514, + "loss": 0.0, + "step": 27224 + }, + { + "epoch": 2.5403564430344314, + "grad_norm": NaN, + "learning_rate": 0.00019415889279139617, + "loss": 0.0, + "step": 27225 + }, + { + "epoch": 2.540449752729309, + "grad_norm": NaN, + "learning_rate": 0.00019415166414322427, + "loss": 0.0, + "step": 27226 + }, + { + "epoch": 2.540543062424186, + "grad_norm": NaN, + "learning_rate": 0.00019414443538278773, + "loss": 0.0, + "step": 27227 + }, + { + "epoch": 2.5406363721190632, + "grad_norm": NaN, + "learning_rate": 0.00019413720651010496, + "loss": 0.0, + "step": 27228 + }, + { + "epoch": 2.54072968181394, + "grad_norm": NaN, + "learning_rate": 0.00019412997752519441, + "loss": 0.0, + "step": 27229 + }, + { + "epoch": 2.5408229915088176, + "grad_norm": NaN, + "learning_rate": 0.00019412274842807434, + "loss": 0.0, + "step": 27230 + }, + { + "epoch": 2.540916301203695, + "grad_norm": NaN, + "learning_rate": 0.00019411551921876317, + "loss": 0.0, + "step": 27231 + }, + { + "epoch": 2.5410096108985725, + "grad_norm": NaN, + "learning_rate": 0.00019410828989727935, + "loss": 0.0, + "step": 27232 + }, + { + "epoch": 2.54110292059345, + "grad_norm": NaN, + "learning_rate": 0.00019410106046364122, + "loss": 0.0, + "step": 27233 + }, + { + "epoch": 2.541196230288327, + "grad_norm": NaN, + "learning_rate": 0.00019409383091786712, + "loss": 0.0, + "step": 27234 + }, + { + "epoch": 2.5412895399832043, + "grad_norm": NaN, + "learning_rate": 0.0001940866012599755, + "loss": 0.0, + "step": 27235 + }, + { + "epoch": 2.5413828496780817, + "grad_norm": NaN, + "learning_rate": 0.0001940793714899847, + "loss": 0.0, + "step": 27236 + }, + { + "epoch": 2.5414761593729587, + "grad_norm": NaN, + "learning_rate": 0.0001940721416079131, + "loss": 0.0, + "step": 27237 + }, + { + "epoch": 2.541569469067836, + "grad_norm": NaN, + "learning_rate": 0.00019406491161377915, + "loss": 0.0, + "step": 27238 + }, + { + "epoch": 2.5416627787627135, + "grad_norm": NaN, + "learning_rate": 0.00019405768150760117, + "loss": 0.0, + "step": 27239 + }, + { + "epoch": 2.541756088457591, + "grad_norm": NaN, + "learning_rate": 0.0001940504512893975, + "loss": 0.0, + "step": 27240 + }, + { + "epoch": 2.541849398152468, + "grad_norm": NaN, + "learning_rate": 0.0001940432209591866, + "loss": 0.0, + "step": 27241 + }, + { + "epoch": 2.5419427078473453, + "grad_norm": NaN, + "learning_rate": 0.0001940359905169869, + "loss": 0.0, + "step": 27242 + }, + { + "epoch": 2.5420360175422227, + "grad_norm": NaN, + "learning_rate": 0.00019402875996281667, + "loss": 0.0, + "step": 27243 + }, + { + "epoch": 2.5421293272370997, + "grad_norm": NaN, + "learning_rate": 0.00019402152929669431, + "loss": 0.0, + "step": 27244 + }, + { + "epoch": 2.542222636931977, + "grad_norm": NaN, + "learning_rate": 0.0001940142985186383, + "loss": 0.0, + "step": 27245 + }, + { + "epoch": 2.5423159466268546, + "grad_norm": NaN, + "learning_rate": 0.00019400706762866694, + "loss": 0.0, + "step": 27246 + }, + { + "epoch": 2.542409256321732, + "grad_norm": NaN, + "learning_rate": 0.00019399983662679864, + "loss": 0.0, + "step": 27247 + }, + { + "epoch": 2.5425025660166094, + "grad_norm": NaN, + "learning_rate": 0.00019399260551305184, + "loss": 0.0, + "step": 27248 + }, + { + "epoch": 2.5425958757114864, + "grad_norm": NaN, + "learning_rate": 0.0001939853742874448, + "loss": 0.0, + "step": 27249 + }, + { + "epoch": 2.542689185406364, + "grad_norm": NaN, + "learning_rate": 0.000193978142949996, + "loss": 0.0, + "step": 27250 + }, + { + "epoch": 2.5427824951012408, + "grad_norm": NaN, + "learning_rate": 0.00019397091150072387, + "loss": 0.0, + "step": 27251 + }, + { + "epoch": 2.542875804796118, + "grad_norm": NaN, + "learning_rate": 0.00019396367993964667, + "loss": 0.0, + "step": 27252 + }, + { + "epoch": 2.5429691144909956, + "grad_norm": NaN, + "learning_rate": 0.00019395644826678286, + "loss": 0.0, + "step": 27253 + }, + { + "epoch": 2.543062424185873, + "grad_norm": NaN, + "learning_rate": 0.00019394921648215087, + "loss": 0.0, + "step": 27254 + }, + { + "epoch": 2.5431557338807504, + "grad_norm": NaN, + "learning_rate": 0.00019394198458576903, + "loss": 0.0, + "step": 27255 + }, + { + "epoch": 2.5432490435756274, + "grad_norm": NaN, + "learning_rate": 0.0001939347525776557, + "loss": 0.0, + "step": 27256 + }, + { + "epoch": 2.543342353270505, + "grad_norm": NaN, + "learning_rate": 0.00019392752045782936, + "loss": 0.0, + "step": 27257 + }, + { + "epoch": 2.5434356629653823, + "grad_norm": NaN, + "learning_rate": 0.00019392028822630829, + "loss": 0.0, + "step": 27258 + }, + { + "epoch": 2.5435289726602592, + "grad_norm": NaN, + "learning_rate": 0.00019391305588311098, + "loss": 0.0, + "step": 27259 + }, + { + "epoch": 2.5436222823551367, + "grad_norm": NaN, + "learning_rate": 0.00019390582342825575, + "loss": 0.0, + "step": 27260 + }, + { + "epoch": 2.543715592050014, + "grad_norm": NaN, + "learning_rate": 0.00019389859086176104, + "loss": 0.0, + "step": 27261 + }, + { + "epoch": 2.5438089017448915, + "grad_norm": NaN, + "learning_rate": 0.00019389135818364522, + "loss": 0.0, + "step": 27262 + }, + { + "epoch": 2.5439022114397685, + "grad_norm": NaN, + "learning_rate": 0.00019388412539392662, + "loss": 0.0, + "step": 27263 + }, + { + "epoch": 2.543995521134646, + "grad_norm": NaN, + "learning_rate": 0.0001938768924926238, + "loss": 0.0, + "step": 27264 + }, + { + "epoch": 2.5440888308295233, + "grad_norm": NaN, + "learning_rate": 0.00019386965947975495, + "loss": 0.0, + "step": 27265 + }, + { + "epoch": 2.5441821405244003, + "grad_norm": NaN, + "learning_rate": 0.00019386242635533853, + "loss": 0.0, + "step": 27266 + }, + { + "epoch": 2.5442754502192777, + "grad_norm": NaN, + "learning_rate": 0.00019385519311939306, + "loss": 0.0, + "step": 27267 + }, + { + "epoch": 2.544368759914155, + "grad_norm": NaN, + "learning_rate": 0.00019384795977193676, + "loss": 0.0, + "step": 27268 + }, + { + "epoch": 2.5444620696090325, + "grad_norm": NaN, + "learning_rate": 0.00019384072631298803, + "loss": 0.0, + "step": 27269 + }, + { + "epoch": 2.5445553793039095, + "grad_norm": NaN, + "learning_rate": 0.00019383349274256542, + "loss": 0.0, + "step": 27270 + }, + { + "epoch": 2.544648688998787, + "grad_norm": NaN, + "learning_rate": 0.0001938262590606872, + "loss": 0.0, + "step": 27271 + }, + { + "epoch": 2.5447419986936644, + "grad_norm": NaN, + "learning_rate": 0.00019381902526737174, + "loss": 0.0, + "step": 27272 + }, + { + "epoch": 2.5448353083885413, + "grad_norm": NaN, + "learning_rate": 0.0001938117913626375, + "loss": 0.0, + "step": 27273 + }, + { + "epoch": 2.5449286180834187, + "grad_norm": NaN, + "learning_rate": 0.00019380455734650286, + "loss": 0.0, + "step": 27274 + }, + { + "epoch": 2.545021927778296, + "grad_norm": NaN, + "learning_rate": 0.0001937973232189862, + "loss": 0.0, + "step": 27275 + }, + { + "epoch": 2.5451152374731736, + "grad_norm": NaN, + "learning_rate": 0.00019379008898010594, + "loss": 0.0, + "step": 27276 + }, + { + "epoch": 2.545208547168051, + "grad_norm": NaN, + "learning_rate": 0.00019378285462988044, + "loss": 0.0, + "step": 27277 + }, + { + "epoch": 2.545301856862928, + "grad_norm": NaN, + "learning_rate": 0.00019377562016832808, + "loss": 0.0, + "step": 27278 + }, + { + "epoch": 2.5453951665578054, + "grad_norm": NaN, + "learning_rate": 0.00019376838559546735, + "loss": 0.0, + "step": 27279 + }, + { + "epoch": 2.545488476252683, + "grad_norm": NaN, + "learning_rate": 0.00019376115091131654, + "loss": 0.0, + "step": 27280 + }, + { + "epoch": 2.54558178594756, + "grad_norm": NaN, + "learning_rate": 0.00019375391611589407, + "loss": 0.0, + "step": 27281 + }, + { + "epoch": 2.545675095642437, + "grad_norm": NaN, + "learning_rate": 0.00019374668120921836, + "loss": 0.0, + "step": 27282 + }, + { + "epoch": 2.5457684053373146, + "grad_norm": NaN, + "learning_rate": 0.00019373944619130787, + "loss": 0.0, + "step": 27283 + }, + { + "epoch": 2.545861715032192, + "grad_norm": NaN, + "learning_rate": 0.00019373221106218087, + "loss": 0.0, + "step": 27284 + }, + { + "epoch": 2.545955024727069, + "grad_norm": NaN, + "learning_rate": 0.00019372497582185576, + "loss": 0.0, + "step": 27285 + }, + { + "epoch": 2.5460483344219464, + "grad_norm": NaN, + "learning_rate": 0.00019371774047035107, + "loss": 0.0, + "step": 27286 + }, + { + "epoch": 2.546141644116824, + "grad_norm": NaN, + "learning_rate": 0.0001937105050076851, + "loss": 0.0, + "step": 27287 + }, + { + "epoch": 2.546234953811701, + "grad_norm": NaN, + "learning_rate": 0.00019370326943387624, + "loss": 0.0, + "step": 27288 + }, + { + "epoch": 2.5463282635065783, + "grad_norm": NaN, + "learning_rate": 0.00019369603374894291, + "loss": 0.0, + "step": 27289 + }, + { + "epoch": 2.5464215732014557, + "grad_norm": NaN, + "learning_rate": 0.00019368879795290353, + "loss": 0.0, + "step": 27290 + }, + { + "epoch": 2.546514882896333, + "grad_norm": NaN, + "learning_rate": 0.00019368156204577642, + "loss": 0.0, + "step": 27291 + }, + { + "epoch": 2.54660819259121, + "grad_norm": NaN, + "learning_rate": 0.00019367432602758014, + "loss": 0.0, + "step": 27292 + }, + { + "epoch": 2.5467015022860875, + "grad_norm": NaN, + "learning_rate": 0.00019366708989833293, + "loss": 0.0, + "step": 27293 + }, + { + "epoch": 2.546794811980965, + "grad_norm": NaN, + "learning_rate": 0.0001936598536580532, + "loss": 0.0, + "step": 27294 + }, + { + "epoch": 2.546888121675842, + "grad_norm": NaN, + "learning_rate": 0.0001936526173067595, + "loss": 0.0, + "step": 27295 + }, + { + "epoch": 2.5469814313707193, + "grad_norm": NaN, + "learning_rate": 0.00019364538084447008, + "loss": 0.0, + "step": 27296 + }, + { + "epoch": 2.5470747410655967, + "grad_norm": NaN, + "learning_rate": 0.00019363814427120335, + "loss": 0.0, + "step": 27297 + }, + { + "epoch": 2.547168050760474, + "grad_norm": NaN, + "learning_rate": 0.00019363090758697774, + "loss": 0.0, + "step": 27298 + }, + { + "epoch": 2.5472613604553516, + "grad_norm": NaN, + "learning_rate": 0.00019362367079181172, + "loss": 0.0, + "step": 27299 + }, + { + "epoch": 2.5473546701502285, + "grad_norm": NaN, + "learning_rate": 0.00019361643388572357, + "loss": 0.0, + "step": 27300 + }, + { + "epoch": 2.547447979845106, + "grad_norm": NaN, + "learning_rate": 0.00019360919686873176, + "loss": 0.0, + "step": 27301 + }, + { + "epoch": 2.547541289539983, + "grad_norm": NaN, + "learning_rate": 0.00019360195974085475, + "loss": 0.0, + "step": 27302 + }, + { + "epoch": 2.5476345992348604, + "grad_norm": NaN, + "learning_rate": 0.00019359472250211074, + "loss": 0.0, + "step": 27303 + }, + { + "epoch": 2.5477279089297378, + "grad_norm": NaN, + "learning_rate": 0.00019358748515251834, + "loss": 0.0, + "step": 27304 + }, + { + "epoch": 2.547821218624615, + "grad_norm": NaN, + "learning_rate": 0.00019358024769209595, + "loss": 0.0, + "step": 27305 + }, + { + "epoch": 2.5479145283194926, + "grad_norm": NaN, + "learning_rate": 0.00019357301012086183, + "loss": 0.0, + "step": 27306 + }, + { + "epoch": 2.5480078380143696, + "grad_norm": NaN, + "learning_rate": 0.00019356577243883437, + "loss": 0.0, + "step": 27307 + }, + { + "epoch": 2.548101147709247, + "grad_norm": NaN, + "learning_rate": 0.00019355853464603218, + "loss": 0.0, + "step": 27308 + }, + { + "epoch": 2.5481944574041244, + "grad_norm": NaN, + "learning_rate": 0.0001935512967424735, + "loss": 0.0, + "step": 27309 + }, + { + "epoch": 2.5482877670990014, + "grad_norm": NaN, + "learning_rate": 0.00019354405872817673, + "loss": 0.0, + "step": 27310 + }, + { + "epoch": 2.548381076793879, + "grad_norm": NaN, + "learning_rate": 0.00019353682060316039, + "loss": 0.0, + "step": 27311 + }, + { + "epoch": 2.5484743864887562, + "grad_norm": NaN, + "learning_rate": 0.00019352958236744277, + "loss": 0.0, + "step": 27312 + }, + { + "epoch": 2.5485676961836337, + "grad_norm": NaN, + "learning_rate": 0.0001935223440210423, + "loss": 0.0, + "step": 27313 + }, + { + "epoch": 2.5486610058785106, + "grad_norm": NaN, + "learning_rate": 0.00019351510556397743, + "loss": 0.0, + "step": 27314 + }, + { + "epoch": 2.548754315573388, + "grad_norm": NaN, + "learning_rate": 0.00019350786699626654, + "loss": 0.0, + "step": 27315 + }, + { + "epoch": 2.5488476252682655, + "grad_norm": NaN, + "learning_rate": 0.000193500628317928, + "loss": 0.0, + "step": 27316 + }, + { + "epoch": 2.5489409349631424, + "grad_norm": NaN, + "learning_rate": 0.0001934933895289803, + "loss": 0.0, + "step": 27317 + }, + { + "epoch": 2.54903424465802, + "grad_norm": NaN, + "learning_rate": 0.00019348615062944174, + "loss": 0.0, + "step": 27318 + }, + { + "epoch": 2.5491275543528973, + "grad_norm": NaN, + "learning_rate": 0.0001934789116193308, + "loss": 0.0, + "step": 27319 + }, + { + "epoch": 2.5492208640477747, + "grad_norm": NaN, + "learning_rate": 0.00019347167249866588, + "loss": 0.0, + "step": 27320 + }, + { + "epoch": 2.549314173742652, + "grad_norm": NaN, + "learning_rate": 0.00019346443326746538, + "loss": 0.0, + "step": 27321 + }, + { + "epoch": 2.549407483437529, + "grad_norm": NaN, + "learning_rate": 0.00019345719392574765, + "loss": 0.0, + "step": 27322 + }, + { + "epoch": 2.5495007931324065, + "grad_norm": NaN, + "learning_rate": 0.0001934499544735312, + "loss": 0.0, + "step": 27323 + }, + { + "epoch": 2.5495941028272835, + "grad_norm": NaN, + "learning_rate": 0.0001934427149108344, + "loss": 0.0, + "step": 27324 + }, + { + "epoch": 2.549687412522161, + "grad_norm": NaN, + "learning_rate": 0.00019343547523767557, + "loss": 0.0, + "step": 27325 + }, + { + "epoch": 2.5497807222170383, + "grad_norm": NaN, + "learning_rate": 0.00019342823545407325, + "loss": 0.0, + "step": 27326 + }, + { + "epoch": 2.5498740319119158, + "grad_norm": NaN, + "learning_rate": 0.0001934209955600458, + "loss": 0.0, + "step": 27327 + }, + { + "epoch": 2.549967341606793, + "grad_norm": NaN, + "learning_rate": 0.00019341375555561162, + "loss": 0.0, + "step": 27328 + }, + { + "epoch": 2.55006065130167, + "grad_norm": NaN, + "learning_rate": 0.0001934065154407891, + "loss": 0.0, + "step": 27329 + }, + { + "epoch": 2.5501539609965476, + "grad_norm": NaN, + "learning_rate": 0.00019339927521559667, + "loss": 0.0, + "step": 27330 + }, + { + "epoch": 2.550247270691425, + "grad_norm": NaN, + "learning_rate": 0.00019339203488005272, + "loss": 0.0, + "step": 27331 + }, + { + "epoch": 2.550340580386302, + "grad_norm": NaN, + "learning_rate": 0.00019338479443417567, + "loss": 0.0, + "step": 27332 + }, + { + "epoch": 2.5504338900811794, + "grad_norm": NaN, + "learning_rate": 0.00019337755387798403, + "loss": 0.0, + "step": 27333 + }, + { + "epoch": 2.550527199776057, + "grad_norm": NaN, + "learning_rate": 0.00019337031321149605, + "loss": 0.0, + "step": 27334 + }, + { + "epoch": 2.550620509470934, + "grad_norm": NaN, + "learning_rate": 0.00019336307243473017, + "loss": 0.0, + "step": 27335 + }, + { + "epoch": 2.550713819165811, + "grad_norm": NaN, + "learning_rate": 0.0001933558315477049, + "loss": 0.0, + "step": 27336 + }, + { + "epoch": 2.5508071288606886, + "grad_norm": NaN, + "learning_rate": 0.0001933485905504386, + "loss": 0.0, + "step": 27337 + }, + { + "epoch": 2.550900438555566, + "grad_norm": NaN, + "learning_rate": 0.00019334134944294965, + "loss": 0.0, + "step": 27338 + }, + { + "epoch": 2.550993748250443, + "grad_norm": NaN, + "learning_rate": 0.00019333410822525648, + "loss": 0.0, + "step": 27339 + }, + { + "epoch": 2.5510870579453204, + "grad_norm": NaN, + "learning_rate": 0.0001933268668973776, + "loss": 0.0, + "step": 27340 + }, + { + "epoch": 2.551180367640198, + "grad_norm": NaN, + "learning_rate": 0.0001933196254593312, + "loss": 0.0, + "step": 27341 + }, + { + "epoch": 2.5512736773350753, + "grad_norm": NaN, + "learning_rate": 0.0001933123839111359, + "loss": 0.0, + "step": 27342 + }, + { + "epoch": 2.5513669870299527, + "grad_norm": NaN, + "learning_rate": 0.00019330514225281004, + "loss": 0.0, + "step": 27343 + }, + { + "epoch": 2.5514602967248297, + "grad_norm": NaN, + "learning_rate": 0.00019329790048437198, + "loss": 0.0, + "step": 27344 + }, + { + "epoch": 2.551553606419707, + "grad_norm": NaN, + "learning_rate": 0.0001932906586058402, + "loss": 0.0, + "step": 27345 + }, + { + "epoch": 2.551646916114584, + "grad_norm": NaN, + "learning_rate": 0.00019328341661723316, + "loss": 0.0, + "step": 27346 + }, + { + "epoch": 2.5517402258094615, + "grad_norm": NaN, + "learning_rate": 0.00019327617451856912, + "loss": 0.0, + "step": 27347 + }, + { + "epoch": 2.551833535504339, + "grad_norm": NaN, + "learning_rate": 0.00019326893230986664, + "loss": 0.0, + "step": 27348 + }, + { + "epoch": 2.5519268451992163, + "grad_norm": NaN, + "learning_rate": 0.0001932616899911441, + "loss": 0.0, + "step": 27349 + }, + { + "epoch": 2.5520201548940937, + "grad_norm": NaN, + "learning_rate": 0.0001932544475624199, + "loss": 0.0, + "step": 27350 + }, + { + "epoch": 2.5521134645889707, + "grad_norm": NaN, + "learning_rate": 0.0001932472050237124, + "loss": 0.0, + "step": 27351 + }, + { + "epoch": 2.552206774283848, + "grad_norm": NaN, + "learning_rate": 0.00019323996237504014, + "loss": 0.0, + "step": 27352 + }, + { + "epoch": 2.5523000839787255, + "grad_norm": NaN, + "learning_rate": 0.0001932327196164214, + "loss": 0.0, + "step": 27353 + }, + { + "epoch": 2.5523933936736025, + "grad_norm": NaN, + "learning_rate": 0.00019322547674787462, + "loss": 0.0, + "step": 27354 + }, + { + "epoch": 2.55248670336848, + "grad_norm": NaN, + "learning_rate": 0.00019321823376941837, + "loss": 0.0, + "step": 27355 + }, + { + "epoch": 2.5525800130633574, + "grad_norm": NaN, + "learning_rate": 0.0001932109906810709, + "loss": 0.0, + "step": 27356 + }, + { + "epoch": 2.5526733227582348, + "grad_norm": NaN, + "learning_rate": 0.00019320374748285065, + "loss": 0.0, + "step": 27357 + }, + { + "epoch": 2.5527666324531118, + "grad_norm": NaN, + "learning_rate": 0.00019319650417477612, + "loss": 0.0, + "step": 27358 + }, + { + "epoch": 2.552859942147989, + "grad_norm": NaN, + "learning_rate": 0.0001931892607568657, + "loss": 0.0, + "step": 27359 + }, + { + "epoch": 2.5529532518428666, + "grad_norm": NaN, + "learning_rate": 0.00019318201722913768, + "loss": 0.0, + "step": 27360 + }, + { + "epoch": 2.5530465615377436, + "grad_norm": NaN, + "learning_rate": 0.00019317477359161064, + "loss": 0.0, + "step": 27361 + }, + { + "epoch": 2.553139871232621, + "grad_norm": NaN, + "learning_rate": 0.00019316752984430298, + "loss": 0.0, + "step": 27362 + }, + { + "epoch": 2.5532331809274984, + "grad_norm": NaN, + "learning_rate": 0.00019316028598723303, + "loss": 0.0, + "step": 27363 + }, + { + "epoch": 2.553326490622376, + "grad_norm": NaN, + "learning_rate": 0.00019315304202041926, + "loss": 0.0, + "step": 27364 + }, + { + "epoch": 2.553419800317253, + "grad_norm": NaN, + "learning_rate": 0.0001931457979438801, + "loss": 0.0, + "step": 27365 + }, + { + "epoch": 2.55351311001213, + "grad_norm": NaN, + "learning_rate": 0.00019313855375763394, + "loss": 0.0, + "step": 27366 + }, + { + "epoch": 2.5536064197070076, + "grad_norm": NaN, + "learning_rate": 0.00019313130946169924, + "loss": 0.0, + "step": 27367 + }, + { + "epoch": 2.5536997294018846, + "grad_norm": NaN, + "learning_rate": 0.0001931240650560944, + "loss": 0.0, + "step": 27368 + }, + { + "epoch": 2.553793039096762, + "grad_norm": NaN, + "learning_rate": 0.0001931168205408378, + "loss": 0.0, + "step": 27369 + }, + { + "epoch": 2.5538863487916394, + "grad_norm": NaN, + "learning_rate": 0.0001931095759159479, + "loss": 0.0, + "step": 27370 + }, + { + "epoch": 2.553979658486517, + "grad_norm": NaN, + "learning_rate": 0.00019310233118144317, + "loss": 0.0, + "step": 27371 + }, + { + "epoch": 2.5540729681813943, + "grad_norm": NaN, + "learning_rate": 0.00019309508633734193, + "loss": 0.0, + "step": 27372 + }, + { + "epoch": 2.5541662778762713, + "grad_norm": NaN, + "learning_rate": 0.00019308784138366266, + "loss": 0.0, + "step": 27373 + }, + { + "epoch": 2.5542595875711487, + "grad_norm": NaN, + "learning_rate": 0.00019308059632042377, + "loss": 0.0, + "step": 27374 + }, + { + "epoch": 2.554352897266026, + "grad_norm": NaN, + "learning_rate": 0.00019307335114764374, + "loss": 0.0, + "step": 27375 + }, + { + "epoch": 2.554446206960903, + "grad_norm": NaN, + "learning_rate": 0.00019306610586534084, + "loss": 0.0, + "step": 27376 + }, + { + "epoch": 2.5545395166557805, + "grad_norm": NaN, + "learning_rate": 0.00019305886047353365, + "loss": 0.0, + "step": 27377 + }, + { + "epoch": 2.554632826350658, + "grad_norm": NaN, + "learning_rate": 0.00019305161497224058, + "loss": 0.0, + "step": 27378 + }, + { + "epoch": 2.5547261360455353, + "grad_norm": NaN, + "learning_rate": 0.0001930443693614799, + "loss": 0.0, + "step": 27379 + }, + { + "epoch": 2.5548194457404123, + "grad_norm": NaN, + "learning_rate": 0.0001930371236412702, + "loss": 0.0, + "step": 27380 + }, + { + "epoch": 2.5549127554352897, + "grad_norm": NaN, + "learning_rate": 0.0001930298778116299, + "loss": 0.0, + "step": 27381 + }, + { + "epoch": 2.555006065130167, + "grad_norm": NaN, + "learning_rate": 0.00019302263187257726, + "loss": 0.0, + "step": 27382 + }, + { + "epoch": 2.555099374825044, + "grad_norm": NaN, + "learning_rate": 0.00019301538582413083, + "loss": 0.0, + "step": 27383 + }, + { + "epoch": 2.5551926845199215, + "grad_norm": NaN, + "learning_rate": 0.00019300813966630908, + "loss": 0.0, + "step": 27384 + }, + { + "epoch": 2.555285994214799, + "grad_norm": NaN, + "learning_rate": 0.00019300089339913028, + "loss": 0.0, + "step": 27385 + }, + { + "epoch": 2.5553793039096764, + "grad_norm": NaN, + "learning_rate": 0.000192993647022613, + "loss": 0.0, + "step": 27386 + }, + { + "epoch": 2.5554726136045534, + "grad_norm": NaN, + "learning_rate": 0.00019298640053677562, + "loss": 0.0, + "step": 27387 + }, + { + "epoch": 2.5555659232994308, + "grad_norm": NaN, + "learning_rate": 0.00019297915394163653, + "loss": 0.0, + "step": 27388 + }, + { + "epoch": 2.555659232994308, + "grad_norm": NaN, + "learning_rate": 0.0001929719072372142, + "loss": 0.0, + "step": 27389 + }, + { + "epoch": 2.555752542689185, + "grad_norm": NaN, + "learning_rate": 0.00019296466042352704, + "loss": 0.0, + "step": 27390 + }, + { + "epoch": 2.5558458523840626, + "grad_norm": NaN, + "learning_rate": 0.00019295741350059347, + "loss": 0.0, + "step": 27391 + }, + { + "epoch": 2.55593916207894, + "grad_norm": NaN, + "learning_rate": 0.00019295016646843192, + "loss": 0.0, + "step": 27392 + }, + { + "epoch": 2.5560324717738174, + "grad_norm": NaN, + "learning_rate": 0.0001929429193270608, + "loss": 0.0, + "step": 27393 + }, + { + "epoch": 2.556125781468695, + "grad_norm": NaN, + "learning_rate": 0.0001929356720764986, + "loss": 0.0, + "step": 27394 + }, + { + "epoch": 2.556219091163572, + "grad_norm": NaN, + "learning_rate": 0.0001929284247167637, + "loss": 0.0, + "step": 27395 + }, + { + "epoch": 2.5563124008584492, + "grad_norm": NaN, + "learning_rate": 0.0001929211772478745, + "loss": 0.0, + "step": 27396 + }, + { + "epoch": 2.556405710553326, + "grad_norm": NaN, + "learning_rate": 0.0001929139296698495, + "loss": 0.0, + "step": 27397 + }, + { + "epoch": 2.5564990202482036, + "grad_norm": NaN, + "learning_rate": 0.00019290668198270703, + "loss": 0.0, + "step": 27398 + }, + { + "epoch": 2.556592329943081, + "grad_norm": NaN, + "learning_rate": 0.00019289943418646563, + "loss": 0.0, + "step": 27399 + }, + { + "epoch": 2.5566856396379585, + "grad_norm": NaN, + "learning_rate": 0.00019289218628114368, + "loss": 0.0, + "step": 27400 + }, + { + "epoch": 2.556778949332836, + "grad_norm": NaN, + "learning_rate": 0.00019288493826675958, + "loss": 0.0, + "step": 27401 + }, + { + "epoch": 2.556872259027713, + "grad_norm": NaN, + "learning_rate": 0.0001928776901433318, + "loss": 0.0, + "step": 27402 + }, + { + "epoch": 2.5569655687225903, + "grad_norm": NaN, + "learning_rate": 0.00019287044191087875, + "loss": 0.0, + "step": 27403 + }, + { + "epoch": 2.5570588784174677, + "grad_norm": NaN, + "learning_rate": 0.00019286319356941884, + "loss": 0.0, + "step": 27404 + }, + { + "epoch": 2.5571521881123447, + "grad_norm": NaN, + "learning_rate": 0.00019285594511897055, + "loss": 0.0, + "step": 27405 + }, + { + "epoch": 2.557245497807222, + "grad_norm": NaN, + "learning_rate": 0.00019284869655955233, + "loss": 0.0, + "step": 27406 + }, + { + "epoch": 2.5573388075020995, + "grad_norm": NaN, + "learning_rate": 0.00019284144789118247, + "loss": 0.0, + "step": 27407 + }, + { + "epoch": 2.557432117196977, + "grad_norm": NaN, + "learning_rate": 0.00019283419911387954, + "loss": 0.0, + "step": 27408 + }, + { + "epoch": 2.557525426891854, + "grad_norm": NaN, + "learning_rate": 0.00019282695022766196, + "loss": 0.0, + "step": 27409 + }, + { + "epoch": 2.5576187365867313, + "grad_norm": NaN, + "learning_rate": 0.00019281970123254807, + "loss": 0.0, + "step": 27410 + }, + { + "epoch": 2.5577120462816088, + "grad_norm": NaN, + "learning_rate": 0.00019281245212855643, + "loss": 0.0, + "step": 27411 + }, + { + "epoch": 2.5578053559764857, + "grad_norm": NaN, + "learning_rate": 0.0001928052029157054, + "loss": 0.0, + "step": 27412 + }, + { + "epoch": 2.557898665671363, + "grad_norm": NaN, + "learning_rate": 0.0001927979535940134, + "loss": 0.0, + "step": 27413 + }, + { + "epoch": 2.5579919753662406, + "grad_norm": NaN, + "learning_rate": 0.0001927907041634989, + "loss": 0.0, + "step": 27414 + }, + { + "epoch": 2.558085285061118, + "grad_norm": NaN, + "learning_rate": 0.00019278345462418026, + "loss": 0.0, + "step": 27415 + }, + { + "epoch": 2.5581785947559954, + "grad_norm": NaN, + "learning_rate": 0.00019277620497607601, + "loss": 0.0, + "step": 27416 + }, + { + "epoch": 2.5582719044508724, + "grad_norm": NaN, + "learning_rate": 0.00019276895521920455, + "loss": 0.0, + "step": 27417 + }, + { + "epoch": 2.55836521414575, + "grad_norm": NaN, + "learning_rate": 0.00019276170535358428, + "loss": 0.0, + "step": 27418 + }, + { + "epoch": 2.5584585238406268, + "grad_norm": NaN, + "learning_rate": 0.00019275445537923373, + "loss": 0.0, + "step": 27419 + }, + { + "epoch": 2.558551833535504, + "grad_norm": NaN, + "learning_rate": 0.00019274720529617118, + "loss": 0.0, + "step": 27420 + }, + { + "epoch": 2.5586451432303816, + "grad_norm": NaN, + "learning_rate": 0.00019273995510441517, + "loss": 0.0, + "step": 27421 + }, + { + "epoch": 2.558738452925259, + "grad_norm": NaN, + "learning_rate": 0.00019273270480398412, + "loss": 0.0, + "step": 27422 + }, + { + "epoch": 2.5588317626201365, + "grad_norm": NaN, + "learning_rate": 0.00019272545439489645, + "loss": 0.0, + "step": 27423 + }, + { + "epoch": 2.5589250723150134, + "grad_norm": NaN, + "learning_rate": 0.0001927182038771706, + "loss": 0.0, + "step": 27424 + }, + { + "epoch": 2.559018382009891, + "grad_norm": NaN, + "learning_rate": 0.00019271095325082508, + "loss": 0.0, + "step": 27425 + }, + { + "epoch": 2.5591116917047683, + "grad_norm": NaN, + "learning_rate": 0.00019270370251587817, + "loss": 0.0, + "step": 27426 + }, + { + "epoch": 2.5592050013996452, + "grad_norm": NaN, + "learning_rate": 0.00019269645167234843, + "loss": 0.0, + "step": 27427 + }, + { + "epoch": 2.5592983110945227, + "grad_norm": NaN, + "learning_rate": 0.00019268920072025429, + "loss": 0.0, + "step": 27428 + }, + { + "epoch": 2.5593916207894, + "grad_norm": NaN, + "learning_rate": 0.00019268194965961412, + "loss": 0.0, + "step": 27429 + }, + { + "epoch": 2.5594849304842775, + "grad_norm": NaN, + "learning_rate": 0.0001926746984904464, + "loss": 0.0, + "step": 27430 + }, + { + "epoch": 2.5595782401791545, + "grad_norm": NaN, + "learning_rate": 0.00019266744721276957, + "loss": 0.0, + "step": 27431 + }, + { + "epoch": 2.559671549874032, + "grad_norm": NaN, + "learning_rate": 0.00019266019582660207, + "loss": 0.0, + "step": 27432 + }, + { + "epoch": 2.5597648595689093, + "grad_norm": NaN, + "learning_rate": 0.00019265294433196232, + "loss": 0.0, + "step": 27433 + }, + { + "epoch": 2.5598581692637863, + "grad_norm": NaN, + "learning_rate": 0.00019264569272886878, + "loss": 0.0, + "step": 27434 + }, + { + "epoch": 2.5599514789586637, + "grad_norm": NaN, + "learning_rate": 0.00019263844101733986, + "loss": 0.0, + "step": 27435 + }, + { + "epoch": 2.560044788653541, + "grad_norm": NaN, + "learning_rate": 0.00019263118919739402, + "loss": 0.0, + "step": 27436 + }, + { + "epoch": 2.5601380983484185, + "grad_norm": NaN, + "learning_rate": 0.00019262393726904967, + "loss": 0.0, + "step": 27437 + }, + { + "epoch": 2.560231408043296, + "grad_norm": NaN, + "learning_rate": 0.0001926166852323253, + "loss": 0.0, + "step": 27438 + }, + { + "epoch": 2.560324717738173, + "grad_norm": NaN, + "learning_rate": 0.00019260943308723933, + "loss": 0.0, + "step": 27439 + }, + { + "epoch": 2.5604180274330504, + "grad_norm": NaN, + "learning_rate": 0.00019260218083381016, + "loss": 0.0, + "step": 27440 + }, + { + "epoch": 2.5605113371279273, + "grad_norm": NaN, + "learning_rate": 0.00019259492847205635, + "loss": 0.0, + "step": 27441 + }, + { + "epoch": 2.5606046468228048, + "grad_norm": NaN, + "learning_rate": 0.00019258767600199614, + "loss": 0.0, + "step": 27442 + }, + { + "epoch": 2.560697956517682, + "grad_norm": NaN, + "learning_rate": 0.00019258042342364814, + "loss": 0.0, + "step": 27443 + }, + { + "epoch": 2.5607912662125596, + "grad_norm": NaN, + "learning_rate": 0.00019257317073703077, + "loss": 0.0, + "step": 27444 + }, + { + "epoch": 2.560884575907437, + "grad_norm": NaN, + "learning_rate": 0.00019256591794216237, + "loss": 0.0, + "step": 27445 + }, + { + "epoch": 2.560977885602314, + "grad_norm": NaN, + "learning_rate": 0.00019255866503906145, + "loss": 0.0, + "step": 27446 + }, + { + "epoch": 2.5610711952971914, + "grad_norm": NaN, + "learning_rate": 0.00019255141202774648, + "loss": 0.0, + "step": 27447 + }, + { + "epoch": 2.561164504992069, + "grad_norm": NaN, + "learning_rate": 0.00019254415890823589, + "loss": 0.0, + "step": 27448 + }, + { + "epoch": 2.561257814686946, + "grad_norm": NaN, + "learning_rate": 0.0001925369056805481, + "loss": 0.0, + "step": 27449 + }, + { + "epoch": 2.561351124381823, + "grad_norm": NaN, + "learning_rate": 0.00019252965234470154, + "loss": 0.0, + "step": 27450 + }, + { + "epoch": 2.5614444340767006, + "grad_norm": NaN, + "learning_rate": 0.0001925223989007147, + "loss": 0.0, + "step": 27451 + }, + { + "epoch": 2.561537743771578, + "grad_norm": NaN, + "learning_rate": 0.000192515145348606, + "loss": 0.0, + "step": 27452 + }, + { + "epoch": 2.561631053466455, + "grad_norm": NaN, + "learning_rate": 0.00019250789168839382, + "loss": 0.0, + "step": 27453 + }, + { + "epoch": 2.5617243631613325, + "grad_norm": NaN, + "learning_rate": 0.0001925006379200967, + "loss": 0.0, + "step": 27454 + }, + { + "epoch": 2.56181767285621, + "grad_norm": NaN, + "learning_rate": 0.00019249338404373306, + "loss": 0.0, + "step": 27455 + }, + { + "epoch": 2.561910982551087, + "grad_norm": NaN, + "learning_rate": 0.00019248613005932132, + "loss": 0.0, + "step": 27456 + }, + { + "epoch": 2.5620042922459643, + "grad_norm": NaN, + "learning_rate": 0.00019247887596687995, + "loss": 0.0, + "step": 27457 + }, + { + "epoch": 2.5620976019408417, + "grad_norm": NaN, + "learning_rate": 0.00019247162176642737, + "loss": 0.0, + "step": 27458 + }, + { + "epoch": 2.562190911635719, + "grad_norm": NaN, + "learning_rate": 0.00019246436745798203, + "loss": 0.0, + "step": 27459 + }, + { + "epoch": 2.5622842213305965, + "grad_norm": NaN, + "learning_rate": 0.0001924571130415624, + "loss": 0.0, + "step": 27460 + }, + { + "epoch": 2.5623775310254735, + "grad_norm": NaN, + "learning_rate": 0.0001924498585171869, + "loss": 0.0, + "step": 27461 + }, + { + "epoch": 2.562470840720351, + "grad_norm": NaN, + "learning_rate": 0.00019244260388487396, + "loss": 0.0, + "step": 27462 + }, + { + "epoch": 2.562564150415228, + "grad_norm": NaN, + "learning_rate": 0.0001924353491446421, + "loss": 0.0, + "step": 27463 + }, + { + "epoch": 2.5626574601101053, + "grad_norm": NaN, + "learning_rate": 0.00019242809429650967, + "loss": 0.0, + "step": 27464 + }, + { + "epoch": 2.5627507698049827, + "grad_norm": NaN, + "learning_rate": 0.00019242083934049517, + "loss": 0.0, + "step": 27465 + }, + { + "epoch": 2.56284407949986, + "grad_norm": NaN, + "learning_rate": 0.00019241358427661708, + "loss": 0.0, + "step": 27466 + }, + { + "epoch": 2.5629373891947376, + "grad_norm": NaN, + "learning_rate": 0.00019240632910489378, + "loss": 0.0, + "step": 27467 + }, + { + "epoch": 2.5630306988896145, + "grad_norm": NaN, + "learning_rate": 0.00019239907382534373, + "loss": 0.0, + "step": 27468 + }, + { + "epoch": 2.563124008584492, + "grad_norm": NaN, + "learning_rate": 0.00019239181843798544, + "loss": 0.0, + "step": 27469 + }, + { + "epoch": 2.5632173182793694, + "grad_norm": NaN, + "learning_rate": 0.0001923845629428373, + "loss": 0.0, + "step": 27470 + }, + { + "epoch": 2.5633106279742464, + "grad_norm": NaN, + "learning_rate": 0.00019237730733991773, + "loss": 0.0, + "step": 27471 + }, + { + "epoch": 2.5634039376691238, + "grad_norm": NaN, + "learning_rate": 0.00019237005162924526, + "loss": 0.0, + "step": 27472 + }, + { + "epoch": 2.563497247364001, + "grad_norm": NaN, + "learning_rate": 0.0001923627958108383, + "loss": 0.0, + "step": 27473 + }, + { + "epoch": 2.5635905570588786, + "grad_norm": NaN, + "learning_rate": 0.0001923555398847153, + "loss": 0.0, + "step": 27474 + }, + { + "epoch": 2.5636838667537556, + "grad_norm": NaN, + "learning_rate": 0.0001923482838508947, + "loss": 0.0, + "step": 27475 + }, + { + "epoch": 2.563777176448633, + "grad_norm": NaN, + "learning_rate": 0.00019234102770939493, + "loss": 0.0, + "step": 27476 + }, + { + "epoch": 2.5638704861435104, + "grad_norm": NaN, + "learning_rate": 0.0001923337714602345, + "loss": 0.0, + "step": 27477 + }, + { + "epoch": 2.5639637958383874, + "grad_norm": NaN, + "learning_rate": 0.00019232651510343182, + "loss": 0.0, + "step": 27478 + }, + { + "epoch": 2.564057105533265, + "grad_norm": NaN, + "learning_rate": 0.00019231925863900535, + "loss": 0.0, + "step": 27479 + }, + { + "epoch": 2.5641504152281422, + "grad_norm": NaN, + "learning_rate": 0.00019231200206697356, + "loss": 0.0, + "step": 27480 + }, + { + "epoch": 2.5642437249230197, + "grad_norm": NaN, + "learning_rate": 0.00019230474538735486, + "loss": 0.0, + "step": 27481 + }, + { + "epoch": 2.5643370346178966, + "grad_norm": NaN, + "learning_rate": 0.00019229748860016775, + "loss": 0.0, + "step": 27482 + }, + { + "epoch": 2.564430344312774, + "grad_norm": NaN, + "learning_rate": 0.00019229023170543063, + "loss": 0.0, + "step": 27483 + }, + { + "epoch": 2.5645236540076515, + "grad_norm": NaN, + "learning_rate": 0.00019228297470316199, + "loss": 0.0, + "step": 27484 + }, + { + "epoch": 2.5646169637025285, + "grad_norm": NaN, + "learning_rate": 0.00019227571759338028, + "loss": 0.0, + "step": 27485 + }, + { + "epoch": 2.564710273397406, + "grad_norm": NaN, + "learning_rate": 0.0001922684603761039, + "loss": 0.0, + "step": 27486 + }, + { + "epoch": 2.5648035830922833, + "grad_norm": NaN, + "learning_rate": 0.0001922612030513514, + "loss": 0.0, + "step": 27487 + }, + { + "epoch": 2.5648968927871607, + "grad_norm": NaN, + "learning_rate": 0.00019225394561914113, + "loss": 0.0, + "step": 27488 + }, + { + "epoch": 2.564990202482038, + "grad_norm": NaN, + "learning_rate": 0.00019224668807949166, + "loss": 0.0, + "step": 27489 + }, + { + "epoch": 2.565083512176915, + "grad_norm": NaN, + "learning_rate": 0.00019223943043242131, + "loss": 0.0, + "step": 27490 + }, + { + "epoch": 2.5651768218717925, + "grad_norm": NaN, + "learning_rate": 0.00019223217267794863, + "loss": 0.0, + "step": 27491 + }, + { + "epoch": 2.56527013156667, + "grad_norm": NaN, + "learning_rate": 0.00019222491481609203, + "loss": 0.0, + "step": 27492 + }, + { + "epoch": 2.565363441261547, + "grad_norm": NaN, + "learning_rate": 0.00019221765684687, + "loss": 0.0, + "step": 27493 + }, + { + "epoch": 2.5654567509564243, + "grad_norm": NaN, + "learning_rate": 0.00019221039877030096, + "loss": 0.0, + "step": 27494 + }, + { + "epoch": 2.5655500606513018, + "grad_norm": NaN, + "learning_rate": 0.00019220314058640342, + "loss": 0.0, + "step": 27495 + }, + { + "epoch": 2.565643370346179, + "grad_norm": NaN, + "learning_rate": 0.00019219588229519574, + "loss": 0.0, + "step": 27496 + }, + { + "epoch": 2.565736680041056, + "grad_norm": NaN, + "learning_rate": 0.00019218862389669641, + "loss": 0.0, + "step": 27497 + }, + { + "epoch": 2.5658299897359336, + "grad_norm": NaN, + "learning_rate": 0.000192181365390924, + "loss": 0.0, + "step": 27498 + }, + { + "epoch": 2.565923299430811, + "grad_norm": NaN, + "learning_rate": 0.0001921741067778968, + "loss": 0.0, + "step": 27499 + }, + { + "epoch": 2.566016609125688, + "grad_norm": NaN, + "learning_rate": 0.00019216684805763336, + "loss": 0.0, + "step": 27500 + }, + { + "epoch": 2.5661099188205654, + "grad_norm": NaN, + "learning_rate": 0.00019215958923015213, + "loss": 0.0, + "step": 27501 + }, + { + "epoch": 2.566203228515443, + "grad_norm": NaN, + "learning_rate": 0.00019215233029547151, + "loss": 0.0, + "step": 27502 + }, + { + "epoch": 2.56629653821032, + "grad_norm": NaN, + "learning_rate": 0.00019214507125361004, + "loss": 0.0, + "step": 27503 + }, + { + "epoch": 2.566389847905197, + "grad_norm": NaN, + "learning_rate": 0.00019213781210458613, + "loss": 0.0, + "step": 27504 + }, + { + "epoch": 2.5664831576000746, + "grad_norm": NaN, + "learning_rate": 0.00019213055284841824, + "loss": 0.0, + "step": 27505 + }, + { + "epoch": 2.566576467294952, + "grad_norm": NaN, + "learning_rate": 0.00019212329348512484, + "loss": 0.0, + "step": 27506 + }, + { + "epoch": 2.566669776989829, + "grad_norm": NaN, + "learning_rate": 0.00019211603401472437, + "loss": 0.0, + "step": 27507 + }, + { + "epoch": 2.5667630866847064, + "grad_norm": NaN, + "learning_rate": 0.00019210877443723534, + "loss": 0.0, + "step": 27508 + }, + { + "epoch": 2.566856396379584, + "grad_norm": NaN, + "learning_rate": 0.00019210151475267614, + "loss": 0.0, + "step": 27509 + }, + { + "epoch": 2.5669497060744613, + "grad_norm": NaN, + "learning_rate": 0.00019209425496106522, + "loss": 0.0, + "step": 27510 + }, + { + "epoch": 2.5670430157693387, + "grad_norm": NaN, + "learning_rate": 0.00019208699506242114, + "loss": 0.0, + "step": 27511 + }, + { + "epoch": 2.5671363254642157, + "grad_norm": NaN, + "learning_rate": 0.00019207973505676226, + "loss": 0.0, + "step": 27512 + }, + { + "epoch": 2.567229635159093, + "grad_norm": NaN, + "learning_rate": 0.00019207247494410713, + "loss": 0.0, + "step": 27513 + }, + { + "epoch": 2.56732294485397, + "grad_norm": NaN, + "learning_rate": 0.00019206521472447412, + "loss": 0.0, + "step": 27514 + }, + { + "epoch": 2.5674162545488475, + "grad_norm": NaN, + "learning_rate": 0.00019205795439788172, + "loss": 0.0, + "step": 27515 + }, + { + "epoch": 2.567509564243725, + "grad_norm": NaN, + "learning_rate": 0.00019205069396434842, + "loss": 0.0, + "step": 27516 + }, + { + "epoch": 2.5676028739386023, + "grad_norm": NaN, + "learning_rate": 0.00019204343342389267, + "loss": 0.0, + "step": 27517 + }, + { + "epoch": 2.5676961836334797, + "grad_norm": NaN, + "learning_rate": 0.0001920361727765329, + "loss": 0.0, + "step": 27518 + }, + { + "epoch": 2.5677894933283567, + "grad_norm": NaN, + "learning_rate": 0.00019202891202228762, + "loss": 0.0, + "step": 27519 + }, + { + "epoch": 2.567882803023234, + "grad_norm": NaN, + "learning_rate": 0.00019202165116117523, + "loss": 0.0, + "step": 27520 + }, + { + "epoch": 2.5679761127181115, + "grad_norm": NaN, + "learning_rate": 0.00019201439019321425, + "loss": 0.0, + "step": 27521 + }, + { + "epoch": 2.5680694224129885, + "grad_norm": NaN, + "learning_rate": 0.00019200712911842315, + "loss": 0.0, + "step": 27522 + }, + { + "epoch": 2.568162732107866, + "grad_norm": NaN, + "learning_rate": 0.00019199986793682033, + "loss": 0.0, + "step": 27523 + }, + { + "epoch": 2.5682560418027434, + "grad_norm": NaN, + "learning_rate": 0.00019199260664842427, + "loss": 0.0, + "step": 27524 + }, + { + "epoch": 2.568349351497621, + "grad_norm": NaN, + "learning_rate": 0.00019198534525325346, + "loss": 0.0, + "step": 27525 + }, + { + "epoch": 2.5684426611924978, + "grad_norm": NaN, + "learning_rate": 0.0001919780837513264, + "loss": 0.0, + "step": 27526 + }, + { + "epoch": 2.568535970887375, + "grad_norm": NaN, + "learning_rate": 0.00019197082214266148, + "loss": 0.0, + "step": 27527 + }, + { + "epoch": 2.5686292805822526, + "grad_norm": NaN, + "learning_rate": 0.00019196356042727717, + "loss": 0.0, + "step": 27528 + }, + { + "epoch": 2.5687225902771296, + "grad_norm": NaN, + "learning_rate": 0.00019195629860519197, + "loss": 0.0, + "step": 27529 + }, + { + "epoch": 2.568815899972007, + "grad_norm": NaN, + "learning_rate": 0.00019194903667642436, + "loss": 0.0, + "step": 27530 + }, + { + "epoch": 2.5689092096668844, + "grad_norm": NaN, + "learning_rate": 0.0001919417746409927, + "loss": 0.0, + "step": 27531 + }, + { + "epoch": 2.569002519361762, + "grad_norm": NaN, + "learning_rate": 0.00019193451249891562, + "loss": 0.0, + "step": 27532 + }, + { + "epoch": 2.5690958290566392, + "grad_norm": NaN, + "learning_rate": 0.00019192725025021147, + "loss": 0.0, + "step": 27533 + }, + { + "epoch": 2.569189138751516, + "grad_norm": NaN, + "learning_rate": 0.0001919199878948987, + "loss": 0.0, + "step": 27534 + }, + { + "epoch": 2.5692824484463936, + "grad_norm": NaN, + "learning_rate": 0.00019191272543299585, + "loss": 0.0, + "step": 27535 + }, + { + "epoch": 2.5693757581412706, + "grad_norm": NaN, + "learning_rate": 0.00019190546286452137, + "loss": 0.0, + "step": 27536 + }, + { + "epoch": 2.569469067836148, + "grad_norm": NaN, + "learning_rate": 0.0001918982001894937, + "loss": 0.0, + "step": 27537 + }, + { + "epoch": 2.5695623775310255, + "grad_norm": NaN, + "learning_rate": 0.00019189093740793132, + "loss": 0.0, + "step": 27538 + }, + { + "epoch": 2.569655687225903, + "grad_norm": NaN, + "learning_rate": 0.00019188367451985268, + "loss": 0.0, + "step": 27539 + }, + { + "epoch": 2.5697489969207803, + "grad_norm": NaN, + "learning_rate": 0.00019187641152527626, + "loss": 0.0, + "step": 27540 + }, + { + "epoch": 2.5698423066156573, + "grad_norm": NaN, + "learning_rate": 0.00019186914842422053, + "loss": 0.0, + "step": 27541 + }, + { + "epoch": 2.5699356163105347, + "grad_norm": NaN, + "learning_rate": 0.000191861885216704, + "loss": 0.0, + "step": 27542 + }, + { + "epoch": 2.570028926005412, + "grad_norm": NaN, + "learning_rate": 0.00019185462190274502, + "loss": 0.0, + "step": 27543 + }, + { + "epoch": 2.570122235700289, + "grad_norm": NaN, + "learning_rate": 0.0001918473584823622, + "loss": 0.0, + "step": 27544 + }, + { + "epoch": 2.5702155453951665, + "grad_norm": NaN, + "learning_rate": 0.00019184009495557395, + "loss": 0.0, + "step": 27545 + }, + { + "epoch": 2.570308855090044, + "grad_norm": NaN, + "learning_rate": 0.0001918328313223987, + "loss": 0.0, + "step": 27546 + }, + { + "epoch": 2.5704021647849213, + "grad_norm": NaN, + "learning_rate": 0.00019182556758285494, + "loss": 0.0, + "step": 27547 + }, + { + "epoch": 2.5704954744797983, + "grad_norm": NaN, + "learning_rate": 0.00019181830373696119, + "loss": 0.0, + "step": 27548 + }, + { + "epoch": 2.5705887841746757, + "grad_norm": NaN, + "learning_rate": 0.00019181103978473585, + "loss": 0.0, + "step": 27549 + }, + { + "epoch": 2.570682093869553, + "grad_norm": NaN, + "learning_rate": 0.00019180377572619744, + "loss": 0.0, + "step": 27550 + }, + { + "epoch": 2.57077540356443, + "grad_norm": NaN, + "learning_rate": 0.0001917965115613644, + "loss": 0.0, + "step": 27551 + }, + { + "epoch": 2.5708687132593075, + "grad_norm": NaN, + "learning_rate": 0.0001917892472902552, + "loss": 0.0, + "step": 27552 + }, + { + "epoch": 2.570962022954185, + "grad_norm": NaN, + "learning_rate": 0.00019178198291288834, + "loss": 0.0, + "step": 27553 + }, + { + "epoch": 2.5710553326490624, + "grad_norm": NaN, + "learning_rate": 0.00019177471842928226, + "loss": 0.0, + "step": 27554 + }, + { + "epoch": 2.57114864234394, + "grad_norm": NaN, + "learning_rate": 0.0001917674538394555, + "loss": 0.0, + "step": 27555 + }, + { + "epoch": 2.571241952038817, + "grad_norm": NaN, + "learning_rate": 0.0001917601891434264, + "loss": 0.0, + "step": 27556 + }, + { + "epoch": 2.571335261733694, + "grad_norm": NaN, + "learning_rate": 0.00019175292434121357, + "loss": 0.0, + "step": 27557 + }, + { + "epoch": 2.571428571428571, + "grad_norm": NaN, + "learning_rate": 0.00019174565943283538, + "loss": 0.0, + "step": 27558 + }, + { + "epoch": 2.5715218811234486, + "grad_norm": NaN, + "learning_rate": 0.00019173839441831038, + "loss": 0.0, + "step": 27559 + }, + { + "epoch": 2.571615190818326, + "grad_norm": NaN, + "learning_rate": 0.00019173112929765697, + "loss": 0.0, + "step": 27560 + }, + { + "epoch": 2.5717085005132034, + "grad_norm": NaN, + "learning_rate": 0.00019172386407089367, + "loss": 0.0, + "step": 27561 + }, + { + "epoch": 2.571801810208081, + "grad_norm": NaN, + "learning_rate": 0.00019171659873803897, + "loss": 0.0, + "step": 27562 + }, + { + "epoch": 2.571895119902958, + "grad_norm": NaN, + "learning_rate": 0.0001917093332991113, + "loss": 0.0, + "step": 27563 + }, + { + "epoch": 2.5719884295978352, + "grad_norm": NaN, + "learning_rate": 0.00019170206775412913, + "loss": 0.0, + "step": 27564 + }, + { + "epoch": 2.5720817392927127, + "grad_norm": NaN, + "learning_rate": 0.00019169480210311097, + "loss": 0.0, + "step": 27565 + }, + { + "epoch": 2.5721750489875896, + "grad_norm": NaN, + "learning_rate": 0.00019168753634607528, + "loss": 0.0, + "step": 27566 + }, + { + "epoch": 2.572268358682467, + "grad_norm": NaN, + "learning_rate": 0.00019168027048304054, + "loss": 0.0, + "step": 27567 + }, + { + "epoch": 2.5723616683773445, + "grad_norm": NaN, + "learning_rate": 0.00019167300451402523, + "loss": 0.0, + "step": 27568 + }, + { + "epoch": 2.572454978072222, + "grad_norm": NaN, + "learning_rate": 0.0001916657384390478, + "loss": 0.0, + "step": 27569 + }, + { + "epoch": 2.572548287767099, + "grad_norm": NaN, + "learning_rate": 0.00019165847225812672, + "loss": 0.0, + "step": 27570 + }, + { + "epoch": 2.5726415974619763, + "grad_norm": NaN, + "learning_rate": 0.0001916512059712805, + "loss": 0.0, + "step": 27571 + }, + { + "epoch": 2.5727349071568537, + "grad_norm": NaN, + "learning_rate": 0.0001916439395785276, + "loss": 0.0, + "step": 27572 + }, + { + "epoch": 2.5728282168517307, + "grad_norm": NaN, + "learning_rate": 0.00019163667307988652, + "loss": 0.0, + "step": 27573 + }, + { + "epoch": 2.572921526546608, + "grad_norm": NaN, + "learning_rate": 0.00019162940647537567, + "loss": 0.0, + "step": 27574 + }, + { + "epoch": 2.5730148362414855, + "grad_norm": NaN, + "learning_rate": 0.0001916221397650136, + "loss": 0.0, + "step": 27575 + }, + { + "epoch": 2.573108145936363, + "grad_norm": NaN, + "learning_rate": 0.00019161487294881875, + "loss": 0.0, + "step": 27576 + }, + { + "epoch": 2.57320145563124, + "grad_norm": NaN, + "learning_rate": 0.00019160760602680962, + "loss": 0.0, + "step": 27577 + }, + { + "epoch": 2.5732947653261173, + "grad_norm": NaN, + "learning_rate": 0.00019160033899900467, + "loss": 0.0, + "step": 27578 + }, + { + "epoch": 2.5733880750209948, + "grad_norm": NaN, + "learning_rate": 0.00019159307186542235, + "loss": 0.0, + "step": 27579 + }, + { + "epoch": 2.5734813847158717, + "grad_norm": NaN, + "learning_rate": 0.0001915858046260812, + "loss": 0.0, + "step": 27580 + }, + { + "epoch": 2.573574694410749, + "grad_norm": NaN, + "learning_rate": 0.00019157853728099967, + "loss": 0.0, + "step": 27581 + }, + { + "epoch": 2.5736680041056266, + "grad_norm": NaN, + "learning_rate": 0.00019157126983019622, + "loss": 0.0, + "step": 27582 + }, + { + "epoch": 2.573761313800504, + "grad_norm": NaN, + "learning_rate": 0.00019156400227368937, + "loss": 0.0, + "step": 27583 + }, + { + "epoch": 2.5738546234953814, + "grad_norm": NaN, + "learning_rate": 0.00019155673461149753, + "loss": 0.0, + "step": 27584 + }, + { + "epoch": 2.5739479331902584, + "grad_norm": NaN, + "learning_rate": 0.00019154946684363927, + "loss": 0.0, + "step": 27585 + }, + { + "epoch": 2.574041242885136, + "grad_norm": NaN, + "learning_rate": 0.000191542198970133, + "loss": 0.0, + "step": 27586 + }, + { + "epoch": 2.5741345525800132, + "grad_norm": NaN, + "learning_rate": 0.00019153493099099723, + "loss": 0.0, + "step": 27587 + }, + { + "epoch": 2.57422786227489, + "grad_norm": NaN, + "learning_rate": 0.0001915276629062504, + "loss": 0.0, + "step": 27588 + }, + { + "epoch": 2.5743211719697676, + "grad_norm": NaN, + "learning_rate": 0.0001915203947159111, + "loss": 0.0, + "step": 27589 + }, + { + "epoch": 2.574414481664645, + "grad_norm": NaN, + "learning_rate": 0.00019151312641999766, + "loss": 0.0, + "step": 27590 + }, + { + "epoch": 2.5745077913595225, + "grad_norm": NaN, + "learning_rate": 0.00019150585801852867, + "loss": 0.0, + "step": 27591 + }, + { + "epoch": 2.5746011010543994, + "grad_norm": NaN, + "learning_rate": 0.0001914985895115226, + "loss": 0.0, + "step": 27592 + }, + { + "epoch": 2.574694410749277, + "grad_norm": NaN, + "learning_rate": 0.00019149132089899784, + "loss": 0.0, + "step": 27593 + }, + { + "epoch": 2.5747877204441543, + "grad_norm": NaN, + "learning_rate": 0.00019148405218097301, + "loss": 0.0, + "step": 27594 + }, + { + "epoch": 2.5748810301390312, + "grad_norm": NaN, + "learning_rate": 0.00019147678335746648, + "loss": 0.0, + "step": 27595 + }, + { + "epoch": 2.5749743398339087, + "grad_norm": NaN, + "learning_rate": 0.0001914695144284968, + "loss": 0.0, + "step": 27596 + }, + { + "epoch": 2.575067649528786, + "grad_norm": NaN, + "learning_rate": 0.00019146224539408246, + "loss": 0.0, + "step": 27597 + }, + { + "epoch": 2.5751609592236635, + "grad_norm": NaN, + "learning_rate": 0.0001914549762542419, + "loss": 0.0, + "step": 27598 + }, + { + "epoch": 2.5752542689185405, + "grad_norm": NaN, + "learning_rate": 0.00019144770700899356, + "loss": 0.0, + "step": 27599 + }, + { + "epoch": 2.575347578613418, + "grad_norm": NaN, + "learning_rate": 0.000191440437658356, + "loss": 0.0, + "step": 27600 + }, + { + "epoch": 2.5754408883082953, + "grad_norm": NaN, + "learning_rate": 0.0001914331682023477, + "loss": 0.0, + "step": 27601 + }, + { + "epoch": 2.5755341980031723, + "grad_norm": NaN, + "learning_rate": 0.00019142589864098715, + "loss": 0.0, + "step": 27602 + }, + { + "epoch": 2.5756275076980497, + "grad_norm": NaN, + "learning_rate": 0.00019141862897429276, + "loss": 0.0, + "step": 27603 + }, + { + "epoch": 2.575720817392927, + "grad_norm": NaN, + "learning_rate": 0.00019141135920228307, + "loss": 0.0, + "step": 27604 + }, + { + "epoch": 2.5758141270878046, + "grad_norm": NaN, + "learning_rate": 0.0001914040893249766, + "loss": 0.0, + "step": 27605 + }, + { + "epoch": 2.575907436782682, + "grad_norm": NaN, + "learning_rate": 0.00019139681934239177, + "loss": 0.0, + "step": 27606 + }, + { + "epoch": 2.576000746477559, + "grad_norm": NaN, + "learning_rate": 0.0001913895492545471, + "loss": 0.0, + "step": 27607 + }, + { + "epoch": 2.5760940561724364, + "grad_norm": NaN, + "learning_rate": 0.00019138227906146104, + "loss": 0.0, + "step": 27608 + }, + { + "epoch": 2.5761873658673133, + "grad_norm": NaN, + "learning_rate": 0.00019137500876315213, + "loss": 0.0, + "step": 27609 + }, + { + "epoch": 2.5762806755621908, + "grad_norm": NaN, + "learning_rate": 0.00019136773835963882, + "loss": 0.0, + "step": 27610 + }, + { + "epoch": 2.576373985257068, + "grad_norm": NaN, + "learning_rate": 0.00019136046785093961, + "loss": 0.0, + "step": 27611 + }, + { + "epoch": 2.5764672949519456, + "grad_norm": NaN, + "learning_rate": 0.00019135319723707298, + "loss": 0.0, + "step": 27612 + }, + { + "epoch": 2.576560604646823, + "grad_norm": NaN, + "learning_rate": 0.00019134592651805744, + "loss": 0.0, + "step": 27613 + }, + { + "epoch": 2.5766539143417, + "grad_norm": NaN, + "learning_rate": 0.00019133865569391137, + "loss": 0.0, + "step": 27614 + }, + { + "epoch": 2.5767472240365774, + "grad_norm": NaN, + "learning_rate": 0.00019133138476465348, + "loss": 0.0, + "step": 27615 + }, + { + "epoch": 2.576840533731455, + "grad_norm": NaN, + "learning_rate": 0.00019132411373030202, + "loss": 0.0, + "step": 27616 + }, + { + "epoch": 2.576933843426332, + "grad_norm": NaN, + "learning_rate": 0.00019131684259087562, + "loss": 0.0, + "step": 27617 + }, + { + "epoch": 2.5770271531212092, + "grad_norm": NaN, + "learning_rate": 0.00019130957134639274, + "loss": 0.0, + "step": 27618 + }, + { + "epoch": 2.5771204628160866, + "grad_norm": NaN, + "learning_rate": 0.00019130229999687182, + "loss": 0.0, + "step": 27619 + }, + { + "epoch": 2.577213772510964, + "grad_norm": NaN, + "learning_rate": 0.00019129502854233144, + "loss": 0.0, + "step": 27620 + }, + { + "epoch": 2.577307082205841, + "grad_norm": NaN, + "learning_rate": 0.00019128775698278995, + "loss": 0.0, + "step": 27621 + }, + { + "epoch": 2.5774003919007185, + "grad_norm": NaN, + "learning_rate": 0.00019128048531826601, + "loss": 0.0, + "step": 27622 + }, + { + "epoch": 2.577493701595596, + "grad_norm": NaN, + "learning_rate": 0.00019127321354877797, + "loss": 0.0, + "step": 27623 + }, + { + "epoch": 2.577587011290473, + "grad_norm": NaN, + "learning_rate": 0.0001912659416743444, + "loss": 0.0, + "step": 27624 + }, + { + "epoch": 2.5776803209853503, + "grad_norm": NaN, + "learning_rate": 0.00019125866969498374, + "loss": 0.0, + "step": 27625 + }, + { + "epoch": 2.5777736306802277, + "grad_norm": NaN, + "learning_rate": 0.00019125139761071454, + "loss": 0.0, + "step": 27626 + }, + { + "epoch": 2.577866940375105, + "grad_norm": NaN, + "learning_rate": 0.0001912441254215552, + "loss": 0.0, + "step": 27627 + }, + { + "epoch": 2.5779602500699825, + "grad_norm": NaN, + "learning_rate": 0.0001912368531275243, + "loss": 0.0, + "step": 27628 + }, + { + "epoch": 2.5780535597648595, + "grad_norm": NaN, + "learning_rate": 0.00019122958072864031, + "loss": 0.0, + "step": 27629 + }, + { + "epoch": 2.578146869459737, + "grad_norm": NaN, + "learning_rate": 0.00019122230822492167, + "loss": 0.0, + "step": 27630 + }, + { + "epoch": 2.578240179154614, + "grad_norm": NaN, + "learning_rate": 0.000191215035616387, + "loss": 0.0, + "step": 27631 + }, + { + "epoch": 2.5783334888494913, + "grad_norm": NaN, + "learning_rate": 0.00019120776290305463, + "loss": 0.0, + "step": 27632 + }, + { + "epoch": 2.5784267985443687, + "grad_norm": NaN, + "learning_rate": 0.0001912004900849431, + "loss": 0.0, + "step": 27633 + }, + { + "epoch": 2.578520108239246, + "grad_norm": NaN, + "learning_rate": 0.000191193217162071, + "loss": 0.0, + "step": 27634 + }, + { + "epoch": 2.5786134179341236, + "grad_norm": NaN, + "learning_rate": 0.0001911859441344567, + "loss": 0.0, + "step": 27635 + }, + { + "epoch": 2.5787067276290006, + "grad_norm": NaN, + "learning_rate": 0.00019117867100211874, + "loss": 0.0, + "step": 27636 + }, + { + "epoch": 2.578800037323878, + "grad_norm": NaN, + "learning_rate": 0.00019117139776507567, + "loss": 0.0, + "step": 27637 + }, + { + "epoch": 2.5788933470187554, + "grad_norm": NaN, + "learning_rate": 0.0001911641244233459, + "loss": 0.0, + "step": 27638 + }, + { + "epoch": 2.5789866567136324, + "grad_norm": NaN, + "learning_rate": 0.00019115685097694792, + "loss": 0.0, + "step": 27639 + }, + { + "epoch": 2.57907996640851, + "grad_norm": NaN, + "learning_rate": 0.00019114957742590033, + "loss": 0.0, + "step": 27640 + }, + { + "epoch": 2.579173276103387, + "grad_norm": NaN, + "learning_rate": 0.0001911423037702215, + "loss": 0.0, + "step": 27641 + }, + { + "epoch": 2.5792665857982646, + "grad_norm": NaN, + "learning_rate": 0.00019113503000993003, + "loss": 0.0, + "step": 27642 + }, + { + "epoch": 2.5793598954931416, + "grad_norm": NaN, + "learning_rate": 0.00019112775614504434, + "loss": 0.0, + "step": 27643 + }, + { + "epoch": 2.579453205188019, + "grad_norm": NaN, + "learning_rate": 0.0001911204821755829, + "loss": 0.0, + "step": 27644 + }, + { + "epoch": 2.5795465148828964, + "grad_norm": NaN, + "learning_rate": 0.00019111320810156433, + "loss": 0.0, + "step": 27645 + }, + { + "epoch": 2.5796398245777734, + "grad_norm": NaN, + "learning_rate": 0.00019110593392300702, + "loss": 0.0, + "step": 27646 + }, + { + "epoch": 2.579733134272651, + "grad_norm": NaN, + "learning_rate": 0.0001910986596399295, + "loss": 0.0, + "step": 27647 + }, + { + "epoch": 2.5798264439675282, + "grad_norm": NaN, + "learning_rate": 0.0001910913852523502, + "loss": 0.0, + "step": 27648 + }, + { + "epoch": 2.5799197536624057, + "grad_norm": NaN, + "learning_rate": 0.00019108411076028774, + "loss": 0.0, + "step": 27649 + }, + { + "epoch": 2.580013063357283, + "grad_norm": NaN, + "learning_rate": 0.00019107683616376058, + "loss": 0.0, + "step": 27650 + }, + { + "epoch": 2.58010637305216, + "grad_norm": NaN, + "learning_rate": 0.0001910695614627872, + "loss": 0.0, + "step": 27651 + }, + { + "epoch": 2.5801996827470375, + "grad_norm": NaN, + "learning_rate": 0.000191062286657386, + "loss": 0.0, + "step": 27652 + }, + { + "epoch": 2.5802929924419145, + "grad_norm": NaN, + "learning_rate": 0.00019105501174757563, + "loss": 0.0, + "step": 27653 + }, + { + "epoch": 2.580386302136792, + "grad_norm": NaN, + "learning_rate": 0.00019104773673337455, + "loss": 0.0, + "step": 27654 + }, + { + "epoch": 2.5804796118316693, + "grad_norm": NaN, + "learning_rate": 0.00019104046161480117, + "loss": 0.0, + "step": 27655 + }, + { + "epoch": 2.5805729215265467, + "grad_norm": NaN, + "learning_rate": 0.00019103318639187412, + "loss": 0.0, + "step": 27656 + }, + { + "epoch": 2.580666231221424, + "grad_norm": NaN, + "learning_rate": 0.00019102591106461177, + "loss": 0.0, + "step": 27657 + }, + { + "epoch": 2.580759540916301, + "grad_norm": NaN, + "learning_rate": 0.0001910186356330327, + "loss": 0.0, + "step": 27658 + }, + { + "epoch": 2.5808528506111785, + "grad_norm": NaN, + "learning_rate": 0.00019101136009715543, + "loss": 0.0, + "step": 27659 + }, + { + "epoch": 2.580946160306056, + "grad_norm": NaN, + "learning_rate": 0.00019100408445699838, + "loss": 0.0, + "step": 27660 + }, + { + "epoch": 2.581039470000933, + "grad_norm": NaN, + "learning_rate": 0.00019099680871258008, + "loss": 0.0, + "step": 27661 + }, + { + "epoch": 2.5811327796958103, + "grad_norm": NaN, + "learning_rate": 0.0001909895328639191, + "loss": 0.0, + "step": 27662 + }, + { + "epoch": 2.5812260893906878, + "grad_norm": NaN, + "learning_rate": 0.00019098225691103384, + "loss": 0.0, + "step": 27663 + }, + { + "epoch": 2.581319399085565, + "grad_norm": NaN, + "learning_rate": 0.00019097498085394283, + "loss": 0.0, + "step": 27664 + }, + { + "epoch": 2.581412708780442, + "grad_norm": NaN, + "learning_rate": 0.0001909677046926646, + "loss": 0.0, + "step": 27665 + }, + { + "epoch": 2.5815060184753196, + "grad_norm": NaN, + "learning_rate": 0.00019096042842721764, + "loss": 0.0, + "step": 27666 + }, + { + "epoch": 2.581599328170197, + "grad_norm": NaN, + "learning_rate": 0.00019095315205762043, + "loss": 0.0, + "step": 27667 + }, + { + "epoch": 2.581692637865074, + "grad_norm": NaN, + "learning_rate": 0.00019094587558389142, + "loss": 0.0, + "step": 27668 + }, + { + "epoch": 2.5817859475599514, + "grad_norm": NaN, + "learning_rate": 0.00019093859900604928, + "loss": 0.0, + "step": 27669 + }, + { + "epoch": 2.581879257254829, + "grad_norm": NaN, + "learning_rate": 0.00019093132232411237, + "loss": 0.0, + "step": 27670 + }, + { + "epoch": 2.5819725669497062, + "grad_norm": NaN, + "learning_rate": 0.0001909240455380992, + "loss": 0.0, + "step": 27671 + }, + { + "epoch": 2.5820658766445836, + "grad_norm": NaN, + "learning_rate": 0.00019091676864802838, + "loss": 0.0, + "step": 27672 + }, + { + "epoch": 2.5821591863394606, + "grad_norm": NaN, + "learning_rate": 0.0001909094916539183, + "loss": 0.0, + "step": 27673 + }, + { + "epoch": 2.582252496034338, + "grad_norm": NaN, + "learning_rate": 0.00019090221455578745, + "loss": 0.0, + "step": 27674 + }, + { + "epoch": 2.582345805729215, + "grad_norm": NaN, + "learning_rate": 0.00019089493735365446, + "loss": 0.0, + "step": 27675 + }, + { + "epoch": 2.5824391154240924, + "grad_norm": NaN, + "learning_rate": 0.00019088766004753775, + "loss": 0.0, + "step": 27676 + }, + { + "epoch": 2.58253242511897, + "grad_norm": NaN, + "learning_rate": 0.00019088038263745575, + "loss": 0.0, + "step": 27677 + }, + { + "epoch": 2.5826257348138473, + "grad_norm": NaN, + "learning_rate": 0.00019087310512342715, + "loss": 0.0, + "step": 27678 + }, + { + "epoch": 2.5827190445087247, + "grad_norm": NaN, + "learning_rate": 0.00019086582750547034, + "loss": 0.0, + "step": 27679 + }, + { + "epoch": 2.5828123542036017, + "grad_norm": NaN, + "learning_rate": 0.00019085854978360378, + "loss": 0.0, + "step": 27680 + }, + { + "epoch": 2.582905663898479, + "grad_norm": NaN, + "learning_rate": 0.00019085127195784607, + "loss": 0.0, + "step": 27681 + }, + { + "epoch": 2.5829989735933565, + "grad_norm": NaN, + "learning_rate": 0.00019084399402821567, + "loss": 0.0, + "step": 27682 + }, + { + "epoch": 2.5830922832882335, + "grad_norm": NaN, + "learning_rate": 0.00019083671599473103, + "loss": 0.0, + "step": 27683 + }, + { + "epoch": 2.583185592983111, + "grad_norm": NaN, + "learning_rate": 0.0001908294378574108, + "loss": 0.0, + "step": 27684 + }, + { + "epoch": 2.5832789026779883, + "grad_norm": NaN, + "learning_rate": 0.0001908221596162734, + "loss": 0.0, + "step": 27685 + }, + { + "epoch": 2.5833722123728657, + "grad_norm": NaN, + "learning_rate": 0.00019081488127133727, + "loss": 0.0, + "step": 27686 + }, + { + "epoch": 2.5834655220677427, + "grad_norm": NaN, + "learning_rate": 0.00019080760282262106, + "loss": 0.0, + "step": 27687 + }, + { + "epoch": 2.58355883176262, + "grad_norm": NaN, + "learning_rate": 0.0001908003242701432, + "loss": 0.0, + "step": 27688 + }, + { + "epoch": 2.5836521414574976, + "grad_norm": NaN, + "learning_rate": 0.00019079304561392217, + "loss": 0.0, + "step": 27689 + }, + { + "epoch": 2.5837454511523745, + "grad_norm": NaN, + "learning_rate": 0.00019078576685397652, + "loss": 0.0, + "step": 27690 + }, + { + "epoch": 2.583838760847252, + "grad_norm": NaN, + "learning_rate": 0.00019077848799032475, + "loss": 0.0, + "step": 27691 + }, + { + "epoch": 2.5839320705421294, + "grad_norm": NaN, + "learning_rate": 0.00019077120902298534, + "loss": 0.0, + "step": 27692 + }, + { + "epoch": 2.584025380237007, + "grad_norm": NaN, + "learning_rate": 0.00019076392995197682, + "loss": 0.0, + "step": 27693 + }, + { + "epoch": 2.5841186899318838, + "grad_norm": NaN, + "learning_rate": 0.00019075665077731776, + "loss": 0.0, + "step": 27694 + }, + { + "epoch": 2.584211999626761, + "grad_norm": NaN, + "learning_rate": 0.0001907493714990266, + "loss": 0.0, + "step": 27695 + }, + { + "epoch": 2.5843053093216386, + "grad_norm": NaN, + "learning_rate": 0.0001907420921171218, + "loss": 0.0, + "step": 27696 + }, + { + "epoch": 2.5843986190165156, + "grad_norm": NaN, + "learning_rate": 0.00019073481263162198, + "loss": 0.0, + "step": 27697 + }, + { + "epoch": 2.584491928711393, + "grad_norm": NaN, + "learning_rate": 0.0001907275330425456, + "loss": 0.0, + "step": 27698 + }, + { + "epoch": 2.5845852384062704, + "grad_norm": NaN, + "learning_rate": 0.0001907202533499111, + "loss": 0.0, + "step": 27699 + }, + { + "epoch": 2.584678548101148, + "grad_norm": NaN, + "learning_rate": 0.00019071297355373714, + "loss": 0.0, + "step": 27700 + }, + { + "epoch": 2.5847718577960253, + "grad_norm": NaN, + "learning_rate": 0.00019070569365404214, + "loss": 0.0, + "step": 27701 + }, + { + "epoch": 2.5848651674909022, + "grad_norm": NaN, + "learning_rate": 0.00019069841365084453, + "loss": 0.0, + "step": 27702 + }, + { + "epoch": 2.5849584771857796, + "grad_norm": NaN, + "learning_rate": 0.00019069113354416303, + "loss": 0.0, + "step": 27703 + }, + { + "epoch": 2.585051786880657, + "grad_norm": NaN, + "learning_rate": 0.00019068385333401597, + "loss": 0.0, + "step": 27704 + }, + { + "epoch": 2.585145096575534, + "grad_norm": NaN, + "learning_rate": 0.00019067657302042189, + "loss": 0.0, + "step": 27705 + }, + { + "epoch": 2.5852384062704115, + "grad_norm": NaN, + "learning_rate": 0.00019066929260339938, + "loss": 0.0, + "step": 27706 + }, + { + "epoch": 2.585331715965289, + "grad_norm": NaN, + "learning_rate": 0.0001906620120829669, + "loss": 0.0, + "step": 27707 + }, + { + "epoch": 2.5854250256601663, + "grad_norm": NaN, + "learning_rate": 0.00019065473145914294, + "loss": 0.0, + "step": 27708 + }, + { + "epoch": 2.5855183353550433, + "grad_norm": NaN, + "learning_rate": 0.00019064745073194608, + "loss": 0.0, + "step": 27709 + }, + { + "epoch": 2.5856116450499207, + "grad_norm": NaN, + "learning_rate": 0.0001906401699013948, + "loss": 0.0, + "step": 27710 + }, + { + "epoch": 2.585704954744798, + "grad_norm": NaN, + "learning_rate": 0.00019063288896750758, + "loss": 0.0, + "step": 27711 + }, + { + "epoch": 2.585798264439675, + "grad_norm": NaN, + "learning_rate": 0.00019062560793030293, + "loss": 0.0, + "step": 27712 + }, + { + "epoch": 2.5858915741345525, + "grad_norm": NaN, + "learning_rate": 0.00019061832678979947, + "loss": 0.0, + "step": 27713 + }, + { + "epoch": 2.58598488382943, + "grad_norm": NaN, + "learning_rate": 0.00019061104554601562, + "loss": 0.0, + "step": 27714 + }, + { + "epoch": 2.5860781935243073, + "grad_norm": NaN, + "learning_rate": 0.00019060376419896983, + "loss": 0.0, + "step": 27715 + }, + { + "epoch": 2.5861715032191843, + "grad_norm": NaN, + "learning_rate": 0.00019059648274868078, + "loss": 0.0, + "step": 27716 + }, + { + "epoch": 2.5862648129140617, + "grad_norm": NaN, + "learning_rate": 0.0001905892011951669, + "loss": 0.0, + "step": 27717 + }, + { + "epoch": 2.586358122608939, + "grad_norm": NaN, + "learning_rate": 0.00019058191953844663, + "loss": 0.0, + "step": 27718 + }, + { + "epoch": 2.586451432303816, + "grad_norm": NaN, + "learning_rate": 0.00019057463777853866, + "loss": 0.0, + "step": 27719 + }, + { + "epoch": 2.5865447419986936, + "grad_norm": NaN, + "learning_rate": 0.00019056735591546132, + "loss": 0.0, + "step": 27720 + }, + { + "epoch": 2.586638051693571, + "grad_norm": NaN, + "learning_rate": 0.0001905600739492332, + "loss": 0.0, + "step": 27721 + }, + { + "epoch": 2.5867313613884484, + "grad_norm": NaN, + "learning_rate": 0.00019055279187987293, + "loss": 0.0, + "step": 27722 + }, + { + "epoch": 2.586824671083326, + "grad_norm": NaN, + "learning_rate": 0.00019054550970739885, + "loss": 0.0, + "step": 27723 + }, + { + "epoch": 2.586917980778203, + "grad_norm": NaN, + "learning_rate": 0.00019053822743182952, + "loss": 0.0, + "step": 27724 + }, + { + "epoch": 2.58701129047308, + "grad_norm": NaN, + "learning_rate": 0.00019053094505318353, + "loss": 0.0, + "step": 27725 + }, + { + "epoch": 2.587104600167957, + "grad_norm": NaN, + "learning_rate": 0.00019052366257147938, + "loss": 0.0, + "step": 27726 + }, + { + "epoch": 2.5871979098628346, + "grad_norm": NaN, + "learning_rate": 0.0001905163799867355, + "loss": 0.0, + "step": 27727 + }, + { + "epoch": 2.587291219557712, + "grad_norm": NaN, + "learning_rate": 0.00019050909729897053, + "loss": 0.0, + "step": 27728 + }, + { + "epoch": 2.5873845292525894, + "grad_norm": NaN, + "learning_rate": 0.00019050181450820292, + "loss": 0.0, + "step": 27729 + }, + { + "epoch": 2.587477838947467, + "grad_norm": NaN, + "learning_rate": 0.00019049453161445112, + "loss": 0.0, + "step": 27730 + }, + { + "epoch": 2.587571148642344, + "grad_norm": NaN, + "learning_rate": 0.00019048724861773378, + "loss": 0.0, + "step": 27731 + }, + { + "epoch": 2.5876644583372213, + "grad_norm": NaN, + "learning_rate": 0.00019047996551806935, + "loss": 0.0, + "step": 27732 + }, + { + "epoch": 2.5877577680320987, + "grad_norm": NaN, + "learning_rate": 0.00019047268231547638, + "loss": 0.0, + "step": 27733 + }, + { + "epoch": 2.5878510777269756, + "grad_norm": NaN, + "learning_rate": 0.0001904653990099733, + "loss": 0.0, + "step": 27734 + }, + { + "epoch": 2.587944387421853, + "grad_norm": NaN, + "learning_rate": 0.00019045811560157878, + "loss": 0.0, + "step": 27735 + }, + { + "epoch": 2.5880376971167305, + "grad_norm": NaN, + "learning_rate": 0.0001904508320903112, + "loss": 0.0, + "step": 27736 + }, + { + "epoch": 2.588131006811608, + "grad_norm": NaN, + "learning_rate": 0.00019044354847618915, + "loss": 0.0, + "step": 27737 + }, + { + "epoch": 2.588224316506485, + "grad_norm": NaN, + "learning_rate": 0.00019043626475923119, + "loss": 0.0, + "step": 27738 + }, + { + "epoch": 2.5883176262013623, + "grad_norm": NaN, + "learning_rate": 0.00019042898093945572, + "loss": 0.0, + "step": 27739 + }, + { + "epoch": 2.5884109358962397, + "grad_norm": NaN, + "learning_rate": 0.0001904216970168813, + "loss": 0.0, + "step": 27740 + }, + { + "epoch": 2.5885042455911167, + "grad_norm": NaN, + "learning_rate": 0.00019041441299152655, + "loss": 0.0, + "step": 27741 + }, + { + "epoch": 2.588597555285994, + "grad_norm": NaN, + "learning_rate": 0.00019040712886340993, + "loss": 0.0, + "step": 27742 + }, + { + "epoch": 2.5886908649808715, + "grad_norm": NaN, + "learning_rate": 0.00019039984463254986, + "loss": 0.0, + "step": 27743 + }, + { + "epoch": 2.588784174675749, + "grad_norm": NaN, + "learning_rate": 0.000190392560298965, + "loss": 0.0, + "step": 27744 + }, + { + "epoch": 2.5888774843706264, + "grad_norm": NaN, + "learning_rate": 0.00019038527586267388, + "loss": 0.0, + "step": 27745 + }, + { + "epoch": 2.5889707940655033, + "grad_norm": NaN, + "learning_rate": 0.00019037799132369485, + "loss": 0.0, + "step": 27746 + }, + { + "epoch": 2.5890641037603808, + "grad_norm": NaN, + "learning_rate": 0.0001903707066820466, + "loss": 0.0, + "step": 27747 + }, + { + "epoch": 2.5891574134552577, + "grad_norm": NaN, + "learning_rate": 0.0001903634219377477, + "loss": 0.0, + "step": 27748 + }, + { + "epoch": 2.589250723150135, + "grad_norm": NaN, + "learning_rate": 0.00019035613709081643, + "loss": 0.0, + "step": 27749 + }, + { + "epoch": 2.5893440328450126, + "grad_norm": NaN, + "learning_rate": 0.00019034885214127154, + "loss": 0.0, + "step": 27750 + }, + { + "epoch": 2.58943734253989, + "grad_norm": NaN, + "learning_rate": 0.00019034156708913147, + "loss": 0.0, + "step": 27751 + }, + { + "epoch": 2.5895306522347674, + "grad_norm": NaN, + "learning_rate": 0.0001903342819344147, + "loss": 0.0, + "step": 27752 + }, + { + "epoch": 2.5896239619296444, + "grad_norm": NaN, + "learning_rate": 0.00019032699667713982, + "loss": 0.0, + "step": 27753 + }, + { + "epoch": 2.589717271624522, + "grad_norm": NaN, + "learning_rate": 0.00019031971131732537, + "loss": 0.0, + "step": 27754 + }, + { + "epoch": 2.5898105813193992, + "grad_norm": NaN, + "learning_rate": 0.00019031242585498982, + "loss": 0.0, + "step": 27755 + }, + { + "epoch": 2.589903891014276, + "grad_norm": NaN, + "learning_rate": 0.00019030514029015166, + "loss": 0.0, + "step": 27756 + }, + { + "epoch": 2.5899972007091536, + "grad_norm": NaN, + "learning_rate": 0.00019029785462282953, + "loss": 0.0, + "step": 27757 + }, + { + "epoch": 2.590090510404031, + "grad_norm": NaN, + "learning_rate": 0.00019029056885304187, + "loss": 0.0, + "step": 27758 + }, + { + "epoch": 2.5901838200989085, + "grad_norm": NaN, + "learning_rate": 0.00019028328298080718, + "loss": 0.0, + "step": 27759 + }, + { + "epoch": 2.5902771297937854, + "grad_norm": NaN, + "learning_rate": 0.00019027599700614412, + "loss": 0.0, + "step": 27760 + }, + { + "epoch": 2.590370439488663, + "grad_norm": NaN, + "learning_rate": 0.00019026871092907112, + "loss": 0.0, + "step": 27761 + }, + { + "epoch": 2.5904637491835403, + "grad_norm": NaN, + "learning_rate": 0.00019026142474960663, + "loss": 0.0, + "step": 27762 + }, + { + "epoch": 2.5905570588784173, + "grad_norm": NaN, + "learning_rate": 0.0001902541384677693, + "loss": 0.0, + "step": 27763 + }, + { + "epoch": 2.5906503685732947, + "grad_norm": NaN, + "learning_rate": 0.0001902468520835777, + "loss": 0.0, + "step": 27764 + }, + { + "epoch": 2.590743678268172, + "grad_norm": NaN, + "learning_rate": 0.0001902395655970502, + "loss": 0.0, + "step": 27765 + }, + { + "epoch": 2.5908369879630495, + "grad_norm": NaN, + "learning_rate": 0.0001902322790082054, + "loss": 0.0, + "step": 27766 + }, + { + "epoch": 2.590930297657927, + "grad_norm": NaN, + "learning_rate": 0.0001902249923170619, + "loss": 0.0, + "step": 27767 + }, + { + "epoch": 2.591023607352804, + "grad_norm": NaN, + "learning_rate": 0.00019021770552363809, + "loss": 0.0, + "step": 27768 + }, + { + "epoch": 2.5911169170476813, + "grad_norm": NaN, + "learning_rate": 0.0001902104186279526, + "loss": 0.0, + "step": 27769 + }, + { + "epoch": 2.5912102267425583, + "grad_norm": NaN, + "learning_rate": 0.00019020313163002392, + "loss": 0.0, + "step": 27770 + }, + { + "epoch": 2.5913035364374357, + "grad_norm": NaN, + "learning_rate": 0.00019019584452987056, + "loss": 0.0, + "step": 27771 + }, + { + "epoch": 2.591396846132313, + "grad_norm": NaN, + "learning_rate": 0.00019018855732751112, + "loss": 0.0, + "step": 27772 + }, + { + "epoch": 2.5914901558271906, + "grad_norm": NaN, + "learning_rate": 0.00019018127002296407, + "loss": 0.0, + "step": 27773 + }, + { + "epoch": 2.591583465522068, + "grad_norm": NaN, + "learning_rate": 0.00019017398261624794, + "loss": 0.0, + "step": 27774 + }, + { + "epoch": 2.591676775216945, + "grad_norm": NaN, + "learning_rate": 0.00019016669510738126, + "loss": 0.0, + "step": 27775 + }, + { + "epoch": 2.5917700849118224, + "grad_norm": NaN, + "learning_rate": 0.00019015940749638263, + "loss": 0.0, + "step": 27776 + }, + { + "epoch": 2.5918633946067, + "grad_norm": NaN, + "learning_rate": 0.00019015211978327048, + "loss": 0.0, + "step": 27777 + }, + { + "epoch": 2.5919567043015768, + "grad_norm": NaN, + "learning_rate": 0.00019014483196806337, + "loss": 0.0, + "step": 27778 + }, + { + "epoch": 2.592050013996454, + "grad_norm": NaN, + "learning_rate": 0.00019013754405077988, + "loss": 0.0, + "step": 27779 + }, + { + "epoch": 2.5921433236913316, + "grad_norm": NaN, + "learning_rate": 0.00019013025603143848, + "loss": 0.0, + "step": 27780 + }, + { + "epoch": 2.592236633386209, + "grad_norm": NaN, + "learning_rate": 0.00019012296791005772, + "loss": 0.0, + "step": 27781 + }, + { + "epoch": 2.592329943081086, + "grad_norm": NaN, + "learning_rate": 0.00019011567968665617, + "loss": 0.0, + "step": 27782 + }, + { + "epoch": 2.5924232527759634, + "grad_norm": NaN, + "learning_rate": 0.0001901083913612523, + "loss": 0.0, + "step": 27783 + }, + { + "epoch": 2.592516562470841, + "grad_norm": NaN, + "learning_rate": 0.00019010110293386467, + "loss": 0.0, + "step": 27784 + }, + { + "epoch": 2.592609872165718, + "grad_norm": NaN, + "learning_rate": 0.0001900938144045118, + "loss": 0.0, + "step": 27785 + }, + { + "epoch": 2.5927031818605952, + "grad_norm": NaN, + "learning_rate": 0.0001900865257732123, + "loss": 0.0, + "step": 27786 + }, + { + "epoch": 2.5927964915554726, + "grad_norm": NaN, + "learning_rate": 0.00019007923703998457, + "loss": 0.0, + "step": 27787 + }, + { + "epoch": 2.59288980125035, + "grad_norm": NaN, + "learning_rate": 0.00019007194820484726, + "loss": 0.0, + "step": 27788 + }, + { + "epoch": 2.592983110945227, + "grad_norm": NaN, + "learning_rate": 0.00019006465926781887, + "loss": 0.0, + "step": 27789 + }, + { + "epoch": 2.5930764206401045, + "grad_norm": NaN, + "learning_rate": 0.00019005737022891788, + "loss": 0.0, + "step": 27790 + }, + { + "epoch": 2.593169730334982, + "grad_norm": NaN, + "learning_rate": 0.00019005008108816286, + "loss": 0.0, + "step": 27791 + }, + { + "epoch": 2.593263040029859, + "grad_norm": NaN, + "learning_rate": 0.00019004279184557237, + "loss": 0.0, + "step": 27792 + }, + { + "epoch": 2.5933563497247363, + "grad_norm": NaN, + "learning_rate": 0.00019003550250116488, + "loss": 0.0, + "step": 27793 + }, + { + "epoch": 2.5934496594196137, + "grad_norm": NaN, + "learning_rate": 0.000190028213054959, + "loss": 0.0, + "step": 27794 + }, + { + "epoch": 2.593542969114491, + "grad_norm": NaN, + "learning_rate": 0.00019002092350697326, + "loss": 0.0, + "step": 27795 + }, + { + "epoch": 2.5936362788093685, + "grad_norm": NaN, + "learning_rate": 0.0001900136338572261, + "loss": 0.0, + "step": 27796 + }, + { + "epoch": 2.5937295885042455, + "grad_norm": NaN, + "learning_rate": 0.0001900063441057362, + "loss": 0.0, + "step": 27797 + }, + { + "epoch": 2.593822898199123, + "grad_norm": NaN, + "learning_rate": 0.00018999905425252198, + "loss": 0.0, + "step": 27798 + }, + { + "epoch": 2.5939162078940003, + "grad_norm": NaN, + "learning_rate": 0.00018999176429760202, + "loss": 0.0, + "step": 27799 + }, + { + "epoch": 2.5940095175888773, + "grad_norm": NaN, + "learning_rate": 0.0001899844742409948, + "loss": 0.0, + "step": 27800 + }, + { + "epoch": 2.5941028272837547, + "grad_norm": NaN, + "learning_rate": 0.00018997718408271896, + "loss": 0.0, + "step": 27801 + }, + { + "epoch": 2.594196136978632, + "grad_norm": NaN, + "learning_rate": 0.00018996989382279297, + "loss": 0.0, + "step": 27802 + }, + { + "epoch": 2.5942894466735096, + "grad_norm": NaN, + "learning_rate": 0.00018996260346123536, + "loss": 0.0, + "step": 27803 + }, + { + "epoch": 2.5943827563683866, + "grad_norm": NaN, + "learning_rate": 0.0001899553129980647, + "loss": 0.0, + "step": 27804 + }, + { + "epoch": 2.594476066063264, + "grad_norm": NaN, + "learning_rate": 0.00018994802243329958, + "loss": 0.0, + "step": 27805 + }, + { + "epoch": 2.5945693757581414, + "grad_norm": NaN, + "learning_rate": 0.00018994073176695838, + "loss": 0.0, + "step": 27806 + }, + { + "epoch": 2.5946626854530184, + "grad_norm": NaN, + "learning_rate": 0.00018993344099905977, + "loss": 0.0, + "step": 27807 + }, + { + "epoch": 2.594755995147896, + "grad_norm": NaN, + "learning_rate": 0.00018992615012962226, + "loss": 0.0, + "step": 27808 + }, + { + "epoch": 2.594849304842773, + "grad_norm": NaN, + "learning_rate": 0.00018991885915866432, + "loss": 0.0, + "step": 27809 + }, + { + "epoch": 2.5949426145376506, + "grad_norm": NaN, + "learning_rate": 0.00018991156808620462, + "loss": 0.0, + "step": 27810 + }, + { + "epoch": 2.5950359242325276, + "grad_norm": NaN, + "learning_rate": 0.0001899042769122616, + "loss": 0.0, + "step": 27811 + }, + { + "epoch": 2.595129233927405, + "grad_norm": NaN, + "learning_rate": 0.0001898969856368538, + "loss": 0.0, + "step": 27812 + }, + { + "epoch": 2.5952225436222824, + "grad_norm": NaN, + "learning_rate": 0.0001898896942599998, + "loss": 0.0, + "step": 27813 + }, + { + "epoch": 2.5953158533171594, + "grad_norm": NaN, + "learning_rate": 0.00018988240278171816, + "loss": 0.0, + "step": 27814 + }, + { + "epoch": 2.595409163012037, + "grad_norm": NaN, + "learning_rate": 0.00018987511120202732, + "loss": 0.0, + "step": 27815 + }, + { + "epoch": 2.5955024727069143, + "grad_norm": NaN, + "learning_rate": 0.00018986781952094587, + "loss": 0.0, + "step": 27816 + }, + { + "epoch": 2.5955957824017917, + "grad_norm": NaN, + "learning_rate": 0.00018986052773849247, + "loss": 0.0, + "step": 27817 + }, + { + "epoch": 2.595689092096669, + "grad_norm": NaN, + "learning_rate": 0.00018985323585468547, + "loss": 0.0, + "step": 27818 + }, + { + "epoch": 2.595782401791546, + "grad_norm": NaN, + "learning_rate": 0.0001898459438695435, + "loss": 0.0, + "step": 27819 + }, + { + "epoch": 2.5958757114864235, + "grad_norm": NaN, + "learning_rate": 0.0001898386517830851, + "loss": 0.0, + "step": 27820 + }, + { + "epoch": 2.5959690211813005, + "grad_norm": NaN, + "learning_rate": 0.0001898313595953289, + "loss": 0.0, + "step": 27821 + }, + { + "epoch": 2.596062330876178, + "grad_norm": NaN, + "learning_rate": 0.00018982406730629322, + "loss": 0.0, + "step": 27822 + }, + { + "epoch": 2.5961556405710553, + "grad_norm": NaN, + "learning_rate": 0.0001898167749159968, + "loss": 0.0, + "step": 27823 + }, + { + "epoch": 2.5962489502659327, + "grad_norm": NaN, + "learning_rate": 0.00018980948242445814, + "loss": 0.0, + "step": 27824 + }, + { + "epoch": 2.59634225996081, + "grad_norm": NaN, + "learning_rate": 0.00018980218983169567, + "loss": 0.0, + "step": 27825 + }, + { + "epoch": 2.596435569655687, + "grad_norm": NaN, + "learning_rate": 0.00018979489713772809, + "loss": 0.0, + "step": 27826 + }, + { + "epoch": 2.5965288793505645, + "grad_norm": NaN, + "learning_rate": 0.00018978760434257389, + "loss": 0.0, + "step": 27827 + }, + { + "epoch": 2.596622189045442, + "grad_norm": NaN, + "learning_rate": 0.0001897803114462515, + "loss": 0.0, + "step": 27828 + }, + { + "epoch": 2.596715498740319, + "grad_norm": NaN, + "learning_rate": 0.00018977301844877965, + "loss": 0.0, + "step": 27829 + }, + { + "epoch": 2.5968088084351963, + "grad_norm": NaN, + "learning_rate": 0.0001897657253501768, + "loss": 0.0, + "step": 27830 + }, + { + "epoch": 2.5969021181300738, + "grad_norm": NaN, + "learning_rate": 0.00018975843215046146, + "loss": 0.0, + "step": 27831 + }, + { + "epoch": 2.596995427824951, + "grad_norm": NaN, + "learning_rate": 0.00018975113884965223, + "loss": 0.0, + "step": 27832 + }, + { + "epoch": 2.597088737519828, + "grad_norm": NaN, + "learning_rate": 0.00018974384544776762, + "loss": 0.0, + "step": 27833 + }, + { + "epoch": 2.5971820472147056, + "grad_norm": NaN, + "learning_rate": 0.00018973655194482616, + "loss": 0.0, + "step": 27834 + }, + { + "epoch": 2.597275356909583, + "grad_norm": NaN, + "learning_rate": 0.0001897292583408464, + "loss": 0.0, + "step": 27835 + }, + { + "epoch": 2.59736866660446, + "grad_norm": NaN, + "learning_rate": 0.00018972196463584696, + "loss": 0.0, + "step": 27836 + }, + { + "epoch": 2.5974619762993374, + "grad_norm": NaN, + "learning_rate": 0.0001897146708298463, + "loss": 0.0, + "step": 27837 + }, + { + "epoch": 2.597555285994215, + "grad_norm": NaN, + "learning_rate": 0.00018970737692286302, + "loss": 0.0, + "step": 27838 + }, + { + "epoch": 2.5976485956890922, + "grad_norm": NaN, + "learning_rate": 0.0001897000829149156, + "loss": 0.0, + "step": 27839 + }, + { + "epoch": 2.5977419053839697, + "grad_norm": NaN, + "learning_rate": 0.00018969278880602268, + "loss": 0.0, + "step": 27840 + }, + { + "epoch": 2.5978352150788466, + "grad_norm": NaN, + "learning_rate": 0.00018968549459620274, + "loss": 0.0, + "step": 27841 + }, + { + "epoch": 2.597928524773724, + "grad_norm": NaN, + "learning_rate": 0.00018967820028547433, + "loss": 0.0, + "step": 27842 + }, + { + "epoch": 2.598021834468601, + "grad_norm": NaN, + "learning_rate": 0.000189670905873856, + "loss": 0.0, + "step": 27843 + }, + { + "epoch": 2.5981151441634784, + "grad_norm": NaN, + "learning_rate": 0.00018966361136136633, + "loss": 0.0, + "step": 27844 + }, + { + "epoch": 2.598208453858356, + "grad_norm": NaN, + "learning_rate": 0.00018965631674802388, + "loss": 0.0, + "step": 27845 + }, + { + "epoch": 2.5983017635532333, + "grad_norm": NaN, + "learning_rate": 0.00018964902203384712, + "loss": 0.0, + "step": 27846 + }, + { + "epoch": 2.5983950732481107, + "grad_norm": NaN, + "learning_rate": 0.00018964172721885463, + "loss": 0.0, + "step": 27847 + }, + { + "epoch": 2.5984883829429877, + "grad_norm": NaN, + "learning_rate": 0.00018963443230306498, + "loss": 0.0, + "step": 27848 + }, + { + "epoch": 2.598581692637865, + "grad_norm": NaN, + "learning_rate": 0.00018962713728649672, + "loss": 0.0, + "step": 27849 + }, + { + "epoch": 2.5986750023327425, + "grad_norm": NaN, + "learning_rate": 0.0001896198421691684, + "loss": 0.0, + "step": 27850 + }, + { + "epoch": 2.5987683120276195, + "grad_norm": NaN, + "learning_rate": 0.0001896125469510985, + "loss": 0.0, + "step": 27851 + }, + { + "epoch": 2.598861621722497, + "grad_norm": NaN, + "learning_rate": 0.0001896052516323057, + "loss": 0.0, + "step": 27852 + }, + { + "epoch": 2.5989549314173743, + "grad_norm": NaN, + "learning_rate": 0.00018959795621280843, + "loss": 0.0, + "step": 27853 + }, + { + "epoch": 2.5990482411122517, + "grad_norm": NaN, + "learning_rate": 0.0001895906606926253, + "loss": 0.0, + "step": 27854 + }, + { + "epoch": 2.5991415508071287, + "grad_norm": NaN, + "learning_rate": 0.00018958336507177484, + "loss": 0.0, + "step": 27855 + }, + { + "epoch": 2.599234860502006, + "grad_norm": NaN, + "learning_rate": 0.0001895760693502756, + "loss": 0.0, + "step": 27856 + }, + { + "epoch": 2.5993281701968836, + "grad_norm": NaN, + "learning_rate": 0.00018956877352814617, + "loss": 0.0, + "step": 27857 + }, + { + "epoch": 2.5994214798917605, + "grad_norm": NaN, + "learning_rate": 0.00018956147760540505, + "loss": 0.0, + "step": 27858 + }, + { + "epoch": 2.599514789586638, + "grad_norm": NaN, + "learning_rate": 0.00018955418158207084, + "loss": 0.0, + "step": 27859 + }, + { + "epoch": 2.5996080992815154, + "grad_norm": NaN, + "learning_rate": 0.00018954688545816205, + "loss": 0.0, + "step": 27860 + }, + { + "epoch": 2.599701408976393, + "grad_norm": NaN, + "learning_rate": 0.00018953958923369722, + "loss": 0.0, + "step": 27861 + }, + { + "epoch": 2.59979471867127, + "grad_norm": NaN, + "learning_rate": 0.00018953229290869493, + "loss": 0.0, + "step": 27862 + }, + { + "epoch": 2.599888028366147, + "grad_norm": NaN, + "learning_rate": 0.00018952499648317375, + "loss": 0.0, + "step": 27863 + }, + { + "epoch": 2.5999813380610246, + "grad_norm": NaN, + "learning_rate": 0.0001895176999571522, + "loss": 0.0, + "step": 27864 + }, + { + "epoch": 2.6000746477559016, + "grad_norm": NaN, + "learning_rate": 0.00018951040333064887, + "loss": 0.0, + "step": 27865 + }, + { + "epoch": 2.600167957450779, + "grad_norm": NaN, + "learning_rate": 0.00018950310660368225, + "loss": 0.0, + "step": 27866 + }, + { + "epoch": 2.6002612671456564, + "grad_norm": NaN, + "learning_rate": 0.00018949580977627097, + "loss": 0.0, + "step": 27867 + }, + { + "epoch": 2.600354576840534, + "grad_norm": NaN, + "learning_rate": 0.00018948851284843353, + "loss": 0.0, + "step": 27868 + }, + { + "epoch": 2.6004478865354113, + "grad_norm": NaN, + "learning_rate": 0.0001894812158201885, + "loss": 0.0, + "step": 27869 + }, + { + "epoch": 2.6005411962302882, + "grad_norm": NaN, + "learning_rate": 0.00018947391869155442, + "loss": 0.0, + "step": 27870 + }, + { + "epoch": 2.6006345059251657, + "grad_norm": NaN, + "learning_rate": 0.00018946662146254988, + "loss": 0.0, + "step": 27871 + }, + { + "epoch": 2.600727815620043, + "grad_norm": NaN, + "learning_rate": 0.00018945932413319338, + "loss": 0.0, + "step": 27872 + }, + { + "epoch": 2.60082112531492, + "grad_norm": NaN, + "learning_rate": 0.00018945202670350353, + "loss": 0.0, + "step": 27873 + }, + { + "epoch": 2.6009144350097975, + "grad_norm": NaN, + "learning_rate": 0.00018944472917349885, + "loss": 0.0, + "step": 27874 + }, + { + "epoch": 2.601007744704675, + "grad_norm": NaN, + "learning_rate": 0.00018943743154319791, + "loss": 0.0, + "step": 27875 + }, + { + "epoch": 2.6011010543995523, + "grad_norm": NaN, + "learning_rate": 0.00018943013381261928, + "loss": 0.0, + "step": 27876 + }, + { + "epoch": 2.6011943640944293, + "grad_norm": NaN, + "learning_rate": 0.00018942283598178148, + "loss": 0.0, + "step": 27877 + }, + { + "epoch": 2.6012876737893067, + "grad_norm": NaN, + "learning_rate": 0.0001894155380507031, + "loss": 0.0, + "step": 27878 + }, + { + "epoch": 2.601380983484184, + "grad_norm": NaN, + "learning_rate": 0.00018940824001940267, + "loss": 0.0, + "step": 27879 + }, + { + "epoch": 2.601474293179061, + "grad_norm": NaN, + "learning_rate": 0.00018940094188789878, + "loss": 0.0, + "step": 27880 + }, + { + "epoch": 2.6015676028739385, + "grad_norm": NaN, + "learning_rate": 0.00018939364365620992, + "loss": 0.0, + "step": 27881 + }, + { + "epoch": 2.601660912568816, + "grad_norm": NaN, + "learning_rate": 0.00018938634532435477, + "loss": 0.0, + "step": 27882 + }, + { + "epoch": 2.6017542222636934, + "grad_norm": NaN, + "learning_rate": 0.00018937904689235173, + "loss": 0.0, + "step": 27883 + }, + { + "epoch": 2.6018475319585708, + "grad_norm": NaN, + "learning_rate": 0.00018937174836021952, + "loss": 0.0, + "step": 27884 + }, + { + "epoch": 2.6019408416534477, + "grad_norm": NaN, + "learning_rate": 0.00018936444972797654, + "loss": 0.0, + "step": 27885 + }, + { + "epoch": 2.602034151348325, + "grad_norm": NaN, + "learning_rate": 0.00018935715099564147, + "loss": 0.0, + "step": 27886 + }, + { + "epoch": 2.602127461043202, + "grad_norm": NaN, + "learning_rate": 0.0001893498521632328, + "loss": 0.0, + "step": 27887 + }, + { + "epoch": 2.6022207707380796, + "grad_norm": NaN, + "learning_rate": 0.00018934255323076912, + "loss": 0.0, + "step": 27888 + }, + { + "epoch": 2.602314080432957, + "grad_norm": NaN, + "learning_rate": 0.000189335254198269, + "loss": 0.0, + "step": 27889 + }, + { + "epoch": 2.6024073901278344, + "grad_norm": NaN, + "learning_rate": 0.00018932795506575097, + "loss": 0.0, + "step": 27890 + }, + { + "epoch": 2.602500699822712, + "grad_norm": NaN, + "learning_rate": 0.00018932065583323357, + "loss": 0.0, + "step": 27891 + }, + { + "epoch": 2.602594009517589, + "grad_norm": NaN, + "learning_rate": 0.00018931335650073543, + "loss": 0.0, + "step": 27892 + }, + { + "epoch": 2.602687319212466, + "grad_norm": NaN, + "learning_rate": 0.00018930605706827505, + "loss": 0.0, + "step": 27893 + }, + { + "epoch": 2.6027806289073436, + "grad_norm": NaN, + "learning_rate": 0.00018929875753587102, + "loss": 0.0, + "step": 27894 + }, + { + "epoch": 2.6028739386022206, + "grad_norm": NaN, + "learning_rate": 0.00018929145790354187, + "loss": 0.0, + "step": 27895 + }, + { + "epoch": 2.602967248297098, + "grad_norm": NaN, + "learning_rate": 0.0001892841581713062, + "loss": 0.0, + "step": 27896 + }, + { + "epoch": 2.6030605579919754, + "grad_norm": NaN, + "learning_rate": 0.00018927685833918255, + "loss": 0.0, + "step": 27897 + }, + { + "epoch": 2.603153867686853, + "grad_norm": NaN, + "learning_rate": 0.0001892695584071895, + "loss": 0.0, + "step": 27898 + }, + { + "epoch": 2.60324717738173, + "grad_norm": NaN, + "learning_rate": 0.00018926225837534558, + "loss": 0.0, + "step": 27899 + }, + { + "epoch": 2.6033404870766073, + "grad_norm": NaN, + "learning_rate": 0.00018925495824366936, + "loss": 0.0, + "step": 27900 + }, + { + "epoch": 2.6034337967714847, + "grad_norm": NaN, + "learning_rate": 0.0001892476580121794, + "loss": 0.0, + "step": 27901 + }, + { + "epoch": 2.6035271064663617, + "grad_norm": NaN, + "learning_rate": 0.00018924035768089429, + "loss": 0.0, + "step": 27902 + }, + { + "epoch": 2.603620416161239, + "grad_norm": NaN, + "learning_rate": 0.00018923305724983258, + "loss": 0.0, + "step": 27903 + }, + { + "epoch": 2.6037137258561165, + "grad_norm": NaN, + "learning_rate": 0.00018922575671901284, + "loss": 0.0, + "step": 27904 + }, + { + "epoch": 2.603807035550994, + "grad_norm": NaN, + "learning_rate": 0.00018921845608845357, + "loss": 0.0, + "step": 27905 + }, + { + "epoch": 2.603900345245871, + "grad_norm": NaN, + "learning_rate": 0.00018921115535817343, + "loss": 0.0, + "step": 27906 + }, + { + "epoch": 2.6039936549407483, + "grad_norm": NaN, + "learning_rate": 0.00018920385452819092, + "loss": 0.0, + "step": 27907 + }, + { + "epoch": 2.6040869646356257, + "grad_norm": NaN, + "learning_rate": 0.00018919655359852462, + "loss": 0.0, + "step": 27908 + }, + { + "epoch": 2.6041802743305027, + "grad_norm": NaN, + "learning_rate": 0.0001891892525691931, + "loss": 0.0, + "step": 27909 + }, + { + "epoch": 2.60427358402538, + "grad_norm": NaN, + "learning_rate": 0.0001891819514402149, + "loss": 0.0, + "step": 27910 + }, + { + "epoch": 2.6043668937202575, + "grad_norm": NaN, + "learning_rate": 0.00018917465021160864, + "loss": 0.0, + "step": 27911 + }, + { + "epoch": 2.604460203415135, + "grad_norm": NaN, + "learning_rate": 0.0001891673488833928, + "loss": 0.0, + "step": 27912 + }, + { + "epoch": 2.6045535131100124, + "grad_norm": NaN, + "learning_rate": 0.00018916004745558605, + "loss": 0.0, + "step": 27913 + }, + { + "epoch": 2.6046468228048894, + "grad_norm": NaN, + "learning_rate": 0.0001891527459282069, + "loss": 0.0, + "step": 27914 + }, + { + "epoch": 2.6047401324997668, + "grad_norm": NaN, + "learning_rate": 0.00018914544430127388, + "loss": 0.0, + "step": 27915 + }, + { + "epoch": 2.604833442194644, + "grad_norm": NaN, + "learning_rate": 0.00018913814257480558, + "loss": 0.0, + "step": 27916 + }, + { + "epoch": 2.604926751889521, + "grad_norm": NaN, + "learning_rate": 0.0001891308407488206, + "loss": 0.0, + "step": 27917 + }, + { + "epoch": 2.6050200615843986, + "grad_norm": NaN, + "learning_rate": 0.00018912353882333748, + "loss": 0.0, + "step": 27918 + }, + { + "epoch": 2.605113371279276, + "grad_norm": NaN, + "learning_rate": 0.0001891162367983748, + "loss": 0.0, + "step": 27919 + }, + { + "epoch": 2.6052066809741534, + "grad_norm": NaN, + "learning_rate": 0.0001891089346739511, + "loss": 0.0, + "step": 27920 + }, + { + "epoch": 2.6052999906690304, + "grad_norm": NaN, + "learning_rate": 0.00018910163245008496, + "loss": 0.0, + "step": 27921 + }, + { + "epoch": 2.605393300363908, + "grad_norm": NaN, + "learning_rate": 0.00018909433012679496, + "loss": 0.0, + "step": 27922 + }, + { + "epoch": 2.6054866100587852, + "grad_norm": NaN, + "learning_rate": 0.00018908702770409968, + "loss": 0.0, + "step": 27923 + }, + { + "epoch": 2.605579919753662, + "grad_norm": NaN, + "learning_rate": 0.00018907972518201764, + "loss": 0.0, + "step": 27924 + }, + { + "epoch": 2.6056732294485396, + "grad_norm": NaN, + "learning_rate": 0.00018907242256056744, + "loss": 0.0, + "step": 27925 + }, + { + "epoch": 2.605766539143417, + "grad_norm": NaN, + "learning_rate": 0.00018906511983976762, + "loss": 0.0, + "step": 27926 + }, + { + "epoch": 2.6058598488382945, + "grad_norm": NaN, + "learning_rate": 0.0001890578170196368, + "loss": 0.0, + "step": 27927 + }, + { + "epoch": 2.6059531585331714, + "grad_norm": NaN, + "learning_rate": 0.00018905051410019353, + "loss": 0.0, + "step": 27928 + }, + { + "epoch": 2.606046468228049, + "grad_norm": NaN, + "learning_rate": 0.00018904321108145634, + "loss": 0.0, + "step": 27929 + }, + { + "epoch": 2.6061397779229263, + "grad_norm": NaN, + "learning_rate": 0.00018903590796344387, + "loss": 0.0, + "step": 27930 + }, + { + "epoch": 2.6062330876178033, + "grad_norm": NaN, + "learning_rate": 0.0001890286047461746, + "loss": 0.0, + "step": 27931 + }, + { + "epoch": 2.6063263973126807, + "grad_norm": NaN, + "learning_rate": 0.0001890213014296672, + "loss": 0.0, + "step": 27932 + }, + { + "epoch": 2.606419707007558, + "grad_norm": NaN, + "learning_rate": 0.00018901399801394015, + "loss": 0.0, + "step": 27933 + }, + { + "epoch": 2.6065130167024355, + "grad_norm": NaN, + "learning_rate": 0.0001890066944990121, + "loss": 0.0, + "step": 27934 + }, + { + "epoch": 2.606606326397313, + "grad_norm": NaN, + "learning_rate": 0.00018899939088490157, + "loss": 0.0, + "step": 27935 + }, + { + "epoch": 2.60669963609219, + "grad_norm": NaN, + "learning_rate": 0.00018899208717162712, + "loss": 0.0, + "step": 27936 + }, + { + "epoch": 2.6067929457870673, + "grad_norm": NaN, + "learning_rate": 0.00018898478335920734, + "loss": 0.0, + "step": 27937 + }, + { + "epoch": 2.6068862554819443, + "grad_norm": NaN, + "learning_rate": 0.0001889774794476608, + "loss": 0.0, + "step": 27938 + }, + { + "epoch": 2.6069795651768217, + "grad_norm": NaN, + "learning_rate": 0.0001889701754370061, + "loss": 0.0, + "step": 27939 + }, + { + "epoch": 2.607072874871699, + "grad_norm": NaN, + "learning_rate": 0.0001889628713272618, + "loss": 0.0, + "step": 27940 + }, + { + "epoch": 2.6071661845665766, + "grad_norm": NaN, + "learning_rate": 0.00018895556711844642, + "loss": 0.0, + "step": 27941 + }, + { + "epoch": 2.607259494261454, + "grad_norm": NaN, + "learning_rate": 0.00018894826281057862, + "loss": 0.0, + "step": 27942 + }, + { + "epoch": 2.607352803956331, + "grad_norm": NaN, + "learning_rate": 0.00018894095840367689, + "loss": 0.0, + "step": 27943 + }, + { + "epoch": 2.6074461136512084, + "grad_norm": NaN, + "learning_rate": 0.00018893365389775982, + "loss": 0.0, + "step": 27944 + }, + { + "epoch": 2.607539423346086, + "grad_norm": NaN, + "learning_rate": 0.00018892634929284604, + "loss": 0.0, + "step": 27945 + }, + { + "epoch": 2.6076327330409628, + "grad_norm": NaN, + "learning_rate": 0.0001889190445889541, + "loss": 0.0, + "step": 27946 + }, + { + "epoch": 2.60772604273584, + "grad_norm": NaN, + "learning_rate": 0.0001889117397861025, + "loss": 0.0, + "step": 27947 + }, + { + "epoch": 2.6078193524307176, + "grad_norm": NaN, + "learning_rate": 0.0001889044348843099, + "loss": 0.0, + "step": 27948 + }, + { + "epoch": 2.607912662125595, + "grad_norm": NaN, + "learning_rate": 0.0001888971298835948, + "loss": 0.0, + "step": 27949 + }, + { + "epoch": 2.608005971820472, + "grad_norm": NaN, + "learning_rate": 0.0001888898247839759, + "loss": 0.0, + "step": 27950 + }, + { + "epoch": 2.6080992815153494, + "grad_norm": NaN, + "learning_rate": 0.00018888251958547165, + "loss": 0.0, + "step": 27951 + }, + { + "epoch": 2.608192591210227, + "grad_norm": NaN, + "learning_rate": 0.0001888752142881007, + "loss": 0.0, + "step": 27952 + }, + { + "epoch": 2.608285900905104, + "grad_norm": NaN, + "learning_rate": 0.00018886790889188158, + "loss": 0.0, + "step": 27953 + }, + { + "epoch": 2.6083792105999812, + "grad_norm": NaN, + "learning_rate": 0.00018886060339683286, + "loss": 0.0, + "step": 27954 + }, + { + "epoch": 2.6084725202948587, + "grad_norm": NaN, + "learning_rate": 0.00018885329780297317, + "loss": 0.0, + "step": 27955 + }, + { + "epoch": 2.608565829989736, + "grad_norm": NaN, + "learning_rate": 0.00018884599211032102, + "loss": 0.0, + "step": 27956 + }, + { + "epoch": 2.6086591396846135, + "grad_norm": NaN, + "learning_rate": 0.00018883868631889506, + "loss": 0.0, + "step": 27957 + }, + { + "epoch": 2.6087524493794905, + "grad_norm": NaN, + "learning_rate": 0.0001888313804287138, + "loss": 0.0, + "step": 27958 + }, + { + "epoch": 2.608845759074368, + "grad_norm": NaN, + "learning_rate": 0.00018882407443979586, + "loss": 0.0, + "step": 27959 + }, + { + "epoch": 2.608939068769245, + "grad_norm": NaN, + "learning_rate": 0.00018881676835215977, + "loss": 0.0, + "step": 27960 + }, + { + "epoch": 2.6090323784641223, + "grad_norm": NaN, + "learning_rate": 0.00018880946216582418, + "loss": 0.0, + "step": 27961 + }, + { + "epoch": 2.6091256881589997, + "grad_norm": NaN, + "learning_rate": 0.00018880215588080757, + "loss": 0.0, + "step": 27962 + }, + { + "epoch": 2.609218997853877, + "grad_norm": NaN, + "learning_rate": 0.00018879484949712862, + "loss": 0.0, + "step": 27963 + }, + { + "epoch": 2.6093123075487545, + "grad_norm": NaN, + "learning_rate": 0.00018878754301480583, + "loss": 0.0, + "step": 27964 + }, + { + "epoch": 2.6094056172436315, + "grad_norm": NaN, + "learning_rate": 0.0001887802364338578, + "loss": 0.0, + "step": 27965 + }, + { + "epoch": 2.609498926938509, + "grad_norm": NaN, + "learning_rate": 0.00018877292975430314, + "loss": 0.0, + "step": 27966 + }, + { + "epoch": 2.6095922366333864, + "grad_norm": NaN, + "learning_rate": 0.0001887656229761604, + "loss": 0.0, + "step": 27967 + }, + { + "epoch": 2.6096855463282633, + "grad_norm": NaN, + "learning_rate": 0.00018875831609944818, + "loss": 0.0, + "step": 27968 + }, + { + "epoch": 2.6097788560231407, + "grad_norm": NaN, + "learning_rate": 0.00018875100912418502, + "loss": 0.0, + "step": 27969 + }, + { + "epoch": 2.609872165718018, + "grad_norm": NaN, + "learning_rate": 0.00018874370205038953, + "loss": 0.0, + "step": 27970 + }, + { + "epoch": 2.6099654754128956, + "grad_norm": NaN, + "learning_rate": 0.00018873639487808024, + "loss": 0.0, + "step": 27971 + }, + { + "epoch": 2.6100587851077726, + "grad_norm": NaN, + "learning_rate": 0.00018872908760727585, + "loss": 0.0, + "step": 27972 + }, + { + "epoch": 2.61015209480265, + "grad_norm": NaN, + "learning_rate": 0.0001887217802379948, + "loss": 0.0, + "step": 27973 + }, + { + "epoch": 2.6102454044975274, + "grad_norm": NaN, + "learning_rate": 0.0001887144727702558, + "loss": 0.0, + "step": 27974 + }, + { + "epoch": 2.6103387141924044, + "grad_norm": NaN, + "learning_rate": 0.00018870716520407731, + "loss": 0.0, + "step": 27975 + }, + { + "epoch": 2.610432023887282, + "grad_norm": NaN, + "learning_rate": 0.00018869985753947798, + "loss": 0.0, + "step": 27976 + }, + { + "epoch": 2.610525333582159, + "grad_norm": NaN, + "learning_rate": 0.00018869254977647634, + "loss": 0.0, + "step": 27977 + }, + { + "epoch": 2.6106186432770366, + "grad_norm": NaN, + "learning_rate": 0.00018868524191509105, + "loss": 0.0, + "step": 27978 + }, + { + "epoch": 2.610711952971914, + "grad_norm": NaN, + "learning_rate": 0.00018867793395534063, + "loss": 0.0, + "step": 27979 + }, + { + "epoch": 2.610805262666791, + "grad_norm": NaN, + "learning_rate": 0.0001886706258972437, + "loss": 0.0, + "step": 27980 + }, + { + "epoch": 2.6108985723616684, + "grad_norm": NaN, + "learning_rate": 0.00018866331774081884, + "loss": 0.0, + "step": 27981 + }, + { + "epoch": 2.6109918820565454, + "grad_norm": NaN, + "learning_rate": 0.00018865600948608457, + "loss": 0.0, + "step": 27982 + }, + { + "epoch": 2.611085191751423, + "grad_norm": NaN, + "learning_rate": 0.00018864870113305952, + "loss": 0.0, + "step": 27983 + }, + { + "epoch": 2.6111785014463003, + "grad_norm": NaN, + "learning_rate": 0.0001886413926817623, + "loss": 0.0, + "step": 27984 + }, + { + "epoch": 2.6112718111411777, + "grad_norm": NaN, + "learning_rate": 0.00018863408413221148, + "loss": 0.0, + "step": 27985 + }, + { + "epoch": 2.611365120836055, + "grad_norm": NaN, + "learning_rate": 0.00018862677548442557, + "loss": 0.0, + "step": 27986 + }, + { + "epoch": 2.611458430530932, + "grad_norm": NaN, + "learning_rate": 0.00018861946673842325, + "loss": 0.0, + "step": 27987 + }, + { + "epoch": 2.6115517402258095, + "grad_norm": NaN, + "learning_rate": 0.00018861215789422306, + "loss": 0.0, + "step": 27988 + }, + { + "epoch": 2.611645049920687, + "grad_norm": NaN, + "learning_rate": 0.00018860484895184362, + "loss": 0.0, + "step": 27989 + }, + { + "epoch": 2.611738359615564, + "grad_norm": NaN, + "learning_rate": 0.00018859753991130345, + "loss": 0.0, + "step": 27990 + }, + { + "epoch": 2.6118316693104413, + "grad_norm": NaN, + "learning_rate": 0.00018859023077262118, + "loss": 0.0, + "step": 27991 + }, + { + "epoch": 2.6119249790053187, + "grad_norm": NaN, + "learning_rate": 0.00018858292153581537, + "loss": 0.0, + "step": 27992 + }, + { + "epoch": 2.612018288700196, + "grad_norm": NaN, + "learning_rate": 0.00018857561220090462, + "loss": 0.0, + "step": 27993 + }, + { + "epoch": 2.612111598395073, + "grad_norm": NaN, + "learning_rate": 0.00018856830276790752, + "loss": 0.0, + "step": 27994 + }, + { + "epoch": 2.6122049080899505, + "grad_norm": NaN, + "learning_rate": 0.00018856099323684268, + "loss": 0.0, + "step": 27995 + }, + { + "epoch": 2.612298217784828, + "grad_norm": NaN, + "learning_rate": 0.0001885536836077286, + "loss": 0.0, + "step": 27996 + }, + { + "epoch": 2.612391527479705, + "grad_norm": NaN, + "learning_rate": 0.00018854637388058396, + "loss": 0.0, + "step": 27997 + }, + { + "epoch": 2.6124848371745824, + "grad_norm": NaN, + "learning_rate": 0.00018853906405542732, + "loss": 0.0, + "step": 27998 + }, + { + "epoch": 2.6125781468694598, + "grad_norm": NaN, + "learning_rate": 0.00018853175413227724, + "loss": 0.0, + "step": 27999 + }, + { + "epoch": 2.612671456564337, + "grad_norm": NaN, + "learning_rate": 0.0001885244441111523, + "loss": 0.0, + "step": 28000 + }, + { + "epoch": 2.612764766259214, + "grad_norm": NaN, + "learning_rate": 0.00018851713399207115, + "loss": 0.0, + "step": 28001 + }, + { + "epoch": 2.6128580759540916, + "grad_norm": NaN, + "learning_rate": 0.00018850982377505233, + "loss": 0.0, + "step": 28002 + }, + { + "epoch": 2.612951385648969, + "grad_norm": NaN, + "learning_rate": 0.00018850251346011444, + "loss": 0.0, + "step": 28003 + }, + { + "epoch": 2.613044695343846, + "grad_norm": NaN, + "learning_rate": 0.00018849520304727605, + "loss": 0.0, + "step": 28004 + }, + { + "epoch": 2.6131380050387234, + "grad_norm": NaN, + "learning_rate": 0.00018848789253655574, + "loss": 0.0, + "step": 28005 + }, + { + "epoch": 2.613231314733601, + "grad_norm": NaN, + "learning_rate": 0.00018848058192797216, + "loss": 0.0, + "step": 28006 + }, + { + "epoch": 2.6133246244284782, + "grad_norm": NaN, + "learning_rate": 0.00018847327122154385, + "loss": 0.0, + "step": 28007 + }, + { + "epoch": 2.6134179341233557, + "grad_norm": NaN, + "learning_rate": 0.00018846596041728942, + "loss": 0.0, + "step": 28008 + }, + { + "epoch": 2.6135112438182326, + "grad_norm": NaN, + "learning_rate": 0.0001884586495152274, + "loss": 0.0, + "step": 28009 + }, + { + "epoch": 2.61360455351311, + "grad_norm": NaN, + "learning_rate": 0.00018845133851537644, + "loss": 0.0, + "step": 28010 + }, + { + "epoch": 2.6136978632079875, + "grad_norm": NaN, + "learning_rate": 0.00018844402741775515, + "loss": 0.0, + "step": 28011 + }, + { + "epoch": 2.6137911729028644, + "grad_norm": NaN, + "learning_rate": 0.00018843671622238204, + "loss": 0.0, + "step": 28012 + }, + { + "epoch": 2.613884482597742, + "grad_norm": NaN, + "learning_rate": 0.0001884294049292758, + "loss": 0.0, + "step": 28013 + }, + { + "epoch": 2.6139777922926193, + "grad_norm": NaN, + "learning_rate": 0.00018842209353845494, + "loss": 0.0, + "step": 28014 + }, + { + "epoch": 2.6140711019874967, + "grad_norm": NaN, + "learning_rate": 0.00018841478204993805, + "loss": 0.0, + "step": 28015 + }, + { + "epoch": 2.6141644116823737, + "grad_norm": NaN, + "learning_rate": 0.00018840747046374378, + "loss": 0.0, + "step": 28016 + }, + { + "epoch": 2.614257721377251, + "grad_norm": NaN, + "learning_rate": 0.0001884001587798907, + "loss": 0.0, + "step": 28017 + }, + { + "epoch": 2.6143510310721285, + "grad_norm": NaN, + "learning_rate": 0.00018839284699839737, + "loss": 0.0, + "step": 28018 + }, + { + "epoch": 2.6144443407670055, + "grad_norm": NaN, + "learning_rate": 0.00018838553511928242, + "loss": 0.0, + "step": 28019 + }, + { + "epoch": 2.614537650461883, + "grad_norm": NaN, + "learning_rate": 0.00018837822314256444, + "loss": 0.0, + "step": 28020 + }, + { + "epoch": 2.6146309601567603, + "grad_norm": NaN, + "learning_rate": 0.00018837091106826194, + "loss": 0.0, + "step": 28021 + }, + { + "epoch": 2.6147242698516378, + "grad_norm": NaN, + "learning_rate": 0.0001883635988963936, + "loss": 0.0, + "step": 28022 + }, + { + "epoch": 2.6148175795465147, + "grad_norm": NaN, + "learning_rate": 0.00018835628662697805, + "loss": 0.0, + "step": 28023 + }, + { + "epoch": 2.614910889241392, + "grad_norm": NaN, + "learning_rate": 0.00018834897426003378, + "loss": 0.0, + "step": 28024 + }, + { + "epoch": 2.6150041989362696, + "grad_norm": NaN, + "learning_rate": 0.00018834166179557943, + "loss": 0.0, + "step": 28025 + }, + { + "epoch": 2.6150975086311465, + "grad_norm": NaN, + "learning_rate": 0.0001883343492336336, + "loss": 0.0, + "step": 28026 + }, + { + "epoch": 2.615190818326024, + "grad_norm": NaN, + "learning_rate": 0.0001883270365742149, + "loss": 0.0, + "step": 28027 + }, + { + "epoch": 2.6152841280209014, + "grad_norm": NaN, + "learning_rate": 0.00018831972381734186, + "loss": 0.0, + "step": 28028 + }, + { + "epoch": 2.615377437715779, + "grad_norm": NaN, + "learning_rate": 0.0001883124109630331, + "loss": 0.0, + "step": 28029 + }, + { + "epoch": 2.615470747410656, + "grad_norm": NaN, + "learning_rate": 0.00018830509801130726, + "loss": 0.0, + "step": 28030 + }, + { + "epoch": 2.615564057105533, + "grad_norm": NaN, + "learning_rate": 0.00018829778496218293, + "loss": 0.0, + "step": 28031 + }, + { + "epoch": 2.6156573668004106, + "grad_norm": NaN, + "learning_rate": 0.00018829047181567862, + "loss": 0.0, + "step": 28032 + }, + { + "epoch": 2.6157506764952876, + "grad_norm": NaN, + "learning_rate": 0.00018828315857181304, + "loss": 0.0, + "step": 28033 + }, + { + "epoch": 2.615843986190165, + "grad_norm": NaN, + "learning_rate": 0.00018827584523060467, + "loss": 0.0, + "step": 28034 + }, + { + "epoch": 2.6159372958850424, + "grad_norm": NaN, + "learning_rate": 0.0001882685317920722, + "loss": 0.0, + "step": 28035 + }, + { + "epoch": 2.61603060557992, + "grad_norm": NaN, + "learning_rate": 0.0001882612182562342, + "loss": 0.0, + "step": 28036 + }, + { + "epoch": 2.6161239152747973, + "grad_norm": NaN, + "learning_rate": 0.0001882539046231092, + "loss": 0.0, + "step": 28037 + }, + { + "epoch": 2.6162172249696742, + "grad_norm": NaN, + "learning_rate": 0.0001882465908927159, + "loss": 0.0, + "step": 28038 + }, + { + "epoch": 2.6163105346645517, + "grad_norm": NaN, + "learning_rate": 0.00018823927706507285, + "loss": 0.0, + "step": 28039 + }, + { + "epoch": 2.616403844359429, + "grad_norm": NaN, + "learning_rate": 0.0001882319631401986, + "loss": 0.0, + "step": 28040 + }, + { + "epoch": 2.616497154054306, + "grad_norm": NaN, + "learning_rate": 0.0001882246491181118, + "loss": 0.0, + "step": 28041 + }, + { + "epoch": 2.6165904637491835, + "grad_norm": NaN, + "learning_rate": 0.0001882173349988311, + "loss": 0.0, + "step": 28042 + }, + { + "epoch": 2.616683773444061, + "grad_norm": NaN, + "learning_rate": 0.000188210020782375, + "loss": 0.0, + "step": 28043 + }, + { + "epoch": 2.6167770831389383, + "grad_norm": NaN, + "learning_rate": 0.0001882027064687621, + "loss": 0.0, + "step": 28044 + }, + { + "epoch": 2.6168703928338153, + "grad_norm": NaN, + "learning_rate": 0.00018819539205801113, + "loss": 0.0, + "step": 28045 + }, + { + "epoch": 2.6169637025286927, + "grad_norm": NaN, + "learning_rate": 0.0001881880775501405, + "loss": 0.0, + "step": 28046 + }, + { + "epoch": 2.61705701222357, + "grad_norm": NaN, + "learning_rate": 0.00018818076294516896, + "loss": 0.0, + "step": 28047 + }, + { + "epoch": 2.617150321918447, + "grad_norm": NaN, + "learning_rate": 0.00018817344824311502, + "loss": 0.0, + "step": 28048 + }, + { + "epoch": 2.6172436316133245, + "grad_norm": NaN, + "learning_rate": 0.0001881661334439973, + "loss": 0.0, + "step": 28049 + }, + { + "epoch": 2.617336941308202, + "grad_norm": NaN, + "learning_rate": 0.00018815881854783445, + "loss": 0.0, + "step": 28050 + }, + { + "epoch": 2.6174302510030794, + "grad_norm": NaN, + "learning_rate": 0.00018815150355464496, + "loss": 0.0, + "step": 28051 + }, + { + "epoch": 2.6175235606979568, + "grad_norm": NaN, + "learning_rate": 0.00018814418846444756, + "loss": 0.0, + "step": 28052 + }, + { + "epoch": 2.6176168703928338, + "grad_norm": NaN, + "learning_rate": 0.00018813687327726077, + "loss": 0.0, + "step": 28053 + }, + { + "epoch": 2.617710180087711, + "grad_norm": NaN, + "learning_rate": 0.0001881295579931032, + "loss": 0.0, + "step": 28054 + }, + { + "epoch": 2.617803489782588, + "grad_norm": NaN, + "learning_rate": 0.00018812224261199347, + "loss": 0.0, + "step": 28055 + }, + { + "epoch": 2.6178967994774656, + "grad_norm": NaN, + "learning_rate": 0.00018811492713395015, + "loss": 0.0, + "step": 28056 + }, + { + "epoch": 2.617990109172343, + "grad_norm": NaN, + "learning_rate": 0.0001881076115589919, + "loss": 0.0, + "step": 28057 + }, + { + "epoch": 2.6180834188672204, + "grad_norm": NaN, + "learning_rate": 0.00018810029588713724, + "loss": 0.0, + "step": 28058 + }, + { + "epoch": 2.618176728562098, + "grad_norm": NaN, + "learning_rate": 0.00018809298011840484, + "loss": 0.0, + "step": 28059 + }, + { + "epoch": 2.618270038256975, + "grad_norm": NaN, + "learning_rate": 0.0001880856642528132, + "loss": 0.0, + "step": 28060 + }, + { + "epoch": 2.618363347951852, + "grad_norm": NaN, + "learning_rate": 0.0001880783482903811, + "loss": 0.0, + "step": 28061 + }, + { + "epoch": 2.6184566576467296, + "grad_norm": NaN, + "learning_rate": 0.00018807103223112702, + "loss": 0.0, + "step": 28062 + }, + { + "epoch": 2.6185499673416066, + "grad_norm": NaN, + "learning_rate": 0.00018806371607506953, + "loss": 0.0, + "step": 28063 + }, + { + "epoch": 2.618643277036484, + "grad_norm": NaN, + "learning_rate": 0.00018805639982222736, + "loss": 0.0, + "step": 28064 + }, + { + "epoch": 2.6187365867313614, + "grad_norm": NaN, + "learning_rate": 0.000188049083472619, + "loss": 0.0, + "step": 28065 + }, + { + "epoch": 2.618829896426239, + "grad_norm": NaN, + "learning_rate": 0.00018804176702626304, + "loss": 0.0, + "step": 28066 + }, + { + "epoch": 2.618923206121116, + "grad_norm": NaN, + "learning_rate": 0.00018803445048317822, + "loss": 0.0, + "step": 28067 + }, + { + "epoch": 2.6190165158159933, + "grad_norm": NaN, + "learning_rate": 0.00018802713384338303, + "loss": 0.0, + "step": 28068 + }, + { + "epoch": 2.6191098255108707, + "grad_norm": NaN, + "learning_rate": 0.0001880198171068961, + "loss": 0.0, + "step": 28069 + }, + { + "epoch": 2.6192031352057477, + "grad_norm": NaN, + "learning_rate": 0.00018801250027373605, + "loss": 0.0, + "step": 28070 + }, + { + "epoch": 2.619296444900625, + "grad_norm": NaN, + "learning_rate": 0.00018800518334392144, + "loss": 0.0, + "step": 28071 + }, + { + "epoch": 2.6193897545955025, + "grad_norm": NaN, + "learning_rate": 0.00018799786631747094, + "loss": 0.0, + "step": 28072 + }, + { + "epoch": 2.61948306429038, + "grad_norm": NaN, + "learning_rate": 0.00018799054919440308, + "loss": 0.0, + "step": 28073 + }, + { + "epoch": 2.6195763739852573, + "grad_norm": NaN, + "learning_rate": 0.00018798323197473653, + "loss": 0.0, + "step": 28074 + }, + { + "epoch": 2.6196696836801343, + "grad_norm": NaN, + "learning_rate": 0.00018797591465848987, + "loss": 0.0, + "step": 28075 + }, + { + "epoch": 2.6197629933750117, + "grad_norm": NaN, + "learning_rate": 0.00018796859724568173, + "loss": 0.0, + "step": 28076 + }, + { + "epoch": 2.6198563030698887, + "grad_norm": NaN, + "learning_rate": 0.00018796127973633067, + "loss": 0.0, + "step": 28077 + }, + { + "epoch": 2.619949612764766, + "grad_norm": NaN, + "learning_rate": 0.00018795396213045534, + "loss": 0.0, + "step": 28078 + }, + { + "epoch": 2.6200429224596435, + "grad_norm": NaN, + "learning_rate": 0.00018794664442807428, + "loss": 0.0, + "step": 28079 + }, + { + "epoch": 2.620136232154521, + "grad_norm": NaN, + "learning_rate": 0.00018793932662920622, + "loss": 0.0, + "step": 28080 + }, + { + "epoch": 2.6202295418493984, + "grad_norm": NaN, + "learning_rate": 0.00018793200873386964, + "loss": 0.0, + "step": 28081 + }, + { + "epoch": 2.6203228515442754, + "grad_norm": NaN, + "learning_rate": 0.00018792469074208317, + "loss": 0.0, + "step": 28082 + }, + { + "epoch": 2.6204161612391528, + "grad_norm": NaN, + "learning_rate": 0.0001879173726538655, + "loss": 0.0, + "step": 28083 + }, + { + "epoch": 2.62050947093403, + "grad_norm": NaN, + "learning_rate": 0.0001879100544692352, + "loss": 0.0, + "step": 28084 + }, + { + "epoch": 2.620602780628907, + "grad_norm": NaN, + "learning_rate": 0.00018790273618821078, + "loss": 0.0, + "step": 28085 + }, + { + "epoch": 2.6206960903237846, + "grad_norm": NaN, + "learning_rate": 0.00018789541781081098, + "loss": 0.0, + "step": 28086 + }, + { + "epoch": 2.620789400018662, + "grad_norm": NaN, + "learning_rate": 0.00018788809933705438, + "loss": 0.0, + "step": 28087 + }, + { + "epoch": 2.6208827097135394, + "grad_norm": NaN, + "learning_rate": 0.00018788078076695952, + "loss": 0.0, + "step": 28088 + }, + { + "epoch": 2.6209760194084164, + "grad_norm": NaN, + "learning_rate": 0.00018787346210054513, + "loss": 0.0, + "step": 28089 + }, + { + "epoch": 2.621069329103294, + "grad_norm": NaN, + "learning_rate": 0.00018786614333782966, + "loss": 0.0, + "step": 28090 + }, + { + "epoch": 2.6211626387981712, + "grad_norm": NaN, + "learning_rate": 0.00018785882447883184, + "loss": 0.0, + "step": 28091 + }, + { + "epoch": 2.621255948493048, + "grad_norm": NaN, + "learning_rate": 0.00018785150552357024, + "loss": 0.0, + "step": 28092 + }, + { + "epoch": 2.6213492581879256, + "grad_norm": NaN, + "learning_rate": 0.00018784418647206348, + "loss": 0.0, + "step": 28093 + }, + { + "epoch": 2.621442567882803, + "grad_norm": NaN, + "learning_rate": 0.00018783686732433015, + "loss": 0.0, + "step": 28094 + }, + { + "epoch": 2.6215358775776805, + "grad_norm": NaN, + "learning_rate": 0.0001878295480803889, + "loss": 0.0, + "step": 28095 + }, + { + "epoch": 2.621629187272558, + "grad_norm": NaN, + "learning_rate": 0.0001878222287402583, + "loss": 0.0, + "step": 28096 + }, + { + "epoch": 2.621722496967435, + "grad_norm": NaN, + "learning_rate": 0.00018781490930395696, + "loss": 0.0, + "step": 28097 + }, + { + "epoch": 2.6218158066623123, + "grad_norm": NaN, + "learning_rate": 0.00018780758977150348, + "loss": 0.0, + "step": 28098 + }, + { + "epoch": 2.6219091163571893, + "grad_norm": NaN, + "learning_rate": 0.0001878002701429166, + "loss": 0.0, + "step": 28099 + }, + { + "epoch": 2.6220024260520667, + "grad_norm": NaN, + "learning_rate": 0.00018779295041821477, + "loss": 0.0, + "step": 28100 + }, + { + "epoch": 2.622095735746944, + "grad_norm": NaN, + "learning_rate": 0.00018778563059741662, + "loss": 0.0, + "step": 28101 + }, + { + "epoch": 2.6221890454418215, + "grad_norm": NaN, + "learning_rate": 0.0001877783106805409, + "loss": 0.0, + "step": 28102 + }, + { + "epoch": 2.622282355136699, + "grad_norm": NaN, + "learning_rate": 0.00018777099066760603, + "loss": 0.0, + "step": 28103 + }, + { + "epoch": 2.622375664831576, + "grad_norm": NaN, + "learning_rate": 0.00018776367055863076, + "loss": 0.0, + "step": 28104 + }, + { + "epoch": 2.6224689745264533, + "grad_norm": NaN, + "learning_rate": 0.0001877563503536337, + "loss": 0.0, + "step": 28105 + }, + { + "epoch": 2.6225622842213308, + "grad_norm": NaN, + "learning_rate": 0.0001877490300526334, + "loss": 0.0, + "step": 28106 + }, + { + "epoch": 2.6226555939162077, + "grad_norm": NaN, + "learning_rate": 0.00018774170965564843, + "loss": 0.0, + "step": 28107 + }, + { + "epoch": 2.622748903611085, + "grad_norm": NaN, + "learning_rate": 0.0001877343891626976, + "loss": 0.0, + "step": 28108 + }, + { + "epoch": 2.6228422133059626, + "grad_norm": NaN, + "learning_rate": 0.0001877270685737993, + "loss": 0.0, + "step": 28109 + }, + { + "epoch": 2.62293552300084, + "grad_norm": NaN, + "learning_rate": 0.00018771974788897225, + "loss": 0.0, + "step": 28110 + }, + { + "epoch": 2.623028832695717, + "grad_norm": NaN, + "learning_rate": 0.0001877124271082351, + "loss": 0.0, + "step": 28111 + }, + { + "epoch": 2.6231221423905944, + "grad_norm": NaN, + "learning_rate": 0.00018770510623160636, + "loss": 0.0, + "step": 28112 + }, + { + "epoch": 2.623215452085472, + "grad_norm": NaN, + "learning_rate": 0.00018769778525910475, + "loss": 0.0, + "step": 28113 + }, + { + "epoch": 2.6233087617803488, + "grad_norm": NaN, + "learning_rate": 0.00018769046419074881, + "loss": 0.0, + "step": 28114 + }, + { + "epoch": 2.623402071475226, + "grad_norm": NaN, + "learning_rate": 0.00018768314302655722, + "loss": 0.0, + "step": 28115 + }, + { + "epoch": 2.6234953811701036, + "grad_norm": NaN, + "learning_rate": 0.00018767582176654855, + "loss": 0.0, + "step": 28116 + }, + { + "epoch": 2.623588690864981, + "grad_norm": NaN, + "learning_rate": 0.0001876685004107414, + "loss": 0.0, + "step": 28117 + }, + { + "epoch": 2.623682000559858, + "grad_norm": NaN, + "learning_rate": 0.00018766117895915443, + "loss": 0.0, + "step": 28118 + }, + { + "epoch": 2.6237753102547354, + "grad_norm": NaN, + "learning_rate": 0.00018765385741180624, + "loss": 0.0, + "step": 28119 + }, + { + "epoch": 2.623868619949613, + "grad_norm": NaN, + "learning_rate": 0.0001876465357687154, + "loss": 0.0, + "step": 28120 + }, + { + "epoch": 2.62396192964449, + "grad_norm": NaN, + "learning_rate": 0.00018763921402990062, + "loss": 0.0, + "step": 28121 + }, + { + "epoch": 2.6240552393393672, + "grad_norm": NaN, + "learning_rate": 0.0001876318921953805, + "loss": 0.0, + "step": 28122 + }, + { + "epoch": 2.6241485490342447, + "grad_norm": NaN, + "learning_rate": 0.00018762457026517355, + "loss": 0.0, + "step": 28123 + }, + { + "epoch": 2.624241858729122, + "grad_norm": NaN, + "learning_rate": 0.00018761724823929853, + "loss": 0.0, + "step": 28124 + }, + { + "epoch": 2.6243351684239995, + "grad_norm": NaN, + "learning_rate": 0.00018760992611777393, + "loss": 0.0, + "step": 28125 + }, + { + "epoch": 2.6244284781188765, + "grad_norm": NaN, + "learning_rate": 0.00018760260390061843, + "loss": 0.0, + "step": 28126 + }, + { + "epoch": 2.624521787813754, + "grad_norm": NaN, + "learning_rate": 0.0001875952815878507, + "loss": 0.0, + "step": 28127 + }, + { + "epoch": 2.6246150975086313, + "grad_norm": NaN, + "learning_rate": 0.0001875879591794893, + "loss": 0.0, + "step": 28128 + }, + { + "epoch": 2.6247084072035083, + "grad_norm": NaN, + "learning_rate": 0.0001875806366755528, + "loss": 0.0, + "step": 28129 + }, + { + "epoch": 2.6248017168983857, + "grad_norm": NaN, + "learning_rate": 0.00018757331407605993, + "loss": 0.0, + "step": 28130 + }, + { + "epoch": 2.624895026593263, + "grad_norm": NaN, + "learning_rate": 0.00018756599138102921, + "loss": 0.0, + "step": 28131 + }, + { + "epoch": 2.6249883362881405, + "grad_norm": NaN, + "learning_rate": 0.0001875586685904793, + "loss": 0.0, + "step": 28132 + }, + { + "epoch": 2.6250816459830175, + "grad_norm": NaN, + "learning_rate": 0.00018755134570442886, + "loss": 0.0, + "step": 28133 + }, + { + "epoch": 2.625174955677895, + "grad_norm": NaN, + "learning_rate": 0.00018754402272289647, + "loss": 0.0, + "step": 28134 + }, + { + "epoch": 2.6252682653727724, + "grad_norm": NaN, + "learning_rate": 0.00018753669964590073, + "loss": 0.0, + "step": 28135 + }, + { + "epoch": 2.6253615750676493, + "grad_norm": NaN, + "learning_rate": 0.00018752937647346023, + "loss": 0.0, + "step": 28136 + }, + { + "epoch": 2.6254548847625268, + "grad_norm": NaN, + "learning_rate": 0.00018752205320559376, + "loss": 0.0, + "step": 28137 + }, + { + "epoch": 2.625548194457404, + "grad_norm": NaN, + "learning_rate": 0.00018751472984231978, + "loss": 0.0, + "step": 28138 + }, + { + "epoch": 2.6256415041522816, + "grad_norm": NaN, + "learning_rate": 0.0001875074063836569, + "loss": 0.0, + "step": 28139 + }, + { + "epoch": 2.6257348138471586, + "grad_norm": NaN, + "learning_rate": 0.0001875000828296239, + "loss": 0.0, + "step": 28140 + }, + { + "epoch": 2.625828123542036, + "grad_norm": NaN, + "learning_rate": 0.0001874927591802392, + "loss": 0.0, + "step": 28141 + }, + { + "epoch": 2.6259214332369134, + "grad_norm": NaN, + "learning_rate": 0.00018748543543552153, + "loss": 0.0, + "step": 28142 + }, + { + "epoch": 2.6260147429317904, + "grad_norm": NaN, + "learning_rate": 0.00018747811159548953, + "loss": 0.0, + "step": 28143 + }, + { + "epoch": 2.626108052626668, + "grad_norm": NaN, + "learning_rate": 0.00018747078766016178, + "loss": 0.0, + "step": 28144 + }, + { + "epoch": 2.626201362321545, + "grad_norm": NaN, + "learning_rate": 0.00018746346362955688, + "loss": 0.0, + "step": 28145 + }, + { + "epoch": 2.6262946720164226, + "grad_norm": NaN, + "learning_rate": 0.00018745613950369358, + "loss": 0.0, + "step": 28146 + }, + { + "epoch": 2.6263879817113, + "grad_norm": NaN, + "learning_rate": 0.00018744881528259038, + "loss": 0.0, + "step": 28147 + }, + { + "epoch": 2.626481291406177, + "grad_norm": NaN, + "learning_rate": 0.00018744149096626588, + "loss": 0.0, + "step": 28148 + }, + { + "epoch": 2.6265746011010545, + "grad_norm": NaN, + "learning_rate": 0.00018743416655473883, + "loss": 0.0, + "step": 28149 + }, + { + "epoch": 2.6266679107959314, + "grad_norm": NaN, + "learning_rate": 0.00018742684204802777, + "loss": 0.0, + "step": 28150 + }, + { + "epoch": 2.626761220490809, + "grad_norm": NaN, + "learning_rate": 0.00018741951744615128, + "loss": 0.0, + "step": 28151 + }, + { + "epoch": 2.6268545301856863, + "grad_norm": NaN, + "learning_rate": 0.00018741219274912807, + "loss": 0.0, + "step": 28152 + }, + { + "epoch": 2.6269478398805637, + "grad_norm": NaN, + "learning_rate": 0.00018740486795697678, + "loss": 0.0, + "step": 28153 + }, + { + "epoch": 2.627041149575441, + "grad_norm": NaN, + "learning_rate": 0.00018739754306971594, + "loss": 0.0, + "step": 28154 + }, + { + "epoch": 2.627134459270318, + "grad_norm": NaN, + "learning_rate": 0.00018739021808736423, + "loss": 0.0, + "step": 28155 + }, + { + "epoch": 2.6272277689651955, + "grad_norm": NaN, + "learning_rate": 0.00018738289300994034, + "loss": 0.0, + "step": 28156 + }, + { + "epoch": 2.627321078660073, + "grad_norm": NaN, + "learning_rate": 0.00018737556783746275, + "loss": 0.0, + "step": 28157 + }, + { + "epoch": 2.62741438835495, + "grad_norm": NaN, + "learning_rate": 0.00018736824256995017, + "loss": 0.0, + "step": 28158 + }, + { + "epoch": 2.6275076980498273, + "grad_norm": NaN, + "learning_rate": 0.00018736091720742128, + "loss": 0.0, + "step": 28159 + }, + { + "epoch": 2.6276010077447047, + "grad_norm": NaN, + "learning_rate": 0.0001873535917498946, + "loss": 0.0, + "step": 28160 + }, + { + "epoch": 2.627694317439582, + "grad_norm": NaN, + "learning_rate": 0.00018734626619738878, + "loss": 0.0, + "step": 28161 + }, + { + "epoch": 2.627787627134459, + "grad_norm": NaN, + "learning_rate": 0.0001873389405499225, + "loss": 0.0, + "step": 28162 + }, + { + "epoch": 2.6278809368293365, + "grad_norm": NaN, + "learning_rate": 0.00018733161480751433, + "loss": 0.0, + "step": 28163 + }, + { + "epoch": 2.627974246524214, + "grad_norm": NaN, + "learning_rate": 0.0001873242889701829, + "loss": 0.0, + "step": 28164 + }, + { + "epoch": 2.628067556219091, + "grad_norm": NaN, + "learning_rate": 0.00018731696303794692, + "loss": 0.0, + "step": 28165 + }, + { + "epoch": 2.6281608659139684, + "grad_norm": NaN, + "learning_rate": 0.00018730963701082495, + "loss": 0.0, + "step": 28166 + }, + { + "epoch": 2.6282541756088458, + "grad_norm": NaN, + "learning_rate": 0.00018730231088883557, + "loss": 0.0, + "step": 28167 + }, + { + "epoch": 2.628347485303723, + "grad_norm": NaN, + "learning_rate": 0.00018729498467199754, + "loss": 0.0, + "step": 28168 + }, + { + "epoch": 2.6284407949986006, + "grad_norm": NaN, + "learning_rate": 0.00018728765836032937, + "loss": 0.0, + "step": 28169 + }, + { + "epoch": 2.6285341046934776, + "grad_norm": NaN, + "learning_rate": 0.00018728033195384973, + "loss": 0.0, + "step": 28170 + }, + { + "epoch": 2.628627414388355, + "grad_norm": NaN, + "learning_rate": 0.00018727300545257723, + "loss": 0.0, + "step": 28171 + }, + { + "epoch": 2.628720724083232, + "grad_norm": NaN, + "learning_rate": 0.00018726567885653058, + "loss": 0.0, + "step": 28172 + }, + { + "epoch": 2.6288140337781094, + "grad_norm": NaN, + "learning_rate": 0.00018725835216572824, + "loss": 0.0, + "step": 28173 + }, + { + "epoch": 2.628907343472987, + "grad_norm": NaN, + "learning_rate": 0.00018725102538018904, + "loss": 0.0, + "step": 28174 + }, + { + "epoch": 2.6290006531678642, + "grad_norm": NaN, + "learning_rate": 0.00018724369849993153, + "loss": 0.0, + "step": 28175 + }, + { + "epoch": 2.6290939628627417, + "grad_norm": NaN, + "learning_rate": 0.00018723637152497425, + "loss": 0.0, + "step": 28176 + }, + { + "epoch": 2.6291872725576186, + "grad_norm": NaN, + "learning_rate": 0.00018722904445533594, + "loss": 0.0, + "step": 28177 + }, + { + "epoch": 2.629280582252496, + "grad_norm": NaN, + "learning_rate": 0.00018722171729103522, + "loss": 0.0, + "step": 28178 + }, + { + "epoch": 2.6293738919473735, + "grad_norm": NaN, + "learning_rate": 0.00018721439003209066, + "loss": 0.0, + "step": 28179 + }, + { + "epoch": 2.6294672016422505, + "grad_norm": NaN, + "learning_rate": 0.00018720706267852098, + "loss": 0.0, + "step": 28180 + }, + { + "epoch": 2.629560511337128, + "grad_norm": NaN, + "learning_rate": 0.00018719973523034474, + "loss": 0.0, + "step": 28181 + }, + { + "epoch": 2.6296538210320053, + "grad_norm": NaN, + "learning_rate": 0.0001871924076875806, + "loss": 0.0, + "step": 28182 + }, + { + "epoch": 2.6297471307268827, + "grad_norm": NaN, + "learning_rate": 0.00018718508005024712, + "loss": 0.0, + "step": 28183 + }, + { + "epoch": 2.6298404404217597, + "grad_norm": NaN, + "learning_rate": 0.00018717775231836308, + "loss": 0.0, + "step": 28184 + }, + { + "epoch": 2.629933750116637, + "grad_norm": NaN, + "learning_rate": 0.000187170424491947, + "loss": 0.0, + "step": 28185 + }, + { + "epoch": 2.6300270598115145, + "grad_norm": NaN, + "learning_rate": 0.0001871630965710175, + "loss": 0.0, + "step": 28186 + }, + { + "epoch": 2.6301203695063915, + "grad_norm": NaN, + "learning_rate": 0.00018715576855559332, + "loss": 0.0, + "step": 28187 + }, + { + "epoch": 2.630213679201269, + "grad_norm": NaN, + "learning_rate": 0.00018714844044569298, + "loss": 0.0, + "step": 28188 + }, + { + "epoch": 2.6303069888961463, + "grad_norm": NaN, + "learning_rate": 0.00018714111224133517, + "loss": 0.0, + "step": 28189 + }, + { + "epoch": 2.6304002985910238, + "grad_norm": NaN, + "learning_rate": 0.00018713378394253854, + "loss": 0.0, + "step": 28190 + }, + { + "epoch": 2.630493608285901, + "grad_norm": NaN, + "learning_rate": 0.0001871264555493217, + "loss": 0.0, + "step": 28191 + }, + { + "epoch": 2.630586917980778, + "grad_norm": NaN, + "learning_rate": 0.00018711912706170322, + "loss": 0.0, + "step": 28192 + }, + { + "epoch": 2.6306802276756556, + "grad_norm": NaN, + "learning_rate": 0.00018711179847970182, + "loss": 0.0, + "step": 28193 + }, + { + "epoch": 2.6307735373705325, + "grad_norm": NaN, + "learning_rate": 0.00018710446980333616, + "loss": 0.0, + "step": 28194 + }, + { + "epoch": 2.63086684706541, + "grad_norm": NaN, + "learning_rate": 0.00018709714103262476, + "loss": 0.0, + "step": 28195 + }, + { + "epoch": 2.6309601567602874, + "grad_norm": NaN, + "learning_rate": 0.00018708981216758634, + "loss": 0.0, + "step": 28196 + }, + { + "epoch": 2.631053466455165, + "grad_norm": NaN, + "learning_rate": 0.00018708248320823957, + "loss": 0.0, + "step": 28197 + }, + { + "epoch": 2.631146776150042, + "grad_norm": NaN, + "learning_rate": 0.00018707515415460294, + "loss": 0.0, + "step": 28198 + }, + { + "epoch": 2.631240085844919, + "grad_norm": NaN, + "learning_rate": 0.00018706782500669525, + "loss": 0.0, + "step": 28199 + }, + { + "epoch": 2.6313333955397966, + "grad_norm": NaN, + "learning_rate": 0.00018706049576453504, + "loss": 0.0, + "step": 28200 + }, + { + "epoch": 2.631426705234674, + "grad_norm": NaN, + "learning_rate": 0.00018705316642814092, + "loss": 0.0, + "step": 28201 + }, + { + "epoch": 2.631520014929551, + "grad_norm": NaN, + "learning_rate": 0.0001870458369975316, + "loss": 0.0, + "step": 28202 + }, + { + "epoch": 2.6316133246244284, + "grad_norm": NaN, + "learning_rate": 0.00018703850747272575, + "loss": 0.0, + "step": 28203 + }, + { + "epoch": 2.631706634319306, + "grad_norm": NaN, + "learning_rate": 0.0001870311778537419, + "loss": 0.0, + "step": 28204 + }, + { + "epoch": 2.6317999440141833, + "grad_norm": NaN, + "learning_rate": 0.00018702384814059872, + "loss": 0.0, + "step": 28205 + }, + { + "epoch": 2.6318932537090602, + "grad_norm": NaN, + "learning_rate": 0.00018701651833331488, + "loss": 0.0, + "step": 28206 + }, + { + "epoch": 2.6319865634039377, + "grad_norm": NaN, + "learning_rate": 0.00018700918843190902, + "loss": 0.0, + "step": 28207 + }, + { + "epoch": 2.632079873098815, + "grad_norm": NaN, + "learning_rate": 0.00018700185843639966, + "loss": 0.0, + "step": 28208 + }, + { + "epoch": 2.632173182793692, + "grad_norm": NaN, + "learning_rate": 0.00018699452834680563, + "loss": 0.0, + "step": 28209 + }, + { + "epoch": 2.6322664924885695, + "grad_norm": NaN, + "learning_rate": 0.00018698719816314548, + "loss": 0.0, + "step": 28210 + }, + { + "epoch": 2.632359802183447, + "grad_norm": NaN, + "learning_rate": 0.00018697986788543774, + "loss": 0.0, + "step": 28211 + }, + { + "epoch": 2.6324531118783243, + "grad_norm": NaN, + "learning_rate": 0.00018697253751370123, + "loss": 0.0, + "step": 28212 + }, + { + "epoch": 2.6325464215732013, + "grad_norm": NaN, + "learning_rate": 0.00018696520704795456, + "loss": 0.0, + "step": 28213 + }, + { + "epoch": 2.6326397312680787, + "grad_norm": NaN, + "learning_rate": 0.0001869578764882162, + "loss": 0.0, + "step": 28214 + }, + { + "epoch": 2.632733040962956, + "grad_norm": NaN, + "learning_rate": 0.00018695054583450497, + "loss": 0.0, + "step": 28215 + }, + { + "epoch": 2.632826350657833, + "grad_norm": NaN, + "learning_rate": 0.00018694321508683947, + "loss": 0.0, + "step": 28216 + }, + { + "epoch": 2.6329196603527105, + "grad_norm": NaN, + "learning_rate": 0.00018693588424523828, + "loss": 0.0, + "step": 28217 + }, + { + "epoch": 2.633012970047588, + "grad_norm": NaN, + "learning_rate": 0.0001869285533097201, + "loss": 0.0, + "step": 28218 + }, + { + "epoch": 2.6331062797424654, + "grad_norm": NaN, + "learning_rate": 0.00018692122228030356, + "loss": 0.0, + "step": 28219 + }, + { + "epoch": 2.633199589437343, + "grad_norm": NaN, + "learning_rate": 0.00018691389115700721, + "loss": 0.0, + "step": 28220 + }, + { + "epoch": 2.6332928991322198, + "grad_norm": NaN, + "learning_rate": 0.00018690655993984983, + "loss": 0.0, + "step": 28221 + }, + { + "epoch": 2.633386208827097, + "grad_norm": NaN, + "learning_rate": 0.00018689922862885004, + "loss": 0.0, + "step": 28222 + }, + { + "epoch": 2.6334795185219746, + "grad_norm": NaN, + "learning_rate": 0.00018689189722402634, + "loss": 0.0, + "step": 28223 + }, + { + "epoch": 2.6335728282168516, + "grad_norm": NaN, + "learning_rate": 0.00018688456572539753, + "loss": 0.0, + "step": 28224 + }, + { + "epoch": 2.633666137911729, + "grad_norm": NaN, + "learning_rate": 0.00018687723413298224, + "loss": 0.0, + "step": 28225 + }, + { + "epoch": 2.6337594476066064, + "grad_norm": NaN, + "learning_rate": 0.000186869902446799, + "loss": 0.0, + "step": 28226 + }, + { + "epoch": 2.633852757301484, + "grad_norm": NaN, + "learning_rate": 0.00018686257066686653, + "loss": 0.0, + "step": 28227 + }, + { + "epoch": 2.633946066996361, + "grad_norm": NaN, + "learning_rate": 0.00018685523879320346, + "loss": 0.0, + "step": 28228 + }, + { + "epoch": 2.634039376691238, + "grad_norm": NaN, + "learning_rate": 0.0001868479068258285, + "loss": 0.0, + "step": 28229 + }, + { + "epoch": 2.6341326863861156, + "grad_norm": NaN, + "learning_rate": 0.00018684057476476014, + "loss": 0.0, + "step": 28230 + }, + { + "epoch": 2.6342259960809926, + "grad_norm": NaN, + "learning_rate": 0.00018683324261001716, + "loss": 0.0, + "step": 28231 + }, + { + "epoch": 2.63431930577587, + "grad_norm": NaN, + "learning_rate": 0.00018682591036161815, + "loss": 0.0, + "step": 28232 + }, + { + "epoch": 2.6344126154707475, + "grad_norm": NaN, + "learning_rate": 0.00018681857801958171, + "loss": 0.0, + "step": 28233 + }, + { + "epoch": 2.634505925165625, + "grad_norm": NaN, + "learning_rate": 0.0001868112455839266, + "loss": 0.0, + "step": 28234 + }, + { + "epoch": 2.634599234860502, + "grad_norm": NaN, + "learning_rate": 0.00018680391305467137, + "loss": 0.0, + "step": 28235 + }, + { + "epoch": 2.6346925445553793, + "grad_norm": NaN, + "learning_rate": 0.0001867965804318347, + "loss": 0.0, + "step": 28236 + }, + { + "epoch": 2.6347858542502567, + "grad_norm": NaN, + "learning_rate": 0.00018678924771543522, + "loss": 0.0, + "step": 28237 + }, + { + "epoch": 2.6348791639451337, + "grad_norm": NaN, + "learning_rate": 0.0001867819149054916, + "loss": 0.0, + "step": 28238 + }, + { + "epoch": 2.634972473640011, + "grad_norm": NaN, + "learning_rate": 0.0001867745820020224, + "loss": 0.0, + "step": 28239 + }, + { + "epoch": 2.6350657833348885, + "grad_norm": NaN, + "learning_rate": 0.00018676724900504638, + "loss": 0.0, + "step": 28240 + }, + { + "epoch": 2.635159093029766, + "grad_norm": NaN, + "learning_rate": 0.00018675991591458218, + "loss": 0.0, + "step": 28241 + }, + { + "epoch": 2.6352524027246433, + "grad_norm": NaN, + "learning_rate": 0.0001867525827306483, + "loss": 0.0, + "step": 28242 + }, + { + "epoch": 2.6353457124195203, + "grad_norm": NaN, + "learning_rate": 0.00018674524945326354, + "loss": 0.0, + "step": 28243 + }, + { + "epoch": 2.6354390221143977, + "grad_norm": NaN, + "learning_rate": 0.00018673791608244655, + "loss": 0.0, + "step": 28244 + }, + { + "epoch": 2.6355323318092747, + "grad_norm": NaN, + "learning_rate": 0.00018673058261821585, + "loss": 0.0, + "step": 28245 + }, + { + "epoch": 2.635625641504152, + "grad_norm": NaN, + "learning_rate": 0.00018672324906059018, + "loss": 0.0, + "step": 28246 + }, + { + "epoch": 2.6357189511990295, + "grad_norm": NaN, + "learning_rate": 0.00018671591540958817, + "loss": 0.0, + "step": 28247 + }, + { + "epoch": 2.635812260893907, + "grad_norm": NaN, + "learning_rate": 0.00018670858166522846, + "loss": 0.0, + "step": 28248 + }, + { + "epoch": 2.6359055705887844, + "grad_norm": NaN, + "learning_rate": 0.0001867012478275297, + "loss": 0.0, + "step": 28249 + }, + { + "epoch": 2.6359988802836614, + "grad_norm": NaN, + "learning_rate": 0.00018669391389651053, + "loss": 0.0, + "step": 28250 + }, + { + "epoch": 2.636092189978539, + "grad_norm": NaN, + "learning_rate": 0.00018668657987218963, + "loss": 0.0, + "step": 28251 + }, + { + "epoch": 2.636185499673416, + "grad_norm": NaN, + "learning_rate": 0.0001866792457545856, + "loss": 0.0, + "step": 28252 + }, + { + "epoch": 2.636278809368293, + "grad_norm": NaN, + "learning_rate": 0.00018667191154371712, + "loss": 0.0, + "step": 28253 + }, + { + "epoch": 2.6363721190631706, + "grad_norm": NaN, + "learning_rate": 0.0001866645772396029, + "loss": 0.0, + "step": 28254 + }, + { + "epoch": 2.636465428758048, + "grad_norm": NaN, + "learning_rate": 0.0001866572428422614, + "loss": 0.0, + "step": 28255 + }, + { + "epoch": 2.6365587384529254, + "grad_norm": NaN, + "learning_rate": 0.00018664990835171144, + "loss": 0.0, + "step": 28256 + }, + { + "epoch": 2.6366520481478024, + "grad_norm": NaN, + "learning_rate": 0.00018664257376797166, + "loss": 0.0, + "step": 28257 + }, + { + "epoch": 2.63674535784268, + "grad_norm": NaN, + "learning_rate": 0.00018663523909106063, + "loss": 0.0, + "step": 28258 + }, + { + "epoch": 2.6368386675375572, + "grad_norm": NaN, + "learning_rate": 0.00018662790432099702, + "loss": 0.0, + "step": 28259 + }, + { + "epoch": 2.636931977232434, + "grad_norm": NaN, + "learning_rate": 0.00018662056945779957, + "loss": 0.0, + "step": 28260 + }, + { + "epoch": 2.6370252869273116, + "grad_norm": NaN, + "learning_rate": 0.0001866132345014868, + "loss": 0.0, + "step": 28261 + }, + { + "epoch": 2.637118596622189, + "grad_norm": NaN, + "learning_rate": 0.00018660589945207741, + "loss": 0.0, + "step": 28262 + }, + { + "epoch": 2.6372119063170665, + "grad_norm": NaN, + "learning_rate": 0.00018659856430959014, + "loss": 0.0, + "step": 28263 + }, + { + "epoch": 2.637305216011944, + "grad_norm": NaN, + "learning_rate": 0.00018659122907404346, + "loss": 0.0, + "step": 28264 + }, + { + "epoch": 2.637398525706821, + "grad_norm": NaN, + "learning_rate": 0.00018658389374545618, + "loss": 0.0, + "step": 28265 + }, + { + "epoch": 2.6374918354016983, + "grad_norm": NaN, + "learning_rate": 0.00018657655832384688, + "loss": 0.0, + "step": 28266 + }, + { + "epoch": 2.6375851450965753, + "grad_norm": NaN, + "learning_rate": 0.00018656922280923423, + "loss": 0.0, + "step": 28267 + }, + { + "epoch": 2.6376784547914527, + "grad_norm": NaN, + "learning_rate": 0.00018656188720163688, + "loss": 0.0, + "step": 28268 + }, + { + "epoch": 2.63777176448633, + "grad_norm": NaN, + "learning_rate": 0.00018655455150107348, + "loss": 0.0, + "step": 28269 + }, + { + "epoch": 2.6378650741812075, + "grad_norm": NaN, + "learning_rate": 0.00018654721570756275, + "loss": 0.0, + "step": 28270 + }, + { + "epoch": 2.637958383876085, + "grad_norm": NaN, + "learning_rate": 0.00018653987982112318, + "loss": 0.0, + "step": 28271 + }, + { + "epoch": 2.638051693570962, + "grad_norm": NaN, + "learning_rate": 0.00018653254384177353, + "loss": 0.0, + "step": 28272 + }, + { + "epoch": 2.6381450032658393, + "grad_norm": NaN, + "learning_rate": 0.0001865252077695325, + "loss": 0.0, + "step": 28273 + }, + { + "epoch": 2.6382383129607168, + "grad_norm": NaN, + "learning_rate": 0.00018651787160441863, + "loss": 0.0, + "step": 28274 + }, + { + "epoch": 2.6383316226555937, + "grad_norm": NaN, + "learning_rate": 0.00018651053534645066, + "loss": 0.0, + "step": 28275 + }, + { + "epoch": 2.638424932350471, + "grad_norm": NaN, + "learning_rate": 0.00018650319899564724, + "loss": 0.0, + "step": 28276 + }, + { + "epoch": 2.6385182420453486, + "grad_norm": NaN, + "learning_rate": 0.00018649586255202694, + "loss": 0.0, + "step": 28277 + }, + { + "epoch": 2.638611551740226, + "grad_norm": NaN, + "learning_rate": 0.0001864885260156085, + "loss": 0.0, + "step": 28278 + }, + { + "epoch": 2.638704861435103, + "grad_norm": NaN, + "learning_rate": 0.00018648118938641056, + "loss": 0.0, + "step": 28279 + }, + { + "epoch": 2.6387981711299804, + "grad_norm": NaN, + "learning_rate": 0.0001864738526644517, + "loss": 0.0, + "step": 28280 + }, + { + "epoch": 2.638891480824858, + "grad_norm": NaN, + "learning_rate": 0.00018646651584975068, + "loss": 0.0, + "step": 28281 + }, + { + "epoch": 2.638984790519735, + "grad_norm": NaN, + "learning_rate": 0.00018645917894232614, + "loss": 0.0, + "step": 28282 + }, + { + "epoch": 2.639078100214612, + "grad_norm": NaN, + "learning_rate": 0.00018645184194219667, + "loss": 0.0, + "step": 28283 + }, + { + "epoch": 2.6391714099094896, + "grad_norm": NaN, + "learning_rate": 0.00018644450484938096, + "loss": 0.0, + "step": 28284 + }, + { + "epoch": 2.639264719604367, + "grad_norm": NaN, + "learning_rate": 0.0001864371676638977, + "loss": 0.0, + "step": 28285 + }, + { + "epoch": 2.6393580292992445, + "grad_norm": NaN, + "learning_rate": 0.00018642983038576552, + "loss": 0.0, + "step": 28286 + }, + { + "epoch": 2.6394513389941214, + "grad_norm": NaN, + "learning_rate": 0.00018642249301500303, + "loss": 0.0, + "step": 28287 + }, + { + "epoch": 2.639544648688999, + "grad_norm": NaN, + "learning_rate": 0.00018641515555162897, + "loss": 0.0, + "step": 28288 + }, + { + "epoch": 2.639637958383876, + "grad_norm": NaN, + "learning_rate": 0.00018640781799566196, + "loss": 0.0, + "step": 28289 + }, + { + "epoch": 2.6397312680787532, + "grad_norm": NaN, + "learning_rate": 0.00018640048034712062, + "loss": 0.0, + "step": 28290 + }, + { + "epoch": 2.6398245777736307, + "grad_norm": NaN, + "learning_rate": 0.00018639314260602364, + "loss": 0.0, + "step": 28291 + }, + { + "epoch": 2.639917887468508, + "grad_norm": NaN, + "learning_rate": 0.0001863858047723897, + "loss": 0.0, + "step": 28292 + }, + { + "epoch": 2.6400111971633855, + "grad_norm": NaN, + "learning_rate": 0.00018637846684623743, + "loss": 0.0, + "step": 28293 + }, + { + "epoch": 2.6401045068582625, + "grad_norm": NaN, + "learning_rate": 0.00018637112882758552, + "loss": 0.0, + "step": 28294 + }, + { + "epoch": 2.64019781655314, + "grad_norm": NaN, + "learning_rate": 0.00018636379071645262, + "loss": 0.0, + "step": 28295 + }, + { + "epoch": 2.6402911262480173, + "grad_norm": NaN, + "learning_rate": 0.00018635645251285728, + "loss": 0.0, + "step": 28296 + }, + { + "epoch": 2.6403844359428943, + "grad_norm": NaN, + "learning_rate": 0.0001863491142168183, + "loss": 0.0, + "step": 28297 + }, + { + "epoch": 2.6404777456377717, + "grad_norm": NaN, + "learning_rate": 0.00018634177582835436, + "loss": 0.0, + "step": 28298 + }, + { + "epoch": 2.640571055332649, + "grad_norm": NaN, + "learning_rate": 0.0001863344373474839, + "loss": 0.0, + "step": 28299 + }, + { + "epoch": 2.6406643650275266, + "grad_norm": NaN, + "learning_rate": 0.00018632709877422583, + "loss": 0.0, + "step": 28300 + }, + { + "epoch": 2.6407576747224035, + "grad_norm": NaN, + "learning_rate": 0.0001863197601085987, + "loss": 0.0, + "step": 28301 + }, + { + "epoch": 2.640850984417281, + "grad_norm": NaN, + "learning_rate": 0.00018631242135062114, + "loss": 0.0, + "step": 28302 + }, + { + "epoch": 2.6409442941121584, + "grad_norm": NaN, + "learning_rate": 0.0001863050825003119, + "loss": 0.0, + "step": 28303 + }, + { + "epoch": 2.6410376038070353, + "grad_norm": NaN, + "learning_rate": 0.00018629774355768955, + "loss": 0.0, + "step": 28304 + }, + { + "epoch": 2.6411309135019128, + "grad_norm": NaN, + "learning_rate": 0.00018629040452277282, + "loss": 0.0, + "step": 28305 + }, + { + "epoch": 2.64122422319679, + "grad_norm": NaN, + "learning_rate": 0.00018628306539558031, + "loss": 0.0, + "step": 28306 + }, + { + "epoch": 2.6413175328916676, + "grad_norm": NaN, + "learning_rate": 0.00018627572617613074, + "loss": 0.0, + "step": 28307 + }, + { + "epoch": 2.641410842586545, + "grad_norm": NaN, + "learning_rate": 0.00018626838686444276, + "loss": 0.0, + "step": 28308 + }, + { + "epoch": 2.641504152281422, + "grad_norm": NaN, + "learning_rate": 0.00018626104746053497, + "loss": 0.0, + "step": 28309 + }, + { + "epoch": 2.6415974619762994, + "grad_norm": NaN, + "learning_rate": 0.0001862537079644261, + "loss": 0.0, + "step": 28310 + }, + { + "epoch": 2.6416907716711764, + "grad_norm": NaN, + "learning_rate": 0.0001862463683761348, + "loss": 0.0, + "step": 28311 + }, + { + "epoch": 2.641784081366054, + "grad_norm": NaN, + "learning_rate": 0.00018623902869567972, + "loss": 0.0, + "step": 28312 + }, + { + "epoch": 2.6418773910609312, + "grad_norm": NaN, + "learning_rate": 0.00018623168892307953, + "loss": 0.0, + "step": 28313 + }, + { + "epoch": 2.6419707007558086, + "grad_norm": NaN, + "learning_rate": 0.00018622434905835283, + "loss": 0.0, + "step": 28314 + }, + { + "epoch": 2.642064010450686, + "grad_norm": NaN, + "learning_rate": 0.0001862170091015184, + "loss": 0.0, + "step": 28315 + }, + { + "epoch": 2.642157320145563, + "grad_norm": NaN, + "learning_rate": 0.0001862096690525948, + "loss": 0.0, + "step": 28316 + }, + { + "epoch": 2.6422506298404405, + "grad_norm": NaN, + "learning_rate": 0.00018620232891160084, + "loss": 0.0, + "step": 28317 + }, + { + "epoch": 2.642343939535318, + "grad_norm": NaN, + "learning_rate": 0.00018619498867855495, + "loss": 0.0, + "step": 28318 + }, + { + "epoch": 2.642437249230195, + "grad_norm": NaN, + "learning_rate": 0.00018618764835347598, + "loss": 0.0, + "step": 28319 + }, + { + "epoch": 2.6425305589250723, + "grad_norm": NaN, + "learning_rate": 0.00018618030793638257, + "loss": 0.0, + "step": 28320 + }, + { + "epoch": 2.6426238686199497, + "grad_norm": NaN, + "learning_rate": 0.00018617296742729332, + "loss": 0.0, + "step": 28321 + }, + { + "epoch": 2.642717178314827, + "grad_norm": NaN, + "learning_rate": 0.00018616562682622693, + "loss": 0.0, + "step": 28322 + }, + { + "epoch": 2.642810488009704, + "grad_norm": NaN, + "learning_rate": 0.00018615828613320205, + "loss": 0.0, + "step": 28323 + }, + { + "epoch": 2.6429037977045815, + "grad_norm": NaN, + "learning_rate": 0.00018615094534823738, + "loss": 0.0, + "step": 28324 + }, + { + "epoch": 2.642997107399459, + "grad_norm": NaN, + "learning_rate": 0.0001861436044713516, + "loss": 0.0, + "step": 28325 + }, + { + "epoch": 2.643090417094336, + "grad_norm": NaN, + "learning_rate": 0.00018613626350256327, + "loss": 0.0, + "step": 28326 + }, + { + "epoch": 2.6431837267892133, + "grad_norm": NaN, + "learning_rate": 0.00018612892244189116, + "loss": 0.0, + "step": 28327 + }, + { + "epoch": 2.6432770364840907, + "grad_norm": NaN, + "learning_rate": 0.00018612158128935392, + "loss": 0.0, + "step": 28328 + }, + { + "epoch": 2.643370346178968, + "grad_norm": NaN, + "learning_rate": 0.00018611424004497018, + "loss": 0.0, + "step": 28329 + }, + { + "epoch": 2.643463655873845, + "grad_norm": NaN, + "learning_rate": 0.00018610689870875863, + "loss": 0.0, + "step": 28330 + }, + { + "epoch": 2.6435569655687225, + "grad_norm": NaN, + "learning_rate": 0.00018609955728073792, + "loss": 0.0, + "step": 28331 + }, + { + "epoch": 2.6436502752636, + "grad_norm": NaN, + "learning_rate": 0.00018609221576092674, + "loss": 0.0, + "step": 28332 + }, + { + "epoch": 2.643743584958477, + "grad_norm": NaN, + "learning_rate": 0.00018608487414934375, + "loss": 0.0, + "step": 28333 + }, + { + "epoch": 2.6438368946533544, + "grad_norm": NaN, + "learning_rate": 0.0001860775324460076, + "loss": 0.0, + "step": 28334 + }, + { + "epoch": 2.643930204348232, + "grad_norm": NaN, + "learning_rate": 0.00018607019065093697, + "loss": 0.0, + "step": 28335 + }, + { + "epoch": 2.644023514043109, + "grad_norm": NaN, + "learning_rate": 0.00018606284876415056, + "loss": 0.0, + "step": 28336 + }, + { + "epoch": 2.6441168237379866, + "grad_norm": NaN, + "learning_rate": 0.00018605550678566697, + "loss": 0.0, + "step": 28337 + }, + { + "epoch": 2.6442101334328636, + "grad_norm": NaN, + "learning_rate": 0.00018604816471550493, + "loss": 0.0, + "step": 28338 + }, + { + "epoch": 2.644303443127741, + "grad_norm": NaN, + "learning_rate": 0.0001860408225536831, + "loss": 0.0, + "step": 28339 + }, + { + "epoch": 2.6443967528226184, + "grad_norm": NaN, + "learning_rate": 0.0001860334803002201, + "loss": 0.0, + "step": 28340 + }, + { + "epoch": 2.6444900625174954, + "grad_norm": NaN, + "learning_rate": 0.00018602613795513467, + "loss": 0.0, + "step": 28341 + }, + { + "epoch": 2.644583372212373, + "grad_norm": NaN, + "learning_rate": 0.00018601879551844542, + "loss": 0.0, + "step": 28342 + }, + { + "epoch": 2.6446766819072502, + "grad_norm": NaN, + "learning_rate": 0.00018601145299017104, + "loss": 0.0, + "step": 28343 + }, + { + "epoch": 2.6447699916021277, + "grad_norm": NaN, + "learning_rate": 0.00018600411037033023, + "loss": 0.0, + "step": 28344 + }, + { + "epoch": 2.6448633012970046, + "grad_norm": NaN, + "learning_rate": 0.0001859967676589416, + "loss": 0.0, + "step": 28345 + }, + { + "epoch": 2.644956610991882, + "grad_norm": NaN, + "learning_rate": 0.00018598942485602383, + "loss": 0.0, + "step": 28346 + }, + { + "epoch": 2.6450499206867595, + "grad_norm": NaN, + "learning_rate": 0.00018598208196159567, + "loss": 0.0, + "step": 28347 + }, + { + "epoch": 2.6451432303816365, + "grad_norm": NaN, + "learning_rate": 0.0001859747389756757, + "loss": 0.0, + "step": 28348 + }, + { + "epoch": 2.645236540076514, + "grad_norm": NaN, + "learning_rate": 0.00018596739589828263, + "loss": 0.0, + "step": 28349 + }, + { + "epoch": 2.6453298497713913, + "grad_norm": NaN, + "learning_rate": 0.00018596005272943515, + "loss": 0.0, + "step": 28350 + }, + { + "epoch": 2.6454231594662687, + "grad_norm": NaN, + "learning_rate": 0.00018595270946915188, + "loss": 0.0, + "step": 28351 + }, + { + "epoch": 2.6455164691611457, + "grad_norm": NaN, + "learning_rate": 0.00018594536611745154, + "loss": 0.0, + "step": 28352 + }, + { + "epoch": 2.645609778856023, + "grad_norm": NaN, + "learning_rate": 0.00018593802267435276, + "loss": 0.0, + "step": 28353 + }, + { + "epoch": 2.6457030885509005, + "grad_norm": NaN, + "learning_rate": 0.00018593067913987425, + "loss": 0.0, + "step": 28354 + }, + { + "epoch": 2.6457963982457775, + "grad_norm": NaN, + "learning_rate": 0.00018592333551403466, + "loss": 0.0, + "step": 28355 + }, + { + "epoch": 2.645889707940655, + "grad_norm": NaN, + "learning_rate": 0.00018591599179685266, + "loss": 0.0, + "step": 28356 + }, + { + "epoch": 2.6459830176355323, + "grad_norm": NaN, + "learning_rate": 0.00018590864798834696, + "loss": 0.0, + "step": 28357 + }, + { + "epoch": 2.6460763273304098, + "grad_norm": NaN, + "learning_rate": 0.00018590130408853618, + "loss": 0.0, + "step": 28358 + }, + { + "epoch": 2.646169637025287, + "grad_norm": NaN, + "learning_rate": 0.00018589396009743904, + "loss": 0.0, + "step": 28359 + }, + { + "epoch": 2.646262946720164, + "grad_norm": NaN, + "learning_rate": 0.0001858866160150742, + "loss": 0.0, + "step": 28360 + }, + { + "epoch": 2.6463562564150416, + "grad_norm": NaN, + "learning_rate": 0.00018587927184146027, + "loss": 0.0, + "step": 28361 + }, + { + "epoch": 2.6464495661099185, + "grad_norm": NaN, + "learning_rate": 0.00018587192757661602, + "loss": 0.0, + "step": 28362 + }, + { + "epoch": 2.646542875804796, + "grad_norm": NaN, + "learning_rate": 0.00018586458322056006, + "loss": 0.0, + "step": 28363 + }, + { + "epoch": 2.6466361854996734, + "grad_norm": NaN, + "learning_rate": 0.00018585723877331115, + "loss": 0.0, + "step": 28364 + }, + { + "epoch": 2.646729495194551, + "grad_norm": NaN, + "learning_rate": 0.00018584989423488783, + "loss": 0.0, + "step": 28365 + }, + { + "epoch": 2.6468228048894282, + "grad_norm": NaN, + "learning_rate": 0.00018584254960530888, + "loss": 0.0, + "step": 28366 + }, + { + "epoch": 2.646916114584305, + "grad_norm": NaN, + "learning_rate": 0.00018583520488459296, + "loss": 0.0, + "step": 28367 + }, + { + "epoch": 2.6470094242791826, + "grad_norm": NaN, + "learning_rate": 0.00018582786007275868, + "loss": 0.0, + "step": 28368 + }, + { + "epoch": 2.64710273397406, + "grad_norm": NaN, + "learning_rate": 0.00018582051516982483, + "loss": 0.0, + "step": 28369 + }, + { + "epoch": 2.647196043668937, + "grad_norm": NaN, + "learning_rate": 0.00018581317017580997, + "loss": 0.0, + "step": 28370 + }, + { + "epoch": 2.6472893533638144, + "grad_norm": NaN, + "learning_rate": 0.00018580582509073284, + "loss": 0.0, + "step": 28371 + }, + { + "epoch": 2.647382663058692, + "grad_norm": NaN, + "learning_rate": 0.00018579847991461213, + "loss": 0.0, + "step": 28372 + }, + { + "epoch": 2.6474759727535693, + "grad_norm": NaN, + "learning_rate": 0.00018579113464746645, + "loss": 0.0, + "step": 28373 + }, + { + "epoch": 2.6475692824484462, + "grad_norm": NaN, + "learning_rate": 0.00018578378928931454, + "loss": 0.0, + "step": 28374 + }, + { + "epoch": 2.6476625921433237, + "grad_norm": NaN, + "learning_rate": 0.00018577644384017505, + "loss": 0.0, + "step": 28375 + }, + { + "epoch": 2.647755901838201, + "grad_norm": NaN, + "learning_rate": 0.00018576909830006664, + "loss": 0.0, + "step": 28376 + }, + { + "epoch": 2.647849211533078, + "grad_norm": NaN, + "learning_rate": 0.00018576175266900804, + "loss": 0.0, + "step": 28377 + }, + { + "epoch": 2.6479425212279555, + "grad_norm": NaN, + "learning_rate": 0.00018575440694701788, + "loss": 0.0, + "step": 28378 + }, + { + "epoch": 2.648035830922833, + "grad_norm": NaN, + "learning_rate": 0.00018574706113411486, + "loss": 0.0, + "step": 28379 + }, + { + "epoch": 2.6481291406177103, + "grad_norm": NaN, + "learning_rate": 0.00018573971523031768, + "loss": 0.0, + "step": 28380 + }, + { + "epoch": 2.6482224503125877, + "grad_norm": NaN, + "learning_rate": 0.00018573236923564494, + "loss": 0.0, + "step": 28381 + }, + { + "epoch": 2.6483157600074647, + "grad_norm": NaN, + "learning_rate": 0.0001857250231501154, + "loss": 0.0, + "step": 28382 + }, + { + "epoch": 2.648409069702342, + "grad_norm": NaN, + "learning_rate": 0.0001857176769737477, + "loss": 0.0, + "step": 28383 + }, + { + "epoch": 2.648502379397219, + "grad_norm": NaN, + "learning_rate": 0.0001857103307065605, + "loss": 0.0, + "step": 28384 + }, + { + "epoch": 2.6485956890920965, + "grad_norm": NaN, + "learning_rate": 0.0001857029843485726, + "loss": 0.0, + "step": 28385 + }, + { + "epoch": 2.648688998786974, + "grad_norm": NaN, + "learning_rate": 0.0001856956378998025, + "loss": 0.0, + "step": 28386 + }, + { + "epoch": 2.6487823084818514, + "grad_norm": NaN, + "learning_rate": 0.000185688291360269, + "loss": 0.0, + "step": 28387 + }, + { + "epoch": 2.648875618176729, + "grad_norm": NaN, + "learning_rate": 0.00018568094472999071, + "loss": 0.0, + "step": 28388 + }, + { + "epoch": 2.6489689278716058, + "grad_norm": NaN, + "learning_rate": 0.00018567359800898637, + "loss": 0.0, + "step": 28389 + }, + { + "epoch": 2.649062237566483, + "grad_norm": NaN, + "learning_rate": 0.00018566625119727465, + "loss": 0.0, + "step": 28390 + }, + { + "epoch": 2.6491555472613606, + "grad_norm": NaN, + "learning_rate": 0.00018565890429487423, + "loss": 0.0, + "step": 28391 + }, + { + "epoch": 2.6492488569562376, + "grad_norm": NaN, + "learning_rate": 0.00018565155730180372, + "loss": 0.0, + "step": 28392 + }, + { + "epoch": 2.649342166651115, + "grad_norm": NaN, + "learning_rate": 0.00018564421021808193, + "loss": 0.0, + "step": 28393 + }, + { + "epoch": 2.6494354763459924, + "grad_norm": NaN, + "learning_rate": 0.00018563686304372745, + "loss": 0.0, + "step": 28394 + }, + { + "epoch": 2.64952878604087, + "grad_norm": NaN, + "learning_rate": 0.00018562951577875896, + "loss": 0.0, + "step": 28395 + }, + { + "epoch": 2.649622095735747, + "grad_norm": NaN, + "learning_rate": 0.00018562216842319516, + "loss": 0.0, + "step": 28396 + }, + { + "epoch": 2.6497154054306242, + "grad_norm": NaN, + "learning_rate": 0.00018561482097705475, + "loss": 0.0, + "step": 28397 + }, + { + "epoch": 2.6498087151255016, + "grad_norm": NaN, + "learning_rate": 0.00018560747344035643, + "loss": 0.0, + "step": 28398 + }, + { + "epoch": 2.6499020248203786, + "grad_norm": NaN, + "learning_rate": 0.0001856001258131188, + "loss": 0.0, + "step": 28399 + }, + { + "epoch": 2.649995334515256, + "grad_norm": NaN, + "learning_rate": 0.00018559277809536065, + "loss": 0.0, + "step": 28400 + }, + { + "epoch": 2.6500886442101335, + "grad_norm": NaN, + "learning_rate": 0.00018558543028710057, + "loss": 0.0, + "step": 28401 + }, + { + "epoch": 2.650181953905011, + "grad_norm": NaN, + "learning_rate": 0.00018557808238835733, + "loss": 0.0, + "step": 28402 + }, + { + "epoch": 2.6502752635998883, + "grad_norm": NaN, + "learning_rate": 0.0001855707343991495, + "loss": 0.0, + "step": 28403 + }, + { + "epoch": 2.6503685732947653, + "grad_norm": NaN, + "learning_rate": 0.00018556338631949587, + "loss": 0.0, + "step": 28404 + }, + { + "epoch": 2.6504618829896427, + "grad_norm": NaN, + "learning_rate": 0.0001855560381494151, + "loss": 0.0, + "step": 28405 + }, + { + "epoch": 2.6505551926845197, + "grad_norm": NaN, + "learning_rate": 0.00018554868988892583, + "loss": 0.0, + "step": 28406 + }, + { + "epoch": 2.650648502379397, + "grad_norm": NaN, + "learning_rate": 0.0001855413415380468, + "loss": 0.0, + "step": 28407 + }, + { + "epoch": 2.6507418120742745, + "grad_norm": NaN, + "learning_rate": 0.00018553399309679662, + "loss": 0.0, + "step": 28408 + }, + { + "epoch": 2.650835121769152, + "grad_norm": NaN, + "learning_rate": 0.00018552664456519404, + "loss": 0.0, + "step": 28409 + }, + { + "epoch": 2.6509284314640293, + "grad_norm": NaN, + "learning_rate": 0.00018551929594325773, + "loss": 0.0, + "step": 28410 + }, + { + "epoch": 2.6510217411589063, + "grad_norm": NaN, + "learning_rate": 0.00018551194723100638, + "loss": 0.0, + "step": 28411 + }, + { + "epoch": 2.6511150508537837, + "grad_norm": NaN, + "learning_rate": 0.00018550459842845866, + "loss": 0.0, + "step": 28412 + }, + { + "epoch": 2.651208360548661, + "grad_norm": NaN, + "learning_rate": 0.00018549724953563325, + "loss": 0.0, + "step": 28413 + }, + { + "epoch": 2.651301670243538, + "grad_norm": NaN, + "learning_rate": 0.00018548990055254887, + "loss": 0.0, + "step": 28414 + }, + { + "epoch": 2.6513949799384156, + "grad_norm": NaN, + "learning_rate": 0.00018548255147922418, + "loss": 0.0, + "step": 28415 + }, + { + "epoch": 2.651488289633293, + "grad_norm": NaN, + "learning_rate": 0.00018547520231567785, + "loss": 0.0, + "step": 28416 + }, + { + "epoch": 2.6515815993281704, + "grad_norm": NaN, + "learning_rate": 0.0001854678530619286, + "loss": 0.0, + "step": 28417 + }, + { + "epoch": 2.6516749090230474, + "grad_norm": NaN, + "learning_rate": 0.00018546050371799513, + "loss": 0.0, + "step": 28418 + }, + { + "epoch": 2.651768218717925, + "grad_norm": NaN, + "learning_rate": 0.0001854531542838961, + "loss": 0.0, + "step": 28419 + }, + { + "epoch": 2.651861528412802, + "grad_norm": NaN, + "learning_rate": 0.00018544580475965015, + "loss": 0.0, + "step": 28420 + }, + { + "epoch": 2.651954838107679, + "grad_norm": NaN, + "learning_rate": 0.00018543845514527608, + "loss": 0.0, + "step": 28421 + }, + { + "epoch": 2.6520481478025566, + "grad_norm": NaN, + "learning_rate": 0.00018543110544079247, + "loss": 0.0, + "step": 28422 + }, + { + "epoch": 2.652141457497434, + "grad_norm": NaN, + "learning_rate": 0.0001854237556462181, + "loss": 0.0, + "step": 28423 + }, + { + "epoch": 2.6522347671923114, + "grad_norm": NaN, + "learning_rate": 0.0001854164057615716, + "loss": 0.0, + "step": 28424 + }, + { + "epoch": 2.6523280768871884, + "grad_norm": NaN, + "learning_rate": 0.00018540905578687166, + "loss": 0.0, + "step": 28425 + }, + { + "epoch": 2.652421386582066, + "grad_norm": NaN, + "learning_rate": 0.00018540170572213697, + "loss": 0.0, + "step": 28426 + }, + { + "epoch": 2.6525146962769433, + "grad_norm": NaN, + "learning_rate": 0.00018539435556738624, + "loss": 0.0, + "step": 28427 + }, + { + "epoch": 2.6526080059718202, + "grad_norm": NaN, + "learning_rate": 0.00018538700532263816, + "loss": 0.0, + "step": 28428 + }, + { + "epoch": 2.6527013156666976, + "grad_norm": NaN, + "learning_rate": 0.00018537965498791138, + "loss": 0.0, + "step": 28429 + }, + { + "epoch": 2.652794625361575, + "grad_norm": NaN, + "learning_rate": 0.00018537230456322463, + "loss": 0.0, + "step": 28430 + }, + { + "epoch": 2.6528879350564525, + "grad_norm": NaN, + "learning_rate": 0.00018536495404859662, + "loss": 0.0, + "step": 28431 + }, + { + "epoch": 2.65298124475133, + "grad_norm": NaN, + "learning_rate": 0.00018535760344404596, + "loss": 0.0, + "step": 28432 + }, + { + "epoch": 2.653074554446207, + "grad_norm": NaN, + "learning_rate": 0.00018535025274959142, + "loss": 0.0, + "step": 28433 + }, + { + "epoch": 2.6531678641410843, + "grad_norm": NaN, + "learning_rate": 0.00018534290196525166, + "loss": 0.0, + "step": 28434 + }, + { + "epoch": 2.6532611738359617, + "grad_norm": NaN, + "learning_rate": 0.00018533555109104534, + "loss": 0.0, + "step": 28435 + }, + { + "epoch": 2.6533544835308387, + "grad_norm": NaN, + "learning_rate": 0.0001853282001269912, + "loss": 0.0, + "step": 28436 + }, + { + "epoch": 2.653447793225716, + "grad_norm": NaN, + "learning_rate": 0.0001853208490731079, + "loss": 0.0, + "step": 28437 + }, + { + "epoch": 2.6535411029205935, + "grad_norm": NaN, + "learning_rate": 0.00018531349792941414, + "loss": 0.0, + "step": 28438 + }, + { + "epoch": 2.653634412615471, + "grad_norm": NaN, + "learning_rate": 0.00018530614669592864, + "loss": 0.0, + "step": 28439 + }, + { + "epoch": 2.653727722310348, + "grad_norm": NaN, + "learning_rate": 0.00018529879537267007, + "loss": 0.0, + "step": 28440 + }, + { + "epoch": 2.6538210320052253, + "grad_norm": NaN, + "learning_rate": 0.0001852914439596571, + "loss": 0.0, + "step": 28441 + }, + { + "epoch": 2.6539143417001028, + "grad_norm": NaN, + "learning_rate": 0.00018528409245690845, + "loss": 0.0, + "step": 28442 + }, + { + "epoch": 2.6540076513949797, + "grad_norm": NaN, + "learning_rate": 0.0001852767408644428, + "loss": 0.0, + "step": 28443 + }, + { + "epoch": 2.654100961089857, + "grad_norm": NaN, + "learning_rate": 0.00018526938918227886, + "loss": 0.0, + "step": 28444 + }, + { + "epoch": 2.6541942707847346, + "grad_norm": NaN, + "learning_rate": 0.0001852620374104353, + "loss": 0.0, + "step": 28445 + }, + { + "epoch": 2.654287580479612, + "grad_norm": NaN, + "learning_rate": 0.00018525468554893081, + "loss": 0.0, + "step": 28446 + }, + { + "epoch": 2.654380890174489, + "grad_norm": NaN, + "learning_rate": 0.00018524733359778412, + "loss": 0.0, + "step": 28447 + }, + { + "epoch": 2.6544741998693664, + "grad_norm": NaN, + "learning_rate": 0.00018523998155701388, + "loss": 0.0, + "step": 28448 + }, + { + "epoch": 2.654567509564244, + "grad_norm": NaN, + "learning_rate": 0.00018523262942663878, + "loss": 0.0, + "step": 28449 + }, + { + "epoch": 2.654660819259121, + "grad_norm": NaN, + "learning_rate": 0.00018522527720667766, + "loss": 0.0, + "step": 28450 + }, + { + "epoch": 2.654754128953998, + "grad_norm": NaN, + "learning_rate": 0.000185217924897149, + "loss": 0.0, + "step": 28451 + }, + { + "epoch": 2.6548474386488756, + "grad_norm": NaN, + "learning_rate": 0.0001852105724980716, + "loss": 0.0, + "step": 28452 + }, + { + "epoch": 2.654940748343753, + "grad_norm": NaN, + "learning_rate": 0.00018520322000946415, + "loss": 0.0, + "step": 28453 + }, + { + "epoch": 2.6550340580386305, + "grad_norm": NaN, + "learning_rate": 0.00018519586743134532, + "loss": 0.0, + "step": 28454 + }, + { + "epoch": 2.6551273677335074, + "grad_norm": NaN, + "learning_rate": 0.00018518851476373387, + "loss": 0.0, + "step": 28455 + }, + { + "epoch": 2.655220677428385, + "grad_norm": NaN, + "learning_rate": 0.0001851811620066484, + "loss": 0.0, + "step": 28456 + }, + { + "epoch": 2.655313987123262, + "grad_norm": NaN, + "learning_rate": 0.0001851738091601077, + "loss": 0.0, + "step": 28457 + }, + { + "epoch": 2.6554072968181393, + "grad_norm": NaN, + "learning_rate": 0.0001851664562241304, + "loss": 0.0, + "step": 28458 + }, + { + "epoch": 2.6555006065130167, + "grad_norm": NaN, + "learning_rate": 0.00018515910319873518, + "loss": 0.0, + "step": 28459 + }, + { + "epoch": 2.655593916207894, + "grad_norm": NaN, + "learning_rate": 0.00018515175008394083, + "loss": 0.0, + "step": 28460 + }, + { + "epoch": 2.6556872259027715, + "grad_norm": NaN, + "learning_rate": 0.000185144396879766, + "loss": 0.0, + "step": 28461 + }, + { + "epoch": 2.6557805355976485, + "grad_norm": NaN, + "learning_rate": 0.00018513704358622934, + "loss": 0.0, + "step": 28462 + }, + { + "epoch": 2.655873845292526, + "grad_norm": NaN, + "learning_rate": 0.0001851296902033496, + "loss": 0.0, + "step": 28463 + }, + { + "epoch": 2.6559671549874033, + "grad_norm": NaN, + "learning_rate": 0.00018512233673114548, + "loss": 0.0, + "step": 28464 + }, + { + "epoch": 2.6560604646822803, + "grad_norm": NaN, + "learning_rate": 0.00018511498316963563, + "loss": 0.0, + "step": 28465 + }, + { + "epoch": 2.6561537743771577, + "grad_norm": NaN, + "learning_rate": 0.0001851076295188388, + "loss": 0.0, + "step": 28466 + }, + { + "epoch": 2.656247084072035, + "grad_norm": NaN, + "learning_rate": 0.0001851002757787737, + "loss": 0.0, + "step": 28467 + }, + { + "epoch": 2.6563403937669126, + "grad_norm": NaN, + "learning_rate": 0.00018509292194945895, + "loss": 0.0, + "step": 28468 + }, + { + "epoch": 2.6564337034617895, + "grad_norm": NaN, + "learning_rate": 0.00018508556803091333, + "loss": 0.0, + "step": 28469 + }, + { + "epoch": 2.656527013156667, + "grad_norm": NaN, + "learning_rate": 0.00018507821402315546, + "loss": 0.0, + "step": 28470 + }, + { + "epoch": 2.6566203228515444, + "grad_norm": NaN, + "learning_rate": 0.0001850708599262041, + "loss": 0.0, + "step": 28471 + }, + { + "epoch": 2.6567136325464213, + "grad_norm": NaN, + "learning_rate": 0.00018506350574007795, + "loss": 0.0, + "step": 28472 + }, + { + "epoch": 2.6568069422412988, + "grad_norm": NaN, + "learning_rate": 0.0001850561514647957, + "loss": 0.0, + "step": 28473 + }, + { + "epoch": 2.656900251936176, + "grad_norm": NaN, + "learning_rate": 0.00018504879710037601, + "loss": 0.0, + "step": 28474 + }, + { + "epoch": 2.6569935616310536, + "grad_norm": NaN, + "learning_rate": 0.0001850414426468376, + "loss": 0.0, + "step": 28475 + }, + { + "epoch": 2.657086871325931, + "grad_norm": NaN, + "learning_rate": 0.00018503408810419922, + "loss": 0.0, + "step": 28476 + }, + { + "epoch": 2.657180181020808, + "grad_norm": NaN, + "learning_rate": 0.0001850267334724795, + "loss": 0.0, + "step": 28477 + }, + { + "epoch": 2.6572734907156854, + "grad_norm": NaN, + "learning_rate": 0.00018501937875169719, + "loss": 0.0, + "step": 28478 + }, + { + "epoch": 2.6573668004105624, + "grad_norm": NaN, + "learning_rate": 0.000185012023941871, + "loss": 0.0, + "step": 28479 + }, + { + "epoch": 2.65746011010544, + "grad_norm": NaN, + "learning_rate": 0.00018500466904301953, + "loss": 0.0, + "step": 28480 + }, + { + "epoch": 2.6575534198003172, + "grad_norm": NaN, + "learning_rate": 0.00018499731405516163, + "loss": 0.0, + "step": 28481 + }, + { + "epoch": 2.6576467294951946, + "grad_norm": NaN, + "learning_rate": 0.00018498995897831588, + "loss": 0.0, + "step": 28482 + }, + { + "epoch": 2.657740039190072, + "grad_norm": NaN, + "learning_rate": 0.00018498260381250107, + "loss": 0.0, + "step": 28483 + }, + { + "epoch": 2.657833348884949, + "grad_norm": NaN, + "learning_rate": 0.00018497524855773582, + "loss": 0.0, + "step": 28484 + }, + { + "epoch": 2.6579266585798265, + "grad_norm": NaN, + "learning_rate": 0.0001849678932140389, + "loss": 0.0, + "step": 28485 + }, + { + "epoch": 2.658019968274704, + "grad_norm": NaN, + "learning_rate": 0.000184960537781429, + "loss": 0.0, + "step": 28486 + }, + { + "epoch": 2.658113277969581, + "grad_norm": NaN, + "learning_rate": 0.00018495318225992473, + "loss": 0.0, + "step": 28487 + }, + { + "epoch": 2.6582065876644583, + "grad_norm": NaN, + "learning_rate": 0.00018494582664954496, + "loss": 0.0, + "step": 28488 + }, + { + "epoch": 2.6582998973593357, + "grad_norm": NaN, + "learning_rate": 0.00018493847095030826, + "loss": 0.0, + "step": 28489 + }, + { + "epoch": 2.658393207054213, + "grad_norm": NaN, + "learning_rate": 0.00018493111516223337, + "loss": 0.0, + "step": 28490 + }, + { + "epoch": 2.65848651674909, + "grad_norm": NaN, + "learning_rate": 0.00018492375928533906, + "loss": 0.0, + "step": 28491 + }, + { + "epoch": 2.6585798264439675, + "grad_norm": NaN, + "learning_rate": 0.00018491640331964393, + "loss": 0.0, + "step": 28492 + }, + { + "epoch": 2.658673136138845, + "grad_norm": NaN, + "learning_rate": 0.00018490904726516672, + "loss": 0.0, + "step": 28493 + }, + { + "epoch": 2.658766445833722, + "grad_norm": NaN, + "learning_rate": 0.00018490169112192617, + "loss": 0.0, + "step": 28494 + }, + { + "epoch": 2.6588597555285993, + "grad_norm": NaN, + "learning_rate": 0.00018489433488994096, + "loss": 0.0, + "step": 28495 + }, + { + "epoch": 2.6589530652234767, + "grad_norm": NaN, + "learning_rate": 0.00018488697856922976, + "loss": 0.0, + "step": 28496 + }, + { + "epoch": 2.659046374918354, + "grad_norm": NaN, + "learning_rate": 0.00018487962215981134, + "loss": 0.0, + "step": 28497 + }, + { + "epoch": 2.6591396846132316, + "grad_norm": NaN, + "learning_rate": 0.00018487226566170435, + "loss": 0.0, + "step": 28498 + }, + { + "epoch": 2.6592329943081086, + "grad_norm": NaN, + "learning_rate": 0.00018486490907492753, + "loss": 0.0, + "step": 28499 + }, + { + "epoch": 2.659326304002986, + "grad_norm": NaN, + "learning_rate": 0.0001848575523994996, + "loss": 0.0, + "step": 28500 + }, + { + "epoch": 2.659419613697863, + "grad_norm": NaN, + "learning_rate": 0.00018485019563543918, + "loss": 0.0, + "step": 28501 + }, + { + "epoch": 2.6595129233927404, + "grad_norm": NaN, + "learning_rate": 0.00018484283878276506, + "loss": 0.0, + "step": 28502 + }, + { + "epoch": 2.659606233087618, + "grad_norm": NaN, + "learning_rate": 0.00018483548184149596, + "loss": 0.0, + "step": 28503 + }, + { + "epoch": 2.659699542782495, + "grad_norm": NaN, + "learning_rate": 0.00018482812481165052, + "loss": 0.0, + "step": 28504 + }, + { + "epoch": 2.6597928524773726, + "grad_norm": NaN, + "learning_rate": 0.00018482076769324744, + "loss": 0.0, + "step": 28505 + }, + { + "epoch": 2.6598861621722496, + "grad_norm": NaN, + "learning_rate": 0.0001848134104863055, + "loss": 0.0, + "step": 28506 + }, + { + "epoch": 2.659979471867127, + "grad_norm": NaN, + "learning_rate": 0.0001848060531908434, + "loss": 0.0, + "step": 28507 + }, + { + "epoch": 2.6600727815620044, + "grad_norm": NaN, + "learning_rate": 0.0001847986958068798, + "loss": 0.0, + "step": 28508 + }, + { + "epoch": 2.6601660912568814, + "grad_norm": NaN, + "learning_rate": 0.00018479133833443335, + "loss": 0.0, + "step": 28509 + }, + { + "epoch": 2.660259400951759, + "grad_norm": NaN, + "learning_rate": 0.00018478398077352294, + "loss": 0.0, + "step": 28510 + }, + { + "epoch": 2.6603527106466363, + "grad_norm": NaN, + "learning_rate": 0.00018477662312416711, + "loss": 0.0, + "step": 28511 + }, + { + "epoch": 2.6604460203415137, + "grad_norm": NaN, + "learning_rate": 0.0001847692653863846, + "loss": 0.0, + "step": 28512 + }, + { + "epoch": 2.6605393300363906, + "grad_norm": NaN, + "learning_rate": 0.00018476190756019423, + "loss": 0.0, + "step": 28513 + }, + { + "epoch": 2.660632639731268, + "grad_norm": NaN, + "learning_rate": 0.0001847545496456146, + "loss": 0.0, + "step": 28514 + }, + { + "epoch": 2.6607259494261455, + "grad_norm": NaN, + "learning_rate": 0.00018474719164266438, + "loss": 0.0, + "step": 28515 + }, + { + "epoch": 2.6608192591210225, + "grad_norm": NaN, + "learning_rate": 0.0001847398335513624, + "loss": 0.0, + "step": 28516 + }, + { + "epoch": 2.6609125688159, + "grad_norm": NaN, + "learning_rate": 0.00018473247537172734, + "loss": 0.0, + "step": 28517 + }, + { + "epoch": 2.6610058785107773, + "grad_norm": NaN, + "learning_rate": 0.00018472511710377788, + "loss": 0.0, + "step": 28518 + }, + { + "epoch": 2.6610991882056547, + "grad_norm": NaN, + "learning_rate": 0.00018471775874753267, + "loss": 0.0, + "step": 28519 + }, + { + "epoch": 2.661192497900532, + "grad_norm": NaN, + "learning_rate": 0.00018471040030301053, + "loss": 0.0, + "step": 28520 + }, + { + "epoch": 2.661285807595409, + "grad_norm": NaN, + "learning_rate": 0.0001847030417702301, + "loss": 0.0, + "step": 28521 + }, + { + "epoch": 2.6613791172902865, + "grad_norm": NaN, + "learning_rate": 0.00018469568314921013, + "loss": 0.0, + "step": 28522 + }, + { + "epoch": 2.6614724269851635, + "grad_norm": NaN, + "learning_rate": 0.00018468832443996935, + "loss": 0.0, + "step": 28523 + }, + { + "epoch": 2.661565736680041, + "grad_norm": NaN, + "learning_rate": 0.00018468096564252636, + "loss": 0.0, + "step": 28524 + }, + { + "epoch": 2.6616590463749183, + "grad_norm": NaN, + "learning_rate": 0.00018467360675689998, + "loss": 0.0, + "step": 28525 + }, + { + "epoch": 2.6617523560697958, + "grad_norm": NaN, + "learning_rate": 0.00018466624778310894, + "loss": 0.0, + "step": 28526 + }, + { + "epoch": 2.661845665764673, + "grad_norm": NaN, + "learning_rate": 0.00018465888872117183, + "loss": 0.0, + "step": 28527 + }, + { + "epoch": 2.66193897545955, + "grad_norm": NaN, + "learning_rate": 0.00018465152957110743, + "loss": 0.0, + "step": 28528 + }, + { + "epoch": 2.6620322851544276, + "grad_norm": NaN, + "learning_rate": 0.00018464417033293453, + "loss": 0.0, + "step": 28529 + }, + { + "epoch": 2.662125594849305, + "grad_norm": NaN, + "learning_rate": 0.00018463681100667172, + "loss": 0.0, + "step": 28530 + }, + { + "epoch": 2.662218904544182, + "grad_norm": NaN, + "learning_rate": 0.00018462945159233773, + "loss": 0.0, + "step": 28531 + }, + { + "epoch": 2.6623122142390594, + "grad_norm": NaN, + "learning_rate": 0.00018462209208995137, + "loss": 0.0, + "step": 28532 + }, + { + "epoch": 2.662405523933937, + "grad_norm": NaN, + "learning_rate": 0.00018461473249953126, + "loss": 0.0, + "step": 28533 + }, + { + "epoch": 2.6624988336288142, + "grad_norm": NaN, + "learning_rate": 0.0001846073728210961, + "loss": 0.0, + "step": 28534 + }, + { + "epoch": 2.662592143323691, + "grad_norm": NaN, + "learning_rate": 0.00018460001305466468, + "loss": 0.0, + "step": 28535 + }, + { + "epoch": 2.6626854530185686, + "grad_norm": NaN, + "learning_rate": 0.00018459265320025568, + "loss": 0.0, + "step": 28536 + }, + { + "epoch": 2.662778762713446, + "grad_norm": NaN, + "learning_rate": 0.00018458529325788778, + "loss": 0.0, + "step": 28537 + }, + { + "epoch": 2.662872072408323, + "grad_norm": NaN, + "learning_rate": 0.00018457793322757977, + "loss": 0.0, + "step": 28538 + }, + { + "epoch": 2.6629653821032004, + "grad_norm": NaN, + "learning_rate": 0.0001845705731093503, + "loss": 0.0, + "step": 28539 + }, + { + "epoch": 2.663058691798078, + "grad_norm": NaN, + "learning_rate": 0.0001845632129032181, + "loss": 0.0, + "step": 28540 + }, + { + "epoch": 2.6631520014929553, + "grad_norm": NaN, + "learning_rate": 0.00018455585260920184, + "loss": 0.0, + "step": 28541 + }, + { + "epoch": 2.6632453111878323, + "grad_norm": NaN, + "learning_rate": 0.00018454849222732033, + "loss": 0.0, + "step": 28542 + }, + { + "epoch": 2.6633386208827097, + "grad_norm": NaN, + "learning_rate": 0.00018454113175759223, + "loss": 0.0, + "step": 28543 + }, + { + "epoch": 2.663431930577587, + "grad_norm": NaN, + "learning_rate": 0.00018453377120003622, + "loss": 0.0, + "step": 28544 + }, + { + "epoch": 2.663525240272464, + "grad_norm": NaN, + "learning_rate": 0.00018452641055467114, + "loss": 0.0, + "step": 28545 + }, + { + "epoch": 2.6636185499673415, + "grad_norm": NaN, + "learning_rate": 0.0001845190498215156, + "loss": 0.0, + "step": 28546 + }, + { + "epoch": 2.663711859662219, + "grad_norm": NaN, + "learning_rate": 0.0001845116890005883, + "loss": 0.0, + "step": 28547 + }, + { + "epoch": 2.6638051693570963, + "grad_norm": NaN, + "learning_rate": 0.00018450432809190807, + "loss": 0.0, + "step": 28548 + }, + { + "epoch": 2.6638984790519737, + "grad_norm": NaN, + "learning_rate": 0.0001844969670954935, + "loss": 0.0, + "step": 28549 + }, + { + "epoch": 2.6639917887468507, + "grad_norm": NaN, + "learning_rate": 0.00018448960601136338, + "loss": 0.0, + "step": 28550 + }, + { + "epoch": 2.664085098441728, + "grad_norm": NaN, + "learning_rate": 0.00018448224483953645, + "loss": 0.0, + "step": 28551 + }, + { + "epoch": 2.6641784081366056, + "grad_norm": NaN, + "learning_rate": 0.0001844748835800313, + "loss": 0.0, + "step": 28552 + }, + { + "epoch": 2.6642717178314825, + "grad_norm": NaN, + "learning_rate": 0.00018446752223286674, + "loss": 0.0, + "step": 28553 + }, + { + "epoch": 2.66436502752636, + "grad_norm": NaN, + "learning_rate": 0.00018446016079806159, + "loss": 0.0, + "step": 28554 + }, + { + "epoch": 2.6644583372212374, + "grad_norm": NaN, + "learning_rate": 0.00018445279927563438, + "loss": 0.0, + "step": 28555 + }, + { + "epoch": 2.664551646916115, + "grad_norm": NaN, + "learning_rate": 0.00018444543766560386, + "loss": 0.0, + "step": 28556 + }, + { + "epoch": 2.6646449566109918, + "grad_norm": NaN, + "learning_rate": 0.00018443807596798888, + "loss": 0.0, + "step": 28557 + }, + { + "epoch": 2.664738266305869, + "grad_norm": NaN, + "learning_rate": 0.00018443071418280805, + "loss": 0.0, + "step": 28558 + }, + { + "epoch": 2.6648315760007466, + "grad_norm": NaN, + "learning_rate": 0.00018442335231008005, + "loss": 0.0, + "step": 28559 + }, + { + "epoch": 2.6649248856956236, + "grad_norm": NaN, + "learning_rate": 0.00018441599034982374, + "loss": 0.0, + "step": 28560 + }, + { + "epoch": 2.665018195390501, + "grad_norm": NaN, + "learning_rate": 0.00018440862830205776, + "loss": 0.0, + "step": 28561 + }, + { + "epoch": 2.6651115050853784, + "grad_norm": NaN, + "learning_rate": 0.0001844012661668008, + "loss": 0.0, + "step": 28562 + }, + { + "epoch": 2.665204814780256, + "grad_norm": NaN, + "learning_rate": 0.00018439390394407162, + "loss": 0.0, + "step": 28563 + }, + { + "epoch": 2.665298124475133, + "grad_norm": NaN, + "learning_rate": 0.00018438654163388895, + "loss": 0.0, + "step": 28564 + }, + { + "epoch": 2.6653914341700102, + "grad_norm": NaN, + "learning_rate": 0.00018437917923627147, + "loss": 0.0, + "step": 28565 + }, + { + "epoch": 2.6654847438648877, + "grad_norm": NaN, + "learning_rate": 0.0001843718167512379, + "loss": 0.0, + "step": 28566 + }, + { + "epoch": 2.6655780535597646, + "grad_norm": NaN, + "learning_rate": 0.00018436445417880702, + "loss": 0.0, + "step": 28567 + }, + { + "epoch": 2.665671363254642, + "grad_norm": NaN, + "learning_rate": 0.0001843570915189975, + "loss": 0.0, + "step": 28568 + }, + { + "epoch": 2.6657646729495195, + "grad_norm": NaN, + "learning_rate": 0.00018434972877182803, + "loss": 0.0, + "step": 28569 + }, + { + "epoch": 2.665857982644397, + "grad_norm": NaN, + "learning_rate": 0.00018434236593731747, + "loss": 0.0, + "step": 28570 + }, + { + "epoch": 2.6659512923392743, + "grad_norm": NaN, + "learning_rate": 0.00018433500301548436, + "loss": 0.0, + "step": 28571 + }, + { + "epoch": 2.6660446020341513, + "grad_norm": NaN, + "learning_rate": 0.0001843276400063475, + "loss": 0.0, + "step": 28572 + }, + { + "epoch": 2.6661379117290287, + "grad_norm": NaN, + "learning_rate": 0.00018432027690992568, + "loss": 0.0, + "step": 28573 + }, + { + "epoch": 2.6662312214239057, + "grad_norm": NaN, + "learning_rate": 0.00018431291372623756, + "loss": 0.0, + "step": 28574 + }, + { + "epoch": 2.666324531118783, + "grad_norm": NaN, + "learning_rate": 0.00018430555045530181, + "loss": 0.0, + "step": 28575 + }, + { + "epoch": 2.6664178408136605, + "grad_norm": NaN, + "learning_rate": 0.00018429818709713727, + "loss": 0.0, + "step": 28576 + }, + { + "epoch": 2.666511150508538, + "grad_norm": NaN, + "learning_rate": 0.0001842908236517626, + "loss": 0.0, + "step": 28577 + }, + { + "epoch": 2.6666044602034153, + "grad_norm": NaN, + "learning_rate": 0.00018428346011919644, + "loss": 0.0, + "step": 28578 + }, + { + "epoch": 2.6666977698982923, + "grad_norm": NaN, + "learning_rate": 0.00018427609649945766, + "loss": 0.0, + "step": 28579 + }, + { + "epoch": 2.6667910795931697, + "grad_norm": NaN, + "learning_rate": 0.00018426873279256496, + "loss": 0.0, + "step": 28580 + }, + { + "epoch": 2.666884389288047, + "grad_norm": NaN, + "learning_rate": 0.00018426136899853693, + "loss": 0.0, + "step": 28581 + }, + { + "epoch": 2.666977698982924, + "grad_norm": NaN, + "learning_rate": 0.00018425400511739245, + "loss": 0.0, + "step": 28582 + }, + { + "epoch": 2.6670710086778016, + "grad_norm": NaN, + "learning_rate": 0.00018424664114915018, + "loss": 0.0, + "step": 28583 + }, + { + "epoch": 2.667164318372679, + "grad_norm": NaN, + "learning_rate": 0.0001842392770938288, + "loss": 0.0, + "step": 28584 + }, + { + "epoch": 2.6672576280675564, + "grad_norm": NaN, + "learning_rate": 0.0001842319129514471, + "loss": 0.0, + "step": 28585 + }, + { + "epoch": 2.6673509377624334, + "grad_norm": NaN, + "learning_rate": 0.00018422454872202385, + "loss": 0.0, + "step": 28586 + }, + { + "epoch": 2.667444247457311, + "grad_norm": NaN, + "learning_rate": 0.00018421718440557767, + "loss": 0.0, + "step": 28587 + }, + { + "epoch": 2.667537557152188, + "grad_norm": NaN, + "learning_rate": 0.00018420982000212727, + "loss": 0.0, + "step": 28588 + }, + { + "epoch": 2.667630866847065, + "grad_norm": NaN, + "learning_rate": 0.0001842024555116915, + "loss": 0.0, + "step": 28589 + }, + { + "epoch": 2.6677241765419426, + "grad_norm": NaN, + "learning_rate": 0.00018419509093428902, + "loss": 0.0, + "step": 28590 + }, + { + "epoch": 2.66781748623682, + "grad_norm": NaN, + "learning_rate": 0.00018418772626993844, + "loss": 0.0, + "step": 28591 + }, + { + "epoch": 2.6679107959316974, + "grad_norm": NaN, + "learning_rate": 0.00018418036151865875, + "loss": 0.0, + "step": 28592 + }, + { + "epoch": 2.668004105626575, + "grad_norm": NaN, + "learning_rate": 0.00018417299668046846, + "loss": 0.0, + "step": 28593 + }, + { + "epoch": 2.668097415321452, + "grad_norm": NaN, + "learning_rate": 0.00018416563175538634, + "loss": 0.0, + "step": 28594 + }, + { + "epoch": 2.6681907250163293, + "grad_norm": NaN, + "learning_rate": 0.00018415826674343118, + "loss": 0.0, + "step": 28595 + }, + { + "epoch": 2.6682840347112062, + "grad_norm": NaN, + "learning_rate": 0.00018415090164462165, + "loss": 0.0, + "step": 28596 + }, + { + "epoch": 2.6683773444060837, + "grad_norm": NaN, + "learning_rate": 0.00018414353645897645, + "loss": 0.0, + "step": 28597 + }, + { + "epoch": 2.668470654100961, + "grad_norm": NaN, + "learning_rate": 0.0001841361711865144, + "loss": 0.0, + "step": 28598 + }, + { + "epoch": 2.6685639637958385, + "grad_norm": NaN, + "learning_rate": 0.0001841288058272542, + "loss": 0.0, + "step": 28599 + }, + { + "epoch": 2.668657273490716, + "grad_norm": NaN, + "learning_rate": 0.0001841214403812145, + "loss": 0.0, + "step": 28600 + }, + { + "epoch": 2.668750583185593, + "grad_norm": NaN, + "learning_rate": 0.00018411407484841412, + "loss": 0.0, + "step": 28601 + }, + { + "epoch": 2.6688438928804703, + "grad_norm": NaN, + "learning_rate": 0.00018410670922887177, + "loss": 0.0, + "step": 28602 + }, + { + "epoch": 2.6689372025753477, + "grad_norm": NaN, + "learning_rate": 0.0001840993435226061, + "loss": 0.0, + "step": 28603 + }, + { + "epoch": 2.6690305122702247, + "grad_norm": NaN, + "learning_rate": 0.00018409197772963597, + "loss": 0.0, + "step": 28604 + }, + { + "epoch": 2.669123821965102, + "grad_norm": NaN, + "learning_rate": 0.00018408461184998, + "loss": 0.0, + "step": 28605 + }, + { + "epoch": 2.6692171316599795, + "grad_norm": NaN, + "learning_rate": 0.00018407724588365693, + "loss": 0.0, + "step": 28606 + }, + { + "epoch": 2.669310441354857, + "grad_norm": NaN, + "learning_rate": 0.00018406987983068555, + "loss": 0.0, + "step": 28607 + }, + { + "epoch": 2.669403751049734, + "grad_norm": NaN, + "learning_rate": 0.00018406251369108462, + "loss": 0.0, + "step": 28608 + }, + { + "epoch": 2.6694970607446113, + "grad_norm": NaN, + "learning_rate": 0.00018405514746487275, + "loss": 0.0, + "step": 28609 + }, + { + "epoch": 2.6695903704394888, + "grad_norm": NaN, + "learning_rate": 0.00018404778115206868, + "loss": 0.0, + "step": 28610 + }, + { + "epoch": 2.6696836801343657, + "grad_norm": NaN, + "learning_rate": 0.0001840404147526913, + "loss": 0.0, + "step": 28611 + }, + { + "epoch": 2.669776989829243, + "grad_norm": NaN, + "learning_rate": 0.00018403304826675914, + "loss": 0.0, + "step": 28612 + }, + { + "epoch": 2.6698702995241206, + "grad_norm": NaN, + "learning_rate": 0.000184025681694291, + "loss": 0.0, + "step": 28613 + }, + { + "epoch": 2.669963609218998, + "grad_norm": NaN, + "learning_rate": 0.00018401831503530574, + "loss": 0.0, + "step": 28614 + }, + { + "epoch": 2.6700569189138754, + "grad_norm": NaN, + "learning_rate": 0.0001840109482898219, + "loss": 0.0, + "step": 28615 + }, + { + "epoch": 2.6701502286087524, + "grad_norm": NaN, + "learning_rate": 0.00018400358145785828, + "loss": 0.0, + "step": 28616 + }, + { + "epoch": 2.67024353830363, + "grad_norm": NaN, + "learning_rate": 0.00018399621453943363, + "loss": 0.0, + "step": 28617 + }, + { + "epoch": 2.670336847998507, + "grad_norm": NaN, + "learning_rate": 0.00018398884753456675, + "loss": 0.0, + "step": 28618 + }, + { + "epoch": 2.670430157693384, + "grad_norm": NaN, + "learning_rate": 0.0001839814804432762, + "loss": 0.0, + "step": 28619 + }, + { + "epoch": 2.6705234673882616, + "grad_norm": NaN, + "learning_rate": 0.00018397411326558089, + "loss": 0.0, + "step": 28620 + }, + { + "epoch": 2.670616777083139, + "grad_norm": NaN, + "learning_rate": 0.00018396674600149946, + "loss": 0.0, + "step": 28621 + }, + { + "epoch": 2.6707100867780165, + "grad_norm": NaN, + "learning_rate": 0.00018395937865105062, + "loss": 0.0, + "step": 28622 + }, + { + "epoch": 2.6708033964728934, + "grad_norm": NaN, + "learning_rate": 0.00018395201121425314, + "loss": 0.0, + "step": 28623 + }, + { + "epoch": 2.670896706167771, + "grad_norm": NaN, + "learning_rate": 0.00018394464369112583, + "loss": 0.0, + "step": 28624 + }, + { + "epoch": 2.6709900158626483, + "grad_norm": NaN, + "learning_rate": 0.00018393727608168728, + "loss": 0.0, + "step": 28625 + }, + { + "epoch": 2.6710833255575253, + "grad_norm": NaN, + "learning_rate": 0.00018392990838595627, + "loss": 0.0, + "step": 28626 + }, + { + "epoch": 2.6711766352524027, + "grad_norm": NaN, + "learning_rate": 0.00018392254060395162, + "loss": 0.0, + "step": 28627 + }, + { + "epoch": 2.67126994494728, + "grad_norm": NaN, + "learning_rate": 0.00018391517273569192, + "loss": 0.0, + "step": 28628 + }, + { + "epoch": 2.6713632546421575, + "grad_norm": NaN, + "learning_rate": 0.00018390780478119606, + "loss": 0.0, + "step": 28629 + }, + { + "epoch": 2.6714565643370345, + "grad_norm": NaN, + "learning_rate": 0.00018390043674048267, + "loss": 0.0, + "step": 28630 + }, + { + "epoch": 2.671549874031912, + "grad_norm": NaN, + "learning_rate": 0.0001838930686135705, + "loss": 0.0, + "step": 28631 + }, + { + "epoch": 2.6716431837267893, + "grad_norm": NaN, + "learning_rate": 0.00018388570040047826, + "loss": 0.0, + "step": 28632 + }, + { + "epoch": 2.6717364934216663, + "grad_norm": NaN, + "learning_rate": 0.00018387833210122478, + "loss": 0.0, + "step": 28633 + }, + { + "epoch": 2.6718298031165437, + "grad_norm": NaN, + "learning_rate": 0.00018387096371582872, + "loss": 0.0, + "step": 28634 + }, + { + "epoch": 2.671923112811421, + "grad_norm": NaN, + "learning_rate": 0.00018386359524430882, + "loss": 0.0, + "step": 28635 + }, + { + "epoch": 2.6720164225062986, + "grad_norm": NaN, + "learning_rate": 0.0001838562266866838, + "loss": 0.0, + "step": 28636 + }, + { + "epoch": 2.6721097322011755, + "grad_norm": NaN, + "learning_rate": 0.00018384885804297252, + "loss": 0.0, + "step": 28637 + }, + { + "epoch": 2.672203041896053, + "grad_norm": NaN, + "learning_rate": 0.00018384148931319352, + "loss": 0.0, + "step": 28638 + }, + { + "epoch": 2.6722963515909304, + "grad_norm": NaN, + "learning_rate": 0.0001838341204973657, + "loss": 0.0, + "step": 28639 + }, + { + "epoch": 2.6723896612858073, + "grad_norm": NaN, + "learning_rate": 0.00018382675159550773, + "loss": 0.0, + "step": 28640 + }, + { + "epoch": 2.6724829709806848, + "grad_norm": NaN, + "learning_rate": 0.00018381938260763833, + "loss": 0.0, + "step": 28641 + }, + { + "epoch": 2.672576280675562, + "grad_norm": NaN, + "learning_rate": 0.00018381201353377623, + "loss": 0.0, + "step": 28642 + }, + { + "epoch": 2.6726695903704396, + "grad_norm": NaN, + "learning_rate": 0.00018380464437394028, + "loss": 0.0, + "step": 28643 + }, + { + "epoch": 2.672762900065317, + "grad_norm": NaN, + "learning_rate": 0.00018379727512814903, + "loss": 0.0, + "step": 28644 + }, + { + "epoch": 2.672856209760194, + "grad_norm": NaN, + "learning_rate": 0.0001837899057964214, + "loss": 0.0, + "step": 28645 + }, + { + "epoch": 2.6729495194550714, + "grad_norm": NaN, + "learning_rate": 0.00018378253637877604, + "loss": 0.0, + "step": 28646 + }, + { + "epoch": 2.673042829149949, + "grad_norm": NaN, + "learning_rate": 0.00018377516687523166, + "loss": 0.0, + "step": 28647 + }, + { + "epoch": 2.673136138844826, + "grad_norm": NaN, + "learning_rate": 0.00018376779728580705, + "loss": 0.0, + "step": 28648 + }, + { + "epoch": 2.6732294485397032, + "grad_norm": NaN, + "learning_rate": 0.00018376042761052098, + "loss": 0.0, + "step": 28649 + }, + { + "epoch": 2.6733227582345807, + "grad_norm": NaN, + "learning_rate": 0.00018375305784939207, + "loss": 0.0, + "step": 28650 + }, + { + "epoch": 2.673416067929458, + "grad_norm": NaN, + "learning_rate": 0.00018374568800243914, + "loss": 0.0, + "step": 28651 + }, + { + "epoch": 2.673509377624335, + "grad_norm": NaN, + "learning_rate": 0.00018373831806968097, + "loss": 0.0, + "step": 28652 + }, + { + "epoch": 2.6736026873192125, + "grad_norm": NaN, + "learning_rate": 0.00018373094805113627, + "loss": 0.0, + "step": 28653 + }, + { + "epoch": 2.67369599701409, + "grad_norm": NaN, + "learning_rate": 0.00018372357794682365, + "loss": 0.0, + "step": 28654 + }, + { + "epoch": 2.673789306708967, + "grad_norm": NaN, + "learning_rate": 0.00018371620775676204, + "loss": 0.0, + "step": 28655 + }, + { + "epoch": 2.6738826164038443, + "grad_norm": NaN, + "learning_rate": 0.00018370883748097014, + "loss": 0.0, + "step": 28656 + }, + { + "epoch": 2.6739759260987217, + "grad_norm": NaN, + "learning_rate": 0.00018370146711946654, + "loss": 0.0, + "step": 28657 + }, + { + "epoch": 2.674069235793599, + "grad_norm": NaN, + "learning_rate": 0.00018369409667227016, + "loss": 0.0, + "step": 28658 + }, + { + "epoch": 2.674162545488476, + "grad_norm": NaN, + "learning_rate": 0.0001836867261393997, + "loss": 0.0, + "step": 28659 + }, + { + "epoch": 2.6742558551833535, + "grad_norm": NaN, + "learning_rate": 0.0001836793555208738, + "loss": 0.0, + "step": 28660 + }, + { + "epoch": 2.674349164878231, + "grad_norm": NaN, + "learning_rate": 0.00018367198481671133, + "loss": 0.0, + "step": 28661 + }, + { + "epoch": 2.674442474573108, + "grad_norm": NaN, + "learning_rate": 0.000183664614026931, + "loss": 0.0, + "step": 28662 + }, + { + "epoch": 2.6745357842679853, + "grad_norm": NaN, + "learning_rate": 0.00018365724315155146, + "loss": 0.0, + "step": 28663 + }, + { + "epoch": 2.6746290939628627, + "grad_norm": NaN, + "learning_rate": 0.00018364987219059156, + "loss": 0.0, + "step": 28664 + }, + { + "epoch": 2.67472240365774, + "grad_norm": NaN, + "learning_rate": 0.00018364250114407004, + "loss": 0.0, + "step": 28665 + }, + { + "epoch": 2.6748157133526176, + "grad_norm": NaN, + "learning_rate": 0.0001836351300120055, + "loss": 0.0, + "step": 28666 + }, + { + "epoch": 2.6749090230474946, + "grad_norm": NaN, + "learning_rate": 0.00018362775879441685, + "loss": 0.0, + "step": 28667 + }, + { + "epoch": 2.675002332742372, + "grad_norm": NaN, + "learning_rate": 0.0001836203874913228, + "loss": 0.0, + "step": 28668 + }, + { + "epoch": 2.675095642437249, + "grad_norm": NaN, + "learning_rate": 0.000183613016102742, + "loss": 0.0, + "step": 28669 + }, + { + "epoch": 2.6751889521321264, + "grad_norm": NaN, + "learning_rate": 0.0001836056446286933, + "loss": 0.0, + "step": 28670 + }, + { + "epoch": 2.675282261827004, + "grad_norm": NaN, + "learning_rate": 0.00018359827306919542, + "loss": 0.0, + "step": 28671 + }, + { + "epoch": 2.675375571521881, + "grad_norm": NaN, + "learning_rate": 0.00018359090142426704, + "loss": 0.0, + "step": 28672 + }, + { + "epoch": 2.6754688812167586, + "grad_norm": NaN, + "learning_rate": 0.00018358352969392695, + "loss": 0.0, + "step": 28673 + }, + { + "epoch": 2.6755621909116356, + "grad_norm": NaN, + "learning_rate": 0.0001835761578781939, + "loss": 0.0, + "step": 28674 + }, + { + "epoch": 2.675655500606513, + "grad_norm": NaN, + "learning_rate": 0.0001835687859770867, + "loss": 0.0, + "step": 28675 + }, + { + "epoch": 2.6757488103013904, + "grad_norm": NaN, + "learning_rate": 0.00018356141399062387, + "loss": 0.0, + "step": 28676 + }, + { + "epoch": 2.6758421199962674, + "grad_norm": NaN, + "learning_rate": 0.0001835540419188244, + "loss": 0.0, + "step": 28677 + }, + { + "epoch": 2.675935429691145, + "grad_norm": NaN, + "learning_rate": 0.00018354666976170696, + "loss": 0.0, + "step": 28678 + }, + { + "epoch": 2.6760287393860223, + "grad_norm": NaN, + "learning_rate": 0.00018353929751929022, + "loss": 0.0, + "step": 28679 + }, + { + "epoch": 2.6761220490808997, + "grad_norm": NaN, + "learning_rate": 0.000183531925191593, + "loss": 0.0, + "step": 28680 + }, + { + "epoch": 2.6762153587757767, + "grad_norm": NaN, + "learning_rate": 0.00018352455277863408, + "loss": 0.0, + "step": 28681 + }, + { + "epoch": 2.676308668470654, + "grad_norm": NaN, + "learning_rate": 0.00018351718028043206, + "loss": 0.0, + "step": 28682 + }, + { + "epoch": 2.6764019781655315, + "grad_norm": NaN, + "learning_rate": 0.00018350980769700582, + "loss": 0.0, + "step": 28683 + }, + { + "epoch": 2.6764952878604085, + "grad_norm": NaN, + "learning_rate": 0.00018350243502837413, + "loss": 0.0, + "step": 28684 + }, + { + "epoch": 2.676588597555286, + "grad_norm": NaN, + "learning_rate": 0.00018349506227455557, + "loss": 0.0, + "step": 28685 + }, + { + "epoch": 2.6766819072501633, + "grad_norm": NaN, + "learning_rate": 0.00018348768943556905, + "loss": 0.0, + "step": 28686 + }, + { + "epoch": 2.6767752169450407, + "grad_norm": NaN, + "learning_rate": 0.00018348031651143324, + "loss": 0.0, + "step": 28687 + }, + { + "epoch": 2.676868526639918, + "grad_norm": NaN, + "learning_rate": 0.00018347294350216686, + "loss": 0.0, + "step": 28688 + }, + { + "epoch": 2.676961836334795, + "grad_norm": NaN, + "learning_rate": 0.00018346557040778877, + "loss": 0.0, + "step": 28689 + }, + { + "epoch": 2.6770551460296725, + "grad_norm": NaN, + "learning_rate": 0.00018345819722831764, + "loss": 0.0, + "step": 28690 + }, + { + "epoch": 2.6771484557245495, + "grad_norm": NaN, + "learning_rate": 0.00018345082396377217, + "loss": 0.0, + "step": 28691 + }, + { + "epoch": 2.677241765419427, + "grad_norm": NaN, + "learning_rate": 0.0001834434506141712, + "loss": 0.0, + "step": 28692 + }, + { + "epoch": 2.6773350751143044, + "grad_norm": NaN, + "learning_rate": 0.00018343607717953345, + "loss": 0.0, + "step": 28693 + }, + { + "epoch": 2.6774283848091818, + "grad_norm": NaN, + "learning_rate": 0.00018342870365987765, + "loss": 0.0, + "step": 28694 + }, + { + "epoch": 2.677521694504059, + "grad_norm": NaN, + "learning_rate": 0.00018342133005522254, + "loss": 0.0, + "step": 28695 + }, + { + "epoch": 2.677615004198936, + "grad_norm": NaN, + "learning_rate": 0.0001834139563655869, + "loss": 0.0, + "step": 28696 + }, + { + "epoch": 2.6777083138938136, + "grad_norm": NaN, + "learning_rate": 0.00018340658259098953, + "loss": 0.0, + "step": 28697 + }, + { + "epoch": 2.677801623588691, + "grad_norm": NaN, + "learning_rate": 0.00018339920873144903, + "loss": 0.0, + "step": 28698 + }, + { + "epoch": 2.677894933283568, + "grad_norm": NaN, + "learning_rate": 0.00018339183478698426, + "loss": 0.0, + "step": 28699 + }, + { + "epoch": 2.6779882429784454, + "grad_norm": NaN, + "learning_rate": 0.00018338446075761395, + "loss": 0.0, + "step": 28700 + }, + { + "epoch": 2.678081552673323, + "grad_norm": NaN, + "learning_rate": 0.00018337708664335685, + "loss": 0.0, + "step": 28701 + }, + { + "epoch": 2.6781748623682002, + "grad_norm": NaN, + "learning_rate": 0.0001833697124442317, + "loss": 0.0, + "step": 28702 + }, + { + "epoch": 2.678268172063077, + "grad_norm": NaN, + "learning_rate": 0.0001833623381602573, + "loss": 0.0, + "step": 28703 + }, + { + "epoch": 2.6783614817579546, + "grad_norm": NaN, + "learning_rate": 0.00018335496379145224, + "loss": 0.0, + "step": 28704 + }, + { + "epoch": 2.678454791452832, + "grad_norm": NaN, + "learning_rate": 0.0001833475893378355, + "loss": 0.0, + "step": 28705 + }, + { + "epoch": 2.678548101147709, + "grad_norm": NaN, + "learning_rate": 0.00018334021479942569, + "loss": 0.0, + "step": 28706 + }, + { + "epoch": 2.6786414108425864, + "grad_norm": NaN, + "learning_rate": 0.00018333284017624154, + "loss": 0.0, + "step": 28707 + }, + { + "epoch": 2.678734720537464, + "grad_norm": NaN, + "learning_rate": 0.00018332546546830192, + "loss": 0.0, + "step": 28708 + }, + { + "epoch": 2.6788280302323413, + "grad_norm": NaN, + "learning_rate": 0.0001833180906756255, + "loss": 0.0, + "step": 28709 + }, + { + "epoch": 2.6789213399272187, + "grad_norm": NaN, + "learning_rate": 0.000183310715798231, + "loss": 0.0, + "step": 28710 + }, + { + "epoch": 2.6790146496220957, + "grad_norm": NaN, + "learning_rate": 0.00018330334083613726, + "loss": 0.0, + "step": 28711 + }, + { + "epoch": 2.679107959316973, + "grad_norm": NaN, + "learning_rate": 0.00018329596578936297, + "loss": 0.0, + "step": 28712 + }, + { + "epoch": 2.67920126901185, + "grad_norm": NaN, + "learning_rate": 0.0001832885906579269, + "loss": 0.0, + "step": 28713 + }, + { + "epoch": 2.6792945787067275, + "grad_norm": NaN, + "learning_rate": 0.00018328121544184783, + "loss": 0.0, + "step": 28714 + }, + { + "epoch": 2.679387888401605, + "grad_norm": NaN, + "learning_rate": 0.00018327384014114444, + "loss": 0.0, + "step": 28715 + }, + { + "epoch": 2.6794811980964823, + "grad_norm": NaN, + "learning_rate": 0.00018326646475583559, + "loss": 0.0, + "step": 28716 + }, + { + "epoch": 2.6795745077913597, + "grad_norm": NaN, + "learning_rate": 0.00018325908928593993, + "loss": 0.0, + "step": 28717 + }, + { + "epoch": 2.6796678174862367, + "grad_norm": NaN, + "learning_rate": 0.0001832517137314763, + "loss": 0.0, + "step": 28718 + }, + { + "epoch": 2.679761127181114, + "grad_norm": NaN, + "learning_rate": 0.00018324433809246336, + "loss": 0.0, + "step": 28719 + }, + { + "epoch": 2.6798544368759916, + "grad_norm": NaN, + "learning_rate": 0.0001832369623689199, + "loss": 0.0, + "step": 28720 + }, + { + "epoch": 2.6799477465708685, + "grad_norm": NaN, + "learning_rate": 0.00018322958656086477, + "loss": 0.0, + "step": 28721 + }, + { + "epoch": 2.680041056265746, + "grad_norm": NaN, + "learning_rate": 0.00018322221066831661, + "loss": 0.0, + "step": 28722 + }, + { + "epoch": 2.6801343659606234, + "grad_norm": NaN, + "learning_rate": 0.0001832148346912942, + "loss": 0.0, + "step": 28723 + }, + { + "epoch": 2.680227675655501, + "grad_norm": NaN, + "learning_rate": 0.0001832074586298163, + "loss": 0.0, + "step": 28724 + }, + { + "epoch": 2.6803209853503778, + "grad_norm": NaN, + "learning_rate": 0.00018320008248390174, + "loss": 0.0, + "step": 28725 + }, + { + "epoch": 2.680414295045255, + "grad_norm": NaN, + "learning_rate": 0.0001831927062535691, + "loss": 0.0, + "step": 28726 + }, + { + "epoch": 2.6805076047401326, + "grad_norm": NaN, + "learning_rate": 0.00018318532993883728, + "loss": 0.0, + "step": 28727 + }, + { + "epoch": 2.6806009144350096, + "grad_norm": NaN, + "learning_rate": 0.00018317795353972501, + "loss": 0.0, + "step": 28728 + }, + { + "epoch": 2.680694224129887, + "grad_norm": NaN, + "learning_rate": 0.000183170577056251, + "loss": 0.0, + "step": 28729 + }, + { + "epoch": 2.6807875338247644, + "grad_norm": NaN, + "learning_rate": 0.00018316320048843406, + "loss": 0.0, + "step": 28730 + }, + { + "epoch": 2.680880843519642, + "grad_norm": NaN, + "learning_rate": 0.00018315582383629294, + "loss": 0.0, + "step": 28731 + }, + { + "epoch": 2.6809741532145193, + "grad_norm": NaN, + "learning_rate": 0.00018314844709984635, + "loss": 0.0, + "step": 28732 + }, + { + "epoch": 2.6810674629093962, + "grad_norm": NaN, + "learning_rate": 0.0001831410702791131, + "loss": 0.0, + "step": 28733 + }, + { + "epoch": 2.6811607726042737, + "grad_norm": NaN, + "learning_rate": 0.00018313369337411193, + "loss": 0.0, + "step": 28734 + }, + { + "epoch": 2.6812540822991506, + "grad_norm": NaN, + "learning_rate": 0.0001831263163848616, + "loss": 0.0, + "step": 28735 + }, + { + "epoch": 2.681347391994028, + "grad_norm": NaN, + "learning_rate": 0.00018311893931138084, + "loss": 0.0, + "step": 28736 + }, + { + "epoch": 2.6814407016889055, + "grad_norm": NaN, + "learning_rate": 0.00018311156215368844, + "loss": 0.0, + "step": 28737 + }, + { + "epoch": 2.681534011383783, + "grad_norm": NaN, + "learning_rate": 0.00018310418491180314, + "loss": 0.0, + "step": 28738 + }, + { + "epoch": 2.6816273210786603, + "grad_norm": NaN, + "learning_rate": 0.0001830968075857437, + "loss": 0.0, + "step": 28739 + }, + { + "epoch": 2.6817206307735373, + "grad_norm": NaN, + "learning_rate": 0.00018308943017552887, + "loss": 0.0, + "step": 28740 + }, + { + "epoch": 2.6818139404684147, + "grad_norm": NaN, + "learning_rate": 0.00018308205268117747, + "loss": 0.0, + "step": 28741 + }, + { + "epoch": 2.681907250163292, + "grad_norm": NaN, + "learning_rate": 0.00018307467510270814, + "loss": 0.0, + "step": 28742 + }, + { + "epoch": 2.682000559858169, + "grad_norm": NaN, + "learning_rate": 0.00018306729744013974, + "loss": 0.0, + "step": 28743 + }, + { + "epoch": 2.6820938695530465, + "grad_norm": NaN, + "learning_rate": 0.00018305991969349102, + "loss": 0.0, + "step": 28744 + }, + { + "epoch": 2.682187179247924, + "grad_norm": NaN, + "learning_rate": 0.0001830525418627807, + "loss": 0.0, + "step": 28745 + }, + { + "epoch": 2.6822804889428014, + "grad_norm": NaN, + "learning_rate": 0.00018304516394802752, + "loss": 0.0, + "step": 28746 + }, + { + "epoch": 2.6823737986376783, + "grad_norm": NaN, + "learning_rate": 0.00018303778594925036, + "loss": 0.0, + "step": 28747 + }, + { + "epoch": 2.6824671083325557, + "grad_norm": NaN, + "learning_rate": 0.0001830304078664678, + "loss": 0.0, + "step": 28748 + }, + { + "epoch": 2.682560418027433, + "grad_norm": NaN, + "learning_rate": 0.00018302302969969877, + "loss": 0.0, + "step": 28749 + }, + { + "epoch": 2.68265372772231, + "grad_norm": NaN, + "learning_rate": 0.00018301565144896196, + "loss": 0.0, + "step": 28750 + }, + { + "epoch": 2.6827470374171876, + "grad_norm": NaN, + "learning_rate": 0.00018300827311427606, + "loss": 0.0, + "step": 28751 + }, + { + "epoch": 2.682840347112065, + "grad_norm": NaN, + "learning_rate": 0.00018300089469565998, + "loss": 0.0, + "step": 28752 + }, + { + "epoch": 2.6829336568069424, + "grad_norm": NaN, + "learning_rate": 0.00018299351619313235, + "loss": 0.0, + "step": 28753 + }, + { + "epoch": 2.6830269665018194, + "grad_norm": NaN, + "learning_rate": 0.00018298613760671197, + "loss": 0.0, + "step": 28754 + }, + { + "epoch": 2.683120276196697, + "grad_norm": NaN, + "learning_rate": 0.00018297875893641765, + "loss": 0.0, + "step": 28755 + }, + { + "epoch": 2.683213585891574, + "grad_norm": NaN, + "learning_rate": 0.00018297138018226808, + "loss": 0.0, + "step": 28756 + }, + { + "epoch": 2.683306895586451, + "grad_norm": NaN, + "learning_rate": 0.00018296400134428209, + "loss": 0.0, + "step": 28757 + }, + { + "epoch": 2.6834002052813286, + "grad_norm": NaN, + "learning_rate": 0.00018295662242247843, + "loss": 0.0, + "step": 28758 + }, + { + "epoch": 2.683493514976206, + "grad_norm": NaN, + "learning_rate": 0.0001829492434168758, + "loss": 0.0, + "step": 28759 + }, + { + "epoch": 2.6835868246710834, + "grad_norm": NaN, + "learning_rate": 0.000182941864327493, + "loss": 0.0, + "step": 28760 + }, + { + "epoch": 2.683680134365961, + "grad_norm": NaN, + "learning_rate": 0.00018293448515434882, + "loss": 0.0, + "step": 28761 + }, + { + "epoch": 2.683773444060838, + "grad_norm": NaN, + "learning_rate": 0.000182927105897462, + "loss": 0.0, + "step": 28762 + }, + { + "epoch": 2.6838667537557153, + "grad_norm": NaN, + "learning_rate": 0.0001829197265568513, + "loss": 0.0, + "step": 28763 + }, + { + "epoch": 2.6839600634505927, + "grad_norm": NaN, + "learning_rate": 0.0001829123471325355, + "loss": 0.0, + "step": 28764 + }, + { + "epoch": 2.6840533731454697, + "grad_norm": NaN, + "learning_rate": 0.00018290496762453327, + "loss": 0.0, + "step": 28765 + }, + { + "epoch": 2.684146682840347, + "grad_norm": NaN, + "learning_rate": 0.00018289758803286354, + "loss": 0.0, + "step": 28766 + }, + { + "epoch": 2.6842399925352245, + "grad_norm": NaN, + "learning_rate": 0.00018289020835754498, + "loss": 0.0, + "step": 28767 + }, + { + "epoch": 2.684333302230102, + "grad_norm": NaN, + "learning_rate": 0.00018288282859859635, + "loss": 0.0, + "step": 28768 + }, + { + "epoch": 2.684426611924979, + "grad_norm": NaN, + "learning_rate": 0.0001828754487560364, + "loss": 0.0, + "step": 28769 + }, + { + "epoch": 2.6845199216198563, + "grad_norm": NaN, + "learning_rate": 0.00018286806882988397, + "loss": 0.0, + "step": 28770 + }, + { + "epoch": 2.6846132313147337, + "grad_norm": NaN, + "learning_rate": 0.00018286068882015775, + "loss": 0.0, + "step": 28771 + }, + { + "epoch": 2.6847065410096107, + "grad_norm": NaN, + "learning_rate": 0.00018285330872687655, + "loss": 0.0, + "step": 28772 + }, + { + "epoch": 2.684799850704488, + "grad_norm": NaN, + "learning_rate": 0.0001828459285500591, + "loss": 0.0, + "step": 28773 + }, + { + "epoch": 2.6848931603993655, + "grad_norm": NaN, + "learning_rate": 0.00018283854828972417, + "loss": 0.0, + "step": 28774 + }, + { + "epoch": 2.684986470094243, + "grad_norm": NaN, + "learning_rate": 0.00018283116794589056, + "loss": 0.0, + "step": 28775 + }, + { + "epoch": 2.68507977978912, + "grad_norm": NaN, + "learning_rate": 0.00018282378751857702, + "loss": 0.0, + "step": 28776 + }, + { + "epoch": 2.6851730894839974, + "grad_norm": NaN, + "learning_rate": 0.0001828164070078023, + "loss": 0.0, + "step": 28777 + }, + { + "epoch": 2.6852663991788748, + "grad_norm": NaN, + "learning_rate": 0.00018280902641358522, + "loss": 0.0, + "step": 28778 + }, + { + "epoch": 2.6853597088737517, + "grad_norm": NaN, + "learning_rate": 0.0001828016457359444, + "loss": 0.0, + "step": 28779 + }, + { + "epoch": 2.685453018568629, + "grad_norm": NaN, + "learning_rate": 0.0001827942649748988, + "loss": 0.0, + "step": 28780 + }, + { + "epoch": 2.6855463282635066, + "grad_norm": NaN, + "learning_rate": 0.0001827868841304671, + "loss": 0.0, + "step": 28781 + }, + { + "epoch": 2.685639637958384, + "grad_norm": NaN, + "learning_rate": 0.00018277950320266803, + "loss": 0.0, + "step": 28782 + }, + { + "epoch": 2.6857329476532614, + "grad_norm": NaN, + "learning_rate": 0.00018277212219152043, + "loss": 0.0, + "step": 28783 + }, + { + "epoch": 2.6858262573481384, + "grad_norm": NaN, + "learning_rate": 0.000182764741097043, + "loss": 0.0, + "step": 28784 + }, + { + "epoch": 2.685919567043016, + "grad_norm": NaN, + "learning_rate": 0.00018275735991925455, + "loss": 0.0, + "step": 28785 + }, + { + "epoch": 2.686012876737893, + "grad_norm": NaN, + "learning_rate": 0.00018274997865817382, + "loss": 0.0, + "step": 28786 + }, + { + "epoch": 2.68610618643277, + "grad_norm": NaN, + "learning_rate": 0.00018274259731381961, + "loss": 0.0, + "step": 28787 + }, + { + "epoch": 2.6861994961276476, + "grad_norm": NaN, + "learning_rate": 0.0001827352158862107, + "loss": 0.0, + "step": 28788 + }, + { + "epoch": 2.686292805822525, + "grad_norm": NaN, + "learning_rate": 0.0001827278343753658, + "loss": 0.0, + "step": 28789 + }, + { + "epoch": 2.6863861155174025, + "grad_norm": NaN, + "learning_rate": 0.0001827204527813037, + "loss": 0.0, + "step": 28790 + }, + { + "epoch": 2.6864794252122794, + "grad_norm": NaN, + "learning_rate": 0.0001827130711040432, + "loss": 0.0, + "step": 28791 + }, + { + "epoch": 2.686572734907157, + "grad_norm": NaN, + "learning_rate": 0.00018270568934360305, + "loss": 0.0, + "step": 28792 + }, + { + "epoch": 2.6866660446020343, + "grad_norm": NaN, + "learning_rate": 0.00018269830750000202, + "loss": 0.0, + "step": 28793 + }, + { + "epoch": 2.6867593542969113, + "grad_norm": NaN, + "learning_rate": 0.00018269092557325888, + "loss": 0.0, + "step": 28794 + }, + { + "epoch": 2.6868526639917887, + "grad_norm": NaN, + "learning_rate": 0.0001826835435633924, + "loss": 0.0, + "step": 28795 + }, + { + "epoch": 2.686945973686666, + "grad_norm": NaN, + "learning_rate": 0.00018267616147042137, + "loss": 0.0, + "step": 28796 + }, + { + "epoch": 2.6870392833815435, + "grad_norm": NaN, + "learning_rate": 0.0001826687792943645, + "loss": 0.0, + "step": 28797 + }, + { + "epoch": 2.6871325930764205, + "grad_norm": NaN, + "learning_rate": 0.00018266139703524065, + "loss": 0.0, + "step": 28798 + }, + { + "epoch": 2.687225902771298, + "grad_norm": NaN, + "learning_rate": 0.00018265401469306852, + "loss": 0.0, + "step": 28799 + }, + { + "epoch": 2.6873192124661753, + "grad_norm": NaN, + "learning_rate": 0.0001826466322678669, + "loss": 0.0, + "step": 28800 + }, + { + "epoch": 2.6874125221610523, + "grad_norm": NaN, + "learning_rate": 0.00018263924975965454, + "loss": 0.0, + "step": 28801 + }, + { + "epoch": 2.6875058318559297, + "grad_norm": NaN, + "learning_rate": 0.00018263186716845024, + "loss": 0.0, + "step": 28802 + }, + { + "epoch": 2.687599141550807, + "grad_norm": NaN, + "learning_rate": 0.00018262448449427282, + "loss": 0.0, + "step": 28803 + }, + { + "epoch": 2.6876924512456846, + "grad_norm": NaN, + "learning_rate": 0.00018261710173714095, + "loss": 0.0, + "step": 28804 + }, + { + "epoch": 2.687785760940562, + "grad_norm": NaN, + "learning_rate": 0.0001826097188970735, + "loss": 0.0, + "step": 28805 + }, + { + "epoch": 2.687879070635439, + "grad_norm": NaN, + "learning_rate": 0.00018260233597408915, + "loss": 0.0, + "step": 28806 + }, + { + "epoch": 2.6879723803303164, + "grad_norm": NaN, + "learning_rate": 0.00018259495296820674, + "loss": 0.0, + "step": 28807 + }, + { + "epoch": 2.6880656900251934, + "grad_norm": NaN, + "learning_rate": 0.00018258756987944502, + "loss": 0.0, + "step": 28808 + }, + { + "epoch": 2.6881589997200708, + "grad_norm": NaN, + "learning_rate": 0.00018258018670782274, + "loss": 0.0, + "step": 28809 + }, + { + "epoch": 2.688252309414948, + "grad_norm": NaN, + "learning_rate": 0.00018257280345335868, + "loss": 0.0, + "step": 28810 + }, + { + "epoch": 2.6883456191098256, + "grad_norm": NaN, + "learning_rate": 0.0001825654201160717, + "loss": 0.0, + "step": 28811 + }, + { + "epoch": 2.688438928804703, + "grad_norm": NaN, + "learning_rate": 0.00018255803669598044, + "loss": 0.0, + "step": 28812 + }, + { + "epoch": 2.68853223849958, + "grad_norm": NaN, + "learning_rate": 0.00018255065319310376, + "loss": 0.0, + "step": 28813 + }, + { + "epoch": 2.6886255481944574, + "grad_norm": NaN, + "learning_rate": 0.0001825432696074604, + "loss": 0.0, + "step": 28814 + }, + { + "epoch": 2.688718857889335, + "grad_norm": NaN, + "learning_rate": 0.00018253588593906918, + "loss": 0.0, + "step": 28815 + }, + { + "epoch": 2.688812167584212, + "grad_norm": NaN, + "learning_rate": 0.0001825285021879488, + "loss": 0.0, + "step": 28816 + }, + { + "epoch": 2.6889054772790892, + "grad_norm": NaN, + "learning_rate": 0.00018252111835411811, + "loss": 0.0, + "step": 28817 + }, + { + "epoch": 2.6889987869739667, + "grad_norm": NaN, + "learning_rate": 0.00018251373443759581, + "loss": 0.0, + "step": 28818 + }, + { + "epoch": 2.689092096668844, + "grad_norm": NaN, + "learning_rate": 0.00018250635043840074, + "loss": 0.0, + "step": 28819 + }, + { + "epoch": 2.689185406363721, + "grad_norm": NaN, + "learning_rate": 0.00018249896635655162, + "loss": 0.0, + "step": 28820 + }, + { + "epoch": 2.6892787160585985, + "grad_norm": NaN, + "learning_rate": 0.0001824915821920673, + "loss": 0.0, + "step": 28821 + }, + { + "epoch": 2.689372025753476, + "grad_norm": NaN, + "learning_rate": 0.00018248419794496653, + "loss": 0.0, + "step": 28822 + }, + { + "epoch": 2.689465335448353, + "grad_norm": NaN, + "learning_rate": 0.000182476813615268, + "loss": 0.0, + "step": 28823 + }, + { + "epoch": 2.6895586451432303, + "grad_norm": NaN, + "learning_rate": 0.00018246942920299058, + "loss": 0.0, + "step": 28824 + }, + { + "epoch": 2.6896519548381077, + "grad_norm": NaN, + "learning_rate": 0.00018246204470815302, + "loss": 0.0, + "step": 28825 + }, + { + "epoch": 2.689745264532985, + "grad_norm": NaN, + "learning_rate": 0.0001824546601307741, + "loss": 0.0, + "step": 28826 + }, + { + "epoch": 2.6898385742278625, + "grad_norm": NaN, + "learning_rate": 0.0001824472754708726, + "loss": 0.0, + "step": 28827 + }, + { + "epoch": 2.6899318839227395, + "grad_norm": NaN, + "learning_rate": 0.00018243989072846728, + "loss": 0.0, + "step": 28828 + }, + { + "epoch": 2.690025193617617, + "grad_norm": NaN, + "learning_rate": 0.00018243250590357693, + "loss": 0.0, + "step": 28829 + }, + { + "epoch": 2.690118503312494, + "grad_norm": NaN, + "learning_rate": 0.00018242512099622032, + "loss": 0.0, + "step": 28830 + }, + { + "epoch": 2.6902118130073713, + "grad_norm": NaN, + "learning_rate": 0.00018241773600641623, + "loss": 0.0, + "step": 28831 + }, + { + "epoch": 2.6903051227022488, + "grad_norm": NaN, + "learning_rate": 0.00018241035093418347, + "loss": 0.0, + "step": 28832 + }, + { + "epoch": 2.690398432397126, + "grad_norm": NaN, + "learning_rate": 0.00018240296577954078, + "loss": 0.0, + "step": 28833 + }, + { + "epoch": 2.6904917420920036, + "grad_norm": NaN, + "learning_rate": 0.0001823955805425069, + "loss": 0.0, + "step": 28834 + }, + { + "epoch": 2.6905850517868806, + "grad_norm": NaN, + "learning_rate": 0.0001823881952231007, + "loss": 0.0, + "step": 28835 + }, + { + "epoch": 2.690678361481758, + "grad_norm": NaN, + "learning_rate": 0.0001823808098213409, + "loss": 0.0, + "step": 28836 + }, + { + "epoch": 2.6907716711766354, + "grad_norm": NaN, + "learning_rate": 0.00018237342433724627, + "loss": 0.0, + "step": 28837 + }, + { + "epoch": 2.6908649808715124, + "grad_norm": NaN, + "learning_rate": 0.00018236603877083563, + "loss": 0.0, + "step": 28838 + }, + { + "epoch": 2.69095829056639, + "grad_norm": NaN, + "learning_rate": 0.00018235865312212776, + "loss": 0.0, + "step": 28839 + }, + { + "epoch": 2.691051600261267, + "grad_norm": NaN, + "learning_rate": 0.00018235126739114138, + "loss": 0.0, + "step": 28840 + }, + { + "epoch": 2.6911449099561446, + "grad_norm": NaN, + "learning_rate": 0.00018234388157789536, + "loss": 0.0, + "step": 28841 + }, + { + "epoch": 2.6912382196510216, + "grad_norm": NaN, + "learning_rate": 0.0001823364956824084, + "loss": 0.0, + "step": 28842 + }, + { + "epoch": 2.691331529345899, + "grad_norm": NaN, + "learning_rate": 0.00018232910970469933, + "loss": 0.0, + "step": 28843 + }, + { + "epoch": 2.6914248390407765, + "grad_norm": NaN, + "learning_rate": 0.00018232172364478688, + "loss": 0.0, + "step": 28844 + }, + { + "epoch": 2.6915181487356534, + "grad_norm": NaN, + "learning_rate": 0.00018231433750268987, + "loss": 0.0, + "step": 28845 + }, + { + "epoch": 2.691611458430531, + "grad_norm": NaN, + "learning_rate": 0.0001823069512784271, + "loss": 0.0, + "step": 28846 + }, + { + "epoch": 2.6917047681254083, + "grad_norm": NaN, + "learning_rate": 0.00018229956497201727, + "loss": 0.0, + "step": 28847 + }, + { + "epoch": 2.6917980778202857, + "grad_norm": NaN, + "learning_rate": 0.00018229217858347927, + "loss": 0.0, + "step": 28848 + }, + { + "epoch": 2.6918913875151627, + "grad_norm": NaN, + "learning_rate": 0.00018228479211283182, + "loss": 0.0, + "step": 28849 + }, + { + "epoch": 2.69198469721004, + "grad_norm": NaN, + "learning_rate": 0.00018227740556009368, + "loss": 0.0, + "step": 28850 + }, + { + "epoch": 2.6920780069049175, + "grad_norm": NaN, + "learning_rate": 0.00018227001892528368, + "loss": 0.0, + "step": 28851 + }, + { + "epoch": 2.6921713165997945, + "grad_norm": NaN, + "learning_rate": 0.00018226263220842055, + "loss": 0.0, + "step": 28852 + }, + { + "epoch": 2.692264626294672, + "grad_norm": NaN, + "learning_rate": 0.00018225524540952314, + "loss": 0.0, + "step": 28853 + }, + { + "epoch": 2.6923579359895493, + "grad_norm": NaN, + "learning_rate": 0.00018224785852861015, + "loss": 0.0, + "step": 28854 + }, + { + "epoch": 2.6924512456844267, + "grad_norm": NaN, + "learning_rate": 0.00018224047156570046, + "loss": 0.0, + "step": 28855 + }, + { + "epoch": 2.692544555379304, + "grad_norm": NaN, + "learning_rate": 0.00018223308452081275, + "loss": 0.0, + "step": 28856 + }, + { + "epoch": 2.692637865074181, + "grad_norm": NaN, + "learning_rate": 0.0001822256973939659, + "loss": 0.0, + "step": 28857 + }, + { + "epoch": 2.6927311747690585, + "grad_norm": NaN, + "learning_rate": 0.00018221831018517862, + "loss": 0.0, + "step": 28858 + }, + { + "epoch": 2.692824484463936, + "grad_norm": NaN, + "learning_rate": 0.00018221092289446975, + "loss": 0.0, + "step": 28859 + }, + { + "epoch": 2.692917794158813, + "grad_norm": NaN, + "learning_rate": 0.000182203535521858, + "loss": 0.0, + "step": 28860 + }, + { + "epoch": 2.6930111038536904, + "grad_norm": NaN, + "learning_rate": 0.00018219614806736225, + "loss": 0.0, + "step": 28861 + }, + { + "epoch": 2.6931044135485678, + "grad_norm": NaN, + "learning_rate": 0.0001821887605310012, + "loss": 0.0, + "step": 28862 + }, + { + "epoch": 2.693197723243445, + "grad_norm": NaN, + "learning_rate": 0.00018218137291279364, + "loss": 0.0, + "step": 28863 + }, + { + "epoch": 2.693291032938322, + "grad_norm": NaN, + "learning_rate": 0.00018217398521275843, + "loss": 0.0, + "step": 28864 + }, + { + "epoch": 2.6933843426331996, + "grad_norm": NaN, + "learning_rate": 0.00018216659743091427, + "loss": 0.0, + "step": 28865 + }, + { + "epoch": 2.693477652328077, + "grad_norm": NaN, + "learning_rate": 0.00018215920956728002, + "loss": 0.0, + "step": 28866 + }, + { + "epoch": 2.693570962022954, + "grad_norm": NaN, + "learning_rate": 0.00018215182162187443, + "loss": 0.0, + "step": 28867 + }, + { + "epoch": 2.6936642717178314, + "grad_norm": NaN, + "learning_rate": 0.00018214443359471625, + "loss": 0.0, + "step": 28868 + }, + { + "epoch": 2.693757581412709, + "grad_norm": NaN, + "learning_rate": 0.0001821370454858243, + "loss": 0.0, + "step": 28869 + }, + { + "epoch": 2.6938508911075862, + "grad_norm": NaN, + "learning_rate": 0.00018212965729521736, + "loss": 0.0, + "step": 28870 + }, + { + "epoch": 2.693944200802463, + "grad_norm": NaN, + "learning_rate": 0.00018212226902291424, + "loss": 0.0, + "step": 28871 + }, + { + "epoch": 2.6940375104973406, + "grad_norm": NaN, + "learning_rate": 0.0001821148806689337, + "loss": 0.0, + "step": 28872 + }, + { + "epoch": 2.694130820192218, + "grad_norm": NaN, + "learning_rate": 0.00018210749223329453, + "loss": 0.0, + "step": 28873 + }, + { + "epoch": 2.694224129887095, + "grad_norm": NaN, + "learning_rate": 0.00018210010371601552, + "loss": 0.0, + "step": 28874 + }, + { + "epoch": 2.6943174395819725, + "grad_norm": NaN, + "learning_rate": 0.00018209271511711544, + "loss": 0.0, + "step": 28875 + }, + { + "epoch": 2.69441074927685, + "grad_norm": NaN, + "learning_rate": 0.00018208532643661308, + "loss": 0.0, + "step": 28876 + }, + { + "epoch": 2.6945040589717273, + "grad_norm": NaN, + "learning_rate": 0.0001820779376745273, + "loss": 0.0, + "step": 28877 + }, + { + "epoch": 2.6945973686666047, + "grad_norm": NaN, + "learning_rate": 0.00018207054883087677, + "loss": 0.0, + "step": 28878 + }, + { + "epoch": 2.6946906783614817, + "grad_norm": NaN, + "learning_rate": 0.00018206315990568033, + "loss": 0.0, + "step": 28879 + }, + { + "epoch": 2.694783988056359, + "grad_norm": NaN, + "learning_rate": 0.00018205577089895682, + "loss": 0.0, + "step": 28880 + }, + { + "epoch": 2.694877297751236, + "grad_norm": NaN, + "learning_rate": 0.00018204838181072494, + "loss": 0.0, + "step": 28881 + }, + { + "epoch": 2.6949706074461135, + "grad_norm": NaN, + "learning_rate": 0.00018204099264100356, + "loss": 0.0, + "step": 28882 + }, + { + "epoch": 2.695063917140991, + "grad_norm": NaN, + "learning_rate": 0.00018203360338981138, + "loss": 0.0, + "step": 28883 + }, + { + "epoch": 2.6951572268358683, + "grad_norm": NaN, + "learning_rate": 0.00018202621405716726, + "loss": 0.0, + "step": 28884 + }, + { + "epoch": 2.6952505365307458, + "grad_norm": NaN, + "learning_rate": 0.00018201882464308994, + "loss": 0.0, + "step": 28885 + }, + { + "epoch": 2.6953438462256227, + "grad_norm": NaN, + "learning_rate": 0.00018201143514759826, + "loss": 0.0, + "step": 28886 + }, + { + "epoch": 2.6954371559205, + "grad_norm": NaN, + "learning_rate": 0.00018200404557071098, + "loss": 0.0, + "step": 28887 + }, + { + "epoch": 2.6955304656153776, + "grad_norm": NaN, + "learning_rate": 0.00018199665591244688, + "loss": 0.0, + "step": 28888 + }, + { + "epoch": 2.6956237753102545, + "grad_norm": NaN, + "learning_rate": 0.00018198926617282477, + "loss": 0.0, + "step": 28889 + }, + { + "epoch": 2.695717085005132, + "grad_norm": NaN, + "learning_rate": 0.00018198187635186345, + "loss": 0.0, + "step": 28890 + }, + { + "epoch": 2.6958103947000094, + "grad_norm": NaN, + "learning_rate": 0.00018197448644958168, + "loss": 0.0, + "step": 28891 + }, + { + "epoch": 2.695903704394887, + "grad_norm": NaN, + "learning_rate": 0.00018196709646599822, + "loss": 0.0, + "step": 28892 + }, + { + "epoch": 2.6959970140897638, + "grad_norm": NaN, + "learning_rate": 0.000181959706401132, + "loss": 0.0, + "step": 28893 + }, + { + "epoch": 2.696090323784641, + "grad_norm": NaN, + "learning_rate": 0.00018195231625500163, + "loss": 0.0, + "step": 28894 + }, + { + "epoch": 2.6961836334795186, + "grad_norm": NaN, + "learning_rate": 0.00018194492602762596, + "loss": 0.0, + "step": 28895 + }, + { + "epoch": 2.6962769431743956, + "grad_norm": NaN, + "learning_rate": 0.00018193753571902387, + "loss": 0.0, + "step": 28896 + }, + { + "epoch": 2.696370252869273, + "grad_norm": NaN, + "learning_rate": 0.0001819301453292141, + "loss": 0.0, + "step": 28897 + }, + { + "epoch": 2.6964635625641504, + "grad_norm": NaN, + "learning_rate": 0.00018192275485821535, + "loss": 0.0, + "step": 28898 + }, + { + "epoch": 2.696556872259028, + "grad_norm": NaN, + "learning_rate": 0.0001819153643060466, + "loss": 0.0, + "step": 28899 + }, + { + "epoch": 2.6966501819539053, + "grad_norm": NaN, + "learning_rate": 0.00018190797367272646, + "loss": 0.0, + "step": 28900 + }, + { + "epoch": 2.6967434916487822, + "grad_norm": NaN, + "learning_rate": 0.00018190058295827378, + "loss": 0.0, + "step": 28901 + }, + { + "epoch": 2.6968368013436597, + "grad_norm": NaN, + "learning_rate": 0.00018189319216270743, + "loss": 0.0, + "step": 28902 + }, + { + "epoch": 2.6969301110385366, + "grad_norm": NaN, + "learning_rate": 0.0001818858012860461, + "loss": 0.0, + "step": 28903 + }, + { + "epoch": 2.697023420733414, + "grad_norm": NaN, + "learning_rate": 0.0001818784103283086, + "loss": 0.0, + "step": 28904 + }, + { + "epoch": 2.6971167304282915, + "grad_norm": NaN, + "learning_rate": 0.0001818710192895138, + "loss": 0.0, + "step": 28905 + }, + { + "epoch": 2.697210040123169, + "grad_norm": NaN, + "learning_rate": 0.00018186362816968043, + "loss": 0.0, + "step": 28906 + }, + { + "epoch": 2.6973033498180463, + "grad_norm": NaN, + "learning_rate": 0.00018185623696882728, + "loss": 0.0, + "step": 28907 + }, + { + "epoch": 2.6973966595129233, + "grad_norm": NaN, + "learning_rate": 0.0001818488456869731, + "loss": 0.0, + "step": 28908 + }, + { + "epoch": 2.6974899692078007, + "grad_norm": NaN, + "learning_rate": 0.00018184145432413682, + "loss": 0.0, + "step": 28909 + }, + { + "epoch": 2.697583278902678, + "grad_norm": NaN, + "learning_rate": 0.0001818340628803371, + "loss": 0.0, + "step": 28910 + }, + { + "epoch": 2.697676588597555, + "grad_norm": NaN, + "learning_rate": 0.00018182667135559284, + "loss": 0.0, + "step": 28911 + }, + { + "epoch": 2.6977698982924325, + "grad_norm": NaN, + "learning_rate": 0.00018181927974992277, + "loss": 0.0, + "step": 28912 + }, + { + "epoch": 2.69786320798731, + "grad_norm": NaN, + "learning_rate": 0.0001818118880633457, + "loss": 0.0, + "step": 28913 + }, + { + "epoch": 2.6979565176821874, + "grad_norm": NaN, + "learning_rate": 0.00018180449629588032, + "loss": 0.0, + "step": 28914 + }, + { + "epoch": 2.6980498273770643, + "grad_norm": NaN, + "learning_rate": 0.00018179710444754567, + "loss": 0.0, + "step": 28915 + }, + { + "epoch": 2.6981431370719418, + "grad_norm": NaN, + "learning_rate": 0.00018178971251836033, + "loss": 0.0, + "step": 28916 + }, + { + "epoch": 2.698236446766819, + "grad_norm": NaN, + "learning_rate": 0.00018178232050834316, + "loss": 0.0, + "step": 28917 + }, + { + "epoch": 2.698329756461696, + "grad_norm": NaN, + "learning_rate": 0.000181774928417513, + "loss": 0.0, + "step": 28918 + }, + { + "epoch": 2.6984230661565736, + "grad_norm": NaN, + "learning_rate": 0.0001817675362458886, + "loss": 0.0, + "step": 28919 + }, + { + "epoch": 2.698516375851451, + "grad_norm": NaN, + "learning_rate": 0.0001817601439934887, + "loss": 0.0, + "step": 28920 + }, + { + "epoch": 2.6986096855463284, + "grad_norm": NaN, + "learning_rate": 0.00018175275166033226, + "loss": 0.0, + "step": 28921 + }, + { + "epoch": 2.698702995241206, + "grad_norm": NaN, + "learning_rate": 0.000181745359246438, + "loss": 0.0, + "step": 28922 + }, + { + "epoch": 2.698796304936083, + "grad_norm": NaN, + "learning_rate": 0.0001817379667518246, + "loss": 0.0, + "step": 28923 + }, + { + "epoch": 2.69888961463096, + "grad_norm": NaN, + "learning_rate": 0.00018173057417651098, + "loss": 0.0, + "step": 28924 + }, + { + "epoch": 2.698982924325837, + "grad_norm": NaN, + "learning_rate": 0.00018172318152051594, + "loss": 0.0, + "step": 28925 + }, + { + "epoch": 2.6990762340207146, + "grad_norm": NaN, + "learning_rate": 0.0001817157887838582, + "loss": 0.0, + "step": 28926 + }, + { + "epoch": 2.699169543715592, + "grad_norm": NaN, + "learning_rate": 0.00018170839596655662, + "loss": 0.0, + "step": 28927 + }, + { + "epoch": 2.6992628534104695, + "grad_norm": NaN, + "learning_rate": 0.00018170100306863003, + "loss": 0.0, + "step": 28928 + }, + { + "epoch": 2.699356163105347, + "grad_norm": NaN, + "learning_rate": 0.00018169361009009712, + "loss": 0.0, + "step": 28929 + }, + { + "epoch": 2.699449472800224, + "grad_norm": NaN, + "learning_rate": 0.00018168621703097676, + "loss": 0.0, + "step": 28930 + }, + { + "epoch": 2.6995427824951013, + "grad_norm": NaN, + "learning_rate": 0.00018167882389128774, + "loss": 0.0, + "step": 28931 + }, + { + "epoch": 2.6996360921899787, + "grad_norm": NaN, + "learning_rate": 0.00018167143067104885, + "loss": 0.0, + "step": 28932 + }, + { + "epoch": 2.6997294018848557, + "grad_norm": NaN, + "learning_rate": 0.00018166403737027888, + "loss": 0.0, + "step": 28933 + }, + { + "epoch": 2.699822711579733, + "grad_norm": NaN, + "learning_rate": 0.0001816566439889967, + "loss": 0.0, + "step": 28934 + }, + { + "epoch": 2.6999160212746105, + "grad_norm": NaN, + "learning_rate": 0.00018164925052722102, + "loss": 0.0, + "step": 28935 + }, + { + "epoch": 2.700009330969488, + "grad_norm": NaN, + "learning_rate": 0.00018164185698497064, + "loss": 0.0, + "step": 28936 + }, + { + "epoch": 2.700102640664365, + "grad_norm": NaN, + "learning_rate": 0.00018163446336226445, + "loss": 0.0, + "step": 28937 + }, + { + "epoch": 2.7001959503592423, + "grad_norm": NaN, + "learning_rate": 0.00018162706965912115, + "loss": 0.0, + "step": 28938 + }, + { + "epoch": 2.7002892600541197, + "grad_norm": NaN, + "learning_rate": 0.00018161967587555954, + "loss": 0.0, + "step": 28939 + }, + { + "epoch": 2.7003825697489967, + "grad_norm": NaN, + "learning_rate": 0.00018161228201159856, + "loss": 0.0, + "step": 28940 + }, + { + "epoch": 2.700475879443874, + "grad_norm": NaN, + "learning_rate": 0.00018160488806725686, + "loss": 0.0, + "step": 28941 + }, + { + "epoch": 2.7005691891387515, + "grad_norm": NaN, + "learning_rate": 0.00018159749404255324, + "loss": 0.0, + "step": 28942 + }, + { + "epoch": 2.700662498833629, + "grad_norm": NaN, + "learning_rate": 0.00018159009993750665, + "loss": 0.0, + "step": 28943 + }, + { + "epoch": 2.7007558085285064, + "grad_norm": NaN, + "learning_rate": 0.00018158270575213575, + "loss": 0.0, + "step": 28944 + }, + { + "epoch": 2.7008491182233834, + "grad_norm": NaN, + "learning_rate": 0.00018157531148645936, + "loss": 0.0, + "step": 28945 + }, + { + "epoch": 2.700942427918261, + "grad_norm": NaN, + "learning_rate": 0.00018156791714049632, + "loss": 0.0, + "step": 28946 + }, + { + "epoch": 2.7010357376131378, + "grad_norm": NaN, + "learning_rate": 0.00018156052271426544, + "loss": 0.0, + "step": 28947 + }, + { + "epoch": 2.701129047308015, + "grad_norm": NaN, + "learning_rate": 0.00018155312820778552, + "loss": 0.0, + "step": 28948 + }, + { + "epoch": 2.7012223570028926, + "grad_norm": NaN, + "learning_rate": 0.0001815457336210753, + "loss": 0.0, + "step": 28949 + }, + { + "epoch": 2.70131566669777, + "grad_norm": NaN, + "learning_rate": 0.0001815383389541536, + "loss": 0.0, + "step": 28950 + }, + { + "epoch": 2.7014089763926474, + "grad_norm": NaN, + "learning_rate": 0.0001815309442070393, + "loss": 0.0, + "step": 28951 + }, + { + "epoch": 2.7015022860875244, + "grad_norm": NaN, + "learning_rate": 0.00018152354937975109, + "loss": 0.0, + "step": 28952 + }, + { + "epoch": 2.701595595782402, + "grad_norm": NaN, + "learning_rate": 0.00018151615447230788, + "loss": 0.0, + "step": 28953 + }, + { + "epoch": 2.7016889054772792, + "grad_norm": NaN, + "learning_rate": 0.00018150875948472843, + "loss": 0.0, + "step": 28954 + }, + { + "epoch": 2.701782215172156, + "grad_norm": NaN, + "learning_rate": 0.0001815013644170315, + "loss": 0.0, + "step": 28955 + }, + { + "epoch": 2.7018755248670336, + "grad_norm": NaN, + "learning_rate": 0.00018149396926923596, + "loss": 0.0, + "step": 28956 + }, + { + "epoch": 2.701968834561911, + "grad_norm": NaN, + "learning_rate": 0.0001814865740413606, + "loss": 0.0, + "step": 28957 + }, + { + "epoch": 2.7020621442567885, + "grad_norm": NaN, + "learning_rate": 0.00018147917873342418, + "loss": 0.0, + "step": 28958 + }, + { + "epoch": 2.7021554539516655, + "grad_norm": NaN, + "learning_rate": 0.00018147178334544555, + "loss": 0.0, + "step": 28959 + }, + { + "epoch": 2.702248763646543, + "grad_norm": NaN, + "learning_rate": 0.0001814643878774435, + "loss": 0.0, + "step": 28960 + }, + { + "epoch": 2.7023420733414203, + "grad_norm": NaN, + "learning_rate": 0.00018145699232943678, + "loss": 0.0, + "step": 28961 + }, + { + "epoch": 2.7024353830362973, + "grad_norm": NaN, + "learning_rate": 0.00018144959670144434, + "loss": 0.0, + "step": 28962 + }, + { + "epoch": 2.7025286927311747, + "grad_norm": NaN, + "learning_rate": 0.00018144220099348482, + "loss": 0.0, + "step": 28963 + }, + { + "epoch": 2.702622002426052, + "grad_norm": NaN, + "learning_rate": 0.00018143480520557709, + "loss": 0.0, + "step": 28964 + }, + { + "epoch": 2.7027153121209295, + "grad_norm": NaN, + "learning_rate": 0.00018142740933774, + "loss": 0.0, + "step": 28965 + }, + { + "epoch": 2.7028086218158065, + "grad_norm": NaN, + "learning_rate": 0.00018142001338999233, + "loss": 0.0, + "step": 28966 + }, + { + "epoch": 2.702901931510684, + "grad_norm": NaN, + "learning_rate": 0.00018141261736235285, + "loss": 0.0, + "step": 28967 + }, + { + "epoch": 2.7029952412055613, + "grad_norm": NaN, + "learning_rate": 0.00018140522125484037, + "loss": 0.0, + "step": 28968 + }, + { + "epoch": 2.7030885509004383, + "grad_norm": NaN, + "learning_rate": 0.00018139782506747376, + "loss": 0.0, + "step": 28969 + }, + { + "epoch": 2.7031818605953157, + "grad_norm": NaN, + "learning_rate": 0.00018139042880027173, + "loss": 0.0, + "step": 28970 + }, + { + "epoch": 2.703275170290193, + "grad_norm": NaN, + "learning_rate": 0.00018138303245325313, + "loss": 0.0, + "step": 28971 + }, + { + "epoch": 2.7033684799850706, + "grad_norm": NaN, + "learning_rate": 0.00018137563602643684, + "loss": 0.0, + "step": 28972 + }, + { + "epoch": 2.703461789679948, + "grad_norm": NaN, + "learning_rate": 0.00018136823951984157, + "loss": 0.0, + "step": 28973 + }, + { + "epoch": 2.703555099374825, + "grad_norm": NaN, + "learning_rate": 0.00018136084293348613, + "loss": 0.0, + "step": 28974 + }, + { + "epoch": 2.7036484090697024, + "grad_norm": NaN, + "learning_rate": 0.00018135344626738943, + "loss": 0.0, + "step": 28975 + }, + { + "epoch": 2.70374171876458, + "grad_norm": NaN, + "learning_rate": 0.00018134604952157015, + "loss": 0.0, + "step": 28976 + }, + { + "epoch": 2.703835028459457, + "grad_norm": NaN, + "learning_rate": 0.00018133865269604713, + "loss": 0.0, + "step": 28977 + }, + { + "epoch": 2.703928338154334, + "grad_norm": NaN, + "learning_rate": 0.00018133125579083924, + "loss": 0.0, + "step": 28978 + }, + { + "epoch": 2.7040216478492116, + "grad_norm": NaN, + "learning_rate": 0.00018132385880596525, + "loss": 0.0, + "step": 28979 + }, + { + "epoch": 2.704114957544089, + "grad_norm": NaN, + "learning_rate": 0.00018131646174144393, + "loss": 0.0, + "step": 28980 + }, + { + "epoch": 2.704208267238966, + "grad_norm": NaN, + "learning_rate": 0.00018130906459729417, + "loss": 0.0, + "step": 28981 + }, + { + "epoch": 2.7043015769338434, + "grad_norm": NaN, + "learning_rate": 0.00018130166737353472, + "loss": 0.0, + "step": 28982 + }, + { + "epoch": 2.704394886628721, + "grad_norm": NaN, + "learning_rate": 0.00018129427007018437, + "loss": 0.0, + "step": 28983 + }, + { + "epoch": 2.704488196323598, + "grad_norm": NaN, + "learning_rate": 0.00018128687268726199, + "loss": 0.0, + "step": 28984 + }, + { + "epoch": 2.7045815060184752, + "grad_norm": NaN, + "learning_rate": 0.00018127947522478637, + "loss": 0.0, + "step": 28985 + }, + { + "epoch": 2.7046748157133527, + "grad_norm": NaN, + "learning_rate": 0.00018127207768277626, + "loss": 0.0, + "step": 28986 + }, + { + "epoch": 2.70476812540823, + "grad_norm": NaN, + "learning_rate": 0.0001812646800612506, + "loss": 0.0, + "step": 28987 + }, + { + "epoch": 2.704861435103107, + "grad_norm": NaN, + "learning_rate": 0.00018125728236022804, + "loss": 0.0, + "step": 28988 + }, + { + "epoch": 2.7049547447979845, + "grad_norm": NaN, + "learning_rate": 0.00018124988457972753, + "loss": 0.0, + "step": 28989 + }, + { + "epoch": 2.705048054492862, + "grad_norm": NaN, + "learning_rate": 0.00018124248671976779, + "loss": 0.0, + "step": 28990 + }, + { + "epoch": 2.705141364187739, + "grad_norm": NaN, + "learning_rate": 0.00018123508878036766, + "loss": 0.0, + "step": 28991 + }, + { + "epoch": 2.7052346738826163, + "grad_norm": NaN, + "learning_rate": 0.00018122769076154598, + "loss": 0.0, + "step": 28992 + }, + { + "epoch": 2.7053279835774937, + "grad_norm": NaN, + "learning_rate": 0.00018122029266332145, + "loss": 0.0, + "step": 28993 + }, + { + "epoch": 2.705421293272371, + "grad_norm": NaN, + "learning_rate": 0.00018121289448571308, + "loss": 0.0, + "step": 28994 + }, + { + "epoch": 2.7055146029672485, + "grad_norm": NaN, + "learning_rate": 0.0001812054962287395, + "loss": 0.0, + "step": 28995 + }, + { + "epoch": 2.7056079126621255, + "grad_norm": NaN, + "learning_rate": 0.00018119809789241958, + "loss": 0.0, + "step": 28996 + }, + { + "epoch": 2.705701222357003, + "grad_norm": NaN, + "learning_rate": 0.00018119069947677218, + "loss": 0.0, + "step": 28997 + }, + { + "epoch": 2.70579453205188, + "grad_norm": NaN, + "learning_rate": 0.00018118330098181604, + "loss": 0.0, + "step": 28998 + }, + { + "epoch": 2.7058878417467573, + "grad_norm": NaN, + "learning_rate": 0.00018117590240756997, + "loss": 0.0, + "step": 28999 + }, + { + "epoch": 2.7059811514416348, + "grad_norm": NaN, + "learning_rate": 0.0001811685037540529, + "loss": 0.0, + "step": 29000 + }, + { + "epoch": 2.706074461136512, + "grad_norm": NaN, + "learning_rate": 0.0001811611050212835, + "loss": 0.0, + "step": 29001 + }, + { + "epoch": 2.7061677708313896, + "grad_norm": NaN, + "learning_rate": 0.00018115370620928062, + "loss": 0.0, + "step": 29002 + }, + { + "epoch": 2.7062610805262666, + "grad_norm": NaN, + "learning_rate": 0.00018114630731806314, + "loss": 0.0, + "step": 29003 + }, + { + "epoch": 2.706354390221144, + "grad_norm": NaN, + "learning_rate": 0.00018113890834764982, + "loss": 0.0, + "step": 29004 + }, + { + "epoch": 2.7064476999160214, + "grad_norm": NaN, + "learning_rate": 0.00018113150929805944, + "loss": 0.0, + "step": 29005 + }, + { + "epoch": 2.7065410096108984, + "grad_norm": NaN, + "learning_rate": 0.0001811241101693109, + "loss": 0.0, + "step": 29006 + }, + { + "epoch": 2.706634319305776, + "grad_norm": NaN, + "learning_rate": 0.000181116710961423, + "loss": 0.0, + "step": 29007 + }, + { + "epoch": 2.7067276290006532, + "grad_norm": NaN, + "learning_rate": 0.00018110931167441443, + "loss": 0.0, + "step": 29008 + }, + { + "epoch": 2.7068209386955306, + "grad_norm": NaN, + "learning_rate": 0.00018110191230830412, + "loss": 0.0, + "step": 29009 + }, + { + "epoch": 2.7069142483904076, + "grad_norm": NaN, + "learning_rate": 0.00018109451286311092, + "loss": 0.0, + "step": 29010 + }, + { + "epoch": 2.707007558085285, + "grad_norm": NaN, + "learning_rate": 0.00018108711333885357, + "loss": 0.0, + "step": 29011 + }, + { + "epoch": 2.7071008677801625, + "grad_norm": NaN, + "learning_rate": 0.00018107971373555084, + "loss": 0.0, + "step": 29012 + }, + { + "epoch": 2.7071941774750394, + "grad_norm": NaN, + "learning_rate": 0.00018107231405322166, + "loss": 0.0, + "step": 29013 + }, + { + "epoch": 2.707287487169917, + "grad_norm": NaN, + "learning_rate": 0.00018106491429188478, + "loss": 0.0, + "step": 29014 + }, + { + "epoch": 2.7073807968647943, + "grad_norm": NaN, + "learning_rate": 0.00018105751445155897, + "loss": 0.0, + "step": 29015 + }, + { + "epoch": 2.7074741065596717, + "grad_norm": NaN, + "learning_rate": 0.00018105011453226316, + "loss": 0.0, + "step": 29016 + }, + { + "epoch": 2.707567416254549, + "grad_norm": NaN, + "learning_rate": 0.00018104271453401612, + "loss": 0.0, + "step": 29017 + }, + { + "epoch": 2.707660725949426, + "grad_norm": NaN, + "learning_rate": 0.00018103531445683656, + "loss": 0.0, + "step": 29018 + }, + { + "epoch": 2.7077540356443035, + "grad_norm": NaN, + "learning_rate": 0.00018102791430074351, + "loss": 0.0, + "step": 29019 + }, + { + "epoch": 2.7078473453391805, + "grad_norm": NaN, + "learning_rate": 0.0001810205140657556, + "loss": 0.0, + "step": 29020 + }, + { + "epoch": 2.707940655034058, + "grad_norm": NaN, + "learning_rate": 0.0001810131137518917, + "loss": 0.0, + "step": 29021 + }, + { + "epoch": 2.7080339647289353, + "grad_norm": NaN, + "learning_rate": 0.00018100571335917068, + "loss": 0.0, + "step": 29022 + }, + { + "epoch": 2.7081272744238127, + "grad_norm": NaN, + "learning_rate": 0.0001809983128876113, + "loss": 0.0, + "step": 29023 + }, + { + "epoch": 2.70822058411869, + "grad_norm": NaN, + "learning_rate": 0.00018099091233723236, + "loss": 0.0, + "step": 29024 + }, + { + "epoch": 2.708313893813567, + "grad_norm": NaN, + "learning_rate": 0.00018098351170805273, + "loss": 0.0, + "step": 29025 + }, + { + "epoch": 2.7084072035084445, + "grad_norm": NaN, + "learning_rate": 0.00018097611100009124, + "loss": 0.0, + "step": 29026 + }, + { + "epoch": 2.708500513203322, + "grad_norm": NaN, + "learning_rate": 0.0001809687102133666, + "loss": 0.0, + "step": 29027 + }, + { + "epoch": 2.708593822898199, + "grad_norm": NaN, + "learning_rate": 0.00018096130934789777, + "loss": 0.0, + "step": 29028 + }, + { + "epoch": 2.7086871325930764, + "grad_norm": NaN, + "learning_rate": 0.00018095390840370352, + "loss": 0.0, + "step": 29029 + }, + { + "epoch": 2.708780442287954, + "grad_norm": NaN, + "learning_rate": 0.00018094650738080257, + "loss": 0.0, + "step": 29030 + }, + { + "epoch": 2.708873751982831, + "grad_norm": NaN, + "learning_rate": 0.00018093910627921386, + "loss": 0.0, + "step": 29031 + }, + { + "epoch": 2.708967061677708, + "grad_norm": NaN, + "learning_rate": 0.0001809317050989562, + "loss": 0.0, + "step": 29032 + }, + { + "epoch": 2.7090603713725856, + "grad_norm": NaN, + "learning_rate": 0.00018092430384004837, + "loss": 0.0, + "step": 29033 + }, + { + "epoch": 2.709153681067463, + "grad_norm": NaN, + "learning_rate": 0.00018091690250250915, + "loss": 0.0, + "step": 29034 + }, + { + "epoch": 2.70924699076234, + "grad_norm": NaN, + "learning_rate": 0.00018090950108635745, + "loss": 0.0, + "step": 29035 + }, + { + "epoch": 2.7093403004572174, + "grad_norm": NaN, + "learning_rate": 0.000180902099591612, + "loss": 0.0, + "step": 29036 + }, + { + "epoch": 2.709433610152095, + "grad_norm": NaN, + "learning_rate": 0.00018089469801829168, + "loss": 0.0, + "step": 29037 + }, + { + "epoch": 2.7095269198469722, + "grad_norm": NaN, + "learning_rate": 0.00018088729636641534, + "loss": 0.0, + "step": 29038 + }, + { + "epoch": 2.7096202295418497, + "grad_norm": NaN, + "learning_rate": 0.00018087989463600173, + "loss": 0.0, + "step": 29039 + }, + { + "epoch": 2.7097135392367266, + "grad_norm": NaN, + "learning_rate": 0.00018087249282706963, + "loss": 0.0, + "step": 29040 + }, + { + "epoch": 2.709806848931604, + "grad_norm": NaN, + "learning_rate": 0.000180865090939638, + "loss": 0.0, + "step": 29041 + }, + { + "epoch": 2.709900158626481, + "grad_norm": NaN, + "learning_rate": 0.00018085768897372557, + "loss": 0.0, + "step": 29042 + }, + { + "epoch": 2.7099934683213585, + "grad_norm": NaN, + "learning_rate": 0.00018085028692935113, + "loss": 0.0, + "step": 29043 + }, + { + "epoch": 2.710086778016236, + "grad_norm": NaN, + "learning_rate": 0.0001808428848065336, + "loss": 0.0, + "step": 29044 + }, + { + "epoch": 2.7101800877111133, + "grad_norm": NaN, + "learning_rate": 0.0001808354826052918, + "loss": 0.0, + "step": 29045 + }, + { + "epoch": 2.7102733974059907, + "grad_norm": NaN, + "learning_rate": 0.00018082808032564437, + "loss": 0.0, + "step": 29046 + }, + { + "epoch": 2.7103667071008677, + "grad_norm": NaN, + "learning_rate": 0.00018082067796761038, + "loss": 0.0, + "step": 29047 + }, + { + "epoch": 2.710460016795745, + "grad_norm": NaN, + "learning_rate": 0.0001808132755312085, + "loss": 0.0, + "step": 29048 + }, + { + "epoch": 2.7105533264906225, + "grad_norm": NaN, + "learning_rate": 0.00018080587301645755, + "loss": 0.0, + "step": 29049 + }, + { + "epoch": 2.7106466361854995, + "grad_norm": NaN, + "learning_rate": 0.00018079847042337644, + "loss": 0.0, + "step": 29050 + }, + { + "epoch": 2.710739945880377, + "grad_norm": NaN, + "learning_rate": 0.00018079106775198395, + "loss": 0.0, + "step": 29051 + }, + { + "epoch": 2.7108332555752543, + "grad_norm": NaN, + "learning_rate": 0.00018078366500229884, + "loss": 0.0, + "step": 29052 + }, + { + "epoch": 2.7109265652701318, + "grad_norm": NaN, + "learning_rate": 0.00018077626217434, + "loss": 0.0, + "step": 29053 + }, + { + "epoch": 2.7110198749650087, + "grad_norm": NaN, + "learning_rate": 0.00018076885926812628, + "loss": 0.0, + "step": 29054 + }, + { + "epoch": 2.711113184659886, + "grad_norm": NaN, + "learning_rate": 0.00018076145628367644, + "loss": 0.0, + "step": 29055 + }, + { + "epoch": 2.7112064943547636, + "grad_norm": NaN, + "learning_rate": 0.00018075405322100932, + "loss": 0.0, + "step": 29056 + }, + { + "epoch": 2.7112998040496405, + "grad_norm": NaN, + "learning_rate": 0.00018074665008014382, + "loss": 0.0, + "step": 29057 + }, + { + "epoch": 2.711393113744518, + "grad_norm": NaN, + "learning_rate": 0.0001807392468610986, + "loss": 0.0, + "step": 29058 + }, + { + "epoch": 2.7114864234393954, + "grad_norm": NaN, + "learning_rate": 0.0001807318435638926, + "loss": 0.0, + "step": 29059 + }, + { + "epoch": 2.711579733134273, + "grad_norm": NaN, + "learning_rate": 0.0001807244401885447, + "loss": 0.0, + "step": 29060 + }, + { + "epoch": 2.71167304282915, + "grad_norm": NaN, + "learning_rate": 0.00018071703673507359, + "loss": 0.0, + "step": 29061 + }, + { + "epoch": 2.711766352524027, + "grad_norm": NaN, + "learning_rate": 0.00018070963320349816, + "loss": 0.0, + "step": 29062 + }, + { + "epoch": 2.7118596622189046, + "grad_norm": NaN, + "learning_rate": 0.00018070222959383722, + "loss": 0.0, + "step": 29063 + }, + { + "epoch": 2.7119529719137816, + "grad_norm": NaN, + "learning_rate": 0.00018069482590610966, + "loss": 0.0, + "step": 29064 + }, + { + "epoch": 2.712046281608659, + "grad_norm": NaN, + "learning_rate": 0.00018068742214033418, + "loss": 0.0, + "step": 29065 + }, + { + "epoch": 2.7121395913035364, + "grad_norm": NaN, + "learning_rate": 0.0001806800182965297, + "loss": 0.0, + "step": 29066 + }, + { + "epoch": 2.712232900998414, + "grad_norm": NaN, + "learning_rate": 0.00018067261437471505, + "loss": 0.0, + "step": 29067 + }, + { + "epoch": 2.7123262106932913, + "grad_norm": NaN, + "learning_rate": 0.00018066521037490897, + "loss": 0.0, + "step": 29068 + }, + { + "epoch": 2.7124195203881682, + "grad_norm": NaN, + "learning_rate": 0.00018065780629713038, + "loss": 0.0, + "step": 29069 + }, + { + "epoch": 2.7125128300830457, + "grad_norm": NaN, + "learning_rate": 0.00018065040214139812, + "loss": 0.0, + "step": 29070 + }, + { + "epoch": 2.712606139777923, + "grad_norm": NaN, + "learning_rate": 0.00018064299790773088, + "loss": 0.0, + "step": 29071 + }, + { + "epoch": 2.7126994494728, + "grad_norm": NaN, + "learning_rate": 0.0001806355935961476, + "loss": 0.0, + "step": 29072 + }, + { + "epoch": 2.7127927591676775, + "grad_norm": NaN, + "learning_rate": 0.00018062818920666713, + "loss": 0.0, + "step": 29073 + }, + { + "epoch": 2.712886068862555, + "grad_norm": NaN, + "learning_rate": 0.00018062078473930814, + "loss": 0.0, + "step": 29074 + }, + { + "epoch": 2.7129793785574323, + "grad_norm": NaN, + "learning_rate": 0.00018061338019408963, + "loss": 0.0, + "step": 29075 + }, + { + "epoch": 2.7130726882523093, + "grad_norm": NaN, + "learning_rate": 0.0001806059755710304, + "loss": 0.0, + "step": 29076 + }, + { + "epoch": 2.7131659979471867, + "grad_norm": NaN, + "learning_rate": 0.0001805985708701492, + "loss": 0.0, + "step": 29077 + }, + { + "epoch": 2.713259307642064, + "grad_norm": NaN, + "learning_rate": 0.0001805911660914649, + "loss": 0.0, + "step": 29078 + }, + { + "epoch": 2.713352617336941, + "grad_norm": NaN, + "learning_rate": 0.00018058376123499635, + "loss": 0.0, + "step": 29079 + }, + { + "epoch": 2.7134459270318185, + "grad_norm": NaN, + "learning_rate": 0.00018057635630076236, + "loss": 0.0, + "step": 29080 + }, + { + "epoch": 2.713539236726696, + "grad_norm": NaN, + "learning_rate": 0.00018056895128878167, + "loss": 0.0, + "step": 29081 + }, + { + "epoch": 2.7136325464215734, + "grad_norm": NaN, + "learning_rate": 0.00018056154619907326, + "loss": 0.0, + "step": 29082 + }, + { + "epoch": 2.7137258561164503, + "grad_norm": NaN, + "learning_rate": 0.0001805541410316559, + "loss": 0.0, + "step": 29083 + }, + { + "epoch": 2.7138191658113278, + "grad_norm": NaN, + "learning_rate": 0.00018054673578654835, + "loss": 0.0, + "step": 29084 + }, + { + "epoch": 2.713912475506205, + "grad_norm": NaN, + "learning_rate": 0.00018053933046376956, + "loss": 0.0, + "step": 29085 + }, + { + "epoch": 2.714005785201082, + "grad_norm": NaN, + "learning_rate": 0.0001805319250633383, + "loss": 0.0, + "step": 29086 + }, + { + "epoch": 2.7140990948959596, + "grad_norm": NaN, + "learning_rate": 0.00018052451958527335, + "loss": 0.0, + "step": 29087 + }, + { + "epoch": 2.714192404590837, + "grad_norm": NaN, + "learning_rate": 0.0001805171140295936, + "loss": 0.0, + "step": 29088 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": NaN, + "learning_rate": 0.00018050970839631793, + "loss": 0.0, + "step": 29089 + }, + { + "epoch": 2.714379023980592, + "grad_norm": NaN, + "learning_rate": 0.00018050230268546502, + "loss": 0.0, + "step": 29090 + }, + { + "epoch": 2.714472333675469, + "grad_norm": NaN, + "learning_rate": 0.00018049489689705384, + "loss": 0.0, + "step": 29091 + }, + { + "epoch": 2.7145656433703462, + "grad_norm": NaN, + "learning_rate": 0.00018048749103110318, + "loss": 0.0, + "step": 29092 + }, + { + "epoch": 2.714658953065223, + "grad_norm": NaN, + "learning_rate": 0.00018048008508763183, + "loss": 0.0, + "step": 29093 + }, + { + "epoch": 2.7147522627601006, + "grad_norm": NaN, + "learning_rate": 0.0001804726790666587, + "loss": 0.0, + "step": 29094 + }, + { + "epoch": 2.714845572454978, + "grad_norm": NaN, + "learning_rate": 0.00018046527296820253, + "loss": 0.0, + "step": 29095 + }, + { + "epoch": 2.7149388821498555, + "grad_norm": NaN, + "learning_rate": 0.00018045786679228218, + "loss": 0.0, + "step": 29096 + }, + { + "epoch": 2.715032191844733, + "grad_norm": NaN, + "learning_rate": 0.00018045046053891654, + "loss": 0.0, + "step": 29097 + }, + { + "epoch": 2.71512550153961, + "grad_norm": NaN, + "learning_rate": 0.0001804430542081244, + "loss": 0.0, + "step": 29098 + }, + { + "epoch": 2.7152188112344873, + "grad_norm": NaN, + "learning_rate": 0.00018043564779992455, + "loss": 0.0, + "step": 29099 + }, + { + "epoch": 2.7153121209293647, + "grad_norm": NaN, + "learning_rate": 0.0001804282413143359, + "loss": 0.0, + "step": 29100 + }, + { + "epoch": 2.7154054306242417, + "grad_norm": NaN, + "learning_rate": 0.00018042083475137722, + "loss": 0.0, + "step": 29101 + }, + { + "epoch": 2.715498740319119, + "grad_norm": NaN, + "learning_rate": 0.0001804134281110674, + "loss": 0.0, + "step": 29102 + }, + { + "epoch": 2.7155920500139965, + "grad_norm": NaN, + "learning_rate": 0.0001804060213934252, + "loss": 0.0, + "step": 29103 + }, + { + "epoch": 2.715685359708874, + "grad_norm": NaN, + "learning_rate": 0.00018039861459846955, + "loss": 0.0, + "step": 29104 + }, + { + "epoch": 2.715778669403751, + "grad_norm": NaN, + "learning_rate": 0.0001803912077262192, + "loss": 0.0, + "step": 29105 + }, + { + "epoch": 2.7158719790986283, + "grad_norm": NaN, + "learning_rate": 0.00018038380077669297, + "loss": 0.0, + "step": 29106 + }, + { + "epoch": 2.7159652887935057, + "grad_norm": NaN, + "learning_rate": 0.0001803763937499098, + "loss": 0.0, + "step": 29107 + }, + { + "epoch": 2.7160585984883827, + "grad_norm": NaN, + "learning_rate": 0.00018036898664588845, + "loss": 0.0, + "step": 29108 + }, + { + "epoch": 2.71615190818326, + "grad_norm": NaN, + "learning_rate": 0.0001803615794646477, + "loss": 0.0, + "step": 29109 + }, + { + "epoch": 2.7162452178781376, + "grad_norm": NaN, + "learning_rate": 0.0001803541722062065, + "loss": 0.0, + "step": 29110 + }, + { + "epoch": 2.716338527573015, + "grad_norm": NaN, + "learning_rate": 0.00018034676487058366, + "loss": 0.0, + "step": 29111 + }, + { + "epoch": 2.7164318372678924, + "grad_norm": NaN, + "learning_rate": 0.0001803393574577979, + "loss": 0.0, + "step": 29112 + }, + { + "epoch": 2.7165251469627694, + "grad_norm": NaN, + "learning_rate": 0.0001803319499678682, + "loss": 0.0, + "step": 29113 + }, + { + "epoch": 2.716618456657647, + "grad_norm": NaN, + "learning_rate": 0.00018032454240081333, + "loss": 0.0, + "step": 29114 + }, + { + "epoch": 2.7167117663525238, + "grad_norm": NaN, + "learning_rate": 0.00018031713475665211, + "loss": 0.0, + "step": 29115 + }, + { + "epoch": 2.716805076047401, + "grad_norm": NaN, + "learning_rate": 0.00018030972703540342, + "loss": 0.0, + "step": 29116 + }, + { + "epoch": 2.7168983857422786, + "grad_norm": NaN, + "learning_rate": 0.0001803023192370861, + "loss": 0.0, + "step": 29117 + }, + { + "epoch": 2.716991695437156, + "grad_norm": NaN, + "learning_rate": 0.0001802949113617189, + "loss": 0.0, + "step": 29118 + }, + { + "epoch": 2.7170850051320334, + "grad_norm": NaN, + "learning_rate": 0.00018028750340932072, + "loss": 0.0, + "step": 29119 + }, + { + "epoch": 2.7171783148269104, + "grad_norm": NaN, + "learning_rate": 0.0001802800953799104, + "loss": 0.0, + "step": 29120 + }, + { + "epoch": 2.717271624521788, + "grad_norm": NaN, + "learning_rate": 0.0001802726872735068, + "loss": 0.0, + "step": 29121 + }, + { + "epoch": 2.7173649342166653, + "grad_norm": NaN, + "learning_rate": 0.0001802652790901287, + "loss": 0.0, + "step": 29122 + }, + { + "epoch": 2.7174582439115422, + "grad_norm": NaN, + "learning_rate": 0.00018025787082979496, + "loss": 0.0, + "step": 29123 + }, + { + "epoch": 2.7175515536064196, + "grad_norm": NaN, + "learning_rate": 0.00018025046249252447, + "loss": 0.0, + "step": 29124 + }, + { + "epoch": 2.717644863301297, + "grad_norm": NaN, + "learning_rate": 0.0001802430540783359, + "loss": 0.0, + "step": 29125 + }, + { + "epoch": 2.7177381729961745, + "grad_norm": NaN, + "learning_rate": 0.0001802356455872483, + "loss": 0.0, + "step": 29126 + }, + { + "epoch": 2.7178314826910515, + "grad_norm": NaN, + "learning_rate": 0.00018022823701928037, + "loss": 0.0, + "step": 29127 + }, + { + "epoch": 2.717924792385929, + "grad_norm": NaN, + "learning_rate": 0.000180220828374451, + "loss": 0.0, + "step": 29128 + }, + { + "epoch": 2.7180181020808063, + "grad_norm": NaN, + "learning_rate": 0.000180213419652779, + "loss": 0.0, + "step": 29129 + }, + { + "epoch": 2.7181114117756833, + "grad_norm": NaN, + "learning_rate": 0.00018020601085428327, + "loss": 0.0, + "step": 29130 + }, + { + "epoch": 2.7182047214705607, + "grad_norm": NaN, + "learning_rate": 0.00018019860197898252, + "loss": 0.0, + "step": 29131 + }, + { + "epoch": 2.718298031165438, + "grad_norm": NaN, + "learning_rate": 0.00018019119302689572, + "loss": 0.0, + "step": 29132 + }, + { + "epoch": 2.7183913408603155, + "grad_norm": NaN, + "learning_rate": 0.0001801837839980417, + "loss": 0.0, + "step": 29133 + }, + { + "epoch": 2.718484650555193, + "grad_norm": NaN, + "learning_rate": 0.00018017637489243916, + "loss": 0.0, + "step": 29134 + }, + { + "epoch": 2.71857796025007, + "grad_norm": NaN, + "learning_rate": 0.0001801689657101071, + "loss": 0.0, + "step": 29135 + }, + { + "epoch": 2.7186712699449473, + "grad_norm": NaN, + "learning_rate": 0.00018016155645106432, + "loss": 0.0, + "step": 29136 + }, + { + "epoch": 2.7187645796398243, + "grad_norm": NaN, + "learning_rate": 0.00018015414711532957, + "loss": 0.0, + "step": 29137 + }, + { + "epoch": 2.7188578893347017, + "grad_norm": NaN, + "learning_rate": 0.00018014673770292179, + "loss": 0.0, + "step": 29138 + }, + { + "epoch": 2.718951199029579, + "grad_norm": NaN, + "learning_rate": 0.00018013932821385977, + "loss": 0.0, + "step": 29139 + }, + { + "epoch": 2.7190445087244566, + "grad_norm": NaN, + "learning_rate": 0.0001801319186481624, + "loss": 0.0, + "step": 29140 + }, + { + "epoch": 2.719137818419334, + "grad_norm": NaN, + "learning_rate": 0.00018012450900584847, + "loss": 0.0, + "step": 29141 + }, + { + "epoch": 2.719231128114211, + "grad_norm": NaN, + "learning_rate": 0.0001801170992869368, + "loss": 0.0, + "step": 29142 + }, + { + "epoch": 2.7193244378090884, + "grad_norm": NaN, + "learning_rate": 0.00018010968949144632, + "loss": 0.0, + "step": 29143 + }, + { + "epoch": 2.719417747503966, + "grad_norm": NaN, + "learning_rate": 0.00018010227961939583, + "loss": 0.0, + "step": 29144 + }, + { + "epoch": 2.719511057198843, + "grad_norm": NaN, + "learning_rate": 0.00018009486967080412, + "loss": 0.0, + "step": 29145 + }, + { + "epoch": 2.71960436689372, + "grad_norm": NaN, + "learning_rate": 0.00018008745964569013, + "loss": 0.0, + "step": 29146 + }, + { + "epoch": 2.7196976765885976, + "grad_norm": NaN, + "learning_rate": 0.00018008004954407254, + "loss": 0.0, + "step": 29147 + }, + { + "epoch": 2.719790986283475, + "grad_norm": NaN, + "learning_rate": 0.00018007263936597036, + "loss": 0.0, + "step": 29148 + }, + { + "epoch": 2.719884295978352, + "grad_norm": NaN, + "learning_rate": 0.00018006522911140238, + "loss": 0.0, + "step": 29149 + }, + { + "epoch": 2.7199776056732294, + "grad_norm": NaN, + "learning_rate": 0.00018005781878038736, + "loss": 0.0, + "step": 29150 + }, + { + "epoch": 2.720070915368107, + "grad_norm": NaN, + "learning_rate": 0.00018005040837294428, + "loss": 0.0, + "step": 29151 + }, + { + "epoch": 2.720164225062984, + "grad_norm": NaN, + "learning_rate": 0.0001800429978890919, + "loss": 0.0, + "step": 29152 + }, + { + "epoch": 2.7202575347578613, + "grad_norm": NaN, + "learning_rate": 0.00018003558732884907, + "loss": 0.0, + "step": 29153 + }, + { + "epoch": 2.7203508444527387, + "grad_norm": NaN, + "learning_rate": 0.00018002817669223465, + "loss": 0.0, + "step": 29154 + }, + { + "epoch": 2.720444154147616, + "grad_norm": NaN, + "learning_rate": 0.00018002076597926746, + "loss": 0.0, + "step": 29155 + }, + { + "epoch": 2.7205374638424935, + "grad_norm": NaN, + "learning_rate": 0.00018001335518996633, + "loss": 0.0, + "step": 29156 + }, + { + "epoch": 2.7206307735373705, + "grad_norm": NaN, + "learning_rate": 0.00018000594432435012, + "loss": 0.0, + "step": 29157 + }, + { + "epoch": 2.720724083232248, + "grad_norm": NaN, + "learning_rate": 0.00017999853338243773, + "loss": 0.0, + "step": 29158 + }, + { + "epoch": 2.720817392927125, + "grad_norm": NaN, + "learning_rate": 0.00017999112236424792, + "loss": 0.0, + "step": 29159 + }, + { + "epoch": 2.7209107026220023, + "grad_norm": NaN, + "learning_rate": 0.00017998371126979958, + "loss": 0.0, + "step": 29160 + }, + { + "epoch": 2.7210040123168797, + "grad_norm": NaN, + "learning_rate": 0.0001799763000991116, + "loss": 0.0, + "step": 29161 + }, + { + "epoch": 2.721097322011757, + "grad_norm": NaN, + "learning_rate": 0.00017996888885220272, + "loss": 0.0, + "step": 29162 + }, + { + "epoch": 2.7211906317066346, + "grad_norm": NaN, + "learning_rate": 0.00017996147752909178, + "loss": 0.0, + "step": 29163 + }, + { + "epoch": 2.7212839414015115, + "grad_norm": NaN, + "learning_rate": 0.00017995406612979776, + "loss": 0.0, + "step": 29164 + }, + { + "epoch": 2.721377251096389, + "grad_norm": NaN, + "learning_rate": 0.0001799466546543394, + "loss": 0.0, + "step": 29165 + }, + { + "epoch": 2.7214705607912664, + "grad_norm": NaN, + "learning_rate": 0.00017993924310273556, + "loss": 0.0, + "step": 29166 + }, + { + "epoch": 2.7215638704861433, + "grad_norm": NaN, + "learning_rate": 0.00017993183147500507, + "loss": 0.0, + "step": 29167 + }, + { + "epoch": 2.7216571801810208, + "grad_norm": NaN, + "learning_rate": 0.00017992441977116686, + "loss": 0.0, + "step": 29168 + }, + { + "epoch": 2.721750489875898, + "grad_norm": NaN, + "learning_rate": 0.00017991700799123967, + "loss": 0.0, + "step": 29169 + }, + { + "epoch": 2.7218437995707756, + "grad_norm": NaN, + "learning_rate": 0.0001799095961352424, + "loss": 0.0, + "step": 29170 + }, + { + "epoch": 2.7219371092656526, + "grad_norm": NaN, + "learning_rate": 0.0001799021842031939, + "loss": 0.0, + "step": 29171 + }, + { + "epoch": 2.72203041896053, + "grad_norm": NaN, + "learning_rate": 0.00017989477219511297, + "loss": 0.0, + "step": 29172 + }, + { + "epoch": 2.7221237286554074, + "grad_norm": NaN, + "learning_rate": 0.0001798873601110185, + "loss": 0.0, + "step": 29173 + }, + { + "epoch": 2.7222170383502844, + "grad_norm": NaN, + "learning_rate": 0.00017987994795092937, + "loss": 0.0, + "step": 29174 + }, + { + "epoch": 2.722310348045162, + "grad_norm": NaN, + "learning_rate": 0.00017987253571486432, + "loss": 0.0, + "step": 29175 + }, + { + "epoch": 2.7224036577400392, + "grad_norm": NaN, + "learning_rate": 0.00017986512340284228, + "loss": 0.0, + "step": 29176 + }, + { + "epoch": 2.7224969674349166, + "grad_norm": NaN, + "learning_rate": 0.00017985771101488207, + "loss": 0.0, + "step": 29177 + }, + { + "epoch": 2.7225902771297936, + "grad_norm": NaN, + "learning_rate": 0.00017985029855100259, + "loss": 0.0, + "step": 29178 + }, + { + "epoch": 2.722683586824671, + "grad_norm": NaN, + "learning_rate": 0.0001798428860112226, + "loss": 0.0, + "step": 29179 + }, + { + "epoch": 2.7227768965195485, + "grad_norm": NaN, + "learning_rate": 0.00017983547339556102, + "loss": 0.0, + "step": 29180 + }, + { + "epoch": 2.7228702062144254, + "grad_norm": NaN, + "learning_rate": 0.00017982806070403665, + "loss": 0.0, + "step": 29181 + }, + { + "epoch": 2.722963515909303, + "grad_norm": NaN, + "learning_rate": 0.00017982064793666834, + "loss": 0.0, + "step": 29182 + }, + { + "epoch": 2.7230568256041803, + "grad_norm": NaN, + "learning_rate": 0.000179813235093475, + "loss": 0.0, + "step": 29183 + }, + { + "epoch": 2.7231501352990577, + "grad_norm": NaN, + "learning_rate": 0.00017980582217447542, + "loss": 0.0, + "step": 29184 + }, + { + "epoch": 2.723243444993935, + "grad_norm": NaN, + "learning_rate": 0.00017979840917968845, + "loss": 0.0, + "step": 29185 + }, + { + "epoch": 2.723336754688812, + "grad_norm": NaN, + "learning_rate": 0.00017979099610913293, + "loss": 0.0, + "step": 29186 + }, + { + "epoch": 2.7234300643836895, + "grad_norm": NaN, + "learning_rate": 0.0001797835829628278, + "loss": 0.0, + "step": 29187 + }, + { + "epoch": 2.723523374078567, + "grad_norm": NaN, + "learning_rate": 0.0001797761697407918, + "loss": 0.0, + "step": 29188 + }, + { + "epoch": 2.723616683773444, + "grad_norm": NaN, + "learning_rate": 0.00017976875644304383, + "loss": 0.0, + "step": 29189 + }, + { + "epoch": 2.7237099934683213, + "grad_norm": NaN, + "learning_rate": 0.00017976134306960274, + "loss": 0.0, + "step": 29190 + }, + { + "epoch": 2.7238033031631987, + "grad_norm": NaN, + "learning_rate": 0.0001797539296204873, + "loss": 0.0, + "step": 29191 + }, + { + "epoch": 2.723896612858076, + "grad_norm": NaN, + "learning_rate": 0.0001797465160957165, + "loss": 0.0, + "step": 29192 + }, + { + "epoch": 2.723989922552953, + "grad_norm": NaN, + "learning_rate": 0.00017973910249530914, + "loss": 0.0, + "step": 29193 + }, + { + "epoch": 2.7240832322478306, + "grad_norm": NaN, + "learning_rate": 0.000179731688819284, + "loss": 0.0, + "step": 29194 + }, + { + "epoch": 2.724176541942708, + "grad_norm": NaN, + "learning_rate": 0.00017972427506765998, + "loss": 0.0, + "step": 29195 + }, + { + "epoch": 2.724269851637585, + "grad_norm": NaN, + "learning_rate": 0.00017971686124045597, + "loss": 0.0, + "step": 29196 + }, + { + "epoch": 2.7243631613324624, + "grad_norm": NaN, + "learning_rate": 0.00017970944733769077, + "loss": 0.0, + "step": 29197 + }, + { + "epoch": 2.72445647102734, + "grad_norm": NaN, + "learning_rate": 0.00017970203335938324, + "loss": 0.0, + "step": 29198 + }, + { + "epoch": 2.724549780722217, + "grad_norm": NaN, + "learning_rate": 0.00017969461930555225, + "loss": 0.0, + "step": 29199 + }, + { + "epoch": 2.724643090417094, + "grad_norm": NaN, + "learning_rate": 0.00017968720517621664, + "loss": 0.0, + "step": 29200 + }, + { + "epoch": 2.7247364001119716, + "grad_norm": NaN, + "learning_rate": 0.00017967979097139523, + "loss": 0.0, + "step": 29201 + }, + { + "epoch": 2.724829709806849, + "grad_norm": NaN, + "learning_rate": 0.00017967237669110693, + "loss": 0.0, + "step": 29202 + }, + { + "epoch": 2.724923019501726, + "grad_norm": NaN, + "learning_rate": 0.00017966496233537058, + "loss": 0.0, + "step": 29203 + }, + { + "epoch": 2.7250163291966034, + "grad_norm": NaN, + "learning_rate": 0.00017965754790420498, + "loss": 0.0, + "step": 29204 + }, + { + "epoch": 2.725109638891481, + "grad_norm": NaN, + "learning_rate": 0.00017965013339762905, + "loss": 0.0, + "step": 29205 + }, + { + "epoch": 2.7252029485863583, + "grad_norm": NaN, + "learning_rate": 0.0001796427188156616, + "loss": 0.0, + "step": 29206 + }, + { + "epoch": 2.7252962582812357, + "grad_norm": NaN, + "learning_rate": 0.0001796353041583215, + "loss": 0.0, + "step": 29207 + }, + { + "epoch": 2.7253895679761126, + "grad_norm": NaN, + "learning_rate": 0.00017962788942562762, + "loss": 0.0, + "step": 29208 + }, + { + "epoch": 2.72548287767099, + "grad_norm": NaN, + "learning_rate": 0.00017962047461759874, + "loss": 0.0, + "step": 29209 + }, + { + "epoch": 2.725576187365867, + "grad_norm": NaN, + "learning_rate": 0.00017961305973425383, + "loss": 0.0, + "step": 29210 + }, + { + "epoch": 2.7256694970607445, + "grad_norm": NaN, + "learning_rate": 0.00017960564477561164, + "loss": 0.0, + "step": 29211 + }, + { + "epoch": 2.725762806755622, + "grad_norm": NaN, + "learning_rate": 0.00017959822974169108, + "loss": 0.0, + "step": 29212 + }, + { + "epoch": 2.7258561164504993, + "grad_norm": NaN, + "learning_rate": 0.00017959081463251098, + "loss": 0.0, + "step": 29213 + }, + { + "epoch": 2.7259494261453767, + "grad_norm": NaN, + "learning_rate": 0.00017958339944809022, + "loss": 0.0, + "step": 29214 + }, + { + "epoch": 2.7260427358402537, + "grad_norm": NaN, + "learning_rate": 0.00017957598418844763, + "loss": 0.0, + "step": 29215 + }, + { + "epoch": 2.726136045535131, + "grad_norm": NaN, + "learning_rate": 0.00017956856885360207, + "loss": 0.0, + "step": 29216 + }, + { + "epoch": 2.7262293552300085, + "grad_norm": NaN, + "learning_rate": 0.00017956115344357242, + "loss": 0.0, + "step": 29217 + }, + { + "epoch": 2.7263226649248855, + "grad_norm": NaN, + "learning_rate": 0.00017955373795837748, + "loss": 0.0, + "step": 29218 + }, + { + "epoch": 2.726415974619763, + "grad_norm": NaN, + "learning_rate": 0.00017954632239803616, + "loss": 0.0, + "step": 29219 + }, + { + "epoch": 2.7265092843146403, + "grad_norm": NaN, + "learning_rate": 0.00017953890676256727, + "loss": 0.0, + "step": 29220 + }, + { + "epoch": 2.7266025940095178, + "grad_norm": NaN, + "learning_rate": 0.0001795314910519897, + "loss": 0.0, + "step": 29221 + }, + { + "epoch": 2.7266959037043947, + "grad_norm": NaN, + "learning_rate": 0.00017952407526632233, + "loss": 0.0, + "step": 29222 + }, + { + "epoch": 2.726789213399272, + "grad_norm": NaN, + "learning_rate": 0.00017951665940558394, + "loss": 0.0, + "step": 29223 + }, + { + "epoch": 2.7268825230941496, + "grad_norm": NaN, + "learning_rate": 0.00017950924346979346, + "loss": 0.0, + "step": 29224 + }, + { + "epoch": 2.7269758327890266, + "grad_norm": NaN, + "learning_rate": 0.00017950182745896969, + "loss": 0.0, + "step": 29225 + }, + { + "epoch": 2.727069142483904, + "grad_norm": NaN, + "learning_rate": 0.0001794944113731315, + "loss": 0.0, + "step": 29226 + }, + { + "epoch": 2.7271624521787814, + "grad_norm": NaN, + "learning_rate": 0.00017948699521229782, + "loss": 0.0, + "step": 29227 + }, + { + "epoch": 2.727255761873659, + "grad_norm": NaN, + "learning_rate": 0.0001794795789764874, + "loss": 0.0, + "step": 29228 + }, + { + "epoch": 2.7273490715685362, + "grad_norm": NaN, + "learning_rate": 0.00017947216266571916, + "loss": 0.0, + "step": 29229 + }, + { + "epoch": 2.727442381263413, + "grad_norm": NaN, + "learning_rate": 0.0001794647462800119, + "loss": 0.0, + "step": 29230 + }, + { + "epoch": 2.7275356909582906, + "grad_norm": NaN, + "learning_rate": 0.00017945732981938459, + "loss": 0.0, + "step": 29231 + }, + { + "epoch": 2.7276290006531676, + "grad_norm": NaN, + "learning_rate": 0.00017944991328385594, + "loss": 0.0, + "step": 29232 + }, + { + "epoch": 2.727722310348045, + "grad_norm": NaN, + "learning_rate": 0.00017944249667344492, + "loss": 0.0, + "step": 29233 + }, + { + "epoch": 2.7278156200429224, + "grad_norm": NaN, + "learning_rate": 0.00017943507998817036, + "loss": 0.0, + "step": 29234 + }, + { + "epoch": 2.7279089297378, + "grad_norm": NaN, + "learning_rate": 0.0001794276632280511, + "loss": 0.0, + "step": 29235 + }, + { + "epoch": 2.7280022394326773, + "grad_norm": NaN, + "learning_rate": 0.00017942024639310602, + "loss": 0.0, + "step": 29236 + }, + { + "epoch": 2.7280955491275543, + "grad_norm": NaN, + "learning_rate": 0.00017941282948335396, + "loss": 0.0, + "step": 29237 + }, + { + "epoch": 2.7281888588224317, + "grad_norm": NaN, + "learning_rate": 0.00017940541249881377, + "loss": 0.0, + "step": 29238 + }, + { + "epoch": 2.728282168517309, + "grad_norm": NaN, + "learning_rate": 0.00017939799543950435, + "loss": 0.0, + "step": 29239 + }, + { + "epoch": 2.728375478212186, + "grad_norm": NaN, + "learning_rate": 0.00017939057830544455, + "loss": 0.0, + "step": 29240 + }, + { + "epoch": 2.7284687879070635, + "grad_norm": NaN, + "learning_rate": 0.0001793831610966532, + "loss": 0.0, + "step": 29241 + }, + { + "epoch": 2.728562097601941, + "grad_norm": NaN, + "learning_rate": 0.00017937574381314916, + "loss": 0.0, + "step": 29242 + }, + { + "epoch": 2.7286554072968183, + "grad_norm": NaN, + "learning_rate": 0.00017936832645495135, + "loss": 0.0, + "step": 29243 + }, + { + "epoch": 2.7287487169916953, + "grad_norm": NaN, + "learning_rate": 0.00017936090902207852, + "loss": 0.0, + "step": 29244 + }, + { + "epoch": 2.7288420266865727, + "grad_norm": NaN, + "learning_rate": 0.00017935349151454962, + "loss": 0.0, + "step": 29245 + }, + { + "epoch": 2.72893533638145, + "grad_norm": NaN, + "learning_rate": 0.0001793460739323835, + "loss": 0.0, + "step": 29246 + }, + { + "epoch": 2.729028646076327, + "grad_norm": NaN, + "learning_rate": 0.000179338656275599, + "loss": 0.0, + "step": 29247 + }, + { + "epoch": 2.7291219557712045, + "grad_norm": NaN, + "learning_rate": 0.000179331238544215, + "loss": 0.0, + "step": 29248 + }, + { + "epoch": 2.729215265466082, + "grad_norm": NaN, + "learning_rate": 0.00017932382073825034, + "loss": 0.0, + "step": 29249 + }, + { + "epoch": 2.7293085751609594, + "grad_norm": NaN, + "learning_rate": 0.00017931640285772388, + "loss": 0.0, + "step": 29250 + }, + { + "epoch": 2.729401884855837, + "grad_norm": NaN, + "learning_rate": 0.0001793089849026545, + "loss": 0.0, + "step": 29251 + }, + { + "epoch": 2.7294951945507138, + "grad_norm": NaN, + "learning_rate": 0.00017930156687306106, + "loss": 0.0, + "step": 29252 + }, + { + "epoch": 2.729588504245591, + "grad_norm": NaN, + "learning_rate": 0.0001792941487689624, + "loss": 0.0, + "step": 29253 + }, + { + "epoch": 2.729681813940468, + "grad_norm": NaN, + "learning_rate": 0.00017928673059037738, + "loss": 0.0, + "step": 29254 + }, + { + "epoch": 2.7297751236353456, + "grad_norm": NaN, + "learning_rate": 0.0001792793123373249, + "loss": 0.0, + "step": 29255 + }, + { + "epoch": 2.729868433330223, + "grad_norm": NaN, + "learning_rate": 0.00017927189400982382, + "loss": 0.0, + "step": 29256 + }, + { + "epoch": 2.7299617430251004, + "grad_norm": NaN, + "learning_rate": 0.000179264475607893, + "loss": 0.0, + "step": 29257 + }, + { + "epoch": 2.730055052719978, + "grad_norm": NaN, + "learning_rate": 0.00017925705713155128, + "loss": 0.0, + "step": 29258 + }, + { + "epoch": 2.730148362414855, + "grad_norm": NaN, + "learning_rate": 0.0001792496385808175, + "loss": 0.0, + "step": 29259 + }, + { + "epoch": 2.7302416721097322, + "grad_norm": NaN, + "learning_rate": 0.00017924221995571055, + "loss": 0.0, + "step": 29260 + }, + { + "epoch": 2.7303349818046097, + "grad_norm": NaN, + "learning_rate": 0.0001792348012562493, + "loss": 0.0, + "step": 29261 + }, + { + "epoch": 2.7304282914994866, + "grad_norm": NaN, + "learning_rate": 0.00017922738248245262, + "loss": 0.0, + "step": 29262 + }, + { + "epoch": 2.730521601194364, + "grad_norm": NaN, + "learning_rate": 0.00017921996363433934, + "loss": 0.0, + "step": 29263 + }, + { + "epoch": 2.7306149108892415, + "grad_norm": NaN, + "learning_rate": 0.0001792125447119284, + "loss": 0.0, + "step": 29264 + }, + { + "epoch": 2.730708220584119, + "grad_norm": NaN, + "learning_rate": 0.00017920512571523858, + "loss": 0.0, + "step": 29265 + }, + { + "epoch": 2.730801530278996, + "grad_norm": NaN, + "learning_rate": 0.00017919770664428876, + "loss": 0.0, + "step": 29266 + }, + { + "epoch": 2.7308948399738733, + "grad_norm": NaN, + "learning_rate": 0.00017919028749909785, + "loss": 0.0, + "step": 29267 + }, + { + "epoch": 2.7309881496687507, + "grad_norm": NaN, + "learning_rate": 0.0001791828682796847, + "loss": 0.0, + "step": 29268 + }, + { + "epoch": 2.7310814593636277, + "grad_norm": NaN, + "learning_rate": 0.00017917544898606814, + "loss": 0.0, + "step": 29269 + }, + { + "epoch": 2.731174769058505, + "grad_norm": NaN, + "learning_rate": 0.00017916802961826702, + "loss": 0.0, + "step": 29270 + }, + { + "epoch": 2.7312680787533825, + "grad_norm": NaN, + "learning_rate": 0.0001791606101763003, + "loss": 0.0, + "step": 29271 + }, + { + "epoch": 2.73136138844826, + "grad_norm": NaN, + "learning_rate": 0.00017915319066018676, + "loss": 0.0, + "step": 29272 + }, + { + "epoch": 2.731454698143137, + "grad_norm": NaN, + "learning_rate": 0.0001791457710699453, + "loss": 0.0, + "step": 29273 + }, + { + "epoch": 2.7315480078380143, + "grad_norm": NaN, + "learning_rate": 0.00017913835140559475, + "loss": 0.0, + "step": 29274 + }, + { + "epoch": 2.7316413175328917, + "grad_norm": NaN, + "learning_rate": 0.00017913093166715404, + "loss": 0.0, + "step": 29275 + }, + { + "epoch": 2.7317346272277687, + "grad_norm": NaN, + "learning_rate": 0.00017912351185464198, + "loss": 0.0, + "step": 29276 + }, + { + "epoch": 2.731827936922646, + "grad_norm": NaN, + "learning_rate": 0.00017911609196807747, + "loss": 0.0, + "step": 29277 + }, + { + "epoch": 2.7319212466175236, + "grad_norm": NaN, + "learning_rate": 0.00017910867200747935, + "loss": 0.0, + "step": 29278 + }, + { + "epoch": 2.732014556312401, + "grad_norm": NaN, + "learning_rate": 0.0001791012519728665, + "loss": 0.0, + "step": 29279 + }, + { + "epoch": 2.7321078660072784, + "grad_norm": NaN, + "learning_rate": 0.00017909383186425778, + "loss": 0.0, + "step": 29280 + }, + { + "epoch": 2.7322011757021554, + "grad_norm": NaN, + "learning_rate": 0.00017908641168167208, + "loss": 0.0, + "step": 29281 + }, + { + "epoch": 2.732294485397033, + "grad_norm": NaN, + "learning_rate": 0.00017907899142512821, + "loss": 0.0, + "step": 29282 + }, + { + "epoch": 2.73238779509191, + "grad_norm": NaN, + "learning_rate": 0.00017907157109464517, + "loss": 0.0, + "step": 29283 + }, + { + "epoch": 2.732481104786787, + "grad_norm": NaN, + "learning_rate": 0.00017906415069024166, + "loss": 0.0, + "step": 29284 + }, + { + "epoch": 2.7325744144816646, + "grad_norm": NaN, + "learning_rate": 0.00017905673021193663, + "loss": 0.0, + "step": 29285 + }, + { + "epoch": 2.732667724176542, + "grad_norm": NaN, + "learning_rate": 0.00017904930965974896, + "loss": 0.0, + "step": 29286 + }, + { + "epoch": 2.7327610338714194, + "grad_norm": NaN, + "learning_rate": 0.00017904188903369753, + "loss": 0.0, + "step": 29287 + }, + { + "epoch": 2.7328543435662964, + "grad_norm": NaN, + "learning_rate": 0.0001790344683338011, + "loss": 0.0, + "step": 29288 + }, + { + "epoch": 2.732947653261174, + "grad_norm": NaN, + "learning_rate": 0.00017902704756007868, + "loss": 0.0, + "step": 29289 + }, + { + "epoch": 2.7330409629560513, + "grad_norm": NaN, + "learning_rate": 0.00017901962671254906, + "loss": 0.0, + "step": 29290 + }, + { + "epoch": 2.7331342726509282, + "grad_norm": NaN, + "learning_rate": 0.00017901220579123113, + "loss": 0.0, + "step": 29291 + }, + { + "epoch": 2.7332275823458057, + "grad_norm": NaN, + "learning_rate": 0.00017900478479614376, + "loss": 0.0, + "step": 29292 + }, + { + "epoch": 2.733320892040683, + "grad_norm": NaN, + "learning_rate": 0.00017899736372730578, + "loss": 0.0, + "step": 29293 + }, + { + "epoch": 2.7334142017355605, + "grad_norm": NaN, + "learning_rate": 0.00017898994258473615, + "loss": 0.0, + "step": 29294 + }, + { + "epoch": 2.7335075114304375, + "grad_norm": NaN, + "learning_rate": 0.00017898252136845365, + "loss": 0.0, + "step": 29295 + }, + { + "epoch": 2.733600821125315, + "grad_norm": NaN, + "learning_rate": 0.00017897510007847716, + "loss": 0.0, + "step": 29296 + }, + { + "epoch": 2.7336941308201923, + "grad_norm": NaN, + "learning_rate": 0.00017896767871482562, + "loss": 0.0, + "step": 29297 + }, + { + "epoch": 2.7337874405150693, + "grad_norm": NaN, + "learning_rate": 0.00017896025727751782, + "loss": 0.0, + "step": 29298 + }, + { + "epoch": 2.7338807502099467, + "grad_norm": NaN, + "learning_rate": 0.0001789528357665727, + "loss": 0.0, + "step": 29299 + }, + { + "epoch": 2.733974059904824, + "grad_norm": NaN, + "learning_rate": 0.00017894541418200904, + "loss": 0.0, + "step": 29300 + }, + { + "epoch": 2.7340673695997015, + "grad_norm": NaN, + "learning_rate": 0.00017893799252384582, + "loss": 0.0, + "step": 29301 + }, + { + "epoch": 2.734160679294579, + "grad_norm": NaN, + "learning_rate": 0.0001789305707921018, + "loss": 0.0, + "step": 29302 + }, + { + "epoch": 2.734253988989456, + "grad_norm": NaN, + "learning_rate": 0.00017892314898679598, + "loss": 0.0, + "step": 29303 + }, + { + "epoch": 2.7343472986843333, + "grad_norm": NaN, + "learning_rate": 0.00017891572710794712, + "loss": 0.0, + "step": 29304 + }, + { + "epoch": 2.7344406083792103, + "grad_norm": NaN, + "learning_rate": 0.00017890830515557412, + "loss": 0.0, + "step": 29305 + }, + { + "epoch": 2.7345339180740877, + "grad_norm": NaN, + "learning_rate": 0.00017890088312969588, + "loss": 0.0, + "step": 29306 + }, + { + "epoch": 2.734627227768965, + "grad_norm": NaN, + "learning_rate": 0.00017889346103033127, + "loss": 0.0, + "step": 29307 + }, + { + "epoch": 2.7347205374638426, + "grad_norm": NaN, + "learning_rate": 0.00017888603885749912, + "loss": 0.0, + "step": 29308 + }, + { + "epoch": 2.73481384715872, + "grad_norm": NaN, + "learning_rate": 0.00017887861661121835, + "loss": 0.0, + "step": 29309 + }, + { + "epoch": 2.734907156853597, + "grad_norm": NaN, + "learning_rate": 0.00017887119429150778, + "loss": 0.0, + "step": 29310 + }, + { + "epoch": 2.7350004665484744, + "grad_norm": NaN, + "learning_rate": 0.00017886377189838635, + "loss": 0.0, + "step": 29311 + }, + { + "epoch": 2.735093776243352, + "grad_norm": NaN, + "learning_rate": 0.0001788563494318729, + "loss": 0.0, + "step": 29312 + }, + { + "epoch": 2.735187085938229, + "grad_norm": NaN, + "learning_rate": 0.0001788489268919863, + "loss": 0.0, + "step": 29313 + }, + { + "epoch": 2.735280395633106, + "grad_norm": NaN, + "learning_rate": 0.00017884150427874537, + "loss": 0.0, + "step": 29314 + }, + { + "epoch": 2.7353737053279836, + "grad_norm": NaN, + "learning_rate": 0.00017883408159216909, + "loss": 0.0, + "step": 29315 + }, + { + "epoch": 2.735467015022861, + "grad_norm": NaN, + "learning_rate": 0.0001788266588322763, + "loss": 0.0, + "step": 29316 + }, + { + "epoch": 2.735560324717738, + "grad_norm": NaN, + "learning_rate": 0.00017881923599908582, + "loss": 0.0, + "step": 29317 + }, + { + "epoch": 2.7356536344126154, + "grad_norm": NaN, + "learning_rate": 0.00017881181309261656, + "loss": 0.0, + "step": 29318 + }, + { + "epoch": 2.735746944107493, + "grad_norm": NaN, + "learning_rate": 0.0001788043901128874, + "loss": 0.0, + "step": 29319 + }, + { + "epoch": 2.73584025380237, + "grad_norm": NaN, + "learning_rate": 0.0001787969670599172, + "loss": 0.0, + "step": 29320 + }, + { + "epoch": 2.7359335634972473, + "grad_norm": NaN, + "learning_rate": 0.00017878954393372486, + "loss": 0.0, + "step": 29321 + }, + { + "epoch": 2.7360268731921247, + "grad_norm": NaN, + "learning_rate": 0.0001787821207343292, + "loss": 0.0, + "step": 29322 + }, + { + "epoch": 2.736120182887002, + "grad_norm": NaN, + "learning_rate": 0.0001787746974617492, + "loss": 0.0, + "step": 29323 + }, + { + "epoch": 2.7362134925818795, + "grad_norm": NaN, + "learning_rate": 0.0001787672741160036, + "loss": 0.0, + "step": 29324 + }, + { + "epoch": 2.7363068022767565, + "grad_norm": NaN, + "learning_rate": 0.0001787598506971114, + "loss": 0.0, + "step": 29325 + }, + { + "epoch": 2.736400111971634, + "grad_norm": NaN, + "learning_rate": 0.0001787524272050914, + "loss": 0.0, + "step": 29326 + }, + { + "epoch": 2.736493421666511, + "grad_norm": NaN, + "learning_rate": 0.00017874500363996247, + "loss": 0.0, + "step": 29327 + }, + { + "epoch": 2.7365867313613883, + "grad_norm": NaN, + "learning_rate": 0.00017873758000174354, + "loss": 0.0, + "step": 29328 + }, + { + "epoch": 2.7366800410562657, + "grad_norm": NaN, + "learning_rate": 0.00017873015629045347, + "loss": 0.0, + "step": 29329 + }, + { + "epoch": 2.736773350751143, + "grad_norm": NaN, + "learning_rate": 0.00017872273250611105, + "loss": 0.0, + "step": 29330 + }, + { + "epoch": 2.7368666604460206, + "grad_norm": NaN, + "learning_rate": 0.00017871530864873529, + "loss": 0.0, + "step": 29331 + }, + { + "epoch": 2.7369599701408975, + "grad_norm": NaN, + "learning_rate": 0.000178707884718345, + "loss": 0.0, + "step": 29332 + }, + { + "epoch": 2.737053279835775, + "grad_norm": NaN, + "learning_rate": 0.00017870046071495908, + "loss": 0.0, + "step": 29333 + }, + { + "epoch": 2.7371465895306524, + "grad_norm": NaN, + "learning_rate": 0.00017869303663859638, + "loss": 0.0, + "step": 29334 + }, + { + "epoch": 2.7372398992255293, + "grad_norm": NaN, + "learning_rate": 0.00017868561248927576, + "loss": 0.0, + "step": 29335 + }, + { + "epoch": 2.7373332089204068, + "grad_norm": NaN, + "learning_rate": 0.0001786781882670161, + "loss": 0.0, + "step": 29336 + }, + { + "epoch": 2.737426518615284, + "grad_norm": NaN, + "learning_rate": 0.0001786707639718364, + "loss": 0.0, + "step": 29337 + }, + { + "epoch": 2.7375198283101616, + "grad_norm": NaN, + "learning_rate": 0.00017866333960375538, + "loss": 0.0, + "step": 29338 + }, + { + "epoch": 2.7376131380050386, + "grad_norm": NaN, + "learning_rate": 0.00017865591516279193, + "loss": 0.0, + "step": 29339 + }, + { + "epoch": 2.737706447699916, + "grad_norm": NaN, + "learning_rate": 0.00017864849064896506, + "loss": 0.0, + "step": 29340 + }, + { + "epoch": 2.7377997573947934, + "grad_norm": NaN, + "learning_rate": 0.0001786410660622935, + "loss": 0.0, + "step": 29341 + }, + { + "epoch": 2.7378930670896704, + "grad_norm": NaN, + "learning_rate": 0.00017863364140279625, + "loss": 0.0, + "step": 29342 + }, + { + "epoch": 2.737986376784548, + "grad_norm": NaN, + "learning_rate": 0.00017862621667049213, + "loss": 0.0, + "step": 29343 + }, + { + "epoch": 2.7380796864794252, + "grad_norm": NaN, + "learning_rate": 0.00017861879186539993, + "loss": 0.0, + "step": 29344 + }, + { + "epoch": 2.7381729961743027, + "grad_norm": NaN, + "learning_rate": 0.00017861136698753873, + "loss": 0.0, + "step": 29345 + }, + { + "epoch": 2.73826630586918, + "grad_norm": NaN, + "learning_rate": 0.00017860394203692725, + "loss": 0.0, + "step": 29346 + }, + { + "epoch": 2.738359615564057, + "grad_norm": NaN, + "learning_rate": 0.00017859651701358442, + "loss": 0.0, + "step": 29347 + }, + { + "epoch": 2.7384529252589345, + "grad_norm": NaN, + "learning_rate": 0.00017858909191752914, + "loss": 0.0, + "step": 29348 + }, + { + "epoch": 2.7385462349538114, + "grad_norm": NaN, + "learning_rate": 0.00017858166674878027, + "loss": 0.0, + "step": 29349 + }, + { + "epoch": 2.738639544648689, + "grad_norm": NaN, + "learning_rate": 0.00017857424150735665, + "loss": 0.0, + "step": 29350 + }, + { + "epoch": 2.7387328543435663, + "grad_norm": NaN, + "learning_rate": 0.0001785668161932772, + "loss": 0.0, + "step": 29351 + }, + { + "epoch": 2.7388261640384437, + "grad_norm": NaN, + "learning_rate": 0.00017855939080656084, + "loss": 0.0, + "step": 29352 + }, + { + "epoch": 2.738919473733321, + "grad_norm": NaN, + "learning_rate": 0.00017855196534722636, + "loss": 0.0, + "step": 29353 + }, + { + "epoch": 2.739012783428198, + "grad_norm": NaN, + "learning_rate": 0.00017854453981529274, + "loss": 0.0, + "step": 29354 + }, + { + "epoch": 2.7391060931230755, + "grad_norm": NaN, + "learning_rate": 0.0001785371142107788, + "loss": 0.0, + "step": 29355 + }, + { + "epoch": 2.739199402817953, + "grad_norm": NaN, + "learning_rate": 0.00017852968853370338, + "loss": 0.0, + "step": 29356 + }, + { + "epoch": 2.73929271251283, + "grad_norm": NaN, + "learning_rate": 0.00017852226278408548, + "loss": 0.0, + "step": 29357 + }, + { + "epoch": 2.7393860222077073, + "grad_norm": NaN, + "learning_rate": 0.00017851483696194385, + "loss": 0.0, + "step": 29358 + }, + { + "epoch": 2.7394793319025847, + "grad_norm": NaN, + "learning_rate": 0.0001785074110672975, + "loss": 0.0, + "step": 29359 + }, + { + "epoch": 2.739572641597462, + "grad_norm": NaN, + "learning_rate": 0.00017849998510016516, + "loss": 0.0, + "step": 29360 + }, + { + "epoch": 2.739665951292339, + "grad_norm": NaN, + "learning_rate": 0.0001784925590605659, + "loss": 0.0, + "step": 29361 + }, + { + "epoch": 2.7397592609872166, + "grad_norm": NaN, + "learning_rate": 0.00017848513294851847, + "loss": 0.0, + "step": 29362 + }, + { + "epoch": 2.739852570682094, + "grad_norm": NaN, + "learning_rate": 0.00017847770676404176, + "loss": 0.0, + "step": 29363 + }, + { + "epoch": 2.739945880376971, + "grad_norm": NaN, + "learning_rate": 0.0001784702805071547, + "loss": 0.0, + "step": 29364 + }, + { + "epoch": 2.7400391900718484, + "grad_norm": NaN, + "learning_rate": 0.00017846285417787616, + "loss": 0.0, + "step": 29365 + }, + { + "epoch": 2.740132499766726, + "grad_norm": NaN, + "learning_rate": 0.00017845542777622495, + "loss": 0.0, + "step": 29366 + }, + { + "epoch": 2.740225809461603, + "grad_norm": NaN, + "learning_rate": 0.0001784480013022201, + "loss": 0.0, + "step": 29367 + }, + { + "epoch": 2.7403191191564806, + "grad_norm": NaN, + "learning_rate": 0.00017844057475588037, + "loss": 0.0, + "step": 29368 + }, + { + "epoch": 2.7404124288513576, + "grad_norm": NaN, + "learning_rate": 0.00017843314813722466, + "loss": 0.0, + "step": 29369 + }, + { + "epoch": 2.740505738546235, + "grad_norm": NaN, + "learning_rate": 0.00017842572144627193, + "loss": 0.0, + "step": 29370 + }, + { + "epoch": 2.740599048241112, + "grad_norm": NaN, + "learning_rate": 0.000178418294683041, + "loss": 0.0, + "step": 29371 + }, + { + "epoch": 2.7406923579359894, + "grad_norm": NaN, + "learning_rate": 0.00017841086784755073, + "loss": 0.0, + "step": 29372 + }, + { + "epoch": 2.740785667630867, + "grad_norm": NaN, + "learning_rate": 0.00017840344093982008, + "loss": 0.0, + "step": 29373 + }, + { + "epoch": 2.7408789773257443, + "grad_norm": NaN, + "learning_rate": 0.00017839601395986787, + "loss": 0.0, + "step": 29374 + }, + { + "epoch": 2.7409722870206217, + "grad_norm": NaN, + "learning_rate": 0.000178388586907713, + "loss": 0.0, + "step": 29375 + }, + { + "epoch": 2.7410655967154987, + "grad_norm": NaN, + "learning_rate": 0.00017838115978337436, + "loss": 0.0, + "step": 29376 + }, + { + "epoch": 2.741158906410376, + "grad_norm": NaN, + "learning_rate": 0.00017837373258687087, + "loss": 0.0, + "step": 29377 + }, + { + "epoch": 2.7412522161052535, + "grad_norm": NaN, + "learning_rate": 0.00017836630531822137, + "loss": 0.0, + "step": 29378 + }, + { + "epoch": 2.7413455258001305, + "grad_norm": NaN, + "learning_rate": 0.0001783588779774447, + "loss": 0.0, + "step": 29379 + }, + { + "epoch": 2.741438835495008, + "grad_norm": NaN, + "learning_rate": 0.0001783514505645599, + "loss": 0.0, + "step": 29380 + }, + { + "epoch": 2.7415321451898853, + "grad_norm": NaN, + "learning_rate": 0.0001783440230795857, + "loss": 0.0, + "step": 29381 + }, + { + "epoch": 2.7416254548847627, + "grad_norm": NaN, + "learning_rate": 0.00017833659552254105, + "loss": 0.0, + "step": 29382 + }, + { + "epoch": 2.7417187645796397, + "grad_norm": NaN, + "learning_rate": 0.00017832916789344488, + "loss": 0.0, + "step": 29383 + }, + { + "epoch": 2.741812074274517, + "grad_norm": NaN, + "learning_rate": 0.00017832174019231597, + "loss": 0.0, + "step": 29384 + }, + { + "epoch": 2.7419053839693945, + "grad_norm": NaN, + "learning_rate": 0.00017831431241917324, + "loss": 0.0, + "step": 29385 + }, + { + "epoch": 2.7419986936642715, + "grad_norm": NaN, + "learning_rate": 0.0001783068845740357, + "loss": 0.0, + "step": 29386 + }, + { + "epoch": 2.742092003359149, + "grad_norm": NaN, + "learning_rate": 0.00017829945665692206, + "loss": 0.0, + "step": 29387 + }, + { + "epoch": 2.7421853130540264, + "grad_norm": NaN, + "learning_rate": 0.00017829202866785128, + "loss": 0.0, + "step": 29388 + }, + { + "epoch": 2.7422786227489038, + "grad_norm": NaN, + "learning_rate": 0.00017828460060684233, + "loss": 0.0, + "step": 29389 + }, + { + "epoch": 2.7423719324437807, + "grad_norm": NaN, + "learning_rate": 0.00017827717247391394, + "loss": 0.0, + "step": 29390 + }, + { + "epoch": 2.742465242138658, + "grad_norm": NaN, + "learning_rate": 0.00017826974426908508, + "loss": 0.0, + "step": 29391 + }, + { + "epoch": 2.7425585518335356, + "grad_norm": NaN, + "learning_rate": 0.00017826231599237468, + "loss": 0.0, + "step": 29392 + }, + { + "epoch": 2.7426518615284126, + "grad_norm": NaN, + "learning_rate": 0.00017825488764380157, + "loss": 0.0, + "step": 29393 + }, + { + "epoch": 2.74274517122329, + "grad_norm": NaN, + "learning_rate": 0.00017824745922338464, + "loss": 0.0, + "step": 29394 + }, + { + "epoch": 2.7428384809181674, + "grad_norm": NaN, + "learning_rate": 0.0001782400307311428, + "loss": 0.0, + "step": 29395 + }, + { + "epoch": 2.742931790613045, + "grad_norm": NaN, + "learning_rate": 0.0001782326021670949, + "loss": 0.0, + "step": 29396 + }, + { + "epoch": 2.7430251003079222, + "grad_norm": NaN, + "learning_rate": 0.00017822517353125987, + "loss": 0.0, + "step": 29397 + }, + { + "epoch": 2.743118410002799, + "grad_norm": NaN, + "learning_rate": 0.00017821774482365657, + "loss": 0.0, + "step": 29398 + }, + { + "epoch": 2.7432117196976766, + "grad_norm": NaN, + "learning_rate": 0.00017821031604430397, + "loss": 0.0, + "step": 29399 + }, + { + "epoch": 2.743305029392554, + "grad_norm": NaN, + "learning_rate": 0.00017820288719322085, + "loss": 0.0, + "step": 29400 + }, + { + "epoch": 2.743398339087431, + "grad_norm": NaN, + "learning_rate": 0.00017819545827042608, + "loss": 0.0, + "step": 29401 + }, + { + "epoch": 2.7434916487823084, + "grad_norm": NaN, + "learning_rate": 0.0001781880292759387, + "loss": 0.0, + "step": 29402 + }, + { + "epoch": 2.743584958477186, + "grad_norm": NaN, + "learning_rate": 0.0001781806002097775, + "loss": 0.0, + "step": 29403 + }, + { + "epoch": 2.7436782681720633, + "grad_norm": NaN, + "learning_rate": 0.0001781731710719613, + "loss": 0.0, + "step": 29404 + }, + { + "epoch": 2.7437715778669403, + "grad_norm": NaN, + "learning_rate": 0.00017816574186250918, + "loss": 0.0, + "step": 29405 + }, + { + "epoch": 2.7438648875618177, + "grad_norm": NaN, + "learning_rate": 0.00017815831258143986, + "loss": 0.0, + "step": 29406 + }, + { + "epoch": 2.743958197256695, + "grad_norm": NaN, + "learning_rate": 0.00017815088322877226, + "loss": 0.0, + "step": 29407 + }, + { + "epoch": 2.744051506951572, + "grad_norm": NaN, + "learning_rate": 0.0001781434538045254, + "loss": 0.0, + "step": 29408 + }, + { + "epoch": 2.7441448166464495, + "grad_norm": NaN, + "learning_rate": 0.00017813602430871803, + "loss": 0.0, + "step": 29409 + }, + { + "epoch": 2.744238126341327, + "grad_norm": NaN, + "learning_rate": 0.00017812859474136905, + "loss": 0.0, + "step": 29410 + }, + { + "epoch": 2.7443314360362043, + "grad_norm": NaN, + "learning_rate": 0.00017812116510249743, + "loss": 0.0, + "step": 29411 + }, + { + "epoch": 2.7444247457310813, + "grad_norm": NaN, + "learning_rate": 0.00017811373539212202, + "loss": 0.0, + "step": 29412 + }, + { + "epoch": 2.7445180554259587, + "grad_norm": NaN, + "learning_rate": 0.00017810630561026166, + "loss": 0.0, + "step": 29413 + }, + { + "epoch": 2.744611365120836, + "grad_norm": NaN, + "learning_rate": 0.00017809887575693536, + "loss": 0.0, + "step": 29414 + }, + { + "epoch": 2.744704674815713, + "grad_norm": NaN, + "learning_rate": 0.0001780914458321619, + "loss": 0.0, + "step": 29415 + }, + { + "epoch": 2.7447979845105905, + "grad_norm": NaN, + "learning_rate": 0.00017808401583596019, + "loss": 0.0, + "step": 29416 + }, + { + "epoch": 2.744891294205468, + "grad_norm": NaN, + "learning_rate": 0.00017807658576834916, + "loss": 0.0, + "step": 29417 + }, + { + "epoch": 2.7449846039003454, + "grad_norm": NaN, + "learning_rate": 0.00017806915562934773, + "loss": 0.0, + "step": 29418 + }, + { + "epoch": 2.745077913595223, + "grad_norm": NaN, + "learning_rate": 0.00017806172541897473, + "loss": 0.0, + "step": 29419 + }, + { + "epoch": 2.7451712232900998, + "grad_norm": NaN, + "learning_rate": 0.00017805429513724903, + "loss": 0.0, + "step": 29420 + }, + { + "epoch": 2.745264532984977, + "grad_norm": NaN, + "learning_rate": 0.00017804686478418966, + "loss": 0.0, + "step": 29421 + }, + { + "epoch": 2.745357842679854, + "grad_norm": NaN, + "learning_rate": 0.00017803943435981535, + "loss": 0.0, + "step": 29422 + }, + { + "epoch": 2.7454511523747316, + "grad_norm": NaN, + "learning_rate": 0.00017803200386414504, + "loss": 0.0, + "step": 29423 + }, + { + "epoch": 2.745544462069609, + "grad_norm": NaN, + "learning_rate": 0.00017802457329719773, + "loss": 0.0, + "step": 29424 + }, + { + "epoch": 2.7456377717644864, + "grad_norm": NaN, + "learning_rate": 0.0001780171426589922, + "loss": 0.0, + "step": 29425 + }, + { + "epoch": 2.745731081459364, + "grad_norm": NaN, + "learning_rate": 0.00017800971194954733, + "loss": 0.0, + "step": 29426 + }, + { + "epoch": 2.745824391154241, + "grad_norm": NaN, + "learning_rate": 0.00017800228116888212, + "loss": 0.0, + "step": 29427 + }, + { + "epoch": 2.7459177008491182, + "grad_norm": NaN, + "learning_rate": 0.0001779948503170154, + "loss": 0.0, + "step": 29428 + }, + { + "epoch": 2.7460110105439957, + "grad_norm": NaN, + "learning_rate": 0.000177987419393966, + "loss": 0.0, + "step": 29429 + }, + { + "epoch": 2.7461043202388726, + "grad_norm": NaN, + "learning_rate": 0.00017797998839975298, + "loss": 0.0, + "step": 29430 + }, + { + "epoch": 2.74619762993375, + "grad_norm": NaN, + "learning_rate": 0.0001779725573343951, + "loss": 0.0, + "step": 29431 + }, + { + "epoch": 2.7462909396286275, + "grad_norm": NaN, + "learning_rate": 0.00017796512619791123, + "loss": 0.0, + "step": 29432 + }, + { + "epoch": 2.746384249323505, + "grad_norm": NaN, + "learning_rate": 0.0001779576949903204, + "loss": 0.0, + "step": 29433 + }, + { + "epoch": 2.746477559018382, + "grad_norm": NaN, + "learning_rate": 0.00017795026371164143, + "loss": 0.0, + "step": 29434 + }, + { + "epoch": 2.7465708687132593, + "grad_norm": NaN, + "learning_rate": 0.00017794283236189317, + "loss": 0.0, + "step": 29435 + }, + { + "epoch": 2.7466641784081367, + "grad_norm": NaN, + "learning_rate": 0.00017793540094109458, + "loss": 0.0, + "step": 29436 + }, + { + "epoch": 2.7467574881030137, + "grad_norm": NaN, + "learning_rate": 0.0001779279694492646, + "loss": 0.0, + "step": 29437 + }, + { + "epoch": 2.746850797797891, + "grad_norm": NaN, + "learning_rate": 0.00017792053788642204, + "loss": 0.0, + "step": 29438 + }, + { + "epoch": 2.7469441074927685, + "grad_norm": NaN, + "learning_rate": 0.00017791310625258578, + "loss": 0.0, + "step": 29439 + }, + { + "epoch": 2.747037417187646, + "grad_norm": NaN, + "learning_rate": 0.0001779056745477748, + "loss": 0.0, + "step": 29440 + }, + { + "epoch": 2.7471307268825234, + "grad_norm": NaN, + "learning_rate": 0.00017789824277200796, + "loss": 0.0, + "step": 29441 + }, + { + "epoch": 2.7472240365774003, + "grad_norm": NaN, + "learning_rate": 0.00017789081092530409, + "loss": 0.0, + "step": 29442 + }, + { + "epoch": 2.7473173462722777, + "grad_norm": NaN, + "learning_rate": 0.00017788337900768224, + "loss": 0.0, + "step": 29443 + }, + { + "epoch": 2.7474106559671547, + "grad_norm": NaN, + "learning_rate": 0.00017787594701916117, + "loss": 0.0, + "step": 29444 + }, + { + "epoch": 2.747503965662032, + "grad_norm": NaN, + "learning_rate": 0.0001778685149597598, + "loss": 0.0, + "step": 29445 + }, + { + "epoch": 2.7475972753569096, + "grad_norm": NaN, + "learning_rate": 0.0001778610828294971, + "loss": 0.0, + "step": 29446 + }, + { + "epoch": 2.747690585051787, + "grad_norm": NaN, + "learning_rate": 0.0001778536506283919, + "loss": 0.0, + "step": 29447 + }, + { + "epoch": 2.7477838947466644, + "grad_norm": NaN, + "learning_rate": 0.0001778462183564631, + "loss": 0.0, + "step": 29448 + }, + { + "epoch": 2.7478772044415414, + "grad_norm": NaN, + "learning_rate": 0.00017783878601372965, + "loss": 0.0, + "step": 29449 + }, + { + "epoch": 2.747970514136419, + "grad_norm": NaN, + "learning_rate": 0.0001778313536002104, + "loss": 0.0, + "step": 29450 + }, + { + "epoch": 2.748063823831296, + "grad_norm": NaN, + "learning_rate": 0.0001778239211159242, + "loss": 0.0, + "step": 29451 + }, + { + "epoch": 2.748157133526173, + "grad_norm": NaN, + "learning_rate": 0.00017781648856089011, + "loss": 0.0, + "step": 29452 + }, + { + "epoch": 2.7482504432210506, + "grad_norm": NaN, + "learning_rate": 0.0001778090559351269, + "loss": 0.0, + "step": 29453 + }, + { + "epoch": 2.748343752915928, + "grad_norm": NaN, + "learning_rate": 0.00017780162323865345, + "loss": 0.0, + "step": 29454 + }, + { + "epoch": 2.7484370626108054, + "grad_norm": NaN, + "learning_rate": 0.00017779419047148873, + "loss": 0.0, + "step": 29455 + }, + { + "epoch": 2.7485303723056824, + "grad_norm": NaN, + "learning_rate": 0.00017778675763365168, + "loss": 0.0, + "step": 29456 + }, + { + "epoch": 2.74862368200056, + "grad_norm": NaN, + "learning_rate": 0.00017777932472516105, + "loss": 0.0, + "step": 29457 + }, + { + "epoch": 2.7487169916954373, + "grad_norm": NaN, + "learning_rate": 0.00017777189174603584, + "loss": 0.0, + "step": 29458 + }, + { + "epoch": 2.7488103013903142, + "grad_norm": NaN, + "learning_rate": 0.000177764458696295, + "loss": 0.0, + "step": 29459 + }, + { + "epoch": 2.7489036110851917, + "grad_norm": NaN, + "learning_rate": 0.00017775702557595733, + "loss": 0.0, + "step": 29460 + }, + { + "epoch": 2.748996920780069, + "grad_norm": NaN, + "learning_rate": 0.00017774959238504173, + "loss": 0.0, + "step": 29461 + }, + { + "epoch": 2.7490902304749465, + "grad_norm": NaN, + "learning_rate": 0.0001777421591235672, + "loss": 0.0, + "step": 29462 + }, + { + "epoch": 2.749183540169824, + "grad_norm": NaN, + "learning_rate": 0.00017773472579155254, + "loss": 0.0, + "step": 29463 + }, + { + "epoch": 2.749276849864701, + "grad_norm": NaN, + "learning_rate": 0.00017772729238901665, + "loss": 0.0, + "step": 29464 + }, + { + "epoch": 2.7493701595595783, + "grad_norm": NaN, + "learning_rate": 0.00017771985891597853, + "loss": 0.0, + "step": 29465 + }, + { + "epoch": 2.7494634692544553, + "grad_norm": NaN, + "learning_rate": 0.000177712425372457, + "loss": 0.0, + "step": 29466 + }, + { + "epoch": 2.7495567789493327, + "grad_norm": NaN, + "learning_rate": 0.00017770499175847097, + "loss": 0.0, + "step": 29467 + }, + { + "epoch": 2.74965008864421, + "grad_norm": NaN, + "learning_rate": 0.0001776975580740394, + "loss": 0.0, + "step": 29468 + }, + { + "epoch": 2.7497433983390875, + "grad_norm": NaN, + "learning_rate": 0.00017769012431918108, + "loss": 0.0, + "step": 29469 + }, + { + "epoch": 2.749836708033965, + "grad_norm": NaN, + "learning_rate": 0.000177682690493915, + "loss": 0.0, + "step": 29470 + }, + { + "epoch": 2.749930017728842, + "grad_norm": NaN, + "learning_rate": 0.00017767525659826005, + "loss": 0.0, + "step": 29471 + }, + { + "epoch": 2.7500233274237194, + "grad_norm": NaN, + "learning_rate": 0.00017766782263223512, + "loss": 0.0, + "step": 29472 + }, + { + "epoch": 2.7501166371185968, + "grad_norm": NaN, + "learning_rate": 0.00017766038859585908, + "loss": 0.0, + "step": 29473 + }, + { + "epoch": 2.7502099468134737, + "grad_norm": NaN, + "learning_rate": 0.0001776529544891509, + "loss": 0.0, + "step": 29474 + }, + { + "epoch": 2.750303256508351, + "grad_norm": NaN, + "learning_rate": 0.0001776455203121295, + "loss": 0.0, + "step": 29475 + }, + { + "epoch": 2.7503965662032286, + "grad_norm": NaN, + "learning_rate": 0.0001776380860648136, + "loss": 0.0, + "step": 29476 + }, + { + "epoch": 2.750489875898106, + "grad_norm": NaN, + "learning_rate": 0.00017763065174722232, + "loss": 0.0, + "step": 29477 + }, + { + "epoch": 2.750583185592983, + "grad_norm": NaN, + "learning_rate": 0.0001776232173593745, + "loss": 0.0, + "step": 29478 + }, + { + "epoch": 2.7506764952878604, + "grad_norm": NaN, + "learning_rate": 0.00017761578290128894, + "loss": 0.0, + "step": 29479 + }, + { + "epoch": 2.750769804982738, + "grad_norm": NaN, + "learning_rate": 0.00017760834837298465, + "loss": 0.0, + "step": 29480 + }, + { + "epoch": 2.750863114677615, + "grad_norm": NaN, + "learning_rate": 0.00017760091377448057, + "loss": 0.0, + "step": 29481 + }, + { + "epoch": 2.750956424372492, + "grad_norm": NaN, + "learning_rate": 0.0001775934791057955, + "loss": 0.0, + "step": 29482 + }, + { + "epoch": 2.7510497340673696, + "grad_norm": NaN, + "learning_rate": 0.00017758604436694838, + "loss": 0.0, + "step": 29483 + }, + { + "epoch": 2.751143043762247, + "grad_norm": NaN, + "learning_rate": 0.00017757860955795813, + "loss": 0.0, + "step": 29484 + }, + { + "epoch": 2.7512363534571245, + "grad_norm": NaN, + "learning_rate": 0.00017757117467884364, + "loss": 0.0, + "step": 29485 + }, + { + "epoch": 2.7513296631520014, + "grad_norm": NaN, + "learning_rate": 0.0001775637397296238, + "loss": 0.0, + "step": 29486 + }, + { + "epoch": 2.751422972846879, + "grad_norm": NaN, + "learning_rate": 0.00017755630471031757, + "loss": 0.0, + "step": 29487 + }, + { + "epoch": 2.751516282541756, + "grad_norm": NaN, + "learning_rate": 0.00017754886962094383, + "loss": 0.0, + "step": 29488 + }, + { + "epoch": 2.7516095922366333, + "grad_norm": NaN, + "learning_rate": 0.00017754143446152142, + "loss": 0.0, + "step": 29489 + }, + { + "epoch": 2.7517029019315107, + "grad_norm": NaN, + "learning_rate": 0.00017753399923206933, + "loss": 0.0, + "step": 29490 + }, + { + "epoch": 2.751796211626388, + "grad_norm": NaN, + "learning_rate": 0.00017752656393260646, + "loss": 0.0, + "step": 29491 + }, + { + "epoch": 2.7518895213212655, + "grad_norm": NaN, + "learning_rate": 0.00017751912856315164, + "loss": 0.0, + "step": 29492 + }, + { + "epoch": 2.7519828310161425, + "grad_norm": NaN, + "learning_rate": 0.00017751169312372385, + "loss": 0.0, + "step": 29493 + }, + { + "epoch": 2.75207614071102, + "grad_norm": NaN, + "learning_rate": 0.000177504257614342, + "loss": 0.0, + "step": 29494 + }, + { + "epoch": 2.7521694504058973, + "grad_norm": NaN, + "learning_rate": 0.00017749682203502495, + "loss": 0.0, + "step": 29495 + }, + { + "epoch": 2.7522627601007743, + "grad_norm": NaN, + "learning_rate": 0.0001774893863857916, + "loss": 0.0, + "step": 29496 + }, + { + "epoch": 2.7523560697956517, + "grad_norm": NaN, + "learning_rate": 0.00017748195066666096, + "loss": 0.0, + "step": 29497 + }, + { + "epoch": 2.752449379490529, + "grad_norm": NaN, + "learning_rate": 0.00017747451487765177, + "loss": 0.0, + "step": 29498 + }, + { + "epoch": 2.7525426891854066, + "grad_norm": NaN, + "learning_rate": 0.00017746707901878308, + "loss": 0.0, + "step": 29499 + }, + { + "epoch": 2.7526359988802835, + "grad_norm": NaN, + "learning_rate": 0.00017745964309007374, + "loss": 0.0, + "step": 29500 + }, + { + "epoch": 2.752729308575161, + "grad_norm": NaN, + "learning_rate": 0.00017745220709154265, + "loss": 0.0, + "step": 29501 + }, + { + "epoch": 2.7528226182700384, + "grad_norm": NaN, + "learning_rate": 0.00017744477102320873, + "loss": 0.0, + "step": 29502 + }, + { + "epoch": 2.7529159279649154, + "grad_norm": NaN, + "learning_rate": 0.00017743733488509092, + "loss": 0.0, + "step": 29503 + }, + { + "epoch": 2.7530092376597928, + "grad_norm": NaN, + "learning_rate": 0.00017742989867720806, + "loss": 0.0, + "step": 29504 + }, + { + "epoch": 2.75310254735467, + "grad_norm": NaN, + "learning_rate": 0.00017742246239957907, + "loss": 0.0, + "step": 29505 + }, + { + "epoch": 2.7531958570495476, + "grad_norm": NaN, + "learning_rate": 0.00017741502605222295, + "loss": 0.0, + "step": 29506 + }, + { + "epoch": 2.7532891667444246, + "grad_norm": NaN, + "learning_rate": 0.00017740758963515852, + "loss": 0.0, + "step": 29507 + }, + { + "epoch": 2.753382476439302, + "grad_norm": NaN, + "learning_rate": 0.00017740015314840463, + "loss": 0.0, + "step": 29508 + }, + { + "epoch": 2.7534757861341794, + "grad_norm": NaN, + "learning_rate": 0.00017739271659198035, + "loss": 0.0, + "step": 29509 + }, + { + "epoch": 2.7535690958290564, + "grad_norm": NaN, + "learning_rate": 0.0001773852799659045, + "loss": 0.0, + "step": 29510 + }, + { + "epoch": 2.753662405523934, + "grad_norm": NaN, + "learning_rate": 0.00017737784327019596, + "loss": 0.0, + "step": 29511 + }, + { + "epoch": 2.7537557152188112, + "grad_norm": NaN, + "learning_rate": 0.00017737040650487367, + "loss": 0.0, + "step": 29512 + }, + { + "epoch": 2.7538490249136887, + "grad_norm": NaN, + "learning_rate": 0.0001773629696699566, + "loss": 0.0, + "step": 29513 + }, + { + "epoch": 2.753942334608566, + "grad_norm": NaN, + "learning_rate": 0.00017735553276546355, + "loss": 0.0, + "step": 29514 + }, + { + "epoch": 2.754035644303443, + "grad_norm": NaN, + "learning_rate": 0.00017734809579141348, + "loss": 0.0, + "step": 29515 + }, + { + "epoch": 2.7541289539983205, + "grad_norm": NaN, + "learning_rate": 0.00017734065874782536, + "loss": 0.0, + "step": 29516 + }, + { + "epoch": 2.7542222636931974, + "grad_norm": NaN, + "learning_rate": 0.00017733322163471796, + "loss": 0.0, + "step": 29517 + }, + { + "epoch": 2.754315573388075, + "grad_norm": NaN, + "learning_rate": 0.00017732578445211035, + "loss": 0.0, + "step": 29518 + }, + { + "epoch": 2.7544088830829523, + "grad_norm": NaN, + "learning_rate": 0.00017731834720002135, + "loss": 0.0, + "step": 29519 + }, + { + "epoch": 2.7545021927778297, + "grad_norm": NaN, + "learning_rate": 0.00017731090987846987, + "loss": 0.0, + "step": 29520 + }, + { + "epoch": 2.754595502472707, + "grad_norm": NaN, + "learning_rate": 0.00017730347248747484, + "loss": 0.0, + "step": 29521 + }, + { + "epoch": 2.754688812167584, + "grad_norm": NaN, + "learning_rate": 0.0001772960350270552, + "loss": 0.0, + "step": 29522 + }, + { + "epoch": 2.7547821218624615, + "grad_norm": NaN, + "learning_rate": 0.00017728859749722975, + "loss": 0.0, + "step": 29523 + }, + { + "epoch": 2.754875431557339, + "grad_norm": NaN, + "learning_rate": 0.00017728115989801754, + "loss": 0.0, + "step": 29524 + }, + { + "epoch": 2.754968741252216, + "grad_norm": NaN, + "learning_rate": 0.00017727372222943742, + "loss": 0.0, + "step": 29525 + }, + { + "epoch": 2.7550620509470933, + "grad_norm": NaN, + "learning_rate": 0.00017726628449150827, + "loss": 0.0, + "step": 29526 + }, + { + "epoch": 2.7551553606419708, + "grad_norm": NaN, + "learning_rate": 0.00017725884668424904, + "loss": 0.0, + "step": 29527 + }, + { + "epoch": 2.755248670336848, + "grad_norm": NaN, + "learning_rate": 0.00017725140880767867, + "loss": 0.0, + "step": 29528 + }, + { + "epoch": 2.755341980031725, + "grad_norm": NaN, + "learning_rate": 0.00017724397086181604, + "loss": 0.0, + "step": 29529 + }, + { + "epoch": 2.7554352897266026, + "grad_norm": NaN, + "learning_rate": 0.00017723653284668002, + "loss": 0.0, + "step": 29530 + }, + { + "epoch": 2.75552859942148, + "grad_norm": NaN, + "learning_rate": 0.0001772290947622896, + "loss": 0.0, + "step": 29531 + }, + { + "epoch": 2.755621909116357, + "grad_norm": NaN, + "learning_rate": 0.0001772216566086637, + "loss": 0.0, + "step": 29532 + }, + { + "epoch": 2.7557152188112344, + "grad_norm": NaN, + "learning_rate": 0.0001772142183858211, + "loss": 0.0, + "step": 29533 + }, + { + "epoch": 2.755808528506112, + "grad_norm": NaN, + "learning_rate": 0.00017720678009378084, + "loss": 0.0, + "step": 29534 + }, + { + "epoch": 2.755901838200989, + "grad_norm": NaN, + "learning_rate": 0.00017719934173256183, + "loss": 0.0, + "step": 29535 + }, + { + "epoch": 2.7559951478958666, + "grad_norm": NaN, + "learning_rate": 0.00017719190330218288, + "loss": 0.0, + "step": 29536 + }, + { + "epoch": 2.7560884575907436, + "grad_norm": NaN, + "learning_rate": 0.00017718446480266304, + "loss": 0.0, + "step": 29537 + }, + { + "epoch": 2.756181767285621, + "grad_norm": NaN, + "learning_rate": 0.00017717702623402116, + "loss": 0.0, + "step": 29538 + }, + { + "epoch": 2.756275076980498, + "grad_norm": NaN, + "learning_rate": 0.00017716958759627608, + "loss": 0.0, + "step": 29539 + }, + { + "epoch": 2.7563683866753754, + "grad_norm": NaN, + "learning_rate": 0.00017716214888944686, + "loss": 0.0, + "step": 29540 + }, + { + "epoch": 2.756461696370253, + "grad_norm": NaN, + "learning_rate": 0.00017715471011355236, + "loss": 0.0, + "step": 29541 + }, + { + "epoch": 2.7565550060651303, + "grad_norm": NaN, + "learning_rate": 0.0001771472712686114, + "loss": 0.0, + "step": 29542 + }, + { + "epoch": 2.7566483157600077, + "grad_norm": NaN, + "learning_rate": 0.000177139832354643, + "loss": 0.0, + "step": 29543 + }, + { + "epoch": 2.7567416254548847, + "grad_norm": NaN, + "learning_rate": 0.00017713239337166604, + "loss": 0.0, + "step": 29544 + }, + { + "epoch": 2.756834935149762, + "grad_norm": NaN, + "learning_rate": 0.00017712495431969946, + "loss": 0.0, + "step": 29545 + }, + { + "epoch": 2.7569282448446395, + "grad_norm": NaN, + "learning_rate": 0.00017711751519876212, + "loss": 0.0, + "step": 29546 + }, + { + "epoch": 2.7570215545395165, + "grad_norm": NaN, + "learning_rate": 0.000177110076008873, + "loss": 0.0, + "step": 29547 + }, + { + "epoch": 2.757114864234394, + "grad_norm": NaN, + "learning_rate": 0.000177102636750051, + "loss": 0.0, + "step": 29548 + }, + { + "epoch": 2.7572081739292713, + "grad_norm": NaN, + "learning_rate": 0.00017709519742231498, + "loss": 0.0, + "step": 29549 + }, + { + "epoch": 2.7573014836241487, + "grad_norm": NaN, + "learning_rate": 0.00017708775802568392, + "loss": 0.0, + "step": 29550 + }, + { + "epoch": 2.7573947933190257, + "grad_norm": NaN, + "learning_rate": 0.00017708031856017678, + "loss": 0.0, + "step": 29551 + }, + { + "epoch": 2.757488103013903, + "grad_norm": NaN, + "learning_rate": 0.00017707287902581233, + "loss": 0.0, + "step": 29552 + }, + { + "epoch": 2.7575814127087805, + "grad_norm": NaN, + "learning_rate": 0.00017706543942260961, + "loss": 0.0, + "step": 29553 + }, + { + "epoch": 2.7576747224036575, + "grad_norm": NaN, + "learning_rate": 0.0001770579997505875, + "loss": 0.0, + "step": 29554 + }, + { + "epoch": 2.757768032098535, + "grad_norm": NaN, + "learning_rate": 0.00017705056000976482, + "loss": 0.0, + "step": 29555 + }, + { + "epoch": 2.7578613417934124, + "grad_norm": NaN, + "learning_rate": 0.0001770431202001607, + "loss": 0.0, + "step": 29556 + }, + { + "epoch": 2.7579546514882898, + "grad_norm": NaN, + "learning_rate": 0.0001770356803217939, + "loss": 0.0, + "step": 29557 + }, + { + "epoch": 2.758047961183167, + "grad_norm": NaN, + "learning_rate": 0.00017702824037468332, + "loss": 0.0, + "step": 29558 + }, + { + "epoch": 2.758141270878044, + "grad_norm": NaN, + "learning_rate": 0.00017702080035884797, + "loss": 0.0, + "step": 29559 + }, + { + "epoch": 2.7582345805729216, + "grad_norm": NaN, + "learning_rate": 0.00017701336027430676, + "loss": 0.0, + "step": 29560 + }, + { + "epoch": 2.7583278902677986, + "grad_norm": NaN, + "learning_rate": 0.0001770059201210785, + "loss": 0.0, + "step": 29561 + }, + { + "epoch": 2.758421199962676, + "grad_norm": NaN, + "learning_rate": 0.00017699847989918226, + "loss": 0.0, + "step": 29562 + }, + { + "epoch": 2.7585145096575534, + "grad_norm": NaN, + "learning_rate": 0.00017699103960863686, + "loss": 0.0, + "step": 29563 + }, + { + "epoch": 2.758607819352431, + "grad_norm": NaN, + "learning_rate": 0.00017698359924946122, + "loss": 0.0, + "step": 29564 + }, + { + "epoch": 2.7587011290473082, + "grad_norm": NaN, + "learning_rate": 0.00017697615882167428, + "loss": 0.0, + "step": 29565 + }, + { + "epoch": 2.758794438742185, + "grad_norm": NaN, + "learning_rate": 0.00017696871832529497, + "loss": 0.0, + "step": 29566 + }, + { + "epoch": 2.7588877484370626, + "grad_norm": NaN, + "learning_rate": 0.0001769612777603422, + "loss": 0.0, + "step": 29567 + }, + { + "epoch": 2.75898105813194, + "grad_norm": NaN, + "learning_rate": 0.00017695383712683488, + "loss": 0.0, + "step": 29568 + }, + { + "epoch": 2.759074367826817, + "grad_norm": NaN, + "learning_rate": 0.00017694639642479198, + "loss": 0.0, + "step": 29569 + }, + { + "epoch": 2.7591676775216945, + "grad_norm": NaN, + "learning_rate": 0.0001769389556542323, + "loss": 0.0, + "step": 29570 + }, + { + "epoch": 2.759260987216572, + "grad_norm": NaN, + "learning_rate": 0.0001769315148151749, + "loss": 0.0, + "step": 29571 + }, + { + "epoch": 2.7593542969114493, + "grad_norm": NaN, + "learning_rate": 0.00017692407390763862, + "loss": 0.0, + "step": 29572 + }, + { + "epoch": 2.7594476066063263, + "grad_norm": NaN, + "learning_rate": 0.00017691663293164242, + "loss": 0.0, + "step": 29573 + }, + { + "epoch": 2.7595409163012037, + "grad_norm": NaN, + "learning_rate": 0.00017690919188720515, + "loss": 0.0, + "step": 29574 + }, + { + "epoch": 2.759634225996081, + "grad_norm": NaN, + "learning_rate": 0.00017690175077434577, + "loss": 0.0, + "step": 29575 + }, + { + "epoch": 2.759727535690958, + "grad_norm": NaN, + "learning_rate": 0.00017689430959308324, + "loss": 0.0, + "step": 29576 + }, + { + "epoch": 2.7598208453858355, + "grad_norm": NaN, + "learning_rate": 0.00017688686834343642, + "loss": 0.0, + "step": 29577 + }, + { + "epoch": 2.759914155080713, + "grad_norm": NaN, + "learning_rate": 0.00017687942702542426, + "loss": 0.0, + "step": 29578 + }, + { + "epoch": 2.7600074647755903, + "grad_norm": NaN, + "learning_rate": 0.00017687198563906572, + "loss": 0.0, + "step": 29579 + }, + { + "epoch": 2.7601007744704678, + "grad_norm": NaN, + "learning_rate": 0.00017686454418437965, + "loss": 0.0, + "step": 29580 + }, + { + "epoch": 2.7601940841653447, + "grad_norm": NaN, + "learning_rate": 0.000176857102661385, + "loss": 0.0, + "step": 29581 + }, + { + "epoch": 2.760287393860222, + "grad_norm": NaN, + "learning_rate": 0.0001768496610701007, + "loss": 0.0, + "step": 29582 + }, + { + "epoch": 2.760380703555099, + "grad_norm": NaN, + "learning_rate": 0.00017684221941054567, + "loss": 0.0, + "step": 29583 + }, + { + "epoch": 2.7604740132499765, + "grad_norm": NaN, + "learning_rate": 0.0001768347776827388, + "loss": 0.0, + "step": 29584 + }, + { + "epoch": 2.760567322944854, + "grad_norm": NaN, + "learning_rate": 0.00017682733588669905, + "loss": 0.0, + "step": 29585 + }, + { + "epoch": 2.7606606326397314, + "grad_norm": NaN, + "learning_rate": 0.00017681989402244536, + "loss": 0.0, + "step": 29586 + }, + { + "epoch": 2.760753942334609, + "grad_norm": NaN, + "learning_rate": 0.0001768124520899966, + "loss": 0.0, + "step": 29587 + }, + { + "epoch": 2.7608472520294858, + "grad_norm": NaN, + "learning_rate": 0.00017680501008937173, + "loss": 0.0, + "step": 29588 + }, + { + "epoch": 2.760940561724363, + "grad_norm": NaN, + "learning_rate": 0.00017679756802058965, + "loss": 0.0, + "step": 29589 + }, + { + "epoch": 2.7610338714192406, + "grad_norm": NaN, + "learning_rate": 0.00017679012588366929, + "loss": 0.0, + "step": 29590 + }, + { + "epoch": 2.7611271811141176, + "grad_norm": NaN, + "learning_rate": 0.00017678268367862958, + "loss": 0.0, + "step": 29591 + }, + { + "epoch": 2.761220490808995, + "grad_norm": NaN, + "learning_rate": 0.00017677524140548946, + "loss": 0.0, + "step": 29592 + }, + { + "epoch": 2.7613138005038724, + "grad_norm": NaN, + "learning_rate": 0.0001767677990642678, + "loss": 0.0, + "step": 29593 + }, + { + "epoch": 2.76140711019875, + "grad_norm": NaN, + "learning_rate": 0.00017676035665498358, + "loss": 0.0, + "step": 29594 + }, + { + "epoch": 2.761500419893627, + "grad_norm": NaN, + "learning_rate": 0.00017675291417765569, + "loss": 0.0, + "step": 29595 + }, + { + "epoch": 2.7615937295885042, + "grad_norm": NaN, + "learning_rate": 0.00017674547163230305, + "loss": 0.0, + "step": 29596 + }, + { + "epoch": 2.7616870392833817, + "grad_norm": NaN, + "learning_rate": 0.0001767380290189446, + "loss": 0.0, + "step": 29597 + }, + { + "epoch": 2.7617803489782586, + "grad_norm": NaN, + "learning_rate": 0.00017673058633759932, + "loss": 0.0, + "step": 29598 + }, + { + "epoch": 2.761873658673136, + "grad_norm": NaN, + "learning_rate": 0.00017672314358828598, + "loss": 0.0, + "step": 29599 + }, + { + "epoch": 2.7619669683680135, + "grad_norm": NaN, + "learning_rate": 0.00017671570077102368, + "loss": 0.0, + "step": 29600 + }, + { + "epoch": 2.762060278062891, + "grad_norm": NaN, + "learning_rate": 0.00017670825788583126, + "loss": 0.0, + "step": 29601 + }, + { + "epoch": 2.762153587757768, + "grad_norm": NaN, + "learning_rate": 0.0001767008149327276, + "loss": 0.0, + "step": 29602 + }, + { + "epoch": 2.7622468974526453, + "grad_norm": NaN, + "learning_rate": 0.0001766933719117317, + "loss": 0.0, + "step": 29603 + }, + { + "epoch": 2.7623402071475227, + "grad_norm": NaN, + "learning_rate": 0.00017668592882286247, + "loss": 0.0, + "step": 29604 + }, + { + "epoch": 2.7624335168423997, + "grad_norm": NaN, + "learning_rate": 0.00017667848566613883, + "loss": 0.0, + "step": 29605 + }, + { + "epoch": 2.762526826537277, + "grad_norm": NaN, + "learning_rate": 0.0001766710424415797, + "loss": 0.0, + "step": 29606 + }, + { + "epoch": 2.7626201362321545, + "grad_norm": NaN, + "learning_rate": 0.00017666359914920402, + "loss": 0.0, + "step": 29607 + }, + { + "epoch": 2.762713445927032, + "grad_norm": NaN, + "learning_rate": 0.0001766561557890307, + "loss": 0.0, + "step": 29608 + }, + { + "epoch": 2.7628067556219094, + "grad_norm": NaN, + "learning_rate": 0.00017664871236107866, + "loss": 0.0, + "step": 29609 + }, + { + "epoch": 2.7629000653167863, + "grad_norm": NaN, + "learning_rate": 0.00017664126886536685, + "loss": 0.0, + "step": 29610 + }, + { + "epoch": 2.7629933750116638, + "grad_norm": NaN, + "learning_rate": 0.00017663382530191417, + "loss": 0.0, + "step": 29611 + }, + { + "epoch": 2.763086684706541, + "grad_norm": NaN, + "learning_rate": 0.00017662638167073957, + "loss": 0.0, + "step": 29612 + }, + { + "epoch": 2.763179994401418, + "grad_norm": NaN, + "learning_rate": 0.000176618937971862, + "loss": 0.0, + "step": 29613 + }, + { + "epoch": 2.7632733040962956, + "grad_norm": NaN, + "learning_rate": 0.00017661149420530032, + "loss": 0.0, + "step": 29614 + }, + { + "epoch": 2.763366613791173, + "grad_norm": NaN, + "learning_rate": 0.0001766040503710735, + "loss": 0.0, + "step": 29615 + }, + { + "epoch": 2.7634599234860504, + "grad_norm": NaN, + "learning_rate": 0.00017659660646920046, + "loss": 0.0, + "step": 29616 + }, + { + "epoch": 2.7635532331809274, + "grad_norm": NaN, + "learning_rate": 0.00017658916249970016, + "loss": 0.0, + "step": 29617 + }, + { + "epoch": 2.763646542875805, + "grad_norm": NaN, + "learning_rate": 0.00017658171846259146, + "loss": 0.0, + "step": 29618 + }, + { + "epoch": 2.763739852570682, + "grad_norm": NaN, + "learning_rate": 0.00017657427435789332, + "loss": 0.0, + "step": 29619 + }, + { + "epoch": 2.763833162265559, + "grad_norm": NaN, + "learning_rate": 0.00017656683018562472, + "loss": 0.0, + "step": 29620 + }, + { + "epoch": 2.7639264719604366, + "grad_norm": NaN, + "learning_rate": 0.00017655938594580446, + "loss": 0.0, + "step": 29621 + }, + { + "epoch": 2.764019781655314, + "grad_norm": NaN, + "learning_rate": 0.0001765519416384516, + "loss": 0.0, + "step": 29622 + }, + { + "epoch": 2.7641130913501915, + "grad_norm": NaN, + "learning_rate": 0.00017654449726358499, + "loss": 0.0, + "step": 29623 + }, + { + "epoch": 2.7642064010450684, + "grad_norm": NaN, + "learning_rate": 0.0001765370528212236, + "loss": 0.0, + "step": 29624 + }, + { + "epoch": 2.764299710739946, + "grad_norm": NaN, + "learning_rate": 0.00017652960831138637, + "loss": 0.0, + "step": 29625 + }, + { + "epoch": 2.7643930204348233, + "grad_norm": NaN, + "learning_rate": 0.0001765221637340922, + "loss": 0.0, + "step": 29626 + }, + { + "epoch": 2.7644863301297002, + "grad_norm": NaN, + "learning_rate": 0.00017651471908936, + "loss": 0.0, + "step": 29627 + }, + { + "epoch": 2.7645796398245777, + "grad_norm": NaN, + "learning_rate": 0.00017650727437720873, + "loss": 0.0, + "step": 29628 + }, + { + "epoch": 2.764672949519455, + "grad_norm": NaN, + "learning_rate": 0.00017649982959765733, + "loss": 0.0, + "step": 29629 + }, + { + "epoch": 2.7647662592143325, + "grad_norm": NaN, + "learning_rate": 0.00017649238475072468, + "loss": 0.0, + "step": 29630 + }, + { + "epoch": 2.76485956890921, + "grad_norm": NaN, + "learning_rate": 0.00017648493983642976, + "loss": 0.0, + "step": 29631 + }, + { + "epoch": 2.764952878604087, + "grad_norm": NaN, + "learning_rate": 0.0001764774948547915, + "loss": 0.0, + "step": 29632 + }, + { + "epoch": 2.7650461882989643, + "grad_norm": NaN, + "learning_rate": 0.00017647004980582878, + "loss": 0.0, + "step": 29633 + }, + { + "epoch": 2.7651394979938413, + "grad_norm": NaN, + "learning_rate": 0.0001764626046895606, + "loss": 0.0, + "step": 29634 + }, + { + "epoch": 2.7652328076887187, + "grad_norm": NaN, + "learning_rate": 0.00017645515950600585, + "loss": 0.0, + "step": 29635 + }, + { + "epoch": 2.765326117383596, + "grad_norm": NaN, + "learning_rate": 0.00017644771425518344, + "loss": 0.0, + "step": 29636 + }, + { + "epoch": 2.7654194270784735, + "grad_norm": NaN, + "learning_rate": 0.00017644026893711234, + "loss": 0.0, + "step": 29637 + }, + { + "epoch": 2.765512736773351, + "grad_norm": NaN, + "learning_rate": 0.00017643282355181148, + "loss": 0.0, + "step": 29638 + }, + { + "epoch": 2.765606046468228, + "grad_norm": NaN, + "learning_rate": 0.0001764253780992998, + "loss": 0.0, + "step": 29639 + }, + { + "epoch": 2.7656993561631054, + "grad_norm": NaN, + "learning_rate": 0.00017641793257959618, + "loss": 0.0, + "step": 29640 + }, + { + "epoch": 2.765792665857983, + "grad_norm": NaN, + "learning_rate": 0.00017641048699271956, + "loss": 0.0, + "step": 29641 + }, + { + "epoch": 2.7658859755528598, + "grad_norm": NaN, + "learning_rate": 0.00017640304133868893, + "loss": 0.0, + "step": 29642 + }, + { + "epoch": 2.765979285247737, + "grad_norm": NaN, + "learning_rate": 0.00017639559561752317, + "loss": 0.0, + "step": 29643 + }, + { + "epoch": 2.7660725949426146, + "grad_norm": NaN, + "learning_rate": 0.00017638814982924125, + "loss": 0.0, + "step": 29644 + }, + { + "epoch": 2.766165904637492, + "grad_norm": NaN, + "learning_rate": 0.00017638070397386208, + "loss": 0.0, + "step": 29645 + }, + { + "epoch": 2.766259214332369, + "grad_norm": NaN, + "learning_rate": 0.00017637325805140459, + "loss": 0.0, + "step": 29646 + }, + { + "epoch": 2.7663525240272464, + "grad_norm": NaN, + "learning_rate": 0.0001763658120618877, + "loss": 0.0, + "step": 29647 + }, + { + "epoch": 2.766445833722124, + "grad_norm": NaN, + "learning_rate": 0.00017635836600533037, + "loss": 0.0, + "step": 29648 + }, + { + "epoch": 2.766539143417001, + "grad_norm": NaN, + "learning_rate": 0.00017635091988175153, + "loss": 0.0, + "step": 29649 + }, + { + "epoch": 2.766632453111878, + "grad_norm": NaN, + "learning_rate": 0.00017634347369117009, + "loss": 0.0, + "step": 29650 + }, + { + "epoch": 2.7667257628067556, + "grad_norm": NaN, + "learning_rate": 0.000176336027433605, + "loss": 0.0, + "step": 29651 + }, + { + "epoch": 2.766819072501633, + "grad_norm": NaN, + "learning_rate": 0.00017632858110907523, + "loss": 0.0, + "step": 29652 + }, + { + "epoch": 2.7669123821965105, + "grad_norm": NaN, + "learning_rate": 0.00017632113471759963, + "loss": 0.0, + "step": 29653 + }, + { + "epoch": 2.7670056918913875, + "grad_norm": NaN, + "learning_rate": 0.0001763136882591972, + "loss": 0.0, + "step": 29654 + }, + { + "epoch": 2.767099001586265, + "grad_norm": NaN, + "learning_rate": 0.00017630624173388686, + "loss": 0.0, + "step": 29655 + }, + { + "epoch": 2.767192311281142, + "grad_norm": NaN, + "learning_rate": 0.00017629879514168754, + "loss": 0.0, + "step": 29656 + }, + { + "epoch": 2.7672856209760193, + "grad_norm": NaN, + "learning_rate": 0.0001762913484826182, + "loss": 0.0, + "step": 29657 + }, + { + "epoch": 2.7673789306708967, + "grad_norm": NaN, + "learning_rate": 0.0001762839017566977, + "loss": 0.0, + "step": 29658 + }, + { + "epoch": 2.767472240365774, + "grad_norm": NaN, + "learning_rate": 0.00017627645496394505, + "loss": 0.0, + "step": 29659 + }, + { + "epoch": 2.7675655500606515, + "grad_norm": NaN, + "learning_rate": 0.00017626900810437915, + "loss": 0.0, + "step": 29660 + }, + { + "epoch": 2.7676588597555285, + "grad_norm": NaN, + "learning_rate": 0.00017626156117801894, + "loss": 0.0, + "step": 29661 + }, + { + "epoch": 2.767752169450406, + "grad_norm": NaN, + "learning_rate": 0.00017625411418488337, + "loss": 0.0, + "step": 29662 + }, + { + "epoch": 2.7678454791452833, + "grad_norm": NaN, + "learning_rate": 0.00017624666712499134, + "loss": 0.0, + "step": 29663 + }, + { + "epoch": 2.7679387888401603, + "grad_norm": NaN, + "learning_rate": 0.00017623921999836186, + "loss": 0.0, + "step": 29664 + }, + { + "epoch": 2.7680320985350377, + "grad_norm": NaN, + "learning_rate": 0.00017623177280501378, + "loss": 0.0, + "step": 29665 + }, + { + "epoch": 2.768125408229915, + "grad_norm": NaN, + "learning_rate": 0.00017622432554496606, + "loss": 0.0, + "step": 29666 + }, + { + "epoch": 2.7682187179247926, + "grad_norm": NaN, + "learning_rate": 0.00017621687821823767, + "loss": 0.0, + "step": 29667 + }, + { + "epoch": 2.7683120276196695, + "grad_norm": NaN, + "learning_rate": 0.00017620943082484756, + "loss": 0.0, + "step": 29668 + }, + { + "epoch": 2.768405337314547, + "grad_norm": NaN, + "learning_rate": 0.0001762019833648146, + "loss": 0.0, + "step": 29669 + }, + { + "epoch": 2.7684986470094244, + "grad_norm": NaN, + "learning_rate": 0.00017619453583815774, + "loss": 0.0, + "step": 29670 + }, + { + "epoch": 2.7685919567043014, + "grad_norm": NaN, + "learning_rate": 0.00017618708824489594, + "loss": 0.0, + "step": 29671 + }, + { + "epoch": 2.768685266399179, + "grad_norm": NaN, + "learning_rate": 0.00017617964058504814, + "loss": 0.0, + "step": 29672 + }, + { + "epoch": 2.768778576094056, + "grad_norm": NaN, + "learning_rate": 0.00017617219285863324, + "loss": 0.0, + "step": 29673 + }, + { + "epoch": 2.7688718857889336, + "grad_norm": NaN, + "learning_rate": 0.00017616474506567024, + "loss": 0.0, + "step": 29674 + }, + { + "epoch": 2.768965195483811, + "grad_norm": NaN, + "learning_rate": 0.00017615729720617804, + "loss": 0.0, + "step": 29675 + }, + { + "epoch": 2.769058505178688, + "grad_norm": NaN, + "learning_rate": 0.00017614984928017557, + "loss": 0.0, + "step": 29676 + }, + { + "epoch": 2.7691518148735654, + "grad_norm": NaN, + "learning_rate": 0.0001761424012876818, + "loss": 0.0, + "step": 29677 + }, + { + "epoch": 2.7692451245684424, + "grad_norm": NaN, + "learning_rate": 0.00017613495322871562, + "loss": 0.0, + "step": 29678 + }, + { + "epoch": 2.76933843426332, + "grad_norm": NaN, + "learning_rate": 0.00017612750510329604, + "loss": 0.0, + "step": 29679 + }, + { + "epoch": 2.7694317439581972, + "grad_norm": NaN, + "learning_rate": 0.00017612005691144194, + "loss": 0.0, + "step": 29680 + }, + { + "epoch": 2.7695250536530747, + "grad_norm": NaN, + "learning_rate": 0.00017611260865317224, + "loss": 0.0, + "step": 29681 + }, + { + "epoch": 2.769618363347952, + "grad_norm": NaN, + "learning_rate": 0.00017610516032850592, + "loss": 0.0, + "step": 29682 + }, + { + "epoch": 2.769711673042829, + "grad_norm": NaN, + "learning_rate": 0.00017609771193746195, + "loss": 0.0, + "step": 29683 + }, + { + "epoch": 2.7698049827377065, + "grad_norm": NaN, + "learning_rate": 0.00017609026348005917, + "loss": 0.0, + "step": 29684 + }, + { + "epoch": 2.769898292432584, + "grad_norm": NaN, + "learning_rate": 0.0001760828149563166, + "loss": 0.0, + "step": 29685 + }, + { + "epoch": 2.769991602127461, + "grad_norm": NaN, + "learning_rate": 0.00017607536636625318, + "loss": 0.0, + "step": 29686 + }, + { + "epoch": 2.7700849118223383, + "grad_norm": NaN, + "learning_rate": 0.0001760679177098878, + "loss": 0.0, + "step": 29687 + }, + { + "epoch": 2.7701782215172157, + "grad_norm": NaN, + "learning_rate": 0.00017606046898723944, + "loss": 0.0, + "step": 29688 + }, + { + "epoch": 2.770271531212093, + "grad_norm": NaN, + "learning_rate": 0.00017605302019832705, + "loss": 0.0, + "step": 29689 + }, + { + "epoch": 2.77036484090697, + "grad_norm": NaN, + "learning_rate": 0.00017604557134316952, + "loss": 0.0, + "step": 29690 + }, + { + "epoch": 2.7704581506018475, + "grad_norm": NaN, + "learning_rate": 0.00017603812242178581, + "loss": 0.0, + "step": 29691 + }, + { + "epoch": 2.770551460296725, + "grad_norm": NaN, + "learning_rate": 0.0001760306734341949, + "loss": 0.0, + "step": 29692 + }, + { + "epoch": 2.770644769991602, + "grad_norm": NaN, + "learning_rate": 0.00017602322438041565, + "loss": 0.0, + "step": 29693 + }, + { + "epoch": 2.7707380796864793, + "grad_norm": NaN, + "learning_rate": 0.00017601577526046708, + "loss": 0.0, + "step": 29694 + }, + { + "epoch": 2.7708313893813568, + "grad_norm": NaN, + "learning_rate": 0.00017600832607436812, + "loss": 0.0, + "step": 29695 + }, + { + "epoch": 2.770924699076234, + "grad_norm": NaN, + "learning_rate": 0.00017600087682213765, + "loss": 0.0, + "step": 29696 + }, + { + "epoch": 2.7710180087711116, + "grad_norm": NaN, + "learning_rate": 0.00017599342750379467, + "loss": 0.0, + "step": 29697 + }, + { + "epoch": 2.7711113184659886, + "grad_norm": NaN, + "learning_rate": 0.0001759859781193581, + "loss": 0.0, + "step": 29698 + }, + { + "epoch": 2.771204628160866, + "grad_norm": NaN, + "learning_rate": 0.0001759785286688469, + "loss": 0.0, + "step": 29699 + }, + { + "epoch": 2.771297937855743, + "grad_norm": NaN, + "learning_rate": 0.00017597107915228, + "loss": 0.0, + "step": 29700 + }, + { + "epoch": 2.7713912475506204, + "grad_norm": NaN, + "learning_rate": 0.0001759636295696763, + "loss": 0.0, + "step": 29701 + }, + { + "epoch": 2.771484557245498, + "grad_norm": NaN, + "learning_rate": 0.00017595617992105481, + "loss": 0.0, + "step": 29702 + }, + { + "epoch": 2.7715778669403752, + "grad_norm": NaN, + "learning_rate": 0.00017594873020643442, + "loss": 0.0, + "step": 29703 + }, + { + "epoch": 2.7716711766352526, + "grad_norm": NaN, + "learning_rate": 0.00017594128042583413, + "loss": 0.0, + "step": 29704 + }, + { + "epoch": 2.7717644863301296, + "grad_norm": NaN, + "learning_rate": 0.00017593383057927284, + "loss": 0.0, + "step": 29705 + }, + { + "epoch": 2.771857796025007, + "grad_norm": NaN, + "learning_rate": 0.00017592638066676948, + "loss": 0.0, + "step": 29706 + }, + { + "epoch": 2.7719511057198845, + "grad_norm": NaN, + "learning_rate": 0.00017591893068834305, + "loss": 0.0, + "step": 29707 + }, + { + "epoch": 2.7720444154147614, + "grad_norm": NaN, + "learning_rate": 0.00017591148064401241, + "loss": 0.0, + "step": 29708 + }, + { + "epoch": 2.772137725109639, + "grad_norm": NaN, + "learning_rate": 0.00017590403053379658, + "loss": 0.0, + "step": 29709 + }, + { + "epoch": 2.7722310348045163, + "grad_norm": NaN, + "learning_rate": 0.00017589658035771448, + "loss": 0.0, + "step": 29710 + }, + { + "epoch": 2.7723243444993937, + "grad_norm": NaN, + "learning_rate": 0.00017588913011578505, + "loss": 0.0, + "step": 29711 + }, + { + "epoch": 2.7724176541942707, + "grad_norm": NaN, + "learning_rate": 0.00017588167980802722, + "loss": 0.0, + "step": 29712 + }, + { + "epoch": 2.772510963889148, + "grad_norm": NaN, + "learning_rate": 0.00017587422943445992, + "loss": 0.0, + "step": 29713 + }, + { + "epoch": 2.7726042735840255, + "grad_norm": NaN, + "learning_rate": 0.00017586677899510212, + "loss": 0.0, + "step": 29714 + }, + { + "epoch": 2.7726975832789025, + "grad_norm": NaN, + "learning_rate": 0.00017585932848997282, + "loss": 0.0, + "step": 29715 + }, + { + "epoch": 2.77279089297378, + "grad_norm": NaN, + "learning_rate": 0.00017585187791909087, + "loss": 0.0, + "step": 29716 + }, + { + "epoch": 2.7728842026686573, + "grad_norm": NaN, + "learning_rate": 0.00017584442728247522, + "loss": 0.0, + "step": 29717 + }, + { + "epoch": 2.7729775123635347, + "grad_norm": NaN, + "learning_rate": 0.0001758369765801449, + "loss": 0.0, + "step": 29718 + }, + { + "epoch": 2.7730708220584117, + "grad_norm": NaN, + "learning_rate": 0.00017582952581211877, + "loss": 0.0, + "step": 29719 + }, + { + "epoch": 2.773164131753289, + "grad_norm": NaN, + "learning_rate": 0.00017582207497841582, + "loss": 0.0, + "step": 29720 + }, + { + "epoch": 2.7732574414481665, + "grad_norm": NaN, + "learning_rate": 0.000175814624079055, + "loss": 0.0, + "step": 29721 + }, + { + "epoch": 2.7733507511430435, + "grad_norm": NaN, + "learning_rate": 0.0001758071731140552, + "loss": 0.0, + "step": 29722 + }, + { + "epoch": 2.773444060837921, + "grad_norm": NaN, + "learning_rate": 0.00017579972208343545, + "loss": 0.0, + "step": 29723 + }, + { + "epoch": 2.7735373705327984, + "grad_norm": NaN, + "learning_rate": 0.00017579227098721463, + "loss": 0.0, + "step": 29724 + }, + { + "epoch": 2.773630680227676, + "grad_norm": NaN, + "learning_rate": 0.0001757848198254117, + "loss": 0.0, + "step": 29725 + }, + { + "epoch": 2.773723989922553, + "grad_norm": NaN, + "learning_rate": 0.00017577736859804562, + "loss": 0.0, + "step": 29726 + }, + { + "epoch": 2.77381729961743, + "grad_norm": NaN, + "learning_rate": 0.0001757699173051353, + "loss": 0.0, + "step": 29727 + }, + { + "epoch": 2.7739106093123076, + "grad_norm": NaN, + "learning_rate": 0.00017576246594669972, + "loss": 0.0, + "step": 29728 + }, + { + "epoch": 2.7740039190071846, + "grad_norm": NaN, + "learning_rate": 0.00017575501452275785, + "loss": 0.0, + "step": 29729 + }, + { + "epoch": 2.774097228702062, + "grad_norm": NaN, + "learning_rate": 0.0001757475630333286, + "loss": 0.0, + "step": 29730 + }, + { + "epoch": 2.7741905383969394, + "grad_norm": NaN, + "learning_rate": 0.0001757401114784309, + "loss": 0.0, + "step": 29731 + }, + { + "epoch": 2.774283848091817, + "grad_norm": NaN, + "learning_rate": 0.00017573265985808373, + "loss": 0.0, + "step": 29732 + }, + { + "epoch": 2.7743771577866942, + "grad_norm": NaN, + "learning_rate": 0.00017572520817230606, + "loss": 0.0, + "step": 29733 + }, + { + "epoch": 2.7744704674815712, + "grad_norm": NaN, + "learning_rate": 0.00017571775642111673, + "loss": 0.0, + "step": 29734 + }, + { + "epoch": 2.7745637771764486, + "grad_norm": NaN, + "learning_rate": 0.00017571030460453486, + "loss": 0.0, + "step": 29735 + }, + { + "epoch": 2.774657086871326, + "grad_norm": NaN, + "learning_rate": 0.00017570285272257925, + "loss": 0.0, + "step": 29736 + }, + { + "epoch": 2.774750396566203, + "grad_norm": NaN, + "learning_rate": 0.0001756954007752689, + "loss": 0.0, + "step": 29737 + }, + { + "epoch": 2.7748437062610805, + "grad_norm": NaN, + "learning_rate": 0.00017568794876262272, + "loss": 0.0, + "step": 29738 + }, + { + "epoch": 2.774937015955958, + "grad_norm": NaN, + "learning_rate": 0.00017568049668465976, + "loss": 0.0, + "step": 29739 + }, + { + "epoch": 2.7750303256508353, + "grad_norm": NaN, + "learning_rate": 0.00017567304454139889, + "loss": 0.0, + "step": 29740 + }, + { + "epoch": 2.7751236353457123, + "grad_norm": NaN, + "learning_rate": 0.00017566559233285904, + "loss": 0.0, + "step": 29741 + }, + { + "epoch": 2.7752169450405897, + "grad_norm": NaN, + "learning_rate": 0.00017565814005905922, + "loss": 0.0, + "step": 29742 + }, + { + "epoch": 2.775310254735467, + "grad_norm": NaN, + "learning_rate": 0.00017565068772001834, + "loss": 0.0, + "step": 29743 + }, + { + "epoch": 2.775403564430344, + "grad_norm": NaN, + "learning_rate": 0.0001756432353157554, + "loss": 0.0, + "step": 29744 + }, + { + "epoch": 2.7754968741252215, + "grad_norm": NaN, + "learning_rate": 0.00017563578284628925, + "loss": 0.0, + "step": 29745 + }, + { + "epoch": 2.775590183820099, + "grad_norm": NaN, + "learning_rate": 0.00017562833031163893, + "loss": 0.0, + "step": 29746 + }, + { + "epoch": 2.7756834935149763, + "grad_norm": NaN, + "learning_rate": 0.00017562087771182334, + "loss": 0.0, + "step": 29747 + }, + { + "epoch": 2.7757768032098538, + "grad_norm": NaN, + "learning_rate": 0.00017561342504686145, + "loss": 0.0, + "step": 29748 + }, + { + "epoch": 2.7758701129047307, + "grad_norm": NaN, + "learning_rate": 0.0001756059723167722, + "loss": 0.0, + "step": 29749 + }, + { + "epoch": 2.775963422599608, + "grad_norm": NaN, + "learning_rate": 0.00017559851952157459, + "loss": 0.0, + "step": 29750 + }, + { + "epoch": 2.776056732294485, + "grad_norm": NaN, + "learning_rate": 0.0001755910666612875, + "loss": 0.0, + "step": 29751 + }, + { + "epoch": 2.7761500419893625, + "grad_norm": NaN, + "learning_rate": 0.0001755836137359299, + "loss": 0.0, + "step": 29752 + }, + { + "epoch": 2.77624335168424, + "grad_norm": NaN, + "learning_rate": 0.00017557616074552078, + "loss": 0.0, + "step": 29753 + }, + { + "epoch": 2.7763366613791174, + "grad_norm": NaN, + "learning_rate": 0.00017556870769007903, + "loss": 0.0, + "step": 29754 + }, + { + "epoch": 2.776429971073995, + "grad_norm": NaN, + "learning_rate": 0.00017556125456962362, + "loss": 0.0, + "step": 29755 + }, + { + "epoch": 2.776523280768872, + "grad_norm": NaN, + "learning_rate": 0.00017555380138417352, + "loss": 0.0, + "step": 29756 + }, + { + "epoch": 2.776616590463749, + "grad_norm": NaN, + "learning_rate": 0.00017554634813374766, + "loss": 0.0, + "step": 29757 + }, + { + "epoch": 2.7767099001586266, + "grad_norm": NaN, + "learning_rate": 0.00017553889481836503, + "loss": 0.0, + "step": 29758 + }, + { + "epoch": 2.7768032098535036, + "grad_norm": NaN, + "learning_rate": 0.00017553144143804455, + "loss": 0.0, + "step": 29759 + }, + { + "epoch": 2.776896519548381, + "grad_norm": NaN, + "learning_rate": 0.00017552398799280518, + "loss": 0.0, + "step": 29760 + }, + { + "epoch": 2.7769898292432584, + "grad_norm": NaN, + "learning_rate": 0.00017551653448266588, + "loss": 0.0, + "step": 29761 + }, + { + "epoch": 2.777083138938136, + "grad_norm": NaN, + "learning_rate": 0.0001755090809076456, + "loss": 0.0, + "step": 29762 + }, + { + "epoch": 2.777176448633013, + "grad_norm": NaN, + "learning_rate": 0.00017550162726776326, + "loss": 0.0, + "step": 29763 + }, + { + "epoch": 2.7772697583278902, + "grad_norm": NaN, + "learning_rate": 0.0001754941735630378, + "loss": 0.0, + "step": 29764 + }, + { + "epoch": 2.7773630680227677, + "grad_norm": NaN, + "learning_rate": 0.00017548671979348827, + "loss": 0.0, + "step": 29765 + }, + { + "epoch": 2.7774563777176446, + "grad_norm": NaN, + "learning_rate": 0.00017547926595913352, + "loss": 0.0, + "step": 29766 + }, + { + "epoch": 2.777549687412522, + "grad_norm": NaN, + "learning_rate": 0.00017547181205999256, + "loss": 0.0, + "step": 29767 + }, + { + "epoch": 2.7776429971073995, + "grad_norm": NaN, + "learning_rate": 0.0001754643580960843, + "loss": 0.0, + "step": 29768 + }, + { + "epoch": 2.777736306802277, + "grad_norm": NaN, + "learning_rate": 0.00017545690406742777, + "loss": 0.0, + "step": 29769 + }, + { + "epoch": 2.7778296164971543, + "grad_norm": NaN, + "learning_rate": 0.00017544944997404188, + "loss": 0.0, + "step": 29770 + }, + { + "epoch": 2.7779229261920313, + "grad_norm": NaN, + "learning_rate": 0.0001754419958159455, + "loss": 0.0, + "step": 29771 + }, + { + "epoch": 2.7780162358869087, + "grad_norm": NaN, + "learning_rate": 0.00017543454159315772, + "loss": 0.0, + "step": 29772 + }, + { + "epoch": 2.7781095455817857, + "grad_norm": NaN, + "learning_rate": 0.00017542708730569742, + "loss": 0.0, + "step": 29773 + }, + { + "epoch": 2.778202855276663, + "grad_norm": NaN, + "learning_rate": 0.0001754196329535836, + "loss": 0.0, + "step": 29774 + }, + { + "epoch": 2.7782961649715405, + "grad_norm": NaN, + "learning_rate": 0.00017541217853683514, + "loss": 0.0, + "step": 29775 + }, + { + "epoch": 2.778389474666418, + "grad_norm": NaN, + "learning_rate": 0.00017540472405547106, + "loss": 0.0, + "step": 29776 + }, + { + "epoch": 2.7784827843612954, + "grad_norm": NaN, + "learning_rate": 0.00017539726950951032, + "loss": 0.0, + "step": 29777 + }, + { + "epoch": 2.7785760940561723, + "grad_norm": NaN, + "learning_rate": 0.0001753898148989718, + "loss": 0.0, + "step": 29778 + }, + { + "epoch": 2.7786694037510498, + "grad_norm": NaN, + "learning_rate": 0.00017538236022387454, + "loss": 0.0, + "step": 29779 + }, + { + "epoch": 2.778762713445927, + "grad_norm": NaN, + "learning_rate": 0.0001753749054842374, + "loss": 0.0, + "step": 29780 + }, + { + "epoch": 2.778856023140804, + "grad_norm": NaN, + "learning_rate": 0.00017536745068007941, + "loss": 0.0, + "step": 29781 + }, + { + "epoch": 2.7789493328356816, + "grad_norm": NaN, + "learning_rate": 0.00017535999581141953, + "loss": 0.0, + "step": 29782 + }, + { + "epoch": 2.779042642530559, + "grad_norm": NaN, + "learning_rate": 0.0001753525408782767, + "loss": 0.0, + "step": 29783 + }, + { + "epoch": 2.7791359522254364, + "grad_norm": NaN, + "learning_rate": 0.00017534508588066985, + "loss": 0.0, + "step": 29784 + }, + { + "epoch": 2.7792292619203134, + "grad_norm": NaN, + "learning_rate": 0.00017533763081861798, + "loss": 0.0, + "step": 29785 + }, + { + "epoch": 2.779322571615191, + "grad_norm": NaN, + "learning_rate": 0.00017533017569213997, + "loss": 0.0, + "step": 29786 + }, + { + "epoch": 2.7794158813100682, + "grad_norm": NaN, + "learning_rate": 0.00017532272050125482, + "loss": 0.0, + "step": 29787 + }, + { + "epoch": 2.779509191004945, + "grad_norm": NaN, + "learning_rate": 0.00017531526524598156, + "loss": 0.0, + "step": 29788 + }, + { + "epoch": 2.7796025006998226, + "grad_norm": NaN, + "learning_rate": 0.00017530780992633908, + "loss": 0.0, + "step": 29789 + }, + { + "epoch": 2.7796958103947, + "grad_norm": NaN, + "learning_rate": 0.00017530035454234625, + "loss": 0.0, + "step": 29790 + }, + { + "epoch": 2.7797891200895775, + "grad_norm": NaN, + "learning_rate": 0.0001752928990940222, + "loss": 0.0, + "step": 29791 + }, + { + "epoch": 2.779882429784455, + "grad_norm": NaN, + "learning_rate": 0.00017528544358138575, + "loss": 0.0, + "step": 29792 + }, + { + "epoch": 2.779975739479332, + "grad_norm": NaN, + "learning_rate": 0.00017527798800445589, + "loss": 0.0, + "step": 29793 + }, + { + "epoch": 2.7800690491742093, + "grad_norm": NaN, + "learning_rate": 0.00017527053236325167, + "loss": 0.0, + "step": 29794 + }, + { + "epoch": 2.7801623588690862, + "grad_norm": NaN, + "learning_rate": 0.00017526307665779192, + "loss": 0.0, + "step": 29795 + }, + { + "epoch": 2.7802556685639637, + "grad_norm": NaN, + "learning_rate": 0.00017525562088809566, + "loss": 0.0, + "step": 29796 + }, + { + "epoch": 2.780348978258841, + "grad_norm": NaN, + "learning_rate": 0.00017524816505418186, + "loss": 0.0, + "step": 29797 + }, + { + "epoch": 2.7804422879537185, + "grad_norm": NaN, + "learning_rate": 0.00017524070915606943, + "loss": 0.0, + "step": 29798 + }, + { + "epoch": 2.780535597648596, + "grad_norm": NaN, + "learning_rate": 0.00017523325319377737, + "loss": 0.0, + "step": 29799 + }, + { + "epoch": 2.780628907343473, + "grad_norm": NaN, + "learning_rate": 0.0001752257971673246, + "loss": 0.0, + "step": 29800 + }, + { + "epoch": 2.7807222170383503, + "grad_norm": NaN, + "learning_rate": 0.00017521834107673008, + "loss": 0.0, + "step": 29801 + }, + { + "epoch": 2.7808155267332277, + "grad_norm": NaN, + "learning_rate": 0.00017521088492201284, + "loss": 0.0, + "step": 29802 + }, + { + "epoch": 2.7809088364281047, + "grad_norm": NaN, + "learning_rate": 0.00017520342870319178, + "loss": 0.0, + "step": 29803 + }, + { + "epoch": 2.781002146122982, + "grad_norm": NaN, + "learning_rate": 0.00017519597242028585, + "loss": 0.0, + "step": 29804 + }, + { + "epoch": 2.7810954558178596, + "grad_norm": NaN, + "learning_rate": 0.00017518851607331402, + "loss": 0.0, + "step": 29805 + }, + { + "epoch": 2.781188765512737, + "grad_norm": NaN, + "learning_rate": 0.0001751810596622952, + "loss": 0.0, + "step": 29806 + }, + { + "epoch": 2.781282075207614, + "grad_norm": NaN, + "learning_rate": 0.0001751736031872485, + "loss": 0.0, + "step": 29807 + }, + { + "epoch": 2.7813753849024914, + "grad_norm": NaN, + "learning_rate": 0.00017516614664819279, + "loss": 0.0, + "step": 29808 + }, + { + "epoch": 2.781468694597369, + "grad_norm": NaN, + "learning_rate": 0.0001751586900451469, + "loss": 0.0, + "step": 29809 + }, + { + "epoch": 2.7815620042922458, + "grad_norm": NaN, + "learning_rate": 0.00017515123337813004, + "loss": 0.0, + "step": 29810 + }, + { + "epoch": 2.781655313987123, + "grad_norm": NaN, + "learning_rate": 0.00017514377664716103, + "loss": 0.0, + "step": 29811 + }, + { + "epoch": 2.7817486236820006, + "grad_norm": NaN, + "learning_rate": 0.00017513631985225877, + "loss": 0.0, + "step": 29812 + }, + { + "epoch": 2.781841933376878, + "grad_norm": NaN, + "learning_rate": 0.00017512886299344237, + "loss": 0.0, + "step": 29813 + }, + { + "epoch": 2.781935243071755, + "grad_norm": NaN, + "learning_rate": 0.0001751214060707307, + "loss": 0.0, + "step": 29814 + }, + { + "epoch": 2.7820285527666324, + "grad_norm": NaN, + "learning_rate": 0.00017511394908414267, + "loss": 0.0, + "step": 29815 + }, + { + "epoch": 2.78212186246151, + "grad_norm": NaN, + "learning_rate": 0.00017510649203369738, + "loss": 0.0, + "step": 29816 + }, + { + "epoch": 2.782215172156387, + "grad_norm": NaN, + "learning_rate": 0.0001750990349194137, + "loss": 0.0, + "step": 29817 + }, + { + "epoch": 2.7823084818512642, + "grad_norm": NaN, + "learning_rate": 0.0001750915777413106, + "loss": 0.0, + "step": 29818 + }, + { + "epoch": 2.7824017915461416, + "grad_norm": NaN, + "learning_rate": 0.00017508412049940708, + "loss": 0.0, + "step": 29819 + }, + { + "epoch": 2.782495101241019, + "grad_norm": NaN, + "learning_rate": 0.00017507666319372203, + "loss": 0.0, + "step": 29820 + }, + { + "epoch": 2.7825884109358965, + "grad_norm": NaN, + "learning_rate": 0.00017506920582427448, + "loss": 0.0, + "step": 29821 + }, + { + "epoch": 2.7826817206307735, + "grad_norm": NaN, + "learning_rate": 0.00017506174839108335, + "loss": 0.0, + "step": 29822 + }, + { + "epoch": 2.782775030325651, + "grad_norm": NaN, + "learning_rate": 0.00017505429089416765, + "loss": 0.0, + "step": 29823 + }, + { + "epoch": 2.7828683400205283, + "grad_norm": NaN, + "learning_rate": 0.00017504683333354626, + "loss": 0.0, + "step": 29824 + }, + { + "epoch": 2.7829616497154053, + "grad_norm": NaN, + "learning_rate": 0.0001750393757092382, + "loss": 0.0, + "step": 29825 + }, + { + "epoch": 2.7830549594102827, + "grad_norm": NaN, + "learning_rate": 0.00017503191802126247, + "loss": 0.0, + "step": 29826 + }, + { + "epoch": 2.78314826910516, + "grad_norm": NaN, + "learning_rate": 0.00017502446026963796, + "loss": 0.0, + "step": 29827 + }, + { + "epoch": 2.7832415788000375, + "grad_norm": NaN, + "learning_rate": 0.00017501700245438362, + "loss": 0.0, + "step": 29828 + }, + { + "epoch": 2.7833348884949145, + "grad_norm": NaN, + "learning_rate": 0.00017500954457551853, + "loss": 0.0, + "step": 29829 + }, + { + "epoch": 2.783428198189792, + "grad_norm": NaN, + "learning_rate": 0.00017500208663306154, + "loss": 0.0, + "step": 29830 + }, + { + "epoch": 2.7835215078846693, + "grad_norm": NaN, + "learning_rate": 0.00017499462862703163, + "loss": 0.0, + "step": 29831 + }, + { + "epoch": 2.7836148175795463, + "grad_norm": NaN, + "learning_rate": 0.00017498717055744781, + "loss": 0.0, + "step": 29832 + }, + { + "epoch": 2.7837081272744237, + "grad_norm": NaN, + "learning_rate": 0.00017497971242432902, + "loss": 0.0, + "step": 29833 + }, + { + "epoch": 2.783801436969301, + "grad_norm": NaN, + "learning_rate": 0.00017497225422769417, + "loss": 0.0, + "step": 29834 + }, + { + "epoch": 2.7838947466641786, + "grad_norm": NaN, + "learning_rate": 0.00017496479596756234, + "loss": 0.0, + "step": 29835 + }, + { + "epoch": 2.7839880563590556, + "grad_norm": NaN, + "learning_rate": 0.0001749573376439524, + "loss": 0.0, + "step": 29836 + }, + { + "epoch": 2.784081366053933, + "grad_norm": NaN, + "learning_rate": 0.00017494987925688332, + "loss": 0.0, + "step": 29837 + }, + { + "epoch": 2.7841746757488104, + "grad_norm": NaN, + "learning_rate": 0.00017494242080637415, + "loss": 0.0, + "step": 29838 + }, + { + "epoch": 2.7842679854436874, + "grad_norm": NaN, + "learning_rate": 0.00017493496229244377, + "loss": 0.0, + "step": 29839 + }, + { + "epoch": 2.784361295138565, + "grad_norm": NaN, + "learning_rate": 0.0001749275037151111, + "loss": 0.0, + "step": 29840 + }, + { + "epoch": 2.784454604833442, + "grad_norm": NaN, + "learning_rate": 0.00017492004507439527, + "loss": 0.0, + "step": 29841 + }, + { + "epoch": 2.7845479145283196, + "grad_norm": NaN, + "learning_rate": 0.0001749125863703151, + "loss": 0.0, + "step": 29842 + }, + { + "epoch": 2.784641224223197, + "grad_norm": NaN, + "learning_rate": 0.0001749051276028896, + "loss": 0.0, + "step": 29843 + }, + { + "epoch": 2.784734533918074, + "grad_norm": NaN, + "learning_rate": 0.00017489766877213771, + "loss": 0.0, + "step": 29844 + }, + { + "epoch": 2.7848278436129514, + "grad_norm": NaN, + "learning_rate": 0.00017489020987807849, + "loss": 0.0, + "step": 29845 + }, + { + "epoch": 2.7849211533078284, + "grad_norm": NaN, + "learning_rate": 0.00017488275092073083, + "loss": 0.0, + "step": 29846 + }, + { + "epoch": 2.785014463002706, + "grad_norm": NaN, + "learning_rate": 0.00017487529190011363, + "loss": 0.0, + "step": 29847 + }, + { + "epoch": 2.7851077726975833, + "grad_norm": NaN, + "learning_rate": 0.00017486783281624602, + "loss": 0.0, + "step": 29848 + }, + { + "epoch": 2.7852010823924607, + "grad_norm": NaN, + "learning_rate": 0.00017486037366914683, + "loss": 0.0, + "step": 29849 + }, + { + "epoch": 2.785294392087338, + "grad_norm": NaN, + "learning_rate": 0.00017485291445883507, + "loss": 0.0, + "step": 29850 + }, + { + "epoch": 2.785387701782215, + "grad_norm": NaN, + "learning_rate": 0.00017484545518532977, + "loss": 0.0, + "step": 29851 + }, + { + "epoch": 2.7854810114770925, + "grad_norm": NaN, + "learning_rate": 0.00017483799584864978, + "loss": 0.0, + "step": 29852 + }, + { + "epoch": 2.78557432117197, + "grad_norm": NaN, + "learning_rate": 0.0001748305364488141, + "loss": 0.0, + "step": 29853 + }, + { + "epoch": 2.785667630866847, + "grad_norm": NaN, + "learning_rate": 0.0001748230769858418, + "loss": 0.0, + "step": 29854 + }, + { + "epoch": 2.7857609405617243, + "grad_norm": NaN, + "learning_rate": 0.00017481561745975172, + "loss": 0.0, + "step": 29855 + }, + { + "epoch": 2.7858542502566017, + "grad_norm": NaN, + "learning_rate": 0.00017480815787056285, + "loss": 0.0, + "step": 29856 + }, + { + "epoch": 2.785947559951479, + "grad_norm": NaN, + "learning_rate": 0.00017480069821829424, + "loss": 0.0, + "step": 29857 + }, + { + "epoch": 2.786040869646356, + "grad_norm": NaN, + "learning_rate": 0.0001747932385029648, + "loss": 0.0, + "step": 29858 + }, + { + "epoch": 2.7861341793412335, + "grad_norm": NaN, + "learning_rate": 0.00017478577872459344, + "loss": 0.0, + "step": 29859 + }, + { + "epoch": 2.786227489036111, + "grad_norm": NaN, + "learning_rate": 0.00017477831888319925, + "loss": 0.0, + "step": 29860 + }, + { + "epoch": 2.786320798730988, + "grad_norm": NaN, + "learning_rate": 0.0001747708589788011, + "loss": 0.0, + "step": 29861 + }, + { + "epoch": 2.7864141084258653, + "grad_norm": NaN, + "learning_rate": 0.000174763399011418, + "loss": 0.0, + "step": 29862 + }, + { + "epoch": 2.7865074181207428, + "grad_norm": NaN, + "learning_rate": 0.0001747559389810689, + "loss": 0.0, + "step": 29863 + }, + { + "epoch": 2.78660072781562, + "grad_norm": NaN, + "learning_rate": 0.00017474847888777282, + "loss": 0.0, + "step": 29864 + }, + { + "epoch": 2.7866940375104976, + "grad_norm": NaN, + "learning_rate": 0.00017474101873154863, + "loss": 0.0, + "step": 29865 + }, + { + "epoch": 2.7867873472053746, + "grad_norm": NaN, + "learning_rate": 0.00017473355851241535, + "loss": 0.0, + "step": 29866 + }, + { + "epoch": 2.786880656900252, + "grad_norm": NaN, + "learning_rate": 0.00017472609823039205, + "loss": 0.0, + "step": 29867 + }, + { + "epoch": 2.786973966595129, + "grad_norm": NaN, + "learning_rate": 0.00017471863788549757, + "loss": 0.0, + "step": 29868 + }, + { + "epoch": 2.7870672762900064, + "grad_norm": NaN, + "learning_rate": 0.00017471117747775087, + "loss": 0.0, + "step": 29869 + }, + { + "epoch": 2.787160585984884, + "grad_norm": NaN, + "learning_rate": 0.00017470371700717101, + "loss": 0.0, + "step": 29870 + }, + { + "epoch": 2.7872538956797612, + "grad_norm": NaN, + "learning_rate": 0.00017469625647377693, + "loss": 0.0, + "step": 29871 + }, + { + "epoch": 2.7873472053746386, + "grad_norm": NaN, + "learning_rate": 0.00017468879587758749, + "loss": 0.0, + "step": 29872 + }, + { + "epoch": 2.7874405150695156, + "grad_norm": NaN, + "learning_rate": 0.00017468133521862187, + "loss": 0.0, + "step": 29873 + }, + { + "epoch": 2.787533824764393, + "grad_norm": NaN, + "learning_rate": 0.00017467387449689889, + "loss": 0.0, + "step": 29874 + }, + { + "epoch": 2.7876271344592705, + "grad_norm": NaN, + "learning_rate": 0.0001746664137124375, + "loss": 0.0, + "step": 29875 + }, + { + "epoch": 2.7877204441541474, + "grad_norm": NaN, + "learning_rate": 0.00017465895286525678, + "loss": 0.0, + "step": 29876 + }, + { + "epoch": 2.787813753849025, + "grad_norm": NaN, + "learning_rate": 0.00017465149195537564, + "loss": 0.0, + "step": 29877 + }, + { + "epoch": 2.7879070635439023, + "grad_norm": NaN, + "learning_rate": 0.00017464403098281305, + "loss": 0.0, + "step": 29878 + }, + { + "epoch": 2.7880003732387797, + "grad_norm": NaN, + "learning_rate": 0.000174636569947588, + "loss": 0.0, + "step": 29879 + }, + { + "epoch": 2.7880936829336567, + "grad_norm": NaN, + "learning_rate": 0.00017462910884971948, + "loss": 0.0, + "step": 29880 + }, + { + "epoch": 2.788186992628534, + "grad_norm": NaN, + "learning_rate": 0.00017462164768922633, + "loss": 0.0, + "step": 29881 + }, + { + "epoch": 2.7882803023234115, + "grad_norm": NaN, + "learning_rate": 0.00017461418646612771, + "loss": 0.0, + "step": 29882 + }, + { + "epoch": 2.7883736120182885, + "grad_norm": NaN, + "learning_rate": 0.0001746067251804425, + "loss": 0.0, + "step": 29883 + }, + { + "epoch": 2.788466921713166, + "grad_norm": NaN, + "learning_rate": 0.00017459926383218963, + "loss": 0.0, + "step": 29884 + }, + { + "epoch": 2.7885602314080433, + "grad_norm": NaN, + "learning_rate": 0.00017459180242138816, + "loss": 0.0, + "step": 29885 + }, + { + "epoch": 2.7886535411029207, + "grad_norm": NaN, + "learning_rate": 0.00017458434094805707, + "loss": 0.0, + "step": 29886 + }, + { + "epoch": 2.788746850797798, + "grad_norm": NaN, + "learning_rate": 0.00017457687941221519, + "loss": 0.0, + "step": 29887 + }, + { + "epoch": 2.788840160492675, + "grad_norm": NaN, + "learning_rate": 0.0001745694178138816, + "loss": 0.0, + "step": 29888 + }, + { + "epoch": 2.7889334701875526, + "grad_norm": NaN, + "learning_rate": 0.00017456195615307533, + "loss": 0.0, + "step": 29889 + }, + { + "epoch": 2.7890267798824295, + "grad_norm": NaN, + "learning_rate": 0.00017455449442981523, + "loss": 0.0, + "step": 29890 + }, + { + "epoch": 2.789120089577307, + "grad_norm": NaN, + "learning_rate": 0.00017454703264412026, + "loss": 0.0, + "step": 29891 + }, + { + "epoch": 2.7892133992721844, + "grad_norm": NaN, + "learning_rate": 0.0001745395707960096, + "loss": 0.0, + "step": 29892 + }, + { + "epoch": 2.789306708967062, + "grad_norm": NaN, + "learning_rate": 0.00017453210888550196, + "loss": 0.0, + "step": 29893 + }, + { + "epoch": 2.789400018661939, + "grad_norm": NaN, + "learning_rate": 0.00017452464691261645, + "loss": 0.0, + "step": 29894 + }, + { + "epoch": 2.789493328356816, + "grad_norm": NaN, + "learning_rate": 0.0001745171848773721, + "loss": 0.0, + "step": 29895 + }, + { + "epoch": 2.7895866380516936, + "grad_norm": NaN, + "learning_rate": 0.00017450972277978775, + "loss": 0.0, + "step": 29896 + }, + { + "epoch": 2.789679947746571, + "grad_norm": NaN, + "learning_rate": 0.00017450226061988244, + "loss": 0.0, + "step": 29897 + }, + { + "epoch": 2.789773257441448, + "grad_norm": NaN, + "learning_rate": 0.00017449479839767518, + "loss": 0.0, + "step": 29898 + }, + { + "epoch": 2.7898665671363254, + "grad_norm": NaN, + "learning_rate": 0.0001744873361131849, + "loss": 0.0, + "step": 29899 + }, + { + "epoch": 2.789959876831203, + "grad_norm": NaN, + "learning_rate": 0.0001744798737664305, + "loss": 0.0, + "step": 29900 + }, + { + "epoch": 2.7900531865260803, + "grad_norm": NaN, + "learning_rate": 0.0001744724113574311, + "loss": 0.0, + "step": 29901 + }, + { + "epoch": 2.7901464962209572, + "grad_norm": NaN, + "learning_rate": 0.00017446494888620563, + "loss": 0.0, + "step": 29902 + }, + { + "epoch": 2.7902398059158346, + "grad_norm": NaN, + "learning_rate": 0.00017445748635277296, + "loss": 0.0, + "step": 29903 + }, + { + "epoch": 2.790333115610712, + "grad_norm": NaN, + "learning_rate": 0.0001744500237571522, + "loss": 0.0, + "step": 29904 + }, + { + "epoch": 2.790426425305589, + "grad_norm": NaN, + "learning_rate": 0.00017444256109936228, + "loss": 0.0, + "step": 29905 + }, + { + "epoch": 2.7905197350004665, + "grad_norm": NaN, + "learning_rate": 0.00017443509837942214, + "loss": 0.0, + "step": 29906 + }, + { + "epoch": 2.790613044695344, + "grad_norm": NaN, + "learning_rate": 0.0001744276355973508, + "loss": 0.0, + "step": 29907 + }, + { + "epoch": 2.7907063543902213, + "grad_norm": NaN, + "learning_rate": 0.00017442017275316724, + "loss": 0.0, + "step": 29908 + }, + { + "epoch": 2.7907996640850987, + "grad_norm": NaN, + "learning_rate": 0.00017441270984689037, + "loss": 0.0, + "step": 29909 + }, + { + "epoch": 2.7908929737799757, + "grad_norm": NaN, + "learning_rate": 0.0001744052468785392, + "loss": 0.0, + "step": 29910 + }, + { + "epoch": 2.790986283474853, + "grad_norm": NaN, + "learning_rate": 0.00017439778384813277, + "loss": 0.0, + "step": 29911 + }, + { + "epoch": 2.79107959316973, + "grad_norm": NaN, + "learning_rate": 0.00017439032075568998, + "loss": 0.0, + "step": 29912 + }, + { + "epoch": 2.7911729028646075, + "grad_norm": NaN, + "learning_rate": 0.0001743828576012298, + "loss": 0.0, + "step": 29913 + }, + { + "epoch": 2.791266212559485, + "grad_norm": NaN, + "learning_rate": 0.0001743753943847713, + "loss": 0.0, + "step": 29914 + }, + { + "epoch": 2.7913595222543623, + "grad_norm": NaN, + "learning_rate": 0.00017436793110633334, + "loss": 0.0, + "step": 29915 + }, + { + "epoch": 2.7914528319492398, + "grad_norm": NaN, + "learning_rate": 0.00017436046776593496, + "loss": 0.0, + "step": 29916 + }, + { + "epoch": 2.7915461416441167, + "grad_norm": NaN, + "learning_rate": 0.00017435300436359513, + "loss": 0.0, + "step": 29917 + }, + { + "epoch": 2.791639451338994, + "grad_norm": NaN, + "learning_rate": 0.00017434554089933285, + "loss": 0.0, + "step": 29918 + }, + { + "epoch": 2.7917327610338716, + "grad_norm": NaN, + "learning_rate": 0.00017433807737316702, + "loss": 0.0, + "step": 29919 + }, + { + "epoch": 2.7918260707287486, + "grad_norm": NaN, + "learning_rate": 0.0001743306137851167, + "loss": 0.0, + "step": 29920 + }, + { + "epoch": 2.791919380423626, + "grad_norm": NaN, + "learning_rate": 0.00017432315013520087, + "loss": 0.0, + "step": 29921 + }, + { + "epoch": 2.7920126901185034, + "grad_norm": NaN, + "learning_rate": 0.00017431568642343838, + "loss": 0.0, + "step": 29922 + }, + { + "epoch": 2.792105999813381, + "grad_norm": NaN, + "learning_rate": 0.00017430822264984835, + "loss": 0.0, + "step": 29923 + }, + { + "epoch": 2.792199309508258, + "grad_norm": NaN, + "learning_rate": 0.0001743007588144498, + "loss": 0.0, + "step": 29924 + }, + { + "epoch": 2.792292619203135, + "grad_norm": NaN, + "learning_rate": 0.00017429329491726146, + "loss": 0.0, + "step": 29925 + }, + { + "epoch": 2.7923859288980126, + "grad_norm": NaN, + "learning_rate": 0.00017428583095830254, + "loss": 0.0, + "step": 29926 + }, + { + "epoch": 2.7924792385928896, + "grad_norm": NaN, + "learning_rate": 0.000174278366937592, + "loss": 0.0, + "step": 29927 + }, + { + "epoch": 2.792572548287767, + "grad_norm": NaN, + "learning_rate": 0.00017427090285514869, + "loss": 0.0, + "step": 29928 + }, + { + "epoch": 2.7926658579826444, + "grad_norm": NaN, + "learning_rate": 0.00017426343871099166, + "loss": 0.0, + "step": 29929 + }, + { + "epoch": 2.792759167677522, + "grad_norm": NaN, + "learning_rate": 0.00017425597450513997, + "loss": 0.0, + "step": 29930 + }, + { + "epoch": 2.792852477372399, + "grad_norm": NaN, + "learning_rate": 0.00017424851023761247, + "loss": 0.0, + "step": 29931 + }, + { + "epoch": 2.7929457870672763, + "grad_norm": NaN, + "learning_rate": 0.00017424104590842818, + "loss": 0.0, + "step": 29932 + }, + { + "epoch": 2.7930390967621537, + "grad_norm": NaN, + "learning_rate": 0.00017423358151760613, + "loss": 0.0, + "step": 29933 + }, + { + "epoch": 2.7931324064570306, + "grad_norm": NaN, + "learning_rate": 0.00017422611706516524, + "loss": 0.0, + "step": 29934 + }, + { + "epoch": 2.793225716151908, + "grad_norm": NaN, + "learning_rate": 0.00017421865255112448, + "loss": 0.0, + "step": 29935 + }, + { + "epoch": 2.7933190258467855, + "grad_norm": NaN, + "learning_rate": 0.00017421118797550288, + "loss": 0.0, + "step": 29936 + }, + { + "epoch": 2.793412335541663, + "grad_norm": NaN, + "learning_rate": 0.00017420372333831944, + "loss": 0.0, + "step": 29937 + }, + { + "epoch": 2.7935056452365403, + "grad_norm": NaN, + "learning_rate": 0.00017419625863959303, + "loss": 0.0, + "step": 29938 + }, + { + "epoch": 2.7935989549314173, + "grad_norm": NaN, + "learning_rate": 0.00017418879387934272, + "loss": 0.0, + "step": 29939 + }, + { + "epoch": 2.7936922646262947, + "grad_norm": NaN, + "learning_rate": 0.00017418132905758758, + "loss": 0.0, + "step": 29940 + }, + { + "epoch": 2.7937855743211717, + "grad_norm": NaN, + "learning_rate": 0.00017417386417434635, + "loss": 0.0, + "step": 29941 + }, + { + "epoch": 2.793878884016049, + "grad_norm": NaN, + "learning_rate": 0.00017416639922963818, + "loss": 0.0, + "step": 29942 + }, + { + "epoch": 2.7939721937109265, + "grad_norm": NaN, + "learning_rate": 0.00017415893422348204, + "loss": 0.0, + "step": 29943 + }, + { + "epoch": 2.794065503405804, + "grad_norm": NaN, + "learning_rate": 0.00017415146915589685, + "loss": 0.0, + "step": 29944 + }, + { + "epoch": 2.7941588131006814, + "grad_norm": NaN, + "learning_rate": 0.00017414400402690166, + "loss": 0.0, + "step": 29945 + }, + { + "epoch": 2.7942521227955583, + "grad_norm": NaN, + "learning_rate": 0.00017413653883651545, + "loss": 0.0, + "step": 29946 + }, + { + "epoch": 2.7943454324904358, + "grad_norm": NaN, + "learning_rate": 0.0001741290735847571, + "loss": 0.0, + "step": 29947 + }, + { + "epoch": 2.794438742185313, + "grad_norm": NaN, + "learning_rate": 0.00017412160827164568, + "loss": 0.0, + "step": 29948 + }, + { + "epoch": 2.79453205188019, + "grad_norm": NaN, + "learning_rate": 0.00017411414289720024, + "loss": 0.0, + "step": 29949 + }, + { + "epoch": 2.7946253615750676, + "grad_norm": NaN, + "learning_rate": 0.00017410667746143956, + "loss": 0.0, + "step": 29950 + }, + { + "epoch": 2.794718671269945, + "grad_norm": NaN, + "learning_rate": 0.00017409921196438277, + "loss": 0.0, + "step": 29951 + }, + { + "epoch": 2.7948119809648224, + "grad_norm": NaN, + "learning_rate": 0.00017409174640604892, + "loss": 0.0, + "step": 29952 + }, + { + "epoch": 2.7949052906596994, + "grad_norm": NaN, + "learning_rate": 0.00017408428078645677, + "loss": 0.0, + "step": 29953 + }, + { + "epoch": 2.794998600354577, + "grad_norm": NaN, + "learning_rate": 0.00017407681510562547, + "loss": 0.0, + "step": 29954 + }, + { + "epoch": 2.7950919100494542, + "grad_norm": NaN, + "learning_rate": 0.00017406934936357396, + "loss": 0.0, + "step": 29955 + }, + { + "epoch": 2.795185219744331, + "grad_norm": NaN, + "learning_rate": 0.00017406188356032125, + "loss": 0.0, + "step": 29956 + }, + { + "epoch": 2.7952785294392086, + "grad_norm": NaN, + "learning_rate": 0.00017405441769588627, + "loss": 0.0, + "step": 29957 + }, + { + "epoch": 2.795371839134086, + "grad_norm": NaN, + "learning_rate": 0.000174046951770288, + "loss": 0.0, + "step": 29958 + }, + { + "epoch": 2.7954651488289635, + "grad_norm": NaN, + "learning_rate": 0.00017403948578354554, + "loss": 0.0, + "step": 29959 + }, + { + "epoch": 2.795558458523841, + "grad_norm": NaN, + "learning_rate": 0.00017403201973567774, + "loss": 0.0, + "step": 29960 + }, + { + "epoch": 2.795651768218718, + "grad_norm": NaN, + "learning_rate": 0.00017402455362670364, + "loss": 0.0, + "step": 29961 + }, + { + "epoch": 2.7957450779135953, + "grad_norm": NaN, + "learning_rate": 0.00017401708745664224, + "loss": 0.0, + "step": 29962 + }, + { + "epoch": 2.7958383876084723, + "grad_norm": NaN, + "learning_rate": 0.00017400962122551242, + "loss": 0.0, + "step": 29963 + }, + { + "epoch": 2.7959316973033497, + "grad_norm": NaN, + "learning_rate": 0.0001740021549333333, + "loss": 0.0, + "step": 29964 + }, + { + "epoch": 2.796025006998227, + "grad_norm": NaN, + "learning_rate": 0.00017399468858012384, + "loss": 0.0, + "step": 29965 + }, + { + "epoch": 2.7961183166931045, + "grad_norm": NaN, + "learning_rate": 0.00017398722216590296, + "loss": 0.0, + "step": 29966 + }, + { + "epoch": 2.796211626387982, + "grad_norm": NaN, + "learning_rate": 0.00017397975569068968, + "loss": 0.0, + "step": 29967 + }, + { + "epoch": 2.796304936082859, + "grad_norm": NaN, + "learning_rate": 0.00017397228915450304, + "loss": 0.0, + "step": 29968 + }, + { + "epoch": 2.7963982457777363, + "grad_norm": NaN, + "learning_rate": 0.00017396482255736185, + "loss": 0.0, + "step": 29969 + }, + { + "epoch": 2.7964915554726137, + "grad_norm": NaN, + "learning_rate": 0.0001739573558992853, + "loss": 0.0, + "step": 29970 + }, + { + "epoch": 2.7965848651674907, + "grad_norm": NaN, + "learning_rate": 0.00017394988918029231, + "loss": 0.0, + "step": 29971 + }, + { + "epoch": 2.796678174862368, + "grad_norm": NaN, + "learning_rate": 0.00017394242240040177, + "loss": 0.0, + "step": 29972 + }, + { + "epoch": 2.7967714845572456, + "grad_norm": NaN, + "learning_rate": 0.0001739349555596328, + "loss": 0.0, + "step": 29973 + }, + { + "epoch": 2.796864794252123, + "grad_norm": NaN, + "learning_rate": 0.0001739274886580043, + "loss": 0.0, + "step": 29974 + }, + { + "epoch": 2.796958103947, + "grad_norm": NaN, + "learning_rate": 0.00017392002169553528, + "loss": 0.0, + "step": 29975 + }, + { + "epoch": 2.7970514136418774, + "grad_norm": NaN, + "learning_rate": 0.00017391255467224476, + "loss": 0.0, + "step": 29976 + }, + { + "epoch": 2.797144723336755, + "grad_norm": NaN, + "learning_rate": 0.00017390508758815168, + "loss": 0.0, + "step": 29977 + }, + { + "epoch": 2.7972380330316318, + "grad_norm": NaN, + "learning_rate": 0.00017389762044327508, + "loss": 0.0, + "step": 29978 + }, + { + "epoch": 2.797331342726509, + "grad_norm": NaN, + "learning_rate": 0.00017389015323763382, + "loss": 0.0, + "step": 29979 + }, + { + "epoch": 2.7974246524213866, + "grad_norm": NaN, + "learning_rate": 0.00017388268597124708, + "loss": 0.0, + "step": 29980 + }, + { + "epoch": 2.797517962116264, + "grad_norm": NaN, + "learning_rate": 0.0001738752186441337, + "loss": 0.0, + "step": 29981 + }, + { + "epoch": 2.7976112718111414, + "grad_norm": NaN, + "learning_rate": 0.0001738677512563127, + "loss": 0.0, + "step": 29982 + }, + { + "epoch": 2.7977045815060184, + "grad_norm": NaN, + "learning_rate": 0.00017386028380780307, + "loss": 0.0, + "step": 29983 + }, + { + "epoch": 2.797797891200896, + "grad_norm": NaN, + "learning_rate": 0.00017385281629862386, + "loss": 0.0, + "step": 29984 + }, + { + "epoch": 2.797891200895773, + "grad_norm": NaN, + "learning_rate": 0.00017384534872879394, + "loss": 0.0, + "step": 29985 + }, + { + "epoch": 2.7979845105906502, + "grad_norm": NaN, + "learning_rate": 0.0001738378810983324, + "loss": 0.0, + "step": 29986 + }, + { + "epoch": 2.7980778202855277, + "grad_norm": NaN, + "learning_rate": 0.0001738304134072582, + "loss": 0.0, + "step": 29987 + }, + { + "epoch": 2.798171129980405, + "grad_norm": NaN, + "learning_rate": 0.00017382294565559028, + "loss": 0.0, + "step": 29988 + }, + { + "epoch": 2.7982644396752825, + "grad_norm": NaN, + "learning_rate": 0.00017381547784334764, + "loss": 0.0, + "step": 29989 + }, + { + "epoch": 2.7983577493701595, + "grad_norm": NaN, + "learning_rate": 0.0001738080099705494, + "loss": 0.0, + "step": 29990 + }, + { + "epoch": 2.798451059065037, + "grad_norm": NaN, + "learning_rate": 0.00017380054203721434, + "loss": 0.0, + "step": 29991 + }, + { + "epoch": 2.7985443687599143, + "grad_norm": NaN, + "learning_rate": 0.00017379307404336158, + "loss": 0.0, + "step": 29992 + }, + { + "epoch": 2.7986376784547913, + "grad_norm": NaN, + "learning_rate": 0.00017378560598901012, + "loss": 0.0, + "step": 29993 + }, + { + "epoch": 2.7987309881496687, + "grad_norm": NaN, + "learning_rate": 0.00017377813787417885, + "loss": 0.0, + "step": 29994 + }, + { + "epoch": 2.798824297844546, + "grad_norm": NaN, + "learning_rate": 0.00017377066969888688, + "loss": 0.0, + "step": 29995 + }, + { + "epoch": 2.7989176075394235, + "grad_norm": NaN, + "learning_rate": 0.0001737632014631531, + "loss": 0.0, + "step": 29996 + }, + { + "epoch": 2.7990109172343005, + "grad_norm": NaN, + "learning_rate": 0.00017375573316699653, + "loss": 0.0, + "step": 29997 + }, + { + "epoch": 2.799104226929178, + "grad_norm": NaN, + "learning_rate": 0.0001737482648104362, + "loss": 0.0, + "step": 29998 + }, + { + "epoch": 2.7991975366240553, + "grad_norm": NaN, + "learning_rate": 0.00017374079639349106, + "loss": 0.0, + "step": 29999 + }, + { + "epoch": 2.7992908463189323, + "grad_norm": NaN, + "learning_rate": 0.00017373332791618013, + "loss": 0.0, + "step": 30000 + }, + { + "epoch": 2.7993841560138097, + "grad_norm": NaN, + "learning_rate": 0.00017372585937852232, + "loss": 0.0, + "step": 30001 + }, + { + "epoch": 2.799477465708687, + "grad_norm": NaN, + "learning_rate": 0.00017371839078053668, + "loss": 0.0, + "step": 30002 + }, + { + "epoch": 2.7995707754035646, + "grad_norm": NaN, + "learning_rate": 0.00017371092212224227, + "loss": 0.0, + "step": 30003 + }, + { + "epoch": 2.799664085098442, + "grad_norm": NaN, + "learning_rate": 0.00017370345340365795, + "loss": 0.0, + "step": 30004 + }, + { + "epoch": 2.799757394793319, + "grad_norm": NaN, + "learning_rate": 0.00017369598462480278, + "loss": 0.0, + "step": 30005 + }, + { + "epoch": 2.7998507044881964, + "grad_norm": NaN, + "learning_rate": 0.00017368851578569578, + "loss": 0.0, + "step": 30006 + }, + { + "epoch": 2.7999440141830734, + "grad_norm": NaN, + "learning_rate": 0.00017368104688635582, + "loss": 0.0, + "step": 30007 + }, + { + "epoch": 2.800037323877951, + "grad_norm": NaN, + "learning_rate": 0.000173673577926802, + "loss": 0.0, + "step": 30008 + }, + { + "epoch": 2.800130633572828, + "grad_norm": NaN, + "learning_rate": 0.00017366610890705335, + "loss": 0.0, + "step": 30009 + }, + { + "epoch": 2.8002239432677056, + "grad_norm": NaN, + "learning_rate": 0.0001736586398271287, + "loss": 0.0, + "step": 30010 + }, + { + "epoch": 2.800317252962583, + "grad_norm": NaN, + "learning_rate": 0.0001736511706870472, + "loss": 0.0, + "step": 30011 + }, + { + "epoch": 2.80041056265746, + "grad_norm": NaN, + "learning_rate": 0.00017364370148682776, + "loss": 0.0, + "step": 30012 + }, + { + "epoch": 2.8005038723523374, + "grad_norm": NaN, + "learning_rate": 0.00017363623222648938, + "loss": 0.0, + "step": 30013 + }, + { + "epoch": 2.800597182047215, + "grad_norm": NaN, + "learning_rate": 0.0001736287629060511, + "loss": 0.0, + "step": 30014 + }, + { + "epoch": 2.800690491742092, + "grad_norm": NaN, + "learning_rate": 0.00017362129352553184, + "loss": 0.0, + "step": 30015 + }, + { + "epoch": 2.8007838014369693, + "grad_norm": NaN, + "learning_rate": 0.00017361382408495062, + "loss": 0.0, + "step": 30016 + }, + { + "epoch": 2.8008771111318467, + "grad_norm": NaN, + "learning_rate": 0.00017360635458432645, + "loss": 0.0, + "step": 30017 + }, + { + "epoch": 2.800970420826724, + "grad_norm": NaN, + "learning_rate": 0.00017359888502367835, + "loss": 0.0, + "step": 30018 + }, + { + "epoch": 2.801063730521601, + "grad_norm": NaN, + "learning_rate": 0.00017359141540302522, + "loss": 0.0, + "step": 30019 + }, + { + "epoch": 2.8011570402164785, + "grad_norm": NaN, + "learning_rate": 0.00017358394572238614, + "loss": 0.0, + "step": 30020 + }, + { + "epoch": 2.801250349911356, + "grad_norm": NaN, + "learning_rate": 0.00017357647598178008, + "loss": 0.0, + "step": 30021 + }, + { + "epoch": 2.801343659606233, + "grad_norm": NaN, + "learning_rate": 0.00017356900618122604, + "loss": 0.0, + "step": 30022 + }, + { + "epoch": 2.8014369693011103, + "grad_norm": NaN, + "learning_rate": 0.00017356153632074294, + "loss": 0.0, + "step": 30023 + }, + { + "epoch": 2.8015302789959877, + "grad_norm": NaN, + "learning_rate": 0.00017355406640034987, + "loss": 0.0, + "step": 30024 + }, + { + "epoch": 2.801623588690865, + "grad_norm": NaN, + "learning_rate": 0.00017354659642006582, + "loss": 0.0, + "step": 30025 + }, + { + "epoch": 2.801716898385742, + "grad_norm": NaN, + "learning_rate": 0.0001735391263799097, + "loss": 0.0, + "step": 30026 + }, + { + "epoch": 2.8018102080806195, + "grad_norm": NaN, + "learning_rate": 0.00017353165627990056, + "loss": 0.0, + "step": 30027 + }, + { + "epoch": 2.801903517775497, + "grad_norm": NaN, + "learning_rate": 0.00017352418612005742, + "loss": 0.0, + "step": 30028 + }, + { + "epoch": 2.801996827470374, + "grad_norm": NaN, + "learning_rate": 0.0001735167159003992, + "loss": 0.0, + "step": 30029 + }, + { + "epoch": 2.8020901371652513, + "grad_norm": NaN, + "learning_rate": 0.00017350924562094494, + "loss": 0.0, + "step": 30030 + }, + { + "epoch": 2.8021834468601288, + "grad_norm": NaN, + "learning_rate": 0.00017350177528171367, + "loss": 0.0, + "step": 30031 + }, + { + "epoch": 2.802276756555006, + "grad_norm": NaN, + "learning_rate": 0.0001734943048827243, + "loss": 0.0, + "step": 30032 + }, + { + "epoch": 2.8023700662498836, + "grad_norm": NaN, + "learning_rate": 0.0001734868344239959, + "loss": 0.0, + "step": 30033 + }, + { + "epoch": 2.8024633759447606, + "grad_norm": NaN, + "learning_rate": 0.00017347936390554744, + "loss": 0.0, + "step": 30034 + }, + { + "epoch": 2.802556685639638, + "grad_norm": NaN, + "learning_rate": 0.00017347189332739792, + "loss": 0.0, + "step": 30035 + }, + { + "epoch": 2.8026499953345154, + "grad_norm": NaN, + "learning_rate": 0.0001734644226895663, + "loss": 0.0, + "step": 30036 + }, + { + "epoch": 2.8027433050293924, + "grad_norm": NaN, + "learning_rate": 0.0001734569519920716, + "loss": 0.0, + "step": 30037 + }, + { + "epoch": 2.80283661472427, + "grad_norm": NaN, + "learning_rate": 0.00017344948123493287, + "loss": 0.0, + "step": 30038 + }, + { + "epoch": 2.8029299244191472, + "grad_norm": NaN, + "learning_rate": 0.000173442010418169, + "loss": 0.0, + "step": 30039 + }, + { + "epoch": 2.8030232341140247, + "grad_norm": NaN, + "learning_rate": 0.0001734345395417991, + "loss": 0.0, + "step": 30040 + }, + { + "epoch": 2.8031165438089016, + "grad_norm": NaN, + "learning_rate": 0.00017342706860584207, + "loss": 0.0, + "step": 30041 + }, + { + "epoch": 2.803209853503779, + "grad_norm": NaN, + "learning_rate": 0.00017341959761031693, + "loss": 0.0, + "step": 30042 + }, + { + "epoch": 2.8033031631986565, + "grad_norm": NaN, + "learning_rate": 0.0001734121265552427, + "loss": 0.0, + "step": 30043 + }, + { + "epoch": 2.8033964728935334, + "grad_norm": NaN, + "learning_rate": 0.00017340465544063842, + "loss": 0.0, + "step": 30044 + }, + { + "epoch": 2.803489782588411, + "grad_norm": NaN, + "learning_rate": 0.00017339718426652296, + "loss": 0.0, + "step": 30045 + }, + { + "epoch": 2.8035830922832883, + "grad_norm": NaN, + "learning_rate": 0.00017338971303291546, + "loss": 0.0, + "step": 30046 + }, + { + "epoch": 2.8036764019781657, + "grad_norm": NaN, + "learning_rate": 0.0001733822417398348, + "loss": 0.0, + "step": 30047 + }, + { + "epoch": 2.8037697116730427, + "grad_norm": NaN, + "learning_rate": 0.00017337477038730003, + "loss": 0.0, + "step": 30048 + }, + { + "epoch": 2.80386302136792, + "grad_norm": NaN, + "learning_rate": 0.00017336729897533015, + "loss": 0.0, + "step": 30049 + }, + { + "epoch": 2.8039563310627975, + "grad_norm": NaN, + "learning_rate": 0.00017335982750394413, + "loss": 0.0, + "step": 30050 + }, + { + "epoch": 2.8040496407576745, + "grad_norm": NaN, + "learning_rate": 0.000173352355973161, + "loss": 0.0, + "step": 30051 + }, + { + "epoch": 2.804142950452552, + "grad_norm": NaN, + "learning_rate": 0.00017334488438299977, + "loss": 0.0, + "step": 30052 + }, + { + "epoch": 2.8042362601474293, + "grad_norm": NaN, + "learning_rate": 0.0001733374127334794, + "loss": 0.0, + "step": 30053 + }, + { + "epoch": 2.8043295698423067, + "grad_norm": NaN, + "learning_rate": 0.0001733299410246189, + "loss": 0.0, + "step": 30054 + }, + { + "epoch": 2.804422879537184, + "grad_norm": NaN, + "learning_rate": 0.00017332246925643726, + "loss": 0.0, + "step": 30055 + }, + { + "epoch": 2.804516189232061, + "grad_norm": NaN, + "learning_rate": 0.00017331499742895352, + "loss": 0.0, + "step": 30056 + }, + { + "epoch": 2.8046094989269386, + "grad_norm": NaN, + "learning_rate": 0.00017330752554218663, + "loss": 0.0, + "step": 30057 + }, + { + "epoch": 2.8047028086218155, + "grad_norm": NaN, + "learning_rate": 0.0001733000535961556, + "loss": 0.0, + "step": 30058 + }, + { + "epoch": 2.804796118316693, + "grad_norm": NaN, + "learning_rate": 0.00017329258159087943, + "loss": 0.0, + "step": 30059 + }, + { + "epoch": 2.8048894280115704, + "grad_norm": NaN, + "learning_rate": 0.00017328510952637716, + "loss": 0.0, + "step": 30060 + }, + { + "epoch": 2.804982737706448, + "grad_norm": NaN, + "learning_rate": 0.0001732776374026677, + "loss": 0.0, + "step": 30061 + }, + { + "epoch": 2.805076047401325, + "grad_norm": NaN, + "learning_rate": 0.00017327016521977016, + "loss": 0.0, + "step": 30062 + }, + { + "epoch": 2.805169357096202, + "grad_norm": NaN, + "learning_rate": 0.00017326269297770344, + "loss": 0.0, + "step": 30063 + }, + { + "epoch": 2.8052626667910796, + "grad_norm": NaN, + "learning_rate": 0.0001732552206764866, + "loss": 0.0, + "step": 30064 + }, + { + "epoch": 2.805355976485957, + "grad_norm": NaN, + "learning_rate": 0.00017324774831613862, + "loss": 0.0, + "step": 30065 + }, + { + "epoch": 2.805449286180834, + "grad_norm": NaN, + "learning_rate": 0.00017324027589667854, + "loss": 0.0, + "step": 30066 + }, + { + "epoch": 2.8055425958757114, + "grad_norm": NaN, + "learning_rate": 0.00017323280341812527, + "loss": 0.0, + "step": 30067 + }, + { + "epoch": 2.805635905570589, + "grad_norm": NaN, + "learning_rate": 0.00017322533088049786, + "loss": 0.0, + "step": 30068 + }, + { + "epoch": 2.8057292152654663, + "grad_norm": NaN, + "learning_rate": 0.00017321785828381534, + "loss": 0.0, + "step": 30069 + }, + { + "epoch": 2.8058225249603432, + "grad_norm": NaN, + "learning_rate": 0.00017321038562809668, + "loss": 0.0, + "step": 30070 + }, + { + "epoch": 2.8059158346552207, + "grad_norm": NaN, + "learning_rate": 0.00017320291291336089, + "loss": 0.0, + "step": 30071 + }, + { + "epoch": 2.806009144350098, + "grad_norm": NaN, + "learning_rate": 0.00017319544013962693, + "loss": 0.0, + "step": 30072 + }, + { + "epoch": 2.806102454044975, + "grad_norm": NaN, + "learning_rate": 0.00017318796730691387, + "loss": 0.0, + "step": 30073 + }, + { + "epoch": 2.8061957637398525, + "grad_norm": NaN, + "learning_rate": 0.00017318049441524067, + "loss": 0.0, + "step": 30074 + }, + { + "epoch": 2.80628907343473, + "grad_norm": NaN, + "learning_rate": 0.00017317302146462637, + "loss": 0.0, + "step": 30075 + }, + { + "epoch": 2.8063823831296073, + "grad_norm": NaN, + "learning_rate": 0.00017316554845508988, + "loss": 0.0, + "step": 30076 + }, + { + "epoch": 2.8064756928244847, + "grad_norm": NaN, + "learning_rate": 0.00017315807538665032, + "loss": 0.0, + "step": 30077 + }, + { + "epoch": 2.8065690025193617, + "grad_norm": NaN, + "learning_rate": 0.00017315060225932656, + "loss": 0.0, + "step": 30078 + }, + { + "epoch": 2.806662312214239, + "grad_norm": NaN, + "learning_rate": 0.00017314312907313776, + "loss": 0.0, + "step": 30079 + }, + { + "epoch": 2.806755621909116, + "grad_norm": NaN, + "learning_rate": 0.0001731356558281028, + "loss": 0.0, + "step": 30080 + }, + { + "epoch": 2.8068489316039935, + "grad_norm": NaN, + "learning_rate": 0.00017312818252424074, + "loss": 0.0, + "step": 30081 + }, + { + "epoch": 2.806942241298871, + "grad_norm": NaN, + "learning_rate": 0.00017312070916157053, + "loss": 0.0, + "step": 30082 + }, + { + "epoch": 2.8070355509937484, + "grad_norm": NaN, + "learning_rate": 0.0001731132357401112, + "loss": 0.0, + "step": 30083 + }, + { + "epoch": 2.8071288606886258, + "grad_norm": NaN, + "learning_rate": 0.00017310576225988182, + "loss": 0.0, + "step": 30084 + }, + { + "epoch": 2.8072221703835027, + "grad_norm": NaN, + "learning_rate": 0.00017309828872090127, + "loss": 0.0, + "step": 30085 + }, + { + "epoch": 2.80731548007838, + "grad_norm": NaN, + "learning_rate": 0.00017309081512318866, + "loss": 0.0, + "step": 30086 + }, + { + "epoch": 2.8074087897732576, + "grad_norm": NaN, + "learning_rate": 0.0001730833414667629, + "loss": 0.0, + "step": 30087 + }, + { + "epoch": 2.8075020994681346, + "grad_norm": NaN, + "learning_rate": 0.00017307586775164308, + "loss": 0.0, + "step": 30088 + }, + { + "epoch": 2.807595409163012, + "grad_norm": NaN, + "learning_rate": 0.00017306839397784814, + "loss": 0.0, + "step": 30089 + }, + { + "epoch": 2.8076887188578894, + "grad_norm": NaN, + "learning_rate": 0.00017306092014539713, + "loss": 0.0, + "step": 30090 + }, + { + "epoch": 2.807782028552767, + "grad_norm": NaN, + "learning_rate": 0.000173053446254309, + "loss": 0.0, + "step": 30091 + }, + { + "epoch": 2.807875338247644, + "grad_norm": NaN, + "learning_rate": 0.0001730459723046028, + "loss": 0.0, + "step": 30092 + }, + { + "epoch": 2.807968647942521, + "grad_norm": NaN, + "learning_rate": 0.0001730384982962975, + "loss": 0.0, + "step": 30093 + }, + { + "epoch": 2.8080619576373986, + "grad_norm": NaN, + "learning_rate": 0.00017303102422941215, + "loss": 0.0, + "step": 30094 + }, + { + "epoch": 2.8081552673322756, + "grad_norm": NaN, + "learning_rate": 0.00017302355010396576, + "loss": 0.0, + "step": 30095 + }, + { + "epoch": 2.808248577027153, + "grad_norm": NaN, + "learning_rate": 0.00017301607591997724, + "loss": 0.0, + "step": 30096 + }, + { + "epoch": 2.8083418867220304, + "grad_norm": NaN, + "learning_rate": 0.00017300860167746568, + "loss": 0.0, + "step": 30097 + }, + { + "epoch": 2.808435196416908, + "grad_norm": NaN, + "learning_rate": 0.00017300112737645004, + "loss": 0.0, + "step": 30098 + }, + { + "epoch": 2.8085285061117853, + "grad_norm": NaN, + "learning_rate": 0.00017299365301694938, + "loss": 0.0, + "step": 30099 + }, + { + "epoch": 2.8086218158066623, + "grad_norm": NaN, + "learning_rate": 0.00017298617859898266, + "loss": 0.0, + "step": 30100 + }, + { + "epoch": 2.8087151255015397, + "grad_norm": NaN, + "learning_rate": 0.00017297870412256888, + "loss": 0.0, + "step": 30101 + }, + { + "epoch": 2.8088084351964167, + "grad_norm": NaN, + "learning_rate": 0.0001729712295877271, + "loss": 0.0, + "step": 30102 + }, + { + "epoch": 2.808901744891294, + "grad_norm": NaN, + "learning_rate": 0.00017296375499447627, + "loss": 0.0, + "step": 30103 + }, + { + "epoch": 2.8089950545861715, + "grad_norm": NaN, + "learning_rate": 0.00017295628034283541, + "loss": 0.0, + "step": 30104 + }, + { + "epoch": 2.809088364281049, + "grad_norm": NaN, + "learning_rate": 0.00017294880563282353, + "loss": 0.0, + "step": 30105 + }, + { + "epoch": 2.8091816739759263, + "grad_norm": NaN, + "learning_rate": 0.00017294133086445963, + "loss": 0.0, + "step": 30106 + }, + { + "epoch": 2.8092749836708033, + "grad_norm": NaN, + "learning_rate": 0.0001729338560377627, + "loss": 0.0, + "step": 30107 + }, + { + "epoch": 2.8093682933656807, + "grad_norm": NaN, + "learning_rate": 0.00017292638115275183, + "loss": 0.0, + "step": 30108 + }, + { + "epoch": 2.809461603060558, + "grad_norm": NaN, + "learning_rate": 0.00017291890620944593, + "loss": 0.0, + "step": 30109 + }, + { + "epoch": 2.809554912755435, + "grad_norm": NaN, + "learning_rate": 0.00017291143120786402, + "loss": 0.0, + "step": 30110 + }, + { + "epoch": 2.8096482224503125, + "grad_norm": NaN, + "learning_rate": 0.00017290395614802514, + "loss": 0.0, + "step": 30111 + }, + { + "epoch": 2.80974153214519, + "grad_norm": NaN, + "learning_rate": 0.00017289648102994832, + "loss": 0.0, + "step": 30112 + }, + { + "epoch": 2.8098348418400674, + "grad_norm": NaN, + "learning_rate": 0.0001728890058536525, + "loss": 0.0, + "step": 30113 + }, + { + "epoch": 2.8099281515349444, + "grad_norm": NaN, + "learning_rate": 0.0001728815306191567, + "loss": 0.0, + "step": 30114 + }, + { + "epoch": 2.8100214612298218, + "grad_norm": NaN, + "learning_rate": 0.00017287405532647994, + "loss": 0.0, + "step": 30115 + }, + { + "epoch": 2.810114770924699, + "grad_norm": NaN, + "learning_rate": 0.00017286657997564128, + "loss": 0.0, + "step": 30116 + }, + { + "epoch": 2.810208080619576, + "grad_norm": NaN, + "learning_rate": 0.00017285910456665965, + "loss": 0.0, + "step": 30117 + }, + { + "epoch": 2.8103013903144536, + "grad_norm": NaN, + "learning_rate": 0.00017285162909955412, + "loss": 0.0, + "step": 30118 + }, + { + "epoch": 2.810394700009331, + "grad_norm": NaN, + "learning_rate": 0.00017284415357434363, + "loss": 0.0, + "step": 30119 + }, + { + "epoch": 2.8104880097042084, + "grad_norm": NaN, + "learning_rate": 0.00017283667799104725, + "loss": 0.0, + "step": 30120 + }, + { + "epoch": 2.810581319399086, + "grad_norm": NaN, + "learning_rate": 0.00017282920234968393, + "loss": 0.0, + "step": 30121 + }, + { + "epoch": 2.810674629093963, + "grad_norm": NaN, + "learning_rate": 0.00017282172665027273, + "loss": 0.0, + "step": 30122 + }, + { + "epoch": 2.8107679387888402, + "grad_norm": NaN, + "learning_rate": 0.00017281425089283264, + "loss": 0.0, + "step": 30123 + }, + { + "epoch": 2.810861248483717, + "grad_norm": NaN, + "learning_rate": 0.00017280677507738269, + "loss": 0.0, + "step": 30124 + }, + { + "epoch": 2.8109545581785946, + "grad_norm": NaN, + "learning_rate": 0.00017279929920394182, + "loss": 0.0, + "step": 30125 + }, + { + "epoch": 2.811047867873472, + "grad_norm": NaN, + "learning_rate": 0.00017279182327252913, + "loss": 0.0, + "step": 30126 + }, + { + "epoch": 2.8111411775683495, + "grad_norm": NaN, + "learning_rate": 0.00017278434728316356, + "loss": 0.0, + "step": 30127 + }, + { + "epoch": 2.811234487263227, + "grad_norm": NaN, + "learning_rate": 0.00017277687123586416, + "loss": 0.0, + "step": 30128 + }, + { + "epoch": 2.811327796958104, + "grad_norm": NaN, + "learning_rate": 0.0001727693951306499, + "loss": 0.0, + "step": 30129 + }, + { + "epoch": 2.8114211066529813, + "grad_norm": NaN, + "learning_rate": 0.00017276191896753982, + "loss": 0.0, + "step": 30130 + }, + { + "epoch": 2.8115144163478587, + "grad_norm": NaN, + "learning_rate": 0.00017275444274655291, + "loss": 0.0, + "step": 30131 + }, + { + "epoch": 2.8116077260427357, + "grad_norm": NaN, + "learning_rate": 0.00017274696646770826, + "loss": 0.0, + "step": 30132 + }, + { + "epoch": 2.811701035737613, + "grad_norm": NaN, + "learning_rate": 0.00017273949013102473, + "loss": 0.0, + "step": 30133 + }, + { + "epoch": 2.8117943454324905, + "grad_norm": NaN, + "learning_rate": 0.00017273201373652145, + "loss": 0.0, + "step": 30134 + }, + { + "epoch": 2.811887655127368, + "grad_norm": NaN, + "learning_rate": 0.00017272453728421738, + "loss": 0.0, + "step": 30135 + }, + { + "epoch": 2.811980964822245, + "grad_norm": NaN, + "learning_rate": 0.0001727170607741316, + "loss": 0.0, + "step": 30136 + }, + { + "epoch": 2.8120742745171223, + "grad_norm": NaN, + "learning_rate": 0.000172709584206283, + "loss": 0.0, + "step": 30137 + }, + { + "epoch": 2.8121675842119997, + "grad_norm": NaN, + "learning_rate": 0.00017270210758069064, + "loss": 0.0, + "step": 30138 + }, + { + "epoch": 2.8122608939068767, + "grad_norm": NaN, + "learning_rate": 0.00017269463089737358, + "loss": 0.0, + "step": 30139 + }, + { + "epoch": 2.812354203601754, + "grad_norm": NaN, + "learning_rate": 0.0001726871541563508, + "loss": 0.0, + "step": 30140 + }, + { + "epoch": 2.8124475132966316, + "grad_norm": NaN, + "learning_rate": 0.00017267967735764132, + "loss": 0.0, + "step": 30141 + }, + { + "epoch": 2.812540822991509, + "grad_norm": NaN, + "learning_rate": 0.0001726722005012641, + "loss": 0.0, + "step": 30142 + }, + { + "epoch": 2.812634132686386, + "grad_norm": NaN, + "learning_rate": 0.0001726647235872382, + "loss": 0.0, + "step": 30143 + }, + { + "epoch": 2.8127274423812634, + "grad_norm": NaN, + "learning_rate": 0.00017265724661558266, + "loss": 0.0, + "step": 30144 + }, + { + "epoch": 2.812820752076141, + "grad_norm": NaN, + "learning_rate": 0.00017264976958631642, + "loss": 0.0, + "step": 30145 + }, + { + "epoch": 2.8129140617710178, + "grad_norm": NaN, + "learning_rate": 0.00017264229249945852, + "loss": 0.0, + "step": 30146 + }, + { + "epoch": 2.813007371465895, + "grad_norm": NaN, + "learning_rate": 0.00017263481535502797, + "loss": 0.0, + "step": 30147 + }, + { + "epoch": 2.8131006811607726, + "grad_norm": NaN, + "learning_rate": 0.00017262733815304383, + "loss": 0.0, + "step": 30148 + }, + { + "epoch": 2.81319399085565, + "grad_norm": NaN, + "learning_rate": 0.00017261986089352503, + "loss": 0.0, + "step": 30149 + }, + { + "epoch": 2.8132873005505274, + "grad_norm": NaN, + "learning_rate": 0.00017261238357649063, + "loss": 0.0, + "step": 30150 + }, + { + "epoch": 2.8133806102454044, + "grad_norm": NaN, + "learning_rate": 0.00017260490620195967, + "loss": 0.0, + "step": 30151 + }, + { + "epoch": 2.813473919940282, + "grad_norm": NaN, + "learning_rate": 0.00017259742876995113, + "loss": 0.0, + "step": 30152 + }, + { + "epoch": 2.813567229635159, + "grad_norm": NaN, + "learning_rate": 0.00017258995128048397, + "loss": 0.0, + "step": 30153 + }, + { + "epoch": 2.8136605393300362, + "grad_norm": NaN, + "learning_rate": 0.00017258247373357728, + "loss": 0.0, + "step": 30154 + }, + { + "epoch": 2.8137538490249137, + "grad_norm": NaN, + "learning_rate": 0.00017257499612925007, + "loss": 0.0, + "step": 30155 + }, + { + "epoch": 2.813847158719791, + "grad_norm": NaN, + "learning_rate": 0.00017256751846752132, + "loss": 0.0, + "step": 30156 + }, + { + "epoch": 2.8139404684146685, + "grad_norm": NaN, + "learning_rate": 0.00017256004074841003, + "loss": 0.0, + "step": 30157 + }, + { + "epoch": 2.8140337781095455, + "grad_norm": NaN, + "learning_rate": 0.0001725525629719353, + "loss": 0.0, + "step": 30158 + }, + { + "epoch": 2.814127087804423, + "grad_norm": NaN, + "learning_rate": 0.00017254508513811604, + "loss": 0.0, + "step": 30159 + }, + { + "epoch": 2.8142203974993003, + "grad_norm": NaN, + "learning_rate": 0.0001725376072469713, + "loss": 0.0, + "step": 30160 + }, + { + "epoch": 2.8143137071941773, + "grad_norm": NaN, + "learning_rate": 0.00017253012929852014, + "loss": 0.0, + "step": 30161 + }, + { + "epoch": 2.8144070168890547, + "grad_norm": NaN, + "learning_rate": 0.0001725226512927815, + "loss": 0.0, + "step": 30162 + }, + { + "epoch": 2.814500326583932, + "grad_norm": NaN, + "learning_rate": 0.0001725151732297744, + "loss": 0.0, + "step": 30163 + }, + { + "epoch": 2.8145936362788095, + "grad_norm": NaN, + "learning_rate": 0.00017250769510951793, + "loss": 0.0, + "step": 30164 + }, + { + "epoch": 2.8146869459736865, + "grad_norm": NaN, + "learning_rate": 0.00017250021693203103, + "loss": 0.0, + "step": 30165 + }, + { + "epoch": 2.814780255668564, + "grad_norm": NaN, + "learning_rate": 0.0001724927386973328, + "loss": 0.0, + "step": 30166 + }, + { + "epoch": 2.8148735653634414, + "grad_norm": NaN, + "learning_rate": 0.00017248526040544214, + "loss": 0.0, + "step": 30167 + }, + { + "epoch": 2.8149668750583183, + "grad_norm": NaN, + "learning_rate": 0.00017247778205637813, + "loss": 0.0, + "step": 30168 + }, + { + "epoch": 2.8150601847531957, + "grad_norm": NaN, + "learning_rate": 0.00017247030365015982, + "loss": 0.0, + "step": 30169 + }, + { + "epoch": 2.815153494448073, + "grad_norm": NaN, + "learning_rate": 0.00017246282518680614, + "loss": 0.0, + "step": 30170 + }, + { + "epoch": 2.8152468041429506, + "grad_norm": NaN, + "learning_rate": 0.00017245534666633617, + "loss": 0.0, + "step": 30171 + }, + { + "epoch": 2.815340113837828, + "grad_norm": NaN, + "learning_rate": 0.0001724478680887689, + "loss": 0.0, + "step": 30172 + }, + { + "epoch": 2.815433423532705, + "grad_norm": NaN, + "learning_rate": 0.00017244038945412335, + "loss": 0.0, + "step": 30173 + }, + { + "epoch": 2.8155267332275824, + "grad_norm": NaN, + "learning_rate": 0.00017243291076241853, + "loss": 0.0, + "step": 30174 + }, + { + "epoch": 2.8156200429224594, + "grad_norm": NaN, + "learning_rate": 0.0001724254320136735, + "loss": 0.0, + "step": 30175 + }, + { + "epoch": 2.815713352617337, + "grad_norm": NaN, + "learning_rate": 0.00017241795320790718, + "loss": 0.0, + "step": 30176 + }, + { + "epoch": 2.815806662312214, + "grad_norm": NaN, + "learning_rate": 0.00017241047434513873, + "loss": 0.0, + "step": 30177 + }, + { + "epoch": 2.8158999720070916, + "grad_norm": NaN, + "learning_rate": 0.00017240299542538702, + "loss": 0.0, + "step": 30178 + }, + { + "epoch": 2.815993281701969, + "grad_norm": NaN, + "learning_rate": 0.00017239551644867112, + "loss": 0.0, + "step": 30179 + }, + { + "epoch": 2.816086591396846, + "grad_norm": NaN, + "learning_rate": 0.0001723880374150101, + "loss": 0.0, + "step": 30180 + }, + { + "epoch": 2.8161799010917234, + "grad_norm": NaN, + "learning_rate": 0.00017238055832442288, + "loss": 0.0, + "step": 30181 + }, + { + "epoch": 2.816273210786601, + "grad_norm": NaN, + "learning_rate": 0.00017237307917692855, + "loss": 0.0, + "step": 30182 + }, + { + "epoch": 2.816366520481478, + "grad_norm": NaN, + "learning_rate": 0.00017236559997254613, + "loss": 0.0, + "step": 30183 + }, + { + "epoch": 2.8164598301763553, + "grad_norm": NaN, + "learning_rate": 0.0001723581207112946, + "loss": 0.0, + "step": 30184 + }, + { + "epoch": 2.8165531398712327, + "grad_norm": NaN, + "learning_rate": 0.00017235064139319297, + "loss": 0.0, + "step": 30185 + }, + { + "epoch": 2.81664644956611, + "grad_norm": NaN, + "learning_rate": 0.0001723431620182603, + "loss": 0.0, + "step": 30186 + }, + { + "epoch": 2.816739759260987, + "grad_norm": NaN, + "learning_rate": 0.0001723356825865156, + "loss": 0.0, + "step": 30187 + }, + { + "epoch": 2.8168330689558645, + "grad_norm": NaN, + "learning_rate": 0.00017232820309797788, + "loss": 0.0, + "step": 30188 + }, + { + "epoch": 2.816926378650742, + "grad_norm": NaN, + "learning_rate": 0.00017232072355266614, + "loss": 0.0, + "step": 30189 + }, + { + "epoch": 2.817019688345619, + "grad_norm": NaN, + "learning_rate": 0.0001723132439505994, + "loss": 0.0, + "step": 30190 + }, + { + "epoch": 2.8171129980404963, + "grad_norm": NaN, + "learning_rate": 0.00017230576429179672, + "loss": 0.0, + "step": 30191 + }, + { + "epoch": 2.8172063077353737, + "grad_norm": NaN, + "learning_rate": 0.00017229828457627705, + "loss": 0.0, + "step": 30192 + }, + { + "epoch": 2.817299617430251, + "grad_norm": NaN, + "learning_rate": 0.00017229080480405946, + "loss": 0.0, + "step": 30193 + }, + { + "epoch": 2.8173929271251286, + "grad_norm": NaN, + "learning_rate": 0.00017228332497516295, + "loss": 0.0, + "step": 30194 + }, + { + "epoch": 2.8174862368200055, + "grad_norm": NaN, + "learning_rate": 0.00017227584508960653, + "loss": 0.0, + "step": 30195 + }, + { + "epoch": 2.817579546514883, + "grad_norm": NaN, + "learning_rate": 0.0001722683651474093, + "loss": 0.0, + "step": 30196 + }, + { + "epoch": 2.81767285620976, + "grad_norm": NaN, + "learning_rate": 0.0001722608851485902, + "loss": 0.0, + "step": 30197 + }, + { + "epoch": 2.8177661659046374, + "grad_norm": NaN, + "learning_rate": 0.0001722534050931682, + "loss": 0.0, + "step": 30198 + }, + { + "epoch": 2.8178594755995148, + "grad_norm": NaN, + "learning_rate": 0.00017224592498116243, + "loss": 0.0, + "step": 30199 + }, + { + "epoch": 2.817952785294392, + "grad_norm": NaN, + "learning_rate": 0.00017223844481259188, + "loss": 0.0, + "step": 30200 + }, + { + "epoch": 2.8180460949892696, + "grad_norm": NaN, + "learning_rate": 0.00017223096458747548, + "loss": 0.0, + "step": 30201 + }, + { + "epoch": 2.8181394046841466, + "grad_norm": NaN, + "learning_rate": 0.00017222348430583238, + "loss": 0.0, + "step": 30202 + }, + { + "epoch": 2.818232714379024, + "grad_norm": NaN, + "learning_rate": 0.00017221600396768156, + "loss": 0.0, + "step": 30203 + }, + { + "epoch": 2.8183260240739014, + "grad_norm": NaN, + "learning_rate": 0.00017220852357304197, + "loss": 0.0, + "step": 30204 + }, + { + "epoch": 2.8184193337687784, + "grad_norm": NaN, + "learning_rate": 0.00017220104312193273, + "loss": 0.0, + "step": 30205 + }, + { + "epoch": 2.818512643463656, + "grad_norm": NaN, + "learning_rate": 0.00017219356261437277, + "loss": 0.0, + "step": 30206 + }, + { + "epoch": 2.8186059531585332, + "grad_norm": NaN, + "learning_rate": 0.0001721860820503812, + "loss": 0.0, + "step": 30207 + }, + { + "epoch": 2.8186992628534107, + "grad_norm": NaN, + "learning_rate": 0.00017217860142997694, + "loss": 0.0, + "step": 30208 + }, + { + "epoch": 2.8187925725482876, + "grad_norm": NaN, + "learning_rate": 0.00017217112075317915, + "loss": 0.0, + "step": 30209 + }, + { + "epoch": 2.818885882243165, + "grad_norm": NaN, + "learning_rate": 0.0001721636400200067, + "loss": 0.0, + "step": 30210 + }, + { + "epoch": 2.8189791919380425, + "grad_norm": NaN, + "learning_rate": 0.00017215615923047868, + "loss": 0.0, + "step": 30211 + }, + { + "epoch": 2.8190725016329194, + "grad_norm": NaN, + "learning_rate": 0.00017214867838461411, + "loss": 0.0, + "step": 30212 + }, + { + "epoch": 2.819165811327797, + "grad_norm": NaN, + "learning_rate": 0.00017214119748243202, + "loss": 0.0, + "step": 30213 + }, + { + "epoch": 2.8192591210226743, + "grad_norm": NaN, + "learning_rate": 0.0001721337165239514, + "loss": 0.0, + "step": 30214 + }, + { + "epoch": 2.8193524307175517, + "grad_norm": NaN, + "learning_rate": 0.00017212623550919137, + "loss": 0.0, + "step": 30215 + }, + { + "epoch": 2.819445740412429, + "grad_norm": NaN, + "learning_rate": 0.00017211875443817084, + "loss": 0.0, + "step": 30216 + }, + { + "epoch": 2.819539050107306, + "grad_norm": NaN, + "learning_rate": 0.00017211127331090884, + "loss": 0.0, + "step": 30217 + }, + { + "epoch": 2.8196323598021835, + "grad_norm": NaN, + "learning_rate": 0.00017210379212742445, + "loss": 0.0, + "step": 30218 + }, + { + "epoch": 2.8197256694970605, + "grad_norm": NaN, + "learning_rate": 0.00017209631088773667, + "loss": 0.0, + "step": 30219 + }, + { + "epoch": 2.819818979191938, + "grad_norm": NaN, + "learning_rate": 0.00017208882959186448, + "loss": 0.0, + "step": 30220 + }, + { + "epoch": 2.8199122888868153, + "grad_norm": NaN, + "learning_rate": 0.000172081348239827, + "loss": 0.0, + "step": 30221 + }, + { + "epoch": 2.8200055985816928, + "grad_norm": NaN, + "learning_rate": 0.00017207386683164313, + "loss": 0.0, + "step": 30222 + }, + { + "epoch": 2.82009890827657, + "grad_norm": NaN, + "learning_rate": 0.000172066385367332, + "loss": 0.0, + "step": 30223 + }, + { + "epoch": 2.820192217971447, + "grad_norm": NaN, + "learning_rate": 0.00017205890384691254, + "loss": 0.0, + "step": 30224 + }, + { + "epoch": 2.8202855276663246, + "grad_norm": NaN, + "learning_rate": 0.00017205142227040386, + "loss": 0.0, + "step": 30225 + }, + { + "epoch": 2.820378837361202, + "grad_norm": NaN, + "learning_rate": 0.00017204394063782493, + "loss": 0.0, + "step": 30226 + }, + { + "epoch": 2.820472147056079, + "grad_norm": NaN, + "learning_rate": 0.00017203645894919481, + "loss": 0.0, + "step": 30227 + }, + { + "epoch": 2.8205654567509564, + "grad_norm": NaN, + "learning_rate": 0.0001720289772045325, + "loss": 0.0, + "step": 30228 + }, + { + "epoch": 2.820658766445834, + "grad_norm": NaN, + "learning_rate": 0.00017202149540385703, + "loss": 0.0, + "step": 30229 + }, + { + "epoch": 2.820752076140711, + "grad_norm": NaN, + "learning_rate": 0.0001720140135471874, + "loss": 0.0, + "step": 30230 + }, + { + "epoch": 2.820845385835588, + "grad_norm": NaN, + "learning_rate": 0.00017200653163454264, + "loss": 0.0, + "step": 30231 + }, + { + "epoch": 2.8209386955304656, + "grad_norm": NaN, + "learning_rate": 0.00017199904966594181, + "loss": 0.0, + "step": 30232 + }, + { + "epoch": 2.821032005225343, + "grad_norm": NaN, + "learning_rate": 0.00017199156764140388, + "loss": 0.0, + "step": 30233 + }, + { + "epoch": 2.82112531492022, + "grad_norm": NaN, + "learning_rate": 0.00017198408556094797, + "loss": 0.0, + "step": 30234 + }, + { + "epoch": 2.8212186246150974, + "grad_norm": NaN, + "learning_rate": 0.00017197660342459302, + "loss": 0.0, + "step": 30235 + }, + { + "epoch": 2.821311934309975, + "grad_norm": NaN, + "learning_rate": 0.00017196912123235803, + "loss": 0.0, + "step": 30236 + }, + { + "epoch": 2.8214052440048523, + "grad_norm": NaN, + "learning_rate": 0.00017196163898426217, + "loss": 0.0, + "step": 30237 + }, + { + "epoch": 2.8214985536997292, + "grad_norm": NaN, + "learning_rate": 0.0001719541566803243, + "loss": 0.0, + "step": 30238 + }, + { + "epoch": 2.8215918633946067, + "grad_norm": NaN, + "learning_rate": 0.0001719466743205635, + "loss": 0.0, + "step": 30239 + }, + { + "epoch": 2.821685173089484, + "grad_norm": NaN, + "learning_rate": 0.00017193919190499886, + "loss": 0.0, + "step": 30240 + }, + { + "epoch": 2.821778482784361, + "grad_norm": NaN, + "learning_rate": 0.00017193170943364934, + "loss": 0.0, + "step": 30241 + }, + { + "epoch": 2.8218717924792385, + "grad_norm": NaN, + "learning_rate": 0.00017192422690653393, + "loss": 0.0, + "step": 30242 + }, + { + "epoch": 2.821965102174116, + "grad_norm": NaN, + "learning_rate": 0.00017191674432367177, + "loss": 0.0, + "step": 30243 + }, + { + "epoch": 2.8220584118689933, + "grad_norm": NaN, + "learning_rate": 0.00017190926168508176, + "loss": 0.0, + "step": 30244 + }, + { + "epoch": 2.8221517215638707, + "grad_norm": NaN, + "learning_rate": 0.00017190177899078306, + "loss": 0.0, + "step": 30245 + }, + { + "epoch": 2.8222450312587477, + "grad_norm": NaN, + "learning_rate": 0.00017189429624079455, + "loss": 0.0, + "step": 30246 + }, + { + "epoch": 2.822338340953625, + "grad_norm": NaN, + "learning_rate": 0.00017188681343513536, + "loss": 0.0, + "step": 30247 + }, + { + "epoch": 2.8224316506485025, + "grad_norm": NaN, + "learning_rate": 0.00017187933057382452, + "loss": 0.0, + "step": 30248 + }, + { + "epoch": 2.8225249603433795, + "grad_norm": NaN, + "learning_rate": 0.00017187184765688097, + "loss": 0.0, + "step": 30249 + }, + { + "epoch": 2.822618270038257, + "grad_norm": NaN, + "learning_rate": 0.00017186436468432382, + "loss": 0.0, + "step": 30250 + }, + { + "epoch": 2.8227115797331344, + "grad_norm": NaN, + "learning_rate": 0.00017185688165617205, + "loss": 0.0, + "step": 30251 + }, + { + "epoch": 2.8228048894280118, + "grad_norm": NaN, + "learning_rate": 0.00017184939857244468, + "loss": 0.0, + "step": 30252 + }, + { + "epoch": 2.8228981991228888, + "grad_norm": NaN, + "learning_rate": 0.00017184191543316086, + "loss": 0.0, + "step": 30253 + }, + { + "epoch": 2.822991508817766, + "grad_norm": NaN, + "learning_rate": 0.00017183443223833946, + "loss": 0.0, + "step": 30254 + }, + { + "epoch": 2.8230848185126436, + "grad_norm": NaN, + "learning_rate": 0.00017182694898799952, + "loss": 0.0, + "step": 30255 + }, + { + "epoch": 2.8231781282075206, + "grad_norm": NaN, + "learning_rate": 0.00017181946568216018, + "loss": 0.0, + "step": 30256 + }, + { + "epoch": 2.823271437902398, + "grad_norm": NaN, + "learning_rate": 0.00017181198232084038, + "loss": 0.0, + "step": 30257 + }, + { + "epoch": 2.8233647475972754, + "grad_norm": NaN, + "learning_rate": 0.00017180449890405913, + "loss": 0.0, + "step": 30258 + }, + { + "epoch": 2.823458057292153, + "grad_norm": NaN, + "learning_rate": 0.00017179701543183556, + "loss": 0.0, + "step": 30259 + }, + { + "epoch": 2.82355136698703, + "grad_norm": NaN, + "learning_rate": 0.00017178953190418863, + "loss": 0.0, + "step": 30260 + }, + { + "epoch": 2.823644676681907, + "grad_norm": NaN, + "learning_rate": 0.0001717820483211373, + "loss": 0.0, + "step": 30261 + }, + { + "epoch": 2.8237379863767846, + "grad_norm": NaN, + "learning_rate": 0.0001717745646827008, + "loss": 0.0, + "step": 30262 + }, + { + "epoch": 2.8238312960716616, + "grad_norm": NaN, + "learning_rate": 0.00017176708098889796, + "loss": 0.0, + "step": 30263 + }, + { + "epoch": 2.823924605766539, + "grad_norm": NaN, + "learning_rate": 0.00017175959723974787, + "loss": 0.0, + "step": 30264 + }, + { + "epoch": 2.8240179154614165, + "grad_norm": NaN, + "learning_rate": 0.00017175211343526962, + "loss": 0.0, + "step": 30265 + }, + { + "epoch": 2.824111225156294, + "grad_norm": NaN, + "learning_rate": 0.00017174462957548218, + "loss": 0.0, + "step": 30266 + }, + { + "epoch": 2.8242045348511713, + "grad_norm": NaN, + "learning_rate": 0.0001717371456604045, + "loss": 0.0, + "step": 30267 + }, + { + "epoch": 2.8242978445460483, + "grad_norm": NaN, + "learning_rate": 0.0001717296616900558, + "loss": 0.0, + "step": 30268 + }, + { + "epoch": 2.8243911542409257, + "grad_norm": NaN, + "learning_rate": 0.00017172217766445495, + "loss": 0.0, + "step": 30269 + }, + { + "epoch": 2.8244844639358027, + "grad_norm": NaN, + "learning_rate": 0.00017171469358362107, + "loss": 0.0, + "step": 30270 + }, + { + "epoch": 2.82457777363068, + "grad_norm": NaN, + "learning_rate": 0.00017170720944757308, + "loss": 0.0, + "step": 30271 + }, + { + "epoch": 2.8246710833255575, + "grad_norm": NaN, + "learning_rate": 0.0001716997252563302, + "loss": 0.0, + "step": 30272 + }, + { + "epoch": 2.824764393020435, + "grad_norm": NaN, + "learning_rate": 0.0001716922410099113, + "loss": 0.0, + "step": 30273 + }, + { + "epoch": 2.8248577027153123, + "grad_norm": NaN, + "learning_rate": 0.00017168475670833541, + "loss": 0.0, + "step": 30274 + }, + { + "epoch": 2.8249510124101893, + "grad_norm": NaN, + "learning_rate": 0.00017167727235162169, + "loss": 0.0, + "step": 30275 + }, + { + "epoch": 2.8250443221050667, + "grad_norm": NaN, + "learning_rate": 0.00017166978793978904, + "loss": 0.0, + "step": 30276 + }, + { + "epoch": 2.825137631799944, + "grad_norm": NaN, + "learning_rate": 0.00017166230347285652, + "loss": 0.0, + "step": 30277 + }, + { + "epoch": 2.825230941494821, + "grad_norm": NaN, + "learning_rate": 0.00017165481895084324, + "loss": 0.0, + "step": 30278 + }, + { + "epoch": 2.8253242511896985, + "grad_norm": NaN, + "learning_rate": 0.00017164733437376813, + "loss": 0.0, + "step": 30279 + }, + { + "epoch": 2.825417560884576, + "grad_norm": NaN, + "learning_rate": 0.00017163984974165024, + "loss": 0.0, + "step": 30280 + }, + { + "epoch": 2.8255108705794534, + "grad_norm": NaN, + "learning_rate": 0.00017163236505450866, + "loss": 0.0, + "step": 30281 + }, + { + "epoch": 2.8256041802743304, + "grad_norm": NaN, + "learning_rate": 0.00017162488031236236, + "loss": 0.0, + "step": 30282 + }, + { + "epoch": 2.8256974899692078, + "grad_norm": NaN, + "learning_rate": 0.00017161739551523037, + "loss": 0.0, + "step": 30283 + }, + { + "epoch": 2.825790799664085, + "grad_norm": NaN, + "learning_rate": 0.00017160991066313182, + "loss": 0.0, + "step": 30284 + }, + { + "epoch": 2.825884109358962, + "grad_norm": NaN, + "learning_rate": 0.00017160242575608563, + "loss": 0.0, + "step": 30285 + }, + { + "epoch": 2.8259774190538396, + "grad_norm": NaN, + "learning_rate": 0.00017159494079411084, + "loss": 0.0, + "step": 30286 + }, + { + "epoch": 2.826070728748717, + "grad_norm": NaN, + "learning_rate": 0.00017158745577722657, + "loss": 0.0, + "step": 30287 + }, + { + "epoch": 2.8261640384435944, + "grad_norm": NaN, + "learning_rate": 0.00017157997070545177, + "loss": 0.0, + "step": 30288 + }, + { + "epoch": 2.826257348138472, + "grad_norm": NaN, + "learning_rate": 0.00017157248557880548, + "loss": 0.0, + "step": 30289 + }, + { + "epoch": 2.826350657833349, + "grad_norm": NaN, + "learning_rate": 0.00017156500039730677, + "loss": 0.0, + "step": 30290 + }, + { + "epoch": 2.8264439675282262, + "grad_norm": NaN, + "learning_rate": 0.00017155751516097467, + "loss": 0.0, + "step": 30291 + }, + { + "epoch": 2.826537277223103, + "grad_norm": NaN, + "learning_rate": 0.00017155002986982818, + "loss": 0.0, + "step": 30292 + }, + { + "epoch": 2.8266305869179806, + "grad_norm": NaN, + "learning_rate": 0.0001715425445238863, + "loss": 0.0, + "step": 30293 + }, + { + "epoch": 2.826723896612858, + "grad_norm": NaN, + "learning_rate": 0.00017153505912316817, + "loss": 0.0, + "step": 30294 + }, + { + "epoch": 2.8268172063077355, + "grad_norm": NaN, + "learning_rate": 0.00017152757366769274, + "loss": 0.0, + "step": 30295 + }, + { + "epoch": 2.826910516002613, + "grad_norm": NaN, + "learning_rate": 0.00017152008815747908, + "loss": 0.0, + "step": 30296 + }, + { + "epoch": 2.82700382569749, + "grad_norm": NaN, + "learning_rate": 0.00017151260259254622, + "loss": 0.0, + "step": 30297 + }, + { + "epoch": 2.8270971353923673, + "grad_norm": NaN, + "learning_rate": 0.00017150511697291317, + "loss": 0.0, + "step": 30298 + }, + { + "epoch": 2.8271904450872447, + "grad_norm": NaN, + "learning_rate": 0.00017149763129859893, + "loss": 0.0, + "step": 30299 + }, + { + "epoch": 2.8272837547821217, + "grad_norm": NaN, + "learning_rate": 0.00017149014556962263, + "loss": 0.0, + "step": 30300 + }, + { + "epoch": 2.827377064476999, + "grad_norm": NaN, + "learning_rate": 0.00017148265978600327, + "loss": 0.0, + "step": 30301 + }, + { + "epoch": 2.8274703741718765, + "grad_norm": NaN, + "learning_rate": 0.00017147517394775982, + "loss": 0.0, + "step": 30302 + }, + { + "epoch": 2.827563683866754, + "grad_norm": NaN, + "learning_rate": 0.00017146768805491146, + "loss": 0.0, + "step": 30303 + }, + { + "epoch": 2.827656993561631, + "grad_norm": NaN, + "learning_rate": 0.00017146020210747704, + "loss": 0.0, + "step": 30304 + }, + { + "epoch": 2.8277503032565083, + "grad_norm": NaN, + "learning_rate": 0.0001714527161054757, + "loss": 0.0, + "step": 30305 + }, + { + "epoch": 2.8278436129513858, + "grad_norm": NaN, + "learning_rate": 0.00017144523004892646, + "loss": 0.0, + "step": 30306 + }, + { + "epoch": 2.8279369226462627, + "grad_norm": NaN, + "learning_rate": 0.00017143774393784837, + "loss": 0.0, + "step": 30307 + }, + { + "epoch": 2.82803023234114, + "grad_norm": NaN, + "learning_rate": 0.00017143025777226038, + "loss": 0.0, + "step": 30308 + }, + { + "epoch": 2.8281235420360176, + "grad_norm": NaN, + "learning_rate": 0.00017142277155218164, + "loss": 0.0, + "step": 30309 + }, + { + "epoch": 2.828216851730895, + "grad_norm": NaN, + "learning_rate": 0.0001714152852776312, + "loss": 0.0, + "step": 30310 + }, + { + "epoch": 2.8283101614257724, + "grad_norm": NaN, + "learning_rate": 0.00017140779894862793, + "loss": 0.0, + "step": 30311 + }, + { + "epoch": 2.8284034711206494, + "grad_norm": NaN, + "learning_rate": 0.00017140031256519103, + "loss": 0.0, + "step": 30312 + }, + { + "epoch": 2.828496780815527, + "grad_norm": NaN, + "learning_rate": 0.00017139282612733947, + "loss": 0.0, + "step": 30313 + }, + { + "epoch": 2.8285900905104038, + "grad_norm": NaN, + "learning_rate": 0.00017138533963509232, + "loss": 0.0, + "step": 30314 + }, + { + "epoch": 2.828683400205281, + "grad_norm": NaN, + "learning_rate": 0.00017137785308846847, + "loss": 0.0, + "step": 30315 + }, + { + "epoch": 2.8287767099001586, + "grad_norm": NaN, + "learning_rate": 0.00017137036648748718, + "loss": 0.0, + "step": 30316 + }, + { + "epoch": 2.828870019595036, + "grad_norm": NaN, + "learning_rate": 0.00017136287983216737, + "loss": 0.0, + "step": 30317 + }, + { + "epoch": 2.8289633292899135, + "grad_norm": NaN, + "learning_rate": 0.00017135539312252804, + "loss": 0.0, + "step": 30318 + }, + { + "epoch": 2.8290566389847904, + "grad_norm": NaN, + "learning_rate": 0.00017134790635858833, + "loss": 0.0, + "step": 30319 + }, + { + "epoch": 2.829149948679668, + "grad_norm": NaN, + "learning_rate": 0.00017134041954036718, + "loss": 0.0, + "step": 30320 + }, + { + "epoch": 2.8292432583745453, + "grad_norm": NaN, + "learning_rate": 0.00017133293266788363, + "loss": 0.0, + "step": 30321 + }, + { + "epoch": 2.8293365680694222, + "grad_norm": NaN, + "learning_rate": 0.00017132544574115682, + "loss": 0.0, + "step": 30322 + }, + { + "epoch": 2.8294298777642997, + "grad_norm": NaN, + "learning_rate": 0.00017131795876020568, + "loss": 0.0, + "step": 30323 + }, + { + "epoch": 2.829523187459177, + "grad_norm": NaN, + "learning_rate": 0.00017131047172504925, + "loss": 0.0, + "step": 30324 + }, + { + "epoch": 2.8296164971540545, + "grad_norm": NaN, + "learning_rate": 0.00017130298463570668, + "loss": 0.0, + "step": 30325 + }, + { + "epoch": 2.8297098068489315, + "grad_norm": NaN, + "learning_rate": 0.0001712954974921969, + "loss": 0.0, + "step": 30326 + }, + { + "epoch": 2.829803116543809, + "grad_norm": NaN, + "learning_rate": 0.00017128801029453895, + "loss": 0.0, + "step": 30327 + }, + { + "epoch": 2.8298964262386863, + "grad_norm": NaN, + "learning_rate": 0.0001712805230427519, + "loss": 0.0, + "step": 30328 + }, + { + "epoch": 2.8299897359335633, + "grad_norm": NaN, + "learning_rate": 0.00017127303573685483, + "loss": 0.0, + "step": 30329 + }, + { + "epoch": 2.8300830456284407, + "grad_norm": NaN, + "learning_rate": 0.00017126554837686668, + "loss": 0.0, + "step": 30330 + }, + { + "epoch": 2.830176355323318, + "grad_norm": NaN, + "learning_rate": 0.00017125806096280656, + "loss": 0.0, + "step": 30331 + }, + { + "epoch": 2.8302696650181955, + "grad_norm": NaN, + "learning_rate": 0.00017125057349469354, + "loss": 0.0, + "step": 30332 + }, + { + "epoch": 2.830362974713073, + "grad_norm": NaN, + "learning_rate": 0.0001712430859725465, + "loss": 0.0, + "step": 30333 + }, + { + "epoch": 2.83045628440795, + "grad_norm": NaN, + "learning_rate": 0.00017123559839638466, + "loss": 0.0, + "step": 30334 + }, + { + "epoch": 2.8305495941028274, + "grad_norm": NaN, + "learning_rate": 0.00017122811076622702, + "loss": 0.0, + "step": 30335 + }, + { + "epoch": 2.8306429037977043, + "grad_norm": NaN, + "learning_rate": 0.0001712206230820925, + "loss": 0.0, + "step": 30336 + }, + { + "epoch": 2.8307362134925818, + "grad_norm": NaN, + "learning_rate": 0.00017121313534400025, + "loss": 0.0, + "step": 30337 + }, + { + "epoch": 2.830829523187459, + "grad_norm": NaN, + "learning_rate": 0.0001712056475519693, + "loss": 0.0, + "step": 30338 + }, + { + "epoch": 2.8309228328823366, + "grad_norm": NaN, + "learning_rate": 0.00017119815970601866, + "loss": 0.0, + "step": 30339 + }, + { + "epoch": 2.831016142577214, + "grad_norm": NaN, + "learning_rate": 0.00017119067180616736, + "loss": 0.0, + "step": 30340 + }, + { + "epoch": 2.831109452272091, + "grad_norm": NaN, + "learning_rate": 0.0001711831838524345, + "loss": 0.0, + "step": 30341 + }, + { + "epoch": 2.8312027619669684, + "grad_norm": NaN, + "learning_rate": 0.00017117569584483902, + "loss": 0.0, + "step": 30342 + }, + { + "epoch": 2.831296071661846, + "grad_norm": NaN, + "learning_rate": 0.00017116820778340002, + "loss": 0.0, + "step": 30343 + }, + { + "epoch": 2.831389381356723, + "grad_norm": NaN, + "learning_rate": 0.00017116071966813662, + "loss": 0.0, + "step": 30344 + }, + { + "epoch": 2.8314826910516, + "grad_norm": NaN, + "learning_rate": 0.0001711532314990677, + "loss": 0.0, + "step": 30345 + }, + { + "epoch": 2.8315760007464776, + "grad_norm": NaN, + "learning_rate": 0.00017114574327621238, + "loss": 0.0, + "step": 30346 + }, + { + "epoch": 2.831669310441355, + "grad_norm": NaN, + "learning_rate": 0.00017113825499958968, + "loss": 0.0, + "step": 30347 + }, + { + "epoch": 2.831762620136232, + "grad_norm": NaN, + "learning_rate": 0.00017113076666921877, + "loss": 0.0, + "step": 30348 + }, + { + "epoch": 2.8318559298311095, + "grad_norm": NaN, + "learning_rate": 0.00017112327828511847, + "loss": 0.0, + "step": 30349 + }, + { + "epoch": 2.831949239525987, + "grad_norm": NaN, + "learning_rate": 0.00017111578984730795, + "loss": 0.0, + "step": 30350 + }, + { + "epoch": 2.832042549220864, + "grad_norm": NaN, + "learning_rate": 0.00017110830135580628, + "loss": 0.0, + "step": 30351 + }, + { + "epoch": 2.8321358589157413, + "grad_norm": NaN, + "learning_rate": 0.0001711008128106324, + "loss": 0.0, + "step": 30352 + }, + { + "epoch": 2.8322291686106187, + "grad_norm": NaN, + "learning_rate": 0.00017109332421180542, + "loss": 0.0, + "step": 30353 + }, + { + "epoch": 2.832322478305496, + "grad_norm": NaN, + "learning_rate": 0.00017108583555934442, + "loss": 0.0, + "step": 30354 + }, + { + "epoch": 2.832415788000373, + "grad_norm": NaN, + "learning_rate": 0.0001710783468532683, + "loss": 0.0, + "step": 30355 + }, + { + "epoch": 2.8325090976952505, + "grad_norm": NaN, + "learning_rate": 0.00017107085809359623, + "loss": 0.0, + "step": 30356 + }, + { + "epoch": 2.832602407390128, + "grad_norm": NaN, + "learning_rate": 0.0001710633692803472, + "loss": 0.0, + "step": 30357 + }, + { + "epoch": 2.832695717085005, + "grad_norm": NaN, + "learning_rate": 0.0001710558804135403, + "loss": 0.0, + "step": 30358 + }, + { + "epoch": 2.8327890267798823, + "grad_norm": NaN, + "learning_rate": 0.00017104839149319445, + "loss": 0.0, + "step": 30359 + }, + { + "epoch": 2.8328823364747597, + "grad_norm": NaN, + "learning_rate": 0.00017104090251932887, + "loss": 0.0, + "step": 30360 + }, + { + "epoch": 2.832975646169637, + "grad_norm": NaN, + "learning_rate": 0.00017103341349196244, + "loss": 0.0, + "step": 30361 + }, + { + "epoch": 2.8330689558645146, + "grad_norm": NaN, + "learning_rate": 0.0001710259244111143, + "loss": 0.0, + "step": 30362 + }, + { + "epoch": 2.8331622655593915, + "grad_norm": NaN, + "learning_rate": 0.00017101843527680345, + "loss": 0.0, + "step": 30363 + }, + { + "epoch": 2.833255575254269, + "grad_norm": NaN, + "learning_rate": 0.00017101094608904899, + "loss": 0.0, + "step": 30364 + }, + { + "epoch": 2.833348884949146, + "grad_norm": NaN, + "learning_rate": 0.0001710034568478698, + "loss": 0.0, + "step": 30365 + }, + { + "epoch": 2.8334421946440234, + "grad_norm": NaN, + "learning_rate": 0.00017099596755328512, + "loss": 0.0, + "step": 30366 + }, + { + "epoch": 2.833535504338901, + "grad_norm": NaN, + "learning_rate": 0.00017098847820531398, + "loss": 0.0, + "step": 30367 + }, + { + "epoch": 2.833628814033778, + "grad_norm": NaN, + "learning_rate": 0.00017098098880397524, + "loss": 0.0, + "step": 30368 + }, + { + "epoch": 2.8337221237286556, + "grad_norm": NaN, + "learning_rate": 0.00017097349934928812, + "loss": 0.0, + "step": 30369 + }, + { + "epoch": 2.8338154334235326, + "grad_norm": NaN, + "learning_rate": 0.00017096600984127164, + "loss": 0.0, + "step": 30370 + }, + { + "epoch": 2.83390874311841, + "grad_norm": NaN, + "learning_rate": 0.00017095852027994472, + "loss": 0.0, + "step": 30371 + }, + { + "epoch": 2.8340020528132874, + "grad_norm": NaN, + "learning_rate": 0.00017095103066532653, + "loss": 0.0, + "step": 30372 + }, + { + "epoch": 2.8340953625081644, + "grad_norm": NaN, + "learning_rate": 0.00017094354099743612, + "loss": 0.0, + "step": 30373 + }, + { + "epoch": 2.834188672203042, + "grad_norm": NaN, + "learning_rate": 0.0001709360512762924, + "loss": 0.0, + "step": 30374 + }, + { + "epoch": 2.8342819818979192, + "grad_norm": NaN, + "learning_rate": 0.00017092856150191456, + "loss": 0.0, + "step": 30375 + }, + { + "epoch": 2.8343752915927967, + "grad_norm": NaN, + "learning_rate": 0.00017092107167432163, + "loss": 0.0, + "step": 30376 + }, + { + "epoch": 2.8344686012876736, + "grad_norm": NaN, + "learning_rate": 0.0001709135817935325, + "loss": 0.0, + "step": 30377 + }, + { + "epoch": 2.834561910982551, + "grad_norm": NaN, + "learning_rate": 0.00017090609185956641, + "loss": 0.0, + "step": 30378 + }, + { + "epoch": 2.8346552206774285, + "grad_norm": NaN, + "learning_rate": 0.00017089860187244232, + "loss": 0.0, + "step": 30379 + }, + { + "epoch": 2.8347485303723055, + "grad_norm": NaN, + "learning_rate": 0.00017089111183217926, + "loss": 0.0, + "step": 30380 + }, + { + "epoch": 2.834841840067183, + "grad_norm": NaN, + "learning_rate": 0.00017088362173879625, + "loss": 0.0, + "step": 30381 + }, + { + "epoch": 2.8349351497620603, + "grad_norm": NaN, + "learning_rate": 0.0001708761315923124, + "loss": 0.0, + "step": 30382 + }, + { + "epoch": 2.8350284594569377, + "grad_norm": NaN, + "learning_rate": 0.0001708686413927468, + "loss": 0.0, + "step": 30383 + }, + { + "epoch": 2.835121769151815, + "grad_norm": NaN, + "learning_rate": 0.00017086115114011831, + "loss": 0.0, + "step": 30384 + }, + { + "epoch": 2.835215078846692, + "grad_norm": NaN, + "learning_rate": 0.00017085366083444616, + "loss": 0.0, + "step": 30385 + }, + { + "epoch": 2.8353083885415695, + "grad_norm": NaN, + "learning_rate": 0.00017084617047574937, + "loss": 0.0, + "step": 30386 + }, + { + "epoch": 2.8354016982364465, + "grad_norm": NaN, + "learning_rate": 0.00017083868006404686, + "loss": 0.0, + "step": 30387 + }, + { + "epoch": 2.835495007931324, + "grad_norm": NaN, + "learning_rate": 0.00017083118959935778, + "loss": 0.0, + "step": 30388 + }, + { + "epoch": 2.8355883176262013, + "grad_norm": NaN, + "learning_rate": 0.0001708236990817012, + "loss": 0.0, + "step": 30389 + }, + { + "epoch": 2.8356816273210788, + "grad_norm": NaN, + "learning_rate": 0.00017081620851109604, + "loss": 0.0, + "step": 30390 + }, + { + "epoch": 2.835774937015956, + "grad_norm": NaN, + "learning_rate": 0.0001708087178875615, + "loss": 0.0, + "step": 30391 + }, + { + "epoch": 2.835868246710833, + "grad_norm": NaN, + "learning_rate": 0.00017080122721111656, + "loss": 0.0, + "step": 30392 + }, + { + "epoch": 2.8359615564057106, + "grad_norm": NaN, + "learning_rate": 0.00017079373648178021, + "loss": 0.0, + "step": 30393 + }, + { + "epoch": 2.836054866100588, + "grad_norm": NaN, + "learning_rate": 0.00017078624569957156, + "loss": 0.0, + "step": 30394 + }, + { + "epoch": 2.836148175795465, + "grad_norm": NaN, + "learning_rate": 0.00017077875486450972, + "loss": 0.0, + "step": 30395 + }, + { + "epoch": 2.8362414854903424, + "grad_norm": NaN, + "learning_rate": 0.00017077126397661353, + "loss": 0.0, + "step": 30396 + }, + { + "epoch": 2.83633479518522, + "grad_norm": NaN, + "learning_rate": 0.00017076377303590224, + "loss": 0.0, + "step": 30397 + }, + { + "epoch": 2.8364281048800972, + "grad_norm": NaN, + "learning_rate": 0.00017075628204239488, + "loss": 0.0, + "step": 30398 + }, + { + "epoch": 2.836521414574974, + "grad_norm": NaN, + "learning_rate": 0.00017074879099611033, + "loss": 0.0, + "step": 30399 + }, + { + "epoch": 2.8366147242698516, + "grad_norm": NaN, + "learning_rate": 0.00017074129989706782, + "loss": 0.0, + "step": 30400 + }, + { + "epoch": 2.836708033964729, + "grad_norm": NaN, + "learning_rate": 0.0001707338087452863, + "loss": 0.0, + "step": 30401 + }, + { + "epoch": 2.836801343659606, + "grad_norm": NaN, + "learning_rate": 0.0001707263175407849, + "loss": 0.0, + "step": 30402 + }, + { + "epoch": 2.8368946533544834, + "grad_norm": NaN, + "learning_rate": 0.00017071882628358253, + "loss": 0.0, + "step": 30403 + }, + { + "epoch": 2.836987963049361, + "grad_norm": NaN, + "learning_rate": 0.0001707113349736984, + "loss": 0.0, + "step": 30404 + }, + { + "epoch": 2.8370812727442383, + "grad_norm": NaN, + "learning_rate": 0.00017070384361115147, + "loss": 0.0, + "step": 30405 + }, + { + "epoch": 2.8371745824391157, + "grad_norm": NaN, + "learning_rate": 0.00017069635219596073, + "loss": 0.0, + "step": 30406 + }, + { + "epoch": 2.8372678921339927, + "grad_norm": NaN, + "learning_rate": 0.00017068886072814534, + "loss": 0.0, + "step": 30407 + }, + { + "epoch": 2.83736120182887, + "grad_norm": NaN, + "learning_rate": 0.00017068136920772434, + "loss": 0.0, + "step": 30408 + }, + { + "epoch": 2.837454511523747, + "grad_norm": NaN, + "learning_rate": 0.0001706738776347167, + "loss": 0.0, + "step": 30409 + }, + { + "epoch": 2.8375478212186245, + "grad_norm": NaN, + "learning_rate": 0.0001706663860091415, + "loss": 0.0, + "step": 30410 + }, + { + "epoch": 2.837641130913502, + "grad_norm": NaN, + "learning_rate": 0.00017065889433101787, + "loss": 0.0, + "step": 30411 + }, + { + "epoch": 2.8377344406083793, + "grad_norm": NaN, + "learning_rate": 0.00017065140260036474, + "loss": 0.0, + "step": 30412 + }, + { + "epoch": 2.8378277503032567, + "grad_norm": NaN, + "learning_rate": 0.0001706439108172012, + "loss": 0.0, + "step": 30413 + }, + { + "epoch": 2.8379210599981337, + "grad_norm": NaN, + "learning_rate": 0.00017063641898154637, + "loss": 0.0, + "step": 30414 + }, + { + "epoch": 2.838014369693011, + "grad_norm": NaN, + "learning_rate": 0.00017062892709341916, + "loss": 0.0, + "step": 30415 + }, + { + "epoch": 2.8381076793878885, + "grad_norm": NaN, + "learning_rate": 0.00017062143515283874, + "loss": 0.0, + "step": 30416 + }, + { + "epoch": 2.8382009890827655, + "grad_norm": NaN, + "learning_rate": 0.00017061394315982413, + "loss": 0.0, + "step": 30417 + }, + { + "epoch": 2.838294298777643, + "grad_norm": NaN, + "learning_rate": 0.00017060645111439433, + "loss": 0.0, + "step": 30418 + }, + { + "epoch": 2.8383876084725204, + "grad_norm": NaN, + "learning_rate": 0.00017059895901656846, + "loss": 0.0, + "step": 30419 + }, + { + "epoch": 2.838480918167398, + "grad_norm": NaN, + "learning_rate": 0.00017059146686636555, + "loss": 0.0, + "step": 30420 + }, + { + "epoch": 2.8385742278622748, + "grad_norm": NaN, + "learning_rate": 0.0001705839746638046, + "loss": 0.0, + "step": 30421 + }, + { + "epoch": 2.838667537557152, + "grad_norm": NaN, + "learning_rate": 0.00017057648240890473, + "loss": 0.0, + "step": 30422 + }, + { + "epoch": 2.8387608472520296, + "grad_norm": NaN, + "learning_rate": 0.00017056899010168495, + "loss": 0.0, + "step": 30423 + }, + { + "epoch": 2.8388541569469066, + "grad_norm": NaN, + "learning_rate": 0.00017056149774216433, + "loss": 0.0, + "step": 30424 + }, + { + "epoch": 2.838947466641784, + "grad_norm": NaN, + "learning_rate": 0.0001705540053303619, + "loss": 0.0, + "step": 30425 + }, + { + "epoch": 2.8390407763366614, + "grad_norm": NaN, + "learning_rate": 0.00017054651286629676, + "loss": 0.0, + "step": 30426 + }, + { + "epoch": 2.839134086031539, + "grad_norm": NaN, + "learning_rate": 0.0001705390203499879, + "loss": 0.0, + "step": 30427 + }, + { + "epoch": 2.8392273957264162, + "grad_norm": NaN, + "learning_rate": 0.00017053152778145436, + "loss": 0.0, + "step": 30428 + }, + { + "epoch": 2.8393207054212932, + "grad_norm": NaN, + "learning_rate": 0.00017052403516071526, + "loss": 0.0, + "step": 30429 + }, + { + "epoch": 2.8394140151161706, + "grad_norm": NaN, + "learning_rate": 0.00017051654248778967, + "loss": 0.0, + "step": 30430 + }, + { + "epoch": 2.8395073248110476, + "grad_norm": NaN, + "learning_rate": 0.0001705090497626965, + "loss": 0.0, + "step": 30431 + }, + { + "epoch": 2.839600634505925, + "grad_norm": NaN, + "learning_rate": 0.00017050155698545492, + "loss": 0.0, + "step": 30432 + }, + { + "epoch": 2.8396939442008025, + "grad_norm": NaN, + "learning_rate": 0.000170494064156084, + "loss": 0.0, + "step": 30433 + }, + { + "epoch": 2.83978725389568, + "grad_norm": NaN, + "learning_rate": 0.00017048657127460265, + "loss": 0.0, + "step": 30434 + }, + { + "epoch": 2.8398805635905573, + "grad_norm": NaN, + "learning_rate": 0.00017047907834103008, + "loss": 0.0, + "step": 30435 + }, + { + "epoch": 2.8399738732854343, + "grad_norm": NaN, + "learning_rate": 0.00017047158535538529, + "loss": 0.0, + "step": 30436 + }, + { + "epoch": 2.8400671829803117, + "grad_norm": NaN, + "learning_rate": 0.0001704640923176873, + "loss": 0.0, + "step": 30437 + }, + { + "epoch": 2.840160492675189, + "grad_norm": NaN, + "learning_rate": 0.0001704565992279552, + "loss": 0.0, + "step": 30438 + }, + { + "epoch": 2.840253802370066, + "grad_norm": NaN, + "learning_rate": 0.00017044910608620802, + "loss": 0.0, + "step": 30439 + }, + { + "epoch": 2.8403471120649435, + "grad_norm": NaN, + "learning_rate": 0.0001704416128924648, + "loss": 0.0, + "step": 30440 + }, + { + "epoch": 2.840440421759821, + "grad_norm": NaN, + "learning_rate": 0.00017043411964674465, + "loss": 0.0, + "step": 30441 + }, + { + "epoch": 2.8405337314546983, + "grad_norm": NaN, + "learning_rate": 0.00017042662634906655, + "loss": 0.0, + "step": 30442 + }, + { + "epoch": 2.8406270411495753, + "grad_norm": NaN, + "learning_rate": 0.0001704191329994496, + "loss": 0.0, + "step": 30443 + }, + { + "epoch": 2.8407203508444527, + "grad_norm": NaN, + "learning_rate": 0.0001704116395979129, + "loss": 0.0, + "step": 30444 + }, + { + "epoch": 2.84081366053933, + "grad_norm": NaN, + "learning_rate": 0.00017040414614447535, + "loss": 0.0, + "step": 30445 + }, + { + "epoch": 2.840906970234207, + "grad_norm": NaN, + "learning_rate": 0.00017039665263915616, + "loss": 0.0, + "step": 30446 + }, + { + "epoch": 2.8410002799290845, + "grad_norm": NaN, + "learning_rate": 0.00017038915908197433, + "loss": 0.0, + "step": 30447 + }, + { + "epoch": 2.841093589623962, + "grad_norm": NaN, + "learning_rate": 0.0001703816654729489, + "loss": 0.0, + "step": 30448 + }, + { + "epoch": 2.8411868993188394, + "grad_norm": NaN, + "learning_rate": 0.00017037417181209894, + "loss": 0.0, + "step": 30449 + }, + { + "epoch": 2.8412802090137164, + "grad_norm": NaN, + "learning_rate": 0.00017036667809944343, + "loss": 0.0, + "step": 30450 + }, + { + "epoch": 2.841373518708594, + "grad_norm": NaN, + "learning_rate": 0.00017035918433500155, + "loss": 0.0, + "step": 30451 + }, + { + "epoch": 2.841466828403471, + "grad_norm": NaN, + "learning_rate": 0.00017035169051879233, + "loss": 0.0, + "step": 30452 + }, + { + "epoch": 2.841560138098348, + "grad_norm": NaN, + "learning_rate": 0.00017034419665083473, + "loss": 0.0, + "step": 30453 + }, + { + "epoch": 2.8416534477932256, + "grad_norm": NaN, + "learning_rate": 0.00017033670273114787, + "loss": 0.0, + "step": 30454 + }, + { + "epoch": 2.841746757488103, + "grad_norm": NaN, + "learning_rate": 0.00017032920875975085, + "loss": 0.0, + "step": 30455 + }, + { + "epoch": 2.8418400671829804, + "grad_norm": NaN, + "learning_rate": 0.0001703217147366626, + "loss": 0.0, + "step": 30456 + }, + { + "epoch": 2.841933376877858, + "grad_norm": NaN, + "learning_rate": 0.0001703142206619023, + "loss": 0.0, + "step": 30457 + }, + { + "epoch": 2.842026686572735, + "grad_norm": NaN, + "learning_rate": 0.00017030672653548892, + "loss": 0.0, + "step": 30458 + }, + { + "epoch": 2.8421199962676122, + "grad_norm": NaN, + "learning_rate": 0.00017029923235744156, + "loss": 0.0, + "step": 30459 + }, + { + "epoch": 2.8422133059624897, + "grad_norm": NaN, + "learning_rate": 0.0001702917381277793, + "loss": 0.0, + "step": 30460 + }, + { + "epoch": 2.8423066156573666, + "grad_norm": NaN, + "learning_rate": 0.0001702842438465211, + "loss": 0.0, + "step": 30461 + }, + { + "epoch": 2.842399925352244, + "grad_norm": NaN, + "learning_rate": 0.00017027674951368614, + "loss": 0.0, + "step": 30462 + }, + { + "epoch": 2.8424932350471215, + "grad_norm": NaN, + "learning_rate": 0.00017026925512929336, + "loss": 0.0, + "step": 30463 + }, + { + "epoch": 2.842586544741999, + "grad_norm": NaN, + "learning_rate": 0.0001702617606933619, + "loss": 0.0, + "step": 30464 + }, + { + "epoch": 2.842679854436876, + "grad_norm": NaN, + "learning_rate": 0.00017025426620591075, + "loss": 0.0, + "step": 30465 + }, + { + "epoch": 2.8427731641317533, + "grad_norm": NaN, + "learning_rate": 0.00017024677166695905, + "loss": 0.0, + "step": 30466 + }, + { + "epoch": 2.8428664738266307, + "grad_norm": NaN, + "learning_rate": 0.0001702392770765258, + "loss": 0.0, + "step": 30467 + }, + { + "epoch": 2.8429597835215077, + "grad_norm": NaN, + "learning_rate": 0.00017023178243463004, + "loss": 0.0, + "step": 30468 + }, + { + "epoch": 2.843053093216385, + "grad_norm": NaN, + "learning_rate": 0.00017022428774129085, + "loss": 0.0, + "step": 30469 + }, + { + "epoch": 2.8431464029112625, + "grad_norm": NaN, + "learning_rate": 0.0001702167929965273, + "loss": 0.0, + "step": 30470 + }, + { + "epoch": 2.84323971260614, + "grad_norm": NaN, + "learning_rate": 0.00017020929820035845, + "loss": 0.0, + "step": 30471 + }, + { + "epoch": 2.843333022301017, + "grad_norm": NaN, + "learning_rate": 0.0001702018033528033, + "loss": 0.0, + "step": 30472 + }, + { + "epoch": 2.8434263319958943, + "grad_norm": NaN, + "learning_rate": 0.00017019430845388096, + "loss": 0.0, + "step": 30473 + }, + { + "epoch": 2.8435196416907718, + "grad_norm": NaN, + "learning_rate": 0.00017018681350361054, + "loss": 0.0, + "step": 30474 + }, + { + "epoch": 2.8436129513856487, + "grad_norm": NaN, + "learning_rate": 0.0001701793185020109, + "loss": 0.0, + "step": 30475 + }, + { + "epoch": 2.843706261080526, + "grad_norm": NaN, + "learning_rate": 0.00017017182344910135, + "loss": 0.0, + "step": 30476 + }, + { + "epoch": 2.8437995707754036, + "grad_norm": NaN, + "learning_rate": 0.00017016432834490077, + "loss": 0.0, + "step": 30477 + }, + { + "epoch": 2.843892880470281, + "grad_norm": NaN, + "learning_rate": 0.0001701568331894283, + "loss": 0.0, + "step": 30478 + }, + { + "epoch": 2.8439861901651584, + "grad_norm": NaN, + "learning_rate": 0.00017014933798270294, + "loss": 0.0, + "step": 30479 + }, + { + "epoch": 2.8440794998600354, + "grad_norm": NaN, + "learning_rate": 0.00017014184272474384, + "loss": 0.0, + "step": 30480 + }, + { + "epoch": 2.844172809554913, + "grad_norm": NaN, + "learning_rate": 0.00017013434741556998, + "loss": 0.0, + "step": 30481 + }, + { + "epoch": 2.84426611924979, + "grad_norm": NaN, + "learning_rate": 0.0001701268520552004, + "loss": 0.0, + "step": 30482 + }, + { + "epoch": 2.844359428944667, + "grad_norm": NaN, + "learning_rate": 0.00017011935664365424, + "loss": 0.0, + "step": 30483 + }, + { + "epoch": 2.8444527386395446, + "grad_norm": NaN, + "learning_rate": 0.0001701118611809505, + "loss": 0.0, + "step": 30484 + }, + { + "epoch": 2.844546048334422, + "grad_norm": NaN, + "learning_rate": 0.00017010436566710824, + "loss": 0.0, + "step": 30485 + }, + { + "epoch": 2.8446393580292995, + "grad_norm": NaN, + "learning_rate": 0.00017009687010214653, + "loss": 0.0, + "step": 30486 + }, + { + "epoch": 2.8447326677241764, + "grad_norm": NaN, + "learning_rate": 0.00017008937448608448, + "loss": 0.0, + "step": 30487 + }, + { + "epoch": 2.844825977419054, + "grad_norm": NaN, + "learning_rate": 0.00017008187881894105, + "loss": 0.0, + "step": 30488 + }, + { + "epoch": 2.8449192871139313, + "grad_norm": NaN, + "learning_rate": 0.0001700743831007354, + "loss": 0.0, + "step": 30489 + }, + { + "epoch": 2.8450125968088082, + "grad_norm": NaN, + "learning_rate": 0.0001700668873314865, + "loss": 0.0, + "step": 30490 + }, + { + "epoch": 2.8451059065036857, + "grad_norm": NaN, + "learning_rate": 0.00017005939151121347, + "loss": 0.0, + "step": 30491 + }, + { + "epoch": 2.845199216198563, + "grad_norm": NaN, + "learning_rate": 0.00017005189563993532, + "loss": 0.0, + "step": 30492 + }, + { + "epoch": 2.8452925258934405, + "grad_norm": NaN, + "learning_rate": 0.00017004439971767123, + "loss": 0.0, + "step": 30493 + }, + { + "epoch": 2.8453858355883175, + "grad_norm": NaN, + "learning_rate": 0.00017003690374444008, + "loss": 0.0, + "step": 30494 + }, + { + "epoch": 2.845479145283195, + "grad_norm": NaN, + "learning_rate": 0.00017002940772026102, + "loss": 0.0, + "step": 30495 + }, + { + "epoch": 2.8455724549780723, + "grad_norm": NaN, + "learning_rate": 0.00017002191164515313, + "loss": 0.0, + "step": 30496 + }, + { + "epoch": 2.8456657646729493, + "grad_norm": NaN, + "learning_rate": 0.00017001441551913545, + "loss": 0.0, + "step": 30497 + }, + { + "epoch": 2.8457590743678267, + "grad_norm": NaN, + "learning_rate": 0.00017000691934222705, + "loss": 0.0, + "step": 30498 + }, + { + "epoch": 2.845852384062704, + "grad_norm": NaN, + "learning_rate": 0.00016999942311444696, + "loss": 0.0, + "step": 30499 + }, + { + "epoch": 2.8459456937575816, + "grad_norm": NaN, + "learning_rate": 0.0001699919268358143, + "loss": 0.0, + "step": 30500 + }, + { + "epoch": 2.846039003452459, + "grad_norm": NaN, + "learning_rate": 0.00016998443050634805, + "loss": 0.0, + "step": 30501 + }, + { + "epoch": 2.846132313147336, + "grad_norm": NaN, + "learning_rate": 0.0001699769341260673, + "loss": 0.0, + "step": 30502 + }, + { + "epoch": 2.8462256228422134, + "grad_norm": NaN, + "learning_rate": 0.00016996943769499116, + "loss": 0.0, + "step": 30503 + }, + { + "epoch": 2.8463189325370903, + "grad_norm": NaN, + "learning_rate": 0.00016996194121313865, + "loss": 0.0, + "step": 30504 + }, + { + "epoch": 2.8464122422319678, + "grad_norm": NaN, + "learning_rate": 0.0001699544446805288, + "loss": 0.0, + "step": 30505 + }, + { + "epoch": 2.846505551926845, + "grad_norm": NaN, + "learning_rate": 0.00016994694809718076, + "loss": 0.0, + "step": 30506 + }, + { + "epoch": 2.8465988616217226, + "grad_norm": NaN, + "learning_rate": 0.00016993945146311353, + "loss": 0.0, + "step": 30507 + }, + { + "epoch": 2.8466921713166, + "grad_norm": NaN, + "learning_rate": 0.00016993195477834618, + "loss": 0.0, + "step": 30508 + }, + { + "epoch": 2.846785481011477, + "grad_norm": NaN, + "learning_rate": 0.00016992445804289775, + "loss": 0.0, + "step": 30509 + }, + { + "epoch": 2.8468787907063544, + "grad_norm": NaN, + "learning_rate": 0.00016991696125678732, + "loss": 0.0, + "step": 30510 + }, + { + "epoch": 2.846972100401232, + "grad_norm": NaN, + "learning_rate": 0.00016990946442003402, + "loss": 0.0, + "step": 30511 + }, + { + "epoch": 2.847065410096109, + "grad_norm": NaN, + "learning_rate": 0.0001699019675326568, + "loss": 0.0, + "step": 30512 + }, + { + "epoch": 2.8471587197909862, + "grad_norm": NaN, + "learning_rate": 0.00016989447059467477, + "loss": 0.0, + "step": 30513 + }, + { + "epoch": 2.8472520294858636, + "grad_norm": NaN, + "learning_rate": 0.000169886973606107, + "loss": 0.0, + "step": 30514 + }, + { + "epoch": 2.847345339180741, + "grad_norm": NaN, + "learning_rate": 0.00016987947656697258, + "loss": 0.0, + "step": 30515 + }, + { + "epoch": 2.847438648875618, + "grad_norm": NaN, + "learning_rate": 0.00016987197947729047, + "loss": 0.0, + "step": 30516 + }, + { + "epoch": 2.8475319585704955, + "grad_norm": NaN, + "learning_rate": 0.00016986448233707988, + "loss": 0.0, + "step": 30517 + }, + { + "epoch": 2.847625268265373, + "grad_norm": NaN, + "learning_rate": 0.00016985698514635972, + "loss": 0.0, + "step": 30518 + }, + { + "epoch": 2.84771857796025, + "grad_norm": NaN, + "learning_rate": 0.00016984948790514922, + "loss": 0.0, + "step": 30519 + }, + { + "epoch": 2.8478118876551273, + "grad_norm": NaN, + "learning_rate": 0.00016984199061346726, + "loss": 0.0, + "step": 30520 + }, + { + "epoch": 2.8479051973500047, + "grad_norm": NaN, + "learning_rate": 0.00016983449327133304, + "loss": 0.0, + "step": 30521 + }, + { + "epoch": 2.847998507044882, + "grad_norm": NaN, + "learning_rate": 0.00016982699587876558, + "loss": 0.0, + "step": 30522 + }, + { + "epoch": 2.8480918167397595, + "grad_norm": NaN, + "learning_rate": 0.00016981949843578395, + "loss": 0.0, + "step": 30523 + }, + { + "epoch": 2.8481851264346365, + "grad_norm": NaN, + "learning_rate": 0.00016981200094240716, + "loss": 0.0, + "step": 30524 + }, + { + "epoch": 2.848278436129514, + "grad_norm": NaN, + "learning_rate": 0.0001698045033986544, + "loss": 0.0, + "step": 30525 + }, + { + "epoch": 2.848371745824391, + "grad_norm": NaN, + "learning_rate": 0.0001697970058045446, + "loss": 0.0, + "step": 30526 + }, + { + "epoch": 2.8484650555192683, + "grad_norm": NaN, + "learning_rate": 0.00016978950816009687, + "loss": 0.0, + "step": 30527 + }, + { + "epoch": 2.8485583652141457, + "grad_norm": NaN, + "learning_rate": 0.00016978201046533028, + "loss": 0.0, + "step": 30528 + }, + { + "epoch": 2.848651674909023, + "grad_norm": NaN, + "learning_rate": 0.00016977451272026396, + "loss": 0.0, + "step": 30529 + }, + { + "epoch": 2.8487449846039006, + "grad_norm": NaN, + "learning_rate": 0.00016976701492491686, + "loss": 0.0, + "step": 30530 + }, + { + "epoch": 2.8488382942987776, + "grad_norm": NaN, + "learning_rate": 0.0001697595170793081, + "loss": 0.0, + "step": 30531 + }, + { + "epoch": 2.848931603993655, + "grad_norm": NaN, + "learning_rate": 0.00016975201918345675, + "loss": 0.0, + "step": 30532 + }, + { + "epoch": 2.8490249136885324, + "grad_norm": NaN, + "learning_rate": 0.00016974452123738186, + "loss": 0.0, + "step": 30533 + }, + { + "epoch": 2.8491182233834094, + "grad_norm": NaN, + "learning_rate": 0.0001697370232411025, + "loss": 0.0, + "step": 30534 + }, + { + "epoch": 2.849211533078287, + "grad_norm": NaN, + "learning_rate": 0.00016972952519463775, + "loss": 0.0, + "step": 30535 + }, + { + "epoch": 2.849304842773164, + "grad_norm": NaN, + "learning_rate": 0.00016972202709800662, + "loss": 0.0, + "step": 30536 + }, + { + "epoch": 2.8493981524680416, + "grad_norm": NaN, + "learning_rate": 0.00016971452895122826, + "loss": 0.0, + "step": 30537 + }, + { + "epoch": 2.8494914621629186, + "grad_norm": NaN, + "learning_rate": 0.00016970703075432168, + "loss": 0.0, + "step": 30538 + }, + { + "epoch": 2.849584771857796, + "grad_norm": NaN, + "learning_rate": 0.00016969953250730598, + "loss": 0.0, + "step": 30539 + }, + { + "epoch": 2.8496780815526734, + "grad_norm": NaN, + "learning_rate": 0.00016969203421020017, + "loss": 0.0, + "step": 30540 + }, + { + "epoch": 2.8497713912475504, + "grad_norm": NaN, + "learning_rate": 0.00016968453586302336, + "loss": 0.0, + "step": 30541 + }, + { + "epoch": 2.849864700942428, + "grad_norm": NaN, + "learning_rate": 0.00016967703746579462, + "loss": 0.0, + "step": 30542 + }, + { + "epoch": 2.8499580106373053, + "grad_norm": NaN, + "learning_rate": 0.00016966953901853302, + "loss": 0.0, + "step": 30543 + }, + { + "epoch": 2.8500513203321827, + "grad_norm": NaN, + "learning_rate": 0.00016966204052125758, + "loss": 0.0, + "step": 30544 + }, + { + "epoch": 2.85014463002706, + "grad_norm": NaN, + "learning_rate": 0.00016965454197398738, + "loss": 0.0, + "step": 30545 + }, + { + "epoch": 2.850237939721937, + "grad_norm": NaN, + "learning_rate": 0.00016964704337674153, + "loss": 0.0, + "step": 30546 + }, + { + "epoch": 2.8503312494168145, + "grad_norm": NaN, + "learning_rate": 0.00016963954472953906, + "loss": 0.0, + "step": 30547 + }, + { + "epoch": 2.8504245591116915, + "grad_norm": NaN, + "learning_rate": 0.00016963204603239902, + "loss": 0.0, + "step": 30548 + }, + { + "epoch": 2.850517868806569, + "grad_norm": NaN, + "learning_rate": 0.00016962454728534053, + "loss": 0.0, + "step": 30549 + }, + { + "epoch": 2.8506111785014463, + "grad_norm": NaN, + "learning_rate": 0.00016961704848838263, + "loss": 0.0, + "step": 30550 + }, + { + "epoch": 2.8507044881963237, + "grad_norm": NaN, + "learning_rate": 0.0001696095496415444, + "loss": 0.0, + "step": 30551 + }, + { + "epoch": 2.850797797891201, + "grad_norm": NaN, + "learning_rate": 0.00016960205074484486, + "loss": 0.0, + "step": 30552 + }, + { + "epoch": 2.850891107586078, + "grad_norm": NaN, + "learning_rate": 0.00016959455179830314, + "loss": 0.0, + "step": 30553 + }, + { + "epoch": 2.8509844172809555, + "grad_norm": NaN, + "learning_rate": 0.00016958705280193826, + "loss": 0.0, + "step": 30554 + }, + { + "epoch": 2.851077726975833, + "grad_norm": NaN, + "learning_rate": 0.00016957955375576932, + "loss": 0.0, + "step": 30555 + }, + { + "epoch": 2.85117103667071, + "grad_norm": NaN, + "learning_rate": 0.00016957205465981535, + "loss": 0.0, + "step": 30556 + }, + { + "epoch": 2.8512643463655873, + "grad_norm": NaN, + "learning_rate": 0.00016956455551409548, + "loss": 0.0, + "step": 30557 + }, + { + "epoch": 2.8513576560604648, + "grad_norm": NaN, + "learning_rate": 0.0001695570563186287, + "loss": 0.0, + "step": 30558 + }, + { + "epoch": 2.851450965755342, + "grad_norm": NaN, + "learning_rate": 0.00016954955707343413, + "loss": 0.0, + "step": 30559 + }, + { + "epoch": 2.851544275450219, + "grad_norm": NaN, + "learning_rate": 0.00016954205777853086, + "loss": 0.0, + "step": 30560 + }, + { + "epoch": 2.8516375851450966, + "grad_norm": NaN, + "learning_rate": 0.00016953455843393792, + "loss": 0.0, + "step": 30561 + }, + { + "epoch": 2.851730894839974, + "grad_norm": NaN, + "learning_rate": 0.00016952705903967437, + "loss": 0.0, + "step": 30562 + }, + { + "epoch": 2.851824204534851, + "grad_norm": NaN, + "learning_rate": 0.0001695195595957593, + "loss": 0.0, + "step": 30563 + }, + { + "epoch": 2.8519175142297284, + "grad_norm": NaN, + "learning_rate": 0.00016951206010221175, + "loss": 0.0, + "step": 30564 + }, + { + "epoch": 2.852010823924606, + "grad_norm": NaN, + "learning_rate": 0.00016950456055905082, + "loss": 0.0, + "step": 30565 + }, + { + "epoch": 2.8521041336194832, + "grad_norm": NaN, + "learning_rate": 0.00016949706096629555, + "loss": 0.0, + "step": 30566 + }, + { + "epoch": 2.85219744331436, + "grad_norm": NaN, + "learning_rate": 0.00016948956132396508, + "loss": 0.0, + "step": 30567 + }, + { + "epoch": 2.8522907530092376, + "grad_norm": NaN, + "learning_rate": 0.0001694820616320784, + "loss": 0.0, + "step": 30568 + }, + { + "epoch": 2.852384062704115, + "grad_norm": NaN, + "learning_rate": 0.0001694745618906546, + "loss": 0.0, + "step": 30569 + }, + { + "epoch": 2.852477372398992, + "grad_norm": NaN, + "learning_rate": 0.0001694670620997128, + "loss": 0.0, + "step": 30570 + }, + { + "epoch": 2.8525706820938694, + "grad_norm": NaN, + "learning_rate": 0.00016945956225927197, + "loss": 0.0, + "step": 30571 + }, + { + "epoch": 2.852663991788747, + "grad_norm": NaN, + "learning_rate": 0.00016945206236935128, + "loss": 0.0, + "step": 30572 + }, + { + "epoch": 2.8527573014836243, + "grad_norm": NaN, + "learning_rate": 0.00016944456242996974, + "loss": 0.0, + "step": 30573 + }, + { + "epoch": 2.8528506111785017, + "grad_norm": NaN, + "learning_rate": 0.00016943706244114645, + "loss": 0.0, + "step": 30574 + }, + { + "epoch": 2.8529439208733787, + "grad_norm": NaN, + "learning_rate": 0.00016942956240290045, + "loss": 0.0, + "step": 30575 + }, + { + "epoch": 2.853037230568256, + "grad_norm": NaN, + "learning_rate": 0.00016942206231525082, + "loss": 0.0, + "step": 30576 + }, + { + "epoch": 2.853130540263133, + "grad_norm": NaN, + "learning_rate": 0.00016941456217821669, + "loss": 0.0, + "step": 30577 + }, + { + "epoch": 2.8532238499580105, + "grad_norm": NaN, + "learning_rate": 0.00016940706199181703, + "loss": 0.0, + "step": 30578 + }, + { + "epoch": 2.853317159652888, + "grad_norm": NaN, + "learning_rate": 0.00016939956175607098, + "loss": 0.0, + "step": 30579 + }, + { + "epoch": 2.8534104693477653, + "grad_norm": NaN, + "learning_rate": 0.0001693920614709976, + "loss": 0.0, + "step": 30580 + }, + { + "epoch": 2.8535037790426427, + "grad_norm": NaN, + "learning_rate": 0.00016938456113661592, + "loss": 0.0, + "step": 30581 + }, + { + "epoch": 2.8535970887375197, + "grad_norm": NaN, + "learning_rate": 0.00016937706075294503, + "loss": 0.0, + "step": 30582 + }, + { + "epoch": 2.853690398432397, + "grad_norm": NaN, + "learning_rate": 0.00016936956032000407, + "loss": 0.0, + "step": 30583 + }, + { + "epoch": 2.8537837081272746, + "grad_norm": NaN, + "learning_rate": 0.00016936205983781203, + "loss": 0.0, + "step": 30584 + }, + { + "epoch": 2.8538770178221515, + "grad_norm": NaN, + "learning_rate": 0.00016935455930638802, + "loss": 0.0, + "step": 30585 + }, + { + "epoch": 2.853970327517029, + "grad_norm": NaN, + "learning_rate": 0.00016934705872575107, + "loss": 0.0, + "step": 30586 + }, + { + "epoch": 2.8540636372119064, + "grad_norm": NaN, + "learning_rate": 0.0001693395580959203, + "loss": 0.0, + "step": 30587 + }, + { + "epoch": 2.854156946906784, + "grad_norm": NaN, + "learning_rate": 0.00016933205741691477, + "loss": 0.0, + "step": 30588 + }, + { + "epoch": 2.8542502566016608, + "grad_norm": NaN, + "learning_rate": 0.00016932455668875352, + "loss": 0.0, + "step": 30589 + }, + { + "epoch": 2.854343566296538, + "grad_norm": NaN, + "learning_rate": 0.00016931705591145566, + "loss": 0.0, + "step": 30590 + }, + { + "epoch": 2.8544368759914156, + "grad_norm": NaN, + "learning_rate": 0.00016930955508504024, + "loss": 0.0, + "step": 30591 + }, + { + "epoch": 2.8545301856862926, + "grad_norm": NaN, + "learning_rate": 0.00016930205420952638, + "loss": 0.0, + "step": 30592 + }, + { + "epoch": 2.85462349538117, + "grad_norm": NaN, + "learning_rate": 0.0001692945532849331, + "loss": 0.0, + "step": 30593 + }, + { + "epoch": 2.8547168050760474, + "grad_norm": NaN, + "learning_rate": 0.00016928705231127945, + "loss": 0.0, + "step": 30594 + }, + { + "epoch": 2.854810114770925, + "grad_norm": NaN, + "learning_rate": 0.00016927955128858457, + "loss": 0.0, + "step": 30595 + }, + { + "epoch": 2.8549034244658023, + "grad_norm": NaN, + "learning_rate": 0.00016927205021686746, + "loss": 0.0, + "step": 30596 + }, + { + "epoch": 2.8549967341606792, + "grad_norm": NaN, + "learning_rate": 0.00016926454909614732, + "loss": 0.0, + "step": 30597 + }, + { + "epoch": 2.8550900438555566, + "grad_norm": NaN, + "learning_rate": 0.00016925704792644306, + "loss": 0.0, + "step": 30598 + }, + { + "epoch": 2.8551833535504336, + "grad_norm": NaN, + "learning_rate": 0.00016924954670777384, + "loss": 0.0, + "step": 30599 + }, + { + "epoch": 2.855276663245311, + "grad_norm": NaN, + "learning_rate": 0.00016924204544015877, + "loss": 0.0, + "step": 30600 + }, + { + "epoch": 2.8553699729401885, + "grad_norm": NaN, + "learning_rate": 0.00016923454412361686, + "loss": 0.0, + "step": 30601 + }, + { + "epoch": 2.855463282635066, + "grad_norm": NaN, + "learning_rate": 0.0001692270427581672, + "loss": 0.0, + "step": 30602 + }, + { + "epoch": 2.8555565923299433, + "grad_norm": NaN, + "learning_rate": 0.00016921954134382883, + "loss": 0.0, + "step": 30603 + }, + { + "epoch": 2.8556499020248203, + "grad_norm": NaN, + "learning_rate": 0.00016921203988062088, + "loss": 0.0, + "step": 30604 + }, + { + "epoch": 2.8557432117196977, + "grad_norm": NaN, + "learning_rate": 0.00016920453836856243, + "loss": 0.0, + "step": 30605 + }, + { + "epoch": 2.855836521414575, + "grad_norm": NaN, + "learning_rate": 0.0001691970368076725, + "loss": 0.0, + "step": 30606 + }, + { + "epoch": 2.855929831109452, + "grad_norm": NaN, + "learning_rate": 0.0001691895351979702, + "loss": 0.0, + "step": 30607 + }, + { + "epoch": 2.8560231408043295, + "grad_norm": NaN, + "learning_rate": 0.00016918203353947462, + "loss": 0.0, + "step": 30608 + }, + { + "epoch": 2.856116450499207, + "grad_norm": NaN, + "learning_rate": 0.00016917453183220478, + "loss": 0.0, + "step": 30609 + }, + { + "epoch": 2.8562097601940843, + "grad_norm": NaN, + "learning_rate": 0.0001691670300761798, + "loss": 0.0, + "step": 30610 + }, + { + "epoch": 2.8563030698889613, + "grad_norm": NaN, + "learning_rate": 0.00016915952827141874, + "loss": 0.0, + "step": 30611 + }, + { + "epoch": 2.8563963795838387, + "grad_norm": NaN, + "learning_rate": 0.00016915202641794067, + "loss": 0.0, + "step": 30612 + }, + { + "epoch": 2.856489689278716, + "grad_norm": NaN, + "learning_rate": 0.00016914452451576465, + "loss": 0.0, + "step": 30613 + }, + { + "epoch": 2.856582998973593, + "grad_norm": NaN, + "learning_rate": 0.0001691370225649098, + "loss": 0.0, + "step": 30614 + }, + { + "epoch": 2.8566763086684706, + "grad_norm": NaN, + "learning_rate": 0.00016912952056539517, + "loss": 0.0, + "step": 30615 + }, + { + "epoch": 2.856769618363348, + "grad_norm": NaN, + "learning_rate": 0.00016912201851723984, + "loss": 0.0, + "step": 30616 + }, + { + "epoch": 2.8568629280582254, + "grad_norm": NaN, + "learning_rate": 0.00016911451642046286, + "loss": 0.0, + "step": 30617 + }, + { + "epoch": 2.856956237753103, + "grad_norm": NaN, + "learning_rate": 0.00016910701427508333, + "loss": 0.0, + "step": 30618 + }, + { + "epoch": 2.85704954744798, + "grad_norm": NaN, + "learning_rate": 0.00016909951208112035, + "loss": 0.0, + "step": 30619 + }, + { + "epoch": 2.857142857142857, + "grad_norm": NaN, + "learning_rate": 0.00016909200983859297, + "loss": 0.0, + "step": 30620 + }, + { + "epoch": 2.857236166837734, + "grad_norm": NaN, + "learning_rate": 0.00016908450754752022, + "loss": 0.0, + "step": 30621 + }, + { + "epoch": 2.8573294765326116, + "grad_norm": NaN, + "learning_rate": 0.00016907700520792126, + "loss": 0.0, + "step": 30622 + }, + { + "epoch": 2.857422786227489, + "grad_norm": NaN, + "learning_rate": 0.0001690695028198151, + "loss": 0.0, + "step": 30623 + }, + { + "epoch": 2.8575160959223664, + "grad_norm": NaN, + "learning_rate": 0.00016906200038322084, + "loss": 0.0, + "step": 30624 + }, + { + "epoch": 2.857609405617244, + "grad_norm": NaN, + "learning_rate": 0.00016905449789815754, + "loss": 0.0, + "step": 30625 + }, + { + "epoch": 2.857702715312121, + "grad_norm": NaN, + "learning_rate": 0.0001690469953646444, + "loss": 0.0, + "step": 30626 + }, + { + "epoch": 2.8577960250069983, + "grad_norm": NaN, + "learning_rate": 0.0001690394927827003, + "loss": 0.0, + "step": 30627 + }, + { + "epoch": 2.8578893347018757, + "grad_norm": NaN, + "learning_rate": 0.00016903199015234444, + "loss": 0.0, + "step": 30628 + }, + { + "epoch": 2.8579826443967526, + "grad_norm": NaN, + "learning_rate": 0.00016902448747359587, + "loss": 0.0, + "step": 30629 + }, + { + "epoch": 2.85807595409163, + "grad_norm": NaN, + "learning_rate": 0.00016901698474647365, + "loss": 0.0, + "step": 30630 + }, + { + "epoch": 2.8581692637865075, + "grad_norm": NaN, + "learning_rate": 0.00016900948197099685, + "loss": 0.0, + "step": 30631 + }, + { + "epoch": 2.858262573481385, + "grad_norm": NaN, + "learning_rate": 0.00016900197914718464, + "loss": 0.0, + "step": 30632 + }, + { + "epoch": 2.858355883176262, + "grad_norm": NaN, + "learning_rate": 0.000168994476275056, + "loss": 0.0, + "step": 30633 + }, + { + "epoch": 2.8584491928711393, + "grad_norm": NaN, + "learning_rate": 0.00016898697335462995, + "loss": 0.0, + "step": 30634 + }, + { + "epoch": 2.8585425025660167, + "grad_norm": NaN, + "learning_rate": 0.00016897947038592572, + "loss": 0.0, + "step": 30635 + }, + { + "epoch": 2.8586358122608937, + "grad_norm": NaN, + "learning_rate": 0.00016897196736896232, + "loss": 0.0, + "step": 30636 + }, + { + "epoch": 2.858729121955771, + "grad_norm": NaN, + "learning_rate": 0.00016896446430375884, + "loss": 0.0, + "step": 30637 + }, + { + "epoch": 2.8588224316506485, + "grad_norm": NaN, + "learning_rate": 0.00016895696119033433, + "loss": 0.0, + "step": 30638 + }, + { + "epoch": 2.858915741345526, + "grad_norm": NaN, + "learning_rate": 0.0001689494580287079, + "loss": 0.0, + "step": 30639 + }, + { + "epoch": 2.8590090510404034, + "grad_norm": NaN, + "learning_rate": 0.00016894195481889854, + "loss": 0.0, + "step": 30640 + }, + { + "epoch": 2.8591023607352803, + "grad_norm": NaN, + "learning_rate": 0.00016893445156092543, + "loss": 0.0, + "step": 30641 + }, + { + "epoch": 2.8591956704301578, + "grad_norm": NaN, + "learning_rate": 0.00016892694825480767, + "loss": 0.0, + "step": 30642 + }, + { + "epoch": 2.8592889801250347, + "grad_norm": NaN, + "learning_rate": 0.0001689194449005643, + "loss": 0.0, + "step": 30643 + }, + { + "epoch": 2.859382289819912, + "grad_norm": NaN, + "learning_rate": 0.00016891194149821427, + "loss": 0.0, + "step": 30644 + }, + { + "epoch": 2.8594755995147896, + "grad_norm": NaN, + "learning_rate": 0.0001689044380477769, + "loss": 0.0, + "step": 30645 + }, + { + "epoch": 2.859568909209667, + "grad_norm": NaN, + "learning_rate": 0.0001688969345492711, + "loss": 0.0, + "step": 30646 + }, + { + "epoch": 2.8596622189045444, + "grad_norm": NaN, + "learning_rate": 0.00016888943100271591, + "loss": 0.0, + "step": 30647 + }, + { + "epoch": 2.8597555285994214, + "grad_norm": NaN, + "learning_rate": 0.0001688819274081306, + "loss": 0.0, + "step": 30648 + }, + { + "epoch": 2.859848838294299, + "grad_norm": NaN, + "learning_rate": 0.00016887442376553412, + "loss": 0.0, + "step": 30649 + }, + { + "epoch": 2.8599421479891762, + "grad_norm": NaN, + "learning_rate": 0.00016886692007494559, + "loss": 0.0, + "step": 30650 + }, + { + "epoch": 2.860035457684053, + "grad_norm": NaN, + "learning_rate": 0.000168859416336384, + "loss": 0.0, + "step": 30651 + }, + { + "epoch": 2.8601287673789306, + "grad_norm": NaN, + "learning_rate": 0.00016885191254986854, + "loss": 0.0, + "step": 30652 + }, + { + "epoch": 2.860222077073808, + "grad_norm": NaN, + "learning_rate": 0.00016884440871541828, + "loss": 0.0, + "step": 30653 + }, + { + "epoch": 2.8603153867686855, + "grad_norm": NaN, + "learning_rate": 0.00016883690483305224, + "loss": 0.0, + "step": 30654 + }, + { + "epoch": 2.8604086964635624, + "grad_norm": NaN, + "learning_rate": 0.00016882940090278954, + "loss": 0.0, + "step": 30655 + }, + { + "epoch": 2.86050200615844, + "grad_norm": NaN, + "learning_rate": 0.00016882189692464923, + "loss": 0.0, + "step": 30656 + }, + { + "epoch": 2.8605953158533173, + "grad_norm": NaN, + "learning_rate": 0.00016881439289865042, + "loss": 0.0, + "step": 30657 + }, + { + "epoch": 2.8606886255481943, + "grad_norm": NaN, + "learning_rate": 0.0001688068888248122, + "loss": 0.0, + "step": 30658 + }, + { + "epoch": 2.8607819352430717, + "grad_norm": NaN, + "learning_rate": 0.0001687993847031536, + "loss": 0.0, + "step": 30659 + }, + { + "epoch": 2.860875244937949, + "grad_norm": NaN, + "learning_rate": 0.00016879188053369373, + "loss": 0.0, + "step": 30660 + }, + { + "epoch": 2.8609685546328265, + "grad_norm": NaN, + "learning_rate": 0.00016878437631645174, + "loss": 0.0, + "step": 30661 + }, + { + "epoch": 2.8610618643277035, + "grad_norm": NaN, + "learning_rate": 0.0001687768720514466, + "loss": 0.0, + "step": 30662 + }, + { + "epoch": 2.861155174022581, + "grad_norm": NaN, + "learning_rate": 0.0001687693677386974, + "loss": 0.0, + "step": 30663 + }, + { + "epoch": 2.8612484837174583, + "grad_norm": NaN, + "learning_rate": 0.00016876186337822335, + "loss": 0.0, + "step": 30664 + }, + { + "epoch": 2.8613417934123353, + "grad_norm": NaN, + "learning_rate": 0.00016875435897004338, + "loss": 0.0, + "step": 30665 + }, + { + "epoch": 2.8614351031072127, + "grad_norm": NaN, + "learning_rate": 0.00016874685451417662, + "loss": 0.0, + "step": 30666 + }, + { + "epoch": 2.86152841280209, + "grad_norm": NaN, + "learning_rate": 0.00016873935001064219, + "loss": 0.0, + "step": 30667 + }, + { + "epoch": 2.8616217224969676, + "grad_norm": NaN, + "learning_rate": 0.00016873184545945912, + "loss": 0.0, + "step": 30668 + }, + { + "epoch": 2.861715032191845, + "grad_norm": NaN, + "learning_rate": 0.00016872434086064652, + "loss": 0.0, + "step": 30669 + }, + { + "epoch": 2.861808341886722, + "grad_norm": NaN, + "learning_rate": 0.0001687168362142235, + "loss": 0.0, + "step": 30670 + }, + { + "epoch": 2.8619016515815994, + "grad_norm": NaN, + "learning_rate": 0.0001687093315202091, + "loss": 0.0, + "step": 30671 + }, + { + "epoch": 2.861994961276477, + "grad_norm": NaN, + "learning_rate": 0.0001687018267786224, + "loss": 0.0, + "step": 30672 + }, + { + "epoch": 2.8620882709713538, + "grad_norm": NaN, + "learning_rate": 0.0001686943219894825, + "loss": 0.0, + "step": 30673 + }, + { + "epoch": 2.862181580666231, + "grad_norm": NaN, + "learning_rate": 0.00016868681715280845, + "loss": 0.0, + "step": 30674 + }, + { + "epoch": 2.8622748903611086, + "grad_norm": NaN, + "learning_rate": 0.00016867931226861938, + "loss": 0.0, + "step": 30675 + }, + { + "epoch": 2.862368200055986, + "grad_norm": NaN, + "learning_rate": 0.00016867180733693438, + "loss": 0.0, + "step": 30676 + }, + { + "epoch": 2.862461509750863, + "grad_norm": NaN, + "learning_rate": 0.00016866430235777248, + "loss": 0.0, + "step": 30677 + }, + { + "epoch": 2.8625548194457404, + "grad_norm": NaN, + "learning_rate": 0.00016865679733115278, + "loss": 0.0, + "step": 30678 + }, + { + "epoch": 2.862648129140618, + "grad_norm": NaN, + "learning_rate": 0.0001686492922570943, + "loss": 0.0, + "step": 30679 + }, + { + "epoch": 2.862741438835495, + "grad_norm": NaN, + "learning_rate": 0.00016864178713561632, + "loss": 0.0, + "step": 30680 + }, + { + "epoch": 2.8628347485303722, + "grad_norm": NaN, + "learning_rate": 0.00016863428196673774, + "loss": 0.0, + "step": 30681 + }, + { + "epoch": 2.8629280582252497, + "grad_norm": NaN, + "learning_rate": 0.00016862677675047772, + "loss": 0.0, + "step": 30682 + }, + { + "epoch": 2.863021367920127, + "grad_norm": NaN, + "learning_rate": 0.00016861927148685533, + "loss": 0.0, + "step": 30683 + }, + { + "epoch": 2.863114677615004, + "grad_norm": NaN, + "learning_rate": 0.00016861176617588965, + "loss": 0.0, + "step": 30684 + }, + { + "epoch": 2.8632079873098815, + "grad_norm": NaN, + "learning_rate": 0.00016860426081759968, + "loss": 0.0, + "step": 30685 + }, + { + "epoch": 2.863301297004759, + "grad_norm": NaN, + "learning_rate": 0.0001685967554120047, + "loss": 0.0, + "step": 30686 + }, + { + "epoch": 2.863394606699636, + "grad_norm": NaN, + "learning_rate": 0.00016858924995912363, + "loss": 0.0, + "step": 30687 + }, + { + "epoch": 2.8634879163945133, + "grad_norm": NaN, + "learning_rate": 0.00016858174445897554, + "loss": 0.0, + "step": 30688 + }, + { + "epoch": 2.8635812260893907, + "grad_norm": NaN, + "learning_rate": 0.00016857423891157968, + "loss": 0.0, + "step": 30689 + }, + { + "epoch": 2.863674535784268, + "grad_norm": NaN, + "learning_rate": 0.000168566733316955, + "loss": 0.0, + "step": 30690 + }, + { + "epoch": 2.8637678454791455, + "grad_norm": NaN, + "learning_rate": 0.0001685592276751206, + "loss": 0.0, + "step": 30691 + }, + { + "epoch": 2.8638611551740225, + "grad_norm": NaN, + "learning_rate": 0.00016855172198609558, + "loss": 0.0, + "step": 30692 + }, + { + "epoch": 2.8639544648689, + "grad_norm": NaN, + "learning_rate": 0.00016854421624989905, + "loss": 0.0, + "step": 30693 + }, + { + "epoch": 2.864047774563777, + "grad_norm": NaN, + "learning_rate": 0.00016853671046655007, + "loss": 0.0, + "step": 30694 + }, + { + "epoch": 2.8641410842586543, + "grad_norm": NaN, + "learning_rate": 0.00016852920463606772, + "loss": 0.0, + "step": 30695 + }, + { + "epoch": 2.8642343939535317, + "grad_norm": NaN, + "learning_rate": 0.0001685216987584711, + "loss": 0.0, + "step": 30696 + }, + { + "epoch": 2.864327703648409, + "grad_norm": NaN, + "learning_rate": 0.00016851419283377925, + "loss": 0.0, + "step": 30697 + }, + { + "epoch": 2.8644210133432866, + "grad_norm": NaN, + "learning_rate": 0.0001685066868620113, + "loss": 0.0, + "step": 30698 + }, + { + "epoch": 2.8645143230381636, + "grad_norm": NaN, + "learning_rate": 0.00016849918084318636, + "loss": 0.0, + "step": 30699 + }, + { + "epoch": 2.864607632733041, + "grad_norm": NaN, + "learning_rate": 0.00016849167477732348, + "loss": 0.0, + "step": 30700 + }, + { + "epoch": 2.8647009424279184, + "grad_norm": NaN, + "learning_rate": 0.00016848416866444167, + "loss": 0.0, + "step": 30701 + }, + { + "epoch": 2.8647942521227954, + "grad_norm": NaN, + "learning_rate": 0.00016847666250456022, + "loss": 0.0, + "step": 30702 + }, + { + "epoch": 2.864887561817673, + "grad_norm": NaN, + "learning_rate": 0.00016846915629769804, + "loss": 0.0, + "step": 30703 + }, + { + "epoch": 2.86498087151255, + "grad_norm": NaN, + "learning_rate": 0.0001684616500438742, + "loss": 0.0, + "step": 30704 + }, + { + "epoch": 2.8650741812074276, + "grad_norm": NaN, + "learning_rate": 0.00016845414374310795, + "loss": 0.0, + "step": 30705 + }, + { + "epoch": 2.8651674909023046, + "grad_norm": NaN, + "learning_rate": 0.00016844663739541823, + "loss": 0.0, + "step": 30706 + }, + { + "epoch": 2.865260800597182, + "grad_norm": NaN, + "learning_rate": 0.00016843913100082417, + "loss": 0.0, + "step": 30707 + }, + { + "epoch": 2.8653541102920594, + "grad_norm": NaN, + "learning_rate": 0.0001684316245593449, + "loss": 0.0, + "step": 30708 + }, + { + "epoch": 2.8654474199869364, + "grad_norm": NaN, + "learning_rate": 0.00016842411807099946, + "loss": 0.0, + "step": 30709 + }, + { + "epoch": 2.865540729681814, + "grad_norm": NaN, + "learning_rate": 0.0001684166115358069, + "loss": 0.0, + "step": 30710 + }, + { + "epoch": 2.8656340393766913, + "grad_norm": NaN, + "learning_rate": 0.00016840910495378642, + "loss": 0.0, + "step": 30711 + }, + { + "epoch": 2.8657273490715687, + "grad_norm": NaN, + "learning_rate": 0.00016840159832495698, + "loss": 0.0, + "step": 30712 + }, + { + "epoch": 2.865820658766446, + "grad_norm": NaN, + "learning_rate": 0.0001683940916493377, + "loss": 0.0, + "step": 30713 + }, + { + "epoch": 2.865913968461323, + "grad_norm": NaN, + "learning_rate": 0.00016838658492694778, + "loss": 0.0, + "step": 30714 + }, + { + "epoch": 2.8660072781562005, + "grad_norm": NaN, + "learning_rate": 0.00016837907815780618, + "loss": 0.0, + "step": 30715 + }, + { + "epoch": 2.8661005878510775, + "grad_norm": NaN, + "learning_rate": 0.00016837157134193202, + "loss": 0.0, + "step": 30716 + }, + { + "epoch": 2.866193897545955, + "grad_norm": NaN, + "learning_rate": 0.0001683640644793444, + "loss": 0.0, + "step": 30717 + }, + { + "epoch": 2.8662872072408323, + "grad_norm": NaN, + "learning_rate": 0.00016835655757006243, + "loss": 0.0, + "step": 30718 + }, + { + "epoch": 2.8663805169357097, + "grad_norm": NaN, + "learning_rate": 0.00016834905061410518, + "loss": 0.0, + "step": 30719 + }, + { + "epoch": 2.866473826630587, + "grad_norm": NaN, + "learning_rate": 0.00016834154361149167, + "loss": 0.0, + "step": 30720 + }, + { + "epoch": 2.866567136325464, + "grad_norm": NaN, + "learning_rate": 0.0001683340365622411, + "loss": 0.0, + "step": 30721 + }, + { + "epoch": 2.8666604460203415, + "grad_norm": NaN, + "learning_rate": 0.0001683265294663725, + "loss": 0.0, + "step": 30722 + }, + { + "epoch": 2.866753755715219, + "grad_norm": NaN, + "learning_rate": 0.00016831902232390488, + "loss": 0.0, + "step": 30723 + }, + { + "epoch": 2.866847065410096, + "grad_norm": NaN, + "learning_rate": 0.00016831151513485752, + "loss": 0.0, + "step": 30724 + }, + { + "epoch": 2.8669403751049733, + "grad_norm": NaN, + "learning_rate": 0.00016830400789924938, + "loss": 0.0, + "step": 30725 + }, + { + "epoch": 2.8670336847998508, + "grad_norm": NaN, + "learning_rate": 0.00016829650061709952, + "loss": 0.0, + "step": 30726 + }, + { + "epoch": 2.867126994494728, + "grad_norm": NaN, + "learning_rate": 0.00016828899328842712, + "loss": 0.0, + "step": 30727 + }, + { + "epoch": 2.867220304189605, + "grad_norm": NaN, + "learning_rate": 0.00016828148591325126, + "loss": 0.0, + "step": 30728 + }, + { + "epoch": 2.8673136138844826, + "grad_norm": NaN, + "learning_rate": 0.0001682739784915909, + "loss": 0.0, + "step": 30729 + }, + { + "epoch": 2.86740692357936, + "grad_norm": NaN, + "learning_rate": 0.0001682664710234653, + "loss": 0.0, + "step": 30730 + }, + { + "epoch": 2.867500233274237, + "grad_norm": NaN, + "learning_rate": 0.00016825896350889345, + "loss": 0.0, + "step": 30731 + }, + { + "epoch": 2.8675935429691144, + "grad_norm": NaN, + "learning_rate": 0.00016825145594789444, + "loss": 0.0, + "step": 30732 + }, + { + "epoch": 2.867686852663992, + "grad_norm": NaN, + "learning_rate": 0.00016824394834048746, + "loss": 0.0, + "step": 30733 + }, + { + "epoch": 2.8677801623588692, + "grad_norm": NaN, + "learning_rate": 0.00016823644068669148, + "loss": 0.0, + "step": 30734 + }, + { + "epoch": 2.8678734720537467, + "grad_norm": NaN, + "learning_rate": 0.0001682289329865256, + "loss": 0.0, + "step": 30735 + }, + { + "epoch": 2.8679667817486236, + "grad_norm": NaN, + "learning_rate": 0.00016822142524000898, + "loss": 0.0, + "step": 30736 + }, + { + "epoch": 2.868060091443501, + "grad_norm": NaN, + "learning_rate": 0.0001682139174471607, + "loss": 0.0, + "step": 30737 + }, + { + "epoch": 2.868153401138378, + "grad_norm": NaN, + "learning_rate": 0.00016820640960799976, + "loss": 0.0, + "step": 30738 + }, + { + "epoch": 2.8682467108332554, + "grad_norm": NaN, + "learning_rate": 0.00016819890172254536, + "loss": 0.0, + "step": 30739 + }, + { + "epoch": 2.868340020528133, + "grad_norm": NaN, + "learning_rate": 0.00016819139379081657, + "loss": 0.0, + "step": 30740 + }, + { + "epoch": 2.8684333302230103, + "grad_norm": NaN, + "learning_rate": 0.00016818388581283243, + "loss": 0.0, + "step": 30741 + }, + { + "epoch": 2.8685266399178877, + "grad_norm": NaN, + "learning_rate": 0.000168176377788612, + "loss": 0.0, + "step": 30742 + }, + { + "epoch": 2.8686199496127647, + "grad_norm": NaN, + "learning_rate": 0.0001681688697181745, + "loss": 0.0, + "step": 30743 + }, + { + "epoch": 2.868713259307642, + "grad_norm": NaN, + "learning_rate": 0.0001681613616015389, + "loss": 0.0, + "step": 30744 + }, + { + "epoch": 2.8688065690025195, + "grad_norm": NaN, + "learning_rate": 0.0001681538534387243, + "loss": 0.0, + "step": 30745 + }, + { + "epoch": 2.8688998786973965, + "grad_norm": NaN, + "learning_rate": 0.0001681463452297499, + "loss": 0.0, + "step": 30746 + }, + { + "epoch": 2.868993188392274, + "grad_norm": NaN, + "learning_rate": 0.0001681388369746347, + "loss": 0.0, + "step": 30747 + }, + { + "epoch": 2.8690864980871513, + "grad_norm": NaN, + "learning_rate": 0.0001681313286733978, + "loss": 0.0, + "step": 30748 + }, + { + "epoch": 2.8691798077820287, + "grad_norm": NaN, + "learning_rate": 0.00016812382032605833, + "loss": 0.0, + "step": 30749 + }, + { + "epoch": 2.8692731174769057, + "grad_norm": NaN, + "learning_rate": 0.00016811631193263532, + "loss": 0.0, + "step": 30750 + }, + { + "epoch": 2.869366427171783, + "grad_norm": NaN, + "learning_rate": 0.0001681088034931479, + "loss": 0.0, + "step": 30751 + }, + { + "epoch": 2.8694597368666606, + "grad_norm": NaN, + "learning_rate": 0.0001681012950076152, + "loss": 0.0, + "step": 30752 + }, + { + "epoch": 2.8695530465615375, + "grad_norm": NaN, + "learning_rate": 0.00016809378647605623, + "loss": 0.0, + "step": 30753 + }, + { + "epoch": 2.869646356256415, + "grad_norm": NaN, + "learning_rate": 0.0001680862778984901, + "loss": 0.0, + "step": 30754 + }, + { + "epoch": 2.8697396659512924, + "grad_norm": NaN, + "learning_rate": 0.00016807876927493593, + "loss": 0.0, + "step": 30755 + }, + { + "epoch": 2.86983297564617, + "grad_norm": NaN, + "learning_rate": 0.00016807126060541284, + "loss": 0.0, + "step": 30756 + }, + { + "epoch": 2.869926285341047, + "grad_norm": NaN, + "learning_rate": 0.00016806375188993986, + "loss": 0.0, + "step": 30757 + }, + { + "epoch": 2.870019595035924, + "grad_norm": NaN, + "learning_rate": 0.0001680562431285361, + "loss": 0.0, + "step": 30758 + }, + { + "epoch": 2.8701129047308016, + "grad_norm": NaN, + "learning_rate": 0.0001680487343212207, + "loss": 0.0, + "step": 30759 + }, + { + "epoch": 2.8702062144256786, + "grad_norm": NaN, + "learning_rate": 0.0001680412254680127, + "loss": 0.0, + "step": 30760 + }, + { + "epoch": 2.870299524120556, + "grad_norm": NaN, + "learning_rate": 0.00016803371656893117, + "loss": 0.0, + "step": 30761 + }, + { + "epoch": 2.8703928338154334, + "grad_norm": NaN, + "learning_rate": 0.00016802620762399533, + "loss": 0.0, + "step": 30762 + }, + { + "epoch": 2.870486143510311, + "grad_norm": NaN, + "learning_rate": 0.00016801869863322414, + "loss": 0.0, + "step": 30763 + }, + { + "epoch": 2.8705794532051883, + "grad_norm": NaN, + "learning_rate": 0.0001680111895966367, + "loss": 0.0, + "step": 30764 + }, + { + "epoch": 2.8706727629000652, + "grad_norm": NaN, + "learning_rate": 0.00016800368051425217, + "loss": 0.0, + "step": 30765 + }, + { + "epoch": 2.8707660725949427, + "grad_norm": NaN, + "learning_rate": 0.00016799617138608963, + "loss": 0.0, + "step": 30766 + }, + { + "epoch": 2.87085938228982, + "grad_norm": NaN, + "learning_rate": 0.0001679886622121681, + "loss": 0.0, + "step": 30767 + }, + { + "epoch": 2.870952691984697, + "grad_norm": NaN, + "learning_rate": 0.0001679811529925068, + "loss": 0.0, + "step": 30768 + }, + { + "epoch": 2.8710460016795745, + "grad_norm": NaN, + "learning_rate": 0.00016797364372712472, + "loss": 0.0, + "step": 30769 + }, + { + "epoch": 2.871139311374452, + "grad_norm": NaN, + "learning_rate": 0.00016796613441604093, + "loss": 0.0, + "step": 30770 + }, + { + "epoch": 2.8712326210693293, + "grad_norm": NaN, + "learning_rate": 0.0001679586250592747, + "loss": 0.0, + "step": 30771 + }, + { + "epoch": 2.8713259307642063, + "grad_norm": NaN, + "learning_rate": 0.00016795111565684495, + "loss": 0.0, + "step": 30772 + }, + { + "epoch": 2.8714192404590837, + "grad_norm": NaN, + "learning_rate": 0.0001679436062087708, + "loss": 0.0, + "step": 30773 + }, + { + "epoch": 2.871512550153961, + "grad_norm": NaN, + "learning_rate": 0.0001679360967150714, + "loss": 0.0, + "step": 30774 + }, + { + "epoch": 2.871605859848838, + "grad_norm": NaN, + "learning_rate": 0.00016792858717576585, + "loss": 0.0, + "step": 30775 + }, + { + "epoch": 2.8716991695437155, + "grad_norm": NaN, + "learning_rate": 0.00016792107759087315, + "loss": 0.0, + "step": 30776 + }, + { + "epoch": 2.871792479238593, + "grad_norm": NaN, + "learning_rate": 0.0001679135679604125, + "loss": 0.0, + "step": 30777 + }, + { + "epoch": 2.8718857889334704, + "grad_norm": NaN, + "learning_rate": 0.000167906058284403, + "loss": 0.0, + "step": 30778 + }, + { + "epoch": 2.8719790986283473, + "grad_norm": NaN, + "learning_rate": 0.0001678985485628636, + "loss": 0.0, + "step": 30779 + }, + { + "epoch": 2.8720724083232247, + "grad_norm": NaN, + "learning_rate": 0.00016789103879581355, + "loss": 0.0, + "step": 30780 + }, + { + "epoch": 2.872165718018102, + "grad_norm": NaN, + "learning_rate": 0.0001678835289832719, + "loss": 0.0, + "step": 30781 + }, + { + "epoch": 2.872259027712979, + "grad_norm": NaN, + "learning_rate": 0.00016787601912525766, + "loss": 0.0, + "step": 30782 + }, + { + "epoch": 2.8723523374078566, + "grad_norm": NaN, + "learning_rate": 0.00016786850922179006, + "loss": 0.0, + "step": 30783 + }, + { + "epoch": 2.872445647102734, + "grad_norm": NaN, + "learning_rate": 0.00016786099927288816, + "loss": 0.0, + "step": 30784 + }, + { + "epoch": 2.8725389567976114, + "grad_norm": NaN, + "learning_rate": 0.000167853489278571, + "loss": 0.0, + "step": 30785 + }, + { + "epoch": 2.872632266492489, + "grad_norm": NaN, + "learning_rate": 0.00016784597923885766, + "loss": 0.0, + "step": 30786 + }, + { + "epoch": 2.872725576187366, + "grad_norm": NaN, + "learning_rate": 0.00016783846915376734, + "loss": 0.0, + "step": 30787 + }, + { + "epoch": 2.872818885882243, + "grad_norm": NaN, + "learning_rate": 0.00016783095902331903, + "loss": 0.0, + "step": 30788 + }, + { + "epoch": 2.87291219557712, + "grad_norm": NaN, + "learning_rate": 0.0001678234488475319, + "loss": 0.0, + "step": 30789 + }, + { + "epoch": 2.8730055052719976, + "grad_norm": NaN, + "learning_rate": 0.00016781593862642504, + "loss": 0.0, + "step": 30790 + }, + { + "epoch": 2.873098814966875, + "grad_norm": NaN, + "learning_rate": 0.0001678084283600175, + "loss": 0.0, + "step": 30791 + }, + { + "epoch": 2.8731921246617524, + "grad_norm": NaN, + "learning_rate": 0.00016780091804832837, + "loss": 0.0, + "step": 30792 + }, + { + "epoch": 2.87328543435663, + "grad_norm": NaN, + "learning_rate": 0.00016779340769137684, + "loss": 0.0, + "step": 30793 + }, + { + "epoch": 2.873378744051507, + "grad_norm": NaN, + "learning_rate": 0.00016778589728918194, + "loss": 0.0, + "step": 30794 + }, + { + "epoch": 2.8734720537463843, + "grad_norm": NaN, + "learning_rate": 0.0001677783868417627, + "loss": 0.0, + "step": 30795 + }, + { + "epoch": 2.8735653634412617, + "grad_norm": NaN, + "learning_rate": 0.00016777087634913836, + "loss": 0.0, + "step": 30796 + }, + { + "epoch": 2.8736586731361387, + "grad_norm": NaN, + "learning_rate": 0.00016776336581132795, + "loss": 0.0, + "step": 30797 + }, + { + "epoch": 2.873751982831016, + "grad_norm": NaN, + "learning_rate": 0.0001677558552283505, + "loss": 0.0, + "step": 30798 + }, + { + "epoch": 2.8738452925258935, + "grad_norm": NaN, + "learning_rate": 0.0001677483446002252, + "loss": 0.0, + "step": 30799 + }, + { + "epoch": 2.873938602220771, + "grad_norm": NaN, + "learning_rate": 0.00016774083392697117, + "loss": 0.0, + "step": 30800 + }, + { + "epoch": 2.874031911915648, + "grad_norm": NaN, + "learning_rate": 0.0001677333232086074, + "loss": 0.0, + "step": 30801 + }, + { + "epoch": 2.8741252216105253, + "grad_norm": NaN, + "learning_rate": 0.000167725812445153, + "loss": 0.0, + "step": 30802 + }, + { + "epoch": 2.8742185313054027, + "grad_norm": NaN, + "learning_rate": 0.00016771830163662724, + "loss": 0.0, + "step": 30803 + }, + { + "epoch": 2.8743118410002797, + "grad_norm": NaN, + "learning_rate": 0.00016771079078304897, + "loss": 0.0, + "step": 30804 + }, + { + "epoch": 2.874405150695157, + "grad_norm": NaN, + "learning_rate": 0.00016770327988443743, + "loss": 0.0, + "step": 30805 + }, + { + "epoch": 2.8744984603900345, + "grad_norm": NaN, + "learning_rate": 0.00016769576894081175, + "loss": 0.0, + "step": 30806 + }, + { + "epoch": 2.874591770084912, + "grad_norm": NaN, + "learning_rate": 0.00016768825795219092, + "loss": 0.0, + "step": 30807 + }, + { + "epoch": 2.8746850797797894, + "grad_norm": NaN, + "learning_rate": 0.00016768074691859408, + "loss": 0.0, + "step": 30808 + }, + { + "epoch": 2.8747783894746664, + "grad_norm": NaN, + "learning_rate": 0.0001676732358400404, + "loss": 0.0, + "step": 30809 + }, + { + "epoch": 2.8748716991695438, + "grad_norm": NaN, + "learning_rate": 0.00016766572471654885, + "loss": 0.0, + "step": 30810 + }, + { + "epoch": 2.8749650088644207, + "grad_norm": NaN, + "learning_rate": 0.0001676582135481386, + "loss": 0.0, + "step": 30811 + }, + { + "epoch": 2.875058318559298, + "grad_norm": NaN, + "learning_rate": 0.00016765070233482873, + "loss": 0.0, + "step": 30812 + }, + { + "epoch": 2.8751516282541756, + "grad_norm": NaN, + "learning_rate": 0.00016764319107663843, + "loss": 0.0, + "step": 30813 + }, + { + "epoch": 2.875244937949053, + "grad_norm": NaN, + "learning_rate": 0.00016763567977358667, + "loss": 0.0, + "step": 30814 + }, + { + "epoch": 2.8753382476439304, + "grad_norm": NaN, + "learning_rate": 0.0001676281684256926, + "loss": 0.0, + "step": 30815 + }, + { + "epoch": 2.8754315573388074, + "grad_norm": NaN, + "learning_rate": 0.00016762065703297533, + "loss": 0.0, + "step": 30816 + }, + { + "epoch": 2.875524867033685, + "grad_norm": NaN, + "learning_rate": 0.00016761314559545393, + "loss": 0.0, + "step": 30817 + }, + { + "epoch": 2.8756181767285622, + "grad_norm": NaN, + "learning_rate": 0.00016760563411314755, + "loss": 0.0, + "step": 30818 + }, + { + "epoch": 2.875711486423439, + "grad_norm": NaN, + "learning_rate": 0.00016759812258607525, + "loss": 0.0, + "step": 30819 + }, + { + "epoch": 2.8758047961183166, + "grad_norm": NaN, + "learning_rate": 0.0001675906110142561, + "loss": 0.0, + "step": 30820 + }, + { + "epoch": 2.875898105813194, + "grad_norm": NaN, + "learning_rate": 0.00016758309939770926, + "loss": 0.0, + "step": 30821 + }, + { + "epoch": 2.8759914155080715, + "grad_norm": NaN, + "learning_rate": 0.00016757558773645385, + "loss": 0.0, + "step": 30822 + }, + { + "epoch": 2.8760847252029484, + "grad_norm": NaN, + "learning_rate": 0.00016756807603050887, + "loss": 0.0, + "step": 30823 + }, + { + "epoch": 2.876178034897826, + "grad_norm": NaN, + "learning_rate": 0.00016756056427989346, + "loss": 0.0, + "step": 30824 + }, + { + "epoch": 2.8762713445927033, + "grad_norm": NaN, + "learning_rate": 0.0001675530524846268, + "loss": 0.0, + "step": 30825 + }, + { + "epoch": 2.8763646542875803, + "grad_norm": NaN, + "learning_rate": 0.00016754554064472786, + "loss": 0.0, + "step": 30826 + }, + { + "epoch": 2.8764579639824577, + "grad_norm": NaN, + "learning_rate": 0.00016753802876021587, + "loss": 0.0, + "step": 30827 + }, + { + "epoch": 2.876551273677335, + "grad_norm": NaN, + "learning_rate": 0.00016753051683110985, + "loss": 0.0, + "step": 30828 + }, + { + "epoch": 2.8766445833722125, + "grad_norm": NaN, + "learning_rate": 0.00016752300485742893, + "loss": 0.0, + "step": 30829 + }, + { + "epoch": 2.87673789306709, + "grad_norm": NaN, + "learning_rate": 0.00016751549283919214, + "loss": 0.0, + "step": 30830 + }, + { + "epoch": 2.876831202761967, + "grad_norm": NaN, + "learning_rate": 0.0001675079807764187, + "loss": 0.0, + "step": 30831 + }, + { + "epoch": 2.8769245124568443, + "grad_norm": NaN, + "learning_rate": 0.00016750046866912766, + "loss": 0.0, + "step": 30832 + }, + { + "epoch": 2.8770178221517213, + "grad_norm": NaN, + "learning_rate": 0.00016749295651733803, + "loss": 0.0, + "step": 30833 + }, + { + "epoch": 2.8771111318465987, + "grad_norm": NaN, + "learning_rate": 0.00016748544432106907, + "loss": 0.0, + "step": 30834 + }, + { + "epoch": 2.877204441541476, + "grad_norm": NaN, + "learning_rate": 0.00016747793208033983, + "loss": 0.0, + "step": 30835 + }, + { + "epoch": 2.8772977512363536, + "grad_norm": NaN, + "learning_rate": 0.0001674704197951693, + "loss": 0.0, + "step": 30836 + }, + { + "epoch": 2.877391060931231, + "grad_norm": NaN, + "learning_rate": 0.0001674629074655767, + "loss": 0.0, + "step": 30837 + }, + { + "epoch": 2.877484370626108, + "grad_norm": NaN, + "learning_rate": 0.00016745539509158114, + "loss": 0.0, + "step": 30838 + }, + { + "epoch": 2.8775776803209854, + "grad_norm": NaN, + "learning_rate": 0.0001674478826732016, + "loss": 0.0, + "step": 30839 + }, + { + "epoch": 2.877670990015863, + "grad_norm": NaN, + "learning_rate": 0.00016744037021045732, + "loss": 0.0, + "step": 30840 + }, + { + "epoch": 2.8777642997107398, + "grad_norm": NaN, + "learning_rate": 0.00016743285770336736, + "loss": 0.0, + "step": 30841 + }, + { + "epoch": 2.877857609405617, + "grad_norm": NaN, + "learning_rate": 0.00016742534515195074, + "loss": 0.0, + "step": 30842 + }, + { + "epoch": 2.8779509191004946, + "grad_norm": NaN, + "learning_rate": 0.0001674178325562267, + "loss": 0.0, + "step": 30843 + }, + { + "epoch": 2.878044228795372, + "grad_norm": NaN, + "learning_rate": 0.00016741031991621426, + "loss": 0.0, + "step": 30844 + }, + { + "epoch": 2.878137538490249, + "grad_norm": NaN, + "learning_rate": 0.00016740280723193248, + "loss": 0.0, + "step": 30845 + }, + { + "epoch": 2.8782308481851264, + "grad_norm": NaN, + "learning_rate": 0.00016739529450340057, + "loss": 0.0, + "step": 30846 + }, + { + "epoch": 2.878324157880004, + "grad_norm": NaN, + "learning_rate": 0.00016738778173063754, + "loss": 0.0, + "step": 30847 + }, + { + "epoch": 2.878417467574881, + "grad_norm": NaN, + "learning_rate": 0.00016738026891366256, + "loss": 0.0, + "step": 30848 + }, + { + "epoch": 2.8785107772697582, + "grad_norm": NaN, + "learning_rate": 0.0001673727560524947, + "loss": 0.0, + "step": 30849 + }, + { + "epoch": 2.8786040869646357, + "grad_norm": NaN, + "learning_rate": 0.00016736524314715304, + "loss": 0.0, + "step": 30850 + }, + { + "epoch": 2.878697396659513, + "grad_norm": NaN, + "learning_rate": 0.0001673577301976568, + "loss": 0.0, + "step": 30851 + }, + { + "epoch": 2.8787907063543905, + "grad_norm": NaN, + "learning_rate": 0.0001673502172040249, + "loss": 0.0, + "step": 30852 + }, + { + "epoch": 2.8788840160492675, + "grad_norm": NaN, + "learning_rate": 0.00016734270416627657, + "loss": 0.0, + "step": 30853 + }, + { + "epoch": 2.878977325744145, + "grad_norm": NaN, + "learning_rate": 0.0001673351910844309, + "loss": 0.0, + "step": 30854 + }, + { + "epoch": 2.879070635439022, + "grad_norm": NaN, + "learning_rate": 0.00016732767795850693, + "loss": 0.0, + "step": 30855 + }, + { + "epoch": 2.8791639451338993, + "grad_norm": NaN, + "learning_rate": 0.0001673201647885238, + "loss": 0.0, + "step": 30856 + }, + { + "epoch": 2.8792572548287767, + "grad_norm": NaN, + "learning_rate": 0.0001673126515745007, + "loss": 0.0, + "step": 30857 + }, + { + "epoch": 2.879350564523654, + "grad_norm": NaN, + "learning_rate": 0.00016730513831645657, + "loss": 0.0, + "step": 30858 + }, + { + "epoch": 2.8794438742185315, + "grad_norm": NaN, + "learning_rate": 0.00016729762501441065, + "loss": 0.0, + "step": 30859 + }, + { + "epoch": 2.8795371839134085, + "grad_norm": NaN, + "learning_rate": 0.00016729011166838199, + "loss": 0.0, + "step": 30860 + }, + { + "epoch": 2.879630493608286, + "grad_norm": NaN, + "learning_rate": 0.00016728259827838965, + "loss": 0.0, + "step": 30861 + }, + { + "epoch": 2.8797238033031634, + "grad_norm": NaN, + "learning_rate": 0.00016727508484445285, + "loss": 0.0, + "step": 30862 + }, + { + "epoch": 2.8798171129980403, + "grad_norm": NaN, + "learning_rate": 0.0001672675713665906, + "loss": 0.0, + "step": 30863 + }, + { + "epoch": 2.8799104226929177, + "grad_norm": NaN, + "learning_rate": 0.00016726005784482203, + "loss": 0.0, + "step": 30864 + }, + { + "epoch": 2.880003732387795, + "grad_norm": NaN, + "learning_rate": 0.0001672525442791662, + "loss": 0.0, + "step": 30865 + }, + { + "epoch": 2.8800970420826726, + "grad_norm": NaN, + "learning_rate": 0.0001672450306696423, + "loss": 0.0, + "step": 30866 + }, + { + "epoch": 2.8801903517775496, + "grad_norm": NaN, + "learning_rate": 0.00016723751701626944, + "loss": 0.0, + "step": 30867 + }, + { + "epoch": 2.880283661472427, + "grad_norm": NaN, + "learning_rate": 0.0001672300033190666, + "loss": 0.0, + "step": 30868 + }, + { + "epoch": 2.8803769711673044, + "grad_norm": NaN, + "learning_rate": 0.00016722248957805303, + "loss": 0.0, + "step": 30869 + }, + { + "epoch": 2.8804702808621814, + "grad_norm": NaN, + "learning_rate": 0.00016721497579324772, + "loss": 0.0, + "step": 30870 + }, + { + "epoch": 2.880563590557059, + "grad_norm": NaN, + "learning_rate": 0.00016720746196466986, + "loss": 0.0, + "step": 30871 + }, + { + "epoch": 2.880656900251936, + "grad_norm": NaN, + "learning_rate": 0.00016719994809233853, + "loss": 0.0, + "step": 30872 + }, + { + "epoch": 2.8807502099468136, + "grad_norm": NaN, + "learning_rate": 0.0001671924341762728, + "loss": 0.0, + "step": 30873 + }, + { + "epoch": 2.8808435196416906, + "grad_norm": NaN, + "learning_rate": 0.00016718492021649178, + "loss": 0.0, + "step": 30874 + }, + { + "epoch": 2.880936829336568, + "grad_norm": NaN, + "learning_rate": 0.00016717740621301466, + "loss": 0.0, + "step": 30875 + }, + { + "epoch": 2.8810301390314454, + "grad_norm": NaN, + "learning_rate": 0.00016716989216586047, + "loss": 0.0, + "step": 30876 + }, + { + "epoch": 2.8811234487263224, + "grad_norm": NaN, + "learning_rate": 0.0001671623780750483, + "loss": 0.0, + "step": 30877 + }, + { + "epoch": 2.8812167584212, + "grad_norm": NaN, + "learning_rate": 0.0001671548639405973, + "loss": 0.0, + "step": 30878 + }, + { + "epoch": 2.8813100681160773, + "grad_norm": NaN, + "learning_rate": 0.0001671473497625266, + "loss": 0.0, + "step": 30879 + }, + { + "epoch": 2.8814033778109547, + "grad_norm": NaN, + "learning_rate": 0.00016713983554085519, + "loss": 0.0, + "step": 30880 + }, + { + "epoch": 2.881496687505832, + "grad_norm": NaN, + "learning_rate": 0.0001671323212756023, + "loss": 0.0, + "step": 30881 + }, + { + "epoch": 2.881589997200709, + "grad_norm": NaN, + "learning_rate": 0.000167124806966787, + "loss": 0.0, + "step": 30882 + }, + { + "epoch": 2.8816833068955865, + "grad_norm": NaN, + "learning_rate": 0.00016711729261442837, + "loss": 0.0, + "step": 30883 + }, + { + "epoch": 2.881776616590464, + "grad_norm": NaN, + "learning_rate": 0.00016710977821854554, + "loss": 0.0, + "step": 30884 + }, + { + "epoch": 2.881869926285341, + "grad_norm": NaN, + "learning_rate": 0.00016710226377915764, + "loss": 0.0, + "step": 30885 + }, + { + "epoch": 2.8819632359802183, + "grad_norm": NaN, + "learning_rate": 0.00016709474929628372, + "loss": 0.0, + "step": 30886 + }, + { + "epoch": 2.8820565456750957, + "grad_norm": NaN, + "learning_rate": 0.00016708723476994291, + "loss": 0.0, + "step": 30887 + }, + { + "epoch": 2.882149855369973, + "grad_norm": NaN, + "learning_rate": 0.00016707972020015433, + "loss": 0.0, + "step": 30888 + }, + { + "epoch": 2.88224316506485, + "grad_norm": NaN, + "learning_rate": 0.00016707220558693708, + "loss": 0.0, + "step": 30889 + }, + { + "epoch": 2.8823364747597275, + "grad_norm": NaN, + "learning_rate": 0.00016706469093031025, + "loss": 0.0, + "step": 30890 + }, + { + "epoch": 2.882429784454605, + "grad_norm": NaN, + "learning_rate": 0.00016705717623029302, + "loss": 0.0, + "step": 30891 + }, + { + "epoch": 2.882523094149482, + "grad_norm": NaN, + "learning_rate": 0.00016704966148690443, + "loss": 0.0, + "step": 30892 + }, + { + "epoch": 2.8826164038443594, + "grad_norm": NaN, + "learning_rate": 0.00016704214670016358, + "loss": 0.0, + "step": 30893 + }, + { + "epoch": 2.8827097135392368, + "grad_norm": NaN, + "learning_rate": 0.00016703463187008958, + "loss": 0.0, + "step": 30894 + }, + { + "epoch": 2.882803023234114, + "grad_norm": NaN, + "learning_rate": 0.0001670271169967016, + "loss": 0.0, + "step": 30895 + }, + { + "epoch": 2.882896332928991, + "grad_norm": NaN, + "learning_rate": 0.00016701960208001868, + "loss": 0.0, + "step": 30896 + }, + { + "epoch": 2.8829896426238686, + "grad_norm": NaN, + "learning_rate": 0.00016701208712005997, + "loss": 0.0, + "step": 30897 + }, + { + "epoch": 2.883082952318746, + "grad_norm": NaN, + "learning_rate": 0.00016700457211684457, + "loss": 0.0, + "step": 30898 + }, + { + "epoch": 2.883176262013623, + "grad_norm": NaN, + "learning_rate": 0.00016699705707039154, + "loss": 0.0, + "step": 30899 + }, + { + "epoch": 2.8832695717085004, + "grad_norm": NaN, + "learning_rate": 0.00016698954198072005, + "loss": 0.0, + "step": 30900 + }, + { + "epoch": 2.883362881403378, + "grad_norm": NaN, + "learning_rate": 0.0001669820268478492, + "loss": 0.0, + "step": 30901 + }, + { + "epoch": 2.8834561910982552, + "grad_norm": NaN, + "learning_rate": 0.00016697451167179808, + "loss": 0.0, + "step": 30902 + }, + { + "epoch": 2.8835495007931327, + "grad_norm": NaN, + "learning_rate": 0.0001669669964525858, + "loss": 0.0, + "step": 30903 + }, + { + "epoch": 2.8836428104880096, + "grad_norm": NaN, + "learning_rate": 0.00016695948119023146, + "loss": 0.0, + "step": 30904 + }, + { + "epoch": 2.883736120182887, + "grad_norm": NaN, + "learning_rate": 0.0001669519658847542, + "loss": 0.0, + "step": 30905 + }, + { + "epoch": 2.883829429877764, + "grad_norm": NaN, + "learning_rate": 0.00016694445053617313, + "loss": 0.0, + "step": 30906 + }, + { + "epoch": 2.8839227395726414, + "grad_norm": NaN, + "learning_rate": 0.00016693693514450728, + "loss": 0.0, + "step": 30907 + }, + { + "epoch": 2.884016049267519, + "grad_norm": NaN, + "learning_rate": 0.00016692941970977587, + "loss": 0.0, + "step": 30908 + }, + { + "epoch": 2.8841093589623963, + "grad_norm": NaN, + "learning_rate": 0.00016692190423199798, + "loss": 0.0, + "step": 30909 + }, + { + "epoch": 2.8842026686572737, + "grad_norm": NaN, + "learning_rate": 0.00016691438871119265, + "loss": 0.0, + "step": 30910 + }, + { + "epoch": 2.8842959783521507, + "grad_norm": NaN, + "learning_rate": 0.00016690687314737906, + "loss": 0.0, + "step": 30911 + }, + { + "epoch": 2.884389288047028, + "grad_norm": NaN, + "learning_rate": 0.0001668993575405763, + "loss": 0.0, + "step": 30912 + }, + { + "epoch": 2.8844825977419055, + "grad_norm": NaN, + "learning_rate": 0.0001668918418908035, + "loss": 0.0, + "step": 30913 + }, + { + "epoch": 2.8845759074367825, + "grad_norm": NaN, + "learning_rate": 0.00016688432619807973, + "loss": 0.0, + "step": 30914 + }, + { + "epoch": 2.88466921713166, + "grad_norm": NaN, + "learning_rate": 0.0001668768104624241, + "loss": 0.0, + "step": 30915 + }, + { + "epoch": 2.8847625268265373, + "grad_norm": NaN, + "learning_rate": 0.00016686929468385576, + "loss": 0.0, + "step": 30916 + }, + { + "epoch": 2.8848558365214148, + "grad_norm": NaN, + "learning_rate": 0.00016686177886239382, + "loss": 0.0, + "step": 30917 + }, + { + "epoch": 2.8849491462162917, + "grad_norm": NaN, + "learning_rate": 0.00016685426299805736, + "loss": 0.0, + "step": 30918 + }, + { + "epoch": 2.885042455911169, + "grad_norm": NaN, + "learning_rate": 0.00016684674709086547, + "loss": 0.0, + "step": 30919 + }, + { + "epoch": 2.8851357656060466, + "grad_norm": NaN, + "learning_rate": 0.00016683923114083735, + "loss": 0.0, + "step": 30920 + }, + { + "epoch": 2.8852290753009235, + "grad_norm": NaN, + "learning_rate": 0.000166831715147992, + "loss": 0.0, + "step": 30921 + }, + { + "epoch": 2.885322384995801, + "grad_norm": NaN, + "learning_rate": 0.0001668241991123486, + "loss": 0.0, + "step": 30922 + }, + { + "epoch": 2.8854156946906784, + "grad_norm": NaN, + "learning_rate": 0.00016681668303392626, + "loss": 0.0, + "step": 30923 + }, + { + "epoch": 2.885509004385556, + "grad_norm": NaN, + "learning_rate": 0.0001668091669127441, + "loss": 0.0, + "step": 30924 + }, + { + "epoch": 2.885602314080433, + "grad_norm": NaN, + "learning_rate": 0.00016680165074882113, + "loss": 0.0, + "step": 30925 + }, + { + "epoch": 2.88569562377531, + "grad_norm": NaN, + "learning_rate": 0.00016679413454217658, + "loss": 0.0, + "step": 30926 + }, + { + "epoch": 2.8857889334701876, + "grad_norm": NaN, + "learning_rate": 0.00016678661829282954, + "loss": 0.0, + "step": 30927 + }, + { + "epoch": 2.8858822431650646, + "grad_norm": NaN, + "learning_rate": 0.0001667791020007991, + "loss": 0.0, + "step": 30928 + }, + { + "epoch": 2.885975552859942, + "grad_norm": NaN, + "learning_rate": 0.00016677158566610436, + "loss": 0.0, + "step": 30929 + }, + { + "epoch": 2.8860688625548194, + "grad_norm": NaN, + "learning_rate": 0.00016676406928876447, + "loss": 0.0, + "step": 30930 + }, + { + "epoch": 2.886162172249697, + "grad_norm": NaN, + "learning_rate": 0.00016675655286879849, + "loss": 0.0, + "step": 30931 + }, + { + "epoch": 2.8862554819445743, + "grad_norm": NaN, + "learning_rate": 0.00016674903640622553, + "loss": 0.0, + "step": 30932 + }, + { + "epoch": 2.8863487916394512, + "grad_norm": NaN, + "learning_rate": 0.00016674151990106478, + "loss": 0.0, + "step": 30933 + }, + { + "epoch": 2.8864421013343287, + "grad_norm": NaN, + "learning_rate": 0.0001667340033533353, + "loss": 0.0, + "step": 30934 + }, + { + "epoch": 2.886535411029206, + "grad_norm": NaN, + "learning_rate": 0.00016672648676305618, + "loss": 0.0, + "step": 30935 + }, + { + "epoch": 2.886628720724083, + "grad_norm": NaN, + "learning_rate": 0.0001667189701302466, + "loss": 0.0, + "step": 30936 + }, + { + "epoch": 2.8867220304189605, + "grad_norm": NaN, + "learning_rate": 0.0001667114534549256, + "loss": 0.0, + "step": 30937 + }, + { + "epoch": 2.886815340113838, + "grad_norm": NaN, + "learning_rate": 0.00016670393673711234, + "loss": 0.0, + "step": 30938 + }, + { + "epoch": 2.8869086498087153, + "grad_norm": NaN, + "learning_rate": 0.0001666964199768259, + "loss": 0.0, + "step": 30939 + }, + { + "epoch": 2.8870019595035923, + "grad_norm": NaN, + "learning_rate": 0.00016668890317408542, + "loss": 0.0, + "step": 30940 + }, + { + "epoch": 2.8870952691984697, + "grad_norm": NaN, + "learning_rate": 0.00016668138632890998, + "loss": 0.0, + "step": 30941 + }, + { + "epoch": 2.887188578893347, + "grad_norm": NaN, + "learning_rate": 0.00016667386944131874, + "loss": 0.0, + "step": 30942 + }, + { + "epoch": 2.887281888588224, + "grad_norm": NaN, + "learning_rate": 0.0001666663525113308, + "loss": 0.0, + "step": 30943 + }, + { + "epoch": 2.8873751982831015, + "grad_norm": NaN, + "learning_rate": 0.00016665883553896525, + "loss": 0.0, + "step": 30944 + }, + { + "epoch": 2.887468507977979, + "grad_norm": NaN, + "learning_rate": 0.00016665131852424122, + "loss": 0.0, + "step": 30945 + }, + { + "epoch": 2.8875618176728564, + "grad_norm": NaN, + "learning_rate": 0.0001666438014671778, + "loss": 0.0, + "step": 30946 + }, + { + "epoch": 2.8876551273677338, + "grad_norm": NaN, + "learning_rate": 0.00016663628436779413, + "loss": 0.0, + "step": 30947 + }, + { + "epoch": 2.8877484370626108, + "grad_norm": NaN, + "learning_rate": 0.00016662876722610935, + "loss": 0.0, + "step": 30948 + }, + { + "epoch": 2.887841746757488, + "grad_norm": NaN, + "learning_rate": 0.00016662125004214252, + "loss": 0.0, + "step": 30949 + }, + { + "epoch": 2.887935056452365, + "grad_norm": NaN, + "learning_rate": 0.00016661373281591277, + "loss": 0.0, + "step": 30950 + }, + { + "epoch": 2.8880283661472426, + "grad_norm": NaN, + "learning_rate": 0.0001666062155474392, + "loss": 0.0, + "step": 30951 + }, + { + "epoch": 2.88812167584212, + "grad_norm": NaN, + "learning_rate": 0.00016659869823674097, + "loss": 0.0, + "step": 30952 + }, + { + "epoch": 2.8882149855369974, + "grad_norm": NaN, + "learning_rate": 0.00016659118088383718, + "loss": 0.0, + "step": 30953 + }, + { + "epoch": 2.888308295231875, + "grad_norm": NaN, + "learning_rate": 0.0001665836634887469, + "loss": 0.0, + "step": 30954 + }, + { + "epoch": 2.888401604926752, + "grad_norm": NaN, + "learning_rate": 0.00016657614605148927, + "loss": 0.0, + "step": 30955 + }, + { + "epoch": 2.888494914621629, + "grad_norm": NaN, + "learning_rate": 0.00016656862857208343, + "loss": 0.0, + "step": 30956 + }, + { + "epoch": 2.8885882243165066, + "grad_norm": NaN, + "learning_rate": 0.0001665611110505485, + "loss": 0.0, + "step": 30957 + }, + { + "epoch": 2.8886815340113836, + "grad_norm": NaN, + "learning_rate": 0.00016655359348690353, + "loss": 0.0, + "step": 30958 + }, + { + "epoch": 2.888774843706261, + "grad_norm": NaN, + "learning_rate": 0.0001665460758811677, + "loss": 0.0, + "step": 30959 + }, + { + "epoch": 2.8888681534011384, + "grad_norm": NaN, + "learning_rate": 0.00016653855823336007, + "loss": 0.0, + "step": 30960 + }, + { + "epoch": 2.888961463096016, + "grad_norm": NaN, + "learning_rate": 0.0001665310405434998, + "loss": 0.0, + "step": 30961 + }, + { + "epoch": 2.889054772790893, + "grad_norm": NaN, + "learning_rate": 0.00016652352281160598, + "loss": 0.0, + "step": 30962 + }, + { + "epoch": 2.8891480824857703, + "grad_norm": NaN, + "learning_rate": 0.00016651600503769776, + "loss": 0.0, + "step": 30963 + }, + { + "epoch": 2.8892413921806477, + "grad_norm": NaN, + "learning_rate": 0.00016650848722179424, + "loss": 0.0, + "step": 30964 + }, + { + "epoch": 2.8893347018755247, + "grad_norm": NaN, + "learning_rate": 0.0001665009693639145, + "loss": 0.0, + "step": 30965 + }, + { + "epoch": 2.889428011570402, + "grad_norm": NaN, + "learning_rate": 0.00016649345146407772, + "loss": 0.0, + "step": 30966 + }, + { + "epoch": 2.8895213212652795, + "grad_norm": NaN, + "learning_rate": 0.00016648593352230293, + "loss": 0.0, + "step": 30967 + }, + { + "epoch": 2.889614630960157, + "grad_norm": NaN, + "learning_rate": 0.00016647841553860931, + "loss": 0.0, + "step": 30968 + }, + { + "epoch": 2.8897079406550343, + "grad_norm": NaN, + "learning_rate": 0.00016647089751301597, + "loss": 0.0, + "step": 30969 + }, + { + "epoch": 2.8898012503499113, + "grad_norm": NaN, + "learning_rate": 0.000166463379445542, + "loss": 0.0, + "step": 30970 + }, + { + "epoch": 2.8898945600447887, + "grad_norm": NaN, + "learning_rate": 0.00016645586133620657, + "loss": 0.0, + "step": 30971 + }, + { + "epoch": 2.8899878697396657, + "grad_norm": NaN, + "learning_rate": 0.0001664483431850287, + "loss": 0.0, + "step": 30972 + }, + { + "epoch": 2.890081179434543, + "grad_norm": NaN, + "learning_rate": 0.00016644082499202764, + "loss": 0.0, + "step": 30973 + }, + { + "epoch": 2.8901744891294205, + "grad_norm": NaN, + "learning_rate": 0.0001664333067572224, + "loss": 0.0, + "step": 30974 + }, + { + "epoch": 2.890267798824298, + "grad_norm": NaN, + "learning_rate": 0.0001664257884806321, + "loss": 0.0, + "step": 30975 + }, + { + "epoch": 2.8903611085191754, + "grad_norm": NaN, + "learning_rate": 0.00016641827016227592, + "loss": 0.0, + "step": 30976 + }, + { + "epoch": 2.8904544182140524, + "grad_norm": NaN, + "learning_rate": 0.00016641075180217293, + "loss": 0.0, + "step": 30977 + }, + { + "epoch": 2.8905477279089298, + "grad_norm": NaN, + "learning_rate": 0.00016640323340034224, + "loss": 0.0, + "step": 30978 + }, + { + "epoch": 2.890641037603807, + "grad_norm": NaN, + "learning_rate": 0.00016639571495680305, + "loss": 0.0, + "step": 30979 + }, + { + "epoch": 2.890734347298684, + "grad_norm": NaN, + "learning_rate": 0.00016638819647157434, + "loss": 0.0, + "step": 30980 + }, + { + "epoch": 2.8908276569935616, + "grad_norm": NaN, + "learning_rate": 0.00016638067794467535, + "loss": 0.0, + "step": 30981 + }, + { + "epoch": 2.890920966688439, + "grad_norm": NaN, + "learning_rate": 0.0001663731593761251, + "loss": 0.0, + "step": 30982 + }, + { + "epoch": 2.8910142763833164, + "grad_norm": NaN, + "learning_rate": 0.0001663656407659428, + "loss": 0.0, + "step": 30983 + }, + { + "epoch": 2.8911075860781934, + "grad_norm": NaN, + "learning_rate": 0.00016635812211414748, + "loss": 0.0, + "step": 30984 + }, + { + "epoch": 2.891200895773071, + "grad_norm": NaN, + "learning_rate": 0.00016635060342075835, + "loss": 0.0, + "step": 30985 + }, + { + "epoch": 2.8912942054679482, + "grad_norm": NaN, + "learning_rate": 0.00016634308468579446, + "loss": 0.0, + "step": 30986 + }, + { + "epoch": 2.891387515162825, + "grad_norm": NaN, + "learning_rate": 0.00016633556590927497, + "loss": 0.0, + "step": 30987 + }, + { + "epoch": 2.8914808248577026, + "grad_norm": NaN, + "learning_rate": 0.00016632804709121893, + "loss": 0.0, + "step": 30988 + }, + { + "epoch": 2.89157413455258, + "grad_norm": NaN, + "learning_rate": 0.0001663205282316455, + "loss": 0.0, + "step": 30989 + }, + { + "epoch": 2.8916674442474575, + "grad_norm": NaN, + "learning_rate": 0.00016631300933057385, + "loss": 0.0, + "step": 30990 + }, + { + "epoch": 2.8917607539423344, + "grad_norm": NaN, + "learning_rate": 0.00016630549038802302, + "loss": 0.0, + "step": 30991 + }, + { + "epoch": 2.891854063637212, + "grad_norm": NaN, + "learning_rate": 0.00016629797140401217, + "loss": 0.0, + "step": 30992 + }, + { + "epoch": 2.8919473733320893, + "grad_norm": NaN, + "learning_rate": 0.0001662904523785604, + "loss": 0.0, + "step": 30993 + }, + { + "epoch": 2.8920406830269663, + "grad_norm": NaN, + "learning_rate": 0.0001662829333116868, + "loss": 0.0, + "step": 30994 + }, + { + "epoch": 2.8921339927218437, + "grad_norm": NaN, + "learning_rate": 0.0001662754142034106, + "loss": 0.0, + "step": 30995 + }, + { + "epoch": 2.892227302416721, + "grad_norm": NaN, + "learning_rate": 0.0001662678950537508, + "loss": 0.0, + "step": 30996 + }, + { + "epoch": 2.8923206121115985, + "grad_norm": NaN, + "learning_rate": 0.00016626037586272655, + "loss": 0.0, + "step": 30997 + }, + { + "epoch": 2.892413921806476, + "grad_norm": NaN, + "learning_rate": 0.000166252856630357, + "loss": 0.0, + "step": 30998 + }, + { + "epoch": 2.892507231501353, + "grad_norm": NaN, + "learning_rate": 0.00016624533735666125, + "loss": 0.0, + "step": 30999 + }, + { + "epoch": 2.8926005411962303, + "grad_norm": NaN, + "learning_rate": 0.00016623781804165838, + "loss": 0.0, + "step": 31000 + }, + { + "epoch": 2.8926938508911073, + "grad_norm": NaN, + "learning_rate": 0.0001662302986853676, + "loss": 0.0, + "step": 31001 + }, + { + "epoch": 2.8927871605859847, + "grad_norm": NaN, + "learning_rate": 0.00016622277928780797, + "loss": 0.0, + "step": 31002 + }, + { + "epoch": 2.892880470280862, + "grad_norm": NaN, + "learning_rate": 0.0001662152598489986, + "loss": 0.0, + "step": 31003 + }, + { + "epoch": 2.8929737799757396, + "grad_norm": NaN, + "learning_rate": 0.0001662077403689586, + "loss": 0.0, + "step": 31004 + }, + { + "epoch": 2.893067089670617, + "grad_norm": NaN, + "learning_rate": 0.00016620022084770717, + "loss": 0.0, + "step": 31005 + }, + { + "epoch": 2.893160399365494, + "grad_norm": NaN, + "learning_rate": 0.00016619270128526338, + "loss": 0.0, + "step": 31006 + }, + { + "epoch": 2.8932537090603714, + "grad_norm": NaN, + "learning_rate": 0.0001661851816816463, + "loss": 0.0, + "step": 31007 + }, + { + "epoch": 2.893347018755249, + "grad_norm": NaN, + "learning_rate": 0.00016617766203687513, + "loss": 0.0, + "step": 31008 + }, + { + "epoch": 2.8934403284501258, + "grad_norm": NaN, + "learning_rate": 0.00016617014235096895, + "loss": 0.0, + "step": 31009 + }, + { + "epoch": 2.893533638145003, + "grad_norm": NaN, + "learning_rate": 0.0001661626226239469, + "loss": 0.0, + "step": 31010 + }, + { + "epoch": 2.8936269478398806, + "grad_norm": NaN, + "learning_rate": 0.00016615510285582807, + "loss": 0.0, + "step": 31011 + }, + { + "epoch": 2.893720257534758, + "grad_norm": NaN, + "learning_rate": 0.0001661475830466316, + "loss": 0.0, + "step": 31012 + }, + { + "epoch": 2.893813567229635, + "grad_norm": NaN, + "learning_rate": 0.00016614006319637663, + "loss": 0.0, + "step": 31013 + }, + { + "epoch": 2.8939068769245124, + "grad_norm": NaN, + "learning_rate": 0.00016613254330508225, + "loss": 0.0, + "step": 31014 + }, + { + "epoch": 2.89400018661939, + "grad_norm": NaN, + "learning_rate": 0.00016612502337276757, + "loss": 0.0, + "step": 31015 + }, + { + "epoch": 2.894093496314267, + "grad_norm": NaN, + "learning_rate": 0.00016611750339945176, + "loss": 0.0, + "step": 31016 + }, + { + "epoch": 2.8941868060091442, + "grad_norm": NaN, + "learning_rate": 0.0001661099833851539, + "loss": 0.0, + "step": 31017 + }, + { + "epoch": 2.8942801157040217, + "grad_norm": NaN, + "learning_rate": 0.00016610246332989315, + "loss": 0.0, + "step": 31018 + }, + { + "epoch": 2.894373425398899, + "grad_norm": NaN, + "learning_rate": 0.00016609494323368857, + "loss": 0.0, + "step": 31019 + }, + { + "epoch": 2.8944667350937765, + "grad_norm": NaN, + "learning_rate": 0.00016608742309655932, + "loss": 0.0, + "step": 31020 + }, + { + "epoch": 2.8945600447886535, + "grad_norm": NaN, + "learning_rate": 0.00016607990291852453, + "loss": 0.0, + "step": 31021 + }, + { + "epoch": 2.894653354483531, + "grad_norm": NaN, + "learning_rate": 0.0001660723826996033, + "loss": 0.0, + "step": 31022 + }, + { + "epoch": 2.894746664178408, + "grad_norm": NaN, + "learning_rate": 0.00016606486243981477, + "loss": 0.0, + "step": 31023 + }, + { + "epoch": 2.8948399738732853, + "grad_norm": NaN, + "learning_rate": 0.00016605734213917805, + "loss": 0.0, + "step": 31024 + }, + { + "epoch": 2.8949332835681627, + "grad_norm": NaN, + "learning_rate": 0.0001660498217977123, + "loss": 0.0, + "step": 31025 + }, + { + "epoch": 2.89502659326304, + "grad_norm": NaN, + "learning_rate": 0.00016604230141543656, + "loss": 0.0, + "step": 31026 + }, + { + "epoch": 2.8951199029579175, + "grad_norm": NaN, + "learning_rate": 0.00016603478099237002, + "loss": 0.0, + "step": 31027 + }, + { + "epoch": 2.8952132126527945, + "grad_norm": NaN, + "learning_rate": 0.0001660272605285318, + "loss": 0.0, + "step": 31028 + }, + { + "epoch": 2.895306522347672, + "grad_norm": NaN, + "learning_rate": 0.00016601974002394097, + "loss": 0.0, + "step": 31029 + }, + { + "epoch": 2.8953998320425494, + "grad_norm": NaN, + "learning_rate": 0.00016601221947861667, + "loss": 0.0, + "step": 31030 + }, + { + "epoch": 2.8954931417374263, + "grad_norm": NaN, + "learning_rate": 0.0001660046988925781, + "loss": 0.0, + "step": 31031 + }, + { + "epoch": 2.8955864514323038, + "grad_norm": NaN, + "learning_rate": 0.00016599717826584428, + "loss": 0.0, + "step": 31032 + }, + { + "epoch": 2.895679761127181, + "grad_norm": NaN, + "learning_rate": 0.00016598965759843438, + "loss": 0.0, + "step": 31033 + }, + { + "epoch": 2.8957730708220586, + "grad_norm": NaN, + "learning_rate": 0.00016598213689036751, + "loss": 0.0, + "step": 31034 + }, + { + "epoch": 2.8958663805169356, + "grad_norm": NaN, + "learning_rate": 0.0001659746161416628, + "loss": 0.0, + "step": 31035 + }, + { + "epoch": 2.895959690211813, + "grad_norm": NaN, + "learning_rate": 0.0001659670953523394, + "loss": 0.0, + "step": 31036 + }, + { + "epoch": 2.8960529999066904, + "grad_norm": NaN, + "learning_rate": 0.0001659595745224164, + "loss": 0.0, + "step": 31037 + }, + { + "epoch": 2.8961463096015674, + "grad_norm": NaN, + "learning_rate": 0.0001659520536519129, + "loss": 0.0, + "step": 31038 + }, + { + "epoch": 2.896239619296445, + "grad_norm": NaN, + "learning_rate": 0.00016594453274084808, + "loss": 0.0, + "step": 31039 + }, + { + "epoch": 2.896332928991322, + "grad_norm": NaN, + "learning_rate": 0.00016593701178924106, + "loss": 0.0, + "step": 31040 + }, + { + "epoch": 2.8964262386861996, + "grad_norm": NaN, + "learning_rate": 0.0001659294907971109, + "loss": 0.0, + "step": 31041 + }, + { + "epoch": 2.896519548381077, + "grad_norm": NaN, + "learning_rate": 0.00016592196976447676, + "loss": 0.0, + "step": 31042 + }, + { + "epoch": 2.896612858075954, + "grad_norm": NaN, + "learning_rate": 0.0001659144486913578, + "loss": 0.0, + "step": 31043 + }, + { + "epoch": 2.8967061677708315, + "grad_norm": NaN, + "learning_rate": 0.00016590692757777306, + "loss": 0.0, + "step": 31044 + }, + { + "epoch": 2.8967994774657084, + "grad_norm": NaN, + "learning_rate": 0.00016589940642374175, + "loss": 0.0, + "step": 31045 + }, + { + "epoch": 2.896892787160586, + "grad_norm": NaN, + "learning_rate": 0.00016589188522928298, + "loss": 0.0, + "step": 31046 + }, + { + "epoch": 2.8969860968554633, + "grad_norm": NaN, + "learning_rate": 0.0001658843639944158, + "loss": 0.0, + "step": 31047 + }, + { + "epoch": 2.8970794065503407, + "grad_norm": NaN, + "learning_rate": 0.00016587684271915942, + "loss": 0.0, + "step": 31048 + }, + { + "epoch": 2.897172716245218, + "grad_norm": NaN, + "learning_rate": 0.00016586932140353296, + "loss": 0.0, + "step": 31049 + }, + { + "epoch": 2.897266025940095, + "grad_norm": NaN, + "learning_rate": 0.00016586180004755547, + "loss": 0.0, + "step": 31050 + }, + { + "epoch": 2.8973593356349725, + "grad_norm": NaN, + "learning_rate": 0.00016585427865124615, + "loss": 0.0, + "step": 31051 + }, + { + "epoch": 2.89745264532985, + "grad_norm": NaN, + "learning_rate": 0.00016584675721462405, + "loss": 0.0, + "step": 31052 + }, + { + "epoch": 2.897545955024727, + "grad_norm": NaN, + "learning_rate": 0.0001658392357377084, + "loss": 0.0, + "step": 31053 + }, + { + "epoch": 2.8976392647196043, + "grad_norm": NaN, + "learning_rate": 0.00016583171422051825, + "loss": 0.0, + "step": 31054 + }, + { + "epoch": 2.8977325744144817, + "grad_norm": NaN, + "learning_rate": 0.0001658241926630727, + "loss": 0.0, + "step": 31055 + }, + { + "epoch": 2.897825884109359, + "grad_norm": NaN, + "learning_rate": 0.00016581667106539097, + "loss": 0.0, + "step": 31056 + }, + { + "epoch": 2.897919193804236, + "grad_norm": NaN, + "learning_rate": 0.00016580914942749208, + "loss": 0.0, + "step": 31057 + }, + { + "epoch": 2.8980125034991135, + "grad_norm": NaN, + "learning_rate": 0.00016580162774939525, + "loss": 0.0, + "step": 31058 + }, + { + "epoch": 2.898105813193991, + "grad_norm": NaN, + "learning_rate": 0.00016579410603111952, + "loss": 0.0, + "step": 31059 + }, + { + "epoch": 2.898199122888868, + "grad_norm": NaN, + "learning_rate": 0.0001657865842726841, + "loss": 0.0, + "step": 31060 + }, + { + "epoch": 2.8982924325837454, + "grad_norm": NaN, + "learning_rate": 0.00016577906247410807, + "loss": 0.0, + "step": 31061 + }, + { + "epoch": 2.898385742278623, + "grad_norm": NaN, + "learning_rate": 0.00016577154063541054, + "loss": 0.0, + "step": 31062 + }, + { + "epoch": 2.8984790519735, + "grad_norm": NaN, + "learning_rate": 0.00016576401875661065, + "loss": 0.0, + "step": 31063 + }, + { + "epoch": 2.8985723616683776, + "grad_norm": NaN, + "learning_rate": 0.0001657564968377275, + "loss": 0.0, + "step": 31064 + }, + { + "epoch": 2.8986656713632546, + "grad_norm": NaN, + "learning_rate": 0.0001657489748787803, + "loss": 0.0, + "step": 31065 + }, + { + "epoch": 2.898758981058132, + "grad_norm": NaN, + "learning_rate": 0.0001657414528797881, + "loss": 0.0, + "step": 31066 + }, + { + "epoch": 2.898852290753009, + "grad_norm": NaN, + "learning_rate": 0.00016573393084077007, + "loss": 0.0, + "step": 31067 + }, + { + "epoch": 2.8989456004478864, + "grad_norm": NaN, + "learning_rate": 0.00016572640876174528, + "loss": 0.0, + "step": 31068 + }, + { + "epoch": 2.899038910142764, + "grad_norm": NaN, + "learning_rate": 0.0001657188866427329, + "loss": 0.0, + "step": 31069 + }, + { + "epoch": 2.8991322198376412, + "grad_norm": NaN, + "learning_rate": 0.00016571136448375204, + "loss": 0.0, + "step": 31070 + }, + { + "epoch": 2.8992255295325187, + "grad_norm": NaN, + "learning_rate": 0.00016570384228482184, + "loss": 0.0, + "step": 31071 + }, + { + "epoch": 2.8993188392273956, + "grad_norm": NaN, + "learning_rate": 0.00016569632004596147, + "loss": 0.0, + "step": 31072 + }, + { + "epoch": 2.899412148922273, + "grad_norm": NaN, + "learning_rate": 0.00016568879776718994, + "loss": 0.0, + "step": 31073 + }, + { + "epoch": 2.8995054586171505, + "grad_norm": NaN, + "learning_rate": 0.00016568127544852644, + "loss": 0.0, + "step": 31074 + }, + { + "epoch": 2.8995987683120275, + "grad_norm": NaN, + "learning_rate": 0.0001656737530899902, + "loss": 0.0, + "step": 31075 + }, + { + "epoch": 2.899692078006905, + "grad_norm": NaN, + "learning_rate": 0.00016566623069160015, + "loss": 0.0, + "step": 31076 + }, + { + "epoch": 2.8997853877017823, + "grad_norm": NaN, + "learning_rate": 0.00016565870825337555, + "loss": 0.0, + "step": 31077 + }, + { + "epoch": 2.8998786973966597, + "grad_norm": NaN, + "learning_rate": 0.0001656511857753355, + "loss": 0.0, + "step": 31078 + }, + { + "epoch": 2.8999720070915367, + "grad_norm": NaN, + "learning_rate": 0.0001656436632574991, + "loss": 0.0, + "step": 31079 + }, + { + "epoch": 2.900065316786414, + "grad_norm": NaN, + "learning_rate": 0.0001656361406998855, + "loss": 0.0, + "step": 31080 + }, + { + "epoch": 2.9001586264812915, + "grad_norm": NaN, + "learning_rate": 0.00016562861810251385, + "loss": 0.0, + "step": 31081 + }, + { + "epoch": 2.9002519361761685, + "grad_norm": NaN, + "learning_rate": 0.00016562109546540326, + "loss": 0.0, + "step": 31082 + }, + { + "epoch": 2.900345245871046, + "grad_norm": NaN, + "learning_rate": 0.0001656135727885728, + "loss": 0.0, + "step": 31083 + }, + { + "epoch": 2.9004385555659233, + "grad_norm": NaN, + "learning_rate": 0.00016560605007204167, + "loss": 0.0, + "step": 31084 + }, + { + "epoch": 2.9005318652608008, + "grad_norm": NaN, + "learning_rate": 0.00016559852731582903, + "loss": 0.0, + "step": 31085 + }, + { + "epoch": 2.9006251749556777, + "grad_norm": NaN, + "learning_rate": 0.0001655910045199539, + "loss": 0.0, + "step": 31086 + }, + { + "epoch": 2.900718484650555, + "grad_norm": NaN, + "learning_rate": 0.00016558348168443547, + "loss": 0.0, + "step": 31087 + }, + { + "epoch": 2.9008117943454326, + "grad_norm": NaN, + "learning_rate": 0.00016557595880929287, + "loss": 0.0, + "step": 31088 + }, + { + "epoch": 2.9009051040403095, + "grad_norm": NaN, + "learning_rate": 0.0001655684358945452, + "loss": 0.0, + "step": 31089 + }, + { + "epoch": 2.900998413735187, + "grad_norm": NaN, + "learning_rate": 0.00016556091294021164, + "loss": 0.0, + "step": 31090 + }, + { + "epoch": 2.9010917234300644, + "grad_norm": NaN, + "learning_rate": 0.0001655533899463113, + "loss": 0.0, + "step": 31091 + }, + { + "epoch": 2.901185033124942, + "grad_norm": NaN, + "learning_rate": 0.00016554586691286326, + "loss": 0.0, + "step": 31092 + }, + { + "epoch": 2.9012783428198192, + "grad_norm": NaN, + "learning_rate": 0.0001655383438398867, + "loss": 0.0, + "step": 31093 + }, + { + "epoch": 2.901371652514696, + "grad_norm": NaN, + "learning_rate": 0.00016553082072740075, + "loss": 0.0, + "step": 31094 + }, + { + "epoch": 2.9014649622095736, + "grad_norm": NaN, + "learning_rate": 0.0001655232975754245, + "loss": 0.0, + "step": 31095 + }, + { + "epoch": 2.901558271904451, + "grad_norm": NaN, + "learning_rate": 0.00016551577438397708, + "loss": 0.0, + "step": 31096 + }, + { + "epoch": 2.901651581599328, + "grad_norm": NaN, + "learning_rate": 0.0001655082511530777, + "loss": 0.0, + "step": 31097 + }, + { + "epoch": 2.9017448912942054, + "grad_norm": NaN, + "learning_rate": 0.0001655007278827454, + "loss": 0.0, + "step": 31098 + }, + { + "epoch": 2.901838200989083, + "grad_norm": NaN, + "learning_rate": 0.00016549320457299936, + "loss": 0.0, + "step": 31099 + }, + { + "epoch": 2.9019315106839603, + "grad_norm": NaN, + "learning_rate": 0.00016548568122385867, + "loss": 0.0, + "step": 31100 + }, + { + "epoch": 2.9020248203788372, + "grad_norm": NaN, + "learning_rate": 0.0001654781578353425, + "loss": 0.0, + "step": 31101 + }, + { + "epoch": 2.9021181300737147, + "grad_norm": NaN, + "learning_rate": 0.00016547063440746994, + "loss": 0.0, + "step": 31102 + }, + { + "epoch": 2.902211439768592, + "grad_norm": NaN, + "learning_rate": 0.00016546311094026015, + "loss": 0.0, + "step": 31103 + }, + { + "epoch": 2.902304749463469, + "grad_norm": NaN, + "learning_rate": 0.00016545558743373225, + "loss": 0.0, + "step": 31104 + }, + { + "epoch": 2.9023980591583465, + "grad_norm": NaN, + "learning_rate": 0.00016544806388790533, + "loss": 0.0, + "step": 31105 + }, + { + "epoch": 2.902491368853224, + "grad_norm": NaN, + "learning_rate": 0.00016544054030279858, + "loss": 0.0, + "step": 31106 + }, + { + "epoch": 2.9025846785481013, + "grad_norm": NaN, + "learning_rate": 0.00016543301667843114, + "loss": 0.0, + "step": 31107 + }, + { + "epoch": 2.9026779882429783, + "grad_norm": NaN, + "learning_rate": 0.00016542549301482208, + "loss": 0.0, + "step": 31108 + }, + { + "epoch": 2.9027712979378557, + "grad_norm": NaN, + "learning_rate": 0.00016541796931199054, + "loss": 0.0, + "step": 31109 + }, + { + "epoch": 2.902864607632733, + "grad_norm": NaN, + "learning_rate": 0.00016541044556995575, + "loss": 0.0, + "step": 31110 + }, + { + "epoch": 2.90295791732761, + "grad_norm": NaN, + "learning_rate": 0.00016540292178873672, + "loss": 0.0, + "step": 31111 + }, + { + "epoch": 2.9030512270224875, + "grad_norm": NaN, + "learning_rate": 0.0001653953979683526, + "loss": 0.0, + "step": 31112 + }, + { + "epoch": 2.903144536717365, + "grad_norm": NaN, + "learning_rate": 0.0001653878741088226, + "loss": 0.0, + "step": 31113 + }, + { + "epoch": 2.9032378464122424, + "grad_norm": NaN, + "learning_rate": 0.00016538035021016574, + "loss": 0.0, + "step": 31114 + }, + { + "epoch": 2.90333115610712, + "grad_norm": NaN, + "learning_rate": 0.00016537282627240117, + "loss": 0.0, + "step": 31115 + }, + { + "epoch": 2.9034244658019968, + "grad_norm": NaN, + "learning_rate": 0.00016536530229554816, + "loss": 0.0, + "step": 31116 + }, + { + "epoch": 2.903517775496874, + "grad_norm": NaN, + "learning_rate": 0.00016535777827962568, + "loss": 0.0, + "step": 31117 + }, + { + "epoch": 2.903611085191751, + "grad_norm": NaN, + "learning_rate": 0.0001653502542246529, + "loss": 0.0, + "step": 31118 + }, + { + "epoch": 2.9037043948866286, + "grad_norm": NaN, + "learning_rate": 0.00016534273013064902, + "loss": 0.0, + "step": 31119 + }, + { + "epoch": 2.903797704581506, + "grad_norm": NaN, + "learning_rate": 0.00016533520599763308, + "loss": 0.0, + "step": 31120 + }, + { + "epoch": 2.9038910142763834, + "grad_norm": NaN, + "learning_rate": 0.0001653276818256243, + "loss": 0.0, + "step": 31121 + }, + { + "epoch": 2.903984323971261, + "grad_norm": NaN, + "learning_rate": 0.00016532015761464174, + "loss": 0.0, + "step": 31122 + }, + { + "epoch": 2.904077633666138, + "grad_norm": NaN, + "learning_rate": 0.00016531263336470452, + "loss": 0.0, + "step": 31123 + }, + { + "epoch": 2.9041709433610152, + "grad_norm": NaN, + "learning_rate": 0.00016530510907583187, + "loss": 0.0, + "step": 31124 + }, + { + "epoch": 2.9042642530558926, + "grad_norm": NaN, + "learning_rate": 0.0001652975847480428, + "loss": 0.0, + "step": 31125 + }, + { + "epoch": 2.9043575627507696, + "grad_norm": NaN, + "learning_rate": 0.00016529006038135656, + "loss": 0.0, + "step": 31126 + }, + { + "epoch": 2.904450872445647, + "grad_norm": NaN, + "learning_rate": 0.0001652825359757922, + "loss": 0.0, + "step": 31127 + }, + { + "epoch": 2.9045441821405245, + "grad_norm": NaN, + "learning_rate": 0.00016527501153136887, + "loss": 0.0, + "step": 31128 + }, + { + "epoch": 2.904637491835402, + "grad_norm": NaN, + "learning_rate": 0.00016526748704810573, + "loss": 0.0, + "step": 31129 + }, + { + "epoch": 2.904730801530279, + "grad_norm": NaN, + "learning_rate": 0.0001652599625260219, + "loss": 0.0, + "step": 31130 + }, + { + "epoch": 2.9048241112251563, + "grad_norm": NaN, + "learning_rate": 0.00016525243796513644, + "loss": 0.0, + "step": 31131 + }, + { + "epoch": 2.9049174209200337, + "grad_norm": NaN, + "learning_rate": 0.00016524491336546864, + "loss": 0.0, + "step": 31132 + }, + { + "epoch": 2.9050107306149107, + "grad_norm": NaN, + "learning_rate": 0.0001652373887270375, + "loss": 0.0, + "step": 31133 + }, + { + "epoch": 2.905104040309788, + "grad_norm": NaN, + "learning_rate": 0.00016522986404986215, + "loss": 0.0, + "step": 31134 + }, + { + "epoch": 2.9051973500046655, + "grad_norm": NaN, + "learning_rate": 0.00016522233933396186, + "loss": 0.0, + "step": 31135 + }, + { + "epoch": 2.905290659699543, + "grad_norm": NaN, + "learning_rate": 0.00016521481457935564, + "loss": 0.0, + "step": 31136 + }, + { + "epoch": 2.9053839693944203, + "grad_norm": NaN, + "learning_rate": 0.0001652072897860626, + "loss": 0.0, + "step": 31137 + }, + { + "epoch": 2.9054772790892973, + "grad_norm": NaN, + "learning_rate": 0.000165199764954102, + "loss": 0.0, + "step": 31138 + }, + { + "epoch": 2.9055705887841747, + "grad_norm": NaN, + "learning_rate": 0.00016519224008349284, + "loss": 0.0, + "step": 31139 + }, + { + "epoch": 2.9056638984790517, + "grad_norm": NaN, + "learning_rate": 0.0001651847151742543, + "loss": 0.0, + "step": 31140 + }, + { + "epoch": 2.905757208173929, + "grad_norm": NaN, + "learning_rate": 0.0001651771902264056, + "loss": 0.0, + "step": 31141 + }, + { + "epoch": 2.9058505178688065, + "grad_norm": NaN, + "learning_rate": 0.0001651696652399658, + "loss": 0.0, + "step": 31142 + }, + { + "epoch": 2.905943827563684, + "grad_norm": NaN, + "learning_rate": 0.000165162140214954, + "loss": 0.0, + "step": 31143 + }, + { + "epoch": 2.9060371372585614, + "grad_norm": NaN, + "learning_rate": 0.0001651546151513893, + "loss": 0.0, + "step": 31144 + }, + { + "epoch": 2.9061304469534384, + "grad_norm": NaN, + "learning_rate": 0.00016514709004929103, + "loss": 0.0, + "step": 31145 + }, + { + "epoch": 2.906223756648316, + "grad_norm": NaN, + "learning_rate": 0.00016513956490867812, + "loss": 0.0, + "step": 31146 + }, + { + "epoch": 2.906317066343193, + "grad_norm": NaN, + "learning_rate": 0.00016513203972956978, + "loss": 0.0, + "step": 31147 + }, + { + "epoch": 2.90641037603807, + "grad_norm": NaN, + "learning_rate": 0.0001651245145119852, + "loss": 0.0, + "step": 31148 + }, + { + "epoch": 2.9065036857329476, + "grad_norm": NaN, + "learning_rate": 0.00016511698925594342, + "loss": 0.0, + "step": 31149 + }, + { + "epoch": 2.906596995427825, + "grad_norm": NaN, + "learning_rate": 0.00016510946396146355, + "loss": 0.0, + "step": 31150 + }, + { + "epoch": 2.9066903051227024, + "grad_norm": NaN, + "learning_rate": 0.00016510193862856495, + "loss": 0.0, + "step": 31151 + }, + { + "epoch": 2.9067836148175794, + "grad_norm": NaN, + "learning_rate": 0.00016509441325726645, + "loss": 0.0, + "step": 31152 + }, + { + "epoch": 2.906876924512457, + "grad_norm": NaN, + "learning_rate": 0.00016508688784758739, + "loss": 0.0, + "step": 31153 + }, + { + "epoch": 2.9069702342073342, + "grad_norm": NaN, + "learning_rate": 0.00016507936239954686, + "loss": 0.0, + "step": 31154 + }, + { + "epoch": 2.9070635439022112, + "grad_norm": NaN, + "learning_rate": 0.00016507183691316394, + "loss": 0.0, + "step": 31155 + }, + { + "epoch": 2.9071568535970886, + "grad_norm": NaN, + "learning_rate": 0.00016506431138845777, + "loss": 0.0, + "step": 31156 + }, + { + "epoch": 2.907250163291966, + "grad_norm": NaN, + "learning_rate": 0.00016505678582544761, + "loss": 0.0, + "step": 31157 + }, + { + "epoch": 2.9073434729868435, + "grad_norm": NaN, + "learning_rate": 0.00016504926022415246, + "loss": 0.0, + "step": 31158 + }, + { + "epoch": 2.907436782681721, + "grad_norm": NaN, + "learning_rate": 0.00016504173458459145, + "loss": 0.0, + "step": 31159 + }, + { + "epoch": 2.907530092376598, + "grad_norm": NaN, + "learning_rate": 0.00016503420890678382, + "loss": 0.0, + "step": 31160 + }, + { + "epoch": 2.9076234020714753, + "grad_norm": NaN, + "learning_rate": 0.00016502668319074869, + "loss": 0.0, + "step": 31161 + }, + { + "epoch": 2.9077167117663523, + "grad_norm": NaN, + "learning_rate": 0.00016501915743650505, + "loss": 0.0, + "step": 31162 + }, + { + "epoch": 2.9078100214612297, + "grad_norm": NaN, + "learning_rate": 0.00016501163164407218, + "loss": 0.0, + "step": 31163 + }, + { + "epoch": 2.907903331156107, + "grad_norm": NaN, + "learning_rate": 0.00016500410581346924, + "loss": 0.0, + "step": 31164 + }, + { + "epoch": 2.9079966408509845, + "grad_norm": NaN, + "learning_rate": 0.00016499657994471526, + "loss": 0.0, + "step": 31165 + }, + { + "epoch": 2.908089950545862, + "grad_norm": NaN, + "learning_rate": 0.0001649890540378294, + "loss": 0.0, + "step": 31166 + }, + { + "epoch": 2.908183260240739, + "grad_norm": NaN, + "learning_rate": 0.00016498152809283088, + "loss": 0.0, + "step": 31167 + }, + { + "epoch": 2.9082765699356163, + "grad_norm": NaN, + "learning_rate": 0.0001649740021097387, + "loss": 0.0, + "step": 31168 + }, + { + "epoch": 2.9083698796304938, + "grad_norm": NaN, + "learning_rate": 0.00016496647608857208, + "loss": 0.0, + "step": 31169 + }, + { + "epoch": 2.9084631893253707, + "grad_norm": NaN, + "learning_rate": 0.00016495895002935018, + "loss": 0.0, + "step": 31170 + }, + { + "epoch": 2.908556499020248, + "grad_norm": NaN, + "learning_rate": 0.00016495142393209208, + "loss": 0.0, + "step": 31171 + }, + { + "epoch": 2.9086498087151256, + "grad_norm": NaN, + "learning_rate": 0.00016494389779681692, + "loss": 0.0, + "step": 31172 + }, + { + "epoch": 2.908743118410003, + "grad_norm": NaN, + "learning_rate": 0.0001649363716235439, + "loss": 0.0, + "step": 31173 + }, + { + "epoch": 2.90883642810488, + "grad_norm": NaN, + "learning_rate": 0.00016492884541229207, + "loss": 0.0, + "step": 31174 + }, + { + "epoch": 2.9089297377997574, + "grad_norm": NaN, + "learning_rate": 0.00016492131916308062, + "loss": 0.0, + "step": 31175 + }, + { + "epoch": 2.909023047494635, + "grad_norm": NaN, + "learning_rate": 0.00016491379287592866, + "loss": 0.0, + "step": 31176 + }, + { + "epoch": 2.909116357189512, + "grad_norm": NaN, + "learning_rate": 0.0001649062665508554, + "loss": 0.0, + "step": 31177 + }, + { + "epoch": 2.909209666884389, + "grad_norm": NaN, + "learning_rate": 0.00016489874018787984, + "loss": 0.0, + "step": 31178 + }, + { + "epoch": 2.9093029765792666, + "grad_norm": NaN, + "learning_rate": 0.00016489121378702128, + "loss": 0.0, + "step": 31179 + }, + { + "epoch": 2.909396286274144, + "grad_norm": NaN, + "learning_rate": 0.00016488368734829872, + "loss": 0.0, + "step": 31180 + }, + { + "epoch": 2.9094895959690215, + "grad_norm": NaN, + "learning_rate": 0.00016487616087173133, + "loss": 0.0, + "step": 31181 + }, + { + "epoch": 2.9095829056638984, + "grad_norm": NaN, + "learning_rate": 0.00016486863435733829, + "loss": 0.0, + "step": 31182 + }, + { + "epoch": 2.909676215358776, + "grad_norm": NaN, + "learning_rate": 0.00016486110780513876, + "loss": 0.0, + "step": 31183 + }, + { + "epoch": 2.909769525053653, + "grad_norm": NaN, + "learning_rate": 0.0001648535812151518, + "loss": 0.0, + "step": 31184 + }, + { + "epoch": 2.9098628347485302, + "grad_norm": NaN, + "learning_rate": 0.0001648460545873966, + "loss": 0.0, + "step": 31185 + }, + { + "epoch": 2.9099561444434077, + "grad_norm": NaN, + "learning_rate": 0.00016483852792189226, + "loss": 0.0, + "step": 31186 + }, + { + "epoch": 2.910049454138285, + "grad_norm": NaN, + "learning_rate": 0.00016483100121865793, + "loss": 0.0, + "step": 31187 + }, + { + "epoch": 2.9101427638331625, + "grad_norm": NaN, + "learning_rate": 0.0001648234744777128, + "loss": 0.0, + "step": 31188 + }, + { + "epoch": 2.9102360735280395, + "grad_norm": NaN, + "learning_rate": 0.00016481594769907597, + "loss": 0.0, + "step": 31189 + }, + { + "epoch": 2.910329383222917, + "grad_norm": NaN, + "learning_rate": 0.00016480842088276656, + "loss": 0.0, + "step": 31190 + }, + { + "epoch": 2.9104226929177943, + "grad_norm": NaN, + "learning_rate": 0.00016480089402880365, + "loss": 0.0, + "step": 31191 + }, + { + "epoch": 2.9105160026126713, + "grad_norm": NaN, + "learning_rate": 0.00016479336713720653, + "loss": 0.0, + "step": 31192 + }, + { + "epoch": 2.9106093123075487, + "grad_norm": NaN, + "learning_rate": 0.00016478584020799425, + "loss": 0.0, + "step": 31193 + }, + { + "epoch": 2.910702622002426, + "grad_norm": NaN, + "learning_rate": 0.0001647783132411859, + "loss": 0.0, + "step": 31194 + }, + { + "epoch": 2.9107959316973036, + "grad_norm": NaN, + "learning_rate": 0.00016477078623680075, + "loss": 0.0, + "step": 31195 + }, + { + "epoch": 2.9108892413921805, + "grad_norm": NaN, + "learning_rate": 0.00016476325919485786, + "loss": 0.0, + "step": 31196 + }, + { + "epoch": 2.910982551087058, + "grad_norm": NaN, + "learning_rate": 0.00016475573211537632, + "loss": 0.0, + "step": 31197 + }, + { + "epoch": 2.9110758607819354, + "grad_norm": NaN, + "learning_rate": 0.0001647482049983754, + "loss": 0.0, + "step": 31198 + }, + { + "epoch": 2.9111691704768123, + "grad_norm": NaN, + "learning_rate": 0.00016474067784387413, + "loss": 0.0, + "step": 31199 + }, + { + "epoch": 2.9112624801716898, + "grad_norm": NaN, + "learning_rate": 0.00016473315065189162, + "loss": 0.0, + "step": 31200 + }, + { + "epoch": 2.911355789866567, + "grad_norm": NaN, + "learning_rate": 0.00016472562342244714, + "loss": 0.0, + "step": 31201 + }, + { + "epoch": 2.9114490995614446, + "grad_norm": NaN, + "learning_rate": 0.00016471809615555976, + "loss": 0.0, + "step": 31202 + }, + { + "epoch": 2.9115424092563216, + "grad_norm": NaN, + "learning_rate": 0.0001647105688512486, + "loss": 0.0, + "step": 31203 + }, + { + "epoch": 2.911635718951199, + "grad_norm": NaN, + "learning_rate": 0.00016470304150953283, + "loss": 0.0, + "step": 31204 + }, + { + "epoch": 2.9117290286460764, + "grad_norm": NaN, + "learning_rate": 0.00016469551413043158, + "loss": 0.0, + "step": 31205 + }, + { + "epoch": 2.9118223383409534, + "grad_norm": NaN, + "learning_rate": 0.00016468798671396398, + "loss": 0.0, + "step": 31206 + }, + { + "epoch": 2.911915648035831, + "grad_norm": NaN, + "learning_rate": 0.00016468045926014917, + "loss": 0.0, + "step": 31207 + }, + { + "epoch": 2.9120089577307082, + "grad_norm": NaN, + "learning_rate": 0.00016467293176900636, + "loss": 0.0, + "step": 31208 + }, + { + "epoch": 2.9121022674255856, + "grad_norm": NaN, + "learning_rate": 0.00016466540424055456, + "loss": 0.0, + "step": 31209 + }, + { + "epoch": 2.912195577120463, + "grad_norm": NaN, + "learning_rate": 0.00016465787667481303, + "loss": 0.0, + "step": 31210 + }, + { + "epoch": 2.91228888681534, + "grad_norm": NaN, + "learning_rate": 0.00016465034907180091, + "loss": 0.0, + "step": 31211 + }, + { + "epoch": 2.9123821965102175, + "grad_norm": NaN, + "learning_rate": 0.0001646428214315372, + "loss": 0.0, + "step": 31212 + }, + { + "epoch": 2.9124755062050944, + "grad_norm": NaN, + "learning_rate": 0.00016463529375404114, + "loss": 0.0, + "step": 31213 + }, + { + "epoch": 2.912568815899972, + "grad_norm": NaN, + "learning_rate": 0.0001646277660393319, + "loss": 0.0, + "step": 31214 + }, + { + "epoch": 2.9126621255948493, + "grad_norm": NaN, + "learning_rate": 0.00016462023828742859, + "loss": 0.0, + "step": 31215 + }, + { + "epoch": 2.9127554352897267, + "grad_norm": NaN, + "learning_rate": 0.00016461271049835027, + "loss": 0.0, + "step": 31216 + }, + { + "epoch": 2.912848744984604, + "grad_norm": NaN, + "learning_rate": 0.00016460518267211624, + "loss": 0.0, + "step": 31217 + }, + { + "epoch": 2.912942054679481, + "grad_norm": NaN, + "learning_rate": 0.00016459765480874555, + "loss": 0.0, + "step": 31218 + }, + { + "epoch": 2.9130353643743585, + "grad_norm": NaN, + "learning_rate": 0.00016459012690825728, + "loss": 0.0, + "step": 31219 + }, + { + "epoch": 2.913128674069236, + "grad_norm": NaN, + "learning_rate": 0.00016458259897067067, + "loss": 0.0, + "step": 31220 + }, + { + "epoch": 2.913221983764113, + "grad_norm": NaN, + "learning_rate": 0.0001645750709960049, + "loss": 0.0, + "step": 31221 + }, + { + "epoch": 2.9133152934589903, + "grad_norm": NaN, + "learning_rate": 0.00016456754298427897, + "loss": 0.0, + "step": 31222 + }, + { + "epoch": 2.9134086031538677, + "grad_norm": NaN, + "learning_rate": 0.00016456001493551212, + "loss": 0.0, + "step": 31223 + }, + { + "epoch": 2.913501912848745, + "grad_norm": NaN, + "learning_rate": 0.00016455248684972345, + "loss": 0.0, + "step": 31224 + }, + { + "epoch": 2.913595222543622, + "grad_norm": NaN, + "learning_rate": 0.00016454495872693207, + "loss": 0.0, + "step": 31225 + }, + { + "epoch": 2.9136885322384996, + "grad_norm": NaN, + "learning_rate": 0.00016453743056715726, + "loss": 0.0, + "step": 31226 + }, + { + "epoch": 2.913781841933377, + "grad_norm": NaN, + "learning_rate": 0.00016452990237041806, + "loss": 0.0, + "step": 31227 + }, + { + "epoch": 2.913875151628254, + "grad_norm": NaN, + "learning_rate": 0.00016452237413673357, + "loss": 0.0, + "step": 31228 + }, + { + "epoch": 2.9139684613231314, + "grad_norm": NaN, + "learning_rate": 0.00016451484586612299, + "loss": 0.0, + "step": 31229 + }, + { + "epoch": 2.914061771018009, + "grad_norm": NaN, + "learning_rate": 0.0001645073175586055, + "loss": 0.0, + "step": 31230 + }, + { + "epoch": 2.914155080712886, + "grad_norm": NaN, + "learning_rate": 0.00016449978921420015, + "loss": 0.0, + "step": 31231 + }, + { + "epoch": 2.9142483904077636, + "grad_norm": NaN, + "learning_rate": 0.00016449226083292616, + "loss": 0.0, + "step": 31232 + }, + { + "epoch": 2.9143417001026406, + "grad_norm": NaN, + "learning_rate": 0.00016448473241480267, + "loss": 0.0, + "step": 31233 + }, + { + "epoch": 2.914435009797518, + "grad_norm": NaN, + "learning_rate": 0.00016447720395984874, + "loss": 0.0, + "step": 31234 + }, + { + "epoch": 2.914528319492395, + "grad_norm": NaN, + "learning_rate": 0.00016446967546808358, + "loss": 0.0, + "step": 31235 + }, + { + "epoch": 2.9146216291872724, + "grad_norm": NaN, + "learning_rate": 0.00016446214693952634, + "loss": 0.0, + "step": 31236 + }, + { + "epoch": 2.91471493888215, + "grad_norm": NaN, + "learning_rate": 0.00016445461837419616, + "loss": 0.0, + "step": 31237 + }, + { + "epoch": 2.9148082485770272, + "grad_norm": NaN, + "learning_rate": 0.00016444708977211213, + "loss": 0.0, + "step": 31238 + }, + { + "epoch": 2.9149015582719047, + "grad_norm": NaN, + "learning_rate": 0.00016443956113329342, + "loss": 0.0, + "step": 31239 + }, + { + "epoch": 2.9149948679667816, + "grad_norm": NaN, + "learning_rate": 0.00016443203245775925, + "loss": 0.0, + "step": 31240 + }, + { + "epoch": 2.915088177661659, + "grad_norm": NaN, + "learning_rate": 0.00016442450374552862, + "loss": 0.0, + "step": 31241 + }, + { + "epoch": 2.9151814873565365, + "grad_norm": NaN, + "learning_rate": 0.00016441697499662079, + "loss": 0.0, + "step": 31242 + }, + { + "epoch": 2.9152747970514135, + "grad_norm": NaN, + "learning_rate": 0.00016440944621105489, + "loss": 0.0, + "step": 31243 + }, + { + "epoch": 2.915368106746291, + "grad_norm": NaN, + "learning_rate": 0.00016440191738884994, + "loss": 0.0, + "step": 31244 + }, + { + "epoch": 2.9154614164411683, + "grad_norm": NaN, + "learning_rate": 0.0001643943885300253, + "loss": 0.0, + "step": 31245 + }, + { + "epoch": 2.9155547261360457, + "grad_norm": NaN, + "learning_rate": 0.00016438685963459995, + "loss": 0.0, + "step": 31246 + }, + { + "epoch": 2.9156480358309227, + "grad_norm": NaN, + "learning_rate": 0.00016437933070259303, + "loss": 0.0, + "step": 31247 + }, + { + "epoch": 2.9157413455258, + "grad_norm": NaN, + "learning_rate": 0.00016437180173402378, + "loss": 0.0, + "step": 31248 + }, + { + "epoch": 2.9158346552206775, + "grad_norm": NaN, + "learning_rate": 0.00016436427272891133, + "loss": 0.0, + "step": 31249 + }, + { + "epoch": 2.9159279649155545, + "grad_norm": NaN, + "learning_rate": 0.0001643567436872747, + "loss": 0.0, + "step": 31250 + }, + { + "epoch": 2.916021274610432, + "grad_norm": NaN, + "learning_rate": 0.00016434921460913317, + "loss": 0.0, + "step": 31251 + }, + { + "epoch": 2.9161145843053093, + "grad_norm": NaN, + "learning_rate": 0.00016434168549450586, + "loss": 0.0, + "step": 31252 + }, + { + "epoch": 2.9162078940001868, + "grad_norm": NaN, + "learning_rate": 0.00016433415634341185, + "loss": 0.0, + "step": 31253 + }, + { + "epoch": 2.916301203695064, + "grad_norm": NaN, + "learning_rate": 0.00016432662715587033, + "loss": 0.0, + "step": 31254 + }, + { + "epoch": 2.916394513389941, + "grad_norm": NaN, + "learning_rate": 0.0001643190979319005, + "loss": 0.0, + "step": 31255 + }, + { + "epoch": 2.9164878230848186, + "grad_norm": NaN, + "learning_rate": 0.00016431156867152138, + "loss": 0.0, + "step": 31256 + }, + { + "epoch": 2.9165811327796956, + "grad_norm": NaN, + "learning_rate": 0.00016430403937475215, + "loss": 0.0, + "step": 31257 + }, + { + "epoch": 2.916674442474573, + "grad_norm": NaN, + "learning_rate": 0.00016429651004161205, + "loss": 0.0, + "step": 31258 + }, + { + "epoch": 2.9167677521694504, + "grad_norm": NaN, + "learning_rate": 0.00016428898067212018, + "loss": 0.0, + "step": 31259 + }, + { + "epoch": 2.916861061864328, + "grad_norm": NaN, + "learning_rate": 0.0001642814512662956, + "loss": 0.0, + "step": 31260 + }, + { + "epoch": 2.9169543715592052, + "grad_norm": NaN, + "learning_rate": 0.00016427392182415755, + "loss": 0.0, + "step": 31261 + }, + { + "epoch": 2.917047681254082, + "grad_norm": NaN, + "learning_rate": 0.0001642663923457252, + "loss": 0.0, + "step": 31262 + }, + { + "epoch": 2.9171409909489596, + "grad_norm": NaN, + "learning_rate": 0.00016425886283101754, + "loss": 0.0, + "step": 31263 + }, + { + "epoch": 2.917234300643837, + "grad_norm": NaN, + "learning_rate": 0.00016425133328005383, + "loss": 0.0, + "step": 31264 + }, + { + "epoch": 2.917327610338714, + "grad_norm": NaN, + "learning_rate": 0.0001642438036928533, + "loss": 0.0, + "step": 31265 + }, + { + "epoch": 2.9174209200335914, + "grad_norm": NaN, + "learning_rate": 0.00016423627406943488, + "loss": 0.0, + "step": 31266 + }, + { + "epoch": 2.917514229728469, + "grad_norm": NaN, + "learning_rate": 0.0001642287444098179, + "loss": 0.0, + "step": 31267 + }, + { + "epoch": 2.9176075394233463, + "grad_norm": NaN, + "learning_rate": 0.00016422121471402145, + "loss": 0.0, + "step": 31268 + }, + { + "epoch": 2.9177008491182232, + "grad_norm": NaN, + "learning_rate": 0.00016421368498206458, + "loss": 0.0, + "step": 31269 + }, + { + "epoch": 2.9177941588131007, + "grad_norm": NaN, + "learning_rate": 0.00016420615521396656, + "loss": 0.0, + "step": 31270 + }, + { + "epoch": 2.917887468507978, + "grad_norm": NaN, + "learning_rate": 0.00016419862540974656, + "loss": 0.0, + "step": 31271 + }, + { + "epoch": 2.917980778202855, + "grad_norm": NaN, + "learning_rate": 0.00016419109556942358, + "loss": 0.0, + "step": 31272 + }, + { + "epoch": 2.9180740878977325, + "grad_norm": NaN, + "learning_rate": 0.00016418356569301688, + "loss": 0.0, + "step": 31273 + }, + { + "epoch": 2.91816739759261, + "grad_norm": NaN, + "learning_rate": 0.00016417603578054558, + "loss": 0.0, + "step": 31274 + }, + { + "epoch": 2.9182607072874873, + "grad_norm": NaN, + "learning_rate": 0.00016416850583202881, + "loss": 0.0, + "step": 31275 + }, + { + "epoch": 2.9183540169823647, + "grad_norm": NaN, + "learning_rate": 0.00016416097584748572, + "loss": 0.0, + "step": 31276 + }, + { + "epoch": 2.9184473266772417, + "grad_norm": NaN, + "learning_rate": 0.00016415344582693545, + "loss": 0.0, + "step": 31277 + }, + { + "epoch": 2.918540636372119, + "grad_norm": NaN, + "learning_rate": 0.00016414591577039724, + "loss": 0.0, + "step": 31278 + }, + { + "epoch": 2.918633946066996, + "grad_norm": NaN, + "learning_rate": 0.00016413838567789005, + "loss": 0.0, + "step": 31279 + }, + { + "epoch": 2.9187272557618735, + "grad_norm": NaN, + "learning_rate": 0.00016413085554943318, + "loss": 0.0, + "step": 31280 + }, + { + "epoch": 2.918820565456751, + "grad_norm": NaN, + "learning_rate": 0.00016412332538504574, + "loss": 0.0, + "step": 31281 + }, + { + "epoch": 2.9189138751516284, + "grad_norm": NaN, + "learning_rate": 0.00016411579518474685, + "loss": 0.0, + "step": 31282 + }, + { + "epoch": 2.919007184846506, + "grad_norm": NaN, + "learning_rate": 0.00016410826494855568, + "loss": 0.0, + "step": 31283 + }, + { + "epoch": 2.9191004945413828, + "grad_norm": NaN, + "learning_rate": 0.00016410073467649141, + "loss": 0.0, + "step": 31284 + }, + { + "epoch": 2.91919380423626, + "grad_norm": NaN, + "learning_rate": 0.0001640932043685731, + "loss": 0.0, + "step": 31285 + }, + { + "epoch": 2.9192871139311376, + "grad_norm": NaN, + "learning_rate": 0.0001640856740248199, + "loss": 0.0, + "step": 31286 + }, + { + "epoch": 2.9193804236260146, + "grad_norm": NaN, + "learning_rate": 0.0001640781436452511, + "loss": 0.0, + "step": 31287 + }, + { + "epoch": 2.919473733320892, + "grad_norm": NaN, + "learning_rate": 0.0001640706132298857, + "loss": 0.0, + "step": 31288 + }, + { + "epoch": 2.9195670430157694, + "grad_norm": NaN, + "learning_rate": 0.00016406308277874292, + "loss": 0.0, + "step": 31289 + }, + { + "epoch": 2.919660352710647, + "grad_norm": NaN, + "learning_rate": 0.0001640555522918419, + "loss": 0.0, + "step": 31290 + }, + { + "epoch": 2.919753662405524, + "grad_norm": NaN, + "learning_rate": 0.00016404802176920173, + "loss": 0.0, + "step": 31291 + }, + { + "epoch": 2.9198469721004012, + "grad_norm": NaN, + "learning_rate": 0.0001640404912108416, + "loss": 0.0, + "step": 31292 + }, + { + "epoch": 2.9199402817952786, + "grad_norm": NaN, + "learning_rate": 0.00016403296061678075, + "loss": 0.0, + "step": 31293 + }, + { + "epoch": 2.9200335914901556, + "grad_norm": NaN, + "learning_rate": 0.00016402542998703812, + "loss": 0.0, + "step": 31294 + }, + { + "epoch": 2.920126901185033, + "grad_norm": NaN, + "learning_rate": 0.000164017899321633, + "loss": 0.0, + "step": 31295 + }, + { + "epoch": 2.9202202108799105, + "grad_norm": NaN, + "learning_rate": 0.00016401036862058457, + "loss": 0.0, + "step": 31296 + }, + { + "epoch": 2.920313520574788, + "grad_norm": NaN, + "learning_rate": 0.00016400283788391192, + "loss": 0.0, + "step": 31297 + }, + { + "epoch": 2.920406830269665, + "grad_norm": NaN, + "learning_rate": 0.00016399530711163414, + "loss": 0.0, + "step": 31298 + }, + { + "epoch": 2.9205001399645423, + "grad_norm": NaN, + "learning_rate": 0.0001639877763037705, + "loss": 0.0, + "step": 31299 + }, + { + "epoch": 2.9205934496594197, + "grad_norm": NaN, + "learning_rate": 0.0001639802454603401, + "loss": 0.0, + "step": 31300 + }, + { + "epoch": 2.9206867593542967, + "grad_norm": NaN, + "learning_rate": 0.000163972714581362, + "loss": 0.0, + "step": 31301 + }, + { + "epoch": 2.920780069049174, + "grad_norm": NaN, + "learning_rate": 0.00016396518366685548, + "loss": 0.0, + "step": 31302 + }, + { + "epoch": 2.9208733787440515, + "grad_norm": NaN, + "learning_rate": 0.00016395765271683965, + "loss": 0.0, + "step": 31303 + }, + { + "epoch": 2.920966688438929, + "grad_norm": NaN, + "learning_rate": 0.00016395012173133356, + "loss": 0.0, + "step": 31304 + }, + { + "epoch": 2.9210599981338063, + "grad_norm": NaN, + "learning_rate": 0.00016394259071035648, + "loss": 0.0, + "step": 31305 + }, + { + "epoch": 2.9211533078286833, + "grad_norm": NaN, + "learning_rate": 0.00016393505965392759, + "loss": 0.0, + "step": 31306 + }, + { + "epoch": 2.9212466175235607, + "grad_norm": NaN, + "learning_rate": 0.00016392752856206588, + "loss": 0.0, + "step": 31307 + }, + { + "epoch": 2.921339927218438, + "grad_norm": NaN, + "learning_rate": 0.00016391999743479063, + "loss": 0.0, + "step": 31308 + }, + { + "epoch": 2.921433236913315, + "grad_norm": NaN, + "learning_rate": 0.000163912466272121, + "loss": 0.0, + "step": 31309 + }, + { + "epoch": 2.9215265466081926, + "grad_norm": NaN, + "learning_rate": 0.000163904935074076, + "loss": 0.0, + "step": 31310 + }, + { + "epoch": 2.92161985630307, + "grad_norm": NaN, + "learning_rate": 0.00016389740384067494, + "loss": 0.0, + "step": 31311 + }, + { + "epoch": 2.9217131659979474, + "grad_norm": NaN, + "learning_rate": 0.00016388987257193684, + "loss": 0.0, + "step": 31312 + }, + { + "epoch": 2.9218064756928244, + "grad_norm": NaN, + "learning_rate": 0.00016388234126788097, + "loss": 0.0, + "step": 31313 + }, + { + "epoch": 2.921899785387702, + "grad_norm": NaN, + "learning_rate": 0.00016387480992852637, + "loss": 0.0, + "step": 31314 + }, + { + "epoch": 2.921993095082579, + "grad_norm": NaN, + "learning_rate": 0.00016386727855389227, + "loss": 0.0, + "step": 31315 + }, + { + "epoch": 2.922086404777456, + "grad_norm": NaN, + "learning_rate": 0.00016385974714399774, + "loss": 0.0, + "step": 31316 + }, + { + "epoch": 2.9221797144723336, + "grad_norm": NaN, + "learning_rate": 0.000163852215698862, + "loss": 0.0, + "step": 31317 + }, + { + "epoch": 2.922273024167211, + "grad_norm": NaN, + "learning_rate": 0.00016384468421850421, + "loss": 0.0, + "step": 31318 + }, + { + "epoch": 2.9223663338620884, + "grad_norm": NaN, + "learning_rate": 0.00016383715270294345, + "loss": 0.0, + "step": 31319 + }, + { + "epoch": 2.9224596435569654, + "grad_norm": NaN, + "learning_rate": 0.00016382962115219892, + "loss": 0.0, + "step": 31320 + }, + { + "epoch": 2.922552953251843, + "grad_norm": NaN, + "learning_rate": 0.0001638220895662898, + "loss": 0.0, + "step": 31321 + }, + { + "epoch": 2.9226462629467203, + "grad_norm": NaN, + "learning_rate": 0.0001638145579452352, + "loss": 0.0, + "step": 31322 + }, + { + "epoch": 2.9227395726415972, + "grad_norm": NaN, + "learning_rate": 0.0001638070262890542, + "loss": 0.0, + "step": 31323 + }, + { + "epoch": 2.9228328823364746, + "grad_norm": NaN, + "learning_rate": 0.00016379949459776607, + "loss": 0.0, + "step": 31324 + }, + { + "epoch": 2.922926192031352, + "grad_norm": NaN, + "learning_rate": 0.00016379196287138992, + "loss": 0.0, + "step": 31325 + }, + { + "epoch": 2.9230195017262295, + "grad_norm": NaN, + "learning_rate": 0.00016378443110994488, + "loss": 0.0, + "step": 31326 + }, + { + "epoch": 2.923112811421107, + "grad_norm": NaN, + "learning_rate": 0.00016377689931345007, + "loss": 0.0, + "step": 31327 + }, + { + "epoch": 2.923206121115984, + "grad_norm": NaN, + "learning_rate": 0.0001637693674819248, + "loss": 0.0, + "step": 31328 + }, + { + "epoch": 2.9232994308108613, + "grad_norm": NaN, + "learning_rate": 0.00016376183561538798, + "loss": 0.0, + "step": 31329 + }, + { + "epoch": 2.9233927405057383, + "grad_norm": NaN, + "learning_rate": 0.00016375430371385897, + "loss": 0.0, + "step": 31330 + }, + { + "epoch": 2.9234860502006157, + "grad_norm": NaN, + "learning_rate": 0.0001637467717773568, + "loss": 0.0, + "step": 31331 + }, + { + "epoch": 2.923579359895493, + "grad_norm": NaN, + "learning_rate": 0.00016373923980590068, + "loss": 0.0, + "step": 31332 + }, + { + "epoch": 2.9236726695903705, + "grad_norm": NaN, + "learning_rate": 0.00016373170779950975, + "loss": 0.0, + "step": 31333 + }, + { + "epoch": 2.923765979285248, + "grad_norm": NaN, + "learning_rate": 0.00016372417575820315, + "loss": 0.0, + "step": 31334 + }, + { + "epoch": 2.923859288980125, + "grad_norm": NaN, + "learning_rate": 0.000163716643682, + "loss": 0.0, + "step": 31335 + }, + { + "epoch": 2.9239525986750023, + "grad_norm": NaN, + "learning_rate": 0.00016370911157091955, + "loss": 0.0, + "step": 31336 + }, + { + "epoch": 2.9240459083698798, + "grad_norm": NaN, + "learning_rate": 0.00016370157942498085, + "loss": 0.0, + "step": 31337 + }, + { + "epoch": 2.9241392180647567, + "grad_norm": NaN, + "learning_rate": 0.0001636940472442031, + "loss": 0.0, + "step": 31338 + }, + { + "epoch": 2.924232527759634, + "grad_norm": NaN, + "learning_rate": 0.00016368651502860545, + "loss": 0.0, + "step": 31339 + }, + { + "epoch": 2.9243258374545116, + "grad_norm": NaN, + "learning_rate": 0.00016367898277820704, + "loss": 0.0, + "step": 31340 + }, + { + "epoch": 2.924419147149389, + "grad_norm": NaN, + "learning_rate": 0.00016367145049302702, + "loss": 0.0, + "step": 31341 + }, + { + "epoch": 2.924512456844266, + "grad_norm": NaN, + "learning_rate": 0.0001636639181730846, + "loss": 0.0, + "step": 31342 + }, + { + "epoch": 2.9246057665391434, + "grad_norm": NaN, + "learning_rate": 0.00016365638581839883, + "loss": 0.0, + "step": 31343 + }, + { + "epoch": 2.924699076234021, + "grad_norm": NaN, + "learning_rate": 0.00016364885342898895, + "loss": 0.0, + "step": 31344 + }, + { + "epoch": 2.924792385928898, + "grad_norm": NaN, + "learning_rate": 0.00016364132100487404, + "loss": 0.0, + "step": 31345 + }, + { + "epoch": 2.924885695623775, + "grad_norm": NaN, + "learning_rate": 0.0001636337885460733, + "loss": 0.0, + "step": 31346 + }, + { + "epoch": 2.9249790053186526, + "grad_norm": NaN, + "learning_rate": 0.00016362625605260595, + "loss": 0.0, + "step": 31347 + }, + { + "epoch": 2.92507231501353, + "grad_norm": NaN, + "learning_rate": 0.00016361872352449095, + "loss": 0.0, + "step": 31348 + }, + { + "epoch": 2.9251656247084075, + "grad_norm": NaN, + "learning_rate": 0.00016361119096174764, + "loss": 0.0, + "step": 31349 + }, + { + "epoch": 2.9252589344032844, + "grad_norm": NaN, + "learning_rate": 0.00016360365836439508, + "loss": 0.0, + "step": 31350 + }, + { + "epoch": 2.925352244098162, + "grad_norm": NaN, + "learning_rate": 0.00016359612573245248, + "loss": 0.0, + "step": 31351 + }, + { + "epoch": 2.925445553793039, + "grad_norm": NaN, + "learning_rate": 0.00016358859306593894, + "loss": 0.0, + "step": 31352 + }, + { + "epoch": 2.9255388634879163, + "grad_norm": NaN, + "learning_rate": 0.0001635810603648736, + "loss": 0.0, + "step": 31353 + }, + { + "epoch": 2.9256321731827937, + "grad_norm": NaN, + "learning_rate": 0.0001635735276292757, + "loss": 0.0, + "step": 31354 + }, + { + "epoch": 2.925725482877671, + "grad_norm": NaN, + "learning_rate": 0.0001635659948591643, + "loss": 0.0, + "step": 31355 + }, + { + "epoch": 2.9258187925725485, + "grad_norm": NaN, + "learning_rate": 0.0001635584620545586, + "loss": 0.0, + "step": 31356 + }, + { + "epoch": 2.9259121022674255, + "grad_norm": NaN, + "learning_rate": 0.00016355092921547775, + "loss": 0.0, + "step": 31357 + }, + { + "epoch": 2.926005411962303, + "grad_norm": NaN, + "learning_rate": 0.00016354339634194092, + "loss": 0.0, + "step": 31358 + }, + { + "epoch": 2.9260987216571803, + "grad_norm": NaN, + "learning_rate": 0.00016353586343396723, + "loss": 0.0, + "step": 31359 + }, + { + "epoch": 2.9261920313520573, + "grad_norm": NaN, + "learning_rate": 0.00016352833049157586, + "loss": 0.0, + "step": 31360 + }, + { + "epoch": 2.9262853410469347, + "grad_norm": NaN, + "learning_rate": 0.00016352079751478593, + "loss": 0.0, + "step": 31361 + }, + { + "epoch": 2.926378650741812, + "grad_norm": NaN, + "learning_rate": 0.00016351326450361663, + "loss": 0.0, + "step": 31362 + }, + { + "epoch": 2.9264719604366896, + "grad_norm": NaN, + "learning_rate": 0.0001635057314580871, + "loss": 0.0, + "step": 31363 + }, + { + "epoch": 2.9265652701315665, + "grad_norm": NaN, + "learning_rate": 0.0001634981983782165, + "loss": 0.0, + "step": 31364 + }, + { + "epoch": 2.926658579826444, + "grad_norm": NaN, + "learning_rate": 0.00016349066526402396, + "loss": 0.0, + "step": 31365 + }, + { + "epoch": 2.9267518895213214, + "grad_norm": NaN, + "learning_rate": 0.00016348313211552872, + "loss": 0.0, + "step": 31366 + }, + { + "epoch": 2.9268451992161983, + "grad_norm": NaN, + "learning_rate": 0.0001634755989327498, + "loss": 0.0, + "step": 31367 + }, + { + "epoch": 2.9269385089110758, + "grad_norm": NaN, + "learning_rate": 0.00016346806571570642, + "loss": 0.0, + "step": 31368 + }, + { + "epoch": 2.927031818605953, + "grad_norm": NaN, + "learning_rate": 0.00016346053246441778, + "loss": 0.0, + "step": 31369 + }, + { + "epoch": 2.9271251283008306, + "grad_norm": NaN, + "learning_rate": 0.00016345299917890297, + "loss": 0.0, + "step": 31370 + }, + { + "epoch": 2.927218437995708, + "grad_norm": NaN, + "learning_rate": 0.00016344546585918116, + "loss": 0.0, + "step": 31371 + }, + { + "epoch": 2.927311747690585, + "grad_norm": NaN, + "learning_rate": 0.00016343793250527154, + "loss": 0.0, + "step": 31372 + }, + { + "epoch": 2.9274050573854624, + "grad_norm": NaN, + "learning_rate": 0.00016343039911719323, + "loss": 0.0, + "step": 31373 + }, + { + "epoch": 2.9274983670803394, + "grad_norm": NaN, + "learning_rate": 0.0001634228656949654, + "loss": 0.0, + "step": 31374 + }, + { + "epoch": 2.927591676775217, + "grad_norm": NaN, + "learning_rate": 0.00016341533223860716, + "loss": 0.0, + "step": 31375 + }, + { + "epoch": 2.9276849864700942, + "grad_norm": NaN, + "learning_rate": 0.00016340779874813774, + "loss": 0.0, + "step": 31376 + }, + { + "epoch": 2.9277782961649716, + "grad_norm": NaN, + "learning_rate": 0.00016340026522357627, + "loss": 0.0, + "step": 31377 + }, + { + "epoch": 2.927871605859849, + "grad_norm": NaN, + "learning_rate": 0.0001633927316649419, + "loss": 0.0, + "step": 31378 + }, + { + "epoch": 2.927964915554726, + "grad_norm": NaN, + "learning_rate": 0.00016338519807225374, + "loss": 0.0, + "step": 31379 + }, + { + "epoch": 2.9280582252496035, + "grad_norm": NaN, + "learning_rate": 0.000163377664445531, + "loss": 0.0, + "step": 31380 + }, + { + "epoch": 2.928151534944481, + "grad_norm": NaN, + "learning_rate": 0.0001633701307847928, + "loss": 0.0, + "step": 31381 + }, + { + "epoch": 2.928244844639358, + "grad_norm": NaN, + "learning_rate": 0.00016336259709005834, + "loss": 0.0, + "step": 31382 + }, + { + "epoch": 2.9283381543342353, + "grad_norm": NaN, + "learning_rate": 0.00016335506336134678, + "loss": 0.0, + "step": 31383 + }, + { + "epoch": 2.9284314640291127, + "grad_norm": NaN, + "learning_rate": 0.00016334752959867724, + "loss": 0.0, + "step": 31384 + }, + { + "epoch": 2.92852477372399, + "grad_norm": NaN, + "learning_rate": 0.00016333999580206884, + "loss": 0.0, + "step": 31385 + }, + { + "epoch": 2.928618083418867, + "grad_norm": NaN, + "learning_rate": 0.00016333246197154083, + "loss": 0.0, + "step": 31386 + }, + { + "epoch": 2.9287113931137445, + "grad_norm": NaN, + "learning_rate": 0.0001633249281071123, + "loss": 0.0, + "step": 31387 + }, + { + "epoch": 2.928804702808622, + "grad_norm": NaN, + "learning_rate": 0.00016331739420880244, + "loss": 0.0, + "step": 31388 + }, + { + "epoch": 2.928898012503499, + "grad_norm": NaN, + "learning_rate": 0.0001633098602766304, + "loss": 0.0, + "step": 31389 + }, + { + "epoch": 2.9289913221983763, + "grad_norm": NaN, + "learning_rate": 0.0001633023263106153, + "loss": 0.0, + "step": 31390 + }, + { + "epoch": 2.9290846318932537, + "grad_norm": NaN, + "learning_rate": 0.00016329479231077634, + "loss": 0.0, + "step": 31391 + }, + { + "epoch": 2.929177941588131, + "grad_norm": NaN, + "learning_rate": 0.00016328725827713264, + "loss": 0.0, + "step": 31392 + }, + { + "epoch": 2.9292712512830086, + "grad_norm": NaN, + "learning_rate": 0.0001632797242097034, + "loss": 0.0, + "step": 31393 + }, + { + "epoch": 2.9293645609778856, + "grad_norm": NaN, + "learning_rate": 0.00016327219010850774, + "loss": 0.0, + "step": 31394 + }, + { + "epoch": 2.929457870672763, + "grad_norm": NaN, + "learning_rate": 0.00016326465597356482, + "loss": 0.0, + "step": 31395 + }, + { + "epoch": 2.92955118036764, + "grad_norm": NaN, + "learning_rate": 0.00016325712180489385, + "loss": 0.0, + "step": 31396 + }, + { + "epoch": 2.9296444900625174, + "grad_norm": NaN, + "learning_rate": 0.00016324958760251392, + "loss": 0.0, + "step": 31397 + }, + { + "epoch": 2.929737799757395, + "grad_norm": NaN, + "learning_rate": 0.0001632420533664442, + "loss": 0.0, + "step": 31398 + }, + { + "epoch": 2.929831109452272, + "grad_norm": NaN, + "learning_rate": 0.0001632345190967039, + "loss": 0.0, + "step": 31399 + }, + { + "epoch": 2.9299244191471496, + "grad_norm": NaN, + "learning_rate": 0.00016322698479331212, + "loss": 0.0, + "step": 31400 + }, + { + "epoch": 2.9300177288420266, + "grad_norm": NaN, + "learning_rate": 0.00016321945045628803, + "loss": 0.0, + "step": 31401 + }, + { + "epoch": 2.930111038536904, + "grad_norm": NaN, + "learning_rate": 0.0001632119160856508, + "loss": 0.0, + "step": 31402 + }, + { + "epoch": 2.9302043482317814, + "grad_norm": NaN, + "learning_rate": 0.00016320438168141956, + "loss": 0.0, + "step": 31403 + }, + { + "epoch": 2.9302976579266584, + "grad_norm": NaN, + "learning_rate": 0.0001631968472436135, + "loss": 0.0, + "step": 31404 + }, + { + "epoch": 2.930390967621536, + "grad_norm": NaN, + "learning_rate": 0.00016318931277225178, + "loss": 0.0, + "step": 31405 + }, + { + "epoch": 2.9304842773164133, + "grad_norm": NaN, + "learning_rate": 0.00016318177826735357, + "loss": 0.0, + "step": 31406 + }, + { + "epoch": 2.9305775870112907, + "grad_norm": NaN, + "learning_rate": 0.00016317424372893797, + "loss": 0.0, + "step": 31407 + }, + { + "epoch": 2.9306708967061676, + "grad_norm": NaN, + "learning_rate": 0.00016316670915702414, + "loss": 0.0, + "step": 31408 + }, + { + "epoch": 2.930764206401045, + "grad_norm": NaN, + "learning_rate": 0.0001631591745516313, + "loss": 0.0, + "step": 31409 + }, + { + "epoch": 2.9308575160959225, + "grad_norm": NaN, + "learning_rate": 0.0001631516399127786, + "loss": 0.0, + "step": 31410 + }, + { + "epoch": 2.9309508257907995, + "grad_norm": NaN, + "learning_rate": 0.00016314410524048516, + "loss": 0.0, + "step": 31411 + }, + { + "epoch": 2.931044135485677, + "grad_norm": NaN, + "learning_rate": 0.00016313657053477015, + "loss": 0.0, + "step": 31412 + }, + { + "epoch": 2.9311374451805543, + "grad_norm": NaN, + "learning_rate": 0.0001631290357956527, + "loss": 0.0, + "step": 31413 + }, + { + "epoch": 2.9312307548754317, + "grad_norm": NaN, + "learning_rate": 0.00016312150102315204, + "loss": 0.0, + "step": 31414 + }, + { + "epoch": 2.9313240645703087, + "grad_norm": NaN, + "learning_rate": 0.00016311396621728727, + "loss": 0.0, + "step": 31415 + }, + { + "epoch": 2.931417374265186, + "grad_norm": NaN, + "learning_rate": 0.0001631064313780776, + "loss": 0.0, + "step": 31416 + }, + { + "epoch": 2.9315106839600635, + "grad_norm": NaN, + "learning_rate": 0.00016309889650554214, + "loss": 0.0, + "step": 31417 + }, + { + "epoch": 2.9316039936549405, + "grad_norm": NaN, + "learning_rate": 0.00016309136159970005, + "loss": 0.0, + "step": 31418 + }, + { + "epoch": 2.931697303349818, + "grad_norm": NaN, + "learning_rate": 0.00016308382666057052, + "loss": 0.0, + "step": 31419 + }, + { + "epoch": 2.9317906130446953, + "grad_norm": NaN, + "learning_rate": 0.0001630762916881727, + "loss": 0.0, + "step": 31420 + }, + { + "epoch": 2.9318839227395728, + "grad_norm": NaN, + "learning_rate": 0.00016306875668252574, + "loss": 0.0, + "step": 31421 + }, + { + "epoch": 2.93197723243445, + "grad_norm": NaN, + "learning_rate": 0.0001630612216436488, + "loss": 0.0, + "step": 31422 + }, + { + "epoch": 2.932070542129327, + "grad_norm": NaN, + "learning_rate": 0.00016305368657156105, + "loss": 0.0, + "step": 31423 + }, + { + "epoch": 2.9321638518242046, + "grad_norm": NaN, + "learning_rate": 0.0001630461514662816, + "loss": 0.0, + "step": 31424 + }, + { + "epoch": 2.9322571615190816, + "grad_norm": NaN, + "learning_rate": 0.0001630386163278297, + "loss": 0.0, + "step": 31425 + }, + { + "epoch": 2.932350471213959, + "grad_norm": NaN, + "learning_rate": 0.00016303108115622443, + "loss": 0.0, + "step": 31426 + }, + { + "epoch": 2.9324437809088364, + "grad_norm": NaN, + "learning_rate": 0.00016302354595148502, + "loss": 0.0, + "step": 31427 + }, + { + "epoch": 2.932537090603714, + "grad_norm": NaN, + "learning_rate": 0.00016301601071363057, + "loss": 0.0, + "step": 31428 + }, + { + "epoch": 2.9326304002985912, + "grad_norm": NaN, + "learning_rate": 0.00016300847544268023, + "loss": 0.0, + "step": 31429 + }, + { + "epoch": 2.932723709993468, + "grad_norm": NaN, + "learning_rate": 0.00016300094013865324, + "loss": 0.0, + "step": 31430 + }, + { + "epoch": 2.9328170196883456, + "grad_norm": NaN, + "learning_rate": 0.00016299340480156868, + "loss": 0.0, + "step": 31431 + }, + { + "epoch": 2.932910329383223, + "grad_norm": NaN, + "learning_rate": 0.00016298586943144577, + "loss": 0.0, + "step": 31432 + }, + { + "epoch": 2.9330036390781, + "grad_norm": NaN, + "learning_rate": 0.0001629783340283036, + "loss": 0.0, + "step": 31433 + }, + { + "epoch": 2.9330969487729774, + "grad_norm": NaN, + "learning_rate": 0.0001629707985921614, + "loss": 0.0, + "step": 31434 + }, + { + "epoch": 2.933190258467855, + "grad_norm": NaN, + "learning_rate": 0.00016296326312303827, + "loss": 0.0, + "step": 31435 + }, + { + "epoch": 2.9332835681627323, + "grad_norm": NaN, + "learning_rate": 0.00016295572762095345, + "loss": 0.0, + "step": 31436 + }, + { + "epoch": 2.9333768778576093, + "grad_norm": NaN, + "learning_rate": 0.00016294819208592602, + "loss": 0.0, + "step": 31437 + }, + { + "epoch": 2.9334701875524867, + "grad_norm": NaN, + "learning_rate": 0.00016294065651797517, + "loss": 0.0, + "step": 31438 + }, + { + "epoch": 2.933563497247364, + "grad_norm": NaN, + "learning_rate": 0.00016293312091712007, + "loss": 0.0, + "step": 31439 + }, + { + "epoch": 2.933656806942241, + "grad_norm": NaN, + "learning_rate": 0.00016292558528337988, + "loss": 0.0, + "step": 31440 + }, + { + "epoch": 2.9337501166371185, + "grad_norm": NaN, + "learning_rate": 0.00016291804961677378, + "loss": 0.0, + "step": 31441 + }, + { + "epoch": 2.933843426331996, + "grad_norm": NaN, + "learning_rate": 0.00016291051391732086, + "loss": 0.0, + "step": 31442 + }, + { + "epoch": 2.9339367360268733, + "grad_norm": NaN, + "learning_rate": 0.00016290297818504035, + "loss": 0.0, + "step": 31443 + }, + { + "epoch": 2.9340300457217507, + "grad_norm": NaN, + "learning_rate": 0.00016289544241995137, + "loss": 0.0, + "step": 31444 + }, + { + "epoch": 2.9341233554166277, + "grad_norm": NaN, + "learning_rate": 0.00016288790662207314, + "loss": 0.0, + "step": 31445 + }, + { + "epoch": 2.934216665111505, + "grad_norm": NaN, + "learning_rate": 0.00016288037079142473, + "loss": 0.0, + "step": 31446 + }, + { + "epoch": 2.934309974806382, + "grad_norm": NaN, + "learning_rate": 0.0001628728349280254, + "loss": 0.0, + "step": 31447 + }, + { + "epoch": 2.9344032845012595, + "grad_norm": NaN, + "learning_rate": 0.00016286529903189424, + "loss": 0.0, + "step": 31448 + }, + { + "epoch": 2.934496594196137, + "grad_norm": NaN, + "learning_rate": 0.00016285776310305042, + "loss": 0.0, + "step": 31449 + }, + { + "epoch": 2.9345899038910144, + "grad_norm": NaN, + "learning_rate": 0.00016285022714151315, + "loss": 0.0, + "step": 31450 + }, + { + "epoch": 2.934683213585892, + "grad_norm": NaN, + "learning_rate": 0.00016284269114730154, + "loss": 0.0, + "step": 31451 + }, + { + "epoch": 2.9347765232807688, + "grad_norm": NaN, + "learning_rate": 0.00016283515512043476, + "loss": 0.0, + "step": 31452 + }, + { + "epoch": 2.934869832975646, + "grad_norm": NaN, + "learning_rate": 0.000162827619060932, + "loss": 0.0, + "step": 31453 + }, + { + "epoch": 2.9349631426705236, + "grad_norm": NaN, + "learning_rate": 0.00016282008296881242, + "loss": 0.0, + "step": 31454 + }, + { + "epoch": 2.9350564523654006, + "grad_norm": NaN, + "learning_rate": 0.00016281254684409515, + "loss": 0.0, + "step": 31455 + }, + { + "epoch": 2.935149762060278, + "grad_norm": NaN, + "learning_rate": 0.00016280501068679934, + "loss": 0.0, + "step": 31456 + }, + { + "epoch": 2.9352430717551554, + "grad_norm": NaN, + "learning_rate": 0.0001627974744969442, + "loss": 0.0, + "step": 31457 + }, + { + "epoch": 2.935336381450033, + "grad_norm": NaN, + "learning_rate": 0.00016278993827454892, + "loss": 0.0, + "step": 31458 + }, + { + "epoch": 2.93542969114491, + "grad_norm": NaN, + "learning_rate": 0.00016278240201963258, + "loss": 0.0, + "step": 31459 + }, + { + "epoch": 2.9355230008397872, + "grad_norm": NaN, + "learning_rate": 0.00016277486573221438, + "loss": 0.0, + "step": 31460 + }, + { + "epoch": 2.9356163105346647, + "grad_norm": NaN, + "learning_rate": 0.00016276732941231345, + "loss": 0.0, + "step": 31461 + }, + { + "epoch": 2.9357096202295416, + "grad_norm": NaN, + "learning_rate": 0.00016275979305994901, + "loss": 0.0, + "step": 31462 + }, + { + "epoch": 2.935802929924419, + "grad_norm": NaN, + "learning_rate": 0.0001627522566751402, + "loss": 0.0, + "step": 31463 + }, + { + "epoch": 2.9358962396192965, + "grad_norm": NaN, + "learning_rate": 0.0001627447202579062, + "loss": 0.0, + "step": 31464 + }, + { + "epoch": 2.935989549314174, + "grad_norm": NaN, + "learning_rate": 0.0001627371838082661, + "loss": 0.0, + "step": 31465 + }, + { + "epoch": 2.9360828590090513, + "grad_norm": NaN, + "learning_rate": 0.00016272964732623915, + "loss": 0.0, + "step": 31466 + }, + { + "epoch": 2.9361761687039283, + "grad_norm": NaN, + "learning_rate": 0.00016272211081184447, + "loss": 0.0, + "step": 31467 + }, + { + "epoch": 2.9362694783988057, + "grad_norm": NaN, + "learning_rate": 0.0001627145742651012, + "loss": 0.0, + "step": 31468 + }, + { + "epoch": 2.9363627880936827, + "grad_norm": NaN, + "learning_rate": 0.00016270703768602859, + "loss": 0.0, + "step": 31469 + }, + { + "epoch": 2.93645609778856, + "grad_norm": NaN, + "learning_rate": 0.00016269950107464573, + "loss": 0.0, + "step": 31470 + }, + { + "epoch": 2.9365494074834375, + "grad_norm": NaN, + "learning_rate": 0.0001626919644309718, + "loss": 0.0, + "step": 31471 + }, + { + "epoch": 2.936642717178315, + "grad_norm": NaN, + "learning_rate": 0.00016268442775502594, + "loss": 0.0, + "step": 31472 + }, + { + "epoch": 2.9367360268731924, + "grad_norm": NaN, + "learning_rate": 0.00016267689104682736, + "loss": 0.0, + "step": 31473 + }, + { + "epoch": 2.9368293365680693, + "grad_norm": NaN, + "learning_rate": 0.0001626693543063952, + "loss": 0.0, + "step": 31474 + }, + { + "epoch": 2.9369226462629467, + "grad_norm": NaN, + "learning_rate": 0.0001626618175337486, + "loss": 0.0, + "step": 31475 + }, + { + "epoch": 2.937015955957824, + "grad_norm": NaN, + "learning_rate": 0.00016265428072890676, + "loss": 0.0, + "step": 31476 + }, + { + "epoch": 2.937109265652701, + "grad_norm": NaN, + "learning_rate": 0.00016264674389188886, + "loss": 0.0, + "step": 31477 + }, + { + "epoch": 2.9372025753475786, + "grad_norm": NaN, + "learning_rate": 0.00016263920702271403, + "loss": 0.0, + "step": 31478 + }, + { + "epoch": 2.937295885042456, + "grad_norm": NaN, + "learning_rate": 0.0001626316701214014, + "loss": 0.0, + "step": 31479 + }, + { + "epoch": 2.9373891947373334, + "grad_norm": NaN, + "learning_rate": 0.0001626241331879702, + "loss": 0.0, + "step": 31480 + }, + { + "epoch": 2.9374825044322104, + "grad_norm": NaN, + "learning_rate": 0.0001626165962224396, + "loss": 0.0, + "step": 31481 + }, + { + "epoch": 2.937575814127088, + "grad_norm": NaN, + "learning_rate": 0.0001626090592248287, + "loss": 0.0, + "step": 31482 + }, + { + "epoch": 2.937669123821965, + "grad_norm": NaN, + "learning_rate": 0.00016260152219515667, + "loss": 0.0, + "step": 31483 + }, + { + "epoch": 2.937762433516842, + "grad_norm": NaN, + "learning_rate": 0.00016259398513344275, + "loss": 0.0, + "step": 31484 + }, + { + "epoch": 2.9378557432117196, + "grad_norm": NaN, + "learning_rate": 0.00016258644803970605, + "loss": 0.0, + "step": 31485 + }, + { + "epoch": 2.937949052906597, + "grad_norm": NaN, + "learning_rate": 0.0001625789109139657, + "loss": 0.0, + "step": 31486 + }, + { + "epoch": 2.9380423626014744, + "grad_norm": NaN, + "learning_rate": 0.00016257137375624096, + "loss": 0.0, + "step": 31487 + }, + { + "epoch": 2.938135672296352, + "grad_norm": NaN, + "learning_rate": 0.00016256383656655093, + "loss": 0.0, + "step": 31488 + }, + { + "epoch": 2.938228981991229, + "grad_norm": NaN, + "learning_rate": 0.00016255629934491478, + "loss": 0.0, + "step": 31489 + }, + { + "epoch": 2.9383222916861063, + "grad_norm": NaN, + "learning_rate": 0.00016254876209135166, + "loss": 0.0, + "step": 31490 + }, + { + "epoch": 2.9384156013809832, + "grad_norm": NaN, + "learning_rate": 0.00016254122480588077, + "loss": 0.0, + "step": 31491 + }, + { + "epoch": 2.9385089110758607, + "grad_norm": NaN, + "learning_rate": 0.00016253368748852124, + "loss": 0.0, + "step": 31492 + }, + { + "epoch": 2.938602220770738, + "grad_norm": NaN, + "learning_rate": 0.00016252615013929227, + "loss": 0.0, + "step": 31493 + }, + { + "epoch": 2.9386955304656155, + "grad_norm": NaN, + "learning_rate": 0.00016251861275821302, + "loss": 0.0, + "step": 31494 + }, + { + "epoch": 2.938788840160493, + "grad_norm": NaN, + "learning_rate": 0.00016251107534530265, + "loss": 0.0, + "step": 31495 + }, + { + "epoch": 2.93888214985537, + "grad_norm": NaN, + "learning_rate": 0.00016250353790058032, + "loss": 0.0, + "step": 31496 + }, + { + "epoch": 2.9389754595502473, + "grad_norm": NaN, + "learning_rate": 0.00016249600042406517, + "loss": 0.0, + "step": 31497 + }, + { + "epoch": 2.9390687692451247, + "grad_norm": NaN, + "learning_rate": 0.00016248846291577637, + "loss": 0.0, + "step": 31498 + }, + { + "epoch": 2.9391620789400017, + "grad_norm": NaN, + "learning_rate": 0.00016248092537573318, + "loss": 0.0, + "step": 31499 + }, + { + "epoch": 2.939255388634879, + "grad_norm": NaN, + "learning_rate": 0.00016247338780395467, + "loss": 0.0, + "step": 31500 + }, + { + "epoch": 2.9393486983297565, + "grad_norm": NaN, + "learning_rate": 0.00016246585020046, + "loss": 0.0, + "step": 31501 + }, + { + "epoch": 2.939442008024634, + "grad_norm": NaN, + "learning_rate": 0.00016245831256526844, + "loss": 0.0, + "step": 31502 + }, + { + "epoch": 2.939535317719511, + "grad_norm": NaN, + "learning_rate": 0.000162450774898399, + "loss": 0.0, + "step": 31503 + }, + { + "epoch": 2.9396286274143884, + "grad_norm": NaN, + "learning_rate": 0.00016244323719987098, + "loss": 0.0, + "step": 31504 + }, + { + "epoch": 2.9397219371092658, + "grad_norm": NaN, + "learning_rate": 0.00016243569946970348, + "loss": 0.0, + "step": 31505 + }, + { + "epoch": 2.9398152468041427, + "grad_norm": NaN, + "learning_rate": 0.00016242816170791566, + "loss": 0.0, + "step": 31506 + }, + { + "epoch": 2.93990855649902, + "grad_norm": NaN, + "learning_rate": 0.00016242062391452668, + "loss": 0.0, + "step": 31507 + }, + { + "epoch": 2.9400018661938976, + "grad_norm": NaN, + "learning_rate": 0.0001624130860895558, + "loss": 0.0, + "step": 31508 + }, + { + "epoch": 2.940095175888775, + "grad_norm": NaN, + "learning_rate": 0.0001624055482330221, + "loss": 0.0, + "step": 31509 + }, + { + "epoch": 2.940188485583652, + "grad_norm": NaN, + "learning_rate": 0.00016239801034494475, + "loss": 0.0, + "step": 31510 + }, + { + "epoch": 2.9402817952785294, + "grad_norm": NaN, + "learning_rate": 0.00016239047242534293, + "loss": 0.0, + "step": 31511 + }, + { + "epoch": 2.940375104973407, + "grad_norm": NaN, + "learning_rate": 0.00016238293447423582, + "loss": 0.0, + "step": 31512 + }, + { + "epoch": 2.940468414668284, + "grad_norm": NaN, + "learning_rate": 0.00016237539649164258, + "loss": 0.0, + "step": 31513 + }, + { + "epoch": 2.940561724363161, + "grad_norm": NaN, + "learning_rate": 0.00016236785847758236, + "loss": 0.0, + "step": 31514 + }, + { + "epoch": 2.9406550340580386, + "grad_norm": NaN, + "learning_rate": 0.00016236032043207433, + "loss": 0.0, + "step": 31515 + }, + { + "epoch": 2.940748343752916, + "grad_norm": NaN, + "learning_rate": 0.00016235278235513767, + "loss": 0.0, + "step": 31516 + }, + { + "epoch": 2.9408416534477935, + "grad_norm": NaN, + "learning_rate": 0.00016234524424679154, + "loss": 0.0, + "step": 31517 + }, + { + "epoch": 2.9409349631426704, + "grad_norm": NaN, + "learning_rate": 0.00016233770610705518, + "loss": 0.0, + "step": 31518 + }, + { + "epoch": 2.941028272837548, + "grad_norm": NaN, + "learning_rate": 0.00016233016793594762, + "loss": 0.0, + "step": 31519 + }, + { + "epoch": 2.9411215825324253, + "grad_norm": NaN, + "learning_rate": 0.00016232262973348807, + "loss": 0.0, + "step": 31520 + }, + { + "epoch": 2.9412148922273023, + "grad_norm": NaN, + "learning_rate": 0.0001623150914996958, + "loss": 0.0, + "step": 31521 + }, + { + "epoch": 2.9413082019221797, + "grad_norm": NaN, + "learning_rate": 0.00016230755323458986, + "loss": 0.0, + "step": 31522 + }, + { + "epoch": 2.941401511617057, + "grad_norm": NaN, + "learning_rate": 0.00016230001493818943, + "loss": 0.0, + "step": 31523 + }, + { + "epoch": 2.9414948213119345, + "grad_norm": NaN, + "learning_rate": 0.00016229247661051378, + "loss": 0.0, + "step": 31524 + }, + { + "epoch": 2.9415881310068115, + "grad_norm": NaN, + "learning_rate": 0.00016228493825158193, + "loss": 0.0, + "step": 31525 + }, + { + "epoch": 2.941681440701689, + "grad_norm": NaN, + "learning_rate": 0.00016227739986141316, + "loss": 0.0, + "step": 31526 + }, + { + "epoch": 2.9417747503965663, + "grad_norm": NaN, + "learning_rate": 0.00016226986144002656, + "loss": 0.0, + "step": 31527 + }, + { + "epoch": 2.9418680600914433, + "grad_norm": NaN, + "learning_rate": 0.00016226232298744138, + "loss": 0.0, + "step": 31528 + }, + { + "epoch": 2.9419613697863207, + "grad_norm": NaN, + "learning_rate": 0.00016225478450367675, + "loss": 0.0, + "step": 31529 + }, + { + "epoch": 2.942054679481198, + "grad_norm": NaN, + "learning_rate": 0.0001622472459887518, + "loss": 0.0, + "step": 31530 + }, + { + "epoch": 2.9421479891760756, + "grad_norm": NaN, + "learning_rate": 0.00016223970744268573, + "loss": 0.0, + "step": 31531 + }, + { + "epoch": 2.9422412988709525, + "grad_norm": NaN, + "learning_rate": 0.00016223216886549773, + "loss": 0.0, + "step": 31532 + }, + { + "epoch": 2.94233460856583, + "grad_norm": NaN, + "learning_rate": 0.00016222463025720695, + "loss": 0.0, + "step": 31533 + }, + { + "epoch": 2.9424279182607074, + "grad_norm": NaN, + "learning_rate": 0.00016221709161783255, + "loss": 0.0, + "step": 31534 + }, + { + "epoch": 2.9425212279555844, + "grad_norm": NaN, + "learning_rate": 0.0001622095529473937, + "loss": 0.0, + "step": 31535 + }, + { + "epoch": 2.9426145376504618, + "grad_norm": NaN, + "learning_rate": 0.00016220201424590954, + "loss": 0.0, + "step": 31536 + }, + { + "epoch": 2.942707847345339, + "grad_norm": NaN, + "learning_rate": 0.00016219447551339935, + "loss": 0.0, + "step": 31537 + }, + { + "epoch": 2.9428011570402166, + "grad_norm": NaN, + "learning_rate": 0.00016218693674988218, + "loss": 0.0, + "step": 31538 + }, + { + "epoch": 2.942894466735094, + "grad_norm": NaN, + "learning_rate": 0.0001621793979553772, + "loss": 0.0, + "step": 31539 + }, + { + "epoch": 2.942987776429971, + "grad_norm": NaN, + "learning_rate": 0.0001621718591299037, + "loss": 0.0, + "step": 31540 + }, + { + "epoch": 2.9430810861248484, + "grad_norm": NaN, + "learning_rate": 0.00016216432027348074, + "loss": 0.0, + "step": 31541 + }, + { + "epoch": 2.9431743958197254, + "grad_norm": NaN, + "learning_rate": 0.00016215678138612746, + "loss": 0.0, + "step": 31542 + }, + { + "epoch": 2.943267705514603, + "grad_norm": NaN, + "learning_rate": 0.00016214924246786317, + "loss": 0.0, + "step": 31543 + }, + { + "epoch": 2.9433610152094802, + "grad_norm": NaN, + "learning_rate": 0.00016214170351870692, + "loss": 0.0, + "step": 31544 + }, + { + "epoch": 2.9434543249043577, + "grad_norm": NaN, + "learning_rate": 0.00016213416453867785, + "loss": 0.0, + "step": 31545 + }, + { + "epoch": 2.943547634599235, + "grad_norm": NaN, + "learning_rate": 0.00016212662552779532, + "loss": 0.0, + "step": 31546 + }, + { + "epoch": 2.943640944294112, + "grad_norm": NaN, + "learning_rate": 0.00016211908648607832, + "loss": 0.0, + "step": 31547 + }, + { + "epoch": 2.9437342539889895, + "grad_norm": NaN, + "learning_rate": 0.00016211154741354605, + "loss": 0.0, + "step": 31548 + }, + { + "epoch": 2.943827563683867, + "grad_norm": NaN, + "learning_rate": 0.00016210400831021769, + "loss": 0.0, + "step": 31549 + }, + { + "epoch": 2.943920873378744, + "grad_norm": NaN, + "learning_rate": 0.00016209646917611246, + "loss": 0.0, + "step": 31550 + }, + { + "epoch": 2.9440141830736213, + "grad_norm": NaN, + "learning_rate": 0.00016208893001124947, + "loss": 0.0, + "step": 31551 + }, + { + "epoch": 2.9441074927684987, + "grad_norm": NaN, + "learning_rate": 0.0001620813908156479, + "loss": 0.0, + "step": 31552 + }, + { + "epoch": 2.944200802463376, + "grad_norm": NaN, + "learning_rate": 0.00016207385158932696, + "loss": 0.0, + "step": 31553 + }, + { + "epoch": 2.944294112158253, + "grad_norm": NaN, + "learning_rate": 0.0001620663123323058, + "loss": 0.0, + "step": 31554 + }, + { + "epoch": 2.9443874218531305, + "grad_norm": NaN, + "learning_rate": 0.00016205877304460354, + "loss": 0.0, + "step": 31555 + }, + { + "epoch": 2.944480731548008, + "grad_norm": NaN, + "learning_rate": 0.00016205123372623942, + "loss": 0.0, + "step": 31556 + }, + { + "epoch": 2.944574041242885, + "grad_norm": NaN, + "learning_rate": 0.00016204369437723259, + "loss": 0.0, + "step": 31557 + }, + { + "epoch": 2.9446673509377623, + "grad_norm": NaN, + "learning_rate": 0.0001620361549976022, + "loss": 0.0, + "step": 31558 + }, + { + "epoch": 2.9447606606326397, + "grad_norm": NaN, + "learning_rate": 0.00016202861558736747, + "loss": 0.0, + "step": 31559 + }, + { + "epoch": 2.944853970327517, + "grad_norm": NaN, + "learning_rate": 0.00016202107614654747, + "loss": 0.0, + "step": 31560 + }, + { + "epoch": 2.9449472800223946, + "grad_norm": NaN, + "learning_rate": 0.00016201353667516145, + "loss": 0.0, + "step": 31561 + }, + { + "epoch": 2.9450405897172716, + "grad_norm": NaN, + "learning_rate": 0.00016200599717322862, + "loss": 0.0, + "step": 31562 + }, + { + "epoch": 2.945133899412149, + "grad_norm": NaN, + "learning_rate": 0.00016199845764076803, + "loss": 0.0, + "step": 31563 + }, + { + "epoch": 2.945227209107026, + "grad_norm": NaN, + "learning_rate": 0.0001619909180777989, + "loss": 0.0, + "step": 31564 + }, + { + "epoch": 2.9453205188019034, + "grad_norm": NaN, + "learning_rate": 0.00016198337848434053, + "loss": 0.0, + "step": 31565 + }, + { + "epoch": 2.945413828496781, + "grad_norm": NaN, + "learning_rate": 0.0001619758388604119, + "loss": 0.0, + "step": 31566 + }, + { + "epoch": 2.945507138191658, + "grad_norm": NaN, + "learning_rate": 0.0001619682992060322, + "loss": 0.0, + "step": 31567 + }, + { + "epoch": 2.9456004478865356, + "grad_norm": NaN, + "learning_rate": 0.0001619607595212208, + "loss": 0.0, + "step": 31568 + }, + { + "epoch": 2.9456937575814126, + "grad_norm": NaN, + "learning_rate": 0.00016195321980599662, + "loss": 0.0, + "step": 31569 + }, + { + "epoch": 2.94578706727629, + "grad_norm": NaN, + "learning_rate": 0.000161945680060379, + "loss": 0.0, + "step": 31570 + }, + { + "epoch": 2.9458803769711674, + "grad_norm": NaN, + "learning_rate": 0.000161938140284387, + "loss": 0.0, + "step": 31571 + }, + { + "epoch": 2.9459736866660444, + "grad_norm": NaN, + "learning_rate": 0.00016193060047803992, + "loss": 0.0, + "step": 31572 + }, + { + "epoch": 2.946066996360922, + "grad_norm": NaN, + "learning_rate": 0.00016192306064135683, + "loss": 0.0, + "step": 31573 + }, + { + "epoch": 2.9461603060557993, + "grad_norm": NaN, + "learning_rate": 0.00016191552077435688, + "loss": 0.0, + "step": 31574 + }, + { + "epoch": 2.9462536157506767, + "grad_norm": NaN, + "learning_rate": 0.00016190798087705935, + "loss": 0.0, + "step": 31575 + }, + { + "epoch": 2.9463469254455537, + "grad_norm": NaN, + "learning_rate": 0.00016190044094948334, + "loss": 0.0, + "step": 31576 + }, + { + "epoch": 2.946440235140431, + "grad_norm": NaN, + "learning_rate": 0.00016189290099164798, + "loss": 0.0, + "step": 31577 + }, + { + "epoch": 2.9465335448353085, + "grad_norm": NaN, + "learning_rate": 0.0001618853610035726, + "loss": 0.0, + "step": 31578 + }, + { + "epoch": 2.9466268545301855, + "grad_norm": NaN, + "learning_rate": 0.00016187782098527618, + "loss": 0.0, + "step": 31579 + }, + { + "epoch": 2.946720164225063, + "grad_norm": NaN, + "learning_rate": 0.00016187028093677796, + "loss": 0.0, + "step": 31580 + }, + { + "epoch": 2.9468134739199403, + "grad_norm": NaN, + "learning_rate": 0.00016186274085809723, + "loss": 0.0, + "step": 31581 + }, + { + "epoch": 2.9469067836148177, + "grad_norm": NaN, + "learning_rate": 0.000161855200749253, + "loss": 0.0, + "step": 31582 + }, + { + "epoch": 2.947000093309695, + "grad_norm": NaN, + "learning_rate": 0.00016184766061026447, + "loss": 0.0, + "step": 31583 + }, + { + "epoch": 2.947093403004572, + "grad_norm": NaN, + "learning_rate": 0.00016184012044115093, + "loss": 0.0, + "step": 31584 + }, + { + "epoch": 2.9471867126994495, + "grad_norm": NaN, + "learning_rate": 0.00016183258024193145, + "loss": 0.0, + "step": 31585 + }, + { + "epoch": 2.9472800223943265, + "grad_norm": NaN, + "learning_rate": 0.00016182504001262517, + "loss": 0.0, + "step": 31586 + }, + { + "epoch": 2.947373332089204, + "grad_norm": NaN, + "learning_rate": 0.00016181749975325137, + "loss": 0.0, + "step": 31587 + }, + { + "epoch": 2.9474666417840814, + "grad_norm": NaN, + "learning_rate": 0.0001618099594638292, + "loss": 0.0, + "step": 31588 + }, + { + "epoch": 2.9475599514789588, + "grad_norm": NaN, + "learning_rate": 0.00016180241914437771, + "loss": 0.0, + "step": 31589 + }, + { + "epoch": 2.947653261173836, + "grad_norm": NaN, + "learning_rate": 0.00016179487879491622, + "loss": 0.0, + "step": 31590 + }, + { + "epoch": 2.947746570868713, + "grad_norm": NaN, + "learning_rate": 0.00016178733841546385, + "loss": 0.0, + "step": 31591 + }, + { + "epoch": 2.9478398805635906, + "grad_norm": NaN, + "learning_rate": 0.00016177979800603977, + "loss": 0.0, + "step": 31592 + }, + { + "epoch": 2.947933190258468, + "grad_norm": NaN, + "learning_rate": 0.0001617722575666631, + "loss": 0.0, + "step": 31593 + }, + { + "epoch": 2.948026499953345, + "grad_norm": NaN, + "learning_rate": 0.00016176471709735316, + "loss": 0.0, + "step": 31594 + }, + { + "epoch": 2.9481198096482224, + "grad_norm": NaN, + "learning_rate": 0.00016175717659812895, + "loss": 0.0, + "step": 31595 + }, + { + "epoch": 2.9482131193431, + "grad_norm": NaN, + "learning_rate": 0.00016174963606900976, + "loss": 0.0, + "step": 31596 + }, + { + "epoch": 2.9483064290379772, + "grad_norm": NaN, + "learning_rate": 0.00016174209551001473, + "loss": 0.0, + "step": 31597 + }, + { + "epoch": 2.948399738732854, + "grad_norm": NaN, + "learning_rate": 0.000161734554921163, + "loss": 0.0, + "step": 31598 + }, + { + "epoch": 2.9484930484277316, + "grad_norm": NaN, + "learning_rate": 0.00016172701430247375, + "loss": 0.0, + "step": 31599 + }, + { + "epoch": 2.948586358122609, + "grad_norm": NaN, + "learning_rate": 0.00016171947365396626, + "loss": 0.0, + "step": 31600 + }, + { + "epoch": 2.948679667817486, + "grad_norm": NaN, + "learning_rate": 0.00016171193297565958, + "loss": 0.0, + "step": 31601 + }, + { + "epoch": 2.9487729775123634, + "grad_norm": NaN, + "learning_rate": 0.00016170439226757287, + "loss": 0.0, + "step": 31602 + }, + { + "epoch": 2.948866287207241, + "grad_norm": NaN, + "learning_rate": 0.00016169685152972544, + "loss": 0.0, + "step": 31603 + }, + { + "epoch": 2.9489595969021183, + "grad_norm": NaN, + "learning_rate": 0.00016168931076213634, + "loss": 0.0, + "step": 31604 + }, + { + "epoch": 2.9490529065969957, + "grad_norm": NaN, + "learning_rate": 0.00016168176996482478, + "loss": 0.0, + "step": 31605 + }, + { + "epoch": 2.9491462162918727, + "grad_norm": NaN, + "learning_rate": 0.00016167422913781, + "loss": 0.0, + "step": 31606 + }, + { + "epoch": 2.94923952598675, + "grad_norm": NaN, + "learning_rate": 0.00016166668828111104, + "loss": 0.0, + "step": 31607 + }, + { + "epoch": 2.949332835681627, + "grad_norm": NaN, + "learning_rate": 0.00016165914739474713, + "loss": 0.0, + "step": 31608 + }, + { + "epoch": 2.9494261453765045, + "grad_norm": NaN, + "learning_rate": 0.00016165160647873755, + "loss": 0.0, + "step": 31609 + }, + { + "epoch": 2.949519455071382, + "grad_norm": NaN, + "learning_rate": 0.00016164406553310134, + "loss": 0.0, + "step": 31610 + }, + { + "epoch": 2.9496127647662593, + "grad_norm": NaN, + "learning_rate": 0.0001616365245578577, + "loss": 0.0, + "step": 31611 + }, + { + "epoch": 2.9497060744611368, + "grad_norm": NaN, + "learning_rate": 0.00016162898355302584, + "loss": 0.0, + "step": 31612 + }, + { + "epoch": 2.9497993841560137, + "grad_norm": NaN, + "learning_rate": 0.00016162144251862496, + "loss": 0.0, + "step": 31613 + }, + { + "epoch": 2.949892693850891, + "grad_norm": NaN, + "learning_rate": 0.00016161390145467416, + "loss": 0.0, + "step": 31614 + }, + { + "epoch": 2.9499860035457686, + "grad_norm": NaN, + "learning_rate": 0.00016160636036119264, + "loss": 0.0, + "step": 31615 + }, + { + "epoch": 2.9500793132406455, + "grad_norm": NaN, + "learning_rate": 0.0001615988192381996, + "loss": 0.0, + "step": 31616 + }, + { + "epoch": 2.950172622935523, + "grad_norm": NaN, + "learning_rate": 0.0001615912780857142, + "loss": 0.0, + "step": 31617 + }, + { + "epoch": 2.9502659326304004, + "grad_norm": NaN, + "learning_rate": 0.00016158373690375557, + "loss": 0.0, + "step": 31618 + }, + { + "epoch": 2.950359242325278, + "grad_norm": NaN, + "learning_rate": 0.00016157619569234303, + "loss": 0.0, + "step": 31619 + }, + { + "epoch": 2.9504525520201548, + "grad_norm": NaN, + "learning_rate": 0.00016156865445149556, + "loss": 0.0, + "step": 31620 + }, + { + "epoch": 2.950545861715032, + "grad_norm": NaN, + "learning_rate": 0.0001615611131812324, + "loss": 0.0, + "step": 31621 + }, + { + "epoch": 2.9506391714099096, + "grad_norm": NaN, + "learning_rate": 0.00016155357188157288, + "loss": 0.0, + "step": 31622 + }, + { + "epoch": 2.9507324811047866, + "grad_norm": NaN, + "learning_rate": 0.00016154603055253597, + "loss": 0.0, + "step": 31623 + }, + { + "epoch": 2.950825790799664, + "grad_norm": NaN, + "learning_rate": 0.0001615384891941409, + "loss": 0.0, + "step": 31624 + }, + { + "epoch": 2.9509191004945414, + "grad_norm": NaN, + "learning_rate": 0.00016153094780640693, + "loss": 0.0, + "step": 31625 + }, + { + "epoch": 2.951012410189419, + "grad_norm": NaN, + "learning_rate": 0.00016152340638935314, + "loss": 0.0, + "step": 31626 + }, + { + "epoch": 2.951105719884296, + "grad_norm": NaN, + "learning_rate": 0.0001615158649429987, + "loss": 0.0, + "step": 31627 + }, + { + "epoch": 2.9511990295791732, + "grad_norm": NaN, + "learning_rate": 0.0001615083234673629, + "loss": 0.0, + "step": 31628 + }, + { + "epoch": 2.9512923392740507, + "grad_norm": NaN, + "learning_rate": 0.00016150078196246483, + "loss": 0.0, + "step": 31629 + }, + { + "epoch": 2.9513856489689276, + "grad_norm": NaN, + "learning_rate": 0.00016149324042832365, + "loss": 0.0, + "step": 31630 + }, + { + "epoch": 2.951478958663805, + "grad_norm": NaN, + "learning_rate": 0.00016148569886495858, + "loss": 0.0, + "step": 31631 + }, + { + "epoch": 2.9515722683586825, + "grad_norm": NaN, + "learning_rate": 0.00016147815727238883, + "loss": 0.0, + "step": 31632 + }, + { + "epoch": 2.95166557805356, + "grad_norm": NaN, + "learning_rate": 0.00016147061565063343, + "loss": 0.0, + "step": 31633 + }, + { + "epoch": 2.9517588877484373, + "grad_norm": NaN, + "learning_rate": 0.0001614630739997117, + "loss": 0.0, + "step": 31634 + }, + { + "epoch": 2.9518521974433143, + "grad_norm": NaN, + "learning_rate": 0.00016145553231964278, + "loss": 0.0, + "step": 31635 + }, + { + "epoch": 2.9519455071381917, + "grad_norm": NaN, + "learning_rate": 0.0001614479906104458, + "loss": 0.0, + "step": 31636 + }, + { + "epoch": 2.9520388168330687, + "grad_norm": NaN, + "learning_rate": 0.00016144044887214, + "loss": 0.0, + "step": 31637 + }, + { + "epoch": 2.952132126527946, + "grad_norm": NaN, + "learning_rate": 0.00016143290710474454, + "loss": 0.0, + "step": 31638 + }, + { + "epoch": 2.9522254362228235, + "grad_norm": NaN, + "learning_rate": 0.0001614253653082786, + "loss": 0.0, + "step": 31639 + }, + { + "epoch": 2.952318745917701, + "grad_norm": NaN, + "learning_rate": 0.00016141782348276125, + "loss": 0.0, + "step": 31640 + }, + { + "epoch": 2.9524120556125784, + "grad_norm": NaN, + "learning_rate": 0.00016141028162821184, + "loss": 0.0, + "step": 31641 + }, + { + "epoch": 2.9525053653074553, + "grad_norm": NaN, + "learning_rate": 0.00016140273974464946, + "loss": 0.0, + "step": 31642 + }, + { + "epoch": 2.9525986750023328, + "grad_norm": NaN, + "learning_rate": 0.00016139519783209323, + "loss": 0.0, + "step": 31643 + }, + { + "epoch": 2.95269198469721, + "grad_norm": NaN, + "learning_rate": 0.00016138765589056243, + "loss": 0.0, + "step": 31644 + }, + { + "epoch": 2.952785294392087, + "grad_norm": NaN, + "learning_rate": 0.0001613801139200762, + "loss": 0.0, + "step": 31645 + }, + { + "epoch": 2.9528786040869646, + "grad_norm": NaN, + "learning_rate": 0.00016137257192065367, + "loss": 0.0, + "step": 31646 + }, + { + "epoch": 2.952971913781842, + "grad_norm": NaN, + "learning_rate": 0.00016136502989231409, + "loss": 0.0, + "step": 31647 + }, + { + "epoch": 2.9530652234767194, + "grad_norm": NaN, + "learning_rate": 0.00016135748783507664, + "loss": 0.0, + "step": 31648 + }, + { + "epoch": 2.9531585331715964, + "grad_norm": NaN, + "learning_rate": 0.0001613499457489604, + "loss": 0.0, + "step": 31649 + }, + { + "epoch": 2.953251842866474, + "grad_norm": NaN, + "learning_rate": 0.0001613424036339846, + "loss": 0.0, + "step": 31650 + }, + { + "epoch": 2.953345152561351, + "grad_norm": NaN, + "learning_rate": 0.00016133486149016852, + "loss": 0.0, + "step": 31651 + }, + { + "epoch": 2.953438462256228, + "grad_norm": NaN, + "learning_rate": 0.00016132731931753117, + "loss": 0.0, + "step": 31652 + }, + { + "epoch": 2.9535317719511056, + "grad_norm": NaN, + "learning_rate": 0.00016131977711609182, + "loss": 0.0, + "step": 31653 + }, + { + "epoch": 2.953625081645983, + "grad_norm": NaN, + "learning_rate": 0.00016131223488586967, + "loss": 0.0, + "step": 31654 + }, + { + "epoch": 2.9537183913408604, + "grad_norm": NaN, + "learning_rate": 0.00016130469262688377, + "loss": 0.0, + "step": 31655 + }, + { + "epoch": 2.953811701035738, + "grad_norm": NaN, + "learning_rate": 0.00016129715033915345, + "loss": 0.0, + "step": 31656 + }, + { + "epoch": 2.953905010730615, + "grad_norm": NaN, + "learning_rate": 0.00016128960802269782, + "loss": 0.0, + "step": 31657 + }, + { + "epoch": 2.9539983204254923, + "grad_norm": NaN, + "learning_rate": 0.00016128206567753603, + "loss": 0.0, + "step": 31658 + }, + { + "epoch": 2.9540916301203692, + "grad_norm": NaN, + "learning_rate": 0.00016127452330368732, + "loss": 0.0, + "step": 31659 + }, + { + "epoch": 2.9541849398152467, + "grad_norm": NaN, + "learning_rate": 0.00016126698090117083, + "loss": 0.0, + "step": 31660 + }, + { + "epoch": 2.954278249510124, + "grad_norm": NaN, + "learning_rate": 0.00016125943847000574, + "loss": 0.0, + "step": 31661 + }, + { + "epoch": 2.9543715592050015, + "grad_norm": NaN, + "learning_rate": 0.0001612518960102112, + "loss": 0.0, + "step": 31662 + }, + { + "epoch": 2.954464868899879, + "grad_norm": NaN, + "learning_rate": 0.0001612443535218065, + "loss": 0.0, + "step": 31663 + }, + { + "epoch": 2.954558178594756, + "grad_norm": NaN, + "learning_rate": 0.00016123681100481068, + "loss": 0.0, + "step": 31664 + }, + { + "epoch": 2.9546514882896333, + "grad_norm": NaN, + "learning_rate": 0.00016122926845924297, + "loss": 0.0, + "step": 31665 + }, + { + "epoch": 2.9547447979845107, + "grad_norm": NaN, + "learning_rate": 0.0001612217258851226, + "loss": 0.0, + "step": 31666 + }, + { + "epoch": 2.9548381076793877, + "grad_norm": NaN, + "learning_rate": 0.0001612141832824687, + "loss": 0.0, + "step": 31667 + }, + { + "epoch": 2.954931417374265, + "grad_norm": NaN, + "learning_rate": 0.0001612066406513004, + "loss": 0.0, + "step": 31668 + }, + { + "epoch": 2.9550247270691425, + "grad_norm": NaN, + "learning_rate": 0.000161199097991637, + "loss": 0.0, + "step": 31669 + }, + { + "epoch": 2.95511803676402, + "grad_norm": NaN, + "learning_rate": 0.0001611915553034976, + "loss": 0.0, + "step": 31670 + }, + { + "epoch": 2.955211346458897, + "grad_norm": NaN, + "learning_rate": 0.00016118401258690134, + "loss": 0.0, + "step": 31671 + }, + { + "epoch": 2.9553046561537744, + "grad_norm": NaN, + "learning_rate": 0.00016117646984186747, + "loss": 0.0, + "step": 31672 + }, + { + "epoch": 2.9553979658486518, + "grad_norm": NaN, + "learning_rate": 0.0001611689270684152, + "loss": 0.0, + "step": 31673 + }, + { + "epoch": 2.9554912755435288, + "grad_norm": NaN, + "learning_rate": 0.00016116138426656355, + "loss": 0.0, + "step": 31674 + }, + { + "epoch": 2.955584585238406, + "grad_norm": NaN, + "learning_rate": 0.00016115384143633189, + "loss": 0.0, + "step": 31675 + }, + { + "epoch": 2.9556778949332836, + "grad_norm": NaN, + "learning_rate": 0.00016114629857773933, + "loss": 0.0, + "step": 31676 + }, + { + "epoch": 2.955771204628161, + "grad_norm": NaN, + "learning_rate": 0.00016113875569080498, + "loss": 0.0, + "step": 31677 + }, + { + "epoch": 2.9558645143230384, + "grad_norm": NaN, + "learning_rate": 0.0001611312127755481, + "loss": 0.0, + "step": 31678 + }, + { + "epoch": 2.9559578240179154, + "grad_norm": NaN, + "learning_rate": 0.00016112366983198786, + "loss": 0.0, + "step": 31679 + }, + { + "epoch": 2.956051133712793, + "grad_norm": NaN, + "learning_rate": 0.0001611161268601434, + "loss": 0.0, + "step": 31680 + }, + { + "epoch": 2.95614444340767, + "grad_norm": NaN, + "learning_rate": 0.0001611085838600339, + "loss": 0.0, + "step": 31681 + }, + { + "epoch": 2.956237753102547, + "grad_norm": NaN, + "learning_rate": 0.00016110104083167862, + "loss": 0.0, + "step": 31682 + }, + { + "epoch": 2.9563310627974246, + "grad_norm": NaN, + "learning_rate": 0.00016109349777509665, + "loss": 0.0, + "step": 31683 + }, + { + "epoch": 2.956424372492302, + "grad_norm": NaN, + "learning_rate": 0.00016108595469030716, + "loss": 0.0, + "step": 31684 + }, + { + "epoch": 2.9565176821871795, + "grad_norm": NaN, + "learning_rate": 0.0001610784115773294, + "loss": 0.0, + "step": 31685 + }, + { + "epoch": 2.9566109918820564, + "grad_norm": NaN, + "learning_rate": 0.00016107086843618256, + "loss": 0.0, + "step": 31686 + }, + { + "epoch": 2.956704301576934, + "grad_norm": NaN, + "learning_rate": 0.00016106332526688574, + "loss": 0.0, + "step": 31687 + }, + { + "epoch": 2.9567976112718113, + "grad_norm": NaN, + "learning_rate": 0.00016105578206945817, + "loss": 0.0, + "step": 31688 + }, + { + "epoch": 2.9568909209666883, + "grad_norm": NaN, + "learning_rate": 0.00016104823884391907, + "loss": 0.0, + "step": 31689 + }, + { + "epoch": 2.9569842306615657, + "grad_norm": NaN, + "learning_rate": 0.00016104069559028748, + "loss": 0.0, + "step": 31690 + }, + { + "epoch": 2.957077540356443, + "grad_norm": NaN, + "learning_rate": 0.0001610331523085827, + "loss": 0.0, + "step": 31691 + }, + { + "epoch": 2.9571708500513205, + "grad_norm": NaN, + "learning_rate": 0.00016102560899882394, + "loss": 0.0, + "step": 31692 + }, + { + "epoch": 2.9572641597461975, + "grad_norm": NaN, + "learning_rate": 0.00016101806566103024, + "loss": 0.0, + "step": 31693 + }, + { + "epoch": 2.957357469441075, + "grad_norm": NaN, + "learning_rate": 0.0001610105222952209, + "loss": 0.0, + "step": 31694 + }, + { + "epoch": 2.9574507791359523, + "grad_norm": NaN, + "learning_rate": 0.0001610029789014151, + "loss": 0.0, + "step": 31695 + }, + { + "epoch": 2.9575440888308293, + "grad_norm": NaN, + "learning_rate": 0.00016099543547963194, + "loss": 0.0, + "step": 31696 + }, + { + "epoch": 2.9576373985257067, + "grad_norm": NaN, + "learning_rate": 0.00016098789202989065, + "loss": 0.0, + "step": 31697 + }, + { + "epoch": 2.957730708220584, + "grad_norm": NaN, + "learning_rate": 0.00016098034855221044, + "loss": 0.0, + "step": 31698 + }, + { + "epoch": 2.9578240179154616, + "grad_norm": NaN, + "learning_rate": 0.0001609728050466104, + "loss": 0.0, + "step": 31699 + }, + { + "epoch": 2.957917327610339, + "grad_norm": NaN, + "learning_rate": 0.00016096526151310978, + "loss": 0.0, + "step": 31700 + }, + { + "epoch": 2.958010637305216, + "grad_norm": NaN, + "learning_rate": 0.00016095771795172778, + "loss": 0.0, + "step": 31701 + }, + { + "epoch": 2.9581039470000934, + "grad_norm": NaN, + "learning_rate": 0.00016095017436248353, + "loss": 0.0, + "step": 31702 + }, + { + "epoch": 2.9581972566949704, + "grad_norm": NaN, + "learning_rate": 0.00016094263074539625, + "loss": 0.0, + "step": 31703 + }, + { + "epoch": 2.9582905663898478, + "grad_norm": NaN, + "learning_rate": 0.00016093508710048504, + "loss": 0.0, + "step": 31704 + }, + { + "epoch": 2.958383876084725, + "grad_norm": NaN, + "learning_rate": 0.00016092754342776925, + "loss": 0.0, + "step": 31705 + }, + { + "epoch": 2.9584771857796026, + "grad_norm": NaN, + "learning_rate": 0.00016091999972726786, + "loss": 0.0, + "step": 31706 + }, + { + "epoch": 2.95857049547448, + "grad_norm": NaN, + "learning_rate": 0.00016091245599900015, + "loss": 0.0, + "step": 31707 + }, + { + "epoch": 2.958663805169357, + "grad_norm": NaN, + "learning_rate": 0.00016090491224298535, + "loss": 0.0, + "step": 31708 + }, + { + "epoch": 2.9587571148642344, + "grad_norm": NaN, + "learning_rate": 0.00016089736845924254, + "loss": 0.0, + "step": 31709 + }, + { + "epoch": 2.958850424559112, + "grad_norm": NaN, + "learning_rate": 0.00016088982464779097, + "loss": 0.0, + "step": 31710 + }, + { + "epoch": 2.958943734253989, + "grad_norm": NaN, + "learning_rate": 0.00016088228080864982, + "loss": 0.0, + "step": 31711 + }, + { + "epoch": 2.9590370439488662, + "grad_norm": NaN, + "learning_rate": 0.00016087473694183823, + "loss": 0.0, + "step": 31712 + }, + { + "epoch": 2.9591303536437437, + "grad_norm": NaN, + "learning_rate": 0.0001608671930473754, + "loss": 0.0, + "step": 31713 + }, + { + "epoch": 2.959223663338621, + "grad_norm": NaN, + "learning_rate": 0.00016085964912528059, + "loss": 0.0, + "step": 31714 + }, + { + "epoch": 2.959316973033498, + "grad_norm": NaN, + "learning_rate": 0.0001608521051755728, + "loss": 0.0, + "step": 31715 + }, + { + "epoch": 2.9594102827283755, + "grad_norm": NaN, + "learning_rate": 0.00016084456119827136, + "loss": 0.0, + "step": 31716 + }, + { + "epoch": 2.959503592423253, + "grad_norm": NaN, + "learning_rate": 0.00016083701719339543, + "loss": 0.0, + "step": 31717 + }, + { + "epoch": 2.95959690211813, + "grad_norm": NaN, + "learning_rate": 0.00016082947316096416, + "loss": 0.0, + "step": 31718 + }, + { + "epoch": 2.9596902118130073, + "grad_norm": NaN, + "learning_rate": 0.00016082192910099675, + "loss": 0.0, + "step": 31719 + }, + { + "epoch": 2.9597835215078847, + "grad_norm": NaN, + "learning_rate": 0.00016081438501351243, + "loss": 0.0, + "step": 31720 + }, + { + "epoch": 2.959876831202762, + "grad_norm": NaN, + "learning_rate": 0.00016080684089853024, + "loss": 0.0, + "step": 31721 + }, + { + "epoch": 2.959970140897639, + "grad_norm": NaN, + "learning_rate": 0.0001607992967560695, + "loss": 0.0, + "step": 31722 + }, + { + "epoch": 2.9600634505925165, + "grad_norm": NaN, + "learning_rate": 0.00016079175258614938, + "loss": 0.0, + "step": 31723 + }, + { + "epoch": 2.960156760287394, + "grad_norm": NaN, + "learning_rate": 0.00016078420838878897, + "loss": 0.0, + "step": 31724 + }, + { + "epoch": 2.960250069982271, + "grad_norm": NaN, + "learning_rate": 0.00016077666416400757, + "loss": 0.0, + "step": 31725 + }, + { + "epoch": 2.9603433796771483, + "grad_norm": NaN, + "learning_rate": 0.00016076911991182426, + "loss": 0.0, + "step": 31726 + }, + { + "epoch": 2.9604366893720258, + "grad_norm": NaN, + "learning_rate": 0.00016076157563225835, + "loss": 0.0, + "step": 31727 + }, + { + "epoch": 2.960529999066903, + "grad_norm": NaN, + "learning_rate": 0.00016075403132532882, + "loss": 0.0, + "step": 31728 + }, + { + "epoch": 2.9606233087617806, + "grad_norm": NaN, + "learning_rate": 0.00016074648699105503, + "loss": 0.0, + "step": 31729 + }, + { + "epoch": 2.9607166184566576, + "grad_norm": NaN, + "learning_rate": 0.00016073894262945616, + "loss": 0.0, + "step": 31730 + }, + { + "epoch": 2.960809928151535, + "grad_norm": NaN, + "learning_rate": 0.00016073139824055122, + "loss": 0.0, + "step": 31731 + }, + { + "epoch": 2.9609032378464124, + "grad_norm": NaN, + "learning_rate": 0.00016072385382435956, + "loss": 0.0, + "step": 31732 + }, + { + "epoch": 2.9609965475412894, + "grad_norm": NaN, + "learning_rate": 0.0001607163093809004, + "loss": 0.0, + "step": 31733 + }, + { + "epoch": 2.961089857236167, + "grad_norm": NaN, + "learning_rate": 0.00016070876491019274, + "loss": 0.0, + "step": 31734 + }, + { + "epoch": 2.961183166931044, + "grad_norm": NaN, + "learning_rate": 0.0001607012204122559, + "loss": 0.0, + "step": 31735 + }, + { + "epoch": 2.9612764766259216, + "grad_norm": NaN, + "learning_rate": 0.00016069367588710909, + "loss": 0.0, + "step": 31736 + }, + { + "epoch": 2.9613697863207986, + "grad_norm": NaN, + "learning_rate": 0.00016068613133477132, + "loss": 0.0, + "step": 31737 + }, + { + "epoch": 2.961463096015676, + "grad_norm": NaN, + "learning_rate": 0.0001606785867552619, + "loss": 0.0, + "step": 31738 + }, + { + "epoch": 2.9615564057105535, + "grad_norm": NaN, + "learning_rate": 0.00016067104214860005, + "loss": 0.0, + "step": 31739 + }, + { + "epoch": 2.9616497154054304, + "grad_norm": NaN, + "learning_rate": 0.0001606634975148049, + "loss": 0.0, + "step": 31740 + }, + { + "epoch": 2.961743025100308, + "grad_norm": NaN, + "learning_rate": 0.00016065595285389556, + "loss": 0.0, + "step": 31741 + }, + { + "epoch": 2.9618363347951853, + "grad_norm": NaN, + "learning_rate": 0.00016064840816589136, + "loss": 0.0, + "step": 31742 + }, + { + "epoch": 2.9619296444900627, + "grad_norm": NaN, + "learning_rate": 0.0001606408634508114, + "loss": 0.0, + "step": 31743 + }, + { + "epoch": 2.9620229541849397, + "grad_norm": NaN, + "learning_rate": 0.00016063331870867485, + "loss": 0.0, + "step": 31744 + }, + { + "epoch": 2.962116263879817, + "grad_norm": NaN, + "learning_rate": 0.00016062577393950096, + "loss": 0.0, + "step": 31745 + }, + { + "epoch": 2.9622095735746945, + "grad_norm": NaN, + "learning_rate": 0.00016061822914330886, + "loss": 0.0, + "step": 31746 + }, + { + "epoch": 2.9623028832695715, + "grad_norm": NaN, + "learning_rate": 0.0001606106843201177, + "loss": 0.0, + "step": 31747 + }, + { + "epoch": 2.962396192964449, + "grad_norm": NaN, + "learning_rate": 0.0001606031394699468, + "loss": 0.0, + "step": 31748 + }, + { + "epoch": 2.9624895026593263, + "grad_norm": NaN, + "learning_rate": 0.00016059559459281523, + "loss": 0.0, + "step": 31749 + }, + { + "epoch": 2.9625828123542037, + "grad_norm": NaN, + "learning_rate": 0.00016058804968874216, + "loss": 0.0, + "step": 31750 + }, + { + "epoch": 2.962676122049081, + "grad_norm": NaN, + "learning_rate": 0.00016058050475774681, + "loss": 0.0, + "step": 31751 + }, + { + "epoch": 2.962769431743958, + "grad_norm": NaN, + "learning_rate": 0.00016057295979984848, + "loss": 0.0, + "step": 31752 + }, + { + "epoch": 2.9628627414388355, + "grad_norm": NaN, + "learning_rate": 0.00016056541481506613, + "loss": 0.0, + "step": 31753 + }, + { + "epoch": 2.9629560511337125, + "grad_norm": NaN, + "learning_rate": 0.0001605578698034191, + "loss": 0.0, + "step": 31754 + }, + { + "epoch": 2.96304936082859, + "grad_norm": NaN, + "learning_rate": 0.00016055032476492654, + "loss": 0.0, + "step": 31755 + }, + { + "epoch": 2.9631426705234674, + "grad_norm": NaN, + "learning_rate": 0.0001605427796996076, + "loss": 0.0, + "step": 31756 + }, + { + "epoch": 2.963235980218345, + "grad_norm": NaN, + "learning_rate": 0.0001605352346074815, + "loss": 0.0, + "step": 31757 + }, + { + "epoch": 2.963329289913222, + "grad_norm": NaN, + "learning_rate": 0.00016052768948856744, + "loss": 0.0, + "step": 31758 + }, + { + "epoch": 2.963422599608099, + "grad_norm": NaN, + "learning_rate": 0.00016052014434288458, + "loss": 0.0, + "step": 31759 + }, + { + "epoch": 2.9635159093029766, + "grad_norm": NaN, + "learning_rate": 0.0001605125991704521, + "loss": 0.0, + "step": 31760 + }, + { + "epoch": 2.963609218997854, + "grad_norm": NaN, + "learning_rate": 0.0001605050539712892, + "loss": 0.0, + "step": 31761 + }, + { + "epoch": 2.963702528692731, + "grad_norm": NaN, + "learning_rate": 0.00016049750874541508, + "loss": 0.0, + "step": 31762 + }, + { + "epoch": 2.9637958383876084, + "grad_norm": NaN, + "learning_rate": 0.00016048996349284886, + "loss": 0.0, + "step": 31763 + }, + { + "epoch": 2.963889148082486, + "grad_norm": NaN, + "learning_rate": 0.00016048241821360978, + "loss": 0.0, + "step": 31764 + }, + { + "epoch": 2.9639824577773632, + "grad_norm": NaN, + "learning_rate": 0.00016047487290771708, + "loss": 0.0, + "step": 31765 + }, + { + "epoch": 2.96407576747224, + "grad_norm": NaN, + "learning_rate": 0.0001604673275751898, + "loss": 0.0, + "step": 31766 + }, + { + "epoch": 2.9641690771671176, + "grad_norm": NaN, + "learning_rate": 0.00016045978221604723, + "loss": 0.0, + "step": 31767 + }, + { + "epoch": 2.964262386861995, + "grad_norm": NaN, + "learning_rate": 0.00016045223683030854, + "loss": 0.0, + "step": 31768 + }, + { + "epoch": 2.964355696556872, + "grad_norm": NaN, + "learning_rate": 0.0001604446914179929, + "loss": 0.0, + "step": 31769 + }, + { + "epoch": 2.9644490062517495, + "grad_norm": NaN, + "learning_rate": 0.00016043714597911953, + "loss": 0.0, + "step": 31770 + }, + { + "epoch": 2.964542315946627, + "grad_norm": NaN, + "learning_rate": 0.00016042960051370763, + "loss": 0.0, + "step": 31771 + }, + { + "epoch": 2.9646356256415043, + "grad_norm": NaN, + "learning_rate": 0.00016042205502177624, + "loss": 0.0, + "step": 31772 + }, + { + "epoch": 2.9647289353363817, + "grad_norm": NaN, + "learning_rate": 0.0001604145095033447, + "loss": 0.0, + "step": 31773 + }, + { + "epoch": 2.9648222450312587, + "grad_norm": NaN, + "learning_rate": 0.00016040696395843218, + "loss": 0.0, + "step": 31774 + }, + { + "epoch": 2.964915554726136, + "grad_norm": NaN, + "learning_rate": 0.00016039941838705775, + "loss": 0.0, + "step": 31775 + }, + { + "epoch": 2.965008864421013, + "grad_norm": NaN, + "learning_rate": 0.00016039187278924072, + "loss": 0.0, + "step": 31776 + }, + { + "epoch": 2.9651021741158905, + "grad_norm": NaN, + "learning_rate": 0.00016038432716500027, + "loss": 0.0, + "step": 31777 + }, + { + "epoch": 2.965195483810768, + "grad_norm": NaN, + "learning_rate": 0.0001603767815143555, + "loss": 0.0, + "step": 31778 + }, + { + "epoch": 2.9652887935056453, + "grad_norm": NaN, + "learning_rate": 0.0001603692358373257, + "loss": 0.0, + "step": 31779 + }, + { + "epoch": 2.9653821032005228, + "grad_norm": NaN, + "learning_rate": 0.00016036169013393, + "loss": 0.0, + "step": 31780 + }, + { + "epoch": 2.9654754128953997, + "grad_norm": NaN, + "learning_rate": 0.00016035414440418756, + "loss": 0.0, + "step": 31781 + }, + { + "epoch": 2.965568722590277, + "grad_norm": NaN, + "learning_rate": 0.0001603465986481176, + "loss": 0.0, + "step": 31782 + }, + { + "epoch": 2.9656620322851546, + "grad_norm": NaN, + "learning_rate": 0.00016033905286573933, + "loss": 0.0, + "step": 31783 + }, + { + "epoch": 2.9657553419800315, + "grad_norm": NaN, + "learning_rate": 0.0001603315070570719, + "loss": 0.0, + "step": 31784 + }, + { + "epoch": 2.965848651674909, + "grad_norm": NaN, + "learning_rate": 0.0001603239612221345, + "loss": 0.0, + "step": 31785 + }, + { + "epoch": 2.9659419613697864, + "grad_norm": NaN, + "learning_rate": 0.00016031641536094632, + "loss": 0.0, + "step": 31786 + }, + { + "epoch": 2.966035271064664, + "grad_norm": NaN, + "learning_rate": 0.00016030886947352655, + "loss": 0.0, + "step": 31787 + }, + { + "epoch": 2.966128580759541, + "grad_norm": NaN, + "learning_rate": 0.0001603013235598944, + "loss": 0.0, + "step": 31788 + }, + { + "epoch": 2.966221890454418, + "grad_norm": NaN, + "learning_rate": 0.000160293777620069, + "loss": 0.0, + "step": 31789 + }, + { + "epoch": 2.9663152001492956, + "grad_norm": NaN, + "learning_rate": 0.00016028623165406961, + "loss": 0.0, + "step": 31790 + }, + { + "epoch": 2.9664085098441726, + "grad_norm": NaN, + "learning_rate": 0.00016027868566191537, + "loss": 0.0, + "step": 31791 + }, + { + "epoch": 2.96650181953905, + "grad_norm": NaN, + "learning_rate": 0.0001602711396436255, + "loss": 0.0, + "step": 31792 + }, + { + "epoch": 2.9665951292339274, + "grad_norm": NaN, + "learning_rate": 0.00016026359359921913, + "loss": 0.0, + "step": 31793 + }, + { + "epoch": 2.966688438928805, + "grad_norm": NaN, + "learning_rate": 0.00016025604752871545, + "loss": 0.0, + "step": 31794 + }, + { + "epoch": 2.9667817486236823, + "grad_norm": NaN, + "learning_rate": 0.00016024850143213376, + "loss": 0.0, + "step": 31795 + }, + { + "epoch": 2.9668750583185592, + "grad_norm": NaN, + "learning_rate": 0.0001602409553094931, + "loss": 0.0, + "step": 31796 + }, + { + "epoch": 2.9669683680134367, + "grad_norm": NaN, + "learning_rate": 0.00016023340916081278, + "loss": 0.0, + "step": 31797 + }, + { + "epoch": 2.9670616777083136, + "grad_norm": NaN, + "learning_rate": 0.00016022586298611188, + "loss": 0.0, + "step": 31798 + }, + { + "epoch": 2.967154987403191, + "grad_norm": NaN, + "learning_rate": 0.00016021831678540965, + "loss": 0.0, + "step": 31799 + }, + { + "epoch": 2.9672482970980685, + "grad_norm": NaN, + "learning_rate": 0.0001602107705587253, + "loss": 0.0, + "step": 31800 + }, + { + "epoch": 2.967341606792946, + "grad_norm": NaN, + "learning_rate": 0.00016020322430607795, + "loss": 0.0, + "step": 31801 + }, + { + "epoch": 2.9674349164878233, + "grad_norm": NaN, + "learning_rate": 0.00016019567802748685, + "loss": 0.0, + "step": 31802 + }, + { + "epoch": 2.9675282261827003, + "grad_norm": NaN, + "learning_rate": 0.00016018813172297115, + "loss": 0.0, + "step": 31803 + }, + { + "epoch": 2.9676215358775777, + "grad_norm": NaN, + "learning_rate": 0.00016018058539255003, + "loss": 0.0, + "step": 31804 + }, + { + "epoch": 2.967714845572455, + "grad_norm": NaN, + "learning_rate": 0.00016017303903624272, + "loss": 0.0, + "step": 31805 + }, + { + "epoch": 2.967808155267332, + "grad_norm": NaN, + "learning_rate": 0.00016016549265406837, + "loss": 0.0, + "step": 31806 + }, + { + "epoch": 2.9679014649622095, + "grad_norm": NaN, + "learning_rate": 0.0001601579462460462, + "loss": 0.0, + "step": 31807 + }, + { + "epoch": 2.967994774657087, + "grad_norm": NaN, + "learning_rate": 0.00016015039981219538, + "loss": 0.0, + "step": 31808 + }, + { + "epoch": 2.9680880843519644, + "grad_norm": NaN, + "learning_rate": 0.0001601428533525351, + "loss": 0.0, + "step": 31809 + }, + { + "epoch": 2.9681813940468413, + "grad_norm": NaN, + "learning_rate": 0.00016013530686708456, + "loss": 0.0, + "step": 31810 + }, + { + "epoch": 2.9682747037417188, + "grad_norm": NaN, + "learning_rate": 0.0001601277603558629, + "loss": 0.0, + "step": 31811 + }, + { + "epoch": 2.968368013436596, + "grad_norm": NaN, + "learning_rate": 0.00016012021381888936, + "loss": 0.0, + "step": 31812 + }, + { + "epoch": 2.968461323131473, + "grad_norm": NaN, + "learning_rate": 0.00016011266725618313, + "loss": 0.0, + "step": 31813 + }, + { + "epoch": 2.9685546328263506, + "grad_norm": NaN, + "learning_rate": 0.00016010512066776336, + "loss": 0.0, + "step": 31814 + }, + { + "epoch": 2.968647942521228, + "grad_norm": NaN, + "learning_rate": 0.00016009757405364926, + "loss": 0.0, + "step": 31815 + }, + { + "epoch": 2.9687412522161054, + "grad_norm": NaN, + "learning_rate": 0.00016009002741386005, + "loss": 0.0, + "step": 31816 + }, + { + "epoch": 2.968834561910983, + "grad_norm": NaN, + "learning_rate": 0.00016008248074841488, + "loss": 0.0, + "step": 31817 + }, + { + "epoch": 2.96892787160586, + "grad_norm": NaN, + "learning_rate": 0.00016007493405733296, + "loss": 0.0, + "step": 31818 + }, + { + "epoch": 2.9690211813007372, + "grad_norm": NaN, + "learning_rate": 0.00016006738734063343, + "loss": 0.0, + "step": 31819 + }, + { + "epoch": 2.969114490995614, + "grad_norm": NaN, + "learning_rate": 0.00016005984059833553, + "loss": 0.0, + "step": 31820 + }, + { + "epoch": 2.9692078006904916, + "grad_norm": NaN, + "learning_rate": 0.00016005229383045846, + "loss": 0.0, + "step": 31821 + }, + { + "epoch": 2.969301110385369, + "grad_norm": NaN, + "learning_rate": 0.00016004474703702135, + "loss": 0.0, + "step": 31822 + }, + { + "epoch": 2.9693944200802465, + "grad_norm": NaN, + "learning_rate": 0.00016003720021804342, + "loss": 0.0, + "step": 31823 + }, + { + "epoch": 2.969487729775124, + "grad_norm": NaN, + "learning_rate": 0.00016002965337354392, + "loss": 0.0, + "step": 31824 + }, + { + "epoch": 2.969581039470001, + "grad_norm": NaN, + "learning_rate": 0.00016002210650354191, + "loss": 0.0, + "step": 31825 + }, + { + "epoch": 2.9696743491648783, + "grad_norm": NaN, + "learning_rate": 0.00016001455960805672, + "loss": 0.0, + "step": 31826 + }, + { + "epoch": 2.9697676588597557, + "grad_norm": NaN, + "learning_rate": 0.00016000701268710743, + "loss": 0.0, + "step": 31827 + }, + { + "epoch": 2.9698609685546327, + "grad_norm": NaN, + "learning_rate": 0.00015999946574071327, + "loss": 0.0, + "step": 31828 + }, + { + "epoch": 2.96995427824951, + "grad_norm": NaN, + "learning_rate": 0.0001599919187688934, + "loss": 0.0, + "step": 31829 + }, + { + "epoch": 2.9700475879443875, + "grad_norm": NaN, + "learning_rate": 0.00015998437177166709, + "loss": 0.0, + "step": 31830 + }, + { + "epoch": 2.970140897639265, + "grad_norm": NaN, + "learning_rate": 0.0001599768247490535, + "loss": 0.0, + "step": 31831 + }, + { + "epoch": 2.970234207334142, + "grad_norm": NaN, + "learning_rate": 0.00015996927770107176, + "loss": 0.0, + "step": 31832 + }, + { + "epoch": 2.9703275170290193, + "grad_norm": NaN, + "learning_rate": 0.0001599617306277411, + "loss": 0.0, + "step": 31833 + }, + { + "epoch": 2.9704208267238967, + "grad_norm": NaN, + "learning_rate": 0.0001599541835290807, + "loss": 0.0, + "step": 31834 + }, + { + "epoch": 2.9705141364187737, + "grad_norm": NaN, + "learning_rate": 0.00015994663640510982, + "loss": 0.0, + "step": 31835 + }, + { + "epoch": 2.970607446113651, + "grad_norm": NaN, + "learning_rate": 0.00015993908925584753, + "loss": 0.0, + "step": 31836 + }, + { + "epoch": 2.9707007558085285, + "grad_norm": NaN, + "learning_rate": 0.0001599315420813131, + "loss": 0.0, + "step": 31837 + }, + { + "epoch": 2.970794065503406, + "grad_norm": NaN, + "learning_rate": 0.0001599239948815257, + "loss": 0.0, + "step": 31838 + }, + { + "epoch": 2.970887375198283, + "grad_norm": NaN, + "learning_rate": 0.00015991644765650452, + "loss": 0.0, + "step": 31839 + }, + { + "epoch": 2.9709806848931604, + "grad_norm": NaN, + "learning_rate": 0.00015990890040626875, + "loss": 0.0, + "step": 31840 + }, + { + "epoch": 2.971073994588038, + "grad_norm": NaN, + "learning_rate": 0.0001599013531308376, + "loss": 0.0, + "step": 31841 + }, + { + "epoch": 2.9711673042829148, + "grad_norm": NaN, + "learning_rate": 0.00015989380583023022, + "loss": 0.0, + "step": 31842 + }, + { + "epoch": 2.971260613977792, + "grad_norm": NaN, + "learning_rate": 0.00015988625850446583, + "loss": 0.0, + "step": 31843 + }, + { + "epoch": 2.9713539236726696, + "grad_norm": NaN, + "learning_rate": 0.00015987871115356363, + "loss": 0.0, + "step": 31844 + }, + { + "epoch": 2.971447233367547, + "grad_norm": NaN, + "learning_rate": 0.00015987116377754276, + "loss": 0.0, + "step": 31845 + }, + { + "epoch": 2.9715405430624244, + "grad_norm": NaN, + "learning_rate": 0.00015986361637642245, + "loss": 0.0, + "step": 31846 + }, + { + "epoch": 2.9716338527573014, + "grad_norm": NaN, + "learning_rate": 0.0001598560689502219, + "loss": 0.0, + "step": 31847 + }, + { + "epoch": 2.971727162452179, + "grad_norm": NaN, + "learning_rate": 0.00015984852149896026, + "loss": 0.0, + "step": 31848 + }, + { + "epoch": 2.971820472147056, + "grad_norm": NaN, + "learning_rate": 0.0001598409740226568, + "loss": 0.0, + "step": 31849 + }, + { + "epoch": 2.971913781841933, + "grad_norm": NaN, + "learning_rate": 0.0001598334265213306, + "loss": 0.0, + "step": 31850 + }, + { + "epoch": 2.9720070915368106, + "grad_norm": NaN, + "learning_rate": 0.00015982587899500097, + "loss": 0.0, + "step": 31851 + }, + { + "epoch": 2.972100401231688, + "grad_norm": NaN, + "learning_rate": 0.000159818331443687, + "loss": 0.0, + "step": 31852 + }, + { + "epoch": 2.9721937109265655, + "grad_norm": NaN, + "learning_rate": 0.00015981078386740792, + "loss": 0.0, + "step": 31853 + }, + { + "epoch": 2.9722870206214425, + "grad_norm": NaN, + "learning_rate": 0.00015980323626618292, + "loss": 0.0, + "step": 31854 + }, + { + "epoch": 2.97238033031632, + "grad_norm": NaN, + "learning_rate": 0.0001597956886400312, + "loss": 0.0, + "step": 31855 + }, + { + "epoch": 2.9724736400111973, + "grad_norm": NaN, + "learning_rate": 0.00015978814098897197, + "loss": 0.0, + "step": 31856 + }, + { + "epoch": 2.9725669497060743, + "grad_norm": NaN, + "learning_rate": 0.00015978059331302435, + "loss": 0.0, + "step": 31857 + }, + { + "epoch": 2.9726602594009517, + "grad_norm": NaN, + "learning_rate": 0.0001597730456122076, + "loss": 0.0, + "step": 31858 + }, + { + "epoch": 2.972753569095829, + "grad_norm": NaN, + "learning_rate": 0.00015976549788654091, + "loss": 0.0, + "step": 31859 + }, + { + "epoch": 2.9728468787907065, + "grad_norm": NaN, + "learning_rate": 0.00015975795013604345, + "loss": 0.0, + "step": 31860 + }, + { + "epoch": 2.9729401884855835, + "grad_norm": NaN, + "learning_rate": 0.00015975040236073436, + "loss": 0.0, + "step": 31861 + }, + { + "epoch": 2.973033498180461, + "grad_norm": NaN, + "learning_rate": 0.00015974285456063294, + "loss": 0.0, + "step": 31862 + }, + { + "epoch": 2.9731268078753383, + "grad_norm": NaN, + "learning_rate": 0.00015973530673575834, + "loss": 0.0, + "step": 31863 + }, + { + "epoch": 2.9732201175702153, + "grad_norm": NaN, + "learning_rate": 0.0001597277588861297, + "loss": 0.0, + "step": 31864 + }, + { + "epoch": 2.9733134272650927, + "grad_norm": NaN, + "learning_rate": 0.00015972021101176624, + "loss": 0.0, + "step": 31865 + }, + { + "epoch": 2.97340673695997, + "grad_norm": NaN, + "learning_rate": 0.0001597126631126872, + "loss": 0.0, + "step": 31866 + }, + { + "epoch": 2.9735000466548476, + "grad_norm": NaN, + "learning_rate": 0.00015970511518891172, + "loss": 0.0, + "step": 31867 + }, + { + "epoch": 2.973593356349725, + "grad_norm": NaN, + "learning_rate": 0.000159697567240459, + "loss": 0.0, + "step": 31868 + }, + { + "epoch": 2.973686666044602, + "grad_norm": NaN, + "learning_rate": 0.00015969001926734828, + "loss": 0.0, + "step": 31869 + }, + { + "epoch": 2.9737799757394794, + "grad_norm": NaN, + "learning_rate": 0.00015968247126959864, + "loss": 0.0, + "step": 31870 + }, + { + "epoch": 2.9738732854343564, + "grad_norm": NaN, + "learning_rate": 0.00015967492324722938, + "loss": 0.0, + "step": 31871 + }, + { + "epoch": 2.973966595129234, + "grad_norm": NaN, + "learning_rate": 0.00015966737520025967, + "loss": 0.0, + "step": 31872 + }, + { + "epoch": 2.974059904824111, + "grad_norm": NaN, + "learning_rate": 0.00015965982712870866, + "loss": 0.0, + "step": 31873 + }, + { + "epoch": 2.9741532145189886, + "grad_norm": NaN, + "learning_rate": 0.0001596522790325956, + "loss": 0.0, + "step": 31874 + }, + { + "epoch": 2.974246524213866, + "grad_norm": NaN, + "learning_rate": 0.00015964473091193963, + "loss": 0.0, + "step": 31875 + }, + { + "epoch": 2.974339833908743, + "grad_norm": NaN, + "learning_rate": 0.00015963718276676, + "loss": 0.0, + "step": 31876 + }, + { + "epoch": 2.9744331436036204, + "grad_norm": NaN, + "learning_rate": 0.00015962963459707584, + "loss": 0.0, + "step": 31877 + }, + { + "epoch": 2.974526453298498, + "grad_norm": NaN, + "learning_rate": 0.00015962208640290637, + "loss": 0.0, + "step": 31878 + }, + { + "epoch": 2.974619762993375, + "grad_norm": NaN, + "learning_rate": 0.00015961453818427076, + "loss": 0.0, + "step": 31879 + }, + { + "epoch": 2.9747130726882522, + "grad_norm": NaN, + "learning_rate": 0.00015960698994118828, + "loss": 0.0, + "step": 31880 + }, + { + "epoch": 2.9748063823831297, + "grad_norm": NaN, + "learning_rate": 0.00015959944167367806, + "loss": 0.0, + "step": 31881 + }, + { + "epoch": 2.974899692078007, + "grad_norm": NaN, + "learning_rate": 0.00015959189338175927, + "loss": 0.0, + "step": 31882 + }, + { + "epoch": 2.974993001772884, + "grad_norm": NaN, + "learning_rate": 0.00015958434506545117, + "loss": 0.0, + "step": 31883 + }, + { + "epoch": 2.9750863114677615, + "grad_norm": NaN, + "learning_rate": 0.00015957679672477294, + "loss": 0.0, + "step": 31884 + }, + { + "epoch": 2.975179621162639, + "grad_norm": NaN, + "learning_rate": 0.0001595692483597437, + "loss": 0.0, + "step": 31885 + }, + { + "epoch": 2.975272930857516, + "grad_norm": NaN, + "learning_rate": 0.00015956169997038272, + "loss": 0.0, + "step": 31886 + }, + { + "epoch": 2.9753662405523933, + "grad_norm": NaN, + "learning_rate": 0.00015955415155670914, + "loss": 0.0, + "step": 31887 + }, + { + "epoch": 2.9754595502472707, + "grad_norm": NaN, + "learning_rate": 0.00015954660311874222, + "loss": 0.0, + "step": 31888 + }, + { + "epoch": 2.975552859942148, + "grad_norm": NaN, + "learning_rate": 0.00015953905465650113, + "loss": 0.0, + "step": 31889 + }, + { + "epoch": 2.9756461696370256, + "grad_norm": NaN, + "learning_rate": 0.00015953150617000503, + "loss": 0.0, + "step": 31890 + }, + { + "epoch": 2.9757394793319025, + "grad_norm": NaN, + "learning_rate": 0.0001595239576592731, + "loss": 0.0, + "step": 31891 + }, + { + "epoch": 2.97583278902678, + "grad_norm": NaN, + "learning_rate": 0.0001595164091243246, + "loss": 0.0, + "step": 31892 + }, + { + "epoch": 2.975926098721657, + "grad_norm": NaN, + "learning_rate": 0.00015950886056517868, + "loss": 0.0, + "step": 31893 + }, + { + "epoch": 2.9760194084165343, + "grad_norm": NaN, + "learning_rate": 0.00015950131198185457, + "loss": 0.0, + "step": 31894 + }, + { + "epoch": 2.9761127181114118, + "grad_norm": NaN, + "learning_rate": 0.00015949376337437142, + "loss": 0.0, + "step": 31895 + }, + { + "epoch": 2.976206027806289, + "grad_norm": NaN, + "learning_rate": 0.00015948621474274843, + "loss": 0.0, + "step": 31896 + }, + { + "epoch": 2.9762993375011666, + "grad_norm": NaN, + "learning_rate": 0.00015947866608700481, + "loss": 0.0, + "step": 31897 + }, + { + "epoch": 2.9763926471960436, + "grad_norm": NaN, + "learning_rate": 0.00015947111740715974, + "loss": 0.0, + "step": 31898 + }, + { + "epoch": 2.976485956890921, + "grad_norm": NaN, + "learning_rate": 0.00015946356870323247, + "loss": 0.0, + "step": 31899 + }, + { + "epoch": 2.9765792665857984, + "grad_norm": NaN, + "learning_rate": 0.00015945601997524212, + "loss": 0.0, + "step": 31900 + }, + { + "epoch": 2.9766725762806754, + "grad_norm": NaN, + "learning_rate": 0.00015944847122320788, + "loss": 0.0, + "step": 31901 + }, + { + "epoch": 2.976765885975553, + "grad_norm": NaN, + "learning_rate": 0.000159440922447149, + "loss": 0.0, + "step": 31902 + }, + { + "epoch": 2.9768591956704302, + "grad_norm": NaN, + "learning_rate": 0.00015943337364708468, + "loss": 0.0, + "step": 31903 + }, + { + "epoch": 2.9769525053653076, + "grad_norm": NaN, + "learning_rate": 0.00015942582482303405, + "loss": 0.0, + "step": 31904 + }, + { + "epoch": 2.9770458150601846, + "grad_norm": NaN, + "learning_rate": 0.00015941827597501637, + "loss": 0.0, + "step": 31905 + }, + { + "epoch": 2.977139124755062, + "grad_norm": NaN, + "learning_rate": 0.00015941072710305078, + "loss": 0.0, + "step": 31906 + }, + { + "epoch": 2.9772324344499395, + "grad_norm": NaN, + "learning_rate": 0.0001594031782071565, + "loss": 0.0, + "step": 31907 + }, + { + "epoch": 2.9773257441448164, + "grad_norm": NaN, + "learning_rate": 0.00015939562928735275, + "loss": 0.0, + "step": 31908 + }, + { + "epoch": 2.977419053839694, + "grad_norm": NaN, + "learning_rate": 0.00015938808034365867, + "loss": 0.0, + "step": 31909 + }, + { + "epoch": 2.9775123635345713, + "grad_norm": NaN, + "learning_rate": 0.00015938053137609347, + "loss": 0.0, + "step": 31910 + }, + { + "epoch": 2.9776056732294487, + "grad_norm": NaN, + "learning_rate": 0.00015937298238467642, + "loss": 0.0, + "step": 31911 + }, + { + "epoch": 2.977698982924326, + "grad_norm": NaN, + "learning_rate": 0.0001593654333694266, + "loss": 0.0, + "step": 31912 + }, + { + "epoch": 2.977792292619203, + "grad_norm": NaN, + "learning_rate": 0.00015935788433036325, + "loss": 0.0, + "step": 31913 + }, + { + "epoch": 2.9778856023140805, + "grad_norm": NaN, + "learning_rate": 0.00015935033526750557, + "loss": 0.0, + "step": 31914 + }, + { + "epoch": 2.9779789120089575, + "grad_norm": NaN, + "learning_rate": 0.0001593427861808728, + "loss": 0.0, + "step": 31915 + }, + { + "epoch": 2.978072221703835, + "grad_norm": NaN, + "learning_rate": 0.00015933523707048407, + "loss": 0.0, + "step": 31916 + }, + { + "epoch": 2.9781655313987123, + "grad_norm": NaN, + "learning_rate": 0.00015932768793635858, + "loss": 0.0, + "step": 31917 + }, + { + "epoch": 2.9782588410935897, + "grad_norm": NaN, + "learning_rate": 0.00015932013877851557, + "loss": 0.0, + "step": 31918 + }, + { + "epoch": 2.978352150788467, + "grad_norm": NaN, + "learning_rate": 0.00015931258959697418, + "loss": 0.0, + "step": 31919 + }, + { + "epoch": 2.978445460483344, + "grad_norm": NaN, + "learning_rate": 0.00015930504039175366, + "loss": 0.0, + "step": 31920 + }, + { + "epoch": 2.9785387701782216, + "grad_norm": NaN, + "learning_rate": 0.00015929749116287318, + "loss": 0.0, + "step": 31921 + }, + { + "epoch": 2.978632079873099, + "grad_norm": NaN, + "learning_rate": 0.0001592899419103519, + "loss": 0.0, + "step": 31922 + }, + { + "epoch": 2.978725389567976, + "grad_norm": NaN, + "learning_rate": 0.0001592823926342091, + "loss": 0.0, + "step": 31923 + }, + { + "epoch": 2.9788186992628534, + "grad_norm": NaN, + "learning_rate": 0.0001592748433344639, + "loss": 0.0, + "step": 31924 + }, + { + "epoch": 2.978912008957731, + "grad_norm": NaN, + "learning_rate": 0.00015926729401113544, + "loss": 0.0, + "step": 31925 + }, + { + "epoch": 2.979005318652608, + "grad_norm": NaN, + "learning_rate": 0.00015925974466424314, + "loss": 0.0, + "step": 31926 + }, + { + "epoch": 2.979098628347485, + "grad_norm": NaN, + "learning_rate": 0.00015925219529380597, + "loss": 0.0, + "step": 31927 + }, + { + "epoch": 2.9791919380423626, + "grad_norm": NaN, + "learning_rate": 0.00015924464589984325, + "loss": 0.0, + "step": 31928 + }, + { + "epoch": 2.97928524773724, + "grad_norm": NaN, + "learning_rate": 0.00015923709648237407, + "loss": 0.0, + "step": 31929 + }, + { + "epoch": 2.979378557432117, + "grad_norm": NaN, + "learning_rate": 0.00015922954704141777, + "loss": 0.0, + "step": 31930 + }, + { + "epoch": 2.9794718671269944, + "grad_norm": NaN, + "learning_rate": 0.0001592219975769934, + "loss": 0.0, + "step": 31931 + }, + { + "epoch": 2.979565176821872, + "grad_norm": NaN, + "learning_rate": 0.00015921444808912025, + "loss": 0.0, + "step": 31932 + }, + { + "epoch": 2.9796584865167492, + "grad_norm": NaN, + "learning_rate": 0.00015920689857781747, + "loss": 0.0, + "step": 31933 + }, + { + "epoch": 2.9797517962116262, + "grad_norm": NaN, + "learning_rate": 0.0001591993490431043, + "loss": 0.0, + "step": 31934 + }, + { + "epoch": 2.9798451059065036, + "grad_norm": NaN, + "learning_rate": 0.0001591917994849999, + "loss": 0.0, + "step": 31935 + }, + { + "epoch": 2.979938415601381, + "grad_norm": NaN, + "learning_rate": 0.00015918424990352348, + "loss": 0.0, + "step": 31936 + }, + { + "epoch": 2.980031725296258, + "grad_norm": NaN, + "learning_rate": 0.00015917670029869422, + "loss": 0.0, + "step": 31937 + }, + { + "epoch": 2.9801250349911355, + "grad_norm": NaN, + "learning_rate": 0.00015916915067053133, + "loss": 0.0, + "step": 31938 + }, + { + "epoch": 2.980218344686013, + "grad_norm": NaN, + "learning_rate": 0.000159161601019054, + "loss": 0.0, + "step": 31939 + }, + { + "epoch": 2.9803116543808903, + "grad_norm": NaN, + "learning_rate": 0.00015915405134428148, + "loss": 0.0, + "step": 31940 + }, + { + "epoch": 2.9804049640757677, + "grad_norm": NaN, + "learning_rate": 0.00015914650164623286, + "loss": 0.0, + "step": 31941 + }, + { + "epoch": 2.9804982737706447, + "grad_norm": NaN, + "learning_rate": 0.0001591389519249274, + "loss": 0.0, + "step": 31942 + }, + { + "epoch": 2.980591583465522, + "grad_norm": NaN, + "learning_rate": 0.00015913140218038434, + "loss": 0.0, + "step": 31943 + }, + { + "epoch": 2.9806848931603995, + "grad_norm": NaN, + "learning_rate": 0.00015912385241262277, + "loss": 0.0, + "step": 31944 + }, + { + "epoch": 2.9807782028552765, + "grad_norm": NaN, + "learning_rate": 0.00015911630262166202, + "loss": 0.0, + "step": 31945 + }, + { + "epoch": 2.980871512550154, + "grad_norm": NaN, + "learning_rate": 0.00015910875280752116, + "loss": 0.0, + "step": 31946 + }, + { + "epoch": 2.9809648222450313, + "grad_norm": NaN, + "learning_rate": 0.00015910120297021941, + "loss": 0.0, + "step": 31947 + }, + { + "epoch": 2.9810581319399088, + "grad_norm": NaN, + "learning_rate": 0.0001590936531097761, + "loss": 0.0, + "step": 31948 + }, + { + "epoch": 2.9811514416347857, + "grad_norm": NaN, + "learning_rate": 0.00015908610322621023, + "loss": 0.0, + "step": 31949 + }, + { + "epoch": 2.981244751329663, + "grad_norm": NaN, + "learning_rate": 0.00015907855331954112, + "loss": 0.0, + "step": 31950 + }, + { + "epoch": 2.9813380610245406, + "grad_norm": NaN, + "learning_rate": 0.00015907100338978798, + "loss": 0.0, + "step": 31951 + }, + { + "epoch": 2.9814313707194176, + "grad_norm": NaN, + "learning_rate": 0.00015906345343696992, + "loss": 0.0, + "step": 31952 + }, + { + "epoch": 2.981524680414295, + "grad_norm": NaN, + "learning_rate": 0.00015905590346110618, + "loss": 0.0, + "step": 31953 + }, + { + "epoch": 2.9816179901091724, + "grad_norm": NaN, + "learning_rate": 0.00015904835346221597, + "loss": 0.0, + "step": 31954 + }, + { + "epoch": 2.98171129980405, + "grad_norm": NaN, + "learning_rate": 0.00015904080344031848, + "loss": 0.0, + "step": 31955 + }, + { + "epoch": 2.981804609498927, + "grad_norm": NaN, + "learning_rate": 0.00015903325339543289, + "loss": 0.0, + "step": 31956 + }, + { + "epoch": 2.981897919193804, + "grad_norm": NaN, + "learning_rate": 0.00015902570332757844, + "loss": 0.0, + "step": 31957 + }, + { + "epoch": 2.9819912288886816, + "grad_norm": NaN, + "learning_rate": 0.0001590181532367743, + "loss": 0.0, + "step": 31958 + }, + { + "epoch": 2.9820845385835586, + "grad_norm": NaN, + "learning_rate": 0.00015901060312303964, + "loss": 0.0, + "step": 31959 + }, + { + "epoch": 2.982177848278436, + "grad_norm": NaN, + "learning_rate": 0.00015900305298639368, + "loss": 0.0, + "step": 31960 + }, + { + "epoch": 2.9822711579733134, + "grad_norm": NaN, + "learning_rate": 0.00015899550282685564, + "loss": 0.0, + "step": 31961 + }, + { + "epoch": 2.982364467668191, + "grad_norm": NaN, + "learning_rate": 0.0001589879526444447, + "loss": 0.0, + "step": 31962 + }, + { + "epoch": 2.9824577773630683, + "grad_norm": NaN, + "learning_rate": 0.00015898040243918004, + "loss": 0.0, + "step": 31963 + }, + { + "epoch": 2.9825510870579452, + "grad_norm": NaN, + "learning_rate": 0.0001589728522110809, + "loss": 0.0, + "step": 31964 + }, + { + "epoch": 2.9826443967528227, + "grad_norm": NaN, + "learning_rate": 0.00015896530196016646, + "loss": 0.0, + "step": 31965 + }, + { + "epoch": 2.9827377064476996, + "grad_norm": NaN, + "learning_rate": 0.00015895775168645588, + "loss": 0.0, + "step": 31966 + }, + { + "epoch": 2.982831016142577, + "grad_norm": NaN, + "learning_rate": 0.00015895020138996846, + "loss": 0.0, + "step": 31967 + }, + { + "epoch": 2.9829243258374545, + "grad_norm": NaN, + "learning_rate": 0.0001589426510707233, + "loss": 0.0, + "step": 31968 + }, + { + "epoch": 2.983017635532332, + "grad_norm": NaN, + "learning_rate": 0.00015893510072873955, + "loss": 0.0, + "step": 31969 + }, + { + "epoch": 2.9831109452272093, + "grad_norm": NaN, + "learning_rate": 0.0001589275503640366, + "loss": 0.0, + "step": 31970 + }, + { + "epoch": 2.9832042549220863, + "grad_norm": NaN, + "learning_rate": 0.00015891999997663347, + "loss": 0.0, + "step": 31971 + }, + { + "epoch": 2.9832975646169637, + "grad_norm": NaN, + "learning_rate": 0.0001589124495665494, + "loss": 0.0, + "step": 31972 + }, + { + "epoch": 2.983390874311841, + "grad_norm": NaN, + "learning_rate": 0.00015890489913380367, + "loss": 0.0, + "step": 31973 + }, + { + "epoch": 2.983484184006718, + "grad_norm": NaN, + "learning_rate": 0.0001588973486784154, + "loss": 0.0, + "step": 31974 + }, + { + "epoch": 2.9835774937015955, + "grad_norm": NaN, + "learning_rate": 0.0001588897982004038, + "loss": 0.0, + "step": 31975 + }, + { + "epoch": 2.983670803396473, + "grad_norm": NaN, + "learning_rate": 0.0001588822476997881, + "loss": 0.0, + "step": 31976 + }, + { + "epoch": 2.9837641130913504, + "grad_norm": NaN, + "learning_rate": 0.00015887469717658744, + "loss": 0.0, + "step": 31977 + }, + { + "epoch": 2.9838574227862273, + "grad_norm": NaN, + "learning_rate": 0.00015886714663082107, + "loss": 0.0, + "step": 31978 + }, + { + "epoch": 2.9839507324811048, + "grad_norm": NaN, + "learning_rate": 0.00015885959606250815, + "loss": 0.0, + "step": 31979 + }, + { + "epoch": 2.984044042175982, + "grad_norm": NaN, + "learning_rate": 0.00015885204547166795, + "loss": 0.0, + "step": 31980 + }, + { + "epoch": 2.984137351870859, + "grad_norm": NaN, + "learning_rate": 0.0001588444948583196, + "loss": 0.0, + "step": 31981 + }, + { + "epoch": 2.9842306615657366, + "grad_norm": NaN, + "learning_rate": 0.00015883694422248225, + "loss": 0.0, + "step": 31982 + }, + { + "epoch": 2.984323971260614, + "grad_norm": NaN, + "learning_rate": 0.0001588293935641753, + "loss": 0.0, + "step": 31983 + }, + { + "epoch": 2.9844172809554914, + "grad_norm": NaN, + "learning_rate": 0.00015882184288341773, + "loss": 0.0, + "step": 31984 + }, + { + "epoch": 2.984510590650369, + "grad_norm": NaN, + "learning_rate": 0.0001588142921802288, + "loss": 0.0, + "step": 31985 + }, + { + "epoch": 2.984603900345246, + "grad_norm": NaN, + "learning_rate": 0.0001588067414546278, + "loss": 0.0, + "step": 31986 + }, + { + "epoch": 2.9846972100401232, + "grad_norm": NaN, + "learning_rate": 0.00015879919070663386, + "loss": 0.0, + "step": 31987 + }, + { + "epoch": 2.984790519735, + "grad_norm": NaN, + "learning_rate": 0.00015879163993626615, + "loss": 0.0, + "step": 31988 + }, + { + "epoch": 2.9848838294298776, + "grad_norm": NaN, + "learning_rate": 0.00015878408914354395, + "loss": 0.0, + "step": 31989 + }, + { + "epoch": 2.984977139124755, + "grad_norm": NaN, + "learning_rate": 0.00015877653832848637, + "loss": 0.0, + "step": 31990 + }, + { + "epoch": 2.9850704488196325, + "grad_norm": NaN, + "learning_rate": 0.00015876898749111267, + "loss": 0.0, + "step": 31991 + }, + { + "epoch": 2.98516375851451, + "grad_norm": NaN, + "learning_rate": 0.00015876143663144207, + "loss": 0.0, + "step": 31992 + }, + { + "epoch": 2.985257068209387, + "grad_norm": NaN, + "learning_rate": 0.0001587538857494937, + "loss": 0.0, + "step": 31993 + }, + { + "epoch": 2.9853503779042643, + "grad_norm": NaN, + "learning_rate": 0.00015874633484528673, + "loss": 0.0, + "step": 31994 + }, + { + "epoch": 2.9854436875991417, + "grad_norm": NaN, + "learning_rate": 0.00015873878391884052, + "loss": 0.0, + "step": 31995 + }, + { + "epoch": 2.9855369972940187, + "grad_norm": NaN, + "learning_rate": 0.0001587312329701741, + "loss": 0.0, + "step": 31996 + }, + { + "epoch": 2.985630306988896, + "grad_norm": NaN, + "learning_rate": 0.00015872368199930682, + "loss": 0.0, + "step": 31997 + }, + { + "epoch": 2.9857236166837735, + "grad_norm": NaN, + "learning_rate": 0.00015871613100625775, + "loss": 0.0, + "step": 31998 + }, + { + "epoch": 2.985816926378651, + "grad_norm": NaN, + "learning_rate": 0.0001587085799910461, + "loss": 0.0, + "step": 31999 + }, + { + "epoch": 2.985910236073528, + "grad_norm": NaN, + "learning_rate": 0.00015870102895369117, + "loss": 0.0, + "step": 32000 + }, + { + "epoch": 2.9860035457684053, + "grad_norm": NaN, + "learning_rate": 0.00015869347789421206, + "loss": 0.0, + "step": 32001 + }, + { + "epoch": 2.9860968554632827, + "grad_norm": NaN, + "learning_rate": 0.00015868592681262807, + "loss": 0.0, + "step": 32002 + }, + { + "epoch": 2.9861901651581597, + "grad_norm": NaN, + "learning_rate": 0.0001586783757089583, + "loss": 0.0, + "step": 32003 + }, + { + "epoch": 2.986283474853037, + "grad_norm": NaN, + "learning_rate": 0.00015867082458322196, + "loss": 0.0, + "step": 32004 + }, + { + "epoch": 2.9863767845479146, + "grad_norm": NaN, + "learning_rate": 0.00015866327343543835, + "loss": 0.0, + "step": 32005 + }, + { + "epoch": 2.986470094242792, + "grad_norm": NaN, + "learning_rate": 0.00015865572226562657, + "loss": 0.0, + "step": 32006 + }, + { + "epoch": 2.9865634039376694, + "grad_norm": NaN, + "learning_rate": 0.0001586481710738058, + "loss": 0.0, + "step": 32007 + }, + { + "epoch": 2.9866567136325464, + "grad_norm": NaN, + "learning_rate": 0.0001586406198599954, + "loss": 0.0, + "step": 32008 + }, + { + "epoch": 2.986750023327424, + "grad_norm": NaN, + "learning_rate": 0.0001586330686242144, + "loss": 0.0, + "step": 32009 + }, + { + "epoch": 2.9868433330223008, + "grad_norm": NaN, + "learning_rate": 0.00015862551736648206, + "loss": 0.0, + "step": 32010 + }, + { + "epoch": 2.986936642717178, + "grad_norm": NaN, + "learning_rate": 0.00015861796608681762, + "loss": 0.0, + "step": 32011 + }, + { + "epoch": 2.9870299524120556, + "grad_norm": NaN, + "learning_rate": 0.00015861041478524023, + "loss": 0.0, + "step": 32012 + }, + { + "epoch": 2.987123262106933, + "grad_norm": NaN, + "learning_rate": 0.00015860286346176903, + "loss": 0.0, + "step": 32013 + }, + { + "epoch": 2.9872165718018104, + "grad_norm": NaN, + "learning_rate": 0.0001585953121164234, + "loss": 0.0, + "step": 32014 + }, + { + "epoch": 2.9873098814966874, + "grad_norm": NaN, + "learning_rate": 0.0001585877607492224, + "loss": 0.0, + "step": 32015 + }, + { + "epoch": 2.987403191191565, + "grad_norm": NaN, + "learning_rate": 0.00015858020936018526, + "loss": 0.0, + "step": 32016 + }, + { + "epoch": 2.9874965008864423, + "grad_norm": NaN, + "learning_rate": 0.0001585726579493312, + "loss": 0.0, + "step": 32017 + }, + { + "epoch": 2.9875898105813192, + "grad_norm": NaN, + "learning_rate": 0.00015856510651667945, + "loss": 0.0, + "step": 32018 + }, + { + "epoch": 2.9876831202761966, + "grad_norm": NaN, + "learning_rate": 0.00015855755506224913, + "loss": 0.0, + "step": 32019 + }, + { + "epoch": 2.987776429971074, + "grad_norm": NaN, + "learning_rate": 0.00015855000358605943, + "loss": 0.0, + "step": 32020 + }, + { + "epoch": 2.9878697396659515, + "grad_norm": NaN, + "learning_rate": 0.0001585424520881297, + "loss": 0.0, + "step": 32021 + }, + { + "epoch": 2.9879630493608285, + "grad_norm": NaN, + "learning_rate": 0.000158534900568479, + "loss": 0.0, + "step": 32022 + }, + { + "epoch": 2.988056359055706, + "grad_norm": NaN, + "learning_rate": 0.00015852734902712653, + "loss": 0.0, + "step": 32023 + }, + { + "epoch": 2.9881496687505833, + "grad_norm": NaN, + "learning_rate": 0.00015851979746409165, + "loss": 0.0, + "step": 32024 + }, + { + "epoch": 2.9882429784454603, + "grad_norm": NaN, + "learning_rate": 0.00015851224587939337, + "loss": 0.0, + "step": 32025 + }, + { + "epoch": 2.9883362881403377, + "grad_norm": NaN, + "learning_rate": 0.00015850469427305095, + "loss": 0.0, + "step": 32026 + }, + { + "epoch": 2.988429597835215, + "grad_norm": NaN, + "learning_rate": 0.00015849714264508366, + "loss": 0.0, + "step": 32027 + }, + { + "epoch": 2.9885229075300925, + "grad_norm": NaN, + "learning_rate": 0.00015848959099551063, + "loss": 0.0, + "step": 32028 + }, + { + "epoch": 2.98861621722497, + "grad_norm": NaN, + "learning_rate": 0.00015848203932435109, + "loss": 0.0, + "step": 32029 + }, + { + "epoch": 2.988709526919847, + "grad_norm": NaN, + "learning_rate": 0.00015847448763162426, + "loss": 0.0, + "step": 32030 + }, + { + "epoch": 2.9888028366147243, + "grad_norm": NaN, + "learning_rate": 0.0001584669359173493, + "loss": 0.0, + "step": 32031 + }, + { + "epoch": 2.9888961463096013, + "grad_norm": NaN, + "learning_rate": 0.0001584593841815454, + "loss": 0.0, + "step": 32032 + }, + { + "epoch": 2.9889894560044787, + "grad_norm": NaN, + "learning_rate": 0.00015845183242423189, + "loss": 0.0, + "step": 32033 + }, + { + "epoch": 2.989082765699356, + "grad_norm": NaN, + "learning_rate": 0.00015844428064542778, + "loss": 0.0, + "step": 32034 + }, + { + "epoch": 2.9891760753942336, + "grad_norm": NaN, + "learning_rate": 0.00015843672884515236, + "loss": 0.0, + "step": 32035 + }, + { + "epoch": 2.989269385089111, + "grad_norm": NaN, + "learning_rate": 0.00015842917702342486, + "loss": 0.0, + "step": 32036 + }, + { + "epoch": 2.989362694783988, + "grad_norm": NaN, + "learning_rate": 0.0001584216251802645, + "loss": 0.0, + "step": 32037 + }, + { + "epoch": 2.9894560044788654, + "grad_norm": NaN, + "learning_rate": 0.00015841407331569038, + "loss": 0.0, + "step": 32038 + }, + { + "epoch": 2.989549314173743, + "grad_norm": NaN, + "learning_rate": 0.0001584065214297218, + "loss": 0.0, + "step": 32039 + }, + { + "epoch": 2.98964262386862, + "grad_norm": NaN, + "learning_rate": 0.00015839896952237793, + "loss": 0.0, + "step": 32040 + }, + { + "epoch": 2.989735933563497, + "grad_norm": NaN, + "learning_rate": 0.00015839141759367798, + "loss": 0.0, + "step": 32041 + }, + { + "epoch": 2.9898292432583746, + "grad_norm": NaN, + "learning_rate": 0.00015838386564364108, + "loss": 0.0, + "step": 32042 + }, + { + "epoch": 2.989922552953252, + "grad_norm": NaN, + "learning_rate": 0.00015837631367228655, + "loss": 0.0, + "step": 32043 + }, + { + "epoch": 2.990015862648129, + "grad_norm": NaN, + "learning_rate": 0.00015836876167963352, + "loss": 0.0, + "step": 32044 + }, + { + "epoch": 2.9901091723430064, + "grad_norm": NaN, + "learning_rate": 0.00015836120966570118, + "loss": 0.0, + "step": 32045 + }, + { + "epoch": 2.990202482037884, + "grad_norm": NaN, + "learning_rate": 0.0001583536576305088, + "loss": 0.0, + "step": 32046 + }, + { + "epoch": 2.990295791732761, + "grad_norm": NaN, + "learning_rate": 0.00015834610557407554, + "loss": 0.0, + "step": 32047 + }, + { + "epoch": 2.9903891014276383, + "grad_norm": NaN, + "learning_rate": 0.00015833855349642055, + "loss": 0.0, + "step": 32048 + }, + { + "epoch": 2.9904824111225157, + "grad_norm": NaN, + "learning_rate": 0.00015833100139756316, + "loss": 0.0, + "step": 32049 + }, + { + "epoch": 2.990575720817393, + "grad_norm": NaN, + "learning_rate": 0.0001583234492775225, + "loss": 0.0, + "step": 32050 + }, + { + "epoch": 2.99066903051227, + "grad_norm": NaN, + "learning_rate": 0.0001583158971363177, + "loss": 0.0, + "step": 32051 + }, + { + "epoch": 2.9907623402071475, + "grad_norm": NaN, + "learning_rate": 0.00015830834497396814, + "loss": 0.0, + "step": 32052 + }, + { + "epoch": 2.990855649902025, + "grad_norm": NaN, + "learning_rate": 0.00015830079279049287, + "loss": 0.0, + "step": 32053 + }, + { + "epoch": 2.990948959596902, + "grad_norm": NaN, + "learning_rate": 0.00015829324058591108, + "loss": 0.0, + "step": 32054 + }, + { + "epoch": 2.9910422692917793, + "grad_norm": NaN, + "learning_rate": 0.0001582856883602421, + "loss": 0.0, + "step": 32055 + }, + { + "epoch": 2.9911355789866567, + "grad_norm": NaN, + "learning_rate": 0.0001582781361135051, + "loss": 0.0, + "step": 32056 + }, + { + "epoch": 2.991228888681534, + "grad_norm": NaN, + "learning_rate": 0.00015827058384571918, + "loss": 0.0, + "step": 32057 + }, + { + "epoch": 2.9913221983764116, + "grad_norm": NaN, + "learning_rate": 0.00015826303155690364, + "loss": 0.0, + "step": 32058 + }, + { + "epoch": 2.9914155080712885, + "grad_norm": NaN, + "learning_rate": 0.00015825547924707773, + "loss": 0.0, + "step": 32059 + }, + { + "epoch": 2.991508817766166, + "grad_norm": NaN, + "learning_rate": 0.00015824792691626047, + "loss": 0.0, + "step": 32060 + }, + { + "epoch": 2.991602127461043, + "grad_norm": NaN, + "learning_rate": 0.00015824037456447124, + "loss": 0.0, + "step": 32061 + }, + { + "epoch": 2.9916954371559203, + "grad_norm": NaN, + "learning_rate": 0.00015823282219172918, + "loss": 0.0, + "step": 32062 + }, + { + "epoch": 2.9917887468507978, + "grad_norm": NaN, + "learning_rate": 0.00015822526979805348, + "loss": 0.0, + "step": 32063 + }, + { + "epoch": 2.991882056545675, + "grad_norm": NaN, + "learning_rate": 0.00015821771738346334, + "loss": 0.0, + "step": 32064 + }, + { + "epoch": 2.9919753662405526, + "grad_norm": NaN, + "learning_rate": 0.000158210164947978, + "loss": 0.0, + "step": 32065 + }, + { + "epoch": 2.9920686759354296, + "grad_norm": NaN, + "learning_rate": 0.00015820261249161664, + "loss": 0.0, + "step": 32066 + }, + { + "epoch": 2.992161985630307, + "grad_norm": NaN, + "learning_rate": 0.00015819506001439842, + "loss": 0.0, + "step": 32067 + }, + { + "epoch": 2.9922552953251844, + "grad_norm": NaN, + "learning_rate": 0.00015818750751634268, + "loss": 0.0, + "step": 32068 + }, + { + "epoch": 2.9923486050200614, + "grad_norm": NaN, + "learning_rate": 0.0001581799549974685, + "loss": 0.0, + "step": 32069 + }, + { + "epoch": 2.992441914714939, + "grad_norm": NaN, + "learning_rate": 0.00015817240245779505, + "loss": 0.0, + "step": 32070 + }, + { + "epoch": 2.9925352244098162, + "grad_norm": NaN, + "learning_rate": 0.00015816484989734168, + "loss": 0.0, + "step": 32071 + }, + { + "epoch": 2.9926285341046936, + "grad_norm": NaN, + "learning_rate": 0.0001581572973161275, + "loss": 0.0, + "step": 32072 + }, + { + "epoch": 2.9927218437995706, + "grad_norm": NaN, + "learning_rate": 0.00015814974471417168, + "loss": 0.0, + "step": 32073 + }, + { + "epoch": 2.992815153494448, + "grad_norm": NaN, + "learning_rate": 0.00015814219209149351, + "loss": 0.0, + "step": 32074 + }, + { + "epoch": 2.9929084631893255, + "grad_norm": NaN, + "learning_rate": 0.00015813463944811218, + "loss": 0.0, + "step": 32075 + }, + { + "epoch": 2.9930017728842024, + "grad_norm": NaN, + "learning_rate": 0.0001581270867840468, + "loss": 0.0, + "step": 32076 + }, + { + "epoch": 2.99309508257908, + "grad_norm": NaN, + "learning_rate": 0.00015811953409931672, + "loss": 0.0, + "step": 32077 + }, + { + "epoch": 2.9931883922739573, + "grad_norm": NaN, + "learning_rate": 0.00015811198139394108, + "loss": 0.0, + "step": 32078 + }, + { + "epoch": 2.9932817019688347, + "grad_norm": NaN, + "learning_rate": 0.000158104428667939, + "loss": 0.0, + "step": 32079 + }, + { + "epoch": 2.993375011663712, + "grad_norm": NaN, + "learning_rate": 0.0001580968759213298, + "loss": 0.0, + "step": 32080 + }, + { + "epoch": 2.993468321358589, + "grad_norm": NaN, + "learning_rate": 0.00015808932315413267, + "loss": 0.0, + "step": 32081 + }, + { + "epoch": 2.9935616310534665, + "grad_norm": NaN, + "learning_rate": 0.00015808177036636675, + "loss": 0.0, + "step": 32082 + }, + { + "epoch": 2.9936549407483435, + "grad_norm": NaN, + "learning_rate": 0.00015807421755805126, + "loss": 0.0, + "step": 32083 + }, + { + "epoch": 2.993748250443221, + "grad_norm": NaN, + "learning_rate": 0.0001580666647292055, + "loss": 0.0, + "step": 32084 + }, + { + "epoch": 2.9938415601380983, + "grad_norm": NaN, + "learning_rate": 0.00015805911187984854, + "loss": 0.0, + "step": 32085 + }, + { + "epoch": 2.9939348698329757, + "grad_norm": NaN, + "learning_rate": 0.00015805155900999962, + "loss": 0.0, + "step": 32086 + }, + { + "epoch": 2.994028179527853, + "grad_norm": NaN, + "learning_rate": 0.00015804400611967807, + "loss": 0.0, + "step": 32087 + }, + { + "epoch": 2.99412148922273, + "grad_norm": NaN, + "learning_rate": 0.00015803645320890296, + "loss": 0.0, + "step": 32088 + }, + { + "epoch": 2.9942147989176076, + "grad_norm": NaN, + "learning_rate": 0.00015802890027769346, + "loss": 0.0, + "step": 32089 + }, + { + "epoch": 2.994308108612485, + "grad_norm": NaN, + "learning_rate": 0.00015802134732606892, + "loss": 0.0, + "step": 32090 + }, + { + "epoch": 2.994401418307362, + "grad_norm": NaN, + "learning_rate": 0.00015801379435404842, + "loss": 0.0, + "step": 32091 + }, + { + "epoch": 2.9944947280022394, + "grad_norm": NaN, + "learning_rate": 0.0001580062413616512, + "loss": 0.0, + "step": 32092 + }, + { + "epoch": 2.994588037697117, + "grad_norm": NaN, + "learning_rate": 0.00015799868834889653, + "loss": 0.0, + "step": 32093 + }, + { + "epoch": 2.994681347391994, + "grad_norm": NaN, + "learning_rate": 0.0001579911353158036, + "loss": 0.0, + "step": 32094 + }, + { + "epoch": 2.994774657086871, + "grad_norm": NaN, + "learning_rate": 0.00015798358226239148, + "loss": 0.0, + "step": 32095 + }, + { + "epoch": 2.9948679667817486, + "grad_norm": NaN, + "learning_rate": 0.00015797602918867954, + "loss": 0.0, + "step": 32096 + }, + { + "epoch": 2.994961276476626, + "grad_norm": NaN, + "learning_rate": 0.00015796847609468692, + "loss": 0.0, + "step": 32097 + }, + { + "epoch": 2.995054586171503, + "grad_norm": NaN, + "learning_rate": 0.00015796092298043282, + "loss": 0.0, + "step": 32098 + }, + { + "epoch": 2.9951478958663804, + "grad_norm": NaN, + "learning_rate": 0.00015795336984593644, + "loss": 0.0, + "step": 32099 + }, + { + "epoch": 2.995241205561258, + "grad_norm": NaN, + "learning_rate": 0.00015794581669121703, + "loss": 0.0, + "step": 32100 + }, + { + "epoch": 2.9953345152561353, + "grad_norm": NaN, + "learning_rate": 0.0001579382635162937, + "loss": 0.0, + "step": 32101 + }, + { + "epoch": 2.9954278249510127, + "grad_norm": NaN, + "learning_rate": 0.00015793071032118574, + "loss": 0.0, + "step": 32102 + }, + { + "epoch": 2.9955211346458896, + "grad_norm": NaN, + "learning_rate": 0.00015792315710591237, + "loss": 0.0, + "step": 32103 + }, + { + "epoch": 2.995614444340767, + "grad_norm": NaN, + "learning_rate": 0.00015791560387049274, + "loss": 0.0, + "step": 32104 + }, + { + "epoch": 2.995707754035644, + "grad_norm": NaN, + "learning_rate": 0.00015790805061494602, + "loss": 0.0, + "step": 32105 + }, + { + "epoch": 2.9958010637305215, + "grad_norm": NaN, + "learning_rate": 0.00015790049733929157, + "loss": 0.0, + "step": 32106 + }, + { + "epoch": 2.995894373425399, + "grad_norm": NaN, + "learning_rate": 0.00015789294404354844, + "loss": 0.0, + "step": 32107 + }, + { + "epoch": 2.9959876831202763, + "grad_norm": NaN, + "learning_rate": 0.0001578853907277359, + "loss": 0.0, + "step": 32108 + }, + { + "epoch": 2.9960809928151537, + "grad_norm": NaN, + "learning_rate": 0.00015787783739187313, + "loss": 0.0, + "step": 32109 + }, + { + "epoch": 2.9961743025100307, + "grad_norm": NaN, + "learning_rate": 0.0001578702840359794, + "loss": 0.0, + "step": 32110 + }, + { + "epoch": 2.996267612204908, + "grad_norm": NaN, + "learning_rate": 0.00015786273066007377, + "loss": 0.0, + "step": 32111 + }, + { + "epoch": 2.9963609218997855, + "grad_norm": NaN, + "learning_rate": 0.00015785517726417563, + "loss": 0.0, + "step": 32112 + }, + { + "epoch": 2.9964542315946625, + "grad_norm": NaN, + "learning_rate": 0.00015784762384830412, + "loss": 0.0, + "step": 32113 + }, + { + "epoch": 2.99654754128954, + "grad_norm": NaN, + "learning_rate": 0.00015784007041247833, + "loss": 0.0, + "step": 32114 + }, + { + "epoch": 2.9966408509844173, + "grad_norm": NaN, + "learning_rate": 0.00015783251695671764, + "loss": 0.0, + "step": 32115 + }, + { + "epoch": 2.9967341606792948, + "grad_norm": NaN, + "learning_rate": 0.00015782496348104116, + "loss": 0.0, + "step": 32116 + }, + { + "epoch": 2.9968274703741717, + "grad_norm": NaN, + "learning_rate": 0.0001578174099854681, + "loss": 0.0, + "step": 32117 + }, + { + "epoch": 2.996920780069049, + "grad_norm": NaN, + "learning_rate": 0.0001578098564700177, + "loss": 0.0, + "step": 32118 + }, + { + "epoch": 2.9970140897639266, + "grad_norm": NaN, + "learning_rate": 0.00015780230293470918, + "loss": 0.0, + "step": 32119 + }, + { + "epoch": 2.9971073994588036, + "grad_norm": NaN, + "learning_rate": 0.00015779474937956166, + "loss": 0.0, + "step": 32120 + }, + { + "epoch": 2.997200709153681, + "grad_norm": NaN, + "learning_rate": 0.0001577871958045944, + "loss": 0.0, + "step": 32121 + }, + { + "epoch": 2.9972940188485584, + "grad_norm": NaN, + "learning_rate": 0.00015777964220982667, + "loss": 0.0, + "step": 32122 + }, + { + "epoch": 2.997387328543436, + "grad_norm": NaN, + "learning_rate": 0.00015777208859527752, + "loss": 0.0, + "step": 32123 + }, + { + "epoch": 2.9974806382383132, + "grad_norm": NaN, + "learning_rate": 0.0001577645349609663, + "loss": 0.0, + "step": 32124 + }, + { + "epoch": 2.99757394793319, + "grad_norm": NaN, + "learning_rate": 0.00015775698130691225, + "loss": 0.0, + "step": 32125 + }, + { + "epoch": 2.9976672576280676, + "grad_norm": NaN, + "learning_rate": 0.00015774942763313436, + "loss": 0.0, + "step": 32126 + }, + { + "epoch": 2.9977605673229446, + "grad_norm": NaN, + "learning_rate": 0.00015774187393965202, + "loss": 0.0, + "step": 32127 + }, + { + "epoch": 2.997853877017822, + "grad_norm": NaN, + "learning_rate": 0.00015773432022648447, + "loss": 0.0, + "step": 32128 + }, + { + "epoch": 2.9979471867126994, + "grad_norm": NaN, + "learning_rate": 0.00015772676649365068, + "loss": 0.0, + "step": 32129 + }, + { + "epoch": 2.998040496407577, + "grad_norm": NaN, + "learning_rate": 0.0001577192127411701, + "loss": 0.0, + "step": 32130 + }, + { + "epoch": 2.9981338061024543, + "grad_norm": NaN, + "learning_rate": 0.00015771165896906185, + "loss": 0.0, + "step": 32131 + }, + { + "epoch": 2.9982271157973313, + "grad_norm": NaN, + "learning_rate": 0.00015770410517734515, + "loss": 0.0, + "step": 32132 + }, + { + "epoch": 2.9983204254922087, + "grad_norm": NaN, + "learning_rate": 0.00015769655136603915, + "loss": 0.0, + "step": 32133 + }, + { + "epoch": 2.998413735187086, + "grad_norm": NaN, + "learning_rate": 0.00015768899753516313, + "loss": 0.0, + "step": 32134 + }, + { + "epoch": 2.998507044881963, + "grad_norm": NaN, + "learning_rate": 0.00015768144368473626, + "loss": 0.0, + "step": 32135 + }, + { + "epoch": 2.9986003545768405, + "grad_norm": NaN, + "learning_rate": 0.00015767388981477773, + "loss": 0.0, + "step": 32136 + }, + { + "epoch": 2.998693664271718, + "grad_norm": NaN, + "learning_rate": 0.0001576663359253068, + "loss": 0.0, + "step": 32137 + }, + { + "epoch": 2.9987869739665953, + "grad_norm": NaN, + "learning_rate": 0.0001576587820163427, + "loss": 0.0, + "step": 32138 + }, + { + "epoch": 2.9988802836614723, + "grad_norm": NaN, + "learning_rate": 0.00015765122808790444, + "loss": 0.0, + "step": 32139 + }, + { + "epoch": 2.9989735933563497, + "grad_norm": NaN, + "learning_rate": 0.00015764367414001148, + "loss": 0.0, + "step": 32140 + }, + { + "epoch": 2.999066903051227, + "grad_norm": NaN, + "learning_rate": 0.00015763612017268293, + "loss": 0.0, + "step": 32141 + }, + { + "epoch": 2.999160212746104, + "grad_norm": NaN, + "learning_rate": 0.00015762856618593794, + "loss": 0.0, + "step": 32142 + }, + { + "epoch": 2.9992535224409815, + "grad_norm": NaN, + "learning_rate": 0.00015762101217979576, + "loss": 0.0, + "step": 32143 + }, + { + "epoch": 2.999346832135859, + "grad_norm": NaN, + "learning_rate": 0.0001576134581542757, + "loss": 0.0, + "step": 32144 + }, + { + "epoch": 2.9994401418307364, + "grad_norm": NaN, + "learning_rate": 0.00015760590410939673, + "loss": 0.0, + "step": 32145 + }, + { + "epoch": 2.9995334515256133, + "grad_norm": NaN, + "learning_rate": 0.00015759835004517829, + "loss": 0.0, + "step": 32146 + }, + { + "epoch": 2.9996267612204908, + "grad_norm": NaN, + "learning_rate": 0.0001575907959616395, + "loss": 0.0, + "step": 32147 + }, + { + "epoch": 2.999720070915368, + "grad_norm": NaN, + "learning_rate": 0.0001575832418587995, + "loss": 0.0, + "step": 32148 + }, + { + "epoch": 2.999813380610245, + "grad_norm": NaN, + "learning_rate": 0.00015757568773667758, + "loss": 0.0, + "step": 32149 + }, + { + "epoch": 2.9999066903051226, + "grad_norm": NaN, + "learning_rate": 0.00015756813359529297, + "loss": 0.0, + "step": 32150 + }, + { + "epoch": 3.0, + "grad_norm": NaN, + "learning_rate": 0.00015756057943466483, + "loss": 0.0, + "step": 32151 + }, + { + "epoch": 3.0, + "eval_loss": NaN, + "eval_runtime": 26.4081, + "eval_samples_per_second": 6.702, + "eval_steps_per_second": 6.702, + "step": 32151 + }, + { + "epoch": 3.0000933096948774, + "grad_norm": NaN, + "learning_rate": 0.00015755302525481235, + "loss": 0.0, + "step": 32152 + }, + { + "epoch": 3.0001866193897544, + "grad_norm": NaN, + "learning_rate": 0.0001575454710557548, + "loss": 0.0, + "step": 32153 + }, + { + "epoch": 3.000279929084632, + "grad_norm": NaN, + "learning_rate": 0.00015753791683751134, + "loss": 0.0, + "step": 32154 + }, + { + "epoch": 3.0003732387795092, + "grad_norm": NaN, + "learning_rate": 0.00015753036260010117, + "loss": 0.0, + "step": 32155 + }, + { + "epoch": 3.0004665484743867, + "grad_norm": NaN, + "learning_rate": 0.00015752280834354355, + "loss": 0.0, + "step": 32156 + }, + { + "epoch": 3.0005598581692636, + "grad_norm": NaN, + "learning_rate": 0.00015751525406785765, + "loss": 0.0, + "step": 32157 + }, + { + "epoch": 3.000653167864141, + "grad_norm": NaN, + "learning_rate": 0.00015750769977306264, + "loss": 0.0, + "step": 32158 + }, + { + "epoch": 3.0007464775590185, + "grad_norm": NaN, + "learning_rate": 0.0001575001454591778, + "loss": 0.0, + "step": 32159 + }, + { + "epoch": 3.000839787253896, + "grad_norm": NaN, + "learning_rate": 0.00015749259112622234, + "loss": 0.0, + "step": 32160 + }, + { + "epoch": 3.000933096948773, + "grad_norm": NaN, + "learning_rate": 0.0001574850367742154, + "loss": 0.0, + "step": 32161 + }, + { + "epoch": 3.0010264066436503, + "grad_norm": NaN, + "learning_rate": 0.00015747748240317624, + "loss": 0.0, + "step": 32162 + }, + { + "epoch": 3.0011197163385277, + "grad_norm": NaN, + "learning_rate": 0.0001574699280131241, + "loss": 0.0, + "step": 32163 + }, + { + "epoch": 3.0012130260334047, + "grad_norm": NaN, + "learning_rate": 0.00015746237360407807, + "loss": 0.0, + "step": 32164 + }, + { + "epoch": 3.001306335728282, + "grad_norm": NaN, + "learning_rate": 0.00015745481917605749, + "loss": 0.0, + "step": 32165 + }, + { + "epoch": 3.0013996454231595, + "grad_norm": NaN, + "learning_rate": 0.00015744726472908152, + "loss": 0.0, + "step": 32166 + }, + { + "epoch": 3.001492955118037, + "grad_norm": NaN, + "learning_rate": 0.00015743971026316932, + "loss": 0.0, + "step": 32167 + }, + { + "epoch": 3.001586264812914, + "grad_norm": NaN, + "learning_rate": 0.00015743215577834017, + "loss": 0.0, + "step": 32168 + }, + { + "epoch": 3.0016795745077913, + "grad_norm": NaN, + "learning_rate": 0.00015742460127461322, + "loss": 0.0, + "step": 32169 + }, + { + "epoch": 3.0017728842026687, + "grad_norm": NaN, + "learning_rate": 0.00015741704675200776, + "loss": 0.0, + "step": 32170 + }, + { + "epoch": 3.001866193897546, + "grad_norm": NaN, + "learning_rate": 0.0001574094922105429, + "loss": 0.0, + "step": 32171 + }, + { + "epoch": 3.001959503592423, + "grad_norm": NaN, + "learning_rate": 0.0001574019376502379, + "loss": 0.0, + "step": 32172 + }, + { + "epoch": 3.0020528132873006, + "grad_norm": NaN, + "learning_rate": 0.00015739438307111199, + "loss": 0.0, + "step": 32173 + }, + { + "epoch": 3.002146122982178, + "grad_norm": NaN, + "learning_rate": 0.00015738682847318434, + "loss": 0.0, + "step": 32174 + }, + { + "epoch": 3.002239432677055, + "grad_norm": NaN, + "learning_rate": 0.00015737927385647417, + "loss": 0.0, + "step": 32175 + }, + { + "epoch": 3.0023327423719324, + "grad_norm": NaN, + "learning_rate": 0.00015737171922100074, + "loss": 0.0, + "step": 32176 + }, + { + "epoch": 3.00242605206681, + "grad_norm": NaN, + "learning_rate": 0.00015736416456678314, + "loss": 0.0, + "step": 32177 + }, + { + "epoch": 3.002519361761687, + "grad_norm": NaN, + "learning_rate": 0.00015735660989384063, + "loss": 0.0, + "step": 32178 + }, + { + "epoch": 3.002612671456564, + "grad_norm": NaN, + "learning_rate": 0.00015734905520219255, + "loss": 0.0, + "step": 32179 + }, + { + "epoch": 3.0027059811514416, + "grad_norm": NaN, + "learning_rate": 0.00015734150049185792, + "loss": 0.0, + "step": 32180 + }, + { + "epoch": 3.002799290846319, + "grad_norm": NaN, + "learning_rate": 0.00015733394576285603, + "loss": 0.0, + "step": 32181 + }, + { + "epoch": 3.0028926005411964, + "grad_norm": NaN, + "learning_rate": 0.00015732639101520617, + "loss": 0.0, + "step": 32182 + }, + { + "epoch": 3.0029859102360734, + "grad_norm": NaN, + "learning_rate": 0.00015731883624892735, + "loss": 0.0, + "step": 32183 + }, + { + "epoch": 3.003079219930951, + "grad_norm": NaN, + "learning_rate": 0.00015731128146403896, + "loss": 0.0, + "step": 32184 + }, + { + "epoch": 3.0031725296258283, + "grad_norm": NaN, + "learning_rate": 0.00015730372666056017, + "loss": 0.0, + "step": 32185 + }, + { + "epoch": 3.0032658393207052, + "grad_norm": NaN, + "learning_rate": 0.0001572961718385101, + "loss": 0.0, + "step": 32186 + }, + { + "epoch": 3.0033591490155827, + "grad_norm": NaN, + "learning_rate": 0.00015728861699790806, + "loss": 0.0, + "step": 32187 + }, + { + "epoch": 3.00345245871046, + "grad_norm": NaN, + "learning_rate": 0.0001572810621387732, + "loss": 0.0, + "step": 32188 + }, + { + "epoch": 3.0035457684053375, + "grad_norm": NaN, + "learning_rate": 0.0001572735072611248, + "loss": 0.0, + "step": 32189 + }, + { + "epoch": 3.0036390781002145, + "grad_norm": NaN, + "learning_rate": 0.00015726595236498204, + "loss": 0.0, + "step": 32190 + }, + { + "epoch": 3.003732387795092, + "grad_norm": NaN, + "learning_rate": 0.00015725839745036406, + "loss": 0.0, + "step": 32191 + }, + { + "epoch": 3.0038256974899693, + "grad_norm": NaN, + "learning_rate": 0.00015725084251729015, + "loss": 0.0, + "step": 32192 + }, + { + "epoch": 3.0039190071848467, + "grad_norm": NaN, + "learning_rate": 0.0001572432875657795, + "loss": 0.0, + "step": 32193 + }, + { + "epoch": 3.0040123168797237, + "grad_norm": NaN, + "learning_rate": 0.00015723573259585131, + "loss": 0.0, + "step": 32194 + }, + { + "epoch": 3.004105626574601, + "grad_norm": NaN, + "learning_rate": 0.0001572281776075248, + "loss": 0.0, + "step": 32195 + }, + { + "epoch": 3.0041989362694785, + "grad_norm": NaN, + "learning_rate": 0.00015722062260081917, + "loss": 0.0, + "step": 32196 + }, + { + "epoch": 3.0042922459643555, + "grad_norm": NaN, + "learning_rate": 0.00015721306757575363, + "loss": 0.0, + "step": 32197 + }, + { + "epoch": 3.004385555659233, + "grad_norm": NaN, + "learning_rate": 0.00015720551253234745, + "loss": 0.0, + "step": 32198 + }, + { + "epoch": 3.0044788653541104, + "grad_norm": NaN, + "learning_rate": 0.0001571979574706197, + "loss": 0.0, + "step": 32199 + }, + { + "epoch": 3.0045721750489878, + "grad_norm": NaN, + "learning_rate": 0.0001571904023905897, + "loss": 0.0, + "step": 32200 + }, + { + "epoch": 3.0046654847438647, + "grad_norm": NaN, + "learning_rate": 0.00015718284729227672, + "loss": 0.0, + "step": 32201 + }, + { + "epoch": 3.004758794438742, + "grad_norm": NaN, + "learning_rate": 0.00015717529217569977, + "loss": 0.0, + "step": 32202 + }, + { + "epoch": 3.0048521041336196, + "grad_norm": NaN, + "learning_rate": 0.0001571677370408782, + "loss": 0.0, + "step": 32203 + }, + { + "epoch": 3.004945413828497, + "grad_norm": NaN, + "learning_rate": 0.0001571601818878313, + "loss": 0.0, + "step": 32204 + }, + { + "epoch": 3.005038723523374, + "grad_norm": NaN, + "learning_rate": 0.00015715262671657805, + "loss": 0.0, + "step": 32205 + }, + { + "epoch": 3.0051320332182514, + "grad_norm": NaN, + "learning_rate": 0.00015714507152713784, + "loss": 0.0, + "step": 32206 + }, + { + "epoch": 3.005225342913129, + "grad_norm": NaN, + "learning_rate": 0.00015713751631952984, + "loss": 0.0, + "step": 32207 + }, + { + "epoch": 3.005318652608006, + "grad_norm": NaN, + "learning_rate": 0.00015712996109377322, + "loss": 0.0, + "step": 32208 + }, + { + "epoch": 3.005411962302883, + "grad_norm": NaN, + "learning_rate": 0.00015712240584988724, + "loss": 0.0, + "step": 32209 + }, + { + "epoch": 3.0055052719977606, + "grad_norm": NaN, + "learning_rate": 0.0001571148505878911, + "loss": 0.0, + "step": 32210 + }, + { + "epoch": 3.005598581692638, + "grad_norm": NaN, + "learning_rate": 0.000157107295307804, + "loss": 0.0, + "step": 32211 + }, + { + "epoch": 3.005691891387515, + "grad_norm": NaN, + "learning_rate": 0.00015709974000964515, + "loss": 0.0, + "step": 32212 + }, + { + "epoch": 3.0057852010823924, + "grad_norm": NaN, + "learning_rate": 0.00015709218469343376, + "loss": 0.0, + "step": 32213 + }, + { + "epoch": 3.00587851077727, + "grad_norm": NaN, + "learning_rate": 0.00015708462935918907, + "loss": 0.0, + "step": 32214 + }, + { + "epoch": 3.005971820472147, + "grad_norm": NaN, + "learning_rate": 0.00015707707400693024, + "loss": 0.0, + "step": 32215 + }, + { + "epoch": 3.0060651301670243, + "grad_norm": NaN, + "learning_rate": 0.00015706951863667652, + "loss": 0.0, + "step": 32216 + }, + { + "epoch": 3.0061584398619017, + "grad_norm": NaN, + "learning_rate": 0.00015706196324844708, + "loss": 0.0, + "step": 32217 + }, + { + "epoch": 3.006251749556779, + "grad_norm": NaN, + "learning_rate": 0.0001570544078422612, + "loss": 0.0, + "step": 32218 + }, + { + "epoch": 3.006345059251656, + "grad_norm": NaN, + "learning_rate": 0.00015704685241813802, + "loss": 0.0, + "step": 32219 + }, + { + "epoch": 3.0064383689465335, + "grad_norm": NaN, + "learning_rate": 0.00015703929697609686, + "loss": 0.0, + "step": 32220 + }, + { + "epoch": 3.006531678641411, + "grad_norm": NaN, + "learning_rate": 0.00015703174151615674, + "loss": 0.0, + "step": 32221 + }, + { + "epoch": 3.0066249883362883, + "grad_norm": NaN, + "learning_rate": 0.00015702418603833702, + "loss": 0.0, + "step": 32222 + }, + { + "epoch": 3.0067182980311653, + "grad_norm": NaN, + "learning_rate": 0.00015701663054265688, + "loss": 0.0, + "step": 32223 + }, + { + "epoch": 3.0068116077260427, + "grad_norm": NaN, + "learning_rate": 0.00015700907502913555, + "loss": 0.0, + "step": 32224 + }, + { + "epoch": 3.00690491742092, + "grad_norm": NaN, + "learning_rate": 0.0001570015194977922, + "loss": 0.0, + "step": 32225 + }, + { + "epoch": 3.006998227115797, + "grad_norm": NaN, + "learning_rate": 0.00015699396394864605, + "loss": 0.0, + "step": 32226 + }, + { + "epoch": 3.0070915368106745, + "grad_norm": NaN, + "learning_rate": 0.00015698640838171635, + "loss": 0.0, + "step": 32227 + }, + { + "epoch": 3.007184846505552, + "grad_norm": NaN, + "learning_rate": 0.00015697885279702224, + "loss": 0.0, + "step": 32228 + }, + { + "epoch": 3.0072781562004294, + "grad_norm": NaN, + "learning_rate": 0.000156971297194583, + "loss": 0.0, + "step": 32229 + }, + { + "epoch": 3.0073714658953064, + "grad_norm": NaN, + "learning_rate": 0.00015696374157441785, + "loss": 0.0, + "step": 32230 + }, + { + "epoch": 3.0074647755901838, + "grad_norm": NaN, + "learning_rate": 0.00015695618593654592, + "loss": 0.0, + "step": 32231 + }, + { + "epoch": 3.007558085285061, + "grad_norm": NaN, + "learning_rate": 0.0001569486302809865, + "loss": 0.0, + "step": 32232 + }, + { + "epoch": 3.0076513949799386, + "grad_norm": NaN, + "learning_rate": 0.00015694107460775873, + "loss": 0.0, + "step": 32233 + }, + { + "epoch": 3.0077447046748156, + "grad_norm": NaN, + "learning_rate": 0.0001569335189168819, + "loss": 0.0, + "step": 32234 + }, + { + "epoch": 3.007838014369693, + "grad_norm": NaN, + "learning_rate": 0.00015692596320837518, + "loss": 0.0, + "step": 32235 + }, + { + "epoch": 3.0079313240645704, + "grad_norm": NaN, + "learning_rate": 0.0001569184074822578, + "loss": 0.0, + "step": 32236 + }, + { + "epoch": 3.0080246337594474, + "grad_norm": NaN, + "learning_rate": 0.00015691085173854893, + "loss": 0.0, + "step": 32237 + }, + { + "epoch": 3.008117943454325, + "grad_norm": NaN, + "learning_rate": 0.0001569032959772678, + "loss": 0.0, + "step": 32238 + }, + { + "epoch": 3.0082112531492022, + "grad_norm": NaN, + "learning_rate": 0.00015689574019843368, + "loss": 0.0, + "step": 32239 + }, + { + "epoch": 3.0083045628440797, + "grad_norm": NaN, + "learning_rate": 0.00015688818440206569, + "loss": 0.0, + "step": 32240 + }, + { + "epoch": 3.0083978725389566, + "grad_norm": NaN, + "learning_rate": 0.0001568806285881831, + "loss": 0.0, + "step": 32241 + }, + { + "epoch": 3.008491182233834, + "grad_norm": NaN, + "learning_rate": 0.00015687307275680514, + "loss": 0.0, + "step": 32242 + }, + { + "epoch": 3.0085844919287115, + "grad_norm": NaN, + "learning_rate": 0.00015686551690795097, + "loss": 0.0, + "step": 32243 + }, + { + "epoch": 3.008677801623589, + "grad_norm": NaN, + "learning_rate": 0.00015685796104163982, + "loss": 0.0, + "step": 32244 + }, + { + "epoch": 3.008771111318466, + "grad_norm": NaN, + "learning_rate": 0.0001568504051578909, + "loss": 0.0, + "step": 32245 + }, + { + "epoch": 3.0088644210133433, + "grad_norm": NaN, + "learning_rate": 0.00015684284925672346, + "loss": 0.0, + "step": 32246 + }, + { + "epoch": 3.0089577307082207, + "grad_norm": NaN, + "learning_rate": 0.00015683529333815663, + "loss": 0.0, + "step": 32247 + }, + { + "epoch": 3.0090510404030977, + "grad_norm": NaN, + "learning_rate": 0.0001568277374022097, + "loss": 0.0, + "step": 32248 + }, + { + "epoch": 3.009144350097975, + "grad_norm": NaN, + "learning_rate": 0.00015682018144890186, + "loss": 0.0, + "step": 32249 + }, + { + "epoch": 3.0092376597928525, + "grad_norm": NaN, + "learning_rate": 0.00015681262547825235, + "loss": 0.0, + "step": 32250 + }, + { + "epoch": 3.00933096948773, + "grad_norm": NaN, + "learning_rate": 0.0001568050694902803, + "loss": 0.0, + "step": 32251 + }, + { + "epoch": 3.009424279182607, + "grad_norm": NaN, + "learning_rate": 0.000156797513485005, + "loss": 0.0, + "step": 32252 + }, + { + "epoch": 3.0095175888774843, + "grad_norm": NaN, + "learning_rate": 0.00015678995746244562, + "loss": 0.0, + "step": 32253 + }, + { + "epoch": 3.0096108985723617, + "grad_norm": NaN, + "learning_rate": 0.00015678240142262144, + "loss": 0.0, + "step": 32254 + }, + { + "epoch": 3.009704208267239, + "grad_norm": NaN, + "learning_rate": 0.00015677484536555155, + "loss": 0.0, + "step": 32255 + }, + { + "epoch": 3.009797517962116, + "grad_norm": NaN, + "learning_rate": 0.00015676728929125527, + "loss": 0.0, + "step": 32256 + }, + { + "epoch": 3.0098908276569936, + "grad_norm": NaN, + "learning_rate": 0.00015675973319975177, + "loss": 0.0, + "step": 32257 + }, + { + "epoch": 3.009984137351871, + "grad_norm": NaN, + "learning_rate": 0.00015675217709106031, + "loss": 0.0, + "step": 32258 + }, + { + "epoch": 3.010077447046748, + "grad_norm": NaN, + "learning_rate": 0.00015674462096520003, + "loss": 0.0, + "step": 32259 + }, + { + "epoch": 3.0101707567416254, + "grad_norm": NaN, + "learning_rate": 0.00015673706482219018, + "loss": 0.0, + "step": 32260 + }, + { + "epoch": 3.010264066436503, + "grad_norm": NaN, + "learning_rate": 0.00015672950866204996, + "loss": 0.0, + "step": 32261 + }, + { + "epoch": 3.01035737613138, + "grad_norm": NaN, + "learning_rate": 0.00015672195248479857, + "loss": 0.0, + "step": 32262 + }, + { + "epoch": 3.010450685826257, + "grad_norm": NaN, + "learning_rate": 0.00015671439629045533, + "loss": 0.0, + "step": 32263 + }, + { + "epoch": 3.0105439955211346, + "grad_norm": NaN, + "learning_rate": 0.0001567068400790393, + "loss": 0.0, + "step": 32264 + }, + { + "epoch": 3.010637305216012, + "grad_norm": NaN, + "learning_rate": 0.00015669928385056976, + "loss": 0.0, + "step": 32265 + }, + { + "epoch": 3.0107306149108894, + "grad_norm": NaN, + "learning_rate": 0.00015669172760506594, + "loss": 0.0, + "step": 32266 + }, + { + "epoch": 3.0108239246057664, + "grad_norm": NaN, + "learning_rate": 0.00015668417134254706, + "loss": 0.0, + "step": 32267 + }, + { + "epoch": 3.010917234300644, + "grad_norm": NaN, + "learning_rate": 0.0001566766150630323, + "loss": 0.0, + "step": 32268 + }, + { + "epoch": 3.0110105439955213, + "grad_norm": NaN, + "learning_rate": 0.00015666905876654088, + "loss": 0.0, + "step": 32269 + }, + { + "epoch": 3.0111038536903982, + "grad_norm": NaN, + "learning_rate": 0.00015666150245309201, + "loss": 0.0, + "step": 32270 + }, + { + "epoch": 3.0111971633852757, + "grad_norm": NaN, + "learning_rate": 0.00015665394612270493, + "loss": 0.0, + "step": 32271 + }, + { + "epoch": 3.011290473080153, + "grad_norm": NaN, + "learning_rate": 0.0001566463897753988, + "loss": 0.0, + "step": 32272 + }, + { + "epoch": 3.0113837827750305, + "grad_norm": NaN, + "learning_rate": 0.00015663883341119294, + "loss": 0.0, + "step": 32273 + }, + { + "epoch": 3.0114770924699075, + "grad_norm": NaN, + "learning_rate": 0.00015663127703010644, + "loss": 0.0, + "step": 32274 + }, + { + "epoch": 3.011570402164785, + "grad_norm": NaN, + "learning_rate": 0.0001566237206321586, + "loss": 0.0, + "step": 32275 + }, + { + "epoch": 3.0116637118596623, + "grad_norm": NaN, + "learning_rate": 0.00015661616421736856, + "loss": 0.0, + "step": 32276 + }, + { + "epoch": 3.0117570215545397, + "grad_norm": NaN, + "learning_rate": 0.0001566086077857556, + "loss": 0.0, + "step": 32277 + }, + { + "epoch": 3.0118503312494167, + "grad_norm": NaN, + "learning_rate": 0.00015660105133733892, + "loss": 0.0, + "step": 32278 + }, + { + "epoch": 3.011943640944294, + "grad_norm": NaN, + "learning_rate": 0.0001565934948721377, + "loss": 0.0, + "step": 32279 + }, + { + "epoch": 3.0120369506391715, + "grad_norm": NaN, + "learning_rate": 0.00015658593839017117, + "loss": 0.0, + "step": 32280 + }, + { + "epoch": 3.0121302603340485, + "grad_norm": NaN, + "learning_rate": 0.00015657838189145855, + "loss": 0.0, + "step": 32281 + }, + { + "epoch": 3.012223570028926, + "grad_norm": NaN, + "learning_rate": 0.00015657082537601905, + "loss": 0.0, + "step": 32282 + }, + { + "epoch": 3.0123168797238034, + "grad_norm": NaN, + "learning_rate": 0.0001565632688438719, + "loss": 0.0, + "step": 32283 + }, + { + "epoch": 3.0124101894186808, + "grad_norm": NaN, + "learning_rate": 0.00015655571229503633, + "loss": 0.0, + "step": 32284 + }, + { + "epoch": 3.0125034991135577, + "grad_norm": NaN, + "learning_rate": 0.0001565481557295315, + "loss": 0.0, + "step": 32285 + }, + { + "epoch": 3.012596808808435, + "grad_norm": NaN, + "learning_rate": 0.0001565405991473766, + "loss": 0.0, + "step": 32286 + }, + { + "epoch": 3.0126901185033126, + "grad_norm": NaN, + "learning_rate": 0.00015653304254859097, + "loss": 0.0, + "step": 32287 + }, + { + "epoch": 3.01278342819819, + "grad_norm": NaN, + "learning_rate": 0.00015652548593319372, + "loss": 0.0, + "step": 32288 + }, + { + "epoch": 3.012876737893067, + "grad_norm": NaN, + "learning_rate": 0.00015651792930120408, + "loss": 0.0, + "step": 32289 + }, + { + "epoch": 3.0129700475879444, + "grad_norm": NaN, + "learning_rate": 0.00015651037265264128, + "loss": 0.0, + "step": 32290 + }, + { + "epoch": 3.013063357282822, + "grad_norm": NaN, + "learning_rate": 0.00015650281598752453, + "loss": 0.0, + "step": 32291 + }, + { + "epoch": 3.013156666977699, + "grad_norm": NaN, + "learning_rate": 0.0001564952593058731, + "loss": 0.0, + "step": 32292 + }, + { + "epoch": 3.013249976672576, + "grad_norm": NaN, + "learning_rate": 0.00015648770260770606, + "loss": 0.0, + "step": 32293 + }, + { + "epoch": 3.0133432863674536, + "grad_norm": NaN, + "learning_rate": 0.00015648014589304278, + "loss": 0.0, + "step": 32294 + }, + { + "epoch": 3.013436596062331, + "grad_norm": NaN, + "learning_rate": 0.00015647258916190237, + "loss": 0.0, + "step": 32295 + }, + { + "epoch": 3.013529905757208, + "grad_norm": NaN, + "learning_rate": 0.00015646503241430412, + "loss": 0.0, + "step": 32296 + }, + { + "epoch": 3.0136232154520854, + "grad_norm": NaN, + "learning_rate": 0.00015645747565026715, + "loss": 0.0, + "step": 32297 + }, + { + "epoch": 3.013716525146963, + "grad_norm": NaN, + "learning_rate": 0.00015644991886981077, + "loss": 0.0, + "step": 32298 + }, + { + "epoch": 3.0138098348418403, + "grad_norm": NaN, + "learning_rate": 0.00015644236207295418, + "loss": 0.0, + "step": 32299 + }, + { + "epoch": 3.0139031445367173, + "grad_norm": NaN, + "learning_rate": 0.00015643480525971655, + "loss": 0.0, + "step": 32300 + }, + { + "epoch": 3.0139964542315947, + "grad_norm": NaN, + "learning_rate": 0.0001564272484301171, + "loss": 0.0, + "step": 32301 + }, + { + "epoch": 3.014089763926472, + "grad_norm": NaN, + "learning_rate": 0.00015641969158417505, + "loss": 0.0, + "step": 32302 + }, + { + "epoch": 3.014183073621349, + "grad_norm": NaN, + "learning_rate": 0.0001564121347219097, + "loss": 0.0, + "step": 32303 + }, + { + "epoch": 3.0142763833162265, + "grad_norm": NaN, + "learning_rate": 0.00015640457784334011, + "loss": 0.0, + "step": 32304 + }, + { + "epoch": 3.014369693011104, + "grad_norm": NaN, + "learning_rate": 0.0001563970209484856, + "loss": 0.0, + "step": 32305 + }, + { + "epoch": 3.0144630027059813, + "grad_norm": NaN, + "learning_rate": 0.0001563894640373654, + "loss": 0.0, + "step": 32306 + }, + { + "epoch": 3.0145563124008583, + "grad_norm": NaN, + "learning_rate": 0.00015638190710999864, + "loss": 0.0, + "step": 32307 + }, + { + "epoch": 3.0146496220957357, + "grad_norm": NaN, + "learning_rate": 0.00015637435016640458, + "loss": 0.0, + "step": 32308 + }, + { + "epoch": 3.014742931790613, + "grad_norm": NaN, + "learning_rate": 0.00015636679320660247, + "loss": 0.0, + "step": 32309 + }, + { + "epoch": 3.01483624148549, + "grad_norm": NaN, + "learning_rate": 0.00015635923623061145, + "loss": 0.0, + "step": 32310 + }, + { + "epoch": 3.0149295511803675, + "grad_norm": NaN, + "learning_rate": 0.0001563516792384508, + "loss": 0.0, + "step": 32311 + }, + { + "epoch": 3.015022860875245, + "grad_norm": NaN, + "learning_rate": 0.0001563441222301397, + "loss": 0.0, + "step": 32312 + }, + { + "epoch": 3.0151161705701224, + "grad_norm": NaN, + "learning_rate": 0.0001563365652056974, + "loss": 0.0, + "step": 32313 + }, + { + "epoch": 3.0152094802649994, + "grad_norm": NaN, + "learning_rate": 0.00015632900816514307, + "loss": 0.0, + "step": 32314 + }, + { + "epoch": 3.0153027899598768, + "grad_norm": NaN, + "learning_rate": 0.00015632145110849595, + "loss": 0.0, + "step": 32315 + }, + { + "epoch": 3.015396099654754, + "grad_norm": NaN, + "learning_rate": 0.00015631389403577525, + "loss": 0.0, + "step": 32316 + }, + { + "epoch": 3.0154894093496316, + "grad_norm": NaN, + "learning_rate": 0.0001563063369470002, + "loss": 0.0, + "step": 32317 + }, + { + "epoch": 3.0155827190445086, + "grad_norm": NaN, + "learning_rate": 0.00015629877984219, + "loss": 0.0, + "step": 32318 + }, + { + "epoch": 3.015676028739386, + "grad_norm": NaN, + "learning_rate": 0.0001562912227213639, + "loss": 0.0, + "step": 32319 + }, + { + "epoch": 3.0157693384342634, + "grad_norm": NaN, + "learning_rate": 0.00015628366558454102, + "loss": 0.0, + "step": 32320 + }, + { + "epoch": 3.0158626481291404, + "grad_norm": NaN, + "learning_rate": 0.00015627610843174064, + "loss": 0.0, + "step": 32321 + }, + { + "epoch": 3.015955957824018, + "grad_norm": NaN, + "learning_rate": 0.00015626855126298206, + "loss": 0.0, + "step": 32322 + }, + { + "epoch": 3.0160492675188952, + "grad_norm": NaN, + "learning_rate": 0.00015626099407828431, + "loss": 0.0, + "step": 32323 + }, + { + "epoch": 3.0161425772137727, + "grad_norm": NaN, + "learning_rate": 0.00015625343687766677, + "loss": 0.0, + "step": 32324 + }, + { + "epoch": 3.0162358869086496, + "grad_norm": NaN, + "learning_rate": 0.0001562458796611486, + "loss": 0.0, + "step": 32325 + }, + { + "epoch": 3.016329196603527, + "grad_norm": NaN, + "learning_rate": 0.00015623832242874896, + "loss": 0.0, + "step": 32326 + }, + { + "epoch": 3.0164225062984045, + "grad_norm": NaN, + "learning_rate": 0.00015623076518048713, + "loss": 0.0, + "step": 32327 + }, + { + "epoch": 3.016515815993282, + "grad_norm": NaN, + "learning_rate": 0.00015622320791638234, + "loss": 0.0, + "step": 32328 + }, + { + "epoch": 3.016609125688159, + "grad_norm": NaN, + "learning_rate": 0.00015621565063645375, + "loss": 0.0, + "step": 32329 + }, + { + "epoch": 3.0167024353830363, + "grad_norm": NaN, + "learning_rate": 0.00015620809334072056, + "loss": 0.0, + "step": 32330 + }, + { + "epoch": 3.0167957450779137, + "grad_norm": NaN, + "learning_rate": 0.0001562005360292021, + "loss": 0.0, + "step": 32331 + }, + { + "epoch": 3.0168890547727907, + "grad_norm": NaN, + "learning_rate": 0.00015619297870191748, + "loss": 0.0, + "step": 32332 + }, + { + "epoch": 3.016982364467668, + "grad_norm": NaN, + "learning_rate": 0.00015618542135888594, + "loss": 0.0, + "step": 32333 + }, + { + "epoch": 3.0170756741625455, + "grad_norm": NaN, + "learning_rate": 0.00015617786400012673, + "loss": 0.0, + "step": 32334 + }, + { + "epoch": 3.017168983857423, + "grad_norm": NaN, + "learning_rate": 0.00015617030662565904, + "loss": 0.0, + "step": 32335 + }, + { + "epoch": 3.0172622935523, + "grad_norm": NaN, + "learning_rate": 0.0001561627492355021, + "loss": 0.0, + "step": 32336 + }, + { + "epoch": 3.0173556032471773, + "grad_norm": NaN, + "learning_rate": 0.00015615519182967508, + "loss": 0.0, + "step": 32337 + }, + { + "epoch": 3.0174489129420548, + "grad_norm": NaN, + "learning_rate": 0.00015614763440819723, + "loss": 0.0, + "step": 32338 + }, + { + "epoch": 3.017542222636932, + "grad_norm": NaN, + "learning_rate": 0.00015614007697108777, + "loss": 0.0, + "step": 32339 + }, + { + "epoch": 3.017635532331809, + "grad_norm": NaN, + "learning_rate": 0.00015613251951836595, + "loss": 0.0, + "step": 32340 + }, + { + "epoch": 3.0177288420266866, + "grad_norm": NaN, + "learning_rate": 0.00015612496205005092, + "loss": 0.0, + "step": 32341 + }, + { + "epoch": 3.017822151721564, + "grad_norm": NaN, + "learning_rate": 0.00015611740456616193, + "loss": 0.0, + "step": 32342 + }, + { + "epoch": 3.017915461416441, + "grad_norm": NaN, + "learning_rate": 0.0001561098470667182, + "loss": 0.0, + "step": 32343 + }, + { + "epoch": 3.0180087711113184, + "grad_norm": NaN, + "learning_rate": 0.0001561022895517389, + "loss": 0.0, + "step": 32344 + }, + { + "epoch": 3.018102080806196, + "grad_norm": NaN, + "learning_rate": 0.00015609473202124336, + "loss": 0.0, + "step": 32345 + }, + { + "epoch": 3.018195390501073, + "grad_norm": NaN, + "learning_rate": 0.00015608717447525067, + "loss": 0.0, + "step": 32346 + }, + { + "epoch": 3.01828870019595, + "grad_norm": NaN, + "learning_rate": 0.0001560796169137801, + "loss": 0.0, + "step": 32347 + }, + { + "epoch": 3.0183820098908276, + "grad_norm": NaN, + "learning_rate": 0.00015607205933685085, + "loss": 0.0, + "step": 32348 + }, + { + "epoch": 3.018475319585705, + "grad_norm": NaN, + "learning_rate": 0.0001560645017444822, + "loss": 0.0, + "step": 32349 + }, + { + "epoch": 3.0185686292805824, + "grad_norm": NaN, + "learning_rate": 0.0001560569441366933, + "loss": 0.0, + "step": 32350 + }, + { + "epoch": 3.0186619389754594, + "grad_norm": NaN, + "learning_rate": 0.0001560493865135034, + "loss": 0.0, + "step": 32351 + }, + { + "epoch": 3.018755248670337, + "grad_norm": NaN, + "learning_rate": 0.00015604182887493163, + "loss": 0.0, + "step": 32352 + }, + { + "epoch": 3.0188485583652143, + "grad_norm": NaN, + "learning_rate": 0.00015603427122099734, + "loss": 0.0, + "step": 32353 + }, + { + "epoch": 3.0189418680600912, + "grad_norm": NaN, + "learning_rate": 0.00015602671355171966, + "loss": 0.0, + "step": 32354 + }, + { + "epoch": 3.0190351777549687, + "grad_norm": NaN, + "learning_rate": 0.00015601915586711785, + "loss": 0.0, + "step": 32355 + }, + { + "epoch": 3.019128487449846, + "grad_norm": NaN, + "learning_rate": 0.00015601159816721114, + "loss": 0.0, + "step": 32356 + }, + { + "epoch": 3.0192217971447235, + "grad_norm": NaN, + "learning_rate": 0.00015600404045201864, + "loss": 0.0, + "step": 32357 + }, + { + "epoch": 3.0193151068396005, + "grad_norm": NaN, + "learning_rate": 0.00015599648272155973, + "loss": 0.0, + "step": 32358 + }, + { + "epoch": 3.019408416534478, + "grad_norm": NaN, + "learning_rate": 0.00015598892497585348, + "loss": 0.0, + "step": 32359 + }, + { + "epoch": 3.0195017262293553, + "grad_norm": NaN, + "learning_rate": 0.00015598136721491917, + "loss": 0.0, + "step": 32360 + }, + { + "epoch": 3.0195950359242327, + "grad_norm": NaN, + "learning_rate": 0.00015597380943877605, + "loss": 0.0, + "step": 32361 + }, + { + "epoch": 3.0196883456191097, + "grad_norm": NaN, + "learning_rate": 0.00015596625164744328, + "loss": 0.0, + "step": 32362 + }, + { + "epoch": 3.019781655313987, + "grad_norm": NaN, + "learning_rate": 0.00015595869384094012, + "loss": 0.0, + "step": 32363 + }, + { + "epoch": 3.0198749650088645, + "grad_norm": NaN, + "learning_rate": 0.00015595113601928574, + "loss": 0.0, + "step": 32364 + }, + { + "epoch": 3.0199682747037415, + "grad_norm": NaN, + "learning_rate": 0.0001559435781824994, + "loss": 0.0, + "step": 32365 + }, + { + "epoch": 3.020061584398619, + "grad_norm": NaN, + "learning_rate": 0.0001559360203306003, + "loss": 0.0, + "step": 32366 + }, + { + "epoch": 3.0201548940934964, + "grad_norm": NaN, + "learning_rate": 0.00015592846246360764, + "loss": 0.0, + "step": 32367 + }, + { + "epoch": 3.0202482037883738, + "grad_norm": NaN, + "learning_rate": 0.00015592090458154068, + "loss": 0.0, + "step": 32368 + }, + { + "epoch": 3.0203415134832508, + "grad_norm": NaN, + "learning_rate": 0.0001559133466844186, + "loss": 0.0, + "step": 32369 + }, + { + "epoch": 3.020434823178128, + "grad_norm": NaN, + "learning_rate": 0.0001559057887722606, + "loss": 0.0, + "step": 32370 + }, + { + "epoch": 3.0205281328730056, + "grad_norm": NaN, + "learning_rate": 0.00015589823084508593, + "loss": 0.0, + "step": 32371 + }, + { + "epoch": 3.020621442567883, + "grad_norm": NaN, + "learning_rate": 0.0001558906729029139, + "loss": 0.0, + "step": 32372 + }, + { + "epoch": 3.02071475226276, + "grad_norm": NaN, + "learning_rate": 0.00015588311494576357, + "loss": 0.0, + "step": 32373 + }, + { + "epoch": 3.0208080619576374, + "grad_norm": NaN, + "learning_rate": 0.00015587555697365416, + "loss": 0.0, + "step": 32374 + }, + { + "epoch": 3.020901371652515, + "grad_norm": NaN, + "learning_rate": 0.00015586799898660507, + "loss": 0.0, + "step": 32375 + }, + { + "epoch": 3.020994681347392, + "grad_norm": NaN, + "learning_rate": 0.00015586044098463535, + "loss": 0.0, + "step": 32376 + }, + { + "epoch": 3.021087991042269, + "grad_norm": NaN, + "learning_rate": 0.00015585288296776422, + "loss": 0.0, + "step": 32377 + }, + { + "epoch": 3.0211813007371466, + "grad_norm": NaN, + "learning_rate": 0.000155845324936011, + "loss": 0.0, + "step": 32378 + }, + { + "epoch": 3.021274610432024, + "grad_norm": NaN, + "learning_rate": 0.00015583776688939482, + "loss": 0.0, + "step": 32379 + }, + { + "epoch": 3.021367920126901, + "grad_norm": NaN, + "learning_rate": 0.00015583020882793493, + "loss": 0.0, + "step": 32380 + }, + { + "epoch": 3.0214612298217784, + "grad_norm": NaN, + "learning_rate": 0.00015582265075165055, + "loss": 0.0, + "step": 32381 + }, + { + "epoch": 3.021554539516656, + "grad_norm": NaN, + "learning_rate": 0.0001558150926605609, + "loss": 0.0, + "step": 32382 + }, + { + "epoch": 3.0216478492115333, + "grad_norm": NaN, + "learning_rate": 0.00015580753455468518, + "loss": 0.0, + "step": 32383 + }, + { + "epoch": 3.0217411589064103, + "grad_norm": NaN, + "learning_rate": 0.00015579997643404262, + "loss": 0.0, + "step": 32384 + }, + { + "epoch": 3.0218344686012877, + "grad_norm": NaN, + "learning_rate": 0.00015579241829865245, + "loss": 0.0, + "step": 32385 + }, + { + "epoch": 3.021927778296165, + "grad_norm": NaN, + "learning_rate": 0.00015578486014853387, + "loss": 0.0, + "step": 32386 + }, + { + "epoch": 3.022021087991042, + "grad_norm": NaN, + "learning_rate": 0.00015577730198370612, + "loss": 0.0, + "step": 32387 + }, + { + "epoch": 3.0221143976859195, + "grad_norm": NaN, + "learning_rate": 0.00015576974380418837, + "loss": 0.0, + "step": 32388 + }, + { + "epoch": 3.022207707380797, + "grad_norm": NaN, + "learning_rate": 0.0001557621856099999, + "loss": 0.0, + "step": 32389 + }, + { + "epoch": 3.0223010170756743, + "grad_norm": NaN, + "learning_rate": 0.00015575462740115983, + "loss": 0.0, + "step": 32390 + }, + { + "epoch": 3.0223943267705513, + "grad_norm": NaN, + "learning_rate": 0.00015574706917768756, + "loss": 0.0, + "step": 32391 + }, + { + "epoch": 3.0224876364654287, + "grad_norm": NaN, + "learning_rate": 0.00015573951093960213, + "loss": 0.0, + "step": 32392 + }, + { + "epoch": 3.022580946160306, + "grad_norm": NaN, + "learning_rate": 0.0001557319526869228, + "loss": 0.0, + "step": 32393 + }, + { + "epoch": 3.0226742558551836, + "grad_norm": NaN, + "learning_rate": 0.00015572439441966887, + "loss": 0.0, + "step": 32394 + }, + { + "epoch": 3.0227675655500605, + "grad_norm": NaN, + "learning_rate": 0.00015571683613785947, + "loss": 0.0, + "step": 32395 + }, + { + "epoch": 3.022860875244938, + "grad_norm": NaN, + "learning_rate": 0.00015570927784151384, + "loss": 0.0, + "step": 32396 + }, + { + "epoch": 3.0229541849398154, + "grad_norm": NaN, + "learning_rate": 0.00015570171953065127, + "loss": 0.0, + "step": 32397 + }, + { + "epoch": 3.0230474946346924, + "grad_norm": NaN, + "learning_rate": 0.00015569416120529087, + "loss": 0.0, + "step": 32398 + }, + { + "epoch": 3.0231408043295698, + "grad_norm": NaN, + "learning_rate": 0.00015568660286545185, + "loss": 0.0, + "step": 32399 + }, + { + "epoch": 3.023234114024447, + "grad_norm": NaN, + "learning_rate": 0.0001556790445111536, + "loss": 0.0, + "step": 32400 + }, + { + "epoch": 3.0233274237193246, + "grad_norm": NaN, + "learning_rate": 0.00015567148614241514, + "loss": 0.0, + "step": 32401 + }, + { + "epoch": 3.0234207334142016, + "grad_norm": NaN, + "learning_rate": 0.00015566392775925576, + "loss": 0.0, + "step": 32402 + }, + { + "epoch": 3.023514043109079, + "grad_norm": NaN, + "learning_rate": 0.00015565636936169474, + "loss": 0.0, + "step": 32403 + }, + { + "epoch": 3.0236073528039564, + "grad_norm": NaN, + "learning_rate": 0.00015564881094975124, + "loss": 0.0, + "step": 32404 + }, + { + "epoch": 3.023700662498834, + "grad_norm": NaN, + "learning_rate": 0.00015564125252344447, + "loss": 0.0, + "step": 32405 + }, + { + "epoch": 3.023793972193711, + "grad_norm": NaN, + "learning_rate": 0.00015563369408279365, + "loss": 0.0, + "step": 32406 + }, + { + "epoch": 3.0238872818885882, + "grad_norm": NaN, + "learning_rate": 0.00015562613562781805, + "loss": 0.0, + "step": 32407 + }, + { + "epoch": 3.0239805915834657, + "grad_norm": NaN, + "learning_rate": 0.00015561857715853684, + "loss": 0.0, + "step": 32408 + }, + { + "epoch": 3.0240739012783426, + "grad_norm": NaN, + "learning_rate": 0.0001556110186749692, + "loss": 0.0, + "step": 32409 + }, + { + "epoch": 3.02416721097322, + "grad_norm": NaN, + "learning_rate": 0.00015560346017713452, + "loss": 0.0, + "step": 32410 + }, + { + "epoch": 3.0242605206680975, + "grad_norm": NaN, + "learning_rate": 0.00015559590166505183, + "loss": 0.0, + "step": 32411 + }, + { + "epoch": 3.024353830362975, + "grad_norm": NaN, + "learning_rate": 0.0001555883431387404, + "loss": 0.0, + "step": 32412 + }, + { + "epoch": 3.024447140057852, + "grad_norm": NaN, + "learning_rate": 0.0001555807845982195, + "loss": 0.0, + "step": 32413 + }, + { + "epoch": 3.0245404497527293, + "grad_norm": NaN, + "learning_rate": 0.00015557322604350832, + "loss": 0.0, + "step": 32414 + }, + { + "epoch": 3.0246337594476067, + "grad_norm": NaN, + "learning_rate": 0.00015556566747462603, + "loss": 0.0, + "step": 32415 + }, + { + "epoch": 3.024727069142484, + "grad_norm": NaN, + "learning_rate": 0.00015555810889159198, + "loss": 0.0, + "step": 32416 + }, + { + "epoch": 3.024820378837361, + "grad_norm": NaN, + "learning_rate": 0.00015555055029442525, + "loss": 0.0, + "step": 32417 + }, + { + "epoch": 3.0249136885322385, + "grad_norm": NaN, + "learning_rate": 0.0001555429916831451, + "loss": 0.0, + "step": 32418 + }, + { + "epoch": 3.025006998227116, + "grad_norm": NaN, + "learning_rate": 0.00015553543305777082, + "loss": 0.0, + "step": 32419 + }, + { + "epoch": 3.025100307921993, + "grad_norm": NaN, + "learning_rate": 0.00015552787441832155, + "loss": 0.0, + "step": 32420 + }, + { + "epoch": 3.0251936176168703, + "grad_norm": NaN, + "learning_rate": 0.00015552031576481648, + "loss": 0.0, + "step": 32421 + }, + { + "epoch": 3.0252869273117478, + "grad_norm": NaN, + "learning_rate": 0.00015551275709727497, + "loss": 0.0, + "step": 32422 + }, + { + "epoch": 3.025380237006625, + "grad_norm": NaN, + "learning_rate": 0.0001555051984157161, + "loss": 0.0, + "step": 32423 + }, + { + "epoch": 3.025473546701502, + "grad_norm": NaN, + "learning_rate": 0.00015549763972015913, + "loss": 0.0, + "step": 32424 + }, + { + "epoch": 3.0255668563963796, + "grad_norm": NaN, + "learning_rate": 0.00015549008101062333, + "loss": 0.0, + "step": 32425 + }, + { + "epoch": 3.025660166091257, + "grad_norm": NaN, + "learning_rate": 0.00015548252228712785, + "loss": 0.0, + "step": 32426 + }, + { + "epoch": 3.025753475786134, + "grad_norm": NaN, + "learning_rate": 0.00015547496354969195, + "loss": 0.0, + "step": 32427 + }, + { + "epoch": 3.0258467854810114, + "grad_norm": NaN, + "learning_rate": 0.0001554674047983348, + "loss": 0.0, + "step": 32428 + }, + { + "epoch": 3.025940095175889, + "grad_norm": NaN, + "learning_rate": 0.00015545984603307575, + "loss": 0.0, + "step": 32429 + }, + { + "epoch": 3.026033404870766, + "grad_norm": NaN, + "learning_rate": 0.00015545228725393387, + "loss": 0.0, + "step": 32430 + }, + { + "epoch": 3.026126714565643, + "grad_norm": NaN, + "learning_rate": 0.00015544472846092843, + "loss": 0.0, + "step": 32431 + }, + { + "epoch": 3.0262200242605206, + "grad_norm": NaN, + "learning_rate": 0.00015543716965407868, + "loss": 0.0, + "step": 32432 + }, + { + "epoch": 3.026313333955398, + "grad_norm": NaN, + "learning_rate": 0.00015542961083340383, + "loss": 0.0, + "step": 32433 + }, + { + "epoch": 3.0264066436502755, + "grad_norm": NaN, + "learning_rate": 0.00015542205199892305, + "loss": 0.0, + "step": 32434 + }, + { + "epoch": 3.0264999533451524, + "grad_norm": NaN, + "learning_rate": 0.00015541449315065563, + "loss": 0.0, + "step": 32435 + }, + { + "epoch": 3.02659326304003, + "grad_norm": NaN, + "learning_rate": 0.00015540693428862078, + "loss": 0.0, + "step": 32436 + }, + { + "epoch": 3.0266865727349073, + "grad_norm": NaN, + "learning_rate": 0.0001553993754128376, + "loss": 0.0, + "step": 32437 + }, + { + "epoch": 3.0267798824297842, + "grad_norm": NaN, + "learning_rate": 0.00015539181652332552, + "loss": 0.0, + "step": 32438 + }, + { + "epoch": 3.0268731921246617, + "grad_norm": NaN, + "learning_rate": 0.0001553842576201036, + "loss": 0.0, + "step": 32439 + }, + { + "epoch": 3.026966501819539, + "grad_norm": NaN, + "learning_rate": 0.00015537669870319107, + "loss": 0.0, + "step": 32440 + }, + { + "epoch": 3.0270598115144165, + "grad_norm": NaN, + "learning_rate": 0.00015536913977260723, + "loss": 0.0, + "step": 32441 + }, + { + "epoch": 3.0271531212092935, + "grad_norm": NaN, + "learning_rate": 0.00015536158082837126, + "loss": 0.0, + "step": 32442 + }, + { + "epoch": 3.027246430904171, + "grad_norm": NaN, + "learning_rate": 0.00015535402187050236, + "loss": 0.0, + "step": 32443 + }, + { + "epoch": 3.0273397405990483, + "grad_norm": NaN, + "learning_rate": 0.00015534646289901981, + "loss": 0.0, + "step": 32444 + }, + { + "epoch": 3.0274330502939257, + "grad_norm": NaN, + "learning_rate": 0.00015533890391394276, + "loss": 0.0, + "step": 32445 + }, + { + "epoch": 3.0275263599888027, + "grad_norm": NaN, + "learning_rate": 0.00015533134491529046, + "loss": 0.0, + "step": 32446 + }, + { + "epoch": 3.02761966968368, + "grad_norm": NaN, + "learning_rate": 0.0001553237859030821, + "loss": 0.0, + "step": 32447 + }, + { + "epoch": 3.0277129793785575, + "grad_norm": NaN, + "learning_rate": 0.000155316226877337, + "loss": 0.0, + "step": 32448 + }, + { + "epoch": 3.0278062890734345, + "grad_norm": NaN, + "learning_rate": 0.00015530866783807423, + "loss": 0.0, + "step": 32449 + }, + { + "epoch": 3.027899598768312, + "grad_norm": NaN, + "learning_rate": 0.0001553011087853131, + "loss": 0.0, + "step": 32450 + }, + { + "epoch": 3.0279929084631894, + "grad_norm": NaN, + "learning_rate": 0.0001552935497190729, + "loss": 0.0, + "step": 32451 + }, + { + "epoch": 3.028086218158067, + "grad_norm": NaN, + "learning_rate": 0.0001552859906393727, + "loss": 0.0, + "step": 32452 + }, + { + "epoch": 3.0281795278529438, + "grad_norm": NaN, + "learning_rate": 0.0001552784315462318, + "loss": 0.0, + "step": 32453 + }, + { + "epoch": 3.028272837547821, + "grad_norm": NaN, + "learning_rate": 0.00015527087243966944, + "loss": 0.0, + "step": 32454 + }, + { + "epoch": 3.0283661472426986, + "grad_norm": NaN, + "learning_rate": 0.0001552633133197048, + "loss": 0.0, + "step": 32455 + }, + { + "epoch": 3.028459456937576, + "grad_norm": NaN, + "learning_rate": 0.00015525575418635706, + "loss": 0.0, + "step": 32456 + }, + { + "epoch": 3.028552766632453, + "grad_norm": NaN, + "learning_rate": 0.0001552481950396456, + "loss": 0.0, + "step": 32457 + }, + { + "epoch": 3.0286460763273304, + "grad_norm": NaN, + "learning_rate": 0.00015524063587958945, + "loss": 0.0, + "step": 32458 + }, + { + "epoch": 3.028739386022208, + "grad_norm": NaN, + "learning_rate": 0.0001552330767062079, + "loss": 0.0, + "step": 32459 + }, + { + "epoch": 3.028832695717085, + "grad_norm": NaN, + "learning_rate": 0.00015522551751952029, + "loss": 0.0, + "step": 32460 + }, + { + "epoch": 3.028926005411962, + "grad_norm": NaN, + "learning_rate": 0.00015521795831954568, + "loss": 0.0, + "step": 32461 + }, + { + "epoch": 3.0290193151068396, + "grad_norm": NaN, + "learning_rate": 0.0001552103991063033, + "loss": 0.0, + "step": 32462 + }, + { + "epoch": 3.029112624801717, + "grad_norm": NaN, + "learning_rate": 0.00015520283987981246, + "loss": 0.0, + "step": 32463 + }, + { + "epoch": 3.029205934496594, + "grad_norm": NaN, + "learning_rate": 0.0001551952806400924, + "loss": 0.0, + "step": 32464 + }, + { + "epoch": 3.0292992441914715, + "grad_norm": NaN, + "learning_rate": 0.00015518772138716215, + "loss": 0.0, + "step": 32465 + }, + { + "epoch": 3.029392553886349, + "grad_norm": NaN, + "learning_rate": 0.00015518016212104113, + "loss": 0.0, + "step": 32466 + }, + { + "epoch": 3.0294858635812263, + "grad_norm": NaN, + "learning_rate": 0.00015517260284174855, + "loss": 0.0, + "step": 32467 + }, + { + "epoch": 3.0295791732761033, + "grad_norm": NaN, + "learning_rate": 0.0001551650435493035, + "loss": 0.0, + "step": 32468 + }, + { + "epoch": 3.0296724829709807, + "grad_norm": NaN, + "learning_rate": 0.00015515748424372526, + "loss": 0.0, + "step": 32469 + }, + { + "epoch": 3.029765792665858, + "grad_norm": NaN, + "learning_rate": 0.00015514992492503314, + "loss": 0.0, + "step": 32470 + }, + { + "epoch": 3.029859102360735, + "grad_norm": NaN, + "learning_rate": 0.00015514236559324626, + "loss": 0.0, + "step": 32471 + }, + { + "epoch": 3.0299524120556125, + "grad_norm": NaN, + "learning_rate": 0.0001551348062483838, + "loss": 0.0, + "step": 32472 + }, + { + "epoch": 3.03004572175049, + "grad_norm": NaN, + "learning_rate": 0.00015512724689046515, + "loss": 0.0, + "step": 32473 + }, + { + "epoch": 3.0301390314453673, + "grad_norm": NaN, + "learning_rate": 0.00015511968751950934, + "loss": 0.0, + "step": 32474 + }, + { + "epoch": 3.0302323411402443, + "grad_norm": NaN, + "learning_rate": 0.0001551121281355357, + "loss": 0.0, + "step": 32475 + }, + { + "epoch": 3.0303256508351217, + "grad_norm": NaN, + "learning_rate": 0.0001551045687385635, + "loss": 0.0, + "step": 32476 + }, + { + "epoch": 3.030418960529999, + "grad_norm": NaN, + "learning_rate": 0.0001550970093286118, + "loss": 0.0, + "step": 32477 + }, + { + "epoch": 3.0305122702248766, + "grad_norm": NaN, + "learning_rate": 0.00015508944990569998, + "loss": 0.0, + "step": 32478 + }, + { + "epoch": 3.0306055799197535, + "grad_norm": NaN, + "learning_rate": 0.0001550818904698472, + "loss": 0.0, + "step": 32479 + }, + { + "epoch": 3.030698889614631, + "grad_norm": NaN, + "learning_rate": 0.00015507433102107264, + "loss": 0.0, + "step": 32480 + }, + { + "epoch": 3.0307921993095084, + "grad_norm": NaN, + "learning_rate": 0.00015506677155939554, + "loss": 0.0, + "step": 32481 + }, + { + "epoch": 3.0308855090043854, + "grad_norm": NaN, + "learning_rate": 0.0001550592120848352, + "loss": 0.0, + "step": 32482 + }, + { + "epoch": 3.0309788186992628, + "grad_norm": NaN, + "learning_rate": 0.00015505165259741078, + "loss": 0.0, + "step": 32483 + }, + { + "epoch": 3.03107212839414, + "grad_norm": NaN, + "learning_rate": 0.00015504409309714145, + "loss": 0.0, + "step": 32484 + }, + { + "epoch": 3.0311654380890176, + "grad_norm": NaN, + "learning_rate": 0.00015503653358404652, + "loss": 0.0, + "step": 32485 + }, + { + "epoch": 3.0312587477838946, + "grad_norm": NaN, + "learning_rate": 0.00015502897405814524, + "loss": 0.0, + "step": 32486 + }, + { + "epoch": 3.031352057478772, + "grad_norm": NaN, + "learning_rate": 0.00015502141451945663, + "loss": 0.0, + "step": 32487 + }, + { + "epoch": 3.0314453671736494, + "grad_norm": NaN, + "learning_rate": 0.00015501385496800014, + "loss": 0.0, + "step": 32488 + }, + { + "epoch": 3.031538676868527, + "grad_norm": NaN, + "learning_rate": 0.00015500629540379494, + "loss": 0.0, + "step": 32489 + }, + { + "epoch": 3.031631986563404, + "grad_norm": NaN, + "learning_rate": 0.00015499873582686013, + "loss": 0.0, + "step": 32490 + }, + { + "epoch": 3.0317252962582812, + "grad_norm": NaN, + "learning_rate": 0.00015499117623721503, + "loss": 0.0, + "step": 32491 + }, + { + "epoch": 3.0318186059531587, + "grad_norm": NaN, + "learning_rate": 0.00015498361663487888, + "loss": 0.0, + "step": 32492 + }, + { + "epoch": 3.0319119156480356, + "grad_norm": NaN, + "learning_rate": 0.00015497605701987087, + "loss": 0.0, + "step": 32493 + }, + { + "epoch": 3.032005225342913, + "grad_norm": NaN, + "learning_rate": 0.00015496849739221017, + "loss": 0.0, + "step": 32494 + }, + { + "epoch": 3.0320985350377905, + "grad_norm": NaN, + "learning_rate": 0.00015496093775191612, + "loss": 0.0, + "step": 32495 + }, + { + "epoch": 3.032191844732668, + "grad_norm": NaN, + "learning_rate": 0.00015495337809900782, + "loss": 0.0, + "step": 32496 + }, + { + "epoch": 3.032285154427545, + "grad_norm": NaN, + "learning_rate": 0.00015494581843350454, + "loss": 0.0, + "step": 32497 + }, + { + "epoch": 3.0323784641224223, + "grad_norm": NaN, + "learning_rate": 0.0001549382587554256, + "loss": 0.0, + "step": 32498 + }, + { + "epoch": 3.0324717738172997, + "grad_norm": NaN, + "learning_rate": 0.00015493069906479003, + "loss": 0.0, + "step": 32499 + }, + { + "epoch": 3.032565083512177, + "grad_norm": NaN, + "learning_rate": 0.00015492313936161717, + "loss": 0.0, + "step": 32500 + }, + { + "epoch": 3.032658393207054, + "grad_norm": NaN, + "learning_rate": 0.00015491557964592625, + "loss": 0.0, + "step": 32501 + }, + { + "epoch": 3.0327517029019315, + "grad_norm": NaN, + "learning_rate": 0.0001549080199177365, + "loss": 0.0, + "step": 32502 + }, + { + "epoch": 3.032845012596809, + "grad_norm": NaN, + "learning_rate": 0.00015490046017706702, + "loss": 0.0, + "step": 32503 + }, + { + "epoch": 3.032938322291686, + "grad_norm": NaN, + "learning_rate": 0.00015489290042393716, + "loss": 0.0, + "step": 32504 + }, + { + "epoch": 3.0330316319865633, + "grad_norm": NaN, + "learning_rate": 0.00015488534065836616, + "loss": 0.0, + "step": 32505 + }, + { + "epoch": 3.0331249416814408, + "grad_norm": NaN, + "learning_rate": 0.00015487778088037313, + "loss": 0.0, + "step": 32506 + }, + { + "epoch": 3.033218251376318, + "grad_norm": NaN, + "learning_rate": 0.00015487022108997734, + "loss": 0.0, + "step": 32507 + }, + { + "epoch": 3.033311561071195, + "grad_norm": NaN, + "learning_rate": 0.0001548626612871981, + "loss": 0.0, + "step": 32508 + }, + { + "epoch": 3.0334048707660726, + "grad_norm": NaN, + "learning_rate": 0.00015485510147205444, + "loss": 0.0, + "step": 32509 + }, + { + "epoch": 3.03349818046095, + "grad_norm": NaN, + "learning_rate": 0.00015484754164456576, + "loss": 0.0, + "step": 32510 + }, + { + "epoch": 3.0335914901558274, + "grad_norm": NaN, + "learning_rate": 0.00015483998180475125, + "loss": 0.0, + "step": 32511 + }, + { + "epoch": 3.0336847998507044, + "grad_norm": NaN, + "learning_rate": 0.00015483242195263005, + "loss": 0.0, + "step": 32512 + }, + { + "epoch": 3.033778109545582, + "grad_norm": NaN, + "learning_rate": 0.0001548248620882214, + "loss": 0.0, + "step": 32513 + }, + { + "epoch": 3.033871419240459, + "grad_norm": NaN, + "learning_rate": 0.00015481730221154463, + "loss": 0.0, + "step": 32514 + }, + { + "epoch": 3.033964728935336, + "grad_norm": NaN, + "learning_rate": 0.00015480974232261885, + "loss": 0.0, + "step": 32515 + }, + { + "epoch": 3.0340580386302136, + "grad_norm": NaN, + "learning_rate": 0.00015480218242146327, + "loss": 0.0, + "step": 32516 + }, + { + "epoch": 3.034151348325091, + "grad_norm": NaN, + "learning_rate": 0.00015479462250809724, + "loss": 0.0, + "step": 32517 + }, + { + "epoch": 3.0342446580199685, + "grad_norm": NaN, + "learning_rate": 0.00015478706258253988, + "loss": 0.0, + "step": 32518 + }, + { + "epoch": 3.0343379677148454, + "grad_norm": NaN, + "learning_rate": 0.00015477950264481042, + "loss": 0.0, + "step": 32519 + }, + { + "epoch": 3.034431277409723, + "grad_norm": NaN, + "learning_rate": 0.0001547719426949281, + "loss": 0.0, + "step": 32520 + }, + { + "epoch": 3.0345245871046003, + "grad_norm": NaN, + "learning_rate": 0.00015476438273291223, + "loss": 0.0, + "step": 32521 + }, + { + "epoch": 3.0346178967994772, + "grad_norm": NaN, + "learning_rate": 0.00015475682275878185, + "loss": 0.0, + "step": 32522 + }, + { + "epoch": 3.0347112064943547, + "grad_norm": NaN, + "learning_rate": 0.00015474926277255627, + "loss": 0.0, + "step": 32523 + }, + { + "epoch": 3.034804516189232, + "grad_norm": NaN, + "learning_rate": 0.0001547417027742548, + "loss": 0.0, + "step": 32524 + }, + { + "epoch": 3.0348978258841095, + "grad_norm": NaN, + "learning_rate": 0.00015473414276389653, + "loss": 0.0, + "step": 32525 + }, + { + "epoch": 3.0349911355789865, + "grad_norm": NaN, + "learning_rate": 0.00015472658274150072, + "loss": 0.0, + "step": 32526 + }, + { + "epoch": 3.035084445273864, + "grad_norm": NaN, + "learning_rate": 0.00015471902270708667, + "loss": 0.0, + "step": 32527 + }, + { + "epoch": 3.0351777549687413, + "grad_norm": NaN, + "learning_rate": 0.00015471146266067347, + "loss": 0.0, + "step": 32528 + }, + { + "epoch": 3.0352710646636187, + "grad_norm": NaN, + "learning_rate": 0.00015470390260228047, + "loss": 0.0, + "step": 32529 + }, + { + "epoch": 3.0353643743584957, + "grad_norm": NaN, + "learning_rate": 0.00015469634253192685, + "loss": 0.0, + "step": 32530 + }, + { + "epoch": 3.035457684053373, + "grad_norm": NaN, + "learning_rate": 0.00015468878244963177, + "loss": 0.0, + "step": 32531 + }, + { + "epoch": 3.0355509937482505, + "grad_norm": NaN, + "learning_rate": 0.00015468122235541456, + "loss": 0.0, + "step": 32532 + }, + { + "epoch": 3.0356443034431275, + "grad_norm": NaN, + "learning_rate": 0.00015467366224929439, + "loss": 0.0, + "step": 32533 + }, + { + "epoch": 3.035737613138005, + "grad_norm": NaN, + "learning_rate": 0.00015466610213129046, + "loss": 0.0, + "step": 32534 + }, + { + "epoch": 3.0358309228328824, + "grad_norm": NaN, + "learning_rate": 0.00015465854200142197, + "loss": 0.0, + "step": 32535 + }, + { + "epoch": 3.03592423252776, + "grad_norm": NaN, + "learning_rate": 0.00015465098185970828, + "loss": 0.0, + "step": 32536 + }, + { + "epoch": 3.0360175422226368, + "grad_norm": NaN, + "learning_rate": 0.00015464342170616846, + "loss": 0.0, + "step": 32537 + }, + { + "epoch": 3.036110851917514, + "grad_norm": NaN, + "learning_rate": 0.00015463586154082176, + "loss": 0.0, + "step": 32538 + }, + { + "epoch": 3.0362041616123916, + "grad_norm": NaN, + "learning_rate": 0.00015462830136368747, + "loss": 0.0, + "step": 32539 + }, + { + "epoch": 3.036297471307269, + "grad_norm": NaN, + "learning_rate": 0.00015462074117478483, + "loss": 0.0, + "step": 32540 + }, + { + "epoch": 3.036390781002146, + "grad_norm": NaN, + "learning_rate": 0.00015461318097413296, + "loss": 0.0, + "step": 32541 + }, + { + "epoch": 3.0364840906970234, + "grad_norm": NaN, + "learning_rate": 0.00015460562076175113, + "loss": 0.0, + "step": 32542 + }, + { + "epoch": 3.036577400391901, + "grad_norm": NaN, + "learning_rate": 0.0001545980605376586, + "loss": 0.0, + "step": 32543 + }, + { + "epoch": 3.036670710086778, + "grad_norm": NaN, + "learning_rate": 0.00015459050030187455, + "loss": 0.0, + "step": 32544 + }, + { + "epoch": 3.036764019781655, + "grad_norm": NaN, + "learning_rate": 0.0001545829400544182, + "loss": 0.0, + "step": 32545 + }, + { + "epoch": 3.0368573294765326, + "grad_norm": NaN, + "learning_rate": 0.00015457537979530884, + "loss": 0.0, + "step": 32546 + }, + { + "epoch": 3.03695063917141, + "grad_norm": NaN, + "learning_rate": 0.0001545678195245656, + "loss": 0.0, + "step": 32547 + }, + { + "epoch": 3.037043948866287, + "grad_norm": NaN, + "learning_rate": 0.0001545602592422078, + "loss": 0.0, + "step": 32548 + }, + { + "epoch": 3.0371372585611645, + "grad_norm": NaN, + "learning_rate": 0.0001545526989482546, + "loss": 0.0, + "step": 32549 + }, + { + "epoch": 3.037230568256042, + "grad_norm": NaN, + "learning_rate": 0.0001545451386427252, + "loss": 0.0, + "step": 32550 + }, + { + "epoch": 3.0373238779509193, + "grad_norm": NaN, + "learning_rate": 0.00015453757832563884, + "loss": 0.0, + "step": 32551 + }, + { + "epoch": 3.0374171876457963, + "grad_norm": NaN, + "learning_rate": 0.00015453001799701482, + "loss": 0.0, + "step": 32552 + }, + { + "epoch": 3.0375104973406737, + "grad_norm": NaN, + "learning_rate": 0.00015452245765687226, + "loss": 0.0, + "step": 32553 + }, + { + "epoch": 3.037603807035551, + "grad_norm": NaN, + "learning_rate": 0.00015451489730523044, + "loss": 0.0, + "step": 32554 + }, + { + "epoch": 3.037697116730428, + "grad_norm": NaN, + "learning_rate": 0.00015450733694210862, + "loss": 0.0, + "step": 32555 + }, + { + "epoch": 3.0377904264253055, + "grad_norm": NaN, + "learning_rate": 0.00015449977656752591, + "loss": 0.0, + "step": 32556 + }, + { + "epoch": 3.037883736120183, + "grad_norm": NaN, + "learning_rate": 0.00015449221618150163, + "loss": 0.0, + "step": 32557 + }, + { + "epoch": 3.0379770458150603, + "grad_norm": NaN, + "learning_rate": 0.00015448465578405497, + "loss": 0.0, + "step": 32558 + }, + { + "epoch": 3.0380703555099373, + "grad_norm": NaN, + "learning_rate": 0.00015447709537520522, + "loss": 0.0, + "step": 32559 + }, + { + "epoch": 3.0381636652048147, + "grad_norm": NaN, + "learning_rate": 0.00015446953495497145, + "loss": 0.0, + "step": 32560 + }, + { + "epoch": 3.038256974899692, + "grad_norm": NaN, + "learning_rate": 0.000154461974523373, + "loss": 0.0, + "step": 32561 + }, + { + "epoch": 3.0383502845945696, + "grad_norm": NaN, + "learning_rate": 0.00015445441408042915, + "loss": 0.0, + "step": 32562 + }, + { + "epoch": 3.0384435942894465, + "grad_norm": NaN, + "learning_rate": 0.00015444685362615894, + "loss": 0.0, + "step": 32563 + }, + { + "epoch": 3.038536903984324, + "grad_norm": NaN, + "learning_rate": 0.00015443929316058172, + "loss": 0.0, + "step": 32564 + }, + { + "epoch": 3.0386302136792014, + "grad_norm": NaN, + "learning_rate": 0.00015443173268371677, + "loss": 0.0, + "step": 32565 + }, + { + "epoch": 3.0387235233740784, + "grad_norm": NaN, + "learning_rate": 0.0001544241721955831, + "loss": 0.0, + "step": 32566 + }, + { + "epoch": 3.038816833068956, + "grad_norm": NaN, + "learning_rate": 0.00015441661169620017, + "loss": 0.0, + "step": 32567 + }, + { + "epoch": 3.038910142763833, + "grad_norm": NaN, + "learning_rate": 0.00015440905118558712, + "loss": 0.0, + "step": 32568 + }, + { + "epoch": 3.0390034524587106, + "grad_norm": NaN, + "learning_rate": 0.0001544014906637631, + "loss": 0.0, + "step": 32569 + }, + { + "epoch": 3.0390967621535876, + "grad_norm": NaN, + "learning_rate": 0.00015439393013074742, + "loss": 0.0, + "step": 32570 + }, + { + "epoch": 3.039190071848465, + "grad_norm": NaN, + "learning_rate": 0.0001543863695865593, + "loss": 0.0, + "step": 32571 + }, + { + "epoch": 3.0392833815433424, + "grad_norm": NaN, + "learning_rate": 0.00015437880903121788, + "loss": 0.0, + "step": 32572 + }, + { + "epoch": 3.03937669123822, + "grad_norm": NaN, + "learning_rate": 0.0001543712484647425, + "loss": 0.0, + "step": 32573 + }, + { + "epoch": 3.039470000933097, + "grad_norm": NaN, + "learning_rate": 0.00015436368788715232, + "loss": 0.0, + "step": 32574 + }, + { + "epoch": 3.0395633106279742, + "grad_norm": NaN, + "learning_rate": 0.00015435612729846655, + "loss": 0.0, + "step": 32575 + }, + { + "epoch": 3.0396566203228517, + "grad_norm": NaN, + "learning_rate": 0.00015434856669870444, + "loss": 0.0, + "step": 32576 + }, + { + "epoch": 3.0397499300177286, + "grad_norm": NaN, + "learning_rate": 0.00015434100608788522, + "loss": 0.0, + "step": 32577 + }, + { + "epoch": 3.039843239712606, + "grad_norm": NaN, + "learning_rate": 0.00015433344546602812, + "loss": 0.0, + "step": 32578 + }, + { + "epoch": 3.0399365494074835, + "grad_norm": NaN, + "learning_rate": 0.00015432588483315233, + "loss": 0.0, + "step": 32579 + }, + { + "epoch": 3.040029859102361, + "grad_norm": NaN, + "learning_rate": 0.00015431832418927712, + "loss": 0.0, + "step": 32580 + }, + { + "epoch": 3.040123168797238, + "grad_norm": NaN, + "learning_rate": 0.00015431076353442173, + "loss": 0.0, + "step": 32581 + }, + { + "epoch": 3.0402164784921153, + "grad_norm": NaN, + "learning_rate": 0.00015430320286860525, + "loss": 0.0, + "step": 32582 + }, + { + "epoch": 3.0403097881869927, + "grad_norm": NaN, + "learning_rate": 0.00015429564219184705, + "loss": 0.0, + "step": 32583 + }, + { + "epoch": 3.04040309788187, + "grad_norm": NaN, + "learning_rate": 0.00015428808150416633, + "loss": 0.0, + "step": 32584 + }, + { + "epoch": 3.040496407576747, + "grad_norm": NaN, + "learning_rate": 0.00015428052080558225, + "loss": 0.0, + "step": 32585 + }, + { + "epoch": 3.0405897172716245, + "grad_norm": NaN, + "learning_rate": 0.00015427296009611408, + "loss": 0.0, + "step": 32586 + }, + { + "epoch": 3.040683026966502, + "grad_norm": NaN, + "learning_rate": 0.00015426539937578106, + "loss": 0.0, + "step": 32587 + }, + { + "epoch": 3.040776336661379, + "grad_norm": NaN, + "learning_rate": 0.00015425783864460231, + "loss": 0.0, + "step": 32588 + }, + { + "epoch": 3.0408696463562563, + "grad_norm": NaN, + "learning_rate": 0.00015425027790259726, + "loss": 0.0, + "step": 32589 + }, + { + "epoch": 3.0409629560511338, + "grad_norm": NaN, + "learning_rate": 0.00015424271714978497, + "loss": 0.0, + "step": 32590 + }, + { + "epoch": 3.041056265746011, + "grad_norm": NaN, + "learning_rate": 0.00015423515638618464, + "loss": 0.0, + "step": 32591 + }, + { + "epoch": 3.041149575440888, + "grad_norm": NaN, + "learning_rate": 0.00015422759561181565, + "loss": 0.0, + "step": 32592 + }, + { + "epoch": 3.0412428851357656, + "grad_norm": NaN, + "learning_rate": 0.0001542200348266971, + "loss": 0.0, + "step": 32593 + }, + { + "epoch": 3.041336194830643, + "grad_norm": NaN, + "learning_rate": 0.00015421247403084825, + "loss": 0.0, + "step": 32594 + }, + { + "epoch": 3.0414295045255204, + "grad_norm": NaN, + "learning_rate": 0.0001542049132242883, + "loss": 0.0, + "step": 32595 + }, + { + "epoch": 3.0415228142203974, + "grad_norm": NaN, + "learning_rate": 0.0001541973524070365, + "loss": 0.0, + "step": 32596 + }, + { + "epoch": 3.041616123915275, + "grad_norm": NaN, + "learning_rate": 0.00015418979157911215, + "loss": 0.0, + "step": 32597 + }, + { + "epoch": 3.0417094336101522, + "grad_norm": NaN, + "learning_rate": 0.00015418223074053434, + "loss": 0.0, + "step": 32598 + }, + { + "epoch": 3.041802743305029, + "grad_norm": NaN, + "learning_rate": 0.0001541746698913224, + "loss": 0.0, + "step": 32599 + }, + { + "epoch": 3.0418960529999066, + "grad_norm": NaN, + "learning_rate": 0.00015416710903149548, + "loss": 0.0, + "step": 32600 + }, + { + "epoch": 3.041989362694784, + "grad_norm": NaN, + "learning_rate": 0.00015415954816107283, + "loss": 0.0, + "step": 32601 + }, + { + "epoch": 3.0420826723896615, + "grad_norm": NaN, + "learning_rate": 0.0001541519872800737, + "loss": 0.0, + "step": 32602 + }, + { + "epoch": 3.0421759820845384, + "grad_norm": NaN, + "learning_rate": 0.00015414442638851731, + "loss": 0.0, + "step": 32603 + }, + { + "epoch": 3.042269291779416, + "grad_norm": NaN, + "learning_rate": 0.00015413686548642282, + "loss": 0.0, + "step": 32604 + }, + { + "epoch": 3.0423626014742933, + "grad_norm": NaN, + "learning_rate": 0.00015412930457380953, + "loss": 0.0, + "step": 32605 + }, + { + "epoch": 3.0424559111691707, + "grad_norm": NaN, + "learning_rate": 0.0001541217436506967, + "loss": 0.0, + "step": 32606 + }, + { + "epoch": 3.0425492208640477, + "grad_norm": NaN, + "learning_rate": 0.00015411418271710342, + "loss": 0.0, + "step": 32607 + }, + { + "epoch": 3.042642530558925, + "grad_norm": NaN, + "learning_rate": 0.000154106621773049, + "loss": 0.0, + "step": 32608 + }, + { + "epoch": 3.0427358402538025, + "grad_norm": NaN, + "learning_rate": 0.00015409906081855268, + "loss": 0.0, + "step": 32609 + }, + { + "epoch": 3.0428291499486795, + "grad_norm": NaN, + "learning_rate": 0.00015409149985363366, + "loss": 0.0, + "step": 32610 + }, + { + "epoch": 3.042922459643557, + "grad_norm": NaN, + "learning_rate": 0.00015408393887831115, + "loss": 0.0, + "step": 32611 + }, + { + "epoch": 3.0430157693384343, + "grad_norm": NaN, + "learning_rate": 0.00015407637789260445, + "loss": 0.0, + "step": 32612 + }, + { + "epoch": 3.0431090790333117, + "grad_norm": NaN, + "learning_rate": 0.00015406881689653264, + "loss": 0.0, + "step": 32613 + }, + { + "epoch": 3.0432023887281887, + "grad_norm": NaN, + "learning_rate": 0.00015406125589011507, + "loss": 0.0, + "step": 32614 + }, + { + "epoch": 3.043295698423066, + "grad_norm": NaN, + "learning_rate": 0.00015405369487337094, + "loss": 0.0, + "step": 32615 + }, + { + "epoch": 3.0433890081179436, + "grad_norm": NaN, + "learning_rate": 0.00015404613384631947, + "loss": 0.0, + "step": 32616 + }, + { + "epoch": 3.043482317812821, + "grad_norm": NaN, + "learning_rate": 0.0001540385728089799, + "loss": 0.0, + "step": 32617 + }, + { + "epoch": 3.043575627507698, + "grad_norm": NaN, + "learning_rate": 0.0001540310117613714, + "loss": 0.0, + "step": 32618 + }, + { + "epoch": 3.0436689372025754, + "grad_norm": NaN, + "learning_rate": 0.00015402345070351323, + "loss": 0.0, + "step": 32619 + }, + { + "epoch": 3.043762246897453, + "grad_norm": NaN, + "learning_rate": 0.00015401588963542465, + "loss": 0.0, + "step": 32620 + }, + { + "epoch": 3.0438555565923298, + "grad_norm": NaN, + "learning_rate": 0.0001540083285571248, + "loss": 0.0, + "step": 32621 + }, + { + "epoch": 3.043948866287207, + "grad_norm": NaN, + "learning_rate": 0.000154000767468633, + "loss": 0.0, + "step": 32622 + }, + { + "epoch": 3.0440421759820846, + "grad_norm": NaN, + "learning_rate": 0.00015399320636996842, + "loss": 0.0, + "step": 32623 + }, + { + "epoch": 3.044135485676962, + "grad_norm": NaN, + "learning_rate": 0.0001539856452611503, + "loss": 0.0, + "step": 32624 + }, + { + "epoch": 3.044228795371839, + "grad_norm": NaN, + "learning_rate": 0.0001539780841421979, + "loss": 0.0, + "step": 32625 + }, + { + "epoch": 3.0443221050667164, + "grad_norm": NaN, + "learning_rate": 0.00015397052301313035, + "loss": 0.0, + "step": 32626 + }, + { + "epoch": 3.044415414761594, + "grad_norm": NaN, + "learning_rate": 0.00015396296187396695, + "loss": 0.0, + "step": 32627 + }, + { + "epoch": 3.0445087244564712, + "grad_norm": NaN, + "learning_rate": 0.00015395540072472695, + "loss": 0.0, + "step": 32628 + }, + { + "epoch": 3.0446020341513482, + "grad_norm": NaN, + "learning_rate": 0.00015394783956542947, + "loss": 0.0, + "step": 32629 + }, + { + "epoch": 3.0446953438462256, + "grad_norm": NaN, + "learning_rate": 0.00015394027839609386, + "loss": 0.0, + "step": 32630 + }, + { + "epoch": 3.044788653541103, + "grad_norm": NaN, + "learning_rate": 0.0001539327172167393, + "loss": 0.0, + "step": 32631 + }, + { + "epoch": 3.04488196323598, + "grad_norm": NaN, + "learning_rate": 0.00015392515602738496, + "loss": 0.0, + "step": 32632 + }, + { + "epoch": 3.0449752729308575, + "grad_norm": NaN, + "learning_rate": 0.00015391759482805012, + "loss": 0.0, + "step": 32633 + }, + { + "epoch": 3.045068582625735, + "grad_norm": NaN, + "learning_rate": 0.00015391003361875402, + "loss": 0.0, + "step": 32634 + }, + { + "epoch": 3.0451618923206123, + "grad_norm": NaN, + "learning_rate": 0.00015390247239951583, + "loss": 0.0, + "step": 32635 + }, + { + "epoch": 3.0452552020154893, + "grad_norm": NaN, + "learning_rate": 0.0001538949111703548, + "loss": 0.0, + "step": 32636 + }, + { + "epoch": 3.0453485117103667, + "grad_norm": NaN, + "learning_rate": 0.00015388734993129023, + "loss": 0.0, + "step": 32637 + }, + { + "epoch": 3.045441821405244, + "grad_norm": NaN, + "learning_rate": 0.00015387978868234125, + "loss": 0.0, + "step": 32638 + }, + { + "epoch": 3.045535131100121, + "grad_norm": NaN, + "learning_rate": 0.00015387222742352708, + "loss": 0.0, + "step": 32639 + }, + { + "epoch": 3.0456284407949985, + "grad_norm": NaN, + "learning_rate": 0.00015386466615486703, + "loss": 0.0, + "step": 32640 + }, + { + "epoch": 3.045721750489876, + "grad_norm": NaN, + "learning_rate": 0.00015385710487638026, + "loss": 0.0, + "step": 32641 + }, + { + "epoch": 3.0458150601847533, + "grad_norm": NaN, + "learning_rate": 0.00015384954358808604, + "loss": 0.0, + "step": 32642 + }, + { + "epoch": 3.0459083698796303, + "grad_norm": NaN, + "learning_rate": 0.00015384198229000353, + "loss": 0.0, + "step": 32643 + }, + { + "epoch": 3.0460016795745077, + "grad_norm": NaN, + "learning_rate": 0.000153834420982152, + "loss": 0.0, + "step": 32644 + }, + { + "epoch": 3.046094989269385, + "grad_norm": NaN, + "learning_rate": 0.00015382685966455067, + "loss": 0.0, + "step": 32645 + }, + { + "epoch": 3.0461882989642626, + "grad_norm": NaN, + "learning_rate": 0.00015381929833721883, + "loss": 0.0, + "step": 32646 + }, + { + "epoch": 3.0462816086591396, + "grad_norm": NaN, + "learning_rate": 0.00015381173700017563, + "loss": 0.0, + "step": 32647 + }, + { + "epoch": 3.046374918354017, + "grad_norm": NaN, + "learning_rate": 0.00015380417565344026, + "loss": 0.0, + "step": 32648 + }, + { + "epoch": 3.0464682280488944, + "grad_norm": NaN, + "learning_rate": 0.00015379661429703201, + "loss": 0.0, + "step": 32649 + }, + { + "epoch": 3.0465615377437714, + "grad_norm": NaN, + "learning_rate": 0.00015378905293097014, + "loss": 0.0, + "step": 32650 + }, + { + "epoch": 3.046654847438649, + "grad_norm": NaN, + "learning_rate": 0.00015378149155527375, + "loss": 0.0, + "step": 32651 + }, + { + "epoch": 3.046748157133526, + "grad_norm": NaN, + "learning_rate": 0.00015377393016996222, + "loss": 0.0, + "step": 32652 + }, + { + "epoch": 3.0468414668284036, + "grad_norm": NaN, + "learning_rate": 0.00015376636877505466, + "loss": 0.0, + "step": 32653 + }, + { + "epoch": 3.0469347765232806, + "grad_norm": NaN, + "learning_rate": 0.00015375880737057037, + "loss": 0.0, + "step": 32654 + }, + { + "epoch": 3.047028086218158, + "grad_norm": NaN, + "learning_rate": 0.0001537512459565285, + "loss": 0.0, + "step": 32655 + }, + { + "epoch": 3.0471213959130354, + "grad_norm": NaN, + "learning_rate": 0.00015374368453294839, + "loss": 0.0, + "step": 32656 + }, + { + "epoch": 3.047214705607913, + "grad_norm": NaN, + "learning_rate": 0.00015373612309984915, + "loss": 0.0, + "step": 32657 + }, + { + "epoch": 3.04730801530279, + "grad_norm": NaN, + "learning_rate": 0.00015372856165725006, + "loss": 0.0, + "step": 32658 + }, + { + "epoch": 3.0474013249976672, + "grad_norm": NaN, + "learning_rate": 0.00015372100020517035, + "loss": 0.0, + "step": 32659 + }, + { + "epoch": 3.0474946346925447, + "grad_norm": NaN, + "learning_rate": 0.00015371343874362925, + "loss": 0.0, + "step": 32660 + }, + { + "epoch": 3.0475879443874216, + "grad_norm": NaN, + "learning_rate": 0.000153705877272646, + "loss": 0.0, + "step": 32661 + }, + { + "epoch": 3.047681254082299, + "grad_norm": NaN, + "learning_rate": 0.00015369831579223976, + "loss": 0.0, + "step": 32662 + }, + { + "epoch": 3.0477745637771765, + "grad_norm": NaN, + "learning_rate": 0.0001536907543024298, + "loss": 0.0, + "step": 32663 + }, + { + "epoch": 3.047867873472054, + "grad_norm": NaN, + "learning_rate": 0.00015368319280323537, + "loss": 0.0, + "step": 32664 + }, + { + "epoch": 3.047961183166931, + "grad_norm": NaN, + "learning_rate": 0.00015367563129467564, + "loss": 0.0, + "step": 32665 + }, + { + "epoch": 3.0480544928618083, + "grad_norm": NaN, + "learning_rate": 0.0001536680697767699, + "loss": 0.0, + "step": 32666 + }, + { + "epoch": 3.0481478025566857, + "grad_norm": NaN, + "learning_rate": 0.00015366050824953733, + "loss": 0.0, + "step": 32667 + }, + { + "epoch": 3.048241112251563, + "grad_norm": NaN, + "learning_rate": 0.00015365294671299717, + "loss": 0.0, + "step": 32668 + }, + { + "epoch": 3.04833442194644, + "grad_norm": NaN, + "learning_rate": 0.00015364538516716866, + "loss": 0.0, + "step": 32669 + }, + { + "epoch": 3.0484277316413175, + "grad_norm": NaN, + "learning_rate": 0.00015363782361207097, + "loss": 0.0, + "step": 32670 + }, + { + "epoch": 3.048521041336195, + "grad_norm": NaN, + "learning_rate": 0.00015363026204772345, + "loss": 0.0, + "step": 32671 + }, + { + "epoch": 3.048614351031072, + "grad_norm": NaN, + "learning_rate": 0.0001536227004741452, + "loss": 0.0, + "step": 32672 + }, + { + "epoch": 3.0487076607259493, + "grad_norm": NaN, + "learning_rate": 0.00015361513889135551, + "loss": 0.0, + "step": 32673 + }, + { + "epoch": 3.0488009704208268, + "grad_norm": NaN, + "learning_rate": 0.00015360757729937358, + "loss": 0.0, + "step": 32674 + }, + { + "epoch": 3.048894280115704, + "grad_norm": NaN, + "learning_rate": 0.00015360001569821865, + "loss": 0.0, + "step": 32675 + }, + { + "epoch": 3.048987589810581, + "grad_norm": NaN, + "learning_rate": 0.00015359245408790994, + "loss": 0.0, + "step": 32676 + }, + { + "epoch": 3.0490808995054586, + "grad_norm": NaN, + "learning_rate": 0.00015358489246846668, + "loss": 0.0, + "step": 32677 + }, + { + "epoch": 3.049174209200336, + "grad_norm": NaN, + "learning_rate": 0.00015357733083990813, + "loss": 0.0, + "step": 32678 + }, + { + "epoch": 3.0492675188952134, + "grad_norm": NaN, + "learning_rate": 0.00015356976920225348, + "loss": 0.0, + "step": 32679 + }, + { + "epoch": 3.0493608285900904, + "grad_norm": NaN, + "learning_rate": 0.00015356220755552194, + "loss": 0.0, + "step": 32680 + }, + { + "epoch": 3.049454138284968, + "grad_norm": NaN, + "learning_rate": 0.0001535546458997328, + "loss": 0.0, + "step": 32681 + }, + { + "epoch": 3.0495474479798452, + "grad_norm": NaN, + "learning_rate": 0.00015354708423490524, + "loss": 0.0, + "step": 32682 + }, + { + "epoch": 3.049640757674722, + "grad_norm": NaN, + "learning_rate": 0.00015353952256105845, + "loss": 0.0, + "step": 32683 + }, + { + "epoch": 3.0497340673695996, + "grad_norm": NaN, + "learning_rate": 0.00015353196087821173, + "loss": 0.0, + "step": 32684 + }, + { + "epoch": 3.049827377064477, + "grad_norm": NaN, + "learning_rate": 0.0001535243991863843, + "loss": 0.0, + "step": 32685 + }, + { + "epoch": 3.0499206867593545, + "grad_norm": NaN, + "learning_rate": 0.00015351683748559536, + "loss": 0.0, + "step": 32686 + }, + { + "epoch": 3.0500139964542314, + "grad_norm": NaN, + "learning_rate": 0.0001535092757758641, + "loss": 0.0, + "step": 32687 + }, + { + "epoch": 3.050107306149109, + "grad_norm": NaN, + "learning_rate": 0.00015350171405720982, + "loss": 0.0, + "step": 32688 + }, + { + "epoch": 3.0502006158439863, + "grad_norm": NaN, + "learning_rate": 0.00015349415232965175, + "loss": 0.0, + "step": 32689 + }, + { + "epoch": 3.0502939255388637, + "grad_norm": NaN, + "learning_rate": 0.00015348659059320904, + "loss": 0.0, + "step": 32690 + }, + { + "epoch": 3.0503872352337407, + "grad_norm": NaN, + "learning_rate": 0.000153479028847901, + "loss": 0.0, + "step": 32691 + }, + { + "epoch": 3.050480544928618, + "grad_norm": NaN, + "learning_rate": 0.0001534714670937468, + "loss": 0.0, + "step": 32692 + }, + { + "epoch": 3.0505738546234955, + "grad_norm": NaN, + "learning_rate": 0.00015346390533076568, + "loss": 0.0, + "step": 32693 + }, + { + "epoch": 3.0506671643183725, + "grad_norm": NaN, + "learning_rate": 0.00015345634355897692, + "loss": 0.0, + "step": 32694 + }, + { + "epoch": 3.05076047401325, + "grad_norm": NaN, + "learning_rate": 0.00015344878177839965, + "loss": 0.0, + "step": 32695 + }, + { + "epoch": 3.0508537837081273, + "grad_norm": NaN, + "learning_rate": 0.00015344121998905316, + "loss": 0.0, + "step": 32696 + }, + { + "epoch": 3.0509470934030047, + "grad_norm": NaN, + "learning_rate": 0.00015343365819095667, + "loss": 0.0, + "step": 32697 + }, + { + "epoch": 3.0510404030978817, + "grad_norm": NaN, + "learning_rate": 0.00015342609638412943, + "loss": 0.0, + "step": 32698 + }, + { + "epoch": 3.051133712792759, + "grad_norm": NaN, + "learning_rate": 0.00015341853456859062, + "loss": 0.0, + "step": 32699 + }, + { + "epoch": 3.0512270224876366, + "grad_norm": NaN, + "learning_rate": 0.00015341097274435946, + "loss": 0.0, + "step": 32700 + }, + { + "epoch": 3.051320332182514, + "grad_norm": NaN, + "learning_rate": 0.00015340341091145526, + "loss": 0.0, + "step": 32701 + }, + { + "epoch": 3.051413641877391, + "grad_norm": NaN, + "learning_rate": 0.00015339584906989717, + "loss": 0.0, + "step": 32702 + }, + { + "epoch": 3.0515069515722684, + "grad_norm": NaN, + "learning_rate": 0.00015338828721970444, + "loss": 0.0, + "step": 32703 + }, + { + "epoch": 3.051600261267146, + "grad_norm": NaN, + "learning_rate": 0.0001533807253608963, + "loss": 0.0, + "step": 32704 + }, + { + "epoch": 3.0516935709620228, + "grad_norm": NaN, + "learning_rate": 0.000153373163493492, + "loss": 0.0, + "step": 32705 + }, + { + "epoch": 3.0517868806569, + "grad_norm": NaN, + "learning_rate": 0.0001533656016175107, + "loss": 0.0, + "step": 32706 + }, + { + "epoch": 3.0518801903517776, + "grad_norm": NaN, + "learning_rate": 0.0001533580397329717, + "loss": 0.0, + "step": 32707 + }, + { + "epoch": 3.051973500046655, + "grad_norm": NaN, + "learning_rate": 0.0001533504778398942, + "loss": 0.0, + "step": 32708 + }, + { + "epoch": 3.052066809741532, + "grad_norm": NaN, + "learning_rate": 0.0001533429159382974, + "loss": 0.0, + "step": 32709 + }, + { + "epoch": 3.0521601194364094, + "grad_norm": NaN, + "learning_rate": 0.0001533353540282006, + "loss": 0.0, + "step": 32710 + }, + { + "epoch": 3.052253429131287, + "grad_norm": NaN, + "learning_rate": 0.00015332779210962296, + "loss": 0.0, + "step": 32711 + }, + { + "epoch": 3.0523467388261643, + "grad_norm": NaN, + "learning_rate": 0.00015332023018258372, + "loss": 0.0, + "step": 32712 + }, + { + "epoch": 3.0524400485210412, + "grad_norm": NaN, + "learning_rate": 0.00015331266824710212, + "loss": 0.0, + "step": 32713 + }, + { + "epoch": 3.0525333582159186, + "grad_norm": NaN, + "learning_rate": 0.0001533051063031974, + "loss": 0.0, + "step": 32714 + }, + { + "epoch": 3.052626667910796, + "grad_norm": NaN, + "learning_rate": 0.00015329754435088877, + "loss": 0.0, + "step": 32715 + }, + { + "epoch": 3.052719977605673, + "grad_norm": NaN, + "learning_rate": 0.00015328998239019548, + "loss": 0.0, + "step": 32716 + }, + { + "epoch": 3.0528132873005505, + "grad_norm": NaN, + "learning_rate": 0.0001532824204211367, + "loss": 0.0, + "step": 32717 + }, + { + "epoch": 3.052906596995428, + "grad_norm": NaN, + "learning_rate": 0.00015327485844373172, + "loss": 0.0, + "step": 32718 + }, + { + "epoch": 3.0529999066903053, + "grad_norm": NaN, + "learning_rate": 0.00015326729645799976, + "loss": 0.0, + "step": 32719 + }, + { + "epoch": 3.0530932163851823, + "grad_norm": NaN, + "learning_rate": 0.00015325973446396, + "loss": 0.0, + "step": 32720 + }, + { + "epoch": 3.0531865260800597, + "grad_norm": NaN, + "learning_rate": 0.00015325217246163172, + "loss": 0.0, + "step": 32721 + }, + { + "epoch": 3.053279835774937, + "grad_norm": NaN, + "learning_rate": 0.00015324461045103412, + "loss": 0.0, + "step": 32722 + }, + { + "epoch": 3.0533731454698145, + "grad_norm": NaN, + "learning_rate": 0.00015323704843218643, + "loss": 0.0, + "step": 32723 + }, + { + "epoch": 3.0534664551646915, + "grad_norm": NaN, + "learning_rate": 0.0001532294864051079, + "loss": 0.0, + "step": 32724 + }, + { + "epoch": 3.053559764859569, + "grad_norm": NaN, + "learning_rate": 0.00015322192436981774, + "loss": 0.0, + "step": 32725 + }, + { + "epoch": 3.0536530745544463, + "grad_norm": NaN, + "learning_rate": 0.00015321436232633515, + "loss": 0.0, + "step": 32726 + }, + { + "epoch": 3.0537463842493233, + "grad_norm": NaN, + "learning_rate": 0.00015320680027467942, + "loss": 0.0, + "step": 32727 + }, + { + "epoch": 3.0538396939442007, + "grad_norm": NaN, + "learning_rate": 0.00015319923821486975, + "loss": 0.0, + "step": 32728 + }, + { + "epoch": 3.053933003639078, + "grad_norm": NaN, + "learning_rate": 0.0001531916761469254, + "loss": 0.0, + "step": 32729 + }, + { + "epoch": 3.0540263133339556, + "grad_norm": NaN, + "learning_rate": 0.00015318411407086552, + "loss": 0.0, + "step": 32730 + }, + { + "epoch": 3.0541196230288326, + "grad_norm": NaN, + "learning_rate": 0.00015317655198670936, + "loss": 0.0, + "step": 32731 + }, + { + "epoch": 3.05421293272371, + "grad_norm": NaN, + "learning_rate": 0.00015316898989447617, + "loss": 0.0, + "step": 32732 + }, + { + "epoch": 3.0543062424185874, + "grad_norm": NaN, + "learning_rate": 0.00015316142779418521, + "loss": 0.0, + "step": 32733 + }, + { + "epoch": 3.0543995521134644, + "grad_norm": NaN, + "learning_rate": 0.00015315386568585568, + "loss": 0.0, + "step": 32734 + }, + { + "epoch": 3.054492861808342, + "grad_norm": NaN, + "learning_rate": 0.00015314630356950677, + "loss": 0.0, + "step": 32735 + }, + { + "epoch": 3.054586171503219, + "grad_norm": NaN, + "learning_rate": 0.0001531387414451578, + "loss": 0.0, + "step": 32736 + }, + { + "epoch": 3.0546794811980966, + "grad_norm": NaN, + "learning_rate": 0.0001531311793128279, + "loss": 0.0, + "step": 32737 + }, + { + "epoch": 3.0547727908929736, + "grad_norm": NaN, + "learning_rate": 0.00015312361717253635, + "loss": 0.0, + "step": 32738 + }, + { + "epoch": 3.054866100587851, + "grad_norm": NaN, + "learning_rate": 0.00015311605502430232, + "loss": 0.0, + "step": 32739 + }, + { + "epoch": 3.0549594102827284, + "grad_norm": NaN, + "learning_rate": 0.00015310849286814514, + "loss": 0.0, + "step": 32740 + }, + { + "epoch": 3.055052719977606, + "grad_norm": NaN, + "learning_rate": 0.00015310093070408396, + "loss": 0.0, + "step": 32741 + }, + { + "epoch": 3.055146029672483, + "grad_norm": NaN, + "learning_rate": 0.00015309336853213803, + "loss": 0.0, + "step": 32742 + }, + { + "epoch": 3.0552393393673603, + "grad_norm": NaN, + "learning_rate": 0.0001530858063523266, + "loss": 0.0, + "step": 32743 + }, + { + "epoch": 3.0553326490622377, + "grad_norm": NaN, + "learning_rate": 0.00015307824416466886, + "loss": 0.0, + "step": 32744 + }, + { + "epoch": 3.0554259587571146, + "grad_norm": NaN, + "learning_rate": 0.00015307068196918408, + "loss": 0.0, + "step": 32745 + }, + { + "epoch": 3.055519268451992, + "grad_norm": NaN, + "learning_rate": 0.00015306311976589145, + "loss": 0.0, + "step": 32746 + }, + { + "epoch": 3.0556125781468695, + "grad_norm": NaN, + "learning_rate": 0.00015305555755481021, + "loss": 0.0, + "step": 32747 + }, + { + "epoch": 3.055705887841747, + "grad_norm": NaN, + "learning_rate": 0.00015304799533595958, + "loss": 0.0, + "step": 32748 + }, + { + "epoch": 3.055799197536624, + "grad_norm": NaN, + "learning_rate": 0.00015304043310935883, + "loss": 0.0, + "step": 32749 + }, + { + "epoch": 3.0558925072315013, + "grad_norm": NaN, + "learning_rate": 0.00015303287087502712, + "loss": 0.0, + "step": 32750 + }, + { + "epoch": 3.0559858169263787, + "grad_norm": NaN, + "learning_rate": 0.00015302530863298375, + "loss": 0.0, + "step": 32751 + }, + { + "epoch": 3.056079126621256, + "grad_norm": NaN, + "learning_rate": 0.0001530177463832479, + "loss": 0.0, + "step": 32752 + }, + { + "epoch": 3.056172436316133, + "grad_norm": NaN, + "learning_rate": 0.00015301018412583884, + "loss": 0.0, + "step": 32753 + }, + { + "epoch": 3.0562657460110105, + "grad_norm": NaN, + "learning_rate": 0.00015300262186077575, + "loss": 0.0, + "step": 32754 + }, + { + "epoch": 3.056359055705888, + "grad_norm": NaN, + "learning_rate": 0.0001529950595880779, + "loss": 0.0, + "step": 32755 + }, + { + "epoch": 3.056452365400765, + "grad_norm": NaN, + "learning_rate": 0.00015298749730776449, + "loss": 0.0, + "step": 32756 + }, + { + "epoch": 3.0565456750956423, + "grad_norm": NaN, + "learning_rate": 0.00015297993501985476, + "loss": 0.0, + "step": 32757 + }, + { + "epoch": 3.0566389847905198, + "grad_norm": NaN, + "learning_rate": 0.00015297237272436792, + "loss": 0.0, + "step": 32758 + }, + { + "epoch": 3.056732294485397, + "grad_norm": NaN, + "learning_rate": 0.00015296481042132324, + "loss": 0.0, + "step": 32759 + }, + { + "epoch": 3.056825604180274, + "grad_norm": NaN, + "learning_rate": 0.00015295724811073994, + "loss": 0.0, + "step": 32760 + }, + { + "epoch": 3.0569189138751516, + "grad_norm": NaN, + "learning_rate": 0.0001529496857926372, + "loss": 0.0, + "step": 32761 + }, + { + "epoch": 3.057012223570029, + "grad_norm": NaN, + "learning_rate": 0.00015294212346703427, + "loss": 0.0, + "step": 32762 + }, + { + "epoch": 3.0571055332649064, + "grad_norm": NaN, + "learning_rate": 0.00015293456113395042, + "loss": 0.0, + "step": 32763 + }, + { + "epoch": 3.0571988429597834, + "grad_norm": NaN, + "learning_rate": 0.00015292699879340484, + "loss": 0.0, + "step": 32764 + }, + { + "epoch": 3.057292152654661, + "grad_norm": NaN, + "learning_rate": 0.0001529194364454168, + "loss": 0.0, + "step": 32765 + }, + { + "epoch": 3.0573854623495382, + "grad_norm": NaN, + "learning_rate": 0.0001529118740900055, + "loss": 0.0, + "step": 32766 + }, + { + "epoch": 3.057478772044415, + "grad_norm": NaN, + "learning_rate": 0.00015290431172719013, + "loss": 0.0, + "step": 32767 + }, + { + "epoch": 3.0575720817392926, + "grad_norm": NaN, + "learning_rate": 0.00015289674935698995, + "loss": 0.0, + "step": 32768 + }, + { + "epoch": 3.05766539143417, + "grad_norm": NaN, + "learning_rate": 0.0001528891869794242, + "loss": 0.0, + "step": 32769 + }, + { + "epoch": 3.0577587011290475, + "grad_norm": NaN, + "learning_rate": 0.00015288162459451213, + "loss": 0.0, + "step": 32770 + }, + { + "epoch": 3.0578520108239244, + "grad_norm": NaN, + "learning_rate": 0.00015287406220227293, + "loss": 0.0, + "step": 32771 + }, + { + "epoch": 3.057945320518802, + "grad_norm": NaN, + "learning_rate": 0.00015286649980272583, + "loss": 0.0, + "step": 32772 + }, + { + "epoch": 3.0580386302136793, + "grad_norm": NaN, + "learning_rate": 0.00015285893739589007, + "loss": 0.0, + "step": 32773 + }, + { + "epoch": 3.0581319399085567, + "grad_norm": NaN, + "learning_rate": 0.0001528513749817849, + "loss": 0.0, + "step": 32774 + }, + { + "epoch": 3.0582252496034337, + "grad_norm": NaN, + "learning_rate": 0.0001528438125604295, + "loss": 0.0, + "step": 32775 + }, + { + "epoch": 3.058318559298311, + "grad_norm": NaN, + "learning_rate": 0.00015283625013184316, + "loss": 0.0, + "step": 32776 + }, + { + "epoch": 3.0584118689931885, + "grad_norm": NaN, + "learning_rate": 0.00015282868769604503, + "loss": 0.0, + "step": 32777 + }, + { + "epoch": 3.0585051786880655, + "grad_norm": NaN, + "learning_rate": 0.00015282112525305443, + "loss": 0.0, + "step": 32778 + }, + { + "epoch": 3.058598488382943, + "grad_norm": NaN, + "learning_rate": 0.0001528135628028905, + "loss": 0.0, + "step": 32779 + }, + { + "epoch": 3.0586917980778203, + "grad_norm": NaN, + "learning_rate": 0.0001528060003455726, + "loss": 0.0, + "step": 32780 + }, + { + "epoch": 3.0587851077726977, + "grad_norm": NaN, + "learning_rate": 0.0001527984378811198, + "loss": 0.0, + "step": 32781 + }, + { + "epoch": 3.0588784174675747, + "grad_norm": NaN, + "learning_rate": 0.0001527908754095514, + "loss": 0.0, + "step": 32782 + }, + { + "epoch": 3.058971727162452, + "grad_norm": NaN, + "learning_rate": 0.00015278331293088664, + "loss": 0.0, + "step": 32783 + }, + { + "epoch": 3.0590650368573296, + "grad_norm": NaN, + "learning_rate": 0.00015277575044514473, + "loss": 0.0, + "step": 32784 + }, + { + "epoch": 3.059158346552207, + "grad_norm": NaN, + "learning_rate": 0.00015276818795234494, + "loss": 0.0, + "step": 32785 + }, + { + "epoch": 3.059251656247084, + "grad_norm": NaN, + "learning_rate": 0.00015276062545250646, + "loss": 0.0, + "step": 32786 + }, + { + "epoch": 3.0593449659419614, + "grad_norm": NaN, + "learning_rate": 0.00015275306294564852, + "loss": 0.0, + "step": 32787 + }, + { + "epoch": 3.059438275636839, + "grad_norm": NaN, + "learning_rate": 0.00015274550043179036, + "loss": 0.0, + "step": 32788 + }, + { + "epoch": 3.0595315853317158, + "grad_norm": NaN, + "learning_rate": 0.0001527379379109512, + "loss": 0.0, + "step": 32789 + }, + { + "epoch": 3.059624895026593, + "grad_norm": NaN, + "learning_rate": 0.00015273037538315027, + "loss": 0.0, + "step": 32790 + }, + { + "epoch": 3.0597182047214706, + "grad_norm": NaN, + "learning_rate": 0.00015272281284840682, + "loss": 0.0, + "step": 32791 + }, + { + "epoch": 3.059811514416348, + "grad_norm": NaN, + "learning_rate": 0.00015271525030674006, + "loss": 0.0, + "step": 32792 + }, + { + "epoch": 3.059904824111225, + "grad_norm": NaN, + "learning_rate": 0.0001527076877581692, + "loss": 0.0, + "step": 32793 + }, + { + "epoch": 3.0599981338061024, + "grad_norm": NaN, + "learning_rate": 0.0001527001252027135, + "loss": 0.0, + "step": 32794 + }, + { + "epoch": 3.06009144350098, + "grad_norm": NaN, + "learning_rate": 0.0001526925626403922, + "loss": 0.0, + "step": 32795 + }, + { + "epoch": 3.0601847531958573, + "grad_norm": NaN, + "learning_rate": 0.00015268500007122453, + "loss": 0.0, + "step": 32796 + }, + { + "epoch": 3.0602780628907342, + "grad_norm": NaN, + "learning_rate": 0.00015267743749522965, + "loss": 0.0, + "step": 32797 + }, + { + "epoch": 3.0603713725856116, + "grad_norm": NaN, + "learning_rate": 0.00015266987491242682, + "loss": 0.0, + "step": 32798 + }, + { + "epoch": 3.060464682280489, + "grad_norm": NaN, + "learning_rate": 0.00015266231232283537, + "loss": 0.0, + "step": 32799 + }, + { + "epoch": 3.060557991975366, + "grad_norm": NaN, + "learning_rate": 0.0001526547497264744, + "loss": 0.0, + "step": 32800 + }, + { + "epoch": 3.0606513016702435, + "grad_norm": NaN, + "learning_rate": 0.00015264718712336317, + "loss": 0.0, + "step": 32801 + }, + { + "epoch": 3.060744611365121, + "grad_norm": NaN, + "learning_rate": 0.000152639624513521, + "loss": 0.0, + "step": 32802 + }, + { + "epoch": 3.0608379210599983, + "grad_norm": NaN, + "learning_rate": 0.000152632061896967, + "loss": 0.0, + "step": 32803 + }, + { + "epoch": 3.0609312307548753, + "grad_norm": NaN, + "learning_rate": 0.00015262449927372042, + "loss": 0.0, + "step": 32804 + }, + { + "epoch": 3.0610245404497527, + "grad_norm": NaN, + "learning_rate": 0.00015261693664380056, + "loss": 0.0, + "step": 32805 + }, + { + "epoch": 3.06111785014463, + "grad_norm": NaN, + "learning_rate": 0.00015260937400722659, + "loss": 0.0, + "step": 32806 + }, + { + "epoch": 3.0612111598395075, + "grad_norm": NaN, + "learning_rate": 0.00015260181136401776, + "loss": 0.0, + "step": 32807 + }, + { + "epoch": 3.0613044695343845, + "grad_norm": NaN, + "learning_rate": 0.00015259424871419327, + "loss": 0.0, + "step": 32808 + }, + { + "epoch": 3.061397779229262, + "grad_norm": NaN, + "learning_rate": 0.0001525866860577724, + "loss": 0.0, + "step": 32809 + }, + { + "epoch": 3.0614910889241393, + "grad_norm": NaN, + "learning_rate": 0.00015257912339477432, + "loss": 0.0, + "step": 32810 + }, + { + "epoch": 3.0615843986190163, + "grad_norm": NaN, + "learning_rate": 0.00015257156072521835, + "loss": 0.0, + "step": 32811 + }, + { + "epoch": 3.0616777083138937, + "grad_norm": NaN, + "learning_rate": 0.0001525639980491236, + "loss": 0.0, + "step": 32812 + }, + { + "epoch": 3.061771018008771, + "grad_norm": NaN, + "learning_rate": 0.0001525564353665094, + "loss": 0.0, + "step": 32813 + }, + { + "epoch": 3.0618643277036486, + "grad_norm": NaN, + "learning_rate": 0.00015254887267739494, + "loss": 0.0, + "step": 32814 + }, + { + "epoch": 3.0619576373985256, + "grad_norm": NaN, + "learning_rate": 0.00015254130998179944, + "loss": 0.0, + "step": 32815 + }, + { + "epoch": 3.062050947093403, + "grad_norm": NaN, + "learning_rate": 0.00015253374727974215, + "loss": 0.0, + "step": 32816 + }, + { + "epoch": 3.0621442567882804, + "grad_norm": NaN, + "learning_rate": 0.00015252618457124227, + "loss": 0.0, + "step": 32817 + }, + { + "epoch": 3.062237566483158, + "grad_norm": NaN, + "learning_rate": 0.0001525186218563191, + "loss": 0.0, + "step": 32818 + }, + { + "epoch": 3.062330876178035, + "grad_norm": NaN, + "learning_rate": 0.00015251105913499177, + "loss": 0.0, + "step": 32819 + }, + { + "epoch": 3.062424185872912, + "grad_norm": NaN, + "learning_rate": 0.00015250349640727958, + "loss": 0.0, + "step": 32820 + }, + { + "epoch": 3.0625174955677896, + "grad_norm": NaN, + "learning_rate": 0.00015249593367320176, + "loss": 0.0, + "step": 32821 + }, + { + "epoch": 3.0626108052626666, + "grad_norm": NaN, + "learning_rate": 0.0001524883709327775, + "loss": 0.0, + "step": 32822 + }, + { + "epoch": 3.062704114957544, + "grad_norm": NaN, + "learning_rate": 0.00015248080818602603, + "loss": 0.0, + "step": 32823 + }, + { + "epoch": 3.0627974246524214, + "grad_norm": NaN, + "learning_rate": 0.00015247324543296665, + "loss": 0.0, + "step": 32824 + }, + { + "epoch": 3.062890734347299, + "grad_norm": NaN, + "learning_rate": 0.00015246568267361852, + "loss": 0.0, + "step": 32825 + }, + { + "epoch": 3.062984044042176, + "grad_norm": NaN, + "learning_rate": 0.00015245811990800086, + "loss": 0.0, + "step": 32826 + }, + { + "epoch": 3.0630773537370533, + "grad_norm": NaN, + "learning_rate": 0.00015245055713613296, + "loss": 0.0, + "step": 32827 + }, + { + "epoch": 3.0631706634319307, + "grad_norm": NaN, + "learning_rate": 0.000152442994358034, + "loss": 0.0, + "step": 32828 + }, + { + "epoch": 3.0632639731268076, + "grad_norm": NaN, + "learning_rate": 0.00015243543157372324, + "loss": 0.0, + "step": 32829 + }, + { + "epoch": 3.063357282821685, + "grad_norm": NaN, + "learning_rate": 0.0001524278687832199, + "loss": 0.0, + "step": 32830 + }, + { + "epoch": 3.0634505925165625, + "grad_norm": NaN, + "learning_rate": 0.00015242030598654319, + "loss": 0.0, + "step": 32831 + }, + { + "epoch": 3.06354390221144, + "grad_norm": NaN, + "learning_rate": 0.00015241274318371237, + "loss": 0.0, + "step": 32832 + }, + { + "epoch": 3.063637211906317, + "grad_norm": NaN, + "learning_rate": 0.0001524051803747467, + "loss": 0.0, + "step": 32833 + }, + { + "epoch": 3.0637305216011943, + "grad_norm": NaN, + "learning_rate": 0.0001523976175596653, + "loss": 0.0, + "step": 32834 + }, + { + "epoch": 3.0638238312960717, + "grad_norm": NaN, + "learning_rate": 0.0001523900547384875, + "loss": 0.0, + "step": 32835 + }, + { + "epoch": 3.063917140990949, + "grad_norm": NaN, + "learning_rate": 0.00015238249191123248, + "loss": 0.0, + "step": 32836 + }, + { + "epoch": 3.064010450685826, + "grad_norm": NaN, + "learning_rate": 0.00015237492907791954, + "loss": 0.0, + "step": 32837 + }, + { + "epoch": 3.0641037603807035, + "grad_norm": NaN, + "learning_rate": 0.0001523673662385678, + "loss": 0.0, + "step": 32838 + }, + { + "epoch": 3.064197070075581, + "grad_norm": NaN, + "learning_rate": 0.00015235980339319656, + "loss": 0.0, + "step": 32839 + }, + { + "epoch": 3.0642903797704584, + "grad_norm": NaN, + "learning_rate": 0.00015235224054182509, + "loss": 0.0, + "step": 32840 + }, + { + "epoch": 3.0643836894653353, + "grad_norm": NaN, + "learning_rate": 0.00015234467768447253, + "loss": 0.0, + "step": 32841 + }, + { + "epoch": 3.0644769991602128, + "grad_norm": NaN, + "learning_rate": 0.00015233711482115812, + "loss": 0.0, + "step": 32842 + }, + { + "epoch": 3.06457030885509, + "grad_norm": NaN, + "learning_rate": 0.00015232955195190122, + "loss": 0.0, + "step": 32843 + }, + { + "epoch": 3.064663618549967, + "grad_norm": NaN, + "learning_rate": 0.00015232198907672086, + "loss": 0.0, + "step": 32844 + }, + { + "epoch": 3.0647569282448446, + "grad_norm": NaN, + "learning_rate": 0.00015231442619563636, + "loss": 0.0, + "step": 32845 + }, + { + "epoch": 3.064850237939722, + "grad_norm": NaN, + "learning_rate": 0.00015230686330866707, + "loss": 0.0, + "step": 32846 + }, + { + "epoch": 3.0649435476345994, + "grad_norm": NaN, + "learning_rate": 0.00015229930041583205, + "loss": 0.0, + "step": 32847 + }, + { + "epoch": 3.0650368573294764, + "grad_norm": NaN, + "learning_rate": 0.00015229173751715052, + "loss": 0.0, + "step": 32848 + }, + { + "epoch": 3.065130167024354, + "grad_norm": NaN, + "learning_rate": 0.00015228417461264187, + "loss": 0.0, + "step": 32849 + }, + { + "epoch": 3.0652234767192312, + "grad_norm": NaN, + "learning_rate": 0.0001522766117023252, + "loss": 0.0, + "step": 32850 + }, + { + "epoch": 3.065316786414108, + "grad_norm": NaN, + "learning_rate": 0.00015226904878621982, + "loss": 0.0, + "step": 32851 + }, + { + "epoch": 3.0654100961089856, + "grad_norm": NaN, + "learning_rate": 0.0001522614858643449, + "loss": 0.0, + "step": 32852 + }, + { + "epoch": 3.065503405803863, + "grad_norm": NaN, + "learning_rate": 0.00015225392293671968, + "loss": 0.0, + "step": 32853 + }, + { + "epoch": 3.0655967154987405, + "grad_norm": NaN, + "learning_rate": 0.0001522463600033634, + "loss": 0.0, + "step": 32854 + }, + { + "epoch": 3.0656900251936174, + "grad_norm": NaN, + "learning_rate": 0.00015223879706429525, + "loss": 0.0, + "step": 32855 + }, + { + "epoch": 3.065783334888495, + "grad_norm": NaN, + "learning_rate": 0.00015223123411953462, + "loss": 0.0, + "step": 32856 + }, + { + "epoch": 3.0658766445833723, + "grad_norm": NaN, + "learning_rate": 0.00015222367116910052, + "loss": 0.0, + "step": 32857 + }, + { + "epoch": 3.0659699542782497, + "grad_norm": NaN, + "learning_rate": 0.0001522161082130123, + "loss": 0.0, + "step": 32858 + }, + { + "epoch": 3.0660632639731267, + "grad_norm": NaN, + "learning_rate": 0.00015220854525128922, + "loss": 0.0, + "step": 32859 + }, + { + "epoch": 3.066156573668004, + "grad_norm": NaN, + "learning_rate": 0.00015220098228395043, + "loss": 0.0, + "step": 32860 + }, + { + "epoch": 3.0662498833628815, + "grad_norm": NaN, + "learning_rate": 0.00015219341931101517, + "loss": 0.0, + "step": 32861 + }, + { + "epoch": 3.0663431930577585, + "grad_norm": NaN, + "learning_rate": 0.00015218585633250276, + "loss": 0.0, + "step": 32862 + }, + { + "epoch": 3.066436502752636, + "grad_norm": NaN, + "learning_rate": 0.00015217829334843232, + "loss": 0.0, + "step": 32863 + }, + { + "epoch": 3.0665298124475133, + "grad_norm": NaN, + "learning_rate": 0.0001521707303588231, + "loss": 0.0, + "step": 32864 + }, + { + "epoch": 3.0666231221423907, + "grad_norm": NaN, + "learning_rate": 0.0001521631673636944, + "loss": 0.0, + "step": 32865 + }, + { + "epoch": 3.0667164318372677, + "grad_norm": NaN, + "learning_rate": 0.0001521556043630654, + "loss": 0.0, + "step": 32866 + }, + { + "epoch": 3.066809741532145, + "grad_norm": NaN, + "learning_rate": 0.0001521480413569553, + "loss": 0.0, + "step": 32867 + }, + { + "epoch": 3.0669030512270226, + "grad_norm": NaN, + "learning_rate": 0.00015214047834538345, + "loss": 0.0, + "step": 32868 + }, + { + "epoch": 3.0669963609219, + "grad_norm": NaN, + "learning_rate": 0.00015213291532836893, + "loss": 0.0, + "step": 32869 + }, + { + "epoch": 3.067089670616777, + "grad_norm": NaN, + "learning_rate": 0.000152125352305931, + "loss": 0.0, + "step": 32870 + }, + { + "epoch": 3.0671829803116544, + "grad_norm": NaN, + "learning_rate": 0.00015211778927808905, + "loss": 0.0, + "step": 32871 + }, + { + "epoch": 3.067276290006532, + "grad_norm": NaN, + "learning_rate": 0.00015211022624486212, + "loss": 0.0, + "step": 32872 + }, + { + "epoch": 3.0673695997014088, + "grad_norm": NaN, + "learning_rate": 0.0001521026632062695, + "loss": 0.0, + "step": 32873 + }, + { + "epoch": 3.067462909396286, + "grad_norm": NaN, + "learning_rate": 0.00015209510016233043, + "loss": 0.0, + "step": 32874 + }, + { + "epoch": 3.0675562190911636, + "grad_norm": NaN, + "learning_rate": 0.00015208753711306417, + "loss": 0.0, + "step": 32875 + }, + { + "epoch": 3.067649528786041, + "grad_norm": NaN, + "learning_rate": 0.0001520799740584899, + "loss": 0.0, + "step": 32876 + }, + { + "epoch": 3.067742838480918, + "grad_norm": NaN, + "learning_rate": 0.00015207241099862685, + "loss": 0.0, + "step": 32877 + }, + { + "epoch": 3.0678361481757954, + "grad_norm": NaN, + "learning_rate": 0.00015206484793349435, + "loss": 0.0, + "step": 32878 + }, + { + "epoch": 3.067929457870673, + "grad_norm": NaN, + "learning_rate": 0.00015205728486311146, + "loss": 0.0, + "step": 32879 + }, + { + "epoch": 3.0680227675655503, + "grad_norm": NaN, + "learning_rate": 0.00015204972178749752, + "loss": 0.0, + "step": 32880 + }, + { + "epoch": 3.0681160772604272, + "grad_norm": NaN, + "learning_rate": 0.00015204215870667182, + "loss": 0.0, + "step": 32881 + }, + { + "epoch": 3.0682093869553047, + "grad_norm": NaN, + "learning_rate": 0.00015203459562065347, + "loss": 0.0, + "step": 32882 + }, + { + "epoch": 3.068302696650182, + "grad_norm": NaN, + "learning_rate": 0.0001520270325294617, + "loss": 0.0, + "step": 32883 + }, + { + "epoch": 3.068396006345059, + "grad_norm": NaN, + "learning_rate": 0.00015201946943311587, + "loss": 0.0, + "step": 32884 + }, + { + "epoch": 3.0684893160399365, + "grad_norm": NaN, + "learning_rate": 0.0001520119063316351, + "loss": 0.0, + "step": 32885 + }, + { + "epoch": 3.068582625734814, + "grad_norm": NaN, + "learning_rate": 0.00015200434322503858, + "loss": 0.0, + "step": 32886 + }, + { + "epoch": 3.0686759354296913, + "grad_norm": NaN, + "learning_rate": 0.0001519967801133457, + "loss": 0.0, + "step": 32887 + }, + { + "epoch": 3.0687692451245683, + "grad_norm": NaN, + "learning_rate": 0.00015198921699657557, + "loss": 0.0, + "step": 32888 + }, + { + "epoch": 3.0688625548194457, + "grad_norm": NaN, + "learning_rate": 0.00015198165387474743, + "loss": 0.0, + "step": 32889 + }, + { + "epoch": 3.068955864514323, + "grad_norm": NaN, + "learning_rate": 0.00015197409074788057, + "loss": 0.0, + "step": 32890 + }, + { + "epoch": 3.0690491742092005, + "grad_norm": NaN, + "learning_rate": 0.00015196652761599415, + "loss": 0.0, + "step": 32891 + }, + { + "epoch": 3.0691424839040775, + "grad_norm": NaN, + "learning_rate": 0.00015195896447910742, + "loss": 0.0, + "step": 32892 + }, + { + "epoch": 3.069235793598955, + "grad_norm": NaN, + "learning_rate": 0.00015195140133723961, + "loss": 0.0, + "step": 32893 + }, + { + "epoch": 3.0693291032938324, + "grad_norm": NaN, + "learning_rate": 0.00015194383819041004, + "loss": 0.0, + "step": 32894 + }, + { + "epoch": 3.0694224129887093, + "grad_norm": NaN, + "learning_rate": 0.0001519362750386378, + "loss": 0.0, + "step": 32895 + }, + { + "epoch": 3.0695157226835867, + "grad_norm": NaN, + "learning_rate": 0.0001519287118819422, + "loss": 0.0, + "step": 32896 + }, + { + "epoch": 3.069609032378464, + "grad_norm": NaN, + "learning_rate": 0.00015192114872034248, + "loss": 0.0, + "step": 32897 + }, + { + "epoch": 3.0697023420733416, + "grad_norm": NaN, + "learning_rate": 0.00015191358555385786, + "loss": 0.0, + "step": 32898 + }, + { + "epoch": 3.0697956517682186, + "grad_norm": NaN, + "learning_rate": 0.0001519060223825075, + "loss": 0.0, + "step": 32899 + }, + { + "epoch": 3.069888961463096, + "grad_norm": NaN, + "learning_rate": 0.00015189845920631073, + "loss": 0.0, + "step": 32900 + }, + { + "epoch": 3.0699822711579734, + "grad_norm": NaN, + "learning_rate": 0.00015189089602528675, + "loss": 0.0, + "step": 32901 + }, + { + "epoch": 3.070075580852851, + "grad_norm": NaN, + "learning_rate": 0.00015188333283945474, + "loss": 0.0, + "step": 32902 + }, + { + "epoch": 3.070168890547728, + "grad_norm": NaN, + "learning_rate": 0.00015187576964883402, + "loss": 0.0, + "step": 32903 + }, + { + "epoch": 3.070262200242605, + "grad_norm": NaN, + "learning_rate": 0.00015186820645344374, + "loss": 0.0, + "step": 32904 + }, + { + "epoch": 3.0703555099374826, + "grad_norm": NaN, + "learning_rate": 0.00015186064325330316, + "loss": 0.0, + "step": 32905 + }, + { + "epoch": 3.0704488196323596, + "grad_norm": NaN, + "learning_rate": 0.00015185308004843156, + "loss": 0.0, + "step": 32906 + }, + { + "epoch": 3.070542129327237, + "grad_norm": NaN, + "learning_rate": 0.0001518455168388481, + "loss": 0.0, + "step": 32907 + }, + { + "epoch": 3.0706354390221144, + "grad_norm": NaN, + "learning_rate": 0.000151837953624572, + "loss": 0.0, + "step": 32908 + }, + { + "epoch": 3.070728748716992, + "grad_norm": NaN, + "learning_rate": 0.00015183039040562259, + "loss": 0.0, + "step": 32909 + }, + { + "epoch": 3.070822058411869, + "grad_norm": NaN, + "learning_rate": 0.000151822827182019, + "loss": 0.0, + "step": 32910 + }, + { + "epoch": 3.0709153681067463, + "grad_norm": NaN, + "learning_rate": 0.00015181526395378049, + "loss": 0.0, + "step": 32911 + }, + { + "epoch": 3.0710086778016237, + "grad_norm": NaN, + "learning_rate": 0.00015180770072092633, + "loss": 0.0, + "step": 32912 + }, + { + "epoch": 3.071101987496501, + "grad_norm": NaN, + "learning_rate": 0.00015180013748347575, + "loss": 0.0, + "step": 32913 + }, + { + "epoch": 3.071195297191378, + "grad_norm": NaN, + "learning_rate": 0.0001517925742414479, + "loss": 0.0, + "step": 32914 + }, + { + "epoch": 3.0712886068862555, + "grad_norm": NaN, + "learning_rate": 0.00015178501099486207, + "loss": 0.0, + "step": 32915 + }, + { + "epoch": 3.071381916581133, + "grad_norm": NaN, + "learning_rate": 0.00015177744774373753, + "loss": 0.0, + "step": 32916 + }, + { + "epoch": 3.07147522627601, + "grad_norm": NaN, + "learning_rate": 0.00015176988448809345, + "loss": 0.0, + "step": 32917 + }, + { + "epoch": 3.0715685359708873, + "grad_norm": NaN, + "learning_rate": 0.000151762321227949, + "loss": 0.0, + "step": 32918 + }, + { + "epoch": 3.0716618456657647, + "grad_norm": NaN, + "learning_rate": 0.00015175475796332363, + "loss": 0.0, + "step": 32919 + }, + { + "epoch": 3.071755155360642, + "grad_norm": NaN, + "learning_rate": 0.00015174719469423633, + "loss": 0.0, + "step": 32920 + }, + { + "epoch": 3.071848465055519, + "grad_norm": NaN, + "learning_rate": 0.00015173963142070643, + "loss": 0.0, + "step": 32921 + }, + { + "epoch": 3.0719417747503965, + "grad_norm": NaN, + "learning_rate": 0.0001517320681427532, + "loss": 0.0, + "step": 32922 + }, + { + "epoch": 3.072035084445274, + "grad_norm": NaN, + "learning_rate": 0.00015172450486039584, + "loss": 0.0, + "step": 32923 + }, + { + "epoch": 3.0721283941401514, + "grad_norm": NaN, + "learning_rate": 0.00015171694157365353, + "loss": 0.0, + "step": 32924 + }, + { + "epoch": 3.0722217038350284, + "grad_norm": NaN, + "learning_rate": 0.00015170937828254562, + "loss": 0.0, + "step": 32925 + }, + { + "epoch": 3.0723150135299058, + "grad_norm": NaN, + "learning_rate": 0.00015170181498709123, + "loss": 0.0, + "step": 32926 + }, + { + "epoch": 3.072408323224783, + "grad_norm": NaN, + "learning_rate": 0.00015169425168730955, + "loss": 0.0, + "step": 32927 + }, + { + "epoch": 3.07250163291966, + "grad_norm": NaN, + "learning_rate": 0.00015168668838321997, + "loss": 0.0, + "step": 32928 + }, + { + "epoch": 3.0725949426145376, + "grad_norm": NaN, + "learning_rate": 0.00015167912507484167, + "loss": 0.0, + "step": 32929 + }, + { + "epoch": 3.072688252309415, + "grad_norm": NaN, + "learning_rate": 0.00015167156176219376, + "loss": 0.0, + "step": 32930 + }, + { + "epoch": 3.0727815620042924, + "grad_norm": NaN, + "learning_rate": 0.00015166399844529564, + "loss": 0.0, + "step": 32931 + }, + { + "epoch": 3.0728748716991694, + "grad_norm": NaN, + "learning_rate": 0.00015165643512416647, + "loss": 0.0, + "step": 32932 + }, + { + "epoch": 3.072968181394047, + "grad_norm": NaN, + "learning_rate": 0.00015164887179882543, + "loss": 0.0, + "step": 32933 + }, + { + "epoch": 3.0730614910889242, + "grad_norm": NaN, + "learning_rate": 0.0001516413084692918, + "loss": 0.0, + "step": 32934 + }, + { + "epoch": 3.0731548007838017, + "grad_norm": NaN, + "learning_rate": 0.00015163374513558486, + "loss": 0.0, + "step": 32935 + }, + { + "epoch": 3.0732481104786786, + "grad_norm": NaN, + "learning_rate": 0.0001516261817977237, + "loss": 0.0, + "step": 32936 + }, + { + "epoch": 3.073341420173556, + "grad_norm": NaN, + "learning_rate": 0.00015161861845572772, + "loss": 0.0, + "step": 32937 + }, + { + "epoch": 3.0734347298684335, + "grad_norm": NaN, + "learning_rate": 0.0001516110551096161, + "loss": 0.0, + "step": 32938 + }, + { + "epoch": 3.0735280395633104, + "grad_norm": NaN, + "learning_rate": 0.00015160349175940795, + "loss": 0.0, + "step": 32939 + }, + { + "epoch": 3.073621349258188, + "grad_norm": NaN, + "learning_rate": 0.0001515959284051226, + "loss": 0.0, + "step": 32940 + }, + { + "epoch": 3.0737146589530653, + "grad_norm": NaN, + "learning_rate": 0.00015158836504677937, + "loss": 0.0, + "step": 32941 + }, + { + "epoch": 3.0738079686479427, + "grad_norm": NaN, + "learning_rate": 0.00015158080168439734, + "loss": 0.0, + "step": 32942 + }, + { + "epoch": 3.0739012783428197, + "grad_norm": NaN, + "learning_rate": 0.00015157323831799577, + "loss": 0.0, + "step": 32943 + }, + { + "epoch": 3.073994588037697, + "grad_norm": NaN, + "learning_rate": 0.000151565674947594, + "loss": 0.0, + "step": 32944 + }, + { + "epoch": 3.0740878977325745, + "grad_norm": NaN, + "learning_rate": 0.0001515581115732111, + "loss": 0.0, + "step": 32945 + }, + { + "epoch": 3.0741812074274515, + "grad_norm": NaN, + "learning_rate": 0.00015155054819486642, + "loss": 0.0, + "step": 32946 + }, + { + "epoch": 3.074274517122329, + "grad_norm": NaN, + "learning_rate": 0.00015154298481257915, + "loss": 0.0, + "step": 32947 + }, + { + "epoch": 3.0743678268172063, + "grad_norm": NaN, + "learning_rate": 0.00015153542142636855, + "loss": 0.0, + "step": 32948 + }, + { + "epoch": 3.0744611365120837, + "grad_norm": NaN, + "learning_rate": 0.00015152785803625377, + "loss": 0.0, + "step": 32949 + }, + { + "epoch": 3.0745544462069607, + "grad_norm": NaN, + "learning_rate": 0.00015152029464225416, + "loss": 0.0, + "step": 32950 + }, + { + "epoch": 3.074647755901838, + "grad_norm": NaN, + "learning_rate": 0.0001515127312443889, + "loss": 0.0, + "step": 32951 + }, + { + "epoch": 3.0747410655967156, + "grad_norm": NaN, + "learning_rate": 0.00015150516784267714, + "loss": 0.0, + "step": 32952 + }, + { + "epoch": 3.074834375291593, + "grad_norm": NaN, + "learning_rate": 0.00015149760443713823, + "loss": 0.0, + "step": 32953 + }, + { + "epoch": 3.07492768498647, + "grad_norm": NaN, + "learning_rate": 0.00015149004102779138, + "loss": 0.0, + "step": 32954 + }, + { + "epoch": 3.0750209946813474, + "grad_norm": NaN, + "learning_rate": 0.00015148247761465574, + "loss": 0.0, + "step": 32955 + }, + { + "epoch": 3.075114304376225, + "grad_norm": NaN, + "learning_rate": 0.00015147491419775062, + "loss": 0.0, + "step": 32956 + }, + { + "epoch": 3.075207614071102, + "grad_norm": NaN, + "learning_rate": 0.00015146735077709527, + "loss": 0.0, + "step": 32957 + }, + { + "epoch": 3.075300923765979, + "grad_norm": NaN, + "learning_rate": 0.00015145978735270883, + "loss": 0.0, + "step": 32958 + }, + { + "epoch": 3.0753942334608566, + "grad_norm": NaN, + "learning_rate": 0.0001514522239246106, + "loss": 0.0, + "step": 32959 + }, + { + "epoch": 3.075487543155734, + "grad_norm": NaN, + "learning_rate": 0.00015144466049281982, + "loss": 0.0, + "step": 32960 + }, + { + "epoch": 3.075580852850611, + "grad_norm": NaN, + "learning_rate": 0.0001514370970573557, + "loss": 0.0, + "step": 32961 + }, + { + "epoch": 3.0756741625454884, + "grad_norm": NaN, + "learning_rate": 0.0001514295336182374, + "loss": 0.0, + "step": 32962 + }, + { + "epoch": 3.075767472240366, + "grad_norm": NaN, + "learning_rate": 0.00015142197017548427, + "loss": 0.0, + "step": 32963 + }, + { + "epoch": 3.0758607819352433, + "grad_norm": NaN, + "learning_rate": 0.0001514144067291155, + "loss": 0.0, + "step": 32964 + }, + { + "epoch": 3.0759540916301202, + "grad_norm": NaN, + "learning_rate": 0.00015140684327915024, + "loss": 0.0, + "step": 32965 + }, + { + "epoch": 3.0760474013249977, + "grad_norm": NaN, + "learning_rate": 0.00015139927982560787, + "loss": 0.0, + "step": 32966 + }, + { + "epoch": 3.076140711019875, + "grad_norm": NaN, + "learning_rate": 0.00015139171636850755, + "loss": 0.0, + "step": 32967 + }, + { + "epoch": 3.076234020714752, + "grad_norm": NaN, + "learning_rate": 0.00015138415290786845, + "loss": 0.0, + "step": 32968 + }, + { + "epoch": 3.0763273304096295, + "grad_norm": NaN, + "learning_rate": 0.00015137658944370993, + "loss": 0.0, + "step": 32969 + }, + { + "epoch": 3.076420640104507, + "grad_norm": NaN, + "learning_rate": 0.0001513690259760511, + "loss": 0.0, + "step": 32970 + }, + { + "epoch": 3.0765139497993843, + "grad_norm": NaN, + "learning_rate": 0.00015136146250491124, + "loss": 0.0, + "step": 32971 + }, + { + "epoch": 3.0766072594942613, + "grad_norm": NaN, + "learning_rate": 0.0001513538990303096, + "loss": 0.0, + "step": 32972 + }, + { + "epoch": 3.0767005691891387, + "grad_norm": NaN, + "learning_rate": 0.00015134633555226543, + "loss": 0.0, + "step": 32973 + }, + { + "epoch": 3.076793878884016, + "grad_norm": NaN, + "learning_rate": 0.00015133877207079786, + "loss": 0.0, + "step": 32974 + }, + { + "epoch": 3.0768871885788935, + "grad_norm": NaN, + "learning_rate": 0.00015133120858592622, + "loss": 0.0, + "step": 32975 + }, + { + "epoch": 3.0769804982737705, + "grad_norm": NaN, + "learning_rate": 0.00015132364509766976, + "loss": 0.0, + "step": 32976 + }, + { + "epoch": 3.077073807968648, + "grad_norm": NaN, + "learning_rate": 0.00015131608160604758, + "loss": 0.0, + "step": 32977 + }, + { + "epoch": 3.0771671176635254, + "grad_norm": NaN, + "learning_rate": 0.000151308518111079, + "loss": 0.0, + "step": 32978 + }, + { + "epoch": 3.0772604273584023, + "grad_norm": NaN, + "learning_rate": 0.00015130095461278334, + "loss": 0.0, + "step": 32979 + }, + { + "epoch": 3.0773537370532797, + "grad_norm": NaN, + "learning_rate": 0.00015129339111117965, + "loss": 0.0, + "step": 32980 + }, + { + "epoch": 3.077447046748157, + "grad_norm": NaN, + "learning_rate": 0.00015128582760628724, + "loss": 0.0, + "step": 32981 + }, + { + "epoch": 3.0775403564430346, + "grad_norm": NaN, + "learning_rate": 0.00015127826409812545, + "loss": 0.0, + "step": 32982 + }, + { + "epoch": 3.0776336661379116, + "grad_norm": NaN, + "learning_rate": 0.00015127070058671332, + "loss": 0.0, + "step": 32983 + }, + { + "epoch": 3.077726975832789, + "grad_norm": NaN, + "learning_rate": 0.00015126313707207017, + "loss": 0.0, + "step": 32984 + }, + { + "epoch": 3.0778202855276664, + "grad_norm": NaN, + "learning_rate": 0.00015125557355421526, + "loss": 0.0, + "step": 32985 + }, + { + "epoch": 3.077913595222544, + "grad_norm": NaN, + "learning_rate": 0.00015124801003316786, + "loss": 0.0, + "step": 32986 + }, + { + "epoch": 3.078006904917421, + "grad_norm": NaN, + "learning_rate": 0.00015124044650894703, + "loss": 0.0, + "step": 32987 + }, + { + "epoch": 3.078100214612298, + "grad_norm": NaN, + "learning_rate": 0.0001512328829815722, + "loss": 0.0, + "step": 32988 + }, + { + "epoch": 3.0781935243071756, + "grad_norm": NaN, + "learning_rate": 0.00015122531945106248, + "loss": 0.0, + "step": 32989 + }, + { + "epoch": 3.0782868340020526, + "grad_norm": NaN, + "learning_rate": 0.0001512177559174371, + "loss": 0.0, + "step": 32990 + }, + { + "epoch": 3.07838014369693, + "grad_norm": NaN, + "learning_rate": 0.00015121019238071537, + "loss": 0.0, + "step": 32991 + }, + { + "epoch": 3.0784734533918074, + "grad_norm": NaN, + "learning_rate": 0.00015120262884091653, + "loss": 0.0, + "step": 32992 + }, + { + "epoch": 3.078566763086685, + "grad_norm": NaN, + "learning_rate": 0.00015119506529805968, + "loss": 0.0, + "step": 32993 + }, + { + "epoch": 3.078660072781562, + "grad_norm": NaN, + "learning_rate": 0.00015118750175216415, + "loss": 0.0, + "step": 32994 + }, + { + "epoch": 3.0787533824764393, + "grad_norm": NaN, + "learning_rate": 0.0001511799382032492, + "loss": 0.0, + "step": 32995 + }, + { + "epoch": 3.0788466921713167, + "grad_norm": NaN, + "learning_rate": 0.00015117237465133397, + "loss": 0.0, + "step": 32996 + }, + { + "epoch": 3.078940001866194, + "grad_norm": NaN, + "learning_rate": 0.00015116481109643773, + "loss": 0.0, + "step": 32997 + }, + { + "epoch": 3.079033311561071, + "grad_norm": NaN, + "learning_rate": 0.00015115724753857977, + "loss": 0.0, + "step": 32998 + }, + { + "epoch": 3.0791266212559485, + "grad_norm": NaN, + "learning_rate": 0.0001511496839777792, + "loss": 0.0, + "step": 32999 + }, + { + "epoch": 3.079219930950826, + "grad_norm": NaN, + "learning_rate": 0.00015114212041405538, + "loss": 0.0, + "step": 33000 + }, + { + "epoch": 3.079313240645703, + "grad_norm": NaN, + "learning_rate": 0.0001511345568474275, + "loss": 0.0, + "step": 33001 + }, + { + "epoch": 3.0794065503405803, + "grad_norm": NaN, + "learning_rate": 0.00015112699327791471, + "loss": 0.0, + "step": 33002 + }, + { + "epoch": 3.0794998600354577, + "grad_norm": NaN, + "learning_rate": 0.00015111942970553633, + "loss": 0.0, + "step": 33003 + }, + { + "epoch": 3.079593169730335, + "grad_norm": NaN, + "learning_rate": 0.00015111186613031162, + "loss": 0.0, + "step": 33004 + }, + { + "epoch": 3.079686479425212, + "grad_norm": NaN, + "learning_rate": 0.00015110430255225974, + "loss": 0.0, + "step": 33005 + }, + { + "epoch": 3.0797797891200895, + "grad_norm": NaN, + "learning_rate": 0.00015109673897139995, + "loss": 0.0, + "step": 33006 + }, + { + "epoch": 3.079873098814967, + "grad_norm": NaN, + "learning_rate": 0.00015108917538775144, + "loss": 0.0, + "step": 33007 + }, + { + "epoch": 3.0799664085098444, + "grad_norm": NaN, + "learning_rate": 0.00015108161180133357, + "loss": 0.0, + "step": 33008 + }, + { + "epoch": 3.0800597182047214, + "grad_norm": NaN, + "learning_rate": 0.0001510740482121654, + "loss": 0.0, + "step": 33009 + }, + { + "epoch": 3.0801530278995988, + "grad_norm": NaN, + "learning_rate": 0.00015106648462026622, + "loss": 0.0, + "step": 33010 + }, + { + "epoch": 3.080246337594476, + "grad_norm": NaN, + "learning_rate": 0.0001510589210256554, + "loss": 0.0, + "step": 33011 + }, + { + "epoch": 3.080339647289353, + "grad_norm": NaN, + "learning_rate": 0.00015105135742835192, + "loss": 0.0, + "step": 33012 + }, + { + "epoch": 3.0804329569842306, + "grad_norm": NaN, + "learning_rate": 0.00015104379382837522, + "loss": 0.0, + "step": 33013 + }, + { + "epoch": 3.080526266679108, + "grad_norm": NaN, + "learning_rate": 0.0001510362302257445, + "loss": 0.0, + "step": 33014 + }, + { + "epoch": 3.0806195763739854, + "grad_norm": NaN, + "learning_rate": 0.0001510286666204789, + "loss": 0.0, + "step": 33015 + }, + { + "epoch": 3.0807128860688624, + "grad_norm": NaN, + "learning_rate": 0.0001510211030125977, + "loss": 0.0, + "step": 33016 + }, + { + "epoch": 3.08080619576374, + "grad_norm": NaN, + "learning_rate": 0.0001510135394021202, + "loss": 0.0, + "step": 33017 + }, + { + "epoch": 3.0808995054586172, + "grad_norm": NaN, + "learning_rate": 0.0001510059757890655, + "loss": 0.0, + "step": 33018 + }, + { + "epoch": 3.0809928151534947, + "grad_norm": NaN, + "learning_rate": 0.0001509984121734529, + "loss": 0.0, + "step": 33019 + }, + { + "epoch": 3.0810861248483716, + "grad_norm": NaN, + "learning_rate": 0.0001509908485553017, + "loss": 0.0, + "step": 33020 + }, + { + "epoch": 3.081179434543249, + "grad_norm": NaN, + "learning_rate": 0.000150983284934631, + "loss": 0.0, + "step": 33021 + }, + { + "epoch": 3.0812727442381265, + "grad_norm": NaN, + "learning_rate": 0.0001509757213114601, + "loss": 0.0, + "step": 33022 + }, + { + "epoch": 3.0813660539330034, + "grad_norm": NaN, + "learning_rate": 0.00015096815768580828, + "loss": 0.0, + "step": 33023 + }, + { + "epoch": 3.081459363627881, + "grad_norm": NaN, + "learning_rate": 0.0001509605940576947, + "loss": 0.0, + "step": 33024 + }, + { + "epoch": 3.0815526733227583, + "grad_norm": NaN, + "learning_rate": 0.00015095303042713857, + "loss": 0.0, + "step": 33025 + }, + { + "epoch": 3.0816459830176357, + "grad_norm": NaN, + "learning_rate": 0.00015094546679415922, + "loss": 0.0, + "step": 33026 + }, + { + "epoch": 3.0817392927125127, + "grad_norm": NaN, + "learning_rate": 0.00015093790315877584, + "loss": 0.0, + "step": 33027 + }, + { + "epoch": 3.08183260240739, + "grad_norm": NaN, + "learning_rate": 0.0001509303395210076, + "loss": 0.0, + "step": 33028 + }, + { + "epoch": 3.0819259121022675, + "grad_norm": NaN, + "learning_rate": 0.00015092277588087382, + "loss": 0.0, + "step": 33029 + }, + { + "epoch": 3.082019221797145, + "grad_norm": NaN, + "learning_rate": 0.0001509152122383937, + "loss": 0.0, + "step": 33030 + }, + { + "epoch": 3.082112531492022, + "grad_norm": NaN, + "learning_rate": 0.0001509076485935864, + "loss": 0.0, + "step": 33031 + }, + { + "epoch": 3.0822058411868993, + "grad_norm": NaN, + "learning_rate": 0.00015090008494647128, + "loss": 0.0, + "step": 33032 + }, + { + "epoch": 3.0822991508817768, + "grad_norm": NaN, + "learning_rate": 0.00015089252129706752, + "loss": 0.0, + "step": 33033 + }, + { + "epoch": 3.0823924605766537, + "grad_norm": NaN, + "learning_rate": 0.00015088495764539427, + "loss": 0.0, + "step": 33034 + }, + { + "epoch": 3.082485770271531, + "grad_norm": NaN, + "learning_rate": 0.00015087739399147088, + "loss": 0.0, + "step": 33035 + }, + { + "epoch": 3.0825790799664086, + "grad_norm": NaN, + "learning_rate": 0.0001508698303353166, + "loss": 0.0, + "step": 33036 + }, + { + "epoch": 3.082672389661286, + "grad_norm": NaN, + "learning_rate": 0.0001508622666769505, + "loss": 0.0, + "step": 33037 + }, + { + "epoch": 3.082765699356163, + "grad_norm": NaN, + "learning_rate": 0.00015085470301639194, + "loss": 0.0, + "step": 33038 + }, + { + "epoch": 3.0828590090510404, + "grad_norm": NaN, + "learning_rate": 0.00015084713935366016, + "loss": 0.0, + "step": 33039 + }, + { + "epoch": 3.082952318745918, + "grad_norm": NaN, + "learning_rate": 0.00015083957568877432, + "loss": 0.0, + "step": 33040 + }, + { + "epoch": 3.0830456284407948, + "grad_norm": NaN, + "learning_rate": 0.00015083201202175368, + "loss": 0.0, + "step": 33041 + }, + { + "epoch": 3.083138938135672, + "grad_norm": NaN, + "learning_rate": 0.0001508244483526175, + "loss": 0.0, + "step": 33042 + }, + { + "epoch": 3.0832322478305496, + "grad_norm": NaN, + "learning_rate": 0.000150816884681385, + "loss": 0.0, + "step": 33043 + }, + { + "epoch": 3.083325557525427, + "grad_norm": NaN, + "learning_rate": 0.00015080932100807539, + "loss": 0.0, + "step": 33044 + }, + { + "epoch": 3.083418867220304, + "grad_norm": NaN, + "learning_rate": 0.00015080175733270788, + "loss": 0.0, + "step": 33045 + }, + { + "epoch": 3.0835121769151814, + "grad_norm": NaN, + "learning_rate": 0.00015079419365530178, + "loss": 0.0, + "step": 33046 + }, + { + "epoch": 3.083605486610059, + "grad_norm": NaN, + "learning_rate": 0.00015078662997587632, + "loss": 0.0, + "step": 33047 + }, + { + "epoch": 3.0836987963049363, + "grad_norm": NaN, + "learning_rate": 0.00015077906629445062, + "loss": 0.0, + "step": 33048 + }, + { + "epoch": 3.0837921059998132, + "grad_norm": NaN, + "learning_rate": 0.00015077150261104402, + "loss": 0.0, + "step": 33049 + }, + { + "epoch": 3.0838854156946907, + "grad_norm": NaN, + "learning_rate": 0.0001507639389256757, + "loss": 0.0, + "step": 33050 + }, + { + "epoch": 3.083978725389568, + "grad_norm": NaN, + "learning_rate": 0.00015075637523836492, + "loss": 0.0, + "step": 33051 + }, + { + "epoch": 3.0840720350844455, + "grad_norm": NaN, + "learning_rate": 0.00015074881154913096, + "loss": 0.0, + "step": 33052 + }, + { + "epoch": 3.0841653447793225, + "grad_norm": NaN, + "learning_rate": 0.00015074124785799292, + "loss": 0.0, + "step": 33053 + }, + { + "epoch": 3.0842586544742, + "grad_norm": NaN, + "learning_rate": 0.0001507336841649701, + "loss": 0.0, + "step": 33054 + }, + { + "epoch": 3.0843519641690773, + "grad_norm": NaN, + "learning_rate": 0.00015072612047008178, + "loss": 0.0, + "step": 33055 + }, + { + "epoch": 3.0844452738639543, + "grad_norm": NaN, + "learning_rate": 0.00015071855677334712, + "loss": 0.0, + "step": 33056 + }, + { + "epoch": 3.0845385835588317, + "grad_norm": NaN, + "learning_rate": 0.0001507109930747854, + "loss": 0.0, + "step": 33057 + }, + { + "epoch": 3.084631893253709, + "grad_norm": NaN, + "learning_rate": 0.00015070342937441585, + "loss": 0.0, + "step": 33058 + }, + { + "epoch": 3.0847252029485865, + "grad_norm": NaN, + "learning_rate": 0.00015069586567225765, + "loss": 0.0, + "step": 33059 + }, + { + "epoch": 3.0848185126434635, + "grad_norm": NaN, + "learning_rate": 0.0001506883019683301, + "loss": 0.0, + "step": 33060 + }, + { + "epoch": 3.084911822338341, + "grad_norm": NaN, + "learning_rate": 0.0001506807382626524, + "loss": 0.0, + "step": 33061 + }, + { + "epoch": 3.0850051320332184, + "grad_norm": NaN, + "learning_rate": 0.00015067317455524375, + "loss": 0.0, + "step": 33062 + }, + { + "epoch": 3.0850984417280953, + "grad_norm": NaN, + "learning_rate": 0.00015066561084612346, + "loss": 0.0, + "step": 33063 + }, + { + "epoch": 3.0851917514229728, + "grad_norm": NaN, + "learning_rate": 0.00015065804713531068, + "loss": 0.0, + "step": 33064 + }, + { + "epoch": 3.08528506111785, + "grad_norm": NaN, + "learning_rate": 0.00015065048342282472, + "loss": 0.0, + "step": 33065 + }, + { + "epoch": 3.0853783708127276, + "grad_norm": NaN, + "learning_rate": 0.00015064291970868474, + "loss": 0.0, + "step": 33066 + }, + { + "epoch": 3.0854716805076046, + "grad_norm": NaN, + "learning_rate": 0.00015063535599291005, + "loss": 0.0, + "step": 33067 + }, + { + "epoch": 3.085564990202482, + "grad_norm": NaN, + "learning_rate": 0.0001506277922755198, + "loss": 0.0, + "step": 33068 + }, + { + "epoch": 3.0856582998973594, + "grad_norm": NaN, + "learning_rate": 0.00015062022855653326, + "loss": 0.0, + "step": 33069 + }, + { + "epoch": 3.085751609592237, + "grad_norm": NaN, + "learning_rate": 0.00015061266483596967, + "loss": 0.0, + "step": 33070 + }, + { + "epoch": 3.085844919287114, + "grad_norm": NaN, + "learning_rate": 0.00015060510111384825, + "loss": 0.0, + "step": 33071 + }, + { + "epoch": 3.085938228981991, + "grad_norm": NaN, + "learning_rate": 0.00015059753739018824, + "loss": 0.0, + "step": 33072 + }, + { + "epoch": 3.0860315386768686, + "grad_norm": NaN, + "learning_rate": 0.0001505899736650089, + "loss": 0.0, + "step": 33073 + }, + { + "epoch": 3.0861248483717456, + "grad_norm": NaN, + "learning_rate": 0.00015058240993832942, + "loss": 0.0, + "step": 33074 + }, + { + "epoch": 3.086218158066623, + "grad_norm": NaN, + "learning_rate": 0.000150574846210169, + "loss": 0.0, + "step": 33075 + }, + { + "epoch": 3.0863114677615004, + "grad_norm": NaN, + "learning_rate": 0.00015056728248054695, + "loss": 0.0, + "step": 33076 + }, + { + "epoch": 3.086404777456378, + "grad_norm": NaN, + "learning_rate": 0.0001505597187494825, + "loss": 0.0, + "step": 33077 + }, + { + "epoch": 3.086498087151255, + "grad_norm": NaN, + "learning_rate": 0.00015055215501699477, + "loss": 0.0, + "step": 33078 + }, + { + "epoch": 3.0865913968461323, + "grad_norm": NaN, + "learning_rate": 0.00015054459128310314, + "loss": 0.0, + "step": 33079 + }, + { + "epoch": 3.0866847065410097, + "grad_norm": NaN, + "learning_rate": 0.00015053702754782672, + "loss": 0.0, + "step": 33080 + }, + { + "epoch": 3.086778016235887, + "grad_norm": NaN, + "learning_rate": 0.00015052946381118485, + "loss": 0.0, + "step": 33081 + }, + { + "epoch": 3.086871325930764, + "grad_norm": NaN, + "learning_rate": 0.00015052190007319667, + "loss": 0.0, + "step": 33082 + }, + { + "epoch": 3.0869646356256415, + "grad_norm": NaN, + "learning_rate": 0.0001505143363338815, + "loss": 0.0, + "step": 33083 + }, + { + "epoch": 3.087057945320519, + "grad_norm": NaN, + "learning_rate": 0.0001505067725932585, + "loss": 0.0, + "step": 33084 + }, + { + "epoch": 3.087151255015396, + "grad_norm": NaN, + "learning_rate": 0.00015049920885134695, + "loss": 0.0, + "step": 33085 + }, + { + "epoch": 3.0872445647102733, + "grad_norm": NaN, + "learning_rate": 0.00015049164510816602, + "loss": 0.0, + "step": 33086 + }, + { + "epoch": 3.0873378744051507, + "grad_norm": NaN, + "learning_rate": 0.00015048408136373502, + "loss": 0.0, + "step": 33087 + }, + { + "epoch": 3.087431184100028, + "grad_norm": NaN, + "learning_rate": 0.0001504765176180731, + "loss": 0.0, + "step": 33088 + }, + { + "epoch": 3.087524493794905, + "grad_norm": NaN, + "learning_rate": 0.00015046895387119954, + "loss": 0.0, + "step": 33089 + }, + { + "epoch": 3.0876178034897825, + "grad_norm": NaN, + "learning_rate": 0.0001504613901231336, + "loss": 0.0, + "step": 33090 + }, + { + "epoch": 3.08771111318466, + "grad_norm": NaN, + "learning_rate": 0.00015045382637389446, + "loss": 0.0, + "step": 33091 + }, + { + "epoch": 3.0878044228795374, + "grad_norm": NaN, + "learning_rate": 0.0001504462626235014, + "loss": 0.0, + "step": 33092 + }, + { + "epoch": 3.0878977325744144, + "grad_norm": NaN, + "learning_rate": 0.0001504386988719736, + "loss": 0.0, + "step": 33093 + }, + { + "epoch": 3.0879910422692918, + "grad_norm": NaN, + "learning_rate": 0.00015043113511933033, + "loss": 0.0, + "step": 33094 + }, + { + "epoch": 3.088084351964169, + "grad_norm": NaN, + "learning_rate": 0.00015042357136559083, + "loss": 0.0, + "step": 33095 + }, + { + "epoch": 3.088177661659046, + "grad_norm": NaN, + "learning_rate": 0.00015041600761077432, + "loss": 0.0, + "step": 33096 + }, + { + "epoch": 3.0882709713539236, + "grad_norm": NaN, + "learning_rate": 0.00015040844385489995, + "loss": 0.0, + "step": 33097 + }, + { + "epoch": 3.088364281048801, + "grad_norm": NaN, + "learning_rate": 0.0001504008800979871, + "loss": 0.0, + "step": 33098 + }, + { + "epoch": 3.0884575907436784, + "grad_norm": NaN, + "learning_rate": 0.00015039331634005492, + "loss": 0.0, + "step": 33099 + }, + { + "epoch": 3.0885509004385554, + "grad_norm": NaN, + "learning_rate": 0.00015038575258112264, + "loss": 0.0, + "step": 33100 + }, + { + "epoch": 3.088644210133433, + "grad_norm": NaN, + "learning_rate": 0.0001503781888212095, + "loss": 0.0, + "step": 33101 + }, + { + "epoch": 3.0887375198283102, + "grad_norm": NaN, + "learning_rate": 0.00015037062506033478, + "loss": 0.0, + "step": 33102 + }, + { + "epoch": 3.0888308295231877, + "grad_norm": NaN, + "learning_rate": 0.00015036306129851762, + "loss": 0.0, + "step": 33103 + }, + { + "epoch": 3.0889241392180646, + "grad_norm": NaN, + "learning_rate": 0.00015035549753577733, + "loss": 0.0, + "step": 33104 + }, + { + "epoch": 3.089017448912942, + "grad_norm": NaN, + "learning_rate": 0.00015034793377213308, + "loss": 0.0, + "step": 33105 + }, + { + "epoch": 3.0891107586078195, + "grad_norm": NaN, + "learning_rate": 0.00015034037000760417, + "loss": 0.0, + "step": 33106 + }, + { + "epoch": 3.0892040683026964, + "grad_norm": NaN, + "learning_rate": 0.0001503328062422098, + "loss": 0.0, + "step": 33107 + }, + { + "epoch": 3.089297377997574, + "grad_norm": NaN, + "learning_rate": 0.0001503252424759692, + "loss": 0.0, + "step": 33108 + }, + { + "epoch": 3.0893906876924513, + "grad_norm": NaN, + "learning_rate": 0.00015031767870890157, + "loss": 0.0, + "step": 33109 + }, + { + "epoch": 3.0894839973873287, + "grad_norm": NaN, + "learning_rate": 0.00015031011494102622, + "loss": 0.0, + "step": 33110 + }, + { + "epoch": 3.0895773070822057, + "grad_norm": NaN, + "learning_rate": 0.0001503025511723623, + "loss": 0.0, + "step": 33111 + }, + { + "epoch": 3.089670616777083, + "grad_norm": NaN, + "learning_rate": 0.00015029498740292912, + "loss": 0.0, + "step": 33112 + }, + { + "epoch": 3.0897639264719605, + "grad_norm": NaN, + "learning_rate": 0.0001502874236327459, + "loss": 0.0, + "step": 33113 + }, + { + "epoch": 3.089857236166838, + "grad_norm": NaN, + "learning_rate": 0.0001502798598618318, + "loss": 0.0, + "step": 33114 + }, + { + "epoch": 3.089950545861715, + "grad_norm": NaN, + "learning_rate": 0.00015027229609020608, + "loss": 0.0, + "step": 33115 + }, + { + "epoch": 3.0900438555565923, + "grad_norm": NaN, + "learning_rate": 0.00015026473231788805, + "loss": 0.0, + "step": 33116 + }, + { + "epoch": 3.0901371652514698, + "grad_norm": NaN, + "learning_rate": 0.00015025716854489684, + "loss": 0.0, + "step": 33117 + }, + { + "epoch": 3.0902304749463467, + "grad_norm": NaN, + "learning_rate": 0.00015024960477125172, + "loss": 0.0, + "step": 33118 + }, + { + "epoch": 3.090323784641224, + "grad_norm": NaN, + "learning_rate": 0.00015024204099697196, + "loss": 0.0, + "step": 33119 + }, + { + "epoch": 3.0904170943361016, + "grad_norm": NaN, + "learning_rate": 0.00015023447722207675, + "loss": 0.0, + "step": 33120 + }, + { + "epoch": 3.090510404030979, + "grad_norm": NaN, + "learning_rate": 0.00015022691344658535, + "loss": 0.0, + "step": 33121 + }, + { + "epoch": 3.090603713725856, + "grad_norm": NaN, + "learning_rate": 0.00015021934967051695, + "loss": 0.0, + "step": 33122 + }, + { + "epoch": 3.0906970234207334, + "grad_norm": NaN, + "learning_rate": 0.00015021178589389085, + "loss": 0.0, + "step": 33123 + }, + { + "epoch": 3.090790333115611, + "grad_norm": NaN, + "learning_rate": 0.0001502042221167262, + "loss": 0.0, + "step": 33124 + }, + { + "epoch": 3.090883642810488, + "grad_norm": NaN, + "learning_rate": 0.00015019665833904228, + "loss": 0.0, + "step": 33125 + }, + { + "epoch": 3.090976952505365, + "grad_norm": NaN, + "learning_rate": 0.0001501890945608583, + "loss": 0.0, + "step": 33126 + }, + { + "epoch": 3.0910702622002426, + "grad_norm": NaN, + "learning_rate": 0.00015018153078219352, + "loss": 0.0, + "step": 33127 + }, + { + "epoch": 3.09116357189512, + "grad_norm": NaN, + "learning_rate": 0.00015017396700306722, + "loss": 0.0, + "step": 33128 + }, + { + "epoch": 3.091256881589997, + "grad_norm": NaN, + "learning_rate": 0.0001501664032234985, + "loss": 0.0, + "step": 33129 + }, + { + "epoch": 3.0913501912848744, + "grad_norm": NaN, + "learning_rate": 0.00015015883944350668, + "loss": 0.0, + "step": 33130 + }, + { + "epoch": 3.091443500979752, + "grad_norm": NaN, + "learning_rate": 0.00015015127566311097, + "loss": 0.0, + "step": 33131 + }, + { + "epoch": 3.0915368106746293, + "grad_norm": NaN, + "learning_rate": 0.00015014371188233063, + "loss": 0.0, + "step": 33132 + }, + { + "epoch": 3.0916301203695062, + "grad_norm": NaN, + "learning_rate": 0.0001501361481011849, + "loss": 0.0, + "step": 33133 + }, + { + "epoch": 3.0917234300643837, + "grad_norm": NaN, + "learning_rate": 0.00015012858431969297, + "loss": 0.0, + "step": 33134 + }, + { + "epoch": 3.091816739759261, + "grad_norm": NaN, + "learning_rate": 0.00015012102053787405, + "loss": 0.0, + "step": 33135 + }, + { + "epoch": 3.0919100494541385, + "grad_norm": NaN, + "learning_rate": 0.00015011345675574746, + "loss": 0.0, + "step": 33136 + }, + { + "epoch": 3.0920033591490155, + "grad_norm": NaN, + "learning_rate": 0.00015010589297333236, + "loss": 0.0, + "step": 33137 + }, + { + "epoch": 3.092096668843893, + "grad_norm": NaN, + "learning_rate": 0.000150098329190648, + "loss": 0.0, + "step": 33138 + }, + { + "epoch": 3.0921899785387703, + "grad_norm": NaN, + "learning_rate": 0.00015009076540771363, + "loss": 0.0, + "step": 33139 + }, + { + "epoch": 3.0922832882336473, + "grad_norm": NaN, + "learning_rate": 0.00015008320162454846, + "loss": 0.0, + "step": 33140 + }, + { + "epoch": 3.0923765979285247, + "grad_norm": NaN, + "learning_rate": 0.00015007563784117173, + "loss": 0.0, + "step": 33141 + }, + { + "epoch": 3.092469907623402, + "grad_norm": NaN, + "learning_rate": 0.0001500680740576027, + "loss": 0.0, + "step": 33142 + }, + { + "epoch": 3.0925632173182795, + "grad_norm": NaN, + "learning_rate": 0.00015006051027386056, + "loss": 0.0, + "step": 33143 + }, + { + "epoch": 3.0926565270131565, + "grad_norm": NaN, + "learning_rate": 0.00015005294648996452, + "loss": 0.0, + "step": 33144 + }, + { + "epoch": 3.092749836708034, + "grad_norm": NaN, + "learning_rate": 0.00015004538270593394, + "loss": 0.0, + "step": 33145 + }, + { + "epoch": 3.0928431464029114, + "grad_norm": NaN, + "learning_rate": 0.00015003781892178789, + "loss": 0.0, + "step": 33146 + }, + { + "epoch": 3.0929364560977888, + "grad_norm": NaN, + "learning_rate": 0.00015003025513754572, + "loss": 0.0, + "step": 33147 + }, + { + "epoch": 3.0930297657926658, + "grad_norm": NaN, + "learning_rate": 0.0001500226913532266, + "loss": 0.0, + "step": 33148 + }, + { + "epoch": 3.093123075487543, + "grad_norm": NaN, + "learning_rate": 0.00015001512756884978, + "loss": 0.0, + "step": 33149 + }, + { + "epoch": 3.0932163851824206, + "grad_norm": NaN, + "learning_rate": 0.0001500075637844345, + "loss": 0.0, + "step": 33150 + }, + { + "epoch": 3.0933096948772976, + "grad_norm": NaN, + "learning_rate": 0.00015, + "loss": 0.0, + "step": 33151 + }, + { + "epoch": 3.093403004572175, + "grad_norm": NaN, + "learning_rate": 0.00014999243621556546, + "loss": 0.0, + "step": 33152 + }, + { + "epoch": 3.0934963142670524, + "grad_norm": NaN, + "learning_rate": 0.0001499848724311502, + "loss": 0.0, + "step": 33153 + }, + { + "epoch": 3.09358962396193, + "grad_norm": NaN, + "learning_rate": 0.0001499773086467734, + "loss": 0.0, + "step": 33154 + }, + { + "epoch": 3.093682933656807, + "grad_norm": NaN, + "learning_rate": 0.00014996974486245425, + "loss": 0.0, + "step": 33155 + }, + { + "epoch": 3.093776243351684, + "grad_norm": NaN, + "learning_rate": 0.0001499621810782121, + "loss": 0.0, + "step": 33156 + }, + { + "epoch": 3.0938695530465616, + "grad_norm": NaN, + "learning_rate": 0.0001499546172940661, + "loss": 0.0, + "step": 33157 + }, + { + "epoch": 3.0939628627414386, + "grad_norm": NaN, + "learning_rate": 0.00014994705351003543, + "loss": 0.0, + "step": 33158 + }, + { + "epoch": 3.094056172436316, + "grad_norm": NaN, + "learning_rate": 0.00014993948972613947, + "loss": 0.0, + "step": 33159 + }, + { + "epoch": 3.0941494821311935, + "grad_norm": NaN, + "learning_rate": 0.0001499319259423973, + "loss": 0.0, + "step": 33160 + }, + { + "epoch": 3.094242791826071, + "grad_norm": NaN, + "learning_rate": 0.00014992436215882822, + "loss": 0.0, + "step": 33161 + }, + { + "epoch": 3.094336101520948, + "grad_norm": NaN, + "learning_rate": 0.00014991679837545154, + "loss": 0.0, + "step": 33162 + }, + { + "epoch": 3.0944294112158253, + "grad_norm": NaN, + "learning_rate": 0.00014990923459228637, + "loss": 0.0, + "step": 33163 + }, + { + "epoch": 3.0945227209107027, + "grad_norm": NaN, + "learning_rate": 0.00014990167080935197, + "loss": 0.0, + "step": 33164 + }, + { + "epoch": 3.09461603060558, + "grad_norm": NaN, + "learning_rate": 0.00014989410702666764, + "loss": 0.0, + "step": 33165 + }, + { + "epoch": 3.094709340300457, + "grad_norm": NaN, + "learning_rate": 0.00014988654324425254, + "loss": 0.0, + "step": 33166 + }, + { + "epoch": 3.0948026499953345, + "grad_norm": NaN, + "learning_rate": 0.0001498789794621259, + "loss": 0.0, + "step": 33167 + }, + { + "epoch": 3.094895959690212, + "grad_norm": NaN, + "learning_rate": 0.00014987141568030703, + "loss": 0.0, + "step": 33168 + }, + { + "epoch": 3.0949892693850893, + "grad_norm": NaN, + "learning_rate": 0.0001498638518988151, + "loss": 0.0, + "step": 33169 + }, + { + "epoch": 3.0950825790799663, + "grad_norm": NaN, + "learning_rate": 0.00014985628811766929, + "loss": 0.0, + "step": 33170 + }, + { + "epoch": 3.0951758887748437, + "grad_norm": NaN, + "learning_rate": 0.000149848724336889, + "loss": 0.0, + "step": 33171 + }, + { + "epoch": 3.095269198469721, + "grad_norm": NaN, + "learning_rate": 0.00014984116055649332, + "loss": 0.0, + "step": 33172 + }, + { + "epoch": 3.095362508164598, + "grad_norm": NaN, + "learning_rate": 0.00014983359677650147, + "loss": 0.0, + "step": 33173 + }, + { + "epoch": 3.0954558178594755, + "grad_norm": NaN, + "learning_rate": 0.0001498260329969328, + "loss": 0.0, + "step": 33174 + }, + { + "epoch": 3.095549127554353, + "grad_norm": NaN, + "learning_rate": 0.00014981846921780645, + "loss": 0.0, + "step": 33175 + }, + { + "epoch": 3.0956424372492304, + "grad_norm": NaN, + "learning_rate": 0.00014981090543914172, + "loss": 0.0, + "step": 33176 + }, + { + "epoch": 3.0957357469441074, + "grad_norm": NaN, + "learning_rate": 0.00014980334166095772, + "loss": 0.0, + "step": 33177 + }, + { + "epoch": 3.0958290566389848, + "grad_norm": NaN, + "learning_rate": 0.0001497957778832738, + "loss": 0.0, + "step": 33178 + }, + { + "epoch": 3.095922366333862, + "grad_norm": NaN, + "learning_rate": 0.0001497882141061092, + "loss": 0.0, + "step": 33179 + }, + { + "epoch": 3.096015676028739, + "grad_norm": NaN, + "learning_rate": 0.00014978065032948305, + "loss": 0.0, + "step": 33180 + }, + { + "epoch": 3.0961089857236166, + "grad_norm": NaN, + "learning_rate": 0.00014977308655341465, + "loss": 0.0, + "step": 33181 + }, + { + "epoch": 3.096202295418494, + "grad_norm": NaN, + "learning_rate": 0.00014976552277792327, + "loss": 0.0, + "step": 33182 + }, + { + "epoch": 3.0962956051133714, + "grad_norm": NaN, + "learning_rate": 0.00014975795900302801, + "loss": 0.0, + "step": 33183 + }, + { + "epoch": 3.0963889148082484, + "grad_norm": NaN, + "learning_rate": 0.00014975039522874826, + "loss": 0.0, + "step": 33184 + }, + { + "epoch": 3.096482224503126, + "grad_norm": NaN, + "learning_rate": 0.0001497428314551032, + "loss": 0.0, + "step": 33185 + }, + { + "epoch": 3.0965755341980032, + "grad_norm": NaN, + "learning_rate": 0.00014973526768211195, + "loss": 0.0, + "step": 33186 + }, + { + "epoch": 3.0966688438928807, + "grad_norm": NaN, + "learning_rate": 0.00014972770390979387, + "loss": 0.0, + "step": 33187 + }, + { + "epoch": 3.0967621535877576, + "grad_norm": NaN, + "learning_rate": 0.00014972014013816824, + "loss": 0.0, + "step": 33188 + }, + { + "epoch": 3.096855463282635, + "grad_norm": NaN, + "learning_rate": 0.00014971257636725411, + "loss": 0.0, + "step": 33189 + }, + { + "epoch": 3.0969487729775125, + "grad_norm": NaN, + "learning_rate": 0.00014970501259707082, + "loss": 0.0, + "step": 33190 + }, + { + "epoch": 3.0970420826723895, + "grad_norm": NaN, + "learning_rate": 0.00014969744882763767, + "loss": 0.0, + "step": 33191 + }, + { + "epoch": 3.097135392367267, + "grad_norm": NaN, + "learning_rate": 0.00014968988505897378, + "loss": 0.0, + "step": 33192 + }, + { + "epoch": 3.0972287020621443, + "grad_norm": NaN, + "learning_rate": 0.00014968232129109838, + "loss": 0.0, + "step": 33193 + }, + { + "epoch": 3.0973220117570217, + "grad_norm": NaN, + "learning_rate": 0.0001496747575240308, + "loss": 0.0, + "step": 33194 + }, + { + "epoch": 3.0974153214518987, + "grad_norm": NaN, + "learning_rate": 0.0001496671937577902, + "loss": 0.0, + "step": 33195 + }, + { + "epoch": 3.097508631146776, + "grad_norm": NaN, + "learning_rate": 0.00014965962999239577, + "loss": 0.0, + "step": 33196 + }, + { + "epoch": 3.0976019408416535, + "grad_norm": NaN, + "learning_rate": 0.0001496520662278669, + "loss": 0.0, + "step": 33197 + }, + { + "epoch": 3.097695250536531, + "grad_norm": NaN, + "learning_rate": 0.00014964450246422267, + "loss": 0.0, + "step": 33198 + }, + { + "epoch": 3.097788560231408, + "grad_norm": NaN, + "learning_rate": 0.00014963693870148235, + "loss": 0.0, + "step": 33199 + }, + { + "epoch": 3.0978818699262853, + "grad_norm": NaN, + "learning_rate": 0.00014962937493966523, + "loss": 0.0, + "step": 33200 + }, + { + "epoch": 3.0979751796211628, + "grad_norm": NaN, + "learning_rate": 0.00014962181117879046, + "loss": 0.0, + "step": 33201 + }, + { + "epoch": 3.0980684893160397, + "grad_norm": NaN, + "learning_rate": 0.0001496142474188773, + "loss": 0.0, + "step": 33202 + }, + { + "epoch": 3.098161799010917, + "grad_norm": NaN, + "learning_rate": 0.00014960668365994508, + "loss": 0.0, + "step": 33203 + }, + { + "epoch": 3.0982551087057946, + "grad_norm": NaN, + "learning_rate": 0.00014959911990201287, + "loss": 0.0, + "step": 33204 + }, + { + "epoch": 3.098348418400672, + "grad_norm": NaN, + "learning_rate": 0.00014959155614509997, + "loss": 0.0, + "step": 33205 + }, + { + "epoch": 3.098441728095549, + "grad_norm": NaN, + "learning_rate": 0.0001495839923892257, + "loss": 0.0, + "step": 33206 + }, + { + "epoch": 3.0985350377904264, + "grad_norm": NaN, + "learning_rate": 0.00014957642863440917, + "loss": 0.0, + "step": 33207 + }, + { + "epoch": 3.098628347485304, + "grad_norm": NaN, + "learning_rate": 0.0001495688648806696, + "loss": 0.0, + "step": 33208 + }, + { + "epoch": 3.098721657180181, + "grad_norm": NaN, + "learning_rate": 0.00014956130112802636, + "loss": 0.0, + "step": 33209 + }, + { + "epoch": 3.098814966875058, + "grad_norm": NaN, + "learning_rate": 0.0001495537373764986, + "loss": 0.0, + "step": 33210 + }, + { + "epoch": 3.0989082765699356, + "grad_norm": NaN, + "learning_rate": 0.00014954617362610546, + "loss": 0.0, + "step": 33211 + }, + { + "epoch": 3.099001586264813, + "grad_norm": NaN, + "learning_rate": 0.00014953860987686638, + "loss": 0.0, + "step": 33212 + }, + { + "epoch": 3.09909489595969, + "grad_norm": NaN, + "learning_rate": 0.00014953104612880043, + "loss": 0.0, + "step": 33213 + }, + { + "epoch": 3.0991882056545674, + "grad_norm": NaN, + "learning_rate": 0.00014952348238192687, + "loss": 0.0, + "step": 33214 + }, + { + "epoch": 3.099281515349445, + "grad_norm": NaN, + "learning_rate": 0.000149515918636265, + "loss": 0.0, + "step": 33215 + }, + { + "epoch": 3.0993748250443223, + "grad_norm": NaN, + "learning_rate": 0.00014950835489183398, + "loss": 0.0, + "step": 33216 + }, + { + "epoch": 3.0994681347391992, + "grad_norm": NaN, + "learning_rate": 0.00014950079114865302, + "loss": 0.0, + "step": 33217 + }, + { + "epoch": 3.0995614444340767, + "grad_norm": NaN, + "learning_rate": 0.0001494932274067415, + "loss": 0.0, + "step": 33218 + }, + { + "epoch": 3.099654754128954, + "grad_norm": NaN, + "learning_rate": 0.0001494856636661185, + "loss": 0.0, + "step": 33219 + }, + { + "epoch": 3.0997480638238315, + "grad_norm": NaN, + "learning_rate": 0.00014947809992680333, + "loss": 0.0, + "step": 33220 + }, + { + "epoch": 3.0998413735187085, + "grad_norm": NaN, + "learning_rate": 0.00014947053618881515, + "loss": 0.0, + "step": 33221 + }, + { + "epoch": 3.099934683213586, + "grad_norm": NaN, + "learning_rate": 0.00014946297245217326, + "loss": 0.0, + "step": 33222 + }, + { + "epoch": 3.1000279929084633, + "grad_norm": NaN, + "learning_rate": 0.0001494554087168969, + "loss": 0.0, + "step": 33223 + }, + { + "epoch": 3.1001213026033403, + "grad_norm": NaN, + "learning_rate": 0.0001494478449830052, + "loss": 0.0, + "step": 33224 + }, + { + "epoch": 3.1002146122982177, + "grad_norm": NaN, + "learning_rate": 0.00014944028125051748, + "loss": 0.0, + "step": 33225 + }, + { + "epoch": 3.100307921993095, + "grad_norm": NaN, + "learning_rate": 0.00014943271751945308, + "loss": 0.0, + "step": 33226 + }, + { + "epoch": 3.1004012316879725, + "grad_norm": NaN, + "learning_rate": 0.00014942515378983098, + "loss": 0.0, + "step": 33227 + }, + { + "epoch": 3.1004945413828495, + "grad_norm": NaN, + "learning_rate": 0.00014941759006167058, + "loss": 0.0, + "step": 33228 + }, + { + "epoch": 3.100587851077727, + "grad_norm": NaN, + "learning_rate": 0.0001494100263349911, + "loss": 0.0, + "step": 33229 + }, + { + "epoch": 3.1006811607726044, + "grad_norm": NaN, + "learning_rate": 0.00014940246260981174, + "loss": 0.0, + "step": 33230 + }, + { + "epoch": 3.100774470467482, + "grad_norm": NaN, + "learning_rate": 0.0001493948988861517, + "loss": 0.0, + "step": 33231 + }, + { + "epoch": 3.1008677801623588, + "grad_norm": NaN, + "learning_rate": 0.0001493873351640303, + "loss": 0.0, + "step": 33232 + }, + { + "epoch": 3.100961089857236, + "grad_norm": NaN, + "learning_rate": 0.00014937977144346672, + "loss": 0.0, + "step": 33233 + }, + { + "epoch": 3.1010543995521136, + "grad_norm": NaN, + "learning_rate": 0.00014937220772448016, + "loss": 0.0, + "step": 33234 + }, + { + "epoch": 3.1011477092469906, + "grad_norm": NaN, + "learning_rate": 0.00014936464400708995, + "loss": 0.0, + "step": 33235 + }, + { + "epoch": 3.101241018941868, + "grad_norm": NaN, + "learning_rate": 0.00014935708029131524, + "loss": 0.0, + "step": 33236 + }, + { + "epoch": 3.1013343286367454, + "grad_norm": NaN, + "learning_rate": 0.00014934951657717525, + "loss": 0.0, + "step": 33237 + }, + { + "epoch": 3.101427638331623, + "grad_norm": NaN, + "learning_rate": 0.0001493419528646893, + "loss": 0.0, + "step": 33238 + }, + { + "epoch": 3.1015209480265, + "grad_norm": NaN, + "learning_rate": 0.00014933438915387654, + "loss": 0.0, + "step": 33239 + }, + { + "epoch": 3.101614257721377, + "grad_norm": NaN, + "learning_rate": 0.0001493268254447562, + "loss": 0.0, + "step": 33240 + }, + { + "epoch": 3.1017075674162546, + "grad_norm": NaN, + "learning_rate": 0.00014931926173734761, + "loss": 0.0, + "step": 33241 + }, + { + "epoch": 3.101800877111132, + "grad_norm": NaN, + "learning_rate": 0.0001493116980316699, + "loss": 0.0, + "step": 33242 + }, + { + "epoch": 3.101894186806009, + "grad_norm": NaN, + "learning_rate": 0.0001493041343277423, + "loss": 0.0, + "step": 33243 + }, + { + "epoch": 3.1019874965008865, + "grad_norm": NaN, + "learning_rate": 0.00014929657062558415, + "loss": 0.0, + "step": 33244 + }, + { + "epoch": 3.102080806195764, + "grad_norm": NaN, + "learning_rate": 0.0001492890069252146, + "loss": 0.0, + "step": 33245 + }, + { + "epoch": 3.102174115890641, + "grad_norm": NaN, + "learning_rate": 0.00014928144322665283, + "loss": 0.0, + "step": 33246 + }, + { + "epoch": 3.1022674255855183, + "grad_norm": NaN, + "learning_rate": 0.00014927387952991822, + "loss": 0.0, + "step": 33247 + }, + { + "epoch": 3.1023607352803957, + "grad_norm": NaN, + "learning_rate": 0.0001492663158350299, + "loss": 0.0, + "step": 33248 + }, + { + "epoch": 3.102454044975273, + "grad_norm": NaN, + "learning_rate": 0.00014925875214200706, + "loss": 0.0, + "step": 33249 + }, + { + "epoch": 3.10254735467015, + "grad_norm": NaN, + "learning_rate": 0.00014925118845086907, + "loss": 0.0, + "step": 33250 + }, + { + "epoch": 3.1026406643650275, + "grad_norm": NaN, + "learning_rate": 0.00014924362476163505, + "loss": 0.0, + "step": 33251 + }, + { + "epoch": 3.102733974059905, + "grad_norm": NaN, + "learning_rate": 0.00014923606107432425, + "loss": 0.0, + "step": 33252 + }, + { + "epoch": 3.102827283754782, + "grad_norm": NaN, + "learning_rate": 0.00014922849738895595, + "loss": 0.0, + "step": 33253 + }, + { + "epoch": 3.1029205934496593, + "grad_norm": NaN, + "learning_rate": 0.00014922093370554936, + "loss": 0.0, + "step": 33254 + }, + { + "epoch": 3.1030139031445367, + "grad_norm": NaN, + "learning_rate": 0.00014921337002412366, + "loss": 0.0, + "step": 33255 + }, + { + "epoch": 3.103107212839414, + "grad_norm": NaN, + "learning_rate": 0.0001492058063446982, + "loss": 0.0, + "step": 33256 + }, + { + "epoch": 3.103200522534291, + "grad_norm": NaN, + "learning_rate": 0.0001491982426672921, + "loss": 0.0, + "step": 33257 + }, + { + "epoch": 3.1032938322291685, + "grad_norm": NaN, + "learning_rate": 0.0001491906789919246, + "loss": 0.0, + "step": 33258 + }, + { + "epoch": 3.103387141924046, + "grad_norm": NaN, + "learning_rate": 0.00014918311531861503, + "loss": 0.0, + "step": 33259 + }, + { + "epoch": 3.1034804516189234, + "grad_norm": NaN, + "learning_rate": 0.0001491755516473825, + "loss": 0.0, + "step": 33260 + }, + { + "epoch": 3.1035737613138004, + "grad_norm": NaN, + "learning_rate": 0.0001491679879782463, + "loss": 0.0, + "step": 33261 + }, + { + "epoch": 3.103667071008678, + "grad_norm": NaN, + "learning_rate": 0.00014916042431122568, + "loss": 0.0, + "step": 33262 + }, + { + "epoch": 3.103760380703555, + "grad_norm": NaN, + "learning_rate": 0.0001491528606463398, + "loss": 0.0, + "step": 33263 + }, + { + "epoch": 3.1038536903984326, + "grad_norm": NaN, + "learning_rate": 0.00014914529698360808, + "loss": 0.0, + "step": 33264 + }, + { + "epoch": 3.1039470000933096, + "grad_norm": NaN, + "learning_rate": 0.0001491377333230495, + "loss": 0.0, + "step": 33265 + }, + { + "epoch": 3.104040309788187, + "grad_norm": NaN, + "learning_rate": 0.0001491301696646834, + "loss": 0.0, + "step": 33266 + }, + { + "epoch": 3.1041336194830644, + "grad_norm": NaN, + "learning_rate": 0.0001491226060085291, + "loss": 0.0, + "step": 33267 + }, + { + "epoch": 3.1042269291779414, + "grad_norm": NaN, + "learning_rate": 0.0001491150423546057, + "loss": 0.0, + "step": 33268 + }, + { + "epoch": 3.104320238872819, + "grad_norm": NaN, + "learning_rate": 0.00014910747870293249, + "loss": 0.0, + "step": 33269 + }, + { + "epoch": 3.1044135485676962, + "grad_norm": NaN, + "learning_rate": 0.00014909991505352872, + "loss": 0.0, + "step": 33270 + }, + { + "epoch": 3.1045068582625737, + "grad_norm": NaN, + "learning_rate": 0.00014909235140641357, + "loss": 0.0, + "step": 33271 + }, + { + "epoch": 3.1046001679574506, + "grad_norm": NaN, + "learning_rate": 0.00014908478776160628, + "loss": 0.0, + "step": 33272 + }, + { + "epoch": 3.104693477652328, + "grad_norm": NaN, + "learning_rate": 0.00014907722411912618, + "loss": 0.0, + "step": 33273 + }, + { + "epoch": 3.1047867873472055, + "grad_norm": NaN, + "learning_rate": 0.0001490696604789924, + "loss": 0.0, + "step": 33274 + }, + { + "epoch": 3.1048800970420825, + "grad_norm": NaN, + "learning_rate": 0.00014906209684122413, + "loss": 0.0, + "step": 33275 + }, + { + "epoch": 3.10497340673696, + "grad_norm": NaN, + "learning_rate": 0.00014905453320584075, + "loss": 0.0, + "step": 33276 + }, + { + "epoch": 3.1050667164318373, + "grad_norm": NaN, + "learning_rate": 0.0001490469695728614, + "loss": 0.0, + "step": 33277 + }, + { + "epoch": 3.1051600261267147, + "grad_norm": NaN, + "learning_rate": 0.00014903940594230528, + "loss": 0.0, + "step": 33278 + }, + { + "epoch": 3.1052533358215917, + "grad_norm": NaN, + "learning_rate": 0.00014903184231419172, + "loss": 0.0, + "step": 33279 + }, + { + "epoch": 3.105346645516469, + "grad_norm": NaN, + "learning_rate": 0.00014902427868853987, + "loss": 0.0, + "step": 33280 + }, + { + "epoch": 3.1054399552113465, + "grad_norm": NaN, + "learning_rate": 0.00014901671506536898, + "loss": 0.0, + "step": 33281 + }, + { + "epoch": 3.105533264906224, + "grad_norm": NaN, + "learning_rate": 0.00014900915144469833, + "loss": 0.0, + "step": 33282 + }, + { + "epoch": 3.105626574601101, + "grad_norm": NaN, + "learning_rate": 0.0001490015878265471, + "loss": 0.0, + "step": 33283 + }, + { + "epoch": 3.1057198842959783, + "grad_norm": NaN, + "learning_rate": 0.00014899402421093448, + "loss": 0.0, + "step": 33284 + }, + { + "epoch": 3.1058131939908558, + "grad_norm": NaN, + "learning_rate": 0.00014898646059787984, + "loss": 0.0, + "step": 33285 + }, + { + "epoch": 3.1059065036857327, + "grad_norm": NaN, + "learning_rate": 0.0001489788969874023, + "loss": 0.0, + "step": 33286 + }, + { + "epoch": 3.10599981338061, + "grad_norm": NaN, + "learning_rate": 0.00014897133337952109, + "loss": 0.0, + "step": 33287 + }, + { + "epoch": 3.1060931230754876, + "grad_norm": NaN, + "learning_rate": 0.00014896376977425552, + "loss": 0.0, + "step": 33288 + }, + { + "epoch": 3.106186432770365, + "grad_norm": NaN, + "learning_rate": 0.00014895620617162475, + "loss": 0.0, + "step": 33289 + }, + { + "epoch": 3.106279742465242, + "grad_norm": NaN, + "learning_rate": 0.000148948642571648, + "loss": 0.0, + "step": 33290 + }, + { + "epoch": 3.1063730521601194, + "grad_norm": NaN, + "learning_rate": 0.00014894107897434465, + "loss": 0.0, + "step": 33291 + }, + { + "epoch": 3.106466361854997, + "grad_norm": NaN, + "learning_rate": 0.00014893351537973375, + "loss": 0.0, + "step": 33292 + }, + { + "epoch": 3.1065596715498742, + "grad_norm": NaN, + "learning_rate": 0.00014892595178783458, + "loss": 0.0, + "step": 33293 + }, + { + "epoch": 3.106652981244751, + "grad_norm": NaN, + "learning_rate": 0.00014891838819866646, + "loss": 0.0, + "step": 33294 + }, + { + "epoch": 3.1067462909396286, + "grad_norm": NaN, + "learning_rate": 0.00014891082461224853, + "loss": 0.0, + "step": 33295 + }, + { + "epoch": 3.106839600634506, + "grad_norm": NaN, + "learning_rate": 0.0001489032610286, + "loss": 0.0, + "step": 33296 + }, + { + "epoch": 3.106932910329383, + "grad_norm": NaN, + "learning_rate": 0.00014889569744774026, + "loss": 0.0, + "step": 33297 + }, + { + "epoch": 3.1070262200242604, + "grad_norm": NaN, + "learning_rate": 0.00014888813386968835, + "loss": 0.0, + "step": 33298 + }, + { + "epoch": 3.107119529719138, + "grad_norm": NaN, + "learning_rate": 0.0001488805702944636, + "loss": 0.0, + "step": 33299 + }, + { + "epoch": 3.1072128394140153, + "grad_norm": NaN, + "learning_rate": 0.00014887300672208526, + "loss": 0.0, + "step": 33300 + }, + { + "epoch": 3.1073061491088922, + "grad_norm": NaN, + "learning_rate": 0.00014886544315257248, + "loss": 0.0, + "step": 33301 + }, + { + "epoch": 3.1073994588037697, + "grad_norm": NaN, + "learning_rate": 0.0001488578795859446, + "loss": 0.0, + "step": 33302 + }, + { + "epoch": 3.107492768498647, + "grad_norm": NaN, + "learning_rate": 0.0001488503160222208, + "loss": 0.0, + "step": 33303 + }, + { + "epoch": 3.1075860781935245, + "grad_norm": NaN, + "learning_rate": 0.0001488427524614202, + "loss": 0.0, + "step": 33304 + }, + { + "epoch": 3.1076793878884015, + "grad_norm": NaN, + "learning_rate": 0.00014883518890356222, + "loss": 0.0, + "step": 33305 + }, + { + "epoch": 3.107772697583279, + "grad_norm": NaN, + "learning_rate": 0.00014882762534866606, + "loss": 0.0, + "step": 33306 + }, + { + "epoch": 3.1078660072781563, + "grad_norm": NaN, + "learning_rate": 0.0001488200617967508, + "loss": 0.0, + "step": 33307 + }, + { + "epoch": 3.1079593169730333, + "grad_norm": NaN, + "learning_rate": 0.0001488124982478358, + "loss": 0.0, + "step": 33308 + }, + { + "epoch": 3.1080526266679107, + "grad_norm": NaN, + "learning_rate": 0.00014880493470194033, + "loss": 0.0, + "step": 33309 + }, + { + "epoch": 3.108145936362788, + "grad_norm": NaN, + "learning_rate": 0.00014879737115908347, + "loss": 0.0, + "step": 33310 + }, + { + "epoch": 3.1082392460576656, + "grad_norm": NaN, + "learning_rate": 0.0001487898076192846, + "loss": 0.0, + "step": 33311 + }, + { + "epoch": 3.1083325557525425, + "grad_norm": NaN, + "learning_rate": 0.00014878224408256286, + "loss": 0.0, + "step": 33312 + }, + { + "epoch": 3.10842586544742, + "grad_norm": NaN, + "learning_rate": 0.0001487746805489375, + "loss": 0.0, + "step": 33313 + }, + { + "epoch": 3.1085191751422974, + "grad_norm": NaN, + "learning_rate": 0.0001487671170184278, + "loss": 0.0, + "step": 33314 + }, + { + "epoch": 3.108612484837175, + "grad_norm": NaN, + "learning_rate": 0.00014875955349105295, + "loss": 0.0, + "step": 33315 + }, + { + "epoch": 3.1087057945320518, + "grad_norm": NaN, + "learning_rate": 0.00014875198996683214, + "loss": 0.0, + "step": 33316 + }, + { + "epoch": 3.108799104226929, + "grad_norm": NaN, + "learning_rate": 0.00014874442644578472, + "loss": 0.0, + "step": 33317 + }, + { + "epoch": 3.1088924139218066, + "grad_norm": NaN, + "learning_rate": 0.00014873686292792983, + "loss": 0.0, + "step": 33318 + }, + { + "epoch": 3.1089857236166836, + "grad_norm": NaN, + "learning_rate": 0.00014872929941328668, + "loss": 0.0, + "step": 33319 + }, + { + "epoch": 3.109079033311561, + "grad_norm": NaN, + "learning_rate": 0.0001487217359018746, + "loss": 0.0, + "step": 33320 + }, + { + "epoch": 3.1091723430064384, + "grad_norm": NaN, + "learning_rate": 0.00014871417239371274, + "loss": 0.0, + "step": 33321 + }, + { + "epoch": 3.109265652701316, + "grad_norm": NaN, + "learning_rate": 0.00014870660888882033, + "loss": 0.0, + "step": 33322 + }, + { + "epoch": 3.109358962396193, + "grad_norm": NaN, + "learning_rate": 0.0001486990453872167, + "loss": 0.0, + "step": 33323 + }, + { + "epoch": 3.1094522720910702, + "grad_norm": NaN, + "learning_rate": 0.00014869148188892097, + "loss": 0.0, + "step": 33324 + }, + { + "epoch": 3.1095455817859476, + "grad_norm": NaN, + "learning_rate": 0.0001486839183939524, + "loss": 0.0, + "step": 33325 + }, + { + "epoch": 3.109638891480825, + "grad_norm": NaN, + "learning_rate": 0.00014867635490233027, + "loss": 0.0, + "step": 33326 + }, + { + "epoch": 3.109732201175702, + "grad_norm": NaN, + "learning_rate": 0.00014866879141407378, + "loss": 0.0, + "step": 33327 + }, + { + "epoch": 3.1098255108705795, + "grad_norm": NaN, + "learning_rate": 0.00014866122792920212, + "loss": 0.0, + "step": 33328 + }, + { + "epoch": 3.109918820565457, + "grad_norm": NaN, + "learning_rate": 0.0001486536644477346, + "loss": 0.0, + "step": 33329 + }, + { + "epoch": 3.110012130260334, + "grad_norm": NaN, + "learning_rate": 0.0001486461009696904, + "loss": 0.0, + "step": 33330 + }, + { + "epoch": 3.1101054399552113, + "grad_norm": NaN, + "learning_rate": 0.00014863853749508873, + "loss": 0.0, + "step": 33331 + }, + { + "epoch": 3.1101987496500887, + "grad_norm": NaN, + "learning_rate": 0.0001486309740239489, + "loss": 0.0, + "step": 33332 + }, + { + "epoch": 3.110292059344966, + "grad_norm": NaN, + "learning_rate": 0.0001486234105562901, + "loss": 0.0, + "step": 33333 + }, + { + "epoch": 3.110385369039843, + "grad_norm": NaN, + "learning_rate": 0.0001486158470921315, + "loss": 0.0, + "step": 33334 + }, + { + "epoch": 3.1104786787347205, + "grad_norm": NaN, + "learning_rate": 0.00014860828363149248, + "loss": 0.0, + "step": 33335 + }, + { + "epoch": 3.110571988429598, + "grad_norm": NaN, + "learning_rate": 0.0001486007201743921, + "loss": 0.0, + "step": 33336 + }, + { + "epoch": 3.1106652981244753, + "grad_norm": NaN, + "learning_rate": 0.00014859315672084968, + "loss": 0.0, + "step": 33337 + }, + { + "epoch": 3.1107586078193523, + "grad_norm": NaN, + "learning_rate": 0.00014858559327088453, + "loss": 0.0, + "step": 33338 + }, + { + "epoch": 3.1108519175142297, + "grad_norm": NaN, + "learning_rate": 0.0001485780298245157, + "loss": 0.0, + "step": 33339 + }, + { + "epoch": 3.110945227209107, + "grad_norm": NaN, + "learning_rate": 0.00014857046638176258, + "loss": 0.0, + "step": 33340 + }, + { + "epoch": 3.111038536903984, + "grad_norm": NaN, + "learning_rate": 0.00014856290294264435, + "loss": 0.0, + "step": 33341 + }, + { + "epoch": 3.1111318465988616, + "grad_norm": NaN, + "learning_rate": 0.00014855533950718016, + "loss": 0.0, + "step": 33342 + }, + { + "epoch": 3.111225156293739, + "grad_norm": NaN, + "learning_rate": 0.00014854777607538935, + "loss": 0.0, + "step": 33343 + }, + { + "epoch": 3.1113184659886164, + "grad_norm": NaN, + "learning_rate": 0.00014854021264729114, + "loss": 0.0, + "step": 33344 + }, + { + "epoch": 3.1114117756834934, + "grad_norm": NaN, + "learning_rate": 0.0001485326492229047, + "loss": 0.0, + "step": 33345 + }, + { + "epoch": 3.111505085378371, + "grad_norm": NaN, + "learning_rate": 0.00014852508580224933, + "loss": 0.0, + "step": 33346 + }, + { + "epoch": 3.111598395073248, + "grad_norm": NaN, + "learning_rate": 0.00014851752238534424, + "loss": 0.0, + "step": 33347 + }, + { + "epoch": 3.1116917047681256, + "grad_norm": NaN, + "learning_rate": 0.0001485099589722086, + "loss": 0.0, + "step": 33348 + }, + { + "epoch": 3.1117850144630026, + "grad_norm": NaN, + "learning_rate": 0.00014850239556286174, + "loss": 0.0, + "step": 33349 + }, + { + "epoch": 3.11187832415788, + "grad_norm": NaN, + "learning_rate": 0.00014849483215732286, + "loss": 0.0, + "step": 33350 + }, + { + "epoch": 3.1119716338527574, + "grad_norm": NaN, + "learning_rate": 0.0001484872687556111, + "loss": 0.0, + "step": 33351 + }, + { + "epoch": 3.1120649435476344, + "grad_norm": NaN, + "learning_rate": 0.00014847970535774582, + "loss": 0.0, + "step": 33352 + }, + { + "epoch": 3.112158253242512, + "grad_norm": NaN, + "learning_rate": 0.0001484721419637462, + "loss": 0.0, + "step": 33353 + }, + { + "epoch": 3.1122515629373892, + "grad_norm": NaN, + "learning_rate": 0.00014846457857363145, + "loss": 0.0, + "step": 33354 + }, + { + "epoch": 3.1123448726322667, + "grad_norm": NaN, + "learning_rate": 0.00014845701518742085, + "loss": 0.0, + "step": 33355 + }, + { + "epoch": 3.1124381823271436, + "grad_norm": NaN, + "learning_rate": 0.0001484494518051336, + "loss": 0.0, + "step": 33356 + }, + { + "epoch": 3.112531492022021, + "grad_norm": NaN, + "learning_rate": 0.00014844188842678887, + "loss": 0.0, + "step": 33357 + }, + { + "epoch": 3.1126248017168985, + "grad_norm": NaN, + "learning_rate": 0.00014843432505240605, + "loss": 0.0, + "step": 33358 + }, + { + "epoch": 3.112718111411776, + "grad_norm": NaN, + "learning_rate": 0.00014842676168200423, + "loss": 0.0, + "step": 33359 + }, + { + "epoch": 3.112811421106653, + "grad_norm": NaN, + "learning_rate": 0.00014841919831560263, + "loss": 0.0, + "step": 33360 + }, + { + "epoch": 3.1129047308015303, + "grad_norm": NaN, + "learning_rate": 0.00014841163495322066, + "loss": 0.0, + "step": 33361 + }, + { + "epoch": 3.1129980404964077, + "grad_norm": NaN, + "learning_rate": 0.00014840407159487736, + "loss": 0.0, + "step": 33362 + }, + { + "epoch": 3.1130913501912847, + "grad_norm": NaN, + "learning_rate": 0.00014839650824059202, + "loss": 0.0, + "step": 33363 + }, + { + "epoch": 3.113184659886162, + "grad_norm": NaN, + "learning_rate": 0.00014838894489038396, + "loss": 0.0, + "step": 33364 + }, + { + "epoch": 3.1132779695810395, + "grad_norm": NaN, + "learning_rate": 0.00014838138154427228, + "loss": 0.0, + "step": 33365 + }, + { + "epoch": 3.113371279275917, + "grad_norm": NaN, + "learning_rate": 0.00014837381820227623, + "loss": 0.0, + "step": 33366 + }, + { + "epoch": 3.113464588970794, + "grad_norm": NaN, + "learning_rate": 0.00014836625486441517, + "loss": 0.0, + "step": 33367 + }, + { + "epoch": 3.1135578986656713, + "grad_norm": NaN, + "learning_rate": 0.00014835869153070818, + "loss": 0.0, + "step": 33368 + }, + { + "epoch": 3.1136512083605488, + "grad_norm": NaN, + "learning_rate": 0.00014835112820117455, + "loss": 0.0, + "step": 33369 + }, + { + "epoch": 3.1137445180554257, + "grad_norm": NaN, + "learning_rate": 0.00014834356487583355, + "loss": 0.0, + "step": 33370 + }, + { + "epoch": 3.113837827750303, + "grad_norm": NaN, + "learning_rate": 0.00014833600155470436, + "loss": 0.0, + "step": 33371 + }, + { + "epoch": 3.1139311374451806, + "grad_norm": NaN, + "learning_rate": 0.00014832843823780619, + "loss": 0.0, + "step": 33372 + }, + { + "epoch": 3.114024447140058, + "grad_norm": NaN, + "learning_rate": 0.00014832087492515835, + "loss": 0.0, + "step": 33373 + }, + { + "epoch": 3.114117756834935, + "grad_norm": NaN, + "learning_rate": 0.00014831331161678003, + "loss": 0.0, + "step": 33374 + }, + { + "epoch": 3.1142110665298124, + "grad_norm": NaN, + "learning_rate": 0.00014830574831269037, + "loss": 0.0, + "step": 33375 + }, + { + "epoch": 3.11430437622469, + "grad_norm": NaN, + "learning_rate": 0.00014829818501290882, + "loss": 0.0, + "step": 33376 + }, + { + "epoch": 3.1143976859195672, + "grad_norm": NaN, + "learning_rate": 0.00014829062171745438, + "loss": 0.0, + "step": 33377 + }, + { + "epoch": 3.114490995614444, + "grad_norm": NaN, + "learning_rate": 0.00014828305842634642, + "loss": 0.0, + "step": 33378 + }, + { + "epoch": 3.1145843053093216, + "grad_norm": NaN, + "learning_rate": 0.0001482754951396042, + "loss": 0.0, + "step": 33379 + }, + { + "epoch": 3.114677615004199, + "grad_norm": NaN, + "learning_rate": 0.00014826793185724677, + "loss": 0.0, + "step": 33380 + }, + { + "epoch": 3.1147709246990765, + "grad_norm": NaN, + "learning_rate": 0.00014826036857929354, + "loss": 0.0, + "step": 33381 + }, + { + "epoch": 3.1148642343939534, + "grad_norm": NaN, + "learning_rate": 0.00014825280530576367, + "loss": 0.0, + "step": 33382 + }, + { + "epoch": 3.114957544088831, + "grad_norm": NaN, + "learning_rate": 0.00014824524203667637, + "loss": 0.0, + "step": 33383 + }, + { + "epoch": 3.1150508537837083, + "grad_norm": NaN, + "learning_rate": 0.00014823767877205092, + "loss": 0.0, + "step": 33384 + }, + { + "epoch": 3.1151441634785852, + "grad_norm": NaN, + "learning_rate": 0.00014823011551190658, + "loss": 0.0, + "step": 33385 + }, + { + "epoch": 3.1152374731734627, + "grad_norm": NaN, + "learning_rate": 0.00014822255225626244, + "loss": 0.0, + "step": 33386 + }, + { + "epoch": 3.11533078286834, + "grad_norm": NaN, + "learning_rate": 0.00014821498900513788, + "loss": 0.0, + "step": 33387 + }, + { + "epoch": 3.1154240925632175, + "grad_norm": NaN, + "learning_rate": 0.0001482074257585521, + "loss": 0.0, + "step": 33388 + }, + { + "epoch": 3.1155174022580945, + "grad_norm": NaN, + "learning_rate": 0.00014819986251652425, + "loss": 0.0, + "step": 33389 + }, + { + "epoch": 3.115610711952972, + "grad_norm": NaN, + "learning_rate": 0.00014819229927907362, + "loss": 0.0, + "step": 33390 + }, + { + "epoch": 3.1157040216478493, + "grad_norm": NaN, + "learning_rate": 0.0001481847360462195, + "loss": 0.0, + "step": 33391 + }, + { + "epoch": 3.1157973313427263, + "grad_norm": NaN, + "learning_rate": 0.00014817717281798097, + "loss": 0.0, + "step": 33392 + }, + { + "epoch": 3.1158906410376037, + "grad_norm": NaN, + "learning_rate": 0.0001481696095943774, + "loss": 0.0, + "step": 33393 + }, + { + "epoch": 3.115983950732481, + "grad_norm": NaN, + "learning_rate": 0.000148162046375428, + "loss": 0.0, + "step": 33394 + }, + { + "epoch": 3.1160772604273586, + "grad_norm": NaN, + "learning_rate": 0.00014815448316115188, + "loss": 0.0, + "step": 33395 + }, + { + "epoch": 3.1161705701222355, + "grad_norm": NaN, + "learning_rate": 0.00014814691995156842, + "loss": 0.0, + "step": 33396 + }, + { + "epoch": 3.116263879817113, + "grad_norm": NaN, + "learning_rate": 0.00014813935674669684, + "loss": 0.0, + "step": 33397 + }, + { + "epoch": 3.1163571895119904, + "grad_norm": NaN, + "learning_rate": 0.00014813179354655623, + "loss": 0.0, + "step": 33398 + }, + { + "epoch": 3.116450499206868, + "grad_norm": NaN, + "learning_rate": 0.000148124230351166, + "loss": 0.0, + "step": 33399 + }, + { + "epoch": 3.1165438089017448, + "grad_norm": NaN, + "learning_rate": 0.00014811666716054526, + "loss": 0.0, + "step": 33400 + }, + { + "epoch": 3.116637118596622, + "grad_norm": NaN, + "learning_rate": 0.00014810910397471325, + "loss": 0.0, + "step": 33401 + }, + { + "epoch": 3.1167304282914996, + "grad_norm": NaN, + "learning_rate": 0.00014810154079368927, + "loss": 0.0, + "step": 33402 + }, + { + "epoch": 3.1168237379863766, + "grad_norm": NaN, + "learning_rate": 0.00014809397761749249, + "loss": 0.0, + "step": 33403 + }, + { + "epoch": 3.116917047681254, + "grad_norm": NaN, + "learning_rate": 0.00014808641444614214, + "loss": 0.0, + "step": 33404 + }, + { + "epoch": 3.1170103573761314, + "grad_norm": NaN, + "learning_rate": 0.00014807885127965752, + "loss": 0.0, + "step": 33405 + }, + { + "epoch": 3.117103667071009, + "grad_norm": NaN, + "learning_rate": 0.00014807128811805779, + "loss": 0.0, + "step": 33406 + }, + { + "epoch": 3.117196976765886, + "grad_norm": NaN, + "learning_rate": 0.00014806372496136216, + "loss": 0.0, + "step": 33407 + }, + { + "epoch": 3.1172902864607632, + "grad_norm": NaN, + "learning_rate": 0.00014805616180958999, + "loss": 0.0, + "step": 33408 + }, + { + "epoch": 3.1173835961556406, + "grad_norm": NaN, + "learning_rate": 0.00014804859866276036, + "loss": 0.0, + "step": 33409 + }, + { + "epoch": 3.117476905850518, + "grad_norm": NaN, + "learning_rate": 0.00014804103552089256, + "loss": 0.0, + "step": 33410 + }, + { + "epoch": 3.117570215545395, + "grad_norm": NaN, + "learning_rate": 0.00014803347238400588, + "loss": 0.0, + "step": 33411 + }, + { + "epoch": 3.1176635252402725, + "grad_norm": NaN, + "learning_rate": 0.00014802590925211943, + "loss": 0.0, + "step": 33412 + }, + { + "epoch": 3.11775683493515, + "grad_norm": NaN, + "learning_rate": 0.00014801834612525254, + "loss": 0.0, + "step": 33413 + }, + { + "epoch": 3.117850144630027, + "grad_norm": NaN, + "learning_rate": 0.00014801078300342446, + "loss": 0.0, + "step": 33414 + }, + { + "epoch": 3.1179434543249043, + "grad_norm": NaN, + "learning_rate": 0.0001480032198866543, + "loss": 0.0, + "step": 33415 + }, + { + "epoch": 3.1180367640197817, + "grad_norm": NaN, + "learning_rate": 0.00014799565677496137, + "loss": 0.0, + "step": 33416 + }, + { + "epoch": 3.118130073714659, + "grad_norm": NaN, + "learning_rate": 0.00014798809366836494, + "loss": 0.0, + "step": 33417 + }, + { + "epoch": 3.118223383409536, + "grad_norm": NaN, + "learning_rate": 0.00014798053056688413, + "loss": 0.0, + "step": 33418 + }, + { + "epoch": 3.1183166931044135, + "grad_norm": NaN, + "learning_rate": 0.00014797296747053826, + "loss": 0.0, + "step": 33419 + }, + { + "epoch": 3.118410002799291, + "grad_norm": NaN, + "learning_rate": 0.00014796540437934656, + "loss": 0.0, + "step": 33420 + }, + { + "epoch": 3.1185033124941683, + "grad_norm": NaN, + "learning_rate": 0.00014795784129332818, + "loss": 0.0, + "step": 33421 + }, + { + "epoch": 3.1185966221890453, + "grad_norm": NaN, + "learning_rate": 0.00014795027821250242, + "loss": 0.0, + "step": 33422 + }, + { + "epoch": 3.1186899318839227, + "grad_norm": NaN, + "learning_rate": 0.00014794271513688854, + "loss": 0.0, + "step": 33423 + }, + { + "epoch": 3.1187832415788, + "grad_norm": NaN, + "learning_rate": 0.00014793515206650565, + "loss": 0.0, + "step": 33424 + }, + { + "epoch": 3.118876551273677, + "grad_norm": NaN, + "learning_rate": 0.0001479275890013731, + "loss": 0.0, + "step": 33425 + }, + { + "epoch": 3.1189698609685546, + "grad_norm": NaN, + "learning_rate": 0.00014792002594151012, + "loss": 0.0, + "step": 33426 + }, + { + "epoch": 3.119063170663432, + "grad_norm": NaN, + "learning_rate": 0.0001479124628869358, + "loss": 0.0, + "step": 33427 + }, + { + "epoch": 3.1191564803583094, + "grad_norm": NaN, + "learning_rate": 0.00014790489983766954, + "loss": 0.0, + "step": 33428 + }, + { + "epoch": 3.1192497900531864, + "grad_norm": NaN, + "learning_rate": 0.00014789733679373052, + "loss": 0.0, + "step": 33429 + }, + { + "epoch": 3.119343099748064, + "grad_norm": NaN, + "learning_rate": 0.00014788977375513788, + "loss": 0.0, + "step": 33430 + }, + { + "epoch": 3.119436409442941, + "grad_norm": NaN, + "learning_rate": 0.00014788221072191095, + "loss": 0.0, + "step": 33431 + }, + { + "epoch": 3.1195297191378186, + "grad_norm": NaN, + "learning_rate": 0.00014787464769406897, + "loss": 0.0, + "step": 33432 + }, + { + "epoch": 3.1196230288326956, + "grad_norm": NaN, + "learning_rate": 0.00014786708467163105, + "loss": 0.0, + "step": 33433 + }, + { + "epoch": 3.119716338527573, + "grad_norm": NaN, + "learning_rate": 0.00014785952165461655, + "loss": 0.0, + "step": 33434 + }, + { + "epoch": 3.1198096482224504, + "grad_norm": NaN, + "learning_rate": 0.00014785195864304467, + "loss": 0.0, + "step": 33435 + }, + { + "epoch": 3.1199029579173274, + "grad_norm": NaN, + "learning_rate": 0.0001478443956369346, + "loss": 0.0, + "step": 33436 + }, + { + "epoch": 3.119996267612205, + "grad_norm": NaN, + "learning_rate": 0.00014783683263630556, + "loss": 0.0, + "step": 33437 + }, + { + "epoch": 3.1200895773070823, + "grad_norm": NaN, + "learning_rate": 0.00014782926964117688, + "loss": 0.0, + "step": 33438 + }, + { + "epoch": 3.1201828870019597, + "grad_norm": NaN, + "learning_rate": 0.00014782170665156768, + "loss": 0.0, + "step": 33439 + }, + { + "epoch": 3.1202761966968366, + "grad_norm": NaN, + "learning_rate": 0.00014781414366749724, + "loss": 0.0, + "step": 33440 + }, + { + "epoch": 3.120369506391714, + "grad_norm": NaN, + "learning_rate": 0.00014780658068898483, + "loss": 0.0, + "step": 33441 + }, + { + "epoch": 3.1204628160865915, + "grad_norm": NaN, + "learning_rate": 0.00014779901771604955, + "loss": 0.0, + "step": 33442 + }, + { + "epoch": 3.120556125781469, + "grad_norm": NaN, + "learning_rate": 0.0001477914547487108, + "loss": 0.0, + "step": 33443 + }, + { + "epoch": 3.120649435476346, + "grad_norm": NaN, + "learning_rate": 0.0001477838917869877, + "loss": 0.0, + "step": 33444 + }, + { + "epoch": 3.1207427451712233, + "grad_norm": NaN, + "learning_rate": 0.00014777632883089946, + "loss": 0.0, + "step": 33445 + }, + { + "epoch": 3.1208360548661007, + "grad_norm": NaN, + "learning_rate": 0.00014776876588046544, + "loss": 0.0, + "step": 33446 + }, + { + "epoch": 3.1209293645609777, + "grad_norm": NaN, + "learning_rate": 0.00014776120293570472, + "loss": 0.0, + "step": 33447 + }, + { + "epoch": 3.121022674255855, + "grad_norm": NaN, + "learning_rate": 0.00014775363999663659, + "loss": 0.0, + "step": 33448 + }, + { + "epoch": 3.1211159839507325, + "grad_norm": NaN, + "learning_rate": 0.00014774607706328035, + "loss": 0.0, + "step": 33449 + }, + { + "epoch": 3.12120929364561, + "grad_norm": NaN, + "learning_rate": 0.0001477385141356551, + "loss": 0.0, + "step": 33450 + }, + { + "epoch": 3.121302603340487, + "grad_norm": NaN, + "learning_rate": 0.00014773095121378018, + "loss": 0.0, + "step": 33451 + }, + { + "epoch": 3.1213959130353643, + "grad_norm": NaN, + "learning_rate": 0.0001477233882976748, + "loss": 0.0, + "step": 33452 + }, + { + "epoch": 3.1214892227302418, + "grad_norm": NaN, + "learning_rate": 0.00014771582538735813, + "loss": 0.0, + "step": 33453 + }, + { + "epoch": 3.121582532425119, + "grad_norm": NaN, + "learning_rate": 0.00014770826248284942, + "loss": 0.0, + "step": 33454 + }, + { + "epoch": 3.121675842119996, + "grad_norm": NaN, + "learning_rate": 0.000147700699584168, + "loss": 0.0, + "step": 33455 + }, + { + "epoch": 3.1217691518148736, + "grad_norm": NaN, + "learning_rate": 0.00014769313669133296, + "loss": 0.0, + "step": 33456 + }, + { + "epoch": 3.121862461509751, + "grad_norm": NaN, + "learning_rate": 0.00014768557380436358, + "loss": 0.0, + "step": 33457 + }, + { + "epoch": 3.121955771204628, + "grad_norm": NaN, + "learning_rate": 0.00014767801092327917, + "loss": 0.0, + "step": 33458 + }, + { + "epoch": 3.1220490808995054, + "grad_norm": NaN, + "learning_rate": 0.0001476704480480988, + "loss": 0.0, + "step": 33459 + }, + { + "epoch": 3.122142390594383, + "grad_norm": NaN, + "learning_rate": 0.00014766288517884183, + "loss": 0.0, + "step": 33460 + }, + { + "epoch": 3.1222357002892602, + "grad_norm": NaN, + "learning_rate": 0.0001476553223155275, + "loss": 0.0, + "step": 33461 + }, + { + "epoch": 3.122329009984137, + "grad_norm": NaN, + "learning_rate": 0.0001476477594581749, + "loss": 0.0, + "step": 33462 + }, + { + "epoch": 3.1224223196790146, + "grad_norm": NaN, + "learning_rate": 0.0001476401966068034, + "loss": 0.0, + "step": 33463 + }, + { + "epoch": 3.122515629373892, + "grad_norm": NaN, + "learning_rate": 0.0001476326337614322, + "loss": 0.0, + "step": 33464 + }, + { + "epoch": 3.122608939068769, + "grad_norm": NaN, + "learning_rate": 0.00014762507092208046, + "loss": 0.0, + "step": 33465 + }, + { + "epoch": 3.1227022487636464, + "grad_norm": NaN, + "learning_rate": 0.0001476175080887675, + "loss": 0.0, + "step": 33466 + }, + { + "epoch": 3.122795558458524, + "grad_norm": NaN, + "learning_rate": 0.0001476099452615125, + "loss": 0.0, + "step": 33467 + }, + { + "epoch": 3.1228888681534013, + "grad_norm": NaN, + "learning_rate": 0.00014760238244033468, + "loss": 0.0, + "step": 33468 + }, + { + "epoch": 3.1229821778482783, + "grad_norm": NaN, + "learning_rate": 0.00014759481962525333, + "loss": 0.0, + "step": 33469 + }, + { + "epoch": 3.1230754875431557, + "grad_norm": NaN, + "learning_rate": 0.00014758725681628763, + "loss": 0.0, + "step": 33470 + }, + { + "epoch": 3.123168797238033, + "grad_norm": NaN, + "learning_rate": 0.0001475796940134568, + "loss": 0.0, + "step": 33471 + }, + { + "epoch": 3.1232621069329105, + "grad_norm": NaN, + "learning_rate": 0.0001475721312167801, + "loss": 0.0, + "step": 33472 + }, + { + "epoch": 3.1233554166277875, + "grad_norm": NaN, + "learning_rate": 0.0001475645684262768, + "loss": 0.0, + "step": 33473 + }, + { + "epoch": 3.123448726322665, + "grad_norm": NaN, + "learning_rate": 0.000147557005641966, + "loss": 0.0, + "step": 33474 + }, + { + "epoch": 3.1235420360175423, + "grad_norm": NaN, + "learning_rate": 0.00014754944286386702, + "loss": 0.0, + "step": 33475 + }, + { + "epoch": 3.1236353457124197, + "grad_norm": NaN, + "learning_rate": 0.00014754188009199917, + "loss": 0.0, + "step": 33476 + }, + { + "epoch": 3.1237286554072967, + "grad_norm": NaN, + "learning_rate": 0.00014753431732638148, + "loss": 0.0, + "step": 33477 + }, + { + "epoch": 3.123821965102174, + "grad_norm": NaN, + "learning_rate": 0.00014752675456703335, + "loss": 0.0, + "step": 33478 + }, + { + "epoch": 3.1239152747970516, + "grad_norm": NaN, + "learning_rate": 0.00014751919181397397, + "loss": 0.0, + "step": 33479 + }, + { + "epoch": 3.1240085844919285, + "grad_norm": NaN, + "learning_rate": 0.00014751162906722248, + "loss": 0.0, + "step": 33480 + }, + { + "epoch": 3.124101894186806, + "grad_norm": NaN, + "learning_rate": 0.00014750406632679824, + "loss": 0.0, + "step": 33481 + }, + { + "epoch": 3.1241952038816834, + "grad_norm": NaN, + "learning_rate": 0.00014749650359272042, + "loss": 0.0, + "step": 33482 + }, + { + "epoch": 3.124288513576561, + "grad_norm": NaN, + "learning_rate": 0.0001474889408650082, + "loss": 0.0, + "step": 33483 + }, + { + "epoch": 3.1243818232714378, + "grad_norm": NaN, + "learning_rate": 0.00014748137814368084, + "loss": 0.0, + "step": 33484 + }, + { + "epoch": 3.124475132966315, + "grad_norm": NaN, + "learning_rate": 0.00014747381542875774, + "loss": 0.0, + "step": 33485 + }, + { + "epoch": 3.1245684426611926, + "grad_norm": NaN, + "learning_rate": 0.00014746625272025785, + "loss": 0.0, + "step": 33486 + }, + { + "epoch": 3.1246617523560696, + "grad_norm": NaN, + "learning_rate": 0.0001474586900182005, + "loss": 0.0, + "step": 33487 + }, + { + "epoch": 3.124755062050947, + "grad_norm": NaN, + "learning_rate": 0.00014745112732260506, + "loss": 0.0, + "step": 33488 + }, + { + "epoch": 3.1248483717458244, + "grad_norm": NaN, + "learning_rate": 0.00014744356463349057, + "loss": 0.0, + "step": 33489 + }, + { + "epoch": 3.124941681440702, + "grad_norm": NaN, + "learning_rate": 0.00014743600195087642, + "loss": 0.0, + "step": 33490 + }, + { + "epoch": 3.125034991135579, + "grad_norm": NaN, + "learning_rate": 0.00014742843927478165, + "loss": 0.0, + "step": 33491 + }, + { + "epoch": 3.1251283008304562, + "grad_norm": NaN, + "learning_rate": 0.00014742087660522566, + "loss": 0.0, + "step": 33492 + }, + { + "epoch": 3.1252216105253336, + "grad_norm": NaN, + "learning_rate": 0.00014741331394222765, + "loss": 0.0, + "step": 33493 + }, + { + "epoch": 3.125314920220211, + "grad_norm": NaN, + "learning_rate": 0.00014740575128580673, + "loss": 0.0, + "step": 33494 + }, + { + "epoch": 3.125408229915088, + "grad_norm": NaN, + "learning_rate": 0.00014739818863598224, + "loss": 0.0, + "step": 33495 + }, + { + "epoch": 3.1255015396099655, + "grad_norm": NaN, + "learning_rate": 0.00014739062599277344, + "loss": 0.0, + "step": 33496 + }, + { + "epoch": 3.125594849304843, + "grad_norm": NaN, + "learning_rate": 0.00014738306335619945, + "loss": 0.0, + "step": 33497 + }, + { + "epoch": 3.1256881589997203, + "grad_norm": NaN, + "learning_rate": 0.00014737550072627958, + "loss": 0.0, + "step": 33498 + }, + { + "epoch": 3.1257814686945973, + "grad_norm": NaN, + "learning_rate": 0.00014736793810303305, + "loss": 0.0, + "step": 33499 + }, + { + "epoch": 3.1258747783894747, + "grad_norm": NaN, + "learning_rate": 0.000147360375486479, + "loss": 0.0, + "step": 33500 + }, + { + "epoch": 3.125968088084352, + "grad_norm": NaN, + "learning_rate": 0.0001473528128766368, + "loss": 0.0, + "step": 33501 + }, + { + "epoch": 3.126061397779229, + "grad_norm": NaN, + "learning_rate": 0.00014734525027352562, + "loss": 0.0, + "step": 33502 + }, + { + "epoch": 3.1261547074741065, + "grad_norm": NaN, + "learning_rate": 0.00014733768767716463, + "loss": 0.0, + "step": 33503 + }, + { + "epoch": 3.126248017168984, + "grad_norm": NaN, + "learning_rate": 0.00014733012508757313, + "loss": 0.0, + "step": 33504 + }, + { + "epoch": 3.1263413268638613, + "grad_norm": NaN, + "learning_rate": 0.00014732256250477038, + "loss": 0.0, + "step": 33505 + }, + { + "epoch": 3.1264346365587383, + "grad_norm": NaN, + "learning_rate": 0.0001473149999287755, + "loss": 0.0, + "step": 33506 + }, + { + "epoch": 3.1265279462536157, + "grad_norm": NaN, + "learning_rate": 0.0001473074373596078, + "loss": 0.0, + "step": 33507 + }, + { + "epoch": 3.126621255948493, + "grad_norm": NaN, + "learning_rate": 0.00014729987479728652, + "loss": 0.0, + "step": 33508 + }, + { + "epoch": 3.12671456564337, + "grad_norm": NaN, + "learning_rate": 0.0001472923122418308, + "loss": 0.0, + "step": 33509 + }, + { + "epoch": 3.1268078753382476, + "grad_norm": NaN, + "learning_rate": 0.00014728474969325994, + "loss": 0.0, + "step": 33510 + }, + { + "epoch": 3.126901185033125, + "grad_norm": NaN, + "learning_rate": 0.0001472771871515932, + "loss": 0.0, + "step": 33511 + }, + { + "epoch": 3.1269944947280024, + "grad_norm": NaN, + "learning_rate": 0.00014726962461684973, + "loss": 0.0, + "step": 33512 + }, + { + "epoch": 3.1270878044228794, + "grad_norm": NaN, + "learning_rate": 0.0001472620620890488, + "loss": 0.0, + "step": 33513 + }, + { + "epoch": 3.127181114117757, + "grad_norm": NaN, + "learning_rate": 0.00014725449956820967, + "loss": 0.0, + "step": 33514 + }, + { + "epoch": 3.127274423812634, + "grad_norm": NaN, + "learning_rate": 0.00014724693705435148, + "loss": 0.0, + "step": 33515 + }, + { + "epoch": 3.1273677335075116, + "grad_norm": NaN, + "learning_rate": 0.00014723937454749354, + "loss": 0.0, + "step": 33516 + }, + { + "epoch": 3.1274610432023886, + "grad_norm": NaN, + "learning_rate": 0.00014723181204765508, + "loss": 0.0, + "step": 33517 + }, + { + "epoch": 3.127554352897266, + "grad_norm": NaN, + "learning_rate": 0.00014722424955485524, + "loss": 0.0, + "step": 33518 + }, + { + "epoch": 3.1276476625921434, + "grad_norm": NaN, + "learning_rate": 0.00014721668706911334, + "loss": 0.0, + "step": 33519 + }, + { + "epoch": 3.1277409722870204, + "grad_norm": NaN, + "learning_rate": 0.00014720912459044861, + "loss": 0.0, + "step": 33520 + }, + { + "epoch": 3.127834281981898, + "grad_norm": NaN, + "learning_rate": 0.00014720156211888019, + "loss": 0.0, + "step": 33521 + }, + { + "epoch": 3.1279275916767753, + "grad_norm": NaN, + "learning_rate": 0.00014719399965442738, + "loss": 0.0, + "step": 33522 + }, + { + "epoch": 3.1280209013716527, + "grad_norm": NaN, + "learning_rate": 0.00014718643719710949, + "loss": 0.0, + "step": 33523 + }, + { + "epoch": 3.1281142110665296, + "grad_norm": NaN, + "learning_rate": 0.00014717887474694557, + "loss": 0.0, + "step": 33524 + }, + { + "epoch": 3.128207520761407, + "grad_norm": NaN, + "learning_rate": 0.0001471713123039549, + "loss": 0.0, + "step": 33525 + }, + { + "epoch": 3.1283008304562845, + "grad_norm": NaN, + "learning_rate": 0.00014716374986815684, + "loss": 0.0, + "step": 33526 + }, + { + "epoch": 3.128394140151162, + "grad_norm": NaN, + "learning_rate": 0.00014715618743957047, + "loss": 0.0, + "step": 33527 + }, + { + "epoch": 3.128487449846039, + "grad_norm": NaN, + "learning_rate": 0.00014714862501821505, + "loss": 0.0, + "step": 33528 + }, + { + "epoch": 3.1285807595409163, + "grad_norm": NaN, + "learning_rate": 0.0001471410626041099, + "loss": 0.0, + "step": 33529 + }, + { + "epoch": 3.1286740692357937, + "grad_norm": NaN, + "learning_rate": 0.00014713350019727415, + "loss": 0.0, + "step": 33530 + }, + { + "epoch": 3.1287673789306707, + "grad_norm": NaN, + "learning_rate": 0.00014712593779772702, + "loss": 0.0, + "step": 33531 + }, + { + "epoch": 3.128860688625548, + "grad_norm": NaN, + "learning_rate": 0.00014711837540548787, + "loss": 0.0, + "step": 33532 + }, + { + "epoch": 3.1289539983204255, + "grad_norm": NaN, + "learning_rate": 0.0001471108130205758, + "loss": 0.0, + "step": 33533 + }, + { + "epoch": 3.129047308015303, + "grad_norm": NaN, + "learning_rate": 0.00014710325064301008, + "loss": 0.0, + "step": 33534 + }, + { + "epoch": 3.12914061771018, + "grad_norm": NaN, + "learning_rate": 0.0001470956882728099, + "loss": 0.0, + "step": 33535 + }, + { + "epoch": 3.1292339274050573, + "grad_norm": NaN, + "learning_rate": 0.00014708812590999453, + "loss": 0.0, + "step": 33536 + }, + { + "epoch": 3.1293272370999348, + "grad_norm": NaN, + "learning_rate": 0.00014708056355458322, + "loss": 0.0, + "step": 33537 + }, + { + "epoch": 3.129420546794812, + "grad_norm": NaN, + "learning_rate": 0.00014707300120659513, + "loss": 0.0, + "step": 33538 + }, + { + "epoch": 3.129513856489689, + "grad_norm": NaN, + "learning_rate": 0.00014706543886604956, + "loss": 0.0, + "step": 33539 + }, + { + "epoch": 3.1296071661845666, + "grad_norm": NaN, + "learning_rate": 0.00014705787653296573, + "loss": 0.0, + "step": 33540 + }, + { + "epoch": 3.129700475879444, + "grad_norm": NaN, + "learning_rate": 0.00014705031420736282, + "loss": 0.0, + "step": 33541 + }, + { + "epoch": 3.129793785574321, + "grad_norm": NaN, + "learning_rate": 0.0001470427518892601, + "loss": 0.0, + "step": 33542 + }, + { + "epoch": 3.1298870952691984, + "grad_norm": NaN, + "learning_rate": 0.0001470351895786768, + "loss": 0.0, + "step": 33543 + }, + { + "epoch": 3.129980404964076, + "grad_norm": NaN, + "learning_rate": 0.00014702762727563206, + "loss": 0.0, + "step": 33544 + }, + { + "epoch": 3.1300737146589532, + "grad_norm": NaN, + "learning_rate": 0.00014702006498014525, + "loss": 0.0, + "step": 33545 + }, + { + "epoch": 3.13016702435383, + "grad_norm": NaN, + "learning_rate": 0.00014701250269223554, + "loss": 0.0, + "step": 33546 + }, + { + "epoch": 3.1302603340487076, + "grad_norm": NaN, + "learning_rate": 0.00014700494041192209, + "loss": 0.0, + "step": 33547 + }, + { + "epoch": 3.130353643743585, + "grad_norm": NaN, + "learning_rate": 0.00014699737813922425, + "loss": 0.0, + "step": 33548 + }, + { + "epoch": 3.1304469534384625, + "grad_norm": NaN, + "learning_rate": 0.00014698981587416118, + "loss": 0.0, + "step": 33549 + }, + { + "epoch": 3.1305402631333394, + "grad_norm": NaN, + "learning_rate": 0.0001469822536167521, + "loss": 0.0, + "step": 33550 + }, + { + "epoch": 3.130633572828217, + "grad_norm": NaN, + "learning_rate": 0.00014697469136701623, + "loss": 0.0, + "step": 33551 + }, + { + "epoch": 3.1307268825230943, + "grad_norm": NaN, + "learning_rate": 0.0001469671291249729, + "loss": 0.0, + "step": 33552 + }, + { + "epoch": 3.1308201922179713, + "grad_norm": NaN, + "learning_rate": 0.00014695956689064117, + "loss": 0.0, + "step": 33553 + }, + { + "epoch": 3.1309135019128487, + "grad_norm": NaN, + "learning_rate": 0.00014695200466404042, + "loss": 0.0, + "step": 33554 + }, + { + "epoch": 3.131006811607726, + "grad_norm": NaN, + "learning_rate": 0.00014694444244518984, + "loss": 0.0, + "step": 33555 + }, + { + "epoch": 3.1311001213026035, + "grad_norm": NaN, + "learning_rate": 0.00014693688023410857, + "loss": 0.0, + "step": 33556 + }, + { + "epoch": 3.1311934309974805, + "grad_norm": NaN, + "learning_rate": 0.00014692931803081595, + "loss": 0.0, + "step": 33557 + }, + { + "epoch": 3.131286740692358, + "grad_norm": NaN, + "learning_rate": 0.00014692175583533114, + "loss": 0.0, + "step": 33558 + }, + { + "epoch": 3.1313800503872353, + "grad_norm": NaN, + "learning_rate": 0.00014691419364767341, + "loss": 0.0, + "step": 33559 + }, + { + "epoch": 3.1314733600821123, + "grad_norm": NaN, + "learning_rate": 0.00014690663146786192, + "loss": 0.0, + "step": 33560 + }, + { + "epoch": 3.1315666697769897, + "grad_norm": NaN, + "learning_rate": 0.00014689906929591607, + "loss": 0.0, + "step": 33561 + }, + { + "epoch": 3.131659979471867, + "grad_norm": NaN, + "learning_rate": 0.00014689150713185486, + "loss": 0.0, + "step": 33562 + }, + { + "epoch": 3.1317532891667446, + "grad_norm": NaN, + "learning_rate": 0.00014688394497569762, + "loss": 0.0, + "step": 33563 + }, + { + "epoch": 3.1318465988616215, + "grad_norm": NaN, + "learning_rate": 0.00014687638282746368, + "loss": 0.0, + "step": 33564 + }, + { + "epoch": 3.131939908556499, + "grad_norm": NaN, + "learning_rate": 0.00014686882068717211, + "loss": 0.0, + "step": 33565 + }, + { + "epoch": 3.1320332182513764, + "grad_norm": NaN, + "learning_rate": 0.00014686125855484217, + "loss": 0.0, + "step": 33566 + }, + { + "epoch": 3.132126527946254, + "grad_norm": NaN, + "learning_rate": 0.0001468536964304932, + "loss": 0.0, + "step": 33567 + }, + { + "epoch": 3.1322198376411308, + "grad_norm": NaN, + "learning_rate": 0.00014684613431414432, + "loss": 0.0, + "step": 33568 + }, + { + "epoch": 3.132313147336008, + "grad_norm": NaN, + "learning_rate": 0.00014683857220581473, + "loss": 0.0, + "step": 33569 + }, + { + "epoch": 3.1324064570308856, + "grad_norm": NaN, + "learning_rate": 0.0001468310101055238, + "loss": 0.0, + "step": 33570 + }, + { + "epoch": 3.132499766725763, + "grad_norm": NaN, + "learning_rate": 0.0001468234480132906, + "loss": 0.0, + "step": 33571 + }, + { + "epoch": 3.13259307642064, + "grad_norm": NaN, + "learning_rate": 0.00014681588592913445, + "loss": 0.0, + "step": 33572 + }, + { + "epoch": 3.1326863861155174, + "grad_norm": NaN, + "learning_rate": 0.00014680832385307463, + "loss": 0.0, + "step": 33573 + }, + { + "epoch": 3.132779695810395, + "grad_norm": NaN, + "learning_rate": 0.00014680076178513022, + "loss": 0.0, + "step": 33574 + }, + { + "epoch": 3.132873005505272, + "grad_norm": NaN, + "learning_rate": 0.0001467931997253205, + "loss": 0.0, + "step": 33575 + }, + { + "epoch": 3.1329663152001492, + "grad_norm": NaN, + "learning_rate": 0.00014678563767366482, + "loss": 0.0, + "step": 33576 + }, + { + "epoch": 3.1330596248950267, + "grad_norm": NaN, + "learning_rate": 0.00014677807563018226, + "loss": 0.0, + "step": 33577 + }, + { + "epoch": 3.133152934589904, + "grad_norm": NaN, + "learning_rate": 0.00014677051359489213, + "loss": 0.0, + "step": 33578 + }, + { + "epoch": 3.133246244284781, + "grad_norm": NaN, + "learning_rate": 0.00014676295156781355, + "loss": 0.0, + "step": 33579 + }, + { + "epoch": 3.1333395539796585, + "grad_norm": NaN, + "learning_rate": 0.00014675538954896588, + "loss": 0.0, + "step": 33580 + }, + { + "epoch": 3.133432863674536, + "grad_norm": NaN, + "learning_rate": 0.0001467478275383683, + "loss": 0.0, + "step": 33581 + }, + { + "epoch": 3.133526173369413, + "grad_norm": NaN, + "learning_rate": 0.00014674026553604, + "loss": 0.0, + "step": 33582 + }, + { + "epoch": 3.1336194830642903, + "grad_norm": NaN, + "learning_rate": 0.00014673270354200024, + "loss": 0.0, + "step": 33583 + }, + { + "epoch": 3.1337127927591677, + "grad_norm": NaN, + "learning_rate": 0.0001467251415562683, + "loss": 0.0, + "step": 33584 + }, + { + "epoch": 3.133806102454045, + "grad_norm": NaN, + "learning_rate": 0.0001467175795788633, + "loss": 0.0, + "step": 33585 + }, + { + "epoch": 3.133899412148922, + "grad_norm": NaN, + "learning_rate": 0.00014671001760980452, + "loss": 0.0, + "step": 33586 + }, + { + "epoch": 3.1339927218437995, + "grad_norm": NaN, + "learning_rate": 0.00014670245564911126, + "loss": 0.0, + "step": 33587 + }, + { + "epoch": 3.134086031538677, + "grad_norm": NaN, + "learning_rate": 0.0001466948936968026, + "loss": 0.0, + "step": 33588 + }, + { + "epoch": 3.1341793412335544, + "grad_norm": NaN, + "learning_rate": 0.00014668733175289786, + "loss": 0.0, + "step": 33589 + }, + { + "epoch": 3.1342726509284313, + "grad_norm": NaN, + "learning_rate": 0.0001466797698174163, + "loss": 0.0, + "step": 33590 + }, + { + "epoch": 3.1343659606233087, + "grad_norm": NaN, + "learning_rate": 0.00014667220789037704, + "loss": 0.0, + "step": 33591 + }, + { + "epoch": 3.134459270318186, + "grad_norm": NaN, + "learning_rate": 0.0001466646459717994, + "loss": 0.0, + "step": 33592 + }, + { + "epoch": 3.1345525800130636, + "grad_norm": NaN, + "learning_rate": 0.00014665708406170263, + "loss": 0.0, + "step": 33593 + }, + { + "epoch": 3.1346458897079406, + "grad_norm": NaN, + "learning_rate": 0.00014664952216010581, + "loss": 0.0, + "step": 33594 + }, + { + "epoch": 3.134739199402818, + "grad_norm": NaN, + "learning_rate": 0.00014664196026702828, + "loss": 0.0, + "step": 33595 + }, + { + "epoch": 3.1348325090976954, + "grad_norm": NaN, + "learning_rate": 0.0001466343983824893, + "loss": 0.0, + "step": 33596 + }, + { + "epoch": 3.1349258187925724, + "grad_norm": NaN, + "learning_rate": 0.000146626836506508, + "loss": 0.0, + "step": 33597 + }, + { + "epoch": 3.13501912848745, + "grad_norm": NaN, + "learning_rate": 0.00014661927463910365, + "loss": 0.0, + "step": 33598 + }, + { + "epoch": 3.135112438182327, + "grad_norm": NaN, + "learning_rate": 0.00014661171278029556, + "loss": 0.0, + "step": 33599 + }, + { + "epoch": 3.1352057478772046, + "grad_norm": NaN, + "learning_rate": 0.00014660415093010283, + "loss": 0.0, + "step": 33600 + }, + { + "epoch": 3.1352990575720816, + "grad_norm": NaN, + "learning_rate": 0.00014659658908854469, + "loss": 0.0, + "step": 33601 + }, + { + "epoch": 3.135392367266959, + "grad_norm": NaN, + "learning_rate": 0.0001465890272556405, + "loss": 0.0, + "step": 33602 + }, + { + "epoch": 3.1354856769618364, + "grad_norm": NaN, + "learning_rate": 0.00014658146543140938, + "loss": 0.0, + "step": 33603 + }, + { + "epoch": 3.1355789866567134, + "grad_norm": NaN, + "learning_rate": 0.00014657390361587055, + "loss": 0.0, + "step": 33604 + }, + { + "epoch": 3.135672296351591, + "grad_norm": NaN, + "learning_rate": 0.0001465663418090433, + "loss": 0.0, + "step": 33605 + }, + { + "epoch": 3.1357656060464683, + "grad_norm": NaN, + "learning_rate": 0.0001465587800109468, + "loss": 0.0, + "step": 33606 + }, + { + "epoch": 3.1358589157413457, + "grad_norm": NaN, + "learning_rate": 0.00014655121822160032, + "loss": 0.0, + "step": 33607 + }, + { + "epoch": 3.1359522254362227, + "grad_norm": NaN, + "learning_rate": 0.0001465436564410231, + "loss": 0.0, + "step": 33608 + }, + { + "epoch": 3.1360455351311, + "grad_norm": NaN, + "learning_rate": 0.0001465360946692343, + "loss": 0.0, + "step": 33609 + }, + { + "epoch": 3.1361388448259775, + "grad_norm": NaN, + "learning_rate": 0.00014652853290625316, + "loss": 0.0, + "step": 33610 + }, + { + "epoch": 3.136232154520855, + "grad_norm": NaN, + "learning_rate": 0.000146520971152099, + "loss": 0.0, + "step": 33611 + }, + { + "epoch": 3.136325464215732, + "grad_norm": NaN, + "learning_rate": 0.00014651340940679094, + "loss": 0.0, + "step": 33612 + }, + { + "epoch": 3.1364187739106093, + "grad_norm": NaN, + "learning_rate": 0.00014650584767034823, + "loss": 0.0, + "step": 33613 + }, + { + "epoch": 3.1365120836054867, + "grad_norm": NaN, + "learning_rate": 0.00014649828594279015, + "loss": 0.0, + "step": 33614 + }, + { + "epoch": 3.136605393300364, + "grad_norm": NaN, + "learning_rate": 0.00014649072422413586, + "loss": 0.0, + "step": 33615 + }, + { + "epoch": 3.136698702995241, + "grad_norm": NaN, + "learning_rate": 0.00014648316251440462, + "loss": 0.0, + "step": 33616 + }, + { + "epoch": 3.1367920126901185, + "grad_norm": NaN, + "learning_rate": 0.0001464756008136157, + "loss": 0.0, + "step": 33617 + }, + { + "epoch": 3.136885322384996, + "grad_norm": NaN, + "learning_rate": 0.00014646803912178827, + "loss": 0.0, + "step": 33618 + }, + { + "epoch": 3.136978632079873, + "grad_norm": NaN, + "learning_rate": 0.00014646047743894152, + "loss": 0.0, + "step": 33619 + }, + { + "epoch": 3.1370719417747503, + "grad_norm": NaN, + "learning_rate": 0.0001464529157650948, + "loss": 0.0, + "step": 33620 + }, + { + "epoch": 3.1371652514696278, + "grad_norm": NaN, + "learning_rate": 0.0001464453541002672, + "loss": 0.0, + "step": 33621 + }, + { + "epoch": 3.137258561164505, + "grad_norm": NaN, + "learning_rate": 0.000146437792444478, + "loss": 0.0, + "step": 33622 + }, + { + "epoch": 3.137351870859382, + "grad_norm": NaN, + "learning_rate": 0.00014643023079774652, + "loss": 0.0, + "step": 33623 + }, + { + "epoch": 3.1374451805542596, + "grad_norm": NaN, + "learning_rate": 0.00014642266916009187, + "loss": 0.0, + "step": 33624 + }, + { + "epoch": 3.137538490249137, + "grad_norm": NaN, + "learning_rate": 0.00014641510753153332, + "loss": 0.0, + "step": 33625 + }, + { + "epoch": 3.137631799944014, + "grad_norm": NaN, + "learning_rate": 0.00014640754591209004, + "loss": 0.0, + "step": 33626 + }, + { + "epoch": 3.1377251096388914, + "grad_norm": NaN, + "learning_rate": 0.00014639998430178135, + "loss": 0.0, + "step": 33627 + }, + { + "epoch": 3.137818419333769, + "grad_norm": NaN, + "learning_rate": 0.00014639242270062647, + "loss": 0.0, + "step": 33628 + }, + { + "epoch": 3.1379117290286462, + "grad_norm": NaN, + "learning_rate": 0.0001463848611086445, + "loss": 0.0, + "step": 33629 + }, + { + "epoch": 3.138005038723523, + "grad_norm": NaN, + "learning_rate": 0.0001463772995258548, + "loss": 0.0, + "step": 33630 + }, + { + "epoch": 3.1380983484184006, + "grad_norm": NaN, + "learning_rate": 0.0001463697379522766, + "loss": 0.0, + "step": 33631 + }, + { + "epoch": 3.138191658113278, + "grad_norm": NaN, + "learning_rate": 0.000146362176387929, + "loss": 0.0, + "step": 33632 + }, + { + "epoch": 3.1382849678081555, + "grad_norm": NaN, + "learning_rate": 0.0001463546148328313, + "loss": 0.0, + "step": 33633 + }, + { + "epoch": 3.1383782775030324, + "grad_norm": NaN, + "learning_rate": 0.00014634705328700286, + "loss": 0.0, + "step": 33634 + }, + { + "epoch": 3.13847158719791, + "grad_norm": NaN, + "learning_rate": 0.00014633949175046267, + "loss": 0.0, + "step": 33635 + }, + { + "epoch": 3.1385648968927873, + "grad_norm": NaN, + "learning_rate": 0.00014633193022323005, + "loss": 0.0, + "step": 33636 + }, + { + "epoch": 3.1386582065876643, + "grad_norm": NaN, + "learning_rate": 0.00014632436870532433, + "loss": 0.0, + "step": 33637 + }, + { + "epoch": 3.1387515162825417, + "grad_norm": NaN, + "learning_rate": 0.00014631680719676463, + "loss": 0.0, + "step": 33638 + }, + { + "epoch": 3.138844825977419, + "grad_norm": NaN, + "learning_rate": 0.00014630924569757016, + "loss": 0.0, + "step": 33639 + }, + { + "epoch": 3.1389381356722965, + "grad_norm": NaN, + "learning_rate": 0.00014630168420776024, + "loss": 0.0, + "step": 33640 + }, + { + "epoch": 3.1390314453671735, + "grad_norm": NaN, + "learning_rate": 0.00014629412272735401, + "loss": 0.0, + "step": 33641 + }, + { + "epoch": 3.139124755062051, + "grad_norm": NaN, + "learning_rate": 0.00014628656125637067, + "loss": 0.0, + "step": 33642 + }, + { + "epoch": 3.1392180647569283, + "grad_norm": NaN, + "learning_rate": 0.00014627899979482962, + "loss": 0.0, + "step": 33643 + }, + { + "epoch": 3.1393113744518057, + "grad_norm": NaN, + "learning_rate": 0.0001462714383427499, + "loss": 0.0, + "step": 33644 + }, + { + "epoch": 3.1394046841466827, + "grad_norm": NaN, + "learning_rate": 0.0001462638769001508, + "loss": 0.0, + "step": 33645 + }, + { + "epoch": 3.13949799384156, + "grad_norm": NaN, + "learning_rate": 0.00014625631546705161, + "loss": 0.0, + "step": 33646 + }, + { + "epoch": 3.1395913035364376, + "grad_norm": NaN, + "learning_rate": 0.00014624875404347146, + "loss": 0.0, + "step": 33647 + }, + { + "epoch": 3.1396846132313145, + "grad_norm": NaN, + "learning_rate": 0.0001462411926294296, + "loss": 0.0, + "step": 33648 + }, + { + "epoch": 3.139777922926192, + "grad_norm": NaN, + "learning_rate": 0.00014623363122494534, + "loss": 0.0, + "step": 33649 + }, + { + "epoch": 3.1398712326210694, + "grad_norm": NaN, + "learning_rate": 0.00014622606983003778, + "loss": 0.0, + "step": 33650 + }, + { + "epoch": 3.139964542315947, + "grad_norm": NaN, + "learning_rate": 0.0001462185084447262, + "loss": 0.0, + "step": 33651 + }, + { + "epoch": 3.1400578520108238, + "grad_norm": NaN, + "learning_rate": 0.0001462109470690299, + "loss": 0.0, + "step": 33652 + }, + { + "epoch": 3.140151161705701, + "grad_norm": NaN, + "learning_rate": 0.000146203385702968, + "loss": 0.0, + "step": 33653 + }, + { + "epoch": 3.1402444714005786, + "grad_norm": NaN, + "learning_rate": 0.00014619582434655971, + "loss": 0.0, + "step": 33654 + }, + { + "epoch": 3.1403377810954556, + "grad_norm": NaN, + "learning_rate": 0.00014618826299982442, + "loss": 0.0, + "step": 33655 + }, + { + "epoch": 3.140431090790333, + "grad_norm": NaN, + "learning_rate": 0.0001461807016627812, + "loss": 0.0, + "step": 33656 + }, + { + "epoch": 3.1405244004852104, + "grad_norm": NaN, + "learning_rate": 0.00014617314033544925, + "loss": 0.0, + "step": 33657 + }, + { + "epoch": 3.140617710180088, + "grad_norm": NaN, + "learning_rate": 0.00014616557901784798, + "loss": 0.0, + "step": 33658 + }, + { + "epoch": 3.140711019874965, + "grad_norm": NaN, + "learning_rate": 0.00014615801770999647, + "loss": 0.0, + "step": 33659 + }, + { + "epoch": 3.1408043295698422, + "grad_norm": NaN, + "learning_rate": 0.00014615045641191393, + "loss": 0.0, + "step": 33660 + }, + { + "epoch": 3.1408976392647197, + "grad_norm": NaN, + "learning_rate": 0.00014614289512361974, + "loss": 0.0, + "step": 33661 + }, + { + "epoch": 3.140990948959597, + "grad_norm": NaN, + "learning_rate": 0.00014613533384513297, + "loss": 0.0, + "step": 33662 + }, + { + "epoch": 3.141084258654474, + "grad_norm": NaN, + "learning_rate": 0.00014612777257647287, + "loss": 0.0, + "step": 33663 + }, + { + "epoch": 3.1411775683493515, + "grad_norm": NaN, + "learning_rate": 0.00014612021131765876, + "loss": 0.0, + "step": 33664 + }, + { + "epoch": 3.141270878044229, + "grad_norm": NaN, + "learning_rate": 0.00014611265006870975, + "loss": 0.0, + "step": 33665 + }, + { + "epoch": 3.1413641877391063, + "grad_norm": NaN, + "learning_rate": 0.00014610508882964513, + "loss": 0.0, + "step": 33666 + }, + { + "epoch": 3.1414574974339833, + "grad_norm": NaN, + "learning_rate": 0.00014609752760048414, + "loss": 0.0, + "step": 33667 + }, + { + "epoch": 3.1415508071288607, + "grad_norm": NaN, + "learning_rate": 0.00014608996638124598, + "loss": 0.0, + "step": 33668 + }, + { + "epoch": 3.141644116823738, + "grad_norm": NaN, + "learning_rate": 0.0001460824051719499, + "loss": 0.0, + "step": 33669 + }, + { + "epoch": 3.141737426518615, + "grad_norm": NaN, + "learning_rate": 0.00014607484397261505, + "loss": 0.0, + "step": 33670 + }, + { + "epoch": 3.1418307362134925, + "grad_norm": NaN, + "learning_rate": 0.0001460672827832607, + "loss": 0.0, + "step": 33671 + }, + { + "epoch": 3.14192404590837, + "grad_norm": NaN, + "learning_rate": 0.00014605972160390616, + "loss": 0.0, + "step": 33672 + }, + { + "epoch": 3.1420173556032474, + "grad_norm": NaN, + "learning_rate": 0.00014605216043457053, + "loss": 0.0, + "step": 33673 + }, + { + "epoch": 3.1421106652981243, + "grad_norm": NaN, + "learning_rate": 0.00014604459927527303, + "loss": 0.0, + "step": 33674 + }, + { + "epoch": 3.1422039749930017, + "grad_norm": NaN, + "learning_rate": 0.00014603703812603305, + "loss": 0.0, + "step": 33675 + }, + { + "epoch": 3.142297284687879, + "grad_norm": NaN, + "learning_rate": 0.00014602947698686965, + "loss": 0.0, + "step": 33676 + }, + { + "epoch": 3.142390594382756, + "grad_norm": NaN, + "learning_rate": 0.0001460219158578021, + "loss": 0.0, + "step": 33677 + }, + { + "epoch": 3.1424839040776336, + "grad_norm": NaN, + "learning_rate": 0.0001460143547388497, + "loss": 0.0, + "step": 33678 + }, + { + "epoch": 3.142577213772511, + "grad_norm": NaN, + "learning_rate": 0.00014600679363003158, + "loss": 0.0, + "step": 33679 + }, + { + "epoch": 3.1426705234673884, + "grad_norm": NaN, + "learning_rate": 0.00014599923253136697, + "loss": 0.0, + "step": 33680 + }, + { + "epoch": 3.1427638331622654, + "grad_norm": NaN, + "learning_rate": 0.0001459916714428752, + "loss": 0.0, + "step": 33681 + }, + { + "epoch": 3.142857142857143, + "grad_norm": NaN, + "learning_rate": 0.00014598411036457538, + "loss": 0.0, + "step": 33682 + }, + { + "epoch": 3.14295045255202, + "grad_norm": NaN, + "learning_rate": 0.00014597654929648674, + "loss": 0.0, + "step": 33683 + }, + { + "epoch": 3.1430437622468976, + "grad_norm": NaN, + "learning_rate": 0.0001459689882386286, + "loss": 0.0, + "step": 33684 + }, + { + "epoch": 3.1431370719417746, + "grad_norm": NaN, + "learning_rate": 0.00014596142719102011, + "loss": 0.0, + "step": 33685 + }, + { + "epoch": 3.143230381636652, + "grad_norm": NaN, + "learning_rate": 0.00014595386615368048, + "loss": 0.0, + "step": 33686 + }, + { + "epoch": 3.1433236913315294, + "grad_norm": NaN, + "learning_rate": 0.00014594630512662903, + "loss": 0.0, + "step": 33687 + }, + { + "epoch": 3.143417001026407, + "grad_norm": NaN, + "learning_rate": 0.00014593874410988493, + "loss": 0.0, + "step": 33688 + }, + { + "epoch": 3.143510310721284, + "grad_norm": NaN, + "learning_rate": 0.0001459311831034673, + "loss": 0.0, + "step": 33689 + }, + { + "epoch": 3.1436036204161613, + "grad_norm": NaN, + "learning_rate": 0.00014592362210739558, + "loss": 0.0, + "step": 33690 + }, + { + "epoch": 3.1436969301110387, + "grad_norm": NaN, + "learning_rate": 0.00014591606112168885, + "loss": 0.0, + "step": 33691 + }, + { + "epoch": 3.1437902398059157, + "grad_norm": NaN, + "learning_rate": 0.0001459085001463663, + "loss": 0.0, + "step": 33692 + }, + { + "epoch": 3.143883549500793, + "grad_norm": NaN, + "learning_rate": 0.00014590093918144732, + "loss": 0.0, + "step": 33693 + }, + { + "epoch": 3.1439768591956705, + "grad_norm": NaN, + "learning_rate": 0.000145893378226951, + "loss": 0.0, + "step": 33694 + }, + { + "epoch": 3.144070168890548, + "grad_norm": NaN, + "learning_rate": 0.00014588581728289655, + "loss": 0.0, + "step": 33695 + }, + { + "epoch": 3.144163478585425, + "grad_norm": NaN, + "learning_rate": 0.00014587825634930336, + "loss": 0.0, + "step": 33696 + }, + { + "epoch": 3.1442567882803023, + "grad_norm": NaN, + "learning_rate": 0.00014587069542619047, + "loss": 0.0, + "step": 33697 + }, + { + "epoch": 3.1443500979751797, + "grad_norm": NaN, + "learning_rate": 0.00014586313451357715, + "loss": 0.0, + "step": 33698 + }, + { + "epoch": 3.1444434076700567, + "grad_norm": NaN, + "learning_rate": 0.0001458555736114827, + "loss": 0.0, + "step": 33699 + }, + { + "epoch": 3.144536717364934, + "grad_norm": NaN, + "learning_rate": 0.00014584801271992629, + "loss": 0.0, + "step": 33700 + }, + { + "epoch": 3.1446300270598115, + "grad_norm": NaN, + "learning_rate": 0.0001458404518389271, + "loss": 0.0, + "step": 33701 + }, + { + "epoch": 3.144723336754689, + "grad_norm": NaN, + "learning_rate": 0.00014583289096850452, + "loss": 0.0, + "step": 33702 + }, + { + "epoch": 3.144816646449566, + "grad_norm": NaN, + "learning_rate": 0.0001458253301086776, + "loss": 0.0, + "step": 33703 + }, + { + "epoch": 3.1449099561444434, + "grad_norm": NaN, + "learning_rate": 0.0001458177692594656, + "loss": 0.0, + "step": 33704 + }, + { + "epoch": 3.1450032658393208, + "grad_norm": NaN, + "learning_rate": 0.00014581020842088785, + "loss": 0.0, + "step": 33705 + }, + { + "epoch": 3.145096575534198, + "grad_norm": NaN, + "learning_rate": 0.00014580264759296346, + "loss": 0.0, + "step": 33706 + }, + { + "epoch": 3.145189885229075, + "grad_norm": NaN, + "learning_rate": 0.00014579508677571164, + "loss": 0.0, + "step": 33707 + }, + { + "epoch": 3.1452831949239526, + "grad_norm": NaN, + "learning_rate": 0.00014578752596915176, + "loss": 0.0, + "step": 33708 + }, + { + "epoch": 3.14537650461883, + "grad_norm": NaN, + "learning_rate": 0.00014577996517330287, + "loss": 0.0, + "step": 33709 + }, + { + "epoch": 3.1454698143137074, + "grad_norm": NaN, + "learning_rate": 0.00014577240438818433, + "loss": 0.0, + "step": 33710 + }, + { + "epoch": 3.1455631240085844, + "grad_norm": NaN, + "learning_rate": 0.00014576484361381533, + "loss": 0.0, + "step": 33711 + }, + { + "epoch": 3.145656433703462, + "grad_norm": NaN, + "learning_rate": 0.00014575728285021503, + "loss": 0.0, + "step": 33712 + }, + { + "epoch": 3.1457497433983392, + "grad_norm": NaN, + "learning_rate": 0.00014574972209740276, + "loss": 0.0, + "step": 33713 + }, + { + "epoch": 3.145843053093216, + "grad_norm": NaN, + "learning_rate": 0.00014574216135539766, + "loss": 0.0, + "step": 33714 + }, + { + "epoch": 3.1459363627880936, + "grad_norm": NaN, + "learning_rate": 0.00014573460062421894, + "loss": 0.0, + "step": 33715 + }, + { + "epoch": 3.146029672482971, + "grad_norm": NaN, + "learning_rate": 0.00014572703990388592, + "loss": 0.0, + "step": 33716 + }, + { + "epoch": 3.1461229821778485, + "grad_norm": NaN, + "learning_rate": 0.00014571947919441775, + "loss": 0.0, + "step": 33717 + }, + { + "epoch": 3.1462162918727254, + "grad_norm": NaN, + "learning_rate": 0.00014571191849583365, + "loss": 0.0, + "step": 33718 + }, + { + "epoch": 3.146309601567603, + "grad_norm": NaN, + "learning_rate": 0.00014570435780815295, + "loss": 0.0, + "step": 33719 + }, + { + "epoch": 3.1464029112624803, + "grad_norm": NaN, + "learning_rate": 0.00014569679713139476, + "loss": 0.0, + "step": 33720 + }, + { + "epoch": 3.1464962209573573, + "grad_norm": NaN, + "learning_rate": 0.00014568923646557827, + "loss": 0.0, + "step": 33721 + }, + { + "epoch": 3.1465895306522347, + "grad_norm": NaN, + "learning_rate": 0.00014568167581072286, + "loss": 0.0, + "step": 33722 + }, + { + "epoch": 3.146682840347112, + "grad_norm": NaN, + "learning_rate": 0.00014567411516684764, + "loss": 0.0, + "step": 33723 + }, + { + "epoch": 3.1467761500419895, + "grad_norm": NaN, + "learning_rate": 0.00014566655453397185, + "loss": 0.0, + "step": 33724 + }, + { + "epoch": 3.1468694597368665, + "grad_norm": NaN, + "learning_rate": 0.00014565899391211475, + "loss": 0.0, + "step": 33725 + }, + { + "epoch": 3.146962769431744, + "grad_norm": NaN, + "learning_rate": 0.00014565143330129554, + "loss": 0.0, + "step": 33726 + }, + { + "epoch": 3.1470560791266213, + "grad_norm": NaN, + "learning_rate": 0.00014564387270153342, + "loss": 0.0, + "step": 33727 + }, + { + "epoch": 3.1471493888214987, + "grad_norm": NaN, + "learning_rate": 0.0001456363121128477, + "loss": 0.0, + "step": 33728 + }, + { + "epoch": 3.1472426985163757, + "grad_norm": NaN, + "learning_rate": 0.00014562875153525752, + "loss": 0.0, + "step": 33729 + }, + { + "epoch": 3.147336008211253, + "grad_norm": NaN, + "learning_rate": 0.00014562119096878207, + "loss": 0.0, + "step": 33730 + }, + { + "epoch": 3.1474293179061306, + "grad_norm": NaN, + "learning_rate": 0.00014561363041344072, + "loss": 0.0, + "step": 33731 + }, + { + "epoch": 3.1475226276010075, + "grad_norm": NaN, + "learning_rate": 0.00014560606986925258, + "loss": 0.0, + "step": 33732 + }, + { + "epoch": 3.147615937295885, + "grad_norm": NaN, + "learning_rate": 0.00014559850933623685, + "loss": 0.0, + "step": 33733 + }, + { + "epoch": 3.1477092469907624, + "grad_norm": NaN, + "learning_rate": 0.00014559094881441288, + "loss": 0.0, + "step": 33734 + }, + { + "epoch": 3.14780255668564, + "grad_norm": NaN, + "learning_rate": 0.0001455833883037998, + "loss": 0.0, + "step": 33735 + }, + { + "epoch": 3.1478958663805168, + "grad_norm": NaN, + "learning_rate": 0.0001455758278044168, + "loss": 0.0, + "step": 33736 + }, + { + "epoch": 3.147989176075394, + "grad_norm": NaN, + "learning_rate": 0.00014556826731628328, + "loss": 0.0, + "step": 33737 + }, + { + "epoch": 3.1480824857702716, + "grad_norm": NaN, + "learning_rate": 0.00014556070683941825, + "loss": 0.0, + "step": 33738 + }, + { + "epoch": 3.148175795465149, + "grad_norm": NaN, + "learning_rate": 0.00014555314637384103, + "loss": 0.0, + "step": 33739 + }, + { + "epoch": 3.148269105160026, + "grad_norm": NaN, + "learning_rate": 0.00014554558591957088, + "loss": 0.0, + "step": 33740 + }, + { + "epoch": 3.1483624148549034, + "grad_norm": NaN, + "learning_rate": 0.00014553802547662698, + "loss": 0.0, + "step": 33741 + }, + { + "epoch": 3.148455724549781, + "grad_norm": NaN, + "learning_rate": 0.00014553046504502853, + "loss": 0.0, + "step": 33742 + }, + { + "epoch": 3.148549034244658, + "grad_norm": NaN, + "learning_rate": 0.0001455229046247948, + "loss": 0.0, + "step": 33743 + }, + { + "epoch": 3.1486423439395352, + "grad_norm": NaN, + "learning_rate": 0.000145515344215945, + "loss": 0.0, + "step": 33744 + }, + { + "epoch": 3.1487356536344127, + "grad_norm": NaN, + "learning_rate": 0.00014550778381849832, + "loss": 0.0, + "step": 33745 + }, + { + "epoch": 3.14882896332929, + "grad_norm": NaN, + "learning_rate": 0.00014550022343247406, + "loss": 0.0, + "step": 33746 + }, + { + "epoch": 3.148922273024167, + "grad_norm": NaN, + "learning_rate": 0.00014549266305789138, + "loss": 0.0, + "step": 33747 + }, + { + "epoch": 3.1490155827190445, + "grad_norm": NaN, + "learning_rate": 0.0001454851026947695, + "loss": 0.0, + "step": 33748 + }, + { + "epoch": 3.149108892413922, + "grad_norm": NaN, + "learning_rate": 0.00014547754234312774, + "loss": 0.0, + "step": 33749 + }, + { + "epoch": 3.1492022021087993, + "grad_norm": NaN, + "learning_rate": 0.00014546998200298515, + "loss": 0.0, + "step": 33750 + }, + { + "epoch": 3.1492955118036763, + "grad_norm": NaN, + "learning_rate": 0.0001454624216743611, + "loss": 0.0, + "step": 33751 + }, + { + "epoch": 3.1493888214985537, + "grad_norm": NaN, + "learning_rate": 0.00014545486135727483, + "loss": 0.0, + "step": 33752 + }, + { + "epoch": 3.149482131193431, + "grad_norm": NaN, + "learning_rate": 0.0001454473010517454, + "loss": 0.0, + "step": 33753 + }, + { + "epoch": 3.149575440888308, + "grad_norm": NaN, + "learning_rate": 0.00014543974075779218, + "loss": 0.0, + "step": 33754 + }, + { + "epoch": 3.1496687505831855, + "grad_norm": NaN, + "learning_rate": 0.00014543218047543437, + "loss": 0.0, + "step": 33755 + }, + { + "epoch": 3.149762060278063, + "grad_norm": NaN, + "learning_rate": 0.00014542462020469113, + "loss": 0.0, + "step": 33756 + }, + { + "epoch": 3.1498553699729404, + "grad_norm": NaN, + "learning_rate": 0.00014541705994558176, + "loss": 0.0, + "step": 33757 + }, + { + "epoch": 3.1499486796678173, + "grad_norm": NaN, + "learning_rate": 0.00014540949969812545, + "loss": 0.0, + "step": 33758 + }, + { + "epoch": 3.1500419893626947, + "grad_norm": NaN, + "learning_rate": 0.00014540193946234136, + "loss": 0.0, + "step": 33759 + }, + { + "epoch": 3.150135299057572, + "grad_norm": NaN, + "learning_rate": 0.00014539437923824885, + "loss": 0.0, + "step": 33760 + }, + { + "epoch": 3.1502286087524496, + "grad_norm": NaN, + "learning_rate": 0.00014538681902586705, + "loss": 0.0, + "step": 33761 + }, + { + "epoch": 3.1503219184473266, + "grad_norm": NaN, + "learning_rate": 0.00014537925882521517, + "loss": 0.0, + "step": 33762 + }, + { + "epoch": 3.150415228142204, + "grad_norm": NaN, + "learning_rate": 0.00014537169863631253, + "loss": 0.0, + "step": 33763 + }, + { + "epoch": 3.1505085378370814, + "grad_norm": NaN, + "learning_rate": 0.00014536413845917822, + "loss": 0.0, + "step": 33764 + }, + { + "epoch": 3.1506018475319584, + "grad_norm": NaN, + "learning_rate": 0.00014535657829383154, + "loss": 0.0, + "step": 33765 + }, + { + "epoch": 3.150695157226836, + "grad_norm": NaN, + "learning_rate": 0.00014534901814029175, + "loss": 0.0, + "step": 33766 + }, + { + "epoch": 3.150788466921713, + "grad_norm": NaN, + "learning_rate": 0.00014534145799857803, + "loss": 0.0, + "step": 33767 + }, + { + "epoch": 3.1508817766165906, + "grad_norm": NaN, + "learning_rate": 0.00014533389786870954, + "loss": 0.0, + "step": 33768 + }, + { + "epoch": 3.1509750863114676, + "grad_norm": NaN, + "learning_rate": 0.00014532633775070564, + "loss": 0.0, + "step": 33769 + }, + { + "epoch": 3.151068396006345, + "grad_norm": NaN, + "learning_rate": 0.00014531877764458544, + "loss": 0.0, + "step": 33770 + }, + { + "epoch": 3.1511617057012224, + "grad_norm": NaN, + "learning_rate": 0.00014531121755036817, + "loss": 0.0, + "step": 33771 + }, + { + "epoch": 3.1512550153960994, + "grad_norm": NaN, + "learning_rate": 0.00014530365746807315, + "loss": 0.0, + "step": 33772 + }, + { + "epoch": 3.151348325090977, + "grad_norm": NaN, + "learning_rate": 0.0001452960973977195, + "loss": 0.0, + "step": 33773 + }, + { + "epoch": 3.1514416347858543, + "grad_norm": NaN, + "learning_rate": 0.00014528853733932647, + "loss": 0.0, + "step": 33774 + }, + { + "epoch": 3.1515349444807317, + "grad_norm": NaN, + "learning_rate": 0.00014528097729291334, + "loss": 0.0, + "step": 33775 + }, + { + "epoch": 3.1516282541756087, + "grad_norm": NaN, + "learning_rate": 0.00014527341725849925, + "loss": 0.0, + "step": 33776 + }, + { + "epoch": 3.151721563870486, + "grad_norm": NaN, + "learning_rate": 0.00014526585723610344, + "loss": 0.0, + "step": 33777 + }, + { + "epoch": 3.1518148735653635, + "grad_norm": NaN, + "learning_rate": 0.0001452582972257452, + "loss": 0.0, + "step": 33778 + }, + { + "epoch": 3.151908183260241, + "grad_norm": NaN, + "learning_rate": 0.0001452507372274437, + "loss": 0.0, + "step": 33779 + }, + { + "epoch": 3.152001492955118, + "grad_norm": NaN, + "learning_rate": 0.00014524317724121813, + "loss": 0.0, + "step": 33780 + }, + { + "epoch": 3.1520948026499953, + "grad_norm": NaN, + "learning_rate": 0.0001452356172670878, + "loss": 0.0, + "step": 33781 + }, + { + "epoch": 3.1521881123448727, + "grad_norm": NaN, + "learning_rate": 0.00014522805730507186, + "loss": 0.0, + "step": 33782 + }, + { + "epoch": 3.15228142203975, + "grad_norm": NaN, + "learning_rate": 0.00014522049735518953, + "loss": 0.0, + "step": 33783 + }, + { + "epoch": 3.152374731734627, + "grad_norm": NaN, + "learning_rate": 0.00014521293741746012, + "loss": 0.0, + "step": 33784 + }, + { + "epoch": 3.1524680414295045, + "grad_norm": NaN, + "learning_rate": 0.00014520537749190273, + "loss": 0.0, + "step": 33785 + }, + { + "epoch": 3.152561351124382, + "grad_norm": NaN, + "learning_rate": 0.00014519781757853668, + "loss": 0.0, + "step": 33786 + }, + { + "epoch": 3.152654660819259, + "grad_norm": NaN, + "learning_rate": 0.00014519025767738118, + "loss": 0.0, + "step": 33787 + }, + { + "epoch": 3.1527479705141364, + "grad_norm": NaN, + "learning_rate": 0.00014518269778845537, + "loss": 0.0, + "step": 33788 + }, + { + "epoch": 3.1528412802090138, + "grad_norm": NaN, + "learning_rate": 0.00014517513791177856, + "loss": 0.0, + "step": 33789 + }, + { + "epoch": 3.152934589903891, + "grad_norm": NaN, + "learning_rate": 0.00014516757804736998, + "loss": 0.0, + "step": 33790 + }, + { + "epoch": 3.153027899598768, + "grad_norm": NaN, + "learning_rate": 0.00014516001819524877, + "loss": 0.0, + "step": 33791 + }, + { + "epoch": 3.1531212092936456, + "grad_norm": NaN, + "learning_rate": 0.0001451524583554342, + "loss": 0.0, + "step": 33792 + }, + { + "epoch": 3.153214518988523, + "grad_norm": NaN, + "learning_rate": 0.00014514489852794553, + "loss": 0.0, + "step": 33793 + }, + { + "epoch": 3.1533078286834, + "grad_norm": NaN, + "learning_rate": 0.0001451373387128019, + "loss": 0.0, + "step": 33794 + }, + { + "epoch": 3.1534011383782774, + "grad_norm": NaN, + "learning_rate": 0.0001451297789100226, + "loss": 0.0, + "step": 33795 + }, + { + "epoch": 3.153494448073155, + "grad_norm": NaN, + "learning_rate": 0.00014512221911962687, + "loss": 0.0, + "step": 33796 + }, + { + "epoch": 3.1535877577680322, + "grad_norm": NaN, + "learning_rate": 0.00014511465934163381, + "loss": 0.0, + "step": 33797 + }, + { + "epoch": 3.153681067462909, + "grad_norm": NaN, + "learning_rate": 0.00014510709957606276, + "loss": 0.0, + "step": 33798 + }, + { + "epoch": 3.1537743771577866, + "grad_norm": NaN, + "learning_rate": 0.00014509953982293295, + "loss": 0.0, + "step": 33799 + }, + { + "epoch": 3.153867686852664, + "grad_norm": NaN, + "learning_rate": 0.0001450919800822635, + "loss": 0.0, + "step": 33800 + }, + { + "epoch": 3.1539609965475415, + "grad_norm": NaN, + "learning_rate": 0.0001450844203540737, + "loss": 0.0, + "step": 33801 + }, + { + "epoch": 3.1540543062424184, + "grad_norm": NaN, + "learning_rate": 0.0001450768606383828, + "loss": 0.0, + "step": 33802 + }, + { + "epoch": 3.154147615937296, + "grad_norm": NaN, + "learning_rate": 0.00014506930093520994, + "loss": 0.0, + "step": 33803 + }, + { + "epoch": 3.1542409256321733, + "grad_norm": NaN, + "learning_rate": 0.00014506174124457444, + "loss": 0.0, + "step": 33804 + }, + { + "epoch": 3.1543342353270507, + "grad_norm": NaN, + "learning_rate": 0.00014505418156649544, + "loss": 0.0, + "step": 33805 + }, + { + "epoch": 3.1544275450219277, + "grad_norm": NaN, + "learning_rate": 0.00014504662190099215, + "loss": 0.0, + "step": 33806 + }, + { + "epoch": 3.154520854716805, + "grad_norm": NaN, + "learning_rate": 0.0001450390622480839, + "loss": 0.0, + "step": 33807 + }, + { + "epoch": 3.1546141644116825, + "grad_norm": NaN, + "learning_rate": 0.00014503150260778983, + "loss": 0.0, + "step": 33808 + }, + { + "epoch": 3.1547074741065595, + "grad_norm": NaN, + "learning_rate": 0.00014502394298012913, + "loss": 0.0, + "step": 33809 + }, + { + "epoch": 3.154800783801437, + "grad_norm": NaN, + "learning_rate": 0.00014501638336512115, + "loss": 0.0, + "step": 33810 + }, + { + "epoch": 3.1548940934963143, + "grad_norm": NaN, + "learning_rate": 0.00014500882376278497, + "loss": 0.0, + "step": 33811 + }, + { + "epoch": 3.1549874031911918, + "grad_norm": NaN, + "learning_rate": 0.00014500126417313987, + "loss": 0.0, + "step": 33812 + }, + { + "epoch": 3.1550807128860687, + "grad_norm": NaN, + "learning_rate": 0.0001449937045962051, + "loss": 0.0, + "step": 33813 + }, + { + "epoch": 3.155174022580946, + "grad_norm": NaN, + "learning_rate": 0.00014498614503199986, + "loss": 0.0, + "step": 33814 + }, + { + "epoch": 3.1552673322758236, + "grad_norm": NaN, + "learning_rate": 0.0001449785854805433, + "loss": 0.0, + "step": 33815 + }, + { + "epoch": 3.1553606419707005, + "grad_norm": NaN, + "learning_rate": 0.00014497102594185482, + "loss": 0.0, + "step": 33816 + }, + { + "epoch": 3.155453951665578, + "grad_norm": NaN, + "learning_rate": 0.00014496346641595348, + "loss": 0.0, + "step": 33817 + }, + { + "epoch": 3.1555472613604554, + "grad_norm": NaN, + "learning_rate": 0.0001449559069028585, + "loss": 0.0, + "step": 33818 + }, + { + "epoch": 3.155640571055333, + "grad_norm": NaN, + "learning_rate": 0.00014494834740258925, + "loss": 0.0, + "step": 33819 + }, + { + "epoch": 3.1557338807502098, + "grad_norm": NaN, + "learning_rate": 0.00014494078791516479, + "loss": 0.0, + "step": 33820 + }, + { + "epoch": 3.155827190445087, + "grad_norm": NaN, + "learning_rate": 0.0001449332284406044, + "loss": 0.0, + "step": 33821 + }, + { + "epoch": 3.1559205001399646, + "grad_norm": NaN, + "learning_rate": 0.00014492566897892736, + "loss": 0.0, + "step": 33822 + }, + { + "epoch": 3.156013809834842, + "grad_norm": NaN, + "learning_rate": 0.0001449181095301528, + "loss": 0.0, + "step": 33823 + }, + { + "epoch": 3.156107119529719, + "grad_norm": NaN, + "learning_rate": 0.0001449105500943, + "loss": 0.0, + "step": 33824 + }, + { + "epoch": 3.1562004292245964, + "grad_norm": NaN, + "learning_rate": 0.0001449029906713882, + "loss": 0.0, + "step": 33825 + }, + { + "epoch": 3.156293738919474, + "grad_norm": NaN, + "learning_rate": 0.0001448954312614365, + "loss": 0.0, + "step": 33826 + }, + { + "epoch": 3.1563870486143513, + "grad_norm": NaN, + "learning_rate": 0.00014488787186446426, + "loss": 0.0, + "step": 33827 + }, + { + "epoch": 3.1564803583092282, + "grad_norm": NaN, + "learning_rate": 0.00014488031248049066, + "loss": 0.0, + "step": 33828 + }, + { + "epoch": 3.1565736680041057, + "grad_norm": NaN, + "learning_rate": 0.00014487275310953488, + "loss": 0.0, + "step": 33829 + }, + { + "epoch": 3.156666977698983, + "grad_norm": NaN, + "learning_rate": 0.00014486519375161617, + "loss": 0.0, + "step": 33830 + }, + { + "epoch": 3.15676028739386, + "grad_norm": NaN, + "learning_rate": 0.0001448576344067538, + "loss": 0.0, + "step": 33831 + }, + { + "epoch": 3.1568535970887375, + "grad_norm": NaN, + "learning_rate": 0.00014485007507496686, + "loss": 0.0, + "step": 33832 + }, + { + "epoch": 3.156946906783615, + "grad_norm": NaN, + "learning_rate": 0.0001448425157562747, + "loss": 0.0, + "step": 33833 + }, + { + "epoch": 3.1570402164784923, + "grad_norm": NaN, + "learning_rate": 0.00014483495645069652, + "loss": 0.0, + "step": 33834 + }, + { + "epoch": 3.1571335261733693, + "grad_norm": NaN, + "learning_rate": 0.00014482739715825145, + "loss": 0.0, + "step": 33835 + }, + { + "epoch": 3.1572268358682467, + "grad_norm": NaN, + "learning_rate": 0.0001448198378789588, + "loss": 0.0, + "step": 33836 + }, + { + "epoch": 3.157320145563124, + "grad_norm": NaN, + "learning_rate": 0.00014481227861283785, + "loss": 0.0, + "step": 33837 + }, + { + "epoch": 3.157413455258001, + "grad_norm": NaN, + "learning_rate": 0.00014480471935990764, + "loss": 0.0, + "step": 33838 + }, + { + "epoch": 3.1575067649528785, + "grad_norm": NaN, + "learning_rate": 0.0001447971601201875, + "loss": 0.0, + "step": 33839 + }, + { + "epoch": 3.157600074647756, + "grad_norm": NaN, + "learning_rate": 0.0001447896008936967, + "loss": 0.0, + "step": 33840 + }, + { + "epoch": 3.1576933843426334, + "grad_norm": NaN, + "learning_rate": 0.00014478204168045432, + "loss": 0.0, + "step": 33841 + }, + { + "epoch": 3.1577866940375103, + "grad_norm": NaN, + "learning_rate": 0.00014477448248047972, + "loss": 0.0, + "step": 33842 + }, + { + "epoch": 3.1578800037323878, + "grad_norm": NaN, + "learning_rate": 0.00014476692329379206, + "loss": 0.0, + "step": 33843 + }, + { + "epoch": 3.157973313427265, + "grad_norm": NaN, + "learning_rate": 0.00014475936412041053, + "loss": 0.0, + "step": 33844 + }, + { + "epoch": 3.1580666231221426, + "grad_norm": NaN, + "learning_rate": 0.0001447518049603544, + "loss": 0.0, + "step": 33845 + }, + { + "epoch": 3.1581599328170196, + "grad_norm": NaN, + "learning_rate": 0.00014474424581364294, + "loss": 0.0, + "step": 33846 + }, + { + "epoch": 3.158253242511897, + "grad_norm": NaN, + "learning_rate": 0.0001447366866802952, + "loss": 0.0, + "step": 33847 + }, + { + "epoch": 3.1583465522067744, + "grad_norm": NaN, + "learning_rate": 0.0001447291275603306, + "loss": 0.0, + "step": 33848 + }, + { + "epoch": 3.1584398619016514, + "grad_norm": NaN, + "learning_rate": 0.00014472156845376821, + "loss": 0.0, + "step": 33849 + }, + { + "epoch": 3.158533171596529, + "grad_norm": NaN, + "learning_rate": 0.0001447140093606273, + "loss": 0.0, + "step": 33850 + }, + { + "epoch": 3.158626481291406, + "grad_norm": NaN, + "learning_rate": 0.00014470645028092713, + "loss": 0.0, + "step": 33851 + }, + { + "epoch": 3.1587197909862836, + "grad_norm": NaN, + "learning_rate": 0.0001446988912146869, + "loss": 0.0, + "step": 33852 + }, + { + "epoch": 3.1588131006811606, + "grad_norm": NaN, + "learning_rate": 0.00014469133216192575, + "loss": 0.0, + "step": 33853 + }, + { + "epoch": 3.158906410376038, + "grad_norm": NaN, + "learning_rate": 0.00014468377312266305, + "loss": 0.0, + "step": 33854 + }, + { + "epoch": 3.1589997200709155, + "grad_norm": NaN, + "learning_rate": 0.0001446762140969179, + "loss": 0.0, + "step": 33855 + }, + { + "epoch": 3.159093029765793, + "grad_norm": NaN, + "learning_rate": 0.00014466865508470955, + "loss": 0.0, + "step": 33856 + }, + { + "epoch": 3.15918633946067, + "grad_norm": NaN, + "learning_rate": 0.00014466109608605727, + "loss": 0.0, + "step": 33857 + }, + { + "epoch": 3.1592796491555473, + "grad_norm": NaN, + "learning_rate": 0.00014465353710098019, + "loss": 0.0, + "step": 33858 + }, + { + "epoch": 3.1593729588504247, + "grad_norm": NaN, + "learning_rate": 0.0001446459781294976, + "loss": 0.0, + "step": 33859 + }, + { + "epoch": 3.1594662685453017, + "grad_norm": NaN, + "learning_rate": 0.00014463841917162874, + "loss": 0.0, + "step": 33860 + }, + { + "epoch": 3.159559578240179, + "grad_norm": NaN, + "learning_rate": 0.00014463086022739274, + "loss": 0.0, + "step": 33861 + }, + { + "epoch": 3.1596528879350565, + "grad_norm": NaN, + "learning_rate": 0.0001446233012968089, + "loss": 0.0, + "step": 33862 + }, + { + "epoch": 3.159746197629934, + "grad_norm": NaN, + "learning_rate": 0.00014461574237989645, + "loss": 0.0, + "step": 33863 + }, + { + "epoch": 3.159839507324811, + "grad_norm": NaN, + "learning_rate": 0.00014460818347667448, + "loss": 0.0, + "step": 33864 + }, + { + "epoch": 3.1599328170196883, + "grad_norm": NaN, + "learning_rate": 0.00014460062458716236, + "loss": 0.0, + "step": 33865 + }, + { + "epoch": 3.1600261267145657, + "grad_norm": NaN, + "learning_rate": 0.00014459306571137928, + "loss": 0.0, + "step": 33866 + }, + { + "epoch": 3.1601194364094427, + "grad_norm": NaN, + "learning_rate": 0.00014458550684934437, + "loss": 0.0, + "step": 33867 + }, + { + "epoch": 3.16021274610432, + "grad_norm": NaN, + "learning_rate": 0.00014457794800107693, + "loss": 0.0, + "step": 33868 + }, + { + "epoch": 3.1603060557991975, + "grad_norm": NaN, + "learning_rate": 0.0001445703891665962, + "loss": 0.0, + "step": 33869 + }, + { + "epoch": 3.160399365494075, + "grad_norm": NaN, + "learning_rate": 0.0001445628303459213, + "loss": 0.0, + "step": 33870 + }, + { + "epoch": 3.160492675188952, + "grad_norm": NaN, + "learning_rate": 0.00014455527153907154, + "loss": 0.0, + "step": 33871 + }, + { + "epoch": 3.1605859848838294, + "grad_norm": NaN, + "learning_rate": 0.00014454771274606616, + "loss": 0.0, + "step": 33872 + }, + { + "epoch": 3.1606792945787068, + "grad_norm": NaN, + "learning_rate": 0.00014454015396692425, + "loss": 0.0, + "step": 33873 + }, + { + "epoch": 3.160772604273584, + "grad_norm": NaN, + "learning_rate": 0.00014453259520166517, + "loss": 0.0, + "step": 33874 + }, + { + "epoch": 3.160865913968461, + "grad_norm": NaN, + "learning_rate": 0.00014452503645030808, + "loss": 0.0, + "step": 33875 + }, + { + "epoch": 3.1609592236633386, + "grad_norm": NaN, + "learning_rate": 0.00014451747771287215, + "loss": 0.0, + "step": 33876 + }, + { + "epoch": 3.161052533358216, + "grad_norm": NaN, + "learning_rate": 0.00014450991898937667, + "loss": 0.0, + "step": 33877 + }, + { + "epoch": 3.1611458430530934, + "grad_norm": NaN, + "learning_rate": 0.0001445023602798409, + "loss": 0.0, + "step": 33878 + }, + { + "epoch": 3.1612391527479704, + "grad_norm": NaN, + "learning_rate": 0.0001444948015842839, + "loss": 0.0, + "step": 33879 + }, + { + "epoch": 3.161332462442848, + "grad_norm": NaN, + "learning_rate": 0.00014448724290272503, + "loss": 0.0, + "step": 33880 + }, + { + "epoch": 3.1614257721377252, + "grad_norm": NaN, + "learning_rate": 0.00014447968423518352, + "loss": 0.0, + "step": 33881 + }, + { + "epoch": 3.161519081832602, + "grad_norm": NaN, + "learning_rate": 0.00014447212558167847, + "loss": 0.0, + "step": 33882 + }, + { + "epoch": 3.1616123915274796, + "grad_norm": NaN, + "learning_rate": 0.00014446456694222918, + "loss": 0.0, + "step": 33883 + }, + { + "epoch": 3.161705701222357, + "grad_norm": NaN, + "learning_rate": 0.0001444570083168549, + "loss": 0.0, + "step": 33884 + }, + { + "epoch": 3.1617990109172345, + "grad_norm": NaN, + "learning_rate": 0.00014444944970557473, + "loss": 0.0, + "step": 33885 + }, + { + "epoch": 3.1618923206121115, + "grad_norm": NaN, + "learning_rate": 0.00014444189110840802, + "loss": 0.0, + "step": 33886 + }, + { + "epoch": 3.161985630306989, + "grad_norm": NaN, + "learning_rate": 0.00014443433252537397, + "loss": 0.0, + "step": 33887 + }, + { + "epoch": 3.1620789400018663, + "grad_norm": NaN, + "learning_rate": 0.00014442677395649166, + "loss": 0.0, + "step": 33888 + }, + { + "epoch": 3.1621722496967433, + "grad_norm": NaN, + "learning_rate": 0.0001444192154017805, + "loss": 0.0, + "step": 33889 + }, + { + "epoch": 3.1622655593916207, + "grad_norm": NaN, + "learning_rate": 0.0001444116568612596, + "loss": 0.0, + "step": 33890 + }, + { + "epoch": 3.162358869086498, + "grad_norm": NaN, + "learning_rate": 0.00014440409833494817, + "loss": 0.0, + "step": 33891 + }, + { + "epoch": 3.1624521787813755, + "grad_norm": NaN, + "learning_rate": 0.00014439653982286553, + "loss": 0.0, + "step": 33892 + }, + { + "epoch": 3.1625454884762525, + "grad_norm": NaN, + "learning_rate": 0.00014438898132503077, + "loss": 0.0, + "step": 33893 + }, + { + "epoch": 3.16263879817113, + "grad_norm": NaN, + "learning_rate": 0.00014438142284146316, + "loss": 0.0, + "step": 33894 + }, + { + "epoch": 3.1627321078660073, + "grad_norm": NaN, + "learning_rate": 0.00014437386437218198, + "loss": 0.0, + "step": 33895 + }, + { + "epoch": 3.1628254175608848, + "grad_norm": NaN, + "learning_rate": 0.00014436630591720633, + "loss": 0.0, + "step": 33896 + }, + { + "epoch": 3.1629187272557617, + "grad_norm": NaN, + "learning_rate": 0.00014435874747655553, + "loss": 0.0, + "step": 33897 + }, + { + "epoch": 3.163012036950639, + "grad_norm": NaN, + "learning_rate": 0.0001443511890502488, + "loss": 0.0, + "step": 33898 + }, + { + "epoch": 3.1631053466455166, + "grad_norm": NaN, + "learning_rate": 0.00014434363063830523, + "loss": 0.0, + "step": 33899 + }, + { + "epoch": 3.163198656340394, + "grad_norm": NaN, + "learning_rate": 0.0001443360722407442, + "loss": 0.0, + "step": 33900 + }, + { + "epoch": 3.163291966035271, + "grad_norm": NaN, + "learning_rate": 0.0001443285138575849, + "loss": 0.0, + "step": 33901 + }, + { + "epoch": 3.1633852757301484, + "grad_norm": NaN, + "learning_rate": 0.00014432095548884644, + "loss": 0.0, + "step": 33902 + }, + { + "epoch": 3.163478585425026, + "grad_norm": NaN, + "learning_rate": 0.0001443133971345481, + "loss": 0.0, + "step": 33903 + }, + { + "epoch": 3.1635718951199028, + "grad_norm": NaN, + "learning_rate": 0.00014430583879470918, + "loss": 0.0, + "step": 33904 + }, + { + "epoch": 3.16366520481478, + "grad_norm": NaN, + "learning_rate": 0.00014429828046934873, + "loss": 0.0, + "step": 33905 + }, + { + "epoch": 3.1637585145096576, + "grad_norm": NaN, + "learning_rate": 0.00014429072215848613, + "loss": 0.0, + "step": 33906 + }, + { + "epoch": 3.163851824204535, + "grad_norm": NaN, + "learning_rate": 0.00014428316386214053, + "loss": 0.0, + "step": 33907 + }, + { + "epoch": 3.163945133899412, + "grad_norm": NaN, + "learning_rate": 0.0001442756055803311, + "loss": 0.0, + "step": 33908 + }, + { + "epoch": 3.1640384435942894, + "grad_norm": NaN, + "learning_rate": 0.00014426804731307717, + "loss": 0.0, + "step": 33909 + }, + { + "epoch": 3.164131753289167, + "grad_norm": NaN, + "learning_rate": 0.0001442604890603979, + "loss": 0.0, + "step": 33910 + }, + { + "epoch": 3.164225062984044, + "grad_norm": NaN, + "learning_rate": 0.00014425293082231242, + "loss": 0.0, + "step": 33911 + }, + { + "epoch": 3.1643183726789212, + "grad_norm": NaN, + "learning_rate": 0.0001442453725988401, + "loss": 0.0, + "step": 33912 + }, + { + "epoch": 3.1644116823737987, + "grad_norm": NaN, + "learning_rate": 0.00014423781439000015, + "loss": 0.0, + "step": 33913 + }, + { + "epoch": 3.164504992068676, + "grad_norm": NaN, + "learning_rate": 0.00014423025619581163, + "loss": 0.0, + "step": 33914 + }, + { + "epoch": 3.164598301763553, + "grad_norm": NaN, + "learning_rate": 0.00014422269801629388, + "loss": 0.0, + "step": 33915 + }, + { + "epoch": 3.1646916114584305, + "grad_norm": NaN, + "learning_rate": 0.00014421513985146616, + "loss": 0.0, + "step": 33916 + }, + { + "epoch": 3.164784921153308, + "grad_norm": NaN, + "learning_rate": 0.00014420758170134753, + "loss": 0.0, + "step": 33917 + }, + { + "epoch": 3.1648782308481853, + "grad_norm": NaN, + "learning_rate": 0.00014420002356595735, + "loss": 0.0, + "step": 33918 + }, + { + "epoch": 3.1649715405430623, + "grad_norm": NaN, + "learning_rate": 0.00014419246544531485, + "loss": 0.0, + "step": 33919 + }, + { + "epoch": 3.1650648502379397, + "grad_norm": NaN, + "learning_rate": 0.0001441849073394391, + "loss": 0.0, + "step": 33920 + }, + { + "epoch": 3.165158159932817, + "grad_norm": NaN, + "learning_rate": 0.00014417734924834942, + "loss": 0.0, + "step": 33921 + }, + { + "epoch": 3.1652514696276945, + "grad_norm": NaN, + "learning_rate": 0.00014416979117206507, + "loss": 0.0, + "step": 33922 + }, + { + "epoch": 3.1653447793225715, + "grad_norm": NaN, + "learning_rate": 0.00014416223311060518, + "loss": 0.0, + "step": 33923 + }, + { + "epoch": 3.165438089017449, + "grad_norm": NaN, + "learning_rate": 0.000144154675063989, + "loss": 0.0, + "step": 33924 + }, + { + "epoch": 3.1655313987123264, + "grad_norm": NaN, + "learning_rate": 0.00014414711703223578, + "loss": 0.0, + "step": 33925 + }, + { + "epoch": 3.1656247084072033, + "grad_norm": NaN, + "learning_rate": 0.00014413955901536465, + "loss": 0.0, + "step": 33926 + }, + { + "epoch": 3.1657180181020808, + "grad_norm": NaN, + "learning_rate": 0.00014413200101339493, + "loss": 0.0, + "step": 33927 + }, + { + "epoch": 3.165811327796958, + "grad_norm": NaN, + "learning_rate": 0.0001441244430263458, + "loss": 0.0, + "step": 33928 + }, + { + "epoch": 3.1659046374918356, + "grad_norm": NaN, + "learning_rate": 0.00014411688505423643, + "loss": 0.0, + "step": 33929 + }, + { + "epoch": 3.1659979471867126, + "grad_norm": NaN, + "learning_rate": 0.00014410932709708608, + "loss": 0.0, + "step": 33930 + }, + { + "epoch": 3.16609125688159, + "grad_norm": NaN, + "learning_rate": 0.00014410176915491404, + "loss": 0.0, + "step": 33931 + }, + { + "epoch": 3.1661845665764674, + "grad_norm": NaN, + "learning_rate": 0.0001440942112277394, + "loss": 0.0, + "step": 33932 + }, + { + "epoch": 3.1662778762713444, + "grad_norm": NaN, + "learning_rate": 0.00014408665331558138, + "loss": 0.0, + "step": 33933 + }, + { + "epoch": 3.166371185966222, + "grad_norm": NaN, + "learning_rate": 0.00014407909541845932, + "loss": 0.0, + "step": 33934 + }, + { + "epoch": 3.166464495661099, + "grad_norm": NaN, + "learning_rate": 0.00014407153753639236, + "loss": 0.0, + "step": 33935 + }, + { + "epoch": 3.1665578053559766, + "grad_norm": NaN, + "learning_rate": 0.0001440639796693997, + "loss": 0.0, + "step": 33936 + }, + { + "epoch": 3.1666511150508536, + "grad_norm": NaN, + "learning_rate": 0.00014405642181750062, + "loss": 0.0, + "step": 33937 + }, + { + "epoch": 3.166744424745731, + "grad_norm": NaN, + "learning_rate": 0.00014404886398071426, + "loss": 0.0, + "step": 33938 + }, + { + "epoch": 3.1668377344406085, + "grad_norm": NaN, + "learning_rate": 0.0001440413061590599, + "loss": 0.0, + "step": 33939 + }, + { + "epoch": 3.166931044135486, + "grad_norm": NaN, + "learning_rate": 0.00014403374835255672, + "loss": 0.0, + "step": 33940 + }, + { + "epoch": 3.167024353830363, + "grad_norm": NaN, + "learning_rate": 0.00014402619056122393, + "loss": 0.0, + "step": 33941 + }, + { + "epoch": 3.1671176635252403, + "grad_norm": NaN, + "learning_rate": 0.00014401863278508083, + "loss": 0.0, + "step": 33942 + }, + { + "epoch": 3.1672109732201177, + "grad_norm": NaN, + "learning_rate": 0.00014401107502414652, + "loss": 0.0, + "step": 33943 + }, + { + "epoch": 3.1673042829149947, + "grad_norm": NaN, + "learning_rate": 0.0001440035172784403, + "loss": 0.0, + "step": 33944 + }, + { + "epoch": 3.167397592609872, + "grad_norm": NaN, + "learning_rate": 0.00014399595954798136, + "loss": 0.0, + "step": 33945 + }, + { + "epoch": 3.1674909023047495, + "grad_norm": NaN, + "learning_rate": 0.00014398840183278886, + "loss": 0.0, + "step": 33946 + }, + { + "epoch": 3.167584211999627, + "grad_norm": NaN, + "learning_rate": 0.00014398084413288212, + "loss": 0.0, + "step": 33947 + }, + { + "epoch": 3.167677521694504, + "grad_norm": NaN, + "learning_rate": 0.00014397328644828036, + "loss": 0.0, + "step": 33948 + }, + { + "epoch": 3.1677708313893813, + "grad_norm": NaN, + "learning_rate": 0.00014396572877900264, + "loss": 0.0, + "step": 33949 + }, + { + "epoch": 3.1678641410842587, + "grad_norm": NaN, + "learning_rate": 0.00014395817112506835, + "loss": 0.0, + "step": 33950 + }, + { + "epoch": 3.167957450779136, + "grad_norm": NaN, + "learning_rate": 0.00014395061348649666, + "loss": 0.0, + "step": 33951 + }, + { + "epoch": 3.168050760474013, + "grad_norm": NaN, + "learning_rate": 0.0001439430558633067, + "loss": 0.0, + "step": 33952 + }, + { + "epoch": 3.1681440701688905, + "grad_norm": NaN, + "learning_rate": 0.0001439354982555178, + "loss": 0.0, + "step": 33953 + }, + { + "epoch": 3.168237379863768, + "grad_norm": NaN, + "learning_rate": 0.00014392794066314915, + "loss": 0.0, + "step": 33954 + }, + { + "epoch": 3.168330689558645, + "grad_norm": NaN, + "learning_rate": 0.00014392038308621988, + "loss": 0.0, + "step": 33955 + }, + { + "epoch": 3.1684239992535224, + "grad_norm": NaN, + "learning_rate": 0.00014391282552474934, + "loss": 0.0, + "step": 33956 + }, + { + "epoch": 3.1685173089484, + "grad_norm": NaN, + "learning_rate": 0.00014390526797875667, + "loss": 0.0, + "step": 33957 + }, + { + "epoch": 3.168610618643277, + "grad_norm": NaN, + "learning_rate": 0.00014389771044826107, + "loss": 0.0, + "step": 33958 + }, + { + "epoch": 3.168703928338154, + "grad_norm": NaN, + "learning_rate": 0.0001438901529332818, + "loss": 0.0, + "step": 33959 + }, + { + "epoch": 3.1687972380330316, + "grad_norm": NaN, + "learning_rate": 0.0001438825954338381, + "loss": 0.0, + "step": 33960 + }, + { + "epoch": 3.168890547727909, + "grad_norm": NaN, + "learning_rate": 0.00014387503794994905, + "loss": 0.0, + "step": 33961 + }, + { + "epoch": 3.1689838574227864, + "grad_norm": NaN, + "learning_rate": 0.00014386748048163405, + "loss": 0.0, + "step": 33962 + }, + { + "epoch": 3.1690771671176634, + "grad_norm": NaN, + "learning_rate": 0.00014385992302891223, + "loss": 0.0, + "step": 33963 + }, + { + "epoch": 3.169170476812541, + "grad_norm": NaN, + "learning_rate": 0.00014385236559180274, + "loss": 0.0, + "step": 33964 + }, + { + "epoch": 3.1692637865074182, + "grad_norm": NaN, + "learning_rate": 0.00014384480817032492, + "loss": 0.0, + "step": 33965 + }, + { + "epoch": 3.169357096202295, + "grad_norm": NaN, + "learning_rate": 0.00014383725076449794, + "loss": 0.0, + "step": 33966 + }, + { + "epoch": 3.1694504058971726, + "grad_norm": NaN, + "learning_rate": 0.00014382969337434096, + "loss": 0.0, + "step": 33967 + }, + { + "epoch": 3.16954371559205, + "grad_norm": NaN, + "learning_rate": 0.00014382213599987322, + "loss": 0.0, + "step": 33968 + }, + { + "epoch": 3.1696370252869275, + "grad_norm": NaN, + "learning_rate": 0.00014381457864111406, + "loss": 0.0, + "step": 33969 + }, + { + "epoch": 3.1697303349818045, + "grad_norm": NaN, + "learning_rate": 0.00014380702129808252, + "loss": 0.0, + "step": 33970 + }, + { + "epoch": 3.169823644676682, + "grad_norm": NaN, + "learning_rate": 0.00014379946397079785, + "loss": 0.0, + "step": 33971 + }, + { + "epoch": 3.1699169543715593, + "grad_norm": NaN, + "learning_rate": 0.0001437919066592794, + "loss": 0.0, + "step": 33972 + }, + { + "epoch": 3.1700102640664367, + "grad_norm": NaN, + "learning_rate": 0.00014378434936354625, + "loss": 0.0, + "step": 33973 + }, + { + "epoch": 3.1701035737613137, + "grad_norm": NaN, + "learning_rate": 0.00014377679208361764, + "loss": 0.0, + "step": 33974 + }, + { + "epoch": 3.170196883456191, + "grad_norm": NaN, + "learning_rate": 0.00014376923481951285, + "loss": 0.0, + "step": 33975 + }, + { + "epoch": 3.1702901931510685, + "grad_norm": NaN, + "learning_rate": 0.00014376167757125104, + "loss": 0.0, + "step": 33976 + }, + { + "epoch": 3.1703835028459455, + "grad_norm": NaN, + "learning_rate": 0.00014375412033885138, + "loss": 0.0, + "step": 33977 + }, + { + "epoch": 3.170476812540823, + "grad_norm": NaN, + "learning_rate": 0.0001437465631223332, + "loss": 0.0, + "step": 33978 + }, + { + "epoch": 3.1705701222357003, + "grad_norm": NaN, + "learning_rate": 0.00014373900592171566, + "loss": 0.0, + "step": 33979 + }, + { + "epoch": 3.1706634319305778, + "grad_norm": NaN, + "learning_rate": 0.00014373144873701792, + "loss": 0.0, + "step": 33980 + }, + { + "epoch": 3.1707567416254547, + "grad_norm": NaN, + "learning_rate": 0.00014372389156825933, + "loss": 0.0, + "step": 33981 + }, + { + "epoch": 3.170850051320332, + "grad_norm": NaN, + "learning_rate": 0.00014371633441545898, + "loss": 0.0, + "step": 33982 + }, + { + "epoch": 3.1709433610152096, + "grad_norm": NaN, + "learning_rate": 0.00014370877727863616, + "loss": 0.0, + "step": 33983 + }, + { + "epoch": 3.1710366707100865, + "grad_norm": NaN, + "learning_rate": 0.00014370122015781, + "loss": 0.0, + "step": 33984 + }, + { + "epoch": 3.171129980404964, + "grad_norm": NaN, + "learning_rate": 0.00014369366305299976, + "loss": 0.0, + "step": 33985 + }, + { + "epoch": 3.1712232900998414, + "grad_norm": NaN, + "learning_rate": 0.00014368610596422475, + "loss": 0.0, + "step": 33986 + }, + { + "epoch": 3.171316599794719, + "grad_norm": NaN, + "learning_rate": 0.00014367854889150403, + "loss": 0.0, + "step": 33987 + }, + { + "epoch": 3.171409909489596, + "grad_norm": NaN, + "learning_rate": 0.0001436709918348569, + "loss": 0.0, + "step": 33988 + }, + { + "epoch": 3.171503219184473, + "grad_norm": NaN, + "learning_rate": 0.00014366343479430262, + "loss": 0.0, + "step": 33989 + }, + { + "epoch": 3.1715965288793506, + "grad_norm": NaN, + "learning_rate": 0.00014365587776986026, + "loss": 0.0, + "step": 33990 + }, + { + "epoch": 3.171689838574228, + "grad_norm": NaN, + "learning_rate": 0.00014364832076154918, + "loss": 0.0, + "step": 33991 + }, + { + "epoch": 3.171783148269105, + "grad_norm": NaN, + "learning_rate": 0.00014364076376938855, + "loss": 0.0, + "step": 33992 + }, + { + "epoch": 3.1718764579639824, + "grad_norm": NaN, + "learning_rate": 0.00014363320679339754, + "loss": 0.0, + "step": 33993 + }, + { + "epoch": 3.17196976765886, + "grad_norm": NaN, + "learning_rate": 0.0001436256498335954, + "loss": 0.0, + "step": 33994 + }, + { + "epoch": 3.1720630773537373, + "grad_norm": NaN, + "learning_rate": 0.0001436180928900014, + "loss": 0.0, + "step": 33995 + }, + { + "epoch": 3.1721563870486142, + "grad_norm": NaN, + "learning_rate": 0.0001436105359626346, + "loss": 0.0, + "step": 33996 + }, + { + "epoch": 3.1722496967434917, + "grad_norm": NaN, + "learning_rate": 0.00014360297905151436, + "loss": 0.0, + "step": 33997 + }, + { + "epoch": 3.172343006438369, + "grad_norm": NaN, + "learning_rate": 0.0001435954221566599, + "loss": 0.0, + "step": 33998 + }, + { + "epoch": 3.172436316133246, + "grad_norm": NaN, + "learning_rate": 0.00014358786527809031, + "loss": 0.0, + "step": 33999 + }, + { + "epoch": 3.1725296258281235, + "grad_norm": NaN, + "learning_rate": 0.00014358030841582492, + "loss": 0.0, + "step": 34000 + }, + { + "epoch": 3.172622935523001, + "grad_norm": NaN, + "learning_rate": 0.00014357275156988292, + "loss": 0.0, + "step": 34001 + }, + { + "epoch": 3.1727162452178783, + "grad_norm": NaN, + "learning_rate": 0.00014356519474028345, + "loss": 0.0, + "step": 34002 + }, + { + "epoch": 3.1728095549127553, + "grad_norm": NaN, + "learning_rate": 0.00014355763792704582, + "loss": 0.0, + "step": 34003 + }, + { + "epoch": 3.1729028646076327, + "grad_norm": NaN, + "learning_rate": 0.00014355008113018923, + "loss": 0.0, + "step": 34004 + }, + { + "epoch": 3.17299617430251, + "grad_norm": NaN, + "learning_rate": 0.00014354252434973283, + "loss": 0.0, + "step": 34005 + }, + { + "epoch": 3.173089483997387, + "grad_norm": NaN, + "learning_rate": 0.00014353496758569585, + "loss": 0.0, + "step": 34006 + }, + { + "epoch": 3.1731827936922645, + "grad_norm": NaN, + "learning_rate": 0.00014352741083809766, + "loss": 0.0, + "step": 34007 + }, + { + "epoch": 3.173276103387142, + "grad_norm": NaN, + "learning_rate": 0.00014351985410695722, + "loss": 0.0, + "step": 34008 + }, + { + "epoch": 3.1733694130820194, + "grad_norm": NaN, + "learning_rate": 0.0001435122973922939, + "loss": 0.0, + "step": 34009 + }, + { + "epoch": 3.1734627227768963, + "grad_norm": NaN, + "learning_rate": 0.0001435047406941269, + "loss": 0.0, + "step": 34010 + }, + { + "epoch": 3.1735560324717738, + "grad_norm": NaN, + "learning_rate": 0.00014349718401247545, + "loss": 0.0, + "step": 34011 + }, + { + "epoch": 3.173649342166651, + "grad_norm": NaN, + "learning_rate": 0.00014348962734735867, + "loss": 0.0, + "step": 34012 + }, + { + "epoch": 3.1737426518615286, + "grad_norm": NaN, + "learning_rate": 0.0001434820706987959, + "loss": 0.0, + "step": 34013 + }, + { + "epoch": 3.1738359615564056, + "grad_norm": NaN, + "learning_rate": 0.00014347451406680628, + "loss": 0.0, + "step": 34014 + }, + { + "epoch": 3.173929271251283, + "grad_norm": NaN, + "learning_rate": 0.00014346695745140898, + "loss": 0.0, + "step": 34015 + }, + { + "epoch": 3.1740225809461604, + "grad_norm": NaN, + "learning_rate": 0.00014345940085262336, + "loss": 0.0, + "step": 34016 + }, + { + "epoch": 3.174115890641038, + "grad_norm": NaN, + "learning_rate": 0.0001434518442704685, + "loss": 0.0, + "step": 34017 + }, + { + "epoch": 3.174209200335915, + "grad_norm": NaN, + "learning_rate": 0.00014344428770496364, + "loss": 0.0, + "step": 34018 + }, + { + "epoch": 3.1743025100307922, + "grad_norm": NaN, + "learning_rate": 0.0001434367311561281, + "loss": 0.0, + "step": 34019 + }, + { + "epoch": 3.1743958197256696, + "grad_norm": NaN, + "learning_rate": 0.00014342917462398092, + "loss": 0.0, + "step": 34020 + }, + { + "epoch": 3.1744891294205466, + "grad_norm": NaN, + "learning_rate": 0.0001434216181085414, + "loss": 0.0, + "step": 34021 + }, + { + "epoch": 3.174582439115424, + "grad_norm": NaN, + "learning_rate": 0.00014341406160982883, + "loss": 0.0, + "step": 34022 + }, + { + "epoch": 3.1746757488103015, + "grad_norm": NaN, + "learning_rate": 0.0001434065051278623, + "loss": 0.0, + "step": 34023 + }, + { + "epoch": 3.174769058505179, + "grad_norm": NaN, + "learning_rate": 0.00014339894866266105, + "loss": 0.0, + "step": 34024 + }, + { + "epoch": 3.174862368200056, + "grad_norm": NaN, + "learning_rate": 0.0001433913922142444, + "loss": 0.0, + "step": 34025 + }, + { + "epoch": 3.1749556778949333, + "grad_norm": NaN, + "learning_rate": 0.00014338383578263144, + "loss": 0.0, + "step": 34026 + }, + { + "epoch": 3.1750489875898107, + "grad_norm": NaN, + "learning_rate": 0.00014337627936784146, + "loss": 0.0, + "step": 34027 + }, + { + "epoch": 3.1751422972846877, + "grad_norm": NaN, + "learning_rate": 0.00014336872296989356, + "loss": 0.0, + "step": 34028 + }, + { + "epoch": 3.175235606979565, + "grad_norm": NaN, + "learning_rate": 0.00014336116658880706, + "loss": 0.0, + "step": 34029 + }, + { + "epoch": 3.1753289166744425, + "grad_norm": NaN, + "learning_rate": 0.0001433536102246012, + "loss": 0.0, + "step": 34030 + }, + { + "epoch": 3.17542222636932, + "grad_norm": NaN, + "learning_rate": 0.00014334605387729505, + "loss": 0.0, + "step": 34031 + }, + { + "epoch": 3.175515536064197, + "grad_norm": NaN, + "learning_rate": 0.00014333849754690796, + "loss": 0.0, + "step": 34032 + }, + { + "epoch": 3.1756088457590743, + "grad_norm": NaN, + "learning_rate": 0.00014333094123345915, + "loss": 0.0, + "step": 34033 + }, + { + "epoch": 3.1757021554539517, + "grad_norm": NaN, + "learning_rate": 0.0001433233849369677, + "loss": 0.0, + "step": 34034 + }, + { + "epoch": 3.175795465148829, + "grad_norm": NaN, + "learning_rate": 0.00014331582865745294, + "loss": 0.0, + "step": 34035 + }, + { + "epoch": 3.175888774843706, + "grad_norm": NaN, + "learning_rate": 0.00014330827239493406, + "loss": 0.0, + "step": 34036 + }, + { + "epoch": 3.1759820845385835, + "grad_norm": NaN, + "learning_rate": 0.00014330071614943022, + "loss": 0.0, + "step": 34037 + }, + { + "epoch": 3.176075394233461, + "grad_norm": NaN, + "learning_rate": 0.0001432931599209607, + "loss": 0.0, + "step": 34038 + }, + { + "epoch": 3.1761687039283384, + "grad_norm": NaN, + "learning_rate": 0.00014328560370954472, + "loss": 0.0, + "step": 34039 + }, + { + "epoch": 3.1762620136232154, + "grad_norm": NaN, + "learning_rate": 0.0001432780475152014, + "loss": 0.0, + "step": 34040 + }, + { + "epoch": 3.176355323318093, + "grad_norm": NaN, + "learning_rate": 0.00014327049133795004, + "loss": 0.0, + "step": 34041 + }, + { + "epoch": 3.17644863301297, + "grad_norm": NaN, + "learning_rate": 0.00014326293517780985, + "loss": 0.0, + "step": 34042 + }, + { + "epoch": 3.176541942707847, + "grad_norm": NaN, + "learning_rate": 0.00014325537903479998, + "loss": 0.0, + "step": 34043 + }, + { + "epoch": 3.1766352524027246, + "grad_norm": NaN, + "learning_rate": 0.00014324782290893966, + "loss": 0.0, + "step": 34044 + }, + { + "epoch": 3.176728562097602, + "grad_norm": NaN, + "learning_rate": 0.00014324026680024823, + "loss": 0.0, + "step": 34045 + }, + { + "epoch": 3.1768218717924794, + "grad_norm": NaN, + "learning_rate": 0.0001432327107087447, + "loss": 0.0, + "step": 34046 + }, + { + "epoch": 3.1769151814873564, + "grad_norm": NaN, + "learning_rate": 0.0001432251546344484, + "loss": 0.0, + "step": 34047 + }, + { + "epoch": 3.177008491182234, + "grad_norm": NaN, + "learning_rate": 0.0001432175985773786, + "loss": 0.0, + "step": 34048 + }, + { + "epoch": 3.1771018008771112, + "grad_norm": NaN, + "learning_rate": 0.00014321004253755435, + "loss": 0.0, + "step": 34049 + }, + { + "epoch": 3.1771951105719882, + "grad_norm": NaN, + "learning_rate": 0.00014320248651499495, + "loss": 0.0, + "step": 34050 + }, + { + "epoch": 3.1772884202668656, + "grad_norm": NaN, + "learning_rate": 0.00014319493050971968, + "loss": 0.0, + "step": 34051 + }, + { + "epoch": 3.177381729961743, + "grad_norm": NaN, + "learning_rate": 0.00014318737452174765, + "loss": 0.0, + "step": 34052 + }, + { + "epoch": 3.1774750396566205, + "grad_norm": NaN, + "learning_rate": 0.00014317981855109808, + "loss": 0.0, + "step": 34053 + }, + { + "epoch": 3.1775683493514975, + "grad_norm": NaN, + "learning_rate": 0.00014317226259779028, + "loss": 0.0, + "step": 34054 + }, + { + "epoch": 3.177661659046375, + "grad_norm": NaN, + "learning_rate": 0.00014316470666184334, + "loss": 0.0, + "step": 34055 + }, + { + "epoch": 3.1777549687412523, + "grad_norm": NaN, + "learning_rate": 0.00014315715074327652, + "loss": 0.0, + "step": 34056 + }, + { + "epoch": 3.1778482784361297, + "grad_norm": NaN, + "learning_rate": 0.00014314959484210908, + "loss": 0.0, + "step": 34057 + }, + { + "epoch": 3.1779415881310067, + "grad_norm": NaN, + "learning_rate": 0.00014314203895836018, + "loss": 0.0, + "step": 34058 + }, + { + "epoch": 3.178034897825884, + "grad_norm": NaN, + "learning_rate": 0.000143134483092049, + "loss": 0.0, + "step": 34059 + }, + { + "epoch": 3.1781282075207615, + "grad_norm": NaN, + "learning_rate": 0.00014312692724319486, + "loss": 0.0, + "step": 34060 + }, + { + "epoch": 3.1782215172156385, + "grad_norm": NaN, + "learning_rate": 0.00014311937141181687, + "loss": 0.0, + "step": 34061 + }, + { + "epoch": 3.178314826910516, + "grad_norm": NaN, + "learning_rate": 0.00014311181559793426, + "loss": 0.0, + "step": 34062 + }, + { + "epoch": 3.1784081366053933, + "grad_norm": NaN, + "learning_rate": 0.00014310425980156635, + "loss": 0.0, + "step": 34063 + }, + { + "epoch": 3.1785014463002708, + "grad_norm": NaN, + "learning_rate": 0.00014309670402273217, + "loss": 0.0, + "step": 34064 + }, + { + "epoch": 3.1785947559951477, + "grad_norm": NaN, + "learning_rate": 0.00014308914826145105, + "loss": 0.0, + "step": 34065 + }, + { + "epoch": 3.178688065690025, + "grad_norm": NaN, + "learning_rate": 0.00014308159251774224, + "loss": 0.0, + "step": 34066 + }, + { + "epoch": 3.1787813753849026, + "grad_norm": NaN, + "learning_rate": 0.00014307403679162482, + "loss": 0.0, + "step": 34067 + }, + { + "epoch": 3.17887468507978, + "grad_norm": NaN, + "learning_rate": 0.00014306648108311807, + "loss": 0.0, + "step": 34068 + }, + { + "epoch": 3.178967994774657, + "grad_norm": NaN, + "learning_rate": 0.00014305892539224127, + "loss": 0.0, + "step": 34069 + }, + { + "epoch": 3.1790613044695344, + "grad_norm": NaN, + "learning_rate": 0.0001430513697190135, + "loss": 0.0, + "step": 34070 + }, + { + "epoch": 3.179154614164412, + "grad_norm": NaN, + "learning_rate": 0.00014304381406345405, + "loss": 0.0, + "step": 34071 + }, + { + "epoch": 3.179247923859289, + "grad_norm": NaN, + "learning_rate": 0.00014303625842558215, + "loss": 0.0, + "step": 34072 + }, + { + "epoch": 3.179341233554166, + "grad_norm": NaN, + "learning_rate": 0.00014302870280541699, + "loss": 0.0, + "step": 34073 + }, + { + "epoch": 3.1794345432490436, + "grad_norm": NaN, + "learning_rate": 0.00014302114720297776, + "loss": 0.0, + "step": 34074 + }, + { + "epoch": 3.179527852943921, + "grad_norm": NaN, + "learning_rate": 0.00014301359161828365, + "loss": 0.0, + "step": 34075 + }, + { + "epoch": 3.179621162638798, + "grad_norm": NaN, + "learning_rate": 0.00014300603605135392, + "loss": 0.0, + "step": 34076 + }, + { + "epoch": 3.1797144723336754, + "grad_norm": NaN, + "learning_rate": 0.00014299848050220782, + "loss": 0.0, + "step": 34077 + }, + { + "epoch": 3.179807782028553, + "grad_norm": NaN, + "learning_rate": 0.00014299092497086445, + "loss": 0.0, + "step": 34078 + }, + { + "epoch": 3.17990109172343, + "grad_norm": NaN, + "learning_rate": 0.0001429833694573431, + "loss": 0.0, + "step": 34079 + }, + { + "epoch": 3.1799944014183072, + "grad_norm": NaN, + "learning_rate": 0.000142975813961663, + "loss": 0.0, + "step": 34080 + }, + { + "epoch": 3.1800877111131847, + "grad_norm": NaN, + "learning_rate": 0.00014296825848384326, + "loss": 0.0, + "step": 34081 + }, + { + "epoch": 3.180181020808062, + "grad_norm": NaN, + "learning_rate": 0.00014296070302390314, + "loss": 0.0, + "step": 34082 + }, + { + "epoch": 3.180274330502939, + "grad_norm": NaN, + "learning_rate": 0.00014295314758186196, + "loss": 0.0, + "step": 34083 + }, + { + "epoch": 3.1803676401978165, + "grad_norm": NaN, + "learning_rate": 0.00014294559215773879, + "loss": 0.0, + "step": 34084 + }, + { + "epoch": 3.180460949892694, + "grad_norm": NaN, + "learning_rate": 0.00014293803675155284, + "loss": 0.0, + "step": 34085 + }, + { + "epoch": 3.1805542595875713, + "grad_norm": NaN, + "learning_rate": 0.00014293048136332346, + "loss": 0.0, + "step": 34086 + }, + { + "epoch": 3.1806475692824483, + "grad_norm": NaN, + "learning_rate": 0.00014292292599306976, + "loss": 0.0, + "step": 34087 + }, + { + "epoch": 3.1807408789773257, + "grad_norm": NaN, + "learning_rate": 0.0001429153706408109, + "loss": 0.0, + "step": 34088 + }, + { + "epoch": 3.180834188672203, + "grad_norm": NaN, + "learning_rate": 0.00014290781530656622, + "loss": 0.0, + "step": 34089 + }, + { + "epoch": 3.1809274983670806, + "grad_norm": NaN, + "learning_rate": 0.00014290025999035482, + "loss": 0.0, + "step": 34090 + }, + { + "epoch": 3.1810208080619575, + "grad_norm": NaN, + "learning_rate": 0.00014289270469219594, + "loss": 0.0, + "step": 34091 + }, + { + "epoch": 3.181114117756835, + "grad_norm": NaN, + "learning_rate": 0.00014288514941210886, + "loss": 0.0, + "step": 34092 + }, + { + "epoch": 3.1812074274517124, + "grad_norm": NaN, + "learning_rate": 0.00014287759415011273, + "loss": 0.0, + "step": 34093 + }, + { + "epoch": 3.1813007371465893, + "grad_norm": NaN, + "learning_rate": 0.00014287003890622673, + "loss": 0.0, + "step": 34094 + }, + { + "epoch": 3.1813940468414668, + "grad_norm": NaN, + "learning_rate": 0.00014286248368047016, + "loss": 0.0, + "step": 34095 + }, + { + "epoch": 3.181487356536344, + "grad_norm": NaN, + "learning_rate": 0.00014285492847286214, + "loss": 0.0, + "step": 34096 + }, + { + "epoch": 3.1815806662312216, + "grad_norm": NaN, + "learning_rate": 0.00014284737328342192, + "loss": 0.0, + "step": 34097 + }, + { + "epoch": 3.1816739759260986, + "grad_norm": NaN, + "learning_rate": 0.00014283981811216874, + "loss": 0.0, + "step": 34098 + }, + { + "epoch": 3.181767285620976, + "grad_norm": NaN, + "learning_rate": 0.00014283226295912178, + "loss": 0.0, + "step": 34099 + }, + { + "epoch": 3.1818605953158534, + "grad_norm": NaN, + "learning_rate": 0.0001428247078243002, + "loss": 0.0, + "step": 34100 + }, + { + "epoch": 3.1819539050107304, + "grad_norm": NaN, + "learning_rate": 0.00014281715270772334, + "loss": 0.0, + "step": 34101 + }, + { + "epoch": 3.182047214705608, + "grad_norm": NaN, + "learning_rate": 0.0001428095976094103, + "loss": 0.0, + "step": 34102 + }, + { + "epoch": 3.1821405244004852, + "grad_norm": NaN, + "learning_rate": 0.00014280204252938025, + "loss": 0.0, + "step": 34103 + }, + { + "epoch": 3.1822338340953626, + "grad_norm": NaN, + "learning_rate": 0.00014279448746765258, + "loss": 0.0, + "step": 34104 + }, + { + "epoch": 3.1823271437902396, + "grad_norm": NaN, + "learning_rate": 0.00014278693242424635, + "loss": 0.0, + "step": 34105 + }, + { + "epoch": 3.182420453485117, + "grad_norm": NaN, + "learning_rate": 0.00014277937739918078, + "loss": 0.0, + "step": 34106 + }, + { + "epoch": 3.1825137631799945, + "grad_norm": NaN, + "learning_rate": 0.0001427718223924752, + "loss": 0.0, + "step": 34107 + }, + { + "epoch": 3.182607072874872, + "grad_norm": NaN, + "learning_rate": 0.0001427642674041487, + "loss": 0.0, + "step": 34108 + }, + { + "epoch": 3.182700382569749, + "grad_norm": NaN, + "learning_rate": 0.00014275671243422046, + "loss": 0.0, + "step": 34109 + }, + { + "epoch": 3.1827936922646263, + "grad_norm": NaN, + "learning_rate": 0.00014274915748270983, + "loss": 0.0, + "step": 34110 + }, + { + "epoch": 3.1828870019595037, + "grad_norm": NaN, + "learning_rate": 0.00014274160254963594, + "loss": 0.0, + "step": 34111 + }, + { + "epoch": 3.182980311654381, + "grad_norm": NaN, + "learning_rate": 0.00014273404763501794, + "loss": 0.0, + "step": 34112 + }, + { + "epoch": 3.183073621349258, + "grad_norm": NaN, + "learning_rate": 0.00014272649273887518, + "loss": 0.0, + "step": 34113 + }, + { + "epoch": 3.1831669310441355, + "grad_norm": NaN, + "learning_rate": 0.00014271893786122676, + "loss": 0.0, + "step": 34114 + }, + { + "epoch": 3.183260240739013, + "grad_norm": NaN, + "learning_rate": 0.0001427113830020919, + "loss": 0.0, + "step": 34115 + }, + { + "epoch": 3.18335355043389, + "grad_norm": NaN, + "learning_rate": 0.0001427038281614899, + "loss": 0.0, + "step": 34116 + }, + { + "epoch": 3.1834468601287673, + "grad_norm": NaN, + "learning_rate": 0.00014269627333943983, + "loss": 0.0, + "step": 34117 + }, + { + "epoch": 3.1835401698236447, + "grad_norm": NaN, + "learning_rate": 0.0001426887185359611, + "loss": 0.0, + "step": 34118 + }, + { + "epoch": 3.183633479518522, + "grad_norm": NaN, + "learning_rate": 0.00014268116375107262, + "loss": 0.0, + "step": 34119 + }, + { + "epoch": 3.183726789213399, + "grad_norm": NaN, + "learning_rate": 0.00014267360898479383, + "loss": 0.0, + "step": 34120 + }, + { + "epoch": 3.1838200989082766, + "grad_norm": NaN, + "learning_rate": 0.00014266605423714394, + "loss": 0.0, + "step": 34121 + }, + { + "epoch": 3.183913408603154, + "grad_norm": NaN, + "learning_rate": 0.00014265849950814209, + "loss": 0.0, + "step": 34122 + }, + { + "epoch": 3.184006718298031, + "grad_norm": NaN, + "learning_rate": 0.00014265094479780742, + "loss": 0.0, + "step": 34123 + }, + { + "epoch": 3.1841000279929084, + "grad_norm": NaN, + "learning_rate": 0.00014264339010615934, + "loss": 0.0, + "step": 34124 + }, + { + "epoch": 3.184193337687786, + "grad_norm": NaN, + "learning_rate": 0.00014263583543321686, + "loss": 0.0, + "step": 34125 + }, + { + "epoch": 3.184286647382663, + "grad_norm": NaN, + "learning_rate": 0.00014262828077899928, + "loss": 0.0, + "step": 34126 + }, + { + "epoch": 3.18437995707754, + "grad_norm": NaN, + "learning_rate": 0.00014262072614352583, + "loss": 0.0, + "step": 34127 + }, + { + "epoch": 3.1844732667724176, + "grad_norm": NaN, + "learning_rate": 0.00014261317152681566, + "loss": 0.0, + "step": 34128 + }, + { + "epoch": 3.184566576467295, + "grad_norm": NaN, + "learning_rate": 0.000142605616928888, + "loss": 0.0, + "step": 34129 + }, + { + "epoch": 3.1846598861621724, + "grad_norm": NaN, + "learning_rate": 0.0001425980623497621, + "loss": 0.0, + "step": 34130 + }, + { + "epoch": 3.1847531958570494, + "grad_norm": NaN, + "learning_rate": 0.0001425905077894571, + "loss": 0.0, + "step": 34131 + }, + { + "epoch": 3.184846505551927, + "grad_norm": NaN, + "learning_rate": 0.00014258295324799222, + "loss": 0.0, + "step": 34132 + }, + { + "epoch": 3.1849398152468043, + "grad_norm": NaN, + "learning_rate": 0.00014257539872538676, + "loss": 0.0, + "step": 34133 + }, + { + "epoch": 3.1850331249416817, + "grad_norm": NaN, + "learning_rate": 0.00014256784422165983, + "loss": 0.0, + "step": 34134 + }, + { + "epoch": 3.1851264346365586, + "grad_norm": NaN, + "learning_rate": 0.00014256028973683063, + "loss": 0.0, + "step": 34135 + }, + { + "epoch": 3.185219744331436, + "grad_norm": NaN, + "learning_rate": 0.00014255273527091848, + "loss": 0.0, + "step": 34136 + }, + { + "epoch": 3.1853130540263135, + "grad_norm": NaN, + "learning_rate": 0.0001425451808239425, + "loss": 0.0, + "step": 34137 + }, + { + "epoch": 3.1854063637211905, + "grad_norm": NaN, + "learning_rate": 0.00014253762639592187, + "loss": 0.0, + "step": 34138 + }, + { + "epoch": 3.185499673416068, + "grad_norm": NaN, + "learning_rate": 0.0001425300719868759, + "loss": 0.0, + "step": 34139 + }, + { + "epoch": 3.1855929831109453, + "grad_norm": NaN, + "learning_rate": 0.00014252251759682373, + "loss": 0.0, + "step": 34140 + }, + { + "epoch": 3.1856862928058227, + "grad_norm": NaN, + "learning_rate": 0.00014251496322578456, + "loss": 0.0, + "step": 34141 + }, + { + "epoch": 3.1857796025006997, + "grad_norm": NaN, + "learning_rate": 0.00014250740887377766, + "loss": 0.0, + "step": 34142 + }, + { + "epoch": 3.185872912195577, + "grad_norm": NaN, + "learning_rate": 0.00014249985454082217, + "loss": 0.0, + "step": 34143 + }, + { + "epoch": 3.1859662218904545, + "grad_norm": NaN, + "learning_rate": 0.00014249230022693734, + "loss": 0.0, + "step": 34144 + }, + { + "epoch": 3.1860595315853315, + "grad_norm": NaN, + "learning_rate": 0.00014248474593214238, + "loss": 0.0, + "step": 34145 + }, + { + "epoch": 3.186152841280209, + "grad_norm": NaN, + "learning_rate": 0.00014247719165645648, + "loss": 0.0, + "step": 34146 + }, + { + "epoch": 3.1862461509750863, + "grad_norm": NaN, + "learning_rate": 0.00014246963739989878, + "loss": 0.0, + "step": 34147 + }, + { + "epoch": 3.1863394606699638, + "grad_norm": NaN, + "learning_rate": 0.00014246208316248866, + "loss": 0.0, + "step": 34148 + }, + { + "epoch": 3.1864327703648407, + "grad_norm": NaN, + "learning_rate": 0.0001424545289442452, + "loss": 0.0, + "step": 34149 + }, + { + "epoch": 3.186526080059718, + "grad_norm": NaN, + "learning_rate": 0.0001424469747451876, + "loss": 0.0, + "step": 34150 + }, + { + "epoch": 3.1866193897545956, + "grad_norm": NaN, + "learning_rate": 0.00014243942056533517, + "loss": 0.0, + "step": 34151 + }, + { + "epoch": 3.186712699449473, + "grad_norm": NaN, + "learning_rate": 0.000142431866404707, + "loss": 0.0, + "step": 34152 + }, + { + "epoch": 3.18680600914435, + "grad_norm": NaN, + "learning_rate": 0.00014242431226332234, + "loss": 0.0, + "step": 34153 + }, + { + "epoch": 3.1868993188392274, + "grad_norm": NaN, + "learning_rate": 0.00014241675814120048, + "loss": 0.0, + "step": 34154 + }, + { + "epoch": 3.186992628534105, + "grad_norm": NaN, + "learning_rate": 0.00014240920403836048, + "loss": 0.0, + "step": 34155 + }, + { + "epoch": 3.187085938228982, + "grad_norm": NaN, + "learning_rate": 0.0001424016499548217, + "loss": 0.0, + "step": 34156 + }, + { + "epoch": 3.187179247923859, + "grad_norm": NaN, + "learning_rate": 0.00014239409589060324, + "loss": 0.0, + "step": 34157 + }, + { + "epoch": 3.1872725576187366, + "grad_norm": NaN, + "learning_rate": 0.0001423865418457243, + "loss": 0.0, + "step": 34158 + }, + { + "epoch": 3.187365867313614, + "grad_norm": NaN, + "learning_rate": 0.00014237898782020419, + "loss": 0.0, + "step": 34159 + }, + { + "epoch": 3.187459177008491, + "grad_norm": NaN, + "learning_rate": 0.00014237143381406206, + "loss": 0.0, + "step": 34160 + }, + { + "epoch": 3.1875524867033684, + "grad_norm": NaN, + "learning_rate": 0.00014236387982731705, + "loss": 0.0, + "step": 34161 + }, + { + "epoch": 3.187645796398246, + "grad_norm": NaN, + "learning_rate": 0.00014235632585998852, + "loss": 0.0, + "step": 34162 + }, + { + "epoch": 3.1877391060931233, + "grad_norm": NaN, + "learning_rate": 0.00014234877191209554, + "loss": 0.0, + "step": 34163 + }, + { + "epoch": 3.1878324157880003, + "grad_norm": NaN, + "learning_rate": 0.00014234121798365734, + "loss": 0.0, + "step": 34164 + }, + { + "epoch": 3.1879257254828777, + "grad_norm": NaN, + "learning_rate": 0.0001423336640746932, + "loss": 0.0, + "step": 34165 + }, + { + "epoch": 3.188019035177755, + "grad_norm": NaN, + "learning_rate": 0.00014232611018522224, + "loss": 0.0, + "step": 34166 + }, + { + "epoch": 3.188112344872632, + "grad_norm": NaN, + "learning_rate": 0.00014231855631526371, + "loss": 0.0, + "step": 34167 + }, + { + "epoch": 3.1882056545675095, + "grad_norm": NaN, + "learning_rate": 0.00014231100246483687, + "loss": 0.0, + "step": 34168 + }, + { + "epoch": 3.188298964262387, + "grad_norm": NaN, + "learning_rate": 0.00014230344863396083, + "loss": 0.0, + "step": 34169 + }, + { + "epoch": 3.1883922739572643, + "grad_norm": NaN, + "learning_rate": 0.00014229589482265482, + "loss": 0.0, + "step": 34170 + }, + { + "epoch": 3.1884855836521413, + "grad_norm": NaN, + "learning_rate": 0.00014228834103093815, + "loss": 0.0, + "step": 34171 + }, + { + "epoch": 3.1885788933470187, + "grad_norm": NaN, + "learning_rate": 0.00014228078725882988, + "loss": 0.0, + "step": 34172 + }, + { + "epoch": 3.188672203041896, + "grad_norm": NaN, + "learning_rate": 0.00014227323350634926, + "loss": 0.0, + "step": 34173 + }, + { + "epoch": 3.1887655127367736, + "grad_norm": NaN, + "learning_rate": 0.0001422656797735156, + "loss": 0.0, + "step": 34174 + }, + { + "epoch": 3.1888588224316505, + "grad_norm": NaN, + "learning_rate": 0.00014225812606034796, + "loss": 0.0, + "step": 34175 + }, + { + "epoch": 3.188952132126528, + "grad_norm": NaN, + "learning_rate": 0.0001422505723668656, + "loss": 0.0, + "step": 34176 + }, + { + "epoch": 3.1890454418214054, + "grad_norm": NaN, + "learning_rate": 0.00014224301869308778, + "loss": 0.0, + "step": 34177 + }, + { + "epoch": 3.1891387515162823, + "grad_norm": NaN, + "learning_rate": 0.00014223546503903367, + "loss": 0.0, + "step": 34178 + }, + { + "epoch": 3.1892320612111598, + "grad_norm": NaN, + "learning_rate": 0.00014222791140472242, + "loss": 0.0, + "step": 34179 + }, + { + "epoch": 3.189325370906037, + "grad_norm": NaN, + "learning_rate": 0.00014222035779017336, + "loss": 0.0, + "step": 34180 + }, + { + "epoch": 3.1894186806009146, + "grad_norm": NaN, + "learning_rate": 0.0001422128041954056, + "loss": 0.0, + "step": 34181 + }, + { + "epoch": 3.1895119902957916, + "grad_norm": NaN, + "learning_rate": 0.00014220525062043831, + "loss": 0.0, + "step": 34182 + }, + { + "epoch": 3.189605299990669, + "grad_norm": NaN, + "learning_rate": 0.00014219769706529085, + "loss": 0.0, + "step": 34183 + }, + { + "epoch": 3.1896986096855464, + "grad_norm": NaN, + "learning_rate": 0.0001421901435299823, + "loss": 0.0, + "step": 34184 + }, + { + "epoch": 3.189791919380424, + "grad_norm": NaN, + "learning_rate": 0.00014218259001453186, + "loss": 0.0, + "step": 34185 + }, + { + "epoch": 3.189885229075301, + "grad_norm": NaN, + "learning_rate": 0.00014217503651895884, + "loss": 0.0, + "step": 34186 + }, + { + "epoch": 3.1899785387701782, + "grad_norm": NaN, + "learning_rate": 0.00014216748304328236, + "loss": 0.0, + "step": 34187 + }, + { + "epoch": 3.1900718484650556, + "grad_norm": NaN, + "learning_rate": 0.00014215992958752162, + "loss": 0.0, + "step": 34188 + }, + { + "epoch": 3.1901651581599326, + "grad_norm": NaN, + "learning_rate": 0.00014215237615169594, + "loss": 0.0, + "step": 34189 + }, + { + "epoch": 3.19025846785481, + "grad_norm": NaN, + "learning_rate": 0.00014214482273582437, + "loss": 0.0, + "step": 34190 + }, + { + "epoch": 3.1903517775496875, + "grad_norm": NaN, + "learning_rate": 0.00014213726933992617, + "loss": 0.0, + "step": 34191 + }, + { + "epoch": 3.190445087244565, + "grad_norm": NaN, + "learning_rate": 0.00014212971596402066, + "loss": 0.0, + "step": 34192 + }, + { + "epoch": 3.190538396939442, + "grad_norm": NaN, + "learning_rate": 0.00014212216260812684, + "loss": 0.0, + "step": 34193 + }, + { + "epoch": 3.1906317066343193, + "grad_norm": NaN, + "learning_rate": 0.00014211460927226408, + "loss": 0.0, + "step": 34194 + }, + { + "epoch": 3.1907250163291967, + "grad_norm": NaN, + "learning_rate": 0.0001421070559564516, + "loss": 0.0, + "step": 34195 + }, + { + "epoch": 3.1908183260240737, + "grad_norm": NaN, + "learning_rate": 0.0001420995026607084, + "loss": 0.0, + "step": 34196 + }, + { + "epoch": 3.190911635718951, + "grad_norm": NaN, + "learning_rate": 0.0001420919493850539, + "loss": 0.0, + "step": 34197 + }, + { + "epoch": 3.1910049454138285, + "grad_norm": NaN, + "learning_rate": 0.00014208439612950727, + "loss": 0.0, + "step": 34198 + }, + { + "epoch": 3.191098255108706, + "grad_norm": NaN, + "learning_rate": 0.0001420768428940876, + "loss": 0.0, + "step": 34199 + }, + { + "epoch": 3.191191564803583, + "grad_norm": NaN, + "learning_rate": 0.0001420692896788142, + "loss": 0.0, + "step": 34200 + }, + { + "epoch": 3.1912848744984603, + "grad_norm": NaN, + "learning_rate": 0.0001420617364837063, + "loss": 0.0, + "step": 34201 + }, + { + "epoch": 3.1913781841933377, + "grad_norm": NaN, + "learning_rate": 0.00014205418330878297, + "loss": 0.0, + "step": 34202 + }, + { + "epoch": 3.191471493888215, + "grad_norm": NaN, + "learning_rate": 0.0001420466301540635, + "loss": 0.0, + "step": 34203 + }, + { + "epoch": 3.191564803583092, + "grad_norm": NaN, + "learning_rate": 0.00014203907701956718, + "loss": 0.0, + "step": 34204 + }, + { + "epoch": 3.1916581132779696, + "grad_norm": NaN, + "learning_rate": 0.00014203152390531305, + "loss": 0.0, + "step": 34205 + }, + { + "epoch": 3.191751422972847, + "grad_norm": NaN, + "learning_rate": 0.00014202397081132043, + "loss": 0.0, + "step": 34206 + }, + { + "epoch": 3.1918447326677244, + "grad_norm": NaN, + "learning_rate": 0.0001420164177376085, + "loss": 0.0, + "step": 34207 + }, + { + "epoch": 3.1919380423626014, + "grad_norm": NaN, + "learning_rate": 0.0001420088646841964, + "loss": 0.0, + "step": 34208 + }, + { + "epoch": 3.192031352057479, + "grad_norm": NaN, + "learning_rate": 0.00014200131165110344, + "loss": 0.0, + "step": 34209 + }, + { + "epoch": 3.192124661752356, + "grad_norm": NaN, + "learning_rate": 0.00014199375863834876, + "loss": 0.0, + "step": 34210 + }, + { + "epoch": 3.192217971447233, + "grad_norm": NaN, + "learning_rate": 0.00014198620564595155, + "loss": 0.0, + "step": 34211 + }, + { + "epoch": 3.1923112811421106, + "grad_norm": NaN, + "learning_rate": 0.0001419786526739311, + "loss": 0.0, + "step": 34212 + }, + { + "epoch": 3.192404590836988, + "grad_norm": NaN, + "learning_rate": 0.00014197109972230654, + "loss": 0.0, + "step": 34213 + }, + { + "epoch": 3.1924979005318654, + "grad_norm": NaN, + "learning_rate": 0.00014196354679109707, + "loss": 0.0, + "step": 34214 + }, + { + "epoch": 3.1925912102267424, + "grad_norm": NaN, + "learning_rate": 0.00014195599388032196, + "loss": 0.0, + "step": 34215 + }, + { + "epoch": 3.19268451992162, + "grad_norm": NaN, + "learning_rate": 0.00014194844099000035, + "loss": 0.0, + "step": 34216 + }, + { + "epoch": 3.1927778296164973, + "grad_norm": NaN, + "learning_rate": 0.00014194088812015144, + "loss": 0.0, + "step": 34217 + }, + { + "epoch": 3.1928711393113742, + "grad_norm": NaN, + "learning_rate": 0.00014193333527079454, + "loss": 0.0, + "step": 34218 + }, + { + "epoch": 3.1929644490062516, + "grad_norm": NaN, + "learning_rate": 0.00014192578244194874, + "loss": 0.0, + "step": 34219 + }, + { + "epoch": 3.193057758701129, + "grad_norm": NaN, + "learning_rate": 0.00014191822963363323, + "loss": 0.0, + "step": 34220 + }, + { + "epoch": 3.1931510683960065, + "grad_norm": NaN, + "learning_rate": 0.00014191067684586736, + "loss": 0.0, + "step": 34221 + }, + { + "epoch": 3.1932443780908835, + "grad_norm": NaN, + "learning_rate": 0.00014190312407867022, + "loss": 0.0, + "step": 34222 + }, + { + "epoch": 3.193337687785761, + "grad_norm": NaN, + "learning_rate": 0.00014189557133206096, + "loss": 0.0, + "step": 34223 + }, + { + "epoch": 3.1934309974806383, + "grad_norm": NaN, + "learning_rate": 0.00014188801860605895, + "loss": 0.0, + "step": 34224 + }, + { + "epoch": 3.1935243071755157, + "grad_norm": NaN, + "learning_rate": 0.00014188046590068328, + "loss": 0.0, + "step": 34225 + }, + { + "epoch": 3.1936176168703927, + "grad_norm": NaN, + "learning_rate": 0.00014187291321595313, + "loss": 0.0, + "step": 34226 + }, + { + "epoch": 3.19371092656527, + "grad_norm": NaN, + "learning_rate": 0.00014186536055188785, + "loss": 0.0, + "step": 34227 + }, + { + "epoch": 3.1938042362601475, + "grad_norm": NaN, + "learning_rate": 0.0001418578079085065, + "loss": 0.0, + "step": 34228 + }, + { + "epoch": 3.193897545955025, + "grad_norm": NaN, + "learning_rate": 0.0001418502552858283, + "loss": 0.0, + "step": 34229 + }, + { + "epoch": 3.193990855649902, + "grad_norm": NaN, + "learning_rate": 0.00014184270268387253, + "loss": 0.0, + "step": 34230 + }, + { + "epoch": 3.1940841653447793, + "grad_norm": NaN, + "learning_rate": 0.00014183515010265832, + "loss": 0.0, + "step": 34231 + }, + { + "epoch": 3.1941774750396568, + "grad_norm": NaN, + "learning_rate": 0.00014182759754220492, + "loss": 0.0, + "step": 34232 + }, + { + "epoch": 3.1942707847345337, + "grad_norm": NaN, + "learning_rate": 0.00014182004500253156, + "loss": 0.0, + "step": 34233 + }, + { + "epoch": 3.194364094429411, + "grad_norm": NaN, + "learning_rate": 0.00014181249248365732, + "loss": 0.0, + "step": 34234 + }, + { + "epoch": 3.1944574041242886, + "grad_norm": NaN, + "learning_rate": 0.00014180493998560152, + "loss": 0.0, + "step": 34235 + }, + { + "epoch": 3.194550713819166, + "grad_norm": NaN, + "learning_rate": 0.00014179738750838338, + "loss": 0.0, + "step": 34236 + }, + { + "epoch": 3.194644023514043, + "grad_norm": NaN, + "learning_rate": 0.000141789835052022, + "loss": 0.0, + "step": 34237 + }, + { + "epoch": 3.1947373332089204, + "grad_norm": NaN, + "learning_rate": 0.00014178228261653664, + "loss": 0.0, + "step": 34238 + }, + { + "epoch": 3.194830642903798, + "grad_norm": NaN, + "learning_rate": 0.00014177473020194652, + "loss": 0.0, + "step": 34239 + }, + { + "epoch": 3.194923952598675, + "grad_norm": NaN, + "learning_rate": 0.0001417671778082708, + "loss": 0.0, + "step": 34240 + }, + { + "epoch": 3.195017262293552, + "grad_norm": NaN, + "learning_rate": 0.00014175962543552873, + "loss": 0.0, + "step": 34241 + }, + { + "epoch": 3.1951105719884296, + "grad_norm": NaN, + "learning_rate": 0.0001417520730837395, + "loss": 0.0, + "step": 34242 + }, + { + "epoch": 3.195203881683307, + "grad_norm": NaN, + "learning_rate": 0.00014174452075292227, + "loss": 0.0, + "step": 34243 + }, + { + "epoch": 3.195297191378184, + "grad_norm": NaN, + "learning_rate": 0.0001417369684430963, + "loss": 0.0, + "step": 34244 + }, + { + "epoch": 3.1953905010730614, + "grad_norm": NaN, + "learning_rate": 0.0001417294161542808, + "loss": 0.0, + "step": 34245 + }, + { + "epoch": 3.195483810767939, + "grad_norm": NaN, + "learning_rate": 0.00014172186388649487, + "loss": 0.0, + "step": 34246 + }, + { + "epoch": 3.1955771204628163, + "grad_norm": NaN, + "learning_rate": 0.00014171431163975784, + "loss": 0.0, + "step": 34247 + }, + { + "epoch": 3.1956704301576933, + "grad_norm": NaN, + "learning_rate": 0.0001417067594140889, + "loss": 0.0, + "step": 34248 + }, + { + "epoch": 3.1957637398525707, + "grad_norm": NaN, + "learning_rate": 0.00014169920720950713, + "loss": 0.0, + "step": 34249 + }, + { + "epoch": 3.195857049547448, + "grad_norm": NaN, + "learning_rate": 0.00014169165502603186, + "loss": 0.0, + "step": 34250 + }, + { + "epoch": 3.1959503592423255, + "grad_norm": NaN, + "learning_rate": 0.00014168410286368227, + "loss": 0.0, + "step": 34251 + }, + { + "epoch": 3.1960436689372025, + "grad_norm": NaN, + "learning_rate": 0.0001416765507224775, + "loss": 0.0, + "step": 34252 + }, + { + "epoch": 3.19613697863208, + "grad_norm": NaN, + "learning_rate": 0.00014166899860243684, + "loss": 0.0, + "step": 34253 + }, + { + "epoch": 3.1962302883269573, + "grad_norm": NaN, + "learning_rate": 0.00014166144650357943, + "loss": 0.0, + "step": 34254 + }, + { + "epoch": 3.1963235980218343, + "grad_norm": NaN, + "learning_rate": 0.00014165389442592446, + "loss": 0.0, + "step": 34255 + }, + { + "epoch": 3.1964169077167117, + "grad_norm": NaN, + "learning_rate": 0.0001416463423694912, + "loss": 0.0, + "step": 34256 + }, + { + "epoch": 3.196510217411589, + "grad_norm": NaN, + "learning_rate": 0.00014163879033429882, + "loss": 0.0, + "step": 34257 + }, + { + "epoch": 3.1966035271064666, + "grad_norm": NaN, + "learning_rate": 0.00014163123832036648, + "loss": 0.0, + "step": 34258 + }, + { + "epoch": 3.1966968368013435, + "grad_norm": NaN, + "learning_rate": 0.00014162368632771348, + "loss": 0.0, + "step": 34259 + }, + { + "epoch": 3.196790146496221, + "grad_norm": NaN, + "learning_rate": 0.00014161613435635892, + "loss": 0.0, + "step": 34260 + }, + { + "epoch": 3.1968834561910984, + "grad_norm": NaN, + "learning_rate": 0.00014160858240632203, + "loss": 0.0, + "step": 34261 + }, + { + "epoch": 3.1969767658859753, + "grad_norm": NaN, + "learning_rate": 0.00014160103047762207, + "loss": 0.0, + "step": 34262 + }, + { + "epoch": 3.1970700755808528, + "grad_norm": NaN, + "learning_rate": 0.0001415934785702782, + "loss": 0.0, + "step": 34263 + }, + { + "epoch": 3.19716338527573, + "grad_norm": NaN, + "learning_rate": 0.0001415859266843096, + "loss": 0.0, + "step": 34264 + }, + { + "epoch": 3.1972566949706076, + "grad_norm": NaN, + "learning_rate": 0.00014157837481973553, + "loss": 0.0, + "step": 34265 + }, + { + "epoch": 3.1973500046654846, + "grad_norm": NaN, + "learning_rate": 0.00014157082297657514, + "loss": 0.0, + "step": 34266 + }, + { + "epoch": 3.197443314360362, + "grad_norm": NaN, + "learning_rate": 0.0001415632711548476, + "loss": 0.0, + "step": 34267 + }, + { + "epoch": 3.1975366240552394, + "grad_norm": NaN, + "learning_rate": 0.00014155571935457227, + "loss": 0.0, + "step": 34268 + }, + { + "epoch": 3.197629933750117, + "grad_norm": NaN, + "learning_rate": 0.00014154816757576814, + "loss": 0.0, + "step": 34269 + }, + { + "epoch": 3.197723243444994, + "grad_norm": NaN, + "learning_rate": 0.00014154061581845456, + "loss": 0.0, + "step": 34270 + }, + { + "epoch": 3.1978165531398712, + "grad_norm": NaN, + "learning_rate": 0.00014153306408265073, + "loss": 0.0, + "step": 34271 + }, + { + "epoch": 3.1979098628347487, + "grad_norm": NaN, + "learning_rate": 0.00014152551236837574, + "loss": 0.0, + "step": 34272 + }, + { + "epoch": 3.1980031725296256, + "grad_norm": NaN, + "learning_rate": 0.00014151796067564886, + "loss": 0.0, + "step": 34273 + }, + { + "epoch": 3.198096482224503, + "grad_norm": NaN, + "learning_rate": 0.00014151040900448937, + "loss": 0.0, + "step": 34274 + }, + { + "epoch": 3.1981897919193805, + "grad_norm": NaN, + "learning_rate": 0.0001415028573549163, + "loss": 0.0, + "step": 34275 + }, + { + "epoch": 3.198283101614258, + "grad_norm": NaN, + "learning_rate": 0.00014149530572694902, + "loss": 0.0, + "step": 34276 + }, + { + "epoch": 3.198376411309135, + "grad_norm": NaN, + "learning_rate": 0.00014148775412060665, + "loss": 0.0, + "step": 34277 + }, + { + "epoch": 3.1984697210040123, + "grad_norm": NaN, + "learning_rate": 0.00014148020253590835, + "loss": 0.0, + "step": 34278 + }, + { + "epoch": 3.1985630306988897, + "grad_norm": NaN, + "learning_rate": 0.00014147265097287341, + "loss": 0.0, + "step": 34279 + }, + { + "epoch": 3.198656340393767, + "grad_norm": NaN, + "learning_rate": 0.00014146509943152103, + "loss": 0.0, + "step": 34280 + }, + { + "epoch": 3.198749650088644, + "grad_norm": NaN, + "learning_rate": 0.0001414575479118703, + "loss": 0.0, + "step": 34281 + }, + { + "epoch": 3.1988429597835215, + "grad_norm": NaN, + "learning_rate": 0.00014144999641394051, + "loss": 0.0, + "step": 34282 + }, + { + "epoch": 3.198936269478399, + "grad_norm": NaN, + "learning_rate": 0.0001414424449377509, + "loss": 0.0, + "step": 34283 + }, + { + "epoch": 3.199029579173276, + "grad_norm": NaN, + "learning_rate": 0.00014143489348332055, + "loss": 0.0, + "step": 34284 + }, + { + "epoch": 3.1991228888681533, + "grad_norm": NaN, + "learning_rate": 0.00014142734205066878, + "loss": 0.0, + "step": 34285 + }, + { + "epoch": 3.1992161985630307, + "grad_norm": NaN, + "learning_rate": 0.00014141979063981474, + "loss": 0.0, + "step": 34286 + }, + { + "epoch": 3.199309508257908, + "grad_norm": NaN, + "learning_rate": 0.00014141223925077758, + "loss": 0.0, + "step": 34287 + }, + { + "epoch": 3.199402817952785, + "grad_norm": NaN, + "learning_rate": 0.00014140468788357657, + "loss": 0.0, + "step": 34288 + }, + { + "epoch": 3.1994961276476626, + "grad_norm": NaN, + "learning_rate": 0.00014139713653823095, + "loss": 0.0, + "step": 34289 + }, + { + "epoch": 3.19958943734254, + "grad_norm": NaN, + "learning_rate": 0.00014138958521475977, + "loss": 0.0, + "step": 34290 + }, + { + "epoch": 3.199682747037417, + "grad_norm": NaN, + "learning_rate": 0.00014138203391318238, + "loss": 0.0, + "step": 34291 + }, + { + "epoch": 3.1997760567322944, + "grad_norm": NaN, + "learning_rate": 0.00014137448263351794, + "loss": 0.0, + "step": 34292 + }, + { + "epoch": 3.199869366427172, + "grad_norm": NaN, + "learning_rate": 0.00014136693137578558, + "loss": 0.0, + "step": 34293 + }, + { + "epoch": 3.199962676122049, + "grad_norm": NaN, + "learning_rate": 0.0001413593801400046, + "loss": 0.0, + "step": 34294 + }, + { + "epoch": 3.200055985816926, + "grad_norm": NaN, + "learning_rate": 0.00014135182892619416, + "loss": 0.0, + "step": 34295 + }, + { + "epoch": 3.2001492955118036, + "grad_norm": NaN, + "learning_rate": 0.00014134427773437343, + "loss": 0.0, + "step": 34296 + }, + { + "epoch": 3.200242605206681, + "grad_norm": NaN, + "learning_rate": 0.00014133672656456168, + "loss": 0.0, + "step": 34297 + }, + { + "epoch": 3.2003359149015584, + "grad_norm": NaN, + "learning_rate": 0.00014132917541677804, + "loss": 0.0, + "step": 34298 + }, + { + "epoch": 3.2004292245964354, + "grad_norm": NaN, + "learning_rate": 0.00014132162429104168, + "loss": 0.0, + "step": 34299 + }, + { + "epoch": 3.200522534291313, + "grad_norm": NaN, + "learning_rate": 0.00014131407318737196, + "loss": 0.0, + "step": 34300 + }, + { + "epoch": 3.2006158439861903, + "grad_norm": NaN, + "learning_rate": 0.00014130652210578794, + "loss": 0.0, + "step": 34301 + }, + { + "epoch": 3.2007091536810677, + "grad_norm": NaN, + "learning_rate": 0.00014129897104630883, + "loss": 0.0, + "step": 34302 + }, + { + "epoch": 3.2008024633759447, + "grad_norm": NaN, + "learning_rate": 0.0001412914200089539, + "loss": 0.0, + "step": 34303 + }, + { + "epoch": 3.200895773070822, + "grad_norm": NaN, + "learning_rate": 0.00014128386899374228, + "loss": 0.0, + "step": 34304 + }, + { + "epoch": 3.2009890827656995, + "grad_norm": NaN, + "learning_rate": 0.0001412763180006932, + "loss": 0.0, + "step": 34305 + }, + { + "epoch": 3.2010823924605765, + "grad_norm": NaN, + "learning_rate": 0.00014126876702982592, + "loss": 0.0, + "step": 34306 + }, + { + "epoch": 3.201175702155454, + "grad_norm": NaN, + "learning_rate": 0.00014126121608115948, + "loss": 0.0, + "step": 34307 + }, + { + "epoch": 3.2012690118503313, + "grad_norm": NaN, + "learning_rate": 0.00014125366515471322, + "loss": 0.0, + "step": 34308 + }, + { + "epoch": 3.2013623215452087, + "grad_norm": NaN, + "learning_rate": 0.00014124611425050634, + "loss": 0.0, + "step": 34309 + }, + { + "epoch": 3.2014556312400857, + "grad_norm": NaN, + "learning_rate": 0.00014123856336855793, + "loss": 0.0, + "step": 34310 + }, + { + "epoch": 3.201548940934963, + "grad_norm": NaN, + "learning_rate": 0.0001412310125088873, + "loss": 0.0, + "step": 34311 + }, + { + "epoch": 3.2016422506298405, + "grad_norm": NaN, + "learning_rate": 0.00014122346167151365, + "loss": 0.0, + "step": 34312 + }, + { + "epoch": 3.2017355603247175, + "grad_norm": NaN, + "learning_rate": 0.00014121591085645605, + "loss": 0.0, + "step": 34313 + }, + { + "epoch": 3.201828870019595, + "grad_norm": NaN, + "learning_rate": 0.00014120836006373383, + "loss": 0.0, + "step": 34314 + }, + { + "epoch": 3.2019221797144723, + "grad_norm": NaN, + "learning_rate": 0.00014120080929336617, + "loss": 0.0, + "step": 34315 + }, + { + "epoch": 3.2020154894093498, + "grad_norm": NaN, + "learning_rate": 0.00014119325854537217, + "loss": 0.0, + "step": 34316 + }, + { + "epoch": 3.2021087991042267, + "grad_norm": NaN, + "learning_rate": 0.00014118570781977116, + "loss": 0.0, + "step": 34317 + }, + { + "epoch": 3.202202108799104, + "grad_norm": NaN, + "learning_rate": 0.0001411781571165823, + "loss": 0.0, + "step": 34318 + }, + { + "epoch": 3.2022954184939816, + "grad_norm": NaN, + "learning_rate": 0.0001411706064358247, + "loss": 0.0, + "step": 34319 + }, + { + "epoch": 3.202388728188859, + "grad_norm": NaN, + "learning_rate": 0.0001411630557775177, + "loss": 0.0, + "step": 34320 + }, + { + "epoch": 3.202482037883736, + "grad_norm": NaN, + "learning_rate": 0.00014115550514168045, + "loss": 0.0, + "step": 34321 + }, + { + "epoch": 3.2025753475786134, + "grad_norm": NaN, + "learning_rate": 0.00014114795452833205, + "loss": 0.0, + "step": 34322 + }, + { + "epoch": 3.202668657273491, + "grad_norm": NaN, + "learning_rate": 0.00014114040393749182, + "loss": 0.0, + "step": 34323 + }, + { + "epoch": 3.2027619669683682, + "grad_norm": NaN, + "learning_rate": 0.00014113285336917895, + "loss": 0.0, + "step": 34324 + }, + { + "epoch": 3.202855276663245, + "grad_norm": NaN, + "learning_rate": 0.00014112530282341253, + "loss": 0.0, + "step": 34325 + }, + { + "epoch": 3.2029485863581226, + "grad_norm": NaN, + "learning_rate": 0.0001411177523002119, + "loss": 0.0, + "step": 34326 + }, + { + "epoch": 3.203041896053, + "grad_norm": NaN, + "learning_rate": 0.00014111020179959622, + "loss": 0.0, + "step": 34327 + }, + { + "epoch": 3.203135205747877, + "grad_norm": NaN, + "learning_rate": 0.00014110265132158458, + "loss": 0.0, + "step": 34328 + }, + { + "epoch": 3.2032285154427544, + "grad_norm": NaN, + "learning_rate": 0.00014109510086619633, + "loss": 0.0, + "step": 34329 + }, + { + "epoch": 3.203321825137632, + "grad_norm": NaN, + "learning_rate": 0.00014108755043345058, + "loss": 0.0, + "step": 34330 + }, + { + "epoch": 3.2034151348325093, + "grad_norm": NaN, + "learning_rate": 0.0001410800000233665, + "loss": 0.0, + "step": 34331 + }, + { + "epoch": 3.2035084445273863, + "grad_norm": NaN, + "learning_rate": 0.0001410724496359634, + "loss": 0.0, + "step": 34332 + }, + { + "epoch": 3.2036017542222637, + "grad_norm": NaN, + "learning_rate": 0.00014106489927126045, + "loss": 0.0, + "step": 34333 + }, + { + "epoch": 3.203695063917141, + "grad_norm": NaN, + "learning_rate": 0.0001410573489292767, + "loss": 0.0, + "step": 34334 + }, + { + "epoch": 3.203788373612018, + "grad_norm": NaN, + "learning_rate": 0.00014104979861003154, + "loss": 0.0, + "step": 34335 + }, + { + "epoch": 3.2038816833068955, + "grad_norm": NaN, + "learning_rate": 0.0001410422483135441, + "loss": 0.0, + "step": 34336 + }, + { + "epoch": 3.203974993001773, + "grad_norm": NaN, + "learning_rate": 0.00014103469803983351, + "loss": 0.0, + "step": 34337 + }, + { + "epoch": 3.2040683026966503, + "grad_norm": NaN, + "learning_rate": 0.00014102714778891907, + "loss": 0.0, + "step": 34338 + }, + { + "epoch": 3.2041616123915273, + "grad_norm": NaN, + "learning_rate": 0.00014101959756081996, + "loss": 0.0, + "step": 34339 + }, + { + "epoch": 3.2042549220864047, + "grad_norm": NaN, + "learning_rate": 0.00014101204735555528, + "loss": 0.0, + "step": 34340 + }, + { + "epoch": 3.204348231781282, + "grad_norm": NaN, + "learning_rate": 0.00014100449717314436, + "loss": 0.0, + "step": 34341 + }, + { + "epoch": 3.2044415414761596, + "grad_norm": NaN, + "learning_rate": 0.0001409969470136063, + "loss": 0.0, + "step": 34342 + }, + { + "epoch": 3.2045348511710365, + "grad_norm": NaN, + "learning_rate": 0.00014098939687696036, + "loss": 0.0, + "step": 34343 + }, + { + "epoch": 3.204628160865914, + "grad_norm": NaN, + "learning_rate": 0.00014098184676322576, + "loss": 0.0, + "step": 34344 + }, + { + "epoch": 3.2047214705607914, + "grad_norm": NaN, + "learning_rate": 0.00014097429667242156, + "loss": 0.0, + "step": 34345 + }, + { + "epoch": 3.204814780255669, + "grad_norm": NaN, + "learning_rate": 0.0001409667466045671, + "loss": 0.0, + "step": 34346 + }, + { + "epoch": 3.2049080899505458, + "grad_norm": NaN, + "learning_rate": 0.00014095919655968155, + "loss": 0.0, + "step": 34347 + }, + { + "epoch": 3.205001399645423, + "grad_norm": NaN, + "learning_rate": 0.00014095164653778403, + "loss": 0.0, + "step": 34348 + }, + { + "epoch": 3.2050947093403006, + "grad_norm": NaN, + "learning_rate": 0.00014094409653889382, + "loss": 0.0, + "step": 34349 + }, + { + "epoch": 3.2051880190351776, + "grad_norm": NaN, + "learning_rate": 0.0001409365465630301, + "loss": 0.0, + "step": 34350 + }, + { + "epoch": 3.205281328730055, + "grad_norm": NaN, + "learning_rate": 0.00014092899661021204, + "loss": 0.0, + "step": 34351 + }, + { + "epoch": 3.2053746384249324, + "grad_norm": NaN, + "learning_rate": 0.00014092144668045886, + "loss": 0.0, + "step": 34352 + }, + { + "epoch": 3.20546794811981, + "grad_norm": NaN, + "learning_rate": 0.00014091389677378977, + "loss": 0.0, + "step": 34353 + }, + { + "epoch": 3.205561257814687, + "grad_norm": NaN, + "learning_rate": 0.0001409063468902239, + "loss": 0.0, + "step": 34354 + }, + { + "epoch": 3.2056545675095642, + "grad_norm": NaN, + "learning_rate": 0.00014089879702978053, + "loss": 0.0, + "step": 34355 + }, + { + "epoch": 3.2057478772044417, + "grad_norm": NaN, + "learning_rate": 0.00014089124719247884, + "loss": 0.0, + "step": 34356 + }, + { + "epoch": 3.2058411868993186, + "grad_norm": NaN, + "learning_rate": 0.00014088369737833798, + "loss": 0.0, + "step": 34357 + }, + { + "epoch": 3.205934496594196, + "grad_norm": NaN, + "learning_rate": 0.0001408761475873772, + "loss": 0.0, + "step": 34358 + }, + { + "epoch": 3.2060278062890735, + "grad_norm": NaN, + "learning_rate": 0.00014086859781961566, + "loss": 0.0, + "step": 34359 + }, + { + "epoch": 3.206121115983951, + "grad_norm": NaN, + "learning_rate": 0.00014086104807507256, + "loss": 0.0, + "step": 34360 + }, + { + "epoch": 3.206214425678828, + "grad_norm": NaN, + "learning_rate": 0.0001408534983537671, + "loss": 0.0, + "step": 34361 + }, + { + "epoch": 3.2063077353737053, + "grad_norm": NaN, + "learning_rate": 0.00014084594865571855, + "loss": 0.0, + "step": 34362 + }, + { + "epoch": 3.2064010450685827, + "grad_norm": NaN, + "learning_rate": 0.00014083839898094597, + "loss": 0.0, + "step": 34363 + }, + { + "epoch": 3.20649435476346, + "grad_norm": NaN, + "learning_rate": 0.00014083084932946864, + "loss": 0.0, + "step": 34364 + }, + { + "epoch": 3.206587664458337, + "grad_norm": NaN, + "learning_rate": 0.0001408232997013058, + "loss": 0.0, + "step": 34365 + }, + { + "epoch": 3.2066809741532145, + "grad_norm": NaN, + "learning_rate": 0.00014081575009647652, + "loss": 0.0, + "step": 34366 + }, + { + "epoch": 3.206774283848092, + "grad_norm": NaN, + "learning_rate": 0.00014080820051500008, + "loss": 0.0, + "step": 34367 + }, + { + "epoch": 3.206867593542969, + "grad_norm": NaN, + "learning_rate": 0.00014080065095689573, + "loss": 0.0, + "step": 34368 + }, + { + "epoch": 3.2069609032378463, + "grad_norm": NaN, + "learning_rate": 0.0001407931014221825, + "loss": 0.0, + "step": 34369 + }, + { + "epoch": 3.2070542129327237, + "grad_norm": NaN, + "learning_rate": 0.00014078555191087975, + "loss": 0.0, + "step": 34370 + }, + { + "epoch": 3.207147522627601, + "grad_norm": NaN, + "learning_rate": 0.00014077800242300663, + "loss": 0.0, + "step": 34371 + }, + { + "epoch": 3.207240832322478, + "grad_norm": NaN, + "learning_rate": 0.00014077045295858226, + "loss": 0.0, + "step": 34372 + }, + { + "epoch": 3.2073341420173556, + "grad_norm": NaN, + "learning_rate": 0.0001407629035176259, + "loss": 0.0, + "step": 34373 + }, + { + "epoch": 3.207427451712233, + "grad_norm": NaN, + "learning_rate": 0.0001407553541001568, + "loss": 0.0, + "step": 34374 + }, + { + "epoch": 3.2075207614071104, + "grad_norm": NaN, + "learning_rate": 0.00014074780470619403, + "loss": 0.0, + "step": 34375 + }, + { + "epoch": 3.2076140711019874, + "grad_norm": NaN, + "learning_rate": 0.00014074025533575684, + "loss": 0.0, + "step": 34376 + }, + { + "epoch": 3.207707380796865, + "grad_norm": NaN, + "learning_rate": 0.00014073270598886453, + "loss": 0.0, + "step": 34377 + }, + { + "epoch": 3.207800690491742, + "grad_norm": NaN, + "learning_rate": 0.0001407251566655361, + "loss": 0.0, + "step": 34378 + }, + { + "epoch": 3.207894000186619, + "grad_norm": NaN, + "learning_rate": 0.00014071760736579088, + "loss": 0.0, + "step": 34379 + }, + { + "epoch": 3.2079873098814966, + "grad_norm": NaN, + "learning_rate": 0.00014071005808964808, + "loss": 0.0, + "step": 34380 + }, + { + "epoch": 3.208080619576374, + "grad_norm": NaN, + "learning_rate": 0.00014070250883712682, + "loss": 0.0, + "step": 34381 + }, + { + "epoch": 3.2081739292712514, + "grad_norm": NaN, + "learning_rate": 0.00014069495960824632, + "loss": 0.0, + "step": 34382 + }, + { + "epoch": 3.2082672389661284, + "grad_norm": NaN, + "learning_rate": 0.00014068741040302582, + "loss": 0.0, + "step": 34383 + }, + { + "epoch": 3.208360548661006, + "grad_norm": NaN, + "learning_rate": 0.00014067986122148443, + "loss": 0.0, + "step": 34384 + }, + { + "epoch": 3.2084538583558833, + "grad_norm": NaN, + "learning_rate": 0.00014067231206364136, + "loss": 0.0, + "step": 34385 + }, + { + "epoch": 3.2085471680507607, + "grad_norm": NaN, + "learning_rate": 0.00014066476292951593, + "loss": 0.0, + "step": 34386 + }, + { + "epoch": 3.2086404777456377, + "grad_norm": NaN, + "learning_rate": 0.0001406572138191272, + "loss": 0.0, + "step": 34387 + }, + { + "epoch": 3.208733787440515, + "grad_norm": NaN, + "learning_rate": 0.00014064966473249443, + "loss": 0.0, + "step": 34388 + }, + { + "epoch": 3.2088270971353925, + "grad_norm": NaN, + "learning_rate": 0.00014064211566963672, + "loss": 0.0, + "step": 34389 + }, + { + "epoch": 3.2089204068302695, + "grad_norm": NaN, + "learning_rate": 0.0001406345666305734, + "loss": 0.0, + "step": 34390 + }, + { + "epoch": 3.209013716525147, + "grad_norm": NaN, + "learning_rate": 0.00014062701761532364, + "loss": 0.0, + "step": 34391 + }, + { + "epoch": 3.2091070262200243, + "grad_norm": NaN, + "learning_rate": 0.0001406194686239065, + "loss": 0.0, + "step": 34392 + }, + { + "epoch": 3.2092003359149017, + "grad_norm": NaN, + "learning_rate": 0.00014061191965634133, + "loss": 0.0, + "step": 34393 + }, + { + "epoch": 3.2092936456097787, + "grad_norm": NaN, + "learning_rate": 0.00014060437071264728, + "loss": 0.0, + "step": 34394 + }, + { + "epoch": 3.209386955304656, + "grad_norm": NaN, + "learning_rate": 0.00014059682179284349, + "loss": 0.0, + "step": 34395 + }, + { + "epoch": 3.2094802649995335, + "grad_norm": NaN, + "learning_rate": 0.0001405892728969492, + "loss": 0.0, + "step": 34396 + }, + { + "epoch": 3.209573574694411, + "grad_norm": NaN, + "learning_rate": 0.00014058172402498366, + "loss": 0.0, + "step": 34397 + }, + { + "epoch": 3.209666884389288, + "grad_norm": NaN, + "learning_rate": 0.00014057417517696592, + "loss": 0.0, + "step": 34398 + }, + { + "epoch": 3.2097601940841654, + "grad_norm": NaN, + "learning_rate": 0.0001405666263529153, + "loss": 0.0, + "step": 34399 + }, + { + "epoch": 3.2098535037790428, + "grad_norm": NaN, + "learning_rate": 0.000140559077552851, + "loss": 0.0, + "step": 34400 + }, + { + "epoch": 3.2099468134739197, + "grad_norm": NaN, + "learning_rate": 0.0001405515287767921, + "loss": 0.0, + "step": 34401 + }, + { + "epoch": 3.210040123168797, + "grad_norm": NaN, + "learning_rate": 0.00014054398002475788, + "loss": 0.0, + "step": 34402 + }, + { + "epoch": 3.2101334328636746, + "grad_norm": NaN, + "learning_rate": 0.00014053643129676756, + "loss": 0.0, + "step": 34403 + }, + { + "epoch": 3.210226742558552, + "grad_norm": NaN, + "learning_rate": 0.00014052888259284024, + "loss": 0.0, + "step": 34404 + }, + { + "epoch": 3.210320052253429, + "grad_norm": NaN, + "learning_rate": 0.0001405213339129952, + "loss": 0.0, + "step": 34405 + }, + { + "epoch": 3.2104133619483064, + "grad_norm": NaN, + "learning_rate": 0.0001405137852572516, + "loss": 0.0, + "step": 34406 + }, + { + "epoch": 3.210506671643184, + "grad_norm": NaN, + "learning_rate": 0.00014050623662562858, + "loss": 0.0, + "step": 34407 + }, + { + "epoch": 3.210599981338061, + "grad_norm": NaN, + "learning_rate": 0.00014049868801814544, + "loss": 0.0, + "step": 34408 + }, + { + "epoch": 3.210693291032938, + "grad_norm": NaN, + "learning_rate": 0.00014049113943482132, + "loss": 0.0, + "step": 34409 + }, + { + "epoch": 3.2107866007278156, + "grad_norm": NaN, + "learning_rate": 0.0001404835908756754, + "loss": 0.0, + "step": 34410 + }, + { + "epoch": 3.210879910422693, + "grad_norm": NaN, + "learning_rate": 0.00014047604234072687, + "loss": 0.0, + "step": 34411 + }, + { + "epoch": 3.21097322011757, + "grad_norm": NaN, + "learning_rate": 0.00014046849382999503, + "loss": 0.0, + "step": 34412 + }, + { + "epoch": 3.2110665298124474, + "grad_norm": NaN, + "learning_rate": 0.0001404609453434989, + "loss": 0.0, + "step": 34413 + }, + { + "epoch": 3.211159839507325, + "grad_norm": NaN, + "learning_rate": 0.00014045339688125772, + "loss": 0.0, + "step": 34414 + }, + { + "epoch": 3.2112531492022023, + "grad_norm": NaN, + "learning_rate": 0.00014044584844329086, + "loss": 0.0, + "step": 34415 + }, + { + "epoch": 3.2113464588970793, + "grad_norm": NaN, + "learning_rate": 0.00014043830002961728, + "loss": 0.0, + "step": 34416 + }, + { + "epoch": 3.2114397685919567, + "grad_norm": NaN, + "learning_rate": 0.00014043075164025626, + "loss": 0.0, + "step": 34417 + }, + { + "epoch": 3.211533078286834, + "grad_norm": NaN, + "learning_rate": 0.0001404232032752271, + "loss": 0.0, + "step": 34418 + }, + { + "epoch": 3.2116263879817115, + "grad_norm": NaN, + "learning_rate": 0.00014041565493454883, + "loss": 0.0, + "step": 34419 + }, + { + "epoch": 3.2117196976765885, + "grad_norm": NaN, + "learning_rate": 0.00014040810661824067, + "loss": 0.0, + "step": 34420 + }, + { + "epoch": 3.211813007371466, + "grad_norm": NaN, + "learning_rate": 0.00014040055832632194, + "loss": 0.0, + "step": 34421 + }, + { + "epoch": 3.2119063170663433, + "grad_norm": NaN, + "learning_rate": 0.00014039301005881172, + "loss": 0.0, + "step": 34422 + }, + { + "epoch": 3.2119996267612203, + "grad_norm": NaN, + "learning_rate": 0.00014038546181572916, + "loss": 0.0, + "step": 34423 + }, + { + "epoch": 3.2120929364560977, + "grad_norm": NaN, + "learning_rate": 0.00014037791359709364, + "loss": 0.0, + "step": 34424 + }, + { + "epoch": 3.212186246150975, + "grad_norm": NaN, + "learning_rate": 0.00014037036540292416, + "loss": 0.0, + "step": 34425 + }, + { + "epoch": 3.2122795558458526, + "grad_norm": NaN, + "learning_rate": 0.00014036281723323998, + "loss": 0.0, + "step": 34426 + }, + { + "epoch": 3.2123728655407295, + "grad_norm": NaN, + "learning_rate": 0.00014035526908806034, + "loss": 0.0, + "step": 34427 + }, + { + "epoch": 3.212466175235607, + "grad_norm": NaN, + "learning_rate": 0.0001403477209674044, + "loss": 0.0, + "step": 34428 + }, + { + "epoch": 3.2125594849304844, + "grad_norm": NaN, + "learning_rate": 0.00014034017287129129, + "loss": 0.0, + "step": 34429 + }, + { + "epoch": 3.2126527946253614, + "grad_norm": NaN, + "learning_rate": 0.00014033262479974033, + "loss": 0.0, + "step": 34430 + }, + { + "epoch": 3.2127461043202388, + "grad_norm": NaN, + "learning_rate": 0.0001403250767527706, + "loss": 0.0, + "step": 34431 + }, + { + "epoch": 3.212839414015116, + "grad_norm": NaN, + "learning_rate": 0.00014031752873040139, + "loss": 0.0, + "step": 34432 + }, + { + "epoch": 3.2129327237099936, + "grad_norm": NaN, + "learning_rate": 0.00014030998073265175, + "loss": 0.0, + "step": 34433 + }, + { + "epoch": 3.2130260334048706, + "grad_norm": NaN, + "learning_rate": 0.00014030243275954098, + "loss": 0.0, + "step": 34434 + }, + { + "epoch": 3.213119343099748, + "grad_norm": NaN, + "learning_rate": 0.0001402948848110883, + "loss": 0.0, + "step": 34435 + }, + { + "epoch": 3.2132126527946254, + "grad_norm": NaN, + "learning_rate": 0.00014028733688731279, + "loss": 0.0, + "step": 34436 + }, + { + "epoch": 3.213305962489503, + "grad_norm": NaN, + "learning_rate": 0.00014027978898823373, + "loss": 0.0, + "step": 34437 + }, + { + "epoch": 3.21339927218438, + "grad_norm": NaN, + "learning_rate": 0.00014027224111387034, + "loss": 0.0, + "step": 34438 + }, + { + "epoch": 3.2134925818792572, + "grad_norm": NaN, + "learning_rate": 0.00014026469326424167, + "loss": 0.0, + "step": 34439 + }, + { + "epoch": 3.2135858915741347, + "grad_norm": NaN, + "learning_rate": 0.00014025714543936703, + "loss": 0.0, + "step": 34440 + }, + { + "epoch": 3.213679201269012, + "grad_norm": NaN, + "learning_rate": 0.00014024959763926564, + "loss": 0.0, + "step": 34441 + }, + { + "epoch": 3.213772510963889, + "grad_norm": NaN, + "learning_rate": 0.00014024204986395655, + "loss": 0.0, + "step": 34442 + }, + { + "epoch": 3.2138658206587665, + "grad_norm": NaN, + "learning_rate": 0.0001402345021134591, + "loss": 0.0, + "step": 34443 + }, + { + "epoch": 3.213959130353644, + "grad_norm": NaN, + "learning_rate": 0.0001402269543877924, + "loss": 0.0, + "step": 34444 + }, + { + "epoch": 3.214052440048521, + "grad_norm": NaN, + "learning_rate": 0.00014021940668697562, + "loss": 0.0, + "step": 34445 + }, + { + "epoch": 3.2141457497433983, + "grad_norm": NaN, + "learning_rate": 0.00014021185901102803, + "loss": 0.0, + "step": 34446 + }, + { + "epoch": 3.2142390594382757, + "grad_norm": NaN, + "learning_rate": 0.00014020431135996881, + "loss": 0.0, + "step": 34447 + }, + { + "epoch": 3.214332369133153, + "grad_norm": NaN, + "learning_rate": 0.00014019676373381706, + "loss": 0.0, + "step": 34448 + }, + { + "epoch": 3.21442567882803, + "grad_norm": NaN, + "learning_rate": 0.00014018921613259208, + "loss": 0.0, + "step": 34449 + }, + { + "epoch": 3.2145189885229075, + "grad_norm": NaN, + "learning_rate": 0.00014018166855631303, + "loss": 0.0, + "step": 34450 + }, + { + "epoch": 3.214612298217785, + "grad_norm": NaN, + "learning_rate": 0.00014017412100499906, + "loss": 0.0, + "step": 34451 + }, + { + "epoch": 3.214705607912662, + "grad_norm": NaN, + "learning_rate": 0.00014016657347866934, + "loss": 0.0, + "step": 34452 + }, + { + "epoch": 3.2147989176075393, + "grad_norm": NaN, + "learning_rate": 0.00014015902597734324, + "loss": 0.0, + "step": 34453 + }, + { + "epoch": 3.2148922273024167, + "grad_norm": NaN, + "learning_rate": 0.00014015147850103972, + "loss": 0.0, + "step": 34454 + }, + { + "epoch": 3.214985536997294, + "grad_norm": NaN, + "learning_rate": 0.00014014393104977806, + "loss": 0.0, + "step": 34455 + }, + { + "epoch": 3.215078846692171, + "grad_norm": NaN, + "learning_rate": 0.00014013638362357753, + "loss": 0.0, + "step": 34456 + }, + { + "epoch": 3.2151721563870486, + "grad_norm": NaN, + "learning_rate": 0.00014012883622245724, + "loss": 0.0, + "step": 34457 + }, + { + "epoch": 3.215265466081926, + "grad_norm": NaN, + "learning_rate": 0.00014012128884643635, + "loss": 0.0, + "step": 34458 + }, + { + "epoch": 3.2153587757768034, + "grad_norm": NaN, + "learning_rate": 0.00014011374149553417, + "loss": 0.0, + "step": 34459 + }, + { + "epoch": 3.2154520854716804, + "grad_norm": NaN, + "learning_rate": 0.00014010619416976978, + "loss": 0.0, + "step": 34460 + }, + { + "epoch": 3.215545395166558, + "grad_norm": NaN, + "learning_rate": 0.00014009864686916236, + "loss": 0.0, + "step": 34461 + }, + { + "epoch": 3.215638704861435, + "grad_norm": NaN, + "learning_rate": 0.00014009109959373122, + "loss": 0.0, + "step": 34462 + }, + { + "epoch": 3.2157320145563126, + "grad_norm": NaN, + "learning_rate": 0.00014008355234349545, + "loss": 0.0, + "step": 34463 + }, + { + "epoch": 3.2158253242511896, + "grad_norm": NaN, + "learning_rate": 0.00014007600511847424, + "loss": 0.0, + "step": 34464 + }, + { + "epoch": 3.215918633946067, + "grad_norm": NaN, + "learning_rate": 0.00014006845791868688, + "loss": 0.0, + "step": 34465 + }, + { + "epoch": 3.2160119436409444, + "grad_norm": NaN, + "learning_rate": 0.00014006091074415247, + "loss": 0.0, + "step": 34466 + }, + { + "epoch": 3.2161052533358214, + "grad_norm": NaN, + "learning_rate": 0.00014005336359489015, + "loss": 0.0, + "step": 34467 + }, + { + "epoch": 3.216198563030699, + "grad_norm": NaN, + "learning_rate": 0.00014004581647091927, + "loss": 0.0, + "step": 34468 + }, + { + "epoch": 3.2162918727255763, + "grad_norm": NaN, + "learning_rate": 0.0001400382693722589, + "loss": 0.0, + "step": 34469 + }, + { + "epoch": 3.2163851824204537, + "grad_norm": NaN, + "learning_rate": 0.0001400307222989282, + "loss": 0.0, + "step": 34470 + }, + { + "epoch": 3.2164784921153307, + "grad_norm": NaN, + "learning_rate": 0.00014002317525094648, + "loss": 0.0, + "step": 34471 + }, + { + "epoch": 3.216571801810208, + "grad_norm": NaN, + "learning_rate": 0.0001400156282283329, + "loss": 0.0, + "step": 34472 + }, + { + "epoch": 3.2166651115050855, + "grad_norm": NaN, + "learning_rate": 0.0001400080812311065, + "loss": 0.0, + "step": 34473 + }, + { + "epoch": 3.2167584211999625, + "grad_norm": NaN, + "learning_rate": 0.00014000053425928673, + "loss": 0.0, + "step": 34474 + }, + { + "epoch": 3.21685173089484, + "grad_norm": NaN, + "learning_rate": 0.00013999298731289257, + "loss": 0.0, + "step": 34475 + }, + { + "epoch": 3.2169450405897173, + "grad_norm": NaN, + "learning_rate": 0.0001399854403919433, + "loss": 0.0, + "step": 34476 + }, + { + "epoch": 3.2170383502845947, + "grad_norm": NaN, + "learning_rate": 0.0001399778934964581, + "loss": 0.0, + "step": 34477 + }, + { + "epoch": 3.2171316599794717, + "grad_norm": NaN, + "learning_rate": 0.0001399703466264561, + "loss": 0.0, + "step": 34478 + }, + { + "epoch": 3.217224969674349, + "grad_norm": NaN, + "learning_rate": 0.00013996279978195658, + "loss": 0.0, + "step": 34479 + }, + { + "epoch": 3.2173182793692265, + "grad_norm": NaN, + "learning_rate": 0.00013995525296297865, + "loss": 0.0, + "step": 34480 + }, + { + "epoch": 3.217411589064104, + "grad_norm": NaN, + "learning_rate": 0.00013994770616954156, + "loss": 0.0, + "step": 34481 + }, + { + "epoch": 3.217504898758981, + "grad_norm": NaN, + "learning_rate": 0.0001399401594016645, + "loss": 0.0, + "step": 34482 + }, + { + "epoch": 3.2175982084538584, + "grad_norm": NaN, + "learning_rate": 0.00013993261265936657, + "loss": 0.0, + "step": 34483 + }, + { + "epoch": 3.2176915181487358, + "grad_norm": NaN, + "learning_rate": 0.00013992506594266707, + "loss": 0.0, + "step": 34484 + }, + { + "epoch": 3.2177848278436127, + "grad_norm": NaN, + "learning_rate": 0.00013991751925158515, + "loss": 0.0, + "step": 34485 + }, + { + "epoch": 3.21787813753849, + "grad_norm": NaN, + "learning_rate": 0.00013990997258613993, + "loss": 0.0, + "step": 34486 + }, + { + "epoch": 3.2179714472333676, + "grad_norm": NaN, + "learning_rate": 0.00013990242594635072, + "loss": 0.0, + "step": 34487 + }, + { + "epoch": 3.218064756928245, + "grad_norm": NaN, + "learning_rate": 0.00013989487933223666, + "loss": 0.0, + "step": 34488 + }, + { + "epoch": 3.218158066623122, + "grad_norm": NaN, + "learning_rate": 0.00013988733274381687, + "loss": 0.0, + "step": 34489 + }, + { + "epoch": 3.2182513763179994, + "grad_norm": NaN, + "learning_rate": 0.0001398797861811106, + "loss": 0.0, + "step": 34490 + }, + { + "epoch": 3.218344686012877, + "grad_norm": NaN, + "learning_rate": 0.0001398722396441371, + "loss": 0.0, + "step": 34491 + }, + { + "epoch": 3.2184379957077542, + "grad_norm": NaN, + "learning_rate": 0.00013986469313291544, + "loss": 0.0, + "step": 34492 + }, + { + "epoch": 3.218531305402631, + "grad_norm": NaN, + "learning_rate": 0.00013985714664746486, + "loss": 0.0, + "step": 34493 + }, + { + "epoch": 3.2186246150975086, + "grad_norm": NaN, + "learning_rate": 0.00013984960018780462, + "loss": 0.0, + "step": 34494 + }, + { + "epoch": 3.218717924792386, + "grad_norm": NaN, + "learning_rate": 0.00013984205375395378, + "loss": 0.0, + "step": 34495 + }, + { + "epoch": 3.218811234487263, + "grad_norm": NaN, + "learning_rate": 0.00013983450734593157, + "loss": 0.0, + "step": 34496 + }, + { + "epoch": 3.2189045441821404, + "grad_norm": NaN, + "learning_rate": 0.00013982696096375726, + "loss": 0.0, + "step": 34497 + }, + { + "epoch": 3.218997853877018, + "grad_norm": NaN, + "learning_rate": 0.00013981941460744995, + "loss": 0.0, + "step": 34498 + }, + { + "epoch": 3.2190911635718953, + "grad_norm": NaN, + "learning_rate": 0.0001398118682770288, + "loss": 0.0, + "step": 34499 + }, + { + "epoch": 3.2191844732667723, + "grad_norm": NaN, + "learning_rate": 0.00013980432197251315, + "loss": 0.0, + "step": 34500 + }, + { + "epoch": 3.2192777829616497, + "grad_norm": NaN, + "learning_rate": 0.00013979677569392202, + "loss": 0.0, + "step": 34501 + }, + { + "epoch": 3.219371092656527, + "grad_norm": NaN, + "learning_rate": 0.00013978922944127468, + "loss": 0.0, + "step": 34502 + }, + { + "epoch": 3.219464402351404, + "grad_norm": NaN, + "learning_rate": 0.00013978168321459033, + "loss": 0.0, + "step": 34503 + }, + { + "epoch": 3.2195577120462815, + "grad_norm": NaN, + "learning_rate": 0.00013977413701388812, + "loss": 0.0, + "step": 34504 + }, + { + "epoch": 3.219651021741159, + "grad_norm": NaN, + "learning_rate": 0.0001397665908391872, + "loss": 0.0, + "step": 34505 + }, + { + "epoch": 3.2197443314360363, + "grad_norm": NaN, + "learning_rate": 0.0001397590446905069, + "loss": 0.0, + "step": 34506 + }, + { + "epoch": 3.2198376411309133, + "grad_norm": NaN, + "learning_rate": 0.00013975149856786624, + "loss": 0.0, + "step": 34507 + }, + { + "epoch": 3.2199309508257907, + "grad_norm": NaN, + "learning_rate": 0.0001397439524712845, + "loss": 0.0, + "step": 34508 + }, + { + "epoch": 3.220024260520668, + "grad_norm": NaN, + "learning_rate": 0.00013973640640078087, + "loss": 0.0, + "step": 34509 + }, + { + "epoch": 3.2201175702155456, + "grad_norm": NaN, + "learning_rate": 0.0001397288603563745, + "loss": 0.0, + "step": 34510 + }, + { + "epoch": 3.2202108799104225, + "grad_norm": NaN, + "learning_rate": 0.0001397213143380846, + "loss": 0.0, + "step": 34511 + }, + { + "epoch": 3.2203041896053, + "grad_norm": NaN, + "learning_rate": 0.00013971376834593036, + "loss": 0.0, + "step": 34512 + }, + { + "epoch": 3.2203974993001774, + "grad_norm": NaN, + "learning_rate": 0.00013970622237993098, + "loss": 0.0, + "step": 34513 + }, + { + "epoch": 3.220490808995055, + "grad_norm": NaN, + "learning_rate": 0.00013969867644010557, + "loss": 0.0, + "step": 34514 + }, + { + "epoch": 3.2205841186899318, + "grad_norm": NaN, + "learning_rate": 0.00013969113052647345, + "loss": 0.0, + "step": 34515 + }, + { + "epoch": 3.220677428384809, + "grad_norm": NaN, + "learning_rate": 0.00013968358463905368, + "loss": 0.0, + "step": 34516 + }, + { + "epoch": 3.2207707380796866, + "grad_norm": NaN, + "learning_rate": 0.00013967603877786548, + "loss": 0.0, + "step": 34517 + }, + { + "epoch": 3.2208640477745636, + "grad_norm": NaN, + "learning_rate": 0.0001396684929429281, + "loss": 0.0, + "step": 34518 + }, + { + "epoch": 3.220957357469441, + "grad_norm": NaN, + "learning_rate": 0.00013966094713426067, + "loss": 0.0, + "step": 34519 + }, + { + "epoch": 3.2210506671643184, + "grad_norm": NaN, + "learning_rate": 0.00013965340135188236, + "loss": 0.0, + "step": 34520 + }, + { + "epoch": 3.221143976859196, + "grad_norm": NaN, + "learning_rate": 0.00013964585559581244, + "loss": 0.0, + "step": 34521 + }, + { + "epoch": 3.221237286554073, + "grad_norm": NaN, + "learning_rate": 0.00013963830986607, + "loss": 0.0, + "step": 34522 + }, + { + "epoch": 3.2213305962489502, + "grad_norm": NaN, + "learning_rate": 0.00013963076416267432, + "loss": 0.0, + "step": 34523 + }, + { + "epoch": 3.2214239059438277, + "grad_norm": NaN, + "learning_rate": 0.00013962321848564447, + "loss": 0.0, + "step": 34524 + }, + { + "epoch": 3.2215172156387046, + "grad_norm": NaN, + "learning_rate": 0.00013961567283499974, + "loss": 0.0, + "step": 34525 + }, + { + "epoch": 3.221610525333582, + "grad_norm": NaN, + "learning_rate": 0.00013960812721075928, + "loss": 0.0, + "step": 34526 + }, + { + "epoch": 3.2217038350284595, + "grad_norm": NaN, + "learning_rate": 0.00013960058161294222, + "loss": 0.0, + "step": 34527 + }, + { + "epoch": 3.221797144723337, + "grad_norm": NaN, + "learning_rate": 0.00013959303604156782, + "loss": 0.0, + "step": 34528 + }, + { + "epoch": 3.221890454418214, + "grad_norm": NaN, + "learning_rate": 0.0001395854904966553, + "loss": 0.0, + "step": 34529 + }, + { + "epoch": 3.2219837641130913, + "grad_norm": NaN, + "learning_rate": 0.00013957794497822376, + "loss": 0.0, + "step": 34530 + }, + { + "epoch": 3.2220770738079687, + "grad_norm": NaN, + "learning_rate": 0.00013957039948629237, + "loss": 0.0, + "step": 34531 + }, + { + "epoch": 3.222170383502846, + "grad_norm": NaN, + "learning_rate": 0.00013956285402088044, + "loss": 0.0, + "step": 34532 + }, + { + "epoch": 3.222263693197723, + "grad_norm": NaN, + "learning_rate": 0.00013955530858200706, + "loss": 0.0, + "step": 34533 + }, + { + "epoch": 3.2223570028926005, + "grad_norm": NaN, + "learning_rate": 0.0001395477631696914, + "loss": 0.0, + "step": 34534 + }, + { + "epoch": 3.222450312587478, + "grad_norm": NaN, + "learning_rate": 0.00013954021778395275, + "loss": 0.0, + "step": 34535 + }, + { + "epoch": 3.2225436222823554, + "grad_norm": NaN, + "learning_rate": 0.0001395326724248102, + "loss": 0.0, + "step": 34536 + }, + { + "epoch": 3.2226369319772323, + "grad_norm": NaN, + "learning_rate": 0.0001395251270922829, + "loss": 0.0, + "step": 34537 + }, + { + "epoch": 3.2227302416721098, + "grad_norm": NaN, + "learning_rate": 0.0001395175817863902, + "loss": 0.0, + "step": 34538 + }, + { + "epoch": 3.222823551366987, + "grad_norm": NaN, + "learning_rate": 0.00013951003650715114, + "loss": 0.0, + "step": 34539 + }, + { + "epoch": 3.222916861061864, + "grad_norm": NaN, + "learning_rate": 0.0001395024912545849, + "loss": 0.0, + "step": 34540 + }, + { + "epoch": 3.2230101707567416, + "grad_norm": NaN, + "learning_rate": 0.0001394949460287108, + "loss": 0.0, + "step": 34541 + }, + { + "epoch": 3.223103480451619, + "grad_norm": NaN, + "learning_rate": 0.0001394874008295479, + "loss": 0.0, + "step": 34542 + }, + { + "epoch": 3.2231967901464964, + "grad_norm": NaN, + "learning_rate": 0.0001394798556571154, + "loss": 0.0, + "step": 34543 + }, + { + "epoch": 3.2232900998413734, + "grad_norm": NaN, + "learning_rate": 0.00013947231051143256, + "loss": 0.0, + "step": 34544 + }, + { + "epoch": 3.223383409536251, + "grad_norm": NaN, + "learning_rate": 0.0001394647653925185, + "loss": 0.0, + "step": 34545 + }, + { + "epoch": 3.223476719231128, + "grad_norm": NaN, + "learning_rate": 0.00013945722030039238, + "loss": 0.0, + "step": 34546 + }, + { + "epoch": 3.223570028926005, + "grad_norm": NaN, + "learning_rate": 0.0001394496752350735, + "loss": 0.0, + "step": 34547 + }, + { + "epoch": 3.2236633386208826, + "grad_norm": NaN, + "learning_rate": 0.0001394421301965809, + "loss": 0.0, + "step": 34548 + }, + { + "epoch": 3.22375664831576, + "grad_norm": NaN, + "learning_rate": 0.00013943458518493384, + "loss": 0.0, + "step": 34549 + }, + { + "epoch": 3.2238499580106375, + "grad_norm": NaN, + "learning_rate": 0.00013942704020015158, + "loss": 0.0, + "step": 34550 + }, + { + "epoch": 3.2239432677055144, + "grad_norm": NaN, + "learning_rate": 0.00013941949524225316, + "loss": 0.0, + "step": 34551 + }, + { + "epoch": 3.224036577400392, + "grad_norm": NaN, + "learning_rate": 0.0001394119503112578, + "loss": 0.0, + "step": 34552 + }, + { + "epoch": 3.2241298870952693, + "grad_norm": NaN, + "learning_rate": 0.0001394044054071848, + "loss": 0.0, + "step": 34553 + }, + { + "epoch": 3.2242231967901467, + "grad_norm": NaN, + "learning_rate": 0.0001393968605300532, + "loss": 0.0, + "step": 34554 + }, + { + "epoch": 3.2243165064850237, + "grad_norm": NaN, + "learning_rate": 0.00013938931567988224, + "loss": 0.0, + "step": 34555 + }, + { + "epoch": 3.224409816179901, + "grad_norm": NaN, + "learning_rate": 0.00013938177085669114, + "loss": 0.0, + "step": 34556 + }, + { + "epoch": 3.2245031258747785, + "grad_norm": NaN, + "learning_rate": 0.00013937422606049904, + "loss": 0.0, + "step": 34557 + }, + { + "epoch": 3.224596435569656, + "grad_norm": NaN, + "learning_rate": 0.0001393666812913251, + "loss": 0.0, + "step": 34558 + }, + { + "epoch": 3.224689745264533, + "grad_norm": NaN, + "learning_rate": 0.0001393591365491886, + "loss": 0.0, + "step": 34559 + }, + { + "epoch": 3.2247830549594103, + "grad_norm": NaN, + "learning_rate": 0.00013935159183410862, + "loss": 0.0, + "step": 34560 + }, + { + "epoch": 3.2248763646542877, + "grad_norm": NaN, + "learning_rate": 0.0001393440471461044, + "loss": 0.0, + "step": 34561 + }, + { + "epoch": 3.2249696743491647, + "grad_norm": NaN, + "learning_rate": 0.00013933650248519513, + "loss": 0.0, + "step": 34562 + }, + { + "epoch": 3.225062984044042, + "grad_norm": NaN, + "learning_rate": 0.00013932895785139993, + "loss": 0.0, + "step": 34563 + }, + { + "epoch": 3.2251562937389195, + "grad_norm": NaN, + "learning_rate": 0.00013932141324473805, + "loss": 0.0, + "step": 34564 + }, + { + "epoch": 3.225249603433797, + "grad_norm": NaN, + "learning_rate": 0.00013931386866522868, + "loss": 0.0, + "step": 34565 + }, + { + "epoch": 3.225342913128674, + "grad_norm": NaN, + "learning_rate": 0.00013930632411289094, + "loss": 0.0, + "step": 34566 + }, + { + "epoch": 3.2254362228235514, + "grad_norm": NaN, + "learning_rate": 0.00013929877958774407, + "loss": 0.0, + "step": 34567 + }, + { + "epoch": 3.2255295325184288, + "grad_norm": NaN, + "learning_rate": 0.00013929123508980726, + "loss": 0.0, + "step": 34568 + }, + { + "epoch": 3.2256228422133058, + "grad_norm": NaN, + "learning_rate": 0.0001392836906190996, + "loss": 0.0, + "step": 34569 + }, + { + "epoch": 3.225716151908183, + "grad_norm": NaN, + "learning_rate": 0.0001392761461756404, + "loss": 0.0, + "step": 34570 + }, + { + "epoch": 3.2258094616030606, + "grad_norm": NaN, + "learning_rate": 0.00013926860175944876, + "loss": 0.0, + "step": 34571 + }, + { + "epoch": 3.225902771297938, + "grad_norm": NaN, + "learning_rate": 0.00013926105737054387, + "loss": 0.0, + "step": 34572 + }, + { + "epoch": 3.225996080992815, + "grad_norm": NaN, + "learning_rate": 0.00013925351300894497, + "loss": 0.0, + "step": 34573 + }, + { + "epoch": 3.2260893906876924, + "grad_norm": NaN, + "learning_rate": 0.00013924596867467118, + "loss": 0.0, + "step": 34574 + }, + { + "epoch": 3.22618270038257, + "grad_norm": NaN, + "learning_rate": 0.00013923842436774168, + "loss": 0.0, + "step": 34575 + }, + { + "epoch": 3.2262760100774472, + "grad_norm": NaN, + "learning_rate": 0.00013923088008817574, + "loss": 0.0, + "step": 34576 + }, + { + "epoch": 3.226369319772324, + "grad_norm": NaN, + "learning_rate": 0.00013922333583599246, + "loss": 0.0, + "step": 34577 + }, + { + "epoch": 3.2264626294672016, + "grad_norm": NaN, + "learning_rate": 0.00013921579161121098, + "loss": 0.0, + "step": 34578 + }, + { + "epoch": 3.226555939162079, + "grad_norm": NaN, + "learning_rate": 0.00013920824741385065, + "loss": 0.0, + "step": 34579 + }, + { + "epoch": 3.226649248856956, + "grad_norm": NaN, + "learning_rate": 0.00013920070324393048, + "loss": 0.0, + "step": 34580 + }, + { + "epoch": 3.2267425585518335, + "grad_norm": NaN, + "learning_rate": 0.0001391931591014697, + "loss": 0.0, + "step": 34581 + }, + { + "epoch": 3.226835868246711, + "grad_norm": NaN, + "learning_rate": 0.0001391856149864876, + "loss": 0.0, + "step": 34582 + }, + { + "epoch": 3.2269291779415883, + "grad_norm": NaN, + "learning_rate": 0.00013917807089900322, + "loss": 0.0, + "step": 34583 + }, + { + "epoch": 3.2270224876364653, + "grad_norm": NaN, + "learning_rate": 0.0001391705268390358, + "loss": 0.0, + "step": 34584 + }, + { + "epoch": 3.2271157973313427, + "grad_norm": NaN, + "learning_rate": 0.00013916298280660457, + "loss": 0.0, + "step": 34585 + }, + { + "epoch": 3.22720910702622, + "grad_norm": NaN, + "learning_rate": 0.00013915543880172864, + "loss": 0.0, + "step": 34586 + }, + { + "epoch": 3.2273024167210975, + "grad_norm": NaN, + "learning_rate": 0.00013914789482442718, + "loss": 0.0, + "step": 34587 + }, + { + "epoch": 3.2273957264159745, + "grad_norm": NaN, + "learning_rate": 0.00013914035087471947, + "loss": 0.0, + "step": 34588 + }, + { + "epoch": 3.227489036110852, + "grad_norm": NaN, + "learning_rate": 0.00013913280695262458, + "loss": 0.0, + "step": 34589 + }, + { + "epoch": 3.2275823458057293, + "grad_norm": NaN, + "learning_rate": 0.00013912526305816174, + "loss": 0.0, + "step": 34590 + }, + { + "epoch": 3.2276756555006063, + "grad_norm": NaN, + "learning_rate": 0.00013911771919135018, + "loss": 0.0, + "step": 34591 + }, + { + "epoch": 3.2277689651954837, + "grad_norm": NaN, + "learning_rate": 0.00013911017535220903, + "loss": 0.0, + "step": 34592 + }, + { + "epoch": 3.227862274890361, + "grad_norm": NaN, + "learning_rate": 0.00013910263154075743, + "loss": 0.0, + "step": 34593 + }, + { + "epoch": 3.2279555845852386, + "grad_norm": NaN, + "learning_rate": 0.00013909508775701465, + "loss": 0.0, + "step": 34594 + }, + { + "epoch": 3.2280488942801155, + "grad_norm": NaN, + "learning_rate": 0.00013908754400099982, + "loss": 0.0, + "step": 34595 + }, + { + "epoch": 3.228142203974993, + "grad_norm": NaN, + "learning_rate": 0.00013908000027273212, + "loss": 0.0, + "step": 34596 + }, + { + "epoch": 3.2282355136698704, + "grad_norm": NaN, + "learning_rate": 0.0001390724565722308, + "loss": 0.0, + "step": 34597 + }, + { + "epoch": 3.228328823364748, + "grad_norm": NaN, + "learning_rate": 0.00013906491289951493, + "loss": 0.0, + "step": 34598 + }, + { + "epoch": 3.2284221330596248, + "grad_norm": NaN, + "learning_rate": 0.00013905736925460372, + "loss": 0.0, + "step": 34599 + }, + { + "epoch": 3.228515442754502, + "grad_norm": NaN, + "learning_rate": 0.00013904982563751647, + "loss": 0.0, + "step": 34600 + }, + { + "epoch": 3.2286087524493796, + "grad_norm": NaN, + "learning_rate": 0.0001390422820482722, + "loss": 0.0, + "step": 34601 + }, + { + "epoch": 3.2287020621442566, + "grad_norm": NaN, + "learning_rate": 0.00013903473848689017, + "loss": 0.0, + "step": 34602 + }, + { + "epoch": 3.228795371839134, + "grad_norm": NaN, + "learning_rate": 0.00013902719495338962, + "loss": 0.0, + "step": 34603 + }, + { + "epoch": 3.2288886815340114, + "grad_norm": NaN, + "learning_rate": 0.00013901965144778956, + "loss": 0.0, + "step": 34604 + }, + { + "epoch": 3.228981991228889, + "grad_norm": NaN, + "learning_rate": 0.00013901210797010932, + "loss": 0.0, + "step": 34605 + }, + { + "epoch": 3.229075300923766, + "grad_norm": NaN, + "learning_rate": 0.00013900456452036806, + "loss": 0.0, + "step": 34606 + }, + { + "epoch": 3.2291686106186432, + "grad_norm": NaN, + "learning_rate": 0.00013899702109858487, + "loss": 0.0, + "step": 34607 + }, + { + "epoch": 3.2292619203135207, + "grad_norm": NaN, + "learning_rate": 0.00013898947770477903, + "loss": 0.0, + "step": 34608 + }, + { + "epoch": 3.229355230008398, + "grad_norm": NaN, + "learning_rate": 0.00013898193433896976, + "loss": 0.0, + "step": 34609 + }, + { + "epoch": 3.229448539703275, + "grad_norm": NaN, + "learning_rate": 0.00013897439100117606, + "loss": 0.0, + "step": 34610 + }, + { + "epoch": 3.2295418493981525, + "grad_norm": NaN, + "learning_rate": 0.00013896684769141727, + "loss": 0.0, + "step": 34611 + }, + { + "epoch": 3.22963515909303, + "grad_norm": NaN, + "learning_rate": 0.00013895930440971252, + "loss": 0.0, + "step": 34612 + }, + { + "epoch": 3.229728468787907, + "grad_norm": NaN, + "learning_rate": 0.00013895176115608093, + "loss": 0.0, + "step": 34613 + }, + { + "epoch": 3.2298217784827843, + "grad_norm": NaN, + "learning_rate": 0.00013894421793054183, + "loss": 0.0, + "step": 34614 + }, + { + "epoch": 3.2299150881776617, + "grad_norm": NaN, + "learning_rate": 0.00013893667473311426, + "loss": 0.0, + "step": 34615 + }, + { + "epoch": 3.230008397872539, + "grad_norm": NaN, + "learning_rate": 0.00013892913156381741, + "loss": 0.0, + "step": 34616 + }, + { + "epoch": 3.230101707567416, + "grad_norm": NaN, + "learning_rate": 0.00013892158842267058, + "loss": 0.0, + "step": 34617 + }, + { + "epoch": 3.2301950172622935, + "grad_norm": NaN, + "learning_rate": 0.00013891404530969284, + "loss": 0.0, + "step": 34618 + }, + { + "epoch": 3.230288326957171, + "grad_norm": NaN, + "learning_rate": 0.00013890650222490335, + "loss": 0.0, + "step": 34619 + }, + { + "epoch": 3.230381636652048, + "grad_norm": NaN, + "learning_rate": 0.0001388989591683214, + "loss": 0.0, + "step": 34620 + }, + { + "epoch": 3.2304749463469253, + "grad_norm": NaN, + "learning_rate": 0.00013889141613996608, + "loss": 0.0, + "step": 34621 + }, + { + "epoch": 3.2305682560418028, + "grad_norm": NaN, + "learning_rate": 0.00013888387313985658, + "loss": 0.0, + "step": 34622 + }, + { + "epoch": 3.23066156573668, + "grad_norm": NaN, + "learning_rate": 0.00013887633016801214, + "loss": 0.0, + "step": 34623 + }, + { + "epoch": 3.230754875431557, + "grad_norm": NaN, + "learning_rate": 0.00013886878722445188, + "loss": 0.0, + "step": 34624 + }, + { + "epoch": 3.2308481851264346, + "grad_norm": NaN, + "learning_rate": 0.00013886124430919497, + "loss": 0.0, + "step": 34625 + }, + { + "epoch": 3.230941494821312, + "grad_norm": NaN, + "learning_rate": 0.00013885370142226067, + "loss": 0.0, + "step": 34626 + }, + { + "epoch": 3.2310348045161894, + "grad_norm": NaN, + "learning_rate": 0.0001388461585636681, + "loss": 0.0, + "step": 34627 + }, + { + "epoch": 3.2311281142110664, + "grad_norm": NaN, + "learning_rate": 0.00013883861573343637, + "loss": 0.0, + "step": 34628 + }, + { + "epoch": 3.231221423905944, + "grad_norm": NaN, + "learning_rate": 0.00013883107293158483, + "loss": 0.0, + "step": 34629 + }, + { + "epoch": 3.231314733600821, + "grad_norm": NaN, + "learning_rate": 0.0001388235301581325, + "loss": 0.0, + "step": 34630 + }, + { + "epoch": 3.2314080432956986, + "grad_norm": NaN, + "learning_rate": 0.0001388159874130986, + "loss": 0.0, + "step": 34631 + }, + { + "epoch": 3.2315013529905756, + "grad_norm": NaN, + "learning_rate": 0.00013880844469650242, + "loss": 0.0, + "step": 34632 + }, + { + "epoch": 3.231594662685453, + "grad_norm": NaN, + "learning_rate": 0.000138800902008363, + "loss": 0.0, + "step": 34633 + }, + { + "epoch": 3.2316879723803305, + "grad_norm": NaN, + "learning_rate": 0.00013879335934869954, + "loss": 0.0, + "step": 34634 + }, + { + "epoch": 3.2317812820752074, + "grad_norm": NaN, + "learning_rate": 0.00013878581671753133, + "loss": 0.0, + "step": 34635 + }, + { + "epoch": 3.231874591770085, + "grad_norm": NaN, + "learning_rate": 0.0001387782741148774, + "loss": 0.0, + "step": 34636 + }, + { + "epoch": 3.2319679014649623, + "grad_norm": NaN, + "learning_rate": 0.00013877073154075698, + "loss": 0.0, + "step": 34637 + }, + { + "epoch": 3.2320612111598397, + "grad_norm": NaN, + "learning_rate": 0.00013876318899518932, + "loss": 0.0, + "step": 34638 + }, + { + "epoch": 3.2321545208547167, + "grad_norm": NaN, + "learning_rate": 0.0001387556464781935, + "loss": 0.0, + "step": 34639 + }, + { + "epoch": 3.232247830549594, + "grad_norm": NaN, + "learning_rate": 0.00013874810398978874, + "loss": 0.0, + "step": 34640 + }, + { + "epoch": 3.2323411402444715, + "grad_norm": NaN, + "learning_rate": 0.00013874056152999426, + "loss": 0.0, + "step": 34641 + }, + { + "epoch": 3.2324344499393485, + "grad_norm": NaN, + "learning_rate": 0.00013873301909882914, + "loss": 0.0, + "step": 34642 + }, + { + "epoch": 3.232527759634226, + "grad_norm": NaN, + "learning_rate": 0.00013872547669631266, + "loss": 0.0, + "step": 34643 + }, + { + "epoch": 3.2326210693291033, + "grad_norm": NaN, + "learning_rate": 0.00013871793432246397, + "loss": 0.0, + "step": 34644 + }, + { + "epoch": 3.2327143790239807, + "grad_norm": NaN, + "learning_rate": 0.00013871039197730216, + "loss": 0.0, + "step": 34645 + }, + { + "epoch": 3.2328076887188577, + "grad_norm": NaN, + "learning_rate": 0.00013870284966084652, + "loss": 0.0, + "step": 34646 + }, + { + "epoch": 3.232900998413735, + "grad_norm": NaN, + "learning_rate": 0.0001386953073731162, + "loss": 0.0, + "step": 34647 + }, + { + "epoch": 3.2329943081086125, + "grad_norm": NaN, + "learning_rate": 0.00013868776511413033, + "loss": 0.0, + "step": 34648 + }, + { + "epoch": 3.23308761780349, + "grad_norm": NaN, + "learning_rate": 0.00013868022288390813, + "loss": 0.0, + "step": 34649 + }, + { + "epoch": 3.233180927498367, + "grad_norm": NaN, + "learning_rate": 0.00013867268068246883, + "loss": 0.0, + "step": 34650 + }, + { + "epoch": 3.2332742371932444, + "grad_norm": NaN, + "learning_rate": 0.00013866513850983145, + "loss": 0.0, + "step": 34651 + }, + { + "epoch": 3.233367546888122, + "grad_norm": NaN, + "learning_rate": 0.0001386575963660153, + "loss": 0.0, + "step": 34652 + }, + { + "epoch": 3.233460856582999, + "grad_norm": NaN, + "learning_rate": 0.0001386500542510396, + "loss": 0.0, + "step": 34653 + }, + { + "epoch": 3.233554166277876, + "grad_norm": NaN, + "learning_rate": 0.00013864251216492336, + "loss": 0.0, + "step": 34654 + }, + { + "epoch": 3.2336474759727536, + "grad_norm": NaN, + "learning_rate": 0.0001386349701076859, + "loss": 0.0, + "step": 34655 + }, + { + "epoch": 3.233740785667631, + "grad_norm": NaN, + "learning_rate": 0.0001386274280793463, + "loss": 0.0, + "step": 34656 + }, + { + "epoch": 3.233834095362508, + "grad_norm": NaN, + "learning_rate": 0.0001386198860799238, + "loss": 0.0, + "step": 34657 + }, + { + "epoch": 3.2339274050573854, + "grad_norm": NaN, + "learning_rate": 0.00013861234410943757, + "loss": 0.0, + "step": 34658 + }, + { + "epoch": 3.234020714752263, + "grad_norm": NaN, + "learning_rate": 0.00013860480216790677, + "loss": 0.0, + "step": 34659 + }, + { + "epoch": 3.2341140244471402, + "grad_norm": NaN, + "learning_rate": 0.00013859726025535054, + "loss": 0.0, + "step": 34660 + }, + { + "epoch": 3.234207334142017, + "grad_norm": NaN, + "learning_rate": 0.0001385897183717882, + "loss": 0.0, + "step": 34661 + }, + { + "epoch": 3.2343006438368946, + "grad_norm": NaN, + "learning_rate": 0.00013858217651723873, + "loss": 0.0, + "step": 34662 + }, + { + "epoch": 3.234393953531772, + "grad_norm": NaN, + "learning_rate": 0.0001385746346917214, + "loss": 0.0, + "step": 34663 + }, + { + "epoch": 3.234487263226649, + "grad_norm": NaN, + "learning_rate": 0.00013856709289525546, + "loss": 0.0, + "step": 34664 + }, + { + "epoch": 3.2345805729215265, + "grad_norm": NaN, + "learning_rate": 0.00013855955112786, + "loss": 0.0, + "step": 34665 + }, + { + "epoch": 3.234673882616404, + "grad_norm": NaN, + "learning_rate": 0.00013855200938955414, + "loss": 0.0, + "step": 34666 + }, + { + "epoch": 3.2347671923112813, + "grad_norm": NaN, + "learning_rate": 0.00013854446768035722, + "loss": 0.0, + "step": 34667 + }, + { + "epoch": 3.2348605020061583, + "grad_norm": NaN, + "learning_rate": 0.00013853692600028828, + "loss": 0.0, + "step": 34668 + }, + { + "epoch": 3.2349538117010357, + "grad_norm": NaN, + "learning_rate": 0.00013852938434936652, + "loss": 0.0, + "step": 34669 + }, + { + "epoch": 3.235047121395913, + "grad_norm": NaN, + "learning_rate": 0.0001385218427276112, + "loss": 0.0, + "step": 34670 + }, + { + "epoch": 3.2351404310907905, + "grad_norm": NaN, + "learning_rate": 0.00013851430113504142, + "loss": 0.0, + "step": 34671 + }, + { + "epoch": 3.2352337407856675, + "grad_norm": NaN, + "learning_rate": 0.0001385067595716763, + "loss": 0.0, + "step": 34672 + }, + { + "epoch": 3.235327050480545, + "grad_norm": NaN, + "learning_rate": 0.00013849921803753517, + "loss": 0.0, + "step": 34673 + }, + { + "epoch": 3.2354203601754223, + "grad_norm": NaN, + "learning_rate": 0.0001384916765326371, + "loss": 0.0, + "step": 34674 + }, + { + "epoch": 3.2355136698702998, + "grad_norm": NaN, + "learning_rate": 0.00013848413505700122, + "loss": 0.0, + "step": 34675 + }, + { + "epoch": 3.2356069795651767, + "grad_norm": NaN, + "learning_rate": 0.0001384765936106469, + "loss": 0.0, + "step": 34676 + }, + { + "epoch": 3.235700289260054, + "grad_norm": NaN, + "learning_rate": 0.00013846905219359307, + "loss": 0.0, + "step": 34677 + }, + { + "epoch": 3.2357935989549316, + "grad_norm": NaN, + "learning_rate": 0.00013846151080585908, + "loss": 0.0, + "step": 34678 + }, + { + "epoch": 3.2358869086498085, + "grad_norm": NaN, + "learning_rate": 0.00013845396944746406, + "loss": 0.0, + "step": 34679 + }, + { + "epoch": 3.235980218344686, + "grad_norm": NaN, + "learning_rate": 0.00013844642811842713, + "loss": 0.0, + "step": 34680 + }, + { + "epoch": 3.2360735280395634, + "grad_norm": NaN, + "learning_rate": 0.00013843888681876754, + "loss": 0.0, + "step": 34681 + }, + { + "epoch": 3.236166837734441, + "grad_norm": NaN, + "learning_rate": 0.00013843134554850447, + "loss": 0.0, + "step": 34682 + }, + { + "epoch": 3.236260147429318, + "grad_norm": NaN, + "learning_rate": 0.000138423804307657, + "loss": 0.0, + "step": 34683 + }, + { + "epoch": 3.236353457124195, + "grad_norm": NaN, + "learning_rate": 0.0001384162630962444, + "loss": 0.0, + "step": 34684 + }, + { + "epoch": 3.2364467668190726, + "grad_norm": NaN, + "learning_rate": 0.00013840872191428583, + "loss": 0.0, + "step": 34685 + }, + { + "epoch": 3.2365400765139496, + "grad_norm": NaN, + "learning_rate": 0.00013840118076180038, + "loss": 0.0, + "step": 34686 + }, + { + "epoch": 3.236633386208827, + "grad_norm": NaN, + "learning_rate": 0.00013839363963880734, + "loss": 0.0, + "step": 34687 + }, + { + "epoch": 3.2367266959037044, + "grad_norm": NaN, + "learning_rate": 0.00013838609854532587, + "loss": 0.0, + "step": 34688 + }, + { + "epoch": 3.236820005598582, + "grad_norm": NaN, + "learning_rate": 0.00013837855748137504, + "loss": 0.0, + "step": 34689 + }, + { + "epoch": 3.236913315293459, + "grad_norm": NaN, + "learning_rate": 0.00013837101644697413, + "loss": 0.0, + "step": 34690 + }, + { + "epoch": 3.2370066249883362, + "grad_norm": NaN, + "learning_rate": 0.0001383634754421423, + "loss": 0.0, + "step": 34691 + }, + { + "epoch": 3.2370999346832137, + "grad_norm": NaN, + "learning_rate": 0.00013835593446689864, + "loss": 0.0, + "step": 34692 + }, + { + "epoch": 3.237193244378091, + "grad_norm": NaN, + "learning_rate": 0.00013834839352126242, + "loss": 0.0, + "step": 34693 + }, + { + "epoch": 3.237286554072968, + "grad_norm": NaN, + "learning_rate": 0.00013834085260525284, + "loss": 0.0, + "step": 34694 + }, + { + "epoch": 3.2373798637678455, + "grad_norm": NaN, + "learning_rate": 0.00013833331171888893, + "loss": 0.0, + "step": 34695 + }, + { + "epoch": 3.237473173462723, + "grad_norm": NaN, + "learning_rate": 0.00013832577086219, + "loss": 0.0, + "step": 34696 + }, + { + "epoch": 3.2375664831576, + "grad_norm": NaN, + "learning_rate": 0.00013831823003517522, + "loss": 0.0, + "step": 34697 + }, + { + "epoch": 3.2376597928524773, + "grad_norm": NaN, + "learning_rate": 0.00013831068923786364, + "loss": 0.0, + "step": 34698 + }, + { + "epoch": 3.2377531025473547, + "grad_norm": NaN, + "learning_rate": 0.00013830314847027453, + "loss": 0.0, + "step": 34699 + }, + { + "epoch": 3.237846412242232, + "grad_norm": NaN, + "learning_rate": 0.0001382956077324271, + "loss": 0.0, + "step": 34700 + }, + { + "epoch": 3.237939721937109, + "grad_norm": NaN, + "learning_rate": 0.0001382880670243404, + "loss": 0.0, + "step": 34701 + }, + { + "epoch": 3.2380330316319865, + "grad_norm": NaN, + "learning_rate": 0.00013828052634603377, + "loss": 0.0, + "step": 34702 + }, + { + "epoch": 3.238126341326864, + "grad_norm": NaN, + "learning_rate": 0.00013827298569752623, + "loss": 0.0, + "step": 34703 + }, + { + "epoch": 3.2382196510217414, + "grad_norm": NaN, + "learning_rate": 0.00013826544507883698, + "loss": 0.0, + "step": 34704 + }, + { + "epoch": 3.2383129607166183, + "grad_norm": NaN, + "learning_rate": 0.0001382579044899853, + "loss": 0.0, + "step": 34705 + }, + { + "epoch": 3.2384062704114958, + "grad_norm": NaN, + "learning_rate": 0.00013825036393099027, + "loss": 0.0, + "step": 34706 + }, + { + "epoch": 3.238499580106373, + "grad_norm": NaN, + "learning_rate": 0.00013824282340187103, + "loss": 0.0, + "step": 34707 + }, + { + "epoch": 3.23859288980125, + "grad_norm": NaN, + "learning_rate": 0.00013823528290264687, + "loss": 0.0, + "step": 34708 + }, + { + "epoch": 3.2386861994961276, + "grad_norm": NaN, + "learning_rate": 0.00013822774243333689, + "loss": 0.0, + "step": 34709 + }, + { + "epoch": 3.238779509191005, + "grad_norm": NaN, + "learning_rate": 0.00013822020199396023, + "loss": 0.0, + "step": 34710 + }, + { + "epoch": 3.2388728188858824, + "grad_norm": NaN, + "learning_rate": 0.00013821266158453618, + "loss": 0.0, + "step": 34711 + }, + { + "epoch": 3.2389661285807594, + "grad_norm": NaN, + "learning_rate": 0.00013820512120508378, + "loss": 0.0, + "step": 34712 + }, + { + "epoch": 3.239059438275637, + "grad_norm": NaN, + "learning_rate": 0.00013819758085562226, + "loss": 0.0, + "step": 34713 + }, + { + "epoch": 3.2391527479705142, + "grad_norm": NaN, + "learning_rate": 0.00013819004053617083, + "loss": 0.0, + "step": 34714 + }, + { + "epoch": 3.239246057665391, + "grad_norm": NaN, + "learning_rate": 0.0001381825002467486, + "loss": 0.0, + "step": 34715 + }, + { + "epoch": 3.2393393673602686, + "grad_norm": NaN, + "learning_rate": 0.00013817495998737477, + "loss": 0.0, + "step": 34716 + }, + { + "epoch": 3.239432677055146, + "grad_norm": NaN, + "learning_rate": 0.00013816741975806858, + "loss": 0.0, + "step": 34717 + }, + { + "epoch": 3.2395259867500235, + "grad_norm": NaN, + "learning_rate": 0.00013815987955884907, + "loss": 0.0, + "step": 34718 + }, + { + "epoch": 3.2396192964449004, + "grad_norm": NaN, + "learning_rate": 0.00013815233938973547, + "loss": 0.0, + "step": 34719 + }, + { + "epoch": 3.239712606139778, + "grad_norm": NaN, + "learning_rate": 0.00013814479925074703, + "loss": 0.0, + "step": 34720 + }, + { + "epoch": 3.2398059158346553, + "grad_norm": NaN, + "learning_rate": 0.00013813725914190277, + "loss": 0.0, + "step": 34721 + }, + { + "epoch": 3.2398992255295327, + "grad_norm": NaN, + "learning_rate": 0.000138129719063222, + "loss": 0.0, + "step": 34722 + }, + { + "epoch": 3.2399925352244097, + "grad_norm": NaN, + "learning_rate": 0.00013812217901472385, + "loss": 0.0, + "step": 34723 + }, + { + "epoch": 3.240085844919287, + "grad_norm": NaN, + "learning_rate": 0.00013811463899642744, + "loss": 0.0, + "step": 34724 + }, + { + "epoch": 3.2401791546141645, + "grad_norm": NaN, + "learning_rate": 0.00013810709900835197, + "loss": 0.0, + "step": 34725 + }, + { + "epoch": 3.240272464309042, + "grad_norm": NaN, + "learning_rate": 0.00013809955905051672, + "loss": 0.0, + "step": 34726 + }, + { + "epoch": 3.240365774003919, + "grad_norm": NaN, + "learning_rate": 0.00013809201912294065, + "loss": 0.0, + "step": 34727 + }, + { + "epoch": 3.2404590836987963, + "grad_norm": NaN, + "learning_rate": 0.0001380844792256431, + "loss": 0.0, + "step": 34728 + }, + { + "epoch": 3.2405523933936737, + "grad_norm": NaN, + "learning_rate": 0.00013807693935864322, + "loss": 0.0, + "step": 34729 + }, + { + "epoch": 3.2406457030885507, + "grad_norm": NaN, + "learning_rate": 0.00013806939952196008, + "loss": 0.0, + "step": 34730 + }, + { + "epoch": 3.240739012783428, + "grad_norm": NaN, + "learning_rate": 0.00013806185971561297, + "loss": 0.0, + "step": 34731 + }, + { + "epoch": 3.2408323224783055, + "grad_norm": NaN, + "learning_rate": 0.00013805431993962102, + "loss": 0.0, + "step": 34732 + }, + { + "epoch": 3.240925632173183, + "grad_norm": NaN, + "learning_rate": 0.00013804678019400336, + "loss": 0.0, + "step": 34733 + }, + { + "epoch": 3.24101894186806, + "grad_norm": NaN, + "learning_rate": 0.0001380392404787792, + "loss": 0.0, + "step": 34734 + }, + { + "epoch": 3.2411122515629374, + "grad_norm": NaN, + "learning_rate": 0.00013803170079396777, + "loss": 0.0, + "step": 34735 + }, + { + "epoch": 3.241205561257815, + "grad_norm": NaN, + "learning_rate": 0.0001380241611395881, + "loss": 0.0, + "step": 34736 + }, + { + "epoch": 3.2412988709526918, + "grad_norm": NaN, + "learning_rate": 0.0001380166215156595, + "loss": 0.0, + "step": 34737 + }, + { + "epoch": 3.241392180647569, + "grad_norm": NaN, + "learning_rate": 0.00013800908192220106, + "loss": 0.0, + "step": 34738 + }, + { + "epoch": 3.2414854903424466, + "grad_norm": NaN, + "learning_rate": 0.00013800154235923197, + "loss": 0.0, + "step": 34739 + }, + { + "epoch": 3.241578800037324, + "grad_norm": NaN, + "learning_rate": 0.00013799400282677138, + "loss": 0.0, + "step": 34740 + }, + { + "epoch": 3.241672109732201, + "grad_norm": NaN, + "learning_rate": 0.00013798646332483855, + "loss": 0.0, + "step": 34741 + }, + { + "epoch": 3.2417654194270784, + "grad_norm": NaN, + "learning_rate": 0.0001379789238534525, + "loss": 0.0, + "step": 34742 + }, + { + "epoch": 3.241858729121956, + "grad_norm": NaN, + "learning_rate": 0.00013797138441263256, + "loss": 0.0, + "step": 34743 + }, + { + "epoch": 3.2419520388168332, + "grad_norm": NaN, + "learning_rate": 0.0001379638450023978, + "loss": 0.0, + "step": 34744 + }, + { + "epoch": 3.2420453485117102, + "grad_norm": NaN, + "learning_rate": 0.0001379563056227674, + "loss": 0.0, + "step": 34745 + }, + { + "epoch": 3.2421386582065876, + "grad_norm": NaN, + "learning_rate": 0.00013794876627376058, + "loss": 0.0, + "step": 34746 + }, + { + "epoch": 3.242231967901465, + "grad_norm": NaN, + "learning_rate": 0.00013794122695539647, + "loss": 0.0, + "step": 34747 + }, + { + "epoch": 3.2423252775963425, + "grad_norm": NaN, + "learning_rate": 0.0001379336876676942, + "loss": 0.0, + "step": 34748 + }, + { + "epoch": 3.2424185872912195, + "grad_norm": NaN, + "learning_rate": 0.00013792614841067304, + "loss": 0.0, + "step": 34749 + }, + { + "epoch": 3.242511896986097, + "grad_norm": NaN, + "learning_rate": 0.0001379186091843521, + "loss": 0.0, + "step": 34750 + }, + { + "epoch": 3.2426052066809743, + "grad_norm": NaN, + "learning_rate": 0.00013791106998875053, + "loss": 0.0, + "step": 34751 + }, + { + "epoch": 3.2426985163758513, + "grad_norm": NaN, + "learning_rate": 0.00013790353082388757, + "loss": 0.0, + "step": 34752 + }, + { + "epoch": 3.2427918260707287, + "grad_norm": NaN, + "learning_rate": 0.0001378959916897823, + "loss": 0.0, + "step": 34753 + }, + { + "epoch": 3.242885135765606, + "grad_norm": NaN, + "learning_rate": 0.00013788845258645396, + "loss": 0.0, + "step": 34754 + }, + { + "epoch": 3.2429784454604835, + "grad_norm": NaN, + "learning_rate": 0.00013788091351392174, + "loss": 0.0, + "step": 34755 + }, + { + "epoch": 3.2430717551553605, + "grad_norm": NaN, + "learning_rate": 0.00013787337447220468, + "loss": 0.0, + "step": 34756 + }, + { + "epoch": 3.243165064850238, + "grad_norm": NaN, + "learning_rate": 0.0001378658354613221, + "loss": 0.0, + "step": 34757 + }, + { + "epoch": 3.2432583745451153, + "grad_norm": NaN, + "learning_rate": 0.0001378582964812931, + "loss": 0.0, + "step": 34758 + }, + { + "epoch": 3.2433516842399923, + "grad_norm": NaN, + "learning_rate": 0.00013785075753213684, + "loss": 0.0, + "step": 34759 + }, + { + "epoch": 3.2434449939348697, + "grad_norm": NaN, + "learning_rate": 0.0001378432186138725, + "loss": 0.0, + "step": 34760 + }, + { + "epoch": 3.243538303629747, + "grad_norm": NaN, + "learning_rate": 0.0001378356797265193, + "loss": 0.0, + "step": 34761 + }, + { + "epoch": 3.2436316133246246, + "grad_norm": NaN, + "learning_rate": 0.0001378281408700963, + "loss": 0.0, + "step": 34762 + }, + { + "epoch": 3.2437249230195015, + "grad_norm": NaN, + "learning_rate": 0.00013782060204462276, + "loss": 0.0, + "step": 34763 + }, + { + "epoch": 3.243818232714379, + "grad_norm": NaN, + "learning_rate": 0.00013781306325011785, + "loss": 0.0, + "step": 34764 + }, + { + "epoch": 3.2439115424092564, + "grad_norm": NaN, + "learning_rate": 0.00013780552448660062, + "loss": 0.0, + "step": 34765 + }, + { + "epoch": 3.244004852104134, + "grad_norm": NaN, + "learning_rate": 0.0001377979857540904, + "loss": 0.0, + "step": 34766 + }, + { + "epoch": 3.244098161799011, + "grad_norm": NaN, + "learning_rate": 0.00013779044705260633, + "loss": 0.0, + "step": 34767 + }, + { + "epoch": 3.244191471493888, + "grad_norm": NaN, + "learning_rate": 0.00013778290838216742, + "loss": 0.0, + "step": 34768 + }, + { + "epoch": 3.2442847811887656, + "grad_norm": NaN, + "learning_rate": 0.00013777536974279302, + "loss": 0.0, + "step": 34769 + }, + { + "epoch": 3.244378090883643, + "grad_norm": NaN, + "learning_rate": 0.0001377678311345023, + "loss": 0.0, + "step": 34770 + }, + { + "epoch": 3.24447140057852, + "grad_norm": NaN, + "learning_rate": 0.00013776029255731424, + "loss": 0.0, + "step": 34771 + }, + { + "epoch": 3.2445647102733974, + "grad_norm": NaN, + "learning_rate": 0.00013775275401124818, + "loss": 0.0, + "step": 34772 + }, + { + "epoch": 3.244658019968275, + "grad_norm": NaN, + "learning_rate": 0.0001377452154963233, + "loss": 0.0, + "step": 34773 + }, + { + "epoch": 3.244751329663152, + "grad_norm": NaN, + "learning_rate": 0.0001377376770125586, + "loss": 0.0, + "step": 34774 + }, + { + "epoch": 3.2448446393580292, + "grad_norm": NaN, + "learning_rate": 0.00013773013855997342, + "loss": 0.0, + "step": 34775 + }, + { + "epoch": 3.2449379490529067, + "grad_norm": NaN, + "learning_rate": 0.0001377226001385869, + "loss": 0.0, + "step": 34776 + }, + { + "epoch": 3.245031258747784, + "grad_norm": NaN, + "learning_rate": 0.00013771506174841804, + "loss": 0.0, + "step": 34777 + }, + { + "epoch": 3.245124568442661, + "grad_norm": NaN, + "learning_rate": 0.00013770752338948625, + "loss": 0.0, + "step": 34778 + }, + { + "epoch": 3.2452178781375385, + "grad_norm": NaN, + "learning_rate": 0.00013769998506181057, + "loss": 0.0, + "step": 34779 + }, + { + "epoch": 3.245311187832416, + "grad_norm": NaN, + "learning_rate": 0.00013769244676541014, + "loss": 0.0, + "step": 34780 + }, + { + "epoch": 3.245404497527293, + "grad_norm": NaN, + "learning_rate": 0.0001376849085003042, + "loss": 0.0, + "step": 34781 + }, + { + "epoch": 3.2454978072221703, + "grad_norm": NaN, + "learning_rate": 0.00013767737026651193, + "loss": 0.0, + "step": 34782 + }, + { + "epoch": 3.2455911169170477, + "grad_norm": NaN, + "learning_rate": 0.00013766983206405238, + "loss": 0.0, + "step": 34783 + }, + { + "epoch": 3.245684426611925, + "grad_norm": NaN, + "learning_rate": 0.00013766229389294485, + "loss": 0.0, + "step": 34784 + }, + { + "epoch": 3.245777736306802, + "grad_norm": NaN, + "learning_rate": 0.00013765475575320846, + "loss": 0.0, + "step": 34785 + }, + { + "epoch": 3.2458710460016795, + "grad_norm": NaN, + "learning_rate": 0.0001376472176448623, + "loss": 0.0, + "step": 34786 + }, + { + "epoch": 3.245964355696557, + "grad_norm": NaN, + "learning_rate": 0.00013763967956792562, + "loss": 0.0, + "step": 34787 + }, + { + "epoch": 3.2460576653914344, + "grad_norm": NaN, + "learning_rate": 0.00013763214152241765, + "loss": 0.0, + "step": 34788 + }, + { + "epoch": 3.2461509750863113, + "grad_norm": NaN, + "learning_rate": 0.00013762460350835742, + "loss": 0.0, + "step": 34789 + }, + { + "epoch": 3.2462442847811888, + "grad_norm": NaN, + "learning_rate": 0.0001376170655257642, + "loss": 0.0, + "step": 34790 + }, + { + "epoch": 3.246337594476066, + "grad_norm": NaN, + "learning_rate": 0.00013760952757465705, + "loss": 0.0, + "step": 34791 + }, + { + "epoch": 3.246430904170943, + "grad_norm": NaN, + "learning_rate": 0.00013760198965505525, + "loss": 0.0, + "step": 34792 + }, + { + "epoch": 3.2465242138658206, + "grad_norm": NaN, + "learning_rate": 0.00013759445176697794, + "loss": 0.0, + "step": 34793 + }, + { + "epoch": 3.246617523560698, + "grad_norm": NaN, + "learning_rate": 0.00013758691391044418, + "loss": 0.0, + "step": 34794 + }, + { + "epoch": 3.2467108332555754, + "grad_norm": NaN, + "learning_rate": 0.0001375793760854733, + "loss": 0.0, + "step": 34795 + }, + { + "epoch": 3.2468041429504524, + "grad_norm": NaN, + "learning_rate": 0.0001375718382920844, + "loss": 0.0, + "step": 34796 + }, + { + "epoch": 3.24689745264533, + "grad_norm": NaN, + "learning_rate": 0.00013756430053029654, + "loss": 0.0, + "step": 34797 + }, + { + "epoch": 3.2469907623402072, + "grad_norm": NaN, + "learning_rate": 0.00013755676280012902, + "loss": 0.0, + "step": 34798 + }, + { + "epoch": 3.2470840720350846, + "grad_norm": NaN, + "learning_rate": 0.000137549225101601, + "loss": 0.0, + "step": 34799 + }, + { + "epoch": 3.2471773817299616, + "grad_norm": NaN, + "learning_rate": 0.00013754168743473157, + "loss": 0.0, + "step": 34800 + }, + { + "epoch": 3.247270691424839, + "grad_norm": NaN, + "learning_rate": 0.00013753414979953996, + "loss": 0.0, + "step": 34801 + }, + { + "epoch": 3.2473640011197165, + "grad_norm": NaN, + "learning_rate": 0.00013752661219604533, + "loss": 0.0, + "step": 34802 + }, + { + "epoch": 3.2474573108145934, + "grad_norm": NaN, + "learning_rate": 0.0001375190746242668, + "loss": 0.0, + "step": 34803 + }, + { + "epoch": 3.247550620509471, + "grad_norm": NaN, + "learning_rate": 0.00013751153708422358, + "loss": 0.0, + "step": 34804 + }, + { + "epoch": 3.2476439302043483, + "grad_norm": NaN, + "learning_rate": 0.00013750399957593486, + "loss": 0.0, + "step": 34805 + }, + { + "epoch": 3.2477372398992257, + "grad_norm": NaN, + "learning_rate": 0.00013749646209941968, + "loss": 0.0, + "step": 34806 + }, + { + "epoch": 3.2478305495941027, + "grad_norm": NaN, + "learning_rate": 0.00013748892465469736, + "loss": 0.0, + "step": 34807 + }, + { + "epoch": 3.24792385928898, + "grad_norm": NaN, + "learning_rate": 0.00013748138724178698, + "loss": 0.0, + "step": 34808 + }, + { + "epoch": 3.2480171689838575, + "grad_norm": NaN, + "learning_rate": 0.0001374738498607077, + "loss": 0.0, + "step": 34809 + }, + { + "epoch": 3.248110478678735, + "grad_norm": NaN, + "learning_rate": 0.00013746631251147873, + "loss": 0.0, + "step": 34810 + }, + { + "epoch": 3.248203788373612, + "grad_norm": NaN, + "learning_rate": 0.00013745877519411926, + "loss": 0.0, + "step": 34811 + }, + { + "epoch": 3.2482970980684893, + "grad_norm": NaN, + "learning_rate": 0.00013745123790864834, + "loss": 0.0, + "step": 34812 + }, + { + "epoch": 3.2483904077633667, + "grad_norm": NaN, + "learning_rate": 0.00013744370065508525, + "loss": 0.0, + "step": 34813 + }, + { + "epoch": 3.2484837174582437, + "grad_norm": NaN, + "learning_rate": 0.0001374361634334491, + "loss": 0.0, + "step": 34814 + }, + { + "epoch": 3.248577027153121, + "grad_norm": NaN, + "learning_rate": 0.00013742862624375904, + "loss": 0.0, + "step": 34815 + }, + { + "epoch": 3.2486703368479986, + "grad_norm": NaN, + "learning_rate": 0.00013742108908603427, + "loss": 0.0, + "step": 34816 + }, + { + "epoch": 3.248763646542876, + "grad_norm": NaN, + "learning_rate": 0.00013741355196029398, + "loss": 0.0, + "step": 34817 + }, + { + "epoch": 3.248856956237753, + "grad_norm": NaN, + "learning_rate": 0.00013740601486655722, + "loss": 0.0, + "step": 34818 + }, + { + "epoch": 3.2489502659326304, + "grad_norm": NaN, + "learning_rate": 0.0001373984778048433, + "loss": 0.0, + "step": 34819 + }, + { + "epoch": 3.249043575627508, + "grad_norm": NaN, + "learning_rate": 0.00013739094077517135, + "loss": 0.0, + "step": 34820 + }, + { + "epoch": 3.249136885322385, + "grad_norm": NaN, + "learning_rate": 0.00013738340377756041, + "loss": 0.0, + "step": 34821 + }, + { + "epoch": 3.249230195017262, + "grad_norm": NaN, + "learning_rate": 0.00013737586681202975, + "loss": 0.0, + "step": 34822 + }, + { + "epoch": 3.2493235047121396, + "grad_norm": NaN, + "learning_rate": 0.00013736832987859863, + "loss": 0.0, + "step": 34823 + }, + { + "epoch": 3.249416814407017, + "grad_norm": NaN, + "learning_rate": 0.000137360792977286, + "loss": 0.0, + "step": 34824 + }, + { + "epoch": 3.249510124101894, + "grad_norm": NaN, + "learning_rate": 0.00013735325610811112, + "loss": 0.0, + "step": 34825 + }, + { + "epoch": 3.2496034337967714, + "grad_norm": NaN, + "learning_rate": 0.00013734571927109321, + "loss": 0.0, + "step": 34826 + }, + { + "epoch": 3.249696743491649, + "grad_norm": NaN, + "learning_rate": 0.00013733818246625137, + "loss": 0.0, + "step": 34827 + }, + { + "epoch": 3.2497900531865263, + "grad_norm": NaN, + "learning_rate": 0.00013733064569360477, + "loss": 0.0, + "step": 34828 + }, + { + "epoch": 3.2498833628814032, + "grad_norm": NaN, + "learning_rate": 0.00013732310895317264, + "loss": 0.0, + "step": 34829 + }, + { + "epoch": 3.2499766725762806, + "grad_norm": NaN, + "learning_rate": 0.00013731557224497406, + "loss": 0.0, + "step": 34830 + }, + { + "epoch": 3.250069982271158, + "grad_norm": NaN, + "learning_rate": 0.00013730803556902816, + "loss": 0.0, + "step": 34831 + }, + { + "epoch": 3.250163291966035, + "grad_norm": NaN, + "learning_rate": 0.00013730049892535427, + "loss": 0.0, + "step": 34832 + }, + { + "epoch": 3.2502566016609125, + "grad_norm": NaN, + "learning_rate": 0.0001372929623139714, + "loss": 0.0, + "step": 34833 + }, + { + "epoch": 3.25034991135579, + "grad_norm": NaN, + "learning_rate": 0.00013728542573489873, + "loss": 0.0, + "step": 34834 + }, + { + "epoch": 3.2504432210506673, + "grad_norm": NaN, + "learning_rate": 0.0001372778891881555, + "loss": 0.0, + "step": 34835 + }, + { + "epoch": 3.2505365307455443, + "grad_norm": NaN, + "learning_rate": 0.00013727035267376085, + "loss": 0.0, + "step": 34836 + }, + { + "epoch": 3.2506298404404217, + "grad_norm": NaN, + "learning_rate": 0.0001372628161917339, + "loss": 0.0, + "step": 34837 + }, + { + "epoch": 3.250723150135299, + "grad_norm": NaN, + "learning_rate": 0.0001372552797420938, + "loss": 0.0, + "step": 34838 + }, + { + "epoch": 3.2508164598301765, + "grad_norm": NaN, + "learning_rate": 0.00013724774332485976, + "loss": 0.0, + "step": 34839 + }, + { + "epoch": 3.2509097695250535, + "grad_norm": NaN, + "learning_rate": 0.00013724020694005099, + "loss": 0.0, + "step": 34840 + }, + { + "epoch": 3.251003079219931, + "grad_norm": NaN, + "learning_rate": 0.00013723267058768653, + "loss": 0.0, + "step": 34841 + }, + { + "epoch": 3.2510963889148083, + "grad_norm": NaN, + "learning_rate": 0.00013722513426778563, + "loss": 0.0, + "step": 34842 + }, + { + "epoch": 3.2511896986096858, + "grad_norm": NaN, + "learning_rate": 0.00013721759798036745, + "loss": 0.0, + "step": 34843 + }, + { + "epoch": 3.2512830083045627, + "grad_norm": NaN, + "learning_rate": 0.00013721006172545108, + "loss": 0.0, + "step": 34844 + }, + { + "epoch": 3.25137631799944, + "grad_norm": NaN, + "learning_rate": 0.00013720252550305576, + "loss": 0.0, + "step": 34845 + }, + { + "epoch": 3.2514696276943176, + "grad_norm": NaN, + "learning_rate": 0.00013719498931320066, + "loss": 0.0, + "step": 34846 + }, + { + "epoch": 3.2515629373891946, + "grad_norm": NaN, + "learning_rate": 0.00013718745315590485, + "loss": 0.0, + "step": 34847 + }, + { + "epoch": 3.251656247084072, + "grad_norm": NaN, + "learning_rate": 0.00013717991703118758, + "loss": 0.0, + "step": 34848 + }, + { + "epoch": 3.2517495567789494, + "grad_norm": NaN, + "learning_rate": 0.00013717238093906802, + "loss": 0.0, + "step": 34849 + }, + { + "epoch": 3.251842866473827, + "grad_norm": NaN, + "learning_rate": 0.0001371648448795652, + "loss": 0.0, + "step": 34850 + }, + { + "epoch": 3.251936176168704, + "grad_norm": NaN, + "learning_rate": 0.00013715730885269844, + "loss": 0.0, + "step": 34851 + }, + { + "epoch": 3.252029485863581, + "grad_norm": NaN, + "learning_rate": 0.00013714977285848688, + "loss": 0.0, + "step": 34852 + }, + { + "epoch": 3.2521227955584586, + "grad_norm": NaN, + "learning_rate": 0.00013714223689694955, + "loss": 0.0, + "step": 34853 + }, + { + "epoch": 3.2522161052533356, + "grad_norm": NaN, + "learning_rate": 0.00013713470096810576, + "loss": 0.0, + "step": 34854 + }, + { + "epoch": 3.252309414948213, + "grad_norm": NaN, + "learning_rate": 0.00013712716507197463, + "loss": 0.0, + "step": 34855 + }, + { + "epoch": 3.2524027246430904, + "grad_norm": NaN, + "learning_rate": 0.00013711962920857524, + "loss": 0.0, + "step": 34856 + }, + { + "epoch": 3.252496034337968, + "grad_norm": NaN, + "learning_rate": 0.00013711209337792686, + "loss": 0.0, + "step": 34857 + }, + { + "epoch": 3.252589344032845, + "grad_norm": NaN, + "learning_rate": 0.00013710455758004865, + "loss": 0.0, + "step": 34858 + }, + { + "epoch": 3.2526826537277223, + "grad_norm": NaN, + "learning_rate": 0.00013709702181495965, + "loss": 0.0, + "step": 34859 + }, + { + "epoch": 3.2527759634225997, + "grad_norm": NaN, + "learning_rate": 0.00013708948608267911, + "loss": 0.0, + "step": 34860 + }, + { + "epoch": 3.252869273117477, + "grad_norm": NaN, + "learning_rate": 0.00013708195038322627, + "loss": 0.0, + "step": 34861 + }, + { + "epoch": 3.252962582812354, + "grad_norm": NaN, + "learning_rate": 0.00013707441471662012, + "loss": 0.0, + "step": 34862 + }, + { + "epoch": 3.2530558925072315, + "grad_norm": NaN, + "learning_rate": 0.00013706687908287987, + "loss": 0.0, + "step": 34863 + }, + { + "epoch": 3.253149202202109, + "grad_norm": NaN, + "learning_rate": 0.0001370593434820248, + "loss": 0.0, + "step": 34864 + }, + { + "epoch": 3.2532425118969863, + "grad_norm": NaN, + "learning_rate": 0.00013705180791407398, + "loss": 0.0, + "step": 34865 + }, + { + "epoch": 3.2533358215918633, + "grad_norm": NaN, + "learning_rate": 0.00013704427237904652, + "loss": 0.0, + "step": 34866 + }, + { + "epoch": 3.2534291312867407, + "grad_norm": NaN, + "learning_rate": 0.0001370367368769617, + "loss": 0.0, + "step": 34867 + }, + { + "epoch": 3.253522440981618, + "grad_norm": NaN, + "learning_rate": 0.00013702920140783861, + "loss": 0.0, + "step": 34868 + }, + { + "epoch": 3.253615750676495, + "grad_norm": NaN, + "learning_rate": 0.00013702166597169636, + "loss": 0.0, + "step": 34869 + }, + { + "epoch": 3.2537090603713725, + "grad_norm": NaN, + "learning_rate": 0.00013701413056855426, + "loss": 0.0, + "step": 34870 + }, + { + "epoch": 3.25380237006625, + "grad_norm": NaN, + "learning_rate": 0.0001370065951984313, + "loss": 0.0, + "step": 34871 + }, + { + "epoch": 3.2538956797611274, + "grad_norm": NaN, + "learning_rate": 0.00013699905986134674, + "loss": 0.0, + "step": 34872 + }, + { + "epoch": 3.2539889894560043, + "grad_norm": NaN, + "learning_rate": 0.00013699152455731974, + "loss": 0.0, + "step": 34873 + }, + { + "epoch": 3.2540822991508818, + "grad_norm": NaN, + "learning_rate": 0.00013698398928636943, + "loss": 0.0, + "step": 34874 + }, + { + "epoch": 3.254175608845759, + "grad_norm": NaN, + "learning_rate": 0.00013697645404851496, + "loss": 0.0, + "step": 34875 + }, + { + "epoch": 3.254268918540636, + "grad_norm": NaN, + "learning_rate": 0.00013696891884377554, + "loss": 0.0, + "step": 34876 + }, + { + "epoch": 3.2543622282355136, + "grad_norm": NaN, + "learning_rate": 0.0001369613836721703, + "loss": 0.0, + "step": 34877 + }, + { + "epoch": 3.254455537930391, + "grad_norm": NaN, + "learning_rate": 0.00013695384853371835, + "loss": 0.0, + "step": 34878 + }, + { + "epoch": 3.2545488476252684, + "grad_norm": NaN, + "learning_rate": 0.00013694631342843898, + "loss": 0.0, + "step": 34879 + }, + { + "epoch": 3.2546421573201454, + "grad_norm": NaN, + "learning_rate": 0.0001369387783563512, + "loss": 0.0, + "step": 34880 + }, + { + "epoch": 3.254735467015023, + "grad_norm": NaN, + "learning_rate": 0.00013693124331747432, + "loss": 0.0, + "step": 34881 + }, + { + "epoch": 3.2548287767099002, + "grad_norm": NaN, + "learning_rate": 0.0001369237083118273, + "loss": 0.0, + "step": 34882 + }, + { + "epoch": 3.2549220864047776, + "grad_norm": NaN, + "learning_rate": 0.00013691617333942948, + "loss": 0.0, + "step": 34883 + }, + { + "epoch": 3.2550153960996546, + "grad_norm": NaN, + "learning_rate": 0.00013690863840029997, + "loss": 0.0, + "step": 34884 + }, + { + "epoch": 3.255108705794532, + "grad_norm": NaN, + "learning_rate": 0.00013690110349445786, + "loss": 0.0, + "step": 34885 + }, + { + "epoch": 3.2552020154894095, + "grad_norm": NaN, + "learning_rate": 0.0001368935686219224, + "loss": 0.0, + "step": 34886 + }, + { + "epoch": 3.255295325184287, + "grad_norm": NaN, + "learning_rate": 0.00013688603378271273, + "loss": 0.0, + "step": 34887 + }, + { + "epoch": 3.255388634879164, + "grad_norm": NaN, + "learning_rate": 0.00013687849897684793, + "loss": 0.0, + "step": 34888 + }, + { + "epoch": 3.2554819445740413, + "grad_norm": NaN, + "learning_rate": 0.0001368709642043473, + "loss": 0.0, + "step": 34889 + }, + { + "epoch": 3.2555752542689187, + "grad_norm": NaN, + "learning_rate": 0.00013686342946522988, + "loss": 0.0, + "step": 34890 + }, + { + "epoch": 3.2556685639637957, + "grad_norm": NaN, + "learning_rate": 0.00013685589475951484, + "loss": 0.0, + "step": 34891 + }, + { + "epoch": 3.255761873658673, + "grad_norm": NaN, + "learning_rate": 0.0001368483600872214, + "loss": 0.0, + "step": 34892 + }, + { + "epoch": 3.2558551833535505, + "grad_norm": NaN, + "learning_rate": 0.0001368408254483687, + "loss": 0.0, + "step": 34893 + }, + { + "epoch": 3.255948493048428, + "grad_norm": NaN, + "learning_rate": 0.00013683329084297584, + "loss": 0.0, + "step": 34894 + }, + { + "epoch": 3.256041802743305, + "grad_norm": NaN, + "learning_rate": 0.00013682575627106203, + "loss": 0.0, + "step": 34895 + }, + { + "epoch": 3.2561351124381823, + "grad_norm": NaN, + "learning_rate": 0.00013681822173264646, + "loss": 0.0, + "step": 34896 + }, + { + "epoch": 3.2562284221330597, + "grad_norm": NaN, + "learning_rate": 0.0001368106872277482, + "loss": 0.0, + "step": 34897 + }, + { + "epoch": 3.2563217318279367, + "grad_norm": NaN, + "learning_rate": 0.00013680315275638644, + "loss": 0.0, + "step": 34898 + }, + { + "epoch": 3.256415041522814, + "grad_norm": NaN, + "learning_rate": 0.00013679561831858044, + "loss": 0.0, + "step": 34899 + }, + { + "epoch": 3.2565083512176916, + "grad_norm": NaN, + "learning_rate": 0.0001367880839143492, + "loss": 0.0, + "step": 34900 + }, + { + "epoch": 3.256601660912569, + "grad_norm": NaN, + "learning_rate": 0.00013678054954371194, + "loss": 0.0, + "step": 34901 + }, + { + "epoch": 3.256694970607446, + "grad_norm": NaN, + "learning_rate": 0.00013677301520668788, + "loss": 0.0, + "step": 34902 + }, + { + "epoch": 3.2567882803023234, + "grad_norm": NaN, + "learning_rate": 0.0001367654809032961, + "loss": 0.0, + "step": 34903 + }, + { + "epoch": 3.256881589997201, + "grad_norm": NaN, + "learning_rate": 0.00013675794663355574, + "loss": 0.0, + "step": 34904 + }, + { + "epoch": 3.2569748996920778, + "grad_norm": NaN, + "learning_rate": 0.00013675041239748608, + "loss": 0.0, + "step": 34905 + }, + { + "epoch": 3.257068209386955, + "grad_norm": NaN, + "learning_rate": 0.00013674287819510615, + "loss": 0.0, + "step": 34906 + }, + { + "epoch": 3.2571615190818326, + "grad_norm": NaN, + "learning_rate": 0.00013673534402643513, + "loss": 0.0, + "step": 34907 + }, + { + "epoch": 3.25725482877671, + "grad_norm": NaN, + "learning_rate": 0.00013672780989149226, + "loss": 0.0, + "step": 34908 + }, + { + "epoch": 3.2573481384715874, + "grad_norm": NaN, + "learning_rate": 0.0001367202757902966, + "loss": 0.0, + "step": 34909 + }, + { + "epoch": 3.2574414481664644, + "grad_norm": NaN, + "learning_rate": 0.00013671274172286733, + "loss": 0.0, + "step": 34910 + }, + { + "epoch": 3.257534757861342, + "grad_norm": NaN, + "learning_rate": 0.00013670520768922366, + "loss": 0.0, + "step": 34911 + }, + { + "epoch": 3.2576280675562193, + "grad_norm": NaN, + "learning_rate": 0.0001366976736893847, + "loss": 0.0, + "step": 34912 + }, + { + "epoch": 3.2577213772510962, + "grad_norm": NaN, + "learning_rate": 0.0001366901397233696, + "loss": 0.0, + "step": 34913 + }, + { + "epoch": 3.2578146869459736, + "grad_norm": NaN, + "learning_rate": 0.00013668260579119756, + "loss": 0.0, + "step": 34914 + }, + { + "epoch": 3.257907996640851, + "grad_norm": NaN, + "learning_rate": 0.0001366750718928877, + "loss": 0.0, + "step": 34915 + }, + { + "epoch": 3.2580013063357285, + "grad_norm": NaN, + "learning_rate": 0.00013666753802845912, + "loss": 0.0, + "step": 34916 + }, + { + "epoch": 3.2580946160306055, + "grad_norm": NaN, + "learning_rate": 0.00013666000419793113, + "loss": 0.0, + "step": 34917 + }, + { + "epoch": 3.258187925725483, + "grad_norm": NaN, + "learning_rate": 0.00013665247040132276, + "loss": 0.0, + "step": 34918 + }, + { + "epoch": 3.2582812354203603, + "grad_norm": NaN, + "learning_rate": 0.00013664493663865317, + "loss": 0.0, + "step": 34919 + }, + { + "epoch": 3.2583745451152373, + "grad_norm": NaN, + "learning_rate": 0.00013663740290994163, + "loss": 0.0, + "step": 34920 + }, + { + "epoch": 3.2584678548101147, + "grad_norm": NaN, + "learning_rate": 0.00013662986921520717, + "loss": 0.0, + "step": 34921 + }, + { + "epoch": 3.258561164504992, + "grad_norm": NaN, + "learning_rate": 0.00013662233555446896, + "loss": 0.0, + "step": 34922 + }, + { + "epoch": 3.2586544741998695, + "grad_norm": NaN, + "learning_rate": 0.00013661480192774626, + "loss": 0.0, + "step": 34923 + }, + { + "epoch": 3.2587477838947465, + "grad_norm": NaN, + "learning_rate": 0.00013660726833505813, + "loss": 0.0, + "step": 34924 + }, + { + "epoch": 3.258841093589624, + "grad_norm": NaN, + "learning_rate": 0.00013659973477642376, + "loss": 0.0, + "step": 34925 + }, + { + "epoch": 3.2589344032845013, + "grad_norm": NaN, + "learning_rate": 0.00013659220125186223, + "loss": 0.0, + "step": 34926 + }, + { + "epoch": 3.2590277129793783, + "grad_norm": NaN, + "learning_rate": 0.00013658466776139282, + "loss": 0.0, + "step": 34927 + }, + { + "epoch": 3.2591210226742557, + "grad_norm": NaN, + "learning_rate": 0.00013657713430503463, + "loss": 0.0, + "step": 34928 + }, + { + "epoch": 3.259214332369133, + "grad_norm": NaN, + "learning_rate": 0.00013656960088280677, + "loss": 0.0, + "step": 34929 + }, + { + "epoch": 3.2593076420640106, + "grad_norm": NaN, + "learning_rate": 0.00013656206749472843, + "loss": 0.0, + "step": 34930 + }, + { + "epoch": 3.2594009517588876, + "grad_norm": NaN, + "learning_rate": 0.00013655453414081884, + "loss": 0.0, + "step": 34931 + }, + { + "epoch": 3.259494261453765, + "grad_norm": NaN, + "learning_rate": 0.000136547000821097, + "loss": 0.0, + "step": 34932 + }, + { + "epoch": 3.2595875711486424, + "grad_norm": NaN, + "learning_rate": 0.00013653946753558222, + "loss": 0.0, + "step": 34933 + }, + { + "epoch": 3.25968088084352, + "grad_norm": NaN, + "learning_rate": 0.0001365319342842936, + "loss": 0.0, + "step": 34934 + }, + { + "epoch": 3.259774190538397, + "grad_norm": NaN, + "learning_rate": 0.0001365244010672502, + "loss": 0.0, + "step": 34935 + }, + { + "epoch": 3.259867500233274, + "grad_norm": NaN, + "learning_rate": 0.00013651686788447126, + "loss": 0.0, + "step": 34936 + }, + { + "epoch": 3.2599608099281516, + "grad_norm": NaN, + "learning_rate": 0.00013650933473597604, + "loss": 0.0, + "step": 34937 + }, + { + "epoch": 3.260054119623029, + "grad_norm": NaN, + "learning_rate": 0.0001365018016217835, + "loss": 0.0, + "step": 34938 + }, + { + "epoch": 3.260147429317906, + "grad_norm": NaN, + "learning_rate": 0.00013649426854191285, + "loss": 0.0, + "step": 34939 + }, + { + "epoch": 3.2602407390127834, + "grad_norm": NaN, + "learning_rate": 0.00013648673549638337, + "loss": 0.0, + "step": 34940 + }, + { + "epoch": 3.260334048707661, + "grad_norm": NaN, + "learning_rate": 0.00013647920248521407, + "loss": 0.0, + "step": 34941 + }, + { + "epoch": 3.260427358402538, + "grad_norm": NaN, + "learning_rate": 0.0001364716695084241, + "loss": 0.0, + "step": 34942 + }, + { + "epoch": 3.2605206680974153, + "grad_norm": NaN, + "learning_rate": 0.00013646413656603275, + "loss": 0.0, + "step": 34943 + }, + { + "epoch": 3.2606139777922927, + "grad_norm": NaN, + "learning_rate": 0.00013645660365805908, + "loss": 0.0, + "step": 34944 + }, + { + "epoch": 3.26070728748717, + "grad_norm": NaN, + "learning_rate": 0.0001364490707845222, + "loss": 0.0, + "step": 34945 + }, + { + "epoch": 3.260800597182047, + "grad_norm": NaN, + "learning_rate": 0.00013644153794544137, + "loss": 0.0, + "step": 34946 + }, + { + "epoch": 3.2608939068769245, + "grad_norm": NaN, + "learning_rate": 0.0001364340051408357, + "loss": 0.0, + "step": 34947 + }, + { + "epoch": 3.260987216571802, + "grad_norm": NaN, + "learning_rate": 0.00013642647237072428, + "loss": 0.0, + "step": 34948 + }, + { + "epoch": 3.261080526266679, + "grad_norm": NaN, + "learning_rate": 0.00013641893963512637, + "loss": 0.0, + "step": 34949 + }, + { + "epoch": 3.2611738359615563, + "grad_norm": NaN, + "learning_rate": 0.00013641140693406107, + "loss": 0.0, + "step": 34950 + }, + { + "epoch": 3.2612671456564337, + "grad_norm": NaN, + "learning_rate": 0.0001364038742675475, + "loss": 0.0, + "step": 34951 + }, + { + "epoch": 3.261360455351311, + "grad_norm": NaN, + "learning_rate": 0.0001363963416356049, + "loss": 0.0, + "step": 34952 + }, + { + "epoch": 3.261453765046188, + "grad_norm": NaN, + "learning_rate": 0.00013638880903825236, + "loss": 0.0, + "step": 34953 + }, + { + "epoch": 3.2615470747410655, + "grad_norm": NaN, + "learning_rate": 0.000136381276475509, + "loss": 0.0, + "step": 34954 + }, + { + "epoch": 3.261640384435943, + "grad_norm": NaN, + "learning_rate": 0.00013637374394739408, + "loss": 0.0, + "step": 34955 + }, + { + "epoch": 3.2617336941308204, + "grad_norm": NaN, + "learning_rate": 0.00013636621145392668, + "loss": 0.0, + "step": 34956 + }, + { + "epoch": 3.2618270038256973, + "grad_norm": NaN, + "learning_rate": 0.0001363586789951259, + "loss": 0.0, + "step": 34957 + }, + { + "epoch": 3.2619203135205748, + "grad_norm": NaN, + "learning_rate": 0.00013635114657101105, + "loss": 0.0, + "step": 34958 + }, + { + "epoch": 3.262013623215452, + "grad_norm": NaN, + "learning_rate": 0.00013634361418160115, + "loss": 0.0, + "step": 34959 + }, + { + "epoch": 3.2621069329103296, + "grad_norm": NaN, + "learning_rate": 0.00013633608182691536, + "loss": 0.0, + "step": 34960 + }, + { + "epoch": 3.2622002426052066, + "grad_norm": NaN, + "learning_rate": 0.00013632854950697295, + "loss": 0.0, + "step": 34961 + }, + { + "epoch": 3.262293552300084, + "grad_norm": NaN, + "learning_rate": 0.00013632101722179293, + "loss": 0.0, + "step": 34962 + }, + { + "epoch": 3.2623868619949614, + "grad_norm": NaN, + "learning_rate": 0.0001363134849713945, + "loss": 0.0, + "step": 34963 + }, + { + "epoch": 3.2624801716898384, + "grad_norm": NaN, + "learning_rate": 0.0001363059527557969, + "loss": 0.0, + "step": 34964 + }, + { + "epoch": 3.262573481384716, + "grad_norm": NaN, + "learning_rate": 0.00013629842057501912, + "loss": 0.0, + "step": 34965 + }, + { + "epoch": 3.2626667910795932, + "grad_norm": NaN, + "learning_rate": 0.00013629088842908042, + "loss": 0.0, + "step": 34966 + }, + { + "epoch": 3.2627601007744707, + "grad_norm": NaN, + "learning_rate": 0.00013628335631799996, + "loss": 0.0, + "step": 34967 + }, + { + "epoch": 3.2628534104693476, + "grad_norm": NaN, + "learning_rate": 0.00013627582424179685, + "loss": 0.0, + "step": 34968 + }, + { + "epoch": 3.262946720164225, + "grad_norm": NaN, + "learning_rate": 0.00013626829220049028, + "loss": 0.0, + "step": 34969 + }, + { + "epoch": 3.2630400298591025, + "grad_norm": NaN, + "learning_rate": 0.0001362607601940993, + "loss": 0.0, + "step": 34970 + }, + { + "epoch": 3.2631333395539794, + "grad_norm": NaN, + "learning_rate": 0.0001362532282226432, + "loss": 0.0, + "step": 34971 + }, + { + "epoch": 3.263226649248857, + "grad_norm": NaN, + "learning_rate": 0.00013624569628614106, + "loss": 0.0, + "step": 34972 + }, + { + "epoch": 3.2633199589437343, + "grad_norm": NaN, + "learning_rate": 0.000136238164384612, + "loss": 0.0, + "step": 34973 + }, + { + "epoch": 3.2634132686386117, + "grad_norm": NaN, + "learning_rate": 0.0001362306325180752, + "loss": 0.0, + "step": 34974 + }, + { + "epoch": 3.2635065783334887, + "grad_norm": NaN, + "learning_rate": 0.0001362231006865499, + "loss": 0.0, + "step": 34975 + }, + { + "epoch": 3.263599888028366, + "grad_norm": NaN, + "learning_rate": 0.00013621556889005515, + "loss": 0.0, + "step": 34976 + }, + { + "epoch": 3.2636931977232435, + "grad_norm": NaN, + "learning_rate": 0.00013620803712861008, + "loss": 0.0, + "step": 34977 + }, + { + "epoch": 3.263786507418121, + "grad_norm": NaN, + "learning_rate": 0.00013620050540223393, + "loss": 0.0, + "step": 34978 + }, + { + "epoch": 3.263879817112998, + "grad_norm": NaN, + "learning_rate": 0.0001361929737109458, + "loss": 0.0, + "step": 34979 + }, + { + "epoch": 3.2639731268078753, + "grad_norm": NaN, + "learning_rate": 0.0001361854420547648, + "loss": 0.0, + "step": 34980 + }, + { + "epoch": 3.2640664365027527, + "grad_norm": NaN, + "learning_rate": 0.0001361779104337102, + "loss": 0.0, + "step": 34981 + }, + { + "epoch": 3.26415974619763, + "grad_norm": NaN, + "learning_rate": 0.00013617037884780105, + "loss": 0.0, + "step": 34982 + }, + { + "epoch": 3.264253055892507, + "grad_norm": NaN, + "learning_rate": 0.0001361628472970565, + "loss": 0.0, + "step": 34983 + }, + { + "epoch": 3.2643463655873846, + "grad_norm": NaN, + "learning_rate": 0.0001361553157814958, + "loss": 0.0, + "step": 34984 + }, + { + "epoch": 3.264439675282262, + "grad_norm": NaN, + "learning_rate": 0.000136147784301138, + "loss": 0.0, + "step": 34985 + }, + { + "epoch": 3.264532984977139, + "grad_norm": NaN, + "learning_rate": 0.00013614025285600223, + "loss": 0.0, + "step": 34986 + }, + { + "epoch": 3.2646262946720164, + "grad_norm": NaN, + "learning_rate": 0.00013613272144610776, + "loss": 0.0, + "step": 34987 + }, + { + "epoch": 3.264719604366894, + "grad_norm": NaN, + "learning_rate": 0.00013612519007147363, + "loss": 0.0, + "step": 34988 + }, + { + "epoch": 3.264812914061771, + "grad_norm": NaN, + "learning_rate": 0.000136117658732119, + "loss": 0.0, + "step": 34989 + }, + { + "epoch": 3.264906223756648, + "grad_norm": NaN, + "learning_rate": 0.00013611012742806316, + "loss": 0.0, + "step": 34990 + }, + { + "epoch": 3.2649995334515256, + "grad_norm": NaN, + "learning_rate": 0.00013610259615932506, + "loss": 0.0, + "step": 34991 + }, + { + "epoch": 3.265092843146403, + "grad_norm": NaN, + "learning_rate": 0.00013609506492592396, + "loss": 0.0, + "step": 34992 + }, + { + "epoch": 3.26518615284128, + "grad_norm": NaN, + "learning_rate": 0.00013608753372787902, + "loss": 0.0, + "step": 34993 + }, + { + "epoch": 3.2652794625361574, + "grad_norm": NaN, + "learning_rate": 0.00013608000256520934, + "loss": 0.0, + "step": 34994 + }, + { + "epoch": 3.265372772231035, + "grad_norm": NaN, + "learning_rate": 0.00013607247143793407, + "loss": 0.0, + "step": 34995 + }, + { + "epoch": 3.2654660819259123, + "grad_norm": NaN, + "learning_rate": 0.00013606494034607242, + "loss": 0.0, + "step": 34996 + }, + { + "epoch": 3.2655593916207892, + "grad_norm": NaN, + "learning_rate": 0.0001360574092896435, + "loss": 0.0, + "step": 34997 + }, + { + "epoch": 3.2656527013156667, + "grad_norm": NaN, + "learning_rate": 0.00013604987826866638, + "loss": 0.0, + "step": 34998 + }, + { + "epoch": 3.265746011010544, + "grad_norm": NaN, + "learning_rate": 0.00013604234728316038, + "loss": 0.0, + "step": 34999 + }, + { + "epoch": 3.265839320705421, + "grad_norm": NaN, + "learning_rate": 0.00013603481633314455, + "loss": 0.0, + "step": 35000 + }, + { + "epoch": 3.2659326304002985, + "grad_norm": NaN, + "learning_rate": 0.00013602728541863796, + "loss": 0.0, + "step": 35001 + }, + { + "epoch": 3.266025940095176, + "grad_norm": NaN, + "learning_rate": 0.00013601975453965995, + "loss": 0.0, + "step": 35002 + }, + { + "epoch": 3.2661192497900533, + "grad_norm": NaN, + "learning_rate": 0.0001360122236962295, + "loss": 0.0, + "step": 35003 + }, + { + "epoch": 3.2662125594849307, + "grad_norm": NaN, + "learning_rate": 0.0001360046928883658, + "loss": 0.0, + "step": 35004 + }, + { + "epoch": 3.2663058691798077, + "grad_norm": NaN, + "learning_rate": 0.0001359971621160881, + "loss": 0.0, + "step": 35005 + }, + { + "epoch": 3.266399178874685, + "grad_norm": NaN, + "learning_rate": 0.00013598963137941543, + "loss": 0.0, + "step": 35006 + }, + { + "epoch": 3.2664924885695625, + "grad_norm": NaN, + "learning_rate": 0.00013598210067836692, + "loss": 0.0, + "step": 35007 + }, + { + "epoch": 3.2665857982644395, + "grad_norm": NaN, + "learning_rate": 0.00013597457001296188, + "loss": 0.0, + "step": 35008 + }, + { + "epoch": 3.266679107959317, + "grad_norm": NaN, + "learning_rate": 0.00013596703938321925, + "loss": 0.0, + "step": 35009 + }, + { + "epoch": 3.2667724176541943, + "grad_norm": NaN, + "learning_rate": 0.00013595950878915832, + "loss": 0.0, + "step": 35010 + }, + { + "epoch": 3.2668657273490718, + "grad_norm": NaN, + "learning_rate": 0.00013595197823079827, + "loss": 0.0, + "step": 35011 + }, + { + "epoch": 3.2669590370439487, + "grad_norm": NaN, + "learning_rate": 0.0001359444477081581, + "loss": 0.0, + "step": 35012 + }, + { + "epoch": 3.267052346738826, + "grad_norm": NaN, + "learning_rate": 0.00013593691722125703, + "loss": 0.0, + "step": 35013 + }, + { + "epoch": 3.2671456564337036, + "grad_norm": NaN, + "learning_rate": 0.00013592938677011427, + "loss": 0.0, + "step": 35014 + }, + { + "epoch": 3.2672389661285806, + "grad_norm": NaN, + "learning_rate": 0.00013592185635474886, + "loss": 0.0, + "step": 35015 + }, + { + "epoch": 3.267332275823458, + "grad_norm": NaN, + "learning_rate": 0.00013591432597518006, + "loss": 0.0, + "step": 35016 + }, + { + "epoch": 3.2674255855183354, + "grad_norm": NaN, + "learning_rate": 0.00013590679563142694, + "loss": 0.0, + "step": 35017 + }, + { + "epoch": 3.267518895213213, + "grad_norm": NaN, + "learning_rate": 0.0001358992653235086, + "loss": 0.0, + "step": 35018 + }, + { + "epoch": 3.26761220490809, + "grad_norm": NaN, + "learning_rate": 0.00013589173505144432, + "loss": 0.0, + "step": 35019 + }, + { + "epoch": 3.267705514602967, + "grad_norm": NaN, + "learning_rate": 0.00013588420481525315, + "loss": 0.0, + "step": 35020 + }, + { + "epoch": 3.2677988242978446, + "grad_norm": NaN, + "learning_rate": 0.00013587667461495423, + "loss": 0.0, + "step": 35021 + }, + { + "epoch": 3.2678921339927216, + "grad_norm": NaN, + "learning_rate": 0.00013586914445056682, + "loss": 0.0, + "step": 35022 + }, + { + "epoch": 3.267985443687599, + "grad_norm": NaN, + "learning_rate": 0.00013586161432210995, + "loss": 0.0, + "step": 35023 + }, + { + "epoch": 3.2680787533824764, + "grad_norm": NaN, + "learning_rate": 0.00013585408422960276, + "loss": 0.0, + "step": 35024 + }, + { + "epoch": 3.268172063077354, + "grad_norm": NaN, + "learning_rate": 0.00013584655417306452, + "loss": 0.0, + "step": 35025 + }, + { + "epoch": 3.2682653727722313, + "grad_norm": NaN, + "learning_rate": 0.00013583902415251428, + "loss": 0.0, + "step": 35026 + }, + { + "epoch": 3.2683586824671083, + "grad_norm": NaN, + "learning_rate": 0.00013583149416797116, + "loss": 0.0, + "step": 35027 + }, + { + "epoch": 3.2684519921619857, + "grad_norm": NaN, + "learning_rate": 0.00013582396421945445, + "loss": 0.0, + "step": 35028 + }, + { + "epoch": 3.268545301856863, + "grad_norm": NaN, + "learning_rate": 0.00013581643430698312, + "loss": 0.0, + "step": 35029 + }, + { + "epoch": 3.26863861155174, + "grad_norm": NaN, + "learning_rate": 0.0001358089044305764, + "loss": 0.0, + "step": 35030 + }, + { + "epoch": 3.2687319212466175, + "grad_norm": NaN, + "learning_rate": 0.00013580137459025347, + "loss": 0.0, + "step": 35031 + }, + { + "epoch": 3.268825230941495, + "grad_norm": NaN, + "learning_rate": 0.0001357938447860334, + "loss": 0.0, + "step": 35032 + }, + { + "epoch": 3.2689185406363723, + "grad_norm": NaN, + "learning_rate": 0.00013578631501793537, + "loss": 0.0, + "step": 35033 + }, + { + "epoch": 3.2690118503312493, + "grad_norm": NaN, + "learning_rate": 0.00013577878528597858, + "loss": 0.0, + "step": 35034 + }, + { + "epoch": 3.2691051600261267, + "grad_norm": NaN, + "learning_rate": 0.00013577125559018213, + "loss": 0.0, + "step": 35035 + }, + { + "epoch": 3.269198469721004, + "grad_norm": NaN, + "learning_rate": 0.00013576372593056507, + "loss": 0.0, + "step": 35036 + }, + { + "epoch": 3.269291779415881, + "grad_norm": NaN, + "learning_rate": 0.00013575619630714673, + "loss": 0.0, + "step": 35037 + }, + { + "epoch": 3.2693850891107585, + "grad_norm": NaN, + "learning_rate": 0.00013574866671994614, + "loss": 0.0, + "step": 35038 + }, + { + "epoch": 3.269478398805636, + "grad_norm": NaN, + "learning_rate": 0.00013574113716898243, + "loss": 0.0, + "step": 35039 + }, + { + "epoch": 3.2695717085005134, + "grad_norm": NaN, + "learning_rate": 0.00013573360765427485, + "loss": 0.0, + "step": 35040 + }, + { + "epoch": 3.2696650181953903, + "grad_norm": NaN, + "learning_rate": 0.00013572607817584242, + "loss": 0.0, + "step": 35041 + }, + { + "epoch": 3.2697583278902678, + "grad_norm": NaN, + "learning_rate": 0.00013571854873370437, + "loss": 0.0, + "step": 35042 + }, + { + "epoch": 3.269851637585145, + "grad_norm": NaN, + "learning_rate": 0.00013571101932787982, + "loss": 0.0, + "step": 35043 + }, + { + "epoch": 3.269944947280022, + "grad_norm": NaN, + "learning_rate": 0.00013570348995838792, + "loss": 0.0, + "step": 35044 + }, + { + "epoch": 3.2700382569748996, + "grad_norm": NaN, + "learning_rate": 0.00013569596062524777, + "loss": 0.0, + "step": 35045 + }, + { + "epoch": 3.270131566669777, + "grad_norm": NaN, + "learning_rate": 0.00013568843132847862, + "loss": 0.0, + "step": 35046 + }, + { + "epoch": 3.2702248763646544, + "grad_norm": NaN, + "learning_rate": 0.00013568090206809953, + "loss": 0.0, + "step": 35047 + }, + { + "epoch": 3.2703181860595314, + "grad_norm": NaN, + "learning_rate": 0.00013567337284412965, + "loss": 0.0, + "step": 35048 + }, + { + "epoch": 3.270411495754409, + "grad_norm": NaN, + "learning_rate": 0.00013566584365658815, + "loss": 0.0, + "step": 35049 + }, + { + "epoch": 3.2705048054492862, + "grad_norm": NaN, + "learning_rate": 0.00013565831450549414, + "loss": 0.0, + "step": 35050 + }, + { + "epoch": 3.2705981151441637, + "grad_norm": NaN, + "learning_rate": 0.0001356507853908668, + "loss": 0.0, + "step": 35051 + }, + { + "epoch": 3.2706914248390406, + "grad_norm": NaN, + "learning_rate": 0.0001356432563127253, + "loss": 0.0, + "step": 35052 + }, + { + "epoch": 3.270784734533918, + "grad_norm": NaN, + "learning_rate": 0.00013563572727108867, + "loss": 0.0, + "step": 35053 + }, + { + "epoch": 3.2708780442287955, + "grad_norm": NaN, + "learning_rate": 0.00013562819826597617, + "loss": 0.0, + "step": 35054 + }, + { + "epoch": 3.270971353923673, + "grad_norm": NaN, + "learning_rate": 0.00013562066929740694, + "loss": 0.0, + "step": 35055 + }, + { + "epoch": 3.27106466361855, + "grad_norm": NaN, + "learning_rate": 0.00013561314036540005, + "loss": 0.0, + "step": 35056 + }, + { + "epoch": 3.2711579733134273, + "grad_norm": NaN, + "learning_rate": 0.00013560561146997468, + "loss": 0.0, + "step": 35057 + }, + { + "epoch": 3.2712512830083047, + "grad_norm": NaN, + "learning_rate": 0.00013559808261115003, + "loss": 0.0, + "step": 35058 + }, + { + "epoch": 3.2713445927031817, + "grad_norm": NaN, + "learning_rate": 0.0001355905537889451, + "loss": 0.0, + "step": 35059 + }, + { + "epoch": 3.271437902398059, + "grad_norm": NaN, + "learning_rate": 0.0001355830250033792, + "loss": 0.0, + "step": 35060 + }, + { + "epoch": 3.2715312120929365, + "grad_norm": NaN, + "learning_rate": 0.00013557549625447136, + "loss": 0.0, + "step": 35061 + }, + { + "epoch": 3.271624521787814, + "grad_norm": NaN, + "learning_rate": 0.00013556796754224073, + "loss": 0.0, + "step": 35062 + }, + { + "epoch": 3.271717831482691, + "grad_norm": NaN, + "learning_rate": 0.00013556043886670655, + "loss": 0.0, + "step": 35063 + }, + { + "epoch": 3.2718111411775683, + "grad_norm": NaN, + "learning_rate": 0.00013555291022788787, + "loss": 0.0, + "step": 35064 + }, + { + "epoch": 3.2719044508724457, + "grad_norm": NaN, + "learning_rate": 0.00013554538162580384, + "loss": 0.0, + "step": 35065 + }, + { + "epoch": 3.2719977605673227, + "grad_norm": NaN, + "learning_rate": 0.00013553785306047366, + "loss": 0.0, + "step": 35066 + }, + { + "epoch": 3.2720910702622, + "grad_norm": NaN, + "learning_rate": 0.00013553032453191642, + "loss": 0.0, + "step": 35067 + }, + { + "epoch": 3.2721843799570776, + "grad_norm": NaN, + "learning_rate": 0.00013552279604015123, + "loss": 0.0, + "step": 35068 + }, + { + "epoch": 3.272277689651955, + "grad_norm": NaN, + "learning_rate": 0.00013551526758519736, + "loss": 0.0, + "step": 35069 + }, + { + "epoch": 3.272370999346832, + "grad_norm": NaN, + "learning_rate": 0.00013550773916707384, + "loss": 0.0, + "step": 35070 + }, + { + "epoch": 3.2724643090417094, + "grad_norm": NaN, + "learning_rate": 0.00013550021078579983, + "loss": 0.0, + "step": 35071 + }, + { + "epoch": 3.272557618736587, + "grad_norm": NaN, + "learning_rate": 0.00013549268244139452, + "loss": 0.0, + "step": 35072 + }, + { + "epoch": 3.272650928431464, + "grad_norm": NaN, + "learning_rate": 0.000135485154133877, + "loss": 0.0, + "step": 35073 + }, + { + "epoch": 3.272744238126341, + "grad_norm": NaN, + "learning_rate": 0.0001354776258632664, + "loss": 0.0, + "step": 35074 + }, + { + "epoch": 3.2728375478212186, + "grad_norm": NaN, + "learning_rate": 0.00013547009762958197, + "loss": 0.0, + "step": 35075 + }, + { + "epoch": 3.272930857516096, + "grad_norm": NaN, + "learning_rate": 0.00013546256943284274, + "loss": 0.0, + "step": 35076 + }, + { + "epoch": 3.2730241672109734, + "grad_norm": NaN, + "learning_rate": 0.00013545504127306785, + "loss": 0.0, + "step": 35077 + }, + { + "epoch": 3.2731174769058504, + "grad_norm": NaN, + "learning_rate": 0.00013544751315027655, + "loss": 0.0, + "step": 35078 + }, + { + "epoch": 3.273210786600728, + "grad_norm": NaN, + "learning_rate": 0.0001354399850644879, + "loss": 0.0, + "step": 35079 + }, + { + "epoch": 3.2733040962956053, + "grad_norm": NaN, + "learning_rate": 0.000135432457015721, + "loss": 0.0, + "step": 35080 + }, + { + "epoch": 3.2733974059904822, + "grad_norm": NaN, + "learning_rate": 0.0001354249290039951, + "loss": 0.0, + "step": 35081 + }, + { + "epoch": 3.2734907156853597, + "grad_norm": NaN, + "learning_rate": 0.0001354174010293293, + "loss": 0.0, + "step": 35082 + }, + { + "epoch": 3.273584025380237, + "grad_norm": NaN, + "learning_rate": 0.00013540987309174267, + "loss": 0.0, + "step": 35083 + }, + { + "epoch": 3.2736773350751145, + "grad_norm": NaN, + "learning_rate": 0.00013540234519125448, + "loss": 0.0, + "step": 35084 + }, + { + "epoch": 3.2737706447699915, + "grad_norm": NaN, + "learning_rate": 0.00013539481732788373, + "loss": 0.0, + "step": 35085 + }, + { + "epoch": 3.273863954464869, + "grad_norm": NaN, + "learning_rate": 0.00013538728950164967, + "loss": 0.0, + "step": 35086 + }, + { + "epoch": 3.2739572641597463, + "grad_norm": NaN, + "learning_rate": 0.00013537976171257144, + "loss": 0.0, + "step": 35087 + }, + { + "epoch": 3.2740505738546233, + "grad_norm": NaN, + "learning_rate": 0.00013537223396066806, + "loss": 0.0, + "step": 35088 + }, + { + "epoch": 3.2741438835495007, + "grad_norm": NaN, + "learning_rate": 0.00013536470624595884, + "loss": 0.0, + "step": 35089 + }, + { + "epoch": 3.274237193244378, + "grad_norm": NaN, + "learning_rate": 0.00013535717856846282, + "loss": 0.0, + "step": 35090 + }, + { + "epoch": 3.2743305029392555, + "grad_norm": NaN, + "learning_rate": 0.0001353496509281991, + "loss": 0.0, + "step": 35091 + }, + { + "epoch": 3.2744238126341325, + "grad_norm": NaN, + "learning_rate": 0.00013534212332518692, + "loss": 0.0, + "step": 35092 + }, + { + "epoch": 3.27451712232901, + "grad_norm": NaN, + "learning_rate": 0.00013533459575944541, + "loss": 0.0, + "step": 35093 + }, + { + "epoch": 3.2746104320238874, + "grad_norm": NaN, + "learning_rate": 0.0001353270682309936, + "loss": 0.0, + "step": 35094 + }, + { + "epoch": 3.2747037417187648, + "grad_norm": NaN, + "learning_rate": 0.00013531954073985077, + "loss": 0.0, + "step": 35095 + }, + { + "epoch": 3.2747970514136417, + "grad_norm": NaN, + "learning_rate": 0.00013531201328603602, + "loss": 0.0, + "step": 35096 + }, + { + "epoch": 3.274890361108519, + "grad_norm": NaN, + "learning_rate": 0.0001353044858695684, + "loss": 0.0, + "step": 35097 + }, + { + "epoch": 3.2749836708033966, + "grad_norm": NaN, + "learning_rate": 0.00013529695849046714, + "loss": 0.0, + "step": 35098 + }, + { + "epoch": 3.275076980498274, + "grad_norm": NaN, + "learning_rate": 0.0001352894311487514, + "loss": 0.0, + "step": 35099 + }, + { + "epoch": 3.275170290193151, + "grad_norm": NaN, + "learning_rate": 0.00013528190384444022, + "loss": 0.0, + "step": 35100 + }, + { + "epoch": 3.2752635998880284, + "grad_norm": NaN, + "learning_rate": 0.00013527437657755284, + "loss": 0.0, + "step": 35101 + }, + { + "epoch": 3.275356909582906, + "grad_norm": NaN, + "learning_rate": 0.00013526684934810835, + "loss": 0.0, + "step": 35102 + }, + { + "epoch": 3.275450219277783, + "grad_norm": NaN, + "learning_rate": 0.00013525932215612587, + "loss": 0.0, + "step": 35103 + }, + { + "epoch": 3.27554352897266, + "grad_norm": NaN, + "learning_rate": 0.00013525179500162464, + "loss": 0.0, + "step": 35104 + }, + { + "epoch": 3.2756368386675376, + "grad_norm": NaN, + "learning_rate": 0.00013524426788462369, + "loss": 0.0, + "step": 35105 + }, + { + "epoch": 3.275730148362415, + "grad_norm": NaN, + "learning_rate": 0.00013523674080514214, + "loss": 0.0, + "step": 35106 + }, + { + "epoch": 3.275823458057292, + "grad_norm": NaN, + "learning_rate": 0.00013522921376319926, + "loss": 0.0, + "step": 35107 + }, + { + "epoch": 3.2759167677521694, + "grad_norm": NaN, + "learning_rate": 0.00013522168675881408, + "loss": 0.0, + "step": 35108 + }, + { + "epoch": 3.276010077447047, + "grad_norm": NaN, + "learning_rate": 0.00013521415979200573, + "loss": 0.0, + "step": 35109 + }, + { + "epoch": 3.276103387141924, + "grad_norm": NaN, + "learning_rate": 0.00013520663286279347, + "loss": 0.0, + "step": 35110 + }, + { + "epoch": 3.2761966968368013, + "grad_norm": NaN, + "learning_rate": 0.00013519910597119635, + "loss": 0.0, + "step": 35111 + }, + { + "epoch": 3.2762900065316787, + "grad_norm": NaN, + "learning_rate": 0.00013519157911723347, + "loss": 0.0, + "step": 35112 + }, + { + "epoch": 3.276383316226556, + "grad_norm": NaN, + "learning_rate": 0.00013518405230092406, + "loss": 0.0, + "step": 35113 + }, + { + "epoch": 3.276476625921433, + "grad_norm": NaN, + "learning_rate": 0.0001351765255222872, + "loss": 0.0, + "step": 35114 + }, + { + "epoch": 3.2765699356163105, + "grad_norm": NaN, + "learning_rate": 0.00013516899878134202, + "loss": 0.0, + "step": 35115 + }, + { + "epoch": 3.276663245311188, + "grad_norm": NaN, + "learning_rate": 0.00013516147207810774, + "loss": 0.0, + "step": 35116 + }, + { + "epoch": 3.276756555006065, + "grad_norm": NaN, + "learning_rate": 0.00013515394541260343, + "loss": 0.0, + "step": 35117 + }, + { + "epoch": 3.2768498647009423, + "grad_norm": NaN, + "learning_rate": 0.00013514641878484816, + "loss": 0.0, + "step": 35118 + }, + { + "epoch": 3.2769431743958197, + "grad_norm": NaN, + "learning_rate": 0.00013513889219486126, + "loss": 0.0, + "step": 35119 + }, + { + "epoch": 3.277036484090697, + "grad_norm": NaN, + "learning_rate": 0.0001351313656426617, + "loss": 0.0, + "step": 35120 + }, + { + "epoch": 3.2771297937855746, + "grad_norm": NaN, + "learning_rate": 0.00013512383912826865, + "loss": 0.0, + "step": 35121 + }, + { + "epoch": 3.2772231034804515, + "grad_norm": NaN, + "learning_rate": 0.0001351163126517013, + "loss": 0.0, + "step": 35122 + }, + { + "epoch": 3.277316413175329, + "grad_norm": NaN, + "learning_rate": 0.00013510878621297872, + "loss": 0.0, + "step": 35123 + }, + { + "epoch": 3.2774097228702064, + "grad_norm": NaN, + "learning_rate": 0.00013510125981212013, + "loss": 0.0, + "step": 35124 + }, + { + "epoch": 3.2775030325650834, + "grad_norm": NaN, + "learning_rate": 0.00013509373344914463, + "loss": 0.0, + "step": 35125 + }, + { + "epoch": 3.2775963422599608, + "grad_norm": NaN, + "learning_rate": 0.0001350862071240713, + "loss": 0.0, + "step": 35126 + }, + { + "epoch": 3.277689651954838, + "grad_norm": NaN, + "learning_rate": 0.00013507868083691935, + "loss": 0.0, + "step": 35127 + }, + { + "epoch": 3.2777829616497156, + "grad_norm": NaN, + "learning_rate": 0.00013507115458770793, + "loss": 0.0, + "step": 35128 + }, + { + "epoch": 3.2778762713445926, + "grad_norm": NaN, + "learning_rate": 0.0001350636283764561, + "loss": 0.0, + "step": 35129 + }, + { + "epoch": 3.27796958103947, + "grad_norm": NaN, + "learning_rate": 0.00013505610220318305, + "loss": 0.0, + "step": 35130 + }, + { + "epoch": 3.2780628907343474, + "grad_norm": NaN, + "learning_rate": 0.00013504857606790795, + "loss": 0.0, + "step": 35131 + }, + { + "epoch": 3.2781562004292244, + "grad_norm": NaN, + "learning_rate": 0.00013504104997064982, + "loss": 0.0, + "step": 35132 + }, + { + "epoch": 3.278249510124102, + "grad_norm": NaN, + "learning_rate": 0.0001350335239114279, + "loss": 0.0, + "step": 35133 + }, + { + "epoch": 3.2783428198189792, + "grad_norm": NaN, + "learning_rate": 0.00013502599789026132, + "loss": 0.0, + "step": 35134 + }, + { + "epoch": 3.2784361295138567, + "grad_norm": NaN, + "learning_rate": 0.00013501847190716914, + "loss": 0.0, + "step": 35135 + }, + { + "epoch": 3.2785294392087336, + "grad_norm": NaN, + "learning_rate": 0.00013501094596217057, + "loss": 0.0, + "step": 35136 + }, + { + "epoch": 3.278622748903611, + "grad_norm": NaN, + "learning_rate": 0.00013500342005528477, + "loss": 0.0, + "step": 35137 + }, + { + "epoch": 3.2787160585984885, + "grad_norm": NaN, + "learning_rate": 0.00013499589418653073, + "loss": 0.0, + "step": 35138 + }, + { + "epoch": 3.2788093682933654, + "grad_norm": NaN, + "learning_rate": 0.00013498836835592777, + "loss": 0.0, + "step": 35139 + }, + { + "epoch": 3.278902677988243, + "grad_norm": NaN, + "learning_rate": 0.00013498084256349495, + "loss": 0.0, + "step": 35140 + }, + { + "epoch": 3.2789959876831203, + "grad_norm": NaN, + "learning_rate": 0.00013497331680925134, + "loss": 0.0, + "step": 35141 + }, + { + "epoch": 3.2790892973779977, + "grad_norm": NaN, + "learning_rate": 0.00013496579109321616, + "loss": 0.0, + "step": 35142 + }, + { + "epoch": 3.2791826070728747, + "grad_norm": NaN, + "learning_rate": 0.00013495826541540855, + "loss": 0.0, + "step": 35143 + }, + { + "epoch": 3.279275916767752, + "grad_norm": NaN, + "learning_rate": 0.00013495073977584754, + "loss": 0.0, + "step": 35144 + }, + { + "epoch": 3.2793692264626295, + "grad_norm": NaN, + "learning_rate": 0.0001349432141745524, + "loss": 0.0, + "step": 35145 + }, + { + "epoch": 3.279462536157507, + "grad_norm": NaN, + "learning_rate": 0.00013493568861154223, + "loss": 0.0, + "step": 35146 + }, + { + "epoch": 3.279555845852384, + "grad_norm": NaN, + "learning_rate": 0.00013492816308683606, + "loss": 0.0, + "step": 35147 + }, + { + "epoch": 3.2796491555472613, + "grad_norm": NaN, + "learning_rate": 0.00013492063760045316, + "loss": 0.0, + "step": 35148 + }, + { + "epoch": 3.2797424652421387, + "grad_norm": NaN, + "learning_rate": 0.00013491311215241262, + "loss": 0.0, + "step": 35149 + }, + { + "epoch": 3.279835774937016, + "grad_norm": NaN, + "learning_rate": 0.00013490558674273352, + "loss": 0.0, + "step": 35150 + }, + { + "epoch": 3.279929084631893, + "grad_norm": NaN, + "learning_rate": 0.0001348980613714351, + "loss": 0.0, + "step": 35151 + }, + { + "epoch": 3.2800223943267706, + "grad_norm": NaN, + "learning_rate": 0.00013489053603853643, + "loss": 0.0, + "step": 35152 + }, + { + "epoch": 3.280115704021648, + "grad_norm": NaN, + "learning_rate": 0.00013488301074405658, + "loss": 0.0, + "step": 35153 + }, + { + "epoch": 3.280209013716525, + "grad_norm": NaN, + "learning_rate": 0.00013487548548801485, + "loss": 0.0, + "step": 35154 + }, + { + "epoch": 3.2803023234114024, + "grad_norm": NaN, + "learning_rate": 0.00013486796027043022, + "loss": 0.0, + "step": 35155 + }, + { + "epoch": 3.28039563310628, + "grad_norm": NaN, + "learning_rate": 0.00013486043509132185, + "loss": 0.0, + "step": 35156 + }, + { + "epoch": 3.280488942801157, + "grad_norm": NaN, + "learning_rate": 0.00013485290995070902, + "loss": 0.0, + "step": 35157 + }, + { + "epoch": 3.280582252496034, + "grad_norm": NaN, + "learning_rate": 0.0001348453848486107, + "loss": 0.0, + "step": 35158 + }, + { + "epoch": 3.2806755621909116, + "grad_norm": NaN, + "learning_rate": 0.000134837859785046, + "loss": 0.0, + "step": 35159 + }, + { + "epoch": 3.280768871885789, + "grad_norm": NaN, + "learning_rate": 0.00013483033476003426, + "loss": 0.0, + "step": 35160 + }, + { + "epoch": 3.280862181580666, + "grad_norm": NaN, + "learning_rate": 0.0001348228097735944, + "loss": 0.0, + "step": 35161 + }, + { + "epoch": 3.2809554912755434, + "grad_norm": NaN, + "learning_rate": 0.00013481528482574566, + "loss": 0.0, + "step": 35162 + }, + { + "epoch": 3.281048800970421, + "grad_norm": NaN, + "learning_rate": 0.0001348077599165072, + "loss": 0.0, + "step": 35163 + }, + { + "epoch": 3.2811421106652983, + "grad_norm": NaN, + "learning_rate": 0.000134800235045898, + "loss": 0.0, + "step": 35164 + }, + { + "epoch": 3.2812354203601752, + "grad_norm": NaN, + "learning_rate": 0.00013479271021393736, + "loss": 0.0, + "step": 35165 + }, + { + "epoch": 3.2813287300550527, + "grad_norm": NaN, + "learning_rate": 0.0001347851854206444, + "loss": 0.0, + "step": 35166 + }, + { + "epoch": 3.28142203974993, + "grad_norm": NaN, + "learning_rate": 0.0001347776606660381, + "loss": 0.0, + "step": 35167 + }, + { + "epoch": 3.2815153494448075, + "grad_norm": NaN, + "learning_rate": 0.00013477013595013777, + "loss": 0.0, + "step": 35168 + }, + { + "epoch": 3.2816086591396845, + "grad_norm": NaN, + "learning_rate": 0.0001347626112729625, + "loss": 0.0, + "step": 35169 + }, + { + "epoch": 3.281701968834562, + "grad_norm": NaN, + "learning_rate": 0.00013475508663453136, + "loss": 0.0, + "step": 35170 + }, + { + "epoch": 3.2817952785294393, + "grad_norm": NaN, + "learning_rate": 0.0001347475620348635, + "loss": 0.0, + "step": 35171 + }, + { + "epoch": 3.2818885882243167, + "grad_norm": NaN, + "learning_rate": 0.00013474003747397813, + "loss": 0.0, + "step": 35172 + }, + { + "epoch": 3.2819818979191937, + "grad_norm": NaN, + "learning_rate": 0.00013473251295189424, + "loss": 0.0, + "step": 35173 + }, + { + "epoch": 3.282075207614071, + "grad_norm": NaN, + "learning_rate": 0.0001347249884686311, + "loss": 0.0, + "step": 35174 + }, + { + "epoch": 3.2821685173089485, + "grad_norm": NaN, + "learning_rate": 0.0001347174640242078, + "loss": 0.0, + "step": 35175 + }, + { + "epoch": 3.2822618270038255, + "grad_norm": NaN, + "learning_rate": 0.00013470993961864342, + "loss": 0.0, + "step": 35176 + }, + { + "epoch": 3.282355136698703, + "grad_norm": NaN, + "learning_rate": 0.00013470241525195717, + "loss": 0.0, + "step": 35177 + }, + { + "epoch": 3.2824484463935804, + "grad_norm": NaN, + "learning_rate": 0.00013469489092416816, + "loss": 0.0, + "step": 35178 + }, + { + "epoch": 3.2825417560884578, + "grad_norm": NaN, + "learning_rate": 0.00013468736663529545, + "loss": 0.0, + "step": 35179 + }, + { + "epoch": 3.2826350657833347, + "grad_norm": NaN, + "learning_rate": 0.00013467984238535826, + "loss": 0.0, + "step": 35180 + }, + { + "epoch": 3.282728375478212, + "grad_norm": NaN, + "learning_rate": 0.00013467231817437575, + "loss": 0.0, + "step": 35181 + }, + { + "epoch": 3.2828216851730896, + "grad_norm": NaN, + "learning_rate": 0.0001346647940023669, + "loss": 0.0, + "step": 35182 + }, + { + "epoch": 3.2829149948679666, + "grad_norm": NaN, + "learning_rate": 0.00013465726986935098, + "loss": 0.0, + "step": 35183 + }, + { + "epoch": 3.283008304562844, + "grad_norm": NaN, + "learning_rate": 0.0001346497457753471, + "loss": 0.0, + "step": 35184 + }, + { + "epoch": 3.2831016142577214, + "grad_norm": NaN, + "learning_rate": 0.00013464222172037432, + "loss": 0.0, + "step": 35185 + }, + { + "epoch": 3.283194923952599, + "grad_norm": NaN, + "learning_rate": 0.00013463469770445184, + "loss": 0.0, + "step": 35186 + }, + { + "epoch": 3.283288233647476, + "grad_norm": NaN, + "learning_rate": 0.0001346271737275988, + "loss": 0.0, + "step": 35187 + }, + { + "epoch": 3.283381543342353, + "grad_norm": NaN, + "learning_rate": 0.00013461964978983426, + "loss": 0.0, + "step": 35188 + }, + { + "epoch": 3.2834748530372306, + "grad_norm": NaN, + "learning_rate": 0.0001346121258911774, + "loss": 0.0, + "step": 35189 + }, + { + "epoch": 3.283568162732108, + "grad_norm": NaN, + "learning_rate": 0.0001346046020316474, + "loss": 0.0, + "step": 35190 + }, + { + "epoch": 3.283661472426985, + "grad_norm": NaN, + "learning_rate": 0.00013459707821126328, + "loss": 0.0, + "step": 35191 + }, + { + "epoch": 3.2837547821218624, + "grad_norm": NaN, + "learning_rate": 0.00013458955443004423, + "loss": 0.0, + "step": 35192 + }, + { + "epoch": 3.28384809181674, + "grad_norm": NaN, + "learning_rate": 0.00013458203068800943, + "loss": 0.0, + "step": 35193 + }, + { + "epoch": 3.2839414015116173, + "grad_norm": NaN, + "learning_rate": 0.0001345745069851779, + "loss": 0.0, + "step": 35194 + }, + { + "epoch": 3.2840347112064943, + "grad_norm": NaN, + "learning_rate": 0.0001345669833215689, + "loss": 0.0, + "step": 35195 + }, + { + "epoch": 3.2841280209013717, + "grad_norm": NaN, + "learning_rate": 0.00013455945969720142, + "loss": 0.0, + "step": 35196 + }, + { + "epoch": 3.284221330596249, + "grad_norm": NaN, + "learning_rate": 0.00013455193611209467, + "loss": 0.0, + "step": 35197 + }, + { + "epoch": 3.284314640291126, + "grad_norm": NaN, + "learning_rate": 0.0001345444125662678, + "loss": 0.0, + "step": 35198 + }, + { + "epoch": 3.2844079499860035, + "grad_norm": NaN, + "learning_rate": 0.00013453688905973985, + "loss": 0.0, + "step": 35199 + }, + { + "epoch": 3.284501259680881, + "grad_norm": NaN, + "learning_rate": 0.00013452936559253006, + "loss": 0.0, + "step": 35200 + }, + { + "epoch": 3.2845945693757583, + "grad_norm": NaN, + "learning_rate": 0.00013452184216465753, + "loss": 0.0, + "step": 35201 + }, + { + "epoch": 3.2846878790706353, + "grad_norm": NaN, + "learning_rate": 0.00013451431877614133, + "loss": 0.0, + "step": 35202 + }, + { + "epoch": 3.2847811887655127, + "grad_norm": NaN, + "learning_rate": 0.00013450679542700064, + "loss": 0.0, + "step": 35203 + }, + { + "epoch": 3.28487449846039, + "grad_norm": NaN, + "learning_rate": 0.00013449927211725462, + "loss": 0.0, + "step": 35204 + }, + { + "epoch": 3.284967808155267, + "grad_norm": NaN, + "learning_rate": 0.00013449174884692228, + "loss": 0.0, + "step": 35205 + }, + { + "epoch": 3.2850611178501445, + "grad_norm": NaN, + "learning_rate": 0.00013448422561602286, + "loss": 0.0, + "step": 35206 + }, + { + "epoch": 3.285154427545022, + "grad_norm": NaN, + "learning_rate": 0.0001344767024245755, + "loss": 0.0, + "step": 35207 + }, + { + "epoch": 3.2852477372398994, + "grad_norm": NaN, + "learning_rate": 0.00013446917927259925, + "loss": 0.0, + "step": 35208 + }, + { + "epoch": 3.2853410469347764, + "grad_norm": NaN, + "learning_rate": 0.00013446165616011328, + "loss": 0.0, + "step": 35209 + }, + { + "epoch": 3.2854343566296538, + "grad_norm": NaN, + "learning_rate": 0.00013445413308713677, + "loss": 0.0, + "step": 35210 + }, + { + "epoch": 3.285527666324531, + "grad_norm": NaN, + "learning_rate": 0.0001344466100536887, + "loss": 0.0, + "step": 35211 + }, + { + "epoch": 3.285620976019408, + "grad_norm": NaN, + "learning_rate": 0.00013443908705978834, + "loss": 0.0, + "step": 35212 + }, + { + "epoch": 3.2857142857142856, + "grad_norm": NaN, + "learning_rate": 0.0001344315641054548, + "loss": 0.0, + "step": 35213 + }, + { + "epoch": 3.285807595409163, + "grad_norm": NaN, + "learning_rate": 0.0001344240411907071, + "loss": 0.0, + "step": 35214 + }, + { + "epoch": 3.2859009051040404, + "grad_norm": NaN, + "learning_rate": 0.0001344165183155645, + "loss": 0.0, + "step": 35215 + }, + { + "epoch": 3.285994214798918, + "grad_norm": NaN, + "learning_rate": 0.0001344089954800461, + "loss": 0.0, + "step": 35216 + }, + { + "epoch": 3.286087524493795, + "grad_norm": NaN, + "learning_rate": 0.000134401472684171, + "loss": 0.0, + "step": 35217 + }, + { + "epoch": 3.2861808341886722, + "grad_norm": NaN, + "learning_rate": 0.0001343939499279583, + "loss": 0.0, + "step": 35218 + }, + { + "epoch": 3.2862741438835497, + "grad_norm": NaN, + "learning_rate": 0.00013438642721142721, + "loss": 0.0, + "step": 35219 + }, + { + "epoch": 3.2863674535784266, + "grad_norm": NaN, + "learning_rate": 0.00013437890453459677, + "loss": 0.0, + "step": 35220 + }, + { + "epoch": 3.286460763273304, + "grad_norm": NaN, + "learning_rate": 0.00013437138189748615, + "loss": 0.0, + "step": 35221 + }, + { + "epoch": 3.2865540729681815, + "grad_norm": NaN, + "learning_rate": 0.00013436385930011452, + "loss": 0.0, + "step": 35222 + }, + { + "epoch": 3.286647382663059, + "grad_norm": NaN, + "learning_rate": 0.00013435633674250088, + "loss": 0.0, + "step": 35223 + }, + { + "epoch": 3.286740692357936, + "grad_norm": NaN, + "learning_rate": 0.00013434881422466448, + "loss": 0.0, + "step": 35224 + }, + { + "epoch": 3.2868340020528133, + "grad_norm": NaN, + "learning_rate": 0.00013434129174662445, + "loss": 0.0, + "step": 35225 + }, + { + "epoch": 3.2869273117476907, + "grad_norm": NaN, + "learning_rate": 0.00013433376930839982, + "loss": 0.0, + "step": 35226 + }, + { + "epoch": 3.2870206214425677, + "grad_norm": NaN, + "learning_rate": 0.00013432624691000981, + "loss": 0.0, + "step": 35227 + }, + { + "epoch": 3.287113931137445, + "grad_norm": NaN, + "learning_rate": 0.00013431872455147353, + "loss": 0.0, + "step": 35228 + }, + { + "epoch": 3.2872072408323225, + "grad_norm": NaN, + "learning_rate": 0.00013431120223281003, + "loss": 0.0, + "step": 35229 + }, + { + "epoch": 3.2873005505272, + "grad_norm": NaN, + "learning_rate": 0.00013430367995403855, + "loss": 0.0, + "step": 35230 + }, + { + "epoch": 3.287393860222077, + "grad_norm": NaN, + "learning_rate": 0.00013429615771517816, + "loss": 0.0, + "step": 35231 + }, + { + "epoch": 3.2874871699169543, + "grad_norm": NaN, + "learning_rate": 0.00013428863551624793, + "loss": 0.0, + "step": 35232 + }, + { + "epoch": 3.2875804796118318, + "grad_norm": NaN, + "learning_rate": 0.00013428111335726705, + "loss": 0.0, + "step": 35233 + }, + { + "epoch": 3.2876737893067087, + "grad_norm": NaN, + "learning_rate": 0.00013427359123825472, + "loss": 0.0, + "step": 35234 + }, + { + "epoch": 3.287767099001586, + "grad_norm": NaN, + "learning_rate": 0.00013426606915922993, + "loss": 0.0, + "step": 35235 + }, + { + "epoch": 3.2878604086964636, + "grad_norm": NaN, + "learning_rate": 0.00013425854712021188, + "loss": 0.0, + "step": 35236 + }, + { + "epoch": 3.287953718391341, + "grad_norm": NaN, + "learning_rate": 0.0001342510251212197, + "loss": 0.0, + "step": 35237 + }, + { + "epoch": 3.2880470280862184, + "grad_norm": NaN, + "learning_rate": 0.00013424350316227246, + "loss": 0.0, + "step": 35238 + }, + { + "epoch": 3.2881403377810954, + "grad_norm": NaN, + "learning_rate": 0.0001342359812433894, + "loss": 0.0, + "step": 35239 + }, + { + "epoch": 3.288233647475973, + "grad_norm": NaN, + "learning_rate": 0.0001342284593645895, + "loss": 0.0, + "step": 35240 + }, + { + "epoch": 3.28832695717085, + "grad_norm": NaN, + "learning_rate": 0.00013422093752589193, + "loss": 0.0, + "step": 35241 + }, + { + "epoch": 3.288420266865727, + "grad_norm": NaN, + "learning_rate": 0.00013421341572731593, + "loss": 0.0, + "step": 35242 + }, + { + "epoch": 3.2885135765606046, + "grad_norm": NaN, + "learning_rate": 0.00013420589396888046, + "loss": 0.0, + "step": 35243 + }, + { + "epoch": 3.288606886255482, + "grad_norm": NaN, + "learning_rate": 0.00013419837225060475, + "loss": 0.0, + "step": 35244 + }, + { + "epoch": 3.2887001959503595, + "grad_norm": NaN, + "learning_rate": 0.00013419085057250792, + "loss": 0.0, + "step": 35245 + }, + { + "epoch": 3.2887935056452364, + "grad_norm": NaN, + "learning_rate": 0.00013418332893460903, + "loss": 0.0, + "step": 35246 + }, + { + "epoch": 3.288886815340114, + "grad_norm": NaN, + "learning_rate": 0.0001341758073369273, + "loss": 0.0, + "step": 35247 + }, + { + "epoch": 3.2889801250349913, + "grad_norm": NaN, + "learning_rate": 0.0001341682857794818, + "loss": 0.0, + "step": 35248 + }, + { + "epoch": 3.2890734347298682, + "grad_norm": NaN, + "learning_rate": 0.0001341607642622916, + "loss": 0.0, + "step": 35249 + }, + { + "epoch": 3.2891667444247457, + "grad_norm": NaN, + "learning_rate": 0.00013415324278537592, + "loss": 0.0, + "step": 35250 + }, + { + "epoch": 3.289260054119623, + "grad_norm": NaN, + "learning_rate": 0.00013414572134875388, + "loss": 0.0, + "step": 35251 + }, + { + "epoch": 3.2893533638145005, + "grad_norm": NaN, + "learning_rate": 0.00013413819995244453, + "loss": 0.0, + "step": 35252 + }, + { + "epoch": 3.2894466735093775, + "grad_norm": NaN, + "learning_rate": 0.00013413067859646704, + "loss": 0.0, + "step": 35253 + }, + { + "epoch": 3.289539983204255, + "grad_norm": NaN, + "learning_rate": 0.0001341231572808406, + "loss": 0.0, + "step": 35254 + }, + { + "epoch": 3.2896332928991323, + "grad_norm": NaN, + "learning_rate": 0.00013411563600558416, + "loss": 0.0, + "step": 35255 + }, + { + "epoch": 3.2897266025940093, + "grad_norm": NaN, + "learning_rate": 0.00013410811477071702, + "loss": 0.0, + "step": 35256 + }, + { + "epoch": 3.2898199122888867, + "grad_norm": NaN, + "learning_rate": 0.00013410059357625825, + "loss": 0.0, + "step": 35257 + }, + { + "epoch": 3.289913221983764, + "grad_norm": NaN, + "learning_rate": 0.0001340930724222269, + "loss": 0.0, + "step": 35258 + }, + { + "epoch": 3.2900065316786415, + "grad_norm": NaN, + "learning_rate": 0.00013408555130864223, + "loss": 0.0, + "step": 35259 + }, + { + "epoch": 3.2900998413735185, + "grad_norm": NaN, + "learning_rate": 0.00013407803023552327, + "loss": 0.0, + "step": 35260 + }, + { + "epoch": 3.290193151068396, + "grad_norm": NaN, + "learning_rate": 0.0001340705092028891, + "loss": 0.0, + "step": 35261 + }, + { + "epoch": 3.2902864607632734, + "grad_norm": NaN, + "learning_rate": 0.00013406298821075894, + "loss": 0.0, + "step": 35262 + }, + { + "epoch": 3.2903797704581508, + "grad_norm": NaN, + "learning_rate": 0.00013405546725915192, + "loss": 0.0, + "step": 35263 + }, + { + "epoch": 3.2904730801530278, + "grad_norm": NaN, + "learning_rate": 0.00013404794634808707, + "loss": 0.0, + "step": 35264 + }, + { + "epoch": 3.290566389847905, + "grad_norm": NaN, + "learning_rate": 0.0001340404254775836, + "loss": 0.0, + "step": 35265 + }, + { + "epoch": 3.2906596995427826, + "grad_norm": NaN, + "learning_rate": 0.00013403290464766062, + "loss": 0.0, + "step": 35266 + }, + { + "epoch": 3.29075300923766, + "grad_norm": NaN, + "learning_rate": 0.00013402538385833717, + "loss": 0.0, + "step": 35267 + }, + { + "epoch": 3.290846318932537, + "grad_norm": NaN, + "learning_rate": 0.00013401786310963243, + "loss": 0.0, + "step": 35268 + }, + { + "epoch": 3.2909396286274144, + "grad_norm": NaN, + "learning_rate": 0.00013401034240156565, + "loss": 0.0, + "step": 35269 + }, + { + "epoch": 3.291032938322292, + "grad_norm": NaN, + "learning_rate": 0.00013400282173415572, + "loss": 0.0, + "step": 35270 + }, + { + "epoch": 3.291126248017169, + "grad_norm": NaN, + "learning_rate": 0.00013399530110742188, + "loss": 0.0, + "step": 35271 + }, + { + "epoch": 3.291219557712046, + "grad_norm": NaN, + "learning_rate": 0.0001339877805213833, + "loss": 0.0, + "step": 35272 + }, + { + "epoch": 3.2913128674069236, + "grad_norm": NaN, + "learning_rate": 0.00013398025997605903, + "loss": 0.0, + "step": 35273 + }, + { + "epoch": 3.291406177101801, + "grad_norm": NaN, + "learning_rate": 0.0001339727394714682, + "loss": 0.0, + "step": 35274 + }, + { + "epoch": 3.291499486796678, + "grad_norm": NaN, + "learning_rate": 0.00013396521900762998, + "loss": 0.0, + "step": 35275 + }, + { + "epoch": 3.2915927964915555, + "grad_norm": NaN, + "learning_rate": 0.00013395769858456344, + "loss": 0.0, + "step": 35276 + }, + { + "epoch": 3.291686106186433, + "grad_norm": NaN, + "learning_rate": 0.00013395017820228767, + "loss": 0.0, + "step": 35277 + }, + { + "epoch": 3.29177941588131, + "grad_norm": NaN, + "learning_rate": 0.00013394265786082193, + "loss": 0.0, + "step": 35278 + }, + { + "epoch": 3.2918727255761873, + "grad_norm": NaN, + "learning_rate": 0.00013393513756018523, + "loss": 0.0, + "step": 35279 + }, + { + "epoch": 3.2919660352710647, + "grad_norm": NaN, + "learning_rate": 0.00013392761730039667, + "loss": 0.0, + "step": 35280 + }, + { + "epoch": 3.292059344965942, + "grad_norm": NaN, + "learning_rate": 0.00013392009708147545, + "loss": 0.0, + "step": 35281 + }, + { + "epoch": 3.292152654660819, + "grad_norm": NaN, + "learning_rate": 0.00013391257690344066, + "loss": 0.0, + "step": 35282 + }, + { + "epoch": 3.2922459643556965, + "grad_norm": NaN, + "learning_rate": 0.0001339050567663114, + "loss": 0.0, + "step": 35283 + }, + { + "epoch": 3.292339274050574, + "grad_norm": NaN, + "learning_rate": 0.00013389753667010686, + "loss": 0.0, + "step": 35284 + }, + { + "epoch": 3.2924325837454513, + "grad_norm": NaN, + "learning_rate": 0.0001338900166148461, + "loss": 0.0, + "step": 35285 + }, + { + "epoch": 3.2925258934403283, + "grad_norm": NaN, + "learning_rate": 0.00013388249660054827, + "loss": 0.0, + "step": 35286 + }, + { + "epoch": 3.2926192031352057, + "grad_norm": NaN, + "learning_rate": 0.0001338749766272324, + "loss": 0.0, + "step": 35287 + }, + { + "epoch": 3.292712512830083, + "grad_norm": NaN, + "learning_rate": 0.00013386745669491775, + "loss": 0.0, + "step": 35288 + }, + { + "epoch": 3.2928058225249606, + "grad_norm": NaN, + "learning_rate": 0.0001338599368036234, + "loss": 0.0, + "step": 35289 + }, + { + "epoch": 3.2928991322198375, + "grad_norm": NaN, + "learning_rate": 0.00013385241695336838, + "loss": 0.0, + "step": 35290 + }, + { + "epoch": 3.292992441914715, + "grad_norm": NaN, + "learning_rate": 0.0001338448971441719, + "loss": 0.0, + "step": 35291 + }, + { + "epoch": 3.2930857516095924, + "grad_norm": NaN, + "learning_rate": 0.00013383737737605314, + "loss": 0.0, + "step": 35292 + }, + { + "epoch": 3.2931790613044694, + "grad_norm": NaN, + "learning_rate": 0.00013382985764903105, + "loss": 0.0, + "step": 35293 + }, + { + "epoch": 3.2932723709993468, + "grad_norm": NaN, + "learning_rate": 0.00013382233796312485, + "loss": 0.0, + "step": 35294 + }, + { + "epoch": 3.293365680694224, + "grad_norm": NaN, + "learning_rate": 0.0001338148183183537, + "loss": 0.0, + "step": 35295 + }, + { + "epoch": 3.2934589903891016, + "grad_norm": NaN, + "learning_rate": 0.00013380729871473662, + "loss": 0.0, + "step": 35296 + }, + { + "epoch": 3.2935523000839786, + "grad_norm": NaN, + "learning_rate": 0.00013379977915229283, + "loss": 0.0, + "step": 35297 + }, + { + "epoch": 3.293645609778856, + "grad_norm": NaN, + "learning_rate": 0.0001337922596310414, + "loss": 0.0, + "step": 35298 + }, + { + "epoch": 3.2937389194737334, + "grad_norm": NaN, + "learning_rate": 0.0001337847401510014, + "loss": 0.0, + "step": 35299 + }, + { + "epoch": 3.2938322291686104, + "grad_norm": NaN, + "learning_rate": 0.00013377722071219206, + "loss": 0.0, + "step": 35300 + }, + { + "epoch": 3.293925538863488, + "grad_norm": NaN, + "learning_rate": 0.00013376970131463242, + "loss": 0.0, + "step": 35301 + }, + { + "epoch": 3.2940188485583652, + "grad_norm": NaN, + "learning_rate": 0.0001337621819583416, + "loss": 0.0, + "step": 35302 + }, + { + "epoch": 3.2941121582532427, + "grad_norm": NaN, + "learning_rate": 0.00013375466264333878, + "loss": 0.0, + "step": 35303 + }, + { + "epoch": 3.2942054679481196, + "grad_norm": NaN, + "learning_rate": 0.00013374714336964304, + "loss": 0.0, + "step": 35304 + }, + { + "epoch": 3.294298777642997, + "grad_norm": NaN, + "learning_rate": 0.00013373962413727345, + "loss": 0.0, + "step": 35305 + }, + { + "epoch": 3.2943920873378745, + "grad_norm": NaN, + "learning_rate": 0.0001337321049462492, + "loss": 0.0, + "step": 35306 + }, + { + "epoch": 3.294485397032752, + "grad_norm": NaN, + "learning_rate": 0.00013372458579658945, + "loss": 0.0, + "step": 35307 + }, + { + "epoch": 3.294578706727629, + "grad_norm": NaN, + "learning_rate": 0.00013371706668831318, + "loss": 0.0, + "step": 35308 + }, + { + "epoch": 3.2946720164225063, + "grad_norm": NaN, + "learning_rate": 0.00013370954762143957, + "loss": 0.0, + "step": 35309 + }, + { + "epoch": 3.2947653261173837, + "grad_norm": NaN, + "learning_rate": 0.00013370202859598783, + "loss": 0.0, + "step": 35310 + }, + { + "epoch": 3.294858635812261, + "grad_norm": NaN, + "learning_rate": 0.00013369450961197695, + "loss": 0.0, + "step": 35311 + }, + { + "epoch": 3.294951945507138, + "grad_norm": NaN, + "learning_rate": 0.00013368699066942613, + "loss": 0.0, + "step": 35312 + }, + { + "epoch": 3.2950452552020155, + "grad_norm": NaN, + "learning_rate": 0.00013367947176835446, + "loss": 0.0, + "step": 35313 + }, + { + "epoch": 3.295138564896893, + "grad_norm": NaN, + "learning_rate": 0.00013367195290878107, + "loss": 0.0, + "step": 35314 + }, + { + "epoch": 3.29523187459177, + "grad_norm": NaN, + "learning_rate": 0.000133664434090725, + "loss": 0.0, + "step": 35315 + }, + { + "epoch": 3.2953251842866473, + "grad_norm": NaN, + "learning_rate": 0.00013365691531420552, + "loss": 0.0, + "step": 35316 + }, + { + "epoch": 3.2954184939815248, + "grad_norm": NaN, + "learning_rate": 0.00013364939657924162, + "loss": 0.0, + "step": 35317 + }, + { + "epoch": 3.295511803676402, + "grad_norm": NaN, + "learning_rate": 0.00013364187788585244, + "loss": 0.0, + "step": 35318 + }, + { + "epoch": 3.295605113371279, + "grad_norm": NaN, + "learning_rate": 0.00013363435923405718, + "loss": 0.0, + "step": 35319 + }, + { + "epoch": 3.2956984230661566, + "grad_norm": NaN, + "learning_rate": 0.00013362684062387487, + "loss": 0.0, + "step": 35320 + }, + { + "epoch": 3.295791732761034, + "grad_norm": NaN, + "learning_rate": 0.00013361932205532462, + "loss": 0.0, + "step": 35321 + }, + { + "epoch": 3.295885042455911, + "grad_norm": NaN, + "learning_rate": 0.00013361180352842566, + "loss": 0.0, + "step": 35322 + }, + { + "epoch": 3.2959783521507884, + "grad_norm": NaN, + "learning_rate": 0.00013360428504319698, + "loss": 0.0, + "step": 35323 + }, + { + "epoch": 3.296071661845666, + "grad_norm": NaN, + "learning_rate": 0.0001335967665996577, + "loss": 0.0, + "step": 35324 + }, + { + "epoch": 3.296164971540543, + "grad_norm": NaN, + "learning_rate": 0.00013358924819782708, + "loss": 0.0, + "step": 35325 + }, + { + "epoch": 3.29625828123542, + "grad_norm": NaN, + "learning_rate": 0.00013358172983772408, + "loss": 0.0, + "step": 35326 + }, + { + "epoch": 3.2963515909302976, + "grad_norm": NaN, + "learning_rate": 0.00013357421151936785, + "loss": 0.0, + "step": 35327 + }, + { + "epoch": 3.296444900625175, + "grad_norm": NaN, + "learning_rate": 0.0001335666932427776, + "loss": 0.0, + "step": 35328 + }, + { + "epoch": 3.296538210320052, + "grad_norm": NaN, + "learning_rate": 0.00013355917500797236, + "loss": 0.0, + "step": 35329 + }, + { + "epoch": 3.2966315200149294, + "grad_norm": NaN, + "learning_rate": 0.0001335516568149713, + "loss": 0.0, + "step": 35330 + }, + { + "epoch": 3.296724829709807, + "grad_norm": NaN, + "learning_rate": 0.00013354413866379343, + "loss": 0.0, + "step": 35331 + }, + { + "epoch": 3.2968181394046843, + "grad_norm": NaN, + "learning_rate": 0.000133536620554458, + "loss": 0.0, + "step": 35332 + }, + { + "epoch": 3.2969114490995617, + "grad_norm": NaN, + "learning_rate": 0.00013352910248698406, + "loss": 0.0, + "step": 35333 + }, + { + "epoch": 3.2970047587944387, + "grad_norm": NaN, + "learning_rate": 0.0001335215844613907, + "loss": 0.0, + "step": 35334 + }, + { + "epoch": 3.297098068489316, + "grad_norm": NaN, + "learning_rate": 0.00013351406647769707, + "loss": 0.0, + "step": 35335 + }, + { + "epoch": 3.2971913781841935, + "grad_norm": NaN, + "learning_rate": 0.00013350654853592234, + "loss": 0.0, + "step": 35336 + }, + { + "epoch": 3.2972846878790705, + "grad_norm": NaN, + "learning_rate": 0.0001334990306360855, + "loss": 0.0, + "step": 35337 + }, + { + "epoch": 3.297377997573948, + "grad_norm": NaN, + "learning_rate": 0.00013349151277820576, + "loss": 0.0, + "step": 35338 + }, + { + "epoch": 3.2974713072688253, + "grad_norm": NaN, + "learning_rate": 0.00013348399496230227, + "loss": 0.0, + "step": 35339 + }, + { + "epoch": 3.2975646169637027, + "grad_norm": NaN, + "learning_rate": 0.000133476477188394, + "loss": 0.0, + "step": 35340 + }, + { + "epoch": 3.2976579266585797, + "grad_norm": NaN, + "learning_rate": 0.00013346895945650017, + "loss": 0.0, + "step": 35341 + }, + { + "epoch": 3.297751236353457, + "grad_norm": NaN, + "learning_rate": 0.00013346144176663995, + "loss": 0.0, + "step": 35342 + }, + { + "epoch": 3.2978445460483345, + "grad_norm": NaN, + "learning_rate": 0.0001334539241188323, + "loss": 0.0, + "step": 35343 + }, + { + "epoch": 3.2979378557432115, + "grad_norm": NaN, + "learning_rate": 0.00013344640651309644, + "loss": 0.0, + "step": 35344 + }, + { + "epoch": 3.298031165438089, + "grad_norm": NaN, + "learning_rate": 0.00013343888894945156, + "loss": 0.0, + "step": 35345 + }, + { + "epoch": 3.2981244751329664, + "grad_norm": NaN, + "learning_rate": 0.00013343137142791657, + "loss": 0.0, + "step": 35346 + }, + { + "epoch": 3.298217784827844, + "grad_norm": NaN, + "learning_rate": 0.0001334238539485107, + "loss": 0.0, + "step": 35347 + }, + { + "epoch": 3.2983110945227208, + "grad_norm": NaN, + "learning_rate": 0.0001334163365112531, + "loss": 0.0, + "step": 35348 + }, + { + "epoch": 3.298404404217598, + "grad_norm": NaN, + "learning_rate": 0.00013340881911616284, + "loss": 0.0, + "step": 35349 + }, + { + "epoch": 3.2984977139124756, + "grad_norm": NaN, + "learning_rate": 0.000133401301763259, + "loss": 0.0, + "step": 35350 + }, + { + "epoch": 3.2985910236073526, + "grad_norm": NaN, + "learning_rate": 0.00013339378445256079, + "loss": 0.0, + "step": 35351 + }, + { + "epoch": 3.29868433330223, + "grad_norm": NaN, + "learning_rate": 0.00013338626718408723, + "loss": 0.0, + "step": 35352 + }, + { + "epoch": 3.2987776429971074, + "grad_norm": NaN, + "learning_rate": 0.00013337874995785745, + "loss": 0.0, + "step": 35353 + }, + { + "epoch": 3.298870952691985, + "grad_norm": NaN, + "learning_rate": 0.00013337123277389065, + "loss": 0.0, + "step": 35354 + }, + { + "epoch": 3.298964262386862, + "grad_norm": NaN, + "learning_rate": 0.00013336371563220584, + "loss": 0.0, + "step": 35355 + }, + { + "epoch": 3.299057572081739, + "grad_norm": NaN, + "learning_rate": 0.00013335619853282216, + "loss": 0.0, + "step": 35356 + }, + { + "epoch": 3.2991508817766166, + "grad_norm": NaN, + "learning_rate": 0.00013334868147575876, + "loss": 0.0, + "step": 35357 + }, + { + "epoch": 3.299244191471494, + "grad_norm": NaN, + "learning_rate": 0.00013334116446103473, + "loss": 0.0, + "step": 35358 + }, + { + "epoch": 3.299337501166371, + "grad_norm": NaN, + "learning_rate": 0.00013333364748866917, + "loss": 0.0, + "step": 35359 + }, + { + "epoch": 3.2994308108612485, + "grad_norm": NaN, + "learning_rate": 0.00013332613055868123, + "loss": 0.0, + "step": 35360 + }, + { + "epoch": 3.299524120556126, + "grad_norm": NaN, + "learning_rate": 0.00013331861367109, + "loss": 0.0, + "step": 35361 + }, + { + "epoch": 3.2996174302510033, + "grad_norm": NaN, + "learning_rate": 0.00013331109682591453, + "loss": 0.0, + "step": 35362 + }, + { + "epoch": 3.2997107399458803, + "grad_norm": NaN, + "learning_rate": 0.00013330358002317408, + "loss": 0.0, + "step": 35363 + }, + { + "epoch": 3.2998040496407577, + "grad_norm": NaN, + "learning_rate": 0.00013329606326288766, + "loss": 0.0, + "step": 35364 + }, + { + "epoch": 3.299897359335635, + "grad_norm": NaN, + "learning_rate": 0.00013328854654507438, + "loss": 0.0, + "step": 35365 + }, + { + "epoch": 3.299990669030512, + "grad_norm": NaN, + "learning_rate": 0.0001332810298697534, + "loss": 0.0, + "step": 35366 + }, + { + "epoch": 3.3000839787253895, + "grad_norm": NaN, + "learning_rate": 0.0001332735132369438, + "loss": 0.0, + "step": 35367 + }, + { + "epoch": 3.300177288420267, + "grad_norm": NaN, + "learning_rate": 0.00013326599664666467, + "loss": 0.0, + "step": 35368 + }, + { + "epoch": 3.3002705981151443, + "grad_norm": NaN, + "learning_rate": 0.00013325848009893522, + "loss": 0.0, + "step": 35369 + }, + { + "epoch": 3.3003639078100213, + "grad_norm": NaN, + "learning_rate": 0.00013325096359377445, + "loss": 0.0, + "step": 35370 + }, + { + "epoch": 3.3004572175048987, + "grad_norm": NaN, + "learning_rate": 0.0001332434471312015, + "loss": 0.0, + "step": 35371 + }, + { + "epoch": 3.300550527199776, + "grad_norm": NaN, + "learning_rate": 0.00013323593071123556, + "loss": 0.0, + "step": 35372 + }, + { + "epoch": 3.300643836894653, + "grad_norm": NaN, + "learning_rate": 0.00013322841433389564, + "loss": 0.0, + "step": 35373 + }, + { + "epoch": 3.3007371465895305, + "grad_norm": NaN, + "learning_rate": 0.0001332208979992009, + "loss": 0.0, + "step": 35374 + }, + { + "epoch": 3.300830456284408, + "grad_norm": NaN, + "learning_rate": 0.00013321338170717046, + "loss": 0.0, + "step": 35375 + }, + { + "epoch": 3.3009237659792854, + "grad_norm": NaN, + "learning_rate": 0.0001332058654578234, + "loss": 0.0, + "step": 35376 + }, + { + "epoch": 3.3010170756741624, + "grad_norm": NaN, + "learning_rate": 0.00013319834925117887, + "loss": 0.0, + "step": 35377 + }, + { + "epoch": 3.30111038536904, + "grad_norm": NaN, + "learning_rate": 0.0001331908330872559, + "loss": 0.0, + "step": 35378 + }, + { + "epoch": 3.301203695063917, + "grad_norm": NaN, + "learning_rate": 0.0001331833169660737, + "loss": 0.0, + "step": 35379 + }, + { + "epoch": 3.3012970047587946, + "grad_norm": NaN, + "learning_rate": 0.00013317580088765142, + "loss": 0.0, + "step": 35380 + }, + { + "epoch": 3.3013903144536716, + "grad_norm": NaN, + "learning_rate": 0.00013316828485200797, + "loss": 0.0, + "step": 35381 + }, + { + "epoch": 3.301483624148549, + "grad_norm": NaN, + "learning_rate": 0.00013316076885916262, + "loss": 0.0, + "step": 35382 + }, + { + "epoch": 3.3015769338434264, + "grad_norm": NaN, + "learning_rate": 0.00013315325290913453, + "loss": 0.0, + "step": 35383 + }, + { + "epoch": 3.301670243538304, + "grad_norm": NaN, + "learning_rate": 0.00013314573700194264, + "loss": 0.0, + "step": 35384 + }, + { + "epoch": 3.301763553233181, + "grad_norm": NaN, + "learning_rate": 0.00013313822113760615, + "loss": 0.0, + "step": 35385 + }, + { + "epoch": 3.3018568629280582, + "grad_norm": NaN, + "learning_rate": 0.00013313070531614422, + "loss": 0.0, + "step": 35386 + }, + { + "epoch": 3.3019501726229357, + "grad_norm": NaN, + "learning_rate": 0.00013312318953757587, + "loss": 0.0, + "step": 35387 + }, + { + "epoch": 3.3020434823178126, + "grad_norm": NaN, + "learning_rate": 0.00013311567380192024, + "loss": 0.0, + "step": 35388 + }, + { + "epoch": 3.30213679201269, + "grad_norm": NaN, + "learning_rate": 0.0001331081581091965, + "loss": 0.0, + "step": 35389 + }, + { + "epoch": 3.3022301017075675, + "grad_norm": NaN, + "learning_rate": 0.00013310064245942367, + "loss": 0.0, + "step": 35390 + }, + { + "epoch": 3.302323411402445, + "grad_norm": NaN, + "learning_rate": 0.0001330931268526209, + "loss": 0.0, + "step": 35391 + }, + { + "epoch": 3.302416721097322, + "grad_norm": NaN, + "learning_rate": 0.00013308561128880735, + "loss": 0.0, + "step": 35392 + }, + { + "epoch": 3.3025100307921993, + "grad_norm": NaN, + "learning_rate": 0.00013307809576800202, + "loss": 0.0, + "step": 35393 + }, + { + "epoch": 3.3026033404870767, + "grad_norm": NaN, + "learning_rate": 0.00013307058029022407, + "loss": 0.0, + "step": 35394 + }, + { + "epoch": 3.3026966501819537, + "grad_norm": NaN, + "learning_rate": 0.0001330630648554927, + "loss": 0.0, + "step": 35395 + }, + { + "epoch": 3.302789959876831, + "grad_norm": NaN, + "learning_rate": 0.0001330555494638269, + "loss": 0.0, + "step": 35396 + }, + { + "epoch": 3.3028832695717085, + "grad_norm": NaN, + "learning_rate": 0.00013304803411524576, + "loss": 0.0, + "step": 35397 + }, + { + "epoch": 3.302976579266586, + "grad_norm": NaN, + "learning_rate": 0.00013304051880976854, + "loss": 0.0, + "step": 35398 + }, + { + "epoch": 3.303069888961463, + "grad_norm": NaN, + "learning_rate": 0.00013303300354741422, + "loss": 0.0, + "step": 35399 + }, + { + "epoch": 3.3031631986563403, + "grad_norm": NaN, + "learning_rate": 0.0001330254883282019, + "loss": 0.0, + "step": 35400 + }, + { + "epoch": 3.3032565083512178, + "grad_norm": NaN, + "learning_rate": 0.0001330179731521508, + "loss": 0.0, + "step": 35401 + }, + { + "epoch": 3.303349818046095, + "grad_norm": NaN, + "learning_rate": 0.00013301045801927995, + "loss": 0.0, + "step": 35402 + }, + { + "epoch": 3.303443127740972, + "grad_norm": NaN, + "learning_rate": 0.00013300294292960843, + "loss": 0.0, + "step": 35403 + }, + { + "epoch": 3.3035364374358496, + "grad_norm": NaN, + "learning_rate": 0.00013299542788315546, + "loss": 0.0, + "step": 35404 + }, + { + "epoch": 3.303629747130727, + "grad_norm": NaN, + "learning_rate": 0.00013298791287994004, + "loss": 0.0, + "step": 35405 + }, + { + "epoch": 3.3037230568256044, + "grad_norm": NaN, + "learning_rate": 0.00013298039791998127, + "loss": 0.0, + "step": 35406 + }, + { + "epoch": 3.3038163665204814, + "grad_norm": NaN, + "learning_rate": 0.00013297288300329838, + "loss": 0.0, + "step": 35407 + }, + { + "epoch": 3.303909676215359, + "grad_norm": NaN, + "learning_rate": 0.0001329653681299104, + "loss": 0.0, + "step": 35408 + }, + { + "epoch": 3.3040029859102362, + "grad_norm": NaN, + "learning_rate": 0.0001329578532998364, + "loss": 0.0, + "step": 35409 + }, + { + "epoch": 3.304096295605113, + "grad_norm": NaN, + "learning_rate": 0.0001329503385130956, + "loss": 0.0, + "step": 35410 + }, + { + "epoch": 3.3041896052999906, + "grad_norm": NaN, + "learning_rate": 0.00013294282376970698, + "loss": 0.0, + "step": 35411 + }, + { + "epoch": 3.304282914994868, + "grad_norm": NaN, + "learning_rate": 0.00013293530906968967, + "loss": 0.0, + "step": 35412 + }, + { + "epoch": 3.3043762246897455, + "grad_norm": NaN, + "learning_rate": 0.00013292779441306292, + "loss": 0.0, + "step": 35413 + }, + { + "epoch": 3.3044695343846224, + "grad_norm": NaN, + "learning_rate": 0.00013292027979984567, + "loss": 0.0, + "step": 35414 + }, + { + "epoch": 3.3045628440795, + "grad_norm": NaN, + "learning_rate": 0.00013291276523005706, + "loss": 0.0, + "step": 35415 + }, + { + "epoch": 3.3046561537743773, + "grad_norm": NaN, + "learning_rate": 0.00013290525070371628, + "loss": 0.0, + "step": 35416 + }, + { + "epoch": 3.3047494634692542, + "grad_norm": NaN, + "learning_rate": 0.00013289773622084239, + "loss": 0.0, + "step": 35417 + }, + { + "epoch": 3.3048427731641317, + "grad_norm": NaN, + "learning_rate": 0.00013289022178145449, + "loss": 0.0, + "step": 35418 + }, + { + "epoch": 3.304936082859009, + "grad_norm": NaN, + "learning_rate": 0.00013288270738557163, + "loss": 0.0, + "step": 35419 + }, + { + "epoch": 3.3050293925538865, + "grad_norm": NaN, + "learning_rate": 0.00013287519303321297, + "loss": 0.0, + "step": 35420 + }, + { + "epoch": 3.3051227022487635, + "grad_norm": NaN, + "learning_rate": 0.00013286767872439773, + "loss": 0.0, + "step": 35421 + }, + { + "epoch": 3.305216011943641, + "grad_norm": NaN, + "learning_rate": 0.00013286016445914481, + "loss": 0.0, + "step": 35422 + }, + { + "epoch": 3.3053093216385183, + "grad_norm": NaN, + "learning_rate": 0.0001328526502374734, + "loss": 0.0, + "step": 35423 + }, + { + "epoch": 3.3054026313333953, + "grad_norm": NaN, + "learning_rate": 0.0001328451360594027, + "loss": 0.0, + "step": 35424 + }, + { + "epoch": 3.3054959410282727, + "grad_norm": NaN, + "learning_rate": 0.00013283762192495169, + "loss": 0.0, + "step": 35425 + }, + { + "epoch": 3.30558925072315, + "grad_norm": NaN, + "learning_rate": 0.00013283010783413953, + "loss": 0.0, + "step": 35426 + }, + { + "epoch": 3.3056825604180275, + "grad_norm": NaN, + "learning_rate": 0.00013282259378698534, + "loss": 0.0, + "step": 35427 + }, + { + "epoch": 3.305775870112905, + "grad_norm": NaN, + "learning_rate": 0.0001328150797835082, + "loss": 0.0, + "step": 35428 + }, + { + "epoch": 3.305869179807782, + "grad_norm": NaN, + "learning_rate": 0.00013280756582372718, + "loss": 0.0, + "step": 35429 + }, + { + "epoch": 3.3059624895026594, + "grad_norm": NaN, + "learning_rate": 0.0001328000519076615, + "loss": 0.0, + "step": 35430 + }, + { + "epoch": 3.306055799197537, + "grad_norm": NaN, + "learning_rate": 0.00013279253803533014, + "loss": 0.0, + "step": 35431 + }, + { + "epoch": 3.3061491088924138, + "grad_norm": NaN, + "learning_rate": 0.00013278502420675223, + "loss": 0.0, + "step": 35432 + }, + { + "epoch": 3.306242418587291, + "grad_norm": NaN, + "learning_rate": 0.00013277751042194697, + "loss": 0.0, + "step": 35433 + }, + { + "epoch": 3.3063357282821686, + "grad_norm": NaN, + "learning_rate": 0.00013276999668093337, + "loss": 0.0, + "step": 35434 + }, + { + "epoch": 3.306429037977046, + "grad_norm": NaN, + "learning_rate": 0.00013276248298373054, + "loss": 0.0, + "step": 35435 + }, + { + "epoch": 3.306522347671923, + "grad_norm": NaN, + "learning_rate": 0.0001327549693303577, + "loss": 0.0, + "step": 35436 + }, + { + "epoch": 3.3066156573668004, + "grad_norm": NaN, + "learning_rate": 0.00013274745572083377, + "loss": 0.0, + "step": 35437 + }, + { + "epoch": 3.306708967061678, + "grad_norm": NaN, + "learning_rate": 0.00013273994215517795, + "loss": 0.0, + "step": 35438 + }, + { + "epoch": 3.306802276756555, + "grad_norm": NaN, + "learning_rate": 0.0001327324286334094, + "loss": 0.0, + "step": 35439 + }, + { + "epoch": 3.3068955864514322, + "grad_norm": NaN, + "learning_rate": 0.00013272491515554715, + "loss": 0.0, + "step": 35440 + }, + { + "epoch": 3.3069888961463096, + "grad_norm": NaN, + "learning_rate": 0.0001327174017216103, + "loss": 0.0, + "step": 35441 + }, + { + "epoch": 3.307082205841187, + "grad_norm": NaN, + "learning_rate": 0.00013270988833161802, + "loss": 0.0, + "step": 35442 + }, + { + "epoch": 3.307175515536064, + "grad_norm": NaN, + "learning_rate": 0.00013270237498558935, + "loss": 0.0, + "step": 35443 + }, + { + "epoch": 3.3072688252309415, + "grad_norm": NaN, + "learning_rate": 0.00013269486168354338, + "loss": 0.0, + "step": 35444 + }, + { + "epoch": 3.307362134925819, + "grad_norm": NaN, + "learning_rate": 0.00013268734842549932, + "loss": 0.0, + "step": 35445 + }, + { + "epoch": 3.307455444620696, + "grad_norm": NaN, + "learning_rate": 0.00013267983521147617, + "loss": 0.0, + "step": 35446 + }, + { + "epoch": 3.3075487543155733, + "grad_norm": NaN, + "learning_rate": 0.00013267232204149305, + "loss": 0.0, + "step": 35447 + }, + { + "epoch": 3.3076420640104507, + "grad_norm": NaN, + "learning_rate": 0.00013266480891556913, + "loss": 0.0, + "step": 35448 + }, + { + "epoch": 3.307735373705328, + "grad_norm": NaN, + "learning_rate": 0.00013265729583372343, + "loss": 0.0, + "step": 35449 + }, + { + "epoch": 3.3078286834002055, + "grad_norm": NaN, + "learning_rate": 0.00013264978279597508, + "loss": 0.0, + "step": 35450 + }, + { + "epoch": 3.3079219930950825, + "grad_norm": NaN, + "learning_rate": 0.00013264226980234324, + "loss": 0.0, + "step": 35451 + }, + { + "epoch": 3.30801530278996, + "grad_norm": NaN, + "learning_rate": 0.00013263475685284693, + "loss": 0.0, + "step": 35452 + }, + { + "epoch": 3.3081086124848373, + "grad_norm": NaN, + "learning_rate": 0.00013262724394750525, + "loss": 0.0, + "step": 35453 + }, + { + "epoch": 3.3082019221797143, + "grad_norm": NaN, + "learning_rate": 0.00013261973108633744, + "loss": 0.0, + "step": 35454 + }, + { + "epoch": 3.3082952318745917, + "grad_norm": NaN, + "learning_rate": 0.00013261221826936243, + "loss": 0.0, + "step": 35455 + }, + { + "epoch": 3.308388541569469, + "grad_norm": NaN, + "learning_rate": 0.0001326047054965994, + "loss": 0.0, + "step": 35456 + }, + { + "epoch": 3.3084818512643466, + "grad_norm": NaN, + "learning_rate": 0.0001325971927680675, + "loss": 0.0, + "step": 35457 + }, + { + "epoch": 3.3085751609592235, + "grad_norm": NaN, + "learning_rate": 0.00013258968008378571, + "loss": 0.0, + "step": 35458 + }, + { + "epoch": 3.308668470654101, + "grad_norm": NaN, + "learning_rate": 0.00013258216744377327, + "loss": 0.0, + "step": 35459 + }, + { + "epoch": 3.3087617803489784, + "grad_norm": NaN, + "learning_rate": 0.00013257465484804923, + "loss": 0.0, + "step": 35460 + }, + { + "epoch": 3.3088550900438554, + "grad_norm": NaN, + "learning_rate": 0.0001325671422966326, + "loss": 0.0, + "step": 35461 + }, + { + "epoch": 3.308948399738733, + "grad_norm": NaN, + "learning_rate": 0.00013255962978954265, + "loss": 0.0, + "step": 35462 + }, + { + "epoch": 3.30904170943361, + "grad_norm": NaN, + "learning_rate": 0.00013255211732679837, + "loss": 0.0, + "step": 35463 + }, + { + "epoch": 3.3091350191284876, + "grad_norm": NaN, + "learning_rate": 0.00013254460490841886, + "loss": 0.0, + "step": 35464 + }, + { + "epoch": 3.3092283288233646, + "grad_norm": NaN, + "learning_rate": 0.00013253709253442327, + "loss": 0.0, + "step": 35465 + }, + { + "epoch": 3.309321638518242, + "grad_norm": NaN, + "learning_rate": 0.00013252958020483068, + "loss": 0.0, + "step": 35466 + }, + { + "epoch": 3.3094149482131194, + "grad_norm": NaN, + "learning_rate": 0.00013252206791966017, + "loss": 0.0, + "step": 35467 + }, + { + "epoch": 3.3095082579079964, + "grad_norm": NaN, + "learning_rate": 0.0001325145556789309, + "loss": 0.0, + "step": 35468 + }, + { + "epoch": 3.309601567602874, + "grad_norm": NaN, + "learning_rate": 0.00013250704348266194, + "loss": 0.0, + "step": 35469 + }, + { + "epoch": 3.3096948772977512, + "grad_norm": NaN, + "learning_rate": 0.00013249953133087232, + "loss": 0.0, + "step": 35470 + }, + { + "epoch": 3.3097881869926287, + "grad_norm": NaN, + "learning_rate": 0.00013249201922358128, + "loss": 0.0, + "step": 35471 + }, + { + "epoch": 3.3098814966875056, + "grad_norm": NaN, + "learning_rate": 0.00013248450716080783, + "loss": 0.0, + "step": 35472 + }, + { + "epoch": 3.309974806382383, + "grad_norm": NaN, + "learning_rate": 0.00013247699514257105, + "loss": 0.0, + "step": 35473 + }, + { + "epoch": 3.3100681160772605, + "grad_norm": NaN, + "learning_rate": 0.00013246948316889015, + "loss": 0.0, + "step": 35474 + }, + { + "epoch": 3.310161425772138, + "grad_norm": NaN, + "learning_rate": 0.00013246197123978413, + "loss": 0.0, + "step": 35475 + }, + { + "epoch": 3.310254735467015, + "grad_norm": NaN, + "learning_rate": 0.00013245445935527206, + "loss": 0.0, + "step": 35476 + }, + { + "epoch": 3.3103480451618923, + "grad_norm": NaN, + "learning_rate": 0.00013244694751537322, + "loss": 0.0, + "step": 35477 + }, + { + "epoch": 3.3104413548567697, + "grad_norm": NaN, + "learning_rate": 0.00013243943572010652, + "loss": 0.0, + "step": 35478 + }, + { + "epoch": 3.310534664551647, + "grad_norm": NaN, + "learning_rate": 0.0001324319239694911, + "loss": 0.0, + "step": 35479 + }, + { + "epoch": 3.310627974246524, + "grad_norm": NaN, + "learning_rate": 0.00013242441226354618, + "loss": 0.0, + "step": 35480 + }, + { + "epoch": 3.3107212839414015, + "grad_norm": NaN, + "learning_rate": 0.00013241690060229071, + "loss": 0.0, + "step": 35481 + }, + { + "epoch": 3.310814593636279, + "grad_norm": NaN, + "learning_rate": 0.00013240938898574387, + "loss": 0.0, + "step": 35482 + }, + { + "epoch": 3.310907903331156, + "grad_norm": NaN, + "learning_rate": 0.00013240187741392478, + "loss": 0.0, + "step": 35483 + }, + { + "epoch": 3.3110012130260333, + "grad_norm": NaN, + "learning_rate": 0.00013239436588685248, + "loss": 0.0, + "step": 35484 + }, + { + "epoch": 3.3110945227209108, + "grad_norm": NaN, + "learning_rate": 0.00013238685440454604, + "loss": 0.0, + "step": 35485 + }, + { + "epoch": 3.311187832415788, + "grad_norm": NaN, + "learning_rate": 0.0001323793429670247, + "loss": 0.0, + "step": 35486 + }, + { + "epoch": 3.311281142110665, + "grad_norm": NaN, + "learning_rate": 0.0001323718315743074, + "loss": 0.0, + "step": 35487 + }, + { + "epoch": 3.3113744518055426, + "grad_norm": NaN, + "learning_rate": 0.0001323643202264133, + "loss": 0.0, + "step": 35488 + }, + { + "epoch": 3.31146776150042, + "grad_norm": NaN, + "learning_rate": 0.0001323568089233616, + "loss": 0.0, + "step": 35489 + }, + { + "epoch": 3.311561071195297, + "grad_norm": NaN, + "learning_rate": 0.00013234929766517124, + "loss": 0.0, + "step": 35490 + }, + { + "epoch": 3.3116543808901744, + "grad_norm": NaN, + "learning_rate": 0.00013234178645186138, + "loss": 0.0, + "step": 35491 + }, + { + "epoch": 3.311747690585052, + "grad_norm": NaN, + "learning_rate": 0.00013233427528345116, + "loss": 0.0, + "step": 35492 + }, + { + "epoch": 3.3118410002799292, + "grad_norm": NaN, + "learning_rate": 0.00013232676415995961, + "loss": 0.0, + "step": 35493 + }, + { + "epoch": 3.311934309974806, + "grad_norm": NaN, + "learning_rate": 0.0001323192530814059, + "loss": 0.0, + "step": 35494 + }, + { + "epoch": 3.3120276196696836, + "grad_norm": NaN, + "learning_rate": 0.0001323117420478091, + "loss": 0.0, + "step": 35495 + }, + { + "epoch": 3.312120929364561, + "grad_norm": NaN, + "learning_rate": 0.00013230423105918825, + "loss": 0.0, + "step": 35496 + }, + { + "epoch": 3.3122142390594385, + "grad_norm": NaN, + "learning_rate": 0.00013229672011556254, + "loss": 0.0, + "step": 35497 + }, + { + "epoch": 3.3123075487543154, + "grad_norm": NaN, + "learning_rate": 0.00013228920921695104, + "loss": 0.0, + "step": 35498 + }, + { + "epoch": 3.312400858449193, + "grad_norm": NaN, + "learning_rate": 0.00013228169836337276, + "loss": 0.0, + "step": 35499 + }, + { + "epoch": 3.3124941681440703, + "grad_norm": NaN, + "learning_rate": 0.0001322741875548469, + "loss": 0.0, + "step": 35500 + }, + { + "epoch": 3.3125874778389477, + "grad_norm": NaN, + "learning_rate": 0.00013226667679139262, + "loss": 0.0, + "step": 35501 + }, + { + "epoch": 3.3126807875338247, + "grad_norm": NaN, + "learning_rate": 0.0001322591660730288, + "loss": 0.0, + "step": 35502 + }, + { + "epoch": 3.312774097228702, + "grad_norm": NaN, + "learning_rate": 0.00013225165539977474, + "loss": 0.0, + "step": 35503 + }, + { + "epoch": 3.3128674069235795, + "grad_norm": NaN, + "learning_rate": 0.00013224414477164948, + "loss": 0.0, + "step": 35504 + }, + { + "epoch": 3.3129607166184565, + "grad_norm": NaN, + "learning_rate": 0.00013223663418867202, + "loss": 0.0, + "step": 35505 + }, + { + "epoch": 3.313054026313334, + "grad_norm": NaN, + "learning_rate": 0.00013222912365086161, + "loss": 0.0, + "step": 35506 + }, + { + "epoch": 3.3131473360082113, + "grad_norm": NaN, + "learning_rate": 0.00013222161315823728, + "loss": 0.0, + "step": 35507 + }, + { + "epoch": 3.3132406457030887, + "grad_norm": NaN, + "learning_rate": 0.00013221410271081806, + "loss": 0.0, + "step": 35508 + }, + { + "epoch": 3.3133339553979657, + "grad_norm": NaN, + "learning_rate": 0.00013220659230862318, + "loss": 0.0, + "step": 35509 + }, + { + "epoch": 3.313427265092843, + "grad_norm": NaN, + "learning_rate": 0.0001321990819516716, + "loss": 0.0, + "step": 35510 + }, + { + "epoch": 3.3135205747877206, + "grad_norm": NaN, + "learning_rate": 0.00013219157163998248, + "loss": 0.0, + "step": 35511 + }, + { + "epoch": 3.3136138844825975, + "grad_norm": NaN, + "learning_rate": 0.00013218406137357498, + "loss": 0.0, + "step": 35512 + }, + { + "epoch": 3.313707194177475, + "grad_norm": NaN, + "learning_rate": 0.00013217655115246808, + "loss": 0.0, + "step": 35513 + }, + { + "epoch": 3.3138005038723524, + "grad_norm": NaN, + "learning_rate": 0.00013216904097668094, + "loss": 0.0, + "step": 35514 + }, + { + "epoch": 3.31389381356723, + "grad_norm": NaN, + "learning_rate": 0.0001321615308462327, + "loss": 0.0, + "step": 35515 + }, + { + "epoch": 3.3139871232621068, + "grad_norm": NaN, + "learning_rate": 0.00013215402076114234, + "loss": 0.0, + "step": 35516 + }, + { + "epoch": 3.314080432956984, + "grad_norm": NaN, + "learning_rate": 0.00013214651072142902, + "loss": 0.0, + "step": 35517 + }, + { + "epoch": 3.3141737426518616, + "grad_norm": NaN, + "learning_rate": 0.00013213900072711187, + "loss": 0.0, + "step": 35518 + }, + { + "epoch": 3.314267052346739, + "grad_norm": NaN, + "learning_rate": 0.00013213149077820994, + "loss": 0.0, + "step": 35519 + }, + { + "epoch": 3.314360362041616, + "grad_norm": NaN, + "learning_rate": 0.0001321239808747423, + "loss": 0.0, + "step": 35520 + }, + { + "epoch": 3.3144536717364934, + "grad_norm": NaN, + "learning_rate": 0.00013211647101672814, + "loss": 0.0, + "step": 35521 + }, + { + "epoch": 3.314546981431371, + "grad_norm": NaN, + "learning_rate": 0.00013210896120418645, + "loss": 0.0, + "step": 35522 + }, + { + "epoch": 3.3146402911262483, + "grad_norm": NaN, + "learning_rate": 0.00013210145143713634, + "loss": 0.0, + "step": 35523 + }, + { + "epoch": 3.3147336008211252, + "grad_norm": NaN, + "learning_rate": 0.00013209394171559706, + "loss": 0.0, + "step": 35524 + }, + { + "epoch": 3.3148269105160026, + "grad_norm": NaN, + "learning_rate": 0.0001320864320395875, + "loss": 0.0, + "step": 35525 + }, + { + "epoch": 3.31492022021088, + "grad_norm": NaN, + "learning_rate": 0.0001320789224091268, + "loss": 0.0, + "step": 35526 + }, + { + "epoch": 3.315013529905757, + "grad_norm": NaN, + "learning_rate": 0.00013207141282423418, + "loss": 0.0, + "step": 35527 + }, + { + "epoch": 3.3151068396006345, + "grad_norm": NaN, + "learning_rate": 0.0001320639032849286, + "loss": 0.0, + "step": 35528 + }, + { + "epoch": 3.315200149295512, + "grad_norm": NaN, + "learning_rate": 0.00013205639379122917, + "loss": 0.0, + "step": 35529 + }, + { + "epoch": 3.3152934589903893, + "grad_norm": NaN, + "learning_rate": 0.00013204888434315508, + "loss": 0.0, + "step": 35530 + }, + { + "epoch": 3.3153867686852663, + "grad_norm": NaN, + "learning_rate": 0.00013204137494072528, + "loss": 0.0, + "step": 35531 + }, + { + "epoch": 3.3154800783801437, + "grad_norm": NaN, + "learning_rate": 0.000132033865583959, + "loss": 0.0, + "step": 35532 + }, + { + "epoch": 3.315573388075021, + "grad_norm": NaN, + "learning_rate": 0.0001320263562728753, + "loss": 0.0, + "step": 35533 + }, + { + "epoch": 3.315666697769898, + "grad_norm": NaN, + "learning_rate": 0.0001320188470074932, + "loss": 0.0, + "step": 35534 + }, + { + "epoch": 3.3157600074647755, + "grad_norm": NaN, + "learning_rate": 0.00013201133778783188, + "loss": 0.0, + "step": 35535 + }, + { + "epoch": 3.315853317159653, + "grad_norm": NaN, + "learning_rate": 0.0001320038286139104, + "loss": 0.0, + "step": 35536 + }, + { + "epoch": 3.3159466268545303, + "grad_norm": NaN, + "learning_rate": 0.0001319963194857478, + "loss": 0.0, + "step": 35537 + }, + { + "epoch": 3.3160399365494073, + "grad_norm": NaN, + "learning_rate": 0.00013198881040336327, + "loss": 0.0, + "step": 35538 + }, + { + "epoch": 3.3161332462442847, + "grad_norm": NaN, + "learning_rate": 0.0001319813013667759, + "loss": 0.0, + "step": 35539 + }, + { + "epoch": 3.316226555939162, + "grad_norm": NaN, + "learning_rate": 0.00013197379237600465, + "loss": 0.0, + "step": 35540 + }, + { + "epoch": 3.316319865634039, + "grad_norm": NaN, + "learning_rate": 0.00013196628343106875, + "loss": 0.0, + "step": 35541 + }, + { + "epoch": 3.3164131753289166, + "grad_norm": NaN, + "learning_rate": 0.00013195877453198732, + "loss": 0.0, + "step": 35542 + }, + { + "epoch": 3.316506485023794, + "grad_norm": NaN, + "learning_rate": 0.00013195126567877927, + "loss": 0.0, + "step": 35543 + }, + { + "epoch": 3.3165997947186714, + "grad_norm": NaN, + "learning_rate": 0.00013194375687146384, + "loss": 0.0, + "step": 35544 + }, + { + "epoch": 3.316693104413549, + "grad_norm": NaN, + "learning_rate": 0.00013193624811006014, + "loss": 0.0, + "step": 35545 + }, + { + "epoch": 3.316786414108426, + "grad_norm": NaN, + "learning_rate": 0.00013192873939458713, + "loss": 0.0, + "step": 35546 + }, + { + "epoch": 3.316879723803303, + "grad_norm": NaN, + "learning_rate": 0.000131921230725064, + "loss": 0.0, + "step": 35547 + }, + { + "epoch": 3.3169730334981806, + "grad_norm": NaN, + "learning_rate": 0.00013191372210150993, + "loss": 0.0, + "step": 35548 + }, + { + "epoch": 3.3170663431930576, + "grad_norm": NaN, + "learning_rate": 0.00013190621352394377, + "loss": 0.0, + "step": 35549 + }, + { + "epoch": 3.317159652887935, + "grad_norm": NaN, + "learning_rate": 0.0001318987049923848, + "loss": 0.0, + "step": 35550 + }, + { + "epoch": 3.3172529625828124, + "grad_norm": NaN, + "learning_rate": 0.0001318911965068521, + "loss": 0.0, + "step": 35551 + }, + { + "epoch": 3.31734627227769, + "grad_norm": NaN, + "learning_rate": 0.00013188368806736465, + "loss": 0.0, + "step": 35552 + }, + { + "epoch": 3.317439581972567, + "grad_norm": NaN, + "learning_rate": 0.0001318761796739417, + "loss": 0.0, + "step": 35553 + }, + { + "epoch": 3.3175328916674443, + "grad_norm": NaN, + "learning_rate": 0.0001318686713266022, + "loss": 0.0, + "step": 35554 + }, + { + "epoch": 3.3176262013623217, + "grad_norm": NaN, + "learning_rate": 0.00013186116302536528, + "loss": 0.0, + "step": 35555 + }, + { + "epoch": 3.3177195110571986, + "grad_norm": NaN, + "learning_rate": 0.00013185365477025012, + "loss": 0.0, + "step": 35556 + }, + { + "epoch": 3.317812820752076, + "grad_norm": NaN, + "learning_rate": 0.00013184614656127567, + "loss": 0.0, + "step": 35557 + }, + { + "epoch": 3.3179061304469535, + "grad_norm": NaN, + "learning_rate": 0.0001318386383984611, + "loss": 0.0, + "step": 35558 + }, + { + "epoch": 3.317999440141831, + "grad_norm": NaN, + "learning_rate": 0.00013183113028182554, + "loss": 0.0, + "step": 35559 + }, + { + "epoch": 3.318092749836708, + "grad_norm": NaN, + "learning_rate": 0.000131823622211388, + "loss": 0.0, + "step": 35560 + }, + { + "epoch": 3.3181860595315853, + "grad_norm": NaN, + "learning_rate": 0.00013181611418716757, + "loss": 0.0, + "step": 35561 + }, + { + "epoch": 3.3182793692264627, + "grad_norm": NaN, + "learning_rate": 0.00013180860620918346, + "loss": 0.0, + "step": 35562 + }, + { + "epoch": 3.3183726789213397, + "grad_norm": NaN, + "learning_rate": 0.00013180109827745464, + "loss": 0.0, + "step": 35563 + }, + { + "epoch": 3.318465988616217, + "grad_norm": NaN, + "learning_rate": 0.00013179359039200018, + "loss": 0.0, + "step": 35564 + }, + { + "epoch": 3.3185592983110945, + "grad_norm": NaN, + "learning_rate": 0.00013178608255283932, + "loss": 0.0, + "step": 35565 + }, + { + "epoch": 3.318652608005972, + "grad_norm": NaN, + "learning_rate": 0.000131778574759991, + "loss": 0.0, + "step": 35566 + }, + { + "epoch": 3.318745917700849, + "grad_norm": NaN, + "learning_rate": 0.00013177106701347434, + "loss": 0.0, + "step": 35567 + }, + { + "epoch": 3.3188392273957263, + "grad_norm": NaN, + "learning_rate": 0.00013176355931330852, + "loss": 0.0, + "step": 35568 + }, + { + "epoch": 3.3189325370906038, + "grad_norm": NaN, + "learning_rate": 0.00013175605165951251, + "loss": 0.0, + "step": 35569 + }, + { + "epoch": 3.319025846785481, + "grad_norm": NaN, + "learning_rate": 0.0001317485440521055, + "loss": 0.0, + "step": 35570 + }, + { + "epoch": 3.319119156480358, + "grad_norm": NaN, + "learning_rate": 0.00013174103649110655, + "loss": 0.0, + "step": 35571 + }, + { + "epoch": 3.3192124661752356, + "grad_norm": NaN, + "learning_rate": 0.00013173352897653467, + "loss": 0.0, + "step": 35572 + }, + { + "epoch": 3.319305775870113, + "grad_norm": NaN, + "learning_rate": 0.00013172602150840907, + "loss": 0.0, + "step": 35573 + }, + { + "epoch": 3.3193990855649904, + "grad_norm": NaN, + "learning_rate": 0.0001317185140867488, + "loss": 0.0, + "step": 35574 + }, + { + "epoch": 3.3194923952598674, + "grad_norm": NaN, + "learning_rate": 0.00013171100671157285, + "loss": 0.0, + "step": 35575 + }, + { + "epoch": 3.319585704954745, + "grad_norm": NaN, + "learning_rate": 0.00013170349938290045, + "loss": 0.0, + "step": 35576 + }, + { + "epoch": 3.3196790146496222, + "grad_norm": NaN, + "learning_rate": 0.00013169599210075065, + "loss": 0.0, + "step": 35577 + }, + { + "epoch": 3.319772324344499, + "grad_norm": NaN, + "learning_rate": 0.00013168848486514245, + "loss": 0.0, + "step": 35578 + }, + { + "epoch": 3.3198656340393766, + "grad_norm": NaN, + "learning_rate": 0.00013168097767609507, + "loss": 0.0, + "step": 35579 + }, + { + "epoch": 3.319958943734254, + "grad_norm": NaN, + "learning_rate": 0.00013167347053362756, + "loss": 0.0, + "step": 35580 + }, + { + "epoch": 3.3200522534291315, + "grad_norm": NaN, + "learning_rate": 0.00013166596343775893, + "loss": 0.0, + "step": 35581 + }, + { + "epoch": 3.3201455631240084, + "grad_norm": NaN, + "learning_rate": 0.0001316584563885083, + "loss": 0.0, + "step": 35582 + }, + { + "epoch": 3.320238872818886, + "grad_norm": NaN, + "learning_rate": 0.00013165094938589488, + "loss": 0.0, + "step": 35583 + }, + { + "epoch": 3.3203321825137633, + "grad_norm": NaN, + "learning_rate": 0.00013164344242993755, + "loss": 0.0, + "step": 35584 + }, + { + "epoch": 3.3204254922086403, + "grad_norm": NaN, + "learning_rate": 0.00013163593552065558, + "loss": 0.0, + "step": 35585 + }, + { + "epoch": 3.3205188019035177, + "grad_norm": NaN, + "learning_rate": 0.00013162842865806798, + "loss": 0.0, + "step": 35586 + }, + { + "epoch": 3.320612111598395, + "grad_norm": NaN, + "learning_rate": 0.00013162092184219382, + "loss": 0.0, + "step": 35587 + }, + { + "epoch": 3.3207054212932725, + "grad_norm": NaN, + "learning_rate": 0.00013161341507305222, + "loss": 0.0, + "step": 35588 + }, + { + "epoch": 3.3207987309881495, + "grad_norm": NaN, + "learning_rate": 0.0001316059083506623, + "loss": 0.0, + "step": 35589 + }, + { + "epoch": 3.320892040683027, + "grad_norm": NaN, + "learning_rate": 0.00013159840167504302, + "loss": 0.0, + "step": 35590 + }, + { + "epoch": 3.3209853503779043, + "grad_norm": NaN, + "learning_rate": 0.00013159089504621358, + "loss": 0.0, + "step": 35591 + }, + { + "epoch": 3.3210786600727817, + "grad_norm": NaN, + "learning_rate": 0.0001315833884641931, + "loss": 0.0, + "step": 35592 + }, + { + "epoch": 3.3211719697676587, + "grad_norm": NaN, + "learning_rate": 0.00013157588192900055, + "loss": 0.0, + "step": 35593 + }, + { + "epoch": 3.321265279462536, + "grad_norm": NaN, + "learning_rate": 0.00013156837544065508, + "loss": 0.0, + "step": 35594 + }, + { + "epoch": 3.3213585891574136, + "grad_norm": NaN, + "learning_rate": 0.00013156086899917583, + "loss": 0.0, + "step": 35595 + }, + { + "epoch": 3.321451898852291, + "grad_norm": NaN, + "learning_rate": 0.00013155336260458174, + "loss": 0.0, + "step": 35596 + }, + { + "epoch": 3.321545208547168, + "grad_norm": NaN, + "learning_rate": 0.00013154585625689205, + "loss": 0.0, + "step": 35597 + }, + { + "epoch": 3.3216385182420454, + "grad_norm": NaN, + "learning_rate": 0.0001315383499561258, + "loss": 0.0, + "step": 35598 + }, + { + "epoch": 3.321731827936923, + "grad_norm": NaN, + "learning_rate": 0.00013153084370230196, + "loss": 0.0, + "step": 35599 + }, + { + "epoch": 3.3218251376317998, + "grad_norm": NaN, + "learning_rate": 0.0001315233374954398, + "loss": 0.0, + "step": 35600 + }, + { + "epoch": 3.321918447326677, + "grad_norm": NaN, + "learning_rate": 0.0001315158313355583, + "loss": 0.0, + "step": 35601 + }, + { + "epoch": 3.3220117570215546, + "grad_norm": NaN, + "learning_rate": 0.00013150832522267652, + "loss": 0.0, + "step": 35602 + }, + { + "epoch": 3.322105066716432, + "grad_norm": NaN, + "learning_rate": 0.00013150081915681367, + "loss": 0.0, + "step": 35603 + }, + { + "epoch": 3.322198376411309, + "grad_norm": NaN, + "learning_rate": 0.0001314933131379887, + "loss": 0.0, + "step": 35604 + }, + { + "epoch": 3.3222916861061864, + "grad_norm": NaN, + "learning_rate": 0.00013148580716622072, + "loss": 0.0, + "step": 35605 + }, + { + "epoch": 3.322384995801064, + "grad_norm": NaN, + "learning_rate": 0.00013147830124152893, + "loss": 0.0, + "step": 35606 + }, + { + "epoch": 3.322478305495941, + "grad_norm": NaN, + "learning_rate": 0.00013147079536393228, + "loss": 0.0, + "step": 35607 + }, + { + "epoch": 3.3225716151908182, + "grad_norm": NaN, + "learning_rate": 0.0001314632895334499, + "loss": 0.0, + "step": 35608 + }, + { + "epoch": 3.3226649248856956, + "grad_norm": NaN, + "learning_rate": 0.00013145578375010095, + "loss": 0.0, + "step": 35609 + }, + { + "epoch": 3.322758234580573, + "grad_norm": NaN, + "learning_rate": 0.0001314482780139044, + "loss": 0.0, + "step": 35610 + }, + { + "epoch": 3.32285154427545, + "grad_norm": NaN, + "learning_rate": 0.00013144077232487937, + "loss": 0.0, + "step": 35611 + }, + { + "epoch": 3.3229448539703275, + "grad_norm": NaN, + "learning_rate": 0.00013143326668304503, + "loss": 0.0, + "step": 35612 + }, + { + "epoch": 3.323038163665205, + "grad_norm": NaN, + "learning_rate": 0.0001314257610884203, + "loss": 0.0, + "step": 35613 + }, + { + "epoch": 3.3231314733600823, + "grad_norm": NaN, + "learning_rate": 0.0001314182555410244, + "loss": 0.0, + "step": 35614 + }, + { + "epoch": 3.3232247830549593, + "grad_norm": NaN, + "learning_rate": 0.0001314107500408764, + "loss": 0.0, + "step": 35615 + }, + { + "epoch": 3.3233180927498367, + "grad_norm": NaN, + "learning_rate": 0.0001314032445879953, + "loss": 0.0, + "step": 35616 + }, + { + "epoch": 3.323411402444714, + "grad_norm": NaN, + "learning_rate": 0.0001313957391824003, + "loss": 0.0, + "step": 35617 + }, + { + "epoch": 3.3235047121395915, + "grad_norm": NaN, + "learning_rate": 0.0001313882338241104, + "loss": 0.0, + "step": 35618 + }, + { + "epoch": 3.3235980218344685, + "grad_norm": NaN, + "learning_rate": 0.00013138072851314467, + "loss": 0.0, + "step": 35619 + }, + { + "epoch": 3.323691331529346, + "grad_norm": NaN, + "learning_rate": 0.00013137322324952225, + "loss": 0.0, + "step": 35620 + }, + { + "epoch": 3.3237846412242233, + "grad_norm": NaN, + "learning_rate": 0.00013136571803326226, + "loss": 0.0, + "step": 35621 + }, + { + "epoch": 3.3238779509191003, + "grad_norm": NaN, + "learning_rate": 0.00013135821286438366, + "loss": 0.0, + "step": 35622 + }, + { + "epoch": 3.3239712606139777, + "grad_norm": NaN, + "learning_rate": 0.00013135070774290562, + "loss": 0.0, + "step": 35623 + }, + { + "epoch": 3.324064570308855, + "grad_norm": NaN, + "learning_rate": 0.00013134320266884725, + "loss": 0.0, + "step": 35624 + }, + { + "epoch": 3.3241578800037326, + "grad_norm": NaN, + "learning_rate": 0.00013133569764222755, + "loss": 0.0, + "step": 35625 + }, + { + "epoch": 3.3242511896986096, + "grad_norm": NaN, + "learning_rate": 0.00013132819266306562, + "loss": 0.0, + "step": 35626 + }, + { + "epoch": 3.324344499393487, + "grad_norm": NaN, + "learning_rate": 0.00013132068773138062, + "loss": 0.0, + "step": 35627 + }, + { + "epoch": 3.3244378090883644, + "grad_norm": NaN, + "learning_rate": 0.00013131318284719152, + "loss": 0.0, + "step": 35628 + }, + { + "epoch": 3.3245311187832414, + "grad_norm": NaN, + "learning_rate": 0.00013130567801051748, + "loss": 0.0, + "step": 35629 + }, + { + "epoch": 3.324624428478119, + "grad_norm": NaN, + "learning_rate": 0.00013129817322137763, + "loss": 0.0, + "step": 35630 + }, + { + "epoch": 3.324717738172996, + "grad_norm": NaN, + "learning_rate": 0.0001312906684797909, + "loss": 0.0, + "step": 35631 + }, + { + "epoch": 3.3248110478678736, + "grad_norm": NaN, + "learning_rate": 0.00013128316378577647, + "loss": 0.0, + "step": 35632 + }, + { + "epoch": 3.3249043575627506, + "grad_norm": NaN, + "learning_rate": 0.00013127565913935346, + "loss": 0.0, + "step": 35633 + }, + { + "epoch": 3.324997667257628, + "grad_norm": NaN, + "learning_rate": 0.00013126815454054085, + "loss": 0.0, + "step": 35634 + }, + { + "epoch": 3.3250909769525054, + "grad_norm": NaN, + "learning_rate": 0.0001312606499893578, + "loss": 0.0, + "step": 35635 + }, + { + "epoch": 3.3251842866473824, + "grad_norm": NaN, + "learning_rate": 0.0001312531454858234, + "loss": 0.0, + "step": 35636 + }, + { + "epoch": 3.32527759634226, + "grad_norm": NaN, + "learning_rate": 0.0001312456410299566, + "loss": 0.0, + "step": 35637 + }, + { + "epoch": 3.3253709060371373, + "grad_norm": NaN, + "learning_rate": 0.00013123813662177666, + "loss": 0.0, + "step": 35638 + }, + { + "epoch": 3.3254642157320147, + "grad_norm": NaN, + "learning_rate": 0.0001312306322613026, + "loss": 0.0, + "step": 35639 + }, + { + "epoch": 3.325557525426892, + "grad_norm": NaN, + "learning_rate": 0.0001312231279485534, + "loss": 0.0, + "step": 35640 + }, + { + "epoch": 3.325650835121769, + "grad_norm": NaN, + "learning_rate": 0.00013121562368354823, + "loss": 0.0, + "step": 35641 + }, + { + "epoch": 3.3257441448166465, + "grad_norm": NaN, + "learning_rate": 0.00013120811946630627, + "loss": 0.0, + "step": 35642 + }, + { + "epoch": 3.325837454511524, + "grad_norm": NaN, + "learning_rate": 0.0001312006152968464, + "loss": 0.0, + "step": 35643 + }, + { + "epoch": 3.325930764206401, + "grad_norm": NaN, + "learning_rate": 0.00013119311117518783, + "loss": 0.0, + "step": 35644 + }, + { + "epoch": 3.3260240739012783, + "grad_norm": NaN, + "learning_rate": 0.00013118560710134958, + "loss": 0.0, + "step": 35645 + }, + { + "epoch": 3.3261173835961557, + "grad_norm": NaN, + "learning_rate": 0.00013117810307535077, + "loss": 0.0, + "step": 35646 + }, + { + "epoch": 3.326210693291033, + "grad_norm": NaN, + "learning_rate": 0.0001311705990972105, + "loss": 0.0, + "step": 35647 + }, + { + "epoch": 3.32630400298591, + "grad_norm": NaN, + "learning_rate": 0.00013116309516694779, + "loss": 0.0, + "step": 35648 + }, + { + "epoch": 3.3263973126807875, + "grad_norm": NaN, + "learning_rate": 0.00013115559128458172, + "loss": 0.0, + "step": 35649 + }, + { + "epoch": 3.326490622375665, + "grad_norm": NaN, + "learning_rate": 0.00013114808745013149, + "loss": 0.0, + "step": 35650 + }, + { + "epoch": 3.326583932070542, + "grad_norm": NaN, + "learning_rate": 0.00013114058366361597, + "loss": 0.0, + "step": 35651 + }, + { + "epoch": 3.3266772417654193, + "grad_norm": NaN, + "learning_rate": 0.00013113307992505444, + "loss": 0.0, + "step": 35652 + }, + { + "epoch": 3.3267705514602968, + "grad_norm": NaN, + "learning_rate": 0.0001311255762344659, + "loss": 0.0, + "step": 35653 + }, + { + "epoch": 3.326863861155174, + "grad_norm": NaN, + "learning_rate": 0.00013111807259186937, + "loss": 0.0, + "step": 35654 + }, + { + "epoch": 3.326957170850051, + "grad_norm": NaN, + "learning_rate": 0.00013111056899728403, + "loss": 0.0, + "step": 35655 + }, + { + "epoch": 3.3270504805449286, + "grad_norm": NaN, + "learning_rate": 0.00013110306545072894, + "loss": 0.0, + "step": 35656 + }, + { + "epoch": 3.327143790239806, + "grad_norm": NaN, + "learning_rate": 0.0001310955619522231, + "loss": 0.0, + "step": 35657 + }, + { + "epoch": 3.327237099934683, + "grad_norm": NaN, + "learning_rate": 0.00013108805850178568, + "loss": 0.0, + "step": 35658 + }, + { + "epoch": 3.3273304096295604, + "grad_norm": NaN, + "learning_rate": 0.00013108055509943576, + "loss": 0.0, + "step": 35659 + }, + { + "epoch": 3.327423719324438, + "grad_norm": NaN, + "learning_rate": 0.0001310730517451923, + "loss": 0.0, + "step": 35660 + }, + { + "epoch": 3.3275170290193152, + "grad_norm": NaN, + "learning_rate": 0.00013106554843907454, + "loss": 0.0, + "step": 35661 + }, + { + "epoch": 3.3276103387141927, + "grad_norm": NaN, + "learning_rate": 0.00013105804518110146, + "loss": 0.0, + "step": 35662 + }, + { + "epoch": 3.3277036484090696, + "grad_norm": NaN, + "learning_rate": 0.00013105054197129213, + "loss": 0.0, + "step": 35663 + }, + { + "epoch": 3.327796958103947, + "grad_norm": NaN, + "learning_rate": 0.00013104303880966567, + "loss": 0.0, + "step": 35664 + }, + { + "epoch": 3.3278902677988245, + "grad_norm": NaN, + "learning_rate": 0.00013103553569624119, + "loss": 0.0, + "step": 35665 + }, + { + "epoch": 3.3279835774937014, + "grad_norm": NaN, + "learning_rate": 0.00013102803263103765, + "loss": 0.0, + "step": 35666 + }, + { + "epoch": 3.328076887188579, + "grad_norm": NaN, + "learning_rate": 0.00013102052961407425, + "loss": 0.0, + "step": 35667 + }, + { + "epoch": 3.3281701968834563, + "grad_norm": NaN, + "learning_rate": 0.00013101302664537005, + "loss": 0.0, + "step": 35668 + }, + { + "epoch": 3.3282635065783337, + "grad_norm": NaN, + "learning_rate": 0.00013100552372494404, + "loss": 0.0, + "step": 35669 + }, + { + "epoch": 3.3283568162732107, + "grad_norm": NaN, + "learning_rate": 0.0001309980208528154, + "loss": 0.0, + "step": 35670 + }, + { + "epoch": 3.328450125968088, + "grad_norm": NaN, + "learning_rate": 0.00013099051802900315, + "loss": 0.0, + "step": 35671 + }, + { + "epoch": 3.3285434356629655, + "grad_norm": NaN, + "learning_rate": 0.00013098301525352635, + "loss": 0.0, + "step": 35672 + }, + { + "epoch": 3.3286367453578425, + "grad_norm": NaN, + "learning_rate": 0.00013097551252640413, + "loss": 0.0, + "step": 35673 + }, + { + "epoch": 3.32873005505272, + "grad_norm": NaN, + "learning_rate": 0.0001309680098476556, + "loss": 0.0, + "step": 35674 + }, + { + "epoch": 3.3288233647475973, + "grad_norm": NaN, + "learning_rate": 0.0001309605072172997, + "loss": 0.0, + "step": 35675 + }, + { + "epoch": 3.3289166744424747, + "grad_norm": NaN, + "learning_rate": 0.0001309530046353556, + "loss": 0.0, + "step": 35676 + }, + { + "epoch": 3.3290099841373517, + "grad_norm": NaN, + "learning_rate": 0.00013094550210184243, + "loss": 0.0, + "step": 35677 + }, + { + "epoch": 3.329103293832229, + "grad_norm": NaN, + "learning_rate": 0.00013093799961677913, + "loss": 0.0, + "step": 35678 + }, + { + "epoch": 3.3291966035271066, + "grad_norm": NaN, + "learning_rate": 0.00013093049718018485, + "loss": 0.0, + "step": 35679 + }, + { + "epoch": 3.3292899132219835, + "grad_norm": NaN, + "learning_rate": 0.00013092299479207874, + "loss": 0.0, + "step": 35680 + }, + { + "epoch": 3.329383222916861, + "grad_norm": NaN, + "learning_rate": 0.00013091549245247976, + "loss": 0.0, + "step": 35681 + }, + { + "epoch": 3.3294765326117384, + "grad_norm": NaN, + "learning_rate": 0.000130907990161407, + "loss": 0.0, + "step": 35682 + }, + { + "epoch": 3.329569842306616, + "grad_norm": NaN, + "learning_rate": 0.00013090048791887965, + "loss": 0.0, + "step": 35683 + }, + { + "epoch": 3.3296631520014928, + "grad_norm": NaN, + "learning_rate": 0.00013089298572491664, + "loss": 0.0, + "step": 35684 + }, + { + "epoch": 3.32975646169637, + "grad_norm": NaN, + "learning_rate": 0.00013088548357953708, + "loss": 0.0, + "step": 35685 + }, + { + "epoch": 3.3298497713912476, + "grad_norm": NaN, + "learning_rate": 0.00013087798148276016, + "loss": 0.0, + "step": 35686 + }, + { + "epoch": 3.329943081086125, + "grad_norm": NaN, + "learning_rate": 0.0001308704794346048, + "loss": 0.0, + "step": 35687 + }, + { + "epoch": 3.330036390781002, + "grad_norm": NaN, + "learning_rate": 0.0001308629774350902, + "loss": 0.0, + "step": 35688 + }, + { + "epoch": 3.3301297004758794, + "grad_norm": NaN, + "learning_rate": 0.00013085547548423532, + "loss": 0.0, + "step": 35689 + }, + { + "epoch": 3.330223010170757, + "grad_norm": NaN, + "learning_rate": 0.00013084797358205933, + "loss": 0.0, + "step": 35690 + }, + { + "epoch": 3.3303163198656343, + "grad_norm": NaN, + "learning_rate": 0.0001308404717285813, + "loss": 0.0, + "step": 35691 + }, + { + "epoch": 3.3304096295605112, + "grad_norm": NaN, + "learning_rate": 0.00013083296992382018, + "loss": 0.0, + "step": 35692 + }, + { + "epoch": 3.3305029392553887, + "grad_norm": NaN, + "learning_rate": 0.00013082546816779522, + "loss": 0.0, + "step": 35693 + }, + { + "epoch": 3.330596248950266, + "grad_norm": NaN, + "learning_rate": 0.0001308179664605254, + "loss": 0.0, + "step": 35694 + }, + { + "epoch": 3.330689558645143, + "grad_norm": NaN, + "learning_rate": 0.00013081046480202977, + "loss": 0.0, + "step": 35695 + }, + { + "epoch": 3.3307828683400205, + "grad_norm": NaN, + "learning_rate": 0.00013080296319232748, + "loss": 0.0, + "step": 35696 + }, + { + "epoch": 3.330876178034898, + "grad_norm": NaN, + "learning_rate": 0.0001307954616314376, + "loss": 0.0, + "step": 35697 + }, + { + "epoch": 3.3309694877297753, + "grad_norm": NaN, + "learning_rate": 0.0001307879601193791, + "loss": 0.0, + "step": 35698 + }, + { + "epoch": 3.3310627974246523, + "grad_norm": NaN, + "learning_rate": 0.00013078045865617114, + "loss": 0.0, + "step": 35699 + }, + { + "epoch": 3.3311561071195297, + "grad_norm": NaN, + "learning_rate": 0.00013077295724183287, + "loss": 0.0, + "step": 35700 + }, + { + "epoch": 3.331249416814407, + "grad_norm": NaN, + "learning_rate": 0.00013076545587638316, + "loss": 0.0, + "step": 35701 + }, + { + "epoch": 3.331342726509284, + "grad_norm": NaN, + "learning_rate": 0.00013075795455984123, + "loss": 0.0, + "step": 35702 + }, + { + "epoch": 3.3314360362041615, + "grad_norm": NaN, + "learning_rate": 0.00013075045329222616, + "loss": 0.0, + "step": 35703 + }, + { + "epoch": 3.331529345899039, + "grad_norm": NaN, + "learning_rate": 0.00013074295207355694, + "loss": 0.0, + "step": 35704 + }, + { + "epoch": 3.3316226555939163, + "grad_norm": NaN, + "learning_rate": 0.0001307354509038527, + "loss": 0.0, + "step": 35705 + }, + { + "epoch": 3.3317159652887933, + "grad_norm": NaN, + "learning_rate": 0.00013072794978313254, + "loss": 0.0, + "step": 35706 + }, + { + "epoch": 3.3318092749836707, + "grad_norm": NaN, + "learning_rate": 0.00013072044871141543, + "loss": 0.0, + "step": 35707 + }, + { + "epoch": 3.331902584678548, + "grad_norm": NaN, + "learning_rate": 0.00013071294768872055, + "loss": 0.0, + "step": 35708 + }, + { + "epoch": 3.3319958943734256, + "grad_norm": NaN, + "learning_rate": 0.00013070544671506696, + "loss": 0.0, + "step": 35709 + }, + { + "epoch": 3.3320892040683026, + "grad_norm": NaN, + "learning_rate": 0.00013069794579047362, + "loss": 0.0, + "step": 35710 + }, + { + "epoch": 3.33218251376318, + "grad_norm": NaN, + "learning_rate": 0.00013069044491495973, + "loss": 0.0, + "step": 35711 + }, + { + "epoch": 3.3322758234580574, + "grad_norm": NaN, + "learning_rate": 0.00013068294408854437, + "loss": 0.0, + "step": 35712 + }, + { + "epoch": 3.332369133152935, + "grad_norm": NaN, + "learning_rate": 0.00013067544331124645, + "loss": 0.0, + "step": 35713 + }, + { + "epoch": 3.332462442847812, + "grad_norm": NaN, + "learning_rate": 0.00013066794258308523, + "loss": 0.0, + "step": 35714 + }, + { + "epoch": 3.332555752542689, + "grad_norm": NaN, + "learning_rate": 0.0001306604419040797, + "loss": 0.0, + "step": 35715 + }, + { + "epoch": 3.3326490622375666, + "grad_norm": NaN, + "learning_rate": 0.0001306529412742489, + "loss": 0.0, + "step": 35716 + }, + { + "epoch": 3.3327423719324436, + "grad_norm": NaN, + "learning_rate": 0.00013064544069361195, + "loss": 0.0, + "step": 35717 + }, + { + "epoch": 3.332835681627321, + "grad_norm": NaN, + "learning_rate": 0.00013063794016218797, + "loss": 0.0, + "step": 35718 + }, + { + "epoch": 3.3329289913221984, + "grad_norm": NaN, + "learning_rate": 0.00013063043967999593, + "loss": 0.0, + "step": 35719 + }, + { + "epoch": 3.333022301017076, + "grad_norm": NaN, + "learning_rate": 0.0001306229392470549, + "loss": 0.0, + "step": 35720 + }, + { + "epoch": 3.333115610711953, + "grad_norm": NaN, + "learning_rate": 0.00013061543886338408, + "loss": 0.0, + "step": 35721 + }, + { + "epoch": 3.3332089204068303, + "grad_norm": NaN, + "learning_rate": 0.00013060793852900243, + "loss": 0.0, + "step": 35722 + }, + { + "epoch": 3.3333022301017077, + "grad_norm": NaN, + "learning_rate": 0.000130600438243929, + "loss": 0.0, + "step": 35723 + }, + { + "epoch": 3.3333955397965847, + "grad_norm": NaN, + "learning_rate": 0.00013059293800818297, + "loss": 0.0, + "step": 35724 + }, + { + "epoch": 3.333488849491462, + "grad_norm": NaN, + "learning_rate": 0.00013058543782178331, + "loss": 0.0, + "step": 35725 + }, + { + "epoch": 3.3335821591863395, + "grad_norm": NaN, + "learning_rate": 0.00013057793768474912, + "loss": 0.0, + "step": 35726 + }, + { + "epoch": 3.333675468881217, + "grad_norm": NaN, + "learning_rate": 0.00013057043759709955, + "loss": 0.0, + "step": 35727 + }, + { + "epoch": 3.333768778576094, + "grad_norm": NaN, + "learning_rate": 0.00013056293755885355, + "loss": 0.0, + "step": 35728 + }, + { + "epoch": 3.3338620882709713, + "grad_norm": NaN, + "learning_rate": 0.0001305554375700302, + "loss": 0.0, + "step": 35729 + }, + { + "epoch": 3.3339553979658487, + "grad_norm": NaN, + "learning_rate": 0.00013054793763064872, + "loss": 0.0, + "step": 35730 + }, + { + "epoch": 3.334048707660726, + "grad_norm": NaN, + "learning_rate": 0.000130540437740728, + "loss": 0.0, + "step": 35731 + }, + { + "epoch": 3.334142017355603, + "grad_norm": NaN, + "learning_rate": 0.00013053293790028724, + "loss": 0.0, + "step": 35732 + }, + { + "epoch": 3.3342353270504805, + "grad_norm": NaN, + "learning_rate": 0.00013052543810934537, + "loss": 0.0, + "step": 35733 + }, + { + "epoch": 3.334328636745358, + "grad_norm": NaN, + "learning_rate": 0.00013051793836792158, + "loss": 0.0, + "step": 35734 + }, + { + "epoch": 3.3344219464402354, + "grad_norm": NaN, + "learning_rate": 0.00013051043867603495, + "loss": 0.0, + "step": 35735 + }, + { + "epoch": 3.3345152561351123, + "grad_norm": NaN, + "learning_rate": 0.00013050293903370442, + "loss": 0.0, + "step": 35736 + }, + { + "epoch": 3.3346085658299898, + "grad_norm": NaN, + "learning_rate": 0.00013049543944094915, + "loss": 0.0, + "step": 35737 + }, + { + "epoch": 3.334701875524867, + "grad_norm": NaN, + "learning_rate": 0.00013048793989778827, + "loss": 0.0, + "step": 35738 + }, + { + "epoch": 3.334795185219744, + "grad_norm": NaN, + "learning_rate": 0.00013048044040424073, + "loss": 0.0, + "step": 35739 + }, + { + "epoch": 3.3348884949146216, + "grad_norm": NaN, + "learning_rate": 0.00013047294096032563, + "loss": 0.0, + "step": 35740 + }, + { + "epoch": 3.334981804609499, + "grad_norm": NaN, + "learning_rate": 0.00013046544156606214, + "loss": 0.0, + "step": 35741 + }, + { + "epoch": 3.3350751143043764, + "grad_norm": NaN, + "learning_rate": 0.00013045794222146914, + "loss": 0.0, + "step": 35742 + }, + { + "epoch": 3.3351684239992534, + "grad_norm": NaN, + "learning_rate": 0.00013045044292656584, + "loss": 0.0, + "step": 35743 + }, + { + "epoch": 3.335261733694131, + "grad_norm": NaN, + "learning_rate": 0.0001304429436813713, + "loss": 0.0, + "step": 35744 + }, + { + "epoch": 3.3353550433890082, + "grad_norm": NaN, + "learning_rate": 0.00013043544448590452, + "loss": 0.0, + "step": 35745 + }, + { + "epoch": 3.335448353083885, + "grad_norm": NaN, + "learning_rate": 0.00013042794534018462, + "loss": 0.0, + "step": 35746 + }, + { + "epoch": 3.3355416627787626, + "grad_norm": NaN, + "learning_rate": 0.0001304204462442307, + "loss": 0.0, + "step": 35747 + }, + { + "epoch": 3.33563497247364, + "grad_norm": NaN, + "learning_rate": 0.00013041294719806174, + "loss": 0.0, + "step": 35748 + }, + { + "epoch": 3.3357282821685175, + "grad_norm": NaN, + "learning_rate": 0.00013040544820169686, + "loss": 0.0, + "step": 35749 + }, + { + "epoch": 3.3358215918633944, + "grad_norm": NaN, + "learning_rate": 0.00013039794925515517, + "loss": 0.0, + "step": 35750 + }, + { + "epoch": 3.335914901558272, + "grad_norm": NaN, + "learning_rate": 0.0001303904503584556, + "loss": 0.0, + "step": 35751 + }, + { + "epoch": 3.3360082112531493, + "grad_norm": NaN, + "learning_rate": 0.00013038295151161734, + "loss": 0.0, + "step": 35752 + }, + { + "epoch": 3.3361015209480263, + "grad_norm": NaN, + "learning_rate": 0.00013037545271465947, + "loss": 0.0, + "step": 35753 + }, + { + "epoch": 3.3361948306429037, + "grad_norm": NaN, + "learning_rate": 0.00013036795396760095, + "loss": 0.0, + "step": 35754 + }, + { + "epoch": 3.336288140337781, + "grad_norm": NaN, + "learning_rate": 0.00013036045527046092, + "loss": 0.0, + "step": 35755 + }, + { + "epoch": 3.3363814500326585, + "grad_norm": NaN, + "learning_rate": 0.00013035295662325847, + "loss": 0.0, + "step": 35756 + }, + { + "epoch": 3.336474759727536, + "grad_norm": NaN, + "learning_rate": 0.0001303454580260126, + "loss": 0.0, + "step": 35757 + }, + { + "epoch": 3.336568069422413, + "grad_norm": NaN, + "learning_rate": 0.0001303379594787424, + "loss": 0.0, + "step": 35758 + }, + { + "epoch": 3.3366613791172903, + "grad_norm": NaN, + "learning_rate": 0.00013033046098146698, + "loss": 0.0, + "step": 35759 + }, + { + "epoch": 3.3367546888121677, + "grad_norm": NaN, + "learning_rate": 0.00013032296253420535, + "loss": 0.0, + "step": 35760 + }, + { + "epoch": 3.3368479985070447, + "grad_norm": NaN, + "learning_rate": 0.0001303154641369766, + "loss": 0.0, + "step": 35761 + }, + { + "epoch": 3.336941308201922, + "grad_norm": NaN, + "learning_rate": 0.0001303079657897998, + "loss": 0.0, + "step": 35762 + }, + { + "epoch": 3.3370346178967996, + "grad_norm": NaN, + "learning_rate": 0.00013030046749269402, + "loss": 0.0, + "step": 35763 + }, + { + "epoch": 3.337127927591677, + "grad_norm": NaN, + "learning_rate": 0.00013029296924567827, + "loss": 0.0, + "step": 35764 + }, + { + "epoch": 3.337221237286554, + "grad_norm": NaN, + "learning_rate": 0.00013028547104877172, + "loss": 0.0, + "step": 35765 + }, + { + "epoch": 3.3373145469814314, + "grad_norm": NaN, + "learning_rate": 0.00013027797290199336, + "loss": 0.0, + "step": 35766 + }, + { + "epoch": 3.337407856676309, + "grad_norm": NaN, + "learning_rate": 0.00013027047480536222, + "loss": 0.0, + "step": 35767 + }, + { + "epoch": 3.3375011663711858, + "grad_norm": NaN, + "learning_rate": 0.0001302629767588975, + "loss": 0.0, + "step": 35768 + }, + { + "epoch": 3.337594476066063, + "grad_norm": NaN, + "learning_rate": 0.00013025547876261814, + "loss": 0.0, + "step": 35769 + }, + { + "epoch": 3.3376877857609406, + "grad_norm": NaN, + "learning_rate": 0.00013024798081654322, + "loss": 0.0, + "step": 35770 + }, + { + "epoch": 3.337781095455818, + "grad_norm": NaN, + "learning_rate": 0.0001302404829206919, + "loss": 0.0, + "step": 35771 + }, + { + "epoch": 3.337874405150695, + "grad_norm": NaN, + "learning_rate": 0.00013023298507508315, + "loss": 0.0, + "step": 35772 + }, + { + "epoch": 3.3379677148455724, + "grad_norm": NaN, + "learning_rate": 0.00013022548727973602, + "loss": 0.0, + "step": 35773 + }, + { + "epoch": 3.33806102454045, + "grad_norm": NaN, + "learning_rate": 0.0001302179895346697, + "loss": 0.0, + "step": 35774 + }, + { + "epoch": 3.338154334235327, + "grad_norm": NaN, + "learning_rate": 0.00013021049183990313, + "loss": 0.0, + "step": 35775 + }, + { + "epoch": 3.3382476439302042, + "grad_norm": NaN, + "learning_rate": 0.00013020299419545537, + "loss": 0.0, + "step": 35776 + }, + { + "epoch": 3.3383409536250817, + "grad_norm": NaN, + "learning_rate": 0.00013019549660134563, + "loss": 0.0, + "step": 35777 + }, + { + "epoch": 3.338434263319959, + "grad_norm": NaN, + "learning_rate": 0.00013018799905759282, + "loss": 0.0, + "step": 35778 + }, + { + "epoch": 3.338527573014836, + "grad_norm": NaN, + "learning_rate": 0.00013018050156421608, + "loss": 0.0, + "step": 35779 + }, + { + "epoch": 3.3386208827097135, + "grad_norm": NaN, + "learning_rate": 0.00013017300412123442, + "loss": 0.0, + "step": 35780 + }, + { + "epoch": 3.338714192404591, + "grad_norm": NaN, + "learning_rate": 0.00013016550672866693, + "loss": 0.0, + "step": 35781 + }, + { + "epoch": 3.3388075020994683, + "grad_norm": NaN, + "learning_rate": 0.00013015800938653274, + "loss": 0.0, + "step": 35782 + }, + { + "epoch": 3.3389008117943453, + "grad_norm": NaN, + "learning_rate": 0.0001301505120948508, + "loss": 0.0, + "step": 35783 + }, + { + "epoch": 3.3389941214892227, + "grad_norm": NaN, + "learning_rate": 0.00013014301485364025, + "loss": 0.0, + "step": 35784 + }, + { + "epoch": 3.3390874311841, + "grad_norm": NaN, + "learning_rate": 0.00013013551766292015, + "loss": 0.0, + "step": 35785 + }, + { + "epoch": 3.3391807408789775, + "grad_norm": NaN, + "learning_rate": 0.0001301280205227095, + "loss": 0.0, + "step": 35786 + }, + { + "epoch": 3.3392740505738545, + "grad_norm": NaN, + "learning_rate": 0.00013012052343302743, + "loss": 0.0, + "step": 35787 + }, + { + "epoch": 3.339367360268732, + "grad_norm": NaN, + "learning_rate": 0.00013011302639389302, + "loss": 0.0, + "step": 35788 + }, + { + "epoch": 3.3394606699636094, + "grad_norm": NaN, + "learning_rate": 0.00013010552940532523, + "loss": 0.0, + "step": 35789 + }, + { + "epoch": 3.3395539796584863, + "grad_norm": NaN, + "learning_rate": 0.00013009803246734317, + "loss": 0.0, + "step": 35790 + }, + { + "epoch": 3.3396472893533637, + "grad_norm": NaN, + "learning_rate": 0.00013009053557996603, + "loss": 0.0, + "step": 35791 + }, + { + "epoch": 3.339740599048241, + "grad_norm": NaN, + "learning_rate": 0.00013008303874321265, + "loss": 0.0, + "step": 35792 + }, + { + "epoch": 3.3398339087431186, + "grad_norm": NaN, + "learning_rate": 0.0001300755419571022, + "loss": 0.0, + "step": 35793 + }, + { + "epoch": 3.3399272184379956, + "grad_norm": NaN, + "learning_rate": 0.00013006804522165382, + "loss": 0.0, + "step": 35794 + }, + { + "epoch": 3.340020528132873, + "grad_norm": NaN, + "learning_rate": 0.00013006054853688647, + "loss": 0.0, + "step": 35795 + }, + { + "epoch": 3.3401138378277504, + "grad_norm": NaN, + "learning_rate": 0.00013005305190281918, + "loss": 0.0, + "step": 35796 + }, + { + "epoch": 3.3402071475226274, + "grad_norm": NaN, + "learning_rate": 0.00013004555531947117, + "loss": 0.0, + "step": 35797 + }, + { + "epoch": 3.340300457217505, + "grad_norm": NaN, + "learning_rate": 0.00013003805878686136, + "loss": 0.0, + "step": 35798 + }, + { + "epoch": 3.340393766912382, + "grad_norm": NaN, + "learning_rate": 0.0001300305623050088, + "loss": 0.0, + "step": 35799 + }, + { + "epoch": 3.3404870766072596, + "grad_norm": NaN, + "learning_rate": 0.0001300230658739327, + "loss": 0.0, + "step": 35800 + }, + { + "epoch": 3.3405803863021366, + "grad_norm": NaN, + "learning_rate": 0.00013001556949365195, + "loss": 0.0, + "step": 35801 + }, + { + "epoch": 3.340673695997014, + "grad_norm": NaN, + "learning_rate": 0.00013000807316418568, + "loss": 0.0, + "step": 35802 + }, + { + "epoch": 3.3407670056918914, + "grad_norm": NaN, + "learning_rate": 0.00013000057688555304, + "loss": 0.0, + "step": 35803 + }, + { + "epoch": 3.340860315386769, + "grad_norm": NaN, + "learning_rate": 0.00012999308065777296, + "loss": 0.0, + "step": 35804 + }, + { + "epoch": 3.340953625081646, + "grad_norm": NaN, + "learning_rate": 0.0001299855844808645, + "loss": 0.0, + "step": 35805 + }, + { + "epoch": 3.3410469347765233, + "grad_norm": NaN, + "learning_rate": 0.00012997808835484684, + "loss": 0.0, + "step": 35806 + }, + { + "epoch": 3.3411402444714007, + "grad_norm": NaN, + "learning_rate": 0.00012997059227973898, + "loss": 0.0, + "step": 35807 + }, + { + "epoch": 3.341233554166278, + "grad_norm": NaN, + "learning_rate": 0.0001299630962555599, + "loss": 0.0, + "step": 35808 + }, + { + "epoch": 3.341326863861155, + "grad_norm": NaN, + "learning_rate": 0.00012995560028232882, + "loss": 0.0, + "step": 35809 + }, + { + "epoch": 3.3414201735560325, + "grad_norm": NaN, + "learning_rate": 0.00012994810436006465, + "loss": 0.0, + "step": 35810 + }, + { + "epoch": 3.34151348325091, + "grad_norm": NaN, + "learning_rate": 0.0001299406084887865, + "loss": 0.0, + "step": 35811 + }, + { + "epoch": 3.341606792945787, + "grad_norm": NaN, + "learning_rate": 0.00012993311266851348, + "loss": 0.0, + "step": 35812 + }, + { + "epoch": 3.3417001026406643, + "grad_norm": NaN, + "learning_rate": 0.0001299256168992646, + "loss": 0.0, + "step": 35813 + }, + { + "epoch": 3.3417934123355417, + "grad_norm": NaN, + "learning_rate": 0.0001299181211810589, + "loss": 0.0, + "step": 35814 + }, + { + "epoch": 3.341886722030419, + "grad_norm": NaN, + "learning_rate": 0.00012991062551391552, + "loss": 0.0, + "step": 35815 + }, + { + "epoch": 3.341980031725296, + "grad_norm": NaN, + "learning_rate": 0.00012990312989785344, + "loss": 0.0, + "step": 35816 + }, + { + "epoch": 3.3420733414201735, + "grad_norm": NaN, + "learning_rate": 0.00012989563433289173, + "loss": 0.0, + "step": 35817 + }, + { + "epoch": 3.342166651115051, + "grad_norm": NaN, + "learning_rate": 0.0001298881388190495, + "loss": 0.0, + "step": 35818 + }, + { + "epoch": 3.342259960809928, + "grad_norm": NaN, + "learning_rate": 0.00012988064335634576, + "loss": 0.0, + "step": 35819 + }, + { + "epoch": 3.3423532705048054, + "grad_norm": NaN, + "learning_rate": 0.00012987314794479955, + "loss": 0.0, + "step": 35820 + }, + { + "epoch": 3.3424465801996828, + "grad_norm": NaN, + "learning_rate": 0.00012986565258443005, + "loss": 0.0, + "step": 35821 + }, + { + "epoch": 3.34253988989456, + "grad_norm": NaN, + "learning_rate": 0.00012985815727525616, + "loss": 0.0, + "step": 35822 + }, + { + "epoch": 3.342633199589437, + "grad_norm": NaN, + "learning_rate": 0.00012985066201729706, + "loss": 0.0, + "step": 35823 + }, + { + "epoch": 3.3427265092843146, + "grad_norm": NaN, + "learning_rate": 0.00012984316681057171, + "loss": 0.0, + "step": 35824 + }, + { + "epoch": 3.342819818979192, + "grad_norm": NaN, + "learning_rate": 0.0001298356716550992, + "loss": 0.0, + "step": 35825 + }, + { + "epoch": 3.3429131286740694, + "grad_norm": NaN, + "learning_rate": 0.0001298281765508987, + "loss": 0.0, + "step": 35826 + }, + { + "epoch": 3.3430064383689464, + "grad_norm": NaN, + "learning_rate": 0.00012982068149798906, + "loss": 0.0, + "step": 35827 + }, + { + "epoch": 3.343099748063824, + "grad_norm": NaN, + "learning_rate": 0.00012981318649638946, + "loss": 0.0, + "step": 35828 + }, + { + "epoch": 3.3431930577587012, + "grad_norm": NaN, + "learning_rate": 0.00012980569154611904, + "loss": 0.0, + "step": 35829 + }, + { + "epoch": 3.3432863674535787, + "grad_norm": NaN, + "learning_rate": 0.0001297981966471967, + "loss": 0.0, + "step": 35830 + }, + { + "epoch": 3.3433796771484556, + "grad_norm": NaN, + "learning_rate": 0.00012979070179964153, + "loss": 0.0, + "step": 35831 + }, + { + "epoch": 3.343472986843333, + "grad_norm": NaN, + "learning_rate": 0.0001297832070034727, + "loss": 0.0, + "step": 35832 + }, + { + "epoch": 3.3435662965382105, + "grad_norm": NaN, + "learning_rate": 0.00012977571225870912, + "loss": 0.0, + "step": 35833 + }, + { + "epoch": 3.3436596062330874, + "grad_norm": NaN, + "learning_rate": 0.00012976821756536993, + "loss": 0.0, + "step": 35834 + }, + { + "epoch": 3.343752915927965, + "grad_norm": NaN, + "learning_rate": 0.0001297607229234742, + "loss": 0.0, + "step": 35835 + }, + { + "epoch": 3.3438462256228423, + "grad_norm": NaN, + "learning_rate": 0.00012975322833304095, + "loss": 0.0, + "step": 35836 + }, + { + "epoch": 3.3439395353177197, + "grad_norm": NaN, + "learning_rate": 0.00012974573379408917, + "loss": 0.0, + "step": 35837 + }, + { + "epoch": 3.3440328450125967, + "grad_norm": NaN, + "learning_rate": 0.00012973823930663807, + "loss": 0.0, + "step": 35838 + }, + { + "epoch": 3.344126154707474, + "grad_norm": NaN, + "learning_rate": 0.0001297307448707066, + "loss": 0.0, + "step": 35839 + }, + { + "epoch": 3.3442194644023515, + "grad_norm": NaN, + "learning_rate": 0.00012972325048631384, + "loss": 0.0, + "step": 35840 + }, + { + "epoch": 3.3443127740972285, + "grad_norm": NaN, + "learning_rate": 0.00012971575615347886, + "loss": 0.0, + "step": 35841 + }, + { + "epoch": 3.344406083792106, + "grad_norm": NaN, + "learning_rate": 0.0001297082618722207, + "loss": 0.0, + "step": 35842 + }, + { + "epoch": 3.3444993934869833, + "grad_norm": NaN, + "learning_rate": 0.0001297007676425584, + "loss": 0.0, + "step": 35843 + }, + { + "epoch": 3.3445927031818607, + "grad_norm": NaN, + "learning_rate": 0.00012969327346451108, + "loss": 0.0, + "step": 35844 + }, + { + "epoch": 3.3446860128767377, + "grad_norm": NaN, + "learning_rate": 0.0001296857793380977, + "loss": 0.0, + "step": 35845 + }, + { + "epoch": 3.344779322571615, + "grad_norm": NaN, + "learning_rate": 0.00012967828526333738, + "loss": 0.0, + "step": 35846 + }, + { + "epoch": 3.3448726322664926, + "grad_norm": NaN, + "learning_rate": 0.00012967079124024918, + "loss": 0.0, + "step": 35847 + }, + { + "epoch": 3.3449659419613695, + "grad_norm": NaN, + "learning_rate": 0.0001296632972688521, + "loss": 0.0, + "step": 35848 + }, + { + "epoch": 3.345059251656247, + "grad_norm": NaN, + "learning_rate": 0.00012965580334916524, + "loss": 0.0, + "step": 35849 + }, + { + "epoch": 3.3451525613511244, + "grad_norm": NaN, + "learning_rate": 0.00012964830948120767, + "loss": 0.0, + "step": 35850 + }, + { + "epoch": 3.345245871046002, + "grad_norm": NaN, + "learning_rate": 0.00012964081566499842, + "loss": 0.0, + "step": 35851 + }, + { + "epoch": 3.345339180740879, + "grad_norm": NaN, + "learning_rate": 0.0001296333219005565, + "loss": 0.0, + "step": 35852 + }, + { + "epoch": 3.345432490435756, + "grad_norm": NaN, + "learning_rate": 0.00012962582818790106, + "loss": 0.0, + "step": 35853 + }, + { + "epoch": 3.3455258001306336, + "grad_norm": NaN, + "learning_rate": 0.00012961833452705113, + "loss": 0.0, + "step": 35854 + }, + { + "epoch": 3.345619109825511, + "grad_norm": NaN, + "learning_rate": 0.00012961084091802564, + "loss": 0.0, + "step": 35855 + }, + { + "epoch": 3.345712419520388, + "grad_norm": NaN, + "learning_rate": 0.00012960334736084384, + "loss": 0.0, + "step": 35856 + }, + { + "epoch": 3.3458057292152654, + "grad_norm": NaN, + "learning_rate": 0.00012959585385552462, + "loss": 0.0, + "step": 35857 + }, + { + "epoch": 3.345899038910143, + "grad_norm": NaN, + "learning_rate": 0.0001295883604020871, + "loss": 0.0, + "step": 35858 + }, + { + "epoch": 3.3459923486050203, + "grad_norm": NaN, + "learning_rate": 0.00012958086700055036, + "loss": 0.0, + "step": 35859 + }, + { + "epoch": 3.3460856582998972, + "grad_norm": NaN, + "learning_rate": 0.00012957337365093345, + "loss": 0.0, + "step": 35860 + }, + { + "epoch": 3.3461789679947747, + "grad_norm": NaN, + "learning_rate": 0.00012956588035325533, + "loss": 0.0, + "step": 35861 + }, + { + "epoch": 3.346272277689652, + "grad_norm": NaN, + "learning_rate": 0.00012955838710753518, + "loss": 0.0, + "step": 35862 + }, + { + "epoch": 3.346365587384529, + "grad_norm": NaN, + "learning_rate": 0.000129550893913792, + "loss": 0.0, + "step": 35863 + }, + { + "epoch": 3.3464588970794065, + "grad_norm": NaN, + "learning_rate": 0.00012954340077204479, + "loss": 0.0, + "step": 35864 + }, + { + "epoch": 3.346552206774284, + "grad_norm": NaN, + "learning_rate": 0.0001295359076823127, + "loss": 0.0, + "step": 35865 + }, + { + "epoch": 3.3466455164691613, + "grad_norm": NaN, + "learning_rate": 0.0001295284146446147, + "loss": 0.0, + "step": 35866 + }, + { + "epoch": 3.3467388261640383, + "grad_norm": NaN, + "learning_rate": 0.00012952092165896995, + "loss": 0.0, + "step": 35867 + }, + { + "epoch": 3.3468321358589157, + "grad_norm": NaN, + "learning_rate": 0.00012951342872539732, + "loss": 0.0, + "step": 35868 + }, + { + "epoch": 3.346925445553793, + "grad_norm": NaN, + "learning_rate": 0.000129505935843916, + "loss": 0.0, + "step": 35869 + }, + { + "epoch": 3.34701875524867, + "grad_norm": NaN, + "learning_rate": 0.00012949844301454508, + "loss": 0.0, + "step": 35870 + }, + { + "epoch": 3.3471120649435475, + "grad_norm": NaN, + "learning_rate": 0.0001294909502373035, + "loss": 0.0, + "step": 35871 + }, + { + "epoch": 3.347205374638425, + "grad_norm": NaN, + "learning_rate": 0.00012948345751221033, + "loss": 0.0, + "step": 35872 + }, + { + "epoch": 3.3472986843333024, + "grad_norm": NaN, + "learning_rate": 0.0001294759648392847, + "loss": 0.0, + "step": 35873 + }, + { + "epoch": 3.3473919940281798, + "grad_norm": NaN, + "learning_rate": 0.0001294684722185456, + "loss": 0.0, + "step": 35874 + }, + { + "epoch": 3.3474853037230567, + "grad_norm": NaN, + "learning_rate": 0.00012946097965001207, + "loss": 0.0, + "step": 35875 + }, + { + "epoch": 3.347578613417934, + "grad_norm": NaN, + "learning_rate": 0.00012945348713370324, + "loss": 0.0, + "step": 35876 + }, + { + "epoch": 3.3476719231128116, + "grad_norm": NaN, + "learning_rate": 0.00012944599466963808, + "loss": 0.0, + "step": 35877 + }, + { + "epoch": 3.3477652328076886, + "grad_norm": NaN, + "learning_rate": 0.00012943850225783564, + "loss": 0.0, + "step": 35878 + }, + { + "epoch": 3.347858542502566, + "grad_norm": NaN, + "learning_rate": 0.00012943100989831502, + "loss": 0.0, + "step": 35879 + }, + { + "epoch": 3.3479518521974434, + "grad_norm": NaN, + "learning_rate": 0.00012942351759109527, + "loss": 0.0, + "step": 35880 + }, + { + "epoch": 3.348045161892321, + "grad_norm": NaN, + "learning_rate": 0.00012941602533619534, + "loss": 0.0, + "step": 35881 + }, + { + "epoch": 3.348138471587198, + "grad_norm": NaN, + "learning_rate": 0.00012940853313363445, + "loss": 0.0, + "step": 35882 + }, + { + "epoch": 3.348231781282075, + "grad_norm": NaN, + "learning_rate": 0.00012940104098343154, + "loss": 0.0, + "step": 35883 + }, + { + "epoch": 3.3483250909769526, + "grad_norm": NaN, + "learning_rate": 0.0001293935488856056, + "loss": 0.0, + "step": 35884 + }, + { + "epoch": 3.3484184006718296, + "grad_norm": NaN, + "learning_rate": 0.00012938605684017587, + "loss": 0.0, + "step": 35885 + }, + { + "epoch": 3.348511710366707, + "grad_norm": NaN, + "learning_rate": 0.00012937856484716127, + "loss": 0.0, + "step": 35886 + }, + { + "epoch": 3.3486050200615844, + "grad_norm": NaN, + "learning_rate": 0.00012937107290658082, + "loss": 0.0, + "step": 35887 + }, + { + "epoch": 3.348698329756462, + "grad_norm": NaN, + "learning_rate": 0.00012936358101845366, + "loss": 0.0, + "step": 35888 + }, + { + "epoch": 3.348791639451339, + "grad_norm": NaN, + "learning_rate": 0.00012935608918279877, + "loss": 0.0, + "step": 35889 + }, + { + "epoch": 3.3488849491462163, + "grad_norm": NaN, + "learning_rate": 0.00012934859739963524, + "loss": 0.0, + "step": 35890 + }, + { + "epoch": 3.3489782588410937, + "grad_norm": NaN, + "learning_rate": 0.00012934110566898216, + "loss": 0.0, + "step": 35891 + }, + { + "epoch": 3.3490715685359707, + "grad_norm": NaN, + "learning_rate": 0.00012933361399085848, + "loss": 0.0, + "step": 35892 + }, + { + "epoch": 3.349164878230848, + "grad_norm": NaN, + "learning_rate": 0.00012932612236528327, + "loss": 0.0, + "step": 35893 + }, + { + "epoch": 3.3492581879257255, + "grad_norm": NaN, + "learning_rate": 0.00012931863079227568, + "loss": 0.0, + "step": 35894 + }, + { + "epoch": 3.349351497620603, + "grad_norm": NaN, + "learning_rate": 0.00012931113927185466, + "loss": 0.0, + "step": 35895 + }, + { + "epoch": 3.34944480731548, + "grad_norm": NaN, + "learning_rate": 0.00012930364780403924, + "loss": 0.0, + "step": 35896 + }, + { + "epoch": 3.3495381170103573, + "grad_norm": NaN, + "learning_rate": 0.00012929615638884856, + "loss": 0.0, + "step": 35897 + }, + { + "epoch": 3.3496314267052347, + "grad_norm": NaN, + "learning_rate": 0.00012928866502630163, + "loss": 0.0, + "step": 35898 + }, + { + "epoch": 3.349724736400112, + "grad_norm": NaN, + "learning_rate": 0.00012928117371641742, + "loss": 0.0, + "step": 35899 + }, + { + "epoch": 3.349818046094989, + "grad_norm": NaN, + "learning_rate": 0.00012927368245921512, + "loss": 0.0, + "step": 35900 + }, + { + "epoch": 3.3499113557898665, + "grad_norm": NaN, + "learning_rate": 0.00012926619125471368, + "loss": 0.0, + "step": 35901 + }, + { + "epoch": 3.350004665484744, + "grad_norm": NaN, + "learning_rate": 0.00012925870010293216, + "loss": 0.0, + "step": 35902 + }, + { + "epoch": 3.3500979751796214, + "grad_norm": NaN, + "learning_rate": 0.00012925120900388965, + "loss": 0.0, + "step": 35903 + }, + { + "epoch": 3.3501912848744984, + "grad_norm": NaN, + "learning_rate": 0.00012924371795760512, + "loss": 0.0, + "step": 35904 + }, + { + "epoch": 3.3502845945693758, + "grad_norm": NaN, + "learning_rate": 0.0001292362269640977, + "loss": 0.0, + "step": 35905 + }, + { + "epoch": 3.350377904264253, + "grad_norm": NaN, + "learning_rate": 0.00012922873602338645, + "loss": 0.0, + "step": 35906 + }, + { + "epoch": 3.35047121395913, + "grad_norm": NaN, + "learning_rate": 0.0001292212451354903, + "loss": 0.0, + "step": 35907 + }, + { + "epoch": 3.3505645236540076, + "grad_norm": NaN, + "learning_rate": 0.0001292137543004284, + "loss": 0.0, + "step": 35908 + }, + { + "epoch": 3.350657833348885, + "grad_norm": NaN, + "learning_rate": 0.0001292062635182198, + "loss": 0.0, + "step": 35909 + }, + { + "epoch": 3.3507511430437624, + "grad_norm": NaN, + "learning_rate": 0.00012919877278888341, + "loss": 0.0, + "step": 35910 + }, + { + "epoch": 3.3508444527386394, + "grad_norm": NaN, + "learning_rate": 0.00012919128211243846, + "loss": 0.0, + "step": 35911 + }, + { + "epoch": 3.350937762433517, + "grad_norm": NaN, + "learning_rate": 0.00012918379148890393, + "loss": 0.0, + "step": 35912 + }, + { + "epoch": 3.3510310721283942, + "grad_norm": NaN, + "learning_rate": 0.0001291763009182988, + "loss": 0.0, + "step": 35913 + }, + { + "epoch": 3.351124381823271, + "grad_norm": NaN, + "learning_rate": 0.0001291688104006422, + "loss": 0.0, + "step": 35914 + }, + { + "epoch": 3.3512176915181486, + "grad_norm": NaN, + "learning_rate": 0.00012916131993595314, + "loss": 0.0, + "step": 35915 + }, + { + "epoch": 3.351311001213026, + "grad_norm": NaN, + "learning_rate": 0.00012915382952425063, + "loss": 0.0, + "step": 35916 + }, + { + "epoch": 3.3514043109079035, + "grad_norm": NaN, + "learning_rate": 0.0001291463391655538, + "loss": 0.0, + "step": 35917 + }, + { + "epoch": 3.3514976206027804, + "grad_norm": NaN, + "learning_rate": 0.00012913884885988166, + "loss": 0.0, + "step": 35918 + }, + { + "epoch": 3.351590930297658, + "grad_norm": NaN, + "learning_rate": 0.0001291313586072532, + "loss": 0.0, + "step": 35919 + }, + { + "epoch": 3.3516842399925353, + "grad_norm": NaN, + "learning_rate": 0.00012912386840768756, + "loss": 0.0, + "step": 35920 + }, + { + "epoch": 3.3517775496874127, + "grad_norm": NaN, + "learning_rate": 0.00012911637826120375, + "loss": 0.0, + "step": 35921 + }, + { + "epoch": 3.3518708593822897, + "grad_norm": NaN, + "learning_rate": 0.00012910888816782074, + "loss": 0.0, + "step": 35922 + }, + { + "epoch": 3.351964169077167, + "grad_norm": NaN, + "learning_rate": 0.0001291013981275577, + "loss": 0.0, + "step": 35923 + }, + { + "epoch": 3.3520574787720445, + "grad_norm": NaN, + "learning_rate": 0.00012909390814043359, + "loss": 0.0, + "step": 35924 + }, + { + "epoch": 3.352150788466922, + "grad_norm": NaN, + "learning_rate": 0.00012908641820646744, + "loss": 0.0, + "step": 35925 + }, + { + "epoch": 3.352244098161799, + "grad_norm": NaN, + "learning_rate": 0.0001290789283256784, + "loss": 0.0, + "step": 35926 + }, + { + "epoch": 3.3523374078566763, + "grad_norm": NaN, + "learning_rate": 0.00012907143849808542, + "loss": 0.0, + "step": 35927 + }, + { + "epoch": 3.3524307175515538, + "grad_norm": NaN, + "learning_rate": 0.00012906394872370754, + "loss": 0.0, + "step": 35928 + }, + { + "epoch": 3.3525240272464307, + "grad_norm": NaN, + "learning_rate": 0.0001290564590025639, + "loss": 0.0, + "step": 35929 + }, + { + "epoch": 3.352617336941308, + "grad_norm": NaN, + "learning_rate": 0.00012904896933467347, + "loss": 0.0, + "step": 35930 + }, + { + "epoch": 3.3527106466361856, + "grad_norm": NaN, + "learning_rate": 0.00012904147972005522, + "loss": 0.0, + "step": 35931 + }, + { + "epoch": 3.352803956331063, + "grad_norm": NaN, + "learning_rate": 0.0001290339901587284, + "loss": 0.0, + "step": 35932 + }, + { + "epoch": 3.35289726602594, + "grad_norm": NaN, + "learning_rate": 0.00012902650065071185, + "loss": 0.0, + "step": 35933 + }, + { + "epoch": 3.3529905757208174, + "grad_norm": NaN, + "learning_rate": 0.0001290190111960247, + "loss": 0.0, + "step": 35934 + }, + { + "epoch": 3.353083885415695, + "grad_norm": NaN, + "learning_rate": 0.00012901152179468605, + "loss": 0.0, + "step": 35935 + }, + { + "epoch": 3.3531771951105718, + "grad_norm": NaN, + "learning_rate": 0.00012900403244671486, + "loss": 0.0, + "step": 35936 + }, + { + "epoch": 3.353270504805449, + "grad_norm": NaN, + "learning_rate": 0.00012899654315213013, + "loss": 0.0, + "step": 35937 + }, + { + "epoch": 3.3533638145003266, + "grad_norm": NaN, + "learning_rate": 0.00012898905391095104, + "loss": 0.0, + "step": 35938 + }, + { + "epoch": 3.353457124195204, + "grad_norm": NaN, + "learning_rate": 0.00012898156472319653, + "loss": 0.0, + "step": 35939 + }, + { + "epoch": 3.353550433890081, + "grad_norm": NaN, + "learning_rate": 0.00012897407558888567, + "loss": 0.0, + "step": 35940 + }, + { + "epoch": 3.3536437435849584, + "grad_norm": NaN, + "learning_rate": 0.00012896658650803756, + "loss": 0.0, + "step": 35941 + }, + { + "epoch": 3.353737053279836, + "grad_norm": NaN, + "learning_rate": 0.00012895909748067113, + "loss": 0.0, + "step": 35942 + }, + { + "epoch": 3.3538303629747133, + "grad_norm": NaN, + "learning_rate": 0.0001289516085068055, + "loss": 0.0, + "step": 35943 + }, + { + "epoch": 3.3539236726695902, + "grad_norm": NaN, + "learning_rate": 0.00012894411958645974, + "loss": 0.0, + "step": 35944 + }, + { + "epoch": 3.3540169823644677, + "grad_norm": NaN, + "learning_rate": 0.00012893663071965276, + "loss": 0.0, + "step": 35945 + }, + { + "epoch": 3.354110292059345, + "grad_norm": NaN, + "learning_rate": 0.00012892914190640374, + "loss": 0.0, + "step": 35946 + }, + { + "epoch": 3.3542036017542225, + "grad_norm": NaN, + "learning_rate": 0.00012892165314673168, + "loss": 0.0, + "step": 35947 + }, + { + "epoch": 3.3542969114490995, + "grad_norm": NaN, + "learning_rate": 0.00012891416444065558, + "loss": 0.0, + "step": 35948 + }, + { + "epoch": 3.354390221143977, + "grad_norm": NaN, + "learning_rate": 0.00012890667578819453, + "loss": 0.0, + "step": 35949 + }, + { + "epoch": 3.3544835308388543, + "grad_norm": NaN, + "learning_rate": 0.00012889918718936758, + "loss": 0.0, + "step": 35950 + }, + { + "epoch": 3.3545768405337313, + "grad_norm": NaN, + "learning_rate": 0.0001288916986441937, + "loss": 0.0, + "step": 35951 + }, + { + "epoch": 3.3546701502286087, + "grad_norm": NaN, + "learning_rate": 0.000128884210152692, + "loss": 0.0, + "step": 35952 + }, + { + "epoch": 3.354763459923486, + "grad_norm": NaN, + "learning_rate": 0.00012887672171488153, + "loss": 0.0, + "step": 35953 + }, + { + "epoch": 3.3548567696183635, + "grad_norm": NaN, + "learning_rate": 0.00012886923333078125, + "loss": 0.0, + "step": 35954 + }, + { + "epoch": 3.3549500793132405, + "grad_norm": NaN, + "learning_rate": 0.00012886174500041027, + "loss": 0.0, + "step": 35955 + }, + { + "epoch": 3.355043389008118, + "grad_norm": NaN, + "learning_rate": 0.00012885425672378762, + "loss": 0.0, + "step": 35956 + }, + { + "epoch": 3.3551366987029954, + "grad_norm": NaN, + "learning_rate": 0.0001288467685009323, + "loss": 0.0, + "step": 35957 + }, + { + "epoch": 3.3552300083978723, + "grad_norm": NaN, + "learning_rate": 0.00012883928033186344, + "loss": 0.0, + "step": 35958 + }, + { + "epoch": 3.3553233180927498, + "grad_norm": NaN, + "learning_rate": 0.00012883179221659998, + "loss": 0.0, + "step": 35959 + }, + { + "epoch": 3.355416627787627, + "grad_norm": NaN, + "learning_rate": 0.00012882430415516098, + "loss": 0.0, + "step": 35960 + }, + { + "epoch": 3.3555099374825046, + "grad_norm": NaN, + "learning_rate": 0.00012881681614756557, + "loss": 0.0, + "step": 35961 + }, + { + "epoch": 3.3556032471773816, + "grad_norm": NaN, + "learning_rate": 0.00012880932819383267, + "loss": 0.0, + "step": 35962 + }, + { + "epoch": 3.355696556872259, + "grad_norm": NaN, + "learning_rate": 0.00012880184029398134, + "loss": 0.0, + "step": 35963 + }, + { + "epoch": 3.3557898665671364, + "grad_norm": NaN, + "learning_rate": 0.00012879435244803072, + "loss": 0.0, + "step": 35964 + }, + { + "epoch": 3.3558831762620134, + "grad_norm": NaN, + "learning_rate": 0.00012878686465599978, + "loss": 0.0, + "step": 35965 + }, + { + "epoch": 3.355976485956891, + "grad_norm": NaN, + "learning_rate": 0.00012877937691790748, + "loss": 0.0, + "step": 35966 + }, + { + "epoch": 3.356069795651768, + "grad_norm": NaN, + "learning_rate": 0.000128771889233773, + "loss": 0.0, + "step": 35967 + }, + { + "epoch": 3.3561631053466456, + "grad_norm": NaN, + "learning_rate": 0.00012876440160361532, + "loss": 0.0, + "step": 35968 + }, + { + "epoch": 3.356256415041523, + "grad_norm": NaN, + "learning_rate": 0.00012875691402745344, + "loss": 0.0, + "step": 35969 + }, + { + "epoch": 3.3563497247364, + "grad_norm": NaN, + "learning_rate": 0.0001287494265053065, + "loss": 0.0, + "step": 35970 + }, + { + "epoch": 3.3564430344312775, + "grad_norm": NaN, + "learning_rate": 0.00012874193903719341, + "loss": 0.0, + "step": 35971 + }, + { + "epoch": 3.356536344126155, + "grad_norm": NaN, + "learning_rate": 0.00012873445162313326, + "loss": 0.0, + "step": 35972 + }, + { + "epoch": 3.356629653821032, + "grad_norm": NaN, + "learning_rate": 0.00012872696426314518, + "loss": 0.0, + "step": 35973 + }, + { + "epoch": 3.3567229635159093, + "grad_norm": NaN, + "learning_rate": 0.0001287194769572481, + "loss": 0.0, + "step": 35974 + }, + { + "epoch": 3.3568162732107867, + "grad_norm": NaN, + "learning_rate": 0.00012871198970546103, + "loss": 0.0, + "step": 35975 + }, + { + "epoch": 3.356909582905664, + "grad_norm": NaN, + "learning_rate": 0.00012870450250780313, + "loss": 0.0, + "step": 35976 + }, + { + "epoch": 3.357002892600541, + "grad_norm": NaN, + "learning_rate": 0.0001286970153642933, + "loss": 0.0, + "step": 35977 + }, + { + "epoch": 3.3570962022954185, + "grad_norm": NaN, + "learning_rate": 0.0001286895282749507, + "loss": 0.0, + "step": 35978 + }, + { + "epoch": 3.357189511990296, + "grad_norm": NaN, + "learning_rate": 0.00012868204123979435, + "loss": 0.0, + "step": 35979 + }, + { + "epoch": 3.357282821685173, + "grad_norm": NaN, + "learning_rate": 0.00012867455425884318, + "loss": 0.0, + "step": 35980 + }, + { + "epoch": 3.3573761313800503, + "grad_norm": NaN, + "learning_rate": 0.00012866706733211634, + "loss": 0.0, + "step": 35981 + }, + { + "epoch": 3.3574694410749277, + "grad_norm": NaN, + "learning_rate": 0.00012865958045963285, + "loss": 0.0, + "step": 35982 + }, + { + "epoch": 3.357562750769805, + "grad_norm": NaN, + "learning_rate": 0.00012865209364141167, + "loss": 0.0, + "step": 35983 + }, + { + "epoch": 3.357656060464682, + "grad_norm": NaN, + "learning_rate": 0.00012864460687747194, + "loss": 0.0, + "step": 35984 + }, + { + "epoch": 3.3577493701595595, + "grad_norm": NaN, + "learning_rate": 0.00012863712016783266, + "loss": 0.0, + "step": 35985 + }, + { + "epoch": 3.357842679854437, + "grad_norm": NaN, + "learning_rate": 0.0001286296335125128, + "loss": 0.0, + "step": 35986 + }, + { + "epoch": 3.357935989549314, + "grad_norm": NaN, + "learning_rate": 0.00012862214691153147, + "loss": 0.0, + "step": 35987 + }, + { + "epoch": 3.3580292992441914, + "grad_norm": NaN, + "learning_rate": 0.00012861466036490773, + "loss": 0.0, + "step": 35988 + }, + { + "epoch": 3.3581226089390688, + "grad_norm": NaN, + "learning_rate": 0.0001286071738726605, + "loss": 0.0, + "step": 35989 + }, + { + "epoch": 3.358215918633946, + "grad_norm": NaN, + "learning_rate": 0.00012859968743480894, + "loss": 0.0, + "step": 35990 + }, + { + "epoch": 3.358309228328823, + "grad_norm": NaN, + "learning_rate": 0.00012859220105137205, + "loss": 0.0, + "step": 35991 + }, + { + "epoch": 3.3584025380237006, + "grad_norm": NaN, + "learning_rate": 0.0001285847147223688, + "loss": 0.0, + "step": 35992 + }, + { + "epoch": 3.358495847718578, + "grad_norm": NaN, + "learning_rate": 0.0001285772284478183, + "loss": 0.0, + "step": 35993 + }, + { + "epoch": 3.3585891574134554, + "grad_norm": NaN, + "learning_rate": 0.0001285697422277396, + "loss": 0.0, + "step": 35994 + }, + { + "epoch": 3.3586824671083324, + "grad_norm": NaN, + "learning_rate": 0.00012856225606215163, + "loss": 0.0, + "step": 35995 + }, + { + "epoch": 3.35877577680321, + "grad_norm": NaN, + "learning_rate": 0.00012855476995107354, + "loss": 0.0, + "step": 35996 + }, + { + "epoch": 3.3588690864980872, + "grad_norm": NaN, + "learning_rate": 0.00012854728389452433, + "loss": 0.0, + "step": 35997 + }, + { + "epoch": 3.3589623961929647, + "grad_norm": NaN, + "learning_rate": 0.00012853979789252296, + "loss": 0.0, + "step": 35998 + }, + { + "epoch": 3.3590557058878416, + "grad_norm": NaN, + "learning_rate": 0.00012853231194508857, + "loss": 0.0, + "step": 35999 + }, + { + "epoch": 3.359149015582719, + "grad_norm": NaN, + "learning_rate": 0.00012852482605224015, + "loss": 0.0, + "step": 36000 + }, + { + "epoch": 3.3592423252775965, + "grad_norm": NaN, + "learning_rate": 0.0001285173402139967, + "loss": 0.0, + "step": 36001 + }, + { + "epoch": 3.3593356349724734, + "grad_norm": NaN, + "learning_rate": 0.00012850985443037737, + "loss": 0.0, + "step": 36002 + }, + { + "epoch": 3.359428944667351, + "grad_norm": NaN, + "learning_rate": 0.00012850236870140107, + "loss": 0.0, + "step": 36003 + }, + { + "epoch": 3.3595222543622283, + "grad_norm": NaN, + "learning_rate": 0.00012849488302708683, + "loss": 0.0, + "step": 36004 + }, + { + "epoch": 3.3596155640571057, + "grad_norm": NaN, + "learning_rate": 0.0001284873974074538, + "loss": 0.0, + "step": 36005 + }, + { + "epoch": 3.3597088737519827, + "grad_norm": NaN, + "learning_rate": 0.00012847991184252095, + "loss": 0.0, + "step": 36006 + }, + { + "epoch": 3.35980218344686, + "grad_norm": NaN, + "learning_rate": 0.00012847242633230724, + "loss": 0.0, + "step": 36007 + }, + { + "epoch": 3.3598954931417375, + "grad_norm": NaN, + "learning_rate": 0.00012846494087683183, + "loss": 0.0, + "step": 36008 + }, + { + "epoch": 3.3599888028366145, + "grad_norm": NaN, + "learning_rate": 0.00012845745547611367, + "loss": 0.0, + "step": 36009 + }, + { + "epoch": 3.360082112531492, + "grad_norm": NaN, + "learning_rate": 0.00012844997013017182, + "loss": 0.0, + "step": 36010 + }, + { + "epoch": 3.3601754222263693, + "grad_norm": NaN, + "learning_rate": 0.00012844248483902536, + "loss": 0.0, + "step": 36011 + }, + { + "epoch": 3.3602687319212468, + "grad_norm": NaN, + "learning_rate": 0.00012843499960269323, + "loss": 0.0, + "step": 36012 + }, + { + "epoch": 3.3603620416161237, + "grad_norm": NaN, + "learning_rate": 0.0001284275144211945, + "loss": 0.0, + "step": 36013 + }, + { + "epoch": 3.360455351311001, + "grad_norm": NaN, + "learning_rate": 0.00012842002929454823, + "loss": 0.0, + "step": 36014 + }, + { + "epoch": 3.3605486610058786, + "grad_norm": NaN, + "learning_rate": 0.0001284125442227734, + "loss": 0.0, + "step": 36015 + }, + { + "epoch": 3.360641970700756, + "grad_norm": NaN, + "learning_rate": 0.0001284050592058891, + "loss": 0.0, + "step": 36016 + }, + { + "epoch": 3.360735280395633, + "grad_norm": NaN, + "learning_rate": 0.0001283975742439144, + "loss": 0.0, + "step": 36017 + }, + { + "epoch": 3.3608285900905104, + "grad_norm": NaN, + "learning_rate": 0.00012839008933686815, + "loss": 0.0, + "step": 36018 + }, + { + "epoch": 3.360921899785388, + "grad_norm": NaN, + "learning_rate": 0.00012838260448476958, + "loss": 0.0, + "step": 36019 + }, + { + "epoch": 3.361015209480265, + "grad_norm": NaN, + "learning_rate": 0.00012837511968763767, + "loss": 0.0, + "step": 36020 + }, + { + "epoch": 3.361108519175142, + "grad_norm": NaN, + "learning_rate": 0.00012836763494549134, + "loss": 0.0, + "step": 36021 + }, + { + "epoch": 3.3612018288700196, + "grad_norm": NaN, + "learning_rate": 0.00012836015025834973, + "loss": 0.0, + "step": 36022 + }, + { + "epoch": 3.361295138564897, + "grad_norm": NaN, + "learning_rate": 0.0001283526656262319, + "loss": 0.0, + "step": 36023 + }, + { + "epoch": 3.361388448259774, + "grad_norm": NaN, + "learning_rate": 0.00012834518104915676, + "loss": 0.0, + "step": 36024 + }, + { + "epoch": 3.3614817579546514, + "grad_norm": NaN, + "learning_rate": 0.00012833769652714346, + "loss": 0.0, + "step": 36025 + }, + { + "epoch": 3.361575067649529, + "grad_norm": NaN, + "learning_rate": 0.00012833021206021098, + "loss": 0.0, + "step": 36026 + }, + { + "epoch": 3.3616683773444063, + "grad_norm": NaN, + "learning_rate": 0.00012832272764837831, + "loss": 0.0, + "step": 36027 + }, + { + "epoch": 3.3617616870392832, + "grad_norm": NaN, + "learning_rate": 0.00012831524329166453, + "loss": 0.0, + "step": 36028 + }, + { + "epoch": 3.3618549967341607, + "grad_norm": NaN, + "learning_rate": 0.00012830775899008874, + "loss": 0.0, + "step": 36029 + }, + { + "epoch": 3.361948306429038, + "grad_norm": NaN, + "learning_rate": 0.0001283002747436698, + "loss": 0.0, + "step": 36030 + }, + { + "epoch": 3.362041616123915, + "grad_norm": NaN, + "learning_rate": 0.00012829279055242686, + "loss": 0.0, + "step": 36031 + }, + { + "epoch": 3.3621349258187925, + "grad_norm": NaN, + "learning_rate": 0.00012828530641637896, + "loss": 0.0, + "step": 36032 + }, + { + "epoch": 3.36222823551367, + "grad_norm": NaN, + "learning_rate": 0.00012827782233554503, + "loss": 0.0, + "step": 36033 + }, + { + "epoch": 3.3623215452085473, + "grad_norm": NaN, + "learning_rate": 0.0001282703383099442, + "loss": 0.0, + "step": 36034 + }, + { + "epoch": 3.3624148549034243, + "grad_norm": NaN, + "learning_rate": 0.0001282628543395955, + "loss": 0.0, + "step": 36035 + }, + { + "epoch": 3.3625081645983017, + "grad_norm": NaN, + "learning_rate": 0.00012825537042451785, + "loss": 0.0, + "step": 36036 + }, + { + "epoch": 3.362601474293179, + "grad_norm": NaN, + "learning_rate": 0.00012824788656473038, + "loss": 0.0, + "step": 36037 + }, + { + "epoch": 3.3626947839880565, + "grad_norm": NaN, + "learning_rate": 0.00012824040276025213, + "loss": 0.0, + "step": 36038 + }, + { + "epoch": 3.3627880936829335, + "grad_norm": NaN, + "learning_rate": 0.00012823291901110202, + "loss": 0.0, + "step": 36039 + }, + { + "epoch": 3.362881403377811, + "grad_norm": NaN, + "learning_rate": 0.0001282254353172992, + "loss": 0.0, + "step": 36040 + }, + { + "epoch": 3.3629747130726884, + "grad_norm": NaN, + "learning_rate": 0.00012821795167886266, + "loss": 0.0, + "step": 36041 + }, + { + "epoch": 3.363068022767566, + "grad_norm": NaN, + "learning_rate": 0.00012821046809581135, + "loss": 0.0, + "step": 36042 + }, + { + "epoch": 3.3631613324624428, + "grad_norm": NaN, + "learning_rate": 0.00012820298456816441, + "loss": 0.0, + "step": 36043 + }, + { + "epoch": 3.36325464215732, + "grad_norm": NaN, + "learning_rate": 0.00012819550109594084, + "loss": 0.0, + "step": 36044 + }, + { + "epoch": 3.3633479518521976, + "grad_norm": NaN, + "learning_rate": 0.00012818801767915962, + "loss": 0.0, + "step": 36045 + }, + { + "epoch": 3.3634412615470746, + "grad_norm": NaN, + "learning_rate": 0.0001281805343178398, + "loss": 0.0, + "step": 36046 + }, + { + "epoch": 3.363534571241952, + "grad_norm": NaN, + "learning_rate": 0.00012817305101200046, + "loss": 0.0, + "step": 36047 + }, + { + "epoch": 3.3636278809368294, + "grad_norm": NaN, + "learning_rate": 0.00012816556776166054, + "loss": 0.0, + "step": 36048 + }, + { + "epoch": 3.363721190631707, + "grad_norm": NaN, + "learning_rate": 0.00012815808456683917, + "loss": 0.0, + "step": 36049 + }, + { + "epoch": 3.363814500326584, + "grad_norm": NaN, + "learning_rate": 0.0001281506014275553, + "loss": 0.0, + "step": 36050 + }, + { + "epoch": 3.363907810021461, + "grad_norm": NaN, + "learning_rate": 0.00012814311834382792, + "loss": 0.0, + "step": 36051 + }, + { + "epoch": 3.3640011197163386, + "grad_norm": NaN, + "learning_rate": 0.0001281356353156762, + "loss": 0.0, + "step": 36052 + }, + { + "epoch": 3.3640944294112156, + "grad_norm": NaN, + "learning_rate": 0.000128128152343119, + "loss": 0.0, + "step": 36053 + }, + { + "epoch": 3.364187739106093, + "grad_norm": NaN, + "learning_rate": 0.0001281206694261755, + "loss": 0.0, + "step": 36054 + }, + { + "epoch": 3.3642810488009705, + "grad_norm": NaN, + "learning_rate": 0.00012811318656486464, + "loss": 0.0, + "step": 36055 + }, + { + "epoch": 3.364374358495848, + "grad_norm": NaN, + "learning_rate": 0.00012810570375920542, + "loss": 0.0, + "step": 36056 + }, + { + "epoch": 3.364467668190725, + "grad_norm": NaN, + "learning_rate": 0.00012809822100921694, + "loss": 0.0, + "step": 36057 + }, + { + "epoch": 3.3645609778856023, + "grad_norm": NaN, + "learning_rate": 0.00012809073831491824, + "loss": 0.0, + "step": 36058 + }, + { + "epoch": 3.3646542875804797, + "grad_norm": NaN, + "learning_rate": 0.00012808325567632823, + "loss": 0.0, + "step": 36059 + }, + { + "epoch": 3.3647475972753567, + "grad_norm": NaN, + "learning_rate": 0.00012807577309346604, + "loss": 0.0, + "step": 36060 + }, + { + "epoch": 3.364840906970234, + "grad_norm": NaN, + "learning_rate": 0.00012806829056635072, + "loss": 0.0, + "step": 36061 + }, + { + "epoch": 3.3649342166651115, + "grad_norm": NaN, + "learning_rate": 0.00012806080809500114, + "loss": 0.0, + "step": 36062 + }, + { + "epoch": 3.365027526359989, + "grad_norm": NaN, + "learning_rate": 0.00012805332567943645, + "loss": 0.0, + "step": 36063 + }, + { + "epoch": 3.3651208360548663, + "grad_norm": NaN, + "learning_rate": 0.00012804584331967573, + "loss": 0.0, + "step": 36064 + }, + { + "epoch": 3.3652141457497433, + "grad_norm": NaN, + "learning_rate": 0.00012803836101573784, + "loss": 0.0, + "step": 36065 + }, + { + "epoch": 3.3653074554446207, + "grad_norm": NaN, + "learning_rate": 0.0001280308787676419, + "loss": 0.0, + "step": 36066 + }, + { + "epoch": 3.365400765139498, + "grad_norm": NaN, + "learning_rate": 0.00012802339657540698, + "loss": 0.0, + "step": 36067 + }, + { + "epoch": 3.365494074834375, + "grad_norm": NaN, + "learning_rate": 0.000128015914439052, + "loss": 0.0, + "step": 36068 + }, + { + "epoch": 3.3655873845292525, + "grad_norm": NaN, + "learning_rate": 0.00012800843235859607, + "loss": 0.0, + "step": 36069 + }, + { + "epoch": 3.36568069422413, + "grad_norm": NaN, + "learning_rate": 0.00012800095033405819, + "loss": 0.0, + "step": 36070 + }, + { + "epoch": 3.3657740039190074, + "grad_norm": NaN, + "learning_rate": 0.00012799346836545733, + "loss": 0.0, + "step": 36071 + }, + { + "epoch": 3.3658673136138844, + "grad_norm": NaN, + "learning_rate": 0.0001279859864528126, + "loss": 0.0, + "step": 36072 + }, + { + "epoch": 3.365960623308762, + "grad_norm": NaN, + "learning_rate": 0.000127978504596143, + "loss": 0.0, + "step": 36073 + }, + { + "epoch": 3.366053933003639, + "grad_norm": NaN, + "learning_rate": 0.0001279710227954675, + "loss": 0.0, + "step": 36074 + }, + { + "epoch": 3.366147242698516, + "grad_norm": NaN, + "learning_rate": 0.00012796354105080516, + "loss": 0.0, + "step": 36075 + }, + { + "epoch": 3.3662405523933936, + "grad_norm": NaN, + "learning_rate": 0.00012795605936217507, + "loss": 0.0, + "step": 36076 + }, + { + "epoch": 3.366333862088271, + "grad_norm": NaN, + "learning_rate": 0.0001279485777295961, + "loss": 0.0, + "step": 36077 + }, + { + "epoch": 3.3664271717831484, + "grad_norm": NaN, + "learning_rate": 0.00012794109615308743, + "loss": 0.0, + "step": 36078 + }, + { + "epoch": 3.3665204814780254, + "grad_norm": NaN, + "learning_rate": 0.00012793361463266803, + "loss": 0.0, + "step": 36079 + }, + { + "epoch": 3.366613791172903, + "grad_norm": NaN, + "learning_rate": 0.00012792613316835684, + "loss": 0.0, + "step": 36080 + }, + { + "epoch": 3.3667071008677802, + "grad_norm": NaN, + "learning_rate": 0.00012791865176017302, + "loss": 0.0, + "step": 36081 + }, + { + "epoch": 3.366800410562657, + "grad_norm": NaN, + "learning_rate": 0.00012791117040813552, + "loss": 0.0, + "step": 36082 + }, + { + "epoch": 3.3668937202575346, + "grad_norm": NaN, + "learning_rate": 0.00012790368911226333, + "loss": 0.0, + "step": 36083 + }, + { + "epoch": 3.366987029952412, + "grad_norm": NaN, + "learning_rate": 0.00012789620787257553, + "loss": 0.0, + "step": 36084 + }, + { + "epoch": 3.3670803396472895, + "grad_norm": NaN, + "learning_rate": 0.00012788872668909116, + "loss": 0.0, + "step": 36085 + }, + { + "epoch": 3.367173649342167, + "grad_norm": NaN, + "learning_rate": 0.00012788124556182916, + "loss": 0.0, + "step": 36086 + }, + { + "epoch": 3.367266959037044, + "grad_norm": NaN, + "learning_rate": 0.00012787376449080858, + "loss": 0.0, + "step": 36087 + }, + { + "epoch": 3.3673602687319213, + "grad_norm": NaN, + "learning_rate": 0.00012786628347604857, + "loss": 0.0, + "step": 36088 + }, + { + "epoch": 3.3674535784267987, + "grad_norm": NaN, + "learning_rate": 0.00012785880251756795, + "loss": 0.0, + "step": 36089 + }, + { + "epoch": 3.3675468881216757, + "grad_norm": NaN, + "learning_rate": 0.00012785132161538583, + "loss": 0.0, + "step": 36090 + }, + { + "epoch": 3.367640197816553, + "grad_norm": NaN, + "learning_rate": 0.00012784384076952132, + "loss": 0.0, + "step": 36091 + }, + { + "epoch": 3.3677335075114305, + "grad_norm": NaN, + "learning_rate": 0.0001278363599799933, + "loss": 0.0, + "step": 36092 + }, + { + "epoch": 3.367826817206308, + "grad_norm": NaN, + "learning_rate": 0.0001278288792468209, + "loss": 0.0, + "step": 36093 + }, + { + "epoch": 3.367920126901185, + "grad_norm": NaN, + "learning_rate": 0.00012782139857002304, + "loss": 0.0, + "step": 36094 + }, + { + "epoch": 3.3680134365960623, + "grad_norm": NaN, + "learning_rate": 0.0001278139179496188, + "loss": 0.0, + "step": 36095 + }, + { + "epoch": 3.3681067462909398, + "grad_norm": NaN, + "learning_rate": 0.00012780643738562723, + "loss": 0.0, + "step": 36096 + }, + { + "epoch": 3.3682000559858167, + "grad_norm": NaN, + "learning_rate": 0.00012779895687806727, + "loss": 0.0, + "step": 36097 + }, + { + "epoch": 3.368293365680694, + "grad_norm": NaN, + "learning_rate": 0.000127791476426958, + "loss": 0.0, + "step": 36098 + }, + { + "epoch": 3.3683866753755716, + "grad_norm": NaN, + "learning_rate": 0.0001277839960323185, + "loss": 0.0, + "step": 36099 + }, + { + "epoch": 3.368479985070449, + "grad_norm": NaN, + "learning_rate": 0.0001277765156941676, + "loss": 0.0, + "step": 36100 + }, + { + "epoch": 3.368573294765326, + "grad_norm": NaN, + "learning_rate": 0.0001277690354125245, + "loss": 0.0, + "step": 36101 + }, + { + "epoch": 3.3686666044602034, + "grad_norm": NaN, + "learning_rate": 0.00012776155518740817, + "loss": 0.0, + "step": 36102 + }, + { + "epoch": 3.368759914155081, + "grad_norm": NaN, + "learning_rate": 0.00012775407501883755, + "loss": 0.0, + "step": 36103 + }, + { + "epoch": 3.368853223849958, + "grad_norm": NaN, + "learning_rate": 0.00012774659490683178, + "loss": 0.0, + "step": 36104 + }, + { + "epoch": 3.368946533544835, + "grad_norm": NaN, + "learning_rate": 0.00012773911485140985, + "loss": 0.0, + "step": 36105 + }, + { + "epoch": 3.3690398432397126, + "grad_norm": NaN, + "learning_rate": 0.00012773163485259068, + "loss": 0.0, + "step": 36106 + }, + { + "epoch": 3.36913315293459, + "grad_norm": NaN, + "learning_rate": 0.00012772415491039342, + "loss": 0.0, + "step": 36107 + }, + { + "epoch": 3.369226462629467, + "grad_norm": NaN, + "learning_rate": 0.00012771667502483707, + "loss": 0.0, + "step": 36108 + }, + { + "epoch": 3.3693197723243444, + "grad_norm": NaN, + "learning_rate": 0.00012770919519594054, + "loss": 0.0, + "step": 36109 + }, + { + "epoch": 3.369413082019222, + "grad_norm": NaN, + "learning_rate": 0.00012770171542372295, + "loss": 0.0, + "step": 36110 + }, + { + "epoch": 3.3695063917140993, + "grad_norm": NaN, + "learning_rate": 0.00012769423570820334, + "loss": 0.0, + "step": 36111 + }, + { + "epoch": 3.3695997014089762, + "grad_norm": NaN, + "learning_rate": 0.00012768675604940059, + "loss": 0.0, + "step": 36112 + }, + { + "epoch": 3.3696930111038537, + "grad_norm": NaN, + "learning_rate": 0.00012767927644733386, + "loss": 0.0, + "step": 36113 + }, + { + "epoch": 3.369786320798731, + "grad_norm": NaN, + "learning_rate": 0.00012767179690202217, + "loss": 0.0, + "step": 36114 + }, + { + "epoch": 3.3698796304936085, + "grad_norm": NaN, + "learning_rate": 0.00012766431741348437, + "loss": 0.0, + "step": 36115 + }, + { + "epoch": 3.3699729401884855, + "grad_norm": NaN, + "learning_rate": 0.00012765683798173966, + "loss": 0.0, + "step": 36116 + }, + { + "epoch": 3.370066249883363, + "grad_norm": NaN, + "learning_rate": 0.00012764935860680703, + "loss": 0.0, + "step": 36117 + }, + { + "epoch": 3.3701595595782403, + "grad_norm": NaN, + "learning_rate": 0.00012764187928870538, + "loss": 0.0, + "step": 36118 + }, + { + "epoch": 3.3702528692731173, + "grad_norm": NaN, + "learning_rate": 0.00012763440002745385, + "loss": 0.0, + "step": 36119 + }, + { + "epoch": 3.3703461789679947, + "grad_norm": NaN, + "learning_rate": 0.00012762692082307145, + "loss": 0.0, + "step": 36120 + }, + { + "epoch": 3.370439488662872, + "grad_norm": NaN, + "learning_rate": 0.0001276194416755771, + "loss": 0.0, + "step": 36121 + }, + { + "epoch": 3.3705327983577495, + "grad_norm": NaN, + "learning_rate": 0.0001276119625849899, + "loss": 0.0, + "step": 36122 + }, + { + "epoch": 3.3706261080526265, + "grad_norm": NaN, + "learning_rate": 0.0001276044835513289, + "loss": 0.0, + "step": 36123 + }, + { + "epoch": 3.370719417747504, + "grad_norm": NaN, + "learning_rate": 0.000127597004574613, + "loss": 0.0, + "step": 36124 + }, + { + "epoch": 3.3708127274423814, + "grad_norm": NaN, + "learning_rate": 0.00012758952565486125, + "loss": 0.0, + "step": 36125 + }, + { + "epoch": 3.3709060371372583, + "grad_norm": NaN, + "learning_rate": 0.0001275820467920928, + "loss": 0.0, + "step": 36126 + }, + { + "epoch": 3.3709993468321358, + "grad_norm": NaN, + "learning_rate": 0.0001275745679863265, + "loss": 0.0, + "step": 36127 + }, + { + "epoch": 3.371092656527013, + "grad_norm": NaN, + "learning_rate": 0.0001275670892375814, + "loss": 0.0, + "step": 36128 + }, + { + "epoch": 3.3711859662218906, + "grad_norm": NaN, + "learning_rate": 0.00012755961054587662, + "loss": 0.0, + "step": 36129 + }, + { + "epoch": 3.3712792759167676, + "grad_norm": NaN, + "learning_rate": 0.00012755213191123108, + "loss": 0.0, + "step": 36130 + }, + { + "epoch": 3.371372585611645, + "grad_norm": NaN, + "learning_rate": 0.00012754465333366378, + "loss": 0.0, + "step": 36131 + }, + { + "epoch": 3.3714658953065224, + "grad_norm": NaN, + "learning_rate": 0.00012753717481319384, + "loss": 0.0, + "step": 36132 + }, + { + "epoch": 3.3715592050014, + "grad_norm": NaN, + "learning_rate": 0.00012752969634984018, + "loss": 0.0, + "step": 36133 + }, + { + "epoch": 3.371652514696277, + "grad_norm": NaN, + "learning_rate": 0.0001275222179436218, + "loss": 0.0, + "step": 36134 + }, + { + "epoch": 3.3717458243911542, + "grad_norm": NaN, + "learning_rate": 0.00012751473959455784, + "loss": 0.0, + "step": 36135 + }, + { + "epoch": 3.3718391340860316, + "grad_norm": NaN, + "learning_rate": 0.0001275072613026672, + "loss": 0.0, + "step": 36136 + }, + { + "epoch": 3.371932443780909, + "grad_norm": NaN, + "learning_rate": 0.00012749978306796897, + "loss": 0.0, + "step": 36137 + }, + { + "epoch": 3.372025753475786, + "grad_norm": NaN, + "learning_rate": 0.00012749230489048205, + "loss": 0.0, + "step": 36138 + }, + { + "epoch": 3.3721190631706635, + "grad_norm": NaN, + "learning_rate": 0.00012748482677022558, + "loss": 0.0, + "step": 36139 + }, + { + "epoch": 3.372212372865541, + "grad_norm": NaN, + "learning_rate": 0.00012747734870721856, + "loss": 0.0, + "step": 36140 + }, + { + "epoch": 3.372305682560418, + "grad_norm": NaN, + "learning_rate": 0.0001274698707014799, + "loss": 0.0, + "step": 36141 + }, + { + "epoch": 3.3723989922552953, + "grad_norm": NaN, + "learning_rate": 0.0001274623927530287, + "loss": 0.0, + "step": 36142 + }, + { + "epoch": 3.3724923019501727, + "grad_norm": NaN, + "learning_rate": 0.000127454914861884, + "loss": 0.0, + "step": 36143 + }, + { + "epoch": 3.37258561164505, + "grad_norm": NaN, + "learning_rate": 0.0001274474370280647, + "loss": 0.0, + "step": 36144 + }, + { + "epoch": 3.372678921339927, + "grad_norm": NaN, + "learning_rate": 0.00012743995925158994, + "loss": 0.0, + "step": 36145 + }, + { + "epoch": 3.3727722310348045, + "grad_norm": NaN, + "learning_rate": 0.0001274324815324787, + "loss": 0.0, + "step": 36146 + }, + { + "epoch": 3.372865540729682, + "grad_norm": NaN, + "learning_rate": 0.00012742500387074993, + "loss": 0.0, + "step": 36147 + }, + { + "epoch": 3.372958850424559, + "grad_norm": NaN, + "learning_rate": 0.0001274175262664227, + "loss": 0.0, + "step": 36148 + }, + { + "epoch": 3.3730521601194363, + "grad_norm": NaN, + "learning_rate": 0.00012741004871951605, + "loss": 0.0, + "step": 36149 + }, + { + "epoch": 3.3731454698143137, + "grad_norm": NaN, + "learning_rate": 0.0001274025712300489, + "loss": 0.0, + "step": 36150 + }, + { + "epoch": 3.373238779509191, + "grad_norm": NaN, + "learning_rate": 0.0001273950937980403, + "loss": 0.0, + "step": 36151 + }, + { + "epoch": 3.373332089204068, + "grad_norm": NaN, + "learning_rate": 0.00012738761642350937, + "loss": 0.0, + "step": 36152 + }, + { + "epoch": 3.3734253988989455, + "grad_norm": NaN, + "learning_rate": 0.00012738013910647495, + "loss": 0.0, + "step": 36153 + }, + { + "epoch": 3.373518708593823, + "grad_norm": NaN, + "learning_rate": 0.00012737266184695617, + "loss": 0.0, + "step": 36154 + }, + { + "epoch": 3.3736120182887004, + "grad_norm": NaN, + "learning_rate": 0.00012736518464497205, + "loss": 0.0, + "step": 36155 + }, + { + "epoch": 3.3737053279835774, + "grad_norm": NaN, + "learning_rate": 0.00012735770750054148, + "loss": 0.0, + "step": 36156 + }, + { + "epoch": 3.373798637678455, + "grad_norm": NaN, + "learning_rate": 0.00012735023041368358, + "loss": 0.0, + "step": 36157 + }, + { + "epoch": 3.373891947373332, + "grad_norm": NaN, + "learning_rate": 0.0001273427533844174, + "loss": 0.0, + "step": 36158 + }, + { + "epoch": 3.3739852570682096, + "grad_norm": NaN, + "learning_rate": 0.00012733527641276178, + "loss": 0.0, + "step": 36159 + }, + { + "epoch": 3.3740785667630866, + "grad_norm": NaN, + "learning_rate": 0.00012732779949873587, + "loss": 0.0, + "step": 36160 + }, + { + "epoch": 3.374171876457964, + "grad_norm": NaN, + "learning_rate": 0.0001273203226423587, + "loss": 0.0, + "step": 36161 + }, + { + "epoch": 3.3742651861528414, + "grad_norm": NaN, + "learning_rate": 0.0001273128458436492, + "loss": 0.0, + "step": 36162 + }, + { + "epoch": 3.3743584958477184, + "grad_norm": NaN, + "learning_rate": 0.00012730536910262634, + "loss": 0.0, + "step": 36163 + }, + { + "epoch": 3.374451805542596, + "grad_norm": NaN, + "learning_rate": 0.00012729789241930933, + "loss": 0.0, + "step": 36164 + }, + { + "epoch": 3.3745451152374732, + "grad_norm": NaN, + "learning_rate": 0.000127290415793717, + "loss": 0.0, + "step": 36165 + }, + { + "epoch": 3.3746384249323507, + "grad_norm": NaN, + "learning_rate": 0.00012728293922586838, + "loss": 0.0, + "step": 36166 + }, + { + "epoch": 3.3747317346272276, + "grad_norm": NaN, + "learning_rate": 0.0001272754627157826, + "loss": 0.0, + "step": 36167 + }, + { + "epoch": 3.374825044322105, + "grad_norm": NaN, + "learning_rate": 0.00012726798626347852, + "loss": 0.0, + "step": 36168 + }, + { + "epoch": 3.3749183540169825, + "grad_norm": NaN, + "learning_rate": 0.00012726050986897522, + "loss": 0.0, + "step": 36169 + }, + { + "epoch": 3.3750116637118595, + "grad_norm": NaN, + "learning_rate": 0.00012725303353229177, + "loss": 0.0, + "step": 36170 + }, + { + "epoch": 3.375104973406737, + "grad_norm": NaN, + "learning_rate": 0.00012724555725344706, + "loss": 0.0, + "step": 36171 + }, + { + "epoch": 3.3751982831016143, + "grad_norm": NaN, + "learning_rate": 0.00012723808103246012, + "loss": 0.0, + "step": 36172 + }, + { + "epoch": 3.3752915927964917, + "grad_norm": NaN, + "learning_rate": 0.0001272306048693501, + "loss": 0.0, + "step": 36173 + }, + { + "epoch": 3.3753849024913687, + "grad_norm": NaN, + "learning_rate": 0.00012722312876413584, + "loss": 0.0, + "step": 36174 + }, + { + "epoch": 3.375478212186246, + "grad_norm": NaN, + "learning_rate": 0.00012721565271683638, + "loss": 0.0, + "step": 36175 + }, + { + "epoch": 3.3755715218811235, + "grad_norm": NaN, + "learning_rate": 0.00012720817672747087, + "loss": 0.0, + "step": 36176 + }, + { + "epoch": 3.3756648315760005, + "grad_norm": NaN, + "learning_rate": 0.00012720070079605815, + "loss": 0.0, + "step": 36177 + }, + { + "epoch": 3.375758141270878, + "grad_norm": NaN, + "learning_rate": 0.0001271932249226173, + "loss": 0.0, + "step": 36178 + }, + { + "epoch": 3.3758514509657553, + "grad_norm": NaN, + "learning_rate": 0.00012718574910716734, + "loss": 0.0, + "step": 36179 + }, + { + "epoch": 3.3759447606606328, + "grad_norm": NaN, + "learning_rate": 0.00012717827334972725, + "loss": 0.0, + "step": 36180 + }, + { + "epoch": 3.37603807035551, + "grad_norm": NaN, + "learning_rate": 0.0001271707976503161, + "loss": 0.0, + "step": 36181 + }, + { + "epoch": 3.376131380050387, + "grad_norm": NaN, + "learning_rate": 0.00012716332200895275, + "loss": 0.0, + "step": 36182 + }, + { + "epoch": 3.3762246897452646, + "grad_norm": NaN, + "learning_rate": 0.00012715584642565637, + "loss": 0.0, + "step": 36183 + }, + { + "epoch": 3.376317999440142, + "grad_norm": NaN, + "learning_rate": 0.0001271483709004459, + "loss": 0.0, + "step": 36184 + }, + { + "epoch": 3.376411309135019, + "grad_norm": NaN, + "learning_rate": 0.00012714089543334033, + "loss": 0.0, + "step": 36185 + }, + { + "epoch": 3.3765046188298964, + "grad_norm": NaN, + "learning_rate": 0.0001271334200243587, + "loss": 0.0, + "step": 36186 + }, + { + "epoch": 3.376597928524774, + "grad_norm": NaN, + "learning_rate": 0.00012712594467352006, + "loss": 0.0, + "step": 36187 + }, + { + "epoch": 3.3766912382196512, + "grad_norm": NaN, + "learning_rate": 0.0001271184693808433, + "loss": 0.0, + "step": 36188 + }, + { + "epoch": 3.376784547914528, + "grad_norm": NaN, + "learning_rate": 0.0001271109941463475, + "loss": 0.0, + "step": 36189 + }, + { + "epoch": 3.3768778576094056, + "grad_norm": NaN, + "learning_rate": 0.00012710351897005174, + "loss": 0.0, + "step": 36190 + }, + { + "epoch": 3.376971167304283, + "grad_norm": NaN, + "learning_rate": 0.00012709604385197483, + "loss": 0.0, + "step": 36191 + }, + { + "epoch": 3.37706447699916, + "grad_norm": NaN, + "learning_rate": 0.00012708856879213595, + "loss": 0.0, + "step": 36192 + }, + { + "epoch": 3.3771577866940374, + "grad_norm": NaN, + "learning_rate": 0.00012708109379055412, + "loss": 0.0, + "step": 36193 + }, + { + "epoch": 3.377251096388915, + "grad_norm": NaN, + "learning_rate": 0.0001270736188472482, + "loss": 0.0, + "step": 36194 + }, + { + "epoch": 3.3773444060837923, + "grad_norm": NaN, + "learning_rate": 0.00012706614396223727, + "loss": 0.0, + "step": 36195 + }, + { + "epoch": 3.3774377157786692, + "grad_norm": NaN, + "learning_rate": 0.0001270586691355404, + "loss": 0.0, + "step": 36196 + }, + { + "epoch": 3.3775310254735467, + "grad_norm": NaN, + "learning_rate": 0.00012705119436717647, + "loss": 0.0, + "step": 36197 + }, + { + "epoch": 3.377624335168424, + "grad_norm": NaN, + "learning_rate": 0.00012704371965716456, + "loss": 0.0, + "step": 36198 + }, + { + "epoch": 3.377717644863301, + "grad_norm": NaN, + "learning_rate": 0.00012703624500552376, + "loss": 0.0, + "step": 36199 + }, + { + "epoch": 3.3778109545581785, + "grad_norm": NaN, + "learning_rate": 0.00012702877041227288, + "loss": 0.0, + "step": 36200 + }, + { + "epoch": 3.377904264253056, + "grad_norm": NaN, + "learning_rate": 0.00012702129587743104, + "loss": 0.0, + "step": 36201 + }, + { + "epoch": 3.3779975739479333, + "grad_norm": NaN, + "learning_rate": 0.00012701382140101734, + "loss": 0.0, + "step": 36202 + }, + { + "epoch": 3.3780908836428107, + "grad_norm": NaN, + "learning_rate": 0.0001270063469830506, + "loss": 0.0, + "step": 36203 + }, + { + "epoch": 3.3781841933376877, + "grad_norm": NaN, + "learning_rate": 0.0001269988726235499, + "loss": 0.0, + "step": 36204 + }, + { + "epoch": 3.378277503032565, + "grad_norm": NaN, + "learning_rate": 0.00012699139832253432, + "loss": 0.0, + "step": 36205 + }, + { + "epoch": 3.3783708127274426, + "grad_norm": NaN, + "learning_rate": 0.00012698392408002273, + "loss": 0.0, + "step": 36206 + }, + { + "epoch": 3.3784641224223195, + "grad_norm": NaN, + "learning_rate": 0.00012697644989603422, + "loss": 0.0, + "step": 36207 + }, + { + "epoch": 3.378557432117197, + "grad_norm": NaN, + "learning_rate": 0.00012696897577058782, + "loss": 0.0, + "step": 36208 + }, + { + "epoch": 3.3786507418120744, + "grad_norm": NaN, + "learning_rate": 0.0001269615017037025, + "loss": 0.0, + "step": 36209 + }, + { + "epoch": 3.378744051506952, + "grad_norm": NaN, + "learning_rate": 0.00012695402769539717, + "loss": 0.0, + "step": 36210 + }, + { + "epoch": 3.3788373612018288, + "grad_norm": NaN, + "learning_rate": 0.000126946553745691, + "loss": 0.0, + "step": 36211 + }, + { + "epoch": 3.378930670896706, + "grad_norm": NaN, + "learning_rate": 0.0001269390798546029, + "loss": 0.0, + "step": 36212 + }, + { + "epoch": 3.3790239805915836, + "grad_norm": NaN, + "learning_rate": 0.0001269316060221518, + "loss": 0.0, + "step": 36213 + }, + { + "epoch": 3.3791172902864606, + "grad_norm": NaN, + "learning_rate": 0.00012692413224835693, + "loss": 0.0, + "step": 36214 + }, + { + "epoch": 3.379210599981338, + "grad_norm": NaN, + "learning_rate": 0.00012691665853323709, + "loss": 0.0, + "step": 36215 + }, + { + "epoch": 3.3793039096762154, + "grad_norm": NaN, + "learning_rate": 0.00012690918487681132, + "loss": 0.0, + "step": 36216 + }, + { + "epoch": 3.379397219371093, + "grad_norm": NaN, + "learning_rate": 0.0001269017112790987, + "loss": 0.0, + "step": 36217 + }, + { + "epoch": 3.37949052906597, + "grad_norm": NaN, + "learning_rate": 0.00012689423774011818, + "loss": 0.0, + "step": 36218 + }, + { + "epoch": 3.3795838387608472, + "grad_norm": NaN, + "learning_rate": 0.00012688676425988874, + "loss": 0.0, + "step": 36219 + }, + { + "epoch": 3.3796771484557246, + "grad_norm": NaN, + "learning_rate": 0.00012687929083842945, + "loss": 0.0, + "step": 36220 + }, + { + "epoch": 3.3797704581506016, + "grad_norm": NaN, + "learning_rate": 0.00012687181747575926, + "loss": 0.0, + "step": 36221 + }, + { + "epoch": 3.379863767845479, + "grad_norm": NaN, + "learning_rate": 0.00012686434417189714, + "loss": 0.0, + "step": 36222 + }, + { + "epoch": 3.3799570775403565, + "grad_norm": NaN, + "learning_rate": 0.00012685687092686224, + "loss": 0.0, + "step": 36223 + }, + { + "epoch": 3.380050387235234, + "grad_norm": NaN, + "learning_rate": 0.0001268493977406734, + "loss": 0.0, + "step": 36224 + }, + { + "epoch": 3.380143696930111, + "grad_norm": NaN, + "learning_rate": 0.00012684192461334966, + "loss": 0.0, + "step": 36225 + }, + { + "epoch": 3.3802370066249883, + "grad_norm": NaN, + "learning_rate": 0.0001268344515449101, + "loss": 0.0, + "step": 36226 + }, + { + "epoch": 3.3803303163198657, + "grad_norm": NaN, + "learning_rate": 0.00012682697853537366, + "loss": 0.0, + "step": 36227 + }, + { + "epoch": 3.380423626014743, + "grad_norm": NaN, + "learning_rate": 0.00012681950558475935, + "loss": 0.0, + "step": 36228 + }, + { + "epoch": 3.38051693570962, + "grad_norm": NaN, + "learning_rate": 0.0001268120326930861, + "loss": 0.0, + "step": 36229 + }, + { + "epoch": 3.3806102454044975, + "grad_norm": NaN, + "learning_rate": 0.00012680455986037304, + "loss": 0.0, + "step": 36230 + }, + { + "epoch": 3.380703555099375, + "grad_norm": NaN, + "learning_rate": 0.00012679708708663917, + "loss": 0.0, + "step": 36231 + }, + { + "epoch": 3.3807968647942523, + "grad_norm": NaN, + "learning_rate": 0.00012678961437190332, + "loss": 0.0, + "step": 36232 + }, + { + "epoch": 3.3808901744891293, + "grad_norm": NaN, + "learning_rate": 0.00012678214171618467, + "loss": 0.0, + "step": 36233 + }, + { + "epoch": 3.3809834841840067, + "grad_norm": NaN, + "learning_rate": 0.00012677466911950217, + "loss": 0.0, + "step": 36234 + }, + { + "epoch": 3.381076793878884, + "grad_norm": NaN, + "learning_rate": 0.00012676719658187474, + "loss": 0.0, + "step": 36235 + }, + { + "epoch": 3.381170103573761, + "grad_norm": NaN, + "learning_rate": 0.00012675972410332144, + "loss": 0.0, + "step": 36236 + }, + { + "epoch": 3.3812634132686386, + "grad_norm": NaN, + "learning_rate": 0.0001267522516838614, + "loss": 0.0, + "step": 36237 + }, + { + "epoch": 3.381356722963516, + "grad_norm": NaN, + "learning_rate": 0.0001267447793235134, + "loss": 0.0, + "step": 36238 + }, + { + "epoch": 3.3814500326583934, + "grad_norm": NaN, + "learning_rate": 0.00012673730702229653, + "loss": 0.0, + "step": 36239 + }, + { + "epoch": 3.3815433423532704, + "grad_norm": NaN, + "learning_rate": 0.00012672983478022984, + "loss": 0.0, + "step": 36240 + }, + { + "epoch": 3.381636652048148, + "grad_norm": NaN, + "learning_rate": 0.0001267223625973323, + "loss": 0.0, + "step": 36241 + }, + { + "epoch": 3.381729961743025, + "grad_norm": NaN, + "learning_rate": 0.0001267148904736228, + "loss": 0.0, + "step": 36242 + }, + { + "epoch": 3.381823271437902, + "grad_norm": NaN, + "learning_rate": 0.00012670741840912055, + "loss": 0.0, + "step": 36243 + }, + { + "epoch": 3.3819165811327796, + "grad_norm": NaN, + "learning_rate": 0.0001266999464038444, + "loss": 0.0, + "step": 36244 + }, + { + "epoch": 3.382009890827657, + "grad_norm": NaN, + "learning_rate": 0.00012669247445781334, + "loss": 0.0, + "step": 36245 + }, + { + "epoch": 3.3821032005225344, + "grad_norm": NaN, + "learning_rate": 0.00012668500257104648, + "loss": 0.0, + "step": 36246 + }, + { + "epoch": 3.3821965102174114, + "grad_norm": NaN, + "learning_rate": 0.0001266775307435627, + "loss": 0.0, + "step": 36247 + }, + { + "epoch": 3.382289819912289, + "grad_norm": NaN, + "learning_rate": 0.00012667005897538106, + "loss": 0.0, + "step": 36248 + }, + { + "epoch": 3.3823831296071662, + "grad_norm": NaN, + "learning_rate": 0.0001266625872665206, + "loss": 0.0, + "step": 36249 + }, + { + "epoch": 3.3824764393020437, + "grad_norm": NaN, + "learning_rate": 0.00012665511561700023, + "loss": 0.0, + "step": 36250 + }, + { + "epoch": 3.3825697489969206, + "grad_norm": NaN, + "learning_rate": 0.00012664764402683896, + "loss": 0.0, + "step": 36251 + }, + { + "epoch": 3.382663058691798, + "grad_norm": NaN, + "learning_rate": 0.00012664017249605587, + "loss": 0.0, + "step": 36252 + }, + { + "epoch": 3.3827563683866755, + "grad_norm": NaN, + "learning_rate": 0.00012663270102466985, + "loss": 0.0, + "step": 36253 + }, + { + "epoch": 3.382849678081553, + "grad_norm": NaN, + "learning_rate": 0.00012662522961269995, + "loss": 0.0, + "step": 36254 + }, + { + "epoch": 3.38294298777643, + "grad_norm": NaN, + "learning_rate": 0.00012661775826016523, + "loss": 0.0, + "step": 36255 + }, + { + "epoch": 3.3830362974713073, + "grad_norm": NaN, + "learning_rate": 0.00012661028696708457, + "loss": 0.0, + "step": 36256 + }, + { + "epoch": 3.3831296071661847, + "grad_norm": NaN, + "learning_rate": 0.000126602815733477, + "loss": 0.0, + "step": 36257 + }, + { + "epoch": 3.3832229168610617, + "grad_norm": NaN, + "learning_rate": 0.00012659534455936158, + "loss": 0.0, + "step": 36258 + }, + { + "epoch": 3.383316226555939, + "grad_norm": NaN, + "learning_rate": 0.00012658787344475728, + "loss": 0.0, + "step": 36259 + }, + { + "epoch": 3.3834095362508165, + "grad_norm": NaN, + "learning_rate": 0.00012658040238968301, + "loss": 0.0, + "step": 36260 + }, + { + "epoch": 3.383502845945694, + "grad_norm": NaN, + "learning_rate": 0.00012657293139415793, + "loss": 0.0, + "step": 36261 + }, + { + "epoch": 3.383596155640571, + "grad_norm": NaN, + "learning_rate": 0.0001265654604582009, + "loss": 0.0, + "step": 36262 + }, + { + "epoch": 3.3836894653354483, + "grad_norm": NaN, + "learning_rate": 0.00012655798958183094, + "loss": 0.0, + "step": 36263 + }, + { + "epoch": 3.3837827750303258, + "grad_norm": NaN, + "learning_rate": 0.00012655051876506713, + "loss": 0.0, + "step": 36264 + }, + { + "epoch": 3.3838760847252027, + "grad_norm": NaN, + "learning_rate": 0.00012654304800792837, + "loss": 0.0, + "step": 36265 + }, + { + "epoch": 3.38396939442008, + "grad_norm": NaN, + "learning_rate": 0.00012653557731043364, + "loss": 0.0, + "step": 36266 + }, + { + "epoch": 3.3840627041149576, + "grad_norm": NaN, + "learning_rate": 0.00012652810667260209, + "loss": 0.0, + "step": 36267 + }, + { + "epoch": 3.384156013809835, + "grad_norm": NaN, + "learning_rate": 0.00012652063609445256, + "loss": 0.0, + "step": 36268 + }, + { + "epoch": 3.384249323504712, + "grad_norm": NaN, + "learning_rate": 0.00012651316557600405, + "loss": 0.0, + "step": 36269 + }, + { + "epoch": 3.3843426331995894, + "grad_norm": NaN, + "learning_rate": 0.00012650569511727568, + "loss": 0.0, + "step": 36270 + }, + { + "epoch": 3.384435942894467, + "grad_norm": NaN, + "learning_rate": 0.00012649822471828636, + "loss": 0.0, + "step": 36271 + }, + { + "epoch": 3.384529252589344, + "grad_norm": NaN, + "learning_rate": 0.0001264907543790551, + "loss": 0.0, + "step": 36272 + }, + { + "epoch": 3.384622562284221, + "grad_norm": NaN, + "learning_rate": 0.00012648328409960082, + "loss": 0.0, + "step": 36273 + }, + { + "epoch": 3.3847158719790986, + "grad_norm": NaN, + "learning_rate": 0.00012647581387994258, + "loss": 0.0, + "step": 36274 + }, + { + "epoch": 3.384809181673976, + "grad_norm": NaN, + "learning_rate": 0.00012646834372009947, + "loss": 0.0, + "step": 36275 + }, + { + "epoch": 3.3849024913688535, + "grad_norm": NaN, + "learning_rate": 0.0001264608736200903, + "loss": 0.0, + "step": 36276 + }, + { + "epoch": 3.3849958010637304, + "grad_norm": NaN, + "learning_rate": 0.00012645340357993418, + "loss": 0.0, + "step": 36277 + }, + { + "epoch": 3.385089110758608, + "grad_norm": NaN, + "learning_rate": 0.0001264459335996501, + "loss": 0.0, + "step": 36278 + }, + { + "epoch": 3.3851824204534853, + "grad_norm": NaN, + "learning_rate": 0.00012643846367925703, + "loss": 0.0, + "step": 36279 + }, + { + "epoch": 3.3852757301483622, + "grad_norm": NaN, + "learning_rate": 0.00012643099381877393, + "loss": 0.0, + "step": 36280 + }, + { + "epoch": 3.3853690398432397, + "grad_norm": NaN, + "learning_rate": 0.0001264235240182199, + "loss": 0.0, + "step": 36281 + }, + { + "epoch": 3.385462349538117, + "grad_norm": NaN, + "learning_rate": 0.00012641605427761383, + "loss": 0.0, + "step": 36282 + }, + { + "epoch": 3.3855556592329945, + "grad_norm": NaN, + "learning_rate": 0.00012640858459697472, + "loss": 0.0, + "step": 36283 + }, + { + "epoch": 3.3856489689278715, + "grad_norm": NaN, + "learning_rate": 0.00012640111497632165, + "loss": 0.0, + "step": 36284 + }, + { + "epoch": 3.385742278622749, + "grad_norm": NaN, + "learning_rate": 0.00012639364541567352, + "loss": 0.0, + "step": 36285 + }, + { + "epoch": 3.3858355883176263, + "grad_norm": NaN, + "learning_rate": 0.00012638617591504932, + "loss": 0.0, + "step": 36286 + }, + { + "epoch": 3.3859288980125033, + "grad_norm": NaN, + "learning_rate": 0.00012637870647446816, + "loss": 0.0, + "step": 36287 + }, + { + "epoch": 3.3860222077073807, + "grad_norm": NaN, + "learning_rate": 0.0001263712370939489, + "loss": 0.0, + "step": 36288 + }, + { + "epoch": 3.386115517402258, + "grad_norm": NaN, + "learning_rate": 0.0001263637677735106, + "loss": 0.0, + "step": 36289 + }, + { + "epoch": 3.3862088270971356, + "grad_norm": NaN, + "learning_rate": 0.00012635629851317224, + "loss": 0.0, + "step": 36290 + }, + { + "epoch": 3.3863021367920125, + "grad_norm": NaN, + "learning_rate": 0.0001263488293129528, + "loss": 0.0, + "step": 36291 + }, + { + "epoch": 3.38639544648689, + "grad_norm": NaN, + "learning_rate": 0.00012634136017287124, + "loss": 0.0, + "step": 36292 + }, + { + "epoch": 3.3864887561817674, + "grad_norm": NaN, + "learning_rate": 0.00012633389109294668, + "loss": 0.0, + "step": 36293 + }, + { + "epoch": 3.3865820658766443, + "grad_norm": NaN, + "learning_rate": 0.000126326422073198, + "loss": 0.0, + "step": 36294 + }, + { + "epoch": 3.3866753755715218, + "grad_norm": NaN, + "learning_rate": 0.00012631895311364415, + "loss": 0.0, + "step": 36295 + }, + { + "epoch": 3.386768685266399, + "grad_norm": NaN, + "learning_rate": 0.00012631148421430425, + "loss": 0.0, + "step": 36296 + }, + { + "epoch": 3.3868619949612766, + "grad_norm": NaN, + "learning_rate": 0.00012630401537519723, + "loss": 0.0, + "step": 36297 + }, + { + "epoch": 3.386955304656154, + "grad_norm": NaN, + "learning_rate": 0.00012629654659634203, + "loss": 0.0, + "step": 36298 + }, + { + "epoch": 3.387048614351031, + "grad_norm": NaN, + "learning_rate": 0.00012628907787775773, + "loss": 0.0, + "step": 36299 + }, + { + "epoch": 3.3871419240459084, + "grad_norm": NaN, + "learning_rate": 0.0001262816092194633, + "loss": 0.0, + "step": 36300 + }, + { + "epoch": 3.387235233740786, + "grad_norm": NaN, + "learning_rate": 0.00012627414062147766, + "loss": 0.0, + "step": 36301 + }, + { + "epoch": 3.387328543435663, + "grad_norm": NaN, + "learning_rate": 0.0001262666720838199, + "loss": 0.0, + "step": 36302 + }, + { + "epoch": 3.3874218531305402, + "grad_norm": NaN, + "learning_rate": 0.00012625920360650894, + "loss": 0.0, + "step": 36303 + }, + { + "epoch": 3.3875151628254176, + "grad_norm": NaN, + "learning_rate": 0.00012625173518956376, + "loss": 0.0, + "step": 36304 + }, + { + "epoch": 3.387608472520295, + "grad_norm": NaN, + "learning_rate": 0.00012624426683300344, + "loss": 0.0, + "step": 36305 + }, + { + "epoch": 3.387701782215172, + "grad_norm": NaN, + "learning_rate": 0.0001262367985368469, + "loss": 0.0, + "step": 36306 + }, + { + "epoch": 3.3877950919100495, + "grad_norm": NaN, + "learning_rate": 0.0001262293303011131, + "loss": 0.0, + "step": 36307 + }, + { + "epoch": 3.387888401604927, + "grad_norm": NaN, + "learning_rate": 0.00012622186212582113, + "loss": 0.0, + "step": 36308 + }, + { + "epoch": 3.387981711299804, + "grad_norm": NaN, + "learning_rate": 0.00012621439401098989, + "loss": 0.0, + "step": 36309 + }, + { + "epoch": 3.3880750209946813, + "grad_norm": NaN, + "learning_rate": 0.00012620692595663837, + "loss": 0.0, + "step": 36310 + }, + { + "epoch": 3.3881683306895587, + "grad_norm": NaN, + "learning_rate": 0.00012619945796278566, + "loss": 0.0, + "step": 36311 + }, + { + "epoch": 3.388261640384436, + "grad_norm": NaN, + "learning_rate": 0.00012619199002945062, + "loss": 0.0, + "step": 36312 + }, + { + "epoch": 3.388354950079313, + "grad_norm": NaN, + "learning_rate": 0.00012618452215665228, + "loss": 0.0, + "step": 36313 + }, + { + "epoch": 3.3884482597741905, + "grad_norm": NaN, + "learning_rate": 0.00012617705434440975, + "loss": 0.0, + "step": 36314 + }, + { + "epoch": 3.388541569469068, + "grad_norm": NaN, + "learning_rate": 0.00012616958659274178, + "loss": 0.0, + "step": 36315 + }, + { + "epoch": 3.388634879163945, + "grad_norm": NaN, + "learning_rate": 0.00012616211890166762, + "loss": 0.0, + "step": 36316 + }, + { + "epoch": 3.3887281888588223, + "grad_norm": NaN, + "learning_rate": 0.00012615465127120606, + "loss": 0.0, + "step": 36317 + }, + { + "epoch": 3.3888214985536997, + "grad_norm": NaN, + "learning_rate": 0.00012614718370137612, + "loss": 0.0, + "step": 36318 + }, + { + "epoch": 3.388914808248577, + "grad_norm": NaN, + "learning_rate": 0.00012613971619219693, + "loss": 0.0, + "step": 36319 + }, + { + "epoch": 3.389008117943454, + "grad_norm": NaN, + "learning_rate": 0.00012613224874368731, + "loss": 0.0, + "step": 36320 + }, + { + "epoch": 3.3891014276383316, + "grad_norm": NaN, + "learning_rate": 0.0001261247813558663, + "loss": 0.0, + "step": 36321 + }, + { + "epoch": 3.389194737333209, + "grad_norm": NaN, + "learning_rate": 0.00012611731402875292, + "loss": 0.0, + "step": 36322 + }, + { + "epoch": 3.3892880470280864, + "grad_norm": NaN, + "learning_rate": 0.00012610984676236615, + "loss": 0.0, + "step": 36323 + }, + { + "epoch": 3.3893813567229634, + "grad_norm": NaN, + "learning_rate": 0.0001261023795567249, + "loss": 0.0, + "step": 36324 + }, + { + "epoch": 3.389474666417841, + "grad_norm": NaN, + "learning_rate": 0.00012609491241184832, + "loss": 0.0, + "step": 36325 + }, + { + "epoch": 3.389567976112718, + "grad_norm": NaN, + "learning_rate": 0.00012608744532775524, + "loss": 0.0, + "step": 36326 + }, + { + "epoch": 3.3896612858075956, + "grad_norm": NaN, + "learning_rate": 0.00012607997830446467, + "loss": 0.0, + "step": 36327 + }, + { + "epoch": 3.3897545955024726, + "grad_norm": NaN, + "learning_rate": 0.00012607251134199568, + "loss": 0.0, + "step": 36328 + }, + { + "epoch": 3.38984790519735, + "grad_norm": NaN, + "learning_rate": 0.0001260650444403672, + "loss": 0.0, + "step": 36329 + }, + { + "epoch": 3.3899412148922274, + "grad_norm": NaN, + "learning_rate": 0.0001260575775995982, + "loss": 0.0, + "step": 36330 + }, + { + "epoch": 3.3900345245871044, + "grad_norm": NaN, + "learning_rate": 0.00012605011081970771, + "loss": 0.0, + "step": 36331 + }, + { + "epoch": 3.390127834281982, + "grad_norm": NaN, + "learning_rate": 0.0001260426441007147, + "loss": 0.0, + "step": 36332 + }, + { + "epoch": 3.3902211439768593, + "grad_norm": NaN, + "learning_rate": 0.0001260351774426381, + "loss": 0.0, + "step": 36333 + }, + { + "epoch": 3.3903144536717367, + "grad_norm": NaN, + "learning_rate": 0.000126027710845497, + "loss": 0.0, + "step": 36334 + }, + { + "epoch": 3.3904077633666136, + "grad_norm": NaN, + "learning_rate": 0.00012602024430931032, + "loss": 0.0, + "step": 36335 + }, + { + "epoch": 3.390501073061491, + "grad_norm": NaN, + "learning_rate": 0.00012601277783409702, + "loss": 0.0, + "step": 36336 + }, + { + "epoch": 3.3905943827563685, + "grad_norm": NaN, + "learning_rate": 0.00012600531141987616, + "loss": 0.0, + "step": 36337 + }, + { + "epoch": 3.3906876924512455, + "grad_norm": NaN, + "learning_rate": 0.0001259978450666667, + "loss": 0.0, + "step": 36338 + }, + { + "epoch": 3.390781002146123, + "grad_norm": NaN, + "learning_rate": 0.0001259903787744875, + "loss": 0.0, + "step": 36339 + }, + { + "epoch": 3.3908743118410003, + "grad_norm": NaN, + "learning_rate": 0.0001259829125433578, + "loss": 0.0, + "step": 36340 + }, + { + "epoch": 3.3909676215358777, + "grad_norm": NaN, + "learning_rate": 0.00012597544637329636, + "loss": 0.0, + "step": 36341 + }, + { + "epoch": 3.3910609312307547, + "grad_norm": NaN, + "learning_rate": 0.00012596798026432224, + "loss": 0.0, + "step": 36342 + }, + { + "epoch": 3.391154240925632, + "grad_norm": NaN, + "learning_rate": 0.00012596051421645446, + "loss": 0.0, + "step": 36343 + }, + { + "epoch": 3.3912475506205095, + "grad_norm": NaN, + "learning_rate": 0.00012595304822971196, + "loss": 0.0, + "step": 36344 + }, + { + "epoch": 3.391340860315387, + "grad_norm": NaN, + "learning_rate": 0.0001259455823041137, + "loss": 0.0, + "step": 36345 + }, + { + "epoch": 3.391434170010264, + "grad_norm": NaN, + "learning_rate": 0.00012593811643967875, + "loss": 0.0, + "step": 36346 + }, + { + "epoch": 3.3915274797051413, + "grad_norm": NaN, + "learning_rate": 0.00012593065063642604, + "loss": 0.0, + "step": 36347 + }, + { + "epoch": 3.3916207894000188, + "grad_norm": NaN, + "learning_rate": 0.0001259231848943745, + "loss": 0.0, + "step": 36348 + }, + { + "epoch": 3.391714099094896, + "grad_norm": NaN, + "learning_rate": 0.00012591571921354323, + "loss": 0.0, + "step": 36349 + }, + { + "epoch": 3.391807408789773, + "grad_norm": NaN, + "learning_rate": 0.0001259082535939511, + "loss": 0.0, + "step": 36350 + }, + { + "epoch": 3.3919007184846506, + "grad_norm": NaN, + "learning_rate": 0.00012590078803561717, + "loss": 0.0, + "step": 36351 + }, + { + "epoch": 3.391994028179528, + "grad_norm": NaN, + "learning_rate": 0.00012589332253856044, + "loss": 0.0, + "step": 36352 + }, + { + "epoch": 3.392087337874405, + "grad_norm": NaN, + "learning_rate": 0.0001258858571027998, + "loss": 0.0, + "step": 36353 + }, + { + "epoch": 3.3921806475692824, + "grad_norm": NaN, + "learning_rate": 0.00012587839172835427, + "loss": 0.0, + "step": 36354 + }, + { + "epoch": 3.39227395726416, + "grad_norm": NaN, + "learning_rate": 0.0001258709264152429, + "loss": 0.0, + "step": 36355 + }, + { + "epoch": 3.3923672669590372, + "grad_norm": NaN, + "learning_rate": 0.00012586346116348453, + "loss": 0.0, + "step": 36356 + }, + { + "epoch": 3.392460576653914, + "grad_norm": NaN, + "learning_rate": 0.00012585599597309829, + "loss": 0.0, + "step": 36357 + }, + { + "epoch": 3.3925538863487916, + "grad_norm": NaN, + "learning_rate": 0.00012584853084410312, + "loss": 0.0, + "step": 36358 + }, + { + "epoch": 3.392647196043669, + "grad_norm": NaN, + "learning_rate": 0.00012584106577651793, + "loss": 0.0, + "step": 36359 + }, + { + "epoch": 3.392740505738546, + "grad_norm": NaN, + "learning_rate": 0.0001258336007703618, + "loss": 0.0, + "step": 36360 + }, + { + "epoch": 3.3928338154334234, + "grad_norm": NaN, + "learning_rate": 0.00012582613582565365, + "loss": 0.0, + "step": 36361 + }, + { + "epoch": 3.392927125128301, + "grad_norm": NaN, + "learning_rate": 0.00012581867094241245, + "loss": 0.0, + "step": 36362 + }, + { + "epoch": 3.3930204348231783, + "grad_norm": NaN, + "learning_rate": 0.00012581120612065725, + "loss": 0.0, + "step": 36363 + }, + { + "epoch": 3.3931137445180553, + "grad_norm": NaN, + "learning_rate": 0.00012580374136040694, + "loss": 0.0, + "step": 36364 + }, + { + "epoch": 3.3932070542129327, + "grad_norm": NaN, + "learning_rate": 0.00012579627666168056, + "loss": 0.0, + "step": 36365 + }, + { + "epoch": 3.39330036390781, + "grad_norm": NaN, + "learning_rate": 0.00012578881202449712, + "loss": 0.0, + "step": 36366 + }, + { + "epoch": 3.3933936736026875, + "grad_norm": NaN, + "learning_rate": 0.00012578134744887552, + "loss": 0.0, + "step": 36367 + }, + { + "epoch": 3.3934869832975645, + "grad_norm": NaN, + "learning_rate": 0.00012577388293483476, + "loss": 0.0, + "step": 36368 + }, + { + "epoch": 3.393580292992442, + "grad_norm": NaN, + "learning_rate": 0.0001257664184823939, + "loss": 0.0, + "step": 36369 + }, + { + "epoch": 3.3936736026873193, + "grad_norm": NaN, + "learning_rate": 0.00012575895409157182, + "loss": 0.0, + "step": 36370 + }, + { + "epoch": 3.3937669123821967, + "grad_norm": NaN, + "learning_rate": 0.0001257514897623875, + "loss": 0.0, + "step": 36371 + }, + { + "epoch": 3.3938602220770737, + "grad_norm": NaN, + "learning_rate": 0.00012574402549486006, + "loss": 0.0, + "step": 36372 + }, + { + "epoch": 3.393953531771951, + "grad_norm": NaN, + "learning_rate": 0.00012573656128900832, + "loss": 0.0, + "step": 36373 + }, + { + "epoch": 3.3940468414668286, + "grad_norm": NaN, + "learning_rate": 0.0001257290971448513, + "loss": 0.0, + "step": 36374 + }, + { + "epoch": 3.3941401511617055, + "grad_norm": NaN, + "learning_rate": 0.00012572163306240802, + "loss": 0.0, + "step": 36375 + }, + { + "epoch": 3.394233460856583, + "grad_norm": NaN, + "learning_rate": 0.00012571416904169744, + "loss": 0.0, + "step": 36376 + }, + { + "epoch": 3.3943267705514604, + "grad_norm": NaN, + "learning_rate": 0.00012570670508273848, + "loss": 0.0, + "step": 36377 + }, + { + "epoch": 3.394420080246338, + "grad_norm": NaN, + "learning_rate": 0.00012569924118555027, + "loss": 0.0, + "step": 36378 + }, + { + "epoch": 3.3945133899412148, + "grad_norm": NaN, + "learning_rate": 0.00012569177735015163, + "loss": 0.0, + "step": 36379 + }, + { + "epoch": 3.394606699636092, + "grad_norm": NaN, + "learning_rate": 0.0001256843135765616, + "loss": 0.0, + "step": 36380 + }, + { + "epoch": 3.3947000093309696, + "grad_norm": NaN, + "learning_rate": 0.0001256768498647992, + "loss": 0.0, + "step": 36381 + }, + { + "epoch": 3.3947933190258466, + "grad_norm": NaN, + "learning_rate": 0.0001256693862148833, + "loss": 0.0, + "step": 36382 + }, + { + "epoch": 3.394886628720724, + "grad_norm": NaN, + "learning_rate": 0.00012566192262683295, + "loss": 0.0, + "step": 36383 + }, + { + "epoch": 3.3949799384156014, + "grad_norm": NaN, + "learning_rate": 0.00012565445910066718, + "loss": 0.0, + "step": 36384 + }, + { + "epoch": 3.395073248110479, + "grad_norm": NaN, + "learning_rate": 0.00012564699563640485, + "loss": 0.0, + "step": 36385 + }, + { + "epoch": 3.395166557805356, + "grad_norm": NaN, + "learning_rate": 0.00012563953223406502, + "loss": 0.0, + "step": 36386 + }, + { + "epoch": 3.3952598675002332, + "grad_norm": NaN, + "learning_rate": 0.00012563206889366668, + "loss": 0.0, + "step": 36387 + }, + { + "epoch": 3.3953531771951106, + "grad_norm": NaN, + "learning_rate": 0.0001256246056152287, + "loss": 0.0, + "step": 36388 + }, + { + "epoch": 3.3954464868899876, + "grad_norm": NaN, + "learning_rate": 0.00012561714239877017, + "loss": 0.0, + "step": 36389 + }, + { + "epoch": 3.395539796584865, + "grad_norm": NaN, + "learning_rate": 0.00012560967924431005, + "loss": 0.0, + "step": 36390 + }, + { + "epoch": 3.3956331062797425, + "grad_norm": NaN, + "learning_rate": 0.0001256022161518672, + "loss": 0.0, + "step": 36391 + }, + { + "epoch": 3.39572641597462, + "grad_norm": NaN, + "learning_rate": 0.00012559475312146076, + "loss": 0.0, + "step": 36392 + }, + { + "epoch": 3.3958197256694973, + "grad_norm": NaN, + "learning_rate": 0.00012558729015310966, + "loss": 0.0, + "step": 36393 + }, + { + "epoch": 3.3959130353643743, + "grad_norm": NaN, + "learning_rate": 0.00012557982724683276, + "loss": 0.0, + "step": 36394 + }, + { + "epoch": 3.3960063450592517, + "grad_norm": NaN, + "learning_rate": 0.00012557236440264918, + "loss": 0.0, + "step": 36395 + }, + { + "epoch": 3.396099654754129, + "grad_norm": NaN, + "learning_rate": 0.00012556490162057789, + "loss": 0.0, + "step": 36396 + }, + { + "epoch": 3.396192964449006, + "grad_norm": NaN, + "learning_rate": 0.0001255574389006377, + "loss": 0.0, + "step": 36397 + }, + { + "epoch": 3.3962862741438835, + "grad_norm": NaN, + "learning_rate": 0.00012554997624284776, + "loss": 0.0, + "step": 36398 + }, + { + "epoch": 3.396379583838761, + "grad_norm": NaN, + "learning_rate": 0.00012554251364722705, + "loss": 0.0, + "step": 36399 + }, + { + "epoch": 3.3964728935336383, + "grad_norm": NaN, + "learning_rate": 0.00012553505111379437, + "loss": 0.0, + "step": 36400 + }, + { + "epoch": 3.3965662032285153, + "grad_norm": NaN, + "learning_rate": 0.00012552758864256886, + "loss": 0.0, + "step": 36401 + }, + { + "epoch": 3.3966595129233927, + "grad_norm": NaN, + "learning_rate": 0.00012552012623356947, + "loss": 0.0, + "step": 36402 + }, + { + "epoch": 3.39675282261827, + "grad_norm": NaN, + "learning_rate": 0.0001255126638868151, + "loss": 0.0, + "step": 36403 + }, + { + "epoch": 3.396846132313147, + "grad_norm": NaN, + "learning_rate": 0.0001255052016023248, + "loss": 0.0, + "step": 36404 + }, + { + "epoch": 3.3969394420080246, + "grad_norm": NaN, + "learning_rate": 0.00012549773938011756, + "loss": 0.0, + "step": 36405 + }, + { + "epoch": 3.397032751702902, + "grad_norm": NaN, + "learning_rate": 0.00012549027722021222, + "loss": 0.0, + "step": 36406 + }, + { + "epoch": 3.3971260613977794, + "grad_norm": NaN, + "learning_rate": 0.00012548281512262793, + "loss": 0.0, + "step": 36407 + }, + { + "epoch": 3.3972193710926564, + "grad_norm": NaN, + "learning_rate": 0.00012547535308738352, + "loss": 0.0, + "step": 36408 + }, + { + "epoch": 3.397312680787534, + "grad_norm": NaN, + "learning_rate": 0.000125467891114498, + "loss": 0.0, + "step": 36409 + }, + { + "epoch": 3.397405990482411, + "grad_norm": NaN, + "learning_rate": 0.00012546042920399045, + "loss": 0.0, + "step": 36410 + }, + { + "epoch": 3.397499300177288, + "grad_norm": NaN, + "learning_rate": 0.00012545296735587972, + "loss": 0.0, + "step": 36411 + }, + { + "epoch": 3.3975926098721656, + "grad_norm": NaN, + "learning_rate": 0.00012544550557018477, + "loss": 0.0, + "step": 36412 + }, + { + "epoch": 3.397685919567043, + "grad_norm": NaN, + "learning_rate": 0.0001254380438469247, + "loss": 0.0, + "step": 36413 + }, + { + "epoch": 3.3977792292619204, + "grad_norm": NaN, + "learning_rate": 0.00012543058218611838, + "loss": 0.0, + "step": 36414 + }, + { + "epoch": 3.397872538956798, + "grad_norm": NaN, + "learning_rate": 0.0001254231205877848, + "loss": 0.0, + "step": 36415 + }, + { + "epoch": 3.397965848651675, + "grad_norm": NaN, + "learning_rate": 0.000125415659051943, + "loss": 0.0, + "step": 36416 + }, + { + "epoch": 3.3980591583465523, + "grad_norm": NaN, + "learning_rate": 0.0001254081975786118, + "loss": 0.0, + "step": 36417 + }, + { + "epoch": 3.3981524680414297, + "grad_norm": NaN, + "learning_rate": 0.00012540073616781032, + "loss": 0.0, + "step": 36418 + }, + { + "epoch": 3.3982457777363066, + "grad_norm": NaN, + "learning_rate": 0.0001253932748195575, + "loss": 0.0, + "step": 36419 + }, + { + "epoch": 3.398339087431184, + "grad_norm": NaN, + "learning_rate": 0.0001253858135338723, + "loss": 0.0, + "step": 36420 + }, + { + "epoch": 3.3984323971260615, + "grad_norm": NaN, + "learning_rate": 0.00012537835231077362, + "loss": 0.0, + "step": 36421 + }, + { + "epoch": 3.398525706820939, + "grad_norm": NaN, + "learning_rate": 0.00012537089115028058, + "loss": 0.0, + "step": 36422 + }, + { + "epoch": 3.398619016515816, + "grad_norm": NaN, + "learning_rate": 0.00012536343005241197, + "loss": 0.0, + "step": 36423 + }, + { + "epoch": 3.3987123262106933, + "grad_norm": NaN, + "learning_rate": 0.00012535596901718692, + "loss": 0.0, + "step": 36424 + }, + { + "epoch": 3.3988056359055707, + "grad_norm": NaN, + "learning_rate": 0.0001253485080446244, + "loss": 0.0, + "step": 36425 + }, + { + "epoch": 3.3988989456004477, + "grad_norm": NaN, + "learning_rate": 0.00012534104713474322, + "loss": 0.0, + "step": 36426 + }, + { + "epoch": 3.398992255295325, + "grad_norm": NaN, + "learning_rate": 0.00012533358628756246, + "loss": 0.0, + "step": 36427 + }, + { + "epoch": 3.3990855649902025, + "grad_norm": NaN, + "learning_rate": 0.00012532612550310117, + "loss": 0.0, + "step": 36428 + }, + { + "epoch": 3.39917887468508, + "grad_norm": NaN, + "learning_rate": 0.00012531866478137813, + "loss": 0.0, + "step": 36429 + }, + { + "epoch": 3.399272184379957, + "grad_norm": NaN, + "learning_rate": 0.00012531120412241246, + "loss": 0.0, + "step": 36430 + }, + { + "epoch": 3.3993654940748343, + "grad_norm": NaN, + "learning_rate": 0.00012530374352622312, + "loss": 0.0, + "step": 36431 + }, + { + "epoch": 3.3994588037697118, + "grad_norm": NaN, + "learning_rate": 0.000125296282992829, + "loss": 0.0, + "step": 36432 + }, + { + "epoch": 3.3995521134645887, + "grad_norm": NaN, + "learning_rate": 0.0001252888225222491, + "loss": 0.0, + "step": 36433 + }, + { + "epoch": 3.399645423159466, + "grad_norm": NaN, + "learning_rate": 0.00012528136211450248, + "loss": 0.0, + "step": 36434 + }, + { + "epoch": 3.3997387328543436, + "grad_norm": NaN, + "learning_rate": 0.00012527390176960792, + "loss": 0.0, + "step": 36435 + }, + { + "epoch": 3.399832042549221, + "grad_norm": NaN, + "learning_rate": 0.00012526644148758457, + "loss": 0.0, + "step": 36436 + }, + { + "epoch": 3.399925352244098, + "grad_norm": NaN, + "learning_rate": 0.00012525898126845137, + "loss": 0.0, + "step": 36437 + }, + { + "epoch": 3.4000186619389754, + "grad_norm": NaN, + "learning_rate": 0.0001252515211122272, + "loss": 0.0, + "step": 36438 + }, + { + "epoch": 3.400111971633853, + "grad_norm": NaN, + "learning_rate": 0.0001252440610189311, + "loss": 0.0, + "step": 36439 + }, + { + "epoch": 3.4002052813287302, + "grad_norm": NaN, + "learning_rate": 0.00012523660098858204, + "loss": 0.0, + "step": 36440 + }, + { + "epoch": 3.400298591023607, + "grad_norm": NaN, + "learning_rate": 0.0001252291410211989, + "loss": 0.0, + "step": 36441 + }, + { + "epoch": 3.4003919007184846, + "grad_norm": NaN, + "learning_rate": 0.00012522168111680075, + "loss": 0.0, + "step": 36442 + }, + { + "epoch": 3.400485210413362, + "grad_norm": NaN, + "learning_rate": 0.00012521422127540656, + "loss": 0.0, + "step": 36443 + }, + { + "epoch": 3.4005785201082395, + "grad_norm": NaN, + "learning_rate": 0.0001252067614970352, + "loss": 0.0, + "step": 36444 + }, + { + "epoch": 3.4006718298031164, + "grad_norm": NaN, + "learning_rate": 0.00012519930178170574, + "loss": 0.0, + "step": 36445 + }, + { + "epoch": 3.400765139497994, + "grad_norm": NaN, + "learning_rate": 0.00012519184212943715, + "loss": 0.0, + "step": 36446 + }, + { + "epoch": 3.4008584491928713, + "grad_norm": NaN, + "learning_rate": 0.00012518438254024826, + "loss": 0.0, + "step": 36447 + }, + { + "epoch": 3.4009517588877483, + "grad_norm": NaN, + "learning_rate": 0.0001251769230141582, + "loss": 0.0, + "step": 36448 + }, + { + "epoch": 3.4010450685826257, + "grad_norm": NaN, + "learning_rate": 0.0001251694635511859, + "loss": 0.0, + "step": 36449 + }, + { + "epoch": 3.401138378277503, + "grad_norm": NaN, + "learning_rate": 0.0001251620041513502, + "loss": 0.0, + "step": 36450 + }, + { + "epoch": 3.4012316879723805, + "grad_norm": NaN, + "learning_rate": 0.00012515454481467025, + "loss": 0.0, + "step": 36451 + }, + { + "epoch": 3.4013249976672575, + "grad_norm": NaN, + "learning_rate": 0.00012514708554116493, + "loss": 0.0, + "step": 36452 + }, + { + "epoch": 3.401418307362135, + "grad_norm": NaN, + "learning_rate": 0.00012513962633085314, + "loss": 0.0, + "step": 36453 + }, + { + "epoch": 3.4015116170570123, + "grad_norm": NaN, + "learning_rate": 0.000125132167183754, + "loss": 0.0, + "step": 36454 + }, + { + "epoch": 3.4016049267518893, + "grad_norm": NaN, + "learning_rate": 0.00012512470809988635, + "loss": 0.0, + "step": 36455 + }, + { + "epoch": 3.4016982364467667, + "grad_norm": NaN, + "learning_rate": 0.00012511724907926917, + "loss": 0.0, + "step": 36456 + }, + { + "epoch": 3.401791546141644, + "grad_norm": NaN, + "learning_rate": 0.00012510979012192151, + "loss": 0.0, + "step": 36457 + }, + { + "epoch": 3.4018848558365216, + "grad_norm": NaN, + "learning_rate": 0.00012510233122786226, + "loss": 0.0, + "step": 36458 + }, + { + "epoch": 3.4019781655313985, + "grad_norm": NaN, + "learning_rate": 0.00012509487239711038, + "loss": 0.0, + "step": 36459 + }, + { + "epoch": 3.402071475226276, + "grad_norm": NaN, + "learning_rate": 0.0001250874136296849, + "loss": 0.0, + "step": 36460 + }, + { + "epoch": 3.4021647849211534, + "grad_norm": NaN, + "learning_rate": 0.0001250799549256047, + "loss": 0.0, + "step": 36461 + }, + { + "epoch": 3.402258094616031, + "grad_norm": NaN, + "learning_rate": 0.00012507249628488886, + "loss": 0.0, + "step": 36462 + }, + { + "epoch": 3.4023514043109078, + "grad_norm": NaN, + "learning_rate": 0.00012506503770755626, + "loss": 0.0, + "step": 36463 + }, + { + "epoch": 3.402444714005785, + "grad_norm": NaN, + "learning_rate": 0.00012505757919362585, + "loss": 0.0, + "step": 36464 + }, + { + "epoch": 3.4025380237006626, + "grad_norm": NaN, + "learning_rate": 0.00012505012074311665, + "loss": 0.0, + "step": 36465 + }, + { + "epoch": 3.40263133339554, + "grad_norm": NaN, + "learning_rate": 0.0001250426623560476, + "loss": 0.0, + "step": 36466 + }, + { + "epoch": 3.402724643090417, + "grad_norm": NaN, + "learning_rate": 0.00012503520403243766, + "loss": 0.0, + "step": 36467 + }, + { + "epoch": 3.4028179527852944, + "grad_norm": NaN, + "learning_rate": 0.0001250277457723058, + "loss": 0.0, + "step": 36468 + }, + { + "epoch": 3.402911262480172, + "grad_norm": NaN, + "learning_rate": 0.00012502028757567103, + "loss": 0.0, + "step": 36469 + }, + { + "epoch": 3.403004572175049, + "grad_norm": NaN, + "learning_rate": 0.0001250128294425522, + "loss": 0.0, + "step": 36470 + }, + { + "epoch": 3.4030978818699262, + "grad_norm": NaN, + "learning_rate": 0.00012500537137296835, + "loss": 0.0, + "step": 36471 + }, + { + "epoch": 3.4031911915648037, + "grad_norm": NaN, + "learning_rate": 0.0001249979133669385, + "loss": 0.0, + "step": 36472 + }, + { + "epoch": 3.403284501259681, + "grad_norm": NaN, + "learning_rate": 0.00012499045542448147, + "loss": 0.0, + "step": 36473 + }, + { + "epoch": 3.403377810954558, + "grad_norm": NaN, + "learning_rate": 0.00012498299754561633, + "loss": 0.0, + "step": 36474 + }, + { + "epoch": 3.4034711206494355, + "grad_norm": NaN, + "learning_rate": 0.00012497553973036206, + "loss": 0.0, + "step": 36475 + }, + { + "epoch": 3.403564430344313, + "grad_norm": NaN, + "learning_rate": 0.00012496808197873753, + "loss": 0.0, + "step": 36476 + }, + { + "epoch": 3.40365774003919, + "grad_norm": NaN, + "learning_rate": 0.00012496062429076177, + "loss": 0.0, + "step": 36477 + }, + { + "epoch": 3.4037510497340673, + "grad_norm": NaN, + "learning_rate": 0.00012495316666645374, + "loss": 0.0, + "step": 36478 + }, + { + "epoch": 3.4038443594289447, + "grad_norm": NaN, + "learning_rate": 0.00012494570910583238, + "loss": 0.0, + "step": 36479 + }, + { + "epoch": 3.403937669123822, + "grad_norm": NaN, + "learning_rate": 0.00012493825160891665, + "loss": 0.0, + "step": 36480 + }, + { + "epoch": 3.404030978818699, + "grad_norm": NaN, + "learning_rate": 0.00012493079417572555, + "loss": 0.0, + "step": 36481 + }, + { + "epoch": 3.4041242885135765, + "grad_norm": NaN, + "learning_rate": 0.00012492333680627795, + "loss": 0.0, + "step": 36482 + }, + { + "epoch": 3.404217598208454, + "grad_norm": NaN, + "learning_rate": 0.00012491587950059292, + "loss": 0.0, + "step": 36483 + }, + { + "epoch": 3.404310907903331, + "grad_norm": NaN, + "learning_rate": 0.0001249084222586894, + "loss": 0.0, + "step": 36484 + }, + { + "epoch": 3.4044042175982083, + "grad_norm": NaN, + "learning_rate": 0.00012490096508058627, + "loss": 0.0, + "step": 36485 + }, + { + "epoch": 3.4044975272930857, + "grad_norm": NaN, + "learning_rate": 0.0001248935079663026, + "loss": 0.0, + "step": 36486 + }, + { + "epoch": 3.404590836987963, + "grad_norm": NaN, + "learning_rate": 0.00012488605091585733, + "loss": 0.0, + "step": 36487 + }, + { + "epoch": 3.4046841466828406, + "grad_norm": NaN, + "learning_rate": 0.0001248785939292693, + "loss": 0.0, + "step": 36488 + }, + { + "epoch": 3.4047774563777176, + "grad_norm": NaN, + "learning_rate": 0.00012487113700655763, + "loss": 0.0, + "step": 36489 + }, + { + "epoch": 3.404870766072595, + "grad_norm": NaN, + "learning_rate": 0.00012486368014774123, + "loss": 0.0, + "step": 36490 + }, + { + "epoch": 3.4049640757674724, + "grad_norm": NaN, + "learning_rate": 0.00012485622335283898, + "loss": 0.0, + "step": 36491 + }, + { + "epoch": 3.4050573854623494, + "grad_norm": NaN, + "learning_rate": 0.00012484876662186993, + "loss": 0.0, + "step": 36492 + }, + { + "epoch": 3.405150695157227, + "grad_norm": NaN, + "learning_rate": 0.00012484130995485306, + "loss": 0.0, + "step": 36493 + }, + { + "epoch": 3.405244004852104, + "grad_norm": NaN, + "learning_rate": 0.00012483385335180724, + "loss": 0.0, + "step": 36494 + }, + { + "epoch": 3.4053373145469816, + "grad_norm": NaN, + "learning_rate": 0.00012482639681275144, + "loss": 0.0, + "step": 36495 + }, + { + "epoch": 3.4054306242418586, + "grad_norm": NaN, + "learning_rate": 0.0001248189403377048, + "loss": 0.0, + "step": 36496 + }, + { + "epoch": 3.405523933936736, + "grad_norm": NaN, + "learning_rate": 0.000124811483926686, + "loss": 0.0, + "step": 36497 + }, + { + "epoch": 3.4056172436316134, + "grad_norm": NaN, + "learning_rate": 0.0001248040275797142, + "loss": 0.0, + "step": 36498 + }, + { + "epoch": 3.4057105533264904, + "grad_norm": NaN, + "learning_rate": 0.00012479657129680824, + "loss": 0.0, + "step": 36499 + }, + { + "epoch": 3.405803863021368, + "grad_norm": NaN, + "learning_rate": 0.00012478911507798717, + "loss": 0.0, + "step": 36500 + }, + { + "epoch": 3.4058971727162453, + "grad_norm": NaN, + "learning_rate": 0.00012478165892326992, + "loss": 0.0, + "step": 36501 + }, + { + "epoch": 3.4059904824111227, + "grad_norm": NaN, + "learning_rate": 0.0001247742028326754, + "loss": 0.0, + "step": 36502 + }, + { + "epoch": 3.4060837921059997, + "grad_norm": NaN, + "learning_rate": 0.00012476674680622263, + "loss": 0.0, + "step": 36503 + }, + { + "epoch": 3.406177101800877, + "grad_norm": NaN, + "learning_rate": 0.0001247592908439306, + "loss": 0.0, + "step": 36504 + }, + { + "epoch": 3.4062704114957545, + "grad_norm": NaN, + "learning_rate": 0.00012475183494581812, + "loss": 0.0, + "step": 36505 + }, + { + "epoch": 3.4063637211906315, + "grad_norm": NaN, + "learning_rate": 0.00012474437911190431, + "loss": 0.0, + "step": 36506 + }, + { + "epoch": 3.406457030885509, + "grad_norm": NaN, + "learning_rate": 0.00012473692334220808, + "loss": 0.0, + "step": 36507 + }, + { + "epoch": 3.4065503405803863, + "grad_norm": NaN, + "learning_rate": 0.0001247294676367483, + "loss": 0.0, + "step": 36508 + }, + { + "epoch": 3.4066436502752637, + "grad_norm": NaN, + "learning_rate": 0.00012472201199554406, + "loss": 0.0, + "step": 36509 + }, + { + "epoch": 3.406736959970141, + "grad_norm": NaN, + "learning_rate": 0.00012471455641861425, + "loss": 0.0, + "step": 36510 + }, + { + "epoch": 3.406830269665018, + "grad_norm": NaN, + "learning_rate": 0.0001247071009059778, + "loss": 0.0, + "step": 36511 + }, + { + "epoch": 3.4069235793598955, + "grad_norm": NaN, + "learning_rate": 0.0001246996454576537, + "loss": 0.0, + "step": 36512 + }, + { + "epoch": 3.407016889054773, + "grad_norm": NaN, + "learning_rate": 0.00012469219007366097, + "loss": 0.0, + "step": 36513 + }, + { + "epoch": 3.40711019874965, + "grad_norm": NaN, + "learning_rate": 0.0001246847347540184, + "loss": 0.0, + "step": 36514 + }, + { + "epoch": 3.4072035084445274, + "grad_norm": NaN, + "learning_rate": 0.00012467727949874512, + "loss": 0.0, + "step": 36515 + }, + { + "epoch": 3.4072968181394048, + "grad_norm": NaN, + "learning_rate": 0.00012466982430786006, + "loss": 0.0, + "step": 36516 + }, + { + "epoch": 3.407390127834282, + "grad_norm": NaN, + "learning_rate": 0.00012466236918138202, + "loss": 0.0, + "step": 36517 + }, + { + "epoch": 3.407483437529159, + "grad_norm": NaN, + "learning_rate": 0.00012465491411933015, + "loss": 0.0, + "step": 36518 + }, + { + "epoch": 3.4075767472240366, + "grad_norm": NaN, + "learning_rate": 0.0001246474591217233, + "loss": 0.0, + "step": 36519 + }, + { + "epoch": 3.407670056918914, + "grad_norm": NaN, + "learning_rate": 0.00012464000418858044, + "loss": 0.0, + "step": 36520 + }, + { + "epoch": 3.407763366613791, + "grad_norm": NaN, + "learning_rate": 0.00012463254931992056, + "loss": 0.0, + "step": 36521 + }, + { + "epoch": 3.4078566763086684, + "grad_norm": NaN, + "learning_rate": 0.00012462509451576263, + "loss": 0.0, + "step": 36522 + }, + { + "epoch": 3.407949986003546, + "grad_norm": NaN, + "learning_rate": 0.00012461763977612549, + "loss": 0.0, + "step": 36523 + }, + { + "epoch": 3.4080432956984232, + "grad_norm": NaN, + "learning_rate": 0.0001246101851010282, + "loss": 0.0, + "step": 36524 + }, + { + "epoch": 3.4081366053933, + "grad_norm": NaN, + "learning_rate": 0.00012460273049048974, + "loss": 0.0, + "step": 36525 + }, + { + "epoch": 3.4082299150881776, + "grad_norm": NaN, + "learning_rate": 0.00012459527594452891, + "loss": 0.0, + "step": 36526 + }, + { + "epoch": 3.408323224783055, + "grad_norm": NaN, + "learning_rate": 0.00012458782146316483, + "loss": 0.0, + "step": 36527 + }, + { + "epoch": 3.408416534477932, + "grad_norm": NaN, + "learning_rate": 0.00012458036704641643, + "loss": 0.0, + "step": 36528 + }, + { + "epoch": 3.4085098441728094, + "grad_norm": NaN, + "learning_rate": 0.00012457291269430255, + "loss": 0.0, + "step": 36529 + }, + { + "epoch": 3.408603153867687, + "grad_norm": NaN, + "learning_rate": 0.00012456545840684225, + "loss": 0.0, + "step": 36530 + }, + { + "epoch": 3.4086964635625643, + "grad_norm": NaN, + "learning_rate": 0.0001245580041840545, + "loss": 0.0, + "step": 36531 + }, + { + "epoch": 3.4087897732574413, + "grad_norm": NaN, + "learning_rate": 0.00012455055002595815, + "loss": 0.0, + "step": 36532 + }, + { + "epoch": 3.4088830829523187, + "grad_norm": NaN, + "learning_rate": 0.00012454309593257218, + "loss": 0.0, + "step": 36533 + }, + { + "epoch": 3.408976392647196, + "grad_norm": NaN, + "learning_rate": 0.0001245356419039157, + "loss": 0.0, + "step": 36534 + }, + { + "epoch": 3.4090697023420735, + "grad_norm": NaN, + "learning_rate": 0.00012452818794000744, + "loss": 0.0, + "step": 36535 + }, + { + "epoch": 3.4091630120369505, + "grad_norm": NaN, + "learning_rate": 0.00012452073404086643, + "loss": 0.0, + "step": 36536 + }, + { + "epoch": 3.409256321731828, + "grad_norm": NaN, + "learning_rate": 0.00012451328020651175, + "loss": 0.0, + "step": 36537 + }, + { + "epoch": 3.4093496314267053, + "grad_norm": NaN, + "learning_rate": 0.00012450582643696217, + "loss": 0.0, + "step": 36538 + }, + { + "epoch": 3.4094429411215827, + "grad_norm": NaN, + "learning_rate": 0.00012449837273223672, + "loss": 0.0, + "step": 36539 + }, + { + "epoch": 3.4095362508164597, + "grad_norm": NaN, + "learning_rate": 0.0001244909190923544, + "loss": 0.0, + "step": 36540 + }, + { + "epoch": 3.409629560511337, + "grad_norm": NaN, + "learning_rate": 0.00012448346551733412, + "loss": 0.0, + "step": 36541 + }, + { + "epoch": 3.4097228702062146, + "grad_norm": NaN, + "learning_rate": 0.00012447601200719485, + "loss": 0.0, + "step": 36542 + }, + { + "epoch": 3.4098161799010915, + "grad_norm": NaN, + "learning_rate": 0.00012446855856195542, + "loss": 0.0, + "step": 36543 + }, + { + "epoch": 3.409909489595969, + "grad_norm": NaN, + "learning_rate": 0.00012446110518163494, + "loss": 0.0, + "step": 36544 + }, + { + "epoch": 3.4100027992908464, + "grad_norm": NaN, + "learning_rate": 0.00012445365186625237, + "loss": 0.0, + "step": 36545 + }, + { + "epoch": 3.410096108985724, + "grad_norm": NaN, + "learning_rate": 0.00012444619861582646, + "loss": 0.0, + "step": 36546 + }, + { + "epoch": 3.4101894186806008, + "grad_norm": NaN, + "learning_rate": 0.00012443874543037638, + "loss": 0.0, + "step": 36547 + }, + { + "epoch": 3.410282728375478, + "grad_norm": NaN, + "learning_rate": 0.00012443129230992102, + "loss": 0.0, + "step": 36548 + }, + { + "epoch": 3.4103760380703556, + "grad_norm": NaN, + "learning_rate": 0.00012442383925447925, + "loss": 0.0, + "step": 36549 + }, + { + "epoch": 3.4104693477652326, + "grad_norm": NaN, + "learning_rate": 0.0001244163862640701, + "loss": 0.0, + "step": 36550 + }, + { + "epoch": 3.41056265746011, + "grad_norm": NaN, + "learning_rate": 0.00012440893333871253, + "loss": 0.0, + "step": 36551 + }, + { + "epoch": 3.4106559671549874, + "grad_norm": NaN, + "learning_rate": 0.00012440148047842541, + "loss": 0.0, + "step": 36552 + }, + { + "epoch": 3.410749276849865, + "grad_norm": NaN, + "learning_rate": 0.00012439402768322778, + "loss": 0.0, + "step": 36553 + }, + { + "epoch": 3.410842586544742, + "grad_norm": NaN, + "learning_rate": 0.00012438657495313858, + "loss": 0.0, + "step": 36554 + }, + { + "epoch": 3.4109358962396192, + "grad_norm": NaN, + "learning_rate": 0.00012437912228817666, + "loss": 0.0, + "step": 36555 + }, + { + "epoch": 3.4110292059344967, + "grad_norm": NaN, + "learning_rate": 0.00012437166968836107, + "loss": 0.0, + "step": 36556 + }, + { + "epoch": 3.411122515629374, + "grad_norm": NaN, + "learning_rate": 0.00012436421715371078, + "loss": 0.0, + "step": 36557 + }, + { + "epoch": 3.411215825324251, + "grad_norm": NaN, + "learning_rate": 0.0001243567646842446, + "loss": 0.0, + "step": 36558 + }, + { + "epoch": 3.4113091350191285, + "grad_norm": NaN, + "learning_rate": 0.00012434931227998163, + "loss": 0.0, + "step": 36559 + }, + { + "epoch": 3.411402444714006, + "grad_norm": NaN, + "learning_rate": 0.00012434185994094078, + "loss": 0.0, + "step": 36560 + }, + { + "epoch": 3.4114957544088833, + "grad_norm": NaN, + "learning_rate": 0.00012433440766714094, + "loss": 0.0, + "step": 36561 + }, + { + "epoch": 3.4115890641037603, + "grad_norm": NaN, + "learning_rate": 0.00012432695545860111, + "loss": 0.0, + "step": 36562 + }, + { + "epoch": 3.4116823737986377, + "grad_norm": NaN, + "learning_rate": 0.00012431950331534027, + "loss": 0.0, + "step": 36563 + }, + { + "epoch": 3.411775683493515, + "grad_norm": NaN, + "learning_rate": 0.00012431205123737725, + "loss": 0.0, + "step": 36564 + }, + { + "epoch": 3.411868993188392, + "grad_norm": NaN, + "learning_rate": 0.0001243045992247311, + "loss": 0.0, + "step": 36565 + }, + { + "epoch": 3.4119623028832695, + "grad_norm": NaN, + "learning_rate": 0.0001242971472774208, + "loss": 0.0, + "step": 36566 + }, + { + "epoch": 3.412055612578147, + "grad_norm": NaN, + "learning_rate": 0.00012428969539546517, + "loss": 0.0, + "step": 36567 + }, + { + "epoch": 3.4121489222730244, + "grad_norm": NaN, + "learning_rate": 0.00012428224357888325, + "loss": 0.0, + "step": 36568 + }, + { + "epoch": 3.4122422319679013, + "grad_norm": NaN, + "learning_rate": 0.000124274791827694, + "loss": 0.0, + "step": 36569 + }, + { + "epoch": 3.4123355416627787, + "grad_norm": NaN, + "learning_rate": 0.00012426734014191627, + "loss": 0.0, + "step": 36570 + }, + { + "epoch": 3.412428851357656, + "grad_norm": NaN, + "learning_rate": 0.00012425988852156908, + "loss": 0.0, + "step": 36571 + }, + { + "epoch": 3.412522161052533, + "grad_norm": NaN, + "learning_rate": 0.00012425243696667144, + "loss": 0.0, + "step": 36572 + }, + { + "epoch": 3.4126154707474106, + "grad_norm": NaN, + "learning_rate": 0.00012424498547724215, + "loss": 0.0, + "step": 36573 + }, + { + "epoch": 3.412708780442288, + "grad_norm": NaN, + "learning_rate": 0.00012423753405330022, + "loss": 0.0, + "step": 36574 + }, + { + "epoch": 3.4128020901371654, + "grad_norm": NaN, + "learning_rate": 0.00012423008269486468, + "loss": 0.0, + "step": 36575 + }, + { + "epoch": 3.4128953998320424, + "grad_norm": NaN, + "learning_rate": 0.0001242226314019544, + "loss": 0.0, + "step": 36576 + }, + { + "epoch": 3.41298870952692, + "grad_norm": NaN, + "learning_rate": 0.00012421518017458825, + "loss": 0.0, + "step": 36577 + }, + { + "epoch": 3.413082019221797, + "grad_norm": NaN, + "learning_rate": 0.00012420772901278537, + "loss": 0.0, + "step": 36578 + }, + { + "epoch": 3.4131753289166746, + "grad_norm": NaN, + "learning_rate": 0.00012420027791656455, + "loss": 0.0, + "step": 36579 + }, + { + "epoch": 3.4132686386115516, + "grad_norm": NaN, + "learning_rate": 0.00012419282688594472, + "loss": 0.0, + "step": 36580 + }, + { + "epoch": 3.413361948306429, + "grad_norm": NaN, + "learning_rate": 0.000124185375920945, + "loss": 0.0, + "step": 36581 + }, + { + "epoch": 3.4134552580013064, + "grad_norm": NaN, + "learning_rate": 0.00012417792502158415, + "loss": 0.0, + "step": 36582 + }, + { + "epoch": 3.413548567696184, + "grad_norm": NaN, + "learning_rate": 0.00012417047418788115, + "loss": 0.0, + "step": 36583 + }, + { + "epoch": 3.413641877391061, + "grad_norm": NaN, + "learning_rate": 0.00012416302341985508, + "loss": 0.0, + "step": 36584 + }, + { + "epoch": 3.4137351870859383, + "grad_norm": NaN, + "learning_rate": 0.00012415557271752475, + "loss": 0.0, + "step": 36585 + }, + { + "epoch": 3.4138284967808157, + "grad_norm": NaN, + "learning_rate": 0.00012414812208090916, + "loss": 0.0, + "step": 36586 + }, + { + "epoch": 3.4139218064756927, + "grad_norm": NaN, + "learning_rate": 0.00012414067151002718, + "loss": 0.0, + "step": 36587 + }, + { + "epoch": 3.41401511617057, + "grad_norm": NaN, + "learning_rate": 0.00012413322100489785, + "loss": 0.0, + "step": 36588 + }, + { + "epoch": 3.4141084258654475, + "grad_norm": NaN, + "learning_rate": 0.0001241257705655401, + "loss": 0.0, + "step": 36589 + }, + { + "epoch": 3.414201735560325, + "grad_norm": NaN, + "learning_rate": 0.0001241183201919728, + "loss": 0.0, + "step": 36590 + }, + { + "epoch": 3.414295045255202, + "grad_norm": NaN, + "learning_rate": 0.00012411086988421496, + "loss": 0.0, + "step": 36591 + }, + { + "epoch": 3.4143883549500793, + "grad_norm": NaN, + "learning_rate": 0.00012410341964228555, + "loss": 0.0, + "step": 36592 + }, + { + "epoch": 3.4144816646449567, + "grad_norm": NaN, + "learning_rate": 0.0001240959694662034, + "loss": 0.0, + "step": 36593 + }, + { + "epoch": 3.4145749743398337, + "grad_norm": NaN, + "learning_rate": 0.00012408851935598756, + "loss": 0.0, + "step": 36594 + }, + { + "epoch": 3.414668284034711, + "grad_norm": NaN, + "learning_rate": 0.00012408106931165698, + "loss": 0.0, + "step": 36595 + }, + { + "epoch": 3.4147615937295885, + "grad_norm": NaN, + "learning_rate": 0.0001240736193332305, + "loss": 0.0, + "step": 36596 + }, + { + "epoch": 3.414854903424466, + "grad_norm": NaN, + "learning_rate": 0.00012406616942072716, + "loss": 0.0, + "step": 36597 + }, + { + "epoch": 3.414948213119343, + "grad_norm": NaN, + "learning_rate": 0.00012405871957416587, + "loss": 0.0, + "step": 36598 + }, + { + "epoch": 3.4150415228142204, + "grad_norm": NaN, + "learning_rate": 0.00012405126979356555, + "loss": 0.0, + "step": 36599 + }, + { + "epoch": 3.4151348325090978, + "grad_norm": NaN, + "learning_rate": 0.00012404382007894519, + "loss": 0.0, + "step": 36600 + }, + { + "epoch": 3.4152281422039747, + "grad_norm": NaN, + "learning_rate": 0.00012403637043032373, + "loss": 0.0, + "step": 36601 + }, + { + "epoch": 3.415321451898852, + "grad_norm": NaN, + "learning_rate": 0.00012402892084772, + "loss": 0.0, + "step": 36602 + }, + { + "epoch": 3.4154147615937296, + "grad_norm": NaN, + "learning_rate": 0.0001240214713311531, + "loss": 0.0, + "step": 36603 + }, + { + "epoch": 3.415508071288607, + "grad_norm": NaN, + "learning_rate": 0.00012401402188064192, + "loss": 0.0, + "step": 36604 + }, + { + "epoch": 3.4156013809834844, + "grad_norm": NaN, + "learning_rate": 0.00012400657249620533, + "loss": 0.0, + "step": 36605 + }, + { + "epoch": 3.4156946906783614, + "grad_norm": NaN, + "learning_rate": 0.00012399912317786235, + "loss": 0.0, + "step": 36606 + }, + { + "epoch": 3.415788000373239, + "grad_norm": NaN, + "learning_rate": 0.00012399167392563193, + "loss": 0.0, + "step": 36607 + }, + { + "epoch": 3.4158813100681162, + "grad_norm": NaN, + "learning_rate": 0.0001239842247395329, + "loss": 0.0, + "step": 36608 + }, + { + "epoch": 3.415974619762993, + "grad_norm": NaN, + "learning_rate": 0.0001239767756195843, + "loss": 0.0, + "step": 36609 + }, + { + "epoch": 3.4160679294578706, + "grad_norm": NaN, + "learning_rate": 0.0001239693265658051, + "loss": 0.0, + "step": 36610 + }, + { + "epoch": 3.416161239152748, + "grad_norm": NaN, + "learning_rate": 0.0001239618775782142, + "loss": 0.0, + "step": 36611 + }, + { + "epoch": 3.4162545488476255, + "grad_norm": NaN, + "learning_rate": 0.00012395442865683045, + "loss": 0.0, + "step": 36612 + }, + { + "epoch": 3.4163478585425024, + "grad_norm": NaN, + "learning_rate": 0.00012394697980167295, + "loss": 0.0, + "step": 36613 + }, + { + "epoch": 3.41644116823738, + "grad_norm": NaN, + "learning_rate": 0.00012393953101276056, + "loss": 0.0, + "step": 36614 + }, + { + "epoch": 3.4165344779322573, + "grad_norm": NaN, + "learning_rate": 0.00012393208229011216, + "loss": 0.0, + "step": 36615 + }, + { + "epoch": 3.4166277876271343, + "grad_norm": NaN, + "learning_rate": 0.00012392463363374682, + "loss": 0.0, + "step": 36616 + }, + { + "epoch": 3.4167210973220117, + "grad_norm": NaN, + "learning_rate": 0.0001239171850436834, + "loss": 0.0, + "step": 36617 + }, + { + "epoch": 3.416814407016889, + "grad_norm": NaN, + "learning_rate": 0.0001239097365199408, + "loss": 0.0, + "step": 36618 + }, + { + "epoch": 3.4169077167117665, + "grad_norm": NaN, + "learning_rate": 0.00012390228806253808, + "loss": 0.0, + "step": 36619 + }, + { + "epoch": 3.4170010264066435, + "grad_norm": NaN, + "learning_rate": 0.00012389483967149405, + "loss": 0.0, + "step": 36620 + }, + { + "epoch": 3.417094336101521, + "grad_norm": NaN, + "learning_rate": 0.00012388739134682773, + "loss": 0.0, + "step": 36621 + }, + { + "epoch": 3.4171876457963983, + "grad_norm": NaN, + "learning_rate": 0.00012387994308855806, + "loss": 0.0, + "step": 36622 + }, + { + "epoch": 3.4172809554912753, + "grad_norm": NaN, + "learning_rate": 0.00012387249489670396, + "loss": 0.0, + "step": 36623 + }, + { + "epoch": 3.4173742651861527, + "grad_norm": NaN, + "learning_rate": 0.00012386504677128432, + "loss": 0.0, + "step": 36624 + }, + { + "epoch": 3.41746757488103, + "grad_norm": NaN, + "learning_rate": 0.0001238575987123182, + "loss": 0.0, + "step": 36625 + }, + { + "epoch": 3.4175608845759076, + "grad_norm": NaN, + "learning_rate": 0.0001238501507198244, + "loss": 0.0, + "step": 36626 + }, + { + "epoch": 3.417654194270785, + "grad_norm": NaN, + "learning_rate": 0.0001238427027938219, + "loss": 0.0, + "step": 36627 + }, + { + "epoch": 3.417747503965662, + "grad_norm": NaN, + "learning_rate": 0.00012383525493432976, + "loss": 0.0, + "step": 36628 + }, + { + "epoch": 3.4178408136605394, + "grad_norm": NaN, + "learning_rate": 0.00012382780714136673, + "loss": 0.0, + "step": 36629 + }, + { + "epoch": 3.417934123355417, + "grad_norm": NaN, + "learning_rate": 0.0001238203594149519, + "loss": 0.0, + "step": 36630 + }, + { + "epoch": 3.4180274330502938, + "grad_norm": NaN, + "learning_rate": 0.00012381291175510406, + "loss": 0.0, + "step": 36631 + }, + { + "epoch": 3.418120742745171, + "grad_norm": NaN, + "learning_rate": 0.00012380546416184226, + "loss": 0.0, + "step": 36632 + }, + { + "epoch": 3.4182140524400486, + "grad_norm": NaN, + "learning_rate": 0.00012379801663518547, + "loss": 0.0, + "step": 36633 + }, + { + "epoch": 3.418307362134926, + "grad_norm": NaN, + "learning_rate": 0.00012379056917515246, + "loss": 0.0, + "step": 36634 + }, + { + "epoch": 3.418400671829803, + "grad_norm": NaN, + "learning_rate": 0.0001237831217817623, + "loss": 0.0, + "step": 36635 + }, + { + "epoch": 3.4184939815246804, + "grad_norm": NaN, + "learning_rate": 0.00012377567445503396, + "loss": 0.0, + "step": 36636 + }, + { + "epoch": 3.418587291219558, + "grad_norm": NaN, + "learning_rate": 0.00012376822719498622, + "loss": 0.0, + "step": 36637 + }, + { + "epoch": 3.418680600914435, + "grad_norm": NaN, + "learning_rate": 0.00012376078000163814, + "loss": 0.0, + "step": 36638 + }, + { + "epoch": 3.4187739106093122, + "grad_norm": NaN, + "learning_rate": 0.00012375333287500866, + "loss": 0.0, + "step": 36639 + }, + { + "epoch": 3.4188672203041897, + "grad_norm": NaN, + "learning_rate": 0.00012374588581511663, + "loss": 0.0, + "step": 36640 + }, + { + "epoch": 3.418960529999067, + "grad_norm": NaN, + "learning_rate": 0.00012373843882198106, + "loss": 0.0, + "step": 36641 + }, + { + "epoch": 3.419053839693944, + "grad_norm": NaN, + "learning_rate": 0.00012373099189562088, + "loss": 0.0, + "step": 36642 + }, + { + "epoch": 3.4191471493888215, + "grad_norm": NaN, + "learning_rate": 0.00012372354503605495, + "loss": 0.0, + "step": 36643 + }, + { + "epoch": 3.419240459083699, + "grad_norm": NaN, + "learning_rate": 0.00012371609824330225, + "loss": 0.0, + "step": 36644 + }, + { + "epoch": 3.419333768778576, + "grad_norm": NaN, + "learning_rate": 0.00012370865151738186, + "loss": 0.0, + "step": 36645 + }, + { + "epoch": 3.4194270784734533, + "grad_norm": NaN, + "learning_rate": 0.00012370120485831243, + "loss": 0.0, + "step": 36646 + }, + { + "epoch": 3.4195203881683307, + "grad_norm": NaN, + "learning_rate": 0.0001236937582661131, + "loss": 0.0, + "step": 36647 + }, + { + "epoch": 3.419613697863208, + "grad_norm": NaN, + "learning_rate": 0.0001236863117408028, + "loss": 0.0, + "step": 36648 + }, + { + "epoch": 3.419707007558085, + "grad_norm": NaN, + "learning_rate": 0.00012367886528240037, + "loss": 0.0, + "step": 36649 + }, + { + "epoch": 3.4198003172529625, + "grad_norm": NaN, + "learning_rate": 0.00012367141889092474, + "loss": 0.0, + "step": 36650 + }, + { + "epoch": 3.41989362694784, + "grad_norm": NaN, + "learning_rate": 0.00012366397256639497, + "loss": 0.0, + "step": 36651 + }, + { + "epoch": 3.4199869366427174, + "grad_norm": NaN, + "learning_rate": 0.0001236565263088299, + "loss": 0.0, + "step": 36652 + }, + { + "epoch": 3.4200802463375943, + "grad_norm": NaN, + "learning_rate": 0.00012364908011824844, + "loss": 0.0, + "step": 36653 + }, + { + "epoch": 3.4201735560324718, + "grad_norm": NaN, + "learning_rate": 0.00012364163399466963, + "loss": 0.0, + "step": 36654 + }, + { + "epoch": 3.420266865727349, + "grad_norm": NaN, + "learning_rate": 0.0001236341879381123, + "loss": 0.0, + "step": 36655 + }, + { + "epoch": 3.4203601754222266, + "grad_norm": NaN, + "learning_rate": 0.0001236267419485954, + "loss": 0.0, + "step": 36656 + }, + { + "epoch": 3.4204534851171036, + "grad_norm": NaN, + "learning_rate": 0.00012361929602613793, + "loss": 0.0, + "step": 36657 + }, + { + "epoch": 3.420546794811981, + "grad_norm": NaN, + "learning_rate": 0.00012361185017075872, + "loss": 0.0, + "step": 36658 + }, + { + "epoch": 3.4206401045068584, + "grad_norm": NaN, + "learning_rate": 0.00012360440438247678, + "loss": 0.0, + "step": 36659 + }, + { + "epoch": 3.4207334142017354, + "grad_norm": NaN, + "learning_rate": 0.00012359695866131105, + "loss": 0.0, + "step": 36660 + }, + { + "epoch": 3.420826723896613, + "grad_norm": NaN, + "learning_rate": 0.0001235895130072804, + "loss": 0.0, + "step": 36661 + }, + { + "epoch": 3.42092003359149, + "grad_norm": NaN, + "learning_rate": 0.0001235820674204038, + "loss": 0.0, + "step": 36662 + }, + { + "epoch": 3.4210133432863676, + "grad_norm": NaN, + "learning_rate": 0.0001235746219007002, + "loss": 0.0, + "step": 36663 + }, + { + "epoch": 3.4211066529812446, + "grad_norm": NaN, + "learning_rate": 0.0001235671764481885, + "loss": 0.0, + "step": 36664 + }, + { + "epoch": 3.421199962676122, + "grad_norm": NaN, + "learning_rate": 0.00012355973106288758, + "loss": 0.0, + "step": 36665 + }, + { + "epoch": 3.4212932723709994, + "grad_norm": NaN, + "learning_rate": 0.00012355228574481653, + "loss": 0.0, + "step": 36666 + }, + { + "epoch": 3.4213865820658764, + "grad_norm": NaN, + "learning_rate": 0.00012354484049399413, + "loss": 0.0, + "step": 36667 + }, + { + "epoch": 3.421479891760754, + "grad_norm": NaN, + "learning_rate": 0.00012353739531043936, + "loss": 0.0, + "step": 36668 + }, + { + "epoch": 3.4215732014556313, + "grad_norm": NaN, + "learning_rate": 0.0001235299501941712, + "loss": 0.0, + "step": 36669 + }, + { + "epoch": 3.4216665111505087, + "grad_norm": NaN, + "learning_rate": 0.00012352250514520847, + "loss": 0.0, + "step": 36670 + }, + { + "epoch": 3.4217598208453857, + "grad_norm": NaN, + "learning_rate": 0.00012351506016357018, + "loss": 0.0, + "step": 36671 + }, + { + "epoch": 3.421853130540263, + "grad_norm": NaN, + "learning_rate": 0.0001235076152492753, + "loss": 0.0, + "step": 36672 + }, + { + "epoch": 3.4219464402351405, + "grad_norm": NaN, + "learning_rate": 0.00012350017040234267, + "loss": 0.0, + "step": 36673 + }, + { + "epoch": 3.422039749930018, + "grad_norm": NaN, + "learning_rate": 0.00012349272562279124, + "loss": 0.0, + "step": 36674 + }, + { + "epoch": 3.422133059624895, + "grad_norm": NaN, + "learning_rate": 0.00012348528091064, + "loss": 0.0, + "step": 36675 + }, + { + "epoch": 3.4222263693197723, + "grad_norm": NaN, + "learning_rate": 0.00012347783626590782, + "loss": 0.0, + "step": 36676 + }, + { + "epoch": 3.4223196790146497, + "grad_norm": NaN, + "learning_rate": 0.00012347039168861365, + "loss": 0.0, + "step": 36677 + }, + { + "epoch": 3.422412988709527, + "grad_norm": NaN, + "learning_rate": 0.00012346294717877637, + "loss": 0.0, + "step": 36678 + }, + { + "epoch": 3.422506298404404, + "grad_norm": NaN, + "learning_rate": 0.000123455502736415, + "loss": 0.0, + "step": 36679 + }, + { + "epoch": 3.4225996080992815, + "grad_norm": NaN, + "learning_rate": 0.00012344805836154844, + "loss": 0.0, + "step": 36680 + }, + { + "epoch": 3.422692917794159, + "grad_norm": NaN, + "learning_rate": 0.00012344061405419554, + "loss": 0.0, + "step": 36681 + }, + { + "epoch": 3.422786227489036, + "grad_norm": NaN, + "learning_rate": 0.00012343316981437528, + "loss": 0.0, + "step": 36682 + }, + { + "epoch": 3.4228795371839134, + "grad_norm": NaN, + "learning_rate": 0.0001234257256421067, + "loss": 0.0, + "step": 36683 + }, + { + "epoch": 3.4229728468787908, + "grad_norm": NaN, + "learning_rate": 0.00012341828153740857, + "loss": 0.0, + "step": 36684 + }, + { + "epoch": 3.423066156573668, + "grad_norm": NaN, + "learning_rate": 0.00012341083750029984, + "loss": 0.0, + "step": 36685 + }, + { + "epoch": 3.423159466268545, + "grad_norm": NaN, + "learning_rate": 0.00012340339353079954, + "loss": 0.0, + "step": 36686 + }, + { + "epoch": 3.4232527759634226, + "grad_norm": NaN, + "learning_rate": 0.00012339594962892649, + "loss": 0.0, + "step": 36687 + }, + { + "epoch": 3.4233460856583, + "grad_norm": NaN, + "learning_rate": 0.00012338850579469965, + "loss": 0.0, + "step": 36688 + }, + { + "epoch": 3.423439395353177, + "grad_norm": NaN, + "learning_rate": 0.00012338106202813801, + "loss": 0.0, + "step": 36689 + }, + { + "epoch": 3.4235327050480544, + "grad_norm": NaN, + "learning_rate": 0.0001233736183292604, + "loss": 0.0, + "step": 36690 + }, + { + "epoch": 3.423626014742932, + "grad_norm": NaN, + "learning_rate": 0.00012336617469808577, + "loss": 0.0, + "step": 36691 + }, + { + "epoch": 3.4237193244378092, + "grad_norm": NaN, + "learning_rate": 0.00012335873113463315, + "loss": 0.0, + "step": 36692 + }, + { + "epoch": 3.423812634132686, + "grad_norm": NaN, + "learning_rate": 0.00012335128763892134, + "loss": 0.0, + "step": 36693 + }, + { + "epoch": 3.4239059438275636, + "grad_norm": NaN, + "learning_rate": 0.00012334384421096927, + "loss": 0.0, + "step": 36694 + }, + { + "epoch": 3.423999253522441, + "grad_norm": NaN, + "learning_rate": 0.00012333640085079598, + "loss": 0.0, + "step": 36695 + }, + { + "epoch": 3.424092563217318, + "grad_norm": NaN, + "learning_rate": 0.0001233289575584203, + "loss": 0.0, + "step": 36696 + }, + { + "epoch": 3.4241858729121954, + "grad_norm": NaN, + "learning_rate": 0.00012332151433386114, + "loss": 0.0, + "step": 36697 + }, + { + "epoch": 3.424279182607073, + "grad_norm": NaN, + "learning_rate": 0.00012331407117713753, + "loss": 0.0, + "step": 36698 + }, + { + "epoch": 3.4243724923019503, + "grad_norm": NaN, + "learning_rate": 0.0001233066280882683, + "loss": 0.0, + "step": 36699 + }, + { + "epoch": 3.4244658019968277, + "grad_norm": NaN, + "learning_rate": 0.00012329918506727238, + "loss": 0.0, + "step": 36700 + }, + { + "epoch": 3.4245591116917047, + "grad_norm": NaN, + "learning_rate": 0.00012329174211416876, + "loss": 0.0, + "step": 36701 + }, + { + "epoch": 3.424652421386582, + "grad_norm": NaN, + "learning_rate": 0.00012328429922897632, + "loss": 0.0, + "step": 36702 + }, + { + "epoch": 3.4247457310814595, + "grad_norm": NaN, + "learning_rate": 0.00012327685641171397, + "loss": 0.0, + "step": 36703 + }, + { + "epoch": 3.4248390407763365, + "grad_norm": NaN, + "learning_rate": 0.0001232694136624007, + "loss": 0.0, + "step": 36704 + }, + { + "epoch": 3.424932350471214, + "grad_norm": NaN, + "learning_rate": 0.00012326197098105538, + "loss": 0.0, + "step": 36705 + }, + { + "epoch": 3.4250256601660913, + "grad_norm": NaN, + "learning_rate": 0.00012325452836769692, + "loss": 0.0, + "step": 36706 + }, + { + "epoch": 3.4251189698609688, + "grad_norm": NaN, + "learning_rate": 0.00012324708582234432, + "loss": 0.0, + "step": 36707 + }, + { + "epoch": 3.4252122795558457, + "grad_norm": NaN, + "learning_rate": 0.00012323964334501642, + "loss": 0.0, + "step": 36708 + }, + { + "epoch": 3.425305589250723, + "grad_norm": NaN, + "learning_rate": 0.00012323220093573217, + "loss": 0.0, + "step": 36709 + }, + { + "epoch": 3.4253988989456006, + "grad_norm": NaN, + "learning_rate": 0.00012322475859451054, + "loss": 0.0, + "step": 36710 + }, + { + "epoch": 3.4254922086404775, + "grad_norm": NaN, + "learning_rate": 0.0001232173163213704, + "loss": 0.0, + "step": 36711 + }, + { + "epoch": 3.425585518335355, + "grad_norm": NaN, + "learning_rate": 0.00012320987411633066, + "loss": 0.0, + "step": 36712 + }, + { + "epoch": 3.4256788280302324, + "grad_norm": NaN, + "learning_rate": 0.00012320243197941032, + "loss": 0.0, + "step": 36713 + }, + { + "epoch": 3.42577213772511, + "grad_norm": NaN, + "learning_rate": 0.00012319498991062827, + "loss": 0.0, + "step": 36714 + }, + { + "epoch": 3.4258654474199868, + "grad_norm": NaN, + "learning_rate": 0.00012318754791000336, + "loss": 0.0, + "step": 36715 + }, + { + "epoch": 3.425958757114864, + "grad_norm": NaN, + "learning_rate": 0.00012318010597755464, + "loss": 0.0, + "step": 36716 + }, + { + "epoch": 3.4260520668097416, + "grad_norm": NaN, + "learning_rate": 0.00012317266411330092, + "loss": 0.0, + "step": 36717 + }, + { + "epoch": 3.4261453765046186, + "grad_norm": NaN, + "learning_rate": 0.00012316522231726115, + "loss": 0.0, + "step": 36718 + }, + { + "epoch": 3.426238686199496, + "grad_norm": NaN, + "learning_rate": 0.00012315778058945433, + "loss": 0.0, + "step": 36719 + }, + { + "epoch": 3.4263319958943734, + "grad_norm": NaN, + "learning_rate": 0.00012315033892989927, + "loss": 0.0, + "step": 36720 + }, + { + "epoch": 3.426425305589251, + "grad_norm": NaN, + "learning_rate": 0.00012314289733861503, + "loss": 0.0, + "step": 36721 + }, + { + "epoch": 3.4265186152841283, + "grad_norm": NaN, + "learning_rate": 0.00012313545581562035, + "loss": 0.0, + "step": 36722 + }, + { + "epoch": 3.4266119249790052, + "grad_norm": NaN, + "learning_rate": 0.00012312801436093425, + "loss": 0.0, + "step": 36723 + }, + { + "epoch": 3.4267052346738827, + "grad_norm": NaN, + "learning_rate": 0.00012312057297457572, + "loss": 0.0, + "step": 36724 + }, + { + "epoch": 3.42679854436876, + "grad_norm": NaN, + "learning_rate": 0.00012311313165656358, + "loss": 0.0, + "step": 36725 + }, + { + "epoch": 3.426891854063637, + "grad_norm": NaN, + "learning_rate": 0.00012310569040691673, + "loss": 0.0, + "step": 36726 + }, + { + "epoch": 3.4269851637585145, + "grad_norm": NaN, + "learning_rate": 0.00012309824922565423, + "loss": 0.0, + "step": 36727 + }, + { + "epoch": 3.427078473453392, + "grad_norm": NaN, + "learning_rate": 0.00012309080811279488, + "loss": 0.0, + "step": 36728 + }, + { + "epoch": 3.4271717831482693, + "grad_norm": NaN, + "learning_rate": 0.00012308336706835758, + "loss": 0.0, + "step": 36729 + }, + { + "epoch": 3.4272650928431463, + "grad_norm": NaN, + "learning_rate": 0.00012307592609236138, + "loss": 0.0, + "step": 36730 + }, + { + "epoch": 3.4273584025380237, + "grad_norm": NaN, + "learning_rate": 0.0001230684851848251, + "loss": 0.0, + "step": 36731 + }, + { + "epoch": 3.427451712232901, + "grad_norm": NaN, + "learning_rate": 0.0001230610443457676, + "loss": 0.0, + "step": 36732 + }, + { + "epoch": 3.427545021927778, + "grad_norm": NaN, + "learning_rate": 0.00012305360357520802, + "loss": 0.0, + "step": 36733 + }, + { + "epoch": 3.4276383316226555, + "grad_norm": NaN, + "learning_rate": 0.0001230461628731651, + "loss": 0.0, + "step": 36734 + }, + { + "epoch": 3.427731641317533, + "grad_norm": NaN, + "learning_rate": 0.00012303872223965774, + "loss": 0.0, + "step": 36735 + }, + { + "epoch": 3.4278249510124104, + "grad_norm": NaN, + "learning_rate": 0.000123031281674705, + "loss": 0.0, + "step": 36736 + }, + { + "epoch": 3.4279182607072873, + "grad_norm": NaN, + "learning_rate": 0.00012302384117832572, + "loss": 0.0, + "step": 36737 + }, + { + "epoch": 3.4280115704021648, + "grad_norm": NaN, + "learning_rate": 0.00012301640075053875, + "loss": 0.0, + "step": 36738 + }, + { + "epoch": 3.428104880097042, + "grad_norm": NaN, + "learning_rate": 0.00012300896039136314, + "loss": 0.0, + "step": 36739 + }, + { + "epoch": 3.428198189791919, + "grad_norm": NaN, + "learning_rate": 0.00012300152010081774, + "loss": 0.0, + "step": 36740 + }, + { + "epoch": 3.4282914994867966, + "grad_norm": NaN, + "learning_rate": 0.00012299407987892142, + "loss": 0.0, + "step": 36741 + }, + { + "epoch": 3.428384809181674, + "grad_norm": NaN, + "learning_rate": 0.00012298663972569327, + "loss": 0.0, + "step": 36742 + }, + { + "epoch": 3.4284781188765514, + "grad_norm": NaN, + "learning_rate": 0.00012297919964115203, + "loss": 0.0, + "step": 36743 + }, + { + "epoch": 3.4285714285714284, + "grad_norm": NaN, + "learning_rate": 0.00012297175962531663, + "loss": 0.0, + "step": 36744 + }, + { + "epoch": 3.428664738266306, + "grad_norm": NaN, + "learning_rate": 0.00012296431967820614, + "loss": 0.0, + "step": 36745 + }, + { + "epoch": 3.428758047961183, + "grad_norm": NaN, + "learning_rate": 0.0001229568797998393, + "loss": 0.0, + "step": 36746 + }, + { + "epoch": 3.4288513576560606, + "grad_norm": NaN, + "learning_rate": 0.0001229494399902351, + "loss": 0.0, + "step": 36747 + }, + { + "epoch": 3.4289446673509376, + "grad_norm": NaN, + "learning_rate": 0.00012294200024941254, + "loss": 0.0, + "step": 36748 + }, + { + "epoch": 3.429037977045815, + "grad_norm": NaN, + "learning_rate": 0.00012293456057739041, + "loss": 0.0, + "step": 36749 + }, + { + "epoch": 3.4291312867406925, + "grad_norm": NaN, + "learning_rate": 0.00012292712097418764, + "loss": 0.0, + "step": 36750 + }, + { + "epoch": 3.42922459643557, + "grad_norm": NaN, + "learning_rate": 0.00012291968143982325, + "loss": 0.0, + "step": 36751 + }, + { + "epoch": 3.429317906130447, + "grad_norm": NaN, + "learning_rate": 0.00012291224197431605, + "loss": 0.0, + "step": 36752 + }, + { + "epoch": 3.4294112158253243, + "grad_norm": NaN, + "learning_rate": 0.00012290480257768496, + "loss": 0.0, + "step": 36753 + }, + { + "epoch": 3.4295045255202017, + "grad_norm": NaN, + "learning_rate": 0.00012289736324994899, + "loss": 0.0, + "step": 36754 + }, + { + "epoch": 3.4295978352150787, + "grad_norm": NaN, + "learning_rate": 0.000122889923991127, + "loss": 0.0, + "step": 36755 + }, + { + "epoch": 3.429691144909956, + "grad_norm": NaN, + "learning_rate": 0.00012288248480123783, + "loss": 0.0, + "step": 36756 + }, + { + "epoch": 3.4297844546048335, + "grad_norm": NaN, + "learning_rate": 0.00012287504568030054, + "loss": 0.0, + "step": 36757 + }, + { + "epoch": 3.429877764299711, + "grad_norm": NaN, + "learning_rate": 0.00012286760662833394, + "loss": 0.0, + "step": 36758 + }, + { + "epoch": 3.429971073994588, + "grad_norm": NaN, + "learning_rate": 0.000122860167645357, + "loss": 0.0, + "step": 36759 + }, + { + "epoch": 3.4300643836894653, + "grad_norm": NaN, + "learning_rate": 0.0001228527287313886, + "loss": 0.0, + "step": 36760 + }, + { + "epoch": 3.4301576933843427, + "grad_norm": NaN, + "learning_rate": 0.00012284528988644767, + "loss": 0.0, + "step": 36761 + }, + { + "epoch": 3.4302510030792197, + "grad_norm": NaN, + "learning_rate": 0.00012283785111055312, + "loss": 0.0, + "step": 36762 + }, + { + "epoch": 3.430344312774097, + "grad_norm": NaN, + "learning_rate": 0.0001228304124037239, + "loss": 0.0, + "step": 36763 + }, + { + "epoch": 3.4304376224689745, + "grad_norm": NaN, + "learning_rate": 0.00012282297376597885, + "loss": 0.0, + "step": 36764 + }, + { + "epoch": 3.430530932163852, + "grad_norm": NaN, + "learning_rate": 0.00012281553519733696, + "loss": 0.0, + "step": 36765 + }, + { + "epoch": 3.430624241858729, + "grad_norm": NaN, + "learning_rate": 0.0001228080966978171, + "loss": 0.0, + "step": 36766 + }, + { + "epoch": 3.4307175515536064, + "grad_norm": NaN, + "learning_rate": 0.00012280065826743817, + "loss": 0.0, + "step": 36767 + }, + { + "epoch": 3.430810861248484, + "grad_norm": NaN, + "learning_rate": 0.00012279321990621916, + "loss": 0.0, + "step": 36768 + }, + { + "epoch": 3.430904170943361, + "grad_norm": NaN, + "learning_rate": 0.0001227857816141789, + "loss": 0.0, + "step": 36769 + }, + { + "epoch": 3.430997480638238, + "grad_norm": NaN, + "learning_rate": 0.00012277834339133632, + "loss": 0.0, + "step": 36770 + }, + { + "epoch": 3.4310907903331156, + "grad_norm": NaN, + "learning_rate": 0.00012277090523771039, + "loss": 0.0, + "step": 36771 + }, + { + "epoch": 3.431184100027993, + "grad_norm": NaN, + "learning_rate": 0.00012276346715331998, + "loss": 0.0, + "step": 36772 + }, + { + "epoch": 3.4312774097228704, + "grad_norm": NaN, + "learning_rate": 0.00012275602913818393, + "loss": 0.0, + "step": 36773 + }, + { + "epoch": 3.4313707194177474, + "grad_norm": NaN, + "learning_rate": 0.00012274859119232133, + "loss": 0.0, + "step": 36774 + }, + { + "epoch": 3.431464029112625, + "grad_norm": NaN, + "learning_rate": 0.00012274115331575093, + "loss": 0.0, + "step": 36775 + }, + { + "epoch": 3.4315573388075022, + "grad_norm": NaN, + "learning_rate": 0.00012273371550849168, + "loss": 0.0, + "step": 36776 + }, + { + "epoch": 3.431650648502379, + "grad_norm": NaN, + "learning_rate": 0.00012272627777056258, + "loss": 0.0, + "step": 36777 + }, + { + "epoch": 3.4317439581972566, + "grad_norm": NaN, + "learning_rate": 0.00012271884010198246, + "loss": 0.0, + "step": 36778 + }, + { + "epoch": 3.431837267892134, + "grad_norm": NaN, + "learning_rate": 0.0001227114025027702, + "loss": 0.0, + "step": 36779 + }, + { + "epoch": 3.4319305775870115, + "grad_norm": NaN, + "learning_rate": 0.00012270396497294483, + "loss": 0.0, + "step": 36780 + }, + { + "epoch": 3.4320238872818885, + "grad_norm": NaN, + "learning_rate": 0.00012269652751252516, + "loss": 0.0, + "step": 36781 + }, + { + "epoch": 3.432117196976766, + "grad_norm": NaN, + "learning_rate": 0.0001226890901215301, + "loss": 0.0, + "step": 36782 + }, + { + "epoch": 3.4322105066716433, + "grad_norm": NaN, + "learning_rate": 0.00012268165279997865, + "loss": 0.0, + "step": 36783 + }, + { + "epoch": 3.4323038163665203, + "grad_norm": NaN, + "learning_rate": 0.00012267421554788965, + "loss": 0.0, + "step": 36784 + }, + { + "epoch": 3.4323971260613977, + "grad_norm": NaN, + "learning_rate": 0.00012266677836528196, + "loss": 0.0, + "step": 36785 + }, + { + "epoch": 3.432490435756275, + "grad_norm": NaN, + "learning_rate": 0.00012265934125217464, + "loss": 0.0, + "step": 36786 + }, + { + "epoch": 3.4325837454511525, + "grad_norm": NaN, + "learning_rate": 0.0001226519042085865, + "loss": 0.0, + "step": 36787 + }, + { + "epoch": 3.4326770551460295, + "grad_norm": NaN, + "learning_rate": 0.00012264446723453642, + "loss": 0.0, + "step": 36788 + }, + { + "epoch": 3.432770364840907, + "grad_norm": NaN, + "learning_rate": 0.00012263703033004342, + "loss": 0.0, + "step": 36789 + }, + { + "epoch": 3.4328636745357843, + "grad_norm": NaN, + "learning_rate": 0.0001226295934951263, + "loss": 0.0, + "step": 36790 + }, + { + "epoch": 3.4329569842306618, + "grad_norm": NaN, + "learning_rate": 0.00012262215672980401, + "loss": 0.0, + "step": 36791 + }, + { + "epoch": 3.4330502939255387, + "grad_norm": NaN, + "learning_rate": 0.00012261472003409553, + "loss": 0.0, + "step": 36792 + }, + { + "epoch": 3.433143603620416, + "grad_norm": NaN, + "learning_rate": 0.00012260728340801965, + "loss": 0.0, + "step": 36793 + }, + { + "epoch": 3.4332369133152936, + "grad_norm": NaN, + "learning_rate": 0.00012259984685159531, + "loss": 0.0, + "step": 36794 + }, + { + "epoch": 3.433330223010171, + "grad_norm": NaN, + "learning_rate": 0.00012259241036484154, + "loss": 0.0, + "step": 36795 + }, + { + "epoch": 3.433423532705048, + "grad_norm": NaN, + "learning_rate": 0.00012258497394777702, + "loss": 0.0, + "step": 36796 + }, + { + "epoch": 3.4335168423999254, + "grad_norm": NaN, + "learning_rate": 0.0001225775376004209, + "loss": 0.0, + "step": 36797 + }, + { + "epoch": 3.433610152094803, + "grad_norm": NaN, + "learning_rate": 0.00012257010132279194, + "loss": 0.0, + "step": 36798 + }, + { + "epoch": 3.43370346178968, + "grad_norm": NaN, + "learning_rate": 0.00012256266511490908, + "loss": 0.0, + "step": 36799 + }, + { + "epoch": 3.433796771484557, + "grad_norm": NaN, + "learning_rate": 0.00012255522897679122, + "loss": 0.0, + "step": 36800 + }, + { + "epoch": 3.4338900811794346, + "grad_norm": NaN, + "learning_rate": 0.00012254779290845735, + "loss": 0.0, + "step": 36801 + }, + { + "epoch": 3.433983390874312, + "grad_norm": NaN, + "learning_rate": 0.00012254035690992623, + "loss": 0.0, + "step": 36802 + }, + { + "epoch": 3.434076700569189, + "grad_norm": NaN, + "learning_rate": 0.0001225329209812169, + "loss": 0.0, + "step": 36803 + }, + { + "epoch": 3.4341700102640664, + "grad_norm": NaN, + "learning_rate": 0.0001225254851223482, + "loss": 0.0, + "step": 36804 + }, + { + "epoch": 3.434263319958944, + "grad_norm": NaN, + "learning_rate": 0.00012251804933333904, + "loss": 0.0, + "step": 36805 + }, + { + "epoch": 3.434356629653821, + "grad_norm": NaN, + "learning_rate": 0.00012251061361420834, + "loss": 0.0, + "step": 36806 + }, + { + "epoch": 3.4344499393486982, + "grad_norm": NaN, + "learning_rate": 0.00012250317796497508, + "loss": 0.0, + "step": 36807 + }, + { + "epoch": 3.4345432490435757, + "grad_norm": NaN, + "learning_rate": 0.00012249574238565797, + "loss": 0.0, + "step": 36808 + }, + { + "epoch": 3.434636558738453, + "grad_norm": NaN, + "learning_rate": 0.0001224883068762761, + "loss": 0.0, + "step": 36809 + }, + { + "epoch": 3.43472986843333, + "grad_norm": NaN, + "learning_rate": 0.00012248087143684836, + "loss": 0.0, + "step": 36810 + }, + { + "epoch": 3.4348231781282075, + "grad_norm": NaN, + "learning_rate": 0.00012247343606739354, + "loss": 0.0, + "step": 36811 + }, + { + "epoch": 3.434916487823085, + "grad_norm": NaN, + "learning_rate": 0.00012246600076793067, + "loss": 0.0, + "step": 36812 + }, + { + "epoch": 3.435009797517962, + "grad_norm": NaN, + "learning_rate": 0.00012245856553847858, + "loss": 0.0, + "step": 36813 + }, + { + "epoch": 3.4351031072128393, + "grad_norm": NaN, + "learning_rate": 0.0001224511303790562, + "loss": 0.0, + "step": 36814 + }, + { + "epoch": 3.4351964169077167, + "grad_norm": NaN, + "learning_rate": 0.00012244369528968243, + "loss": 0.0, + "step": 36815 + }, + { + "epoch": 3.435289726602594, + "grad_norm": NaN, + "learning_rate": 0.0001224362602703762, + "loss": 0.0, + "step": 36816 + }, + { + "epoch": 3.4353830362974715, + "grad_norm": NaN, + "learning_rate": 0.00012242882532115636, + "loss": 0.0, + "step": 36817 + }, + { + "epoch": 3.4354763459923485, + "grad_norm": NaN, + "learning_rate": 0.0001224213904420419, + "loss": 0.0, + "step": 36818 + }, + { + "epoch": 3.435569655687226, + "grad_norm": NaN, + "learning_rate": 0.00012241395563305165, + "loss": 0.0, + "step": 36819 + }, + { + "epoch": 3.4356629653821034, + "grad_norm": NaN, + "learning_rate": 0.00012240652089420449, + "loss": 0.0, + "step": 36820 + }, + { + "epoch": 3.4357562750769803, + "grad_norm": NaN, + "learning_rate": 0.00012239908622551946, + "loss": 0.0, + "step": 36821 + }, + { + "epoch": 3.4358495847718578, + "grad_norm": NaN, + "learning_rate": 0.00012239165162701533, + "loss": 0.0, + "step": 36822 + }, + { + "epoch": 3.435942894466735, + "grad_norm": NaN, + "learning_rate": 0.000122384217098711, + "loss": 0.0, + "step": 36823 + }, + { + "epoch": 3.4360362041616126, + "grad_norm": NaN, + "learning_rate": 0.00012237678264062553, + "loss": 0.0, + "step": 36824 + }, + { + "epoch": 3.4361295138564896, + "grad_norm": NaN, + "learning_rate": 0.00012236934825277768, + "loss": 0.0, + "step": 36825 + }, + { + "epoch": 3.436222823551367, + "grad_norm": NaN, + "learning_rate": 0.00012236191393518635, + "loss": 0.0, + "step": 36826 + }, + { + "epoch": 3.4363161332462444, + "grad_norm": NaN, + "learning_rate": 0.00012235447968787056, + "loss": 0.0, + "step": 36827 + }, + { + "epoch": 3.4364094429411214, + "grad_norm": NaN, + "learning_rate": 0.0001223470455108491, + "loss": 0.0, + "step": 36828 + }, + { + "epoch": 3.436502752635999, + "grad_norm": NaN, + "learning_rate": 0.0001223396114041409, + "loss": 0.0, + "step": 36829 + }, + { + "epoch": 3.4365960623308762, + "grad_norm": NaN, + "learning_rate": 0.0001223321773677649, + "loss": 0.0, + "step": 36830 + }, + { + "epoch": 3.4366893720257536, + "grad_norm": NaN, + "learning_rate": 0.00012232474340173998, + "loss": 0.0, + "step": 36831 + }, + { + "epoch": 3.4367826817206306, + "grad_norm": NaN, + "learning_rate": 0.00012231730950608497, + "loss": 0.0, + "step": 36832 + }, + { + "epoch": 3.436875991415508, + "grad_norm": NaN, + "learning_rate": 0.00012230987568081892, + "loss": 0.0, + "step": 36833 + }, + { + "epoch": 3.4369693011103855, + "grad_norm": NaN, + "learning_rate": 0.0001223024419259606, + "loss": 0.0, + "step": 36834 + }, + { + "epoch": 3.4370626108052624, + "grad_norm": NaN, + "learning_rate": 0.000122295008241529, + "loss": 0.0, + "step": 36835 + }, + { + "epoch": 3.43715592050014, + "grad_norm": NaN, + "learning_rate": 0.000122287574627543, + "loss": 0.0, + "step": 36836 + }, + { + "epoch": 3.4372492301950173, + "grad_norm": NaN, + "learning_rate": 0.00012228014108402145, + "loss": 0.0, + "step": 36837 + }, + { + "epoch": 3.4373425398898947, + "grad_norm": NaN, + "learning_rate": 0.0001222727076109833, + "loss": 0.0, + "step": 36838 + }, + { + "epoch": 3.437435849584772, + "grad_norm": NaN, + "learning_rate": 0.0001222652742084475, + "loss": 0.0, + "step": 36839 + }, + { + "epoch": 3.437529159279649, + "grad_norm": NaN, + "learning_rate": 0.00012225784087643282, + "loss": 0.0, + "step": 36840 + }, + { + "epoch": 3.4376224689745265, + "grad_norm": NaN, + "learning_rate": 0.00012225040761495824, + "loss": 0.0, + "step": 36841 + }, + { + "epoch": 3.437715778669404, + "grad_norm": NaN, + "learning_rate": 0.0001222429744240427, + "loss": 0.0, + "step": 36842 + }, + { + "epoch": 3.437809088364281, + "grad_norm": NaN, + "learning_rate": 0.00012223554130370498, + "loss": 0.0, + "step": 36843 + }, + { + "epoch": 3.4379023980591583, + "grad_norm": NaN, + "learning_rate": 0.0001222281082539641, + "loss": 0.0, + "step": 36844 + }, + { + "epoch": 3.4379957077540357, + "grad_norm": NaN, + "learning_rate": 0.00012222067527483895, + "loss": 0.0, + "step": 36845 + }, + { + "epoch": 3.438089017448913, + "grad_norm": NaN, + "learning_rate": 0.00012221324236634832, + "loss": 0.0, + "step": 36846 + }, + { + "epoch": 3.43818232714379, + "grad_norm": NaN, + "learning_rate": 0.00012220580952851124, + "loss": 0.0, + "step": 36847 + }, + { + "epoch": 3.4382756368386675, + "grad_norm": NaN, + "learning_rate": 0.00012219837676134655, + "loss": 0.0, + "step": 36848 + }, + { + "epoch": 3.438368946533545, + "grad_norm": NaN, + "learning_rate": 0.0001221909440648731, + "loss": 0.0, + "step": 36849 + }, + { + "epoch": 3.438462256228422, + "grad_norm": NaN, + "learning_rate": 0.00012218351143910986, + "loss": 0.0, + "step": 36850 + }, + { + "epoch": 3.4385555659232994, + "grad_norm": NaN, + "learning_rate": 0.00012217607888407578, + "loss": 0.0, + "step": 36851 + }, + { + "epoch": 3.438648875618177, + "grad_norm": NaN, + "learning_rate": 0.0001221686463997896, + "loss": 0.0, + "step": 36852 + }, + { + "epoch": 3.438742185313054, + "grad_norm": NaN, + "learning_rate": 0.00012216121398627032, + "loss": 0.0, + "step": 36853 + }, + { + "epoch": 3.438835495007931, + "grad_norm": NaN, + "learning_rate": 0.0001221537816435369, + "loss": 0.0, + "step": 36854 + }, + { + "epoch": 3.4389288047028086, + "grad_norm": NaN, + "learning_rate": 0.0001221463493716081, + "loss": 0.0, + "step": 36855 + }, + { + "epoch": 3.439022114397686, + "grad_norm": NaN, + "learning_rate": 0.00012213891717050292, + "loss": 0.0, + "step": 36856 + }, + { + "epoch": 3.439115424092563, + "grad_norm": NaN, + "learning_rate": 0.00012213148504024022, + "loss": 0.0, + "step": 36857 + }, + { + "epoch": 3.4392087337874404, + "grad_norm": NaN, + "learning_rate": 0.00012212405298083883, + "loss": 0.0, + "step": 36858 + }, + { + "epoch": 3.439302043482318, + "grad_norm": NaN, + "learning_rate": 0.00012211662099231778, + "loss": 0.0, + "step": 36859 + }, + { + "epoch": 3.4393953531771952, + "grad_norm": NaN, + "learning_rate": 0.0001221091890746959, + "loss": 0.0, + "step": 36860 + }, + { + "epoch": 3.4394886628720722, + "grad_norm": NaN, + "learning_rate": 0.00012210175722799204, + "loss": 0.0, + "step": 36861 + }, + { + "epoch": 3.4395819725669496, + "grad_norm": NaN, + "learning_rate": 0.0001220943254522252, + "loss": 0.0, + "step": 36862 + }, + { + "epoch": 3.439675282261827, + "grad_norm": NaN, + "learning_rate": 0.00012208689374741422, + "loss": 0.0, + "step": 36863 + }, + { + "epoch": 3.4397685919567045, + "grad_norm": NaN, + "learning_rate": 0.00012207946211357793, + "loss": 0.0, + "step": 36864 + }, + { + "epoch": 3.4398619016515815, + "grad_norm": NaN, + "learning_rate": 0.0001220720305507354, + "loss": 0.0, + "step": 36865 + }, + { + "epoch": 3.439955211346459, + "grad_norm": NaN, + "learning_rate": 0.00012206459905890539, + "loss": 0.0, + "step": 36866 + }, + { + "epoch": 3.4400485210413363, + "grad_norm": NaN, + "learning_rate": 0.00012205716763810678, + "loss": 0.0, + "step": 36867 + }, + { + "epoch": 3.4401418307362137, + "grad_norm": NaN, + "learning_rate": 0.00012204973628835858, + "loss": 0.0, + "step": 36868 + }, + { + "epoch": 3.4402351404310907, + "grad_norm": NaN, + "learning_rate": 0.00012204230500967956, + "loss": 0.0, + "step": 36869 + }, + { + "epoch": 3.440328450125968, + "grad_norm": NaN, + "learning_rate": 0.00012203487380208871, + "loss": 0.0, + "step": 36870 + }, + { + "epoch": 3.4404217598208455, + "grad_norm": NaN, + "learning_rate": 0.00012202744266560494, + "loss": 0.0, + "step": 36871 + }, + { + "epoch": 3.4405150695157225, + "grad_norm": NaN, + "learning_rate": 0.00012202001160024702, + "loss": 0.0, + "step": 36872 + }, + { + "epoch": 3.4406083792106, + "grad_norm": NaN, + "learning_rate": 0.00012201258060603395, + "loss": 0.0, + "step": 36873 + }, + { + "epoch": 3.4407016889054773, + "grad_norm": NaN, + "learning_rate": 0.00012200514968298463, + "loss": 0.0, + "step": 36874 + }, + { + "epoch": 3.4407949986003548, + "grad_norm": NaN, + "learning_rate": 0.00012199771883111786, + "loss": 0.0, + "step": 36875 + }, + { + "epoch": 3.4408883082952317, + "grad_norm": NaN, + "learning_rate": 0.00012199028805045263, + "loss": 0.0, + "step": 36876 + }, + { + "epoch": 3.440981617990109, + "grad_norm": NaN, + "learning_rate": 0.00012198285734100782, + "loss": 0.0, + "step": 36877 + }, + { + "epoch": 3.4410749276849866, + "grad_norm": NaN, + "learning_rate": 0.00012197542670280225, + "loss": 0.0, + "step": 36878 + }, + { + "epoch": 3.4411682373798635, + "grad_norm": NaN, + "learning_rate": 0.00012196799613585491, + "loss": 0.0, + "step": 36879 + }, + { + "epoch": 3.441261547074741, + "grad_norm": NaN, + "learning_rate": 0.00012196056564018467, + "loss": 0.0, + "step": 36880 + }, + { + "epoch": 3.4413548567696184, + "grad_norm": NaN, + "learning_rate": 0.00012195313521581035, + "loss": 0.0, + "step": 36881 + }, + { + "epoch": 3.441448166464496, + "grad_norm": NaN, + "learning_rate": 0.00012194570486275093, + "loss": 0.0, + "step": 36882 + }, + { + "epoch": 3.441541476159373, + "grad_norm": NaN, + "learning_rate": 0.0001219382745810253, + "loss": 0.0, + "step": 36883 + }, + { + "epoch": 3.44163478585425, + "grad_norm": NaN, + "learning_rate": 0.00012193084437065226, + "loss": 0.0, + "step": 36884 + }, + { + "epoch": 3.4417280955491276, + "grad_norm": NaN, + "learning_rate": 0.0001219234142316508, + "loss": 0.0, + "step": 36885 + }, + { + "epoch": 3.441821405244005, + "grad_norm": NaN, + "learning_rate": 0.00012191598416403981, + "loss": 0.0, + "step": 36886 + }, + { + "epoch": 3.441914714938882, + "grad_norm": NaN, + "learning_rate": 0.00012190855416783811, + "loss": 0.0, + "step": 36887 + }, + { + "epoch": 3.4420080246337594, + "grad_norm": NaN, + "learning_rate": 0.00012190112424306465, + "loss": 0.0, + "step": 36888 + }, + { + "epoch": 3.442101334328637, + "grad_norm": NaN, + "learning_rate": 0.00012189369438973836, + "loss": 0.0, + "step": 36889 + }, + { + "epoch": 3.4421946440235143, + "grad_norm": NaN, + "learning_rate": 0.00012188626460787797, + "loss": 0.0, + "step": 36890 + }, + { + "epoch": 3.4422879537183912, + "grad_norm": NaN, + "learning_rate": 0.00012187883489750255, + "loss": 0.0, + "step": 36891 + }, + { + "epoch": 3.4423812634132687, + "grad_norm": NaN, + "learning_rate": 0.00012187140525863095, + "loss": 0.0, + "step": 36892 + }, + { + "epoch": 3.442474573108146, + "grad_norm": NaN, + "learning_rate": 0.00012186397569128197, + "loss": 0.0, + "step": 36893 + }, + { + "epoch": 3.442567882803023, + "grad_norm": NaN, + "learning_rate": 0.00012185654619547458, + "loss": 0.0, + "step": 36894 + }, + { + "epoch": 3.4426611924979005, + "grad_norm": NaN, + "learning_rate": 0.0001218491167712277, + "loss": 0.0, + "step": 36895 + }, + { + "epoch": 3.442754502192778, + "grad_norm": NaN, + "learning_rate": 0.00012184168741856012, + "loss": 0.0, + "step": 36896 + }, + { + "epoch": 3.4428478118876553, + "grad_norm": NaN, + "learning_rate": 0.00012183425813749082, + "loss": 0.0, + "step": 36897 + }, + { + "epoch": 3.4429411215825323, + "grad_norm": NaN, + "learning_rate": 0.00012182682892803867, + "loss": 0.0, + "step": 36898 + }, + { + "epoch": 3.4430344312774097, + "grad_norm": NaN, + "learning_rate": 0.00012181939979022251, + "loss": 0.0, + "step": 36899 + }, + { + "epoch": 3.443127740972287, + "grad_norm": NaN, + "learning_rate": 0.00012181197072406132, + "loss": 0.0, + "step": 36900 + }, + { + "epoch": 3.443221050667164, + "grad_norm": NaN, + "learning_rate": 0.00012180454172957391, + "loss": 0.0, + "step": 36901 + }, + { + "epoch": 3.4433143603620415, + "grad_norm": NaN, + "learning_rate": 0.00012179711280677916, + "loss": 0.0, + "step": 36902 + }, + { + "epoch": 3.443407670056919, + "grad_norm": NaN, + "learning_rate": 0.00012178968395569607, + "loss": 0.0, + "step": 36903 + }, + { + "epoch": 3.4435009797517964, + "grad_norm": NaN, + "learning_rate": 0.00012178225517634341, + "loss": 0.0, + "step": 36904 + }, + { + "epoch": 3.4435942894466733, + "grad_norm": NaN, + "learning_rate": 0.0001217748264687401, + "loss": 0.0, + "step": 36905 + }, + { + "epoch": 3.4436875991415508, + "grad_norm": NaN, + "learning_rate": 0.00012176739783290512, + "loss": 0.0, + "step": 36906 + }, + { + "epoch": 3.443780908836428, + "grad_norm": NaN, + "learning_rate": 0.0001217599692688572, + "loss": 0.0, + "step": 36907 + }, + { + "epoch": 3.443874218531305, + "grad_norm": NaN, + "learning_rate": 0.00012175254077661534, + "loss": 0.0, + "step": 36908 + }, + { + "epoch": 3.4439675282261826, + "grad_norm": NaN, + "learning_rate": 0.00012174511235619845, + "loss": 0.0, + "step": 36909 + }, + { + "epoch": 3.44406083792106, + "grad_norm": NaN, + "learning_rate": 0.0001217376840076253, + "loss": 0.0, + "step": 36910 + }, + { + "epoch": 3.4441541476159374, + "grad_norm": NaN, + "learning_rate": 0.00012173025573091488, + "loss": 0.0, + "step": 36911 + }, + { + "epoch": 3.444247457310815, + "grad_norm": NaN, + "learning_rate": 0.00012172282752608608, + "loss": 0.0, + "step": 36912 + }, + { + "epoch": 3.444340767005692, + "grad_norm": NaN, + "learning_rate": 0.00012171539939315769, + "loss": 0.0, + "step": 36913 + }, + { + "epoch": 3.4444340767005692, + "grad_norm": NaN, + "learning_rate": 0.00012170797133214867, + "loss": 0.0, + "step": 36914 + }, + { + "epoch": 3.4445273863954466, + "grad_norm": NaN, + "learning_rate": 0.00012170054334307796, + "loss": 0.0, + "step": 36915 + }, + { + "epoch": 3.4446206960903236, + "grad_norm": NaN, + "learning_rate": 0.00012169311542596432, + "loss": 0.0, + "step": 36916 + }, + { + "epoch": 3.444714005785201, + "grad_norm": NaN, + "learning_rate": 0.00012168568758082672, + "loss": 0.0, + "step": 36917 + }, + { + "epoch": 3.4448073154800785, + "grad_norm": NaN, + "learning_rate": 0.00012167825980768407, + "loss": 0.0, + "step": 36918 + }, + { + "epoch": 3.444900625174956, + "grad_norm": NaN, + "learning_rate": 0.00012167083210655513, + "loss": 0.0, + "step": 36919 + }, + { + "epoch": 3.444993934869833, + "grad_norm": NaN, + "learning_rate": 0.00012166340447745892, + "loss": 0.0, + "step": 36920 + }, + { + "epoch": 3.4450872445647103, + "grad_norm": NaN, + "learning_rate": 0.00012165597692041431, + "loss": 0.0, + "step": 36921 + }, + { + "epoch": 3.4451805542595877, + "grad_norm": NaN, + "learning_rate": 0.0001216485494354401, + "loss": 0.0, + "step": 36922 + }, + { + "epoch": 3.4452738639544647, + "grad_norm": NaN, + "learning_rate": 0.00012164112202255525, + "loss": 0.0, + "step": 36923 + }, + { + "epoch": 3.445367173649342, + "grad_norm": NaN, + "learning_rate": 0.00012163369468177867, + "loss": 0.0, + "step": 36924 + }, + { + "epoch": 3.4454604833442195, + "grad_norm": NaN, + "learning_rate": 0.00012162626741312913, + "loss": 0.0, + "step": 36925 + }, + { + "epoch": 3.445553793039097, + "grad_norm": NaN, + "learning_rate": 0.00012161884021662562, + "loss": 0.0, + "step": 36926 + }, + { + "epoch": 3.445647102733974, + "grad_norm": NaN, + "learning_rate": 0.00012161141309228702, + "loss": 0.0, + "step": 36927 + }, + { + "epoch": 3.4457404124288513, + "grad_norm": NaN, + "learning_rate": 0.00012160398604013213, + "loss": 0.0, + "step": 36928 + }, + { + "epoch": 3.4458337221237287, + "grad_norm": NaN, + "learning_rate": 0.00012159655906017992, + "loss": 0.0, + "step": 36929 + }, + { + "epoch": 3.4459270318186057, + "grad_norm": NaN, + "learning_rate": 0.00012158913215244928, + "loss": 0.0, + "step": 36930 + }, + { + "epoch": 3.446020341513483, + "grad_norm": NaN, + "learning_rate": 0.000121581705316959, + "loss": 0.0, + "step": 36931 + }, + { + "epoch": 3.4461136512083606, + "grad_norm": NaN, + "learning_rate": 0.00012157427855372806, + "loss": 0.0, + "step": 36932 + }, + { + "epoch": 3.446206960903238, + "grad_norm": NaN, + "learning_rate": 0.00012156685186277533, + "loss": 0.0, + "step": 36933 + }, + { + "epoch": 3.4463002705981154, + "grad_norm": NaN, + "learning_rate": 0.00012155942524411961, + "loss": 0.0, + "step": 36934 + }, + { + "epoch": 3.4463935802929924, + "grad_norm": NaN, + "learning_rate": 0.00012155199869777989, + "loss": 0.0, + "step": 36935 + }, + { + "epoch": 3.44648688998787, + "grad_norm": NaN, + "learning_rate": 0.00012154457222377504, + "loss": 0.0, + "step": 36936 + }, + { + "epoch": 3.446580199682747, + "grad_norm": NaN, + "learning_rate": 0.00012153714582212384, + "loss": 0.0, + "step": 36937 + }, + { + "epoch": 3.446673509377624, + "grad_norm": NaN, + "learning_rate": 0.00012152971949284529, + "loss": 0.0, + "step": 36938 + }, + { + "epoch": 3.4467668190725016, + "grad_norm": NaN, + "learning_rate": 0.00012152229323595825, + "loss": 0.0, + "step": 36939 + }, + { + "epoch": 3.446860128767379, + "grad_norm": NaN, + "learning_rate": 0.00012151486705148152, + "loss": 0.0, + "step": 36940 + }, + { + "epoch": 3.4469534384622564, + "grad_norm": NaN, + "learning_rate": 0.00012150744093943405, + "loss": 0.0, + "step": 36941 + }, + { + "epoch": 3.4470467481571334, + "grad_norm": NaN, + "learning_rate": 0.00012150001489983481, + "loss": 0.0, + "step": 36942 + }, + { + "epoch": 3.447140057852011, + "grad_norm": NaN, + "learning_rate": 0.0001214925889327025, + "loss": 0.0, + "step": 36943 + }, + { + "epoch": 3.4472333675468882, + "grad_norm": NaN, + "learning_rate": 0.00012148516303805615, + "loss": 0.0, + "step": 36944 + }, + { + "epoch": 3.4473266772417652, + "grad_norm": NaN, + "learning_rate": 0.00012147773721591452, + "loss": 0.0, + "step": 36945 + }, + { + "epoch": 3.4474199869366426, + "grad_norm": NaN, + "learning_rate": 0.0001214703114662966, + "loss": 0.0, + "step": 36946 + }, + { + "epoch": 3.44751329663152, + "grad_norm": NaN, + "learning_rate": 0.00012146288578922125, + "loss": 0.0, + "step": 36947 + }, + { + "epoch": 3.4476066063263975, + "grad_norm": NaN, + "learning_rate": 0.00012145546018470726, + "loss": 0.0, + "step": 36948 + }, + { + "epoch": 3.4476999160212745, + "grad_norm": NaN, + "learning_rate": 0.00012144803465277362, + "loss": 0.0, + "step": 36949 + }, + { + "epoch": 3.447793225716152, + "grad_norm": NaN, + "learning_rate": 0.0001214406091934392, + "loss": 0.0, + "step": 36950 + }, + { + "epoch": 3.4478865354110293, + "grad_norm": NaN, + "learning_rate": 0.00012143318380672279, + "loss": 0.0, + "step": 36951 + }, + { + "epoch": 3.4479798451059063, + "grad_norm": NaN, + "learning_rate": 0.00012142575849264335, + "loss": 0.0, + "step": 36952 + }, + { + "epoch": 3.4480731548007837, + "grad_norm": NaN, + "learning_rate": 0.00012141833325121977, + "loss": 0.0, + "step": 36953 + }, + { + "epoch": 3.448166464495661, + "grad_norm": NaN, + "learning_rate": 0.00012141090808247086, + "loss": 0.0, + "step": 36954 + }, + { + "epoch": 3.4482597741905385, + "grad_norm": NaN, + "learning_rate": 0.00012140348298641555, + "loss": 0.0, + "step": 36955 + }, + { + "epoch": 3.4483530838854155, + "grad_norm": NaN, + "learning_rate": 0.00012139605796307275, + "loss": 0.0, + "step": 36956 + }, + { + "epoch": 3.448446393580293, + "grad_norm": NaN, + "learning_rate": 0.00012138863301246127, + "loss": 0.0, + "step": 36957 + }, + { + "epoch": 3.4485397032751703, + "grad_norm": NaN, + "learning_rate": 0.00012138120813460002, + "loss": 0.0, + "step": 36958 + }, + { + "epoch": 3.4486330129700478, + "grad_norm": NaN, + "learning_rate": 0.00012137378332950793, + "loss": 0.0, + "step": 36959 + }, + { + "epoch": 3.4487263226649247, + "grad_norm": NaN, + "learning_rate": 0.00012136635859720375, + "loss": 0.0, + "step": 36960 + }, + { + "epoch": 3.448819632359802, + "grad_norm": NaN, + "learning_rate": 0.00012135893393770647, + "loss": 0.0, + "step": 36961 + }, + { + "epoch": 3.4489129420546796, + "grad_norm": NaN, + "learning_rate": 0.00012135150935103497, + "loss": 0.0, + "step": 36962 + }, + { + "epoch": 3.449006251749557, + "grad_norm": NaN, + "learning_rate": 0.00012134408483720803, + "loss": 0.0, + "step": 36963 + }, + { + "epoch": 3.449099561444434, + "grad_norm": NaN, + "learning_rate": 0.00012133666039624464, + "loss": 0.0, + "step": 36964 + }, + { + "epoch": 3.4491928711393114, + "grad_norm": NaN, + "learning_rate": 0.00012132923602816365, + "loss": 0.0, + "step": 36965 + }, + { + "epoch": 3.449286180834189, + "grad_norm": NaN, + "learning_rate": 0.00012132181173298385, + "loss": 0.0, + "step": 36966 + }, + { + "epoch": 3.449379490529066, + "grad_norm": NaN, + "learning_rate": 0.00012131438751072423, + "loss": 0.0, + "step": 36967 + }, + { + "epoch": 3.449472800223943, + "grad_norm": NaN, + "learning_rate": 0.00012130696336140368, + "loss": 0.0, + "step": 36968 + }, + { + "epoch": 3.4495661099188206, + "grad_norm": NaN, + "learning_rate": 0.00012129953928504092, + "loss": 0.0, + "step": 36969 + }, + { + "epoch": 3.449659419613698, + "grad_norm": NaN, + "learning_rate": 0.000121292115281655, + "loss": 0.0, + "step": 36970 + }, + { + "epoch": 3.449752729308575, + "grad_norm": NaN, + "learning_rate": 0.00012128469135126472, + "loss": 0.0, + "step": 36971 + }, + { + "epoch": 3.4498460390034524, + "grad_norm": NaN, + "learning_rate": 0.00012127726749388891, + "loss": 0.0, + "step": 36972 + }, + { + "epoch": 3.44993934869833, + "grad_norm": NaN, + "learning_rate": 0.00012126984370954654, + "loss": 0.0, + "step": 36973 + }, + { + "epoch": 3.450032658393207, + "grad_norm": NaN, + "learning_rate": 0.00012126241999825648, + "loss": 0.0, + "step": 36974 + }, + { + "epoch": 3.4501259680880842, + "grad_norm": NaN, + "learning_rate": 0.00012125499636003751, + "loss": 0.0, + "step": 36975 + }, + { + "epoch": 3.4502192777829617, + "grad_norm": NaN, + "learning_rate": 0.0001212475727949086, + "loss": 0.0, + "step": 36976 + }, + { + "epoch": 3.450312587477839, + "grad_norm": NaN, + "learning_rate": 0.00012124014930288863, + "loss": 0.0, + "step": 36977 + }, + { + "epoch": 3.450405897172716, + "grad_norm": NaN, + "learning_rate": 0.00012123272588399636, + "loss": 0.0, + "step": 36978 + }, + { + "epoch": 3.4504992068675935, + "grad_norm": NaN, + "learning_rate": 0.00012122530253825078, + "loss": 0.0, + "step": 36979 + }, + { + "epoch": 3.450592516562471, + "grad_norm": NaN, + "learning_rate": 0.0001212178792656708, + "loss": 0.0, + "step": 36980 + }, + { + "epoch": 3.4506858262573483, + "grad_norm": NaN, + "learning_rate": 0.00012121045606627514, + "loss": 0.0, + "step": 36981 + }, + { + "epoch": 3.4507791359522253, + "grad_norm": NaN, + "learning_rate": 0.00012120303294008276, + "loss": 0.0, + "step": 36982 + }, + { + "epoch": 3.4508724456471027, + "grad_norm": NaN, + "learning_rate": 0.0001211956098871126, + "loss": 0.0, + "step": 36983 + }, + { + "epoch": 3.45096575534198, + "grad_norm": NaN, + "learning_rate": 0.00012118818690738344, + "loss": 0.0, + "step": 36984 + }, + { + "epoch": 3.4510590650368576, + "grad_norm": NaN, + "learning_rate": 0.00012118076400091417, + "loss": 0.0, + "step": 36985 + }, + { + "epoch": 3.4511523747317345, + "grad_norm": NaN, + "learning_rate": 0.00012117334116772372, + "loss": 0.0, + "step": 36986 + }, + { + "epoch": 3.451245684426612, + "grad_norm": NaN, + "learning_rate": 0.0001211659184078309, + "loss": 0.0, + "step": 36987 + }, + { + "epoch": 3.4513389941214894, + "grad_norm": NaN, + "learning_rate": 0.00012115849572125457, + "loss": 0.0, + "step": 36988 + }, + { + "epoch": 3.4514323038163663, + "grad_norm": NaN, + "learning_rate": 0.00012115107310801372, + "loss": 0.0, + "step": 36989 + }, + { + "epoch": 3.4515256135112438, + "grad_norm": NaN, + "learning_rate": 0.00012114365056812711, + "loss": 0.0, + "step": 36990 + }, + { + "epoch": 3.451618923206121, + "grad_norm": NaN, + "learning_rate": 0.00012113622810161367, + "loss": 0.0, + "step": 36991 + }, + { + "epoch": 3.4517122329009986, + "grad_norm": NaN, + "learning_rate": 0.0001211288057084922, + "loss": 0.0, + "step": 36992 + }, + { + "epoch": 3.4518055425958756, + "grad_norm": NaN, + "learning_rate": 0.00012112138338878165, + "loss": 0.0, + "step": 36993 + }, + { + "epoch": 3.451898852290753, + "grad_norm": NaN, + "learning_rate": 0.00012111396114250092, + "loss": 0.0, + "step": 36994 + }, + { + "epoch": 3.4519921619856304, + "grad_norm": NaN, + "learning_rate": 0.00012110653896966874, + "loss": 0.0, + "step": 36995 + }, + { + "epoch": 3.4520854716805074, + "grad_norm": NaN, + "learning_rate": 0.0001210991168703041, + "loss": 0.0, + "step": 36996 + }, + { + "epoch": 3.452178781375385, + "grad_norm": NaN, + "learning_rate": 0.00012109169484442589, + "loss": 0.0, + "step": 36997 + }, + { + "epoch": 3.4522720910702622, + "grad_norm": NaN, + "learning_rate": 0.00012108427289205289, + "loss": 0.0, + "step": 36998 + }, + { + "epoch": 3.4523654007651396, + "grad_norm": NaN, + "learning_rate": 0.00012107685101320403, + "loss": 0.0, + "step": 36999 + }, + { + "epoch": 3.4524587104600166, + "grad_norm": NaN, + "learning_rate": 0.00012106942920789819, + "loss": 0.0, + "step": 37000 + }, + { + "epoch": 3.452552020154894, + "grad_norm": NaN, + "learning_rate": 0.00012106200747615419, + "loss": 0.0, + "step": 37001 + }, + { + "epoch": 3.4526453298497715, + "grad_norm": NaN, + "learning_rate": 0.00012105458581799094, + "loss": 0.0, + "step": 37002 + }, + { + "epoch": 3.452738639544649, + "grad_norm": NaN, + "learning_rate": 0.00012104716423342735, + "loss": 0.0, + "step": 37003 + }, + { + "epoch": 3.452831949239526, + "grad_norm": NaN, + "learning_rate": 0.00012103974272248218, + "loss": 0.0, + "step": 37004 + }, + { + "epoch": 3.4529252589344033, + "grad_norm": NaN, + "learning_rate": 0.0001210323212851744, + "loss": 0.0, + "step": 37005 + }, + { + "epoch": 3.4530185686292807, + "grad_norm": NaN, + "learning_rate": 0.00012102489992152287, + "loss": 0.0, + "step": 37006 + }, + { + "epoch": 3.453111878324158, + "grad_norm": NaN, + "learning_rate": 0.00012101747863154637, + "loss": 0.0, + "step": 37007 + }, + { + "epoch": 3.453205188019035, + "grad_norm": NaN, + "learning_rate": 0.00012101005741526386, + "loss": 0.0, + "step": 37008 + }, + { + "epoch": 3.4532984977139125, + "grad_norm": NaN, + "learning_rate": 0.00012100263627269423, + "loss": 0.0, + "step": 37009 + }, + { + "epoch": 3.45339180740879, + "grad_norm": NaN, + "learning_rate": 0.00012099521520385624, + "loss": 0.0, + "step": 37010 + }, + { + "epoch": 3.453485117103667, + "grad_norm": NaN, + "learning_rate": 0.00012098779420876887, + "loss": 0.0, + "step": 37011 + }, + { + "epoch": 3.4535784267985443, + "grad_norm": NaN, + "learning_rate": 0.00012098037328745097, + "loss": 0.0, + "step": 37012 + }, + { + "epoch": 3.4536717364934217, + "grad_norm": NaN, + "learning_rate": 0.0001209729524399213, + "loss": 0.0, + "step": 37013 + }, + { + "epoch": 3.453765046188299, + "grad_norm": NaN, + "learning_rate": 0.00012096553166619888, + "loss": 0.0, + "step": 37014 + }, + { + "epoch": 3.453858355883176, + "grad_norm": NaN, + "learning_rate": 0.00012095811096630253, + "loss": 0.0, + "step": 37015 + }, + { + "epoch": 3.4539516655780536, + "grad_norm": NaN, + "learning_rate": 0.00012095069034025103, + "loss": 0.0, + "step": 37016 + }, + { + "epoch": 3.454044975272931, + "grad_norm": NaN, + "learning_rate": 0.00012094326978806332, + "loss": 0.0, + "step": 37017 + }, + { + "epoch": 3.454138284967808, + "grad_norm": NaN, + "learning_rate": 0.00012093584930975837, + "loss": 0.0, + "step": 37018 + }, + { + "epoch": 3.4542315946626854, + "grad_norm": NaN, + "learning_rate": 0.00012092842890535484, + "loss": 0.0, + "step": 37019 + }, + { + "epoch": 3.454324904357563, + "grad_norm": NaN, + "learning_rate": 0.00012092100857487172, + "loss": 0.0, + "step": 37020 + }, + { + "epoch": 3.45441821405244, + "grad_norm": NaN, + "learning_rate": 0.00012091358831832792, + "loss": 0.0, + "step": 37021 + }, + { + "epoch": 3.454511523747317, + "grad_norm": NaN, + "learning_rate": 0.0001209061681357422, + "loss": 0.0, + "step": 37022 + }, + { + "epoch": 3.4546048334421946, + "grad_norm": NaN, + "learning_rate": 0.00012089874802713347, + "loss": 0.0, + "step": 37023 + }, + { + "epoch": 3.454698143137072, + "grad_norm": NaN, + "learning_rate": 0.00012089132799252065, + "loss": 0.0, + "step": 37024 + }, + { + "epoch": 3.454791452831949, + "grad_norm": NaN, + "learning_rate": 0.00012088390803192254, + "loss": 0.0, + "step": 37025 + }, + { + "epoch": 3.4548847625268264, + "grad_norm": NaN, + "learning_rate": 0.00012087648814535797, + "loss": 0.0, + "step": 37026 + }, + { + "epoch": 3.454978072221704, + "grad_norm": NaN, + "learning_rate": 0.00012086906833284596, + "loss": 0.0, + "step": 37027 + }, + { + "epoch": 3.4550713819165813, + "grad_norm": NaN, + "learning_rate": 0.00012086164859440523, + "loss": 0.0, + "step": 37028 + }, + { + "epoch": 3.4551646916114587, + "grad_norm": NaN, + "learning_rate": 0.00012085422893005467, + "loss": 0.0, + "step": 37029 + }, + { + "epoch": 3.4552580013063356, + "grad_norm": NaN, + "learning_rate": 0.00012084680933981324, + "loss": 0.0, + "step": 37030 + }, + { + "epoch": 3.455351311001213, + "grad_norm": NaN, + "learning_rate": 0.0001208393898236997, + "loss": 0.0, + "step": 37031 + }, + { + "epoch": 3.4554446206960905, + "grad_norm": NaN, + "learning_rate": 0.00012083197038173292, + "loss": 0.0, + "step": 37032 + }, + { + "epoch": 3.4555379303909675, + "grad_norm": NaN, + "learning_rate": 0.00012082455101393186, + "loss": 0.0, + "step": 37033 + }, + { + "epoch": 3.455631240085845, + "grad_norm": NaN, + "learning_rate": 0.00012081713172031531, + "loss": 0.0, + "step": 37034 + }, + { + "epoch": 3.4557245497807223, + "grad_norm": NaN, + "learning_rate": 0.00012080971250090217, + "loss": 0.0, + "step": 37035 + }, + { + "epoch": 3.4558178594755997, + "grad_norm": NaN, + "learning_rate": 0.00012080229335571121, + "loss": 0.0, + "step": 37036 + }, + { + "epoch": 3.4559111691704767, + "grad_norm": NaN, + "learning_rate": 0.0001207948742847614, + "loss": 0.0, + "step": 37037 + }, + { + "epoch": 3.456004478865354, + "grad_norm": NaN, + "learning_rate": 0.00012078745528807163, + "loss": 0.0, + "step": 37038 + }, + { + "epoch": 3.4560977885602315, + "grad_norm": NaN, + "learning_rate": 0.00012078003636566062, + "loss": 0.0, + "step": 37039 + }, + { + "epoch": 3.4561910982551085, + "grad_norm": NaN, + "learning_rate": 0.00012077261751754737, + "loss": 0.0, + "step": 37040 + }, + { + "epoch": 3.456284407949986, + "grad_norm": NaN, + "learning_rate": 0.00012076519874375072, + "loss": 0.0, + "step": 37041 + }, + { + "epoch": 3.4563777176448633, + "grad_norm": NaN, + "learning_rate": 0.00012075778004428945, + "loss": 0.0, + "step": 37042 + }, + { + "epoch": 3.4564710273397408, + "grad_norm": NaN, + "learning_rate": 0.00012075036141918251, + "loss": 0.0, + "step": 37043 + }, + { + "epoch": 3.4565643370346177, + "grad_norm": NaN, + "learning_rate": 0.00012074294286844878, + "loss": 0.0, + "step": 37044 + }, + { + "epoch": 3.456657646729495, + "grad_norm": NaN, + "learning_rate": 0.000120735524392107, + "loss": 0.0, + "step": 37045 + }, + { + "epoch": 3.4567509564243726, + "grad_norm": NaN, + "learning_rate": 0.00012072810599017615, + "loss": 0.0, + "step": 37046 + }, + { + "epoch": 3.4568442661192496, + "grad_norm": NaN, + "learning_rate": 0.00012072068766267509, + "loss": 0.0, + "step": 37047 + }, + { + "epoch": 3.456937575814127, + "grad_norm": NaN, + "learning_rate": 0.00012071326940962258, + "loss": 0.0, + "step": 37048 + }, + { + "epoch": 3.4570308855090044, + "grad_norm": NaN, + "learning_rate": 0.00012070585123103759, + "loss": 0.0, + "step": 37049 + }, + { + "epoch": 3.457124195203882, + "grad_norm": NaN, + "learning_rate": 0.00012069843312693897, + "loss": 0.0, + "step": 37050 + }, + { + "epoch": 3.4572175048987592, + "grad_norm": NaN, + "learning_rate": 0.00012069101509734551, + "loss": 0.0, + "step": 37051 + }, + { + "epoch": 3.457310814593636, + "grad_norm": NaN, + "learning_rate": 0.0001206835971422761, + "loss": 0.0, + "step": 37052 + }, + { + "epoch": 3.4574041242885136, + "grad_norm": NaN, + "learning_rate": 0.00012067617926174969, + "loss": 0.0, + "step": 37053 + }, + { + "epoch": 3.457497433983391, + "grad_norm": NaN, + "learning_rate": 0.000120668761455785, + "loss": 0.0, + "step": 37054 + }, + { + "epoch": 3.457590743678268, + "grad_norm": NaN, + "learning_rate": 0.00012066134372440095, + "loss": 0.0, + "step": 37055 + }, + { + "epoch": 3.4576840533731454, + "grad_norm": NaN, + "learning_rate": 0.00012065392606761648, + "loss": 0.0, + "step": 37056 + }, + { + "epoch": 3.457777363068023, + "grad_norm": NaN, + "learning_rate": 0.00012064650848545036, + "loss": 0.0, + "step": 37057 + }, + { + "epoch": 3.4578706727629003, + "grad_norm": NaN, + "learning_rate": 0.00012063909097792143, + "loss": 0.0, + "step": 37058 + }, + { + "epoch": 3.4579639824577773, + "grad_norm": NaN, + "learning_rate": 0.00012063167354504866, + "loss": 0.0, + "step": 37059 + }, + { + "epoch": 3.4580572921526547, + "grad_norm": NaN, + "learning_rate": 0.00012062425618685082, + "loss": 0.0, + "step": 37060 + }, + { + "epoch": 3.458150601847532, + "grad_norm": NaN, + "learning_rate": 0.00012061683890334676, + "loss": 0.0, + "step": 37061 + }, + { + "epoch": 3.458243911542409, + "grad_norm": NaN, + "learning_rate": 0.00012060942169455545, + "loss": 0.0, + "step": 37062 + }, + { + "epoch": 3.4583372212372865, + "grad_norm": NaN, + "learning_rate": 0.00012060200456049563, + "loss": 0.0, + "step": 37063 + }, + { + "epoch": 3.458430530932164, + "grad_norm": NaN, + "learning_rate": 0.00012059458750118616, + "loss": 0.0, + "step": 37064 + }, + { + "epoch": 3.4585238406270413, + "grad_norm": NaN, + "learning_rate": 0.00012058717051664603, + "loss": 0.0, + "step": 37065 + }, + { + "epoch": 3.4586171503219183, + "grad_norm": NaN, + "learning_rate": 0.00012057975360689397, + "loss": 0.0, + "step": 37066 + }, + { + "epoch": 3.4587104600167957, + "grad_norm": NaN, + "learning_rate": 0.00012057233677194884, + "loss": 0.0, + "step": 37067 + }, + { + "epoch": 3.458803769711673, + "grad_norm": NaN, + "learning_rate": 0.00012056492001182962, + "loss": 0.0, + "step": 37068 + }, + { + "epoch": 3.45889707940655, + "grad_norm": NaN, + "learning_rate": 0.00012055750332655505, + "loss": 0.0, + "step": 37069 + }, + { + "epoch": 3.4589903891014275, + "grad_norm": NaN, + "learning_rate": 0.00012055008671614401, + "loss": 0.0, + "step": 37070 + }, + { + "epoch": 3.459083698796305, + "grad_norm": NaN, + "learning_rate": 0.00012054267018061542, + "loss": 0.0, + "step": 37071 + }, + { + "epoch": 3.4591770084911824, + "grad_norm": NaN, + "learning_rate": 0.00012053525371998808, + "loss": 0.0, + "step": 37072 + }, + { + "epoch": 3.4592703181860593, + "grad_norm": NaN, + "learning_rate": 0.00012052783733428082, + "loss": 0.0, + "step": 37073 + }, + { + "epoch": 3.4593636278809368, + "grad_norm": NaN, + "learning_rate": 0.0001205204210235126, + "loss": 0.0, + "step": 37074 + }, + { + "epoch": 3.459456937575814, + "grad_norm": NaN, + "learning_rate": 0.00012051300478770218, + "loss": 0.0, + "step": 37075 + }, + { + "epoch": 3.4595502472706916, + "grad_norm": NaN, + "learning_rate": 0.00012050558862686842, + "loss": 0.0, + "step": 37076 + }, + { + "epoch": 3.4596435569655686, + "grad_norm": NaN, + "learning_rate": 0.00012049817254103029, + "loss": 0.0, + "step": 37077 + }, + { + "epoch": 3.459736866660446, + "grad_norm": NaN, + "learning_rate": 0.00012049075653020654, + "loss": 0.0, + "step": 37078 + }, + { + "epoch": 3.4598301763553234, + "grad_norm": NaN, + "learning_rate": 0.00012048334059441607, + "loss": 0.0, + "step": 37079 + }, + { + "epoch": 3.459923486050201, + "grad_norm": NaN, + "learning_rate": 0.00012047592473367768, + "loss": 0.0, + "step": 37080 + }, + { + "epoch": 3.460016795745078, + "grad_norm": NaN, + "learning_rate": 0.00012046850894801027, + "loss": 0.0, + "step": 37081 + }, + { + "epoch": 3.4601101054399552, + "grad_norm": NaN, + "learning_rate": 0.00012046109323743275, + "loss": 0.0, + "step": 37082 + }, + { + "epoch": 3.4602034151348326, + "grad_norm": NaN, + "learning_rate": 0.00012045367760196384, + "loss": 0.0, + "step": 37083 + }, + { + "epoch": 3.4602967248297096, + "grad_norm": NaN, + "learning_rate": 0.00012044626204162252, + "loss": 0.0, + "step": 37084 + }, + { + "epoch": 3.460390034524587, + "grad_norm": NaN, + "learning_rate": 0.00012043884655642762, + "loss": 0.0, + "step": 37085 + }, + { + "epoch": 3.4604833442194645, + "grad_norm": NaN, + "learning_rate": 0.00012043143114639792, + "loss": 0.0, + "step": 37086 + }, + { + "epoch": 3.460576653914342, + "grad_norm": NaN, + "learning_rate": 0.00012042401581155236, + "loss": 0.0, + "step": 37087 + }, + { + "epoch": 3.460669963609219, + "grad_norm": NaN, + "learning_rate": 0.0001204166005519098, + "loss": 0.0, + "step": 37088 + }, + { + "epoch": 3.4607632733040963, + "grad_norm": NaN, + "learning_rate": 0.000120409185367489, + "loss": 0.0, + "step": 37089 + }, + { + "epoch": 3.4608565829989737, + "grad_norm": NaN, + "learning_rate": 0.0001204017702583089, + "loss": 0.0, + "step": 37090 + }, + { + "epoch": 3.4609498926938507, + "grad_norm": NaN, + "learning_rate": 0.00012039435522438838, + "loss": 0.0, + "step": 37091 + }, + { + "epoch": 3.461043202388728, + "grad_norm": NaN, + "learning_rate": 0.00012038694026574617, + "loss": 0.0, + "step": 37092 + }, + { + "epoch": 3.4611365120836055, + "grad_norm": NaN, + "learning_rate": 0.00012037952538240119, + "loss": 0.0, + "step": 37093 + }, + { + "epoch": 3.461229821778483, + "grad_norm": NaN, + "learning_rate": 0.00012037211057437239, + "loss": 0.0, + "step": 37094 + }, + { + "epoch": 3.46132313147336, + "grad_norm": NaN, + "learning_rate": 0.0001203646958416785, + "loss": 0.0, + "step": 37095 + }, + { + "epoch": 3.4614164411682373, + "grad_norm": NaN, + "learning_rate": 0.00012035728118433837, + "loss": 0.0, + "step": 37096 + }, + { + "epoch": 3.4615097508631147, + "grad_norm": NaN, + "learning_rate": 0.00012034986660237094, + "loss": 0.0, + "step": 37097 + }, + { + "epoch": 3.461603060557992, + "grad_norm": NaN, + "learning_rate": 0.00012034245209579501, + "loss": 0.0, + "step": 37098 + }, + { + "epoch": 3.461696370252869, + "grad_norm": NaN, + "learning_rate": 0.0001203350376646294, + "loss": 0.0, + "step": 37099 + }, + { + "epoch": 3.4617896799477466, + "grad_norm": NaN, + "learning_rate": 0.00012032762330889305, + "loss": 0.0, + "step": 37100 + }, + { + "epoch": 3.461882989642624, + "grad_norm": NaN, + "learning_rate": 0.00012032020902860475, + "loss": 0.0, + "step": 37101 + }, + { + "epoch": 3.4619762993375014, + "grad_norm": NaN, + "learning_rate": 0.00012031279482378334, + "loss": 0.0, + "step": 37102 + }, + { + "epoch": 3.4620696090323784, + "grad_norm": NaN, + "learning_rate": 0.00012030538069444775, + "loss": 0.0, + "step": 37103 + }, + { + "epoch": 3.462162918727256, + "grad_norm": NaN, + "learning_rate": 0.00012029796664061674, + "loss": 0.0, + "step": 37104 + }, + { + "epoch": 3.462256228422133, + "grad_norm": NaN, + "learning_rate": 0.0001202905526623092, + "loss": 0.0, + "step": 37105 + }, + { + "epoch": 3.46234953811701, + "grad_norm": NaN, + "learning_rate": 0.00012028313875954403, + "loss": 0.0, + "step": 37106 + }, + { + "epoch": 3.4624428478118876, + "grad_norm": NaN, + "learning_rate": 0.00012027572493233998, + "loss": 0.0, + "step": 37107 + }, + { + "epoch": 3.462536157506765, + "grad_norm": NaN, + "learning_rate": 0.00012026831118071597, + "loss": 0.0, + "step": 37108 + }, + { + "epoch": 3.4626294672016424, + "grad_norm": NaN, + "learning_rate": 0.00012026089750469086, + "loss": 0.0, + "step": 37109 + }, + { + "epoch": 3.4627227768965194, + "grad_norm": NaN, + "learning_rate": 0.00012025348390428348, + "loss": 0.0, + "step": 37110 + }, + { + "epoch": 3.462816086591397, + "grad_norm": NaN, + "learning_rate": 0.00012024607037951263, + "loss": 0.0, + "step": 37111 + }, + { + "epoch": 3.4629093962862743, + "grad_norm": NaN, + "learning_rate": 0.00012023865693039727, + "loss": 0.0, + "step": 37112 + }, + { + "epoch": 3.4630027059811512, + "grad_norm": NaN, + "learning_rate": 0.00012023124355695619, + "loss": 0.0, + "step": 37113 + }, + { + "epoch": 3.4630960156760286, + "grad_norm": NaN, + "learning_rate": 0.00012022383025920816, + "loss": 0.0, + "step": 37114 + }, + { + "epoch": 3.463189325370906, + "grad_norm": NaN, + "learning_rate": 0.00012021641703717219, + "loss": 0.0, + "step": 37115 + }, + { + "epoch": 3.4632826350657835, + "grad_norm": NaN, + "learning_rate": 0.00012020900389086703, + "loss": 0.0, + "step": 37116 + }, + { + "epoch": 3.4633759447606605, + "grad_norm": NaN, + "learning_rate": 0.00012020159082031151, + "loss": 0.0, + "step": 37117 + }, + { + "epoch": 3.463469254455538, + "grad_norm": NaN, + "learning_rate": 0.00012019417782552458, + "loss": 0.0, + "step": 37118 + }, + { + "epoch": 3.4635625641504153, + "grad_norm": NaN, + "learning_rate": 0.00012018676490652499, + "loss": 0.0, + "step": 37119 + }, + { + "epoch": 3.4636558738452923, + "grad_norm": NaN, + "learning_rate": 0.00012017935206333159, + "loss": 0.0, + "step": 37120 + }, + { + "epoch": 3.4637491835401697, + "grad_norm": NaN, + "learning_rate": 0.00012017193929596334, + "loss": 0.0, + "step": 37121 + }, + { + "epoch": 3.463842493235047, + "grad_norm": NaN, + "learning_rate": 0.00012016452660443898, + "loss": 0.0, + "step": 37122 + }, + { + "epoch": 3.4639358029299245, + "grad_norm": NaN, + "learning_rate": 0.00012015711398877734, + "loss": 0.0, + "step": 37123 + }, + { + "epoch": 3.464029112624802, + "grad_norm": NaN, + "learning_rate": 0.00012014970144899742, + "loss": 0.0, + "step": 37124 + }, + { + "epoch": 3.464122422319679, + "grad_norm": NaN, + "learning_rate": 0.0001201422889851179, + "loss": 0.0, + "step": 37125 + }, + { + "epoch": 3.4642157320145563, + "grad_norm": NaN, + "learning_rate": 0.00012013487659715773, + "loss": 0.0, + "step": 37126 + }, + { + "epoch": 3.4643090417094338, + "grad_norm": NaN, + "learning_rate": 0.00012012746428513568, + "loss": 0.0, + "step": 37127 + }, + { + "epoch": 3.4644023514043107, + "grad_norm": NaN, + "learning_rate": 0.00012012005204907064, + "loss": 0.0, + "step": 37128 + }, + { + "epoch": 3.464495661099188, + "grad_norm": NaN, + "learning_rate": 0.00012011263988898153, + "loss": 0.0, + "step": 37129 + }, + { + "epoch": 3.4645889707940656, + "grad_norm": NaN, + "learning_rate": 0.00012010522780488704, + "loss": 0.0, + "step": 37130 + }, + { + "epoch": 3.464682280488943, + "grad_norm": NaN, + "learning_rate": 0.00012009781579680608, + "loss": 0.0, + "step": 37131 + }, + { + "epoch": 3.46477559018382, + "grad_norm": NaN, + "learning_rate": 0.0001200904038647576, + "loss": 0.0, + "step": 37132 + }, + { + "epoch": 3.4648688998786974, + "grad_norm": NaN, + "learning_rate": 0.00012008299200876034, + "loss": 0.0, + "step": 37133 + }, + { + "epoch": 3.464962209573575, + "grad_norm": NaN, + "learning_rate": 0.00012007558022883312, + "loss": 0.0, + "step": 37134 + }, + { + "epoch": 3.465055519268452, + "grad_norm": NaN, + "learning_rate": 0.0001200681685249949, + "loss": 0.0, + "step": 37135 + }, + { + "epoch": 3.465148828963329, + "grad_norm": NaN, + "learning_rate": 0.00012006075689726445, + "loss": 0.0, + "step": 37136 + }, + { + "epoch": 3.4652421386582066, + "grad_norm": NaN, + "learning_rate": 0.00012005334534566058, + "loss": 0.0, + "step": 37137 + }, + { + "epoch": 3.465335448353084, + "grad_norm": NaN, + "learning_rate": 0.00012004593387020224, + "loss": 0.0, + "step": 37138 + }, + { + "epoch": 3.465428758047961, + "grad_norm": NaN, + "learning_rate": 0.00012003852247090818, + "loss": 0.0, + "step": 37139 + }, + { + "epoch": 3.4655220677428384, + "grad_norm": NaN, + "learning_rate": 0.00012003111114779727, + "loss": 0.0, + "step": 37140 + }, + { + "epoch": 3.465615377437716, + "grad_norm": NaN, + "learning_rate": 0.00012002369990088842, + "loss": 0.0, + "step": 37141 + }, + { + "epoch": 3.465708687132593, + "grad_norm": NaN, + "learning_rate": 0.00012001628873020038, + "loss": 0.0, + "step": 37142 + }, + { + "epoch": 3.4658019968274703, + "grad_norm": NaN, + "learning_rate": 0.00012000887763575202, + "loss": 0.0, + "step": 37143 + }, + { + "epoch": 3.4658953065223477, + "grad_norm": NaN, + "learning_rate": 0.00012000146661756226, + "loss": 0.0, + "step": 37144 + }, + { + "epoch": 3.465988616217225, + "grad_norm": NaN, + "learning_rate": 0.00011999405567564984, + "loss": 0.0, + "step": 37145 + }, + { + "epoch": 3.4660819259121025, + "grad_norm": NaN, + "learning_rate": 0.00011998664481003364, + "loss": 0.0, + "step": 37146 + }, + { + "epoch": 3.4661752356069795, + "grad_norm": NaN, + "learning_rate": 0.00011997923402073257, + "loss": 0.0, + "step": 37147 + }, + { + "epoch": 3.466268545301857, + "grad_norm": NaN, + "learning_rate": 0.00011997182330776537, + "loss": 0.0, + "step": 37148 + }, + { + "epoch": 3.4663618549967343, + "grad_norm": NaN, + "learning_rate": 0.0001199644126711509, + "loss": 0.0, + "step": 37149 + }, + { + "epoch": 3.4664551646916113, + "grad_norm": NaN, + "learning_rate": 0.00011995700211090809, + "loss": 0.0, + "step": 37150 + }, + { + "epoch": 3.4665484743864887, + "grad_norm": NaN, + "learning_rate": 0.00011994959162705571, + "loss": 0.0, + "step": 37151 + }, + { + "epoch": 3.466641784081366, + "grad_norm": NaN, + "learning_rate": 0.00011994218121961257, + "loss": 0.0, + "step": 37152 + }, + { + "epoch": 3.4667350937762436, + "grad_norm": NaN, + "learning_rate": 0.00011993477088859763, + "loss": 0.0, + "step": 37153 + }, + { + "epoch": 3.4668284034711205, + "grad_norm": NaN, + "learning_rate": 0.00011992736063402963, + "loss": 0.0, + "step": 37154 + }, + { + "epoch": 3.466921713165998, + "grad_norm": NaN, + "learning_rate": 0.00011991995045592741, + "loss": 0.0, + "step": 37155 + }, + { + "epoch": 3.4670150228608754, + "grad_norm": NaN, + "learning_rate": 0.00011991254035430992, + "loss": 0.0, + "step": 37156 + }, + { + "epoch": 3.4671083325557523, + "grad_norm": NaN, + "learning_rate": 0.00011990513032919588, + "loss": 0.0, + "step": 37157 + }, + { + "epoch": 3.4672016422506298, + "grad_norm": NaN, + "learning_rate": 0.00011989772038060413, + "loss": 0.0, + "step": 37158 + }, + { + "epoch": 3.467294951945507, + "grad_norm": NaN, + "learning_rate": 0.00011989031050855366, + "loss": 0.0, + "step": 37159 + }, + { + "epoch": 3.4673882616403846, + "grad_norm": NaN, + "learning_rate": 0.00011988290071306315, + "loss": 0.0, + "step": 37160 + }, + { + "epoch": 3.4674815713352616, + "grad_norm": NaN, + "learning_rate": 0.00011987549099415149, + "loss": 0.0, + "step": 37161 + }, + { + "epoch": 3.467574881030139, + "grad_norm": NaN, + "learning_rate": 0.00011986808135183759, + "loss": 0.0, + "step": 37162 + }, + { + "epoch": 3.4676681907250164, + "grad_norm": NaN, + "learning_rate": 0.0001198606717861402, + "loss": 0.0, + "step": 37163 + }, + { + "epoch": 3.4677615004198934, + "grad_norm": NaN, + "learning_rate": 0.00011985326229707817, + "loss": 0.0, + "step": 37164 + }, + { + "epoch": 3.467854810114771, + "grad_norm": NaN, + "learning_rate": 0.00011984585288467043, + "loss": 0.0, + "step": 37165 + }, + { + "epoch": 3.4679481198096482, + "grad_norm": NaN, + "learning_rate": 0.00011983844354893569, + "loss": 0.0, + "step": 37166 + }, + { + "epoch": 3.4680414295045257, + "grad_norm": NaN, + "learning_rate": 0.00011983103428989286, + "loss": 0.0, + "step": 37167 + }, + { + "epoch": 3.4681347391994026, + "grad_norm": NaN, + "learning_rate": 0.00011982362510756083, + "loss": 0.0, + "step": 37168 + }, + { + "epoch": 3.46822804889428, + "grad_norm": NaN, + "learning_rate": 0.00011981621600195831, + "loss": 0.0, + "step": 37169 + }, + { + "epoch": 3.4683213585891575, + "grad_norm": NaN, + "learning_rate": 0.00011980880697310428, + "loss": 0.0, + "step": 37170 + }, + { + "epoch": 3.468414668284035, + "grad_norm": NaN, + "learning_rate": 0.00011980139802101747, + "loss": 0.0, + "step": 37171 + }, + { + "epoch": 3.468507977978912, + "grad_norm": NaN, + "learning_rate": 0.00011979398914571675, + "loss": 0.0, + "step": 37172 + }, + { + "epoch": 3.4686012876737893, + "grad_norm": NaN, + "learning_rate": 0.00011978658034722101, + "loss": 0.0, + "step": 37173 + }, + { + "epoch": 3.4686945973686667, + "grad_norm": NaN, + "learning_rate": 0.00011977917162554902, + "loss": 0.0, + "step": 37174 + }, + { + "epoch": 3.468787907063544, + "grad_norm": NaN, + "learning_rate": 0.0001197717629807196, + "loss": 0.0, + "step": 37175 + }, + { + "epoch": 3.468881216758421, + "grad_norm": NaN, + "learning_rate": 0.00011976435441275171, + "loss": 0.0, + "step": 37176 + }, + { + "epoch": 3.4689745264532985, + "grad_norm": NaN, + "learning_rate": 0.00011975694592166407, + "loss": 0.0, + "step": 37177 + }, + { + "epoch": 3.469067836148176, + "grad_norm": NaN, + "learning_rate": 0.00011974953750747555, + "loss": 0.0, + "step": 37178 + }, + { + "epoch": 3.469161145843053, + "grad_norm": NaN, + "learning_rate": 0.00011974212917020504, + "loss": 0.0, + "step": 37179 + }, + { + "epoch": 3.4692544555379303, + "grad_norm": NaN, + "learning_rate": 0.00011973472090987128, + "loss": 0.0, + "step": 37180 + }, + { + "epoch": 3.4693477652328077, + "grad_norm": NaN, + "learning_rate": 0.00011972731272649315, + "loss": 0.0, + "step": 37181 + }, + { + "epoch": 3.469441074927685, + "grad_norm": NaN, + "learning_rate": 0.00011971990462008957, + "loss": 0.0, + "step": 37182 + }, + { + "epoch": 3.469534384622562, + "grad_norm": NaN, + "learning_rate": 0.00011971249659067926, + "loss": 0.0, + "step": 37183 + }, + { + "epoch": 3.4696276943174396, + "grad_norm": NaN, + "learning_rate": 0.00011970508863828107, + "loss": 0.0, + "step": 37184 + }, + { + "epoch": 3.469721004012317, + "grad_norm": NaN, + "learning_rate": 0.00011969768076291391, + "loss": 0.0, + "step": 37185 + }, + { + "epoch": 3.469814313707194, + "grad_norm": NaN, + "learning_rate": 0.00011969027296459658, + "loss": 0.0, + "step": 37186 + }, + { + "epoch": 3.4699076234020714, + "grad_norm": NaN, + "learning_rate": 0.00011968286524334785, + "loss": 0.0, + "step": 37187 + }, + { + "epoch": 3.470000933096949, + "grad_norm": NaN, + "learning_rate": 0.00011967545759918667, + "loss": 0.0, + "step": 37188 + }, + { + "epoch": 3.470094242791826, + "grad_norm": NaN, + "learning_rate": 0.0001196680500321318, + "loss": 0.0, + "step": 37189 + }, + { + "epoch": 3.470187552486703, + "grad_norm": NaN, + "learning_rate": 0.00011966064254220207, + "loss": 0.0, + "step": 37190 + }, + { + "epoch": 3.4702808621815806, + "grad_norm": NaN, + "learning_rate": 0.00011965323512941637, + "loss": 0.0, + "step": 37191 + }, + { + "epoch": 3.470374171876458, + "grad_norm": NaN, + "learning_rate": 0.0001196458277937935, + "loss": 0.0, + "step": 37192 + }, + { + "epoch": 3.4704674815713354, + "grad_norm": NaN, + "learning_rate": 0.00011963842053535226, + "loss": 0.0, + "step": 37193 + }, + { + "epoch": 3.4705607912662124, + "grad_norm": NaN, + "learning_rate": 0.00011963101335411159, + "loss": 0.0, + "step": 37194 + }, + { + "epoch": 3.47065410096109, + "grad_norm": NaN, + "learning_rate": 0.00011962360625009023, + "loss": 0.0, + "step": 37195 + }, + { + "epoch": 3.4707474106559673, + "grad_norm": NaN, + "learning_rate": 0.000119616199223307, + "loss": 0.0, + "step": 37196 + }, + { + "epoch": 3.4708407203508447, + "grad_norm": NaN, + "learning_rate": 0.00011960879227378083, + "loss": 0.0, + "step": 37197 + }, + { + "epoch": 3.4709340300457217, + "grad_norm": NaN, + "learning_rate": 0.00011960138540153047, + "loss": 0.0, + "step": 37198 + }, + { + "epoch": 3.471027339740599, + "grad_norm": NaN, + "learning_rate": 0.00011959397860657475, + "loss": 0.0, + "step": 37199 + }, + { + "epoch": 3.4711206494354765, + "grad_norm": NaN, + "learning_rate": 0.0001195865718889326, + "loss": 0.0, + "step": 37200 + }, + { + "epoch": 3.4712139591303535, + "grad_norm": NaN, + "learning_rate": 0.00011957916524862277, + "loss": 0.0, + "step": 37201 + }, + { + "epoch": 3.471307268825231, + "grad_norm": NaN, + "learning_rate": 0.00011957175868566408, + "loss": 0.0, + "step": 37202 + }, + { + "epoch": 3.4714005785201083, + "grad_norm": NaN, + "learning_rate": 0.00011956435220007544, + "loss": 0.0, + "step": 37203 + }, + { + "epoch": 3.4714938882149857, + "grad_norm": NaN, + "learning_rate": 0.00011955694579187558, + "loss": 0.0, + "step": 37204 + }, + { + "epoch": 3.4715871979098627, + "grad_norm": NaN, + "learning_rate": 0.00011954953946108343, + "loss": 0.0, + "step": 37205 + }, + { + "epoch": 3.47168050760474, + "grad_norm": NaN, + "learning_rate": 0.00011954213320771782, + "loss": 0.0, + "step": 37206 + }, + { + "epoch": 3.4717738172996175, + "grad_norm": NaN, + "learning_rate": 0.00011953472703179746, + "loss": 0.0, + "step": 37207 + }, + { + "epoch": 3.4718671269944945, + "grad_norm": NaN, + "learning_rate": 0.0001195273209333413, + "loss": 0.0, + "step": 37208 + }, + { + "epoch": 3.471960436689372, + "grad_norm": NaN, + "learning_rate": 0.00011951991491236816, + "loss": 0.0, + "step": 37209 + }, + { + "epoch": 3.4720537463842494, + "grad_norm": NaN, + "learning_rate": 0.00011951250896889679, + "loss": 0.0, + "step": 37210 + }, + { + "epoch": 3.4721470560791268, + "grad_norm": NaN, + "learning_rate": 0.00011950510310294613, + "loss": 0.0, + "step": 37211 + }, + { + "epoch": 3.4722403657740037, + "grad_norm": NaN, + "learning_rate": 0.00011949769731453496, + "loss": 0.0, + "step": 37212 + }, + { + "epoch": 3.472333675468881, + "grad_norm": NaN, + "learning_rate": 0.00011949029160368207, + "loss": 0.0, + "step": 37213 + }, + { + "epoch": 3.4724269851637586, + "grad_norm": NaN, + "learning_rate": 0.00011948288597040639, + "loss": 0.0, + "step": 37214 + }, + { + "epoch": 3.472520294858636, + "grad_norm": NaN, + "learning_rate": 0.00011947548041472665, + "loss": 0.0, + "step": 37215 + }, + { + "epoch": 3.472613604553513, + "grad_norm": NaN, + "learning_rate": 0.0001194680749366617, + "loss": 0.0, + "step": 37216 + }, + { + "epoch": 3.4727069142483904, + "grad_norm": NaN, + "learning_rate": 0.00011946066953623045, + "loss": 0.0, + "step": 37217 + }, + { + "epoch": 3.472800223943268, + "grad_norm": NaN, + "learning_rate": 0.00011945326421345162, + "loss": 0.0, + "step": 37218 + }, + { + "epoch": 3.4728935336381452, + "grad_norm": NaN, + "learning_rate": 0.00011944585896834409, + "loss": 0.0, + "step": 37219 + }, + { + "epoch": 3.472986843333022, + "grad_norm": NaN, + "learning_rate": 0.00011943845380092674, + "loss": 0.0, + "step": 37220 + }, + { + "epoch": 3.4730801530278996, + "grad_norm": NaN, + "learning_rate": 0.00011943104871121833, + "loss": 0.0, + "step": 37221 + }, + { + "epoch": 3.473173462722777, + "grad_norm": NaN, + "learning_rate": 0.00011942364369923766, + "loss": 0.0, + "step": 37222 + }, + { + "epoch": 3.473266772417654, + "grad_norm": NaN, + "learning_rate": 0.00011941623876500367, + "loss": 0.0, + "step": 37223 + }, + { + "epoch": 3.4733600821125314, + "grad_norm": NaN, + "learning_rate": 0.00011940883390853511, + "loss": 0.0, + "step": 37224 + }, + { + "epoch": 3.473453391807409, + "grad_norm": NaN, + "learning_rate": 0.00011940142912985077, + "loss": 0.0, + "step": 37225 + }, + { + "epoch": 3.4735467015022863, + "grad_norm": NaN, + "learning_rate": 0.0001193940244289696, + "loss": 0.0, + "step": 37226 + }, + { + "epoch": 3.4736400111971633, + "grad_norm": NaN, + "learning_rate": 0.00011938661980591033, + "loss": 0.0, + "step": 37227 + }, + { + "epoch": 3.4737333208920407, + "grad_norm": NaN, + "learning_rate": 0.0001193792152606918, + "loss": 0.0, + "step": 37228 + }, + { + "epoch": 3.473826630586918, + "grad_norm": NaN, + "learning_rate": 0.00011937181079333291, + "loss": 0.0, + "step": 37229 + }, + { + "epoch": 3.473919940281795, + "grad_norm": NaN, + "learning_rate": 0.0001193644064038524, + "loss": 0.0, + "step": 37230 + }, + { + "epoch": 3.4740132499766725, + "grad_norm": NaN, + "learning_rate": 0.00011935700209226908, + "loss": 0.0, + "step": 37231 + }, + { + "epoch": 3.47410655967155, + "grad_norm": NaN, + "learning_rate": 0.00011934959785860192, + "loss": 0.0, + "step": 37232 + }, + { + "epoch": 3.4741998693664273, + "grad_norm": NaN, + "learning_rate": 0.0001193421937028696, + "loss": 0.0, + "step": 37233 + }, + { + "epoch": 3.4742931790613043, + "grad_norm": NaN, + "learning_rate": 0.00011933478962509097, + "loss": 0.0, + "step": 37234 + }, + { + "epoch": 3.4743864887561817, + "grad_norm": NaN, + "learning_rate": 0.00011932738562528498, + "loss": 0.0, + "step": 37235 + }, + { + "epoch": 3.474479798451059, + "grad_norm": NaN, + "learning_rate": 0.00011931998170347029, + "loss": 0.0, + "step": 37236 + }, + { + "epoch": 3.474573108145936, + "grad_norm": NaN, + "learning_rate": 0.00011931257785966578, + "loss": 0.0, + "step": 37237 + }, + { + "epoch": 3.4746664178408135, + "grad_norm": NaN, + "learning_rate": 0.00011930517409389038, + "loss": 0.0, + "step": 37238 + }, + { + "epoch": 3.474759727535691, + "grad_norm": NaN, + "learning_rate": 0.00011929777040616277, + "loss": 0.0, + "step": 37239 + }, + { + "epoch": 3.4748530372305684, + "grad_norm": NaN, + "learning_rate": 0.00011929036679650181, + "loss": 0.0, + "step": 37240 + }, + { + "epoch": 3.474946346925446, + "grad_norm": NaN, + "learning_rate": 0.00011928296326492642, + "loss": 0.0, + "step": 37241 + }, + { + "epoch": 3.4750396566203228, + "grad_norm": NaN, + "learning_rate": 0.00011927555981145529, + "loss": 0.0, + "step": 37242 + }, + { + "epoch": 3.4751329663152, + "grad_norm": NaN, + "learning_rate": 0.00011926815643610734, + "loss": 0.0, + "step": 37243 + }, + { + "epoch": 3.4752262760100776, + "grad_norm": NaN, + "learning_rate": 0.00011926075313890139, + "loss": 0.0, + "step": 37244 + }, + { + "epoch": 3.4753195857049546, + "grad_norm": NaN, + "learning_rate": 0.00011925334991985618, + "loss": 0.0, + "step": 37245 + }, + { + "epoch": 3.475412895399832, + "grad_norm": NaN, + "learning_rate": 0.00011924594677899063, + "loss": 0.0, + "step": 37246 + }, + { + "epoch": 3.4755062050947094, + "grad_norm": NaN, + "learning_rate": 0.00011923854371632356, + "loss": 0.0, + "step": 37247 + }, + { + "epoch": 3.475599514789587, + "grad_norm": NaN, + "learning_rate": 0.00011923114073187369, + "loss": 0.0, + "step": 37248 + }, + { + "epoch": 3.475692824484464, + "grad_norm": NaN, + "learning_rate": 0.00011922373782565995, + "loss": 0.0, + "step": 37249 + }, + { + "epoch": 3.4757861341793412, + "grad_norm": NaN, + "learning_rate": 0.00011921633499770117, + "loss": 0.0, + "step": 37250 + }, + { + "epoch": 3.4758794438742187, + "grad_norm": NaN, + "learning_rate": 0.00011920893224801605, + "loss": 0.0, + "step": 37251 + }, + { + "epoch": 3.4759727535690956, + "grad_norm": NaN, + "learning_rate": 0.00011920152957662355, + "loss": 0.0, + "step": 37252 + }, + { + "epoch": 3.476066063263973, + "grad_norm": NaN, + "learning_rate": 0.00011919412698354244, + "loss": 0.0, + "step": 37253 + }, + { + "epoch": 3.4761593729588505, + "grad_norm": NaN, + "learning_rate": 0.00011918672446879148, + "loss": 0.0, + "step": 37254 + }, + { + "epoch": 3.476252682653728, + "grad_norm": NaN, + "learning_rate": 0.0001191793220323896, + "loss": 0.0, + "step": 37255 + }, + { + "epoch": 3.476345992348605, + "grad_norm": NaN, + "learning_rate": 0.00011917191967435559, + "loss": 0.0, + "step": 37256 + }, + { + "epoch": 3.4764393020434823, + "grad_norm": NaN, + "learning_rate": 0.00011916451739470821, + "loss": 0.0, + "step": 37257 + }, + { + "epoch": 3.4765326117383597, + "grad_norm": NaN, + "learning_rate": 0.00011915711519346635, + "loss": 0.0, + "step": 37258 + }, + { + "epoch": 3.4766259214332367, + "grad_norm": NaN, + "learning_rate": 0.00011914971307064884, + "loss": 0.0, + "step": 37259 + }, + { + "epoch": 3.476719231128114, + "grad_norm": NaN, + "learning_rate": 0.00011914231102627443, + "loss": 0.0, + "step": 37260 + }, + { + "epoch": 3.4768125408229915, + "grad_norm": NaN, + "learning_rate": 0.00011913490906036202, + "loss": 0.0, + "step": 37261 + }, + { + "epoch": 3.476905850517869, + "grad_norm": NaN, + "learning_rate": 0.00011912750717293036, + "loss": 0.0, + "step": 37262 + }, + { + "epoch": 3.4769991602127464, + "grad_norm": NaN, + "learning_rate": 0.00011912010536399829, + "loss": 0.0, + "step": 37263 + }, + { + "epoch": 3.4770924699076233, + "grad_norm": NaN, + "learning_rate": 0.0001191127036335847, + "loss": 0.0, + "step": 37264 + }, + { + "epoch": 3.4771857796025007, + "grad_norm": NaN, + "learning_rate": 0.00011910530198170832, + "loss": 0.0, + "step": 37265 + }, + { + "epoch": 3.477279089297378, + "grad_norm": NaN, + "learning_rate": 0.00011909790040838797, + "loss": 0.0, + "step": 37266 + }, + { + "epoch": 3.477372398992255, + "grad_norm": NaN, + "learning_rate": 0.00011909049891364258, + "loss": 0.0, + "step": 37267 + }, + { + "epoch": 3.4774657086871326, + "grad_norm": NaN, + "learning_rate": 0.00011908309749749086, + "loss": 0.0, + "step": 37268 + }, + { + "epoch": 3.47755901838201, + "grad_norm": NaN, + "learning_rate": 0.00011907569615995163, + "loss": 0.0, + "step": 37269 + }, + { + "epoch": 3.4776523280768874, + "grad_norm": NaN, + "learning_rate": 0.0001190682949010438, + "loss": 0.0, + "step": 37270 + }, + { + "epoch": 3.4777456377717644, + "grad_norm": NaN, + "learning_rate": 0.00011906089372078611, + "loss": 0.0, + "step": 37271 + }, + { + "epoch": 3.477838947466642, + "grad_norm": NaN, + "learning_rate": 0.00011905349261919739, + "loss": 0.0, + "step": 37272 + }, + { + "epoch": 3.477932257161519, + "grad_norm": NaN, + "learning_rate": 0.0001190460915962965, + "loss": 0.0, + "step": 37273 + }, + { + "epoch": 3.478025566856396, + "grad_norm": NaN, + "learning_rate": 0.00011903869065210223, + "loss": 0.0, + "step": 37274 + }, + { + "epoch": 3.4781188765512736, + "grad_norm": NaN, + "learning_rate": 0.00011903128978663333, + "loss": 0.0, + "step": 37275 + }, + { + "epoch": 3.478212186246151, + "grad_norm": NaN, + "learning_rate": 0.00011902388899990878, + "loss": 0.0, + "step": 37276 + }, + { + "epoch": 3.4783054959410284, + "grad_norm": NaN, + "learning_rate": 0.00011901648829194727, + "loss": 0.0, + "step": 37277 + }, + { + "epoch": 3.4783988056359054, + "grad_norm": NaN, + "learning_rate": 0.00011900908766276761, + "loss": 0.0, + "step": 37278 + }, + { + "epoch": 3.478492115330783, + "grad_norm": NaN, + "learning_rate": 0.00011900168711238874, + "loss": 0.0, + "step": 37279 + }, + { + "epoch": 3.4785854250256603, + "grad_norm": NaN, + "learning_rate": 0.00011899428664082931, + "loss": 0.0, + "step": 37280 + }, + { + "epoch": 3.4786787347205372, + "grad_norm": NaN, + "learning_rate": 0.00011898688624810827, + "loss": 0.0, + "step": 37281 + }, + { + "epoch": 3.4787720444154147, + "grad_norm": NaN, + "learning_rate": 0.00011897948593424441, + "loss": 0.0, + "step": 37282 + }, + { + "epoch": 3.478865354110292, + "grad_norm": NaN, + "learning_rate": 0.00011897208569925649, + "loss": 0.0, + "step": 37283 + }, + { + "epoch": 3.4789586638051695, + "grad_norm": NaN, + "learning_rate": 0.00011896468554316337, + "loss": 0.0, + "step": 37284 + }, + { + "epoch": 3.4790519735000465, + "grad_norm": NaN, + "learning_rate": 0.0001189572854659839, + "loss": 0.0, + "step": 37285 + }, + { + "epoch": 3.479145283194924, + "grad_norm": NaN, + "learning_rate": 0.0001189498854677368, + "loss": 0.0, + "step": 37286 + }, + { + "epoch": 3.4792385928898013, + "grad_norm": NaN, + "learning_rate": 0.00011894248554844099, + "loss": 0.0, + "step": 37287 + }, + { + "epoch": 3.4793319025846787, + "grad_norm": NaN, + "learning_rate": 0.00011893508570811525, + "loss": 0.0, + "step": 37288 + }, + { + "epoch": 3.4794252122795557, + "grad_norm": NaN, + "learning_rate": 0.00011892768594677832, + "loss": 0.0, + "step": 37289 + }, + { + "epoch": 3.479518521974433, + "grad_norm": NaN, + "learning_rate": 0.00011892028626444912, + "loss": 0.0, + "step": 37290 + }, + { + "epoch": 3.4796118316693105, + "grad_norm": NaN, + "learning_rate": 0.00011891288666114647, + "loss": 0.0, + "step": 37291 + }, + { + "epoch": 3.479705141364188, + "grad_norm": NaN, + "learning_rate": 0.00011890548713688906, + "loss": 0.0, + "step": 37292 + }, + { + "epoch": 3.479798451059065, + "grad_norm": NaN, + "learning_rate": 0.00011889808769169581, + "loss": 0.0, + "step": 37293 + }, + { + "epoch": 3.4798917607539424, + "grad_norm": NaN, + "learning_rate": 0.00011889068832558557, + "loss": 0.0, + "step": 37294 + }, + { + "epoch": 3.4799850704488198, + "grad_norm": NaN, + "learning_rate": 0.00011888328903857702, + "loss": 0.0, + "step": 37295 + }, + { + "epoch": 3.4800783801436967, + "grad_norm": NaN, + "learning_rate": 0.00011887588983068905, + "loss": 0.0, + "step": 37296 + }, + { + "epoch": 3.480171689838574, + "grad_norm": NaN, + "learning_rate": 0.00011886849070194053, + "loss": 0.0, + "step": 37297 + }, + { + "epoch": 3.4802649995334516, + "grad_norm": NaN, + "learning_rate": 0.00011886109165235015, + "loss": 0.0, + "step": 37298 + }, + { + "epoch": 3.480358309228329, + "grad_norm": NaN, + "learning_rate": 0.00011885369268193682, + "loss": 0.0, + "step": 37299 + }, + { + "epoch": 3.480451618923206, + "grad_norm": NaN, + "learning_rate": 0.00011884629379071937, + "loss": 0.0, + "step": 37300 + }, + { + "epoch": 3.4805449286180834, + "grad_norm": NaN, + "learning_rate": 0.00011883889497871648, + "loss": 0.0, + "step": 37301 + }, + { + "epoch": 3.480638238312961, + "grad_norm": NaN, + "learning_rate": 0.0001188314962459471, + "loss": 0.0, + "step": 37302 + }, + { + "epoch": 3.480731548007838, + "grad_norm": NaN, + "learning_rate": 0.00011882409759243, + "loss": 0.0, + "step": 37303 + }, + { + "epoch": 3.480824857702715, + "grad_norm": NaN, + "learning_rate": 0.00011881669901818395, + "loss": 0.0, + "step": 37304 + }, + { + "epoch": 3.4809181673975926, + "grad_norm": NaN, + "learning_rate": 0.00011880930052322784, + "loss": 0.0, + "step": 37305 + }, + { + "epoch": 3.48101147709247, + "grad_norm": NaN, + "learning_rate": 0.00011880190210758043, + "loss": 0.0, + "step": 37306 + }, + { + "epoch": 3.481104786787347, + "grad_norm": NaN, + "learning_rate": 0.00011879450377126048, + "loss": 0.0, + "step": 37307 + }, + { + "epoch": 3.4811980964822244, + "grad_norm": NaN, + "learning_rate": 0.00011878710551428695, + "loss": 0.0, + "step": 37308 + }, + { + "epoch": 3.481291406177102, + "grad_norm": NaN, + "learning_rate": 0.00011877970733667852, + "loss": 0.0, + "step": 37309 + }, + { + "epoch": 3.4813847158719793, + "grad_norm": NaN, + "learning_rate": 0.00011877230923845402, + "loss": 0.0, + "step": 37310 + }, + { + "epoch": 3.4814780255668563, + "grad_norm": NaN, + "learning_rate": 0.00011876491121963235, + "loss": 0.0, + "step": 37311 + }, + { + "epoch": 3.4815713352617337, + "grad_norm": NaN, + "learning_rate": 0.00011875751328023223, + "loss": 0.0, + "step": 37312 + }, + { + "epoch": 3.481664644956611, + "grad_norm": NaN, + "learning_rate": 0.00011875011542027246, + "loss": 0.0, + "step": 37313 + }, + { + "epoch": 3.4817579546514885, + "grad_norm": NaN, + "learning_rate": 0.00011874271763977196, + "loss": 0.0, + "step": 37314 + }, + { + "epoch": 3.4818512643463655, + "grad_norm": NaN, + "learning_rate": 0.0001187353199387494, + "loss": 0.0, + "step": 37315 + }, + { + "epoch": 3.481944574041243, + "grad_norm": NaN, + "learning_rate": 0.0001187279223172237, + "loss": 0.0, + "step": 37316 + }, + { + "epoch": 3.4820378837361203, + "grad_norm": NaN, + "learning_rate": 0.00011872052477521366, + "loss": 0.0, + "step": 37317 + }, + { + "epoch": 3.4821311934309973, + "grad_norm": NaN, + "learning_rate": 0.00011871312731273798, + "loss": 0.0, + "step": 37318 + }, + { + "epoch": 3.4822245031258747, + "grad_norm": NaN, + "learning_rate": 0.0001187057299298156, + "loss": 0.0, + "step": 37319 + }, + { + "epoch": 3.482317812820752, + "grad_norm": NaN, + "learning_rate": 0.00011869833262646529, + "loss": 0.0, + "step": 37320 + }, + { + "epoch": 3.4824111225156296, + "grad_norm": NaN, + "learning_rate": 0.0001186909354027058, + "loss": 0.0, + "step": 37321 + }, + { + "epoch": 3.4825044322105065, + "grad_norm": NaN, + "learning_rate": 0.00011868353825855601, + "loss": 0.0, + "step": 37322 + }, + { + "epoch": 3.482597741905384, + "grad_norm": NaN, + "learning_rate": 0.00011867614119403476, + "loss": 0.0, + "step": 37323 + }, + { + "epoch": 3.4826910516002614, + "grad_norm": NaN, + "learning_rate": 0.00011866874420916072, + "loss": 0.0, + "step": 37324 + }, + { + "epoch": 3.4827843612951384, + "grad_norm": NaN, + "learning_rate": 0.00011866134730395283, + "loss": 0.0, + "step": 37325 + }, + { + "epoch": 3.4828776709900158, + "grad_norm": NaN, + "learning_rate": 0.00011865395047842987, + "loss": 0.0, + "step": 37326 + }, + { + "epoch": 3.482970980684893, + "grad_norm": NaN, + "learning_rate": 0.00011864655373261056, + "loss": 0.0, + "step": 37327 + }, + { + "epoch": 3.4830642903797706, + "grad_norm": NaN, + "learning_rate": 0.00011863915706651382, + "loss": 0.0, + "step": 37328 + }, + { + "epoch": 3.4831576000746476, + "grad_norm": NaN, + "learning_rate": 0.00011863176048015843, + "loss": 0.0, + "step": 37329 + }, + { + "epoch": 3.483250909769525, + "grad_norm": NaN, + "learning_rate": 0.00011862436397356313, + "loss": 0.0, + "step": 37330 + }, + { + "epoch": 3.4833442194644024, + "grad_norm": NaN, + "learning_rate": 0.00011861696754674681, + "loss": 0.0, + "step": 37331 + }, + { + "epoch": 3.4834375291592794, + "grad_norm": NaN, + "learning_rate": 0.00011860957119972829, + "loss": 0.0, + "step": 37332 + }, + { + "epoch": 3.483530838854157, + "grad_norm": NaN, + "learning_rate": 0.00011860217493252624, + "loss": 0.0, + "step": 37333 + }, + { + "epoch": 3.4836241485490342, + "grad_norm": NaN, + "learning_rate": 0.00011859477874515961, + "loss": 0.0, + "step": 37334 + }, + { + "epoch": 3.4837174582439117, + "grad_norm": NaN, + "learning_rate": 0.00011858738263764719, + "loss": 0.0, + "step": 37335 + }, + { + "epoch": 3.483810767938789, + "grad_norm": NaN, + "learning_rate": 0.00011857998661000767, + "loss": 0.0, + "step": 37336 + }, + { + "epoch": 3.483904077633666, + "grad_norm": NaN, + "learning_rate": 0.00011857259066225996, + "loss": 0.0, + "step": 37337 + }, + { + "epoch": 3.4839973873285435, + "grad_norm": NaN, + "learning_rate": 0.00011856519479442292, + "loss": 0.0, + "step": 37338 + }, + { + "epoch": 3.484090697023421, + "grad_norm": NaN, + "learning_rate": 0.00011855779900651516, + "loss": 0.0, + "step": 37339 + }, + { + "epoch": 3.484184006718298, + "grad_norm": NaN, + "learning_rate": 0.00011855040329855568, + "loss": 0.0, + "step": 37340 + }, + { + "epoch": 3.4842773164131753, + "grad_norm": NaN, + "learning_rate": 0.0001185430076705632, + "loss": 0.0, + "step": 37341 + }, + { + "epoch": 3.4843706261080527, + "grad_norm": NaN, + "learning_rate": 0.0001185356121225565, + "loss": 0.0, + "step": 37342 + }, + { + "epoch": 3.48446393580293, + "grad_norm": NaN, + "learning_rate": 0.00011852821665455445, + "loss": 0.0, + "step": 37343 + }, + { + "epoch": 3.484557245497807, + "grad_norm": NaN, + "learning_rate": 0.00011852082126657583, + "loss": 0.0, + "step": 37344 + }, + { + "epoch": 3.4846505551926845, + "grad_norm": NaN, + "learning_rate": 0.0001185134259586394, + "loss": 0.0, + "step": 37345 + }, + { + "epoch": 3.484743864887562, + "grad_norm": NaN, + "learning_rate": 0.000118506030730764, + "loss": 0.0, + "step": 37346 + }, + { + "epoch": 3.484837174582439, + "grad_norm": NaN, + "learning_rate": 0.0001184986355829685, + "loss": 0.0, + "step": 37347 + }, + { + "epoch": 3.4849304842773163, + "grad_norm": NaN, + "learning_rate": 0.00011849124051527155, + "loss": 0.0, + "step": 37348 + }, + { + "epoch": 3.4850237939721938, + "grad_norm": NaN, + "learning_rate": 0.00011848384552769213, + "loss": 0.0, + "step": 37349 + }, + { + "epoch": 3.485117103667071, + "grad_norm": NaN, + "learning_rate": 0.0001184764506202489, + "loss": 0.0, + "step": 37350 + }, + { + "epoch": 3.485210413361948, + "grad_norm": NaN, + "learning_rate": 0.0001184690557929607, + "loss": 0.0, + "step": 37351 + }, + { + "epoch": 3.4853037230568256, + "grad_norm": NaN, + "learning_rate": 0.0001184616610458464, + "loss": 0.0, + "step": 37352 + }, + { + "epoch": 3.485397032751703, + "grad_norm": NaN, + "learning_rate": 0.00011845426637892471, + "loss": 0.0, + "step": 37353 + }, + { + "epoch": 3.48549034244658, + "grad_norm": NaN, + "learning_rate": 0.00011844687179221449, + "loss": 0.0, + "step": 37354 + }, + { + "epoch": 3.4855836521414574, + "grad_norm": NaN, + "learning_rate": 0.00011843947728573458, + "loss": 0.0, + "step": 37355 + }, + { + "epoch": 3.485676961836335, + "grad_norm": NaN, + "learning_rate": 0.00011843208285950364, + "loss": 0.0, + "step": 37356 + }, + { + "epoch": 3.485770271531212, + "grad_norm": NaN, + "learning_rate": 0.0001184246885135406, + "loss": 0.0, + "step": 37357 + }, + { + "epoch": 3.4858635812260896, + "grad_norm": NaN, + "learning_rate": 0.00011841729424786429, + "loss": 0.0, + "step": 37358 + }, + { + "epoch": 3.4859568909209666, + "grad_norm": NaN, + "learning_rate": 0.00011840990006249335, + "loss": 0.0, + "step": 37359 + }, + { + "epoch": 3.486050200615844, + "grad_norm": NaN, + "learning_rate": 0.00011840250595744671, + "loss": 0.0, + "step": 37360 + }, + { + "epoch": 3.4861435103107214, + "grad_norm": NaN, + "learning_rate": 0.00011839511193274318, + "loss": 0.0, + "step": 37361 + }, + { + "epoch": 3.4862368200055984, + "grad_norm": NaN, + "learning_rate": 0.00011838771798840144, + "loss": 0.0, + "step": 37362 + }, + { + "epoch": 3.486330129700476, + "grad_norm": NaN, + "learning_rate": 0.00011838032412444041, + "loss": 0.0, + "step": 37363 + }, + { + "epoch": 3.4864234393953533, + "grad_norm": NaN, + "learning_rate": 0.00011837293034087889, + "loss": 0.0, + "step": 37364 + }, + { + "epoch": 3.4865167490902307, + "grad_norm": NaN, + "learning_rate": 0.00011836553663773556, + "loss": 0.0, + "step": 37365 + }, + { + "epoch": 3.4866100587851077, + "grad_norm": NaN, + "learning_rate": 0.00011835814301502934, + "loss": 0.0, + "step": 37366 + }, + { + "epoch": 3.486703368479985, + "grad_norm": NaN, + "learning_rate": 0.00011835074947277902, + "loss": 0.0, + "step": 37367 + }, + { + "epoch": 3.4867966781748625, + "grad_norm": NaN, + "learning_rate": 0.0001183433560110033, + "loss": 0.0, + "step": 37368 + }, + { + "epoch": 3.4868899878697395, + "grad_norm": NaN, + "learning_rate": 0.00011833596262972108, + "loss": 0.0, + "step": 37369 + }, + { + "epoch": 3.486983297564617, + "grad_norm": NaN, + "learning_rate": 0.00011832856932895117, + "loss": 0.0, + "step": 37370 + }, + { + "epoch": 3.4870766072594943, + "grad_norm": NaN, + "learning_rate": 0.00011832117610871226, + "loss": 0.0, + "step": 37371 + }, + { + "epoch": 3.4871699169543717, + "grad_norm": NaN, + "learning_rate": 0.00011831378296902324, + "loss": 0.0, + "step": 37372 + }, + { + "epoch": 3.4872632266492487, + "grad_norm": NaN, + "learning_rate": 0.00011830638990990292, + "loss": 0.0, + "step": 37373 + }, + { + "epoch": 3.487356536344126, + "grad_norm": NaN, + "learning_rate": 0.00011829899693136999, + "loss": 0.0, + "step": 37374 + }, + { + "epoch": 3.4874498460390035, + "grad_norm": NaN, + "learning_rate": 0.00011829160403344335, + "loss": 0.0, + "step": 37375 + }, + { + "epoch": 3.4875431557338805, + "grad_norm": NaN, + "learning_rate": 0.00011828421121614182, + "loss": 0.0, + "step": 37376 + }, + { + "epoch": 3.487636465428758, + "grad_norm": NaN, + "learning_rate": 0.00011827681847948406, + "loss": 0.0, + "step": 37377 + }, + { + "epoch": 3.4877297751236354, + "grad_norm": NaN, + "learning_rate": 0.000118269425823489, + "loss": 0.0, + "step": 37378 + }, + { + "epoch": 3.4878230848185128, + "grad_norm": NaN, + "learning_rate": 0.00011826203324817541, + "loss": 0.0, + "step": 37379 + }, + { + "epoch": 3.4879163945133898, + "grad_norm": NaN, + "learning_rate": 0.00011825464075356202, + "loss": 0.0, + "step": 37380 + }, + { + "epoch": 3.488009704208267, + "grad_norm": NaN, + "learning_rate": 0.0001182472483396677, + "loss": 0.0, + "step": 37381 + }, + { + "epoch": 3.4881030139031446, + "grad_norm": NaN, + "learning_rate": 0.00011823985600651126, + "loss": 0.0, + "step": 37382 + }, + { + "epoch": 3.488196323598022, + "grad_norm": NaN, + "learning_rate": 0.00011823246375411138, + "loss": 0.0, + "step": 37383 + }, + { + "epoch": 3.488289633292899, + "grad_norm": NaN, + "learning_rate": 0.00011822507158248698, + "loss": 0.0, + "step": 37384 + }, + { + "epoch": 3.4883829429877764, + "grad_norm": NaN, + "learning_rate": 0.00011821767949165685, + "loss": 0.0, + "step": 37385 + }, + { + "epoch": 3.488476252682654, + "grad_norm": NaN, + "learning_rate": 0.00011821028748163966, + "loss": 0.0, + "step": 37386 + }, + { + "epoch": 3.4885695623775312, + "grad_norm": NaN, + "learning_rate": 0.0001182028955524543, + "loss": 0.0, + "step": 37387 + }, + { + "epoch": 3.488662872072408, + "grad_norm": NaN, + "learning_rate": 0.00011819550370411966, + "loss": 0.0, + "step": 37388 + }, + { + "epoch": 3.4887561817672856, + "grad_norm": NaN, + "learning_rate": 0.00011818811193665432, + "loss": 0.0, + "step": 37389 + }, + { + "epoch": 3.488849491462163, + "grad_norm": NaN, + "learning_rate": 0.00011818072025007721, + "loss": 0.0, + "step": 37390 + }, + { + "epoch": 3.48894280115704, + "grad_norm": NaN, + "learning_rate": 0.00011817332864440716, + "loss": 0.0, + "step": 37391 + }, + { + "epoch": 3.4890361108519174, + "grad_norm": NaN, + "learning_rate": 0.00011816593711966287, + "loss": 0.0, + "step": 37392 + }, + { + "epoch": 3.489129420546795, + "grad_norm": NaN, + "learning_rate": 0.00011815854567586321, + "loss": 0.0, + "step": 37393 + }, + { + "epoch": 3.4892227302416723, + "grad_norm": NaN, + "learning_rate": 0.00011815115431302686, + "loss": 0.0, + "step": 37394 + }, + { + "epoch": 3.4893160399365493, + "grad_norm": NaN, + "learning_rate": 0.00011814376303117273, + "loss": 0.0, + "step": 37395 + }, + { + "epoch": 3.4894093496314267, + "grad_norm": NaN, + "learning_rate": 0.00011813637183031961, + "loss": 0.0, + "step": 37396 + }, + { + "epoch": 3.489502659326304, + "grad_norm": NaN, + "learning_rate": 0.00011812898071048619, + "loss": 0.0, + "step": 37397 + }, + { + "epoch": 3.489595969021181, + "grad_norm": NaN, + "learning_rate": 0.00011812158967169137, + "loss": 0.0, + "step": 37398 + }, + { + "epoch": 3.4896892787160585, + "grad_norm": NaN, + "learning_rate": 0.00011811419871395392, + "loss": 0.0, + "step": 37399 + }, + { + "epoch": 3.489782588410936, + "grad_norm": NaN, + "learning_rate": 0.00011810680783729257, + "loss": 0.0, + "step": 37400 + }, + { + "epoch": 3.4898758981058133, + "grad_norm": NaN, + "learning_rate": 0.00011809941704172618, + "loss": 0.0, + "step": 37401 + }, + { + "epoch": 3.4899692078006903, + "grad_norm": NaN, + "learning_rate": 0.00011809202632727357, + "loss": 0.0, + "step": 37402 + }, + { + "epoch": 3.4900625174955677, + "grad_norm": NaN, + "learning_rate": 0.00011808463569395341, + "loss": 0.0, + "step": 37403 + }, + { + "epoch": 3.490155827190445, + "grad_norm": NaN, + "learning_rate": 0.0001180772451417846, + "loss": 0.0, + "step": 37404 + }, + { + "epoch": 3.4902491368853226, + "grad_norm": NaN, + "learning_rate": 0.00011806985467078592, + "loss": 0.0, + "step": 37405 + }, + { + "epoch": 3.4903424465801995, + "grad_norm": NaN, + "learning_rate": 0.0001180624642809761, + "loss": 0.0, + "step": 37406 + }, + { + "epoch": 3.490435756275077, + "grad_norm": NaN, + "learning_rate": 0.00011805507397237401, + "loss": 0.0, + "step": 37407 + }, + { + "epoch": 3.4905290659699544, + "grad_norm": NaN, + "learning_rate": 0.00011804768374499841, + "loss": 0.0, + "step": 37408 + }, + { + "epoch": 3.490622375664832, + "grad_norm": NaN, + "learning_rate": 0.00011804029359886802, + "loss": 0.0, + "step": 37409 + }, + { + "epoch": 3.4907156853597088, + "grad_norm": NaN, + "learning_rate": 0.00011803290353400174, + "loss": 0.0, + "step": 37410 + }, + { + "epoch": 3.490808995054586, + "grad_norm": NaN, + "learning_rate": 0.00011802551355041836, + "loss": 0.0, + "step": 37411 + }, + { + "epoch": 3.4909023047494636, + "grad_norm": NaN, + "learning_rate": 0.00011801812364813655, + "loss": 0.0, + "step": 37412 + }, + { + "epoch": 3.4909956144443406, + "grad_norm": NaN, + "learning_rate": 0.0001180107338271752, + "loss": 0.0, + "step": 37413 + }, + { + "epoch": 3.491088924139218, + "grad_norm": NaN, + "learning_rate": 0.00011800334408755314, + "loss": 0.0, + "step": 37414 + }, + { + "epoch": 3.4911822338340954, + "grad_norm": NaN, + "learning_rate": 0.00011799595442928902, + "loss": 0.0, + "step": 37415 + }, + { + "epoch": 3.491275543528973, + "grad_norm": NaN, + "learning_rate": 0.00011798856485240172, + "loss": 0.0, + "step": 37416 + }, + { + "epoch": 3.49136885322385, + "grad_norm": NaN, + "learning_rate": 0.00011798117535691007, + "loss": 0.0, + "step": 37417 + }, + { + "epoch": 3.4914621629187272, + "grad_norm": NaN, + "learning_rate": 0.00011797378594283274, + "loss": 0.0, + "step": 37418 + }, + { + "epoch": 3.4915554726136047, + "grad_norm": NaN, + "learning_rate": 0.00011796639661018862, + "loss": 0.0, + "step": 37419 + }, + { + "epoch": 3.4916487823084816, + "grad_norm": NaN, + "learning_rate": 0.0001179590073589965, + "loss": 0.0, + "step": 37420 + }, + { + "epoch": 3.491742092003359, + "grad_norm": NaN, + "learning_rate": 0.00011795161818927505, + "loss": 0.0, + "step": 37421 + }, + { + "epoch": 3.4918354016982365, + "grad_norm": NaN, + "learning_rate": 0.00011794422910104318, + "loss": 0.0, + "step": 37422 + }, + { + "epoch": 3.491928711393114, + "grad_norm": NaN, + "learning_rate": 0.00011793684009431967, + "loss": 0.0, + "step": 37423 + }, + { + "epoch": 3.492022021087991, + "grad_norm": NaN, + "learning_rate": 0.00011792945116912324, + "loss": 0.0, + "step": 37424 + }, + { + "epoch": 3.4921153307828683, + "grad_norm": NaN, + "learning_rate": 0.00011792206232547269, + "loss": 0.0, + "step": 37425 + }, + { + "epoch": 3.4922086404777457, + "grad_norm": NaN, + "learning_rate": 0.00011791467356338692, + "loss": 0.0, + "step": 37426 + }, + { + "epoch": 3.492301950172623, + "grad_norm": NaN, + "learning_rate": 0.00011790728488288456, + "loss": 0.0, + "step": 37427 + }, + { + "epoch": 3.4923952598675, + "grad_norm": NaN, + "learning_rate": 0.00011789989628398445, + "loss": 0.0, + "step": 37428 + }, + { + "epoch": 3.4924885695623775, + "grad_norm": NaN, + "learning_rate": 0.00011789250776670546, + "loss": 0.0, + "step": 37429 + }, + { + "epoch": 3.492581879257255, + "grad_norm": NaN, + "learning_rate": 0.0001178851193310663, + "loss": 0.0, + "step": 37430 + }, + { + "epoch": 3.4926751889521324, + "grad_norm": NaN, + "learning_rate": 0.0001178777309770857, + "loss": 0.0, + "step": 37431 + }, + { + "epoch": 3.4927684986470093, + "grad_norm": NaN, + "learning_rate": 0.00011787034270478261, + "loss": 0.0, + "step": 37432 + }, + { + "epoch": 3.4928618083418868, + "grad_norm": NaN, + "learning_rate": 0.00011786295451417568, + "loss": 0.0, + "step": 37433 + }, + { + "epoch": 3.492955118036764, + "grad_norm": NaN, + "learning_rate": 0.00011785556640528371, + "loss": 0.0, + "step": 37434 + }, + { + "epoch": 3.493048427731641, + "grad_norm": NaN, + "learning_rate": 0.00011784817837812557, + "loss": 0.0, + "step": 37435 + }, + { + "epoch": 3.4931417374265186, + "grad_norm": NaN, + "learning_rate": 0.00011784079043271997, + "loss": 0.0, + "step": 37436 + }, + { + "epoch": 3.493235047121396, + "grad_norm": NaN, + "learning_rate": 0.00011783340256908566, + "loss": 0.0, + "step": 37437 + }, + { + "epoch": 3.4933283568162734, + "grad_norm": NaN, + "learning_rate": 0.00011782601478724156, + "loss": 0.0, + "step": 37438 + }, + { + "epoch": 3.4934216665111504, + "grad_norm": NaN, + "learning_rate": 0.00011781862708720632, + "loss": 0.0, + "step": 37439 + }, + { + "epoch": 3.493514976206028, + "grad_norm": NaN, + "learning_rate": 0.00011781123946899884, + "loss": 0.0, + "step": 37440 + }, + { + "epoch": 3.493608285900905, + "grad_norm": NaN, + "learning_rate": 0.00011780385193263775, + "loss": 0.0, + "step": 37441 + }, + { + "epoch": 3.493701595595782, + "grad_norm": NaN, + "learning_rate": 0.00011779646447814198, + "loss": 0.0, + "step": 37442 + }, + { + "epoch": 3.4937949052906596, + "grad_norm": NaN, + "learning_rate": 0.0001177890771055303, + "loss": 0.0, + "step": 37443 + }, + { + "epoch": 3.493888214985537, + "grad_norm": NaN, + "learning_rate": 0.00011778168981482136, + "loss": 0.0, + "step": 37444 + }, + { + "epoch": 3.4939815246804145, + "grad_norm": NaN, + "learning_rate": 0.0001177743026060341, + "loss": 0.0, + "step": 37445 + }, + { + "epoch": 3.4940748343752914, + "grad_norm": NaN, + "learning_rate": 0.00011776691547918726, + "loss": 0.0, + "step": 37446 + }, + { + "epoch": 3.494168144070169, + "grad_norm": NaN, + "learning_rate": 0.00011775952843429954, + "loss": 0.0, + "step": 37447 + }, + { + "epoch": 3.4942614537650463, + "grad_norm": NaN, + "learning_rate": 0.00011775214147138983, + "loss": 0.0, + "step": 37448 + }, + { + "epoch": 3.4943547634599232, + "grad_norm": NaN, + "learning_rate": 0.00011774475459047691, + "loss": 0.0, + "step": 37449 + }, + { + "epoch": 3.4944480731548007, + "grad_norm": NaN, + "learning_rate": 0.00011773736779157943, + "loss": 0.0, + "step": 37450 + }, + { + "epoch": 3.494541382849678, + "grad_norm": NaN, + "learning_rate": 0.00011772998107471632, + "loss": 0.0, + "step": 37451 + }, + { + "epoch": 3.4946346925445555, + "grad_norm": NaN, + "learning_rate": 0.00011772259443990634, + "loss": 0.0, + "step": 37452 + }, + { + "epoch": 3.494728002239433, + "grad_norm": NaN, + "learning_rate": 0.0001177152078871682, + "loss": 0.0, + "step": 37453 + }, + { + "epoch": 3.49482131193431, + "grad_norm": NaN, + "learning_rate": 0.00011770782141652072, + "loss": 0.0, + "step": 37454 + }, + { + "epoch": 3.4949146216291873, + "grad_norm": NaN, + "learning_rate": 0.00011770043502798271, + "loss": 0.0, + "step": 37455 + }, + { + "epoch": 3.4950079313240647, + "grad_norm": NaN, + "learning_rate": 0.00011769304872157289, + "loss": 0.0, + "step": 37456 + }, + { + "epoch": 3.4951012410189417, + "grad_norm": NaN, + "learning_rate": 0.0001176856624973101, + "loss": 0.0, + "step": 37457 + }, + { + "epoch": 3.495194550713819, + "grad_norm": NaN, + "learning_rate": 0.00011767827635521313, + "loss": 0.0, + "step": 37458 + }, + { + "epoch": 3.4952878604086965, + "grad_norm": NaN, + "learning_rate": 0.00011767089029530067, + "loss": 0.0, + "step": 37459 + }, + { + "epoch": 3.495381170103574, + "grad_norm": NaN, + "learning_rate": 0.0001176635043175916, + "loss": 0.0, + "step": 37460 + }, + { + "epoch": 3.495474479798451, + "grad_norm": NaN, + "learning_rate": 0.00011765611842210467, + "loss": 0.0, + "step": 37461 + }, + { + "epoch": 3.4955677894933284, + "grad_norm": NaN, + "learning_rate": 0.00011764873260885858, + "loss": 0.0, + "step": 37462 + }, + { + "epoch": 3.495661099188206, + "grad_norm": NaN, + "learning_rate": 0.0001176413468778722, + "loss": 0.0, + "step": 37463 + }, + { + "epoch": 3.4957544088830828, + "grad_norm": NaN, + "learning_rate": 0.00011763396122916437, + "loss": 0.0, + "step": 37464 + }, + { + "epoch": 3.49584771857796, + "grad_norm": NaN, + "learning_rate": 0.00011762657566275371, + "loss": 0.0, + "step": 37465 + }, + { + "epoch": 3.4959410282728376, + "grad_norm": NaN, + "learning_rate": 0.00011761919017865908, + "loss": 0.0, + "step": 37466 + }, + { + "epoch": 3.496034337967715, + "grad_norm": NaN, + "learning_rate": 0.0001176118047768993, + "loss": 0.0, + "step": 37467 + }, + { + "epoch": 3.496127647662592, + "grad_norm": NaN, + "learning_rate": 0.0001176044194574931, + "loss": 0.0, + "step": 37468 + }, + { + "epoch": 3.4962209573574694, + "grad_norm": NaN, + "learning_rate": 0.00011759703422045921, + "loss": 0.0, + "step": 37469 + }, + { + "epoch": 3.496314267052347, + "grad_norm": NaN, + "learning_rate": 0.00011758964906581653, + "loss": 0.0, + "step": 37470 + }, + { + "epoch": 3.496407576747224, + "grad_norm": NaN, + "learning_rate": 0.00011758226399358375, + "loss": 0.0, + "step": 37471 + }, + { + "epoch": 3.496500886442101, + "grad_norm": NaN, + "learning_rate": 0.00011757487900377963, + "loss": 0.0, + "step": 37472 + }, + { + "epoch": 3.4965941961369786, + "grad_norm": NaN, + "learning_rate": 0.00011756749409642306, + "loss": 0.0, + "step": 37473 + }, + { + "epoch": 3.496687505831856, + "grad_norm": NaN, + "learning_rate": 0.0001175601092715327, + "loss": 0.0, + "step": 37474 + }, + { + "epoch": 3.4967808155267335, + "grad_norm": NaN, + "learning_rate": 0.00011755272452912736, + "loss": 0.0, + "step": 37475 + }, + { + "epoch": 3.4968741252216105, + "grad_norm": NaN, + "learning_rate": 0.00011754533986922588, + "loss": 0.0, + "step": 37476 + }, + { + "epoch": 3.496967434916488, + "grad_norm": NaN, + "learning_rate": 0.00011753795529184695, + "loss": 0.0, + "step": 37477 + }, + { + "epoch": 3.4970607446113653, + "grad_norm": NaN, + "learning_rate": 0.00011753057079700937, + "loss": 0.0, + "step": 37478 + }, + { + "epoch": 3.4971540543062423, + "grad_norm": NaN, + "learning_rate": 0.00011752318638473198, + "loss": 0.0, + "step": 37479 + }, + { + "epoch": 3.4972473640011197, + "grad_norm": NaN, + "learning_rate": 0.00011751580205503349, + "loss": 0.0, + "step": 37480 + }, + { + "epoch": 3.497340673695997, + "grad_norm": NaN, + "learning_rate": 0.00011750841780793265, + "loss": 0.0, + "step": 37481 + }, + { + "epoch": 3.4974339833908745, + "grad_norm": NaN, + "learning_rate": 0.00011750103364344834, + "loss": 0.0, + "step": 37482 + }, + { + "epoch": 3.4975272930857515, + "grad_norm": NaN, + "learning_rate": 0.00011749364956159925, + "loss": 0.0, + "step": 37483 + }, + { + "epoch": 3.497620602780629, + "grad_norm": NaN, + "learning_rate": 0.00011748626556240421, + "loss": 0.0, + "step": 37484 + }, + { + "epoch": 3.4977139124755063, + "grad_norm": NaN, + "learning_rate": 0.0001174788816458819, + "loss": 0.0, + "step": 37485 + }, + { + "epoch": 3.4978072221703833, + "grad_norm": NaN, + "learning_rate": 0.00011747149781205118, + "loss": 0.0, + "step": 37486 + }, + { + "epoch": 3.4979005318652607, + "grad_norm": NaN, + "learning_rate": 0.00011746411406093085, + "loss": 0.0, + "step": 37487 + }, + { + "epoch": 3.497993841560138, + "grad_norm": NaN, + "learning_rate": 0.00011745673039253959, + "loss": 0.0, + "step": 37488 + }, + { + "epoch": 3.4980871512550156, + "grad_norm": NaN, + "learning_rate": 0.00011744934680689624, + "loss": 0.0, + "step": 37489 + }, + { + "epoch": 3.4981804609498925, + "grad_norm": NaN, + "learning_rate": 0.00011744196330401959, + "loss": 0.0, + "step": 37490 + }, + { + "epoch": 3.49827377064477, + "grad_norm": NaN, + "learning_rate": 0.00011743457988392831, + "loss": 0.0, + "step": 37491 + }, + { + "epoch": 3.4983670803396474, + "grad_norm": NaN, + "learning_rate": 0.00011742719654664129, + "loss": 0.0, + "step": 37492 + }, + { + "epoch": 3.4984603900345244, + "grad_norm": NaN, + "learning_rate": 0.00011741981329217729, + "loss": 0.0, + "step": 37493 + }, + { + "epoch": 3.498553699729402, + "grad_norm": NaN, + "learning_rate": 0.000117412430120555, + "loss": 0.0, + "step": 37494 + }, + { + "epoch": 3.498647009424279, + "grad_norm": NaN, + "learning_rate": 0.00011740504703179327, + "loss": 0.0, + "step": 37495 + }, + { + "epoch": 3.4987403191191566, + "grad_norm": NaN, + "learning_rate": 0.00011739766402591089, + "loss": 0.0, + "step": 37496 + }, + { + "epoch": 3.4988336288140336, + "grad_norm": NaN, + "learning_rate": 0.00011739028110292653, + "loss": 0.0, + "step": 37497 + }, + { + "epoch": 3.498926938508911, + "grad_norm": NaN, + "learning_rate": 0.00011738289826285904, + "loss": 0.0, + "step": 37498 + }, + { + "epoch": 3.4990202482037884, + "grad_norm": NaN, + "learning_rate": 0.00011737551550572722, + "loss": 0.0, + "step": 37499 + }, + { + "epoch": 3.499113557898666, + "grad_norm": NaN, + "learning_rate": 0.00011736813283154973, + "loss": 0.0, + "step": 37500 + }, + { + "epoch": 3.499206867593543, + "grad_norm": NaN, + "learning_rate": 0.00011736075024034542, + "loss": 0.0, + "step": 37501 + }, + { + "epoch": 3.4993001772884202, + "grad_norm": NaN, + "learning_rate": 0.00011735336773213315, + "loss": 0.0, + "step": 37502 + }, + { + "epoch": 3.4993934869832977, + "grad_norm": NaN, + "learning_rate": 0.0001173459853069315, + "loss": 0.0, + "step": 37503 + }, + { + "epoch": 3.499486796678175, + "grad_norm": NaN, + "learning_rate": 0.00011733860296475933, + "loss": 0.0, + "step": 37504 + }, + { + "epoch": 3.499580106373052, + "grad_norm": NaN, + "learning_rate": 0.00011733122070563548, + "loss": 0.0, + "step": 37505 + }, + { + "epoch": 3.4996734160679295, + "grad_norm": NaN, + "learning_rate": 0.00011732383852957863, + "loss": 0.0, + "step": 37506 + }, + { + "epoch": 3.499766725762807, + "grad_norm": NaN, + "learning_rate": 0.00011731645643660756, + "loss": 0.0, + "step": 37507 + }, + { + "epoch": 3.499860035457684, + "grad_norm": NaN, + "learning_rate": 0.00011730907442674111, + "loss": 0.0, + "step": 37508 + }, + { + "epoch": 3.4999533451525613, + "grad_norm": NaN, + "learning_rate": 0.00011730169249999796, + "loss": 0.0, + "step": 37509 + }, + { + "epoch": 3.5000466548474387, + "grad_norm": NaN, + "learning_rate": 0.0001172943106563969, + "loss": 0.0, + "step": 37510 + }, + { + "epoch": 3.500139964542316, + "grad_norm": NaN, + "learning_rate": 0.00011728692889595679, + "loss": 0.0, + "step": 37511 + }, + { + "epoch": 3.500233274237193, + "grad_norm": NaN, + "learning_rate": 0.00011727954721869627, + "loss": 0.0, + "step": 37512 + }, + { + "epoch": 3.5003265839320705, + "grad_norm": NaN, + "learning_rate": 0.00011727216562463416, + "loss": 0.0, + "step": 37513 + }, + { + "epoch": 3.500419893626948, + "grad_norm": NaN, + "learning_rate": 0.0001172647841137893, + "loss": 0.0, + "step": 37514 + }, + { + "epoch": 3.500513203321825, + "grad_norm": NaN, + "learning_rate": 0.00011725740268618037, + "loss": 0.0, + "step": 37515 + }, + { + "epoch": 3.5006065130167023, + "grad_norm": NaN, + "learning_rate": 0.00011725002134182614, + "loss": 0.0, + "step": 37516 + }, + { + "epoch": 3.5006998227115798, + "grad_norm": NaN, + "learning_rate": 0.00011724264008074546, + "loss": 0.0, + "step": 37517 + }, + { + "epoch": 3.500793132406457, + "grad_norm": NaN, + "learning_rate": 0.00011723525890295699, + "loss": 0.0, + "step": 37518 + }, + { + "epoch": 3.5008864421013346, + "grad_norm": NaN, + "learning_rate": 0.00011722787780847953, + "loss": 0.0, + "step": 37519 + }, + { + "epoch": 3.5009797517962116, + "grad_norm": NaN, + "learning_rate": 0.00011722049679733194, + "loss": 0.0, + "step": 37520 + }, + { + "epoch": 3.501073061491089, + "grad_norm": NaN, + "learning_rate": 0.00011721311586953289, + "loss": 0.0, + "step": 37521 + }, + { + "epoch": 3.501166371185966, + "grad_norm": NaN, + "learning_rate": 0.00011720573502510114, + "loss": 0.0, + "step": 37522 + }, + { + "epoch": 3.5012596808808434, + "grad_norm": NaN, + "learning_rate": 0.00011719835426405555, + "loss": 0.0, + "step": 37523 + }, + { + "epoch": 3.501352990575721, + "grad_norm": NaN, + "learning_rate": 0.0001171909735864148, + "loss": 0.0, + "step": 37524 + }, + { + "epoch": 3.501446300270598, + "grad_norm": NaN, + "learning_rate": 0.00011718359299219764, + "loss": 0.0, + "step": 37525 + }, + { + "epoch": 3.5015396099654756, + "grad_norm": NaN, + "learning_rate": 0.00011717621248142298, + "loss": 0.0, + "step": 37526 + }, + { + "epoch": 3.5016329196603526, + "grad_norm": NaN, + "learning_rate": 0.00011716883205410942, + "loss": 0.0, + "step": 37527 + }, + { + "epoch": 3.50172622935523, + "grad_norm": NaN, + "learning_rate": 0.00011716145171027583, + "loss": 0.0, + "step": 37528 + }, + { + "epoch": 3.5018195390501075, + "grad_norm": NaN, + "learning_rate": 0.0001171540714499409, + "loss": 0.0, + "step": 37529 + }, + { + "epoch": 3.5019128487449844, + "grad_norm": NaN, + "learning_rate": 0.00011714669127312347, + "loss": 0.0, + "step": 37530 + }, + { + "epoch": 3.502006158439862, + "grad_norm": NaN, + "learning_rate": 0.00011713931117984226, + "loss": 0.0, + "step": 37531 + }, + { + "epoch": 3.5020994681347393, + "grad_norm": NaN, + "learning_rate": 0.00011713193117011603, + "loss": 0.0, + "step": 37532 + }, + { + "epoch": 3.5021927778296167, + "grad_norm": NaN, + "learning_rate": 0.00011712455124396359, + "loss": 0.0, + "step": 37533 + }, + { + "epoch": 3.5022860875244937, + "grad_norm": NaN, + "learning_rate": 0.00011711717140140369, + "loss": 0.0, + "step": 37534 + }, + { + "epoch": 3.502379397219371, + "grad_norm": NaN, + "learning_rate": 0.00011710979164245502, + "loss": 0.0, + "step": 37535 + }, + { + "epoch": 3.5024727069142485, + "grad_norm": NaN, + "learning_rate": 0.00011710241196713645, + "loss": 0.0, + "step": 37536 + }, + { + "epoch": 3.5025660166091255, + "grad_norm": NaN, + "learning_rate": 0.00011709503237546672, + "loss": 0.0, + "step": 37537 + }, + { + "epoch": 3.502659326304003, + "grad_norm": NaN, + "learning_rate": 0.00011708765286746452, + "loss": 0.0, + "step": 37538 + }, + { + "epoch": 3.5027526359988803, + "grad_norm": NaN, + "learning_rate": 0.00011708027344314868, + "loss": 0.0, + "step": 37539 + }, + { + "epoch": 3.5028459456937577, + "grad_norm": NaN, + "learning_rate": 0.00011707289410253801, + "loss": 0.0, + "step": 37540 + }, + { + "epoch": 3.502939255388635, + "grad_norm": NaN, + "learning_rate": 0.00011706551484565117, + "loss": 0.0, + "step": 37541 + }, + { + "epoch": 3.503032565083512, + "grad_norm": NaN, + "learning_rate": 0.00011705813567250695, + "loss": 0.0, + "step": 37542 + }, + { + "epoch": 3.5031258747783895, + "grad_norm": NaN, + "learning_rate": 0.00011705075658312421, + "loss": 0.0, + "step": 37543 + }, + { + "epoch": 3.5032191844732665, + "grad_norm": NaN, + "learning_rate": 0.00011704337757752157, + "loss": 0.0, + "step": 37544 + }, + { + "epoch": 3.503312494168144, + "grad_norm": NaN, + "learning_rate": 0.00011703599865571786, + "loss": 0.0, + "step": 37545 + }, + { + "epoch": 3.5034058038630214, + "grad_norm": NaN, + "learning_rate": 0.00011702861981773188, + "loss": 0.0, + "step": 37546 + }, + { + "epoch": 3.503499113557899, + "grad_norm": NaN, + "learning_rate": 0.00011702124106358235, + "loss": 0.0, + "step": 37547 + }, + { + "epoch": 3.503592423252776, + "grad_norm": NaN, + "learning_rate": 0.00011701386239328798, + "loss": 0.0, + "step": 37548 + }, + { + "epoch": 3.503685732947653, + "grad_norm": NaN, + "learning_rate": 0.00011700648380686765, + "loss": 0.0, + "step": 37549 + }, + { + "epoch": 3.5037790426425306, + "grad_norm": NaN, + "learning_rate": 0.00011699910530434003, + "loss": 0.0, + "step": 37550 + }, + { + "epoch": 3.503872352337408, + "grad_norm": NaN, + "learning_rate": 0.00011699172688572389, + "loss": 0.0, + "step": 37551 + }, + { + "epoch": 3.503965662032285, + "grad_norm": NaN, + "learning_rate": 0.00011698434855103805, + "loss": 0.0, + "step": 37552 + }, + { + "epoch": 3.5040589717271624, + "grad_norm": NaN, + "learning_rate": 0.00011697697030030122, + "loss": 0.0, + "step": 37553 + }, + { + "epoch": 3.50415228142204, + "grad_norm": NaN, + "learning_rate": 0.00011696959213353213, + "loss": 0.0, + "step": 37554 + }, + { + "epoch": 3.5042455911169172, + "grad_norm": NaN, + "learning_rate": 0.00011696221405074966, + "loss": 0.0, + "step": 37555 + }, + { + "epoch": 3.504338900811794, + "grad_norm": NaN, + "learning_rate": 0.00011695483605197245, + "loss": 0.0, + "step": 37556 + }, + { + "epoch": 3.5044322105066716, + "grad_norm": NaN, + "learning_rate": 0.00011694745813721927, + "loss": 0.0, + "step": 37557 + }, + { + "epoch": 3.504525520201549, + "grad_norm": NaN, + "learning_rate": 0.00011694008030650898, + "loss": 0.0, + "step": 37558 + }, + { + "epoch": 3.504618829896426, + "grad_norm": NaN, + "learning_rate": 0.00011693270255986023, + "loss": 0.0, + "step": 37559 + }, + { + "epoch": 3.5047121395913035, + "grad_norm": NaN, + "learning_rate": 0.00011692532489729182, + "loss": 0.0, + "step": 37560 + }, + { + "epoch": 3.504805449286181, + "grad_norm": NaN, + "learning_rate": 0.00011691794731882255, + "loss": 0.0, + "step": 37561 + }, + { + "epoch": 3.5048987589810583, + "grad_norm": NaN, + "learning_rate": 0.00011691056982447111, + "loss": 0.0, + "step": 37562 + }, + { + "epoch": 3.5049920686759353, + "grad_norm": NaN, + "learning_rate": 0.00011690319241425627, + "loss": 0.0, + "step": 37563 + }, + { + "epoch": 3.5050853783708127, + "grad_norm": NaN, + "learning_rate": 0.00011689581508819687, + "loss": 0.0, + "step": 37564 + }, + { + "epoch": 3.50517868806569, + "grad_norm": NaN, + "learning_rate": 0.00011688843784631157, + "loss": 0.0, + "step": 37565 + }, + { + "epoch": 3.505271997760567, + "grad_norm": NaN, + "learning_rate": 0.00011688106068861912, + "loss": 0.0, + "step": 37566 + }, + { + "epoch": 3.5053653074554445, + "grad_norm": NaN, + "learning_rate": 0.00011687368361513839, + "loss": 0.0, + "step": 37567 + }, + { + "epoch": 3.505458617150322, + "grad_norm": NaN, + "learning_rate": 0.00011686630662588807, + "loss": 0.0, + "step": 37568 + }, + { + "epoch": 3.5055519268451993, + "grad_norm": NaN, + "learning_rate": 0.00011685892972088684, + "loss": 0.0, + "step": 37569 + }, + { + "epoch": 3.5056452365400768, + "grad_norm": NaN, + "learning_rate": 0.00011685155290015362, + "loss": 0.0, + "step": 37570 + }, + { + "epoch": 3.5057385462349537, + "grad_norm": NaN, + "learning_rate": 0.00011684417616370706, + "loss": 0.0, + "step": 37571 + }, + { + "epoch": 3.505831855929831, + "grad_norm": NaN, + "learning_rate": 0.00011683679951156588, + "loss": 0.0, + "step": 37572 + }, + { + "epoch": 3.5059251656247086, + "grad_norm": NaN, + "learning_rate": 0.00011682942294374898, + "loss": 0.0, + "step": 37573 + }, + { + "epoch": 3.5060184753195855, + "grad_norm": NaN, + "learning_rate": 0.00011682204646027496, + "loss": 0.0, + "step": 37574 + }, + { + "epoch": 3.506111785014463, + "grad_norm": NaN, + "learning_rate": 0.00011681467006116275, + "loss": 0.0, + "step": 37575 + }, + { + "epoch": 3.5062050947093404, + "grad_norm": NaN, + "learning_rate": 0.0001168072937464309, + "loss": 0.0, + "step": 37576 + }, + { + "epoch": 3.506298404404218, + "grad_norm": NaN, + "learning_rate": 0.00011679991751609827, + "loss": 0.0, + "step": 37577 + }, + { + "epoch": 3.506391714099095, + "grad_norm": NaN, + "learning_rate": 0.00011679254137018368, + "loss": 0.0, + "step": 37578 + }, + { + "epoch": 3.506485023793972, + "grad_norm": NaN, + "learning_rate": 0.0001167851653087058, + "loss": 0.0, + "step": 37579 + }, + { + "epoch": 3.5065783334888496, + "grad_norm": NaN, + "learning_rate": 0.00011677778933168336, + "loss": 0.0, + "step": 37580 + }, + { + "epoch": 3.5066716431837266, + "grad_norm": NaN, + "learning_rate": 0.00011677041343913523, + "loss": 0.0, + "step": 37581 + }, + { + "epoch": 3.506764952878604, + "grad_norm": NaN, + "learning_rate": 0.00011676303763108006, + "loss": 0.0, + "step": 37582 + }, + { + "epoch": 3.5068582625734814, + "grad_norm": NaN, + "learning_rate": 0.0001167556619075366, + "loss": 0.0, + "step": 37583 + }, + { + "epoch": 3.506951572268359, + "grad_norm": NaN, + "learning_rate": 0.00011674828626852373, + "loss": 0.0, + "step": 37584 + }, + { + "epoch": 3.507044881963236, + "grad_norm": NaN, + "learning_rate": 0.00011674091071406007, + "loss": 0.0, + "step": 37585 + }, + { + "epoch": 3.5071381916581132, + "grad_norm": NaN, + "learning_rate": 0.0001167335352441644, + "loss": 0.0, + "step": 37586 + }, + { + "epoch": 3.5072315013529907, + "grad_norm": NaN, + "learning_rate": 0.00011672615985885554, + "loss": 0.0, + "step": 37587 + }, + { + "epoch": 3.5073248110478676, + "grad_norm": NaN, + "learning_rate": 0.00011671878455815219, + "loss": 0.0, + "step": 37588 + }, + { + "epoch": 3.507418120742745, + "grad_norm": NaN, + "learning_rate": 0.00011671140934207307, + "loss": 0.0, + "step": 37589 + }, + { + "epoch": 3.5075114304376225, + "grad_norm": NaN, + "learning_rate": 0.00011670403421063705, + "loss": 0.0, + "step": 37590 + }, + { + "epoch": 3.5076047401325, + "grad_norm": NaN, + "learning_rate": 0.00011669665916386276, + "loss": 0.0, + "step": 37591 + }, + { + "epoch": 3.5076980498273773, + "grad_norm": NaN, + "learning_rate": 0.00011668928420176898, + "loss": 0.0, + "step": 37592 + }, + { + "epoch": 3.5077913595222543, + "grad_norm": NaN, + "learning_rate": 0.00011668190932437454, + "loss": 0.0, + "step": 37593 + }, + { + "epoch": 3.5078846692171317, + "grad_norm": NaN, + "learning_rate": 0.0001166745345316981, + "loss": 0.0, + "step": 37594 + }, + { + "epoch": 3.5079779789120087, + "grad_norm": NaN, + "learning_rate": 0.00011666715982375841, + "loss": 0.0, + "step": 37595 + }, + { + "epoch": 3.508071288606886, + "grad_norm": NaN, + "learning_rate": 0.00011665978520057433, + "loss": 0.0, + "step": 37596 + }, + { + "epoch": 3.5081645983017635, + "grad_norm": NaN, + "learning_rate": 0.00011665241066216452, + "loss": 0.0, + "step": 37597 + }, + { + "epoch": 3.508257907996641, + "grad_norm": NaN, + "learning_rate": 0.00011664503620854771, + "loss": 0.0, + "step": 37598 + }, + { + "epoch": 3.5083512176915184, + "grad_norm": NaN, + "learning_rate": 0.00011663766183974276, + "loss": 0.0, + "step": 37599 + }, + { + "epoch": 3.5084445273863953, + "grad_norm": NaN, + "learning_rate": 0.00011663028755576831, + "loss": 0.0, + "step": 37600 + }, + { + "epoch": 3.5085378370812728, + "grad_norm": NaN, + "learning_rate": 0.00011662291335664313, + "loss": 0.0, + "step": 37601 + }, + { + "epoch": 3.50863114677615, + "grad_norm": NaN, + "learning_rate": 0.00011661553924238605, + "loss": 0.0, + "step": 37602 + }, + { + "epoch": 3.508724456471027, + "grad_norm": NaN, + "learning_rate": 0.00011660816521301575, + "loss": 0.0, + "step": 37603 + }, + { + "epoch": 3.5088177661659046, + "grad_norm": NaN, + "learning_rate": 0.00011660079126855095, + "loss": 0.0, + "step": 37604 + }, + { + "epoch": 3.508911075860782, + "grad_norm": NaN, + "learning_rate": 0.00011659341740901051, + "loss": 0.0, + "step": 37605 + }, + { + "epoch": 3.5090043855556594, + "grad_norm": NaN, + "learning_rate": 0.00011658604363441308, + "loss": 0.0, + "step": 37606 + }, + { + "epoch": 3.5090976952505364, + "grad_norm": NaN, + "learning_rate": 0.00011657866994477741, + "loss": 0.0, + "step": 37607 + }, + { + "epoch": 3.509191004945414, + "grad_norm": NaN, + "learning_rate": 0.00011657129634012236, + "loss": 0.0, + "step": 37608 + }, + { + "epoch": 3.5092843146402912, + "grad_norm": NaN, + "learning_rate": 0.00011656392282046656, + "loss": 0.0, + "step": 37609 + }, + { + "epoch": 3.509377624335168, + "grad_norm": NaN, + "learning_rate": 0.00011655654938582877, + "loss": 0.0, + "step": 37610 + }, + { + "epoch": 3.5094709340300456, + "grad_norm": NaN, + "learning_rate": 0.00011654917603622782, + "loss": 0.0, + "step": 37611 + }, + { + "epoch": 3.509564243724923, + "grad_norm": NaN, + "learning_rate": 0.00011654180277168235, + "loss": 0.0, + "step": 37612 + }, + { + "epoch": 3.5096575534198005, + "grad_norm": NaN, + "learning_rate": 0.0001165344295922112, + "loss": 0.0, + "step": 37613 + }, + { + "epoch": 3.509750863114678, + "grad_norm": NaN, + "learning_rate": 0.00011652705649783311, + "loss": 0.0, + "step": 37614 + }, + { + "epoch": 3.509844172809555, + "grad_norm": NaN, + "learning_rate": 0.00011651968348856673, + "loss": 0.0, + "step": 37615 + }, + { + "epoch": 3.5099374825044323, + "grad_norm": NaN, + "learning_rate": 0.00011651231056443093, + "loss": 0.0, + "step": 37616 + }, + { + "epoch": 3.5100307921993092, + "grad_norm": NaN, + "learning_rate": 0.00011650493772544442, + "loss": 0.0, + "step": 37617 + }, + { + "epoch": 3.5101241018941867, + "grad_norm": NaN, + "learning_rate": 0.00011649756497162587, + "loss": 0.0, + "step": 37618 + }, + { + "epoch": 3.510217411589064, + "grad_norm": NaN, + "learning_rate": 0.00011649019230299414, + "loss": 0.0, + "step": 37619 + }, + { + "epoch": 3.5103107212839415, + "grad_norm": NaN, + "learning_rate": 0.00011648281971956793, + "loss": 0.0, + "step": 37620 + }, + { + "epoch": 3.510404030978819, + "grad_norm": NaN, + "learning_rate": 0.00011647544722136592, + "loss": 0.0, + "step": 37621 + }, + { + "epoch": 3.510497340673696, + "grad_norm": NaN, + "learning_rate": 0.00011646807480840697, + "loss": 0.0, + "step": 37622 + }, + { + "epoch": 3.5105906503685733, + "grad_norm": NaN, + "learning_rate": 0.00011646070248070977, + "loss": 0.0, + "step": 37623 + }, + { + "epoch": 3.5106839600634507, + "grad_norm": NaN, + "learning_rate": 0.00011645333023829301, + "loss": 0.0, + "step": 37624 + }, + { + "epoch": 3.5107772697583277, + "grad_norm": NaN, + "learning_rate": 0.00011644595808117558, + "loss": 0.0, + "step": 37625 + }, + { + "epoch": 3.510870579453205, + "grad_norm": NaN, + "learning_rate": 0.0001164385860093761, + "loss": 0.0, + "step": 37626 + }, + { + "epoch": 3.5109638891480826, + "grad_norm": NaN, + "learning_rate": 0.00011643121402291332, + "loss": 0.0, + "step": 37627 + }, + { + "epoch": 3.51105719884296, + "grad_norm": NaN, + "learning_rate": 0.00011642384212180608, + "loss": 0.0, + "step": 37628 + }, + { + "epoch": 3.511150508537837, + "grad_norm": NaN, + "learning_rate": 0.00011641647030607305, + "loss": 0.0, + "step": 37629 + }, + { + "epoch": 3.5112438182327144, + "grad_norm": NaN, + "learning_rate": 0.00011640909857573293, + "loss": 0.0, + "step": 37630 + }, + { + "epoch": 3.511337127927592, + "grad_norm": NaN, + "learning_rate": 0.0001164017269308046, + "loss": 0.0, + "step": 37631 + }, + { + "epoch": 3.5114304376224688, + "grad_norm": NaN, + "learning_rate": 0.00011639435537130668, + "loss": 0.0, + "step": 37632 + }, + { + "epoch": 3.511523747317346, + "grad_norm": NaN, + "learning_rate": 0.00011638698389725796, + "loss": 0.0, + "step": 37633 + }, + { + "epoch": 3.5116170570122236, + "grad_norm": NaN, + "learning_rate": 0.00011637961250867722, + "loss": 0.0, + "step": 37634 + }, + { + "epoch": 3.511710366707101, + "grad_norm": NaN, + "learning_rate": 0.00011637224120558314, + "loss": 0.0, + "step": 37635 + }, + { + "epoch": 3.5118036764019784, + "grad_norm": NaN, + "learning_rate": 0.00011636486998799446, + "loss": 0.0, + "step": 37636 + }, + { + "epoch": 3.5118969860968554, + "grad_norm": NaN, + "learning_rate": 0.00011635749885593, + "loss": 0.0, + "step": 37637 + }, + { + "epoch": 3.511990295791733, + "grad_norm": NaN, + "learning_rate": 0.00011635012780940844, + "loss": 0.0, + "step": 37638 + }, + { + "epoch": 3.51208360548661, + "grad_norm": NaN, + "learning_rate": 0.00011634275684844849, + "loss": 0.0, + "step": 37639 + }, + { + "epoch": 3.5121769151814872, + "grad_norm": NaN, + "learning_rate": 0.00011633538597306902, + "loss": 0.0, + "step": 37640 + }, + { + "epoch": 3.5122702248763646, + "grad_norm": NaN, + "learning_rate": 0.00011632801518328867, + "loss": 0.0, + "step": 37641 + }, + { + "epoch": 3.512363534571242, + "grad_norm": NaN, + "learning_rate": 0.00011632064447912614, + "loss": 0.0, + "step": 37642 + }, + { + "epoch": 3.5124568442661195, + "grad_norm": NaN, + "learning_rate": 0.00011631327386060032, + "loss": 0.0, + "step": 37643 + }, + { + "epoch": 3.5125501539609965, + "grad_norm": NaN, + "learning_rate": 0.00011630590332772982, + "loss": 0.0, + "step": 37644 + }, + { + "epoch": 3.512643463655874, + "grad_norm": NaN, + "learning_rate": 0.0001162985328805334, + "loss": 0.0, + "step": 37645 + }, + { + "epoch": 3.5127367733507513, + "grad_norm": NaN, + "learning_rate": 0.00011629116251902989, + "loss": 0.0, + "step": 37646 + }, + { + "epoch": 3.5128300830456283, + "grad_norm": NaN, + "learning_rate": 0.00011628379224323794, + "loss": 0.0, + "step": 37647 + }, + { + "epoch": 3.5129233927405057, + "grad_norm": NaN, + "learning_rate": 0.00011627642205317629, + "loss": 0.0, + "step": 37648 + }, + { + "epoch": 3.513016702435383, + "grad_norm": NaN, + "learning_rate": 0.00011626905194886378, + "loss": 0.0, + "step": 37649 + }, + { + "epoch": 3.5131100121302605, + "grad_norm": NaN, + "learning_rate": 0.00011626168193031899, + "loss": 0.0, + "step": 37650 + }, + { + "epoch": 3.5132033218251375, + "grad_norm": NaN, + "learning_rate": 0.0001162543119975608, + "loss": 0.0, + "step": 37651 + }, + { + "epoch": 3.513296631520015, + "grad_norm": NaN, + "learning_rate": 0.00011624694215060793, + "loss": 0.0, + "step": 37652 + }, + { + "epoch": 3.5133899412148923, + "grad_norm": NaN, + "learning_rate": 0.00011623957238947902, + "loss": 0.0, + "step": 37653 + }, + { + "epoch": 3.5134832509097693, + "grad_norm": NaN, + "learning_rate": 0.0001162322027141929, + "loss": 0.0, + "step": 37654 + }, + { + "epoch": 3.5135765606046467, + "grad_norm": NaN, + "learning_rate": 0.00011622483312476834, + "loss": 0.0, + "step": 37655 + }, + { + "epoch": 3.513669870299524, + "grad_norm": NaN, + "learning_rate": 0.00011621746362122393, + "loss": 0.0, + "step": 37656 + }, + { + "epoch": 3.5137631799944016, + "grad_norm": NaN, + "learning_rate": 0.00011621009420357857, + "loss": 0.0, + "step": 37657 + }, + { + "epoch": 3.5138564896892786, + "grad_norm": NaN, + "learning_rate": 0.00011620272487185093, + "loss": 0.0, + "step": 37658 + }, + { + "epoch": 3.513949799384156, + "grad_norm": NaN, + "learning_rate": 0.00011619535562605972, + "loss": 0.0, + "step": 37659 + }, + { + "epoch": 3.5140431090790334, + "grad_norm": NaN, + "learning_rate": 0.0001161879864662237, + "loss": 0.0, + "step": 37660 + }, + { + "epoch": 3.5141364187739104, + "grad_norm": NaN, + "learning_rate": 0.00011618061739236168, + "loss": 0.0, + "step": 37661 + }, + { + "epoch": 3.514229728468788, + "grad_norm": NaN, + "learning_rate": 0.00011617324840449225, + "loss": 0.0, + "step": 37662 + }, + { + "epoch": 3.514323038163665, + "grad_norm": NaN, + "learning_rate": 0.00011616587950263429, + "loss": 0.0, + "step": 37663 + }, + { + "epoch": 3.5144163478585426, + "grad_norm": NaN, + "learning_rate": 0.00011615851068680645, + "loss": 0.0, + "step": 37664 + }, + { + "epoch": 3.51450965755342, + "grad_norm": NaN, + "learning_rate": 0.00011615114195702747, + "loss": 0.0, + "step": 37665 + }, + { + "epoch": 3.514602967248297, + "grad_norm": NaN, + "learning_rate": 0.00011614377331331616, + "loss": 0.0, + "step": 37666 + }, + { + "epoch": 3.5146962769431744, + "grad_norm": NaN, + "learning_rate": 0.00011613640475569118, + "loss": 0.0, + "step": 37667 + }, + { + "epoch": 3.514789586638052, + "grad_norm": NaN, + "learning_rate": 0.00011612903628417125, + "loss": 0.0, + "step": 37668 + }, + { + "epoch": 3.514882896332929, + "grad_norm": NaN, + "learning_rate": 0.00011612166789877524, + "loss": 0.0, + "step": 37669 + }, + { + "epoch": 3.5149762060278062, + "grad_norm": NaN, + "learning_rate": 0.00011611429959952173, + "loss": 0.0, + "step": 37670 + }, + { + "epoch": 3.5150695157226837, + "grad_norm": NaN, + "learning_rate": 0.00011610693138642949, + "loss": 0.0, + "step": 37671 + }, + { + "epoch": 3.515162825417561, + "grad_norm": NaN, + "learning_rate": 0.00011609956325951737, + "loss": 0.0, + "step": 37672 + }, + { + "epoch": 3.515256135112438, + "grad_norm": NaN, + "learning_rate": 0.00011609219521880396, + "loss": 0.0, + "step": 37673 + }, + { + "epoch": 3.5153494448073155, + "grad_norm": NaN, + "learning_rate": 0.00011608482726430802, + "loss": 0.0, + "step": 37674 + }, + { + "epoch": 3.515442754502193, + "grad_norm": NaN, + "learning_rate": 0.00011607745939604842, + "loss": 0.0, + "step": 37675 + }, + { + "epoch": 3.51553606419707, + "grad_norm": NaN, + "learning_rate": 0.00011607009161404371, + "loss": 0.0, + "step": 37676 + }, + { + "epoch": 3.5156293738919473, + "grad_norm": NaN, + "learning_rate": 0.0001160627239183127, + "loss": 0.0, + "step": 37677 + }, + { + "epoch": 3.5157226835868247, + "grad_norm": NaN, + "learning_rate": 0.00011605535630887421, + "loss": 0.0, + "step": 37678 + }, + { + "epoch": 3.515815993281702, + "grad_norm": NaN, + "learning_rate": 0.00011604798878574683, + "loss": 0.0, + "step": 37679 + }, + { + "epoch": 3.515909302976579, + "grad_norm": NaN, + "learning_rate": 0.00011604062134894934, + "loss": 0.0, + "step": 37680 + }, + { + "epoch": 3.5160026126714565, + "grad_norm": NaN, + "learning_rate": 0.00011603325399850056, + "loss": 0.0, + "step": 37681 + }, + { + "epoch": 3.516095922366334, + "grad_norm": NaN, + "learning_rate": 0.00011602588673441912, + "loss": 0.0, + "step": 37682 + }, + { + "epoch": 3.516189232061211, + "grad_norm": NaN, + "learning_rate": 0.00011601851955672375, + "loss": 0.0, + "step": 37683 + }, + { + "epoch": 3.5162825417560883, + "grad_norm": NaN, + "learning_rate": 0.00011601115246543327, + "loss": 0.0, + "step": 37684 + }, + { + "epoch": 3.5163758514509658, + "grad_norm": NaN, + "learning_rate": 0.00011600378546056635, + "loss": 0.0, + "step": 37685 + }, + { + "epoch": 3.516469161145843, + "grad_norm": NaN, + "learning_rate": 0.00011599641854214169, + "loss": 0.0, + "step": 37686 + }, + { + "epoch": 3.5165624708407206, + "grad_norm": NaN, + "learning_rate": 0.00011598905171017814, + "loss": 0.0, + "step": 37687 + }, + { + "epoch": 3.5166557805355976, + "grad_norm": NaN, + "learning_rate": 0.00011598168496469427, + "loss": 0.0, + "step": 37688 + }, + { + "epoch": 3.516749090230475, + "grad_norm": NaN, + "learning_rate": 0.00011597431830570896, + "loss": 0.0, + "step": 37689 + }, + { + "epoch": 3.516842399925352, + "grad_norm": NaN, + "learning_rate": 0.00011596695173324088, + "loss": 0.0, + "step": 37690 + }, + { + "epoch": 3.5169357096202294, + "grad_norm": NaN, + "learning_rate": 0.00011595958524730872, + "loss": 0.0, + "step": 37691 + }, + { + "epoch": 3.517029019315107, + "grad_norm": NaN, + "learning_rate": 0.00011595221884793126, + "loss": 0.0, + "step": 37692 + }, + { + "epoch": 3.5171223290099842, + "grad_norm": NaN, + "learning_rate": 0.00011594485253512728, + "loss": 0.0, + "step": 37693 + }, + { + "epoch": 3.5172156387048616, + "grad_norm": NaN, + "learning_rate": 0.00011593748630891538, + "loss": 0.0, + "step": 37694 + }, + { + "epoch": 3.5173089483997386, + "grad_norm": NaN, + "learning_rate": 0.0001159301201693144, + "loss": 0.0, + "step": 37695 + }, + { + "epoch": 3.517402258094616, + "grad_norm": NaN, + "learning_rate": 0.00011592275411634305, + "loss": 0.0, + "step": 37696 + }, + { + "epoch": 3.5174955677894935, + "grad_norm": NaN, + "learning_rate": 0.00011591538815001997, + "loss": 0.0, + "step": 37697 + }, + { + "epoch": 3.5175888774843704, + "grad_norm": NaN, + "learning_rate": 0.00011590802227036402, + "loss": 0.0, + "step": 37698 + }, + { + "epoch": 3.517682187179248, + "grad_norm": NaN, + "learning_rate": 0.00011590065647739389, + "loss": 0.0, + "step": 37699 + }, + { + "epoch": 3.5177754968741253, + "grad_norm": NaN, + "learning_rate": 0.00011589329077112821, + "loss": 0.0, + "step": 37700 + }, + { + "epoch": 3.5178688065690027, + "grad_norm": NaN, + "learning_rate": 0.00011588592515158584, + "loss": 0.0, + "step": 37701 + }, + { + "epoch": 3.5179621162638797, + "grad_norm": NaN, + "learning_rate": 0.00011587855961878548, + "loss": 0.0, + "step": 37702 + }, + { + "epoch": 3.518055425958757, + "grad_norm": NaN, + "learning_rate": 0.00011587119417274577, + "loss": 0.0, + "step": 37703 + }, + { + "epoch": 3.5181487356536345, + "grad_norm": NaN, + "learning_rate": 0.00011586382881348555, + "loss": 0.0, + "step": 37704 + }, + { + "epoch": 3.5182420453485115, + "grad_norm": NaN, + "learning_rate": 0.00011585646354102353, + "loss": 0.0, + "step": 37705 + }, + { + "epoch": 3.518335355043389, + "grad_norm": NaN, + "learning_rate": 0.00011584909835537833, + "loss": 0.0, + "step": 37706 + }, + { + "epoch": 3.5184286647382663, + "grad_norm": NaN, + "learning_rate": 0.00011584173325656884, + "loss": 0.0, + "step": 37707 + }, + { + "epoch": 3.5185219744331437, + "grad_norm": NaN, + "learning_rate": 0.00011583436824461366, + "loss": 0.0, + "step": 37708 + }, + { + "epoch": 3.518615284128021, + "grad_norm": NaN, + "learning_rate": 0.00011582700331953154, + "loss": 0.0, + "step": 37709 + }, + { + "epoch": 3.518708593822898, + "grad_norm": NaN, + "learning_rate": 0.00011581963848134128, + "loss": 0.0, + "step": 37710 + }, + { + "epoch": 3.5188019035177756, + "grad_norm": NaN, + "learning_rate": 0.00011581227373006152, + "loss": 0.0, + "step": 37711 + }, + { + "epoch": 3.5188952132126525, + "grad_norm": NaN, + "learning_rate": 0.00011580490906571099, + "loss": 0.0, + "step": 37712 + }, + { + "epoch": 3.51898852290753, + "grad_norm": NaN, + "learning_rate": 0.00011579754448830851, + "loss": 0.0, + "step": 37713 + }, + { + "epoch": 3.5190818326024074, + "grad_norm": NaN, + "learning_rate": 0.00011579017999787273, + "loss": 0.0, + "step": 37714 + }, + { + "epoch": 3.519175142297285, + "grad_norm": NaN, + "learning_rate": 0.00011578281559442235, + "loss": 0.0, + "step": 37715 + }, + { + "epoch": 3.519268451992162, + "grad_norm": NaN, + "learning_rate": 0.00011577545127797618, + "loss": 0.0, + "step": 37716 + }, + { + "epoch": 3.519361761687039, + "grad_norm": NaN, + "learning_rate": 0.00011576808704855289, + "loss": 0.0, + "step": 37717 + }, + { + "epoch": 3.5194550713819166, + "grad_norm": NaN, + "learning_rate": 0.00011576072290617115, + "loss": 0.0, + "step": 37718 + }, + { + "epoch": 3.519548381076794, + "grad_norm": NaN, + "learning_rate": 0.00011575335885084983, + "loss": 0.0, + "step": 37719 + }, + { + "epoch": 3.519641690771671, + "grad_norm": NaN, + "learning_rate": 0.00011574599488260757, + "loss": 0.0, + "step": 37720 + }, + { + "epoch": 3.5197350004665484, + "grad_norm": NaN, + "learning_rate": 0.00011573863100146303, + "loss": 0.0, + "step": 37721 + }, + { + "epoch": 3.519828310161426, + "grad_norm": NaN, + "learning_rate": 0.00011573126720743509, + "loss": 0.0, + "step": 37722 + }, + { + "epoch": 3.5199216198563033, + "grad_norm": NaN, + "learning_rate": 0.00011572390350054234, + "loss": 0.0, + "step": 37723 + }, + { + "epoch": 3.5200149295511802, + "grad_norm": NaN, + "learning_rate": 0.0001157165398808035, + "loss": 0.0, + "step": 37724 + }, + { + "epoch": 3.5201082392460576, + "grad_norm": NaN, + "learning_rate": 0.00011570917634823744, + "loss": 0.0, + "step": 37725 + }, + { + "epoch": 3.520201548940935, + "grad_norm": NaN, + "learning_rate": 0.0001157018129028627, + "loss": 0.0, + "step": 37726 + }, + { + "epoch": 3.520294858635812, + "grad_norm": NaN, + "learning_rate": 0.00011569444954469815, + "loss": 0.0, + "step": 37727 + }, + { + "epoch": 3.5203881683306895, + "grad_norm": NaN, + "learning_rate": 0.00011568708627376246, + "loss": 0.0, + "step": 37728 + }, + { + "epoch": 3.520481478025567, + "grad_norm": NaN, + "learning_rate": 0.00011567972309007428, + "loss": 0.0, + "step": 37729 + }, + { + "epoch": 3.5205747877204443, + "grad_norm": NaN, + "learning_rate": 0.00011567235999365244, + "loss": 0.0, + "step": 37730 + }, + { + "epoch": 3.5206680974153217, + "grad_norm": NaN, + "learning_rate": 0.00011566499698451566, + "loss": 0.0, + "step": 37731 + }, + { + "epoch": 3.5207614071101987, + "grad_norm": NaN, + "learning_rate": 0.00011565763406268255, + "loss": 0.0, + "step": 37732 + }, + { + "epoch": 3.520854716805076, + "grad_norm": NaN, + "learning_rate": 0.00011565027122817193, + "loss": 0.0, + "step": 37733 + }, + { + "epoch": 3.520948026499953, + "grad_norm": NaN, + "learning_rate": 0.00011564290848100253, + "loss": 0.0, + "step": 37734 + }, + { + "epoch": 3.5210413361948305, + "grad_norm": NaN, + "learning_rate": 0.00011563554582119297, + "loss": 0.0, + "step": 37735 + }, + { + "epoch": 3.521134645889708, + "grad_norm": NaN, + "learning_rate": 0.00011562818324876208, + "loss": 0.0, + "step": 37736 + }, + { + "epoch": 3.5212279555845853, + "grad_norm": NaN, + "learning_rate": 0.00011562082076372856, + "loss": 0.0, + "step": 37737 + }, + { + "epoch": 3.5213212652794628, + "grad_norm": NaN, + "learning_rate": 0.00011561345836611105, + "loss": 0.0, + "step": 37738 + }, + { + "epoch": 3.5214145749743397, + "grad_norm": NaN, + "learning_rate": 0.00011560609605592837, + "loss": 0.0, + "step": 37739 + }, + { + "epoch": 3.521507884669217, + "grad_norm": NaN, + "learning_rate": 0.0001155987338331992, + "loss": 0.0, + "step": 37740 + }, + { + "epoch": 3.5216011943640946, + "grad_norm": NaN, + "learning_rate": 0.00011559137169794223, + "loss": 0.0, + "step": 37741 + }, + { + "epoch": 3.5216945040589716, + "grad_norm": NaN, + "learning_rate": 0.00011558400965017623, + "loss": 0.0, + "step": 37742 + }, + { + "epoch": 3.521787813753849, + "grad_norm": NaN, + "learning_rate": 0.00011557664768991992, + "loss": 0.0, + "step": 37743 + }, + { + "epoch": 3.5218811234487264, + "grad_norm": NaN, + "learning_rate": 0.00011556928581719194, + "loss": 0.0, + "step": 37744 + }, + { + "epoch": 3.521974433143604, + "grad_norm": NaN, + "learning_rate": 0.0001155619240320111, + "loss": 0.0, + "step": 37745 + }, + { + "epoch": 3.522067742838481, + "grad_norm": NaN, + "learning_rate": 0.00011555456233439613, + "loss": 0.0, + "step": 37746 + }, + { + "epoch": 3.522161052533358, + "grad_norm": NaN, + "learning_rate": 0.00011554720072436563, + "loss": 0.0, + "step": 37747 + }, + { + "epoch": 3.5222543622282356, + "grad_norm": NaN, + "learning_rate": 0.00011553983920193842, + "loss": 0.0, + "step": 37748 + }, + { + "epoch": 3.5223476719231126, + "grad_norm": NaN, + "learning_rate": 0.00011553247776713322, + "loss": 0.0, + "step": 37749 + }, + { + "epoch": 3.52244098161799, + "grad_norm": NaN, + "learning_rate": 0.00011552511641996867, + "loss": 0.0, + "step": 37750 + }, + { + "epoch": 3.5225342913128674, + "grad_norm": NaN, + "learning_rate": 0.00011551775516046357, + "loss": 0.0, + "step": 37751 + }, + { + "epoch": 3.522627601007745, + "grad_norm": NaN, + "learning_rate": 0.00011551039398863662, + "loss": 0.0, + "step": 37752 + }, + { + "epoch": 3.5227209107026223, + "grad_norm": NaN, + "learning_rate": 0.00011550303290450647, + "loss": 0.0, + "step": 37753 + }, + { + "epoch": 3.5228142203974993, + "grad_norm": NaN, + "learning_rate": 0.00011549567190809194, + "loss": 0.0, + "step": 37754 + }, + { + "epoch": 3.5229075300923767, + "grad_norm": NaN, + "learning_rate": 0.00011548831099941166, + "loss": 0.0, + "step": 37755 + }, + { + "epoch": 3.5230008397872536, + "grad_norm": NaN, + "learning_rate": 0.0001154809501784844, + "loss": 0.0, + "step": 37756 + }, + { + "epoch": 3.523094149482131, + "grad_norm": NaN, + "learning_rate": 0.00011547358944532886, + "loss": 0.0, + "step": 37757 + }, + { + "epoch": 3.5231874591770085, + "grad_norm": NaN, + "learning_rate": 0.00011546622879996374, + "loss": 0.0, + "step": 37758 + }, + { + "epoch": 3.523280768871886, + "grad_norm": NaN, + "learning_rate": 0.00011545886824240776, + "loss": 0.0, + "step": 37759 + }, + { + "epoch": 3.5233740785667633, + "grad_norm": NaN, + "learning_rate": 0.00011545150777267969, + "loss": 0.0, + "step": 37760 + }, + { + "epoch": 3.5234673882616403, + "grad_norm": NaN, + "learning_rate": 0.00011544414739079814, + "loss": 0.0, + "step": 37761 + }, + { + "epoch": 3.5235606979565177, + "grad_norm": NaN, + "learning_rate": 0.00011543678709678193, + "loss": 0.0, + "step": 37762 + }, + { + "epoch": 3.523654007651395, + "grad_norm": NaN, + "learning_rate": 0.00011542942689064975, + "loss": 0.0, + "step": 37763 + }, + { + "epoch": 3.523747317346272, + "grad_norm": NaN, + "learning_rate": 0.00011542206677242024, + "loss": 0.0, + "step": 37764 + }, + { + "epoch": 3.5238406270411495, + "grad_norm": NaN, + "learning_rate": 0.0001154147067421122, + "loss": 0.0, + "step": 37765 + }, + { + "epoch": 3.523933936736027, + "grad_norm": NaN, + "learning_rate": 0.00011540734679974436, + "loss": 0.0, + "step": 37766 + }, + { + "epoch": 3.5240272464309044, + "grad_norm": NaN, + "learning_rate": 0.00011539998694533531, + "loss": 0.0, + "step": 37767 + }, + { + "epoch": 3.5241205561257813, + "grad_norm": NaN, + "learning_rate": 0.00011539262717890387, + "loss": 0.0, + "step": 37768 + }, + { + "epoch": 3.5242138658206588, + "grad_norm": NaN, + "learning_rate": 0.00011538526750046878, + "loss": 0.0, + "step": 37769 + }, + { + "epoch": 3.524307175515536, + "grad_norm": NaN, + "learning_rate": 0.00011537790791004863, + "loss": 0.0, + "step": 37770 + }, + { + "epoch": 3.524400485210413, + "grad_norm": NaN, + "learning_rate": 0.00011537054840766222, + "loss": 0.0, + "step": 37771 + }, + { + "epoch": 3.5244937949052906, + "grad_norm": NaN, + "learning_rate": 0.00011536318899332831, + "loss": 0.0, + "step": 37772 + }, + { + "epoch": 3.524587104600168, + "grad_norm": NaN, + "learning_rate": 0.00011535582966706546, + "loss": 0.0, + "step": 37773 + }, + { + "epoch": 3.5246804142950454, + "grad_norm": NaN, + "learning_rate": 0.00011534847042889252, + "loss": 0.0, + "step": 37774 + }, + { + "epoch": 3.5247737239899224, + "grad_norm": NaN, + "learning_rate": 0.00011534111127882819, + "loss": 0.0, + "step": 37775 + }, + { + "epoch": 3.5248670336848, + "grad_norm": NaN, + "learning_rate": 0.00011533375221689107, + "loss": 0.0, + "step": 37776 + }, + { + "epoch": 3.5249603433796772, + "grad_norm": NaN, + "learning_rate": 0.00011532639324309999, + "loss": 0.0, + "step": 37777 + }, + { + "epoch": 3.525053653074554, + "grad_norm": NaN, + "learning_rate": 0.00011531903435747364, + "loss": 0.0, + "step": 37778 + }, + { + "epoch": 3.5251469627694316, + "grad_norm": NaN, + "learning_rate": 0.00011531167556003067, + "loss": 0.0, + "step": 37779 + }, + { + "epoch": 3.525240272464309, + "grad_norm": NaN, + "learning_rate": 0.00011530431685078985, + "loss": 0.0, + "step": 37780 + }, + { + "epoch": 3.5253335821591865, + "grad_norm": NaN, + "learning_rate": 0.00011529695822976991, + "loss": 0.0, + "step": 37781 + }, + { + "epoch": 3.525426891854064, + "grad_norm": NaN, + "learning_rate": 0.00011528959969698946, + "loss": 0.0, + "step": 37782 + }, + { + "epoch": 3.525520201548941, + "grad_norm": NaN, + "learning_rate": 0.00011528224125246732, + "loss": 0.0, + "step": 37783 + }, + { + "epoch": 3.5256135112438183, + "grad_norm": NaN, + "learning_rate": 0.00011527488289622218, + "loss": 0.0, + "step": 37784 + }, + { + "epoch": 3.5257068209386957, + "grad_norm": NaN, + "learning_rate": 0.00011526752462827266, + "loss": 0.0, + "step": 37785 + }, + { + "epoch": 3.5258001306335727, + "grad_norm": NaN, + "learning_rate": 0.00011526016644863757, + "loss": 0.0, + "step": 37786 + }, + { + "epoch": 3.52589344032845, + "grad_norm": NaN, + "learning_rate": 0.0001152528083573356, + "loss": 0.0, + "step": 37787 + }, + { + "epoch": 3.5259867500233275, + "grad_norm": NaN, + "learning_rate": 0.0001152454503543854, + "loss": 0.0, + "step": 37788 + }, + { + "epoch": 3.526080059718205, + "grad_norm": NaN, + "learning_rate": 0.00011523809243980575, + "loss": 0.0, + "step": 37789 + }, + { + "epoch": 3.526173369413082, + "grad_norm": NaN, + "learning_rate": 0.00011523073461361537, + "loss": 0.0, + "step": 37790 + }, + { + "epoch": 3.5262666791079593, + "grad_norm": NaN, + "learning_rate": 0.00011522337687583289, + "loss": 0.0, + "step": 37791 + }, + { + "epoch": 3.5263599888028367, + "grad_norm": NaN, + "learning_rate": 0.00011521601922647706, + "loss": 0.0, + "step": 37792 + }, + { + "epoch": 3.5264532984977137, + "grad_norm": NaN, + "learning_rate": 0.00011520866166556663, + "loss": 0.0, + "step": 37793 + }, + { + "epoch": 3.526546608192591, + "grad_norm": NaN, + "learning_rate": 0.00011520130419312021, + "loss": 0.0, + "step": 37794 + }, + { + "epoch": 3.5266399178874686, + "grad_norm": NaN, + "learning_rate": 0.0001151939468091566, + "loss": 0.0, + "step": 37795 + }, + { + "epoch": 3.526733227582346, + "grad_norm": NaN, + "learning_rate": 0.0001151865895136945, + "loss": 0.0, + "step": 37796 + }, + { + "epoch": 3.526826537277223, + "grad_norm": NaN, + "learning_rate": 0.00011517923230675252, + "loss": 0.0, + "step": 37797 + }, + { + "epoch": 3.5269198469721004, + "grad_norm": NaN, + "learning_rate": 0.00011517187518834951, + "loss": 0.0, + "step": 37798 + }, + { + "epoch": 3.527013156666978, + "grad_norm": NaN, + "learning_rate": 0.00011516451815850404, + "loss": 0.0, + "step": 37799 + }, + { + "epoch": 3.5271064663618548, + "grad_norm": NaN, + "learning_rate": 0.00011515716121723491, + "loss": 0.0, + "step": 37800 + }, + { + "epoch": 3.527199776056732, + "grad_norm": NaN, + "learning_rate": 0.00011514980436456083, + "loss": 0.0, + "step": 37801 + }, + { + "epoch": 3.5272930857516096, + "grad_norm": NaN, + "learning_rate": 0.00011514244760050041, + "loss": 0.0, + "step": 37802 + }, + { + "epoch": 3.527386395446487, + "grad_norm": NaN, + "learning_rate": 0.00011513509092507246, + "loss": 0.0, + "step": 37803 + }, + { + "epoch": 3.5274797051413644, + "grad_norm": NaN, + "learning_rate": 0.00011512773433829567, + "loss": 0.0, + "step": 37804 + }, + { + "epoch": 3.5275730148362414, + "grad_norm": NaN, + "learning_rate": 0.00011512037784018866, + "loss": 0.0, + "step": 37805 + }, + { + "epoch": 3.527666324531119, + "grad_norm": NaN, + "learning_rate": 0.00011511302143077021, + "loss": 0.0, + "step": 37806 + }, + { + "epoch": 3.527759634225996, + "grad_norm": NaN, + "learning_rate": 0.00011510566511005909, + "loss": 0.0, + "step": 37807 + }, + { + "epoch": 3.5278529439208732, + "grad_norm": NaN, + "learning_rate": 0.00011509830887807382, + "loss": 0.0, + "step": 37808 + }, + { + "epoch": 3.5279462536157506, + "grad_norm": NaN, + "learning_rate": 0.00011509095273483326, + "loss": 0.0, + "step": 37809 + }, + { + "epoch": 3.528039563310628, + "grad_norm": NaN, + "learning_rate": 0.0001150835966803561, + "loss": 0.0, + "step": 37810 + }, + { + "epoch": 3.5281328730055055, + "grad_norm": NaN, + "learning_rate": 0.00011507624071466095, + "loss": 0.0, + "step": 37811 + }, + { + "epoch": 3.5282261827003825, + "grad_norm": NaN, + "learning_rate": 0.00011506888483776658, + "loss": 0.0, + "step": 37812 + }, + { + "epoch": 3.52831949239526, + "grad_norm": NaN, + "learning_rate": 0.00011506152904969175, + "loss": 0.0, + "step": 37813 + }, + { + "epoch": 3.5284128020901373, + "grad_norm": NaN, + "learning_rate": 0.00011505417335045503, + "loss": 0.0, + "step": 37814 + }, + { + "epoch": 3.5285061117850143, + "grad_norm": NaN, + "learning_rate": 0.00011504681774007523, + "loss": 0.0, + "step": 37815 + }, + { + "epoch": 3.5285994214798917, + "grad_norm": NaN, + "learning_rate": 0.00011503946221857106, + "loss": 0.0, + "step": 37816 + }, + { + "epoch": 3.528692731174769, + "grad_norm": NaN, + "learning_rate": 0.0001150321067859611, + "loss": 0.0, + "step": 37817 + }, + { + "epoch": 3.5287860408696465, + "grad_norm": NaN, + "learning_rate": 0.00011502475144226417, + "loss": 0.0, + "step": 37818 + }, + { + "epoch": 3.5288793505645235, + "grad_norm": NaN, + "learning_rate": 0.00011501739618749897, + "loss": 0.0, + "step": 37819 + }, + { + "epoch": 3.528972660259401, + "grad_norm": NaN, + "learning_rate": 0.0001150100410216841, + "loss": 0.0, + "step": 37820 + }, + { + "epoch": 3.5290659699542783, + "grad_norm": NaN, + "learning_rate": 0.00011500268594483836, + "loss": 0.0, + "step": 37821 + }, + { + "epoch": 3.5291592796491553, + "grad_norm": NaN, + "learning_rate": 0.00011499533095698047, + "loss": 0.0, + "step": 37822 + }, + { + "epoch": 3.5292525893440327, + "grad_norm": NaN, + "learning_rate": 0.00011498797605812902, + "loss": 0.0, + "step": 37823 + }, + { + "epoch": 3.52934589903891, + "grad_norm": NaN, + "learning_rate": 0.00011498062124830279, + "loss": 0.0, + "step": 37824 + }, + { + "epoch": 3.5294392087337876, + "grad_norm": NaN, + "learning_rate": 0.00011497326652752051, + "loss": 0.0, + "step": 37825 + }, + { + "epoch": 3.529532518428665, + "grad_norm": NaN, + "learning_rate": 0.00011496591189580077, + "loss": 0.0, + "step": 37826 + }, + { + "epoch": 3.529625828123542, + "grad_norm": NaN, + "learning_rate": 0.00011495855735316238, + "loss": 0.0, + "step": 37827 + }, + { + "epoch": 3.5297191378184194, + "grad_norm": NaN, + "learning_rate": 0.00011495120289962403, + "loss": 0.0, + "step": 37828 + }, + { + "epoch": 3.5298124475132964, + "grad_norm": NaN, + "learning_rate": 0.00011494384853520431, + "loss": 0.0, + "step": 37829 + }, + { + "epoch": 3.529905757208174, + "grad_norm": NaN, + "learning_rate": 0.00011493649425992204, + "loss": 0.0, + "step": 37830 + }, + { + "epoch": 3.529999066903051, + "grad_norm": NaN, + "learning_rate": 0.00011492914007379592, + "loss": 0.0, + "step": 37831 + }, + { + "epoch": 3.5300923765979286, + "grad_norm": NaN, + "learning_rate": 0.00011492178597684453, + "loss": 0.0, + "step": 37832 + }, + { + "epoch": 3.530185686292806, + "grad_norm": NaN, + "learning_rate": 0.00011491443196908663, + "loss": 0.0, + "step": 37833 + }, + { + "epoch": 3.530278995987683, + "grad_norm": NaN, + "learning_rate": 0.00011490707805054106, + "loss": 0.0, + "step": 37834 + }, + { + "epoch": 3.5303723056825604, + "grad_norm": NaN, + "learning_rate": 0.0001148997242212263, + "loss": 0.0, + "step": 37835 + }, + { + "epoch": 3.530465615377438, + "grad_norm": NaN, + "learning_rate": 0.00011489237048116113, + "loss": 0.0, + "step": 37836 + }, + { + "epoch": 3.530558925072315, + "grad_norm": NaN, + "learning_rate": 0.00011488501683036433, + "loss": 0.0, + "step": 37837 + }, + { + "epoch": 3.5306522347671923, + "grad_norm": NaN, + "learning_rate": 0.0001148776632688545, + "loss": 0.0, + "step": 37838 + }, + { + "epoch": 3.5307455444620697, + "grad_norm": NaN, + "learning_rate": 0.00011487030979665034, + "loss": 0.0, + "step": 37839 + }, + { + "epoch": 3.530838854156947, + "grad_norm": NaN, + "learning_rate": 0.00011486295641377064, + "loss": 0.0, + "step": 37840 + }, + { + "epoch": 3.530932163851824, + "grad_norm": NaN, + "learning_rate": 0.00011485560312023398, + "loss": 0.0, + "step": 37841 + }, + { + "epoch": 3.5310254735467015, + "grad_norm": NaN, + "learning_rate": 0.00011484824991605918, + "loss": 0.0, + "step": 37842 + }, + { + "epoch": 3.531118783241579, + "grad_norm": NaN, + "learning_rate": 0.00011484089680126478, + "loss": 0.0, + "step": 37843 + }, + { + "epoch": 3.531212092936456, + "grad_norm": NaN, + "learning_rate": 0.0001148335437758696, + "loss": 0.0, + "step": 37844 + }, + { + "epoch": 3.5313054026313333, + "grad_norm": NaN, + "learning_rate": 0.00011482619083989233, + "loss": 0.0, + "step": 37845 + }, + { + "epoch": 3.5313987123262107, + "grad_norm": NaN, + "learning_rate": 0.00011481883799335159, + "loss": 0.0, + "step": 37846 + }, + { + "epoch": 3.531492022021088, + "grad_norm": NaN, + "learning_rate": 0.00011481148523626615, + "loss": 0.0, + "step": 37847 + }, + { + "epoch": 3.5315853317159656, + "grad_norm": NaN, + "learning_rate": 0.0001148041325686547, + "loss": 0.0, + "step": 37848 + }, + { + "epoch": 3.5316786414108425, + "grad_norm": NaN, + "learning_rate": 0.00011479677999053585, + "loss": 0.0, + "step": 37849 + }, + { + "epoch": 3.53177195110572, + "grad_norm": NaN, + "learning_rate": 0.0001147894275019284, + "loss": 0.0, + "step": 37850 + }, + { + "epoch": 3.531865260800597, + "grad_norm": NaN, + "learning_rate": 0.00011478207510285103, + "loss": 0.0, + "step": 37851 + }, + { + "epoch": 3.5319585704954743, + "grad_norm": NaN, + "learning_rate": 0.00011477472279332236, + "loss": 0.0, + "step": 37852 + }, + { + "epoch": 3.5320518801903518, + "grad_norm": NaN, + "learning_rate": 0.00011476737057336115, + "loss": 0.0, + "step": 37853 + }, + { + "epoch": 3.532145189885229, + "grad_norm": NaN, + "learning_rate": 0.00011476001844298613, + "loss": 0.0, + "step": 37854 + }, + { + "epoch": 3.5322384995801066, + "grad_norm": NaN, + "learning_rate": 0.00011475266640221587, + "loss": 0.0, + "step": 37855 + }, + { + "epoch": 3.5323318092749836, + "grad_norm": NaN, + "learning_rate": 0.00011474531445106917, + "loss": 0.0, + "step": 37856 + }, + { + "epoch": 3.532425118969861, + "grad_norm": NaN, + "learning_rate": 0.00011473796258956473, + "loss": 0.0, + "step": 37857 + }, + { + "epoch": 3.5325184286647384, + "grad_norm": NaN, + "learning_rate": 0.00011473061081772114, + "loss": 0.0, + "step": 37858 + }, + { + "epoch": 3.5326117383596154, + "grad_norm": NaN, + "learning_rate": 0.00011472325913555719, + "loss": 0.0, + "step": 37859 + }, + { + "epoch": 3.532705048054493, + "grad_norm": NaN, + "learning_rate": 0.00011471590754309157, + "loss": 0.0, + "step": 37860 + }, + { + "epoch": 3.5327983577493702, + "grad_norm": NaN, + "learning_rate": 0.0001147085560403429, + "loss": 0.0, + "step": 37861 + }, + { + "epoch": 3.5328916674442477, + "grad_norm": NaN, + "learning_rate": 0.00011470120462732993, + "loss": 0.0, + "step": 37862 + }, + { + "epoch": 3.5329849771391246, + "grad_norm": NaN, + "learning_rate": 0.00011469385330407137, + "loss": 0.0, + "step": 37863 + }, + { + "epoch": 3.533078286834002, + "grad_norm": NaN, + "learning_rate": 0.00011468650207058582, + "loss": 0.0, + "step": 37864 + }, + { + "epoch": 3.5331715965288795, + "grad_norm": NaN, + "learning_rate": 0.0001146791509268921, + "loss": 0.0, + "step": 37865 + }, + { + "epoch": 3.5332649062237564, + "grad_norm": NaN, + "learning_rate": 0.00011467179987300882, + "loss": 0.0, + "step": 37866 + }, + { + "epoch": 3.533358215918634, + "grad_norm": NaN, + "learning_rate": 0.00011466444890895466, + "loss": 0.0, + "step": 37867 + }, + { + "epoch": 3.5334515256135113, + "grad_norm": NaN, + "learning_rate": 0.00011465709803474835, + "loss": 0.0, + "step": 37868 + }, + { + "epoch": 3.5335448353083887, + "grad_norm": NaN, + "learning_rate": 0.0001146497472504086, + "loss": 0.0, + "step": 37869 + }, + { + "epoch": 3.5336381450032657, + "grad_norm": NaN, + "learning_rate": 0.00011464239655595403, + "loss": 0.0, + "step": 37870 + }, + { + "epoch": 3.533731454698143, + "grad_norm": NaN, + "learning_rate": 0.00011463504595140336, + "loss": 0.0, + "step": 37871 + }, + { + "epoch": 3.5338247643930205, + "grad_norm": NaN, + "learning_rate": 0.00011462769543677537, + "loss": 0.0, + "step": 37872 + }, + { + "epoch": 3.5339180740878975, + "grad_norm": NaN, + "learning_rate": 0.00011462034501208861, + "loss": 0.0, + "step": 37873 + }, + { + "epoch": 3.534011383782775, + "grad_norm": NaN, + "learning_rate": 0.00011461299467736182, + "loss": 0.0, + "step": 37874 + }, + { + "epoch": 3.5341046934776523, + "grad_norm": NaN, + "learning_rate": 0.00011460564443261375, + "loss": 0.0, + "step": 37875 + }, + { + "epoch": 3.5341980031725297, + "grad_norm": NaN, + "learning_rate": 0.00011459829427786303, + "loss": 0.0, + "step": 37876 + }, + { + "epoch": 3.534291312867407, + "grad_norm": NaN, + "learning_rate": 0.00011459094421312831, + "loss": 0.0, + "step": 37877 + }, + { + "epoch": 3.534384622562284, + "grad_norm": NaN, + "learning_rate": 0.0001145835942384284, + "loss": 0.0, + "step": 37878 + }, + { + "epoch": 3.5344779322571616, + "grad_norm": NaN, + "learning_rate": 0.00011457624435378189, + "loss": 0.0, + "step": 37879 + }, + { + "epoch": 3.534571241952039, + "grad_norm": NaN, + "learning_rate": 0.00011456889455920746, + "loss": 0.0, + "step": 37880 + }, + { + "epoch": 3.534664551646916, + "grad_norm": NaN, + "learning_rate": 0.00011456154485472391, + "loss": 0.0, + "step": 37881 + }, + { + "epoch": 3.5347578613417934, + "grad_norm": NaN, + "learning_rate": 0.00011455419524034981, + "loss": 0.0, + "step": 37882 + }, + { + "epoch": 3.534851171036671, + "grad_norm": NaN, + "learning_rate": 0.00011454684571610389, + "loss": 0.0, + "step": 37883 + }, + { + "epoch": 3.534944480731548, + "grad_norm": NaN, + "learning_rate": 0.00011453949628200487, + "loss": 0.0, + "step": 37884 + }, + { + "epoch": 3.535037790426425, + "grad_norm": NaN, + "learning_rate": 0.00011453214693807139, + "loss": 0.0, + "step": 37885 + }, + { + "epoch": 3.5351311001213026, + "grad_norm": NaN, + "learning_rate": 0.00011452479768432211, + "loss": 0.0, + "step": 37886 + }, + { + "epoch": 3.53522440981618, + "grad_norm": NaN, + "learning_rate": 0.00011451744852077583, + "loss": 0.0, + "step": 37887 + }, + { + "epoch": 3.535317719511057, + "grad_norm": NaN, + "learning_rate": 0.00011451009944745114, + "loss": 0.0, + "step": 37888 + }, + { + "epoch": 3.5354110292059344, + "grad_norm": NaN, + "learning_rate": 0.00011450275046436678, + "loss": 0.0, + "step": 37889 + }, + { + "epoch": 3.535504338900812, + "grad_norm": NaN, + "learning_rate": 0.00011449540157154135, + "loss": 0.0, + "step": 37890 + }, + { + "epoch": 3.5355976485956893, + "grad_norm": NaN, + "learning_rate": 0.00011448805276899363, + "loss": 0.0, + "step": 37891 + }, + { + "epoch": 3.5356909582905662, + "grad_norm": NaN, + "learning_rate": 0.00011448070405674231, + "loss": 0.0, + "step": 37892 + }, + { + "epoch": 3.5357842679854437, + "grad_norm": NaN, + "learning_rate": 0.00011447335543480596, + "loss": 0.0, + "step": 37893 + }, + { + "epoch": 3.535877577680321, + "grad_norm": NaN, + "learning_rate": 0.00011446600690320339, + "loss": 0.0, + "step": 37894 + }, + { + "epoch": 3.535970887375198, + "grad_norm": NaN, + "learning_rate": 0.00011445865846195325, + "loss": 0.0, + "step": 37895 + }, + { + "epoch": 3.5360641970700755, + "grad_norm": NaN, + "learning_rate": 0.00011445131011107417, + "loss": 0.0, + "step": 37896 + }, + { + "epoch": 3.536157506764953, + "grad_norm": NaN, + "learning_rate": 0.00011444396185058491, + "loss": 0.0, + "step": 37897 + }, + { + "epoch": 3.5362508164598303, + "grad_norm": NaN, + "learning_rate": 0.00011443661368050413, + "loss": 0.0, + "step": 37898 + }, + { + "epoch": 3.5363441261547077, + "grad_norm": NaN, + "learning_rate": 0.00011442926560085046, + "loss": 0.0, + "step": 37899 + }, + { + "epoch": 3.5364374358495847, + "grad_norm": NaN, + "learning_rate": 0.00011442191761164267, + "loss": 0.0, + "step": 37900 + }, + { + "epoch": 3.536530745544462, + "grad_norm": NaN, + "learning_rate": 0.00011441456971289943, + "loss": 0.0, + "step": 37901 + }, + { + "epoch": 3.536624055239339, + "grad_norm": NaN, + "learning_rate": 0.00011440722190463933, + "loss": 0.0, + "step": 37902 + }, + { + "epoch": 3.5367173649342165, + "grad_norm": NaN, + "learning_rate": 0.00011439987418688115, + "loss": 0.0, + "step": 37903 + }, + { + "epoch": 3.536810674629094, + "grad_norm": NaN, + "learning_rate": 0.0001143925265596436, + "loss": 0.0, + "step": 37904 + }, + { + "epoch": 3.5369039843239714, + "grad_norm": NaN, + "learning_rate": 0.00011438517902294521, + "loss": 0.0, + "step": 37905 + }, + { + "epoch": 3.5369972940188488, + "grad_norm": NaN, + "learning_rate": 0.00011437783157680482, + "loss": 0.0, + "step": 37906 + }, + { + "epoch": 3.5370906037137257, + "grad_norm": NaN, + "learning_rate": 0.00011437048422124107, + "loss": 0.0, + "step": 37907 + }, + { + "epoch": 3.537183913408603, + "grad_norm": NaN, + "learning_rate": 0.00011436313695627257, + "loss": 0.0, + "step": 37908 + }, + { + "epoch": 3.5372772231034806, + "grad_norm": NaN, + "learning_rate": 0.00011435578978191805, + "loss": 0.0, + "step": 37909 + }, + { + "epoch": 3.5373705327983576, + "grad_norm": NaN, + "learning_rate": 0.0001143484426981963, + "loss": 0.0, + "step": 37910 + }, + { + "epoch": 3.537463842493235, + "grad_norm": NaN, + "learning_rate": 0.00011434109570512579, + "loss": 0.0, + "step": 37911 + }, + { + "epoch": 3.5375571521881124, + "grad_norm": NaN, + "learning_rate": 0.0001143337488027253, + "loss": 0.0, + "step": 37912 + }, + { + "epoch": 3.53765046188299, + "grad_norm": NaN, + "learning_rate": 0.0001143264019910136, + "loss": 0.0, + "step": 37913 + }, + { + "epoch": 3.537743771577867, + "grad_norm": NaN, + "learning_rate": 0.00011431905527000927, + "loss": 0.0, + "step": 37914 + }, + { + "epoch": 3.537837081272744, + "grad_norm": NaN, + "learning_rate": 0.00011431170863973097, + "loss": 0.0, + "step": 37915 + }, + { + "epoch": 3.5379303909676216, + "grad_norm": NaN, + "learning_rate": 0.0001143043621001975, + "loss": 0.0, + "step": 37916 + }, + { + "epoch": 3.5380237006624986, + "grad_norm": NaN, + "learning_rate": 0.00011429701565142743, + "loss": 0.0, + "step": 37917 + }, + { + "epoch": 3.538117010357376, + "grad_norm": NaN, + "learning_rate": 0.00011428966929343943, + "loss": 0.0, + "step": 37918 + }, + { + "epoch": 3.5382103200522534, + "grad_norm": NaN, + "learning_rate": 0.00011428232302625229, + "loss": 0.0, + "step": 37919 + }, + { + "epoch": 3.538303629747131, + "grad_norm": NaN, + "learning_rate": 0.00011427497684988459, + "loss": 0.0, + "step": 37920 + }, + { + "epoch": 3.5383969394420083, + "grad_norm": NaN, + "learning_rate": 0.00011426763076435501, + "loss": 0.0, + "step": 37921 + }, + { + "epoch": 3.5384902491368853, + "grad_norm": NaN, + "learning_rate": 0.00011426028476968233, + "loss": 0.0, + "step": 37922 + }, + { + "epoch": 3.5385835588317627, + "grad_norm": NaN, + "learning_rate": 0.00011425293886588512, + "loss": 0.0, + "step": 37923 + }, + { + "epoch": 3.5386768685266397, + "grad_norm": NaN, + "learning_rate": 0.00011424559305298205, + "loss": 0.0, + "step": 37924 + }, + { + "epoch": 3.538770178221517, + "grad_norm": NaN, + "learning_rate": 0.00011423824733099194, + "loss": 0.0, + "step": 37925 + }, + { + "epoch": 3.5388634879163945, + "grad_norm": NaN, + "learning_rate": 0.00011423090169993332, + "loss": 0.0, + "step": 37926 + }, + { + "epoch": 3.538956797611272, + "grad_norm": NaN, + "learning_rate": 0.00011422355615982492, + "loss": 0.0, + "step": 37927 + }, + { + "epoch": 3.5390501073061493, + "grad_norm": NaN, + "learning_rate": 0.00011421621071068544, + "loss": 0.0, + "step": 37928 + }, + { + "epoch": 3.5391434170010263, + "grad_norm": NaN, + "learning_rate": 0.00011420886535253354, + "loss": 0.0, + "step": 37929 + }, + { + "epoch": 3.5392367266959037, + "grad_norm": NaN, + "learning_rate": 0.00011420152008538783, + "loss": 0.0, + "step": 37930 + }, + { + "epoch": 3.539330036390781, + "grad_norm": NaN, + "learning_rate": 0.00011419417490926714, + "loss": 0.0, + "step": 37931 + }, + { + "epoch": 3.539423346085658, + "grad_norm": NaN, + "learning_rate": 0.00011418682982419, + "loss": 0.0, + "step": 37932 + }, + { + "epoch": 3.5395166557805355, + "grad_norm": NaN, + "learning_rate": 0.0001141794848301752, + "loss": 0.0, + "step": 37933 + }, + { + "epoch": 3.539609965475413, + "grad_norm": NaN, + "learning_rate": 0.00011417213992724128, + "loss": 0.0, + "step": 37934 + }, + { + "epoch": 3.5397032751702904, + "grad_norm": NaN, + "learning_rate": 0.00011416479511540705, + "loss": 0.0, + "step": 37935 + }, + { + "epoch": 3.5397965848651674, + "grad_norm": NaN, + "learning_rate": 0.00011415745039469113, + "loss": 0.0, + "step": 37936 + }, + { + "epoch": 3.5398898945600448, + "grad_norm": NaN, + "learning_rate": 0.00011415010576511215, + "loss": 0.0, + "step": 37937 + }, + { + "epoch": 3.539983204254922, + "grad_norm": NaN, + "learning_rate": 0.00011414276122668887, + "loss": 0.0, + "step": 37938 + }, + { + "epoch": 3.540076513949799, + "grad_norm": NaN, + "learning_rate": 0.00011413541677943993, + "loss": 0.0, + "step": 37939 + }, + { + "epoch": 3.5401698236446766, + "grad_norm": NaN, + "learning_rate": 0.00011412807242338397, + "loss": 0.0, + "step": 37940 + }, + { + "epoch": 3.540263133339554, + "grad_norm": NaN, + "learning_rate": 0.00011412072815853972, + "loss": 0.0, + "step": 37941 + }, + { + "epoch": 3.5403564430344314, + "grad_norm": NaN, + "learning_rate": 0.00011411338398492586, + "loss": 0.0, + "step": 37942 + }, + { + "epoch": 3.540449752729309, + "grad_norm": NaN, + "learning_rate": 0.00011410603990256096, + "loss": 0.0, + "step": 37943 + }, + { + "epoch": 3.540543062424186, + "grad_norm": NaN, + "learning_rate": 0.00011409869591146381, + "loss": 0.0, + "step": 37944 + }, + { + "epoch": 3.5406363721190632, + "grad_norm": NaN, + "learning_rate": 0.00011409135201165306, + "loss": 0.0, + "step": 37945 + }, + { + "epoch": 3.54072968181394, + "grad_norm": NaN, + "learning_rate": 0.00011408400820314732, + "loss": 0.0, + "step": 37946 + }, + { + "epoch": 3.5408229915088176, + "grad_norm": NaN, + "learning_rate": 0.00011407666448596529, + "loss": 0.0, + "step": 37947 + }, + { + "epoch": 3.540916301203695, + "grad_norm": NaN, + "learning_rate": 0.00011406932086012578, + "loss": 0.0, + "step": 37948 + }, + { + "epoch": 3.5410096108985725, + "grad_norm": NaN, + "learning_rate": 0.00011406197732564724, + "loss": 0.0, + "step": 37949 + }, + { + "epoch": 3.54110292059345, + "grad_norm": NaN, + "learning_rate": 0.00011405463388254842, + "loss": 0.0, + "step": 37950 + }, + { + "epoch": 3.541196230288327, + "grad_norm": NaN, + "learning_rate": 0.00011404729053084812, + "loss": 0.0, + "step": 37951 + }, + { + "epoch": 3.5412895399832043, + "grad_norm": NaN, + "learning_rate": 0.00011403994727056486, + "loss": 0.0, + "step": 37952 + }, + { + "epoch": 3.5413828496780817, + "grad_norm": NaN, + "learning_rate": 0.00011403260410171732, + "loss": 0.0, + "step": 37953 + }, + { + "epoch": 3.5414761593729587, + "grad_norm": NaN, + "learning_rate": 0.00011402526102432427, + "loss": 0.0, + "step": 37954 + }, + { + "epoch": 3.541569469067836, + "grad_norm": NaN, + "learning_rate": 0.00011401791803840433, + "loss": 0.0, + "step": 37955 + }, + { + "epoch": 3.5416627787627135, + "grad_norm": NaN, + "learning_rate": 0.0001140105751439761, + "loss": 0.0, + "step": 37956 + }, + { + "epoch": 3.541756088457591, + "grad_norm": NaN, + "learning_rate": 0.0001140032323410584, + "loss": 0.0, + "step": 37957 + }, + { + "epoch": 3.541849398152468, + "grad_norm": NaN, + "learning_rate": 0.00011399588962966979, + "loss": 0.0, + "step": 37958 + }, + { + "epoch": 3.5419427078473453, + "grad_norm": NaN, + "learning_rate": 0.0001139885470098289, + "loss": 0.0, + "step": 37959 + }, + { + "epoch": 3.5420360175422227, + "grad_norm": NaN, + "learning_rate": 0.00011398120448155458, + "loss": 0.0, + "step": 37960 + }, + { + "epoch": 3.5421293272370997, + "grad_norm": NaN, + "learning_rate": 0.00011397386204486534, + "loss": 0.0, + "step": 37961 + }, + { + "epoch": 3.542222636931977, + "grad_norm": NaN, + "learning_rate": 0.00011396651969977985, + "loss": 0.0, + "step": 37962 + }, + { + "epoch": 3.5423159466268546, + "grad_norm": NaN, + "learning_rate": 0.0001139591774463169, + "loss": 0.0, + "step": 37963 + }, + { + "epoch": 3.542409256321732, + "grad_norm": NaN, + "learning_rate": 0.00011395183528449505, + "loss": 0.0, + "step": 37964 + }, + { + "epoch": 3.5425025660166094, + "grad_norm": NaN, + "learning_rate": 0.00011394449321433296, + "loss": 0.0, + "step": 37965 + }, + { + "epoch": 3.5425958757114864, + "grad_norm": NaN, + "learning_rate": 0.00011393715123584942, + "loss": 0.0, + "step": 37966 + }, + { + "epoch": 3.542689185406364, + "grad_norm": NaN, + "learning_rate": 0.00011392980934906299, + "loss": 0.0, + "step": 37967 + }, + { + "epoch": 3.5427824951012408, + "grad_norm": NaN, + "learning_rate": 0.00011392246755399235, + "loss": 0.0, + "step": 37968 + }, + { + "epoch": 3.542875804796118, + "grad_norm": NaN, + "learning_rate": 0.00011391512585065624, + "loss": 0.0, + "step": 37969 + }, + { + "epoch": 3.5429691144909956, + "grad_norm": NaN, + "learning_rate": 0.00011390778423907325, + "loss": 0.0, + "step": 37970 + }, + { + "epoch": 3.543062424185873, + "grad_norm": NaN, + "learning_rate": 0.00011390044271926204, + "loss": 0.0, + "step": 37971 + }, + { + "epoch": 3.5431557338807504, + "grad_norm": NaN, + "learning_rate": 0.00011389310129124137, + "loss": 0.0, + "step": 37972 + }, + { + "epoch": 3.5432490435756274, + "grad_norm": NaN, + "learning_rate": 0.00011388575995502981, + "loss": 0.0, + "step": 37973 + }, + { + "epoch": 3.543342353270505, + "grad_norm": NaN, + "learning_rate": 0.00011387841871064605, + "loss": 0.0, + "step": 37974 + }, + { + "epoch": 3.5434356629653823, + "grad_norm": NaN, + "learning_rate": 0.00011387107755810881, + "loss": 0.0, + "step": 37975 + }, + { + "epoch": 3.5435289726602592, + "grad_norm": NaN, + "learning_rate": 0.00011386373649743672, + "loss": 0.0, + "step": 37976 + }, + { + "epoch": 3.5436222823551367, + "grad_norm": NaN, + "learning_rate": 0.00011385639552864845, + "loss": 0.0, + "step": 37977 + }, + { + "epoch": 3.543715592050014, + "grad_norm": NaN, + "learning_rate": 0.0001138490546517626, + "loss": 0.0, + "step": 37978 + }, + { + "epoch": 3.5438089017448915, + "grad_norm": NaN, + "learning_rate": 0.00011384171386679792, + "loss": 0.0, + "step": 37979 + }, + { + "epoch": 3.5439022114397685, + "grad_norm": NaN, + "learning_rate": 0.0001138343731737731, + "loss": 0.0, + "step": 37980 + }, + { + "epoch": 3.543995521134646, + "grad_norm": NaN, + "learning_rate": 0.00011382703257270669, + "loss": 0.0, + "step": 37981 + }, + { + "epoch": 3.5440888308295233, + "grad_norm": NaN, + "learning_rate": 0.00011381969206361743, + "loss": 0.0, + "step": 37982 + }, + { + "epoch": 3.5441821405244003, + "grad_norm": NaN, + "learning_rate": 0.00011381235164652402, + "loss": 0.0, + "step": 37983 + }, + { + "epoch": 3.5442754502192777, + "grad_norm": NaN, + "learning_rate": 0.00011380501132144503, + "loss": 0.0, + "step": 37984 + }, + { + "epoch": 3.544368759914155, + "grad_norm": NaN, + "learning_rate": 0.00011379767108839917, + "loss": 0.0, + "step": 37985 + }, + { + "epoch": 3.5444620696090325, + "grad_norm": NaN, + "learning_rate": 0.00011379033094740516, + "loss": 0.0, + "step": 37986 + }, + { + "epoch": 3.5445553793039095, + "grad_norm": NaN, + "learning_rate": 0.0001137829908984816, + "loss": 0.0, + "step": 37987 + }, + { + "epoch": 3.544648688998787, + "grad_norm": NaN, + "learning_rate": 0.00011377565094164712, + "loss": 0.0, + "step": 37988 + }, + { + "epoch": 3.5447419986936644, + "grad_norm": NaN, + "learning_rate": 0.00011376831107692049, + "loss": 0.0, + "step": 37989 + }, + { + "epoch": 3.5448353083885413, + "grad_norm": NaN, + "learning_rate": 0.0001137609713043203, + "loss": 0.0, + "step": 37990 + }, + { + "epoch": 3.5449286180834187, + "grad_norm": NaN, + "learning_rate": 0.00011375363162386517, + "loss": 0.0, + "step": 37991 + }, + { + "epoch": 3.545021927778296, + "grad_norm": NaN, + "learning_rate": 0.00011374629203557388, + "loss": 0.0, + "step": 37992 + }, + { + "epoch": 3.5451152374731736, + "grad_norm": NaN, + "learning_rate": 0.00011373895253946502, + "loss": 0.0, + "step": 37993 + }, + { + "epoch": 3.545208547168051, + "grad_norm": NaN, + "learning_rate": 0.00011373161313555722, + "loss": 0.0, + "step": 37994 + }, + { + "epoch": 3.545301856862928, + "grad_norm": NaN, + "learning_rate": 0.00011372427382386924, + "loss": 0.0, + "step": 37995 + }, + { + "epoch": 3.5453951665578054, + "grad_norm": NaN, + "learning_rate": 0.00011371693460441967, + "loss": 0.0, + "step": 37996 + }, + { + "epoch": 3.545488476252683, + "grad_norm": NaN, + "learning_rate": 0.00011370959547722714, + "loss": 0.0, + "step": 37997 + }, + { + "epoch": 3.54558178594756, + "grad_norm": NaN, + "learning_rate": 0.00011370225644231042, + "loss": 0.0, + "step": 37998 + }, + { + "epoch": 3.545675095642437, + "grad_norm": NaN, + "learning_rate": 0.0001136949174996881, + "loss": 0.0, + "step": 37999 + }, + { + "epoch": 3.5457684053373146, + "grad_norm": NaN, + "learning_rate": 0.0001136875786493788, + "loss": 0.0, + "step": 38000 + }, + { + "epoch": 3.545861715032192, + "grad_norm": NaN, + "learning_rate": 0.00011368023989140131, + "loss": 0.0, + "step": 38001 + }, + { + "epoch": 3.545955024727069, + "grad_norm": NaN, + "learning_rate": 0.00011367290122577416, + "loss": 0.0, + "step": 38002 + }, + { + "epoch": 3.5460483344219464, + "grad_norm": NaN, + "learning_rate": 0.00011366556265251604, + "loss": 0.0, + "step": 38003 + }, + { + "epoch": 3.546141644116824, + "grad_norm": NaN, + "learning_rate": 0.0001136582241716457, + "loss": 0.0, + "step": 38004 + }, + { + "epoch": 3.546234953811701, + "grad_norm": NaN, + "learning_rate": 0.00011365088578318168, + "loss": 0.0, + "step": 38005 + }, + { + "epoch": 3.5463282635065783, + "grad_norm": NaN, + "learning_rate": 0.00011364354748714267, + "loss": 0.0, + "step": 38006 + }, + { + "epoch": 3.5464215732014557, + "grad_norm": NaN, + "learning_rate": 0.00011363620928354744, + "loss": 0.0, + "step": 38007 + }, + { + "epoch": 3.546514882896333, + "grad_norm": NaN, + "learning_rate": 0.0001136288711724145, + "loss": 0.0, + "step": 38008 + }, + { + "epoch": 3.54660819259121, + "grad_norm": NaN, + "learning_rate": 0.00011362153315376251, + "loss": 0.0, + "step": 38009 + }, + { + "epoch": 3.5467015022860875, + "grad_norm": NaN, + "learning_rate": 0.00011361419522761028, + "loss": 0.0, + "step": 38010 + }, + { + "epoch": 3.546794811980965, + "grad_norm": NaN, + "learning_rate": 0.00011360685739397634, + "loss": 0.0, + "step": 38011 + }, + { + "epoch": 3.546888121675842, + "grad_norm": NaN, + "learning_rate": 0.00011359951965287934, + "loss": 0.0, + "step": 38012 + }, + { + "epoch": 3.5469814313707193, + "grad_norm": NaN, + "learning_rate": 0.00011359218200433805, + "loss": 0.0, + "step": 38013 + }, + { + "epoch": 3.5470747410655967, + "grad_norm": NaN, + "learning_rate": 0.00011358484444837103, + "loss": 0.0, + "step": 38014 + }, + { + "epoch": 3.547168050760474, + "grad_norm": NaN, + "learning_rate": 0.00011357750698499691, + "loss": 0.0, + "step": 38015 + }, + { + "epoch": 3.5472613604553516, + "grad_norm": NaN, + "learning_rate": 0.00011357016961423448, + "loss": 0.0, + "step": 38016 + }, + { + "epoch": 3.5473546701502285, + "grad_norm": NaN, + "learning_rate": 0.0001135628323361023, + "loss": 0.0, + "step": 38017 + }, + { + "epoch": 3.547447979845106, + "grad_norm": NaN, + "learning_rate": 0.00011355549515061899, + "loss": 0.0, + "step": 38018 + }, + { + "epoch": 3.547541289539983, + "grad_norm": NaN, + "learning_rate": 0.00011354815805780333, + "loss": 0.0, + "step": 38019 + }, + { + "epoch": 3.5476345992348604, + "grad_norm": NaN, + "learning_rate": 0.00011354082105767382, + "loss": 0.0, + "step": 38020 + }, + { + "epoch": 3.5477279089297378, + "grad_norm": NaN, + "learning_rate": 0.00011353348415024927, + "loss": 0.0, + "step": 38021 + }, + { + "epoch": 3.547821218624615, + "grad_norm": NaN, + "learning_rate": 0.00011352614733554827, + "loss": 0.0, + "step": 38022 + }, + { + "epoch": 3.5479145283194926, + "grad_norm": NaN, + "learning_rate": 0.00011351881061358943, + "loss": 0.0, + "step": 38023 + }, + { + "epoch": 3.5480078380143696, + "grad_norm": NaN, + "learning_rate": 0.00011351147398439152, + "loss": 0.0, + "step": 38024 + }, + { + "epoch": 3.548101147709247, + "grad_norm": NaN, + "learning_rate": 0.00011350413744797308, + "loss": 0.0, + "step": 38025 + }, + { + "epoch": 3.5481944574041244, + "grad_norm": NaN, + "learning_rate": 0.00011349680100435277, + "loss": 0.0, + "step": 38026 + }, + { + "epoch": 3.5482877670990014, + "grad_norm": NaN, + "learning_rate": 0.00011348946465354934, + "loss": 0.0, + "step": 38027 + }, + { + "epoch": 3.548381076793879, + "grad_norm": NaN, + "learning_rate": 0.00011348212839558137, + "loss": 0.0, + "step": 38028 + }, + { + "epoch": 3.5484743864887562, + "grad_norm": NaN, + "learning_rate": 0.00011347479223046748, + "loss": 0.0, + "step": 38029 + }, + { + "epoch": 3.5485676961836337, + "grad_norm": NaN, + "learning_rate": 0.00011346745615822646, + "loss": 0.0, + "step": 38030 + }, + { + "epoch": 3.5486610058785106, + "grad_norm": NaN, + "learning_rate": 0.00011346012017887683, + "loss": 0.0, + "step": 38031 + }, + { + "epoch": 3.548754315573388, + "grad_norm": NaN, + "learning_rate": 0.00011345278429243727, + "loss": 0.0, + "step": 38032 + }, + { + "epoch": 3.5488476252682655, + "grad_norm": NaN, + "learning_rate": 0.0001134454484989265, + "loss": 0.0, + "step": 38033 + }, + { + "epoch": 3.5489409349631424, + "grad_norm": NaN, + "learning_rate": 0.0001134381127983631, + "loss": 0.0, + "step": 38034 + }, + { + "epoch": 3.54903424465802, + "grad_norm": NaN, + "learning_rate": 0.00011343077719076573, + "loss": 0.0, + "step": 38035 + }, + { + "epoch": 3.5491275543528973, + "grad_norm": NaN, + "learning_rate": 0.0001134234416761531, + "loss": 0.0, + "step": 38036 + }, + { + "epoch": 3.5492208640477747, + "grad_norm": NaN, + "learning_rate": 0.00011341610625454383, + "loss": 0.0, + "step": 38037 + }, + { + "epoch": 3.549314173742652, + "grad_norm": NaN, + "learning_rate": 0.0001134087709259565, + "loss": 0.0, + "step": 38038 + }, + { + "epoch": 3.549407483437529, + "grad_norm": NaN, + "learning_rate": 0.00011340143569040989, + "loss": 0.0, + "step": 38039 + }, + { + "epoch": 3.5495007931324065, + "grad_norm": NaN, + "learning_rate": 0.00011339410054792257, + "loss": 0.0, + "step": 38040 + }, + { + "epoch": 3.5495941028272835, + "grad_norm": NaN, + "learning_rate": 0.00011338676549851317, + "loss": 0.0, + "step": 38041 + }, + { + "epoch": 3.549687412522161, + "grad_norm": NaN, + "learning_rate": 0.00011337943054220046, + "loss": 0.0, + "step": 38042 + }, + { + "epoch": 3.5497807222170383, + "grad_norm": NaN, + "learning_rate": 0.00011337209567900297, + "loss": 0.0, + "step": 38043 + }, + { + "epoch": 3.5498740319119158, + "grad_norm": NaN, + "learning_rate": 0.00011336476090893934, + "loss": 0.0, + "step": 38044 + }, + { + "epoch": 3.549967341606793, + "grad_norm": NaN, + "learning_rate": 0.00011335742623202834, + "loss": 0.0, + "step": 38045 + }, + { + "epoch": 3.55006065130167, + "grad_norm": NaN, + "learning_rate": 0.00011335009164828854, + "loss": 0.0, + "step": 38046 + }, + { + "epoch": 3.5501539609965476, + "grad_norm": NaN, + "learning_rate": 0.00011334275715773855, + "loss": 0.0, + "step": 38047 + }, + { + "epoch": 3.550247270691425, + "grad_norm": NaN, + "learning_rate": 0.00011333542276039715, + "loss": 0.0, + "step": 38048 + }, + { + "epoch": 3.550340580386302, + "grad_norm": NaN, + "learning_rate": 0.00011332808845628286, + "loss": 0.0, + "step": 38049 + }, + { + "epoch": 3.5504338900811794, + "grad_norm": NaN, + "learning_rate": 0.00011332075424541435, + "loss": 0.0, + "step": 38050 + }, + { + "epoch": 3.550527199776057, + "grad_norm": NaN, + "learning_rate": 0.00011331342012781037, + "loss": 0.0, + "step": 38051 + }, + { + "epoch": 3.550620509470934, + "grad_norm": NaN, + "learning_rate": 0.00011330608610348944, + "loss": 0.0, + "step": 38052 + }, + { + "epoch": 3.550713819165811, + "grad_norm": NaN, + "learning_rate": 0.00011329875217247026, + "loss": 0.0, + "step": 38053 + }, + { + "epoch": 3.5508071288606886, + "grad_norm": NaN, + "learning_rate": 0.00011329141833477154, + "loss": 0.0, + "step": 38054 + }, + { + "epoch": 3.550900438555566, + "grad_norm": NaN, + "learning_rate": 0.00011328408459041182, + "loss": 0.0, + "step": 38055 + }, + { + "epoch": 3.550993748250443, + "grad_norm": NaN, + "learning_rate": 0.00011327675093940978, + "loss": 0.0, + "step": 38056 + }, + { + "epoch": 3.5510870579453204, + "grad_norm": NaN, + "learning_rate": 0.00011326941738178415, + "loss": 0.0, + "step": 38057 + }, + { + "epoch": 3.551180367640198, + "grad_norm": NaN, + "learning_rate": 0.00011326208391755345, + "loss": 0.0, + "step": 38058 + }, + { + "epoch": 3.5512736773350753, + "grad_norm": NaN, + "learning_rate": 0.0001132547505467364, + "loss": 0.0, + "step": 38059 + }, + { + "epoch": 3.5513669870299527, + "grad_norm": NaN, + "learning_rate": 0.00011324741726935168, + "loss": 0.0, + "step": 38060 + }, + { + "epoch": 3.5514602967248297, + "grad_norm": NaN, + "learning_rate": 0.00011324008408541782, + "loss": 0.0, + "step": 38061 + }, + { + "epoch": 3.551553606419707, + "grad_norm": NaN, + "learning_rate": 0.00011323275099495358, + "loss": 0.0, + "step": 38062 + }, + { + "epoch": 3.551646916114584, + "grad_norm": NaN, + "learning_rate": 0.00011322541799797758, + "loss": 0.0, + "step": 38063 + }, + { + "epoch": 3.5517402258094615, + "grad_norm": NaN, + "learning_rate": 0.0001132180850945084, + "loss": 0.0, + "step": 38064 + }, + { + "epoch": 3.551833535504339, + "grad_norm": NaN, + "learning_rate": 0.00011321075228456476, + "loss": 0.0, + "step": 38065 + }, + { + "epoch": 3.5519268451992163, + "grad_norm": NaN, + "learning_rate": 0.0001132034195681653, + "loss": 0.0, + "step": 38066 + }, + { + "epoch": 3.5520201548940937, + "grad_norm": NaN, + "learning_rate": 0.00011319608694532859, + "loss": 0.0, + "step": 38067 + }, + { + "epoch": 3.5521134645889707, + "grad_norm": NaN, + "learning_rate": 0.0001131887544160734, + "loss": 0.0, + "step": 38068 + }, + { + "epoch": 3.552206774283848, + "grad_norm": NaN, + "learning_rate": 0.00011318142198041826, + "loss": 0.0, + "step": 38069 + }, + { + "epoch": 3.5523000839787255, + "grad_norm": NaN, + "learning_rate": 0.00011317408963838182, + "loss": 0.0, + "step": 38070 + }, + { + "epoch": 3.5523933936736025, + "grad_norm": NaN, + "learning_rate": 0.00011316675738998285, + "loss": 0.0, + "step": 38071 + }, + { + "epoch": 3.55248670336848, + "grad_norm": NaN, + "learning_rate": 0.00011315942523523986, + "loss": 0.0, + "step": 38072 + }, + { + "epoch": 3.5525800130633574, + "grad_norm": NaN, + "learning_rate": 0.00011315209317417151, + "loss": 0.0, + "step": 38073 + }, + { + "epoch": 3.5526733227582348, + "grad_norm": NaN, + "learning_rate": 0.00011314476120679653, + "loss": 0.0, + "step": 38074 + }, + { + "epoch": 3.5527666324531118, + "grad_norm": NaN, + "learning_rate": 0.00011313742933313348, + "loss": 0.0, + "step": 38075 + }, + { + "epoch": 3.552859942147989, + "grad_norm": NaN, + "learning_rate": 0.00011313009755320097, + "loss": 0.0, + "step": 38076 + }, + { + "epoch": 3.5529532518428666, + "grad_norm": NaN, + "learning_rate": 0.00011312276586701779, + "loss": 0.0, + "step": 38077 + }, + { + "epoch": 3.5530465615377436, + "grad_norm": NaN, + "learning_rate": 0.00011311543427460246, + "loss": 0.0, + "step": 38078 + }, + { + "epoch": 3.553139871232621, + "grad_norm": NaN, + "learning_rate": 0.0001131081027759736, + "loss": 0.0, + "step": 38079 + }, + { + "epoch": 3.5532331809274984, + "grad_norm": NaN, + "learning_rate": 0.00011310077137115002, + "loss": 0.0, + "step": 38080 + }, + { + "epoch": 3.553326490622376, + "grad_norm": NaN, + "learning_rate": 0.00011309344006015017, + "loss": 0.0, + "step": 38081 + }, + { + "epoch": 3.553419800317253, + "grad_norm": NaN, + "learning_rate": 0.00011308610884299275, + "loss": 0.0, + "step": 38082 + }, + { + "epoch": 3.55351311001213, + "grad_norm": NaN, + "learning_rate": 0.0001130787777196965, + "loss": 0.0, + "step": 38083 + }, + { + "epoch": 3.5536064197070076, + "grad_norm": NaN, + "learning_rate": 0.00011307144669027991, + "loss": 0.0, + "step": 38084 + }, + { + "epoch": 3.5536997294018846, + "grad_norm": NaN, + "learning_rate": 0.00011306411575476169, + "loss": 0.0, + "step": 38085 + }, + { + "epoch": 3.553793039096762, + "grad_norm": NaN, + "learning_rate": 0.00011305678491316053, + "loss": 0.0, + "step": 38086 + }, + { + "epoch": 3.5538863487916394, + "grad_norm": NaN, + "learning_rate": 0.00011304945416549501, + "loss": 0.0, + "step": 38087 + }, + { + "epoch": 3.553979658486517, + "grad_norm": NaN, + "learning_rate": 0.00011304212351178375, + "loss": 0.0, + "step": 38088 + }, + { + "epoch": 3.5540729681813943, + "grad_norm": NaN, + "learning_rate": 0.00011303479295204549, + "loss": 0.0, + "step": 38089 + }, + { + "epoch": 3.5541662778762713, + "grad_norm": NaN, + "learning_rate": 0.00011302746248629874, + "loss": 0.0, + "step": 38090 + }, + { + "epoch": 3.5542595875711487, + "grad_norm": NaN, + "learning_rate": 0.0001130201321145622, + "loss": 0.0, + "step": 38091 + }, + { + "epoch": 3.554352897266026, + "grad_norm": NaN, + "learning_rate": 0.00011301280183685456, + "loss": 0.0, + "step": 38092 + }, + { + "epoch": 3.554446206960903, + "grad_norm": NaN, + "learning_rate": 0.00011300547165319437, + "loss": 0.0, + "step": 38093 + }, + { + "epoch": 3.5545395166557805, + "grad_norm": NaN, + "learning_rate": 0.00011299814156360028, + "loss": 0.0, + "step": 38094 + }, + { + "epoch": 3.554632826350658, + "grad_norm": NaN, + "learning_rate": 0.00011299081156809104, + "loss": 0.0, + "step": 38095 + }, + { + "epoch": 3.5547261360455353, + "grad_norm": NaN, + "learning_rate": 0.0001129834816666851, + "loss": 0.0, + "step": 38096 + }, + { + "epoch": 3.5548194457404123, + "grad_norm": NaN, + "learning_rate": 0.00011297615185940126, + "loss": 0.0, + "step": 38097 + }, + { + "epoch": 3.5549127554352897, + "grad_norm": NaN, + "learning_rate": 0.00011296882214625813, + "loss": 0.0, + "step": 38098 + }, + { + "epoch": 3.555006065130167, + "grad_norm": NaN, + "learning_rate": 0.00011296149252727425, + "loss": 0.0, + "step": 38099 + }, + { + "epoch": 3.555099374825044, + "grad_norm": NaN, + "learning_rate": 0.00011295416300246834, + "loss": 0.0, + "step": 38100 + }, + { + "epoch": 3.5551926845199215, + "grad_norm": NaN, + "learning_rate": 0.00011294683357185907, + "loss": 0.0, + "step": 38101 + }, + { + "epoch": 3.555285994214799, + "grad_norm": NaN, + "learning_rate": 0.00011293950423546496, + "loss": 0.0, + "step": 38102 + }, + { + "epoch": 3.5553793039096764, + "grad_norm": NaN, + "learning_rate": 0.00011293217499330474, + "loss": 0.0, + "step": 38103 + }, + { + "epoch": 3.5554726136045534, + "grad_norm": NaN, + "learning_rate": 0.00011292484584539704, + "loss": 0.0, + "step": 38104 + }, + { + "epoch": 3.5555659232994308, + "grad_norm": NaN, + "learning_rate": 0.00011291751679176043, + "loss": 0.0, + "step": 38105 + }, + { + "epoch": 3.555659232994308, + "grad_norm": NaN, + "learning_rate": 0.0001129101878324136, + "loss": 0.0, + "step": 38106 + }, + { + "epoch": 3.555752542689185, + "grad_norm": NaN, + "learning_rate": 0.00011290285896737523, + "loss": 0.0, + "step": 38107 + }, + { + "epoch": 3.5558458523840626, + "grad_norm": NaN, + "learning_rate": 0.00011289553019666382, + "loss": 0.0, + "step": 38108 + }, + { + "epoch": 3.55593916207894, + "grad_norm": NaN, + "learning_rate": 0.00011288820152029814, + "loss": 0.0, + "step": 38109 + }, + { + "epoch": 3.5560324717738174, + "grad_norm": NaN, + "learning_rate": 0.00011288087293829678, + "loss": 0.0, + "step": 38110 + }, + { + "epoch": 3.556125781468695, + "grad_norm": NaN, + "learning_rate": 0.00011287354445067831, + "loss": 0.0, + "step": 38111 + }, + { + "epoch": 3.556219091163572, + "grad_norm": NaN, + "learning_rate": 0.00011286621605746147, + "loss": 0.0, + "step": 38112 + }, + { + "epoch": 3.5563124008584492, + "grad_norm": NaN, + "learning_rate": 0.00011285888775866482, + "loss": 0.0, + "step": 38113 + }, + { + "epoch": 3.556405710553326, + "grad_norm": NaN, + "learning_rate": 0.00011285155955430698, + "loss": 0.0, + "step": 38114 + }, + { + "epoch": 3.5564990202482036, + "grad_norm": NaN, + "learning_rate": 0.0001128442314444067, + "loss": 0.0, + "step": 38115 + }, + { + "epoch": 3.556592329943081, + "grad_norm": NaN, + "learning_rate": 0.00011283690342898248, + "loss": 0.0, + "step": 38116 + }, + { + "epoch": 3.5566856396379585, + "grad_norm": NaN, + "learning_rate": 0.000112829575508053, + "loss": 0.0, + "step": 38117 + }, + { + "epoch": 3.556778949332836, + "grad_norm": NaN, + "learning_rate": 0.00011282224768163693, + "loss": 0.0, + "step": 38118 + }, + { + "epoch": 3.556872259027713, + "grad_norm": NaN, + "learning_rate": 0.00011281491994975287, + "loss": 0.0, + "step": 38119 + }, + { + "epoch": 3.5569655687225903, + "grad_norm": NaN, + "learning_rate": 0.00011280759231241941, + "loss": 0.0, + "step": 38120 + }, + { + "epoch": 3.5570588784174677, + "grad_norm": NaN, + "learning_rate": 0.00011280026476965529, + "loss": 0.0, + "step": 38121 + }, + { + "epoch": 3.5571521881123447, + "grad_norm": NaN, + "learning_rate": 0.00011279293732147903, + "loss": 0.0, + "step": 38122 + }, + { + "epoch": 3.557245497807222, + "grad_norm": NaN, + "learning_rate": 0.00011278560996790931, + "loss": 0.0, + "step": 38123 + }, + { + "epoch": 3.5573388075020995, + "grad_norm": NaN, + "learning_rate": 0.00011277828270896478, + "loss": 0.0, + "step": 38124 + }, + { + "epoch": 3.557432117196977, + "grad_norm": NaN, + "learning_rate": 0.00011277095554466406, + "loss": 0.0, + "step": 38125 + }, + { + "epoch": 3.557525426891854, + "grad_norm": NaN, + "learning_rate": 0.00011276362847502571, + "loss": 0.0, + "step": 38126 + }, + { + "epoch": 3.5576187365867313, + "grad_norm": NaN, + "learning_rate": 0.00011275630150006851, + "loss": 0.0, + "step": 38127 + }, + { + "epoch": 3.5577120462816088, + "grad_norm": NaN, + "learning_rate": 0.00011274897461981096, + "loss": 0.0, + "step": 38128 + }, + { + "epoch": 3.5578053559764857, + "grad_norm": NaN, + "learning_rate": 0.0001127416478342717, + "loss": 0.0, + "step": 38129 + }, + { + "epoch": 3.557898665671363, + "grad_norm": NaN, + "learning_rate": 0.00011273432114346946, + "loss": 0.0, + "step": 38130 + }, + { + "epoch": 3.5579919753662406, + "grad_norm": NaN, + "learning_rate": 0.00011272699454742277, + "loss": 0.0, + "step": 38131 + }, + { + "epoch": 3.558085285061118, + "grad_norm": NaN, + "learning_rate": 0.00011271966804615026, + "loss": 0.0, + "step": 38132 + }, + { + "epoch": 3.5581785947559954, + "grad_norm": NaN, + "learning_rate": 0.00011271234163967066, + "loss": 0.0, + "step": 38133 + }, + { + "epoch": 3.5582719044508724, + "grad_norm": NaN, + "learning_rate": 0.00011270501532800246, + "loss": 0.0, + "step": 38134 + }, + { + "epoch": 3.55836521414575, + "grad_norm": NaN, + "learning_rate": 0.00011269768911116438, + "loss": 0.0, + "step": 38135 + }, + { + "epoch": 3.5584585238406268, + "grad_norm": NaN, + "learning_rate": 0.00011269036298917507, + "loss": 0.0, + "step": 38136 + }, + { + "epoch": 3.558551833535504, + "grad_norm": NaN, + "learning_rate": 0.00011268303696205304, + "loss": 0.0, + "step": 38137 + }, + { + "epoch": 3.5586451432303816, + "grad_norm": NaN, + "learning_rate": 0.00011267571102981704, + "loss": 0.0, + "step": 38138 + }, + { + "epoch": 3.558738452925259, + "grad_norm": NaN, + "learning_rate": 0.00011266838519248568, + "loss": 0.0, + "step": 38139 + }, + { + "epoch": 3.5588317626201365, + "grad_norm": NaN, + "learning_rate": 0.0001126610594500775, + "loss": 0.0, + "step": 38140 + }, + { + "epoch": 3.5589250723150134, + "grad_norm": NaN, + "learning_rate": 0.0001126537338026112, + "loss": 0.0, + "step": 38141 + }, + { + "epoch": 3.559018382009891, + "grad_norm": NaN, + "learning_rate": 0.00011264640825010544, + "loss": 0.0, + "step": 38142 + }, + { + "epoch": 3.5591116917047683, + "grad_norm": NaN, + "learning_rate": 0.00011263908279257873, + "loss": 0.0, + "step": 38143 + }, + { + "epoch": 3.5592050013996452, + "grad_norm": NaN, + "learning_rate": 0.00011263175743004978, + "loss": 0.0, + "step": 38144 + }, + { + "epoch": 3.5592983110945227, + "grad_norm": NaN, + "learning_rate": 0.00011262443216253725, + "loss": 0.0, + "step": 38145 + }, + { + "epoch": 3.5593916207894, + "grad_norm": NaN, + "learning_rate": 0.00011261710699005966, + "loss": 0.0, + "step": 38146 + }, + { + "epoch": 3.5594849304842775, + "grad_norm": NaN, + "learning_rate": 0.0001126097819126357, + "loss": 0.0, + "step": 38147 + }, + { + "epoch": 3.5595782401791545, + "grad_norm": NaN, + "learning_rate": 0.00011260245693028405, + "loss": 0.0, + "step": 38148 + }, + { + "epoch": 3.559671549874032, + "grad_norm": NaN, + "learning_rate": 0.00011259513204302319, + "loss": 0.0, + "step": 38149 + }, + { + "epoch": 3.5597648595689093, + "grad_norm": NaN, + "learning_rate": 0.00011258780725087187, + "loss": 0.0, + "step": 38150 + }, + { + "epoch": 3.5598581692637863, + "grad_norm": NaN, + "learning_rate": 0.00011258048255384871, + "loss": 0.0, + "step": 38151 + }, + { + "epoch": 3.5599514789586637, + "grad_norm": NaN, + "learning_rate": 0.00011257315795197224, + "loss": 0.0, + "step": 38152 + }, + { + "epoch": 3.560044788653541, + "grad_norm": NaN, + "learning_rate": 0.00011256583344526116, + "loss": 0.0, + "step": 38153 + }, + { + "epoch": 3.5601380983484185, + "grad_norm": NaN, + "learning_rate": 0.0001125585090337341, + "loss": 0.0, + "step": 38154 + }, + { + "epoch": 3.560231408043296, + "grad_norm": NaN, + "learning_rate": 0.00011255118471740962, + "loss": 0.0, + "step": 38155 + }, + { + "epoch": 3.560324717738173, + "grad_norm": NaN, + "learning_rate": 0.00011254386049630644, + "loss": 0.0, + "step": 38156 + }, + { + "epoch": 3.5604180274330504, + "grad_norm": NaN, + "learning_rate": 0.00011253653637044309, + "loss": 0.0, + "step": 38157 + }, + { + "epoch": 3.5605113371279273, + "grad_norm": NaN, + "learning_rate": 0.0001125292123398382, + "loss": 0.0, + "step": 38158 + }, + { + "epoch": 3.5606046468228048, + "grad_norm": NaN, + "learning_rate": 0.00011252188840451048, + "loss": 0.0, + "step": 38159 + }, + { + "epoch": 3.560697956517682, + "grad_norm": NaN, + "learning_rate": 0.00011251456456447849, + "loss": 0.0, + "step": 38160 + }, + { + "epoch": 3.5607912662125596, + "grad_norm": NaN, + "learning_rate": 0.0001125072408197608, + "loss": 0.0, + "step": 38161 + }, + { + "epoch": 3.560884575907437, + "grad_norm": NaN, + "learning_rate": 0.00011249991717037617, + "loss": 0.0, + "step": 38162 + }, + { + "epoch": 3.560977885602314, + "grad_norm": NaN, + "learning_rate": 0.00011249259361634309, + "loss": 0.0, + "step": 38163 + }, + { + "epoch": 3.5610711952971914, + "grad_norm": NaN, + "learning_rate": 0.00011248527015768022, + "loss": 0.0, + "step": 38164 + }, + { + "epoch": 3.561164504992069, + "grad_norm": NaN, + "learning_rate": 0.00011247794679440625, + "loss": 0.0, + "step": 38165 + }, + { + "epoch": 3.561257814686946, + "grad_norm": NaN, + "learning_rate": 0.00011247062352653973, + "loss": 0.0, + "step": 38166 + }, + { + "epoch": 3.561351124381823, + "grad_norm": NaN, + "learning_rate": 0.00011246330035409924, + "loss": 0.0, + "step": 38167 + }, + { + "epoch": 3.5614444340767006, + "grad_norm": NaN, + "learning_rate": 0.00011245597727710354, + "loss": 0.0, + "step": 38168 + }, + { + "epoch": 3.561537743771578, + "grad_norm": NaN, + "learning_rate": 0.00011244865429557114, + "loss": 0.0, + "step": 38169 + }, + { + "epoch": 3.561631053466455, + "grad_norm": NaN, + "learning_rate": 0.00011244133140952067, + "loss": 0.0, + "step": 38170 + }, + { + "epoch": 3.5617243631613325, + "grad_norm": NaN, + "learning_rate": 0.0001124340086189708, + "loss": 0.0, + "step": 38171 + }, + { + "epoch": 3.56181767285621, + "grad_norm": NaN, + "learning_rate": 0.00011242668592394005, + "loss": 0.0, + "step": 38172 + }, + { + "epoch": 3.561910982551087, + "grad_norm": NaN, + "learning_rate": 0.00011241936332444716, + "loss": 0.0, + "step": 38173 + }, + { + "epoch": 3.5620042922459643, + "grad_norm": NaN, + "learning_rate": 0.00011241204082051073, + "loss": 0.0, + "step": 38174 + }, + { + "epoch": 3.5620976019408417, + "grad_norm": NaN, + "learning_rate": 0.00011240471841214929, + "loss": 0.0, + "step": 38175 + }, + { + "epoch": 3.562190911635719, + "grad_norm": NaN, + "learning_rate": 0.00011239739609938151, + "loss": 0.0, + "step": 38176 + }, + { + "epoch": 3.5622842213305965, + "grad_norm": NaN, + "learning_rate": 0.00011239007388222607, + "loss": 0.0, + "step": 38177 + }, + { + "epoch": 3.5623775310254735, + "grad_norm": NaN, + "learning_rate": 0.00011238275176070147, + "loss": 0.0, + "step": 38178 + }, + { + "epoch": 3.562470840720351, + "grad_norm": NaN, + "learning_rate": 0.00011237542973482643, + "loss": 0.0, + "step": 38179 + }, + { + "epoch": 3.562564150415228, + "grad_norm": NaN, + "learning_rate": 0.00011236810780461954, + "loss": 0.0, + "step": 38180 + }, + { + "epoch": 3.5626574601101053, + "grad_norm": NaN, + "learning_rate": 0.00011236078597009935, + "loss": 0.0, + "step": 38181 + }, + { + "epoch": 3.5627507698049827, + "grad_norm": NaN, + "learning_rate": 0.00011235346423128456, + "loss": 0.0, + "step": 38182 + }, + { + "epoch": 3.56284407949986, + "grad_norm": NaN, + "learning_rate": 0.00011234614258819379, + "loss": 0.0, + "step": 38183 + }, + { + "epoch": 3.5629373891947376, + "grad_norm": NaN, + "learning_rate": 0.00011233882104084556, + "loss": 0.0, + "step": 38184 + }, + { + "epoch": 3.5630306988896145, + "grad_norm": NaN, + "learning_rate": 0.00011233149958925857, + "loss": 0.0, + "step": 38185 + }, + { + "epoch": 3.563124008584492, + "grad_norm": NaN, + "learning_rate": 0.00011232417823345147, + "loss": 0.0, + "step": 38186 + }, + { + "epoch": 3.5632173182793694, + "grad_norm": NaN, + "learning_rate": 0.00011231685697344276, + "loss": 0.0, + "step": 38187 + }, + { + "epoch": 3.5633106279742464, + "grad_norm": NaN, + "learning_rate": 0.00011230953580925116, + "loss": 0.0, + "step": 38188 + }, + { + "epoch": 3.5634039376691238, + "grad_norm": NaN, + "learning_rate": 0.00011230221474089526, + "loss": 0.0, + "step": 38189 + }, + { + "epoch": 3.563497247364001, + "grad_norm": NaN, + "learning_rate": 0.0001122948937683936, + "loss": 0.0, + "step": 38190 + }, + { + "epoch": 3.5635905570588786, + "grad_norm": NaN, + "learning_rate": 0.00011228757289176489, + "loss": 0.0, + "step": 38191 + }, + { + "epoch": 3.5636838667537556, + "grad_norm": NaN, + "learning_rate": 0.00011228025211102775, + "loss": 0.0, + "step": 38192 + }, + { + "epoch": 3.563777176448633, + "grad_norm": NaN, + "learning_rate": 0.00011227293142620069, + "loss": 0.0, + "step": 38193 + }, + { + "epoch": 3.5638704861435104, + "grad_norm": NaN, + "learning_rate": 0.00011226561083730242, + "loss": 0.0, + "step": 38194 + }, + { + "epoch": 3.5639637958383874, + "grad_norm": NaN, + "learning_rate": 0.00011225829034435154, + "loss": 0.0, + "step": 38195 + }, + { + "epoch": 3.564057105533265, + "grad_norm": NaN, + "learning_rate": 0.00011225096994736661, + "loss": 0.0, + "step": 38196 + }, + { + "epoch": 3.5641504152281422, + "grad_norm": NaN, + "learning_rate": 0.00011224364964636631, + "loss": 0.0, + "step": 38197 + }, + { + "epoch": 3.5642437249230197, + "grad_norm": NaN, + "learning_rate": 0.00011223632944136925, + "loss": 0.0, + "step": 38198 + }, + { + "epoch": 3.5643370346178966, + "grad_norm": NaN, + "learning_rate": 0.00011222900933239393, + "loss": 0.0, + "step": 38199 + }, + { + "epoch": 3.564430344312774, + "grad_norm": NaN, + "learning_rate": 0.0001122216893194591, + "loss": 0.0, + "step": 38200 + }, + { + "epoch": 3.5645236540076515, + "grad_norm": NaN, + "learning_rate": 0.00011221436940258337, + "loss": 0.0, + "step": 38201 + }, + { + "epoch": 3.5646169637025285, + "grad_norm": NaN, + "learning_rate": 0.00011220704958178523, + "loss": 0.0, + "step": 38202 + }, + { + "epoch": 3.564710273397406, + "grad_norm": NaN, + "learning_rate": 0.00011219972985708344, + "loss": 0.0, + "step": 38203 + }, + { + "epoch": 3.5648035830922833, + "grad_norm": NaN, + "learning_rate": 0.00011219241022849649, + "loss": 0.0, + "step": 38204 + }, + { + "epoch": 3.5648968927871607, + "grad_norm": NaN, + "learning_rate": 0.00011218509069604304, + "loss": 0.0, + "step": 38205 + }, + { + "epoch": 3.564990202482038, + "grad_norm": NaN, + "learning_rate": 0.00011217777125974175, + "loss": 0.0, + "step": 38206 + }, + { + "epoch": 3.565083512176915, + "grad_norm": NaN, + "learning_rate": 0.00011217045191961113, + "loss": 0.0, + "step": 38207 + }, + { + "epoch": 3.5651768218717925, + "grad_norm": NaN, + "learning_rate": 0.00011216313267566982, + "loss": 0.0, + "step": 38208 + }, + { + "epoch": 3.56527013156667, + "grad_norm": NaN, + "learning_rate": 0.00011215581352793654, + "loss": 0.0, + "step": 38209 + }, + { + "epoch": 3.565363441261547, + "grad_norm": NaN, + "learning_rate": 0.00011214849447642976, + "loss": 0.0, + "step": 38210 + }, + { + "epoch": 3.5654567509564243, + "grad_norm": NaN, + "learning_rate": 0.00011214117552116814, + "loss": 0.0, + "step": 38211 + }, + { + "epoch": 3.5655500606513018, + "grad_norm": NaN, + "learning_rate": 0.00011213385666217035, + "loss": 0.0, + "step": 38212 + }, + { + "epoch": 3.565643370346179, + "grad_norm": NaN, + "learning_rate": 0.00011212653789945488, + "loss": 0.0, + "step": 38213 + }, + { + "epoch": 3.565736680041056, + "grad_norm": NaN, + "learning_rate": 0.00011211921923304044, + "loss": 0.0, + "step": 38214 + }, + { + "epoch": 3.5658299897359336, + "grad_norm": NaN, + "learning_rate": 0.00011211190066294563, + "loss": 0.0, + "step": 38215 + }, + { + "epoch": 3.565923299430811, + "grad_norm": NaN, + "learning_rate": 0.00011210458218918898, + "loss": 0.0, + "step": 38216 + }, + { + "epoch": 3.566016609125688, + "grad_norm": NaN, + "learning_rate": 0.00011209726381178918, + "loss": 0.0, + "step": 38217 + }, + { + "epoch": 3.5661099188205654, + "grad_norm": NaN, + "learning_rate": 0.00011208994553076484, + "loss": 0.0, + "step": 38218 + }, + { + "epoch": 3.566203228515443, + "grad_norm": NaN, + "learning_rate": 0.00011208262734613447, + "loss": 0.0, + "step": 38219 + }, + { + "epoch": 3.56629653821032, + "grad_norm": NaN, + "learning_rate": 0.00011207530925791677, + "loss": 0.0, + "step": 38220 + }, + { + "epoch": 3.566389847905197, + "grad_norm": NaN, + "learning_rate": 0.0001120679912661304, + "loss": 0.0, + "step": 38221 + }, + { + "epoch": 3.5664831576000746, + "grad_norm": NaN, + "learning_rate": 0.0001120606733707938, + "loss": 0.0, + "step": 38222 + }, + { + "epoch": 3.566576467294952, + "grad_norm": NaN, + "learning_rate": 0.0001120533555719257, + "loss": 0.0, + "step": 38223 + }, + { + "epoch": 3.566669776989829, + "grad_norm": NaN, + "learning_rate": 0.0001120460378695447, + "loss": 0.0, + "step": 38224 + }, + { + "epoch": 3.5667630866847064, + "grad_norm": NaN, + "learning_rate": 0.00011203872026366932, + "loss": 0.0, + "step": 38225 + }, + { + "epoch": 3.566856396379584, + "grad_norm": NaN, + "learning_rate": 0.00011203140275431828, + "loss": 0.0, + "step": 38226 + }, + { + "epoch": 3.5669497060744613, + "grad_norm": NaN, + "learning_rate": 0.00011202408534151014, + "loss": 0.0, + "step": 38227 + }, + { + "epoch": 3.5670430157693387, + "grad_norm": NaN, + "learning_rate": 0.00011201676802526346, + "loss": 0.0, + "step": 38228 + }, + { + "epoch": 3.5671363254642157, + "grad_norm": NaN, + "learning_rate": 0.0001120094508055969, + "loss": 0.0, + "step": 38229 + }, + { + "epoch": 3.567229635159093, + "grad_norm": NaN, + "learning_rate": 0.0001120021336825291, + "loss": 0.0, + "step": 38230 + }, + { + "epoch": 3.56732294485397, + "grad_norm": NaN, + "learning_rate": 0.00011199481665607856, + "loss": 0.0, + "step": 38231 + }, + { + "epoch": 3.5674162545488475, + "grad_norm": NaN, + "learning_rate": 0.00011198749972626396, + "loss": 0.0, + "step": 38232 + }, + { + "epoch": 3.567509564243725, + "grad_norm": NaN, + "learning_rate": 0.00011198018289310394, + "loss": 0.0, + "step": 38233 + }, + { + "epoch": 3.5676028739386023, + "grad_norm": NaN, + "learning_rate": 0.00011197286615661696, + "loss": 0.0, + "step": 38234 + }, + { + "epoch": 3.5676961836334797, + "grad_norm": NaN, + "learning_rate": 0.00011196554951682177, + "loss": 0.0, + "step": 38235 + }, + { + "epoch": 3.5677894933283567, + "grad_norm": NaN, + "learning_rate": 0.00011195823297373694, + "loss": 0.0, + "step": 38236 + }, + { + "epoch": 3.567882803023234, + "grad_norm": NaN, + "learning_rate": 0.000111950916527381, + "loss": 0.0, + "step": 38237 + }, + { + "epoch": 3.5679761127181115, + "grad_norm": NaN, + "learning_rate": 0.00011194360017777266, + "loss": 0.0, + "step": 38238 + }, + { + "epoch": 3.5680694224129885, + "grad_norm": NaN, + "learning_rate": 0.00011193628392493046, + "loss": 0.0, + "step": 38239 + }, + { + "epoch": 3.568162732107866, + "grad_norm": NaN, + "learning_rate": 0.00011192896776887298, + "loss": 0.0, + "step": 38240 + }, + { + "epoch": 3.5682560418027434, + "grad_norm": NaN, + "learning_rate": 0.00011192165170961888, + "loss": 0.0, + "step": 38241 + }, + { + "epoch": 3.568349351497621, + "grad_norm": NaN, + "learning_rate": 0.00011191433574718677, + "loss": 0.0, + "step": 38242 + }, + { + "epoch": 3.5684426611924978, + "grad_norm": NaN, + "learning_rate": 0.00011190701988159517, + "loss": 0.0, + "step": 38243 + }, + { + "epoch": 3.568535970887375, + "grad_norm": NaN, + "learning_rate": 0.00011189970411286274, + "loss": 0.0, + "step": 38244 + }, + { + "epoch": 3.5686292805822526, + "grad_norm": NaN, + "learning_rate": 0.00011189238844100811, + "loss": 0.0, + "step": 38245 + }, + { + "epoch": 3.5687225902771296, + "grad_norm": NaN, + "learning_rate": 0.00011188507286604984, + "loss": 0.0, + "step": 38246 + }, + { + "epoch": 3.568815899972007, + "grad_norm": NaN, + "learning_rate": 0.00011187775738800655, + "loss": 0.0, + "step": 38247 + }, + { + "epoch": 3.5689092096668844, + "grad_norm": NaN, + "learning_rate": 0.0001118704420068968, + "loss": 0.0, + "step": 38248 + }, + { + "epoch": 3.569002519361762, + "grad_norm": NaN, + "learning_rate": 0.00011186312672273922, + "loss": 0.0, + "step": 38249 + }, + { + "epoch": 3.5690958290566392, + "grad_norm": NaN, + "learning_rate": 0.00011185581153555246, + "loss": 0.0, + "step": 38250 + }, + { + "epoch": 3.569189138751516, + "grad_norm": NaN, + "learning_rate": 0.000111848496445355, + "loss": 0.0, + "step": 38251 + }, + { + "epoch": 3.5692824484463936, + "grad_norm": NaN, + "learning_rate": 0.00011184118145216556, + "loss": 0.0, + "step": 38252 + }, + { + "epoch": 3.5693757581412706, + "grad_norm": NaN, + "learning_rate": 0.0001118338665560027, + "loss": 0.0, + "step": 38253 + }, + { + "epoch": 3.569469067836148, + "grad_norm": NaN, + "learning_rate": 0.00011182655175688498, + "loss": 0.0, + "step": 38254 + }, + { + "epoch": 3.5695623775310255, + "grad_norm": NaN, + "learning_rate": 0.00011181923705483104, + "loss": 0.0, + "step": 38255 + }, + { + "epoch": 3.569655687225903, + "grad_norm": NaN, + "learning_rate": 0.0001118119224498595, + "loss": 0.0, + "step": 38256 + }, + { + "epoch": 3.5697489969207803, + "grad_norm": NaN, + "learning_rate": 0.00011180460794198888, + "loss": 0.0, + "step": 38257 + }, + { + "epoch": 3.5698423066156573, + "grad_norm": NaN, + "learning_rate": 0.00011179729353123786, + "loss": 0.0, + "step": 38258 + }, + { + "epoch": 3.5699356163105347, + "grad_norm": NaN, + "learning_rate": 0.00011178997921762502, + "loss": 0.0, + "step": 38259 + }, + { + "epoch": 3.570028926005412, + "grad_norm": NaN, + "learning_rate": 0.00011178266500116889, + "loss": 0.0, + "step": 38260 + }, + { + "epoch": 3.570122235700289, + "grad_norm": NaN, + "learning_rate": 0.00011177535088188815, + "loss": 0.0, + "step": 38261 + }, + { + "epoch": 3.5702155453951665, + "grad_norm": NaN, + "learning_rate": 0.0001117680368598014, + "loss": 0.0, + "step": 38262 + }, + { + "epoch": 3.570308855090044, + "grad_norm": NaN, + "learning_rate": 0.00011176072293492716, + "loss": 0.0, + "step": 38263 + }, + { + "epoch": 3.5704021647849213, + "grad_norm": NaN, + "learning_rate": 0.00011175340910728411, + "loss": 0.0, + "step": 38264 + }, + { + "epoch": 3.5704954744797983, + "grad_norm": NaN, + "learning_rate": 0.00011174609537689082, + "loss": 0.0, + "step": 38265 + }, + { + "epoch": 3.5705887841746757, + "grad_norm": NaN, + "learning_rate": 0.00011173878174376581, + "loss": 0.0, + "step": 38266 + }, + { + "epoch": 3.570682093869553, + "grad_norm": NaN, + "learning_rate": 0.0001117314682079278, + "loss": 0.0, + "step": 38267 + }, + { + "epoch": 3.57077540356443, + "grad_norm": NaN, + "learning_rate": 0.00011172415476939534, + "loss": 0.0, + "step": 38268 + }, + { + "epoch": 3.5708687132593075, + "grad_norm": NaN, + "learning_rate": 0.00011171684142818697, + "loss": 0.0, + "step": 38269 + }, + { + "epoch": 3.570962022954185, + "grad_norm": NaN, + "learning_rate": 0.00011170952818432135, + "loss": 0.0, + "step": 38270 + }, + { + "epoch": 3.5710553326490624, + "grad_norm": NaN, + "learning_rate": 0.00011170221503781711, + "loss": 0.0, + "step": 38271 + }, + { + "epoch": 3.57114864234394, + "grad_norm": NaN, + "learning_rate": 0.00011169490198869271, + "loss": 0.0, + "step": 38272 + }, + { + "epoch": 3.571241952038817, + "grad_norm": NaN, + "learning_rate": 0.00011168758903696686, + "loss": 0.0, + "step": 38273 + }, + { + "epoch": 3.571335261733694, + "grad_norm": NaN, + "learning_rate": 0.00011168027618265815, + "loss": 0.0, + "step": 38274 + }, + { + "epoch": 3.571428571428571, + "grad_norm": NaN, + "learning_rate": 0.00011167296342578511, + "loss": 0.0, + "step": 38275 + }, + { + "epoch": 3.5715218811234486, + "grad_norm": NaN, + "learning_rate": 0.00011166565076636638, + "loss": 0.0, + "step": 38276 + }, + { + "epoch": 3.571615190818326, + "grad_norm": NaN, + "learning_rate": 0.00011165833820442057, + "loss": 0.0, + "step": 38277 + }, + { + "epoch": 3.5717085005132034, + "grad_norm": NaN, + "learning_rate": 0.00011165102573996622, + "loss": 0.0, + "step": 38278 + }, + { + "epoch": 3.571801810208081, + "grad_norm": NaN, + "learning_rate": 0.00011164371337302191, + "loss": 0.0, + "step": 38279 + }, + { + "epoch": 3.571895119902958, + "grad_norm": NaN, + "learning_rate": 0.00011163640110360638, + "loss": 0.0, + "step": 38280 + }, + { + "epoch": 3.5719884295978352, + "grad_norm": NaN, + "learning_rate": 0.00011162908893173804, + "loss": 0.0, + "step": 38281 + }, + { + "epoch": 3.5720817392927127, + "grad_norm": NaN, + "learning_rate": 0.00011162177685743554, + "loss": 0.0, + "step": 38282 + }, + { + "epoch": 3.5721750489875896, + "grad_norm": NaN, + "learning_rate": 0.00011161446488071757, + "loss": 0.0, + "step": 38283 + }, + { + "epoch": 3.572268358682467, + "grad_norm": NaN, + "learning_rate": 0.00011160715300160263, + "loss": 0.0, + "step": 38284 + }, + { + "epoch": 3.5723616683773445, + "grad_norm": NaN, + "learning_rate": 0.00011159984122010925, + "loss": 0.0, + "step": 38285 + }, + { + "epoch": 3.572454978072222, + "grad_norm": NaN, + "learning_rate": 0.00011159252953625619, + "loss": 0.0, + "step": 38286 + }, + { + "epoch": 3.572548287767099, + "grad_norm": NaN, + "learning_rate": 0.00011158521795006193, + "loss": 0.0, + "step": 38287 + }, + { + "epoch": 3.5726415974619763, + "grad_norm": NaN, + "learning_rate": 0.00011157790646154503, + "loss": 0.0, + "step": 38288 + }, + { + "epoch": 3.5727349071568537, + "grad_norm": NaN, + "learning_rate": 0.00011157059507072419, + "loss": 0.0, + "step": 38289 + }, + { + "epoch": 3.5728282168517307, + "grad_norm": NaN, + "learning_rate": 0.00011156328377761793, + "loss": 0.0, + "step": 38290 + }, + { + "epoch": 3.572921526546608, + "grad_norm": NaN, + "learning_rate": 0.00011155597258224488, + "loss": 0.0, + "step": 38291 + }, + { + "epoch": 3.5730148362414855, + "grad_norm": NaN, + "learning_rate": 0.00011154866148462354, + "loss": 0.0, + "step": 38292 + }, + { + "epoch": 3.573108145936363, + "grad_norm": NaN, + "learning_rate": 0.0001115413504847726, + "loss": 0.0, + "step": 38293 + }, + { + "epoch": 3.57320145563124, + "grad_norm": NaN, + "learning_rate": 0.00011153403958271064, + "loss": 0.0, + "step": 38294 + }, + { + "epoch": 3.5732947653261173, + "grad_norm": NaN, + "learning_rate": 0.00011152672877845615, + "loss": 0.0, + "step": 38295 + }, + { + "epoch": 3.5733880750209948, + "grad_norm": NaN, + "learning_rate": 0.00011151941807202784, + "loss": 0.0, + "step": 38296 + }, + { + "epoch": 3.5734813847158717, + "grad_norm": NaN, + "learning_rate": 0.00011151210746344427, + "loss": 0.0, + "step": 38297 + }, + { + "epoch": 3.573574694410749, + "grad_norm": NaN, + "learning_rate": 0.00011150479695272396, + "loss": 0.0, + "step": 38298 + }, + { + "epoch": 3.5736680041056266, + "grad_norm": NaN, + "learning_rate": 0.00011149748653988556, + "loss": 0.0, + "step": 38299 + }, + { + "epoch": 3.573761313800504, + "grad_norm": NaN, + "learning_rate": 0.0001114901762249477, + "loss": 0.0, + "step": 38300 + }, + { + "epoch": 3.5738546234953814, + "grad_norm": NaN, + "learning_rate": 0.00011148286600792883, + "loss": 0.0, + "step": 38301 + }, + { + "epoch": 3.5739479331902584, + "grad_norm": NaN, + "learning_rate": 0.00011147555588884767, + "loss": 0.0, + "step": 38302 + }, + { + "epoch": 3.574041242885136, + "grad_norm": NaN, + "learning_rate": 0.00011146824586772278, + "loss": 0.0, + "step": 38303 + }, + { + "epoch": 3.5741345525800132, + "grad_norm": NaN, + "learning_rate": 0.00011146093594457268, + "loss": 0.0, + "step": 38304 + }, + { + "epoch": 3.57422786227489, + "grad_norm": NaN, + "learning_rate": 0.00011145362611941602, + "loss": 0.0, + "step": 38305 + }, + { + "epoch": 3.5743211719697676, + "grad_norm": NaN, + "learning_rate": 0.0001114463163922714, + "loss": 0.0, + "step": 38306 + }, + { + "epoch": 3.574414481664645, + "grad_norm": NaN, + "learning_rate": 0.00011143900676315734, + "loss": 0.0, + "step": 38307 + }, + { + "epoch": 3.5745077913595225, + "grad_norm": NaN, + "learning_rate": 0.00011143169723209245, + "loss": 0.0, + "step": 38308 + }, + { + "epoch": 3.5746011010543994, + "grad_norm": NaN, + "learning_rate": 0.0001114243877990954, + "loss": 0.0, + "step": 38309 + }, + { + "epoch": 3.574694410749277, + "grad_norm": NaN, + "learning_rate": 0.00011141707846418463, + "loss": 0.0, + "step": 38310 + }, + { + "epoch": 3.5747877204441543, + "grad_norm": NaN, + "learning_rate": 0.00011140976922737883, + "loss": 0.0, + "step": 38311 + }, + { + "epoch": 3.5748810301390312, + "grad_norm": NaN, + "learning_rate": 0.00011140246008869658, + "loss": 0.0, + "step": 38312 + }, + { + "epoch": 3.5749743398339087, + "grad_norm": NaN, + "learning_rate": 0.00011139515104815639, + "loss": 0.0, + "step": 38313 + }, + { + "epoch": 3.575067649528786, + "grad_norm": NaN, + "learning_rate": 0.00011138784210577692, + "loss": 0.0, + "step": 38314 + }, + { + "epoch": 3.5751609592236635, + "grad_norm": NaN, + "learning_rate": 0.00011138053326157676, + "loss": 0.0, + "step": 38315 + }, + { + "epoch": 3.5752542689185405, + "grad_norm": NaN, + "learning_rate": 0.0001113732245155744, + "loss": 0.0, + "step": 38316 + }, + { + "epoch": 3.575347578613418, + "grad_norm": NaN, + "learning_rate": 0.00011136591586778848, + "loss": 0.0, + "step": 38317 + }, + { + "epoch": 3.5754408883082953, + "grad_norm": NaN, + "learning_rate": 0.00011135860731823772, + "loss": 0.0, + "step": 38318 + }, + { + "epoch": 3.5755341980031723, + "grad_norm": NaN, + "learning_rate": 0.00011135129886694046, + "loss": 0.0, + "step": 38319 + }, + { + "epoch": 3.5756275076980497, + "grad_norm": NaN, + "learning_rate": 0.00011134399051391539, + "loss": 0.0, + "step": 38320 + }, + { + "epoch": 3.575720817392927, + "grad_norm": NaN, + "learning_rate": 0.00011133668225918118, + "loss": 0.0, + "step": 38321 + }, + { + "epoch": 3.5758141270878046, + "grad_norm": NaN, + "learning_rate": 0.00011132937410275629, + "loss": 0.0, + "step": 38322 + }, + { + "epoch": 3.575907436782682, + "grad_norm": NaN, + "learning_rate": 0.00011132206604465931, + "loss": 0.0, + "step": 38323 + }, + { + "epoch": 3.576000746477559, + "grad_norm": NaN, + "learning_rate": 0.00011131475808490893, + "loss": 0.0, + "step": 38324 + }, + { + "epoch": 3.5760940561724364, + "grad_norm": NaN, + "learning_rate": 0.00011130745022352363, + "loss": 0.0, + "step": 38325 + }, + { + "epoch": 3.5761873658673133, + "grad_norm": NaN, + "learning_rate": 0.000111300142460522, + "loss": 0.0, + "step": 38326 + }, + { + "epoch": 3.5762806755621908, + "grad_norm": NaN, + "learning_rate": 0.00011129283479592269, + "loss": 0.0, + "step": 38327 + }, + { + "epoch": 3.576373985257068, + "grad_norm": NaN, + "learning_rate": 0.00011128552722974423, + "loss": 0.0, + "step": 38328 + }, + { + "epoch": 3.5764672949519456, + "grad_norm": NaN, + "learning_rate": 0.00011127821976200514, + "loss": 0.0, + "step": 38329 + }, + { + "epoch": 3.576560604646823, + "grad_norm": NaN, + "learning_rate": 0.00011127091239272415, + "loss": 0.0, + "step": 38330 + }, + { + "epoch": 3.5766539143417, + "grad_norm": NaN, + "learning_rate": 0.00011126360512191972, + "loss": 0.0, + "step": 38331 + }, + { + "epoch": 3.5767472240365774, + "grad_norm": NaN, + "learning_rate": 0.00011125629794961044, + "loss": 0.0, + "step": 38332 + }, + { + "epoch": 3.576840533731455, + "grad_norm": NaN, + "learning_rate": 0.00011124899087581498, + "loss": 0.0, + "step": 38333 + }, + { + "epoch": 3.576933843426332, + "grad_norm": NaN, + "learning_rate": 0.00011124168390055183, + "loss": 0.0, + "step": 38334 + }, + { + "epoch": 3.5770271531212092, + "grad_norm": NaN, + "learning_rate": 0.00011123437702383954, + "loss": 0.0, + "step": 38335 + }, + { + "epoch": 3.5771204628160866, + "grad_norm": NaN, + "learning_rate": 0.00011122707024569683, + "loss": 0.0, + "step": 38336 + }, + { + "epoch": 3.577213772510964, + "grad_norm": NaN, + "learning_rate": 0.00011121976356614218, + "loss": 0.0, + "step": 38337 + }, + { + "epoch": 3.577307082205841, + "grad_norm": NaN, + "learning_rate": 0.0001112124569851942, + "loss": 0.0, + "step": 38338 + }, + { + "epoch": 3.5774003919007185, + "grad_norm": NaN, + "learning_rate": 0.00011120515050287136, + "loss": 0.0, + "step": 38339 + }, + { + "epoch": 3.577493701595596, + "grad_norm": NaN, + "learning_rate": 0.00011119784411919239, + "loss": 0.0, + "step": 38340 + }, + { + "epoch": 3.577587011290473, + "grad_norm": NaN, + "learning_rate": 0.00011119053783417586, + "loss": 0.0, + "step": 38341 + }, + { + "epoch": 3.5776803209853503, + "grad_norm": NaN, + "learning_rate": 0.0001111832316478402, + "loss": 0.0, + "step": 38342 + }, + { + "epoch": 3.5777736306802277, + "grad_norm": NaN, + "learning_rate": 0.00011117592556020413, + "loss": 0.0, + "step": 38343 + }, + { + "epoch": 3.577866940375105, + "grad_norm": NaN, + "learning_rate": 0.00011116861957128619, + "loss": 0.0, + "step": 38344 + }, + { + "epoch": 3.5779602500699825, + "grad_norm": NaN, + "learning_rate": 0.00011116131368110494, + "loss": 0.0, + "step": 38345 + }, + { + "epoch": 3.5780535597648595, + "grad_norm": NaN, + "learning_rate": 0.00011115400788967894, + "loss": 0.0, + "step": 38346 + }, + { + "epoch": 3.578146869459737, + "grad_norm": NaN, + "learning_rate": 0.00011114670219702684, + "loss": 0.0, + "step": 38347 + }, + { + "epoch": 3.578240179154614, + "grad_norm": NaN, + "learning_rate": 0.0001111393966031671, + "loss": 0.0, + "step": 38348 + }, + { + "epoch": 3.5783334888494913, + "grad_norm": NaN, + "learning_rate": 0.00011113209110811841, + "loss": 0.0, + "step": 38349 + }, + { + "epoch": 3.5784267985443687, + "grad_norm": NaN, + "learning_rate": 0.00011112478571189931, + "loss": 0.0, + "step": 38350 + }, + { + "epoch": 3.578520108239246, + "grad_norm": NaN, + "learning_rate": 0.00011111748041452831, + "loss": 0.0, + "step": 38351 + }, + { + "epoch": 3.5786134179341236, + "grad_norm": NaN, + "learning_rate": 0.00011111017521602409, + "loss": 0.0, + "step": 38352 + }, + { + "epoch": 3.5787067276290006, + "grad_norm": NaN, + "learning_rate": 0.00011110287011640519, + "loss": 0.0, + "step": 38353 + }, + { + "epoch": 3.578800037323878, + "grad_norm": NaN, + "learning_rate": 0.0001110955651156901, + "loss": 0.0, + "step": 38354 + }, + { + "epoch": 3.5788933470187554, + "grad_norm": NaN, + "learning_rate": 0.00011108826021389746, + "loss": 0.0, + "step": 38355 + }, + { + "epoch": 3.5789866567136324, + "grad_norm": NaN, + "learning_rate": 0.00011108095541104596, + "loss": 0.0, + "step": 38356 + }, + { + "epoch": 3.57907996640851, + "grad_norm": NaN, + "learning_rate": 0.00011107365070715396, + "loss": 0.0, + "step": 38357 + }, + { + "epoch": 3.579173276103387, + "grad_norm": NaN, + "learning_rate": 0.00011106634610224013, + "loss": 0.0, + "step": 38358 + }, + { + "epoch": 3.5792665857982646, + "grad_norm": NaN, + "learning_rate": 0.00011105904159632313, + "loss": 0.0, + "step": 38359 + }, + { + "epoch": 3.5793598954931416, + "grad_norm": NaN, + "learning_rate": 0.0001110517371894214, + "loss": 0.0, + "step": 38360 + }, + { + "epoch": 3.579453205188019, + "grad_norm": NaN, + "learning_rate": 0.00011104443288155354, + "loss": 0.0, + "step": 38361 + }, + { + "epoch": 3.5795465148828964, + "grad_norm": NaN, + "learning_rate": 0.0001110371286727382, + "loss": 0.0, + "step": 38362 + }, + { + "epoch": 3.5796398245777734, + "grad_norm": NaN, + "learning_rate": 0.00011102982456299388, + "loss": 0.0, + "step": 38363 + }, + { + "epoch": 3.579733134272651, + "grad_norm": NaN, + "learning_rate": 0.00011102252055233913, + "loss": 0.0, + "step": 38364 + }, + { + "epoch": 3.5798264439675282, + "grad_norm": NaN, + "learning_rate": 0.00011101521664079266, + "loss": 0.0, + "step": 38365 + }, + { + "epoch": 3.5799197536624057, + "grad_norm": NaN, + "learning_rate": 0.0001110079128283729, + "loss": 0.0, + "step": 38366 + }, + { + "epoch": 3.580013063357283, + "grad_norm": NaN, + "learning_rate": 0.00011100060911509842, + "loss": 0.0, + "step": 38367 + }, + { + "epoch": 3.58010637305216, + "grad_norm": NaN, + "learning_rate": 0.00011099330550098788, + "loss": 0.0, + "step": 38368 + }, + { + "epoch": 3.5801996827470375, + "grad_norm": NaN, + "learning_rate": 0.00011098600198605982, + "loss": 0.0, + "step": 38369 + }, + { + "epoch": 3.5802929924419145, + "grad_norm": NaN, + "learning_rate": 0.00011097869857033276, + "loss": 0.0, + "step": 38370 + }, + { + "epoch": 3.580386302136792, + "grad_norm": NaN, + "learning_rate": 0.00011097139525382536, + "loss": 0.0, + "step": 38371 + }, + { + "epoch": 3.5804796118316693, + "grad_norm": NaN, + "learning_rate": 0.00011096409203655613, + "loss": 0.0, + "step": 38372 + }, + { + "epoch": 3.5805729215265467, + "grad_norm": NaN, + "learning_rate": 0.0001109567889185436, + "loss": 0.0, + "step": 38373 + }, + { + "epoch": 3.580666231221424, + "grad_norm": NaN, + "learning_rate": 0.00011094948589980646, + "loss": 0.0, + "step": 38374 + }, + { + "epoch": 3.580759540916301, + "grad_norm": NaN, + "learning_rate": 0.00011094218298036317, + "loss": 0.0, + "step": 38375 + }, + { + "epoch": 3.5808528506111785, + "grad_norm": NaN, + "learning_rate": 0.00011093488016023231, + "loss": 0.0, + "step": 38376 + }, + { + "epoch": 3.580946160306056, + "grad_norm": NaN, + "learning_rate": 0.00011092757743943256, + "loss": 0.0, + "step": 38377 + }, + { + "epoch": 3.581039470000933, + "grad_norm": NaN, + "learning_rate": 0.00011092027481798236, + "loss": 0.0, + "step": 38378 + }, + { + "epoch": 3.5811327796958103, + "grad_norm": NaN, + "learning_rate": 0.0001109129722959003, + "loss": 0.0, + "step": 38379 + }, + { + "epoch": 3.5812260893906878, + "grad_norm": NaN, + "learning_rate": 0.00011090566987320504, + "loss": 0.0, + "step": 38380 + }, + { + "epoch": 3.581319399085565, + "grad_norm": NaN, + "learning_rate": 0.00011089836754991502, + "loss": 0.0, + "step": 38381 + }, + { + "epoch": 3.581412708780442, + "grad_norm": NaN, + "learning_rate": 0.00011089106532604894, + "loss": 0.0, + "step": 38382 + }, + { + "epoch": 3.5815060184753196, + "grad_norm": NaN, + "learning_rate": 0.00011088376320162522, + "loss": 0.0, + "step": 38383 + }, + { + "epoch": 3.581599328170197, + "grad_norm": NaN, + "learning_rate": 0.00011087646117666252, + "loss": 0.0, + "step": 38384 + }, + { + "epoch": 3.581692637865074, + "grad_norm": NaN, + "learning_rate": 0.00011086915925117943, + "loss": 0.0, + "step": 38385 + }, + { + "epoch": 3.5817859475599514, + "grad_norm": NaN, + "learning_rate": 0.00011086185742519442, + "loss": 0.0, + "step": 38386 + }, + { + "epoch": 3.581879257254829, + "grad_norm": NaN, + "learning_rate": 0.00011085455569872614, + "loss": 0.0, + "step": 38387 + }, + { + "epoch": 3.5819725669497062, + "grad_norm": NaN, + "learning_rate": 0.00011084725407179316, + "loss": 0.0, + "step": 38388 + }, + { + "epoch": 3.5820658766445836, + "grad_norm": NaN, + "learning_rate": 0.00011083995254441395, + "loss": 0.0, + "step": 38389 + }, + { + "epoch": 3.5821591863394606, + "grad_norm": NaN, + "learning_rate": 0.00011083265111660715, + "loss": 0.0, + "step": 38390 + }, + { + "epoch": 3.582252496034338, + "grad_norm": NaN, + "learning_rate": 0.00011082534978839139, + "loss": 0.0, + "step": 38391 + }, + { + "epoch": 3.582345805729215, + "grad_norm": NaN, + "learning_rate": 0.00011081804855978507, + "loss": 0.0, + "step": 38392 + }, + { + "epoch": 3.5824391154240924, + "grad_norm": NaN, + "learning_rate": 0.00011081074743080685, + "loss": 0.0, + "step": 38393 + }, + { + "epoch": 3.58253242511897, + "grad_norm": NaN, + "learning_rate": 0.00011080344640147538, + "loss": 0.0, + "step": 38394 + }, + { + "epoch": 3.5826257348138473, + "grad_norm": NaN, + "learning_rate": 0.00011079614547180907, + "loss": 0.0, + "step": 38395 + }, + { + "epoch": 3.5827190445087247, + "grad_norm": NaN, + "learning_rate": 0.00011078884464182652, + "loss": 0.0, + "step": 38396 + }, + { + "epoch": 3.5828123542036017, + "grad_norm": NaN, + "learning_rate": 0.00011078154391154639, + "loss": 0.0, + "step": 38397 + }, + { + "epoch": 3.582905663898479, + "grad_norm": NaN, + "learning_rate": 0.00011077424328098716, + "loss": 0.0, + "step": 38398 + }, + { + "epoch": 3.5829989735933565, + "grad_norm": NaN, + "learning_rate": 0.00011076694275016735, + "loss": 0.0, + "step": 38399 + }, + { + "epoch": 3.5830922832882335, + "grad_norm": NaN, + "learning_rate": 0.00011075964231910567, + "loss": 0.0, + "step": 38400 + }, + { + "epoch": 3.583185592983111, + "grad_norm": NaN, + "learning_rate": 0.00011075234198782056, + "loss": 0.0, + "step": 38401 + }, + { + "epoch": 3.5832789026779883, + "grad_norm": NaN, + "learning_rate": 0.0001107450417563306, + "loss": 0.0, + "step": 38402 + }, + { + "epoch": 3.5833722123728657, + "grad_norm": NaN, + "learning_rate": 0.00011073774162465442, + "loss": 0.0, + "step": 38403 + }, + { + "epoch": 3.5834655220677427, + "grad_norm": NaN, + "learning_rate": 0.00011073044159281049, + "loss": 0.0, + "step": 38404 + }, + { + "epoch": 3.58355883176262, + "grad_norm": NaN, + "learning_rate": 0.00011072314166081738, + "loss": 0.0, + "step": 38405 + }, + { + "epoch": 3.5836521414574976, + "grad_norm": NaN, + "learning_rate": 0.00011071584182869378, + "loss": 0.0, + "step": 38406 + }, + { + "epoch": 3.5837454511523745, + "grad_norm": NaN, + "learning_rate": 0.00011070854209645809, + "loss": 0.0, + "step": 38407 + }, + { + "epoch": 3.583838760847252, + "grad_norm": NaN, + "learning_rate": 0.00011070124246412893, + "loss": 0.0, + "step": 38408 + }, + { + "epoch": 3.5839320705421294, + "grad_norm": NaN, + "learning_rate": 0.00011069394293172495, + "loss": 0.0, + "step": 38409 + }, + { + "epoch": 3.584025380237007, + "grad_norm": NaN, + "learning_rate": 0.00011068664349926457, + "loss": 0.0, + "step": 38410 + }, + { + "epoch": 3.5841186899318838, + "grad_norm": NaN, + "learning_rate": 0.00011067934416676638, + "loss": 0.0, + "step": 38411 + }, + { + "epoch": 3.584211999626761, + "grad_norm": NaN, + "learning_rate": 0.00011067204493424903, + "loss": 0.0, + "step": 38412 + }, + { + "epoch": 3.5843053093216386, + "grad_norm": NaN, + "learning_rate": 0.000110664745801731, + "loss": 0.0, + "step": 38413 + }, + { + "epoch": 3.5843986190165156, + "grad_norm": NaN, + "learning_rate": 0.00011065744676923082, + "loss": 0.0, + "step": 38414 + }, + { + "epoch": 3.584491928711393, + "grad_norm": NaN, + "learning_rate": 0.0001106501478367672, + "loss": 0.0, + "step": 38415 + }, + { + "epoch": 3.5845852384062704, + "grad_norm": NaN, + "learning_rate": 0.00011064284900435853, + "loss": 0.0, + "step": 38416 + }, + { + "epoch": 3.584678548101148, + "grad_norm": NaN, + "learning_rate": 0.00011063555027202341, + "loss": 0.0, + "step": 38417 + }, + { + "epoch": 3.5847718577960253, + "grad_norm": NaN, + "learning_rate": 0.00011062825163978049, + "loss": 0.0, + "step": 38418 + }, + { + "epoch": 3.5848651674909022, + "grad_norm": NaN, + "learning_rate": 0.00011062095310764823, + "loss": 0.0, + "step": 38419 + }, + { + "epoch": 3.5849584771857796, + "grad_norm": NaN, + "learning_rate": 0.00011061365467564522, + "loss": 0.0, + "step": 38420 + }, + { + "epoch": 3.585051786880657, + "grad_norm": NaN, + "learning_rate": 0.00011060635634379005, + "loss": 0.0, + "step": 38421 + }, + { + "epoch": 3.585145096575534, + "grad_norm": NaN, + "learning_rate": 0.00011059905811210122, + "loss": 0.0, + "step": 38422 + }, + { + "epoch": 3.5852384062704115, + "grad_norm": NaN, + "learning_rate": 0.00011059175998059729, + "loss": 0.0, + "step": 38423 + }, + { + "epoch": 3.585331715965289, + "grad_norm": NaN, + "learning_rate": 0.00011058446194929689, + "loss": 0.0, + "step": 38424 + }, + { + "epoch": 3.5854250256601663, + "grad_norm": NaN, + "learning_rate": 0.00011057716401821852, + "loss": 0.0, + "step": 38425 + }, + { + "epoch": 3.5855183353550433, + "grad_norm": NaN, + "learning_rate": 0.00011056986618738075, + "loss": 0.0, + "step": 38426 + }, + { + "epoch": 3.5856116450499207, + "grad_norm": NaN, + "learning_rate": 0.00011056256845680209, + "loss": 0.0, + "step": 38427 + }, + { + "epoch": 3.585704954744798, + "grad_norm": NaN, + "learning_rate": 0.00011055527082650115, + "loss": 0.0, + "step": 38428 + }, + { + "epoch": 3.585798264439675, + "grad_norm": NaN, + "learning_rate": 0.00011054797329649652, + "loss": 0.0, + "step": 38429 + }, + { + "epoch": 3.5858915741345525, + "grad_norm": NaN, + "learning_rate": 0.00011054067586680662, + "loss": 0.0, + "step": 38430 + }, + { + "epoch": 3.58598488382943, + "grad_norm": NaN, + "learning_rate": 0.00011053337853745012, + "loss": 0.0, + "step": 38431 + }, + { + "epoch": 3.5860781935243073, + "grad_norm": NaN, + "learning_rate": 0.00011052608130844559, + "loss": 0.0, + "step": 38432 + }, + { + "epoch": 3.5861715032191843, + "grad_norm": NaN, + "learning_rate": 0.00011051878417981151, + "loss": 0.0, + "step": 38433 + }, + { + "epoch": 3.5862648129140617, + "grad_norm": NaN, + "learning_rate": 0.00011051148715156644, + "loss": 0.0, + "step": 38434 + }, + { + "epoch": 3.586358122608939, + "grad_norm": NaN, + "learning_rate": 0.00011050419022372903, + "loss": 0.0, + "step": 38435 + }, + { + "epoch": 3.586451432303816, + "grad_norm": NaN, + "learning_rate": 0.00011049689339631772, + "loss": 0.0, + "step": 38436 + }, + { + "epoch": 3.5865447419986936, + "grad_norm": NaN, + "learning_rate": 0.0001104895966693511, + "loss": 0.0, + "step": 38437 + }, + { + "epoch": 3.586638051693571, + "grad_norm": NaN, + "learning_rate": 0.00011048230004284778, + "loss": 0.0, + "step": 38438 + }, + { + "epoch": 3.5867313613884484, + "grad_norm": NaN, + "learning_rate": 0.00011047500351682622, + "loss": 0.0, + "step": 38439 + }, + { + "epoch": 3.586824671083326, + "grad_norm": NaN, + "learning_rate": 0.00011046770709130501, + "loss": 0.0, + "step": 38440 + }, + { + "epoch": 3.586917980778203, + "grad_norm": NaN, + "learning_rate": 0.00011046041076630278, + "loss": 0.0, + "step": 38441 + }, + { + "epoch": 3.58701129047308, + "grad_norm": NaN, + "learning_rate": 0.00011045311454183795, + "loss": 0.0, + "step": 38442 + }, + { + "epoch": 3.587104600167957, + "grad_norm": NaN, + "learning_rate": 0.00011044581841792911, + "loss": 0.0, + "step": 38443 + }, + { + "epoch": 3.5871979098628346, + "grad_norm": NaN, + "learning_rate": 0.00011043852239459492, + "loss": 0.0, + "step": 38444 + }, + { + "epoch": 3.587291219557712, + "grad_norm": NaN, + "learning_rate": 0.00011043122647185382, + "loss": 0.0, + "step": 38445 + }, + { + "epoch": 3.5873845292525894, + "grad_norm": NaN, + "learning_rate": 0.00011042393064972433, + "loss": 0.0, + "step": 38446 + }, + { + "epoch": 3.587477838947467, + "grad_norm": NaN, + "learning_rate": 0.00011041663492822514, + "loss": 0.0, + "step": 38447 + }, + { + "epoch": 3.587571148642344, + "grad_norm": NaN, + "learning_rate": 0.0001104093393073747, + "loss": 0.0, + "step": 38448 + }, + { + "epoch": 3.5876644583372213, + "grad_norm": NaN, + "learning_rate": 0.00011040204378719154, + "loss": 0.0, + "step": 38449 + }, + { + "epoch": 3.5877577680320987, + "grad_norm": NaN, + "learning_rate": 0.00011039474836769431, + "loss": 0.0, + "step": 38450 + }, + { + "epoch": 3.5878510777269756, + "grad_norm": NaN, + "learning_rate": 0.00011038745304890148, + "loss": 0.0, + "step": 38451 + }, + { + "epoch": 3.587944387421853, + "grad_norm": NaN, + "learning_rate": 0.0001103801578308316, + "loss": 0.0, + "step": 38452 + }, + { + "epoch": 3.5880376971167305, + "grad_norm": NaN, + "learning_rate": 0.00011037286271350328, + "loss": 0.0, + "step": 38453 + }, + { + "epoch": 3.588131006811608, + "grad_norm": NaN, + "learning_rate": 0.00011036556769693503, + "loss": 0.0, + "step": 38454 + }, + { + "epoch": 3.588224316506485, + "grad_norm": NaN, + "learning_rate": 0.00011035827278114536, + "loss": 0.0, + "step": 38455 + }, + { + "epoch": 3.5883176262013623, + "grad_norm": NaN, + "learning_rate": 0.00011035097796615288, + "loss": 0.0, + "step": 38456 + }, + { + "epoch": 3.5884109358962397, + "grad_norm": NaN, + "learning_rate": 0.00011034368325197614, + "loss": 0.0, + "step": 38457 + }, + { + "epoch": 3.5885042455911167, + "grad_norm": NaN, + "learning_rate": 0.00011033638863863361, + "loss": 0.0, + "step": 38458 + }, + { + "epoch": 3.588597555285994, + "grad_norm": NaN, + "learning_rate": 0.00011032909412614397, + "loss": 0.0, + "step": 38459 + }, + { + "epoch": 3.5886908649808715, + "grad_norm": NaN, + "learning_rate": 0.00011032179971452567, + "loss": 0.0, + "step": 38460 + }, + { + "epoch": 3.588784174675749, + "grad_norm": NaN, + "learning_rate": 0.00011031450540379722, + "loss": 0.0, + "step": 38461 + }, + { + "epoch": 3.5888774843706264, + "grad_norm": NaN, + "learning_rate": 0.00011030721119397732, + "loss": 0.0, + "step": 38462 + }, + { + "epoch": 3.5889707940655033, + "grad_norm": NaN, + "learning_rate": 0.00011029991708508437, + "loss": 0.0, + "step": 38463 + }, + { + "epoch": 3.5890641037603808, + "grad_norm": NaN, + "learning_rate": 0.00011029262307713693, + "loss": 0.0, + "step": 38464 + }, + { + "epoch": 3.5891574134552577, + "grad_norm": NaN, + "learning_rate": 0.00011028532917015366, + "loss": 0.0, + "step": 38465 + }, + { + "epoch": 3.589250723150135, + "grad_norm": NaN, + "learning_rate": 0.00011027803536415303, + "loss": 0.0, + "step": 38466 + }, + { + "epoch": 3.5893440328450126, + "grad_norm": NaN, + "learning_rate": 0.00011027074165915352, + "loss": 0.0, + "step": 38467 + }, + { + "epoch": 3.58943734253989, + "grad_norm": NaN, + "learning_rate": 0.00011026344805517385, + "loss": 0.0, + "step": 38468 + }, + { + "epoch": 3.5895306522347674, + "grad_norm": NaN, + "learning_rate": 0.00011025615455223235, + "loss": 0.0, + "step": 38469 + }, + { + "epoch": 3.5896239619296444, + "grad_norm": NaN, + "learning_rate": 0.00011024886115034776, + "loss": 0.0, + "step": 38470 + }, + { + "epoch": 3.589717271624522, + "grad_norm": NaN, + "learning_rate": 0.00011024156784953854, + "loss": 0.0, + "step": 38471 + }, + { + "epoch": 3.5898105813193992, + "grad_norm": NaN, + "learning_rate": 0.00011023427464982316, + "loss": 0.0, + "step": 38472 + }, + { + "epoch": 3.589903891014276, + "grad_norm": NaN, + "learning_rate": 0.00011022698155122032, + "loss": 0.0, + "step": 38473 + }, + { + "epoch": 3.5899972007091536, + "grad_norm": NaN, + "learning_rate": 0.00011021968855374845, + "loss": 0.0, + "step": 38474 + }, + { + "epoch": 3.590090510404031, + "grad_norm": NaN, + "learning_rate": 0.00011021239565742611, + "loss": 0.0, + "step": 38475 + }, + { + "epoch": 3.5901838200989085, + "grad_norm": NaN, + "learning_rate": 0.00011020510286227192, + "loss": 0.0, + "step": 38476 + }, + { + "epoch": 3.5902771297937854, + "grad_norm": NaN, + "learning_rate": 0.0001101978101683043, + "loss": 0.0, + "step": 38477 + }, + { + "epoch": 3.590370439488663, + "grad_norm": NaN, + "learning_rate": 0.00011019051757554187, + "loss": 0.0, + "step": 38478 + }, + { + "epoch": 3.5904637491835403, + "grad_norm": NaN, + "learning_rate": 0.0001101832250840032, + "loss": 0.0, + "step": 38479 + }, + { + "epoch": 3.5905570588784173, + "grad_norm": NaN, + "learning_rate": 0.00011017593269370678, + "loss": 0.0, + "step": 38480 + }, + { + "epoch": 3.5906503685732947, + "grad_norm": NaN, + "learning_rate": 0.00011016864040467113, + "loss": 0.0, + "step": 38481 + }, + { + "epoch": 3.590743678268172, + "grad_norm": NaN, + "learning_rate": 0.00011016134821691489, + "loss": 0.0, + "step": 38482 + }, + { + "epoch": 3.5908369879630495, + "grad_norm": NaN, + "learning_rate": 0.00011015405613045649, + "loss": 0.0, + "step": 38483 + }, + { + "epoch": 3.590930297657927, + "grad_norm": NaN, + "learning_rate": 0.00011014676414531451, + "loss": 0.0, + "step": 38484 + }, + { + "epoch": 3.591023607352804, + "grad_norm": NaN, + "learning_rate": 0.00011013947226150758, + "loss": 0.0, + "step": 38485 + }, + { + "epoch": 3.5911169170476813, + "grad_norm": NaN, + "learning_rate": 0.0001101321804790541, + "loss": 0.0, + "step": 38486 + }, + { + "epoch": 3.5912102267425583, + "grad_norm": NaN, + "learning_rate": 0.00011012488879797267, + "loss": 0.0, + "step": 38487 + }, + { + "epoch": 3.5913035364374357, + "grad_norm": NaN, + "learning_rate": 0.00011011759721828188, + "loss": 0.0, + "step": 38488 + }, + { + "epoch": 3.591396846132313, + "grad_norm": NaN, + "learning_rate": 0.0001101103057400002, + "loss": 0.0, + "step": 38489 + }, + { + "epoch": 3.5914901558271906, + "grad_norm": NaN, + "learning_rate": 0.00011010301436314618, + "loss": 0.0, + "step": 38490 + }, + { + "epoch": 3.591583465522068, + "grad_norm": NaN, + "learning_rate": 0.0001100957230877384, + "loss": 0.0, + "step": 38491 + }, + { + "epoch": 3.591676775216945, + "grad_norm": NaN, + "learning_rate": 0.00011008843191379539, + "loss": 0.0, + "step": 38492 + }, + { + "epoch": 3.5917700849118224, + "grad_norm": NaN, + "learning_rate": 0.00011008114084133563, + "loss": 0.0, + "step": 38493 + }, + { + "epoch": 3.5918633946067, + "grad_norm": NaN, + "learning_rate": 0.00011007384987037776, + "loss": 0.0, + "step": 38494 + }, + { + "epoch": 3.5919567043015768, + "grad_norm": NaN, + "learning_rate": 0.00011006655900094024, + "loss": 0.0, + "step": 38495 + }, + { + "epoch": 3.592050013996454, + "grad_norm": NaN, + "learning_rate": 0.00011005926823304158, + "loss": 0.0, + "step": 38496 + }, + { + "epoch": 3.5921433236913316, + "grad_norm": NaN, + "learning_rate": 0.00011005197756670046, + "loss": 0.0, + "step": 38497 + }, + { + "epoch": 3.592236633386209, + "grad_norm": NaN, + "learning_rate": 0.00011004468700193526, + "loss": 0.0, + "step": 38498 + }, + { + "epoch": 3.592329943081086, + "grad_norm": NaN, + "learning_rate": 0.00011003739653876458, + "loss": 0.0, + "step": 38499 + }, + { + "epoch": 3.5924232527759634, + "grad_norm": NaN, + "learning_rate": 0.00011003010617720702, + "loss": 0.0, + "step": 38500 + }, + { + "epoch": 3.592516562470841, + "grad_norm": NaN, + "learning_rate": 0.00011002281591728104, + "loss": 0.0, + "step": 38501 + }, + { + "epoch": 3.592609872165718, + "grad_norm": NaN, + "learning_rate": 0.00011001552575900516, + "loss": 0.0, + "step": 38502 + }, + { + "epoch": 3.5927031818605952, + "grad_norm": NaN, + "learning_rate": 0.00011000823570239802, + "loss": 0.0, + "step": 38503 + }, + { + "epoch": 3.5927964915554726, + "grad_norm": NaN, + "learning_rate": 0.000110000945747478, + "loss": 0.0, + "step": 38504 + }, + { + "epoch": 3.59288980125035, + "grad_norm": NaN, + "learning_rate": 0.00010999365589426379, + "loss": 0.0, + "step": 38505 + }, + { + "epoch": 3.592983110945227, + "grad_norm": NaN, + "learning_rate": 0.00010998636614277387, + "loss": 0.0, + "step": 38506 + }, + { + "epoch": 3.5930764206401045, + "grad_norm": NaN, + "learning_rate": 0.00010997907649302671, + "loss": 0.0, + "step": 38507 + }, + { + "epoch": 3.593169730334982, + "grad_norm": NaN, + "learning_rate": 0.00010997178694504095, + "loss": 0.0, + "step": 38508 + }, + { + "epoch": 3.593263040029859, + "grad_norm": NaN, + "learning_rate": 0.00010996449749883509, + "loss": 0.0, + "step": 38509 + }, + { + "epoch": 3.5933563497247363, + "grad_norm": NaN, + "learning_rate": 0.00010995720815442762, + "loss": 0.0, + "step": 38510 + }, + { + "epoch": 3.5934496594196137, + "grad_norm": NaN, + "learning_rate": 0.00010994991891183713, + "loss": 0.0, + "step": 38511 + }, + { + "epoch": 3.593542969114491, + "grad_norm": NaN, + "learning_rate": 0.00010994262977108214, + "loss": 0.0, + "step": 38512 + }, + { + "epoch": 3.5936362788093685, + "grad_norm": NaN, + "learning_rate": 0.00010993534073218113, + "loss": 0.0, + "step": 38513 + }, + { + "epoch": 3.5937295885042455, + "grad_norm": NaN, + "learning_rate": 0.00010992805179515269, + "loss": 0.0, + "step": 38514 + }, + { + "epoch": 3.593822898199123, + "grad_norm": NaN, + "learning_rate": 0.0001099207629600154, + "loss": 0.0, + "step": 38515 + }, + { + "epoch": 3.5939162078940003, + "grad_norm": NaN, + "learning_rate": 0.00010991347422678768, + "loss": 0.0, + "step": 38516 + }, + { + "epoch": 3.5940095175888773, + "grad_norm": NaN, + "learning_rate": 0.00010990618559548818, + "loss": 0.0, + "step": 38517 + }, + { + "epoch": 3.5941028272837547, + "grad_norm": NaN, + "learning_rate": 0.00010989889706613533, + "loss": 0.0, + "step": 38518 + }, + { + "epoch": 3.594196136978632, + "grad_norm": NaN, + "learning_rate": 0.00010989160863874769, + "loss": 0.0, + "step": 38519 + }, + { + "epoch": 3.5942894466735096, + "grad_norm": NaN, + "learning_rate": 0.00010988432031334385, + "loss": 0.0, + "step": 38520 + }, + { + "epoch": 3.5943827563683866, + "grad_norm": NaN, + "learning_rate": 0.0001098770320899423, + "loss": 0.0, + "step": 38521 + }, + { + "epoch": 3.594476066063264, + "grad_norm": NaN, + "learning_rate": 0.0001098697439685615, + "loss": 0.0, + "step": 38522 + }, + { + "epoch": 3.5945693757581414, + "grad_norm": NaN, + "learning_rate": 0.00010986245594922014, + "loss": 0.0, + "step": 38523 + }, + { + "epoch": 3.5946626854530184, + "grad_norm": NaN, + "learning_rate": 0.00010985516803193664, + "loss": 0.0, + "step": 38524 + }, + { + "epoch": 3.594755995147896, + "grad_norm": NaN, + "learning_rate": 0.0001098478802167295, + "loss": 0.0, + "step": 38525 + }, + { + "epoch": 3.594849304842773, + "grad_norm": NaN, + "learning_rate": 0.00010984059250361738, + "loss": 0.0, + "step": 38526 + }, + { + "epoch": 3.5949426145376506, + "grad_norm": NaN, + "learning_rate": 0.00010983330489261872, + "loss": 0.0, + "step": 38527 + }, + { + "epoch": 3.5950359242325276, + "grad_norm": NaN, + "learning_rate": 0.00010982601738375203, + "loss": 0.0, + "step": 38528 + }, + { + "epoch": 3.595129233927405, + "grad_norm": NaN, + "learning_rate": 0.00010981872997703594, + "loss": 0.0, + "step": 38529 + }, + { + "epoch": 3.5952225436222824, + "grad_norm": NaN, + "learning_rate": 0.0001098114426724889, + "loss": 0.0, + "step": 38530 + }, + { + "epoch": 3.5953158533171594, + "grad_norm": NaN, + "learning_rate": 0.00010980415547012939, + "loss": 0.0, + "step": 38531 + }, + { + "epoch": 3.595409163012037, + "grad_norm": NaN, + "learning_rate": 0.00010979686836997608, + "loss": 0.0, + "step": 38532 + }, + { + "epoch": 3.5955024727069143, + "grad_norm": NaN, + "learning_rate": 0.00010978958137204742, + "loss": 0.0, + "step": 38533 + }, + { + "epoch": 3.5955957824017917, + "grad_norm": NaN, + "learning_rate": 0.00010978229447636189, + "loss": 0.0, + "step": 38534 + }, + { + "epoch": 3.595689092096669, + "grad_norm": NaN, + "learning_rate": 0.00010977500768293813, + "loss": 0.0, + "step": 38535 + }, + { + "epoch": 3.595782401791546, + "grad_norm": NaN, + "learning_rate": 0.00010976772099179457, + "loss": 0.0, + "step": 38536 + }, + { + "epoch": 3.5958757114864235, + "grad_norm": NaN, + "learning_rate": 0.00010976043440294978, + "loss": 0.0, + "step": 38537 + }, + { + "epoch": 3.5959690211813005, + "grad_norm": NaN, + "learning_rate": 0.00010975314791642232, + "loss": 0.0, + "step": 38538 + }, + { + "epoch": 3.596062330876178, + "grad_norm": NaN, + "learning_rate": 0.00010974586153223066, + "loss": 0.0, + "step": 38539 + }, + { + "epoch": 3.5961556405710553, + "grad_norm": NaN, + "learning_rate": 0.00010973857525039334, + "loss": 0.0, + "step": 38540 + }, + { + "epoch": 3.5962489502659327, + "grad_norm": NaN, + "learning_rate": 0.00010973128907092893, + "loss": 0.0, + "step": 38541 + }, + { + "epoch": 3.59634225996081, + "grad_norm": NaN, + "learning_rate": 0.00010972400299385585, + "loss": 0.0, + "step": 38542 + }, + { + "epoch": 3.596435569655687, + "grad_norm": NaN, + "learning_rate": 0.00010971671701919276, + "loss": 0.0, + "step": 38543 + }, + { + "epoch": 3.5965288793505645, + "grad_norm": NaN, + "learning_rate": 0.00010970943114695816, + "loss": 0.0, + "step": 38544 + }, + { + "epoch": 3.596622189045442, + "grad_norm": NaN, + "learning_rate": 0.00010970214537717045, + "loss": 0.0, + "step": 38545 + }, + { + "epoch": 3.596715498740319, + "grad_norm": NaN, + "learning_rate": 0.0001096948597098483, + "loss": 0.0, + "step": 38546 + }, + { + "epoch": 3.5968088084351963, + "grad_norm": NaN, + "learning_rate": 0.00010968757414501022, + "loss": 0.0, + "step": 38547 + }, + { + "epoch": 3.5969021181300738, + "grad_norm": NaN, + "learning_rate": 0.00010968028868267461, + "loss": 0.0, + "step": 38548 + }, + { + "epoch": 3.596995427824951, + "grad_norm": NaN, + "learning_rate": 0.00010967300332286014, + "loss": 0.0, + "step": 38549 + }, + { + "epoch": 3.597088737519828, + "grad_norm": NaN, + "learning_rate": 0.00010966571806558529, + "loss": 0.0, + "step": 38550 + }, + { + "epoch": 3.5971820472147056, + "grad_norm": NaN, + "learning_rate": 0.00010965843291086852, + "loss": 0.0, + "step": 38551 + }, + { + "epoch": 3.597275356909583, + "grad_norm": NaN, + "learning_rate": 0.00010965114785872843, + "loss": 0.0, + "step": 38552 + }, + { + "epoch": 3.59736866660446, + "grad_norm": NaN, + "learning_rate": 0.00010964386290918355, + "loss": 0.0, + "step": 38553 + }, + { + "epoch": 3.5974619762993374, + "grad_norm": NaN, + "learning_rate": 0.00010963657806225231, + "loss": 0.0, + "step": 38554 + }, + { + "epoch": 3.597555285994215, + "grad_norm": NaN, + "learning_rate": 0.00010962929331795333, + "loss": 0.0, + "step": 38555 + }, + { + "epoch": 3.5976485956890922, + "grad_norm": NaN, + "learning_rate": 0.00010962200867630513, + "loss": 0.0, + "step": 38556 + }, + { + "epoch": 3.5977419053839697, + "grad_norm": NaN, + "learning_rate": 0.00010961472413732614, + "loss": 0.0, + "step": 38557 + }, + { + "epoch": 3.5978352150788466, + "grad_norm": NaN, + "learning_rate": 0.00010960743970103498, + "loss": 0.0, + "step": 38558 + }, + { + "epoch": 3.597928524773724, + "grad_norm": NaN, + "learning_rate": 0.00010960015536745015, + "loss": 0.0, + "step": 38559 + }, + { + "epoch": 3.598021834468601, + "grad_norm": NaN, + "learning_rate": 0.00010959287113659009, + "loss": 0.0, + "step": 38560 + }, + { + "epoch": 3.5981151441634784, + "grad_norm": NaN, + "learning_rate": 0.00010958558700847347, + "loss": 0.0, + "step": 38561 + }, + { + "epoch": 3.598208453858356, + "grad_norm": NaN, + "learning_rate": 0.0001095783029831187, + "loss": 0.0, + "step": 38562 + }, + { + "epoch": 3.5983017635532333, + "grad_norm": NaN, + "learning_rate": 0.00010957101906054428, + "loss": 0.0, + "step": 38563 + }, + { + "epoch": 3.5983950732481107, + "grad_norm": NaN, + "learning_rate": 0.00010956373524076887, + "loss": 0.0, + "step": 38564 + }, + { + "epoch": 3.5984883829429877, + "grad_norm": NaN, + "learning_rate": 0.00010955645152381086, + "loss": 0.0, + "step": 38565 + }, + { + "epoch": 3.598581692637865, + "grad_norm": NaN, + "learning_rate": 0.00010954916790968878, + "loss": 0.0, + "step": 38566 + }, + { + "epoch": 3.5986750023327425, + "grad_norm": NaN, + "learning_rate": 0.00010954188439842125, + "loss": 0.0, + "step": 38567 + }, + { + "epoch": 3.5987683120276195, + "grad_norm": NaN, + "learning_rate": 0.00010953460099002669, + "loss": 0.0, + "step": 38568 + }, + { + "epoch": 3.598861621722497, + "grad_norm": NaN, + "learning_rate": 0.00010952731768452362, + "loss": 0.0, + "step": 38569 + }, + { + "epoch": 3.5989549314173743, + "grad_norm": NaN, + "learning_rate": 0.00010952003448193067, + "loss": 0.0, + "step": 38570 + }, + { + "epoch": 3.5990482411122517, + "grad_norm": NaN, + "learning_rate": 0.00010951275138226624, + "loss": 0.0, + "step": 38571 + }, + { + "epoch": 3.5991415508071287, + "grad_norm": NaN, + "learning_rate": 0.00010950546838554883, + "loss": 0.0, + "step": 38572 + }, + { + "epoch": 3.599234860502006, + "grad_norm": NaN, + "learning_rate": 0.00010949818549179713, + "loss": 0.0, + "step": 38573 + }, + { + "epoch": 3.5993281701968836, + "grad_norm": NaN, + "learning_rate": 0.00010949090270102948, + "loss": 0.0, + "step": 38574 + }, + { + "epoch": 3.5994214798917605, + "grad_norm": NaN, + "learning_rate": 0.00010948362001326444, + "loss": 0.0, + "step": 38575 + }, + { + "epoch": 3.599514789586638, + "grad_norm": NaN, + "learning_rate": 0.00010947633742852062, + "loss": 0.0, + "step": 38576 + }, + { + "epoch": 3.5996080992815154, + "grad_norm": NaN, + "learning_rate": 0.00010946905494681644, + "loss": 0.0, + "step": 38577 + }, + { + "epoch": 3.599701408976393, + "grad_norm": NaN, + "learning_rate": 0.00010946177256817044, + "loss": 0.0, + "step": 38578 + }, + { + "epoch": 3.59979471867127, + "grad_norm": NaN, + "learning_rate": 0.00010945449029260117, + "loss": 0.0, + "step": 38579 + }, + { + "epoch": 3.599888028366147, + "grad_norm": NaN, + "learning_rate": 0.00010944720812012707, + "loss": 0.0, + "step": 38580 + }, + { + "epoch": 3.5999813380610246, + "grad_norm": NaN, + "learning_rate": 0.00010943992605076674, + "loss": 0.0, + "step": 38581 + }, + { + "epoch": 3.6000746477559016, + "grad_norm": NaN, + "learning_rate": 0.00010943264408453869, + "loss": 0.0, + "step": 38582 + }, + { + "epoch": 3.600167957450779, + "grad_norm": NaN, + "learning_rate": 0.00010942536222146135, + "loss": 0.0, + "step": 38583 + }, + { + "epoch": 3.6002612671456564, + "grad_norm": NaN, + "learning_rate": 0.00010941808046155334, + "loss": 0.0, + "step": 38584 + }, + { + "epoch": 3.600354576840534, + "grad_norm": NaN, + "learning_rate": 0.00010941079880483315, + "loss": 0.0, + "step": 38585 + }, + { + "epoch": 3.6004478865354113, + "grad_norm": NaN, + "learning_rate": 0.00010940351725131919, + "loss": 0.0, + "step": 38586 + }, + { + "epoch": 3.6005411962302882, + "grad_norm": NaN, + "learning_rate": 0.00010939623580103013, + "loss": 0.0, + "step": 38587 + }, + { + "epoch": 3.6006345059251657, + "grad_norm": NaN, + "learning_rate": 0.00010938895445398444, + "loss": 0.0, + "step": 38588 + }, + { + "epoch": 3.600727815620043, + "grad_norm": NaN, + "learning_rate": 0.00010938167321020053, + "loss": 0.0, + "step": 38589 + }, + { + "epoch": 3.60082112531492, + "grad_norm": NaN, + "learning_rate": 0.00010937439206969702, + "loss": 0.0, + "step": 38590 + }, + { + "epoch": 3.6009144350097975, + "grad_norm": NaN, + "learning_rate": 0.00010936711103249243, + "loss": 0.0, + "step": 38591 + }, + { + "epoch": 3.601007744704675, + "grad_norm": NaN, + "learning_rate": 0.00010935983009860518, + "loss": 0.0, + "step": 38592 + }, + { + "epoch": 3.6011010543995523, + "grad_norm": NaN, + "learning_rate": 0.00010935254926805388, + "loss": 0.0, + "step": 38593 + }, + { + "epoch": 3.6011943640944293, + "grad_norm": NaN, + "learning_rate": 0.00010934526854085705, + "loss": 0.0, + "step": 38594 + }, + { + "epoch": 3.6012876737893067, + "grad_norm": NaN, + "learning_rate": 0.00010933798791703308, + "loss": 0.0, + "step": 38595 + }, + { + "epoch": 3.601380983484184, + "grad_norm": NaN, + "learning_rate": 0.0001093307073966006, + "loss": 0.0, + "step": 38596 + }, + { + "epoch": 3.601474293179061, + "grad_norm": NaN, + "learning_rate": 0.0001093234269795781, + "loss": 0.0, + "step": 38597 + }, + { + "epoch": 3.6015676028739385, + "grad_norm": NaN, + "learning_rate": 0.00010931614666598404, + "loss": 0.0, + "step": 38598 + }, + { + "epoch": 3.601660912568816, + "grad_norm": NaN, + "learning_rate": 0.00010930886645583697, + "loss": 0.0, + "step": 38599 + }, + { + "epoch": 3.6017542222636934, + "grad_norm": NaN, + "learning_rate": 0.00010930158634915544, + "loss": 0.0, + "step": 38600 + }, + { + "epoch": 3.6018475319585708, + "grad_norm": NaN, + "learning_rate": 0.00010929430634595787, + "loss": 0.0, + "step": 38601 + }, + { + "epoch": 3.6019408416534477, + "grad_norm": NaN, + "learning_rate": 0.00010928702644626284, + "loss": 0.0, + "step": 38602 + }, + { + "epoch": 3.602034151348325, + "grad_norm": NaN, + "learning_rate": 0.00010927974665008889, + "loss": 0.0, + "step": 38603 + }, + { + "epoch": 3.602127461043202, + "grad_norm": NaN, + "learning_rate": 0.0001092724669574544, + "loss": 0.0, + "step": 38604 + }, + { + "epoch": 3.6022207707380796, + "grad_norm": NaN, + "learning_rate": 0.00010926518736837802, + "loss": 0.0, + "step": 38605 + }, + { + "epoch": 3.602314080432957, + "grad_norm": NaN, + "learning_rate": 0.0001092579078828782, + "loss": 0.0, + "step": 38606 + }, + { + "epoch": 3.6024073901278344, + "grad_norm": NaN, + "learning_rate": 0.00010925062850097339, + "loss": 0.0, + "step": 38607 + }, + { + "epoch": 3.602500699822712, + "grad_norm": NaN, + "learning_rate": 0.00010924334922268224, + "loss": 0.0, + "step": 38608 + }, + { + "epoch": 3.602594009517589, + "grad_norm": NaN, + "learning_rate": 0.00010923607004802314, + "loss": 0.0, + "step": 38609 + }, + { + "epoch": 3.602687319212466, + "grad_norm": NaN, + "learning_rate": 0.00010922879097701462, + "loss": 0.0, + "step": 38610 + }, + { + "epoch": 3.6027806289073436, + "grad_norm": NaN, + "learning_rate": 0.00010922151200967526, + "loss": 0.0, + "step": 38611 + }, + { + "epoch": 3.6028739386022206, + "grad_norm": NaN, + "learning_rate": 0.0001092142331460235, + "loss": 0.0, + "step": 38612 + }, + { + "epoch": 3.602967248297098, + "grad_norm": NaN, + "learning_rate": 0.00010920695438607781, + "loss": 0.0, + "step": 38613 + }, + { + "epoch": 3.6030605579919754, + "grad_norm": NaN, + "learning_rate": 0.00010919967572985682, + "loss": 0.0, + "step": 38614 + }, + { + "epoch": 3.603153867686853, + "grad_norm": NaN, + "learning_rate": 0.00010919239717737894, + "loss": 0.0, + "step": 38615 + }, + { + "epoch": 3.60324717738173, + "grad_norm": NaN, + "learning_rate": 0.00010918511872866268, + "loss": 0.0, + "step": 38616 + }, + { + "epoch": 3.6033404870766073, + "grad_norm": NaN, + "learning_rate": 0.00010917784038372661, + "loss": 0.0, + "step": 38617 + }, + { + "epoch": 3.6034337967714847, + "grad_norm": NaN, + "learning_rate": 0.00010917056214258916, + "loss": 0.0, + "step": 38618 + }, + { + "epoch": 3.6035271064663617, + "grad_norm": NaN, + "learning_rate": 0.00010916328400526891, + "loss": 0.0, + "step": 38619 + }, + { + "epoch": 3.603620416161239, + "grad_norm": NaN, + "learning_rate": 0.00010915600597178437, + "loss": 0.0, + "step": 38620 + }, + { + "epoch": 3.6037137258561165, + "grad_norm": NaN, + "learning_rate": 0.00010914872804215391, + "loss": 0.0, + "step": 38621 + }, + { + "epoch": 3.603807035550994, + "grad_norm": NaN, + "learning_rate": 0.00010914145021639619, + "loss": 0.0, + "step": 38622 + }, + { + "epoch": 3.603900345245871, + "grad_norm": NaN, + "learning_rate": 0.0001091341724945297, + "loss": 0.0, + "step": 38623 + }, + { + "epoch": 3.6039936549407483, + "grad_norm": NaN, + "learning_rate": 0.00010912689487657281, + "loss": 0.0, + "step": 38624 + }, + { + "epoch": 3.6040869646356257, + "grad_norm": NaN, + "learning_rate": 0.00010911961736254418, + "loss": 0.0, + "step": 38625 + }, + { + "epoch": 3.6041802743305027, + "grad_norm": NaN, + "learning_rate": 0.00010911233995246228, + "loss": 0.0, + "step": 38626 + }, + { + "epoch": 3.60427358402538, + "grad_norm": NaN, + "learning_rate": 0.00010910506264634552, + "loss": 0.0, + "step": 38627 + }, + { + "epoch": 3.6043668937202575, + "grad_norm": NaN, + "learning_rate": 0.0001090977854442125, + "loss": 0.0, + "step": 38628 + }, + { + "epoch": 3.604460203415135, + "grad_norm": NaN, + "learning_rate": 0.00010909050834608173, + "loss": 0.0, + "step": 38629 + }, + { + "epoch": 3.6045535131100124, + "grad_norm": NaN, + "learning_rate": 0.0001090832313519716, + "loss": 0.0, + "step": 38630 + }, + { + "epoch": 3.6046468228048894, + "grad_norm": NaN, + "learning_rate": 0.00010907595446190075, + "loss": 0.0, + "step": 38631 + }, + { + "epoch": 3.6047401324997668, + "grad_norm": NaN, + "learning_rate": 0.00010906867767588765, + "loss": 0.0, + "step": 38632 + }, + { + "epoch": 3.604833442194644, + "grad_norm": NaN, + "learning_rate": 0.00010906140099395071, + "loss": 0.0, + "step": 38633 + }, + { + "epoch": 3.604926751889521, + "grad_norm": NaN, + "learning_rate": 0.00010905412441610851, + "loss": 0.0, + "step": 38634 + }, + { + "epoch": 3.6050200615843986, + "grad_norm": NaN, + "learning_rate": 0.00010904684794237963, + "loss": 0.0, + "step": 38635 + }, + { + "epoch": 3.605113371279276, + "grad_norm": NaN, + "learning_rate": 0.00010903957157278237, + "loss": 0.0, + "step": 38636 + }, + { + "epoch": 3.6052066809741534, + "grad_norm": NaN, + "learning_rate": 0.00010903229530733541, + "loss": 0.0, + "step": 38637 + }, + { + "epoch": 3.6052999906690304, + "grad_norm": NaN, + "learning_rate": 0.00010902501914605718, + "loss": 0.0, + "step": 38638 + }, + { + "epoch": 3.605393300363908, + "grad_norm": NaN, + "learning_rate": 0.00010901774308896615, + "loss": 0.0, + "step": 38639 + }, + { + "epoch": 3.6054866100587852, + "grad_norm": NaN, + "learning_rate": 0.00010901046713608089, + "loss": 0.0, + "step": 38640 + }, + { + "epoch": 3.605579919753662, + "grad_norm": NaN, + "learning_rate": 0.0001090031912874199, + "loss": 0.0, + "step": 38641 + }, + { + "epoch": 3.6056732294485396, + "grad_norm": NaN, + "learning_rate": 0.00010899591554300158, + "loss": 0.0, + "step": 38642 + }, + { + "epoch": 3.605766539143417, + "grad_norm": NaN, + "learning_rate": 0.00010898863990284456, + "loss": 0.0, + "step": 38643 + }, + { + "epoch": 3.6058598488382945, + "grad_norm": NaN, + "learning_rate": 0.00010898136436696728, + "loss": 0.0, + "step": 38644 + }, + { + "epoch": 3.6059531585331714, + "grad_norm": NaN, + "learning_rate": 0.00010897408893538819, + "loss": 0.0, + "step": 38645 + }, + { + "epoch": 3.606046468228049, + "grad_norm": NaN, + "learning_rate": 0.00010896681360812589, + "loss": 0.0, + "step": 38646 + }, + { + "epoch": 3.6061397779229263, + "grad_norm": NaN, + "learning_rate": 0.00010895953838519883, + "loss": 0.0, + "step": 38647 + }, + { + "epoch": 3.6062330876178033, + "grad_norm": NaN, + "learning_rate": 0.00010895226326662546, + "loss": 0.0, + "step": 38648 + }, + { + "epoch": 3.6063263973126807, + "grad_norm": NaN, + "learning_rate": 0.00010894498825242433, + "loss": 0.0, + "step": 38649 + }, + { + "epoch": 3.606419707007558, + "grad_norm": NaN, + "learning_rate": 0.00010893771334261401, + "loss": 0.0, + "step": 38650 + }, + { + "epoch": 3.6065130167024355, + "grad_norm": NaN, + "learning_rate": 0.00010893043853721283, + "loss": 0.0, + "step": 38651 + }, + { + "epoch": 3.606606326397313, + "grad_norm": NaN, + "learning_rate": 0.00010892316383623944, + "loss": 0.0, + "step": 38652 + }, + { + "epoch": 3.60669963609219, + "grad_norm": NaN, + "learning_rate": 0.00010891588923971226, + "loss": 0.0, + "step": 38653 + }, + { + "epoch": 3.6067929457870673, + "grad_norm": NaN, + "learning_rate": 0.00010890861474764975, + "loss": 0.0, + "step": 38654 + }, + { + "epoch": 3.6068862554819443, + "grad_norm": NaN, + "learning_rate": 0.00010890134036007055, + "loss": 0.0, + "step": 38655 + }, + { + "epoch": 3.6069795651768217, + "grad_norm": NaN, + "learning_rate": 0.00010889406607699299, + "loss": 0.0, + "step": 38656 + }, + { + "epoch": 3.607072874871699, + "grad_norm": NaN, + "learning_rate": 0.00010888679189843567, + "loss": 0.0, + "step": 38657 + }, + { + "epoch": 3.6071661845665766, + "grad_norm": NaN, + "learning_rate": 0.00010887951782441709, + "loss": 0.0, + "step": 38658 + }, + { + "epoch": 3.607259494261454, + "grad_norm": NaN, + "learning_rate": 0.00010887224385495566, + "loss": 0.0, + "step": 38659 + }, + { + "epoch": 3.607352803956331, + "grad_norm": NaN, + "learning_rate": 0.00010886496999006996, + "loss": 0.0, + "step": 38660 + }, + { + "epoch": 3.6074461136512084, + "grad_norm": NaN, + "learning_rate": 0.0001088576962297785, + "loss": 0.0, + "step": 38661 + }, + { + "epoch": 3.607539423346086, + "grad_norm": NaN, + "learning_rate": 0.00010885042257409966, + "loss": 0.0, + "step": 38662 + }, + { + "epoch": 3.6076327330409628, + "grad_norm": NaN, + "learning_rate": 0.00010884314902305205, + "loss": 0.0, + "step": 38663 + }, + { + "epoch": 3.60772604273584, + "grad_norm": NaN, + "learning_rate": 0.00010883587557665412, + "loss": 0.0, + "step": 38664 + }, + { + "epoch": 3.6078193524307176, + "grad_norm": NaN, + "learning_rate": 0.0001088286022349243, + "loss": 0.0, + "step": 38665 + }, + { + "epoch": 3.607912662125595, + "grad_norm": NaN, + "learning_rate": 0.0001088213289978812, + "loss": 0.0, + "step": 38666 + }, + { + "epoch": 3.608005971820472, + "grad_norm": NaN, + "learning_rate": 0.00010881405586554332, + "loss": 0.0, + "step": 38667 + }, + { + "epoch": 3.6080992815153494, + "grad_norm": NaN, + "learning_rate": 0.00010880678283792899, + "loss": 0.0, + "step": 38668 + }, + { + "epoch": 3.608192591210227, + "grad_norm": NaN, + "learning_rate": 0.00010879950991505687, + "loss": 0.0, + "step": 38669 + }, + { + "epoch": 3.608285900905104, + "grad_norm": NaN, + "learning_rate": 0.0001087922370969454, + "loss": 0.0, + "step": 38670 + }, + { + "epoch": 3.6083792105999812, + "grad_norm": NaN, + "learning_rate": 0.00010878496438361301, + "loss": 0.0, + "step": 38671 + }, + { + "epoch": 3.6084725202948587, + "grad_norm": NaN, + "learning_rate": 0.00010877769177507828, + "loss": 0.0, + "step": 38672 + }, + { + "epoch": 3.608565829989736, + "grad_norm": NaN, + "learning_rate": 0.0001087704192713597, + "loss": 0.0, + "step": 38673 + }, + { + "epoch": 3.6086591396846135, + "grad_norm": NaN, + "learning_rate": 0.00010876314687247568, + "loss": 0.0, + "step": 38674 + }, + { + "epoch": 3.6087524493794905, + "grad_norm": NaN, + "learning_rate": 0.00010875587457844477, + "loss": 0.0, + "step": 38675 + }, + { + "epoch": 3.608845759074368, + "grad_norm": NaN, + "learning_rate": 0.00010874860238928549, + "loss": 0.0, + "step": 38676 + }, + { + "epoch": 3.608939068769245, + "grad_norm": NaN, + "learning_rate": 0.00010874133030501626, + "loss": 0.0, + "step": 38677 + }, + { + "epoch": 3.6090323784641223, + "grad_norm": NaN, + "learning_rate": 0.0001087340583256556, + "loss": 0.0, + "step": 38678 + }, + { + "epoch": 3.6091256881589997, + "grad_norm": NaN, + "learning_rate": 0.00010872678645122206, + "loss": 0.0, + "step": 38679 + }, + { + "epoch": 3.609218997853877, + "grad_norm": NaN, + "learning_rate": 0.000108719514681734, + "loss": 0.0, + "step": 38680 + }, + { + "epoch": 3.6093123075487545, + "grad_norm": NaN, + "learning_rate": 0.00010871224301721, + "loss": 0.0, + "step": 38681 + }, + { + "epoch": 3.6094056172436315, + "grad_norm": NaN, + "learning_rate": 0.00010870497145766862, + "loss": 0.0, + "step": 38682 + }, + { + "epoch": 3.609498926938509, + "grad_norm": NaN, + "learning_rate": 0.00010869770000312815, + "loss": 0.0, + "step": 38683 + }, + { + "epoch": 3.6095922366333864, + "grad_norm": NaN, + "learning_rate": 0.00010869042865360726, + "loss": 0.0, + "step": 38684 + }, + { + "epoch": 3.6096855463282633, + "grad_norm": NaN, + "learning_rate": 0.0001086831574091244, + "loss": 0.0, + "step": 38685 + }, + { + "epoch": 3.6097788560231407, + "grad_norm": NaN, + "learning_rate": 0.00010867588626969794, + "loss": 0.0, + "step": 38686 + }, + { + "epoch": 3.609872165718018, + "grad_norm": NaN, + "learning_rate": 0.00010866861523534651, + "loss": 0.0, + "step": 38687 + }, + { + "epoch": 3.6099654754128956, + "grad_norm": NaN, + "learning_rate": 0.0001086613443060886, + "loss": 0.0, + "step": 38688 + }, + { + "epoch": 3.6100587851077726, + "grad_norm": NaN, + "learning_rate": 0.00010865407348194256, + "loss": 0.0, + "step": 38689 + }, + { + "epoch": 3.61015209480265, + "grad_norm": NaN, + "learning_rate": 0.00010864680276292695, + "loss": 0.0, + "step": 38690 + }, + { + "epoch": 3.6102454044975274, + "grad_norm": NaN, + "learning_rate": 0.00010863953214906036, + "loss": 0.0, + "step": 38691 + }, + { + "epoch": 3.6103387141924044, + "grad_norm": NaN, + "learning_rate": 0.00010863226164036115, + "loss": 0.0, + "step": 38692 + }, + { + "epoch": 3.610432023887282, + "grad_norm": NaN, + "learning_rate": 0.00010862499123684782, + "loss": 0.0, + "step": 38693 + }, + { + "epoch": 3.610525333582159, + "grad_norm": NaN, + "learning_rate": 0.00010861772093853894, + "loss": 0.0, + "step": 38694 + }, + { + "epoch": 3.6106186432770366, + "grad_norm": NaN, + "learning_rate": 0.0001086104507454529, + "loss": 0.0, + "step": 38695 + }, + { + "epoch": 3.610711952971914, + "grad_norm": NaN, + "learning_rate": 0.00010860318065760825, + "loss": 0.0, + "step": 38696 + }, + { + "epoch": 3.610805262666791, + "grad_norm": NaN, + "learning_rate": 0.0001085959106750234, + "loss": 0.0, + "step": 38697 + }, + { + "epoch": 3.6108985723616684, + "grad_norm": NaN, + "learning_rate": 0.0001085886407977169, + "loss": 0.0, + "step": 38698 + }, + { + "epoch": 3.6109918820565454, + "grad_norm": NaN, + "learning_rate": 0.00010858137102570725, + "loss": 0.0, + "step": 38699 + }, + { + "epoch": 3.611085191751423, + "grad_norm": NaN, + "learning_rate": 0.00010857410135901287, + "loss": 0.0, + "step": 38700 + }, + { + "epoch": 3.6111785014463003, + "grad_norm": NaN, + "learning_rate": 0.00010856683179765229, + "loss": 0.0, + "step": 38701 + }, + { + "epoch": 3.6112718111411777, + "grad_norm": NaN, + "learning_rate": 0.000108559562341644, + "loss": 0.0, + "step": 38702 + }, + { + "epoch": 3.611365120836055, + "grad_norm": NaN, + "learning_rate": 0.00010855229299100643, + "loss": 0.0, + "step": 38703 + }, + { + "epoch": 3.611458430530932, + "grad_norm": NaN, + "learning_rate": 0.00010854502374575813, + "loss": 0.0, + "step": 38704 + }, + { + "epoch": 3.6115517402258095, + "grad_norm": NaN, + "learning_rate": 0.00010853775460591758, + "loss": 0.0, + "step": 38705 + }, + { + "epoch": 3.611645049920687, + "grad_norm": NaN, + "learning_rate": 0.00010853048557150316, + "loss": 0.0, + "step": 38706 + }, + { + "epoch": 3.611738359615564, + "grad_norm": NaN, + "learning_rate": 0.0001085232166425335, + "loss": 0.0, + "step": 38707 + }, + { + "epoch": 3.6118316693104413, + "grad_norm": NaN, + "learning_rate": 0.00010851594781902701, + "loss": 0.0, + "step": 38708 + }, + { + "epoch": 3.6119249790053187, + "grad_norm": NaN, + "learning_rate": 0.00010850867910100212, + "loss": 0.0, + "step": 38709 + }, + { + "epoch": 3.612018288700196, + "grad_norm": NaN, + "learning_rate": 0.00010850141048847741, + "loss": 0.0, + "step": 38710 + }, + { + "epoch": 3.612111598395073, + "grad_norm": NaN, + "learning_rate": 0.00010849414198147133, + "loss": 0.0, + "step": 38711 + }, + { + "epoch": 3.6122049080899505, + "grad_norm": NaN, + "learning_rate": 0.00010848687358000231, + "loss": 0.0, + "step": 38712 + }, + { + "epoch": 3.612298217784828, + "grad_norm": NaN, + "learning_rate": 0.00010847960528408891, + "loss": 0.0, + "step": 38713 + }, + { + "epoch": 3.612391527479705, + "grad_norm": NaN, + "learning_rate": 0.00010847233709374959, + "loss": 0.0, + "step": 38714 + }, + { + "epoch": 3.6124848371745824, + "grad_norm": NaN, + "learning_rate": 0.00010846506900900278, + "loss": 0.0, + "step": 38715 + }, + { + "epoch": 3.6125781468694598, + "grad_norm": NaN, + "learning_rate": 0.00010845780102986698, + "loss": 0.0, + "step": 38716 + }, + { + "epoch": 3.612671456564337, + "grad_norm": NaN, + "learning_rate": 0.00010845053315636077, + "loss": 0.0, + "step": 38717 + }, + { + "epoch": 3.612764766259214, + "grad_norm": NaN, + "learning_rate": 0.00010844326538850245, + "loss": 0.0, + "step": 38718 + }, + { + "epoch": 3.6128580759540916, + "grad_norm": NaN, + "learning_rate": 0.00010843599772631064, + "loss": 0.0, + "step": 38719 + }, + { + "epoch": 3.612951385648969, + "grad_norm": NaN, + "learning_rate": 0.0001084287301698038, + "loss": 0.0, + "step": 38720 + }, + { + "epoch": 3.613044695343846, + "grad_norm": NaN, + "learning_rate": 0.00010842146271900033, + "loss": 0.0, + "step": 38721 + }, + { + "epoch": 3.6131380050387234, + "grad_norm": NaN, + "learning_rate": 0.00010841419537391878, + "loss": 0.0, + "step": 38722 + }, + { + "epoch": 3.613231314733601, + "grad_norm": NaN, + "learning_rate": 0.00010840692813457766, + "loss": 0.0, + "step": 38723 + }, + { + "epoch": 3.6133246244284782, + "grad_norm": NaN, + "learning_rate": 0.00010839966100099533, + "loss": 0.0, + "step": 38724 + }, + { + "epoch": 3.6134179341233557, + "grad_norm": NaN, + "learning_rate": 0.00010839239397319032, + "loss": 0.0, + "step": 38725 + }, + { + "epoch": 3.6135112438182326, + "grad_norm": NaN, + "learning_rate": 0.00010838512705118125, + "loss": 0.0, + "step": 38726 + }, + { + "epoch": 3.61360455351311, + "grad_norm": NaN, + "learning_rate": 0.00010837786023498639, + "loss": 0.0, + "step": 38727 + }, + { + "epoch": 3.6136978632079875, + "grad_norm": NaN, + "learning_rate": 0.00010837059352462427, + "loss": 0.0, + "step": 38728 + }, + { + "epoch": 3.6137911729028644, + "grad_norm": NaN, + "learning_rate": 0.00010836332692011348, + "loss": 0.0, + "step": 38729 + }, + { + "epoch": 3.613884482597742, + "grad_norm": NaN, + "learning_rate": 0.00010835606042147238, + "loss": 0.0, + "step": 38730 + }, + { + "epoch": 3.6139777922926193, + "grad_norm": NaN, + "learning_rate": 0.00010834879402871946, + "loss": 0.0, + "step": 38731 + }, + { + "epoch": 3.6140711019874967, + "grad_norm": NaN, + "learning_rate": 0.00010834152774187327, + "loss": 0.0, + "step": 38732 + }, + { + "epoch": 3.6141644116823737, + "grad_norm": NaN, + "learning_rate": 0.0001083342615609522, + "loss": 0.0, + "step": 38733 + }, + { + "epoch": 3.614257721377251, + "grad_norm": NaN, + "learning_rate": 0.00010832699548597475, + "loss": 0.0, + "step": 38734 + }, + { + "epoch": 3.6143510310721285, + "grad_norm": NaN, + "learning_rate": 0.00010831972951695945, + "loss": 0.0, + "step": 38735 + }, + { + "epoch": 3.6144443407670055, + "grad_norm": NaN, + "learning_rate": 0.0001083124636539247, + "loss": 0.0, + "step": 38736 + }, + { + "epoch": 3.614537650461883, + "grad_norm": NaN, + "learning_rate": 0.000108305197896889, + "loss": 0.0, + "step": 38737 + }, + { + "epoch": 3.6146309601567603, + "grad_norm": NaN, + "learning_rate": 0.00010829793224587087, + "loss": 0.0, + "step": 38738 + }, + { + "epoch": 3.6147242698516378, + "grad_norm": NaN, + "learning_rate": 0.00010829066670088872, + "loss": 0.0, + "step": 38739 + }, + { + "epoch": 3.6148175795465147, + "grad_norm": NaN, + "learning_rate": 0.00010828340126196107, + "loss": 0.0, + "step": 38740 + }, + { + "epoch": 3.614910889241392, + "grad_norm": NaN, + "learning_rate": 0.00010827613592910631, + "loss": 0.0, + "step": 38741 + }, + { + "epoch": 3.6150041989362696, + "grad_norm": NaN, + "learning_rate": 0.00010826887070234302, + "loss": 0.0, + "step": 38742 + }, + { + "epoch": 3.6150975086311465, + "grad_norm": NaN, + "learning_rate": 0.00010826160558168967, + "loss": 0.0, + "step": 38743 + }, + { + "epoch": 3.615190818326024, + "grad_norm": NaN, + "learning_rate": 0.00010825434056716461, + "loss": 0.0, + "step": 38744 + }, + { + "epoch": 3.6152841280209014, + "grad_norm": NaN, + "learning_rate": 0.00010824707565878644, + "loss": 0.0, + "step": 38745 + }, + { + "epoch": 3.615377437715779, + "grad_norm": NaN, + "learning_rate": 0.00010823981085657361, + "loss": 0.0, + "step": 38746 + }, + { + "epoch": 3.615470747410656, + "grad_norm": NaN, + "learning_rate": 0.00010823254616054451, + "loss": 0.0, + "step": 38747 + }, + { + "epoch": 3.615564057105533, + "grad_norm": NaN, + "learning_rate": 0.0001082252815707177, + "loss": 0.0, + "step": 38748 + }, + { + "epoch": 3.6156573668004106, + "grad_norm": NaN, + "learning_rate": 0.00010821801708711167, + "loss": 0.0, + "step": 38749 + }, + { + "epoch": 3.6157506764952876, + "grad_norm": NaN, + "learning_rate": 0.00010821075270974478, + "loss": 0.0, + "step": 38750 + }, + { + "epoch": 3.615843986190165, + "grad_norm": NaN, + "learning_rate": 0.00010820348843863559, + "loss": 0.0, + "step": 38751 + }, + { + "epoch": 3.6159372958850424, + "grad_norm": NaN, + "learning_rate": 0.0001081962242738026, + "loss": 0.0, + "step": 38752 + }, + { + "epoch": 3.61603060557992, + "grad_norm": NaN, + "learning_rate": 0.00010818896021526415, + "loss": 0.0, + "step": 38753 + }, + { + "epoch": 3.6161239152747973, + "grad_norm": NaN, + "learning_rate": 0.00010818169626303881, + "loss": 0.0, + "step": 38754 + }, + { + "epoch": 3.6162172249696742, + "grad_norm": NaN, + "learning_rate": 0.00010817443241714508, + "loss": 0.0, + "step": 38755 + }, + { + "epoch": 3.6163105346645517, + "grad_norm": NaN, + "learning_rate": 0.0001081671686776013, + "loss": 0.0, + "step": 38756 + }, + { + "epoch": 3.616403844359429, + "grad_norm": NaN, + "learning_rate": 0.00010815990504442606, + "loss": 0.0, + "step": 38757 + }, + { + "epoch": 3.616497154054306, + "grad_norm": NaN, + "learning_rate": 0.00010815264151763782, + "loss": 0.0, + "step": 38758 + }, + { + "epoch": 3.6165904637491835, + "grad_norm": NaN, + "learning_rate": 0.00010814537809725494, + "loss": 0.0, + "step": 38759 + }, + { + "epoch": 3.616683773444061, + "grad_norm": NaN, + "learning_rate": 0.00010813811478329601, + "loss": 0.0, + "step": 38760 + }, + { + "epoch": 3.6167770831389383, + "grad_norm": NaN, + "learning_rate": 0.00010813085157577948, + "loss": 0.0, + "step": 38761 + }, + { + "epoch": 3.6168703928338153, + "grad_norm": NaN, + "learning_rate": 0.00010812358847472372, + "loss": 0.0, + "step": 38762 + }, + { + "epoch": 3.6169637025286927, + "grad_norm": NaN, + "learning_rate": 0.00010811632548014728, + "loss": 0.0, + "step": 38763 + }, + { + "epoch": 3.61705701222357, + "grad_norm": NaN, + "learning_rate": 0.00010810906259206873, + "loss": 0.0, + "step": 38764 + }, + { + "epoch": 3.617150321918447, + "grad_norm": NaN, + "learning_rate": 0.0001081017998105063, + "loss": 0.0, + "step": 38765 + }, + { + "epoch": 3.6172436316133245, + "grad_norm": NaN, + "learning_rate": 0.0001080945371354786, + "loss": 0.0, + "step": 38766 + }, + { + "epoch": 3.617336941308202, + "grad_norm": NaN, + "learning_rate": 0.00010808727456700413, + "loss": 0.0, + "step": 38767 + }, + { + "epoch": 3.6174302510030794, + "grad_norm": NaN, + "learning_rate": 0.00010808001210510126, + "loss": 0.0, + "step": 38768 + }, + { + "epoch": 3.6175235606979568, + "grad_norm": NaN, + "learning_rate": 0.00010807274974978852, + "loss": 0.0, + "step": 38769 + }, + { + "epoch": 3.6176168703928338, + "grad_norm": NaN, + "learning_rate": 0.00010806548750108438, + "loss": 0.0, + "step": 38770 + }, + { + "epoch": 3.617710180087711, + "grad_norm": NaN, + "learning_rate": 0.00010805822535900725, + "loss": 0.0, + "step": 38771 + }, + { + "epoch": 3.617803489782588, + "grad_norm": NaN, + "learning_rate": 0.00010805096332357562, + "loss": 0.0, + "step": 38772 + }, + { + "epoch": 3.6178967994774656, + "grad_norm": NaN, + "learning_rate": 0.000108043701394808, + "loss": 0.0, + "step": 38773 + }, + { + "epoch": 3.617990109172343, + "grad_norm": NaN, + "learning_rate": 0.0001080364395727228, + "loss": 0.0, + "step": 38774 + }, + { + "epoch": 3.6180834188672204, + "grad_norm": NaN, + "learning_rate": 0.0001080291778573385, + "loss": 0.0, + "step": 38775 + }, + { + "epoch": 3.618176728562098, + "grad_norm": NaN, + "learning_rate": 0.00010802191624867359, + "loss": 0.0, + "step": 38776 + }, + { + "epoch": 3.618270038256975, + "grad_norm": NaN, + "learning_rate": 0.0001080146547467465, + "loss": 0.0, + "step": 38777 + }, + { + "epoch": 3.618363347951852, + "grad_norm": NaN, + "learning_rate": 0.00010800739335157567, + "loss": 0.0, + "step": 38778 + }, + { + "epoch": 3.6184566576467296, + "grad_norm": NaN, + "learning_rate": 0.00010800013206317968, + "loss": 0.0, + "step": 38779 + }, + { + "epoch": 3.6185499673416066, + "grad_norm": NaN, + "learning_rate": 0.00010799287088157686, + "loss": 0.0, + "step": 38780 + }, + { + "epoch": 3.618643277036484, + "grad_norm": NaN, + "learning_rate": 0.00010798560980678569, + "loss": 0.0, + "step": 38781 + }, + { + "epoch": 3.6187365867313614, + "grad_norm": NaN, + "learning_rate": 0.00010797834883882476, + "loss": 0.0, + "step": 38782 + }, + { + "epoch": 3.618829896426239, + "grad_norm": NaN, + "learning_rate": 0.00010797108797771238, + "loss": 0.0, + "step": 38783 + }, + { + "epoch": 3.618923206121116, + "grad_norm": NaN, + "learning_rate": 0.00010796382722346705, + "loss": 0.0, + "step": 38784 + }, + { + "epoch": 3.6190165158159933, + "grad_norm": NaN, + "learning_rate": 0.00010795656657610733, + "loss": 0.0, + "step": 38785 + }, + { + "epoch": 3.6191098255108707, + "grad_norm": NaN, + "learning_rate": 0.00010794930603565158, + "loss": 0.0, + "step": 38786 + }, + { + "epoch": 3.6192031352057477, + "grad_norm": NaN, + "learning_rate": 0.00010794204560211829, + "loss": 0.0, + "step": 38787 + }, + { + "epoch": 3.619296444900625, + "grad_norm": NaN, + "learning_rate": 0.00010793478527552588, + "loss": 0.0, + "step": 38788 + }, + { + "epoch": 3.6193897545955025, + "grad_norm": NaN, + "learning_rate": 0.00010792752505589287, + "loss": 0.0, + "step": 38789 + }, + { + "epoch": 3.61948306429038, + "grad_norm": NaN, + "learning_rate": 0.00010792026494323774, + "loss": 0.0, + "step": 38790 + }, + { + "epoch": 3.6195763739852573, + "grad_norm": NaN, + "learning_rate": 0.00010791300493757885, + "loss": 0.0, + "step": 38791 + }, + { + "epoch": 3.6196696836801343, + "grad_norm": NaN, + "learning_rate": 0.00010790574503893474, + "loss": 0.0, + "step": 38792 + }, + { + "epoch": 3.6197629933750117, + "grad_norm": NaN, + "learning_rate": 0.0001078984852473239, + "loss": 0.0, + "step": 38793 + }, + { + "epoch": 3.6198563030698887, + "grad_norm": NaN, + "learning_rate": 0.00010789122556276467, + "loss": 0.0, + "step": 38794 + }, + { + "epoch": 3.619949612764766, + "grad_norm": NaN, + "learning_rate": 0.0001078839659852756, + "loss": 0.0, + "step": 38795 + }, + { + "epoch": 3.6200429224596435, + "grad_norm": NaN, + "learning_rate": 0.00010787670651487518, + "loss": 0.0, + "step": 38796 + }, + { + "epoch": 3.620136232154521, + "grad_norm": NaN, + "learning_rate": 0.00010786944715158176, + "loss": 0.0, + "step": 38797 + }, + { + "epoch": 3.6202295418493984, + "grad_norm": NaN, + "learning_rate": 0.00010786218789541386, + "loss": 0.0, + "step": 38798 + }, + { + "epoch": 3.6203228515442754, + "grad_norm": NaN, + "learning_rate": 0.00010785492874638999, + "loss": 0.0, + "step": 38799 + }, + { + "epoch": 3.6204161612391528, + "grad_norm": NaN, + "learning_rate": 0.00010784766970452847, + "loss": 0.0, + "step": 38800 + }, + { + "epoch": 3.62050947093403, + "grad_norm": NaN, + "learning_rate": 0.00010784041076984783, + "loss": 0.0, + "step": 38801 + }, + { + "epoch": 3.620602780628907, + "grad_norm": NaN, + "learning_rate": 0.00010783315194236666, + "loss": 0.0, + "step": 38802 + }, + { + "epoch": 3.6206960903237846, + "grad_norm": NaN, + "learning_rate": 0.0001078258932221032, + "loss": 0.0, + "step": 38803 + }, + { + "epoch": 3.620789400018662, + "grad_norm": NaN, + "learning_rate": 0.00010781863460907599, + "loss": 0.0, + "step": 38804 + }, + { + "epoch": 3.6208827097135394, + "grad_norm": NaN, + "learning_rate": 0.00010781137610330355, + "loss": 0.0, + "step": 38805 + }, + { + "epoch": 3.6209760194084164, + "grad_norm": NaN, + "learning_rate": 0.00010780411770480425, + "loss": 0.0, + "step": 38806 + }, + { + "epoch": 3.621069329103294, + "grad_norm": NaN, + "learning_rate": 0.00010779685941359657, + "loss": 0.0, + "step": 38807 + }, + { + "epoch": 3.6211626387981712, + "grad_norm": NaN, + "learning_rate": 0.00010778960122969902, + "loss": 0.0, + "step": 38808 + }, + { + "epoch": 3.621255948493048, + "grad_norm": NaN, + "learning_rate": 0.00010778234315312998, + "loss": 0.0, + "step": 38809 + }, + { + "epoch": 3.6213492581879256, + "grad_norm": NaN, + "learning_rate": 0.0001077750851839079, + "loss": 0.0, + "step": 38810 + }, + { + "epoch": 3.621442567882803, + "grad_norm": NaN, + "learning_rate": 0.00010776782732205136, + "loss": 0.0, + "step": 38811 + }, + { + "epoch": 3.6215358775776805, + "grad_norm": NaN, + "learning_rate": 0.00010776056956757869, + "loss": 0.0, + "step": 38812 + }, + { + "epoch": 3.621629187272558, + "grad_norm": NaN, + "learning_rate": 0.00010775331192050833, + "loss": 0.0, + "step": 38813 + }, + { + "epoch": 3.621722496967435, + "grad_norm": NaN, + "learning_rate": 0.00010774605438085883, + "loss": 0.0, + "step": 38814 + }, + { + "epoch": 3.6218158066623123, + "grad_norm": NaN, + "learning_rate": 0.0001077387969486486, + "loss": 0.0, + "step": 38815 + }, + { + "epoch": 3.6219091163571893, + "grad_norm": NaN, + "learning_rate": 0.00010773153962389604, + "loss": 0.0, + "step": 38816 + }, + { + "epoch": 3.6220024260520667, + "grad_norm": NaN, + "learning_rate": 0.00010772428240661974, + "loss": 0.0, + "step": 38817 + }, + { + "epoch": 3.622095735746944, + "grad_norm": NaN, + "learning_rate": 0.000107717025296838, + "loss": 0.0, + "step": 38818 + }, + { + "epoch": 3.6221890454418215, + "grad_norm": NaN, + "learning_rate": 0.00010770976829456933, + "loss": 0.0, + "step": 38819 + }, + { + "epoch": 3.622282355136699, + "grad_norm": NaN, + "learning_rate": 0.00010770251139983225, + "loss": 0.0, + "step": 38820 + }, + { + "epoch": 3.622375664831576, + "grad_norm": NaN, + "learning_rate": 0.00010769525461264513, + "loss": 0.0, + "step": 38821 + }, + { + "epoch": 3.6224689745264533, + "grad_norm": NaN, + "learning_rate": 0.0001076879979330264, + "loss": 0.0, + "step": 38822 + }, + { + "epoch": 3.6225622842213308, + "grad_norm": NaN, + "learning_rate": 0.00010768074136099462, + "loss": 0.0, + "step": 38823 + }, + { + "epoch": 3.6226555939162077, + "grad_norm": NaN, + "learning_rate": 0.00010767348489656818, + "loss": 0.0, + "step": 38824 + }, + { + "epoch": 3.622748903611085, + "grad_norm": NaN, + "learning_rate": 0.00010766622853976546, + "loss": 0.0, + "step": 38825 + }, + { + "epoch": 3.6228422133059626, + "grad_norm": NaN, + "learning_rate": 0.00010765897229060506, + "loss": 0.0, + "step": 38826 + }, + { + "epoch": 3.62293552300084, + "grad_norm": NaN, + "learning_rate": 0.00010765171614910531, + "loss": 0.0, + "step": 38827 + }, + { + "epoch": 3.623028832695717, + "grad_norm": NaN, + "learning_rate": 0.00010764446011528467, + "loss": 0.0, + "step": 38828 + }, + { + "epoch": 3.6231221423905944, + "grad_norm": NaN, + "learning_rate": 0.0001076372041891617, + "loss": 0.0, + "step": 38829 + }, + { + "epoch": 3.623215452085472, + "grad_norm": NaN, + "learning_rate": 0.00010762994837075473, + "loss": 0.0, + "step": 38830 + }, + { + "epoch": 3.6233087617803488, + "grad_norm": NaN, + "learning_rate": 0.00010762269266008228, + "loss": 0.0, + "step": 38831 + }, + { + "epoch": 3.623402071475226, + "grad_norm": NaN, + "learning_rate": 0.00010761543705716272, + "loss": 0.0, + "step": 38832 + }, + { + "epoch": 3.6234953811701036, + "grad_norm": NaN, + "learning_rate": 0.00010760818156201456, + "loss": 0.0, + "step": 38833 + }, + { + "epoch": 3.623588690864981, + "grad_norm": NaN, + "learning_rate": 0.00010760092617465627, + "loss": 0.0, + "step": 38834 + }, + { + "epoch": 3.623682000559858, + "grad_norm": NaN, + "learning_rate": 0.00010759367089510622, + "loss": 0.0, + "step": 38835 + }, + { + "epoch": 3.6237753102547354, + "grad_norm": NaN, + "learning_rate": 0.00010758641572338292, + "loss": 0.0, + "step": 38836 + }, + { + "epoch": 3.623868619949613, + "grad_norm": NaN, + "learning_rate": 0.00010757916065950483, + "loss": 0.0, + "step": 38837 + }, + { + "epoch": 3.62396192964449, + "grad_norm": NaN, + "learning_rate": 0.00010757190570349032, + "loss": 0.0, + "step": 38838 + }, + { + "epoch": 3.6240552393393672, + "grad_norm": NaN, + "learning_rate": 0.00010756465085535787, + "loss": 0.0, + "step": 38839 + }, + { + "epoch": 3.6241485490342447, + "grad_norm": NaN, + "learning_rate": 0.00010755739611512605, + "loss": 0.0, + "step": 38840 + }, + { + "epoch": 3.624241858729122, + "grad_norm": NaN, + "learning_rate": 0.00010755014148281311, + "loss": 0.0, + "step": 38841 + }, + { + "epoch": 3.6243351684239995, + "grad_norm": NaN, + "learning_rate": 0.00010754288695843756, + "loss": 0.0, + "step": 38842 + }, + { + "epoch": 3.6244284781188765, + "grad_norm": NaN, + "learning_rate": 0.00010753563254201797, + "loss": 0.0, + "step": 38843 + }, + { + "epoch": 3.624521787813754, + "grad_norm": NaN, + "learning_rate": 0.00010752837823357263, + "loss": 0.0, + "step": 38844 + }, + { + "epoch": 3.6246150975086313, + "grad_norm": NaN, + "learning_rate": 0.00010752112403312003, + "loss": 0.0, + "step": 38845 + }, + { + "epoch": 3.6247084072035083, + "grad_norm": NaN, + "learning_rate": 0.00010751386994067866, + "loss": 0.0, + "step": 38846 + }, + { + "epoch": 3.6248017168983857, + "grad_norm": NaN, + "learning_rate": 0.00010750661595626692, + "loss": 0.0, + "step": 38847 + }, + { + "epoch": 3.624895026593263, + "grad_norm": NaN, + "learning_rate": 0.00010749936207990323, + "loss": 0.0, + "step": 38848 + }, + { + "epoch": 3.6249883362881405, + "grad_norm": NaN, + "learning_rate": 0.00010749210831160615, + "loss": 0.0, + "step": 38849 + }, + { + "epoch": 3.6250816459830175, + "grad_norm": NaN, + "learning_rate": 0.00010748485465139401, + "loss": 0.0, + "step": 38850 + }, + { + "epoch": 3.625174955677895, + "grad_norm": NaN, + "learning_rate": 0.00010747760109928527, + "loss": 0.0, + "step": 38851 + }, + { + "epoch": 3.6252682653727724, + "grad_norm": NaN, + "learning_rate": 0.00010747034765529843, + "loss": 0.0, + "step": 38852 + }, + { + "epoch": 3.6253615750676493, + "grad_norm": NaN, + "learning_rate": 0.00010746309431945188, + "loss": 0.0, + "step": 38853 + }, + { + "epoch": 3.6254548847625268, + "grad_norm": NaN, + "learning_rate": 0.00010745584109176405, + "loss": 0.0, + "step": 38854 + }, + { + "epoch": 3.625548194457404, + "grad_norm": NaN, + "learning_rate": 0.00010744858797225349, + "loss": 0.0, + "step": 38855 + }, + { + "epoch": 3.6256415041522816, + "grad_norm": NaN, + "learning_rate": 0.0001074413349609385, + "loss": 0.0, + "step": 38856 + }, + { + "epoch": 3.6257348138471586, + "grad_norm": NaN, + "learning_rate": 0.0001074340820578376, + "loss": 0.0, + "step": 38857 + }, + { + "epoch": 3.625828123542036, + "grad_norm": NaN, + "learning_rate": 0.00010742682926296924, + "loss": 0.0, + "step": 38858 + }, + { + "epoch": 3.6259214332369134, + "grad_norm": NaN, + "learning_rate": 0.00010741957657635184, + "loss": 0.0, + "step": 38859 + }, + { + "epoch": 3.6260147429317904, + "grad_norm": NaN, + "learning_rate": 0.00010741232399800382, + "loss": 0.0, + "step": 38860 + }, + { + "epoch": 3.626108052626668, + "grad_norm": NaN, + "learning_rate": 0.00010740507152794369, + "loss": 0.0, + "step": 38861 + }, + { + "epoch": 3.626201362321545, + "grad_norm": NaN, + "learning_rate": 0.00010739781916618981, + "loss": 0.0, + "step": 38862 + }, + { + "epoch": 3.6262946720164226, + "grad_norm": NaN, + "learning_rate": 0.00010739056691276062, + "loss": 0.0, + "step": 38863 + }, + { + "epoch": 3.6263879817113, + "grad_norm": NaN, + "learning_rate": 0.00010738331476767468, + "loss": 0.0, + "step": 38864 + }, + { + "epoch": 3.626481291406177, + "grad_norm": NaN, + "learning_rate": 0.00010737606273095031, + "loss": 0.0, + "step": 38865 + }, + { + "epoch": 3.6265746011010545, + "grad_norm": NaN, + "learning_rate": 0.00010736881080260594, + "loss": 0.0, + "step": 38866 + }, + { + "epoch": 3.6266679107959314, + "grad_norm": NaN, + "learning_rate": 0.00010736155898266014, + "loss": 0.0, + "step": 38867 + }, + { + "epoch": 3.626761220490809, + "grad_norm": NaN, + "learning_rate": 0.00010735430727113122, + "loss": 0.0, + "step": 38868 + }, + { + "epoch": 3.6268545301856863, + "grad_norm": NaN, + "learning_rate": 0.00010734705566803763, + "loss": 0.0, + "step": 38869 + }, + { + "epoch": 3.6269478398805637, + "grad_norm": NaN, + "learning_rate": 0.00010733980417339793, + "loss": 0.0, + "step": 38870 + }, + { + "epoch": 3.627041149575441, + "grad_norm": NaN, + "learning_rate": 0.00010733255278723041, + "loss": 0.0, + "step": 38871 + }, + { + "epoch": 3.627134459270318, + "grad_norm": NaN, + "learning_rate": 0.00010732530150955355, + "loss": 0.0, + "step": 38872 + }, + { + "epoch": 3.6272277689651955, + "grad_norm": NaN, + "learning_rate": 0.00010731805034038588, + "loss": 0.0, + "step": 38873 + }, + { + "epoch": 3.627321078660073, + "grad_norm": NaN, + "learning_rate": 0.00010731079927974572, + "loss": 0.0, + "step": 38874 + }, + { + "epoch": 3.62741438835495, + "grad_norm": NaN, + "learning_rate": 0.00010730354832765157, + "loss": 0.0, + "step": 38875 + }, + { + "epoch": 3.6275076980498273, + "grad_norm": NaN, + "learning_rate": 0.0001072962974841218, + "loss": 0.0, + "step": 38876 + }, + { + "epoch": 3.6276010077447047, + "grad_norm": NaN, + "learning_rate": 0.0001072890467491749, + "loss": 0.0, + "step": 38877 + }, + { + "epoch": 3.627694317439582, + "grad_norm": NaN, + "learning_rate": 0.0001072817961228294, + "loss": 0.0, + "step": 38878 + }, + { + "epoch": 3.627787627134459, + "grad_norm": NaN, + "learning_rate": 0.00010727454560510353, + "loss": 0.0, + "step": 38879 + }, + { + "epoch": 3.6278809368293365, + "grad_norm": NaN, + "learning_rate": 0.00010726729519601584, + "loss": 0.0, + "step": 38880 + }, + { + "epoch": 3.627974246524214, + "grad_norm": NaN, + "learning_rate": 0.00010726004489558482, + "loss": 0.0, + "step": 38881 + }, + { + "epoch": 3.628067556219091, + "grad_norm": NaN, + "learning_rate": 0.00010725279470382882, + "loss": 0.0, + "step": 38882 + }, + { + "epoch": 3.6281608659139684, + "grad_norm": NaN, + "learning_rate": 0.00010724554462076627, + "loss": 0.0, + "step": 38883 + }, + { + "epoch": 3.6282541756088458, + "grad_norm": NaN, + "learning_rate": 0.00010723829464641568, + "loss": 0.0, + "step": 38884 + }, + { + "epoch": 3.628347485303723, + "grad_norm": NaN, + "learning_rate": 0.00010723104478079545, + "loss": 0.0, + "step": 38885 + }, + { + "epoch": 3.6284407949986006, + "grad_norm": NaN, + "learning_rate": 0.00010722379502392396, + "loss": 0.0, + "step": 38886 + }, + { + "epoch": 3.6285341046934776, + "grad_norm": NaN, + "learning_rate": 0.00010721654537581971, + "loss": 0.0, + "step": 38887 + }, + { + "epoch": 3.628627414388355, + "grad_norm": NaN, + "learning_rate": 0.00010720929583650111, + "loss": 0.0, + "step": 38888 + }, + { + "epoch": 3.628720724083232, + "grad_norm": NaN, + "learning_rate": 0.00010720204640598658, + "loss": 0.0, + "step": 38889 + }, + { + "epoch": 3.6288140337781094, + "grad_norm": NaN, + "learning_rate": 0.00010719479708429461, + "loss": 0.0, + "step": 38890 + }, + { + "epoch": 3.628907343472987, + "grad_norm": NaN, + "learning_rate": 0.00010718754787144355, + "loss": 0.0, + "step": 38891 + }, + { + "epoch": 3.6290006531678642, + "grad_norm": NaN, + "learning_rate": 0.00010718029876745186, + "loss": 0.0, + "step": 38892 + }, + { + "epoch": 3.6290939628627417, + "grad_norm": NaN, + "learning_rate": 0.00010717304977233805, + "loss": 0.0, + "step": 38893 + }, + { + "epoch": 3.6291872725576186, + "grad_norm": NaN, + "learning_rate": 0.00010716580088612044, + "loss": 0.0, + "step": 38894 + }, + { + "epoch": 3.629280582252496, + "grad_norm": NaN, + "learning_rate": 0.00010715855210881749, + "loss": 0.0, + "step": 38895 + }, + { + "epoch": 3.6293738919473735, + "grad_norm": NaN, + "learning_rate": 0.00010715130344044773, + "loss": 0.0, + "step": 38896 + }, + { + "epoch": 3.6294672016422505, + "grad_norm": NaN, + "learning_rate": 0.00010714405488102945, + "loss": 0.0, + "step": 38897 + }, + { + "epoch": 3.629560511337128, + "grad_norm": NaN, + "learning_rate": 0.00010713680643058114, + "loss": 0.0, + "step": 38898 + }, + { + "epoch": 3.6296538210320053, + "grad_norm": NaN, + "learning_rate": 0.00010712955808912126, + "loss": 0.0, + "step": 38899 + }, + { + "epoch": 3.6297471307268827, + "grad_norm": NaN, + "learning_rate": 0.00010712230985666822, + "loss": 0.0, + "step": 38900 + }, + { + "epoch": 3.6298404404217597, + "grad_norm": NaN, + "learning_rate": 0.00010711506173324041, + "loss": 0.0, + "step": 38901 + }, + { + "epoch": 3.629933750116637, + "grad_norm": NaN, + "learning_rate": 0.00010710781371885632, + "loss": 0.0, + "step": 38902 + }, + { + "epoch": 3.6300270598115145, + "grad_norm": NaN, + "learning_rate": 0.00010710056581353436, + "loss": 0.0, + "step": 38903 + }, + { + "epoch": 3.6301203695063915, + "grad_norm": NaN, + "learning_rate": 0.0001070933180172929, + "loss": 0.0, + "step": 38904 + }, + { + "epoch": 3.630213679201269, + "grad_norm": NaN, + "learning_rate": 0.00010708607033015051, + "loss": 0.0, + "step": 38905 + }, + { + "epoch": 3.6303069888961463, + "grad_norm": NaN, + "learning_rate": 0.00010707882275212548, + "loss": 0.0, + "step": 38906 + }, + { + "epoch": 3.6304002985910238, + "grad_norm": NaN, + "learning_rate": 0.00010707157528323628, + "loss": 0.0, + "step": 38907 + }, + { + "epoch": 3.630493608285901, + "grad_norm": NaN, + "learning_rate": 0.00010706432792350139, + "loss": 0.0, + "step": 38908 + }, + { + "epoch": 3.630586917980778, + "grad_norm": NaN, + "learning_rate": 0.00010705708067293917, + "loss": 0.0, + "step": 38909 + }, + { + "epoch": 3.6306802276756556, + "grad_norm": NaN, + "learning_rate": 0.00010704983353156805, + "loss": 0.0, + "step": 38910 + }, + { + "epoch": 3.6307735373705325, + "grad_norm": NaN, + "learning_rate": 0.00010704258649940653, + "loss": 0.0, + "step": 38911 + }, + { + "epoch": 3.63086684706541, + "grad_norm": NaN, + "learning_rate": 0.00010703533957647297, + "loss": 0.0, + "step": 38912 + }, + { + "epoch": 3.6309601567602874, + "grad_norm": NaN, + "learning_rate": 0.00010702809276278577, + "loss": 0.0, + "step": 38913 + }, + { + "epoch": 3.631053466455165, + "grad_norm": NaN, + "learning_rate": 0.00010702084605836347, + "loss": 0.0, + "step": 38914 + }, + { + "epoch": 3.631146776150042, + "grad_norm": NaN, + "learning_rate": 0.00010701359946322434, + "loss": 0.0, + "step": 38915 + }, + { + "epoch": 3.631240085844919, + "grad_norm": NaN, + "learning_rate": 0.00010700635297738696, + "loss": 0.0, + "step": 38916 + }, + { + "epoch": 3.6313333955397966, + "grad_norm": NaN, + "learning_rate": 0.0001069991066008697, + "loss": 0.0, + "step": 38917 + }, + { + "epoch": 3.631426705234674, + "grad_norm": NaN, + "learning_rate": 0.00010699186033369093, + "loss": 0.0, + "step": 38918 + }, + { + "epoch": 3.631520014929551, + "grad_norm": NaN, + "learning_rate": 0.00010698461417586916, + "loss": 0.0, + "step": 38919 + }, + { + "epoch": 3.6316133246244284, + "grad_norm": NaN, + "learning_rate": 0.00010697736812742275, + "loss": 0.0, + "step": 38920 + }, + { + "epoch": 3.631706634319306, + "grad_norm": NaN, + "learning_rate": 0.00010697012218837013, + "loss": 0.0, + "step": 38921 + }, + { + "epoch": 3.6317999440141833, + "grad_norm": NaN, + "learning_rate": 0.0001069628763587298, + "loss": 0.0, + "step": 38922 + }, + { + "epoch": 3.6318932537090602, + "grad_norm": NaN, + "learning_rate": 0.00010695563063852008, + "loss": 0.0, + "step": 38923 + }, + { + "epoch": 3.6319865634039377, + "grad_norm": NaN, + "learning_rate": 0.00010694838502775944, + "loss": 0.0, + "step": 38924 + }, + { + "epoch": 3.632079873098815, + "grad_norm": NaN, + "learning_rate": 0.00010694113952646633, + "loss": 0.0, + "step": 38925 + }, + { + "epoch": 3.632173182793692, + "grad_norm": NaN, + "learning_rate": 0.00010693389413465914, + "loss": 0.0, + "step": 38926 + }, + { + "epoch": 3.6322664924885695, + "grad_norm": NaN, + "learning_rate": 0.00010692664885235626, + "loss": 0.0, + "step": 38927 + }, + { + "epoch": 3.632359802183447, + "grad_norm": NaN, + "learning_rate": 0.00010691940367957621, + "loss": 0.0, + "step": 38928 + }, + { + "epoch": 3.6324531118783243, + "grad_norm": NaN, + "learning_rate": 0.00010691215861633733, + "loss": 0.0, + "step": 38929 + }, + { + "epoch": 3.6325464215732013, + "grad_norm": NaN, + "learning_rate": 0.00010690491366265803, + "loss": 0.0, + "step": 38930 + }, + { + "epoch": 3.6326397312680787, + "grad_norm": NaN, + "learning_rate": 0.00010689766881855686, + "loss": 0.0, + "step": 38931 + }, + { + "epoch": 3.632733040962956, + "grad_norm": NaN, + "learning_rate": 0.00010689042408405208, + "loss": 0.0, + "step": 38932 + }, + { + "epoch": 3.632826350657833, + "grad_norm": NaN, + "learning_rate": 0.00010688317945916216, + "loss": 0.0, + "step": 38933 + }, + { + "epoch": 3.6329196603527105, + "grad_norm": NaN, + "learning_rate": 0.00010687593494390562, + "loss": 0.0, + "step": 38934 + }, + { + "epoch": 3.633012970047588, + "grad_norm": NaN, + "learning_rate": 0.00010686869053830077, + "loss": 0.0, + "step": 38935 + }, + { + "epoch": 3.6331062797424654, + "grad_norm": NaN, + "learning_rate": 0.00010686144624236602, + "loss": 0.0, + "step": 38936 + }, + { + "epoch": 3.633199589437343, + "grad_norm": NaN, + "learning_rate": 0.00010685420205611991, + "loss": 0.0, + "step": 38937 + }, + { + "epoch": 3.6332928991322198, + "grad_norm": NaN, + "learning_rate": 0.00010684695797958075, + "loss": 0.0, + "step": 38938 + }, + { + "epoch": 3.633386208827097, + "grad_norm": NaN, + "learning_rate": 0.00010683971401276695, + "loss": 0.0, + "step": 38939 + }, + { + "epoch": 3.6334795185219746, + "grad_norm": NaN, + "learning_rate": 0.00010683247015569703, + "loss": 0.0, + "step": 38940 + }, + { + "epoch": 3.6335728282168516, + "grad_norm": NaN, + "learning_rate": 0.00010682522640838936, + "loss": 0.0, + "step": 38941 + }, + { + "epoch": 3.633666137911729, + "grad_norm": NaN, + "learning_rate": 0.00010681798277086227, + "loss": 0.0, + "step": 38942 + }, + { + "epoch": 3.6337594476066064, + "grad_norm": NaN, + "learning_rate": 0.00010681073924313435, + "loss": 0.0, + "step": 38943 + }, + { + "epoch": 3.633852757301484, + "grad_norm": NaN, + "learning_rate": 0.00010680349582522388, + "loss": 0.0, + "step": 38944 + }, + { + "epoch": 3.633946066996361, + "grad_norm": NaN, + "learning_rate": 0.0001067962525171493, + "loss": 0.0, + "step": 38945 + }, + { + "epoch": 3.634039376691238, + "grad_norm": NaN, + "learning_rate": 0.0001067890093189291, + "loss": 0.0, + "step": 38946 + }, + { + "epoch": 3.6341326863861156, + "grad_norm": NaN, + "learning_rate": 0.00010678176623058165, + "loss": 0.0, + "step": 38947 + }, + { + "epoch": 3.6342259960809926, + "grad_norm": NaN, + "learning_rate": 0.00010677452325212531, + "loss": 0.0, + "step": 38948 + }, + { + "epoch": 3.63431930577587, + "grad_norm": NaN, + "learning_rate": 0.00010676728038357862, + "loss": 0.0, + "step": 38949 + }, + { + "epoch": 3.6344126154707475, + "grad_norm": NaN, + "learning_rate": 0.00010676003762495987, + "loss": 0.0, + "step": 38950 + }, + { + "epoch": 3.634505925165625, + "grad_norm": NaN, + "learning_rate": 0.00010675279497628757, + "loss": 0.0, + "step": 38951 + }, + { + "epoch": 3.634599234860502, + "grad_norm": NaN, + "learning_rate": 0.00010674555243758013, + "loss": 0.0, + "step": 38952 + }, + { + "epoch": 3.6346925445553793, + "grad_norm": NaN, + "learning_rate": 0.00010673831000885586, + "loss": 0.0, + "step": 38953 + }, + { + "epoch": 3.6347858542502567, + "grad_norm": NaN, + "learning_rate": 0.0001067310676901333, + "loss": 0.0, + "step": 38954 + }, + { + "epoch": 3.6348791639451337, + "grad_norm": NaN, + "learning_rate": 0.00010672382548143085, + "loss": 0.0, + "step": 38955 + }, + { + "epoch": 3.634972473640011, + "grad_norm": NaN, + "learning_rate": 0.00010671658338276683, + "loss": 0.0, + "step": 38956 + }, + { + "epoch": 3.6350657833348885, + "grad_norm": NaN, + "learning_rate": 0.00010670934139415975, + "loss": 0.0, + "step": 38957 + }, + { + "epoch": 3.635159093029766, + "grad_norm": NaN, + "learning_rate": 0.00010670209951562802, + "loss": 0.0, + "step": 38958 + }, + { + "epoch": 3.6352524027246433, + "grad_norm": NaN, + "learning_rate": 0.00010669485774718997, + "loss": 0.0, + "step": 38959 + }, + { + "epoch": 3.6353457124195203, + "grad_norm": NaN, + "learning_rate": 0.00010668761608886406, + "loss": 0.0, + "step": 38960 + }, + { + "epoch": 3.6354390221143977, + "grad_norm": NaN, + "learning_rate": 0.00010668037454066877, + "loss": 0.0, + "step": 38961 + }, + { + "epoch": 3.6355323318092747, + "grad_norm": NaN, + "learning_rate": 0.00010667313310262242, + "loss": 0.0, + "step": 38962 + }, + { + "epoch": 3.635625641504152, + "grad_norm": NaN, + "learning_rate": 0.00010666589177474346, + "loss": 0.0, + "step": 38963 + }, + { + "epoch": 3.6357189511990295, + "grad_norm": NaN, + "learning_rate": 0.00010665865055705034, + "loss": 0.0, + "step": 38964 + }, + { + "epoch": 3.635812260893907, + "grad_norm": NaN, + "learning_rate": 0.00010665140944956137, + "loss": 0.0, + "step": 38965 + }, + { + "epoch": 3.6359055705887844, + "grad_norm": NaN, + "learning_rate": 0.00010664416845229507, + "loss": 0.0, + "step": 38966 + }, + { + "epoch": 3.6359988802836614, + "grad_norm": NaN, + "learning_rate": 0.0001066369275652698, + "loss": 0.0, + "step": 38967 + }, + { + "epoch": 3.636092189978539, + "grad_norm": NaN, + "learning_rate": 0.00010662968678850393, + "loss": 0.0, + "step": 38968 + }, + { + "epoch": 3.636185499673416, + "grad_norm": NaN, + "learning_rate": 0.000106622446122016, + "loss": 0.0, + "step": 38969 + }, + { + "epoch": 3.636278809368293, + "grad_norm": NaN, + "learning_rate": 0.0001066152055658243, + "loss": 0.0, + "step": 38970 + }, + { + "epoch": 3.6363721190631706, + "grad_norm": NaN, + "learning_rate": 0.00010660796511994725, + "loss": 0.0, + "step": 38971 + }, + { + "epoch": 3.636465428758048, + "grad_norm": NaN, + "learning_rate": 0.00010660072478440335, + "loss": 0.0, + "step": 38972 + }, + { + "epoch": 3.6365587384529254, + "grad_norm": NaN, + "learning_rate": 0.00010659348455921091, + "loss": 0.0, + "step": 38973 + }, + { + "epoch": 3.6366520481478024, + "grad_norm": NaN, + "learning_rate": 0.00010658624444438838, + "loss": 0.0, + "step": 38974 + }, + { + "epoch": 3.63674535784268, + "grad_norm": NaN, + "learning_rate": 0.0001065790044399542, + "loss": 0.0, + "step": 38975 + }, + { + "epoch": 3.6368386675375572, + "grad_norm": NaN, + "learning_rate": 0.00010657176454592674, + "loss": 0.0, + "step": 38976 + }, + { + "epoch": 3.636931977232434, + "grad_norm": NaN, + "learning_rate": 0.00010656452476232437, + "loss": 0.0, + "step": 38977 + }, + { + "epoch": 3.6370252869273116, + "grad_norm": NaN, + "learning_rate": 0.00010655728508916563, + "loss": 0.0, + "step": 38978 + }, + { + "epoch": 3.637118596622189, + "grad_norm": NaN, + "learning_rate": 0.00010655004552646881, + "loss": 0.0, + "step": 38979 + }, + { + "epoch": 3.6372119063170665, + "grad_norm": NaN, + "learning_rate": 0.0001065428060742523, + "loss": 0.0, + "step": 38980 + }, + { + "epoch": 3.637305216011944, + "grad_norm": NaN, + "learning_rate": 0.00010653556673253465, + "loss": 0.0, + "step": 38981 + }, + { + "epoch": 3.637398525706821, + "grad_norm": NaN, + "learning_rate": 0.00010652832750133412, + "loss": 0.0, + "step": 38982 + }, + { + "epoch": 3.6374918354016983, + "grad_norm": NaN, + "learning_rate": 0.00010652108838066918, + "loss": 0.0, + "step": 38983 + }, + { + "epoch": 3.6375851450965753, + "grad_norm": NaN, + "learning_rate": 0.00010651384937055826, + "loss": 0.0, + "step": 38984 + }, + { + "epoch": 3.6376784547914527, + "grad_norm": NaN, + "learning_rate": 0.00010650661047101972, + "loss": 0.0, + "step": 38985 + }, + { + "epoch": 3.63777176448633, + "grad_norm": NaN, + "learning_rate": 0.00010649937168207196, + "loss": 0.0, + "step": 38986 + }, + { + "epoch": 3.6378650741812075, + "grad_norm": NaN, + "learning_rate": 0.00010649213300373347, + "loss": 0.0, + "step": 38987 + }, + { + "epoch": 3.637958383876085, + "grad_norm": NaN, + "learning_rate": 0.00010648489443602254, + "loss": 0.0, + "step": 38988 + }, + { + "epoch": 3.638051693570962, + "grad_norm": NaN, + "learning_rate": 0.00010647765597895767, + "loss": 0.0, + "step": 38989 + }, + { + "epoch": 3.6381450032658393, + "grad_norm": NaN, + "learning_rate": 0.00010647041763255726, + "loss": 0.0, + "step": 38990 + }, + { + "epoch": 3.6382383129607168, + "grad_norm": NaN, + "learning_rate": 0.00010646317939683959, + "loss": 0.0, + "step": 38991 + }, + { + "epoch": 3.6383316226555937, + "grad_norm": NaN, + "learning_rate": 0.00010645594127182323, + "loss": 0.0, + "step": 38992 + }, + { + "epoch": 3.638424932350471, + "grad_norm": NaN, + "learning_rate": 0.00010644870325752652, + "loss": 0.0, + "step": 38993 + }, + { + "epoch": 3.6385182420453486, + "grad_norm": NaN, + "learning_rate": 0.00010644146535396781, + "loss": 0.0, + "step": 38994 + }, + { + "epoch": 3.638611551740226, + "grad_norm": NaN, + "learning_rate": 0.00010643422756116557, + "loss": 0.0, + "step": 38995 + }, + { + "epoch": 3.638704861435103, + "grad_norm": NaN, + "learning_rate": 0.00010642698987913823, + "loss": 0.0, + "step": 38996 + }, + { + "epoch": 3.6387981711299804, + "grad_norm": NaN, + "learning_rate": 0.00010641975230790405, + "loss": 0.0, + "step": 38997 + }, + { + "epoch": 3.638891480824858, + "grad_norm": NaN, + "learning_rate": 0.00010641251484748159, + "loss": 0.0, + "step": 38998 + }, + { + "epoch": 3.638984790519735, + "grad_norm": NaN, + "learning_rate": 0.00010640527749788923, + "loss": 0.0, + "step": 38999 + }, + { + "epoch": 3.639078100214612, + "grad_norm": NaN, + "learning_rate": 0.00010639804025914527, + "loss": 0.0, + "step": 39000 + }, + { + "epoch": 3.6391714099094896, + "grad_norm": NaN, + "learning_rate": 0.00010639080313126819, + "loss": 0.0, + "step": 39001 + }, + { + "epoch": 3.639264719604367, + "grad_norm": NaN, + "learning_rate": 0.00010638356611427643, + "loss": 0.0, + "step": 39002 + }, + { + "epoch": 3.6393580292992445, + "grad_norm": NaN, + "learning_rate": 0.00010637632920818827, + "loss": 0.0, + "step": 39003 + }, + { + "epoch": 3.6394513389941214, + "grad_norm": NaN, + "learning_rate": 0.0001063690924130222, + "loss": 0.0, + "step": 39004 + }, + { + "epoch": 3.639544648688999, + "grad_norm": NaN, + "learning_rate": 0.00010636185572879667, + "loss": 0.0, + "step": 39005 + }, + { + "epoch": 3.639637958383876, + "grad_norm": NaN, + "learning_rate": 0.00010635461915552992, + "loss": 0.0, + "step": 39006 + }, + { + "epoch": 3.6397312680787532, + "grad_norm": NaN, + "learning_rate": 0.00010634738269324049, + "loss": 0.0, + "step": 39007 + }, + { + "epoch": 3.6398245777736307, + "grad_norm": NaN, + "learning_rate": 0.00010634014634194677, + "loss": 0.0, + "step": 39008 + }, + { + "epoch": 3.639917887468508, + "grad_norm": NaN, + "learning_rate": 0.00010633291010166704, + "loss": 0.0, + "step": 39009 + }, + { + "epoch": 3.6400111971633855, + "grad_norm": NaN, + "learning_rate": 0.00010632567397241988, + "loss": 0.0, + "step": 39010 + }, + { + "epoch": 3.6401045068582625, + "grad_norm": NaN, + "learning_rate": 0.00010631843795422354, + "loss": 0.0, + "step": 39011 + }, + { + "epoch": 3.64019781655314, + "grad_norm": NaN, + "learning_rate": 0.00010631120204709645, + "loss": 0.0, + "step": 39012 + }, + { + "epoch": 3.6402911262480173, + "grad_norm": NaN, + "learning_rate": 0.00010630396625105709, + "loss": 0.0, + "step": 39013 + }, + { + "epoch": 3.6403844359428943, + "grad_norm": NaN, + "learning_rate": 0.00010629673056612378, + "loss": 0.0, + "step": 39014 + }, + { + "epoch": 3.6404777456377717, + "grad_norm": NaN, + "learning_rate": 0.00010628949499231489, + "loss": 0.0, + "step": 39015 + }, + { + "epoch": 3.640571055332649, + "grad_norm": NaN, + "learning_rate": 0.00010628225952964892, + "loss": 0.0, + "step": 39016 + }, + { + "epoch": 3.6406643650275266, + "grad_norm": NaN, + "learning_rate": 0.00010627502417814424, + "loss": 0.0, + "step": 39017 + }, + { + "epoch": 3.6407576747224035, + "grad_norm": NaN, + "learning_rate": 0.00010626778893781913, + "loss": 0.0, + "step": 39018 + }, + { + "epoch": 3.640850984417281, + "grad_norm": NaN, + "learning_rate": 0.00010626055380869215, + "loss": 0.0, + "step": 39019 + }, + { + "epoch": 3.6409442941121584, + "grad_norm": NaN, + "learning_rate": 0.0001062533187907816, + "loss": 0.0, + "step": 39020 + }, + { + "epoch": 3.6410376038070353, + "grad_norm": NaN, + "learning_rate": 0.00010624608388410588, + "loss": 0.0, + "step": 39021 + }, + { + "epoch": 3.6411309135019128, + "grad_norm": NaN, + "learning_rate": 0.00010623884908868346, + "loss": 0.0, + "step": 39022 + }, + { + "epoch": 3.64122422319679, + "grad_norm": NaN, + "learning_rate": 0.00010623161440453267, + "loss": 0.0, + "step": 39023 + }, + { + "epoch": 3.6413175328916676, + "grad_norm": NaN, + "learning_rate": 0.00010622437983167187, + "loss": 0.0, + "step": 39024 + }, + { + "epoch": 3.641410842586545, + "grad_norm": NaN, + "learning_rate": 0.00010621714537011958, + "loss": 0.0, + "step": 39025 + }, + { + "epoch": 3.641504152281422, + "grad_norm": NaN, + "learning_rate": 0.00010620991101989403, + "loss": 0.0, + "step": 39026 + }, + { + "epoch": 3.6415974619762994, + "grad_norm": NaN, + "learning_rate": 0.00010620267678101376, + "loss": 0.0, + "step": 39027 + }, + { + "epoch": 3.6416907716711764, + "grad_norm": NaN, + "learning_rate": 0.00010619544265349714, + "loss": 0.0, + "step": 39028 + }, + { + "epoch": 3.641784081366054, + "grad_norm": NaN, + "learning_rate": 0.00010618820863736245, + "loss": 0.0, + "step": 39029 + }, + { + "epoch": 3.6418773910609312, + "grad_norm": NaN, + "learning_rate": 0.00010618097473262823, + "loss": 0.0, + "step": 39030 + }, + { + "epoch": 3.6419707007558086, + "grad_norm": NaN, + "learning_rate": 0.00010617374093931285, + "loss": 0.0, + "step": 39031 + }, + { + "epoch": 3.642064010450686, + "grad_norm": NaN, + "learning_rate": 0.00010616650725743457, + "loss": 0.0, + "step": 39032 + }, + { + "epoch": 3.642157320145563, + "grad_norm": NaN, + "learning_rate": 0.00010615927368701191, + "loss": 0.0, + "step": 39033 + }, + { + "epoch": 3.6422506298404405, + "grad_norm": NaN, + "learning_rate": 0.00010615204022806328, + "loss": 0.0, + "step": 39034 + }, + { + "epoch": 3.642343939535318, + "grad_norm": NaN, + "learning_rate": 0.00010614480688060694, + "loss": 0.0, + "step": 39035 + }, + { + "epoch": 3.642437249230195, + "grad_norm": NaN, + "learning_rate": 0.00010613757364466141, + "loss": 0.0, + "step": 39036 + }, + { + "epoch": 3.6425305589250723, + "grad_norm": NaN, + "learning_rate": 0.00010613034052024506, + "loss": 0.0, + "step": 39037 + }, + { + "epoch": 3.6426238686199497, + "grad_norm": NaN, + "learning_rate": 0.00010612310750737621, + "loss": 0.0, + "step": 39038 + }, + { + "epoch": 3.642717178314827, + "grad_norm": NaN, + "learning_rate": 0.00010611587460607332, + "loss": 0.0, + "step": 39039 + }, + { + "epoch": 3.642810488009704, + "grad_norm": NaN, + "learning_rate": 0.00010610864181635482, + "loss": 0.0, + "step": 39040 + }, + { + "epoch": 3.6429037977045815, + "grad_norm": NaN, + "learning_rate": 0.00010610140913823895, + "loss": 0.0, + "step": 39041 + }, + { + "epoch": 3.642997107399459, + "grad_norm": NaN, + "learning_rate": 0.00010609417657174421, + "loss": 0.0, + "step": 39042 + }, + { + "epoch": 3.643090417094336, + "grad_norm": NaN, + "learning_rate": 0.00010608694411688904, + "loss": 0.0, + "step": 39043 + }, + { + "epoch": 3.6431837267892133, + "grad_norm": NaN, + "learning_rate": 0.00010607971177369167, + "loss": 0.0, + "step": 39044 + }, + { + "epoch": 3.6432770364840907, + "grad_norm": NaN, + "learning_rate": 0.00010607247954217066, + "loss": 0.0, + "step": 39045 + }, + { + "epoch": 3.643370346178968, + "grad_norm": NaN, + "learning_rate": 0.00010606524742234431, + "loss": 0.0, + "step": 39046 + }, + { + "epoch": 3.643463655873845, + "grad_norm": NaN, + "learning_rate": 0.00010605801541423096, + "loss": 0.0, + "step": 39047 + }, + { + "epoch": 3.6435569655687225, + "grad_norm": NaN, + "learning_rate": 0.0001060507835178491, + "loss": 0.0, + "step": 39048 + }, + { + "epoch": 3.6436502752636, + "grad_norm": NaN, + "learning_rate": 0.00010604355173321712, + "loss": 0.0, + "step": 39049 + }, + { + "epoch": 3.643743584958477, + "grad_norm": NaN, + "learning_rate": 0.0001060363200603533, + "loss": 0.0, + "step": 39050 + }, + { + "epoch": 3.6438368946533544, + "grad_norm": NaN, + "learning_rate": 0.00010602908849927611, + "loss": 0.0, + "step": 39051 + }, + { + "epoch": 3.643930204348232, + "grad_norm": NaN, + "learning_rate": 0.00010602185705000397, + "loss": 0.0, + "step": 39052 + }, + { + "epoch": 3.644023514043109, + "grad_norm": NaN, + "learning_rate": 0.00010601462571255517, + "loss": 0.0, + "step": 39053 + }, + { + "epoch": 3.6441168237379866, + "grad_norm": NaN, + "learning_rate": 0.0001060073944869482, + "loss": 0.0, + "step": 39054 + }, + { + "epoch": 3.6442101334328636, + "grad_norm": NaN, + "learning_rate": 0.00010600016337320136, + "loss": 0.0, + "step": 39055 + }, + { + "epoch": 3.644303443127741, + "grad_norm": NaN, + "learning_rate": 0.00010599293237133302, + "loss": 0.0, + "step": 39056 + }, + { + "epoch": 3.6443967528226184, + "grad_norm": NaN, + "learning_rate": 0.00010598570148136171, + "loss": 0.0, + "step": 39057 + }, + { + "epoch": 3.6444900625174954, + "grad_norm": NaN, + "learning_rate": 0.00010597847070330567, + "loss": 0.0, + "step": 39058 + }, + { + "epoch": 3.644583372212373, + "grad_norm": NaN, + "learning_rate": 0.00010597124003718333, + "loss": 0.0, + "step": 39059 + }, + { + "epoch": 3.6446766819072502, + "grad_norm": NaN, + "learning_rate": 0.00010596400948301315, + "loss": 0.0, + "step": 39060 + }, + { + "epoch": 3.6447699916021277, + "grad_norm": NaN, + "learning_rate": 0.00010595677904081337, + "loss": 0.0, + "step": 39061 + }, + { + "epoch": 3.6448633012970046, + "grad_norm": NaN, + "learning_rate": 0.00010594954871060247, + "loss": 0.0, + "step": 39062 + }, + { + "epoch": 3.644956610991882, + "grad_norm": NaN, + "learning_rate": 0.00010594231849239889, + "loss": 0.0, + "step": 39063 + }, + { + "epoch": 3.6450499206867595, + "grad_norm": NaN, + "learning_rate": 0.00010593508838622084, + "loss": 0.0, + "step": 39064 + }, + { + "epoch": 3.6451432303816365, + "grad_norm": NaN, + "learning_rate": 0.00010592785839208685, + "loss": 0.0, + "step": 39065 + }, + { + "epoch": 3.645236540076514, + "grad_norm": NaN, + "learning_rate": 0.00010592062851001532, + "loss": 0.0, + "step": 39066 + }, + { + "epoch": 3.6453298497713913, + "grad_norm": NaN, + "learning_rate": 0.00010591339874002447, + "loss": 0.0, + "step": 39067 + }, + { + "epoch": 3.6454231594662687, + "grad_norm": NaN, + "learning_rate": 0.00010590616908213286, + "loss": 0.0, + "step": 39068 + }, + { + "epoch": 3.6455164691611457, + "grad_norm": NaN, + "learning_rate": 0.00010589893953635878, + "loss": 0.0, + "step": 39069 + }, + { + "epoch": 3.645609778856023, + "grad_norm": NaN, + "learning_rate": 0.00010589171010272062, + "loss": 0.0, + "step": 39070 + }, + { + "epoch": 3.6457030885509005, + "grad_norm": NaN, + "learning_rate": 0.00010588448078123678, + "loss": 0.0, + "step": 39071 + }, + { + "epoch": 3.6457963982457775, + "grad_norm": NaN, + "learning_rate": 0.00010587725157192569, + "loss": 0.0, + "step": 39072 + }, + { + "epoch": 3.645889707940655, + "grad_norm": NaN, + "learning_rate": 0.0001058700224748056, + "loss": 0.0, + "step": 39073 + }, + { + "epoch": 3.6459830176355323, + "grad_norm": NaN, + "learning_rate": 0.00010586279348989499, + "loss": 0.0, + "step": 39074 + }, + { + "epoch": 3.6460763273304098, + "grad_norm": NaN, + "learning_rate": 0.00010585556461721227, + "loss": 0.0, + "step": 39075 + }, + { + "epoch": 3.646169637025287, + "grad_norm": NaN, + "learning_rate": 0.00010584833585677572, + "loss": 0.0, + "step": 39076 + }, + { + "epoch": 3.646262946720164, + "grad_norm": NaN, + "learning_rate": 0.00010584110720860379, + "loss": 0.0, + "step": 39077 + }, + { + "epoch": 3.6463562564150416, + "grad_norm": NaN, + "learning_rate": 0.00010583387867271488, + "loss": 0.0, + "step": 39078 + }, + { + "epoch": 3.6464495661099185, + "grad_norm": NaN, + "learning_rate": 0.00010582665024912727, + "loss": 0.0, + "step": 39079 + }, + { + "epoch": 3.646542875804796, + "grad_norm": NaN, + "learning_rate": 0.00010581942193785943, + "loss": 0.0, + "step": 39080 + }, + { + "epoch": 3.6466361854996734, + "grad_norm": NaN, + "learning_rate": 0.00010581219373892973, + "loss": 0.0, + "step": 39081 + }, + { + "epoch": 3.646729495194551, + "grad_norm": NaN, + "learning_rate": 0.0001058049656523565, + "loss": 0.0, + "step": 39082 + }, + { + "epoch": 3.6468228048894282, + "grad_norm": NaN, + "learning_rate": 0.00010579773767815817, + "loss": 0.0, + "step": 39083 + }, + { + "epoch": 3.646916114584305, + "grad_norm": NaN, + "learning_rate": 0.00010579050981635313, + "loss": 0.0, + "step": 39084 + }, + { + "epoch": 3.6470094242791826, + "grad_norm": NaN, + "learning_rate": 0.00010578328206695967, + "loss": 0.0, + "step": 39085 + }, + { + "epoch": 3.64710273397406, + "grad_norm": NaN, + "learning_rate": 0.00010577605442999625, + "loss": 0.0, + "step": 39086 + }, + { + "epoch": 3.647196043668937, + "grad_norm": NaN, + "learning_rate": 0.00010576882690548126, + "loss": 0.0, + "step": 39087 + }, + { + "epoch": 3.6472893533638144, + "grad_norm": NaN, + "learning_rate": 0.00010576159949343297, + "loss": 0.0, + "step": 39088 + }, + { + "epoch": 3.647382663058692, + "grad_norm": NaN, + "learning_rate": 0.00010575437219386987, + "loss": 0.0, + "step": 39089 + }, + { + "epoch": 3.6474759727535693, + "grad_norm": NaN, + "learning_rate": 0.00010574714500681033, + "loss": 0.0, + "step": 39090 + }, + { + "epoch": 3.6475692824484462, + "grad_norm": NaN, + "learning_rate": 0.00010573991793227262, + "loss": 0.0, + "step": 39091 + }, + { + "epoch": 3.6476625921433237, + "grad_norm": NaN, + "learning_rate": 0.00010573269097027522, + "loss": 0.0, + "step": 39092 + }, + { + "epoch": 3.647755901838201, + "grad_norm": NaN, + "learning_rate": 0.0001057254641208365, + "loss": 0.0, + "step": 39093 + }, + { + "epoch": 3.647849211533078, + "grad_norm": NaN, + "learning_rate": 0.00010571823738397473, + "loss": 0.0, + "step": 39094 + }, + { + "epoch": 3.6479425212279555, + "grad_norm": NaN, + "learning_rate": 0.00010571101075970844, + "loss": 0.0, + "step": 39095 + }, + { + "epoch": 3.648035830922833, + "grad_norm": NaN, + "learning_rate": 0.00010570378424805593, + "loss": 0.0, + "step": 39096 + }, + { + "epoch": 3.6481291406177103, + "grad_norm": NaN, + "learning_rate": 0.00010569655784903552, + "loss": 0.0, + "step": 39097 + }, + { + "epoch": 3.6482224503125877, + "grad_norm": NaN, + "learning_rate": 0.00010568933156266564, + "loss": 0.0, + "step": 39098 + }, + { + "epoch": 3.6483157600074647, + "grad_norm": NaN, + "learning_rate": 0.00010568210538896476, + "loss": 0.0, + "step": 39099 + }, + { + "epoch": 3.648409069702342, + "grad_norm": NaN, + "learning_rate": 0.00010567487932795106, + "loss": 0.0, + "step": 39100 + }, + { + "epoch": 3.648502379397219, + "grad_norm": NaN, + "learning_rate": 0.00010566765337964308, + "loss": 0.0, + "step": 39101 + }, + { + "epoch": 3.6485956890920965, + "grad_norm": NaN, + "learning_rate": 0.00010566042754405906, + "loss": 0.0, + "step": 39102 + }, + { + "epoch": 3.648688998786974, + "grad_norm": NaN, + "learning_rate": 0.00010565320182121747, + "loss": 0.0, + "step": 39103 + }, + { + "epoch": 3.6487823084818514, + "grad_norm": NaN, + "learning_rate": 0.00010564597621113669, + "loss": 0.0, + "step": 39104 + }, + { + "epoch": 3.648875618176729, + "grad_norm": NaN, + "learning_rate": 0.00010563875071383498, + "loss": 0.0, + "step": 39105 + }, + { + "epoch": 3.6489689278716058, + "grad_norm": NaN, + "learning_rate": 0.00010563152532933082, + "loss": 0.0, + "step": 39106 + }, + { + "epoch": 3.649062237566483, + "grad_norm": NaN, + "learning_rate": 0.00010562430005764257, + "loss": 0.0, + "step": 39107 + }, + { + "epoch": 3.6491555472613606, + "grad_norm": NaN, + "learning_rate": 0.00010561707489878854, + "loss": 0.0, + "step": 39108 + }, + { + "epoch": 3.6492488569562376, + "grad_norm": NaN, + "learning_rate": 0.00010560984985278715, + "loss": 0.0, + "step": 39109 + }, + { + "epoch": 3.649342166651115, + "grad_norm": NaN, + "learning_rate": 0.00010560262491965682, + "loss": 0.0, + "step": 39110 + }, + { + "epoch": 3.6494354763459924, + "grad_norm": NaN, + "learning_rate": 0.00010559540009941578, + "loss": 0.0, + "step": 39111 + }, + { + "epoch": 3.64952878604087, + "grad_norm": NaN, + "learning_rate": 0.00010558817539208254, + "loss": 0.0, + "step": 39112 + }, + { + "epoch": 3.649622095735747, + "grad_norm": NaN, + "learning_rate": 0.00010558095079767542, + "loss": 0.0, + "step": 39113 + }, + { + "epoch": 3.6497154054306242, + "grad_norm": NaN, + "learning_rate": 0.00010557372631621271, + "loss": 0.0, + "step": 39114 + }, + { + "epoch": 3.6498087151255016, + "grad_norm": NaN, + "learning_rate": 0.00010556650194771293, + "loss": 0.0, + "step": 39115 + }, + { + "epoch": 3.6499020248203786, + "grad_norm": NaN, + "learning_rate": 0.00010555927769219435, + "loss": 0.0, + "step": 39116 + }, + { + "epoch": 3.649995334515256, + "grad_norm": NaN, + "learning_rate": 0.00010555205354967534, + "loss": 0.0, + "step": 39117 + }, + { + "epoch": 3.6500886442101335, + "grad_norm": NaN, + "learning_rate": 0.0001055448295201743, + "loss": 0.0, + "step": 39118 + }, + { + "epoch": 3.650181953905011, + "grad_norm": NaN, + "learning_rate": 0.00010553760560370965, + "loss": 0.0, + "step": 39119 + }, + { + "epoch": 3.6502752635998883, + "grad_norm": NaN, + "learning_rate": 0.0001055303818002996, + "loss": 0.0, + "step": 39120 + }, + { + "epoch": 3.6503685732947653, + "grad_norm": NaN, + "learning_rate": 0.00010552315810996266, + "loss": 0.0, + "step": 39121 + }, + { + "epoch": 3.6504618829896427, + "grad_norm": NaN, + "learning_rate": 0.00010551593453271721, + "loss": 0.0, + "step": 39122 + }, + { + "epoch": 3.6505551926845197, + "grad_norm": NaN, + "learning_rate": 0.00010550871106858146, + "loss": 0.0, + "step": 39123 + }, + { + "epoch": 3.650648502379397, + "grad_norm": NaN, + "learning_rate": 0.00010550148771757393, + "loss": 0.0, + "step": 39124 + }, + { + "epoch": 3.6507418120742745, + "grad_norm": NaN, + "learning_rate": 0.00010549426447971298, + "loss": 0.0, + "step": 39125 + }, + { + "epoch": 3.650835121769152, + "grad_norm": NaN, + "learning_rate": 0.00010548704135501685, + "loss": 0.0, + "step": 39126 + }, + { + "epoch": 3.6509284314640293, + "grad_norm": NaN, + "learning_rate": 0.00010547981834350402, + "loss": 0.0, + "step": 39127 + }, + { + "epoch": 3.6510217411589063, + "grad_norm": NaN, + "learning_rate": 0.00010547259544519286, + "loss": 0.0, + "step": 39128 + }, + { + "epoch": 3.6511150508537837, + "grad_norm": NaN, + "learning_rate": 0.00010546537266010164, + "loss": 0.0, + "step": 39129 + }, + { + "epoch": 3.651208360548661, + "grad_norm": NaN, + "learning_rate": 0.00010545814998824881, + "loss": 0.0, + "step": 39130 + }, + { + "epoch": 3.651301670243538, + "grad_norm": NaN, + "learning_rate": 0.00010545092742965274, + "loss": 0.0, + "step": 39131 + }, + { + "epoch": 3.6513949799384156, + "grad_norm": NaN, + "learning_rate": 0.00010544370498433172, + "loss": 0.0, + "step": 39132 + }, + { + "epoch": 3.651488289633293, + "grad_norm": NaN, + "learning_rate": 0.00010543648265230416, + "loss": 0.0, + "step": 39133 + }, + { + "epoch": 3.6515815993281704, + "grad_norm": NaN, + "learning_rate": 0.0001054292604335885, + "loss": 0.0, + "step": 39134 + }, + { + "epoch": 3.6516749090230474, + "grad_norm": NaN, + "learning_rate": 0.00010542203832820292, + "loss": 0.0, + "step": 39135 + }, + { + "epoch": 3.651768218717925, + "grad_norm": NaN, + "learning_rate": 0.00010541481633616592, + "loss": 0.0, + "step": 39136 + }, + { + "epoch": 3.651861528412802, + "grad_norm": NaN, + "learning_rate": 0.00010540759445749589, + "loss": 0.0, + "step": 39137 + }, + { + "epoch": 3.651954838107679, + "grad_norm": NaN, + "learning_rate": 0.0001054003726922111, + "loss": 0.0, + "step": 39138 + }, + { + "epoch": 3.6520481478025566, + "grad_norm": NaN, + "learning_rate": 0.00010539315104032993, + "loss": 0.0, + "step": 39139 + }, + { + "epoch": 3.652141457497434, + "grad_norm": NaN, + "learning_rate": 0.00010538592950187081, + "loss": 0.0, + "step": 39140 + }, + { + "epoch": 3.6522347671923114, + "grad_norm": NaN, + "learning_rate": 0.00010537870807685204, + "loss": 0.0, + "step": 39141 + }, + { + "epoch": 3.6523280768871884, + "grad_norm": NaN, + "learning_rate": 0.00010537148676529195, + "loss": 0.0, + "step": 39142 + }, + { + "epoch": 3.652421386582066, + "grad_norm": NaN, + "learning_rate": 0.00010536426556720903, + "loss": 0.0, + "step": 39143 + }, + { + "epoch": 3.6525146962769433, + "grad_norm": NaN, + "learning_rate": 0.00010535704448262151, + "loss": 0.0, + "step": 39144 + }, + { + "epoch": 3.6526080059718202, + "grad_norm": NaN, + "learning_rate": 0.00010534982351154786, + "loss": 0.0, + "step": 39145 + }, + { + "epoch": 3.6527013156666976, + "grad_norm": NaN, + "learning_rate": 0.0001053426026540063, + "loss": 0.0, + "step": 39146 + }, + { + "epoch": 3.652794625361575, + "grad_norm": NaN, + "learning_rate": 0.00010533538191001531, + "loss": 0.0, + "step": 39147 + }, + { + "epoch": 3.6528879350564525, + "grad_norm": NaN, + "learning_rate": 0.00010532816127959324, + "loss": 0.0, + "step": 39148 + }, + { + "epoch": 3.65298124475133, + "grad_norm": NaN, + "learning_rate": 0.00010532094076275836, + "loss": 0.0, + "step": 39149 + }, + { + "epoch": 3.653074554446207, + "grad_norm": NaN, + "learning_rate": 0.00010531372035952913, + "loss": 0.0, + "step": 39150 + }, + { + "epoch": 3.6531678641410843, + "grad_norm": NaN, + "learning_rate": 0.00010530650006992389, + "loss": 0.0, + "step": 39151 + }, + { + "epoch": 3.6532611738359617, + "grad_norm": NaN, + "learning_rate": 0.00010529927989396095, + "loss": 0.0, + "step": 39152 + }, + { + "epoch": 3.6533544835308387, + "grad_norm": NaN, + "learning_rate": 0.00010529205983165869, + "loss": 0.0, + "step": 39153 + }, + { + "epoch": 3.653447793225716, + "grad_norm": NaN, + "learning_rate": 0.00010528483988303553, + "loss": 0.0, + "step": 39154 + }, + { + "epoch": 3.6535411029205935, + "grad_norm": NaN, + "learning_rate": 0.00010527762004810973, + "loss": 0.0, + "step": 39155 + }, + { + "epoch": 3.653634412615471, + "grad_norm": NaN, + "learning_rate": 0.00010527040032689972, + "loss": 0.0, + "step": 39156 + }, + { + "epoch": 3.653727722310348, + "grad_norm": NaN, + "learning_rate": 0.00010526318071942384, + "loss": 0.0, + "step": 39157 + }, + { + "epoch": 3.6538210320052253, + "grad_norm": NaN, + "learning_rate": 0.00010525596122570039, + "loss": 0.0, + "step": 39158 + }, + { + "epoch": 3.6539143417001028, + "grad_norm": NaN, + "learning_rate": 0.00010524874184574781, + "loss": 0.0, + "step": 39159 + }, + { + "epoch": 3.6540076513949797, + "grad_norm": NaN, + "learning_rate": 0.00010524152257958445, + "loss": 0.0, + "step": 39160 + }, + { + "epoch": 3.654100961089857, + "grad_norm": NaN, + "learning_rate": 0.00010523430342722859, + "loss": 0.0, + "step": 39161 + }, + { + "epoch": 3.6541942707847346, + "grad_norm": NaN, + "learning_rate": 0.00010522708438869865, + "loss": 0.0, + "step": 39162 + }, + { + "epoch": 3.654287580479612, + "grad_norm": NaN, + "learning_rate": 0.000105219865464013, + "loss": 0.0, + "step": 39163 + }, + { + "epoch": 3.654380890174489, + "grad_norm": NaN, + "learning_rate": 0.00010521264665318993, + "loss": 0.0, + "step": 39164 + }, + { + "epoch": 3.6544741998693664, + "grad_norm": NaN, + "learning_rate": 0.00010520542795624784, + "loss": 0.0, + "step": 39165 + }, + { + "epoch": 3.654567509564244, + "grad_norm": NaN, + "learning_rate": 0.00010519820937320513, + "loss": 0.0, + "step": 39166 + }, + { + "epoch": 3.654660819259121, + "grad_norm": NaN, + "learning_rate": 0.00010519099090408002, + "loss": 0.0, + "step": 39167 + }, + { + "epoch": 3.654754128953998, + "grad_norm": NaN, + "learning_rate": 0.00010518377254889097, + "loss": 0.0, + "step": 39168 + }, + { + "epoch": 3.6548474386488756, + "grad_norm": NaN, + "learning_rate": 0.00010517655430765639, + "loss": 0.0, + "step": 39169 + }, + { + "epoch": 3.654940748343753, + "grad_norm": NaN, + "learning_rate": 0.00010516933618039444, + "loss": 0.0, + "step": 39170 + }, + { + "epoch": 3.6550340580386305, + "grad_norm": NaN, + "learning_rate": 0.00010516211816712364, + "loss": 0.0, + "step": 39171 + }, + { + "epoch": 3.6551273677335074, + "grad_norm": NaN, + "learning_rate": 0.00010515490026786232, + "loss": 0.0, + "step": 39172 + }, + { + "epoch": 3.655220677428385, + "grad_norm": NaN, + "learning_rate": 0.00010514768248262875, + "loss": 0.0, + "step": 39173 + }, + { + "epoch": 3.655313987123262, + "grad_norm": NaN, + "learning_rate": 0.00010514046481144132, + "loss": 0.0, + "step": 39174 + }, + { + "epoch": 3.6554072968181393, + "grad_norm": NaN, + "learning_rate": 0.00010513324725431847, + "loss": 0.0, + "step": 39175 + }, + { + "epoch": 3.6555006065130167, + "grad_norm": NaN, + "learning_rate": 0.00010512602981127846, + "loss": 0.0, + "step": 39176 + }, + { + "epoch": 3.655593916207894, + "grad_norm": NaN, + "learning_rate": 0.00010511881248233963, + "loss": 0.0, + "step": 39177 + }, + { + "epoch": 3.6556872259027715, + "grad_norm": NaN, + "learning_rate": 0.00010511159526752042, + "loss": 0.0, + "step": 39178 + }, + { + "epoch": 3.6557805355976485, + "grad_norm": NaN, + "learning_rate": 0.00010510437816683909, + "loss": 0.0, + "step": 39179 + }, + { + "epoch": 3.655873845292526, + "grad_norm": NaN, + "learning_rate": 0.00010509716118031401, + "loss": 0.0, + "step": 39180 + }, + { + "epoch": 3.6559671549874033, + "grad_norm": NaN, + "learning_rate": 0.0001050899443079636, + "loss": 0.0, + "step": 39181 + }, + { + "epoch": 3.6560604646822803, + "grad_norm": NaN, + "learning_rate": 0.00010508272754980613, + "loss": 0.0, + "step": 39182 + }, + { + "epoch": 3.6561537743771577, + "grad_norm": NaN, + "learning_rate": 0.00010507551090585996, + "loss": 0.0, + "step": 39183 + }, + { + "epoch": 3.656247084072035, + "grad_norm": NaN, + "learning_rate": 0.00010506829437614349, + "loss": 0.0, + "step": 39184 + }, + { + "epoch": 3.6563403937669126, + "grad_norm": NaN, + "learning_rate": 0.00010506107796067503, + "loss": 0.0, + "step": 39185 + }, + { + "epoch": 3.6564337034617895, + "grad_norm": NaN, + "learning_rate": 0.00010505386165947291, + "loss": 0.0, + "step": 39186 + }, + { + "epoch": 3.656527013156667, + "grad_norm": NaN, + "learning_rate": 0.00010504664547255556, + "loss": 0.0, + "step": 39187 + }, + { + "epoch": 3.6566203228515444, + "grad_norm": NaN, + "learning_rate": 0.00010503942939994124, + "loss": 0.0, + "step": 39188 + }, + { + "epoch": 3.6567136325464213, + "grad_norm": NaN, + "learning_rate": 0.00010503221344164838, + "loss": 0.0, + "step": 39189 + }, + { + "epoch": 3.6568069422412988, + "grad_norm": NaN, + "learning_rate": 0.0001050249975976952, + "loss": 0.0, + "step": 39190 + }, + { + "epoch": 3.656900251936176, + "grad_norm": NaN, + "learning_rate": 0.00010501778186810016, + "loss": 0.0, + "step": 39191 + }, + { + "epoch": 3.6569935616310536, + "grad_norm": NaN, + "learning_rate": 0.00010501056625288162, + "loss": 0.0, + "step": 39192 + }, + { + "epoch": 3.657086871325931, + "grad_norm": NaN, + "learning_rate": 0.00010500335075205781, + "loss": 0.0, + "step": 39193 + }, + { + "epoch": 3.657180181020808, + "grad_norm": NaN, + "learning_rate": 0.00010499613536564718, + "loss": 0.0, + "step": 39194 + }, + { + "epoch": 3.6572734907156854, + "grad_norm": NaN, + "learning_rate": 0.00010498892009366809, + "loss": 0.0, + "step": 39195 + }, + { + "epoch": 3.6573668004105624, + "grad_norm": NaN, + "learning_rate": 0.00010498170493613875, + "loss": 0.0, + "step": 39196 + }, + { + "epoch": 3.65746011010544, + "grad_norm": NaN, + "learning_rate": 0.00010497448989307767, + "loss": 0.0, + "step": 39197 + }, + { + "epoch": 3.6575534198003172, + "grad_norm": NaN, + "learning_rate": 0.00010496727496450314, + "loss": 0.0, + "step": 39198 + }, + { + "epoch": 3.6576467294951946, + "grad_norm": NaN, + "learning_rate": 0.00010496006015043342, + "loss": 0.0, + "step": 39199 + }, + { + "epoch": 3.657740039190072, + "grad_norm": NaN, + "learning_rate": 0.00010495284545088697, + "loss": 0.0, + "step": 39200 + }, + { + "epoch": 3.657833348884949, + "grad_norm": NaN, + "learning_rate": 0.00010494563086588211, + "loss": 0.0, + "step": 39201 + }, + { + "epoch": 3.6579266585798265, + "grad_norm": NaN, + "learning_rate": 0.00010493841639543711, + "loss": 0.0, + "step": 39202 + }, + { + "epoch": 3.658019968274704, + "grad_norm": NaN, + "learning_rate": 0.00010493120203957039, + "loss": 0.0, + "step": 39203 + }, + { + "epoch": 3.658113277969581, + "grad_norm": NaN, + "learning_rate": 0.00010492398779830031, + "loss": 0.0, + "step": 39204 + }, + { + "epoch": 3.6582065876644583, + "grad_norm": NaN, + "learning_rate": 0.00010491677367164511, + "loss": 0.0, + "step": 39205 + }, + { + "epoch": 3.6582998973593357, + "grad_norm": NaN, + "learning_rate": 0.00010490955965962323, + "loss": 0.0, + "step": 39206 + }, + { + "epoch": 3.658393207054213, + "grad_norm": NaN, + "learning_rate": 0.00010490234576225304, + "loss": 0.0, + "step": 39207 + }, + { + "epoch": 3.65848651674909, + "grad_norm": NaN, + "learning_rate": 0.00010489513197955274, + "loss": 0.0, + "step": 39208 + }, + { + "epoch": 3.6585798264439675, + "grad_norm": NaN, + "learning_rate": 0.00010488791831154076, + "loss": 0.0, + "step": 39209 + }, + { + "epoch": 3.658673136138845, + "grad_norm": NaN, + "learning_rate": 0.00010488070475823553, + "loss": 0.0, + "step": 39210 + }, + { + "epoch": 3.658766445833722, + "grad_norm": NaN, + "learning_rate": 0.00010487349131965522, + "loss": 0.0, + "step": 39211 + }, + { + "epoch": 3.6588597555285993, + "grad_norm": NaN, + "learning_rate": 0.00010486627799581824, + "loss": 0.0, + "step": 39212 + }, + { + "epoch": 3.6589530652234767, + "grad_norm": NaN, + "learning_rate": 0.00010485906478674303, + "loss": 0.0, + "step": 39213 + }, + { + "epoch": 3.659046374918354, + "grad_norm": NaN, + "learning_rate": 0.00010485185169244779, + "loss": 0.0, + "step": 39214 + }, + { + "epoch": 3.6591396846132316, + "grad_norm": NaN, + "learning_rate": 0.00010484463871295088, + "loss": 0.0, + "step": 39215 + }, + { + "epoch": 3.6592329943081086, + "grad_norm": NaN, + "learning_rate": 0.00010483742584827078, + "loss": 0.0, + "step": 39216 + }, + { + "epoch": 3.659326304002986, + "grad_norm": NaN, + "learning_rate": 0.00010483021309842568, + "loss": 0.0, + "step": 39217 + }, + { + "epoch": 3.659419613697863, + "grad_norm": NaN, + "learning_rate": 0.00010482300046343392, + "loss": 0.0, + "step": 39218 + }, + { + "epoch": 3.6595129233927404, + "grad_norm": NaN, + "learning_rate": 0.00010481578794331397, + "loss": 0.0, + "step": 39219 + }, + { + "epoch": 3.659606233087618, + "grad_norm": NaN, + "learning_rate": 0.00010480857553808405, + "loss": 0.0, + "step": 39220 + }, + { + "epoch": 3.659699542782495, + "grad_norm": NaN, + "learning_rate": 0.00010480136324776248, + "loss": 0.0, + "step": 39221 + }, + { + "epoch": 3.6597928524773726, + "grad_norm": NaN, + "learning_rate": 0.00010479415107236775, + "loss": 0.0, + "step": 39222 + }, + { + "epoch": 3.6598861621722496, + "grad_norm": NaN, + "learning_rate": 0.00010478693901191806, + "loss": 0.0, + "step": 39223 + }, + { + "epoch": 3.659979471867127, + "grad_norm": NaN, + "learning_rate": 0.00010477972706643177, + "loss": 0.0, + "step": 39224 + }, + { + "epoch": 3.6600727815620044, + "grad_norm": NaN, + "learning_rate": 0.00010477251523592728, + "loss": 0.0, + "step": 39225 + }, + { + "epoch": 3.6601660912568814, + "grad_norm": NaN, + "learning_rate": 0.00010476530352042287, + "loss": 0.0, + "step": 39226 + }, + { + "epoch": 3.660259400951759, + "grad_norm": NaN, + "learning_rate": 0.00010475809191993686, + "loss": 0.0, + "step": 39227 + }, + { + "epoch": 3.6603527106466363, + "grad_norm": NaN, + "learning_rate": 0.00010475088043448768, + "loss": 0.0, + "step": 39228 + }, + { + "epoch": 3.6604460203415137, + "grad_norm": NaN, + "learning_rate": 0.00010474366906409357, + "loss": 0.0, + "step": 39229 + }, + { + "epoch": 3.6605393300363906, + "grad_norm": NaN, + "learning_rate": 0.00010473645780877287, + "loss": 0.0, + "step": 39230 + }, + { + "epoch": 3.660632639731268, + "grad_norm": NaN, + "learning_rate": 0.00010472924666854404, + "loss": 0.0, + "step": 39231 + }, + { + "epoch": 3.6607259494261455, + "grad_norm": NaN, + "learning_rate": 0.00010472203564342527, + "loss": 0.0, + "step": 39232 + }, + { + "epoch": 3.6608192591210225, + "grad_norm": NaN, + "learning_rate": 0.00010471482473343492, + "loss": 0.0, + "step": 39233 + }, + { + "epoch": 3.6609125688159, + "grad_norm": NaN, + "learning_rate": 0.00010470761393859142, + "loss": 0.0, + "step": 39234 + }, + { + "epoch": 3.6610058785107773, + "grad_norm": NaN, + "learning_rate": 0.00010470040325891303, + "loss": 0.0, + "step": 39235 + }, + { + "epoch": 3.6610991882056547, + "grad_norm": NaN, + "learning_rate": 0.00010469319269441812, + "loss": 0.0, + "step": 39236 + }, + { + "epoch": 3.661192497900532, + "grad_norm": NaN, + "learning_rate": 0.00010468598224512492, + "loss": 0.0, + "step": 39237 + }, + { + "epoch": 3.661285807595409, + "grad_norm": NaN, + "learning_rate": 0.00010467877191105189, + "loss": 0.0, + "step": 39238 + }, + { + "epoch": 3.6613791172902865, + "grad_norm": NaN, + "learning_rate": 0.00010467156169221733, + "loss": 0.0, + "step": 39239 + }, + { + "epoch": 3.6614724269851635, + "grad_norm": NaN, + "learning_rate": 0.00010466435158863952, + "loss": 0.0, + "step": 39240 + }, + { + "epoch": 3.661565736680041, + "grad_norm": NaN, + "learning_rate": 0.00010465714160033685, + "loss": 0.0, + "step": 39241 + }, + { + "epoch": 3.6616590463749183, + "grad_norm": NaN, + "learning_rate": 0.00010464993172732768, + "loss": 0.0, + "step": 39242 + }, + { + "epoch": 3.6617523560697958, + "grad_norm": NaN, + "learning_rate": 0.00010464272196963024, + "loss": 0.0, + "step": 39243 + }, + { + "epoch": 3.661845665764673, + "grad_norm": NaN, + "learning_rate": 0.00010463551232726293, + "loss": 0.0, + "step": 39244 + }, + { + "epoch": 3.66193897545955, + "grad_norm": NaN, + "learning_rate": 0.00010462830280024414, + "loss": 0.0, + "step": 39245 + }, + { + "epoch": 3.6620322851544276, + "grad_norm": NaN, + "learning_rate": 0.00010462109338859205, + "loss": 0.0, + "step": 39246 + }, + { + "epoch": 3.662125594849305, + "grad_norm": NaN, + "learning_rate": 0.0001046138840923251, + "loss": 0.0, + "step": 39247 + }, + { + "epoch": 3.662218904544182, + "grad_norm": NaN, + "learning_rate": 0.00010460667491146167, + "loss": 0.0, + "step": 39248 + }, + { + "epoch": 3.6623122142390594, + "grad_norm": NaN, + "learning_rate": 0.00010459946584601993, + "loss": 0.0, + "step": 39249 + }, + { + "epoch": 3.662405523933937, + "grad_norm": NaN, + "learning_rate": 0.0001045922568960183, + "loss": 0.0, + "step": 39250 + }, + { + "epoch": 3.6624988336288142, + "grad_norm": NaN, + "learning_rate": 0.00010458504806147516, + "loss": 0.0, + "step": 39251 + }, + { + "epoch": 3.662592143323691, + "grad_norm": NaN, + "learning_rate": 0.0001045778393424088, + "loss": 0.0, + "step": 39252 + }, + { + "epoch": 3.6626854530185686, + "grad_norm": NaN, + "learning_rate": 0.00010457063073883746, + "loss": 0.0, + "step": 39253 + }, + { + "epoch": 3.662778762713446, + "grad_norm": NaN, + "learning_rate": 0.00010456342225077964, + "loss": 0.0, + "step": 39254 + }, + { + "epoch": 3.662872072408323, + "grad_norm": NaN, + "learning_rate": 0.00010455621387825355, + "loss": 0.0, + "step": 39255 + }, + { + "epoch": 3.6629653821032004, + "grad_norm": NaN, + "learning_rate": 0.00010454900562127751, + "loss": 0.0, + "step": 39256 + }, + { + "epoch": 3.663058691798078, + "grad_norm": NaN, + "learning_rate": 0.00010454179747986992, + "loss": 0.0, + "step": 39257 + }, + { + "epoch": 3.6631520014929553, + "grad_norm": NaN, + "learning_rate": 0.00010453458945404906, + "loss": 0.0, + "step": 39258 + }, + { + "epoch": 3.6632453111878323, + "grad_norm": NaN, + "learning_rate": 0.00010452738154383324, + "loss": 0.0, + "step": 39259 + }, + { + "epoch": 3.6633386208827097, + "grad_norm": NaN, + "learning_rate": 0.0001045201737492409, + "loss": 0.0, + "step": 39260 + }, + { + "epoch": 3.663431930577587, + "grad_norm": NaN, + "learning_rate": 0.00010451296607029024, + "loss": 0.0, + "step": 39261 + }, + { + "epoch": 3.663525240272464, + "grad_norm": NaN, + "learning_rate": 0.0001045057585069996, + "loss": 0.0, + "step": 39262 + }, + { + "epoch": 3.6636185499673415, + "grad_norm": NaN, + "learning_rate": 0.00010449855105938741, + "loss": 0.0, + "step": 39263 + }, + { + "epoch": 3.663711859662219, + "grad_norm": NaN, + "learning_rate": 0.00010449134372747189, + "loss": 0.0, + "step": 39264 + }, + { + "epoch": 3.6638051693570963, + "grad_norm": NaN, + "learning_rate": 0.00010448413651127136, + "loss": 0.0, + "step": 39265 + }, + { + "epoch": 3.6638984790519737, + "grad_norm": NaN, + "learning_rate": 0.00010447692941080427, + "loss": 0.0, + "step": 39266 + }, + { + "epoch": 3.6639917887468507, + "grad_norm": NaN, + "learning_rate": 0.00010446972242608882, + "loss": 0.0, + "step": 39267 + }, + { + "epoch": 3.664085098441728, + "grad_norm": NaN, + "learning_rate": 0.00010446251555714334, + "loss": 0.0, + "step": 39268 + }, + { + "epoch": 3.6641784081366056, + "grad_norm": NaN, + "learning_rate": 0.00010445530880398626, + "loss": 0.0, + "step": 39269 + }, + { + "epoch": 3.6642717178314825, + "grad_norm": NaN, + "learning_rate": 0.00010444810216663582, + "loss": 0.0, + "step": 39270 + }, + { + "epoch": 3.66436502752636, + "grad_norm": NaN, + "learning_rate": 0.00010444089564511032, + "loss": 0.0, + "step": 39271 + }, + { + "epoch": 3.6644583372212374, + "grad_norm": NaN, + "learning_rate": 0.0001044336892394282, + "loss": 0.0, + "step": 39272 + }, + { + "epoch": 3.664551646916115, + "grad_norm": NaN, + "learning_rate": 0.00010442648294960768, + "loss": 0.0, + "step": 39273 + }, + { + "epoch": 3.6646449566109918, + "grad_norm": NaN, + "learning_rate": 0.00010441927677566707, + "loss": 0.0, + "step": 39274 + }, + { + "epoch": 3.664738266305869, + "grad_norm": NaN, + "learning_rate": 0.00010441207071762482, + "loss": 0.0, + "step": 39275 + }, + { + "epoch": 3.6648315760007466, + "grad_norm": NaN, + "learning_rate": 0.00010440486477549912, + "loss": 0.0, + "step": 39276 + }, + { + "epoch": 3.6649248856956236, + "grad_norm": NaN, + "learning_rate": 0.00010439765894930831, + "loss": 0.0, + "step": 39277 + }, + { + "epoch": 3.665018195390501, + "grad_norm": NaN, + "learning_rate": 0.00010439045323907082, + "loss": 0.0, + "step": 39278 + }, + { + "epoch": 3.6651115050853784, + "grad_norm": NaN, + "learning_rate": 0.00010438324764480486, + "loss": 0.0, + "step": 39279 + }, + { + "epoch": 3.665204814780256, + "grad_norm": NaN, + "learning_rate": 0.0001043760421665288, + "loss": 0.0, + "step": 39280 + }, + { + "epoch": 3.665298124475133, + "grad_norm": NaN, + "learning_rate": 0.00010436883680426092, + "loss": 0.0, + "step": 39281 + }, + { + "epoch": 3.6653914341700102, + "grad_norm": NaN, + "learning_rate": 0.00010436163155801957, + "loss": 0.0, + "step": 39282 + }, + { + "epoch": 3.6654847438648877, + "grad_norm": NaN, + "learning_rate": 0.00010435442642782313, + "loss": 0.0, + "step": 39283 + }, + { + "epoch": 3.6655780535597646, + "grad_norm": NaN, + "learning_rate": 0.0001043472214136898, + "loss": 0.0, + "step": 39284 + }, + { + "epoch": 3.665671363254642, + "grad_norm": NaN, + "learning_rate": 0.00010434001651563794, + "loss": 0.0, + "step": 39285 + }, + { + "epoch": 3.6657646729495195, + "grad_norm": NaN, + "learning_rate": 0.00010433281173368601, + "loss": 0.0, + "step": 39286 + }, + { + "epoch": 3.665857982644397, + "grad_norm": NaN, + "learning_rate": 0.00010432560706785211, + "loss": 0.0, + "step": 39287 + }, + { + "epoch": 3.6659512923392743, + "grad_norm": NaN, + "learning_rate": 0.00010431840251815466, + "loss": 0.0, + "step": 39288 + }, + { + "epoch": 3.6660446020341513, + "grad_norm": NaN, + "learning_rate": 0.00010431119808461205, + "loss": 0.0, + "step": 39289 + }, + { + "epoch": 3.6661379117290287, + "grad_norm": NaN, + "learning_rate": 0.0001043039937672425, + "loss": 0.0, + "step": 39290 + }, + { + "epoch": 3.6662312214239057, + "grad_norm": NaN, + "learning_rate": 0.0001042967895660643, + "loss": 0.0, + "step": 39291 + }, + { + "epoch": 3.666324531118783, + "grad_norm": NaN, + "learning_rate": 0.00010428958548109591, + "loss": 0.0, + "step": 39292 + }, + { + "epoch": 3.6664178408136605, + "grad_norm": NaN, + "learning_rate": 0.00010428238151235554, + "loss": 0.0, + "step": 39293 + }, + { + "epoch": 3.666511150508538, + "grad_norm": NaN, + "learning_rate": 0.00010427517765986149, + "loss": 0.0, + "step": 39294 + }, + { + "epoch": 3.6666044602034153, + "grad_norm": NaN, + "learning_rate": 0.00010426797392363217, + "loss": 0.0, + "step": 39295 + }, + { + "epoch": 3.6666977698982923, + "grad_norm": NaN, + "learning_rate": 0.00010426077030368584, + "loss": 0.0, + "step": 39296 + }, + { + "epoch": 3.6667910795931697, + "grad_norm": NaN, + "learning_rate": 0.00010425356680004076, + "loss": 0.0, + "step": 39297 + }, + { + "epoch": 3.666884389288047, + "grad_norm": NaN, + "learning_rate": 0.0001042463634127154, + "loss": 0.0, + "step": 39298 + }, + { + "epoch": 3.666977698982924, + "grad_norm": NaN, + "learning_rate": 0.00010423916014172792, + "loss": 0.0, + "step": 39299 + }, + { + "epoch": 3.6670710086778016, + "grad_norm": NaN, + "learning_rate": 0.00010423195698709669, + "loss": 0.0, + "step": 39300 + }, + { + "epoch": 3.667164318372679, + "grad_norm": NaN, + "learning_rate": 0.00010422475394884008, + "loss": 0.0, + "step": 39301 + }, + { + "epoch": 3.6672576280675564, + "grad_norm": NaN, + "learning_rate": 0.00010421755102697636, + "loss": 0.0, + "step": 39302 + }, + { + "epoch": 3.6673509377624334, + "grad_norm": NaN, + "learning_rate": 0.00010421034822152381, + "loss": 0.0, + "step": 39303 + }, + { + "epoch": 3.667444247457311, + "grad_norm": NaN, + "learning_rate": 0.00010420314553250084, + "loss": 0.0, + "step": 39304 + }, + { + "epoch": 3.667537557152188, + "grad_norm": NaN, + "learning_rate": 0.00010419594295992565, + "loss": 0.0, + "step": 39305 + }, + { + "epoch": 3.667630866847065, + "grad_norm": NaN, + "learning_rate": 0.0001041887405038166, + "loss": 0.0, + "step": 39306 + }, + { + "epoch": 3.6677241765419426, + "grad_norm": NaN, + "learning_rate": 0.00010418153816419208, + "loss": 0.0, + "step": 39307 + }, + { + "epoch": 3.66781748623682, + "grad_norm": NaN, + "learning_rate": 0.0001041743359410703, + "loss": 0.0, + "step": 39308 + }, + { + "epoch": 3.6679107959316974, + "grad_norm": NaN, + "learning_rate": 0.00010416713383446957, + "loss": 0.0, + "step": 39309 + }, + { + "epoch": 3.668004105626575, + "grad_norm": NaN, + "learning_rate": 0.00010415993184440831, + "loss": 0.0, + "step": 39310 + }, + { + "epoch": 3.668097415321452, + "grad_norm": NaN, + "learning_rate": 0.00010415272997090471, + "loss": 0.0, + "step": 39311 + }, + { + "epoch": 3.6681907250163293, + "grad_norm": NaN, + "learning_rate": 0.00010414552821397714, + "loss": 0.0, + "step": 39312 + }, + { + "epoch": 3.6682840347112062, + "grad_norm": NaN, + "learning_rate": 0.00010413832657364397, + "loss": 0.0, + "step": 39313 + }, + { + "epoch": 3.6683773444060837, + "grad_norm": NaN, + "learning_rate": 0.0001041311250499234, + "loss": 0.0, + "step": 39314 + }, + { + "epoch": 3.668470654100961, + "grad_norm": NaN, + "learning_rate": 0.00010412392364283374, + "loss": 0.0, + "step": 39315 + }, + { + "epoch": 3.6685639637958385, + "grad_norm": NaN, + "learning_rate": 0.00010411672235239345, + "loss": 0.0, + "step": 39316 + }, + { + "epoch": 3.668657273490716, + "grad_norm": NaN, + "learning_rate": 0.00010410952117862072, + "loss": 0.0, + "step": 39317 + }, + { + "epoch": 3.668750583185593, + "grad_norm": NaN, + "learning_rate": 0.00010410232012153384, + "loss": 0.0, + "step": 39318 + }, + { + "epoch": 3.6688438928804703, + "grad_norm": NaN, + "learning_rate": 0.00010409511918115123, + "loss": 0.0, + "step": 39319 + }, + { + "epoch": 3.6689372025753477, + "grad_norm": NaN, + "learning_rate": 0.00010408791835749108, + "loss": 0.0, + "step": 39320 + }, + { + "epoch": 3.6690305122702247, + "grad_norm": NaN, + "learning_rate": 0.00010408071765057176, + "loss": 0.0, + "step": 39321 + }, + { + "epoch": 3.669123821965102, + "grad_norm": NaN, + "learning_rate": 0.0001040735170604116, + "loss": 0.0, + "step": 39322 + }, + { + "epoch": 3.6692171316599795, + "grad_norm": NaN, + "learning_rate": 0.00010406631658702884, + "loss": 0.0, + "step": 39323 + }, + { + "epoch": 3.669310441354857, + "grad_norm": NaN, + "learning_rate": 0.00010405911623044193, + "loss": 0.0, + "step": 39324 + }, + { + "epoch": 3.669403751049734, + "grad_norm": NaN, + "learning_rate": 0.00010405191599066895, + "loss": 0.0, + "step": 39325 + }, + { + "epoch": 3.6694970607446113, + "grad_norm": NaN, + "learning_rate": 0.00010404471586772836, + "loss": 0.0, + "step": 39326 + }, + { + "epoch": 3.6695903704394888, + "grad_norm": NaN, + "learning_rate": 0.00010403751586163851, + "loss": 0.0, + "step": 39327 + }, + { + "epoch": 3.6696836801343657, + "grad_norm": NaN, + "learning_rate": 0.0001040303159724176, + "loss": 0.0, + "step": 39328 + }, + { + "epoch": 3.669776989829243, + "grad_norm": NaN, + "learning_rate": 0.00010402311620008397, + "loss": 0.0, + "step": 39329 + }, + { + "epoch": 3.6698702995241206, + "grad_norm": NaN, + "learning_rate": 0.00010401591654465597, + "loss": 0.0, + "step": 39330 + }, + { + "epoch": 3.669963609218998, + "grad_norm": NaN, + "learning_rate": 0.00010400871700615186, + "loss": 0.0, + "step": 39331 + }, + { + "epoch": 3.6700569189138754, + "grad_norm": NaN, + "learning_rate": 0.00010400151758458992, + "loss": 0.0, + "step": 39332 + }, + { + "epoch": 3.6701502286087524, + "grad_norm": NaN, + "learning_rate": 0.00010399431827998855, + "loss": 0.0, + "step": 39333 + }, + { + "epoch": 3.67024353830363, + "grad_norm": NaN, + "learning_rate": 0.00010398711909236598, + "loss": 0.0, + "step": 39334 + }, + { + "epoch": 3.670336847998507, + "grad_norm": NaN, + "learning_rate": 0.00010397992002174052, + "loss": 0.0, + "step": 39335 + }, + { + "epoch": 3.670430157693384, + "grad_norm": NaN, + "learning_rate": 0.00010397272106813054, + "loss": 0.0, + "step": 39336 + }, + { + "epoch": 3.6705234673882616, + "grad_norm": NaN, + "learning_rate": 0.00010396552223155426, + "loss": 0.0, + "step": 39337 + }, + { + "epoch": 3.670616777083139, + "grad_norm": NaN, + "learning_rate": 0.00010395832351202999, + "loss": 0.0, + "step": 39338 + }, + { + "epoch": 3.6707100867780165, + "grad_norm": NaN, + "learning_rate": 0.00010395112490957613, + "loss": 0.0, + "step": 39339 + }, + { + "epoch": 3.6708033964728934, + "grad_norm": NaN, + "learning_rate": 0.00010394392642421088, + "loss": 0.0, + "step": 39340 + }, + { + "epoch": 3.670896706167771, + "grad_norm": NaN, + "learning_rate": 0.00010393672805595256, + "loss": 0.0, + "step": 39341 + }, + { + "epoch": 3.6709900158626483, + "grad_norm": NaN, + "learning_rate": 0.00010392952980481956, + "loss": 0.0, + "step": 39342 + }, + { + "epoch": 3.6710833255575253, + "grad_norm": NaN, + "learning_rate": 0.00010392233167083008, + "loss": 0.0, + "step": 39343 + }, + { + "epoch": 3.6711766352524027, + "grad_norm": NaN, + "learning_rate": 0.00010391513365400244, + "loss": 0.0, + "step": 39344 + }, + { + "epoch": 3.67126994494728, + "grad_norm": NaN, + "learning_rate": 0.00010390793575435502, + "loss": 0.0, + "step": 39345 + }, + { + "epoch": 3.6713632546421575, + "grad_norm": NaN, + "learning_rate": 0.00010390073797190603, + "loss": 0.0, + "step": 39346 + }, + { + "epoch": 3.6714565643370345, + "grad_norm": NaN, + "learning_rate": 0.00010389354030667378, + "loss": 0.0, + "step": 39347 + }, + { + "epoch": 3.671549874031912, + "grad_norm": NaN, + "learning_rate": 0.00010388634275867665, + "loss": 0.0, + "step": 39348 + }, + { + "epoch": 3.6716431837267893, + "grad_norm": NaN, + "learning_rate": 0.00010387914532793289, + "loss": 0.0, + "step": 39349 + }, + { + "epoch": 3.6717364934216663, + "grad_norm": NaN, + "learning_rate": 0.00010387194801446075, + "loss": 0.0, + "step": 39350 + }, + { + "epoch": 3.6718298031165437, + "grad_norm": NaN, + "learning_rate": 0.00010386475081827865, + "loss": 0.0, + "step": 39351 + }, + { + "epoch": 3.671923112811421, + "grad_norm": NaN, + "learning_rate": 0.00010385755373940479, + "loss": 0.0, + "step": 39352 + }, + { + "epoch": 3.6720164225062986, + "grad_norm": NaN, + "learning_rate": 0.00010385035677785746, + "loss": 0.0, + "step": 39353 + }, + { + "epoch": 3.6721097322011755, + "grad_norm": NaN, + "learning_rate": 0.00010384315993365508, + "loss": 0.0, + "step": 39354 + }, + { + "epoch": 3.672203041896053, + "grad_norm": NaN, + "learning_rate": 0.00010383596320681582, + "loss": 0.0, + "step": 39355 + }, + { + "epoch": 3.6722963515909304, + "grad_norm": NaN, + "learning_rate": 0.00010382876659735803, + "loss": 0.0, + "step": 39356 + }, + { + "epoch": 3.6723896612858073, + "grad_norm": NaN, + "learning_rate": 0.00010382157010530005, + "loss": 0.0, + "step": 39357 + }, + { + "epoch": 3.6724829709806848, + "grad_norm": NaN, + "learning_rate": 0.00010381437373066011, + "loss": 0.0, + "step": 39358 + }, + { + "epoch": 3.672576280675562, + "grad_norm": NaN, + "learning_rate": 0.00010380717747345651, + "loss": 0.0, + "step": 39359 + }, + { + "epoch": 3.6726695903704396, + "grad_norm": NaN, + "learning_rate": 0.00010379998133370764, + "loss": 0.0, + "step": 39360 + }, + { + "epoch": 3.672762900065317, + "grad_norm": NaN, + "learning_rate": 0.00010379278531143165, + "loss": 0.0, + "step": 39361 + }, + { + "epoch": 3.672856209760194, + "grad_norm": NaN, + "learning_rate": 0.00010378558940664698, + "loss": 0.0, + "step": 39362 + }, + { + "epoch": 3.6729495194550714, + "grad_norm": NaN, + "learning_rate": 0.00010377839361937187, + "loss": 0.0, + "step": 39363 + }, + { + "epoch": 3.673042829149949, + "grad_norm": NaN, + "learning_rate": 0.00010377119794962457, + "loss": 0.0, + "step": 39364 + }, + { + "epoch": 3.673136138844826, + "grad_norm": NaN, + "learning_rate": 0.00010376400239742344, + "loss": 0.0, + "step": 39365 + }, + { + "epoch": 3.6732294485397032, + "grad_norm": NaN, + "learning_rate": 0.00010375680696278678, + "loss": 0.0, + "step": 39366 + }, + { + "epoch": 3.6733227582345807, + "grad_norm": NaN, + "learning_rate": 0.00010374961164573283, + "loss": 0.0, + "step": 39367 + }, + { + "epoch": 3.673416067929458, + "grad_norm": NaN, + "learning_rate": 0.00010374241644627996, + "loss": 0.0, + "step": 39368 + }, + { + "epoch": 3.673509377624335, + "grad_norm": NaN, + "learning_rate": 0.0001037352213644464, + "loss": 0.0, + "step": 39369 + }, + { + "epoch": 3.6736026873192125, + "grad_norm": NaN, + "learning_rate": 0.00010372802640025041, + "loss": 0.0, + "step": 39370 + }, + { + "epoch": 3.67369599701409, + "grad_norm": NaN, + "learning_rate": 0.00010372083155371041, + "loss": 0.0, + "step": 39371 + }, + { + "epoch": 3.673789306708967, + "grad_norm": NaN, + "learning_rate": 0.00010371363682484462, + "loss": 0.0, + "step": 39372 + }, + { + "epoch": 3.6738826164038443, + "grad_norm": NaN, + "learning_rate": 0.00010370644221367129, + "loss": 0.0, + "step": 39373 + }, + { + "epoch": 3.6739759260987217, + "grad_norm": NaN, + "learning_rate": 0.00010369924772020883, + "loss": 0.0, + "step": 39374 + }, + { + "epoch": 3.674069235793599, + "grad_norm": NaN, + "learning_rate": 0.00010369205334447544, + "loss": 0.0, + "step": 39375 + }, + { + "epoch": 3.674162545488476, + "grad_norm": NaN, + "learning_rate": 0.00010368485908648942, + "loss": 0.0, + "step": 39376 + }, + { + "epoch": 3.6742558551833535, + "grad_norm": NaN, + "learning_rate": 0.00010367766494626914, + "loss": 0.0, + "step": 39377 + }, + { + "epoch": 3.674349164878231, + "grad_norm": NaN, + "learning_rate": 0.0001036704709238328, + "loss": 0.0, + "step": 39378 + }, + { + "epoch": 3.674442474573108, + "grad_norm": NaN, + "learning_rate": 0.0001036632770191987, + "loss": 0.0, + "step": 39379 + }, + { + "epoch": 3.6745357842679853, + "grad_norm": NaN, + "learning_rate": 0.00010365608323238523, + "loss": 0.0, + "step": 39380 + }, + { + "epoch": 3.6746290939628627, + "grad_norm": NaN, + "learning_rate": 0.00010364888956341056, + "loss": 0.0, + "step": 39381 + }, + { + "epoch": 3.67472240365774, + "grad_norm": NaN, + "learning_rate": 0.00010364169601229301, + "loss": 0.0, + "step": 39382 + }, + { + "epoch": 3.6748157133526176, + "grad_norm": NaN, + "learning_rate": 0.00010363450257905097, + "loss": 0.0, + "step": 39383 + }, + { + "epoch": 3.6749090230474946, + "grad_norm": NaN, + "learning_rate": 0.00010362730926370261, + "loss": 0.0, + "step": 39384 + }, + { + "epoch": 3.675002332742372, + "grad_norm": NaN, + "learning_rate": 0.00010362011606626623, + "loss": 0.0, + "step": 39385 + }, + { + "epoch": 3.675095642437249, + "grad_norm": NaN, + "learning_rate": 0.00010361292298676023, + "loss": 0.0, + "step": 39386 + }, + { + "epoch": 3.6751889521321264, + "grad_norm": NaN, + "learning_rate": 0.00010360573002520281, + "loss": 0.0, + "step": 39387 + }, + { + "epoch": 3.675282261827004, + "grad_norm": NaN, + "learning_rate": 0.00010359853718161221, + "loss": 0.0, + "step": 39388 + }, + { + "epoch": 3.675375571521881, + "grad_norm": NaN, + "learning_rate": 0.00010359134445600688, + "loss": 0.0, + "step": 39389 + }, + { + "epoch": 3.6754688812167586, + "grad_norm": NaN, + "learning_rate": 0.00010358415184840496, + "loss": 0.0, + "step": 39390 + }, + { + "epoch": 3.6755621909116356, + "grad_norm": NaN, + "learning_rate": 0.00010357695935882478, + "loss": 0.0, + "step": 39391 + }, + { + "epoch": 3.675655500606513, + "grad_norm": NaN, + "learning_rate": 0.00010356976698728469, + "loss": 0.0, + "step": 39392 + }, + { + "epoch": 3.6757488103013904, + "grad_norm": NaN, + "learning_rate": 0.0001035625747338029, + "loss": 0.0, + "step": 39393 + }, + { + "epoch": 3.6758421199962674, + "grad_norm": NaN, + "learning_rate": 0.00010355538259839771, + "loss": 0.0, + "step": 39394 + }, + { + "epoch": 3.675935429691145, + "grad_norm": NaN, + "learning_rate": 0.00010354819058108746, + "loss": 0.0, + "step": 39395 + }, + { + "epoch": 3.6760287393860223, + "grad_norm": NaN, + "learning_rate": 0.00010354099868189036, + "loss": 0.0, + "step": 39396 + }, + { + "epoch": 3.6761220490808997, + "grad_norm": NaN, + "learning_rate": 0.00010353380690082476, + "loss": 0.0, + "step": 39397 + }, + { + "epoch": 3.6762153587757767, + "grad_norm": NaN, + "learning_rate": 0.00010352661523790894, + "loss": 0.0, + "step": 39398 + }, + { + "epoch": 3.676308668470654, + "grad_norm": NaN, + "learning_rate": 0.00010351942369316113, + "loss": 0.0, + "step": 39399 + }, + { + "epoch": 3.6764019781655315, + "grad_norm": NaN, + "learning_rate": 0.00010351223226659968, + "loss": 0.0, + "step": 39400 + }, + { + "epoch": 3.6764952878604085, + "grad_norm": NaN, + "learning_rate": 0.00010350504095824289, + "loss": 0.0, + "step": 39401 + }, + { + "epoch": 3.676588597555286, + "grad_norm": NaN, + "learning_rate": 0.00010349784976810895, + "loss": 0.0, + "step": 39402 + }, + { + "epoch": 3.6766819072501633, + "grad_norm": NaN, + "learning_rate": 0.00010349065869621622, + "loss": 0.0, + "step": 39403 + }, + { + "epoch": 3.6767752169450407, + "grad_norm": NaN, + "learning_rate": 0.000103483467742583, + "loss": 0.0, + "step": 39404 + }, + { + "epoch": 3.676868526639918, + "grad_norm": NaN, + "learning_rate": 0.0001034762769072275, + "loss": 0.0, + "step": 39405 + }, + { + "epoch": 3.676961836334795, + "grad_norm": NaN, + "learning_rate": 0.00010346908619016806, + "loss": 0.0, + "step": 39406 + }, + { + "epoch": 3.6770551460296725, + "grad_norm": NaN, + "learning_rate": 0.00010346189559142298, + "loss": 0.0, + "step": 39407 + }, + { + "epoch": 3.6771484557245495, + "grad_norm": NaN, + "learning_rate": 0.00010345470511101046, + "loss": 0.0, + "step": 39408 + }, + { + "epoch": 3.677241765419427, + "grad_norm": NaN, + "learning_rate": 0.00010344751474894887, + "loss": 0.0, + "step": 39409 + }, + { + "epoch": 3.6773350751143044, + "grad_norm": NaN, + "learning_rate": 0.00010344032450525649, + "loss": 0.0, + "step": 39410 + }, + { + "epoch": 3.6774283848091818, + "grad_norm": NaN, + "learning_rate": 0.00010343313437995151, + "loss": 0.0, + "step": 39411 + }, + { + "epoch": 3.677521694504059, + "grad_norm": NaN, + "learning_rate": 0.0001034259443730523, + "loss": 0.0, + "step": 39412 + }, + { + "epoch": 3.677615004198936, + "grad_norm": NaN, + "learning_rate": 0.00010341875448457716, + "loss": 0.0, + "step": 39413 + }, + { + "epoch": 3.6777083138938136, + "grad_norm": NaN, + "learning_rate": 0.00010341156471454425, + "loss": 0.0, + "step": 39414 + }, + { + "epoch": 3.677801623588691, + "grad_norm": NaN, + "learning_rate": 0.000103404375062972, + "loss": 0.0, + "step": 39415 + }, + { + "epoch": 3.677894933283568, + "grad_norm": NaN, + "learning_rate": 0.0001033971855298786, + "loss": 0.0, + "step": 39416 + }, + { + "epoch": 3.6779882429784454, + "grad_norm": NaN, + "learning_rate": 0.00010338999611528231, + "loss": 0.0, + "step": 39417 + }, + { + "epoch": 3.678081552673323, + "grad_norm": NaN, + "learning_rate": 0.0001033828068192015, + "loss": 0.0, + "step": 39418 + }, + { + "epoch": 3.6781748623682002, + "grad_norm": NaN, + "learning_rate": 0.0001033756176416544, + "loss": 0.0, + "step": 39419 + }, + { + "epoch": 3.678268172063077, + "grad_norm": NaN, + "learning_rate": 0.00010336842858265925, + "loss": 0.0, + "step": 39420 + }, + { + "epoch": 3.6783614817579546, + "grad_norm": NaN, + "learning_rate": 0.00010336123964223444, + "loss": 0.0, + "step": 39421 + }, + { + "epoch": 3.678454791452832, + "grad_norm": NaN, + "learning_rate": 0.00010335405082039814, + "loss": 0.0, + "step": 39422 + }, + { + "epoch": 3.678548101147709, + "grad_norm": NaN, + "learning_rate": 0.00010334686211716867, + "loss": 0.0, + "step": 39423 + }, + { + "epoch": 3.6786414108425864, + "grad_norm": NaN, + "learning_rate": 0.00010333967353256435, + "loss": 0.0, + "step": 39424 + }, + { + "epoch": 3.678734720537464, + "grad_norm": NaN, + "learning_rate": 0.00010333248506660336, + "loss": 0.0, + "step": 39425 + }, + { + "epoch": 3.6788280302323413, + "grad_norm": NaN, + "learning_rate": 0.00010332529671930405, + "loss": 0.0, + "step": 39426 + }, + { + "epoch": 3.6789213399272187, + "grad_norm": NaN, + "learning_rate": 0.00010331810849068473, + "loss": 0.0, + "step": 39427 + }, + { + "epoch": 3.6790146496220957, + "grad_norm": NaN, + "learning_rate": 0.00010331092038076361, + "loss": 0.0, + "step": 39428 + }, + { + "epoch": 3.679107959316973, + "grad_norm": NaN, + "learning_rate": 0.00010330373238955894, + "loss": 0.0, + "step": 39429 + }, + { + "epoch": 3.67920126901185, + "grad_norm": NaN, + "learning_rate": 0.00010329654451708911, + "loss": 0.0, + "step": 39430 + }, + { + "epoch": 3.6792945787067275, + "grad_norm": NaN, + "learning_rate": 0.00010328935676337231, + "loss": 0.0, + "step": 39431 + }, + { + "epoch": 3.679387888401605, + "grad_norm": NaN, + "learning_rate": 0.00010328216912842682, + "loss": 0.0, + "step": 39432 + }, + { + "epoch": 3.6794811980964823, + "grad_norm": NaN, + "learning_rate": 0.00010327498161227097, + "loss": 0.0, + "step": 39433 + }, + { + "epoch": 3.6795745077913597, + "grad_norm": NaN, + "learning_rate": 0.00010326779421492295, + "loss": 0.0, + "step": 39434 + }, + { + "epoch": 3.6796678174862367, + "grad_norm": NaN, + "learning_rate": 0.00010326060693640113, + "loss": 0.0, + "step": 39435 + }, + { + "epoch": 3.679761127181114, + "grad_norm": NaN, + "learning_rate": 0.00010325341977672377, + "loss": 0.0, + "step": 39436 + }, + { + "epoch": 3.6798544368759916, + "grad_norm": NaN, + "learning_rate": 0.00010324623273590903, + "loss": 0.0, + "step": 39437 + }, + { + "epoch": 3.6799477465708685, + "grad_norm": NaN, + "learning_rate": 0.00010323904581397531, + "loss": 0.0, + "step": 39438 + }, + { + "epoch": 3.680041056265746, + "grad_norm": NaN, + "learning_rate": 0.0001032318590109409, + "loss": 0.0, + "step": 39439 + }, + { + "epoch": 3.6801343659606234, + "grad_norm": NaN, + "learning_rate": 0.00010322467232682393, + "loss": 0.0, + "step": 39440 + }, + { + "epoch": 3.680227675655501, + "grad_norm": NaN, + "learning_rate": 0.00010321748576164281, + "loss": 0.0, + "step": 39441 + }, + { + "epoch": 3.6803209853503778, + "grad_norm": NaN, + "learning_rate": 0.0001032102993154158, + "loss": 0.0, + "step": 39442 + }, + { + "epoch": 3.680414295045255, + "grad_norm": NaN, + "learning_rate": 0.00010320311298816106, + "loss": 0.0, + "step": 39443 + }, + { + "epoch": 3.6805076047401326, + "grad_norm": NaN, + "learning_rate": 0.00010319592677989697, + "loss": 0.0, + "step": 39444 + }, + { + "epoch": 3.6806009144350096, + "grad_norm": NaN, + "learning_rate": 0.00010318874069064183, + "loss": 0.0, + "step": 39445 + }, + { + "epoch": 3.680694224129887, + "grad_norm": NaN, + "learning_rate": 0.00010318155472041376, + "loss": 0.0, + "step": 39446 + }, + { + "epoch": 3.6807875338247644, + "grad_norm": NaN, + "learning_rate": 0.00010317436886923119, + "loss": 0.0, + "step": 39447 + }, + { + "epoch": 3.680880843519642, + "grad_norm": NaN, + "learning_rate": 0.00010316718313711234, + "loss": 0.0, + "step": 39448 + }, + { + "epoch": 3.6809741532145193, + "grad_norm": NaN, + "learning_rate": 0.00010315999752407542, + "loss": 0.0, + "step": 39449 + }, + { + "epoch": 3.6810674629093962, + "grad_norm": NaN, + "learning_rate": 0.00010315281203013879, + "loss": 0.0, + "step": 39450 + }, + { + "epoch": 3.6811607726042737, + "grad_norm": NaN, + "learning_rate": 0.0001031456266553207, + "loss": 0.0, + "step": 39451 + }, + { + "epoch": 3.6812540822991506, + "grad_norm": NaN, + "learning_rate": 0.00010313844139963933, + "loss": 0.0, + "step": 39452 + }, + { + "epoch": 3.681347391994028, + "grad_norm": NaN, + "learning_rate": 0.00010313125626311306, + "loss": 0.0, + "step": 39453 + }, + { + "epoch": 3.6814407016889055, + "grad_norm": NaN, + "learning_rate": 0.00010312407124576017, + "loss": 0.0, + "step": 39454 + }, + { + "epoch": 3.681534011383783, + "grad_norm": NaN, + "learning_rate": 0.0001031168863475988, + "loss": 0.0, + "step": 39455 + }, + { + "epoch": 3.6816273210786603, + "grad_norm": NaN, + "learning_rate": 0.00010310970156864732, + "loss": 0.0, + "step": 39456 + }, + { + "epoch": 3.6817206307735373, + "grad_norm": NaN, + "learning_rate": 0.00010310251690892403, + "loss": 0.0, + "step": 39457 + }, + { + "epoch": 3.6818139404684147, + "grad_norm": NaN, + "learning_rate": 0.0001030953323684471, + "loss": 0.0, + "step": 39458 + }, + { + "epoch": 3.681907250163292, + "grad_norm": NaN, + "learning_rate": 0.00010308814794723488, + "loss": 0.0, + "step": 39459 + }, + { + "epoch": 3.682000559858169, + "grad_norm": NaN, + "learning_rate": 0.00010308096364530558, + "loss": 0.0, + "step": 39460 + }, + { + "epoch": 3.6820938695530465, + "grad_norm": NaN, + "learning_rate": 0.00010307377946267749, + "loss": 0.0, + "step": 39461 + }, + { + "epoch": 3.682187179247924, + "grad_norm": NaN, + "learning_rate": 0.00010306659539936889, + "loss": 0.0, + "step": 39462 + }, + { + "epoch": 3.6822804889428014, + "grad_norm": NaN, + "learning_rate": 0.00010305941145539805, + "loss": 0.0, + "step": 39463 + }, + { + "epoch": 3.6823737986376783, + "grad_norm": NaN, + "learning_rate": 0.00010305222763078315, + "loss": 0.0, + "step": 39464 + }, + { + "epoch": 3.6824671083325557, + "grad_norm": NaN, + "learning_rate": 0.0001030450439255426, + "loss": 0.0, + "step": 39465 + }, + { + "epoch": 3.682560418027433, + "grad_norm": NaN, + "learning_rate": 0.00010303786033969458, + "loss": 0.0, + "step": 39466 + }, + { + "epoch": 3.68265372772231, + "grad_norm": NaN, + "learning_rate": 0.00010303067687325734, + "loss": 0.0, + "step": 39467 + }, + { + "epoch": 3.6827470374171876, + "grad_norm": NaN, + "learning_rate": 0.00010302349352624923, + "loss": 0.0, + "step": 39468 + }, + { + "epoch": 3.682840347112065, + "grad_norm": NaN, + "learning_rate": 0.00010301631029868841, + "loss": 0.0, + "step": 39469 + }, + { + "epoch": 3.6829336568069424, + "grad_norm": NaN, + "learning_rate": 0.00010300912719059318, + "loss": 0.0, + "step": 39470 + }, + { + "epoch": 3.6830269665018194, + "grad_norm": NaN, + "learning_rate": 0.0001030019442019819, + "loss": 0.0, + "step": 39471 + }, + { + "epoch": 3.683120276196697, + "grad_norm": NaN, + "learning_rate": 0.00010299476133287268, + "loss": 0.0, + "step": 39472 + }, + { + "epoch": 3.683213585891574, + "grad_norm": NaN, + "learning_rate": 0.00010298757858328387, + "loss": 0.0, + "step": 39473 + }, + { + "epoch": 3.683306895586451, + "grad_norm": NaN, + "learning_rate": 0.00010298039595323378, + "loss": 0.0, + "step": 39474 + }, + { + "epoch": 3.6834002052813286, + "grad_norm": NaN, + "learning_rate": 0.00010297321344274052, + "loss": 0.0, + "step": 39475 + }, + { + "epoch": 3.683493514976206, + "grad_norm": NaN, + "learning_rate": 0.0001029660310518225, + "loss": 0.0, + "step": 39476 + }, + { + "epoch": 3.6835868246710834, + "grad_norm": NaN, + "learning_rate": 0.00010295884878049797, + "loss": 0.0, + "step": 39477 + }, + { + "epoch": 3.683680134365961, + "grad_norm": NaN, + "learning_rate": 0.00010295166662878509, + "loss": 0.0, + "step": 39478 + }, + { + "epoch": 3.683773444060838, + "grad_norm": NaN, + "learning_rate": 0.0001029444845967022, + "loss": 0.0, + "step": 39479 + }, + { + "epoch": 3.6838667537557153, + "grad_norm": NaN, + "learning_rate": 0.00010293730268426758, + "loss": 0.0, + "step": 39480 + }, + { + "epoch": 3.6839600634505927, + "grad_norm": NaN, + "learning_rate": 0.00010293012089149939, + "loss": 0.0, + "step": 39481 + }, + { + "epoch": 3.6840533731454697, + "grad_norm": NaN, + "learning_rate": 0.00010292293921841601, + "loss": 0.0, + "step": 39482 + }, + { + "epoch": 3.684146682840347, + "grad_norm": NaN, + "learning_rate": 0.00010291575766503568, + "loss": 0.0, + "step": 39483 + }, + { + "epoch": 3.6842399925352245, + "grad_norm": NaN, + "learning_rate": 0.00010290857623137655, + "loss": 0.0, + "step": 39484 + }, + { + "epoch": 3.684333302230102, + "grad_norm": NaN, + "learning_rate": 0.00010290139491745702, + "loss": 0.0, + "step": 39485 + }, + { + "epoch": 3.684426611924979, + "grad_norm": NaN, + "learning_rate": 0.00010289421372329531, + "loss": 0.0, + "step": 39486 + }, + { + "epoch": 3.6845199216198563, + "grad_norm": NaN, + "learning_rate": 0.00010288703264890957, + "loss": 0.0, + "step": 39487 + }, + { + "epoch": 3.6846132313147337, + "grad_norm": NaN, + "learning_rate": 0.00010287985169431821, + "loss": 0.0, + "step": 39488 + }, + { + "epoch": 3.6847065410096107, + "grad_norm": NaN, + "learning_rate": 0.00010287267085953947, + "loss": 0.0, + "step": 39489 + }, + { + "epoch": 3.684799850704488, + "grad_norm": NaN, + "learning_rate": 0.00010286549014459148, + "loss": 0.0, + "step": 39490 + }, + { + "epoch": 3.6848931603993655, + "grad_norm": NaN, + "learning_rate": 0.00010285830954949264, + "loss": 0.0, + "step": 39491 + }, + { + "epoch": 3.684986470094243, + "grad_norm": NaN, + "learning_rate": 0.00010285112907426118, + "loss": 0.0, + "step": 39492 + }, + { + "epoch": 3.68507977978912, + "grad_norm": NaN, + "learning_rate": 0.00010284394871891525, + "loss": 0.0, + "step": 39493 + }, + { + "epoch": 3.6851730894839974, + "grad_norm": NaN, + "learning_rate": 0.00010283676848347326, + "loss": 0.0, + "step": 39494 + }, + { + "epoch": 3.6852663991788748, + "grad_norm": NaN, + "learning_rate": 0.00010282958836795337, + "loss": 0.0, + "step": 39495 + }, + { + "epoch": 3.6853597088737517, + "grad_norm": NaN, + "learning_rate": 0.00010282240837237385, + "loss": 0.0, + "step": 39496 + }, + { + "epoch": 3.685453018568629, + "grad_norm": NaN, + "learning_rate": 0.00010281522849675298, + "loss": 0.0, + "step": 39497 + }, + { + "epoch": 3.6855463282635066, + "grad_norm": NaN, + "learning_rate": 0.00010280804874110906, + "loss": 0.0, + "step": 39498 + }, + { + "epoch": 3.685639637958384, + "grad_norm": NaN, + "learning_rate": 0.00010280086910546021, + "loss": 0.0, + "step": 39499 + }, + { + "epoch": 3.6857329476532614, + "grad_norm": NaN, + "learning_rate": 0.0001027936895898248, + "loss": 0.0, + "step": 39500 + }, + { + "epoch": 3.6858262573481384, + "grad_norm": NaN, + "learning_rate": 0.00010278651019422109, + "loss": 0.0, + "step": 39501 + }, + { + "epoch": 3.685919567043016, + "grad_norm": NaN, + "learning_rate": 0.00010277933091866724, + "loss": 0.0, + "step": 39502 + }, + { + "epoch": 3.686012876737893, + "grad_norm": NaN, + "learning_rate": 0.00010277215176318164, + "loss": 0.0, + "step": 39503 + }, + { + "epoch": 3.68610618643277, + "grad_norm": NaN, + "learning_rate": 0.00010276497272778242, + "loss": 0.0, + "step": 39504 + }, + { + "epoch": 3.6861994961276476, + "grad_norm": NaN, + "learning_rate": 0.00010275779381248782, + "loss": 0.0, + "step": 39505 + }, + { + "epoch": 3.686292805822525, + "grad_norm": NaN, + "learning_rate": 0.00010275061501731625, + "loss": 0.0, + "step": 39506 + }, + { + "epoch": 3.6863861155174025, + "grad_norm": NaN, + "learning_rate": 0.00010274343634228586, + "loss": 0.0, + "step": 39507 + }, + { + "epoch": 3.6864794252122794, + "grad_norm": NaN, + "learning_rate": 0.00010273625778741486, + "loss": 0.0, + "step": 39508 + }, + { + "epoch": 3.686572734907157, + "grad_norm": NaN, + "learning_rate": 0.0001027290793527216, + "loss": 0.0, + "step": 39509 + }, + { + "epoch": 3.6866660446020343, + "grad_norm": NaN, + "learning_rate": 0.00010272190103822424, + "loss": 0.0, + "step": 39510 + }, + { + "epoch": 3.6867593542969113, + "grad_norm": NaN, + "learning_rate": 0.00010271472284394114, + "loss": 0.0, + "step": 39511 + }, + { + "epoch": 3.6868526639917887, + "grad_norm": NaN, + "learning_rate": 0.00010270754476989048, + "loss": 0.0, + "step": 39512 + }, + { + "epoch": 3.686945973686666, + "grad_norm": NaN, + "learning_rate": 0.00010270036681609049, + "loss": 0.0, + "step": 39513 + }, + { + "epoch": 3.6870392833815435, + "grad_norm": NaN, + "learning_rate": 0.00010269318898255947, + "loss": 0.0, + "step": 39514 + }, + { + "epoch": 3.6871325930764205, + "grad_norm": NaN, + "learning_rate": 0.0001026860112693157, + "loss": 0.0, + "step": 39515 + }, + { + "epoch": 3.687225902771298, + "grad_norm": NaN, + "learning_rate": 0.00010267883367637733, + "loss": 0.0, + "step": 39516 + }, + { + "epoch": 3.6873192124661753, + "grad_norm": NaN, + "learning_rate": 0.0001026716562037627, + "loss": 0.0, + "step": 39517 + }, + { + "epoch": 3.6874125221610523, + "grad_norm": NaN, + "learning_rate": 0.00010266447885149004, + "loss": 0.0, + "step": 39518 + }, + { + "epoch": 3.6875058318559297, + "grad_norm": NaN, + "learning_rate": 0.00010265730161957754, + "loss": 0.0, + "step": 39519 + }, + { + "epoch": 3.687599141550807, + "grad_norm": NaN, + "learning_rate": 0.00010265012450804353, + "loss": 0.0, + "step": 39520 + }, + { + "epoch": 3.6876924512456846, + "grad_norm": NaN, + "learning_rate": 0.00010264294751690626, + "loss": 0.0, + "step": 39521 + }, + { + "epoch": 3.687785760940562, + "grad_norm": NaN, + "learning_rate": 0.00010263577064618388, + "loss": 0.0, + "step": 39522 + }, + { + "epoch": 3.687879070635439, + "grad_norm": NaN, + "learning_rate": 0.00010262859389589472, + "loss": 0.0, + "step": 39523 + }, + { + "epoch": 3.6879723803303164, + "grad_norm": NaN, + "learning_rate": 0.00010262141726605708, + "loss": 0.0, + "step": 39524 + }, + { + "epoch": 3.6880656900251934, + "grad_norm": NaN, + "learning_rate": 0.00010261424075668906, + "loss": 0.0, + "step": 39525 + }, + { + "epoch": 3.6881589997200708, + "grad_norm": NaN, + "learning_rate": 0.000102607064367809, + "loss": 0.0, + "step": 39526 + }, + { + "epoch": 3.688252309414948, + "grad_norm": NaN, + "learning_rate": 0.0001025998880994352, + "loss": 0.0, + "step": 39527 + }, + { + "epoch": 3.6883456191098256, + "grad_norm": NaN, + "learning_rate": 0.00010259271195158576, + "loss": 0.0, + "step": 39528 + }, + { + "epoch": 3.688438928804703, + "grad_norm": NaN, + "learning_rate": 0.00010258553592427906, + "loss": 0.0, + "step": 39529 + }, + { + "epoch": 3.68853223849958, + "grad_norm": NaN, + "learning_rate": 0.00010257836001753333, + "loss": 0.0, + "step": 39530 + }, + { + "epoch": 3.6886255481944574, + "grad_norm": NaN, + "learning_rate": 0.00010257118423136668, + "loss": 0.0, + "step": 39531 + }, + { + "epoch": 3.688718857889335, + "grad_norm": NaN, + "learning_rate": 0.00010256400856579753, + "loss": 0.0, + "step": 39532 + }, + { + "epoch": 3.688812167584212, + "grad_norm": NaN, + "learning_rate": 0.00010255683302084408, + "loss": 0.0, + "step": 39533 + }, + { + "epoch": 3.6889054772790892, + "grad_norm": NaN, + "learning_rate": 0.00010254965759652449, + "loss": 0.0, + "step": 39534 + }, + { + "epoch": 3.6889987869739667, + "grad_norm": NaN, + "learning_rate": 0.00010254248229285708, + "loss": 0.0, + "step": 39535 + }, + { + "epoch": 3.689092096668844, + "grad_norm": NaN, + "learning_rate": 0.00010253530710986013, + "loss": 0.0, + "step": 39536 + }, + { + "epoch": 3.689185406363721, + "grad_norm": NaN, + "learning_rate": 0.00010252813204755176, + "loss": 0.0, + "step": 39537 + }, + { + "epoch": 3.6892787160585985, + "grad_norm": NaN, + "learning_rate": 0.00010252095710595032, + "loss": 0.0, + "step": 39538 + }, + { + "epoch": 3.689372025753476, + "grad_norm": NaN, + "learning_rate": 0.00010251378228507405, + "loss": 0.0, + "step": 39539 + }, + { + "epoch": 3.689465335448353, + "grad_norm": NaN, + "learning_rate": 0.00010250660758494111, + "loss": 0.0, + "step": 39540 + }, + { + "epoch": 3.6895586451432303, + "grad_norm": NaN, + "learning_rate": 0.00010249943300556982, + "loss": 0.0, + "step": 39541 + }, + { + "epoch": 3.6896519548381077, + "grad_norm": NaN, + "learning_rate": 0.00010249225854697843, + "loss": 0.0, + "step": 39542 + }, + { + "epoch": 3.689745264532985, + "grad_norm": NaN, + "learning_rate": 0.00010248508420918512, + "loss": 0.0, + "step": 39543 + }, + { + "epoch": 3.6898385742278625, + "grad_norm": NaN, + "learning_rate": 0.00010247790999220813, + "loss": 0.0, + "step": 39544 + }, + { + "epoch": 3.6899318839227395, + "grad_norm": NaN, + "learning_rate": 0.00010247073589606585, + "loss": 0.0, + "step": 39545 + }, + { + "epoch": 3.690025193617617, + "grad_norm": NaN, + "learning_rate": 0.00010246356192077633, + "loss": 0.0, + "step": 39546 + }, + { + "epoch": 3.690118503312494, + "grad_norm": NaN, + "learning_rate": 0.00010245638806635785, + "loss": 0.0, + "step": 39547 + }, + { + "epoch": 3.6902118130073713, + "grad_norm": NaN, + "learning_rate": 0.00010244921433282878, + "loss": 0.0, + "step": 39548 + }, + { + "epoch": 3.6903051227022488, + "grad_norm": NaN, + "learning_rate": 0.00010244204072020726, + "loss": 0.0, + "step": 39549 + }, + { + "epoch": 3.690398432397126, + "grad_norm": NaN, + "learning_rate": 0.00010243486722851153, + "loss": 0.0, + "step": 39550 + }, + { + "epoch": 3.6904917420920036, + "grad_norm": NaN, + "learning_rate": 0.00010242769385775981, + "loss": 0.0, + "step": 39551 + }, + { + "epoch": 3.6905850517868806, + "grad_norm": NaN, + "learning_rate": 0.0001024205206079704, + "loss": 0.0, + "step": 39552 + }, + { + "epoch": 3.690678361481758, + "grad_norm": NaN, + "learning_rate": 0.00010241334747916153, + "loss": 0.0, + "step": 39553 + }, + { + "epoch": 3.6907716711766354, + "grad_norm": NaN, + "learning_rate": 0.00010240617447135138, + "loss": 0.0, + "step": 39554 + }, + { + "epoch": 3.6908649808715124, + "grad_norm": NaN, + "learning_rate": 0.00010239900158455824, + "loss": 0.0, + "step": 39555 + }, + { + "epoch": 3.69095829056639, + "grad_norm": NaN, + "learning_rate": 0.00010239182881880039, + "loss": 0.0, + "step": 39556 + }, + { + "epoch": 3.691051600261267, + "grad_norm": NaN, + "learning_rate": 0.00010238465617409594, + "loss": 0.0, + "step": 39557 + }, + { + "epoch": 3.6911449099561446, + "grad_norm": NaN, + "learning_rate": 0.00010237748365046324, + "loss": 0.0, + "step": 39558 + }, + { + "epoch": 3.6912382196510216, + "grad_norm": NaN, + "learning_rate": 0.00010237031124792051, + "loss": 0.0, + "step": 39559 + }, + { + "epoch": 3.691331529345899, + "grad_norm": NaN, + "learning_rate": 0.00010236313896648592, + "loss": 0.0, + "step": 39560 + }, + { + "epoch": 3.6914248390407765, + "grad_norm": NaN, + "learning_rate": 0.00010235596680617779, + "loss": 0.0, + "step": 39561 + }, + { + "epoch": 3.6915181487356534, + "grad_norm": NaN, + "learning_rate": 0.00010234879476701437, + "loss": 0.0, + "step": 39562 + }, + { + "epoch": 3.691611458430531, + "grad_norm": NaN, + "learning_rate": 0.00010234162284901377, + "loss": 0.0, + "step": 39563 + }, + { + "epoch": 3.6917047681254083, + "grad_norm": NaN, + "learning_rate": 0.00010233445105219433, + "loss": 0.0, + "step": 39564 + }, + { + "epoch": 3.6917980778202857, + "grad_norm": NaN, + "learning_rate": 0.0001023272793765743, + "loss": 0.0, + "step": 39565 + }, + { + "epoch": 3.6918913875151627, + "grad_norm": NaN, + "learning_rate": 0.0001023201078221718, + "loss": 0.0, + "step": 39566 + }, + { + "epoch": 3.69198469721004, + "grad_norm": NaN, + "learning_rate": 0.0001023129363890052, + "loss": 0.0, + "step": 39567 + }, + { + "epoch": 3.6920780069049175, + "grad_norm": NaN, + "learning_rate": 0.0001023057650770927, + "loss": 0.0, + "step": 39568 + }, + { + "epoch": 3.6921713165997945, + "grad_norm": NaN, + "learning_rate": 0.00010229859388645243, + "loss": 0.0, + "step": 39569 + }, + { + "epoch": 3.692264626294672, + "grad_norm": NaN, + "learning_rate": 0.00010229142281710275, + "loss": 0.0, + "step": 39570 + }, + { + "epoch": 3.6923579359895493, + "grad_norm": NaN, + "learning_rate": 0.00010228425186906189, + "loss": 0.0, + "step": 39571 + }, + { + "epoch": 3.6924512456844267, + "grad_norm": NaN, + "learning_rate": 0.00010227708104234796, + "loss": 0.0, + "step": 39572 + }, + { + "epoch": 3.692544555379304, + "grad_norm": NaN, + "learning_rate": 0.00010226991033697932, + "loss": 0.0, + "step": 39573 + }, + { + "epoch": 3.692637865074181, + "grad_norm": NaN, + "learning_rate": 0.00010226273975297421, + "loss": 0.0, + "step": 39574 + }, + { + "epoch": 3.6927311747690585, + "grad_norm": NaN, + "learning_rate": 0.00010225556929035072, + "loss": 0.0, + "step": 39575 + }, + { + "epoch": 3.692824484463936, + "grad_norm": NaN, + "learning_rate": 0.00010224839894912721, + "loss": 0.0, + "step": 39576 + }, + { + "epoch": 3.692917794158813, + "grad_norm": NaN, + "learning_rate": 0.00010224122872932191, + "loss": 0.0, + "step": 39577 + }, + { + "epoch": 3.6930111038536904, + "grad_norm": NaN, + "learning_rate": 0.00010223405863095298, + "loss": 0.0, + "step": 39578 + }, + { + "epoch": 3.6931044135485678, + "grad_norm": NaN, + "learning_rate": 0.00010222688865403868, + "loss": 0.0, + "step": 39579 + }, + { + "epoch": 3.693197723243445, + "grad_norm": NaN, + "learning_rate": 0.0001022197187985973, + "loss": 0.0, + "step": 39580 + }, + { + "epoch": 3.693291032938322, + "grad_norm": NaN, + "learning_rate": 0.00010221254906464696, + "loss": 0.0, + "step": 39581 + }, + { + "epoch": 3.6933843426331996, + "grad_norm": NaN, + "learning_rate": 0.00010220537945220595, + "loss": 0.0, + "step": 39582 + }, + { + "epoch": 3.693477652328077, + "grad_norm": NaN, + "learning_rate": 0.0001021982099612926, + "loss": 0.0, + "step": 39583 + }, + { + "epoch": 3.693570962022954, + "grad_norm": NaN, + "learning_rate": 0.00010219104059192496, + "loss": 0.0, + "step": 39584 + }, + { + "epoch": 3.6936642717178314, + "grad_norm": NaN, + "learning_rate": 0.00010218387134412132, + "loss": 0.0, + "step": 39585 + }, + { + "epoch": 3.693757581412709, + "grad_norm": NaN, + "learning_rate": 0.00010217670221789999, + "loss": 0.0, + "step": 39586 + }, + { + "epoch": 3.6938508911075862, + "grad_norm": NaN, + "learning_rate": 0.00010216953321327912, + "loss": 0.0, + "step": 39587 + }, + { + "epoch": 3.693944200802463, + "grad_norm": NaN, + "learning_rate": 0.00010216236433027694, + "loss": 0.0, + "step": 39588 + }, + { + "epoch": 3.6940375104973406, + "grad_norm": NaN, + "learning_rate": 0.00010215519556891175, + "loss": 0.0, + "step": 39589 + }, + { + "epoch": 3.694130820192218, + "grad_norm": NaN, + "learning_rate": 0.00010214802692920169, + "loss": 0.0, + "step": 39590 + }, + { + "epoch": 3.694224129887095, + "grad_norm": NaN, + "learning_rate": 0.00010214085841116498, + "loss": 0.0, + "step": 39591 + }, + { + "epoch": 3.6943174395819725, + "grad_norm": NaN, + "learning_rate": 0.00010213369001481997, + "loss": 0.0, + "step": 39592 + }, + { + "epoch": 3.69441074927685, + "grad_norm": NaN, + "learning_rate": 0.00010212652174018477, + "loss": 0.0, + "step": 39593 + }, + { + "epoch": 3.6945040589717273, + "grad_norm": NaN, + "learning_rate": 0.00010211935358727769, + "loss": 0.0, + "step": 39594 + }, + { + "epoch": 3.6945973686666047, + "grad_norm": NaN, + "learning_rate": 0.00010211218555611682, + "loss": 0.0, + "step": 39595 + }, + { + "epoch": 3.6946906783614817, + "grad_norm": NaN, + "learning_rate": 0.00010210501764672052, + "loss": 0.0, + "step": 39596 + }, + { + "epoch": 3.694783988056359, + "grad_norm": NaN, + "learning_rate": 0.000102097849859107, + "loss": 0.0, + "step": 39597 + }, + { + "epoch": 3.694877297751236, + "grad_norm": NaN, + "learning_rate": 0.00010209068219329443, + "loss": 0.0, + "step": 39598 + }, + { + "epoch": 3.6949706074461135, + "grad_norm": NaN, + "learning_rate": 0.00010208351464930105, + "loss": 0.0, + "step": 39599 + }, + { + "epoch": 3.695063917140991, + "grad_norm": NaN, + "learning_rate": 0.00010207634722714515, + "loss": 0.0, + "step": 39600 + }, + { + "epoch": 3.6951572268358683, + "grad_norm": NaN, + "learning_rate": 0.00010206917992684483, + "loss": 0.0, + "step": 39601 + }, + { + "epoch": 3.6952505365307458, + "grad_norm": NaN, + "learning_rate": 0.00010206201274841844, + "loss": 0.0, + "step": 39602 + }, + { + "epoch": 3.6953438462256227, + "grad_norm": NaN, + "learning_rate": 0.00010205484569188415, + "loss": 0.0, + "step": 39603 + }, + { + "epoch": 3.6954371559205, + "grad_norm": NaN, + "learning_rate": 0.00010204767875726015, + "loss": 0.0, + "step": 39604 + }, + { + "epoch": 3.6955304656153776, + "grad_norm": NaN, + "learning_rate": 0.00010204051194456472, + "loss": 0.0, + "step": 39605 + }, + { + "epoch": 3.6956237753102545, + "grad_norm": NaN, + "learning_rate": 0.0001020333452538161, + "loss": 0.0, + "step": 39606 + }, + { + "epoch": 3.695717085005132, + "grad_norm": NaN, + "learning_rate": 0.00010202617868503239, + "loss": 0.0, + "step": 39607 + }, + { + "epoch": 3.6958103947000094, + "grad_norm": NaN, + "learning_rate": 0.00010201901223823194, + "loss": 0.0, + "step": 39608 + }, + { + "epoch": 3.695903704394887, + "grad_norm": NaN, + "learning_rate": 0.00010201184591343297, + "loss": 0.0, + "step": 39609 + }, + { + "epoch": 3.6959970140897638, + "grad_norm": NaN, + "learning_rate": 0.00010200467971065359, + "loss": 0.0, + "step": 39610 + }, + { + "epoch": 3.696090323784641, + "grad_norm": NaN, + "learning_rate": 0.00010199751362991213, + "loss": 0.0, + "step": 39611 + }, + { + "epoch": 3.6961836334795186, + "grad_norm": NaN, + "learning_rate": 0.00010199034767122679, + "loss": 0.0, + "step": 39612 + }, + { + "epoch": 3.6962769431743956, + "grad_norm": NaN, + "learning_rate": 0.00010198318183461574, + "loss": 0.0, + "step": 39613 + }, + { + "epoch": 3.696370252869273, + "grad_norm": NaN, + "learning_rate": 0.00010197601612009723, + "loss": 0.0, + "step": 39614 + }, + { + "epoch": 3.6964635625641504, + "grad_norm": NaN, + "learning_rate": 0.00010196885052768953, + "loss": 0.0, + "step": 39615 + }, + { + "epoch": 3.696556872259028, + "grad_norm": NaN, + "learning_rate": 0.00010196168505741076, + "loss": 0.0, + "step": 39616 + }, + { + "epoch": 3.6966501819539053, + "grad_norm": NaN, + "learning_rate": 0.00010195451970927923, + "loss": 0.0, + "step": 39617 + }, + { + "epoch": 3.6967434916487822, + "grad_norm": NaN, + "learning_rate": 0.00010194735448331314, + "loss": 0.0, + "step": 39618 + }, + { + "epoch": 3.6968368013436597, + "grad_norm": NaN, + "learning_rate": 0.00010194018937953064, + "loss": 0.0, + "step": 39619 + }, + { + "epoch": 3.6969301110385366, + "grad_norm": NaN, + "learning_rate": 0.00010193302439795, + "loss": 0.0, + "step": 39620 + }, + { + "epoch": 3.697023420733414, + "grad_norm": NaN, + "learning_rate": 0.0001019258595385895, + "loss": 0.0, + "step": 39621 + }, + { + "epoch": 3.6971167304282915, + "grad_norm": NaN, + "learning_rate": 0.00010191869480146726, + "loss": 0.0, + "step": 39622 + }, + { + "epoch": 3.697210040123169, + "grad_norm": NaN, + "learning_rate": 0.00010191153018660152, + "loss": 0.0, + "step": 39623 + }, + { + "epoch": 3.6973033498180463, + "grad_norm": NaN, + "learning_rate": 0.00010190436569401057, + "loss": 0.0, + "step": 39624 + }, + { + "epoch": 3.6973966595129233, + "grad_norm": NaN, + "learning_rate": 0.00010189720132371255, + "loss": 0.0, + "step": 39625 + }, + { + "epoch": 3.6974899692078007, + "grad_norm": NaN, + "learning_rate": 0.00010189003707572566, + "loss": 0.0, + "step": 39626 + }, + { + "epoch": 3.697583278902678, + "grad_norm": NaN, + "learning_rate": 0.0001018828729500682, + "loss": 0.0, + "step": 39627 + }, + { + "epoch": 3.697676588597555, + "grad_norm": NaN, + "learning_rate": 0.00010187570894675835, + "loss": 0.0, + "step": 39628 + }, + { + "epoch": 3.6977698982924325, + "grad_norm": NaN, + "learning_rate": 0.00010186854506581426, + "loss": 0.0, + "step": 39629 + }, + { + "epoch": 3.69786320798731, + "grad_norm": NaN, + "learning_rate": 0.00010186138130725426, + "loss": 0.0, + "step": 39630 + }, + { + "epoch": 3.6979565176821874, + "grad_norm": NaN, + "learning_rate": 0.00010185421767109647, + "loss": 0.0, + "step": 39631 + }, + { + "epoch": 3.6980498273770643, + "grad_norm": NaN, + "learning_rate": 0.00010184705415735914, + "loss": 0.0, + "step": 39632 + }, + { + "epoch": 3.6981431370719418, + "grad_norm": NaN, + "learning_rate": 0.00010183989076606052, + "loss": 0.0, + "step": 39633 + }, + { + "epoch": 3.698236446766819, + "grad_norm": NaN, + "learning_rate": 0.00010183272749721878, + "loss": 0.0, + "step": 39634 + }, + { + "epoch": 3.698329756461696, + "grad_norm": NaN, + "learning_rate": 0.00010182556435085212, + "loss": 0.0, + "step": 39635 + }, + { + "epoch": 3.6984230661565736, + "grad_norm": NaN, + "learning_rate": 0.00010181840132697882, + "loss": 0.0, + "step": 39636 + }, + { + "epoch": 3.698516375851451, + "grad_norm": NaN, + "learning_rate": 0.00010181123842561702, + "loss": 0.0, + "step": 39637 + }, + { + "epoch": 3.6986096855463284, + "grad_norm": NaN, + "learning_rate": 0.00010180407564678502, + "loss": 0.0, + "step": 39638 + }, + { + "epoch": 3.698702995241206, + "grad_norm": NaN, + "learning_rate": 0.00010179691299050091, + "loss": 0.0, + "step": 39639 + }, + { + "epoch": 3.698796304936083, + "grad_norm": NaN, + "learning_rate": 0.00010178975045678298, + "loss": 0.0, + "step": 39640 + }, + { + "epoch": 3.69888961463096, + "grad_norm": NaN, + "learning_rate": 0.0001017825880456495, + "loss": 0.0, + "step": 39641 + }, + { + "epoch": 3.698982924325837, + "grad_norm": NaN, + "learning_rate": 0.00010177542575711853, + "loss": 0.0, + "step": 39642 + }, + { + "epoch": 3.6990762340207146, + "grad_norm": NaN, + "learning_rate": 0.00010176826359120841, + "loss": 0.0, + "step": 39643 + }, + { + "epoch": 3.699169543715592, + "grad_norm": NaN, + "learning_rate": 0.00010176110154793734, + "loss": 0.0, + "step": 39644 + }, + { + "epoch": 3.6992628534104695, + "grad_norm": NaN, + "learning_rate": 0.00010175393962732342, + "loss": 0.0, + "step": 39645 + }, + { + "epoch": 3.699356163105347, + "grad_norm": NaN, + "learning_rate": 0.00010174677782938498, + "loss": 0.0, + "step": 39646 + }, + { + "epoch": 3.699449472800224, + "grad_norm": NaN, + "learning_rate": 0.00010173961615414022, + "loss": 0.0, + "step": 39647 + }, + { + "epoch": 3.6995427824951013, + "grad_norm": NaN, + "learning_rate": 0.00010173245460160729, + "loss": 0.0, + "step": 39648 + }, + { + "epoch": 3.6996360921899787, + "grad_norm": NaN, + "learning_rate": 0.00010172529317180441, + "loss": 0.0, + "step": 39649 + }, + { + "epoch": 3.6997294018848557, + "grad_norm": NaN, + "learning_rate": 0.00010171813186474986, + "loss": 0.0, + "step": 39650 + }, + { + "epoch": 3.699822711579733, + "grad_norm": NaN, + "learning_rate": 0.00010171097068046175, + "loss": 0.0, + "step": 39651 + }, + { + "epoch": 3.6999160212746105, + "grad_norm": NaN, + "learning_rate": 0.00010170380961895835, + "loss": 0.0, + "step": 39652 + }, + { + "epoch": 3.700009330969488, + "grad_norm": NaN, + "learning_rate": 0.0001016966486802579, + "loss": 0.0, + "step": 39653 + }, + { + "epoch": 3.700102640664365, + "grad_norm": NaN, + "learning_rate": 0.0001016894878643785, + "loss": 0.0, + "step": 39654 + }, + { + "epoch": 3.7001959503592423, + "grad_norm": NaN, + "learning_rate": 0.00010168232717133842, + "loss": 0.0, + "step": 39655 + }, + { + "epoch": 3.7002892600541197, + "grad_norm": NaN, + "learning_rate": 0.00010167516660115596, + "loss": 0.0, + "step": 39656 + }, + { + "epoch": 3.7003825697489967, + "grad_norm": NaN, + "learning_rate": 0.00010166800615384913, + "loss": 0.0, + "step": 39657 + }, + { + "epoch": 3.700475879443874, + "grad_norm": NaN, + "learning_rate": 0.00010166084582943628, + "loss": 0.0, + "step": 39658 + }, + { + "epoch": 3.7005691891387515, + "grad_norm": NaN, + "learning_rate": 0.0001016536856279356, + "loss": 0.0, + "step": 39659 + }, + { + "epoch": 3.700662498833629, + "grad_norm": NaN, + "learning_rate": 0.0001016465255493653, + "loss": 0.0, + "step": 39660 + }, + { + "epoch": 3.7007558085285064, + "grad_norm": NaN, + "learning_rate": 0.00010163936559374346, + "loss": 0.0, + "step": 39661 + }, + { + "epoch": 3.7008491182233834, + "grad_norm": NaN, + "learning_rate": 0.0001016322057610885, + "loss": 0.0, + "step": 39662 + }, + { + "epoch": 3.700942427918261, + "grad_norm": NaN, + "learning_rate": 0.00010162504605141847, + "loss": 0.0, + "step": 39663 + }, + { + "epoch": 3.7010357376131378, + "grad_norm": NaN, + "learning_rate": 0.00010161788646475158, + "loss": 0.0, + "step": 39664 + }, + { + "epoch": 3.701129047308015, + "grad_norm": NaN, + "learning_rate": 0.00010161072700110614, + "loss": 0.0, + "step": 39665 + }, + { + "epoch": 3.7012223570028926, + "grad_norm": NaN, + "learning_rate": 0.00010160356766050025, + "loss": 0.0, + "step": 39666 + }, + { + "epoch": 3.70131566669777, + "grad_norm": NaN, + "learning_rate": 0.00010159640844295213, + "loss": 0.0, + "step": 39667 + }, + { + "epoch": 3.7014089763926474, + "grad_norm": NaN, + "learning_rate": 0.00010158924934848006, + "loss": 0.0, + "step": 39668 + }, + { + "epoch": 3.7015022860875244, + "grad_norm": NaN, + "learning_rate": 0.00010158209037710218, + "loss": 0.0, + "step": 39669 + }, + { + "epoch": 3.701595595782402, + "grad_norm": NaN, + "learning_rate": 0.00010157493152883664, + "loss": 0.0, + "step": 39670 + }, + { + "epoch": 3.7016889054772792, + "grad_norm": NaN, + "learning_rate": 0.00010156777280370178, + "loss": 0.0, + "step": 39671 + }, + { + "epoch": 3.701782215172156, + "grad_norm": NaN, + "learning_rate": 0.00010156061420171569, + "loss": 0.0, + "step": 39672 + }, + { + "epoch": 3.7018755248670336, + "grad_norm": NaN, + "learning_rate": 0.00010155345572289659, + "loss": 0.0, + "step": 39673 + }, + { + "epoch": 3.701968834561911, + "grad_norm": NaN, + "learning_rate": 0.00010154629736726275, + "loss": 0.0, + "step": 39674 + }, + { + "epoch": 3.7020621442567885, + "grad_norm": NaN, + "learning_rate": 0.00010153913913483231, + "loss": 0.0, + "step": 39675 + }, + { + "epoch": 3.7021554539516655, + "grad_norm": NaN, + "learning_rate": 0.00010153198102562345, + "loss": 0.0, + "step": 39676 + }, + { + "epoch": 3.702248763646543, + "grad_norm": NaN, + "learning_rate": 0.00010152482303965444, + "loss": 0.0, + "step": 39677 + }, + { + "epoch": 3.7023420733414203, + "grad_norm": NaN, + "learning_rate": 0.00010151766517694344, + "loss": 0.0, + "step": 39678 + }, + { + "epoch": 3.7024353830362973, + "grad_norm": NaN, + "learning_rate": 0.00010151050743750862, + "loss": 0.0, + "step": 39679 + }, + { + "epoch": 3.7025286927311747, + "grad_norm": NaN, + "learning_rate": 0.00010150334982136826, + "loss": 0.0, + "step": 39680 + }, + { + "epoch": 3.702622002426052, + "grad_norm": NaN, + "learning_rate": 0.00010149619232854051, + "loss": 0.0, + "step": 39681 + }, + { + "epoch": 3.7027153121209295, + "grad_norm": NaN, + "learning_rate": 0.0001014890349590436, + "loss": 0.0, + "step": 39682 + }, + { + "epoch": 3.7028086218158065, + "grad_norm": NaN, + "learning_rate": 0.00010148187771289565, + "loss": 0.0, + "step": 39683 + }, + { + "epoch": 3.702901931510684, + "grad_norm": NaN, + "learning_rate": 0.0001014747205901149, + "loss": 0.0, + "step": 39684 + }, + { + "epoch": 3.7029952412055613, + "grad_norm": NaN, + "learning_rate": 0.00010146756359071965, + "loss": 0.0, + "step": 39685 + }, + { + "epoch": 3.7030885509004383, + "grad_norm": NaN, + "learning_rate": 0.0001014604067147279, + "loss": 0.0, + "step": 39686 + }, + { + "epoch": 3.7031818605953157, + "grad_norm": NaN, + "learning_rate": 0.00010145324996215803, + "loss": 0.0, + "step": 39687 + }, + { + "epoch": 3.703275170290193, + "grad_norm": NaN, + "learning_rate": 0.00010144609333302817, + "loss": 0.0, + "step": 39688 + }, + { + "epoch": 3.7033684799850706, + "grad_norm": NaN, + "learning_rate": 0.00010143893682735645, + "loss": 0.0, + "step": 39689 + }, + { + "epoch": 3.703461789679948, + "grad_norm": NaN, + "learning_rate": 0.00010143178044516116, + "loss": 0.0, + "step": 39690 + }, + { + "epoch": 3.703555099374825, + "grad_norm": NaN, + "learning_rate": 0.00010142462418646052, + "loss": 0.0, + "step": 39691 + }, + { + "epoch": 3.7036484090697024, + "grad_norm": NaN, + "learning_rate": 0.00010141746805127258, + "loss": 0.0, + "step": 39692 + }, + { + "epoch": 3.70374171876458, + "grad_norm": NaN, + "learning_rate": 0.00010141031203961564, + "loss": 0.0, + "step": 39693 + }, + { + "epoch": 3.703835028459457, + "grad_norm": NaN, + "learning_rate": 0.00010140315615150797, + "loss": 0.0, + "step": 39694 + }, + { + "epoch": 3.703928338154334, + "grad_norm": NaN, + "learning_rate": 0.0001013960003869676, + "loss": 0.0, + "step": 39695 + }, + { + "epoch": 3.7040216478492116, + "grad_norm": NaN, + "learning_rate": 0.00010138884474601279, + "loss": 0.0, + "step": 39696 + }, + { + "epoch": 3.704114957544089, + "grad_norm": NaN, + "learning_rate": 0.00010138168922866181, + "loss": 0.0, + "step": 39697 + }, + { + "epoch": 3.704208267238966, + "grad_norm": NaN, + "learning_rate": 0.00010137453383493277, + "loss": 0.0, + "step": 39698 + }, + { + "epoch": 3.7043015769338434, + "grad_norm": NaN, + "learning_rate": 0.00010136737856484385, + "loss": 0.0, + "step": 39699 + }, + { + "epoch": 3.704394886628721, + "grad_norm": NaN, + "learning_rate": 0.00010136022341841336, + "loss": 0.0, + "step": 39700 + }, + { + "epoch": 3.704488196323598, + "grad_norm": NaN, + "learning_rate": 0.00010135306839565934, + "loss": 0.0, + "step": 39701 + }, + { + "epoch": 3.7045815060184752, + "grad_norm": NaN, + "learning_rate": 0.00010134591349660007, + "loss": 0.0, + "step": 39702 + }, + { + "epoch": 3.7046748157133527, + "grad_norm": NaN, + "learning_rate": 0.00010133875872125376, + "loss": 0.0, + "step": 39703 + }, + { + "epoch": 3.70476812540823, + "grad_norm": NaN, + "learning_rate": 0.00010133160406963857, + "loss": 0.0, + "step": 39704 + }, + { + "epoch": 3.704861435103107, + "grad_norm": NaN, + "learning_rate": 0.00010132444954177264, + "loss": 0.0, + "step": 39705 + }, + { + "epoch": 3.7049547447979845, + "grad_norm": NaN, + "learning_rate": 0.00010131729513767429, + "loss": 0.0, + "step": 39706 + }, + { + "epoch": 3.705048054492862, + "grad_norm": NaN, + "learning_rate": 0.0001013101408573616, + "loss": 0.0, + "step": 39707 + }, + { + "epoch": 3.705141364187739, + "grad_norm": NaN, + "learning_rate": 0.00010130298670085276, + "loss": 0.0, + "step": 39708 + }, + { + "epoch": 3.7052346738826163, + "grad_norm": NaN, + "learning_rate": 0.00010129583266816608, + "loss": 0.0, + "step": 39709 + }, + { + "epoch": 3.7053279835774937, + "grad_norm": NaN, + "learning_rate": 0.00010128867875931963, + "loss": 0.0, + "step": 39710 + }, + { + "epoch": 3.705421293272371, + "grad_norm": NaN, + "learning_rate": 0.0001012815249743316, + "loss": 0.0, + "step": 39711 + }, + { + "epoch": 3.7055146029672485, + "grad_norm": NaN, + "learning_rate": 0.0001012743713132203, + "loss": 0.0, + "step": 39712 + }, + { + "epoch": 3.7056079126621255, + "grad_norm": NaN, + "learning_rate": 0.0001012672177760038, + "loss": 0.0, + "step": 39713 + }, + { + "epoch": 3.705701222357003, + "grad_norm": NaN, + "learning_rate": 0.0001012600643627003, + "loss": 0.0, + "step": 39714 + }, + { + "epoch": 3.70579453205188, + "grad_norm": NaN, + "learning_rate": 0.0001012529110733281, + "loss": 0.0, + "step": 39715 + }, + { + "epoch": 3.7058878417467573, + "grad_norm": NaN, + "learning_rate": 0.00010124575790790526, + "loss": 0.0, + "step": 39716 + }, + { + "epoch": 3.7059811514416348, + "grad_norm": NaN, + "learning_rate": 0.00010123860486644999, + "loss": 0.0, + "step": 39717 + }, + { + "epoch": 3.706074461136512, + "grad_norm": NaN, + "learning_rate": 0.00010123145194898055, + "loss": 0.0, + "step": 39718 + }, + { + "epoch": 3.7061677708313896, + "grad_norm": NaN, + "learning_rate": 0.00010122429915551505, + "loss": 0.0, + "step": 39719 + }, + { + "epoch": 3.7062610805262666, + "grad_norm": NaN, + "learning_rate": 0.00010121714648607169, + "loss": 0.0, + "step": 39720 + }, + { + "epoch": 3.706354390221144, + "grad_norm": NaN, + "learning_rate": 0.00010120999394066873, + "loss": 0.0, + "step": 39721 + }, + { + "epoch": 3.7064476999160214, + "grad_norm": NaN, + "learning_rate": 0.00010120284151932427, + "loss": 0.0, + "step": 39722 + }, + { + "epoch": 3.7065410096108984, + "grad_norm": NaN, + "learning_rate": 0.0001011956892220565, + "loss": 0.0, + "step": 39723 + }, + { + "epoch": 3.706634319305776, + "grad_norm": NaN, + "learning_rate": 0.00010118853704888371, + "loss": 0.0, + "step": 39724 + }, + { + "epoch": 3.7067276290006532, + "grad_norm": NaN, + "learning_rate": 0.00010118138499982397, + "loss": 0.0, + "step": 39725 + }, + { + "epoch": 3.7068209386955306, + "grad_norm": NaN, + "learning_rate": 0.00010117423307489549, + "loss": 0.0, + "step": 39726 + }, + { + "epoch": 3.7069142483904076, + "grad_norm": NaN, + "learning_rate": 0.00010116708127411651, + "loss": 0.0, + "step": 39727 + }, + { + "epoch": 3.707007558085285, + "grad_norm": NaN, + "learning_rate": 0.00010115992959750516, + "loss": 0.0, + "step": 39728 + }, + { + "epoch": 3.7071008677801625, + "grad_norm": NaN, + "learning_rate": 0.00010115277804507967, + "loss": 0.0, + "step": 39729 + }, + { + "epoch": 3.7071941774750394, + "grad_norm": NaN, + "learning_rate": 0.00010114562661685813, + "loss": 0.0, + "step": 39730 + }, + { + "epoch": 3.707287487169917, + "grad_norm": NaN, + "learning_rate": 0.00010113847531285881, + "loss": 0.0, + "step": 39731 + }, + { + "epoch": 3.7073807968647943, + "grad_norm": NaN, + "learning_rate": 0.00010113132413309995, + "loss": 0.0, + "step": 39732 + }, + { + "epoch": 3.7074741065596717, + "grad_norm": NaN, + "learning_rate": 0.00010112417307759955, + "loss": 0.0, + "step": 39733 + }, + { + "epoch": 3.707567416254549, + "grad_norm": NaN, + "learning_rate": 0.00010111702214637591, + "loss": 0.0, + "step": 39734 + }, + { + "epoch": 3.707660725949426, + "grad_norm": NaN, + "learning_rate": 0.00010110987133944727, + "loss": 0.0, + "step": 39735 + }, + { + "epoch": 3.7077540356443035, + "grad_norm": NaN, + "learning_rate": 0.0001011027206568317, + "loss": 0.0, + "step": 39736 + }, + { + "epoch": 3.7078473453391805, + "grad_norm": NaN, + "learning_rate": 0.00010109557009854742, + "loss": 0.0, + "step": 39737 + }, + { + "epoch": 3.707940655034058, + "grad_norm": NaN, + "learning_rate": 0.00010108841966461266, + "loss": 0.0, + "step": 39738 + }, + { + "epoch": 3.7080339647289353, + "grad_norm": NaN, + "learning_rate": 0.00010108126935504554, + "loss": 0.0, + "step": 39739 + }, + { + "epoch": 3.7081272744238127, + "grad_norm": NaN, + "learning_rate": 0.00010107411916986422, + "loss": 0.0, + "step": 39740 + }, + { + "epoch": 3.70822058411869, + "grad_norm": NaN, + "learning_rate": 0.000101066969109087, + "loss": 0.0, + "step": 39741 + }, + { + "epoch": 3.708313893813567, + "grad_norm": NaN, + "learning_rate": 0.00010105981917273195, + "loss": 0.0, + "step": 39742 + }, + { + "epoch": 3.7084072035084445, + "grad_norm": NaN, + "learning_rate": 0.00010105266936081724, + "loss": 0.0, + "step": 39743 + }, + { + "epoch": 3.708500513203322, + "grad_norm": NaN, + "learning_rate": 0.00010104551967336117, + "loss": 0.0, + "step": 39744 + }, + { + "epoch": 3.708593822898199, + "grad_norm": NaN, + "learning_rate": 0.00010103837011038179, + "loss": 0.0, + "step": 39745 + }, + { + "epoch": 3.7086871325930764, + "grad_norm": NaN, + "learning_rate": 0.00010103122067189732, + "loss": 0.0, + "step": 39746 + }, + { + "epoch": 3.708780442287954, + "grad_norm": NaN, + "learning_rate": 0.00010102407135792598, + "loss": 0.0, + "step": 39747 + }, + { + "epoch": 3.708873751982831, + "grad_norm": NaN, + "learning_rate": 0.00010101692216848594, + "loss": 0.0, + "step": 39748 + }, + { + "epoch": 3.708967061677708, + "grad_norm": NaN, + "learning_rate": 0.0001010097731035953, + "loss": 0.0, + "step": 39749 + }, + { + "epoch": 3.7090603713725856, + "grad_norm": NaN, + "learning_rate": 0.00010100262416327234, + "loss": 0.0, + "step": 39750 + }, + { + "epoch": 3.709153681067463, + "grad_norm": NaN, + "learning_rate": 0.00010099547534753518, + "loss": 0.0, + "step": 39751 + }, + { + "epoch": 3.70924699076234, + "grad_norm": NaN, + "learning_rate": 0.00010098832665640198, + "loss": 0.0, + "step": 39752 + }, + { + "epoch": 3.7093403004572174, + "grad_norm": NaN, + "learning_rate": 0.00010098117808989101, + "loss": 0.0, + "step": 39753 + }, + { + "epoch": 3.709433610152095, + "grad_norm": NaN, + "learning_rate": 0.00010097402964802033, + "loss": 0.0, + "step": 39754 + }, + { + "epoch": 3.7095269198469722, + "grad_norm": NaN, + "learning_rate": 0.00010096688133080815, + "loss": 0.0, + "step": 39755 + }, + { + "epoch": 3.7096202295418497, + "grad_norm": NaN, + "learning_rate": 0.00010095973313827275, + "loss": 0.0, + "step": 39756 + }, + { + "epoch": 3.7097135392367266, + "grad_norm": NaN, + "learning_rate": 0.00010095258507043216, + "loss": 0.0, + "step": 39757 + }, + { + "epoch": 3.709806848931604, + "grad_norm": NaN, + "learning_rate": 0.0001009454371273046, + "loss": 0.0, + "step": 39758 + }, + { + "epoch": 3.709900158626481, + "grad_norm": NaN, + "learning_rate": 0.00010093828930890832, + "loss": 0.0, + "step": 39759 + }, + { + "epoch": 3.7099934683213585, + "grad_norm": NaN, + "learning_rate": 0.00010093114161526143, + "loss": 0.0, + "step": 39760 + }, + { + "epoch": 3.710086778016236, + "grad_norm": NaN, + "learning_rate": 0.00010092399404638204, + "loss": 0.0, + "step": 39761 + }, + { + "epoch": 3.7101800877111133, + "grad_norm": NaN, + "learning_rate": 0.00010091684660228848, + "loss": 0.0, + "step": 39762 + }, + { + "epoch": 3.7102733974059907, + "grad_norm": NaN, + "learning_rate": 0.00010090969928299882, + "loss": 0.0, + "step": 39763 + }, + { + "epoch": 3.7103667071008677, + "grad_norm": NaN, + "learning_rate": 0.00010090255208853121, + "loss": 0.0, + "step": 39764 + }, + { + "epoch": 3.710460016795745, + "grad_norm": NaN, + "learning_rate": 0.00010089540501890392, + "loss": 0.0, + "step": 39765 + }, + { + "epoch": 3.7105533264906225, + "grad_norm": NaN, + "learning_rate": 0.00010088825807413505, + "loss": 0.0, + "step": 39766 + }, + { + "epoch": 3.7106466361854995, + "grad_norm": NaN, + "learning_rate": 0.00010088111125424277, + "loss": 0.0, + "step": 39767 + }, + { + "epoch": 3.710739945880377, + "grad_norm": NaN, + "learning_rate": 0.0001008739645592453, + "loss": 0.0, + "step": 39768 + }, + { + "epoch": 3.7108332555752543, + "grad_norm": NaN, + "learning_rate": 0.00010086681798916075, + "loss": 0.0, + "step": 39769 + }, + { + "epoch": 3.7109265652701318, + "grad_norm": NaN, + "learning_rate": 0.00010085967154400736, + "loss": 0.0, + "step": 39770 + }, + { + "epoch": 3.7110198749650087, + "grad_norm": NaN, + "learning_rate": 0.00010085252522380331, + "loss": 0.0, + "step": 39771 + }, + { + "epoch": 3.711113184659886, + "grad_norm": NaN, + "learning_rate": 0.00010084537902856664, + "loss": 0.0, + "step": 39772 + }, + { + "epoch": 3.7112064943547636, + "grad_norm": NaN, + "learning_rate": 0.0001008382329583157, + "loss": 0.0, + "step": 39773 + }, + { + "epoch": 3.7112998040496405, + "grad_norm": NaN, + "learning_rate": 0.00010083108701306851, + "loss": 0.0, + "step": 39774 + }, + { + "epoch": 3.711393113744518, + "grad_norm": NaN, + "learning_rate": 0.00010082394119284329, + "loss": 0.0, + "step": 39775 + }, + { + "epoch": 3.7114864234393954, + "grad_norm": NaN, + "learning_rate": 0.00010081679549765828, + "loss": 0.0, + "step": 39776 + }, + { + "epoch": 3.711579733134273, + "grad_norm": NaN, + "learning_rate": 0.00010080964992753157, + "loss": 0.0, + "step": 39777 + }, + { + "epoch": 3.71167304282915, + "grad_norm": NaN, + "learning_rate": 0.00010080250448248129, + "loss": 0.0, + "step": 39778 + }, + { + "epoch": 3.711766352524027, + "grad_norm": NaN, + "learning_rate": 0.00010079535916252573, + "loss": 0.0, + "step": 39779 + }, + { + "epoch": 3.7118596622189046, + "grad_norm": NaN, + "learning_rate": 0.00010078821396768299, + "loss": 0.0, + "step": 39780 + }, + { + "epoch": 3.7119529719137816, + "grad_norm": NaN, + "learning_rate": 0.00010078106889797121, + "loss": 0.0, + "step": 39781 + }, + { + "epoch": 3.712046281608659, + "grad_norm": NaN, + "learning_rate": 0.00010077392395340865, + "loss": 0.0, + "step": 39782 + }, + { + "epoch": 3.7121395913035364, + "grad_norm": NaN, + "learning_rate": 0.00010076677913401339, + "loss": 0.0, + "step": 39783 + }, + { + "epoch": 3.712232900998414, + "grad_norm": NaN, + "learning_rate": 0.00010075963443980357, + "loss": 0.0, + "step": 39784 + }, + { + "epoch": 3.7123262106932913, + "grad_norm": NaN, + "learning_rate": 0.0001007524898707975, + "loss": 0.0, + "step": 39785 + }, + { + "epoch": 3.7124195203881682, + "grad_norm": NaN, + "learning_rate": 0.00010074534542701324, + "loss": 0.0, + "step": 39786 + }, + { + "epoch": 3.7125128300830457, + "grad_norm": NaN, + "learning_rate": 0.00010073820110846892, + "loss": 0.0, + "step": 39787 + }, + { + "epoch": 3.712606139777923, + "grad_norm": NaN, + "learning_rate": 0.00010073105691518285, + "loss": 0.0, + "step": 39788 + }, + { + "epoch": 3.7126994494728, + "grad_norm": NaN, + "learning_rate": 0.00010072391284717306, + "loss": 0.0, + "step": 39789 + }, + { + "epoch": 3.7127927591676775, + "grad_norm": NaN, + "learning_rate": 0.00010071676890445774, + "loss": 0.0, + "step": 39790 + }, + { + "epoch": 3.712886068862555, + "grad_norm": NaN, + "learning_rate": 0.00010070962508705512, + "loss": 0.0, + "step": 39791 + }, + { + "epoch": 3.7129793785574323, + "grad_norm": NaN, + "learning_rate": 0.00010070248139498331, + "loss": 0.0, + "step": 39792 + }, + { + "epoch": 3.7130726882523093, + "grad_norm": NaN, + "learning_rate": 0.00010069533782826045, + "loss": 0.0, + "step": 39793 + }, + { + "epoch": 3.7131659979471867, + "grad_norm": NaN, + "learning_rate": 0.0001006881943869048, + "loss": 0.0, + "step": 39794 + }, + { + "epoch": 3.713259307642064, + "grad_norm": NaN, + "learning_rate": 0.00010068105107093445, + "loss": 0.0, + "step": 39795 + }, + { + "epoch": 3.713352617336941, + "grad_norm": NaN, + "learning_rate": 0.00010067390788036753, + "loss": 0.0, + "step": 39796 + }, + { + "epoch": 3.7134459270318185, + "grad_norm": NaN, + "learning_rate": 0.00010066676481522232, + "loss": 0.0, + "step": 39797 + }, + { + "epoch": 3.713539236726696, + "grad_norm": NaN, + "learning_rate": 0.00010065962187551687, + "loss": 0.0, + "step": 39798 + }, + { + "epoch": 3.7136325464215734, + "grad_norm": NaN, + "learning_rate": 0.00010065247906126937, + "loss": 0.0, + "step": 39799 + }, + { + "epoch": 3.7137258561164503, + "grad_norm": NaN, + "learning_rate": 0.00010064533637249805, + "loss": 0.0, + "step": 39800 + }, + { + "epoch": 3.7138191658113278, + "grad_norm": NaN, + "learning_rate": 0.00010063819380922099, + "loss": 0.0, + "step": 39801 + }, + { + "epoch": 3.713912475506205, + "grad_norm": NaN, + "learning_rate": 0.00010063105137145636, + "loss": 0.0, + "step": 39802 + }, + { + "epoch": 3.714005785201082, + "grad_norm": NaN, + "learning_rate": 0.00010062390905922239, + "loss": 0.0, + "step": 39803 + }, + { + "epoch": 3.7140990948959596, + "grad_norm": NaN, + "learning_rate": 0.00010061676687253718, + "loss": 0.0, + "step": 39804 + }, + { + "epoch": 3.714192404590837, + "grad_norm": NaN, + "learning_rate": 0.00010060962481141887, + "loss": 0.0, + "step": 39805 + }, + { + "epoch": 3.7142857142857144, + "grad_norm": NaN, + "learning_rate": 0.00010060248287588569, + "loss": 0.0, + "step": 39806 + }, + { + "epoch": 3.714379023980592, + "grad_norm": NaN, + "learning_rate": 0.00010059534106595572, + "loss": 0.0, + "step": 39807 + }, + { + "epoch": 3.714472333675469, + "grad_norm": NaN, + "learning_rate": 0.00010058819938164719, + "loss": 0.0, + "step": 39808 + }, + { + "epoch": 3.7145656433703462, + "grad_norm": NaN, + "learning_rate": 0.00010058105782297826, + "loss": 0.0, + "step": 39809 + }, + { + "epoch": 3.714658953065223, + "grad_norm": NaN, + "learning_rate": 0.000100573916389967, + "loss": 0.0, + "step": 39810 + }, + { + "epoch": 3.7147522627601006, + "grad_norm": NaN, + "learning_rate": 0.00010056677508263169, + "loss": 0.0, + "step": 39811 + }, + { + "epoch": 3.714845572454978, + "grad_norm": NaN, + "learning_rate": 0.00010055963390099044, + "loss": 0.0, + "step": 39812 + }, + { + "epoch": 3.7149388821498555, + "grad_norm": NaN, + "learning_rate": 0.00010055249284506132, + "loss": 0.0, + "step": 39813 + }, + { + "epoch": 3.715032191844733, + "grad_norm": NaN, + "learning_rate": 0.00010054535191486259, + "loss": 0.0, + "step": 39814 + }, + { + "epoch": 3.71512550153961, + "grad_norm": NaN, + "learning_rate": 0.00010053821111041243, + "loss": 0.0, + "step": 39815 + }, + { + "epoch": 3.7152188112344873, + "grad_norm": NaN, + "learning_rate": 0.00010053107043172887, + "loss": 0.0, + "step": 39816 + }, + { + "epoch": 3.7153121209293647, + "grad_norm": NaN, + "learning_rate": 0.00010052392987883021, + "loss": 0.0, + "step": 39817 + }, + { + "epoch": 3.7154054306242417, + "grad_norm": NaN, + "learning_rate": 0.00010051678945173453, + "loss": 0.0, + "step": 39818 + }, + { + "epoch": 3.715498740319119, + "grad_norm": NaN, + "learning_rate": 0.00010050964915045994, + "loss": 0.0, + "step": 39819 + }, + { + "epoch": 3.7155920500139965, + "grad_norm": NaN, + "learning_rate": 0.00010050250897502472, + "loss": 0.0, + "step": 39820 + }, + { + "epoch": 3.715685359708874, + "grad_norm": NaN, + "learning_rate": 0.00010049536892544693, + "loss": 0.0, + "step": 39821 + }, + { + "epoch": 3.715778669403751, + "grad_norm": NaN, + "learning_rate": 0.00010048822900174474, + "loss": 0.0, + "step": 39822 + }, + { + "epoch": 3.7158719790986283, + "grad_norm": NaN, + "learning_rate": 0.00010048108920393635, + "loss": 0.0, + "step": 39823 + }, + { + "epoch": 3.7159652887935057, + "grad_norm": NaN, + "learning_rate": 0.00010047394953203987, + "loss": 0.0, + "step": 39824 + }, + { + "epoch": 3.7160585984883827, + "grad_norm": NaN, + "learning_rate": 0.00010046680998607342, + "loss": 0.0, + "step": 39825 + }, + { + "epoch": 3.71615190818326, + "grad_norm": NaN, + "learning_rate": 0.00010045967056605526, + "loss": 0.0, + "step": 39826 + }, + { + "epoch": 3.7162452178781376, + "grad_norm": NaN, + "learning_rate": 0.00010045253127200347, + "loss": 0.0, + "step": 39827 + }, + { + "epoch": 3.716338527573015, + "grad_norm": NaN, + "learning_rate": 0.00010044539210393617, + "loss": 0.0, + "step": 39828 + }, + { + "epoch": 3.7164318372678924, + "grad_norm": NaN, + "learning_rate": 0.00010043825306187163, + "loss": 0.0, + "step": 39829 + }, + { + "epoch": 3.7165251469627694, + "grad_norm": NaN, + "learning_rate": 0.00010043111414582788, + "loss": 0.0, + "step": 39830 + }, + { + "epoch": 3.716618456657647, + "grad_norm": NaN, + "learning_rate": 0.00010042397535582312, + "loss": 0.0, + "step": 39831 + }, + { + "epoch": 3.7167117663525238, + "grad_norm": NaN, + "learning_rate": 0.00010041683669187554, + "loss": 0.0, + "step": 39832 + }, + { + "epoch": 3.716805076047401, + "grad_norm": NaN, + "learning_rate": 0.00010040969815400326, + "loss": 0.0, + "step": 39833 + }, + { + "epoch": 3.7168983857422786, + "grad_norm": NaN, + "learning_rate": 0.00010040255974222438, + "loss": 0.0, + "step": 39834 + }, + { + "epoch": 3.716991695437156, + "grad_norm": NaN, + "learning_rate": 0.00010039542145655714, + "loss": 0.0, + "step": 39835 + }, + { + "epoch": 3.7170850051320334, + "grad_norm": NaN, + "learning_rate": 0.00010038828329701966, + "loss": 0.0, + "step": 39836 + }, + { + "epoch": 3.7171783148269104, + "grad_norm": NaN, + "learning_rate": 0.00010038114526363002, + "loss": 0.0, + "step": 39837 + }, + { + "epoch": 3.717271624521788, + "grad_norm": NaN, + "learning_rate": 0.00010037400735640651, + "loss": 0.0, + "step": 39838 + }, + { + "epoch": 3.7173649342166653, + "grad_norm": NaN, + "learning_rate": 0.00010036686957536715, + "loss": 0.0, + "step": 39839 + }, + { + "epoch": 3.7174582439115422, + "grad_norm": NaN, + "learning_rate": 0.0001003597319205301, + "loss": 0.0, + "step": 39840 + }, + { + "epoch": 3.7175515536064196, + "grad_norm": NaN, + "learning_rate": 0.00010035259439191361, + "loss": 0.0, + "step": 39841 + }, + { + "epoch": 3.717644863301297, + "grad_norm": NaN, + "learning_rate": 0.00010034545698953575, + "loss": 0.0, + "step": 39842 + }, + { + "epoch": 3.7177381729961745, + "grad_norm": NaN, + "learning_rate": 0.00010033831971341467, + "loss": 0.0, + "step": 39843 + }, + { + "epoch": 3.7178314826910515, + "grad_norm": NaN, + "learning_rate": 0.00010033118256356856, + "loss": 0.0, + "step": 39844 + }, + { + "epoch": 3.717924792385929, + "grad_norm": NaN, + "learning_rate": 0.00010032404554001548, + "loss": 0.0, + "step": 39845 + }, + { + "epoch": 3.7180181020808063, + "grad_norm": NaN, + "learning_rate": 0.0001003169086427737, + "loss": 0.0, + "step": 39846 + }, + { + "epoch": 3.7181114117756833, + "grad_norm": NaN, + "learning_rate": 0.00010030977187186129, + "loss": 0.0, + "step": 39847 + }, + { + "epoch": 3.7182047214705607, + "grad_norm": NaN, + "learning_rate": 0.00010030263522729639, + "loss": 0.0, + "step": 39848 + }, + { + "epoch": 3.718298031165438, + "grad_norm": NaN, + "learning_rate": 0.00010029549870909717, + "loss": 0.0, + "step": 39849 + }, + { + "epoch": 3.7183913408603155, + "grad_norm": NaN, + "learning_rate": 0.00010028836231728182, + "loss": 0.0, + "step": 39850 + }, + { + "epoch": 3.718484650555193, + "grad_norm": NaN, + "learning_rate": 0.00010028122605186837, + "loss": 0.0, + "step": 39851 + }, + { + "epoch": 3.71857796025007, + "grad_norm": NaN, + "learning_rate": 0.00010027408991287507, + "loss": 0.0, + "step": 39852 + }, + { + "epoch": 3.7186712699449473, + "grad_norm": NaN, + "learning_rate": 0.00010026695390032004, + "loss": 0.0, + "step": 39853 + }, + { + "epoch": 3.7187645796398243, + "grad_norm": NaN, + "learning_rate": 0.0001002598180142214, + "loss": 0.0, + "step": 39854 + }, + { + "epoch": 3.7188578893347017, + "grad_norm": NaN, + "learning_rate": 0.0001002526822545973, + "loss": 0.0, + "step": 39855 + }, + { + "epoch": 3.718951199029579, + "grad_norm": NaN, + "learning_rate": 0.00010024554662146593, + "loss": 0.0, + "step": 39856 + }, + { + "epoch": 3.7190445087244566, + "grad_norm": NaN, + "learning_rate": 0.00010023841111484533, + "loss": 0.0, + "step": 39857 + }, + { + "epoch": 3.719137818419334, + "grad_norm": NaN, + "learning_rate": 0.00010023127573475375, + "loss": 0.0, + "step": 39858 + }, + { + "epoch": 3.719231128114211, + "grad_norm": NaN, + "learning_rate": 0.00010022414048120934, + "loss": 0.0, + "step": 39859 + }, + { + "epoch": 3.7193244378090884, + "grad_norm": NaN, + "learning_rate": 0.00010021700535423011, + "loss": 0.0, + "step": 39860 + }, + { + "epoch": 3.719417747503966, + "grad_norm": NaN, + "learning_rate": 0.00010020987035383432, + "loss": 0.0, + "step": 39861 + }, + { + "epoch": 3.719511057198843, + "grad_norm": NaN, + "learning_rate": 0.00010020273548004013, + "loss": 0.0, + "step": 39862 + }, + { + "epoch": 3.71960436689372, + "grad_norm": NaN, + "learning_rate": 0.00010019560073286555, + "loss": 0.0, + "step": 39863 + }, + { + "epoch": 3.7196976765885976, + "grad_norm": NaN, + "learning_rate": 0.00010018846611232887, + "loss": 0.0, + "step": 39864 + }, + { + "epoch": 3.719790986283475, + "grad_norm": NaN, + "learning_rate": 0.00010018133161844815, + "loss": 0.0, + "step": 39865 + }, + { + "epoch": 3.719884295978352, + "grad_norm": NaN, + "learning_rate": 0.0001001741972512415, + "loss": 0.0, + "step": 39866 + }, + { + "epoch": 3.7199776056732294, + "grad_norm": NaN, + "learning_rate": 0.00010016706301072716, + "loss": 0.0, + "step": 39867 + }, + { + "epoch": 3.720070915368107, + "grad_norm": NaN, + "learning_rate": 0.00010015992889692322, + "loss": 0.0, + "step": 39868 + }, + { + "epoch": 3.720164225062984, + "grad_norm": NaN, + "learning_rate": 0.00010015279490984776, + "loss": 0.0, + "step": 39869 + }, + { + "epoch": 3.7202575347578613, + "grad_norm": NaN, + "learning_rate": 0.00010014566104951905, + "loss": 0.0, + "step": 39870 + }, + { + "epoch": 3.7203508444527387, + "grad_norm": NaN, + "learning_rate": 0.00010013852731595511, + "loss": 0.0, + "step": 39871 + }, + { + "epoch": 3.720444154147616, + "grad_norm": NaN, + "learning_rate": 0.00010013139370917413, + "loss": 0.0, + "step": 39872 + }, + { + "epoch": 3.7205374638424935, + "grad_norm": NaN, + "learning_rate": 0.00010012426022919428, + "loss": 0.0, + "step": 39873 + }, + { + "epoch": 3.7206307735373705, + "grad_norm": NaN, + "learning_rate": 0.00010011712687603361, + "loss": 0.0, + "step": 39874 + }, + { + "epoch": 3.720724083232248, + "grad_norm": NaN, + "learning_rate": 0.0001001099936497103, + "loss": 0.0, + "step": 39875 + }, + { + "epoch": 3.720817392927125, + "grad_norm": NaN, + "learning_rate": 0.00010010286055024257, + "loss": 0.0, + "step": 39876 + }, + { + "epoch": 3.7209107026220023, + "grad_norm": NaN, + "learning_rate": 0.00010009572757764843, + "loss": 0.0, + "step": 39877 + }, + { + "epoch": 3.7210040123168797, + "grad_norm": NaN, + "learning_rate": 0.00010008859473194607, + "loss": 0.0, + "step": 39878 + }, + { + "epoch": 3.721097322011757, + "grad_norm": NaN, + "learning_rate": 0.00010008146201315367, + "loss": 0.0, + "step": 39879 + }, + { + "epoch": 3.7211906317066346, + "grad_norm": NaN, + "learning_rate": 0.00010007432942128927, + "loss": 0.0, + "step": 39880 + }, + { + "epoch": 3.7212839414015115, + "grad_norm": NaN, + "learning_rate": 0.00010006719695637108, + "loss": 0.0, + "step": 39881 + }, + { + "epoch": 3.721377251096389, + "grad_norm": NaN, + "learning_rate": 0.00010006006461841727, + "loss": 0.0, + "step": 39882 + }, + { + "epoch": 3.7214705607912664, + "grad_norm": NaN, + "learning_rate": 0.00010005293240744583, + "loss": 0.0, + "step": 39883 + }, + { + "epoch": 3.7215638704861433, + "grad_norm": NaN, + "learning_rate": 0.00010004580032347506, + "loss": 0.0, + "step": 39884 + }, + { + "epoch": 3.7216571801810208, + "grad_norm": NaN, + "learning_rate": 0.00010003866836652301, + "loss": 0.0, + "step": 39885 + }, + { + "epoch": 3.721750489875898, + "grad_norm": NaN, + "learning_rate": 0.00010003153653660778, + "loss": 0.0, + "step": 39886 + }, + { + "epoch": 3.7218437995707756, + "grad_norm": NaN, + "learning_rate": 0.0001000244048337476, + "loss": 0.0, + "step": 39887 + }, + { + "epoch": 3.7219371092656526, + "grad_norm": NaN, + "learning_rate": 0.00010001727325796057, + "loss": 0.0, + "step": 39888 + }, + { + "epoch": 3.72203041896053, + "grad_norm": NaN, + "learning_rate": 0.00010001014180926474, + "loss": 0.0, + "step": 39889 + }, + { + "epoch": 3.7221237286554074, + "grad_norm": NaN, + "learning_rate": 0.00010000301048767836, + "loss": 0.0, + "step": 39890 + }, + { + "epoch": 3.7222170383502844, + "grad_norm": NaN, + "learning_rate": 9.999587929321954e-05, + "loss": 0.0, + "step": 39891 + }, + { + "epoch": 3.722310348045162, + "grad_norm": NaN, + "learning_rate": 9.998874822590633e-05, + "loss": 0.0, + "step": 39892 + }, + { + "epoch": 3.7224036577400392, + "grad_norm": NaN, + "learning_rate": 9.998161728575692e-05, + "loss": 0.0, + "step": 39893 + }, + { + "epoch": 3.7224969674349166, + "grad_norm": NaN, + "learning_rate": 9.99744864727895e-05, + "loss": 0.0, + "step": 39894 + }, + { + "epoch": 3.7225902771297936, + "grad_norm": NaN, + "learning_rate": 9.996735578702208e-05, + "loss": 0.0, + "step": 39895 + }, + { + "epoch": 3.722683586824671, + "grad_norm": NaN, + "learning_rate": 9.996022522847287e-05, + "loss": 0.0, + "step": 39896 + }, + { + "epoch": 3.7227768965195485, + "grad_norm": NaN, + "learning_rate": 9.995309479716001e-05, + "loss": 0.0, + "step": 39897 + }, + { + "epoch": 3.7228702062144254, + "grad_norm": NaN, + "learning_rate": 9.994596449310156e-05, + "loss": 0.0, + "step": 39898 + }, + { + "epoch": 3.722963515909303, + "grad_norm": NaN, + "learning_rate": 9.993883431631573e-05, + "loss": 0.0, + "step": 39899 + }, + { + "epoch": 3.7230568256041803, + "grad_norm": NaN, + "learning_rate": 9.993170426682065e-05, + "loss": 0.0, + "step": 39900 + }, + { + "epoch": 3.7231501352990577, + "grad_norm": NaN, + "learning_rate": 9.992457434463433e-05, + "loss": 0.0, + "step": 39901 + }, + { + "epoch": 3.723243444993935, + "grad_norm": NaN, + "learning_rate": 9.991744454977502e-05, + "loss": 0.0, + "step": 39902 + }, + { + "epoch": 3.723336754688812, + "grad_norm": NaN, + "learning_rate": 9.991031488226085e-05, + "loss": 0.0, + "step": 39903 + }, + { + "epoch": 3.7234300643836895, + "grad_norm": NaN, + "learning_rate": 9.990318534210986e-05, + "loss": 0.0, + "step": 39904 + }, + { + "epoch": 3.723523374078567, + "grad_norm": NaN, + "learning_rate": 9.989605592934024e-05, + "loss": 0.0, + "step": 39905 + }, + { + "epoch": 3.723616683773444, + "grad_norm": NaN, + "learning_rate": 9.988892664397016e-05, + "loss": 0.0, + "step": 39906 + }, + { + "epoch": 3.7237099934683213, + "grad_norm": NaN, + "learning_rate": 9.988179748601761e-05, + "loss": 0.0, + "step": 39907 + }, + { + "epoch": 3.7238033031631987, + "grad_norm": NaN, + "learning_rate": 9.98746684555009e-05, + "loss": 0.0, + "step": 39908 + }, + { + "epoch": 3.723896612858076, + "grad_norm": NaN, + "learning_rate": 9.986753955243799e-05, + "loss": 0.0, + "step": 39909 + }, + { + "epoch": 3.723989922552953, + "grad_norm": NaN, + "learning_rate": 9.986041077684705e-05, + "loss": 0.0, + "step": 39910 + }, + { + "epoch": 3.7240832322478306, + "grad_norm": NaN, + "learning_rate": 9.985328212874632e-05, + "loss": 0.0, + "step": 39911 + }, + { + "epoch": 3.724176541942708, + "grad_norm": NaN, + "learning_rate": 9.984615360815379e-05, + "loss": 0.0, + "step": 39912 + }, + { + "epoch": 3.724269851637585, + "grad_norm": NaN, + "learning_rate": 9.983902521508762e-05, + "loss": 0.0, + "step": 39913 + }, + { + "epoch": 3.7243631613324624, + "grad_norm": NaN, + "learning_rate": 9.983189694956599e-05, + "loss": 0.0, + "step": 39914 + }, + { + "epoch": 3.72445647102734, + "grad_norm": NaN, + "learning_rate": 9.982476881160696e-05, + "loss": 0.0, + "step": 39915 + }, + { + "epoch": 3.724549780722217, + "grad_norm": NaN, + "learning_rate": 9.981764080122865e-05, + "loss": 0.0, + "step": 39916 + }, + { + "epoch": 3.724643090417094, + "grad_norm": NaN, + "learning_rate": 9.981051291844929e-05, + "loss": 0.0, + "step": 39917 + }, + { + "epoch": 3.7247364001119716, + "grad_norm": NaN, + "learning_rate": 9.980338516328686e-05, + "loss": 0.0, + "step": 39918 + }, + { + "epoch": 3.724829709806849, + "grad_norm": NaN, + "learning_rate": 9.979625753575958e-05, + "loss": 0.0, + "step": 39919 + }, + { + "epoch": 3.724923019501726, + "grad_norm": NaN, + "learning_rate": 9.978913003588557e-05, + "loss": 0.0, + "step": 39920 + }, + { + "epoch": 3.7250163291966034, + "grad_norm": NaN, + "learning_rate": 9.978200266368287e-05, + "loss": 0.0, + "step": 39921 + }, + { + "epoch": 3.725109638891481, + "grad_norm": NaN, + "learning_rate": 9.977487541916971e-05, + "loss": 0.0, + "step": 39922 + }, + { + "epoch": 3.7252029485863583, + "grad_norm": NaN, + "learning_rate": 9.976774830236419e-05, + "loss": 0.0, + "step": 39923 + }, + { + "epoch": 3.7252962582812357, + "grad_norm": NaN, + "learning_rate": 9.976062131328432e-05, + "loss": 0.0, + "step": 39924 + }, + { + "epoch": 3.7253895679761126, + "grad_norm": NaN, + "learning_rate": 9.975349445194836e-05, + "loss": 0.0, + "step": 39925 + }, + { + "epoch": 3.72548287767099, + "grad_norm": NaN, + "learning_rate": 9.974636771837439e-05, + "loss": 0.0, + "step": 39926 + }, + { + "epoch": 3.725576187365867, + "grad_norm": NaN, + "learning_rate": 9.973924111258048e-05, + "loss": 0.0, + "step": 39927 + }, + { + "epoch": 3.7256694970607445, + "grad_norm": NaN, + "learning_rate": 9.97321146345848e-05, + "loss": 0.0, + "step": 39928 + }, + { + "epoch": 3.725762806755622, + "grad_norm": NaN, + "learning_rate": 9.972498828440552e-05, + "loss": 0.0, + "step": 39929 + }, + { + "epoch": 3.7258561164504993, + "grad_norm": NaN, + "learning_rate": 9.971786206206062e-05, + "loss": 0.0, + "step": 39930 + }, + { + "epoch": 3.7259494261453767, + "grad_norm": NaN, + "learning_rate": 9.971073596756836e-05, + "loss": 0.0, + "step": 39931 + }, + { + "epoch": 3.7260427358402537, + "grad_norm": NaN, + "learning_rate": 9.970361000094681e-05, + "loss": 0.0, + "step": 39932 + }, + { + "epoch": 3.726136045535131, + "grad_norm": NaN, + "learning_rate": 9.969648416221403e-05, + "loss": 0.0, + "step": 39933 + }, + { + "epoch": 3.7262293552300085, + "grad_norm": NaN, + "learning_rate": 9.96893584513882e-05, + "loss": 0.0, + "step": 39934 + }, + { + "epoch": 3.7263226649248855, + "grad_norm": NaN, + "learning_rate": 9.968223286848747e-05, + "loss": 0.0, + "step": 39935 + }, + { + "epoch": 3.726415974619763, + "grad_norm": NaN, + "learning_rate": 9.967510741352988e-05, + "loss": 0.0, + "step": 39936 + }, + { + "epoch": 3.7265092843146403, + "grad_norm": NaN, + "learning_rate": 9.966798208653358e-05, + "loss": 0.0, + "step": 39937 + }, + { + "epoch": 3.7266025940095178, + "grad_norm": NaN, + "learning_rate": 9.966085688751673e-05, + "loss": 0.0, + "step": 39938 + }, + { + "epoch": 3.7266959037043947, + "grad_norm": NaN, + "learning_rate": 9.965373181649736e-05, + "loss": 0.0, + "step": 39939 + }, + { + "epoch": 3.726789213399272, + "grad_norm": NaN, + "learning_rate": 9.964660687349365e-05, + "loss": 0.0, + "step": 39940 + }, + { + "epoch": 3.7268825230941496, + "grad_norm": NaN, + "learning_rate": 9.963948205852375e-05, + "loss": 0.0, + "step": 39941 + }, + { + "epoch": 3.7269758327890266, + "grad_norm": NaN, + "learning_rate": 9.963235737160567e-05, + "loss": 0.0, + "step": 39942 + }, + { + "epoch": 3.727069142483904, + "grad_norm": NaN, + "learning_rate": 9.962523281275762e-05, + "loss": 0.0, + "step": 39943 + }, + { + "epoch": 3.7271624521787814, + "grad_norm": NaN, + "learning_rate": 9.961810838199768e-05, + "loss": 0.0, + "step": 39944 + }, + { + "epoch": 3.727255761873659, + "grad_norm": NaN, + "learning_rate": 9.961098407934392e-05, + "loss": 0.0, + "step": 39945 + }, + { + "epoch": 3.7273490715685362, + "grad_norm": NaN, + "learning_rate": 9.960385990481455e-05, + "loss": 0.0, + "step": 39946 + }, + { + "epoch": 3.727442381263413, + "grad_norm": NaN, + "learning_rate": 9.959673585842764e-05, + "loss": 0.0, + "step": 39947 + }, + { + "epoch": 3.7275356909582906, + "grad_norm": NaN, + "learning_rate": 9.958961194020125e-05, + "loss": 0.0, + "step": 39948 + }, + { + "epoch": 3.7276290006531676, + "grad_norm": NaN, + "learning_rate": 9.95824881501536e-05, + "loss": 0.0, + "step": 39949 + }, + { + "epoch": 3.727722310348045, + "grad_norm": NaN, + "learning_rate": 9.957536448830276e-05, + "loss": 0.0, + "step": 39950 + }, + { + "epoch": 3.7278156200429224, + "grad_norm": NaN, + "learning_rate": 9.956824095466676e-05, + "loss": 0.0, + "step": 39951 + }, + { + "epoch": 3.7279089297378, + "grad_norm": NaN, + "learning_rate": 9.956111754926385e-05, + "loss": 0.0, + "step": 39952 + }, + { + "epoch": 3.7280022394326773, + "grad_norm": NaN, + "learning_rate": 9.955399427211204e-05, + "loss": 0.0, + "step": 39953 + }, + { + "epoch": 3.7280955491275543, + "grad_norm": NaN, + "learning_rate": 9.954687112322944e-05, + "loss": 0.0, + "step": 39954 + }, + { + "epoch": 3.7281888588224317, + "grad_norm": NaN, + "learning_rate": 9.953974810263428e-05, + "loss": 0.0, + "step": 39955 + }, + { + "epoch": 3.728282168517309, + "grad_norm": NaN, + "learning_rate": 9.953262521034455e-05, + "loss": 0.0, + "step": 39956 + }, + { + "epoch": 3.728375478212186, + "grad_norm": NaN, + "learning_rate": 9.952550244637841e-05, + "loss": 0.0, + "step": 39957 + }, + { + "epoch": 3.7284687879070635, + "grad_norm": NaN, + "learning_rate": 9.9518379810754e-05, + "loss": 0.0, + "step": 39958 + }, + { + "epoch": 3.728562097601941, + "grad_norm": NaN, + "learning_rate": 9.951125730348935e-05, + "loss": 0.0, + "step": 39959 + }, + { + "epoch": 3.7286554072968183, + "grad_norm": NaN, + "learning_rate": 9.950413492460262e-05, + "loss": 0.0, + "step": 39960 + }, + { + "epoch": 3.7287487169916953, + "grad_norm": NaN, + "learning_rate": 9.949701267411196e-05, + "loss": 0.0, + "step": 39961 + }, + { + "epoch": 3.7288420266865727, + "grad_norm": NaN, + "learning_rate": 9.948989055203539e-05, + "loss": 0.0, + "step": 39962 + }, + { + "epoch": 3.72893533638145, + "grad_norm": NaN, + "learning_rate": 9.948276855839106e-05, + "loss": 0.0, + "step": 39963 + }, + { + "epoch": 3.729028646076327, + "grad_norm": NaN, + "learning_rate": 9.947564669319715e-05, + "loss": 0.0, + "step": 39964 + }, + { + "epoch": 3.7291219557712045, + "grad_norm": NaN, + "learning_rate": 9.946852495647163e-05, + "loss": 0.0, + "step": 39965 + }, + { + "epoch": 3.729215265466082, + "grad_norm": NaN, + "learning_rate": 9.946140334823273e-05, + "loss": 0.0, + "step": 39966 + }, + { + "epoch": 3.7293085751609594, + "grad_norm": NaN, + "learning_rate": 9.945428186849853e-05, + "loss": 0.0, + "step": 39967 + }, + { + "epoch": 3.729401884855837, + "grad_norm": NaN, + "learning_rate": 9.944716051728704e-05, + "loss": 0.0, + "step": 39968 + }, + { + "epoch": 3.7294951945507138, + "grad_norm": NaN, + "learning_rate": 9.94400392946165e-05, + "loss": 0.0, + "step": 39969 + }, + { + "epoch": 3.729588504245591, + "grad_norm": NaN, + "learning_rate": 9.943291820050499e-05, + "loss": 0.0, + "step": 39970 + }, + { + "epoch": 3.729681813940468, + "grad_norm": NaN, + "learning_rate": 9.942579723497052e-05, + "loss": 0.0, + "step": 39971 + }, + { + "epoch": 3.7297751236353456, + "grad_norm": NaN, + "learning_rate": 9.941867639803131e-05, + "loss": 0.0, + "step": 39972 + }, + { + "epoch": 3.729868433330223, + "grad_norm": NaN, + "learning_rate": 9.941155568970542e-05, + "loss": 0.0, + "step": 39973 + }, + { + "epoch": 3.7299617430251004, + "grad_norm": NaN, + "learning_rate": 9.940443511001093e-05, + "loss": 0.0, + "step": 39974 + }, + { + "epoch": 3.730055052719978, + "grad_norm": NaN, + "learning_rate": 9.9397314658966e-05, + "loss": 0.0, + "step": 39975 + }, + { + "epoch": 3.730148362414855, + "grad_norm": NaN, + "learning_rate": 9.939019433658874e-05, + "loss": 0.0, + "step": 39976 + }, + { + "epoch": 3.7302416721097322, + "grad_norm": NaN, + "learning_rate": 9.938307414289715e-05, + "loss": 0.0, + "step": 39977 + }, + { + "epoch": 3.7303349818046097, + "grad_norm": NaN, + "learning_rate": 9.937595407790945e-05, + "loss": 0.0, + "step": 39978 + }, + { + "epoch": 3.7304282914994866, + "grad_norm": NaN, + "learning_rate": 9.936883414164374e-05, + "loss": 0.0, + "step": 39979 + }, + { + "epoch": 3.730521601194364, + "grad_norm": NaN, + "learning_rate": 9.936171433411801e-05, + "loss": 0.0, + "step": 39980 + }, + { + "epoch": 3.7306149108892415, + "grad_norm": NaN, + "learning_rate": 9.935459465535047e-05, + "loss": 0.0, + "step": 39981 + }, + { + "epoch": 3.730708220584119, + "grad_norm": NaN, + "learning_rate": 9.934747510535924e-05, + "loss": 0.0, + "step": 39982 + }, + { + "epoch": 3.730801530278996, + "grad_norm": NaN, + "learning_rate": 9.934035568416229e-05, + "loss": 0.0, + "step": 39983 + }, + { + "epoch": 3.7308948399738733, + "grad_norm": NaN, + "learning_rate": 9.933323639177788e-05, + "loss": 0.0, + "step": 39984 + }, + { + "epoch": 3.7309881496687507, + "grad_norm": NaN, + "learning_rate": 9.932611722822404e-05, + "loss": 0.0, + "step": 39985 + }, + { + "epoch": 3.7310814593636277, + "grad_norm": NaN, + "learning_rate": 9.931899819351882e-05, + "loss": 0.0, + "step": 39986 + }, + { + "epoch": 3.731174769058505, + "grad_norm": NaN, + "learning_rate": 9.931187928768041e-05, + "loss": 0.0, + "step": 39987 + }, + { + "epoch": 3.7312680787533825, + "grad_norm": NaN, + "learning_rate": 9.93047605107269e-05, + "loss": 0.0, + "step": 39988 + }, + { + "epoch": 3.73136138844826, + "grad_norm": NaN, + "learning_rate": 9.92976418626763e-05, + "loss": 0.0, + "step": 39989 + }, + { + "epoch": 3.731454698143137, + "grad_norm": NaN, + "learning_rate": 9.92905233435468e-05, + "loss": 0.0, + "step": 39990 + }, + { + "epoch": 3.7315480078380143, + "grad_norm": NaN, + "learning_rate": 9.928340495335655e-05, + "loss": 0.0, + "step": 39991 + }, + { + "epoch": 3.7316413175328917, + "grad_norm": NaN, + "learning_rate": 9.92762866921235e-05, + "loss": 0.0, + "step": 39992 + }, + { + "epoch": 3.7317346272277687, + "grad_norm": NaN, + "learning_rate": 9.926916855986581e-05, + "loss": 0.0, + "step": 39993 + }, + { + "epoch": 3.731827936922646, + "grad_norm": NaN, + "learning_rate": 9.926205055660167e-05, + "loss": 0.0, + "step": 39994 + }, + { + "epoch": 3.7319212466175236, + "grad_norm": NaN, + "learning_rate": 9.925493268234906e-05, + "loss": 0.0, + "step": 39995 + }, + { + "epoch": 3.732014556312401, + "grad_norm": NaN, + "learning_rate": 9.924781493712612e-05, + "loss": 0.0, + "step": 39996 + }, + { + "epoch": 3.7321078660072784, + "grad_norm": NaN, + "learning_rate": 9.9240697320951e-05, + "loss": 0.0, + "step": 39997 + }, + { + "epoch": 3.7322011757021554, + "grad_norm": NaN, + "learning_rate": 9.923357983384171e-05, + "loss": 0.0, + "step": 39998 + }, + { + "epoch": 3.732294485397033, + "grad_norm": NaN, + "learning_rate": 9.922646247581643e-05, + "loss": 0.0, + "step": 39999 + }, + { + "epoch": 3.73238779509191, + "grad_norm": NaN, + "learning_rate": 9.921934524689315e-05, + "loss": 0.0, + "step": 40000 + }, + { + "epoch": 3.732481104786787, + "grad_norm": NaN, + "learning_rate": 9.921222814709007e-05, + "loss": 0.0, + "step": 40001 + }, + { + "epoch": 3.7325744144816646, + "grad_norm": NaN, + "learning_rate": 9.92051111764253e-05, + "loss": 0.0, + "step": 40002 + }, + { + "epoch": 3.732667724176542, + "grad_norm": NaN, + "learning_rate": 9.919799433491678e-05, + "loss": 0.0, + "step": 40003 + }, + { + "epoch": 3.7327610338714194, + "grad_norm": NaN, + "learning_rate": 9.919087762258277e-05, + "loss": 0.0, + "step": 40004 + }, + { + "epoch": 3.7328543435662964, + "grad_norm": NaN, + "learning_rate": 9.918376103944133e-05, + "loss": 0.0, + "step": 40005 + }, + { + "epoch": 3.732947653261174, + "grad_norm": NaN, + "learning_rate": 9.917664458551047e-05, + "loss": 0.0, + "step": 40006 + }, + { + "epoch": 3.7330409629560513, + "grad_norm": NaN, + "learning_rate": 9.916952826080839e-05, + "loss": 0.0, + "step": 40007 + }, + { + "epoch": 3.7331342726509282, + "grad_norm": NaN, + "learning_rate": 9.916241206535315e-05, + "loss": 0.0, + "step": 40008 + }, + { + "epoch": 3.7332275823458057, + "grad_norm": NaN, + "learning_rate": 9.91552959991628e-05, + "loss": 0.0, + "step": 40009 + }, + { + "epoch": 3.733320892040683, + "grad_norm": NaN, + "learning_rate": 9.914818006225549e-05, + "loss": 0.0, + "step": 40010 + }, + { + "epoch": 3.7334142017355605, + "grad_norm": NaN, + "learning_rate": 9.914106425464932e-05, + "loss": 0.0, + "step": 40011 + }, + { + "epoch": 3.7335075114304375, + "grad_norm": NaN, + "learning_rate": 9.913394857636232e-05, + "loss": 0.0, + "step": 40012 + }, + { + "epoch": 3.733600821125315, + "grad_norm": NaN, + "learning_rate": 9.912683302741262e-05, + "loss": 0.0, + "step": 40013 + }, + { + "epoch": 3.7336941308201923, + "grad_norm": NaN, + "learning_rate": 9.911971760781837e-05, + "loss": 0.0, + "step": 40014 + }, + { + "epoch": 3.7337874405150693, + "grad_norm": NaN, + "learning_rate": 9.911260231759753e-05, + "loss": 0.0, + "step": 40015 + }, + { + "epoch": 3.7338807502099467, + "grad_norm": NaN, + "learning_rate": 9.910548715676829e-05, + "loss": 0.0, + "step": 40016 + }, + { + "epoch": 3.733974059904824, + "grad_norm": NaN, + "learning_rate": 9.90983721253488e-05, + "loss": 0.0, + "step": 40017 + }, + { + "epoch": 3.7340673695997015, + "grad_norm": NaN, + "learning_rate": 9.909125722335696e-05, + "loss": 0.0, + "step": 40018 + }, + { + "epoch": 3.734160679294579, + "grad_norm": NaN, + "learning_rate": 9.908414245081101e-05, + "loss": 0.0, + "step": 40019 + }, + { + "epoch": 3.734253988989456, + "grad_norm": NaN, + "learning_rate": 9.907702780772903e-05, + "loss": 0.0, + "step": 40020 + }, + { + "epoch": 3.7343472986843333, + "grad_norm": NaN, + "learning_rate": 9.906991329412905e-05, + "loss": 0.0, + "step": 40021 + }, + { + "epoch": 3.7344406083792103, + "grad_norm": NaN, + "learning_rate": 9.906279891002922e-05, + "loss": 0.0, + "step": 40022 + }, + { + "epoch": 3.7345339180740877, + "grad_norm": NaN, + "learning_rate": 9.90556846554476e-05, + "loss": 0.0, + "step": 40023 + }, + { + "epoch": 3.734627227768965, + "grad_norm": NaN, + "learning_rate": 9.904857053040225e-05, + "loss": 0.0, + "step": 40024 + }, + { + "epoch": 3.7347205374638426, + "grad_norm": NaN, + "learning_rate": 9.904145653491132e-05, + "loss": 0.0, + "step": 40025 + }, + { + "epoch": 3.73481384715872, + "grad_norm": NaN, + "learning_rate": 9.903434266899291e-05, + "loss": 0.0, + "step": 40026 + }, + { + "epoch": 3.734907156853597, + "grad_norm": NaN, + "learning_rate": 9.902722893266499e-05, + "loss": 0.0, + "step": 40027 + }, + { + "epoch": 3.7350004665484744, + "grad_norm": NaN, + "learning_rate": 9.902011532594573e-05, + "loss": 0.0, + "step": 40028 + }, + { + "epoch": 3.735093776243352, + "grad_norm": NaN, + "learning_rate": 9.901300184885334e-05, + "loss": 0.0, + "step": 40029 + }, + { + "epoch": 3.735187085938229, + "grad_norm": NaN, + "learning_rate": 9.900588850140566e-05, + "loss": 0.0, + "step": 40030 + }, + { + "epoch": 3.735280395633106, + "grad_norm": NaN, + "learning_rate": 9.89987752836209e-05, + "loss": 0.0, + "step": 40031 + }, + { + "epoch": 3.7353737053279836, + "grad_norm": NaN, + "learning_rate": 9.899166219551721e-05, + "loss": 0.0, + "step": 40032 + }, + { + "epoch": 3.735467015022861, + "grad_norm": NaN, + "learning_rate": 9.898454923711258e-05, + "loss": 0.0, + "step": 40033 + }, + { + "epoch": 3.735560324717738, + "grad_norm": NaN, + "learning_rate": 9.897743640842509e-05, + "loss": 0.0, + "step": 40034 + }, + { + "epoch": 3.7356536344126154, + "grad_norm": NaN, + "learning_rate": 9.897032370947295e-05, + "loss": 0.0, + "step": 40035 + }, + { + "epoch": 3.735746944107493, + "grad_norm": NaN, + "learning_rate": 9.896321114027414e-05, + "loss": 0.0, + "step": 40036 + }, + { + "epoch": 3.73584025380237, + "grad_norm": NaN, + "learning_rate": 9.895609870084668e-05, + "loss": 0.0, + "step": 40037 + }, + { + "epoch": 3.7359335634972473, + "grad_norm": NaN, + "learning_rate": 9.894898639120885e-05, + "loss": 0.0, + "step": 40038 + }, + { + "epoch": 3.7360268731921247, + "grad_norm": NaN, + "learning_rate": 9.894187421137858e-05, + "loss": 0.0, + "step": 40039 + }, + { + "epoch": 3.736120182887002, + "grad_norm": NaN, + "learning_rate": 9.893476216137398e-05, + "loss": 0.0, + "step": 40040 + }, + { + "epoch": 3.7362134925818795, + "grad_norm": NaN, + "learning_rate": 9.89276502412132e-05, + "loss": 0.0, + "step": 40041 + }, + { + "epoch": 3.7363068022767565, + "grad_norm": NaN, + "learning_rate": 9.892053845091425e-05, + "loss": 0.0, + "step": 40042 + }, + { + "epoch": 3.736400111971634, + "grad_norm": NaN, + "learning_rate": 9.891342679049529e-05, + "loss": 0.0, + "step": 40043 + }, + { + "epoch": 3.736493421666511, + "grad_norm": NaN, + "learning_rate": 9.890631525997426e-05, + "loss": 0.0, + "step": 40044 + }, + { + "epoch": 3.7365867313613883, + "grad_norm": NaN, + "learning_rate": 9.889920385936937e-05, + "loss": 0.0, + "step": 40045 + }, + { + "epoch": 3.7366800410562657, + "grad_norm": NaN, + "learning_rate": 9.889209258869871e-05, + "loss": 0.0, + "step": 40046 + }, + { + "epoch": 3.736773350751143, + "grad_norm": NaN, + "learning_rate": 9.888498144798026e-05, + "loss": 0.0, + "step": 40047 + }, + { + "epoch": 3.7368666604460206, + "grad_norm": NaN, + "learning_rate": 9.887787043723218e-05, + "loss": 0.0, + "step": 40048 + }, + { + "epoch": 3.7369599701408975, + "grad_norm": NaN, + "learning_rate": 9.887075955647258e-05, + "loss": 0.0, + "step": 40049 + }, + { + "epoch": 3.737053279835775, + "grad_norm": NaN, + "learning_rate": 9.886364880571941e-05, + "loss": 0.0, + "step": 40050 + }, + { + "epoch": 3.7371465895306524, + "grad_norm": NaN, + "learning_rate": 9.885653818499088e-05, + "loss": 0.0, + "step": 40051 + }, + { + "epoch": 3.7372398992255293, + "grad_norm": NaN, + "learning_rate": 9.884942769430505e-05, + "loss": 0.0, + "step": 40052 + }, + { + "epoch": 3.7373332089204068, + "grad_norm": NaN, + "learning_rate": 9.884231733367991e-05, + "loss": 0.0, + "step": 40053 + }, + { + "epoch": 3.737426518615284, + "grad_norm": NaN, + "learning_rate": 9.883520710313365e-05, + "loss": 0.0, + "step": 40054 + }, + { + "epoch": 3.7375198283101616, + "grad_norm": NaN, + "learning_rate": 9.88280970026843e-05, + "loss": 0.0, + "step": 40055 + }, + { + "epoch": 3.7376131380050386, + "grad_norm": NaN, + "learning_rate": 9.882098703234989e-05, + "loss": 0.0, + "step": 40056 + }, + { + "epoch": 3.737706447699916, + "grad_norm": NaN, + "learning_rate": 9.881387719214861e-05, + "loss": 0.0, + "step": 40057 + }, + { + "epoch": 3.7377997573947934, + "grad_norm": NaN, + "learning_rate": 9.880676748209848e-05, + "loss": 0.0, + "step": 40058 + }, + { + "epoch": 3.7378930670896704, + "grad_norm": NaN, + "learning_rate": 9.87996579022175e-05, + "loss": 0.0, + "step": 40059 + }, + { + "epoch": 3.737986376784548, + "grad_norm": NaN, + "learning_rate": 9.879254845252387e-05, + "loss": 0.0, + "step": 40060 + }, + { + "epoch": 3.7380796864794252, + "grad_norm": NaN, + "learning_rate": 9.878543913303568e-05, + "loss": 0.0, + "step": 40061 + }, + { + "epoch": 3.7381729961743027, + "grad_norm": NaN, + "learning_rate": 9.877832994377087e-05, + "loss": 0.0, + "step": 40062 + }, + { + "epoch": 3.73826630586918, + "grad_norm": NaN, + "learning_rate": 9.87712208847476e-05, + "loss": 0.0, + "step": 40063 + }, + { + "epoch": 3.738359615564057, + "grad_norm": NaN, + "learning_rate": 9.876411195598399e-05, + "loss": 0.0, + "step": 40064 + }, + { + "epoch": 3.7384529252589345, + "grad_norm": NaN, + "learning_rate": 9.8757003157498e-05, + "loss": 0.0, + "step": 40065 + }, + { + "epoch": 3.7385462349538114, + "grad_norm": NaN, + "learning_rate": 9.874989448930776e-05, + "loss": 0.0, + "step": 40066 + }, + { + "epoch": 3.738639544648689, + "grad_norm": NaN, + "learning_rate": 9.874278595143144e-05, + "loss": 0.0, + "step": 40067 + }, + { + "epoch": 3.7387328543435663, + "grad_norm": NaN, + "learning_rate": 9.8735677543887e-05, + "loss": 0.0, + "step": 40068 + }, + { + "epoch": 3.7388261640384437, + "grad_norm": NaN, + "learning_rate": 9.872856926669251e-05, + "loss": 0.0, + "step": 40069 + }, + { + "epoch": 3.738919473733321, + "grad_norm": NaN, + "learning_rate": 9.872146111986615e-05, + "loss": 0.0, + "step": 40070 + }, + { + "epoch": 3.739012783428198, + "grad_norm": NaN, + "learning_rate": 9.871435310342587e-05, + "loss": 0.0, + "step": 40071 + }, + { + "epoch": 3.7391060931230755, + "grad_norm": NaN, + "learning_rate": 9.870724521738979e-05, + "loss": 0.0, + "step": 40072 + }, + { + "epoch": 3.739199402817953, + "grad_norm": NaN, + "learning_rate": 9.870013746177604e-05, + "loss": 0.0, + "step": 40073 + }, + { + "epoch": 3.73929271251283, + "grad_norm": NaN, + "learning_rate": 9.869302983660262e-05, + "loss": 0.0, + "step": 40074 + }, + { + "epoch": 3.7393860222077073, + "grad_norm": NaN, + "learning_rate": 9.868592234188758e-05, + "loss": 0.0, + "step": 40075 + }, + { + "epoch": 3.7394793319025847, + "grad_norm": NaN, + "learning_rate": 9.867881497764913e-05, + "loss": 0.0, + "step": 40076 + }, + { + "epoch": 3.739572641597462, + "grad_norm": NaN, + "learning_rate": 9.867170774390521e-05, + "loss": 0.0, + "step": 40077 + }, + { + "epoch": 3.739665951292339, + "grad_norm": NaN, + "learning_rate": 9.866460064067388e-05, + "loss": 0.0, + "step": 40078 + }, + { + "epoch": 3.7397592609872166, + "grad_norm": NaN, + "learning_rate": 9.865749366797335e-05, + "loss": 0.0, + "step": 40079 + }, + { + "epoch": 3.739852570682094, + "grad_norm": NaN, + "learning_rate": 9.865038682582156e-05, + "loss": 0.0, + "step": 40080 + }, + { + "epoch": 3.739945880376971, + "grad_norm": NaN, + "learning_rate": 9.864328011423661e-05, + "loss": 0.0, + "step": 40081 + }, + { + "epoch": 3.7400391900718484, + "grad_norm": NaN, + "learning_rate": 9.863617353323664e-05, + "loss": 0.0, + "step": 40082 + }, + { + "epoch": 3.740132499766726, + "grad_norm": NaN, + "learning_rate": 9.862906708283963e-05, + "loss": 0.0, + "step": 40083 + }, + { + "epoch": 3.740225809461603, + "grad_norm": NaN, + "learning_rate": 9.862196076306365e-05, + "loss": 0.0, + "step": 40084 + }, + { + "epoch": 3.7403191191564806, + "grad_norm": NaN, + "learning_rate": 9.861485457392687e-05, + "loss": 0.0, + "step": 40085 + }, + { + "epoch": 3.7404124288513576, + "grad_norm": NaN, + "learning_rate": 9.860774851544728e-05, + "loss": 0.0, + "step": 40086 + }, + { + "epoch": 3.740505738546235, + "grad_norm": NaN, + "learning_rate": 9.860064258764299e-05, + "loss": 0.0, + "step": 40087 + }, + { + "epoch": 3.740599048241112, + "grad_norm": NaN, + "learning_rate": 9.859353679053196e-05, + "loss": 0.0, + "step": 40088 + }, + { + "epoch": 3.7406923579359894, + "grad_norm": NaN, + "learning_rate": 9.858643112413236e-05, + "loss": 0.0, + "step": 40089 + }, + { + "epoch": 3.740785667630867, + "grad_norm": NaN, + "learning_rate": 9.85793255884623e-05, + "loss": 0.0, + "step": 40090 + }, + { + "epoch": 3.7408789773257443, + "grad_norm": NaN, + "learning_rate": 9.857222018353971e-05, + "loss": 0.0, + "step": 40091 + }, + { + "epoch": 3.7409722870206217, + "grad_norm": NaN, + "learning_rate": 9.856511490938274e-05, + "loss": 0.0, + "step": 40092 + }, + { + "epoch": 3.7410655967154987, + "grad_norm": NaN, + "learning_rate": 9.85580097660095e-05, + "loss": 0.0, + "step": 40093 + }, + { + "epoch": 3.741158906410376, + "grad_norm": NaN, + "learning_rate": 9.855090475343794e-05, + "loss": 0.0, + "step": 40094 + }, + { + "epoch": 3.7412522161052535, + "grad_norm": NaN, + "learning_rate": 9.85437998716862e-05, + "loss": 0.0, + "step": 40095 + }, + { + "epoch": 3.7413455258001305, + "grad_norm": NaN, + "learning_rate": 9.853669512077238e-05, + "loss": 0.0, + "step": 40096 + }, + { + "epoch": 3.741438835495008, + "grad_norm": NaN, + "learning_rate": 9.852959050071445e-05, + "loss": 0.0, + "step": 40097 + }, + { + "epoch": 3.7415321451898853, + "grad_norm": NaN, + "learning_rate": 9.852248601153053e-05, + "loss": 0.0, + "step": 40098 + }, + { + "epoch": 3.7416254548847627, + "grad_norm": NaN, + "learning_rate": 9.851538165323874e-05, + "loss": 0.0, + "step": 40099 + }, + { + "epoch": 3.7417187645796397, + "grad_norm": NaN, + "learning_rate": 9.850827742585701e-05, + "loss": 0.0, + "step": 40100 + }, + { + "epoch": 3.741812074274517, + "grad_norm": NaN, + "learning_rate": 9.850117332940347e-05, + "loss": 0.0, + "step": 40101 + }, + { + "epoch": 3.7419053839693945, + "grad_norm": NaN, + "learning_rate": 9.849406936389628e-05, + "loss": 0.0, + "step": 40102 + }, + { + "epoch": 3.7419986936642715, + "grad_norm": NaN, + "learning_rate": 9.848696552935333e-05, + "loss": 0.0, + "step": 40103 + }, + { + "epoch": 3.742092003359149, + "grad_norm": NaN, + "learning_rate": 9.847986182579275e-05, + "loss": 0.0, + "step": 40104 + }, + { + "epoch": 3.7421853130540264, + "grad_norm": NaN, + "learning_rate": 9.847275825323272e-05, + "loss": 0.0, + "step": 40105 + }, + { + "epoch": 3.7422786227489038, + "grad_norm": NaN, + "learning_rate": 9.846565481169114e-05, + "loss": 0.0, + "step": 40106 + }, + { + "epoch": 3.7423719324437807, + "grad_norm": NaN, + "learning_rate": 9.845855150118611e-05, + "loss": 0.0, + "step": 40107 + }, + { + "epoch": 3.742465242138658, + "grad_norm": NaN, + "learning_rate": 9.845144832173578e-05, + "loss": 0.0, + "step": 40108 + }, + { + "epoch": 3.7425585518335356, + "grad_norm": NaN, + "learning_rate": 9.84443452733581e-05, + "loss": 0.0, + "step": 40109 + }, + { + "epoch": 3.7426518615284126, + "grad_norm": NaN, + "learning_rate": 9.843724235607119e-05, + "loss": 0.0, + "step": 40110 + }, + { + "epoch": 3.74274517122329, + "grad_norm": NaN, + "learning_rate": 9.843013956989312e-05, + "loss": 0.0, + "step": 40111 + }, + { + "epoch": 3.7428384809181674, + "grad_norm": NaN, + "learning_rate": 9.842303691484191e-05, + "loss": 0.0, + "step": 40112 + }, + { + "epoch": 3.742931790613045, + "grad_norm": NaN, + "learning_rate": 9.841593439093562e-05, + "loss": 0.0, + "step": 40113 + }, + { + "epoch": 3.7430251003079222, + "grad_norm": NaN, + "learning_rate": 9.840883199819237e-05, + "loss": 0.0, + "step": 40114 + }, + { + "epoch": 3.743118410002799, + "grad_norm": NaN, + "learning_rate": 9.840172973663018e-05, + "loss": 0.0, + "step": 40115 + }, + { + "epoch": 3.7432117196976766, + "grad_norm": NaN, + "learning_rate": 9.839462760626703e-05, + "loss": 0.0, + "step": 40116 + }, + { + "epoch": 3.743305029392554, + "grad_norm": NaN, + "learning_rate": 9.838752560712116e-05, + "loss": 0.0, + "step": 40117 + }, + { + "epoch": 3.743398339087431, + "grad_norm": NaN, + "learning_rate": 9.838042373921047e-05, + "loss": 0.0, + "step": 40118 + }, + { + "epoch": 3.7434916487823084, + "grad_norm": NaN, + "learning_rate": 9.837332200255303e-05, + "loss": 0.0, + "step": 40119 + }, + { + "epoch": 3.743584958477186, + "grad_norm": NaN, + "learning_rate": 9.8366220397167e-05, + "loss": 0.0, + "step": 40120 + }, + { + "epoch": 3.7436782681720633, + "grad_norm": NaN, + "learning_rate": 9.835911892307036e-05, + "loss": 0.0, + "step": 40121 + }, + { + "epoch": 3.7437715778669403, + "grad_norm": NaN, + "learning_rate": 9.835201758028117e-05, + "loss": 0.0, + "step": 40122 + }, + { + "epoch": 3.7438648875618177, + "grad_norm": NaN, + "learning_rate": 9.834491636881754e-05, + "loss": 0.0, + "step": 40123 + }, + { + "epoch": 3.743958197256695, + "grad_norm": NaN, + "learning_rate": 9.833781528869745e-05, + "loss": 0.0, + "step": 40124 + }, + { + "epoch": 3.744051506951572, + "grad_norm": NaN, + "learning_rate": 9.833071433993898e-05, + "loss": 0.0, + "step": 40125 + }, + { + "epoch": 3.7441448166464495, + "grad_norm": NaN, + "learning_rate": 9.832361352256026e-05, + "loss": 0.0, + "step": 40126 + }, + { + "epoch": 3.744238126341327, + "grad_norm": NaN, + "learning_rate": 9.831651283657923e-05, + "loss": 0.0, + "step": 40127 + }, + { + "epoch": 3.7443314360362043, + "grad_norm": NaN, + "learning_rate": 9.830941228201397e-05, + "loss": 0.0, + "step": 40128 + }, + { + "epoch": 3.7444247457310813, + "grad_norm": NaN, + "learning_rate": 9.830231185888265e-05, + "loss": 0.0, + "step": 40129 + }, + { + "epoch": 3.7445180554259587, + "grad_norm": NaN, + "learning_rate": 9.82952115672032e-05, + "loss": 0.0, + "step": 40130 + }, + { + "epoch": 3.744611365120836, + "grad_norm": NaN, + "learning_rate": 9.828811140699373e-05, + "loss": 0.0, + "step": 40131 + }, + { + "epoch": 3.744704674815713, + "grad_norm": NaN, + "learning_rate": 9.828101137827223e-05, + "loss": 0.0, + "step": 40132 + }, + { + "epoch": 3.7447979845105905, + "grad_norm": NaN, + "learning_rate": 9.82739114810568e-05, + "loss": 0.0, + "step": 40133 + }, + { + "epoch": 3.744891294205468, + "grad_norm": NaN, + "learning_rate": 9.826681171536552e-05, + "loss": 0.0, + "step": 40134 + }, + { + "epoch": 3.7449846039003454, + "grad_norm": NaN, + "learning_rate": 9.825971208121638e-05, + "loss": 0.0, + "step": 40135 + }, + { + "epoch": 3.745077913595223, + "grad_norm": NaN, + "learning_rate": 9.825261257862748e-05, + "loss": 0.0, + "step": 40136 + }, + { + "epoch": 3.7451712232900998, + "grad_norm": NaN, + "learning_rate": 9.82455132076169e-05, + "loss": 0.0, + "step": 40137 + }, + { + "epoch": 3.745264532984977, + "grad_norm": NaN, + "learning_rate": 9.823841396820256e-05, + "loss": 0.0, + "step": 40138 + }, + { + "epoch": 3.745357842679854, + "grad_norm": NaN, + "learning_rate": 9.823131486040263e-05, + "loss": 0.0, + "step": 40139 + }, + { + "epoch": 3.7454511523747316, + "grad_norm": NaN, + "learning_rate": 9.822421588423521e-05, + "loss": 0.0, + "step": 40140 + }, + { + "epoch": 3.745544462069609, + "grad_norm": NaN, + "learning_rate": 9.821711703971819e-05, + "loss": 0.0, + "step": 40141 + }, + { + "epoch": 3.7456377717644864, + "grad_norm": NaN, + "learning_rate": 9.82100183268697e-05, + "loss": 0.0, + "step": 40142 + }, + { + "epoch": 3.745731081459364, + "grad_norm": NaN, + "learning_rate": 9.820291974570784e-05, + "loss": 0.0, + "step": 40143 + }, + { + "epoch": 3.745824391154241, + "grad_norm": NaN, + "learning_rate": 9.819582129625059e-05, + "loss": 0.0, + "step": 40144 + }, + { + "epoch": 3.7459177008491182, + "grad_norm": NaN, + "learning_rate": 9.818872297851599e-05, + "loss": 0.0, + "step": 40145 + }, + { + "epoch": 3.7460110105439957, + "grad_norm": NaN, + "learning_rate": 9.818162479252219e-05, + "loss": 0.0, + "step": 40146 + }, + { + "epoch": 3.7461043202388726, + "grad_norm": NaN, + "learning_rate": 9.817452673828711e-05, + "loss": 0.0, + "step": 40147 + }, + { + "epoch": 3.74619762993375, + "grad_norm": NaN, + "learning_rate": 9.816742881582886e-05, + "loss": 0.0, + "step": 40148 + }, + { + "epoch": 3.7462909396286275, + "grad_norm": NaN, + "learning_rate": 9.816033102516551e-05, + "loss": 0.0, + "step": 40149 + }, + { + "epoch": 3.746384249323505, + "grad_norm": NaN, + "learning_rate": 9.81532333663151e-05, + "loss": 0.0, + "step": 40150 + }, + { + "epoch": 3.746477559018382, + "grad_norm": NaN, + "learning_rate": 9.814613583929559e-05, + "loss": 0.0, + "step": 40151 + }, + { + "epoch": 3.7465708687132593, + "grad_norm": NaN, + "learning_rate": 9.813903844412517e-05, + "loss": 0.0, + "step": 40152 + }, + { + "epoch": 3.7466641784081367, + "grad_norm": NaN, + "learning_rate": 9.813194118082178e-05, + "loss": 0.0, + "step": 40153 + }, + { + "epoch": 3.7467574881030137, + "grad_norm": NaN, + "learning_rate": 9.812484404940347e-05, + "loss": 0.0, + "step": 40154 + }, + { + "epoch": 3.746850797797891, + "grad_norm": NaN, + "learning_rate": 9.811774704988838e-05, + "loss": 0.0, + "step": 40155 + }, + { + "epoch": 3.7469441074927685, + "grad_norm": NaN, + "learning_rate": 9.811065018229445e-05, + "loss": 0.0, + "step": 40156 + }, + { + "epoch": 3.747037417187646, + "grad_norm": NaN, + "learning_rate": 9.810355344663975e-05, + "loss": 0.0, + "step": 40157 + }, + { + "epoch": 3.7471307268825234, + "grad_norm": NaN, + "learning_rate": 9.80964568429424e-05, + "loss": 0.0, + "step": 40158 + }, + { + "epoch": 3.7472240365774003, + "grad_norm": NaN, + "learning_rate": 9.808936037122033e-05, + "loss": 0.0, + "step": 40159 + }, + { + "epoch": 3.7473173462722777, + "grad_norm": NaN, + "learning_rate": 9.808226403149163e-05, + "loss": 0.0, + "step": 40160 + }, + { + "epoch": 3.7474106559671547, + "grad_norm": NaN, + "learning_rate": 9.807516782377441e-05, + "loss": 0.0, + "step": 40161 + }, + { + "epoch": 3.747503965662032, + "grad_norm": NaN, + "learning_rate": 9.806807174808662e-05, + "loss": 0.0, + "step": 40162 + }, + { + "epoch": 3.7475972753569096, + "grad_norm": NaN, + "learning_rate": 9.806097580444633e-05, + "loss": 0.0, + "step": 40163 + }, + { + "epoch": 3.747690585051787, + "grad_norm": NaN, + "learning_rate": 9.805387999287164e-05, + "loss": 0.0, + "step": 40164 + }, + { + "epoch": 3.7477838947466644, + "grad_norm": NaN, + "learning_rate": 9.80467843133805e-05, + "loss": 0.0, + "step": 40165 + }, + { + "epoch": 3.7478772044415414, + "grad_norm": NaN, + "learning_rate": 9.803968876599099e-05, + "loss": 0.0, + "step": 40166 + }, + { + "epoch": 3.747970514136419, + "grad_norm": NaN, + "learning_rate": 9.803259335072122e-05, + "loss": 0.0, + "step": 40167 + }, + { + "epoch": 3.748063823831296, + "grad_norm": NaN, + "learning_rate": 9.802549806758912e-05, + "loss": 0.0, + "step": 40168 + }, + { + "epoch": 3.748157133526173, + "grad_norm": NaN, + "learning_rate": 9.801840291661278e-05, + "loss": 0.0, + "step": 40169 + }, + { + "epoch": 3.7482504432210506, + "grad_norm": NaN, + "learning_rate": 9.80113078978103e-05, + "loss": 0.0, + "step": 40170 + }, + { + "epoch": 3.748343752915928, + "grad_norm": NaN, + "learning_rate": 9.800421301119961e-05, + "loss": 0.0, + "step": 40171 + }, + { + "epoch": 3.7484370626108054, + "grad_norm": NaN, + "learning_rate": 9.79971182567988e-05, + "loss": 0.0, + "step": 40172 + }, + { + "epoch": 3.7485303723056824, + "grad_norm": NaN, + "learning_rate": 9.799002363462594e-05, + "loss": 0.0, + "step": 40173 + }, + { + "epoch": 3.74862368200056, + "grad_norm": NaN, + "learning_rate": 9.798292914469906e-05, + "loss": 0.0, + "step": 40174 + }, + { + "epoch": 3.7487169916954373, + "grad_norm": NaN, + "learning_rate": 9.797583478703612e-05, + "loss": 0.0, + "step": 40175 + }, + { + "epoch": 3.7488103013903142, + "grad_norm": NaN, + "learning_rate": 9.796874056165527e-05, + "loss": 0.0, + "step": 40176 + }, + { + "epoch": 3.7489036110851917, + "grad_norm": NaN, + "learning_rate": 9.796164646857448e-05, + "loss": 0.0, + "step": 40177 + }, + { + "epoch": 3.748996920780069, + "grad_norm": NaN, + "learning_rate": 9.795455250781187e-05, + "loss": 0.0, + "step": 40178 + }, + { + "epoch": 3.7490902304749465, + "grad_norm": NaN, + "learning_rate": 9.794745867938534e-05, + "loss": 0.0, + "step": 40179 + }, + { + "epoch": 3.749183540169824, + "grad_norm": NaN, + "learning_rate": 9.7940364983313e-05, + "loss": 0.0, + "step": 40180 + }, + { + "epoch": 3.749276849864701, + "grad_norm": NaN, + "learning_rate": 9.793327141961294e-05, + "loss": 0.0, + "step": 40181 + }, + { + "epoch": 3.7493701595595783, + "grad_norm": NaN, + "learning_rate": 9.792617798830313e-05, + "loss": 0.0, + "step": 40182 + }, + { + "epoch": 3.7494634692544553, + "grad_norm": NaN, + "learning_rate": 9.791908468940158e-05, + "loss": 0.0, + "step": 40183 + }, + { + "epoch": 3.7495567789493327, + "grad_norm": NaN, + "learning_rate": 9.791199152292643e-05, + "loss": 0.0, + "step": 40184 + }, + { + "epoch": 3.74965008864421, + "grad_norm": NaN, + "learning_rate": 9.790489848889565e-05, + "loss": 0.0, + "step": 40185 + }, + { + "epoch": 3.7497433983390875, + "grad_norm": NaN, + "learning_rate": 9.789780558732722e-05, + "loss": 0.0, + "step": 40186 + }, + { + "epoch": 3.749836708033965, + "grad_norm": NaN, + "learning_rate": 9.789071281823933e-05, + "loss": 0.0, + "step": 40187 + }, + { + "epoch": 3.749930017728842, + "grad_norm": NaN, + "learning_rate": 9.788362018164989e-05, + "loss": 0.0, + "step": 40188 + }, + { + "epoch": 3.7500233274237194, + "grad_norm": NaN, + "learning_rate": 9.787652767757691e-05, + "loss": 0.0, + "step": 40189 + }, + { + "epoch": 3.7501166371185968, + "grad_norm": NaN, + "learning_rate": 9.786943530603855e-05, + "loss": 0.0, + "step": 40190 + }, + { + "epoch": 3.7502099468134737, + "grad_norm": NaN, + "learning_rate": 9.786234306705275e-05, + "loss": 0.0, + "step": 40191 + }, + { + "epoch": 3.750303256508351, + "grad_norm": NaN, + "learning_rate": 9.785525096063753e-05, + "loss": 0.0, + "step": 40192 + }, + { + "epoch": 3.7503965662032286, + "grad_norm": NaN, + "learning_rate": 9.784815898681101e-05, + "loss": 0.0, + "step": 40193 + }, + { + "epoch": 3.750489875898106, + "grad_norm": NaN, + "learning_rate": 9.784106714559117e-05, + "loss": 0.0, + "step": 40194 + }, + { + "epoch": 3.750583185592983, + "grad_norm": NaN, + "learning_rate": 9.7833975436996e-05, + "loss": 0.0, + "step": 40195 + }, + { + "epoch": 3.7506764952878604, + "grad_norm": NaN, + "learning_rate": 9.782688386104363e-05, + "loss": 0.0, + "step": 40196 + }, + { + "epoch": 3.750769804982738, + "grad_norm": NaN, + "learning_rate": 9.781979241775201e-05, + "loss": 0.0, + "step": 40197 + }, + { + "epoch": 3.750863114677615, + "grad_norm": NaN, + "learning_rate": 9.781270110713919e-05, + "loss": 0.0, + "step": 40198 + }, + { + "epoch": 3.750956424372492, + "grad_norm": NaN, + "learning_rate": 9.780560992922327e-05, + "loss": 0.0, + "step": 40199 + }, + { + "epoch": 3.7510497340673696, + "grad_norm": NaN, + "learning_rate": 9.779851888402218e-05, + "loss": 0.0, + "step": 40200 + }, + { + "epoch": 3.751143043762247, + "grad_norm": NaN, + "learning_rate": 9.779142797155396e-05, + "loss": 0.0, + "step": 40201 + }, + { + "epoch": 3.7512363534571245, + "grad_norm": NaN, + "learning_rate": 9.778433719183675e-05, + "loss": 0.0, + "step": 40202 + }, + { + "epoch": 3.7513296631520014, + "grad_norm": NaN, + "learning_rate": 9.777724654488846e-05, + "loss": 0.0, + "step": 40203 + }, + { + "epoch": 3.751422972846879, + "grad_norm": NaN, + "learning_rate": 9.777015603072712e-05, + "loss": 0.0, + "step": 40204 + }, + { + "epoch": 3.751516282541756, + "grad_norm": NaN, + "learning_rate": 9.776306564937089e-05, + "loss": 0.0, + "step": 40205 + }, + { + "epoch": 3.7516095922366333, + "grad_norm": NaN, + "learning_rate": 9.775597540083767e-05, + "loss": 0.0, + "step": 40206 + }, + { + "epoch": 3.7517029019315107, + "grad_norm": NaN, + "learning_rate": 9.77488852851455e-05, + "loss": 0.0, + "step": 40207 + }, + { + "epoch": 3.751796211626388, + "grad_norm": NaN, + "learning_rate": 9.77417953023125e-05, + "loss": 0.0, + "step": 40208 + }, + { + "epoch": 3.7518895213212655, + "grad_norm": NaN, + "learning_rate": 9.773470545235657e-05, + "loss": 0.0, + "step": 40209 + }, + { + "epoch": 3.7519828310161425, + "grad_norm": NaN, + "learning_rate": 9.772761573529582e-05, + "loss": 0.0, + "step": 40210 + }, + { + "epoch": 3.75207614071102, + "grad_norm": NaN, + "learning_rate": 9.77205261511483e-05, + "loss": 0.0, + "step": 40211 + }, + { + "epoch": 3.7521694504058973, + "grad_norm": NaN, + "learning_rate": 9.771343669993196e-05, + "loss": 0.0, + "step": 40212 + }, + { + "epoch": 3.7522627601007743, + "grad_norm": NaN, + "learning_rate": 9.770634738166483e-05, + "loss": 0.0, + "step": 40213 + }, + { + "epoch": 3.7523560697956517, + "grad_norm": NaN, + "learning_rate": 9.769925819636503e-05, + "loss": 0.0, + "step": 40214 + }, + { + "epoch": 3.752449379490529, + "grad_norm": NaN, + "learning_rate": 9.769216914405049e-05, + "loss": 0.0, + "step": 40215 + }, + { + "epoch": 3.7525426891854066, + "grad_norm": NaN, + "learning_rate": 9.768508022473927e-05, + "loss": 0.0, + "step": 40216 + }, + { + "epoch": 3.7526359988802835, + "grad_norm": NaN, + "learning_rate": 9.767799143844944e-05, + "loss": 0.0, + "step": 40217 + }, + { + "epoch": 3.752729308575161, + "grad_norm": NaN, + "learning_rate": 9.767090278519891e-05, + "loss": 0.0, + "step": 40218 + }, + { + "epoch": 3.7528226182700384, + "grad_norm": NaN, + "learning_rate": 9.766381426500581e-05, + "loss": 0.0, + "step": 40219 + }, + { + "epoch": 3.7529159279649154, + "grad_norm": NaN, + "learning_rate": 9.765672587788813e-05, + "loss": 0.0, + "step": 40220 + }, + { + "epoch": 3.7530092376597928, + "grad_norm": NaN, + "learning_rate": 9.764963762386386e-05, + "loss": 0.0, + "step": 40221 + }, + { + "epoch": 3.75310254735467, + "grad_norm": NaN, + "learning_rate": 9.764254950295111e-05, + "loss": 0.0, + "step": 40222 + }, + { + "epoch": 3.7531958570495476, + "grad_norm": NaN, + "learning_rate": 9.763546151516781e-05, + "loss": 0.0, + "step": 40223 + }, + { + "epoch": 3.7532891667444246, + "grad_norm": NaN, + "learning_rate": 9.7628373660532e-05, + "loss": 0.0, + "step": 40224 + }, + { + "epoch": 3.753382476439302, + "grad_norm": NaN, + "learning_rate": 9.762128593906177e-05, + "loss": 0.0, + "step": 40225 + }, + { + "epoch": 3.7534757861341794, + "grad_norm": NaN, + "learning_rate": 9.761419835077506e-05, + "loss": 0.0, + "step": 40226 + }, + { + "epoch": 3.7535690958290564, + "grad_norm": NaN, + "learning_rate": 9.76071108956899e-05, + "loss": 0.0, + "step": 40227 + }, + { + "epoch": 3.753662405523934, + "grad_norm": NaN, + "learning_rate": 9.760002357382443e-05, + "loss": 0.0, + "step": 40228 + }, + { + "epoch": 3.7537557152188112, + "grad_norm": NaN, + "learning_rate": 9.75929363851965e-05, + "loss": 0.0, + "step": 40229 + }, + { + "epoch": 3.7538490249136887, + "grad_norm": NaN, + "learning_rate": 9.758584932982421e-05, + "loss": 0.0, + "step": 40230 + }, + { + "epoch": 3.753942334608566, + "grad_norm": NaN, + "learning_rate": 9.757876240772562e-05, + "loss": 0.0, + "step": 40231 + }, + { + "epoch": 3.754035644303443, + "grad_norm": NaN, + "learning_rate": 9.757167561891869e-05, + "loss": 0.0, + "step": 40232 + }, + { + "epoch": 3.7541289539983205, + "grad_norm": NaN, + "learning_rate": 9.756458896342143e-05, + "loss": 0.0, + "step": 40233 + }, + { + "epoch": 3.7542222636931974, + "grad_norm": NaN, + "learning_rate": 9.755750244125194e-05, + "loss": 0.0, + "step": 40234 + }, + { + "epoch": 3.754315573388075, + "grad_norm": NaN, + "learning_rate": 9.755041605242816e-05, + "loss": 0.0, + "step": 40235 + }, + { + "epoch": 3.7544088830829523, + "grad_norm": NaN, + "learning_rate": 9.75433297969681e-05, + "loss": 0.0, + "step": 40236 + }, + { + "epoch": 3.7545021927778297, + "grad_norm": NaN, + "learning_rate": 9.753624367488989e-05, + "loss": 0.0, + "step": 40237 + }, + { + "epoch": 3.754595502472707, + "grad_norm": NaN, + "learning_rate": 9.752915768621143e-05, + "loss": 0.0, + "step": 40238 + }, + { + "epoch": 3.754688812167584, + "grad_norm": NaN, + "learning_rate": 9.752207183095074e-05, + "loss": 0.0, + "step": 40239 + }, + { + "epoch": 3.7547821218624615, + "grad_norm": NaN, + "learning_rate": 9.751498610912596e-05, + "loss": 0.0, + "step": 40240 + }, + { + "epoch": 3.754875431557339, + "grad_norm": NaN, + "learning_rate": 9.750790052075497e-05, + "loss": 0.0, + "step": 40241 + }, + { + "epoch": 3.754968741252216, + "grad_norm": NaN, + "learning_rate": 9.750081506585582e-05, + "loss": 0.0, + "step": 40242 + }, + { + "epoch": 3.7550620509470933, + "grad_norm": NaN, + "learning_rate": 9.74937297444466e-05, + "loss": 0.0, + "step": 40243 + }, + { + "epoch": 3.7551553606419708, + "grad_norm": NaN, + "learning_rate": 9.748664455654526e-05, + "loss": 0.0, + "step": 40244 + }, + { + "epoch": 3.755248670336848, + "grad_norm": NaN, + "learning_rate": 9.747955950216977e-05, + "loss": 0.0, + "step": 40245 + }, + { + "epoch": 3.755341980031725, + "grad_norm": NaN, + "learning_rate": 9.747247458133828e-05, + "loss": 0.0, + "step": 40246 + }, + { + "epoch": 3.7554352897266026, + "grad_norm": NaN, + "learning_rate": 9.746538979406869e-05, + "loss": 0.0, + "step": 40247 + }, + { + "epoch": 3.75552859942148, + "grad_norm": NaN, + "learning_rate": 9.745830514037903e-05, + "loss": 0.0, + "step": 40248 + }, + { + "epoch": 3.755621909116357, + "grad_norm": NaN, + "learning_rate": 9.745122062028739e-05, + "loss": 0.0, + "step": 40249 + }, + { + "epoch": 3.7557152188112344, + "grad_norm": NaN, + "learning_rate": 9.744413623381169e-05, + "loss": 0.0, + "step": 40250 + }, + { + "epoch": 3.755808528506112, + "grad_norm": NaN, + "learning_rate": 9.743705198096997e-05, + "loss": 0.0, + "step": 40251 + }, + { + "epoch": 3.755901838200989, + "grad_norm": NaN, + "learning_rate": 9.74299678617803e-05, + "loss": 0.0, + "step": 40252 + }, + { + "epoch": 3.7559951478958666, + "grad_norm": NaN, + "learning_rate": 9.742288387626061e-05, + "loss": 0.0, + "step": 40253 + }, + { + "epoch": 3.7560884575907436, + "grad_norm": NaN, + "learning_rate": 9.741580002442896e-05, + "loss": 0.0, + "step": 40254 + }, + { + "epoch": 3.756181767285621, + "grad_norm": NaN, + "learning_rate": 9.740871630630342e-05, + "loss": 0.0, + "step": 40255 + }, + { + "epoch": 3.756275076980498, + "grad_norm": NaN, + "learning_rate": 9.740163272190183e-05, + "loss": 0.0, + "step": 40256 + }, + { + "epoch": 3.7563683866753754, + "grad_norm": NaN, + "learning_rate": 9.739454927124238e-05, + "loss": 0.0, + "step": 40257 + }, + { + "epoch": 3.756461696370253, + "grad_norm": NaN, + "learning_rate": 9.738746595434302e-05, + "loss": 0.0, + "step": 40258 + }, + { + "epoch": 3.7565550060651303, + "grad_norm": NaN, + "learning_rate": 9.738038277122168e-05, + "loss": 0.0, + "step": 40259 + }, + { + "epoch": 3.7566483157600077, + "grad_norm": NaN, + "learning_rate": 9.737329972189646e-05, + "loss": 0.0, + "step": 40260 + }, + { + "epoch": 3.7567416254548847, + "grad_norm": NaN, + "learning_rate": 9.736621680638542e-05, + "loss": 0.0, + "step": 40261 + }, + { + "epoch": 3.756834935149762, + "grad_norm": NaN, + "learning_rate": 9.735913402470639e-05, + "loss": 0.0, + "step": 40262 + }, + { + "epoch": 3.7569282448446395, + "grad_norm": NaN, + "learning_rate": 9.735205137687754e-05, + "loss": 0.0, + "step": 40263 + }, + { + "epoch": 3.7570215545395165, + "grad_norm": NaN, + "learning_rate": 9.734496886291686e-05, + "loss": 0.0, + "step": 40264 + }, + { + "epoch": 3.757114864234394, + "grad_norm": NaN, + "learning_rate": 9.733788648284227e-05, + "loss": 0.0, + "step": 40265 + }, + { + "epoch": 3.7572081739292713, + "grad_norm": NaN, + "learning_rate": 9.733080423667189e-05, + "loss": 0.0, + "step": 40266 + }, + { + "epoch": 3.7573014836241487, + "grad_norm": NaN, + "learning_rate": 9.732372212442366e-05, + "loss": 0.0, + "step": 40267 + }, + { + "epoch": 3.7573947933190257, + "grad_norm": NaN, + "learning_rate": 9.731664014611556e-05, + "loss": 0.0, + "step": 40268 + }, + { + "epoch": 3.757488103013903, + "grad_norm": NaN, + "learning_rate": 9.730955830176567e-05, + "loss": 0.0, + "step": 40269 + }, + { + "epoch": 3.7575814127087805, + "grad_norm": NaN, + "learning_rate": 9.730247659139196e-05, + "loss": 0.0, + "step": 40270 + }, + { + "epoch": 3.7576747224036575, + "grad_norm": NaN, + "learning_rate": 9.729539501501244e-05, + "loss": 0.0, + "step": 40271 + }, + { + "epoch": 3.757768032098535, + "grad_norm": NaN, + "learning_rate": 9.728831357264514e-05, + "loss": 0.0, + "step": 40272 + }, + { + "epoch": 3.7578613417934124, + "grad_norm": NaN, + "learning_rate": 9.728123226430803e-05, + "loss": 0.0, + "step": 40273 + }, + { + "epoch": 3.7579546514882898, + "grad_norm": NaN, + "learning_rate": 9.72741510900191e-05, + "loss": 0.0, + "step": 40274 + }, + { + "epoch": 3.758047961183167, + "grad_norm": NaN, + "learning_rate": 9.726707004979641e-05, + "loss": 0.0, + "step": 40275 + }, + { + "epoch": 3.758141270878044, + "grad_norm": NaN, + "learning_rate": 9.725998914365794e-05, + "loss": 0.0, + "step": 40276 + }, + { + "epoch": 3.7582345805729216, + "grad_norm": NaN, + "learning_rate": 9.725290837162168e-05, + "loss": 0.0, + "step": 40277 + }, + { + "epoch": 3.7583278902677986, + "grad_norm": NaN, + "learning_rate": 9.724582773370568e-05, + "loss": 0.0, + "step": 40278 + }, + { + "epoch": 3.758421199962676, + "grad_norm": NaN, + "learning_rate": 9.723874722992791e-05, + "loss": 0.0, + "step": 40279 + }, + { + "epoch": 3.7585145096575534, + "grad_norm": NaN, + "learning_rate": 9.723166686030632e-05, + "loss": 0.0, + "step": 40280 + }, + { + "epoch": 3.758607819352431, + "grad_norm": NaN, + "learning_rate": 9.722458662485906e-05, + "loss": 0.0, + "step": 40281 + }, + { + "epoch": 3.7587011290473082, + "grad_norm": NaN, + "learning_rate": 9.721750652360399e-05, + "loss": 0.0, + "step": 40282 + }, + { + "epoch": 3.758794438742185, + "grad_norm": NaN, + "learning_rate": 9.721042655655916e-05, + "loss": 0.0, + "step": 40283 + }, + { + "epoch": 3.7588877484370626, + "grad_norm": NaN, + "learning_rate": 9.720334672374261e-05, + "loss": 0.0, + "step": 40284 + }, + { + "epoch": 3.75898105813194, + "grad_norm": NaN, + "learning_rate": 9.719626702517229e-05, + "loss": 0.0, + "step": 40285 + }, + { + "epoch": 3.759074367826817, + "grad_norm": NaN, + "learning_rate": 9.718918746086619e-05, + "loss": 0.0, + "step": 40286 + }, + { + "epoch": 3.7591676775216945, + "grad_norm": NaN, + "learning_rate": 9.71821080308424e-05, + "loss": 0.0, + "step": 40287 + }, + { + "epoch": 3.759260987216572, + "grad_norm": NaN, + "learning_rate": 9.717502873511883e-05, + "loss": 0.0, + "step": 40288 + }, + { + "epoch": 3.7593542969114493, + "grad_norm": NaN, + "learning_rate": 9.716794957371348e-05, + "loss": 0.0, + "step": 40289 + }, + { + "epoch": 3.7594476066063263, + "grad_norm": NaN, + "learning_rate": 9.716087054664448e-05, + "loss": 0.0, + "step": 40290 + }, + { + "epoch": 3.7595409163012037, + "grad_norm": NaN, + "learning_rate": 9.715379165392965e-05, + "loss": 0.0, + "step": 40291 + }, + { + "epoch": 3.759634225996081, + "grad_norm": NaN, + "learning_rate": 9.714671289558708e-05, + "loss": 0.0, + "step": 40292 + }, + { + "epoch": 3.759727535690958, + "grad_norm": NaN, + "learning_rate": 9.713963427163483e-05, + "loss": 0.0, + "step": 40293 + }, + { + "epoch": 3.7598208453858355, + "grad_norm": NaN, + "learning_rate": 9.713255578209075e-05, + "loss": 0.0, + "step": 40294 + }, + { + "epoch": 3.759914155080713, + "grad_norm": NaN, + "learning_rate": 9.712547742697297e-05, + "loss": 0.0, + "step": 40295 + }, + { + "epoch": 3.7600074647755903, + "grad_norm": NaN, + "learning_rate": 9.711839920629946e-05, + "loss": 0.0, + "step": 40296 + }, + { + "epoch": 3.7601007744704678, + "grad_norm": NaN, + "learning_rate": 9.711132112008812e-05, + "loss": 0.0, + "step": 40297 + }, + { + "epoch": 3.7601940841653447, + "grad_norm": NaN, + "learning_rate": 9.710424316835707e-05, + "loss": 0.0, + "step": 40298 + }, + { + "epoch": 3.760287393860222, + "grad_norm": NaN, + "learning_rate": 9.70971653511243e-05, + "loss": 0.0, + "step": 40299 + }, + { + "epoch": 3.760380703555099, + "grad_norm": NaN, + "learning_rate": 9.709008766840768e-05, + "loss": 0.0, + "step": 40300 + }, + { + "epoch": 3.7604740132499765, + "grad_norm": NaN, + "learning_rate": 9.708301012022534e-05, + "loss": 0.0, + "step": 40301 + }, + { + "epoch": 3.760567322944854, + "grad_norm": NaN, + "learning_rate": 9.707593270659525e-05, + "loss": 0.0, + "step": 40302 + }, + { + "epoch": 3.7606606326397314, + "grad_norm": NaN, + "learning_rate": 9.706885542753535e-05, + "loss": 0.0, + "step": 40303 + }, + { + "epoch": 3.760753942334609, + "grad_norm": NaN, + "learning_rate": 9.706177828306369e-05, + "loss": 0.0, + "step": 40304 + }, + { + "epoch": 3.7608472520294858, + "grad_norm": NaN, + "learning_rate": 9.705470127319827e-05, + "loss": 0.0, + "step": 40305 + }, + { + "epoch": 3.760940561724363, + "grad_norm": NaN, + "learning_rate": 9.704762439795698e-05, + "loss": 0.0, + "step": 40306 + }, + { + "epoch": 3.7610338714192406, + "grad_norm": NaN, + "learning_rate": 9.704054765735796e-05, + "loss": 0.0, + "step": 40307 + }, + { + "epoch": 3.7611271811141176, + "grad_norm": NaN, + "learning_rate": 9.703347105141915e-05, + "loss": 0.0, + "step": 40308 + }, + { + "epoch": 3.761220490808995, + "grad_norm": NaN, + "learning_rate": 9.702639458015848e-05, + "loss": 0.0, + "step": 40309 + }, + { + "epoch": 3.7613138005038724, + "grad_norm": NaN, + "learning_rate": 9.701931824359404e-05, + "loss": 0.0, + "step": 40310 + }, + { + "epoch": 3.76140711019875, + "grad_norm": NaN, + "learning_rate": 9.70122420417438e-05, + "loss": 0.0, + "step": 40311 + }, + { + "epoch": 3.761500419893627, + "grad_norm": NaN, + "learning_rate": 9.700516597462566e-05, + "loss": 0.0, + "step": 40312 + }, + { + "epoch": 3.7615937295885042, + "grad_norm": NaN, + "learning_rate": 9.699809004225777e-05, + "loss": 0.0, + "step": 40313 + }, + { + "epoch": 3.7616870392833817, + "grad_norm": NaN, + "learning_rate": 9.699101424465798e-05, + "loss": 0.0, + "step": 40314 + }, + { + "epoch": 3.7617803489782586, + "grad_norm": NaN, + "learning_rate": 9.698393858184431e-05, + "loss": 0.0, + "step": 40315 + }, + { + "epoch": 3.761873658673136, + "grad_norm": NaN, + "learning_rate": 9.697686305383486e-05, + "loss": 0.0, + "step": 40316 + }, + { + "epoch": 3.7619669683680135, + "grad_norm": NaN, + "learning_rate": 9.69697876606475e-05, + "loss": 0.0, + "step": 40317 + }, + { + "epoch": 3.762060278062891, + "grad_norm": NaN, + "learning_rate": 9.696271240230023e-05, + "loss": 0.0, + "step": 40318 + }, + { + "epoch": 3.762153587757768, + "grad_norm": NaN, + "learning_rate": 9.695563727881113e-05, + "loss": 0.0, + "step": 40319 + }, + { + "epoch": 3.7622468974526453, + "grad_norm": NaN, + "learning_rate": 9.69485622901981e-05, + "loss": 0.0, + "step": 40320 + }, + { + "epoch": 3.7623402071475227, + "grad_norm": NaN, + "learning_rate": 9.694148743647914e-05, + "loss": 0.0, + "step": 40321 + }, + { + "epoch": 3.7624335168423997, + "grad_norm": NaN, + "learning_rate": 9.693441271767231e-05, + "loss": 0.0, + "step": 40322 + }, + { + "epoch": 3.762526826537277, + "grad_norm": NaN, + "learning_rate": 9.692733813379553e-05, + "loss": 0.0, + "step": 40323 + }, + { + "epoch": 3.7626201362321545, + "grad_norm": NaN, + "learning_rate": 9.692026368486679e-05, + "loss": 0.0, + "step": 40324 + }, + { + "epoch": 3.762713445927032, + "grad_norm": NaN, + "learning_rate": 9.691318937090412e-05, + "loss": 0.0, + "step": 40325 + }, + { + "epoch": 3.7628067556219094, + "grad_norm": NaN, + "learning_rate": 9.690611519192545e-05, + "loss": 0.0, + "step": 40326 + }, + { + "epoch": 3.7629000653167863, + "grad_norm": NaN, + "learning_rate": 9.689904114794881e-05, + "loss": 0.0, + "step": 40327 + }, + { + "epoch": 3.7629933750116638, + "grad_norm": NaN, + "learning_rate": 9.689196723899223e-05, + "loss": 0.0, + "step": 40328 + }, + { + "epoch": 3.763086684706541, + "grad_norm": NaN, + "learning_rate": 9.688489346507358e-05, + "loss": 0.0, + "step": 40329 + }, + { + "epoch": 3.763179994401418, + "grad_norm": NaN, + "learning_rate": 9.687781982621095e-05, + "loss": 0.0, + "step": 40330 + }, + { + "epoch": 3.7632733040962956, + "grad_norm": NaN, + "learning_rate": 9.68707463224223e-05, + "loss": 0.0, + "step": 40331 + }, + { + "epoch": 3.763366613791173, + "grad_norm": NaN, + "learning_rate": 9.686367295372555e-05, + "loss": 0.0, + "step": 40332 + }, + { + "epoch": 3.7634599234860504, + "grad_norm": NaN, + "learning_rate": 9.685659972013877e-05, + "loss": 0.0, + "step": 40333 + }, + { + "epoch": 3.7635532331809274, + "grad_norm": NaN, + "learning_rate": 9.684952662167996e-05, + "loss": 0.0, + "step": 40334 + }, + { + "epoch": 3.763646542875805, + "grad_norm": NaN, + "learning_rate": 9.684245365836697e-05, + "loss": 0.0, + "step": 40335 + }, + { + "epoch": 3.763739852570682, + "grad_norm": NaN, + "learning_rate": 9.683538083021792e-05, + "loss": 0.0, + "step": 40336 + }, + { + "epoch": 3.763833162265559, + "grad_norm": NaN, + "learning_rate": 9.682830813725078e-05, + "loss": 0.0, + "step": 40337 + }, + { + "epoch": 3.7639264719604366, + "grad_norm": NaN, + "learning_rate": 9.682123557948345e-05, + "loss": 0.0, + "step": 40338 + }, + { + "epoch": 3.764019781655314, + "grad_norm": NaN, + "learning_rate": 9.681416315693398e-05, + "loss": 0.0, + "step": 40339 + }, + { + "epoch": 3.7641130913501915, + "grad_norm": NaN, + "learning_rate": 9.680709086962038e-05, + "loss": 0.0, + "step": 40340 + }, + { + "epoch": 3.7642064010450684, + "grad_norm": NaN, + "learning_rate": 9.680001871756052e-05, + "loss": 0.0, + "step": 40341 + }, + { + "epoch": 3.764299710739946, + "grad_norm": NaN, + "learning_rate": 9.679294670077248e-05, + "loss": 0.0, + "step": 40342 + }, + { + "epoch": 3.7643930204348233, + "grad_norm": NaN, + "learning_rate": 9.678587481927426e-05, + "loss": 0.0, + "step": 40343 + }, + { + "epoch": 3.7644863301297002, + "grad_norm": NaN, + "learning_rate": 9.677880307308374e-05, + "loss": 0.0, + "step": 40344 + }, + { + "epoch": 3.7645796398245777, + "grad_norm": NaN, + "learning_rate": 9.677173146221897e-05, + "loss": 0.0, + "step": 40345 + }, + { + "epoch": 3.764672949519455, + "grad_norm": NaN, + "learning_rate": 9.676465998669797e-05, + "loss": 0.0, + "step": 40346 + }, + { + "epoch": 3.7647662592143325, + "grad_norm": NaN, + "learning_rate": 9.675758864653859e-05, + "loss": 0.0, + "step": 40347 + }, + { + "epoch": 3.76485956890921, + "grad_norm": NaN, + "learning_rate": 9.675051744175894e-05, + "loss": 0.0, + "step": 40348 + }, + { + "epoch": 3.764952878604087, + "grad_norm": NaN, + "learning_rate": 9.674344637237696e-05, + "loss": 0.0, + "step": 40349 + }, + { + "epoch": 3.7650461882989643, + "grad_norm": NaN, + "learning_rate": 9.673637543841056e-05, + "loss": 0.0, + "step": 40350 + }, + { + "epoch": 3.7651394979938413, + "grad_norm": NaN, + "learning_rate": 9.672930463987783e-05, + "loss": 0.0, + "step": 40351 + }, + { + "epoch": 3.7652328076887187, + "grad_norm": NaN, + "learning_rate": 9.672223397679672e-05, + "loss": 0.0, + "step": 40352 + }, + { + "epoch": 3.765326117383596, + "grad_norm": NaN, + "learning_rate": 9.671516344918512e-05, + "loss": 0.0, + "step": 40353 + }, + { + "epoch": 3.7654194270784735, + "grad_norm": NaN, + "learning_rate": 9.67080930570611e-05, + "loss": 0.0, + "step": 40354 + }, + { + "epoch": 3.765512736773351, + "grad_norm": NaN, + "learning_rate": 9.670102280044267e-05, + "loss": 0.0, + "step": 40355 + }, + { + "epoch": 3.765606046468228, + "grad_norm": NaN, + "learning_rate": 9.669395267934766e-05, + "loss": 0.0, + "step": 40356 + }, + { + "epoch": 3.7656993561631054, + "grad_norm": NaN, + "learning_rate": 9.668688269379423e-05, + "loss": 0.0, + "step": 40357 + }, + { + "epoch": 3.765792665857983, + "grad_norm": NaN, + "learning_rate": 9.66798128438002e-05, + "loss": 0.0, + "step": 40358 + }, + { + "epoch": 3.7658859755528598, + "grad_norm": NaN, + "learning_rate": 9.667274312938362e-05, + "loss": 0.0, + "step": 40359 + }, + { + "epoch": 3.765979285247737, + "grad_norm": NaN, + "learning_rate": 9.666567355056248e-05, + "loss": 0.0, + "step": 40360 + }, + { + "epoch": 3.7660725949426146, + "grad_norm": NaN, + "learning_rate": 9.665860410735473e-05, + "loss": 0.0, + "step": 40361 + }, + { + "epoch": 3.766165904637492, + "grad_norm": NaN, + "learning_rate": 9.665153479977831e-05, + "loss": 0.0, + "step": 40362 + }, + { + "epoch": 3.766259214332369, + "grad_norm": NaN, + "learning_rate": 9.664446562785128e-05, + "loss": 0.0, + "step": 40363 + }, + { + "epoch": 3.7663525240272464, + "grad_norm": NaN, + "learning_rate": 9.663739659159152e-05, + "loss": 0.0, + "step": 40364 + }, + { + "epoch": 3.766445833722124, + "grad_norm": NaN, + "learning_rate": 9.663032769101709e-05, + "loss": 0.0, + "step": 40365 + }, + { + "epoch": 3.766539143417001, + "grad_norm": NaN, + "learning_rate": 9.662325892614596e-05, + "loss": 0.0, + "step": 40366 + }, + { + "epoch": 3.766632453111878, + "grad_norm": NaN, + "learning_rate": 9.661619029699597e-05, + "loss": 0.0, + "step": 40367 + }, + { + "epoch": 3.7667257628067556, + "grad_norm": NaN, + "learning_rate": 9.660912180358527e-05, + "loss": 0.0, + "step": 40368 + }, + { + "epoch": 3.766819072501633, + "grad_norm": NaN, + "learning_rate": 9.660205344593177e-05, + "loss": 0.0, + "step": 40369 + }, + { + "epoch": 3.7669123821965105, + "grad_norm": NaN, + "learning_rate": 9.659498522405337e-05, + "loss": 0.0, + "step": 40370 + }, + { + "epoch": 3.7670056918913875, + "grad_norm": NaN, + "learning_rate": 9.658791713796817e-05, + "loss": 0.0, + "step": 40371 + }, + { + "epoch": 3.767099001586265, + "grad_norm": NaN, + "learning_rate": 9.658084918769405e-05, + "loss": 0.0, + "step": 40372 + }, + { + "epoch": 3.767192311281142, + "grad_norm": NaN, + "learning_rate": 9.657378137324897e-05, + "loss": 0.0, + "step": 40373 + }, + { + "epoch": 3.7672856209760193, + "grad_norm": NaN, + "learning_rate": 9.656671369465097e-05, + "loss": 0.0, + "step": 40374 + }, + { + "epoch": 3.7673789306708967, + "grad_norm": NaN, + "learning_rate": 9.655964615191802e-05, + "loss": 0.0, + "step": 40375 + }, + { + "epoch": 3.767472240365774, + "grad_norm": NaN, + "learning_rate": 9.6552578745068e-05, + "loss": 0.0, + "step": 40376 + }, + { + "epoch": 3.7675655500606515, + "grad_norm": NaN, + "learning_rate": 9.654551147411896e-05, + "loss": 0.0, + "step": 40377 + }, + { + "epoch": 3.7676588597555285, + "grad_norm": NaN, + "learning_rate": 9.65384443390889e-05, + "loss": 0.0, + "step": 40378 + }, + { + "epoch": 3.767752169450406, + "grad_norm": NaN, + "learning_rate": 9.653137733999564e-05, + "loss": 0.0, + "step": 40379 + }, + { + "epoch": 3.7678454791452833, + "grad_norm": NaN, + "learning_rate": 9.652431047685732e-05, + "loss": 0.0, + "step": 40380 + }, + { + "epoch": 3.7679387888401603, + "grad_norm": NaN, + "learning_rate": 9.651724374969184e-05, + "loss": 0.0, + "step": 40381 + }, + { + "epoch": 3.7680320985350377, + "grad_norm": NaN, + "learning_rate": 9.651017715851713e-05, + "loss": 0.0, + "step": 40382 + }, + { + "epoch": 3.768125408229915, + "grad_norm": NaN, + "learning_rate": 9.650311070335121e-05, + "loss": 0.0, + "step": 40383 + }, + { + "epoch": 3.7682187179247926, + "grad_norm": NaN, + "learning_rate": 9.649604438421206e-05, + "loss": 0.0, + "step": 40384 + }, + { + "epoch": 3.7683120276196695, + "grad_norm": NaN, + "learning_rate": 9.648897820111756e-05, + "loss": 0.0, + "step": 40385 + }, + { + "epoch": 3.768405337314547, + "grad_norm": NaN, + "learning_rate": 9.64819121540858e-05, + "loss": 0.0, + "step": 40386 + }, + { + "epoch": 3.7684986470094244, + "grad_norm": NaN, + "learning_rate": 9.647484624313469e-05, + "loss": 0.0, + "step": 40387 + }, + { + "epoch": 3.7685919567043014, + "grad_norm": NaN, + "learning_rate": 9.646778046828211e-05, + "loss": 0.0, + "step": 40388 + }, + { + "epoch": 3.768685266399179, + "grad_norm": NaN, + "learning_rate": 9.646071482954615e-05, + "loss": 0.0, + "step": 40389 + }, + { + "epoch": 3.768778576094056, + "grad_norm": NaN, + "learning_rate": 9.645364932694477e-05, + "loss": 0.0, + "step": 40390 + }, + { + "epoch": 3.7688718857889336, + "grad_norm": NaN, + "learning_rate": 9.644658396049582e-05, + "loss": 0.0, + "step": 40391 + }, + { + "epoch": 3.768965195483811, + "grad_norm": NaN, + "learning_rate": 9.64395187302174e-05, + "loss": 0.0, + "step": 40392 + }, + { + "epoch": 3.769058505178688, + "grad_norm": NaN, + "learning_rate": 9.643245363612744e-05, + "loss": 0.0, + "step": 40393 + }, + { + "epoch": 3.7691518148735654, + "grad_norm": NaN, + "learning_rate": 9.642538867824382e-05, + "loss": 0.0, + "step": 40394 + }, + { + "epoch": 3.7692451245684424, + "grad_norm": NaN, + "learning_rate": 9.64183238565846e-05, + "loss": 0.0, + "step": 40395 + }, + { + "epoch": 3.76933843426332, + "grad_norm": NaN, + "learning_rate": 9.641125917116773e-05, + "loss": 0.0, + "step": 40396 + }, + { + "epoch": 3.7694317439581972, + "grad_norm": NaN, + "learning_rate": 9.64041946220111e-05, + "loss": 0.0, + "step": 40397 + }, + { + "epoch": 3.7695250536530747, + "grad_norm": NaN, + "learning_rate": 9.639713020913272e-05, + "loss": 0.0, + "step": 40398 + }, + { + "epoch": 3.769618363347952, + "grad_norm": NaN, + "learning_rate": 9.639006593255067e-05, + "loss": 0.0, + "step": 40399 + }, + { + "epoch": 3.769711673042829, + "grad_norm": NaN, + "learning_rate": 9.638300179228271e-05, + "loss": 0.0, + "step": 40400 + }, + { + "epoch": 3.7698049827377065, + "grad_norm": NaN, + "learning_rate": 9.637593778834693e-05, + "loss": 0.0, + "step": 40401 + }, + { + "epoch": 3.769898292432584, + "grad_norm": NaN, + "learning_rate": 9.636887392076122e-05, + "loss": 0.0, + "step": 40402 + }, + { + "epoch": 3.769991602127461, + "grad_norm": NaN, + "learning_rate": 9.636181018954358e-05, + "loss": 0.0, + "step": 40403 + }, + { + "epoch": 3.7700849118223383, + "grad_norm": NaN, + "learning_rate": 9.635474659471202e-05, + "loss": 0.0, + "step": 40404 + }, + { + "epoch": 3.7701782215172157, + "grad_norm": NaN, + "learning_rate": 9.634768313628437e-05, + "loss": 0.0, + "step": 40405 + }, + { + "epoch": 3.770271531212093, + "grad_norm": NaN, + "learning_rate": 9.634061981427871e-05, + "loss": 0.0, + "step": 40406 + }, + { + "epoch": 3.77036484090697, + "grad_norm": NaN, + "learning_rate": 9.6333556628713e-05, + "loss": 0.0, + "step": 40407 + }, + { + "epoch": 3.7704581506018475, + "grad_norm": NaN, + "learning_rate": 9.632649357960508e-05, + "loss": 0.0, + "step": 40408 + }, + { + "epoch": 3.770551460296725, + "grad_norm": NaN, + "learning_rate": 9.631943066697301e-05, + "loss": 0.0, + "step": 40409 + }, + { + "epoch": 3.770644769991602, + "grad_norm": NaN, + "learning_rate": 9.631236789083479e-05, + "loss": 0.0, + "step": 40410 + }, + { + "epoch": 3.7707380796864793, + "grad_norm": NaN, + "learning_rate": 9.630530525120824e-05, + "loss": 0.0, + "step": 40411 + }, + { + "epoch": 3.7708313893813568, + "grad_norm": NaN, + "learning_rate": 9.629824274811141e-05, + "loss": 0.0, + "step": 40412 + }, + { + "epoch": 3.770924699076234, + "grad_norm": NaN, + "learning_rate": 9.629118038156228e-05, + "loss": 0.0, + "step": 40413 + }, + { + "epoch": 3.7710180087711116, + "grad_norm": NaN, + "learning_rate": 9.628411815157873e-05, + "loss": 0.0, + "step": 40414 + }, + { + "epoch": 3.7711113184659886, + "grad_norm": NaN, + "learning_rate": 9.627705605817876e-05, + "loss": 0.0, + "step": 40415 + }, + { + "epoch": 3.771204628160866, + "grad_norm": NaN, + "learning_rate": 9.626999410138035e-05, + "loss": 0.0, + "step": 40416 + }, + { + "epoch": 3.771297937855743, + "grad_norm": NaN, + "learning_rate": 9.626293228120137e-05, + "loss": 0.0, + "step": 40417 + }, + { + "epoch": 3.7713912475506204, + "grad_norm": NaN, + "learning_rate": 9.625587059765987e-05, + "loss": 0.0, + "step": 40418 + }, + { + "epoch": 3.771484557245498, + "grad_norm": NaN, + "learning_rate": 9.624880905077381e-05, + "loss": 0.0, + "step": 40419 + }, + { + "epoch": 3.7715778669403752, + "grad_norm": NaN, + "learning_rate": 9.624174764056104e-05, + "loss": 0.0, + "step": 40420 + }, + { + "epoch": 3.7716711766352526, + "grad_norm": NaN, + "learning_rate": 9.623468636703962e-05, + "loss": 0.0, + "step": 40421 + }, + { + "epoch": 3.7717644863301296, + "grad_norm": NaN, + "learning_rate": 9.622762523022747e-05, + "loss": 0.0, + "step": 40422 + }, + { + "epoch": 3.771857796025007, + "grad_norm": NaN, + "learning_rate": 9.622056423014249e-05, + "loss": 0.0, + "step": 40423 + }, + { + "epoch": 3.7719511057198845, + "grad_norm": NaN, + "learning_rate": 9.621350336680273e-05, + "loss": 0.0, + "step": 40424 + }, + { + "epoch": 3.7720444154147614, + "grad_norm": NaN, + "learning_rate": 9.620644264022612e-05, + "loss": 0.0, + "step": 40425 + }, + { + "epoch": 3.772137725109639, + "grad_norm": NaN, + "learning_rate": 9.619938205043054e-05, + "loss": 0.0, + "step": 40426 + }, + { + "epoch": 3.7722310348045163, + "grad_norm": NaN, + "learning_rate": 9.6192321597434e-05, + "loss": 0.0, + "step": 40427 + }, + { + "epoch": 3.7723243444993937, + "grad_norm": NaN, + "learning_rate": 9.618526128125449e-05, + "loss": 0.0, + "step": 40428 + }, + { + "epoch": 3.7724176541942707, + "grad_norm": NaN, + "learning_rate": 9.617820110190986e-05, + "loss": 0.0, + "step": 40429 + }, + { + "epoch": 3.772510963889148, + "grad_norm": NaN, + "learning_rate": 9.617114105941816e-05, + "loss": 0.0, + "step": 40430 + }, + { + "epoch": 3.7726042735840255, + "grad_norm": NaN, + "learning_rate": 9.616408115379733e-05, + "loss": 0.0, + "step": 40431 + }, + { + "epoch": 3.7726975832789025, + "grad_norm": NaN, + "learning_rate": 9.615702138506522e-05, + "loss": 0.0, + "step": 40432 + }, + { + "epoch": 3.77279089297378, + "grad_norm": NaN, + "learning_rate": 9.61499617532399e-05, + "loss": 0.0, + "step": 40433 + }, + { + "epoch": 3.7728842026686573, + "grad_norm": NaN, + "learning_rate": 9.614290225833932e-05, + "loss": 0.0, + "step": 40434 + }, + { + "epoch": 3.7729775123635347, + "grad_norm": NaN, + "learning_rate": 9.613584290038129e-05, + "loss": 0.0, + "step": 40435 + }, + { + "epoch": 3.7730708220584117, + "grad_norm": NaN, + "learning_rate": 9.612878367938386e-05, + "loss": 0.0, + "step": 40436 + }, + { + "epoch": 3.773164131753289, + "grad_norm": NaN, + "learning_rate": 9.61217245953651e-05, + "loss": 0.0, + "step": 40437 + }, + { + "epoch": 3.7732574414481665, + "grad_norm": NaN, + "learning_rate": 9.611466564834272e-05, + "loss": 0.0, + "step": 40438 + }, + { + "epoch": 3.7733507511430435, + "grad_norm": NaN, + "learning_rate": 9.61076068383348e-05, + "loss": 0.0, + "step": 40439 + }, + { + "epoch": 3.773444060837921, + "grad_norm": NaN, + "learning_rate": 9.61005481653593e-05, + "loss": 0.0, + "step": 40440 + }, + { + "epoch": 3.7735373705327984, + "grad_norm": NaN, + "learning_rate": 9.609348962943413e-05, + "loss": 0.0, + "step": 40441 + }, + { + "epoch": 3.773630680227676, + "grad_norm": NaN, + "learning_rate": 9.608643123057722e-05, + "loss": 0.0, + "step": 40442 + }, + { + "epoch": 3.773723989922553, + "grad_norm": NaN, + "learning_rate": 9.60793729688066e-05, + "loss": 0.0, + "step": 40443 + }, + { + "epoch": 3.77381729961743, + "grad_norm": NaN, + "learning_rate": 9.607231484414012e-05, + "loss": 0.0, + "step": 40444 + }, + { + "epoch": 3.7739106093123076, + "grad_norm": NaN, + "learning_rate": 9.606525685659576e-05, + "loss": 0.0, + "step": 40445 + }, + { + "epoch": 3.7740039190071846, + "grad_norm": NaN, + "learning_rate": 9.605819900619151e-05, + "loss": 0.0, + "step": 40446 + }, + { + "epoch": 3.774097228702062, + "grad_norm": NaN, + "learning_rate": 9.605114129294529e-05, + "loss": 0.0, + "step": 40447 + }, + { + "epoch": 3.7741905383969394, + "grad_norm": NaN, + "learning_rate": 9.604408371687503e-05, + "loss": 0.0, + "step": 40448 + }, + { + "epoch": 3.774283848091817, + "grad_norm": NaN, + "learning_rate": 9.603702627799865e-05, + "loss": 0.0, + "step": 40449 + }, + { + "epoch": 3.7743771577866942, + "grad_norm": NaN, + "learning_rate": 9.602996897633413e-05, + "loss": 0.0, + "step": 40450 + }, + { + "epoch": 3.7744704674815712, + "grad_norm": NaN, + "learning_rate": 9.602291181189947e-05, + "loss": 0.0, + "step": 40451 + }, + { + "epoch": 3.7745637771764486, + "grad_norm": NaN, + "learning_rate": 9.601585478471249e-05, + "loss": 0.0, + "step": 40452 + }, + { + "epoch": 3.774657086871326, + "grad_norm": NaN, + "learning_rate": 9.60087978947912e-05, + "loss": 0.0, + "step": 40453 + }, + { + "epoch": 3.774750396566203, + "grad_norm": NaN, + "learning_rate": 9.60017411421536e-05, + "loss": 0.0, + "step": 40454 + }, + { + "epoch": 3.7748437062610805, + "grad_norm": NaN, + "learning_rate": 9.599468452681752e-05, + "loss": 0.0, + "step": 40455 + }, + { + "epoch": 3.774937015955958, + "grad_norm": NaN, + "learning_rate": 9.598762804880097e-05, + "loss": 0.0, + "step": 40456 + }, + { + "epoch": 3.7750303256508353, + "grad_norm": NaN, + "learning_rate": 9.598057170812192e-05, + "loss": 0.0, + "step": 40457 + }, + { + "epoch": 3.7751236353457123, + "grad_norm": NaN, + "learning_rate": 9.597351550479822e-05, + "loss": 0.0, + "step": 40458 + }, + { + "epoch": 3.7752169450405897, + "grad_norm": NaN, + "learning_rate": 9.596645943884791e-05, + "loss": 0.0, + "step": 40459 + }, + { + "epoch": 3.775310254735467, + "grad_norm": NaN, + "learning_rate": 9.59594035102889e-05, + "loss": 0.0, + "step": 40460 + }, + { + "epoch": 3.775403564430344, + "grad_norm": NaN, + "learning_rate": 9.595234771913905e-05, + "loss": 0.0, + "step": 40461 + }, + { + "epoch": 3.7754968741252215, + "grad_norm": NaN, + "learning_rate": 9.594529206541642e-05, + "loss": 0.0, + "step": 40462 + }, + { + "epoch": 3.775590183820099, + "grad_norm": NaN, + "learning_rate": 9.593823654913891e-05, + "loss": 0.0, + "step": 40463 + }, + { + "epoch": 3.7756834935149763, + "grad_norm": NaN, + "learning_rate": 9.59311811703244e-05, + "loss": 0.0, + "step": 40464 + }, + { + "epoch": 3.7757768032098538, + "grad_norm": NaN, + "learning_rate": 9.592412592899091e-05, + "loss": 0.0, + "step": 40465 + }, + { + "epoch": 3.7758701129047307, + "grad_norm": NaN, + "learning_rate": 9.591707082515639e-05, + "loss": 0.0, + "step": 40466 + }, + { + "epoch": 3.775963422599608, + "grad_norm": NaN, + "learning_rate": 9.591001585883866e-05, + "loss": 0.0, + "step": 40467 + }, + { + "epoch": 3.776056732294485, + "grad_norm": NaN, + "learning_rate": 9.590296103005579e-05, + "loss": 0.0, + "step": 40468 + }, + { + "epoch": 3.7761500419893625, + "grad_norm": NaN, + "learning_rate": 9.589590633882568e-05, + "loss": 0.0, + "step": 40469 + }, + { + "epoch": 3.77624335168424, + "grad_norm": NaN, + "learning_rate": 9.588885178516618e-05, + "loss": 0.0, + "step": 40470 + }, + { + "epoch": 3.7763366613791174, + "grad_norm": NaN, + "learning_rate": 9.588179736909535e-05, + "loss": 0.0, + "step": 40471 + }, + { + "epoch": 3.776429971073995, + "grad_norm": NaN, + "learning_rate": 9.587474309063111e-05, + "loss": 0.0, + "step": 40472 + }, + { + "epoch": 3.776523280768872, + "grad_norm": NaN, + "learning_rate": 9.586768894979128e-05, + "loss": 0.0, + "step": 40473 + }, + { + "epoch": 3.776616590463749, + "grad_norm": NaN, + "learning_rate": 9.586063494659391e-05, + "loss": 0.0, + "step": 40474 + }, + { + "epoch": 3.7767099001586266, + "grad_norm": NaN, + "learning_rate": 9.585358108105699e-05, + "loss": 0.0, + "step": 40475 + }, + { + "epoch": 3.7768032098535036, + "grad_norm": NaN, + "learning_rate": 9.584652735319828e-05, + "loss": 0.0, + "step": 40476 + }, + { + "epoch": 3.776896519548381, + "grad_norm": NaN, + "learning_rate": 9.583947376303583e-05, + "loss": 0.0, + "step": 40477 + }, + { + "epoch": 3.7769898292432584, + "grad_norm": NaN, + "learning_rate": 9.583242031058758e-05, + "loss": 0.0, + "step": 40478 + }, + { + "epoch": 3.777083138938136, + "grad_norm": NaN, + "learning_rate": 9.582536699587142e-05, + "loss": 0.0, + "step": 40479 + }, + { + "epoch": 3.777176448633013, + "grad_norm": NaN, + "learning_rate": 9.581831381890529e-05, + "loss": 0.0, + "step": 40480 + }, + { + "epoch": 3.7772697583278902, + "grad_norm": NaN, + "learning_rate": 9.58112607797072e-05, + "loss": 0.0, + "step": 40481 + }, + { + "epoch": 3.7773630680227677, + "grad_norm": NaN, + "learning_rate": 9.580420787829497e-05, + "loss": 0.0, + "step": 40482 + }, + { + "epoch": 3.7774563777176446, + "grad_norm": NaN, + "learning_rate": 9.579715511468658e-05, + "loss": 0.0, + "step": 40483 + }, + { + "epoch": 3.777549687412522, + "grad_norm": NaN, + "learning_rate": 9.57901024889e-05, + "loss": 0.0, + "step": 40484 + }, + { + "epoch": 3.7776429971073995, + "grad_norm": NaN, + "learning_rate": 9.578305000095311e-05, + "loss": 0.0, + "step": 40485 + }, + { + "epoch": 3.777736306802277, + "grad_norm": NaN, + "learning_rate": 9.577599765086385e-05, + "loss": 0.0, + "step": 40486 + }, + { + "epoch": 3.7778296164971543, + "grad_norm": NaN, + "learning_rate": 9.576894543865023e-05, + "loss": 0.0, + "step": 40487 + }, + { + "epoch": 3.7779229261920313, + "grad_norm": NaN, + "learning_rate": 9.576189336433006e-05, + "loss": 0.0, + "step": 40488 + }, + { + "epoch": 3.7780162358869087, + "grad_norm": NaN, + "learning_rate": 9.575484142792131e-05, + "loss": 0.0, + "step": 40489 + }, + { + "epoch": 3.7781095455817857, + "grad_norm": NaN, + "learning_rate": 9.574778962944198e-05, + "loss": 0.0, + "step": 40490 + }, + { + "epoch": 3.778202855276663, + "grad_norm": NaN, + "learning_rate": 9.574073796890994e-05, + "loss": 0.0, + "step": 40491 + }, + { + "epoch": 3.7782961649715405, + "grad_norm": NaN, + "learning_rate": 9.573368644634317e-05, + "loss": 0.0, + "step": 40492 + }, + { + "epoch": 3.778389474666418, + "grad_norm": NaN, + "learning_rate": 9.572663506175946e-05, + "loss": 0.0, + "step": 40493 + }, + { + "epoch": 3.7784827843612954, + "grad_norm": NaN, + "learning_rate": 9.57195838151769e-05, + "loss": 0.0, + "step": 40494 + }, + { + "epoch": 3.7785760940561723, + "grad_norm": NaN, + "learning_rate": 9.571253270661338e-05, + "loss": 0.0, + "step": 40495 + }, + { + "epoch": 3.7786694037510498, + "grad_norm": NaN, + "learning_rate": 9.570548173608674e-05, + "loss": 0.0, + "step": 40496 + }, + { + "epoch": 3.778762713445927, + "grad_norm": NaN, + "learning_rate": 9.569843090361503e-05, + "loss": 0.0, + "step": 40497 + }, + { + "epoch": 3.778856023140804, + "grad_norm": NaN, + "learning_rate": 9.569138020921612e-05, + "loss": 0.0, + "step": 40498 + }, + { + "epoch": 3.7789493328356816, + "grad_norm": NaN, + "learning_rate": 9.56843296529079e-05, + "loss": 0.0, + "step": 40499 + }, + { + "epoch": 3.779042642530559, + "grad_norm": NaN, + "learning_rate": 9.567727923470838e-05, + "loss": 0.0, + "step": 40500 + }, + { + "epoch": 3.7791359522254364, + "grad_norm": NaN, + "learning_rate": 9.567022895463548e-05, + "loss": 0.0, + "step": 40501 + }, + { + "epoch": 3.7792292619203134, + "grad_norm": NaN, + "learning_rate": 9.566317881270703e-05, + "loss": 0.0, + "step": 40502 + }, + { + "epoch": 3.779322571615191, + "grad_norm": NaN, + "learning_rate": 9.565612880894103e-05, + "loss": 0.0, + "step": 40503 + }, + { + "epoch": 3.7794158813100682, + "grad_norm": NaN, + "learning_rate": 9.564907894335547e-05, + "loss": 0.0, + "step": 40504 + }, + { + "epoch": 3.779509191004945, + "grad_norm": NaN, + "learning_rate": 9.564202921596811e-05, + "loss": 0.0, + "step": 40505 + }, + { + "epoch": 3.7796025006998226, + "grad_norm": NaN, + "learning_rate": 9.5634979626797e-05, + "loss": 0.0, + "step": 40506 + }, + { + "epoch": 3.7796958103947, + "grad_norm": NaN, + "learning_rate": 9.562793017586007e-05, + "loss": 0.0, + "step": 40507 + }, + { + "epoch": 3.7797891200895775, + "grad_norm": NaN, + "learning_rate": 9.562088086317512e-05, + "loss": 0.0, + "step": 40508 + }, + { + "epoch": 3.779882429784455, + "grad_norm": NaN, + "learning_rate": 9.561383168876024e-05, + "loss": 0.0, + "step": 40509 + }, + { + "epoch": 3.779975739479332, + "grad_norm": NaN, + "learning_rate": 9.560678265263326e-05, + "loss": 0.0, + "step": 40510 + }, + { + "epoch": 3.7800690491742093, + "grad_norm": NaN, + "learning_rate": 9.55997337548121e-05, + "loss": 0.0, + "step": 40511 + }, + { + "epoch": 3.7801623588690862, + "grad_norm": NaN, + "learning_rate": 9.559268499531468e-05, + "loss": 0.0, + "step": 40512 + }, + { + "epoch": 3.7802556685639637, + "grad_norm": NaN, + "learning_rate": 9.558563637415899e-05, + "loss": 0.0, + "step": 40513 + }, + { + "epoch": 3.780348978258841, + "grad_norm": NaN, + "learning_rate": 9.55785878913629e-05, + "loss": 0.0, + "step": 40514 + }, + { + "epoch": 3.7804422879537185, + "grad_norm": NaN, + "learning_rate": 9.557153954694431e-05, + "loss": 0.0, + "step": 40515 + }, + { + "epoch": 3.780535597648596, + "grad_norm": NaN, + "learning_rate": 9.556449134092122e-05, + "loss": 0.0, + "step": 40516 + }, + { + "epoch": 3.780628907343473, + "grad_norm": NaN, + "learning_rate": 9.555744327331148e-05, + "loss": 0.0, + "step": 40517 + }, + { + "epoch": 3.7807222170383503, + "grad_norm": NaN, + "learning_rate": 9.5550395344133e-05, + "loss": 0.0, + "step": 40518 + }, + { + "epoch": 3.7808155267332277, + "grad_norm": NaN, + "learning_rate": 9.55433475534038e-05, + "loss": 0.0, + "step": 40519 + }, + { + "epoch": 3.7809088364281047, + "grad_norm": NaN, + "learning_rate": 9.553629990114172e-05, + "loss": 0.0, + "step": 40520 + }, + { + "epoch": 3.781002146122982, + "grad_norm": NaN, + "learning_rate": 9.552925238736464e-05, + "loss": 0.0, + "step": 40521 + }, + { + "epoch": 3.7810954558178596, + "grad_norm": NaN, + "learning_rate": 9.552220501209061e-05, + "loss": 0.0, + "step": 40522 + }, + { + "epoch": 3.781188765512737, + "grad_norm": NaN, + "learning_rate": 9.551515777533746e-05, + "loss": 0.0, + "step": 40523 + }, + { + "epoch": 3.781282075207614, + "grad_norm": NaN, + "learning_rate": 9.550811067712307e-05, + "loss": 0.0, + "step": 40524 + }, + { + "epoch": 3.7813753849024914, + "grad_norm": NaN, + "learning_rate": 9.550106371746547e-05, + "loss": 0.0, + "step": 40525 + }, + { + "epoch": 3.781468694597369, + "grad_norm": NaN, + "learning_rate": 9.549401689638252e-05, + "loss": 0.0, + "step": 40526 + }, + { + "epoch": 3.7815620042922458, + "grad_norm": NaN, + "learning_rate": 9.548697021389208e-05, + "loss": 0.0, + "step": 40527 + }, + { + "epoch": 3.781655313987123, + "grad_norm": NaN, + "learning_rate": 9.54799236700122e-05, + "loss": 0.0, + "step": 40528 + }, + { + "epoch": 3.7817486236820006, + "grad_norm": NaN, + "learning_rate": 9.547287726476068e-05, + "loss": 0.0, + "step": 40529 + }, + { + "epoch": 3.781841933376878, + "grad_norm": NaN, + "learning_rate": 9.546583099815547e-05, + "loss": 0.0, + "step": 40530 + }, + { + "epoch": 3.781935243071755, + "grad_norm": NaN, + "learning_rate": 9.545878487021455e-05, + "loss": 0.0, + "step": 40531 + }, + { + "epoch": 3.7820285527666324, + "grad_norm": NaN, + "learning_rate": 9.545173888095576e-05, + "loss": 0.0, + "step": 40532 + }, + { + "epoch": 3.78212186246151, + "grad_norm": NaN, + "learning_rate": 9.5444693030397e-05, + "loss": 0.0, + "step": 40533 + }, + { + "epoch": 3.782215172156387, + "grad_norm": NaN, + "learning_rate": 9.543764731855629e-05, + "loss": 0.0, + "step": 40534 + }, + { + "epoch": 3.7823084818512642, + "grad_norm": NaN, + "learning_rate": 9.543060174545144e-05, + "loss": 0.0, + "step": 40535 + }, + { + "epoch": 3.7824017915461416, + "grad_norm": NaN, + "learning_rate": 9.542355631110044e-05, + "loss": 0.0, + "step": 40536 + }, + { + "epoch": 3.782495101241019, + "grad_norm": NaN, + "learning_rate": 9.541651101552113e-05, + "loss": 0.0, + "step": 40537 + }, + { + "epoch": 3.7825884109358965, + "grad_norm": NaN, + "learning_rate": 9.540946585873148e-05, + "loss": 0.0, + "step": 40538 + }, + { + "epoch": 3.7826817206307735, + "grad_norm": NaN, + "learning_rate": 9.540242084074941e-05, + "loss": 0.0, + "step": 40539 + }, + { + "epoch": 3.782775030325651, + "grad_norm": NaN, + "learning_rate": 9.539537596159277e-05, + "loss": 0.0, + "step": 40540 + }, + { + "epoch": 3.7828683400205283, + "grad_norm": NaN, + "learning_rate": 9.538833122127952e-05, + "loss": 0.0, + "step": 40541 + }, + { + "epoch": 3.7829616497154053, + "grad_norm": NaN, + "learning_rate": 9.53812866198276e-05, + "loss": 0.0, + "step": 40542 + }, + { + "epoch": 3.7830549594102827, + "grad_norm": NaN, + "learning_rate": 9.537424215725483e-05, + "loss": 0.0, + "step": 40543 + }, + { + "epoch": 3.78314826910516, + "grad_norm": NaN, + "learning_rate": 9.536719783357923e-05, + "loss": 0.0, + "step": 40544 + }, + { + "epoch": 3.7832415788000375, + "grad_norm": NaN, + "learning_rate": 9.536015364881868e-05, + "loss": 0.0, + "step": 40545 + }, + { + "epoch": 3.7833348884949145, + "grad_norm": NaN, + "learning_rate": 9.535310960299102e-05, + "loss": 0.0, + "step": 40546 + }, + { + "epoch": 3.783428198189792, + "grad_norm": NaN, + "learning_rate": 9.534606569611424e-05, + "loss": 0.0, + "step": 40547 + }, + { + "epoch": 3.7835215078846693, + "grad_norm": NaN, + "learning_rate": 9.533902192820623e-05, + "loss": 0.0, + "step": 40548 + }, + { + "epoch": 3.7836148175795463, + "grad_norm": NaN, + "learning_rate": 9.533197829928487e-05, + "loss": 0.0, + "step": 40549 + }, + { + "epoch": 3.7837081272744237, + "grad_norm": NaN, + "learning_rate": 9.532493480936808e-05, + "loss": 0.0, + "step": 40550 + }, + { + "epoch": 3.783801436969301, + "grad_norm": NaN, + "learning_rate": 9.531789145847383e-05, + "loss": 0.0, + "step": 40551 + }, + { + "epoch": 3.7838947466641786, + "grad_norm": NaN, + "learning_rate": 9.531084824661999e-05, + "loss": 0.0, + "step": 40552 + }, + { + "epoch": 3.7839880563590556, + "grad_norm": NaN, + "learning_rate": 9.530380517382442e-05, + "loss": 0.0, + "step": 40553 + }, + { + "epoch": 3.784081366053933, + "grad_norm": NaN, + "learning_rate": 9.529676224010513e-05, + "loss": 0.0, + "step": 40554 + }, + { + "epoch": 3.7841746757488104, + "grad_norm": NaN, + "learning_rate": 9.528971944547992e-05, + "loss": 0.0, + "step": 40555 + }, + { + "epoch": 3.7842679854436874, + "grad_norm": NaN, + "learning_rate": 9.528267678996673e-05, + "loss": 0.0, + "step": 40556 + }, + { + "epoch": 3.784361295138565, + "grad_norm": NaN, + "learning_rate": 9.527563427358355e-05, + "loss": 0.0, + "step": 40557 + }, + { + "epoch": 3.784454604833442, + "grad_norm": NaN, + "learning_rate": 9.52685918963482e-05, + "loss": 0.0, + "step": 40558 + }, + { + "epoch": 3.7845479145283196, + "grad_norm": NaN, + "learning_rate": 9.526154965827856e-05, + "loss": 0.0, + "step": 40559 + }, + { + "epoch": 3.784641224223197, + "grad_norm": NaN, + "learning_rate": 9.525450755939265e-05, + "loss": 0.0, + "step": 40560 + }, + { + "epoch": 3.784734533918074, + "grad_norm": NaN, + "learning_rate": 9.524746559970827e-05, + "loss": 0.0, + "step": 40561 + }, + { + "epoch": 3.7848278436129514, + "grad_norm": NaN, + "learning_rate": 9.524042377924336e-05, + "loss": 0.0, + "step": 40562 + }, + { + "epoch": 3.7849211533078284, + "grad_norm": NaN, + "learning_rate": 9.523338209801588e-05, + "loss": 0.0, + "step": 40563 + }, + { + "epoch": 3.785014463002706, + "grad_norm": NaN, + "learning_rate": 9.522634055604365e-05, + "loss": 0.0, + "step": 40564 + }, + { + "epoch": 3.7851077726975833, + "grad_norm": NaN, + "learning_rate": 9.521929915334459e-05, + "loss": 0.0, + "step": 40565 + }, + { + "epoch": 3.7852010823924607, + "grad_norm": NaN, + "learning_rate": 9.521225788993668e-05, + "loss": 0.0, + "step": 40566 + }, + { + "epoch": 3.785294392087338, + "grad_norm": NaN, + "learning_rate": 9.520521676583775e-05, + "loss": 0.0, + "step": 40567 + }, + { + "epoch": 3.785387701782215, + "grad_norm": NaN, + "learning_rate": 9.519817578106567e-05, + "loss": 0.0, + "step": 40568 + }, + { + "epoch": 3.7854810114770925, + "grad_norm": NaN, + "learning_rate": 9.519113493563847e-05, + "loss": 0.0, + "step": 40569 + }, + { + "epoch": 3.78557432117197, + "grad_norm": NaN, + "learning_rate": 9.518409422957393e-05, + "loss": 0.0, + "step": 40570 + }, + { + "epoch": 3.785667630866847, + "grad_norm": NaN, + "learning_rate": 9.517705366289001e-05, + "loss": 0.0, + "step": 40571 + }, + { + "epoch": 3.7857609405617243, + "grad_norm": NaN, + "learning_rate": 9.517001323560463e-05, + "loss": 0.0, + "step": 40572 + }, + { + "epoch": 3.7858542502566017, + "grad_norm": NaN, + "learning_rate": 9.516297294773564e-05, + "loss": 0.0, + "step": 40573 + }, + { + "epoch": 3.785947559951479, + "grad_norm": NaN, + "learning_rate": 9.515593279930094e-05, + "loss": 0.0, + "step": 40574 + }, + { + "epoch": 3.786040869646356, + "grad_norm": NaN, + "learning_rate": 9.514889279031853e-05, + "loss": 0.0, + "step": 40575 + }, + { + "epoch": 3.7861341793412335, + "grad_norm": NaN, + "learning_rate": 9.51418529208062e-05, + "loss": 0.0, + "step": 40576 + }, + { + "epoch": 3.786227489036111, + "grad_norm": NaN, + "learning_rate": 9.513481319078184e-05, + "loss": 0.0, + "step": 40577 + }, + { + "epoch": 3.786320798730988, + "grad_norm": NaN, + "learning_rate": 9.512777360026346e-05, + "loss": 0.0, + "step": 40578 + }, + { + "epoch": 3.7864141084258653, + "grad_norm": NaN, + "learning_rate": 9.512073414926888e-05, + "loss": 0.0, + "step": 40579 + }, + { + "epoch": 3.7865074181207428, + "grad_norm": NaN, + "learning_rate": 9.511369483781605e-05, + "loss": 0.0, + "step": 40580 + }, + { + "epoch": 3.78660072781562, + "grad_norm": NaN, + "learning_rate": 9.510665566592277e-05, + "loss": 0.0, + "step": 40581 + }, + { + "epoch": 3.7866940375104976, + "grad_norm": NaN, + "learning_rate": 9.509961663360703e-05, + "loss": 0.0, + "step": 40582 + }, + { + "epoch": 3.7867873472053746, + "grad_norm": NaN, + "learning_rate": 9.509257774088674e-05, + "loss": 0.0, + "step": 40583 + }, + { + "epoch": 3.786880656900252, + "grad_norm": NaN, + "learning_rate": 9.508553898777969e-05, + "loss": 0.0, + "step": 40584 + }, + { + "epoch": 3.786973966595129, + "grad_norm": NaN, + "learning_rate": 9.507850037430385e-05, + "loss": 0.0, + "step": 40585 + }, + { + "epoch": 3.7870672762900064, + "grad_norm": NaN, + "learning_rate": 9.507146190047722e-05, + "loss": 0.0, + "step": 40586 + }, + { + "epoch": 3.787160585984884, + "grad_norm": NaN, + "learning_rate": 9.506442356631749e-05, + "loss": 0.0, + "step": 40587 + }, + { + "epoch": 3.7872538956797612, + "grad_norm": NaN, + "learning_rate": 9.505738537184265e-05, + "loss": 0.0, + "step": 40588 + }, + { + "epoch": 3.7873472053746386, + "grad_norm": NaN, + "learning_rate": 9.505034731707067e-05, + "loss": 0.0, + "step": 40589 + }, + { + "epoch": 3.7874405150695156, + "grad_norm": NaN, + "learning_rate": 9.504330940201934e-05, + "loss": 0.0, + "step": 40590 + }, + { + "epoch": 3.787533824764393, + "grad_norm": NaN, + "learning_rate": 9.503627162670659e-05, + "loss": 0.0, + "step": 40591 + }, + { + "epoch": 3.7876271344592705, + "grad_norm": NaN, + "learning_rate": 9.502923399115036e-05, + "loss": 0.0, + "step": 40592 + }, + { + "epoch": 3.7877204441541474, + "grad_norm": NaN, + "learning_rate": 9.502219649536848e-05, + "loss": 0.0, + "step": 40593 + }, + { + "epoch": 3.787813753849025, + "grad_norm": NaN, + "learning_rate": 9.501515913937882e-05, + "loss": 0.0, + "step": 40594 + }, + { + "epoch": 3.7879070635439023, + "grad_norm": NaN, + "learning_rate": 9.500812192319941e-05, + "loss": 0.0, + "step": 40595 + }, + { + "epoch": 3.7880003732387797, + "grad_norm": NaN, + "learning_rate": 9.500108484684802e-05, + "loss": 0.0, + "step": 40596 + }, + { + "epoch": 3.7880936829336567, + "grad_norm": NaN, + "learning_rate": 9.499404791034254e-05, + "loss": 0.0, + "step": 40597 + }, + { + "epoch": 3.788186992628534, + "grad_norm": NaN, + "learning_rate": 9.498701111370097e-05, + "loss": 0.0, + "step": 40598 + }, + { + "epoch": 3.7882803023234115, + "grad_norm": NaN, + "learning_rate": 9.497997445694111e-05, + "loss": 0.0, + "step": 40599 + }, + { + "epoch": 3.7883736120182885, + "grad_norm": NaN, + "learning_rate": 9.497293794008084e-05, + "loss": 0.0, + "step": 40600 + }, + { + "epoch": 3.788466921713166, + "grad_norm": NaN, + "learning_rate": 9.496590156313813e-05, + "loss": 0.0, + "step": 40601 + }, + { + "epoch": 3.7885602314080433, + "grad_norm": NaN, + "learning_rate": 9.495886532613083e-05, + "loss": 0.0, + "step": 40602 + }, + { + "epoch": 3.7886535411029207, + "grad_norm": NaN, + "learning_rate": 9.495182922907681e-05, + "loss": 0.0, + "step": 40603 + }, + { + "epoch": 3.788746850797798, + "grad_norm": NaN, + "learning_rate": 9.494479327199401e-05, + "loss": 0.0, + "step": 40604 + }, + { + "epoch": 3.788840160492675, + "grad_norm": NaN, + "learning_rate": 9.493775745490028e-05, + "loss": 0.0, + "step": 40605 + }, + { + "epoch": 3.7889334701875526, + "grad_norm": NaN, + "learning_rate": 9.493072177781348e-05, + "loss": 0.0, + "step": 40606 + }, + { + "epoch": 3.7890267798824295, + "grad_norm": NaN, + "learning_rate": 9.49236862407516e-05, + "loss": 0.0, + "step": 40607 + }, + { + "epoch": 3.789120089577307, + "grad_norm": NaN, + "learning_rate": 9.491665084373245e-05, + "loss": 0.0, + "step": 40608 + }, + { + "epoch": 3.7892133992721844, + "grad_norm": NaN, + "learning_rate": 9.49096155867739e-05, + "loss": 0.0, + "step": 40609 + }, + { + "epoch": 3.789306708967062, + "grad_norm": NaN, + "learning_rate": 9.490258046989396e-05, + "loss": 0.0, + "step": 40610 + }, + { + "epoch": 3.789400018661939, + "grad_norm": NaN, + "learning_rate": 9.489554549311038e-05, + "loss": 0.0, + "step": 40611 + }, + { + "epoch": 3.789493328356816, + "grad_norm": NaN, + "learning_rate": 9.488851065644109e-05, + "loss": 0.0, + "step": 40612 + }, + { + "epoch": 3.7895866380516936, + "grad_norm": NaN, + "learning_rate": 9.488147595990406e-05, + "loss": 0.0, + "step": 40613 + }, + { + "epoch": 3.789679947746571, + "grad_norm": NaN, + "learning_rate": 9.487444140351706e-05, + "loss": 0.0, + "step": 40614 + }, + { + "epoch": 3.789773257441448, + "grad_norm": NaN, + "learning_rate": 9.486740698729801e-05, + "loss": 0.0, + "step": 40615 + }, + { + "epoch": 3.7898665671363254, + "grad_norm": NaN, + "learning_rate": 9.486037271126487e-05, + "loss": 0.0, + "step": 40616 + }, + { + "epoch": 3.789959876831203, + "grad_norm": NaN, + "learning_rate": 9.485333857543545e-05, + "loss": 0.0, + "step": 40617 + }, + { + "epoch": 3.7900531865260803, + "grad_norm": NaN, + "learning_rate": 9.48463045798276e-05, + "loss": 0.0, + "step": 40618 + }, + { + "epoch": 3.7901464962209572, + "grad_norm": NaN, + "learning_rate": 9.483927072445933e-05, + "loss": 0.0, + "step": 40619 + }, + { + "epoch": 3.7902398059158346, + "grad_norm": NaN, + "learning_rate": 9.483223700934843e-05, + "loss": 0.0, + "step": 40620 + }, + { + "epoch": 3.790333115610712, + "grad_norm": NaN, + "learning_rate": 9.482520343451278e-05, + "loss": 0.0, + "step": 40621 + }, + { + "epoch": 3.790426425305589, + "grad_norm": NaN, + "learning_rate": 9.481816999997033e-05, + "loss": 0.0, + "step": 40622 + }, + { + "epoch": 3.7905197350004665, + "grad_norm": NaN, + "learning_rate": 9.481113670573889e-05, + "loss": 0.0, + "step": 40623 + }, + { + "epoch": 3.790613044695344, + "grad_norm": NaN, + "learning_rate": 9.480410355183641e-05, + "loss": 0.0, + "step": 40624 + }, + { + "epoch": 3.7907063543902213, + "grad_norm": NaN, + "learning_rate": 9.479707053828078e-05, + "loss": 0.0, + "step": 40625 + }, + { + "epoch": 3.7907996640850987, + "grad_norm": NaN, + "learning_rate": 9.479003766508978e-05, + "loss": 0.0, + "step": 40626 + }, + { + "epoch": 3.7908929737799757, + "grad_norm": NaN, + "learning_rate": 9.478300493228142e-05, + "loss": 0.0, + "step": 40627 + }, + { + "epoch": 3.790986283474853, + "grad_norm": NaN, + "learning_rate": 9.47759723398735e-05, + "loss": 0.0, + "step": 40628 + }, + { + "epoch": 3.79107959316973, + "grad_norm": NaN, + "learning_rate": 9.476893988788387e-05, + "loss": 0.0, + "step": 40629 + }, + { + "epoch": 3.7911729028646075, + "grad_norm": NaN, + "learning_rate": 9.476190757633055e-05, + "loss": 0.0, + "step": 40630 + }, + { + "epoch": 3.791266212559485, + "grad_norm": NaN, + "learning_rate": 9.475487540523128e-05, + "loss": 0.0, + "step": 40631 + }, + { + "epoch": 3.7913595222543623, + "grad_norm": NaN, + "learning_rate": 9.474784337460398e-05, + "loss": 0.0, + "step": 40632 + }, + { + "epoch": 3.7914528319492398, + "grad_norm": NaN, + "learning_rate": 9.474081148446662e-05, + "loss": 0.0, + "step": 40633 + }, + { + "epoch": 3.7915461416441167, + "grad_norm": NaN, + "learning_rate": 9.473377973483696e-05, + "loss": 0.0, + "step": 40634 + }, + { + "epoch": 3.791639451338994, + "grad_norm": NaN, + "learning_rate": 9.472674812573293e-05, + "loss": 0.0, + "step": 40635 + }, + { + "epoch": 3.7917327610338716, + "grad_norm": NaN, + "learning_rate": 9.471971665717243e-05, + "loss": 0.0, + "step": 40636 + }, + { + "epoch": 3.7918260707287486, + "grad_norm": NaN, + "learning_rate": 9.47126853291733e-05, + "loss": 0.0, + "step": 40637 + }, + { + "epoch": 3.791919380423626, + "grad_norm": NaN, + "learning_rate": 9.47056541417534e-05, + "loss": 0.0, + "step": 40638 + }, + { + "epoch": 3.7920126901185034, + "grad_norm": NaN, + "learning_rate": 9.46986230949307e-05, + "loss": 0.0, + "step": 40639 + }, + { + "epoch": 3.792105999813381, + "grad_norm": NaN, + "learning_rate": 9.469159218872299e-05, + "loss": 0.0, + "step": 40640 + }, + { + "epoch": 3.792199309508258, + "grad_norm": NaN, + "learning_rate": 9.468456142314815e-05, + "loss": 0.0, + "step": 40641 + }, + { + "epoch": 3.792292619203135, + "grad_norm": NaN, + "learning_rate": 9.467753079822414e-05, + "loss": 0.0, + "step": 40642 + }, + { + "epoch": 3.7923859288980126, + "grad_norm": NaN, + "learning_rate": 9.467050031396878e-05, + "loss": 0.0, + "step": 40643 + }, + { + "epoch": 3.7924792385928896, + "grad_norm": NaN, + "learning_rate": 9.466346997039988e-05, + "loss": 0.0, + "step": 40644 + }, + { + "epoch": 3.792572548287767, + "grad_norm": NaN, + "learning_rate": 9.465643976753545e-05, + "loss": 0.0, + "step": 40645 + }, + { + "epoch": 3.7926658579826444, + "grad_norm": NaN, + "learning_rate": 9.46494097053933e-05, + "loss": 0.0, + "step": 40646 + }, + { + "epoch": 3.792759167677522, + "grad_norm": NaN, + "learning_rate": 9.464237978399125e-05, + "loss": 0.0, + "step": 40647 + }, + { + "epoch": 3.792852477372399, + "grad_norm": NaN, + "learning_rate": 9.46353500033473e-05, + "loss": 0.0, + "step": 40648 + }, + { + "epoch": 3.7929457870672763, + "grad_norm": NaN, + "learning_rate": 9.462832036347921e-05, + "loss": 0.0, + "step": 40649 + }, + { + "epoch": 3.7930390967621537, + "grad_norm": NaN, + "learning_rate": 9.462129086440487e-05, + "loss": 0.0, + "step": 40650 + }, + { + "epoch": 3.7931324064570306, + "grad_norm": NaN, + "learning_rate": 9.461426150614226e-05, + "loss": 0.0, + "step": 40651 + }, + { + "epoch": 3.793225716151908, + "grad_norm": NaN, + "learning_rate": 9.460723228870913e-05, + "loss": 0.0, + "step": 40652 + }, + { + "epoch": 3.7933190258467855, + "grad_norm": NaN, + "learning_rate": 9.460020321212338e-05, + "loss": 0.0, + "step": 40653 + }, + { + "epoch": 3.793412335541663, + "grad_norm": NaN, + "learning_rate": 9.459317427640295e-05, + "loss": 0.0, + "step": 40654 + }, + { + "epoch": 3.7935056452365403, + "grad_norm": NaN, + "learning_rate": 9.458614548156566e-05, + "loss": 0.0, + "step": 40655 + }, + { + "epoch": 3.7935989549314173, + "grad_norm": NaN, + "learning_rate": 9.457911682762933e-05, + "loss": 0.0, + "step": 40656 + }, + { + "epoch": 3.7936922646262947, + "grad_norm": NaN, + "learning_rate": 9.457208831461197e-05, + "loss": 0.0, + "step": 40657 + }, + { + "epoch": 3.7937855743211717, + "grad_norm": NaN, + "learning_rate": 9.456505994253133e-05, + "loss": 0.0, + "step": 40658 + }, + { + "epoch": 3.793878884016049, + "grad_norm": NaN, + "learning_rate": 9.45580317114053e-05, + "loss": 0.0, + "step": 40659 + }, + { + "epoch": 3.7939721937109265, + "grad_norm": NaN, + "learning_rate": 9.455100362125181e-05, + "loss": 0.0, + "step": 40660 + }, + { + "epoch": 3.794065503405804, + "grad_norm": NaN, + "learning_rate": 9.454397567208865e-05, + "loss": 0.0, + "step": 40661 + }, + { + "epoch": 3.7941588131006814, + "grad_norm": NaN, + "learning_rate": 9.453694786393375e-05, + "loss": 0.0, + "step": 40662 + }, + { + "epoch": 3.7942521227955583, + "grad_norm": NaN, + "learning_rate": 9.452992019680501e-05, + "loss": 0.0, + "step": 40663 + }, + { + "epoch": 3.7943454324904358, + "grad_norm": NaN, + "learning_rate": 9.452289267072018e-05, + "loss": 0.0, + "step": 40664 + }, + { + "epoch": 3.794438742185313, + "grad_norm": NaN, + "learning_rate": 9.451586528569723e-05, + "loss": 0.0, + "step": 40665 + }, + { + "epoch": 3.79453205188019, + "grad_norm": NaN, + "learning_rate": 9.450883804175403e-05, + "loss": 0.0, + "step": 40666 + }, + { + "epoch": 3.7946253615750676, + "grad_norm": NaN, + "learning_rate": 9.450181093890837e-05, + "loss": 0.0, + "step": 40667 + }, + { + "epoch": 3.794718671269945, + "grad_norm": NaN, + "learning_rate": 9.449478397717817e-05, + "loss": 0.0, + "step": 40668 + }, + { + "epoch": 3.7948119809648224, + "grad_norm": NaN, + "learning_rate": 9.448775715658136e-05, + "loss": 0.0, + "step": 40669 + }, + { + "epoch": 3.7949052906596994, + "grad_norm": NaN, + "learning_rate": 9.448073047713565e-05, + "loss": 0.0, + "step": 40670 + }, + { + "epoch": 3.794998600354577, + "grad_norm": NaN, + "learning_rate": 9.447370393885907e-05, + "loss": 0.0, + "step": 40671 + }, + { + "epoch": 3.7950919100494542, + "grad_norm": NaN, + "learning_rate": 9.446667754176938e-05, + "loss": 0.0, + "step": 40672 + }, + { + "epoch": 3.795185219744331, + "grad_norm": NaN, + "learning_rate": 9.445965128588443e-05, + "loss": 0.0, + "step": 40673 + }, + { + "epoch": 3.7952785294392086, + "grad_norm": NaN, + "learning_rate": 9.445262517122221e-05, + "loss": 0.0, + "step": 40674 + }, + { + "epoch": 3.795371839134086, + "grad_norm": NaN, + "learning_rate": 9.444559919780049e-05, + "loss": 0.0, + "step": 40675 + }, + { + "epoch": 3.7954651488289635, + "grad_norm": NaN, + "learning_rate": 9.44385733656371e-05, + "loss": 0.0, + "step": 40676 + }, + { + "epoch": 3.795558458523841, + "grad_norm": NaN, + "learning_rate": 9.443154767475004e-05, + "loss": 0.0, + "step": 40677 + }, + { + "epoch": 3.795651768218718, + "grad_norm": NaN, + "learning_rate": 9.442452212515705e-05, + "loss": 0.0, + "step": 40678 + }, + { + "epoch": 3.7957450779135953, + "grad_norm": NaN, + "learning_rate": 9.441749671687603e-05, + "loss": 0.0, + "step": 40679 + }, + { + "epoch": 3.7958383876084723, + "grad_norm": NaN, + "learning_rate": 9.441047144992489e-05, + "loss": 0.0, + "step": 40680 + }, + { + "epoch": 3.7959316973033497, + "grad_norm": NaN, + "learning_rate": 9.440344632432144e-05, + "loss": 0.0, + "step": 40681 + }, + { + "epoch": 3.796025006998227, + "grad_norm": NaN, + "learning_rate": 9.439642134008352e-05, + "loss": 0.0, + "step": 40682 + }, + { + "epoch": 3.7961183166931045, + "grad_norm": NaN, + "learning_rate": 9.438939649722909e-05, + "loss": 0.0, + "step": 40683 + }, + { + "epoch": 3.796211626387982, + "grad_norm": NaN, + "learning_rate": 9.438237179577591e-05, + "loss": 0.0, + "step": 40684 + }, + { + "epoch": 3.796304936082859, + "grad_norm": NaN, + "learning_rate": 9.437534723574185e-05, + "loss": 0.0, + "step": 40685 + }, + { + "epoch": 3.7963982457777363, + "grad_norm": NaN, + "learning_rate": 9.43683228171449e-05, + "loss": 0.0, + "step": 40686 + }, + { + "epoch": 3.7964915554726137, + "grad_norm": NaN, + "learning_rate": 9.436129854000275e-05, + "loss": 0.0, + "step": 40687 + }, + { + "epoch": 3.7965848651674907, + "grad_norm": NaN, + "learning_rate": 9.435427440433335e-05, + "loss": 0.0, + "step": 40688 + }, + { + "epoch": 3.796678174862368, + "grad_norm": NaN, + "learning_rate": 9.434725041015459e-05, + "loss": 0.0, + "step": 40689 + }, + { + "epoch": 3.7967714845572456, + "grad_norm": NaN, + "learning_rate": 9.434022655748423e-05, + "loss": 0.0, + "step": 40690 + }, + { + "epoch": 3.796864794252123, + "grad_norm": NaN, + "learning_rate": 9.433320284634017e-05, + "loss": 0.0, + "step": 40691 + }, + { + "epoch": 3.796958103947, + "grad_norm": NaN, + "learning_rate": 9.432617927674038e-05, + "loss": 0.0, + "step": 40692 + }, + { + "epoch": 3.7970514136418774, + "grad_norm": NaN, + "learning_rate": 9.431915584870257e-05, + "loss": 0.0, + "step": 40693 + }, + { + "epoch": 3.797144723336755, + "grad_norm": NaN, + "learning_rate": 9.43121325622446e-05, + "loss": 0.0, + "step": 40694 + }, + { + "epoch": 3.7972380330316318, + "grad_norm": NaN, + "learning_rate": 9.430510941738447e-05, + "loss": 0.0, + "step": 40695 + }, + { + "epoch": 3.797331342726509, + "grad_norm": NaN, + "learning_rate": 9.42980864141399e-05, + "loss": 0.0, + "step": 40696 + }, + { + "epoch": 3.7974246524213866, + "grad_norm": NaN, + "learning_rate": 9.429106355252878e-05, + "loss": 0.0, + "step": 40697 + }, + { + "epoch": 3.797517962116264, + "grad_norm": NaN, + "learning_rate": 9.428404083256905e-05, + "loss": 0.0, + "step": 40698 + }, + { + "epoch": 3.7976112718111414, + "grad_norm": NaN, + "learning_rate": 9.427701825427844e-05, + "loss": 0.0, + "step": 40699 + }, + { + "epoch": 3.7977045815060184, + "grad_norm": NaN, + "learning_rate": 9.426999581767488e-05, + "loss": 0.0, + "step": 40700 + }, + { + "epoch": 3.797797891200896, + "grad_norm": NaN, + "learning_rate": 9.426297352277624e-05, + "loss": 0.0, + "step": 40701 + }, + { + "epoch": 3.797891200895773, + "grad_norm": NaN, + "learning_rate": 9.42559513696003e-05, + "loss": 0.0, + "step": 40702 + }, + { + "epoch": 3.7979845105906502, + "grad_norm": NaN, + "learning_rate": 9.424892935816498e-05, + "loss": 0.0, + "step": 40703 + }, + { + "epoch": 3.7980778202855277, + "grad_norm": NaN, + "learning_rate": 9.424190748848817e-05, + "loss": 0.0, + "step": 40704 + }, + { + "epoch": 3.798171129980405, + "grad_norm": NaN, + "learning_rate": 9.423488576058759e-05, + "loss": 0.0, + "step": 40705 + }, + { + "epoch": 3.7982644396752825, + "grad_norm": NaN, + "learning_rate": 9.422786417448123e-05, + "loss": 0.0, + "step": 40706 + }, + { + "epoch": 3.7983577493701595, + "grad_norm": NaN, + "learning_rate": 9.42208427301869e-05, + "loss": 0.0, + "step": 40707 + }, + { + "epoch": 3.798451059065037, + "grad_norm": NaN, + "learning_rate": 9.421382142772239e-05, + "loss": 0.0, + "step": 40708 + }, + { + "epoch": 3.7985443687599143, + "grad_norm": NaN, + "learning_rate": 9.420680026710563e-05, + "loss": 0.0, + "step": 40709 + }, + { + "epoch": 3.7986376784547913, + "grad_norm": NaN, + "learning_rate": 9.419977924835448e-05, + "loss": 0.0, + "step": 40710 + }, + { + "epoch": 3.7987309881496687, + "grad_norm": NaN, + "learning_rate": 9.419275837148674e-05, + "loss": 0.0, + "step": 40711 + }, + { + "epoch": 3.798824297844546, + "grad_norm": NaN, + "learning_rate": 9.418573763652027e-05, + "loss": 0.0, + "step": 40712 + }, + { + "epoch": 3.7989176075394235, + "grad_norm": NaN, + "learning_rate": 9.417871704347296e-05, + "loss": 0.0, + "step": 40713 + }, + { + "epoch": 3.7990109172343005, + "grad_norm": NaN, + "learning_rate": 9.417169659236261e-05, + "loss": 0.0, + "step": 40714 + }, + { + "epoch": 3.799104226929178, + "grad_norm": NaN, + "learning_rate": 9.416467628320714e-05, + "loss": 0.0, + "step": 40715 + }, + { + "epoch": 3.7991975366240553, + "grad_norm": NaN, + "learning_rate": 9.415765611602434e-05, + "loss": 0.0, + "step": 40716 + }, + { + "epoch": 3.7992908463189323, + "grad_norm": NaN, + "learning_rate": 9.415063609083204e-05, + "loss": 0.0, + "step": 40717 + }, + { + "epoch": 3.7993841560138097, + "grad_norm": NaN, + "learning_rate": 9.414361620764819e-05, + "loss": 0.0, + "step": 40718 + }, + { + "epoch": 3.799477465708687, + "grad_norm": NaN, + "learning_rate": 9.413659646649056e-05, + "loss": 0.0, + "step": 40719 + }, + { + "epoch": 3.7995707754035646, + "grad_norm": NaN, + "learning_rate": 9.412957686737697e-05, + "loss": 0.0, + "step": 40720 + }, + { + "epoch": 3.799664085098442, + "grad_norm": NaN, + "learning_rate": 9.412255741032538e-05, + "loss": 0.0, + "step": 40721 + }, + { + "epoch": 3.799757394793319, + "grad_norm": NaN, + "learning_rate": 9.411553809535354e-05, + "loss": 0.0, + "step": 40722 + }, + { + "epoch": 3.7998507044881964, + "grad_norm": NaN, + "learning_rate": 9.410851892247931e-05, + "loss": 0.0, + "step": 40723 + }, + { + "epoch": 3.7999440141830734, + "grad_norm": NaN, + "learning_rate": 9.410149989172061e-05, + "loss": 0.0, + "step": 40724 + }, + { + "epoch": 3.800037323877951, + "grad_norm": NaN, + "learning_rate": 9.409448100309523e-05, + "loss": 0.0, + "step": 40725 + }, + { + "epoch": 3.800130633572828, + "grad_norm": NaN, + "learning_rate": 9.408746225662098e-05, + "loss": 0.0, + "step": 40726 + }, + { + "epoch": 3.8002239432677056, + "grad_norm": NaN, + "learning_rate": 9.40804436523158e-05, + "loss": 0.0, + "step": 40727 + }, + { + "epoch": 3.800317252962583, + "grad_norm": NaN, + "learning_rate": 9.407342519019748e-05, + "loss": 0.0, + "step": 40728 + }, + { + "epoch": 3.80041056265746, + "grad_norm": NaN, + "learning_rate": 9.406640687028383e-05, + "loss": 0.0, + "step": 40729 + }, + { + "epoch": 3.8005038723523374, + "grad_norm": NaN, + "learning_rate": 9.40593886925928e-05, + "loss": 0.0, + "step": 40730 + }, + { + "epoch": 3.800597182047215, + "grad_norm": NaN, + "learning_rate": 9.405237065714215e-05, + "loss": 0.0, + "step": 40731 + }, + { + "epoch": 3.800690491742092, + "grad_norm": NaN, + "learning_rate": 9.404535276394971e-05, + "loss": 0.0, + "step": 40732 + }, + { + "epoch": 3.8007838014369693, + "grad_norm": NaN, + "learning_rate": 9.403833501303342e-05, + "loss": 0.0, + "step": 40733 + }, + { + "epoch": 3.8008771111318467, + "grad_norm": NaN, + "learning_rate": 9.403131740441106e-05, + "loss": 0.0, + "step": 40734 + }, + { + "epoch": 3.800970420826724, + "grad_norm": NaN, + "learning_rate": 9.40242999381004e-05, + "loss": 0.0, + "step": 40735 + }, + { + "epoch": 3.801063730521601, + "grad_norm": NaN, + "learning_rate": 9.401728261411948e-05, + "loss": 0.0, + "step": 40736 + }, + { + "epoch": 3.8011570402164785, + "grad_norm": NaN, + "learning_rate": 9.401026543248594e-05, + "loss": 0.0, + "step": 40737 + }, + { + "epoch": 3.801250349911356, + "grad_norm": NaN, + "learning_rate": 9.400324839321774e-05, + "loss": 0.0, + "step": 40738 + }, + { + "epoch": 3.801343659606233, + "grad_norm": NaN, + "learning_rate": 9.399623149633272e-05, + "loss": 0.0, + "step": 40739 + }, + { + "epoch": 3.8014369693011103, + "grad_norm": NaN, + "learning_rate": 9.398921474184865e-05, + "loss": 0.0, + "step": 40740 + }, + { + "epoch": 3.8015302789959877, + "grad_norm": NaN, + "learning_rate": 9.398219812978342e-05, + "loss": 0.0, + "step": 40741 + }, + { + "epoch": 3.801623588690865, + "grad_norm": NaN, + "learning_rate": 9.397518166015492e-05, + "loss": 0.0, + "step": 40742 + }, + { + "epoch": 3.801716898385742, + "grad_norm": NaN, + "learning_rate": 9.396816533298086e-05, + "loss": 0.0, + "step": 40743 + }, + { + "epoch": 3.8018102080806195, + "grad_norm": NaN, + "learning_rate": 9.396114914827921e-05, + "loss": 0.0, + "step": 40744 + }, + { + "epoch": 3.801903517775497, + "grad_norm": NaN, + "learning_rate": 9.395413310606777e-05, + "loss": 0.0, + "step": 40745 + }, + { + "epoch": 3.801996827470374, + "grad_norm": NaN, + "learning_rate": 9.394711720636431e-05, + "loss": 0.0, + "step": 40746 + }, + { + "epoch": 3.8020901371652513, + "grad_norm": NaN, + "learning_rate": 9.394010144918675e-05, + "loss": 0.0, + "step": 40747 + }, + { + "epoch": 3.8021834468601288, + "grad_norm": NaN, + "learning_rate": 9.393308583455296e-05, + "loss": 0.0, + "step": 40748 + }, + { + "epoch": 3.802276756555006, + "grad_norm": NaN, + "learning_rate": 9.392607036248065e-05, + "loss": 0.0, + "step": 40749 + }, + { + "epoch": 3.8023700662498836, + "grad_norm": NaN, + "learning_rate": 9.391905503298776e-05, + "loss": 0.0, + "step": 40750 + }, + { + "epoch": 3.8024633759447606, + "grad_norm": NaN, + "learning_rate": 9.391203984609213e-05, + "loss": 0.0, + "step": 40751 + }, + { + "epoch": 3.802556685639638, + "grad_norm": NaN, + "learning_rate": 9.390502480181151e-05, + "loss": 0.0, + "step": 40752 + }, + { + "epoch": 3.8026499953345154, + "grad_norm": NaN, + "learning_rate": 9.389800990016382e-05, + "loss": 0.0, + "step": 40753 + }, + { + "epoch": 3.8027433050293924, + "grad_norm": NaN, + "learning_rate": 9.389099514116691e-05, + "loss": 0.0, + "step": 40754 + }, + { + "epoch": 3.80283661472427, + "grad_norm": NaN, + "learning_rate": 9.388398052483852e-05, + "loss": 0.0, + "step": 40755 + }, + { + "epoch": 3.8029299244191472, + "grad_norm": NaN, + "learning_rate": 9.387696605119658e-05, + "loss": 0.0, + "step": 40756 + }, + { + "epoch": 3.8030232341140247, + "grad_norm": NaN, + "learning_rate": 9.386995172025891e-05, + "loss": 0.0, + "step": 40757 + }, + { + "epoch": 3.8031165438089016, + "grad_norm": NaN, + "learning_rate": 9.386293753204325e-05, + "loss": 0.0, + "step": 40758 + }, + { + "epoch": 3.803209853503779, + "grad_norm": NaN, + "learning_rate": 9.385592348656756e-05, + "loss": 0.0, + "step": 40759 + }, + { + "epoch": 3.8033031631986565, + "grad_norm": NaN, + "learning_rate": 9.384890958384967e-05, + "loss": 0.0, + "step": 40760 + }, + { + "epoch": 3.8033964728935334, + "grad_norm": NaN, + "learning_rate": 9.384189582390728e-05, + "loss": 0.0, + "step": 40761 + }, + { + "epoch": 3.803489782588411, + "grad_norm": NaN, + "learning_rate": 9.38348822067584e-05, + "loss": 0.0, + "step": 40762 + }, + { + "epoch": 3.8035830922832883, + "grad_norm": NaN, + "learning_rate": 9.382786873242073e-05, + "loss": 0.0, + "step": 40763 + }, + { + "epoch": 3.8036764019781657, + "grad_norm": NaN, + "learning_rate": 9.382085540091212e-05, + "loss": 0.0, + "step": 40764 + }, + { + "epoch": 3.8037697116730427, + "grad_norm": NaN, + "learning_rate": 9.38138422122505e-05, + "loss": 0.0, + "step": 40765 + }, + { + "epoch": 3.80386302136792, + "grad_norm": NaN, + "learning_rate": 9.380682916645361e-05, + "loss": 0.0, + "step": 40766 + }, + { + "epoch": 3.8039563310627975, + "grad_norm": NaN, + "learning_rate": 9.379981626353928e-05, + "loss": 0.0, + "step": 40767 + }, + { + "epoch": 3.8040496407576745, + "grad_norm": NaN, + "learning_rate": 9.379280350352542e-05, + "loss": 0.0, + "step": 40768 + }, + { + "epoch": 3.804142950452552, + "grad_norm": NaN, + "learning_rate": 9.378579088642976e-05, + "loss": 0.0, + "step": 40769 + }, + { + "epoch": 3.8042362601474293, + "grad_norm": NaN, + "learning_rate": 9.377877841227018e-05, + "loss": 0.0, + "step": 40770 + }, + { + "epoch": 3.8043295698423067, + "grad_norm": NaN, + "learning_rate": 9.377176608106456e-05, + "loss": 0.0, + "step": 40771 + }, + { + "epoch": 3.804422879537184, + "grad_norm": NaN, + "learning_rate": 9.376475389283064e-05, + "loss": 0.0, + "step": 40772 + }, + { + "epoch": 3.804516189232061, + "grad_norm": NaN, + "learning_rate": 9.37577418475863e-05, + "loss": 0.0, + "step": 40773 + }, + { + "epoch": 3.8046094989269386, + "grad_norm": NaN, + "learning_rate": 9.375072994534942e-05, + "loss": 0.0, + "step": 40774 + }, + { + "epoch": 3.8047028086218155, + "grad_norm": NaN, + "learning_rate": 9.374371818613769e-05, + "loss": 0.0, + "step": 40775 + }, + { + "epoch": 3.804796118316693, + "grad_norm": NaN, + "learning_rate": 9.373670656996906e-05, + "loss": 0.0, + "step": 40776 + }, + { + "epoch": 3.8048894280115704, + "grad_norm": NaN, + "learning_rate": 9.372969509686135e-05, + "loss": 0.0, + "step": 40777 + }, + { + "epoch": 3.804982737706448, + "grad_norm": NaN, + "learning_rate": 9.372268376683228e-05, + "loss": 0.0, + "step": 40778 + }, + { + "epoch": 3.805076047401325, + "grad_norm": NaN, + "learning_rate": 9.371567257989981e-05, + "loss": 0.0, + "step": 40779 + }, + { + "epoch": 3.805169357096202, + "grad_norm": NaN, + "learning_rate": 9.370866153608171e-05, + "loss": 0.0, + "step": 40780 + }, + { + "epoch": 3.8052626667910796, + "grad_norm": NaN, + "learning_rate": 9.370165063539577e-05, + "loss": 0.0, + "step": 40781 + }, + { + "epoch": 3.805355976485957, + "grad_norm": NaN, + "learning_rate": 9.369463987785989e-05, + "loss": 0.0, + "step": 40782 + }, + { + "epoch": 3.805449286180834, + "grad_norm": NaN, + "learning_rate": 9.36876292634919e-05, + "loss": 0.0, + "step": 40783 + }, + { + "epoch": 3.8055425958757114, + "grad_norm": NaN, + "learning_rate": 9.368061879230949e-05, + "loss": 0.0, + "step": 40784 + }, + { + "epoch": 3.805635905570589, + "grad_norm": NaN, + "learning_rate": 9.367360846433065e-05, + "loss": 0.0, + "step": 40785 + }, + { + "epoch": 3.8057292152654663, + "grad_norm": NaN, + "learning_rate": 9.366659827957313e-05, + "loss": 0.0, + "step": 40786 + }, + { + "epoch": 3.8058225249603432, + "grad_norm": NaN, + "learning_rate": 9.365958823805475e-05, + "loss": 0.0, + "step": 40787 + }, + { + "epoch": 3.8059158346552207, + "grad_norm": NaN, + "learning_rate": 9.365257833979334e-05, + "loss": 0.0, + "step": 40788 + }, + { + "epoch": 3.806009144350098, + "grad_norm": NaN, + "learning_rate": 9.364556858480678e-05, + "loss": 0.0, + "step": 40789 + }, + { + "epoch": 3.806102454044975, + "grad_norm": NaN, + "learning_rate": 9.363855897311279e-05, + "loss": 0.0, + "step": 40790 + }, + { + "epoch": 3.8061957637398525, + "grad_norm": NaN, + "learning_rate": 9.363154950472926e-05, + "loss": 0.0, + "step": 40791 + }, + { + "epoch": 3.80628907343473, + "grad_norm": NaN, + "learning_rate": 9.362454017967406e-05, + "loss": 0.0, + "step": 40792 + }, + { + "epoch": 3.8063823831296073, + "grad_norm": NaN, + "learning_rate": 9.361753099796489e-05, + "loss": 0.0, + "step": 40793 + }, + { + "epoch": 3.8064756928244847, + "grad_norm": NaN, + "learning_rate": 9.361052195961963e-05, + "loss": 0.0, + "step": 40794 + }, + { + "epoch": 3.8065690025193617, + "grad_norm": NaN, + "learning_rate": 9.36035130646562e-05, + "loss": 0.0, + "step": 40795 + }, + { + "epoch": 3.806662312214239, + "grad_norm": NaN, + "learning_rate": 9.35965043130922e-05, + "loss": 0.0, + "step": 40796 + }, + { + "epoch": 3.806755621909116, + "grad_norm": NaN, + "learning_rate": 9.358949570494567e-05, + "loss": 0.0, + "step": 40797 + }, + { + "epoch": 3.8068489316039935, + "grad_norm": NaN, + "learning_rate": 9.358248724023435e-05, + "loss": 0.0, + "step": 40798 + }, + { + "epoch": 3.806942241298871, + "grad_norm": NaN, + "learning_rate": 9.3575478918976e-05, + "loss": 0.0, + "step": 40799 + }, + { + "epoch": 3.8070355509937484, + "grad_norm": NaN, + "learning_rate": 9.35684707411885e-05, + "loss": 0.0, + "step": 40800 + }, + { + "epoch": 3.8071288606886258, + "grad_norm": NaN, + "learning_rate": 9.356146270688971e-05, + "loss": 0.0, + "step": 40801 + }, + { + "epoch": 3.8072221703835027, + "grad_norm": NaN, + "learning_rate": 9.355445481609732e-05, + "loss": 0.0, + "step": 40802 + }, + { + "epoch": 3.80731548007838, + "grad_norm": NaN, + "learning_rate": 9.354744706882928e-05, + "loss": 0.0, + "step": 40803 + }, + { + "epoch": 3.8074087897732576, + "grad_norm": NaN, + "learning_rate": 9.35404394651034e-05, + "loss": 0.0, + "step": 40804 + }, + { + "epoch": 3.8075020994681346, + "grad_norm": NaN, + "learning_rate": 9.35334320049374e-05, + "loss": 0.0, + "step": 40805 + }, + { + "epoch": 3.807595409163012, + "grad_norm": NaN, + "learning_rate": 9.35264246883492e-05, + "loss": 0.0, + "step": 40806 + }, + { + "epoch": 3.8076887188578894, + "grad_norm": NaN, + "learning_rate": 9.351941751535655e-05, + "loss": 0.0, + "step": 40807 + }, + { + "epoch": 3.807782028552767, + "grad_norm": NaN, + "learning_rate": 9.351241048597724e-05, + "loss": 0.0, + "step": 40808 + }, + { + "epoch": 3.807875338247644, + "grad_norm": NaN, + "learning_rate": 9.350540360022923e-05, + "loss": 0.0, + "step": 40809 + }, + { + "epoch": 3.807968647942521, + "grad_norm": NaN, + "learning_rate": 9.349839685813016e-05, + "loss": 0.0, + "step": 40810 + }, + { + "epoch": 3.8080619576373986, + "grad_norm": NaN, + "learning_rate": 9.349139025969799e-05, + "loss": 0.0, + "step": 40811 + }, + { + "epoch": 3.8081552673322756, + "grad_norm": NaN, + "learning_rate": 9.348438380495047e-05, + "loss": 0.0, + "step": 40812 + }, + { + "epoch": 3.808248577027153, + "grad_norm": NaN, + "learning_rate": 9.347737749390537e-05, + "loss": 0.0, + "step": 40813 + }, + { + "epoch": 3.8083418867220304, + "grad_norm": NaN, + "learning_rate": 9.347037132658059e-05, + "loss": 0.0, + "step": 40814 + }, + { + "epoch": 3.808435196416908, + "grad_norm": NaN, + "learning_rate": 9.346336530299395e-05, + "loss": 0.0, + "step": 40815 + }, + { + "epoch": 3.8085285061117853, + "grad_norm": NaN, + "learning_rate": 9.345635942316317e-05, + "loss": 0.0, + "step": 40816 + }, + { + "epoch": 3.8086218158066623, + "grad_norm": NaN, + "learning_rate": 9.344935368710613e-05, + "loss": 0.0, + "step": 40817 + }, + { + "epoch": 3.8087151255015397, + "grad_norm": NaN, + "learning_rate": 9.344234809484066e-05, + "loss": 0.0, + "step": 40818 + }, + { + "epoch": 3.8088084351964167, + "grad_norm": NaN, + "learning_rate": 9.34353426463845e-05, + "loss": 0.0, + "step": 40819 + }, + { + "epoch": 3.808901744891294, + "grad_norm": NaN, + "learning_rate": 9.342833734175551e-05, + "loss": 0.0, + "step": 40820 + }, + { + "epoch": 3.8089950545861715, + "grad_norm": NaN, + "learning_rate": 9.342133218097156e-05, + "loss": 0.0, + "step": 40821 + }, + { + "epoch": 3.809088364281049, + "grad_norm": NaN, + "learning_rate": 9.341432716405032e-05, + "loss": 0.0, + "step": 40822 + }, + { + "epoch": 3.8091816739759263, + "grad_norm": NaN, + "learning_rate": 9.340732229100973e-05, + "loss": 0.0, + "step": 40823 + }, + { + "epoch": 3.8092749836708033, + "grad_norm": NaN, + "learning_rate": 9.340031756186758e-05, + "loss": 0.0, + "step": 40824 + }, + { + "epoch": 3.8093682933656807, + "grad_norm": NaN, + "learning_rate": 9.339331297664161e-05, + "loss": 0.0, + "step": 40825 + }, + { + "epoch": 3.809461603060558, + "grad_norm": NaN, + "learning_rate": 9.338630853534967e-05, + "loss": 0.0, + "step": 40826 + }, + { + "epoch": 3.809554912755435, + "grad_norm": NaN, + "learning_rate": 9.337930423800965e-05, + "loss": 0.0, + "step": 40827 + }, + { + "epoch": 3.8096482224503125, + "grad_norm": NaN, + "learning_rate": 9.337230008463921e-05, + "loss": 0.0, + "step": 40828 + }, + { + "epoch": 3.80974153214519, + "grad_norm": NaN, + "learning_rate": 9.336529607525627e-05, + "loss": 0.0, + "step": 40829 + }, + { + "epoch": 3.8098348418400674, + "grad_norm": NaN, + "learning_rate": 9.335829220987862e-05, + "loss": 0.0, + "step": 40830 + }, + { + "epoch": 3.8099281515349444, + "grad_norm": NaN, + "learning_rate": 9.335128848852401e-05, + "loss": 0.0, + "step": 40831 + }, + { + "epoch": 3.8100214612298218, + "grad_norm": NaN, + "learning_rate": 9.334428491121033e-05, + "loss": 0.0, + "step": 40832 + }, + { + "epoch": 3.810114770924699, + "grad_norm": NaN, + "learning_rate": 9.333728147795536e-05, + "loss": 0.0, + "step": 40833 + }, + { + "epoch": 3.810208080619576, + "grad_norm": NaN, + "learning_rate": 9.333027818877685e-05, + "loss": 0.0, + "step": 40834 + }, + { + "epoch": 3.8103013903144536, + "grad_norm": NaN, + "learning_rate": 9.332327504369268e-05, + "loss": 0.0, + "step": 40835 + }, + { + "epoch": 3.810394700009331, + "grad_norm": NaN, + "learning_rate": 9.331627204272067e-05, + "loss": 0.0, + "step": 40836 + }, + { + "epoch": 3.8104880097042084, + "grad_norm": NaN, + "learning_rate": 9.330926918587853e-05, + "loss": 0.0, + "step": 40837 + }, + { + "epoch": 3.810581319399086, + "grad_norm": NaN, + "learning_rate": 9.330226647318414e-05, + "loss": 0.0, + "step": 40838 + }, + { + "epoch": 3.810674629093963, + "grad_norm": NaN, + "learning_rate": 9.329526390465534e-05, + "loss": 0.0, + "step": 40839 + }, + { + "epoch": 3.8107679387888402, + "grad_norm": NaN, + "learning_rate": 9.328826148030982e-05, + "loss": 0.0, + "step": 40840 + }, + { + "epoch": 3.810861248483717, + "grad_norm": NaN, + "learning_rate": 9.328125920016549e-05, + "loss": 0.0, + "step": 40841 + }, + { + "epoch": 3.8109545581785946, + "grad_norm": NaN, + "learning_rate": 9.327425706424014e-05, + "loss": 0.0, + "step": 40842 + }, + { + "epoch": 3.811047867873472, + "grad_norm": NaN, + "learning_rate": 9.326725507255147e-05, + "loss": 0.0, + "step": 40843 + }, + { + "epoch": 3.8111411775683495, + "grad_norm": NaN, + "learning_rate": 9.326025322511739e-05, + "loss": 0.0, + "step": 40844 + }, + { + "epoch": 3.811234487263227, + "grad_norm": NaN, + "learning_rate": 9.325325152195577e-05, + "loss": 0.0, + "step": 40845 + }, + { + "epoch": 3.811327796958104, + "grad_norm": NaN, + "learning_rate": 9.324624996308422e-05, + "loss": 0.0, + "step": 40846 + }, + { + "epoch": 3.8114211066529813, + "grad_norm": NaN, + "learning_rate": 9.323924854852064e-05, + "loss": 0.0, + "step": 40847 + }, + { + "epoch": 3.8115144163478587, + "grad_norm": NaN, + "learning_rate": 9.323224727828291e-05, + "loss": 0.0, + "step": 40848 + }, + { + "epoch": 3.8116077260427357, + "grad_norm": NaN, + "learning_rate": 9.322524615238872e-05, + "loss": 0.0, + "step": 40849 + }, + { + "epoch": 3.811701035737613, + "grad_norm": NaN, + "learning_rate": 9.321824517085595e-05, + "loss": 0.0, + "step": 40850 + }, + { + "epoch": 3.8117943454324905, + "grad_norm": NaN, + "learning_rate": 9.321124433370229e-05, + "loss": 0.0, + "step": 40851 + }, + { + "epoch": 3.811887655127368, + "grad_norm": NaN, + "learning_rate": 9.320424364094566e-05, + "loss": 0.0, + "step": 40852 + }, + { + "epoch": 3.811980964822245, + "grad_norm": NaN, + "learning_rate": 9.319724309260384e-05, + "loss": 0.0, + "step": 40853 + }, + { + "epoch": 3.8120742745171223, + "grad_norm": NaN, + "learning_rate": 9.319024268869453e-05, + "loss": 0.0, + "step": 40854 + }, + { + "epoch": 3.8121675842119997, + "grad_norm": NaN, + "learning_rate": 9.318324242923564e-05, + "loss": 0.0, + "step": 40855 + }, + { + "epoch": 3.8122608939068767, + "grad_norm": NaN, + "learning_rate": 9.317624231424497e-05, + "loss": 0.0, + "step": 40856 + }, + { + "epoch": 3.812354203601754, + "grad_norm": NaN, + "learning_rate": 9.316924234374022e-05, + "loss": 0.0, + "step": 40857 + }, + { + "epoch": 3.8124475132966316, + "grad_norm": NaN, + "learning_rate": 9.316224251773929e-05, + "loss": 0.0, + "step": 40858 + }, + { + "epoch": 3.812540822991509, + "grad_norm": NaN, + "learning_rate": 9.315524283625996e-05, + "loss": 0.0, + "step": 40859 + }, + { + "epoch": 3.812634132686386, + "grad_norm": NaN, + "learning_rate": 9.314824329931996e-05, + "loss": 0.0, + "step": 40860 + }, + { + "epoch": 3.8127274423812634, + "grad_norm": NaN, + "learning_rate": 9.314124390693716e-05, + "loss": 0.0, + "step": 40861 + }, + { + "epoch": 3.812820752076141, + "grad_norm": NaN, + "learning_rate": 9.313424465912937e-05, + "loss": 0.0, + "step": 40862 + }, + { + "epoch": 3.8129140617710178, + "grad_norm": NaN, + "learning_rate": 9.312724555591427e-05, + "loss": 0.0, + "step": 40863 + }, + { + "epoch": 3.813007371465895, + "grad_norm": NaN, + "learning_rate": 9.31202465973098e-05, + "loss": 0.0, + "step": 40864 + }, + { + "epoch": 3.8131006811607726, + "grad_norm": NaN, + "learning_rate": 9.31132477833337e-05, + "loss": 0.0, + "step": 40865 + }, + { + "epoch": 3.81319399085565, + "grad_norm": NaN, + "learning_rate": 9.310624911400372e-05, + "loss": 0.0, + "step": 40866 + }, + { + "epoch": 3.8132873005505274, + "grad_norm": NaN, + "learning_rate": 9.30992505893377e-05, + "loss": 0.0, + "step": 40867 + }, + { + "epoch": 3.8133806102454044, + "grad_norm": NaN, + "learning_rate": 9.309225220935348e-05, + "loss": 0.0, + "step": 40868 + }, + { + "epoch": 3.813473919940282, + "grad_norm": NaN, + "learning_rate": 9.308525397406874e-05, + "loss": 0.0, + "step": 40869 + }, + { + "epoch": 3.813567229635159, + "grad_norm": NaN, + "learning_rate": 9.307825588350138e-05, + "loss": 0.0, + "step": 40870 + }, + { + "epoch": 3.8136605393300362, + "grad_norm": NaN, + "learning_rate": 9.307125793766917e-05, + "loss": 0.0, + "step": 40871 + }, + { + "epoch": 3.8137538490249137, + "grad_norm": NaN, + "learning_rate": 9.306426013658982e-05, + "loss": 0.0, + "step": 40872 + }, + { + "epoch": 3.813847158719791, + "grad_norm": NaN, + "learning_rate": 9.305726248028123e-05, + "loss": 0.0, + "step": 40873 + }, + { + "epoch": 3.8139404684146685, + "grad_norm": NaN, + "learning_rate": 9.305026496876119e-05, + "loss": 0.0, + "step": 40874 + }, + { + "epoch": 3.8140337781095455, + "grad_norm": NaN, + "learning_rate": 9.304326760204738e-05, + "loss": 0.0, + "step": 40875 + }, + { + "epoch": 3.814127087804423, + "grad_norm": NaN, + "learning_rate": 9.303627038015771e-05, + "loss": 0.0, + "step": 40876 + }, + { + "epoch": 3.8142203974993003, + "grad_norm": NaN, + "learning_rate": 9.302927330310995e-05, + "loss": 0.0, + "step": 40877 + }, + { + "epoch": 3.8143137071941773, + "grad_norm": NaN, + "learning_rate": 9.302227637092184e-05, + "loss": 0.0, + "step": 40878 + }, + { + "epoch": 3.8144070168890547, + "grad_norm": NaN, + "learning_rate": 9.30152795836112e-05, + "loss": 0.0, + "step": 40879 + }, + { + "epoch": 3.814500326583932, + "grad_norm": NaN, + "learning_rate": 9.300828294119586e-05, + "loss": 0.0, + "step": 40880 + }, + { + "epoch": 3.8145936362788095, + "grad_norm": NaN, + "learning_rate": 9.300128644369354e-05, + "loss": 0.0, + "step": 40881 + }, + { + "epoch": 3.8146869459736865, + "grad_norm": NaN, + "learning_rate": 9.299429009112205e-05, + "loss": 0.0, + "step": 40882 + }, + { + "epoch": 3.814780255668564, + "grad_norm": NaN, + "learning_rate": 9.298729388349928e-05, + "loss": 0.0, + "step": 40883 + }, + { + "epoch": 3.8148735653634414, + "grad_norm": NaN, + "learning_rate": 9.298029782084282e-05, + "loss": 0.0, + "step": 40884 + }, + { + "epoch": 3.8149668750583183, + "grad_norm": NaN, + "learning_rate": 9.297330190317063e-05, + "loss": 0.0, + "step": 40885 + }, + { + "epoch": 3.8150601847531957, + "grad_norm": NaN, + "learning_rate": 9.296630613050046e-05, + "loss": 0.0, + "step": 40886 + }, + { + "epoch": 3.815153494448073, + "grad_norm": NaN, + "learning_rate": 9.295931050285005e-05, + "loss": 0.0, + "step": 40887 + }, + { + "epoch": 3.8152468041429506, + "grad_norm": NaN, + "learning_rate": 9.295231502023719e-05, + "loss": 0.0, + "step": 40888 + }, + { + "epoch": 3.815340113837828, + "grad_norm": NaN, + "learning_rate": 9.294531968267975e-05, + "loss": 0.0, + "step": 40889 + }, + { + "epoch": 3.815433423532705, + "grad_norm": NaN, + "learning_rate": 9.293832449019545e-05, + "loss": 0.0, + "step": 40890 + }, + { + "epoch": 3.8155267332275824, + "grad_norm": NaN, + "learning_rate": 9.293132944280204e-05, + "loss": 0.0, + "step": 40891 + }, + { + "epoch": 3.8156200429224594, + "grad_norm": NaN, + "learning_rate": 9.292433454051743e-05, + "loss": 0.0, + "step": 40892 + }, + { + "epoch": 3.815713352617337, + "grad_norm": NaN, + "learning_rate": 9.291733978335928e-05, + "loss": 0.0, + "step": 40893 + }, + { + "epoch": 3.815806662312214, + "grad_norm": NaN, + "learning_rate": 9.291034517134548e-05, + "loss": 0.0, + "step": 40894 + }, + { + "epoch": 3.8158999720070916, + "grad_norm": NaN, + "learning_rate": 9.290335070449369e-05, + "loss": 0.0, + "step": 40895 + }, + { + "epoch": 3.815993281701969, + "grad_norm": NaN, + "learning_rate": 9.289635638282179e-05, + "loss": 0.0, + "step": 40896 + }, + { + "epoch": 3.816086591396846, + "grad_norm": NaN, + "learning_rate": 9.288936220634757e-05, + "loss": 0.0, + "step": 40897 + }, + { + "epoch": 3.8161799010917234, + "grad_norm": NaN, + "learning_rate": 9.288236817508872e-05, + "loss": 0.0, + "step": 40898 + }, + { + "epoch": 3.816273210786601, + "grad_norm": NaN, + "learning_rate": 9.287537428906313e-05, + "loss": 0.0, + "step": 40899 + }, + { + "epoch": 3.816366520481478, + "grad_norm": NaN, + "learning_rate": 9.286838054828858e-05, + "loss": 0.0, + "step": 40900 + }, + { + "epoch": 3.8164598301763553, + "grad_norm": NaN, + "learning_rate": 9.286138695278273e-05, + "loss": 0.0, + "step": 40901 + }, + { + "epoch": 3.8165531398712327, + "grad_norm": NaN, + "learning_rate": 9.285439350256349e-05, + "loss": 0.0, + "step": 40902 + }, + { + "epoch": 3.81664644956611, + "grad_norm": NaN, + "learning_rate": 9.284740019764862e-05, + "loss": 0.0, + "step": 40903 + }, + { + "epoch": 3.816739759260987, + "grad_norm": NaN, + "learning_rate": 9.284040703805585e-05, + "loss": 0.0, + "step": 40904 + }, + { + "epoch": 3.8168330689558645, + "grad_norm": NaN, + "learning_rate": 9.283341402380299e-05, + "loss": 0.0, + "step": 40905 + }, + { + "epoch": 3.816926378650742, + "grad_norm": NaN, + "learning_rate": 9.282642115490789e-05, + "loss": 0.0, + "step": 40906 + }, + { + "epoch": 3.817019688345619, + "grad_norm": NaN, + "learning_rate": 9.281942843138817e-05, + "loss": 0.0, + "step": 40907 + }, + { + "epoch": 3.8171129980404963, + "grad_norm": NaN, + "learning_rate": 9.281243585326176e-05, + "loss": 0.0, + "step": 40908 + }, + { + "epoch": 3.8172063077353737, + "grad_norm": NaN, + "learning_rate": 9.280544342054639e-05, + "loss": 0.0, + "step": 40909 + }, + { + "epoch": 3.817299617430251, + "grad_norm": NaN, + "learning_rate": 9.279845113325979e-05, + "loss": 0.0, + "step": 40910 + }, + { + "epoch": 3.8173929271251286, + "grad_norm": NaN, + "learning_rate": 9.279145899141981e-05, + "loss": 0.0, + "step": 40911 + }, + { + "epoch": 3.8174862368200055, + "grad_norm": NaN, + "learning_rate": 9.278446699504425e-05, + "loss": 0.0, + "step": 40912 + }, + { + "epoch": 3.817579546514883, + "grad_norm": NaN, + "learning_rate": 9.277747514415076e-05, + "loss": 0.0, + "step": 40913 + }, + { + "epoch": 3.81767285620976, + "grad_norm": NaN, + "learning_rate": 9.277048343875723e-05, + "loss": 0.0, + "step": 40914 + }, + { + "epoch": 3.8177661659046374, + "grad_norm": NaN, + "learning_rate": 9.276349187888145e-05, + "loss": 0.0, + "step": 40915 + }, + { + "epoch": 3.8178594755995148, + "grad_norm": NaN, + "learning_rate": 9.275650046454108e-05, + "loss": 0.0, + "step": 40916 + }, + { + "epoch": 3.817952785294392, + "grad_norm": NaN, + "learning_rate": 9.274950919575402e-05, + "loss": 0.0, + "step": 40917 + }, + { + "epoch": 3.8180460949892696, + "grad_norm": NaN, + "learning_rate": 9.274251807253804e-05, + "loss": 0.0, + "step": 40918 + }, + { + "epoch": 3.8181394046841466, + "grad_norm": NaN, + "learning_rate": 9.27355270949108e-05, + "loss": 0.0, + "step": 40919 + }, + { + "epoch": 3.818232714379024, + "grad_norm": NaN, + "learning_rate": 9.272853626289016e-05, + "loss": 0.0, + "step": 40920 + }, + { + "epoch": 3.8183260240739014, + "grad_norm": NaN, + "learning_rate": 9.272154557649396e-05, + "loss": 0.0, + "step": 40921 + }, + { + "epoch": 3.8184193337687784, + "grad_norm": NaN, + "learning_rate": 9.271455503573982e-05, + "loss": 0.0, + "step": 40922 + }, + { + "epoch": 3.818512643463656, + "grad_norm": NaN, + "learning_rate": 9.270756464064559e-05, + "loss": 0.0, + "step": 40923 + }, + { + "epoch": 3.8186059531585332, + "grad_norm": NaN, + "learning_rate": 9.270057439122912e-05, + "loss": 0.0, + "step": 40924 + }, + { + "epoch": 3.8186992628534107, + "grad_norm": NaN, + "learning_rate": 9.26935842875081e-05, + "loss": 0.0, + "step": 40925 + }, + { + "epoch": 3.8187925725482876, + "grad_norm": NaN, + "learning_rate": 9.268659432950026e-05, + "loss": 0.0, + "step": 40926 + }, + { + "epoch": 3.818885882243165, + "grad_norm": NaN, + "learning_rate": 9.267960451722352e-05, + "loss": 0.0, + "step": 40927 + }, + { + "epoch": 3.8189791919380425, + "grad_norm": NaN, + "learning_rate": 9.267261485069551e-05, + "loss": 0.0, + "step": 40928 + }, + { + "epoch": 3.8190725016329194, + "grad_norm": NaN, + "learning_rate": 9.266562532993404e-05, + "loss": 0.0, + "step": 40929 + }, + { + "epoch": 3.819165811327797, + "grad_norm": NaN, + "learning_rate": 9.265863595495698e-05, + "loss": 0.0, + "step": 40930 + }, + { + "epoch": 3.8192591210226743, + "grad_norm": NaN, + "learning_rate": 9.265164672578198e-05, + "loss": 0.0, + "step": 40931 + }, + { + "epoch": 3.8193524307175517, + "grad_norm": NaN, + "learning_rate": 9.264465764242682e-05, + "loss": 0.0, + "step": 40932 + }, + { + "epoch": 3.819445740412429, + "grad_norm": NaN, + "learning_rate": 9.263766870490937e-05, + "loss": 0.0, + "step": 40933 + }, + { + "epoch": 3.819539050107306, + "grad_norm": NaN, + "learning_rate": 9.26306799132473e-05, + "loss": 0.0, + "step": 40934 + }, + { + "epoch": 3.8196323598021835, + "grad_norm": NaN, + "learning_rate": 9.262369126745839e-05, + "loss": 0.0, + "step": 40935 + }, + { + "epoch": 3.8197256694970605, + "grad_norm": NaN, + "learning_rate": 9.26167027675605e-05, + "loss": 0.0, + "step": 40936 + }, + { + "epoch": 3.819818979191938, + "grad_norm": NaN, + "learning_rate": 9.260971441357132e-05, + "loss": 0.0, + "step": 40937 + }, + { + "epoch": 3.8199122888868153, + "grad_norm": NaN, + "learning_rate": 9.260272620550858e-05, + "loss": 0.0, + "step": 40938 + }, + { + "epoch": 3.8200055985816928, + "grad_norm": NaN, + "learning_rate": 9.259573814339016e-05, + "loss": 0.0, + "step": 40939 + }, + { + "epoch": 3.82009890827657, + "grad_norm": NaN, + "learning_rate": 9.258875022723377e-05, + "loss": 0.0, + "step": 40940 + }, + { + "epoch": 3.820192217971447, + "grad_norm": NaN, + "learning_rate": 9.258176245705722e-05, + "loss": 0.0, + "step": 40941 + }, + { + "epoch": 3.8202855276663246, + "grad_norm": NaN, + "learning_rate": 9.257477483287814e-05, + "loss": 0.0, + "step": 40942 + }, + { + "epoch": 3.820378837361202, + "grad_norm": NaN, + "learning_rate": 9.256778735471446e-05, + "loss": 0.0, + "step": 40943 + }, + { + "epoch": 3.820472147056079, + "grad_norm": NaN, + "learning_rate": 9.25608000225839e-05, + "loss": 0.0, + "step": 40944 + }, + { + "epoch": 3.8205654567509564, + "grad_norm": NaN, + "learning_rate": 9.255381283650415e-05, + "loss": 0.0, + "step": 40945 + }, + { + "epoch": 3.820658766445834, + "grad_norm": NaN, + "learning_rate": 9.254682579649307e-05, + "loss": 0.0, + "step": 40946 + }, + { + "epoch": 3.820752076140711, + "grad_norm": NaN, + "learning_rate": 9.253983890256843e-05, + "loss": 0.0, + "step": 40947 + }, + { + "epoch": 3.820845385835588, + "grad_norm": NaN, + "learning_rate": 9.253285215474789e-05, + "loss": 0.0, + "step": 40948 + }, + { + "epoch": 3.8209386955304656, + "grad_norm": NaN, + "learning_rate": 9.25258655530493e-05, + "loss": 0.0, + "step": 40949 + }, + { + "epoch": 3.821032005225343, + "grad_norm": NaN, + "learning_rate": 9.251887909749047e-05, + "loss": 0.0, + "step": 40950 + }, + { + "epoch": 3.82112531492022, + "grad_norm": NaN, + "learning_rate": 9.251189278808903e-05, + "loss": 0.0, + "step": 40951 + }, + { + "epoch": 3.8212186246150974, + "grad_norm": NaN, + "learning_rate": 9.250490662486284e-05, + "loss": 0.0, + "step": 40952 + }, + { + "epoch": 3.821311934309975, + "grad_norm": NaN, + "learning_rate": 9.249792060782968e-05, + "loss": 0.0, + "step": 40953 + }, + { + "epoch": 3.8214052440048523, + "grad_norm": NaN, + "learning_rate": 9.249093473700722e-05, + "loss": 0.0, + "step": 40954 + }, + { + "epoch": 3.8214985536997292, + "grad_norm": NaN, + "learning_rate": 9.248394901241329e-05, + "loss": 0.0, + "step": 40955 + }, + { + "epoch": 3.8215918633946067, + "grad_norm": NaN, + "learning_rate": 9.247696343406567e-05, + "loss": 0.0, + "step": 40956 + }, + { + "epoch": 3.821685173089484, + "grad_norm": NaN, + "learning_rate": 9.246997800198204e-05, + "loss": 0.0, + "step": 40957 + }, + { + "epoch": 3.821778482784361, + "grad_norm": NaN, + "learning_rate": 9.246299271618022e-05, + "loss": 0.0, + "step": 40958 + }, + { + "epoch": 3.8218717924792385, + "grad_norm": NaN, + "learning_rate": 9.245600757667807e-05, + "loss": 0.0, + "step": 40959 + }, + { + "epoch": 3.821965102174116, + "grad_norm": NaN, + "learning_rate": 9.244902258349312e-05, + "loss": 0.0, + "step": 40960 + }, + { + "epoch": 3.8220584118689933, + "grad_norm": NaN, + "learning_rate": 9.244203773664327e-05, + "loss": 0.0, + "step": 40961 + }, + { + "epoch": 3.8221517215638707, + "grad_norm": NaN, + "learning_rate": 9.243505303614635e-05, + "loss": 0.0, + "step": 40962 + }, + { + "epoch": 3.8222450312587477, + "grad_norm": NaN, + "learning_rate": 9.242806848201998e-05, + "loss": 0.0, + "step": 40963 + }, + { + "epoch": 3.822338340953625, + "grad_norm": NaN, + "learning_rate": 9.242108407428196e-05, + "loss": 0.0, + "step": 40964 + }, + { + "epoch": 3.8224316506485025, + "grad_norm": NaN, + "learning_rate": 9.241409981295014e-05, + "loss": 0.0, + "step": 40965 + }, + { + "epoch": 3.8225249603433795, + "grad_norm": NaN, + "learning_rate": 9.240711569804216e-05, + "loss": 0.0, + "step": 40966 + }, + { + "epoch": 3.822618270038257, + "grad_norm": NaN, + "learning_rate": 9.240013172957578e-05, + "loss": 0.0, + "step": 40967 + }, + { + "epoch": 3.8227115797331344, + "grad_norm": NaN, + "learning_rate": 9.239314790756889e-05, + "loss": 0.0, + "step": 40968 + }, + { + "epoch": 3.8228048894280118, + "grad_norm": NaN, + "learning_rate": 9.238616423203912e-05, + "loss": 0.0, + "step": 40969 + }, + { + "epoch": 3.8228981991228888, + "grad_norm": NaN, + "learning_rate": 9.237918070300423e-05, + "loss": 0.0, + "step": 40970 + }, + { + "epoch": 3.822991508817766, + "grad_norm": NaN, + "learning_rate": 9.23721973204821e-05, + "loss": 0.0, + "step": 40971 + }, + { + "epoch": 3.8230848185126436, + "grad_norm": NaN, + "learning_rate": 9.236521408449034e-05, + "loss": 0.0, + "step": 40972 + }, + { + "epoch": 3.8231781282075206, + "grad_norm": NaN, + "learning_rate": 9.235823099504675e-05, + "loss": 0.0, + "step": 40973 + }, + { + "epoch": 3.823271437902398, + "grad_norm": NaN, + "learning_rate": 9.235124805216917e-05, + "loss": 0.0, + "step": 40974 + }, + { + "epoch": 3.8233647475972754, + "grad_norm": NaN, + "learning_rate": 9.234426525587527e-05, + "loss": 0.0, + "step": 40975 + }, + { + "epoch": 3.823458057292153, + "grad_norm": NaN, + "learning_rate": 9.233728260618278e-05, + "loss": 0.0, + "step": 40976 + }, + { + "epoch": 3.82355136698703, + "grad_norm": NaN, + "learning_rate": 9.233030010310955e-05, + "loss": 0.0, + "step": 40977 + }, + { + "epoch": 3.823644676681907, + "grad_norm": NaN, + "learning_rate": 9.232331774667327e-05, + "loss": 0.0, + "step": 40978 + }, + { + "epoch": 3.8237379863767846, + "grad_norm": NaN, + "learning_rate": 9.231633553689166e-05, + "loss": 0.0, + "step": 40979 + }, + { + "epoch": 3.8238312960716616, + "grad_norm": NaN, + "learning_rate": 9.23093534737826e-05, + "loss": 0.0, + "step": 40980 + }, + { + "epoch": 3.823924605766539, + "grad_norm": NaN, + "learning_rate": 9.230237155736375e-05, + "loss": 0.0, + "step": 40981 + }, + { + "epoch": 3.8240179154614165, + "grad_norm": NaN, + "learning_rate": 9.229538978765281e-05, + "loss": 0.0, + "step": 40982 + }, + { + "epoch": 3.824111225156294, + "grad_norm": NaN, + "learning_rate": 9.228840816466767e-05, + "loss": 0.0, + "step": 40983 + }, + { + "epoch": 3.8242045348511713, + "grad_norm": NaN, + "learning_rate": 9.228142668842599e-05, + "loss": 0.0, + "step": 40984 + }, + { + "epoch": 3.8242978445460483, + "grad_norm": NaN, + "learning_rate": 9.227444535894557e-05, + "loss": 0.0, + "step": 40985 + }, + { + "epoch": 3.8243911542409257, + "grad_norm": NaN, + "learning_rate": 9.226746417624408e-05, + "loss": 0.0, + "step": 40986 + }, + { + "epoch": 3.8244844639358027, + "grad_norm": NaN, + "learning_rate": 9.226048314033936e-05, + "loss": 0.0, + "step": 40987 + }, + { + "epoch": 3.82457777363068, + "grad_norm": NaN, + "learning_rate": 9.225350225124915e-05, + "loss": 0.0, + "step": 40988 + }, + { + "epoch": 3.8246710833255575, + "grad_norm": NaN, + "learning_rate": 9.224652150899113e-05, + "loss": 0.0, + "step": 40989 + }, + { + "epoch": 3.824764393020435, + "grad_norm": NaN, + "learning_rate": 9.223954091358311e-05, + "loss": 0.0, + "step": 40990 + }, + { + "epoch": 3.8248577027153123, + "grad_norm": NaN, + "learning_rate": 9.223256046504289e-05, + "loss": 0.0, + "step": 40991 + }, + { + "epoch": 3.8249510124101893, + "grad_norm": NaN, + "learning_rate": 9.222558016338805e-05, + "loss": 0.0, + "step": 40992 + }, + { + "epoch": 3.8250443221050667, + "grad_norm": NaN, + "learning_rate": 9.22186000086365e-05, + "loss": 0.0, + "step": 40993 + }, + { + "epoch": 3.825137631799944, + "grad_norm": NaN, + "learning_rate": 9.221162000080596e-05, + "loss": 0.0, + "step": 40994 + }, + { + "epoch": 3.825230941494821, + "grad_norm": NaN, + "learning_rate": 9.22046401399141e-05, + "loss": 0.0, + "step": 40995 + }, + { + "epoch": 3.8253242511896985, + "grad_norm": NaN, + "learning_rate": 9.219766042597871e-05, + "loss": 0.0, + "step": 40996 + }, + { + "epoch": 3.825417560884576, + "grad_norm": NaN, + "learning_rate": 9.219068085901763e-05, + "loss": 0.0, + "step": 40997 + }, + { + "epoch": 3.8255108705794534, + "grad_norm": NaN, + "learning_rate": 9.218370143904847e-05, + "loss": 0.0, + "step": 40998 + }, + { + "epoch": 3.8256041802743304, + "grad_norm": NaN, + "learning_rate": 9.217672216608901e-05, + "loss": 0.0, + "step": 40999 + }, + { + "epoch": 3.8256974899692078, + "grad_norm": NaN, + "learning_rate": 9.216974304015708e-05, + "loss": 0.0, + "step": 41000 + }, + { + "epoch": 3.825790799664085, + "grad_norm": NaN, + "learning_rate": 9.216276406127034e-05, + "loss": 0.0, + "step": 41001 + }, + { + "epoch": 3.825884109358962, + "grad_norm": NaN, + "learning_rate": 9.21557852294465e-05, + "loss": 0.0, + "step": 41002 + }, + { + "epoch": 3.8259774190538396, + "grad_norm": NaN, + "learning_rate": 9.214880654470345e-05, + "loss": 0.0, + "step": 41003 + }, + { + "epoch": 3.826070728748717, + "grad_norm": NaN, + "learning_rate": 9.214182800705879e-05, + "loss": 0.0, + "step": 41004 + }, + { + "epoch": 3.8261640384435944, + "grad_norm": NaN, + "learning_rate": 9.213484961653032e-05, + "loss": 0.0, + "step": 41005 + }, + { + "epoch": 3.826257348138472, + "grad_norm": NaN, + "learning_rate": 9.212787137313583e-05, + "loss": 0.0, + "step": 41006 + }, + { + "epoch": 3.826350657833349, + "grad_norm": NaN, + "learning_rate": 9.2120893276893e-05, + "loss": 0.0, + "step": 41007 + }, + { + "epoch": 3.8264439675282262, + "grad_norm": NaN, + "learning_rate": 9.211391532781957e-05, + "loss": 0.0, + "step": 41008 + }, + { + "epoch": 3.826537277223103, + "grad_norm": NaN, + "learning_rate": 9.210693752593335e-05, + "loss": 0.0, + "step": 41009 + }, + { + "epoch": 3.8266305869179806, + "grad_norm": NaN, + "learning_rate": 9.209995987125202e-05, + "loss": 0.0, + "step": 41010 + }, + { + "epoch": 3.826723896612858, + "grad_norm": NaN, + "learning_rate": 9.20929823637933e-05, + "loss": 0.0, + "step": 41011 + }, + { + "epoch": 3.8268172063077355, + "grad_norm": NaN, + "learning_rate": 9.208600500357504e-05, + "loss": 0.0, + "step": 41012 + }, + { + "epoch": 3.826910516002613, + "grad_norm": NaN, + "learning_rate": 9.207902779061487e-05, + "loss": 0.0, + "step": 41013 + }, + { + "epoch": 3.82700382569749, + "grad_norm": NaN, + "learning_rate": 9.207205072493059e-05, + "loss": 0.0, + "step": 41014 + }, + { + "epoch": 3.8270971353923673, + "grad_norm": NaN, + "learning_rate": 9.206507380653992e-05, + "loss": 0.0, + "step": 41015 + }, + { + "epoch": 3.8271904450872447, + "grad_norm": NaN, + "learning_rate": 9.205809703546063e-05, + "loss": 0.0, + "step": 41016 + }, + { + "epoch": 3.8272837547821217, + "grad_norm": NaN, + "learning_rate": 9.205112041171038e-05, + "loss": 0.0, + "step": 41017 + }, + { + "epoch": 3.827377064476999, + "grad_norm": NaN, + "learning_rate": 9.204414393530701e-05, + "loss": 0.0, + "step": 41018 + }, + { + "epoch": 3.8274703741718765, + "grad_norm": NaN, + "learning_rate": 9.203716760626822e-05, + "loss": 0.0, + "step": 41019 + }, + { + "epoch": 3.827563683866754, + "grad_norm": NaN, + "learning_rate": 9.203019142461169e-05, + "loss": 0.0, + "step": 41020 + }, + { + "epoch": 3.827656993561631, + "grad_norm": NaN, + "learning_rate": 9.202321539035529e-05, + "loss": 0.0, + "step": 41021 + }, + { + "epoch": 3.8277503032565083, + "grad_norm": NaN, + "learning_rate": 9.201623950351662e-05, + "loss": 0.0, + "step": 41022 + }, + { + "epoch": 3.8278436129513858, + "grad_norm": NaN, + "learning_rate": 9.200926376411346e-05, + "loss": 0.0, + "step": 41023 + }, + { + "epoch": 3.8279369226462627, + "grad_norm": NaN, + "learning_rate": 9.200228817216363e-05, + "loss": 0.0, + "step": 41024 + }, + { + "epoch": 3.82803023234114, + "grad_norm": NaN, + "learning_rate": 9.199531272768476e-05, + "loss": 0.0, + "step": 41025 + }, + { + "epoch": 3.8281235420360176, + "grad_norm": NaN, + "learning_rate": 9.198833743069462e-05, + "loss": 0.0, + "step": 41026 + }, + { + "epoch": 3.828216851730895, + "grad_norm": NaN, + "learning_rate": 9.198136228121101e-05, + "loss": 0.0, + "step": 41027 + }, + { + "epoch": 3.8283101614257724, + "grad_norm": NaN, + "learning_rate": 9.197438727925157e-05, + "loss": 0.0, + "step": 41028 + }, + { + "epoch": 3.8284034711206494, + "grad_norm": NaN, + "learning_rate": 9.196741242483408e-05, + "loss": 0.0, + "step": 41029 + }, + { + "epoch": 3.828496780815527, + "grad_norm": NaN, + "learning_rate": 9.196043771797625e-05, + "loss": 0.0, + "step": 41030 + }, + { + "epoch": 3.8285900905104038, + "grad_norm": NaN, + "learning_rate": 9.195346315869581e-05, + "loss": 0.0, + "step": 41031 + }, + { + "epoch": 3.828683400205281, + "grad_norm": NaN, + "learning_rate": 9.194648874701063e-05, + "loss": 0.0, + "step": 41032 + }, + { + "epoch": 3.8287767099001586, + "grad_norm": NaN, + "learning_rate": 9.193951448293821e-05, + "loss": 0.0, + "step": 41033 + }, + { + "epoch": 3.828870019595036, + "grad_norm": NaN, + "learning_rate": 9.193254036649644e-05, + "loss": 0.0, + "step": 41034 + }, + { + "epoch": 3.8289633292899135, + "grad_norm": NaN, + "learning_rate": 9.192556639770307e-05, + "loss": 0.0, + "step": 41035 + }, + { + "epoch": 3.8290566389847904, + "grad_norm": NaN, + "learning_rate": 9.191859257657574e-05, + "loss": 0.0, + "step": 41036 + }, + { + "epoch": 3.829149948679668, + "grad_norm": NaN, + "learning_rate": 9.191161890313219e-05, + "loss": 0.0, + "step": 41037 + }, + { + "epoch": 3.8292432583745453, + "grad_norm": NaN, + "learning_rate": 9.190464537739025e-05, + "loss": 0.0, + "step": 41038 + }, + { + "epoch": 3.8293365680694222, + "grad_norm": NaN, + "learning_rate": 9.189767199936757e-05, + "loss": 0.0, + "step": 41039 + }, + { + "epoch": 3.8294298777642997, + "grad_norm": NaN, + "learning_rate": 9.189069876908187e-05, + "loss": 0.0, + "step": 41040 + }, + { + "epoch": 3.829523187459177, + "grad_norm": NaN, + "learning_rate": 9.188372568655097e-05, + "loss": 0.0, + "step": 41041 + }, + { + "epoch": 3.8296164971540545, + "grad_norm": NaN, + "learning_rate": 9.18767527517925e-05, + "loss": 0.0, + "step": 41042 + }, + { + "epoch": 3.8297098068489315, + "grad_norm": NaN, + "learning_rate": 9.186977996482418e-05, + "loss": 0.0, + "step": 41043 + }, + { + "epoch": 3.829803116543809, + "grad_norm": NaN, + "learning_rate": 9.186280732566389e-05, + "loss": 0.0, + "step": 41044 + }, + { + "epoch": 3.8298964262386863, + "grad_norm": NaN, + "learning_rate": 9.18558348343292e-05, + "loss": 0.0, + "step": 41045 + }, + { + "epoch": 3.8299897359335633, + "grad_norm": NaN, + "learning_rate": 9.184886249083788e-05, + "loss": 0.0, + "step": 41046 + }, + { + "epoch": 3.8300830456284407, + "grad_norm": NaN, + "learning_rate": 9.184189029520774e-05, + "loss": 0.0, + "step": 41047 + }, + { + "epoch": 3.830176355323318, + "grad_norm": NaN, + "learning_rate": 9.183491824745641e-05, + "loss": 0.0, + "step": 41048 + }, + { + "epoch": 3.8302696650181955, + "grad_norm": NaN, + "learning_rate": 9.182794634760163e-05, + "loss": 0.0, + "step": 41049 + }, + { + "epoch": 3.830362974713073, + "grad_norm": NaN, + "learning_rate": 9.182097459566121e-05, + "loss": 0.0, + "step": 41050 + }, + { + "epoch": 3.83045628440795, + "grad_norm": NaN, + "learning_rate": 9.18140029916528e-05, + "loss": 0.0, + "step": 41051 + }, + { + "epoch": 3.8305495941028274, + "grad_norm": NaN, + "learning_rate": 9.180703153559409e-05, + "loss": 0.0, + "step": 41052 + }, + { + "epoch": 3.8306429037977043, + "grad_norm": NaN, + "learning_rate": 9.180006022750293e-05, + "loss": 0.0, + "step": 41053 + }, + { + "epoch": 3.8307362134925818, + "grad_norm": NaN, + "learning_rate": 9.179308906739695e-05, + "loss": 0.0, + "step": 41054 + }, + { + "epoch": 3.830829523187459, + "grad_norm": NaN, + "learning_rate": 9.178611805529386e-05, + "loss": 0.0, + "step": 41055 + }, + { + "epoch": 3.8309228328823366, + "grad_norm": NaN, + "learning_rate": 9.177914719121149e-05, + "loss": 0.0, + "step": 41056 + }, + { + "epoch": 3.831016142577214, + "grad_norm": NaN, + "learning_rate": 9.177217647516749e-05, + "loss": 0.0, + "step": 41057 + }, + { + "epoch": 3.831109452272091, + "grad_norm": NaN, + "learning_rate": 9.176520590717954e-05, + "loss": 0.0, + "step": 41058 + }, + { + "epoch": 3.8312027619669684, + "grad_norm": NaN, + "learning_rate": 9.175823548726552e-05, + "loss": 0.0, + "step": 41059 + }, + { + "epoch": 3.831296071661846, + "grad_norm": NaN, + "learning_rate": 9.175126521544299e-05, + "loss": 0.0, + "step": 41060 + }, + { + "epoch": 3.831389381356723, + "grad_norm": NaN, + "learning_rate": 9.174429509172972e-05, + "loss": 0.0, + "step": 41061 + }, + { + "epoch": 3.8314826910516, + "grad_norm": NaN, + "learning_rate": 9.17373251161435e-05, + "loss": 0.0, + "step": 41062 + }, + { + "epoch": 3.8315760007464776, + "grad_norm": NaN, + "learning_rate": 9.173035528870199e-05, + "loss": 0.0, + "step": 41063 + }, + { + "epoch": 3.831669310441355, + "grad_norm": NaN, + "learning_rate": 9.17233856094229e-05, + "loss": 0.0, + "step": 41064 + }, + { + "epoch": 3.831762620136232, + "grad_norm": NaN, + "learning_rate": 9.171641607832402e-05, + "loss": 0.0, + "step": 41065 + }, + { + "epoch": 3.8318559298311095, + "grad_norm": NaN, + "learning_rate": 9.1709446695423e-05, + "loss": 0.0, + "step": 41066 + }, + { + "epoch": 3.831949239525987, + "grad_norm": NaN, + "learning_rate": 9.170247746073758e-05, + "loss": 0.0, + "step": 41067 + }, + { + "epoch": 3.832042549220864, + "grad_norm": NaN, + "learning_rate": 9.169550837428552e-05, + "loss": 0.0, + "step": 41068 + }, + { + "epoch": 3.8321358589157413, + "grad_norm": NaN, + "learning_rate": 9.168853943608447e-05, + "loss": 0.0, + "step": 41069 + }, + { + "epoch": 3.8322291686106187, + "grad_norm": NaN, + "learning_rate": 9.168157064615223e-05, + "loss": 0.0, + "step": 41070 + }, + { + "epoch": 3.832322478305496, + "grad_norm": NaN, + "learning_rate": 9.16746020045065e-05, + "loss": 0.0, + "step": 41071 + }, + { + "epoch": 3.832415788000373, + "grad_norm": NaN, + "learning_rate": 9.166763351116492e-05, + "loss": 0.0, + "step": 41072 + }, + { + "epoch": 3.8325090976952505, + "grad_norm": NaN, + "learning_rate": 9.166066516614528e-05, + "loss": 0.0, + "step": 41073 + }, + { + "epoch": 3.832602407390128, + "grad_norm": NaN, + "learning_rate": 9.165369696946535e-05, + "loss": 0.0, + "step": 41074 + }, + { + "epoch": 3.832695717085005, + "grad_norm": NaN, + "learning_rate": 9.164672892114271e-05, + "loss": 0.0, + "step": 41075 + }, + { + "epoch": 3.8327890267798823, + "grad_norm": NaN, + "learning_rate": 9.163976102119521e-05, + "loss": 0.0, + "step": 41076 + }, + { + "epoch": 3.8328823364747597, + "grad_norm": NaN, + "learning_rate": 9.163279326964048e-05, + "loss": 0.0, + "step": 41077 + }, + { + "epoch": 3.832975646169637, + "grad_norm": NaN, + "learning_rate": 9.162582566649625e-05, + "loss": 0.0, + "step": 41078 + }, + { + "epoch": 3.8330689558645146, + "grad_norm": NaN, + "learning_rate": 9.16188582117803e-05, + "loss": 0.0, + "step": 41079 + }, + { + "epoch": 3.8331622655593915, + "grad_norm": NaN, + "learning_rate": 9.161189090551027e-05, + "loss": 0.0, + "step": 41080 + }, + { + "epoch": 3.833255575254269, + "grad_norm": NaN, + "learning_rate": 9.160492374770388e-05, + "loss": 0.0, + "step": 41081 + }, + { + "epoch": 3.833348884949146, + "grad_norm": NaN, + "learning_rate": 9.159795673837893e-05, + "loss": 0.0, + "step": 41082 + }, + { + "epoch": 3.8334421946440234, + "grad_norm": NaN, + "learning_rate": 9.159098987755305e-05, + "loss": 0.0, + "step": 41083 + }, + { + "epoch": 3.833535504338901, + "grad_norm": NaN, + "learning_rate": 9.158402316524394e-05, + "loss": 0.0, + "step": 41084 + }, + { + "epoch": 3.833628814033778, + "grad_norm": NaN, + "learning_rate": 9.157705660146942e-05, + "loss": 0.0, + "step": 41085 + }, + { + "epoch": 3.8337221237286556, + "grad_norm": NaN, + "learning_rate": 9.157009018624713e-05, + "loss": 0.0, + "step": 41086 + }, + { + "epoch": 3.8338154334235326, + "grad_norm": NaN, + "learning_rate": 9.156312391959474e-05, + "loss": 0.0, + "step": 41087 + }, + { + "epoch": 3.83390874311841, + "grad_norm": NaN, + "learning_rate": 9.155615780153008e-05, + "loss": 0.0, + "step": 41088 + }, + { + "epoch": 3.8340020528132874, + "grad_norm": NaN, + "learning_rate": 9.154919183207075e-05, + "loss": 0.0, + "step": 41089 + }, + { + "epoch": 3.8340953625081644, + "grad_norm": NaN, + "learning_rate": 9.15422260112345e-05, + "loss": 0.0, + "step": 41090 + }, + { + "epoch": 3.834188672203042, + "grad_norm": NaN, + "learning_rate": 9.153526033903913e-05, + "loss": 0.0, + "step": 41091 + }, + { + "epoch": 3.8342819818979192, + "grad_norm": NaN, + "learning_rate": 9.152829481550223e-05, + "loss": 0.0, + "step": 41092 + }, + { + "epoch": 3.8343752915927967, + "grad_norm": NaN, + "learning_rate": 9.152132944064151e-05, + "loss": 0.0, + "step": 41093 + }, + { + "epoch": 3.8344686012876736, + "grad_norm": NaN, + "learning_rate": 9.151436421447481e-05, + "loss": 0.0, + "step": 41094 + }, + { + "epoch": 3.834561910982551, + "grad_norm": NaN, + "learning_rate": 9.15073991370197e-05, + "loss": 0.0, + "step": 41095 + }, + { + "epoch": 3.8346552206774285, + "grad_norm": NaN, + "learning_rate": 9.150043420829393e-05, + "loss": 0.0, + "step": 41096 + }, + { + "epoch": 3.8347485303723055, + "grad_norm": NaN, + "learning_rate": 9.14934694283153e-05, + "loss": 0.0, + "step": 41097 + }, + { + "epoch": 3.834841840067183, + "grad_norm": NaN, + "learning_rate": 9.14865047971014e-05, + "loss": 0.0, + "step": 41098 + }, + { + "epoch": 3.8349351497620603, + "grad_norm": NaN, + "learning_rate": 9.147954031466997e-05, + "loss": 0.0, + "step": 41099 + }, + { + "epoch": 3.8350284594569377, + "grad_norm": NaN, + "learning_rate": 9.147257598103879e-05, + "loss": 0.0, + "step": 41100 + }, + { + "epoch": 3.835121769151815, + "grad_norm": NaN, + "learning_rate": 9.146561179622546e-05, + "loss": 0.0, + "step": 41101 + }, + { + "epoch": 3.835215078846692, + "grad_norm": NaN, + "learning_rate": 9.145864776024773e-05, + "loss": 0.0, + "step": 41102 + }, + { + "epoch": 3.8353083885415695, + "grad_norm": NaN, + "learning_rate": 9.145168387312339e-05, + "loss": 0.0, + "step": 41103 + }, + { + "epoch": 3.8354016982364465, + "grad_norm": NaN, + "learning_rate": 9.144472013487e-05, + "loss": 0.0, + "step": 41104 + }, + { + "epoch": 3.835495007931324, + "grad_norm": NaN, + "learning_rate": 9.143775654550537e-05, + "loss": 0.0, + "step": 41105 + }, + { + "epoch": 3.8355883176262013, + "grad_norm": NaN, + "learning_rate": 9.143079310504718e-05, + "loss": 0.0, + "step": 41106 + }, + { + "epoch": 3.8356816273210788, + "grad_norm": NaN, + "learning_rate": 9.142382981351312e-05, + "loss": 0.0, + "step": 41107 + }, + { + "epoch": 3.835774937015956, + "grad_norm": NaN, + "learning_rate": 9.141686667092092e-05, + "loss": 0.0, + "step": 41108 + }, + { + "epoch": 3.835868246710833, + "grad_norm": NaN, + "learning_rate": 9.14099036772883e-05, + "loss": 0.0, + "step": 41109 + }, + { + "epoch": 3.8359615564057106, + "grad_norm": NaN, + "learning_rate": 9.14029408326329e-05, + "loss": 0.0, + "step": 41110 + }, + { + "epoch": 3.836054866100588, + "grad_norm": NaN, + "learning_rate": 9.139597813697248e-05, + "loss": 0.0, + "step": 41111 + }, + { + "epoch": 3.836148175795465, + "grad_norm": NaN, + "learning_rate": 9.138901559032474e-05, + "loss": 0.0, + "step": 41112 + }, + { + "epoch": 3.8362414854903424, + "grad_norm": NaN, + "learning_rate": 9.138205319270733e-05, + "loss": 0.0, + "step": 41113 + }, + { + "epoch": 3.83633479518522, + "grad_norm": NaN, + "learning_rate": 9.137509094413803e-05, + "loss": 0.0, + "step": 41114 + }, + { + "epoch": 3.8364281048800972, + "grad_norm": NaN, + "learning_rate": 9.136812884463453e-05, + "loss": 0.0, + "step": 41115 + }, + { + "epoch": 3.836521414574974, + "grad_norm": NaN, + "learning_rate": 9.136116689421444e-05, + "loss": 0.0, + "step": 41116 + }, + { + "epoch": 3.8366147242698516, + "grad_norm": NaN, + "learning_rate": 9.135420509289558e-05, + "loss": 0.0, + "step": 41117 + }, + { + "epoch": 3.836708033964729, + "grad_norm": NaN, + "learning_rate": 9.134724344069564e-05, + "loss": 0.0, + "step": 41118 + }, + { + "epoch": 3.836801343659606, + "grad_norm": NaN, + "learning_rate": 9.134028193763219e-05, + "loss": 0.0, + "step": 41119 + }, + { + "epoch": 3.8368946533544834, + "grad_norm": NaN, + "learning_rate": 9.133332058372312e-05, + "loss": 0.0, + "step": 41120 + }, + { + "epoch": 3.836987963049361, + "grad_norm": NaN, + "learning_rate": 9.1326359378986e-05, + "loss": 0.0, + "step": 41121 + }, + { + "epoch": 3.8370812727442383, + "grad_norm": NaN, + "learning_rate": 9.131939832343853e-05, + "loss": 0.0, + "step": 41122 + }, + { + "epoch": 3.8371745824391157, + "grad_norm": NaN, + "learning_rate": 9.131243741709851e-05, + "loss": 0.0, + "step": 41123 + }, + { + "epoch": 3.8372678921339927, + "grad_norm": NaN, + "learning_rate": 9.130547665998355e-05, + "loss": 0.0, + "step": 41124 + }, + { + "epoch": 3.83736120182887, + "grad_norm": NaN, + "learning_rate": 9.129851605211135e-05, + "loss": 0.0, + "step": 41125 + }, + { + "epoch": 3.837454511523747, + "grad_norm": NaN, + "learning_rate": 9.129155559349969e-05, + "loss": 0.0, + "step": 41126 + }, + { + "epoch": 3.8375478212186245, + "grad_norm": NaN, + "learning_rate": 9.128459528416619e-05, + "loss": 0.0, + "step": 41127 + }, + { + "epoch": 3.837641130913502, + "grad_norm": NaN, + "learning_rate": 9.127763512412853e-05, + "loss": 0.0, + "step": 41128 + }, + { + "epoch": 3.8377344406083793, + "grad_norm": NaN, + "learning_rate": 9.127067511340451e-05, + "loss": 0.0, + "step": 41129 + }, + { + "epoch": 3.8378277503032567, + "grad_norm": NaN, + "learning_rate": 9.126371525201174e-05, + "loss": 0.0, + "step": 41130 + }, + { + "epoch": 3.8379210599981337, + "grad_norm": NaN, + "learning_rate": 9.125675553996793e-05, + "loss": 0.0, + "step": 41131 + }, + { + "epoch": 3.838014369693011, + "grad_norm": NaN, + "learning_rate": 9.124979597729084e-05, + "loss": 0.0, + "step": 41132 + }, + { + "epoch": 3.8381076793878885, + "grad_norm": NaN, + "learning_rate": 9.124283656399805e-05, + "loss": 0.0, + "step": 41133 + }, + { + "epoch": 3.8382009890827655, + "grad_norm": NaN, + "learning_rate": 9.123587730010733e-05, + "loss": 0.0, + "step": 41134 + }, + { + "epoch": 3.838294298777643, + "grad_norm": NaN, + "learning_rate": 9.122891818563642e-05, + "loss": 0.0, + "step": 41135 + }, + { + "epoch": 3.8383876084725204, + "grad_norm": NaN, + "learning_rate": 9.122195922060293e-05, + "loss": 0.0, + "step": 41136 + }, + { + "epoch": 3.838480918167398, + "grad_norm": NaN, + "learning_rate": 9.121500040502455e-05, + "loss": 0.0, + "step": 41137 + }, + { + "epoch": 3.8385742278622748, + "grad_norm": NaN, + "learning_rate": 9.120804173891908e-05, + "loss": 0.0, + "step": 41138 + }, + { + "epoch": 3.838667537557152, + "grad_norm": NaN, + "learning_rate": 9.120108322230413e-05, + "loss": 0.0, + "step": 41139 + }, + { + "epoch": 3.8387608472520296, + "grad_norm": NaN, + "learning_rate": 9.119412485519734e-05, + "loss": 0.0, + "step": 41140 + }, + { + "epoch": 3.8388541569469066, + "grad_norm": NaN, + "learning_rate": 9.118716663761655e-05, + "loss": 0.0, + "step": 41141 + }, + { + "epoch": 3.838947466641784, + "grad_norm": NaN, + "learning_rate": 9.118020856957935e-05, + "loss": 0.0, + "step": 41142 + }, + { + "epoch": 3.8390407763366614, + "grad_norm": NaN, + "learning_rate": 9.117325065110342e-05, + "loss": 0.0, + "step": 41143 + }, + { + "epoch": 3.839134086031539, + "grad_norm": NaN, + "learning_rate": 9.116629288220654e-05, + "loss": 0.0, + "step": 41144 + }, + { + "epoch": 3.8392273957264162, + "grad_norm": NaN, + "learning_rate": 9.11593352629063e-05, + "loss": 0.0, + "step": 41145 + }, + { + "epoch": 3.8393207054212932, + "grad_norm": NaN, + "learning_rate": 9.115237779322047e-05, + "loss": 0.0, + "step": 41146 + }, + { + "epoch": 3.8394140151161706, + "grad_norm": NaN, + "learning_rate": 9.114542047316672e-05, + "loss": 0.0, + "step": 41147 + }, + { + "epoch": 3.8395073248110476, + "grad_norm": NaN, + "learning_rate": 9.113846330276273e-05, + "loss": 0.0, + "step": 41148 + }, + { + "epoch": 3.839600634505925, + "grad_norm": NaN, + "learning_rate": 9.113150628202616e-05, + "loss": 0.0, + "step": 41149 + }, + { + "epoch": 3.8396939442008025, + "grad_norm": NaN, + "learning_rate": 9.11245494109748e-05, + "loss": 0.0, + "step": 41150 + }, + { + "epoch": 3.83978725389568, + "grad_norm": NaN, + "learning_rate": 9.111759268962617e-05, + "loss": 0.0, + "step": 41151 + }, + { + "epoch": 3.8398805635905573, + "grad_norm": NaN, + "learning_rate": 9.111063611799812e-05, + "loss": 0.0, + "step": 41152 + }, + { + "epoch": 3.8399738732854343, + "grad_norm": NaN, + "learning_rate": 9.11036796961083e-05, + "loss": 0.0, + "step": 41153 + }, + { + "epoch": 3.8400671829803117, + "grad_norm": NaN, + "learning_rate": 9.109672342397431e-05, + "loss": 0.0, + "step": 41154 + }, + { + "epoch": 3.840160492675189, + "grad_norm": NaN, + "learning_rate": 9.108976730161396e-05, + "loss": 0.0, + "step": 41155 + }, + { + "epoch": 3.840253802370066, + "grad_norm": NaN, + "learning_rate": 9.108281132904487e-05, + "loss": 0.0, + "step": 41156 + }, + { + "epoch": 3.8403471120649435, + "grad_norm": NaN, + "learning_rate": 9.107585550628471e-05, + "loss": 0.0, + "step": 41157 + }, + { + "epoch": 3.840440421759821, + "grad_norm": NaN, + "learning_rate": 9.106889983335123e-05, + "loss": 0.0, + "step": 41158 + }, + { + "epoch": 3.8405337314546983, + "grad_norm": NaN, + "learning_rate": 9.106194431026208e-05, + "loss": 0.0, + "step": 41159 + }, + { + "epoch": 3.8406270411495753, + "grad_norm": NaN, + "learning_rate": 9.105498893703491e-05, + "loss": 0.0, + "step": 41160 + }, + { + "epoch": 3.8407203508444527, + "grad_norm": NaN, + "learning_rate": 9.104803371368747e-05, + "loss": 0.0, + "step": 41161 + }, + { + "epoch": 3.84081366053933, + "grad_norm": NaN, + "learning_rate": 9.104107864023745e-05, + "loss": 0.0, + "step": 41162 + }, + { + "epoch": 3.840906970234207, + "grad_norm": NaN, + "learning_rate": 9.103412371670243e-05, + "loss": 0.0, + "step": 41163 + }, + { + "epoch": 3.8410002799290845, + "grad_norm": NaN, + "learning_rate": 9.102716894310023e-05, + "loss": 0.0, + "step": 41164 + }, + { + "epoch": 3.841093589623962, + "grad_norm": NaN, + "learning_rate": 9.102021431944845e-05, + "loss": 0.0, + "step": 41165 + }, + { + "epoch": 3.8411868993188394, + "grad_norm": NaN, + "learning_rate": 9.101325984576476e-05, + "loss": 0.0, + "step": 41166 + }, + { + "epoch": 3.8412802090137164, + "grad_norm": NaN, + "learning_rate": 9.100630552206692e-05, + "loss": 0.0, + "step": 41167 + }, + { + "epoch": 3.841373518708594, + "grad_norm": NaN, + "learning_rate": 9.099935134837256e-05, + "loss": 0.0, + "step": 41168 + }, + { + "epoch": 3.841466828403471, + "grad_norm": NaN, + "learning_rate": 9.099239732469932e-05, + "loss": 0.0, + "step": 41169 + }, + { + "epoch": 3.841560138098348, + "grad_norm": NaN, + "learning_rate": 9.098544345106504e-05, + "loss": 0.0, + "step": 41170 + }, + { + "epoch": 3.8416534477932256, + "grad_norm": NaN, + "learning_rate": 9.097848972748721e-05, + "loss": 0.0, + "step": 41171 + }, + { + "epoch": 3.841746757488103, + "grad_norm": NaN, + "learning_rate": 9.09715361539836e-05, + "loss": 0.0, + "step": 41172 + }, + { + "epoch": 3.8418400671829804, + "grad_norm": NaN, + "learning_rate": 9.096458273057195e-05, + "loss": 0.0, + "step": 41173 + }, + { + "epoch": 3.841933376877858, + "grad_norm": NaN, + "learning_rate": 9.095762945726983e-05, + "loss": 0.0, + "step": 41174 + }, + { + "epoch": 3.842026686572735, + "grad_norm": NaN, + "learning_rate": 9.095067633409495e-05, + "loss": 0.0, + "step": 41175 + }, + { + "epoch": 3.8421199962676122, + "grad_norm": NaN, + "learning_rate": 9.094372336106507e-05, + "loss": 0.0, + "step": 41176 + }, + { + "epoch": 3.8422133059624897, + "grad_norm": NaN, + "learning_rate": 9.093677053819775e-05, + "loss": 0.0, + "step": 41177 + }, + { + "epoch": 3.8423066156573666, + "grad_norm": NaN, + "learning_rate": 9.092981786551075e-05, + "loss": 0.0, + "step": 41178 + }, + { + "epoch": 3.842399925352244, + "grad_norm": NaN, + "learning_rate": 9.092286534302173e-05, + "loss": 0.0, + "step": 41179 + }, + { + "epoch": 3.8424932350471215, + "grad_norm": NaN, + "learning_rate": 9.091591297074838e-05, + "loss": 0.0, + "step": 41180 + }, + { + "epoch": 3.842586544741999, + "grad_norm": NaN, + "learning_rate": 9.09089607487083e-05, + "loss": 0.0, + "step": 41181 + }, + { + "epoch": 3.842679854436876, + "grad_norm": NaN, + "learning_rate": 9.090200867691931e-05, + "loss": 0.0, + "step": 41182 + }, + { + "epoch": 3.8427731641317533, + "grad_norm": NaN, + "learning_rate": 9.089505675539893e-05, + "loss": 0.0, + "step": 41183 + }, + { + "epoch": 3.8428664738266307, + "grad_norm": NaN, + "learning_rate": 9.088810498416496e-05, + "loss": 0.0, + "step": 41184 + }, + { + "epoch": 3.8429597835215077, + "grad_norm": NaN, + "learning_rate": 9.088115336323502e-05, + "loss": 0.0, + "step": 41185 + }, + { + "epoch": 3.843053093216385, + "grad_norm": NaN, + "learning_rate": 9.087420189262677e-05, + "loss": 0.0, + "step": 41186 + }, + { + "epoch": 3.8431464029112625, + "grad_norm": NaN, + "learning_rate": 9.086725057235793e-05, + "loss": 0.0, + "step": 41187 + }, + { + "epoch": 3.84323971260614, + "grad_norm": NaN, + "learning_rate": 9.086029940244618e-05, + "loss": 0.0, + "step": 41188 + }, + { + "epoch": 3.843333022301017, + "grad_norm": NaN, + "learning_rate": 9.085334838290912e-05, + "loss": 0.0, + "step": 41189 + }, + { + "epoch": 3.8434263319958943, + "grad_norm": NaN, + "learning_rate": 9.08463975137645e-05, + "loss": 0.0, + "step": 41190 + }, + { + "epoch": 3.8435196416907718, + "grad_norm": NaN, + "learning_rate": 9.083944679503001e-05, + "loss": 0.0, + "step": 41191 + }, + { + "epoch": 3.8436129513856487, + "grad_norm": NaN, + "learning_rate": 9.08324962267232e-05, + "loss": 0.0, + "step": 41192 + }, + { + "epoch": 3.843706261080526, + "grad_norm": NaN, + "learning_rate": 9.082554580886189e-05, + "loss": 0.0, + "step": 41193 + }, + { + "epoch": 3.8437995707754036, + "grad_norm": NaN, + "learning_rate": 9.081859554146367e-05, + "loss": 0.0, + "step": 41194 + }, + { + "epoch": 3.843892880470281, + "grad_norm": NaN, + "learning_rate": 9.081164542454621e-05, + "loss": 0.0, + "step": 41195 + }, + { + "epoch": 3.8439861901651584, + "grad_norm": NaN, + "learning_rate": 9.080469545812722e-05, + "loss": 0.0, + "step": 41196 + }, + { + "epoch": 3.8440794998600354, + "grad_norm": NaN, + "learning_rate": 9.07977456422244e-05, + "loss": 0.0, + "step": 41197 + }, + { + "epoch": 3.844172809554913, + "grad_norm": NaN, + "learning_rate": 9.079079597685529e-05, + "loss": 0.0, + "step": 41198 + }, + { + "epoch": 3.84426611924979, + "grad_norm": NaN, + "learning_rate": 9.078384646203768e-05, + "loss": 0.0, + "step": 41199 + }, + { + "epoch": 3.844359428944667, + "grad_norm": NaN, + "learning_rate": 9.077689709778924e-05, + "loss": 0.0, + "step": 41200 + }, + { + "epoch": 3.8444527386395446, + "grad_norm": NaN, + "learning_rate": 9.076994788412756e-05, + "loss": 0.0, + "step": 41201 + }, + { + "epoch": 3.844546048334422, + "grad_norm": NaN, + "learning_rate": 9.076299882107039e-05, + "loss": 0.0, + "step": 41202 + }, + { + "epoch": 3.8446393580292995, + "grad_norm": NaN, + "learning_rate": 9.075604990863537e-05, + "loss": 0.0, + "step": 41203 + }, + { + "epoch": 3.8447326677241764, + "grad_norm": NaN, + "learning_rate": 9.074910114684013e-05, + "loss": 0.0, + "step": 41204 + }, + { + "epoch": 3.844825977419054, + "grad_norm": NaN, + "learning_rate": 9.074215253570239e-05, + "loss": 0.0, + "step": 41205 + }, + { + "epoch": 3.8449192871139313, + "grad_norm": NaN, + "learning_rate": 9.073520407523983e-05, + "loss": 0.0, + "step": 41206 + }, + { + "epoch": 3.8450125968088082, + "grad_norm": NaN, + "learning_rate": 9.072825576547004e-05, + "loss": 0.0, + "step": 41207 + }, + { + "epoch": 3.8451059065036857, + "grad_norm": NaN, + "learning_rate": 9.072130760641078e-05, + "loss": 0.0, + "step": 41208 + }, + { + "epoch": 3.845199216198563, + "grad_norm": NaN, + "learning_rate": 9.071435959807968e-05, + "loss": 0.0, + "step": 41209 + }, + { + "epoch": 3.8452925258934405, + "grad_norm": NaN, + "learning_rate": 9.070741174049436e-05, + "loss": 0.0, + "step": 41210 + }, + { + "epoch": 3.8453858355883175, + "grad_norm": NaN, + "learning_rate": 9.070046403367258e-05, + "loss": 0.0, + "step": 41211 + }, + { + "epoch": 3.845479145283195, + "grad_norm": NaN, + "learning_rate": 9.069351647763192e-05, + "loss": 0.0, + "step": 41212 + }, + { + "epoch": 3.8455724549780723, + "grad_norm": NaN, + "learning_rate": 9.068656907239005e-05, + "loss": 0.0, + "step": 41213 + }, + { + "epoch": 3.8456657646729493, + "grad_norm": NaN, + "learning_rate": 9.067962181796474e-05, + "loss": 0.0, + "step": 41214 + }, + { + "epoch": 3.8457590743678267, + "grad_norm": NaN, + "learning_rate": 9.067267471437355e-05, + "loss": 0.0, + "step": 41215 + }, + { + "epoch": 3.845852384062704, + "grad_norm": NaN, + "learning_rate": 9.066572776163412e-05, + "loss": 0.0, + "step": 41216 + }, + { + "epoch": 3.8459456937575816, + "grad_norm": NaN, + "learning_rate": 9.065878095976425e-05, + "loss": 0.0, + "step": 41217 + }, + { + "epoch": 3.846039003452459, + "grad_norm": NaN, + "learning_rate": 9.065183430878148e-05, + "loss": 0.0, + "step": 41218 + }, + { + "epoch": 3.846132313147336, + "grad_norm": NaN, + "learning_rate": 9.064488780870349e-05, + "loss": 0.0, + "step": 41219 + }, + { + "epoch": 3.8462256228422134, + "grad_norm": NaN, + "learning_rate": 9.063794145954804e-05, + "loss": 0.0, + "step": 41220 + }, + { + "epoch": 3.8463189325370903, + "grad_norm": NaN, + "learning_rate": 9.063099526133264e-05, + "loss": 0.0, + "step": 41221 + }, + { + "epoch": 3.8464122422319678, + "grad_norm": NaN, + "learning_rate": 9.062404921407508e-05, + "loss": 0.0, + "step": 41222 + }, + { + "epoch": 3.846505551926845, + "grad_norm": NaN, + "learning_rate": 9.061710331779298e-05, + "loss": 0.0, + "step": 41223 + }, + { + "epoch": 3.8465988616217226, + "grad_norm": NaN, + "learning_rate": 9.061015757250396e-05, + "loss": 0.0, + "step": 41224 + }, + { + "epoch": 3.8466921713166, + "grad_norm": NaN, + "learning_rate": 9.060321197822573e-05, + "loss": 0.0, + "step": 41225 + }, + { + "epoch": 3.846785481011477, + "grad_norm": NaN, + "learning_rate": 9.059626653497597e-05, + "loss": 0.0, + "step": 41226 + }, + { + "epoch": 3.8468787907063544, + "grad_norm": NaN, + "learning_rate": 9.058932124277225e-05, + "loss": 0.0, + "step": 41227 + }, + { + "epoch": 3.846972100401232, + "grad_norm": NaN, + "learning_rate": 9.058237610163232e-05, + "loss": 0.0, + "step": 41228 + }, + { + "epoch": 3.847065410096109, + "grad_norm": NaN, + "learning_rate": 9.057543111157385e-05, + "loss": 0.0, + "step": 41229 + }, + { + "epoch": 3.8471587197909862, + "grad_norm": NaN, + "learning_rate": 9.056848627261437e-05, + "loss": 0.0, + "step": 41230 + }, + { + "epoch": 3.8472520294858636, + "grad_norm": NaN, + "learning_rate": 9.056154158477167e-05, + "loss": 0.0, + "step": 41231 + }, + { + "epoch": 3.847345339180741, + "grad_norm": NaN, + "learning_rate": 9.055459704806339e-05, + "loss": 0.0, + "step": 41232 + }, + { + "epoch": 3.847438648875618, + "grad_norm": NaN, + "learning_rate": 9.05476526625071e-05, + "loss": 0.0, + "step": 41233 + }, + { + "epoch": 3.8475319585704955, + "grad_norm": NaN, + "learning_rate": 9.054070842812054e-05, + "loss": 0.0, + "step": 41234 + }, + { + "epoch": 3.847625268265373, + "grad_norm": NaN, + "learning_rate": 9.053376434492139e-05, + "loss": 0.0, + "step": 41235 + }, + { + "epoch": 3.84771857796025, + "grad_norm": NaN, + "learning_rate": 9.052682041292722e-05, + "loss": 0.0, + "step": 41236 + }, + { + "epoch": 3.8478118876551273, + "grad_norm": NaN, + "learning_rate": 9.05198766321557e-05, + "loss": 0.0, + "step": 41237 + }, + { + "epoch": 3.8479051973500047, + "grad_norm": NaN, + "learning_rate": 9.051293300262458e-05, + "loss": 0.0, + "step": 41238 + }, + { + "epoch": 3.847998507044882, + "grad_norm": NaN, + "learning_rate": 9.05059895243514e-05, + "loss": 0.0, + "step": 41239 + }, + { + "epoch": 3.8480918167397595, + "grad_norm": NaN, + "learning_rate": 9.049904619735389e-05, + "loss": 0.0, + "step": 41240 + }, + { + "epoch": 3.8481851264346365, + "grad_norm": NaN, + "learning_rate": 9.049210302164971e-05, + "loss": 0.0, + "step": 41241 + }, + { + "epoch": 3.848278436129514, + "grad_norm": NaN, + "learning_rate": 9.048515999725641e-05, + "loss": 0.0, + "step": 41242 + }, + { + "epoch": 3.848371745824391, + "grad_norm": NaN, + "learning_rate": 9.047821712419176e-05, + "loss": 0.0, + "step": 41243 + }, + { + "epoch": 3.8484650555192683, + "grad_norm": NaN, + "learning_rate": 9.047127440247341e-05, + "loss": 0.0, + "step": 41244 + }, + { + "epoch": 3.8485583652141457, + "grad_norm": NaN, + "learning_rate": 9.046433183211889e-05, + "loss": 0.0, + "step": 41245 + }, + { + "epoch": 3.848651674909023, + "grad_norm": NaN, + "learning_rate": 9.0457389413146e-05, + "loss": 0.0, + "step": 41246 + }, + { + "epoch": 3.8487449846039006, + "grad_norm": NaN, + "learning_rate": 9.045044714557232e-05, + "loss": 0.0, + "step": 41247 + }, + { + "epoch": 3.8488382942987776, + "grad_norm": NaN, + "learning_rate": 9.044350502941549e-05, + "loss": 0.0, + "step": 41248 + }, + { + "epoch": 3.848931603993655, + "grad_norm": NaN, + "learning_rate": 9.04365630646932e-05, + "loss": 0.0, + "step": 41249 + }, + { + "epoch": 3.8490249136885324, + "grad_norm": NaN, + "learning_rate": 9.042962125142312e-05, + "loss": 0.0, + "step": 41250 + }, + { + "epoch": 3.8491182233834094, + "grad_norm": NaN, + "learning_rate": 9.04226795896228e-05, + "loss": 0.0, + "step": 41251 + }, + { + "epoch": 3.849211533078287, + "grad_norm": NaN, + "learning_rate": 9.041573807931e-05, + "loss": 0.0, + "step": 41252 + }, + { + "epoch": 3.849304842773164, + "grad_norm": NaN, + "learning_rate": 9.040879672050235e-05, + "loss": 0.0, + "step": 41253 + }, + { + "epoch": 3.8493981524680416, + "grad_norm": NaN, + "learning_rate": 9.04018555132174e-05, + "loss": 0.0, + "step": 41254 + }, + { + "epoch": 3.8494914621629186, + "grad_norm": NaN, + "learning_rate": 9.039491445747296e-05, + "loss": 0.0, + "step": 41255 + }, + { + "epoch": 3.849584771857796, + "grad_norm": NaN, + "learning_rate": 9.038797355328653e-05, + "loss": 0.0, + "step": 41256 + }, + { + "epoch": 3.8496780815526734, + "grad_norm": NaN, + "learning_rate": 9.038103280067587e-05, + "loss": 0.0, + "step": 41257 + }, + { + "epoch": 3.8497713912475504, + "grad_norm": NaN, + "learning_rate": 9.037409219965859e-05, + "loss": 0.0, + "step": 41258 + }, + { + "epoch": 3.849864700942428, + "grad_norm": NaN, + "learning_rate": 9.036715175025228e-05, + "loss": 0.0, + "step": 41259 + }, + { + "epoch": 3.8499580106373053, + "grad_norm": NaN, + "learning_rate": 9.036021145247468e-05, + "loss": 0.0, + "step": 41260 + }, + { + "epoch": 3.8500513203321827, + "grad_norm": NaN, + "learning_rate": 9.03532713063434e-05, + "loss": 0.0, + "step": 41261 + }, + { + "epoch": 3.85014463002706, + "grad_norm": NaN, + "learning_rate": 9.034633131187604e-05, + "loss": 0.0, + "step": 41262 + }, + { + "epoch": 3.850237939721937, + "grad_norm": NaN, + "learning_rate": 9.033939146909032e-05, + "loss": 0.0, + "step": 41263 + }, + { + "epoch": 3.8503312494168145, + "grad_norm": NaN, + "learning_rate": 9.033245177800388e-05, + "loss": 0.0, + "step": 41264 + }, + { + "epoch": 3.8504245591116915, + "grad_norm": NaN, + "learning_rate": 9.032551223863428e-05, + "loss": 0.0, + "step": 41265 + }, + { + "epoch": 3.850517868806569, + "grad_norm": NaN, + "learning_rate": 9.031857285099926e-05, + "loss": 0.0, + "step": 41266 + }, + { + "epoch": 3.8506111785014463, + "grad_norm": NaN, + "learning_rate": 9.031163361511645e-05, + "loss": 0.0, + "step": 41267 + }, + { + "epoch": 3.8507044881963237, + "grad_norm": NaN, + "learning_rate": 9.030469453100344e-05, + "loss": 0.0, + "step": 41268 + }, + { + "epoch": 3.850797797891201, + "grad_norm": NaN, + "learning_rate": 9.029775559867791e-05, + "loss": 0.0, + "step": 41269 + }, + { + "epoch": 3.850891107586078, + "grad_norm": NaN, + "learning_rate": 9.029081681815755e-05, + "loss": 0.0, + "step": 41270 + }, + { + "epoch": 3.8509844172809555, + "grad_norm": NaN, + "learning_rate": 9.028387818945988e-05, + "loss": 0.0, + "step": 41271 + }, + { + "epoch": 3.851077726975833, + "grad_norm": NaN, + "learning_rate": 9.027693971260267e-05, + "loss": 0.0, + "step": 41272 + }, + { + "epoch": 3.85117103667071, + "grad_norm": NaN, + "learning_rate": 9.027000138760352e-05, + "loss": 0.0, + "step": 41273 + }, + { + "epoch": 3.8512643463655873, + "grad_norm": NaN, + "learning_rate": 9.026306321448001e-05, + "loss": 0.0, + "step": 41274 + }, + { + "epoch": 3.8513576560604648, + "grad_norm": NaN, + "learning_rate": 9.025612519324986e-05, + "loss": 0.0, + "step": 41275 + }, + { + "epoch": 3.851450965755342, + "grad_norm": NaN, + "learning_rate": 9.024918732393074e-05, + "loss": 0.0, + "step": 41276 + }, + { + "epoch": 3.851544275450219, + "grad_norm": NaN, + "learning_rate": 9.024224960654014e-05, + "loss": 0.0, + "step": 41277 + }, + { + "epoch": 3.8516375851450966, + "grad_norm": NaN, + "learning_rate": 9.023531204109586e-05, + "loss": 0.0, + "step": 41278 + }, + { + "epoch": 3.851730894839974, + "grad_norm": NaN, + "learning_rate": 9.02283746276155e-05, + "loss": 0.0, + "step": 41279 + }, + { + "epoch": 3.851824204534851, + "grad_norm": NaN, + "learning_rate": 9.02214373661166e-05, + "loss": 0.0, + "step": 41280 + }, + { + "epoch": 3.8519175142297284, + "grad_norm": NaN, + "learning_rate": 9.021450025661692e-05, + "loss": 0.0, + "step": 41281 + }, + { + "epoch": 3.852010823924606, + "grad_norm": NaN, + "learning_rate": 9.02075632991341e-05, + "loss": 0.0, + "step": 41282 + }, + { + "epoch": 3.8521041336194832, + "grad_norm": NaN, + "learning_rate": 9.020062649368565e-05, + "loss": 0.0, + "step": 41283 + }, + { + "epoch": 3.85219744331436, + "grad_norm": NaN, + "learning_rate": 9.019368984028932e-05, + "loss": 0.0, + "step": 41284 + }, + { + "epoch": 3.8522907530092376, + "grad_norm": NaN, + "learning_rate": 9.018675333896276e-05, + "loss": 0.0, + "step": 41285 + }, + { + "epoch": 3.852384062704115, + "grad_norm": NaN, + "learning_rate": 9.017981698972352e-05, + "loss": 0.0, + "step": 41286 + }, + { + "epoch": 3.852477372398992, + "grad_norm": NaN, + "learning_rate": 9.017288079258933e-05, + "loss": 0.0, + "step": 41287 + }, + { + "epoch": 3.8525706820938694, + "grad_norm": NaN, + "learning_rate": 9.016594474757779e-05, + "loss": 0.0, + "step": 41288 + }, + { + "epoch": 3.852663991788747, + "grad_norm": NaN, + "learning_rate": 9.015900885470645e-05, + "loss": 0.0, + "step": 41289 + }, + { + "epoch": 3.8527573014836243, + "grad_norm": NaN, + "learning_rate": 9.015207311399305e-05, + "loss": 0.0, + "step": 41290 + }, + { + "epoch": 3.8528506111785017, + "grad_norm": NaN, + "learning_rate": 9.014513752545528e-05, + "loss": 0.0, + "step": 41291 + }, + { + "epoch": 3.8529439208733787, + "grad_norm": NaN, + "learning_rate": 9.01382020891106e-05, + "loss": 0.0, + "step": 41292 + }, + { + "epoch": 3.853037230568256, + "grad_norm": NaN, + "learning_rate": 9.013126680497675e-05, + "loss": 0.0, + "step": 41293 + }, + { + "epoch": 3.853130540263133, + "grad_norm": NaN, + "learning_rate": 9.012433167307142e-05, + "loss": 0.0, + "step": 41294 + }, + { + "epoch": 3.8532238499580105, + "grad_norm": NaN, + "learning_rate": 9.011739669341212e-05, + "loss": 0.0, + "step": 41295 + }, + { + "epoch": 3.853317159652888, + "grad_norm": NaN, + "learning_rate": 9.011046186601655e-05, + "loss": 0.0, + "step": 41296 + }, + { + "epoch": 3.8534104693477653, + "grad_norm": NaN, + "learning_rate": 9.010352719090238e-05, + "loss": 0.0, + "step": 41297 + }, + { + "epoch": 3.8535037790426427, + "grad_norm": NaN, + "learning_rate": 9.009659266808716e-05, + "loss": 0.0, + "step": 41298 + }, + { + "epoch": 3.8535970887375197, + "grad_norm": NaN, + "learning_rate": 9.00896582975886e-05, + "loss": 0.0, + "step": 41299 + }, + { + "epoch": 3.853690398432397, + "grad_norm": NaN, + "learning_rate": 9.008272407942421e-05, + "loss": 0.0, + "step": 41300 + }, + { + "epoch": 3.8537837081272746, + "grad_norm": NaN, + "learning_rate": 9.007579001361176e-05, + "loss": 0.0, + "step": 41301 + }, + { + "epoch": 3.8538770178221515, + "grad_norm": NaN, + "learning_rate": 9.006885610016884e-05, + "loss": 0.0, + "step": 41302 + }, + { + "epoch": 3.853970327517029, + "grad_norm": NaN, + "learning_rate": 9.006192233911301e-05, + "loss": 0.0, + "step": 41303 + }, + { + "epoch": 3.8540636372119064, + "grad_norm": NaN, + "learning_rate": 9.0054988730462e-05, + "loss": 0.0, + "step": 41304 + }, + { + "epoch": 3.854156946906784, + "grad_norm": NaN, + "learning_rate": 9.004805527423341e-05, + "loss": 0.0, + "step": 41305 + }, + { + "epoch": 3.8542502566016608, + "grad_norm": NaN, + "learning_rate": 9.004112197044479e-05, + "loss": 0.0, + "step": 41306 + }, + { + "epoch": 3.854343566296538, + "grad_norm": NaN, + "learning_rate": 9.003418881911387e-05, + "loss": 0.0, + "step": 41307 + }, + { + "epoch": 3.8544368759914156, + "grad_norm": NaN, + "learning_rate": 9.00272558202583e-05, + "loss": 0.0, + "step": 41308 + }, + { + "epoch": 3.8545301856862926, + "grad_norm": NaN, + "learning_rate": 9.002032297389556e-05, + "loss": 0.0, + "step": 41309 + }, + { + "epoch": 3.85462349538117, + "grad_norm": NaN, + "learning_rate": 9.001339028004341e-05, + "loss": 0.0, + "step": 41310 + }, + { + "epoch": 3.8547168050760474, + "grad_norm": NaN, + "learning_rate": 9.000645773871948e-05, + "loss": 0.0, + "step": 41311 + }, + { + "epoch": 3.854810114770925, + "grad_norm": NaN, + "learning_rate": 8.999952534994129e-05, + "loss": 0.0, + "step": 41312 + }, + { + "epoch": 3.8549034244658023, + "grad_norm": NaN, + "learning_rate": 8.999259311372656e-05, + "loss": 0.0, + "step": 41313 + }, + { + "epoch": 3.8549967341606792, + "grad_norm": NaN, + "learning_rate": 8.998566103009294e-05, + "loss": 0.0, + "step": 41314 + }, + { + "epoch": 3.8550900438555566, + "grad_norm": NaN, + "learning_rate": 8.997872909905793e-05, + "loss": 0.0, + "step": 41315 + }, + { + "epoch": 3.8551833535504336, + "grad_norm": NaN, + "learning_rate": 8.997179732063926e-05, + "loss": 0.0, + "step": 41316 + }, + { + "epoch": 3.855276663245311, + "grad_norm": NaN, + "learning_rate": 8.996486569485457e-05, + "loss": 0.0, + "step": 41317 + }, + { + "epoch": 3.8553699729401885, + "grad_norm": NaN, + "learning_rate": 8.995793422172138e-05, + "loss": 0.0, + "step": 41318 + }, + { + "epoch": 3.855463282635066, + "grad_norm": NaN, + "learning_rate": 8.995100290125738e-05, + "loss": 0.0, + "step": 41319 + }, + { + "epoch": 3.8555565923299433, + "grad_norm": NaN, + "learning_rate": 8.994407173348024e-05, + "loss": 0.0, + "step": 41320 + }, + { + "epoch": 3.8556499020248203, + "grad_norm": NaN, + "learning_rate": 8.993714071840748e-05, + "loss": 0.0, + "step": 41321 + }, + { + "epoch": 3.8557432117196977, + "grad_norm": NaN, + "learning_rate": 8.993020985605681e-05, + "loss": 0.0, + "step": 41322 + }, + { + "epoch": 3.855836521414575, + "grad_norm": NaN, + "learning_rate": 8.992327914644585e-05, + "loss": 0.0, + "step": 41323 + }, + { + "epoch": 3.855929831109452, + "grad_norm": NaN, + "learning_rate": 8.991634858959215e-05, + "loss": 0.0, + "step": 41324 + }, + { + "epoch": 3.8560231408043295, + "grad_norm": NaN, + "learning_rate": 8.99094181855134e-05, + "loss": 0.0, + "step": 41325 + }, + { + "epoch": 3.856116450499207, + "grad_norm": NaN, + "learning_rate": 8.990248793422722e-05, + "loss": 0.0, + "step": 41326 + }, + { + "epoch": 3.8562097601940843, + "grad_norm": NaN, + "learning_rate": 8.989555783575114e-05, + "loss": 0.0, + "step": 41327 + }, + { + "epoch": 3.8563030698889613, + "grad_norm": NaN, + "learning_rate": 8.988862789010287e-05, + "loss": 0.0, + "step": 41328 + }, + { + "epoch": 3.8563963795838387, + "grad_norm": NaN, + "learning_rate": 8.988169809730009e-05, + "loss": 0.0, + "step": 41329 + }, + { + "epoch": 3.856489689278716, + "grad_norm": NaN, + "learning_rate": 8.987476845736026e-05, + "loss": 0.0, + "step": 41330 + }, + { + "epoch": 3.856582998973593, + "grad_norm": NaN, + "learning_rate": 8.98678389703011e-05, + "loss": 0.0, + "step": 41331 + }, + { + "epoch": 3.8566763086684706, + "grad_norm": NaN, + "learning_rate": 8.986090963614025e-05, + "loss": 0.0, + "step": 41332 + }, + { + "epoch": 3.856769618363348, + "grad_norm": NaN, + "learning_rate": 8.985398045489529e-05, + "loss": 0.0, + "step": 41333 + }, + { + "epoch": 3.8568629280582254, + "grad_norm": NaN, + "learning_rate": 8.984705142658377e-05, + "loss": 0.0, + "step": 41334 + }, + { + "epoch": 3.856956237753103, + "grad_norm": NaN, + "learning_rate": 8.984012255122346e-05, + "loss": 0.0, + "step": 41335 + }, + { + "epoch": 3.85704954744798, + "grad_norm": NaN, + "learning_rate": 8.983319382883188e-05, + "loss": 0.0, + "step": 41336 + }, + { + "epoch": 3.857142857142857, + "grad_norm": NaN, + "learning_rate": 8.982626525942662e-05, + "loss": 0.0, + "step": 41337 + }, + { + "epoch": 3.857236166837734, + "grad_norm": NaN, + "learning_rate": 8.981933684302541e-05, + "loss": 0.0, + "step": 41338 + }, + { + "epoch": 3.8573294765326116, + "grad_norm": NaN, + "learning_rate": 8.981240857964576e-05, + "loss": 0.0, + "step": 41339 + }, + { + "epoch": 3.857422786227489, + "grad_norm": NaN, + "learning_rate": 8.98054804693053e-05, + "loss": 0.0, + "step": 41340 + }, + { + "epoch": 3.8575160959223664, + "grad_norm": NaN, + "learning_rate": 8.979855251202175e-05, + "loss": 0.0, + "step": 41341 + }, + { + "epoch": 3.857609405617244, + "grad_norm": NaN, + "learning_rate": 8.97916247078126e-05, + "loss": 0.0, + "step": 41342 + }, + { + "epoch": 3.857702715312121, + "grad_norm": NaN, + "learning_rate": 8.978469705669556e-05, + "loss": 0.0, + "step": 41343 + }, + { + "epoch": 3.8577960250069983, + "grad_norm": NaN, + "learning_rate": 8.977776955868811e-05, + "loss": 0.0, + "step": 41344 + }, + { + "epoch": 3.8578893347018757, + "grad_norm": NaN, + "learning_rate": 8.977084221380799e-05, + "loss": 0.0, + "step": 41345 + }, + { + "epoch": 3.8579826443967526, + "grad_norm": NaN, + "learning_rate": 8.976391502207283e-05, + "loss": 0.0, + "step": 41346 + }, + { + "epoch": 3.85807595409163, + "grad_norm": NaN, + "learning_rate": 8.97569879835001e-05, + "loss": 0.0, + "step": 41347 + }, + { + "epoch": 3.8581692637865075, + "grad_norm": NaN, + "learning_rate": 8.975006109810756e-05, + "loss": 0.0, + "step": 41348 + }, + { + "epoch": 3.858262573481385, + "grad_norm": NaN, + "learning_rate": 8.974313436591279e-05, + "loss": 0.0, + "step": 41349 + }, + { + "epoch": 3.858355883176262, + "grad_norm": NaN, + "learning_rate": 8.97362077869333e-05, + "loss": 0.0, + "step": 41350 + }, + { + "epoch": 3.8584491928711393, + "grad_norm": NaN, + "learning_rate": 8.972928136118684e-05, + "loss": 0.0, + "step": 41351 + }, + { + "epoch": 3.8585425025660167, + "grad_norm": NaN, + "learning_rate": 8.972235508869099e-05, + "loss": 0.0, + "step": 41352 + }, + { + "epoch": 3.8586358122608937, + "grad_norm": NaN, + "learning_rate": 8.971542896946325e-05, + "loss": 0.0, + "step": 41353 + }, + { + "epoch": 3.858729121955771, + "grad_norm": NaN, + "learning_rate": 8.970850300352138e-05, + "loss": 0.0, + "step": 41354 + }, + { + "epoch": 3.8588224316506485, + "grad_norm": NaN, + "learning_rate": 8.970157719088294e-05, + "loss": 0.0, + "step": 41355 + }, + { + "epoch": 3.858915741345526, + "grad_norm": NaN, + "learning_rate": 8.969465153156546e-05, + "loss": 0.0, + "step": 41356 + }, + { + "epoch": 3.8590090510404034, + "grad_norm": NaN, + "learning_rate": 8.968772602558667e-05, + "loss": 0.0, + "step": 41357 + }, + { + "epoch": 3.8591023607352803, + "grad_norm": NaN, + "learning_rate": 8.968080067296414e-05, + "loss": 0.0, + "step": 41358 + }, + { + "epoch": 3.8591956704301578, + "grad_norm": NaN, + "learning_rate": 8.967387547371542e-05, + "loss": 0.0, + "step": 41359 + }, + { + "epoch": 3.8592889801250347, + "grad_norm": NaN, + "learning_rate": 8.966695042785817e-05, + "loss": 0.0, + "step": 41360 + }, + { + "epoch": 3.859382289819912, + "grad_norm": NaN, + "learning_rate": 8.966002553541003e-05, + "loss": 0.0, + "step": 41361 + }, + { + "epoch": 3.8594755995147896, + "grad_norm": NaN, + "learning_rate": 8.965310079638853e-05, + "loss": 0.0, + "step": 41362 + }, + { + "epoch": 3.859568909209667, + "grad_norm": NaN, + "learning_rate": 8.964617621081133e-05, + "loss": 0.0, + "step": 41363 + }, + { + "epoch": 3.8596622189045444, + "grad_norm": NaN, + "learning_rate": 8.963925177869606e-05, + "loss": 0.0, + "step": 41364 + }, + { + "epoch": 3.8597555285994214, + "grad_norm": NaN, + "learning_rate": 8.963232750006025e-05, + "loss": 0.0, + "step": 41365 + }, + { + "epoch": 3.859848838294299, + "grad_norm": NaN, + "learning_rate": 8.962540337492153e-05, + "loss": 0.0, + "step": 41366 + }, + { + "epoch": 3.8599421479891762, + "grad_norm": NaN, + "learning_rate": 8.961847940329763e-05, + "loss": 0.0, + "step": 41367 + }, + { + "epoch": 3.860035457684053, + "grad_norm": NaN, + "learning_rate": 8.961155558520594e-05, + "loss": 0.0, + "step": 41368 + }, + { + "epoch": 3.8601287673789306, + "grad_norm": NaN, + "learning_rate": 8.960463192066418e-05, + "loss": 0.0, + "step": 41369 + }, + { + "epoch": 3.860222077073808, + "grad_norm": NaN, + "learning_rate": 8.959770840969002e-05, + "loss": 0.0, + "step": 41370 + }, + { + "epoch": 3.8603153867686855, + "grad_norm": NaN, + "learning_rate": 8.959078505230097e-05, + "loss": 0.0, + "step": 41371 + }, + { + "epoch": 3.8604086964635624, + "grad_norm": NaN, + "learning_rate": 8.958386184851463e-05, + "loss": 0.0, + "step": 41372 + }, + { + "epoch": 3.86050200615844, + "grad_norm": NaN, + "learning_rate": 8.957693879834867e-05, + "loss": 0.0, + "step": 41373 + }, + { + "epoch": 3.8605953158533173, + "grad_norm": NaN, + "learning_rate": 8.957001590182064e-05, + "loss": 0.0, + "step": 41374 + }, + { + "epoch": 3.8606886255481943, + "grad_norm": NaN, + "learning_rate": 8.956309315894814e-05, + "loss": 0.0, + "step": 41375 + }, + { + "epoch": 3.8607819352430717, + "grad_norm": NaN, + "learning_rate": 8.955617056974882e-05, + "loss": 0.0, + "step": 41376 + }, + { + "epoch": 3.860875244937949, + "grad_norm": NaN, + "learning_rate": 8.954924813424025e-05, + "loss": 0.0, + "step": 41377 + }, + { + "epoch": 3.8609685546328265, + "grad_norm": NaN, + "learning_rate": 8.954232585244e-05, + "loss": 0.0, + "step": 41378 + }, + { + "epoch": 3.8610618643277035, + "grad_norm": NaN, + "learning_rate": 8.953540372436576e-05, + "loss": 0.0, + "step": 41379 + }, + { + "epoch": 3.861155174022581, + "grad_norm": NaN, + "learning_rate": 8.952848175003506e-05, + "loss": 0.0, + "step": 41380 + }, + { + "epoch": 3.8612484837174583, + "grad_norm": NaN, + "learning_rate": 8.952155992946548e-05, + "loss": 0.0, + "step": 41381 + }, + { + "epoch": 3.8613417934123353, + "grad_norm": NaN, + "learning_rate": 8.951463826267471e-05, + "loss": 0.0, + "step": 41382 + }, + { + "epoch": 3.8614351031072127, + "grad_norm": NaN, + "learning_rate": 8.950771674968029e-05, + "loss": 0.0, + "step": 41383 + }, + { + "epoch": 3.86152841280209, + "grad_norm": NaN, + "learning_rate": 8.950079539049977e-05, + "loss": 0.0, + "step": 41384 + }, + { + "epoch": 3.8616217224969676, + "grad_norm": NaN, + "learning_rate": 8.949387418515088e-05, + "loss": 0.0, + "step": 41385 + }, + { + "epoch": 3.861715032191845, + "grad_norm": NaN, + "learning_rate": 8.948695313365112e-05, + "loss": 0.0, + "step": 41386 + }, + { + "epoch": 3.861808341886722, + "grad_norm": NaN, + "learning_rate": 8.948003223601809e-05, + "loss": 0.0, + "step": 41387 + }, + { + "epoch": 3.8619016515815994, + "grad_norm": NaN, + "learning_rate": 8.947311149226944e-05, + "loss": 0.0, + "step": 41388 + }, + { + "epoch": 3.861994961276477, + "grad_norm": NaN, + "learning_rate": 8.946619090242274e-05, + "loss": 0.0, + "step": 41389 + }, + { + "epoch": 3.8620882709713538, + "grad_norm": NaN, + "learning_rate": 8.94592704664956e-05, + "loss": 0.0, + "step": 41390 + }, + { + "epoch": 3.862181580666231, + "grad_norm": NaN, + "learning_rate": 8.945235018450554e-05, + "loss": 0.0, + "step": 41391 + }, + { + "epoch": 3.8622748903611086, + "grad_norm": NaN, + "learning_rate": 8.944543005647023e-05, + "loss": 0.0, + "step": 41392 + }, + { + "epoch": 3.862368200055986, + "grad_norm": NaN, + "learning_rate": 8.943851008240734e-05, + "loss": 0.0, + "step": 41393 + }, + { + "epoch": 3.862461509750863, + "grad_norm": NaN, + "learning_rate": 8.943159026233425e-05, + "loss": 0.0, + "step": 41394 + }, + { + "epoch": 3.8625548194457404, + "grad_norm": NaN, + "learning_rate": 8.942467059626876e-05, + "loss": 0.0, + "step": 41395 + }, + { + "epoch": 3.862648129140618, + "grad_norm": NaN, + "learning_rate": 8.941775108422841e-05, + "loss": 0.0, + "step": 41396 + }, + { + "epoch": 3.862741438835495, + "grad_norm": NaN, + "learning_rate": 8.941083172623071e-05, + "loss": 0.0, + "step": 41397 + }, + { + "epoch": 3.8628347485303722, + "grad_norm": NaN, + "learning_rate": 8.940391252229335e-05, + "loss": 0.0, + "step": 41398 + }, + { + "epoch": 3.8629280582252497, + "grad_norm": NaN, + "learning_rate": 8.939699347243391e-05, + "loss": 0.0, + "step": 41399 + }, + { + "epoch": 3.863021367920127, + "grad_norm": NaN, + "learning_rate": 8.939007457666991e-05, + "loss": 0.0, + "step": 41400 + }, + { + "epoch": 3.863114677615004, + "grad_norm": NaN, + "learning_rate": 8.938315583501904e-05, + "loss": 0.0, + "step": 41401 + }, + { + "epoch": 3.8632079873098815, + "grad_norm": NaN, + "learning_rate": 8.937623724749886e-05, + "loss": 0.0, + "step": 41402 + }, + { + "epoch": 3.863301297004759, + "grad_norm": NaN, + "learning_rate": 8.93693188141269e-05, + "loss": 0.0, + "step": 41403 + }, + { + "epoch": 3.863394606699636, + "grad_norm": NaN, + "learning_rate": 8.936240053492078e-05, + "loss": 0.0, + "step": 41404 + }, + { + "epoch": 3.8634879163945133, + "grad_norm": NaN, + "learning_rate": 8.935548240989824e-05, + "loss": 0.0, + "step": 41405 + }, + { + "epoch": 3.8635812260893907, + "grad_norm": NaN, + "learning_rate": 8.934856443907664e-05, + "loss": 0.0, + "step": 41406 + }, + { + "epoch": 3.863674535784268, + "grad_norm": NaN, + "learning_rate": 8.934164662247366e-05, + "loss": 0.0, + "step": 41407 + }, + { + "epoch": 3.8637678454791455, + "grad_norm": NaN, + "learning_rate": 8.933472896010698e-05, + "loss": 0.0, + "step": 41408 + }, + { + "epoch": 3.8638611551740225, + "grad_norm": NaN, + "learning_rate": 8.932781145199408e-05, + "loss": 0.0, + "step": 41409 + }, + { + "epoch": 3.8639544648689, + "grad_norm": NaN, + "learning_rate": 8.932089409815256e-05, + "loss": 0.0, + "step": 41410 + }, + { + "epoch": 3.864047774563777, + "grad_norm": NaN, + "learning_rate": 8.931397689860008e-05, + "loss": 0.0, + "step": 41411 + }, + { + "epoch": 3.8641410842586543, + "grad_norm": NaN, + "learning_rate": 8.930705985335416e-05, + "loss": 0.0, + "step": 41412 + }, + { + "epoch": 3.8642343939535317, + "grad_norm": NaN, + "learning_rate": 8.930014296243235e-05, + "loss": 0.0, + "step": 41413 + }, + { + "epoch": 3.864327703648409, + "grad_norm": NaN, + "learning_rate": 8.929322622585238e-05, + "loss": 0.0, + "step": 41414 + }, + { + "epoch": 3.8644210133432866, + "grad_norm": NaN, + "learning_rate": 8.928630964363172e-05, + "loss": 0.0, + "step": 41415 + }, + { + "epoch": 3.8645143230381636, + "grad_norm": NaN, + "learning_rate": 8.927939321578797e-05, + "loss": 0.0, + "step": 41416 + }, + { + "epoch": 3.864607632733041, + "grad_norm": NaN, + "learning_rate": 8.92724769423388e-05, + "loss": 0.0, + "step": 41417 + }, + { + "epoch": 3.8647009424279184, + "grad_norm": NaN, + "learning_rate": 8.926556082330169e-05, + "loss": 0.0, + "step": 41418 + }, + { + "epoch": 3.8647942521227954, + "grad_norm": NaN, + "learning_rate": 8.925864485869421e-05, + "loss": 0.0, + "step": 41419 + }, + { + "epoch": 3.864887561817673, + "grad_norm": NaN, + "learning_rate": 8.92517290485341e-05, + "loss": 0.0, + "step": 41420 + }, + { + "epoch": 3.86498087151255, + "grad_norm": NaN, + "learning_rate": 8.924481339283882e-05, + "loss": 0.0, + "step": 41421 + }, + { + "epoch": 3.8650741812074276, + "grad_norm": NaN, + "learning_rate": 8.923789789162593e-05, + "loss": 0.0, + "step": 41422 + }, + { + "epoch": 3.8651674909023046, + "grad_norm": NaN, + "learning_rate": 8.923098254491316e-05, + "loss": 0.0, + "step": 41423 + }, + { + "epoch": 3.865260800597182, + "grad_norm": NaN, + "learning_rate": 8.922406735271792e-05, + "loss": 0.0, + "step": 41424 + }, + { + "epoch": 3.8653541102920594, + "grad_norm": NaN, + "learning_rate": 8.921715231505787e-05, + "loss": 0.0, + "step": 41425 + }, + { + "epoch": 3.8654474199869364, + "grad_norm": NaN, + "learning_rate": 8.921023743195065e-05, + "loss": 0.0, + "step": 41426 + }, + { + "epoch": 3.865540729681814, + "grad_norm": NaN, + "learning_rate": 8.920332270341379e-05, + "loss": 0.0, + "step": 41427 + }, + { + "epoch": 3.8656340393766913, + "grad_norm": NaN, + "learning_rate": 8.919640812946483e-05, + "loss": 0.0, + "step": 41428 + }, + { + "epoch": 3.8657273490715687, + "grad_norm": NaN, + "learning_rate": 8.918949371012144e-05, + "loss": 0.0, + "step": 41429 + }, + { + "epoch": 3.865820658766446, + "grad_norm": NaN, + "learning_rate": 8.91825794454011e-05, + "loss": 0.0, + "step": 41430 + }, + { + "epoch": 3.865913968461323, + "grad_norm": NaN, + "learning_rate": 8.917566533532145e-05, + "loss": 0.0, + "step": 41431 + }, + { + "epoch": 3.8660072781562005, + "grad_norm": NaN, + "learning_rate": 8.916875137990011e-05, + "loss": 0.0, + "step": 41432 + }, + { + "epoch": 3.8661005878510775, + "grad_norm": NaN, + "learning_rate": 8.91618375791546e-05, + "loss": 0.0, + "step": 41433 + }, + { + "epoch": 3.866193897545955, + "grad_norm": NaN, + "learning_rate": 8.915492393310253e-05, + "loss": 0.0, + "step": 41434 + }, + { + "epoch": 3.8662872072408323, + "grad_norm": NaN, + "learning_rate": 8.914801044176141e-05, + "loss": 0.0, + "step": 41435 + }, + { + "epoch": 3.8663805169357097, + "grad_norm": NaN, + "learning_rate": 8.91410971051489e-05, + "loss": 0.0, + "step": 41436 + }, + { + "epoch": 3.866473826630587, + "grad_norm": NaN, + "learning_rate": 8.91341839232826e-05, + "loss": 0.0, + "step": 41437 + }, + { + "epoch": 3.866567136325464, + "grad_norm": NaN, + "learning_rate": 8.912727089617997e-05, + "loss": 0.0, + "step": 41438 + }, + { + "epoch": 3.8666604460203415, + "grad_norm": NaN, + "learning_rate": 8.91203580238587e-05, + "loss": 0.0, + "step": 41439 + }, + { + "epoch": 3.866753755715219, + "grad_norm": NaN, + "learning_rate": 8.911344530633633e-05, + "loss": 0.0, + "step": 41440 + }, + { + "epoch": 3.866847065410096, + "grad_norm": NaN, + "learning_rate": 8.910653274363039e-05, + "loss": 0.0, + "step": 41441 + }, + { + "epoch": 3.8669403751049733, + "grad_norm": NaN, + "learning_rate": 8.909962033575849e-05, + "loss": 0.0, + "step": 41442 + }, + { + "epoch": 3.8670336847998508, + "grad_norm": NaN, + "learning_rate": 8.909270808273828e-05, + "loss": 0.0, + "step": 41443 + }, + { + "epoch": 3.867126994494728, + "grad_norm": NaN, + "learning_rate": 8.908579598458724e-05, + "loss": 0.0, + "step": 41444 + }, + { + "epoch": 3.867220304189605, + "grad_norm": NaN, + "learning_rate": 8.907888404132294e-05, + "loss": 0.0, + "step": 41445 + }, + { + "epoch": 3.8673136138844826, + "grad_norm": NaN, + "learning_rate": 8.907197225296305e-05, + "loss": 0.0, + "step": 41446 + }, + { + "epoch": 3.86740692357936, + "grad_norm": NaN, + "learning_rate": 8.906506061952508e-05, + "loss": 0.0, + "step": 41447 + }, + { + "epoch": 3.867500233274237, + "grad_norm": NaN, + "learning_rate": 8.905814914102656e-05, + "loss": 0.0, + "step": 41448 + }, + { + "epoch": 3.8675935429691144, + "grad_norm": NaN, + "learning_rate": 8.905123781748519e-05, + "loss": 0.0, + "step": 41449 + }, + { + "epoch": 3.867686852663992, + "grad_norm": NaN, + "learning_rate": 8.904432664891841e-05, + "loss": 0.0, + "step": 41450 + }, + { + "epoch": 3.8677801623588692, + "grad_norm": NaN, + "learning_rate": 8.903741563534385e-05, + "loss": 0.0, + "step": 41451 + }, + { + "epoch": 3.8678734720537467, + "grad_norm": NaN, + "learning_rate": 8.903050477677914e-05, + "loss": 0.0, + "step": 41452 + }, + { + "epoch": 3.8679667817486236, + "grad_norm": NaN, + "learning_rate": 8.902359407324176e-05, + "loss": 0.0, + "step": 41453 + }, + { + "epoch": 3.868060091443501, + "grad_norm": NaN, + "learning_rate": 8.901668352474928e-05, + "loss": 0.0, + "step": 41454 + }, + { + "epoch": 3.868153401138378, + "grad_norm": NaN, + "learning_rate": 8.900977313131937e-05, + "loss": 0.0, + "step": 41455 + }, + { + "epoch": 3.8682467108332554, + "grad_norm": NaN, + "learning_rate": 8.900286289296952e-05, + "loss": 0.0, + "step": 41456 + }, + { + "epoch": 3.868340020528133, + "grad_norm": NaN, + "learning_rate": 8.899595280971729e-05, + "loss": 0.0, + "step": 41457 + }, + { + "epoch": 3.8684333302230103, + "grad_norm": NaN, + "learning_rate": 8.898904288158032e-05, + "loss": 0.0, + "step": 41458 + }, + { + "epoch": 3.8685266399178877, + "grad_norm": NaN, + "learning_rate": 8.898213310857616e-05, + "loss": 0.0, + "step": 41459 + }, + { + "epoch": 3.8686199496127647, + "grad_norm": NaN, + "learning_rate": 8.897522349072227e-05, + "loss": 0.0, + "step": 41460 + }, + { + "epoch": 3.868713259307642, + "grad_norm": NaN, + "learning_rate": 8.896831402803641e-05, + "loss": 0.0, + "step": 41461 + }, + { + "epoch": 3.8688065690025195, + "grad_norm": NaN, + "learning_rate": 8.8961404720536e-05, + "loss": 0.0, + "step": 41462 + }, + { + "epoch": 3.8688998786973965, + "grad_norm": NaN, + "learning_rate": 8.895449556823866e-05, + "loss": 0.0, + "step": 41463 + }, + { + "epoch": 3.868993188392274, + "grad_norm": NaN, + "learning_rate": 8.894758657116198e-05, + "loss": 0.0, + "step": 41464 + }, + { + "epoch": 3.8690864980871513, + "grad_norm": NaN, + "learning_rate": 8.894067772932348e-05, + "loss": 0.0, + "step": 41465 + }, + { + "epoch": 3.8691798077820287, + "grad_norm": NaN, + "learning_rate": 8.893376904274073e-05, + "loss": 0.0, + "step": 41466 + }, + { + "epoch": 3.8692731174769057, + "grad_norm": NaN, + "learning_rate": 8.892686051143138e-05, + "loss": 0.0, + "step": 41467 + }, + { + "epoch": 3.869366427171783, + "grad_norm": NaN, + "learning_rate": 8.891995213541286e-05, + "loss": 0.0, + "step": 41468 + }, + { + "epoch": 3.8694597368666606, + "grad_norm": NaN, + "learning_rate": 8.891304391470281e-05, + "loss": 0.0, + "step": 41469 + }, + { + "epoch": 3.8695530465615375, + "grad_norm": NaN, + "learning_rate": 8.890613584931886e-05, + "loss": 0.0, + "step": 41470 + }, + { + "epoch": 3.869646356256415, + "grad_norm": NaN, + "learning_rate": 8.889922793927847e-05, + "loss": 0.0, + "step": 41471 + }, + { + "epoch": 3.8697396659512924, + "grad_norm": NaN, + "learning_rate": 8.889232018459923e-05, + "loss": 0.0, + "step": 41472 + }, + { + "epoch": 3.86983297564617, + "grad_norm": NaN, + "learning_rate": 8.888541258529876e-05, + "loss": 0.0, + "step": 41473 + }, + { + "epoch": 3.869926285341047, + "grad_norm": NaN, + "learning_rate": 8.887850514139456e-05, + "loss": 0.0, + "step": 41474 + }, + { + "epoch": 3.870019595035924, + "grad_norm": NaN, + "learning_rate": 8.887159785290418e-05, + "loss": 0.0, + "step": 41475 + }, + { + "epoch": 3.8701129047308016, + "grad_norm": NaN, + "learning_rate": 8.886469071984527e-05, + "loss": 0.0, + "step": 41476 + }, + { + "epoch": 3.8702062144256786, + "grad_norm": NaN, + "learning_rate": 8.885778374223529e-05, + "loss": 0.0, + "step": 41477 + }, + { + "epoch": 3.870299524120556, + "grad_norm": NaN, + "learning_rate": 8.885087692009195e-05, + "loss": 0.0, + "step": 41478 + }, + { + "epoch": 3.8703928338154334, + "grad_norm": NaN, + "learning_rate": 8.884397025343264e-05, + "loss": 0.0, + "step": 41479 + }, + { + "epoch": 3.870486143510311, + "grad_norm": NaN, + "learning_rate": 8.883706374227497e-05, + "loss": 0.0, + "step": 41480 + }, + { + "epoch": 3.8705794532051883, + "grad_norm": NaN, + "learning_rate": 8.88301573866366e-05, + "loss": 0.0, + "step": 41481 + }, + { + "epoch": 3.8706727629000652, + "grad_norm": NaN, + "learning_rate": 8.882325118653499e-05, + "loss": 0.0, + "step": 41482 + }, + { + "epoch": 3.8707660725949427, + "grad_norm": NaN, + "learning_rate": 8.88163451419877e-05, + "loss": 0.0, + "step": 41483 + }, + { + "epoch": 3.87085938228982, + "grad_norm": NaN, + "learning_rate": 8.880943925301239e-05, + "loss": 0.0, + "step": 41484 + }, + { + "epoch": 3.870952691984697, + "grad_norm": NaN, + "learning_rate": 8.880253351962652e-05, + "loss": 0.0, + "step": 41485 + }, + { + "epoch": 3.8710460016795745, + "grad_norm": NaN, + "learning_rate": 8.879562794184763e-05, + "loss": 0.0, + "step": 41486 + }, + { + "epoch": 3.871139311374452, + "grad_norm": NaN, + "learning_rate": 8.87887225196934e-05, + "loss": 0.0, + "step": 41487 + }, + { + "epoch": 3.8712326210693293, + "grad_norm": NaN, + "learning_rate": 8.87818172531813e-05, + "loss": 0.0, + "step": 41488 + }, + { + "epoch": 3.8713259307642063, + "grad_norm": NaN, + "learning_rate": 8.877491214232885e-05, + "loss": 0.0, + "step": 41489 + }, + { + "epoch": 3.8714192404590837, + "grad_norm": NaN, + "learning_rate": 8.876800718715373e-05, + "loss": 0.0, + "step": 41490 + }, + { + "epoch": 3.871512550153961, + "grad_norm": NaN, + "learning_rate": 8.87611023876734e-05, + "loss": 0.0, + "step": 41491 + }, + { + "epoch": 3.871605859848838, + "grad_norm": NaN, + "learning_rate": 8.875419774390542e-05, + "loss": 0.0, + "step": 41492 + }, + { + "epoch": 3.8716991695437155, + "grad_norm": NaN, + "learning_rate": 8.874729325586743e-05, + "loss": 0.0, + "step": 41493 + }, + { + "epoch": 3.871792479238593, + "grad_norm": NaN, + "learning_rate": 8.87403889235769e-05, + "loss": 0.0, + "step": 41494 + }, + { + "epoch": 3.8718857889334704, + "grad_norm": NaN, + "learning_rate": 8.87334847470514e-05, + "loss": 0.0, + "step": 41495 + }, + { + "epoch": 3.8719790986283473, + "grad_norm": NaN, + "learning_rate": 8.872658072630853e-05, + "loss": 0.0, + "step": 41496 + }, + { + "epoch": 3.8720724083232247, + "grad_norm": NaN, + "learning_rate": 8.87196768613658e-05, + "loss": 0.0, + "step": 41497 + }, + { + "epoch": 3.872165718018102, + "grad_norm": NaN, + "learning_rate": 8.871277315224076e-05, + "loss": 0.0, + "step": 41498 + }, + { + "epoch": 3.872259027712979, + "grad_norm": NaN, + "learning_rate": 8.870586959895103e-05, + "loss": 0.0, + "step": 41499 + }, + { + "epoch": 3.8723523374078566, + "grad_norm": NaN, + "learning_rate": 8.869896620151407e-05, + "loss": 0.0, + "step": 41500 + }, + { + "epoch": 3.872445647102734, + "grad_norm": NaN, + "learning_rate": 8.869206295994749e-05, + "loss": 0.0, + "step": 41501 + }, + { + "epoch": 3.8725389567976114, + "grad_norm": NaN, + "learning_rate": 8.868515987426886e-05, + "loss": 0.0, + "step": 41502 + }, + { + "epoch": 3.872632266492489, + "grad_norm": NaN, + "learning_rate": 8.867825694449569e-05, + "loss": 0.0, + "step": 41503 + }, + { + "epoch": 3.872725576187366, + "grad_norm": NaN, + "learning_rate": 8.867135417064551e-05, + "loss": 0.0, + "step": 41504 + }, + { + "epoch": 3.872818885882243, + "grad_norm": NaN, + "learning_rate": 8.866445155273599e-05, + "loss": 0.0, + "step": 41505 + }, + { + "epoch": 3.87291219557712, + "grad_norm": NaN, + "learning_rate": 8.865754909078455e-05, + "loss": 0.0, + "step": 41506 + }, + { + "epoch": 3.8730055052719976, + "grad_norm": NaN, + "learning_rate": 8.865064678480876e-05, + "loss": 0.0, + "step": 41507 + }, + { + "epoch": 3.873098814966875, + "grad_norm": NaN, + "learning_rate": 8.864374463482628e-05, + "loss": 0.0, + "step": 41508 + }, + { + "epoch": 3.8731921246617524, + "grad_norm": NaN, + "learning_rate": 8.863684264085452e-05, + "loss": 0.0, + "step": 41509 + }, + { + "epoch": 3.87328543435663, + "grad_norm": NaN, + "learning_rate": 8.862994080291109e-05, + "loss": 0.0, + "step": 41510 + }, + { + "epoch": 3.873378744051507, + "grad_norm": NaN, + "learning_rate": 8.86230391210136e-05, + "loss": 0.0, + "step": 41511 + }, + { + "epoch": 3.8734720537463843, + "grad_norm": NaN, + "learning_rate": 8.861613759517949e-05, + "loss": 0.0, + "step": 41512 + }, + { + "epoch": 3.8735653634412617, + "grad_norm": NaN, + "learning_rate": 8.860923622542635e-05, + "loss": 0.0, + "step": 41513 + }, + { + "epoch": 3.8736586731361387, + "grad_norm": NaN, + "learning_rate": 8.86023350117718e-05, + "loss": 0.0, + "step": 41514 + }, + { + "epoch": 3.873751982831016, + "grad_norm": NaN, + "learning_rate": 8.859543395423324e-05, + "loss": 0.0, + "step": 41515 + }, + { + "epoch": 3.8738452925258935, + "grad_norm": NaN, + "learning_rate": 8.858853305282836e-05, + "loss": 0.0, + "step": 41516 + }, + { + "epoch": 3.873938602220771, + "grad_norm": NaN, + "learning_rate": 8.858163230757466e-05, + "loss": 0.0, + "step": 41517 + }, + { + "epoch": 3.874031911915648, + "grad_norm": NaN, + "learning_rate": 8.857473171848963e-05, + "loss": 0.0, + "step": 41518 + }, + { + "epoch": 3.8741252216105253, + "grad_norm": NaN, + "learning_rate": 8.856783128559087e-05, + "loss": 0.0, + "step": 41519 + }, + { + "epoch": 3.8742185313054027, + "grad_norm": NaN, + "learning_rate": 8.856093100889597e-05, + "loss": 0.0, + "step": 41520 + }, + { + "epoch": 3.8743118410002797, + "grad_norm": NaN, + "learning_rate": 8.855403088842236e-05, + "loss": 0.0, + "step": 41521 + }, + { + "epoch": 3.874405150695157, + "grad_norm": NaN, + "learning_rate": 8.854713092418767e-05, + "loss": 0.0, + "step": 41522 + }, + { + "epoch": 3.8744984603900345, + "grad_norm": NaN, + "learning_rate": 8.854023111620945e-05, + "loss": 0.0, + "step": 41523 + }, + { + "epoch": 3.874591770084912, + "grad_norm": NaN, + "learning_rate": 8.853333146450516e-05, + "loss": 0.0, + "step": 41524 + }, + { + "epoch": 3.8746850797797894, + "grad_norm": NaN, + "learning_rate": 8.852643196909247e-05, + "loss": 0.0, + "step": 41525 + }, + { + "epoch": 3.8747783894746664, + "grad_norm": NaN, + "learning_rate": 8.85195326299888e-05, + "loss": 0.0, + "step": 41526 + }, + { + "epoch": 3.8748716991695438, + "grad_norm": NaN, + "learning_rate": 8.851263344721174e-05, + "loss": 0.0, + "step": 41527 + }, + { + "epoch": 3.8749650088644207, + "grad_norm": NaN, + "learning_rate": 8.850573442077889e-05, + "loss": 0.0, + "step": 41528 + }, + { + "epoch": 3.875058318559298, + "grad_norm": NaN, + "learning_rate": 8.849883555070772e-05, + "loss": 0.0, + "step": 41529 + }, + { + "epoch": 3.8751516282541756, + "grad_norm": NaN, + "learning_rate": 8.849193683701574e-05, + "loss": 0.0, + "step": 41530 + }, + { + "epoch": 3.875244937949053, + "grad_norm": NaN, + "learning_rate": 8.848503827972064e-05, + "loss": 0.0, + "step": 41531 + }, + { + "epoch": 3.8753382476439304, + "grad_norm": NaN, + "learning_rate": 8.847813987883981e-05, + "loss": 0.0, + "step": 41532 + }, + { + "epoch": 3.8754315573388074, + "grad_norm": NaN, + "learning_rate": 8.847124163439081e-05, + "loss": 0.0, + "step": 41533 + }, + { + "epoch": 3.875524867033685, + "grad_norm": NaN, + "learning_rate": 8.84643435463913e-05, + "loss": 0.0, + "step": 41534 + }, + { + "epoch": 3.8756181767285622, + "grad_norm": NaN, + "learning_rate": 8.84574456148587e-05, + "loss": 0.0, + "step": 41535 + }, + { + "epoch": 3.875711486423439, + "grad_norm": NaN, + "learning_rate": 8.845054783981054e-05, + "loss": 0.0, + "step": 41536 + }, + { + "epoch": 3.8758047961183166, + "grad_norm": NaN, + "learning_rate": 8.844365022126448e-05, + "loss": 0.0, + "step": 41537 + }, + { + "epoch": 3.875898105813194, + "grad_norm": NaN, + "learning_rate": 8.843675275923796e-05, + "loss": 0.0, + "step": 41538 + }, + { + "epoch": 3.8759914155080715, + "grad_norm": NaN, + "learning_rate": 8.842985545374848e-05, + "loss": 0.0, + "step": 41539 + }, + { + "epoch": 3.8760847252029484, + "grad_norm": NaN, + "learning_rate": 8.842295830481373e-05, + "loss": 0.0, + "step": 41540 + }, + { + "epoch": 3.876178034897826, + "grad_norm": NaN, + "learning_rate": 8.84160613124511e-05, + "loss": 0.0, + "step": 41541 + }, + { + "epoch": 3.8762713445927033, + "grad_norm": NaN, + "learning_rate": 8.840916447667818e-05, + "loss": 0.0, + "step": 41542 + }, + { + "epoch": 3.8763646542875803, + "grad_norm": NaN, + "learning_rate": 8.840226779751256e-05, + "loss": 0.0, + "step": 41543 + }, + { + "epoch": 3.8764579639824577, + "grad_norm": NaN, + "learning_rate": 8.839537127497169e-05, + "loss": 0.0, + "step": 41544 + }, + { + "epoch": 3.876551273677335, + "grad_norm": NaN, + "learning_rate": 8.838847490907312e-05, + "loss": 0.0, + "step": 41545 + }, + { + "epoch": 3.8766445833722125, + "grad_norm": NaN, + "learning_rate": 8.838157869983448e-05, + "loss": 0.0, + "step": 41546 + }, + { + "epoch": 3.87673789306709, + "grad_norm": NaN, + "learning_rate": 8.837468264727317e-05, + "loss": 0.0, + "step": 41547 + }, + { + "epoch": 3.876831202761967, + "grad_norm": NaN, + "learning_rate": 8.836778675140678e-05, + "loss": 0.0, + "step": 41548 + }, + { + "epoch": 3.8769245124568443, + "grad_norm": NaN, + "learning_rate": 8.83608910122529e-05, + "loss": 0.0, + "step": 41549 + }, + { + "epoch": 3.8770178221517213, + "grad_norm": NaN, + "learning_rate": 8.8353995429829e-05, + "loss": 0.0, + "step": 41550 + }, + { + "epoch": 3.8771111318465987, + "grad_norm": NaN, + "learning_rate": 8.83471000041526e-05, + "loss": 0.0, + "step": 41551 + }, + { + "epoch": 3.877204441541476, + "grad_norm": NaN, + "learning_rate": 8.83402047352413e-05, + "loss": 0.0, + "step": 41552 + }, + { + "epoch": 3.8772977512363536, + "grad_norm": NaN, + "learning_rate": 8.833330962311255e-05, + "loss": 0.0, + "step": 41553 + }, + { + "epoch": 3.877391060931231, + "grad_norm": NaN, + "learning_rate": 8.832641466778394e-05, + "loss": 0.0, + "step": 41554 + }, + { + "epoch": 3.877484370626108, + "grad_norm": NaN, + "learning_rate": 8.831951986927304e-05, + "loss": 0.0, + "step": 41555 + }, + { + "epoch": 3.8775776803209854, + "grad_norm": NaN, + "learning_rate": 8.831262522759727e-05, + "loss": 0.0, + "step": 41556 + }, + { + "epoch": 3.877670990015863, + "grad_norm": NaN, + "learning_rate": 8.830573074277424e-05, + "loss": 0.0, + "step": 41557 + }, + { + "epoch": 3.8777642997107398, + "grad_norm": NaN, + "learning_rate": 8.829883641482148e-05, + "loss": 0.0, + "step": 41558 + }, + { + "epoch": 3.877857609405617, + "grad_norm": NaN, + "learning_rate": 8.829194224375645e-05, + "loss": 0.0, + "step": 41559 + }, + { + "epoch": 3.8779509191004946, + "grad_norm": NaN, + "learning_rate": 8.828504822959677e-05, + "loss": 0.0, + "step": 41560 + }, + { + "epoch": 3.878044228795372, + "grad_norm": NaN, + "learning_rate": 8.827815437235997e-05, + "loss": 0.0, + "step": 41561 + }, + { + "epoch": 3.878137538490249, + "grad_norm": NaN, + "learning_rate": 8.827126067206347e-05, + "loss": 0.0, + "step": 41562 + }, + { + "epoch": 3.8782308481851264, + "grad_norm": NaN, + "learning_rate": 8.826436712872489e-05, + "loss": 0.0, + "step": 41563 + }, + { + "epoch": 3.878324157880004, + "grad_norm": NaN, + "learning_rate": 8.825747374236178e-05, + "loss": 0.0, + "step": 41564 + }, + { + "epoch": 3.878417467574881, + "grad_norm": NaN, + "learning_rate": 8.825058051299156e-05, + "loss": 0.0, + "step": 41565 + }, + { + "epoch": 3.8785107772697582, + "grad_norm": NaN, + "learning_rate": 8.824368744063183e-05, + "loss": 0.0, + "step": 41566 + }, + { + "epoch": 3.8786040869646357, + "grad_norm": NaN, + "learning_rate": 8.823679452530018e-05, + "loss": 0.0, + "step": 41567 + }, + { + "epoch": 3.878697396659513, + "grad_norm": NaN, + "learning_rate": 8.822990176701399e-05, + "loss": 0.0, + "step": 41568 + }, + { + "epoch": 3.8787907063543905, + "grad_norm": NaN, + "learning_rate": 8.822300916579091e-05, + "loss": 0.0, + "step": 41569 + }, + { + "epoch": 3.8788840160492675, + "grad_norm": NaN, + "learning_rate": 8.821611672164839e-05, + "loss": 0.0, + "step": 41570 + }, + { + "epoch": 3.878977325744145, + "grad_norm": NaN, + "learning_rate": 8.820922443460397e-05, + "loss": 0.0, + "step": 41571 + }, + { + "epoch": 3.879070635439022, + "grad_norm": NaN, + "learning_rate": 8.820233230467523e-05, + "loss": 0.0, + "step": 41572 + }, + { + "epoch": 3.8791639451338993, + "grad_norm": NaN, + "learning_rate": 8.819544033187963e-05, + "loss": 0.0, + "step": 41573 + }, + { + "epoch": 3.8792572548287767, + "grad_norm": NaN, + "learning_rate": 8.818854851623469e-05, + "loss": 0.0, + "step": 41574 + }, + { + "epoch": 3.879350564523654, + "grad_norm": NaN, + "learning_rate": 8.818165685775803e-05, + "loss": 0.0, + "step": 41575 + }, + { + "epoch": 3.8794438742185315, + "grad_norm": NaN, + "learning_rate": 8.817476535646707e-05, + "loss": 0.0, + "step": 41576 + }, + { + "epoch": 3.8795371839134085, + "grad_norm": NaN, + "learning_rate": 8.816787401237933e-05, + "loss": 0.0, + "step": 41577 + }, + { + "epoch": 3.879630493608286, + "grad_norm": NaN, + "learning_rate": 8.816098282551242e-05, + "loss": 0.0, + "step": 41578 + }, + { + "epoch": 3.8797238033031634, + "grad_norm": NaN, + "learning_rate": 8.81540917958838e-05, + "loss": 0.0, + "step": 41579 + }, + { + "epoch": 3.8798171129980403, + "grad_norm": NaN, + "learning_rate": 8.814720092351096e-05, + "loss": 0.0, + "step": 41580 + }, + { + "epoch": 3.8799104226929177, + "grad_norm": NaN, + "learning_rate": 8.814031020841154e-05, + "loss": 0.0, + "step": 41581 + }, + { + "epoch": 3.880003732387795, + "grad_norm": NaN, + "learning_rate": 8.813341965060297e-05, + "loss": 0.0, + "step": 41582 + }, + { + "epoch": 3.8800970420826726, + "grad_norm": NaN, + "learning_rate": 8.812652925010273e-05, + "loss": 0.0, + "step": 41583 + }, + { + "epoch": 3.8801903517775496, + "grad_norm": NaN, + "learning_rate": 8.811963900692848e-05, + "loss": 0.0, + "step": 41584 + }, + { + "epoch": 3.880283661472427, + "grad_norm": NaN, + "learning_rate": 8.81127489210976e-05, + "loss": 0.0, + "step": 41585 + }, + { + "epoch": 3.8803769711673044, + "grad_norm": NaN, + "learning_rate": 8.810585899262766e-05, + "loss": 0.0, + "step": 41586 + }, + { + "epoch": 3.8804702808621814, + "grad_norm": NaN, + "learning_rate": 8.809896922153624e-05, + "loss": 0.0, + "step": 41587 + }, + { + "epoch": 3.880563590557059, + "grad_norm": NaN, + "learning_rate": 8.809207960784078e-05, + "loss": 0.0, + "step": 41588 + }, + { + "epoch": 3.880656900251936, + "grad_norm": NaN, + "learning_rate": 8.808519015155879e-05, + "loss": 0.0, + "step": 41589 + }, + { + "epoch": 3.8807502099468136, + "grad_norm": NaN, + "learning_rate": 8.807830085270786e-05, + "loss": 0.0, + "step": 41590 + }, + { + "epoch": 3.8808435196416906, + "grad_norm": NaN, + "learning_rate": 8.807141171130543e-05, + "loss": 0.0, + "step": 41591 + }, + { + "epoch": 3.880936829336568, + "grad_norm": NaN, + "learning_rate": 8.806452272736908e-05, + "loss": 0.0, + "step": 41592 + }, + { + "epoch": 3.8810301390314454, + "grad_norm": NaN, + "learning_rate": 8.805763390091632e-05, + "loss": 0.0, + "step": 41593 + }, + { + "epoch": 3.8811234487263224, + "grad_norm": NaN, + "learning_rate": 8.805074523196462e-05, + "loss": 0.0, + "step": 41594 + }, + { + "epoch": 3.8812167584212, + "grad_norm": NaN, + "learning_rate": 8.804385672053152e-05, + "loss": 0.0, + "step": 41595 + }, + { + "epoch": 3.8813100681160773, + "grad_norm": NaN, + "learning_rate": 8.803696836663461e-05, + "loss": 0.0, + "step": 41596 + }, + { + "epoch": 3.8814033778109547, + "grad_norm": NaN, + "learning_rate": 8.803008017029123e-05, + "loss": 0.0, + "step": 41597 + }, + { + "epoch": 3.881496687505832, + "grad_norm": NaN, + "learning_rate": 8.802319213151908e-05, + "loss": 0.0, + "step": 41598 + }, + { + "epoch": 3.881589997200709, + "grad_norm": NaN, + "learning_rate": 8.801630425033557e-05, + "loss": 0.0, + "step": 41599 + }, + { + "epoch": 3.8816833068955865, + "grad_norm": NaN, + "learning_rate": 8.80094165267582e-05, + "loss": 0.0, + "step": 41600 + }, + { + "epoch": 3.881776616590464, + "grad_norm": NaN, + "learning_rate": 8.800252896080456e-05, + "loss": 0.0, + "step": 41601 + }, + { + "epoch": 3.881869926285341, + "grad_norm": NaN, + "learning_rate": 8.799564155249213e-05, + "loss": 0.0, + "step": 41602 + }, + { + "epoch": 3.8819632359802183, + "grad_norm": NaN, + "learning_rate": 8.798875430183837e-05, + "loss": 0.0, + "step": 41603 + }, + { + "epoch": 3.8820565456750957, + "grad_norm": NaN, + "learning_rate": 8.798186720886088e-05, + "loss": 0.0, + "step": 41604 + }, + { + "epoch": 3.882149855369973, + "grad_norm": NaN, + "learning_rate": 8.797498027357715e-05, + "loss": 0.0, + "step": 41605 + }, + { + "epoch": 3.88224316506485, + "grad_norm": NaN, + "learning_rate": 8.796809349600462e-05, + "loss": 0.0, + "step": 41606 + }, + { + "epoch": 3.8823364747597275, + "grad_norm": NaN, + "learning_rate": 8.796120687616088e-05, + "loss": 0.0, + "step": 41607 + }, + { + "epoch": 3.882429784454605, + "grad_norm": NaN, + "learning_rate": 8.795432041406344e-05, + "loss": 0.0, + "step": 41608 + }, + { + "epoch": 3.882523094149482, + "grad_norm": NaN, + "learning_rate": 8.794743410972971e-05, + "loss": 0.0, + "step": 41609 + }, + { + "epoch": 3.8826164038443594, + "grad_norm": NaN, + "learning_rate": 8.794054796317733e-05, + "loss": 0.0, + "step": 41610 + }, + { + "epoch": 3.8827097135392368, + "grad_norm": NaN, + "learning_rate": 8.793366197442376e-05, + "loss": 0.0, + "step": 41611 + }, + { + "epoch": 3.882803023234114, + "grad_norm": NaN, + "learning_rate": 8.792677614348646e-05, + "loss": 0.0, + "step": 41612 + }, + { + "epoch": 3.882896332928991, + "grad_norm": NaN, + "learning_rate": 8.791989047038303e-05, + "loss": 0.0, + "step": 41613 + }, + { + "epoch": 3.8829896426238686, + "grad_norm": NaN, + "learning_rate": 8.791300495513089e-05, + "loss": 0.0, + "step": 41614 + }, + { + "epoch": 3.883082952318746, + "grad_norm": NaN, + "learning_rate": 8.790611959774758e-05, + "loss": 0.0, + "step": 41615 + }, + { + "epoch": 3.883176262013623, + "grad_norm": NaN, + "learning_rate": 8.789923439825067e-05, + "loss": 0.0, + "step": 41616 + }, + { + "epoch": 3.8832695717085004, + "grad_norm": NaN, + "learning_rate": 8.789234935665757e-05, + "loss": 0.0, + "step": 41617 + }, + { + "epoch": 3.883362881403378, + "grad_norm": NaN, + "learning_rate": 8.78854644729858e-05, + "loss": 0.0, + "step": 41618 + }, + { + "epoch": 3.8834561910982552, + "grad_norm": NaN, + "learning_rate": 8.787857974725296e-05, + "loss": 0.0, + "step": 41619 + }, + { + "epoch": 3.8835495007931327, + "grad_norm": NaN, + "learning_rate": 8.787169517947647e-05, + "loss": 0.0, + "step": 41620 + }, + { + "epoch": 3.8836428104880096, + "grad_norm": NaN, + "learning_rate": 8.78648107696738e-05, + "loss": 0.0, + "step": 41621 + }, + { + "epoch": 3.883736120182887, + "grad_norm": NaN, + "learning_rate": 8.785792651786259e-05, + "loss": 0.0, + "step": 41622 + }, + { + "epoch": 3.883829429877764, + "grad_norm": NaN, + "learning_rate": 8.785104242406022e-05, + "loss": 0.0, + "step": 41623 + }, + { + "epoch": 3.8839227395726414, + "grad_norm": NaN, + "learning_rate": 8.784415848828422e-05, + "loss": 0.0, + "step": 41624 + }, + { + "epoch": 3.884016049267519, + "grad_norm": NaN, + "learning_rate": 8.783727471055216e-05, + "loss": 0.0, + "step": 41625 + }, + { + "epoch": 3.8841093589623963, + "grad_norm": NaN, + "learning_rate": 8.783039109088149e-05, + "loss": 0.0, + "step": 41626 + }, + { + "epoch": 3.8842026686572737, + "grad_norm": NaN, + "learning_rate": 8.782350762928967e-05, + "loss": 0.0, + "step": 41627 + }, + { + "epoch": 3.8842959783521507, + "grad_norm": NaN, + "learning_rate": 8.781662432579431e-05, + "loss": 0.0, + "step": 41628 + }, + { + "epoch": 3.884389288047028, + "grad_norm": NaN, + "learning_rate": 8.780974118041282e-05, + "loss": 0.0, + "step": 41629 + }, + { + "epoch": 3.8844825977419055, + "grad_norm": NaN, + "learning_rate": 8.780285819316274e-05, + "loss": 0.0, + "step": 41630 + }, + { + "epoch": 3.8845759074367825, + "grad_norm": NaN, + "learning_rate": 8.77959753640616e-05, + "loss": 0.0, + "step": 41631 + }, + { + "epoch": 3.88466921713166, + "grad_norm": NaN, + "learning_rate": 8.778909269312681e-05, + "loss": 0.0, + "step": 41632 + }, + { + "epoch": 3.8847625268265373, + "grad_norm": NaN, + "learning_rate": 8.778221018037597e-05, + "loss": 0.0, + "step": 41633 + }, + { + "epoch": 3.8848558365214148, + "grad_norm": NaN, + "learning_rate": 8.777532782582656e-05, + "loss": 0.0, + "step": 41634 + }, + { + "epoch": 3.8849491462162917, + "grad_norm": NaN, + "learning_rate": 8.776844562949601e-05, + "loss": 0.0, + "step": 41635 + }, + { + "epoch": 3.885042455911169, + "grad_norm": NaN, + "learning_rate": 8.776156359140189e-05, + "loss": 0.0, + "step": 41636 + }, + { + "epoch": 3.8851357656060466, + "grad_norm": NaN, + "learning_rate": 8.775468171156171e-05, + "loss": 0.0, + "step": 41637 + }, + { + "epoch": 3.8852290753009235, + "grad_norm": NaN, + "learning_rate": 8.774779998999287e-05, + "loss": 0.0, + "step": 41638 + }, + { + "epoch": 3.885322384995801, + "grad_norm": NaN, + "learning_rate": 8.774091842671297e-05, + "loss": 0.0, + "step": 41639 + }, + { + "epoch": 3.8854156946906784, + "grad_norm": NaN, + "learning_rate": 8.773403702173949e-05, + "loss": 0.0, + "step": 41640 + }, + { + "epoch": 3.885509004385556, + "grad_norm": NaN, + "learning_rate": 8.772715577508986e-05, + "loss": 0.0, + "step": 41641 + }, + { + "epoch": 3.885602314080433, + "grad_norm": NaN, + "learning_rate": 8.772027468678166e-05, + "loss": 0.0, + "step": 41642 + }, + { + "epoch": 3.88569562377531, + "grad_norm": NaN, + "learning_rate": 8.771339375683239e-05, + "loss": 0.0, + "step": 41643 + }, + { + "epoch": 3.8857889334701876, + "grad_norm": NaN, + "learning_rate": 8.770651298525943e-05, + "loss": 0.0, + "step": 41644 + }, + { + "epoch": 3.8858822431650646, + "grad_norm": NaN, + "learning_rate": 8.769963237208039e-05, + "loss": 0.0, + "step": 41645 + }, + { + "epoch": 3.885975552859942, + "grad_norm": NaN, + "learning_rate": 8.769275191731276e-05, + "loss": 0.0, + "step": 41646 + }, + { + "epoch": 3.8860688625548194, + "grad_norm": NaN, + "learning_rate": 8.768587162097396e-05, + "loss": 0.0, + "step": 41647 + }, + { + "epoch": 3.886162172249697, + "grad_norm": NaN, + "learning_rate": 8.767899148308153e-05, + "loss": 0.0, + "step": 41648 + }, + { + "epoch": 3.8862554819445743, + "grad_norm": NaN, + "learning_rate": 8.767211150365301e-05, + "loss": 0.0, + "step": 41649 + }, + { + "epoch": 3.8863487916394512, + "grad_norm": NaN, + "learning_rate": 8.76652316827058e-05, + "loss": 0.0, + "step": 41650 + }, + { + "epoch": 3.8864421013343287, + "grad_norm": NaN, + "learning_rate": 8.765835202025746e-05, + "loss": 0.0, + "step": 41651 + }, + { + "epoch": 3.886535411029206, + "grad_norm": NaN, + "learning_rate": 8.765147251632548e-05, + "loss": 0.0, + "step": 41652 + }, + { + "epoch": 3.886628720724083, + "grad_norm": NaN, + "learning_rate": 8.76445931709273e-05, + "loss": 0.0, + "step": 41653 + }, + { + "epoch": 3.8867220304189605, + "grad_norm": NaN, + "learning_rate": 8.763771398408048e-05, + "loss": 0.0, + "step": 41654 + }, + { + "epoch": 3.886815340113838, + "grad_norm": NaN, + "learning_rate": 8.763083495580248e-05, + "loss": 0.0, + "step": 41655 + }, + { + "epoch": 3.8869086498087153, + "grad_norm": NaN, + "learning_rate": 8.762395608611075e-05, + "loss": 0.0, + "step": 41656 + }, + { + "epoch": 3.8870019595035923, + "grad_norm": NaN, + "learning_rate": 8.761707737502289e-05, + "loss": 0.0, + "step": 41657 + }, + { + "epoch": 3.8870952691984697, + "grad_norm": NaN, + "learning_rate": 8.761019882255629e-05, + "loss": 0.0, + "step": 41658 + }, + { + "epoch": 3.887188578893347, + "grad_norm": NaN, + "learning_rate": 8.760332042872844e-05, + "loss": 0.0, + "step": 41659 + }, + { + "epoch": 3.887281888588224, + "grad_norm": NaN, + "learning_rate": 8.759644219355691e-05, + "loss": 0.0, + "step": 41660 + }, + { + "epoch": 3.8873751982831015, + "grad_norm": NaN, + "learning_rate": 8.758956411705913e-05, + "loss": 0.0, + "step": 41661 + }, + { + "epoch": 3.887468507977979, + "grad_norm": NaN, + "learning_rate": 8.758268619925257e-05, + "loss": 0.0, + "step": 41662 + }, + { + "epoch": 3.8875618176728564, + "grad_norm": NaN, + "learning_rate": 8.757580844015483e-05, + "loss": 0.0, + "step": 41663 + }, + { + "epoch": 3.8876551273677338, + "grad_norm": NaN, + "learning_rate": 8.75689308397832e-05, + "loss": 0.0, + "step": 41664 + }, + { + "epoch": 3.8877484370626108, + "grad_norm": NaN, + "learning_rate": 8.756205339815535e-05, + "loss": 0.0, + "step": 41665 + }, + { + "epoch": 3.887841746757488, + "grad_norm": NaN, + "learning_rate": 8.755517611528869e-05, + "loss": 0.0, + "step": 41666 + }, + { + "epoch": 3.887935056452365, + "grad_norm": NaN, + "learning_rate": 8.754829899120073e-05, + "loss": 0.0, + "step": 41667 + }, + { + "epoch": 3.8880283661472426, + "grad_norm": NaN, + "learning_rate": 8.754142202590895e-05, + "loss": 0.0, + "step": 41668 + }, + { + "epoch": 3.88812167584212, + "grad_norm": NaN, + "learning_rate": 8.753454521943082e-05, + "loss": 0.0, + "step": 41669 + }, + { + "epoch": 3.8882149855369974, + "grad_norm": NaN, + "learning_rate": 8.752766857178386e-05, + "loss": 0.0, + "step": 41670 + }, + { + "epoch": 3.888308295231875, + "grad_norm": NaN, + "learning_rate": 8.752079208298551e-05, + "loss": 0.0, + "step": 41671 + }, + { + "epoch": 3.888401604926752, + "grad_norm": NaN, + "learning_rate": 8.751391575305329e-05, + "loss": 0.0, + "step": 41672 + }, + { + "epoch": 3.888494914621629, + "grad_norm": NaN, + "learning_rate": 8.750703958200467e-05, + "loss": 0.0, + "step": 41673 + }, + { + "epoch": 3.8885882243165066, + "grad_norm": NaN, + "learning_rate": 8.750016356985713e-05, + "loss": 0.0, + "step": 41674 + }, + { + "epoch": 3.8886815340113836, + "grad_norm": NaN, + "learning_rate": 8.749328771662818e-05, + "loss": 0.0, + "step": 41675 + }, + { + "epoch": 3.888774843706261, + "grad_norm": NaN, + "learning_rate": 8.748641202233527e-05, + "loss": 0.0, + "step": 41676 + }, + { + "epoch": 3.8888681534011384, + "grad_norm": NaN, + "learning_rate": 8.74795364869959e-05, + "loss": 0.0, + "step": 41677 + }, + { + "epoch": 3.888961463096016, + "grad_norm": NaN, + "learning_rate": 8.747266111062754e-05, + "loss": 0.0, + "step": 41678 + }, + { + "epoch": 3.889054772790893, + "grad_norm": NaN, + "learning_rate": 8.746578589324769e-05, + "loss": 0.0, + "step": 41679 + }, + { + "epoch": 3.8891480824857703, + "grad_norm": NaN, + "learning_rate": 8.74589108348738e-05, + "loss": 0.0, + "step": 41680 + }, + { + "epoch": 3.8892413921806477, + "grad_norm": NaN, + "learning_rate": 8.745203593552339e-05, + "loss": 0.0, + "step": 41681 + }, + { + "epoch": 3.8893347018755247, + "grad_norm": NaN, + "learning_rate": 8.744516119521393e-05, + "loss": 0.0, + "step": 41682 + }, + { + "epoch": 3.889428011570402, + "grad_norm": NaN, + "learning_rate": 8.743828661396291e-05, + "loss": 0.0, + "step": 41683 + }, + { + "epoch": 3.8895213212652795, + "grad_norm": NaN, + "learning_rate": 8.743141219178777e-05, + "loss": 0.0, + "step": 41684 + }, + { + "epoch": 3.889614630960157, + "grad_norm": NaN, + "learning_rate": 8.742453792870602e-05, + "loss": 0.0, + "step": 41685 + }, + { + "epoch": 3.8897079406550343, + "grad_norm": NaN, + "learning_rate": 8.741766382473514e-05, + "loss": 0.0, + "step": 41686 + }, + { + "epoch": 3.8898012503499113, + "grad_norm": NaN, + "learning_rate": 8.741078987989259e-05, + "loss": 0.0, + "step": 41687 + }, + { + "epoch": 3.8898945600447887, + "grad_norm": NaN, + "learning_rate": 8.740391609419588e-05, + "loss": 0.0, + "step": 41688 + }, + { + "epoch": 3.8899878697396657, + "grad_norm": NaN, + "learning_rate": 8.739704246766245e-05, + "loss": 0.0, + "step": 41689 + }, + { + "epoch": 3.890081179434543, + "grad_norm": NaN, + "learning_rate": 8.73901690003098e-05, + "loss": 0.0, + "step": 41690 + }, + { + "epoch": 3.8901744891294205, + "grad_norm": NaN, + "learning_rate": 8.738329569215541e-05, + "loss": 0.0, + "step": 41691 + }, + { + "epoch": 3.890267798824298, + "grad_norm": NaN, + "learning_rate": 8.737642254321675e-05, + "loss": 0.0, + "step": 41692 + }, + { + "epoch": 3.8903611085191754, + "grad_norm": NaN, + "learning_rate": 8.736954955351128e-05, + "loss": 0.0, + "step": 41693 + }, + { + "epoch": 3.8904544182140524, + "grad_norm": NaN, + "learning_rate": 8.736267672305653e-05, + "loss": 0.0, + "step": 41694 + }, + { + "epoch": 3.8905477279089298, + "grad_norm": NaN, + "learning_rate": 8.735580405186992e-05, + "loss": 0.0, + "step": 41695 + }, + { + "epoch": 3.890641037603807, + "grad_norm": NaN, + "learning_rate": 8.734893153996893e-05, + "loss": 0.0, + "step": 41696 + }, + { + "epoch": 3.890734347298684, + "grad_norm": NaN, + "learning_rate": 8.734205918737105e-05, + "loss": 0.0, + "step": 41697 + }, + { + "epoch": 3.8908276569935616, + "grad_norm": NaN, + "learning_rate": 8.733518699409377e-05, + "loss": 0.0, + "step": 41698 + }, + { + "epoch": 3.890920966688439, + "grad_norm": NaN, + "learning_rate": 8.732831496015454e-05, + "loss": 0.0, + "step": 41699 + }, + { + "epoch": 3.8910142763833164, + "grad_norm": NaN, + "learning_rate": 8.732144308557084e-05, + "loss": 0.0, + "step": 41700 + }, + { + "epoch": 3.8911075860781934, + "grad_norm": NaN, + "learning_rate": 8.731457137036015e-05, + "loss": 0.0, + "step": 41701 + }, + { + "epoch": 3.891200895773071, + "grad_norm": NaN, + "learning_rate": 8.730769981453993e-05, + "loss": 0.0, + "step": 41702 + }, + { + "epoch": 3.8912942054679482, + "grad_norm": NaN, + "learning_rate": 8.730082841812766e-05, + "loss": 0.0, + "step": 41703 + }, + { + "epoch": 3.891387515162825, + "grad_norm": NaN, + "learning_rate": 8.72939571811408e-05, + "loss": 0.0, + "step": 41704 + }, + { + "epoch": 3.8914808248577026, + "grad_norm": NaN, + "learning_rate": 8.728708610359683e-05, + "loss": 0.0, + "step": 41705 + }, + { + "epoch": 3.89157413455258, + "grad_norm": NaN, + "learning_rate": 8.728021518551325e-05, + "loss": 0.0, + "step": 41706 + }, + { + "epoch": 3.8916674442474575, + "grad_norm": NaN, + "learning_rate": 8.727334442690749e-05, + "loss": 0.0, + "step": 41707 + }, + { + "epoch": 3.8917607539423344, + "grad_norm": NaN, + "learning_rate": 8.726647382779701e-05, + "loss": 0.0, + "step": 41708 + }, + { + "epoch": 3.891854063637212, + "grad_norm": NaN, + "learning_rate": 8.725960338819934e-05, + "loss": 0.0, + "step": 41709 + }, + { + "epoch": 3.8919473733320893, + "grad_norm": NaN, + "learning_rate": 8.725273310813188e-05, + "loss": 0.0, + "step": 41710 + }, + { + "epoch": 3.8920406830269663, + "grad_norm": NaN, + "learning_rate": 8.724586298761217e-05, + "loss": 0.0, + "step": 41711 + }, + { + "epoch": 3.8921339927218437, + "grad_norm": NaN, + "learning_rate": 8.723899302665761e-05, + "loss": 0.0, + "step": 41712 + }, + { + "epoch": 3.892227302416721, + "grad_norm": NaN, + "learning_rate": 8.723212322528574e-05, + "loss": 0.0, + "step": 41713 + }, + { + "epoch": 3.8923206121115985, + "grad_norm": NaN, + "learning_rate": 8.722525358351393e-05, + "loss": 0.0, + "step": 41714 + }, + { + "epoch": 3.892413921806476, + "grad_norm": NaN, + "learning_rate": 8.721838410135974e-05, + "loss": 0.0, + "step": 41715 + }, + { + "epoch": 3.892507231501353, + "grad_norm": NaN, + "learning_rate": 8.721151477884061e-05, + "loss": 0.0, + "step": 41716 + }, + { + "epoch": 3.8926005411962303, + "grad_norm": NaN, + "learning_rate": 8.720464561597399e-05, + "loss": 0.0, + "step": 41717 + }, + { + "epoch": 3.8926938508911073, + "grad_norm": NaN, + "learning_rate": 8.719777661277735e-05, + "loss": 0.0, + "step": 41718 + }, + { + "epoch": 3.8927871605859847, + "grad_norm": NaN, + "learning_rate": 8.719090776926813e-05, + "loss": 0.0, + "step": 41719 + }, + { + "epoch": 3.892880470280862, + "grad_norm": NaN, + "learning_rate": 8.718403908546388e-05, + "loss": 0.0, + "step": 41720 + }, + { + "epoch": 3.8929737799757396, + "grad_norm": NaN, + "learning_rate": 8.717717056138198e-05, + "loss": 0.0, + "step": 41721 + }, + { + "epoch": 3.893067089670617, + "grad_norm": NaN, + "learning_rate": 8.717030219703994e-05, + "loss": 0.0, + "step": 41722 + }, + { + "epoch": 3.893160399365494, + "grad_norm": NaN, + "learning_rate": 8.71634339924552e-05, + "loss": 0.0, + "step": 41723 + }, + { + "epoch": 3.8932537090603714, + "grad_norm": NaN, + "learning_rate": 8.715656594764524e-05, + "loss": 0.0, + "step": 41724 + }, + { + "epoch": 3.893347018755249, + "grad_norm": NaN, + "learning_rate": 8.714969806262752e-05, + "loss": 0.0, + "step": 41725 + }, + { + "epoch": 3.8934403284501258, + "grad_norm": NaN, + "learning_rate": 8.71428303374195e-05, + "loss": 0.0, + "step": 41726 + }, + { + "epoch": 3.893533638145003, + "grad_norm": NaN, + "learning_rate": 8.713596277203862e-05, + "loss": 0.0, + "step": 41727 + }, + { + "epoch": 3.8936269478398806, + "grad_norm": NaN, + "learning_rate": 8.712909536650237e-05, + "loss": 0.0, + "step": 41728 + }, + { + "epoch": 3.893720257534758, + "grad_norm": NaN, + "learning_rate": 8.712222812082823e-05, + "loss": 0.0, + "step": 41729 + }, + { + "epoch": 3.893813567229635, + "grad_norm": NaN, + "learning_rate": 8.711536103503362e-05, + "loss": 0.0, + "step": 41730 + }, + { + "epoch": 3.8939068769245124, + "grad_norm": NaN, + "learning_rate": 8.710849410913605e-05, + "loss": 0.0, + "step": 41731 + }, + { + "epoch": 3.89400018661939, + "grad_norm": NaN, + "learning_rate": 8.710162734315292e-05, + "loss": 0.0, + "step": 41732 + }, + { + "epoch": 3.894093496314267, + "grad_norm": NaN, + "learning_rate": 8.709476073710173e-05, + "loss": 0.0, + "step": 41733 + }, + { + "epoch": 3.8941868060091442, + "grad_norm": NaN, + "learning_rate": 8.708789429099992e-05, + "loss": 0.0, + "step": 41734 + }, + { + "epoch": 3.8942801157040217, + "grad_norm": NaN, + "learning_rate": 8.708102800486496e-05, + "loss": 0.0, + "step": 41735 + }, + { + "epoch": 3.894373425398899, + "grad_norm": NaN, + "learning_rate": 8.707416187871432e-05, + "loss": 0.0, + "step": 41736 + }, + { + "epoch": 3.8944667350937765, + "grad_norm": NaN, + "learning_rate": 8.706729591256543e-05, + "loss": 0.0, + "step": 41737 + }, + { + "epoch": 3.8945600447886535, + "grad_norm": NaN, + "learning_rate": 8.706043010643576e-05, + "loss": 0.0, + "step": 41738 + }, + { + "epoch": 3.894653354483531, + "grad_norm": NaN, + "learning_rate": 8.70535644603428e-05, + "loss": 0.0, + "step": 41739 + }, + { + "epoch": 3.894746664178408, + "grad_norm": NaN, + "learning_rate": 8.704669897430398e-05, + "loss": 0.0, + "step": 41740 + }, + { + "epoch": 3.8948399738732853, + "grad_norm": NaN, + "learning_rate": 8.703983364833673e-05, + "loss": 0.0, + "step": 41741 + }, + { + "epoch": 3.8949332835681627, + "grad_norm": NaN, + "learning_rate": 8.703296848245854e-05, + "loss": 0.0, + "step": 41742 + }, + { + "epoch": 3.89502659326304, + "grad_norm": NaN, + "learning_rate": 8.702610347668686e-05, + "loss": 0.0, + "step": 41743 + }, + { + "epoch": 3.8951199029579175, + "grad_norm": NaN, + "learning_rate": 8.701923863103915e-05, + "loss": 0.0, + "step": 41744 + }, + { + "epoch": 3.8952132126527945, + "grad_norm": NaN, + "learning_rate": 8.701237394553285e-05, + "loss": 0.0, + "step": 41745 + }, + { + "epoch": 3.895306522347672, + "grad_norm": NaN, + "learning_rate": 8.700550942018544e-05, + "loss": 0.0, + "step": 41746 + }, + { + "epoch": 3.8953998320425494, + "grad_norm": NaN, + "learning_rate": 8.699864505501436e-05, + "loss": 0.0, + "step": 41747 + }, + { + "epoch": 3.8954931417374263, + "grad_norm": NaN, + "learning_rate": 8.699178085003704e-05, + "loss": 0.0, + "step": 41748 + }, + { + "epoch": 3.8955864514323038, + "grad_norm": NaN, + "learning_rate": 8.698491680527096e-05, + "loss": 0.0, + "step": 41749 + }, + { + "epoch": 3.895679761127181, + "grad_norm": NaN, + "learning_rate": 8.697805292073358e-05, + "loss": 0.0, + "step": 41750 + }, + { + "epoch": 3.8957730708220586, + "grad_norm": NaN, + "learning_rate": 8.697118919644236e-05, + "loss": 0.0, + "step": 41751 + }, + { + "epoch": 3.8958663805169356, + "grad_norm": NaN, + "learning_rate": 8.696432563241474e-05, + "loss": 0.0, + "step": 41752 + }, + { + "epoch": 3.895959690211813, + "grad_norm": NaN, + "learning_rate": 8.695746222866813e-05, + "loss": 0.0, + "step": 41753 + }, + { + "epoch": 3.8960529999066904, + "grad_norm": NaN, + "learning_rate": 8.695059898522004e-05, + "loss": 0.0, + "step": 41754 + }, + { + "epoch": 3.8961463096015674, + "grad_norm": NaN, + "learning_rate": 8.69437359020879e-05, + "loss": 0.0, + "step": 41755 + }, + { + "epoch": 3.896239619296445, + "grad_norm": NaN, + "learning_rate": 8.693687297928914e-05, + "loss": 0.0, + "step": 41756 + }, + { + "epoch": 3.896332928991322, + "grad_norm": NaN, + "learning_rate": 8.693001021684124e-05, + "loss": 0.0, + "step": 41757 + }, + { + "epoch": 3.8964262386861996, + "grad_norm": NaN, + "learning_rate": 8.692314761476166e-05, + "loss": 0.0, + "step": 41758 + }, + { + "epoch": 3.896519548381077, + "grad_norm": NaN, + "learning_rate": 8.691628517306782e-05, + "loss": 0.0, + "step": 41759 + }, + { + "epoch": 3.896612858075954, + "grad_norm": NaN, + "learning_rate": 8.690942289177718e-05, + "loss": 0.0, + "step": 41760 + }, + { + "epoch": 3.8967061677708315, + "grad_norm": NaN, + "learning_rate": 8.69025607709072e-05, + "loss": 0.0, + "step": 41761 + }, + { + "epoch": 3.8967994774657084, + "grad_norm": NaN, + "learning_rate": 8.689569881047531e-05, + "loss": 0.0, + "step": 41762 + }, + { + "epoch": 3.896892787160586, + "grad_norm": NaN, + "learning_rate": 8.688883701049894e-05, + "loss": 0.0, + "step": 41763 + }, + { + "epoch": 3.8969860968554633, + "grad_norm": NaN, + "learning_rate": 8.68819753709956e-05, + "loss": 0.0, + "step": 41764 + }, + { + "epoch": 3.8970794065503407, + "grad_norm": NaN, + "learning_rate": 8.687511389198269e-05, + "loss": 0.0, + "step": 41765 + }, + { + "epoch": 3.897172716245218, + "grad_norm": NaN, + "learning_rate": 8.686825257347766e-05, + "loss": 0.0, + "step": 41766 + }, + { + "epoch": 3.897266025940095, + "grad_norm": NaN, + "learning_rate": 8.686139141549795e-05, + "loss": 0.0, + "step": 41767 + }, + { + "epoch": 3.8973593356349725, + "grad_norm": NaN, + "learning_rate": 8.685453041806104e-05, + "loss": 0.0, + "step": 41768 + }, + { + "epoch": 3.89745264532985, + "grad_norm": NaN, + "learning_rate": 8.684766958118435e-05, + "loss": 0.0, + "step": 41769 + }, + { + "epoch": 3.897545955024727, + "grad_norm": NaN, + "learning_rate": 8.684080890488532e-05, + "loss": 0.0, + "step": 41770 + }, + { + "epoch": 3.8976392647196043, + "grad_norm": NaN, + "learning_rate": 8.68339483891814e-05, + "loss": 0.0, + "step": 41771 + }, + { + "epoch": 3.8977325744144817, + "grad_norm": NaN, + "learning_rate": 8.682708803409006e-05, + "loss": 0.0, + "step": 41772 + }, + { + "epoch": 3.897825884109359, + "grad_norm": NaN, + "learning_rate": 8.68202278396287e-05, + "loss": 0.0, + "step": 41773 + }, + { + "epoch": 3.897919193804236, + "grad_norm": NaN, + "learning_rate": 8.68133678058148e-05, + "loss": 0.0, + "step": 41774 + }, + { + "epoch": 3.8980125034991135, + "grad_norm": NaN, + "learning_rate": 8.68065079326658e-05, + "loss": 0.0, + "step": 41775 + }, + { + "epoch": 3.898105813193991, + "grad_norm": NaN, + "learning_rate": 8.679964822019911e-05, + "loss": 0.0, + "step": 41776 + }, + { + "epoch": 3.898199122888868, + "grad_norm": NaN, + "learning_rate": 8.679278866843219e-05, + "loss": 0.0, + "step": 41777 + }, + { + "epoch": 3.8982924325837454, + "grad_norm": NaN, + "learning_rate": 8.678592927738246e-05, + "loss": 0.0, + "step": 41778 + }, + { + "epoch": 3.898385742278623, + "grad_norm": NaN, + "learning_rate": 8.677907004706743e-05, + "loss": 0.0, + "step": 41779 + }, + { + "epoch": 3.8984790519735, + "grad_norm": NaN, + "learning_rate": 8.677221097750451e-05, + "loss": 0.0, + "step": 41780 + }, + { + "epoch": 3.8985723616683776, + "grad_norm": NaN, + "learning_rate": 8.676535206871109e-05, + "loss": 0.0, + "step": 41781 + }, + { + "epoch": 3.8986656713632546, + "grad_norm": NaN, + "learning_rate": 8.675849332070467e-05, + "loss": 0.0, + "step": 41782 + }, + { + "epoch": 3.898758981058132, + "grad_norm": NaN, + "learning_rate": 8.675163473350267e-05, + "loss": 0.0, + "step": 41783 + }, + { + "epoch": 3.898852290753009, + "grad_norm": NaN, + "learning_rate": 8.674477630712252e-05, + "loss": 0.0, + "step": 41784 + }, + { + "epoch": 3.8989456004478864, + "grad_norm": NaN, + "learning_rate": 8.673791804158163e-05, + "loss": 0.0, + "step": 41785 + }, + { + "epoch": 3.899038910142764, + "grad_norm": NaN, + "learning_rate": 8.673105993689752e-05, + "loss": 0.0, + "step": 41786 + }, + { + "epoch": 3.8991322198376412, + "grad_norm": NaN, + "learning_rate": 8.672420199308759e-05, + "loss": 0.0, + "step": 41787 + }, + { + "epoch": 3.8992255295325187, + "grad_norm": NaN, + "learning_rate": 8.671734421016923e-05, + "loss": 0.0, + "step": 41788 + }, + { + "epoch": 3.8993188392273956, + "grad_norm": NaN, + "learning_rate": 8.671048658815993e-05, + "loss": 0.0, + "step": 41789 + }, + { + "epoch": 3.899412148922273, + "grad_norm": NaN, + "learning_rate": 8.670362912707713e-05, + "loss": 0.0, + "step": 41790 + }, + { + "epoch": 3.8995054586171505, + "grad_norm": NaN, + "learning_rate": 8.669677182693826e-05, + "loss": 0.0, + "step": 41791 + }, + { + "epoch": 3.8995987683120275, + "grad_norm": NaN, + "learning_rate": 8.668991468776074e-05, + "loss": 0.0, + "step": 41792 + }, + { + "epoch": 3.899692078006905, + "grad_norm": NaN, + "learning_rate": 8.668305770956199e-05, + "loss": 0.0, + "step": 41793 + }, + { + "epoch": 3.8997853877017823, + "grad_norm": NaN, + "learning_rate": 8.667620089235949e-05, + "loss": 0.0, + "step": 41794 + }, + { + "epoch": 3.8998786973966597, + "grad_norm": NaN, + "learning_rate": 8.666934423617065e-05, + "loss": 0.0, + "step": 41795 + }, + { + "epoch": 3.8999720070915367, + "grad_norm": NaN, + "learning_rate": 8.666248774101289e-05, + "loss": 0.0, + "step": 41796 + }, + { + "epoch": 3.900065316786414, + "grad_norm": NaN, + "learning_rate": 8.665563140690371e-05, + "loss": 0.0, + "step": 41797 + }, + { + "epoch": 3.9001586264812915, + "grad_norm": NaN, + "learning_rate": 8.664877523386044e-05, + "loss": 0.0, + "step": 41798 + }, + { + "epoch": 3.9002519361761685, + "grad_norm": NaN, + "learning_rate": 8.66419192219006e-05, + "loss": 0.0, + "step": 41799 + }, + { + "epoch": 3.900345245871046, + "grad_norm": NaN, + "learning_rate": 8.66350633710416e-05, + "loss": 0.0, + "step": 41800 + }, + { + "epoch": 3.9004385555659233, + "grad_norm": NaN, + "learning_rate": 8.662820768130082e-05, + "loss": 0.0, + "step": 41801 + }, + { + "epoch": 3.9005318652608008, + "grad_norm": NaN, + "learning_rate": 8.662135215269577e-05, + "loss": 0.0, + "step": 41802 + }, + { + "epoch": 3.9006251749556777, + "grad_norm": NaN, + "learning_rate": 8.661449678524385e-05, + "loss": 0.0, + "step": 41803 + }, + { + "epoch": 3.900718484650555, + "grad_norm": NaN, + "learning_rate": 8.660764157896246e-05, + "loss": 0.0, + "step": 41804 + }, + { + "epoch": 3.9008117943454326, + "grad_norm": NaN, + "learning_rate": 8.660078653386909e-05, + "loss": 0.0, + "step": 41805 + }, + { + "epoch": 3.9009051040403095, + "grad_norm": NaN, + "learning_rate": 8.659393164998114e-05, + "loss": 0.0, + "step": 41806 + }, + { + "epoch": 3.900998413735187, + "grad_norm": NaN, + "learning_rate": 8.658707692731601e-05, + "loss": 0.0, + "step": 41807 + }, + { + "epoch": 3.9010917234300644, + "grad_norm": NaN, + "learning_rate": 8.658022236589118e-05, + "loss": 0.0, + "step": 41808 + }, + { + "epoch": 3.901185033124942, + "grad_norm": NaN, + "learning_rate": 8.657336796572407e-05, + "loss": 0.0, + "step": 41809 + }, + { + "epoch": 3.9012783428198192, + "grad_norm": NaN, + "learning_rate": 8.656651372683207e-05, + "loss": 0.0, + "step": 41810 + }, + { + "epoch": 3.901371652514696, + "grad_norm": NaN, + "learning_rate": 8.655965964923263e-05, + "loss": 0.0, + "step": 41811 + }, + { + "epoch": 3.9014649622095736, + "grad_norm": NaN, + "learning_rate": 8.65528057329432e-05, + "loss": 0.0, + "step": 41812 + }, + { + "epoch": 3.901558271904451, + "grad_norm": NaN, + "learning_rate": 8.654595197798119e-05, + "loss": 0.0, + "step": 41813 + }, + { + "epoch": 3.901651581599328, + "grad_norm": NaN, + "learning_rate": 8.653909838436403e-05, + "loss": 0.0, + "step": 41814 + }, + { + "epoch": 3.9017448912942054, + "grad_norm": NaN, + "learning_rate": 8.653224495210915e-05, + "loss": 0.0, + "step": 41815 + }, + { + "epoch": 3.901838200989083, + "grad_norm": NaN, + "learning_rate": 8.652539168123397e-05, + "loss": 0.0, + "step": 41816 + }, + { + "epoch": 3.9019315106839603, + "grad_norm": NaN, + "learning_rate": 8.65185385717559e-05, + "loss": 0.0, + "step": 41817 + }, + { + "epoch": 3.9020248203788372, + "grad_norm": NaN, + "learning_rate": 8.651168562369238e-05, + "loss": 0.0, + "step": 41818 + }, + { + "epoch": 3.9021181300737147, + "grad_norm": NaN, + "learning_rate": 8.650483283706086e-05, + "loss": 0.0, + "step": 41819 + }, + { + "epoch": 3.902211439768592, + "grad_norm": NaN, + "learning_rate": 8.649798021187871e-05, + "loss": 0.0, + "step": 41820 + }, + { + "epoch": 3.902304749463469, + "grad_norm": NaN, + "learning_rate": 8.649112774816342e-05, + "loss": 0.0, + "step": 41821 + }, + { + "epoch": 3.9023980591583465, + "grad_norm": NaN, + "learning_rate": 8.648427544593235e-05, + "loss": 0.0, + "step": 41822 + }, + { + "epoch": 3.902491368853224, + "grad_norm": NaN, + "learning_rate": 8.647742330520298e-05, + "loss": 0.0, + "step": 41823 + }, + { + "epoch": 3.9025846785481013, + "grad_norm": NaN, + "learning_rate": 8.647057132599268e-05, + "loss": 0.0, + "step": 41824 + }, + { + "epoch": 3.9026779882429783, + "grad_norm": NaN, + "learning_rate": 8.646371950831892e-05, + "loss": 0.0, + "step": 41825 + }, + { + "epoch": 3.9027712979378557, + "grad_norm": NaN, + "learning_rate": 8.64568678521991e-05, + "loss": 0.0, + "step": 41826 + }, + { + "epoch": 3.902864607632733, + "grad_norm": NaN, + "learning_rate": 8.645001635765063e-05, + "loss": 0.0, + "step": 41827 + }, + { + "epoch": 3.90295791732761, + "grad_norm": NaN, + "learning_rate": 8.644316502469093e-05, + "loss": 0.0, + "step": 41828 + }, + { + "epoch": 3.9030512270224875, + "grad_norm": NaN, + "learning_rate": 8.643631385333745e-05, + "loss": 0.0, + "step": 41829 + }, + { + "epoch": 3.903144536717365, + "grad_norm": NaN, + "learning_rate": 8.64294628436076e-05, + "loss": 0.0, + "step": 41830 + }, + { + "epoch": 3.9032378464122424, + "grad_norm": NaN, + "learning_rate": 8.64226119955188e-05, + "loss": 0.0, + "step": 41831 + }, + { + "epoch": 3.90333115610712, + "grad_norm": NaN, + "learning_rate": 8.641576130908845e-05, + "loss": 0.0, + "step": 41832 + }, + { + "epoch": 3.9034244658019968, + "grad_norm": NaN, + "learning_rate": 8.640891078433401e-05, + "loss": 0.0, + "step": 41833 + }, + { + "epoch": 3.903517775496874, + "grad_norm": NaN, + "learning_rate": 8.640206042127283e-05, + "loss": 0.0, + "step": 41834 + }, + { + "epoch": 3.903611085191751, + "grad_norm": NaN, + "learning_rate": 8.63952102199224e-05, + "loss": 0.0, + "step": 41835 + }, + { + "epoch": 3.9037043948866286, + "grad_norm": NaN, + "learning_rate": 8.638836018030011e-05, + "loss": 0.0, + "step": 41836 + }, + { + "epoch": 3.903797704581506, + "grad_norm": NaN, + "learning_rate": 8.638151030242336e-05, + "loss": 0.0, + "step": 41837 + }, + { + "epoch": 3.9038910142763834, + "grad_norm": NaN, + "learning_rate": 8.637466058630961e-05, + "loss": 0.0, + "step": 41838 + }, + { + "epoch": 3.903984323971261, + "grad_norm": NaN, + "learning_rate": 8.636781103197623e-05, + "loss": 0.0, + "step": 41839 + }, + { + "epoch": 3.904077633666138, + "grad_norm": NaN, + "learning_rate": 8.636096163944065e-05, + "loss": 0.0, + "step": 41840 + }, + { + "epoch": 3.9041709433610152, + "grad_norm": NaN, + "learning_rate": 8.635411240872032e-05, + "loss": 0.0, + "step": 41841 + }, + { + "epoch": 3.9042642530558926, + "grad_norm": NaN, + "learning_rate": 8.63472633398326e-05, + "loss": 0.0, + "step": 41842 + }, + { + "epoch": 3.9043575627507696, + "grad_norm": NaN, + "learning_rate": 8.634041443279496e-05, + "loss": 0.0, + "step": 41843 + }, + { + "epoch": 3.904450872445647, + "grad_norm": NaN, + "learning_rate": 8.633356568762477e-05, + "loss": 0.0, + "step": 41844 + }, + { + "epoch": 3.9045441821405245, + "grad_norm": NaN, + "learning_rate": 8.632671710433948e-05, + "loss": 0.0, + "step": 41845 + }, + { + "epoch": 3.904637491835402, + "grad_norm": NaN, + "learning_rate": 8.631986868295647e-05, + "loss": 0.0, + "step": 41846 + }, + { + "epoch": 3.904730801530279, + "grad_norm": NaN, + "learning_rate": 8.631302042349318e-05, + "loss": 0.0, + "step": 41847 + }, + { + "epoch": 3.9048241112251563, + "grad_norm": NaN, + "learning_rate": 8.630617232596701e-05, + "loss": 0.0, + "step": 41848 + }, + { + "epoch": 3.9049174209200337, + "grad_norm": NaN, + "learning_rate": 8.629932439039539e-05, + "loss": 0.0, + "step": 41849 + }, + { + "epoch": 3.9050107306149107, + "grad_norm": NaN, + "learning_rate": 8.629247661679569e-05, + "loss": 0.0, + "step": 41850 + }, + { + "epoch": 3.905104040309788, + "grad_norm": NaN, + "learning_rate": 8.628562900518537e-05, + "loss": 0.0, + "step": 41851 + }, + { + "epoch": 3.9051973500046655, + "grad_norm": NaN, + "learning_rate": 8.627878155558183e-05, + "loss": 0.0, + "step": 41852 + }, + { + "epoch": 3.905290659699543, + "grad_norm": NaN, + "learning_rate": 8.627193426800245e-05, + "loss": 0.0, + "step": 41853 + }, + { + "epoch": 3.9053839693944203, + "grad_norm": NaN, + "learning_rate": 8.626508714246467e-05, + "loss": 0.0, + "step": 41854 + }, + { + "epoch": 3.9054772790892973, + "grad_norm": NaN, + "learning_rate": 8.62582401789859e-05, + "loss": 0.0, + "step": 41855 + }, + { + "epoch": 3.9055705887841747, + "grad_norm": NaN, + "learning_rate": 8.625139337758354e-05, + "loss": 0.0, + "step": 41856 + }, + { + "epoch": 3.9056638984790517, + "grad_norm": NaN, + "learning_rate": 8.6244546738275e-05, + "loss": 0.0, + "step": 41857 + }, + { + "epoch": 3.905757208173929, + "grad_norm": NaN, + "learning_rate": 8.62377002610777e-05, + "loss": 0.0, + "step": 41858 + }, + { + "epoch": 3.9058505178688065, + "grad_norm": NaN, + "learning_rate": 8.623085394600904e-05, + "loss": 0.0, + "step": 41859 + }, + { + "epoch": 3.905943827563684, + "grad_norm": NaN, + "learning_rate": 8.622400779308643e-05, + "loss": 0.0, + "step": 41860 + }, + { + "epoch": 3.9060371372585614, + "grad_norm": NaN, + "learning_rate": 8.621716180232728e-05, + "loss": 0.0, + "step": 41861 + }, + { + "epoch": 3.9061304469534384, + "grad_norm": NaN, + "learning_rate": 8.621031597374899e-05, + "loss": 0.0, + "step": 41862 + }, + { + "epoch": 3.906223756648316, + "grad_norm": NaN, + "learning_rate": 8.620347030736896e-05, + "loss": 0.0, + "step": 41863 + }, + { + "epoch": 3.906317066343193, + "grad_norm": NaN, + "learning_rate": 8.61966248032046e-05, + "loss": 0.0, + "step": 41864 + }, + { + "epoch": 3.90641037603807, + "grad_norm": NaN, + "learning_rate": 8.618977946127334e-05, + "loss": 0.0, + "step": 41865 + }, + { + "epoch": 3.9065036857329476, + "grad_norm": NaN, + "learning_rate": 8.618293428159257e-05, + "loss": 0.0, + "step": 41866 + }, + { + "epoch": 3.906596995427825, + "grad_norm": NaN, + "learning_rate": 8.617608926417968e-05, + "loss": 0.0, + "step": 41867 + }, + { + "epoch": 3.9066903051227024, + "grad_norm": NaN, + "learning_rate": 8.616924440905211e-05, + "loss": 0.0, + "step": 41868 + }, + { + "epoch": 3.9067836148175794, + "grad_norm": NaN, + "learning_rate": 8.616239971622723e-05, + "loss": 0.0, + "step": 41869 + }, + { + "epoch": 3.906876924512457, + "grad_norm": NaN, + "learning_rate": 8.615555518572246e-05, + "loss": 0.0, + "step": 41870 + }, + { + "epoch": 3.9069702342073342, + "grad_norm": NaN, + "learning_rate": 8.61487108175552e-05, + "loss": 0.0, + "step": 41871 + }, + { + "epoch": 3.9070635439022112, + "grad_norm": NaN, + "learning_rate": 8.614186661174286e-05, + "loss": 0.0, + "step": 41872 + }, + { + "epoch": 3.9071568535970886, + "grad_norm": NaN, + "learning_rate": 8.613502256830282e-05, + "loss": 0.0, + "step": 41873 + }, + { + "epoch": 3.907250163291966, + "grad_norm": NaN, + "learning_rate": 8.612817868725251e-05, + "loss": 0.0, + "step": 41874 + }, + { + "epoch": 3.9073434729868435, + "grad_norm": NaN, + "learning_rate": 8.612133496860933e-05, + "loss": 0.0, + "step": 41875 + }, + { + "epoch": 3.907436782681721, + "grad_norm": NaN, + "learning_rate": 8.611449141239067e-05, + "loss": 0.0, + "step": 41876 + }, + { + "epoch": 3.907530092376598, + "grad_norm": NaN, + "learning_rate": 8.610764801861391e-05, + "loss": 0.0, + "step": 41877 + }, + { + "epoch": 3.9076234020714753, + "grad_norm": NaN, + "learning_rate": 8.61008047872965e-05, + "loss": 0.0, + "step": 41878 + }, + { + "epoch": 3.9077167117663523, + "grad_norm": NaN, + "learning_rate": 8.609396171845583e-05, + "loss": 0.0, + "step": 41879 + }, + { + "epoch": 3.9078100214612297, + "grad_norm": NaN, + "learning_rate": 8.608711881210927e-05, + "loss": 0.0, + "step": 41880 + }, + { + "epoch": 3.907903331156107, + "grad_norm": NaN, + "learning_rate": 8.608027606827421e-05, + "loss": 0.0, + "step": 41881 + }, + { + "epoch": 3.9079966408509845, + "grad_norm": NaN, + "learning_rate": 8.60734334869681e-05, + "loss": 0.0, + "step": 41882 + }, + { + "epoch": 3.908089950545862, + "grad_norm": NaN, + "learning_rate": 8.606659106820829e-05, + "loss": 0.0, + "step": 41883 + }, + { + "epoch": 3.908183260240739, + "grad_norm": NaN, + "learning_rate": 8.605974881201223e-05, + "loss": 0.0, + "step": 41884 + }, + { + "epoch": 3.9082765699356163, + "grad_norm": NaN, + "learning_rate": 8.605290671839727e-05, + "loss": 0.0, + "step": 41885 + }, + { + "epoch": 3.9083698796304938, + "grad_norm": NaN, + "learning_rate": 8.604606478738085e-05, + "loss": 0.0, + "step": 41886 + }, + { + "epoch": 3.9084631893253707, + "grad_norm": NaN, + "learning_rate": 8.60392230189803e-05, + "loss": 0.0, + "step": 41887 + }, + { + "epoch": 3.908556499020248, + "grad_norm": NaN, + "learning_rate": 8.603238141321308e-05, + "loss": 0.0, + "step": 41888 + }, + { + "epoch": 3.9086498087151256, + "grad_norm": NaN, + "learning_rate": 8.602553997009658e-05, + "loss": 0.0, + "step": 41889 + }, + { + "epoch": 3.908743118410003, + "grad_norm": NaN, + "learning_rate": 8.601869868964817e-05, + "loss": 0.0, + "step": 41890 + }, + { + "epoch": 3.90883642810488, + "grad_norm": NaN, + "learning_rate": 8.601185757188524e-05, + "loss": 0.0, + "step": 41891 + }, + { + "epoch": 3.9089297377997574, + "grad_norm": NaN, + "learning_rate": 8.600501661682523e-05, + "loss": 0.0, + "step": 41892 + }, + { + "epoch": 3.909023047494635, + "grad_norm": NaN, + "learning_rate": 8.599817582448548e-05, + "loss": 0.0, + "step": 41893 + }, + { + "epoch": 3.909116357189512, + "grad_norm": NaN, + "learning_rate": 8.599133519488342e-05, + "loss": 0.0, + "step": 41894 + }, + { + "epoch": 3.909209666884389, + "grad_norm": NaN, + "learning_rate": 8.598449472803645e-05, + "loss": 0.0, + "step": 41895 + }, + { + "epoch": 3.9093029765792666, + "grad_norm": NaN, + "learning_rate": 8.597765442396191e-05, + "loss": 0.0, + "step": 41896 + }, + { + "epoch": 3.909396286274144, + "grad_norm": NaN, + "learning_rate": 8.597081428267728e-05, + "loss": 0.0, + "step": 41897 + }, + { + "epoch": 3.9094895959690215, + "grad_norm": NaN, + "learning_rate": 8.596397430419986e-05, + "loss": 0.0, + "step": 41898 + }, + { + "epoch": 3.9095829056638984, + "grad_norm": NaN, + "learning_rate": 8.595713448854709e-05, + "loss": 0.0, + "step": 41899 + }, + { + "epoch": 3.909676215358776, + "grad_norm": NaN, + "learning_rate": 8.595029483573638e-05, + "loss": 0.0, + "step": 41900 + }, + { + "epoch": 3.909769525053653, + "grad_norm": NaN, + "learning_rate": 8.594345534578509e-05, + "loss": 0.0, + "step": 41901 + }, + { + "epoch": 3.9098628347485302, + "grad_norm": NaN, + "learning_rate": 8.593661601871061e-05, + "loss": 0.0, + "step": 41902 + }, + { + "epoch": 3.9099561444434077, + "grad_norm": NaN, + "learning_rate": 8.592977685453031e-05, + "loss": 0.0, + "step": 41903 + }, + { + "epoch": 3.910049454138285, + "grad_norm": NaN, + "learning_rate": 8.592293785326165e-05, + "loss": 0.0, + "step": 41904 + }, + { + "epoch": 3.9101427638331625, + "grad_norm": NaN, + "learning_rate": 8.591609901492198e-05, + "loss": 0.0, + "step": 41905 + }, + { + "epoch": 3.9102360735280395, + "grad_norm": NaN, + "learning_rate": 8.590926033952867e-05, + "loss": 0.0, + "step": 41906 + }, + { + "epoch": 3.910329383222917, + "grad_norm": NaN, + "learning_rate": 8.590242182709912e-05, + "loss": 0.0, + "step": 41907 + }, + { + "epoch": 3.9104226929177943, + "grad_norm": NaN, + "learning_rate": 8.589558347765073e-05, + "loss": 0.0, + "step": 41908 + }, + { + "epoch": 3.9105160026126713, + "grad_norm": NaN, + "learning_rate": 8.58887452912009e-05, + "loss": 0.0, + "step": 41909 + }, + { + "epoch": 3.9106093123075487, + "grad_norm": NaN, + "learning_rate": 8.588190726776698e-05, + "loss": 0.0, + "step": 41910 + }, + { + "epoch": 3.910702622002426, + "grad_norm": NaN, + "learning_rate": 8.587506940736638e-05, + "loss": 0.0, + "step": 41911 + }, + { + "epoch": 3.9107959316973036, + "grad_norm": NaN, + "learning_rate": 8.586823171001649e-05, + "loss": 0.0, + "step": 41912 + }, + { + "epoch": 3.9108892413921805, + "grad_norm": NaN, + "learning_rate": 8.586139417573469e-05, + "loss": 0.0, + "step": 41913 + }, + { + "epoch": 3.910982551087058, + "grad_norm": NaN, + "learning_rate": 8.585455680453834e-05, + "loss": 0.0, + "step": 41914 + }, + { + "epoch": 3.9110758607819354, + "grad_norm": NaN, + "learning_rate": 8.584771959644487e-05, + "loss": 0.0, + "step": 41915 + }, + { + "epoch": 3.9111691704768123, + "grad_norm": NaN, + "learning_rate": 8.584088255147167e-05, + "loss": 0.0, + "step": 41916 + }, + { + "epoch": 3.9112624801716898, + "grad_norm": NaN, + "learning_rate": 8.583404566963606e-05, + "loss": 0.0, + "step": 41917 + }, + { + "epoch": 3.911355789866567, + "grad_norm": NaN, + "learning_rate": 8.582720895095549e-05, + "loss": 0.0, + "step": 41918 + }, + { + "epoch": 3.9114490995614446, + "grad_norm": NaN, + "learning_rate": 8.582037239544731e-05, + "loss": 0.0, + "step": 41919 + }, + { + "epoch": 3.9115424092563216, + "grad_norm": NaN, + "learning_rate": 8.58135360031289e-05, + "loss": 0.0, + "step": 41920 + }, + { + "epoch": 3.911635718951199, + "grad_norm": NaN, + "learning_rate": 8.580669977401767e-05, + "loss": 0.0, + "step": 41921 + }, + { + "epoch": 3.9117290286460764, + "grad_norm": NaN, + "learning_rate": 8.579986370813097e-05, + "loss": 0.0, + "step": 41922 + }, + { + "epoch": 3.9118223383409534, + "grad_norm": NaN, + "learning_rate": 8.579302780548624e-05, + "loss": 0.0, + "step": 41923 + }, + { + "epoch": 3.911915648035831, + "grad_norm": NaN, + "learning_rate": 8.578619206610072e-05, + "loss": 0.0, + "step": 41924 + }, + { + "epoch": 3.9120089577307082, + "grad_norm": NaN, + "learning_rate": 8.577935648999202e-05, + "loss": 0.0, + "step": 41925 + }, + { + "epoch": 3.9121022674255856, + "grad_norm": NaN, + "learning_rate": 8.577252107717734e-05, + "loss": 0.0, + "step": 41926 + }, + { + "epoch": 3.912195577120463, + "grad_norm": NaN, + "learning_rate": 8.576568582767413e-05, + "loss": 0.0, + "step": 41927 + }, + { + "epoch": 3.91228888681534, + "grad_norm": NaN, + "learning_rate": 8.575885074149971e-05, + "loss": 0.0, + "step": 41928 + }, + { + "epoch": 3.9123821965102175, + "grad_norm": NaN, + "learning_rate": 8.575201581867153e-05, + "loss": 0.0, + "step": 41929 + }, + { + "epoch": 3.9124755062050944, + "grad_norm": NaN, + "learning_rate": 8.574518105920693e-05, + "loss": 0.0, + "step": 41930 + }, + { + "epoch": 3.912568815899972, + "grad_norm": NaN, + "learning_rate": 8.573834646312333e-05, + "loss": 0.0, + "step": 41931 + }, + { + "epoch": 3.9126621255948493, + "grad_norm": NaN, + "learning_rate": 8.573151203043805e-05, + "loss": 0.0, + "step": 41932 + }, + { + "epoch": 3.9127554352897267, + "grad_norm": NaN, + "learning_rate": 8.572467776116851e-05, + "loss": 0.0, + "step": 41933 + }, + { + "epoch": 3.912848744984604, + "grad_norm": NaN, + "learning_rate": 8.571784365533207e-05, + "loss": 0.0, + "step": 41934 + }, + { + "epoch": 3.912942054679481, + "grad_norm": NaN, + "learning_rate": 8.571100971294611e-05, + "loss": 0.0, + "step": 41935 + }, + { + "epoch": 3.9130353643743585, + "grad_norm": NaN, + "learning_rate": 8.570417593402803e-05, + "loss": 0.0, + "step": 41936 + }, + { + "epoch": 3.913128674069236, + "grad_norm": NaN, + "learning_rate": 8.569734231859514e-05, + "loss": 0.0, + "step": 41937 + }, + { + "epoch": 3.913221983764113, + "grad_norm": NaN, + "learning_rate": 8.569050886666488e-05, + "loss": 0.0, + "step": 41938 + }, + { + "epoch": 3.9133152934589903, + "grad_norm": NaN, + "learning_rate": 8.568367557825463e-05, + "loss": 0.0, + "step": 41939 + }, + { + "epoch": 3.9134086031538677, + "grad_norm": NaN, + "learning_rate": 8.567684245338172e-05, + "loss": 0.0, + "step": 41940 + }, + { + "epoch": 3.913501912848745, + "grad_norm": NaN, + "learning_rate": 8.567000949206355e-05, + "loss": 0.0, + "step": 41941 + }, + { + "epoch": 3.913595222543622, + "grad_norm": NaN, + "learning_rate": 8.566317669431749e-05, + "loss": 0.0, + "step": 41942 + }, + { + "epoch": 3.9136885322384996, + "grad_norm": NaN, + "learning_rate": 8.565634406016092e-05, + "loss": 0.0, + "step": 41943 + }, + { + "epoch": 3.913781841933377, + "grad_norm": NaN, + "learning_rate": 8.564951158961119e-05, + "loss": 0.0, + "step": 41944 + }, + { + "epoch": 3.913875151628254, + "grad_norm": NaN, + "learning_rate": 8.56426792826857e-05, + "loss": 0.0, + "step": 41945 + }, + { + "epoch": 3.9139684613231314, + "grad_norm": NaN, + "learning_rate": 8.563584713940182e-05, + "loss": 0.0, + "step": 41946 + }, + { + "epoch": 3.914061771018009, + "grad_norm": NaN, + "learning_rate": 8.562901515977692e-05, + "loss": 0.0, + "step": 41947 + }, + { + "epoch": 3.914155080712886, + "grad_norm": NaN, + "learning_rate": 8.562218334382836e-05, + "loss": 0.0, + "step": 41948 + }, + { + "epoch": 3.9142483904077636, + "grad_norm": NaN, + "learning_rate": 8.561535169157349e-05, + "loss": 0.0, + "step": 41949 + }, + { + "epoch": 3.9143417001026406, + "grad_norm": NaN, + "learning_rate": 8.560852020302972e-05, + "loss": 0.0, + "step": 41950 + }, + { + "epoch": 3.914435009797518, + "grad_norm": NaN, + "learning_rate": 8.560168887821444e-05, + "loss": 0.0, + "step": 41951 + }, + { + "epoch": 3.914528319492395, + "grad_norm": NaN, + "learning_rate": 8.559485771714496e-05, + "loss": 0.0, + "step": 41952 + }, + { + "epoch": 3.9146216291872724, + "grad_norm": NaN, + "learning_rate": 8.558802671983868e-05, + "loss": 0.0, + "step": 41953 + }, + { + "epoch": 3.91471493888215, + "grad_norm": NaN, + "learning_rate": 8.558119588631298e-05, + "loss": 0.0, + "step": 41954 + }, + { + "epoch": 3.9148082485770272, + "grad_norm": NaN, + "learning_rate": 8.557436521658521e-05, + "loss": 0.0, + "step": 41955 + }, + { + "epoch": 3.9149015582719047, + "grad_norm": NaN, + "learning_rate": 8.556753471067274e-05, + "loss": 0.0, + "step": 41956 + }, + { + "epoch": 3.9149948679667816, + "grad_norm": NaN, + "learning_rate": 8.556070436859295e-05, + "loss": 0.0, + "step": 41957 + }, + { + "epoch": 3.915088177661659, + "grad_norm": NaN, + "learning_rate": 8.555387419036322e-05, + "loss": 0.0, + "step": 41958 + }, + { + "epoch": 3.9151814873565365, + "grad_norm": NaN, + "learning_rate": 8.55470441760008e-05, + "loss": 0.0, + "step": 41959 + }, + { + "epoch": 3.9152747970514135, + "grad_norm": NaN, + "learning_rate": 8.55402143255233e-05, + "loss": 0.0, + "step": 41960 + }, + { + "epoch": 3.915368106746291, + "grad_norm": NaN, + "learning_rate": 8.553338463894787e-05, + "loss": 0.0, + "step": 41961 + }, + { + "epoch": 3.9154614164411683, + "grad_norm": NaN, + "learning_rate": 8.552655511629189e-05, + "loss": 0.0, + "step": 41962 + }, + { + "epoch": 3.9155547261360457, + "grad_norm": NaN, + "learning_rate": 8.55197257575729e-05, + "loss": 0.0, + "step": 41963 + }, + { + "epoch": 3.9156480358309227, + "grad_norm": NaN, + "learning_rate": 8.551289656280808e-05, + "loss": 0.0, + "step": 41964 + }, + { + "epoch": 3.9157413455258, + "grad_norm": NaN, + "learning_rate": 8.550606753201482e-05, + "loss": 0.0, + "step": 41965 + }, + { + "epoch": 3.9158346552206775, + "grad_norm": NaN, + "learning_rate": 8.549923866521062e-05, + "loss": 0.0, + "step": 41966 + }, + { + "epoch": 3.9159279649155545, + "grad_norm": NaN, + "learning_rate": 8.54924099624127e-05, + "loss": 0.0, + "step": 41967 + }, + { + "epoch": 3.916021274610432, + "grad_norm": NaN, + "learning_rate": 8.548558142363844e-05, + "loss": 0.0, + "step": 41968 + }, + { + "epoch": 3.9161145843053093, + "grad_norm": NaN, + "learning_rate": 8.547875304890532e-05, + "loss": 0.0, + "step": 41969 + }, + { + "epoch": 3.9162078940001868, + "grad_norm": NaN, + "learning_rate": 8.547192483823057e-05, + "loss": 0.0, + "step": 41970 + }, + { + "epoch": 3.916301203695064, + "grad_norm": NaN, + "learning_rate": 8.546509679163155e-05, + "loss": 0.0, + "step": 41971 + }, + { + "epoch": 3.916394513389941, + "grad_norm": NaN, + "learning_rate": 8.545826890912578e-05, + "loss": 0.0, + "step": 41972 + }, + { + "epoch": 3.9164878230848186, + "grad_norm": NaN, + "learning_rate": 8.545144119073047e-05, + "loss": 0.0, + "step": 41973 + }, + { + "epoch": 3.9165811327796956, + "grad_norm": NaN, + "learning_rate": 8.544461363646299e-05, + "loss": 0.0, + "step": 41974 + }, + { + "epoch": 3.916674442474573, + "grad_norm": NaN, + "learning_rate": 8.543778624634075e-05, + "loss": 0.0, + "step": 41975 + }, + { + "epoch": 3.9167677521694504, + "grad_norm": NaN, + "learning_rate": 8.543095902038112e-05, + "loss": 0.0, + "step": 41976 + }, + { + "epoch": 3.916861061864328, + "grad_norm": NaN, + "learning_rate": 8.542413195860142e-05, + "loss": 0.0, + "step": 41977 + }, + { + "epoch": 3.9169543715592052, + "grad_norm": NaN, + "learning_rate": 8.5417305061019e-05, + "loss": 0.0, + "step": 41978 + }, + { + "epoch": 3.917047681254082, + "grad_norm": NaN, + "learning_rate": 8.541047832765126e-05, + "loss": 0.0, + "step": 41979 + }, + { + "epoch": 3.9171409909489596, + "grad_norm": NaN, + "learning_rate": 8.540365175851556e-05, + "loss": 0.0, + "step": 41980 + }, + { + "epoch": 3.917234300643837, + "grad_norm": NaN, + "learning_rate": 8.539682535362923e-05, + "loss": 0.0, + "step": 41981 + }, + { + "epoch": 3.917327610338714, + "grad_norm": NaN, + "learning_rate": 8.538999911300962e-05, + "loss": 0.0, + "step": 41982 + }, + { + "epoch": 3.9174209200335914, + "grad_norm": NaN, + "learning_rate": 8.53831730366741e-05, + "loss": 0.0, + "step": 41983 + }, + { + "epoch": 3.917514229728469, + "grad_norm": NaN, + "learning_rate": 8.537634712464006e-05, + "loss": 0.0, + "step": 41984 + }, + { + "epoch": 3.9176075394233463, + "grad_norm": NaN, + "learning_rate": 8.536952137692479e-05, + "loss": 0.0, + "step": 41985 + }, + { + "epoch": 3.9177008491182232, + "grad_norm": NaN, + "learning_rate": 8.53626957935457e-05, + "loss": 0.0, + "step": 41986 + }, + { + "epoch": 3.9177941588131007, + "grad_norm": NaN, + "learning_rate": 8.535587037452014e-05, + "loss": 0.0, + "step": 41987 + }, + { + "epoch": 3.917887468507978, + "grad_norm": NaN, + "learning_rate": 8.534904511986542e-05, + "loss": 0.0, + "step": 41988 + }, + { + "epoch": 3.917980778202855, + "grad_norm": NaN, + "learning_rate": 8.534222002959896e-05, + "loss": 0.0, + "step": 41989 + }, + { + "epoch": 3.9180740878977325, + "grad_norm": NaN, + "learning_rate": 8.533539510373807e-05, + "loss": 0.0, + "step": 41990 + }, + { + "epoch": 3.91816739759261, + "grad_norm": NaN, + "learning_rate": 8.532857034230011e-05, + "loss": 0.0, + "step": 41991 + }, + { + "epoch": 3.9182607072874873, + "grad_norm": NaN, + "learning_rate": 8.532174574530244e-05, + "loss": 0.0, + "step": 41992 + }, + { + "epoch": 3.9183540169823647, + "grad_norm": NaN, + "learning_rate": 8.531492131276241e-05, + "loss": 0.0, + "step": 41993 + }, + { + "epoch": 3.9184473266772417, + "grad_norm": NaN, + "learning_rate": 8.530809704469738e-05, + "loss": 0.0, + "step": 41994 + }, + { + "epoch": 3.918540636372119, + "grad_norm": NaN, + "learning_rate": 8.53012729411247e-05, + "loss": 0.0, + "step": 41995 + }, + { + "epoch": 3.918633946066996, + "grad_norm": NaN, + "learning_rate": 8.529444900206171e-05, + "loss": 0.0, + "step": 41996 + }, + { + "epoch": 3.9187272557618735, + "grad_norm": NaN, + "learning_rate": 8.52876252275257e-05, + "loss": 0.0, + "step": 41997 + }, + { + "epoch": 3.918820565456751, + "grad_norm": NaN, + "learning_rate": 8.528080161753423e-05, + "loss": 0.0, + "step": 41998 + }, + { + "epoch": 3.9189138751516284, + "grad_norm": NaN, + "learning_rate": 8.527397817210443e-05, + "loss": 0.0, + "step": 41999 + }, + { + "epoch": 3.919007184846506, + "grad_norm": NaN, + "learning_rate": 8.52671548912537e-05, + "loss": 0.0, + "step": 42000 + }, + { + "epoch": 3.9191004945413828, + "grad_norm": NaN, + "learning_rate": 8.52603317749995e-05, + "loss": 0.0, + "step": 42001 + }, + { + "epoch": 3.91919380423626, + "grad_norm": NaN, + "learning_rate": 8.525350882335906e-05, + "loss": 0.0, + "step": 42002 + }, + { + "epoch": 3.9192871139311376, + "grad_norm": NaN, + "learning_rate": 8.524668603634972e-05, + "loss": 0.0, + "step": 42003 + }, + { + "epoch": 3.9193804236260146, + "grad_norm": NaN, + "learning_rate": 8.523986341398897e-05, + "loss": 0.0, + "step": 42004 + }, + { + "epoch": 3.919473733320892, + "grad_norm": NaN, + "learning_rate": 8.523304095629404e-05, + "loss": 0.0, + "step": 42005 + }, + { + "epoch": 3.9195670430157694, + "grad_norm": NaN, + "learning_rate": 8.522621866328222e-05, + "loss": 0.0, + "step": 42006 + }, + { + "epoch": 3.919660352710647, + "grad_norm": NaN, + "learning_rate": 8.521939653497105e-05, + "loss": 0.0, + "step": 42007 + }, + { + "epoch": 3.919753662405524, + "grad_norm": NaN, + "learning_rate": 8.521257457137771e-05, + "loss": 0.0, + "step": 42008 + }, + { + "epoch": 3.9198469721004012, + "grad_norm": NaN, + "learning_rate": 8.520575277251955e-05, + "loss": 0.0, + "step": 42009 + }, + { + "epoch": 3.9199402817952786, + "grad_norm": NaN, + "learning_rate": 8.519893113841406e-05, + "loss": 0.0, + "step": 42010 + }, + { + "epoch": 3.9200335914901556, + "grad_norm": NaN, + "learning_rate": 8.519210966907847e-05, + "loss": 0.0, + "step": 42011 + }, + { + "epoch": 3.920126901185033, + "grad_norm": NaN, + "learning_rate": 8.518528836453005e-05, + "loss": 0.0, + "step": 42012 + }, + { + "epoch": 3.9202202108799105, + "grad_norm": NaN, + "learning_rate": 8.51784672247864e-05, + "loss": 0.0, + "step": 42013 + }, + { + "epoch": 3.920313520574788, + "grad_norm": NaN, + "learning_rate": 8.517164624986463e-05, + "loss": 0.0, + "step": 42014 + }, + { + "epoch": 3.920406830269665, + "grad_norm": NaN, + "learning_rate": 8.516482543978209e-05, + "loss": 0.0, + "step": 42015 + }, + { + "epoch": 3.9205001399645423, + "grad_norm": NaN, + "learning_rate": 8.515800479455634e-05, + "loss": 0.0, + "step": 42016 + }, + { + "epoch": 3.9205934496594197, + "grad_norm": NaN, + "learning_rate": 8.51511843142045e-05, + "loss": 0.0, + "step": 42017 + }, + { + "epoch": 3.9206867593542967, + "grad_norm": NaN, + "learning_rate": 8.514436399874398e-05, + "loss": 0.0, + "step": 42018 + }, + { + "epoch": 3.920780069049174, + "grad_norm": NaN, + "learning_rate": 8.513754384819213e-05, + "loss": 0.0, + "step": 42019 + }, + { + "epoch": 3.9208733787440515, + "grad_norm": NaN, + "learning_rate": 8.513072386256632e-05, + "loss": 0.0, + "step": 42020 + }, + { + "epoch": 3.920966688438929, + "grad_norm": NaN, + "learning_rate": 8.512390404188384e-05, + "loss": 0.0, + "step": 42021 + }, + { + "epoch": 3.9210599981338063, + "grad_norm": NaN, + "learning_rate": 8.511708438616204e-05, + "loss": 0.0, + "step": 42022 + }, + { + "epoch": 3.9211533078286833, + "grad_norm": NaN, + "learning_rate": 8.511026489541831e-05, + "loss": 0.0, + "step": 42023 + }, + { + "epoch": 3.9212466175235607, + "grad_norm": NaN, + "learning_rate": 8.510344556966992e-05, + "loss": 0.0, + "step": 42024 + }, + { + "epoch": 3.921339927218438, + "grad_norm": NaN, + "learning_rate": 8.509662640893427e-05, + "loss": 0.0, + "step": 42025 + }, + { + "epoch": 3.921433236913315, + "grad_norm": NaN, + "learning_rate": 8.508980741322868e-05, + "loss": 0.0, + "step": 42026 + }, + { + "epoch": 3.9215265466081926, + "grad_norm": NaN, + "learning_rate": 8.508298858257047e-05, + "loss": 0.0, + "step": 42027 + }, + { + "epoch": 3.92161985630307, + "grad_norm": NaN, + "learning_rate": 8.507616991697698e-05, + "loss": 0.0, + "step": 42028 + }, + { + "epoch": 3.9217131659979474, + "grad_norm": NaN, + "learning_rate": 8.506935141646557e-05, + "loss": 0.0, + "step": 42029 + }, + { + "epoch": 3.9218064756928244, + "grad_norm": NaN, + "learning_rate": 8.506253308105356e-05, + "loss": 0.0, + "step": 42030 + }, + { + "epoch": 3.921899785387702, + "grad_norm": NaN, + "learning_rate": 8.50557149107583e-05, + "loss": 0.0, + "step": 42031 + }, + { + "epoch": 3.921993095082579, + "grad_norm": NaN, + "learning_rate": 8.504889690559709e-05, + "loss": 0.0, + "step": 42032 + }, + { + "epoch": 3.922086404777456, + "grad_norm": NaN, + "learning_rate": 8.504207906558733e-05, + "loss": 0.0, + "step": 42033 + }, + { + "epoch": 3.9221797144723336, + "grad_norm": NaN, + "learning_rate": 8.50352613907463e-05, + "loss": 0.0, + "step": 42034 + }, + { + "epoch": 3.922273024167211, + "grad_norm": NaN, + "learning_rate": 8.502844388109132e-05, + "loss": 0.0, + "step": 42035 + }, + { + "epoch": 3.9223663338620884, + "grad_norm": NaN, + "learning_rate": 8.502162653663987e-05, + "loss": 0.0, + "step": 42036 + }, + { + "epoch": 3.9224596435569654, + "grad_norm": NaN, + "learning_rate": 8.50148093574091e-05, + "loss": 0.0, + "step": 42037 + }, + { + "epoch": 3.922552953251843, + "grad_norm": NaN, + "learning_rate": 8.500799234341638e-05, + "loss": 0.0, + "step": 42038 + }, + { + "epoch": 3.9226462629467203, + "grad_norm": NaN, + "learning_rate": 8.500117549467918e-05, + "loss": 0.0, + "step": 42039 + }, + { + "epoch": 3.9227395726415972, + "grad_norm": NaN, + "learning_rate": 8.499435881121469e-05, + "loss": 0.0, + "step": 42040 + }, + { + "epoch": 3.9228328823364746, + "grad_norm": NaN, + "learning_rate": 8.498754229304024e-05, + "loss": 0.0, + "step": 42041 + }, + { + "epoch": 3.922926192031352, + "grad_norm": NaN, + "learning_rate": 8.498072594017333e-05, + "loss": 0.0, + "step": 42042 + }, + { + "epoch": 3.9230195017262295, + "grad_norm": NaN, + "learning_rate": 8.497390975263111e-05, + "loss": 0.0, + "step": 42043 + }, + { + "epoch": 3.923112811421107, + "grad_norm": NaN, + "learning_rate": 8.49670937304309e-05, + "loss": 0.0, + "step": 42044 + }, + { + "epoch": 3.923206121115984, + "grad_norm": NaN, + "learning_rate": 8.496027787359023e-05, + "loss": 0.0, + "step": 42045 + }, + { + "epoch": 3.9232994308108613, + "grad_norm": NaN, + "learning_rate": 8.495346218212627e-05, + "loss": 0.0, + "step": 42046 + }, + { + "epoch": 3.9233927405057383, + "grad_norm": NaN, + "learning_rate": 8.494664665605631e-05, + "loss": 0.0, + "step": 42047 + }, + { + "epoch": 3.9234860502006157, + "grad_norm": NaN, + "learning_rate": 8.493983129539787e-05, + "loss": 0.0, + "step": 42048 + }, + { + "epoch": 3.923579359895493, + "grad_norm": NaN, + "learning_rate": 8.493301610016814e-05, + "loss": 0.0, + "step": 42049 + }, + { + "epoch": 3.9236726695903705, + "grad_norm": NaN, + "learning_rate": 8.492620107038438e-05, + "loss": 0.0, + "step": 42050 + }, + { + "epoch": 3.923765979285248, + "grad_norm": NaN, + "learning_rate": 8.491938620606415e-05, + "loss": 0.0, + "step": 42051 + }, + { + "epoch": 3.923859288980125, + "grad_norm": NaN, + "learning_rate": 8.491257150722459e-05, + "loss": 0.0, + "step": 42052 + }, + { + "epoch": 3.9239525986750023, + "grad_norm": NaN, + "learning_rate": 8.4905756973883e-05, + "loss": 0.0, + "step": 42053 + }, + { + "epoch": 3.9240459083698798, + "grad_norm": NaN, + "learning_rate": 8.489894260605693e-05, + "loss": 0.0, + "step": 42054 + }, + { + "epoch": 3.9241392180647567, + "grad_norm": NaN, + "learning_rate": 8.489212840376349e-05, + "loss": 0.0, + "step": 42055 + }, + { + "epoch": 3.924232527759634, + "grad_norm": NaN, + "learning_rate": 8.488531436702002e-05, + "loss": 0.0, + "step": 42056 + }, + { + "epoch": 3.9243258374545116, + "grad_norm": NaN, + "learning_rate": 8.487850049584402e-05, + "loss": 0.0, + "step": 42057 + }, + { + "epoch": 3.924419147149389, + "grad_norm": NaN, + "learning_rate": 8.487168679025266e-05, + "loss": 0.0, + "step": 42058 + }, + { + "epoch": 3.924512456844266, + "grad_norm": NaN, + "learning_rate": 8.486487325026324e-05, + "loss": 0.0, + "step": 42059 + }, + { + "epoch": 3.9246057665391434, + "grad_norm": NaN, + "learning_rate": 8.485805987589329e-05, + "loss": 0.0, + "step": 42060 + }, + { + "epoch": 3.924699076234021, + "grad_norm": NaN, + "learning_rate": 8.485124666715991e-05, + "loss": 0.0, + "step": 42061 + }, + { + "epoch": 3.924792385928898, + "grad_norm": NaN, + "learning_rate": 8.484443362408052e-05, + "loss": 0.0, + "step": 42062 + }, + { + "epoch": 3.924885695623775, + "grad_norm": NaN, + "learning_rate": 8.483762074667245e-05, + "loss": 0.0, + "step": 42063 + }, + { + "epoch": 3.9249790053186526, + "grad_norm": NaN, + "learning_rate": 8.4830808034953e-05, + "loss": 0.0, + "step": 42064 + }, + { + "epoch": 3.92507231501353, + "grad_norm": NaN, + "learning_rate": 8.482399548893947e-05, + "loss": 0.0, + "step": 42065 + }, + { + "epoch": 3.9251656247084075, + "grad_norm": NaN, + "learning_rate": 8.481718310864927e-05, + "loss": 0.0, + "step": 42066 + }, + { + "epoch": 3.9252589344032844, + "grad_norm": NaN, + "learning_rate": 8.481037089409962e-05, + "loss": 0.0, + "step": 42067 + }, + { + "epoch": 3.925352244098162, + "grad_norm": NaN, + "learning_rate": 8.48035588453079e-05, + "loss": 0.0, + "step": 42068 + }, + { + "epoch": 3.925445553793039, + "grad_norm": NaN, + "learning_rate": 8.479674696229142e-05, + "loss": 0.0, + "step": 42069 + }, + { + "epoch": 3.9255388634879163, + "grad_norm": NaN, + "learning_rate": 8.47899352450675e-05, + "loss": 0.0, + "step": 42070 + }, + { + "epoch": 3.9256321731827937, + "grad_norm": NaN, + "learning_rate": 8.478312369365346e-05, + "loss": 0.0, + "step": 42071 + }, + { + "epoch": 3.925725482877671, + "grad_norm": NaN, + "learning_rate": 8.477631230806659e-05, + "loss": 0.0, + "step": 42072 + }, + { + "epoch": 3.9258187925725485, + "grad_norm": NaN, + "learning_rate": 8.476950108832421e-05, + "loss": 0.0, + "step": 42073 + }, + { + "epoch": 3.9259121022674255, + "grad_norm": NaN, + "learning_rate": 8.476269003444378e-05, + "loss": 0.0, + "step": 42074 + }, + { + "epoch": 3.926005411962303, + "grad_norm": NaN, + "learning_rate": 8.475587914644242e-05, + "loss": 0.0, + "step": 42075 + }, + { + "epoch": 3.9260987216571803, + "grad_norm": NaN, + "learning_rate": 8.474906842433751e-05, + "loss": 0.0, + "step": 42076 + }, + { + "epoch": 3.9261920313520573, + "grad_norm": NaN, + "learning_rate": 8.47422578681465e-05, + "loss": 0.0, + "step": 42077 + }, + { + "epoch": 3.9262853410469347, + "grad_norm": NaN, + "learning_rate": 8.473544747788653e-05, + "loss": 0.0, + "step": 42078 + }, + { + "epoch": 3.926378650741812, + "grad_norm": NaN, + "learning_rate": 8.472863725357492e-05, + "loss": 0.0, + "step": 42079 + }, + { + "epoch": 3.9264719604366896, + "grad_norm": NaN, + "learning_rate": 8.472182719522917e-05, + "loss": 0.0, + "step": 42080 + }, + { + "epoch": 3.9265652701315665, + "grad_norm": NaN, + "learning_rate": 8.471501730286643e-05, + "loss": 0.0, + "step": 42081 + }, + { + "epoch": 3.926658579826444, + "grad_norm": NaN, + "learning_rate": 8.470820757650399e-05, + "loss": 0.0, + "step": 42082 + }, + { + "epoch": 3.9267518895213214, + "grad_norm": NaN, + "learning_rate": 8.470139801615937e-05, + "loss": 0.0, + "step": 42083 + }, + { + "epoch": 3.9268451992161983, + "grad_norm": NaN, + "learning_rate": 8.469458862184967e-05, + "loss": 0.0, + "step": 42084 + }, + { + "epoch": 3.9269385089110758, + "grad_norm": NaN, + "learning_rate": 8.468777939359224e-05, + "loss": 0.0, + "step": 42085 + }, + { + "epoch": 3.927031818605953, + "grad_norm": NaN, + "learning_rate": 8.468097033140455e-05, + "loss": 0.0, + "step": 42086 + }, + { + "epoch": 3.9271251283008306, + "grad_norm": NaN, + "learning_rate": 8.467416143530376e-05, + "loss": 0.0, + "step": 42087 + }, + { + "epoch": 3.927218437995708, + "grad_norm": NaN, + "learning_rate": 8.466735270530716e-05, + "loss": 0.0, + "step": 42088 + }, + { + "epoch": 3.927311747690585, + "grad_norm": NaN, + "learning_rate": 8.466054414143224e-05, + "loss": 0.0, + "step": 42089 + }, + { + "epoch": 3.9274050573854624, + "grad_norm": NaN, + "learning_rate": 8.465373574369616e-05, + "loss": 0.0, + "step": 42090 + }, + { + "epoch": 3.9274983670803394, + "grad_norm": NaN, + "learning_rate": 8.46469275121162e-05, + "loss": 0.0, + "step": 42091 + }, + { + "epoch": 3.927591676775217, + "grad_norm": NaN, + "learning_rate": 8.464011944670985e-05, + "loss": 0.0, + "step": 42092 + }, + { + "epoch": 3.9276849864700942, + "grad_norm": NaN, + "learning_rate": 8.463331154749429e-05, + "loss": 0.0, + "step": 42093 + }, + { + "epoch": 3.9277782961649716, + "grad_norm": NaN, + "learning_rate": 8.462650381448677e-05, + "loss": 0.0, + "step": 42094 + }, + { + "epoch": 3.927871605859849, + "grad_norm": NaN, + "learning_rate": 8.461969624770482e-05, + "loss": 0.0, + "step": 42095 + }, + { + "epoch": 3.927964915554726, + "grad_norm": NaN, + "learning_rate": 8.461288884716554e-05, + "loss": 0.0, + "step": 42096 + }, + { + "epoch": 3.9280582252496035, + "grad_norm": NaN, + "learning_rate": 8.460608161288626e-05, + "loss": 0.0, + "step": 42097 + }, + { + "epoch": 3.928151534944481, + "grad_norm": NaN, + "learning_rate": 8.459927454488447e-05, + "loss": 0.0, + "step": 42098 + }, + { + "epoch": 3.928244844639358, + "grad_norm": NaN, + "learning_rate": 8.45924676431773e-05, + "loss": 0.0, + "step": 42099 + }, + { + "epoch": 3.9283381543342353, + "grad_norm": NaN, + "learning_rate": 8.458566090778204e-05, + "loss": 0.0, + "step": 42100 + }, + { + "epoch": 3.9284314640291127, + "grad_norm": NaN, + "learning_rate": 8.457885433871618e-05, + "loss": 0.0, + "step": 42101 + }, + { + "epoch": 3.92852477372399, + "grad_norm": NaN, + "learning_rate": 8.457204793599685e-05, + "loss": 0.0, + "step": 42102 + }, + { + "epoch": 3.928618083418867, + "grad_norm": NaN, + "learning_rate": 8.456524169964137e-05, + "loss": 0.0, + "step": 42103 + }, + { + "epoch": 3.9287113931137445, + "grad_norm": NaN, + "learning_rate": 8.455843562966723e-05, + "loss": 0.0, + "step": 42104 + }, + { + "epoch": 3.928804702808622, + "grad_norm": NaN, + "learning_rate": 8.455162972609151e-05, + "loss": 0.0, + "step": 42105 + }, + { + "epoch": 3.928898012503499, + "grad_norm": NaN, + "learning_rate": 8.454482398893165e-05, + "loss": 0.0, + "step": 42106 + }, + { + "epoch": 3.9289913221983763, + "grad_norm": NaN, + "learning_rate": 8.453801841820489e-05, + "loss": 0.0, + "step": 42107 + }, + { + "epoch": 3.9290846318932537, + "grad_norm": NaN, + "learning_rate": 8.453121301392854e-05, + "loss": 0.0, + "step": 42108 + }, + { + "epoch": 3.929177941588131, + "grad_norm": NaN, + "learning_rate": 8.452440777611995e-05, + "loss": 0.0, + "step": 42109 + }, + { + "epoch": 3.9292712512830086, + "grad_norm": NaN, + "learning_rate": 8.45176027047964e-05, + "loss": 0.0, + "step": 42110 + }, + { + "epoch": 3.9293645609778856, + "grad_norm": NaN, + "learning_rate": 8.45107977999751e-05, + "loss": 0.0, + "step": 42111 + }, + { + "epoch": 3.929457870672763, + "grad_norm": NaN, + "learning_rate": 8.45039930616736e-05, + "loss": 0.0, + "step": 42112 + }, + { + "epoch": 3.92955118036764, + "grad_norm": NaN, + "learning_rate": 8.449718848990896e-05, + "loss": 0.0, + "step": 42113 + }, + { + "epoch": 3.9296444900625174, + "grad_norm": NaN, + "learning_rate": 8.44903840846985e-05, + "loss": 0.0, + "step": 42114 + }, + { + "epoch": 3.929737799757395, + "grad_norm": NaN, + "learning_rate": 8.448357984605971e-05, + "loss": 0.0, + "step": 42115 + }, + { + "epoch": 3.929831109452272, + "grad_norm": NaN, + "learning_rate": 8.447677577400972e-05, + "loss": 0.0, + "step": 42116 + }, + { + "epoch": 3.9299244191471496, + "grad_norm": NaN, + "learning_rate": 8.446997186856583e-05, + "loss": 0.0, + "step": 42117 + }, + { + "epoch": 3.9300177288420266, + "grad_norm": NaN, + "learning_rate": 8.44631681297455e-05, + "loss": 0.0, + "step": 42118 + }, + { + "epoch": 3.930111038536904, + "grad_norm": NaN, + "learning_rate": 8.445636455756584e-05, + "loss": 0.0, + "step": 42119 + }, + { + "epoch": 3.9302043482317814, + "grad_norm": NaN, + "learning_rate": 8.444956115204419e-05, + "loss": 0.0, + "step": 42120 + }, + { + "epoch": 3.9302976579266584, + "grad_norm": NaN, + "learning_rate": 8.4442757913198e-05, + "loss": 0.0, + "step": 42121 + }, + { + "epoch": 3.930390967621536, + "grad_norm": NaN, + "learning_rate": 8.443595484104441e-05, + "loss": 0.0, + "step": 42122 + }, + { + "epoch": 3.9304842773164133, + "grad_norm": NaN, + "learning_rate": 8.442915193560068e-05, + "loss": 0.0, + "step": 42123 + }, + { + "epoch": 3.9305775870112907, + "grad_norm": NaN, + "learning_rate": 8.442234919688434e-05, + "loss": 0.0, + "step": 42124 + }, + { + "epoch": 3.9306708967061676, + "grad_norm": NaN, + "learning_rate": 8.441554662491247e-05, + "loss": 0.0, + "step": 42125 + }, + { + "epoch": 3.930764206401045, + "grad_norm": NaN, + "learning_rate": 8.440874421970238e-05, + "loss": 0.0, + "step": 42126 + }, + { + "epoch": 3.9308575160959225, + "grad_norm": NaN, + "learning_rate": 8.440194198127154e-05, + "loss": 0.0, + "step": 42127 + }, + { + "epoch": 3.9309508257907995, + "grad_norm": NaN, + "learning_rate": 8.439513990963707e-05, + "loss": 0.0, + "step": 42128 + }, + { + "epoch": 3.931044135485677, + "grad_norm": NaN, + "learning_rate": 8.438833800481628e-05, + "loss": 0.0, + "step": 42129 + }, + { + "epoch": 3.9311374451805543, + "grad_norm": NaN, + "learning_rate": 8.438153626682662e-05, + "loss": 0.0, + "step": 42130 + }, + { + "epoch": 3.9312307548754317, + "grad_norm": NaN, + "learning_rate": 8.43747346956852e-05, + "loss": 0.0, + "step": 42131 + }, + { + "epoch": 3.9313240645703087, + "grad_norm": NaN, + "learning_rate": 8.436793329140935e-05, + "loss": 0.0, + "step": 42132 + }, + { + "epoch": 3.931417374265186, + "grad_norm": NaN, + "learning_rate": 8.436113205401653e-05, + "loss": 0.0, + "step": 42133 + }, + { + "epoch": 3.9315106839600635, + "grad_norm": NaN, + "learning_rate": 8.43543309835238e-05, + "loss": 0.0, + "step": 42134 + }, + { + "epoch": 3.9316039936549405, + "grad_norm": NaN, + "learning_rate": 8.434753007994855e-05, + "loss": 0.0, + "step": 42135 + }, + { + "epoch": 3.931697303349818, + "grad_norm": NaN, + "learning_rate": 8.434072934330818e-05, + "loss": 0.0, + "step": 42136 + }, + { + "epoch": 3.9317906130446953, + "grad_norm": NaN, + "learning_rate": 8.433392877361984e-05, + "loss": 0.0, + "step": 42137 + }, + { + "epoch": 3.9318839227395728, + "grad_norm": NaN, + "learning_rate": 8.43271283709008e-05, + "loss": 0.0, + "step": 42138 + }, + { + "epoch": 3.93197723243445, + "grad_norm": NaN, + "learning_rate": 8.432032813516854e-05, + "loss": 0.0, + "step": 42139 + }, + { + "epoch": 3.932070542129327, + "grad_norm": NaN, + "learning_rate": 8.431352806644018e-05, + "loss": 0.0, + "step": 42140 + }, + { + "epoch": 3.9321638518242046, + "grad_norm": NaN, + "learning_rate": 8.430672816473298e-05, + "loss": 0.0, + "step": 42141 + }, + { + "epoch": 3.9322571615190816, + "grad_norm": NaN, + "learning_rate": 8.429992843006445e-05, + "loss": 0.0, + "step": 42142 + }, + { + "epoch": 3.932350471213959, + "grad_norm": NaN, + "learning_rate": 8.429312886245166e-05, + "loss": 0.0, + "step": 42143 + }, + { + "epoch": 3.9324437809088364, + "grad_norm": NaN, + "learning_rate": 8.428632946191195e-05, + "loss": 0.0, + "step": 42144 + }, + { + "epoch": 3.932537090603714, + "grad_norm": NaN, + "learning_rate": 8.427953022846274e-05, + "loss": 0.0, + "step": 42145 + }, + { + "epoch": 3.9326304002985912, + "grad_norm": NaN, + "learning_rate": 8.427273116212115e-05, + "loss": 0.0, + "step": 42146 + }, + { + "epoch": 3.932723709993468, + "grad_norm": NaN, + "learning_rate": 8.426593226290448e-05, + "loss": 0.0, + "step": 42147 + }, + { + "epoch": 3.9328170196883456, + "grad_norm": NaN, + "learning_rate": 8.425913353083019e-05, + "loss": 0.0, + "step": 42148 + }, + { + "epoch": 3.932910329383223, + "grad_norm": NaN, + "learning_rate": 8.425233496591532e-05, + "loss": 0.0, + "step": 42149 + }, + { + "epoch": 3.9330036390781, + "grad_norm": NaN, + "learning_rate": 8.424553656817736e-05, + "loss": 0.0, + "step": 42150 + }, + { + "epoch": 3.9330969487729774, + "grad_norm": NaN, + "learning_rate": 8.423873833763359e-05, + "loss": 0.0, + "step": 42151 + }, + { + "epoch": 3.933190258467855, + "grad_norm": NaN, + "learning_rate": 8.42319402743011e-05, + "loss": 0.0, + "step": 42152 + }, + { + "epoch": 3.9332835681627323, + "grad_norm": NaN, + "learning_rate": 8.422514237819744e-05, + "loss": 0.0, + "step": 42153 + }, + { + "epoch": 3.9333768778576093, + "grad_norm": NaN, + "learning_rate": 8.421834464933967e-05, + "loss": 0.0, + "step": 42154 + }, + { + "epoch": 3.9334701875524867, + "grad_norm": NaN, + "learning_rate": 8.421154708774513e-05, + "loss": 0.0, + "step": 42155 + }, + { + "epoch": 3.933563497247364, + "grad_norm": NaN, + "learning_rate": 8.420474969343124e-05, + "loss": 0.0, + "step": 42156 + }, + { + "epoch": 3.933656806942241, + "grad_norm": NaN, + "learning_rate": 8.419795246641513e-05, + "loss": 0.0, + "step": 42157 + }, + { + "epoch": 3.9337501166371185, + "grad_norm": NaN, + "learning_rate": 8.419115540671406e-05, + "loss": 0.0, + "step": 42158 + }, + { + "epoch": 3.933843426331996, + "grad_norm": NaN, + "learning_rate": 8.418435851434552e-05, + "loss": 0.0, + "step": 42159 + }, + { + "epoch": 3.9339367360268733, + "grad_norm": NaN, + "learning_rate": 8.41775617893266e-05, + "loss": 0.0, + "step": 42160 + }, + { + "epoch": 3.9340300457217507, + "grad_norm": NaN, + "learning_rate": 8.417076523167458e-05, + "loss": 0.0, + "step": 42161 + }, + { + "epoch": 3.9341233554166277, + "grad_norm": NaN, + "learning_rate": 8.416396884140694e-05, + "loss": 0.0, + "step": 42162 + }, + { + "epoch": 3.934216665111505, + "grad_norm": NaN, + "learning_rate": 8.415717261854075e-05, + "loss": 0.0, + "step": 42163 + }, + { + "epoch": 3.934309974806382, + "grad_norm": NaN, + "learning_rate": 8.415037656309329e-05, + "loss": 0.0, + "step": 42164 + }, + { + "epoch": 3.9344032845012595, + "grad_norm": NaN, + "learning_rate": 8.414358067508206e-05, + "loss": 0.0, + "step": 42165 + }, + { + "epoch": 3.934496594196137, + "grad_norm": NaN, + "learning_rate": 8.413678495452415e-05, + "loss": 0.0, + "step": 42166 + }, + { + "epoch": 3.9345899038910144, + "grad_norm": NaN, + "learning_rate": 8.412998940143679e-05, + "loss": 0.0, + "step": 42167 + }, + { + "epoch": 3.934683213585892, + "grad_norm": NaN, + "learning_rate": 8.412319401583747e-05, + "loss": 0.0, + "step": 42168 + }, + { + "epoch": 3.9347765232807688, + "grad_norm": NaN, + "learning_rate": 8.411639879774333e-05, + "loss": 0.0, + "step": 42169 + }, + { + "epoch": 3.934869832975646, + "grad_norm": NaN, + "learning_rate": 8.410960374717159e-05, + "loss": 0.0, + "step": 42170 + }, + { + "epoch": 3.9349631426705236, + "grad_norm": NaN, + "learning_rate": 8.410280886413973e-05, + "loss": 0.0, + "step": 42171 + }, + { + "epoch": 3.9350564523654006, + "grad_norm": NaN, + "learning_rate": 8.409601414866485e-05, + "loss": 0.0, + "step": 42172 + }, + { + "epoch": 3.935149762060278, + "grad_norm": NaN, + "learning_rate": 8.408921960076422e-05, + "loss": 0.0, + "step": 42173 + }, + { + "epoch": 3.9352430717551554, + "grad_norm": NaN, + "learning_rate": 8.408242522045532e-05, + "loss": 0.0, + "step": 42174 + }, + { + "epoch": 3.935336381450033, + "grad_norm": NaN, + "learning_rate": 8.40756310077552e-05, + "loss": 0.0, + "step": 42175 + }, + { + "epoch": 3.93542969114491, + "grad_norm": NaN, + "learning_rate": 8.406883696268118e-05, + "loss": 0.0, + "step": 42176 + }, + { + "epoch": 3.9355230008397872, + "grad_norm": NaN, + "learning_rate": 8.40620430852507e-05, + "loss": 0.0, + "step": 42177 + }, + { + "epoch": 3.9356163105346647, + "grad_norm": NaN, + "learning_rate": 8.405524937548083e-05, + "loss": 0.0, + "step": 42178 + }, + { + "epoch": 3.9357096202295416, + "grad_norm": NaN, + "learning_rate": 8.404845583338889e-05, + "loss": 0.0, + "step": 42179 + }, + { + "epoch": 3.935802929924419, + "grad_norm": NaN, + "learning_rate": 8.40416624589923e-05, + "loss": 0.0, + "step": 42180 + }, + { + "epoch": 3.9358962396192965, + "grad_norm": NaN, + "learning_rate": 8.403486925230816e-05, + "loss": 0.0, + "step": 42181 + }, + { + "epoch": 3.935989549314174, + "grad_norm": NaN, + "learning_rate": 8.402807621335375e-05, + "loss": 0.0, + "step": 42182 + }, + { + "epoch": 3.9360828590090513, + "grad_norm": NaN, + "learning_rate": 8.402128334214651e-05, + "loss": 0.0, + "step": 42183 + }, + { + "epoch": 3.9361761687039283, + "grad_norm": NaN, + "learning_rate": 8.40144906387035e-05, + "loss": 0.0, + "step": 42184 + }, + { + "epoch": 3.9362694783988057, + "grad_norm": NaN, + "learning_rate": 8.400769810304215e-05, + "loss": 0.0, + "step": 42185 + }, + { + "epoch": 3.9363627880936827, + "grad_norm": NaN, + "learning_rate": 8.400090573517972e-05, + "loss": 0.0, + "step": 42186 + }, + { + "epoch": 3.93645609778856, + "grad_norm": NaN, + "learning_rate": 8.399411353513336e-05, + "loss": 0.0, + "step": 42187 + }, + { + "epoch": 3.9365494074834375, + "grad_norm": NaN, + "learning_rate": 8.398732150292043e-05, + "loss": 0.0, + "step": 42188 + }, + { + "epoch": 3.936642717178315, + "grad_norm": NaN, + "learning_rate": 8.398052963855826e-05, + "loss": 0.0, + "step": 42189 + }, + { + "epoch": 3.9367360268731924, + "grad_norm": NaN, + "learning_rate": 8.397373794206395e-05, + "loss": 0.0, + "step": 42190 + }, + { + "epoch": 3.9368293365680693, + "grad_norm": NaN, + "learning_rate": 8.396694641345488e-05, + "loss": 0.0, + "step": 42191 + }, + { + "epoch": 3.9369226462629467, + "grad_norm": NaN, + "learning_rate": 8.396015505274841e-05, + "loss": 0.0, + "step": 42192 + }, + { + "epoch": 3.937015955957824, + "grad_norm": NaN, + "learning_rate": 8.395336385996156e-05, + "loss": 0.0, + "step": 42193 + }, + { + "epoch": 3.937109265652701, + "grad_norm": NaN, + "learning_rate": 8.39465728351118e-05, + "loss": 0.0, + "step": 42194 + }, + { + "epoch": 3.9372025753475786, + "grad_norm": NaN, + "learning_rate": 8.393978197821642e-05, + "loss": 0.0, + "step": 42195 + }, + { + "epoch": 3.937295885042456, + "grad_norm": NaN, + "learning_rate": 8.393299128929245e-05, + "loss": 0.0, + "step": 42196 + }, + { + "epoch": 3.9373891947373334, + "grad_norm": NaN, + "learning_rate": 8.392620076835746e-05, + "loss": 0.0, + "step": 42197 + }, + { + "epoch": 3.9374825044322104, + "grad_norm": NaN, + "learning_rate": 8.39194104154285e-05, + "loss": 0.0, + "step": 42198 + }, + { + "epoch": 3.937575814127088, + "grad_norm": NaN, + "learning_rate": 8.391262023052284e-05, + "loss": 0.0, + "step": 42199 + }, + { + "epoch": 3.937669123821965, + "grad_norm": NaN, + "learning_rate": 8.390583021365793e-05, + "loss": 0.0, + "step": 42200 + }, + { + "epoch": 3.937762433516842, + "grad_norm": NaN, + "learning_rate": 8.389904036485087e-05, + "loss": 0.0, + "step": 42201 + }, + { + "epoch": 3.9378557432117196, + "grad_norm": NaN, + "learning_rate": 8.38922506841189e-05, + "loss": 0.0, + "step": 42202 + }, + { + "epoch": 3.937949052906597, + "grad_norm": NaN, + "learning_rate": 8.388546117147949e-05, + "loss": 0.0, + "step": 42203 + }, + { + "epoch": 3.9380423626014744, + "grad_norm": NaN, + "learning_rate": 8.387867182694968e-05, + "loss": 0.0, + "step": 42204 + }, + { + "epoch": 3.938135672296352, + "grad_norm": NaN, + "learning_rate": 8.387188265054678e-05, + "loss": 0.0, + "step": 42205 + }, + { + "epoch": 3.938228981991229, + "grad_norm": NaN, + "learning_rate": 8.38650936422882e-05, + "loss": 0.0, + "step": 42206 + }, + { + "epoch": 3.9383222916861063, + "grad_norm": NaN, + "learning_rate": 8.385830480219105e-05, + "loss": 0.0, + "step": 42207 + }, + { + "epoch": 3.9384156013809832, + "grad_norm": NaN, + "learning_rate": 8.385151613027258e-05, + "loss": 0.0, + "step": 42208 + }, + { + "epoch": 3.9385089110758607, + "grad_norm": NaN, + "learning_rate": 8.384472762655022e-05, + "loss": 0.0, + "step": 42209 + }, + { + "epoch": 3.938602220770738, + "grad_norm": NaN, + "learning_rate": 8.383793929104107e-05, + "loss": 0.0, + "step": 42210 + }, + { + "epoch": 3.9386955304656155, + "grad_norm": NaN, + "learning_rate": 8.383115112376237e-05, + "loss": 0.0, + "step": 42211 + }, + { + "epoch": 3.938788840160493, + "grad_norm": NaN, + "learning_rate": 8.382436312473157e-05, + "loss": 0.0, + "step": 42212 + }, + { + "epoch": 3.93888214985537, + "grad_norm": NaN, + "learning_rate": 8.381757529396577e-05, + "loss": 0.0, + "step": 42213 + }, + { + "epoch": 3.9389754595502473, + "grad_norm": NaN, + "learning_rate": 8.38107876314822e-05, + "loss": 0.0, + "step": 42214 + }, + { + "epoch": 3.9390687692451247, + "grad_norm": NaN, + "learning_rate": 8.380400013729831e-05, + "loss": 0.0, + "step": 42215 + }, + { + "epoch": 3.9391620789400017, + "grad_norm": NaN, + "learning_rate": 8.379721281143118e-05, + "loss": 0.0, + "step": 42216 + }, + { + "epoch": 3.939255388634879, + "grad_norm": NaN, + "learning_rate": 8.379042565389806e-05, + "loss": 0.0, + "step": 42217 + }, + { + "epoch": 3.9393486983297565, + "grad_norm": NaN, + "learning_rate": 8.378363866471639e-05, + "loss": 0.0, + "step": 42218 + }, + { + "epoch": 3.939442008024634, + "grad_norm": NaN, + "learning_rate": 8.377685184390328e-05, + "loss": 0.0, + "step": 42219 + }, + { + "epoch": 3.939535317719511, + "grad_norm": NaN, + "learning_rate": 8.377006519147595e-05, + "loss": 0.0, + "step": 42220 + }, + { + "epoch": 3.9396286274143884, + "grad_norm": NaN, + "learning_rate": 8.376327870745183e-05, + "loss": 0.0, + "step": 42221 + }, + { + "epoch": 3.9397219371092658, + "grad_norm": NaN, + "learning_rate": 8.375649239184796e-05, + "loss": 0.0, + "step": 42222 + }, + { + "epoch": 3.9398152468041427, + "grad_norm": NaN, + "learning_rate": 8.374970624468176e-05, + "loss": 0.0, + "step": 42223 + }, + { + "epoch": 3.93990855649902, + "grad_norm": NaN, + "learning_rate": 8.374292026597051e-05, + "loss": 0.0, + "step": 42224 + }, + { + "epoch": 3.9400018661938976, + "grad_norm": NaN, + "learning_rate": 8.373613445573126e-05, + "loss": 0.0, + "step": 42225 + }, + { + "epoch": 3.940095175888775, + "grad_norm": NaN, + "learning_rate": 8.372934881398144e-05, + "loss": 0.0, + "step": 42226 + }, + { + "epoch": 3.940188485583652, + "grad_norm": NaN, + "learning_rate": 8.372256334073832e-05, + "loss": 0.0, + "step": 42227 + }, + { + "epoch": 3.9402817952785294, + "grad_norm": NaN, + "learning_rate": 8.371577803601897e-05, + "loss": 0.0, + "step": 42228 + }, + { + "epoch": 3.940375104973407, + "grad_norm": NaN, + "learning_rate": 8.370899289984083e-05, + "loss": 0.0, + "step": 42229 + }, + { + "epoch": 3.940468414668284, + "grad_norm": NaN, + "learning_rate": 8.370220793222112e-05, + "loss": 0.0, + "step": 42230 + }, + { + "epoch": 3.940561724363161, + "grad_norm": NaN, + "learning_rate": 8.369542313317697e-05, + "loss": 0.0, + "step": 42231 + }, + { + "epoch": 3.9406550340580386, + "grad_norm": NaN, + "learning_rate": 8.368863850272575e-05, + "loss": 0.0, + "step": 42232 + }, + { + "epoch": 3.940748343752916, + "grad_norm": NaN, + "learning_rate": 8.368185404088475e-05, + "loss": 0.0, + "step": 42233 + }, + { + "epoch": 3.9408416534477935, + "grad_norm": NaN, + "learning_rate": 8.367506974767105e-05, + "loss": 0.0, + "step": 42234 + }, + { + "epoch": 3.9409349631426704, + "grad_norm": NaN, + "learning_rate": 8.366828562310203e-05, + "loss": 0.0, + "step": 42235 + }, + { + "epoch": 3.941028272837548, + "grad_norm": NaN, + "learning_rate": 8.366150166719498e-05, + "loss": 0.0, + "step": 42236 + }, + { + "epoch": 3.9411215825324253, + "grad_norm": NaN, + "learning_rate": 8.365471787996697e-05, + "loss": 0.0, + "step": 42237 + }, + { + "epoch": 3.9412148922273023, + "grad_norm": NaN, + "learning_rate": 8.364793426143541e-05, + "loss": 0.0, + "step": 42238 + }, + { + "epoch": 3.9413082019221797, + "grad_norm": NaN, + "learning_rate": 8.364115081161756e-05, + "loss": 0.0, + "step": 42239 + }, + { + "epoch": 3.941401511617057, + "grad_norm": NaN, + "learning_rate": 8.363436753053048e-05, + "loss": 0.0, + "step": 42240 + }, + { + "epoch": 3.9414948213119345, + "grad_norm": NaN, + "learning_rate": 8.362758441819167e-05, + "loss": 0.0, + "step": 42241 + }, + { + "epoch": 3.9415881310068115, + "grad_norm": NaN, + "learning_rate": 8.36208014746182e-05, + "loss": 0.0, + "step": 42242 + }, + { + "epoch": 3.941681440701689, + "grad_norm": NaN, + "learning_rate": 8.36140186998273e-05, + "loss": 0.0, + "step": 42243 + }, + { + "epoch": 3.9417747503965663, + "grad_norm": NaN, + "learning_rate": 8.36072360938364e-05, + "loss": 0.0, + "step": 42244 + }, + { + "epoch": 3.9418680600914433, + "grad_norm": NaN, + "learning_rate": 8.360045365666258e-05, + "loss": 0.0, + "step": 42245 + }, + { + "epoch": 3.9419613697863207, + "grad_norm": NaN, + "learning_rate": 8.359367138832308e-05, + "loss": 0.0, + "step": 42246 + }, + { + "epoch": 3.942054679481198, + "grad_norm": NaN, + "learning_rate": 8.358688928883533e-05, + "loss": 0.0, + "step": 42247 + }, + { + "epoch": 3.9421479891760756, + "grad_norm": NaN, + "learning_rate": 8.358010735821636e-05, + "loss": 0.0, + "step": 42248 + }, + { + "epoch": 3.9422412988709525, + "grad_norm": NaN, + "learning_rate": 8.357332559648347e-05, + "loss": 0.0, + "step": 42249 + }, + { + "epoch": 3.94233460856583, + "grad_norm": NaN, + "learning_rate": 8.356654400365405e-05, + "loss": 0.0, + "step": 42250 + }, + { + "epoch": 3.9424279182607074, + "grad_norm": NaN, + "learning_rate": 8.355976257974516e-05, + "loss": 0.0, + "step": 42251 + }, + { + "epoch": 3.9425212279555844, + "grad_norm": NaN, + "learning_rate": 8.355298132477407e-05, + "loss": 0.0, + "step": 42252 + }, + { + "epoch": 3.9426145376504618, + "grad_norm": NaN, + "learning_rate": 8.354620023875817e-05, + "loss": 0.0, + "step": 42253 + }, + { + "epoch": 3.942707847345339, + "grad_norm": NaN, + "learning_rate": 8.353941932171453e-05, + "loss": 0.0, + "step": 42254 + }, + { + "epoch": 3.9428011570402166, + "grad_norm": NaN, + "learning_rate": 8.353263857366042e-05, + "loss": 0.0, + "step": 42255 + }, + { + "epoch": 3.942894466735094, + "grad_norm": NaN, + "learning_rate": 8.352585799461322e-05, + "loss": 0.0, + "step": 42256 + }, + { + "epoch": 3.942987776429971, + "grad_norm": NaN, + "learning_rate": 8.351907758459004e-05, + "loss": 0.0, + "step": 42257 + }, + { + "epoch": 3.9430810861248484, + "grad_norm": NaN, + "learning_rate": 8.351229734360806e-05, + "loss": 0.0, + "step": 42258 + }, + { + "epoch": 3.9431743958197254, + "grad_norm": NaN, + "learning_rate": 8.350551727168474e-05, + "loss": 0.0, + "step": 42259 + }, + { + "epoch": 3.943267705514603, + "grad_norm": NaN, + "learning_rate": 8.34987373688371e-05, + "loss": 0.0, + "step": 42260 + }, + { + "epoch": 3.9433610152094802, + "grad_norm": NaN, + "learning_rate": 8.34919576350825e-05, + "loss": 0.0, + "step": 42261 + }, + { + "epoch": 3.9434543249043577, + "grad_norm": NaN, + "learning_rate": 8.348517807043821e-05, + "loss": 0.0, + "step": 42262 + }, + { + "epoch": 3.943547634599235, + "grad_norm": NaN, + "learning_rate": 8.347839867492133e-05, + "loss": 0.0, + "step": 42263 + }, + { + "epoch": 3.943640944294112, + "grad_norm": NaN, + "learning_rate": 8.347161944854919e-05, + "loss": 0.0, + "step": 42264 + }, + { + "epoch": 3.9437342539889895, + "grad_norm": NaN, + "learning_rate": 8.34648403913391e-05, + "loss": 0.0, + "step": 42265 + }, + { + "epoch": 3.943827563683867, + "grad_norm": NaN, + "learning_rate": 8.345806150330809e-05, + "loss": 0.0, + "step": 42266 + }, + { + "epoch": 3.943920873378744, + "grad_norm": NaN, + "learning_rate": 8.345128278447357e-05, + "loss": 0.0, + "step": 42267 + }, + { + "epoch": 3.9440141830736213, + "grad_norm": NaN, + "learning_rate": 8.344450423485279e-05, + "loss": 0.0, + "step": 42268 + }, + { + "epoch": 3.9441074927684987, + "grad_norm": NaN, + "learning_rate": 8.34377258544628e-05, + "loss": 0.0, + "step": 42269 + }, + { + "epoch": 3.944200802463376, + "grad_norm": NaN, + "learning_rate": 8.343094764332101e-05, + "loss": 0.0, + "step": 42270 + }, + { + "epoch": 3.944294112158253, + "grad_norm": NaN, + "learning_rate": 8.342416960144468e-05, + "loss": 0.0, + "step": 42271 + }, + { + "epoch": 3.9443874218531305, + "grad_norm": NaN, + "learning_rate": 8.341739172885082e-05, + "loss": 0.0, + "step": 42272 + }, + { + "epoch": 3.944480731548008, + "grad_norm": NaN, + "learning_rate": 8.341061402555688e-05, + "loss": 0.0, + "step": 42273 + }, + { + "epoch": 3.944574041242885, + "grad_norm": NaN, + "learning_rate": 8.340383649158009e-05, + "loss": 0.0, + "step": 42274 + }, + { + "epoch": 3.9446673509377623, + "grad_norm": NaN, + "learning_rate": 8.339705912693749e-05, + "loss": 0.0, + "step": 42275 + }, + { + "epoch": 3.9447606606326397, + "grad_norm": NaN, + "learning_rate": 8.33902819316465e-05, + "loss": 0.0, + "step": 42276 + }, + { + "epoch": 3.944853970327517, + "grad_norm": NaN, + "learning_rate": 8.338350490572435e-05, + "loss": 0.0, + "step": 42277 + }, + { + "epoch": 3.9449472800223946, + "grad_norm": NaN, + "learning_rate": 8.33767280491881e-05, + "loss": 0.0, + "step": 42278 + }, + { + "epoch": 3.9450405897172716, + "grad_norm": NaN, + "learning_rate": 8.336995136205514e-05, + "loss": 0.0, + "step": 42279 + }, + { + "epoch": 3.945133899412149, + "grad_norm": NaN, + "learning_rate": 8.336317484434273e-05, + "loss": 0.0, + "step": 42280 + }, + { + "epoch": 3.945227209107026, + "grad_norm": NaN, + "learning_rate": 8.335639849606792e-05, + "loss": 0.0, + "step": 42281 + }, + { + "epoch": 3.9453205188019034, + "grad_norm": NaN, + "learning_rate": 8.334962231724809e-05, + "loss": 0.0, + "step": 42282 + }, + { + "epoch": 3.945413828496781, + "grad_norm": NaN, + "learning_rate": 8.334284630790048e-05, + "loss": 0.0, + "step": 42283 + }, + { + "epoch": 3.945507138191658, + "grad_norm": NaN, + "learning_rate": 8.333607046804214e-05, + "loss": 0.0, + "step": 42284 + }, + { + "epoch": 3.9456004478865356, + "grad_norm": NaN, + "learning_rate": 8.332929479769053e-05, + "loss": 0.0, + "step": 42285 + }, + { + "epoch": 3.9456937575814126, + "grad_norm": NaN, + "learning_rate": 8.33225192968628e-05, + "loss": 0.0, + "step": 42286 + }, + { + "epoch": 3.94578706727629, + "grad_norm": NaN, + "learning_rate": 8.331574396557602e-05, + "loss": 0.0, + "step": 42287 + }, + { + "epoch": 3.9458803769711674, + "grad_norm": NaN, + "learning_rate": 8.330896880384772e-05, + "loss": 0.0, + "step": 42288 + }, + { + "epoch": 3.9459736866660444, + "grad_norm": NaN, + "learning_rate": 8.330219381169485e-05, + "loss": 0.0, + "step": 42289 + }, + { + "epoch": 3.946066996360922, + "grad_norm": NaN, + "learning_rate": 8.32954189891347e-05, + "loss": 0.0, + "step": 42290 + }, + { + "epoch": 3.9461603060557993, + "grad_norm": NaN, + "learning_rate": 8.328864433618463e-05, + "loss": 0.0, + "step": 42291 + }, + { + "epoch": 3.9462536157506767, + "grad_norm": NaN, + "learning_rate": 8.328186985286176e-05, + "loss": 0.0, + "step": 42292 + }, + { + "epoch": 3.9463469254455537, + "grad_norm": NaN, + "learning_rate": 8.327509553918326e-05, + "loss": 0.0, + "step": 42293 + }, + { + "epoch": 3.946440235140431, + "grad_norm": NaN, + "learning_rate": 8.326832139516655e-05, + "loss": 0.0, + "step": 42294 + }, + { + "epoch": 3.9465335448353085, + "grad_norm": NaN, + "learning_rate": 8.326154742082865e-05, + "loss": 0.0, + "step": 42295 + }, + { + "epoch": 3.9466268545301855, + "grad_norm": NaN, + "learning_rate": 8.325477361618682e-05, + "loss": 0.0, + "step": 42296 + }, + { + "epoch": 3.946720164225063, + "grad_norm": NaN, + "learning_rate": 8.324799998125847e-05, + "loss": 0.0, + "step": 42297 + }, + { + "epoch": 3.9468134739199403, + "grad_norm": NaN, + "learning_rate": 8.324122651606053e-05, + "loss": 0.0, + "step": 42298 + }, + { + "epoch": 3.9469067836148177, + "grad_norm": NaN, + "learning_rate": 8.323445322061045e-05, + "loss": 0.0, + "step": 42299 + }, + { + "epoch": 3.947000093309695, + "grad_norm": NaN, + "learning_rate": 8.322768009492543e-05, + "loss": 0.0, + "step": 42300 + }, + { + "epoch": 3.947093403004572, + "grad_norm": NaN, + "learning_rate": 8.322090713902252e-05, + "loss": 0.0, + "step": 42301 + }, + { + "epoch": 3.9471867126994495, + "grad_norm": NaN, + "learning_rate": 8.321413435291914e-05, + "loss": 0.0, + "step": 42302 + }, + { + "epoch": 3.9472800223943265, + "grad_norm": NaN, + "learning_rate": 8.320736173663246e-05, + "loss": 0.0, + "step": 42303 + }, + { + "epoch": 3.947373332089204, + "grad_norm": NaN, + "learning_rate": 8.320058929017958e-05, + "loss": 0.0, + "step": 42304 + }, + { + "epoch": 3.9474666417840814, + "grad_norm": NaN, + "learning_rate": 8.319381701357787e-05, + "loss": 0.0, + "step": 42305 + }, + { + "epoch": 3.9475599514789588, + "grad_norm": NaN, + "learning_rate": 8.318704490684455e-05, + "loss": 0.0, + "step": 42306 + }, + { + "epoch": 3.947653261173836, + "grad_norm": NaN, + "learning_rate": 8.318027296999666e-05, + "loss": 0.0, + "step": 42307 + }, + { + "epoch": 3.947746570868713, + "grad_norm": NaN, + "learning_rate": 8.317350120305161e-05, + "loss": 0.0, + "step": 42308 + }, + { + "epoch": 3.9478398805635906, + "grad_norm": NaN, + "learning_rate": 8.31667296060266e-05, + "loss": 0.0, + "step": 42309 + }, + { + "epoch": 3.947933190258468, + "grad_norm": NaN, + "learning_rate": 8.31599581789387e-05, + "loss": 0.0, + "step": 42310 + }, + { + "epoch": 3.948026499953345, + "grad_norm": NaN, + "learning_rate": 8.315318692180526e-05, + "loss": 0.0, + "step": 42311 + }, + { + "epoch": 3.9481198096482224, + "grad_norm": NaN, + "learning_rate": 8.314641583464355e-05, + "loss": 0.0, + "step": 42312 + }, + { + "epoch": 3.9482131193431, + "grad_norm": NaN, + "learning_rate": 8.313964491747056e-05, + "loss": 0.0, + "step": 42313 + }, + { + "epoch": 3.9483064290379772, + "grad_norm": NaN, + "learning_rate": 8.313287417030372e-05, + "loss": 0.0, + "step": 42314 + }, + { + "epoch": 3.948399738732854, + "grad_norm": NaN, + "learning_rate": 8.312610359316022e-05, + "loss": 0.0, + "step": 42315 + }, + { + "epoch": 3.9484930484277316, + "grad_norm": NaN, + "learning_rate": 8.311933318605714e-05, + "loss": 0.0, + "step": 42316 + }, + { + "epoch": 3.948586358122609, + "grad_norm": NaN, + "learning_rate": 8.311256294901181e-05, + "loss": 0.0, + "step": 42317 + }, + { + "epoch": 3.948679667817486, + "grad_norm": NaN, + "learning_rate": 8.310579288204149e-05, + "loss": 0.0, + "step": 42318 + }, + { + "epoch": 3.9487729775123634, + "grad_norm": NaN, + "learning_rate": 8.30990229851632e-05, + "loss": 0.0, + "step": 42319 + }, + { + "epoch": 3.948866287207241, + "grad_norm": NaN, + "learning_rate": 8.309225325839436e-05, + "loss": 0.0, + "step": 42320 + }, + { + "epoch": 3.9489595969021183, + "grad_norm": NaN, + "learning_rate": 8.308548370175214e-05, + "loss": 0.0, + "step": 42321 + }, + { + "epoch": 3.9490529065969957, + "grad_norm": NaN, + "learning_rate": 8.30787143152536e-05, + "loss": 0.0, + "step": 42322 + }, + { + "epoch": 3.9491462162918727, + "grad_norm": NaN, + "learning_rate": 8.30719450989161e-05, + "loss": 0.0, + "step": 42323 + }, + { + "epoch": 3.94923952598675, + "grad_norm": NaN, + "learning_rate": 8.30651760527569e-05, + "loss": 0.0, + "step": 42324 + }, + { + "epoch": 3.949332835681627, + "grad_norm": NaN, + "learning_rate": 8.305840717679302e-05, + "loss": 0.0, + "step": 42325 + }, + { + "epoch": 3.9494261453765045, + "grad_norm": NaN, + "learning_rate": 8.305163847104185e-05, + "loss": 0.0, + "step": 42326 + }, + { + "epoch": 3.949519455071382, + "grad_norm": NaN, + "learning_rate": 8.304486993552055e-05, + "loss": 0.0, + "step": 42327 + }, + { + "epoch": 3.9496127647662593, + "grad_norm": NaN, + "learning_rate": 8.303810157024622e-05, + "loss": 0.0, + "step": 42328 + }, + { + "epoch": 3.9497060744611368, + "grad_norm": NaN, + "learning_rate": 8.303133337523623e-05, + "loss": 0.0, + "step": 42329 + }, + { + "epoch": 3.9497993841560137, + "grad_norm": NaN, + "learning_rate": 8.302456535050777e-05, + "loss": 0.0, + "step": 42330 + }, + { + "epoch": 3.949892693850891, + "grad_norm": NaN, + "learning_rate": 8.301779749607786e-05, + "loss": 0.0, + "step": 42331 + }, + { + "epoch": 3.9499860035457686, + "grad_norm": NaN, + "learning_rate": 8.3011029811964e-05, + "loss": 0.0, + "step": 42332 + }, + { + "epoch": 3.9500793132406455, + "grad_norm": NaN, + "learning_rate": 8.300426229818318e-05, + "loss": 0.0, + "step": 42333 + }, + { + "epoch": 3.950172622935523, + "grad_norm": NaN, + "learning_rate": 8.29974949547526e-05, + "loss": 0.0, + "step": 42334 + }, + { + "epoch": 3.9502659326304004, + "grad_norm": NaN, + "learning_rate": 8.299072778168968e-05, + "loss": 0.0, + "step": 42335 + }, + { + "epoch": 3.950359242325278, + "grad_norm": NaN, + "learning_rate": 8.298396077901136e-05, + "loss": 0.0, + "step": 42336 + }, + { + "epoch": 3.9504525520201548, + "grad_norm": NaN, + "learning_rate": 8.297719394673503e-05, + "loss": 0.0, + "step": 42337 + }, + { + "epoch": 3.950545861715032, + "grad_norm": NaN, + "learning_rate": 8.297042728487793e-05, + "loss": 0.0, + "step": 42338 + }, + { + "epoch": 3.9506391714099096, + "grad_norm": NaN, + "learning_rate": 8.296366079345703e-05, + "loss": 0.0, + "step": 42339 + }, + { + "epoch": 3.9507324811047866, + "grad_norm": NaN, + "learning_rate": 8.295689447248977e-05, + "loss": 0.0, + "step": 42340 + }, + { + "epoch": 3.950825790799664, + "grad_norm": NaN, + "learning_rate": 8.29501283219933e-05, + "loss": 0.0, + "step": 42341 + }, + { + "epoch": 3.9509191004945414, + "grad_norm": NaN, + "learning_rate": 8.294336234198469e-05, + "loss": 0.0, + "step": 42342 + }, + { + "epoch": 3.951012410189419, + "grad_norm": NaN, + "learning_rate": 8.293659653248131e-05, + "loss": 0.0, + "step": 42343 + }, + { + "epoch": 3.951105719884296, + "grad_norm": NaN, + "learning_rate": 8.292983089350033e-05, + "loss": 0.0, + "step": 42344 + }, + { + "epoch": 3.9511990295791732, + "grad_norm": NaN, + "learning_rate": 8.292306542505881e-05, + "loss": 0.0, + "step": 42345 + }, + { + "epoch": 3.9512923392740507, + "grad_norm": NaN, + "learning_rate": 8.291630012717413e-05, + "loss": 0.0, + "step": 42346 + }, + { + "epoch": 3.9513856489689276, + "grad_norm": NaN, + "learning_rate": 8.29095349998635e-05, + "loss": 0.0, + "step": 42347 + }, + { + "epoch": 3.951478958663805, + "grad_norm": NaN, + "learning_rate": 8.290277004314391e-05, + "loss": 0.0, + "step": 42348 + }, + { + "epoch": 3.9515722683586825, + "grad_norm": NaN, + "learning_rate": 8.289600525703279e-05, + "loss": 0.0, + "step": 42349 + }, + { + "epoch": 3.95166557805356, + "grad_norm": NaN, + "learning_rate": 8.288924064154731e-05, + "loss": 0.0, + "step": 42350 + }, + { + "epoch": 3.9517588877484373, + "grad_norm": NaN, + "learning_rate": 8.288247619670446e-05, + "loss": 0.0, + "step": 42351 + }, + { + "epoch": 3.9518521974433143, + "grad_norm": NaN, + "learning_rate": 8.287571192252166e-05, + "loss": 0.0, + "step": 42352 + }, + { + "epoch": 3.9519455071381917, + "grad_norm": NaN, + "learning_rate": 8.286894781901612e-05, + "loss": 0.0, + "step": 42353 + }, + { + "epoch": 3.9520388168330687, + "grad_norm": NaN, + "learning_rate": 8.286218388620481e-05, + "loss": 0.0, + "step": 42354 + }, + { + "epoch": 3.952132126527946, + "grad_norm": NaN, + "learning_rate": 8.285542012410516e-05, + "loss": 0.0, + "step": 42355 + }, + { + "epoch": 3.9522254362228235, + "grad_norm": NaN, + "learning_rate": 8.284865653273434e-05, + "loss": 0.0, + "step": 42356 + }, + { + "epoch": 3.952318745917701, + "grad_norm": NaN, + "learning_rate": 8.284189311210938e-05, + "loss": 0.0, + "step": 42357 + }, + { + "epoch": 3.9524120556125784, + "grad_norm": NaN, + "learning_rate": 8.283512986224765e-05, + "loss": 0.0, + "step": 42358 + }, + { + "epoch": 3.9525053653074553, + "grad_norm": NaN, + "learning_rate": 8.282836678316635e-05, + "loss": 0.0, + "step": 42359 + }, + { + "epoch": 3.9525986750023328, + "grad_norm": NaN, + "learning_rate": 8.282160387488248e-05, + "loss": 0.0, + "step": 42360 + }, + { + "epoch": 3.95269198469721, + "grad_norm": NaN, + "learning_rate": 8.281484113741345e-05, + "loss": 0.0, + "step": 42361 + }, + { + "epoch": 3.952785294392087, + "grad_norm": NaN, + "learning_rate": 8.280807857077644e-05, + "loss": 0.0, + "step": 42362 + }, + { + "epoch": 3.9528786040869646, + "grad_norm": NaN, + "learning_rate": 8.280131617498845e-05, + "loss": 0.0, + "step": 42363 + }, + { + "epoch": 3.952971913781842, + "grad_norm": NaN, + "learning_rate": 8.279455395006688e-05, + "loss": 0.0, + "step": 42364 + }, + { + "epoch": 3.9530652234767194, + "grad_norm": NaN, + "learning_rate": 8.27877918960289e-05, + "loss": 0.0, + "step": 42365 + }, + { + "epoch": 3.9531585331715964, + "grad_norm": NaN, + "learning_rate": 8.278103001289152e-05, + "loss": 0.0, + "step": 42366 + }, + { + "epoch": 3.953251842866474, + "grad_norm": NaN, + "learning_rate": 8.277426830067216e-05, + "loss": 0.0, + "step": 42367 + }, + { + "epoch": 3.953345152561351, + "grad_norm": NaN, + "learning_rate": 8.276750675938797e-05, + "loss": 0.0, + "step": 42368 + }, + { + "epoch": 3.953438462256228, + "grad_norm": NaN, + "learning_rate": 8.276074538905598e-05, + "loss": 0.0, + "step": 42369 + }, + { + "epoch": 3.9535317719511056, + "grad_norm": NaN, + "learning_rate": 8.275398418969355e-05, + "loss": 0.0, + "step": 42370 + }, + { + "epoch": 3.953625081645983, + "grad_norm": NaN, + "learning_rate": 8.27472231613178e-05, + "loss": 0.0, + "step": 42371 + }, + { + "epoch": 3.9537183913408604, + "grad_norm": NaN, + "learning_rate": 8.274046230394596e-05, + "loss": 0.0, + "step": 42372 + }, + { + "epoch": 3.953811701035738, + "grad_norm": NaN, + "learning_rate": 8.273370161759522e-05, + "loss": 0.0, + "step": 42373 + }, + { + "epoch": 3.953905010730615, + "grad_norm": NaN, + "learning_rate": 8.272694110228271e-05, + "loss": 0.0, + "step": 42374 + }, + { + "epoch": 3.9539983204254923, + "grad_norm": NaN, + "learning_rate": 8.272018075802569e-05, + "loss": 0.0, + "step": 42375 + }, + { + "epoch": 3.9540916301203692, + "grad_norm": NaN, + "learning_rate": 8.271342058484139e-05, + "loss": 0.0, + "step": 42376 + }, + { + "epoch": 3.9541849398152467, + "grad_norm": NaN, + "learning_rate": 8.270666058274679e-05, + "loss": 0.0, + "step": 42377 + }, + { + "epoch": 3.954278249510124, + "grad_norm": NaN, + "learning_rate": 8.269990075175929e-05, + "loss": 0.0, + "step": 42378 + }, + { + "epoch": 3.9543715592050015, + "grad_norm": NaN, + "learning_rate": 8.269314109189604e-05, + "loss": 0.0, + "step": 42379 + }, + { + "epoch": 3.954464868899879, + "grad_norm": NaN, + "learning_rate": 8.26863816031741e-05, + "loss": 0.0, + "step": 42380 + }, + { + "epoch": 3.954558178594756, + "grad_norm": NaN, + "learning_rate": 8.26796222856108e-05, + "loss": 0.0, + "step": 42381 + }, + { + "epoch": 3.9546514882896333, + "grad_norm": NaN, + "learning_rate": 8.267286313922334e-05, + "loss": 0.0, + "step": 42382 + }, + { + "epoch": 3.9547447979845107, + "grad_norm": NaN, + "learning_rate": 8.266610416402872e-05, + "loss": 0.0, + "step": 42383 + }, + { + "epoch": 3.9548381076793877, + "grad_norm": NaN, + "learning_rate": 8.265934536004434e-05, + "loss": 0.0, + "step": 42384 + }, + { + "epoch": 3.954931417374265, + "grad_norm": NaN, + "learning_rate": 8.265258672728731e-05, + "loss": 0.0, + "step": 42385 + }, + { + "epoch": 3.9550247270691425, + "grad_norm": NaN, + "learning_rate": 8.264582826577471e-05, + "loss": 0.0, + "step": 42386 + }, + { + "epoch": 3.95511803676402, + "grad_norm": NaN, + "learning_rate": 8.263906997552389e-05, + "loss": 0.0, + "step": 42387 + }, + { + "epoch": 3.955211346458897, + "grad_norm": NaN, + "learning_rate": 8.263231185655202e-05, + "loss": 0.0, + "step": 42388 + }, + { + "epoch": 3.9553046561537744, + "grad_norm": NaN, + "learning_rate": 8.262555390887609e-05, + "loss": 0.0, + "step": 42389 + }, + { + "epoch": 3.9553979658486518, + "grad_norm": NaN, + "learning_rate": 8.261879613251346e-05, + "loss": 0.0, + "step": 42390 + }, + { + "epoch": 3.9554912755435288, + "grad_norm": NaN, + "learning_rate": 8.261203852748135e-05, + "loss": 0.0, + "step": 42391 + }, + { + "epoch": 3.955584585238406, + "grad_norm": NaN, + "learning_rate": 8.260528109379676e-05, + "loss": 0.0, + "step": 42392 + }, + { + "epoch": 3.9556778949332836, + "grad_norm": NaN, + "learning_rate": 8.2598523831477e-05, + "loss": 0.0, + "step": 42393 + }, + { + "epoch": 3.955771204628161, + "grad_norm": NaN, + "learning_rate": 8.259176674053933e-05, + "loss": 0.0, + "step": 42394 + }, + { + "epoch": 3.9558645143230384, + "grad_norm": NaN, + "learning_rate": 8.258500982100068e-05, + "loss": 0.0, + "step": 42395 + }, + { + "epoch": 3.9559578240179154, + "grad_norm": NaN, + "learning_rate": 8.257825307287847e-05, + "loss": 0.0, + "step": 42396 + }, + { + "epoch": 3.956051133712793, + "grad_norm": NaN, + "learning_rate": 8.257149649618983e-05, + "loss": 0.0, + "step": 42397 + }, + { + "epoch": 3.95614444340767, + "grad_norm": NaN, + "learning_rate": 8.256474009095177e-05, + "loss": 0.0, + "step": 42398 + }, + { + "epoch": 3.956237753102547, + "grad_norm": NaN, + "learning_rate": 8.255798385718167e-05, + "loss": 0.0, + "step": 42399 + }, + { + "epoch": 3.9563310627974246, + "grad_norm": NaN, + "learning_rate": 8.255122779489669e-05, + "loss": 0.0, + "step": 42400 + }, + { + "epoch": 3.956424372492302, + "grad_norm": NaN, + "learning_rate": 8.254447190411386e-05, + "loss": 0.0, + "step": 42401 + }, + { + "epoch": 3.9565176821871795, + "grad_norm": NaN, + "learning_rate": 8.25377161848505e-05, + "loss": 0.0, + "step": 42402 + }, + { + "epoch": 3.9566109918820564, + "grad_norm": NaN, + "learning_rate": 8.253096063712381e-05, + "loss": 0.0, + "step": 42403 + }, + { + "epoch": 3.956704301576934, + "grad_norm": NaN, + "learning_rate": 8.25242052609508e-05, + "loss": 0.0, + "step": 42404 + }, + { + "epoch": 3.9567976112718113, + "grad_norm": NaN, + "learning_rate": 8.251745005634877e-05, + "loss": 0.0, + "step": 42405 + }, + { + "epoch": 3.9568909209666883, + "grad_norm": NaN, + "learning_rate": 8.251069502333495e-05, + "loss": 0.0, + "step": 42406 + }, + { + "epoch": 3.9569842306615657, + "grad_norm": NaN, + "learning_rate": 8.250394016192633e-05, + "loss": 0.0, + "step": 42407 + }, + { + "epoch": 3.957077540356443, + "grad_norm": NaN, + "learning_rate": 8.249718547214026e-05, + "loss": 0.0, + "step": 42408 + }, + { + "epoch": 3.9571708500513205, + "grad_norm": NaN, + "learning_rate": 8.249043095399382e-05, + "loss": 0.0, + "step": 42409 + }, + { + "epoch": 3.9572641597461975, + "grad_norm": NaN, + "learning_rate": 8.248367660750424e-05, + "loss": 0.0, + "step": 42410 + }, + { + "epoch": 3.957357469441075, + "grad_norm": NaN, + "learning_rate": 8.247692243268868e-05, + "loss": 0.0, + "step": 42411 + }, + { + "epoch": 3.9574507791359523, + "grad_norm": NaN, + "learning_rate": 8.24701684295643e-05, + "loss": 0.0, + "step": 42412 + }, + { + "epoch": 3.9575440888308293, + "grad_norm": NaN, + "learning_rate": 8.246341459814827e-05, + "loss": 0.0, + "step": 42413 + }, + { + "epoch": 3.9576373985257067, + "grad_norm": NaN, + "learning_rate": 8.245666093845776e-05, + "loss": 0.0, + "step": 42414 + }, + { + "epoch": 3.957730708220584, + "grad_norm": NaN, + "learning_rate": 8.244990745050997e-05, + "loss": 0.0, + "step": 42415 + }, + { + "epoch": 3.9578240179154616, + "grad_norm": NaN, + "learning_rate": 8.244315413432204e-05, + "loss": 0.0, + "step": 42416 + }, + { + "epoch": 3.957917327610339, + "grad_norm": NaN, + "learning_rate": 8.243640098991118e-05, + "loss": 0.0, + "step": 42417 + }, + { + "epoch": 3.958010637305216, + "grad_norm": NaN, + "learning_rate": 8.242964801729452e-05, + "loss": 0.0, + "step": 42418 + }, + { + "epoch": 3.9581039470000934, + "grad_norm": NaN, + "learning_rate": 8.242289521648925e-05, + "loss": 0.0, + "step": 42419 + }, + { + "epoch": 3.9581972566949704, + "grad_norm": NaN, + "learning_rate": 8.241614258751262e-05, + "loss": 0.0, + "step": 42420 + }, + { + "epoch": 3.9582905663898478, + "grad_norm": NaN, + "learning_rate": 8.240939013038158e-05, + "loss": 0.0, + "step": 42421 + }, + { + "epoch": 3.958383876084725, + "grad_norm": NaN, + "learning_rate": 8.240263784511351e-05, + "loss": 0.0, + "step": 42422 + }, + { + "epoch": 3.9584771857796026, + "grad_norm": NaN, + "learning_rate": 8.239588573172554e-05, + "loss": 0.0, + "step": 42423 + }, + { + "epoch": 3.95857049547448, + "grad_norm": NaN, + "learning_rate": 8.238913379023472e-05, + "loss": 0.0, + "step": 42424 + }, + { + "epoch": 3.958663805169357, + "grad_norm": NaN, + "learning_rate": 8.238238202065836e-05, + "loss": 0.0, + "step": 42425 + }, + { + "epoch": 3.9587571148642344, + "grad_norm": NaN, + "learning_rate": 8.237563042301362e-05, + "loss": 0.0, + "step": 42426 + }, + { + "epoch": 3.958850424559112, + "grad_norm": NaN, + "learning_rate": 8.23688789973175e-05, + "loss": 0.0, + "step": 42427 + }, + { + "epoch": 3.958943734253989, + "grad_norm": NaN, + "learning_rate": 8.236212774358736e-05, + "loss": 0.0, + "step": 42428 + }, + { + "epoch": 3.9590370439488662, + "grad_norm": NaN, + "learning_rate": 8.235537666184036e-05, + "loss": 0.0, + "step": 42429 + }, + { + "epoch": 3.9591303536437437, + "grad_norm": NaN, + "learning_rate": 8.234862575209344e-05, + "loss": 0.0, + "step": 42430 + }, + { + "epoch": 3.959223663338621, + "grad_norm": NaN, + "learning_rate": 8.234187501436403e-05, + "loss": 0.0, + "step": 42431 + }, + { + "epoch": 3.959316973033498, + "grad_norm": NaN, + "learning_rate": 8.233512444866921e-05, + "loss": 0.0, + "step": 42432 + }, + { + "epoch": 3.9594102827283755, + "grad_norm": NaN, + "learning_rate": 8.232837405502604e-05, + "loss": 0.0, + "step": 42433 + }, + { + "epoch": 3.959503592423253, + "grad_norm": NaN, + "learning_rate": 8.232162383345183e-05, + "loss": 0.0, + "step": 42434 + }, + { + "epoch": 3.95959690211813, + "grad_norm": NaN, + "learning_rate": 8.231487378396372e-05, + "loss": 0.0, + "step": 42435 + }, + { + "epoch": 3.9596902118130073, + "grad_norm": NaN, + "learning_rate": 8.230812390657872e-05, + "loss": 0.0, + "step": 42436 + }, + { + "epoch": 3.9597835215078847, + "grad_norm": NaN, + "learning_rate": 8.230137420131416e-05, + "loss": 0.0, + "step": 42437 + }, + { + "epoch": 3.959876831202762, + "grad_norm": NaN, + "learning_rate": 8.229462466818722e-05, + "loss": 0.0, + "step": 42438 + }, + { + "epoch": 3.959970140897639, + "grad_norm": NaN, + "learning_rate": 8.22878753072149e-05, + "loss": 0.0, + "step": 42439 + }, + { + "epoch": 3.9600634505925165, + "grad_norm": NaN, + "learning_rate": 8.228112611841448e-05, + "loss": 0.0, + "step": 42440 + }, + { + "epoch": 3.960156760287394, + "grad_norm": NaN, + "learning_rate": 8.227437710180317e-05, + "loss": 0.0, + "step": 42441 + }, + { + "epoch": 3.960250069982271, + "grad_norm": NaN, + "learning_rate": 8.226762825739795e-05, + "loss": 0.0, + "step": 42442 + }, + { + "epoch": 3.9603433796771483, + "grad_norm": NaN, + "learning_rate": 8.226087958521612e-05, + "loss": 0.0, + "step": 42443 + }, + { + "epoch": 3.9604366893720258, + "grad_norm": NaN, + "learning_rate": 8.22541310852749e-05, + "loss": 0.0, + "step": 42444 + }, + { + "epoch": 3.960529999066903, + "grad_norm": NaN, + "learning_rate": 8.22473827575912e-05, + "loss": 0.0, + "step": 42445 + }, + { + "epoch": 3.9606233087617806, + "grad_norm": NaN, + "learning_rate": 8.224063460218243e-05, + "loss": 0.0, + "step": 42446 + }, + { + "epoch": 3.9607166184566576, + "grad_norm": NaN, + "learning_rate": 8.223388661906562e-05, + "loss": 0.0, + "step": 42447 + }, + { + "epoch": 3.960809928151535, + "grad_norm": NaN, + "learning_rate": 8.2227138808258e-05, + "loss": 0.0, + "step": 42448 + }, + { + "epoch": 3.9609032378464124, + "grad_norm": NaN, + "learning_rate": 8.222039116977667e-05, + "loss": 0.0, + "step": 42449 + }, + { + "epoch": 3.9609965475412894, + "grad_norm": NaN, + "learning_rate": 8.22136437036388e-05, + "loss": 0.0, + "step": 42450 + }, + { + "epoch": 3.961089857236167, + "grad_norm": NaN, + "learning_rate": 8.220689640986157e-05, + "loss": 0.0, + "step": 42451 + }, + { + "epoch": 3.961183166931044, + "grad_norm": NaN, + "learning_rate": 8.220014928846212e-05, + "loss": 0.0, + "step": 42452 + }, + { + "epoch": 3.9612764766259216, + "grad_norm": NaN, + "learning_rate": 8.21934023394576e-05, + "loss": 0.0, + "step": 42453 + }, + { + "epoch": 3.9613697863207986, + "grad_norm": NaN, + "learning_rate": 8.218665556286517e-05, + "loss": 0.0, + "step": 42454 + }, + { + "epoch": 3.961463096015676, + "grad_norm": NaN, + "learning_rate": 8.217990895870198e-05, + "loss": 0.0, + "step": 42455 + }, + { + "epoch": 3.9615564057105535, + "grad_norm": NaN, + "learning_rate": 8.217316252698521e-05, + "loss": 0.0, + "step": 42456 + }, + { + "epoch": 3.9616497154054304, + "grad_norm": NaN, + "learning_rate": 8.2166416267732e-05, + "loss": 0.0, + "step": 42457 + }, + { + "epoch": 3.961743025100308, + "grad_norm": NaN, + "learning_rate": 8.215967018095947e-05, + "loss": 0.0, + "step": 42458 + }, + { + "epoch": 3.9618363347951853, + "grad_norm": NaN, + "learning_rate": 8.215292426668482e-05, + "loss": 0.0, + "step": 42459 + }, + { + "epoch": 3.9619296444900627, + "grad_norm": NaN, + "learning_rate": 8.214617852492519e-05, + "loss": 0.0, + "step": 42460 + }, + { + "epoch": 3.9620229541849397, + "grad_norm": NaN, + "learning_rate": 8.213943295569772e-05, + "loss": 0.0, + "step": 42461 + }, + { + "epoch": 3.962116263879817, + "grad_norm": NaN, + "learning_rate": 8.213268755901957e-05, + "loss": 0.0, + "step": 42462 + }, + { + "epoch": 3.9622095735746945, + "grad_norm": NaN, + "learning_rate": 8.212594233490792e-05, + "loss": 0.0, + "step": 42463 + }, + { + "epoch": 3.9623028832695715, + "grad_norm": NaN, + "learning_rate": 8.211919728337985e-05, + "loss": 0.0, + "step": 42464 + }, + { + "epoch": 3.962396192964449, + "grad_norm": NaN, + "learning_rate": 8.211245240445257e-05, + "loss": 0.0, + "step": 42465 + }, + { + "epoch": 3.9624895026593263, + "grad_norm": NaN, + "learning_rate": 8.210570769814323e-05, + "loss": 0.0, + "step": 42466 + }, + { + "epoch": 3.9625828123542037, + "grad_norm": NaN, + "learning_rate": 8.2098963164469e-05, + "loss": 0.0, + "step": 42467 + }, + { + "epoch": 3.962676122049081, + "grad_norm": NaN, + "learning_rate": 8.209221880344689e-05, + "loss": 0.0, + "step": 42468 + }, + { + "epoch": 3.962769431743958, + "grad_norm": NaN, + "learning_rate": 8.20854746150942e-05, + "loss": 0.0, + "step": 42469 + }, + { + "epoch": 3.9628627414388355, + "grad_norm": NaN, + "learning_rate": 8.207873059942809e-05, + "loss": 0.0, + "step": 42470 + }, + { + "epoch": 3.9629560511337125, + "grad_norm": NaN, + "learning_rate": 8.207198675646553e-05, + "loss": 0.0, + "step": 42471 + }, + { + "epoch": 3.96304936082859, + "grad_norm": NaN, + "learning_rate": 8.206524308622386e-05, + "loss": 0.0, + "step": 42472 + }, + { + "epoch": 3.9631426705234674, + "grad_norm": NaN, + "learning_rate": 8.20584995887202e-05, + "loss": 0.0, + "step": 42473 + }, + { + "epoch": 3.963235980218345, + "grad_norm": NaN, + "learning_rate": 8.205175626397154e-05, + "loss": 0.0, + "step": 42474 + }, + { + "epoch": 3.963329289913222, + "grad_norm": NaN, + "learning_rate": 8.204501311199518e-05, + "loss": 0.0, + "step": 42475 + }, + { + "epoch": 3.963422599608099, + "grad_norm": NaN, + "learning_rate": 8.20382701328083e-05, + "loss": 0.0, + "step": 42476 + }, + { + "epoch": 3.9635159093029766, + "grad_norm": NaN, + "learning_rate": 8.203152732642782e-05, + "loss": 0.0, + "step": 42477 + }, + { + "epoch": 3.963609218997854, + "grad_norm": NaN, + "learning_rate": 8.20247846928711e-05, + "loss": 0.0, + "step": 42478 + }, + { + "epoch": 3.963702528692731, + "grad_norm": NaN, + "learning_rate": 8.20180422321553e-05, + "loss": 0.0, + "step": 42479 + }, + { + "epoch": 3.9637958383876084, + "grad_norm": NaN, + "learning_rate": 8.201129994429734e-05, + "loss": 0.0, + "step": 42480 + }, + { + "epoch": 3.963889148082486, + "grad_norm": NaN, + "learning_rate": 8.200455782931457e-05, + "loss": 0.0, + "step": 42481 + }, + { + "epoch": 3.9639824577773632, + "grad_norm": NaN, + "learning_rate": 8.199781588722414e-05, + "loss": 0.0, + "step": 42482 + }, + { + "epoch": 3.96407576747224, + "grad_norm": NaN, + "learning_rate": 8.199107411804298e-05, + "loss": 0.0, + "step": 42483 + }, + { + "epoch": 3.9641690771671176, + "grad_norm": NaN, + "learning_rate": 8.198433252178846e-05, + "loss": 0.0, + "step": 42484 + }, + { + "epoch": 3.964262386861995, + "grad_norm": NaN, + "learning_rate": 8.19775910984776e-05, + "loss": 0.0, + "step": 42485 + }, + { + "epoch": 3.964355696556872, + "grad_norm": NaN, + "learning_rate": 8.19708498481276e-05, + "loss": 0.0, + "step": 42486 + }, + { + "epoch": 3.9644490062517495, + "grad_norm": NaN, + "learning_rate": 8.196410877075555e-05, + "loss": 0.0, + "step": 42487 + }, + { + "epoch": 3.964542315946627, + "grad_norm": NaN, + "learning_rate": 8.195736786637866e-05, + "loss": 0.0, + "step": 42488 + }, + { + "epoch": 3.9646356256415043, + "grad_norm": NaN, + "learning_rate": 8.195062713501399e-05, + "loss": 0.0, + "step": 42489 + }, + { + "epoch": 3.9647289353363817, + "grad_norm": NaN, + "learning_rate": 8.194388657667875e-05, + "loss": 0.0, + "step": 42490 + }, + { + "epoch": 3.9648222450312587, + "grad_norm": NaN, + "learning_rate": 8.193714619139002e-05, + "loss": 0.0, + "step": 42491 + }, + { + "epoch": 3.964915554726136, + "grad_norm": NaN, + "learning_rate": 8.193040597916498e-05, + "loss": 0.0, + "step": 42492 + }, + { + "epoch": 3.965008864421013, + "grad_norm": NaN, + "learning_rate": 8.192366594002075e-05, + "loss": 0.0, + "step": 42493 + }, + { + "epoch": 3.9651021741158905, + "grad_norm": NaN, + "learning_rate": 8.191692607397447e-05, + "loss": 0.0, + "step": 42494 + }, + { + "epoch": 3.965195483810768, + "grad_norm": NaN, + "learning_rate": 8.191018638104328e-05, + "loss": 0.0, + "step": 42495 + }, + { + "epoch": 3.9652887935056453, + "grad_norm": NaN, + "learning_rate": 8.190344686124433e-05, + "loss": 0.0, + "step": 42496 + }, + { + "epoch": 3.9653821032005228, + "grad_norm": NaN, + "learning_rate": 8.189670751459471e-05, + "loss": 0.0, + "step": 42497 + }, + { + "epoch": 3.9654754128953997, + "grad_norm": NaN, + "learning_rate": 8.188996834111162e-05, + "loss": 0.0, + "step": 42498 + }, + { + "epoch": 3.965568722590277, + "grad_norm": NaN, + "learning_rate": 8.188322934081214e-05, + "loss": 0.0, + "step": 42499 + }, + { + "epoch": 3.9656620322851546, + "grad_norm": NaN, + "learning_rate": 8.187649051371346e-05, + "loss": 0.0, + "step": 42500 + }, + { + "epoch": 3.9657553419800315, + "grad_norm": NaN, + "learning_rate": 8.186975185983265e-05, + "loss": 0.0, + "step": 42501 + }, + { + "epoch": 3.965848651674909, + "grad_norm": NaN, + "learning_rate": 8.186301337918692e-05, + "loss": 0.0, + "step": 42502 + }, + { + "epoch": 3.9659419613697864, + "grad_norm": NaN, + "learning_rate": 8.185627507179332e-05, + "loss": 0.0, + "step": 42503 + }, + { + "epoch": 3.966035271064664, + "grad_norm": NaN, + "learning_rate": 8.184953693766906e-05, + "loss": 0.0, + "step": 42504 + }, + { + "epoch": 3.966128580759541, + "grad_norm": NaN, + "learning_rate": 8.184279897683124e-05, + "loss": 0.0, + "step": 42505 + }, + { + "epoch": 3.966221890454418, + "grad_norm": NaN, + "learning_rate": 8.183606118929698e-05, + "loss": 0.0, + "step": 42506 + }, + { + "epoch": 3.9663152001492956, + "grad_norm": NaN, + "learning_rate": 8.182932357508341e-05, + "loss": 0.0, + "step": 42507 + }, + { + "epoch": 3.9664085098441726, + "grad_norm": NaN, + "learning_rate": 8.182258613420769e-05, + "loss": 0.0, + "step": 42508 + }, + { + "epoch": 3.96650181953905, + "grad_norm": NaN, + "learning_rate": 8.181584886668695e-05, + "loss": 0.0, + "step": 42509 + }, + { + "epoch": 3.9665951292339274, + "grad_norm": NaN, + "learning_rate": 8.180911177253829e-05, + "loss": 0.0, + "step": 42510 + }, + { + "epoch": 3.966688438928805, + "grad_norm": NaN, + "learning_rate": 8.180237485177894e-05, + "loss": 0.0, + "step": 42511 + }, + { + "epoch": 3.9667817486236823, + "grad_norm": NaN, + "learning_rate": 8.179563810442581e-05, + "loss": 0.0, + "step": 42512 + }, + { + "epoch": 3.9668750583185592, + "grad_norm": NaN, + "learning_rate": 8.178890153049624e-05, + "loss": 0.0, + "step": 42513 + }, + { + "epoch": 3.9669683680134367, + "grad_norm": NaN, + "learning_rate": 8.178216513000732e-05, + "loss": 0.0, + "step": 42514 + }, + { + "epoch": 3.9670616777083136, + "grad_norm": NaN, + "learning_rate": 8.177542890297606e-05, + "loss": 0.0, + "step": 42515 + }, + { + "epoch": 3.967154987403191, + "grad_norm": NaN, + "learning_rate": 8.176869284941973e-05, + "loss": 0.0, + "step": 42516 + }, + { + "epoch": 3.9672482970980685, + "grad_norm": NaN, + "learning_rate": 8.176195696935545e-05, + "loss": 0.0, + "step": 42517 + }, + { + "epoch": 3.967341606792946, + "grad_norm": NaN, + "learning_rate": 8.17552212628002e-05, + "loss": 0.0, + "step": 42518 + }, + { + "epoch": 3.9674349164878233, + "grad_norm": NaN, + "learning_rate": 8.174848572977124e-05, + "loss": 0.0, + "step": 42519 + }, + { + "epoch": 3.9675282261827003, + "grad_norm": NaN, + "learning_rate": 8.174175037028575e-05, + "loss": 0.0, + "step": 42520 + }, + { + "epoch": 3.9676215358775777, + "grad_norm": NaN, + "learning_rate": 8.173501518436062e-05, + "loss": 0.0, + "step": 42521 + }, + { + "epoch": 3.967714845572455, + "grad_norm": NaN, + "learning_rate": 8.172828017201321e-05, + "loss": 0.0, + "step": 42522 + }, + { + "epoch": 3.967808155267332, + "grad_norm": NaN, + "learning_rate": 8.172154533326054e-05, + "loss": 0.0, + "step": 42523 + }, + { + "epoch": 3.9679014649622095, + "grad_norm": NaN, + "learning_rate": 8.171481066811977e-05, + "loss": 0.0, + "step": 42524 + }, + { + "epoch": 3.967994774657087, + "grad_norm": NaN, + "learning_rate": 8.170807617660802e-05, + "loss": 0.0, + "step": 42525 + }, + { + "epoch": 3.9680880843519644, + "grad_norm": NaN, + "learning_rate": 8.170134185874237e-05, + "loss": 0.0, + "step": 42526 + }, + { + "epoch": 3.9681813940468413, + "grad_norm": NaN, + "learning_rate": 8.169460771454001e-05, + "loss": 0.0, + "step": 42527 + }, + { + "epoch": 3.9682747037417188, + "grad_norm": NaN, + "learning_rate": 8.168787374401801e-05, + "loss": 0.0, + "step": 42528 + }, + { + "epoch": 3.968368013436596, + "grad_norm": NaN, + "learning_rate": 8.16811399471935e-05, + "loss": 0.0, + "step": 42529 + }, + { + "epoch": 3.968461323131473, + "grad_norm": NaN, + "learning_rate": 8.167440632408362e-05, + "loss": 0.0, + "step": 42530 + }, + { + "epoch": 3.9685546328263506, + "grad_norm": NaN, + "learning_rate": 8.166767287470551e-05, + "loss": 0.0, + "step": 42531 + }, + { + "epoch": 3.968647942521228, + "grad_norm": NaN, + "learning_rate": 8.166093959907624e-05, + "loss": 0.0, + "step": 42532 + }, + { + "epoch": 3.9687412522161054, + "grad_norm": NaN, + "learning_rate": 8.165420649721298e-05, + "loss": 0.0, + "step": 42533 + }, + { + "epoch": 3.968834561910983, + "grad_norm": NaN, + "learning_rate": 8.16474735691328e-05, + "loss": 0.0, + "step": 42534 + }, + { + "epoch": 3.96892787160586, + "grad_norm": NaN, + "learning_rate": 8.164074081485286e-05, + "loss": 0.0, + "step": 42535 + }, + { + "epoch": 3.9690211813007372, + "grad_norm": NaN, + "learning_rate": 8.163400823439026e-05, + "loss": 0.0, + "step": 42536 + }, + { + "epoch": 3.969114490995614, + "grad_norm": NaN, + "learning_rate": 8.162727582776214e-05, + "loss": 0.0, + "step": 42537 + }, + { + "epoch": 3.9692078006904916, + "grad_norm": NaN, + "learning_rate": 8.162054359498559e-05, + "loss": 0.0, + "step": 42538 + }, + { + "epoch": 3.969301110385369, + "grad_norm": NaN, + "learning_rate": 8.161381153607775e-05, + "loss": 0.0, + "step": 42539 + }, + { + "epoch": 3.9693944200802465, + "grad_norm": NaN, + "learning_rate": 8.160707965105572e-05, + "loss": 0.0, + "step": 42540 + }, + { + "epoch": 3.969487729775124, + "grad_norm": NaN, + "learning_rate": 8.160034793993666e-05, + "loss": 0.0, + "step": 42541 + }, + { + "epoch": 3.969581039470001, + "grad_norm": NaN, + "learning_rate": 8.159361640273763e-05, + "loss": 0.0, + "step": 42542 + }, + { + "epoch": 3.9696743491648783, + "grad_norm": NaN, + "learning_rate": 8.158688503947576e-05, + "loss": 0.0, + "step": 42543 + }, + { + "epoch": 3.9697676588597557, + "grad_norm": NaN, + "learning_rate": 8.158015385016818e-05, + "loss": 0.0, + "step": 42544 + }, + { + "epoch": 3.9698609685546327, + "grad_norm": NaN, + "learning_rate": 8.157342283483201e-05, + "loss": 0.0, + "step": 42545 + }, + { + "epoch": 3.96995427824951, + "grad_norm": NaN, + "learning_rate": 8.156669199348436e-05, + "loss": 0.0, + "step": 42546 + }, + { + "epoch": 3.9700475879443875, + "grad_norm": NaN, + "learning_rate": 8.155996132614232e-05, + "loss": 0.0, + "step": 42547 + }, + { + "epoch": 3.970140897639265, + "grad_norm": NaN, + "learning_rate": 8.155323083282304e-05, + "loss": 0.0, + "step": 42548 + }, + { + "epoch": 3.970234207334142, + "grad_norm": NaN, + "learning_rate": 8.154650051354362e-05, + "loss": 0.0, + "step": 42549 + }, + { + "epoch": 3.9703275170290193, + "grad_norm": NaN, + "learning_rate": 8.153977036832117e-05, + "loss": 0.0, + "step": 42550 + }, + { + "epoch": 3.9704208267238967, + "grad_norm": NaN, + "learning_rate": 8.15330403971728e-05, + "loss": 0.0, + "step": 42551 + }, + { + "epoch": 3.9705141364187737, + "grad_norm": NaN, + "learning_rate": 8.152631060011565e-05, + "loss": 0.0, + "step": 42552 + }, + { + "epoch": 3.970607446113651, + "grad_norm": NaN, + "learning_rate": 8.151958097716678e-05, + "loss": 0.0, + "step": 42553 + }, + { + "epoch": 3.9707007558085285, + "grad_norm": NaN, + "learning_rate": 8.151285152834332e-05, + "loss": 0.0, + "step": 42554 + }, + { + "epoch": 3.970794065503406, + "grad_norm": NaN, + "learning_rate": 8.150612225366248e-05, + "loss": 0.0, + "step": 42555 + }, + { + "epoch": 3.970887375198283, + "grad_norm": NaN, + "learning_rate": 8.149939315314115e-05, + "loss": 0.0, + "step": 42556 + }, + { + "epoch": 3.9709806848931604, + "grad_norm": NaN, + "learning_rate": 8.149266422679664e-05, + "loss": 0.0, + "step": 42557 + }, + { + "epoch": 3.971073994588038, + "grad_norm": NaN, + "learning_rate": 8.148593547464604e-05, + "loss": 0.0, + "step": 42558 + }, + { + "epoch": 3.9711673042829148, + "grad_norm": NaN, + "learning_rate": 8.147920689670631e-05, + "loss": 0.0, + "step": 42559 + }, + { + "epoch": 3.971260613977792, + "grad_norm": NaN, + "learning_rate": 8.147247849299469e-05, + "loss": 0.0, + "step": 42560 + }, + { + "epoch": 3.9713539236726696, + "grad_norm": NaN, + "learning_rate": 8.146575026352829e-05, + "loss": 0.0, + "step": 42561 + }, + { + "epoch": 3.971447233367547, + "grad_norm": NaN, + "learning_rate": 8.145902220832415e-05, + "loss": 0.0, + "step": 42562 + }, + { + "epoch": 3.9715405430624244, + "grad_norm": NaN, + "learning_rate": 8.145229432739945e-05, + "loss": 0.0, + "step": 42563 + }, + { + "epoch": 3.9716338527573014, + "grad_norm": NaN, + "learning_rate": 8.144556662077122e-05, + "loss": 0.0, + "step": 42564 + }, + { + "epoch": 3.971727162452179, + "grad_norm": NaN, + "learning_rate": 8.143883908845664e-05, + "loss": 0.0, + "step": 42565 + }, + { + "epoch": 3.971820472147056, + "grad_norm": NaN, + "learning_rate": 8.143211173047278e-05, + "loss": 0.0, + "step": 42566 + }, + { + "epoch": 3.971913781841933, + "grad_norm": NaN, + "learning_rate": 8.142538454683673e-05, + "loss": 0.0, + "step": 42567 + }, + { + "epoch": 3.9720070915368106, + "grad_norm": NaN, + "learning_rate": 8.141865753756564e-05, + "loss": 0.0, + "step": 42568 + }, + { + "epoch": 3.972100401231688, + "grad_norm": NaN, + "learning_rate": 8.141193070267656e-05, + "loss": 0.0, + "step": 42569 + }, + { + "epoch": 3.9721937109265655, + "grad_norm": NaN, + "learning_rate": 8.140520404218663e-05, + "loss": 0.0, + "step": 42570 + }, + { + "epoch": 3.9722870206214425, + "grad_norm": NaN, + "learning_rate": 8.139847755611296e-05, + "loss": 0.0, + "step": 42571 + }, + { + "epoch": 3.97238033031632, + "grad_norm": NaN, + "learning_rate": 8.139175124447263e-05, + "loss": 0.0, + "step": 42572 + }, + { + "epoch": 3.9724736400111973, + "grad_norm": NaN, + "learning_rate": 8.138502510728276e-05, + "loss": 0.0, + "step": 42573 + }, + { + "epoch": 3.9725669497060743, + "grad_norm": NaN, + "learning_rate": 8.137829914456043e-05, + "loss": 0.0, + "step": 42574 + }, + { + "epoch": 3.9726602594009517, + "grad_norm": NaN, + "learning_rate": 8.137157335632278e-05, + "loss": 0.0, + "step": 42575 + }, + { + "epoch": 3.972753569095829, + "grad_norm": NaN, + "learning_rate": 8.136484774258688e-05, + "loss": 0.0, + "step": 42576 + }, + { + "epoch": 3.9728468787907065, + "grad_norm": NaN, + "learning_rate": 8.135812230336984e-05, + "loss": 0.0, + "step": 42577 + }, + { + "epoch": 3.9729401884855835, + "grad_norm": NaN, + "learning_rate": 8.135139703868877e-05, + "loss": 0.0, + "step": 42578 + }, + { + "epoch": 3.973033498180461, + "grad_norm": NaN, + "learning_rate": 8.134467194856074e-05, + "loss": 0.0, + "step": 42579 + }, + { + "epoch": 3.9731268078753383, + "grad_norm": NaN, + "learning_rate": 8.133794703300289e-05, + "loss": 0.0, + "step": 42580 + }, + { + "epoch": 3.9732201175702153, + "grad_norm": NaN, + "learning_rate": 8.133122229203231e-05, + "loss": 0.0, + "step": 42581 + }, + { + "epoch": 3.9733134272650927, + "grad_norm": NaN, + "learning_rate": 8.132449772566607e-05, + "loss": 0.0, + "step": 42582 + }, + { + "epoch": 3.97340673695997, + "grad_norm": NaN, + "learning_rate": 8.131777333392131e-05, + "loss": 0.0, + "step": 42583 + }, + { + "epoch": 3.9735000466548476, + "grad_norm": NaN, + "learning_rate": 8.131104911681509e-05, + "loss": 0.0, + "step": 42584 + }, + { + "epoch": 3.973593356349725, + "grad_norm": NaN, + "learning_rate": 8.130432507436451e-05, + "loss": 0.0, + "step": 42585 + }, + { + "epoch": 3.973686666044602, + "grad_norm": NaN, + "learning_rate": 8.129760120658673e-05, + "loss": 0.0, + "step": 42586 + }, + { + "epoch": 3.9737799757394794, + "grad_norm": NaN, + "learning_rate": 8.129087751349877e-05, + "loss": 0.0, + "step": 42587 + }, + { + "epoch": 3.9738732854343564, + "grad_norm": NaN, + "learning_rate": 8.128415399511775e-05, + "loss": 0.0, + "step": 42588 + }, + { + "epoch": 3.973966595129234, + "grad_norm": NaN, + "learning_rate": 8.127743065146077e-05, + "loss": 0.0, + "step": 42589 + }, + { + "epoch": 3.974059904824111, + "grad_norm": NaN, + "learning_rate": 8.127070748254495e-05, + "loss": 0.0, + "step": 42590 + }, + { + "epoch": 3.9741532145189886, + "grad_norm": NaN, + "learning_rate": 8.126398448838733e-05, + "loss": 0.0, + "step": 42591 + }, + { + "epoch": 3.974246524213866, + "grad_norm": NaN, + "learning_rate": 8.125726166900505e-05, + "loss": 0.0, + "step": 42592 + }, + { + "epoch": 3.974339833908743, + "grad_norm": NaN, + "learning_rate": 8.12505390244152e-05, + "loss": 0.0, + "step": 42593 + }, + { + "epoch": 3.9744331436036204, + "grad_norm": NaN, + "learning_rate": 8.124381655463487e-05, + "loss": 0.0, + "step": 42594 + }, + { + "epoch": 3.974526453298498, + "grad_norm": NaN, + "learning_rate": 8.123709425968111e-05, + "loss": 0.0, + "step": 42595 + }, + { + "epoch": 3.974619762993375, + "grad_norm": NaN, + "learning_rate": 8.12303721395711e-05, + "loss": 0.0, + "step": 42596 + }, + { + "epoch": 3.9747130726882522, + "grad_norm": NaN, + "learning_rate": 8.122365019432185e-05, + "loss": 0.0, + "step": 42597 + }, + { + "epoch": 3.9748063823831297, + "grad_norm": NaN, + "learning_rate": 8.12169284239505e-05, + "loss": 0.0, + "step": 42598 + }, + { + "epoch": 3.974899692078007, + "grad_norm": NaN, + "learning_rate": 8.121020682847411e-05, + "loss": 0.0, + "step": 42599 + }, + { + "epoch": 3.974993001772884, + "grad_norm": NaN, + "learning_rate": 8.120348540790979e-05, + "loss": 0.0, + "step": 42600 + }, + { + "epoch": 3.9750863114677615, + "grad_norm": NaN, + "learning_rate": 8.119676416227464e-05, + "loss": 0.0, + "step": 42601 + }, + { + "epoch": 3.975179621162639, + "grad_norm": NaN, + "learning_rate": 8.119004309158573e-05, + "loss": 0.0, + "step": 42602 + }, + { + "epoch": 3.975272930857516, + "grad_norm": NaN, + "learning_rate": 8.118332219586016e-05, + "loss": 0.0, + "step": 42603 + }, + { + "epoch": 3.9753662405523933, + "grad_norm": NaN, + "learning_rate": 8.1176601475115e-05, + "loss": 0.0, + "step": 42604 + }, + { + "epoch": 3.9754595502472707, + "grad_norm": NaN, + "learning_rate": 8.116988092936737e-05, + "loss": 0.0, + "step": 42605 + }, + { + "epoch": 3.975552859942148, + "grad_norm": NaN, + "learning_rate": 8.116316055863435e-05, + "loss": 0.0, + "step": 42606 + }, + { + "epoch": 3.9756461696370256, + "grad_norm": NaN, + "learning_rate": 8.1156440362933e-05, + "loss": 0.0, + "step": 42607 + }, + { + "epoch": 3.9757394793319025, + "grad_norm": NaN, + "learning_rate": 8.114972034228047e-05, + "loss": 0.0, + "step": 42608 + }, + { + "epoch": 3.97583278902678, + "grad_norm": NaN, + "learning_rate": 8.114300049669377e-05, + "loss": 0.0, + "step": 42609 + }, + { + "epoch": 3.975926098721657, + "grad_norm": NaN, + "learning_rate": 8.113628082619002e-05, + "loss": 0.0, + "step": 42610 + }, + { + "epoch": 3.9760194084165343, + "grad_norm": NaN, + "learning_rate": 8.112956133078633e-05, + "loss": 0.0, + "step": 42611 + }, + { + "epoch": 3.9761127181114118, + "grad_norm": NaN, + "learning_rate": 8.112284201049975e-05, + "loss": 0.0, + "step": 42612 + }, + { + "epoch": 3.976206027806289, + "grad_norm": NaN, + "learning_rate": 8.11161228653474e-05, + "loss": 0.0, + "step": 42613 + }, + { + "epoch": 3.9762993375011666, + "grad_norm": NaN, + "learning_rate": 8.110940389534633e-05, + "loss": 0.0, + "step": 42614 + }, + { + "epoch": 3.9763926471960436, + "grad_norm": NaN, + "learning_rate": 8.110268510051366e-05, + "loss": 0.0, + "step": 42615 + }, + { + "epoch": 3.976485956890921, + "grad_norm": NaN, + "learning_rate": 8.109596648086643e-05, + "loss": 0.0, + "step": 42616 + }, + { + "epoch": 3.9765792665857984, + "grad_norm": NaN, + "learning_rate": 8.108924803642173e-05, + "loss": 0.0, + "step": 42617 + }, + { + "epoch": 3.9766725762806754, + "grad_norm": NaN, + "learning_rate": 8.108252976719668e-05, + "loss": 0.0, + "step": 42618 + }, + { + "epoch": 3.976765885975553, + "grad_norm": NaN, + "learning_rate": 8.107581167320835e-05, + "loss": 0.0, + "step": 42619 + }, + { + "epoch": 3.9768591956704302, + "grad_norm": NaN, + "learning_rate": 8.106909375447381e-05, + "loss": 0.0, + "step": 42620 + }, + { + "epoch": 3.9769525053653076, + "grad_norm": NaN, + "learning_rate": 8.106237601101013e-05, + "loss": 0.0, + "step": 42621 + }, + { + "epoch": 3.9770458150601846, + "grad_norm": NaN, + "learning_rate": 8.105565844283441e-05, + "loss": 0.0, + "step": 42622 + }, + { + "epoch": 3.977139124755062, + "grad_norm": NaN, + "learning_rate": 8.104894104996374e-05, + "loss": 0.0, + "step": 42623 + }, + { + "epoch": 3.9772324344499395, + "grad_norm": NaN, + "learning_rate": 8.10422238324152e-05, + "loss": 0.0, + "step": 42624 + }, + { + "epoch": 3.9773257441448164, + "grad_norm": NaN, + "learning_rate": 8.103550679020583e-05, + "loss": 0.0, + "step": 42625 + }, + { + "epoch": 3.977419053839694, + "grad_norm": NaN, + "learning_rate": 8.102878992335275e-05, + "loss": 0.0, + "step": 42626 + }, + { + "epoch": 3.9775123635345713, + "grad_norm": NaN, + "learning_rate": 8.102207323187302e-05, + "loss": 0.0, + "step": 42627 + }, + { + "epoch": 3.9776056732294487, + "grad_norm": NaN, + "learning_rate": 8.101535671578374e-05, + "loss": 0.0, + "step": 42628 + }, + { + "epoch": 3.977698982924326, + "grad_norm": NaN, + "learning_rate": 8.100864037510197e-05, + "loss": 0.0, + "step": 42629 + }, + { + "epoch": 3.977792292619203, + "grad_norm": NaN, + "learning_rate": 8.100192420984478e-05, + "loss": 0.0, + "step": 42630 + }, + { + "epoch": 3.9778856023140805, + "grad_norm": NaN, + "learning_rate": 8.099520822002925e-05, + "loss": 0.0, + "step": 42631 + }, + { + "epoch": 3.9779789120089575, + "grad_norm": NaN, + "learning_rate": 8.098849240567249e-05, + "loss": 0.0, + "step": 42632 + }, + { + "epoch": 3.978072221703835, + "grad_norm": NaN, + "learning_rate": 8.098177676679153e-05, + "loss": 0.0, + "step": 42633 + }, + { + "epoch": 3.9781655313987123, + "grad_norm": NaN, + "learning_rate": 8.097506130340349e-05, + "loss": 0.0, + "step": 42634 + }, + { + "epoch": 3.9782588410935897, + "grad_norm": NaN, + "learning_rate": 8.096834601552538e-05, + "loss": 0.0, + "step": 42635 + }, + { + "epoch": 3.978352150788467, + "grad_norm": NaN, + "learning_rate": 8.096163090317436e-05, + "loss": 0.0, + "step": 42636 + }, + { + "epoch": 3.978445460483344, + "grad_norm": NaN, + "learning_rate": 8.095491596636745e-05, + "loss": 0.0, + "step": 42637 + }, + { + "epoch": 3.9785387701782216, + "grad_norm": NaN, + "learning_rate": 8.094820120512174e-05, + "loss": 0.0, + "step": 42638 + }, + { + "epoch": 3.978632079873099, + "grad_norm": NaN, + "learning_rate": 8.094148661945428e-05, + "loss": 0.0, + "step": 42639 + }, + { + "epoch": 3.978725389567976, + "grad_norm": NaN, + "learning_rate": 8.093477220938217e-05, + "loss": 0.0, + "step": 42640 + }, + { + "epoch": 3.9788186992628534, + "grad_norm": NaN, + "learning_rate": 8.092805797492249e-05, + "loss": 0.0, + "step": 42641 + }, + { + "epoch": 3.978912008957731, + "grad_norm": NaN, + "learning_rate": 8.092134391609228e-05, + "loss": 0.0, + "step": 42642 + }, + { + "epoch": 3.979005318652608, + "grad_norm": NaN, + "learning_rate": 8.091463003290865e-05, + "loss": 0.0, + "step": 42643 + }, + { + "epoch": 3.979098628347485, + "grad_norm": NaN, + "learning_rate": 8.090791632538864e-05, + "loss": 0.0, + "step": 42644 + }, + { + "epoch": 3.9791919380423626, + "grad_norm": NaN, + "learning_rate": 8.090120279354933e-05, + "loss": 0.0, + "step": 42645 + }, + { + "epoch": 3.97928524773724, + "grad_norm": NaN, + "learning_rate": 8.08944894374078e-05, + "loss": 0.0, + "step": 42646 + }, + { + "epoch": 3.979378557432117, + "grad_norm": NaN, + "learning_rate": 8.088777625698111e-05, + "loss": 0.0, + "step": 42647 + }, + { + "epoch": 3.9794718671269944, + "grad_norm": NaN, + "learning_rate": 8.088106325228633e-05, + "loss": 0.0, + "step": 42648 + }, + { + "epoch": 3.979565176821872, + "grad_norm": NaN, + "learning_rate": 8.087435042334053e-05, + "loss": 0.0, + "step": 42649 + }, + { + "epoch": 3.9796584865167492, + "grad_norm": NaN, + "learning_rate": 8.086763777016078e-05, + "loss": 0.0, + "step": 42650 + }, + { + "epoch": 3.9797517962116262, + "grad_norm": NaN, + "learning_rate": 8.086092529276416e-05, + "loss": 0.0, + "step": 42651 + }, + { + "epoch": 3.9798451059065036, + "grad_norm": NaN, + "learning_rate": 8.085421299116771e-05, + "loss": 0.0, + "step": 42652 + }, + { + "epoch": 3.979938415601381, + "grad_norm": NaN, + "learning_rate": 8.084750086538853e-05, + "loss": 0.0, + "step": 42653 + }, + { + "epoch": 3.980031725296258, + "grad_norm": NaN, + "learning_rate": 8.084078891544365e-05, + "loss": 0.0, + "step": 42654 + }, + { + "epoch": 3.9801250349911355, + "grad_norm": NaN, + "learning_rate": 8.083407714135018e-05, + "loss": 0.0, + "step": 42655 + }, + { + "epoch": 3.980218344686013, + "grad_norm": NaN, + "learning_rate": 8.082736554312515e-05, + "loss": 0.0, + "step": 42656 + }, + { + "epoch": 3.9803116543808903, + "grad_norm": NaN, + "learning_rate": 8.082065412078564e-05, + "loss": 0.0, + "step": 42657 + }, + { + "epoch": 3.9804049640757677, + "grad_norm": NaN, + "learning_rate": 8.081394287434872e-05, + "loss": 0.0, + "step": 42658 + }, + { + "epoch": 3.9804982737706447, + "grad_norm": NaN, + "learning_rate": 8.080723180383143e-05, + "loss": 0.0, + "step": 42659 + }, + { + "epoch": 3.980591583465522, + "grad_norm": NaN, + "learning_rate": 8.080052090925089e-05, + "loss": 0.0, + "step": 42660 + }, + { + "epoch": 3.9806848931603995, + "grad_norm": NaN, + "learning_rate": 8.07938101906241e-05, + "loss": 0.0, + "step": 42661 + }, + { + "epoch": 3.9807782028552765, + "grad_norm": NaN, + "learning_rate": 8.078709964796815e-05, + "loss": 0.0, + "step": 42662 + }, + { + "epoch": 3.980871512550154, + "grad_norm": NaN, + "learning_rate": 8.078038928130012e-05, + "loss": 0.0, + "step": 42663 + }, + { + "epoch": 3.9809648222450313, + "grad_norm": NaN, + "learning_rate": 8.077367909063703e-05, + "loss": 0.0, + "step": 42664 + }, + { + "epoch": 3.9810581319399088, + "grad_norm": NaN, + "learning_rate": 8.076696907599598e-05, + "loss": 0.0, + "step": 42665 + }, + { + "epoch": 3.9811514416347857, + "grad_norm": NaN, + "learning_rate": 8.076025923739402e-05, + "loss": 0.0, + "step": 42666 + }, + { + "epoch": 3.981244751329663, + "grad_norm": NaN, + "learning_rate": 8.075354957484821e-05, + "loss": 0.0, + "step": 42667 + }, + { + "epoch": 3.9813380610245406, + "grad_norm": NaN, + "learning_rate": 8.074684008837563e-05, + "loss": 0.0, + "step": 42668 + }, + { + "epoch": 3.9814313707194176, + "grad_norm": NaN, + "learning_rate": 8.07401307779933e-05, + "loss": 0.0, + "step": 42669 + }, + { + "epoch": 3.981524680414295, + "grad_norm": NaN, + "learning_rate": 8.073342164371829e-05, + "loss": 0.0, + "step": 42670 + }, + { + "epoch": 3.9816179901091724, + "grad_norm": NaN, + "learning_rate": 8.072671268556769e-05, + "loss": 0.0, + "step": 42671 + }, + { + "epoch": 3.98171129980405, + "grad_norm": NaN, + "learning_rate": 8.072000390355853e-05, + "loss": 0.0, + "step": 42672 + }, + { + "epoch": 3.981804609498927, + "grad_norm": NaN, + "learning_rate": 8.07132952977079e-05, + "loss": 0.0, + "step": 42673 + }, + { + "epoch": 3.981897919193804, + "grad_norm": NaN, + "learning_rate": 8.07065868680328e-05, + "loss": 0.0, + "step": 42674 + }, + { + "epoch": 3.9819912288886816, + "grad_norm": NaN, + "learning_rate": 8.069987861455036e-05, + "loss": 0.0, + "step": 42675 + }, + { + "epoch": 3.9820845385835586, + "grad_norm": NaN, + "learning_rate": 8.069317053727759e-05, + "loss": 0.0, + "step": 42676 + }, + { + "epoch": 3.982177848278436, + "grad_norm": NaN, + "learning_rate": 8.068646263623154e-05, + "loss": 0.0, + "step": 42677 + }, + { + "epoch": 3.9822711579733134, + "grad_norm": NaN, + "learning_rate": 8.06797549114293e-05, + "loss": 0.0, + "step": 42678 + }, + { + "epoch": 3.982364467668191, + "grad_norm": NaN, + "learning_rate": 8.06730473628879e-05, + "loss": 0.0, + "step": 42679 + }, + { + "epoch": 3.9824577773630683, + "grad_norm": NaN, + "learning_rate": 8.06663399906244e-05, + "loss": 0.0, + "step": 42680 + }, + { + "epoch": 3.9825510870579452, + "grad_norm": NaN, + "learning_rate": 8.065963279465587e-05, + "loss": 0.0, + "step": 42681 + }, + { + "epoch": 3.9826443967528227, + "grad_norm": NaN, + "learning_rate": 8.065292577499934e-05, + "loss": 0.0, + "step": 42682 + }, + { + "epoch": 3.9827377064476996, + "grad_norm": NaN, + "learning_rate": 8.06462189316719e-05, + "loss": 0.0, + "step": 42683 + }, + { + "epoch": 3.982831016142577, + "grad_norm": NaN, + "learning_rate": 8.063951226469056e-05, + "loss": 0.0, + "step": 42684 + }, + { + "epoch": 3.9829243258374545, + "grad_norm": NaN, + "learning_rate": 8.063280577407241e-05, + "loss": 0.0, + "step": 42685 + }, + { + "epoch": 3.983017635532332, + "grad_norm": NaN, + "learning_rate": 8.062609945983448e-05, + "loss": 0.0, + "step": 42686 + }, + { + "epoch": 3.9831109452272093, + "grad_norm": NaN, + "learning_rate": 8.061939332199381e-05, + "loss": 0.0, + "step": 42687 + }, + { + "epoch": 3.9832042549220863, + "grad_norm": NaN, + "learning_rate": 8.06126873605675e-05, + "loss": 0.0, + "step": 42688 + }, + { + "epoch": 3.9832975646169637, + "grad_norm": NaN, + "learning_rate": 8.060598157557256e-05, + "loss": 0.0, + "step": 42689 + }, + { + "epoch": 3.983390874311841, + "grad_norm": NaN, + "learning_rate": 8.059927596702605e-05, + "loss": 0.0, + "step": 42690 + }, + { + "epoch": 3.983484184006718, + "grad_norm": NaN, + "learning_rate": 8.059257053494503e-05, + "loss": 0.0, + "step": 42691 + }, + { + "epoch": 3.9835774937015955, + "grad_norm": NaN, + "learning_rate": 8.058586527934655e-05, + "loss": 0.0, + "step": 42692 + }, + { + "epoch": 3.983670803396473, + "grad_norm": NaN, + "learning_rate": 8.057916020024764e-05, + "loss": 0.0, + "step": 42693 + }, + { + "epoch": 3.9837641130913504, + "grad_norm": NaN, + "learning_rate": 8.057245529766536e-05, + "loss": 0.0, + "step": 42694 + }, + { + "epoch": 3.9838574227862273, + "grad_norm": NaN, + "learning_rate": 8.056575057161677e-05, + "loss": 0.0, + "step": 42695 + }, + { + "epoch": 3.9839507324811048, + "grad_norm": NaN, + "learning_rate": 8.05590460221189e-05, + "loss": 0.0, + "step": 42696 + }, + { + "epoch": 3.984044042175982, + "grad_norm": NaN, + "learning_rate": 8.05523416491888e-05, + "loss": 0.0, + "step": 42697 + }, + { + "epoch": 3.984137351870859, + "grad_norm": NaN, + "learning_rate": 8.054563745284353e-05, + "loss": 0.0, + "step": 42698 + }, + { + "epoch": 3.9842306615657366, + "grad_norm": NaN, + "learning_rate": 8.053893343310015e-05, + "loss": 0.0, + "step": 42699 + }, + { + "epoch": 3.984323971260614, + "grad_norm": NaN, + "learning_rate": 8.053222958997566e-05, + "loss": 0.0, + "step": 42700 + }, + { + "epoch": 3.9844172809554914, + "grad_norm": NaN, + "learning_rate": 8.052552592348714e-05, + "loss": 0.0, + "step": 42701 + }, + { + "epoch": 3.984510590650369, + "grad_norm": NaN, + "learning_rate": 8.051882243365162e-05, + "loss": 0.0, + "step": 42702 + }, + { + "epoch": 3.984603900345246, + "grad_norm": NaN, + "learning_rate": 8.051211912048617e-05, + "loss": 0.0, + "step": 42703 + }, + { + "epoch": 3.9846972100401232, + "grad_norm": NaN, + "learning_rate": 8.050541598400781e-05, + "loss": 0.0, + "step": 42704 + }, + { + "epoch": 3.984790519735, + "grad_norm": NaN, + "learning_rate": 8.04987130242336e-05, + "loss": 0.0, + "step": 42705 + }, + { + "epoch": 3.9848838294298776, + "grad_norm": NaN, + "learning_rate": 8.049201024118056e-05, + "loss": 0.0, + "step": 42706 + }, + { + "epoch": 3.984977139124755, + "grad_norm": NaN, + "learning_rate": 8.048530763486574e-05, + "loss": 0.0, + "step": 42707 + }, + { + "epoch": 3.9850704488196325, + "grad_norm": NaN, + "learning_rate": 8.047860520530621e-05, + "loss": 0.0, + "step": 42708 + }, + { + "epoch": 3.98516375851451, + "grad_norm": NaN, + "learning_rate": 8.047190295251898e-05, + "loss": 0.0, + "step": 42709 + }, + { + "epoch": 3.985257068209387, + "grad_norm": NaN, + "learning_rate": 8.046520087652113e-05, + "loss": 0.0, + "step": 42710 + }, + { + "epoch": 3.9853503779042643, + "grad_norm": NaN, + "learning_rate": 8.045849897732965e-05, + "loss": 0.0, + "step": 42711 + }, + { + "epoch": 3.9854436875991417, + "grad_norm": NaN, + "learning_rate": 8.045179725496161e-05, + "loss": 0.0, + "step": 42712 + }, + { + "epoch": 3.9855369972940187, + "grad_norm": NaN, + "learning_rate": 8.044509570943408e-05, + "loss": 0.0, + "step": 42713 + }, + { + "epoch": 3.985630306988896, + "grad_norm": NaN, + "learning_rate": 8.043839434076403e-05, + "loss": 0.0, + "step": 42714 + }, + { + "epoch": 3.9857236166837735, + "grad_norm": NaN, + "learning_rate": 8.043169314896854e-05, + "loss": 0.0, + "step": 42715 + }, + { + "epoch": 3.985816926378651, + "grad_norm": NaN, + "learning_rate": 8.042499213406466e-05, + "loss": 0.0, + "step": 42716 + }, + { + "epoch": 3.985910236073528, + "grad_norm": NaN, + "learning_rate": 8.041829129606942e-05, + "loss": 0.0, + "step": 42717 + }, + { + "epoch": 3.9860035457684053, + "grad_norm": NaN, + "learning_rate": 8.041159063499984e-05, + "loss": 0.0, + "step": 42718 + }, + { + "epoch": 3.9860968554632827, + "grad_norm": NaN, + "learning_rate": 8.040489015087297e-05, + "loss": 0.0, + "step": 42719 + }, + { + "epoch": 3.9861901651581597, + "grad_norm": NaN, + "learning_rate": 8.039818984370586e-05, + "loss": 0.0, + "step": 42720 + }, + { + "epoch": 3.986283474853037, + "grad_norm": NaN, + "learning_rate": 8.039148971351553e-05, + "loss": 0.0, + "step": 42721 + }, + { + "epoch": 3.9863767845479146, + "grad_norm": NaN, + "learning_rate": 8.038478976031901e-05, + "loss": 0.0, + "step": 42722 + }, + { + "epoch": 3.986470094242792, + "grad_norm": NaN, + "learning_rate": 8.037808998413337e-05, + "loss": 0.0, + "step": 42723 + }, + { + "epoch": 3.9865634039376694, + "grad_norm": NaN, + "learning_rate": 8.037139038497561e-05, + "loss": 0.0, + "step": 42724 + }, + { + "epoch": 3.9866567136325464, + "grad_norm": NaN, + "learning_rate": 8.036469096286279e-05, + "loss": 0.0, + "step": 42725 + }, + { + "epoch": 3.986750023327424, + "grad_norm": NaN, + "learning_rate": 8.035799171781192e-05, + "loss": 0.0, + "step": 42726 + }, + { + "epoch": 3.9868433330223008, + "grad_norm": NaN, + "learning_rate": 8.035129264984007e-05, + "loss": 0.0, + "step": 42727 + }, + { + "epoch": 3.986936642717178, + "grad_norm": NaN, + "learning_rate": 8.034459375896423e-05, + "loss": 0.0, + "step": 42728 + }, + { + "epoch": 3.9870299524120556, + "grad_norm": NaN, + "learning_rate": 8.033789504520146e-05, + "loss": 0.0, + "step": 42729 + }, + { + "epoch": 3.987123262106933, + "grad_norm": NaN, + "learning_rate": 8.033119650856879e-05, + "loss": 0.0, + "step": 42730 + }, + { + "epoch": 3.9872165718018104, + "grad_norm": NaN, + "learning_rate": 8.032449814908325e-05, + "loss": 0.0, + "step": 42731 + }, + { + "epoch": 3.9873098814966874, + "grad_norm": NaN, + "learning_rate": 8.031779996676186e-05, + "loss": 0.0, + "step": 42732 + }, + { + "epoch": 3.987403191191565, + "grad_norm": NaN, + "learning_rate": 8.031110196162168e-05, + "loss": 0.0, + "step": 42733 + }, + { + "epoch": 3.9874965008864423, + "grad_norm": NaN, + "learning_rate": 8.030440413367972e-05, + "loss": 0.0, + "step": 42734 + }, + { + "epoch": 3.9875898105813192, + "grad_norm": NaN, + "learning_rate": 8.0297706482953e-05, + "loss": 0.0, + "step": 42735 + }, + { + "epoch": 3.9876831202761966, + "grad_norm": NaN, + "learning_rate": 8.029100900945858e-05, + "loss": 0.0, + "step": 42736 + }, + { + "epoch": 3.987776429971074, + "grad_norm": NaN, + "learning_rate": 8.028431171321347e-05, + "loss": 0.0, + "step": 42737 + }, + { + "epoch": 3.9878697396659515, + "grad_norm": NaN, + "learning_rate": 8.027761459423471e-05, + "loss": 0.0, + "step": 42738 + }, + { + "epoch": 3.9879630493608285, + "grad_norm": NaN, + "learning_rate": 8.027091765253931e-05, + "loss": 0.0, + "step": 42739 + }, + { + "epoch": 3.988056359055706, + "grad_norm": NaN, + "learning_rate": 8.026422088814433e-05, + "loss": 0.0, + "step": 42740 + }, + { + "epoch": 3.9881496687505833, + "grad_norm": NaN, + "learning_rate": 8.025752430106677e-05, + "loss": 0.0, + "step": 42741 + }, + { + "epoch": 3.9882429784454603, + "grad_norm": NaN, + "learning_rate": 8.025082789132366e-05, + "loss": 0.0, + "step": 42742 + }, + { + "epoch": 3.9883362881403377, + "grad_norm": NaN, + "learning_rate": 8.024413165893203e-05, + "loss": 0.0, + "step": 42743 + }, + { + "epoch": 3.988429597835215, + "grad_norm": NaN, + "learning_rate": 8.023743560390894e-05, + "loss": 0.0, + "step": 42744 + }, + { + "epoch": 3.9885229075300925, + "grad_norm": NaN, + "learning_rate": 8.023073972627135e-05, + "loss": 0.0, + "step": 42745 + }, + { + "epoch": 3.98861621722497, + "grad_norm": NaN, + "learning_rate": 8.022404402603634e-05, + "loss": 0.0, + "step": 42746 + }, + { + "epoch": 3.988709526919847, + "grad_norm": NaN, + "learning_rate": 8.021734850322092e-05, + "loss": 0.0, + "step": 42747 + }, + { + "epoch": 3.9888028366147243, + "grad_norm": NaN, + "learning_rate": 8.02106531578421e-05, + "loss": 0.0, + "step": 42748 + }, + { + "epoch": 3.9888961463096013, + "grad_norm": NaN, + "learning_rate": 8.020395798991693e-05, + "loss": 0.0, + "step": 42749 + }, + { + "epoch": 3.9889894560044787, + "grad_norm": NaN, + "learning_rate": 8.019726299946241e-05, + "loss": 0.0, + "step": 42750 + }, + { + "epoch": 3.989082765699356, + "grad_norm": NaN, + "learning_rate": 8.019056818649558e-05, + "loss": 0.0, + "step": 42751 + }, + { + "epoch": 3.9891760753942336, + "grad_norm": NaN, + "learning_rate": 8.018387355103343e-05, + "loss": 0.0, + "step": 42752 + }, + { + "epoch": 3.989269385089111, + "grad_norm": NaN, + "learning_rate": 8.017717909309302e-05, + "loss": 0.0, + "step": 42753 + }, + { + "epoch": 3.989362694783988, + "grad_norm": NaN, + "learning_rate": 8.017048481269138e-05, + "loss": 0.0, + "step": 42754 + }, + { + "epoch": 3.9894560044788654, + "grad_norm": NaN, + "learning_rate": 8.016379070984551e-05, + "loss": 0.0, + "step": 42755 + }, + { + "epoch": 3.989549314173743, + "grad_norm": NaN, + "learning_rate": 8.015709678457243e-05, + "loss": 0.0, + "step": 42756 + }, + { + "epoch": 3.98964262386862, + "grad_norm": NaN, + "learning_rate": 8.015040303688915e-05, + "loss": 0.0, + "step": 42757 + }, + { + "epoch": 3.989735933563497, + "grad_norm": NaN, + "learning_rate": 8.014370946681271e-05, + "loss": 0.0, + "step": 42758 + }, + { + "epoch": 3.9898292432583746, + "grad_norm": NaN, + "learning_rate": 8.013701607436011e-05, + "loss": 0.0, + "step": 42759 + }, + { + "epoch": 3.989922552953252, + "grad_norm": NaN, + "learning_rate": 8.013032285954843e-05, + "loss": 0.0, + "step": 42760 + }, + { + "epoch": 3.990015862648129, + "grad_norm": NaN, + "learning_rate": 8.012362982239459e-05, + "loss": 0.0, + "step": 42761 + }, + { + "epoch": 3.9901091723430064, + "grad_norm": NaN, + "learning_rate": 8.011693696291569e-05, + "loss": 0.0, + "step": 42762 + }, + { + "epoch": 3.990202482037884, + "grad_norm": NaN, + "learning_rate": 8.011024428112871e-05, + "loss": 0.0, + "step": 42763 + }, + { + "epoch": 3.990295791732761, + "grad_norm": NaN, + "learning_rate": 8.010355177705066e-05, + "loss": 0.0, + "step": 42764 + }, + { + "epoch": 3.9903891014276383, + "grad_norm": NaN, + "learning_rate": 8.00968594506986e-05, + "loss": 0.0, + "step": 42765 + }, + { + "epoch": 3.9904824111225157, + "grad_norm": NaN, + "learning_rate": 8.009016730208951e-05, + "loss": 0.0, + "step": 42766 + }, + { + "epoch": 3.990575720817393, + "grad_norm": NaN, + "learning_rate": 8.008347533124042e-05, + "loss": 0.0, + "step": 42767 + }, + { + "epoch": 3.99066903051227, + "grad_norm": NaN, + "learning_rate": 8.007678353816832e-05, + "loss": 0.0, + "step": 42768 + }, + { + "epoch": 3.9907623402071475, + "grad_norm": NaN, + "learning_rate": 8.007009192289026e-05, + "loss": 0.0, + "step": 42769 + }, + { + "epoch": 3.990855649902025, + "grad_norm": NaN, + "learning_rate": 8.006340048542325e-05, + "loss": 0.0, + "step": 42770 + }, + { + "epoch": 3.990948959596902, + "grad_norm": NaN, + "learning_rate": 8.00567092257843e-05, + "loss": 0.0, + "step": 42771 + }, + { + "epoch": 3.9910422692917793, + "grad_norm": NaN, + "learning_rate": 8.005001814399041e-05, + "loss": 0.0, + "step": 42772 + }, + { + "epoch": 3.9911355789866567, + "grad_norm": NaN, + "learning_rate": 8.004332724005861e-05, + "loss": 0.0, + "step": 42773 + }, + { + "epoch": 3.991228888681534, + "grad_norm": NaN, + "learning_rate": 8.003663651400588e-05, + "loss": 0.0, + "step": 42774 + }, + { + "epoch": 3.9913221983764116, + "grad_norm": NaN, + "learning_rate": 8.002994596584929e-05, + "loss": 0.0, + "step": 42775 + }, + { + "epoch": 3.9914155080712885, + "grad_norm": NaN, + "learning_rate": 8.00232555956058e-05, + "loss": 0.0, + "step": 42776 + }, + { + "epoch": 3.991508817766166, + "grad_norm": NaN, + "learning_rate": 8.001656540329245e-05, + "loss": 0.0, + "step": 42777 + }, + { + "epoch": 3.991602127461043, + "grad_norm": NaN, + "learning_rate": 8.000987538892618e-05, + "loss": 0.0, + "step": 42778 + }, + { + "epoch": 3.9916954371559203, + "grad_norm": NaN, + "learning_rate": 8.000318555252419e-05, + "loss": 0.0, + "step": 42779 + }, + { + "epoch": 3.9917887468507978, + "grad_norm": NaN, + "learning_rate": 7.99964958941033e-05, + "loss": 0.0, + "step": 42780 + }, + { + "epoch": 3.991882056545675, + "grad_norm": NaN, + "learning_rate": 7.99898064136806e-05, + "loss": 0.0, + "step": 42781 + }, + { + "epoch": 3.9919753662405526, + "grad_norm": NaN, + "learning_rate": 7.998311711127308e-05, + "loss": 0.0, + "step": 42782 + }, + { + "epoch": 3.9920686759354296, + "grad_norm": NaN, + "learning_rate": 7.997642798689773e-05, + "loss": 0.0, + "step": 42783 + }, + { + "epoch": 3.992161985630307, + "grad_norm": NaN, + "learning_rate": 7.99697390405716e-05, + "loss": 0.0, + "step": 42784 + }, + { + "epoch": 3.9922552953251844, + "grad_norm": NaN, + "learning_rate": 7.996305027231166e-05, + "loss": 0.0, + "step": 42785 + }, + { + "epoch": 3.9923486050200614, + "grad_norm": NaN, + "learning_rate": 7.995636168213496e-05, + "loss": 0.0, + "step": 42786 + }, + { + "epoch": 3.992441914714939, + "grad_norm": NaN, + "learning_rate": 7.994967327005848e-05, + "loss": 0.0, + "step": 42787 + }, + { + "epoch": 3.9925352244098162, + "grad_norm": NaN, + "learning_rate": 7.994298503609921e-05, + "loss": 0.0, + "step": 42788 + }, + { + "epoch": 3.9926285341046936, + "grad_norm": NaN, + "learning_rate": 7.99362969802742e-05, + "loss": 0.0, + "step": 42789 + }, + { + "epoch": 3.9927218437995706, + "grad_norm": NaN, + "learning_rate": 7.992960910260043e-05, + "loss": 0.0, + "step": 42790 + }, + { + "epoch": 3.992815153494448, + "grad_norm": NaN, + "learning_rate": 7.992292140309488e-05, + "loss": 0.0, + "step": 42791 + }, + { + "epoch": 3.9929084631893255, + "grad_norm": NaN, + "learning_rate": 7.991623388177461e-05, + "loss": 0.0, + "step": 42792 + }, + { + "epoch": 3.9930017728842024, + "grad_norm": NaN, + "learning_rate": 7.99095465386566e-05, + "loss": 0.0, + "step": 42793 + }, + { + "epoch": 3.99309508257908, + "grad_norm": NaN, + "learning_rate": 7.990285937375783e-05, + "loss": 0.0, + "step": 42794 + }, + { + "epoch": 3.9931883922739573, + "grad_norm": NaN, + "learning_rate": 7.989617238709534e-05, + "loss": 0.0, + "step": 42795 + }, + { + "epoch": 3.9932817019688347, + "grad_norm": NaN, + "learning_rate": 7.988948557868611e-05, + "loss": 0.0, + "step": 42796 + }, + { + "epoch": 3.993375011663712, + "grad_norm": NaN, + "learning_rate": 7.988279894854715e-05, + "loss": 0.0, + "step": 42797 + }, + { + "epoch": 3.993468321358589, + "grad_norm": NaN, + "learning_rate": 7.987611249669549e-05, + "loss": 0.0, + "step": 42798 + }, + { + "epoch": 3.9935616310534665, + "grad_norm": NaN, + "learning_rate": 7.986942622314806e-05, + "loss": 0.0, + "step": 42799 + }, + { + "epoch": 3.9936549407483435, + "grad_norm": NaN, + "learning_rate": 7.986274012792192e-05, + "loss": 0.0, + "step": 42800 + }, + { + "epoch": 3.993748250443221, + "grad_norm": NaN, + "learning_rate": 7.985605421103406e-05, + "loss": 0.0, + "step": 42801 + }, + { + "epoch": 3.9938415601380983, + "grad_norm": NaN, + "learning_rate": 7.98493684725015e-05, + "loss": 0.0, + "step": 42802 + }, + { + "epoch": 3.9939348698329757, + "grad_norm": NaN, + "learning_rate": 7.984268291234118e-05, + "loss": 0.0, + "step": 42803 + }, + { + "epoch": 3.994028179527853, + "grad_norm": NaN, + "learning_rate": 7.983599753057013e-05, + "loss": 0.0, + "step": 42804 + }, + { + "epoch": 3.99412148922273, + "grad_norm": NaN, + "learning_rate": 7.982931232720538e-05, + "loss": 0.0, + "step": 42805 + }, + { + "epoch": 3.9942147989176076, + "grad_norm": NaN, + "learning_rate": 7.982262730226391e-05, + "loss": 0.0, + "step": 42806 + }, + { + "epoch": 3.994308108612485, + "grad_norm": NaN, + "learning_rate": 7.981594245576268e-05, + "loss": 0.0, + "step": 42807 + }, + { + "epoch": 3.994401418307362, + "grad_norm": NaN, + "learning_rate": 7.980925778771872e-05, + "loss": 0.0, + "step": 42808 + }, + { + "epoch": 3.9944947280022394, + "grad_norm": NaN, + "learning_rate": 7.980257329814904e-05, + "loss": 0.0, + "step": 42809 + }, + { + "epoch": 3.994588037697117, + "grad_norm": NaN, + "learning_rate": 7.979588898707062e-05, + "loss": 0.0, + "step": 42810 + }, + { + "epoch": 3.994681347391994, + "grad_norm": NaN, + "learning_rate": 7.978920485450047e-05, + "loss": 0.0, + "step": 42811 + }, + { + "epoch": 3.994774657086871, + "grad_norm": NaN, + "learning_rate": 7.978252090045554e-05, + "loss": 0.0, + "step": 42812 + }, + { + "epoch": 3.9948679667817486, + "grad_norm": NaN, + "learning_rate": 7.97758371249529e-05, + "loss": 0.0, + "step": 42813 + }, + { + "epoch": 3.994961276476626, + "grad_norm": NaN, + "learning_rate": 7.97691535280095e-05, + "loss": 0.0, + "step": 42814 + }, + { + "epoch": 3.995054586171503, + "grad_norm": NaN, + "learning_rate": 7.976247010964231e-05, + "loss": 0.0, + "step": 42815 + }, + { + "epoch": 3.9951478958663804, + "grad_norm": NaN, + "learning_rate": 7.975578686986829e-05, + "loss": 0.0, + "step": 42816 + }, + { + "epoch": 3.995241205561258, + "grad_norm": NaN, + "learning_rate": 7.974910380870466e-05, + "loss": 0.0, + "step": 42817 + }, + { + "epoch": 3.9953345152561353, + "grad_norm": NaN, + "learning_rate": 7.974242092616816e-05, + "loss": 0.0, + "step": 42818 + }, + { + "epoch": 3.9954278249510127, + "grad_norm": NaN, + "learning_rate": 7.973573822227581e-05, + "loss": 0.0, + "step": 42819 + }, + { + "epoch": 3.9955211346458896, + "grad_norm": NaN, + "learning_rate": 7.972905569704478e-05, + "loss": 0.0, + "step": 42820 + }, + { + "epoch": 3.995614444340767, + "grad_norm": NaN, + "learning_rate": 7.972237335049191e-05, + "loss": 0.0, + "step": 42821 + }, + { + "epoch": 3.995707754035644, + "grad_norm": NaN, + "learning_rate": 7.971569118263414e-05, + "loss": 0.0, + "step": 42822 + }, + { + "epoch": 3.9958010637305215, + "grad_norm": NaN, + "learning_rate": 7.970900919348867e-05, + "loss": 0.0, + "step": 42823 + }, + { + "epoch": 3.995894373425399, + "grad_norm": NaN, + "learning_rate": 7.970232738307233e-05, + "loss": 0.0, + "step": 42824 + }, + { + "epoch": 3.9959876831202763, + "grad_norm": NaN, + "learning_rate": 7.969564575140213e-05, + "loss": 0.0, + "step": 42825 + }, + { + "epoch": 3.9960809928151537, + "grad_norm": NaN, + "learning_rate": 7.968896429849509e-05, + "loss": 0.0, + "step": 42826 + }, + { + "epoch": 3.9961743025100307, + "grad_norm": NaN, + "learning_rate": 7.968228302436817e-05, + "loss": 0.0, + "step": 42827 + }, + { + "epoch": 3.996267612204908, + "grad_norm": NaN, + "learning_rate": 7.967560192903838e-05, + "loss": 0.0, + "step": 42828 + }, + { + "epoch": 3.9963609218997855, + "grad_norm": NaN, + "learning_rate": 7.966892101252271e-05, + "loss": 0.0, + "step": 42829 + }, + { + "epoch": 3.9964542315946625, + "grad_norm": NaN, + "learning_rate": 7.966224027483812e-05, + "loss": 0.0, + "step": 42830 + }, + { + "epoch": 3.99654754128954, + "grad_norm": NaN, + "learning_rate": 7.965555971600162e-05, + "loss": 0.0, + "step": 42831 + }, + { + "epoch": 3.9966408509844173, + "grad_norm": NaN, + "learning_rate": 7.964887933603023e-05, + "loss": 0.0, + "step": 42832 + }, + { + "epoch": 3.9967341606792948, + "grad_norm": NaN, + "learning_rate": 7.964219913494087e-05, + "loss": 0.0, + "step": 42833 + }, + { + "epoch": 3.9968274703741717, + "grad_norm": NaN, + "learning_rate": 7.963551911275057e-05, + "loss": 0.0, + "step": 42834 + }, + { + "epoch": 3.996920780069049, + "grad_norm": NaN, + "learning_rate": 7.962883926947628e-05, + "loss": 0.0, + "step": 42835 + }, + { + "epoch": 3.9970140897639266, + "grad_norm": NaN, + "learning_rate": 7.962215960513503e-05, + "loss": 0.0, + "step": 42836 + }, + { + "epoch": 3.9971073994588036, + "grad_norm": NaN, + "learning_rate": 7.961548011974378e-05, + "loss": 0.0, + "step": 42837 + }, + { + "epoch": 3.997200709153681, + "grad_norm": NaN, + "learning_rate": 7.96088008133195e-05, + "loss": 0.0, + "step": 42838 + }, + { + "epoch": 3.9972940188485584, + "grad_norm": NaN, + "learning_rate": 7.960212168587919e-05, + "loss": 0.0, + "step": 42839 + }, + { + "epoch": 3.997387328543436, + "grad_norm": NaN, + "learning_rate": 7.959544273743984e-05, + "loss": 0.0, + "step": 42840 + }, + { + "epoch": 3.9974806382383132, + "grad_norm": NaN, + "learning_rate": 7.958876396801842e-05, + "loss": 0.0, + "step": 42841 + }, + { + "epoch": 3.99757394793319, + "grad_norm": NaN, + "learning_rate": 7.95820853776319e-05, + "loss": 0.0, + "step": 42842 + }, + { + "epoch": 3.9976672576280676, + "grad_norm": NaN, + "learning_rate": 7.957540696629729e-05, + "loss": 0.0, + "step": 42843 + }, + { + "epoch": 3.9977605673229446, + "grad_norm": NaN, + "learning_rate": 7.956872873403155e-05, + "loss": 0.0, + "step": 42844 + }, + { + "epoch": 3.997853877017822, + "grad_norm": NaN, + "learning_rate": 7.956205068085168e-05, + "loss": 0.0, + "step": 42845 + }, + { + "epoch": 3.9979471867126994, + "grad_norm": NaN, + "learning_rate": 7.955537280677467e-05, + "loss": 0.0, + "step": 42846 + }, + { + "epoch": 3.998040496407577, + "grad_norm": NaN, + "learning_rate": 7.954869511181745e-05, + "loss": 0.0, + "step": 42847 + }, + { + "epoch": 3.9981338061024543, + "grad_norm": NaN, + "learning_rate": 7.954201759599705e-05, + "loss": 0.0, + "step": 42848 + }, + { + "epoch": 3.9982271157973313, + "grad_norm": NaN, + "learning_rate": 7.95353402593304e-05, + "loss": 0.0, + "step": 42849 + }, + { + "epoch": 3.9983204254922087, + "grad_norm": NaN, + "learning_rate": 7.952866310183454e-05, + "loss": 0.0, + "step": 42850 + }, + { + "epoch": 3.998413735187086, + "grad_norm": NaN, + "learning_rate": 7.952198612352641e-05, + "loss": 0.0, + "step": 42851 + }, + { + "epoch": 3.998507044881963, + "grad_norm": NaN, + "learning_rate": 7.951530932442297e-05, + "loss": 0.0, + "step": 42852 + }, + { + "epoch": 3.9986003545768405, + "grad_norm": NaN, + "learning_rate": 7.950863270454122e-05, + "loss": 0.0, + "step": 42853 + }, + { + "epoch": 3.998693664271718, + "grad_norm": NaN, + "learning_rate": 7.950195626389809e-05, + "loss": 0.0, + "step": 42854 + }, + { + "epoch": 3.9987869739665953, + "grad_norm": NaN, + "learning_rate": 7.94952800025107e-05, + "loss": 0.0, + "step": 42855 + }, + { + "epoch": 3.9988802836614723, + "grad_norm": NaN, + "learning_rate": 7.94886039203959e-05, + "loss": 0.0, + "step": 42856 + }, + { + "epoch": 3.9989735933563497, + "grad_norm": NaN, + "learning_rate": 7.948192801757063e-05, + "loss": 0.0, + "step": 42857 + }, + { + "epoch": 3.999066903051227, + "grad_norm": NaN, + "learning_rate": 7.947525229405202e-05, + "loss": 0.0, + "step": 42858 + }, + { + "epoch": 3.999160212746104, + "grad_norm": NaN, + "learning_rate": 7.946857674985693e-05, + "loss": 0.0, + "step": 42859 + }, + { + "epoch": 3.9992535224409815, + "grad_norm": NaN, + "learning_rate": 7.946190138500226e-05, + "loss": 0.0, + "step": 42860 + }, + { + "epoch": 3.999346832135859, + "grad_norm": NaN, + "learning_rate": 7.945522619950523e-05, + "loss": 0.0, + "step": 42861 + }, + { + "epoch": 3.9994401418307364, + "grad_norm": NaN, + "learning_rate": 7.944855119338258e-05, + "loss": 0.0, + "step": 42862 + }, + { + "epoch": 3.9995334515256133, + "grad_norm": NaN, + "learning_rate": 7.944187636665131e-05, + "loss": 0.0, + "step": 42863 + }, + { + "epoch": 3.9996267612204908, + "grad_norm": NaN, + "learning_rate": 7.943520171932858e-05, + "loss": 0.0, + "step": 42864 + }, + { + "epoch": 3.999720070915368, + "grad_norm": NaN, + "learning_rate": 7.942852725143114e-05, + "loss": 0.0, + "step": 42865 + }, + { + "epoch": 3.999813380610245, + "grad_norm": NaN, + "learning_rate": 7.942185296297603e-05, + "loss": 0.0, + "step": 42866 + }, + { + "epoch": 3.9999066903051226, + "grad_norm": NaN, + "learning_rate": 7.941517885398033e-05, + "loss": 0.0, + "step": 42867 + }, + { + "epoch": 4.0, + "grad_norm": NaN, + "learning_rate": 7.940850492446087e-05, + "loss": 0.0, + "step": 42868 + }, + { + "epoch": 4.0, + "eval_loss": NaN, + "eval_runtime": 26.5734, + "eval_samples_per_second": 6.661, + "eval_steps_per_second": 6.661, + "step": 42868 + }, + { + "epoch": 4.000093309694877, + "grad_norm": NaN, + "learning_rate": 7.940183117443468e-05, + "loss": 0.0, + "step": 42869 + }, + { + "epoch": 4.000186619389755, + "grad_norm": NaN, + "learning_rate": 7.939515760391872e-05, + "loss": 0.0, + "step": 42870 + }, + { + "epoch": 4.000279929084632, + "grad_norm": NaN, + "learning_rate": 7.938848421292994e-05, + "loss": 0.0, + "step": 42871 + }, + { + "epoch": 4.000373238779509, + "grad_norm": NaN, + "learning_rate": 7.938181100148535e-05, + "loss": 0.0, + "step": 42872 + }, + { + "epoch": 4.000466548474386, + "grad_norm": NaN, + "learning_rate": 7.93751379696019e-05, + "loss": 0.0, + "step": 42873 + }, + { + "epoch": 4.000559858169264, + "grad_norm": NaN, + "learning_rate": 7.936846511729654e-05, + "loss": 0.0, + "step": 42874 + }, + { + "epoch": 4.000653167864141, + "grad_norm": NaN, + "learning_rate": 7.936179244458623e-05, + "loss": 0.0, + "step": 42875 + }, + { + "epoch": 4.0007464775590185, + "grad_norm": NaN, + "learning_rate": 7.9355119951488e-05, + "loss": 0.0, + "step": 42876 + }, + { + "epoch": 4.000839787253896, + "grad_norm": NaN, + "learning_rate": 7.934844763801875e-05, + "loss": 0.0, + "step": 42877 + }, + { + "epoch": 4.000933096948773, + "grad_norm": NaN, + "learning_rate": 7.934177550419549e-05, + "loss": 0.0, + "step": 42878 + }, + { + "epoch": 4.00102640664365, + "grad_norm": NaN, + "learning_rate": 7.933510355003512e-05, + "loss": 0.0, + "step": 42879 + }, + { + "epoch": 4.001119716338527, + "grad_norm": NaN, + "learning_rate": 7.932843177555468e-05, + "loss": 0.0, + "step": 42880 + }, + { + "epoch": 4.001213026033405, + "grad_norm": NaN, + "learning_rate": 7.932176018077111e-05, + "loss": 0.0, + "step": 42881 + }, + { + "epoch": 4.001306335728282, + "grad_norm": NaN, + "learning_rate": 7.931508876570135e-05, + "loss": 0.0, + "step": 42882 + }, + { + "epoch": 4.0013996454231595, + "grad_norm": NaN, + "learning_rate": 7.930841753036238e-05, + "loss": 0.0, + "step": 42883 + }, + { + "epoch": 4.001492955118037, + "grad_norm": NaN, + "learning_rate": 7.930174647477119e-05, + "loss": 0.0, + "step": 42884 + }, + { + "epoch": 4.001586264812914, + "grad_norm": NaN, + "learning_rate": 7.929507559894469e-05, + "loss": 0.0, + "step": 42885 + }, + { + "epoch": 4.001679574507792, + "grad_norm": NaN, + "learning_rate": 7.928840490289988e-05, + "loss": 0.0, + "step": 42886 + }, + { + "epoch": 4.001772884202668, + "grad_norm": NaN, + "learning_rate": 7.92817343866537e-05, + "loss": 0.0, + "step": 42887 + }, + { + "epoch": 4.001866193897546, + "grad_norm": NaN, + "learning_rate": 7.927506405022314e-05, + "loss": 0.0, + "step": 42888 + }, + { + "epoch": 4.001959503592423, + "grad_norm": NaN, + "learning_rate": 7.926839389362506e-05, + "loss": 0.0, + "step": 42889 + }, + { + "epoch": 4.002052813287301, + "grad_norm": NaN, + "learning_rate": 7.926172391687664e-05, + "loss": 0.0, + "step": 42890 + }, + { + "epoch": 4.002146122982178, + "grad_norm": NaN, + "learning_rate": 7.925505411999463e-05, + "loss": 0.0, + "step": 42891 + }, + { + "epoch": 4.002239432677055, + "grad_norm": NaN, + "learning_rate": 7.924838450299601e-05, + "loss": 0.0, + "step": 42892 + }, + { + "epoch": 4.002332742371933, + "grad_norm": NaN, + "learning_rate": 7.924171506589789e-05, + "loss": 0.0, + "step": 42893 + }, + { + "epoch": 4.002426052066809, + "grad_norm": NaN, + "learning_rate": 7.923504580871709e-05, + "loss": 0.0, + "step": 42894 + }, + { + "epoch": 4.002519361761687, + "grad_norm": NaN, + "learning_rate": 7.922837673147053e-05, + "loss": 0.0, + "step": 42895 + }, + { + "epoch": 4.002612671456564, + "grad_norm": NaN, + "learning_rate": 7.922170783417538e-05, + "loss": 0.0, + "step": 42896 + }, + { + "epoch": 4.002705981151442, + "grad_norm": NaN, + "learning_rate": 7.921503911684838e-05, + "loss": 0.0, + "step": 42897 + }, + { + "epoch": 4.002799290846319, + "grad_norm": NaN, + "learning_rate": 7.920837057950653e-05, + "loss": 0.0, + "step": 42898 + }, + { + "epoch": 4.002892600541196, + "grad_norm": NaN, + "learning_rate": 7.920170222216695e-05, + "loss": 0.0, + "step": 42899 + }, + { + "epoch": 4.002985910236074, + "grad_norm": NaN, + "learning_rate": 7.919503404484639e-05, + "loss": 0.0, + "step": 42900 + }, + { + "epoch": 4.00307921993095, + "grad_norm": NaN, + "learning_rate": 7.918836604756182e-05, + "loss": 0.0, + "step": 42901 + }, + { + "epoch": 4.003172529625828, + "grad_norm": NaN, + "learning_rate": 7.918169823033039e-05, + "loss": 0.0, + "step": 42902 + }, + { + "epoch": 4.003265839320705, + "grad_norm": NaN, + "learning_rate": 7.917503059316887e-05, + "loss": 0.0, + "step": 42903 + }, + { + "epoch": 4.003359149015583, + "grad_norm": NaN, + "learning_rate": 7.91683631360942e-05, + "loss": 0.0, + "step": 42904 + }, + { + "epoch": 4.00345245871046, + "grad_norm": NaN, + "learning_rate": 7.916169585912352e-05, + "loss": 0.0, + "step": 42905 + }, + { + "epoch": 4.0035457684053375, + "grad_norm": NaN, + "learning_rate": 7.915502876227359e-05, + "loss": 0.0, + "step": 42906 + }, + { + "epoch": 4.003639078100215, + "grad_norm": NaN, + "learning_rate": 7.91483618455614e-05, + "loss": 0.0, + "step": 42907 + }, + { + "epoch": 4.003732387795092, + "grad_norm": NaN, + "learning_rate": 7.914169510900403e-05, + "loss": 0.0, + "step": 42908 + }, + { + "epoch": 4.003825697489969, + "grad_norm": NaN, + "learning_rate": 7.913502855261831e-05, + "loss": 0.0, + "step": 42909 + }, + { + "epoch": 4.003919007184846, + "grad_norm": NaN, + "learning_rate": 7.912836217642114e-05, + "loss": 0.0, + "step": 42910 + }, + { + "epoch": 4.004012316879724, + "grad_norm": NaN, + "learning_rate": 7.912169598042968e-05, + "loss": 0.0, + "step": 42911 + }, + { + "epoch": 4.004105626574601, + "grad_norm": NaN, + "learning_rate": 7.91150299646607e-05, + "loss": 0.0, + "step": 42912 + }, + { + "epoch": 4.0041989362694785, + "grad_norm": NaN, + "learning_rate": 7.910836412913115e-05, + "loss": 0.0, + "step": 42913 + }, + { + "epoch": 4.004292245964356, + "grad_norm": NaN, + "learning_rate": 7.91016984738581e-05, + "loss": 0.0, + "step": 42914 + }, + { + "epoch": 4.004385555659233, + "grad_norm": NaN, + "learning_rate": 7.909503299885842e-05, + "loss": 0.0, + "step": 42915 + }, + { + "epoch": 4.00447886535411, + "grad_norm": NaN, + "learning_rate": 7.908836770414904e-05, + "loss": 0.0, + "step": 42916 + }, + { + "epoch": 4.004572175048987, + "grad_norm": NaN, + "learning_rate": 7.908170258974693e-05, + "loss": 0.0, + "step": 42917 + }, + { + "epoch": 4.004665484743865, + "grad_norm": NaN, + "learning_rate": 7.907503765566905e-05, + "loss": 0.0, + "step": 42918 + }, + { + "epoch": 4.004758794438742, + "grad_norm": NaN, + "learning_rate": 7.906837290193236e-05, + "loss": 0.0, + "step": 42919 + }, + { + "epoch": 4.00485210413362, + "grad_norm": NaN, + "learning_rate": 7.906170832855378e-05, + "loss": 0.0, + "step": 42920 + }, + { + "epoch": 4.004945413828497, + "grad_norm": NaN, + "learning_rate": 7.905504393555025e-05, + "loss": 0.0, + "step": 42921 + }, + { + "epoch": 4.005038723523374, + "grad_norm": NaN, + "learning_rate": 7.904837972293873e-05, + "loss": 0.0, + "step": 42922 + }, + { + "epoch": 4.005132033218251, + "grad_norm": NaN, + "learning_rate": 7.904171569073616e-05, + "loss": 0.0, + "step": 42923 + }, + { + "epoch": 4.005225342913128, + "grad_norm": NaN, + "learning_rate": 7.903505183895951e-05, + "loss": 0.0, + "step": 42924 + }, + { + "epoch": 4.005318652608006, + "grad_norm": NaN, + "learning_rate": 7.902838816762568e-05, + "loss": 0.0, + "step": 42925 + }, + { + "epoch": 4.005411962302883, + "grad_norm": NaN, + "learning_rate": 7.902172467675163e-05, + "loss": 0.0, + "step": 42926 + }, + { + "epoch": 4.005505271997761, + "grad_norm": NaN, + "learning_rate": 7.901506136635426e-05, + "loss": 0.0, + "step": 42927 + }, + { + "epoch": 4.005598581692638, + "grad_norm": NaN, + "learning_rate": 7.900839823645069e-05, + "loss": 0.0, + "step": 42928 + }, + { + "epoch": 4.0056918913875155, + "grad_norm": NaN, + "learning_rate": 7.900173528705767e-05, + "loss": 0.0, + "step": 42929 + }, + { + "epoch": 4.005785201082393, + "grad_norm": NaN, + "learning_rate": 7.899507251819213e-05, + "loss": 0.0, + "step": 42930 + }, + { + "epoch": 4.005878510777269, + "grad_norm": NaN, + "learning_rate": 7.898840992987123e-05, + "loss": 0.0, + "step": 42931 + }, + { + "epoch": 4.005971820472147, + "grad_norm": NaN, + "learning_rate": 7.898174752211168e-05, + "loss": 0.0, + "step": 42932 + }, + { + "epoch": 4.006065130167024, + "grad_norm": NaN, + "learning_rate": 7.897508529493047e-05, + "loss": 0.0, + "step": 42933 + }, + { + "epoch": 4.006158439861902, + "grad_norm": NaN, + "learning_rate": 7.89684232483447e-05, + "loss": 0.0, + "step": 42934 + }, + { + "epoch": 4.006251749556779, + "grad_norm": NaN, + "learning_rate": 7.896176138237111e-05, + "loss": 0.0, + "step": 42935 + }, + { + "epoch": 4.0063450592516565, + "grad_norm": NaN, + "learning_rate": 7.89550996970267e-05, + "loss": 0.0, + "step": 42936 + }, + { + "epoch": 4.006438368946534, + "grad_norm": NaN, + "learning_rate": 7.89484381923285e-05, + "loss": 0.0, + "step": 42937 + }, + { + "epoch": 4.0065316786414105, + "grad_norm": NaN, + "learning_rate": 7.894177686829333e-05, + "loss": 0.0, + "step": 42938 + }, + { + "epoch": 4.006624988336288, + "grad_norm": NaN, + "learning_rate": 7.893511572493813e-05, + "loss": 0.0, + "step": 42939 + }, + { + "epoch": 4.006718298031165, + "grad_norm": NaN, + "learning_rate": 7.892845476227996e-05, + "loss": 0.0, + "step": 42940 + }, + { + "epoch": 4.006811607726043, + "grad_norm": NaN, + "learning_rate": 7.892179398033565e-05, + "loss": 0.0, + "step": 42941 + }, + { + "epoch": 4.00690491742092, + "grad_norm": NaN, + "learning_rate": 7.891513337912209e-05, + "loss": 0.0, + "step": 42942 + }, + { + "epoch": 4.006998227115798, + "grad_norm": NaN, + "learning_rate": 7.890847295865642e-05, + "loss": 0.0, + "step": 42943 + }, + { + "epoch": 4.007091536810675, + "grad_norm": NaN, + "learning_rate": 7.890181271895538e-05, + "loss": 0.0, + "step": 42944 + }, + { + "epoch": 4.0071848465055515, + "grad_norm": NaN, + "learning_rate": 7.88951526600359e-05, + "loss": 0.0, + "step": 42945 + }, + { + "epoch": 4.007278156200429, + "grad_norm": NaN, + "learning_rate": 7.888849278191511e-05, + "loss": 0.0, + "step": 42946 + }, + { + "epoch": 4.007371465895306, + "grad_norm": NaN, + "learning_rate": 7.888183308460975e-05, + "loss": 0.0, + "step": 42947 + }, + { + "epoch": 4.007464775590184, + "grad_norm": NaN, + "learning_rate": 7.887517356813676e-05, + "loss": 0.0, + "step": 42948 + }, + { + "epoch": 4.007558085285061, + "grad_norm": NaN, + "learning_rate": 7.886851423251328e-05, + "loss": 0.0, + "step": 42949 + }, + { + "epoch": 4.007651394979939, + "grad_norm": NaN, + "learning_rate": 7.886185507775601e-05, + "loss": 0.0, + "step": 42950 + }, + { + "epoch": 4.007744704674816, + "grad_norm": NaN, + "learning_rate": 7.885519610388193e-05, + "loss": 0.0, + "step": 42951 + }, + { + "epoch": 4.0078380143696934, + "grad_norm": NaN, + "learning_rate": 7.884853731090812e-05, + "loss": 0.0, + "step": 42952 + }, + { + "epoch": 4.00793132406457, + "grad_norm": NaN, + "learning_rate": 7.884187869885135e-05, + "loss": 0.0, + "step": 42953 + }, + { + "epoch": 4.008024633759447, + "grad_norm": NaN, + "learning_rate": 7.883522026772855e-05, + "loss": 0.0, + "step": 42954 + }, + { + "epoch": 4.008117943454325, + "grad_norm": NaN, + "learning_rate": 7.882856201755681e-05, + "loss": 0.0, + "step": 42955 + }, + { + "epoch": 4.008211253149202, + "grad_norm": NaN, + "learning_rate": 7.88219039483529e-05, + "loss": 0.0, + "step": 42956 + }, + { + "epoch": 4.00830456284408, + "grad_norm": NaN, + "learning_rate": 7.881524606013375e-05, + "loss": 0.0, + "step": 42957 + }, + { + "epoch": 4.008397872538957, + "grad_norm": NaN, + "learning_rate": 7.880858835291644e-05, + "loss": 0.0, + "step": 42958 + }, + { + "epoch": 4.0084911822338345, + "grad_norm": NaN, + "learning_rate": 7.880193082671775e-05, + "loss": 0.0, + "step": 42959 + }, + { + "epoch": 4.008584491928711, + "grad_norm": NaN, + "learning_rate": 7.879527348155468e-05, + "loss": 0.0, + "step": 42960 + }, + { + "epoch": 4.008677801623588, + "grad_norm": NaN, + "learning_rate": 7.878861631744412e-05, + "loss": 0.0, + "step": 42961 + }, + { + "epoch": 4.008771111318466, + "grad_norm": NaN, + "learning_rate": 7.878195933440303e-05, + "loss": 0.0, + "step": 42962 + }, + { + "epoch": 4.008864421013343, + "grad_norm": NaN, + "learning_rate": 7.87753025324483e-05, + "loss": 0.0, + "step": 42963 + }, + { + "epoch": 4.008957730708221, + "grad_norm": NaN, + "learning_rate": 7.876864591159689e-05, + "loss": 0.0, + "step": 42964 + }, + { + "epoch": 4.009051040403098, + "grad_norm": NaN, + "learning_rate": 7.876198947186566e-05, + "loss": 0.0, + "step": 42965 + }, + { + "epoch": 4.0091443500979755, + "grad_norm": NaN, + "learning_rate": 7.875533321327169e-05, + "loss": 0.0, + "step": 42966 + }, + { + "epoch": 4.009237659792852, + "grad_norm": NaN, + "learning_rate": 7.874867713583174e-05, + "loss": 0.0, + "step": 42967 + }, + { + "epoch": 4.0093309694877295, + "grad_norm": NaN, + "learning_rate": 7.874202123956277e-05, + "loss": 0.0, + "step": 42968 + }, + { + "epoch": 4.009424279182607, + "grad_norm": NaN, + "learning_rate": 7.873536552448183e-05, + "loss": 0.0, + "step": 42969 + }, + { + "epoch": 4.009517588877484, + "grad_norm": NaN, + "learning_rate": 7.872870999060568e-05, + "loss": 0.0, + "step": 42970 + }, + { + "epoch": 4.009610898572362, + "grad_norm": NaN, + "learning_rate": 7.872205463795126e-05, + "loss": 0.0, + "step": 42971 + }, + { + "epoch": 4.009704208267239, + "grad_norm": NaN, + "learning_rate": 7.871539946653565e-05, + "loss": 0.0, + "step": 42972 + }, + { + "epoch": 4.009797517962117, + "grad_norm": NaN, + "learning_rate": 7.87087444763756e-05, + "loss": 0.0, + "step": 42973 + }, + { + "epoch": 4.009890827656994, + "grad_norm": NaN, + "learning_rate": 7.870208966748805e-05, + "loss": 0.0, + "step": 42974 + }, + { + "epoch": 4.0099841373518705, + "grad_norm": NaN, + "learning_rate": 7.869543503989008e-05, + "loss": 0.0, + "step": 42975 + }, + { + "epoch": 4.010077447046748, + "grad_norm": NaN, + "learning_rate": 7.868878059359843e-05, + "loss": 0.0, + "step": 42976 + }, + { + "epoch": 4.010170756741625, + "grad_norm": NaN, + "learning_rate": 7.868212632863004e-05, + "loss": 0.0, + "step": 42977 + }, + { + "epoch": 4.010264066436503, + "grad_norm": NaN, + "learning_rate": 7.867547224500199e-05, + "loss": 0.0, + "step": 42978 + }, + { + "epoch": 4.01035737613138, + "grad_norm": NaN, + "learning_rate": 7.866881834273102e-05, + "loss": 0.0, + "step": 42979 + }, + { + "epoch": 4.010450685826258, + "grad_norm": NaN, + "learning_rate": 7.866216462183407e-05, + "loss": 0.0, + "step": 42980 + }, + { + "epoch": 4.010543995521135, + "grad_norm": NaN, + "learning_rate": 7.865551108232822e-05, + "loss": 0.0, + "step": 42981 + }, + { + "epoch": 4.010637305216012, + "grad_norm": NaN, + "learning_rate": 7.86488577242302e-05, + "loss": 0.0, + "step": 42982 + }, + { + "epoch": 4.010730614910889, + "grad_norm": NaN, + "learning_rate": 7.864220454755695e-05, + "loss": 0.0, + "step": 42983 + }, + { + "epoch": 4.010823924605766, + "grad_norm": NaN, + "learning_rate": 7.863555155232554e-05, + "loss": 0.0, + "step": 42984 + }, + { + "epoch": 4.010917234300644, + "grad_norm": NaN, + "learning_rate": 7.862889873855274e-05, + "loss": 0.0, + "step": 42985 + }, + { + "epoch": 4.011010543995521, + "grad_norm": NaN, + "learning_rate": 7.862224610625544e-05, + "loss": 0.0, + "step": 42986 + }, + { + "epoch": 4.011103853690399, + "grad_norm": NaN, + "learning_rate": 7.861559365545076e-05, + "loss": 0.0, + "step": 42987 + }, + { + "epoch": 4.011197163385276, + "grad_norm": NaN, + "learning_rate": 7.86089413861554e-05, + "loss": 0.0, + "step": 42988 + }, + { + "epoch": 4.011290473080153, + "grad_norm": NaN, + "learning_rate": 7.86022892983863e-05, + "loss": 0.0, + "step": 42989 + }, + { + "epoch": 4.01138378277503, + "grad_norm": NaN, + "learning_rate": 7.859563739216055e-05, + "loss": 0.0, + "step": 42990 + }, + { + "epoch": 4.0114770924699075, + "grad_norm": NaN, + "learning_rate": 7.858898566749489e-05, + "loss": 0.0, + "step": 42991 + }, + { + "epoch": 4.011570402164785, + "grad_norm": NaN, + "learning_rate": 7.858233412440622e-05, + "loss": 0.0, + "step": 42992 + }, + { + "epoch": 4.011663711859662, + "grad_norm": NaN, + "learning_rate": 7.857568276291164e-05, + "loss": 0.0, + "step": 42993 + }, + { + "epoch": 4.01175702155454, + "grad_norm": NaN, + "learning_rate": 7.856903158302789e-05, + "loss": 0.0, + "step": 42994 + }, + { + "epoch": 4.011850331249417, + "grad_norm": NaN, + "learning_rate": 7.856238058477184e-05, + "loss": 0.0, + "step": 42995 + }, + { + "epoch": 4.011943640944294, + "grad_norm": NaN, + "learning_rate": 7.855572976816064e-05, + "loss": 0.0, + "step": 42996 + }, + { + "epoch": 4.012036950639171, + "grad_norm": NaN, + "learning_rate": 7.8549079133211e-05, + "loss": 0.0, + "step": 42997 + }, + { + "epoch": 4.0121302603340485, + "grad_norm": NaN, + "learning_rate": 7.854242867993983e-05, + "loss": 0.0, + "step": 42998 + }, + { + "epoch": 4.012223570028926, + "grad_norm": NaN, + "learning_rate": 7.853577840836421e-05, + "loss": 0.0, + "step": 42999 + }, + { + "epoch": 4.012316879723803, + "grad_norm": NaN, + "learning_rate": 7.852912831850085e-05, + "loss": 0.0, + "step": 43000 + }, + { + "epoch": 4.012410189418681, + "grad_norm": NaN, + "learning_rate": 7.85224784103667e-05, + "loss": 0.0, + "step": 43001 + }, + { + "epoch": 4.012503499113558, + "grad_norm": NaN, + "learning_rate": 7.851582868397886e-05, + "loss": 0.0, + "step": 43002 + }, + { + "epoch": 4.012596808808436, + "grad_norm": NaN, + "learning_rate": 7.850917913935395e-05, + "loss": 0.0, + "step": 43003 + }, + { + "epoch": 4.012690118503312, + "grad_norm": NaN, + "learning_rate": 7.850252977650915e-05, + "loss": 0.0, + "step": 43004 + }, + { + "epoch": 4.01278342819819, + "grad_norm": NaN, + "learning_rate": 7.849588059546117e-05, + "loss": 0.0, + "step": 43005 + }, + { + "epoch": 4.012876737893067, + "grad_norm": NaN, + "learning_rate": 7.848923159622694e-05, + "loss": 0.0, + "step": 43006 + }, + { + "epoch": 4.012970047587944, + "grad_norm": NaN, + "learning_rate": 7.84825827788235e-05, + "loss": 0.0, + "step": 43007 + }, + { + "epoch": 4.013063357282822, + "grad_norm": NaN, + "learning_rate": 7.847593414326763e-05, + "loss": 0.0, + "step": 43008 + }, + { + "epoch": 4.013156666977699, + "grad_norm": NaN, + "learning_rate": 7.84692856895762e-05, + "loss": 0.0, + "step": 43009 + }, + { + "epoch": 4.013249976672577, + "grad_norm": NaN, + "learning_rate": 7.846263741776632e-05, + "loss": 0.0, + "step": 43010 + }, + { + "epoch": 4.013343286367453, + "grad_norm": NaN, + "learning_rate": 7.84559893278547e-05, + "loss": 0.0, + "step": 43011 + }, + { + "epoch": 4.013436596062331, + "grad_norm": NaN, + "learning_rate": 7.844934141985825e-05, + "loss": 0.0, + "step": 43012 + }, + { + "epoch": 4.013529905757208, + "grad_norm": NaN, + "learning_rate": 7.844269369379404e-05, + "loss": 0.0, + "step": 43013 + }, + { + "epoch": 4.0136232154520854, + "grad_norm": NaN, + "learning_rate": 7.84360461496788e-05, + "loss": 0.0, + "step": 43014 + }, + { + "epoch": 4.013716525146963, + "grad_norm": NaN, + "learning_rate": 7.842939878752944e-05, + "loss": 0.0, + "step": 43015 + }, + { + "epoch": 4.01380983484184, + "grad_norm": NaN, + "learning_rate": 7.842275160736305e-05, + "loss": 0.0, + "step": 43016 + }, + { + "epoch": 4.013903144536718, + "grad_norm": NaN, + "learning_rate": 7.841610460919634e-05, + "loss": 0.0, + "step": 43017 + }, + { + "epoch": 4.013996454231594, + "grad_norm": NaN, + "learning_rate": 7.840945779304619e-05, + "loss": 0.0, + "step": 43018 + }, + { + "epoch": 4.014089763926472, + "grad_norm": NaN, + "learning_rate": 7.840281115892972e-05, + "loss": 0.0, + "step": 43019 + }, + { + "epoch": 4.014183073621349, + "grad_norm": NaN, + "learning_rate": 7.839616470686363e-05, + "loss": 0.0, + "step": 43020 + }, + { + "epoch": 4.0142763833162265, + "grad_norm": NaN, + "learning_rate": 7.838951843686482e-05, + "loss": 0.0, + "step": 43021 + }, + { + "epoch": 4.014369693011104, + "grad_norm": NaN, + "learning_rate": 7.838287234895036e-05, + "loss": 0.0, + "step": 43022 + }, + { + "epoch": 4.014463002705981, + "grad_norm": NaN, + "learning_rate": 7.8376226443137e-05, + "loss": 0.0, + "step": 43023 + }, + { + "epoch": 4.014556312400859, + "grad_norm": NaN, + "learning_rate": 7.836958071944162e-05, + "loss": 0.0, + "step": 43024 + }, + { + "epoch": 4.014649622095736, + "grad_norm": NaN, + "learning_rate": 7.836293517788127e-05, + "loss": 0.0, + "step": 43025 + }, + { + "epoch": 4.014742931790613, + "grad_norm": NaN, + "learning_rate": 7.835628981847272e-05, + "loss": 0.0, + "step": 43026 + }, + { + "epoch": 4.01483624148549, + "grad_norm": NaN, + "learning_rate": 7.834964464123284e-05, + "loss": 0.0, + "step": 43027 + }, + { + "epoch": 4.0149295511803675, + "grad_norm": NaN, + "learning_rate": 7.83429996461787e-05, + "loss": 0.0, + "step": 43028 + }, + { + "epoch": 4.015022860875245, + "grad_norm": NaN, + "learning_rate": 7.833635483332703e-05, + "loss": 0.0, + "step": 43029 + }, + { + "epoch": 4.015116170570122, + "grad_norm": NaN, + "learning_rate": 7.832971020269474e-05, + "loss": 0.0, + "step": 43030 + }, + { + "epoch": 4.015209480265, + "grad_norm": NaN, + "learning_rate": 7.832306575429886e-05, + "loss": 0.0, + "step": 43031 + }, + { + "epoch": 4.015302789959877, + "grad_norm": NaN, + "learning_rate": 7.831642148815614e-05, + "loss": 0.0, + "step": 43032 + }, + { + "epoch": 4.015396099654754, + "grad_norm": NaN, + "learning_rate": 7.830977740428347e-05, + "loss": 0.0, + "step": 43033 + }, + { + "epoch": 4.015489409349631, + "grad_norm": NaN, + "learning_rate": 7.830313350269791e-05, + "loss": 0.0, + "step": 43034 + }, + { + "epoch": 4.015582719044509, + "grad_norm": NaN, + "learning_rate": 7.82964897834162e-05, + "loss": 0.0, + "step": 43035 + }, + { + "epoch": 4.015676028739386, + "grad_norm": NaN, + "learning_rate": 7.82898462464552e-05, + "loss": 0.0, + "step": 43036 + }, + { + "epoch": 4.015769338434263, + "grad_norm": NaN, + "learning_rate": 7.828320289183202e-05, + "loss": 0.0, + "step": 43037 + }, + { + "epoch": 4.015862648129141, + "grad_norm": NaN, + "learning_rate": 7.827655971956333e-05, + "loss": 0.0, + "step": 43038 + }, + { + "epoch": 4.015955957824018, + "grad_norm": NaN, + "learning_rate": 7.826991672966606e-05, + "loss": 0.0, + "step": 43039 + }, + { + "epoch": 4.016049267518895, + "grad_norm": NaN, + "learning_rate": 7.826327392215725e-05, + "loss": 0.0, + "step": 43040 + }, + { + "epoch": 4.016142577213772, + "grad_norm": NaN, + "learning_rate": 7.825663129705358e-05, + "loss": 0.0, + "step": 43041 + }, + { + "epoch": 4.01623588690865, + "grad_norm": NaN, + "learning_rate": 7.824998885437209e-05, + "loss": 0.0, + "step": 43042 + }, + { + "epoch": 4.016329196603527, + "grad_norm": NaN, + "learning_rate": 7.824334659412967e-05, + "loss": 0.0, + "step": 43043 + }, + { + "epoch": 4.0164225062984045, + "grad_norm": NaN, + "learning_rate": 7.823670451634308e-05, + "loss": 0.0, + "step": 43044 + }, + { + "epoch": 4.016515815993282, + "grad_norm": NaN, + "learning_rate": 7.823006262102932e-05, + "loss": 0.0, + "step": 43045 + }, + { + "epoch": 4.016609125688159, + "grad_norm": NaN, + "learning_rate": 7.822342090820532e-05, + "loss": 0.0, + "step": 43046 + }, + { + "epoch": 4.016702435383037, + "grad_norm": NaN, + "learning_rate": 7.821677937788777e-05, + "loss": 0.0, + "step": 43047 + }, + { + "epoch": 4.016795745077913, + "grad_norm": NaN, + "learning_rate": 7.821013803009375e-05, + "loss": 0.0, + "step": 43048 + }, + { + "epoch": 4.016889054772791, + "grad_norm": NaN, + "learning_rate": 7.820349686484016e-05, + "loss": 0.0, + "step": 43049 + }, + { + "epoch": 4.016982364467668, + "grad_norm": NaN, + "learning_rate": 7.81968558821437e-05, + "loss": 0.0, + "step": 43050 + }, + { + "epoch": 4.0170756741625455, + "grad_norm": NaN, + "learning_rate": 7.819021508202147e-05, + "loss": 0.0, + "step": 43051 + }, + { + "epoch": 4.017168983857423, + "grad_norm": NaN, + "learning_rate": 7.818357446449017e-05, + "loss": 0.0, + "step": 43052 + }, + { + "epoch": 4.0172622935523, + "grad_norm": NaN, + "learning_rate": 7.817693402956674e-05, + "loss": 0.0, + "step": 43053 + }, + { + "epoch": 4.017355603247178, + "grad_norm": NaN, + "learning_rate": 7.817029377726818e-05, + "loss": 0.0, + "step": 43054 + }, + { + "epoch": 4.017448912942054, + "grad_norm": NaN, + "learning_rate": 7.816365370761125e-05, + "loss": 0.0, + "step": 43055 + }, + { + "epoch": 4.017542222636932, + "grad_norm": NaN, + "learning_rate": 7.815701382061282e-05, + "loss": 0.0, + "step": 43056 + }, + { + "epoch": 4.017635532331809, + "grad_norm": NaN, + "learning_rate": 7.815037411628991e-05, + "loss": 0.0, + "step": 43057 + }, + { + "epoch": 4.017728842026687, + "grad_norm": NaN, + "learning_rate": 7.814373459465927e-05, + "loss": 0.0, + "step": 43058 + }, + { + "epoch": 4.017822151721564, + "grad_norm": NaN, + "learning_rate": 7.813709525573776e-05, + "loss": 0.0, + "step": 43059 + }, + { + "epoch": 4.017915461416441, + "grad_norm": NaN, + "learning_rate": 7.813045609954246e-05, + "loss": 0.0, + "step": 43060 + }, + { + "epoch": 4.018008771111319, + "grad_norm": NaN, + "learning_rate": 7.812381712609006e-05, + "loss": 0.0, + "step": 43061 + }, + { + "epoch": 4.018102080806195, + "grad_norm": NaN, + "learning_rate": 7.811717833539745e-05, + "loss": 0.0, + "step": 43062 + }, + { + "epoch": 4.018195390501073, + "grad_norm": NaN, + "learning_rate": 7.811053972748166e-05, + "loss": 0.0, + "step": 43063 + }, + { + "epoch": 4.01828870019595, + "grad_norm": NaN, + "learning_rate": 7.810390130235941e-05, + "loss": 0.0, + "step": 43064 + }, + { + "epoch": 4.018382009890828, + "grad_norm": NaN, + "learning_rate": 7.809726306004762e-05, + "loss": 0.0, + "step": 43065 + }, + { + "epoch": 4.018475319585705, + "grad_norm": NaN, + "learning_rate": 7.809062500056327e-05, + "loss": 0.0, + "step": 43066 + }, + { + "epoch": 4.0185686292805824, + "grad_norm": NaN, + "learning_rate": 7.808398712392312e-05, + "loss": 0.0, + "step": 43067 + }, + { + "epoch": 4.01866193897546, + "grad_norm": NaN, + "learning_rate": 7.807734943014402e-05, + "loss": 0.0, + "step": 43068 + }, + { + "epoch": 4.018755248670337, + "grad_norm": NaN, + "learning_rate": 7.807071191924304e-05, + "loss": 0.0, + "step": 43069 + }, + { + "epoch": 4.018848558365214, + "grad_norm": NaN, + "learning_rate": 7.806407459123686e-05, + "loss": 0.0, + "step": 43070 + }, + { + "epoch": 4.018941868060091, + "grad_norm": NaN, + "learning_rate": 7.80574374461424e-05, + "loss": 0.0, + "step": 43071 + }, + { + "epoch": 4.019035177754969, + "grad_norm": NaN, + "learning_rate": 7.805080048397666e-05, + "loss": 0.0, + "step": 43072 + }, + { + "epoch": 4.019128487449846, + "grad_norm": NaN, + "learning_rate": 7.804416370475636e-05, + "loss": 0.0, + "step": 43073 + }, + { + "epoch": 4.0192217971447235, + "grad_norm": NaN, + "learning_rate": 7.803752710849838e-05, + "loss": 0.0, + "step": 43074 + }, + { + "epoch": 4.019315106839601, + "grad_norm": NaN, + "learning_rate": 7.803089069521979e-05, + "loss": 0.0, + "step": 43075 + }, + { + "epoch": 4.019408416534478, + "grad_norm": NaN, + "learning_rate": 7.80242544649372e-05, + "loss": 0.0, + "step": 43076 + }, + { + "epoch": 4.019501726229355, + "grad_norm": NaN, + "learning_rate": 7.801761841766764e-05, + "loss": 0.0, + "step": 43077 + }, + { + "epoch": 4.019595035924232, + "grad_norm": NaN, + "learning_rate": 7.801098255342804e-05, + "loss": 0.0, + "step": 43078 + }, + { + "epoch": 4.01968834561911, + "grad_norm": NaN, + "learning_rate": 7.800434687223507e-05, + "loss": 0.0, + "step": 43079 + }, + { + "epoch": 4.019781655313987, + "grad_norm": NaN, + "learning_rate": 7.799771137410576e-05, + "loss": 0.0, + "step": 43080 + }, + { + "epoch": 4.0198749650088645, + "grad_norm": NaN, + "learning_rate": 7.799107605905702e-05, + "loss": 0.0, + "step": 43081 + }, + { + "epoch": 4.019968274703742, + "grad_norm": NaN, + "learning_rate": 7.798444092710551e-05, + "loss": 0.0, + "step": 43082 + }, + { + "epoch": 4.020061584398619, + "grad_norm": NaN, + "learning_rate": 7.79778059782683e-05, + "loss": 0.0, + "step": 43083 + }, + { + "epoch": 4.020154894093496, + "grad_norm": NaN, + "learning_rate": 7.797117121256226e-05, + "loss": 0.0, + "step": 43084 + }, + { + "epoch": 4.020248203788373, + "grad_norm": NaN, + "learning_rate": 7.796453663000408e-05, + "loss": 0.0, + "step": 43085 + }, + { + "epoch": 4.020341513483251, + "grad_norm": NaN, + "learning_rate": 7.79579022306108e-05, + "loss": 0.0, + "step": 43086 + }, + { + "epoch": 4.020434823178128, + "grad_norm": NaN, + "learning_rate": 7.795126801439929e-05, + "loss": 0.0, + "step": 43087 + }, + { + "epoch": 4.020528132873006, + "grad_norm": NaN, + "learning_rate": 7.794463398138624e-05, + "loss": 0.0, + "step": 43088 + }, + { + "epoch": 4.020621442567883, + "grad_norm": NaN, + "learning_rate": 7.79380001315887e-05, + "loss": 0.0, + "step": 43089 + }, + { + "epoch": 4.02071475226276, + "grad_norm": NaN, + "learning_rate": 7.793136646502354e-05, + "loss": 0.0, + "step": 43090 + }, + { + "epoch": 4.020808061957637, + "grad_norm": NaN, + "learning_rate": 7.792473298170744e-05, + "loss": 0.0, + "step": 43091 + }, + { + "epoch": 4.020901371652514, + "grad_norm": NaN, + "learning_rate": 7.791809968165747e-05, + "loss": 0.0, + "step": 43092 + }, + { + "epoch": 4.020994681347392, + "grad_norm": NaN, + "learning_rate": 7.791146656489046e-05, + "loss": 0.0, + "step": 43093 + }, + { + "epoch": 4.021087991042269, + "grad_norm": NaN, + "learning_rate": 7.790483363142315e-05, + "loss": 0.0, + "step": 43094 + }, + { + "epoch": 4.021181300737147, + "grad_norm": NaN, + "learning_rate": 7.789820088127257e-05, + "loss": 0.0, + "step": 43095 + }, + { + "epoch": 4.021274610432024, + "grad_norm": NaN, + "learning_rate": 7.789156831445548e-05, + "loss": 0.0, + "step": 43096 + }, + { + "epoch": 4.0213679201269015, + "grad_norm": NaN, + "learning_rate": 7.788493593098872e-05, + "loss": 0.0, + "step": 43097 + }, + { + "epoch": 4.021461229821779, + "grad_norm": NaN, + "learning_rate": 7.78783037308893e-05, + "loss": 0.0, + "step": 43098 + }, + { + "epoch": 4.021554539516655, + "grad_norm": NaN, + "learning_rate": 7.787167171417393e-05, + "loss": 0.0, + "step": 43099 + }, + { + "epoch": 4.021647849211533, + "grad_norm": NaN, + "learning_rate": 7.786503988085948e-05, + "loss": 0.0, + "step": 43100 + }, + { + "epoch": 4.02174115890641, + "grad_norm": NaN, + "learning_rate": 7.785840823096298e-05, + "loss": 0.0, + "step": 43101 + }, + { + "epoch": 4.021834468601288, + "grad_norm": NaN, + "learning_rate": 7.785177676450112e-05, + "loss": 0.0, + "step": 43102 + }, + { + "epoch": 4.021927778296165, + "grad_norm": NaN, + "learning_rate": 7.784514548149077e-05, + "loss": 0.0, + "step": 43103 + }, + { + "epoch": 4.0220210879910425, + "grad_norm": NaN, + "learning_rate": 7.783851438194894e-05, + "loss": 0.0, + "step": 43104 + }, + { + "epoch": 4.02211439768592, + "grad_norm": NaN, + "learning_rate": 7.783188346589237e-05, + "loss": 0.0, + "step": 43105 + }, + { + "epoch": 4.0222077073807965, + "grad_norm": NaN, + "learning_rate": 7.782525273333783e-05, + "loss": 0.0, + "step": 43106 + }, + { + "epoch": 4.022301017075674, + "grad_norm": NaN, + "learning_rate": 7.781862218430244e-05, + "loss": 0.0, + "step": 43107 + }, + { + "epoch": 4.022394326770551, + "grad_norm": NaN, + "learning_rate": 7.781199181880286e-05, + "loss": 0.0, + "step": 43108 + }, + { + "epoch": 4.022487636465429, + "grad_norm": NaN, + "learning_rate": 7.780536163685595e-05, + "loss": 0.0, + "step": 43109 + }, + { + "epoch": 4.022580946160306, + "grad_norm": NaN, + "learning_rate": 7.77987316384787e-05, + "loss": 0.0, + "step": 43110 + }, + { + "epoch": 4.022674255855184, + "grad_norm": NaN, + "learning_rate": 7.779210182368786e-05, + "loss": 0.0, + "step": 43111 + }, + { + "epoch": 4.022767565550061, + "grad_norm": NaN, + "learning_rate": 7.778547219250026e-05, + "loss": 0.0, + "step": 43112 + }, + { + "epoch": 4.0228608752449375, + "grad_norm": NaN, + "learning_rate": 7.77788427449329e-05, + "loss": 0.0, + "step": 43113 + }, + { + "epoch": 4.022954184939815, + "grad_norm": NaN, + "learning_rate": 7.777221348100246e-05, + "loss": 0.0, + "step": 43114 + }, + { + "epoch": 4.023047494634692, + "grad_norm": NaN, + "learning_rate": 7.776558440072593e-05, + "loss": 0.0, + "step": 43115 + }, + { + "epoch": 4.02314080432957, + "grad_norm": NaN, + "learning_rate": 7.77589555041202e-05, + "loss": 0.0, + "step": 43116 + }, + { + "epoch": 4.023234114024447, + "grad_norm": NaN, + "learning_rate": 7.77523267912019e-05, + "loss": 0.0, + "step": 43117 + }, + { + "epoch": 4.023327423719325, + "grad_norm": NaN, + "learning_rate": 7.774569826198812e-05, + "loss": 0.0, + "step": 43118 + }, + { + "epoch": 4.023420733414202, + "grad_norm": NaN, + "learning_rate": 7.773906991649566e-05, + "loss": 0.0, + "step": 43119 + }, + { + "epoch": 4.0235140431090795, + "grad_norm": NaN, + "learning_rate": 7.773244175474122e-05, + "loss": 0.0, + "step": 43120 + }, + { + "epoch": 4.023607352803956, + "grad_norm": NaN, + "learning_rate": 7.772581377674185e-05, + "loss": 0.0, + "step": 43121 + }, + { + "epoch": 4.023700662498833, + "grad_norm": NaN, + "learning_rate": 7.771918598251437e-05, + "loss": 0.0, + "step": 43122 + }, + { + "epoch": 4.023793972193711, + "grad_norm": NaN, + "learning_rate": 7.771255837207548e-05, + "loss": 0.0, + "step": 43123 + }, + { + "epoch": 4.023887281888588, + "grad_norm": NaN, + "learning_rate": 7.770593094544221e-05, + "loss": 0.0, + "step": 43124 + }, + { + "epoch": 4.023980591583466, + "grad_norm": NaN, + "learning_rate": 7.76993037026314e-05, + "loss": 0.0, + "step": 43125 + }, + { + "epoch": 4.024073901278343, + "grad_norm": NaN, + "learning_rate": 7.769267664365971e-05, + "loss": 0.0, + "step": 43126 + }, + { + "epoch": 4.0241672109732205, + "grad_norm": NaN, + "learning_rate": 7.768604976854419e-05, + "loss": 0.0, + "step": 43127 + }, + { + "epoch": 4.024260520668097, + "grad_norm": NaN, + "learning_rate": 7.767942307730168e-05, + "loss": 0.0, + "step": 43128 + }, + { + "epoch": 4.0243538303629744, + "grad_norm": NaN, + "learning_rate": 7.767279656994886e-05, + "loss": 0.0, + "step": 43129 + }, + { + "epoch": 4.024447140057852, + "grad_norm": NaN, + "learning_rate": 7.766617024650277e-05, + "loss": 0.0, + "step": 43130 + }, + { + "epoch": 4.024540449752729, + "grad_norm": NaN, + "learning_rate": 7.765954410698021e-05, + "loss": 0.0, + "step": 43131 + }, + { + "epoch": 4.024633759447607, + "grad_norm": NaN, + "learning_rate": 7.76529181513979e-05, + "loss": 0.0, + "step": 43132 + }, + { + "epoch": 4.024727069142484, + "grad_norm": NaN, + "learning_rate": 7.764629237977284e-05, + "loss": 0.0, + "step": 43133 + }, + { + "epoch": 4.0248203788373615, + "grad_norm": NaN, + "learning_rate": 7.763966679212188e-05, + "loss": 0.0, + "step": 43134 + }, + { + "epoch": 4.024913688532238, + "grad_norm": NaN, + "learning_rate": 7.763304138846172e-05, + "loss": 0.0, + "step": 43135 + }, + { + "epoch": 4.0250069982271155, + "grad_norm": NaN, + "learning_rate": 7.762641616880933e-05, + "loss": 0.0, + "step": 43136 + }, + { + "epoch": 4.025100307921993, + "grad_norm": NaN, + "learning_rate": 7.761979113318159e-05, + "loss": 0.0, + "step": 43137 + }, + { + "epoch": 4.02519361761687, + "grad_norm": NaN, + "learning_rate": 7.761316628159519e-05, + "loss": 0.0, + "step": 43138 + }, + { + "epoch": 4.025286927311748, + "grad_norm": NaN, + "learning_rate": 7.760654161406714e-05, + "loss": 0.0, + "step": 43139 + }, + { + "epoch": 4.025380237006625, + "grad_norm": NaN, + "learning_rate": 7.759991713061418e-05, + "loss": 0.0, + "step": 43140 + }, + { + "epoch": 4.025473546701503, + "grad_norm": NaN, + "learning_rate": 7.759329283125313e-05, + "loss": 0.0, + "step": 43141 + }, + { + "epoch": 4.02556685639638, + "grad_norm": NaN, + "learning_rate": 7.758666871600099e-05, + "loss": 0.0, + "step": 43142 + }, + { + "epoch": 4.0256601660912565, + "grad_norm": NaN, + "learning_rate": 7.758004478487446e-05, + "loss": 0.0, + "step": 43143 + }, + { + "epoch": 4.025753475786134, + "grad_norm": NaN, + "learning_rate": 7.757342103789033e-05, + "loss": 0.0, + "step": 43144 + }, + { + "epoch": 4.025846785481011, + "grad_norm": NaN, + "learning_rate": 7.756679747506566e-05, + "loss": 0.0, + "step": 43145 + }, + { + "epoch": 4.025940095175889, + "grad_norm": NaN, + "learning_rate": 7.756017409641713e-05, + "loss": 0.0, + "step": 43146 + }, + { + "epoch": 4.026033404870766, + "grad_norm": NaN, + "learning_rate": 7.755355090196156e-05, + "loss": 0.0, + "step": 43147 + }, + { + "epoch": 4.026126714565644, + "grad_norm": NaN, + "learning_rate": 7.754692789171594e-05, + "loss": 0.0, + "step": 43148 + }, + { + "epoch": 4.026220024260521, + "grad_norm": NaN, + "learning_rate": 7.7540305065697e-05, + "loss": 0.0, + "step": 43149 + }, + { + "epoch": 4.026313333955398, + "grad_norm": NaN, + "learning_rate": 7.753368242392152e-05, + "loss": 0.0, + "step": 43150 + }, + { + "epoch": 4.026406643650275, + "grad_norm": NaN, + "learning_rate": 7.752705996640653e-05, + "loss": 0.0, + "step": 43151 + }, + { + "epoch": 4.026499953345152, + "grad_norm": NaN, + "learning_rate": 7.752043769316866e-05, + "loss": 0.0, + "step": 43152 + }, + { + "epoch": 4.02659326304003, + "grad_norm": NaN, + "learning_rate": 7.751381560422489e-05, + "loss": 0.0, + "step": 43153 + }, + { + "epoch": 4.026686572734907, + "grad_norm": NaN, + "learning_rate": 7.75071936995921e-05, + "loss": 0.0, + "step": 43154 + }, + { + "epoch": 4.026779882429785, + "grad_norm": NaN, + "learning_rate": 7.75005719792869e-05, + "loss": 0.0, + "step": 43155 + }, + { + "epoch": 4.026873192124662, + "grad_norm": NaN, + "learning_rate": 7.749395044332635e-05, + "loss": 0.0, + "step": 43156 + }, + { + "epoch": 4.026966501819539, + "grad_norm": NaN, + "learning_rate": 7.748732909172723e-05, + "loss": 0.0, + "step": 43157 + }, + { + "epoch": 4.027059811514416, + "grad_norm": NaN, + "learning_rate": 7.748070792450627e-05, + "loss": 0.0, + "step": 43158 + }, + { + "epoch": 4.0271531212092935, + "grad_norm": NaN, + "learning_rate": 7.747408694168041e-05, + "loss": 0.0, + "step": 43159 + }, + { + "epoch": 4.027246430904171, + "grad_norm": NaN, + "learning_rate": 7.746746614326656e-05, + "loss": 0.0, + "step": 43160 + }, + { + "epoch": 4.027339740599048, + "grad_norm": NaN, + "learning_rate": 7.746084552928131e-05, + "loss": 0.0, + "step": 43161 + }, + { + "epoch": 4.027433050293926, + "grad_norm": NaN, + "learning_rate": 7.745422509974172e-05, + "loss": 0.0, + "step": 43162 + }, + { + "epoch": 4.027526359988803, + "grad_norm": NaN, + "learning_rate": 7.74476048546646e-05, + "loss": 0.0, + "step": 43163 + }, + { + "epoch": 4.027619669683681, + "grad_norm": NaN, + "learning_rate": 7.74409847940666e-05, + "loss": 0.0, + "step": 43164 + }, + { + "epoch": 4.027712979378557, + "grad_norm": NaN, + "learning_rate": 7.743436491796476e-05, + "loss": 0.0, + "step": 43165 + }, + { + "epoch": 4.0278062890734345, + "grad_norm": NaN, + "learning_rate": 7.742774522637588e-05, + "loss": 0.0, + "step": 43166 + }, + { + "epoch": 4.027899598768312, + "grad_norm": NaN, + "learning_rate": 7.742112571931662e-05, + "loss": 0.0, + "step": 43167 + }, + { + "epoch": 4.027992908463189, + "grad_norm": NaN, + "learning_rate": 7.741450639680401e-05, + "loss": 0.0, + "step": 43168 + }, + { + "epoch": 4.028086218158067, + "grad_norm": NaN, + "learning_rate": 7.740788725885487e-05, + "loss": 0.0, + "step": 43169 + }, + { + "epoch": 4.028179527852944, + "grad_norm": NaN, + "learning_rate": 7.740126830548586e-05, + "loss": 0.0, + "step": 43170 + }, + { + "epoch": 4.028272837547822, + "grad_norm": NaN, + "learning_rate": 7.739464953671398e-05, + "loss": 0.0, + "step": 43171 + }, + { + "epoch": 4.028366147242698, + "grad_norm": NaN, + "learning_rate": 7.738803095255604e-05, + "loss": 0.0, + "step": 43172 + }, + { + "epoch": 4.028459456937576, + "grad_norm": NaN, + "learning_rate": 7.738141255302871e-05, + "loss": 0.0, + "step": 43173 + }, + { + "epoch": 4.028552766632453, + "grad_norm": NaN, + "learning_rate": 7.7374794338149e-05, + "loss": 0.0, + "step": 43174 + }, + { + "epoch": 4.02864607632733, + "grad_norm": NaN, + "learning_rate": 7.736817630793374e-05, + "loss": 0.0, + "step": 43175 + }, + { + "epoch": 4.028739386022208, + "grad_norm": NaN, + "learning_rate": 7.736155846239957e-05, + "loss": 0.0, + "step": 43176 + }, + { + "epoch": 4.028832695717085, + "grad_norm": NaN, + "learning_rate": 7.735494080156348e-05, + "loss": 0.0, + "step": 43177 + }, + { + "epoch": 4.028926005411963, + "grad_norm": NaN, + "learning_rate": 7.734832332544234e-05, + "loss": 0.0, + "step": 43178 + }, + { + "epoch": 4.029019315106839, + "grad_norm": NaN, + "learning_rate": 7.734170603405276e-05, + "loss": 0.0, + "step": 43179 + }, + { + "epoch": 4.029112624801717, + "grad_norm": NaN, + "learning_rate": 7.733508892741177e-05, + "loss": 0.0, + "step": 43180 + }, + { + "epoch": 4.029205934496594, + "grad_norm": NaN, + "learning_rate": 7.732847200553618e-05, + "loss": 0.0, + "step": 43181 + }, + { + "epoch": 4.0292992441914715, + "grad_norm": NaN, + "learning_rate": 7.732185526844264e-05, + "loss": 0.0, + "step": 43182 + }, + { + "epoch": 4.029392553886349, + "grad_norm": NaN, + "learning_rate": 7.731523871614815e-05, + "loss": 0.0, + "step": 43183 + }, + { + "epoch": 4.029485863581226, + "grad_norm": NaN, + "learning_rate": 7.730862234866955e-05, + "loss": 0.0, + "step": 43184 + }, + { + "epoch": 4.029579173276104, + "grad_norm": NaN, + "learning_rate": 7.730200616602345e-05, + "loss": 0.0, + "step": 43185 + }, + { + "epoch": 4.02967248297098, + "grad_norm": NaN, + "learning_rate": 7.729539016822693e-05, + "loss": 0.0, + "step": 43186 + }, + { + "epoch": 4.029765792665858, + "grad_norm": NaN, + "learning_rate": 7.728877435529666e-05, + "loss": 0.0, + "step": 43187 + }, + { + "epoch": 4.029859102360735, + "grad_norm": NaN, + "learning_rate": 7.728215872724943e-05, + "loss": 0.0, + "step": 43188 + }, + { + "epoch": 4.0299524120556125, + "grad_norm": NaN, + "learning_rate": 7.727554328410224e-05, + "loss": 0.0, + "step": 43189 + }, + { + "epoch": 4.03004572175049, + "grad_norm": NaN, + "learning_rate": 7.726892802587169e-05, + "loss": 0.0, + "step": 43190 + }, + { + "epoch": 4.030139031445367, + "grad_norm": NaN, + "learning_rate": 7.726231295257478e-05, + "loss": 0.0, + "step": 43191 + }, + { + "epoch": 4.030232341140245, + "grad_norm": NaN, + "learning_rate": 7.72556980642283e-05, + "loss": 0.0, + "step": 43192 + }, + { + "epoch": 4.030325650835122, + "grad_norm": NaN, + "learning_rate": 7.724908336084892e-05, + "loss": 0.0, + "step": 43193 + }, + { + "epoch": 4.030418960529999, + "grad_norm": NaN, + "learning_rate": 7.724246884245363e-05, + "loss": 0.0, + "step": 43194 + }, + { + "epoch": 4.030512270224876, + "grad_norm": NaN, + "learning_rate": 7.723585450905926e-05, + "loss": 0.0, + "step": 43195 + }, + { + "epoch": 4.0306055799197535, + "grad_norm": NaN, + "learning_rate": 7.722924036068244e-05, + "loss": 0.0, + "step": 43196 + }, + { + "epoch": 4.030698889614631, + "grad_norm": NaN, + "learning_rate": 7.722262639734014e-05, + "loss": 0.0, + "step": 43197 + }, + { + "epoch": 4.030792199309508, + "grad_norm": NaN, + "learning_rate": 7.721601261904923e-05, + "loss": 0.0, + "step": 43198 + }, + { + "epoch": 4.030885509004386, + "grad_norm": NaN, + "learning_rate": 7.72093990258263e-05, + "loss": 0.0, + "step": 43199 + }, + { + "epoch": 4.030978818699263, + "grad_norm": NaN, + "learning_rate": 7.720278561768837e-05, + "loss": 0.0, + "step": 43200 + }, + { + "epoch": 4.03107212839414, + "grad_norm": NaN, + "learning_rate": 7.719617239465226e-05, + "loss": 0.0, + "step": 43201 + }, + { + "epoch": 4.031165438089017, + "grad_norm": NaN, + "learning_rate": 7.718955935673459e-05, + "loss": 0.0, + "step": 43202 + }, + { + "epoch": 4.031258747783895, + "grad_norm": NaN, + "learning_rate": 7.718294650395238e-05, + "loss": 0.0, + "step": 43203 + }, + { + "epoch": 4.031352057478772, + "grad_norm": NaN, + "learning_rate": 7.717633383632242e-05, + "loss": 0.0, + "step": 43204 + }, + { + "epoch": 4.031445367173649, + "grad_norm": NaN, + "learning_rate": 7.716972135386132e-05, + "loss": 0.0, + "step": 43205 + }, + { + "epoch": 4.031538676868527, + "grad_norm": NaN, + "learning_rate": 7.716310905658614e-05, + "loss": 0.0, + "step": 43206 + }, + { + "epoch": 4.031631986563404, + "grad_norm": NaN, + "learning_rate": 7.715649694451363e-05, + "loss": 0.0, + "step": 43207 + }, + { + "epoch": 4.031725296258281, + "grad_norm": NaN, + "learning_rate": 7.714988501766048e-05, + "loss": 0.0, + "step": 43208 + }, + { + "epoch": 4.031818605953158, + "grad_norm": NaN, + "learning_rate": 7.714327327604364e-05, + "loss": 0.0, + "step": 43209 + }, + { + "epoch": 4.031911915648036, + "grad_norm": NaN, + "learning_rate": 7.713666171967993e-05, + "loss": 0.0, + "step": 43210 + }, + { + "epoch": 4.032005225342913, + "grad_norm": NaN, + "learning_rate": 7.713005034858599e-05, + "loss": 0.0, + "step": 43211 + }, + { + "epoch": 4.0320985350377905, + "grad_norm": NaN, + "learning_rate": 7.71234391627788e-05, + "loss": 0.0, + "step": 43212 + }, + { + "epoch": 4.032191844732668, + "grad_norm": NaN, + "learning_rate": 7.711682816227519e-05, + "loss": 0.0, + "step": 43213 + }, + { + "epoch": 4.032285154427545, + "grad_norm": NaN, + "learning_rate": 7.711021734709177e-05, + "loss": 0.0, + "step": 43214 + }, + { + "epoch": 4.032378464122423, + "grad_norm": NaN, + "learning_rate": 7.710360671724552e-05, + "loss": 0.0, + "step": 43215 + }, + { + "epoch": 4.032471773817299, + "grad_norm": NaN, + "learning_rate": 7.70969962727533e-05, + "loss": 0.0, + "step": 43216 + }, + { + "epoch": 4.032565083512177, + "grad_norm": NaN, + "learning_rate": 7.709038601363167e-05, + "loss": 0.0, + "step": 43217 + }, + { + "epoch": 4.032658393207054, + "grad_norm": NaN, + "learning_rate": 7.708377593989767e-05, + "loss": 0.0, + "step": 43218 + }, + { + "epoch": 4.0327517029019315, + "grad_norm": NaN, + "learning_rate": 7.707716605156808e-05, + "loss": 0.0, + "step": 43219 + }, + { + "epoch": 4.032845012596809, + "grad_norm": NaN, + "learning_rate": 7.707055634865954e-05, + "loss": 0.0, + "step": 43220 + }, + { + "epoch": 4.032938322291686, + "grad_norm": NaN, + "learning_rate": 7.706394683118902e-05, + "loss": 0.0, + "step": 43221 + }, + { + "epoch": 4.033031631986564, + "grad_norm": NaN, + "learning_rate": 7.705733749917336e-05, + "loss": 0.0, + "step": 43222 + }, + { + "epoch": 4.03312494168144, + "grad_norm": NaN, + "learning_rate": 7.705072835262915e-05, + "loss": 0.0, + "step": 43223 + }, + { + "epoch": 4.033218251376318, + "grad_norm": NaN, + "learning_rate": 7.704411939157339e-05, + "loss": 0.0, + "step": 43224 + }, + { + "epoch": 4.033311561071195, + "grad_norm": NaN, + "learning_rate": 7.703751061602288e-05, + "loss": 0.0, + "step": 43225 + }, + { + "epoch": 4.033404870766073, + "grad_norm": NaN, + "learning_rate": 7.703090202599424e-05, + "loss": 0.0, + "step": 43226 + }, + { + "epoch": 4.03349818046095, + "grad_norm": NaN, + "learning_rate": 7.702429362150447e-05, + "loss": 0.0, + "step": 43227 + }, + { + "epoch": 4.033591490155827, + "grad_norm": NaN, + "learning_rate": 7.701768540257032e-05, + "loss": 0.0, + "step": 43228 + }, + { + "epoch": 4.033684799850705, + "grad_norm": NaN, + "learning_rate": 7.701107736920855e-05, + "loss": 0.0, + "step": 43229 + }, + { + "epoch": 4.033778109545581, + "grad_norm": NaN, + "learning_rate": 7.700446952143606e-05, + "loss": 0.0, + "step": 43230 + }, + { + "epoch": 4.033871419240459, + "grad_norm": NaN, + "learning_rate": 7.699786185926947e-05, + "loss": 0.0, + "step": 43231 + }, + { + "epoch": 4.033964728935336, + "grad_norm": NaN, + "learning_rate": 7.699125438272574e-05, + "loss": 0.0, + "step": 43232 + }, + { + "epoch": 4.034058038630214, + "grad_norm": NaN, + "learning_rate": 7.698464709182169e-05, + "loss": 0.0, + "step": 43233 + }, + { + "epoch": 4.034151348325091, + "grad_norm": NaN, + "learning_rate": 7.697803998657393e-05, + "loss": 0.0, + "step": 43234 + }, + { + "epoch": 4.0342446580199685, + "grad_norm": NaN, + "learning_rate": 7.697143306699945e-05, + "loss": 0.0, + "step": 43235 + }, + { + "epoch": 4.034337967714846, + "grad_norm": NaN, + "learning_rate": 7.696482633311504e-05, + "loss": 0.0, + "step": 43236 + }, + { + "epoch": 4.034431277409723, + "grad_norm": NaN, + "learning_rate": 7.695821978493731e-05, + "loss": 0.0, + "step": 43237 + }, + { + "epoch": 4.0345245871046, + "grad_norm": NaN, + "learning_rate": 7.695161342248326e-05, + "loss": 0.0, + "step": 43238 + }, + { + "epoch": 4.034617896799477, + "grad_norm": NaN, + "learning_rate": 7.694500724576967e-05, + "loss": 0.0, + "step": 43239 + }, + { + "epoch": 4.034711206494355, + "grad_norm": NaN, + "learning_rate": 7.693840125481319e-05, + "loss": 0.0, + "step": 43240 + }, + { + "epoch": 4.034804516189232, + "grad_norm": NaN, + "learning_rate": 7.693179544963075e-05, + "loss": 0.0, + "step": 43241 + }, + { + "epoch": 4.0348978258841095, + "grad_norm": NaN, + "learning_rate": 7.692518983023917e-05, + "loss": 0.0, + "step": 43242 + }, + { + "epoch": 4.034991135578987, + "grad_norm": NaN, + "learning_rate": 7.69185843966551e-05, + "loss": 0.0, + "step": 43243 + }, + { + "epoch": 4.035084445273864, + "grad_norm": NaN, + "learning_rate": 7.691197914889545e-05, + "loss": 0.0, + "step": 43244 + }, + { + "epoch": 4.035177754968741, + "grad_norm": NaN, + "learning_rate": 7.690537408697705e-05, + "loss": 0.0, + "step": 43245 + }, + { + "epoch": 4.035271064663618, + "grad_norm": NaN, + "learning_rate": 7.689876921091651e-05, + "loss": 0.0, + "step": 43246 + }, + { + "epoch": 4.035364374358496, + "grad_norm": NaN, + "learning_rate": 7.68921645207308e-05, + "loss": 0.0, + "step": 43247 + }, + { + "epoch": 4.035457684053373, + "grad_norm": NaN, + "learning_rate": 7.688556001643674e-05, + "loss": 0.0, + "step": 43248 + }, + { + "epoch": 4.0355509937482505, + "grad_norm": NaN, + "learning_rate": 7.687895569805092e-05, + "loss": 0.0, + "step": 43249 + }, + { + "epoch": 4.035644303443128, + "grad_norm": NaN, + "learning_rate": 7.687235156559031e-05, + "loss": 0.0, + "step": 43250 + }, + { + "epoch": 4.035737613138005, + "grad_norm": NaN, + "learning_rate": 7.68657476190717e-05, + "loss": 0.0, + "step": 43251 + }, + { + "epoch": 4.035830922832882, + "grad_norm": NaN, + "learning_rate": 7.685914385851172e-05, + "loss": 0.0, + "step": 43252 + }, + { + "epoch": 4.035924232527759, + "grad_norm": NaN, + "learning_rate": 7.685254028392733e-05, + "loss": 0.0, + "step": 43253 + }, + { + "epoch": 4.036017542222637, + "grad_norm": NaN, + "learning_rate": 7.68459368953353e-05, + "loss": 0.0, + "step": 43254 + }, + { + "epoch": 4.036110851917514, + "grad_norm": NaN, + "learning_rate": 7.683933369275229e-05, + "loss": 0.0, + "step": 43255 + }, + { + "epoch": 4.036204161612392, + "grad_norm": NaN, + "learning_rate": 7.683273067619523e-05, + "loss": 0.0, + "step": 43256 + }, + { + "epoch": 4.036297471307269, + "grad_norm": NaN, + "learning_rate": 7.682612784568094e-05, + "loss": 0.0, + "step": 43257 + }, + { + "epoch": 4.036390781002146, + "grad_norm": NaN, + "learning_rate": 7.6819525201226e-05, + "loss": 0.0, + "step": 43258 + }, + { + "epoch": 4.036484090697024, + "grad_norm": NaN, + "learning_rate": 7.681292274284738e-05, + "loss": 0.0, + "step": 43259 + }, + { + "epoch": 4.0365774003919, + "grad_norm": NaN, + "learning_rate": 7.68063204705619e-05, + "loss": 0.0, + "step": 43260 + }, + { + "epoch": 4.036670710086778, + "grad_norm": NaN, + "learning_rate": 7.679971838438611e-05, + "loss": 0.0, + "step": 43261 + }, + { + "epoch": 4.036764019781655, + "grad_norm": NaN, + "learning_rate": 7.679311648433704e-05, + "loss": 0.0, + "step": 43262 + }, + { + "epoch": 4.036857329476533, + "grad_norm": NaN, + "learning_rate": 7.678651477043145e-05, + "loss": 0.0, + "step": 43263 + }, + { + "epoch": 4.03695063917141, + "grad_norm": NaN, + "learning_rate": 7.677991324268593e-05, + "loss": 0.0, + "step": 43264 + }, + { + "epoch": 4.0370439488662875, + "grad_norm": NaN, + "learning_rate": 7.677331190111749e-05, + "loss": 0.0, + "step": 43265 + }, + { + "epoch": 4.037137258561165, + "grad_norm": NaN, + "learning_rate": 7.67667107457428e-05, + "loss": 0.0, + "step": 43266 + }, + { + "epoch": 4.037230568256041, + "grad_norm": NaN, + "learning_rate": 7.676010977657866e-05, + "loss": 0.0, + "step": 43267 + }, + { + "epoch": 4.037323877950919, + "grad_norm": NaN, + "learning_rate": 7.675350899364189e-05, + "loss": 0.0, + "step": 43268 + }, + { + "epoch": 4.037417187645796, + "grad_norm": NaN, + "learning_rate": 7.674690839694924e-05, + "loss": 0.0, + "step": 43269 + }, + { + "epoch": 4.037510497340674, + "grad_norm": NaN, + "learning_rate": 7.67403079865175e-05, + "loss": 0.0, + "step": 43270 + }, + { + "epoch": 4.037603807035551, + "grad_norm": NaN, + "learning_rate": 7.673370776236347e-05, + "loss": 0.0, + "step": 43271 + }, + { + "epoch": 4.0376971167304285, + "grad_norm": NaN, + "learning_rate": 7.67271077245039e-05, + "loss": 0.0, + "step": 43272 + }, + { + "epoch": 4.037790426425306, + "grad_norm": NaN, + "learning_rate": 7.672050787295559e-05, + "loss": 0.0, + "step": 43273 + }, + { + "epoch": 4.0378837361201825, + "grad_norm": NaN, + "learning_rate": 7.671390820773541e-05, + "loss": 0.0, + "step": 43274 + }, + { + "epoch": 4.03797704581506, + "grad_norm": NaN, + "learning_rate": 7.670730872885991e-05, + "loss": 0.0, + "step": 43275 + }, + { + "epoch": 4.038070355509937, + "grad_norm": NaN, + "learning_rate": 7.670070943634608e-05, + "loss": 0.0, + "step": 43276 + }, + { + "epoch": 4.038163665204815, + "grad_norm": NaN, + "learning_rate": 7.669411033021069e-05, + "loss": 0.0, + "step": 43277 + }, + { + "epoch": 4.038256974899692, + "grad_norm": NaN, + "learning_rate": 7.668751141047032e-05, + "loss": 0.0, + "step": 43278 + }, + { + "epoch": 4.03835028459457, + "grad_norm": NaN, + "learning_rate": 7.668091267714198e-05, + "loss": 0.0, + "step": 43279 + }, + { + "epoch": 4.038443594289447, + "grad_norm": NaN, + "learning_rate": 7.667431413024243e-05, + "loss": 0.0, + "step": 43280 + }, + { + "epoch": 4.038536903984324, + "grad_norm": NaN, + "learning_rate": 7.666771576978822e-05, + "loss": 0.0, + "step": 43281 + }, + { + "epoch": 4.038630213679201, + "grad_norm": NaN, + "learning_rate": 7.666111759579639e-05, + "loss": 0.0, + "step": 43282 + }, + { + "epoch": 4.038723523374078, + "grad_norm": NaN, + "learning_rate": 7.665451960828363e-05, + "loss": 0.0, + "step": 43283 + }, + { + "epoch": 4.038816833068956, + "grad_norm": NaN, + "learning_rate": 7.664792180726661e-05, + "loss": 0.0, + "step": 43284 + }, + { + "epoch": 4.038910142763833, + "grad_norm": NaN, + "learning_rate": 7.664132419276225e-05, + "loss": 0.0, + "step": 43285 + }, + { + "epoch": 4.039003452458711, + "grad_norm": NaN, + "learning_rate": 7.663472676478732e-05, + "loss": 0.0, + "step": 43286 + }, + { + "epoch": 4.039096762153588, + "grad_norm": NaN, + "learning_rate": 7.662812952335845e-05, + "loss": 0.0, + "step": 43287 + }, + { + "epoch": 4.0391900718484655, + "grad_norm": NaN, + "learning_rate": 7.662153246849255e-05, + "loss": 0.0, + "step": 43288 + }, + { + "epoch": 4.039283381543342, + "grad_norm": NaN, + "learning_rate": 7.661493560020643e-05, + "loss": 0.0, + "step": 43289 + }, + { + "epoch": 4.039376691238219, + "grad_norm": NaN, + "learning_rate": 7.660833891851669e-05, + "loss": 0.0, + "step": 43290 + }, + { + "epoch": 4.039470000933097, + "grad_norm": NaN, + "learning_rate": 7.660174242344021e-05, + "loss": 0.0, + "step": 43291 + }, + { + "epoch": 4.039563310627974, + "grad_norm": NaN, + "learning_rate": 7.659514611499385e-05, + "loss": 0.0, + "step": 43292 + }, + { + "epoch": 4.039656620322852, + "grad_norm": NaN, + "learning_rate": 7.65885499931942e-05, + "loss": 0.0, + "step": 43293 + }, + { + "epoch": 4.039749930017729, + "grad_norm": NaN, + "learning_rate": 7.658195405805813e-05, + "loss": 0.0, + "step": 43294 + }, + { + "epoch": 4.0398432397126065, + "grad_norm": NaN, + "learning_rate": 7.657535830960248e-05, + "loss": 0.0, + "step": 43295 + }, + { + "epoch": 4.039936549407483, + "grad_norm": NaN, + "learning_rate": 7.656876274784382e-05, + "loss": 0.0, + "step": 43296 + }, + { + "epoch": 4.0400298591023605, + "grad_norm": NaN, + "learning_rate": 7.656216737279912e-05, + "loss": 0.0, + "step": 43297 + }, + { + "epoch": 4.040123168797238, + "grad_norm": NaN, + "learning_rate": 7.655557218448511e-05, + "loss": 0.0, + "step": 43298 + }, + { + "epoch": 4.040216478492115, + "grad_norm": NaN, + "learning_rate": 7.654897718291843e-05, + "loss": 0.0, + "step": 43299 + }, + { + "epoch": 4.040309788186993, + "grad_norm": NaN, + "learning_rate": 7.654238236811598e-05, + "loss": 0.0, + "step": 43300 + }, + { + "epoch": 4.04040309788187, + "grad_norm": NaN, + "learning_rate": 7.653578774009451e-05, + "loss": 0.0, + "step": 43301 + }, + { + "epoch": 4.0404964075767476, + "grad_norm": NaN, + "learning_rate": 7.652919329887076e-05, + "loss": 0.0, + "step": 43302 + }, + { + "epoch": 4.040589717271624, + "grad_norm": NaN, + "learning_rate": 7.652259904446151e-05, + "loss": 0.0, + "step": 43303 + }, + { + "epoch": 4.0406830269665015, + "grad_norm": NaN, + "learning_rate": 7.651600497688354e-05, + "loss": 0.0, + "step": 43304 + }, + { + "epoch": 4.040776336661379, + "grad_norm": NaN, + "learning_rate": 7.650941109615357e-05, + "loss": 0.0, + "step": 43305 + }, + { + "epoch": 4.040869646356256, + "grad_norm": NaN, + "learning_rate": 7.650281740228843e-05, + "loss": 0.0, + "step": 43306 + }, + { + "epoch": 4.040962956051134, + "grad_norm": NaN, + "learning_rate": 7.649622389530484e-05, + "loss": 0.0, + "step": 43307 + }, + { + "epoch": 4.041056265746011, + "grad_norm": NaN, + "learning_rate": 7.648963057521957e-05, + "loss": 0.0, + "step": 43308 + }, + { + "epoch": 4.041149575440889, + "grad_norm": NaN, + "learning_rate": 7.648303744204941e-05, + "loss": 0.0, + "step": 43309 + }, + { + "epoch": 4.041242885135766, + "grad_norm": NaN, + "learning_rate": 7.64764444958111e-05, + "loss": 0.0, + "step": 43310 + }, + { + "epoch": 4.0413361948306425, + "grad_norm": NaN, + "learning_rate": 7.64698517365214e-05, + "loss": 0.0, + "step": 43311 + }, + { + "epoch": 4.04142950452552, + "grad_norm": NaN, + "learning_rate": 7.646325916419712e-05, + "loss": 0.0, + "step": 43312 + }, + { + "epoch": 4.041522814220397, + "grad_norm": NaN, + "learning_rate": 7.645666677885496e-05, + "loss": 0.0, + "step": 43313 + }, + { + "epoch": 4.041616123915275, + "grad_norm": NaN, + "learning_rate": 7.645007458051173e-05, + "loss": 0.0, + "step": 43314 + }, + { + "epoch": 4.041709433610152, + "grad_norm": NaN, + "learning_rate": 7.644348256918416e-05, + "loss": 0.0, + "step": 43315 + }, + { + "epoch": 4.04180274330503, + "grad_norm": NaN, + "learning_rate": 7.643689074488902e-05, + "loss": 0.0, + "step": 43316 + }, + { + "epoch": 4.041896052999907, + "grad_norm": NaN, + "learning_rate": 7.643029910764308e-05, + "loss": 0.0, + "step": 43317 + }, + { + "epoch": 4.041989362694784, + "grad_norm": NaN, + "learning_rate": 7.642370765746316e-05, + "loss": 0.0, + "step": 43318 + }, + { + "epoch": 4.042082672389661, + "grad_norm": NaN, + "learning_rate": 7.641711639436584e-05, + "loss": 0.0, + "step": 43319 + }, + { + "epoch": 4.042175982084538, + "grad_norm": NaN, + "learning_rate": 7.641052531836804e-05, + "loss": 0.0, + "step": 43320 + }, + { + "epoch": 4.042269291779416, + "grad_norm": NaN, + "learning_rate": 7.640393442948657e-05, + "loss": 0.0, + "step": 43321 + }, + { + "epoch": 4.042362601474293, + "grad_norm": NaN, + "learning_rate": 7.639734372773794e-05, + "loss": 0.0, + "step": 43322 + }, + { + "epoch": 4.042455911169171, + "grad_norm": NaN, + "learning_rate": 7.639075321313914e-05, + "loss": 0.0, + "step": 43323 + }, + { + "epoch": 4.042549220864048, + "grad_norm": NaN, + "learning_rate": 7.63841628857069e-05, + "loss": 0.0, + "step": 43324 + }, + { + "epoch": 4.042642530558925, + "grad_norm": NaN, + "learning_rate": 7.637757274545782e-05, + "loss": 0.0, + "step": 43325 + }, + { + "epoch": 4.042735840253802, + "grad_norm": NaN, + "learning_rate": 7.637098279240879e-05, + "loss": 0.0, + "step": 43326 + }, + { + "epoch": 4.0428291499486795, + "grad_norm": NaN, + "learning_rate": 7.636439302657663e-05, + "loss": 0.0, + "step": 43327 + }, + { + "epoch": 4.042922459643557, + "grad_norm": NaN, + "learning_rate": 7.635780344797791e-05, + "loss": 0.0, + "step": 43328 + }, + { + "epoch": 4.043015769338434, + "grad_norm": NaN, + "learning_rate": 7.63512140566295e-05, + "loss": 0.0, + "step": 43329 + }, + { + "epoch": 4.043109079033312, + "grad_norm": NaN, + "learning_rate": 7.63446248525482e-05, + "loss": 0.0, + "step": 43330 + }, + { + "epoch": 4.043202388728189, + "grad_norm": NaN, + "learning_rate": 7.63380358357506e-05, + "loss": 0.0, + "step": 43331 + }, + { + "epoch": 4.043295698423067, + "grad_norm": NaN, + "learning_rate": 7.63314470062536e-05, + "loss": 0.0, + "step": 43332 + }, + { + "epoch": 4.043389008117943, + "grad_norm": NaN, + "learning_rate": 7.632485836407396e-05, + "loss": 0.0, + "step": 43333 + }, + { + "epoch": 4.0434823178128205, + "grad_norm": NaN, + "learning_rate": 7.631826990922829e-05, + "loss": 0.0, + "step": 43334 + }, + { + "epoch": 4.043575627507698, + "grad_norm": NaN, + "learning_rate": 7.631168164173348e-05, + "loss": 0.0, + "step": 43335 + }, + { + "epoch": 4.043668937202575, + "grad_norm": NaN, + "learning_rate": 7.630509356160628e-05, + "loss": 0.0, + "step": 43336 + }, + { + "epoch": 4.043762246897453, + "grad_norm": NaN, + "learning_rate": 7.629850566886329e-05, + "loss": 0.0, + "step": 43337 + }, + { + "epoch": 4.04385555659233, + "grad_norm": NaN, + "learning_rate": 7.629191796352142e-05, + "loss": 0.0, + "step": 43338 + }, + { + "epoch": 4.043948866287208, + "grad_norm": NaN, + "learning_rate": 7.628533044559739e-05, + "loss": 0.0, + "step": 43339 + }, + { + "epoch": 4.044042175982084, + "grad_norm": NaN, + "learning_rate": 7.627874311510792e-05, + "loss": 0.0, + "step": 43340 + }, + { + "epoch": 4.044135485676962, + "grad_norm": NaN, + "learning_rate": 7.627215597206976e-05, + "loss": 0.0, + "step": 43341 + }, + { + "epoch": 4.044228795371839, + "grad_norm": NaN, + "learning_rate": 7.626556901649968e-05, + "loss": 0.0, + "step": 43342 + }, + { + "epoch": 4.044322105066716, + "grad_norm": NaN, + "learning_rate": 7.62589822484144e-05, + "loss": 0.0, + "step": 43343 + }, + { + "epoch": 4.044415414761594, + "grad_norm": NaN, + "learning_rate": 7.625239566783068e-05, + "loss": 0.0, + "step": 43344 + }, + { + "epoch": 4.044508724456471, + "grad_norm": NaN, + "learning_rate": 7.62458092747653e-05, + "loss": 0.0, + "step": 43345 + }, + { + "epoch": 4.044602034151349, + "grad_norm": NaN, + "learning_rate": 7.623922306923494e-05, + "loss": 0.0, + "step": 43346 + }, + { + "epoch": 4.044695343846225, + "grad_norm": NaN, + "learning_rate": 7.623263705125642e-05, + "loss": 0.0, + "step": 43347 + }, + { + "epoch": 4.044788653541103, + "grad_norm": NaN, + "learning_rate": 7.622605122084645e-05, + "loss": 0.0, + "step": 43348 + }, + { + "epoch": 4.04488196323598, + "grad_norm": NaN, + "learning_rate": 7.621946557802176e-05, + "loss": 0.0, + "step": 43349 + }, + { + "epoch": 4.0449752729308575, + "grad_norm": NaN, + "learning_rate": 7.621288012279911e-05, + "loss": 0.0, + "step": 43350 + }, + { + "epoch": 4.045068582625735, + "grad_norm": NaN, + "learning_rate": 7.620629485519528e-05, + "loss": 0.0, + "step": 43351 + }, + { + "epoch": 4.045161892320612, + "grad_norm": NaN, + "learning_rate": 7.619970977522696e-05, + "loss": 0.0, + "step": 43352 + }, + { + "epoch": 4.04525520201549, + "grad_norm": NaN, + "learning_rate": 7.619312488291093e-05, + "loss": 0.0, + "step": 43353 + }, + { + "epoch": 4.045348511710367, + "grad_norm": NaN, + "learning_rate": 7.61865401782639e-05, + "loss": 0.0, + "step": 43354 + }, + { + "epoch": 4.045441821405244, + "grad_norm": NaN, + "learning_rate": 7.617995566130265e-05, + "loss": 0.0, + "step": 43355 + }, + { + "epoch": 4.045535131100121, + "grad_norm": NaN, + "learning_rate": 7.61733713320439e-05, + "loss": 0.0, + "step": 43356 + }, + { + "epoch": 4.0456284407949985, + "grad_norm": NaN, + "learning_rate": 7.61667871905044e-05, + "loss": 0.0, + "step": 43357 + }, + { + "epoch": 4.045721750489876, + "grad_norm": NaN, + "learning_rate": 7.61602032367009e-05, + "loss": 0.0, + "step": 43358 + }, + { + "epoch": 4.045815060184753, + "grad_norm": NaN, + "learning_rate": 7.61536194706501e-05, + "loss": 0.0, + "step": 43359 + }, + { + "epoch": 4.045908369879631, + "grad_norm": NaN, + "learning_rate": 7.614703589236879e-05, + "loss": 0.0, + "step": 43360 + }, + { + "epoch": 4.046001679574508, + "grad_norm": NaN, + "learning_rate": 7.614045250187371e-05, + "loss": 0.0, + "step": 43361 + }, + { + "epoch": 4.046094989269385, + "grad_norm": NaN, + "learning_rate": 7.613386929918155e-05, + "loss": 0.0, + "step": 43362 + }, + { + "epoch": 4.046188298964262, + "grad_norm": NaN, + "learning_rate": 7.612728628430907e-05, + "loss": 0.0, + "step": 43363 + }, + { + "epoch": 4.0462816086591396, + "grad_norm": NaN, + "learning_rate": 7.612070345727305e-05, + "loss": 0.0, + "step": 43364 + }, + { + "epoch": 4.046374918354017, + "grad_norm": NaN, + "learning_rate": 7.611412081809024e-05, + "loss": 0.0, + "step": 43365 + }, + { + "epoch": 4.046468228048894, + "grad_norm": NaN, + "learning_rate": 7.610753836677723e-05, + "loss": 0.0, + "step": 43366 + }, + { + "epoch": 4.046561537743772, + "grad_norm": NaN, + "learning_rate": 7.61009561033509e-05, + "loss": 0.0, + "step": 43367 + }, + { + "epoch": 4.046654847438649, + "grad_norm": NaN, + "learning_rate": 7.6094374027828e-05, + "loss": 0.0, + "step": 43368 + }, + { + "epoch": 4.046748157133526, + "grad_norm": NaN, + "learning_rate": 7.608779214022514e-05, + "loss": 0.0, + "step": 43369 + }, + { + "epoch": 4.046841466828403, + "grad_norm": NaN, + "learning_rate": 7.608121044055916e-05, + "loss": 0.0, + "step": 43370 + }, + { + "epoch": 4.046934776523281, + "grad_norm": NaN, + "learning_rate": 7.607462892884682e-05, + "loss": 0.0, + "step": 43371 + }, + { + "epoch": 4.047028086218158, + "grad_norm": NaN, + "learning_rate": 7.606804760510469e-05, + "loss": 0.0, + "step": 43372 + }, + { + "epoch": 4.047121395913035, + "grad_norm": NaN, + "learning_rate": 7.606146646934968e-05, + "loss": 0.0, + "step": 43373 + }, + { + "epoch": 4.047214705607913, + "grad_norm": NaN, + "learning_rate": 7.60548855215985e-05, + "loss": 0.0, + "step": 43374 + }, + { + "epoch": 4.04730801530279, + "grad_norm": NaN, + "learning_rate": 7.604830476186773e-05, + "loss": 0.0, + "step": 43375 + }, + { + "epoch": 4.047401324997668, + "grad_norm": NaN, + "learning_rate": 7.604172419017429e-05, + "loss": 0.0, + "step": 43376 + }, + { + "epoch": 4.047494634692544, + "grad_norm": NaN, + "learning_rate": 7.603514380653482e-05, + "loss": 0.0, + "step": 43377 + }, + { + "epoch": 4.047587944387422, + "grad_norm": NaN, + "learning_rate": 7.602856361096608e-05, + "loss": 0.0, + "step": 43378 + }, + { + "epoch": 4.047681254082299, + "grad_norm": NaN, + "learning_rate": 7.602198360348478e-05, + "loss": 0.0, + "step": 43379 + }, + { + "epoch": 4.0477745637771765, + "grad_norm": NaN, + "learning_rate": 7.601540378410767e-05, + "loss": 0.0, + "step": 43380 + }, + { + "epoch": 4.047867873472054, + "grad_norm": NaN, + "learning_rate": 7.600882415285147e-05, + "loss": 0.0, + "step": 43381 + }, + { + "epoch": 4.047961183166931, + "grad_norm": NaN, + "learning_rate": 7.60022447097329e-05, + "loss": 0.0, + "step": 43382 + }, + { + "epoch": 4.048054492861809, + "grad_norm": NaN, + "learning_rate": 7.599566545476874e-05, + "loss": 0.0, + "step": 43383 + }, + { + "epoch": 4.048147802556685, + "grad_norm": NaN, + "learning_rate": 7.598908638797565e-05, + "loss": 0.0, + "step": 43384 + }, + { + "epoch": 4.048241112251563, + "grad_norm": NaN, + "learning_rate": 7.598250750937039e-05, + "loss": 0.0, + "step": 43385 + }, + { + "epoch": 4.04833442194644, + "grad_norm": NaN, + "learning_rate": 7.597592881896968e-05, + "loss": 0.0, + "step": 43386 + }, + { + "epoch": 4.0484277316413175, + "grad_norm": NaN, + "learning_rate": 7.596935031679028e-05, + "loss": 0.0, + "step": 43387 + }, + { + "epoch": 4.048521041336195, + "grad_norm": NaN, + "learning_rate": 7.596277200284888e-05, + "loss": 0.0, + "step": 43388 + }, + { + "epoch": 4.048614351031072, + "grad_norm": NaN, + "learning_rate": 7.595619387716222e-05, + "loss": 0.0, + "step": 43389 + }, + { + "epoch": 4.04870766072595, + "grad_norm": NaN, + "learning_rate": 7.594961593974702e-05, + "loss": 0.0, + "step": 43390 + }, + { + "epoch": 4.048800970420826, + "grad_norm": NaN, + "learning_rate": 7.594303819062002e-05, + "loss": 0.0, + "step": 43391 + }, + { + "epoch": 4.048894280115704, + "grad_norm": NaN, + "learning_rate": 7.593646062979794e-05, + "loss": 0.0, + "step": 43392 + }, + { + "epoch": 4.048987589810581, + "grad_norm": NaN, + "learning_rate": 7.592988325729749e-05, + "loss": 0.0, + "step": 43393 + }, + { + "epoch": 4.049080899505459, + "grad_norm": NaN, + "learning_rate": 7.592330607313541e-05, + "loss": 0.0, + "step": 43394 + }, + { + "epoch": 4.049174209200336, + "grad_norm": NaN, + "learning_rate": 7.591672907732843e-05, + "loss": 0.0, + "step": 43395 + }, + { + "epoch": 4.049267518895213, + "grad_norm": NaN, + "learning_rate": 7.591015226989325e-05, + "loss": 0.0, + "step": 43396 + }, + { + "epoch": 4.049360828590091, + "grad_norm": NaN, + "learning_rate": 7.59035756508466e-05, + "loss": 0.0, + "step": 43397 + }, + { + "epoch": 4.049454138284968, + "grad_norm": NaN, + "learning_rate": 7.589699922020521e-05, + "loss": 0.0, + "step": 43398 + }, + { + "epoch": 4.049547447979845, + "grad_norm": NaN, + "learning_rate": 7.589042297798583e-05, + "loss": 0.0, + "step": 43399 + }, + { + "epoch": 4.049640757674722, + "grad_norm": NaN, + "learning_rate": 7.588384692420512e-05, + "loss": 0.0, + "step": 43400 + }, + { + "epoch": 4.0497340673696, + "grad_norm": NaN, + "learning_rate": 7.587727105887986e-05, + "loss": 0.0, + "step": 43401 + }, + { + "epoch": 4.049827377064477, + "grad_norm": NaN, + "learning_rate": 7.587069538202669e-05, + "loss": 0.0, + "step": 43402 + }, + { + "epoch": 4.0499206867593545, + "grad_norm": NaN, + "learning_rate": 7.586411989366243e-05, + "loss": 0.0, + "step": 43403 + }, + { + "epoch": 4.050013996454232, + "grad_norm": NaN, + "learning_rate": 7.585754459380373e-05, + "loss": 0.0, + "step": 43404 + }, + { + "epoch": 4.050107306149109, + "grad_norm": NaN, + "learning_rate": 7.585096948246734e-05, + "loss": 0.0, + "step": 43405 + }, + { + "epoch": 4.050200615843986, + "grad_norm": NaN, + "learning_rate": 7.584439455966998e-05, + "loss": 0.0, + "step": 43406 + }, + { + "epoch": 4.050293925538863, + "grad_norm": NaN, + "learning_rate": 7.583781982542836e-05, + "loss": 0.0, + "step": 43407 + }, + { + "epoch": 4.050387235233741, + "grad_norm": NaN, + "learning_rate": 7.583124527975918e-05, + "loss": 0.0, + "step": 43408 + }, + { + "epoch": 4.050480544928618, + "grad_norm": NaN, + "learning_rate": 7.582467092267923e-05, + "loss": 0.0, + "step": 43409 + }, + { + "epoch": 4.0505738546234955, + "grad_norm": NaN, + "learning_rate": 7.581809675420507e-05, + "loss": 0.0, + "step": 43410 + }, + { + "epoch": 4.050667164318373, + "grad_norm": NaN, + "learning_rate": 7.581152277435355e-05, + "loss": 0.0, + "step": 43411 + }, + { + "epoch": 4.05076047401325, + "grad_norm": NaN, + "learning_rate": 7.580494898314144e-05, + "loss": 0.0, + "step": 43412 + }, + { + "epoch": 4.050853783708127, + "grad_norm": NaN, + "learning_rate": 7.579837538058521e-05, + "loss": 0.0, + "step": 43413 + }, + { + "epoch": 4.050947093403004, + "grad_norm": NaN, + "learning_rate": 7.579180196670182e-05, + "loss": 0.0, + "step": 43414 + }, + { + "epoch": 4.051040403097882, + "grad_norm": NaN, + "learning_rate": 7.578522874150788e-05, + "loss": 0.0, + "step": 43415 + }, + { + "epoch": 4.051133712792759, + "grad_norm": NaN, + "learning_rate": 7.577865570502014e-05, + "loss": 0.0, + "step": 43416 + }, + { + "epoch": 4.0512270224876366, + "grad_norm": NaN, + "learning_rate": 7.577208285725528e-05, + "loss": 0.0, + "step": 43417 + }, + { + "epoch": 4.051320332182514, + "grad_norm": NaN, + "learning_rate": 7.576551019823001e-05, + "loss": 0.0, + "step": 43418 + }, + { + "epoch": 4.051413641877391, + "grad_norm": NaN, + "learning_rate": 7.575893772796108e-05, + "loss": 0.0, + "step": 43419 + }, + { + "epoch": 4.051506951572268, + "grad_norm": NaN, + "learning_rate": 7.575236544646518e-05, + "loss": 0.0, + "step": 43420 + }, + { + "epoch": 4.051600261267145, + "grad_norm": NaN, + "learning_rate": 7.5745793353759e-05, + "loss": 0.0, + "step": 43421 + }, + { + "epoch": 4.051693570962023, + "grad_norm": NaN, + "learning_rate": 7.573922144985929e-05, + "loss": 0.0, + "step": 43422 + }, + { + "epoch": 4.0517868806569, + "grad_norm": NaN, + "learning_rate": 7.573264973478274e-05, + "loss": 0.0, + "step": 43423 + }, + { + "epoch": 4.051880190351778, + "grad_norm": NaN, + "learning_rate": 7.572607820854606e-05, + "loss": 0.0, + "step": 43424 + }, + { + "epoch": 4.051973500046655, + "grad_norm": NaN, + "learning_rate": 7.571950687116594e-05, + "loss": 0.0, + "step": 43425 + }, + { + "epoch": 4.052066809741532, + "grad_norm": NaN, + "learning_rate": 7.571293572265916e-05, + "loss": 0.0, + "step": 43426 + }, + { + "epoch": 4.05216011943641, + "grad_norm": NaN, + "learning_rate": 7.570636476304233e-05, + "loss": 0.0, + "step": 43427 + }, + { + "epoch": 4.052253429131286, + "grad_norm": NaN, + "learning_rate": 7.569979399233223e-05, + "loss": 0.0, + "step": 43428 + }, + { + "epoch": 4.052346738826164, + "grad_norm": NaN, + "learning_rate": 7.569322341054554e-05, + "loss": 0.0, + "step": 43429 + }, + { + "epoch": 4.052440048521041, + "grad_norm": NaN, + "learning_rate": 7.568665301769898e-05, + "loss": 0.0, + "step": 43430 + }, + { + "epoch": 4.052533358215919, + "grad_norm": NaN, + "learning_rate": 7.568008281380923e-05, + "loss": 0.0, + "step": 43431 + }, + { + "epoch": 4.052626667910796, + "grad_norm": NaN, + "learning_rate": 7.567351279889302e-05, + "loss": 0.0, + "step": 43432 + }, + { + "epoch": 4.0527199776056735, + "grad_norm": NaN, + "learning_rate": 7.566694297296708e-05, + "loss": 0.0, + "step": 43433 + }, + { + "epoch": 4.052813287300551, + "grad_norm": NaN, + "learning_rate": 7.566037333604805e-05, + "loss": 0.0, + "step": 43434 + }, + { + "epoch": 4.052906596995427, + "grad_norm": NaN, + "learning_rate": 7.565380388815268e-05, + "loss": 0.0, + "step": 43435 + }, + { + "epoch": 4.052999906690305, + "grad_norm": NaN, + "learning_rate": 7.564723462929766e-05, + "loss": 0.0, + "step": 43436 + }, + { + "epoch": 4.053093216385182, + "grad_norm": NaN, + "learning_rate": 7.56406655594997e-05, + "loss": 0.0, + "step": 43437 + }, + { + "epoch": 4.05318652608006, + "grad_norm": NaN, + "learning_rate": 7.56340966787755e-05, + "loss": 0.0, + "step": 43438 + }, + { + "epoch": 4.053279835774937, + "grad_norm": NaN, + "learning_rate": 7.562752798714176e-05, + "loss": 0.0, + "step": 43439 + }, + { + "epoch": 4.0533731454698145, + "grad_norm": NaN, + "learning_rate": 7.562095948461518e-05, + "loss": 0.0, + "step": 43440 + }, + { + "epoch": 4.053466455164692, + "grad_norm": NaN, + "learning_rate": 7.561439117121249e-05, + "loss": 0.0, + "step": 43441 + }, + { + "epoch": 4.0535597648595685, + "grad_norm": NaN, + "learning_rate": 7.560782304695037e-05, + "loss": 0.0, + "step": 43442 + }, + { + "epoch": 4.053653074554446, + "grad_norm": NaN, + "learning_rate": 7.560125511184548e-05, + "loss": 0.0, + "step": 43443 + }, + { + "epoch": 4.053746384249323, + "grad_norm": NaN, + "learning_rate": 7.55946873659146e-05, + "loss": 0.0, + "step": 43444 + }, + { + "epoch": 4.053839693944201, + "grad_norm": NaN, + "learning_rate": 7.558811980917436e-05, + "loss": 0.0, + "step": 43445 + }, + { + "epoch": 4.053933003639078, + "grad_norm": NaN, + "learning_rate": 7.558155244164151e-05, + "loss": 0.0, + "step": 43446 + }, + { + "epoch": 4.054026313333956, + "grad_norm": NaN, + "learning_rate": 7.557498526333273e-05, + "loss": 0.0, + "step": 43447 + }, + { + "epoch": 4.054119623028833, + "grad_norm": NaN, + "learning_rate": 7.55684182742647e-05, + "loss": 0.0, + "step": 43448 + }, + { + "epoch": 4.05421293272371, + "grad_norm": NaN, + "learning_rate": 7.556185147445415e-05, + "loss": 0.0, + "step": 43449 + }, + { + "epoch": 4.054306242418587, + "grad_norm": NaN, + "learning_rate": 7.555528486391775e-05, + "loss": 0.0, + "step": 43450 + }, + { + "epoch": 4.054399552113464, + "grad_norm": NaN, + "learning_rate": 7.55487184426722e-05, + "loss": 0.0, + "step": 43451 + }, + { + "epoch": 4.054492861808342, + "grad_norm": NaN, + "learning_rate": 7.554215221073423e-05, + "loss": 0.0, + "step": 43452 + }, + { + "epoch": 4.054586171503219, + "grad_norm": NaN, + "learning_rate": 7.553558616812051e-05, + "loss": 0.0, + "step": 43453 + }, + { + "epoch": 4.054679481198097, + "grad_norm": NaN, + "learning_rate": 7.55290203148477e-05, + "loss": 0.0, + "step": 43454 + }, + { + "epoch": 4.054772790892974, + "grad_norm": NaN, + "learning_rate": 7.552245465093257e-05, + "loss": 0.0, + "step": 43455 + }, + { + "epoch": 4.0548661005878515, + "grad_norm": NaN, + "learning_rate": 7.551588917639175e-05, + "loss": 0.0, + "step": 43456 + }, + { + "epoch": 4.054959410282728, + "grad_norm": NaN, + "learning_rate": 7.550932389124199e-05, + "loss": 0.0, + "step": 43457 + }, + { + "epoch": 4.055052719977605, + "grad_norm": NaN, + "learning_rate": 7.550275879549991e-05, + "loss": 0.0, + "step": 43458 + }, + { + "epoch": 4.055146029672483, + "grad_norm": NaN, + "learning_rate": 7.54961938891823e-05, + "loss": 0.0, + "step": 43459 + }, + { + "epoch": 4.05523933936736, + "grad_norm": NaN, + "learning_rate": 7.548962917230575e-05, + "loss": 0.0, + "step": 43460 + }, + { + "epoch": 4.055332649062238, + "grad_norm": NaN, + "learning_rate": 7.548306464488704e-05, + "loss": 0.0, + "step": 43461 + }, + { + "epoch": 4.055425958757115, + "grad_norm": NaN, + "learning_rate": 7.547650030694279e-05, + "loss": 0.0, + "step": 43462 + }, + { + "epoch": 4.0555192684519925, + "grad_norm": NaN, + "learning_rate": 7.546993615848975e-05, + "loss": 0.0, + "step": 43463 + }, + { + "epoch": 4.055612578146869, + "grad_norm": NaN, + "learning_rate": 7.546337219954456e-05, + "loss": 0.0, + "step": 43464 + }, + { + "epoch": 4.0557058878417465, + "grad_norm": NaN, + "learning_rate": 7.545680843012395e-05, + "loss": 0.0, + "step": 43465 + }, + { + "epoch": 4.055799197536624, + "grad_norm": NaN, + "learning_rate": 7.545024485024459e-05, + "loss": 0.0, + "step": 43466 + }, + { + "epoch": 4.055892507231501, + "grad_norm": NaN, + "learning_rate": 7.544368145992319e-05, + "loss": 0.0, + "step": 43467 + }, + { + "epoch": 4.055985816926379, + "grad_norm": NaN, + "learning_rate": 7.543711825917641e-05, + "loss": 0.0, + "step": 43468 + }, + { + "epoch": 4.056079126621256, + "grad_norm": NaN, + "learning_rate": 7.543055524802096e-05, + "loss": 0.0, + "step": 43469 + }, + { + "epoch": 4.056172436316134, + "grad_norm": NaN, + "learning_rate": 7.54239924264735e-05, + "loss": 0.0, + "step": 43470 + }, + { + "epoch": 4.056265746011011, + "grad_norm": NaN, + "learning_rate": 7.541742979455074e-05, + "loss": 0.0, + "step": 43471 + }, + { + "epoch": 4.0563590557058875, + "grad_norm": NaN, + "learning_rate": 7.541086735226939e-05, + "loss": 0.0, + "step": 43472 + }, + { + "epoch": 4.056452365400765, + "grad_norm": NaN, + "learning_rate": 7.54043050996461e-05, + "loss": 0.0, + "step": 43473 + }, + { + "epoch": 4.056545675095642, + "grad_norm": NaN, + "learning_rate": 7.539774303669753e-05, + "loss": 0.0, + "step": 43474 + }, + { + "epoch": 4.05663898479052, + "grad_norm": NaN, + "learning_rate": 7.539118116344043e-05, + "loss": 0.0, + "step": 43475 + }, + { + "epoch": 4.056732294485397, + "grad_norm": NaN, + "learning_rate": 7.538461947989146e-05, + "loss": 0.0, + "step": 43476 + }, + { + "epoch": 4.056825604180275, + "grad_norm": NaN, + "learning_rate": 7.537805798606729e-05, + "loss": 0.0, + "step": 43477 + }, + { + "epoch": 4.056918913875152, + "grad_norm": NaN, + "learning_rate": 7.53714966819846e-05, + "loss": 0.0, + "step": 43478 + }, + { + "epoch": 4.0570122235700286, + "grad_norm": NaN, + "learning_rate": 7.536493556766012e-05, + "loss": 0.0, + "step": 43479 + }, + { + "epoch": 4.057105533264906, + "grad_norm": NaN, + "learning_rate": 7.535837464311047e-05, + "loss": 0.0, + "step": 43480 + }, + { + "epoch": 4.057198842959783, + "grad_norm": NaN, + "learning_rate": 7.535181390835238e-05, + "loss": 0.0, + "step": 43481 + }, + { + "epoch": 4.057292152654661, + "grad_norm": NaN, + "learning_rate": 7.534525336340251e-05, + "loss": 0.0, + "step": 43482 + }, + { + "epoch": 4.057385462349538, + "grad_norm": NaN, + "learning_rate": 7.533869300827754e-05, + "loss": 0.0, + "step": 43483 + }, + { + "epoch": 4.057478772044416, + "grad_norm": NaN, + "learning_rate": 7.533213284299418e-05, + "loss": 0.0, + "step": 43484 + }, + { + "epoch": 4.057572081739293, + "grad_norm": NaN, + "learning_rate": 7.532557286756907e-05, + "loss": 0.0, + "step": 43485 + }, + { + "epoch": 4.05766539143417, + "grad_norm": NaN, + "learning_rate": 7.531901308201891e-05, + "loss": 0.0, + "step": 43486 + }, + { + "epoch": 4.057758701129047, + "grad_norm": NaN, + "learning_rate": 7.531245348636037e-05, + "loss": 0.0, + "step": 43487 + }, + { + "epoch": 4.057852010823924, + "grad_norm": NaN, + "learning_rate": 7.530589408061014e-05, + "loss": 0.0, + "step": 43488 + }, + { + "epoch": 4.057945320518802, + "grad_norm": NaN, + "learning_rate": 7.52993348647849e-05, + "loss": 0.0, + "step": 43489 + }, + { + "epoch": 4.058038630213679, + "grad_norm": NaN, + "learning_rate": 7.529277583890133e-05, + "loss": 0.0, + "step": 43490 + }, + { + "epoch": 4.058131939908557, + "grad_norm": NaN, + "learning_rate": 7.528621700297609e-05, + "loss": 0.0, + "step": 43491 + }, + { + "epoch": 4.058225249603434, + "grad_norm": NaN, + "learning_rate": 7.527965835702589e-05, + "loss": 0.0, + "step": 43492 + }, + { + "epoch": 4.0583185592983115, + "grad_norm": NaN, + "learning_rate": 7.527309990106735e-05, + "loss": 0.0, + "step": 43493 + }, + { + "epoch": 4.058411868993188, + "grad_norm": NaN, + "learning_rate": 7.526654163511719e-05, + "loss": 0.0, + "step": 43494 + }, + { + "epoch": 4.0585051786880655, + "grad_norm": NaN, + "learning_rate": 7.52599835591921e-05, + "loss": 0.0, + "step": 43495 + }, + { + "epoch": 4.058598488382943, + "grad_norm": NaN, + "learning_rate": 7.525342567330871e-05, + "loss": 0.0, + "step": 43496 + }, + { + "epoch": 4.05869179807782, + "grad_norm": NaN, + "learning_rate": 7.524686797748373e-05, + "loss": 0.0, + "step": 43497 + }, + { + "epoch": 4.058785107772698, + "grad_norm": NaN, + "learning_rate": 7.524031047173382e-05, + "loss": 0.0, + "step": 43498 + }, + { + "epoch": 4.058878417467575, + "grad_norm": NaN, + "learning_rate": 7.523375315607566e-05, + "loss": 0.0, + "step": 43499 + }, + { + "epoch": 4.058971727162453, + "grad_norm": NaN, + "learning_rate": 7.52271960305259e-05, + "loss": 0.0, + "step": 43500 + }, + { + "epoch": 4.059065036857329, + "grad_norm": NaN, + "learning_rate": 7.522063909510124e-05, + "loss": 0.0, + "step": 43501 + }, + { + "epoch": 4.0591583465522065, + "grad_norm": NaN, + "learning_rate": 7.521408234981835e-05, + "loss": 0.0, + "step": 43502 + }, + { + "epoch": 4.059251656247084, + "grad_norm": NaN, + "learning_rate": 7.520752579469389e-05, + "loss": 0.0, + "step": 43503 + }, + { + "epoch": 4.059344965941961, + "grad_norm": NaN, + "learning_rate": 7.520096942974454e-05, + "loss": 0.0, + "step": 43504 + }, + { + "epoch": 4.059438275636839, + "grad_norm": NaN, + "learning_rate": 7.519441325498696e-05, + "loss": 0.0, + "step": 43505 + }, + { + "epoch": 4.059531585331716, + "grad_norm": NaN, + "learning_rate": 7.518785727043782e-05, + "loss": 0.0, + "step": 43506 + }, + { + "epoch": 4.059624895026594, + "grad_norm": NaN, + "learning_rate": 7.518130147611382e-05, + "loss": 0.0, + "step": 43507 + }, + { + "epoch": 4.05971820472147, + "grad_norm": NaN, + "learning_rate": 7.51747458720316e-05, + "loss": 0.0, + "step": 43508 + }, + { + "epoch": 4.059811514416348, + "grad_norm": NaN, + "learning_rate": 7.516819045820784e-05, + "loss": 0.0, + "step": 43509 + }, + { + "epoch": 4.059904824111225, + "grad_norm": NaN, + "learning_rate": 7.51616352346592e-05, + "loss": 0.0, + "step": 43510 + }, + { + "epoch": 4.059998133806102, + "grad_norm": NaN, + "learning_rate": 7.515508020140233e-05, + "loss": 0.0, + "step": 43511 + }, + { + "epoch": 4.06009144350098, + "grad_norm": NaN, + "learning_rate": 7.514852535845393e-05, + "loss": 0.0, + "step": 43512 + }, + { + "epoch": 4.060184753195857, + "grad_norm": NaN, + "learning_rate": 7.514197070583069e-05, + "loss": 0.0, + "step": 43513 + }, + { + "epoch": 4.060278062890735, + "grad_norm": NaN, + "learning_rate": 7.513541624354922e-05, + "loss": 0.0, + "step": 43514 + }, + { + "epoch": 4.060371372585611, + "grad_norm": NaN, + "learning_rate": 7.512886197162621e-05, + "loss": 0.0, + "step": 43515 + }, + { + "epoch": 4.060464682280489, + "grad_norm": NaN, + "learning_rate": 7.512230789007833e-05, + "loss": 0.0, + "step": 43516 + }, + { + "epoch": 4.060557991975366, + "grad_norm": NaN, + "learning_rate": 7.511575399892225e-05, + "loss": 0.0, + "step": 43517 + }, + { + "epoch": 4.0606513016702435, + "grad_norm": NaN, + "learning_rate": 7.51092002981746e-05, + "loss": 0.0, + "step": 43518 + }, + { + "epoch": 4.060744611365121, + "grad_norm": NaN, + "learning_rate": 7.510264678785209e-05, + "loss": 0.0, + "step": 43519 + }, + { + "epoch": 4.060837921059998, + "grad_norm": NaN, + "learning_rate": 7.509609346797135e-05, + "loss": 0.0, + "step": 43520 + }, + { + "epoch": 4.060931230754876, + "grad_norm": NaN, + "learning_rate": 7.508954033854906e-05, + "loss": 0.0, + "step": 43521 + }, + { + "epoch": 4.061024540449753, + "grad_norm": NaN, + "learning_rate": 7.508298739960188e-05, + "loss": 0.0, + "step": 43522 + }, + { + "epoch": 4.06111785014463, + "grad_norm": NaN, + "learning_rate": 7.507643465114648e-05, + "loss": 0.0, + "step": 43523 + }, + { + "epoch": 4.061211159839507, + "grad_norm": NaN, + "learning_rate": 7.506988209319949e-05, + "loss": 0.0, + "step": 43524 + }, + { + "epoch": 4.0613044695343845, + "grad_norm": NaN, + "learning_rate": 7.506332972577762e-05, + "loss": 0.0, + "step": 43525 + }, + { + "epoch": 4.061397779229262, + "grad_norm": NaN, + "learning_rate": 7.50567775488975e-05, + "loss": 0.0, + "step": 43526 + }, + { + "epoch": 4.061491088924139, + "grad_norm": NaN, + "learning_rate": 7.505022556257579e-05, + "loss": 0.0, + "step": 43527 + }, + { + "epoch": 4.061584398619017, + "grad_norm": NaN, + "learning_rate": 7.504367376682916e-05, + "loss": 0.0, + "step": 43528 + }, + { + "epoch": 4.061677708313894, + "grad_norm": NaN, + "learning_rate": 7.503712216167424e-05, + "loss": 0.0, + "step": 43529 + }, + { + "epoch": 4.061771018008771, + "grad_norm": NaN, + "learning_rate": 7.503057074712775e-05, + "loss": 0.0, + "step": 43530 + }, + { + "epoch": 4.061864327703648, + "grad_norm": NaN, + "learning_rate": 7.502401952320627e-05, + "loss": 0.0, + "step": 43531 + }, + { + "epoch": 4.0619576373985256, + "grad_norm": NaN, + "learning_rate": 7.501746848992653e-05, + "loss": 0.0, + "step": 43532 + }, + { + "epoch": 4.062050947093403, + "grad_norm": NaN, + "learning_rate": 7.501091764730514e-05, + "loss": 0.0, + "step": 43533 + }, + { + "epoch": 4.06214425678828, + "grad_norm": NaN, + "learning_rate": 7.500436699535879e-05, + "loss": 0.0, + "step": 43534 + }, + { + "epoch": 4.062237566483158, + "grad_norm": NaN, + "learning_rate": 7.49978165341041e-05, + "loss": 0.0, + "step": 43535 + }, + { + "epoch": 4.062330876178035, + "grad_norm": NaN, + "learning_rate": 7.499126626355775e-05, + "loss": 0.0, + "step": 43536 + }, + { + "epoch": 4.062424185872912, + "grad_norm": NaN, + "learning_rate": 7.49847161837364e-05, + "loss": 0.0, + "step": 43537 + }, + { + "epoch": 4.062517495567789, + "grad_norm": NaN, + "learning_rate": 7.497816629465669e-05, + "loss": 0.0, + "step": 43538 + }, + { + "epoch": 4.062610805262667, + "grad_norm": NaN, + "learning_rate": 7.497161659633528e-05, + "loss": 0.0, + "step": 43539 + }, + { + "epoch": 4.062704114957544, + "grad_norm": NaN, + "learning_rate": 7.496506708878883e-05, + "loss": 0.0, + "step": 43540 + }, + { + "epoch": 4.062797424652421, + "grad_norm": NaN, + "learning_rate": 7.495851777203398e-05, + "loss": 0.0, + "step": 43541 + }, + { + "epoch": 4.062890734347299, + "grad_norm": NaN, + "learning_rate": 7.495196864608738e-05, + "loss": 0.0, + "step": 43542 + }, + { + "epoch": 4.062984044042176, + "grad_norm": NaN, + "learning_rate": 7.494541971096571e-05, + "loss": 0.0, + "step": 43543 + }, + { + "epoch": 4.063077353737054, + "grad_norm": NaN, + "learning_rate": 7.49388709666856e-05, + "loss": 0.0, + "step": 43544 + }, + { + "epoch": 4.06317066343193, + "grad_norm": NaN, + "learning_rate": 7.49323224132637e-05, + "loss": 0.0, + "step": 43545 + }, + { + "epoch": 4.063263973126808, + "grad_norm": NaN, + "learning_rate": 7.492577405071668e-05, + "loss": 0.0, + "step": 43546 + }, + { + "epoch": 4.063357282821685, + "grad_norm": NaN, + "learning_rate": 7.491922587906116e-05, + "loss": 0.0, + "step": 43547 + }, + { + "epoch": 4.0634505925165625, + "grad_norm": NaN, + "learning_rate": 7.491267789831384e-05, + "loss": 0.0, + "step": 43548 + }, + { + "epoch": 4.06354390221144, + "grad_norm": NaN, + "learning_rate": 7.490613010849131e-05, + "loss": 0.0, + "step": 43549 + }, + { + "epoch": 4.063637211906317, + "grad_norm": NaN, + "learning_rate": 7.489958250961025e-05, + "loss": 0.0, + "step": 43550 + }, + { + "epoch": 4.063730521601195, + "grad_norm": NaN, + "learning_rate": 7.489303510168732e-05, + "loss": 0.0, + "step": 43551 + }, + { + "epoch": 4.063823831296071, + "grad_norm": NaN, + "learning_rate": 7.488648788473912e-05, + "loss": 0.0, + "step": 43552 + }, + { + "epoch": 4.063917140990949, + "grad_norm": NaN, + "learning_rate": 7.487994085878236e-05, + "loss": 0.0, + "step": 43553 + }, + { + "epoch": 4.064010450685826, + "grad_norm": NaN, + "learning_rate": 7.487339402383366e-05, + "loss": 0.0, + "step": 43554 + }, + { + "epoch": 4.0641037603807035, + "grad_norm": NaN, + "learning_rate": 7.486684737990966e-05, + "loss": 0.0, + "step": 43555 + }, + { + "epoch": 4.064197070075581, + "grad_norm": NaN, + "learning_rate": 7.486030092702699e-05, + "loss": 0.0, + "step": 43556 + }, + { + "epoch": 4.064290379770458, + "grad_norm": NaN, + "learning_rate": 7.485375466520234e-05, + "loss": 0.0, + "step": 43557 + }, + { + "epoch": 4.064383689465336, + "grad_norm": NaN, + "learning_rate": 7.484720859445234e-05, + "loss": 0.0, + "step": 43558 + }, + { + "epoch": 4.064476999160212, + "grad_norm": NaN, + "learning_rate": 7.48406627147936e-05, + "loss": 0.0, + "step": 43559 + }, + { + "epoch": 4.06457030885509, + "grad_norm": NaN, + "learning_rate": 7.48341170262428e-05, + "loss": 0.0, + "step": 43560 + }, + { + "epoch": 4.064663618549967, + "grad_norm": NaN, + "learning_rate": 7.482757152881658e-05, + "loss": 0.0, + "step": 43561 + }, + { + "epoch": 4.064756928244845, + "grad_norm": NaN, + "learning_rate": 7.482102622253159e-05, + "loss": 0.0, + "step": 43562 + }, + { + "epoch": 4.064850237939722, + "grad_norm": NaN, + "learning_rate": 7.481448110740444e-05, + "loss": 0.0, + "step": 43563 + }, + { + "epoch": 4.064943547634599, + "grad_norm": NaN, + "learning_rate": 7.480793618345178e-05, + "loss": 0.0, + "step": 43564 + }, + { + "epoch": 4.065036857329477, + "grad_norm": NaN, + "learning_rate": 7.480139145069031e-05, + "loss": 0.0, + "step": 43565 + }, + { + "epoch": 4.065130167024354, + "grad_norm": NaN, + "learning_rate": 7.479484690913659e-05, + "loss": 0.0, + "step": 43566 + }, + { + "epoch": 4.065223476719231, + "grad_norm": NaN, + "learning_rate": 7.478830255880732e-05, + "loss": 0.0, + "step": 43567 + }, + { + "epoch": 4.065316786414108, + "grad_norm": NaN, + "learning_rate": 7.478175839971908e-05, + "loss": 0.0, + "step": 43568 + }, + { + "epoch": 4.065410096108986, + "grad_norm": NaN, + "learning_rate": 7.477521443188857e-05, + "loss": 0.0, + "step": 43569 + }, + { + "epoch": 4.065503405803863, + "grad_norm": NaN, + "learning_rate": 7.476867065533243e-05, + "loss": 0.0, + "step": 43570 + }, + { + "epoch": 4.0655967154987405, + "grad_norm": NaN, + "learning_rate": 7.476212707006724e-05, + "loss": 0.0, + "step": 43571 + }, + { + "epoch": 4.065690025193618, + "grad_norm": NaN, + "learning_rate": 7.475558367610968e-05, + "loss": 0.0, + "step": 43572 + }, + { + "epoch": 4.065783334888495, + "grad_norm": NaN, + "learning_rate": 7.47490404734764e-05, + "loss": 0.0, + "step": 43573 + }, + { + "epoch": 4.065876644583372, + "grad_norm": NaN, + "learning_rate": 7.474249746218399e-05, + "loss": 0.0, + "step": 43574 + }, + { + "epoch": 4.065969954278249, + "grad_norm": NaN, + "learning_rate": 7.473595464224914e-05, + "loss": 0.0, + "step": 43575 + }, + { + "epoch": 4.066063263973127, + "grad_norm": NaN, + "learning_rate": 7.472941201368846e-05, + "loss": 0.0, + "step": 43576 + }, + { + "epoch": 4.066156573668004, + "grad_norm": NaN, + "learning_rate": 7.47228695765186e-05, + "loss": 0.0, + "step": 43577 + }, + { + "epoch": 4.0662498833628815, + "grad_norm": NaN, + "learning_rate": 7.471632733075616e-05, + "loss": 0.0, + "step": 43578 + }, + { + "epoch": 4.066343193057759, + "grad_norm": NaN, + "learning_rate": 7.47097852764178e-05, + "loss": 0.0, + "step": 43579 + }, + { + "epoch": 4.066436502752636, + "grad_norm": NaN, + "learning_rate": 7.470324341352017e-05, + "loss": 0.0, + "step": 43580 + }, + { + "epoch": 4.066529812447513, + "grad_norm": NaN, + "learning_rate": 7.469670174207988e-05, + "loss": 0.0, + "step": 43581 + }, + { + "epoch": 4.06662312214239, + "grad_norm": NaN, + "learning_rate": 7.469016026211358e-05, + "loss": 0.0, + "step": 43582 + }, + { + "epoch": 4.066716431837268, + "grad_norm": NaN, + "learning_rate": 7.468361897363788e-05, + "loss": 0.0, + "step": 43583 + }, + { + "epoch": 4.066809741532145, + "grad_norm": NaN, + "learning_rate": 7.467707787666942e-05, + "loss": 0.0, + "step": 43584 + }, + { + "epoch": 4.066903051227023, + "grad_norm": NaN, + "learning_rate": 7.467053697122485e-05, + "loss": 0.0, + "step": 43585 + }, + { + "epoch": 4.0669963609219, + "grad_norm": NaN, + "learning_rate": 7.46639962573208e-05, + "loss": 0.0, + "step": 43586 + }, + { + "epoch": 4.067089670616777, + "grad_norm": NaN, + "learning_rate": 7.465745573497387e-05, + "loss": 0.0, + "step": 43587 + }, + { + "epoch": 4.067182980311655, + "grad_norm": NaN, + "learning_rate": 7.465091540420072e-05, + "loss": 0.0, + "step": 43588 + }, + { + "epoch": 4.067276290006531, + "grad_norm": NaN, + "learning_rate": 7.464437526501797e-05, + "loss": 0.0, + "step": 43589 + }, + { + "epoch": 4.067369599701409, + "grad_norm": NaN, + "learning_rate": 7.463783531744226e-05, + "loss": 0.0, + "step": 43590 + }, + { + "epoch": 4.067462909396286, + "grad_norm": NaN, + "learning_rate": 7.463129556149021e-05, + "loss": 0.0, + "step": 43591 + }, + { + "epoch": 4.067556219091164, + "grad_norm": NaN, + "learning_rate": 7.462475599717844e-05, + "loss": 0.0, + "step": 43592 + }, + { + "epoch": 4.067649528786041, + "grad_norm": NaN, + "learning_rate": 7.461821662452359e-05, + "loss": 0.0, + "step": 43593 + }, + { + "epoch": 4.067742838480918, + "grad_norm": NaN, + "learning_rate": 7.461167744354231e-05, + "loss": 0.0, + "step": 43594 + }, + { + "epoch": 4.067836148175796, + "grad_norm": NaN, + "learning_rate": 7.460513845425116e-05, + "loss": 0.0, + "step": 43595 + }, + { + "epoch": 4.067929457870672, + "grad_norm": NaN, + "learning_rate": 7.459859965666684e-05, + "loss": 0.0, + "step": 43596 + }, + { + "epoch": 4.06802276756555, + "grad_norm": NaN, + "learning_rate": 7.459206105080595e-05, + "loss": 0.0, + "step": 43597 + }, + { + "epoch": 4.068116077260427, + "grad_norm": NaN, + "learning_rate": 7.458552263668509e-05, + "loss": 0.0, + "step": 43598 + }, + { + "epoch": 4.068209386955305, + "grad_norm": NaN, + "learning_rate": 7.457898441432092e-05, + "loss": 0.0, + "step": 43599 + }, + { + "epoch": 4.068302696650182, + "grad_norm": NaN, + "learning_rate": 7.457244638373003e-05, + "loss": 0.0, + "step": 43600 + }, + { + "epoch": 4.0683960063450595, + "grad_norm": NaN, + "learning_rate": 7.45659085449291e-05, + "loss": 0.0, + "step": 43601 + }, + { + "epoch": 4.068489316039937, + "grad_norm": NaN, + "learning_rate": 7.45593708979347e-05, + "loss": 0.0, + "step": 43602 + }, + { + "epoch": 4.068582625734813, + "grad_norm": NaN, + "learning_rate": 7.455283344276345e-05, + "loss": 0.0, + "step": 43603 + }, + { + "epoch": 4.068675935429691, + "grad_norm": NaN, + "learning_rate": 7.454629617943203e-05, + "loss": 0.0, + "step": 43604 + }, + { + "epoch": 4.068769245124568, + "grad_norm": NaN, + "learning_rate": 7.453975910795702e-05, + "loss": 0.0, + "step": 43605 + }, + { + "epoch": 4.068862554819446, + "grad_norm": NaN, + "learning_rate": 7.453322222835505e-05, + "loss": 0.0, + "step": 43606 + }, + { + "epoch": 4.068955864514323, + "grad_norm": NaN, + "learning_rate": 7.452668554064274e-05, + "loss": 0.0, + "step": 43607 + }, + { + "epoch": 4.0690491742092005, + "grad_norm": NaN, + "learning_rate": 7.45201490448367e-05, + "loss": 0.0, + "step": 43608 + }, + { + "epoch": 4.069142483904078, + "grad_norm": NaN, + "learning_rate": 7.451361274095358e-05, + "loss": 0.0, + "step": 43609 + }, + { + "epoch": 4.0692357935989545, + "grad_norm": NaN, + "learning_rate": 7.450707662900996e-05, + "loss": 0.0, + "step": 43610 + }, + { + "epoch": 4.069329103293832, + "grad_norm": NaN, + "learning_rate": 7.45005407090225e-05, + "loss": 0.0, + "step": 43611 + }, + { + "epoch": 4.069422412988709, + "grad_norm": NaN, + "learning_rate": 7.44940049810078e-05, + "loss": 0.0, + "step": 43612 + }, + { + "epoch": 4.069515722683587, + "grad_norm": NaN, + "learning_rate": 7.448746944498249e-05, + "loss": 0.0, + "step": 43613 + }, + { + "epoch": 4.069609032378464, + "grad_norm": NaN, + "learning_rate": 7.448093410096314e-05, + "loss": 0.0, + "step": 43614 + }, + { + "epoch": 4.069702342073342, + "grad_norm": NaN, + "learning_rate": 7.447439894896644e-05, + "loss": 0.0, + "step": 43615 + }, + { + "epoch": 4.069795651768219, + "grad_norm": NaN, + "learning_rate": 7.446786398900895e-05, + "loss": 0.0, + "step": 43616 + }, + { + "epoch": 4.069888961463096, + "grad_norm": NaN, + "learning_rate": 7.446132922110732e-05, + "loss": 0.0, + "step": 43617 + }, + { + "epoch": 4.069982271157973, + "grad_norm": NaN, + "learning_rate": 7.445479464527814e-05, + "loss": 0.0, + "step": 43618 + }, + { + "epoch": 4.07007558085285, + "grad_norm": NaN, + "learning_rate": 7.444826026153805e-05, + "loss": 0.0, + "step": 43619 + }, + { + "epoch": 4.070168890547728, + "grad_norm": NaN, + "learning_rate": 7.444172606990365e-05, + "loss": 0.0, + "step": 43620 + }, + { + "epoch": 4.070262200242605, + "grad_norm": NaN, + "learning_rate": 7.443519207039156e-05, + "loss": 0.0, + "step": 43621 + }, + { + "epoch": 4.070355509937483, + "grad_norm": NaN, + "learning_rate": 7.44286582630184e-05, + "loss": 0.0, + "step": 43622 + }, + { + "epoch": 4.07044881963236, + "grad_norm": NaN, + "learning_rate": 7.442212464780075e-05, + "loss": 0.0, + "step": 43623 + }, + { + "epoch": 4.0705421293272375, + "grad_norm": NaN, + "learning_rate": 7.441559122475528e-05, + "loss": 0.0, + "step": 43624 + }, + { + "epoch": 4.070635439022114, + "grad_norm": NaN, + "learning_rate": 7.440905799389855e-05, + "loss": 0.0, + "step": 43625 + }, + { + "epoch": 4.070728748716991, + "grad_norm": NaN, + "learning_rate": 7.440252495524721e-05, + "loss": 0.0, + "step": 43626 + }, + { + "epoch": 4.070822058411869, + "grad_norm": NaN, + "learning_rate": 7.439599210881784e-05, + "loss": 0.0, + "step": 43627 + }, + { + "epoch": 4.070915368106746, + "grad_norm": NaN, + "learning_rate": 7.438945945462708e-05, + "loss": 0.0, + "step": 43628 + }, + { + "epoch": 4.071008677801624, + "grad_norm": NaN, + "learning_rate": 7.438292699269152e-05, + "loss": 0.0, + "step": 43629 + }, + { + "epoch": 4.071101987496501, + "grad_norm": NaN, + "learning_rate": 7.437639472302778e-05, + "loss": 0.0, + "step": 43630 + }, + { + "epoch": 4.0711952971913785, + "grad_norm": NaN, + "learning_rate": 7.436986264565247e-05, + "loss": 0.0, + "step": 43631 + }, + { + "epoch": 4.071288606886255, + "grad_norm": NaN, + "learning_rate": 7.436333076058219e-05, + "loss": 0.0, + "step": 43632 + }, + { + "epoch": 4.0713819165811325, + "grad_norm": NaN, + "learning_rate": 7.435679906783355e-05, + "loss": 0.0, + "step": 43633 + }, + { + "epoch": 4.07147522627601, + "grad_norm": NaN, + "learning_rate": 7.435026756742318e-05, + "loss": 0.0, + "step": 43634 + }, + { + "epoch": 4.071568535970887, + "grad_norm": NaN, + "learning_rate": 7.434373625936766e-05, + "loss": 0.0, + "step": 43635 + }, + { + "epoch": 4.071661845665765, + "grad_norm": NaN, + "learning_rate": 7.43372051436836e-05, + "loss": 0.0, + "step": 43636 + }, + { + "epoch": 4.071755155360642, + "grad_norm": NaN, + "learning_rate": 7.433067422038762e-05, + "loss": 0.0, + "step": 43637 + }, + { + "epoch": 4.07184846505552, + "grad_norm": NaN, + "learning_rate": 7.432414348949633e-05, + "loss": 0.0, + "step": 43638 + }, + { + "epoch": 4.071941774750397, + "grad_norm": NaN, + "learning_rate": 7.431761295102632e-05, + "loss": 0.0, + "step": 43639 + }, + { + "epoch": 4.0720350844452735, + "grad_norm": NaN, + "learning_rate": 7.431108260499419e-05, + "loss": 0.0, + "step": 43640 + }, + { + "epoch": 4.072128394140151, + "grad_norm": NaN, + "learning_rate": 7.430455245141658e-05, + "loss": 0.0, + "step": 43641 + }, + { + "epoch": 4.072221703835028, + "grad_norm": NaN, + "learning_rate": 7.429802249031006e-05, + "loss": 0.0, + "step": 43642 + }, + { + "epoch": 4.072315013529906, + "grad_norm": NaN, + "learning_rate": 7.429149272169123e-05, + "loss": 0.0, + "step": 43643 + }, + { + "epoch": 4.072408323224783, + "grad_norm": NaN, + "learning_rate": 7.428496314557673e-05, + "loss": 0.0, + "step": 43644 + }, + { + "epoch": 4.072501632919661, + "grad_norm": NaN, + "learning_rate": 7.427843376198313e-05, + "loss": 0.0, + "step": 43645 + }, + { + "epoch": 4.072594942614538, + "grad_norm": NaN, + "learning_rate": 7.427190457092703e-05, + "loss": 0.0, + "step": 43646 + }, + { + "epoch": 4.072688252309415, + "grad_norm": NaN, + "learning_rate": 7.426537557242506e-05, + "loss": 0.0, + "step": 43647 + }, + { + "epoch": 4.072781562004292, + "grad_norm": NaN, + "learning_rate": 7.42588467664938e-05, + "loss": 0.0, + "step": 43648 + }, + { + "epoch": 4.072874871699169, + "grad_norm": NaN, + "learning_rate": 7.425231815314984e-05, + "loss": 0.0, + "step": 43649 + }, + { + "epoch": 4.072968181394047, + "grad_norm": NaN, + "learning_rate": 7.424578973240983e-05, + "loss": 0.0, + "step": 43650 + }, + { + "epoch": 4.073061491088924, + "grad_norm": NaN, + "learning_rate": 7.423926150429033e-05, + "loss": 0.0, + "step": 43651 + }, + { + "epoch": 4.073154800783802, + "grad_norm": NaN, + "learning_rate": 7.423273346880794e-05, + "loss": 0.0, + "step": 43652 + }, + { + "epoch": 4.073248110478679, + "grad_norm": NaN, + "learning_rate": 7.422620562597925e-05, + "loss": 0.0, + "step": 43653 + }, + { + "epoch": 4.073341420173556, + "grad_norm": NaN, + "learning_rate": 7.421967797582089e-05, + "loss": 0.0, + "step": 43654 + }, + { + "epoch": 4.073434729868433, + "grad_norm": NaN, + "learning_rate": 7.421315051834943e-05, + "loss": 0.0, + "step": 43655 + }, + { + "epoch": 4.07352803956331, + "grad_norm": NaN, + "learning_rate": 7.420662325358148e-05, + "loss": 0.0, + "step": 43656 + }, + { + "epoch": 4.073621349258188, + "grad_norm": NaN, + "learning_rate": 7.420009618153366e-05, + "loss": 0.0, + "step": 43657 + }, + { + "epoch": 4.073714658953065, + "grad_norm": NaN, + "learning_rate": 7.419356930222251e-05, + "loss": 0.0, + "step": 43658 + }, + { + "epoch": 4.073807968647943, + "grad_norm": NaN, + "learning_rate": 7.418704261566467e-05, + "loss": 0.0, + "step": 43659 + }, + { + "epoch": 4.07390127834282, + "grad_norm": NaN, + "learning_rate": 7.418051612187672e-05, + "loss": 0.0, + "step": 43660 + }, + { + "epoch": 4.0739945880376975, + "grad_norm": NaN, + "learning_rate": 7.417398982087527e-05, + "loss": 0.0, + "step": 43661 + }, + { + "epoch": 4.074087897732574, + "grad_norm": NaN, + "learning_rate": 7.416746371267688e-05, + "loss": 0.0, + "step": 43662 + }, + { + "epoch": 4.0741812074274515, + "grad_norm": NaN, + "learning_rate": 7.416093779729819e-05, + "loss": 0.0, + "step": 43663 + }, + { + "epoch": 4.074274517122329, + "grad_norm": NaN, + "learning_rate": 7.415441207475577e-05, + "loss": 0.0, + "step": 43664 + }, + { + "epoch": 4.074367826817206, + "grad_norm": NaN, + "learning_rate": 7.41478865450662e-05, + "loss": 0.0, + "step": 43665 + }, + { + "epoch": 4.074461136512084, + "grad_norm": NaN, + "learning_rate": 7.414136120824609e-05, + "loss": 0.0, + "step": 43666 + }, + { + "epoch": 4.074554446206961, + "grad_norm": NaN, + "learning_rate": 7.413483606431201e-05, + "loss": 0.0, + "step": 43667 + }, + { + "epoch": 4.074647755901839, + "grad_norm": NaN, + "learning_rate": 7.41283111132806e-05, + "loss": 0.0, + "step": 43668 + }, + { + "epoch": 4.074741065596715, + "grad_norm": NaN, + "learning_rate": 7.412178635516839e-05, + "loss": 0.0, + "step": 43669 + }, + { + "epoch": 4.0748343752915925, + "grad_norm": NaN, + "learning_rate": 7.411526178999198e-05, + "loss": 0.0, + "step": 43670 + }, + { + "epoch": 4.07492768498647, + "grad_norm": NaN, + "learning_rate": 7.410873741776806e-05, + "loss": 0.0, + "step": 43671 + }, + { + "epoch": 4.075020994681347, + "grad_norm": NaN, + "learning_rate": 7.41022132385131e-05, + "loss": 0.0, + "step": 43672 + }, + { + "epoch": 4.075114304376225, + "grad_norm": NaN, + "learning_rate": 7.409568925224367e-05, + "loss": 0.0, + "step": 43673 + }, + { + "epoch": 4.075207614071102, + "grad_norm": NaN, + "learning_rate": 7.408916545897652e-05, + "loss": 0.0, + "step": 43674 + }, + { + "epoch": 4.07530092376598, + "grad_norm": NaN, + "learning_rate": 7.408264185872809e-05, + "loss": 0.0, + "step": 43675 + }, + { + "epoch": 4.075394233460856, + "grad_norm": NaN, + "learning_rate": 7.407611845151497e-05, + "loss": 0.0, + "step": 43676 + }, + { + "epoch": 4.075487543155734, + "grad_norm": NaN, + "learning_rate": 7.406959523735388e-05, + "loss": 0.0, + "step": 43677 + }, + { + "epoch": 4.075580852850611, + "grad_norm": NaN, + "learning_rate": 7.406307221626128e-05, + "loss": 0.0, + "step": 43678 + }, + { + "epoch": 4.075674162545488, + "grad_norm": NaN, + "learning_rate": 7.405654938825377e-05, + "loss": 0.0, + "step": 43679 + }, + { + "epoch": 4.075767472240366, + "grad_norm": NaN, + "learning_rate": 7.405002675334799e-05, + "loss": 0.0, + "step": 43680 + }, + { + "epoch": 4.075860781935243, + "grad_norm": NaN, + "learning_rate": 7.404350431156047e-05, + "loss": 0.0, + "step": 43681 + }, + { + "epoch": 4.075954091630121, + "grad_norm": NaN, + "learning_rate": 7.403698206290782e-05, + "loss": 0.0, + "step": 43682 + }, + { + "epoch": 4.076047401324998, + "grad_norm": NaN, + "learning_rate": 7.403046000740663e-05, + "loss": 0.0, + "step": 43683 + }, + { + "epoch": 4.076140711019875, + "grad_norm": NaN, + "learning_rate": 7.402393814507346e-05, + "loss": 0.0, + "step": 43684 + }, + { + "epoch": 4.076234020714752, + "grad_norm": NaN, + "learning_rate": 7.401741647592493e-05, + "loss": 0.0, + "step": 43685 + }, + { + "epoch": 4.0763273304096295, + "grad_norm": NaN, + "learning_rate": 7.40108949999776e-05, + "loss": 0.0, + "step": 43686 + }, + { + "epoch": 4.076420640104507, + "grad_norm": NaN, + "learning_rate": 7.400437371724804e-05, + "loss": 0.0, + "step": 43687 + }, + { + "epoch": 4.076513949799384, + "grad_norm": NaN, + "learning_rate": 7.399785262775287e-05, + "loss": 0.0, + "step": 43688 + }, + { + "epoch": 4.076607259494262, + "grad_norm": NaN, + "learning_rate": 7.399133173150863e-05, + "loss": 0.0, + "step": 43689 + }, + { + "epoch": 4.076700569189139, + "grad_norm": NaN, + "learning_rate": 7.398481102853192e-05, + "loss": 0.0, + "step": 43690 + }, + { + "epoch": 4.076793878884016, + "grad_norm": NaN, + "learning_rate": 7.397829051883931e-05, + "loss": 0.0, + "step": 43691 + }, + { + "epoch": 4.076887188578893, + "grad_norm": NaN, + "learning_rate": 7.39717702024474e-05, + "loss": 0.0, + "step": 43692 + }, + { + "epoch": 4.0769804982737705, + "grad_norm": NaN, + "learning_rate": 7.396525007937277e-05, + "loss": 0.0, + "step": 43693 + }, + { + "epoch": 4.077073807968648, + "grad_norm": NaN, + "learning_rate": 7.395873014963197e-05, + "loss": 0.0, + "step": 43694 + }, + { + "epoch": 4.077167117663525, + "grad_norm": NaN, + "learning_rate": 7.395221041324159e-05, + "loss": 0.0, + "step": 43695 + }, + { + "epoch": 4.077260427358403, + "grad_norm": NaN, + "learning_rate": 7.394569087021821e-05, + "loss": 0.0, + "step": 43696 + }, + { + "epoch": 4.07735373705328, + "grad_norm": NaN, + "learning_rate": 7.39391715205784e-05, + "loss": 0.0, + "step": 43697 + }, + { + "epoch": 4.077447046748157, + "grad_norm": NaN, + "learning_rate": 7.393265236433877e-05, + "loss": 0.0, + "step": 43698 + }, + { + "epoch": 4.077540356443034, + "grad_norm": NaN, + "learning_rate": 7.392613340151586e-05, + "loss": 0.0, + "step": 43699 + }, + { + "epoch": 4.077633666137912, + "grad_norm": NaN, + "learning_rate": 7.391961463212626e-05, + "loss": 0.0, + "step": 43700 + }, + { + "epoch": 4.077726975832789, + "grad_norm": NaN, + "learning_rate": 7.391309605618656e-05, + "loss": 0.0, + "step": 43701 + }, + { + "epoch": 4.077820285527666, + "grad_norm": NaN, + "learning_rate": 7.39065776737133e-05, + "loss": 0.0, + "step": 43702 + }, + { + "epoch": 4.077913595222544, + "grad_norm": NaN, + "learning_rate": 7.390005948472308e-05, + "loss": 0.0, + "step": 43703 + }, + { + "epoch": 4.078006904917421, + "grad_norm": NaN, + "learning_rate": 7.389354148923246e-05, + "loss": 0.0, + "step": 43704 + }, + { + "epoch": 4.078100214612299, + "grad_norm": NaN, + "learning_rate": 7.388702368725801e-05, + "loss": 0.0, + "step": 43705 + }, + { + "epoch": 4.078193524307175, + "grad_norm": NaN, + "learning_rate": 7.388050607881634e-05, + "loss": 0.0, + "step": 43706 + }, + { + "epoch": 4.078286834002053, + "grad_norm": NaN, + "learning_rate": 7.387398866392397e-05, + "loss": 0.0, + "step": 43707 + }, + { + "epoch": 4.07838014369693, + "grad_norm": NaN, + "learning_rate": 7.386747144259745e-05, + "loss": 0.0, + "step": 43708 + }, + { + "epoch": 4.0784734533918074, + "grad_norm": NaN, + "learning_rate": 7.38609544148535e-05, + "loss": 0.0, + "step": 43709 + }, + { + "epoch": 4.078566763086685, + "grad_norm": NaN, + "learning_rate": 7.385443758070855e-05, + "loss": 0.0, + "step": 43710 + }, + { + "epoch": 4.078660072781562, + "grad_norm": NaN, + "learning_rate": 7.384792094017915e-05, + "loss": 0.0, + "step": 43711 + }, + { + "epoch": 4.07875338247644, + "grad_norm": NaN, + "learning_rate": 7.384140449328204e-05, + "loss": 0.0, + "step": 43712 + }, + { + "epoch": 4.078846692171316, + "grad_norm": NaN, + "learning_rate": 7.38348882400336e-05, + "loss": 0.0, + "step": 43713 + }, + { + "epoch": 4.078940001866194, + "grad_norm": NaN, + "learning_rate": 7.382837218045045e-05, + "loss": 0.0, + "step": 43714 + }, + { + "epoch": 4.079033311561071, + "grad_norm": NaN, + "learning_rate": 7.38218563145493e-05, + "loss": 0.0, + "step": 43715 + }, + { + "epoch": 4.0791266212559485, + "grad_norm": NaN, + "learning_rate": 7.381534064234652e-05, + "loss": 0.0, + "step": 43716 + }, + { + "epoch": 4.079219930950826, + "grad_norm": NaN, + "learning_rate": 7.380882516385872e-05, + "loss": 0.0, + "step": 43717 + }, + { + "epoch": 4.079313240645703, + "grad_norm": NaN, + "learning_rate": 7.380230987910264e-05, + "loss": 0.0, + "step": 43718 + }, + { + "epoch": 4.079406550340581, + "grad_norm": NaN, + "learning_rate": 7.379579478809463e-05, + "loss": 0.0, + "step": 43719 + }, + { + "epoch": 4.079499860035457, + "grad_norm": NaN, + "learning_rate": 7.37892798908513e-05, + "loss": 0.0, + "step": 43720 + }, + { + "epoch": 4.079593169730335, + "grad_norm": NaN, + "learning_rate": 7.378276518738936e-05, + "loss": 0.0, + "step": 43721 + }, + { + "epoch": 4.079686479425212, + "grad_norm": NaN, + "learning_rate": 7.377625067772521e-05, + "loss": 0.0, + "step": 43722 + }, + { + "epoch": 4.0797797891200895, + "grad_norm": NaN, + "learning_rate": 7.376973636187548e-05, + "loss": 0.0, + "step": 43723 + }, + { + "epoch": 4.079873098814967, + "grad_norm": NaN, + "learning_rate": 7.376322223985673e-05, + "loss": 0.0, + "step": 43724 + }, + { + "epoch": 4.079966408509844, + "grad_norm": NaN, + "learning_rate": 7.375670831168553e-05, + "loss": 0.0, + "step": 43725 + }, + { + "epoch": 4.080059718204722, + "grad_norm": NaN, + "learning_rate": 7.375019457737844e-05, + "loss": 0.0, + "step": 43726 + }, + { + "epoch": 4.080153027899598, + "grad_norm": NaN, + "learning_rate": 7.3743681036952e-05, + "loss": 0.0, + "step": 43727 + }, + { + "epoch": 4.080246337594476, + "grad_norm": NaN, + "learning_rate": 7.373716769042282e-05, + "loss": 0.0, + "step": 43728 + }, + { + "epoch": 4.080339647289353, + "grad_norm": NaN, + "learning_rate": 7.373065453780739e-05, + "loss": 0.0, + "step": 43729 + }, + { + "epoch": 4.080432956984231, + "grad_norm": NaN, + "learning_rate": 7.372414157912234e-05, + "loss": 0.0, + "step": 43730 + }, + { + "epoch": 4.080526266679108, + "grad_norm": NaN, + "learning_rate": 7.371762881438419e-05, + "loss": 0.0, + "step": 43731 + }, + { + "epoch": 4.080619576373985, + "grad_norm": NaN, + "learning_rate": 7.371111624360953e-05, + "loss": 0.0, + "step": 43732 + }, + { + "epoch": 4.080712886068863, + "grad_norm": NaN, + "learning_rate": 7.370460386681488e-05, + "loss": 0.0, + "step": 43733 + }, + { + "epoch": 4.08080619576374, + "grad_norm": NaN, + "learning_rate": 7.369809168401685e-05, + "loss": 0.0, + "step": 43734 + }, + { + "epoch": 4.080899505458617, + "grad_norm": NaN, + "learning_rate": 7.369157969523194e-05, + "loss": 0.0, + "step": 43735 + }, + { + "epoch": 4.080992815153494, + "grad_norm": NaN, + "learning_rate": 7.368506790047677e-05, + "loss": 0.0, + "step": 43736 + }, + { + "epoch": 4.081086124848372, + "grad_norm": NaN, + "learning_rate": 7.367855629976784e-05, + "loss": 0.0, + "step": 43737 + }, + { + "epoch": 4.081179434543249, + "grad_norm": NaN, + "learning_rate": 7.367204489312175e-05, + "loss": 0.0, + "step": 43738 + }, + { + "epoch": 4.0812727442381265, + "grad_norm": NaN, + "learning_rate": 7.366553368055504e-05, + "loss": 0.0, + "step": 43739 + }, + { + "epoch": 4.081366053933004, + "grad_norm": NaN, + "learning_rate": 7.365902266208427e-05, + "loss": 0.0, + "step": 43740 + }, + { + "epoch": 4.081459363627881, + "grad_norm": NaN, + "learning_rate": 7.365251183772598e-05, + "loss": 0.0, + "step": 43741 + }, + { + "epoch": 4.081552673322758, + "grad_norm": NaN, + "learning_rate": 7.364600120749671e-05, + "loss": 0.0, + "step": 43742 + }, + { + "epoch": 4.081645983017635, + "grad_norm": NaN, + "learning_rate": 7.363949077141308e-05, + "loss": 0.0, + "step": 43743 + }, + { + "epoch": 4.081739292712513, + "grad_norm": NaN, + "learning_rate": 7.363298052949161e-05, + "loss": 0.0, + "step": 43744 + }, + { + "epoch": 4.08183260240739, + "grad_norm": NaN, + "learning_rate": 7.362647048174883e-05, + "loss": 0.0, + "step": 43745 + }, + { + "epoch": 4.0819259121022675, + "grad_norm": NaN, + "learning_rate": 7.361996062820125e-05, + "loss": 0.0, + "step": 43746 + }, + { + "epoch": 4.082019221797145, + "grad_norm": NaN, + "learning_rate": 7.361345096886562e-05, + "loss": 0.0, + "step": 43747 + }, + { + "epoch": 4.082112531492022, + "grad_norm": NaN, + "learning_rate": 7.360694150375827e-05, + "loss": 0.0, + "step": 43748 + }, + { + "epoch": 4.082205841186899, + "grad_norm": NaN, + "learning_rate": 7.360043223289581e-05, + "loss": 0.0, + "step": 43749 + }, + { + "epoch": 4.082299150881776, + "grad_norm": NaN, + "learning_rate": 7.359392315629492e-05, + "loss": 0.0, + "step": 43750 + }, + { + "epoch": 4.082392460576654, + "grad_norm": NaN, + "learning_rate": 7.358741427397199e-05, + "loss": 0.0, + "step": 43751 + }, + { + "epoch": 4.082485770271531, + "grad_norm": NaN, + "learning_rate": 7.358090558594358e-05, + "loss": 0.0, + "step": 43752 + }, + { + "epoch": 4.082579079966409, + "grad_norm": NaN, + "learning_rate": 7.357439709222641e-05, + "loss": 0.0, + "step": 43753 + }, + { + "epoch": 4.082672389661286, + "grad_norm": NaN, + "learning_rate": 7.356788879283685e-05, + "loss": 0.0, + "step": 43754 + }, + { + "epoch": 4.082765699356163, + "grad_norm": NaN, + "learning_rate": 7.356138068779143e-05, + "loss": 0.0, + "step": 43755 + }, + { + "epoch": 4.082859009051041, + "grad_norm": NaN, + "learning_rate": 7.355487277710691e-05, + "loss": 0.0, + "step": 43756 + }, + { + "epoch": 4.082952318745917, + "grad_norm": NaN, + "learning_rate": 7.354836506079965e-05, + "loss": 0.0, + "step": 43757 + }, + { + "epoch": 4.083045628440795, + "grad_norm": NaN, + "learning_rate": 7.354185753888618e-05, + "loss": 0.0, + "step": 43758 + }, + { + "epoch": 4.083138938135672, + "grad_norm": NaN, + "learning_rate": 7.353535021138324e-05, + "loss": 0.0, + "step": 43759 + }, + { + "epoch": 4.08323224783055, + "grad_norm": NaN, + "learning_rate": 7.352884307830719e-05, + "loss": 0.0, + "step": 43760 + }, + { + "epoch": 4.083325557525427, + "grad_norm": NaN, + "learning_rate": 7.35223361396746e-05, + "loss": 0.0, + "step": 43761 + }, + { + "epoch": 4.0834188672203044, + "grad_norm": NaN, + "learning_rate": 7.351582939550215e-05, + "loss": 0.0, + "step": 43762 + }, + { + "epoch": 4.083512176915182, + "grad_norm": NaN, + "learning_rate": 7.350932284580624e-05, + "loss": 0.0, + "step": 43763 + }, + { + "epoch": 4.083605486610058, + "grad_norm": NaN, + "learning_rate": 7.350281649060342e-05, + "loss": 0.0, + "step": 43764 + }, + { + "epoch": 4.083698796304936, + "grad_norm": NaN, + "learning_rate": 7.349631032991037e-05, + "loss": 0.0, + "step": 43765 + }, + { + "epoch": 4.083792105999813, + "grad_norm": NaN, + "learning_rate": 7.34898043637435e-05, + "loss": 0.0, + "step": 43766 + }, + { + "epoch": 4.083885415694691, + "grad_norm": NaN, + "learning_rate": 7.348329859211937e-05, + "loss": 0.0, + "step": 43767 + }, + { + "epoch": 4.083978725389568, + "grad_norm": NaN, + "learning_rate": 7.347679301505457e-05, + "loss": 0.0, + "step": 43768 + }, + { + "epoch": 4.0840720350844455, + "grad_norm": NaN, + "learning_rate": 7.34702876325656e-05, + "loss": 0.0, + "step": 43769 + }, + { + "epoch": 4.084165344779323, + "grad_norm": NaN, + "learning_rate": 7.346378244466904e-05, + "loss": 0.0, + "step": 43770 + }, + { + "epoch": 4.0842586544741994, + "grad_norm": NaN, + "learning_rate": 7.34572774513814e-05, + "loss": 0.0, + "step": 43771 + }, + { + "epoch": 4.084351964169077, + "grad_norm": NaN, + "learning_rate": 7.345077265271921e-05, + "loss": 0.0, + "step": 43772 + }, + { + "epoch": 4.084445273863954, + "grad_norm": NaN, + "learning_rate": 7.344426804869906e-05, + "loss": 0.0, + "step": 43773 + }, + { + "epoch": 4.084538583558832, + "grad_norm": NaN, + "learning_rate": 7.343776363933744e-05, + "loss": 0.0, + "step": 43774 + }, + { + "epoch": 4.084631893253709, + "grad_norm": NaN, + "learning_rate": 7.343125942465092e-05, + "loss": 0.0, + "step": 43775 + }, + { + "epoch": 4.0847252029485865, + "grad_norm": NaN, + "learning_rate": 7.342475540465602e-05, + "loss": 0.0, + "step": 43776 + }, + { + "epoch": 4.084818512643464, + "grad_norm": NaN, + "learning_rate": 7.341825157936927e-05, + "loss": 0.0, + "step": 43777 + }, + { + "epoch": 4.084911822338341, + "grad_norm": NaN, + "learning_rate": 7.341174794880725e-05, + "loss": 0.0, + "step": 43778 + }, + { + "epoch": 4.085005132033218, + "grad_norm": NaN, + "learning_rate": 7.340524451298644e-05, + "loss": 0.0, + "step": 43779 + }, + { + "epoch": 4.085098441728095, + "grad_norm": NaN, + "learning_rate": 7.339874127192342e-05, + "loss": 0.0, + "step": 43780 + }, + { + "epoch": 4.085191751422973, + "grad_norm": NaN, + "learning_rate": 7.33922382256347e-05, + "loss": 0.0, + "step": 43781 + }, + { + "epoch": 4.08528506111785, + "grad_norm": NaN, + "learning_rate": 7.338573537413684e-05, + "loss": 0.0, + "step": 43782 + }, + { + "epoch": 4.085378370812728, + "grad_norm": NaN, + "learning_rate": 7.337923271744635e-05, + "loss": 0.0, + "step": 43783 + }, + { + "epoch": 4.085471680507605, + "grad_norm": NaN, + "learning_rate": 7.337273025557971e-05, + "loss": 0.0, + "step": 43784 + }, + { + "epoch": 4.085564990202482, + "grad_norm": NaN, + "learning_rate": 7.336622798855366e-05, + "loss": 0.0, + "step": 43785 + }, + { + "epoch": 4.085658299897359, + "grad_norm": NaN, + "learning_rate": 7.33597259163845e-05, + "loss": 0.0, + "step": 43786 + }, + { + "epoch": 4.085751609592236, + "grad_norm": NaN, + "learning_rate": 7.335322403908881e-05, + "loss": 0.0, + "step": 43787 + }, + { + "epoch": 4.085844919287114, + "grad_norm": NaN, + "learning_rate": 7.334672235668328e-05, + "loss": 0.0, + "step": 43788 + }, + { + "epoch": 4.085938228981991, + "grad_norm": NaN, + "learning_rate": 7.334022086918428e-05, + "loss": 0.0, + "step": 43789 + }, + { + "epoch": 4.086031538676869, + "grad_norm": NaN, + "learning_rate": 7.333371957660833e-05, + "loss": 0.0, + "step": 43790 + }, + { + "epoch": 4.086124848371746, + "grad_norm": NaN, + "learning_rate": 7.332721847897213e-05, + "loss": 0.0, + "step": 43791 + }, + { + "epoch": 4.0862181580666235, + "grad_norm": NaN, + "learning_rate": 7.332071757629204e-05, + "loss": 0.0, + "step": 43792 + }, + { + "epoch": 4.0863114677615, + "grad_norm": NaN, + "learning_rate": 7.331421686858461e-05, + "loss": 0.0, + "step": 43793 + }, + { + "epoch": 4.086404777456377, + "grad_norm": NaN, + "learning_rate": 7.330771635586652e-05, + "loss": 0.0, + "step": 43794 + }, + { + "epoch": 4.086498087151255, + "grad_norm": NaN, + "learning_rate": 7.330121603815413e-05, + "loss": 0.0, + "step": 43795 + }, + { + "epoch": 4.086591396846132, + "grad_norm": NaN, + "learning_rate": 7.329471591546398e-05, + "loss": 0.0, + "step": 43796 + }, + { + "epoch": 4.08668470654101, + "grad_norm": NaN, + "learning_rate": 7.328821598781274e-05, + "loss": 0.0, + "step": 43797 + }, + { + "epoch": 4.086778016235887, + "grad_norm": NaN, + "learning_rate": 7.328171625521679e-05, + "loss": 0.0, + "step": 43798 + }, + { + "epoch": 4.0868713259307645, + "grad_norm": NaN, + "learning_rate": 7.327521671769265e-05, + "loss": 0.0, + "step": 43799 + }, + { + "epoch": 4.086964635625642, + "grad_norm": NaN, + "learning_rate": 7.326871737525703e-05, + "loss": 0.0, + "step": 43800 + }, + { + "epoch": 4.0870579453205185, + "grad_norm": NaN, + "learning_rate": 7.326221822792628e-05, + "loss": 0.0, + "step": 43801 + }, + { + "epoch": 4.087151255015396, + "grad_norm": NaN, + "learning_rate": 7.325571927571692e-05, + "loss": 0.0, + "step": 43802 + }, + { + "epoch": 4.087244564710273, + "grad_norm": NaN, + "learning_rate": 7.324922051864564e-05, + "loss": 0.0, + "step": 43803 + }, + { + "epoch": 4.087337874405151, + "grad_norm": NaN, + "learning_rate": 7.32427219567288e-05, + "loss": 0.0, + "step": 43804 + }, + { + "epoch": 4.087431184100028, + "grad_norm": NaN, + "learning_rate": 7.323622358998293e-05, + "loss": 0.0, + "step": 43805 + }, + { + "epoch": 4.087524493794906, + "grad_norm": NaN, + "learning_rate": 7.322972541842471e-05, + "loss": 0.0, + "step": 43806 + }, + { + "epoch": 4.087617803489783, + "grad_norm": NaN, + "learning_rate": 7.322322744207051e-05, + "loss": 0.0, + "step": 43807 + }, + { + "epoch": 4.0877111131846595, + "grad_norm": NaN, + "learning_rate": 7.321672966093681e-05, + "loss": 0.0, + "step": 43808 + }, + { + "epoch": 4.087804422879537, + "grad_norm": NaN, + "learning_rate": 7.321023207504038e-05, + "loss": 0.0, + "step": 43809 + }, + { + "epoch": 4.087897732574414, + "grad_norm": NaN, + "learning_rate": 7.32037346843975e-05, + "loss": 0.0, + "step": 43810 + }, + { + "epoch": 4.087991042269292, + "grad_norm": NaN, + "learning_rate": 7.319723748902474e-05, + "loss": 0.0, + "step": 43811 + }, + { + "epoch": 4.088084351964169, + "grad_norm": NaN, + "learning_rate": 7.319074048893874e-05, + "loss": 0.0, + "step": 43812 + }, + { + "epoch": 4.088177661659047, + "grad_norm": NaN, + "learning_rate": 7.318424368415588e-05, + "loss": 0.0, + "step": 43813 + }, + { + "epoch": 4.088270971353924, + "grad_norm": NaN, + "learning_rate": 7.317774707469275e-05, + "loss": 0.0, + "step": 43814 + }, + { + "epoch": 4.088364281048801, + "grad_norm": NaN, + "learning_rate": 7.317125066056585e-05, + "loss": 0.0, + "step": 43815 + }, + { + "epoch": 4.088457590743678, + "grad_norm": NaN, + "learning_rate": 7.316475444179169e-05, + "loss": 0.0, + "step": 43816 + }, + { + "epoch": 4.088550900438555, + "grad_norm": NaN, + "learning_rate": 7.315825841838682e-05, + "loss": 0.0, + "step": 43817 + }, + { + "epoch": 4.088644210133433, + "grad_norm": NaN, + "learning_rate": 7.315176259036773e-05, + "loss": 0.0, + "step": 43818 + }, + { + "epoch": 4.08873751982831, + "grad_norm": NaN, + "learning_rate": 7.314526695775087e-05, + "loss": 0.0, + "step": 43819 + }, + { + "epoch": 4.088830829523188, + "grad_norm": NaN, + "learning_rate": 7.313877152055296e-05, + "loss": 0.0, + "step": 43820 + }, + { + "epoch": 4.088924139218065, + "grad_norm": NaN, + "learning_rate": 7.313227627879032e-05, + "loss": 0.0, + "step": 43821 + }, + { + "epoch": 4.0890174489129425, + "grad_norm": NaN, + "learning_rate": 7.312578123247948e-05, + "loss": 0.0, + "step": 43822 + }, + { + "epoch": 4.089110758607819, + "grad_norm": NaN, + "learning_rate": 7.311928638163712e-05, + "loss": 0.0, + "step": 43823 + }, + { + "epoch": 4.0892040683026964, + "grad_norm": NaN, + "learning_rate": 7.311279172627959e-05, + "loss": 0.0, + "step": 43824 + }, + { + "epoch": 4.089297377997574, + "grad_norm": NaN, + "learning_rate": 7.310629726642339e-05, + "loss": 0.0, + "step": 43825 + }, + { + "epoch": 4.089390687692451, + "grad_norm": NaN, + "learning_rate": 7.309980300208521e-05, + "loss": 0.0, + "step": 43826 + }, + { + "epoch": 4.089483997387329, + "grad_norm": NaN, + "learning_rate": 7.30933089332814e-05, + "loss": 0.0, + "step": 43827 + }, + { + "epoch": 4.089577307082206, + "grad_norm": NaN, + "learning_rate": 7.308681506002845e-05, + "loss": 0.0, + "step": 43828 + }, + { + "epoch": 4.0896706167770835, + "grad_norm": NaN, + "learning_rate": 7.308032138234307e-05, + "loss": 0.0, + "step": 43829 + }, + { + "epoch": 4.08976392647196, + "grad_norm": NaN, + "learning_rate": 7.30738279002416e-05, + "loss": 0.0, + "step": 43830 + }, + { + "epoch": 4.0898572361668375, + "grad_norm": NaN, + "learning_rate": 7.306733461374053e-05, + "loss": 0.0, + "step": 43831 + }, + { + "epoch": 4.089950545861715, + "grad_norm": NaN, + "learning_rate": 7.306084152285657e-05, + "loss": 0.0, + "step": 43832 + }, + { + "epoch": 4.090043855556592, + "grad_norm": NaN, + "learning_rate": 7.305434862760604e-05, + "loss": 0.0, + "step": 43833 + }, + { + "epoch": 4.09013716525147, + "grad_norm": NaN, + "learning_rate": 7.304785592800543e-05, + "loss": 0.0, + "step": 43834 + }, + { + "epoch": 4.090230474946347, + "grad_norm": NaN, + "learning_rate": 7.304136342407146e-05, + "loss": 0.0, + "step": 43835 + }, + { + "epoch": 4.090323784641225, + "grad_norm": NaN, + "learning_rate": 7.303487111582043e-05, + "loss": 0.0, + "step": 43836 + }, + { + "epoch": 4.090417094336101, + "grad_norm": NaN, + "learning_rate": 7.302837900326888e-05, + "loss": 0.0, + "step": 43837 + }, + { + "epoch": 4.0905104040309785, + "grad_norm": NaN, + "learning_rate": 7.302188708643348e-05, + "loss": 0.0, + "step": 43838 + }, + { + "epoch": 4.090603713725856, + "grad_norm": NaN, + "learning_rate": 7.301539536533055e-05, + "loss": 0.0, + "step": 43839 + }, + { + "epoch": 4.090697023420733, + "grad_norm": NaN, + "learning_rate": 7.300890383997663e-05, + "loss": 0.0, + "step": 43840 + }, + { + "epoch": 4.090790333115611, + "grad_norm": NaN, + "learning_rate": 7.300241251038836e-05, + "loss": 0.0, + "step": 43841 + }, + { + "epoch": 4.090883642810488, + "grad_norm": NaN, + "learning_rate": 7.299592137658207e-05, + "loss": 0.0, + "step": 43842 + }, + { + "epoch": 4.090976952505366, + "grad_norm": NaN, + "learning_rate": 7.298943043857433e-05, + "loss": 0.0, + "step": 43843 + }, + { + "epoch": 4.091070262200242, + "grad_norm": NaN, + "learning_rate": 7.298293969638174e-05, + "loss": 0.0, + "step": 43844 + }, + { + "epoch": 4.09116357189512, + "grad_norm": NaN, + "learning_rate": 7.297644915002067e-05, + "loss": 0.0, + "step": 43845 + }, + { + "epoch": 4.091256881589997, + "grad_norm": NaN, + "learning_rate": 7.29699587995076e-05, + "loss": 0.0, + "step": 43846 + }, + { + "epoch": 4.091350191284874, + "grad_norm": NaN, + "learning_rate": 7.296346864485925e-05, + "loss": 0.0, + "step": 43847 + }, + { + "epoch": 4.091443500979752, + "grad_norm": NaN, + "learning_rate": 7.29569786860919e-05, + "loss": 0.0, + "step": 43848 + }, + { + "epoch": 4.091536810674629, + "grad_norm": NaN, + "learning_rate": 7.295048892322208e-05, + "loss": 0.0, + "step": 43849 + }, + { + "epoch": 4.091630120369507, + "grad_norm": NaN, + "learning_rate": 7.294399935626646e-05, + "loss": 0.0, + "step": 43850 + }, + { + "epoch": 4.091723430064384, + "grad_norm": NaN, + "learning_rate": 7.293750998524137e-05, + "loss": 0.0, + "step": 43851 + }, + { + "epoch": 4.091816739759261, + "grad_norm": NaN, + "learning_rate": 7.293102081016331e-05, + "loss": 0.0, + "step": 43852 + }, + { + "epoch": 4.091910049454138, + "grad_norm": NaN, + "learning_rate": 7.292453183104894e-05, + "loss": 0.0, + "step": 43853 + }, + { + "epoch": 4.0920033591490155, + "grad_norm": NaN, + "learning_rate": 7.29180430479146e-05, + "loss": 0.0, + "step": 43854 + }, + { + "epoch": 4.092096668843893, + "grad_norm": NaN, + "learning_rate": 7.291155446077678e-05, + "loss": 0.0, + "step": 43855 + }, + { + "epoch": 4.09218997853877, + "grad_norm": NaN, + "learning_rate": 7.290506606965216e-05, + "loss": 0.0, + "step": 43856 + }, + { + "epoch": 4.092283288233648, + "grad_norm": NaN, + "learning_rate": 7.2898577874557e-05, + "loss": 0.0, + "step": 43857 + }, + { + "epoch": 4.092376597928525, + "grad_norm": NaN, + "learning_rate": 7.289208987550804e-05, + "loss": 0.0, + "step": 43858 + }, + { + "epoch": 4.092469907623402, + "grad_norm": NaN, + "learning_rate": 7.28856020725216e-05, + "loss": 0.0, + "step": 43859 + }, + { + "epoch": 4.092563217318279, + "grad_norm": NaN, + "learning_rate": 7.287911446561417e-05, + "loss": 0.0, + "step": 43860 + }, + { + "epoch": 4.0926565270131565, + "grad_norm": NaN, + "learning_rate": 7.287262705480243e-05, + "loss": 0.0, + "step": 43861 + }, + { + "epoch": 4.092749836708034, + "grad_norm": NaN, + "learning_rate": 7.286613984010266e-05, + "loss": 0.0, + "step": 43862 + }, + { + "epoch": 4.092843146402911, + "grad_norm": NaN, + "learning_rate": 7.285965282153143e-05, + "loss": 0.0, + "step": 43863 + }, + { + "epoch": 4.092936456097789, + "grad_norm": NaN, + "learning_rate": 7.285316599910535e-05, + "loss": 0.0, + "step": 43864 + }, + { + "epoch": 4.093029765792666, + "grad_norm": NaN, + "learning_rate": 7.284667937284077e-05, + "loss": 0.0, + "step": 43865 + }, + { + "epoch": 4.093123075487543, + "grad_norm": NaN, + "learning_rate": 7.284019294275415e-05, + "loss": 0.0, + "step": 43866 + }, + { + "epoch": 4.09321638518242, + "grad_norm": NaN, + "learning_rate": 7.28337067088622e-05, + "loss": 0.0, + "step": 43867 + }, + { + "epoch": 4.093309694877298, + "grad_norm": NaN, + "learning_rate": 7.282722067118121e-05, + "loss": 0.0, + "step": 43868 + }, + { + "epoch": 4.093403004572175, + "grad_norm": NaN, + "learning_rate": 7.282073482972769e-05, + "loss": 0.0, + "step": 43869 + }, + { + "epoch": 4.093496314267052, + "grad_norm": NaN, + "learning_rate": 7.281424918451829e-05, + "loss": 0.0, + "step": 43870 + }, + { + "epoch": 4.09358962396193, + "grad_norm": NaN, + "learning_rate": 7.280776373556932e-05, + "loss": 0.0, + "step": 43871 + }, + { + "epoch": 4.093682933656807, + "grad_norm": NaN, + "learning_rate": 7.280127848289729e-05, + "loss": 0.0, + "step": 43872 + }, + { + "epoch": 4.093776243351685, + "grad_norm": NaN, + "learning_rate": 7.279479342651884e-05, + "loss": 0.0, + "step": 43873 + }, + { + "epoch": 4.093869553046561, + "grad_norm": NaN, + "learning_rate": 7.278830856645033e-05, + "loss": 0.0, + "step": 43874 + }, + { + "epoch": 4.093962862741439, + "grad_norm": NaN, + "learning_rate": 7.278182390270822e-05, + "loss": 0.0, + "step": 43875 + }, + { + "epoch": 4.094056172436316, + "grad_norm": NaN, + "learning_rate": 7.277533943530917e-05, + "loss": 0.0, + "step": 43876 + }, + { + "epoch": 4.0941494821311935, + "grad_norm": NaN, + "learning_rate": 7.276885516426947e-05, + "loss": 0.0, + "step": 43877 + }, + { + "epoch": 4.094242791826071, + "grad_norm": NaN, + "learning_rate": 7.276237108960566e-05, + "loss": 0.0, + "step": 43878 + }, + { + "epoch": 4.094336101520948, + "grad_norm": NaN, + "learning_rate": 7.275588721133439e-05, + "loss": 0.0, + "step": 43879 + }, + { + "epoch": 4.094429411215826, + "grad_norm": NaN, + "learning_rate": 7.274940352947194e-05, + "loss": 0.0, + "step": 43880 + }, + { + "epoch": 4.094522720910702, + "grad_norm": NaN, + "learning_rate": 7.274292004403483e-05, + "loss": 0.0, + "step": 43881 + }, + { + "epoch": 4.09461603060558, + "grad_norm": NaN, + "learning_rate": 7.273643675503969e-05, + "loss": 0.0, + "step": 43882 + }, + { + "epoch": 4.094709340300457, + "grad_norm": NaN, + "learning_rate": 7.272995366250285e-05, + "loss": 0.0, + "step": 43883 + }, + { + "epoch": 4.0948026499953345, + "grad_norm": NaN, + "learning_rate": 7.272347076644079e-05, + "loss": 0.0, + "step": 43884 + }, + { + "epoch": 4.094895959690212, + "grad_norm": NaN, + "learning_rate": 7.271698806687017e-05, + "loss": 0.0, + "step": 43885 + }, + { + "epoch": 4.094989269385089, + "grad_norm": NaN, + "learning_rate": 7.27105055638073e-05, + "loss": 0.0, + "step": 43886 + }, + { + "epoch": 4.095082579079967, + "grad_norm": NaN, + "learning_rate": 7.270402325726867e-05, + "loss": 0.0, + "step": 43887 + }, + { + "epoch": 4.095175888774843, + "grad_norm": NaN, + "learning_rate": 7.269754114727093e-05, + "loss": 0.0, + "step": 43888 + }, + { + "epoch": 4.095269198469721, + "grad_norm": NaN, + "learning_rate": 7.269105923383036e-05, + "loss": 0.0, + "step": 43889 + }, + { + "epoch": 4.095362508164598, + "grad_norm": NaN, + "learning_rate": 7.268457751696349e-05, + "loss": 0.0, + "step": 43890 + }, + { + "epoch": 4.0954558178594755, + "grad_norm": NaN, + "learning_rate": 7.267809599668693e-05, + "loss": 0.0, + "step": 43891 + }, + { + "epoch": 4.095549127554353, + "grad_norm": NaN, + "learning_rate": 7.267161467301704e-05, + "loss": 0.0, + "step": 43892 + }, + { + "epoch": 4.09564243724923, + "grad_norm": NaN, + "learning_rate": 7.266513354597024e-05, + "loss": 0.0, + "step": 43893 + }, + { + "epoch": 4.095735746944108, + "grad_norm": NaN, + "learning_rate": 7.265865261556324e-05, + "loss": 0.0, + "step": 43894 + }, + { + "epoch": 4.095829056638985, + "grad_norm": NaN, + "learning_rate": 7.265217188181224e-05, + "loss": 0.0, + "step": 43895 + }, + { + "epoch": 4.095922366333862, + "grad_norm": NaN, + "learning_rate": 7.264569134473394e-05, + "loss": 0.0, + "step": 43896 + }, + { + "epoch": 4.096015676028739, + "grad_norm": NaN, + "learning_rate": 7.263921100434475e-05, + "loss": 0.0, + "step": 43897 + }, + { + "epoch": 4.096108985723617, + "grad_norm": NaN, + "learning_rate": 7.263273086066106e-05, + "loss": 0.0, + "step": 43898 + }, + { + "epoch": 4.096202295418494, + "grad_norm": NaN, + "learning_rate": 7.262625091369945e-05, + "loss": 0.0, + "step": 43899 + }, + { + "epoch": 4.096295605113371, + "grad_norm": NaN, + "learning_rate": 7.261977116347642e-05, + "loss": 0.0, + "step": 43900 + }, + { + "epoch": 4.096388914808249, + "grad_norm": NaN, + "learning_rate": 7.261329161000828e-05, + "loss": 0.0, + "step": 43901 + }, + { + "epoch": 4.096482224503126, + "grad_norm": NaN, + "learning_rate": 7.260681225331172e-05, + "loss": 0.0, + "step": 43902 + }, + { + "epoch": 4.096575534198003, + "grad_norm": NaN, + "learning_rate": 7.260033309340307e-05, + "loss": 0.0, + "step": 43903 + }, + { + "epoch": 4.09666884389288, + "grad_norm": NaN, + "learning_rate": 7.259385413029878e-05, + "loss": 0.0, + "step": 43904 + }, + { + "epoch": 4.096762153587758, + "grad_norm": NaN, + "learning_rate": 7.25873753640155e-05, + "loss": 0.0, + "step": 43905 + }, + { + "epoch": 4.096855463282635, + "grad_norm": NaN, + "learning_rate": 7.258089679456955e-05, + "loss": 0.0, + "step": 43906 + }, + { + "epoch": 4.0969487729775125, + "grad_norm": NaN, + "learning_rate": 7.257441842197739e-05, + "loss": 0.0, + "step": 43907 + }, + { + "epoch": 4.09704208267239, + "grad_norm": NaN, + "learning_rate": 7.256794024625565e-05, + "loss": 0.0, + "step": 43908 + }, + { + "epoch": 4.097135392367267, + "grad_norm": NaN, + "learning_rate": 7.256146226742064e-05, + "loss": 0.0, + "step": 43909 + }, + { + "epoch": 4.097228702062144, + "grad_norm": NaN, + "learning_rate": 7.255498448548886e-05, + "loss": 0.0, + "step": 43910 + }, + { + "epoch": 4.097322011757021, + "grad_norm": NaN, + "learning_rate": 7.25485069004769e-05, + "loss": 0.0, + "step": 43911 + }, + { + "epoch": 4.097415321451899, + "grad_norm": NaN, + "learning_rate": 7.25420295124011e-05, + "loss": 0.0, + "step": 43912 + }, + { + "epoch": 4.097508631146776, + "grad_norm": NaN, + "learning_rate": 7.253555232127792e-05, + "loss": 0.0, + "step": 43913 + }, + { + "epoch": 4.0976019408416535, + "grad_norm": NaN, + "learning_rate": 7.252907532712398e-05, + "loss": 0.0, + "step": 43914 + }, + { + "epoch": 4.097695250536531, + "grad_norm": NaN, + "learning_rate": 7.252259852995561e-05, + "loss": 0.0, + "step": 43915 + }, + { + "epoch": 4.097788560231408, + "grad_norm": NaN, + "learning_rate": 7.251612192978927e-05, + "loss": 0.0, + "step": 43916 + }, + { + "epoch": 4.097881869926285, + "grad_norm": NaN, + "learning_rate": 7.250964552664159e-05, + "loss": 0.0, + "step": 43917 + }, + { + "epoch": 4.097975179621162, + "grad_norm": NaN, + "learning_rate": 7.250316932052885e-05, + "loss": 0.0, + "step": 43918 + }, + { + "epoch": 4.09806848931604, + "grad_norm": NaN, + "learning_rate": 7.249669331146757e-05, + "loss": 0.0, + "step": 43919 + }, + { + "epoch": 4.098161799010917, + "grad_norm": NaN, + "learning_rate": 7.249021749947433e-05, + "loss": 0.0, + "step": 43920 + }, + { + "epoch": 4.098255108705795, + "grad_norm": NaN, + "learning_rate": 7.248374188456545e-05, + "loss": 0.0, + "step": 43921 + }, + { + "epoch": 4.098348418400672, + "grad_norm": NaN, + "learning_rate": 7.247726646675741e-05, + "loss": 0.0, + "step": 43922 + }, + { + "epoch": 4.098441728095549, + "grad_norm": NaN, + "learning_rate": 7.247079124606682e-05, + "loss": 0.0, + "step": 43923 + }, + { + "epoch": 4.098535037790427, + "grad_norm": NaN, + "learning_rate": 7.246431622251e-05, + "loss": 0.0, + "step": 43924 + }, + { + "epoch": 4.098628347485303, + "grad_norm": NaN, + "learning_rate": 7.24578413961034e-05, + "loss": 0.0, + "step": 43925 + }, + { + "epoch": 4.098721657180181, + "grad_norm": NaN, + "learning_rate": 7.245136676686363e-05, + "loss": 0.0, + "step": 43926 + }, + { + "epoch": 4.098814966875058, + "grad_norm": NaN, + "learning_rate": 7.244489233480701e-05, + "loss": 0.0, + "step": 43927 + }, + { + "epoch": 4.098908276569936, + "grad_norm": NaN, + "learning_rate": 7.243841809995002e-05, + "loss": 0.0, + "step": 43928 + }, + { + "epoch": 4.099001586264813, + "grad_norm": NaN, + "learning_rate": 7.243194406230926e-05, + "loss": 0.0, + "step": 43929 + }, + { + "epoch": 4.0990948959596905, + "grad_norm": NaN, + "learning_rate": 7.242547022190103e-05, + "loss": 0.0, + "step": 43930 + }, + { + "epoch": 4.099188205654568, + "grad_norm": NaN, + "learning_rate": 7.24189965787418e-05, + "loss": 0.0, + "step": 43931 + }, + { + "epoch": 4.099281515349444, + "grad_norm": NaN, + "learning_rate": 7.241252313284818e-05, + "loss": 0.0, + "step": 43932 + }, + { + "epoch": 4.099374825044322, + "grad_norm": NaN, + "learning_rate": 7.240604988423643e-05, + "loss": 0.0, + "step": 43933 + }, + { + "epoch": 4.099468134739199, + "grad_norm": NaN, + "learning_rate": 7.239957683292315e-05, + "loss": 0.0, + "step": 43934 + }, + { + "epoch": 4.099561444434077, + "grad_norm": NaN, + "learning_rate": 7.239310397892482e-05, + "loss": 0.0, + "step": 43935 + }, + { + "epoch": 4.099654754128954, + "grad_norm": NaN, + "learning_rate": 7.238663132225774e-05, + "loss": 0.0, + "step": 43936 + }, + { + "epoch": 4.0997480638238315, + "grad_norm": NaN, + "learning_rate": 7.238015886293852e-05, + "loss": 0.0, + "step": 43937 + }, + { + "epoch": 4.099841373518709, + "grad_norm": NaN, + "learning_rate": 7.237368660098361e-05, + "loss": 0.0, + "step": 43938 + }, + { + "epoch": 4.099934683213586, + "grad_norm": NaN, + "learning_rate": 7.23672145364093e-05, + "loss": 0.0, + "step": 43939 + }, + { + "epoch": 4.100027992908463, + "grad_norm": NaN, + "learning_rate": 7.236074266923224e-05, + "loss": 0.0, + "step": 43940 + }, + { + "epoch": 4.10012130260334, + "grad_norm": NaN, + "learning_rate": 7.235427099946886e-05, + "loss": 0.0, + "step": 43941 + }, + { + "epoch": 4.100214612298218, + "grad_norm": NaN, + "learning_rate": 7.234779952713546e-05, + "loss": 0.0, + "step": 43942 + }, + { + "epoch": 4.100307921993095, + "grad_norm": NaN, + "learning_rate": 7.234132825224866e-05, + "loss": 0.0, + "step": 43943 + }, + { + "epoch": 4.1004012316879725, + "grad_norm": NaN, + "learning_rate": 7.233485717482492e-05, + "loss": 0.0, + "step": 43944 + }, + { + "epoch": 4.10049454138285, + "grad_norm": NaN, + "learning_rate": 7.232838629488051e-05, + "loss": 0.0, + "step": 43945 + }, + { + "epoch": 4.100587851077727, + "grad_norm": NaN, + "learning_rate": 7.232191561243207e-05, + "loss": 0.0, + "step": 43946 + }, + { + "epoch": 4.100681160772604, + "grad_norm": NaN, + "learning_rate": 7.231544512749603e-05, + "loss": 0.0, + "step": 43947 + }, + { + "epoch": 4.100774470467481, + "grad_norm": NaN, + "learning_rate": 7.230897484008871e-05, + "loss": 0.0, + "step": 43948 + }, + { + "epoch": 4.100867780162359, + "grad_norm": NaN, + "learning_rate": 7.230250475022675e-05, + "loss": 0.0, + "step": 43949 + }, + { + "epoch": 4.100961089857236, + "grad_norm": NaN, + "learning_rate": 7.229603485792646e-05, + "loss": 0.0, + "step": 43950 + }, + { + "epoch": 4.101054399552114, + "grad_norm": NaN, + "learning_rate": 7.228956516320427e-05, + "loss": 0.0, + "step": 43951 + }, + { + "epoch": 4.101147709246991, + "grad_norm": NaN, + "learning_rate": 7.228309566607681e-05, + "loss": 0.0, + "step": 43952 + }, + { + "epoch": 4.101241018941868, + "grad_norm": NaN, + "learning_rate": 7.227662636656037e-05, + "loss": 0.0, + "step": 43953 + }, + { + "epoch": 4.101334328636745, + "grad_norm": NaN, + "learning_rate": 7.227015726467138e-05, + "loss": 0.0, + "step": 43954 + }, + { + "epoch": 4.101427638331622, + "grad_norm": NaN, + "learning_rate": 7.226368836042646e-05, + "loss": 0.0, + "step": 43955 + }, + { + "epoch": 4.1015209480265, + "grad_norm": NaN, + "learning_rate": 7.22572196538419e-05, + "loss": 0.0, + "step": 43956 + }, + { + "epoch": 4.101614257721377, + "grad_norm": NaN, + "learning_rate": 7.225075114493414e-05, + "loss": 0.0, + "step": 43957 + }, + { + "epoch": 4.101707567416255, + "grad_norm": NaN, + "learning_rate": 7.224428283371983e-05, + "loss": 0.0, + "step": 43958 + }, + { + "epoch": 4.101800877111132, + "grad_norm": NaN, + "learning_rate": 7.223781472021521e-05, + "loss": 0.0, + "step": 43959 + }, + { + "epoch": 4.1018941868060095, + "grad_norm": NaN, + "learning_rate": 7.223134680443672e-05, + "loss": 0.0, + "step": 43960 + }, + { + "epoch": 4.101987496500886, + "grad_norm": NaN, + "learning_rate": 7.2224879086401e-05, + "loss": 0.0, + "step": 43961 + }, + { + "epoch": 4.102080806195763, + "grad_norm": NaN, + "learning_rate": 7.221841156612431e-05, + "loss": 0.0, + "step": 43962 + }, + { + "epoch": 4.102174115890641, + "grad_norm": NaN, + "learning_rate": 7.221194424362311e-05, + "loss": 0.0, + "step": 43963 + }, + { + "epoch": 4.102267425585518, + "grad_norm": NaN, + "learning_rate": 7.2205477118914e-05, + "loss": 0.0, + "step": 43964 + }, + { + "epoch": 4.102360735280396, + "grad_norm": NaN, + "learning_rate": 7.219901019201327e-05, + "loss": 0.0, + "step": 43965 + }, + { + "epoch": 4.102454044975273, + "grad_norm": NaN, + "learning_rate": 7.219254346293735e-05, + "loss": 0.0, + "step": 43966 + }, + { + "epoch": 4.1025473546701505, + "grad_norm": NaN, + "learning_rate": 7.218607693170285e-05, + "loss": 0.0, + "step": 43967 + }, + { + "epoch": 4.102640664365028, + "grad_norm": NaN, + "learning_rate": 7.217961059832607e-05, + "loss": 0.0, + "step": 43968 + }, + { + "epoch": 4.1027339740599045, + "grad_norm": NaN, + "learning_rate": 7.217314446282339e-05, + "loss": 0.0, + "step": 43969 + }, + { + "epoch": 4.102827283754782, + "grad_norm": NaN, + "learning_rate": 7.21666785252115e-05, + "loss": 0.0, + "step": 43970 + }, + { + "epoch": 4.102920593449659, + "grad_norm": NaN, + "learning_rate": 7.216021278550657e-05, + "loss": 0.0, + "step": 43971 + }, + { + "epoch": 4.103013903144537, + "grad_norm": NaN, + "learning_rate": 7.21537472437252e-05, + "loss": 0.0, + "step": 43972 + }, + { + "epoch": 4.103107212839414, + "grad_norm": NaN, + "learning_rate": 7.214728189988387e-05, + "loss": 0.0, + "step": 43973 + }, + { + "epoch": 4.103200522534292, + "grad_norm": NaN, + "learning_rate": 7.214081675399882e-05, + "loss": 0.0, + "step": 43974 + }, + { + "epoch": 4.103293832229169, + "grad_norm": NaN, + "learning_rate": 7.213435180608666e-05, + "loss": 0.0, + "step": 43975 + }, + { + "epoch": 4.1033871419240455, + "grad_norm": NaN, + "learning_rate": 7.212788705616384e-05, + "loss": 0.0, + "step": 43976 + }, + { + "epoch": 4.103480451618923, + "grad_norm": NaN, + "learning_rate": 7.212142250424665e-05, + "loss": 0.0, + "step": 43977 + }, + { + "epoch": 4.1035737613138, + "grad_norm": NaN, + "learning_rate": 7.211495815035163e-05, + "loss": 0.0, + "step": 43978 + }, + { + "epoch": 4.103667071008678, + "grad_norm": NaN, + "learning_rate": 7.210849399449527e-05, + "loss": 0.0, + "step": 43979 + }, + { + "epoch": 4.103760380703555, + "grad_norm": NaN, + "learning_rate": 7.210203003669383e-05, + "loss": 0.0, + "step": 43980 + }, + { + "epoch": 4.103853690398433, + "grad_norm": NaN, + "learning_rate": 7.20955662769639e-05, + "loss": 0.0, + "step": 43981 + }, + { + "epoch": 4.10394700009331, + "grad_norm": NaN, + "learning_rate": 7.208910271532192e-05, + "loss": 0.0, + "step": 43982 + }, + { + "epoch": 4.104040309788187, + "grad_norm": NaN, + "learning_rate": 7.208263935178417e-05, + "loss": 0.0, + "step": 43983 + }, + { + "epoch": 4.104133619483064, + "grad_norm": NaN, + "learning_rate": 7.207617618636723e-05, + "loss": 0.0, + "step": 43984 + }, + { + "epoch": 4.104226929177941, + "grad_norm": NaN, + "learning_rate": 7.206971321908757e-05, + "loss": 0.0, + "step": 43985 + }, + { + "epoch": 4.104320238872819, + "grad_norm": NaN, + "learning_rate": 7.20632504499614e-05, + "loss": 0.0, + "step": 43986 + }, + { + "epoch": 4.104413548567696, + "grad_norm": NaN, + "learning_rate": 7.205678787900538e-05, + "loss": 0.0, + "step": 43987 + }, + { + "epoch": 4.104506858262574, + "grad_norm": NaN, + "learning_rate": 7.205032550623591e-05, + "loss": 0.0, + "step": 43988 + }, + { + "epoch": 4.104600167957451, + "grad_norm": NaN, + "learning_rate": 7.204386333166926e-05, + "loss": 0.0, + "step": 43989 + }, + { + "epoch": 4.1046934776523285, + "grad_norm": NaN, + "learning_rate": 7.203740135532203e-05, + "loss": 0.0, + "step": 43990 + }, + { + "epoch": 4.104786787347205, + "grad_norm": NaN, + "learning_rate": 7.203093957721066e-05, + "loss": 0.0, + "step": 43991 + }, + { + "epoch": 4.1048800970420825, + "grad_norm": NaN, + "learning_rate": 7.202447799735138e-05, + "loss": 0.0, + "step": 43992 + }, + { + "epoch": 4.10497340673696, + "grad_norm": NaN, + "learning_rate": 7.201801661576088e-05, + "loss": 0.0, + "step": 43993 + }, + { + "epoch": 4.105066716431837, + "grad_norm": NaN, + "learning_rate": 7.201155543245541e-05, + "loss": 0.0, + "step": 43994 + }, + { + "epoch": 4.105160026126715, + "grad_norm": NaN, + "learning_rate": 7.200509444745138e-05, + "loss": 0.0, + "step": 43995 + }, + { + "epoch": 4.105253335821592, + "grad_norm": NaN, + "learning_rate": 7.199863366076542e-05, + "loss": 0.0, + "step": 43996 + }, + { + "epoch": 4.1053466455164696, + "grad_norm": NaN, + "learning_rate": 7.199217307241377e-05, + "loss": 0.0, + "step": 43997 + }, + { + "epoch": 4.105439955211346, + "grad_norm": NaN, + "learning_rate": 7.198571268241286e-05, + "loss": 0.0, + "step": 43998 + }, + { + "epoch": 4.1055332649062235, + "grad_norm": NaN, + "learning_rate": 7.197925249077928e-05, + "loss": 0.0, + "step": 43999 + }, + { + "epoch": 4.105626574601101, + "grad_norm": NaN, + "learning_rate": 7.197279249752929e-05, + "loss": 0.0, + "step": 44000 + }, + { + "epoch": 4.105719884295978, + "grad_norm": NaN, + "learning_rate": 7.196633270267934e-05, + "loss": 0.0, + "step": 44001 + }, + { + "epoch": 4.105813193990856, + "grad_norm": NaN, + "learning_rate": 7.195987310624597e-05, + "loss": 0.0, + "step": 44002 + }, + { + "epoch": 4.105906503685733, + "grad_norm": NaN, + "learning_rate": 7.19534137082455e-05, + "loss": 0.0, + "step": 44003 + }, + { + "epoch": 4.105999813380611, + "grad_norm": NaN, + "learning_rate": 7.19469545086943e-05, + "loss": 0.0, + "step": 44004 + }, + { + "epoch": 4.106093123075487, + "grad_norm": NaN, + "learning_rate": 7.1940495507609e-05, + "loss": 0.0, + "step": 44005 + }, + { + "epoch": 4.1061864327703645, + "grad_norm": NaN, + "learning_rate": 7.19340367050058e-05, + "loss": 0.0, + "step": 44006 + }, + { + "epoch": 4.106279742465242, + "grad_norm": NaN, + "learning_rate": 7.192757810090125e-05, + "loss": 0.0, + "step": 44007 + }, + { + "epoch": 4.106373052160119, + "grad_norm": NaN, + "learning_rate": 7.19211196953118e-05, + "loss": 0.0, + "step": 44008 + }, + { + "epoch": 4.106466361854997, + "grad_norm": NaN, + "learning_rate": 7.191466148825372e-05, + "loss": 0.0, + "step": 44009 + }, + { + "epoch": 4.106559671549874, + "grad_norm": NaN, + "learning_rate": 7.190820347974356e-05, + "loss": 0.0, + "step": 44010 + }, + { + "epoch": 4.106652981244752, + "grad_norm": NaN, + "learning_rate": 7.190174566979779e-05, + "loss": 0.0, + "step": 44011 + }, + { + "epoch": 4.106746290939629, + "grad_norm": NaN, + "learning_rate": 7.18952880584326e-05, + "loss": 0.0, + "step": 44012 + }, + { + "epoch": 4.106839600634506, + "grad_norm": NaN, + "learning_rate": 7.188883064566462e-05, + "loss": 0.0, + "step": 44013 + }, + { + "epoch": 4.106932910329383, + "grad_norm": NaN, + "learning_rate": 7.188237343151028e-05, + "loss": 0.0, + "step": 44014 + }, + { + "epoch": 4.10702622002426, + "grad_norm": NaN, + "learning_rate": 7.18759164159858e-05, + "loss": 0.0, + "step": 44015 + }, + { + "epoch": 4.107119529719138, + "grad_norm": NaN, + "learning_rate": 7.186945959910778e-05, + "loss": 0.0, + "step": 44016 + }, + { + "epoch": 4.107212839414015, + "grad_norm": NaN, + "learning_rate": 7.186300298089265e-05, + "loss": 0.0, + "step": 44017 + }, + { + "epoch": 4.107306149108893, + "grad_norm": NaN, + "learning_rate": 7.185654656135662e-05, + "loss": 0.0, + "step": 44018 + }, + { + "epoch": 4.10739945880377, + "grad_norm": NaN, + "learning_rate": 7.18500903405163e-05, + "loss": 0.0, + "step": 44019 + }, + { + "epoch": 4.107492768498647, + "grad_norm": NaN, + "learning_rate": 7.184363431838812e-05, + "loss": 0.0, + "step": 44020 + }, + { + "epoch": 4.107586078193524, + "grad_norm": NaN, + "learning_rate": 7.183717849498833e-05, + "loss": 0.0, + "step": 44021 + }, + { + "epoch": 4.1076793878884015, + "grad_norm": NaN, + "learning_rate": 7.183072287033348e-05, + "loss": 0.0, + "step": 44022 + }, + { + "epoch": 4.107772697583279, + "grad_norm": NaN, + "learning_rate": 7.182426744444e-05, + "loss": 0.0, + "step": 44023 + }, + { + "epoch": 4.107866007278156, + "grad_norm": NaN, + "learning_rate": 7.181781221732414e-05, + "loss": 0.0, + "step": 44024 + }, + { + "epoch": 4.107959316973034, + "grad_norm": NaN, + "learning_rate": 7.181135718900248e-05, + "loss": 0.0, + "step": 44025 + }, + { + "epoch": 4.108052626667911, + "grad_norm": NaN, + "learning_rate": 7.180490235949143e-05, + "loss": 0.0, + "step": 44026 + }, + { + "epoch": 4.108145936362788, + "grad_norm": NaN, + "learning_rate": 7.179844772880727e-05, + "loss": 0.0, + "step": 44027 + }, + { + "epoch": 4.108239246057665, + "grad_norm": NaN, + "learning_rate": 7.179199329696649e-05, + "loss": 0.0, + "step": 44028 + }, + { + "epoch": 4.1083325557525425, + "grad_norm": NaN, + "learning_rate": 7.178553906398562e-05, + "loss": 0.0, + "step": 44029 + }, + { + "epoch": 4.10842586544742, + "grad_norm": NaN, + "learning_rate": 7.17790850298808e-05, + "loss": 0.0, + "step": 44030 + }, + { + "epoch": 4.108519175142297, + "grad_norm": NaN, + "learning_rate": 7.177263119466868e-05, + "loss": 0.0, + "step": 44031 + }, + { + "epoch": 4.108612484837175, + "grad_norm": NaN, + "learning_rate": 7.176617755836561e-05, + "loss": 0.0, + "step": 44032 + }, + { + "epoch": 4.108705794532052, + "grad_norm": NaN, + "learning_rate": 7.17597241209879e-05, + "loss": 0.0, + "step": 44033 + }, + { + "epoch": 4.108799104226929, + "grad_norm": NaN, + "learning_rate": 7.175327088255208e-05, + "loss": 0.0, + "step": 44034 + }, + { + "epoch": 4.108892413921806, + "grad_norm": NaN, + "learning_rate": 7.174681784307455e-05, + "loss": 0.0, + "step": 44035 + }, + { + "epoch": 4.108985723616684, + "grad_norm": NaN, + "learning_rate": 7.174036500257158e-05, + "loss": 0.0, + "step": 44036 + }, + { + "epoch": 4.109079033311561, + "grad_norm": NaN, + "learning_rate": 7.173391236105982e-05, + "loss": 0.0, + "step": 44037 + }, + { + "epoch": 4.109172343006438, + "grad_norm": NaN, + "learning_rate": 7.172745991855544e-05, + "loss": 0.0, + "step": 44038 + }, + { + "epoch": 4.109265652701316, + "grad_norm": NaN, + "learning_rate": 7.17210076750749e-05, + "loss": 0.0, + "step": 44039 + }, + { + "epoch": 4.109358962396193, + "grad_norm": NaN, + "learning_rate": 7.171455563063476e-05, + "loss": 0.0, + "step": 44040 + }, + { + "epoch": 4.109452272091071, + "grad_norm": NaN, + "learning_rate": 7.170810378525127e-05, + "loss": 0.0, + "step": 44041 + }, + { + "epoch": 4.109545581785947, + "grad_norm": NaN, + "learning_rate": 7.170165213894083e-05, + "loss": 0.0, + "step": 44042 + }, + { + "epoch": 4.109638891480825, + "grad_norm": NaN, + "learning_rate": 7.169520069172e-05, + "loss": 0.0, + "step": 44043 + }, + { + "epoch": 4.109732201175702, + "grad_norm": NaN, + "learning_rate": 7.168874944360495e-05, + "loss": 0.0, + "step": 44044 + }, + { + "epoch": 4.1098255108705795, + "grad_norm": NaN, + "learning_rate": 7.168229839461229e-05, + "loss": 0.0, + "step": 44045 + }, + { + "epoch": 4.109918820565457, + "grad_norm": NaN, + "learning_rate": 7.167584754475838e-05, + "loss": 0.0, + "step": 44046 + }, + { + "epoch": 4.110012130260334, + "grad_norm": NaN, + "learning_rate": 7.166939689405951e-05, + "loss": 0.0, + "step": 44047 + }, + { + "epoch": 4.110105439955212, + "grad_norm": NaN, + "learning_rate": 7.166294644253218e-05, + "loss": 0.0, + "step": 44048 + }, + { + "epoch": 4.110198749650088, + "grad_norm": NaN, + "learning_rate": 7.165649619019285e-05, + "loss": 0.0, + "step": 44049 + }, + { + "epoch": 4.110292059344966, + "grad_norm": NaN, + "learning_rate": 7.165004613705772e-05, + "loss": 0.0, + "step": 44050 + }, + { + "epoch": 4.110385369039843, + "grad_norm": NaN, + "learning_rate": 7.164359628314338e-05, + "loss": 0.0, + "step": 44051 + }, + { + "epoch": 4.1104786787347205, + "grad_norm": NaN, + "learning_rate": 7.163714662846621e-05, + "loss": 0.0, + "step": 44052 + }, + { + "epoch": 4.110571988429598, + "grad_norm": NaN, + "learning_rate": 7.163069717304246e-05, + "loss": 0.0, + "step": 44053 + }, + { + "epoch": 4.110665298124475, + "grad_norm": NaN, + "learning_rate": 7.16242479168887e-05, + "loss": 0.0, + "step": 44054 + }, + { + "epoch": 4.110758607819353, + "grad_norm": NaN, + "learning_rate": 7.16177988600213e-05, + "loss": 0.0, + "step": 44055 + }, + { + "epoch": 4.110851917514229, + "grad_norm": NaN, + "learning_rate": 7.161135000245649e-05, + "loss": 0.0, + "step": 44056 + }, + { + "epoch": 4.110945227209107, + "grad_norm": NaN, + "learning_rate": 7.160490134421089e-05, + "loss": 0.0, + "step": 44057 + }, + { + "epoch": 4.111038536903984, + "grad_norm": NaN, + "learning_rate": 7.159845288530084e-05, + "loss": 0.0, + "step": 44058 + }, + { + "epoch": 4.1111318465988616, + "grad_norm": NaN, + "learning_rate": 7.159200462574257e-05, + "loss": 0.0, + "step": 44059 + }, + { + "epoch": 4.111225156293739, + "grad_norm": NaN, + "learning_rate": 7.158555656555268e-05, + "loss": 0.0, + "step": 44060 + }, + { + "epoch": 4.111318465988616, + "grad_norm": NaN, + "learning_rate": 7.157910870474756e-05, + "loss": 0.0, + "step": 44061 + }, + { + "epoch": 4.111411775683494, + "grad_norm": NaN, + "learning_rate": 7.157266104334342e-05, + "loss": 0.0, + "step": 44062 + }, + { + "epoch": 4.111505085378371, + "grad_norm": NaN, + "learning_rate": 7.156621358135681e-05, + "loss": 0.0, + "step": 44063 + }, + { + "epoch": 4.111598395073248, + "grad_norm": NaN, + "learning_rate": 7.155976631880415e-05, + "loss": 0.0, + "step": 44064 + }, + { + "epoch": 4.111691704768125, + "grad_norm": NaN, + "learning_rate": 7.155331925570165e-05, + "loss": 0.0, + "step": 44065 + }, + { + "epoch": 4.111785014463003, + "grad_norm": NaN, + "learning_rate": 7.15468723920659e-05, + "loss": 0.0, + "step": 44066 + }, + { + "epoch": 4.11187832415788, + "grad_norm": NaN, + "learning_rate": 7.154042572791325e-05, + "loss": 0.0, + "step": 44067 + }, + { + "epoch": 4.111971633852757, + "grad_norm": NaN, + "learning_rate": 7.153397926325994e-05, + "loss": 0.0, + "step": 44068 + }, + { + "epoch": 4.112064943547635, + "grad_norm": NaN, + "learning_rate": 7.152753299812255e-05, + "loss": 0.0, + "step": 44069 + }, + { + "epoch": 4.112158253242512, + "grad_norm": NaN, + "learning_rate": 7.152108693251745e-05, + "loss": 0.0, + "step": 44070 + }, + { + "epoch": 4.112251562937389, + "grad_norm": NaN, + "learning_rate": 7.151464106646087e-05, + "loss": 0.0, + "step": 44071 + }, + { + "epoch": 4.112344872632266, + "grad_norm": NaN, + "learning_rate": 7.150819539996935e-05, + "loss": 0.0, + "step": 44072 + }, + { + "epoch": 4.112438182327144, + "grad_norm": NaN, + "learning_rate": 7.15017499330593e-05, + "loss": 0.0, + "step": 44073 + }, + { + "epoch": 4.112531492022021, + "grad_norm": NaN, + "learning_rate": 7.149530466574694e-05, + "loss": 0.0, + "step": 44074 + }, + { + "epoch": 4.1126248017168985, + "grad_norm": NaN, + "learning_rate": 7.148885959804882e-05, + "loss": 0.0, + "step": 44075 + }, + { + "epoch": 4.112718111411776, + "grad_norm": NaN, + "learning_rate": 7.148241472998133e-05, + "loss": 0.0, + "step": 44076 + }, + { + "epoch": 4.112811421106653, + "grad_norm": NaN, + "learning_rate": 7.147597006156068e-05, + "loss": 0.0, + "step": 44077 + }, + { + "epoch": 4.11290473080153, + "grad_norm": NaN, + "learning_rate": 7.146952559280344e-05, + "loss": 0.0, + "step": 44078 + }, + { + "epoch": 4.112998040496407, + "grad_norm": NaN, + "learning_rate": 7.1463081323726e-05, + "loss": 0.0, + "step": 44079 + }, + { + "epoch": 4.113091350191285, + "grad_norm": NaN, + "learning_rate": 7.145663725434455e-05, + "loss": 0.0, + "step": 44080 + }, + { + "epoch": 4.113184659886162, + "grad_norm": NaN, + "learning_rate": 7.145019338467572e-05, + "loss": 0.0, + "step": 44081 + }, + { + "epoch": 4.1132779695810395, + "grad_norm": NaN, + "learning_rate": 7.144374971473569e-05, + "loss": 0.0, + "step": 44082 + }, + { + "epoch": 4.113371279275917, + "grad_norm": NaN, + "learning_rate": 7.143730624454096e-05, + "loss": 0.0, + "step": 44083 + }, + { + "epoch": 4.113464588970794, + "grad_norm": NaN, + "learning_rate": 7.143086297410796e-05, + "loss": 0.0, + "step": 44084 + }, + { + "epoch": 4.113557898665672, + "grad_norm": NaN, + "learning_rate": 7.14244199034529e-05, + "loss": 0.0, + "step": 44085 + }, + { + "epoch": 4.113651208360548, + "grad_norm": NaN, + "learning_rate": 7.141797703259229e-05, + "loss": 0.0, + "step": 44086 + }, + { + "epoch": 4.113744518055426, + "grad_norm": NaN, + "learning_rate": 7.141153436154257e-05, + "loss": 0.0, + "step": 44087 + }, + { + "epoch": 4.113837827750303, + "grad_norm": NaN, + "learning_rate": 7.140509189031992e-05, + "loss": 0.0, + "step": 44088 + }, + { + "epoch": 4.113931137445181, + "grad_norm": NaN, + "learning_rate": 7.139864961894088e-05, + "loss": 0.0, + "step": 44089 + }, + { + "epoch": 4.114024447140058, + "grad_norm": NaN, + "learning_rate": 7.139220754742187e-05, + "loss": 0.0, + "step": 44090 + }, + { + "epoch": 4.114117756834935, + "grad_norm": NaN, + "learning_rate": 7.138576567577907e-05, + "loss": 0.0, + "step": 44091 + }, + { + "epoch": 4.114211066529813, + "grad_norm": NaN, + "learning_rate": 7.137932400402905e-05, + "loss": 0.0, + "step": 44092 + }, + { + "epoch": 4.114304376224689, + "grad_norm": NaN, + "learning_rate": 7.137288253218815e-05, + "loss": 0.0, + "step": 44093 + }, + { + "epoch": 4.114397685919567, + "grad_norm": NaN, + "learning_rate": 7.136644126027262e-05, + "loss": 0.0, + "step": 44094 + }, + { + "epoch": 4.114490995614444, + "grad_norm": NaN, + "learning_rate": 7.136000018829898e-05, + "loss": 0.0, + "step": 44095 + }, + { + "epoch": 4.114584305309322, + "grad_norm": NaN, + "learning_rate": 7.135355931628364e-05, + "loss": 0.0, + "step": 44096 + }, + { + "epoch": 4.114677615004199, + "grad_norm": NaN, + "learning_rate": 7.134711864424278e-05, + "loss": 0.0, + "step": 44097 + }, + { + "epoch": 4.1147709246990765, + "grad_norm": NaN, + "learning_rate": 7.134067817219295e-05, + "loss": 0.0, + "step": 44098 + }, + { + "epoch": 4.114864234393954, + "grad_norm": NaN, + "learning_rate": 7.133423790015054e-05, + "loss": 0.0, + "step": 44099 + }, + { + "epoch": 4.11495754408883, + "grad_norm": NaN, + "learning_rate": 7.132779782813174e-05, + "loss": 0.0, + "step": 44100 + }, + { + "epoch": 4.115050853783708, + "grad_norm": NaN, + "learning_rate": 7.132135795615311e-05, + "loss": 0.0, + "step": 44101 + }, + { + "epoch": 4.115144163478585, + "grad_norm": NaN, + "learning_rate": 7.131491828423102e-05, + "loss": 0.0, + "step": 44102 + }, + { + "epoch": 4.115237473173463, + "grad_norm": NaN, + "learning_rate": 7.130847881238166e-05, + "loss": 0.0, + "step": 44103 + }, + { + "epoch": 4.11533078286834, + "grad_norm": NaN, + "learning_rate": 7.13020395406216e-05, + "loss": 0.0, + "step": 44104 + }, + { + "epoch": 4.1154240925632175, + "grad_norm": NaN, + "learning_rate": 7.129560046896718e-05, + "loss": 0.0, + "step": 44105 + }, + { + "epoch": 4.115517402258095, + "grad_norm": NaN, + "learning_rate": 7.128916159743463e-05, + "loss": 0.0, + "step": 44106 + }, + { + "epoch": 4.115610711952972, + "grad_norm": NaN, + "learning_rate": 7.128272292604048e-05, + "loss": 0.0, + "step": 44107 + }, + { + "epoch": 4.115704021647849, + "grad_norm": NaN, + "learning_rate": 7.12762844548011e-05, + "loss": 0.0, + "step": 44108 + }, + { + "epoch": 4.115797331342726, + "grad_norm": NaN, + "learning_rate": 7.12698461837327e-05, + "loss": 0.0, + "step": 44109 + }, + { + "epoch": 4.115890641037604, + "grad_norm": NaN, + "learning_rate": 7.126340811285182e-05, + "loss": 0.0, + "step": 44110 + }, + { + "epoch": 4.115983950732481, + "grad_norm": NaN, + "learning_rate": 7.125697024217483e-05, + "loss": 0.0, + "step": 44111 + }, + { + "epoch": 4.1160772604273586, + "grad_norm": NaN, + "learning_rate": 7.125053257171791e-05, + "loss": 0.0, + "step": 44112 + }, + { + "epoch": 4.116170570122236, + "grad_norm": NaN, + "learning_rate": 7.124409510149762e-05, + "loss": 0.0, + "step": 44113 + }, + { + "epoch": 4.116263879817113, + "grad_norm": NaN, + "learning_rate": 7.123765783153033e-05, + "loss": 0.0, + "step": 44114 + }, + { + "epoch": 4.11635718951199, + "grad_norm": NaN, + "learning_rate": 7.123122076183225e-05, + "loss": 0.0, + "step": 44115 + }, + { + "epoch": 4.116450499206867, + "grad_norm": NaN, + "learning_rate": 7.122478389241988e-05, + "loss": 0.0, + "step": 44116 + }, + { + "epoch": 4.116543808901745, + "grad_norm": NaN, + "learning_rate": 7.12183472233096e-05, + "loss": 0.0, + "step": 44117 + }, + { + "epoch": 4.116637118596622, + "grad_norm": NaN, + "learning_rate": 7.121191075451763e-05, + "loss": 0.0, + "step": 44118 + }, + { + "epoch": 4.1167304282915, + "grad_norm": NaN, + "learning_rate": 7.120547448606048e-05, + "loss": 0.0, + "step": 44119 + }, + { + "epoch": 4.116823737986377, + "grad_norm": NaN, + "learning_rate": 7.119903841795445e-05, + "loss": 0.0, + "step": 44120 + }, + { + "epoch": 4.116917047681254, + "grad_norm": NaN, + "learning_rate": 7.119260255021593e-05, + "loss": 0.0, + "step": 44121 + }, + { + "epoch": 4.117010357376131, + "grad_norm": NaN, + "learning_rate": 7.118616688286128e-05, + "loss": 0.0, + "step": 44122 + }, + { + "epoch": 4.117103667071008, + "grad_norm": NaN, + "learning_rate": 7.117973141590688e-05, + "loss": 0.0, + "step": 44123 + }, + { + "epoch": 4.117196976765886, + "grad_norm": NaN, + "learning_rate": 7.117329614936906e-05, + "loss": 0.0, + "step": 44124 + }, + { + "epoch": 4.117290286460763, + "grad_norm": NaN, + "learning_rate": 7.11668610832642e-05, + "loss": 0.0, + "step": 44125 + }, + { + "epoch": 4.117383596155641, + "grad_norm": NaN, + "learning_rate": 7.116042621760865e-05, + "loss": 0.0, + "step": 44126 + }, + { + "epoch": 4.117476905850518, + "grad_norm": NaN, + "learning_rate": 7.115399155241879e-05, + "loss": 0.0, + "step": 44127 + }, + { + "epoch": 4.1175702155453955, + "grad_norm": NaN, + "learning_rate": 7.114755708771105e-05, + "loss": 0.0, + "step": 44128 + }, + { + "epoch": 4.117663525240273, + "grad_norm": NaN, + "learning_rate": 7.114112282350157e-05, + "loss": 0.0, + "step": 44129 + }, + { + "epoch": 4.117756834935149, + "grad_norm": NaN, + "learning_rate": 7.113468875980693e-05, + "loss": 0.0, + "step": 44130 + }, + { + "epoch": 4.117850144630027, + "grad_norm": NaN, + "learning_rate": 7.112825489664347e-05, + "loss": 0.0, + "step": 44131 + }, + { + "epoch": 4.117943454324904, + "grad_norm": NaN, + "learning_rate": 7.112182123402738e-05, + "loss": 0.0, + "step": 44132 + }, + { + "epoch": 4.118036764019782, + "grad_norm": NaN, + "learning_rate": 7.111538777197517e-05, + "loss": 0.0, + "step": 44133 + }, + { + "epoch": 4.118130073714659, + "grad_norm": NaN, + "learning_rate": 7.110895451050324e-05, + "loss": 0.0, + "step": 44134 + }, + { + "epoch": 4.1182233834095365, + "grad_norm": NaN, + "learning_rate": 7.110252144962775e-05, + "loss": 0.0, + "step": 44135 + }, + { + "epoch": 4.118316693104414, + "grad_norm": NaN, + "learning_rate": 7.109608858936524e-05, + "loss": 0.0, + "step": 44136 + }, + { + "epoch": 4.1184100027992905, + "grad_norm": NaN, + "learning_rate": 7.108965592973205e-05, + "loss": 0.0, + "step": 44137 + }, + { + "epoch": 4.118503312494168, + "grad_norm": NaN, + "learning_rate": 7.108322347074439e-05, + "loss": 0.0, + "step": 44138 + }, + { + "epoch": 4.118596622189045, + "grad_norm": NaN, + "learning_rate": 7.107679121241875e-05, + "loss": 0.0, + "step": 44139 + }, + { + "epoch": 4.118689931883923, + "grad_norm": NaN, + "learning_rate": 7.107035915477154e-05, + "loss": 0.0, + "step": 44140 + }, + { + "epoch": 4.1187832415788, + "grad_norm": NaN, + "learning_rate": 7.106392729781887e-05, + "loss": 0.0, + "step": 44141 + }, + { + "epoch": 4.118876551273678, + "grad_norm": NaN, + "learning_rate": 7.105749564157734e-05, + "loss": 0.0, + "step": 44142 + }, + { + "epoch": 4.118969860968555, + "grad_norm": NaN, + "learning_rate": 7.105106418606326e-05, + "loss": 0.0, + "step": 44143 + }, + { + "epoch": 4.1190631706634315, + "grad_norm": NaN, + "learning_rate": 7.104463293129283e-05, + "loss": 0.0, + "step": 44144 + }, + { + "epoch": 4.119156480358309, + "grad_norm": NaN, + "learning_rate": 7.103820187728256e-05, + "loss": 0.0, + "step": 44145 + }, + { + "epoch": 4.119249790053186, + "grad_norm": NaN, + "learning_rate": 7.103177102404881e-05, + "loss": 0.0, + "step": 44146 + }, + { + "epoch": 4.119343099748064, + "grad_norm": NaN, + "learning_rate": 7.102534037160776e-05, + "loss": 0.0, + "step": 44147 + }, + { + "epoch": 4.119436409442941, + "grad_norm": NaN, + "learning_rate": 7.101890991997593e-05, + "loss": 0.0, + "step": 44148 + }, + { + "epoch": 4.119529719137819, + "grad_norm": NaN, + "learning_rate": 7.10124796691697e-05, + "loss": 0.0, + "step": 44149 + }, + { + "epoch": 4.119623028832696, + "grad_norm": NaN, + "learning_rate": 7.100604961920518e-05, + "loss": 0.0, + "step": 44150 + }, + { + "epoch": 4.119716338527573, + "grad_norm": NaN, + "learning_rate": 7.099961977009897e-05, + "loss": 0.0, + "step": 44151 + }, + { + "epoch": 4.11980964822245, + "grad_norm": NaN, + "learning_rate": 7.099319012186735e-05, + "loss": 0.0, + "step": 44152 + }, + { + "epoch": 4.119902957917327, + "grad_norm": NaN, + "learning_rate": 7.098676067452656e-05, + "loss": 0.0, + "step": 44153 + }, + { + "epoch": 4.119996267612205, + "grad_norm": NaN, + "learning_rate": 7.098033142809307e-05, + "loss": 0.0, + "step": 44154 + }, + { + "epoch": 4.120089577307082, + "grad_norm": NaN, + "learning_rate": 7.097390238258325e-05, + "loss": 0.0, + "step": 44155 + }, + { + "epoch": 4.12018288700196, + "grad_norm": NaN, + "learning_rate": 7.096747353801329e-05, + "loss": 0.0, + "step": 44156 + }, + { + "epoch": 4.120276196696837, + "grad_norm": NaN, + "learning_rate": 7.096104489439965e-05, + "loss": 0.0, + "step": 44157 + }, + { + "epoch": 4.1203695063917145, + "grad_norm": NaN, + "learning_rate": 7.095461645175871e-05, + "loss": 0.0, + "step": 44158 + }, + { + "epoch": 4.120462816086591, + "grad_norm": NaN, + "learning_rate": 7.094818821010673e-05, + "loss": 0.0, + "step": 44159 + }, + { + "epoch": 4.1205561257814685, + "grad_norm": NaN, + "learning_rate": 7.094176016946009e-05, + "loss": 0.0, + "step": 44160 + }, + { + "epoch": 4.120649435476346, + "grad_norm": NaN, + "learning_rate": 7.093533232983516e-05, + "loss": 0.0, + "step": 44161 + }, + { + "epoch": 4.120742745171223, + "grad_norm": NaN, + "learning_rate": 7.092890469124825e-05, + "loss": 0.0, + "step": 44162 + }, + { + "epoch": 4.120836054866101, + "grad_norm": NaN, + "learning_rate": 7.092247725371571e-05, + "loss": 0.0, + "step": 44163 + }, + { + "epoch": 4.120929364560978, + "grad_norm": NaN, + "learning_rate": 7.091605001725387e-05, + "loss": 0.0, + "step": 44164 + }, + { + "epoch": 4.1210226742558556, + "grad_norm": NaN, + "learning_rate": 7.09096229818791e-05, + "loss": 0.0, + "step": 44165 + }, + { + "epoch": 4.121115983950732, + "grad_norm": NaN, + "learning_rate": 7.090319614760774e-05, + "loss": 0.0, + "step": 44166 + }, + { + "epoch": 4.1212092936456095, + "grad_norm": NaN, + "learning_rate": 7.089676951445611e-05, + "loss": 0.0, + "step": 44167 + }, + { + "epoch": 4.121302603340487, + "grad_norm": NaN, + "learning_rate": 7.089034308244057e-05, + "loss": 0.0, + "step": 44168 + }, + { + "epoch": 4.121395913035364, + "grad_norm": NaN, + "learning_rate": 7.088391685157747e-05, + "loss": 0.0, + "step": 44169 + }, + { + "epoch": 4.121489222730242, + "grad_norm": NaN, + "learning_rate": 7.08774908218831e-05, + "loss": 0.0, + "step": 44170 + }, + { + "epoch": 4.121582532425119, + "grad_norm": NaN, + "learning_rate": 7.087106499337383e-05, + "loss": 0.0, + "step": 44171 + }, + { + "epoch": 4.121675842119997, + "grad_norm": NaN, + "learning_rate": 7.08646393660661e-05, + "loss": 0.0, + "step": 44172 + }, + { + "epoch": 4.121769151814873, + "grad_norm": NaN, + "learning_rate": 7.085821393997601e-05, + "loss": 0.0, + "step": 44173 + }, + { + "epoch": 4.1218624615097506, + "grad_norm": NaN, + "learning_rate": 7.085178871512009e-05, + "loss": 0.0, + "step": 44174 + }, + { + "epoch": 4.121955771204628, + "grad_norm": NaN, + "learning_rate": 7.084536369151469e-05, + "loss": 0.0, + "step": 44175 + }, + { + "epoch": 4.122049080899505, + "grad_norm": NaN, + "learning_rate": 7.083893886917598e-05, + "loss": 0.0, + "step": 44176 + }, + { + "epoch": 4.122142390594383, + "grad_norm": NaN, + "learning_rate": 7.083251424812043e-05, + "loss": 0.0, + "step": 44177 + }, + { + "epoch": 4.12223570028926, + "grad_norm": NaN, + "learning_rate": 7.082608982836442e-05, + "loss": 0.0, + "step": 44178 + }, + { + "epoch": 4.122329009984138, + "grad_norm": NaN, + "learning_rate": 7.081966560992411e-05, + "loss": 0.0, + "step": 44179 + }, + { + "epoch": 4.122422319679015, + "grad_norm": NaN, + "learning_rate": 7.081324159281597e-05, + "loss": 0.0, + "step": 44180 + }, + { + "epoch": 4.122515629373892, + "grad_norm": NaN, + "learning_rate": 7.080681777705637e-05, + "loss": 0.0, + "step": 44181 + }, + { + "epoch": 4.122608939068769, + "grad_norm": NaN, + "learning_rate": 7.080039416266145e-05, + "loss": 0.0, + "step": 44182 + }, + { + "epoch": 4.122702248763646, + "grad_norm": NaN, + "learning_rate": 7.079397074964772e-05, + "loss": 0.0, + "step": 44183 + }, + { + "epoch": 4.122795558458524, + "grad_norm": NaN, + "learning_rate": 7.078754753803154e-05, + "loss": 0.0, + "step": 44184 + }, + { + "epoch": 4.122888868153401, + "grad_norm": NaN, + "learning_rate": 7.078112452782903e-05, + "loss": 0.0, + "step": 44185 + }, + { + "epoch": 4.122982177848279, + "grad_norm": NaN, + "learning_rate": 7.077470171905672e-05, + "loss": 0.0, + "step": 44186 + }, + { + "epoch": 4.123075487543156, + "grad_norm": NaN, + "learning_rate": 7.076827911173093e-05, + "loss": 0.0, + "step": 44187 + }, + { + "epoch": 4.123168797238033, + "grad_norm": NaN, + "learning_rate": 7.076185670586781e-05, + "loss": 0.0, + "step": 44188 + }, + { + "epoch": 4.12326210693291, + "grad_norm": NaN, + "learning_rate": 7.075543450148389e-05, + "loss": 0.0, + "step": 44189 + }, + { + "epoch": 4.1233554166277875, + "grad_norm": NaN, + "learning_rate": 7.074901249859547e-05, + "loss": 0.0, + "step": 44190 + }, + { + "epoch": 4.123448726322665, + "grad_norm": NaN, + "learning_rate": 7.074259069721871e-05, + "loss": 0.0, + "step": 44191 + }, + { + "epoch": 4.123542036017542, + "grad_norm": NaN, + "learning_rate": 7.073616909737015e-05, + "loss": 0.0, + "step": 44192 + }, + { + "epoch": 4.12363534571242, + "grad_norm": NaN, + "learning_rate": 7.072974769906607e-05, + "loss": 0.0, + "step": 44193 + }, + { + "epoch": 4.123728655407297, + "grad_norm": NaN, + "learning_rate": 7.072332650232266e-05, + "loss": 0.0, + "step": 44194 + }, + { + "epoch": 4.123821965102174, + "grad_norm": NaN, + "learning_rate": 7.071690550715639e-05, + "loss": 0.0, + "step": 44195 + }, + { + "epoch": 4.123915274797051, + "grad_norm": NaN, + "learning_rate": 7.071048471358355e-05, + "loss": 0.0, + "step": 44196 + }, + { + "epoch": 4.1240085844919285, + "grad_norm": NaN, + "learning_rate": 7.070406412162047e-05, + "loss": 0.0, + "step": 44197 + }, + { + "epoch": 4.124101894186806, + "grad_norm": NaN, + "learning_rate": 7.069764373128347e-05, + "loss": 0.0, + "step": 44198 + }, + { + "epoch": 4.124195203881683, + "grad_norm": NaN, + "learning_rate": 7.069122354258885e-05, + "loss": 0.0, + "step": 44199 + }, + { + "epoch": 4.124288513576561, + "grad_norm": NaN, + "learning_rate": 7.068480355555297e-05, + "loss": 0.0, + "step": 44200 + }, + { + "epoch": 4.124381823271438, + "grad_norm": NaN, + "learning_rate": 7.067838377019211e-05, + "loss": 0.0, + "step": 44201 + }, + { + "epoch": 4.124475132966316, + "grad_norm": NaN, + "learning_rate": 7.067196418652266e-05, + "loss": 0.0, + "step": 44202 + }, + { + "epoch": 4.124568442661192, + "grad_norm": NaN, + "learning_rate": 7.066554480456088e-05, + "loss": 0.0, + "step": 44203 + }, + { + "epoch": 4.12466175235607, + "grad_norm": NaN, + "learning_rate": 7.065912562432315e-05, + "loss": 0.0, + "step": 44204 + }, + { + "epoch": 4.124755062050947, + "grad_norm": NaN, + "learning_rate": 7.065270664582573e-05, + "loss": 0.0, + "step": 44205 + }, + { + "epoch": 4.124848371745824, + "grad_norm": NaN, + "learning_rate": 7.064628786908499e-05, + "loss": 0.0, + "step": 44206 + }, + { + "epoch": 4.124941681440702, + "grad_norm": NaN, + "learning_rate": 7.063986929411723e-05, + "loss": 0.0, + "step": 44207 + }, + { + "epoch": 4.125034991135579, + "grad_norm": NaN, + "learning_rate": 7.063345092093877e-05, + "loss": 0.0, + "step": 44208 + }, + { + "epoch": 4.125128300830457, + "grad_norm": NaN, + "learning_rate": 7.062703274956593e-05, + "loss": 0.0, + "step": 44209 + }, + { + "epoch": 4.125221610525333, + "grad_norm": NaN, + "learning_rate": 7.062061478001503e-05, + "loss": 0.0, + "step": 44210 + }, + { + "epoch": 4.125314920220211, + "grad_norm": NaN, + "learning_rate": 7.061419701230241e-05, + "loss": 0.0, + "step": 44211 + }, + { + "epoch": 4.125408229915088, + "grad_norm": NaN, + "learning_rate": 7.060777944644437e-05, + "loss": 0.0, + "step": 44212 + }, + { + "epoch": 4.1255015396099655, + "grad_norm": NaN, + "learning_rate": 7.060136208245722e-05, + "loss": 0.0, + "step": 44213 + }, + { + "epoch": 4.125594849304843, + "grad_norm": NaN, + "learning_rate": 7.05949449203573e-05, + "loss": 0.0, + "step": 44214 + }, + { + "epoch": 4.12568815899972, + "grad_norm": NaN, + "learning_rate": 7.05885279601609e-05, + "loss": 0.0, + "step": 44215 + }, + { + "epoch": 4.125781468694598, + "grad_norm": NaN, + "learning_rate": 7.058211120188441e-05, + "loss": 0.0, + "step": 44216 + }, + { + "epoch": 4.125874778389474, + "grad_norm": NaN, + "learning_rate": 7.057569464554396e-05, + "loss": 0.0, + "step": 44217 + }, + { + "epoch": 4.125968088084352, + "grad_norm": NaN, + "learning_rate": 7.056927829115608e-05, + "loss": 0.0, + "step": 44218 + }, + { + "epoch": 4.126061397779229, + "grad_norm": NaN, + "learning_rate": 7.056286213873704e-05, + "loss": 0.0, + "step": 44219 + }, + { + "epoch": 4.1261547074741065, + "grad_norm": NaN, + "learning_rate": 7.055644618830297e-05, + "loss": 0.0, + "step": 44220 + }, + { + "epoch": 4.126248017168984, + "grad_norm": NaN, + "learning_rate": 7.055003043987041e-05, + "loss": 0.0, + "step": 44221 + }, + { + "epoch": 4.126341326863861, + "grad_norm": NaN, + "learning_rate": 7.054361489345564e-05, + "loss": 0.0, + "step": 44222 + }, + { + "epoch": 4.126434636558739, + "grad_norm": NaN, + "learning_rate": 7.05371995490748e-05, + "loss": 0.0, + "step": 44223 + }, + { + "epoch": 4.126527946253615, + "grad_norm": NaN, + "learning_rate": 7.053078440674438e-05, + "loss": 0.0, + "step": 44224 + }, + { + "epoch": 4.126621255948493, + "grad_norm": NaN, + "learning_rate": 7.052436946648069e-05, + "loss": 0.0, + "step": 44225 + }, + { + "epoch": 4.12671456564337, + "grad_norm": NaN, + "learning_rate": 7.051795472829988e-05, + "loss": 0.0, + "step": 44226 + }, + { + "epoch": 4.1268078753382476, + "grad_norm": NaN, + "learning_rate": 7.051154019221842e-05, + "loss": 0.0, + "step": 44227 + }, + { + "epoch": 4.126901185033125, + "grad_norm": NaN, + "learning_rate": 7.050512585825263e-05, + "loss": 0.0, + "step": 44228 + }, + { + "epoch": 4.126994494728002, + "grad_norm": NaN, + "learning_rate": 7.049871172641865e-05, + "loss": 0.0, + "step": 44229 + }, + { + "epoch": 4.12708780442288, + "grad_norm": NaN, + "learning_rate": 7.049229779673293e-05, + "loss": 0.0, + "step": 44230 + }, + { + "epoch": 4.127181114117757, + "grad_norm": NaN, + "learning_rate": 7.048588406921176e-05, + "loss": 0.0, + "step": 44231 + }, + { + "epoch": 4.127274423812634, + "grad_norm": NaN, + "learning_rate": 7.047947054387142e-05, + "loss": 0.0, + "step": 44232 + }, + { + "epoch": 4.127367733507511, + "grad_norm": NaN, + "learning_rate": 7.047305722072825e-05, + "loss": 0.0, + "step": 44233 + }, + { + "epoch": 4.127461043202389, + "grad_norm": NaN, + "learning_rate": 7.046664409979852e-05, + "loss": 0.0, + "step": 44234 + }, + { + "epoch": 4.127554352897266, + "grad_norm": NaN, + "learning_rate": 7.046023118109856e-05, + "loss": 0.0, + "step": 44235 + }, + { + "epoch": 4.127647662592143, + "grad_norm": NaN, + "learning_rate": 7.045381846464468e-05, + "loss": 0.0, + "step": 44236 + }, + { + "epoch": 4.127740972287021, + "grad_norm": NaN, + "learning_rate": 7.044740595045316e-05, + "loss": 0.0, + "step": 44237 + }, + { + "epoch": 4.127834281981898, + "grad_norm": NaN, + "learning_rate": 7.044099363854033e-05, + "loss": 0.0, + "step": 44238 + }, + { + "epoch": 4.127927591676775, + "grad_norm": NaN, + "learning_rate": 7.043458152892249e-05, + "loss": 0.0, + "step": 44239 + }, + { + "epoch": 4.128020901371652, + "grad_norm": NaN, + "learning_rate": 7.042816962161593e-05, + "loss": 0.0, + "step": 44240 + }, + { + "epoch": 4.12811421106653, + "grad_norm": NaN, + "learning_rate": 7.042175791663697e-05, + "loss": 0.0, + "step": 44241 + }, + { + "epoch": 4.128207520761407, + "grad_norm": NaN, + "learning_rate": 7.041534641400192e-05, + "loss": 0.0, + "step": 44242 + }, + { + "epoch": 4.1283008304562845, + "grad_norm": NaN, + "learning_rate": 7.040893511372706e-05, + "loss": 0.0, + "step": 44243 + }, + { + "epoch": 4.128394140151162, + "grad_norm": NaN, + "learning_rate": 7.040252401582869e-05, + "loss": 0.0, + "step": 44244 + }, + { + "epoch": 4.128487449846039, + "grad_norm": NaN, + "learning_rate": 7.039611312032313e-05, + "loss": 0.0, + "step": 44245 + }, + { + "epoch": 4.128580759540917, + "grad_norm": NaN, + "learning_rate": 7.038970242722667e-05, + "loss": 0.0, + "step": 44246 + }, + { + "epoch": 4.128674069235793, + "grad_norm": NaN, + "learning_rate": 7.038329193655563e-05, + "loss": 0.0, + "step": 44247 + }, + { + "epoch": 4.128767378930671, + "grad_norm": NaN, + "learning_rate": 7.037688164832628e-05, + "loss": 0.0, + "step": 44248 + }, + { + "epoch": 4.128860688625548, + "grad_norm": NaN, + "learning_rate": 7.037047156255495e-05, + "loss": 0.0, + "step": 44249 + }, + { + "epoch": 4.1289539983204255, + "grad_norm": NaN, + "learning_rate": 7.03640616792579e-05, + "loss": 0.0, + "step": 44250 + }, + { + "epoch": 4.129047308015303, + "grad_norm": NaN, + "learning_rate": 7.035765199845146e-05, + "loss": 0.0, + "step": 44251 + }, + { + "epoch": 4.12914061771018, + "grad_norm": NaN, + "learning_rate": 7.035124252015193e-05, + "loss": 0.0, + "step": 44252 + }, + { + "epoch": 4.129233927405058, + "grad_norm": NaN, + "learning_rate": 7.034483324437559e-05, + "loss": 0.0, + "step": 44253 + }, + { + "epoch": 4.129327237099934, + "grad_norm": NaN, + "learning_rate": 7.033842417113873e-05, + "loss": 0.0, + "step": 44254 + }, + { + "epoch": 4.129420546794812, + "grad_norm": NaN, + "learning_rate": 7.033201530045767e-05, + "loss": 0.0, + "step": 44255 + }, + { + "epoch": 4.129513856489689, + "grad_norm": NaN, + "learning_rate": 7.03256066323487e-05, + "loss": 0.0, + "step": 44256 + }, + { + "epoch": 4.129607166184567, + "grad_norm": NaN, + "learning_rate": 7.03191981668281e-05, + "loss": 0.0, + "step": 44257 + }, + { + "epoch": 4.129700475879444, + "grad_norm": NaN, + "learning_rate": 7.031278990391219e-05, + "loss": 0.0, + "step": 44258 + }, + { + "epoch": 4.129793785574321, + "grad_norm": NaN, + "learning_rate": 7.030638184361725e-05, + "loss": 0.0, + "step": 44259 + }, + { + "epoch": 4.129887095269199, + "grad_norm": NaN, + "learning_rate": 7.029997398595955e-05, + "loss": 0.0, + "step": 44260 + }, + { + "epoch": 4.129980404964075, + "grad_norm": NaN, + "learning_rate": 7.029356633095542e-05, + "loss": 0.0, + "step": 44261 + }, + { + "epoch": 4.130073714658953, + "grad_norm": NaN, + "learning_rate": 7.028715887862116e-05, + "loss": 0.0, + "step": 44262 + }, + { + "epoch": 4.13016702435383, + "grad_norm": NaN, + "learning_rate": 7.028075162897308e-05, + "loss": 0.0, + "step": 44263 + }, + { + "epoch": 4.130260334048708, + "grad_norm": NaN, + "learning_rate": 7.027434458202733e-05, + "loss": 0.0, + "step": 44264 + }, + { + "epoch": 4.130353643743585, + "grad_norm": NaN, + "learning_rate": 7.026793773780034e-05, + "loss": 0.0, + "step": 44265 + }, + { + "epoch": 4.1304469534384625, + "grad_norm": NaN, + "learning_rate": 7.026153109630844e-05, + "loss": 0.0, + "step": 44266 + }, + { + "epoch": 4.13054026313334, + "grad_norm": NaN, + "learning_rate": 7.025512465756774e-05, + "loss": 0.0, + "step": 44267 + }, + { + "epoch": 4.130633572828216, + "grad_norm": NaN, + "learning_rate": 7.024871842159468e-05, + "loss": 0.0, + "step": 44268 + }, + { + "epoch": 4.130726882523094, + "grad_norm": NaN, + "learning_rate": 7.02423123884055e-05, + "loss": 0.0, + "step": 44269 + }, + { + "epoch": 4.130820192217971, + "grad_norm": NaN, + "learning_rate": 7.02359065580165e-05, + "loss": 0.0, + "step": 44270 + }, + { + "epoch": 4.130913501912849, + "grad_norm": NaN, + "learning_rate": 7.022950093044397e-05, + "loss": 0.0, + "step": 44271 + }, + { + "epoch": 4.131006811607726, + "grad_norm": NaN, + "learning_rate": 7.022309550570418e-05, + "loss": 0.0, + "step": 44272 + }, + { + "epoch": 4.1311001213026035, + "grad_norm": NaN, + "learning_rate": 7.021669028381343e-05, + "loss": 0.0, + "step": 44273 + }, + { + "epoch": 4.131193430997481, + "grad_norm": NaN, + "learning_rate": 7.021028526478801e-05, + "loss": 0.0, + "step": 44274 + }, + { + "epoch": 4.131286740692358, + "grad_norm": NaN, + "learning_rate": 7.02038804486442e-05, + "loss": 0.0, + "step": 44275 + }, + { + "epoch": 4.131380050387235, + "grad_norm": NaN, + "learning_rate": 7.019747583539827e-05, + "loss": 0.0, + "step": 44276 + }, + { + "epoch": 4.131473360082112, + "grad_norm": NaN, + "learning_rate": 7.019107142506652e-05, + "loss": 0.0, + "step": 44277 + }, + { + "epoch": 4.13156666977699, + "grad_norm": NaN, + "learning_rate": 7.018466721766525e-05, + "loss": 0.0, + "step": 44278 + }, + { + "epoch": 4.131659979471867, + "grad_norm": NaN, + "learning_rate": 7.017826321321073e-05, + "loss": 0.0, + "step": 44279 + }, + { + "epoch": 4.131753289166745, + "grad_norm": NaN, + "learning_rate": 7.017185941171923e-05, + "loss": 0.0, + "step": 44280 + }, + { + "epoch": 4.131846598861622, + "grad_norm": NaN, + "learning_rate": 7.016545581320703e-05, + "loss": 0.0, + "step": 44281 + }, + { + "epoch": 4.131939908556499, + "grad_norm": NaN, + "learning_rate": 7.015905241769044e-05, + "loss": 0.0, + "step": 44282 + }, + { + "epoch": 4.132033218251376, + "grad_norm": NaN, + "learning_rate": 7.015264922518574e-05, + "loss": 0.0, + "step": 44283 + }, + { + "epoch": 4.132126527946253, + "grad_norm": NaN, + "learning_rate": 7.014624623570919e-05, + "loss": 0.0, + "step": 44284 + }, + { + "epoch": 4.132219837641131, + "grad_norm": NaN, + "learning_rate": 7.013984344927708e-05, + "loss": 0.0, + "step": 44285 + }, + { + "epoch": 4.132313147336008, + "grad_norm": NaN, + "learning_rate": 7.01334408659057e-05, + "loss": 0.0, + "step": 44286 + }, + { + "epoch": 4.132406457030886, + "grad_norm": NaN, + "learning_rate": 7.01270384856113e-05, + "loss": 0.0, + "step": 44287 + }, + { + "epoch": 4.132499766725763, + "grad_norm": NaN, + "learning_rate": 7.01206363084102e-05, + "loss": 0.0, + "step": 44288 + }, + { + "epoch": 4.13259307642064, + "grad_norm": NaN, + "learning_rate": 7.011423433431867e-05, + "loss": 0.0, + "step": 44289 + }, + { + "epoch": 4.132686386115517, + "grad_norm": NaN, + "learning_rate": 7.010783256335297e-05, + "loss": 0.0, + "step": 44290 + }, + { + "epoch": 4.132779695810394, + "grad_norm": NaN, + "learning_rate": 7.010143099552937e-05, + "loss": 0.0, + "step": 44291 + }, + { + "epoch": 4.132873005505272, + "grad_norm": NaN, + "learning_rate": 7.009502963086418e-05, + "loss": 0.0, + "step": 44292 + }, + { + "epoch": 4.132966315200149, + "grad_norm": NaN, + "learning_rate": 7.008862846937367e-05, + "loss": 0.0, + "step": 44293 + }, + { + "epoch": 4.133059624895027, + "grad_norm": NaN, + "learning_rate": 7.008222751107408e-05, + "loss": 0.0, + "step": 44294 + }, + { + "epoch": 4.133152934589904, + "grad_norm": NaN, + "learning_rate": 7.007582675598173e-05, + "loss": 0.0, + "step": 44295 + }, + { + "epoch": 4.1332462442847815, + "grad_norm": NaN, + "learning_rate": 7.006942620411286e-05, + "loss": 0.0, + "step": 44296 + }, + { + "epoch": 4.133339553979659, + "grad_norm": NaN, + "learning_rate": 7.006302585548378e-05, + "loss": 0.0, + "step": 44297 + }, + { + "epoch": 4.133432863674535, + "grad_norm": NaN, + "learning_rate": 7.005662571011074e-05, + "loss": 0.0, + "step": 44298 + }, + { + "epoch": 4.133526173369413, + "grad_norm": NaN, + "learning_rate": 7.005022576801002e-05, + "loss": 0.0, + "step": 44299 + }, + { + "epoch": 4.13361948306429, + "grad_norm": NaN, + "learning_rate": 7.004382602919789e-05, + "loss": 0.0, + "step": 44300 + }, + { + "epoch": 4.133712792759168, + "grad_norm": NaN, + "learning_rate": 7.003742649369063e-05, + "loss": 0.0, + "step": 44301 + }, + { + "epoch": 4.133806102454045, + "grad_norm": NaN, + "learning_rate": 7.003102716150451e-05, + "loss": 0.0, + "step": 44302 + }, + { + "epoch": 4.1338994121489225, + "grad_norm": NaN, + "learning_rate": 7.002462803265578e-05, + "loss": 0.0, + "step": 44303 + }, + { + "epoch": 4.1339927218438, + "grad_norm": NaN, + "learning_rate": 7.001822910716075e-05, + "loss": 0.0, + "step": 44304 + }, + { + "epoch": 4.1340860315386765, + "grad_norm": NaN, + "learning_rate": 7.001183038503567e-05, + "loss": 0.0, + "step": 44305 + }, + { + "epoch": 4.134179341233554, + "grad_norm": NaN, + "learning_rate": 7.00054318662968e-05, + "loss": 0.0, + "step": 44306 + }, + { + "epoch": 4.134272650928431, + "grad_norm": NaN, + "learning_rate": 6.999903355096044e-05, + "loss": 0.0, + "step": 44307 + }, + { + "epoch": 4.134365960623309, + "grad_norm": NaN, + "learning_rate": 6.999263543904283e-05, + "loss": 0.0, + "step": 44308 + }, + { + "epoch": 4.134459270318186, + "grad_norm": NaN, + "learning_rate": 6.998623753056025e-05, + "loss": 0.0, + "step": 44309 + }, + { + "epoch": 4.134552580013064, + "grad_norm": NaN, + "learning_rate": 6.997983982552895e-05, + "loss": 0.0, + "step": 44310 + }, + { + "epoch": 4.134645889707941, + "grad_norm": NaN, + "learning_rate": 6.997344232396522e-05, + "loss": 0.0, + "step": 44311 + }, + { + "epoch": 4.1347391994028175, + "grad_norm": NaN, + "learning_rate": 6.996704502588533e-05, + "loss": 0.0, + "step": 44312 + }, + { + "epoch": 4.134832509097695, + "grad_norm": NaN, + "learning_rate": 6.996064793130555e-05, + "loss": 0.0, + "step": 44313 + }, + { + "epoch": 4.134925818792572, + "grad_norm": NaN, + "learning_rate": 6.99542510402421e-05, + "loss": 0.0, + "step": 44314 + }, + { + "epoch": 4.13501912848745, + "grad_norm": NaN, + "learning_rate": 6.994785435271128e-05, + "loss": 0.0, + "step": 44315 + }, + { + "epoch": 4.135112438182327, + "grad_norm": NaN, + "learning_rate": 6.99414578687294e-05, + "loss": 0.0, + "step": 44316 + }, + { + "epoch": 4.135205747877205, + "grad_norm": NaN, + "learning_rate": 6.993506158831263e-05, + "loss": 0.0, + "step": 44317 + }, + { + "epoch": 4.135299057572082, + "grad_norm": NaN, + "learning_rate": 6.99286655114773e-05, + "loss": 0.0, + "step": 44318 + }, + { + "epoch": 4.1353923672669595, + "grad_norm": NaN, + "learning_rate": 6.992226963823966e-05, + "loss": 0.0, + "step": 44319 + }, + { + "epoch": 4.135485676961836, + "grad_norm": NaN, + "learning_rate": 6.991587396861595e-05, + "loss": 0.0, + "step": 44320 + }, + { + "epoch": 4.135578986656713, + "grad_norm": NaN, + "learning_rate": 6.990947850262245e-05, + "loss": 0.0, + "step": 44321 + }, + { + "epoch": 4.135672296351591, + "grad_norm": NaN, + "learning_rate": 6.990308324027543e-05, + "loss": 0.0, + "step": 44322 + }, + { + "epoch": 4.135765606046468, + "grad_norm": NaN, + "learning_rate": 6.989668818159114e-05, + "loss": 0.0, + "step": 44323 + }, + { + "epoch": 4.135858915741346, + "grad_norm": NaN, + "learning_rate": 6.989029332658585e-05, + "loss": 0.0, + "step": 44324 + }, + { + "epoch": 4.135952225436223, + "grad_norm": NaN, + "learning_rate": 6.98838986752758e-05, + "loss": 0.0, + "step": 44325 + }, + { + "epoch": 4.1360455351311005, + "grad_norm": NaN, + "learning_rate": 6.987750422767728e-05, + "loss": 0.0, + "step": 44326 + }, + { + "epoch": 4.136138844825977, + "grad_norm": NaN, + "learning_rate": 6.987110998380652e-05, + "loss": 0.0, + "step": 44327 + }, + { + "epoch": 4.1362321545208545, + "grad_norm": NaN, + "learning_rate": 6.986471594367981e-05, + "loss": 0.0, + "step": 44328 + }, + { + "epoch": 4.136325464215732, + "grad_norm": NaN, + "learning_rate": 6.985832210731339e-05, + "loss": 0.0, + "step": 44329 + }, + { + "epoch": 4.136418773910609, + "grad_norm": NaN, + "learning_rate": 6.985192847472349e-05, + "loss": 0.0, + "step": 44330 + }, + { + "epoch": 4.136512083605487, + "grad_norm": NaN, + "learning_rate": 6.984553504592641e-05, + "loss": 0.0, + "step": 44331 + }, + { + "epoch": 4.136605393300364, + "grad_norm": NaN, + "learning_rate": 6.98391418209384e-05, + "loss": 0.0, + "step": 44332 + }, + { + "epoch": 4.136698702995242, + "grad_norm": NaN, + "learning_rate": 6.983274879977569e-05, + "loss": 0.0, + "step": 44333 + }, + { + "epoch": 4.136792012690118, + "grad_norm": NaN, + "learning_rate": 6.982635598245455e-05, + "loss": 0.0, + "step": 44334 + }, + { + "epoch": 4.1368853223849955, + "grad_norm": NaN, + "learning_rate": 6.981996336899126e-05, + "loss": 0.0, + "step": 44335 + }, + { + "epoch": 4.136978632079873, + "grad_norm": NaN, + "learning_rate": 6.981357095940202e-05, + "loss": 0.0, + "step": 44336 + }, + { + "epoch": 4.13707194177475, + "grad_norm": NaN, + "learning_rate": 6.980717875370316e-05, + "loss": 0.0, + "step": 44337 + }, + { + "epoch": 4.137165251469628, + "grad_norm": NaN, + "learning_rate": 6.980078675191086e-05, + "loss": 0.0, + "step": 44338 + }, + { + "epoch": 4.137258561164505, + "grad_norm": NaN, + "learning_rate": 6.97943949540414e-05, + "loss": 0.0, + "step": 44339 + }, + { + "epoch": 4.137351870859383, + "grad_norm": NaN, + "learning_rate": 6.978800336011105e-05, + "loss": 0.0, + "step": 44340 + }, + { + "epoch": 4.137445180554259, + "grad_norm": NaN, + "learning_rate": 6.978161197013603e-05, + "loss": 0.0, + "step": 44341 + }, + { + "epoch": 4.137538490249137, + "grad_norm": NaN, + "learning_rate": 6.977522078413261e-05, + "loss": 0.0, + "step": 44342 + }, + { + "epoch": 4.137631799944014, + "grad_norm": NaN, + "learning_rate": 6.976882980211706e-05, + "loss": 0.0, + "step": 44343 + }, + { + "epoch": 4.137725109638891, + "grad_norm": NaN, + "learning_rate": 6.976243902410559e-05, + "loss": 0.0, + "step": 44344 + }, + { + "epoch": 4.137818419333769, + "grad_norm": NaN, + "learning_rate": 6.975604845011449e-05, + "loss": 0.0, + "step": 44345 + }, + { + "epoch": 4.137911729028646, + "grad_norm": NaN, + "learning_rate": 6.974965808015996e-05, + "loss": 0.0, + "step": 44346 + }, + { + "epoch": 4.138005038723524, + "grad_norm": NaN, + "learning_rate": 6.974326791425828e-05, + "loss": 0.0, + "step": 44347 + }, + { + "epoch": 4.138098348418401, + "grad_norm": NaN, + "learning_rate": 6.973687795242572e-05, + "loss": 0.0, + "step": 44348 + }, + { + "epoch": 4.138191658113278, + "grad_norm": NaN, + "learning_rate": 6.973048819467849e-05, + "loss": 0.0, + "step": 44349 + }, + { + "epoch": 4.138284967808155, + "grad_norm": NaN, + "learning_rate": 6.972409864103285e-05, + "loss": 0.0, + "step": 44350 + }, + { + "epoch": 4.138378277503032, + "grad_norm": NaN, + "learning_rate": 6.971770929150506e-05, + "loss": 0.0, + "step": 44351 + }, + { + "epoch": 4.13847158719791, + "grad_norm": NaN, + "learning_rate": 6.971132014611133e-05, + "loss": 0.0, + "step": 44352 + }, + { + "epoch": 4.138564896892787, + "grad_norm": NaN, + "learning_rate": 6.970493120486796e-05, + "loss": 0.0, + "step": 44353 + }, + { + "epoch": 4.138658206587665, + "grad_norm": NaN, + "learning_rate": 6.969854246779114e-05, + "loss": 0.0, + "step": 44354 + }, + { + "epoch": 4.138751516282542, + "grad_norm": NaN, + "learning_rate": 6.969215393489716e-05, + "loss": 0.0, + "step": 44355 + }, + { + "epoch": 4.138844825977419, + "grad_norm": NaN, + "learning_rate": 6.968576560620221e-05, + "loss": 0.0, + "step": 44356 + }, + { + "epoch": 4.138938135672296, + "grad_norm": NaN, + "learning_rate": 6.96793774817226e-05, + "loss": 0.0, + "step": 44357 + }, + { + "epoch": 4.1390314453671735, + "grad_norm": NaN, + "learning_rate": 6.967298956147452e-05, + "loss": 0.0, + "step": 44358 + }, + { + "epoch": 4.139124755062051, + "grad_norm": NaN, + "learning_rate": 6.966660184547425e-05, + "loss": 0.0, + "step": 44359 + }, + { + "epoch": 4.139218064756928, + "grad_norm": NaN, + "learning_rate": 6.966021433373801e-05, + "loss": 0.0, + "step": 44360 + }, + { + "epoch": 4.139311374451806, + "grad_norm": NaN, + "learning_rate": 6.965382702628204e-05, + "loss": 0.0, + "step": 44361 + }, + { + "epoch": 4.139404684146683, + "grad_norm": NaN, + "learning_rate": 6.96474399231226e-05, + "loss": 0.0, + "step": 44362 + }, + { + "epoch": 4.139497993841561, + "grad_norm": NaN, + "learning_rate": 6.96410530242759e-05, + "loss": 0.0, + "step": 44363 + }, + { + "epoch": 4.139591303536437, + "grad_norm": NaN, + "learning_rate": 6.963466632975822e-05, + "loss": 0.0, + "step": 44364 + }, + { + "epoch": 4.1396846132313145, + "grad_norm": NaN, + "learning_rate": 6.962827983958575e-05, + "loss": 0.0, + "step": 44365 + }, + { + "epoch": 4.139777922926192, + "grad_norm": NaN, + "learning_rate": 6.962189355377477e-05, + "loss": 0.0, + "step": 44366 + }, + { + "epoch": 4.139871232621069, + "grad_norm": NaN, + "learning_rate": 6.961550747234151e-05, + "loss": 0.0, + "step": 44367 + }, + { + "epoch": 4.139964542315947, + "grad_norm": NaN, + "learning_rate": 6.960912159530219e-05, + "loss": 0.0, + "step": 44368 + }, + { + "epoch": 4.140057852010824, + "grad_norm": NaN, + "learning_rate": 6.960273592267306e-05, + "loss": 0.0, + "step": 44369 + }, + { + "epoch": 4.140151161705702, + "grad_norm": NaN, + "learning_rate": 6.959635045447039e-05, + "loss": 0.0, + "step": 44370 + }, + { + "epoch": 4.140244471400578, + "grad_norm": NaN, + "learning_rate": 6.958996519071036e-05, + "loss": 0.0, + "step": 44371 + }, + { + "epoch": 4.140337781095456, + "grad_norm": NaN, + "learning_rate": 6.958358013140922e-05, + "loss": 0.0, + "step": 44372 + }, + { + "epoch": 4.140431090790333, + "grad_norm": NaN, + "learning_rate": 6.957719527658322e-05, + "loss": 0.0, + "step": 44373 + }, + { + "epoch": 4.14052440048521, + "grad_norm": NaN, + "learning_rate": 6.957081062624861e-05, + "loss": 0.0, + "step": 44374 + }, + { + "epoch": 4.140617710180088, + "grad_norm": NaN, + "learning_rate": 6.956442618042157e-05, + "loss": 0.0, + "step": 44375 + }, + { + "epoch": 4.140711019874965, + "grad_norm": NaN, + "learning_rate": 6.95580419391184e-05, + "loss": 0.0, + "step": 44376 + }, + { + "epoch": 4.140804329569843, + "grad_norm": NaN, + "learning_rate": 6.955165790235527e-05, + "loss": 0.0, + "step": 44377 + }, + { + "epoch": 4.140897639264719, + "grad_norm": NaN, + "learning_rate": 6.954527407014847e-05, + "loss": 0.0, + "step": 44378 + }, + { + "epoch": 4.140990948959597, + "grad_norm": NaN, + "learning_rate": 6.95388904425142e-05, + "loss": 0.0, + "step": 44379 + }, + { + "epoch": 4.141084258654474, + "grad_norm": NaN, + "learning_rate": 6.953250701946868e-05, + "loss": 0.0, + "step": 44380 + }, + { + "epoch": 4.1411775683493515, + "grad_norm": NaN, + "learning_rate": 6.952612380102817e-05, + "loss": 0.0, + "step": 44381 + }, + { + "epoch": 4.141270878044229, + "grad_norm": NaN, + "learning_rate": 6.951974078720889e-05, + "loss": 0.0, + "step": 44382 + }, + { + "epoch": 4.141364187739106, + "grad_norm": NaN, + "learning_rate": 6.951335797802707e-05, + "loss": 0.0, + "step": 44383 + }, + { + "epoch": 4.141457497433984, + "grad_norm": NaN, + "learning_rate": 6.950697537349894e-05, + "loss": 0.0, + "step": 44384 + }, + { + "epoch": 4.14155080712886, + "grad_norm": NaN, + "learning_rate": 6.950059297364073e-05, + "loss": 0.0, + "step": 44385 + }, + { + "epoch": 4.141644116823738, + "grad_norm": NaN, + "learning_rate": 6.949421077846866e-05, + "loss": 0.0, + "step": 44386 + }, + { + "epoch": 4.141737426518615, + "grad_norm": NaN, + "learning_rate": 6.948782878799896e-05, + "loss": 0.0, + "step": 44387 + }, + { + "epoch": 4.1418307362134925, + "grad_norm": NaN, + "learning_rate": 6.948144700224786e-05, + "loss": 0.0, + "step": 44388 + }, + { + "epoch": 4.14192404590837, + "grad_norm": NaN, + "learning_rate": 6.94750654212316e-05, + "loss": 0.0, + "step": 44389 + }, + { + "epoch": 4.142017355603247, + "grad_norm": NaN, + "learning_rate": 6.946868404496639e-05, + "loss": 0.0, + "step": 44390 + }, + { + "epoch": 4.142110665298125, + "grad_norm": NaN, + "learning_rate": 6.946230287346846e-05, + "loss": 0.0, + "step": 44391 + }, + { + "epoch": 4.142203974993002, + "grad_norm": NaN, + "learning_rate": 6.945592190675404e-05, + "loss": 0.0, + "step": 44392 + }, + { + "epoch": 4.142297284687879, + "grad_norm": NaN, + "learning_rate": 6.944954114483935e-05, + "loss": 0.0, + "step": 44393 + }, + { + "epoch": 4.142390594382756, + "grad_norm": NaN, + "learning_rate": 6.944316058774062e-05, + "loss": 0.0, + "step": 44394 + }, + { + "epoch": 4.142483904077634, + "grad_norm": NaN, + "learning_rate": 6.943678023547408e-05, + "loss": 0.0, + "step": 44395 + }, + { + "epoch": 4.142577213772511, + "grad_norm": NaN, + "learning_rate": 6.943040008805592e-05, + "loss": 0.0, + "step": 44396 + }, + { + "epoch": 4.142670523467388, + "grad_norm": NaN, + "learning_rate": 6.94240201455024e-05, + "loss": 0.0, + "step": 44397 + }, + { + "epoch": 4.142763833162266, + "grad_norm": NaN, + "learning_rate": 6.941764040782973e-05, + "loss": 0.0, + "step": 44398 + }, + { + "epoch": 4.142857142857143, + "grad_norm": NaN, + "learning_rate": 6.941126087505412e-05, + "loss": 0.0, + "step": 44399 + }, + { + "epoch": 4.14295045255202, + "grad_norm": NaN, + "learning_rate": 6.94048815471918e-05, + "loss": 0.0, + "step": 44400 + }, + { + "epoch": 4.143043762246897, + "grad_norm": NaN, + "learning_rate": 6.9398502424259e-05, + "loss": 0.0, + "step": 44401 + }, + { + "epoch": 4.143137071941775, + "grad_norm": NaN, + "learning_rate": 6.939212350627193e-05, + "loss": 0.0, + "step": 44402 + }, + { + "epoch": 4.143230381636652, + "grad_norm": NaN, + "learning_rate": 6.938574479324683e-05, + "loss": 0.0, + "step": 44403 + }, + { + "epoch": 4.1433236913315294, + "grad_norm": NaN, + "learning_rate": 6.937936628519987e-05, + "loss": 0.0, + "step": 44404 + }, + { + "epoch": 4.143417001026407, + "grad_norm": NaN, + "learning_rate": 6.937298798214732e-05, + "loss": 0.0, + "step": 44405 + }, + { + "epoch": 4.143510310721284, + "grad_norm": NaN, + "learning_rate": 6.936660988410538e-05, + "loss": 0.0, + "step": 44406 + }, + { + "epoch": 4.143603620416161, + "grad_norm": NaN, + "learning_rate": 6.936023199109025e-05, + "loss": 0.0, + "step": 44407 + }, + { + "epoch": 4.143696930111038, + "grad_norm": NaN, + "learning_rate": 6.935385430311817e-05, + "loss": 0.0, + "step": 44408 + }, + { + "epoch": 4.143790239805916, + "grad_norm": NaN, + "learning_rate": 6.934747682020535e-05, + "loss": 0.0, + "step": 44409 + }, + { + "epoch": 4.143883549500793, + "grad_norm": NaN, + "learning_rate": 6.9341099542368e-05, + "loss": 0.0, + "step": 44410 + }, + { + "epoch": 4.1439768591956705, + "grad_norm": NaN, + "learning_rate": 6.933472246962236e-05, + "loss": 0.0, + "step": 44411 + }, + { + "epoch": 4.144070168890548, + "grad_norm": NaN, + "learning_rate": 6.932834560198461e-05, + "loss": 0.0, + "step": 44412 + }, + { + "epoch": 4.144163478585425, + "grad_norm": NaN, + "learning_rate": 6.932196893947098e-05, + "loss": 0.0, + "step": 44413 + }, + { + "epoch": 4.144256788280303, + "grad_norm": NaN, + "learning_rate": 6.931559248209768e-05, + "loss": 0.0, + "step": 44414 + }, + { + "epoch": 4.144350097975179, + "grad_norm": NaN, + "learning_rate": 6.930921622988095e-05, + "loss": 0.0, + "step": 44415 + }, + { + "epoch": 4.144443407670057, + "grad_norm": NaN, + "learning_rate": 6.930284018283695e-05, + "loss": 0.0, + "step": 44416 + }, + { + "epoch": 4.144536717364934, + "grad_norm": NaN, + "learning_rate": 6.929646434098195e-05, + "loss": 0.0, + "step": 44417 + }, + { + "epoch": 4.1446300270598115, + "grad_norm": NaN, + "learning_rate": 6.929008870433212e-05, + "loss": 0.0, + "step": 44418 + }, + { + "epoch": 4.144723336754689, + "grad_norm": NaN, + "learning_rate": 6.92837132729037e-05, + "loss": 0.0, + "step": 44419 + }, + { + "epoch": 4.144816646449566, + "grad_norm": NaN, + "learning_rate": 6.927733804671288e-05, + "loss": 0.0, + "step": 44420 + }, + { + "epoch": 4.144909956144444, + "grad_norm": NaN, + "learning_rate": 6.927096302577587e-05, + "loss": 0.0, + "step": 44421 + }, + { + "epoch": 4.14500326583932, + "grad_norm": NaN, + "learning_rate": 6.92645882101089e-05, + "loss": 0.0, + "step": 44422 + }, + { + "epoch": 4.145096575534198, + "grad_norm": NaN, + "learning_rate": 6.925821359972815e-05, + "loss": 0.0, + "step": 44423 + }, + { + "epoch": 4.145189885229075, + "grad_norm": NaN, + "learning_rate": 6.925183919464985e-05, + "loss": 0.0, + "step": 44424 + }, + { + "epoch": 4.145283194923953, + "grad_norm": NaN, + "learning_rate": 6.924546499489023e-05, + "loss": 0.0, + "step": 44425 + }, + { + "epoch": 4.14537650461883, + "grad_norm": NaN, + "learning_rate": 6.923909100046543e-05, + "loss": 0.0, + "step": 44426 + }, + { + "epoch": 4.145469814313707, + "grad_norm": NaN, + "learning_rate": 6.923271721139172e-05, + "loss": 0.0, + "step": 44427 + }, + { + "epoch": 4.145563124008585, + "grad_norm": NaN, + "learning_rate": 6.92263436276853e-05, + "loss": 0.0, + "step": 44428 + }, + { + "epoch": 4.145656433703461, + "grad_norm": NaN, + "learning_rate": 6.921997024936233e-05, + "loss": 0.0, + "step": 44429 + }, + { + "epoch": 4.145749743398339, + "grad_norm": NaN, + "learning_rate": 6.921359707643906e-05, + "loss": 0.0, + "step": 44430 + }, + { + "epoch": 4.145843053093216, + "grad_norm": NaN, + "learning_rate": 6.920722410893169e-05, + "loss": 0.0, + "step": 44431 + }, + { + "epoch": 4.145936362788094, + "grad_norm": NaN, + "learning_rate": 6.920085134685642e-05, + "loss": 0.0, + "step": 44432 + }, + { + "epoch": 4.146029672482971, + "grad_norm": NaN, + "learning_rate": 6.919447879022943e-05, + "loss": 0.0, + "step": 44433 + }, + { + "epoch": 4.1461229821778485, + "grad_norm": NaN, + "learning_rate": 6.918810643906695e-05, + "loss": 0.0, + "step": 44434 + }, + { + "epoch": 4.146216291872726, + "grad_norm": NaN, + "learning_rate": 6.918173429338518e-05, + "loss": 0.0, + "step": 44435 + }, + { + "epoch": 4.146309601567603, + "grad_norm": NaN, + "learning_rate": 6.917536235320032e-05, + "loss": 0.0, + "step": 44436 + }, + { + "epoch": 4.14640291126248, + "grad_norm": NaN, + "learning_rate": 6.916899061852859e-05, + "loss": 0.0, + "step": 44437 + }, + { + "epoch": 4.146496220957357, + "grad_norm": NaN, + "learning_rate": 6.916261908938615e-05, + "loss": 0.0, + "step": 44438 + }, + { + "epoch": 4.146589530652235, + "grad_norm": NaN, + "learning_rate": 6.915624776578921e-05, + "loss": 0.0, + "step": 44439 + }, + { + "epoch": 4.146682840347112, + "grad_norm": NaN, + "learning_rate": 6.914987664775402e-05, + "loss": 0.0, + "step": 44440 + }, + { + "epoch": 4.1467761500419895, + "grad_norm": NaN, + "learning_rate": 6.914350573529671e-05, + "loss": 0.0, + "step": 44441 + }, + { + "epoch": 4.146869459736867, + "grad_norm": NaN, + "learning_rate": 6.913713502843354e-05, + "loss": 0.0, + "step": 44442 + }, + { + "epoch": 4.146962769431744, + "grad_norm": NaN, + "learning_rate": 6.913076452718067e-05, + "loss": 0.0, + "step": 44443 + }, + { + "epoch": 4.147056079126621, + "grad_norm": NaN, + "learning_rate": 6.912439423155431e-05, + "loss": 0.0, + "step": 44444 + }, + { + "epoch": 4.147149388821498, + "grad_norm": NaN, + "learning_rate": 6.911802414157067e-05, + "loss": 0.0, + "step": 44445 + }, + { + "epoch": 4.147242698516376, + "grad_norm": NaN, + "learning_rate": 6.911165425724592e-05, + "loss": 0.0, + "step": 44446 + }, + { + "epoch": 4.147336008211253, + "grad_norm": NaN, + "learning_rate": 6.910528457859628e-05, + "loss": 0.0, + "step": 44447 + }, + { + "epoch": 4.147429317906131, + "grad_norm": NaN, + "learning_rate": 6.909891510563795e-05, + "loss": 0.0, + "step": 44448 + }, + { + "epoch": 4.147522627601008, + "grad_norm": NaN, + "learning_rate": 6.90925458383871e-05, + "loss": 0.0, + "step": 44449 + }, + { + "epoch": 4.147615937295885, + "grad_norm": NaN, + "learning_rate": 6.908617677685994e-05, + "loss": 0.0, + "step": 44450 + }, + { + "epoch": 4.147709246990762, + "grad_norm": NaN, + "learning_rate": 6.907980792107266e-05, + "loss": 0.0, + "step": 44451 + }, + { + "epoch": 4.147802556685639, + "grad_norm": NaN, + "learning_rate": 6.907343927104148e-05, + "loss": 0.0, + "step": 44452 + }, + { + "epoch": 4.147895866380517, + "grad_norm": NaN, + "learning_rate": 6.906707082678254e-05, + "loss": 0.0, + "step": 44453 + }, + { + "epoch": 4.147989176075394, + "grad_norm": NaN, + "learning_rate": 6.90607025883121e-05, + "loss": 0.0, + "step": 44454 + }, + { + "epoch": 4.148082485770272, + "grad_norm": NaN, + "learning_rate": 6.90543345556463e-05, + "loss": 0.0, + "step": 44455 + }, + { + "epoch": 4.148175795465149, + "grad_norm": NaN, + "learning_rate": 6.904796672880134e-05, + "loss": 0.0, + "step": 44456 + }, + { + "epoch": 4.1482691051600264, + "grad_norm": NaN, + "learning_rate": 6.904159910779344e-05, + "loss": 0.0, + "step": 44457 + }, + { + "epoch": 4.148362414854903, + "grad_norm": NaN, + "learning_rate": 6.903523169263874e-05, + "loss": 0.0, + "step": 44458 + }, + { + "epoch": 4.14845572454978, + "grad_norm": NaN, + "learning_rate": 6.902886448335349e-05, + "loss": 0.0, + "step": 44459 + }, + { + "epoch": 4.148549034244658, + "grad_norm": NaN, + "learning_rate": 6.902249747995384e-05, + "loss": 0.0, + "step": 44460 + }, + { + "epoch": 4.148642343939535, + "grad_norm": NaN, + "learning_rate": 6.9016130682456e-05, + "loss": 0.0, + "step": 44461 + }, + { + "epoch": 4.148735653634413, + "grad_norm": NaN, + "learning_rate": 6.900976409087616e-05, + "loss": 0.0, + "step": 44462 + }, + { + "epoch": 4.14882896332929, + "grad_norm": NaN, + "learning_rate": 6.900339770523047e-05, + "loss": 0.0, + "step": 44463 + }, + { + "epoch": 4.1489222730241675, + "grad_norm": NaN, + "learning_rate": 6.899703152553517e-05, + "loss": 0.0, + "step": 44464 + }, + { + "epoch": 4.149015582719045, + "grad_norm": NaN, + "learning_rate": 6.89906655518064e-05, + "loss": 0.0, + "step": 44465 + }, + { + "epoch": 4.1491088924139214, + "grad_norm": NaN, + "learning_rate": 6.89842997840604e-05, + "loss": 0.0, + "step": 44466 + }, + { + "epoch": 4.149202202108799, + "grad_norm": NaN, + "learning_rate": 6.897793422231332e-05, + "loss": 0.0, + "step": 44467 + }, + { + "epoch": 4.149295511803676, + "grad_norm": NaN, + "learning_rate": 6.897156886658134e-05, + "loss": 0.0, + "step": 44468 + }, + { + "epoch": 4.149388821498554, + "grad_norm": NaN, + "learning_rate": 6.896520371688067e-05, + "loss": 0.0, + "step": 44469 + }, + { + "epoch": 4.149482131193431, + "grad_norm": NaN, + "learning_rate": 6.895883877322747e-05, + "loss": 0.0, + "step": 44470 + }, + { + "epoch": 4.1495754408883085, + "grad_norm": NaN, + "learning_rate": 6.895247403563795e-05, + "loss": 0.0, + "step": 44471 + }, + { + "epoch": 4.149668750583186, + "grad_norm": NaN, + "learning_rate": 6.894610950412827e-05, + "loss": 0.0, + "step": 44472 + }, + { + "epoch": 4.1497620602780625, + "grad_norm": NaN, + "learning_rate": 6.893974517871464e-05, + "loss": 0.0, + "step": 44473 + }, + { + "epoch": 4.14985536997294, + "grad_norm": NaN, + "learning_rate": 6.89333810594132e-05, + "loss": 0.0, + "step": 44474 + }, + { + "epoch": 4.149948679667817, + "grad_norm": NaN, + "learning_rate": 6.892701714624017e-05, + "loss": 0.0, + "step": 44475 + }, + { + "epoch": 4.150041989362695, + "grad_norm": NaN, + "learning_rate": 6.892065343921174e-05, + "loss": 0.0, + "step": 44476 + }, + { + "epoch": 4.150135299057572, + "grad_norm": NaN, + "learning_rate": 6.891428993834404e-05, + "loss": 0.0, + "step": 44477 + }, + { + "epoch": 4.15022860875245, + "grad_norm": NaN, + "learning_rate": 6.89079266436533e-05, + "loss": 0.0, + "step": 44478 + }, + { + "epoch": 4.150321918447327, + "grad_norm": NaN, + "learning_rate": 6.890156355515568e-05, + "loss": 0.0, + "step": 44479 + }, + { + "epoch": 4.150415228142204, + "grad_norm": NaN, + "learning_rate": 6.889520067286736e-05, + "loss": 0.0, + "step": 44480 + }, + { + "epoch": 4.150508537837081, + "grad_norm": NaN, + "learning_rate": 6.88888379968045e-05, + "loss": 0.0, + "step": 44481 + }, + { + "epoch": 4.150601847531958, + "grad_norm": NaN, + "learning_rate": 6.888247552698332e-05, + "loss": 0.0, + "step": 44482 + }, + { + "epoch": 4.150695157226836, + "grad_norm": NaN, + "learning_rate": 6.887611326341997e-05, + "loss": 0.0, + "step": 44483 + }, + { + "epoch": 4.150788466921713, + "grad_norm": NaN, + "learning_rate": 6.886975120613065e-05, + "loss": 0.0, + "step": 44484 + }, + { + "epoch": 4.150881776616591, + "grad_norm": NaN, + "learning_rate": 6.886338935513149e-05, + "loss": 0.0, + "step": 44485 + }, + { + "epoch": 4.150975086311468, + "grad_norm": NaN, + "learning_rate": 6.885702771043872e-05, + "loss": 0.0, + "step": 44486 + }, + { + "epoch": 4.1510683960063455, + "grad_norm": NaN, + "learning_rate": 6.885066627206848e-05, + "loss": 0.0, + "step": 44487 + }, + { + "epoch": 4.151161705701222, + "grad_norm": NaN, + "learning_rate": 6.884430504003697e-05, + "loss": 0.0, + "step": 44488 + }, + { + "epoch": 4.151255015396099, + "grad_norm": NaN, + "learning_rate": 6.883794401436034e-05, + "loss": 0.0, + "step": 44489 + }, + { + "epoch": 4.151348325090977, + "grad_norm": NaN, + "learning_rate": 6.883158319505477e-05, + "loss": 0.0, + "step": 44490 + }, + { + "epoch": 4.151441634785854, + "grad_norm": NaN, + "learning_rate": 6.882522258213644e-05, + "loss": 0.0, + "step": 44491 + }, + { + "epoch": 4.151534944480732, + "grad_norm": NaN, + "learning_rate": 6.881886217562154e-05, + "loss": 0.0, + "step": 44492 + }, + { + "epoch": 4.151628254175609, + "grad_norm": NaN, + "learning_rate": 6.881250197552621e-05, + "loss": 0.0, + "step": 44493 + }, + { + "epoch": 4.1517215638704865, + "grad_norm": NaN, + "learning_rate": 6.880614198186664e-05, + "loss": 0.0, + "step": 44494 + }, + { + "epoch": 4.151814873565363, + "grad_norm": NaN, + "learning_rate": 6.879978219465901e-05, + "loss": 0.0, + "step": 44495 + }, + { + "epoch": 4.1519081832602405, + "grad_norm": NaN, + "learning_rate": 6.879342261391949e-05, + "loss": 0.0, + "step": 44496 + }, + { + "epoch": 4.152001492955118, + "grad_norm": NaN, + "learning_rate": 6.878706323966421e-05, + "loss": 0.0, + "step": 44497 + }, + { + "epoch": 4.152094802649995, + "grad_norm": NaN, + "learning_rate": 6.87807040719094e-05, + "loss": 0.0, + "step": 44498 + }, + { + "epoch": 4.152188112344873, + "grad_norm": NaN, + "learning_rate": 6.87743451106712e-05, + "loss": 0.0, + "step": 44499 + }, + { + "epoch": 4.15228142203975, + "grad_norm": NaN, + "learning_rate": 6.876798635596576e-05, + "loss": 0.0, + "step": 44500 + }, + { + "epoch": 4.152374731734628, + "grad_norm": NaN, + "learning_rate": 6.876162780780928e-05, + "loss": 0.0, + "step": 44501 + }, + { + "epoch": 4.152468041429504, + "grad_norm": NaN, + "learning_rate": 6.875526946621791e-05, + "loss": 0.0, + "step": 44502 + }, + { + "epoch": 4.1525613511243815, + "grad_norm": NaN, + "learning_rate": 6.874891133120782e-05, + "loss": 0.0, + "step": 44503 + }, + { + "epoch": 4.152654660819259, + "grad_norm": NaN, + "learning_rate": 6.874255340279519e-05, + "loss": 0.0, + "step": 44504 + }, + { + "epoch": 4.152747970514136, + "grad_norm": NaN, + "learning_rate": 6.873619568099616e-05, + "loss": 0.0, + "step": 44505 + }, + { + "epoch": 4.152841280209014, + "grad_norm": NaN, + "learning_rate": 6.872983816582694e-05, + "loss": 0.0, + "step": 44506 + }, + { + "epoch": 4.152934589903891, + "grad_norm": NaN, + "learning_rate": 6.872348085730365e-05, + "loss": 0.0, + "step": 44507 + }, + { + "epoch": 4.153027899598769, + "grad_norm": NaN, + "learning_rate": 6.871712375544247e-05, + "loss": 0.0, + "step": 44508 + }, + { + "epoch": 4.153121209293646, + "grad_norm": NaN, + "learning_rate": 6.871076686025958e-05, + "loss": 0.0, + "step": 44509 + }, + { + "epoch": 4.153214518988523, + "grad_norm": NaN, + "learning_rate": 6.870441017177111e-05, + "loss": 0.0, + "step": 44510 + }, + { + "epoch": 4.1533078286834, + "grad_norm": NaN, + "learning_rate": 6.869805368999326e-05, + "loss": 0.0, + "step": 44511 + }, + { + "epoch": 4.153401138378277, + "grad_norm": NaN, + "learning_rate": 6.869169741494217e-05, + "loss": 0.0, + "step": 44512 + }, + { + "epoch": 4.153494448073155, + "grad_norm": NaN, + "learning_rate": 6.868534134663402e-05, + "loss": 0.0, + "step": 44513 + }, + { + "epoch": 4.153587757768032, + "grad_norm": NaN, + "learning_rate": 6.867898548508493e-05, + "loss": 0.0, + "step": 44514 + }, + { + "epoch": 4.15368106746291, + "grad_norm": NaN, + "learning_rate": 6.867262983031111e-05, + "loss": 0.0, + "step": 44515 + }, + { + "epoch": 4.153774377157787, + "grad_norm": NaN, + "learning_rate": 6.86662743823287e-05, + "loss": 0.0, + "step": 44516 + }, + { + "epoch": 4.153867686852664, + "grad_norm": NaN, + "learning_rate": 6.865991914115387e-05, + "loss": 0.0, + "step": 44517 + }, + { + "epoch": 4.153960996547541, + "grad_norm": NaN, + "learning_rate": 6.865356410680277e-05, + "loss": 0.0, + "step": 44518 + }, + { + "epoch": 4.1540543062424184, + "grad_norm": NaN, + "learning_rate": 6.864720927929154e-05, + "loss": 0.0, + "step": 44519 + }, + { + "epoch": 4.154147615937296, + "grad_norm": NaN, + "learning_rate": 6.864085465863638e-05, + "loss": 0.0, + "step": 44520 + }, + { + "epoch": 4.154240925632173, + "grad_norm": NaN, + "learning_rate": 6.863450024485341e-05, + "loss": 0.0, + "step": 44521 + }, + { + "epoch": 4.154334235327051, + "grad_norm": NaN, + "learning_rate": 6.86281460379588e-05, + "loss": 0.0, + "step": 44522 + }, + { + "epoch": 4.154427545021928, + "grad_norm": NaN, + "learning_rate": 6.862179203796873e-05, + "loss": 0.0, + "step": 44523 + }, + { + "epoch": 4.154520854716805, + "grad_norm": NaN, + "learning_rate": 6.861543824489928e-05, + "loss": 0.0, + "step": 44524 + }, + { + "epoch": 4.154614164411682, + "grad_norm": NaN, + "learning_rate": 6.860908465876677e-05, + "loss": 0.0, + "step": 44525 + }, + { + "epoch": 4.1547074741065595, + "grad_norm": NaN, + "learning_rate": 6.860273127958718e-05, + "loss": 0.0, + "step": 44526 + }, + { + "epoch": 4.154800783801437, + "grad_norm": NaN, + "learning_rate": 6.85963781073767e-05, + "loss": 0.0, + "step": 44527 + }, + { + "epoch": 4.154894093496314, + "grad_norm": NaN, + "learning_rate": 6.859002514215162e-05, + "loss": 0.0, + "step": 44528 + }, + { + "epoch": 4.154987403191192, + "grad_norm": NaN, + "learning_rate": 6.858367238392792e-05, + "loss": 0.0, + "step": 44529 + }, + { + "epoch": 4.155080712886069, + "grad_norm": NaN, + "learning_rate": 6.857731983272182e-05, + "loss": 0.0, + "step": 44530 + }, + { + "epoch": 4.155174022580946, + "grad_norm": NaN, + "learning_rate": 6.85709674885495e-05, + "loss": 0.0, + "step": 44531 + }, + { + "epoch": 4.155267332275823, + "grad_norm": NaN, + "learning_rate": 6.856461535142708e-05, + "loss": 0.0, + "step": 44532 + }, + { + "epoch": 4.1553606419707005, + "grad_norm": NaN, + "learning_rate": 6.855826342137073e-05, + "loss": 0.0, + "step": 44533 + }, + { + "epoch": 4.155453951665578, + "grad_norm": NaN, + "learning_rate": 6.855191169839657e-05, + "loss": 0.0, + "step": 44534 + }, + { + "epoch": 4.155547261360455, + "grad_norm": NaN, + "learning_rate": 6.854556018252077e-05, + "loss": 0.0, + "step": 44535 + }, + { + "epoch": 4.155640571055333, + "grad_norm": NaN, + "learning_rate": 6.85392088737595e-05, + "loss": 0.0, + "step": 44536 + }, + { + "epoch": 4.15573388075021, + "grad_norm": NaN, + "learning_rate": 6.853285777212888e-05, + "loss": 0.0, + "step": 44537 + }, + { + "epoch": 4.155827190445088, + "grad_norm": NaN, + "learning_rate": 6.852650687764508e-05, + "loss": 0.0, + "step": 44538 + }, + { + "epoch": 4.155920500139964, + "grad_norm": NaN, + "learning_rate": 6.852015619032424e-05, + "loss": 0.0, + "step": 44539 + }, + { + "epoch": 4.156013809834842, + "grad_norm": NaN, + "learning_rate": 6.85138057101825e-05, + "loss": 0.0, + "step": 44540 + }, + { + "epoch": 4.156107119529719, + "grad_norm": NaN, + "learning_rate": 6.850745543723602e-05, + "loss": 0.0, + "step": 44541 + }, + { + "epoch": 4.156200429224596, + "grad_norm": NaN, + "learning_rate": 6.850110537150093e-05, + "loss": 0.0, + "step": 44542 + }, + { + "epoch": 4.156293738919474, + "grad_norm": NaN, + "learning_rate": 6.849475551299339e-05, + "loss": 0.0, + "step": 44543 + }, + { + "epoch": 4.156387048614351, + "grad_norm": NaN, + "learning_rate": 6.848840586172956e-05, + "loss": 0.0, + "step": 44544 + }, + { + "epoch": 4.156480358309229, + "grad_norm": NaN, + "learning_rate": 6.848205641772555e-05, + "loss": 0.0, + "step": 44545 + }, + { + "epoch": 4.156573668004105, + "grad_norm": NaN, + "learning_rate": 6.847570718099753e-05, + "loss": 0.0, + "step": 44546 + }, + { + "epoch": 4.156666977698983, + "grad_norm": NaN, + "learning_rate": 6.846935815156163e-05, + "loss": 0.0, + "step": 44547 + }, + { + "epoch": 4.15676028739386, + "grad_norm": NaN, + "learning_rate": 6.8463009329434e-05, + "loss": 0.0, + "step": 44548 + }, + { + "epoch": 4.1568535970887375, + "grad_norm": NaN, + "learning_rate": 6.845666071463079e-05, + "loss": 0.0, + "step": 44549 + }, + { + "epoch": 4.156946906783615, + "grad_norm": NaN, + "learning_rate": 6.845031230716815e-05, + "loss": 0.0, + "step": 44550 + }, + { + "epoch": 4.157040216478492, + "grad_norm": NaN, + "learning_rate": 6.844396410706219e-05, + "loss": 0.0, + "step": 44551 + }, + { + "epoch": 4.15713352617337, + "grad_norm": NaN, + "learning_rate": 6.843761611432907e-05, + "loss": 0.0, + "step": 44552 + }, + { + "epoch": 4.157226835868247, + "grad_norm": NaN, + "learning_rate": 6.843126832898494e-05, + "loss": 0.0, + "step": 44553 + }, + { + "epoch": 4.157320145563124, + "grad_norm": NaN, + "learning_rate": 6.842492075104593e-05, + "loss": 0.0, + "step": 44554 + }, + { + "epoch": 4.157413455258001, + "grad_norm": NaN, + "learning_rate": 6.841857338052818e-05, + "loss": 0.0, + "step": 44555 + }, + { + "epoch": 4.1575067649528785, + "grad_norm": NaN, + "learning_rate": 6.841222621744785e-05, + "loss": 0.0, + "step": 44556 + }, + { + "epoch": 4.157600074647756, + "grad_norm": NaN, + "learning_rate": 6.840587926182105e-05, + "loss": 0.0, + "step": 44557 + }, + { + "epoch": 4.157693384342633, + "grad_norm": NaN, + "learning_rate": 6.839953251366391e-05, + "loss": 0.0, + "step": 44558 + }, + { + "epoch": 4.157786694037511, + "grad_norm": NaN, + "learning_rate": 6.83931859729926e-05, + "loss": 0.0, + "step": 44559 + }, + { + "epoch": 4.157880003732388, + "grad_norm": NaN, + "learning_rate": 6.838683963982325e-05, + "loss": 0.0, + "step": 44560 + }, + { + "epoch": 4.157973313427265, + "grad_norm": NaN, + "learning_rate": 6.838049351417199e-05, + "loss": 0.0, + "step": 44561 + }, + { + "epoch": 4.158066623122142, + "grad_norm": NaN, + "learning_rate": 6.837414759605489e-05, + "loss": 0.0, + "step": 44562 + }, + { + "epoch": 4.15815993281702, + "grad_norm": NaN, + "learning_rate": 6.836780188548829e-05, + "loss": 0.0, + "step": 44563 + }, + { + "epoch": 4.158253242511897, + "grad_norm": NaN, + "learning_rate": 6.83614563824881e-05, + "loss": 0.0, + "step": 44564 + }, + { + "epoch": 4.158346552206774, + "grad_norm": NaN, + "learning_rate": 6.83551110870705e-05, + "loss": 0.0, + "step": 44565 + }, + { + "epoch": 4.158439861901652, + "grad_norm": NaN, + "learning_rate": 6.83487659992518e-05, + "loss": 0.0, + "step": 44566 + }, + { + "epoch": 4.158533171596529, + "grad_norm": NaN, + "learning_rate": 6.834242111904792e-05, + "loss": 0.0, + "step": 44567 + }, + { + "epoch": 4.158626481291406, + "grad_norm": NaN, + "learning_rate": 6.833607644647502e-05, + "loss": 0.0, + "step": 44568 + }, + { + "epoch": 4.158719790986283, + "grad_norm": NaN, + "learning_rate": 6.832973198154942e-05, + "loss": 0.0, + "step": 44569 + }, + { + "epoch": 4.158813100681161, + "grad_norm": NaN, + "learning_rate": 6.832338772428705e-05, + "loss": 0.0, + "step": 44570 + }, + { + "epoch": 4.158906410376038, + "grad_norm": NaN, + "learning_rate": 6.831704367470405e-05, + "loss": 0.0, + "step": 44571 + }, + { + "epoch": 4.1589997200709155, + "grad_norm": NaN, + "learning_rate": 6.831069983281671e-05, + "loss": 0.0, + "step": 44572 + }, + { + "epoch": 4.159093029765793, + "grad_norm": NaN, + "learning_rate": 6.830435619864102e-05, + "loss": 0.0, + "step": 44573 + }, + { + "epoch": 4.15918633946067, + "grad_norm": NaN, + "learning_rate": 6.829801277219311e-05, + "loss": 0.0, + "step": 44574 + }, + { + "epoch": 4.159279649155547, + "grad_norm": NaN, + "learning_rate": 6.829166955348926e-05, + "loss": 0.0, + "step": 44575 + }, + { + "epoch": 4.159372958850424, + "grad_norm": NaN, + "learning_rate": 6.828532654254542e-05, + "loss": 0.0, + "step": 44576 + }, + { + "epoch": 4.159466268545302, + "grad_norm": NaN, + "learning_rate": 6.82789837393778e-05, + "loss": 0.0, + "step": 44577 + }, + { + "epoch": 4.159559578240179, + "grad_norm": NaN, + "learning_rate": 6.82726411440025e-05, + "loss": 0.0, + "step": 44578 + }, + { + "epoch": 4.1596528879350565, + "grad_norm": NaN, + "learning_rate": 6.826629875643569e-05, + "loss": 0.0, + "step": 44579 + }, + { + "epoch": 4.159746197629934, + "grad_norm": NaN, + "learning_rate": 6.825995657669345e-05, + "loss": 0.0, + "step": 44580 + }, + { + "epoch": 4.159839507324811, + "grad_norm": NaN, + "learning_rate": 6.825361460479193e-05, + "loss": 0.0, + "step": 44581 + }, + { + "epoch": 4.159932817019689, + "grad_norm": NaN, + "learning_rate": 6.824727284074727e-05, + "loss": 0.0, + "step": 44582 + }, + { + "epoch": 4.160026126714565, + "grad_norm": NaN, + "learning_rate": 6.824093128457556e-05, + "loss": 0.0, + "step": 44583 + }, + { + "epoch": 4.160119436409443, + "grad_norm": NaN, + "learning_rate": 6.823458993629294e-05, + "loss": 0.0, + "step": 44584 + }, + { + "epoch": 4.16021274610432, + "grad_norm": NaN, + "learning_rate": 6.822824879591556e-05, + "loss": 0.0, + "step": 44585 + }, + { + "epoch": 4.1603060557991975, + "grad_norm": NaN, + "learning_rate": 6.822190786345951e-05, + "loss": 0.0, + "step": 44586 + }, + { + "epoch": 4.160399365494075, + "grad_norm": NaN, + "learning_rate": 6.821556713894092e-05, + "loss": 0.0, + "step": 44587 + }, + { + "epoch": 4.160492675188952, + "grad_norm": NaN, + "learning_rate": 6.820922662237592e-05, + "loss": 0.0, + "step": 44588 + }, + { + "epoch": 4.16058598488383, + "grad_norm": NaN, + "learning_rate": 6.820288631378062e-05, + "loss": 0.0, + "step": 44589 + }, + { + "epoch": 4.160679294578706, + "grad_norm": NaN, + "learning_rate": 6.819654621317116e-05, + "loss": 0.0, + "step": 44590 + }, + { + "epoch": 4.160772604273584, + "grad_norm": NaN, + "learning_rate": 6.819020632056364e-05, + "loss": 0.0, + "step": 44591 + }, + { + "epoch": 4.160865913968461, + "grad_norm": NaN, + "learning_rate": 6.81838666359742e-05, + "loss": 0.0, + "step": 44592 + }, + { + "epoch": 4.160959223663339, + "grad_norm": NaN, + "learning_rate": 6.817752715941896e-05, + "loss": 0.0, + "step": 44593 + }, + { + "epoch": 4.161052533358216, + "grad_norm": NaN, + "learning_rate": 6.817118789091402e-05, + "loss": 0.0, + "step": 44594 + }, + { + "epoch": 4.161145843053093, + "grad_norm": NaN, + "learning_rate": 6.816484883047552e-05, + "loss": 0.0, + "step": 44595 + }, + { + "epoch": 4.161239152747971, + "grad_norm": NaN, + "learning_rate": 6.815850997811956e-05, + "loss": 0.0, + "step": 44596 + }, + { + "epoch": 4.161332462442848, + "grad_norm": NaN, + "learning_rate": 6.815217133386224e-05, + "loss": 0.0, + "step": 44597 + }, + { + "epoch": 4.161425772137725, + "grad_norm": NaN, + "learning_rate": 6.814583289771974e-05, + "loss": 0.0, + "step": 44598 + }, + { + "epoch": 4.161519081832602, + "grad_norm": NaN, + "learning_rate": 6.813949466970813e-05, + "loss": 0.0, + "step": 44599 + }, + { + "epoch": 4.16161239152748, + "grad_norm": NaN, + "learning_rate": 6.813315664984347e-05, + "loss": 0.0, + "step": 44600 + }, + { + "epoch": 4.161705701222357, + "grad_norm": NaN, + "learning_rate": 6.812681883814207e-05, + "loss": 0.0, + "step": 44601 + }, + { + "epoch": 4.1617990109172345, + "grad_norm": NaN, + "learning_rate": 6.812048123461985e-05, + "loss": 0.0, + "step": 44602 + }, + { + "epoch": 4.161892320612112, + "grad_norm": NaN, + "learning_rate": 6.811414383929294e-05, + "loss": 0.0, + "step": 44603 + }, + { + "epoch": 4.161985630306989, + "grad_norm": NaN, + "learning_rate": 6.810780665217762e-05, + "loss": 0.0, + "step": 44604 + }, + { + "epoch": 4.162078940001866, + "grad_norm": NaN, + "learning_rate": 6.810146967328982e-05, + "loss": 0.0, + "step": 44605 + }, + { + "epoch": 4.162172249696743, + "grad_norm": NaN, + "learning_rate": 6.809513290264568e-05, + "loss": 0.0, + "step": 44606 + }, + { + "epoch": 4.162265559391621, + "grad_norm": NaN, + "learning_rate": 6.808879634026146e-05, + "loss": 0.0, + "step": 44607 + }, + { + "epoch": 4.162358869086498, + "grad_norm": NaN, + "learning_rate": 6.808245998615312e-05, + "loss": 0.0, + "step": 44608 + }, + { + "epoch": 4.1624521787813755, + "grad_norm": NaN, + "learning_rate": 6.807612384033675e-05, + "loss": 0.0, + "step": 44609 + }, + { + "epoch": 4.162545488476253, + "grad_norm": NaN, + "learning_rate": 6.806978790282865e-05, + "loss": 0.0, + "step": 44610 + }, + { + "epoch": 4.16263879817113, + "grad_norm": NaN, + "learning_rate": 6.806345217364476e-05, + "loss": 0.0, + "step": 44611 + }, + { + "epoch": 4.162732107866007, + "grad_norm": NaN, + "learning_rate": 6.805711665280118e-05, + "loss": 0.0, + "step": 44612 + }, + { + "epoch": 4.162825417560884, + "grad_norm": NaN, + "learning_rate": 6.805078134031417e-05, + "loss": 0.0, + "step": 44613 + }, + { + "epoch": 4.162918727255762, + "grad_norm": NaN, + "learning_rate": 6.804444623619971e-05, + "loss": 0.0, + "step": 44614 + }, + { + "epoch": 4.163012036950639, + "grad_norm": NaN, + "learning_rate": 6.803811134047388e-05, + "loss": 0.0, + "step": 44615 + }, + { + "epoch": 4.163105346645517, + "grad_norm": NaN, + "learning_rate": 6.803177665315299e-05, + "loss": 0.0, + "step": 44616 + }, + { + "epoch": 4.163198656340394, + "grad_norm": NaN, + "learning_rate": 6.802544217425294e-05, + "loss": 0.0, + "step": 44617 + }, + { + "epoch": 4.163291966035271, + "grad_norm": NaN, + "learning_rate": 6.801910790378984e-05, + "loss": 0.0, + "step": 44618 + }, + { + "epoch": 4.163385275730148, + "grad_norm": NaN, + "learning_rate": 6.801277384178e-05, + "loss": 0.0, + "step": 44619 + }, + { + "epoch": 4.163478585425025, + "grad_norm": NaN, + "learning_rate": 6.800643998823931e-05, + "loss": 0.0, + "step": 44620 + }, + { + "epoch": 4.163571895119903, + "grad_norm": NaN, + "learning_rate": 6.800010634318397e-05, + "loss": 0.0, + "step": 44621 + }, + { + "epoch": 4.16366520481478, + "grad_norm": NaN, + "learning_rate": 6.799377290663007e-05, + "loss": 0.0, + "step": 44622 + }, + { + "epoch": 4.163758514509658, + "grad_norm": NaN, + "learning_rate": 6.798743967859372e-05, + "loss": 0.0, + "step": 44623 + }, + { + "epoch": 4.163851824204535, + "grad_norm": NaN, + "learning_rate": 6.7981106659091e-05, + "loss": 0.0, + "step": 44624 + }, + { + "epoch": 4.1639451338994125, + "grad_norm": NaN, + "learning_rate": 6.797477384813803e-05, + "loss": 0.0, + "step": 44625 + }, + { + "epoch": 4.16403844359429, + "grad_norm": NaN, + "learning_rate": 6.796844124575093e-05, + "loss": 0.0, + "step": 44626 + }, + { + "epoch": 4.164131753289166, + "grad_norm": NaN, + "learning_rate": 6.796210885194578e-05, + "loss": 0.0, + "step": 44627 + }, + { + "epoch": 4.164225062984044, + "grad_norm": NaN, + "learning_rate": 6.795577666673866e-05, + "loss": 0.0, + "step": 44628 + }, + { + "epoch": 4.164318372678921, + "grad_norm": NaN, + "learning_rate": 6.794944469014572e-05, + "loss": 0.0, + "step": 44629 + }, + { + "epoch": 4.164411682373799, + "grad_norm": NaN, + "learning_rate": 6.794311292218303e-05, + "loss": 0.0, + "step": 44630 + }, + { + "epoch": 4.164504992068676, + "grad_norm": NaN, + "learning_rate": 6.793678136286671e-05, + "loss": 0.0, + "step": 44631 + }, + { + "epoch": 4.1645983017635535, + "grad_norm": NaN, + "learning_rate": 6.793045001221283e-05, + "loss": 0.0, + "step": 44632 + }, + { + "epoch": 4.164691611458431, + "grad_norm": NaN, + "learning_rate": 6.79241188702375e-05, + "loss": 0.0, + "step": 44633 + }, + { + "epoch": 4.1647849211533075, + "grad_norm": NaN, + "learning_rate": 6.791778793695686e-05, + "loss": 0.0, + "step": 44634 + }, + { + "epoch": 4.164878230848185, + "grad_norm": NaN, + "learning_rate": 6.791145721238694e-05, + "loss": 0.0, + "step": 44635 + }, + { + "epoch": 4.164971540543062, + "grad_norm": NaN, + "learning_rate": 6.790512669654386e-05, + "loss": 0.0, + "step": 44636 + }, + { + "epoch": 4.16506485023794, + "grad_norm": NaN, + "learning_rate": 6.789879638944374e-05, + "loss": 0.0, + "step": 44637 + }, + { + "epoch": 4.165158159932817, + "grad_norm": NaN, + "learning_rate": 6.78924662911026e-05, + "loss": 0.0, + "step": 44638 + }, + { + "epoch": 4.1652514696276945, + "grad_norm": NaN, + "learning_rate": 6.78861364015367e-05, + "loss": 0.0, + "step": 44639 + }, + { + "epoch": 4.165344779322572, + "grad_norm": NaN, + "learning_rate": 6.7879806720762e-05, + "loss": 0.0, + "step": 44640 + }, + { + "epoch": 4.1654380890174485, + "grad_norm": NaN, + "learning_rate": 6.787347724879455e-05, + "loss": 0.0, + "step": 44641 + }, + { + "epoch": 4.165531398712326, + "grad_norm": NaN, + "learning_rate": 6.786714798565062e-05, + "loss": 0.0, + "step": 44642 + }, + { + "epoch": 4.165624708407203, + "grad_norm": NaN, + "learning_rate": 6.786081893134616e-05, + "loss": 0.0, + "step": 44643 + }, + { + "epoch": 4.165718018102081, + "grad_norm": NaN, + "learning_rate": 6.785449008589727e-05, + "loss": 0.0, + "step": 44644 + }, + { + "epoch": 4.165811327796958, + "grad_norm": NaN, + "learning_rate": 6.784816144932016e-05, + "loss": 0.0, + "step": 44645 + }, + { + "epoch": 4.165904637491836, + "grad_norm": NaN, + "learning_rate": 6.784183302163082e-05, + "loss": 0.0, + "step": 44646 + }, + { + "epoch": 4.165997947186713, + "grad_norm": NaN, + "learning_rate": 6.783550480284528e-05, + "loss": 0.0, + "step": 44647 + }, + { + "epoch": 4.1660912568815895, + "grad_norm": NaN, + "learning_rate": 6.782917679297982e-05, + "loss": 0.0, + "step": 44648 + }, + { + "epoch": 4.166184566576467, + "grad_norm": NaN, + "learning_rate": 6.782284899205038e-05, + "loss": 0.0, + "step": 44649 + }, + { + "epoch": 4.166277876271344, + "grad_norm": NaN, + "learning_rate": 6.781652140007302e-05, + "loss": 0.0, + "step": 44650 + }, + { + "epoch": 4.166371185966222, + "grad_norm": NaN, + "learning_rate": 6.781019401706403e-05, + "loss": 0.0, + "step": 44651 + }, + { + "epoch": 4.166464495661099, + "grad_norm": NaN, + "learning_rate": 6.780386684303931e-05, + "loss": 0.0, + "step": 44652 + }, + { + "epoch": 4.166557805355977, + "grad_norm": NaN, + "learning_rate": 6.779753987801495e-05, + "loss": 0.0, + "step": 44653 + }, + { + "epoch": 4.166651115050854, + "grad_norm": NaN, + "learning_rate": 6.77912131220072e-05, + "loss": 0.0, + "step": 44654 + }, + { + "epoch": 4.1667444247457315, + "grad_norm": NaN, + "learning_rate": 6.7784886575032e-05, + "loss": 0.0, + "step": 44655 + }, + { + "epoch": 4.166837734440608, + "grad_norm": NaN, + "learning_rate": 6.777856023710542e-05, + "loss": 0.0, + "step": 44656 + }, + { + "epoch": 4.166931044135485, + "grad_norm": NaN, + "learning_rate": 6.777223410824371e-05, + "loss": 0.0, + "step": 44657 + }, + { + "epoch": 4.167024353830363, + "grad_norm": NaN, + "learning_rate": 6.776590818846281e-05, + "loss": 0.0, + "step": 44658 + }, + { + "epoch": 4.16711766352524, + "grad_norm": NaN, + "learning_rate": 6.775958247777878e-05, + "loss": 0.0, + "step": 44659 + }, + { + "epoch": 4.167210973220118, + "grad_norm": NaN, + "learning_rate": 6.775325697620789e-05, + "loss": 0.0, + "step": 44660 + }, + { + "epoch": 4.167304282914995, + "grad_norm": NaN, + "learning_rate": 6.774693168376604e-05, + "loss": 0.0, + "step": 44661 + }, + { + "epoch": 4.1673975926098725, + "grad_norm": NaN, + "learning_rate": 6.774060660046932e-05, + "loss": 0.0, + "step": 44662 + }, + { + "epoch": 4.167490902304749, + "grad_norm": NaN, + "learning_rate": 6.773428172633398e-05, + "loss": 0.0, + "step": 44663 + }, + { + "epoch": 4.1675842119996265, + "grad_norm": NaN, + "learning_rate": 6.772795706137596e-05, + "loss": 0.0, + "step": 44664 + }, + { + "epoch": 4.167677521694504, + "grad_norm": NaN, + "learning_rate": 6.772163260561135e-05, + "loss": 0.0, + "step": 44665 + }, + { + "epoch": 4.167770831389381, + "grad_norm": NaN, + "learning_rate": 6.771530835905625e-05, + "loss": 0.0, + "step": 44666 + }, + { + "epoch": 4.167864141084259, + "grad_norm": NaN, + "learning_rate": 6.770898432172678e-05, + "loss": 0.0, + "step": 44667 + }, + { + "epoch": 4.167957450779136, + "grad_norm": NaN, + "learning_rate": 6.770266049363896e-05, + "loss": 0.0, + "step": 44668 + }, + { + "epoch": 4.168050760474014, + "grad_norm": NaN, + "learning_rate": 6.76963368748089e-05, + "loss": 0.0, + "step": 44669 + }, + { + "epoch": 4.168144070168891, + "grad_norm": NaN, + "learning_rate": 6.769001346525267e-05, + "loss": 0.0, + "step": 44670 + }, + { + "epoch": 4.1682373798637675, + "grad_norm": NaN, + "learning_rate": 6.768369026498637e-05, + "loss": 0.0, + "step": 44671 + }, + { + "epoch": 4.168330689558645, + "grad_norm": NaN, + "learning_rate": 6.767736727402603e-05, + "loss": 0.0, + "step": 44672 + }, + { + "epoch": 4.168423999253522, + "grad_norm": NaN, + "learning_rate": 6.76710444923878e-05, + "loss": 0.0, + "step": 44673 + }, + { + "epoch": 4.1685173089484, + "grad_norm": NaN, + "learning_rate": 6.766472192008771e-05, + "loss": 0.0, + "step": 44674 + }, + { + "epoch": 4.168610618643277, + "grad_norm": NaN, + "learning_rate": 6.765839955714182e-05, + "loss": 0.0, + "step": 44675 + }, + { + "epoch": 4.168703928338155, + "grad_norm": NaN, + "learning_rate": 6.765207740356618e-05, + "loss": 0.0, + "step": 44676 + }, + { + "epoch": 4.168797238033032, + "grad_norm": NaN, + "learning_rate": 6.764575545937703e-05, + "loss": 0.0, + "step": 44677 + }, + { + "epoch": 4.168890547727909, + "grad_norm": NaN, + "learning_rate": 6.763943372459028e-05, + "loss": 0.0, + "step": 44678 + }, + { + "epoch": 4.168983857422786, + "grad_norm": NaN, + "learning_rate": 6.763311219922197e-05, + "loss": 0.0, + "step": 44679 + }, + { + "epoch": 4.169077167117663, + "grad_norm": NaN, + "learning_rate": 6.76267908832884e-05, + "loss": 0.0, + "step": 44680 + }, + { + "epoch": 4.169170476812541, + "grad_norm": NaN, + "learning_rate": 6.762046977680543e-05, + "loss": 0.0, + "step": 44681 + }, + { + "epoch": 4.169263786507418, + "grad_norm": NaN, + "learning_rate": 6.761414887978915e-05, + "loss": 0.0, + "step": 44682 + }, + { + "epoch": 4.169357096202296, + "grad_norm": NaN, + "learning_rate": 6.76078281922558e-05, + "loss": 0.0, + "step": 44683 + }, + { + "epoch": 4.169450405897173, + "grad_norm": NaN, + "learning_rate": 6.760150771422127e-05, + "loss": 0.0, + "step": 44684 + }, + { + "epoch": 4.16954371559205, + "grad_norm": NaN, + "learning_rate": 6.759518744570165e-05, + "loss": 0.0, + "step": 44685 + }, + { + "epoch": 4.169637025286927, + "grad_norm": NaN, + "learning_rate": 6.758886738671317e-05, + "loss": 0.0, + "step": 44686 + }, + { + "epoch": 4.1697303349818045, + "grad_norm": NaN, + "learning_rate": 6.758254753727173e-05, + "loss": 0.0, + "step": 44687 + }, + { + "epoch": 4.169823644676682, + "grad_norm": NaN, + "learning_rate": 6.75762278973934e-05, + "loss": 0.0, + "step": 44688 + }, + { + "epoch": 4.169916954371559, + "grad_norm": NaN, + "learning_rate": 6.756990846709443e-05, + "loss": 0.0, + "step": 44689 + }, + { + "epoch": 4.170010264066437, + "grad_norm": NaN, + "learning_rate": 6.756358924639068e-05, + "loss": 0.0, + "step": 44690 + }, + { + "epoch": 4.170103573761314, + "grad_norm": NaN, + "learning_rate": 6.755727023529827e-05, + "loss": 0.0, + "step": 44691 + }, + { + "epoch": 4.170196883456191, + "grad_norm": NaN, + "learning_rate": 6.755095143383341e-05, + "loss": 0.0, + "step": 44692 + }, + { + "epoch": 4.170290193151068, + "grad_norm": NaN, + "learning_rate": 6.7544632842012e-05, + "loss": 0.0, + "step": 44693 + }, + { + "epoch": 4.1703835028459455, + "grad_norm": NaN, + "learning_rate": 6.753831445985012e-05, + "loss": 0.0, + "step": 44694 + }, + { + "epoch": 4.170476812540823, + "grad_norm": NaN, + "learning_rate": 6.753199628736397e-05, + "loss": 0.0, + "step": 44695 + }, + { + "epoch": 4.1705701222357, + "grad_norm": NaN, + "learning_rate": 6.752567832456948e-05, + "loss": 0.0, + "step": 44696 + }, + { + "epoch": 4.170663431930578, + "grad_norm": NaN, + "learning_rate": 6.75193605714827e-05, + "loss": 0.0, + "step": 44697 + }, + { + "epoch": 4.170756741625455, + "grad_norm": NaN, + "learning_rate": 6.751304302811987e-05, + "loss": 0.0, + "step": 44698 + }, + { + "epoch": 4.170850051320333, + "grad_norm": NaN, + "learning_rate": 6.750672569449687e-05, + "loss": 0.0, + "step": 44699 + }, + { + "epoch": 4.170943361015209, + "grad_norm": NaN, + "learning_rate": 6.750040857062978e-05, + "loss": 0.0, + "step": 44700 + }, + { + "epoch": 4.1710366707100865, + "grad_norm": NaN, + "learning_rate": 6.749409165653482e-05, + "loss": 0.0, + "step": 44701 + }, + { + "epoch": 4.171129980404964, + "grad_norm": NaN, + "learning_rate": 6.74877749522279e-05, + "loss": 0.0, + "step": 44702 + }, + { + "epoch": 4.171223290099841, + "grad_norm": NaN, + "learning_rate": 6.748145845772508e-05, + "loss": 0.0, + "step": 44703 + }, + { + "epoch": 4.171316599794719, + "grad_norm": NaN, + "learning_rate": 6.747514217304256e-05, + "loss": 0.0, + "step": 44704 + }, + { + "epoch": 4.171409909489596, + "grad_norm": NaN, + "learning_rate": 6.746882609819626e-05, + "loss": 0.0, + "step": 44705 + }, + { + "epoch": 4.171503219184474, + "grad_norm": NaN, + "learning_rate": 6.746251023320223e-05, + "loss": 0.0, + "step": 44706 + }, + { + "epoch": 4.17159652887935, + "grad_norm": NaN, + "learning_rate": 6.74561945780767e-05, + "loss": 0.0, + "step": 44707 + }, + { + "epoch": 4.171689838574228, + "grad_norm": NaN, + "learning_rate": 6.744987913283555e-05, + "loss": 0.0, + "step": 44708 + }, + { + "epoch": 4.171783148269105, + "grad_norm": NaN, + "learning_rate": 6.744356389749486e-05, + "loss": 0.0, + "step": 44709 + }, + { + "epoch": 4.171876457963982, + "grad_norm": NaN, + "learning_rate": 6.743724887207084e-05, + "loss": 0.0, + "step": 44710 + }, + { + "epoch": 4.17196976765886, + "grad_norm": NaN, + "learning_rate": 6.743093405657933e-05, + "loss": 0.0, + "step": 44711 + }, + { + "epoch": 4.172063077353737, + "grad_norm": NaN, + "learning_rate": 6.74246194510366e-05, + "loss": 0.0, + "step": 44712 + }, + { + "epoch": 4.172156387048615, + "grad_norm": NaN, + "learning_rate": 6.741830505545857e-05, + "loss": 0.0, + "step": 44713 + }, + { + "epoch": 4.172249696743491, + "grad_norm": NaN, + "learning_rate": 6.741199086986123e-05, + "loss": 0.0, + "step": 44714 + }, + { + "epoch": 4.172343006438369, + "grad_norm": NaN, + "learning_rate": 6.740567689426088e-05, + "loss": 0.0, + "step": 44715 + }, + { + "epoch": 4.172436316133246, + "grad_norm": NaN, + "learning_rate": 6.739936312867335e-05, + "loss": 0.0, + "step": 44716 + }, + { + "epoch": 4.1725296258281235, + "grad_norm": NaN, + "learning_rate": 6.739304957311473e-05, + "loss": 0.0, + "step": 44717 + }, + { + "epoch": 4.172622935523001, + "grad_norm": NaN, + "learning_rate": 6.738673622760122e-05, + "loss": 0.0, + "step": 44718 + }, + { + "epoch": 4.172716245217878, + "grad_norm": NaN, + "learning_rate": 6.738042309214871e-05, + "loss": 0.0, + "step": 44719 + }, + { + "epoch": 4.172809554912756, + "grad_norm": NaN, + "learning_rate": 6.737411016677327e-05, + "loss": 0.0, + "step": 44720 + }, + { + "epoch": 4.172902864607633, + "grad_norm": NaN, + "learning_rate": 6.736779745149109e-05, + "loss": 0.0, + "step": 44721 + }, + { + "epoch": 4.17299617430251, + "grad_norm": NaN, + "learning_rate": 6.736148494631807e-05, + "loss": 0.0, + "step": 44722 + }, + { + "epoch": 4.173089483997387, + "grad_norm": NaN, + "learning_rate": 6.735517265127026e-05, + "loss": 0.0, + "step": 44723 + }, + { + "epoch": 4.1731827936922645, + "grad_norm": NaN, + "learning_rate": 6.734886056636388e-05, + "loss": 0.0, + "step": 44724 + }, + { + "epoch": 4.173276103387142, + "grad_norm": NaN, + "learning_rate": 6.73425486916148e-05, + "loss": 0.0, + "step": 44725 + }, + { + "epoch": 4.173369413082019, + "grad_norm": NaN, + "learning_rate": 6.733623702703908e-05, + "loss": 0.0, + "step": 44726 + }, + { + "epoch": 4.173462722776897, + "grad_norm": NaN, + "learning_rate": 6.732992557265294e-05, + "loss": 0.0, + "step": 44727 + }, + { + "epoch": 4.173556032471774, + "grad_norm": NaN, + "learning_rate": 6.732361432847224e-05, + "loss": 0.0, + "step": 44728 + }, + { + "epoch": 4.173649342166651, + "grad_norm": NaN, + "learning_rate": 6.731730329451306e-05, + "loss": 0.0, + "step": 44729 + }, + { + "epoch": 4.173742651861528, + "grad_norm": NaN, + "learning_rate": 6.731099247079159e-05, + "loss": 0.0, + "step": 44730 + }, + { + "epoch": 4.173835961556406, + "grad_norm": NaN, + "learning_rate": 6.73046818573237e-05, + "loss": 0.0, + "step": 44731 + }, + { + "epoch": 4.173929271251283, + "grad_norm": NaN, + "learning_rate": 6.729837145412549e-05, + "loss": 0.0, + "step": 44732 + }, + { + "epoch": 4.17402258094616, + "grad_norm": NaN, + "learning_rate": 6.729206126121311e-05, + "loss": 0.0, + "step": 44733 + }, + { + "epoch": 4.174115890641038, + "grad_norm": NaN, + "learning_rate": 6.728575127860246e-05, + "loss": 0.0, + "step": 44734 + }, + { + "epoch": 4.174209200335915, + "grad_norm": NaN, + "learning_rate": 6.727944150630958e-05, + "loss": 0.0, + "step": 44735 + }, + { + "epoch": 4.174302510030792, + "grad_norm": NaN, + "learning_rate": 6.727313194435071e-05, + "loss": 0.0, + "step": 44736 + }, + { + "epoch": 4.174395819725669, + "grad_norm": NaN, + "learning_rate": 6.726682259274169e-05, + "loss": 0.0, + "step": 44737 + }, + { + "epoch": 4.174489129420547, + "grad_norm": NaN, + "learning_rate": 6.726051345149858e-05, + "loss": 0.0, + "step": 44738 + }, + { + "epoch": 4.174582439115424, + "grad_norm": NaN, + "learning_rate": 6.72542045206376e-05, + "loss": 0.0, + "step": 44739 + }, + { + "epoch": 4.1746757488103015, + "grad_norm": NaN, + "learning_rate": 6.72478958001746e-05, + "loss": 0.0, + "step": 44740 + }, + { + "epoch": 4.174769058505179, + "grad_norm": NaN, + "learning_rate": 6.724158729012562e-05, + "loss": 0.0, + "step": 44741 + }, + { + "epoch": 4.174862368200056, + "grad_norm": NaN, + "learning_rate": 6.72352789905069e-05, + "loss": 0.0, + "step": 44742 + }, + { + "epoch": 4.174955677894934, + "grad_norm": NaN, + "learning_rate": 6.722897090133428e-05, + "loss": 0.0, + "step": 44743 + }, + { + "epoch": 4.17504898758981, + "grad_norm": NaN, + "learning_rate": 6.722266302262382e-05, + "loss": 0.0, + "step": 44744 + }, + { + "epoch": 4.175142297284688, + "grad_norm": NaN, + "learning_rate": 6.721635535439172e-05, + "loss": 0.0, + "step": 44745 + }, + { + "epoch": 4.175235606979565, + "grad_norm": NaN, + "learning_rate": 6.721004789665385e-05, + "loss": 0.0, + "step": 44746 + }, + { + "epoch": 4.1753289166744425, + "grad_norm": NaN, + "learning_rate": 6.720374064942624e-05, + "loss": 0.0, + "step": 44747 + }, + { + "epoch": 4.17542222636932, + "grad_norm": NaN, + "learning_rate": 6.719743361272511e-05, + "loss": 0.0, + "step": 44748 + }, + { + "epoch": 4.175515536064197, + "grad_norm": NaN, + "learning_rate": 6.719112678656626e-05, + "loss": 0.0, + "step": 44749 + }, + { + "epoch": 4.175608845759075, + "grad_norm": NaN, + "learning_rate": 6.71848201709659e-05, + "loss": 0.0, + "step": 44750 + }, + { + "epoch": 4.175702155453951, + "grad_norm": NaN, + "learning_rate": 6.717851376594008e-05, + "loss": 0.0, + "step": 44751 + }, + { + "epoch": 4.175795465148829, + "grad_norm": NaN, + "learning_rate": 6.717220757150465e-05, + "loss": 0.0, + "step": 44752 + }, + { + "epoch": 4.175888774843706, + "grad_norm": NaN, + "learning_rate": 6.716590158767581e-05, + "loss": 0.0, + "step": 44753 + }, + { + "epoch": 4.1759820845385835, + "grad_norm": NaN, + "learning_rate": 6.715959581446959e-05, + "loss": 0.0, + "step": 44754 + }, + { + "epoch": 4.176075394233461, + "grad_norm": NaN, + "learning_rate": 6.71532902519019e-05, + "loss": 0.0, + "step": 44755 + }, + { + "epoch": 4.176168703928338, + "grad_norm": NaN, + "learning_rate": 6.714698489998893e-05, + "loss": 0.0, + "step": 44756 + }, + { + "epoch": 4.176262013623216, + "grad_norm": NaN, + "learning_rate": 6.71406797587466e-05, + "loss": 0.0, + "step": 44757 + }, + { + "epoch": 4.176355323318092, + "grad_norm": NaN, + "learning_rate": 6.71343748281909e-05, + "loss": 0.0, + "step": 44758 + }, + { + "epoch": 4.17644863301297, + "grad_norm": NaN, + "learning_rate": 6.712807010833808e-05, + "loss": 0.0, + "step": 44759 + }, + { + "epoch": 4.176541942707847, + "grad_norm": NaN, + "learning_rate": 6.712176559920396e-05, + "loss": 0.0, + "step": 44760 + }, + { + "epoch": 4.176635252402725, + "grad_norm": NaN, + "learning_rate": 6.711546130080457e-05, + "loss": 0.0, + "step": 44761 + }, + { + "epoch": 4.176728562097602, + "grad_norm": NaN, + "learning_rate": 6.710915721315613e-05, + "loss": 0.0, + "step": 44762 + }, + { + "epoch": 4.176821871792479, + "grad_norm": NaN, + "learning_rate": 6.710285333627451e-05, + "loss": 0.0, + "step": 44763 + }, + { + "epoch": 4.176915181487357, + "grad_norm": NaN, + "learning_rate": 6.709654967017569e-05, + "loss": 0.0, + "step": 44764 + }, + { + "epoch": 4.177008491182233, + "grad_norm": NaN, + "learning_rate": 6.709024621487592e-05, + "loss": 0.0, + "step": 44765 + }, + { + "epoch": 4.177101800877111, + "grad_norm": NaN, + "learning_rate": 6.708394297039104e-05, + "loss": 0.0, + "step": 44766 + }, + { + "epoch": 4.177195110571988, + "grad_norm": NaN, + "learning_rate": 6.707763993673706e-05, + "loss": 0.0, + "step": 44767 + }, + { + "epoch": 4.177288420266866, + "grad_norm": NaN, + "learning_rate": 6.707133711393022e-05, + "loss": 0.0, + "step": 44768 + }, + { + "epoch": 4.177381729961743, + "grad_norm": NaN, + "learning_rate": 6.706503450198632e-05, + "loss": 0.0, + "step": 44769 + }, + { + "epoch": 4.1774750396566205, + "grad_norm": NaN, + "learning_rate": 6.705873210092142e-05, + "loss": 0.0, + "step": 44770 + }, + { + "epoch": 4.177568349351498, + "grad_norm": NaN, + "learning_rate": 6.705242991075171e-05, + "loss": 0.0, + "step": 44771 + }, + { + "epoch": 4.177661659046375, + "grad_norm": NaN, + "learning_rate": 6.704612793149304e-05, + "loss": 0.0, + "step": 44772 + }, + { + "epoch": 4.177754968741252, + "grad_norm": NaN, + "learning_rate": 6.703982616316145e-05, + "loss": 0.0, + "step": 44773 + }, + { + "epoch": 4.177848278436129, + "grad_norm": NaN, + "learning_rate": 6.70335246057731e-05, + "loss": 0.0, + "step": 44774 + }, + { + "epoch": 4.177941588131007, + "grad_norm": NaN, + "learning_rate": 6.702722325934388e-05, + "loss": 0.0, + "step": 44775 + }, + { + "epoch": 4.178034897825884, + "grad_norm": NaN, + "learning_rate": 6.702092212388978e-05, + "loss": 0.0, + "step": 44776 + }, + { + "epoch": 4.1781282075207615, + "grad_norm": NaN, + "learning_rate": 6.701462119942703e-05, + "loss": 0.0, + "step": 44777 + }, + { + "epoch": 4.178221517215639, + "grad_norm": NaN, + "learning_rate": 6.700832048597145e-05, + "loss": 0.0, + "step": 44778 + }, + { + "epoch": 4.178314826910516, + "grad_norm": NaN, + "learning_rate": 6.700201998353908e-05, + "loss": 0.0, + "step": 44779 + }, + { + "epoch": 4.178408136605393, + "grad_norm": NaN, + "learning_rate": 6.699571969214609e-05, + "loss": 0.0, + "step": 44780 + }, + { + "epoch": 4.17850144630027, + "grad_norm": NaN, + "learning_rate": 6.698941961180833e-05, + "loss": 0.0, + "step": 44781 + }, + { + "epoch": 4.178594755995148, + "grad_norm": NaN, + "learning_rate": 6.698311974254184e-05, + "loss": 0.0, + "step": 44782 + }, + { + "epoch": 4.178688065690025, + "grad_norm": NaN, + "learning_rate": 6.697682008436281e-05, + "loss": 0.0, + "step": 44783 + }, + { + "epoch": 4.178781375384903, + "grad_norm": NaN, + "learning_rate": 6.697052063728706e-05, + "loss": 0.0, + "step": 44784 + }, + { + "epoch": 4.17887468507978, + "grad_norm": NaN, + "learning_rate": 6.696422140133062e-05, + "loss": 0.0, + "step": 44785 + }, + { + "epoch": 4.178967994774657, + "grad_norm": NaN, + "learning_rate": 6.69579223765097e-05, + "loss": 0.0, + "step": 44786 + }, + { + "epoch": 4.179061304469535, + "grad_norm": NaN, + "learning_rate": 6.695162356284007e-05, + "loss": 0.0, + "step": 44787 + }, + { + "epoch": 4.179154614164411, + "grad_norm": NaN, + "learning_rate": 6.69453249603379e-05, + "loss": 0.0, + "step": 44788 + }, + { + "epoch": 4.179247923859289, + "grad_norm": NaN, + "learning_rate": 6.693902656901923e-05, + "loss": 0.0, + "step": 44789 + }, + { + "epoch": 4.179341233554166, + "grad_norm": NaN, + "learning_rate": 6.69327283888999e-05, + "loss": 0.0, + "step": 44790 + }, + { + "epoch": 4.179434543249044, + "grad_norm": NaN, + "learning_rate": 6.692643041999609e-05, + "loss": 0.0, + "step": 44791 + }, + { + "epoch": 4.179527852943921, + "grad_norm": NaN, + "learning_rate": 6.692013266232381e-05, + "loss": 0.0, + "step": 44792 + }, + { + "epoch": 4.1796211626387985, + "grad_norm": NaN, + "learning_rate": 6.691383511589892e-05, + "loss": 0.0, + "step": 44793 + }, + { + "epoch": 4.179714472333676, + "grad_norm": NaN, + "learning_rate": 6.690753778073759e-05, + "loss": 0.0, + "step": 44794 + }, + { + "epoch": 4.179807782028552, + "grad_norm": NaN, + "learning_rate": 6.690124065685584e-05, + "loss": 0.0, + "step": 44795 + }, + { + "epoch": 4.17990109172343, + "grad_norm": NaN, + "learning_rate": 6.68949437442695e-05, + "loss": 0.0, + "step": 44796 + }, + { + "epoch": 4.179994401418307, + "grad_norm": NaN, + "learning_rate": 6.688864704299476e-05, + "loss": 0.0, + "step": 44797 + }, + { + "epoch": 4.180087711113185, + "grad_norm": NaN, + "learning_rate": 6.688235055304761e-05, + "loss": 0.0, + "step": 44798 + }, + { + "epoch": 4.180181020808062, + "grad_norm": NaN, + "learning_rate": 6.687605427444394e-05, + "loss": 0.0, + "step": 44799 + }, + { + "epoch": 4.1802743305029395, + "grad_norm": NaN, + "learning_rate": 6.686975820719992e-05, + "loss": 0.0, + "step": 44800 + }, + { + "epoch": 4.180367640197817, + "grad_norm": NaN, + "learning_rate": 6.686346235133144e-05, + "loss": 0.0, + "step": 44801 + }, + { + "epoch": 4.1804609498926935, + "grad_norm": NaN, + "learning_rate": 6.68571667068545e-05, + "loss": 0.0, + "step": 44802 + }, + { + "epoch": 4.180554259587571, + "grad_norm": NaN, + "learning_rate": 6.685087127378527e-05, + "loss": 0.0, + "step": 44803 + }, + { + "epoch": 4.180647569282448, + "grad_norm": NaN, + "learning_rate": 6.684457605213955e-05, + "loss": 0.0, + "step": 44804 + }, + { + "epoch": 4.180740878977326, + "grad_norm": NaN, + "learning_rate": 6.683828104193342e-05, + "loss": 0.0, + "step": 44805 + }, + { + "epoch": 4.180834188672203, + "grad_norm": NaN, + "learning_rate": 6.683198624318303e-05, + "loss": 0.0, + "step": 44806 + }, + { + "epoch": 4.1809274983670806, + "grad_norm": NaN, + "learning_rate": 6.682569165590418e-05, + "loss": 0.0, + "step": 44807 + }, + { + "epoch": 4.181020808061958, + "grad_norm": NaN, + "learning_rate": 6.681939728011291e-05, + "loss": 0.0, + "step": 44808 + }, + { + "epoch": 4.1811141177568345, + "grad_norm": NaN, + "learning_rate": 6.681310311582538e-05, + "loss": 0.0, + "step": 44809 + }, + { + "epoch": 4.181207427451712, + "grad_norm": NaN, + "learning_rate": 6.680680916305745e-05, + "loss": 0.0, + "step": 44810 + }, + { + "epoch": 4.181300737146589, + "grad_norm": NaN, + "learning_rate": 6.680051542182509e-05, + "loss": 0.0, + "step": 44811 + }, + { + "epoch": 4.181394046841467, + "grad_norm": NaN, + "learning_rate": 6.679422189214448e-05, + "loss": 0.0, + "step": 44812 + }, + { + "epoch": 4.181487356536344, + "grad_norm": NaN, + "learning_rate": 6.678792857403146e-05, + "loss": 0.0, + "step": 44813 + }, + { + "epoch": 4.181580666231222, + "grad_norm": NaN, + "learning_rate": 6.678163546750205e-05, + "loss": 0.0, + "step": 44814 + }, + { + "epoch": 4.181673975926099, + "grad_norm": NaN, + "learning_rate": 6.677534257257237e-05, + "loss": 0.0, + "step": 44815 + }, + { + "epoch": 4.181767285620976, + "grad_norm": NaN, + "learning_rate": 6.676904988925832e-05, + "loss": 0.0, + "step": 44816 + }, + { + "epoch": 4.181860595315853, + "grad_norm": NaN, + "learning_rate": 6.676275741757585e-05, + "loss": 0.0, + "step": 44817 + }, + { + "epoch": 4.18195390501073, + "grad_norm": NaN, + "learning_rate": 6.675646515754114e-05, + "loss": 0.0, + "step": 44818 + }, + { + "epoch": 4.182047214705608, + "grad_norm": NaN, + "learning_rate": 6.675017310917003e-05, + "loss": 0.0, + "step": 44819 + }, + { + "epoch": 4.182140524400485, + "grad_norm": NaN, + "learning_rate": 6.674388127247852e-05, + "loss": 0.0, + "step": 44820 + }, + { + "epoch": 4.182233834095363, + "grad_norm": NaN, + "learning_rate": 6.673758964748276e-05, + "loss": 0.0, + "step": 44821 + }, + { + "epoch": 4.18232714379024, + "grad_norm": NaN, + "learning_rate": 6.67312982341986e-05, + "loss": 0.0, + "step": 44822 + }, + { + "epoch": 4.1824204534851175, + "grad_norm": NaN, + "learning_rate": 6.6725007032642e-05, + "loss": 0.0, + "step": 44823 + }, + { + "epoch": 4.182513763179994, + "grad_norm": NaN, + "learning_rate": 6.671871604282918e-05, + "loss": 0.0, + "step": 44824 + }, + { + "epoch": 4.182607072874871, + "grad_norm": NaN, + "learning_rate": 6.671242526477588e-05, + "loss": 0.0, + "step": 44825 + }, + { + "epoch": 4.182700382569749, + "grad_norm": NaN, + "learning_rate": 6.670613469849825e-05, + "loss": 0.0, + "step": 44826 + }, + { + "epoch": 4.182793692264626, + "grad_norm": NaN, + "learning_rate": 6.669984434401232e-05, + "loss": 0.0, + "step": 44827 + }, + { + "epoch": 4.182887001959504, + "grad_norm": NaN, + "learning_rate": 6.669355420133389e-05, + "loss": 0.0, + "step": 44828 + }, + { + "epoch": 4.182980311654381, + "grad_norm": NaN, + "learning_rate": 6.668726427047912e-05, + "loss": 0.0, + "step": 44829 + }, + { + "epoch": 4.1830736213492585, + "grad_norm": NaN, + "learning_rate": 6.668097455146403e-05, + "loss": 0.0, + "step": 44830 + }, + { + "epoch": 4.183166931044135, + "grad_norm": NaN, + "learning_rate": 6.667468504430442e-05, + "loss": 0.0, + "step": 44831 + }, + { + "epoch": 4.1832602407390125, + "grad_norm": NaN, + "learning_rate": 6.666839574901648e-05, + "loss": 0.0, + "step": 44832 + }, + { + "epoch": 4.18335355043389, + "grad_norm": NaN, + "learning_rate": 6.666210666561615e-05, + "loss": 0.0, + "step": 44833 + }, + { + "epoch": 4.183446860128767, + "grad_norm": NaN, + "learning_rate": 6.665581779411931e-05, + "loss": 0.0, + "step": 44834 + }, + { + "epoch": 4.183540169823645, + "grad_norm": NaN, + "learning_rate": 6.664952913454207e-05, + "loss": 0.0, + "step": 44835 + }, + { + "epoch": 4.183633479518522, + "grad_norm": NaN, + "learning_rate": 6.664324068690044e-05, + "loss": 0.0, + "step": 44836 + }, + { + "epoch": 4.1837267892134, + "grad_norm": NaN, + "learning_rate": 6.663695245121027e-05, + "loss": 0.0, + "step": 44837 + }, + { + "epoch": 4.183820098908277, + "grad_norm": NaN, + "learning_rate": 6.663066442748768e-05, + "loss": 0.0, + "step": 44838 + }, + { + "epoch": 4.1839134086031535, + "grad_norm": NaN, + "learning_rate": 6.662437661574868e-05, + "loss": 0.0, + "step": 44839 + }, + { + "epoch": 4.184006718298031, + "grad_norm": NaN, + "learning_rate": 6.661808901600907e-05, + "loss": 0.0, + "step": 44840 + }, + { + "epoch": 4.184100027992908, + "grad_norm": NaN, + "learning_rate": 6.661180162828501e-05, + "loss": 0.0, + "step": 44841 + }, + { + "epoch": 4.184193337687786, + "grad_norm": NaN, + "learning_rate": 6.660551445259251e-05, + "loss": 0.0, + "step": 44842 + }, + { + "epoch": 4.184286647382663, + "grad_norm": NaN, + "learning_rate": 6.659922748894736e-05, + "loss": 0.0, + "step": 44843 + }, + { + "epoch": 4.184379957077541, + "grad_norm": NaN, + "learning_rate": 6.659294073736578e-05, + "loss": 0.0, + "step": 44844 + }, + { + "epoch": 4.184473266772418, + "grad_norm": NaN, + "learning_rate": 6.65866541978636e-05, + "loss": 0.0, + "step": 44845 + }, + { + "epoch": 4.184566576467295, + "grad_norm": NaN, + "learning_rate": 6.658036787045678e-05, + "loss": 0.0, + "step": 44846 + }, + { + "epoch": 4.184659886162172, + "grad_norm": NaN, + "learning_rate": 6.65740817551615e-05, + "loss": 0.0, + "step": 44847 + }, + { + "epoch": 4.184753195857049, + "grad_norm": NaN, + "learning_rate": 6.656779585199355e-05, + "loss": 0.0, + "step": 44848 + }, + { + "epoch": 4.184846505551927, + "grad_norm": NaN, + "learning_rate": 6.656151016096892e-05, + "loss": 0.0, + "step": 44849 + }, + { + "epoch": 4.184939815246804, + "grad_norm": NaN, + "learning_rate": 6.655522468210378e-05, + "loss": 0.0, + "step": 44850 + }, + { + "epoch": 4.185033124941682, + "grad_norm": NaN, + "learning_rate": 6.654893941541392e-05, + "loss": 0.0, + "step": 44851 + }, + { + "epoch": 4.185126434636559, + "grad_norm": NaN, + "learning_rate": 6.654265436091535e-05, + "loss": 0.0, + "step": 44852 + }, + { + "epoch": 4.185219744331436, + "grad_norm": NaN, + "learning_rate": 6.653636951862417e-05, + "loss": 0.0, + "step": 44853 + }, + { + "epoch": 4.185313054026313, + "grad_norm": NaN, + "learning_rate": 6.653008488855623e-05, + "loss": 0.0, + "step": 44854 + }, + { + "epoch": 4.1854063637211905, + "grad_norm": NaN, + "learning_rate": 6.652380047072752e-05, + "loss": 0.0, + "step": 44855 + }, + { + "epoch": 4.185499673416068, + "grad_norm": NaN, + "learning_rate": 6.651751626515415e-05, + "loss": 0.0, + "step": 44856 + }, + { + "epoch": 4.185592983110945, + "grad_norm": NaN, + "learning_rate": 6.651123227185195e-05, + "loss": 0.0, + "step": 44857 + }, + { + "epoch": 4.185686292805823, + "grad_norm": NaN, + "learning_rate": 6.650494849083692e-05, + "loss": 0.0, + "step": 44858 + }, + { + "epoch": 4.1857796025007, + "grad_norm": NaN, + "learning_rate": 6.649866492212517e-05, + "loss": 0.0, + "step": 44859 + }, + { + "epoch": 4.1858729121955776, + "grad_norm": NaN, + "learning_rate": 6.649238156573254e-05, + "loss": 0.0, + "step": 44860 + }, + { + "epoch": 4.185966221890454, + "grad_norm": NaN, + "learning_rate": 6.648609842167497e-05, + "loss": 0.0, + "step": 44861 + }, + { + "epoch": 4.1860595315853315, + "grad_norm": NaN, + "learning_rate": 6.647981548996863e-05, + "loss": 0.0, + "step": 44862 + }, + { + "epoch": 4.186152841280209, + "grad_norm": NaN, + "learning_rate": 6.647353277062929e-05, + "loss": 0.0, + "step": 44863 + }, + { + "epoch": 4.186246150975086, + "grad_norm": NaN, + "learning_rate": 6.646725026367304e-05, + "loss": 0.0, + "step": 44864 + }, + { + "epoch": 4.186339460669964, + "grad_norm": NaN, + "learning_rate": 6.64609679691159e-05, + "loss": 0.0, + "step": 44865 + }, + { + "epoch": 4.186432770364841, + "grad_norm": NaN, + "learning_rate": 6.645468588697365e-05, + "loss": 0.0, + "step": 44866 + }, + { + "epoch": 4.186526080059719, + "grad_norm": NaN, + "learning_rate": 6.644840401726245e-05, + "loss": 0.0, + "step": 44867 + }, + { + "epoch": 4.186619389754595, + "grad_norm": NaN, + "learning_rate": 6.644212235999826e-05, + "loss": 0.0, + "step": 44868 + }, + { + "epoch": 4.1867126994494726, + "grad_norm": NaN, + "learning_rate": 6.64358409151969e-05, + "loss": 0.0, + "step": 44869 + }, + { + "epoch": 4.18680600914435, + "grad_norm": NaN, + "learning_rate": 6.642955968287448e-05, + "loss": 0.0, + "step": 44870 + }, + { + "epoch": 4.186899318839227, + "grad_norm": NaN, + "learning_rate": 6.6423278663047e-05, + "loss": 0.0, + "step": 44871 + }, + { + "epoch": 4.186992628534105, + "grad_norm": NaN, + "learning_rate": 6.641699785573026e-05, + "loss": 0.0, + "step": 44872 + }, + { + "epoch": 4.187085938228982, + "grad_norm": NaN, + "learning_rate": 6.641071726094039e-05, + "loss": 0.0, + "step": 44873 + }, + { + "epoch": 4.18717924792386, + "grad_norm": NaN, + "learning_rate": 6.640443687869333e-05, + "loss": 0.0, + "step": 44874 + }, + { + "epoch": 4.187272557618736, + "grad_norm": NaN, + "learning_rate": 6.639815670900495e-05, + "loss": 0.0, + "step": 44875 + }, + { + "epoch": 4.187365867313614, + "grad_norm": NaN, + "learning_rate": 6.639187675189134e-05, + "loss": 0.0, + "step": 44876 + }, + { + "epoch": 4.187459177008491, + "grad_norm": NaN, + "learning_rate": 6.638559700736846e-05, + "loss": 0.0, + "step": 44877 + }, + { + "epoch": 4.187552486703368, + "grad_norm": NaN, + "learning_rate": 6.637931747545215e-05, + "loss": 0.0, + "step": 44878 + }, + { + "epoch": 4.187645796398246, + "grad_norm": NaN, + "learning_rate": 6.637303815615851e-05, + "loss": 0.0, + "step": 44879 + }, + { + "epoch": 4.187739106093123, + "grad_norm": NaN, + "learning_rate": 6.636675904950352e-05, + "loss": 0.0, + "step": 44880 + }, + { + "epoch": 4.187832415788001, + "grad_norm": NaN, + "learning_rate": 6.636048015550299e-05, + "loss": 0.0, + "step": 44881 + }, + { + "epoch": 4.187925725482877, + "grad_norm": NaN, + "learning_rate": 6.635420147417302e-05, + "loss": 0.0, + "step": 44882 + }, + { + "epoch": 4.188019035177755, + "grad_norm": NaN, + "learning_rate": 6.634792300552959e-05, + "loss": 0.0, + "step": 44883 + }, + { + "epoch": 4.188112344872632, + "grad_norm": NaN, + "learning_rate": 6.634164474958851e-05, + "loss": 0.0, + "step": 44884 + }, + { + "epoch": 4.1882056545675095, + "grad_norm": NaN, + "learning_rate": 6.633536670636591e-05, + "loss": 0.0, + "step": 44885 + }, + { + "epoch": 4.188298964262387, + "grad_norm": NaN, + "learning_rate": 6.632908887587773e-05, + "loss": 0.0, + "step": 44886 + }, + { + "epoch": 4.188392273957264, + "grad_norm": NaN, + "learning_rate": 6.632281125813977e-05, + "loss": 0.0, + "step": 44887 + }, + { + "epoch": 4.188485583652142, + "grad_norm": NaN, + "learning_rate": 6.63165338531682e-05, + "loss": 0.0, + "step": 44888 + }, + { + "epoch": 4.188578893347019, + "grad_norm": NaN, + "learning_rate": 6.631025666097894e-05, + "loss": 0.0, + "step": 44889 + }, + { + "epoch": 4.188672203041896, + "grad_norm": NaN, + "learning_rate": 6.63039796815878e-05, + "loss": 0.0, + "step": 44890 + }, + { + "epoch": 4.188765512736773, + "grad_norm": NaN, + "learning_rate": 6.629770291501093e-05, + "loss": 0.0, + "step": 44891 + }, + { + "epoch": 4.1888588224316505, + "grad_norm": NaN, + "learning_rate": 6.629142636126417e-05, + "loss": 0.0, + "step": 44892 + }, + { + "epoch": 4.188952132126528, + "grad_norm": NaN, + "learning_rate": 6.628515002036348e-05, + "loss": 0.0, + "step": 44893 + }, + { + "epoch": 4.189045441821405, + "grad_norm": NaN, + "learning_rate": 6.627887389232495e-05, + "loss": 0.0, + "step": 44894 + }, + { + "epoch": 4.189138751516283, + "grad_norm": NaN, + "learning_rate": 6.627259797716437e-05, + "loss": 0.0, + "step": 44895 + }, + { + "epoch": 4.18923206121116, + "grad_norm": NaN, + "learning_rate": 6.626632227489773e-05, + "loss": 0.0, + "step": 44896 + }, + { + "epoch": 4.189325370906037, + "grad_norm": NaN, + "learning_rate": 6.626004678554115e-05, + "loss": 0.0, + "step": 44897 + }, + { + "epoch": 4.189418680600914, + "grad_norm": NaN, + "learning_rate": 6.625377150911041e-05, + "loss": 0.0, + "step": 44898 + }, + { + "epoch": 4.189511990295792, + "grad_norm": NaN, + "learning_rate": 6.624749644562145e-05, + "loss": 0.0, + "step": 44899 + }, + { + "epoch": 4.189605299990669, + "grad_norm": NaN, + "learning_rate": 6.624122159509043e-05, + "loss": 0.0, + "step": 44900 + }, + { + "epoch": 4.189698609685546, + "grad_norm": NaN, + "learning_rate": 6.623494695753305e-05, + "loss": 0.0, + "step": 44901 + }, + { + "epoch": 4.189791919380424, + "grad_norm": NaN, + "learning_rate": 6.622867253296544e-05, + "loss": 0.0, + "step": 44902 + }, + { + "epoch": 4.189885229075301, + "grad_norm": NaN, + "learning_rate": 6.622239832140355e-05, + "loss": 0.0, + "step": 44903 + }, + { + "epoch": 4.189978538770179, + "grad_norm": NaN, + "learning_rate": 6.621612432286318e-05, + "loss": 0.0, + "step": 44904 + }, + { + "epoch": 4.190071848465055, + "grad_norm": NaN, + "learning_rate": 6.620985053736043e-05, + "loss": 0.0, + "step": 44905 + }, + { + "epoch": 4.190165158159933, + "grad_norm": NaN, + "learning_rate": 6.620357696491128e-05, + "loss": 0.0, + "step": 44906 + }, + { + "epoch": 4.19025846785481, + "grad_norm": NaN, + "learning_rate": 6.61973036055315e-05, + "loss": 0.0, + "step": 44907 + }, + { + "epoch": 4.1903517775496875, + "grad_norm": NaN, + "learning_rate": 6.61910304592372e-05, + "loss": 0.0, + "step": 44908 + }, + { + "epoch": 4.190445087244565, + "grad_norm": NaN, + "learning_rate": 6.618475752604435e-05, + "loss": 0.0, + "step": 44909 + }, + { + "epoch": 4.190538396939442, + "grad_norm": NaN, + "learning_rate": 6.617848480596873e-05, + "loss": 0.0, + "step": 44910 + }, + { + "epoch": 4.19063170663432, + "grad_norm": NaN, + "learning_rate": 6.617221229902644e-05, + "loss": 0.0, + "step": 44911 + }, + { + "epoch": 4.190725016329196, + "grad_norm": NaN, + "learning_rate": 6.616594000523344e-05, + "loss": 0.0, + "step": 44912 + }, + { + "epoch": 4.190818326024074, + "grad_norm": NaN, + "learning_rate": 6.61596679246055e-05, + "loss": 0.0, + "step": 44913 + }, + { + "epoch": 4.190911635718951, + "grad_norm": NaN, + "learning_rate": 6.615339605715874e-05, + "loss": 0.0, + "step": 44914 + }, + { + "epoch": 4.1910049454138285, + "grad_norm": NaN, + "learning_rate": 6.614712440290912e-05, + "loss": 0.0, + "step": 44915 + }, + { + "epoch": 4.191098255108706, + "grad_norm": NaN, + "learning_rate": 6.614085296187243e-05, + "loss": 0.0, + "step": 44916 + }, + { + "epoch": 4.191191564803583, + "grad_norm": NaN, + "learning_rate": 6.613458173406473e-05, + "loss": 0.0, + "step": 44917 + }, + { + "epoch": 4.191284874498461, + "grad_norm": NaN, + "learning_rate": 6.612831071950204e-05, + "loss": 0.0, + "step": 44918 + }, + { + "epoch": 4.191378184193337, + "grad_norm": NaN, + "learning_rate": 6.612203991820008e-05, + "loss": 0.0, + "step": 44919 + }, + { + "epoch": 4.191471493888215, + "grad_norm": NaN, + "learning_rate": 6.611576933017498e-05, + "loss": 0.0, + "step": 44920 + }, + { + "epoch": 4.191564803583092, + "grad_norm": NaN, + "learning_rate": 6.610949895544268e-05, + "loss": 0.0, + "step": 44921 + }, + { + "epoch": 4.1916581132779696, + "grad_norm": NaN, + "learning_rate": 6.610322879401897e-05, + "loss": 0.0, + "step": 44922 + }, + { + "epoch": 4.191751422972847, + "grad_norm": NaN, + "learning_rate": 6.609695884591997e-05, + "loss": 0.0, + "step": 44923 + }, + { + "epoch": 4.191844732667724, + "grad_norm": NaN, + "learning_rate": 6.609068911116158e-05, + "loss": 0.0, + "step": 44924 + }, + { + "epoch": 4.191938042362602, + "grad_norm": NaN, + "learning_rate": 6.608441958975961e-05, + "loss": 0.0, + "step": 44925 + }, + { + "epoch": 4.192031352057478, + "grad_norm": NaN, + "learning_rate": 6.607815028173016e-05, + "loss": 0.0, + "step": 44926 + }, + { + "epoch": 4.192124661752356, + "grad_norm": NaN, + "learning_rate": 6.607188118708918e-05, + "loss": 0.0, + "step": 44927 + }, + { + "epoch": 4.192217971447233, + "grad_norm": NaN, + "learning_rate": 6.606561230585242e-05, + "loss": 0.0, + "step": 44928 + }, + { + "epoch": 4.192311281142111, + "grad_norm": NaN, + "learning_rate": 6.6059343638036e-05, + "loss": 0.0, + "step": 44929 + }, + { + "epoch": 4.192404590836988, + "grad_norm": NaN, + "learning_rate": 6.605307518365586e-05, + "loss": 0.0, + "step": 44930 + }, + { + "epoch": 4.192497900531865, + "grad_norm": NaN, + "learning_rate": 6.604680694272778e-05, + "loss": 0.0, + "step": 44931 + }, + { + "epoch": 4.192591210226743, + "grad_norm": NaN, + "learning_rate": 6.604053891526784e-05, + "loss": 0.0, + "step": 44932 + }, + { + "epoch": 4.19268451992162, + "grad_norm": NaN, + "learning_rate": 6.603427110129202e-05, + "loss": 0.0, + "step": 44933 + }, + { + "epoch": 4.192777829616497, + "grad_norm": NaN, + "learning_rate": 6.602800350081604e-05, + "loss": 0.0, + "step": 44934 + }, + { + "epoch": 4.192871139311374, + "grad_norm": NaN, + "learning_rate": 6.602173611385609e-05, + "loss": 0.0, + "step": 44935 + }, + { + "epoch": 4.192964449006252, + "grad_norm": NaN, + "learning_rate": 6.60154689404279e-05, + "loss": 0.0, + "step": 44936 + }, + { + "epoch": 4.193057758701129, + "grad_norm": NaN, + "learning_rate": 6.600920198054753e-05, + "loss": 0.0, + "step": 44937 + }, + { + "epoch": 4.1931510683960065, + "grad_norm": NaN, + "learning_rate": 6.600293523423094e-05, + "loss": 0.0, + "step": 44938 + }, + { + "epoch": 4.193244378090884, + "grad_norm": NaN, + "learning_rate": 6.599666870149389e-05, + "loss": 0.0, + "step": 44939 + }, + { + "epoch": 4.193337687785761, + "grad_norm": NaN, + "learning_rate": 6.599040238235248e-05, + "loss": 0.0, + "step": 44940 + }, + { + "epoch": 4.193430997480638, + "grad_norm": NaN, + "learning_rate": 6.598413627682266e-05, + "loss": 0.0, + "step": 44941 + }, + { + "epoch": 4.193524307175515, + "grad_norm": NaN, + "learning_rate": 6.597787038492015e-05, + "loss": 0.0, + "step": 44942 + }, + { + "epoch": 4.193617616870393, + "grad_norm": NaN, + "learning_rate": 6.597160470666112e-05, + "loss": 0.0, + "step": 44943 + }, + { + "epoch": 4.19371092656527, + "grad_norm": NaN, + "learning_rate": 6.596533924206145e-05, + "loss": 0.0, + "step": 44944 + }, + { + "epoch": 4.1938042362601475, + "grad_norm": NaN, + "learning_rate": 6.595907399113692e-05, + "loss": 0.0, + "step": 44945 + }, + { + "epoch": 4.193897545955025, + "grad_norm": NaN, + "learning_rate": 6.595280895390361e-05, + "loss": 0.0, + "step": 44946 + }, + { + "epoch": 4.193990855649902, + "grad_norm": NaN, + "learning_rate": 6.594654413037748e-05, + "loss": 0.0, + "step": 44947 + }, + { + "epoch": 4.194084165344779, + "grad_norm": NaN, + "learning_rate": 6.594027952057428e-05, + "loss": 0.0, + "step": 44948 + }, + { + "epoch": 4.194177475039656, + "grad_norm": NaN, + "learning_rate": 6.593401512451012e-05, + "loss": 0.0, + "step": 44949 + }, + { + "epoch": 4.194270784734534, + "grad_norm": NaN, + "learning_rate": 6.592775094220089e-05, + "loss": 0.0, + "step": 44950 + }, + { + "epoch": 4.194364094429411, + "grad_norm": NaN, + "learning_rate": 6.592148697366238e-05, + "loss": 0.0, + "step": 44951 + }, + { + "epoch": 4.194457404124289, + "grad_norm": NaN, + "learning_rate": 6.591522321891066e-05, + "loss": 0.0, + "step": 44952 + }, + { + "epoch": 4.194550713819166, + "grad_norm": NaN, + "learning_rate": 6.59089596779617e-05, + "loss": 0.0, + "step": 44953 + }, + { + "epoch": 4.194644023514043, + "grad_norm": NaN, + "learning_rate": 6.590269635083123e-05, + "loss": 0.0, + "step": 44954 + }, + { + "epoch": 4.19473733320892, + "grad_norm": NaN, + "learning_rate": 6.589643323753535e-05, + "loss": 0.0, + "step": 44955 + }, + { + "epoch": 4.194830642903797, + "grad_norm": NaN, + "learning_rate": 6.589017033808998e-05, + "loss": 0.0, + "step": 44956 + }, + { + "epoch": 4.194923952598675, + "grad_norm": NaN, + "learning_rate": 6.588390765251087e-05, + "loss": 0.0, + "step": 44957 + }, + { + "epoch": 4.195017262293552, + "grad_norm": NaN, + "learning_rate": 6.587764518081414e-05, + "loss": 0.0, + "step": 44958 + }, + { + "epoch": 4.19511057198843, + "grad_norm": NaN, + "learning_rate": 6.587138292301566e-05, + "loss": 0.0, + "step": 44959 + }, + { + "epoch": 4.195203881683307, + "grad_norm": NaN, + "learning_rate": 6.586512087913126e-05, + "loss": 0.0, + "step": 44960 + }, + { + "epoch": 4.1952971913781845, + "grad_norm": NaN, + "learning_rate": 6.585885904917699e-05, + "loss": 0.0, + "step": 44961 + }, + { + "epoch": 4.195390501073062, + "grad_norm": NaN, + "learning_rate": 6.585259743316875e-05, + "loss": 0.0, + "step": 44962 + }, + { + "epoch": 4.195483810767938, + "grad_norm": NaN, + "learning_rate": 6.584633603112233e-05, + "loss": 0.0, + "step": 44963 + }, + { + "epoch": 4.195577120462816, + "grad_norm": NaN, + "learning_rate": 6.584007484305379e-05, + "loss": 0.0, + "step": 44964 + }, + { + "epoch": 4.195670430157693, + "grad_norm": NaN, + "learning_rate": 6.583381386897908e-05, + "loss": 0.0, + "step": 44965 + }, + { + "epoch": 4.195763739852571, + "grad_norm": NaN, + "learning_rate": 6.582755310891394e-05, + "loss": 0.0, + "step": 44966 + }, + { + "epoch": 4.195857049547448, + "grad_norm": NaN, + "learning_rate": 6.582129256287444e-05, + "loss": 0.0, + "step": 44967 + }, + { + "epoch": 4.1959503592423255, + "grad_norm": NaN, + "learning_rate": 6.581503223087653e-05, + "loss": 0.0, + "step": 44968 + }, + { + "epoch": 4.196043668937203, + "grad_norm": NaN, + "learning_rate": 6.580877211293593e-05, + "loss": 0.0, + "step": 44969 + }, + { + "epoch": 4.1961369786320795, + "grad_norm": NaN, + "learning_rate": 6.580251220906874e-05, + "loss": 0.0, + "step": 44970 + }, + { + "epoch": 4.196230288326957, + "grad_norm": NaN, + "learning_rate": 6.57962525192909e-05, + "loss": 0.0, + "step": 44971 + }, + { + "epoch": 4.196323598021834, + "grad_norm": NaN, + "learning_rate": 6.578999304361811e-05, + "loss": 0.0, + "step": 44972 + }, + { + "epoch": 4.196416907716712, + "grad_norm": NaN, + "learning_rate": 6.578373378206647e-05, + "loss": 0.0, + "step": 44973 + }, + { + "epoch": 4.196510217411589, + "grad_norm": NaN, + "learning_rate": 6.577747473465188e-05, + "loss": 0.0, + "step": 44974 + }, + { + "epoch": 4.196603527106467, + "grad_norm": NaN, + "learning_rate": 6.57712159013902e-05, + "loss": 0.0, + "step": 44975 + }, + { + "epoch": 4.196696836801344, + "grad_norm": NaN, + "learning_rate": 6.576495728229737e-05, + "loss": 0.0, + "step": 44976 + }, + { + "epoch": 4.196790146496221, + "grad_norm": NaN, + "learning_rate": 6.575869887738931e-05, + "loss": 0.0, + "step": 44977 + }, + { + "epoch": 4.196883456191098, + "grad_norm": NaN, + "learning_rate": 6.575244068668192e-05, + "loss": 0.0, + "step": 44978 + }, + { + "epoch": 4.196976765885975, + "grad_norm": NaN, + "learning_rate": 6.574618271019118e-05, + "loss": 0.0, + "step": 44979 + }, + { + "epoch": 4.197070075580853, + "grad_norm": NaN, + "learning_rate": 6.573992494793282e-05, + "loss": 0.0, + "step": 44980 + }, + { + "epoch": 4.19716338527573, + "grad_norm": NaN, + "learning_rate": 6.573366739992293e-05, + "loss": 0.0, + "step": 44981 + }, + { + "epoch": 4.197256694970608, + "grad_norm": NaN, + "learning_rate": 6.572741006617741e-05, + "loss": 0.0, + "step": 44982 + }, + { + "epoch": 4.197350004665485, + "grad_norm": NaN, + "learning_rate": 6.572115294671202e-05, + "loss": 0.0, + "step": 44983 + }, + { + "epoch": 4.197443314360362, + "grad_norm": NaN, + "learning_rate": 6.571489604154283e-05, + "loss": 0.0, + "step": 44984 + }, + { + "epoch": 4.197536624055239, + "grad_norm": NaN, + "learning_rate": 6.570863935068573e-05, + "loss": 0.0, + "step": 44985 + }, + { + "epoch": 4.197629933750116, + "grad_norm": NaN, + "learning_rate": 6.57023828741565e-05, + "loss": 0.0, + "step": 44986 + }, + { + "epoch": 4.197723243444994, + "grad_norm": NaN, + "learning_rate": 6.569612661197118e-05, + "loss": 0.0, + "step": 44987 + }, + { + "epoch": 4.197816553139871, + "grad_norm": NaN, + "learning_rate": 6.568987056414572e-05, + "loss": 0.0, + "step": 44988 + }, + { + "epoch": 4.197909862834749, + "grad_norm": NaN, + "learning_rate": 6.56836147306958e-05, + "loss": 0.0, + "step": 44989 + }, + { + "epoch": 4.198003172529626, + "grad_norm": NaN, + "learning_rate": 6.567735911163755e-05, + "loss": 0.0, + "step": 44990 + }, + { + "epoch": 4.1980964822245035, + "grad_norm": NaN, + "learning_rate": 6.567110370698685e-05, + "loss": 0.0, + "step": 44991 + }, + { + "epoch": 4.19818979191938, + "grad_norm": NaN, + "learning_rate": 6.566484851675943e-05, + "loss": 0.0, + "step": 44992 + }, + { + "epoch": 4.198283101614257, + "grad_norm": NaN, + "learning_rate": 6.565859354097138e-05, + "loss": 0.0, + "step": 44993 + }, + { + "epoch": 4.198376411309135, + "grad_norm": NaN, + "learning_rate": 6.565233877963861e-05, + "loss": 0.0, + "step": 44994 + }, + { + "epoch": 4.198469721004012, + "grad_norm": NaN, + "learning_rate": 6.564608423277684e-05, + "loss": 0.0, + "step": 44995 + }, + { + "epoch": 4.19856303069889, + "grad_norm": NaN, + "learning_rate": 6.563982990040213e-05, + "loss": 0.0, + "step": 44996 + }, + { + "epoch": 4.198656340393767, + "grad_norm": NaN, + "learning_rate": 6.563357578253042e-05, + "loss": 0.0, + "step": 44997 + }, + { + "epoch": 4.1987496500886445, + "grad_norm": NaN, + "learning_rate": 6.56273218791774e-05, + "loss": 0.0, + "step": 44998 + }, + { + "epoch": 4.198842959783521, + "grad_norm": NaN, + "learning_rate": 6.562106819035919e-05, + "loss": 0.0, + "step": 44999 + }, + { + "epoch": 4.1989362694783985, + "grad_norm": NaN, + "learning_rate": 6.561481471609164e-05, + "loss": 0.0, + "step": 45000 + }, + { + "epoch": 4.199029579173276, + "grad_norm": NaN, + "learning_rate": 6.560856145639053e-05, + "loss": 0.0, + "step": 45001 + }, + { + "epoch": 4.199122888868153, + "grad_norm": NaN, + "learning_rate": 6.56023084112719e-05, + "loss": 0.0, + "step": 45002 + }, + { + "epoch": 4.199216198563031, + "grad_norm": NaN, + "learning_rate": 6.559605558075164e-05, + "loss": 0.0, + "step": 45003 + }, + { + "epoch": 4.199309508257908, + "grad_norm": NaN, + "learning_rate": 6.558980296484552e-05, + "loss": 0.0, + "step": 45004 + }, + { + "epoch": 4.199402817952786, + "grad_norm": NaN, + "learning_rate": 6.558355056356957e-05, + "loss": 0.0, + "step": 45005 + }, + { + "epoch": 4.199496127647663, + "grad_norm": NaN, + "learning_rate": 6.55772983769397e-05, + "loss": 0.0, + "step": 45006 + }, + { + "epoch": 4.1995894373425395, + "grad_norm": NaN, + "learning_rate": 6.557104640497163e-05, + "loss": 0.0, + "step": 45007 + }, + { + "epoch": 4.199682747037417, + "grad_norm": NaN, + "learning_rate": 6.556479464768145e-05, + "loss": 0.0, + "step": 45008 + }, + { + "epoch": 4.199776056732294, + "grad_norm": NaN, + "learning_rate": 6.555854310508505e-05, + "loss": 0.0, + "step": 45009 + }, + { + "epoch": 4.199869366427172, + "grad_norm": NaN, + "learning_rate": 6.555229177719813e-05, + "loss": 0.0, + "step": 45010 + }, + { + "epoch": 4.199962676122049, + "grad_norm": NaN, + "learning_rate": 6.554604066403678e-05, + "loss": 0.0, + "step": 45011 + }, + { + "epoch": 4.200055985816927, + "grad_norm": NaN, + "learning_rate": 6.553978976561683e-05, + "loss": 0.0, + "step": 45012 + }, + { + "epoch": 4.200149295511804, + "grad_norm": NaN, + "learning_rate": 6.553353908195419e-05, + "loss": 0.0, + "step": 45013 + }, + { + "epoch": 4.200242605206681, + "grad_norm": NaN, + "learning_rate": 6.552728861306472e-05, + "loss": 0.0, + "step": 45014 + }, + { + "epoch": 4.200335914901558, + "grad_norm": NaN, + "learning_rate": 6.552103835896433e-05, + "loss": 0.0, + "step": 45015 + }, + { + "epoch": 4.200429224596435, + "grad_norm": NaN, + "learning_rate": 6.551478831966894e-05, + "loss": 0.0, + "step": 45016 + }, + { + "epoch": 4.200522534291313, + "grad_norm": NaN, + "learning_rate": 6.55085384951944e-05, + "loss": 0.0, + "step": 45017 + }, + { + "epoch": 4.20061584398619, + "grad_norm": NaN, + "learning_rate": 6.550228888555661e-05, + "loss": 0.0, + "step": 45018 + }, + { + "epoch": 4.200709153681068, + "grad_norm": NaN, + "learning_rate": 6.549603949077149e-05, + "loss": 0.0, + "step": 45019 + }, + { + "epoch": 4.200802463375945, + "grad_norm": NaN, + "learning_rate": 6.54897903108549e-05, + "loss": 0.0, + "step": 45020 + }, + { + "epoch": 4.2008957730708225, + "grad_norm": NaN, + "learning_rate": 6.548354134582274e-05, + "loss": 0.0, + "step": 45021 + }, + { + "epoch": 4.200989082765699, + "grad_norm": NaN, + "learning_rate": 6.547729259569088e-05, + "loss": 0.0, + "step": 45022 + }, + { + "epoch": 4.2010823924605765, + "grad_norm": NaN, + "learning_rate": 6.547104406047525e-05, + "loss": 0.0, + "step": 45023 + }, + { + "epoch": 4.201175702155454, + "grad_norm": NaN, + "learning_rate": 6.546479574019171e-05, + "loss": 0.0, + "step": 45024 + }, + { + "epoch": 4.201269011850331, + "grad_norm": NaN, + "learning_rate": 6.545854763485616e-05, + "loss": 0.0, + "step": 45025 + }, + { + "epoch": 4.201362321545209, + "grad_norm": NaN, + "learning_rate": 6.545229974448453e-05, + "loss": 0.0, + "step": 45026 + }, + { + "epoch": 4.201455631240086, + "grad_norm": NaN, + "learning_rate": 6.544605206909254e-05, + "loss": 0.0, + "step": 45027 + }, + { + "epoch": 4.201548940934964, + "grad_norm": NaN, + "learning_rate": 6.543980460869626e-05, + "loss": 0.0, + "step": 45028 + }, + { + "epoch": 4.20164225062984, + "grad_norm": NaN, + "learning_rate": 6.543355736331156e-05, + "loss": 0.0, + "step": 45029 + }, + { + "epoch": 4.2017355603247175, + "grad_norm": NaN, + "learning_rate": 6.542731033295414e-05, + "loss": 0.0, + "step": 45030 + }, + { + "epoch": 4.201828870019595, + "grad_norm": NaN, + "learning_rate": 6.54210635176401e-05, + "loss": 0.0, + "step": 45031 + }, + { + "epoch": 4.201922179714472, + "grad_norm": NaN, + "learning_rate": 6.54148169173853e-05, + "loss": 0.0, + "step": 45032 + }, + { + "epoch": 4.20201548940935, + "grad_norm": NaN, + "learning_rate": 6.540857053220543e-05, + "loss": 0.0, + "step": 45033 + }, + { + "epoch": 4.202108799104227, + "grad_norm": NaN, + "learning_rate": 6.540232436211659e-05, + "loss": 0.0, + "step": 45034 + }, + { + "epoch": 4.202202108799105, + "grad_norm": NaN, + "learning_rate": 6.539607840713461e-05, + "loss": 0.0, + "step": 45035 + }, + { + "epoch": 4.202295418493981, + "grad_norm": NaN, + "learning_rate": 6.538983266727524e-05, + "loss": 0.0, + "step": 45036 + }, + { + "epoch": 4.202388728188859, + "grad_norm": NaN, + "learning_rate": 6.538358714255451e-05, + "loss": 0.0, + "step": 45037 + }, + { + "epoch": 4.202482037883736, + "grad_norm": NaN, + "learning_rate": 6.537734183298833e-05, + "loss": 0.0, + "step": 45038 + }, + { + "epoch": 4.202575347578613, + "grad_norm": NaN, + "learning_rate": 6.537109673859238e-05, + "loss": 0.0, + "step": 45039 + }, + { + "epoch": 4.202668657273491, + "grad_norm": NaN, + "learning_rate": 6.536485185938272e-05, + "loss": 0.0, + "step": 45040 + }, + { + "epoch": 4.202761966968368, + "grad_norm": NaN, + "learning_rate": 6.535860719537522e-05, + "loss": 0.0, + "step": 45041 + }, + { + "epoch": 4.202855276663246, + "grad_norm": NaN, + "learning_rate": 6.535236274658558e-05, + "loss": 0.0, + "step": 45042 + }, + { + "epoch": 4.202948586358122, + "grad_norm": NaN, + "learning_rate": 6.534611851302991e-05, + "loss": 0.0, + "step": 45043 + }, + { + "epoch": 4.203041896053, + "grad_norm": NaN, + "learning_rate": 6.533987449472401e-05, + "loss": 0.0, + "step": 45044 + }, + { + "epoch": 4.203135205747877, + "grad_norm": NaN, + "learning_rate": 6.53336306916836e-05, + "loss": 0.0, + "step": 45045 + }, + { + "epoch": 4.203228515442754, + "grad_norm": NaN, + "learning_rate": 6.53273871039248e-05, + "loss": 0.0, + "step": 45046 + }, + { + "epoch": 4.203321825137632, + "grad_norm": NaN, + "learning_rate": 6.532114373146338e-05, + "loss": 0.0, + "step": 45047 + }, + { + "epoch": 4.203415134832509, + "grad_norm": NaN, + "learning_rate": 6.53149005743151e-05, + "loss": 0.0, + "step": 45048 + }, + { + "epoch": 4.203508444527387, + "grad_norm": NaN, + "learning_rate": 6.5308657632496e-05, + "loss": 0.0, + "step": 45049 + }, + { + "epoch": 4.203601754222264, + "grad_norm": NaN, + "learning_rate": 6.530241490602191e-05, + "loss": 0.0, + "step": 45050 + }, + { + "epoch": 4.203695063917141, + "grad_norm": NaN, + "learning_rate": 6.529617239490868e-05, + "loss": 0.0, + "step": 45051 + }, + { + "epoch": 4.203788373612018, + "grad_norm": NaN, + "learning_rate": 6.52899300991722e-05, + "loss": 0.0, + "step": 45052 + }, + { + "epoch": 4.2038816833068955, + "grad_norm": NaN, + "learning_rate": 6.528368801882834e-05, + "loss": 0.0, + "step": 45053 + }, + { + "epoch": 4.203974993001773, + "grad_norm": NaN, + "learning_rate": 6.527744615389296e-05, + "loss": 0.0, + "step": 45054 + }, + { + "epoch": 4.20406830269665, + "grad_norm": NaN, + "learning_rate": 6.527120450438194e-05, + "loss": 0.0, + "step": 45055 + }, + { + "epoch": 4.204161612391528, + "grad_norm": NaN, + "learning_rate": 6.526496307031114e-05, + "loss": 0.0, + "step": 45056 + }, + { + "epoch": 4.204254922086405, + "grad_norm": NaN, + "learning_rate": 6.525872185169644e-05, + "loss": 0.0, + "step": 45057 + }, + { + "epoch": 4.204348231781282, + "grad_norm": NaN, + "learning_rate": 6.52524808485537e-05, + "loss": 0.0, + "step": 45058 + }, + { + "epoch": 4.204441541476159, + "grad_norm": NaN, + "learning_rate": 6.524624006089882e-05, + "loss": 0.0, + "step": 45059 + }, + { + "epoch": 4.2045348511710365, + "grad_norm": NaN, + "learning_rate": 6.523999948874763e-05, + "loss": 0.0, + "step": 45060 + }, + { + "epoch": 4.204628160865914, + "grad_norm": NaN, + "learning_rate": 6.523375913211603e-05, + "loss": 0.0, + "step": 45061 + }, + { + "epoch": 4.204721470560791, + "grad_norm": NaN, + "learning_rate": 6.522751899101984e-05, + "loss": 0.0, + "step": 45062 + }, + { + "epoch": 4.204814780255669, + "grad_norm": NaN, + "learning_rate": 6.5221279065475e-05, + "loss": 0.0, + "step": 45063 + }, + { + "epoch": 4.204908089950546, + "grad_norm": NaN, + "learning_rate": 6.52150393554973e-05, + "loss": 0.0, + "step": 45064 + }, + { + "epoch": 4.205001399645423, + "grad_norm": NaN, + "learning_rate": 6.520879986110265e-05, + "loss": 0.0, + "step": 45065 + }, + { + "epoch": 4.2050947093403, + "grad_norm": NaN, + "learning_rate": 6.52025605823069e-05, + "loss": 0.0, + "step": 45066 + }, + { + "epoch": 4.205188019035178, + "grad_norm": NaN, + "learning_rate": 6.519632151912592e-05, + "loss": 0.0, + "step": 45067 + }, + { + "epoch": 4.205281328730055, + "grad_norm": NaN, + "learning_rate": 6.519008267157558e-05, + "loss": 0.0, + "step": 45068 + }, + { + "epoch": 4.205374638424932, + "grad_norm": NaN, + "learning_rate": 6.518384403967173e-05, + "loss": 0.0, + "step": 45069 + }, + { + "epoch": 4.20546794811981, + "grad_norm": NaN, + "learning_rate": 6.51776056234303e-05, + "loss": 0.0, + "step": 45070 + }, + { + "epoch": 4.205561257814687, + "grad_norm": NaN, + "learning_rate": 6.517136742286698e-05, + "loss": 0.0, + "step": 45071 + }, + { + "epoch": 4.205654567509564, + "grad_norm": NaN, + "learning_rate": 6.516512943799781e-05, + "loss": 0.0, + "step": 45072 + }, + { + "epoch": 4.205747877204441, + "grad_norm": NaN, + "learning_rate": 6.515889166883863e-05, + "loss": 0.0, + "step": 45073 + }, + { + "epoch": 4.205841186899319, + "grad_norm": NaN, + "learning_rate": 6.515265411540515e-05, + "loss": 0.0, + "step": 45074 + }, + { + "epoch": 4.205934496594196, + "grad_norm": NaN, + "learning_rate": 6.514641677771339e-05, + "loss": 0.0, + "step": 45075 + }, + { + "epoch": 4.2060278062890735, + "grad_norm": NaN, + "learning_rate": 6.514017965577922e-05, + "loss": 0.0, + "step": 45076 + }, + { + "epoch": 4.206121115983951, + "grad_norm": NaN, + "learning_rate": 6.513394274961832e-05, + "loss": 0.0, + "step": 45077 + }, + { + "epoch": 4.206214425678828, + "grad_norm": NaN, + "learning_rate": 6.512770605924672e-05, + "loss": 0.0, + "step": 45078 + }, + { + "epoch": 4.206307735373706, + "grad_norm": NaN, + "learning_rate": 6.512146958468027e-05, + "loss": 0.0, + "step": 45079 + }, + { + "epoch": 4.206401045068582, + "grad_norm": NaN, + "learning_rate": 6.511523332593468e-05, + "loss": 0.0, + "step": 45080 + }, + { + "epoch": 4.20649435476346, + "grad_norm": NaN, + "learning_rate": 6.510899728302595e-05, + "loss": 0.0, + "step": 45081 + }, + { + "epoch": 4.206587664458337, + "grad_norm": NaN, + "learning_rate": 6.510276145596996e-05, + "loss": 0.0, + "step": 45082 + }, + { + "epoch": 4.2066809741532145, + "grad_norm": NaN, + "learning_rate": 6.509652584478239e-05, + "loss": 0.0, + "step": 45083 + }, + { + "epoch": 4.206774283848092, + "grad_norm": NaN, + "learning_rate": 6.509029044947925e-05, + "loss": 0.0, + "step": 45084 + }, + { + "epoch": 4.206867593542969, + "grad_norm": NaN, + "learning_rate": 6.508405527007641e-05, + "loss": 0.0, + "step": 45085 + }, + { + "epoch": 4.206960903237847, + "grad_norm": NaN, + "learning_rate": 6.507782030658956e-05, + "loss": 0.0, + "step": 45086 + }, + { + "epoch": 4.207054212932723, + "grad_norm": NaN, + "learning_rate": 6.50715855590347e-05, + "loss": 0.0, + "step": 45087 + }, + { + "epoch": 4.207147522627601, + "grad_norm": NaN, + "learning_rate": 6.506535102742765e-05, + "loss": 0.0, + "step": 45088 + }, + { + "epoch": 4.207240832322478, + "grad_norm": NaN, + "learning_rate": 6.505911671178424e-05, + "loss": 0.0, + "step": 45089 + }, + { + "epoch": 4.207334142017356, + "grad_norm": NaN, + "learning_rate": 6.505288261212034e-05, + "loss": 0.0, + "step": 45090 + }, + { + "epoch": 4.207427451712233, + "grad_norm": NaN, + "learning_rate": 6.50466487284518e-05, + "loss": 0.0, + "step": 45091 + }, + { + "epoch": 4.20752076140711, + "grad_norm": NaN, + "learning_rate": 6.504041506079448e-05, + "loss": 0.0, + "step": 45092 + }, + { + "epoch": 4.207614071101988, + "grad_norm": NaN, + "learning_rate": 6.503418160916419e-05, + "loss": 0.0, + "step": 45093 + }, + { + "epoch": 4.207707380796865, + "grad_norm": NaN, + "learning_rate": 6.502794837357683e-05, + "loss": 0.0, + "step": 45094 + }, + { + "epoch": 4.207800690491742, + "grad_norm": NaN, + "learning_rate": 6.502171535404821e-05, + "loss": 0.0, + "step": 45095 + }, + { + "epoch": 4.207894000186619, + "grad_norm": NaN, + "learning_rate": 6.501548255059421e-05, + "loss": 0.0, + "step": 45096 + }, + { + "epoch": 4.207987309881497, + "grad_norm": NaN, + "learning_rate": 6.500924996323066e-05, + "loss": 0.0, + "step": 45097 + }, + { + "epoch": 4.208080619576374, + "grad_norm": NaN, + "learning_rate": 6.500301759197341e-05, + "loss": 0.0, + "step": 45098 + }, + { + "epoch": 4.2081739292712514, + "grad_norm": NaN, + "learning_rate": 6.49967854368383e-05, + "loss": 0.0, + "step": 45099 + }, + { + "epoch": 4.208267238966129, + "grad_norm": NaN, + "learning_rate": 6.499055349784119e-05, + "loss": 0.0, + "step": 45100 + }, + { + "epoch": 4.208360548661006, + "grad_norm": NaN, + "learning_rate": 6.498432177499792e-05, + "loss": 0.0, + "step": 45101 + }, + { + "epoch": 4.208453858355883, + "grad_norm": NaN, + "learning_rate": 6.497809026832435e-05, + "loss": 0.0, + "step": 45102 + }, + { + "epoch": 4.20854716805076, + "grad_norm": NaN, + "learning_rate": 6.49718589778363e-05, + "loss": 0.0, + "step": 45103 + }, + { + "epoch": 4.208640477745638, + "grad_norm": NaN, + "learning_rate": 6.496562790354962e-05, + "loss": 0.0, + "step": 45104 + }, + { + "epoch": 4.208733787440515, + "grad_norm": NaN, + "learning_rate": 6.495939704548015e-05, + "loss": 0.0, + "step": 45105 + }, + { + "epoch": 4.2088270971353925, + "grad_norm": NaN, + "learning_rate": 6.495316640364377e-05, + "loss": 0.0, + "step": 45106 + }, + { + "epoch": 4.20892040683027, + "grad_norm": NaN, + "learning_rate": 6.494693597805629e-05, + "loss": 0.0, + "step": 45107 + }, + { + "epoch": 4.209013716525147, + "grad_norm": NaN, + "learning_rate": 6.494070576873355e-05, + "loss": 0.0, + "step": 45108 + }, + { + "epoch": 4.209107026220024, + "grad_norm": NaN, + "learning_rate": 6.493447577569139e-05, + "loss": 0.0, + "step": 45109 + }, + { + "epoch": 4.209200335914901, + "grad_norm": NaN, + "learning_rate": 6.492824599894567e-05, + "loss": 0.0, + "step": 45110 + }, + { + "epoch": 4.209293645609779, + "grad_norm": NaN, + "learning_rate": 6.492201643851223e-05, + "loss": 0.0, + "step": 45111 + }, + { + "epoch": 4.209386955304656, + "grad_norm": NaN, + "learning_rate": 6.491578709440687e-05, + "loss": 0.0, + "step": 45112 + }, + { + "epoch": 4.2094802649995335, + "grad_norm": NaN, + "learning_rate": 6.490955796664549e-05, + "loss": 0.0, + "step": 45113 + }, + { + "epoch": 4.209573574694411, + "grad_norm": NaN, + "learning_rate": 6.490332905524393e-05, + "loss": 0.0, + "step": 45114 + }, + { + "epoch": 4.209666884389288, + "grad_norm": NaN, + "learning_rate": 6.489710036021791e-05, + "loss": 0.0, + "step": 45115 + }, + { + "epoch": 4.209760194084165, + "grad_norm": NaN, + "learning_rate": 6.48908718815834e-05, + "loss": 0.0, + "step": 45116 + }, + { + "epoch": 4.209853503779042, + "grad_norm": NaN, + "learning_rate": 6.488464361935623e-05, + "loss": 0.0, + "step": 45117 + }, + { + "epoch": 4.20994681347392, + "grad_norm": NaN, + "learning_rate": 6.48784155735521e-05, + "loss": 0.0, + "step": 45118 + }, + { + "epoch": 4.210040123168797, + "grad_norm": NaN, + "learning_rate": 6.4872187744187e-05, + "loss": 0.0, + "step": 45119 + }, + { + "epoch": 4.210133432863675, + "grad_norm": NaN, + "learning_rate": 6.486596013127678e-05, + "loss": 0.0, + "step": 45120 + }, + { + "epoch": 4.210226742558552, + "grad_norm": NaN, + "learning_rate": 6.485973273483708e-05, + "loss": 0.0, + "step": 45121 + }, + { + "epoch": 4.210320052253429, + "grad_norm": NaN, + "learning_rate": 6.485350555488391e-05, + "loss": 0.0, + "step": 45122 + }, + { + "epoch": 4.210413361948307, + "grad_norm": NaN, + "learning_rate": 6.484727859143307e-05, + "loss": 0.0, + "step": 45123 + }, + { + "epoch": 4.210506671643183, + "grad_norm": NaN, + "learning_rate": 6.484105184450034e-05, + "loss": 0.0, + "step": 45124 + }, + { + "epoch": 4.210599981338061, + "grad_norm": NaN, + "learning_rate": 6.483482531410163e-05, + "loss": 0.0, + "step": 45125 + }, + { + "epoch": 4.210693291032938, + "grad_norm": NaN, + "learning_rate": 6.482859900025272e-05, + "loss": 0.0, + "step": 45126 + }, + { + "epoch": 4.210786600727816, + "grad_norm": NaN, + "learning_rate": 6.482237290296945e-05, + "loss": 0.0, + "step": 45127 + }, + { + "epoch": 4.210879910422693, + "grad_norm": NaN, + "learning_rate": 6.481614702226765e-05, + "loss": 0.0, + "step": 45128 + }, + { + "epoch": 4.2109732201175705, + "grad_norm": NaN, + "learning_rate": 6.480992135816315e-05, + "loss": 0.0, + "step": 45129 + }, + { + "epoch": 4.211066529812448, + "grad_norm": NaN, + "learning_rate": 6.480369591067179e-05, + "loss": 0.0, + "step": 45130 + }, + { + "epoch": 4.211159839507324, + "grad_norm": NaN, + "learning_rate": 6.479747067980941e-05, + "loss": 0.0, + "step": 45131 + }, + { + "epoch": 4.211253149202202, + "grad_norm": NaN, + "learning_rate": 6.479124566559182e-05, + "loss": 0.0, + "step": 45132 + }, + { + "epoch": 4.211346458897079, + "grad_norm": NaN, + "learning_rate": 6.478502086803482e-05, + "loss": 0.0, + "step": 45133 + }, + { + "epoch": 4.211439768591957, + "grad_norm": NaN, + "learning_rate": 6.47787962871543e-05, + "loss": 0.0, + "step": 45134 + }, + { + "epoch": 4.211533078286834, + "grad_norm": NaN, + "learning_rate": 6.477257192296605e-05, + "loss": 0.0, + "step": 45135 + }, + { + "epoch": 4.2116263879817115, + "grad_norm": NaN, + "learning_rate": 6.476634777548591e-05, + "loss": 0.0, + "step": 45136 + }, + { + "epoch": 4.211719697676589, + "grad_norm": NaN, + "learning_rate": 6.476012384472967e-05, + "loss": 0.0, + "step": 45137 + }, + { + "epoch": 4.2118130073714655, + "grad_norm": NaN, + "learning_rate": 6.47539001307132e-05, + "loss": 0.0, + "step": 45138 + }, + { + "epoch": 4.211906317066343, + "grad_norm": NaN, + "learning_rate": 6.474767663345233e-05, + "loss": 0.0, + "step": 45139 + }, + { + "epoch": 4.21199962676122, + "grad_norm": NaN, + "learning_rate": 6.474145335296285e-05, + "loss": 0.0, + "step": 45140 + }, + { + "epoch": 4.212092936456098, + "grad_norm": NaN, + "learning_rate": 6.47352302892606e-05, + "loss": 0.0, + "step": 45141 + }, + { + "epoch": 4.212186246150975, + "grad_norm": NaN, + "learning_rate": 6.47290074423614e-05, + "loss": 0.0, + "step": 45142 + }, + { + "epoch": 4.212279555845853, + "grad_norm": NaN, + "learning_rate": 6.472278481228109e-05, + "loss": 0.0, + "step": 45143 + }, + { + "epoch": 4.21237286554073, + "grad_norm": NaN, + "learning_rate": 6.471656239903546e-05, + "loss": 0.0, + "step": 45144 + }, + { + "epoch": 4.212466175235607, + "grad_norm": NaN, + "learning_rate": 6.471034020264036e-05, + "loss": 0.0, + "step": 45145 + }, + { + "epoch": 4.212559484930484, + "grad_norm": NaN, + "learning_rate": 6.470411822311157e-05, + "loss": 0.0, + "step": 45146 + }, + { + "epoch": 4.212652794625361, + "grad_norm": NaN, + "learning_rate": 6.469789646046498e-05, + "loss": 0.0, + "step": 45147 + }, + { + "epoch": 4.212746104320239, + "grad_norm": NaN, + "learning_rate": 6.469167491471635e-05, + "loss": 0.0, + "step": 45148 + }, + { + "epoch": 4.212839414015116, + "grad_norm": NaN, + "learning_rate": 6.468545358588152e-05, + "loss": 0.0, + "step": 45149 + }, + { + "epoch": 4.212932723709994, + "grad_norm": NaN, + "learning_rate": 6.467923247397632e-05, + "loss": 0.0, + "step": 45150 + }, + { + "epoch": 4.213026033404871, + "grad_norm": NaN, + "learning_rate": 6.467301157901655e-05, + "loss": 0.0, + "step": 45151 + }, + { + "epoch": 4.2131193430997484, + "grad_norm": NaN, + "learning_rate": 6.466679090101804e-05, + "loss": 0.0, + "step": 45152 + }, + { + "epoch": 4.213212652794625, + "grad_norm": NaN, + "learning_rate": 6.466057043999661e-05, + "loss": 0.0, + "step": 45153 + }, + { + "epoch": 4.213305962489502, + "grad_norm": NaN, + "learning_rate": 6.465435019596806e-05, + "loss": 0.0, + "step": 45154 + }, + { + "epoch": 4.21339927218438, + "grad_norm": NaN, + "learning_rate": 6.46481301689482e-05, + "loss": 0.0, + "step": 45155 + }, + { + "epoch": 4.213492581879257, + "grad_norm": NaN, + "learning_rate": 6.46419103589529e-05, + "loss": 0.0, + "step": 45156 + }, + { + "epoch": 4.213585891574135, + "grad_norm": NaN, + "learning_rate": 6.46356907659979e-05, + "loss": 0.0, + "step": 45157 + }, + { + "epoch": 4.213679201269012, + "grad_norm": NaN, + "learning_rate": 6.462947139009908e-05, + "loss": 0.0, + "step": 45158 + }, + { + "epoch": 4.2137725109638895, + "grad_norm": NaN, + "learning_rate": 6.462325223127221e-05, + "loss": 0.0, + "step": 45159 + }, + { + "epoch": 4.213865820658766, + "grad_norm": NaN, + "learning_rate": 6.461703328953313e-05, + "loss": 0.0, + "step": 45160 + }, + { + "epoch": 4.2139591303536434, + "grad_norm": NaN, + "learning_rate": 6.461081456489762e-05, + "loss": 0.0, + "step": 45161 + }, + { + "epoch": 4.214052440048521, + "grad_norm": NaN, + "learning_rate": 6.460459605738153e-05, + "loss": 0.0, + "step": 45162 + }, + { + "epoch": 4.214145749743398, + "grad_norm": NaN, + "learning_rate": 6.459837776700066e-05, + "loss": 0.0, + "step": 45163 + }, + { + "epoch": 4.214239059438276, + "grad_norm": NaN, + "learning_rate": 6.459215969377082e-05, + "loss": 0.0, + "step": 45164 + }, + { + "epoch": 4.214332369133153, + "grad_norm": NaN, + "learning_rate": 6.458594183770782e-05, + "loss": 0.0, + "step": 45165 + }, + { + "epoch": 4.2144256788280305, + "grad_norm": NaN, + "learning_rate": 6.457972419882746e-05, + "loss": 0.0, + "step": 45166 + }, + { + "epoch": 4.214518988522908, + "grad_norm": NaN, + "learning_rate": 6.457350677714555e-05, + "loss": 0.0, + "step": 45167 + }, + { + "epoch": 4.2146122982177845, + "grad_norm": NaN, + "learning_rate": 6.456728957267791e-05, + "loss": 0.0, + "step": 45168 + }, + { + "epoch": 4.214705607912662, + "grad_norm": NaN, + "learning_rate": 6.456107258544036e-05, + "loss": 0.0, + "step": 45169 + }, + { + "epoch": 4.214798917607539, + "grad_norm": NaN, + "learning_rate": 6.455485581544868e-05, + "loss": 0.0, + "step": 45170 + }, + { + "epoch": 4.214892227302417, + "grad_norm": NaN, + "learning_rate": 6.45486392627187e-05, + "loss": 0.0, + "step": 45171 + }, + { + "epoch": 4.214985536997294, + "grad_norm": NaN, + "learning_rate": 6.45424229272662e-05, + "loss": 0.0, + "step": 45172 + }, + { + "epoch": 4.215078846692172, + "grad_norm": NaN, + "learning_rate": 6.453620680910702e-05, + "loss": 0.0, + "step": 45173 + }, + { + "epoch": 4.215172156387049, + "grad_norm": NaN, + "learning_rate": 6.452999090825696e-05, + "loss": 0.0, + "step": 45174 + }, + { + "epoch": 4.2152654660819255, + "grad_norm": NaN, + "learning_rate": 6.45237752247318e-05, + "loss": 0.0, + "step": 45175 + }, + { + "epoch": 4.215358775776803, + "grad_norm": NaN, + "learning_rate": 6.451755975854735e-05, + "loss": 0.0, + "step": 45176 + }, + { + "epoch": 4.21545208547168, + "grad_norm": NaN, + "learning_rate": 6.451134450971944e-05, + "loss": 0.0, + "step": 45177 + }, + { + "epoch": 4.215545395166558, + "grad_norm": NaN, + "learning_rate": 6.450512947826386e-05, + "loss": 0.0, + "step": 45178 + }, + { + "epoch": 4.215638704861435, + "grad_norm": NaN, + "learning_rate": 6.449891466419641e-05, + "loss": 0.0, + "step": 45179 + }, + { + "epoch": 4.215732014556313, + "grad_norm": NaN, + "learning_rate": 6.449270006753288e-05, + "loss": 0.0, + "step": 45180 + }, + { + "epoch": 4.21582532425119, + "grad_norm": NaN, + "learning_rate": 6.448648568828909e-05, + "loss": 0.0, + "step": 45181 + }, + { + "epoch": 4.215918633946067, + "grad_norm": NaN, + "learning_rate": 6.448027152648083e-05, + "loss": 0.0, + "step": 45182 + }, + { + "epoch": 4.216011943640944, + "grad_norm": NaN, + "learning_rate": 6.447405758212392e-05, + "loss": 0.0, + "step": 45183 + }, + { + "epoch": 4.216105253335821, + "grad_norm": NaN, + "learning_rate": 6.446784385523415e-05, + "loss": 0.0, + "step": 45184 + }, + { + "epoch": 4.216198563030699, + "grad_norm": NaN, + "learning_rate": 6.44616303458273e-05, + "loss": 0.0, + "step": 45185 + }, + { + "epoch": 4.216291872725576, + "grad_norm": NaN, + "learning_rate": 6.445541705391918e-05, + "loss": 0.0, + "step": 45186 + }, + { + "epoch": 4.216385182420454, + "grad_norm": NaN, + "learning_rate": 6.44492039795256e-05, + "loss": 0.0, + "step": 45187 + }, + { + "epoch": 4.216478492115331, + "grad_norm": NaN, + "learning_rate": 6.444299112266237e-05, + "loss": 0.0, + "step": 45188 + }, + { + "epoch": 4.216571801810208, + "grad_norm": NaN, + "learning_rate": 6.443677848334526e-05, + "loss": 0.0, + "step": 45189 + }, + { + "epoch": 4.216665111505085, + "grad_norm": NaN, + "learning_rate": 6.443056606159008e-05, + "loss": 0.0, + "step": 45190 + }, + { + "epoch": 4.2167584211999625, + "grad_norm": NaN, + "learning_rate": 6.44243538574126e-05, + "loss": 0.0, + "step": 45191 + }, + { + "epoch": 4.21685173089484, + "grad_norm": NaN, + "learning_rate": 6.441814187082866e-05, + "loss": 0.0, + "step": 45192 + }, + { + "epoch": 4.216945040589717, + "grad_norm": NaN, + "learning_rate": 6.441193010185403e-05, + "loss": 0.0, + "step": 45193 + }, + { + "epoch": 4.217038350284595, + "grad_norm": NaN, + "learning_rate": 6.440571855050452e-05, + "loss": 0.0, + "step": 45194 + }, + { + "epoch": 4.217131659979472, + "grad_norm": NaN, + "learning_rate": 6.43995072167959e-05, + "loss": 0.0, + "step": 45195 + }, + { + "epoch": 4.21722496967435, + "grad_norm": NaN, + "learning_rate": 6.439329610074397e-05, + "loss": 0.0, + "step": 45196 + }, + { + "epoch": 4.217318279369226, + "grad_norm": NaN, + "learning_rate": 6.438708520236454e-05, + "loss": 0.0, + "step": 45197 + }, + { + "epoch": 4.2174115890641035, + "grad_norm": NaN, + "learning_rate": 6.43808745216734e-05, + "loss": 0.0, + "step": 45198 + }, + { + "epoch": 4.217504898758981, + "grad_norm": NaN, + "learning_rate": 6.437466405868631e-05, + "loss": 0.0, + "step": 45199 + }, + { + "epoch": 4.217598208453858, + "grad_norm": NaN, + "learning_rate": 6.436845381341911e-05, + "loss": 0.0, + "step": 45200 + }, + { + "epoch": 4.217691518148736, + "grad_norm": NaN, + "learning_rate": 6.436224378588756e-05, + "loss": 0.0, + "step": 45201 + }, + { + "epoch": 4.217784827843613, + "grad_norm": NaN, + "learning_rate": 6.435603397610746e-05, + "loss": 0.0, + "step": 45202 + }, + { + "epoch": 4.217878137538491, + "grad_norm": NaN, + "learning_rate": 6.434982438409456e-05, + "loss": 0.0, + "step": 45203 + }, + { + "epoch": 4.217971447233367, + "grad_norm": NaN, + "learning_rate": 6.434361500986472e-05, + "loss": 0.0, + "step": 45204 + }, + { + "epoch": 4.218064756928245, + "grad_norm": NaN, + "learning_rate": 6.43374058534337e-05, + "loss": 0.0, + "step": 45205 + }, + { + "epoch": 4.218158066623122, + "grad_norm": NaN, + "learning_rate": 6.433119691481726e-05, + "loss": 0.0, + "step": 45206 + }, + { + "epoch": 4.218251376317999, + "grad_norm": NaN, + "learning_rate": 6.432498819403122e-05, + "loss": 0.0, + "step": 45207 + }, + { + "epoch": 4.218344686012877, + "grad_norm": NaN, + "learning_rate": 6.431877969109134e-05, + "loss": 0.0, + "step": 45208 + }, + { + "epoch": 4.218437995707754, + "grad_norm": NaN, + "learning_rate": 6.431257140601344e-05, + "loss": 0.0, + "step": 45209 + }, + { + "epoch": 4.218531305402632, + "grad_norm": NaN, + "learning_rate": 6.43063633388133e-05, + "loss": 0.0, + "step": 45210 + }, + { + "epoch": 4.218624615097509, + "grad_norm": NaN, + "learning_rate": 6.430015548950668e-05, + "loss": 0.0, + "step": 45211 + }, + { + "epoch": 4.218717924792386, + "grad_norm": NaN, + "learning_rate": 6.429394785810936e-05, + "loss": 0.0, + "step": 45212 + }, + { + "epoch": 4.218811234487263, + "grad_norm": NaN, + "learning_rate": 6.428774044463715e-05, + "loss": 0.0, + "step": 45213 + }, + { + "epoch": 4.2189045441821404, + "grad_norm": NaN, + "learning_rate": 6.428153324910586e-05, + "loss": 0.0, + "step": 45214 + }, + { + "epoch": 4.218997853877018, + "grad_norm": NaN, + "learning_rate": 6.42753262715312e-05, + "loss": 0.0, + "step": 45215 + }, + { + "epoch": 4.219091163571895, + "grad_norm": NaN, + "learning_rate": 6.4269119511929e-05, + "loss": 0.0, + "step": 45216 + }, + { + "epoch": 4.219184473266773, + "grad_norm": NaN, + "learning_rate": 6.426291297031506e-05, + "loss": 0.0, + "step": 45217 + }, + { + "epoch": 4.21927778296165, + "grad_norm": NaN, + "learning_rate": 6.42567066467051e-05, + "loss": 0.0, + "step": 45218 + }, + { + "epoch": 4.219371092656527, + "grad_norm": NaN, + "learning_rate": 6.425050054111495e-05, + "loss": 0.0, + "step": 45219 + }, + { + "epoch": 4.219464402351404, + "grad_norm": NaN, + "learning_rate": 6.424429465356038e-05, + "loss": 0.0, + "step": 45220 + }, + { + "epoch": 4.2195577120462815, + "grad_norm": NaN, + "learning_rate": 6.423808898405717e-05, + "loss": 0.0, + "step": 45221 + }, + { + "epoch": 4.219651021741159, + "grad_norm": NaN, + "learning_rate": 6.423188353262109e-05, + "loss": 0.0, + "step": 45222 + }, + { + "epoch": 4.219744331436036, + "grad_norm": NaN, + "learning_rate": 6.422567829926793e-05, + "loss": 0.0, + "step": 45223 + }, + { + "epoch": 4.219837641130914, + "grad_norm": NaN, + "learning_rate": 6.421947328401344e-05, + "loss": 0.0, + "step": 45224 + }, + { + "epoch": 4.219930950825791, + "grad_norm": NaN, + "learning_rate": 6.421326848687344e-05, + "loss": 0.0, + "step": 45225 + }, + { + "epoch": 4.220024260520668, + "grad_norm": NaN, + "learning_rate": 6.420706390786368e-05, + "loss": 0.0, + "step": 45226 + }, + { + "epoch": 4.220117570215545, + "grad_norm": NaN, + "learning_rate": 6.420085954699994e-05, + "loss": 0.0, + "step": 45227 + }, + { + "epoch": 4.2202108799104225, + "grad_norm": NaN, + "learning_rate": 6.419465540429802e-05, + "loss": 0.0, + "step": 45228 + }, + { + "epoch": 4.2203041896053, + "grad_norm": NaN, + "learning_rate": 6.418845147977364e-05, + "loss": 0.0, + "step": 45229 + }, + { + "epoch": 4.220397499300177, + "grad_norm": NaN, + "learning_rate": 6.418224777344264e-05, + "loss": 0.0, + "step": 45230 + }, + { + "epoch": 4.220490808995055, + "grad_norm": NaN, + "learning_rate": 6.417604428532073e-05, + "loss": 0.0, + "step": 45231 + }, + { + "epoch": 4.220584118689932, + "grad_norm": NaN, + "learning_rate": 6.416984101542374e-05, + "loss": 0.0, + "step": 45232 + }, + { + "epoch": 4.220677428384809, + "grad_norm": NaN, + "learning_rate": 6.41636379637674e-05, + "loss": 0.0, + "step": 45233 + }, + { + "epoch": 4.220770738079686, + "grad_norm": NaN, + "learning_rate": 6.415743513036752e-05, + "loss": 0.0, + "step": 45234 + }, + { + "epoch": 4.220864047774564, + "grad_norm": NaN, + "learning_rate": 6.415123251523984e-05, + "loss": 0.0, + "step": 45235 + }, + { + "epoch": 4.220957357469441, + "grad_norm": NaN, + "learning_rate": 6.414503011840017e-05, + "loss": 0.0, + "step": 45236 + }, + { + "epoch": 4.221050667164318, + "grad_norm": NaN, + "learning_rate": 6.413882793986422e-05, + "loss": 0.0, + "step": 45237 + }, + { + "epoch": 4.221143976859196, + "grad_norm": NaN, + "learning_rate": 6.413262597964784e-05, + "loss": 0.0, + "step": 45238 + }, + { + "epoch": 4.221237286554073, + "grad_norm": NaN, + "learning_rate": 6.412642423776673e-05, + "loss": 0.0, + "step": 45239 + }, + { + "epoch": 4.221330596248951, + "grad_norm": NaN, + "learning_rate": 6.412022271423668e-05, + "loss": 0.0, + "step": 45240 + }, + { + "epoch": 4.221423905943827, + "grad_norm": NaN, + "learning_rate": 6.411402140907348e-05, + "loss": 0.0, + "step": 45241 + }, + { + "epoch": 4.221517215638705, + "grad_norm": NaN, + "learning_rate": 6.410782032229287e-05, + "loss": 0.0, + "step": 45242 + }, + { + "epoch": 4.221610525333582, + "grad_norm": NaN, + "learning_rate": 6.410161945391064e-05, + "loss": 0.0, + "step": 45243 + }, + { + "epoch": 4.2217038350284595, + "grad_norm": NaN, + "learning_rate": 6.409541880394254e-05, + "loss": 0.0, + "step": 45244 + }, + { + "epoch": 4.221797144723337, + "grad_norm": NaN, + "learning_rate": 6.408921837240435e-05, + "loss": 0.0, + "step": 45245 + }, + { + "epoch": 4.221890454418214, + "grad_norm": NaN, + "learning_rate": 6.408301815931183e-05, + "loss": 0.0, + "step": 45246 + }, + { + "epoch": 4.221983764113092, + "grad_norm": NaN, + "learning_rate": 6.407681816468073e-05, + "loss": 0.0, + "step": 45247 + }, + { + "epoch": 4.222077073807968, + "grad_norm": NaN, + "learning_rate": 6.407061838852683e-05, + "loss": 0.0, + "step": 45248 + }, + { + "epoch": 4.222170383502846, + "grad_norm": NaN, + "learning_rate": 6.406441883086589e-05, + "loss": 0.0, + "step": 45249 + }, + { + "epoch": 4.222263693197723, + "grad_norm": NaN, + "learning_rate": 6.40582194917137e-05, + "loss": 0.0, + "step": 45250 + }, + { + "epoch": 4.2223570028926005, + "grad_norm": NaN, + "learning_rate": 6.405202037108598e-05, + "loss": 0.0, + "step": 45251 + }, + { + "epoch": 4.222450312587478, + "grad_norm": NaN, + "learning_rate": 6.404582146899852e-05, + "loss": 0.0, + "step": 45252 + }, + { + "epoch": 4.222543622282355, + "grad_norm": NaN, + "learning_rate": 6.403962278546709e-05, + "loss": 0.0, + "step": 45253 + }, + { + "epoch": 4.222636931977233, + "grad_norm": NaN, + "learning_rate": 6.403342432050742e-05, + "loss": 0.0, + "step": 45254 + }, + { + "epoch": 4.222730241672109, + "grad_norm": NaN, + "learning_rate": 6.402722607413529e-05, + "loss": 0.0, + "step": 45255 + }, + { + "epoch": 4.222823551366987, + "grad_norm": NaN, + "learning_rate": 6.402102804636645e-05, + "loss": 0.0, + "step": 45256 + }, + { + "epoch": 4.222916861061864, + "grad_norm": NaN, + "learning_rate": 6.401483023721668e-05, + "loss": 0.0, + "step": 45257 + }, + { + "epoch": 4.223010170756742, + "grad_norm": NaN, + "learning_rate": 6.40086326467017e-05, + "loss": 0.0, + "step": 45258 + }, + { + "epoch": 4.223103480451619, + "grad_norm": NaN, + "learning_rate": 6.400243527483732e-05, + "loss": 0.0, + "step": 45259 + }, + { + "epoch": 4.223196790146496, + "grad_norm": NaN, + "learning_rate": 6.399623812163927e-05, + "loss": 0.0, + "step": 45260 + }, + { + "epoch": 4.223290099841374, + "grad_norm": NaN, + "learning_rate": 6.399004118712331e-05, + "loss": 0.0, + "step": 45261 + }, + { + "epoch": 4.223383409536251, + "grad_norm": NaN, + "learning_rate": 6.398384447130518e-05, + "loss": 0.0, + "step": 45262 + }, + { + "epoch": 4.223476719231128, + "grad_norm": NaN, + "learning_rate": 6.397764797420066e-05, + "loss": 0.0, + "step": 45263 + }, + { + "epoch": 4.223570028926005, + "grad_norm": NaN, + "learning_rate": 6.39714516958255e-05, + "loss": 0.0, + "step": 45264 + }, + { + "epoch": 4.223663338620883, + "grad_norm": NaN, + "learning_rate": 6.396525563619547e-05, + "loss": 0.0, + "step": 45265 + }, + { + "epoch": 4.22375664831576, + "grad_norm": NaN, + "learning_rate": 6.39590597953263e-05, + "loss": 0.0, + "step": 45266 + }, + { + "epoch": 4.2238499580106375, + "grad_norm": NaN, + "learning_rate": 6.395286417323375e-05, + "loss": 0.0, + "step": 45267 + }, + { + "epoch": 4.223943267705515, + "grad_norm": NaN, + "learning_rate": 6.394666876993358e-05, + "loss": 0.0, + "step": 45268 + }, + { + "epoch": 4.224036577400392, + "grad_norm": NaN, + "learning_rate": 6.394047358544155e-05, + "loss": 0.0, + "step": 45269 + }, + { + "epoch": 4.224129887095269, + "grad_norm": NaN, + "learning_rate": 6.39342786197734e-05, + "loss": 0.0, + "step": 45270 + }, + { + "epoch": 4.224223196790146, + "grad_norm": NaN, + "learning_rate": 6.392808387294487e-05, + "loss": 0.0, + "step": 45271 + }, + { + "epoch": 4.224316506485024, + "grad_norm": NaN, + "learning_rate": 6.392188934497174e-05, + "loss": 0.0, + "step": 45272 + }, + { + "epoch": 4.224409816179901, + "grad_norm": NaN, + "learning_rate": 6.391569503586975e-05, + "loss": 0.0, + "step": 45273 + }, + { + "epoch": 4.2245031258747785, + "grad_norm": NaN, + "learning_rate": 6.390950094565463e-05, + "loss": 0.0, + "step": 45274 + }, + { + "epoch": 4.224596435569656, + "grad_norm": NaN, + "learning_rate": 6.390330707434215e-05, + "loss": 0.0, + "step": 45275 + }, + { + "epoch": 4.224689745264533, + "grad_norm": NaN, + "learning_rate": 6.389711342194806e-05, + "loss": 0.0, + "step": 45276 + }, + { + "epoch": 4.22478305495941, + "grad_norm": NaN, + "learning_rate": 6.389091998848813e-05, + "loss": 0.0, + "step": 45277 + }, + { + "epoch": 4.224876364654287, + "grad_norm": NaN, + "learning_rate": 6.388472677397805e-05, + "loss": 0.0, + "step": 45278 + }, + { + "epoch": 4.224969674349165, + "grad_norm": NaN, + "learning_rate": 6.38785337784336e-05, + "loss": 0.0, + "step": 45279 + }, + { + "epoch": 4.225062984044042, + "grad_norm": NaN, + "learning_rate": 6.387234100187055e-05, + "loss": 0.0, + "step": 45280 + }, + { + "epoch": 4.2251562937389195, + "grad_norm": NaN, + "learning_rate": 6.38661484443046e-05, + "loss": 0.0, + "step": 45281 + }, + { + "epoch": 4.225249603433797, + "grad_norm": NaN, + "learning_rate": 6.385995610575155e-05, + "loss": 0.0, + "step": 45282 + }, + { + "epoch": 4.225342913128674, + "grad_norm": NaN, + "learning_rate": 6.385376398622709e-05, + "loss": 0.0, + "step": 45283 + }, + { + "epoch": 4.225436222823552, + "grad_norm": NaN, + "learning_rate": 6.384757208574701e-05, + "loss": 0.0, + "step": 45284 + }, + { + "epoch": 4.225529532518428, + "grad_norm": NaN, + "learning_rate": 6.384138040432702e-05, + "loss": 0.0, + "step": 45285 + }, + { + "epoch": 4.225622842213306, + "grad_norm": NaN, + "learning_rate": 6.383518894198288e-05, + "loss": 0.0, + "step": 45286 + }, + { + "epoch": 4.225716151908183, + "grad_norm": NaN, + "learning_rate": 6.382899769873033e-05, + "loss": 0.0, + "step": 45287 + }, + { + "epoch": 4.225809461603061, + "grad_norm": NaN, + "learning_rate": 6.382280667458512e-05, + "loss": 0.0, + "step": 45288 + }, + { + "epoch": 4.225902771297938, + "grad_norm": NaN, + "learning_rate": 6.381661586956296e-05, + "loss": 0.0, + "step": 45289 + }, + { + "epoch": 4.225996080992815, + "grad_norm": NaN, + "learning_rate": 6.381042528367965e-05, + "loss": 0.0, + "step": 45290 + }, + { + "epoch": 4.226089390687693, + "grad_norm": NaN, + "learning_rate": 6.380423491695088e-05, + "loss": 0.0, + "step": 45291 + }, + { + "epoch": 4.226182700382569, + "grad_norm": NaN, + "learning_rate": 6.37980447693924e-05, + "loss": 0.0, + "step": 45292 + }, + { + "epoch": 4.226276010077447, + "grad_norm": NaN, + "learning_rate": 6.379185484101998e-05, + "loss": 0.0, + "step": 45293 + }, + { + "epoch": 4.226369319772324, + "grad_norm": NaN, + "learning_rate": 6.378566513184932e-05, + "loss": 0.0, + "step": 45294 + }, + { + "epoch": 4.226462629467202, + "grad_norm": NaN, + "learning_rate": 6.377947564189618e-05, + "loss": 0.0, + "step": 45295 + }, + { + "epoch": 4.226555939162079, + "grad_norm": NaN, + "learning_rate": 6.377328637117628e-05, + "loss": 0.0, + "step": 45296 + }, + { + "epoch": 4.2266492488569565, + "grad_norm": NaN, + "learning_rate": 6.37670973197054e-05, + "loss": 0.0, + "step": 45297 + }, + { + "epoch": 4.226742558551834, + "grad_norm": NaN, + "learning_rate": 6.376090848749923e-05, + "loss": 0.0, + "step": 45298 + }, + { + "epoch": 4.22683586824671, + "grad_norm": NaN, + "learning_rate": 6.375471987457351e-05, + "loss": 0.0, + "step": 45299 + }, + { + "epoch": 4.226929177941588, + "grad_norm": NaN, + "learning_rate": 6.374853148094401e-05, + "loss": 0.0, + "step": 45300 + }, + { + "epoch": 4.227022487636465, + "grad_norm": NaN, + "learning_rate": 6.374234330662644e-05, + "loss": 0.0, + "step": 45301 + }, + { + "epoch": 4.227115797331343, + "grad_norm": NaN, + "learning_rate": 6.373615535163653e-05, + "loss": 0.0, + "step": 45302 + }, + { + "epoch": 4.22720910702622, + "grad_norm": NaN, + "learning_rate": 6.372996761599003e-05, + "loss": 0.0, + "step": 45303 + }, + { + "epoch": 4.2273024167210975, + "grad_norm": NaN, + "learning_rate": 6.372378009970268e-05, + "loss": 0.0, + "step": 45304 + }, + { + "epoch": 4.227395726415975, + "grad_norm": NaN, + "learning_rate": 6.371759280279019e-05, + "loss": 0.0, + "step": 45305 + }, + { + "epoch": 4.2274890361108515, + "grad_norm": NaN, + "learning_rate": 6.37114057252683e-05, + "loss": 0.0, + "step": 45306 + }, + { + "epoch": 4.227582345805729, + "grad_norm": NaN, + "learning_rate": 6.370521886715274e-05, + "loss": 0.0, + "step": 45307 + }, + { + "epoch": 4.227675655500606, + "grad_norm": NaN, + "learning_rate": 6.369903222845926e-05, + "loss": 0.0, + "step": 45308 + }, + { + "epoch": 4.227768965195484, + "grad_norm": NaN, + "learning_rate": 6.369284580920356e-05, + "loss": 0.0, + "step": 45309 + }, + { + "epoch": 4.227862274890361, + "grad_norm": NaN, + "learning_rate": 6.36866596094014e-05, + "loss": 0.0, + "step": 45310 + }, + { + "epoch": 4.227955584585239, + "grad_norm": NaN, + "learning_rate": 6.368047362906849e-05, + "loss": 0.0, + "step": 45311 + }, + { + "epoch": 4.228048894280116, + "grad_norm": NaN, + "learning_rate": 6.367428786822057e-05, + "loss": 0.0, + "step": 45312 + }, + { + "epoch": 4.228142203974993, + "grad_norm": NaN, + "learning_rate": 6.366810232687338e-05, + "loss": 0.0, + "step": 45313 + }, + { + "epoch": 4.22823551366987, + "grad_norm": NaN, + "learning_rate": 6.36619170050426e-05, + "loss": 0.0, + "step": 45314 + }, + { + "epoch": 4.228328823364747, + "grad_norm": NaN, + "learning_rate": 6.3655731902744e-05, + "loss": 0.0, + "step": 45315 + }, + { + "epoch": 4.228422133059625, + "grad_norm": NaN, + "learning_rate": 6.364954701999331e-05, + "loss": 0.0, + "step": 45316 + }, + { + "epoch": 4.228515442754502, + "grad_norm": NaN, + "learning_rate": 6.364336235680622e-05, + "loss": 0.0, + "step": 45317 + }, + { + "epoch": 4.22860875244938, + "grad_norm": NaN, + "learning_rate": 6.36371779131985e-05, + "loss": 0.0, + "step": 45318 + }, + { + "epoch": 4.228702062144257, + "grad_norm": NaN, + "learning_rate": 6.363099368918585e-05, + "loss": 0.0, + "step": 45319 + }, + { + "epoch": 4.2287953718391345, + "grad_norm": NaN, + "learning_rate": 6.3624809684784e-05, + "loss": 0.0, + "step": 45320 + }, + { + "epoch": 4.228888681534011, + "grad_norm": NaN, + "learning_rate": 6.361862590000867e-05, + "loss": 0.0, + "step": 45321 + }, + { + "epoch": 4.228981991228888, + "grad_norm": NaN, + "learning_rate": 6.36124423348756e-05, + "loss": 0.0, + "step": 45322 + }, + { + "epoch": 4.229075300923766, + "grad_norm": NaN, + "learning_rate": 6.360625898940047e-05, + "loss": 0.0, + "step": 45323 + }, + { + "epoch": 4.229168610618643, + "grad_norm": NaN, + "learning_rate": 6.360007586359905e-05, + "loss": 0.0, + "step": 45324 + }, + { + "epoch": 4.229261920313521, + "grad_norm": NaN, + "learning_rate": 6.359389295748705e-05, + "loss": 0.0, + "step": 45325 + }, + { + "epoch": 4.229355230008398, + "grad_norm": NaN, + "learning_rate": 6.358771027108016e-05, + "loss": 0.0, + "step": 45326 + }, + { + "epoch": 4.2294485397032755, + "grad_norm": NaN, + "learning_rate": 6.358152780439414e-05, + "loss": 0.0, + "step": 45327 + }, + { + "epoch": 4.229541849398153, + "grad_norm": NaN, + "learning_rate": 6.357534555744469e-05, + "loss": 0.0, + "step": 45328 + }, + { + "epoch": 4.2296351590930295, + "grad_norm": NaN, + "learning_rate": 6.356916353024756e-05, + "loss": 0.0, + "step": 45329 + }, + { + "epoch": 4.229728468787907, + "grad_norm": NaN, + "learning_rate": 6.356298172281842e-05, + "loss": 0.0, + "step": 45330 + }, + { + "epoch": 4.229821778482784, + "grad_norm": NaN, + "learning_rate": 6.355680013517302e-05, + "loss": 0.0, + "step": 45331 + }, + { + "epoch": 4.229915088177662, + "grad_norm": NaN, + "learning_rate": 6.355061876732706e-05, + "loss": 0.0, + "step": 45332 + }, + { + "epoch": 4.230008397872539, + "grad_norm": NaN, + "learning_rate": 6.354443761929628e-05, + "loss": 0.0, + "step": 45333 + }, + { + "epoch": 4.2301017075674165, + "grad_norm": NaN, + "learning_rate": 6.353825669109638e-05, + "loss": 0.0, + "step": 45334 + }, + { + "epoch": 4.230195017262294, + "grad_norm": NaN, + "learning_rate": 6.353207598274309e-05, + "loss": 0.0, + "step": 45335 + }, + { + "epoch": 4.2302883269571705, + "grad_norm": NaN, + "learning_rate": 6.352589549425211e-05, + "loss": 0.0, + "step": 45336 + }, + { + "epoch": 4.230381636652048, + "grad_norm": NaN, + "learning_rate": 6.351971522563917e-05, + "loss": 0.0, + "step": 45337 + }, + { + "epoch": 4.230474946346925, + "grad_norm": NaN, + "learning_rate": 6.351353517691996e-05, + "loss": 0.0, + "step": 45338 + }, + { + "epoch": 4.230568256041803, + "grad_norm": NaN, + "learning_rate": 6.350735534811023e-05, + "loss": 0.0, + "step": 45339 + }, + { + "epoch": 4.23066156573668, + "grad_norm": NaN, + "learning_rate": 6.350117573922566e-05, + "loss": 0.0, + "step": 45340 + }, + { + "epoch": 4.230754875431558, + "grad_norm": NaN, + "learning_rate": 6.349499635028198e-05, + "loss": 0.0, + "step": 45341 + }, + { + "epoch": 4.230848185126435, + "grad_norm": NaN, + "learning_rate": 6.348881718129492e-05, + "loss": 0.0, + "step": 45342 + }, + { + "epoch": 4.2309414948213115, + "grad_norm": NaN, + "learning_rate": 6.348263823228015e-05, + "loss": 0.0, + "step": 45343 + }, + { + "epoch": 4.231034804516189, + "grad_norm": NaN, + "learning_rate": 6.34764595032534e-05, + "loss": 0.0, + "step": 45344 + }, + { + "epoch": 4.231128114211066, + "grad_norm": NaN, + "learning_rate": 6.34702809942304e-05, + "loss": 0.0, + "step": 45345 + }, + { + "epoch": 4.231221423905944, + "grad_norm": NaN, + "learning_rate": 6.346410270522684e-05, + "loss": 0.0, + "step": 45346 + }, + { + "epoch": 4.231314733600821, + "grad_norm": NaN, + "learning_rate": 6.34579246362584e-05, + "loss": 0.0, + "step": 45347 + }, + { + "epoch": 4.231408043295699, + "grad_norm": NaN, + "learning_rate": 6.345174678734086e-05, + "loss": 0.0, + "step": 45348 + }, + { + "epoch": 4.231501352990576, + "grad_norm": NaN, + "learning_rate": 6.344556915848987e-05, + "loss": 0.0, + "step": 45349 + }, + { + "epoch": 4.231594662685453, + "grad_norm": NaN, + "learning_rate": 6.343939174972118e-05, + "loss": 0.0, + "step": 45350 + }, + { + "epoch": 4.23168797238033, + "grad_norm": NaN, + "learning_rate": 6.343321456105046e-05, + "loss": 0.0, + "step": 45351 + }, + { + "epoch": 4.231781282075207, + "grad_norm": NaN, + "learning_rate": 6.342703759249344e-05, + "loss": 0.0, + "step": 45352 + }, + { + "epoch": 4.231874591770085, + "grad_norm": NaN, + "learning_rate": 6.34208608440658e-05, + "loss": 0.0, + "step": 45353 + }, + { + "epoch": 4.231967901464962, + "grad_norm": NaN, + "learning_rate": 6.341468431578329e-05, + "loss": 0.0, + "step": 45354 + }, + { + "epoch": 4.23206121115984, + "grad_norm": NaN, + "learning_rate": 6.340850800766159e-05, + "loss": 0.0, + "step": 45355 + }, + { + "epoch": 4.232154520854717, + "grad_norm": NaN, + "learning_rate": 6.340233191971638e-05, + "loss": 0.0, + "step": 45356 + }, + { + "epoch": 4.2322478305495945, + "grad_norm": NaN, + "learning_rate": 6.33961560519634e-05, + "loss": 0.0, + "step": 45357 + }, + { + "epoch": 4.232341140244471, + "grad_norm": NaN, + "learning_rate": 6.338998040441833e-05, + "loss": 0.0, + "step": 45358 + }, + { + "epoch": 4.2324344499393485, + "grad_norm": NaN, + "learning_rate": 6.33838049770969e-05, + "loss": 0.0, + "step": 45359 + }, + { + "epoch": 4.232527759634226, + "grad_norm": NaN, + "learning_rate": 6.33776297700148e-05, + "loss": 0.0, + "step": 45360 + }, + { + "epoch": 4.232621069329103, + "grad_norm": NaN, + "learning_rate": 6.337145478318771e-05, + "loss": 0.0, + "step": 45361 + }, + { + "epoch": 4.232714379023981, + "grad_norm": NaN, + "learning_rate": 6.336528001663136e-05, + "loss": 0.0, + "step": 45362 + }, + { + "epoch": 4.232807688718858, + "grad_norm": NaN, + "learning_rate": 6.335910547036144e-05, + "loss": 0.0, + "step": 45363 + }, + { + "epoch": 4.232900998413736, + "grad_norm": NaN, + "learning_rate": 6.335293114439364e-05, + "loss": 0.0, + "step": 45364 + }, + { + "epoch": 4.232994308108612, + "grad_norm": NaN, + "learning_rate": 6.334675703874368e-05, + "loss": 0.0, + "step": 45365 + }, + { + "epoch": 4.2330876178034895, + "grad_norm": NaN, + "learning_rate": 6.334058315342724e-05, + "loss": 0.0, + "step": 45366 + }, + { + "epoch": 4.233180927498367, + "grad_norm": NaN, + "learning_rate": 6.333440948846003e-05, + "loss": 0.0, + "step": 45367 + }, + { + "epoch": 4.233274237193244, + "grad_norm": NaN, + "learning_rate": 6.332823604385776e-05, + "loss": 0.0, + "step": 45368 + }, + { + "epoch": 4.233367546888122, + "grad_norm": NaN, + "learning_rate": 6.332206281963609e-05, + "loss": 0.0, + "step": 45369 + }, + { + "epoch": 4.233460856582999, + "grad_norm": NaN, + "learning_rate": 6.331588981581076e-05, + "loss": 0.0, + "step": 45370 + }, + { + "epoch": 4.233554166277877, + "grad_norm": NaN, + "learning_rate": 6.330971703239742e-05, + "loss": 0.0, + "step": 45371 + }, + { + "epoch": 4.233647475972753, + "grad_norm": NaN, + "learning_rate": 6.330354446941179e-05, + "loss": 0.0, + "step": 45372 + }, + { + "epoch": 4.233740785667631, + "grad_norm": NaN, + "learning_rate": 6.329737212686959e-05, + "loss": 0.0, + "step": 45373 + }, + { + "epoch": 4.233834095362508, + "grad_norm": NaN, + "learning_rate": 6.329120000478647e-05, + "loss": 0.0, + "step": 45374 + }, + { + "epoch": 4.233927405057385, + "grad_norm": NaN, + "learning_rate": 6.328502810317815e-05, + "loss": 0.0, + "step": 45375 + }, + { + "epoch": 4.234020714752263, + "grad_norm": NaN, + "learning_rate": 6.32788564220603e-05, + "loss": 0.0, + "step": 45376 + }, + { + "epoch": 4.23411402444714, + "grad_norm": NaN, + "learning_rate": 6.327268496144866e-05, + "loss": 0.0, + "step": 45377 + }, + { + "epoch": 4.234207334142018, + "grad_norm": NaN, + "learning_rate": 6.326651372135888e-05, + "loss": 0.0, + "step": 45378 + }, + { + "epoch": 4.234300643836894, + "grad_norm": NaN, + "learning_rate": 6.326034270180666e-05, + "loss": 0.0, + "step": 45379 + }, + { + "epoch": 4.234393953531772, + "grad_norm": NaN, + "learning_rate": 6.325417190280769e-05, + "loss": 0.0, + "step": 45380 + }, + { + "epoch": 4.234487263226649, + "grad_norm": NaN, + "learning_rate": 6.324800132437762e-05, + "loss": 0.0, + "step": 45381 + }, + { + "epoch": 4.2345805729215265, + "grad_norm": NaN, + "learning_rate": 6.324183096653228e-05, + "loss": 0.0, + "step": 45382 + }, + { + "epoch": 4.234673882616404, + "grad_norm": NaN, + "learning_rate": 6.323566082928722e-05, + "loss": 0.0, + "step": 45383 + }, + { + "epoch": 4.234767192311281, + "grad_norm": NaN, + "learning_rate": 6.322949091265818e-05, + "loss": 0.0, + "step": 45384 + }, + { + "epoch": 4.234860502006159, + "grad_norm": NaN, + "learning_rate": 6.322332121666083e-05, + "loss": 0.0, + "step": 45385 + }, + { + "epoch": 4.234953811701036, + "grad_norm": NaN, + "learning_rate": 6.321715174131087e-05, + "loss": 0.0, + "step": 45386 + }, + { + "epoch": 4.235047121395913, + "grad_norm": NaN, + "learning_rate": 6.321098248662398e-05, + "loss": 0.0, + "step": 45387 + }, + { + "epoch": 4.23514043109079, + "grad_norm": NaN, + "learning_rate": 6.320481345261587e-05, + "loss": 0.0, + "step": 45388 + }, + { + "epoch": 4.2352337407856675, + "grad_norm": NaN, + "learning_rate": 6.319864463930218e-05, + "loss": 0.0, + "step": 45389 + }, + { + "epoch": 4.235327050480545, + "grad_norm": NaN, + "learning_rate": 6.319247604669866e-05, + "loss": 0.0, + "step": 45390 + }, + { + "epoch": 4.235420360175422, + "grad_norm": NaN, + "learning_rate": 6.318630767482094e-05, + "loss": 0.0, + "step": 45391 + }, + { + "epoch": 4.2355136698703, + "grad_norm": NaN, + "learning_rate": 6.318013952368472e-05, + "loss": 0.0, + "step": 45392 + }, + { + "epoch": 4.235606979565177, + "grad_norm": NaN, + "learning_rate": 6.31739715933057e-05, + "loss": 0.0, + "step": 45393 + }, + { + "epoch": 4.235700289260054, + "grad_norm": NaN, + "learning_rate": 6.316780388369953e-05, + "loss": 0.0, + "step": 45394 + }, + { + "epoch": 4.235793598954931, + "grad_norm": NaN, + "learning_rate": 6.316163639488193e-05, + "loss": 0.0, + "step": 45395 + }, + { + "epoch": 4.2358869086498085, + "grad_norm": NaN, + "learning_rate": 6.315546912686856e-05, + "loss": 0.0, + "step": 45396 + }, + { + "epoch": 4.235980218344686, + "grad_norm": NaN, + "learning_rate": 6.314930207967512e-05, + "loss": 0.0, + "step": 45397 + }, + { + "epoch": 4.236073528039563, + "grad_norm": NaN, + "learning_rate": 6.314313525331727e-05, + "loss": 0.0, + "step": 45398 + }, + { + "epoch": 4.236166837734441, + "grad_norm": NaN, + "learning_rate": 6.313696864781068e-05, + "loss": 0.0, + "step": 45399 + }, + { + "epoch": 4.236260147429318, + "grad_norm": NaN, + "learning_rate": 6.313080226317107e-05, + "loss": 0.0, + "step": 45400 + }, + { + "epoch": 4.236353457124196, + "grad_norm": NaN, + "learning_rate": 6.312463609941408e-05, + "loss": 0.0, + "step": 45401 + }, + { + "epoch": 4.236446766819072, + "grad_norm": NaN, + "learning_rate": 6.311847015655543e-05, + "loss": 0.0, + "step": 45402 + }, + { + "epoch": 4.23654007651395, + "grad_norm": NaN, + "learning_rate": 6.311230443461076e-05, + "loss": 0.0, + "step": 45403 + }, + { + "epoch": 4.236633386208827, + "grad_norm": NaN, + "learning_rate": 6.310613893359576e-05, + "loss": 0.0, + "step": 45404 + }, + { + "epoch": 4.236726695903704, + "grad_norm": NaN, + "learning_rate": 6.30999736535261e-05, + "loss": 0.0, + "step": 45405 + }, + { + "epoch": 4.236820005598582, + "grad_norm": NaN, + "learning_rate": 6.309380859441748e-05, + "loss": 0.0, + "step": 45406 + }, + { + "epoch": 4.236913315293459, + "grad_norm": NaN, + "learning_rate": 6.308764375628557e-05, + "loss": 0.0, + "step": 45407 + }, + { + "epoch": 4.237006624988337, + "grad_norm": NaN, + "learning_rate": 6.3081479139146e-05, + "loss": 0.0, + "step": 45408 + }, + { + "epoch": 4.237099934683213, + "grad_norm": NaN, + "learning_rate": 6.30753147430145e-05, + "loss": 0.0, + "step": 45409 + }, + { + "epoch": 4.237193244378091, + "grad_norm": NaN, + "learning_rate": 6.306915056790675e-05, + "loss": 0.0, + "step": 45410 + }, + { + "epoch": 4.237286554072968, + "grad_norm": NaN, + "learning_rate": 6.306298661383838e-05, + "loss": 0.0, + "step": 45411 + }, + { + "epoch": 4.2373798637678455, + "grad_norm": NaN, + "learning_rate": 6.305682288082509e-05, + "loss": 0.0, + "step": 45412 + }, + { + "epoch": 4.237473173462723, + "grad_norm": NaN, + "learning_rate": 6.305065936888252e-05, + "loss": 0.0, + "step": 45413 + }, + { + "epoch": 4.2375664831576, + "grad_norm": NaN, + "learning_rate": 6.30444960780264e-05, + "loss": 0.0, + "step": 45414 + }, + { + "epoch": 4.237659792852478, + "grad_norm": NaN, + "learning_rate": 6.303833300827234e-05, + "loss": 0.0, + "step": 45415 + }, + { + "epoch": 4.237753102547354, + "grad_norm": NaN, + "learning_rate": 6.3032170159636e-05, + "loss": 0.0, + "step": 45416 + }, + { + "epoch": 4.237846412242232, + "grad_norm": NaN, + "learning_rate": 6.30260075321332e-05, + "loss": 0.0, + "step": 45417 + }, + { + "epoch": 4.237939721937109, + "grad_norm": NaN, + "learning_rate": 6.301984512577943e-05, + "loss": 0.0, + "step": 45418 + }, + { + "epoch": 4.2380330316319865, + "grad_norm": NaN, + "learning_rate": 6.301368294059039e-05, + "loss": 0.0, + "step": 45419 + }, + { + "epoch": 4.238126341326864, + "grad_norm": NaN, + "learning_rate": 6.300752097658189e-05, + "loss": 0.0, + "step": 45420 + }, + { + "epoch": 4.238219651021741, + "grad_norm": NaN, + "learning_rate": 6.300135923376944e-05, + "loss": 0.0, + "step": 45421 + }, + { + "epoch": 4.238312960716619, + "grad_norm": NaN, + "learning_rate": 6.29951977121687e-05, + "loss": 0.0, + "step": 45422 + }, + { + "epoch": 4.238406270411495, + "grad_norm": NaN, + "learning_rate": 6.298903641179552e-05, + "loss": 0.0, + "step": 45423 + }, + { + "epoch": 4.238499580106373, + "grad_norm": NaN, + "learning_rate": 6.29828753326654e-05, + "loss": 0.0, + "step": 45424 + }, + { + "epoch": 4.23859288980125, + "grad_norm": NaN, + "learning_rate": 6.297671447479399e-05, + "loss": 0.0, + "step": 45425 + }, + { + "epoch": 4.238686199496128, + "grad_norm": NaN, + "learning_rate": 6.297055383819712e-05, + "loss": 0.0, + "step": 45426 + }, + { + "epoch": 4.238779509191005, + "grad_norm": NaN, + "learning_rate": 6.29643934228903e-05, + "loss": 0.0, + "step": 45427 + }, + { + "epoch": 4.238872818885882, + "grad_norm": NaN, + "learning_rate": 6.295823322888925e-05, + "loss": 0.0, + "step": 45428 + }, + { + "epoch": 4.23896612858076, + "grad_norm": NaN, + "learning_rate": 6.295207325620961e-05, + "loss": 0.0, + "step": 45429 + }, + { + "epoch": 4.239059438275637, + "grad_norm": NaN, + "learning_rate": 6.294591350486708e-05, + "loss": 0.0, + "step": 45430 + }, + { + "epoch": 4.239152747970514, + "grad_norm": NaN, + "learning_rate": 6.29397539748773e-05, + "loss": 0.0, + "step": 45431 + }, + { + "epoch": 4.239246057665391, + "grad_norm": NaN, + "learning_rate": 6.293359466625594e-05, + "loss": 0.0, + "step": 45432 + }, + { + "epoch": 4.239339367360269, + "grad_norm": NaN, + "learning_rate": 6.292743557901866e-05, + "loss": 0.0, + "step": 45433 + }, + { + "epoch": 4.239432677055146, + "grad_norm": NaN, + "learning_rate": 6.29212767131811e-05, + "loss": 0.0, + "step": 45434 + }, + { + "epoch": 4.2395259867500235, + "grad_norm": NaN, + "learning_rate": 6.291511806875897e-05, + "loss": 0.0, + "step": 45435 + }, + { + "epoch": 4.239619296444901, + "grad_norm": NaN, + "learning_rate": 6.290895964576787e-05, + "loss": 0.0, + "step": 45436 + }, + { + "epoch": 4.239712606139778, + "grad_norm": NaN, + "learning_rate": 6.290280144422349e-05, + "loss": 0.0, + "step": 45437 + }, + { + "epoch": 4.239805915834655, + "grad_norm": NaN, + "learning_rate": 6.289664346414151e-05, + "loss": 0.0, + "step": 45438 + }, + { + "epoch": 4.239899225529532, + "grad_norm": NaN, + "learning_rate": 6.289048570553753e-05, + "loss": 0.0, + "step": 45439 + }, + { + "epoch": 4.23999253522441, + "grad_norm": NaN, + "learning_rate": 6.288432816842728e-05, + "loss": 0.0, + "step": 45440 + }, + { + "epoch": 4.240085844919287, + "grad_norm": NaN, + "learning_rate": 6.287817085282634e-05, + "loss": 0.0, + "step": 45441 + }, + { + "epoch": 4.2401791546141645, + "grad_norm": NaN, + "learning_rate": 6.287201375875044e-05, + "loss": 0.0, + "step": 45442 + }, + { + "epoch": 4.240272464309042, + "grad_norm": NaN, + "learning_rate": 6.286585688621517e-05, + "loss": 0.0, + "step": 45443 + }, + { + "epoch": 4.240365774003919, + "grad_norm": NaN, + "learning_rate": 6.285970023523623e-05, + "loss": 0.0, + "step": 45444 + }, + { + "epoch": 4.240459083698797, + "grad_norm": NaN, + "learning_rate": 6.285354380582925e-05, + "loss": 0.0, + "step": 45445 + }, + { + "epoch": 4.240552393393673, + "grad_norm": NaN, + "learning_rate": 6.28473875980099e-05, + "loss": 0.0, + "step": 45446 + }, + { + "epoch": 4.240645703088551, + "grad_norm": NaN, + "learning_rate": 6.284123161179382e-05, + "loss": 0.0, + "step": 45447 + }, + { + "epoch": 4.240739012783428, + "grad_norm": NaN, + "learning_rate": 6.283507584719668e-05, + "loss": 0.0, + "step": 45448 + }, + { + "epoch": 4.2408323224783055, + "grad_norm": NaN, + "learning_rate": 6.282892030423411e-05, + "loss": 0.0, + "step": 45449 + }, + { + "epoch": 4.240925632173183, + "grad_norm": NaN, + "learning_rate": 6.282276498292178e-05, + "loss": 0.0, + "step": 45450 + }, + { + "epoch": 4.24101894186806, + "grad_norm": NaN, + "learning_rate": 6.281660988327534e-05, + "loss": 0.0, + "step": 45451 + }, + { + "epoch": 4.241112251562938, + "grad_norm": NaN, + "learning_rate": 6.281045500531044e-05, + "loss": 0.0, + "step": 45452 + }, + { + "epoch": 4.241205561257814, + "grad_norm": NaN, + "learning_rate": 6.28043003490427e-05, + "loss": 0.0, + "step": 45453 + }, + { + "epoch": 4.241298870952692, + "grad_norm": NaN, + "learning_rate": 6.279814591448776e-05, + "loss": 0.0, + "step": 45454 + }, + { + "epoch": 4.241392180647569, + "grad_norm": NaN, + "learning_rate": 6.27919917016614e-05, + "loss": 0.0, + "step": 45455 + }, + { + "epoch": 4.241485490342447, + "grad_norm": NaN, + "learning_rate": 6.278583771057913e-05, + "loss": 0.0, + "step": 45456 + }, + { + "epoch": 4.241578800037324, + "grad_norm": NaN, + "learning_rate": 6.277968394125658e-05, + "loss": 0.0, + "step": 45457 + }, + { + "epoch": 4.241672109732201, + "grad_norm": NaN, + "learning_rate": 6.277353039370959e-05, + "loss": 0.0, + "step": 45458 + }, + { + "epoch": 4.241765419427079, + "grad_norm": NaN, + "learning_rate": 6.276737706795358e-05, + "loss": 0.0, + "step": 45459 + }, + { + "epoch": 4.241858729121955, + "grad_norm": NaN, + "learning_rate": 6.276122396400426e-05, + "loss": 0.0, + "step": 45460 + }, + { + "epoch": 4.241952038816833, + "grad_norm": NaN, + "learning_rate": 6.275507108187741e-05, + "loss": 0.0, + "step": 45461 + }, + { + "epoch": 4.24204534851171, + "grad_norm": NaN, + "learning_rate": 6.274891842158851e-05, + "loss": 0.0, + "step": 45462 + }, + { + "epoch": 4.242138658206588, + "grad_norm": NaN, + "learning_rate": 6.274276598315321e-05, + "loss": 0.0, + "step": 45463 + }, + { + "epoch": 4.242231967901465, + "grad_norm": NaN, + "learning_rate": 6.273661376658732e-05, + "loss": 0.0, + "step": 45464 + }, + { + "epoch": 4.2423252775963425, + "grad_norm": NaN, + "learning_rate": 6.273046177190632e-05, + "loss": 0.0, + "step": 45465 + }, + { + "epoch": 4.24241858729122, + "grad_norm": NaN, + "learning_rate": 6.272430999912586e-05, + "loss": 0.0, + "step": 45466 + }, + { + "epoch": 4.242511896986096, + "grad_norm": NaN, + "learning_rate": 6.271815844826172e-05, + "loss": 0.0, + "step": 45467 + }, + { + "epoch": 4.242605206680974, + "grad_norm": NaN, + "learning_rate": 6.271200711932941e-05, + "loss": 0.0, + "step": 45468 + }, + { + "epoch": 4.242698516375851, + "grad_norm": NaN, + "learning_rate": 6.270585601234452e-05, + "loss": 0.0, + "step": 45469 + }, + { + "epoch": 4.242791826070729, + "grad_norm": NaN, + "learning_rate": 6.269970512732292e-05, + "loss": 0.0, + "step": 45470 + }, + { + "epoch": 4.242885135765606, + "grad_norm": NaN, + "learning_rate": 6.269355446428005e-05, + "loss": 0.0, + "step": 45471 + }, + { + "epoch": 4.2429784454604835, + "grad_norm": NaN, + "learning_rate": 6.268740402323156e-05, + "loss": 0.0, + "step": 45472 + }, + { + "epoch": 4.243071755155361, + "grad_norm": NaN, + "learning_rate": 6.268125380419324e-05, + "loss": 0.0, + "step": 45473 + }, + { + "epoch": 4.243165064850238, + "grad_norm": NaN, + "learning_rate": 6.267510380718055e-05, + "loss": 0.0, + "step": 45474 + }, + { + "epoch": 4.243258374545115, + "grad_norm": NaN, + "learning_rate": 6.266895403220923e-05, + "loss": 0.0, + "step": 45475 + }, + { + "epoch": 4.243351684239992, + "grad_norm": NaN, + "learning_rate": 6.266280447929488e-05, + "loss": 0.0, + "step": 45476 + }, + { + "epoch": 4.24344499393487, + "grad_norm": NaN, + "learning_rate": 6.265665514845315e-05, + "loss": 0.0, + "step": 45477 + }, + { + "epoch": 4.243538303629747, + "grad_norm": NaN, + "learning_rate": 6.265050603969968e-05, + "loss": 0.0, + "step": 45478 + }, + { + "epoch": 4.243631613324625, + "grad_norm": NaN, + "learning_rate": 6.264435715305009e-05, + "loss": 0.0, + "step": 45479 + }, + { + "epoch": 4.243724923019502, + "grad_norm": NaN, + "learning_rate": 6.263820848852002e-05, + "loss": 0.0, + "step": 45480 + }, + { + "epoch": 4.243818232714379, + "grad_norm": NaN, + "learning_rate": 6.263206004612513e-05, + "loss": 0.0, + "step": 45481 + }, + { + "epoch": 4.243911542409256, + "grad_norm": NaN, + "learning_rate": 6.262591182588101e-05, + "loss": 0.0, + "step": 45482 + }, + { + "epoch": 4.244004852104133, + "grad_norm": NaN, + "learning_rate": 6.261976382780331e-05, + "loss": 0.0, + "step": 45483 + }, + { + "epoch": 4.244098161799011, + "grad_norm": NaN, + "learning_rate": 6.261361605190767e-05, + "loss": 0.0, + "step": 45484 + }, + { + "epoch": 4.244191471493888, + "grad_norm": NaN, + "learning_rate": 6.260746849820973e-05, + "loss": 0.0, + "step": 45485 + }, + { + "epoch": 4.244284781188766, + "grad_norm": NaN, + "learning_rate": 6.26013211667251e-05, + "loss": 0.0, + "step": 45486 + }, + { + "epoch": 4.244378090883643, + "grad_norm": NaN, + "learning_rate": 6.259517405746941e-05, + "loss": 0.0, + "step": 45487 + }, + { + "epoch": 4.2444714005785205, + "grad_norm": NaN, + "learning_rate": 6.258902717045833e-05, + "loss": 0.0, + "step": 45488 + }, + { + "epoch": 4.244564710273397, + "grad_norm": NaN, + "learning_rate": 6.258288050570744e-05, + "loss": 0.0, + "step": 45489 + }, + { + "epoch": 4.244658019968274, + "grad_norm": NaN, + "learning_rate": 6.25767340632324e-05, + "loss": 0.0, + "step": 45490 + }, + { + "epoch": 4.244751329663152, + "grad_norm": NaN, + "learning_rate": 6.257058784304882e-05, + "loss": 0.0, + "step": 45491 + }, + { + "epoch": 4.244844639358029, + "grad_norm": NaN, + "learning_rate": 6.256444184517229e-05, + "loss": 0.0, + "step": 45492 + }, + { + "epoch": 4.244937949052907, + "grad_norm": NaN, + "learning_rate": 6.255829606961859e-05, + "loss": 0.0, + "step": 45493 + }, + { + "epoch": 4.245031258747784, + "grad_norm": NaN, + "learning_rate": 6.255215051640318e-05, + "loss": 0.0, + "step": 45494 + }, + { + "epoch": 4.2451245684426615, + "grad_norm": NaN, + "learning_rate": 6.25460051855417e-05, + "loss": 0.0, + "step": 45495 + }, + { + "epoch": 4.245217878137538, + "grad_norm": NaN, + "learning_rate": 6.253986007704994e-05, + "loss": 0.0, + "step": 45496 + }, + { + "epoch": 4.2453111878324155, + "grad_norm": NaN, + "learning_rate": 6.253371519094334e-05, + "loss": 0.0, + "step": 45497 + }, + { + "epoch": 4.245404497527293, + "grad_norm": NaN, + "learning_rate": 6.252757052723753e-05, + "loss": 0.0, + "step": 45498 + }, + { + "epoch": 4.24549780722217, + "grad_norm": NaN, + "learning_rate": 6.252142608594834e-05, + "loss": 0.0, + "step": 45499 + }, + { + "epoch": 4.245591116917048, + "grad_norm": NaN, + "learning_rate": 6.251528186709117e-05, + "loss": 0.0, + "step": 45500 + }, + { + "epoch": 4.245684426611925, + "grad_norm": NaN, + "learning_rate": 6.250913787068168e-05, + "loss": 0.0, + "step": 45501 + }, + { + "epoch": 4.2457777363068026, + "grad_norm": NaN, + "learning_rate": 6.250299409673563e-05, + "loss": 0.0, + "step": 45502 + }, + { + "epoch": 4.24587104600168, + "grad_norm": NaN, + "learning_rate": 6.24968505452685e-05, + "loss": 0.0, + "step": 45503 + }, + { + "epoch": 4.2459643556965565, + "grad_norm": NaN, + "learning_rate": 6.24907072162959e-05, + "loss": 0.0, + "step": 45504 + }, + { + "epoch": 4.246057665391434, + "grad_norm": NaN, + "learning_rate": 6.248456410983363e-05, + "loss": 0.0, + "step": 45505 + }, + { + "epoch": 4.246150975086311, + "grad_norm": NaN, + "learning_rate": 6.247842122589712e-05, + "loss": 0.0, + "step": 45506 + }, + { + "epoch": 4.246244284781189, + "grad_norm": NaN, + "learning_rate": 6.247227856450203e-05, + "loss": 0.0, + "step": 45507 + }, + { + "epoch": 4.246337594476066, + "grad_norm": NaN, + "learning_rate": 6.24661361256641e-05, + "loss": 0.0, + "step": 45508 + }, + { + "epoch": 4.246430904170944, + "grad_norm": NaN, + "learning_rate": 6.245999390939879e-05, + "loss": 0.0, + "step": 45509 + }, + { + "epoch": 4.246524213865821, + "grad_norm": NaN, + "learning_rate": 6.245385191572174e-05, + "loss": 0.0, + "step": 45510 + }, + { + "epoch": 4.2466175235606975, + "grad_norm": NaN, + "learning_rate": 6.244771014464873e-05, + "loss": 0.0, + "step": 45511 + }, + { + "epoch": 4.246710833255575, + "grad_norm": NaN, + "learning_rate": 6.24415685961952e-05, + "loss": 0.0, + "step": 45512 + }, + { + "epoch": 4.246804142950452, + "grad_norm": NaN, + "learning_rate": 6.243542727037676e-05, + "loss": 0.0, + "step": 45513 + }, + { + "epoch": 4.24689745264533, + "grad_norm": NaN, + "learning_rate": 6.24292861672092e-05, + "loss": 0.0, + "step": 45514 + }, + { + "epoch": 4.246990762340207, + "grad_norm": NaN, + "learning_rate": 6.242314528670798e-05, + "loss": 0.0, + "step": 45515 + }, + { + "epoch": 4.247084072035085, + "grad_norm": NaN, + "learning_rate": 6.24170046288887e-05, + "loss": 0.0, + "step": 45516 + }, + { + "epoch": 4.247177381729962, + "grad_norm": NaN, + "learning_rate": 6.241086419376715e-05, + "loss": 0.0, + "step": 45517 + }, + { + "epoch": 4.2472706914248395, + "grad_norm": NaN, + "learning_rate": 6.240472398135876e-05, + "loss": 0.0, + "step": 45518 + }, + { + "epoch": 4.247364001119716, + "grad_norm": NaN, + "learning_rate": 6.23985839916792e-05, + "loss": 0.0, + "step": 45519 + }, + { + "epoch": 4.247457310814593, + "grad_norm": NaN, + "learning_rate": 6.239244422474412e-05, + "loss": 0.0, + "step": 45520 + }, + { + "epoch": 4.247550620509471, + "grad_norm": NaN, + "learning_rate": 6.238630468056909e-05, + "loss": 0.0, + "step": 45521 + }, + { + "epoch": 4.247643930204348, + "grad_norm": NaN, + "learning_rate": 6.238016535916973e-05, + "loss": 0.0, + "step": 45522 + }, + { + "epoch": 4.247737239899226, + "grad_norm": NaN, + "learning_rate": 6.237402626056166e-05, + "loss": 0.0, + "step": 45523 + }, + { + "epoch": 4.247830549594103, + "grad_norm": NaN, + "learning_rate": 6.236788738476048e-05, + "loss": 0.0, + "step": 45524 + }, + { + "epoch": 4.2479238592889805, + "grad_norm": NaN, + "learning_rate": 6.23617487317818e-05, + "loss": 0.0, + "step": 45525 + }, + { + "epoch": 4.248017168983857, + "grad_norm": NaN, + "learning_rate": 6.235561030164122e-05, + "loss": 0.0, + "step": 45526 + }, + { + "epoch": 4.2481104786787345, + "grad_norm": NaN, + "learning_rate": 6.23494720943544e-05, + "loss": 0.0, + "step": 45527 + }, + { + "epoch": 4.248203788373612, + "grad_norm": NaN, + "learning_rate": 6.234333410993687e-05, + "loss": 0.0, + "step": 45528 + }, + { + "epoch": 4.248297098068489, + "grad_norm": NaN, + "learning_rate": 6.23371963484043e-05, + "loss": 0.0, + "step": 45529 + }, + { + "epoch": 4.248390407763367, + "grad_norm": NaN, + "learning_rate": 6.233105880977222e-05, + "loss": 0.0, + "step": 45530 + }, + { + "epoch": 4.248483717458244, + "grad_norm": NaN, + "learning_rate": 6.232492149405638e-05, + "loss": 0.0, + "step": 45531 + }, + { + "epoch": 4.248577027153122, + "grad_norm": NaN, + "learning_rate": 6.231878440127224e-05, + "loss": 0.0, + "step": 45532 + }, + { + "epoch": 4.248670336847998, + "grad_norm": NaN, + "learning_rate": 6.231264753143541e-05, + "loss": 0.0, + "step": 45533 + }, + { + "epoch": 4.2487636465428755, + "grad_norm": NaN, + "learning_rate": 6.230651088456166e-05, + "loss": 0.0, + "step": 45534 + }, + { + "epoch": 4.248856956237753, + "grad_norm": NaN, + "learning_rate": 6.23003744606664e-05, + "loss": 0.0, + "step": 45535 + }, + { + "epoch": 4.24895026593263, + "grad_norm": NaN, + "learning_rate": 6.229423825976527e-05, + "loss": 0.0, + "step": 45536 + }, + { + "epoch": 4.249043575627508, + "grad_norm": NaN, + "learning_rate": 6.228810228187402e-05, + "loss": 0.0, + "step": 45537 + }, + { + "epoch": 4.249136885322385, + "grad_norm": NaN, + "learning_rate": 6.228196652700808e-05, + "loss": 0.0, + "step": 45538 + }, + { + "epoch": 4.249230195017263, + "grad_norm": NaN, + "learning_rate": 6.227583099518308e-05, + "loss": 0.0, + "step": 45539 + }, + { + "epoch": 4.249323504712139, + "grad_norm": NaN, + "learning_rate": 6.226969568641475e-05, + "loss": 0.0, + "step": 45540 + }, + { + "epoch": 4.249416814407017, + "grad_norm": NaN, + "learning_rate": 6.226356060071855e-05, + "loss": 0.0, + "step": 45541 + }, + { + "epoch": 4.249510124101894, + "grad_norm": NaN, + "learning_rate": 6.225742573811006e-05, + "loss": 0.0, + "step": 45542 + }, + { + "epoch": 4.249603433796771, + "grad_norm": NaN, + "learning_rate": 6.225129109860506e-05, + "loss": 0.0, + "step": 45543 + }, + { + "epoch": 4.249696743491649, + "grad_norm": NaN, + "learning_rate": 6.224515668221898e-05, + "loss": 0.0, + "step": 45544 + }, + { + "epoch": 4.249790053186526, + "grad_norm": NaN, + "learning_rate": 6.223902248896742e-05, + "loss": 0.0, + "step": 45545 + }, + { + "epoch": 4.249883362881404, + "grad_norm": NaN, + "learning_rate": 6.223288851886613e-05, + "loss": 0.0, + "step": 45546 + }, + { + "epoch": 4.249976672576281, + "grad_norm": NaN, + "learning_rate": 6.222675477193057e-05, + "loss": 0.0, + "step": 45547 + }, + { + "epoch": 4.250069982271158, + "grad_norm": NaN, + "learning_rate": 6.222062124817629e-05, + "loss": 0.0, + "step": 45548 + }, + { + "epoch": 4.250163291966035, + "grad_norm": NaN, + "learning_rate": 6.221448794761909e-05, + "loss": 0.0, + "step": 45549 + }, + { + "epoch": 4.2502566016609125, + "grad_norm": NaN, + "learning_rate": 6.220835487027439e-05, + "loss": 0.0, + "step": 45550 + }, + { + "epoch": 4.25034991135579, + "grad_norm": NaN, + "learning_rate": 6.220222201615778e-05, + "loss": 0.0, + "step": 45551 + }, + { + "epoch": 4.250443221050667, + "grad_norm": NaN, + "learning_rate": 6.219608938528502e-05, + "loss": 0.0, + "step": 45552 + }, + { + "epoch": 4.250536530745545, + "grad_norm": NaN, + "learning_rate": 6.218995697767156e-05, + "loss": 0.0, + "step": 45553 + }, + { + "epoch": 4.250629840440422, + "grad_norm": NaN, + "learning_rate": 6.218382479333294e-05, + "loss": 0.0, + "step": 45554 + }, + { + "epoch": 4.250723150135299, + "grad_norm": NaN, + "learning_rate": 6.217769283228498e-05, + "loss": 0.0, + "step": 45555 + }, + { + "epoch": 4.250816459830176, + "grad_norm": NaN, + "learning_rate": 6.217156109454306e-05, + "loss": 0.0, + "step": 45556 + }, + { + "epoch": 4.2509097695250535, + "grad_norm": NaN, + "learning_rate": 6.21654295801228e-05, + "loss": 0.0, + "step": 45557 + }, + { + "epoch": 4.251003079219931, + "grad_norm": NaN, + "learning_rate": 6.215929828903994e-05, + "loss": 0.0, + "step": 45558 + }, + { + "epoch": 4.251096388914808, + "grad_norm": NaN, + "learning_rate": 6.215316722130989e-05, + "loss": 0.0, + "step": 45559 + }, + { + "epoch": 4.251189698609686, + "grad_norm": NaN, + "learning_rate": 6.214703637694827e-05, + "loss": 0.0, + "step": 45560 + }, + { + "epoch": 4.251283008304563, + "grad_norm": NaN, + "learning_rate": 6.214090575597083e-05, + "loss": 0.0, + "step": 45561 + }, + { + "epoch": 4.251376317999441, + "grad_norm": NaN, + "learning_rate": 6.2134775358393e-05, + "loss": 0.0, + "step": 45562 + }, + { + "epoch": 4.251469627694317, + "grad_norm": NaN, + "learning_rate": 6.21286451842304e-05, + "loss": 0.0, + "step": 45563 + }, + { + "epoch": 4.2515629373891946, + "grad_norm": NaN, + "learning_rate": 6.21225152334986e-05, + "loss": 0.0, + "step": 45564 + }, + { + "epoch": 4.251656247084072, + "grad_norm": NaN, + "learning_rate": 6.211638550621324e-05, + "loss": 0.0, + "step": 45565 + }, + { + "epoch": 4.251749556778949, + "grad_norm": NaN, + "learning_rate": 6.211025600238987e-05, + "loss": 0.0, + "step": 45566 + }, + { + "epoch": 4.251842866473827, + "grad_norm": NaN, + "learning_rate": 6.210412672204409e-05, + "loss": 0.0, + "step": 45567 + }, + { + "epoch": 4.251936176168704, + "grad_norm": NaN, + "learning_rate": 6.209799766519141e-05, + "loss": 0.0, + "step": 45568 + }, + { + "epoch": 4.252029485863581, + "grad_norm": NaN, + "learning_rate": 6.209186883184758e-05, + "loss": 0.0, + "step": 45569 + }, + { + "epoch": 4.252122795558458, + "grad_norm": NaN, + "learning_rate": 6.208574022202807e-05, + "loss": 0.0, + "step": 45570 + }, + { + "epoch": 4.252216105253336, + "grad_norm": NaN, + "learning_rate": 6.20796118357484e-05, + "loss": 0.0, + "step": 45571 + }, + { + "epoch": 4.252309414948213, + "grad_norm": NaN, + "learning_rate": 6.207348367302435e-05, + "loss": 0.0, + "step": 45572 + }, + { + "epoch": 4.25240272464309, + "grad_norm": NaN, + "learning_rate": 6.206735573387134e-05, + "loss": 0.0, + "step": 45573 + }, + { + "epoch": 4.252496034337968, + "grad_norm": NaN, + "learning_rate": 6.206122801830494e-05, + "loss": 0.0, + "step": 45574 + }, + { + "epoch": 4.252589344032845, + "grad_norm": NaN, + "learning_rate": 6.205510052634089e-05, + "loss": 0.0, + "step": 45575 + }, + { + "epoch": 4.252682653727723, + "grad_norm": NaN, + "learning_rate": 6.204897325799461e-05, + "loss": 0.0, + "step": 45576 + }, + { + "epoch": 4.252775963422599, + "grad_norm": NaN, + "learning_rate": 6.20428462132817e-05, + "loss": 0.0, + "step": 45577 + }, + { + "epoch": 4.252869273117477, + "grad_norm": NaN, + "learning_rate": 6.203671939221786e-05, + "loss": 0.0, + "step": 45578 + }, + { + "epoch": 4.252962582812354, + "grad_norm": NaN, + "learning_rate": 6.203059279481855e-05, + "loss": 0.0, + "step": 45579 + }, + { + "epoch": 4.2530558925072315, + "grad_norm": NaN, + "learning_rate": 6.202446642109932e-05, + "loss": 0.0, + "step": 45580 + }, + { + "epoch": 4.253149202202109, + "grad_norm": NaN, + "learning_rate": 6.201834027107593e-05, + "loss": 0.0, + "step": 45581 + }, + { + "epoch": 4.253242511896986, + "grad_norm": NaN, + "learning_rate": 6.201221434476378e-05, + "loss": 0.0, + "step": 45582 + }, + { + "epoch": 4.253335821591864, + "grad_norm": NaN, + "learning_rate": 6.200608864217845e-05, + "loss": 0.0, + "step": 45583 + }, + { + "epoch": 4.25342913128674, + "grad_norm": NaN, + "learning_rate": 6.19999631633357e-05, + "loss": 0.0, + "step": 45584 + }, + { + "epoch": 4.253522440981618, + "grad_norm": NaN, + "learning_rate": 6.199383790825088e-05, + "loss": 0.0, + "step": 45585 + }, + { + "epoch": 4.253615750676495, + "grad_norm": NaN, + "learning_rate": 6.198771287693964e-05, + "loss": 0.0, + "step": 45586 + }, + { + "epoch": 4.2537090603713725, + "grad_norm": NaN, + "learning_rate": 6.198158806941767e-05, + "loss": 0.0, + "step": 45587 + }, + { + "epoch": 4.25380237006625, + "grad_norm": NaN, + "learning_rate": 6.19754634857004e-05, + "loss": 0.0, + "step": 45588 + }, + { + "epoch": 4.253895679761127, + "grad_norm": NaN, + "learning_rate": 6.19693391258034e-05, + "loss": 0.0, + "step": 45589 + }, + { + "epoch": 4.253988989456005, + "grad_norm": NaN, + "learning_rate": 6.196321498974239e-05, + "loss": 0.0, + "step": 45590 + }, + { + "epoch": 4.254082299150882, + "grad_norm": NaN, + "learning_rate": 6.19570910775328e-05, + "loss": 0.0, + "step": 45591 + }, + { + "epoch": 4.254175608845759, + "grad_norm": NaN, + "learning_rate": 6.195096738919019e-05, + "loss": 0.0, + "step": 45592 + }, + { + "epoch": 4.254268918540636, + "grad_norm": NaN, + "learning_rate": 6.19448439247303e-05, + "loss": 0.0, + "step": 45593 + }, + { + "epoch": 4.254362228235514, + "grad_norm": NaN, + "learning_rate": 6.193872068416853e-05, + "loss": 0.0, + "step": 45594 + }, + { + "epoch": 4.254455537930391, + "grad_norm": NaN, + "learning_rate": 6.193259766752046e-05, + "loss": 0.0, + "step": 45595 + }, + { + "epoch": 4.254548847625268, + "grad_norm": NaN, + "learning_rate": 6.192647487480178e-05, + "loss": 0.0, + "step": 45596 + }, + { + "epoch": 4.254642157320146, + "grad_norm": NaN, + "learning_rate": 6.192035230602798e-05, + "loss": 0.0, + "step": 45597 + }, + { + "epoch": 4.254735467015023, + "grad_norm": NaN, + "learning_rate": 6.191422996121456e-05, + "loss": 0.0, + "step": 45598 + }, + { + "epoch": 4.2548287767099, + "grad_norm": NaN, + "learning_rate": 6.190810784037727e-05, + "loss": 0.0, + "step": 45599 + }, + { + "epoch": 4.254922086404777, + "grad_norm": NaN, + "learning_rate": 6.190198594353152e-05, + "loss": 0.0, + "step": 45600 + }, + { + "epoch": 4.255015396099655, + "grad_norm": NaN, + "learning_rate": 6.189586427069286e-05, + "loss": 0.0, + "step": 45601 + }, + { + "epoch": 4.255108705794532, + "grad_norm": NaN, + "learning_rate": 6.188974282187701e-05, + "loss": 0.0, + "step": 45602 + }, + { + "epoch": 4.2552020154894095, + "grad_norm": NaN, + "learning_rate": 6.18836215970994e-05, + "loss": 0.0, + "step": 45603 + }, + { + "epoch": 4.255295325184287, + "grad_norm": NaN, + "learning_rate": 6.187750059637561e-05, + "loss": 0.0, + "step": 45604 + }, + { + "epoch": 4.255388634879164, + "grad_norm": NaN, + "learning_rate": 6.18713798197213e-05, + "loss": 0.0, + "step": 45605 + }, + { + "epoch": 4.255481944574041, + "grad_norm": NaN, + "learning_rate": 6.186525926715189e-05, + "loss": 0.0, + "step": 45606 + }, + { + "epoch": 4.255575254268918, + "grad_norm": NaN, + "learning_rate": 6.185913893868311e-05, + "loss": 0.0, + "step": 45607 + }, + { + "epoch": 4.255668563963796, + "grad_norm": NaN, + "learning_rate": 6.18530188343304e-05, + "loss": 0.0, + "step": 45608 + }, + { + "epoch": 4.255761873658673, + "grad_norm": NaN, + "learning_rate": 6.184689895410928e-05, + "loss": 0.0, + "step": 45609 + }, + { + "epoch": 4.2558551833535505, + "grad_norm": NaN, + "learning_rate": 6.184077929803548e-05, + "loss": 0.0, + "step": 45610 + }, + { + "epoch": 4.255948493048428, + "grad_norm": NaN, + "learning_rate": 6.183465986612442e-05, + "loss": 0.0, + "step": 45611 + }, + { + "epoch": 4.256041802743305, + "grad_norm": NaN, + "learning_rate": 6.182854065839164e-05, + "loss": 0.0, + "step": 45612 + }, + { + "epoch": 4.256135112438182, + "grad_norm": NaN, + "learning_rate": 6.182242167485288e-05, + "loss": 0.0, + "step": 45613 + }, + { + "epoch": 4.256228422133059, + "grad_norm": NaN, + "learning_rate": 6.181630291552354e-05, + "loss": 0.0, + "step": 45614 + }, + { + "epoch": 4.256321731827937, + "grad_norm": NaN, + "learning_rate": 6.181018438041914e-05, + "loss": 0.0, + "step": 45615 + }, + { + "epoch": 4.256415041522814, + "grad_norm": NaN, + "learning_rate": 6.180406606955545e-05, + "loss": 0.0, + "step": 45616 + }, + { + "epoch": 4.2565083512176916, + "grad_norm": NaN, + "learning_rate": 6.179794798294782e-05, + "loss": 0.0, + "step": 45617 + }, + { + "epoch": 4.256601660912569, + "grad_norm": NaN, + "learning_rate": 6.179183012061184e-05, + "loss": 0.0, + "step": 45618 + }, + { + "epoch": 4.256694970607446, + "grad_norm": NaN, + "learning_rate": 6.178571248256321e-05, + "loss": 0.0, + "step": 45619 + }, + { + "epoch": 4.256788280302324, + "grad_norm": NaN, + "learning_rate": 6.177959506881733e-05, + "loss": 0.0, + "step": 45620 + }, + { + "epoch": 4.2568815899972, + "grad_norm": NaN, + "learning_rate": 6.177347787938975e-05, + "loss": 0.0, + "step": 45621 + }, + { + "epoch": 4.256974899692078, + "grad_norm": NaN, + "learning_rate": 6.17673609142962e-05, + "loss": 0.0, + "step": 45622 + }, + { + "epoch": 4.257068209386955, + "grad_norm": NaN, + "learning_rate": 6.176124417355206e-05, + "loss": 0.0, + "step": 45623 + }, + { + "epoch": 4.257161519081833, + "grad_norm": NaN, + "learning_rate": 6.175512765717287e-05, + "loss": 0.0, + "step": 45624 + }, + { + "epoch": 4.25725482877671, + "grad_norm": NaN, + "learning_rate": 6.174901136517435e-05, + "loss": 0.0, + "step": 45625 + }, + { + "epoch": 4.257348138471587, + "grad_norm": NaN, + "learning_rate": 6.174289529757191e-05, + "loss": 0.0, + "step": 45626 + }, + { + "epoch": 4.257441448166465, + "grad_norm": NaN, + "learning_rate": 6.17367794543811e-05, + "loss": 0.0, + "step": 45627 + }, + { + "epoch": 4.257534757861341, + "grad_norm": NaN, + "learning_rate": 6.173066383561761e-05, + "loss": 0.0, + "step": 45628 + }, + { + "epoch": 4.257628067556219, + "grad_norm": NaN, + "learning_rate": 6.172454844129685e-05, + "loss": 0.0, + "step": 45629 + }, + { + "epoch": 4.257721377251096, + "grad_norm": NaN, + "learning_rate": 6.171843327143436e-05, + "loss": 0.0, + "step": 45630 + }, + { + "epoch": 4.257814686945974, + "grad_norm": NaN, + "learning_rate": 6.171231832604584e-05, + "loss": 0.0, + "step": 45631 + }, + { + "epoch": 4.257907996640851, + "grad_norm": NaN, + "learning_rate": 6.17062036051467e-05, + "loss": 0.0, + "step": 45632 + }, + { + "epoch": 4.2580013063357285, + "grad_norm": NaN, + "learning_rate": 6.170008910875248e-05, + "loss": 0.0, + "step": 45633 + }, + { + "epoch": 4.258094616030606, + "grad_norm": NaN, + "learning_rate": 6.169397483687886e-05, + "loss": 0.0, + "step": 45634 + }, + { + "epoch": 4.258187925725483, + "grad_norm": NaN, + "learning_rate": 6.168786078954127e-05, + "loss": 0.0, + "step": 45635 + }, + { + "epoch": 4.25828123542036, + "grad_norm": NaN, + "learning_rate": 6.168174696675522e-05, + "loss": 0.0, + "step": 45636 + }, + { + "epoch": 4.258374545115237, + "grad_norm": NaN, + "learning_rate": 6.167563336853645e-05, + "loss": 0.0, + "step": 45637 + }, + { + "epoch": 4.258467854810115, + "grad_norm": NaN, + "learning_rate": 6.166951999490031e-05, + "loss": 0.0, + "step": 45638 + }, + { + "epoch": 4.258561164504992, + "grad_norm": NaN, + "learning_rate": 6.166340684586239e-05, + "loss": 0.0, + "step": 45639 + }, + { + "epoch": 4.2586544741998695, + "grad_norm": NaN, + "learning_rate": 6.165729392143833e-05, + "loss": 0.0, + "step": 45640 + }, + { + "epoch": 4.258747783894747, + "grad_norm": NaN, + "learning_rate": 6.165118122164354e-05, + "loss": 0.0, + "step": 45641 + }, + { + "epoch": 4.258841093589624, + "grad_norm": NaN, + "learning_rate": 6.164506874649363e-05, + "loss": 0.0, + "step": 45642 + }, + { + "epoch": 4.258934403284501, + "grad_norm": NaN, + "learning_rate": 6.163895649600422e-05, + "loss": 0.0, + "step": 45643 + }, + { + "epoch": 4.259027712979378, + "grad_norm": NaN, + "learning_rate": 6.163284447019066e-05, + "loss": 0.0, + "step": 45644 + }, + { + "epoch": 4.259121022674256, + "grad_norm": NaN, + "learning_rate": 6.162673266906864e-05, + "loss": 0.0, + "step": 45645 + }, + { + "epoch": 4.259214332369133, + "grad_norm": NaN, + "learning_rate": 6.162062109265373e-05, + "loss": 0.0, + "step": 45646 + }, + { + "epoch": 4.259307642064011, + "grad_norm": NaN, + "learning_rate": 6.161450974096128e-05, + "loss": 0.0, + "step": 45647 + }, + { + "epoch": 4.259400951758888, + "grad_norm": NaN, + "learning_rate": 6.160839861400702e-05, + "loss": 0.0, + "step": 45648 + }, + { + "epoch": 4.259494261453765, + "grad_norm": NaN, + "learning_rate": 6.160228771180646e-05, + "loss": 0.0, + "step": 45649 + }, + { + "epoch": 4.259587571148642, + "grad_norm": NaN, + "learning_rate": 6.1596177034375e-05, + "loss": 0.0, + "step": 45650 + }, + { + "epoch": 4.259680880843519, + "grad_norm": NaN, + "learning_rate": 6.159006658172832e-05, + "loss": 0.0, + "step": 45651 + }, + { + "epoch": 4.259774190538397, + "grad_norm": NaN, + "learning_rate": 6.158395635388196e-05, + "loss": 0.0, + "step": 45652 + }, + { + "epoch": 4.259867500233274, + "grad_norm": NaN, + "learning_rate": 6.157784635085132e-05, + "loss": 0.0, + "step": 45653 + }, + { + "epoch": 4.259960809928152, + "grad_norm": NaN, + "learning_rate": 6.157173657265213e-05, + "loss": 0.0, + "step": 45654 + }, + { + "epoch": 4.260054119623029, + "grad_norm": NaN, + "learning_rate": 6.156562701929975e-05, + "loss": 0.0, + "step": 45655 + }, + { + "epoch": 4.2601474293179065, + "grad_norm": NaN, + "learning_rate": 6.155951769080972e-05, + "loss": 0.0, + "step": 45656 + }, + { + "epoch": 4.260240739012783, + "grad_norm": NaN, + "learning_rate": 6.155340858719778e-05, + "loss": 0.0, + "step": 45657 + }, + { + "epoch": 4.26033404870766, + "grad_norm": NaN, + "learning_rate": 6.154729970847924e-05, + "loss": 0.0, + "step": 45658 + }, + { + "epoch": 4.260427358402538, + "grad_norm": NaN, + "learning_rate": 6.154119105466967e-05, + "loss": 0.0, + "step": 45659 + }, + { + "epoch": 4.260520668097415, + "grad_norm": NaN, + "learning_rate": 6.153508262578475e-05, + "loss": 0.0, + "step": 45660 + }, + { + "epoch": 4.260613977792293, + "grad_norm": NaN, + "learning_rate": 6.152897442183987e-05, + "loss": 0.0, + "step": 45661 + }, + { + "epoch": 4.26070728748717, + "grad_norm": NaN, + "learning_rate": 6.152286644285054e-05, + "loss": 0.0, + "step": 45662 + }, + { + "epoch": 4.2608005971820475, + "grad_norm": NaN, + "learning_rate": 6.151675868883246e-05, + "loss": 0.0, + "step": 45663 + }, + { + "epoch": 4.260893906876925, + "grad_norm": NaN, + "learning_rate": 6.151065115980099e-05, + "loss": 0.0, + "step": 45664 + }, + { + "epoch": 4.2609872165718015, + "grad_norm": NaN, + "learning_rate": 6.15045438557717e-05, + "loss": 0.0, + "step": 45665 + }, + { + "epoch": 4.261080526266679, + "grad_norm": NaN, + "learning_rate": 6.149843677676021e-05, + "loss": 0.0, + "step": 45666 + }, + { + "epoch": 4.261173835961556, + "grad_norm": NaN, + "learning_rate": 6.149232992278194e-05, + "loss": 0.0, + "step": 45667 + }, + { + "epoch": 4.261267145656434, + "grad_norm": NaN, + "learning_rate": 6.14862232938524e-05, + "loss": 0.0, + "step": 45668 + }, + { + "epoch": 4.261360455351311, + "grad_norm": NaN, + "learning_rate": 6.148011688998729e-05, + "loss": 0.0, + "step": 45669 + }, + { + "epoch": 4.261453765046189, + "grad_norm": NaN, + "learning_rate": 6.147401071120196e-05, + "loss": 0.0, + "step": 45670 + }, + { + "epoch": 4.261547074741066, + "grad_norm": NaN, + "learning_rate": 6.146790475751194e-05, + "loss": 0.0, + "step": 45671 + }, + { + "epoch": 4.2616403844359425, + "grad_norm": NaN, + "learning_rate": 6.146179902893293e-05, + "loss": 0.0, + "step": 45672 + }, + { + "epoch": 4.26173369413082, + "grad_norm": NaN, + "learning_rate": 6.145569352548028e-05, + "loss": 0.0, + "step": 45673 + }, + { + "epoch": 4.261827003825697, + "grad_norm": NaN, + "learning_rate": 6.144958824716952e-05, + "loss": 0.0, + "step": 45674 + }, + { + "epoch": 4.261920313520575, + "grad_norm": NaN, + "learning_rate": 6.144348319401633e-05, + "loss": 0.0, + "step": 45675 + }, + { + "epoch": 4.262013623215452, + "grad_norm": NaN, + "learning_rate": 6.14373783660361e-05, + "loss": 0.0, + "step": 45676 + }, + { + "epoch": 4.26210693291033, + "grad_norm": NaN, + "learning_rate": 6.14312737632443e-05, + "loss": 0.0, + "step": 45677 + }, + { + "epoch": 4.262200242605207, + "grad_norm": NaN, + "learning_rate": 6.142516938565664e-05, + "loss": 0.0, + "step": 45678 + }, + { + "epoch": 4.262293552300084, + "grad_norm": NaN, + "learning_rate": 6.141906523328845e-05, + "loss": 0.0, + "step": 45679 + }, + { + "epoch": 4.262386861994961, + "grad_norm": NaN, + "learning_rate": 6.141296130615537e-05, + "loss": 0.0, + "step": 45680 + }, + { + "epoch": 4.262480171689838, + "grad_norm": NaN, + "learning_rate": 6.140685760427296e-05, + "loss": 0.0, + "step": 45681 + }, + { + "epoch": 4.262573481384716, + "grad_norm": NaN, + "learning_rate": 6.140075412765655e-05, + "loss": 0.0, + "step": 45682 + }, + { + "epoch": 4.262666791079593, + "grad_norm": NaN, + "learning_rate": 6.139465087632182e-05, + "loss": 0.0, + "step": 45683 + }, + { + "epoch": 4.262760100774471, + "grad_norm": NaN, + "learning_rate": 6.138854785028433e-05, + "loss": 0.0, + "step": 45684 + }, + { + "epoch": 4.262853410469348, + "grad_norm": NaN, + "learning_rate": 6.13824450495594e-05, + "loss": 0.0, + "step": 45685 + }, + { + "epoch": 4.262946720164225, + "grad_norm": NaN, + "learning_rate": 6.13763424741627e-05, + "loss": 0.0, + "step": 45686 + }, + { + "epoch": 4.263040029859102, + "grad_norm": NaN, + "learning_rate": 6.137024012410975e-05, + "loss": 0.0, + "step": 45687 + }, + { + "epoch": 4.263133339553979, + "grad_norm": NaN, + "learning_rate": 6.136413799941595e-05, + "loss": 0.0, + "step": 45688 + }, + { + "epoch": 4.263226649248857, + "grad_norm": NaN, + "learning_rate": 6.135803610009693e-05, + "loss": 0.0, + "step": 45689 + }, + { + "epoch": 4.263319958943734, + "grad_norm": NaN, + "learning_rate": 6.135193442616823e-05, + "loss": 0.0, + "step": 45690 + }, + { + "epoch": 4.263413268638612, + "grad_norm": NaN, + "learning_rate": 6.134583297764518e-05, + "loss": 0.0, + "step": 45691 + }, + { + "epoch": 4.263506578333489, + "grad_norm": NaN, + "learning_rate": 6.133973175454346e-05, + "loss": 0.0, + "step": 45692 + }, + { + "epoch": 4.2635998880283665, + "grad_norm": NaN, + "learning_rate": 6.133363075687862e-05, + "loss": 0.0, + "step": 45693 + }, + { + "epoch": 4.263693197723243, + "grad_norm": NaN, + "learning_rate": 6.132752998466597e-05, + "loss": 0.0, + "step": 45694 + }, + { + "epoch": 4.2637865074181205, + "grad_norm": NaN, + "learning_rate": 6.13214294379212e-05, + "loss": 0.0, + "step": 45695 + }, + { + "epoch": 4.263879817112998, + "grad_norm": NaN, + "learning_rate": 6.131532911665983e-05, + "loss": 0.0, + "step": 45696 + }, + { + "epoch": 4.263973126807875, + "grad_norm": NaN, + "learning_rate": 6.130922902089718e-05, + "loss": 0.0, + "step": 45697 + }, + { + "epoch": 4.264066436502753, + "grad_norm": NaN, + "learning_rate": 6.130312915064902e-05, + "loss": 0.0, + "step": 45698 + }, + { + "epoch": 4.26415974619763, + "grad_norm": NaN, + "learning_rate": 6.129702950593065e-05, + "loss": 0.0, + "step": 45699 + }, + { + "epoch": 4.264253055892508, + "grad_norm": NaN, + "learning_rate": 6.129093008675762e-05, + "loss": 0.0, + "step": 45700 + }, + { + "epoch": 4.264346365587384, + "grad_norm": NaN, + "learning_rate": 6.128483089314559e-05, + "loss": 0.0, + "step": 45701 + }, + { + "epoch": 4.2644396752822615, + "grad_norm": NaN, + "learning_rate": 6.127873192510988e-05, + "loss": 0.0, + "step": 45702 + }, + { + "epoch": 4.264532984977139, + "grad_norm": NaN, + "learning_rate": 6.127263318266604e-05, + "loss": 0.0, + "step": 45703 + }, + { + "epoch": 4.264626294672016, + "grad_norm": NaN, + "learning_rate": 6.126653466582971e-05, + "loss": 0.0, + "step": 45704 + }, + { + "epoch": 4.264719604366894, + "grad_norm": NaN, + "learning_rate": 6.126043637461623e-05, + "loss": 0.0, + "step": 45705 + }, + { + "epoch": 4.264812914061771, + "grad_norm": NaN, + "learning_rate": 6.125433830904115e-05, + "loss": 0.0, + "step": 45706 + }, + { + "epoch": 4.264906223756649, + "grad_norm": NaN, + "learning_rate": 6.124824046912007e-05, + "loss": 0.0, + "step": 45707 + }, + { + "epoch": 4.264999533451526, + "grad_norm": NaN, + "learning_rate": 6.12421428548684e-05, + "loss": 0.0, + "step": 45708 + }, + { + "epoch": 4.265092843146403, + "grad_norm": NaN, + "learning_rate": 6.12360454663016e-05, + "loss": 0.0, + "step": 45709 + }, + { + "epoch": 4.26518615284128, + "grad_norm": NaN, + "learning_rate": 6.122994830343536e-05, + "loss": 0.0, + "step": 45710 + }, + { + "epoch": 4.265279462536157, + "grad_norm": NaN, + "learning_rate": 6.122385136628503e-05, + "loss": 0.0, + "step": 45711 + }, + { + "epoch": 4.265372772231035, + "grad_norm": NaN, + "learning_rate": 6.121775465486606e-05, + "loss": 0.0, + "step": 45712 + }, + { + "epoch": 4.265466081925912, + "grad_norm": NaN, + "learning_rate": 6.121165816919416e-05, + "loss": 0.0, + "step": 45713 + }, + { + "epoch": 4.26555939162079, + "grad_norm": NaN, + "learning_rate": 6.120556190928467e-05, + "loss": 0.0, + "step": 45714 + }, + { + "epoch": 4.265652701315667, + "grad_norm": NaN, + "learning_rate": 6.119946587515307e-05, + "loss": 0.0, + "step": 45715 + }, + { + "epoch": 4.265746011010544, + "grad_norm": NaN, + "learning_rate": 6.119337006681502e-05, + "loss": 0.0, + "step": 45716 + }, + { + "epoch": 4.265839320705421, + "grad_norm": NaN, + "learning_rate": 6.118727448428583e-05, + "loss": 0.0, + "step": 45717 + }, + { + "epoch": 4.2659326304002985, + "grad_norm": NaN, + "learning_rate": 6.118117912758114e-05, + "loss": 0.0, + "step": 45718 + }, + { + "epoch": 4.266025940095176, + "grad_norm": NaN, + "learning_rate": 6.117508399671647e-05, + "loss": 0.0, + "step": 45719 + }, + { + "epoch": 4.266119249790053, + "grad_norm": NaN, + "learning_rate": 6.116898909170713e-05, + "loss": 0.0, + "step": 45720 + }, + { + "epoch": 4.266212559484931, + "grad_norm": NaN, + "learning_rate": 6.11628944125688e-05, + "loss": 0.0, + "step": 45721 + }, + { + "epoch": 4.266305869179808, + "grad_norm": NaN, + "learning_rate": 6.115679995931695e-05, + "loss": 0.0, + "step": 45722 + }, + { + "epoch": 4.266399178874685, + "grad_norm": NaN, + "learning_rate": 6.115070573196693e-05, + "loss": 0.0, + "step": 45723 + }, + { + "epoch": 4.266492488569562, + "grad_norm": NaN, + "learning_rate": 6.114461173053441e-05, + "loss": 0.0, + "step": 45724 + }, + { + "epoch": 4.2665857982644395, + "grad_norm": NaN, + "learning_rate": 6.113851795503488e-05, + "loss": 0.0, + "step": 45725 + }, + { + "epoch": 4.266679107959317, + "grad_norm": NaN, + "learning_rate": 6.113242440548368e-05, + "loss": 0.0, + "step": 45726 + }, + { + "epoch": 4.266772417654194, + "grad_norm": NaN, + "learning_rate": 6.112633108189643e-05, + "loss": 0.0, + "step": 45727 + }, + { + "epoch": 4.266865727349072, + "grad_norm": NaN, + "learning_rate": 6.112023798428865e-05, + "loss": 0.0, + "step": 45728 + }, + { + "epoch": 4.266959037043949, + "grad_norm": NaN, + "learning_rate": 6.111414511267567e-05, + "loss": 0.0, + "step": 45729 + }, + { + "epoch": 4.267052346738826, + "grad_norm": NaN, + "learning_rate": 6.110805246707316e-05, + "loss": 0.0, + "step": 45730 + }, + { + "epoch": 4.267145656433703, + "grad_norm": NaN, + "learning_rate": 6.110196004749659e-05, + "loss": 0.0, + "step": 45731 + }, + { + "epoch": 4.267238966128581, + "grad_norm": NaN, + "learning_rate": 6.109586785396129e-05, + "loss": 0.0, + "step": 45732 + }, + { + "epoch": 4.267332275823458, + "grad_norm": NaN, + "learning_rate": 6.108977588648292e-05, + "loss": 0.0, + "step": 45733 + }, + { + "epoch": 4.267425585518335, + "grad_norm": NaN, + "learning_rate": 6.108368414507696e-05, + "loss": 0.0, + "step": 45734 + }, + { + "epoch": 4.267518895213213, + "grad_norm": NaN, + "learning_rate": 6.107759262975874e-05, + "loss": 0.0, + "step": 45735 + }, + { + "epoch": 4.26761220490809, + "grad_norm": NaN, + "learning_rate": 6.107150134054392e-05, + "loss": 0.0, + "step": 45736 + }, + { + "epoch": 4.267705514602968, + "grad_norm": NaN, + "learning_rate": 6.106541027744797e-05, + "loss": 0.0, + "step": 45737 + }, + { + "epoch": 4.267798824297844, + "grad_norm": NaN, + "learning_rate": 6.105931944048626e-05, + "loss": 0.0, + "step": 45738 + }, + { + "epoch": 4.267892133992722, + "grad_norm": NaN, + "learning_rate": 6.105322882967439e-05, + "loss": 0.0, + "step": 45739 + }, + { + "epoch": 4.267985443687599, + "grad_norm": NaN, + "learning_rate": 6.104713844502788e-05, + "loss": 0.0, + "step": 45740 + }, + { + "epoch": 4.268078753382476, + "grad_norm": NaN, + "learning_rate": 6.1041048286562e-05, + "loss": 0.0, + "step": 45741 + }, + { + "epoch": 4.268172063077354, + "grad_norm": NaN, + "learning_rate": 6.1034958354292536e-05, + "loss": 0.0, + "step": 45742 + }, + { + "epoch": 4.268265372772231, + "grad_norm": NaN, + "learning_rate": 6.102886864823473e-05, + "loss": 0.0, + "step": 45743 + }, + { + "epoch": 4.268358682467109, + "grad_norm": NaN, + "learning_rate": 6.1022779168404126e-05, + "loss": 0.0, + "step": 45744 + }, + { + "epoch": 4.268451992161985, + "grad_norm": NaN, + "learning_rate": 6.101668991481634e-05, + "loss": 0.0, + "step": 45745 + }, + { + "epoch": 4.268545301856863, + "grad_norm": NaN, + "learning_rate": 6.101060088748669e-05, + "loss": 0.0, + "step": 45746 + }, + { + "epoch": 4.26863861155174, + "grad_norm": NaN, + "learning_rate": 6.1004512086430674e-05, + "loss": 0.0, + "step": 45747 + }, + { + "epoch": 4.2687319212466175, + "grad_norm": NaN, + "learning_rate": 6.099842351166393e-05, + "loss": 0.0, + "step": 45748 + }, + { + "epoch": 4.268825230941495, + "grad_norm": NaN, + "learning_rate": 6.099233516320178e-05, + "loss": 0.0, + "step": 45749 + }, + { + "epoch": 4.268918540636372, + "grad_norm": NaN, + "learning_rate": 6.09862470410597e-05, + "loss": 0.0, + "step": 45750 + }, + { + "epoch": 4.26901185033125, + "grad_norm": NaN, + "learning_rate": 6.0980159145253334e-05, + "loss": 0.0, + "step": 45751 + }, + { + "epoch": 4.269105160026127, + "grad_norm": NaN, + "learning_rate": 6.0974071475798e-05, + "loss": 0.0, + "step": 45752 + }, + { + "epoch": 4.269198469721004, + "grad_norm": NaN, + "learning_rate": 6.096798403270917e-05, + "loss": 0.0, + "step": 45753 + }, + { + "epoch": 4.269291779415881, + "grad_norm": NaN, + "learning_rate": 6.096189681600249e-05, + "loss": 0.0, + "step": 45754 + }, + { + "epoch": 4.2693850891107585, + "grad_norm": NaN, + "learning_rate": 6.095580982569323e-05, + "loss": 0.0, + "step": 45755 + }, + { + "epoch": 4.269478398805636, + "grad_norm": NaN, + "learning_rate": 6.094972306179702e-05, + "loss": 0.0, + "step": 45756 + }, + { + "epoch": 4.269571708500513, + "grad_norm": NaN, + "learning_rate": 6.094363652432934e-05, + "loss": 0.0, + "step": 45757 + }, + { + "epoch": 4.269665018195391, + "grad_norm": NaN, + "learning_rate": 6.093755021330551e-05, + "loss": 0.0, + "step": 45758 + }, + { + "epoch": 4.269758327890268, + "grad_norm": NaN, + "learning_rate": 6.093146412874116e-05, + "loss": 0.0, + "step": 45759 + }, + { + "epoch": 4.269851637585145, + "grad_norm": NaN, + "learning_rate": 6.0925378270651764e-05, + "loss": 0.0, + "step": 45760 + }, + { + "epoch": 4.269944947280022, + "grad_norm": NaN, + "learning_rate": 6.091929263905263e-05, + "loss": 0.0, + "step": 45761 + }, + { + "epoch": 4.2700382569749, + "grad_norm": NaN, + "learning_rate": 6.091320723395941e-05, + "loss": 0.0, + "step": 45762 + }, + { + "epoch": 4.270131566669777, + "grad_norm": NaN, + "learning_rate": 6.090712205538758e-05, + "loss": 0.0, + "step": 45763 + }, + { + "epoch": 4.270224876364654, + "grad_norm": NaN, + "learning_rate": 6.090103710335241e-05, + "loss": 0.0, + "step": 45764 + }, + { + "epoch": 4.270318186059532, + "grad_norm": NaN, + "learning_rate": 6.089495237786959e-05, + "loss": 0.0, + "step": 45765 + }, + { + "epoch": 4.270411495754409, + "grad_norm": NaN, + "learning_rate": 6.088886787895456e-05, + "loss": 0.0, + "step": 45766 + }, + { + "epoch": 4.270504805449286, + "grad_norm": NaN, + "learning_rate": 6.088278360662262e-05, + "loss": 0.0, + "step": 45767 + }, + { + "epoch": 4.270598115144163, + "grad_norm": NaN, + "learning_rate": 6.0876699560889426e-05, + "loss": 0.0, + "step": 45768 + }, + { + "epoch": 4.270691424839041, + "grad_norm": NaN, + "learning_rate": 6.087061574177044e-05, + "loss": 0.0, + "step": 45769 + }, + { + "epoch": 4.270784734533918, + "grad_norm": NaN, + "learning_rate": 6.086453214928097e-05, + "loss": 0.0, + "step": 45770 + }, + { + "epoch": 4.2708780442287955, + "grad_norm": NaN, + "learning_rate": 6.0858448783436634e-05, + "loss": 0.0, + "step": 45771 + }, + { + "epoch": 4.270971353923673, + "grad_norm": NaN, + "learning_rate": 6.0852365644252926e-05, + "loss": 0.0, + "step": 45772 + }, + { + "epoch": 4.27106466361855, + "grad_norm": NaN, + "learning_rate": 6.084628273174513e-05, + "loss": 0.0, + "step": 45773 + }, + { + "epoch": 4.271157973313427, + "grad_norm": NaN, + "learning_rate": 6.084020004592888e-05, + "loss": 0.0, + "step": 45774 + }, + { + "epoch": 4.271251283008304, + "grad_norm": NaN, + "learning_rate": 6.083411758681965e-05, + "loss": 0.0, + "step": 45775 + }, + { + "epoch": 4.271344592703182, + "grad_norm": NaN, + "learning_rate": 6.082803535443273e-05, + "loss": 0.0, + "step": 45776 + }, + { + "epoch": 4.271437902398059, + "grad_norm": NaN, + "learning_rate": 6.0821953348783746e-05, + "loss": 0.0, + "step": 45777 + }, + { + "epoch": 4.2715312120929365, + "grad_norm": NaN, + "learning_rate": 6.081587156988819e-05, + "loss": 0.0, + "step": 45778 + }, + { + "epoch": 4.271624521787814, + "grad_norm": NaN, + "learning_rate": 6.080979001776135e-05, + "loss": 0.0, + "step": 45779 + }, + { + "epoch": 4.271717831482691, + "grad_norm": NaN, + "learning_rate": 6.080370869241883e-05, + "loss": 0.0, + "step": 45780 + }, + { + "epoch": 4.271811141177569, + "grad_norm": NaN, + "learning_rate": 6.079762759387613e-05, + "loss": 0.0, + "step": 45781 + }, + { + "epoch": 4.271904450872445, + "grad_norm": NaN, + "learning_rate": 6.079154672214852e-05, + "loss": 0.0, + "step": 45782 + }, + { + "epoch": 4.271997760567323, + "grad_norm": NaN, + "learning_rate": 6.078546607725164e-05, + "loss": 0.0, + "step": 45783 + }, + { + "epoch": 4.2720910702622, + "grad_norm": NaN, + "learning_rate": 6.0779385659200926e-05, + "loss": 0.0, + "step": 45784 + }, + { + "epoch": 4.272184379957078, + "grad_norm": NaN, + "learning_rate": 6.077330546801172e-05, + "loss": 0.0, + "step": 45785 + }, + { + "epoch": 4.272277689651955, + "grad_norm": NaN, + "learning_rate": 6.0767225503699626e-05, + "loss": 0.0, + "step": 45786 + }, + { + "epoch": 4.272370999346832, + "grad_norm": NaN, + "learning_rate": 6.0761145766280085e-05, + "loss": 0.0, + "step": 45787 + }, + { + "epoch": 4.27246430904171, + "grad_norm": NaN, + "learning_rate": 6.0755066255768406e-05, + "loss": 0.0, + "step": 45788 + }, + { + "epoch": 4.272557618736586, + "grad_norm": NaN, + "learning_rate": 6.074898697218027e-05, + "loss": 0.0, + "step": 45789 + }, + { + "epoch": 4.272650928431464, + "grad_norm": NaN, + "learning_rate": 6.074290791553097e-05, + "loss": 0.0, + "step": 45790 + }, + { + "epoch": 4.272744238126341, + "grad_norm": NaN, + "learning_rate": 6.073682908583596e-05, + "loss": 0.0, + "step": 45791 + }, + { + "epoch": 4.272837547821219, + "grad_norm": NaN, + "learning_rate": 6.073075048311086e-05, + "loss": 0.0, + "step": 45792 + }, + { + "epoch": 4.272930857516096, + "grad_norm": NaN, + "learning_rate": 6.07246721073709e-05, + "loss": 0.0, + "step": 45793 + }, + { + "epoch": 4.2730241672109734, + "grad_norm": NaN, + "learning_rate": 6.071859395863172e-05, + "loss": 0.0, + "step": 45794 + }, + { + "epoch": 4.273117476905851, + "grad_norm": NaN, + "learning_rate": 6.071251603690877e-05, + "loss": 0.0, + "step": 45795 + }, + { + "epoch": 4.273210786600728, + "grad_norm": NaN, + "learning_rate": 6.070643834221732e-05, + "loss": 0.0, + "step": 45796 + }, + { + "epoch": 4.273304096295605, + "grad_norm": NaN, + "learning_rate": 6.070036087457299e-05, + "loss": 0.0, + "step": 45797 + }, + { + "epoch": 4.273397405990482, + "grad_norm": NaN, + "learning_rate": 6.069428363399127e-05, + "loss": 0.0, + "step": 45798 + }, + { + "epoch": 4.27349071568536, + "grad_norm": NaN, + "learning_rate": 6.0688206620487415e-05, + "loss": 0.0, + "step": 45799 + }, + { + "epoch": 4.273584025380237, + "grad_norm": NaN, + "learning_rate": 6.068212983407704e-05, + "loss": 0.0, + "step": 45800 + }, + { + "epoch": 4.2736773350751145, + "grad_norm": NaN, + "learning_rate": 6.067605327477562e-05, + "loss": 0.0, + "step": 45801 + }, + { + "epoch": 4.273770644769992, + "grad_norm": NaN, + "learning_rate": 6.066997694259842e-05, + "loss": 0.0, + "step": 45802 + }, + { + "epoch": 4.273863954464868, + "grad_norm": NaN, + "learning_rate": 6.066390083756106e-05, + "loss": 0.0, + "step": 45803 + }, + { + "epoch": 4.273957264159746, + "grad_norm": NaN, + "learning_rate": 6.065782495967899e-05, + "loss": 0.0, + "step": 45804 + }, + { + "epoch": 4.274050573854623, + "grad_norm": NaN, + "learning_rate": 6.065174930896751e-05, + "loss": 0.0, + "step": 45805 + }, + { + "epoch": 4.274143883549501, + "grad_norm": NaN, + "learning_rate": 6.0645673885442206e-05, + "loss": 0.0, + "step": 45806 + }, + { + "epoch": 4.274237193244378, + "grad_norm": NaN, + "learning_rate": 6.063959868911854e-05, + "loss": 0.0, + "step": 45807 + }, + { + "epoch": 4.2743305029392555, + "grad_norm": NaN, + "learning_rate": 6.063352372001179e-05, + "loss": 0.0, + "step": 45808 + }, + { + "epoch": 4.274423812634133, + "grad_norm": NaN, + "learning_rate": 6.062744897813758e-05, + "loss": 0.0, + "step": 45809 + }, + { + "epoch": 4.27451712232901, + "grad_norm": NaN, + "learning_rate": 6.0621374463511354e-05, + "loss": 0.0, + "step": 45810 + }, + { + "epoch": 4.274610432023887, + "grad_norm": NaN, + "learning_rate": 6.061530017614837e-05, + "loss": 0.0, + "step": 45811 + }, + { + "epoch": 4.274703741718764, + "grad_norm": NaN, + "learning_rate": 6.060922611606426e-05, + "loss": 0.0, + "step": 45812 + }, + { + "epoch": 4.274797051413642, + "grad_norm": NaN, + "learning_rate": 6.0603152283274473e-05, + "loss": 0.0, + "step": 45813 + }, + { + "epoch": 4.274890361108519, + "grad_norm": NaN, + "learning_rate": 6.0597078677794276e-05, + "loss": 0.0, + "step": 45814 + }, + { + "epoch": 4.274983670803397, + "grad_norm": NaN, + "learning_rate": 6.0591005299639284e-05, + "loss": 0.0, + "step": 45815 + }, + { + "epoch": 4.275076980498274, + "grad_norm": NaN, + "learning_rate": 6.058493214882492e-05, + "loss": 0.0, + "step": 45816 + }, + { + "epoch": 4.275170290193151, + "grad_norm": NaN, + "learning_rate": 6.05788592253665e-05, + "loss": 0.0, + "step": 45817 + }, + { + "epoch": 4.275263599888028, + "grad_norm": NaN, + "learning_rate": 6.057278652927959e-05, + "loss": 0.0, + "step": 45818 + }, + { + "epoch": 4.275356909582905, + "grad_norm": NaN, + "learning_rate": 6.056671406057964e-05, + "loss": 0.0, + "step": 45819 + }, + { + "epoch": 4.275450219277783, + "grad_norm": NaN, + "learning_rate": 6.0560641819281955e-05, + "loss": 0.0, + "step": 45820 + }, + { + "epoch": 4.27554352897266, + "grad_norm": NaN, + "learning_rate": 6.055456980540211e-05, + "loss": 0.0, + "step": 45821 + }, + { + "epoch": 4.275636838667538, + "grad_norm": NaN, + "learning_rate": 6.0548498018955546e-05, + "loss": 0.0, + "step": 45822 + }, + { + "epoch": 4.275730148362415, + "grad_norm": NaN, + "learning_rate": 6.054242645995755e-05, + "loss": 0.0, + "step": 45823 + }, + { + "epoch": 4.2758234580572925, + "grad_norm": NaN, + "learning_rate": 6.053635512842373e-05, + "loss": 0.0, + "step": 45824 + }, + { + "epoch": 4.27591676775217, + "grad_norm": NaN, + "learning_rate": 6.05302840243695e-05, + "loss": 0.0, + "step": 45825 + }, + { + "epoch": 4.276010077447046, + "grad_norm": NaN, + "learning_rate": 6.052421314781015e-05, + "loss": 0.0, + "step": 45826 + }, + { + "epoch": 4.276103387141924, + "grad_norm": NaN, + "learning_rate": 6.051814249876127e-05, + "loss": 0.0, + "step": 45827 + }, + { + "epoch": 4.276196696836801, + "grad_norm": NaN, + "learning_rate": 6.05120720772383e-05, + "loss": 0.0, + "step": 45828 + }, + { + "epoch": 4.276290006531679, + "grad_norm": NaN, + "learning_rate": 6.050600188325652e-05, + "loss": 0.0, + "step": 45829 + }, + { + "epoch": 4.276383316226556, + "grad_norm": NaN, + "learning_rate": 6.049993191683152e-05, + "loss": 0.0, + "step": 45830 + }, + { + "epoch": 4.2764766259214335, + "grad_norm": NaN, + "learning_rate": 6.0493862177978695e-05, + "loss": 0.0, + "step": 45831 + }, + { + "epoch": 4.276569935616311, + "grad_norm": NaN, + "learning_rate": 6.048779266671344e-05, + "loss": 0.0, + "step": 45832 + }, + { + "epoch": 4.2766632453111875, + "grad_norm": NaN, + "learning_rate": 6.048172338305128e-05, + "loss": 0.0, + "step": 45833 + }, + { + "epoch": 4.276756555006065, + "grad_norm": NaN, + "learning_rate": 6.047565432700747e-05, + "loss": 0.0, + "step": 45834 + }, + { + "epoch": 4.276849864700942, + "grad_norm": NaN, + "learning_rate": 6.046958549859761e-05, + "loss": 0.0, + "step": 45835 + }, + { + "epoch": 4.27694317439582, + "grad_norm": NaN, + "learning_rate": 6.0463516897837115e-05, + "loss": 0.0, + "step": 45836 + }, + { + "epoch": 4.277036484090697, + "grad_norm": NaN, + "learning_rate": 6.0457448524741295e-05, + "loss": 0.0, + "step": 45837 + }, + { + "epoch": 4.277129793785575, + "grad_norm": NaN, + "learning_rate": 6.0451380379325674e-05, + "loss": 0.0, + "step": 45838 + }, + { + "epoch": 4.277223103480452, + "grad_norm": NaN, + "learning_rate": 6.044531246160576e-05, + "loss": 0.0, + "step": 45839 + }, + { + "epoch": 4.2773164131753285, + "grad_norm": NaN, + "learning_rate": 6.043924477159675e-05, + "loss": 0.0, + "step": 45840 + }, + { + "epoch": 4.277409722870206, + "grad_norm": NaN, + "learning_rate": 6.043317730931427e-05, + "loss": 0.0, + "step": 45841 + }, + { + "epoch": 4.277503032565083, + "grad_norm": NaN, + "learning_rate": 6.042711007477374e-05, + "loss": 0.0, + "step": 45842 + }, + { + "epoch": 4.277596342259961, + "grad_norm": NaN, + "learning_rate": 6.042104306799045e-05, + "loss": 0.0, + "step": 45843 + }, + { + "epoch": 4.277689651954838, + "grad_norm": NaN, + "learning_rate": 6.041497628897994e-05, + "loss": 0.0, + "step": 45844 + }, + { + "epoch": 4.277782961649716, + "grad_norm": NaN, + "learning_rate": 6.040890973775768e-05, + "loss": 0.0, + "step": 45845 + }, + { + "epoch": 4.277876271344593, + "grad_norm": NaN, + "learning_rate": 6.04028434143389e-05, + "loss": 0.0, + "step": 45846 + }, + { + "epoch": 4.27796958103947, + "grad_norm": NaN, + "learning_rate": 6.039677731873922e-05, + "loss": 0.0, + "step": 45847 + }, + { + "epoch": 4.278062890734347, + "grad_norm": NaN, + "learning_rate": 6.039071145097404e-05, + "loss": 0.0, + "step": 45848 + }, + { + "epoch": 4.278156200429224, + "grad_norm": NaN, + "learning_rate": 6.038464581105863e-05, + "loss": 0.0, + "step": 45849 + }, + { + "epoch": 4.278249510124102, + "grad_norm": NaN, + "learning_rate": 6.037858039900856e-05, + "loss": 0.0, + "step": 45850 + }, + { + "epoch": 4.278342819818979, + "grad_norm": NaN, + "learning_rate": 6.037251521483928e-05, + "loss": 0.0, + "step": 45851 + }, + { + "epoch": 4.278436129513857, + "grad_norm": NaN, + "learning_rate": 6.036645025856603e-05, + "loss": 0.0, + "step": 45852 + }, + { + "epoch": 4.278529439208734, + "grad_norm": NaN, + "learning_rate": 6.0360385530204405e-05, + "loss": 0.0, + "step": 45853 + }, + { + "epoch": 4.2786227489036115, + "grad_norm": NaN, + "learning_rate": 6.0354321029769816e-05, + "loss": 0.0, + "step": 45854 + }, + { + "epoch": 4.278716058598488, + "grad_norm": NaN, + "learning_rate": 6.0348256757277534e-05, + "loss": 0.0, + "step": 45855 + }, + { + "epoch": 4.2788093682933654, + "grad_norm": NaN, + "learning_rate": 6.034219271274312e-05, + "loss": 0.0, + "step": 45856 + }, + { + "epoch": 4.278902677988243, + "grad_norm": NaN, + "learning_rate": 6.033612889618201e-05, + "loss": 0.0, + "step": 45857 + }, + { + "epoch": 4.27899598768312, + "grad_norm": NaN, + "learning_rate": 6.033006530760945e-05, + "loss": 0.0, + "step": 45858 + }, + { + "epoch": 4.279089297377998, + "grad_norm": NaN, + "learning_rate": 6.032400194704103e-05, + "loss": 0.0, + "step": 45859 + }, + { + "epoch": 4.279182607072875, + "grad_norm": NaN, + "learning_rate": 6.0317938814492175e-05, + "loss": 0.0, + "step": 45860 + }, + { + "epoch": 4.2792759167677525, + "grad_norm": NaN, + "learning_rate": 6.031187590997812e-05, + "loss": 0.0, + "step": 45861 + }, + { + "epoch": 4.279369226462629, + "grad_norm": NaN, + "learning_rate": 6.0305813233514444e-05, + "loss": 0.0, + "step": 45862 + }, + { + "epoch": 4.2794625361575065, + "grad_norm": NaN, + "learning_rate": 6.0299750785116585e-05, + "loss": 0.0, + "step": 45863 + }, + { + "epoch": 4.279555845852384, + "grad_norm": NaN, + "learning_rate": 6.029368856479976e-05, + "loss": 0.0, + "step": 45864 + }, + { + "epoch": 4.279649155547261, + "grad_norm": NaN, + "learning_rate": 6.028762657257958e-05, + "loss": 0.0, + "step": 45865 + }, + { + "epoch": 4.279742465242139, + "grad_norm": NaN, + "learning_rate": 6.028156480847138e-05, + "loss": 0.0, + "step": 45866 + }, + { + "epoch": 4.279835774937016, + "grad_norm": NaN, + "learning_rate": 6.02755032724906e-05, + "loss": 0.0, + "step": 45867 + }, + { + "epoch": 4.279929084631894, + "grad_norm": NaN, + "learning_rate": 6.026944196465262e-05, + "loss": 0.0, + "step": 45868 + }, + { + "epoch": 4.280022394326771, + "grad_norm": NaN, + "learning_rate": 6.026338088497288e-05, + "loss": 0.0, + "step": 45869 + }, + { + "epoch": 4.2801157040216475, + "grad_norm": NaN, + "learning_rate": 6.025732003346678e-05, + "loss": 0.0, + "step": 45870 + }, + { + "epoch": 4.280209013716525, + "grad_norm": NaN, + "learning_rate": 6.0251259410149736e-05, + "loss": 0.0, + "step": 45871 + }, + { + "epoch": 4.280302323411402, + "grad_norm": NaN, + "learning_rate": 6.024519901503714e-05, + "loss": 0.0, + "step": 45872 + }, + { + "epoch": 4.28039563310628, + "grad_norm": NaN, + "learning_rate": 6.023913884814443e-05, + "loss": 0.0, + "step": 45873 + }, + { + "epoch": 4.280488942801157, + "grad_norm": NaN, + "learning_rate": 6.023307890948699e-05, + "loss": 0.0, + "step": 45874 + }, + { + "epoch": 4.280582252496035, + "grad_norm": NaN, + "learning_rate": 6.0227019199080244e-05, + "loss": 0.0, + "step": 45875 + }, + { + "epoch": 4.280675562190911, + "grad_norm": NaN, + "learning_rate": 6.0220959716939596e-05, + "loss": 0.0, + "step": 45876 + }, + { + "epoch": 4.280768871885789, + "grad_norm": NaN, + "learning_rate": 6.02149004630805e-05, + "loss": 0.0, + "step": 45877 + }, + { + "epoch": 4.280862181580666, + "grad_norm": NaN, + "learning_rate": 6.020884143751821e-05, + "loss": 0.0, + "step": 45878 + }, + { + "epoch": 4.280955491275543, + "grad_norm": NaN, + "learning_rate": 6.02027826402683e-05, + "loss": 0.0, + "step": 45879 + }, + { + "epoch": 4.281048800970421, + "grad_norm": NaN, + "learning_rate": 6.0196724071346166e-05, + "loss": 0.0, + "step": 45880 + }, + { + "epoch": 4.281142110665298, + "grad_norm": NaN, + "learning_rate": 6.019066573076704e-05, + "loss": 0.0, + "step": 45881 + }, + { + "epoch": 4.281235420360176, + "grad_norm": NaN, + "learning_rate": 6.018460761854651e-05, + "loss": 0.0, + "step": 45882 + }, + { + "epoch": 4.281328730055053, + "grad_norm": NaN, + "learning_rate": 6.0178549734699974e-05, + "loss": 0.0, + "step": 45883 + }, + { + "epoch": 4.28142203974993, + "grad_norm": NaN, + "learning_rate": 6.017249207924268e-05, + "loss": 0.0, + "step": 45884 + }, + { + "epoch": 4.281515349444807, + "grad_norm": NaN, + "learning_rate": 6.016643465219016e-05, + "loss": 0.0, + "step": 45885 + }, + { + "epoch": 4.2816086591396845, + "grad_norm": NaN, + "learning_rate": 6.016037745355786e-05, + "loss": 0.0, + "step": 45886 + }, + { + "epoch": 4.281701968834562, + "grad_norm": NaN, + "learning_rate": 6.0154320483360994e-05, + "loss": 0.0, + "step": 45887 + }, + { + "epoch": 4.281795278529439, + "grad_norm": NaN, + "learning_rate": 6.014826374161513e-05, + "loss": 0.0, + "step": 45888 + }, + { + "epoch": 4.281888588224317, + "grad_norm": NaN, + "learning_rate": 6.014220722833567e-05, + "loss": 0.0, + "step": 45889 + }, + { + "epoch": 4.281981897919194, + "grad_norm": NaN, + "learning_rate": 6.013615094353784e-05, + "loss": 0.0, + "step": 45890 + }, + { + "epoch": 4.282075207614071, + "grad_norm": NaN, + "learning_rate": 6.0130094887237236e-05, + "loss": 0.0, + "step": 45891 + }, + { + "epoch": 4.282168517308948, + "grad_norm": NaN, + "learning_rate": 6.012403905944922e-05, + "loss": 0.0, + "step": 45892 + }, + { + "epoch": 4.2822618270038255, + "grad_norm": NaN, + "learning_rate": 6.011798346018903e-05, + "loss": 0.0, + "step": 45893 + }, + { + "epoch": 4.282355136698703, + "grad_norm": NaN, + "learning_rate": 6.011192808947225e-05, + "loss": 0.0, + "step": 45894 + }, + { + "epoch": 4.28244844639358, + "grad_norm": NaN, + "learning_rate": 6.010587294731427e-05, + "loss": 0.0, + "step": 45895 + }, + { + "epoch": 4.282541756088458, + "grad_norm": NaN, + "learning_rate": 6.00998180337303e-05, + "loss": 0.0, + "step": 45896 + }, + { + "epoch": 4.282635065783335, + "grad_norm": NaN, + "learning_rate": 6.009376334873593e-05, + "loss": 0.0, + "step": 45897 + }, + { + "epoch": 4.282728375478213, + "grad_norm": NaN, + "learning_rate": 6.008770889234654e-05, + "loss": 0.0, + "step": 45898 + }, + { + "epoch": 4.282821685173089, + "grad_norm": NaN, + "learning_rate": 6.008165466457738e-05, + "loss": 0.0, + "step": 45899 + }, + { + "epoch": 4.282914994867967, + "grad_norm": NaN, + "learning_rate": 6.0075600665443955e-05, + "loss": 0.0, + "step": 45900 + }, + { + "epoch": 4.283008304562844, + "grad_norm": NaN, + "learning_rate": 6.0069546894961714e-05, + "loss": 0.0, + "step": 45901 + }, + { + "epoch": 4.283101614257721, + "grad_norm": NaN, + "learning_rate": 6.0063493353145884e-05, + "loss": 0.0, + "step": 45902 + }, + { + "epoch": 4.283194923952599, + "grad_norm": NaN, + "learning_rate": 6.005744004001198e-05, + "loss": 0.0, + "step": 45903 + }, + { + "epoch": 4.283288233647476, + "grad_norm": NaN, + "learning_rate": 6.005138695557536e-05, + "loss": 0.0, + "step": 45904 + }, + { + "epoch": 4.283381543342354, + "grad_norm": NaN, + "learning_rate": 6.004533409985143e-05, + "loss": 0.0, + "step": 45905 + }, + { + "epoch": 4.28347485303723, + "grad_norm": NaN, + "learning_rate": 6.003928147285556e-05, + "loss": 0.0, + "step": 45906 + }, + { + "epoch": 4.283568162732108, + "grad_norm": NaN, + "learning_rate": 6.0033229074603165e-05, + "loss": 0.0, + "step": 45907 + }, + { + "epoch": 4.283661472426985, + "grad_norm": NaN, + "learning_rate": 6.00271769051096e-05, + "loss": 0.0, + "step": 45908 + }, + { + "epoch": 4.2837547821218624, + "grad_norm": NaN, + "learning_rate": 6.002112496439029e-05, + "loss": 0.0, + "step": 45909 + }, + { + "epoch": 4.28384809181674, + "grad_norm": NaN, + "learning_rate": 6.0015073252460596e-05, + "loss": 0.0, + "step": 45910 + }, + { + "epoch": 4.283941401511617, + "grad_norm": NaN, + "learning_rate": 6.000902176933591e-05, + "loss": 0.0, + "step": 45911 + }, + { + "epoch": 4.284034711206495, + "grad_norm": NaN, + "learning_rate": 6.0002970515031634e-05, + "loss": 0.0, + "step": 45912 + }, + { + "epoch": 4.284128020901371, + "grad_norm": NaN, + "learning_rate": 5.999691948956314e-05, + "loss": 0.0, + "step": 45913 + }, + { + "epoch": 4.284221330596249, + "grad_norm": NaN, + "learning_rate": 5.9990868692945824e-05, + "loss": 0.0, + "step": 45914 + }, + { + "epoch": 4.284314640291126, + "grad_norm": NaN, + "learning_rate": 5.998481812519505e-05, + "loss": 0.0, + "step": 45915 + }, + { + "epoch": 4.2844079499860035, + "grad_norm": NaN, + "learning_rate": 5.997876778632624e-05, + "loss": 0.0, + "step": 45916 + }, + { + "epoch": 4.284501259680881, + "grad_norm": NaN, + "learning_rate": 5.997271767635474e-05, + "loss": 0.0, + "step": 45917 + }, + { + "epoch": 4.284594569375758, + "grad_norm": NaN, + "learning_rate": 5.996666779529597e-05, + "loss": 0.0, + "step": 45918 + }, + { + "epoch": 4.284687879070636, + "grad_norm": NaN, + "learning_rate": 5.996061814316527e-05, + "loss": 0.0, + "step": 45919 + }, + { + "epoch": 4.284781188765512, + "grad_norm": NaN, + "learning_rate": 5.9954568719978055e-05, + "loss": 0.0, + "step": 45920 + }, + { + "epoch": 4.28487449846039, + "grad_norm": NaN, + "learning_rate": 5.9948519525749714e-05, + "loss": 0.0, + "step": 45921 + }, + { + "epoch": 4.284967808155267, + "grad_norm": NaN, + "learning_rate": 5.99424705604956e-05, + "loss": 0.0, + "step": 45922 + }, + { + "epoch": 4.2850611178501445, + "grad_norm": NaN, + "learning_rate": 5.993642182423111e-05, + "loss": 0.0, + "step": 45923 + }, + { + "epoch": 4.285154427545022, + "grad_norm": NaN, + "learning_rate": 5.993037331697167e-05, + "loss": 0.0, + "step": 45924 + }, + { + "epoch": 4.285247737239899, + "grad_norm": NaN, + "learning_rate": 5.99243250387325e-05, + "loss": 0.0, + "step": 45925 + }, + { + "epoch": 4.285341046934777, + "grad_norm": NaN, + "learning_rate": 5.991827698952916e-05, + "loss": 0.0, + "step": 45926 + }, + { + "epoch": 4.285434356629654, + "grad_norm": NaN, + "learning_rate": 5.991222916937698e-05, + "loss": 0.0, + "step": 45927 + }, + { + "epoch": 4.285527666324531, + "grad_norm": NaN, + "learning_rate": 5.9906181578291234e-05, + "loss": 0.0, + "step": 45928 + }, + { + "epoch": 4.285620976019408, + "grad_norm": NaN, + "learning_rate": 5.990013421628743e-05, + "loss": 0.0, + "step": 45929 + }, + { + "epoch": 4.285714285714286, + "grad_norm": NaN, + "learning_rate": 5.9894087083380945e-05, + "loss": 0.0, + "step": 45930 + }, + { + "epoch": 4.285807595409163, + "grad_norm": NaN, + "learning_rate": 5.9888040179586986e-05, + "loss": 0.0, + "step": 45931 + }, + { + "epoch": 4.28590090510404, + "grad_norm": NaN, + "learning_rate": 5.988199350492112e-05, + "loss": 0.0, + "step": 45932 + }, + { + "epoch": 4.285994214798918, + "grad_norm": NaN, + "learning_rate": 5.9875947059398696e-05, + "loss": 0.0, + "step": 45933 + }, + { + "epoch": 4.286087524493795, + "grad_norm": NaN, + "learning_rate": 5.986990084303494e-05, + "loss": 0.0, + "step": 45934 + }, + { + "epoch": 4.286180834188672, + "grad_norm": NaN, + "learning_rate": 5.986385485584538e-05, + "loss": 0.0, + "step": 45935 + }, + { + "epoch": 4.286274143883549, + "grad_norm": NaN, + "learning_rate": 5.985780909784538e-05, + "loss": 0.0, + "step": 45936 + }, + { + "epoch": 4.286367453578427, + "grad_norm": NaN, + "learning_rate": 5.985176356905018e-05, + "loss": 0.0, + "step": 45937 + }, + { + "epoch": 4.286460763273304, + "grad_norm": NaN, + "learning_rate": 5.9845718269475277e-05, + "loss": 0.0, + "step": 45938 + }, + { + "epoch": 4.2865540729681815, + "grad_norm": NaN, + "learning_rate": 5.9839673199136065e-05, + "loss": 0.0, + "step": 45939 + }, + { + "epoch": 4.286647382663059, + "grad_norm": NaN, + "learning_rate": 5.9833628358047744e-05, + "loss": 0.0, + "step": 45940 + }, + { + "epoch": 4.286740692357936, + "grad_norm": NaN, + "learning_rate": 5.982758374622587e-05, + "loss": 0.0, + "step": 45941 + }, + { + "epoch": 4.286834002052814, + "grad_norm": NaN, + "learning_rate": 5.982153936368572e-05, + "loss": 0.0, + "step": 45942 + }, + { + "epoch": 4.28692731174769, + "grad_norm": NaN, + "learning_rate": 5.981549521044269e-05, + "loss": 0.0, + "step": 45943 + }, + { + "epoch": 4.287020621442568, + "grad_norm": NaN, + "learning_rate": 5.980945128651214e-05, + "loss": 0.0, + "step": 45944 + }, + { + "epoch": 4.287113931137445, + "grad_norm": NaN, + "learning_rate": 5.9803407591909446e-05, + "loss": 0.0, + "step": 45945 + }, + { + "epoch": 4.2872072408323225, + "grad_norm": NaN, + "learning_rate": 5.9797364126649965e-05, + "loss": 0.0, + "step": 45946 + }, + { + "epoch": 4.2873005505272, + "grad_norm": NaN, + "learning_rate": 5.979132089074907e-05, + "loss": 0.0, + "step": 45947 + }, + { + "epoch": 4.287393860222077, + "grad_norm": NaN, + "learning_rate": 5.978527788422211e-05, + "loss": 0.0, + "step": 45948 + }, + { + "epoch": 4.287487169916955, + "grad_norm": NaN, + "learning_rate": 5.977923510708447e-05, + "loss": 0.0, + "step": 45949 + }, + { + "epoch": 4.287580479611831, + "grad_norm": NaN, + "learning_rate": 5.9773192559351526e-05, + "loss": 0.0, + "step": 45950 + }, + { + "epoch": 4.287673789306709, + "grad_norm": NaN, + "learning_rate": 5.9767150241038596e-05, + "loss": 0.0, + "step": 45951 + }, + { + "epoch": 4.287767099001586, + "grad_norm": NaN, + "learning_rate": 5.976110815216109e-05, + "loss": 0.0, + "step": 45952 + }, + { + "epoch": 4.287860408696464, + "grad_norm": NaN, + "learning_rate": 5.975506629273436e-05, + "loss": 0.0, + "step": 45953 + }, + { + "epoch": 4.287953718391341, + "grad_norm": NaN, + "learning_rate": 5.9749024662773756e-05, + "loss": 0.0, + "step": 45954 + }, + { + "epoch": 4.288047028086218, + "grad_norm": NaN, + "learning_rate": 5.9742983262294656e-05, + "loss": 0.0, + "step": 45955 + }, + { + "epoch": 4.288140337781096, + "grad_norm": NaN, + "learning_rate": 5.973694209131241e-05, + "loss": 0.0, + "step": 45956 + }, + { + "epoch": 4.288233647475972, + "grad_norm": NaN, + "learning_rate": 5.973090114984239e-05, + "loss": 0.0, + "step": 45957 + }, + { + "epoch": 4.28832695717085, + "grad_norm": NaN, + "learning_rate": 5.972486043789993e-05, + "loss": 0.0, + "step": 45958 + }, + { + "epoch": 4.288420266865727, + "grad_norm": NaN, + "learning_rate": 5.971881995550043e-05, + "loss": 0.0, + "step": 45959 + }, + { + "epoch": 4.288513576560605, + "grad_norm": NaN, + "learning_rate": 5.9712779702659215e-05, + "loss": 0.0, + "step": 45960 + }, + { + "epoch": 4.288606886255482, + "grad_norm": NaN, + "learning_rate": 5.970673967939166e-05, + "loss": 0.0, + "step": 45961 + }, + { + "epoch": 4.2887001959503595, + "grad_norm": NaN, + "learning_rate": 5.970069988571311e-05, + "loss": 0.0, + "step": 45962 + }, + { + "epoch": 4.288793505645237, + "grad_norm": NaN, + "learning_rate": 5.969466032163894e-05, + "loss": 0.0, + "step": 45963 + }, + { + "epoch": 4.288886815340113, + "grad_norm": NaN, + "learning_rate": 5.968862098718451e-05, + "loss": 0.0, + "step": 45964 + }, + { + "epoch": 4.288980125034991, + "grad_norm": NaN, + "learning_rate": 5.9682581882365136e-05, + "loss": 0.0, + "step": 45965 + }, + { + "epoch": 4.289073434729868, + "grad_norm": NaN, + "learning_rate": 5.967654300719623e-05, + "loss": 0.0, + "step": 45966 + }, + { + "epoch": 4.289166744424746, + "grad_norm": NaN, + "learning_rate": 5.967050436169311e-05, + "loss": 0.0, + "step": 45967 + }, + { + "epoch": 4.289260054119623, + "grad_norm": NaN, + "learning_rate": 5.9664465945871184e-05, + "loss": 0.0, + "step": 45968 + }, + { + "epoch": 4.2893533638145005, + "grad_norm": NaN, + "learning_rate": 5.9658427759745665e-05, + "loss": 0.0, + "step": 45969 + }, + { + "epoch": 4.289446673509378, + "grad_norm": NaN, + "learning_rate": 5.9652389803332046e-05, + "loss": 0.0, + "step": 45970 + }, + { + "epoch": 4.289539983204255, + "grad_norm": NaN, + "learning_rate": 5.96463520766457e-05, + "loss": 0.0, + "step": 45971 + }, + { + "epoch": 4.289633292899132, + "grad_norm": NaN, + "learning_rate": 5.964031457970181e-05, + "loss": 0.0, + "step": 45972 + }, + { + "epoch": 4.289726602594009, + "grad_norm": NaN, + "learning_rate": 5.9634277312515885e-05, + "loss": 0.0, + "step": 45973 + }, + { + "epoch": 4.289819912288887, + "grad_norm": NaN, + "learning_rate": 5.962824027510327e-05, + "loss": 0.0, + "step": 45974 + }, + { + "epoch": 4.289913221983764, + "grad_norm": NaN, + "learning_rate": 5.9622203467479165e-05, + "loss": 0.0, + "step": 45975 + }, + { + "epoch": 4.2900065316786415, + "grad_norm": NaN, + "learning_rate": 5.961616688965909e-05, + "loss": 0.0, + "step": 45976 + }, + { + "epoch": 4.290099841373519, + "grad_norm": NaN, + "learning_rate": 5.9610130541658375e-05, + "loss": 0.0, + "step": 45977 + }, + { + "epoch": 4.290193151068396, + "grad_norm": NaN, + "learning_rate": 5.960409442349221e-05, + "loss": 0.0, + "step": 45978 + }, + { + "epoch": 4.290286460763273, + "grad_norm": NaN, + "learning_rate": 5.959805853517612e-05, + "loss": 0.0, + "step": 45979 + }, + { + "epoch": 4.29037977045815, + "grad_norm": NaN, + "learning_rate": 5.9592022876725384e-05, + "loss": 0.0, + "step": 45980 + }, + { + "epoch": 4.290473080153028, + "grad_norm": NaN, + "learning_rate": 5.9585987448155354e-05, + "loss": 0.0, + "step": 45981 + }, + { + "epoch": 4.290566389847905, + "grad_norm": NaN, + "learning_rate": 5.9579952249481375e-05, + "loss": 0.0, + "step": 45982 + }, + { + "epoch": 4.290659699542783, + "grad_norm": NaN, + "learning_rate": 5.9573917280718796e-05, + "loss": 0.0, + "step": 45983 + }, + { + "epoch": 4.29075300923766, + "grad_norm": NaN, + "learning_rate": 5.956788254188295e-05, + "loss": 0.0, + "step": 45984 + }, + { + "epoch": 4.290846318932537, + "grad_norm": NaN, + "learning_rate": 5.956184803298918e-05, + "loss": 0.0, + "step": 45985 + }, + { + "epoch": 4.290939628627415, + "grad_norm": NaN, + "learning_rate": 5.955581375405287e-05, + "loss": 0.0, + "step": 45986 + }, + { + "epoch": 4.291032938322291, + "grad_norm": NaN, + "learning_rate": 5.954977970508932e-05, + "loss": 0.0, + "step": 45987 + }, + { + "epoch": 4.291126248017169, + "grad_norm": NaN, + "learning_rate": 5.954374588611389e-05, + "loss": 0.0, + "step": 45988 + }, + { + "epoch": 4.291219557712046, + "grad_norm": NaN, + "learning_rate": 5.953771229714191e-05, + "loss": 0.0, + "step": 45989 + }, + { + "epoch": 4.291312867406924, + "grad_norm": NaN, + "learning_rate": 5.953167893818874e-05, + "loss": 0.0, + "step": 45990 + }, + { + "epoch": 4.291406177101801, + "grad_norm": NaN, + "learning_rate": 5.952564580926971e-05, + "loss": 0.0, + "step": 45991 + }, + { + "epoch": 4.2914994867966785, + "grad_norm": NaN, + "learning_rate": 5.951961291040016e-05, + "loss": 0.0, + "step": 45992 + }, + { + "epoch": 4.291592796491555, + "grad_norm": NaN, + "learning_rate": 5.9513580241595424e-05, + "loss": 0.0, + "step": 45993 + }, + { + "epoch": 4.291686106186432, + "grad_norm": NaN, + "learning_rate": 5.9507547802870866e-05, + "loss": 0.0, + "step": 45994 + }, + { + "epoch": 4.29177941588131, + "grad_norm": NaN, + "learning_rate": 5.950151559424179e-05, + "loss": 0.0, + "step": 45995 + }, + { + "epoch": 4.291872725576187, + "grad_norm": NaN, + "learning_rate": 5.949548361572357e-05, + "loss": 0.0, + "step": 45996 + }, + { + "epoch": 4.291966035271065, + "grad_norm": NaN, + "learning_rate": 5.9489451867331515e-05, + "loss": 0.0, + "step": 45997 + }, + { + "epoch": 4.292059344965942, + "grad_norm": NaN, + "learning_rate": 5.948342034908098e-05, + "loss": 0.0, + "step": 45998 + }, + { + "epoch": 4.2921526546608195, + "grad_norm": NaN, + "learning_rate": 5.94773890609873e-05, + "loss": 0.0, + "step": 45999 + }, + { + "epoch": 4.292245964355697, + "grad_norm": NaN, + "learning_rate": 5.947135800306579e-05, + "loss": 0.0, + "step": 46000 + }, + { + "epoch": 4.2923392740505735, + "grad_norm": NaN, + "learning_rate": 5.9465327175331804e-05, + "loss": 0.0, + "step": 46001 + }, + { + "epoch": 4.292432583745451, + "grad_norm": NaN, + "learning_rate": 5.945929657780067e-05, + "loss": 0.0, + "step": 46002 + }, + { + "epoch": 4.292525893440328, + "grad_norm": NaN, + "learning_rate": 5.945326621048773e-05, + "loss": 0.0, + "step": 46003 + }, + { + "epoch": 4.292619203135206, + "grad_norm": NaN, + "learning_rate": 5.944723607340831e-05, + "loss": 0.0, + "step": 46004 + }, + { + "epoch": 4.292712512830083, + "grad_norm": NaN, + "learning_rate": 5.944120616657774e-05, + "loss": 0.0, + "step": 46005 + }, + { + "epoch": 4.292805822524961, + "grad_norm": NaN, + "learning_rate": 5.943517649001136e-05, + "loss": 0.0, + "step": 46006 + }, + { + "epoch": 4.292899132219838, + "grad_norm": NaN, + "learning_rate": 5.94291470437245e-05, + "loss": 0.0, + "step": 46007 + }, + { + "epoch": 4.2929924419147145, + "grad_norm": NaN, + "learning_rate": 5.9423117827732505e-05, + "loss": 0.0, + "step": 46008 + }, + { + "epoch": 4.293085751609592, + "grad_norm": NaN, + "learning_rate": 5.941708884205068e-05, + "loss": 0.0, + "step": 46009 + }, + { + "epoch": 4.293179061304469, + "grad_norm": NaN, + "learning_rate": 5.941106008669436e-05, + "loss": 0.0, + "step": 46010 + }, + { + "epoch": 4.293272370999347, + "grad_norm": NaN, + "learning_rate": 5.9405031561678866e-05, + "loss": 0.0, + "step": 46011 + }, + { + "epoch": 4.293365680694224, + "grad_norm": NaN, + "learning_rate": 5.9399003267019615e-05, + "loss": 0.0, + "step": 46012 + }, + { + "epoch": 4.293458990389102, + "grad_norm": NaN, + "learning_rate": 5.939297520273175e-05, + "loss": 0.0, + "step": 46013 + }, + { + "epoch": 4.293552300083979, + "grad_norm": NaN, + "learning_rate": 5.9386947368830753e-05, + "loss": 0.0, + "step": 46014 + }, + { + "epoch": 4.2936456097788565, + "grad_norm": NaN, + "learning_rate": 5.938091976533198e-05, + "loss": 0.0, + "step": 46015 + }, + { + "epoch": 4.293738919473733, + "grad_norm": NaN, + "learning_rate": 5.9374892392250554e-05, + "loss": 0.0, + "step": 46016 + }, + { + "epoch": 4.29383222916861, + "grad_norm": NaN, + "learning_rate": 5.9368865249602006e-05, + "loss": 0.0, + "step": 46017 + }, + { + "epoch": 4.293925538863488, + "grad_norm": NaN, + "learning_rate": 5.936283833740158e-05, + "loss": 0.0, + "step": 46018 + }, + { + "epoch": 4.294018848558365, + "grad_norm": NaN, + "learning_rate": 5.935681165566461e-05, + "loss": 0.0, + "step": 46019 + }, + { + "epoch": 4.294112158253243, + "grad_norm": NaN, + "learning_rate": 5.935078520440642e-05, + "loss": 0.0, + "step": 46020 + }, + { + "epoch": 4.29420546794812, + "grad_norm": NaN, + "learning_rate": 5.934475898364231e-05, + "loss": 0.0, + "step": 46021 + }, + { + "epoch": 4.2942987776429975, + "grad_norm": NaN, + "learning_rate": 5.9338732993387646e-05, + "loss": 0.0, + "step": 46022 + }, + { + "epoch": 4.294392087337874, + "grad_norm": NaN, + "learning_rate": 5.933270723365772e-05, + "loss": 0.0, + "step": 46023 + }, + { + "epoch": 4.2944853970327515, + "grad_norm": NaN, + "learning_rate": 5.932668170446785e-05, + "loss": 0.0, + "step": 46024 + }, + { + "epoch": 4.294578706727629, + "grad_norm": NaN, + "learning_rate": 5.9320656405833396e-05, + "loss": 0.0, + "step": 46025 + }, + { + "epoch": 4.294672016422506, + "grad_norm": NaN, + "learning_rate": 5.931463133776963e-05, + "loss": 0.0, + "step": 46026 + }, + { + "epoch": 4.294765326117384, + "grad_norm": NaN, + "learning_rate": 5.93086065002919e-05, + "loss": 0.0, + "step": 46027 + }, + { + "epoch": 4.294858635812261, + "grad_norm": NaN, + "learning_rate": 5.930258189341552e-05, + "loss": 0.0, + "step": 46028 + }, + { + "epoch": 4.2949519455071385, + "grad_norm": NaN, + "learning_rate": 5.929655751715581e-05, + "loss": 0.0, + "step": 46029 + }, + { + "epoch": 4.295045255202015, + "grad_norm": NaN, + "learning_rate": 5.9290533371528086e-05, + "loss": 0.0, + "step": 46030 + }, + { + "epoch": 4.2951385648968925, + "grad_norm": NaN, + "learning_rate": 5.928450945654765e-05, + "loss": 0.0, + "step": 46031 + }, + { + "epoch": 4.29523187459177, + "grad_norm": NaN, + "learning_rate": 5.927848577222986e-05, + "loss": 0.0, + "step": 46032 + }, + { + "epoch": 4.295325184286647, + "grad_norm": NaN, + "learning_rate": 5.9272462318589984e-05, + "loss": 0.0, + "step": 46033 + }, + { + "epoch": 4.295418493981525, + "grad_norm": NaN, + "learning_rate": 5.926643909564338e-05, + "loss": 0.0, + "step": 46034 + }, + { + "epoch": 4.295511803676402, + "grad_norm": NaN, + "learning_rate": 5.926041610340533e-05, + "loss": 0.0, + "step": 46035 + }, + { + "epoch": 4.29560511337128, + "grad_norm": NaN, + "learning_rate": 5.925439334189117e-05, + "loss": 0.0, + "step": 46036 + }, + { + "epoch": 4.295698423066156, + "grad_norm": NaN, + "learning_rate": 5.9248370811116194e-05, + "loss": 0.0, + "step": 46037 + }, + { + "epoch": 4.2957917327610335, + "grad_norm": NaN, + "learning_rate": 5.924234851109574e-05, + "loss": 0.0, + "step": 46038 + }, + { + "epoch": 4.295885042455911, + "grad_norm": NaN, + "learning_rate": 5.923632644184511e-05, + "loss": 0.0, + "step": 46039 + }, + { + "epoch": 4.295978352150788, + "grad_norm": NaN, + "learning_rate": 5.923030460337962e-05, + "loss": 0.0, + "step": 46040 + }, + { + "epoch": 4.296071661845666, + "grad_norm": NaN, + "learning_rate": 5.922428299571457e-05, + "loss": 0.0, + "step": 46041 + }, + { + "epoch": 4.296164971540543, + "grad_norm": NaN, + "learning_rate": 5.921826161886528e-05, + "loss": 0.0, + "step": 46042 + }, + { + "epoch": 4.296258281235421, + "grad_norm": NaN, + "learning_rate": 5.921224047284707e-05, + "loss": 0.0, + "step": 46043 + }, + { + "epoch": 4.296351590930298, + "grad_norm": NaN, + "learning_rate": 5.920621955767522e-05, + "loss": 0.0, + "step": 46044 + }, + { + "epoch": 4.296444900625175, + "grad_norm": NaN, + "learning_rate": 5.9200198873365064e-05, + "loss": 0.0, + "step": 46045 + }, + { + "epoch": 4.296538210320052, + "grad_norm": NaN, + "learning_rate": 5.919417841993191e-05, + "loss": 0.0, + "step": 46046 + }, + { + "epoch": 4.296631520014929, + "grad_norm": NaN, + "learning_rate": 5.918815819739106e-05, + "loss": 0.0, + "step": 46047 + }, + { + "epoch": 4.296724829709807, + "grad_norm": NaN, + "learning_rate": 5.918213820575782e-05, + "loss": 0.0, + "step": 46048 + }, + { + "epoch": 4.296818139404684, + "grad_norm": NaN, + "learning_rate": 5.9176118445047516e-05, + "loss": 0.0, + "step": 46049 + }, + { + "epoch": 4.296911449099562, + "grad_norm": NaN, + "learning_rate": 5.917009891527541e-05, + "loss": 0.0, + "step": 46050 + }, + { + "epoch": 4.297004758794439, + "grad_norm": NaN, + "learning_rate": 5.916407961645686e-05, + "loss": 0.0, + "step": 46051 + }, + { + "epoch": 4.297098068489316, + "grad_norm": NaN, + "learning_rate": 5.915806054860714e-05, + "loss": 0.0, + "step": 46052 + }, + { + "epoch": 4.297191378184193, + "grad_norm": NaN, + "learning_rate": 5.915204171174156e-05, + "loss": 0.0, + "step": 46053 + }, + { + "epoch": 4.2972846878790705, + "grad_norm": NaN, + "learning_rate": 5.914602310587543e-05, + "loss": 0.0, + "step": 46054 + }, + { + "epoch": 4.297377997573948, + "grad_norm": NaN, + "learning_rate": 5.914000473102405e-05, + "loss": 0.0, + "step": 46055 + }, + { + "epoch": 4.297471307268825, + "grad_norm": NaN, + "learning_rate": 5.913398658720273e-05, + "loss": 0.0, + "step": 46056 + }, + { + "epoch": 4.297564616963703, + "grad_norm": NaN, + "learning_rate": 5.912796867442676e-05, + "loss": 0.0, + "step": 46057 + }, + { + "epoch": 4.29765792665858, + "grad_norm": NaN, + "learning_rate": 5.912195099271145e-05, + "loss": 0.0, + "step": 46058 + }, + { + "epoch": 4.297751236353458, + "grad_norm": NaN, + "learning_rate": 5.911593354207209e-05, + "loss": 0.0, + "step": 46059 + }, + { + "epoch": 4.297844546048334, + "grad_norm": NaN, + "learning_rate": 5.9109916322523995e-05, + "loss": 0.0, + "step": 46060 + }, + { + "epoch": 4.2979378557432115, + "grad_norm": NaN, + "learning_rate": 5.910389933408246e-05, + "loss": 0.0, + "step": 46061 + }, + { + "epoch": 4.298031165438089, + "grad_norm": NaN, + "learning_rate": 5.9097882576762785e-05, + "loss": 0.0, + "step": 46062 + }, + { + "epoch": 4.298124475132966, + "grad_norm": NaN, + "learning_rate": 5.909186605058026e-05, + "loss": 0.0, + "step": 46063 + }, + { + "epoch": 4.298217784827844, + "grad_norm": NaN, + "learning_rate": 5.9085849755550194e-05, + "loss": 0.0, + "step": 46064 + }, + { + "epoch": 4.298311094522721, + "grad_norm": NaN, + "learning_rate": 5.90798336916879e-05, + "loss": 0.0, + "step": 46065 + }, + { + "epoch": 4.298404404217599, + "grad_norm": NaN, + "learning_rate": 5.907381785900862e-05, + "loss": 0.0, + "step": 46066 + }, + { + "epoch": 4.298497713912475, + "grad_norm": NaN, + "learning_rate": 5.9067802257527705e-05, + "loss": 0.0, + "step": 46067 + }, + { + "epoch": 4.298591023607353, + "grad_norm": NaN, + "learning_rate": 5.9061786887260445e-05, + "loss": 0.0, + "step": 46068 + }, + { + "epoch": 4.29868433330223, + "grad_norm": NaN, + "learning_rate": 5.9055771748222114e-05, + "loss": 0.0, + "step": 46069 + }, + { + "epoch": 4.298777642997107, + "grad_norm": NaN, + "learning_rate": 5.9049756840428006e-05, + "loss": 0.0, + "step": 46070 + }, + { + "epoch": 4.298870952691985, + "grad_norm": NaN, + "learning_rate": 5.904374216389343e-05, + "loss": 0.0, + "step": 46071 + }, + { + "epoch": 4.298964262386862, + "grad_norm": NaN, + "learning_rate": 5.903772771863369e-05, + "loss": 0.0, + "step": 46072 + }, + { + "epoch": 4.29905757208174, + "grad_norm": NaN, + "learning_rate": 5.903171350466405e-05, + "loss": 0.0, + "step": 46073 + }, + { + "epoch": 4.299150881776616, + "grad_norm": NaN, + "learning_rate": 5.902569952199981e-05, + "loss": 0.0, + "step": 46074 + }, + { + "epoch": 4.299244191471494, + "grad_norm": NaN, + "learning_rate": 5.901968577065628e-05, + "loss": 0.0, + "step": 46075 + }, + { + "epoch": 4.299337501166371, + "grad_norm": NaN, + "learning_rate": 5.901367225064872e-05, + "loss": 0.0, + "step": 46076 + }, + { + "epoch": 4.2994308108612485, + "grad_norm": NaN, + "learning_rate": 5.900765896199246e-05, + "loss": 0.0, + "step": 46077 + }, + { + "epoch": 4.299524120556126, + "grad_norm": NaN, + "learning_rate": 5.9001645904702756e-05, + "loss": 0.0, + "step": 46078 + }, + { + "epoch": 4.299617430251003, + "grad_norm": NaN, + "learning_rate": 5.899563307879491e-05, + "loss": 0.0, + "step": 46079 + }, + { + "epoch": 4.299710739945881, + "grad_norm": NaN, + "learning_rate": 5.898962048428422e-05, + "loss": 0.0, + "step": 46080 + }, + { + "epoch": 4.299804049640757, + "grad_norm": NaN, + "learning_rate": 5.898360812118597e-05, + "loss": 0.0, + "step": 46081 + }, + { + "epoch": 4.299897359335635, + "grad_norm": NaN, + "learning_rate": 5.897759598951543e-05, + "loss": 0.0, + "step": 46082 + }, + { + "epoch": 4.299990669030512, + "grad_norm": NaN, + "learning_rate": 5.8971584089287904e-05, + "loss": 0.0, + "step": 46083 + }, + { + "epoch": 4.3000839787253895, + "grad_norm": NaN, + "learning_rate": 5.896557242051868e-05, + "loss": 0.0, + "step": 46084 + }, + { + "epoch": 4.300177288420267, + "grad_norm": NaN, + "learning_rate": 5.8959560983223046e-05, + "loss": 0.0, + "step": 46085 + }, + { + "epoch": 4.300270598115144, + "grad_norm": NaN, + "learning_rate": 5.8953549777416256e-05, + "loss": 0.0, + "step": 46086 + }, + { + "epoch": 4.300363907810022, + "grad_norm": NaN, + "learning_rate": 5.894753880311363e-05, + "loss": 0.0, + "step": 46087 + }, + { + "epoch": 4.300457217504899, + "grad_norm": NaN, + "learning_rate": 5.8941528060330465e-05, + "loss": 0.0, + "step": 46088 + }, + { + "epoch": 4.300550527199776, + "grad_norm": NaN, + "learning_rate": 5.893551754908199e-05, + "loss": 0.0, + "step": 46089 + }, + { + "epoch": 4.300643836894653, + "grad_norm": NaN, + "learning_rate": 5.892950726938354e-05, + "loss": 0.0, + "step": 46090 + }, + { + "epoch": 4.3007371465895305, + "grad_norm": NaN, + "learning_rate": 5.892349722125036e-05, + "loss": 0.0, + "step": 46091 + }, + { + "epoch": 4.300830456284408, + "grad_norm": NaN, + "learning_rate": 5.891748740469775e-05, + "loss": 0.0, + "step": 46092 + }, + { + "epoch": 4.300923765979285, + "grad_norm": NaN, + "learning_rate": 5.8911477819741e-05, + "loss": 0.0, + "step": 46093 + }, + { + "epoch": 4.301017075674163, + "grad_norm": NaN, + "learning_rate": 5.890546846639536e-05, + "loss": 0.0, + "step": 46094 + }, + { + "epoch": 4.30111038536904, + "grad_norm": NaN, + "learning_rate": 5.889945934467614e-05, + "loss": 0.0, + "step": 46095 + }, + { + "epoch": 4.301203695063917, + "grad_norm": NaN, + "learning_rate": 5.889345045459863e-05, + "loss": 0.0, + "step": 46096 + }, + { + "epoch": 4.301297004758794, + "grad_norm": NaN, + "learning_rate": 5.888744179617806e-05, + "loss": 0.0, + "step": 46097 + }, + { + "epoch": 4.301390314453672, + "grad_norm": NaN, + "learning_rate": 5.888143336942974e-05, + "loss": 0.0, + "step": 46098 + }, + { + "epoch": 4.301483624148549, + "grad_norm": NaN, + "learning_rate": 5.887542517436895e-05, + "loss": 0.0, + "step": 46099 + }, + { + "epoch": 4.301576933843426, + "grad_norm": NaN, + "learning_rate": 5.886941721101095e-05, + "loss": 0.0, + "step": 46100 + }, + { + "epoch": 4.301670243538304, + "grad_norm": NaN, + "learning_rate": 5.886340947937104e-05, + "loss": 0.0, + "step": 46101 + }, + { + "epoch": 4.301763553233181, + "grad_norm": NaN, + "learning_rate": 5.885740197946449e-05, + "loss": 0.0, + "step": 46102 + }, + { + "epoch": 4.301856862928059, + "grad_norm": NaN, + "learning_rate": 5.885139471130656e-05, + "loss": 0.0, + "step": 46103 + }, + { + "epoch": 4.301950172622935, + "grad_norm": NaN, + "learning_rate": 5.8845387674912514e-05, + "loss": 0.0, + "step": 46104 + }, + { + "epoch": 4.302043482317813, + "grad_norm": NaN, + "learning_rate": 5.883938087029767e-05, + "loss": 0.0, + "step": 46105 + }, + { + "epoch": 4.30213679201269, + "grad_norm": NaN, + "learning_rate": 5.883337429747726e-05, + "loss": 0.0, + "step": 46106 + }, + { + "epoch": 4.3022301017075675, + "grad_norm": NaN, + "learning_rate": 5.8827367956466583e-05, + "loss": 0.0, + "step": 46107 + }, + { + "epoch": 4.302323411402445, + "grad_norm": NaN, + "learning_rate": 5.882136184728091e-05, + "loss": 0.0, + "step": 46108 + }, + { + "epoch": 4.302416721097322, + "grad_norm": NaN, + "learning_rate": 5.881535596993549e-05, + "loss": 0.0, + "step": 46109 + }, + { + "epoch": 4.302510030792199, + "grad_norm": NaN, + "learning_rate": 5.8809350324445606e-05, + "loss": 0.0, + "step": 46110 + }, + { + "epoch": 4.302603340487076, + "grad_norm": NaN, + "learning_rate": 5.880334491082655e-05, + "loss": 0.0, + "step": 46111 + }, + { + "epoch": 4.302696650181954, + "grad_norm": NaN, + "learning_rate": 5.879733972909355e-05, + "loss": 0.0, + "step": 46112 + }, + { + "epoch": 4.302789959876831, + "grad_norm": NaN, + "learning_rate": 5.8791334779261916e-05, + "loss": 0.0, + "step": 46113 + }, + { + "epoch": 4.3028832695717085, + "grad_norm": NaN, + "learning_rate": 5.87853300613469e-05, + "loss": 0.0, + "step": 46114 + }, + { + "epoch": 4.302976579266586, + "grad_norm": NaN, + "learning_rate": 5.877932557536376e-05, + "loss": 0.0, + "step": 46115 + }, + { + "epoch": 4.303069888961463, + "grad_norm": NaN, + "learning_rate": 5.877332132132778e-05, + "loss": 0.0, + "step": 46116 + }, + { + "epoch": 4.303163198656341, + "grad_norm": NaN, + "learning_rate": 5.876731729925422e-05, + "loss": 0.0, + "step": 46117 + }, + { + "epoch": 4.303256508351217, + "grad_norm": NaN, + "learning_rate": 5.876131350915834e-05, + "loss": 0.0, + "step": 46118 + }, + { + "epoch": 4.303349818046095, + "grad_norm": NaN, + "learning_rate": 5.8755309951055425e-05, + "loss": 0.0, + "step": 46119 + }, + { + "epoch": 4.303443127740972, + "grad_norm": NaN, + "learning_rate": 5.8749306624960715e-05, + "loss": 0.0, + "step": 46120 + }, + { + "epoch": 4.30353643743585, + "grad_norm": NaN, + "learning_rate": 5.87433035308895e-05, + "loss": 0.0, + "step": 46121 + }, + { + "epoch": 4.303629747130727, + "grad_norm": NaN, + "learning_rate": 5.873730066885703e-05, + "loss": 0.0, + "step": 46122 + }, + { + "epoch": 4.303723056825604, + "grad_norm": NaN, + "learning_rate": 5.8731298038878574e-05, + "loss": 0.0, + "step": 46123 + }, + { + "epoch": 4.303816366520482, + "grad_norm": NaN, + "learning_rate": 5.872529564096938e-05, + "loss": 0.0, + "step": 46124 + }, + { + "epoch": 4.303909676215358, + "grad_norm": NaN, + "learning_rate": 5.871929347514472e-05, + "loss": 0.0, + "step": 46125 + }, + { + "epoch": 4.304002985910236, + "grad_norm": NaN, + "learning_rate": 5.871329154141986e-05, + "loss": 0.0, + "step": 46126 + }, + { + "epoch": 4.304096295605113, + "grad_norm": NaN, + "learning_rate": 5.8707289839810076e-05, + "loss": 0.0, + "step": 46127 + }, + { + "epoch": 4.304189605299991, + "grad_norm": NaN, + "learning_rate": 5.87012883703306e-05, + "loss": 0.0, + "step": 46128 + }, + { + "epoch": 4.304282914994868, + "grad_norm": NaN, + "learning_rate": 5.8695287132996686e-05, + "loss": 0.0, + "step": 46129 + }, + { + "epoch": 4.3043762246897455, + "grad_norm": NaN, + "learning_rate": 5.868928612782364e-05, + "loss": 0.0, + "step": 46130 + }, + { + "epoch": 4.304469534384623, + "grad_norm": NaN, + "learning_rate": 5.868328535482666e-05, + "loss": 0.0, + "step": 46131 + }, + { + "epoch": 4.3045628440795, + "grad_norm": NaN, + "learning_rate": 5.8677284814021055e-05, + "loss": 0.0, + "step": 46132 + }, + { + "epoch": 4.304656153774377, + "grad_norm": NaN, + "learning_rate": 5.867128450542206e-05, + "loss": 0.0, + "step": 46133 + }, + { + "epoch": 4.304749463469254, + "grad_norm": NaN, + "learning_rate": 5.866528442904493e-05, + "loss": 0.0, + "step": 46134 + }, + { + "epoch": 4.304842773164132, + "grad_norm": NaN, + "learning_rate": 5.865928458490493e-05, + "loss": 0.0, + "step": 46135 + }, + { + "epoch": 4.304936082859009, + "grad_norm": NaN, + "learning_rate": 5.865328497301732e-05, + "loss": 0.0, + "step": 46136 + }, + { + "epoch": 4.3050293925538865, + "grad_norm": NaN, + "learning_rate": 5.864728559339734e-05, + "loss": 0.0, + "step": 46137 + }, + { + "epoch": 4.305122702248764, + "grad_norm": NaN, + "learning_rate": 5.864128644606024e-05, + "loss": 0.0, + "step": 46138 + }, + { + "epoch": 4.305216011943641, + "grad_norm": NaN, + "learning_rate": 5.863528753102131e-05, + "loss": 0.0, + "step": 46139 + }, + { + "epoch": 4.305309321638518, + "grad_norm": NaN, + "learning_rate": 5.8629288848295757e-05, + "loss": 0.0, + "step": 46140 + }, + { + "epoch": 4.305402631333395, + "grad_norm": NaN, + "learning_rate": 5.862329039789887e-05, + "loss": 0.0, + "step": 46141 + }, + { + "epoch": 4.305495941028273, + "grad_norm": NaN, + "learning_rate": 5.861729217984588e-05, + "loss": 0.0, + "step": 46142 + }, + { + "epoch": 4.30558925072315, + "grad_norm": NaN, + "learning_rate": 5.861129419415206e-05, + "loss": 0.0, + "step": 46143 + }, + { + "epoch": 4.3056825604180275, + "grad_norm": NaN, + "learning_rate": 5.8605296440832634e-05, + "loss": 0.0, + "step": 46144 + }, + { + "epoch": 4.305775870112905, + "grad_norm": NaN, + "learning_rate": 5.8599298919902874e-05, + "loss": 0.0, + "step": 46145 + }, + { + "epoch": 4.305869179807782, + "grad_norm": NaN, + "learning_rate": 5.8593301631378016e-05, + "loss": 0.0, + "step": 46146 + }, + { + "epoch": 4.305962489502659, + "grad_norm": NaN, + "learning_rate": 5.858730457527332e-05, + "loss": 0.0, + "step": 46147 + }, + { + "epoch": 4.306055799197536, + "grad_norm": NaN, + "learning_rate": 5.858130775160403e-05, + "loss": 0.0, + "step": 46148 + }, + { + "epoch": 4.306149108892414, + "grad_norm": NaN, + "learning_rate": 5.8575311160385395e-05, + "loss": 0.0, + "step": 46149 + }, + { + "epoch": 4.306242418587291, + "grad_norm": NaN, + "learning_rate": 5.856931480163266e-05, + "loss": 0.0, + "step": 46150 + }, + { + "epoch": 4.306335728282169, + "grad_norm": NaN, + "learning_rate": 5.856331867536106e-05, + "loss": 0.0, + "step": 46151 + }, + { + "epoch": 4.306429037977046, + "grad_norm": NaN, + "learning_rate": 5.855732278158588e-05, + "loss": 0.0, + "step": 46152 + }, + { + "epoch": 4.306522347671923, + "grad_norm": NaN, + "learning_rate": 5.855132712032233e-05, + "loss": 0.0, + "step": 46153 + }, + { + "epoch": 4.3066156573668, + "grad_norm": NaN, + "learning_rate": 5.854533169158566e-05, + "loss": 0.0, + "step": 46154 + }, + { + "epoch": 4.306708967061677, + "grad_norm": NaN, + "learning_rate": 5.853933649539112e-05, + "loss": 0.0, + "step": 46155 + }, + { + "epoch": 4.306802276756555, + "grad_norm": NaN, + "learning_rate": 5.853334153175396e-05, + "loss": 0.0, + "step": 46156 + }, + { + "epoch": 4.306895586451432, + "grad_norm": NaN, + "learning_rate": 5.852734680068942e-05, + "loss": 0.0, + "step": 46157 + }, + { + "epoch": 4.30698889614631, + "grad_norm": NaN, + "learning_rate": 5.852135230221274e-05, + "loss": 0.0, + "step": 46158 + }, + { + "epoch": 4.307082205841187, + "grad_norm": NaN, + "learning_rate": 5.8515358036339154e-05, + "loss": 0.0, + "step": 46159 + }, + { + "epoch": 4.3071755155360645, + "grad_norm": NaN, + "learning_rate": 5.850936400308393e-05, + "loss": 0.0, + "step": 46160 + }, + { + "epoch": 4.307268825230942, + "grad_norm": NaN, + "learning_rate": 5.8503370202462285e-05, + "loss": 0.0, + "step": 46161 + }, + { + "epoch": 4.307362134925818, + "grad_norm": NaN, + "learning_rate": 5.849737663448946e-05, + "loss": 0.0, + "step": 46162 + }, + { + "epoch": 4.307455444620696, + "grad_norm": NaN, + "learning_rate": 5.84913832991807e-05, + "loss": 0.0, + "step": 46163 + }, + { + "epoch": 4.307548754315573, + "grad_norm": NaN, + "learning_rate": 5.848539019655125e-05, + "loss": 0.0, + "step": 46164 + }, + { + "epoch": 4.307642064010451, + "grad_norm": NaN, + "learning_rate": 5.847939732661635e-05, + "loss": 0.0, + "step": 46165 + }, + { + "epoch": 4.307735373705328, + "grad_norm": NaN, + "learning_rate": 5.847340468939122e-05, + "loss": 0.0, + "step": 46166 + }, + { + "epoch": 4.3078286834002055, + "grad_norm": NaN, + "learning_rate": 5.8467412284891124e-05, + "loss": 0.0, + "step": 46167 + }, + { + "epoch": 4.307921993095083, + "grad_norm": NaN, + "learning_rate": 5.8461420113131276e-05, + "loss": 0.0, + "step": 46168 + }, + { + "epoch": 4.3080153027899595, + "grad_norm": NaN, + "learning_rate": 5.845542817412692e-05, + "loss": 0.0, + "step": 46169 + }, + { + "epoch": 4.308108612484837, + "grad_norm": NaN, + "learning_rate": 5.844943646789329e-05, + "loss": 0.0, + "step": 46170 + }, + { + "epoch": 4.308201922179714, + "grad_norm": NaN, + "learning_rate": 5.844344499444563e-05, + "loss": 0.0, + "step": 46171 + }, + { + "epoch": 4.308295231874592, + "grad_norm": NaN, + "learning_rate": 5.8437453753799166e-05, + "loss": 0.0, + "step": 46172 + }, + { + "epoch": 4.308388541569469, + "grad_norm": NaN, + "learning_rate": 5.843146274596914e-05, + "loss": 0.0, + "step": 46173 + }, + { + "epoch": 4.308481851264347, + "grad_norm": NaN, + "learning_rate": 5.842547197097078e-05, + "loss": 0.0, + "step": 46174 + }, + { + "epoch": 4.308575160959224, + "grad_norm": NaN, + "learning_rate": 5.8419481428819306e-05, + "loss": 0.0, + "step": 46175 + }, + { + "epoch": 4.308668470654101, + "grad_norm": NaN, + "learning_rate": 5.841349111952997e-05, + "loss": 0.0, + "step": 46176 + }, + { + "epoch": 4.308761780348978, + "grad_norm": NaN, + "learning_rate": 5.8407501043118e-05, + "loss": 0.0, + "step": 46177 + }, + { + "epoch": 4.308855090043855, + "grad_norm": NaN, + "learning_rate": 5.8401511199598615e-05, + "loss": 0.0, + "step": 46178 + }, + { + "epoch": 4.308948399738733, + "grad_norm": NaN, + "learning_rate": 5.839552158898706e-05, + "loss": 0.0, + "step": 46179 + }, + { + "epoch": 4.30904170943361, + "grad_norm": NaN, + "learning_rate": 5.838953221129855e-05, + "loss": 0.0, + "step": 46180 + }, + { + "epoch": 4.309135019128488, + "grad_norm": NaN, + "learning_rate": 5.838354306654833e-05, + "loss": 0.0, + "step": 46181 + }, + { + "epoch": 4.309228328823365, + "grad_norm": NaN, + "learning_rate": 5.837755415475161e-05, + "loss": 0.0, + "step": 46182 + }, + { + "epoch": 4.3093216385182425, + "grad_norm": NaN, + "learning_rate": 5.8371565475923655e-05, + "loss": 0.0, + "step": 46183 + }, + { + "epoch": 4.309414948213119, + "grad_norm": NaN, + "learning_rate": 5.8365577030079644e-05, + "loss": 0.0, + "step": 46184 + }, + { + "epoch": 4.309508257907996, + "grad_norm": NaN, + "learning_rate": 5.8359588817234826e-05, + "loss": 0.0, + "step": 46185 + }, + { + "epoch": 4.309601567602874, + "grad_norm": NaN, + "learning_rate": 5.835360083740445e-05, + "loss": 0.0, + "step": 46186 + }, + { + "epoch": 4.309694877297751, + "grad_norm": NaN, + "learning_rate": 5.83476130906037e-05, + "loss": 0.0, + "step": 46187 + }, + { + "epoch": 4.309788186992629, + "grad_norm": NaN, + "learning_rate": 5.834162557684781e-05, + "loss": 0.0, + "step": 46188 + }, + { + "epoch": 4.309881496687506, + "grad_norm": NaN, + "learning_rate": 5.8335638296152034e-05, + "loss": 0.0, + "step": 46189 + }, + { + "epoch": 4.3099748063823835, + "grad_norm": NaN, + "learning_rate": 5.832965124853158e-05, + "loss": 0.0, + "step": 46190 + }, + { + "epoch": 4.31006811607726, + "grad_norm": NaN, + "learning_rate": 5.832366443400166e-05, + "loss": 0.0, + "step": 46191 + }, + { + "epoch": 4.3101614257721375, + "grad_norm": NaN, + "learning_rate": 5.83176778525775e-05, + "loss": 0.0, + "step": 46192 + }, + { + "epoch": 4.310254735467015, + "grad_norm": NaN, + "learning_rate": 5.831169150427433e-05, + "loss": 0.0, + "step": 46193 + }, + { + "epoch": 4.310348045161892, + "grad_norm": NaN, + "learning_rate": 5.830570538910738e-05, + "loss": 0.0, + "step": 46194 + }, + { + "epoch": 4.31044135485677, + "grad_norm": NaN, + "learning_rate": 5.829971950709184e-05, + "loss": 0.0, + "step": 46195 + }, + { + "epoch": 4.310534664551647, + "grad_norm": NaN, + "learning_rate": 5.8293733858242965e-05, + "loss": 0.0, + "step": 46196 + }, + { + "epoch": 4.3106279742465246, + "grad_norm": NaN, + "learning_rate": 5.828774844257595e-05, + "loss": 0.0, + "step": 46197 + }, + { + "epoch": 4.310721283941401, + "grad_norm": NaN, + "learning_rate": 5.828176326010603e-05, + "loss": 0.0, + "step": 46198 + }, + { + "epoch": 4.3108145936362785, + "grad_norm": NaN, + "learning_rate": 5.827577831084841e-05, + "loss": 0.0, + "step": 46199 + }, + { + "epoch": 4.310907903331156, + "grad_norm": NaN, + "learning_rate": 5.8269793594818326e-05, + "loss": 0.0, + "step": 46200 + }, + { + "epoch": 4.311001213026033, + "grad_norm": NaN, + "learning_rate": 5.826380911203097e-05, + "loss": 0.0, + "step": 46201 + }, + { + "epoch": 4.311094522720911, + "grad_norm": NaN, + "learning_rate": 5.825782486250159e-05, + "loss": 0.0, + "step": 46202 + }, + { + "epoch": 4.311187832415788, + "grad_norm": NaN, + "learning_rate": 5.8251840846245376e-05, + "loss": 0.0, + "step": 46203 + }, + { + "epoch": 4.311281142110666, + "grad_norm": NaN, + "learning_rate": 5.824585706327754e-05, + "loss": 0.0, + "step": 46204 + }, + { + "epoch": 4.311374451805543, + "grad_norm": NaN, + "learning_rate": 5.8239873513613326e-05, + "loss": 0.0, + "step": 46205 + }, + { + "epoch": 4.3114677615004195, + "grad_norm": NaN, + "learning_rate": 5.8233890197267927e-05, + "loss": 0.0, + "step": 46206 + }, + { + "epoch": 4.311561071195297, + "grad_norm": NaN, + "learning_rate": 5.822790711425656e-05, + "loss": 0.0, + "step": 46207 + }, + { + "epoch": 4.311654380890174, + "grad_norm": NaN, + "learning_rate": 5.822192426459446e-05, + "loss": 0.0, + "step": 46208 + }, + { + "epoch": 4.311747690585052, + "grad_norm": NaN, + "learning_rate": 5.82159416482968e-05, + "loss": 0.0, + "step": 46209 + }, + { + "epoch": 4.311841000279929, + "grad_norm": NaN, + "learning_rate": 5.820995926537882e-05, + "loss": 0.0, + "step": 46210 + }, + { + "epoch": 4.311934309974807, + "grad_norm": NaN, + "learning_rate": 5.820397711585573e-05, + "loss": 0.0, + "step": 46211 + }, + { + "epoch": 4.312027619669684, + "grad_norm": NaN, + "learning_rate": 5.819799519974271e-05, + "loss": 0.0, + "step": 46212 + }, + { + "epoch": 4.312120929364561, + "grad_norm": NaN, + "learning_rate": 5.819201351705502e-05, + "loss": 0.0, + "step": 46213 + }, + { + "epoch": 4.312214239059438, + "grad_norm": NaN, + "learning_rate": 5.818603206780783e-05, + "loss": 0.0, + "step": 46214 + }, + { + "epoch": 4.312307548754315, + "grad_norm": NaN, + "learning_rate": 5.8180050852016366e-05, + "loss": 0.0, + "step": 46215 + }, + { + "epoch": 4.312400858449193, + "grad_norm": NaN, + "learning_rate": 5.817406986969582e-05, + "loss": 0.0, + "step": 46216 + }, + { + "epoch": 4.31249416814407, + "grad_norm": NaN, + "learning_rate": 5.816808912086143e-05, + "loss": 0.0, + "step": 46217 + }, + { + "epoch": 4.312587477838948, + "grad_norm": NaN, + "learning_rate": 5.816210860552838e-05, + "loss": 0.0, + "step": 46218 + }, + { + "epoch": 4.312680787533825, + "grad_norm": NaN, + "learning_rate": 5.8156128323711896e-05, + "loss": 0.0, + "step": 46219 + }, + { + "epoch": 4.3127740972287025, + "grad_norm": NaN, + "learning_rate": 5.815014827542716e-05, + "loss": 0.0, + "step": 46220 + }, + { + "epoch": 4.312867406923579, + "grad_norm": NaN, + "learning_rate": 5.814416846068939e-05, + "loss": 0.0, + "step": 46221 + }, + { + "epoch": 4.3129607166184565, + "grad_norm": NaN, + "learning_rate": 5.81381888795138e-05, + "loss": 0.0, + "step": 46222 + }, + { + "epoch": 4.313054026313334, + "grad_norm": NaN, + "learning_rate": 5.8132209531915566e-05, + "loss": 0.0, + "step": 46223 + }, + { + "epoch": 4.313147336008211, + "grad_norm": NaN, + "learning_rate": 5.812623041790993e-05, + "loss": 0.0, + "step": 46224 + }, + { + "epoch": 4.313240645703089, + "grad_norm": NaN, + "learning_rate": 5.812025153751206e-05, + "loss": 0.0, + "step": 46225 + }, + { + "epoch": 4.313333955397966, + "grad_norm": NaN, + "learning_rate": 5.8114272890737174e-05, + "loss": 0.0, + "step": 46226 + }, + { + "epoch": 4.313427265092843, + "grad_norm": NaN, + "learning_rate": 5.810829447760048e-05, + "loss": 0.0, + "step": 46227 + }, + { + "epoch": 4.31352057478772, + "grad_norm": NaN, + "learning_rate": 5.810231629811717e-05, + "loss": 0.0, + "step": 46228 + }, + { + "epoch": 4.3136138844825975, + "grad_norm": NaN, + "learning_rate": 5.8096338352302455e-05, + "loss": 0.0, + "step": 46229 + }, + { + "epoch": 4.313707194177475, + "grad_norm": NaN, + "learning_rate": 5.8090360640171516e-05, + "loss": 0.0, + "step": 46230 + }, + { + "epoch": 4.313800503872352, + "grad_norm": NaN, + "learning_rate": 5.8084383161739575e-05, + "loss": 0.0, + "step": 46231 + }, + { + "epoch": 4.31389381356723, + "grad_norm": NaN, + "learning_rate": 5.8078405917021806e-05, + "loss": 0.0, + "step": 46232 + }, + { + "epoch": 4.313987123262107, + "grad_norm": NaN, + "learning_rate": 5.8072428906033434e-05, + "loss": 0.0, + "step": 46233 + }, + { + "epoch": 4.314080432956985, + "grad_norm": NaN, + "learning_rate": 5.8066452128789644e-05, + "loss": 0.0, + "step": 46234 + }, + { + "epoch": 4.314173742651861, + "grad_norm": NaN, + "learning_rate": 5.806047558530556e-05, + "loss": 0.0, + "step": 46235 + }, + { + "epoch": 4.314267052346739, + "grad_norm": NaN, + "learning_rate": 5.805449927559659e-05, + "loss": 0.0, + "step": 46236 + }, + { + "epoch": 4.314360362041616, + "grad_norm": NaN, + "learning_rate": 5.804852319967772e-05, + "loss": 0.0, + "step": 46237 + }, + { + "epoch": 4.314453671736493, + "grad_norm": NaN, + "learning_rate": 5.804254735756422e-05, + "loss": 0.0, + "step": 46238 + }, + { + "epoch": 4.314546981431371, + "grad_norm": NaN, + "learning_rate": 5.8036571749271284e-05, + "loss": 0.0, + "step": 46239 + }, + { + "epoch": 4.314640291126248, + "grad_norm": NaN, + "learning_rate": 5.8030596374814114e-05, + "loss": 0.0, + "step": 46240 + }, + { + "epoch": 4.314733600821126, + "grad_norm": NaN, + "learning_rate": 5.802462123420789e-05, + "loss": 0.0, + "step": 46241 + }, + { + "epoch": 4.314826910516002, + "grad_norm": NaN, + "learning_rate": 5.801864632746778e-05, + "loss": 0.0, + "step": 46242 + }, + { + "epoch": 4.31492022021088, + "grad_norm": NaN, + "learning_rate": 5.8012671654609046e-05, + "loss": 0.0, + "step": 46243 + }, + { + "epoch": 4.315013529905757, + "grad_norm": NaN, + "learning_rate": 5.800669721564681e-05, + "loss": 0.0, + "step": 46244 + }, + { + "epoch": 4.3151068396006345, + "grad_norm": NaN, + "learning_rate": 5.800072301059631e-05, + "loss": 0.0, + "step": 46245 + }, + { + "epoch": 4.315200149295512, + "grad_norm": NaN, + "learning_rate": 5.799474903947271e-05, + "loss": 0.0, + "step": 46246 + }, + { + "epoch": 4.315293458990389, + "grad_norm": NaN, + "learning_rate": 5.798877530229123e-05, + "loss": 0.0, + "step": 46247 + }, + { + "epoch": 4.315386768685267, + "grad_norm": NaN, + "learning_rate": 5.798280179906701e-05, + "loss": 0.0, + "step": 46248 + }, + { + "epoch": 4.315480078380144, + "grad_norm": NaN, + "learning_rate": 5.797682852981528e-05, + "loss": 0.0, + "step": 46249 + }, + { + "epoch": 4.315573388075021, + "grad_norm": NaN, + "learning_rate": 5.7970855494551206e-05, + "loss": 0.0, + "step": 46250 + }, + { + "epoch": 4.315666697769898, + "grad_norm": NaN, + "learning_rate": 5.796488269328999e-05, + "loss": 0.0, + "step": 46251 + }, + { + "epoch": 4.3157600074647755, + "grad_norm": NaN, + "learning_rate": 5.7958910126046836e-05, + "loss": 0.0, + "step": 46252 + }, + { + "epoch": 4.315853317159653, + "grad_norm": NaN, + "learning_rate": 5.795293779283689e-05, + "loss": 0.0, + "step": 46253 + }, + { + "epoch": 4.31594662685453, + "grad_norm": NaN, + "learning_rate": 5.794696569367536e-05, + "loss": 0.0, + "step": 46254 + }, + { + "epoch": 4.316039936549408, + "grad_norm": NaN, + "learning_rate": 5.794099382857743e-05, + "loss": 0.0, + "step": 46255 + }, + { + "epoch": 4.316133246244285, + "grad_norm": NaN, + "learning_rate": 5.7935022197558265e-05, + "loss": 0.0, + "step": 46256 + }, + { + "epoch": 4.316226555939162, + "grad_norm": NaN, + "learning_rate": 5.792905080063309e-05, + "loss": 0.0, + "step": 46257 + }, + { + "epoch": 4.316319865634039, + "grad_norm": NaN, + "learning_rate": 5.792307963781705e-05, + "loss": 0.0, + "step": 46258 + }, + { + "epoch": 4.3164131753289166, + "grad_norm": NaN, + "learning_rate": 5.791710870912536e-05, + "loss": 0.0, + "step": 46259 + }, + { + "epoch": 4.316506485023794, + "grad_norm": NaN, + "learning_rate": 5.7911138014573174e-05, + "loss": 0.0, + "step": 46260 + }, + { + "epoch": 4.316599794718671, + "grad_norm": NaN, + "learning_rate": 5.790516755417568e-05, + "loss": 0.0, + "step": 46261 + }, + { + "epoch": 4.316693104413549, + "grad_norm": NaN, + "learning_rate": 5.789919732794807e-05, + "loss": 0.0, + "step": 46262 + }, + { + "epoch": 4.316786414108426, + "grad_norm": NaN, + "learning_rate": 5.7893227335905535e-05, + "loss": 0.0, + "step": 46263 + }, + { + "epoch": 4.316879723803303, + "grad_norm": NaN, + "learning_rate": 5.7887257578063225e-05, + "loss": 0.0, + "step": 46264 + }, + { + "epoch": 4.31697303349818, + "grad_norm": NaN, + "learning_rate": 5.7881288054436325e-05, + "loss": 0.0, + "step": 46265 + }, + { + "epoch": 4.317066343193058, + "grad_norm": NaN, + "learning_rate": 5.787531876504003e-05, + "loss": 0.0, + "step": 46266 + }, + { + "epoch": 4.317159652887935, + "grad_norm": NaN, + "learning_rate": 5.786934970988951e-05, + "loss": 0.0, + "step": 46267 + }, + { + "epoch": 4.317252962582812, + "grad_norm": NaN, + "learning_rate": 5.7863380888999944e-05, + "loss": 0.0, + "step": 46268 + }, + { + "epoch": 4.31734627227769, + "grad_norm": NaN, + "learning_rate": 5.78574123023865e-05, + "loss": 0.0, + "step": 46269 + }, + { + "epoch": 4.317439581972567, + "grad_norm": NaN, + "learning_rate": 5.785144395006437e-05, + "loss": 0.0, + "step": 46270 + }, + { + "epoch": 4.317532891667444, + "grad_norm": NaN, + "learning_rate": 5.7845475832048716e-05, + "loss": 0.0, + "step": 46271 + }, + { + "epoch": 4.317626201362321, + "grad_norm": NaN, + "learning_rate": 5.783950794835472e-05, + "loss": 0.0, + "step": 46272 + }, + { + "epoch": 4.317719511057199, + "grad_norm": NaN, + "learning_rate": 5.78335402989975e-05, + "loss": 0.0, + "step": 46273 + }, + { + "epoch": 4.317812820752076, + "grad_norm": NaN, + "learning_rate": 5.7827572883992395e-05, + "loss": 0.0, + "step": 46274 + }, + { + "epoch": 4.3179061304469535, + "grad_norm": NaN, + "learning_rate": 5.782160570335442e-05, + "loss": 0.0, + "step": 46275 + }, + { + "epoch": 4.317999440141831, + "grad_norm": NaN, + "learning_rate": 5.7815638757098736e-05, + "loss": 0.0, + "step": 46276 + }, + { + "epoch": 4.318092749836708, + "grad_norm": NaN, + "learning_rate": 5.7809672045240694e-05, + "loss": 0.0, + "step": 46277 + }, + { + "epoch": 4.318186059531586, + "grad_norm": NaN, + "learning_rate": 5.7803705567795286e-05, + "loss": 0.0, + "step": 46278 + }, + { + "epoch": 4.318279369226462, + "grad_norm": NaN, + "learning_rate": 5.7797739324777684e-05, + "loss": 0.0, + "step": 46279 + }, + { + "epoch": 4.31837267892134, + "grad_norm": NaN, + "learning_rate": 5.779177331620324e-05, + "loss": 0.0, + "step": 46280 + }, + { + "epoch": 4.318465988616217, + "grad_norm": NaN, + "learning_rate": 5.778580754208694e-05, + "loss": 0.0, + "step": 46281 + }, + { + "epoch": 4.3185592983110945, + "grad_norm": NaN, + "learning_rate": 5.7779842002444e-05, + "loss": 0.0, + "step": 46282 + }, + { + "epoch": 4.318652608005972, + "grad_norm": NaN, + "learning_rate": 5.777387669728963e-05, + "loss": 0.0, + "step": 46283 + }, + { + "epoch": 4.318745917700849, + "grad_norm": NaN, + "learning_rate": 5.776791162663897e-05, + "loss": 0.0, + "step": 46284 + }, + { + "epoch": 4.318839227395727, + "grad_norm": NaN, + "learning_rate": 5.776194679050718e-05, + "loss": 0.0, + "step": 46285 + }, + { + "epoch": 4.318932537090603, + "grad_norm": NaN, + "learning_rate": 5.775598218890943e-05, + "loss": 0.0, + "step": 46286 + }, + { + "epoch": 4.319025846785481, + "grad_norm": NaN, + "learning_rate": 5.77500178218609e-05, + "loss": 0.0, + "step": 46287 + }, + { + "epoch": 4.319119156480358, + "grad_norm": NaN, + "learning_rate": 5.774405368937675e-05, + "loss": 0.0, + "step": 46288 + }, + { + "epoch": 4.319212466175236, + "grad_norm": NaN, + "learning_rate": 5.7738089791472145e-05, + "loss": 0.0, + "step": 46289 + }, + { + "epoch": 4.319305775870113, + "grad_norm": NaN, + "learning_rate": 5.7732126128162245e-05, + "loss": 0.0, + "step": 46290 + }, + { + "epoch": 4.31939908556499, + "grad_norm": NaN, + "learning_rate": 5.772616269946222e-05, + "loss": 0.0, + "step": 46291 + }, + { + "epoch": 4.319492395259868, + "grad_norm": NaN, + "learning_rate": 5.772019950538723e-05, + "loss": 0.0, + "step": 46292 + }, + { + "epoch": 4.319585704954745, + "grad_norm": NaN, + "learning_rate": 5.771423654595242e-05, + "loss": 0.0, + "step": 46293 + }, + { + "epoch": 4.319679014649622, + "grad_norm": NaN, + "learning_rate": 5.770827382117299e-05, + "loss": 0.0, + "step": 46294 + }, + { + "epoch": 4.319772324344499, + "grad_norm": NaN, + "learning_rate": 5.7702311331064076e-05, + "loss": 0.0, + "step": 46295 + }, + { + "epoch": 4.319865634039377, + "grad_norm": NaN, + "learning_rate": 5.7696349075640834e-05, + "loss": 0.0, + "step": 46296 + }, + { + "epoch": 4.319958943734254, + "grad_norm": NaN, + "learning_rate": 5.7690387054918455e-05, + "loss": 0.0, + "step": 46297 + }, + { + "epoch": 4.3200522534291315, + "grad_norm": NaN, + "learning_rate": 5.768442526891205e-05, + "loss": 0.0, + "step": 46298 + }, + { + "epoch": 4.320145563124009, + "grad_norm": NaN, + "learning_rate": 5.767846371763683e-05, + "loss": 0.0, + "step": 46299 + }, + { + "epoch": 4.320238872818885, + "grad_norm": NaN, + "learning_rate": 5.767250240110792e-05, + "loss": 0.0, + "step": 46300 + }, + { + "epoch": 4.320332182513763, + "grad_norm": NaN, + "learning_rate": 5.766654131934048e-05, + "loss": 0.0, + "step": 46301 + }, + { + "epoch": 4.32042549220864, + "grad_norm": NaN, + "learning_rate": 5.766058047234968e-05, + "loss": 0.0, + "step": 46302 + }, + { + "epoch": 4.320518801903518, + "grad_norm": NaN, + "learning_rate": 5.7654619860150676e-05, + "loss": 0.0, + "step": 46303 + }, + { + "epoch": 4.320612111598395, + "grad_norm": NaN, + "learning_rate": 5.764865948275861e-05, + "loss": 0.0, + "step": 46304 + }, + { + "epoch": 4.3207054212932725, + "grad_norm": NaN, + "learning_rate": 5.7642699340188645e-05, + "loss": 0.0, + "step": 46305 + }, + { + "epoch": 4.32079873098815, + "grad_norm": NaN, + "learning_rate": 5.763673943245595e-05, + "loss": 0.0, + "step": 46306 + }, + { + "epoch": 4.320892040683027, + "grad_norm": NaN, + "learning_rate": 5.763077975957565e-05, + "loss": 0.0, + "step": 46307 + }, + { + "epoch": 4.320985350377904, + "grad_norm": NaN, + "learning_rate": 5.762482032156293e-05, + "loss": 0.0, + "step": 46308 + }, + { + "epoch": 4.321078660072781, + "grad_norm": NaN, + "learning_rate": 5.7618861118432924e-05, + "loss": 0.0, + "step": 46309 + }, + { + "epoch": 4.321171969767659, + "grad_norm": NaN, + "learning_rate": 5.7612902150200786e-05, + "loss": 0.0, + "step": 46310 + }, + { + "epoch": 4.321265279462536, + "grad_norm": NaN, + "learning_rate": 5.760694341688162e-05, + "loss": 0.0, + "step": 46311 + }, + { + "epoch": 4.3213585891574136, + "grad_norm": NaN, + "learning_rate": 5.760098491849073e-05, + "loss": 0.0, + "step": 46312 + }, + { + "epoch": 4.321451898852291, + "grad_norm": NaN, + "learning_rate": 5.759502665504312e-05, + "loss": 0.0, + "step": 46313 + }, + { + "epoch": 4.321545208547168, + "grad_norm": NaN, + "learning_rate": 5.758906862655393e-05, + "loss": 0.0, + "step": 46314 + }, + { + "epoch": 4.321638518242045, + "grad_norm": NaN, + "learning_rate": 5.758311083303847e-05, + "loss": 0.0, + "step": 46315 + }, + { + "epoch": 4.321731827936922, + "grad_norm": NaN, + "learning_rate": 5.757715327451173e-05, + "loss": 0.0, + "step": 46316 + }, + { + "epoch": 4.3218251376318, + "grad_norm": NaN, + "learning_rate": 5.7571195950988855e-05, + "loss": 0.0, + "step": 46317 + }, + { + "epoch": 4.321918447326677, + "grad_norm": NaN, + "learning_rate": 5.756523886248516e-05, + "loss": 0.0, + "step": 46318 + }, + { + "epoch": 4.322011757021555, + "grad_norm": NaN, + "learning_rate": 5.7559282009015616e-05, + "loss": 0.0, + "step": 46319 + }, + { + "epoch": 4.322105066716432, + "grad_norm": NaN, + "learning_rate": 5.755332539059539e-05, + "loss": 0.0, + "step": 46320 + }, + { + "epoch": 4.322198376411309, + "grad_norm": NaN, + "learning_rate": 5.754736900723978e-05, + "loss": 0.0, + "step": 46321 + }, + { + "epoch": 4.322291686106187, + "grad_norm": NaN, + "learning_rate": 5.7541412858963776e-05, + "loss": 0.0, + "step": 46322 + }, + { + "epoch": 4.322384995801063, + "grad_norm": NaN, + "learning_rate": 5.753545694578251e-05, + "loss": 0.0, + "step": 46323 + }, + { + "epoch": 4.322478305495941, + "grad_norm": NaN, + "learning_rate": 5.752950126771128e-05, + "loss": 0.0, + "step": 46324 + }, + { + "epoch": 4.322571615190818, + "grad_norm": NaN, + "learning_rate": 5.752354582476511e-05, + "loss": 0.0, + "step": 46325 + }, + { + "epoch": 4.322664924885696, + "grad_norm": NaN, + "learning_rate": 5.751759061695916e-05, + "loss": 0.0, + "step": 46326 + }, + { + "epoch": 4.322758234580573, + "grad_norm": NaN, + "learning_rate": 5.7511635644308584e-05, + "loss": 0.0, + "step": 46327 + }, + { + "epoch": 4.3228515442754505, + "grad_norm": NaN, + "learning_rate": 5.750568090682851e-05, + "loss": 0.0, + "step": 46328 + }, + { + "epoch": 4.322944853970328, + "grad_norm": NaN, + "learning_rate": 5.7499726404534105e-05, + "loss": 0.0, + "step": 46329 + }, + { + "epoch": 4.323038163665204, + "grad_norm": NaN, + "learning_rate": 5.749377213744048e-05, + "loss": 0.0, + "step": 46330 + }, + { + "epoch": 4.323131473360082, + "grad_norm": NaN, + "learning_rate": 5.74878181055628e-05, + "loss": 0.0, + "step": 46331 + }, + { + "epoch": 4.323224783054959, + "grad_norm": NaN, + "learning_rate": 5.7481864308916207e-05, + "loss": 0.0, + "step": 46332 + }, + { + "epoch": 4.323318092749837, + "grad_norm": NaN, + "learning_rate": 5.7475910747515816e-05, + "loss": 0.0, + "step": 46333 + }, + { + "epoch": 4.323411402444714, + "grad_norm": NaN, + "learning_rate": 5.7469957421376766e-05, + "loss": 0.0, + "step": 46334 + }, + { + "epoch": 4.3235047121395915, + "grad_norm": NaN, + "learning_rate": 5.74640043305142e-05, + "loss": 0.0, + "step": 46335 + }, + { + "epoch": 4.323598021834469, + "grad_norm": NaN, + "learning_rate": 5.745805147494327e-05, + "loss": 0.0, + "step": 46336 + }, + { + "epoch": 4.3236913315293455, + "grad_norm": NaN, + "learning_rate": 5.745209885467912e-05, + "loss": 0.0, + "step": 46337 + }, + { + "epoch": 4.323784641224223, + "grad_norm": NaN, + "learning_rate": 5.7446146469736844e-05, + "loss": 0.0, + "step": 46338 + }, + { + "epoch": 4.3238779509191, + "grad_norm": NaN, + "learning_rate": 5.744019432013162e-05, + "loss": 0.0, + "step": 46339 + }, + { + "epoch": 4.323971260613978, + "grad_norm": NaN, + "learning_rate": 5.7434242405878545e-05, + "loss": 0.0, + "step": 46340 + }, + { + "epoch": 4.324064570308855, + "grad_norm": NaN, + "learning_rate": 5.742829072699278e-05, + "loss": 0.0, + "step": 46341 + }, + { + "epoch": 4.324157880003733, + "grad_norm": NaN, + "learning_rate": 5.7422339283489446e-05, + "loss": 0.0, + "step": 46342 + }, + { + "epoch": 4.32425118969861, + "grad_norm": NaN, + "learning_rate": 5.7416388075383684e-05, + "loss": 0.0, + "step": 46343 + }, + { + "epoch": 4.3243444993934865, + "grad_norm": NaN, + "learning_rate": 5.7410437102690625e-05, + "loss": 0.0, + "step": 46344 + }, + { + "epoch": 4.324437809088364, + "grad_norm": NaN, + "learning_rate": 5.740448636542539e-05, + "loss": 0.0, + "step": 46345 + }, + { + "epoch": 4.324531118783241, + "grad_norm": NaN, + "learning_rate": 5.7398535863603065e-05, + "loss": 0.0, + "step": 46346 + }, + { + "epoch": 4.324624428478119, + "grad_norm": NaN, + "learning_rate": 5.739258559723895e-05, + "loss": 0.0, + "step": 46347 + }, + { + "epoch": 4.324717738172996, + "grad_norm": NaN, + "learning_rate": 5.738663556634799e-05, + "loss": 0.0, + "step": 46348 + }, + { + "epoch": 4.324811047867874, + "grad_norm": NaN, + "learning_rate": 5.7380685770945336e-05, + "loss": 0.0, + "step": 46349 + }, + { + "epoch": 4.324904357562751, + "grad_norm": NaN, + "learning_rate": 5.737473621104627e-05, + "loss": 0.0, + "step": 46350 + }, + { + "epoch": 4.3249976672576285, + "grad_norm": NaN, + "learning_rate": 5.7368786886665755e-05, + "loss": 0.0, + "step": 46351 + }, + { + "epoch": 4.325090976952505, + "grad_norm": NaN, + "learning_rate": 5.7362837797818924e-05, + "loss": 0.0, + "step": 46352 + }, + { + "epoch": 4.325184286647382, + "grad_norm": NaN, + "learning_rate": 5.735688894452106e-05, + "loss": 0.0, + "step": 46353 + }, + { + "epoch": 4.32527759634226, + "grad_norm": NaN, + "learning_rate": 5.735094032678712e-05, + "loss": 0.0, + "step": 46354 + }, + { + "epoch": 4.325370906037137, + "grad_norm": NaN, + "learning_rate": 5.734499194463227e-05, + "loss": 0.0, + "step": 46355 + }, + { + "epoch": 4.325464215732015, + "grad_norm": NaN, + "learning_rate": 5.733904379807174e-05, + "loss": 0.0, + "step": 46356 + }, + { + "epoch": 4.325557525426892, + "grad_norm": NaN, + "learning_rate": 5.7333095887120524e-05, + "loss": 0.0, + "step": 46357 + }, + { + "epoch": 4.3256508351217695, + "grad_norm": NaN, + "learning_rate": 5.732714821179376e-05, + "loss": 0.0, + "step": 46358 + }, + { + "epoch": 4.325744144816646, + "grad_norm": NaN, + "learning_rate": 5.732120077210669e-05, + "loss": 0.0, + "step": 46359 + }, + { + "epoch": 4.3258374545115235, + "grad_norm": NaN, + "learning_rate": 5.7315253568074316e-05, + "loss": 0.0, + "step": 46360 + }, + { + "epoch": 4.325930764206401, + "grad_norm": NaN, + "learning_rate": 5.730930659971174e-05, + "loss": 0.0, + "step": 46361 + }, + { + "epoch": 4.326024073901278, + "grad_norm": NaN, + "learning_rate": 5.730335986703424e-05, + "loss": 0.0, + "step": 46362 + }, + { + "epoch": 4.326117383596156, + "grad_norm": NaN, + "learning_rate": 5.7297413370056775e-05, + "loss": 0.0, + "step": 46363 + }, + { + "epoch": 4.326210693291033, + "grad_norm": NaN, + "learning_rate": 5.72914671087945e-05, + "loss": 0.0, + "step": 46364 + }, + { + "epoch": 4.326304002985911, + "grad_norm": NaN, + "learning_rate": 5.728552108326265e-05, + "loss": 0.0, + "step": 46365 + }, + { + "epoch": 4.326397312680788, + "grad_norm": NaN, + "learning_rate": 5.727957529347621e-05, + "loss": 0.0, + "step": 46366 + }, + { + "epoch": 4.3264906223756645, + "grad_norm": NaN, + "learning_rate": 5.727362973945029e-05, + "loss": 0.0, + "step": 46367 + }, + { + "epoch": 4.326583932070542, + "grad_norm": NaN, + "learning_rate": 5.726768442120016e-05, + "loss": 0.0, + "step": 46368 + }, + { + "epoch": 4.326677241765419, + "grad_norm": NaN, + "learning_rate": 5.72617393387408e-05, + "loss": 0.0, + "step": 46369 + }, + { + "epoch": 4.326770551460297, + "grad_norm": NaN, + "learning_rate": 5.7255794492087353e-05, + "loss": 0.0, + "step": 46370 + }, + { + "epoch": 4.326863861155174, + "grad_norm": NaN, + "learning_rate": 5.724984988125495e-05, + "loss": 0.0, + "step": 46371 + }, + { + "epoch": 4.326957170850052, + "grad_norm": NaN, + "learning_rate": 5.7243905506258714e-05, + "loss": 0.0, + "step": 46372 + }, + { + "epoch": 4.327050480544929, + "grad_norm": NaN, + "learning_rate": 5.723796136711375e-05, + "loss": 0.0, + "step": 46373 + }, + { + "epoch": 4.3271437902398056, + "grad_norm": NaN, + "learning_rate": 5.7232017463835164e-05, + "loss": 0.0, + "step": 46374 + }, + { + "epoch": 4.327237099934683, + "grad_norm": NaN, + "learning_rate": 5.7226073796438074e-05, + "loss": 0.0, + "step": 46375 + }, + { + "epoch": 4.32733040962956, + "grad_norm": NaN, + "learning_rate": 5.72201303649376e-05, + "loss": 0.0, + "step": 46376 + }, + { + "epoch": 4.327423719324438, + "grad_norm": NaN, + "learning_rate": 5.721418716934885e-05, + "loss": 0.0, + "step": 46377 + }, + { + "epoch": 4.327517029019315, + "grad_norm": NaN, + "learning_rate": 5.720824420968695e-05, + "loss": 0.0, + "step": 46378 + }, + { + "epoch": 4.327610338714193, + "grad_norm": NaN, + "learning_rate": 5.7202301485966975e-05, + "loss": 0.0, + "step": 46379 + }, + { + "epoch": 4.32770364840907, + "grad_norm": NaN, + "learning_rate": 5.719635899820408e-05, + "loss": 0.0, + "step": 46380 + }, + { + "epoch": 4.327796958103947, + "grad_norm": NaN, + "learning_rate": 5.719041674641335e-05, + "loss": 0.0, + "step": 46381 + }, + { + "epoch": 4.327890267798824, + "grad_norm": NaN, + "learning_rate": 5.718447473060989e-05, + "loss": 0.0, + "step": 46382 + }, + { + "epoch": 4.327983577493701, + "grad_norm": NaN, + "learning_rate": 5.717853295080882e-05, + "loss": 0.0, + "step": 46383 + }, + { + "epoch": 4.328076887188579, + "grad_norm": NaN, + "learning_rate": 5.7172591407025194e-05, + "loss": 0.0, + "step": 46384 + }, + { + "epoch": 4.328170196883456, + "grad_norm": NaN, + "learning_rate": 5.7166650099274266e-05, + "loss": 0.0, + "step": 46385 + }, + { + "epoch": 4.328263506578334, + "grad_norm": NaN, + "learning_rate": 5.7160709027571e-05, + "loss": 0.0, + "step": 46386 + }, + { + "epoch": 4.328356816273211, + "grad_norm": NaN, + "learning_rate": 5.7154768191930506e-05, + "loss": 0.0, + "step": 46387 + }, + { + "epoch": 4.328450125968088, + "grad_norm": NaN, + "learning_rate": 5.714882759236804e-05, + "loss": 0.0, + "step": 46388 + }, + { + "epoch": 4.328543435662965, + "grad_norm": NaN, + "learning_rate": 5.714288722889854e-05, + "loss": 0.0, + "step": 46389 + }, + { + "epoch": 4.3286367453578425, + "grad_norm": NaN, + "learning_rate": 5.713694710153712e-05, + "loss": 0.0, + "step": 46390 + }, + { + "epoch": 4.32873005505272, + "grad_norm": NaN, + "learning_rate": 5.713100721029903e-05, + "loss": 0.0, + "step": 46391 + }, + { + "epoch": 4.328823364747597, + "grad_norm": NaN, + "learning_rate": 5.712506755519925e-05, + "loss": 0.0, + "step": 46392 + }, + { + "epoch": 4.328916674442475, + "grad_norm": NaN, + "learning_rate": 5.7119128136252836e-05, + "loss": 0.0, + "step": 46393 + }, + { + "epoch": 4.329009984137352, + "grad_norm": NaN, + "learning_rate": 5.7113188953475086e-05, + "loss": 0.0, + "step": 46394 + }, + { + "epoch": 4.32910329383223, + "grad_norm": NaN, + "learning_rate": 5.710725000688093e-05, + "loss": 0.0, + "step": 46395 + }, + { + "epoch": 4.329196603527106, + "grad_norm": NaN, + "learning_rate": 5.7101311296485464e-05, + "loss": 0.0, + "step": 46396 + }, + { + "epoch": 4.3292899132219835, + "grad_norm": NaN, + "learning_rate": 5.709537282230395e-05, + "loss": 0.0, + "step": 46397 + }, + { + "epoch": 4.329383222916861, + "grad_norm": NaN, + "learning_rate": 5.7089434584351316e-05, + "loss": 0.0, + "step": 46398 + }, + { + "epoch": 4.329476532611738, + "grad_norm": NaN, + "learning_rate": 5.70834965826427e-05, + "loss": 0.0, + "step": 46399 + }, + { + "epoch": 4.329569842306616, + "grad_norm": NaN, + "learning_rate": 5.70775588171933e-05, + "loss": 0.0, + "step": 46400 + }, + { + "epoch": 4.329663152001493, + "grad_norm": NaN, + "learning_rate": 5.707162128801811e-05, + "loss": 0.0, + "step": 46401 + }, + { + "epoch": 4.329756461696371, + "grad_norm": NaN, + "learning_rate": 5.7065683995132225e-05, + "loss": 0.0, + "step": 46402 + }, + { + "epoch": 4.329849771391247, + "grad_norm": NaN, + "learning_rate": 5.705974693855086e-05, + "loss": 0.0, + "step": 46403 + }, + { + "epoch": 4.329943081086125, + "grad_norm": NaN, + "learning_rate": 5.705381011828898e-05, + "loss": 0.0, + "step": 46404 + }, + { + "epoch": 4.330036390781002, + "grad_norm": NaN, + "learning_rate": 5.7047873534361664e-05, + "loss": 0.0, + "step": 46405 + }, + { + "epoch": 4.330129700475879, + "grad_norm": NaN, + "learning_rate": 5.704193718678419e-05, + "loss": 0.0, + "step": 46406 + }, + { + "epoch": 4.330223010170757, + "grad_norm": NaN, + "learning_rate": 5.7036001075571483e-05, + "loss": 0.0, + "step": 46407 + }, + { + "epoch": 4.330316319865634, + "grad_norm": NaN, + "learning_rate": 5.7030065200738615e-05, + "loss": 0.0, + "step": 46408 + }, + { + "epoch": 4.330409629560512, + "grad_norm": NaN, + "learning_rate": 5.702412956230087e-05, + "loss": 0.0, + "step": 46409 + }, + { + "epoch": 4.330502939255389, + "grad_norm": NaN, + "learning_rate": 5.7018194160273175e-05, + "loss": 0.0, + "step": 46410 + }, + { + "epoch": 4.330596248950266, + "grad_norm": NaN, + "learning_rate": 5.7012258994670615e-05, + "loss": 0.0, + "step": 46411 + }, + { + "epoch": 4.330689558645143, + "grad_norm": NaN, + "learning_rate": 5.700632406550843e-05, + "loss": 0.0, + "step": 46412 + }, + { + "epoch": 4.3307828683400205, + "grad_norm": NaN, + "learning_rate": 5.700038937280158e-05, + "loss": 0.0, + "step": 46413 + }, + { + "epoch": 4.330876178034898, + "grad_norm": NaN, + "learning_rate": 5.699445491656512e-05, + "loss": 0.0, + "step": 46414 + }, + { + "epoch": 4.330969487729775, + "grad_norm": NaN, + "learning_rate": 5.698852069681433e-05, + "loss": 0.0, + "step": 46415 + }, + { + "epoch": 4.331062797424653, + "grad_norm": NaN, + "learning_rate": 5.698258671356413e-05, + "loss": 0.0, + "step": 46416 + }, + { + "epoch": 4.331156107119529, + "grad_norm": NaN, + "learning_rate": 5.6976652966829645e-05, + "loss": 0.0, + "step": 46417 + }, + { + "epoch": 4.331249416814407, + "grad_norm": NaN, + "learning_rate": 5.6970719456625994e-05, + "loss": 0.0, + "step": 46418 + }, + { + "epoch": 4.331342726509284, + "grad_norm": NaN, + "learning_rate": 5.696478618296822e-05, + "loss": 0.0, + "step": 46419 + }, + { + "epoch": 4.3314360362041615, + "grad_norm": NaN, + "learning_rate": 5.695885314587145e-05, + "loss": 0.0, + "step": 46420 + }, + { + "epoch": 4.331529345899039, + "grad_norm": NaN, + "learning_rate": 5.695292034535076e-05, + "loss": 0.0, + "step": 46421 + }, + { + "epoch": 4.331622655593916, + "grad_norm": NaN, + "learning_rate": 5.6946987781421165e-05, + "loss": 0.0, + "step": 46422 + }, + { + "epoch": 4.331715965288794, + "grad_norm": NaN, + "learning_rate": 5.694105545409794e-05, + "loss": 0.0, + "step": 46423 + }, + { + "epoch": 4.331809274983671, + "grad_norm": NaN, + "learning_rate": 5.6935123363395985e-05, + "loss": 0.0, + "step": 46424 + }, + { + "epoch": 4.331902584678548, + "grad_norm": NaN, + "learning_rate": 5.692919150933038e-05, + "loss": 0.0, + "step": 46425 + }, + { + "epoch": 4.331995894373425, + "grad_norm": NaN, + "learning_rate": 5.692325989191638e-05, + "loss": 0.0, + "step": 46426 + }, + { + "epoch": 4.332089204068303, + "grad_norm": NaN, + "learning_rate": 5.691732851116892e-05, + "loss": 0.0, + "step": 46427 + }, + { + "epoch": 4.33218251376318, + "grad_norm": NaN, + "learning_rate": 5.691139736710305e-05, + "loss": 0.0, + "step": 46428 + }, + { + "epoch": 4.332275823458057, + "grad_norm": NaN, + "learning_rate": 5.6905466459734043e-05, + "loss": 0.0, + "step": 46429 + }, + { + "epoch": 4.332369133152935, + "grad_norm": NaN, + "learning_rate": 5.689953578907679e-05, + "loss": 0.0, + "step": 46430 + }, + { + "epoch": 4.332462442847812, + "grad_norm": NaN, + "learning_rate": 5.689360535514641e-05, + "loss": 0.0, + "step": 46431 + }, + { + "epoch": 4.332555752542689, + "grad_norm": NaN, + "learning_rate": 5.68876751579581e-05, + "loss": 0.0, + "step": 46432 + }, + { + "epoch": 4.332649062237566, + "grad_norm": NaN, + "learning_rate": 5.688174519752679e-05, + "loss": 0.0, + "step": 46433 + }, + { + "epoch": 4.332742371932444, + "grad_norm": NaN, + "learning_rate": 5.687581547386758e-05, + "loss": 0.0, + "step": 46434 + }, + { + "epoch": 4.332835681627321, + "grad_norm": NaN, + "learning_rate": 5.686988598699567e-05, + "loss": 0.0, + "step": 46435 + }, + { + "epoch": 4.332928991322198, + "grad_norm": NaN, + "learning_rate": 5.686395673692603e-05, + "loss": 0.0, + "step": 46436 + }, + { + "epoch": 4.333022301017076, + "grad_norm": NaN, + "learning_rate": 5.685802772367369e-05, + "loss": 0.0, + "step": 46437 + }, + { + "epoch": 4.333115610711953, + "grad_norm": NaN, + "learning_rate": 5.68520989472539e-05, + "loss": 0.0, + "step": 46438 + }, + { + "epoch": 4.333208920406831, + "grad_norm": NaN, + "learning_rate": 5.684617040768157e-05, + "loss": 0.0, + "step": 46439 + }, + { + "epoch": 4.333302230101707, + "grad_norm": NaN, + "learning_rate": 5.6840242104971804e-05, + "loss": 0.0, + "step": 46440 + }, + { + "epoch": 4.333395539796585, + "grad_norm": NaN, + "learning_rate": 5.6834314039139784e-05, + "loss": 0.0, + "step": 46441 + }, + { + "epoch": 4.333488849491462, + "grad_norm": NaN, + "learning_rate": 5.682838621020047e-05, + "loss": 0.0, + "step": 46442 + }, + { + "epoch": 4.3335821591863395, + "grad_norm": NaN, + "learning_rate": 5.682245861816892e-05, + "loss": 0.0, + "step": 46443 + }, + { + "epoch": 4.333675468881217, + "grad_norm": NaN, + "learning_rate": 5.681653126306035e-05, + "loss": 0.0, + "step": 46444 + }, + { + "epoch": 4.333768778576094, + "grad_norm": NaN, + "learning_rate": 5.68106041448897e-05, + "loss": 0.0, + "step": 46445 + }, + { + "epoch": 4.333862088270972, + "grad_norm": NaN, + "learning_rate": 5.6804677263672024e-05, + "loss": 0.0, + "step": 46446 + }, + { + "epoch": 4.333955397965848, + "grad_norm": NaN, + "learning_rate": 5.679875061942254e-05, + "loss": 0.0, + "step": 46447 + }, + { + "epoch": 4.334048707660726, + "grad_norm": NaN, + "learning_rate": 5.6792824212156175e-05, + "loss": 0.0, + "step": 46448 + }, + { + "epoch": 4.334142017355603, + "grad_norm": NaN, + "learning_rate": 5.678689804188801e-05, + "loss": 0.0, + "step": 46449 + }, + { + "epoch": 4.3342353270504805, + "grad_norm": NaN, + "learning_rate": 5.678097210863324e-05, + "loss": 0.0, + "step": 46450 + }, + { + "epoch": 4.334328636745358, + "grad_norm": NaN, + "learning_rate": 5.677504641240681e-05, + "loss": 0.0, + "step": 46451 + }, + { + "epoch": 4.334421946440235, + "grad_norm": NaN, + "learning_rate": 5.676912095322375e-05, + "loss": 0.0, + "step": 46452 + }, + { + "epoch": 4.334515256135113, + "grad_norm": NaN, + "learning_rate": 5.67631957310993e-05, + "loss": 0.0, + "step": 46453 + }, + { + "epoch": 4.334608565829989, + "grad_norm": NaN, + "learning_rate": 5.67572707460484e-05, + "loss": 0.0, + "step": 46454 + }, + { + "epoch": 4.334701875524867, + "grad_norm": NaN, + "learning_rate": 5.6751345998086064e-05, + "loss": 0.0, + "step": 46455 + }, + { + "epoch": 4.334795185219744, + "grad_norm": NaN, + "learning_rate": 5.6745421487227545e-05, + "loss": 0.0, + "step": 46456 + }, + { + "epoch": 4.334888494914622, + "grad_norm": NaN, + "learning_rate": 5.673949721348775e-05, + "loss": 0.0, + "step": 46457 + }, + { + "epoch": 4.334981804609499, + "grad_norm": NaN, + "learning_rate": 5.673357317688172e-05, + "loss": 0.0, + "step": 46458 + }, + { + "epoch": 4.335075114304376, + "grad_norm": NaN, + "learning_rate": 5.6727649377424704e-05, + "loss": 0.0, + "step": 46459 + }, + { + "epoch": 4.335168423999254, + "grad_norm": NaN, + "learning_rate": 5.672172581513153e-05, + "loss": 0.0, + "step": 46460 + }, + { + "epoch": 4.33526173369413, + "grad_norm": NaN, + "learning_rate": 5.671580249001748e-05, + "loss": 0.0, + "step": 46461 + }, + { + "epoch": 4.335355043389008, + "grad_norm": NaN, + "learning_rate": 5.670987940209748e-05, + "loss": 0.0, + "step": 46462 + }, + { + "epoch": 4.335448353083885, + "grad_norm": NaN, + "learning_rate": 5.670395655138655e-05, + "loss": 0.0, + "step": 46463 + }, + { + "epoch": 4.335541662778763, + "grad_norm": NaN, + "learning_rate": 5.669803393789992e-05, + "loss": 0.0, + "step": 46464 + }, + { + "epoch": 4.33563497247364, + "grad_norm": NaN, + "learning_rate": 5.669211156165253e-05, + "loss": 0.0, + "step": 46465 + }, + { + "epoch": 4.3357282821685175, + "grad_norm": NaN, + "learning_rate": 5.668618942265938e-05, + "loss": 0.0, + "step": 46466 + }, + { + "epoch": 4.335821591863395, + "grad_norm": NaN, + "learning_rate": 5.668026752093572e-05, + "loss": 0.0, + "step": 46467 + }, + { + "epoch": 4.335914901558272, + "grad_norm": NaN, + "learning_rate": 5.667434585649645e-05, + "loss": 0.0, + "step": 46468 + }, + { + "epoch": 4.336008211253149, + "grad_norm": NaN, + "learning_rate": 5.666842442935661e-05, + "loss": 0.0, + "step": 46469 + }, + { + "epoch": 4.336101520948026, + "grad_norm": NaN, + "learning_rate": 5.666250323953144e-05, + "loss": 0.0, + "step": 46470 + }, + { + "epoch": 4.336194830642904, + "grad_norm": NaN, + "learning_rate": 5.665658228703581e-05, + "loss": 0.0, + "step": 46471 + }, + { + "epoch": 4.336288140337781, + "grad_norm": NaN, + "learning_rate": 5.665066157188481e-05, + "loss": 0.0, + "step": 46472 + }, + { + "epoch": 4.3363814500326585, + "grad_norm": NaN, + "learning_rate": 5.6644741094093606e-05, + "loss": 0.0, + "step": 46473 + }, + { + "epoch": 4.336474759727536, + "grad_norm": NaN, + "learning_rate": 5.663882085367713e-05, + "loss": 0.0, + "step": 46474 + }, + { + "epoch": 4.336568069422413, + "grad_norm": NaN, + "learning_rate": 5.6632900850650426e-05, + "loss": 0.0, + "step": 46475 + }, + { + "epoch": 4.33666137911729, + "grad_norm": NaN, + "learning_rate": 5.66269810850287e-05, + "loss": 0.0, + "step": 46476 + }, + { + "epoch": 4.336754688812167, + "grad_norm": NaN, + "learning_rate": 5.662106155682685e-05, + "loss": 0.0, + "step": 46477 + }, + { + "epoch": 4.336847998507045, + "grad_norm": NaN, + "learning_rate": 5.661514226605992e-05, + "loss": 0.0, + "step": 46478 + }, + { + "epoch": 4.336941308201922, + "grad_norm": NaN, + "learning_rate": 5.6609223212743136e-05, + "loss": 0.0, + "step": 46479 + }, + { + "epoch": 4.3370346178968, + "grad_norm": NaN, + "learning_rate": 5.660330439689137e-05, + "loss": 0.0, + "step": 46480 + }, + { + "epoch": 4.337127927591677, + "grad_norm": NaN, + "learning_rate": 5.6597385818519684e-05, + "loss": 0.0, + "step": 46481 + }, + { + "epoch": 4.337221237286554, + "grad_norm": NaN, + "learning_rate": 5.6591467477643293e-05, + "loss": 0.0, + "step": 46482 + }, + { + "epoch": 4.337314546981432, + "grad_norm": NaN, + "learning_rate": 5.6585549374277076e-05, + "loss": 0.0, + "step": 46483 + }, + { + "epoch": 4.337407856676308, + "grad_norm": NaN, + "learning_rate": 5.657963150843608e-05, + "loss": 0.0, + "step": 46484 + }, + { + "epoch": 4.337501166371186, + "grad_norm": NaN, + "learning_rate": 5.65737138801355e-05, + "loss": 0.0, + "step": 46485 + }, + { + "epoch": 4.337594476066063, + "grad_norm": NaN, + "learning_rate": 5.6567796489390246e-05, + "loss": 0.0, + "step": 46486 + }, + { + "epoch": 4.337687785760941, + "grad_norm": NaN, + "learning_rate": 5.656187933621538e-05, + "loss": 0.0, + "step": 46487 + }, + { + "epoch": 4.337781095455818, + "grad_norm": NaN, + "learning_rate": 5.655596242062605e-05, + "loss": 0.0, + "step": 46488 + }, + { + "epoch": 4.3378744051506954, + "grad_norm": NaN, + "learning_rate": 5.655004574263719e-05, + "loss": 0.0, + "step": 46489 + }, + { + "epoch": 4.337967714845573, + "grad_norm": NaN, + "learning_rate": 5.654412930226382e-05, + "loss": 0.0, + "step": 46490 + }, + { + "epoch": 4.338061024540449, + "grad_norm": NaN, + "learning_rate": 5.653821309952116e-05, + "loss": 0.0, + "step": 46491 + }, + { + "epoch": 4.338154334235327, + "grad_norm": NaN, + "learning_rate": 5.6532297134424074e-05, + "loss": 0.0, + "step": 46492 + }, + { + "epoch": 4.338247643930204, + "grad_norm": NaN, + "learning_rate": 5.652638140698763e-05, + "loss": 0.0, + "step": 46493 + }, + { + "epoch": 4.338340953625082, + "grad_norm": NaN, + "learning_rate": 5.6520465917227014e-05, + "loss": 0.0, + "step": 46494 + }, + { + "epoch": 4.338434263319959, + "grad_norm": NaN, + "learning_rate": 5.65145506651571e-05, + "loss": 0.0, + "step": 46495 + }, + { + "epoch": 4.3385275730148365, + "grad_norm": NaN, + "learning_rate": 5.6508635650792956e-05, + "loss": 0.0, + "step": 46496 + }, + { + "epoch": 4.338620882709714, + "grad_norm": NaN, + "learning_rate": 5.650272087414976e-05, + "loss": 0.0, + "step": 46497 + }, + { + "epoch": 4.33871419240459, + "grad_norm": NaN, + "learning_rate": 5.649680633524232e-05, + "loss": 0.0, + "step": 46498 + }, + { + "epoch": 4.338807502099468, + "grad_norm": NaN, + "learning_rate": 5.649089203408587e-05, + "loss": 0.0, + "step": 46499 + }, + { + "epoch": 4.338900811794345, + "grad_norm": NaN, + "learning_rate": 5.648497797069544e-05, + "loss": 0.0, + "step": 46500 + }, + { + "epoch": 4.338994121489223, + "grad_norm": NaN, + "learning_rate": 5.64790641450859e-05, + "loss": 0.0, + "step": 46501 + }, + { + "epoch": 4.3390874311841, + "grad_norm": NaN, + "learning_rate": 5.647315055727244e-05, + "loss": 0.0, + "step": 46502 + }, + { + "epoch": 4.3391807408789775, + "grad_norm": NaN, + "learning_rate": 5.646723720727013e-05, + "loss": 0.0, + "step": 46503 + }, + { + "epoch": 4.339274050573855, + "grad_norm": NaN, + "learning_rate": 5.646132409509381e-05, + "loss": 0.0, + "step": 46504 + }, + { + "epoch": 4.3393673602687315, + "grad_norm": NaN, + "learning_rate": 5.645541122075873e-05, + "loss": 0.0, + "step": 46505 + }, + { + "epoch": 4.339460669963609, + "grad_norm": NaN, + "learning_rate": 5.644949858427979e-05, + "loss": 0.0, + "step": 46506 + }, + { + "epoch": 4.339553979658486, + "grad_norm": NaN, + "learning_rate": 5.644358618567201e-05, + "loss": 0.0, + "step": 46507 + }, + { + "epoch": 4.339647289353364, + "grad_norm": NaN, + "learning_rate": 5.6437674024950574e-05, + "loss": 0.0, + "step": 46508 + }, + { + "epoch": 4.339740599048241, + "grad_norm": NaN, + "learning_rate": 5.643176210213037e-05, + "loss": 0.0, + "step": 46509 + }, + { + "epoch": 4.339833908743119, + "grad_norm": NaN, + "learning_rate": 5.642585041722643e-05, + "loss": 0.0, + "step": 46510 + }, + { + "epoch": 4.339927218437996, + "grad_norm": NaN, + "learning_rate": 5.6419938970253936e-05, + "loss": 0.0, + "step": 46511 + }, + { + "epoch": 4.340020528132873, + "grad_norm": NaN, + "learning_rate": 5.641402776122775e-05, + "loss": 0.0, + "step": 46512 + }, + { + "epoch": 4.34011383782775, + "grad_norm": NaN, + "learning_rate": 5.6408116790162935e-05, + "loss": 0.0, + "step": 46513 + }, + { + "epoch": 4.340207147522627, + "grad_norm": NaN, + "learning_rate": 5.640220605707466e-05, + "loss": 0.0, + "step": 46514 + }, + { + "epoch": 4.340300457217505, + "grad_norm": NaN, + "learning_rate": 5.639629556197779e-05, + "loss": 0.0, + "step": 46515 + }, + { + "epoch": 4.340393766912382, + "grad_norm": NaN, + "learning_rate": 5.639038530488736e-05, + "loss": 0.0, + "step": 46516 + }, + { + "epoch": 4.34048707660726, + "grad_norm": NaN, + "learning_rate": 5.6384475285818556e-05, + "loss": 0.0, + "step": 46517 + }, + { + "epoch": 4.340580386302137, + "grad_norm": NaN, + "learning_rate": 5.637856550478625e-05, + "loss": 0.0, + "step": 46518 + }, + { + "epoch": 4.3406736959970145, + "grad_norm": NaN, + "learning_rate": 5.637265596180547e-05, + "loss": 0.0, + "step": 46519 + }, + { + "epoch": 4.340767005691891, + "grad_norm": NaN, + "learning_rate": 5.636674665689139e-05, + "loss": 0.0, + "step": 46520 + }, + { + "epoch": 4.340860315386768, + "grad_norm": NaN, + "learning_rate": 5.636083759005889e-05, + "loss": 0.0, + "step": 46521 + }, + { + "epoch": 4.340953625081646, + "grad_norm": NaN, + "learning_rate": 5.635492876132299e-05, + "loss": 0.0, + "step": 46522 + }, + { + "epoch": 4.341046934776523, + "grad_norm": NaN, + "learning_rate": 5.634902017069887e-05, + "loss": 0.0, + "step": 46523 + }, + { + "epoch": 4.341140244471401, + "grad_norm": NaN, + "learning_rate": 5.6343111818201385e-05, + "loss": 0.0, + "step": 46524 + }, + { + "epoch": 4.341233554166278, + "grad_norm": NaN, + "learning_rate": 5.633720370384559e-05, + "loss": 0.0, + "step": 46525 + }, + { + "epoch": 4.3413268638611555, + "grad_norm": NaN, + "learning_rate": 5.633129582764664e-05, + "loss": 0.0, + "step": 46526 + }, + { + "epoch": 4.341420173556033, + "grad_norm": NaN, + "learning_rate": 5.63253881896194e-05, + "loss": 0.0, + "step": 46527 + }, + { + "epoch": 4.3415134832509095, + "grad_norm": NaN, + "learning_rate": 5.63194807897789e-05, + "loss": 0.0, + "step": 46528 + }, + { + "epoch": 4.341606792945787, + "grad_norm": NaN, + "learning_rate": 5.631357362814031e-05, + "loss": 0.0, + "step": 46529 + }, + { + "epoch": 4.341700102640664, + "grad_norm": NaN, + "learning_rate": 5.63076667047185e-05, + "loss": 0.0, + "step": 46530 + }, + { + "epoch": 4.341793412335542, + "grad_norm": NaN, + "learning_rate": 5.630176001952851e-05, + "loss": 0.0, + "step": 46531 + }, + { + "epoch": 4.341886722030419, + "grad_norm": NaN, + "learning_rate": 5.629585357258547e-05, + "loss": 0.0, + "step": 46532 + }, + { + "epoch": 4.341980031725297, + "grad_norm": NaN, + "learning_rate": 5.628994736390426e-05, + "loss": 0.0, + "step": 46533 + }, + { + "epoch": 4.342073341420173, + "grad_norm": NaN, + "learning_rate": 5.6284041393499914e-05, + "loss": 0.0, + "step": 46534 + }, + { + "epoch": 4.3421666511150505, + "grad_norm": NaN, + "learning_rate": 5.62781356613876e-05, + "loss": 0.0, + "step": 46535 + }, + { + "epoch": 4.342259960809928, + "grad_norm": NaN, + "learning_rate": 5.62722301675821e-05, + "loss": 0.0, + "step": 46536 + }, + { + "epoch": 4.342353270504805, + "grad_norm": NaN, + "learning_rate": 5.626632491209862e-05, + "loss": 0.0, + "step": 46537 + }, + { + "epoch": 4.342446580199683, + "grad_norm": NaN, + "learning_rate": 5.626041989495217e-05, + "loss": 0.0, + "step": 46538 + }, + { + "epoch": 4.34253988989456, + "grad_norm": NaN, + "learning_rate": 5.625451511615758e-05, + "loss": 0.0, + "step": 46539 + }, + { + "epoch": 4.342633199589438, + "grad_norm": NaN, + "learning_rate": 5.624861057573006e-05, + "loss": 0.0, + "step": 46540 + }, + { + "epoch": 4.342726509284315, + "grad_norm": NaN, + "learning_rate": 5.62427062736846e-05, + "loss": 0.0, + "step": 46541 + }, + { + "epoch": 4.342819818979192, + "grad_norm": NaN, + "learning_rate": 5.623680221003605e-05, + "loss": 0.0, + "step": 46542 + }, + { + "epoch": 4.342913128674069, + "grad_norm": NaN, + "learning_rate": 5.62308983847996e-05, + "loss": 0.0, + "step": 46543 + }, + { + "epoch": 4.343006438368946, + "grad_norm": NaN, + "learning_rate": 5.622499479799024e-05, + "loss": 0.0, + "step": 46544 + }, + { + "epoch": 4.343099748063824, + "grad_norm": NaN, + "learning_rate": 5.621909144962285e-05, + "loss": 0.0, + "step": 46545 + }, + { + "epoch": 4.343193057758701, + "grad_norm": NaN, + "learning_rate": 5.621318833971258e-05, + "loss": 0.0, + "step": 46546 + }, + { + "epoch": 4.343286367453579, + "grad_norm": NaN, + "learning_rate": 5.620728546827443e-05, + "loss": 0.0, + "step": 46547 + }, + { + "epoch": 4.343379677148456, + "grad_norm": NaN, + "learning_rate": 5.6201382835323284e-05, + "loss": 0.0, + "step": 46548 + }, + { + "epoch": 4.343472986843333, + "grad_norm": NaN, + "learning_rate": 5.6195480440874265e-05, + "loss": 0.0, + "step": 46549 + }, + { + "epoch": 4.34356629653821, + "grad_norm": NaN, + "learning_rate": 5.618957828494243e-05, + "loss": 0.0, + "step": 46550 + }, + { + "epoch": 4.3436596062330874, + "grad_norm": NaN, + "learning_rate": 5.618367636754258e-05, + "loss": 0.0, + "step": 46551 + }, + { + "epoch": 4.343752915927965, + "grad_norm": NaN, + "learning_rate": 5.617777468868996e-05, + "loss": 0.0, + "step": 46552 + }, + { + "epoch": 4.343846225622842, + "grad_norm": NaN, + "learning_rate": 5.6171873248399435e-05, + "loss": 0.0, + "step": 46553 + }, + { + "epoch": 4.34393953531772, + "grad_norm": NaN, + "learning_rate": 5.6165972046685983e-05, + "loss": 0.0, + "step": 46554 + }, + { + "epoch": 4.344032845012597, + "grad_norm": NaN, + "learning_rate": 5.616007108356476e-05, + "loss": 0.0, + "step": 46555 + }, + { + "epoch": 4.3441261547074745, + "grad_norm": NaN, + "learning_rate": 5.615417035905063e-05, + "loss": 0.0, + "step": 46556 + }, + { + "epoch": 4.344219464402351, + "grad_norm": NaN, + "learning_rate": 5.6148269873158595e-05, + "loss": 0.0, + "step": 46557 + }, + { + "epoch": 4.3443127740972285, + "grad_norm": NaN, + "learning_rate": 5.6142369625903824e-05, + "loss": 0.0, + "step": 46558 + }, + { + "epoch": 4.344406083792106, + "grad_norm": NaN, + "learning_rate": 5.613646961730113e-05, + "loss": 0.0, + "step": 46559 + }, + { + "epoch": 4.344499393486983, + "grad_norm": NaN, + "learning_rate": 5.6130569847365556e-05, + "loss": 0.0, + "step": 46560 + }, + { + "epoch": 4.344592703181861, + "grad_norm": NaN, + "learning_rate": 5.6124670316112235e-05, + "loss": 0.0, + "step": 46561 + }, + { + "epoch": 4.344686012876738, + "grad_norm": NaN, + "learning_rate": 5.6118771023556016e-05, + "loss": 0.0, + "step": 46562 + }, + { + "epoch": 4.344779322571616, + "grad_norm": NaN, + "learning_rate": 5.611287196971191e-05, + "loss": 0.0, + "step": 46563 + }, + { + "epoch": 4.344872632266492, + "grad_norm": NaN, + "learning_rate": 5.610697315459504e-05, + "loss": 0.0, + "step": 46564 + }, + { + "epoch": 4.3449659419613695, + "grad_norm": NaN, + "learning_rate": 5.61010745782203e-05, + "loss": 0.0, + "step": 46565 + }, + { + "epoch": 4.345059251656247, + "grad_norm": NaN, + "learning_rate": 5.609517624060264e-05, + "loss": 0.0, + "step": 46566 + }, + { + "epoch": 4.345152561351124, + "grad_norm": NaN, + "learning_rate": 5.608927814175725e-05, + "loss": 0.0, + "step": 46567 + }, + { + "epoch": 4.345245871046002, + "grad_norm": NaN, + "learning_rate": 5.608338028169893e-05, + "loss": 0.0, + "step": 46568 + }, + { + "epoch": 4.345339180740879, + "grad_norm": NaN, + "learning_rate": 5.6077482660442726e-05, + "loss": 0.0, + "step": 46569 + }, + { + "epoch": 4.345432490435757, + "grad_norm": NaN, + "learning_rate": 5.607158527800377e-05, + "loss": 0.0, + "step": 46570 + }, + { + "epoch": 4.345525800130633, + "grad_norm": NaN, + "learning_rate": 5.606568813439683e-05, + "loss": 0.0, + "step": 46571 + }, + { + "epoch": 4.345619109825511, + "grad_norm": NaN, + "learning_rate": 5.6059791229637066e-05, + "loss": 0.0, + "step": 46572 + }, + { + "epoch": 4.345712419520388, + "grad_norm": NaN, + "learning_rate": 5.605389456373949e-05, + "loss": 0.0, + "step": 46573 + }, + { + "epoch": 4.345805729215265, + "grad_norm": NaN, + "learning_rate": 5.604799813671893e-05, + "loss": 0.0, + "step": 46574 + }, + { + "epoch": 4.345899038910143, + "grad_norm": NaN, + "learning_rate": 5.6042101948590517e-05, + "loss": 0.0, + "step": 46575 + }, + { + "epoch": 4.34599234860502, + "grad_norm": NaN, + "learning_rate": 5.603620599936927e-05, + "loss": 0.0, + "step": 46576 + }, + { + "epoch": 4.346085658299898, + "grad_norm": NaN, + "learning_rate": 5.603031028907001e-05, + "loss": 0.0, + "step": 46577 + }, + { + "epoch": 4.346178967994774, + "grad_norm": NaN, + "learning_rate": 5.602441481770788e-05, + "loss": 0.0, + "step": 46578 + }, + { + "epoch": 4.346272277689652, + "grad_norm": NaN, + "learning_rate": 5.6018519585297894e-05, + "loss": 0.0, + "step": 46579 + }, + { + "epoch": 4.346365587384529, + "grad_norm": NaN, + "learning_rate": 5.601262459185485e-05, + "loss": 0.0, + "step": 46580 + }, + { + "epoch": 4.3464588970794065, + "grad_norm": NaN, + "learning_rate": 5.600672983739393e-05, + "loss": 0.0, + "step": 46581 + }, + { + "epoch": 4.346552206774284, + "grad_norm": NaN, + "learning_rate": 5.60008353219301e-05, + "loss": 0.0, + "step": 46582 + }, + { + "epoch": 4.346645516469161, + "grad_norm": NaN, + "learning_rate": 5.5994941045478195e-05, + "loss": 0.0, + "step": 46583 + }, + { + "epoch": 4.346738826164039, + "grad_norm": NaN, + "learning_rate": 5.598904700805337e-05, + "loss": 0.0, + "step": 46584 + }, + { + "epoch": 4.346832135858916, + "grad_norm": NaN, + "learning_rate": 5.598315320967058e-05, + "loss": 0.0, + "step": 46585 + }, + { + "epoch": 4.346925445553793, + "grad_norm": NaN, + "learning_rate": 5.59772596503447e-05, + "loss": 0.0, + "step": 46586 + }, + { + "epoch": 4.34701875524867, + "grad_norm": NaN, + "learning_rate": 5.5971366330090824e-05, + "loss": 0.0, + "step": 46587 + }, + { + "epoch": 4.3471120649435475, + "grad_norm": NaN, + "learning_rate": 5.5965473248923994e-05, + "loss": 0.0, + "step": 46588 + }, + { + "epoch": 4.347205374638425, + "grad_norm": NaN, + "learning_rate": 5.595958040685898e-05, + "loss": 0.0, + "step": 46589 + }, + { + "epoch": 4.347298684333302, + "grad_norm": NaN, + "learning_rate": 5.595368780391096e-05, + "loss": 0.0, + "step": 46590 + }, + { + "epoch": 4.34739199402818, + "grad_norm": NaN, + "learning_rate": 5.59477954400949e-05, + "loss": 0.0, + "step": 46591 + }, + { + "epoch": 4.347485303723057, + "grad_norm": NaN, + "learning_rate": 5.5941903315425646e-05, + "loss": 0.0, + "step": 46592 + }, + { + "epoch": 4.347578613417934, + "grad_norm": NaN, + "learning_rate": 5.593601142991831e-05, + "loss": 0.0, + "step": 46593 + }, + { + "epoch": 4.347671923112811, + "grad_norm": NaN, + "learning_rate": 5.5930119783587876e-05, + "loss": 0.0, + "step": 46594 + }, + { + "epoch": 4.347765232807689, + "grad_norm": NaN, + "learning_rate": 5.592422837644919e-05, + "loss": 0.0, + "step": 46595 + }, + { + "epoch": 4.347858542502566, + "grad_norm": NaN, + "learning_rate": 5.5918337208517425e-05, + "loss": 0.0, + "step": 46596 + }, + { + "epoch": 4.347951852197443, + "grad_norm": NaN, + "learning_rate": 5.591244627980742e-05, + "loss": 0.0, + "step": 46597 + }, + { + "epoch": 4.348045161892321, + "grad_norm": NaN, + "learning_rate": 5.590655559033413e-05, + "loss": 0.0, + "step": 46598 + }, + { + "epoch": 4.348138471587198, + "grad_norm": NaN, + "learning_rate": 5.59006651401127e-05, + "loss": 0.0, + "step": 46599 + }, + { + "epoch": 4.348231781282076, + "grad_norm": NaN, + "learning_rate": 5.5894774929157956e-05, + "loss": 0.0, + "step": 46600 + }, + { + "epoch": 4.348325090976952, + "grad_norm": NaN, + "learning_rate": 5.588888495748487e-05, + "loss": 0.0, + "step": 46601 + }, + { + "epoch": 4.34841840067183, + "grad_norm": NaN, + "learning_rate": 5.5882995225108574e-05, + "loss": 0.0, + "step": 46602 + }, + { + "epoch": 4.348511710366707, + "grad_norm": NaN, + "learning_rate": 5.587710573204389e-05, + "loss": 0.0, + "step": 46603 + }, + { + "epoch": 4.3486050200615844, + "grad_norm": NaN, + "learning_rate": 5.587121647830579e-05, + "loss": 0.0, + "step": 46604 + }, + { + "epoch": 4.348698329756462, + "grad_norm": NaN, + "learning_rate": 5.586532746390941e-05, + "loss": 0.0, + "step": 46605 + }, + { + "epoch": 4.348791639451339, + "grad_norm": NaN, + "learning_rate": 5.585943868886956e-05, + "loss": 0.0, + "step": 46606 + }, + { + "epoch": 4.348884949146217, + "grad_norm": NaN, + "learning_rate": 5.585355015320122e-05, + "loss": 0.0, + "step": 46607 + }, + { + "epoch": 4.348978258841093, + "grad_norm": NaN, + "learning_rate": 5.5847661856919527e-05, + "loss": 0.0, + "step": 46608 + }, + { + "epoch": 4.349071568535971, + "grad_norm": NaN, + "learning_rate": 5.5841773800039225e-05, + "loss": 0.0, + "step": 46609 + }, + { + "epoch": 4.349164878230848, + "grad_norm": NaN, + "learning_rate": 5.583588598257546e-05, + "loss": 0.0, + "step": 46610 + }, + { + "epoch": 4.3492581879257255, + "grad_norm": NaN, + "learning_rate": 5.582999840454321e-05, + "loss": 0.0, + "step": 46611 + }, + { + "epoch": 4.349351497620603, + "grad_norm": NaN, + "learning_rate": 5.582411106595725e-05, + "loss": 0.0, + "step": 46612 + }, + { + "epoch": 4.34944480731548, + "grad_norm": NaN, + "learning_rate": 5.5818223966832744e-05, + "loss": 0.0, + "step": 46613 + }, + { + "epoch": 4.349538117010358, + "grad_norm": NaN, + "learning_rate": 5.581233710718464e-05, + "loss": 0.0, + "step": 46614 + }, + { + "epoch": 4.349631426705234, + "grad_norm": NaN, + "learning_rate": 5.580645048702777e-05, + "loss": 0.0, + "step": 46615 + }, + { + "epoch": 4.349724736400112, + "grad_norm": NaN, + "learning_rate": 5.580056410637722e-05, + "loss": 0.0, + "step": 46616 + }, + { + "epoch": 4.349818046094989, + "grad_norm": NaN, + "learning_rate": 5.5794677965248016e-05, + "loss": 0.0, + "step": 46617 + }, + { + "epoch": 4.3499113557898665, + "grad_norm": NaN, + "learning_rate": 5.5788792063654914e-05, + "loss": 0.0, + "step": 46618 + }, + { + "epoch": 4.350004665484744, + "grad_norm": NaN, + "learning_rate": 5.5782906401613056e-05, + "loss": 0.0, + "step": 46619 + }, + { + "epoch": 4.350097975179621, + "grad_norm": NaN, + "learning_rate": 5.577702097913744e-05, + "loss": 0.0, + "step": 46620 + }, + { + "epoch": 4.350191284874499, + "grad_norm": NaN, + "learning_rate": 5.577113579624282e-05, + "loss": 0.0, + "step": 46621 + }, + { + "epoch": 4.350284594569375, + "grad_norm": NaN, + "learning_rate": 5.5765250852944346e-05, + "loss": 0.0, + "step": 46622 + }, + { + "epoch": 4.350377904264253, + "grad_norm": NaN, + "learning_rate": 5.575936614925697e-05, + "loss": 0.0, + "step": 46623 + }, + { + "epoch": 4.35047121395913, + "grad_norm": NaN, + "learning_rate": 5.5753481685195514e-05, + "loss": 0.0, + "step": 46624 + }, + { + "epoch": 4.350564523654008, + "grad_norm": NaN, + "learning_rate": 5.574759746077508e-05, + "loss": 0.0, + "step": 46625 + }, + { + "epoch": 4.350657833348885, + "grad_norm": NaN, + "learning_rate": 5.574171347601065e-05, + "loss": 0.0, + "step": 46626 + }, + { + "epoch": 4.350751143043762, + "grad_norm": NaN, + "learning_rate": 5.573582973091701e-05, + "loss": 0.0, + "step": 46627 + }, + { + "epoch": 4.35084445273864, + "grad_norm": NaN, + "learning_rate": 5.572994622550929e-05, + "loss": 0.0, + "step": 46628 + }, + { + "epoch": 4.350937762433517, + "grad_norm": NaN, + "learning_rate": 5.5724062959802435e-05, + "loss": 0.0, + "step": 46629 + }, + { + "epoch": 4.351031072128394, + "grad_norm": NaN, + "learning_rate": 5.5718179933811256e-05, + "loss": 0.0, + "step": 46630 + }, + { + "epoch": 4.351124381823271, + "grad_norm": NaN, + "learning_rate": 5.571229714755088e-05, + "loss": 0.0, + "step": 46631 + }, + { + "epoch": 4.351217691518149, + "grad_norm": NaN, + "learning_rate": 5.570641460103625e-05, + "loss": 0.0, + "step": 46632 + }, + { + "epoch": 4.351311001213026, + "grad_norm": NaN, + "learning_rate": 5.570053229428218e-05, + "loss": 0.0, + "step": 46633 + }, + { + "epoch": 4.3514043109079035, + "grad_norm": NaN, + "learning_rate": 5.569465022730375e-05, + "loss": 0.0, + "step": 46634 + }, + { + "epoch": 4.351497620602781, + "grad_norm": NaN, + "learning_rate": 5.568876840011596e-05, + "loss": 0.0, + "step": 46635 + }, + { + "epoch": 4.351590930297658, + "grad_norm": NaN, + "learning_rate": 5.5682886812733585e-05, + "loss": 0.0, + "step": 46636 + }, + { + "epoch": 4.351684239992535, + "grad_norm": NaN, + "learning_rate": 5.5677005465171754e-05, + "loss": 0.0, + "step": 46637 + }, + { + "epoch": 4.351777549687412, + "grad_norm": NaN, + "learning_rate": 5.5671124357445405e-05, + "loss": 0.0, + "step": 46638 + }, + { + "epoch": 4.35187085938229, + "grad_norm": NaN, + "learning_rate": 5.566524348956933e-05, + "loss": 0.0, + "step": 46639 + }, + { + "epoch": 4.351964169077167, + "grad_norm": NaN, + "learning_rate": 5.56593628615587e-05, + "loss": 0.0, + "step": 46640 + }, + { + "epoch": 4.3520574787720445, + "grad_norm": NaN, + "learning_rate": 5.5653482473428326e-05, + "loss": 0.0, + "step": 46641 + }, + { + "epoch": 4.352150788466922, + "grad_norm": NaN, + "learning_rate": 5.5647602325193144e-05, + "loss": 0.0, + "step": 46642 + }, + { + "epoch": 4.352244098161799, + "grad_norm": NaN, + "learning_rate": 5.564172241686826e-05, + "loss": 0.0, + "step": 46643 + }, + { + "epoch": 4.352337407856677, + "grad_norm": NaN, + "learning_rate": 5.5635842748468475e-05, + "loss": 0.0, + "step": 46644 + }, + { + "epoch": 4.352430717551553, + "grad_norm": NaN, + "learning_rate": 5.5629963320008736e-05, + "loss": 0.0, + "step": 46645 + }, + { + "epoch": 4.352524027246431, + "grad_norm": NaN, + "learning_rate": 5.562408413150414e-05, + "loss": 0.0, + "step": 46646 + }, + { + "epoch": 4.352617336941308, + "grad_norm": NaN, + "learning_rate": 5.561820518296945e-05, + "loss": 0.0, + "step": 46647 + }, + { + "epoch": 4.352710646636186, + "grad_norm": NaN, + "learning_rate": 5.5612326474419746e-05, + "loss": 0.0, + "step": 46648 + }, + { + "epoch": 4.352803956331063, + "grad_norm": NaN, + "learning_rate": 5.560644800587001e-05, + "loss": 0.0, + "step": 46649 + }, + { + "epoch": 4.35289726602594, + "grad_norm": NaN, + "learning_rate": 5.5600569777334996e-05, + "loss": 0.0, + "step": 46650 + }, + { + "epoch": 4.352990575720817, + "grad_norm": NaN, + "learning_rate": 5.559469178882982e-05, + "loss": 0.0, + "step": 46651 + }, + { + "epoch": 4.353083885415694, + "grad_norm": NaN, + "learning_rate": 5.558881404036943e-05, + "loss": 0.0, + "step": 46652 + }, + { + "epoch": 4.353177195110572, + "grad_norm": NaN, + "learning_rate": 5.5582936531968625e-05, + "loss": 0.0, + "step": 46653 + }, + { + "epoch": 4.353270504805449, + "grad_norm": NaN, + "learning_rate": 5.5577059263642496e-05, + "loss": 0.0, + "step": 46654 + }, + { + "epoch": 4.353363814500327, + "grad_norm": NaN, + "learning_rate": 5.557118223540598e-05, + "loss": 0.0, + "step": 46655 + }, + { + "epoch": 4.353457124195204, + "grad_norm": NaN, + "learning_rate": 5.556530544727386e-05, + "loss": 0.0, + "step": 46656 + }, + { + "epoch": 4.3535504338900815, + "grad_norm": NaN, + "learning_rate": 5.555942889926127e-05, + "loss": 0.0, + "step": 46657 + }, + { + "epoch": 4.353643743584959, + "grad_norm": NaN, + "learning_rate": 5.555355259138311e-05, + "loss": 0.0, + "step": 46658 + }, + { + "epoch": 4.353737053279835, + "grad_norm": NaN, + "learning_rate": 5.55476765236542e-05, + "loss": 0.0, + "step": 46659 + }, + { + "epoch": 4.353830362974713, + "grad_norm": NaN, + "learning_rate": 5.5541800696089613e-05, + "loss": 0.0, + "step": 46660 + }, + { + "epoch": 4.35392367266959, + "grad_norm": NaN, + "learning_rate": 5.55359251087043e-05, + "loss": 0.0, + "step": 46661 + }, + { + "epoch": 4.354016982364468, + "grad_norm": NaN, + "learning_rate": 5.553004976151304e-05, + "loss": 0.0, + "step": 46662 + }, + { + "epoch": 4.354110292059345, + "grad_norm": NaN, + "learning_rate": 5.552417465453094e-05, + "loss": 0.0, + "step": 46663 + }, + { + "epoch": 4.3542036017542225, + "grad_norm": NaN, + "learning_rate": 5.551829978777292e-05, + "loss": 0.0, + "step": 46664 + }, + { + "epoch": 4.3542969114491, + "grad_norm": NaN, + "learning_rate": 5.551242516125378e-05, + "loss": 0.0, + "step": 46665 + }, + { + "epoch": 4.3543902211439764, + "grad_norm": NaN, + "learning_rate": 5.55065507749886e-05, + "loss": 0.0, + "step": 46666 + }, + { + "epoch": 4.354483530838854, + "grad_norm": NaN, + "learning_rate": 5.550067662899233e-05, + "loss": 0.0, + "step": 46667 + }, + { + "epoch": 4.354576840533731, + "grad_norm": NaN, + "learning_rate": 5.5494802723279734e-05, + "loss": 0.0, + "step": 46668 + }, + { + "epoch": 4.354670150228609, + "grad_norm": NaN, + "learning_rate": 5.548892905786592e-05, + "loss": 0.0, + "step": 46669 + }, + { + "epoch": 4.354763459923486, + "grad_norm": NaN, + "learning_rate": 5.5483055632765813e-05, + "loss": 0.0, + "step": 46670 + }, + { + "epoch": 4.3548567696183635, + "grad_norm": NaN, + "learning_rate": 5.547718244799419e-05, + "loss": 0.0, + "step": 46671 + }, + { + "epoch": 4.354950079313241, + "grad_norm": NaN, + "learning_rate": 5.5471309503566146e-05, + "loss": 0.0, + "step": 46672 + }, + { + "epoch": 4.355043389008118, + "grad_norm": NaN, + "learning_rate": 5.54654367994966e-05, + "loss": 0.0, + "step": 46673 + }, + { + "epoch": 4.355136698702995, + "grad_norm": NaN, + "learning_rate": 5.5459564335800366e-05, + "loss": 0.0, + "step": 46674 + }, + { + "epoch": 4.355230008397872, + "grad_norm": NaN, + "learning_rate": 5.5453692112492486e-05, + "loss": 0.0, + "step": 46675 + }, + { + "epoch": 4.35532331809275, + "grad_norm": NaN, + "learning_rate": 5.544782012958792e-05, + "loss": 0.0, + "step": 46676 + }, + { + "epoch": 4.355416627787627, + "grad_norm": NaN, + "learning_rate": 5.544194838710144e-05, + "loss": 0.0, + "step": 46677 + }, + { + "epoch": 4.355509937482505, + "grad_norm": NaN, + "learning_rate": 5.543607688504812e-05, + "loss": 0.0, + "step": 46678 + }, + { + "epoch": 4.355603247177382, + "grad_norm": NaN, + "learning_rate": 5.543020562344289e-05, + "loss": 0.0, + "step": 46679 + }, + { + "epoch": 4.355696556872259, + "grad_norm": NaN, + "learning_rate": 5.542433460230055e-05, + "loss": 0.0, + "step": 46680 + }, + { + "epoch": 4.355789866567136, + "grad_norm": NaN, + "learning_rate": 5.541846382163614e-05, + "loss": 0.0, + "step": 46681 + }, + { + "epoch": 4.355883176262013, + "grad_norm": NaN, + "learning_rate": 5.541259328146462e-05, + "loss": 0.0, + "step": 46682 + }, + { + "epoch": 4.355976485956891, + "grad_norm": NaN, + "learning_rate": 5.5406722981800774e-05, + "loss": 0.0, + "step": 46683 + }, + { + "epoch": 4.356069795651768, + "grad_norm": NaN, + "learning_rate": 5.5400852922659636e-05, + "loss": 0.0, + "step": 46684 + }, + { + "epoch": 4.356163105346646, + "grad_norm": NaN, + "learning_rate": 5.5394983104056115e-05, + "loss": 0.0, + "step": 46685 + }, + { + "epoch": 4.356256415041523, + "grad_norm": NaN, + "learning_rate": 5.538911352600513e-05, + "loss": 0.0, + "step": 46686 + }, + { + "epoch": 4.3563497247364005, + "grad_norm": NaN, + "learning_rate": 5.5383244188521645e-05, + "loss": 0.0, + "step": 46687 + }, + { + "epoch": 4.356443034431277, + "grad_norm": NaN, + "learning_rate": 5.537737509162045e-05, + "loss": 0.0, + "step": 46688 + }, + { + "epoch": 4.356536344126154, + "grad_norm": NaN, + "learning_rate": 5.5371506235316615e-05, + "loss": 0.0, + "step": 46689 + }, + { + "epoch": 4.356629653821032, + "grad_norm": NaN, + "learning_rate": 5.536563761962506e-05, + "loss": 0.0, + "step": 46690 + }, + { + "epoch": 4.356722963515909, + "grad_norm": NaN, + "learning_rate": 5.5359769244560536e-05, + "loss": 0.0, + "step": 46691 + }, + { + "epoch": 4.356816273210787, + "grad_norm": NaN, + "learning_rate": 5.535390111013816e-05, + "loss": 0.0, + "step": 46692 + }, + { + "epoch": 4.356909582905664, + "grad_norm": NaN, + "learning_rate": 5.5348033216372814e-05, + "loss": 0.0, + "step": 46693 + }, + { + "epoch": 4.3570028926005415, + "grad_norm": NaN, + "learning_rate": 5.534216556327928e-05, + "loss": 0.0, + "step": 46694 + }, + { + "epoch": 4.357096202295418, + "grad_norm": NaN, + "learning_rate": 5.533629815087263e-05, + "loss": 0.0, + "step": 46695 + }, + { + "epoch": 4.3571895119902955, + "grad_norm": NaN, + "learning_rate": 5.5330430979167787e-05, + "loss": 0.0, + "step": 46696 + }, + { + "epoch": 4.357282821685173, + "grad_norm": NaN, + "learning_rate": 5.53245640481795e-05, + "loss": 0.0, + "step": 46697 + }, + { + "epoch": 4.35737613138005, + "grad_norm": NaN, + "learning_rate": 5.5318697357922874e-05, + "loss": 0.0, + "step": 46698 + }, + { + "epoch": 4.357469441074928, + "grad_norm": NaN, + "learning_rate": 5.53128309084128e-05, + "loss": 0.0, + "step": 46699 + }, + { + "epoch": 4.357562750769805, + "grad_norm": NaN, + "learning_rate": 5.530696469966404e-05, + "loss": 0.0, + "step": 46700 + }, + { + "epoch": 4.357656060464683, + "grad_norm": NaN, + "learning_rate": 5.5301098731691664e-05, + "loss": 0.0, + "step": 46701 + }, + { + "epoch": 4.35774937015956, + "grad_norm": NaN, + "learning_rate": 5.52952330045106e-05, + "loss": 0.0, + "step": 46702 + }, + { + "epoch": 4.3578426798544365, + "grad_norm": NaN, + "learning_rate": 5.52893675181356e-05, + "loss": 0.0, + "step": 46703 + }, + { + "epoch": 4.357935989549314, + "grad_norm": NaN, + "learning_rate": 5.528350227258175e-05, + "loss": 0.0, + "step": 46704 + }, + { + "epoch": 4.358029299244191, + "grad_norm": NaN, + "learning_rate": 5.5277637267863926e-05, + "loss": 0.0, + "step": 46705 + }, + { + "epoch": 4.358122608939069, + "grad_norm": NaN, + "learning_rate": 5.5271772503996914e-05, + "loss": 0.0, + "step": 46706 + }, + { + "epoch": 4.358215918633946, + "grad_norm": NaN, + "learning_rate": 5.5265907980995776e-05, + "loss": 0.0, + "step": 46707 + }, + { + "epoch": 4.358309228328824, + "grad_norm": NaN, + "learning_rate": 5.526004369887544e-05, + "loss": 0.0, + "step": 46708 + }, + { + "epoch": 4.358402538023701, + "grad_norm": NaN, + "learning_rate": 5.5254179657650623e-05, + "loss": 0.0, + "step": 46709 + }, + { + "epoch": 4.358495847718578, + "grad_norm": NaN, + "learning_rate": 5.524831585733643e-05, + "loss": 0.0, + "step": 46710 + }, + { + "epoch": 4.358589157413455, + "grad_norm": NaN, + "learning_rate": 5.524245229794775e-05, + "loss": 0.0, + "step": 46711 + }, + { + "epoch": 4.358682467108332, + "grad_norm": NaN, + "learning_rate": 5.523658897949934e-05, + "loss": 0.0, + "step": 46712 + }, + { + "epoch": 4.35877577680321, + "grad_norm": NaN, + "learning_rate": 5.523072590200625e-05, + "loss": 0.0, + "step": 46713 + }, + { + "epoch": 4.358869086498087, + "grad_norm": NaN, + "learning_rate": 5.522486306548342e-05, + "loss": 0.0, + "step": 46714 + }, + { + "epoch": 4.358962396192965, + "grad_norm": NaN, + "learning_rate": 5.521900046994557e-05, + "loss": 0.0, + "step": 46715 + }, + { + "epoch": 4.359055705887842, + "grad_norm": NaN, + "learning_rate": 5.521313811540779e-05, + "loss": 0.0, + "step": 46716 + }, + { + "epoch": 4.3591490155827195, + "grad_norm": NaN, + "learning_rate": 5.520727600188497e-05, + "loss": 0.0, + "step": 46717 + }, + { + "epoch": 4.359242325277596, + "grad_norm": NaN, + "learning_rate": 5.520141412939186e-05, + "loss": 0.0, + "step": 46718 + }, + { + "epoch": 4.3593356349724734, + "grad_norm": NaN, + "learning_rate": 5.5195552497943544e-05, + "loss": 0.0, + "step": 46719 + }, + { + "epoch": 4.359428944667351, + "grad_norm": NaN, + "learning_rate": 5.518969110755489e-05, + "loss": 0.0, + "step": 46720 + }, + { + "epoch": 4.359522254362228, + "grad_norm": NaN, + "learning_rate": 5.5183829958240666e-05, + "loss": 0.0, + "step": 46721 + }, + { + "epoch": 4.359615564057106, + "grad_norm": NaN, + "learning_rate": 5.517796905001592e-05, + "loss": 0.0, + "step": 46722 + }, + { + "epoch": 4.359708873751983, + "grad_norm": NaN, + "learning_rate": 5.517210838289552e-05, + "loss": 0.0, + "step": 46723 + }, + { + "epoch": 4.35980218344686, + "grad_norm": NaN, + "learning_rate": 5.5166247956894356e-05, + "loss": 0.0, + "step": 46724 + }, + { + "epoch": 4.359895493141737, + "grad_norm": NaN, + "learning_rate": 5.516038777202732e-05, + "loss": 0.0, + "step": 46725 + }, + { + "epoch": 4.3599888028366145, + "grad_norm": NaN, + "learning_rate": 5.5154527828309334e-05, + "loss": 0.0, + "step": 46726 + }, + { + "epoch": 4.360082112531492, + "grad_norm": NaN, + "learning_rate": 5.514866812575529e-05, + "loss": 0.0, + "step": 46727 + }, + { + "epoch": 4.360175422226369, + "grad_norm": NaN, + "learning_rate": 5.514280866438009e-05, + "loss": 0.0, + "step": 46728 + }, + { + "epoch": 4.360268731921247, + "grad_norm": NaN, + "learning_rate": 5.513694944419864e-05, + "loss": 0.0, + "step": 46729 + }, + { + "epoch": 4.360362041616124, + "grad_norm": NaN, + "learning_rate": 5.51310904652258e-05, + "loss": 0.0, + "step": 46730 + }, + { + "epoch": 4.360455351311002, + "grad_norm": NaN, + "learning_rate": 5.512523172747656e-05, + "loss": 0.0, + "step": 46731 + }, + { + "epoch": 4.360548661005878, + "grad_norm": NaN, + "learning_rate": 5.511937323096565e-05, + "loss": 0.0, + "step": 46732 + }, + { + "epoch": 4.3606419707007555, + "grad_norm": NaN, + "learning_rate": 5.51135149757081e-05, + "loss": 0.0, + "step": 46733 + }, + { + "epoch": 4.360735280395633, + "grad_norm": NaN, + "learning_rate": 5.5107656961718844e-05, + "loss": 0.0, + "step": 46734 + }, + { + "epoch": 4.36082859009051, + "grad_norm": NaN, + "learning_rate": 5.5101799189012616e-05, + "loss": 0.0, + "step": 46735 + }, + { + "epoch": 4.360921899785388, + "grad_norm": NaN, + "learning_rate": 5.509594165760443e-05, + "loss": 0.0, + "step": 46736 + }, + { + "epoch": 4.361015209480265, + "grad_norm": NaN, + "learning_rate": 5.509008436750922e-05, + "loss": 0.0, + "step": 46737 + }, + { + "epoch": 4.361108519175143, + "grad_norm": NaN, + "learning_rate": 5.5084227318741694e-05, + "loss": 0.0, + "step": 46738 + }, + { + "epoch": 4.361201828870019, + "grad_norm": NaN, + "learning_rate": 5.507837051131692e-05, + "loss": 0.0, + "step": 46739 + }, + { + "epoch": 4.361295138564897, + "grad_norm": NaN, + "learning_rate": 5.507251394524978e-05, + "loss": 0.0, + "step": 46740 + }, + { + "epoch": 4.361388448259774, + "grad_norm": NaN, + "learning_rate": 5.506665762055501e-05, + "loss": 0.0, + "step": 46741 + }, + { + "epoch": 4.361481757954651, + "grad_norm": NaN, + "learning_rate": 5.5060801537247665e-05, + "loss": 0.0, + "step": 46742 + }, + { + "epoch": 4.361575067649529, + "grad_norm": NaN, + "learning_rate": 5.505494569534264e-05, + "loss": 0.0, + "step": 46743 + }, + { + "epoch": 4.361668377344406, + "grad_norm": NaN, + "learning_rate": 5.5049090094854646e-05, + "loss": 0.0, + "step": 46744 + }, + { + "epoch": 4.361761687039284, + "grad_norm": NaN, + "learning_rate": 5.5043234735798766e-05, + "loss": 0.0, + "step": 46745 + }, + { + "epoch": 4.361854996734161, + "grad_norm": NaN, + "learning_rate": 5.5037379618189844e-05, + "loss": 0.0, + "step": 46746 + }, + { + "epoch": 4.361948306429038, + "grad_norm": NaN, + "learning_rate": 5.503152474204262e-05, + "loss": 0.0, + "step": 46747 + }, + { + "epoch": 4.362041616123915, + "grad_norm": NaN, + "learning_rate": 5.502567010737217e-05, + "loss": 0.0, + "step": 46748 + }, + { + "epoch": 4.3621349258187925, + "grad_norm": NaN, + "learning_rate": 5.501981571419335e-05, + "loss": 0.0, + "step": 46749 + }, + { + "epoch": 4.36222823551367, + "grad_norm": NaN, + "learning_rate": 5.5013961562520895e-05, + "loss": 0.0, + "step": 46750 + }, + { + "epoch": 4.362321545208547, + "grad_norm": NaN, + "learning_rate": 5.500810765236985e-05, + "loss": 0.0, + "step": 46751 + }, + { + "epoch": 4.362414854903425, + "grad_norm": NaN, + "learning_rate": 5.500225398375511e-05, + "loss": 0.0, + "step": 46752 + }, + { + "epoch": 4.362508164598302, + "grad_norm": NaN, + "learning_rate": 5.499640055669139e-05, + "loss": 0.0, + "step": 46753 + }, + { + "epoch": 4.362601474293179, + "grad_norm": NaN, + "learning_rate": 5.499054737119374e-05, + "loss": 0.0, + "step": 46754 + }, + { + "epoch": 4.362694783988056, + "grad_norm": NaN, + "learning_rate": 5.498469442727703e-05, + "loss": 0.0, + "step": 46755 + }, + { + "epoch": 4.3627880936829335, + "grad_norm": NaN, + "learning_rate": 5.497884172495599e-05, + "loss": 0.0, + "step": 46756 + }, + { + "epoch": 4.362881403377811, + "grad_norm": NaN, + "learning_rate": 5.497298926424567e-05, + "loss": 0.0, + "step": 46757 + }, + { + "epoch": 4.362974713072688, + "grad_norm": NaN, + "learning_rate": 5.496713704516087e-05, + "loss": 0.0, + "step": 46758 + }, + { + "epoch": 4.363068022767566, + "grad_norm": NaN, + "learning_rate": 5.496128506771651e-05, + "loss": 0.0, + "step": 46759 + }, + { + "epoch": 4.363161332462443, + "grad_norm": NaN, + "learning_rate": 5.495543333192745e-05, + "loss": 0.0, + "step": 46760 + }, + { + "epoch": 4.36325464215732, + "grad_norm": NaN, + "learning_rate": 5.494958183780855e-05, + "loss": 0.0, + "step": 46761 + }, + { + "epoch": 4.363347951852197, + "grad_norm": NaN, + "learning_rate": 5.494373058537471e-05, + "loss": 0.0, + "step": 46762 + }, + { + "epoch": 4.363441261547075, + "grad_norm": NaN, + "learning_rate": 5.493787957464082e-05, + "loss": 0.0, + "step": 46763 + }, + { + "epoch": 4.363534571241952, + "grad_norm": NaN, + "learning_rate": 5.493202880562172e-05, + "loss": 0.0, + "step": 46764 + }, + { + "epoch": 4.363627880936829, + "grad_norm": NaN, + "learning_rate": 5.4926178278332346e-05, + "loss": 0.0, + "step": 46765 + }, + { + "epoch": 4.363721190631707, + "grad_norm": NaN, + "learning_rate": 5.492032799278751e-05, + "loss": 0.0, + "step": 46766 + }, + { + "epoch": 4.363814500326584, + "grad_norm": NaN, + "learning_rate": 5.491447794900211e-05, + "loss": 0.0, + "step": 46767 + }, + { + "epoch": 4.363907810021461, + "grad_norm": NaN, + "learning_rate": 5.490862814699105e-05, + "loss": 0.0, + "step": 46768 + }, + { + "epoch": 4.364001119716338, + "grad_norm": NaN, + "learning_rate": 5.4902778586769164e-05, + "loss": 0.0, + "step": 46769 + }, + { + "epoch": 4.364094429411216, + "grad_norm": NaN, + "learning_rate": 5.489692926835135e-05, + "loss": 0.0, + "step": 46770 + }, + { + "epoch": 4.364187739106093, + "grad_norm": NaN, + "learning_rate": 5.4891080191752476e-05, + "loss": 0.0, + "step": 46771 + }, + { + "epoch": 4.3642810488009705, + "grad_norm": NaN, + "learning_rate": 5.4885231356987395e-05, + "loss": 0.0, + "step": 46772 + }, + { + "epoch": 4.364374358495848, + "grad_norm": NaN, + "learning_rate": 5.487938276407101e-05, + "loss": 0.0, + "step": 46773 + }, + { + "epoch": 4.364467668190725, + "grad_norm": NaN, + "learning_rate": 5.487353441301817e-05, + "loss": 0.0, + "step": 46774 + }, + { + "epoch": 4.364560977885603, + "grad_norm": NaN, + "learning_rate": 5.4867686303843815e-05, + "loss": 0.0, + "step": 46775 + }, + { + "epoch": 4.364654287580479, + "grad_norm": NaN, + "learning_rate": 5.486183843656263e-05, + "loss": 0.0, + "step": 46776 + }, + { + "epoch": 4.364747597275357, + "grad_norm": NaN, + "learning_rate": 5.4855990811189675e-05, + "loss": 0.0, + "step": 46777 + }, + { + "epoch": 4.364840906970234, + "grad_norm": NaN, + "learning_rate": 5.485014342773982e-05, + "loss": 0.0, + "step": 46778 + }, + { + "epoch": 4.3649342166651115, + "grad_norm": NaN, + "learning_rate": 5.484429628622773e-05, + "loss": 0.0, + "step": 46779 + }, + { + "epoch": 4.365027526359989, + "grad_norm": NaN, + "learning_rate": 5.483844938666848e-05, + "loss": 0.0, + "step": 46780 + }, + { + "epoch": 4.365120836054866, + "grad_norm": NaN, + "learning_rate": 5.4832602729076914e-05, + "loss": 0.0, + "step": 46781 + }, + { + "epoch": 4.365214145749744, + "grad_norm": NaN, + "learning_rate": 5.482675631346773e-05, + "loss": 0.0, + "step": 46782 + }, + { + "epoch": 4.36530745544462, + "grad_norm": NaN, + "learning_rate": 5.4820910139855964e-05, + "loss": 0.0, + "step": 46783 + }, + { + "epoch": 4.365400765139498, + "grad_norm": NaN, + "learning_rate": 5.48150642082565e-05, + "loss": 0.0, + "step": 46784 + }, + { + "epoch": 4.365494074834375, + "grad_norm": NaN, + "learning_rate": 5.4809218518684026e-05, + "loss": 0.0, + "step": 46785 + }, + { + "epoch": 4.3655873845292525, + "grad_norm": NaN, + "learning_rate": 5.480337307115354e-05, + "loss": 0.0, + "step": 46786 + }, + { + "epoch": 4.36568069422413, + "grad_norm": NaN, + "learning_rate": 5.479752786567995e-05, + "loss": 0.0, + "step": 46787 + }, + { + "epoch": 4.365774003919007, + "grad_norm": NaN, + "learning_rate": 5.479168290227794e-05, + "loss": 0.0, + "step": 46788 + }, + { + "epoch": 4.365867313613885, + "grad_norm": NaN, + "learning_rate": 5.478583818096253e-05, + "loss": 0.0, + "step": 46789 + }, + { + "epoch": 4.365960623308762, + "grad_norm": NaN, + "learning_rate": 5.477999370174857e-05, + "loss": 0.0, + "step": 46790 + }, + { + "epoch": 4.366053933003639, + "grad_norm": NaN, + "learning_rate": 5.477414946465079e-05, + "loss": 0.0, + "step": 46791 + }, + { + "epoch": 4.366147242698516, + "grad_norm": NaN, + "learning_rate": 5.476830546968419e-05, + "loss": 0.0, + "step": 46792 + }, + { + "epoch": 4.366240552393394, + "grad_norm": NaN, + "learning_rate": 5.476246171686363e-05, + "loss": 0.0, + "step": 46793 + }, + { + "epoch": 4.366333862088271, + "grad_norm": NaN, + "learning_rate": 5.4756618206203814e-05, + "loss": 0.0, + "step": 46794 + }, + { + "epoch": 4.366427171783148, + "grad_norm": NaN, + "learning_rate": 5.475077493771974e-05, + "loss": 0.0, + "step": 46795 + }, + { + "epoch": 4.366520481478026, + "grad_norm": NaN, + "learning_rate": 5.474493191142625e-05, + "loss": 0.0, + "step": 46796 + }, + { + "epoch": 4.366613791172903, + "grad_norm": NaN, + "learning_rate": 5.473908912733818e-05, + "loss": 0.0, + "step": 46797 + }, + { + "epoch": 4.36670710086778, + "grad_norm": NaN, + "learning_rate": 5.473324658547038e-05, + "loss": 0.0, + "step": 46798 + }, + { + "epoch": 4.366800410562657, + "grad_norm": NaN, + "learning_rate": 5.472740428583772e-05, + "loss": 0.0, + "step": 46799 + }, + { + "epoch": 4.366893720257535, + "grad_norm": NaN, + "learning_rate": 5.472156222845505e-05, + "loss": 0.0, + "step": 46800 + }, + { + "epoch": 4.366987029952412, + "grad_norm": NaN, + "learning_rate": 5.4715720413337215e-05, + "loss": 0.0, + "step": 46801 + }, + { + "epoch": 4.3670803396472895, + "grad_norm": NaN, + "learning_rate": 5.470987884049908e-05, + "loss": 0.0, + "step": 46802 + }, + { + "epoch": 4.367173649342167, + "grad_norm": NaN, + "learning_rate": 5.47040375099555e-05, + "loss": 0.0, + "step": 46803 + }, + { + "epoch": 4.367266959037044, + "grad_norm": NaN, + "learning_rate": 5.4698196421721326e-05, + "loss": 0.0, + "step": 46804 + }, + { + "epoch": 4.367360268731921, + "grad_norm": NaN, + "learning_rate": 5.469235557581141e-05, + "loss": 0.0, + "step": 46805 + }, + { + "epoch": 4.367453578426798, + "grad_norm": NaN, + "learning_rate": 5.468651497224059e-05, + "loss": 0.0, + "step": 46806 + }, + { + "epoch": 4.367546888121676, + "grad_norm": NaN, + "learning_rate": 5.4680674611023746e-05, + "loss": 0.0, + "step": 46807 + }, + { + "epoch": 4.367640197816553, + "grad_norm": NaN, + "learning_rate": 5.467483449217569e-05, + "loss": 0.0, + "step": 46808 + }, + { + "epoch": 4.3677335075114305, + "grad_norm": NaN, + "learning_rate": 5.46689946157113e-05, + "loss": 0.0, + "step": 46809 + }, + { + "epoch": 4.367826817206308, + "grad_norm": NaN, + "learning_rate": 5.4663154981645425e-05, + "loss": 0.0, + "step": 46810 + }, + { + "epoch": 4.367920126901185, + "grad_norm": NaN, + "learning_rate": 5.465731558999291e-05, + "loss": 0.0, + "step": 46811 + }, + { + "epoch": 4.368013436596062, + "grad_norm": NaN, + "learning_rate": 5.465147644076858e-05, + "loss": 0.0, + "step": 46812 + }, + { + "epoch": 4.368106746290939, + "grad_norm": NaN, + "learning_rate": 5.464563753398731e-05, + "loss": 0.0, + "step": 46813 + }, + { + "epoch": 4.368200055985817, + "grad_norm": NaN, + "learning_rate": 5.463979886966394e-05, + "loss": 0.0, + "step": 46814 + }, + { + "epoch": 4.368293365680694, + "grad_norm": NaN, + "learning_rate": 5.4633960447813305e-05, + "loss": 0.0, + "step": 46815 + }, + { + "epoch": 4.368386675375572, + "grad_norm": NaN, + "learning_rate": 5.4628122268450274e-05, + "loss": 0.0, + "step": 46816 + }, + { + "epoch": 4.368479985070449, + "grad_norm": NaN, + "learning_rate": 5.462228433158968e-05, + "loss": 0.0, + "step": 46817 + }, + { + "epoch": 4.368573294765326, + "grad_norm": NaN, + "learning_rate": 5.461644663724634e-05, + "loss": 0.0, + "step": 46818 + }, + { + "epoch": 4.368666604460204, + "grad_norm": NaN, + "learning_rate": 5.461060918543519e-05, + "loss": 0.0, + "step": 46819 + }, + { + "epoch": 4.36875991415508, + "grad_norm": NaN, + "learning_rate": 5.460477197617089e-05, + "loss": 0.0, + "step": 46820 + }, + { + "epoch": 4.368853223849958, + "grad_norm": NaN, + "learning_rate": 5.4598935009468465e-05, + "loss": 0.0, + "step": 46821 + }, + { + "epoch": 4.368946533544835, + "grad_norm": NaN, + "learning_rate": 5.459309828534273e-05, + "loss": 0.0, + "step": 46822 + }, + { + "epoch": 4.369039843239713, + "grad_norm": NaN, + "learning_rate": 5.4587261803808393e-05, + "loss": 0.0, + "step": 46823 + }, + { + "epoch": 4.36913315293459, + "grad_norm": NaN, + "learning_rate": 5.458142556488042e-05, + "loss": 0.0, + "step": 46824 + }, + { + "epoch": 4.3692264626294675, + "grad_norm": NaN, + "learning_rate": 5.4575589568573674e-05, + "loss": 0.0, + "step": 46825 + }, + { + "epoch": 4.369319772324345, + "grad_norm": NaN, + "learning_rate": 5.456975381490284e-05, + "loss": 0.0, + "step": 46826 + }, + { + "epoch": 4.369413082019221, + "grad_norm": NaN, + "learning_rate": 5.4563918303882886e-05, + "loss": 0.0, + "step": 46827 + }, + { + "epoch": 4.369506391714099, + "grad_norm": NaN, + "learning_rate": 5.455808303552869e-05, + "loss": 0.0, + "step": 46828 + }, + { + "epoch": 4.369599701408976, + "grad_norm": NaN, + "learning_rate": 5.45522480098549e-05, + "loss": 0.0, + "step": 46829 + }, + { + "epoch": 4.369693011103854, + "grad_norm": NaN, + "learning_rate": 5.454641322687654e-05, + "loss": 0.0, + "step": 46830 + }, + { + "epoch": 4.369786320798731, + "grad_norm": NaN, + "learning_rate": 5.4540578686608414e-05, + "loss": 0.0, + "step": 46831 + }, + { + "epoch": 4.3698796304936085, + "grad_norm": NaN, + "learning_rate": 5.4534744389065216e-05, + "loss": 0.0, + "step": 46832 + }, + { + "epoch": 4.369972940188486, + "grad_norm": NaN, + "learning_rate": 5.452891033426193e-05, + "loss": 0.0, + "step": 46833 + }, + { + "epoch": 4.370066249883363, + "grad_norm": NaN, + "learning_rate": 5.4523076522213357e-05, + "loss": 0.0, + "step": 46834 + }, + { + "epoch": 4.37015955957824, + "grad_norm": NaN, + "learning_rate": 5.45172429529343e-05, + "loss": 0.0, + "step": 46835 + }, + { + "epoch": 4.370252869273117, + "grad_norm": NaN, + "learning_rate": 5.451140962643963e-05, + "loss": 0.0, + "step": 46836 + }, + { + "epoch": 4.370346178967995, + "grad_norm": NaN, + "learning_rate": 5.450557654274415e-05, + "loss": 0.0, + "step": 46837 + }, + { + "epoch": 4.370439488662872, + "grad_norm": NaN, + "learning_rate": 5.449974370186271e-05, + "loss": 0.0, + "step": 46838 + }, + { + "epoch": 4.3705327983577495, + "grad_norm": NaN, + "learning_rate": 5.449391110381012e-05, + "loss": 0.0, + "step": 46839 + }, + { + "epoch": 4.370626108052627, + "grad_norm": NaN, + "learning_rate": 5.448807874860123e-05, + "loss": 0.0, + "step": 46840 + }, + { + "epoch": 4.3707194177475035, + "grad_norm": NaN, + "learning_rate": 5.448224663625086e-05, + "loss": 0.0, + "step": 46841 + }, + { + "epoch": 4.370812727442381, + "grad_norm": NaN, + "learning_rate": 5.4476414766773856e-05, + "loss": 0.0, + "step": 46842 + }, + { + "epoch": 4.370906037137258, + "grad_norm": NaN, + "learning_rate": 5.447058314018502e-05, + "loss": 0.0, + "step": 46843 + }, + { + "epoch": 4.370999346832136, + "grad_norm": NaN, + "learning_rate": 5.4464751756499186e-05, + "loss": 0.0, + "step": 46844 + }, + { + "epoch": 4.371092656527013, + "grad_norm": NaN, + "learning_rate": 5.4458920615731197e-05, + "loss": 0.0, + "step": 46845 + }, + { + "epoch": 4.371185966221891, + "grad_norm": NaN, + "learning_rate": 5.4453089717895875e-05, + "loss": 0.0, + "step": 46846 + }, + { + "epoch": 4.371279275916768, + "grad_norm": NaN, + "learning_rate": 5.4447259063008034e-05, + "loss": 0.0, + "step": 46847 + }, + { + "epoch": 4.371372585611645, + "grad_norm": NaN, + "learning_rate": 5.4441428651082514e-05, + "loss": 0.0, + "step": 46848 + }, + { + "epoch": 4.371465895306522, + "grad_norm": NaN, + "learning_rate": 5.443559848213415e-05, + "loss": 0.0, + "step": 46849 + }, + { + "epoch": 4.371559205001399, + "grad_norm": NaN, + "learning_rate": 5.442976855617772e-05, + "loss": 0.0, + "step": 46850 + }, + { + "epoch": 4.371652514696277, + "grad_norm": NaN, + "learning_rate": 5.4423938873228095e-05, + "loss": 0.0, + "step": 46851 + }, + { + "epoch": 4.371745824391154, + "grad_norm": NaN, + "learning_rate": 5.441810943330009e-05, + "loss": 0.0, + "step": 46852 + }, + { + "epoch": 4.371839134086032, + "grad_norm": NaN, + "learning_rate": 5.441228023640851e-05, + "loss": 0.0, + "step": 46853 + }, + { + "epoch": 4.371932443780909, + "grad_norm": NaN, + "learning_rate": 5.4406451282568186e-05, + "loss": 0.0, + "step": 46854 + }, + { + "epoch": 4.3720257534757865, + "grad_norm": NaN, + "learning_rate": 5.440062257179394e-05, + "loss": 0.0, + "step": 46855 + }, + { + "epoch": 4.372119063170663, + "grad_norm": NaN, + "learning_rate": 5.4394794104100586e-05, + "loss": 0.0, + "step": 46856 + }, + { + "epoch": 4.37221237286554, + "grad_norm": NaN, + "learning_rate": 5.438896587950295e-05, + "loss": 0.0, + "step": 46857 + }, + { + "epoch": 4.372305682560418, + "grad_norm": NaN, + "learning_rate": 5.438313789801586e-05, + "loss": 0.0, + "step": 46858 + }, + { + "epoch": 4.372398992255295, + "grad_norm": NaN, + "learning_rate": 5.437731015965412e-05, + "loss": 0.0, + "step": 46859 + }, + { + "epoch": 4.372492301950173, + "grad_norm": NaN, + "learning_rate": 5.437148266443256e-05, + "loss": 0.0, + "step": 46860 + }, + { + "epoch": 4.37258561164505, + "grad_norm": NaN, + "learning_rate": 5.4365655412366e-05, + "loss": 0.0, + "step": 46861 + }, + { + "epoch": 4.3726789213399275, + "grad_norm": NaN, + "learning_rate": 5.4359828403469234e-05, + "loss": 0.0, + "step": 46862 + }, + { + "epoch": 4.372772231034805, + "grad_norm": NaN, + "learning_rate": 5.435400163775711e-05, + "loss": 0.0, + "step": 46863 + }, + { + "epoch": 4.3728655407296815, + "grad_norm": NaN, + "learning_rate": 5.434817511524441e-05, + "loss": 0.0, + "step": 46864 + }, + { + "epoch": 4.372958850424559, + "grad_norm": NaN, + "learning_rate": 5.434234883594598e-05, + "loss": 0.0, + "step": 46865 + }, + { + "epoch": 4.373052160119436, + "grad_norm": NaN, + "learning_rate": 5.433652279987667e-05, + "loss": 0.0, + "step": 46866 + }, + { + "epoch": 4.373145469814314, + "grad_norm": NaN, + "learning_rate": 5.433069700705113e-05, + "loss": 0.0, + "step": 46867 + }, + { + "epoch": 4.373238779509191, + "grad_norm": NaN, + "learning_rate": 5.4324871457484344e-05, + "loss": 0.0, + "step": 46868 + }, + { + "epoch": 4.373332089204069, + "grad_norm": NaN, + "learning_rate": 5.431904615119112e-05, + "loss": 0.0, + "step": 46869 + }, + { + "epoch": 4.373425398898946, + "grad_norm": NaN, + "learning_rate": 5.431322108818612e-05, + "loss": 0.0, + "step": 46870 + }, + { + "epoch": 4.3735187085938225, + "grad_norm": NaN, + "learning_rate": 5.4307396268484315e-05, + "loss": 0.0, + "step": 46871 + }, + { + "epoch": 4.3736120182887, + "grad_norm": NaN, + "learning_rate": 5.4301571692100446e-05, + "loss": 0.0, + "step": 46872 + }, + { + "epoch": 4.373705327983577, + "grad_norm": NaN, + "learning_rate": 5.429574735904932e-05, + "loss": 0.0, + "step": 46873 + }, + { + "epoch": 4.373798637678455, + "grad_norm": NaN, + "learning_rate": 5.4289923269345784e-05, + "loss": 0.0, + "step": 46874 + }, + { + "epoch": 4.373891947373332, + "grad_norm": NaN, + "learning_rate": 5.42840994230046e-05, + "loss": 0.0, + "step": 46875 + }, + { + "epoch": 4.37398525706821, + "grad_norm": NaN, + "learning_rate": 5.4278275820040606e-05, + "loss": 0.0, + "step": 46876 + }, + { + "epoch": 4.374078566763087, + "grad_norm": NaN, + "learning_rate": 5.427245246046862e-05, + "loss": 0.0, + "step": 46877 + }, + { + "epoch": 4.374171876457964, + "grad_norm": NaN, + "learning_rate": 5.426662934430341e-05, + "loss": 0.0, + "step": 46878 + }, + { + "epoch": 4.374265186152841, + "grad_norm": NaN, + "learning_rate": 5.42608064715598e-05, + "loss": 0.0, + "step": 46879 + }, + { + "epoch": 4.374358495847718, + "grad_norm": NaN, + "learning_rate": 5.425498384225261e-05, + "loss": 0.0, + "step": 46880 + }, + { + "epoch": 4.374451805542596, + "grad_norm": NaN, + "learning_rate": 5.424916145639663e-05, + "loss": 0.0, + "step": 46881 + }, + { + "epoch": 4.374545115237473, + "grad_norm": NaN, + "learning_rate": 5.424333931400666e-05, + "loss": 0.0, + "step": 46882 + }, + { + "epoch": 4.374638424932351, + "grad_norm": NaN, + "learning_rate": 5.423751741509753e-05, + "loss": 0.0, + "step": 46883 + }, + { + "epoch": 4.374731734627228, + "grad_norm": NaN, + "learning_rate": 5.423169575968401e-05, + "loss": 0.0, + "step": 46884 + }, + { + "epoch": 4.374825044322105, + "grad_norm": NaN, + "learning_rate": 5.422587434778093e-05, + "loss": 0.0, + "step": 46885 + }, + { + "epoch": 4.374918354016982, + "grad_norm": NaN, + "learning_rate": 5.422005317940308e-05, + "loss": 0.0, + "step": 46886 + }, + { + "epoch": 4.3750116637118595, + "grad_norm": NaN, + "learning_rate": 5.421423225456526e-05, + "loss": 0.0, + "step": 46887 + }, + { + "epoch": 4.375104973406737, + "grad_norm": NaN, + "learning_rate": 5.420841157328228e-05, + "loss": 0.0, + "step": 46888 + }, + { + "epoch": 4.375198283101614, + "grad_norm": NaN, + "learning_rate": 5.420259113556893e-05, + "loss": 0.0, + "step": 46889 + }, + { + "epoch": 4.375291592796492, + "grad_norm": NaN, + "learning_rate": 5.419677094144e-05, + "loss": 0.0, + "step": 46890 + }, + { + "epoch": 4.375384902491369, + "grad_norm": NaN, + "learning_rate": 5.419095099091032e-05, + "loss": 0.0, + "step": 46891 + }, + { + "epoch": 4.3754782121862466, + "grad_norm": NaN, + "learning_rate": 5.4185131283994666e-05, + "loss": 0.0, + "step": 46892 + }, + { + "epoch": 4.375571521881123, + "grad_norm": NaN, + "learning_rate": 5.4179311820707836e-05, + "loss": 0.0, + "step": 46893 + }, + { + "epoch": 4.3756648315760005, + "grad_norm": NaN, + "learning_rate": 5.417349260106463e-05, + "loss": 0.0, + "step": 46894 + }, + { + "epoch": 4.375758141270878, + "grad_norm": NaN, + "learning_rate": 5.4167673625079855e-05, + "loss": 0.0, + "step": 46895 + }, + { + "epoch": 4.375851450965755, + "grad_norm": NaN, + "learning_rate": 5.416185489276829e-05, + "loss": 0.0, + "step": 46896 + }, + { + "epoch": 4.375944760660633, + "grad_norm": NaN, + "learning_rate": 5.415603640414475e-05, + "loss": 0.0, + "step": 46897 + }, + { + "epoch": 4.37603807035551, + "grad_norm": NaN, + "learning_rate": 5.415021815922401e-05, + "loss": 0.0, + "step": 46898 + }, + { + "epoch": 4.376131380050388, + "grad_norm": NaN, + "learning_rate": 5.414440015802087e-05, + "loss": 0.0, + "step": 46899 + }, + { + "epoch": 4.376224689745264, + "grad_norm": NaN, + "learning_rate": 5.413858240055013e-05, + "loss": 0.0, + "step": 46900 + }, + { + "epoch": 4.3763179994401415, + "grad_norm": NaN, + "learning_rate": 5.413276488682659e-05, + "loss": 0.0, + "step": 46901 + }, + { + "epoch": 4.376411309135019, + "grad_norm": NaN, + "learning_rate": 5.412694761686501e-05, + "loss": 0.0, + "step": 46902 + }, + { + "epoch": 4.376504618829896, + "grad_norm": NaN, + "learning_rate": 5.412113059068022e-05, + "loss": 0.0, + "step": 46903 + }, + { + "epoch": 4.376597928524774, + "grad_norm": NaN, + "learning_rate": 5.4115313808286986e-05, + "loss": 0.0, + "step": 46904 + }, + { + "epoch": 4.376691238219651, + "grad_norm": NaN, + "learning_rate": 5.410949726970011e-05, + "loss": 0.0, + "step": 46905 + }, + { + "epoch": 4.376784547914529, + "grad_norm": NaN, + "learning_rate": 5.4103680974934374e-05, + "loss": 0.0, + "step": 46906 + }, + { + "epoch": 4.376877857609406, + "grad_norm": NaN, + "learning_rate": 5.409786492400456e-05, + "loss": 0.0, + "step": 46907 + }, + { + "epoch": 4.376971167304283, + "grad_norm": NaN, + "learning_rate": 5.409204911692549e-05, + "loss": 0.0, + "step": 46908 + }, + { + "epoch": 4.37706447699916, + "grad_norm": NaN, + "learning_rate": 5.408623355371191e-05, + "loss": 0.0, + "step": 46909 + }, + { + "epoch": 4.377157786694037, + "grad_norm": NaN, + "learning_rate": 5.408041823437863e-05, + "loss": 0.0, + "step": 46910 + }, + { + "epoch": 4.377251096388915, + "grad_norm": NaN, + "learning_rate": 5.407460315894044e-05, + "loss": 0.0, + "step": 46911 + }, + { + "epoch": 4.377344406083792, + "grad_norm": NaN, + "learning_rate": 5.406878832741212e-05, + "loss": 0.0, + "step": 46912 + }, + { + "epoch": 4.37743771577867, + "grad_norm": NaN, + "learning_rate": 5.406297373980845e-05, + "loss": 0.0, + "step": 46913 + }, + { + "epoch": 4.377531025473547, + "grad_norm": NaN, + "learning_rate": 5.405715939614422e-05, + "loss": 0.0, + "step": 46914 + }, + { + "epoch": 4.377624335168424, + "grad_norm": NaN, + "learning_rate": 5.405134529643421e-05, + "loss": 0.0, + "step": 46915 + }, + { + "epoch": 4.377717644863301, + "grad_norm": NaN, + "learning_rate": 5.4045531440693205e-05, + "loss": 0.0, + "step": 46916 + }, + { + "epoch": 4.3778109545581785, + "grad_norm": NaN, + "learning_rate": 5.403971782893599e-05, + "loss": 0.0, + "step": 46917 + }, + { + "epoch": 4.377904264253056, + "grad_norm": NaN, + "learning_rate": 5.403390446117735e-05, + "loss": 0.0, + "step": 46918 + }, + { + "epoch": 4.377997573947933, + "grad_norm": NaN, + "learning_rate": 5.402809133743206e-05, + "loss": 0.0, + "step": 46919 + }, + { + "epoch": 4.378090883642811, + "grad_norm": NaN, + "learning_rate": 5.402227845771493e-05, + "loss": 0.0, + "step": 46920 + }, + { + "epoch": 4.378184193337688, + "grad_norm": NaN, + "learning_rate": 5.401646582204067e-05, + "loss": 0.0, + "step": 46921 + }, + { + "epoch": 4.378277503032565, + "grad_norm": NaN, + "learning_rate": 5.401065343042414e-05, + "loss": 0.0, + "step": 46922 + }, + { + "epoch": 4.378370812727442, + "grad_norm": NaN, + "learning_rate": 5.400484128288008e-05, + "loss": 0.0, + "step": 46923 + }, + { + "epoch": 4.3784641224223195, + "grad_norm": NaN, + "learning_rate": 5.399902937942327e-05, + "loss": 0.0, + "step": 46924 + }, + { + "epoch": 4.378557432117197, + "grad_norm": NaN, + "learning_rate": 5.399321772006847e-05, + "loss": 0.0, + "step": 46925 + }, + { + "epoch": 4.378650741812074, + "grad_norm": NaN, + "learning_rate": 5.39874063048305e-05, + "loss": 0.0, + "step": 46926 + }, + { + "epoch": 4.378744051506952, + "grad_norm": NaN, + "learning_rate": 5.3981595133724105e-05, + "loss": 0.0, + "step": 46927 + }, + { + "epoch": 4.378837361201829, + "grad_norm": NaN, + "learning_rate": 5.397578420676407e-05, + "loss": 0.0, + "step": 46928 + }, + { + "epoch": 4.378930670896706, + "grad_norm": NaN, + "learning_rate": 5.396997352396518e-05, + "loss": 0.0, + "step": 46929 + }, + { + "epoch": 4.379023980591583, + "grad_norm": NaN, + "learning_rate": 5.396416308534221e-05, + "loss": 0.0, + "step": 46930 + }, + { + "epoch": 4.379117290286461, + "grad_norm": NaN, + "learning_rate": 5.3958352890909915e-05, + "loss": 0.0, + "step": 46931 + }, + { + "epoch": 4.379210599981338, + "grad_norm": NaN, + "learning_rate": 5.3952542940683075e-05, + "loss": 0.0, + "step": 46932 + }, + { + "epoch": 4.379303909676215, + "grad_norm": NaN, + "learning_rate": 5.394673323467646e-05, + "loss": 0.0, + "step": 46933 + }, + { + "epoch": 4.379397219371093, + "grad_norm": NaN, + "learning_rate": 5.394092377290485e-05, + "loss": 0.0, + "step": 46934 + }, + { + "epoch": 4.37949052906597, + "grad_norm": NaN, + "learning_rate": 5.393511455538303e-05, + "loss": 0.0, + "step": 46935 + }, + { + "epoch": 4.379583838760848, + "grad_norm": NaN, + "learning_rate": 5.392930558212575e-05, + "loss": 0.0, + "step": 46936 + }, + { + "epoch": 4.379677148455724, + "grad_norm": NaN, + "learning_rate": 5.3923496853147795e-05, + "loss": 0.0, + "step": 46937 + }, + { + "epoch": 4.379770458150602, + "grad_norm": NaN, + "learning_rate": 5.3917688368463914e-05, + "loss": 0.0, + "step": 46938 + }, + { + "epoch": 4.379863767845479, + "grad_norm": NaN, + "learning_rate": 5.3911880128088905e-05, + "loss": 0.0, + "step": 46939 + }, + { + "epoch": 4.3799570775403565, + "grad_norm": NaN, + "learning_rate": 5.390607213203752e-05, + "loss": 0.0, + "step": 46940 + }, + { + "epoch": 4.380050387235234, + "grad_norm": NaN, + "learning_rate": 5.390026438032452e-05, + "loss": 0.0, + "step": 46941 + }, + { + "epoch": 4.380143696930111, + "grad_norm": NaN, + "learning_rate": 5.3894456872964675e-05, + "loss": 0.0, + "step": 46942 + }, + { + "epoch": 4.380237006624989, + "grad_norm": NaN, + "learning_rate": 5.388864960997277e-05, + "loss": 0.0, + "step": 46943 + }, + { + "epoch": 4.380330316319865, + "grad_norm": NaN, + "learning_rate": 5.3882842591363554e-05, + "loss": 0.0, + "step": 46944 + }, + { + "epoch": 4.380423626014743, + "grad_norm": NaN, + "learning_rate": 5.3877035817151795e-05, + "loss": 0.0, + "step": 46945 + }, + { + "epoch": 4.38051693570962, + "grad_norm": NaN, + "learning_rate": 5.387122928735227e-05, + "loss": 0.0, + "step": 46946 + }, + { + "epoch": 4.3806102454044975, + "grad_norm": NaN, + "learning_rate": 5.3865423001979716e-05, + "loss": 0.0, + "step": 46947 + }, + { + "epoch": 4.380703555099375, + "grad_norm": NaN, + "learning_rate": 5.3859616961048934e-05, + "loss": 0.0, + "step": 46948 + }, + { + "epoch": 4.380796864794252, + "grad_norm": NaN, + "learning_rate": 5.385381116457466e-05, + "loss": 0.0, + "step": 46949 + }, + { + "epoch": 4.38089017448913, + "grad_norm": NaN, + "learning_rate": 5.3848005612571656e-05, + "loss": 0.0, + "step": 46950 + }, + { + "epoch": 4.380983484184007, + "grad_norm": NaN, + "learning_rate": 5.38422003050547e-05, + "loss": 0.0, + "step": 46951 + }, + { + "epoch": 4.381076793878884, + "grad_norm": NaN, + "learning_rate": 5.383639524203853e-05, + "loss": 0.0, + "step": 46952 + }, + { + "epoch": 4.381170103573761, + "grad_norm": NaN, + "learning_rate": 5.383059042353794e-05, + "loss": 0.0, + "step": 46953 + }, + { + "epoch": 4.3812634132686386, + "grad_norm": NaN, + "learning_rate": 5.382478584956766e-05, + "loss": 0.0, + "step": 46954 + }, + { + "epoch": 4.381356722963516, + "grad_norm": NaN, + "learning_rate": 5.3818981520142464e-05, + "loss": 0.0, + "step": 46955 + }, + { + "epoch": 4.381450032658393, + "grad_norm": NaN, + "learning_rate": 5.3813177435277105e-05, + "loss": 0.0, + "step": 46956 + }, + { + "epoch": 4.381543342353271, + "grad_norm": NaN, + "learning_rate": 5.380737359498634e-05, + "loss": 0.0, + "step": 46957 + }, + { + "epoch": 4.381636652048147, + "grad_norm": NaN, + "learning_rate": 5.380156999928494e-05, + "loss": 0.0, + "step": 46958 + }, + { + "epoch": 4.381729961743025, + "grad_norm": NaN, + "learning_rate": 5.379576664818765e-05, + "loss": 0.0, + "step": 46959 + }, + { + "epoch": 4.381823271437902, + "grad_norm": NaN, + "learning_rate": 5.378996354170923e-05, + "loss": 0.0, + "step": 46960 + }, + { + "epoch": 4.38191658113278, + "grad_norm": NaN, + "learning_rate": 5.3784160679864425e-05, + "loss": 0.0, + "step": 46961 + }, + { + "epoch": 4.382009890827657, + "grad_norm": NaN, + "learning_rate": 5.377835806266799e-05, + "loss": 0.0, + "step": 46962 + }, + { + "epoch": 4.382103200522534, + "grad_norm": NaN, + "learning_rate": 5.377255569013471e-05, + "loss": 0.0, + "step": 46963 + }, + { + "epoch": 4.382196510217412, + "grad_norm": NaN, + "learning_rate": 5.37667535622793e-05, + "loss": 0.0, + "step": 46964 + }, + { + "epoch": 4.382289819912289, + "grad_norm": NaN, + "learning_rate": 5.3760951679116534e-05, + "loss": 0.0, + "step": 46965 + }, + { + "epoch": 4.382383129607166, + "grad_norm": NaN, + "learning_rate": 5.375515004066115e-05, + "loss": 0.0, + "step": 46966 + }, + { + "epoch": 4.382476439302043, + "grad_norm": NaN, + "learning_rate": 5.374934864692793e-05, + "loss": 0.0, + "step": 46967 + }, + { + "epoch": 4.382569748996921, + "grad_norm": NaN, + "learning_rate": 5.3743547497931584e-05, + "loss": 0.0, + "step": 46968 + }, + { + "epoch": 4.382663058691798, + "grad_norm": NaN, + "learning_rate": 5.3737746593686895e-05, + "loss": 0.0, + "step": 46969 + }, + { + "epoch": 4.3827563683866755, + "grad_norm": NaN, + "learning_rate": 5.373194593420859e-05, + "loss": 0.0, + "step": 46970 + }, + { + "epoch": 4.382849678081553, + "grad_norm": NaN, + "learning_rate": 5.372614551951145e-05, + "loss": 0.0, + "step": 46971 + }, + { + "epoch": 4.38294298777643, + "grad_norm": NaN, + "learning_rate": 5.3720345349610184e-05, + "loss": 0.0, + "step": 46972 + }, + { + "epoch": 4.383036297471307, + "grad_norm": NaN, + "learning_rate": 5.371454542451957e-05, + "loss": 0.0, + "step": 46973 + }, + { + "epoch": 4.383129607166184, + "grad_norm": NaN, + "learning_rate": 5.370874574425434e-05, + "loss": 0.0, + "step": 46974 + }, + { + "epoch": 4.383222916861062, + "grad_norm": NaN, + "learning_rate": 5.370294630882925e-05, + "loss": 0.0, + "step": 46975 + }, + { + "epoch": 4.383316226555939, + "grad_norm": NaN, + "learning_rate": 5.369714711825905e-05, + "loss": 0.0, + "step": 46976 + }, + { + "epoch": 4.3834095362508165, + "grad_norm": NaN, + "learning_rate": 5.369134817255847e-05, + "loss": 0.0, + "step": 46977 + }, + { + "epoch": 4.383502845945694, + "grad_norm": NaN, + "learning_rate": 5.368554947174227e-05, + "loss": 0.0, + "step": 46978 + }, + { + "epoch": 4.383596155640571, + "grad_norm": NaN, + "learning_rate": 5.367975101582517e-05, + "loss": 0.0, + "step": 46979 + }, + { + "epoch": 4.383689465335449, + "grad_norm": NaN, + "learning_rate": 5.367395280482194e-05, + "loss": 0.0, + "step": 46980 + }, + { + "epoch": 4.383782775030325, + "grad_norm": NaN, + "learning_rate": 5.366815483874731e-05, + "loss": 0.0, + "step": 46981 + }, + { + "epoch": 4.383876084725203, + "grad_norm": NaN, + "learning_rate": 5.366235711761604e-05, + "loss": 0.0, + "step": 46982 + }, + { + "epoch": 4.38396939442008, + "grad_norm": NaN, + "learning_rate": 5.365655964144284e-05, + "loss": 0.0, + "step": 46983 + }, + { + "epoch": 4.384062704114958, + "grad_norm": NaN, + "learning_rate": 5.3650762410242486e-05, + "loss": 0.0, + "step": 46984 + }, + { + "epoch": 4.384156013809835, + "grad_norm": NaN, + "learning_rate": 5.36449654240297e-05, + "loss": 0.0, + "step": 46985 + }, + { + "epoch": 4.384249323504712, + "grad_norm": NaN, + "learning_rate": 5.3639168682819224e-05, + "loss": 0.0, + "step": 46986 + }, + { + "epoch": 4.38434263319959, + "grad_norm": NaN, + "learning_rate": 5.363337218662579e-05, + "loss": 0.0, + "step": 46987 + }, + { + "epoch": 4.384435942894466, + "grad_norm": NaN, + "learning_rate": 5.362757593546415e-05, + "loss": 0.0, + "step": 46988 + }, + { + "epoch": 4.384529252589344, + "grad_norm": NaN, + "learning_rate": 5.362177992934903e-05, + "loss": 0.0, + "step": 46989 + }, + { + "epoch": 4.384622562284221, + "grad_norm": NaN, + "learning_rate": 5.361598416829518e-05, + "loss": 0.0, + "step": 46990 + }, + { + "epoch": 4.384715871979099, + "grad_norm": NaN, + "learning_rate": 5.3610188652317336e-05, + "loss": 0.0, + "step": 46991 + }, + { + "epoch": 4.384809181673976, + "grad_norm": NaN, + "learning_rate": 5.3604393381430224e-05, + "loss": 0.0, + "step": 46992 + }, + { + "epoch": 4.3849024913688535, + "grad_norm": NaN, + "learning_rate": 5.359859835564859e-05, + "loss": 0.0, + "step": 46993 + }, + { + "epoch": 4.384995801063731, + "grad_norm": NaN, + "learning_rate": 5.3592803574987155e-05, + "loss": 0.0, + "step": 46994 + }, + { + "epoch": 4.385089110758607, + "grad_norm": NaN, + "learning_rate": 5.358700903946067e-05, + "loss": 0.0, + "step": 46995 + }, + { + "epoch": 4.385182420453485, + "grad_norm": NaN, + "learning_rate": 5.358121474908386e-05, + "loss": 0.0, + "step": 46996 + }, + { + "epoch": 4.385275730148362, + "grad_norm": NaN, + "learning_rate": 5.357542070387145e-05, + "loss": 0.0, + "step": 46997 + }, + { + "epoch": 4.38536903984324, + "grad_norm": NaN, + "learning_rate": 5.356962690383819e-05, + "loss": 0.0, + "step": 46998 + }, + { + "epoch": 4.385462349538117, + "grad_norm": NaN, + "learning_rate": 5.3563833348998805e-05, + "loss": 0.0, + "step": 46999 + }, + { + "epoch": 4.3855556592329945, + "grad_norm": NaN, + "learning_rate": 5.355804003936803e-05, + "loss": 0.0, + "step": 47000 + }, + { + "epoch": 4.385648968927872, + "grad_norm": NaN, + "learning_rate": 5.355224697496059e-05, + "loss": 0.0, + "step": 47001 + }, + { + "epoch": 4.3857422786227485, + "grad_norm": NaN, + "learning_rate": 5.3546454155791206e-05, + "loss": 0.0, + "step": 47002 + }, + { + "epoch": 4.385835588317626, + "grad_norm": NaN, + "learning_rate": 5.354066158187463e-05, + "loss": 0.0, + "step": 47003 + }, + { + "epoch": 4.385928898012503, + "grad_norm": NaN, + "learning_rate": 5.353486925322557e-05, + "loss": 0.0, + "step": 47004 + }, + { + "epoch": 4.386022207707381, + "grad_norm": NaN, + "learning_rate": 5.3529077169858755e-05, + "loss": 0.0, + "step": 47005 + }, + { + "epoch": 4.386115517402258, + "grad_norm": NaN, + "learning_rate": 5.3523285331788926e-05, + "loss": 0.0, + "step": 47006 + }, + { + "epoch": 4.3862088270971356, + "grad_norm": NaN, + "learning_rate": 5.3517493739030823e-05, + "loss": 0.0, + "step": 47007 + }, + { + "epoch": 4.386302136792013, + "grad_norm": NaN, + "learning_rate": 5.3511702391599124e-05, + "loss": 0.0, + "step": 47008 + }, + { + "epoch": 4.38639544648689, + "grad_norm": NaN, + "learning_rate": 5.35059112895086e-05, + "loss": 0.0, + "step": 47009 + }, + { + "epoch": 4.386488756181767, + "grad_norm": NaN, + "learning_rate": 5.3500120432773955e-05, + "loss": 0.0, + "step": 47010 + }, + { + "epoch": 4.386582065876644, + "grad_norm": NaN, + "learning_rate": 5.349432982140992e-05, + "loss": 0.0, + "step": 47011 + }, + { + "epoch": 4.386675375571522, + "grad_norm": NaN, + "learning_rate": 5.348853945543121e-05, + "loss": 0.0, + "step": 47012 + }, + { + "epoch": 4.386768685266399, + "grad_norm": NaN, + "learning_rate": 5.3482749334852564e-05, + "loss": 0.0, + "step": 47013 + }, + { + "epoch": 4.386861994961277, + "grad_norm": NaN, + "learning_rate": 5.347695945968871e-05, + "loss": 0.0, + "step": 47014 + }, + { + "epoch": 4.386955304656154, + "grad_norm": NaN, + "learning_rate": 5.347116982995434e-05, + "loss": 0.0, + "step": 47015 + }, + { + "epoch": 4.387048614351031, + "grad_norm": NaN, + "learning_rate": 5.346538044566419e-05, + "loss": 0.0, + "step": 47016 + }, + { + "epoch": 4.387141924045908, + "grad_norm": NaN, + "learning_rate": 5.3459591306832977e-05, + "loss": 0.0, + "step": 47017 + }, + { + "epoch": 4.387235233740785, + "grad_norm": NaN, + "learning_rate": 5.345380241347544e-05, + "loss": 0.0, + "step": 47018 + }, + { + "epoch": 4.387328543435663, + "grad_norm": NaN, + "learning_rate": 5.3448013765606263e-05, + "loss": 0.0, + "step": 47019 + }, + { + "epoch": 4.38742185313054, + "grad_norm": NaN, + "learning_rate": 5.34422253632402e-05, + "loss": 0.0, + "step": 47020 + }, + { + "epoch": 4.387515162825418, + "grad_norm": NaN, + "learning_rate": 5.343643720639197e-05, + "loss": 0.0, + "step": 47021 + }, + { + "epoch": 4.387608472520295, + "grad_norm": NaN, + "learning_rate": 5.3430649295076264e-05, + "loss": 0.0, + "step": 47022 + }, + { + "epoch": 4.3877017822151725, + "grad_norm": NaN, + "learning_rate": 5.34248616293078e-05, + "loss": 0.0, + "step": 47023 + }, + { + "epoch": 4.38779509191005, + "grad_norm": NaN, + "learning_rate": 5.341907420910131e-05, + "loss": 0.0, + "step": 47024 + }, + { + "epoch": 4.387888401604926, + "grad_norm": NaN, + "learning_rate": 5.34132870344715e-05, + "loss": 0.0, + "step": 47025 + }, + { + "epoch": 4.387981711299804, + "grad_norm": NaN, + "learning_rate": 5.3407500105433105e-05, + "loss": 0.0, + "step": 47026 + }, + { + "epoch": 4.388075020994681, + "grad_norm": NaN, + "learning_rate": 5.340171342200082e-05, + "loss": 0.0, + "step": 47027 + }, + { + "epoch": 4.388168330689559, + "grad_norm": NaN, + "learning_rate": 5.3395926984189366e-05, + "loss": 0.0, + "step": 47028 + }, + { + "epoch": 4.388261640384436, + "grad_norm": NaN, + "learning_rate": 5.339014079201344e-05, + "loss": 0.0, + "step": 47029 + }, + { + "epoch": 4.3883549500793135, + "grad_norm": NaN, + "learning_rate": 5.338435484548778e-05, + "loss": 0.0, + "step": 47030 + }, + { + "epoch": 4.388448259774191, + "grad_norm": NaN, + "learning_rate": 5.3378569144627074e-05, + "loss": 0.0, + "step": 47031 + }, + { + "epoch": 4.3885415694690675, + "grad_norm": NaN, + "learning_rate": 5.3372783689446064e-05, + "loss": 0.0, + "step": 47032 + }, + { + "epoch": 4.388634879163945, + "grad_norm": NaN, + "learning_rate": 5.3366998479959436e-05, + "loss": 0.0, + "step": 47033 + }, + { + "epoch": 4.388728188858822, + "grad_norm": NaN, + "learning_rate": 5.336121351618189e-05, + "loss": 0.0, + "step": 47034 + }, + { + "epoch": 4.3888214985537, + "grad_norm": NaN, + "learning_rate": 5.335542879812817e-05, + "loss": 0.0, + "step": 47035 + }, + { + "epoch": 4.388914808248577, + "grad_norm": NaN, + "learning_rate": 5.334964432581296e-05, + "loss": 0.0, + "step": 47036 + }, + { + "epoch": 4.389008117943455, + "grad_norm": NaN, + "learning_rate": 5.334386009925097e-05, + "loss": 0.0, + "step": 47037 + }, + { + "epoch": 4.389101427638332, + "grad_norm": NaN, + "learning_rate": 5.3338076118456925e-05, + "loss": 0.0, + "step": 47038 + }, + { + "epoch": 4.3891947373332085, + "grad_norm": NaN, + "learning_rate": 5.33322923834455e-05, + "loss": 0.0, + "step": 47039 + }, + { + "epoch": 4.389288047028086, + "grad_norm": NaN, + "learning_rate": 5.3326508894231426e-05, + "loss": 0.0, + "step": 47040 + }, + { + "epoch": 4.389381356722963, + "grad_norm": NaN, + "learning_rate": 5.332072565082941e-05, + "loss": 0.0, + "step": 47041 + }, + { + "epoch": 4.389474666417841, + "grad_norm": NaN, + "learning_rate": 5.331494265325415e-05, + "loss": 0.0, + "step": 47042 + }, + { + "epoch": 4.389567976112718, + "grad_norm": NaN, + "learning_rate": 5.3309159901520356e-05, + "loss": 0.0, + "step": 47043 + }, + { + "epoch": 4.389661285807596, + "grad_norm": NaN, + "learning_rate": 5.330337739564271e-05, + "loss": 0.0, + "step": 47044 + }, + { + "epoch": 4.389754595502473, + "grad_norm": NaN, + "learning_rate": 5.329759513563594e-05, + "loss": 0.0, + "step": 47045 + }, + { + "epoch": 4.38984790519735, + "grad_norm": NaN, + "learning_rate": 5.329181312151475e-05, + "loss": 0.0, + "step": 47046 + }, + { + "epoch": 4.389941214892227, + "grad_norm": NaN, + "learning_rate": 5.3286031353293826e-05, + "loss": 0.0, + "step": 47047 + }, + { + "epoch": 4.390034524587104, + "grad_norm": NaN, + "learning_rate": 5.328024983098787e-05, + "loss": 0.0, + "step": 47048 + }, + { + "epoch": 4.390127834281982, + "grad_norm": NaN, + "learning_rate": 5.327446855461159e-05, + "loss": 0.0, + "step": 47049 + }, + { + "epoch": 4.390221143976859, + "grad_norm": NaN, + "learning_rate": 5.326868752417968e-05, + "loss": 0.0, + "step": 47050 + }, + { + "epoch": 4.390314453671737, + "grad_norm": NaN, + "learning_rate": 5.326290673970686e-05, + "loss": 0.0, + "step": 47051 + }, + { + "epoch": 4.390407763366614, + "grad_norm": NaN, + "learning_rate": 5.325712620120779e-05, + "loss": 0.0, + "step": 47052 + }, + { + "epoch": 4.3905010730614915, + "grad_norm": NaN, + "learning_rate": 5.32513459086972e-05, + "loss": 0.0, + "step": 47053 + }, + { + "epoch": 4.390594382756368, + "grad_norm": NaN, + "learning_rate": 5.3245565862189783e-05, + "loss": 0.0, + "step": 47054 + }, + { + "epoch": 4.3906876924512455, + "grad_norm": NaN, + "learning_rate": 5.323978606170024e-05, + "loss": 0.0, + "step": 47055 + }, + { + "epoch": 4.390781002146123, + "grad_norm": NaN, + "learning_rate": 5.323400650724323e-05, + "loss": 0.0, + "step": 47056 + }, + { + "epoch": 4.390874311841, + "grad_norm": NaN, + "learning_rate": 5.32282271988335e-05, + "loss": 0.0, + "step": 47057 + }, + { + "epoch": 4.390967621535878, + "grad_norm": NaN, + "learning_rate": 5.322244813648571e-05, + "loss": 0.0, + "step": 47058 + }, + { + "epoch": 4.391060931230755, + "grad_norm": NaN, + "learning_rate": 5.3216669320214575e-05, + "loss": 0.0, + "step": 47059 + }, + { + "epoch": 4.391154240925633, + "grad_norm": NaN, + "learning_rate": 5.321089075003478e-05, + "loss": 0.0, + "step": 47060 + }, + { + "epoch": 4.391247550620509, + "grad_norm": NaN, + "learning_rate": 5.320511242596102e-05, + "loss": 0.0, + "step": 47061 + }, + { + "epoch": 4.3913408603153865, + "grad_norm": NaN, + "learning_rate": 5.319933434800799e-05, + "loss": 0.0, + "step": 47062 + }, + { + "epoch": 4.391434170010264, + "grad_norm": NaN, + "learning_rate": 5.319355651619037e-05, + "loss": 0.0, + "step": 47063 + }, + { + "epoch": 4.391527479705141, + "grad_norm": NaN, + "learning_rate": 5.318777893052286e-05, + "loss": 0.0, + "step": 47064 + }, + { + "epoch": 4.391620789400019, + "grad_norm": NaN, + "learning_rate": 5.318200159102015e-05, + "loss": 0.0, + "step": 47065 + }, + { + "epoch": 4.391714099094896, + "grad_norm": NaN, + "learning_rate": 5.317622449769694e-05, + "loss": 0.0, + "step": 47066 + }, + { + "epoch": 4.391807408789774, + "grad_norm": NaN, + "learning_rate": 5.31704476505679e-05, + "loss": 0.0, + "step": 47067 + }, + { + "epoch": 4.391900718484651, + "grad_norm": NaN, + "learning_rate": 5.316467104964773e-05, + "loss": 0.0, + "step": 47068 + }, + { + "epoch": 4.3919940281795276, + "grad_norm": NaN, + "learning_rate": 5.3158894694951116e-05, + "loss": 0.0, + "step": 47069 + }, + { + "epoch": 4.392087337874405, + "grad_norm": NaN, + "learning_rate": 5.315311858649275e-05, + "loss": 0.0, + "step": 47070 + }, + { + "epoch": 4.392180647569282, + "grad_norm": NaN, + "learning_rate": 5.31473427242873e-05, + "loss": 0.0, + "step": 47071 + }, + { + "epoch": 4.39227395726416, + "grad_norm": NaN, + "learning_rate": 5.314156710834949e-05, + "loss": 0.0, + "step": 47072 + }, + { + "epoch": 4.392367266959037, + "grad_norm": NaN, + "learning_rate": 5.313579173869398e-05, + "loss": 0.0, + "step": 47073 + }, + { + "epoch": 4.392460576653915, + "grad_norm": NaN, + "learning_rate": 5.3130016615335445e-05, + "loss": 0.0, + "step": 47074 + }, + { + "epoch": 4.392553886348791, + "grad_norm": NaN, + "learning_rate": 5.312424173828859e-05, + "loss": 0.0, + "step": 47075 + }, + { + "epoch": 4.392647196043669, + "grad_norm": NaN, + "learning_rate": 5.311846710756809e-05, + "loss": 0.0, + "step": 47076 + }, + { + "epoch": 4.392740505738546, + "grad_norm": NaN, + "learning_rate": 5.3112692723188635e-05, + "loss": 0.0, + "step": 47077 + }, + { + "epoch": 4.392833815433423, + "grad_norm": NaN, + "learning_rate": 5.31069185851649e-05, + "loss": 0.0, + "step": 47078 + }, + { + "epoch": 4.392927125128301, + "grad_norm": NaN, + "learning_rate": 5.310114469351157e-05, + "loss": 0.0, + "step": 47079 + }, + { + "epoch": 4.393020434823178, + "grad_norm": NaN, + "learning_rate": 5.3095371048243325e-05, + "loss": 0.0, + "step": 47080 + }, + { + "epoch": 4.393113744518056, + "grad_norm": NaN, + "learning_rate": 5.308959764937484e-05, + "loss": 0.0, + "step": 47081 + }, + { + "epoch": 4.393207054212933, + "grad_norm": NaN, + "learning_rate": 5.30838244969208e-05, + "loss": 0.0, + "step": 47082 + }, + { + "epoch": 4.39330036390781, + "grad_norm": NaN, + "learning_rate": 5.3078051590895885e-05, + "loss": 0.0, + "step": 47083 + }, + { + "epoch": 4.393393673602687, + "grad_norm": NaN, + "learning_rate": 5.307227893131479e-05, + "loss": 0.0, + "step": 47084 + }, + { + "epoch": 4.3934869832975645, + "grad_norm": NaN, + "learning_rate": 5.306650651819216e-05, + "loss": 0.0, + "step": 47085 + }, + { + "epoch": 4.393580292992442, + "grad_norm": NaN, + "learning_rate": 5.3060734351542707e-05, + "loss": 0.0, + "step": 47086 + }, + { + "epoch": 4.393673602687319, + "grad_norm": NaN, + "learning_rate": 5.305496243138108e-05, + "loss": 0.0, + "step": 47087 + }, + { + "epoch": 4.393766912382197, + "grad_norm": NaN, + "learning_rate": 5.3049190757721976e-05, + "loss": 0.0, + "step": 47088 + }, + { + "epoch": 4.393860222077074, + "grad_norm": NaN, + "learning_rate": 5.3043419330580045e-05, + "loss": 0.0, + "step": 47089 + }, + { + "epoch": 4.393953531771951, + "grad_norm": NaN, + "learning_rate": 5.303764814996999e-05, + "loss": 0.0, + "step": 47090 + }, + { + "epoch": 4.394046841466828, + "grad_norm": NaN, + "learning_rate": 5.303187721590647e-05, + "loss": 0.0, + "step": 47091 + }, + { + "epoch": 4.3941401511617055, + "grad_norm": NaN, + "learning_rate": 5.302610652840416e-05, + "loss": 0.0, + "step": 47092 + }, + { + "epoch": 4.394233460856583, + "grad_norm": NaN, + "learning_rate": 5.302033608747773e-05, + "loss": 0.0, + "step": 47093 + }, + { + "epoch": 4.39432677055146, + "grad_norm": NaN, + "learning_rate": 5.301456589314187e-05, + "loss": 0.0, + "step": 47094 + }, + { + "epoch": 4.394420080246338, + "grad_norm": NaN, + "learning_rate": 5.3008795945411236e-05, + "loss": 0.0, + "step": 47095 + }, + { + "epoch": 4.394513389941215, + "grad_norm": NaN, + "learning_rate": 5.300302624430051e-05, + "loss": 0.0, + "step": 47096 + }, + { + "epoch": 4.394606699636093, + "grad_norm": NaN, + "learning_rate": 5.2997256789824336e-05, + "loss": 0.0, + "step": 47097 + }, + { + "epoch": 4.394700009330969, + "grad_norm": NaN, + "learning_rate": 5.299148758199742e-05, + "loss": 0.0, + "step": 47098 + }, + { + "epoch": 4.394793319025847, + "grad_norm": NaN, + "learning_rate": 5.298571862083441e-05, + "loss": 0.0, + "step": 47099 + }, + { + "epoch": 4.394886628720724, + "grad_norm": NaN, + "learning_rate": 5.297994990634998e-05, + "loss": 0.0, + "step": 47100 + }, + { + "epoch": 4.394979938415601, + "grad_norm": NaN, + "learning_rate": 5.2974181438558796e-05, + "loss": 0.0, + "step": 47101 + }, + { + "epoch": 4.395073248110479, + "grad_norm": NaN, + "learning_rate": 5.296841321747553e-05, + "loss": 0.0, + "step": 47102 + }, + { + "epoch": 4.395166557805356, + "grad_norm": NaN, + "learning_rate": 5.2962645243114844e-05, + "loss": 0.0, + "step": 47103 + }, + { + "epoch": 4.395259867500234, + "grad_norm": NaN, + "learning_rate": 5.295687751549142e-05, + "loss": 0.0, + "step": 47104 + }, + { + "epoch": 4.39535317719511, + "grad_norm": NaN, + "learning_rate": 5.29511100346199e-05, + "loss": 0.0, + "step": 47105 + }, + { + "epoch": 4.395446486889988, + "grad_norm": NaN, + "learning_rate": 5.294534280051495e-05, + "loss": 0.0, + "step": 47106 + }, + { + "epoch": 4.395539796584865, + "grad_norm": NaN, + "learning_rate": 5.2939575813191246e-05, + "loss": 0.0, + "step": 47107 + }, + { + "epoch": 4.3956331062797425, + "grad_norm": NaN, + "learning_rate": 5.293380907266346e-05, + "loss": 0.0, + "step": 47108 + }, + { + "epoch": 4.39572641597462, + "grad_norm": NaN, + "learning_rate": 5.2928042578946234e-05, + "loss": 0.0, + "step": 47109 + }, + { + "epoch": 4.395819725669497, + "grad_norm": NaN, + "learning_rate": 5.292227633205425e-05, + "loss": 0.0, + "step": 47110 + }, + { + "epoch": 4.395913035364375, + "grad_norm": NaN, + "learning_rate": 5.291651033200216e-05, + "loss": 0.0, + "step": 47111 + }, + { + "epoch": 4.396006345059251, + "grad_norm": NaN, + "learning_rate": 5.291074457880461e-05, + "loss": 0.0, + "step": 47112 + }, + { + "epoch": 4.396099654754129, + "grad_norm": NaN, + "learning_rate": 5.29049790724763e-05, + "loss": 0.0, + "step": 47113 + }, + { + "epoch": 4.396192964449006, + "grad_norm": NaN, + "learning_rate": 5.289921381303185e-05, + "loss": 0.0, + "step": 47114 + }, + { + "epoch": 4.3962862741438835, + "grad_norm": NaN, + "learning_rate": 5.289344880048594e-05, + "loss": 0.0, + "step": 47115 + }, + { + "epoch": 4.396379583838761, + "grad_norm": NaN, + "learning_rate": 5.2887684034853225e-05, + "loss": 0.0, + "step": 47116 + }, + { + "epoch": 4.396472893533638, + "grad_norm": NaN, + "learning_rate": 5.288191951614836e-05, + "loss": 0.0, + "step": 47117 + }, + { + "epoch": 4.396566203228516, + "grad_norm": NaN, + "learning_rate": 5.287615524438602e-05, + "loss": 0.0, + "step": 47118 + }, + { + "epoch": 4.396659512923392, + "grad_norm": NaN, + "learning_rate": 5.287039121958083e-05, + "loss": 0.0, + "step": 47119 + }, + { + "epoch": 4.39675282261827, + "grad_norm": NaN, + "learning_rate": 5.286462744174747e-05, + "loss": 0.0, + "step": 47120 + }, + { + "epoch": 4.396846132313147, + "grad_norm": NaN, + "learning_rate": 5.285886391090059e-05, + "loss": 0.0, + "step": 47121 + }, + { + "epoch": 4.396939442008025, + "grad_norm": NaN, + "learning_rate": 5.285310062705484e-05, + "loss": 0.0, + "step": 47122 + }, + { + "epoch": 4.397032751702902, + "grad_norm": NaN, + "learning_rate": 5.2847337590224876e-05, + "loss": 0.0, + "step": 47123 + }, + { + "epoch": 4.397126061397779, + "grad_norm": NaN, + "learning_rate": 5.284157480042537e-05, + "loss": 0.0, + "step": 47124 + }, + { + "epoch": 4.397219371092657, + "grad_norm": NaN, + "learning_rate": 5.2835812257670946e-05, + "loss": 0.0, + "step": 47125 + }, + { + "epoch": 4.397312680787534, + "grad_norm": NaN, + "learning_rate": 5.283004996197626e-05, + "loss": 0.0, + "step": 47126 + }, + { + "epoch": 4.397405990482411, + "grad_norm": NaN, + "learning_rate": 5.282428791335594e-05, + "loss": 0.0, + "step": 47127 + }, + { + "epoch": 4.397499300177288, + "grad_norm": NaN, + "learning_rate": 5.281852611182477e-05, + "loss": 0.0, + "step": 47128 + }, + { + "epoch": 4.397592609872166, + "grad_norm": NaN, + "learning_rate": 5.281276455739723e-05, + "loss": 0.0, + "step": 47129 + }, + { + "epoch": 4.397685919567043, + "grad_norm": NaN, + "learning_rate": 5.2807003250088e-05, + "loss": 0.0, + "step": 47130 + }, + { + "epoch": 4.39777922926192, + "grad_norm": NaN, + "learning_rate": 5.2801242189911875e-05, + "loss": 0.0, + "step": 47131 + }, + { + "epoch": 4.397872538956798, + "grad_norm": NaN, + "learning_rate": 5.279548137688335e-05, + "loss": 0.0, + "step": 47132 + }, + { + "epoch": 4.397965848651675, + "grad_norm": NaN, + "learning_rate": 5.278972081101708e-05, + "loss": 0.0, + "step": 47133 + }, + { + "epoch": 4.398059158346552, + "grad_norm": NaN, + "learning_rate": 5.2783960492327837e-05, + "loss": 0.0, + "step": 47134 + }, + { + "epoch": 4.398152468041429, + "grad_norm": NaN, + "learning_rate": 5.277820042083015e-05, + "loss": 0.0, + "step": 47135 + }, + { + "epoch": 4.398245777736307, + "grad_norm": NaN, + "learning_rate": 5.277244059653868e-05, + "loss": 0.0, + "step": 47136 + }, + { + "epoch": 4.398339087431184, + "grad_norm": NaN, + "learning_rate": 5.27666810194681e-05, + "loss": 0.0, + "step": 47137 + }, + { + "epoch": 4.3984323971260615, + "grad_norm": NaN, + "learning_rate": 5.276092168963306e-05, + "loss": 0.0, + "step": 47138 + }, + { + "epoch": 4.398525706820939, + "grad_norm": NaN, + "learning_rate": 5.275516260704819e-05, + "loss": 0.0, + "step": 47139 + }, + { + "epoch": 4.398619016515816, + "grad_norm": NaN, + "learning_rate": 5.274940377172813e-05, + "loss": 0.0, + "step": 47140 + }, + { + "epoch": 4.398712326210694, + "grad_norm": NaN, + "learning_rate": 5.274364518368753e-05, + "loss": 0.0, + "step": 47141 + }, + { + "epoch": 4.39880563590557, + "grad_norm": NaN, + "learning_rate": 5.273788684294103e-05, + "loss": 0.0, + "step": 47142 + }, + { + "epoch": 4.398898945600448, + "grad_norm": NaN, + "learning_rate": 5.273212874950327e-05, + "loss": 0.0, + "step": 47143 + }, + { + "epoch": 4.398992255295325, + "grad_norm": NaN, + "learning_rate": 5.272637090338889e-05, + "loss": 0.0, + "step": 47144 + }, + { + "epoch": 4.3990855649902025, + "grad_norm": NaN, + "learning_rate": 5.2720613304612543e-05, + "loss": 0.0, + "step": 47145 + }, + { + "epoch": 4.39917887468508, + "grad_norm": NaN, + "learning_rate": 5.271485595318886e-05, + "loss": 0.0, + "step": 47146 + }, + { + "epoch": 4.399272184379957, + "grad_norm": NaN, + "learning_rate": 5.270909884913247e-05, + "loss": 0.0, + "step": 47147 + }, + { + "epoch": 4.399365494074834, + "grad_norm": NaN, + "learning_rate": 5.270334199245803e-05, + "loss": 0.0, + "step": 47148 + }, + { + "epoch": 4.399458803769711, + "grad_norm": NaN, + "learning_rate": 5.269758538318016e-05, + "loss": 0.0, + "step": 47149 + }, + { + "epoch": 4.399552113464589, + "grad_norm": NaN, + "learning_rate": 5.2691829021313526e-05, + "loss": 0.0, + "step": 47150 + }, + { + "epoch": 4.399645423159466, + "grad_norm": NaN, + "learning_rate": 5.2686072906872735e-05, + "loss": 0.0, + "step": 47151 + }, + { + "epoch": 4.399738732854344, + "grad_norm": NaN, + "learning_rate": 5.268031703987245e-05, + "loss": 0.0, + "step": 47152 + }, + { + "epoch": 4.399832042549221, + "grad_norm": NaN, + "learning_rate": 5.267456142032726e-05, + "loss": 0.0, + "step": 47153 + }, + { + "epoch": 4.399925352244098, + "grad_norm": NaN, + "learning_rate": 5.266880604825186e-05, + "loss": 0.0, + "step": 47154 + }, + { + "epoch": 4.400018661938976, + "grad_norm": NaN, + "learning_rate": 5.266305092366083e-05, + "loss": 0.0, + "step": 47155 + }, + { + "epoch": 4.400111971633852, + "grad_norm": NaN, + "learning_rate": 5.265729604656883e-05, + "loss": 0.0, + "step": 47156 + }, + { + "epoch": 4.40020528132873, + "grad_norm": NaN, + "learning_rate": 5.2651541416990506e-05, + "loss": 0.0, + "step": 47157 + }, + { + "epoch": 4.400298591023607, + "grad_norm": NaN, + "learning_rate": 5.2645787034940476e-05, + "loss": 0.0, + "step": 47158 + }, + { + "epoch": 4.400391900718485, + "grad_norm": NaN, + "learning_rate": 5.264003290043335e-05, + "loss": 0.0, + "step": 47159 + }, + { + "epoch": 4.400485210413362, + "grad_norm": NaN, + "learning_rate": 5.2634279013483803e-05, + "loss": 0.0, + "step": 47160 + }, + { + "epoch": 4.4005785201082395, + "grad_norm": NaN, + "learning_rate": 5.2628525374106434e-05, + "loss": 0.0, + "step": 47161 + }, + { + "epoch": 4.400671829803117, + "grad_norm": NaN, + "learning_rate": 5.262277198231588e-05, + "loss": 0.0, + "step": 47162 + }, + { + "epoch": 4.400765139497993, + "grad_norm": NaN, + "learning_rate": 5.2617018838126786e-05, + "loss": 0.0, + "step": 47163 + }, + { + "epoch": 4.400858449192871, + "grad_norm": NaN, + "learning_rate": 5.261126594155375e-05, + "loss": 0.0, + "step": 47164 + }, + { + "epoch": 4.400951758887748, + "grad_norm": NaN, + "learning_rate": 5.260551329261136e-05, + "loss": 0.0, + "step": 47165 + }, + { + "epoch": 4.401045068582626, + "grad_norm": NaN, + "learning_rate": 5.2599760891314415e-05, + "loss": 0.0, + "step": 47166 + }, + { + "epoch": 4.401138378277503, + "grad_norm": NaN, + "learning_rate": 5.2594008737677375e-05, + "loss": 0.0, + "step": 47167 + }, + { + "epoch": 4.4012316879723805, + "grad_norm": NaN, + "learning_rate": 5.258825683171485e-05, + "loss": 0.0, + "step": 47168 + }, + { + "epoch": 4.401324997667258, + "grad_norm": NaN, + "learning_rate": 5.258250517344164e-05, + "loss": 0.0, + "step": 47169 + }, + { + "epoch": 4.401418307362135, + "grad_norm": NaN, + "learning_rate": 5.257675376287223e-05, + "loss": 0.0, + "step": 47170 + }, + { + "epoch": 4.401511617057012, + "grad_norm": NaN, + "learning_rate": 5.257100260002121e-05, + "loss": 0.0, + "step": 47171 + }, + { + "epoch": 4.401604926751889, + "grad_norm": NaN, + "learning_rate": 5.256525168490337e-05, + "loss": 0.0, + "step": 47172 + }, + { + "epoch": 4.401698236446767, + "grad_norm": NaN, + "learning_rate": 5.255950101753319e-05, + "loss": 0.0, + "step": 47173 + }, + { + "epoch": 4.401791546141644, + "grad_norm": NaN, + "learning_rate": 5.255375059792527e-05, + "loss": 0.0, + "step": 47174 + }, + { + "epoch": 4.401884855836522, + "grad_norm": NaN, + "learning_rate": 5.2548000426094414e-05, + "loss": 0.0, + "step": 47175 + }, + { + "epoch": 4.401978165531399, + "grad_norm": NaN, + "learning_rate": 5.2542250502055066e-05, + "loss": 0.0, + "step": 47176 + }, + { + "epoch": 4.402071475226276, + "grad_norm": NaN, + "learning_rate": 5.253650082582186e-05, + "loss": 0.0, + "step": 47177 + }, + { + "epoch": 4.402164784921153, + "grad_norm": NaN, + "learning_rate": 5.2530751397409564e-05, + "loss": 0.0, + "step": 47178 + }, + { + "epoch": 4.40225809461603, + "grad_norm": NaN, + "learning_rate": 5.252500221683264e-05, + "loss": 0.0, + "step": 47179 + }, + { + "epoch": 4.402351404310908, + "grad_norm": NaN, + "learning_rate": 5.2519253284105755e-05, + "loss": 0.0, + "step": 47180 + }, + { + "epoch": 4.402444714005785, + "grad_norm": NaN, + "learning_rate": 5.251350459924353e-05, + "loss": 0.0, + "step": 47181 + }, + { + "epoch": 4.402538023700663, + "grad_norm": NaN, + "learning_rate": 5.250775616226061e-05, + "loss": 0.0, + "step": 47182 + }, + { + "epoch": 4.40263133339554, + "grad_norm": NaN, + "learning_rate": 5.250200797317156e-05, + "loss": 0.0, + "step": 47183 + }, + { + "epoch": 4.4027246430904174, + "grad_norm": NaN, + "learning_rate": 5.2496260031991037e-05, + "loss": 0.0, + "step": 47184 + }, + { + "epoch": 4.402817952785295, + "grad_norm": NaN, + "learning_rate": 5.249051233873365e-05, + "loss": 0.0, + "step": 47185 + }, + { + "epoch": 4.402911262480171, + "grad_norm": NaN, + "learning_rate": 5.2484764893413975e-05, + "loss": 0.0, + "step": 47186 + }, + { + "epoch": 4.403004572175049, + "grad_norm": NaN, + "learning_rate": 5.2479017696046685e-05, + "loss": 0.0, + "step": 47187 + }, + { + "epoch": 4.403097881869926, + "grad_norm": NaN, + "learning_rate": 5.247327074664635e-05, + "loss": 0.0, + "step": 47188 + }, + { + "epoch": 4.403191191564804, + "grad_norm": NaN, + "learning_rate": 5.2467524045227606e-05, + "loss": 0.0, + "step": 47189 + }, + { + "epoch": 4.403284501259681, + "grad_norm": NaN, + "learning_rate": 5.2461777591805064e-05, + "loss": 0.0, + "step": 47190 + }, + { + "epoch": 4.4033778109545585, + "grad_norm": NaN, + "learning_rate": 5.245603138639332e-05, + "loss": 0.0, + "step": 47191 + }, + { + "epoch": 4.403471120649435, + "grad_norm": NaN, + "learning_rate": 5.2450285429006996e-05, + "loss": 0.0, + "step": 47192 + }, + { + "epoch": 4.403564430344312, + "grad_norm": NaN, + "learning_rate": 5.244453971966069e-05, + "loss": 0.0, + "step": 47193 + }, + { + "epoch": 4.40365774003919, + "grad_norm": NaN, + "learning_rate": 5.2438794258369024e-05, + "loss": 0.0, + "step": 47194 + }, + { + "epoch": 4.403751049734067, + "grad_norm": NaN, + "learning_rate": 5.243304904514661e-05, + "loss": 0.0, + "step": 47195 + }, + { + "epoch": 4.403844359428945, + "grad_norm": NaN, + "learning_rate": 5.242730408000805e-05, + "loss": 0.0, + "step": 47196 + }, + { + "epoch": 4.403937669123822, + "grad_norm": NaN, + "learning_rate": 5.242155936296795e-05, + "loss": 0.0, + "step": 47197 + }, + { + "epoch": 4.4040309788186995, + "grad_norm": NaN, + "learning_rate": 5.241581489404093e-05, + "loss": 0.0, + "step": 47198 + }, + { + "epoch": 4.404124288513577, + "grad_norm": NaN, + "learning_rate": 5.241007067324158e-05, + "loss": 0.0, + "step": 47199 + }, + { + "epoch": 4.4042175982084535, + "grad_norm": NaN, + "learning_rate": 5.2404326700584515e-05, + "loss": 0.0, + "step": 47200 + }, + { + "epoch": 4.404310907903331, + "grad_norm": NaN, + "learning_rate": 5.239858297608433e-05, + "loss": 0.0, + "step": 47201 + }, + { + "epoch": 4.404404217598208, + "grad_norm": NaN, + "learning_rate": 5.239283949975563e-05, + "loss": 0.0, + "step": 47202 + }, + { + "epoch": 4.404497527293086, + "grad_norm": NaN, + "learning_rate": 5.2387096271613e-05, + "loss": 0.0, + "step": 47203 + }, + { + "epoch": 4.404590836987963, + "grad_norm": NaN, + "learning_rate": 5.2381353291671145e-05, + "loss": 0.0, + "step": 47204 + }, + { + "epoch": 4.404684146682841, + "grad_norm": NaN, + "learning_rate": 5.237561055994456e-05, + "loss": 0.0, + "step": 47205 + }, + { + "epoch": 4.404777456377718, + "grad_norm": NaN, + "learning_rate": 5.236986807644781e-05, + "loss": 0.0, + "step": 47206 + }, + { + "epoch": 4.4048707660725945, + "grad_norm": NaN, + "learning_rate": 5.236412584119568e-05, + "loss": 0.0, + "step": 47207 + }, + { + "epoch": 4.404964075767472, + "grad_norm": NaN, + "learning_rate": 5.2358383854202604e-05, + "loss": 0.0, + "step": 47208 + }, + { + "epoch": 4.405057385462349, + "grad_norm": NaN, + "learning_rate": 5.235264211548317e-05, + "loss": 0.0, + "step": 47209 + }, + { + "epoch": 4.405150695157227, + "grad_norm": NaN, + "learning_rate": 5.2346900625052166e-05, + "loss": 0.0, + "step": 47210 + }, + { + "epoch": 4.405244004852104, + "grad_norm": NaN, + "learning_rate": 5.234115938292399e-05, + "loss": 0.0, + "step": 47211 + }, + { + "epoch": 4.405337314546982, + "grad_norm": NaN, + "learning_rate": 5.23354183891133e-05, + "loss": 0.0, + "step": 47212 + }, + { + "epoch": 4.405430624241859, + "grad_norm": NaN, + "learning_rate": 5.2329677643634775e-05, + "loss": 0.0, + "step": 47213 + }, + { + "epoch": 4.4055239339367365, + "grad_norm": NaN, + "learning_rate": 5.232393714650292e-05, + "loss": 0.0, + "step": 47214 + }, + { + "epoch": 4.405617243631613, + "grad_norm": NaN, + "learning_rate": 5.23181968977323e-05, + "loss": 0.0, + "step": 47215 + }, + { + "epoch": 4.40571055332649, + "grad_norm": NaN, + "learning_rate": 5.2312456897337676e-05, + "loss": 0.0, + "step": 47216 + }, + { + "epoch": 4.405803863021368, + "grad_norm": NaN, + "learning_rate": 5.2306717145333484e-05, + "loss": 0.0, + "step": 47217 + }, + { + "epoch": 4.405897172716245, + "grad_norm": NaN, + "learning_rate": 5.230097764173433e-05, + "loss": 0.0, + "step": 47218 + }, + { + "epoch": 4.405990482411123, + "grad_norm": NaN, + "learning_rate": 5.229523838655494e-05, + "loss": 0.0, + "step": 47219 + }, + { + "epoch": 4.406083792106, + "grad_norm": NaN, + "learning_rate": 5.228949937980976e-05, + "loss": 0.0, + "step": 47220 + }, + { + "epoch": 4.4061771018008775, + "grad_norm": NaN, + "learning_rate": 5.228376062151339e-05, + "loss": 0.0, + "step": 47221 + }, + { + "epoch": 4.406270411495754, + "grad_norm": NaN, + "learning_rate": 5.227802211168059e-05, + "loss": 0.0, + "step": 47222 + }, + { + "epoch": 4.4063637211906315, + "grad_norm": NaN, + "learning_rate": 5.2272283850325775e-05, + "loss": 0.0, + "step": 47223 + }, + { + "epoch": 4.406457030885509, + "grad_norm": NaN, + "learning_rate": 5.2266545837463584e-05, + "loss": 0.0, + "step": 47224 + }, + { + "epoch": 4.406550340580386, + "grad_norm": NaN, + "learning_rate": 5.2260808073108614e-05, + "loss": 0.0, + "step": 47225 + }, + { + "epoch": 4.406643650275264, + "grad_norm": NaN, + "learning_rate": 5.225507055727548e-05, + "loss": 0.0, + "step": 47226 + }, + { + "epoch": 4.406736959970141, + "grad_norm": NaN, + "learning_rate": 5.224933328997871e-05, + "loss": 0.0, + "step": 47227 + }, + { + "epoch": 4.406830269665019, + "grad_norm": NaN, + "learning_rate": 5.2243596271232965e-05, + "loss": 0.0, + "step": 47228 + }, + { + "epoch": 4.406923579359895, + "grad_norm": NaN, + "learning_rate": 5.223785950105278e-05, + "loss": 0.0, + "step": 47229 + }, + { + "epoch": 4.4070168890547725, + "grad_norm": NaN, + "learning_rate": 5.2232122979452757e-05, + "loss": 0.0, + "step": 47230 + }, + { + "epoch": 4.40711019874965, + "grad_norm": NaN, + "learning_rate": 5.2226386706447494e-05, + "loss": 0.0, + "step": 47231 + }, + { + "epoch": 4.407203508444527, + "grad_norm": NaN, + "learning_rate": 5.2220650682051545e-05, + "loss": 0.0, + "step": 47232 + }, + { + "epoch": 4.407296818139405, + "grad_norm": NaN, + "learning_rate": 5.221491490627954e-05, + "loss": 0.0, + "step": 47233 + }, + { + "epoch": 4.407390127834282, + "grad_norm": NaN, + "learning_rate": 5.220917937914604e-05, + "loss": 0.0, + "step": 47234 + }, + { + "epoch": 4.40748343752916, + "grad_norm": NaN, + "learning_rate": 5.220344410066562e-05, + "loss": 0.0, + "step": 47235 + }, + { + "epoch": 4.407576747224036, + "grad_norm": NaN, + "learning_rate": 5.219770907085287e-05, + "loss": 0.0, + "step": 47236 + }, + { + "epoch": 4.407670056918914, + "grad_norm": NaN, + "learning_rate": 5.219197428972238e-05, + "loss": 0.0, + "step": 47237 + }, + { + "epoch": 4.407763366613791, + "grad_norm": NaN, + "learning_rate": 5.218623975728872e-05, + "loss": 0.0, + "step": 47238 + }, + { + "epoch": 4.407856676308668, + "grad_norm": NaN, + "learning_rate": 5.21805054735665e-05, + "loss": 0.0, + "step": 47239 + }, + { + "epoch": 4.407949986003546, + "grad_norm": NaN, + "learning_rate": 5.217477143857024e-05, + "loss": 0.0, + "step": 47240 + }, + { + "epoch": 4.408043295698423, + "grad_norm": NaN, + "learning_rate": 5.2169037652314536e-05, + "loss": 0.0, + "step": 47241 + }, + { + "epoch": 4.408136605393301, + "grad_norm": NaN, + "learning_rate": 5.2163304114814096e-05, + "loss": 0.0, + "step": 47242 + }, + { + "epoch": 4.408229915088178, + "grad_norm": NaN, + "learning_rate": 5.215757082608333e-05, + "loss": 0.0, + "step": 47243 + }, + { + "epoch": 4.408323224783055, + "grad_norm": NaN, + "learning_rate": 5.215183778613682e-05, + "loss": 0.0, + "step": 47244 + }, + { + "epoch": 4.408416534477932, + "grad_norm": NaN, + "learning_rate": 5.2146104994989304e-05, + "loss": 0.0, + "step": 47245 + }, + { + "epoch": 4.4085098441728094, + "grad_norm": NaN, + "learning_rate": 5.2140372452655217e-05, + "loss": 0.0, + "step": 47246 + }, + { + "epoch": 4.408603153867687, + "grad_norm": NaN, + "learning_rate": 5.213464015914913e-05, + "loss": 0.0, + "step": 47247 + }, + { + "epoch": 4.408696463562564, + "grad_norm": NaN, + "learning_rate": 5.212890811448574e-05, + "loss": 0.0, + "step": 47248 + }, + { + "epoch": 4.408789773257442, + "grad_norm": NaN, + "learning_rate": 5.212317631867951e-05, + "loss": 0.0, + "step": 47249 + }, + { + "epoch": 4.408883082952319, + "grad_norm": NaN, + "learning_rate": 5.2117444771745e-05, + "loss": 0.0, + "step": 47250 + }, + { + "epoch": 4.408976392647196, + "grad_norm": NaN, + "learning_rate": 5.211171347369693e-05, + "loss": 0.0, + "step": 47251 + }, + { + "epoch": 4.409069702342073, + "grad_norm": NaN, + "learning_rate": 5.210598242454973e-05, + "loss": 0.0, + "step": 47252 + }, + { + "epoch": 4.4091630120369505, + "grad_norm": NaN, + "learning_rate": 5.2100251624317966e-05, + "loss": 0.0, + "step": 47253 + }, + { + "epoch": 4.409256321731828, + "grad_norm": NaN, + "learning_rate": 5.2094521073016366e-05, + "loss": 0.0, + "step": 47254 + }, + { + "epoch": 4.409349631426705, + "grad_norm": NaN, + "learning_rate": 5.208879077065934e-05, + "loss": 0.0, + "step": 47255 + }, + { + "epoch": 4.409442941121583, + "grad_norm": NaN, + "learning_rate": 5.208306071726146e-05, + "loss": 0.0, + "step": 47256 + }, + { + "epoch": 4.40953625081646, + "grad_norm": NaN, + "learning_rate": 5.2077330912837465e-05, + "loss": 0.0, + "step": 47257 + }, + { + "epoch": 4.409629560511338, + "grad_norm": NaN, + "learning_rate": 5.207160135740175e-05, + "loss": 0.0, + "step": 47258 + }, + { + "epoch": 4.409722870206214, + "grad_norm": NaN, + "learning_rate": 5.206587205096891e-05, + "loss": 0.0, + "step": 47259 + }, + { + "epoch": 4.4098161799010915, + "grad_norm": NaN, + "learning_rate": 5.2060142993553636e-05, + "loss": 0.0, + "step": 47260 + }, + { + "epoch": 4.409909489595969, + "grad_norm": NaN, + "learning_rate": 5.205441418517036e-05, + "loss": 0.0, + "step": 47261 + }, + { + "epoch": 4.410002799290846, + "grad_norm": NaN, + "learning_rate": 5.2048685625833635e-05, + "loss": 0.0, + "step": 47262 + }, + { + "epoch": 4.410096108985724, + "grad_norm": NaN, + "learning_rate": 5.20429573155582e-05, + "loss": 0.0, + "step": 47263 + }, + { + "epoch": 4.410189418680601, + "grad_norm": NaN, + "learning_rate": 5.2037229254358454e-05, + "loss": 0.0, + "step": 47264 + }, + { + "epoch": 4.410282728375478, + "grad_norm": NaN, + "learning_rate": 5.203150144224896e-05, + "loss": 0.0, + "step": 47265 + }, + { + "epoch": 4.410376038070355, + "grad_norm": NaN, + "learning_rate": 5.202577387924443e-05, + "loss": 0.0, + "step": 47266 + }, + { + "epoch": 4.410469347765233, + "grad_norm": NaN, + "learning_rate": 5.2020046565359296e-05, + "loss": 0.0, + "step": 47267 + }, + { + "epoch": 4.41056265746011, + "grad_norm": NaN, + "learning_rate": 5.201431950060816e-05, + "loss": 0.0, + "step": 47268 + }, + { + "epoch": 4.410655967154987, + "grad_norm": NaN, + "learning_rate": 5.200859268500559e-05, + "loss": 0.0, + "step": 47269 + }, + { + "epoch": 4.410749276849865, + "grad_norm": NaN, + "learning_rate": 5.200286611856613e-05, + "loss": 0.0, + "step": 47270 + }, + { + "epoch": 4.410842586544742, + "grad_norm": NaN, + "learning_rate": 5.199713980130436e-05, + "loss": 0.0, + "step": 47271 + }, + { + "epoch": 4.41093589623962, + "grad_norm": NaN, + "learning_rate": 5.1991413733234823e-05, + "loss": 0.0, + "step": 47272 + }, + { + "epoch": 4.411029205934496, + "grad_norm": NaN, + "learning_rate": 5.19856879143721e-05, + "loss": 0.0, + "step": 47273 + }, + { + "epoch": 4.411122515629374, + "grad_norm": NaN, + "learning_rate": 5.1979962344730744e-05, + "loss": 0.0, + "step": 47274 + }, + { + "epoch": 4.411215825324251, + "grad_norm": NaN, + "learning_rate": 5.197423702432529e-05, + "loss": 0.0, + "step": 47275 + }, + { + "epoch": 4.4113091350191285, + "grad_norm": NaN, + "learning_rate": 5.196851195317028e-05, + "loss": 0.0, + "step": 47276 + }, + { + "epoch": 4.411402444714006, + "grad_norm": NaN, + "learning_rate": 5.1962787131280384e-05, + "loss": 0.0, + "step": 47277 + }, + { + "epoch": 4.411495754408883, + "grad_norm": NaN, + "learning_rate": 5.195706255867006e-05, + "loss": 0.0, + "step": 47278 + }, + { + "epoch": 4.411589064103761, + "grad_norm": NaN, + "learning_rate": 5.195133823535382e-05, + "loss": 0.0, + "step": 47279 + }, + { + "epoch": 4.411682373798637, + "grad_norm": NaN, + "learning_rate": 5.1945614161346364e-05, + "loss": 0.0, + "step": 47280 + }, + { + "epoch": 4.411775683493515, + "grad_norm": NaN, + "learning_rate": 5.193989033666214e-05, + "loss": 0.0, + "step": 47281 + }, + { + "epoch": 4.411868993188392, + "grad_norm": NaN, + "learning_rate": 5.193416676131567e-05, + "loss": 0.0, + "step": 47282 + }, + { + "epoch": 4.4119623028832695, + "grad_norm": NaN, + "learning_rate": 5.192844343532165e-05, + "loss": 0.0, + "step": 47283 + }, + { + "epoch": 4.412055612578147, + "grad_norm": NaN, + "learning_rate": 5.192272035869451e-05, + "loss": 0.0, + "step": 47284 + }, + { + "epoch": 4.412148922273024, + "grad_norm": NaN, + "learning_rate": 5.191699753144879e-05, + "loss": 0.0, + "step": 47285 + }, + { + "epoch": 4.412242231967902, + "grad_norm": NaN, + "learning_rate": 5.19112749535992e-05, + "loss": 0.0, + "step": 47286 + }, + { + "epoch": 4.412335541662779, + "grad_norm": NaN, + "learning_rate": 5.190555262516012e-05, + "loss": 0.0, + "step": 47287 + }, + { + "epoch": 4.412428851357656, + "grad_norm": NaN, + "learning_rate": 5.189983054614611e-05, + "loss": 0.0, + "step": 47288 + }, + { + "epoch": 4.412522161052533, + "grad_norm": NaN, + "learning_rate": 5.189410871657188e-05, + "loss": 0.0, + "step": 47289 + }, + { + "epoch": 4.412615470747411, + "grad_norm": NaN, + "learning_rate": 5.188838713645181e-05, + "loss": 0.0, + "step": 47290 + }, + { + "epoch": 4.412708780442288, + "grad_norm": NaN, + "learning_rate": 5.188266580580047e-05, + "loss": 0.0, + "step": 47291 + }, + { + "epoch": 4.412802090137165, + "grad_norm": NaN, + "learning_rate": 5.187694472463254e-05, + "loss": 0.0, + "step": 47292 + }, + { + "epoch": 4.412895399832043, + "grad_norm": NaN, + "learning_rate": 5.187122389296242e-05, + "loss": 0.0, + "step": 47293 + }, + { + "epoch": 4.41298870952692, + "grad_norm": NaN, + "learning_rate": 5.186550331080466e-05, + "loss": 0.0, + "step": 47294 + }, + { + "epoch": 4.413082019221797, + "grad_norm": NaN, + "learning_rate": 5.185978297817397e-05, + "loss": 0.0, + "step": 47295 + }, + { + "epoch": 4.413175328916674, + "grad_norm": NaN, + "learning_rate": 5.185406289508471e-05, + "loss": 0.0, + "step": 47296 + }, + { + "epoch": 4.413268638611552, + "grad_norm": NaN, + "learning_rate": 5.1848343061551466e-05, + "loss": 0.0, + "step": 47297 + }, + { + "epoch": 4.413361948306429, + "grad_norm": NaN, + "learning_rate": 5.18426234775889e-05, + "loss": 0.0, + "step": 47298 + }, + { + "epoch": 4.4134552580013064, + "grad_norm": NaN, + "learning_rate": 5.183690414321142e-05, + "loss": 0.0, + "step": 47299 + }, + { + "epoch": 4.413548567696184, + "grad_norm": NaN, + "learning_rate": 5.183118505843355e-05, + "loss": 0.0, + "step": 47300 + }, + { + "epoch": 4.413641877391061, + "grad_norm": NaN, + "learning_rate": 5.1825466223270005e-05, + "loss": 0.0, + "step": 47301 + }, + { + "epoch": 4.413735187085938, + "grad_norm": NaN, + "learning_rate": 5.181974763773518e-05, + "loss": 0.0, + "step": 47302 + }, + { + "epoch": 4.413828496780815, + "grad_norm": NaN, + "learning_rate": 5.1814029301843594e-05, + "loss": 0.0, + "step": 47303 + }, + { + "epoch": 4.413921806475693, + "grad_norm": NaN, + "learning_rate": 5.180831121560995e-05, + "loss": 0.0, + "step": 47304 + }, + { + "epoch": 4.41401511617057, + "grad_norm": NaN, + "learning_rate": 5.180259337904863e-05, + "loss": 0.0, + "step": 47305 + }, + { + "epoch": 4.4141084258654475, + "grad_norm": NaN, + "learning_rate": 5.179687579217418e-05, + "loss": 0.0, + "step": 47306 + }, + { + "epoch": 4.414201735560325, + "grad_norm": NaN, + "learning_rate": 5.179115845500128e-05, + "loss": 0.0, + "step": 47307 + }, + { + "epoch": 4.414295045255202, + "grad_norm": NaN, + "learning_rate": 5.178544136754433e-05, + "loss": 0.0, + "step": 47308 + }, + { + "epoch": 4.414388354950079, + "grad_norm": NaN, + "learning_rate": 5.1779724529817856e-05, + "loss": 0.0, + "step": 47309 + }, + { + "epoch": 4.414481664644956, + "grad_norm": NaN, + "learning_rate": 5.1774007941836536e-05, + "loss": 0.0, + "step": 47310 + }, + { + "epoch": 4.414574974339834, + "grad_norm": NaN, + "learning_rate": 5.176829160361478e-05, + "loss": 0.0, + "step": 47311 + }, + { + "epoch": 4.414668284034711, + "grad_norm": NaN, + "learning_rate": 5.176257551516711e-05, + "loss": 0.0, + "step": 47312 + }, + { + "epoch": 4.4147615937295885, + "grad_norm": NaN, + "learning_rate": 5.1756859676508214e-05, + "loss": 0.0, + "step": 47313 + }, + { + "epoch": 4.414854903424466, + "grad_norm": NaN, + "learning_rate": 5.1751144087652406e-05, + "loss": 0.0, + "step": 47314 + }, + { + "epoch": 4.414948213119343, + "grad_norm": NaN, + "learning_rate": 5.174542874861445e-05, + "loss": 0.0, + "step": 47315 + }, + { + "epoch": 4.415041522814221, + "grad_norm": NaN, + "learning_rate": 5.17397136594087e-05, + "loss": 0.0, + "step": 47316 + }, + { + "epoch": 4.415134832509097, + "grad_norm": NaN, + "learning_rate": 5.1733998820049714e-05, + "loss": 0.0, + "step": 47317 + }, + { + "epoch": 4.415228142203975, + "grad_norm": NaN, + "learning_rate": 5.172828423055216e-05, + "loss": 0.0, + "step": 47318 + }, + { + "epoch": 4.415321451898852, + "grad_norm": NaN, + "learning_rate": 5.172256989093042e-05, + "loss": 0.0, + "step": 47319 + }, + { + "epoch": 4.41541476159373, + "grad_norm": NaN, + "learning_rate": 5.171685580119901e-05, + "loss": 0.0, + "step": 47320 + }, + { + "epoch": 4.415508071288607, + "grad_norm": NaN, + "learning_rate": 5.171114196137263e-05, + "loss": 0.0, + "step": 47321 + }, + { + "epoch": 4.415601380983484, + "grad_norm": NaN, + "learning_rate": 5.1705428371465646e-05, + "loss": 0.0, + "step": 47322 + }, + { + "epoch": 4.415694690678362, + "grad_norm": NaN, + "learning_rate": 5.169971503149261e-05, + "loss": 0.0, + "step": 47323 + }, + { + "epoch": 4.415788000373238, + "grad_norm": NaN, + "learning_rate": 5.169400194146816e-05, + "loss": 0.0, + "step": 47324 + }, + { + "epoch": 4.415881310068116, + "grad_norm": NaN, + "learning_rate": 5.16882891014067e-05, + "loss": 0.0, + "step": 47325 + }, + { + "epoch": 4.415974619762993, + "grad_norm": NaN, + "learning_rate": 5.168257651132274e-05, + "loss": 0.0, + "step": 47326 + }, + { + "epoch": 4.416067929457871, + "grad_norm": NaN, + "learning_rate": 5.167686417123098e-05, + "loss": 0.0, + "step": 47327 + }, + { + "epoch": 4.416161239152748, + "grad_norm": NaN, + "learning_rate": 5.1671152081145766e-05, + "loss": 0.0, + "step": 47328 + }, + { + "epoch": 4.4162545488476255, + "grad_norm": NaN, + "learning_rate": 5.166544024108164e-05, + "loss": 0.0, + "step": 47329 + }, + { + "epoch": 4.416347858542503, + "grad_norm": NaN, + "learning_rate": 5.165972865105327e-05, + "loss": 0.0, + "step": 47330 + }, + { + "epoch": 4.41644116823738, + "grad_norm": NaN, + "learning_rate": 5.165401731107502e-05, + "loss": 0.0, + "step": 47331 + }, + { + "epoch": 4.416534477932257, + "grad_norm": NaN, + "learning_rate": 5.1648306221161435e-05, + "loss": 0.0, + "step": 47332 + }, + { + "epoch": 4.416627787627134, + "grad_norm": NaN, + "learning_rate": 5.164259538132717e-05, + "loss": 0.0, + "step": 47333 + }, + { + "epoch": 4.416721097322012, + "grad_norm": NaN, + "learning_rate": 5.163688479158661e-05, + "loss": 0.0, + "step": 47334 + }, + { + "epoch": 4.416814407016889, + "grad_norm": NaN, + "learning_rate": 5.1631174451954246e-05, + "loss": 0.0, + "step": 47335 + }, + { + "epoch": 4.4169077167117665, + "grad_norm": NaN, + "learning_rate": 5.1625464362444765e-05, + "loss": 0.0, + "step": 47336 + }, + { + "epoch": 4.417001026406644, + "grad_norm": NaN, + "learning_rate": 5.1619754523072556e-05, + "loss": 0.0, + "step": 47337 + }, + { + "epoch": 4.417094336101521, + "grad_norm": NaN, + "learning_rate": 5.161404493385211e-05, + "loss": 0.0, + "step": 47338 + }, + { + "epoch": 4.417187645796398, + "grad_norm": NaN, + "learning_rate": 5.16083355947981e-05, + "loss": 0.0, + "step": 47339 + }, + { + "epoch": 4.417280955491275, + "grad_norm": NaN, + "learning_rate": 5.1602626505924896e-05, + "loss": 0.0, + "step": 47340 + }, + { + "epoch": 4.417374265186153, + "grad_norm": NaN, + "learning_rate": 5.159691766724702e-05, + "loss": 0.0, + "step": 47341 + }, + { + "epoch": 4.41746757488103, + "grad_norm": NaN, + "learning_rate": 5.159120907877914e-05, + "loss": 0.0, + "step": 47342 + }, + { + "epoch": 4.417560884575908, + "grad_norm": NaN, + "learning_rate": 5.15855007405356e-05, + "loss": 0.0, + "step": 47343 + }, + { + "epoch": 4.417654194270785, + "grad_norm": NaN, + "learning_rate": 5.1579792652530935e-05, + "loss": 0.0, + "step": 47344 + }, + { + "epoch": 4.417747503965662, + "grad_norm": NaN, + "learning_rate": 5.1574084814779804e-05, + "loss": 0.0, + "step": 47345 + }, + { + "epoch": 4.417840813660539, + "grad_norm": NaN, + "learning_rate": 5.1568377227296564e-05, + "loss": 0.0, + "step": 47346 + }, + { + "epoch": 4.417934123355416, + "grad_norm": NaN, + "learning_rate": 5.156266989009574e-05, + "loss": 0.0, + "step": 47347 + }, + { + "epoch": 4.418027433050294, + "grad_norm": NaN, + "learning_rate": 5.155696280319199e-05, + "loss": 0.0, + "step": 47348 + }, + { + "epoch": 4.418120742745171, + "grad_norm": NaN, + "learning_rate": 5.155125596659967e-05, + "loss": 0.0, + "step": 47349 + }, + { + "epoch": 4.418214052440049, + "grad_norm": NaN, + "learning_rate": 5.154554938033329e-05, + "loss": 0.0, + "step": 47350 + }, + { + "epoch": 4.418307362134926, + "grad_norm": NaN, + "learning_rate": 5.1539843044407506e-05, + "loss": 0.0, + "step": 47351 + }, + { + "epoch": 4.4184006718298034, + "grad_norm": NaN, + "learning_rate": 5.1534136958836645e-05, + "loss": 0.0, + "step": 47352 + }, + { + "epoch": 4.41849398152468, + "grad_norm": NaN, + "learning_rate": 5.152843112363536e-05, + "loss": 0.0, + "step": 47353 + }, + { + "epoch": 4.418587291219557, + "grad_norm": NaN, + "learning_rate": 5.152272553881815e-05, + "loss": 0.0, + "step": 47354 + }, + { + "epoch": 4.418680600914435, + "grad_norm": NaN, + "learning_rate": 5.151702020439939e-05, + "loss": 0.0, + "step": 47355 + }, + { + "epoch": 4.418773910609312, + "grad_norm": NaN, + "learning_rate": 5.15113151203937e-05, + "loss": 0.0, + "step": 47356 + }, + { + "epoch": 4.41886722030419, + "grad_norm": NaN, + "learning_rate": 5.150561028681563e-05, + "loss": 0.0, + "step": 47357 + }, + { + "epoch": 4.418960529999067, + "grad_norm": NaN, + "learning_rate": 5.14999057036795e-05, + "loss": 0.0, + "step": 47358 + }, + { + "epoch": 4.4190538396939445, + "grad_norm": NaN, + "learning_rate": 5.149420137100005e-05, + "loss": 0.0, + "step": 47359 + }, + { + "epoch": 4.419147149388822, + "grad_norm": NaN, + "learning_rate": 5.14884972887916e-05, + "loss": 0.0, + "step": 47360 + }, + { + "epoch": 4.4192404590836984, + "grad_norm": NaN, + "learning_rate": 5.1482793457068684e-05, + "loss": 0.0, + "step": 47361 + }, + { + "epoch": 4.419333768778576, + "grad_norm": NaN, + "learning_rate": 5.147708987584595e-05, + "loss": 0.0, + "step": 47362 + }, + { + "epoch": 4.419427078473453, + "grad_norm": NaN, + "learning_rate": 5.1471386545137725e-05, + "loss": 0.0, + "step": 47363 + }, + { + "epoch": 4.419520388168331, + "grad_norm": NaN, + "learning_rate": 5.1465683464958534e-05, + "loss": 0.0, + "step": 47364 + }, + { + "epoch": 4.419613697863208, + "grad_norm": NaN, + "learning_rate": 5.145998063532301e-05, + "loss": 0.0, + "step": 47365 + }, + { + "epoch": 4.4197070075580855, + "grad_norm": NaN, + "learning_rate": 5.145427805624553e-05, + "loss": 0.0, + "step": 47366 + }, + { + "epoch": 4.419800317252963, + "grad_norm": NaN, + "learning_rate": 5.144857572774057e-05, + "loss": 0.0, + "step": 47367 + }, + { + "epoch": 4.4198936269478395, + "grad_norm": NaN, + "learning_rate": 5.1442873649822794e-05, + "loss": 0.0, + "step": 47368 + }, + { + "epoch": 4.419986936642717, + "grad_norm": NaN, + "learning_rate": 5.143717182250654e-05, + "loss": 0.0, + "step": 47369 + }, + { + "epoch": 4.420080246337594, + "grad_norm": NaN, + "learning_rate": 5.143147024580633e-05, + "loss": 0.0, + "step": 47370 + }, + { + "epoch": 4.420173556032472, + "grad_norm": NaN, + "learning_rate": 5.142576891973678e-05, + "loss": 0.0, + "step": 47371 + }, + { + "epoch": 4.420266865727349, + "grad_norm": NaN, + "learning_rate": 5.1420067844312246e-05, + "loss": 0.0, + "step": 47372 + }, + { + "epoch": 4.420360175422227, + "grad_norm": NaN, + "learning_rate": 5.141436701954723e-05, + "loss": 0.0, + "step": 47373 + }, + { + "epoch": 4.420453485117104, + "grad_norm": NaN, + "learning_rate": 5.140866644545637e-05, + "loss": 0.0, + "step": 47374 + }, + { + "epoch": 4.420546794811981, + "grad_norm": NaN, + "learning_rate": 5.1402966122054015e-05, + "loss": 0.0, + "step": 47375 + }, + { + "epoch": 4.420640104506858, + "grad_norm": NaN, + "learning_rate": 5.1397266049354676e-05, + "loss": 0.0, + "step": 47376 + }, + { + "epoch": 4.420733414201735, + "grad_norm": NaN, + "learning_rate": 5.1391566227372975e-05, + "loss": 0.0, + "step": 47377 + }, + { + "epoch": 4.420826723896613, + "grad_norm": NaN, + "learning_rate": 5.138586665612327e-05, + "loss": 0.0, + "step": 47378 + }, + { + "epoch": 4.42092003359149, + "grad_norm": NaN, + "learning_rate": 5.1380167335620023e-05, + "loss": 0.0, + "step": 47379 + }, + { + "epoch": 4.421013343286368, + "grad_norm": NaN, + "learning_rate": 5.13744682658779e-05, + "loss": 0.0, + "step": 47380 + }, + { + "epoch": 4.421106652981245, + "grad_norm": NaN, + "learning_rate": 5.1368769446911256e-05, + "loss": 0.0, + "step": 47381 + }, + { + "epoch": 4.421199962676122, + "grad_norm": NaN, + "learning_rate": 5.136307087873455e-05, + "loss": 0.0, + "step": 47382 + }, + { + "epoch": 4.421293272370999, + "grad_norm": NaN, + "learning_rate": 5.135737256136244e-05, + "loss": 0.0, + "step": 47383 + }, + { + "epoch": 4.421386582065876, + "grad_norm": NaN, + "learning_rate": 5.1351674494809265e-05, + "loss": 0.0, + "step": 47384 + }, + { + "epoch": 4.421479891760754, + "grad_norm": NaN, + "learning_rate": 5.134597667908951e-05, + "loss": 0.0, + "step": 47385 + }, + { + "epoch": 4.421573201455631, + "grad_norm": NaN, + "learning_rate": 5.134027911421781e-05, + "loss": 0.0, + "step": 47386 + }, + { + "epoch": 4.421666511150509, + "grad_norm": NaN, + "learning_rate": 5.133458180020851e-05, + "loss": 0.0, + "step": 47387 + }, + { + "epoch": 4.421759820845386, + "grad_norm": NaN, + "learning_rate": 5.1328884737076096e-05, + "loss": 0.0, + "step": 47388 + }, + { + "epoch": 4.4218531305402635, + "grad_norm": NaN, + "learning_rate": 5.132318792483519e-05, + "loss": 0.0, + "step": 47389 + }, + { + "epoch": 4.42194644023514, + "grad_norm": NaN, + "learning_rate": 5.1317491363500076e-05, + "loss": 0.0, + "step": 47390 + }, + { + "epoch": 4.4220397499300175, + "grad_norm": NaN, + "learning_rate": 5.131179505308542e-05, + "loss": 0.0, + "step": 47391 + }, + { + "epoch": 4.422133059624895, + "grad_norm": NaN, + "learning_rate": 5.130609899360568e-05, + "loss": 0.0, + "step": 47392 + }, + { + "epoch": 4.422226369319772, + "grad_norm": NaN, + "learning_rate": 5.1300403185075184e-05, + "loss": 0.0, + "step": 47393 + }, + { + "epoch": 4.42231967901465, + "grad_norm": NaN, + "learning_rate": 5.129470762750859e-05, + "loss": 0.0, + "step": 47394 + }, + { + "epoch": 4.422412988709527, + "grad_norm": NaN, + "learning_rate": 5.1289012320920356e-05, + "loss": 0.0, + "step": 47395 + }, + { + "epoch": 4.422506298404405, + "grad_norm": NaN, + "learning_rate": 5.128331726532483e-05, + "loss": 0.0, + "step": 47396 + }, + { + "epoch": 4.422599608099281, + "grad_norm": NaN, + "learning_rate": 5.127762246073663e-05, + "loss": 0.0, + "step": 47397 + }, + { + "epoch": 4.4226929177941585, + "grad_norm": NaN, + "learning_rate": 5.1271927907170254e-05, + "loss": 0.0, + "step": 47398 + }, + { + "epoch": 4.422786227489036, + "grad_norm": NaN, + "learning_rate": 5.126623360464e-05, + "loss": 0.0, + "step": 47399 + }, + { + "epoch": 4.422879537183913, + "grad_norm": NaN, + "learning_rate": 5.1260539553160526e-05, + "loss": 0.0, + "step": 47400 + }, + { + "epoch": 4.422972846878791, + "grad_norm": NaN, + "learning_rate": 5.125484575274631e-05, + "loss": 0.0, + "step": 47401 + }, + { + "epoch": 4.423066156573668, + "grad_norm": NaN, + "learning_rate": 5.124915220341166e-05, + "loss": 0.0, + "step": 47402 + }, + { + "epoch": 4.423159466268546, + "grad_norm": NaN, + "learning_rate": 5.124345890517128e-05, + "loss": 0.0, + "step": 47403 + }, + { + "epoch": 4.423252775963423, + "grad_norm": NaN, + "learning_rate": 5.1237765858039466e-05, + "loss": 0.0, + "step": 47404 + }, + { + "epoch": 4.4233460856583, + "grad_norm": NaN, + "learning_rate": 5.12320730620307e-05, + "loss": 0.0, + "step": 47405 + }, + { + "epoch": 4.423439395353177, + "grad_norm": NaN, + "learning_rate": 5.122638051715964e-05, + "loss": 0.0, + "step": 47406 + }, + { + "epoch": 4.423532705048054, + "grad_norm": NaN, + "learning_rate": 5.122068822344057e-05, + "loss": 0.0, + "step": 47407 + }, + { + "epoch": 4.423626014742932, + "grad_norm": NaN, + "learning_rate": 5.121499618088797e-05, + "loss": 0.0, + "step": 47408 + }, + { + "epoch": 4.423719324437809, + "grad_norm": NaN, + "learning_rate": 5.1209304389516485e-05, + "loss": 0.0, + "step": 47409 + }, + { + "epoch": 4.423812634132687, + "grad_norm": NaN, + "learning_rate": 5.120361284934043e-05, + "loss": 0.0, + "step": 47410 + }, + { + "epoch": 4.423905943827564, + "grad_norm": NaN, + "learning_rate": 5.119792156037426e-05, + "loss": 0.0, + "step": 47411 + }, + { + "epoch": 4.423999253522441, + "grad_norm": NaN, + "learning_rate": 5.119223052263261e-05, + "loss": 0.0, + "step": 47412 + }, + { + "epoch": 4.424092563217318, + "grad_norm": NaN, + "learning_rate": 5.11865397361298e-05, + "loss": 0.0, + "step": 47413 + }, + { + "epoch": 4.4241858729121954, + "grad_norm": NaN, + "learning_rate": 5.1180849200880305e-05, + "loss": 0.0, + "step": 47414 + }, + { + "epoch": 4.424279182607073, + "grad_norm": NaN, + "learning_rate": 5.117515891689873e-05, + "loss": 0.0, + "step": 47415 + }, + { + "epoch": 4.42437249230195, + "grad_norm": NaN, + "learning_rate": 5.11694688841994e-05, + "loss": 0.0, + "step": 47416 + }, + { + "epoch": 4.424465801996828, + "grad_norm": NaN, + "learning_rate": 5.116377910279679e-05, + "loss": 0.0, + "step": 47417 + }, + { + "epoch": 4.424559111691705, + "grad_norm": NaN, + "learning_rate": 5.1158089572705515e-05, + "loss": 0.0, + "step": 47418 + }, + { + "epoch": 4.424652421386582, + "grad_norm": NaN, + "learning_rate": 5.115240029393988e-05, + "loss": 0.0, + "step": 47419 + }, + { + "epoch": 4.424745731081459, + "grad_norm": NaN, + "learning_rate": 5.114671126651437e-05, + "loss": 0.0, + "step": 47420 + }, + { + "epoch": 4.4248390407763365, + "grad_norm": NaN, + "learning_rate": 5.1141022490443587e-05, + "loss": 0.0, + "step": 47421 + }, + { + "epoch": 4.424932350471214, + "grad_norm": NaN, + "learning_rate": 5.113533396574185e-05, + "loss": 0.0, + "step": 47422 + }, + { + "epoch": 4.425025660166091, + "grad_norm": NaN, + "learning_rate": 5.112964569242363e-05, + "loss": 0.0, + "step": 47423 + }, + { + "epoch": 4.425118969860969, + "grad_norm": NaN, + "learning_rate": 5.112395767050353e-05, + "loss": 0.0, + "step": 47424 + }, + { + "epoch": 4.425212279555846, + "grad_norm": NaN, + "learning_rate": 5.111826989999585e-05, + "loss": 0.0, + "step": 47425 + }, + { + "epoch": 4.425305589250723, + "grad_norm": NaN, + "learning_rate": 5.111258238091509e-05, + "loss": 0.0, + "step": 47426 + }, + { + "epoch": 4.4253988989456, + "grad_norm": NaN, + "learning_rate": 5.110689511327582e-05, + "loss": 0.0, + "step": 47427 + }, + { + "epoch": 4.4254922086404775, + "grad_norm": NaN, + "learning_rate": 5.1101208097092344e-05, + "loss": 0.0, + "step": 47428 + }, + { + "epoch": 4.425585518335355, + "grad_norm": NaN, + "learning_rate": 5.109552133237923e-05, + "loss": 0.0, + "step": 47429 + }, + { + "epoch": 4.425678828030232, + "grad_norm": NaN, + "learning_rate": 5.108983481915097e-05, + "loss": 0.0, + "step": 47430 + }, + { + "epoch": 4.42577213772511, + "grad_norm": NaN, + "learning_rate": 5.108414855742186e-05, + "loss": 0.0, + "step": 47431 + }, + { + "epoch": 4.425865447419987, + "grad_norm": NaN, + "learning_rate": 5.1078462547206504e-05, + "loss": 0.0, + "step": 47432 + }, + { + "epoch": 4.425958757114865, + "grad_norm": NaN, + "learning_rate": 5.1072776788519374e-05, + "loss": 0.0, + "step": 47433 + }, + { + "epoch": 4.426052066809741, + "grad_norm": NaN, + "learning_rate": 5.1067091281374756e-05, + "loss": 0.0, + "step": 47434 + }, + { + "epoch": 4.426145376504619, + "grad_norm": NaN, + "learning_rate": 5.106140602578726e-05, + "loss": 0.0, + "step": 47435 + }, + { + "epoch": 4.426238686199496, + "grad_norm": NaN, + "learning_rate": 5.105572102177137e-05, + "loss": 0.0, + "step": 47436 + }, + { + "epoch": 4.426331995894373, + "grad_norm": NaN, + "learning_rate": 5.105003626934138e-05, + "loss": 0.0, + "step": 47437 + }, + { + "epoch": 4.426425305589251, + "grad_norm": NaN, + "learning_rate": 5.104435176851186e-05, + "loss": 0.0, + "step": 47438 + }, + { + "epoch": 4.426518615284128, + "grad_norm": NaN, + "learning_rate": 5.103866751929731e-05, + "loss": 0.0, + "step": 47439 + }, + { + "epoch": 4.426611924979006, + "grad_norm": NaN, + "learning_rate": 5.1032983521712005e-05, + "loss": 0.0, + "step": 47440 + }, + { + "epoch": 4.426705234673882, + "grad_norm": NaN, + "learning_rate": 5.102729977577056e-05, + "loss": 0.0, + "step": 47441 + }, + { + "epoch": 4.42679854436876, + "grad_norm": NaN, + "learning_rate": 5.102161628148742e-05, + "loss": 0.0, + "step": 47442 + }, + { + "epoch": 4.426891854063637, + "grad_norm": NaN, + "learning_rate": 5.10159330388769e-05, + "loss": 0.0, + "step": 47443 + }, + { + "epoch": 4.4269851637585145, + "grad_norm": NaN, + "learning_rate": 5.101025004795358e-05, + "loss": 0.0, + "step": 47444 + }, + { + "epoch": 4.427078473453392, + "grad_norm": NaN, + "learning_rate": 5.100456730873194e-05, + "loss": 0.0, + "step": 47445 + }, + { + "epoch": 4.427171783148269, + "grad_norm": NaN, + "learning_rate": 5.099888482122623e-05, + "loss": 0.0, + "step": 47446 + }, + { + "epoch": 4.427265092843147, + "grad_norm": NaN, + "learning_rate": 5.099320258545111e-05, + "loss": 0.0, + "step": 47447 + }, + { + "epoch": 4.427358402538024, + "grad_norm": NaN, + "learning_rate": 5.098752060142098e-05, + "loss": 0.0, + "step": 47448 + }, + { + "epoch": 4.427451712232901, + "grad_norm": NaN, + "learning_rate": 5.098183886915016e-05, + "loss": 0.0, + "step": 47449 + }, + { + "epoch": 4.427545021927778, + "grad_norm": NaN, + "learning_rate": 5.097615738865329e-05, + "loss": 0.0, + "step": 47450 + }, + { + "epoch": 4.4276383316226555, + "grad_norm": NaN, + "learning_rate": 5.0970476159944666e-05, + "loss": 0.0, + "step": 47451 + }, + { + "epoch": 4.427731641317533, + "grad_norm": NaN, + "learning_rate": 5.0964795183038755e-05, + "loss": 0.0, + "step": 47452 + }, + { + "epoch": 4.42782495101241, + "grad_norm": NaN, + "learning_rate": 5.095911445795011e-05, + "loss": 0.0, + "step": 47453 + }, + { + "epoch": 4.427918260707288, + "grad_norm": NaN, + "learning_rate": 5.095343398469306e-05, + "loss": 0.0, + "step": 47454 + }, + { + "epoch": 4.428011570402165, + "grad_norm": NaN, + "learning_rate": 5.0947753763282034e-05, + "loss": 0.0, + "step": 47455 + }, + { + "epoch": 4.428104880097042, + "grad_norm": NaN, + "learning_rate": 5.094207379373164e-05, + "loss": 0.0, + "step": 47456 + }, + { + "epoch": 4.428198189791919, + "grad_norm": NaN, + "learning_rate": 5.0936394076056145e-05, + "loss": 0.0, + "step": 47457 + }, + { + "epoch": 4.428291499486797, + "grad_norm": NaN, + "learning_rate": 5.093071461027001e-05, + "loss": 0.0, + "step": 47458 + }, + { + "epoch": 4.428384809181674, + "grad_norm": NaN, + "learning_rate": 5.092503539638782e-05, + "loss": 0.0, + "step": 47459 + }, + { + "epoch": 4.428478118876551, + "grad_norm": NaN, + "learning_rate": 5.091935643442387e-05, + "loss": 0.0, + "step": 47460 + }, + { + "epoch": 4.428571428571429, + "grad_norm": NaN, + "learning_rate": 5.0913677724392596e-05, + "loss": 0.0, + "step": 47461 + }, + { + "epoch": 4.428664738266306, + "grad_norm": NaN, + "learning_rate": 5.090799926630858e-05, + "loss": 0.0, + "step": 47462 + }, + { + "epoch": 4.428758047961183, + "grad_norm": NaN, + "learning_rate": 5.090232106018609e-05, + "loss": 0.0, + "step": 47463 + }, + { + "epoch": 4.42885135765606, + "grad_norm": NaN, + "learning_rate": 5.089664310603966e-05, + "loss": 0.0, + "step": 47464 + }, + { + "epoch": 4.428944667350938, + "grad_norm": NaN, + "learning_rate": 5.089096540388379e-05, + "loss": 0.0, + "step": 47465 + }, + { + "epoch": 4.429037977045815, + "grad_norm": NaN, + "learning_rate": 5.088528795373273e-05, + "loss": 0.0, + "step": 47466 + }, + { + "epoch": 4.4291312867406925, + "grad_norm": NaN, + "learning_rate": 5.087961075560106e-05, + "loss": 0.0, + "step": 47467 + }, + { + "epoch": 4.42922459643557, + "grad_norm": NaN, + "learning_rate": 5.087393380950323e-05, + "loss": 0.0, + "step": 47468 + }, + { + "epoch": 4.429317906130447, + "grad_norm": NaN, + "learning_rate": 5.0868257115453536e-05, + "loss": 0.0, + "step": 47469 + }, + { + "epoch": 4.429411215825324, + "grad_norm": NaN, + "learning_rate": 5.086258067346655e-05, + "loss": 0.0, + "step": 47470 + }, + { + "epoch": 4.429504525520201, + "grad_norm": NaN, + "learning_rate": 5.08569044835567e-05, + "loss": 0.0, + "step": 47471 + }, + { + "epoch": 4.429597835215079, + "grad_norm": NaN, + "learning_rate": 5.085122854573827e-05, + "loss": 0.0, + "step": 47472 + }, + { + "epoch": 4.429691144909956, + "grad_norm": NaN, + "learning_rate": 5.084555286002583e-05, + "loss": 0.0, + "step": 47473 + }, + { + "epoch": 4.4297844546048335, + "grad_norm": NaN, + "learning_rate": 5.083987742643384e-05, + "loss": 0.0, + "step": 47474 + }, + { + "epoch": 4.429877764299711, + "grad_norm": NaN, + "learning_rate": 5.0834202244976586e-05, + "loss": 0.0, + "step": 47475 + }, + { + "epoch": 4.429971073994588, + "grad_norm": NaN, + "learning_rate": 5.082852731566861e-05, + "loss": 0.0, + "step": 47476 + }, + { + "epoch": 4.430064383689466, + "grad_norm": NaN, + "learning_rate": 5.0822852638524376e-05, + "loss": 0.0, + "step": 47477 + }, + { + "epoch": 4.430157693384342, + "grad_norm": NaN, + "learning_rate": 5.081717821355813e-05, + "loss": 0.0, + "step": 47478 + }, + { + "epoch": 4.43025100307922, + "grad_norm": NaN, + "learning_rate": 5.08115040407845e-05, + "loss": 0.0, + "step": 47479 + }, + { + "epoch": 4.430344312774097, + "grad_norm": NaN, + "learning_rate": 5.080583012021786e-05, + "loss": 0.0, + "step": 47480 + }, + { + "epoch": 4.4304376224689745, + "grad_norm": NaN, + "learning_rate": 5.080015645187252e-05, + "loss": 0.0, + "step": 47481 + }, + { + "epoch": 4.430530932163852, + "grad_norm": NaN, + "learning_rate": 5.079448303576303e-05, + "loss": 0.0, + "step": 47482 + }, + { + "epoch": 4.430624241858729, + "grad_norm": NaN, + "learning_rate": 5.0788809871903856e-05, + "loss": 0.0, + "step": 47483 + }, + { + "epoch": 4.430717551553607, + "grad_norm": NaN, + "learning_rate": 5.078313696030923e-05, + "loss": 0.0, + "step": 47484 + }, + { + "epoch": 4.430810861248483, + "grad_norm": NaN, + "learning_rate": 5.077746430099376e-05, + "loss": 0.0, + "step": 47485 + }, + { + "epoch": 4.430904170943361, + "grad_norm": NaN, + "learning_rate": 5.077179189397185e-05, + "loss": 0.0, + "step": 47486 + }, + { + "epoch": 4.430997480638238, + "grad_norm": NaN, + "learning_rate": 5.076611973925778e-05, + "loss": 0.0, + "step": 47487 + }, + { + "epoch": 4.431090790333116, + "grad_norm": NaN, + "learning_rate": 5.0760447836866124e-05, + "loss": 0.0, + "step": 47488 + }, + { + "epoch": 4.431184100027993, + "grad_norm": NaN, + "learning_rate": 5.0754776186811306e-05, + "loss": 0.0, + "step": 47489 + }, + { + "epoch": 4.43127740972287, + "grad_norm": NaN, + "learning_rate": 5.0749104789107585e-05, + "loss": 0.0, + "step": 47490 + }, + { + "epoch": 4.431370719417748, + "grad_norm": NaN, + "learning_rate": 5.0743433643769546e-05, + "loss": 0.0, + "step": 47491 + }, + { + "epoch": 4.431464029112625, + "grad_norm": NaN, + "learning_rate": 5.0737762750811614e-05, + "loss": 0.0, + "step": 47492 + }, + { + "epoch": 4.431557338807502, + "grad_norm": NaN, + "learning_rate": 5.0732092110248026e-05, + "loss": 0.0, + "step": 47493 + }, + { + "epoch": 4.431650648502379, + "grad_norm": NaN, + "learning_rate": 5.072642172209344e-05, + "loss": 0.0, + "step": 47494 + }, + { + "epoch": 4.431743958197257, + "grad_norm": NaN, + "learning_rate": 5.0720751586362105e-05, + "loss": 0.0, + "step": 47495 + }, + { + "epoch": 4.431837267892134, + "grad_norm": NaN, + "learning_rate": 5.0715081703068435e-05, + "loss": 0.0, + "step": 47496 + }, + { + "epoch": 4.4319305775870115, + "grad_norm": NaN, + "learning_rate": 5.070941207222699e-05, + "loss": 0.0, + "step": 47497 + }, + { + "epoch": 4.432023887281889, + "grad_norm": NaN, + "learning_rate": 5.070374269385206e-05, + "loss": 0.0, + "step": 47498 + }, + { + "epoch": 4.432117196976765, + "grad_norm": NaN, + "learning_rate": 5.0698073567958056e-05, + "loss": 0.0, + "step": 47499 + }, + { + "epoch": 4.432210506671643, + "grad_norm": NaN, + "learning_rate": 5.0692404694559516e-05, + "loss": 0.0, + "step": 47500 + }, + { + "epoch": 4.43230381636652, + "grad_norm": NaN, + "learning_rate": 5.06867360736707e-05, + "loss": 0.0, + "step": 47501 + }, + { + "epoch": 4.432397126061398, + "grad_norm": NaN, + "learning_rate": 5.068106770530613e-05, + "loss": 0.0, + "step": 47502 + }, + { + "epoch": 4.432490435756275, + "grad_norm": NaN, + "learning_rate": 5.067539958948023e-05, + "loss": 0.0, + "step": 47503 + }, + { + "epoch": 4.4325837454511525, + "grad_norm": NaN, + "learning_rate": 5.066973172620726e-05, + "loss": 0.0, + "step": 47504 + }, + { + "epoch": 4.43267705514603, + "grad_norm": NaN, + "learning_rate": 5.0664064115501816e-05, + "loss": 0.0, + "step": 47505 + }, + { + "epoch": 4.432770364840907, + "grad_norm": NaN, + "learning_rate": 5.065839675737825e-05, + "loss": 0.0, + "step": 47506 + }, + { + "epoch": 4.432863674535784, + "grad_norm": NaN, + "learning_rate": 5.0652729651850884e-05, + "loss": 0.0, + "step": 47507 + }, + { + "epoch": 4.432956984230661, + "grad_norm": NaN, + "learning_rate": 5.064706279893423e-05, + "loss": 0.0, + "step": 47508 + }, + { + "epoch": 4.433050293925539, + "grad_norm": NaN, + "learning_rate": 5.064139619864271e-05, + "loss": 0.0, + "step": 47509 + }, + { + "epoch": 4.433143603620416, + "grad_norm": NaN, + "learning_rate": 5.063572985099059e-05, + "loss": 0.0, + "step": 47510 + }, + { + "epoch": 4.433236913315294, + "grad_norm": NaN, + "learning_rate": 5.063006375599244e-05, + "loss": 0.0, + "step": 47511 + }, + { + "epoch": 4.433330223010171, + "grad_norm": NaN, + "learning_rate": 5.062439791366265e-05, + "loss": 0.0, + "step": 47512 + }, + { + "epoch": 4.433423532705048, + "grad_norm": NaN, + "learning_rate": 5.0618732324015464e-05, + "loss": 0.0, + "step": 47513 + }, + { + "epoch": 4.433516842399925, + "grad_norm": NaN, + "learning_rate": 5.061306698706546e-05, + "loss": 0.0, + "step": 47514 + }, + { + "epoch": 4.433610152094802, + "grad_norm": NaN, + "learning_rate": 5.060740190282703e-05, + "loss": 0.0, + "step": 47515 + }, + { + "epoch": 4.43370346178968, + "grad_norm": NaN, + "learning_rate": 5.060173707131444e-05, + "loss": 0.0, + "step": 47516 + }, + { + "epoch": 4.433796771484557, + "grad_norm": NaN, + "learning_rate": 5.0596072492542244e-05, + "loss": 0.0, + "step": 47517 + }, + { + "epoch": 4.433890081179435, + "grad_norm": NaN, + "learning_rate": 5.059040816652483e-05, + "loss": 0.0, + "step": 47518 + }, + { + "epoch": 4.433983390874312, + "grad_norm": NaN, + "learning_rate": 5.058474409327647e-05, + "loss": 0.0, + "step": 47519 + }, + { + "epoch": 4.4340767005691895, + "grad_norm": NaN, + "learning_rate": 5.057908027281171e-05, + "loss": 0.0, + "step": 47520 + }, + { + "epoch": 4.434170010264067, + "grad_norm": NaN, + "learning_rate": 5.057341670514493e-05, + "loss": 0.0, + "step": 47521 + }, + { + "epoch": 4.434263319958943, + "grad_norm": NaN, + "learning_rate": 5.056775339029042e-05, + "loss": 0.0, + "step": 47522 + }, + { + "epoch": 4.434356629653821, + "grad_norm": NaN, + "learning_rate": 5.056209032826271e-05, + "loss": 0.0, + "step": 47523 + }, + { + "epoch": 4.434449939348698, + "grad_norm": NaN, + "learning_rate": 5.055642751907617e-05, + "loss": 0.0, + "step": 47524 + }, + { + "epoch": 4.434543249043576, + "grad_norm": NaN, + "learning_rate": 5.055076496274511e-05, + "loss": 0.0, + "step": 47525 + }, + { + "epoch": 4.434636558738453, + "grad_norm": NaN, + "learning_rate": 5.054510265928403e-05, + "loss": 0.0, + "step": 47526 + }, + { + "epoch": 4.4347298684333305, + "grad_norm": NaN, + "learning_rate": 5.053944060870734e-05, + "loss": 0.0, + "step": 47527 + }, + { + "epoch": 4.434823178128208, + "grad_norm": NaN, + "learning_rate": 5.05337788110293e-05, + "loss": 0.0, + "step": 47528 + }, + { + "epoch": 4.4349164878230845, + "grad_norm": NaN, + "learning_rate": 5.052811726626443e-05, + "loss": 0.0, + "step": 47529 + }, + { + "epoch": 4.435009797517962, + "grad_norm": NaN, + "learning_rate": 5.0522455974427156e-05, + "loss": 0.0, + "step": 47530 + }, + { + "epoch": 4.435103107212839, + "grad_norm": NaN, + "learning_rate": 5.05167949355317e-05, + "loss": 0.0, + "step": 47531 + }, + { + "epoch": 4.435196416907717, + "grad_norm": NaN, + "learning_rate": 5.051113414959264e-05, + "loss": 0.0, + "step": 47532 + }, + { + "epoch": 4.435289726602594, + "grad_norm": NaN, + "learning_rate": 5.050547361662432e-05, + "loss": 0.0, + "step": 47533 + }, + { + "epoch": 4.4353830362974715, + "grad_norm": NaN, + "learning_rate": 5.049981333664101e-05, + "loss": 0.0, + "step": 47534 + }, + { + "epoch": 4.435476345992349, + "grad_norm": NaN, + "learning_rate": 5.049415330965726e-05, + "loss": 0.0, + "step": 47535 + }, + { + "epoch": 4.4355696556872255, + "grad_norm": NaN, + "learning_rate": 5.048849353568745e-05, + "loss": 0.0, + "step": 47536 + }, + { + "epoch": 4.435662965382103, + "grad_norm": NaN, + "learning_rate": 5.048283401474581e-05, + "loss": 0.0, + "step": 47537 + }, + { + "epoch": 4.43575627507698, + "grad_norm": NaN, + "learning_rate": 5.0477174746846955e-05, + "loss": 0.0, + "step": 47538 + }, + { + "epoch": 4.435849584771858, + "grad_norm": NaN, + "learning_rate": 5.0471515732005074e-05, + "loss": 0.0, + "step": 47539 + }, + { + "epoch": 4.435942894466735, + "grad_norm": NaN, + "learning_rate": 5.0465856970234676e-05, + "loss": 0.0, + "step": 47540 + }, + { + "epoch": 4.436036204161613, + "grad_norm": NaN, + "learning_rate": 5.0460198461550194e-05, + "loss": 0.0, + "step": 47541 + }, + { + "epoch": 4.43612951385649, + "grad_norm": NaN, + "learning_rate": 5.045454020596582e-05, + "loss": 0.0, + "step": 47542 + }, + { + "epoch": 4.4362228235513665, + "grad_norm": NaN, + "learning_rate": 5.044888220349611e-05, + "loss": 0.0, + "step": 47543 + }, + { + "epoch": 4.436316133246244, + "grad_norm": NaN, + "learning_rate": 5.044322445415547e-05, + "loss": 0.0, + "step": 47544 + }, + { + "epoch": 4.436409442941121, + "grad_norm": NaN, + "learning_rate": 5.043756695795811e-05, + "loss": 0.0, + "step": 47545 + }, + { + "epoch": 4.436502752635999, + "grad_norm": NaN, + "learning_rate": 5.0431909714918574e-05, + "loss": 0.0, + "step": 47546 + }, + { + "epoch": 4.436596062330876, + "grad_norm": NaN, + "learning_rate": 5.042625272505126e-05, + "loss": 0.0, + "step": 47547 + }, + { + "epoch": 4.436689372025754, + "grad_norm": NaN, + "learning_rate": 5.042059598837038e-05, + "loss": 0.0, + "step": 47548 + }, + { + "epoch": 4.436782681720631, + "grad_norm": NaN, + "learning_rate": 5.0414939504890465e-05, + "loss": 0.0, + "step": 47549 + }, + { + "epoch": 4.4368759914155085, + "grad_norm": NaN, + "learning_rate": 5.040928327462592e-05, + "loss": 0.0, + "step": 47550 + }, + { + "epoch": 4.436969301110385, + "grad_norm": NaN, + "learning_rate": 5.040362729759098e-05, + "loss": 0.0, + "step": 47551 + }, + { + "epoch": 4.437062610805262, + "grad_norm": NaN, + "learning_rate": 5.039797157380015e-05, + "loss": 0.0, + "step": 47552 + }, + { + "epoch": 4.43715592050014, + "grad_norm": NaN, + "learning_rate": 5.0392316103267817e-05, + "loss": 0.0, + "step": 47553 + }, + { + "epoch": 4.437249230195017, + "grad_norm": NaN, + "learning_rate": 5.0386660886008234e-05, + "loss": 0.0, + "step": 47554 + }, + { + "epoch": 4.437342539889895, + "grad_norm": NaN, + "learning_rate": 5.038100592203591e-05, + "loss": 0.0, + "step": 47555 + }, + { + "epoch": 4.437435849584772, + "grad_norm": NaN, + "learning_rate": 5.037535121136522e-05, + "loss": 0.0, + "step": 47556 + }, + { + "epoch": 4.4375291592796495, + "grad_norm": NaN, + "learning_rate": 5.036969675401041e-05, + "loss": 0.0, + "step": 47557 + }, + { + "epoch": 4.437622468974526, + "grad_norm": NaN, + "learning_rate": 5.036404254998599e-05, + "loss": 0.0, + "step": 47558 + }, + { + "epoch": 4.4377157786694035, + "grad_norm": NaN, + "learning_rate": 5.035838859930635e-05, + "loss": 0.0, + "step": 47559 + }, + { + "epoch": 4.437809088364281, + "grad_norm": NaN, + "learning_rate": 5.035273490198571e-05, + "loss": 0.0, + "step": 47560 + }, + { + "epoch": 4.437902398059158, + "grad_norm": NaN, + "learning_rate": 5.034708145803861e-05, + "loss": 0.0, + "step": 47561 + }, + { + "epoch": 4.437995707754036, + "grad_norm": NaN, + "learning_rate": 5.03414282674794e-05, + "loss": 0.0, + "step": 47562 + }, + { + "epoch": 4.438089017448913, + "grad_norm": NaN, + "learning_rate": 5.033577533032231e-05, + "loss": 0.0, + "step": 47563 + }, + { + "epoch": 4.438182327143791, + "grad_norm": NaN, + "learning_rate": 5.033012264658189e-05, + "loss": 0.0, + "step": 47564 + }, + { + "epoch": 4.438275636838668, + "grad_norm": NaN, + "learning_rate": 5.0324470216272476e-05, + "loss": 0.0, + "step": 47565 + }, + { + "epoch": 4.4383689465335445, + "grad_norm": NaN, + "learning_rate": 5.0318818039408316e-05, + "loss": 0.0, + "step": 47566 + }, + { + "epoch": 4.438462256228422, + "grad_norm": NaN, + "learning_rate": 5.0313166116003925e-05, + "loss": 0.0, + "step": 47567 + }, + { + "epoch": 4.438555565923299, + "grad_norm": NaN, + "learning_rate": 5.030751444607367e-05, + "loss": 0.0, + "step": 47568 + }, + { + "epoch": 4.438648875618177, + "grad_norm": NaN, + "learning_rate": 5.0301863029631766e-05, + "loss": 0.0, + "step": 47569 + }, + { + "epoch": 4.438742185313054, + "grad_norm": NaN, + "learning_rate": 5.029621186669276e-05, + "loss": 0.0, + "step": 47570 + }, + { + "epoch": 4.438835495007932, + "grad_norm": NaN, + "learning_rate": 5.0290560957270985e-05, + "loss": 0.0, + "step": 47571 + }, + { + "epoch": 4.438928804702808, + "grad_norm": NaN, + "learning_rate": 5.028491030138069e-05, + "loss": 0.0, + "step": 47572 + }, + { + "epoch": 4.439022114397686, + "grad_norm": NaN, + "learning_rate": 5.027925989903639e-05, + "loss": 0.0, + "step": 47573 + }, + { + "epoch": 4.439115424092563, + "grad_norm": NaN, + "learning_rate": 5.027360975025242e-05, + "loss": 0.0, + "step": 47574 + }, + { + "epoch": 4.43920873378744, + "grad_norm": NaN, + "learning_rate": 5.0267959855043035e-05, + "loss": 0.0, + "step": 47575 + }, + { + "epoch": 4.439302043482318, + "grad_norm": NaN, + "learning_rate": 5.026231021342272e-05, + "loss": 0.0, + "step": 47576 + }, + { + "epoch": 4.439395353177195, + "grad_norm": NaN, + "learning_rate": 5.0256660825405824e-05, + "loss": 0.0, + "step": 47577 + }, + { + "epoch": 4.439488662872073, + "grad_norm": NaN, + "learning_rate": 5.0251011691006674e-05, + "loss": 0.0, + "step": 47578 + }, + { + "epoch": 4.43958197256695, + "grad_norm": NaN, + "learning_rate": 5.0245362810239674e-05, + "loss": 0.0, + "step": 47579 + }, + { + "epoch": 4.439675282261827, + "grad_norm": NaN, + "learning_rate": 5.023971418311914e-05, + "loss": 0.0, + "step": 47580 + }, + { + "epoch": 4.439768591956704, + "grad_norm": NaN, + "learning_rate": 5.0234065809659494e-05, + "loss": 0.0, + "step": 47581 + }, + { + "epoch": 4.4398619016515815, + "grad_norm": NaN, + "learning_rate": 5.02284176898751e-05, + "loss": 0.0, + "step": 47582 + }, + { + "epoch": 4.439955211346459, + "grad_norm": NaN, + "learning_rate": 5.022276982378018e-05, + "loss": 0.0, + "step": 47583 + }, + { + "epoch": 4.440048521041336, + "grad_norm": NaN, + "learning_rate": 5.0217122211389246e-05, + "loss": 0.0, + "step": 47584 + }, + { + "epoch": 4.440141830736214, + "grad_norm": NaN, + "learning_rate": 5.021147485271668e-05, + "loss": 0.0, + "step": 47585 + }, + { + "epoch": 4.440235140431091, + "grad_norm": NaN, + "learning_rate": 5.0205827747776664e-05, + "loss": 0.0, + "step": 47586 + }, + { + "epoch": 4.440328450125968, + "grad_norm": NaN, + "learning_rate": 5.0200180896583724e-05, + "loss": 0.0, + "step": 47587 + }, + { + "epoch": 4.440421759820845, + "grad_norm": NaN, + "learning_rate": 5.019453429915221e-05, + "loss": 0.0, + "step": 47588 + }, + { + "epoch": 4.4405150695157225, + "grad_norm": NaN, + "learning_rate": 5.018888795549632e-05, + "loss": 0.0, + "step": 47589 + }, + { + "epoch": 4.4406083792106, + "grad_norm": NaN, + "learning_rate": 5.0183241865630575e-05, + "loss": 0.0, + "step": 47590 + }, + { + "epoch": 4.440701688905477, + "grad_norm": NaN, + "learning_rate": 5.0177596029569325e-05, + "loss": 0.0, + "step": 47591 + }, + { + "epoch": 4.440794998600355, + "grad_norm": NaN, + "learning_rate": 5.01719504473268e-05, + "loss": 0.0, + "step": 47592 + }, + { + "epoch": 4.440888308295232, + "grad_norm": NaN, + "learning_rate": 5.016630511891746e-05, + "loss": 0.0, + "step": 47593 + }, + { + "epoch": 4.44098161799011, + "grad_norm": NaN, + "learning_rate": 5.0160660044355686e-05, + "loss": 0.0, + "step": 47594 + }, + { + "epoch": 4.441074927684986, + "grad_norm": NaN, + "learning_rate": 5.0155015223655686e-05, + "loss": 0.0, + "step": 47595 + }, + { + "epoch": 4.4411682373798635, + "grad_norm": NaN, + "learning_rate": 5.014937065683195e-05, + "loss": 0.0, + "step": 47596 + }, + { + "epoch": 4.441261547074741, + "grad_norm": NaN, + "learning_rate": 5.014372634389885e-05, + "loss": 0.0, + "step": 47597 + }, + { + "epoch": 4.441354856769618, + "grad_norm": NaN, + "learning_rate": 5.0138082284870574e-05, + "loss": 0.0, + "step": 47598 + }, + { + "epoch": 4.441448166464496, + "grad_norm": NaN, + "learning_rate": 5.0132438479761615e-05, + "loss": 0.0, + "step": 47599 + }, + { + "epoch": 4.441541476159373, + "grad_norm": NaN, + "learning_rate": 5.0126794928586336e-05, + "loss": 0.0, + "step": 47600 + }, + { + "epoch": 4.441634785854251, + "grad_norm": NaN, + "learning_rate": 5.0121151631358914e-05, + "loss": 0.0, + "step": 47601 + }, + { + "epoch": 4.441728095549127, + "grad_norm": NaN, + "learning_rate": 5.011550858809388e-05, + "loss": 0.0, + "step": 47602 + }, + { + "epoch": 4.441821405244005, + "grad_norm": NaN, + "learning_rate": 5.0109865798805585e-05, + "loss": 0.0, + "step": 47603 + }, + { + "epoch": 4.441914714938882, + "grad_norm": NaN, + "learning_rate": 5.01042232635082e-05, + "loss": 0.0, + "step": 47604 + }, + { + "epoch": 4.442008024633759, + "grad_norm": NaN, + "learning_rate": 5.009858098221623e-05, + "loss": 0.0, + "step": 47605 + }, + { + "epoch": 4.442101334328637, + "grad_norm": NaN, + "learning_rate": 5.0092938954944016e-05, + "loss": 0.0, + "step": 47606 + }, + { + "epoch": 4.442194644023514, + "grad_norm": NaN, + "learning_rate": 5.008729718170578e-05, + "loss": 0.0, + "step": 47607 + }, + { + "epoch": 4.442287953718392, + "grad_norm": NaN, + "learning_rate": 5.0081655662516e-05, + "loss": 0.0, + "step": 47608 + }, + { + "epoch": 4.442381263413269, + "grad_norm": NaN, + "learning_rate": 5.0076014397389015e-05, + "loss": 0.0, + "step": 47609 + }, + { + "epoch": 4.442474573108146, + "grad_norm": NaN, + "learning_rate": 5.0070373386339026e-05, + "loss": 0.0, + "step": 47610 + }, + { + "epoch": 4.442567882803023, + "grad_norm": NaN, + "learning_rate": 5.006473262938053e-05, + "loss": 0.0, + "step": 47611 + }, + { + "epoch": 4.4426611924979005, + "grad_norm": NaN, + "learning_rate": 5.005909212652786e-05, + "loss": 0.0, + "step": 47612 + }, + { + "epoch": 4.442754502192778, + "grad_norm": NaN, + "learning_rate": 5.0053451877795216e-05, + "loss": 0.0, + "step": 47613 + }, + { + "epoch": 4.442847811887655, + "grad_norm": NaN, + "learning_rate": 5.0047811883197096e-05, + "loss": 0.0, + "step": 47614 + }, + { + "epoch": 4.442941121582533, + "grad_norm": NaN, + "learning_rate": 5.004217214274778e-05, + "loss": 0.0, + "step": 47615 + }, + { + "epoch": 4.443034431277409, + "grad_norm": NaN, + "learning_rate": 5.003653265646161e-05, + "loss": 0.0, + "step": 47616 + }, + { + "epoch": 4.443127740972287, + "grad_norm": NaN, + "learning_rate": 5.003089342435291e-05, + "loss": 0.0, + "step": 47617 + }, + { + "epoch": 4.443221050667164, + "grad_norm": NaN, + "learning_rate": 5.002525444643605e-05, + "loss": 0.0, + "step": 47618 + }, + { + "epoch": 4.4433143603620415, + "grad_norm": NaN, + "learning_rate": 5.0019615722725366e-05, + "loss": 0.0, + "step": 47619 + }, + { + "epoch": 4.443407670056919, + "grad_norm": NaN, + "learning_rate": 5.001397725323515e-05, + "loss": 0.0, + "step": 47620 + }, + { + "epoch": 4.443500979751796, + "grad_norm": NaN, + "learning_rate": 5.000833903797979e-05, + "loss": 0.0, + "step": 47621 + }, + { + "epoch": 4.443594289446674, + "grad_norm": NaN, + "learning_rate": 5.000270107697361e-05, + "loss": 0.0, + "step": 47622 + }, + { + "epoch": 4.443687599141551, + "grad_norm": NaN, + "learning_rate": 4.999706337023095e-05, + "loss": 0.0, + "step": 47623 + }, + { + "epoch": 4.443780908836428, + "grad_norm": NaN, + "learning_rate": 4.99914259177661e-05, + "loss": 0.0, + "step": 47624 + }, + { + "epoch": 4.443874218531305, + "grad_norm": NaN, + "learning_rate": 4.998578871959346e-05, + "loss": 0.0, + "step": 47625 + }, + { + "epoch": 4.443967528226183, + "grad_norm": NaN, + "learning_rate": 4.998015177572731e-05, + "loss": 0.0, + "step": 47626 + }, + { + "epoch": 4.44406083792106, + "grad_norm": NaN, + "learning_rate": 4.997451508618202e-05, + "loss": 0.0, + "step": 47627 + }, + { + "epoch": 4.444154147615937, + "grad_norm": NaN, + "learning_rate": 4.9968878650971906e-05, + "loss": 0.0, + "step": 47628 + }, + { + "epoch": 4.444247457310815, + "grad_norm": NaN, + "learning_rate": 4.996324247011135e-05, + "loss": 0.0, + "step": 47629 + }, + { + "epoch": 4.444340767005692, + "grad_norm": NaN, + "learning_rate": 4.995760654361454e-05, + "loss": 0.0, + "step": 47630 + }, + { + "epoch": 4.444434076700569, + "grad_norm": NaN, + "learning_rate": 4.995197087149595e-05, + "loss": 0.0, + "step": 47631 + }, + { + "epoch": 4.444527386395446, + "grad_norm": NaN, + "learning_rate": 4.994633545376991e-05, + "loss": 0.0, + "step": 47632 + }, + { + "epoch": 4.444620696090324, + "grad_norm": NaN, + "learning_rate": 4.99407002904506e-05, + "loss": 0.0, + "step": 47633 + }, + { + "epoch": 4.444714005785201, + "grad_norm": NaN, + "learning_rate": 4.9935065381552496e-05, + "loss": 0.0, + "step": 47634 + }, + { + "epoch": 4.4448073154800785, + "grad_norm": NaN, + "learning_rate": 4.992943072708993e-05, + "loss": 0.0, + "step": 47635 + }, + { + "epoch": 4.444900625174956, + "grad_norm": NaN, + "learning_rate": 4.992379632707709e-05, + "loss": 0.0, + "step": 47636 + }, + { + "epoch": 4.444993934869833, + "grad_norm": NaN, + "learning_rate": 4.991816218152844e-05, + "loss": 0.0, + "step": 47637 + }, + { + "epoch": 4.445087244564711, + "grad_norm": NaN, + "learning_rate": 4.991252829045829e-05, + "loss": 0.0, + "step": 47638 + }, + { + "epoch": 4.445180554259587, + "grad_norm": NaN, + "learning_rate": 4.9906894653880845e-05, + "loss": 0.0, + "step": 47639 + }, + { + "epoch": 4.445273863954465, + "grad_norm": NaN, + "learning_rate": 4.990126127181057e-05, + "loss": 0.0, + "step": 47640 + }, + { + "epoch": 4.445367173649342, + "grad_norm": NaN, + "learning_rate": 4.9895628144261766e-05, + "loss": 0.0, + "step": 47641 + }, + { + "epoch": 4.4454604833442195, + "grad_norm": NaN, + "learning_rate": 4.988999527124866e-05, + "loss": 0.0, + "step": 47642 + }, + { + "epoch": 4.445553793039097, + "grad_norm": NaN, + "learning_rate": 4.988436265278566e-05, + "loss": 0.0, + "step": 47643 + }, + { + "epoch": 4.445647102733974, + "grad_norm": NaN, + "learning_rate": 4.987873028888713e-05, + "loss": 0.0, + "step": 47644 + }, + { + "epoch": 4.445740412428852, + "grad_norm": NaN, + "learning_rate": 4.987309817956725e-05, + "loss": 0.0, + "step": 47645 + }, + { + "epoch": 4.445833722123728, + "grad_norm": NaN, + "learning_rate": 4.986746632484044e-05, + "loss": 0.0, + "step": 47646 + }, + { + "epoch": 4.445927031818606, + "grad_norm": NaN, + "learning_rate": 4.986183472472107e-05, + "loss": 0.0, + "step": 47647 + }, + { + "epoch": 4.446020341513483, + "grad_norm": NaN, + "learning_rate": 4.9856203379223285e-05, + "loss": 0.0, + "step": 47648 + }, + { + "epoch": 4.4461136512083606, + "grad_norm": NaN, + "learning_rate": 4.985057228836157e-05, + "loss": 0.0, + "step": 47649 + }, + { + "epoch": 4.446206960903238, + "grad_norm": NaN, + "learning_rate": 4.984494145215022e-05, + "loss": 0.0, + "step": 47650 + }, + { + "epoch": 4.446300270598115, + "grad_norm": NaN, + "learning_rate": 4.9839310870603424e-05, + "loss": 0.0, + "step": 47651 + }, + { + "epoch": 4.446393580292993, + "grad_norm": NaN, + "learning_rate": 4.983368054373564e-05, + "loss": 0.0, + "step": 47652 + }, + { + "epoch": 4.446486889987869, + "grad_norm": NaN, + "learning_rate": 4.982805047156114e-05, + "loss": 0.0, + "step": 47653 + }, + { + "epoch": 4.446580199682747, + "grad_norm": NaN, + "learning_rate": 4.9822420654094225e-05, + "loss": 0.0, + "step": 47654 + }, + { + "epoch": 4.446673509377624, + "grad_norm": NaN, + "learning_rate": 4.9816791091349226e-05, + "loss": 0.0, + "step": 47655 + }, + { + "epoch": 4.446766819072502, + "grad_norm": NaN, + "learning_rate": 4.9811161783340435e-05, + "loss": 0.0, + "step": 47656 + }, + { + "epoch": 4.446860128767379, + "grad_norm": NaN, + "learning_rate": 4.980553273008219e-05, + "loss": 0.0, + "step": 47657 + }, + { + "epoch": 4.446953438462256, + "grad_norm": NaN, + "learning_rate": 4.9799903931588796e-05, + "loss": 0.0, + "step": 47658 + }, + { + "epoch": 4.447046748157134, + "grad_norm": NaN, + "learning_rate": 4.979427538787457e-05, + "loss": 0.0, + "step": 47659 + }, + { + "epoch": 4.44714005785201, + "grad_norm": NaN, + "learning_rate": 4.978864709895381e-05, + "loss": 0.0, + "step": 47660 + }, + { + "epoch": 4.447233367546888, + "grad_norm": NaN, + "learning_rate": 4.978301906484085e-05, + "loss": 0.0, + "step": 47661 + }, + { + "epoch": 4.447326677241765, + "grad_norm": NaN, + "learning_rate": 4.977739128554998e-05, + "loss": 0.0, + "step": 47662 + }, + { + "epoch": 4.447419986936643, + "grad_norm": NaN, + "learning_rate": 4.977176376109552e-05, + "loss": 0.0, + "step": 47663 + }, + { + "epoch": 4.44751329663152, + "grad_norm": NaN, + "learning_rate": 4.976613649149176e-05, + "loss": 0.0, + "step": 47664 + }, + { + "epoch": 4.4476066063263975, + "grad_norm": NaN, + "learning_rate": 4.976050947675303e-05, + "loss": 0.0, + "step": 47665 + }, + { + "epoch": 4.447699916021275, + "grad_norm": NaN, + "learning_rate": 4.975488271689364e-05, + "loss": 0.0, + "step": 47666 + }, + { + "epoch": 4.447793225716152, + "grad_norm": NaN, + "learning_rate": 4.974925621192788e-05, + "loss": 0.0, + "step": 47667 + }, + { + "epoch": 4.447886535411029, + "grad_norm": NaN, + "learning_rate": 4.974362996187007e-05, + "loss": 0.0, + "step": 47668 + }, + { + "epoch": 4.447979845105906, + "grad_norm": NaN, + "learning_rate": 4.973800396673451e-05, + "loss": 0.0, + "step": 47669 + }, + { + "epoch": 4.448073154800784, + "grad_norm": NaN, + "learning_rate": 4.9732378226535504e-05, + "loss": 0.0, + "step": 47670 + }, + { + "epoch": 4.448166464495661, + "grad_norm": NaN, + "learning_rate": 4.972675274128736e-05, + "loss": 0.0, + "step": 47671 + }, + { + "epoch": 4.4482597741905385, + "grad_norm": NaN, + "learning_rate": 4.972112751100438e-05, + "loss": 0.0, + "step": 47672 + }, + { + "epoch": 4.448353083885416, + "grad_norm": NaN, + "learning_rate": 4.971550253570092e-05, + "loss": 0.0, + "step": 47673 + }, + { + "epoch": 4.448446393580293, + "grad_norm": NaN, + "learning_rate": 4.970987781539112e-05, + "loss": 0.0, + "step": 47674 + }, + { + "epoch": 4.44853970327517, + "grad_norm": NaN, + "learning_rate": 4.970425335008945e-05, + "loss": 0.0, + "step": 47675 + }, + { + "epoch": 4.448633012970047, + "grad_norm": NaN, + "learning_rate": 4.9698629139810206e-05, + "loss": 0.0, + "step": 47676 + }, + { + "epoch": 4.448726322664925, + "grad_norm": NaN, + "learning_rate": 4.969300518456753e-05, + "loss": 0.0, + "step": 47677 + }, + { + "epoch": 4.448819632359802, + "grad_norm": NaN, + "learning_rate": 4.968738148437589e-05, + "loss": 0.0, + "step": 47678 + }, + { + "epoch": 4.44891294205468, + "grad_norm": NaN, + "learning_rate": 4.968175803924956e-05, + "loss": 0.0, + "step": 47679 + }, + { + "epoch": 4.449006251749557, + "grad_norm": NaN, + "learning_rate": 4.967613484920272e-05, + "loss": 0.0, + "step": 47680 + }, + { + "epoch": 4.449099561444434, + "grad_norm": NaN, + "learning_rate": 4.96705119142498e-05, + "loss": 0.0, + "step": 47681 + }, + { + "epoch": 4.449192871139312, + "grad_norm": NaN, + "learning_rate": 4.9664889234405075e-05, + "loss": 0.0, + "step": 47682 + }, + { + "epoch": 4.449286180834188, + "grad_norm": NaN, + "learning_rate": 4.965926680968273e-05, + "loss": 0.0, + "step": 47683 + }, + { + "epoch": 4.449379490529066, + "grad_norm": NaN, + "learning_rate": 4.9653644640097196e-05, + "loss": 0.0, + "step": 47684 + }, + { + "epoch": 4.449472800223943, + "grad_norm": NaN, + "learning_rate": 4.964802272566278e-05, + "loss": 0.0, + "step": 47685 + }, + { + "epoch": 4.449566109918821, + "grad_norm": NaN, + "learning_rate": 4.96424010663936e-05, + "loss": 0.0, + "step": 47686 + }, + { + "epoch": 4.449659419613698, + "grad_norm": NaN, + "learning_rate": 4.9636779662304116e-05, + "loss": 0.0, + "step": 47687 + }, + { + "epoch": 4.4497527293085755, + "grad_norm": NaN, + "learning_rate": 4.963115851340858e-05, + "loss": 0.0, + "step": 47688 + }, + { + "epoch": 4.449846039003452, + "grad_norm": NaN, + "learning_rate": 4.962553761972127e-05, + "loss": 0.0, + "step": 47689 + }, + { + "epoch": 4.449939348698329, + "grad_norm": NaN, + "learning_rate": 4.96199169812565e-05, + "loss": 0.0, + "step": 47690 + }, + { + "epoch": 4.450032658393207, + "grad_norm": NaN, + "learning_rate": 4.961429659802852e-05, + "loss": 0.0, + "step": 47691 + }, + { + "epoch": 4.450125968088084, + "grad_norm": NaN, + "learning_rate": 4.960867647005167e-05, + "loss": 0.0, + "step": 47692 + }, + { + "epoch": 4.450219277782962, + "grad_norm": NaN, + "learning_rate": 4.960305659734021e-05, + "loss": 0.0, + "step": 47693 + }, + { + "epoch": 4.450312587477839, + "grad_norm": NaN, + "learning_rate": 4.9597436979908434e-05, + "loss": 0.0, + "step": 47694 + }, + { + "epoch": 4.4504058971727165, + "grad_norm": NaN, + "learning_rate": 4.959181761777064e-05, + "loss": 0.0, + "step": 47695 + }, + { + "epoch": 4.450499206867594, + "grad_norm": NaN, + "learning_rate": 4.9586198510941107e-05, + "loss": 0.0, + "step": 47696 + }, + { + "epoch": 4.4505925165624705, + "grad_norm": NaN, + "learning_rate": 4.9580579659434116e-05, + "loss": 0.0, + "step": 47697 + }, + { + "epoch": 4.450685826257348, + "grad_norm": NaN, + "learning_rate": 4.957496106326398e-05, + "loss": 0.0, + "step": 47698 + }, + { + "epoch": 4.450779135952225, + "grad_norm": NaN, + "learning_rate": 4.9569342722444966e-05, + "loss": 0.0, + "step": 47699 + }, + { + "epoch": 4.450872445647103, + "grad_norm": NaN, + "learning_rate": 4.956372463699136e-05, + "loss": 0.0, + "step": 47700 + }, + { + "epoch": 4.45096575534198, + "grad_norm": NaN, + "learning_rate": 4.9558106806917466e-05, + "loss": 0.0, + "step": 47701 + }, + { + "epoch": 4.4510590650368576, + "grad_norm": NaN, + "learning_rate": 4.9552489232237535e-05, + "loss": 0.0, + "step": 47702 + }, + { + "epoch": 4.451152374731735, + "grad_norm": NaN, + "learning_rate": 4.954687191296587e-05, + "loss": 0.0, + "step": 47703 + }, + { + "epoch": 4.4512456844266115, + "grad_norm": NaN, + "learning_rate": 4.954125484911677e-05, + "loss": 0.0, + "step": 47704 + }, + { + "epoch": 4.451338994121489, + "grad_norm": NaN, + "learning_rate": 4.953563804070449e-05, + "loss": 0.0, + "step": 47705 + }, + { + "epoch": 4.451432303816366, + "grad_norm": NaN, + "learning_rate": 4.953002148774332e-05, + "loss": 0.0, + "step": 47706 + }, + { + "epoch": 4.451525613511244, + "grad_norm": NaN, + "learning_rate": 4.9524405190247554e-05, + "loss": 0.0, + "step": 47707 + }, + { + "epoch": 4.451618923206121, + "grad_norm": NaN, + "learning_rate": 4.951878914823144e-05, + "loss": 0.0, + "step": 47708 + }, + { + "epoch": 4.451712232900999, + "grad_norm": NaN, + "learning_rate": 4.95131733617093e-05, + "loss": 0.0, + "step": 47709 + }, + { + "epoch": 4.451805542595876, + "grad_norm": NaN, + "learning_rate": 4.9507557830695385e-05, + "loss": 0.0, + "step": 47710 + }, + { + "epoch": 4.451898852290753, + "grad_norm": NaN, + "learning_rate": 4.9501942555203984e-05, + "loss": 0.0, + "step": 47711 + }, + { + "epoch": 4.45199216198563, + "grad_norm": NaN, + "learning_rate": 4.9496327535249386e-05, + "loss": 0.0, + "step": 47712 + }, + { + "epoch": 4.452085471680507, + "grad_norm": NaN, + "learning_rate": 4.9490712770845833e-05, + "loss": 0.0, + "step": 47713 + }, + { + "epoch": 4.452178781375385, + "grad_norm": NaN, + "learning_rate": 4.948509826200764e-05, + "loss": 0.0, + "step": 47714 + }, + { + "epoch": 4.452272091070262, + "grad_norm": NaN, + "learning_rate": 4.947948400874907e-05, + "loss": 0.0, + "step": 47715 + }, + { + "epoch": 4.45236540076514, + "grad_norm": NaN, + "learning_rate": 4.947387001108439e-05, + "loss": 0.0, + "step": 47716 + }, + { + "epoch": 4.452458710460017, + "grad_norm": NaN, + "learning_rate": 4.9468256269027925e-05, + "loss": 0.0, + "step": 47717 + }, + { + "epoch": 4.4525520201548945, + "grad_norm": NaN, + "learning_rate": 4.946264278259381e-05, + "loss": 0.0, + "step": 47718 + }, + { + "epoch": 4.452645329849771, + "grad_norm": NaN, + "learning_rate": 4.9457029551796466e-05, + "loss": 0.0, + "step": 47719 + }, + { + "epoch": 4.452738639544648, + "grad_norm": NaN, + "learning_rate": 4.945141657665016e-05, + "loss": 0.0, + "step": 47720 + }, + { + "epoch": 4.452831949239526, + "grad_norm": NaN, + "learning_rate": 4.9445803857169026e-05, + "loss": 0.0, + "step": 47721 + }, + { + "epoch": 4.452925258934403, + "grad_norm": NaN, + "learning_rate": 4.9440191393367464e-05, + "loss": 0.0, + "step": 47722 + }, + { + "epoch": 4.453018568629281, + "grad_norm": NaN, + "learning_rate": 4.943457918525976e-05, + "loss": 0.0, + "step": 47723 + }, + { + "epoch": 4.453111878324158, + "grad_norm": NaN, + "learning_rate": 4.942896723286002e-05, + "loss": 0.0, + "step": 47724 + }, + { + "epoch": 4.4532051880190355, + "grad_norm": NaN, + "learning_rate": 4.942335553618269e-05, + "loss": 0.0, + "step": 47725 + }, + { + "epoch": 4.453298497713912, + "grad_norm": NaN, + "learning_rate": 4.9417744095241984e-05, + "loss": 0.0, + "step": 47726 + }, + { + "epoch": 4.4533918074087895, + "grad_norm": NaN, + "learning_rate": 4.9412132910052146e-05, + "loss": 0.0, + "step": 47727 + }, + { + "epoch": 4.453485117103667, + "grad_norm": NaN, + "learning_rate": 4.9406521980627444e-05, + "loss": 0.0, + "step": 47728 + }, + { + "epoch": 4.453578426798544, + "grad_norm": NaN, + "learning_rate": 4.9400911306982175e-05, + "loss": 0.0, + "step": 47729 + }, + { + "epoch": 4.453671736493422, + "grad_norm": NaN, + "learning_rate": 4.93953008891306e-05, + "loss": 0.0, + "step": 47730 + }, + { + "epoch": 4.453765046188299, + "grad_norm": NaN, + "learning_rate": 4.9389690727086955e-05, + "loss": 0.0, + "step": 47731 + }, + { + "epoch": 4.453858355883177, + "grad_norm": NaN, + "learning_rate": 4.938408082086553e-05, + "loss": 0.0, + "step": 47732 + }, + { + "epoch": 4.453951665578053, + "grad_norm": NaN, + "learning_rate": 4.937847117048059e-05, + "loss": 0.0, + "step": 47733 + }, + { + "epoch": 4.4540449752729305, + "grad_norm": NaN, + "learning_rate": 4.937286177594637e-05, + "loss": 0.0, + "step": 47734 + }, + { + "epoch": 4.454138284967808, + "grad_norm": NaN, + "learning_rate": 4.9367252637277164e-05, + "loss": 0.0, + "step": 47735 + }, + { + "epoch": 4.454231594662685, + "grad_norm": NaN, + "learning_rate": 4.936164375448725e-05, + "loss": 0.0, + "step": 47736 + }, + { + "epoch": 4.454324904357563, + "grad_norm": NaN, + "learning_rate": 4.935603512759083e-05, + "loss": 0.0, + "step": 47737 + }, + { + "epoch": 4.45441821405244, + "grad_norm": NaN, + "learning_rate": 4.935042675660221e-05, + "loss": 0.0, + "step": 47738 + }, + { + "epoch": 4.454511523747318, + "grad_norm": NaN, + "learning_rate": 4.9344818641535644e-05, + "loss": 0.0, + "step": 47739 + }, + { + "epoch": 4.454604833442195, + "grad_norm": NaN, + "learning_rate": 4.9339210782405395e-05, + "loss": 0.0, + "step": 47740 + }, + { + "epoch": 4.454698143137072, + "grad_norm": NaN, + "learning_rate": 4.9333603179225713e-05, + "loss": 0.0, + "step": 47741 + }, + { + "epoch": 4.454791452831949, + "grad_norm": NaN, + "learning_rate": 4.9327995832010844e-05, + "loss": 0.0, + "step": 47742 + }, + { + "epoch": 4.454884762526826, + "grad_norm": NaN, + "learning_rate": 4.932238874077508e-05, + "loss": 0.0, + "step": 47743 + }, + { + "epoch": 4.454978072221704, + "grad_norm": NaN, + "learning_rate": 4.931678190553264e-05, + "loss": 0.0, + "step": 47744 + }, + { + "epoch": 4.455071381916581, + "grad_norm": NaN, + "learning_rate": 4.9311175326297815e-05, + "loss": 0.0, + "step": 47745 + }, + { + "epoch": 4.455164691611459, + "grad_norm": NaN, + "learning_rate": 4.9305569003084826e-05, + "loss": 0.0, + "step": 47746 + }, + { + "epoch": 4.455258001306336, + "grad_norm": NaN, + "learning_rate": 4.929996293590797e-05, + "loss": 0.0, + "step": 47747 + }, + { + "epoch": 4.455351311001213, + "grad_norm": NaN, + "learning_rate": 4.9294357124781454e-05, + "loss": 0.0, + "step": 47748 + }, + { + "epoch": 4.45544462069609, + "grad_norm": NaN, + "learning_rate": 4.928875156971958e-05, + "loss": 0.0, + "step": 47749 + }, + { + "epoch": 4.4555379303909675, + "grad_norm": NaN, + "learning_rate": 4.928314627073656e-05, + "loss": 0.0, + "step": 47750 + }, + { + "epoch": 4.455631240085845, + "grad_norm": NaN, + "learning_rate": 4.927754122784667e-05, + "loss": 0.0, + "step": 47751 + }, + { + "epoch": 4.455724549780722, + "grad_norm": NaN, + "learning_rate": 4.927193644106416e-05, + "loss": 0.0, + "step": 47752 + }, + { + "epoch": 4.4558178594756, + "grad_norm": NaN, + "learning_rate": 4.926633191040328e-05, + "loss": 0.0, + "step": 47753 + }, + { + "epoch": 4.455911169170477, + "grad_norm": NaN, + "learning_rate": 4.926072763587827e-05, + "loss": 0.0, + "step": 47754 + }, + { + "epoch": 4.456004478865355, + "grad_norm": NaN, + "learning_rate": 4.92551236175034e-05, + "loss": 0.0, + "step": 47755 + }, + { + "epoch": 4.456097788560231, + "grad_norm": NaN, + "learning_rate": 4.924951985529289e-05, + "loss": 0.0, + "step": 47756 + }, + { + "epoch": 4.4561910982551085, + "grad_norm": NaN, + "learning_rate": 4.924391634926103e-05, + "loss": 0.0, + "step": 47757 + }, + { + "epoch": 4.456284407949986, + "grad_norm": NaN, + "learning_rate": 4.9238313099422025e-05, + "loss": 0.0, + "step": 47758 + }, + { + "epoch": 4.456377717644863, + "grad_norm": NaN, + "learning_rate": 4.9232710105790154e-05, + "loss": 0.0, + "step": 47759 + }, + { + "epoch": 4.456471027339741, + "grad_norm": NaN, + "learning_rate": 4.922710736837964e-05, + "loss": 0.0, + "step": 47760 + }, + { + "epoch": 4.456564337034618, + "grad_norm": NaN, + "learning_rate": 4.922150488720475e-05, + "loss": 0.0, + "step": 47761 + }, + { + "epoch": 4.456657646729496, + "grad_norm": NaN, + "learning_rate": 4.921590266227971e-05, + "loss": 0.0, + "step": 47762 + }, + { + "epoch": 4.456750956424372, + "grad_norm": NaN, + "learning_rate": 4.921030069361879e-05, + "loss": 0.0, + "step": 47763 + }, + { + "epoch": 4.4568442661192496, + "grad_norm": NaN, + "learning_rate": 4.9204698981236205e-05, + "loss": 0.0, + "step": 47764 + }, + { + "epoch": 4.456937575814127, + "grad_norm": NaN, + "learning_rate": 4.9199097525146224e-05, + "loss": 0.0, + "step": 47765 + }, + { + "epoch": 4.457030885509004, + "grad_norm": NaN, + "learning_rate": 4.9193496325363066e-05, + "loss": 0.0, + "step": 47766 + }, + { + "epoch": 4.457124195203882, + "grad_norm": NaN, + "learning_rate": 4.918789538190101e-05, + "loss": 0.0, + "step": 47767 + }, + { + "epoch": 4.457217504898759, + "grad_norm": NaN, + "learning_rate": 4.918229469477426e-05, + "loss": 0.0, + "step": 47768 + }, + { + "epoch": 4.457310814593637, + "grad_norm": NaN, + "learning_rate": 4.917669426399706e-05, + "loss": 0.0, + "step": 47769 + }, + { + "epoch": 4.457404124288513, + "grad_norm": NaN, + "learning_rate": 4.917109408958367e-05, + "loss": 0.0, + "step": 47770 + }, + { + "epoch": 4.457497433983391, + "grad_norm": NaN, + "learning_rate": 4.916549417154831e-05, + "loss": 0.0, + "step": 47771 + }, + { + "epoch": 4.457590743678268, + "grad_norm": NaN, + "learning_rate": 4.9159894509905235e-05, + "loss": 0.0, + "step": 47772 + }, + { + "epoch": 4.457684053373145, + "grad_norm": NaN, + "learning_rate": 4.915429510466869e-05, + "loss": 0.0, + "step": 47773 + }, + { + "epoch": 4.457777363068023, + "grad_norm": NaN, + "learning_rate": 4.914869595585288e-05, + "loss": 0.0, + "step": 47774 + }, + { + "epoch": 4.4578706727629, + "grad_norm": NaN, + "learning_rate": 4.9143097063472074e-05, + "loss": 0.0, + "step": 47775 + }, + { + "epoch": 4.457963982457778, + "grad_norm": NaN, + "learning_rate": 4.91374984275405e-05, + "loss": 0.0, + "step": 47776 + }, + { + "epoch": 4.458057292152654, + "grad_norm": NaN, + "learning_rate": 4.9131900048072375e-05, + "loss": 0.0, + "step": 47777 + }, + { + "epoch": 4.458150601847532, + "grad_norm": NaN, + "learning_rate": 4.912630192508196e-05, + "loss": 0.0, + "step": 47778 + }, + { + "epoch": 4.458243911542409, + "grad_norm": NaN, + "learning_rate": 4.9120704058583465e-05, + "loss": 0.0, + "step": 47779 + }, + { + "epoch": 4.4583372212372865, + "grad_norm": NaN, + "learning_rate": 4.911510644859116e-05, + "loss": 0.0, + "step": 47780 + }, + { + "epoch": 4.458430530932164, + "grad_norm": NaN, + "learning_rate": 4.910950909511924e-05, + "loss": 0.0, + "step": 47781 + }, + { + "epoch": 4.458523840627041, + "grad_norm": NaN, + "learning_rate": 4.9103911998181944e-05, + "loss": 0.0, + "step": 47782 + }, + { + "epoch": 4.458617150321919, + "grad_norm": NaN, + "learning_rate": 4.9098315157793536e-05, + "loss": 0.0, + "step": 47783 + }, + { + "epoch": 4.458710460016796, + "grad_norm": NaN, + "learning_rate": 4.909271857396821e-05, + "loss": 0.0, + "step": 47784 + }, + { + "epoch": 4.458803769711673, + "grad_norm": NaN, + "learning_rate": 4.908712224672019e-05, + "loss": 0.0, + "step": 47785 + }, + { + "epoch": 4.45889707940655, + "grad_norm": NaN, + "learning_rate": 4.9081526176063766e-05, + "loss": 0.0, + "step": 47786 + }, + { + "epoch": 4.4589903891014275, + "grad_norm": NaN, + "learning_rate": 4.90759303620131e-05, + "loss": 0.0, + "step": 47787 + }, + { + "epoch": 4.459083698796305, + "grad_norm": NaN, + "learning_rate": 4.9070334804582444e-05, + "loss": 0.0, + "step": 47788 + }, + { + "epoch": 4.459177008491182, + "grad_norm": NaN, + "learning_rate": 4.906473950378604e-05, + "loss": 0.0, + "step": 47789 + }, + { + "epoch": 4.45927031818606, + "grad_norm": NaN, + "learning_rate": 4.90591444596381e-05, + "loss": 0.0, + "step": 47790 + }, + { + "epoch": 4.459363627880937, + "grad_norm": NaN, + "learning_rate": 4.905354967215285e-05, + "loss": 0.0, + "step": 47791 + }, + { + "epoch": 4.459456937575814, + "grad_norm": NaN, + "learning_rate": 4.9047955141344536e-05, + "loss": 0.0, + "step": 47792 + }, + { + "epoch": 4.459550247270691, + "grad_norm": NaN, + "learning_rate": 4.904236086722735e-05, + "loss": 0.0, + "step": 47793 + }, + { + "epoch": 4.459643556965569, + "grad_norm": NaN, + "learning_rate": 4.9036766849815543e-05, + "loss": 0.0, + "step": 47794 + }, + { + "epoch": 4.459736866660446, + "grad_norm": NaN, + "learning_rate": 4.903117308912335e-05, + "loss": 0.0, + "step": 47795 + }, + { + "epoch": 4.459830176355323, + "grad_norm": NaN, + "learning_rate": 4.9025579585164956e-05, + "loss": 0.0, + "step": 47796 + }, + { + "epoch": 4.459923486050201, + "grad_norm": NaN, + "learning_rate": 4.9019986337954596e-05, + "loss": 0.0, + "step": 47797 + }, + { + "epoch": 4.460016795745078, + "grad_norm": NaN, + "learning_rate": 4.9014393347506505e-05, + "loss": 0.0, + "step": 47798 + }, + { + "epoch": 4.460110105439956, + "grad_norm": NaN, + "learning_rate": 4.9008800613834894e-05, + "loss": 0.0, + "step": 47799 + }, + { + "epoch": 4.460203415134832, + "grad_norm": NaN, + "learning_rate": 4.9003208136953986e-05, + "loss": 0.0, + "step": 47800 + }, + { + "epoch": 4.46029672482971, + "grad_norm": NaN, + "learning_rate": 4.899761591687801e-05, + "loss": 0.0, + "step": 47801 + }, + { + "epoch": 4.460390034524587, + "grad_norm": NaN, + "learning_rate": 4.899202395362118e-05, + "loss": 0.0, + "step": 47802 + }, + { + "epoch": 4.4604833442194645, + "grad_norm": NaN, + "learning_rate": 4.8986432247197706e-05, + "loss": 0.0, + "step": 47803 + }, + { + "epoch": 4.460576653914342, + "grad_norm": NaN, + "learning_rate": 4.898084079762181e-05, + "loss": 0.0, + "step": 47804 + }, + { + "epoch": 4.460669963609219, + "grad_norm": NaN, + "learning_rate": 4.8975249604907704e-05, + "loss": 0.0, + "step": 47805 + }, + { + "epoch": 4.460763273304096, + "grad_norm": NaN, + "learning_rate": 4.896965866906963e-05, + "loss": 0.0, + "step": 47806 + }, + { + "epoch": 4.460856582998973, + "grad_norm": NaN, + "learning_rate": 4.896406799012177e-05, + "loss": 0.0, + "step": 47807 + }, + { + "epoch": 4.460949892693851, + "grad_norm": NaN, + "learning_rate": 4.8958477568078366e-05, + "loss": 0.0, + "step": 47808 + }, + { + "epoch": 4.461043202388728, + "grad_norm": NaN, + "learning_rate": 4.895288740295363e-05, + "loss": 0.0, + "step": 47809 + }, + { + "epoch": 4.4611365120836055, + "grad_norm": NaN, + "learning_rate": 4.894729749476175e-05, + "loss": 0.0, + "step": 47810 + }, + { + "epoch": 4.461229821778483, + "grad_norm": NaN, + "learning_rate": 4.894170784351697e-05, + "loss": 0.0, + "step": 47811 + }, + { + "epoch": 4.46132313147336, + "grad_norm": NaN, + "learning_rate": 4.8936118449233495e-05, + "loss": 0.0, + "step": 47812 + }, + { + "epoch": 4.461416441168238, + "grad_norm": NaN, + "learning_rate": 4.893052931192554e-05, + "loss": 0.0, + "step": 47813 + }, + { + "epoch": 4.461509750863114, + "grad_norm": NaN, + "learning_rate": 4.8924940431607304e-05, + "loss": 0.0, + "step": 47814 + }, + { + "epoch": 4.461603060557992, + "grad_norm": NaN, + "learning_rate": 4.8919351808293e-05, + "loss": 0.0, + "step": 47815 + }, + { + "epoch": 4.461696370252869, + "grad_norm": NaN, + "learning_rate": 4.8913763441996826e-05, + "loss": 0.0, + "step": 47816 + }, + { + "epoch": 4.461789679947747, + "grad_norm": NaN, + "learning_rate": 4.8908175332733026e-05, + "loss": 0.0, + "step": 47817 + }, + { + "epoch": 4.461882989642624, + "grad_norm": NaN, + "learning_rate": 4.890258748051579e-05, + "loss": 0.0, + "step": 47818 + }, + { + "epoch": 4.461976299337501, + "grad_norm": NaN, + "learning_rate": 4.8896999885359325e-05, + "loss": 0.0, + "step": 47819 + }, + { + "epoch": 4.462069609032379, + "grad_norm": NaN, + "learning_rate": 4.889141254727784e-05, + "loss": 0.0, + "step": 47820 + }, + { + "epoch": 4.462162918727255, + "grad_norm": NaN, + "learning_rate": 4.888582546628553e-05, + "loss": 0.0, + "step": 47821 + }, + { + "epoch": 4.462256228422133, + "grad_norm": NaN, + "learning_rate": 4.8880238642396635e-05, + "loss": 0.0, + "step": 47822 + }, + { + "epoch": 4.46234953811701, + "grad_norm": NaN, + "learning_rate": 4.8874652075625334e-05, + "loss": 0.0, + "step": 47823 + }, + { + "epoch": 4.462442847811888, + "grad_norm": NaN, + "learning_rate": 4.8869065765985827e-05, + "loss": 0.0, + "step": 47824 + }, + { + "epoch": 4.462536157506765, + "grad_norm": NaN, + "learning_rate": 4.886347971349234e-05, + "loss": 0.0, + "step": 47825 + }, + { + "epoch": 4.462629467201642, + "grad_norm": NaN, + "learning_rate": 4.8857893918159055e-05, + "loss": 0.0, + "step": 47826 + }, + { + "epoch": 4.46272277689652, + "grad_norm": NaN, + "learning_rate": 4.885230838000019e-05, + "loss": 0.0, + "step": 47827 + }, + { + "epoch": 4.462816086591397, + "grad_norm": NaN, + "learning_rate": 4.884672309902994e-05, + "loss": 0.0, + "step": 47828 + }, + { + "epoch": 4.462909396286274, + "grad_norm": NaN, + "learning_rate": 4.884113807526251e-05, + "loss": 0.0, + "step": 47829 + }, + { + "epoch": 4.463002705981151, + "grad_norm": NaN, + "learning_rate": 4.883555330871211e-05, + "loss": 0.0, + "step": 47830 + }, + { + "epoch": 4.463096015676029, + "grad_norm": NaN, + "learning_rate": 4.882996879939292e-05, + "loss": 0.0, + "step": 47831 + }, + { + "epoch": 4.463189325370906, + "grad_norm": NaN, + "learning_rate": 4.8824384547319164e-05, + "loss": 0.0, + "step": 47832 + }, + { + "epoch": 4.4632826350657835, + "grad_norm": NaN, + "learning_rate": 4.8818800552505e-05, + "loss": 0.0, + "step": 47833 + }, + { + "epoch": 4.463375944760661, + "grad_norm": NaN, + "learning_rate": 4.8813216814964686e-05, + "loss": 0.0, + "step": 47834 + }, + { + "epoch": 4.463469254455538, + "grad_norm": NaN, + "learning_rate": 4.880763333471237e-05, + "loss": 0.0, + "step": 47835 + }, + { + "epoch": 4.463562564150415, + "grad_norm": NaN, + "learning_rate": 4.880205011176227e-05, + "loss": 0.0, + "step": 47836 + }, + { + "epoch": 4.463655873845292, + "grad_norm": NaN, + "learning_rate": 4.879646714612857e-05, + "loss": 0.0, + "step": 47837 + }, + { + "epoch": 4.46374918354017, + "grad_norm": NaN, + "learning_rate": 4.879088443782549e-05, + "loss": 0.0, + "step": 47838 + }, + { + "epoch": 4.463842493235047, + "grad_norm": NaN, + "learning_rate": 4.8785301986867216e-05, + "loss": 0.0, + "step": 47839 + }, + { + "epoch": 4.4639358029299245, + "grad_norm": NaN, + "learning_rate": 4.877971979326792e-05, + "loss": 0.0, + "step": 47840 + }, + { + "epoch": 4.464029112624802, + "grad_norm": NaN, + "learning_rate": 4.8774137857041826e-05, + "loss": 0.0, + "step": 47841 + }, + { + "epoch": 4.464122422319679, + "grad_norm": NaN, + "learning_rate": 4.87685561782031e-05, + "loss": 0.0, + "step": 47842 + }, + { + "epoch": 4.464215732014556, + "grad_norm": NaN, + "learning_rate": 4.876297475676596e-05, + "loss": 0.0, + "step": 47843 + }, + { + "epoch": 4.464309041709433, + "grad_norm": NaN, + "learning_rate": 4.875739359274458e-05, + "loss": 0.0, + "step": 47844 + }, + { + "epoch": 4.464402351404311, + "grad_norm": NaN, + "learning_rate": 4.875181268615317e-05, + "loss": 0.0, + "step": 47845 + }, + { + "epoch": 4.464495661099188, + "grad_norm": NaN, + "learning_rate": 4.874623203700589e-05, + "loss": 0.0, + "step": 47846 + }, + { + "epoch": 4.464588970794066, + "grad_norm": NaN, + "learning_rate": 4.874065164531696e-05, + "loss": 0.0, + "step": 47847 + }, + { + "epoch": 4.464682280488943, + "grad_norm": NaN, + "learning_rate": 4.873507151110055e-05, + "loss": 0.0, + "step": 47848 + }, + { + "epoch": 4.46477559018382, + "grad_norm": NaN, + "learning_rate": 4.872949163437086e-05, + "loss": 0.0, + "step": 47849 + }, + { + "epoch": 4.464868899878697, + "grad_norm": NaN, + "learning_rate": 4.8723912015142085e-05, + "loss": 0.0, + "step": 47850 + }, + { + "epoch": 4.464962209573574, + "grad_norm": NaN, + "learning_rate": 4.87183326534284e-05, + "loss": 0.0, + "step": 47851 + }, + { + "epoch": 4.465055519268452, + "grad_norm": NaN, + "learning_rate": 4.8712753549243976e-05, + "loss": 0.0, + "step": 47852 + }, + { + "epoch": 4.465148828963329, + "grad_norm": NaN, + "learning_rate": 4.8707174702603024e-05, + "loss": 0.0, + "step": 47853 + }, + { + "epoch": 4.465242138658207, + "grad_norm": NaN, + "learning_rate": 4.870159611351973e-05, + "loss": 0.0, + "step": 47854 + }, + { + "epoch": 4.465335448353084, + "grad_norm": NaN, + "learning_rate": 4.869601778200827e-05, + "loss": 0.0, + "step": 47855 + }, + { + "epoch": 4.4654287580479615, + "grad_norm": NaN, + "learning_rate": 4.869043970808281e-05, + "loss": 0.0, + "step": 47856 + }, + { + "epoch": 4.465522067742839, + "grad_norm": NaN, + "learning_rate": 4.868486189175757e-05, + "loss": 0.0, + "step": 47857 + }, + { + "epoch": 4.465615377437715, + "grad_norm": NaN, + "learning_rate": 4.86792843330467e-05, + "loss": 0.0, + "step": 47858 + }, + { + "epoch": 4.465708687132593, + "grad_norm": NaN, + "learning_rate": 4.867370703196441e-05, + "loss": 0.0, + "step": 47859 + }, + { + "epoch": 4.46580199682747, + "grad_norm": NaN, + "learning_rate": 4.866812998852486e-05, + "loss": 0.0, + "step": 47860 + }, + { + "epoch": 4.465895306522348, + "grad_norm": NaN, + "learning_rate": 4.8662553202742244e-05, + "loss": 0.0, + "step": 47861 + }, + { + "epoch": 4.465988616217225, + "grad_norm": NaN, + "learning_rate": 4.865697667463073e-05, + "loss": 0.0, + "step": 47862 + }, + { + "epoch": 4.4660819259121025, + "grad_norm": NaN, + "learning_rate": 4.8651400404204515e-05, + "loss": 0.0, + "step": 47863 + }, + { + "epoch": 4.46617523560698, + "grad_norm": NaN, + "learning_rate": 4.864582439147775e-05, + "loss": 0.0, + "step": 47864 + }, + { + "epoch": 4.4662685453018565, + "grad_norm": NaN, + "learning_rate": 4.864024863646465e-05, + "loss": 0.0, + "step": 47865 + }, + { + "epoch": 4.466361854996734, + "grad_norm": NaN, + "learning_rate": 4.863467313917936e-05, + "loss": 0.0, + "step": 47866 + }, + { + "epoch": 4.466455164691611, + "grad_norm": NaN, + "learning_rate": 4.8629097899636075e-05, + "loss": 0.0, + "step": 47867 + }, + { + "epoch": 4.466548474386489, + "grad_norm": NaN, + "learning_rate": 4.862352291784896e-05, + "loss": 0.0, + "step": 47868 + }, + { + "epoch": 4.466641784081366, + "grad_norm": NaN, + "learning_rate": 4.86179481938322e-05, + "loss": 0.0, + "step": 47869 + }, + { + "epoch": 4.466735093776244, + "grad_norm": NaN, + "learning_rate": 4.861237372759997e-05, + "loss": 0.0, + "step": 47870 + }, + { + "epoch": 4.466828403471121, + "grad_norm": NaN, + "learning_rate": 4.860679951916644e-05, + "loss": 0.0, + "step": 47871 + }, + { + "epoch": 4.466921713165998, + "grad_norm": NaN, + "learning_rate": 4.860122556854579e-05, + "loss": 0.0, + "step": 47872 + }, + { + "epoch": 4.467015022860875, + "grad_norm": NaN, + "learning_rate": 4.859565187575217e-05, + "loss": 0.0, + "step": 47873 + }, + { + "epoch": 4.467108332555752, + "grad_norm": NaN, + "learning_rate": 4.859007844079978e-05, + "loss": 0.0, + "step": 47874 + }, + { + "epoch": 4.46720164225063, + "grad_norm": NaN, + "learning_rate": 4.858450526370278e-05, + "loss": 0.0, + "step": 47875 + }, + { + "epoch": 4.467294951945507, + "grad_norm": NaN, + "learning_rate": 4.8578932344475334e-05, + "loss": 0.0, + "step": 47876 + }, + { + "epoch": 4.467388261640385, + "grad_norm": NaN, + "learning_rate": 4.8573359683131624e-05, + "loss": 0.0, + "step": 47877 + }, + { + "epoch": 4.467481571335262, + "grad_norm": NaN, + "learning_rate": 4.856778727968582e-05, + "loss": 0.0, + "step": 47878 + }, + { + "epoch": 4.4675748810301394, + "grad_norm": NaN, + "learning_rate": 4.8562215134152084e-05, + "loss": 0.0, + "step": 47879 + }, + { + "epoch": 4.467668190725016, + "grad_norm": NaN, + "learning_rate": 4.855664324654458e-05, + "loss": 0.0, + "step": 47880 + }, + { + "epoch": 4.467761500419893, + "grad_norm": NaN, + "learning_rate": 4.855107161687748e-05, + "loss": 0.0, + "step": 47881 + }, + { + "epoch": 4.467854810114771, + "grad_norm": NaN, + "learning_rate": 4.854550024516497e-05, + "loss": 0.0, + "step": 47882 + }, + { + "epoch": 4.467948119809648, + "grad_norm": NaN, + "learning_rate": 4.853992913142119e-05, + "loss": 0.0, + "step": 47883 + }, + { + "epoch": 4.468041429504526, + "grad_norm": NaN, + "learning_rate": 4.853435827566031e-05, + "loss": 0.0, + "step": 47884 + }, + { + "epoch": 4.468134739199403, + "grad_norm": NaN, + "learning_rate": 4.852878767789651e-05, + "loss": 0.0, + "step": 47885 + }, + { + "epoch": 4.4682280488942805, + "grad_norm": NaN, + "learning_rate": 4.852321733814395e-05, + "loss": 0.0, + "step": 47886 + }, + { + "epoch": 4.468321358589157, + "grad_norm": NaN, + "learning_rate": 4.8517647256416775e-05, + "loss": 0.0, + "step": 47887 + }, + { + "epoch": 4.468414668284034, + "grad_norm": NaN, + "learning_rate": 4.851207743272917e-05, + "loss": 0.0, + "step": 47888 + }, + { + "epoch": 4.468507977978912, + "grad_norm": NaN, + "learning_rate": 4.850650786709528e-05, + "loss": 0.0, + "step": 47889 + }, + { + "epoch": 4.468601287673789, + "grad_norm": NaN, + "learning_rate": 4.8500938559529296e-05, + "loss": 0.0, + "step": 47890 + }, + { + "epoch": 4.468694597368667, + "grad_norm": NaN, + "learning_rate": 4.8495369510045346e-05, + "loss": 0.0, + "step": 47891 + }, + { + "epoch": 4.468787907063544, + "grad_norm": NaN, + "learning_rate": 4.848980071865761e-05, + "loss": 0.0, + "step": 47892 + }, + { + "epoch": 4.4688812167584215, + "grad_norm": NaN, + "learning_rate": 4.8484232185380235e-05, + "loss": 0.0, + "step": 47893 + }, + { + "epoch": 4.468974526453298, + "grad_norm": NaN, + "learning_rate": 4.8478663910227385e-05, + "loss": 0.0, + "step": 47894 + }, + { + "epoch": 4.4690678361481755, + "grad_norm": NaN, + "learning_rate": 4.847309589321321e-05, + "loss": 0.0, + "step": 47895 + }, + { + "epoch": 4.469161145843053, + "grad_norm": NaN, + "learning_rate": 4.846752813435189e-05, + "loss": 0.0, + "step": 47896 + }, + { + "epoch": 4.46925445553793, + "grad_norm": NaN, + "learning_rate": 4.8461960633657564e-05, + "loss": 0.0, + "step": 47897 + }, + { + "epoch": 4.469347765232808, + "grad_norm": NaN, + "learning_rate": 4.8456393391144406e-05, + "loss": 0.0, + "step": 47898 + }, + { + "epoch": 4.469441074927685, + "grad_norm": NaN, + "learning_rate": 4.845082640682656e-05, + "loss": 0.0, + "step": 47899 + }, + { + "epoch": 4.469534384622563, + "grad_norm": NaN, + "learning_rate": 4.844525968071817e-05, + "loss": 0.0, + "step": 47900 + }, + { + "epoch": 4.46962769431744, + "grad_norm": NaN, + "learning_rate": 4.8439693212833403e-05, + "loss": 0.0, + "step": 47901 + }, + { + "epoch": 4.4697210040123165, + "grad_norm": NaN, + "learning_rate": 4.843412700318642e-05, + "loss": 0.0, + "step": 47902 + }, + { + "epoch": 4.469814313707194, + "grad_norm": NaN, + "learning_rate": 4.8428561051791354e-05, + "loss": 0.0, + "step": 47903 + }, + { + "epoch": 4.469907623402071, + "grad_norm": NaN, + "learning_rate": 4.842299535866238e-05, + "loss": 0.0, + "step": 47904 + }, + { + "epoch": 4.470000933096949, + "grad_norm": NaN, + "learning_rate": 4.841742992381364e-05, + "loss": 0.0, + "step": 47905 + }, + { + "epoch": 4.470094242791826, + "grad_norm": NaN, + "learning_rate": 4.841186474725928e-05, + "loss": 0.0, + "step": 47906 + }, + { + "epoch": 4.470187552486704, + "grad_norm": NaN, + "learning_rate": 4.840629982901345e-05, + "loss": 0.0, + "step": 47907 + }, + { + "epoch": 4.470280862181581, + "grad_norm": NaN, + "learning_rate": 4.8400735169090315e-05, + "loss": 0.0, + "step": 47908 + }, + { + "epoch": 4.470374171876458, + "grad_norm": NaN, + "learning_rate": 4.839517076750401e-05, + "loss": 0.0, + "step": 47909 + }, + { + "epoch": 4.470467481571335, + "grad_norm": NaN, + "learning_rate": 4.838960662426869e-05, + "loss": 0.0, + "step": 47910 + }, + { + "epoch": 4.470560791266212, + "grad_norm": NaN, + "learning_rate": 4.838404273939849e-05, + "loss": 0.0, + "step": 47911 + }, + { + "epoch": 4.47065410096109, + "grad_norm": NaN, + "learning_rate": 4.837847911290758e-05, + "loss": 0.0, + "step": 47912 + }, + { + "epoch": 4.470747410655967, + "grad_norm": NaN, + "learning_rate": 4.837291574481009e-05, + "loss": 0.0, + "step": 47913 + }, + { + "epoch": 4.470840720350845, + "grad_norm": NaN, + "learning_rate": 4.836735263512018e-05, + "loss": 0.0, + "step": 47914 + }, + { + "epoch": 4.470934030045722, + "grad_norm": NaN, + "learning_rate": 4.836178978385199e-05, + "loss": 0.0, + "step": 47915 + }, + { + "epoch": 4.4710273397405995, + "grad_norm": NaN, + "learning_rate": 4.8356227191019655e-05, + "loss": 0.0, + "step": 47916 + }, + { + "epoch": 4.471120649435476, + "grad_norm": NaN, + "learning_rate": 4.835066485663731e-05, + "loss": 0.0, + "step": 47917 + }, + { + "epoch": 4.4712139591303535, + "grad_norm": NaN, + "learning_rate": 4.8345102780719135e-05, + "loss": 0.0, + "step": 47918 + }, + { + "epoch": 4.471307268825231, + "grad_norm": NaN, + "learning_rate": 4.833954096327923e-05, + "loss": 0.0, + "step": 47919 + }, + { + "epoch": 4.471400578520108, + "grad_norm": NaN, + "learning_rate": 4.833397940433177e-05, + "loss": 0.0, + "step": 47920 + }, + { + "epoch": 4.471493888214986, + "grad_norm": NaN, + "learning_rate": 4.83284181038909e-05, + "loss": 0.0, + "step": 47921 + }, + { + "epoch": 4.471587197909863, + "grad_norm": NaN, + "learning_rate": 4.8322857061970726e-05, + "loss": 0.0, + "step": 47922 + }, + { + "epoch": 4.47168050760474, + "grad_norm": NaN, + "learning_rate": 4.831729627858542e-05, + "loss": 0.0, + "step": 47923 + }, + { + "epoch": 4.471773817299617, + "grad_norm": NaN, + "learning_rate": 4.831173575374909e-05, + "loss": 0.0, + "step": 47924 + }, + { + "epoch": 4.4718671269944945, + "grad_norm": NaN, + "learning_rate": 4.830617548747591e-05, + "loss": 0.0, + "step": 47925 + }, + { + "epoch": 4.471960436689372, + "grad_norm": NaN, + "learning_rate": 4.830061547978e-05, + "loss": 0.0, + "step": 47926 + }, + { + "epoch": 4.472053746384249, + "grad_norm": NaN, + "learning_rate": 4.82950557306755e-05, + "loss": 0.0, + "step": 47927 + }, + { + "epoch": 4.472147056079127, + "grad_norm": NaN, + "learning_rate": 4.828949624017653e-05, + "loss": 0.0, + "step": 47928 + }, + { + "epoch": 4.472240365774004, + "grad_norm": NaN, + "learning_rate": 4.8283937008297265e-05, + "loss": 0.0, + "step": 47929 + }, + { + "epoch": 4.472333675468882, + "grad_norm": NaN, + "learning_rate": 4.8278378035051804e-05, + "loss": 0.0, + "step": 47930 + }, + { + "epoch": 4.472426985163758, + "grad_norm": NaN, + "learning_rate": 4.8272819320454284e-05, + "loss": 0.0, + "step": 47931 + }, + { + "epoch": 4.472520294858636, + "grad_norm": NaN, + "learning_rate": 4.8267260864518874e-05, + "loss": 0.0, + "step": 47932 + }, + { + "epoch": 4.472613604553513, + "grad_norm": NaN, + "learning_rate": 4.8261702667259675e-05, + "loss": 0.0, + "step": 47933 + }, + { + "epoch": 4.47270691424839, + "grad_norm": NaN, + "learning_rate": 4.825614472869083e-05, + "loss": 0.0, + "step": 47934 + }, + { + "epoch": 4.472800223943268, + "grad_norm": NaN, + "learning_rate": 4.825058704882646e-05, + "loss": 0.0, + "step": 47935 + }, + { + "epoch": 4.472893533638145, + "grad_norm": NaN, + "learning_rate": 4.824502962768072e-05, + "loss": 0.0, + "step": 47936 + }, + { + "epoch": 4.472986843333023, + "grad_norm": NaN, + "learning_rate": 4.823947246526773e-05, + "loss": 0.0, + "step": 47937 + }, + { + "epoch": 4.473080153027899, + "grad_norm": NaN, + "learning_rate": 4.8233915561601614e-05, + "loss": 0.0, + "step": 47938 + }, + { + "epoch": 4.473173462722777, + "grad_norm": NaN, + "learning_rate": 4.822835891669649e-05, + "loss": 0.0, + "step": 47939 + }, + { + "epoch": 4.473266772417654, + "grad_norm": NaN, + "learning_rate": 4.822280253056653e-05, + "loss": 0.0, + "step": 47940 + }, + { + "epoch": 4.473360082112531, + "grad_norm": NaN, + "learning_rate": 4.8217246403225815e-05, + "loss": 0.0, + "step": 47941 + }, + { + "epoch": 4.473453391807409, + "grad_norm": NaN, + "learning_rate": 4.82116905346885e-05, + "loss": 0.0, + "step": 47942 + }, + { + "epoch": 4.473546701502286, + "grad_norm": NaN, + "learning_rate": 4.8206134924968705e-05, + "loss": 0.0, + "step": 47943 + }, + { + "epoch": 4.473640011197164, + "grad_norm": NaN, + "learning_rate": 4.820057957408055e-05, + "loss": 0.0, + "step": 47944 + }, + { + "epoch": 4.473733320892041, + "grad_norm": NaN, + "learning_rate": 4.8195024482038164e-05, + "loss": 0.0, + "step": 47945 + }, + { + "epoch": 4.473826630586918, + "grad_norm": NaN, + "learning_rate": 4.818946964885567e-05, + "loss": 0.0, + "step": 47946 + }, + { + "epoch": 4.473919940281795, + "grad_norm": NaN, + "learning_rate": 4.8183915074547206e-05, + "loss": 0.0, + "step": 47947 + }, + { + "epoch": 4.4740132499766725, + "grad_norm": NaN, + "learning_rate": 4.8178360759126875e-05, + "loss": 0.0, + "step": 47948 + }, + { + "epoch": 4.47410655967155, + "grad_norm": NaN, + "learning_rate": 4.817280670260883e-05, + "loss": 0.0, + "step": 47949 + }, + { + "epoch": 4.474199869366427, + "grad_norm": NaN, + "learning_rate": 4.816725290500715e-05, + "loss": 0.0, + "step": 47950 + }, + { + "epoch": 4.474293179061305, + "grad_norm": NaN, + "learning_rate": 4.8161699366336004e-05, + "loss": 0.0, + "step": 47951 + }, + { + "epoch": 4.474386488756182, + "grad_norm": NaN, + "learning_rate": 4.815614608660947e-05, + "loss": 0.0, + "step": 47952 + }, + { + "epoch": 4.474479798451059, + "grad_norm": NaN, + "learning_rate": 4.815059306584171e-05, + "loss": 0.0, + "step": 47953 + }, + { + "epoch": 4.474573108145936, + "grad_norm": NaN, + "learning_rate": 4.81450403040468e-05, + "loss": 0.0, + "step": 47954 + }, + { + "epoch": 4.4746664178408135, + "grad_norm": NaN, + "learning_rate": 4.8139487801238904e-05, + "loss": 0.0, + "step": 47955 + }, + { + "epoch": 4.474759727535691, + "grad_norm": NaN, + "learning_rate": 4.8133935557432104e-05, + "loss": 0.0, + "step": 47956 + }, + { + "epoch": 4.474853037230568, + "grad_norm": NaN, + "learning_rate": 4.8128383572640525e-05, + "loss": 0.0, + "step": 47957 + }, + { + "epoch": 4.474946346925446, + "grad_norm": NaN, + "learning_rate": 4.812283184687828e-05, + "loss": 0.0, + "step": 47958 + }, + { + "epoch": 4.475039656620323, + "grad_norm": NaN, + "learning_rate": 4.8117280380159526e-05, + "loss": 0.0, + "step": 47959 + }, + { + "epoch": 4.4751329663152, + "grad_norm": NaN, + "learning_rate": 4.811172917249833e-05, + "loss": 0.0, + "step": 47960 + }, + { + "epoch": 4.475226276010077, + "grad_norm": NaN, + "learning_rate": 4.810617822390883e-05, + "loss": 0.0, + "step": 47961 + }, + { + "epoch": 4.475319585704955, + "grad_norm": NaN, + "learning_rate": 4.810062753440513e-05, + "loss": 0.0, + "step": 47962 + }, + { + "epoch": 4.475412895399832, + "grad_norm": NaN, + "learning_rate": 4.8095077104001345e-05, + "loss": 0.0, + "step": 47963 + }, + { + "epoch": 4.475506205094709, + "grad_norm": NaN, + "learning_rate": 4.80895269327116e-05, + "loss": 0.0, + "step": 47964 + }, + { + "epoch": 4.475599514789587, + "grad_norm": NaN, + "learning_rate": 4.8083977020550015e-05, + "loss": 0.0, + "step": 47965 + }, + { + "epoch": 4.475692824484464, + "grad_norm": NaN, + "learning_rate": 4.807842736753066e-05, + "loss": 0.0, + "step": 47966 + }, + { + "epoch": 4.475786134179341, + "grad_norm": NaN, + "learning_rate": 4.8072877973667695e-05, + "loss": 0.0, + "step": 47967 + }, + { + "epoch": 4.475879443874218, + "grad_norm": NaN, + "learning_rate": 4.8067328838975194e-05, + "loss": 0.0, + "step": 47968 + }, + { + "epoch": 4.475972753569096, + "grad_norm": NaN, + "learning_rate": 4.8061779963467286e-05, + "loss": 0.0, + "step": 47969 + }, + { + "epoch": 4.476066063263973, + "grad_norm": NaN, + "learning_rate": 4.805623134715807e-05, + "loss": 0.0, + "step": 47970 + }, + { + "epoch": 4.4761593729588505, + "grad_norm": NaN, + "learning_rate": 4.805068299006167e-05, + "loss": 0.0, + "step": 47971 + }, + { + "epoch": 4.476252682653728, + "grad_norm": NaN, + "learning_rate": 4.804513489219217e-05, + "loss": 0.0, + "step": 47972 + }, + { + "epoch": 4.476345992348605, + "grad_norm": NaN, + "learning_rate": 4.8039587053563705e-05, + "loss": 0.0, + "step": 47973 + }, + { + "epoch": 4.476439302043483, + "grad_norm": NaN, + "learning_rate": 4.803403947419036e-05, + "loss": 0.0, + "step": 47974 + }, + { + "epoch": 4.476532611738359, + "grad_norm": NaN, + "learning_rate": 4.802849215408624e-05, + "loss": 0.0, + "step": 47975 + }, + { + "epoch": 4.476625921433237, + "grad_norm": NaN, + "learning_rate": 4.8022945093265484e-05, + "loss": 0.0, + "step": 47976 + }, + { + "epoch": 4.476719231128114, + "grad_norm": NaN, + "learning_rate": 4.8017398291742135e-05, + "loss": 0.0, + "step": 47977 + }, + { + "epoch": 4.4768125408229915, + "grad_norm": NaN, + "learning_rate": 4.801185174953037e-05, + "loss": 0.0, + "step": 47978 + }, + { + "epoch": 4.476905850517869, + "grad_norm": NaN, + "learning_rate": 4.800630546664422e-05, + "loss": 0.0, + "step": 47979 + }, + { + "epoch": 4.476999160212746, + "grad_norm": NaN, + "learning_rate": 4.800075944309785e-05, + "loss": 0.0, + "step": 47980 + }, + { + "epoch": 4.477092469907624, + "grad_norm": NaN, + "learning_rate": 4.7995213678905276e-05, + "loss": 0.0, + "step": 47981 + }, + { + "epoch": 4.4771857796025, + "grad_norm": NaN, + "learning_rate": 4.7989668174080746e-05, + "loss": 0.0, + "step": 47982 + }, + { + "epoch": 4.477279089297378, + "grad_norm": NaN, + "learning_rate": 4.7984122928638236e-05, + "loss": 0.0, + "step": 47983 + }, + { + "epoch": 4.477372398992255, + "grad_norm": NaN, + "learning_rate": 4.797857794259183e-05, + "loss": 0.0, + "step": 47984 + }, + { + "epoch": 4.477465708687133, + "grad_norm": NaN, + "learning_rate": 4.7973033215955766e-05, + "loss": 0.0, + "step": 47985 + }, + { + "epoch": 4.47755901838201, + "grad_norm": NaN, + "learning_rate": 4.7967488748744015e-05, + "loss": 0.0, + "step": 47986 + }, + { + "epoch": 4.477652328076887, + "grad_norm": NaN, + "learning_rate": 4.7961944540970724e-05, + "loss": 0.0, + "step": 47987 + }, + { + "epoch": 4.477745637771765, + "grad_norm": NaN, + "learning_rate": 4.7956400592649967e-05, + "loss": 0.0, + "step": 47988 + }, + { + "epoch": 4.477838947466642, + "grad_norm": NaN, + "learning_rate": 4.7950856903795866e-05, + "loss": 0.0, + "step": 47989 + }, + { + "epoch": 4.477932257161519, + "grad_norm": NaN, + "learning_rate": 4.7945313474422495e-05, + "loss": 0.0, + "step": 47990 + }, + { + "epoch": 4.478025566856396, + "grad_norm": NaN, + "learning_rate": 4.793977030454397e-05, + "loss": 0.0, + "step": 47991 + }, + { + "epoch": 4.478118876551274, + "grad_norm": NaN, + "learning_rate": 4.793422739417438e-05, + "loss": 0.0, + "step": 47992 + }, + { + "epoch": 4.478212186246151, + "grad_norm": NaN, + "learning_rate": 4.792868474332782e-05, + "loss": 0.0, + "step": 47993 + }, + { + "epoch": 4.4783054959410284, + "grad_norm": NaN, + "learning_rate": 4.792314235201837e-05, + "loss": 0.0, + "step": 47994 + }, + { + "epoch": 4.478398805635906, + "grad_norm": NaN, + "learning_rate": 4.791760022026014e-05, + "loss": 0.0, + "step": 47995 + }, + { + "epoch": 4.478492115330782, + "grad_norm": NaN, + "learning_rate": 4.791205834806721e-05, + "loss": 0.0, + "step": 47996 + }, + { + "epoch": 4.47858542502566, + "grad_norm": NaN, + "learning_rate": 4.7906516735453666e-05, + "loss": 0.0, + "step": 47997 + }, + { + "epoch": 4.478678734720537, + "grad_norm": NaN, + "learning_rate": 4.7900975382433606e-05, + "loss": 0.0, + "step": 47998 + }, + { + "epoch": 4.478772044415415, + "grad_norm": NaN, + "learning_rate": 4.789543428902113e-05, + "loss": 0.0, + "step": 47999 + }, + { + "epoch": 4.478865354110292, + "grad_norm": NaN, + "learning_rate": 4.788989345523033e-05, + "loss": 0.0, + "step": 48000 + }, + { + "epoch": 4.4789586638051695, + "grad_norm": NaN, + "learning_rate": 4.7884352881075256e-05, + "loss": 0.0, + "step": 48001 + }, + { + "epoch": 4.479051973500047, + "grad_norm": NaN, + "learning_rate": 4.787881256657005e-05, + "loss": 0.0, + "step": 48002 + }, + { + "epoch": 4.479145283194924, + "grad_norm": NaN, + "learning_rate": 4.787327251172875e-05, + "loss": 0.0, + "step": 48003 + }, + { + "epoch": 4.479238592889801, + "grad_norm": NaN, + "learning_rate": 4.786773271656549e-05, + "loss": 0.0, + "step": 48004 + }, + { + "epoch": 4.479331902584678, + "grad_norm": NaN, + "learning_rate": 4.786219318109432e-05, + "loss": 0.0, + "step": 48005 + }, + { + "epoch": 4.479425212279556, + "grad_norm": NaN, + "learning_rate": 4.785665390532933e-05, + "loss": 0.0, + "step": 48006 + }, + { + "epoch": 4.479518521974433, + "grad_norm": NaN, + "learning_rate": 4.785111488928463e-05, + "loss": 0.0, + "step": 48007 + }, + { + "epoch": 4.4796118316693105, + "grad_norm": NaN, + "learning_rate": 4.784557613297427e-05, + "loss": 0.0, + "step": 48008 + }, + { + "epoch": 4.479705141364188, + "grad_norm": NaN, + "learning_rate": 4.784003763641236e-05, + "loss": 0.0, + "step": 48009 + }, + { + "epoch": 4.479798451059065, + "grad_norm": NaN, + "learning_rate": 4.783449939961296e-05, + "loss": 0.0, + "step": 48010 + }, + { + "epoch": 4.479891760753942, + "grad_norm": NaN, + "learning_rate": 4.7828961422590164e-05, + "loss": 0.0, + "step": 48011 + }, + { + "epoch": 4.479985070448819, + "grad_norm": NaN, + "learning_rate": 4.782342370535808e-05, + "loss": 0.0, + "step": 48012 + }, + { + "epoch": 4.480078380143697, + "grad_norm": NaN, + "learning_rate": 4.781788624793073e-05, + "loss": 0.0, + "step": 48013 + }, + { + "epoch": 4.480171689838574, + "grad_norm": NaN, + "learning_rate": 4.781234905032225e-05, + "loss": 0.0, + "step": 48014 + }, + { + "epoch": 4.480264999533452, + "grad_norm": NaN, + "learning_rate": 4.7806812112546674e-05, + "loss": 0.0, + "step": 48015 + }, + { + "epoch": 4.480358309228329, + "grad_norm": NaN, + "learning_rate": 4.780127543461813e-05, + "loss": 0.0, + "step": 48016 + }, + { + "epoch": 4.480451618923206, + "grad_norm": NaN, + "learning_rate": 4.779573901655065e-05, + "loss": 0.0, + "step": 48017 + }, + { + "epoch": 4.480544928618084, + "grad_norm": NaN, + "learning_rate": 4.779020285835834e-05, + "loss": 0.0, + "step": 48018 + }, + { + "epoch": 4.48063823831296, + "grad_norm": NaN, + "learning_rate": 4.778466696005521e-05, + "loss": 0.0, + "step": 48019 + }, + { + "epoch": 4.480731548007838, + "grad_norm": NaN, + "learning_rate": 4.777913132165549e-05, + "loss": 0.0, + "step": 48020 + }, + { + "epoch": 4.480824857702715, + "grad_norm": NaN, + "learning_rate": 4.7773595943173124e-05, + "loss": 0.0, + "step": 48021 + }, + { + "epoch": 4.480918167397593, + "grad_norm": NaN, + "learning_rate": 4.7768060824622154e-05, + "loss": 0.0, + "step": 48022 + }, + { + "epoch": 4.48101147709247, + "grad_norm": NaN, + "learning_rate": 4.7762525966016835e-05, + "loss": 0.0, + "step": 48023 + }, + { + "epoch": 4.4811047867873475, + "grad_norm": NaN, + "learning_rate": 4.7756991367371083e-05, + "loss": 0.0, + "step": 48024 + }, + { + "epoch": 4.481198096482225, + "grad_norm": NaN, + "learning_rate": 4.775145702869896e-05, + "loss": 0.0, + "step": 48025 + }, + { + "epoch": 4.481291406177101, + "grad_norm": NaN, + "learning_rate": 4.7745922950014684e-05, + "loss": 0.0, + "step": 48026 + }, + { + "epoch": 4.481384715871979, + "grad_norm": NaN, + "learning_rate": 4.774038913133219e-05, + "loss": 0.0, + "step": 48027 + }, + { + "epoch": 4.481478025566856, + "grad_norm": NaN, + "learning_rate": 4.7734855572665545e-05, + "loss": 0.0, + "step": 48028 + }, + { + "epoch": 4.481571335261734, + "grad_norm": NaN, + "learning_rate": 4.772932227402897e-05, + "loss": 0.0, + "step": 48029 + }, + { + "epoch": 4.481664644956611, + "grad_norm": NaN, + "learning_rate": 4.7723789235436374e-05, + "loss": 0.0, + "step": 48030 + }, + { + "epoch": 4.4817579546514885, + "grad_norm": NaN, + "learning_rate": 4.7718256456901904e-05, + "loss": 0.0, + "step": 48031 + }, + { + "epoch": 4.481851264346366, + "grad_norm": NaN, + "learning_rate": 4.771272393843961e-05, + "loss": 0.0, + "step": 48032 + }, + { + "epoch": 4.481944574041243, + "grad_norm": NaN, + "learning_rate": 4.770719168006354e-05, + "loss": 0.0, + "step": 48033 + }, + { + "epoch": 4.48203788373612, + "grad_norm": NaN, + "learning_rate": 4.770165968178779e-05, + "loss": 0.0, + "step": 48034 + }, + { + "epoch": 4.482131193430997, + "grad_norm": NaN, + "learning_rate": 4.769612794362642e-05, + "loss": 0.0, + "step": 48035 + }, + { + "epoch": 4.482224503125875, + "grad_norm": NaN, + "learning_rate": 4.769059646559349e-05, + "loss": 0.0, + "step": 48036 + }, + { + "epoch": 4.482317812820752, + "grad_norm": NaN, + "learning_rate": 4.768506524770306e-05, + "loss": 0.0, + "step": 48037 + }, + { + "epoch": 4.48241112251563, + "grad_norm": NaN, + "learning_rate": 4.7679534289969217e-05, + "loss": 0.0, + "step": 48038 + }, + { + "epoch": 4.482504432210507, + "grad_norm": NaN, + "learning_rate": 4.7674003592406007e-05, + "loss": 0.0, + "step": 48039 + }, + { + "epoch": 4.4825977419053835, + "grad_norm": NaN, + "learning_rate": 4.7668473155027486e-05, + "loss": 0.0, + "step": 48040 + }, + { + "epoch": 4.482691051600261, + "grad_norm": NaN, + "learning_rate": 4.7662942977847715e-05, + "loss": 0.0, + "step": 48041 + }, + { + "epoch": 4.482784361295138, + "grad_norm": NaN, + "learning_rate": 4.765741306088079e-05, + "loss": 0.0, + "step": 48042 + }, + { + "epoch": 4.482877670990016, + "grad_norm": NaN, + "learning_rate": 4.765188340414072e-05, + "loss": 0.0, + "step": 48043 + }, + { + "epoch": 4.482970980684893, + "grad_norm": NaN, + "learning_rate": 4.764635400764162e-05, + "loss": 0.0, + "step": 48044 + }, + { + "epoch": 4.483064290379771, + "grad_norm": NaN, + "learning_rate": 4.7640824871397496e-05, + "loss": 0.0, + "step": 48045 + }, + { + "epoch": 4.483157600074648, + "grad_norm": NaN, + "learning_rate": 4.763529599542245e-05, + "loss": 0.0, + "step": 48046 + }, + { + "epoch": 4.4832509097695254, + "grad_norm": NaN, + "learning_rate": 4.7629767379730514e-05, + "loss": 0.0, + "step": 48047 + }, + { + "epoch": 4.483344219464402, + "grad_norm": NaN, + "learning_rate": 4.762423902433576e-05, + "loss": 0.0, + "step": 48048 + }, + { + "epoch": 4.483437529159279, + "grad_norm": NaN, + "learning_rate": 4.761871092925225e-05, + "loss": 0.0, + "step": 48049 + }, + { + "epoch": 4.483530838854157, + "grad_norm": NaN, + "learning_rate": 4.761318309449402e-05, + "loss": 0.0, + "step": 48050 + }, + { + "epoch": 4.483624148549034, + "grad_norm": NaN, + "learning_rate": 4.760765552007513e-05, + "loss": 0.0, + "step": 48051 + }, + { + "epoch": 4.483717458243912, + "grad_norm": NaN, + "learning_rate": 4.7602128206009654e-05, + "loss": 0.0, + "step": 48052 + }, + { + "epoch": 4.483810767938789, + "grad_norm": NaN, + "learning_rate": 4.7596601152311635e-05, + "loss": 0.0, + "step": 48053 + }, + { + "epoch": 4.4839040776336665, + "grad_norm": NaN, + "learning_rate": 4.7591074358995116e-05, + "loss": 0.0, + "step": 48054 + }, + { + "epoch": 4.483997387328543, + "grad_norm": NaN, + "learning_rate": 4.758554782607417e-05, + "loss": 0.0, + "step": 48055 + }, + { + "epoch": 4.4840906970234204, + "grad_norm": NaN, + "learning_rate": 4.758002155356282e-05, + "loss": 0.0, + "step": 48056 + }, + { + "epoch": 4.484184006718298, + "grad_norm": NaN, + "learning_rate": 4.75744955414751e-05, + "loss": 0.0, + "step": 48057 + }, + { + "epoch": 4.484277316413175, + "grad_norm": NaN, + "learning_rate": 4.75689697898252e-05, + "loss": 0.0, + "step": 48058 + }, + { + "epoch": 4.484370626108053, + "grad_norm": NaN, + "learning_rate": 4.7563444298627015e-05, + "loss": 0.0, + "step": 48059 + }, + { + "epoch": 4.48446393580293, + "grad_norm": NaN, + "learning_rate": 4.755791906789459e-05, + "loss": 0.0, + "step": 48060 + }, + { + "epoch": 4.4845572454978075, + "grad_norm": NaN, + "learning_rate": 4.7552394097642136e-05, + "loss": 0.0, + "step": 48061 + }, + { + "epoch": 4.484650555192685, + "grad_norm": NaN, + "learning_rate": 4.7546869387883545e-05, + "loss": 0.0, + "step": 48062 + }, + { + "epoch": 4.4847438648875615, + "grad_norm": NaN, + "learning_rate": 4.754134493863287e-05, + "loss": 0.0, + "step": 48063 + }, + { + "epoch": 4.484837174582439, + "grad_norm": NaN, + "learning_rate": 4.75358207499043e-05, + "loss": 0.0, + "step": 48064 + }, + { + "epoch": 4.484930484277316, + "grad_norm": NaN, + "learning_rate": 4.7530296821711734e-05, + "loss": 0.0, + "step": 48065 + }, + { + "epoch": 4.485023793972194, + "grad_norm": NaN, + "learning_rate": 4.752477315406922e-05, + "loss": 0.0, + "step": 48066 + }, + { + "epoch": 4.485117103667071, + "grad_norm": NaN, + "learning_rate": 4.7519249746990955e-05, + "loss": 0.0, + "step": 48067 + }, + { + "epoch": 4.485210413361949, + "grad_norm": NaN, + "learning_rate": 4.751372660049082e-05, + "loss": 0.0, + "step": 48068 + }, + { + "epoch": 4.485303723056826, + "grad_norm": NaN, + "learning_rate": 4.750820371458288e-05, + "loss": 0.0, + "step": 48069 + }, + { + "epoch": 4.4853970327517025, + "grad_norm": NaN, + "learning_rate": 4.750268108928133e-05, + "loss": 0.0, + "step": 48070 + }, + { + "epoch": 4.48549034244658, + "grad_norm": NaN, + "learning_rate": 4.7497158724600035e-05, + "loss": 0.0, + "step": 48071 + }, + { + "epoch": 4.485583652141457, + "grad_norm": NaN, + "learning_rate": 4.749163662055306e-05, + "loss": 0.0, + "step": 48072 + }, + { + "epoch": 4.485676961836335, + "grad_norm": NaN, + "learning_rate": 4.748611477715458e-05, + "loss": 0.0, + "step": 48073 + }, + { + "epoch": 4.485770271531212, + "grad_norm": NaN, + "learning_rate": 4.74805931944185e-05, + "loss": 0.0, + "step": 48074 + }, + { + "epoch": 4.48586358122609, + "grad_norm": NaN, + "learning_rate": 4.747507187235884e-05, + "loss": 0.0, + "step": 48075 + }, + { + "epoch": 4.485956890920967, + "grad_norm": NaN, + "learning_rate": 4.746955081098981e-05, + "loss": 0.0, + "step": 48076 + }, + { + "epoch": 4.486050200615844, + "grad_norm": NaN, + "learning_rate": 4.74640300103253e-05, + "loss": 0.0, + "step": 48077 + }, + { + "epoch": 4.486143510310721, + "grad_norm": NaN, + "learning_rate": 4.745850947037937e-05, + "loss": 0.0, + "step": 48078 + }, + { + "epoch": 4.486236820005598, + "grad_norm": NaN, + "learning_rate": 4.745298919116609e-05, + "loss": 0.0, + "step": 48079 + }, + { + "epoch": 4.486330129700476, + "grad_norm": NaN, + "learning_rate": 4.744746917269947e-05, + "loss": 0.0, + "step": 48080 + }, + { + "epoch": 4.486423439395353, + "grad_norm": NaN, + "learning_rate": 4.7441949414993565e-05, + "loss": 0.0, + "step": 48081 + }, + { + "epoch": 4.486516749090231, + "grad_norm": NaN, + "learning_rate": 4.7436429918062404e-05, + "loss": 0.0, + "step": 48082 + }, + { + "epoch": 4.486610058785108, + "grad_norm": NaN, + "learning_rate": 4.743091068192002e-05, + "loss": 0.0, + "step": 48083 + }, + { + "epoch": 4.486703368479985, + "grad_norm": NaN, + "learning_rate": 4.742539170658044e-05, + "loss": 0.0, + "step": 48084 + }, + { + "epoch": 4.486796678174862, + "grad_norm": NaN, + "learning_rate": 4.741987299205772e-05, + "loss": 0.0, + "step": 48085 + }, + { + "epoch": 4.4868899878697395, + "grad_norm": NaN, + "learning_rate": 4.741435453836586e-05, + "loss": 0.0, + "step": 48086 + }, + { + "epoch": 4.486983297564617, + "grad_norm": NaN, + "learning_rate": 4.740883634551891e-05, + "loss": 0.0, + "step": 48087 + }, + { + "epoch": 4.487076607259494, + "grad_norm": NaN, + "learning_rate": 4.74033184135309e-05, + "loss": 0.0, + "step": 48088 + }, + { + "epoch": 4.487169916954372, + "grad_norm": NaN, + "learning_rate": 4.7397800742415876e-05, + "loss": 0.0, + "step": 48089 + }, + { + "epoch": 4.487263226649249, + "grad_norm": NaN, + "learning_rate": 4.739228333218783e-05, + "loss": 0.0, + "step": 48090 + }, + { + "epoch": 4.487356536344127, + "grad_norm": NaN, + "learning_rate": 4.738676618286082e-05, + "loss": 0.0, + "step": 48091 + }, + { + "epoch": 4.487449846039003, + "grad_norm": NaN, + "learning_rate": 4.738124929444888e-05, + "loss": 0.0, + "step": 48092 + }, + { + "epoch": 4.4875431557338805, + "grad_norm": NaN, + "learning_rate": 4.7375732666966e-05, + "loss": 0.0, + "step": 48093 + }, + { + "epoch": 4.487636465428758, + "grad_norm": NaN, + "learning_rate": 4.737021630042626e-05, + "loss": 0.0, + "step": 48094 + }, + { + "epoch": 4.487729775123635, + "grad_norm": NaN, + "learning_rate": 4.736470019484359e-05, + "loss": 0.0, + "step": 48095 + }, + { + "epoch": 4.487823084818513, + "grad_norm": NaN, + "learning_rate": 4.73591843502322e-05, + "loss": 0.0, + "step": 48096 + }, + { + "epoch": 4.48791639451339, + "grad_norm": NaN, + "learning_rate": 4.735366876660592e-05, + "loss": 0.0, + "step": 48097 + }, + { + "epoch": 4.488009704208268, + "grad_norm": NaN, + "learning_rate": 4.734815344397884e-05, + "loss": 0.0, + "step": 48098 + }, + { + "epoch": 4.488103013903144, + "grad_norm": NaN, + "learning_rate": 4.734263838236508e-05, + "loss": 0.0, + "step": 48099 + }, + { + "epoch": 4.488196323598022, + "grad_norm": NaN, + "learning_rate": 4.7337123581778536e-05, + "loss": 0.0, + "step": 48100 + }, + { + "epoch": 4.488289633292899, + "grad_norm": NaN, + "learning_rate": 4.7331609042233216e-05, + "loss": 0.0, + "step": 48101 + }, + { + "epoch": 4.488382942987776, + "grad_norm": NaN, + "learning_rate": 4.732609476374331e-05, + "loss": 0.0, + "step": 48102 + }, + { + "epoch": 4.488476252682654, + "grad_norm": NaN, + "learning_rate": 4.7320580746322675e-05, + "loss": 0.0, + "step": 48103 + }, + { + "epoch": 4.488569562377531, + "grad_norm": NaN, + "learning_rate": 4.731506698998534e-05, + "loss": 0.0, + "step": 48104 + }, + { + "epoch": 4.488662872072409, + "grad_norm": NaN, + "learning_rate": 4.7309553494745476e-05, + "loss": 0.0, + "step": 48105 + }, + { + "epoch": 4.488756181767286, + "grad_norm": NaN, + "learning_rate": 4.730404026061695e-05, + "loss": 0.0, + "step": 48106 + }, + { + "epoch": 4.488849491462163, + "grad_norm": NaN, + "learning_rate": 4.729852728761377e-05, + "loss": 0.0, + "step": 48107 + }, + { + "epoch": 4.48894280115704, + "grad_norm": NaN, + "learning_rate": 4.729301457575013e-05, + "loss": 0.0, + "step": 48108 + }, + { + "epoch": 4.4890361108519174, + "grad_norm": NaN, + "learning_rate": 4.728750212503986e-05, + "loss": 0.0, + "step": 48109 + }, + { + "epoch": 4.489129420546795, + "grad_norm": NaN, + "learning_rate": 4.7281989935497e-05, + "loss": 0.0, + "step": 48110 + }, + { + "epoch": 4.489222730241672, + "grad_norm": NaN, + "learning_rate": 4.727647800713572e-05, + "loss": 0.0, + "step": 48111 + }, + { + "epoch": 4.48931603993655, + "grad_norm": NaN, + "learning_rate": 4.7270966339969876e-05, + "loss": 0.0, + "step": 48112 + }, + { + "epoch": 4.489409349631426, + "grad_norm": NaN, + "learning_rate": 4.726545493401348e-05, + "loss": 0.0, + "step": 48113 + }, + { + "epoch": 4.489502659326304, + "grad_norm": NaN, + "learning_rate": 4.7259943789280705e-05, + "loss": 0.0, + "step": 48114 + }, + { + "epoch": 4.489595969021181, + "grad_norm": NaN, + "learning_rate": 4.72544329057854e-05, + "loss": 0.0, + "step": 48115 + }, + { + "epoch": 4.4896892787160585, + "grad_norm": NaN, + "learning_rate": 4.72489222835416e-05, + "loss": 0.0, + "step": 48116 + }, + { + "epoch": 4.489782588410936, + "grad_norm": NaN, + "learning_rate": 4.724341192256344e-05, + "loss": 0.0, + "step": 48117 + }, + { + "epoch": 4.489875898105813, + "grad_norm": NaN, + "learning_rate": 4.723790182286481e-05, + "loss": 0.0, + "step": 48118 + }, + { + "epoch": 4.489969207800691, + "grad_norm": NaN, + "learning_rate": 4.723239198445969e-05, + "loss": 0.0, + "step": 48119 + }, + { + "epoch": 4.490062517495568, + "grad_norm": NaN, + "learning_rate": 4.722688240736226e-05, + "loss": 0.0, + "step": 48120 + }, + { + "epoch": 4.490155827190445, + "grad_norm": NaN, + "learning_rate": 4.7221373091586374e-05, + "loss": 0.0, + "step": 48121 + }, + { + "epoch": 4.490249136885322, + "grad_norm": NaN, + "learning_rate": 4.721586403714609e-05, + "loss": 0.0, + "step": 48122 + }, + { + "epoch": 4.4903424465801995, + "grad_norm": NaN, + "learning_rate": 4.7210355244055415e-05, + "loss": 0.0, + "step": 48123 + }, + { + "epoch": 4.490435756275077, + "grad_norm": NaN, + "learning_rate": 4.720484671232837e-05, + "loss": 0.0, + "step": 48124 + }, + { + "epoch": 4.490529065969954, + "grad_norm": NaN, + "learning_rate": 4.7199338441978944e-05, + "loss": 0.0, + "step": 48125 + }, + { + "epoch": 4.490622375664832, + "grad_norm": NaN, + "learning_rate": 4.719383043302114e-05, + "loss": 0.0, + "step": 48126 + }, + { + "epoch": 4.490715685359709, + "grad_norm": NaN, + "learning_rate": 4.718832268546898e-05, + "loss": 0.0, + "step": 48127 + }, + { + "epoch": 4.490808995054586, + "grad_norm": NaN, + "learning_rate": 4.718281519933646e-05, + "loss": 0.0, + "step": 48128 + }, + { + "epoch": 4.490902304749463, + "grad_norm": NaN, + "learning_rate": 4.7177307974637586e-05, + "loss": 0.0, + "step": 48129 + }, + { + "epoch": 4.490995614444341, + "grad_norm": NaN, + "learning_rate": 4.7171801011386356e-05, + "loss": 0.0, + "step": 48130 + }, + { + "epoch": 4.491088924139218, + "grad_norm": NaN, + "learning_rate": 4.716629430959676e-05, + "loss": 0.0, + "step": 48131 + }, + { + "epoch": 4.491182233834095, + "grad_norm": NaN, + "learning_rate": 4.7160787869282845e-05, + "loss": 0.0, + "step": 48132 + }, + { + "epoch": 4.491275543528973, + "grad_norm": NaN, + "learning_rate": 4.715528169045852e-05, + "loss": 0.0, + "step": 48133 + }, + { + "epoch": 4.49136885322385, + "grad_norm": NaN, + "learning_rate": 4.7149775773137954e-05, + "loss": 0.0, + "step": 48134 + }, + { + "epoch": 4.491462162918728, + "grad_norm": NaN, + "learning_rate": 4.7144270117334975e-05, + "loss": 0.0, + "step": 48135 + }, + { + "epoch": 4.491555472613604, + "grad_norm": NaN, + "learning_rate": 4.713876472306361e-05, + "loss": 0.0, + "step": 48136 + }, + { + "epoch": 4.491648782308482, + "grad_norm": NaN, + "learning_rate": 4.7133259590338e-05, + "loss": 0.0, + "step": 48137 + }, + { + "epoch": 4.491742092003359, + "grad_norm": NaN, + "learning_rate": 4.712775471917199e-05, + "loss": 0.0, + "step": 48138 + }, + { + "epoch": 4.4918354016982365, + "grad_norm": NaN, + "learning_rate": 4.712225010957958e-05, + "loss": 0.0, + "step": 48139 + }, + { + "epoch": 4.491928711393114, + "grad_norm": NaN, + "learning_rate": 4.7116745761574904e-05, + "loss": 0.0, + "step": 48140 + }, + { + "epoch": 4.492022021087991, + "grad_norm": NaN, + "learning_rate": 4.7111241675171815e-05, + "loss": 0.0, + "step": 48141 + }, + { + "epoch": 4.492115330782869, + "grad_norm": NaN, + "learning_rate": 4.710573785038433e-05, + "loss": 0.0, + "step": 48142 + }, + { + "epoch": 4.492208640477745, + "grad_norm": NaN, + "learning_rate": 4.7100234287226555e-05, + "loss": 0.0, + "step": 48143 + }, + { + "epoch": 4.492301950172623, + "grad_norm": NaN, + "learning_rate": 4.7094730985712356e-05, + "loss": 0.0, + "step": 48144 + }, + { + "epoch": 4.4923952598675, + "grad_norm": NaN, + "learning_rate": 4.708922794585572e-05, + "loss": 0.0, + "step": 48145 + }, + { + "epoch": 4.4924885695623775, + "grad_norm": NaN, + "learning_rate": 4.70837251676708e-05, + "loss": 0.0, + "step": 48146 + }, + { + "epoch": 4.492581879257255, + "grad_norm": NaN, + "learning_rate": 4.7078222651171424e-05, + "loss": 0.0, + "step": 48147 + }, + { + "epoch": 4.492675188952132, + "grad_norm": NaN, + "learning_rate": 4.707272039637159e-05, + "loss": 0.0, + "step": 48148 + }, + { + "epoch": 4.49276849864701, + "grad_norm": NaN, + "learning_rate": 4.706721840328546e-05, + "loss": 0.0, + "step": 48149 + }, + { + "epoch": 4.492861808341886, + "grad_norm": NaN, + "learning_rate": 4.7061716671926837e-05, + "loss": 0.0, + "step": 48150 + }, + { + "epoch": 4.492955118036764, + "grad_norm": NaN, + "learning_rate": 4.705621520230972e-05, + "loss": 0.0, + "step": 48151 + }, + { + "epoch": 4.493048427731641, + "grad_norm": NaN, + "learning_rate": 4.705071399444826e-05, + "loss": 0.0, + "step": 48152 + }, + { + "epoch": 4.493141737426519, + "grad_norm": NaN, + "learning_rate": 4.704521304835628e-05, + "loss": 0.0, + "step": 48153 + }, + { + "epoch": 4.493235047121396, + "grad_norm": NaN, + "learning_rate": 4.703971236404779e-05, + "loss": 0.0, + "step": 48154 + }, + { + "epoch": 4.493328356816273, + "grad_norm": NaN, + "learning_rate": 4.703421194153691e-05, + "loss": 0.0, + "step": 48155 + }, + { + "epoch": 4.493421666511151, + "grad_norm": NaN, + "learning_rate": 4.702871178083747e-05, + "loss": 0.0, + "step": 48156 + }, + { + "epoch": 4.493514976206027, + "grad_norm": NaN, + "learning_rate": 4.702321188196347e-05, + "loss": 0.0, + "step": 48157 + }, + { + "epoch": 4.493608285900905, + "grad_norm": NaN, + "learning_rate": 4.701771224492903e-05, + "loss": 0.0, + "step": 48158 + }, + { + "epoch": 4.493701595595782, + "grad_norm": NaN, + "learning_rate": 4.701221286974801e-05, + "loss": 0.0, + "step": 48159 + }, + { + "epoch": 4.49379490529066, + "grad_norm": NaN, + "learning_rate": 4.700671375643436e-05, + "loss": 0.0, + "step": 48160 + }, + { + "epoch": 4.493888214985537, + "grad_norm": NaN, + "learning_rate": 4.7001214905002234e-05, + "loss": 0.0, + "step": 48161 + }, + { + "epoch": 4.4939815246804145, + "grad_norm": NaN, + "learning_rate": 4.6995716315465446e-05, + "loss": 0.0, + "step": 48162 + }, + { + "epoch": 4.494074834375292, + "grad_norm": NaN, + "learning_rate": 4.6990217987837995e-05, + "loss": 0.0, + "step": 48163 + }, + { + "epoch": 4.494168144070169, + "grad_norm": NaN, + "learning_rate": 4.6984719922134015e-05, + "loss": 0.0, + "step": 48164 + }, + { + "epoch": 4.494261453765046, + "grad_norm": NaN, + "learning_rate": 4.6979222118367324e-05, + "loss": 0.0, + "step": 48165 + }, + { + "epoch": 4.494354763459923, + "grad_norm": NaN, + "learning_rate": 4.697372457655195e-05, + "loss": 0.0, + "step": 48166 + }, + { + "epoch": 4.494448073154801, + "grad_norm": NaN, + "learning_rate": 4.696822729670188e-05, + "loss": 0.0, + "step": 48167 + }, + { + "epoch": 4.494541382849678, + "grad_norm": NaN, + "learning_rate": 4.696273027883104e-05, + "loss": 0.0, + "step": 48168 + }, + { + "epoch": 4.4946346925445555, + "grad_norm": NaN, + "learning_rate": 4.695723352295354e-05, + "loss": 0.0, + "step": 48169 + }, + { + "epoch": 4.494728002239433, + "grad_norm": NaN, + "learning_rate": 4.695173702908324e-05, + "loss": 0.0, + "step": 48170 + }, + { + "epoch": 4.49482131193431, + "grad_norm": NaN, + "learning_rate": 4.694624079723408e-05, + "loss": 0.0, + "step": 48171 + }, + { + "epoch": 4.494914621629187, + "grad_norm": NaN, + "learning_rate": 4.6940744827420214e-05, + "loss": 0.0, + "step": 48172 + }, + { + "epoch": 4.495007931324064, + "grad_norm": NaN, + "learning_rate": 4.693524911965546e-05, + "loss": 0.0, + "step": 48173 + }, + { + "epoch": 4.495101241018942, + "grad_norm": NaN, + "learning_rate": 4.6929753673953776e-05, + "loss": 0.0, + "step": 48174 + }, + { + "epoch": 4.495194550713819, + "grad_norm": NaN, + "learning_rate": 4.692425849032931e-05, + "loss": 0.0, + "step": 48175 + }, + { + "epoch": 4.4952878604086965, + "grad_norm": NaN, + "learning_rate": 4.691876356879585e-05, + "loss": 0.0, + "step": 48176 + }, + { + "epoch": 4.495381170103574, + "grad_norm": NaN, + "learning_rate": 4.69132689093674e-05, + "loss": 0.0, + "step": 48177 + }, + { + "epoch": 4.495474479798451, + "grad_norm": NaN, + "learning_rate": 4.6907774512058075e-05, + "loss": 0.0, + "step": 48178 + }, + { + "epoch": 4.495567789493329, + "grad_norm": NaN, + "learning_rate": 4.6902280376881704e-05, + "loss": 0.0, + "step": 48179 + }, + { + "epoch": 4.495661099188205, + "grad_norm": NaN, + "learning_rate": 4.689678650385223e-05, + "loss": 0.0, + "step": 48180 + }, + { + "epoch": 4.495754408883083, + "grad_norm": NaN, + "learning_rate": 4.689129289298378e-05, + "loss": 0.0, + "step": 48181 + }, + { + "epoch": 4.49584771857796, + "grad_norm": NaN, + "learning_rate": 4.688579954429019e-05, + "loss": 0.0, + "step": 48182 + }, + { + "epoch": 4.495941028272838, + "grad_norm": NaN, + "learning_rate": 4.688030645778541e-05, + "loss": 0.0, + "step": 48183 + }, + { + "epoch": 4.496034337967715, + "grad_norm": NaN, + "learning_rate": 4.687481363348356e-05, + "loss": 0.0, + "step": 48184 + }, + { + "epoch": 4.496127647662592, + "grad_norm": NaN, + "learning_rate": 4.686932107139847e-05, + "loss": 0.0, + "step": 48185 + }, + { + "epoch": 4.49622095735747, + "grad_norm": NaN, + "learning_rate": 4.686382877154409e-05, + "loss": 0.0, + "step": 48186 + }, + { + "epoch": 4.496314267052346, + "grad_norm": NaN, + "learning_rate": 4.685833673393456e-05, + "loss": 0.0, + "step": 48187 + }, + { + "epoch": 4.496407576747224, + "grad_norm": NaN, + "learning_rate": 4.685284495858366e-05, + "loss": 0.0, + "step": 48188 + }, + { + "epoch": 4.496500886442101, + "grad_norm": NaN, + "learning_rate": 4.6847353445505374e-05, + "loss": 0.0, + "step": 48189 + }, + { + "epoch": 4.496594196136979, + "grad_norm": NaN, + "learning_rate": 4.684186219471381e-05, + "loss": 0.0, + "step": 48190 + }, + { + "epoch": 4.496687505831856, + "grad_norm": NaN, + "learning_rate": 4.683637120622279e-05, + "loss": 0.0, + "step": 48191 + }, + { + "epoch": 4.4967808155267335, + "grad_norm": NaN, + "learning_rate": 4.683088048004626e-05, + "loss": 0.0, + "step": 48192 + }, + { + "epoch": 4.496874125221611, + "grad_norm": NaN, + "learning_rate": 4.682539001619834e-05, + "loss": 0.0, + "step": 48193 + }, + { + "epoch": 4.496967434916487, + "grad_norm": NaN, + "learning_rate": 4.681989981469285e-05, + "loss": 0.0, + "step": 48194 + }, + { + "epoch": 4.497060744611365, + "grad_norm": NaN, + "learning_rate": 4.681440987554373e-05, + "loss": 0.0, + "step": 48195 + }, + { + "epoch": 4.497154054306242, + "grad_norm": NaN, + "learning_rate": 4.680892019876511e-05, + "loss": 0.0, + "step": 48196 + }, + { + "epoch": 4.49724736400112, + "grad_norm": NaN, + "learning_rate": 4.680343078437078e-05, + "loss": 0.0, + "step": 48197 + }, + { + "epoch": 4.497340673695997, + "grad_norm": NaN, + "learning_rate": 4.679794163237471e-05, + "loss": 0.0, + "step": 48198 + }, + { + "epoch": 4.4974339833908745, + "grad_norm": NaN, + "learning_rate": 4.6792452742791006e-05, + "loss": 0.0, + "step": 48199 + }, + { + "epoch": 4.497527293085752, + "grad_norm": NaN, + "learning_rate": 4.6786964115633475e-05, + "loss": 0.0, + "step": 48200 + }, + { + "epoch": 4.4976206027806285, + "grad_norm": NaN, + "learning_rate": 4.678147575091606e-05, + "loss": 0.0, + "step": 48201 + }, + { + "epoch": 4.497713912475506, + "grad_norm": NaN, + "learning_rate": 4.677598764865288e-05, + "loss": 0.0, + "step": 48202 + }, + { + "epoch": 4.497807222170383, + "grad_norm": NaN, + "learning_rate": 4.677049980885774e-05, + "loss": 0.0, + "step": 48203 + }, + { + "epoch": 4.497900531865261, + "grad_norm": NaN, + "learning_rate": 4.6765012231544604e-05, + "loss": 0.0, + "step": 48204 + }, + { + "epoch": 4.497993841560138, + "grad_norm": NaN, + "learning_rate": 4.6759524916727534e-05, + "loss": 0.0, + "step": 48205 + }, + { + "epoch": 4.498087151255016, + "grad_norm": NaN, + "learning_rate": 4.675403786442032e-05, + "loss": 0.0, + "step": 48206 + }, + { + "epoch": 4.498180460949893, + "grad_norm": NaN, + "learning_rate": 4.6748551074637054e-05, + "loss": 0.0, + "step": 48207 + }, + { + "epoch": 4.49827377064477, + "grad_norm": NaN, + "learning_rate": 4.674306454739169e-05, + "loss": 0.0, + "step": 48208 + }, + { + "epoch": 4.498367080339647, + "grad_norm": NaN, + "learning_rate": 4.673757828269804e-05, + "loss": 0.0, + "step": 48209 + }, + { + "epoch": 4.498460390034524, + "grad_norm": NaN, + "learning_rate": 4.673209228057016e-05, + "loss": 0.0, + "step": 48210 + }, + { + "epoch": 4.498553699729402, + "grad_norm": NaN, + "learning_rate": 4.6726606541022045e-05, + "loss": 0.0, + "step": 48211 + }, + { + "epoch": 4.498647009424279, + "grad_norm": NaN, + "learning_rate": 4.672112106406747e-05, + "loss": 0.0, + "step": 48212 + }, + { + "epoch": 4.498740319119157, + "grad_norm": NaN, + "learning_rate": 4.671563584972061e-05, + "loss": 0.0, + "step": 48213 + }, + { + "epoch": 4.498833628814034, + "grad_norm": NaN, + "learning_rate": 4.671015089799523e-05, + "loss": 0.0, + "step": 48214 + }, + { + "epoch": 4.4989269385089115, + "grad_norm": NaN, + "learning_rate": 4.670466620890528e-05, + "loss": 0.0, + "step": 48215 + }, + { + "epoch": 4.499020248203788, + "grad_norm": NaN, + "learning_rate": 4.669918178246489e-05, + "loss": 0.0, + "step": 48216 + }, + { + "epoch": 4.499113557898665, + "grad_norm": NaN, + "learning_rate": 4.66936976186878e-05, + "loss": 0.0, + "step": 48217 + }, + { + "epoch": 4.499206867593543, + "grad_norm": NaN, + "learning_rate": 4.668821371758801e-05, + "loss": 0.0, + "step": 48218 + }, + { + "epoch": 4.49930017728842, + "grad_norm": NaN, + "learning_rate": 4.668273007917958e-05, + "loss": 0.0, + "step": 48219 + }, + { + "epoch": 4.499393486983298, + "grad_norm": NaN, + "learning_rate": 4.667724670347631e-05, + "loss": 0.0, + "step": 48220 + }, + { + "epoch": 4.499486796678175, + "grad_norm": NaN, + "learning_rate": 4.667176359049216e-05, + "loss": 0.0, + "step": 48221 + }, + { + "epoch": 4.4995801063730525, + "grad_norm": NaN, + "learning_rate": 4.66662807402412e-05, + "loss": 0.0, + "step": 48222 + }, + { + "epoch": 4.49967341606793, + "grad_norm": NaN, + "learning_rate": 4.6660798152737225e-05, + "loss": 0.0, + "step": 48223 + }, + { + "epoch": 4.4997667257628065, + "grad_norm": NaN, + "learning_rate": 4.6655315827994176e-05, + "loss": 0.0, + "step": 48224 + }, + { + "epoch": 4.499860035457684, + "grad_norm": NaN, + "learning_rate": 4.664983376602614e-05, + "loss": 0.0, + "step": 48225 + }, + { + "epoch": 4.499953345152561, + "grad_norm": NaN, + "learning_rate": 4.664435196684691e-05, + "loss": 0.0, + "step": 48226 + }, + { + "epoch": 4.500046654847439, + "grad_norm": NaN, + "learning_rate": 4.663887043047044e-05, + "loss": 0.0, + "step": 48227 + }, + { + "epoch": 4.500139964542316, + "grad_norm": NaN, + "learning_rate": 4.66333891569108e-05, + "loss": 0.0, + "step": 48228 + }, + { + "epoch": 4.5002332742371935, + "grad_norm": NaN, + "learning_rate": 4.662790814618179e-05, + "loss": 0.0, + "step": 48229 + }, + { + "epoch": 4.50032658393207, + "grad_norm": NaN, + "learning_rate": 4.662242739829734e-05, + "loss": 0.0, + "step": 48230 + }, + { + "epoch": 4.5004198936269475, + "grad_norm": NaN, + "learning_rate": 4.6616946913271523e-05, + "loss": 0.0, + "step": 48231 + }, + { + "epoch": 4.500513203321825, + "grad_norm": NaN, + "learning_rate": 4.6611466691118135e-05, + "loss": 0.0, + "step": 48232 + }, + { + "epoch": 4.500606513016702, + "grad_norm": NaN, + "learning_rate": 4.660598673185112e-05, + "loss": 0.0, + "step": 48233 + }, + { + "epoch": 4.50069982271158, + "grad_norm": NaN, + "learning_rate": 4.660050703548454e-05, + "loss": 0.0, + "step": 48234 + }, + { + "epoch": 4.500793132406457, + "grad_norm": NaN, + "learning_rate": 4.65950276020322e-05, + "loss": 0.0, + "step": 48235 + }, + { + "epoch": 4.500886442101335, + "grad_norm": NaN, + "learning_rate": 4.658954843150803e-05, + "loss": 0.0, + "step": 48236 + }, + { + "epoch": 4.500979751796212, + "grad_norm": NaN, + "learning_rate": 4.6584069523926106e-05, + "loss": 0.0, + "step": 48237 + }, + { + "epoch": 4.5010730614910885, + "grad_norm": NaN, + "learning_rate": 4.65785908793002e-05, + "loss": 0.0, + "step": 48238 + }, + { + "epoch": 4.501166371185966, + "grad_norm": NaN, + "learning_rate": 4.6573112497644246e-05, + "loss": 0.0, + "step": 48239 + }, + { + "epoch": 4.501259680880843, + "grad_norm": NaN, + "learning_rate": 4.656763437897234e-05, + "loss": 0.0, + "step": 48240 + }, + { + "epoch": 4.501352990575721, + "grad_norm": NaN, + "learning_rate": 4.656215652329825e-05, + "loss": 0.0, + "step": 48241 + }, + { + "epoch": 4.501446300270598, + "grad_norm": NaN, + "learning_rate": 4.655667893063591e-05, + "loss": 0.0, + "step": 48242 + }, + { + "epoch": 4.501539609965476, + "grad_norm": NaN, + "learning_rate": 4.6551201600999385e-05, + "loss": 0.0, + "step": 48243 + }, + { + "epoch": 4.501632919660353, + "grad_norm": NaN, + "learning_rate": 4.654572453440242e-05, + "loss": 0.0, + "step": 48244 + }, + { + "epoch": 4.50172622935523, + "grad_norm": NaN, + "learning_rate": 4.654024773085908e-05, + "loss": 0.0, + "step": 48245 + }, + { + "epoch": 4.501819539050107, + "grad_norm": NaN, + "learning_rate": 4.653477119038328e-05, + "loss": 0.0, + "step": 48246 + }, + { + "epoch": 4.501912848744984, + "grad_norm": NaN, + "learning_rate": 4.652929491298883e-05, + "loss": 0.0, + "step": 48247 + }, + { + "epoch": 4.502006158439862, + "grad_norm": NaN, + "learning_rate": 4.6523818898689766e-05, + "loss": 0.0, + "step": 48248 + }, + { + "epoch": 4.502099468134739, + "grad_norm": NaN, + "learning_rate": 4.651834314750002e-05, + "loss": 0.0, + "step": 48249 + }, + { + "epoch": 4.502192777829617, + "grad_norm": NaN, + "learning_rate": 4.651286765943338e-05, + "loss": 0.0, + "step": 48250 + }, + { + "epoch": 4.502286087524494, + "grad_norm": NaN, + "learning_rate": 4.650739243450393e-05, + "loss": 0.0, + "step": 48251 + }, + { + "epoch": 4.5023793972193715, + "grad_norm": NaN, + "learning_rate": 4.650191747272555e-05, + "loss": 0.0, + "step": 48252 + }, + { + "epoch": 4.502472706914248, + "grad_norm": NaN, + "learning_rate": 4.649644277411205e-05, + "loss": 0.0, + "step": 48253 + }, + { + "epoch": 4.5025660166091255, + "grad_norm": NaN, + "learning_rate": 4.6490968338677496e-05, + "loss": 0.0, + "step": 48254 + }, + { + "epoch": 4.502659326304003, + "grad_norm": NaN, + "learning_rate": 4.648549416643578e-05, + "loss": 0.0, + "step": 48255 + }, + { + "epoch": 4.50275263599888, + "grad_norm": NaN, + "learning_rate": 4.64800202574007e-05, + "loss": 0.0, + "step": 48256 + }, + { + "epoch": 4.502845945693758, + "grad_norm": NaN, + "learning_rate": 4.647454661158636e-05, + "loss": 0.0, + "step": 48257 + }, + { + "epoch": 4.502939255388635, + "grad_norm": NaN, + "learning_rate": 4.646907322900652e-05, + "loss": 0.0, + "step": 48258 + }, + { + "epoch": 4.503032565083512, + "grad_norm": NaN, + "learning_rate": 4.6463600109675134e-05, + "loss": 0.0, + "step": 48259 + }, + { + "epoch": 4.503125874778389, + "grad_norm": NaN, + "learning_rate": 4.6458127253606213e-05, + "loss": 0.0, + "step": 48260 + }, + { + "epoch": 4.5032191844732665, + "grad_norm": NaN, + "learning_rate": 4.6452654660813564e-05, + "loss": 0.0, + "step": 48261 + }, + { + "epoch": 4.503312494168144, + "grad_norm": NaN, + "learning_rate": 4.6447182331311105e-05, + "loss": 0.0, + "step": 48262 + }, + { + "epoch": 4.503405803863021, + "grad_norm": NaN, + "learning_rate": 4.6441710265112876e-05, + "loss": 0.0, + "step": 48263 + }, + { + "epoch": 4.503499113557899, + "grad_norm": NaN, + "learning_rate": 4.643623846223265e-05, + "loss": 0.0, + "step": 48264 + }, + { + "epoch": 4.503592423252776, + "grad_norm": NaN, + "learning_rate": 4.6430766922684356e-05, + "loss": 0.0, + "step": 48265 + }, + { + "epoch": 4.503685732947654, + "grad_norm": NaN, + "learning_rate": 4.6425295646482024e-05, + "loss": 0.0, + "step": 48266 + }, + { + "epoch": 4.503779042642531, + "grad_norm": NaN, + "learning_rate": 4.6419824633639427e-05, + "loss": 0.0, + "step": 48267 + }, + { + "epoch": 4.503872352337408, + "grad_norm": NaN, + "learning_rate": 4.641435388417051e-05, + "loss": 0.0, + "step": 48268 + }, + { + "epoch": 4.503965662032285, + "grad_norm": NaN, + "learning_rate": 4.64088833980893e-05, + "loss": 0.0, + "step": 48269 + }, + { + "epoch": 4.504058971727162, + "grad_norm": NaN, + "learning_rate": 4.640341317540955e-05, + "loss": 0.0, + "step": 48270 + }, + { + "epoch": 4.50415228142204, + "grad_norm": NaN, + "learning_rate": 4.6397943216145186e-05, + "loss": 0.0, + "step": 48271 + }, + { + "epoch": 4.504245591116917, + "grad_norm": NaN, + "learning_rate": 4.6392473520310276e-05, + "loss": 0.0, + "step": 48272 + }, + { + "epoch": 4.504338900811795, + "grad_norm": NaN, + "learning_rate": 4.638700408791855e-05, + "loss": 0.0, + "step": 48273 + }, + { + "epoch": 4.504432210506671, + "grad_norm": NaN, + "learning_rate": 4.638153491898393e-05, + "loss": 0.0, + "step": 48274 + }, + { + "epoch": 4.504525520201549, + "grad_norm": NaN, + "learning_rate": 4.6376066013520494e-05, + "loss": 0.0, + "step": 48275 + }, + { + "epoch": 4.504618829896426, + "grad_norm": NaN, + "learning_rate": 4.6370597371541954e-05, + "loss": 0.0, + "step": 48276 + }, + { + "epoch": 4.5047121395913035, + "grad_norm": NaN, + "learning_rate": 4.636512899306225e-05, + "loss": 0.0, + "step": 48277 + }, + { + "epoch": 4.504805449286181, + "grad_norm": NaN, + "learning_rate": 4.6359660878095414e-05, + "loss": 0.0, + "step": 48278 + }, + { + "epoch": 4.504898758981058, + "grad_norm": NaN, + "learning_rate": 4.6354193026655204e-05, + "loss": 0.0, + "step": 48279 + }, + { + "epoch": 4.504992068675936, + "grad_norm": NaN, + "learning_rate": 4.634872543875556e-05, + "loss": 0.0, + "step": 48280 + }, + { + "epoch": 4.505085378370813, + "grad_norm": NaN, + "learning_rate": 4.634325811441047e-05, + "loss": 0.0, + "step": 48281 + }, + { + "epoch": 4.50517868806569, + "grad_norm": NaN, + "learning_rate": 4.633779105363368e-05, + "loss": 0.0, + "step": 48282 + }, + { + "epoch": 4.505271997760567, + "grad_norm": NaN, + "learning_rate": 4.6332324256439234e-05, + "loss": 0.0, + "step": 48283 + }, + { + "epoch": 4.5053653074554445, + "grad_norm": NaN, + "learning_rate": 4.632685772284103e-05, + "loss": 0.0, + "step": 48284 + }, + { + "epoch": 4.505458617150322, + "grad_norm": NaN, + "learning_rate": 4.632139145285281e-05, + "loss": 0.0, + "step": 48285 + }, + { + "epoch": 4.505551926845199, + "grad_norm": NaN, + "learning_rate": 4.6315925446488634e-05, + "loss": 0.0, + "step": 48286 + }, + { + "epoch": 4.505645236540077, + "grad_norm": NaN, + "learning_rate": 4.63104597037624e-05, + "loss": 0.0, + "step": 48287 + }, + { + "epoch": 4.505738546234954, + "grad_norm": NaN, + "learning_rate": 4.6304994224687844e-05, + "loss": 0.0, + "step": 48288 + }, + { + "epoch": 4.505831855929831, + "grad_norm": NaN, + "learning_rate": 4.6299529009279035e-05, + "loss": 0.0, + "step": 48289 + }, + { + "epoch": 4.505925165624708, + "grad_norm": NaN, + "learning_rate": 4.6294064057549834e-05, + "loss": 0.0, + "step": 48290 + }, + { + "epoch": 4.5060184753195855, + "grad_norm": NaN, + "learning_rate": 4.628859936951402e-05, + "loss": 0.0, + "step": 48291 + }, + { + "epoch": 4.506111785014463, + "grad_norm": NaN, + "learning_rate": 4.6283134945185625e-05, + "loss": 0.0, + "step": 48292 + }, + { + "epoch": 4.50620509470934, + "grad_norm": NaN, + "learning_rate": 4.6277670784578556e-05, + "loss": 0.0, + "step": 48293 + }, + { + "epoch": 4.506298404404218, + "grad_norm": NaN, + "learning_rate": 4.6272206887706524e-05, + "loss": 0.0, + "step": 48294 + }, + { + "epoch": 4.506391714099095, + "grad_norm": NaN, + "learning_rate": 4.626674325458362e-05, + "loss": 0.0, + "step": 48295 + }, + { + "epoch": 4.506485023793973, + "grad_norm": NaN, + "learning_rate": 4.6261279885223684e-05, + "loss": 0.0, + "step": 48296 + }, + { + "epoch": 4.506578333488849, + "grad_norm": NaN, + "learning_rate": 4.625581677964049e-05, + "loss": 0.0, + "step": 48297 + }, + { + "epoch": 4.506671643183727, + "grad_norm": NaN, + "learning_rate": 4.625035393784808e-05, + "loss": 0.0, + "step": 48298 + }, + { + "epoch": 4.506764952878604, + "grad_norm": NaN, + "learning_rate": 4.624489135986033e-05, + "loss": 0.0, + "step": 48299 + }, + { + "epoch": 4.506858262573481, + "grad_norm": NaN, + "learning_rate": 4.6239429045690986e-05, + "loss": 0.0, + "step": 48300 + }, + { + "epoch": 4.506951572268359, + "grad_norm": NaN, + "learning_rate": 4.623396699535414e-05, + "loss": 0.0, + "step": 48301 + }, + { + "epoch": 4.507044881963236, + "grad_norm": NaN, + "learning_rate": 4.622850520886353e-05, + "loss": 0.0, + "step": 48302 + }, + { + "epoch": 4.507138191658113, + "grad_norm": NaN, + "learning_rate": 4.622304368623305e-05, + "loss": 0.0, + "step": 48303 + }, + { + "epoch": 4.50723150135299, + "grad_norm": NaN, + "learning_rate": 4.621758242747671e-05, + "loss": 0.0, + "step": 48304 + }, + { + "epoch": 4.507324811047868, + "grad_norm": NaN, + "learning_rate": 4.621212143260828e-05, + "loss": 0.0, + "step": 48305 + }, + { + "epoch": 4.507418120742745, + "grad_norm": NaN, + "learning_rate": 4.620666070164162e-05, + "loss": 0.0, + "step": 48306 + }, + { + "epoch": 4.5075114304376225, + "grad_norm": NaN, + "learning_rate": 4.620120023459078e-05, + "loss": 0.0, + "step": 48307 + }, + { + "epoch": 4.5076047401325, + "grad_norm": NaN, + "learning_rate": 4.619574003146951e-05, + "loss": 0.0, + "step": 48308 + }, + { + "epoch": 4.507698049827377, + "grad_norm": NaN, + "learning_rate": 4.619028009229165e-05, + "loss": 0.0, + "step": 48309 + }, + { + "epoch": 4.507791359522255, + "grad_norm": NaN, + "learning_rate": 4.618482041707126e-05, + "loss": 0.0, + "step": 48310 + }, + { + "epoch": 4.507884669217132, + "grad_norm": NaN, + "learning_rate": 4.6179361005822085e-05, + "loss": 0.0, + "step": 48311 + }, + { + "epoch": 4.507977978912009, + "grad_norm": NaN, + "learning_rate": 4.6173901858557974e-05, + "loss": 0.0, + "step": 48312 + }, + { + "epoch": 4.508071288606886, + "grad_norm": NaN, + "learning_rate": 4.616844297529297e-05, + "loss": 0.0, + "step": 48313 + }, + { + "epoch": 4.5081645983017635, + "grad_norm": NaN, + "learning_rate": 4.6162984356040834e-05, + "loss": 0.0, + "step": 48314 + }, + { + "epoch": 4.508257907996641, + "grad_norm": NaN, + "learning_rate": 4.615752600081541e-05, + "loss": 0.0, + "step": 48315 + }, + { + "epoch": 4.508351217691518, + "grad_norm": NaN, + "learning_rate": 4.6152067909630726e-05, + "loss": 0.0, + "step": 48316 + }, + { + "epoch": 4.508444527386396, + "grad_norm": NaN, + "learning_rate": 4.614661008250052e-05, + "loss": 0.0, + "step": 48317 + }, + { + "epoch": 4.508537837081272, + "grad_norm": NaN, + "learning_rate": 4.614115251943868e-05, + "loss": 0.0, + "step": 48318 + }, + { + "epoch": 4.50863114677615, + "grad_norm": NaN, + "learning_rate": 4.613569522045923e-05, + "loss": 0.0, + "step": 48319 + }, + { + "epoch": 4.508724456471027, + "grad_norm": NaN, + "learning_rate": 4.613023818557582e-05, + "loss": 0.0, + "step": 48320 + }, + { + "epoch": 4.508817766165905, + "grad_norm": NaN, + "learning_rate": 4.612478141480252e-05, + "loss": 0.0, + "step": 48321 + }, + { + "epoch": 4.508911075860782, + "grad_norm": NaN, + "learning_rate": 4.6119324908153154e-05, + "loss": 0.0, + "step": 48322 + }, + { + "epoch": 4.509004385555659, + "grad_norm": NaN, + "learning_rate": 4.6113868665641504e-05, + "loss": 0.0, + "step": 48323 + }, + { + "epoch": 4.509097695250537, + "grad_norm": NaN, + "learning_rate": 4.610841268728153e-05, + "loss": 0.0, + "step": 48324 + }, + { + "epoch": 4.509191004945414, + "grad_norm": NaN, + "learning_rate": 4.6102956973087166e-05, + "loss": 0.0, + "step": 48325 + }, + { + "epoch": 4.509284314640291, + "grad_norm": NaN, + "learning_rate": 4.6097501523072084e-05, + "loss": 0.0, + "step": 48326 + }, + { + "epoch": 4.509377624335168, + "grad_norm": NaN, + "learning_rate": 4.6092046337250347e-05, + "loss": 0.0, + "step": 48327 + }, + { + "epoch": 4.509470934030046, + "grad_norm": NaN, + "learning_rate": 4.608659141563579e-05, + "loss": 0.0, + "step": 48328 + }, + { + "epoch": 4.509564243724923, + "grad_norm": NaN, + "learning_rate": 4.6081136758242156e-05, + "loss": 0.0, + "step": 48329 + }, + { + "epoch": 4.5096575534198005, + "grad_norm": NaN, + "learning_rate": 4.607568236508345e-05, + "loss": 0.0, + "step": 48330 + }, + { + "epoch": 4.509750863114678, + "grad_norm": NaN, + "learning_rate": 4.607022823617356e-05, + "loss": 0.0, + "step": 48331 + }, + { + "epoch": 4.509844172809555, + "grad_norm": NaN, + "learning_rate": 4.6064774371526196e-05, + "loss": 0.0, + "step": 48332 + }, + { + "epoch": 4.509937482504432, + "grad_norm": NaN, + "learning_rate": 4.6059320771155366e-05, + "loss": 0.0, + "step": 48333 + }, + { + "epoch": 4.510030792199309, + "grad_norm": NaN, + "learning_rate": 4.605386743507494e-05, + "loss": 0.0, + "step": 48334 + }, + { + "epoch": 4.510124101894187, + "grad_norm": NaN, + "learning_rate": 4.604841436329865e-05, + "loss": 0.0, + "step": 48335 + }, + { + "epoch": 4.510217411589064, + "grad_norm": NaN, + "learning_rate": 4.60429615558405e-05, + "loss": 0.0, + "step": 48336 + }, + { + "epoch": 4.5103107212839415, + "grad_norm": NaN, + "learning_rate": 4.603750901271435e-05, + "loss": 0.0, + "step": 48337 + }, + { + "epoch": 4.510404030978819, + "grad_norm": NaN, + "learning_rate": 4.6032056733933905e-05, + "loss": 0.0, + "step": 48338 + }, + { + "epoch": 4.510497340673696, + "grad_norm": NaN, + "learning_rate": 4.602660471951321e-05, + "loss": 0.0, + "step": 48339 + }, + { + "epoch": 4.510590650368574, + "grad_norm": NaN, + "learning_rate": 4.602115296946609e-05, + "loss": 0.0, + "step": 48340 + }, + { + "epoch": 4.51068396006345, + "grad_norm": NaN, + "learning_rate": 4.60157014838063e-05, + "loss": 0.0, + "step": 48341 + }, + { + "epoch": 4.510777269758328, + "grad_norm": NaN, + "learning_rate": 4.601025026254783e-05, + "loss": 0.0, + "step": 48342 + }, + { + "epoch": 4.510870579453205, + "grad_norm": NaN, + "learning_rate": 4.600479930570451e-05, + "loss": 0.0, + "step": 48343 + }, + { + "epoch": 4.5109638891480826, + "grad_norm": NaN, + "learning_rate": 4.59993486132901e-05, + "loss": 0.0, + "step": 48344 + }, + { + "epoch": 4.51105719884296, + "grad_norm": NaN, + "learning_rate": 4.599389818531863e-05, + "loss": 0.0, + "step": 48345 + }, + { + "epoch": 4.511150508537837, + "grad_norm": NaN, + "learning_rate": 4.5988448021803834e-05, + "loss": 0.0, + "step": 48346 + }, + { + "epoch": 4.511243818232714, + "grad_norm": NaN, + "learning_rate": 4.5982998122759555e-05, + "loss": 0.0, + "step": 48347 + }, + { + "epoch": 4.511337127927591, + "grad_norm": NaN, + "learning_rate": 4.597754848819978e-05, + "loss": 0.0, + "step": 48348 + }, + { + "epoch": 4.511430437622469, + "grad_norm": NaN, + "learning_rate": 4.597209911813826e-05, + "loss": 0.0, + "step": 48349 + }, + { + "epoch": 4.511523747317346, + "grad_norm": NaN, + "learning_rate": 4.5966650012588805e-05, + "loss": 0.0, + "step": 48350 + }, + { + "epoch": 4.511617057012224, + "grad_norm": NaN, + "learning_rate": 4.5961201171565445e-05, + "loss": 0.0, + "step": 48351 + }, + { + "epoch": 4.511710366707101, + "grad_norm": NaN, + "learning_rate": 4.5955752595081886e-05, + "loss": 0.0, + "step": 48352 + }, + { + "epoch": 4.511803676401978, + "grad_norm": NaN, + "learning_rate": 4.595030428315197e-05, + "loss": 0.0, + "step": 48353 + }, + { + "epoch": 4.511896986096856, + "grad_norm": NaN, + "learning_rate": 4.594485623578973e-05, + "loss": 0.0, + "step": 48354 + }, + { + "epoch": 4.511990295791732, + "grad_norm": NaN, + "learning_rate": 4.5939408453008824e-05, + "loss": 0.0, + "step": 48355 + }, + { + "epoch": 4.51208360548661, + "grad_norm": NaN, + "learning_rate": 4.593396093482315e-05, + "loss": 0.0, + "step": 48356 + }, + { + "epoch": 4.512176915181487, + "grad_norm": NaN, + "learning_rate": 4.592851368124666e-05, + "loss": 0.0, + "step": 48357 + }, + { + "epoch": 4.512270224876365, + "grad_norm": NaN, + "learning_rate": 4.592306669229306e-05, + "loss": 0.0, + "step": 48358 + }, + { + "epoch": 4.512363534571242, + "grad_norm": NaN, + "learning_rate": 4.5917619967976304e-05, + "loss": 0.0, + "step": 48359 + }, + { + "epoch": 4.5124568442661195, + "grad_norm": NaN, + "learning_rate": 4.591217350831025e-05, + "loss": 0.0, + "step": 48360 + }, + { + "epoch": 4.512550153960997, + "grad_norm": NaN, + "learning_rate": 4.590672731330862e-05, + "loss": 0.0, + "step": 48361 + }, + { + "epoch": 4.512643463655873, + "grad_norm": NaN, + "learning_rate": 4.590128138298541e-05, + "loss": 0.0, + "step": 48362 + }, + { + "epoch": 4.512736773350751, + "grad_norm": NaN, + "learning_rate": 4.589583571735442e-05, + "loss": 0.0, + "step": 48363 + }, + { + "epoch": 4.512830083045628, + "grad_norm": NaN, + "learning_rate": 4.5890390316429415e-05, + "loss": 0.0, + "step": 48364 + }, + { + "epoch": 4.512923392740506, + "grad_norm": NaN, + "learning_rate": 4.588494518022433e-05, + "loss": 0.0, + "step": 48365 + }, + { + "epoch": 4.513016702435383, + "grad_norm": NaN, + "learning_rate": 4.587950030875306e-05, + "loss": 0.0, + "step": 48366 + }, + { + "epoch": 4.5131100121302605, + "grad_norm": NaN, + "learning_rate": 4.5874055702029274e-05, + "loss": 0.0, + "step": 48367 + }, + { + "epoch": 4.513203321825138, + "grad_norm": NaN, + "learning_rate": 4.5868611360066963e-05, + "loss": 0.0, + "step": 48368 + }, + { + "epoch": 4.513296631520015, + "grad_norm": NaN, + "learning_rate": 4.586316728287999e-05, + "loss": 0.0, + "step": 48369 + }, + { + "epoch": 4.513389941214892, + "grad_norm": NaN, + "learning_rate": 4.585772347048203e-05, + "loss": 0.0, + "step": 48370 + }, + { + "epoch": 4.513483250909769, + "grad_norm": NaN, + "learning_rate": 4.5852279922887076e-05, + "loss": 0.0, + "step": 48371 + }, + { + "epoch": 4.513576560604647, + "grad_norm": NaN, + "learning_rate": 4.584683664010897e-05, + "loss": 0.0, + "step": 48372 + }, + { + "epoch": 4.513669870299524, + "grad_norm": NaN, + "learning_rate": 4.584139362216141e-05, + "loss": 0.0, + "step": 48373 + }, + { + "epoch": 4.513763179994402, + "grad_norm": NaN, + "learning_rate": 4.583595086905839e-05, + "loss": 0.0, + "step": 48374 + }, + { + "epoch": 4.513856489689279, + "grad_norm": NaN, + "learning_rate": 4.5830508380813724e-05, + "loss": 0.0, + "step": 48375 + }, + { + "epoch": 4.5139497993841555, + "grad_norm": NaN, + "learning_rate": 4.582506615744112e-05, + "loss": 0.0, + "step": 48376 + }, + { + "epoch": 4.514043109079033, + "grad_norm": NaN, + "learning_rate": 4.581962419895458e-05, + "loss": 0.0, + "step": 48377 + }, + { + "epoch": 4.51413641877391, + "grad_norm": NaN, + "learning_rate": 4.581418250536791e-05, + "loss": 0.0, + "step": 48378 + }, + { + "epoch": 4.514229728468788, + "grad_norm": NaN, + "learning_rate": 4.580874107669483e-05, + "loss": 0.0, + "step": 48379 + }, + { + "epoch": 4.514323038163665, + "grad_norm": NaN, + "learning_rate": 4.5803299912949295e-05, + "loss": 0.0, + "step": 48380 + }, + { + "epoch": 4.514416347858543, + "grad_norm": NaN, + "learning_rate": 4.5797859014145144e-05, + "loss": 0.0, + "step": 48381 + }, + { + "epoch": 4.51450965755342, + "grad_norm": NaN, + "learning_rate": 4.5792418380296084e-05, + "loss": 0.0, + "step": 48382 + }, + { + "epoch": 4.5146029672482975, + "grad_norm": NaN, + "learning_rate": 4.578697801141607e-05, + "loss": 0.0, + "step": 48383 + }, + { + "epoch": 4.514696276943175, + "grad_norm": NaN, + "learning_rate": 4.578153790751897e-05, + "loss": 0.0, + "step": 48384 + }, + { + "epoch": 4.514789586638051, + "grad_norm": NaN, + "learning_rate": 4.5776098068618435e-05, + "loss": 0.0, + "step": 48385 + }, + { + "epoch": 4.514882896332929, + "grad_norm": NaN, + "learning_rate": 4.577065849472846e-05, + "loss": 0.0, + "step": 48386 + }, + { + "epoch": 4.514976206027806, + "grad_norm": NaN, + "learning_rate": 4.576521918586288e-05, + "loss": 0.0, + "step": 48387 + }, + { + "epoch": 4.515069515722684, + "grad_norm": NaN, + "learning_rate": 4.5759780142035384e-05, + "loss": 0.0, + "step": 48388 + }, + { + "epoch": 4.515162825417561, + "grad_norm": NaN, + "learning_rate": 4.575434136325992e-05, + "loss": 0.0, + "step": 48389 + }, + { + "epoch": 4.5152561351124385, + "grad_norm": NaN, + "learning_rate": 4.5748902849550347e-05, + "loss": 0.0, + "step": 48390 + }, + { + "epoch": 4.515349444807315, + "grad_norm": NaN, + "learning_rate": 4.574346460092033e-05, + "loss": 0.0, + "step": 48391 + }, + { + "epoch": 4.5154427545021925, + "grad_norm": NaN, + "learning_rate": 4.573802661738389e-05, + "loss": 0.0, + "step": 48392 + }, + { + "epoch": 4.51553606419707, + "grad_norm": NaN, + "learning_rate": 4.573258889895467e-05, + "loss": 0.0, + "step": 48393 + }, + { + "epoch": 4.515629373891947, + "grad_norm": NaN, + "learning_rate": 4.5727151445646643e-05, + "loss": 0.0, + "step": 48394 + }, + { + "epoch": 4.515722683586825, + "grad_norm": NaN, + "learning_rate": 4.572171425747361e-05, + "loss": 0.0, + "step": 48395 + }, + { + "epoch": 4.515815993281702, + "grad_norm": NaN, + "learning_rate": 4.5716277334449304e-05, + "loss": 0.0, + "step": 48396 + }, + { + "epoch": 4.5159093029765796, + "grad_norm": NaN, + "learning_rate": 4.571084067658764e-05, + "loss": 0.0, + "step": 48397 + }, + { + "epoch": 4.516002612671457, + "grad_norm": NaN, + "learning_rate": 4.570540428390246e-05, + "loss": 0.0, + "step": 48398 + }, + { + "epoch": 4.5160959223663335, + "grad_norm": NaN, + "learning_rate": 4.5699968156407465e-05, + "loss": 0.0, + "step": 48399 + }, + { + "epoch": 4.516189232061211, + "grad_norm": NaN, + "learning_rate": 4.56945322941166e-05, + "loss": 0.0, + "step": 48400 + }, + { + "epoch": 4.516282541756088, + "grad_norm": NaN, + "learning_rate": 4.568909669704368e-05, + "loss": 0.0, + "step": 48401 + }, + { + "epoch": 4.516375851450966, + "grad_norm": NaN, + "learning_rate": 4.56836613652024e-05, + "loss": 0.0, + "step": 48402 + }, + { + "epoch": 4.516469161145843, + "grad_norm": NaN, + "learning_rate": 4.567822629860672e-05, + "loss": 0.0, + "step": 48403 + }, + { + "epoch": 4.516562470840721, + "grad_norm": NaN, + "learning_rate": 4.5672791497270434e-05, + "loss": 0.0, + "step": 48404 + }, + { + "epoch": 4.516655780535598, + "grad_norm": NaN, + "learning_rate": 4.566735696120727e-05, + "loss": 0.0, + "step": 48405 + }, + { + "epoch": 4.5167490902304746, + "grad_norm": NaN, + "learning_rate": 4.566192269043113e-05, + "loss": 0.0, + "step": 48406 + }, + { + "epoch": 4.516842399925352, + "grad_norm": NaN, + "learning_rate": 4.565648868495587e-05, + "loss": 0.0, + "step": 48407 + }, + { + "epoch": 4.516935709620229, + "grad_norm": NaN, + "learning_rate": 4.565105494479517e-05, + "loss": 0.0, + "step": 48408 + }, + { + "epoch": 4.517029019315107, + "grad_norm": NaN, + "learning_rate": 4.5645621469962956e-05, + "loss": 0.0, + "step": 48409 + }, + { + "epoch": 4.517122329009984, + "grad_norm": NaN, + "learning_rate": 4.564018826047308e-05, + "loss": 0.0, + "step": 48410 + }, + { + "epoch": 4.517215638704862, + "grad_norm": NaN, + "learning_rate": 4.563475531633918e-05, + "loss": 0.0, + "step": 48411 + }, + { + "epoch": 4.517308948399739, + "grad_norm": NaN, + "learning_rate": 4.562932263757525e-05, + "loss": 0.0, + "step": 48412 + }, + { + "epoch": 4.5174022580946165, + "grad_norm": NaN, + "learning_rate": 4.562389022419507e-05, + "loss": 0.0, + "step": 48413 + }, + { + "epoch": 4.517495567789493, + "grad_norm": NaN, + "learning_rate": 4.5618458076212313e-05, + "loss": 0.0, + "step": 48414 + }, + { + "epoch": 4.51758887748437, + "grad_norm": NaN, + "learning_rate": 4.5613026193640964e-05, + "loss": 0.0, + "step": 48415 + }, + { + "epoch": 4.517682187179248, + "grad_norm": NaN, + "learning_rate": 4.56075945764948e-05, + "loss": 0.0, + "step": 48416 + }, + { + "epoch": 4.517775496874125, + "grad_norm": NaN, + "learning_rate": 4.56021632247875e-05, + "loss": 0.0, + "step": 48417 + }, + { + "epoch": 4.517868806569003, + "grad_norm": NaN, + "learning_rate": 4.559673213853304e-05, + "loss": 0.0, + "step": 48418 + }, + { + "epoch": 4.51796211626388, + "grad_norm": NaN, + "learning_rate": 4.55913013177452e-05, + "loss": 0.0, + "step": 48419 + }, + { + "epoch": 4.518055425958757, + "grad_norm": NaN, + "learning_rate": 4.558587076243766e-05, + "loss": 0.0, + "step": 48420 + }, + { + "epoch": 4.518148735653634, + "grad_norm": NaN, + "learning_rate": 4.558044047262438e-05, + "loss": 0.0, + "step": 48421 + }, + { + "epoch": 4.5182420453485115, + "grad_norm": NaN, + "learning_rate": 4.557501044831915e-05, + "loss": 0.0, + "step": 48422 + }, + { + "epoch": 4.518335355043389, + "grad_norm": NaN, + "learning_rate": 4.5569580689535624e-05, + "loss": 0.0, + "step": 48423 + }, + { + "epoch": 4.518428664738266, + "grad_norm": NaN, + "learning_rate": 4.556415119628779e-05, + "loss": 0.0, + "step": 48424 + }, + { + "epoch": 4.518521974433144, + "grad_norm": NaN, + "learning_rate": 4.5558721968589404e-05, + "loss": 0.0, + "step": 48425 + }, + { + "epoch": 4.518615284128021, + "grad_norm": NaN, + "learning_rate": 4.555329300645418e-05, + "loss": 0.0, + "step": 48426 + }, + { + "epoch": 4.518708593822899, + "grad_norm": NaN, + "learning_rate": 4.554786430989603e-05, + "loss": 0.0, + "step": 48427 + }, + { + "epoch": 4.518801903517775, + "grad_norm": NaN, + "learning_rate": 4.5542435878928774e-05, + "loss": 0.0, + "step": 48428 + }, + { + "epoch": 4.5188952132126525, + "grad_norm": NaN, + "learning_rate": 4.553700771356606e-05, + "loss": 0.0, + "step": 48429 + }, + { + "epoch": 4.51898852290753, + "grad_norm": NaN, + "learning_rate": 4.553157981382184e-05, + "loss": 0.0, + "step": 48430 + }, + { + "epoch": 4.519081832602407, + "grad_norm": NaN, + "learning_rate": 4.552615217970987e-05, + "loss": 0.0, + "step": 48431 + }, + { + "epoch": 4.519175142297285, + "grad_norm": NaN, + "learning_rate": 4.552072481124393e-05, + "loss": 0.0, + "step": 48432 + }, + { + "epoch": 4.519268451992162, + "grad_norm": NaN, + "learning_rate": 4.5515297708437865e-05, + "loss": 0.0, + "step": 48433 + }, + { + "epoch": 4.51936176168704, + "grad_norm": NaN, + "learning_rate": 4.550987087130543e-05, + "loss": 0.0, + "step": 48434 + }, + { + "epoch": 4.519455071381916, + "grad_norm": NaN, + "learning_rate": 4.550444429986044e-05, + "loss": 0.0, + "step": 48435 + }, + { + "epoch": 4.519548381076794, + "grad_norm": NaN, + "learning_rate": 4.5499017994116755e-05, + "loss": 0.0, + "step": 48436 + }, + { + "epoch": 4.519641690771671, + "grad_norm": NaN, + "learning_rate": 4.5493591954088e-05, + "loss": 0.0, + "step": 48437 + }, + { + "epoch": 4.519735000466548, + "grad_norm": NaN, + "learning_rate": 4.548816617978813e-05, + "loss": 0.0, + "step": 48438 + }, + { + "epoch": 4.519828310161426, + "grad_norm": NaN, + "learning_rate": 4.548274067123096e-05, + "loss": 0.0, + "step": 48439 + }, + { + "epoch": 4.519921619856303, + "grad_norm": NaN, + "learning_rate": 4.5477315428430125e-05, + "loss": 0.0, + "step": 48440 + }, + { + "epoch": 4.520014929551181, + "grad_norm": NaN, + "learning_rate": 4.547189045139956e-05, + "loss": 0.0, + "step": 48441 + }, + { + "epoch": 4.520108239246058, + "grad_norm": NaN, + "learning_rate": 4.546646574015305e-05, + "loss": 0.0, + "step": 48442 + }, + { + "epoch": 4.520201548940935, + "grad_norm": NaN, + "learning_rate": 4.5461041294704266e-05, + "loss": 0.0, + "step": 48443 + }, + { + "epoch": 4.520294858635812, + "grad_norm": NaN, + "learning_rate": 4.5455617115067114e-05, + "loss": 0.0, + "step": 48444 + }, + { + "epoch": 4.5203881683306895, + "grad_norm": NaN, + "learning_rate": 4.545019320125543e-05, + "loss": 0.0, + "step": 48445 + }, + { + "epoch": 4.520481478025567, + "grad_norm": NaN, + "learning_rate": 4.544476955328285e-05, + "loss": 0.0, + "step": 48446 + }, + { + "epoch": 4.520574787720444, + "grad_norm": NaN, + "learning_rate": 4.5439346171163264e-05, + "loss": 0.0, + "step": 48447 + }, + { + "epoch": 4.520668097415322, + "grad_norm": NaN, + "learning_rate": 4.543392305491051e-05, + "loss": 0.0, + "step": 48448 + }, + { + "epoch": 4.520761407110199, + "grad_norm": NaN, + "learning_rate": 4.542850020453823e-05, + "loss": 0.0, + "step": 48449 + }, + { + "epoch": 4.520854716805076, + "grad_norm": NaN, + "learning_rate": 4.542307762006034e-05, + "loss": 0.0, + "step": 48450 + }, + { + "epoch": 4.520948026499953, + "grad_norm": NaN, + "learning_rate": 4.5417655301490624e-05, + "loss": 0.0, + "step": 48451 + }, + { + "epoch": 4.5210413361948305, + "grad_norm": NaN, + "learning_rate": 4.5412233248842744e-05, + "loss": 0.0, + "step": 48452 + }, + { + "epoch": 4.521134645889708, + "grad_norm": NaN, + "learning_rate": 4.540681146213062e-05, + "loss": 0.0, + "step": 48453 + }, + { + "epoch": 4.521227955584585, + "grad_norm": NaN, + "learning_rate": 4.540138994136804e-05, + "loss": 0.0, + "step": 48454 + }, + { + "epoch": 4.521321265279463, + "grad_norm": NaN, + "learning_rate": 4.539596868656865e-05, + "loss": 0.0, + "step": 48455 + }, + { + "epoch": 4.52141457497434, + "grad_norm": NaN, + "learning_rate": 4.5390547697746366e-05, + "loss": 0.0, + "step": 48456 + }, + { + "epoch": 4.521507884669218, + "grad_norm": NaN, + "learning_rate": 4.538512697491499e-05, + "loss": 0.0, + "step": 48457 + }, + { + "epoch": 4.521601194364094, + "grad_norm": NaN, + "learning_rate": 4.537970651808814e-05, + "loss": 0.0, + "step": 48458 + }, + { + "epoch": 4.5216945040589716, + "grad_norm": NaN, + "learning_rate": 4.537428632727976e-05, + "loss": 0.0, + "step": 48459 + }, + { + "epoch": 4.521787813753849, + "grad_norm": NaN, + "learning_rate": 4.5368866402503625e-05, + "loss": 0.0, + "step": 48460 + }, + { + "epoch": 4.521881123448726, + "grad_norm": NaN, + "learning_rate": 4.5363446743773355e-05, + "loss": 0.0, + "step": 48461 + }, + { + "epoch": 4.521974433143604, + "grad_norm": NaN, + "learning_rate": 4.535802735110292e-05, + "loss": 0.0, + "step": 48462 + }, + { + "epoch": 4.522067742838481, + "grad_norm": NaN, + "learning_rate": 4.5352608224506054e-05, + "loss": 0.0, + "step": 48463 + }, + { + "epoch": 4.522161052533358, + "grad_norm": NaN, + "learning_rate": 4.534718936399641e-05, + "loss": 0.0, + "step": 48464 + }, + { + "epoch": 4.522254362228235, + "grad_norm": NaN, + "learning_rate": 4.534177076958792e-05, + "loss": 0.0, + "step": 48465 + }, + { + "epoch": 4.522347671923113, + "grad_norm": NaN, + "learning_rate": 4.533635244129436e-05, + "loss": 0.0, + "step": 48466 + }, + { + "epoch": 4.52244098161799, + "grad_norm": NaN, + "learning_rate": 4.5330934379129335e-05, + "loss": 0.0, + "step": 48467 + }, + { + "epoch": 4.522534291312867, + "grad_norm": NaN, + "learning_rate": 4.5325516583106794e-05, + "loss": 0.0, + "step": 48468 + }, + { + "epoch": 4.522627601007745, + "grad_norm": NaN, + "learning_rate": 4.532009905324046e-05, + "loss": 0.0, + "step": 48469 + }, + { + "epoch": 4.522720910702622, + "grad_norm": NaN, + "learning_rate": 4.531468178954412e-05, + "loss": 0.0, + "step": 48470 + }, + { + "epoch": 4.5228142203975, + "grad_norm": NaN, + "learning_rate": 4.530926479203151e-05, + "loss": 0.0, + "step": 48471 + }, + { + "epoch": 4.522907530092376, + "grad_norm": NaN, + "learning_rate": 4.530384806071644e-05, + "loss": 0.0, + "step": 48472 + }, + { + "epoch": 4.523000839787254, + "grad_norm": NaN, + "learning_rate": 4.5298431595612675e-05, + "loss": 0.0, + "step": 48473 + }, + { + "epoch": 4.523094149482131, + "grad_norm": NaN, + "learning_rate": 4.529301539673396e-05, + "loss": 0.0, + "step": 48474 + }, + { + "epoch": 4.5231874591770085, + "grad_norm": NaN, + "learning_rate": 4.5287599464094116e-05, + "loss": 0.0, + "step": 48475 + }, + { + "epoch": 4.523280768871886, + "grad_norm": NaN, + "learning_rate": 4.528218379770689e-05, + "loss": 0.0, + "step": 48476 + }, + { + "epoch": 4.523374078566763, + "grad_norm": NaN, + "learning_rate": 4.5276768397586036e-05, + "loss": 0.0, + "step": 48477 + }, + { + "epoch": 4.523467388261641, + "grad_norm": NaN, + "learning_rate": 4.527135326374534e-05, + "loss": 0.0, + "step": 48478 + }, + { + "epoch": 4.523560697956517, + "grad_norm": NaN, + "learning_rate": 4.526593839619857e-05, + "loss": 0.0, + "step": 48479 + }, + { + "epoch": 4.523654007651395, + "grad_norm": NaN, + "learning_rate": 4.5260523794959545e-05, + "loss": 0.0, + "step": 48480 + }, + { + "epoch": 4.523747317346272, + "grad_norm": NaN, + "learning_rate": 4.525510946004189e-05, + "loss": 0.0, + "step": 48481 + }, + { + "epoch": 4.5238406270411495, + "grad_norm": NaN, + "learning_rate": 4.524969539145952e-05, + "loss": 0.0, + "step": 48482 + }, + { + "epoch": 4.523933936736027, + "grad_norm": NaN, + "learning_rate": 4.5244281589226175e-05, + "loss": 0.0, + "step": 48483 + }, + { + "epoch": 4.524027246430904, + "grad_norm": NaN, + "learning_rate": 4.523886805335549e-05, + "loss": 0.0, + "step": 48484 + }, + { + "epoch": 4.524120556125782, + "grad_norm": NaN, + "learning_rate": 4.52334547838614e-05, + "loss": 0.0, + "step": 48485 + }, + { + "epoch": 4.524213865820659, + "grad_norm": NaN, + "learning_rate": 4.5228041780757634e-05, + "loss": 0.0, + "step": 48486 + }, + { + "epoch": 4.524307175515536, + "grad_norm": NaN, + "learning_rate": 4.522262904405783e-05, + "loss": 0.0, + "step": 48487 + }, + { + "epoch": 4.524400485210413, + "grad_norm": NaN, + "learning_rate": 4.5217216573775905e-05, + "loss": 0.0, + "step": 48488 + }, + { + "epoch": 4.524493794905291, + "grad_norm": NaN, + "learning_rate": 4.5211804369925584e-05, + "loss": 0.0, + "step": 48489 + }, + { + "epoch": 4.524587104600168, + "grad_norm": NaN, + "learning_rate": 4.5206392432520535e-05, + "loss": 0.0, + "step": 48490 + }, + { + "epoch": 4.524680414295045, + "grad_norm": NaN, + "learning_rate": 4.5200980761574616e-05, + "loss": 0.0, + "step": 48491 + }, + { + "epoch": 4.524773723989923, + "grad_norm": NaN, + "learning_rate": 4.5195569357101596e-05, + "loss": 0.0, + "step": 48492 + }, + { + "epoch": 4.524867033684799, + "grad_norm": NaN, + "learning_rate": 4.519015821911512e-05, + "loss": 0.0, + "step": 48493 + }, + { + "epoch": 4.524960343379677, + "grad_norm": NaN, + "learning_rate": 4.5184747347629074e-05, + "loss": 0.0, + "step": 48494 + }, + { + "epoch": 4.525053653074554, + "grad_norm": NaN, + "learning_rate": 4.517933674265722e-05, + "loss": 0.0, + "step": 48495 + }, + { + "epoch": 4.525146962769432, + "grad_norm": NaN, + "learning_rate": 4.5173926404213144e-05, + "loss": 0.0, + "step": 48496 + }, + { + "epoch": 4.525240272464309, + "grad_norm": NaN, + "learning_rate": 4.5168516332310775e-05, + "loss": 0.0, + "step": 48497 + }, + { + "epoch": 4.5253335821591865, + "grad_norm": NaN, + "learning_rate": 4.516310652696389e-05, + "loss": 0.0, + "step": 48498 + }, + { + "epoch": 4.525426891854064, + "grad_norm": NaN, + "learning_rate": 4.515769698818605e-05, + "loss": 0.0, + "step": 48499 + }, + { + "epoch": 4.525520201548941, + "grad_norm": NaN, + "learning_rate": 4.515228771599118e-05, + "loss": 0.0, + "step": 48500 + }, + { + "epoch": 4.525613511243819, + "grad_norm": NaN, + "learning_rate": 4.514687871039303e-05, + "loss": 0.0, + "step": 48501 + }, + { + "epoch": 4.525706820938695, + "grad_norm": NaN, + "learning_rate": 4.514146997140522e-05, + "loss": 0.0, + "step": 48502 + }, + { + "epoch": 4.525800130633573, + "grad_norm": NaN, + "learning_rate": 4.5136061499041634e-05, + "loss": 0.0, + "step": 48503 + }, + { + "epoch": 4.52589344032845, + "grad_norm": NaN, + "learning_rate": 4.5130653293316034e-05, + "loss": 0.0, + "step": 48504 + }, + { + "epoch": 4.5259867500233275, + "grad_norm": NaN, + "learning_rate": 4.5125245354242015e-05, + "loss": 0.0, + "step": 48505 + }, + { + "epoch": 4.526080059718205, + "grad_norm": NaN, + "learning_rate": 4.5119837681833485e-05, + "loss": 0.0, + "step": 48506 + }, + { + "epoch": 4.526173369413082, + "grad_norm": NaN, + "learning_rate": 4.511443027610413e-05, + "loss": 0.0, + "step": 48507 + }, + { + "epoch": 4.526266679107959, + "grad_norm": NaN, + "learning_rate": 4.510902313706771e-05, + "loss": 0.0, + "step": 48508 + }, + { + "epoch": 4.526359988802836, + "grad_norm": NaN, + "learning_rate": 4.510361626473798e-05, + "loss": 0.0, + "step": 48509 + }, + { + "epoch": 4.526453298497714, + "grad_norm": NaN, + "learning_rate": 4.509820965912869e-05, + "loss": 0.0, + "step": 48510 + }, + { + "epoch": 4.526546608192591, + "grad_norm": NaN, + "learning_rate": 4.5092803320253574e-05, + "loss": 0.0, + "step": 48511 + }, + { + "epoch": 4.526639917887469, + "grad_norm": NaN, + "learning_rate": 4.5087397248126365e-05, + "loss": 0.0, + "step": 48512 + }, + { + "epoch": 4.526733227582346, + "grad_norm": NaN, + "learning_rate": 4.508199144276084e-05, + "loss": 0.0, + "step": 48513 + }, + { + "epoch": 4.526826537277223, + "grad_norm": NaN, + "learning_rate": 4.5076585904170734e-05, + "loss": 0.0, + "step": 48514 + }, + { + "epoch": 4.526919846972101, + "grad_norm": NaN, + "learning_rate": 4.5071180632369796e-05, + "loss": 0.0, + "step": 48515 + }, + { + "epoch": 4.527013156666977, + "grad_norm": NaN, + "learning_rate": 4.506577562737176e-05, + "loss": 0.0, + "step": 48516 + }, + { + "epoch": 4.527106466361855, + "grad_norm": NaN, + "learning_rate": 4.506037088919036e-05, + "loss": 0.0, + "step": 48517 + }, + { + "epoch": 4.527199776056732, + "grad_norm": NaN, + "learning_rate": 4.505496641783936e-05, + "loss": 0.0, + "step": 48518 + }, + { + "epoch": 4.52729308575161, + "grad_norm": NaN, + "learning_rate": 4.50495622133325e-05, + "loss": 0.0, + "step": 48519 + }, + { + "epoch": 4.527386395446487, + "grad_norm": NaN, + "learning_rate": 4.50441582756835e-05, + "loss": 0.0, + "step": 48520 + }, + { + "epoch": 4.527479705141364, + "grad_norm": NaN, + "learning_rate": 4.503875460490613e-05, + "loss": 0.0, + "step": 48521 + }, + { + "epoch": 4.527573014836242, + "grad_norm": NaN, + "learning_rate": 4.5033351201014106e-05, + "loss": 0.0, + "step": 48522 + }, + { + "epoch": 4.527666324531118, + "grad_norm": NaN, + "learning_rate": 4.502794806402117e-05, + "loss": 0.0, + "step": 48523 + }, + { + "epoch": 4.527759634225996, + "grad_norm": NaN, + "learning_rate": 4.5022545193941076e-05, + "loss": 0.0, + "step": 48524 + }, + { + "epoch": 4.527852943920873, + "grad_norm": NaN, + "learning_rate": 4.501714259078756e-05, + "loss": 0.0, + "step": 48525 + }, + { + "epoch": 4.527946253615751, + "grad_norm": NaN, + "learning_rate": 4.501174025457435e-05, + "loss": 0.0, + "step": 48526 + }, + { + "epoch": 4.528039563310628, + "grad_norm": NaN, + "learning_rate": 4.500633818531521e-05, + "loss": 0.0, + "step": 48527 + }, + { + "epoch": 4.5281328730055055, + "grad_norm": NaN, + "learning_rate": 4.5000936383023775e-05, + "loss": 0.0, + "step": 48528 + }, + { + "epoch": 4.528226182700383, + "grad_norm": NaN, + "learning_rate": 4.499553484771391e-05, + "loss": 0.0, + "step": 48529 + }, + { + "epoch": 4.52831949239526, + "grad_norm": NaN, + "learning_rate": 4.4990133579399327e-05, + "loss": 0.0, + "step": 48530 + }, + { + "epoch": 4.528412802090137, + "grad_norm": NaN, + "learning_rate": 4.4984732578093634e-05, + "loss": 0.0, + "step": 48531 + }, + { + "epoch": 4.528506111785014, + "grad_norm": NaN, + "learning_rate": 4.4979331843810715e-05, + "loss": 0.0, + "step": 48532 + }, + { + "epoch": 4.528599421479892, + "grad_norm": NaN, + "learning_rate": 4.49739313765643e-05, + "loss": 0.0, + "step": 48533 + }, + { + "epoch": 4.528692731174769, + "grad_norm": NaN, + "learning_rate": 4.496853117636795e-05, + "loss": 0.0, + "step": 48534 + }, + { + "epoch": 4.5287860408696465, + "grad_norm": NaN, + "learning_rate": 4.4963131243235575e-05, + "loss": 0.0, + "step": 48535 + }, + { + "epoch": 4.528879350564524, + "grad_norm": NaN, + "learning_rate": 4.495773157718089e-05, + "loss": 0.0, + "step": 48536 + }, + { + "epoch": 4.5289726602594005, + "grad_norm": NaN, + "learning_rate": 4.495233217821747e-05, + "loss": 0.0, + "step": 48537 + }, + { + "epoch": 4.529065969954278, + "grad_norm": NaN, + "learning_rate": 4.494693304635921e-05, + "loss": 0.0, + "step": 48538 + }, + { + "epoch": 4.529159279649155, + "grad_norm": NaN, + "learning_rate": 4.494153418161984e-05, + "loss": 0.0, + "step": 48539 + }, + { + "epoch": 4.529252589344033, + "grad_norm": NaN, + "learning_rate": 4.4936135584012914e-05, + "loss": 0.0, + "step": 48540 + }, + { + "epoch": 4.52934589903891, + "grad_norm": NaN, + "learning_rate": 4.493073725355232e-05, + "loss": 0.0, + "step": 48541 + }, + { + "epoch": 4.529439208733788, + "grad_norm": NaN, + "learning_rate": 4.4925339190251794e-05, + "loss": 0.0, + "step": 48542 + }, + { + "epoch": 4.529532518428665, + "grad_norm": NaN, + "learning_rate": 4.491994139412491e-05, + "loss": 0.0, + "step": 48543 + }, + { + "epoch": 4.529625828123542, + "grad_norm": NaN, + "learning_rate": 4.491454386518552e-05, + "loss": 0.0, + "step": 48544 + }, + { + "epoch": 4.529719137818419, + "grad_norm": NaN, + "learning_rate": 4.490914660344734e-05, + "loss": 0.0, + "step": 48545 + }, + { + "epoch": 4.529812447513296, + "grad_norm": NaN, + "learning_rate": 4.490374960892405e-05, + "loss": 0.0, + "step": 48546 + }, + { + "epoch": 4.529905757208174, + "grad_norm": NaN, + "learning_rate": 4.4898352881629404e-05, + "loss": 0.0, + "step": 48547 + }, + { + "epoch": 4.529999066903051, + "grad_norm": NaN, + "learning_rate": 4.489295642157711e-05, + "loss": 0.0, + "step": 48548 + }, + { + "epoch": 4.530092376597929, + "grad_norm": NaN, + "learning_rate": 4.488756022878089e-05, + "loss": 0.0, + "step": 48549 + }, + { + "epoch": 4.530185686292806, + "grad_norm": NaN, + "learning_rate": 4.488216430325448e-05, + "loss": 0.0, + "step": 48550 + }, + { + "epoch": 4.5302789959876835, + "grad_norm": NaN, + "learning_rate": 4.487676864501158e-05, + "loss": 0.0, + "step": 48551 + }, + { + "epoch": 4.53037230568256, + "grad_norm": NaN, + "learning_rate": 4.487137325406591e-05, + "loss": 0.0, + "step": 48552 + }, + { + "epoch": 4.530465615377437, + "grad_norm": NaN, + "learning_rate": 4.48659781304312e-05, + "loss": 0.0, + "step": 48553 + }, + { + "epoch": 4.530558925072315, + "grad_norm": NaN, + "learning_rate": 4.486058327412117e-05, + "loss": 0.0, + "step": 48554 + }, + { + "epoch": 4.530652234767192, + "grad_norm": NaN, + "learning_rate": 4.485518868514954e-05, + "loss": 0.0, + "step": 48555 + }, + { + "epoch": 4.53074554446207, + "grad_norm": NaN, + "learning_rate": 4.484979436353e-05, + "loss": 0.0, + "step": 48556 + }, + { + "epoch": 4.530838854156947, + "grad_norm": NaN, + "learning_rate": 4.484440030927631e-05, + "loss": 0.0, + "step": 48557 + }, + { + "epoch": 4.5309321638518245, + "grad_norm": NaN, + "learning_rate": 4.4839006522402134e-05, + "loss": 0.0, + "step": 48558 + }, + { + "epoch": 4.531025473546702, + "grad_norm": NaN, + "learning_rate": 4.483361300292123e-05, + "loss": 0.0, + "step": 48559 + }, + { + "epoch": 4.5311187832415785, + "grad_norm": NaN, + "learning_rate": 4.482821975084729e-05, + "loss": 0.0, + "step": 48560 + }, + { + "epoch": 4.531212092936456, + "grad_norm": NaN, + "learning_rate": 4.4822826766194025e-05, + "loss": 0.0, + "step": 48561 + }, + { + "epoch": 4.531305402631333, + "grad_norm": NaN, + "learning_rate": 4.481743404897516e-05, + "loss": 0.0, + "step": 48562 + }, + { + "epoch": 4.531398712326211, + "grad_norm": NaN, + "learning_rate": 4.481204159920442e-05, + "loss": 0.0, + "step": 48563 + }, + { + "epoch": 4.531492022021088, + "grad_norm": NaN, + "learning_rate": 4.4806649416895474e-05, + "loss": 0.0, + "step": 48564 + }, + { + "epoch": 4.531585331715966, + "grad_norm": NaN, + "learning_rate": 4.480125750206207e-05, + "loss": 0.0, + "step": 48565 + }, + { + "epoch": 4.531678641410842, + "grad_norm": NaN, + "learning_rate": 4.4795865854717915e-05, + "loss": 0.0, + "step": 48566 + }, + { + "epoch": 4.5317719511057195, + "grad_norm": NaN, + "learning_rate": 4.4790474474876696e-05, + "loss": 0.0, + "step": 48567 + }, + { + "epoch": 4.531865260800597, + "grad_norm": NaN, + "learning_rate": 4.478508336255214e-05, + "loss": 0.0, + "step": 48568 + }, + { + "epoch": 4.531958570495474, + "grad_norm": NaN, + "learning_rate": 4.477969251775794e-05, + "loss": 0.0, + "step": 48569 + }, + { + "epoch": 4.532051880190352, + "grad_norm": NaN, + "learning_rate": 4.4774301940507836e-05, + "loss": 0.0, + "step": 48570 + }, + { + "epoch": 4.532145189885229, + "grad_norm": NaN, + "learning_rate": 4.476891163081554e-05, + "loss": 0.0, + "step": 48571 + }, + { + "epoch": 4.532238499580107, + "grad_norm": NaN, + "learning_rate": 4.476352158869464e-05, + "loss": 0.0, + "step": 48572 + }, + { + "epoch": 4.532331809274984, + "grad_norm": NaN, + "learning_rate": 4.475813181415899e-05, + "loss": 0.0, + "step": 48573 + }, + { + "epoch": 4.532425118969861, + "grad_norm": NaN, + "learning_rate": 4.475274230722227e-05, + "loss": 0.0, + "step": 48574 + }, + { + "epoch": 4.532518428664738, + "grad_norm": NaN, + "learning_rate": 4.474735306789805e-05, + "loss": 0.0, + "step": 48575 + }, + { + "epoch": 4.532611738359615, + "grad_norm": NaN, + "learning_rate": 4.4741964096200196e-05, + "loss": 0.0, + "step": 48576 + }, + { + "epoch": 4.532705048054493, + "grad_norm": NaN, + "learning_rate": 4.4736575392142386e-05, + "loss": 0.0, + "step": 48577 + }, + { + "epoch": 4.53279835774937, + "grad_norm": NaN, + "learning_rate": 4.473118695573818e-05, + "loss": 0.0, + "step": 48578 + }, + { + "epoch": 4.532891667444248, + "grad_norm": NaN, + "learning_rate": 4.4725798787001445e-05, + "loss": 0.0, + "step": 48579 + }, + { + "epoch": 4.532984977139125, + "grad_norm": NaN, + "learning_rate": 4.4720410885945865e-05, + "loss": 0.0, + "step": 48580 + }, + { + "epoch": 4.533078286834002, + "grad_norm": NaN, + "learning_rate": 4.471502325258499e-05, + "loss": 0.0, + "step": 48581 + }, + { + "epoch": 4.533171596528879, + "grad_norm": NaN, + "learning_rate": 4.470963588693269e-05, + "loss": 0.0, + "step": 48582 + }, + { + "epoch": 4.533264906223756, + "grad_norm": NaN, + "learning_rate": 4.470424878900258e-05, + "loss": 0.0, + "step": 48583 + }, + { + "epoch": 4.533358215918634, + "grad_norm": NaN, + "learning_rate": 4.469886195880837e-05, + "loss": 0.0, + "step": 48584 + }, + { + "epoch": 4.533451525613511, + "grad_norm": NaN, + "learning_rate": 4.469347539636376e-05, + "loss": 0.0, + "step": 48585 + }, + { + "epoch": 4.533544835308389, + "grad_norm": NaN, + "learning_rate": 4.468808910168247e-05, + "loss": 0.0, + "step": 48586 + }, + { + "epoch": 4.533638145003266, + "grad_norm": NaN, + "learning_rate": 4.468270307477816e-05, + "loss": 0.0, + "step": 48587 + }, + { + "epoch": 4.5337314546981435, + "grad_norm": NaN, + "learning_rate": 4.467731731566453e-05, + "loss": 0.0, + "step": 48588 + }, + { + "epoch": 4.53382476439302, + "grad_norm": NaN, + "learning_rate": 4.467193182435531e-05, + "loss": 0.0, + "step": 48589 + }, + { + "epoch": 4.5339180740878975, + "grad_norm": NaN, + "learning_rate": 4.4666546600864135e-05, + "loss": 0.0, + "step": 48590 + }, + { + "epoch": 4.534011383782775, + "grad_norm": NaN, + "learning_rate": 4.4661161645204754e-05, + "loss": 0.0, + "step": 48591 + }, + { + "epoch": 4.534104693477652, + "grad_norm": NaN, + "learning_rate": 4.465577695739082e-05, + "loss": 0.0, + "step": 48592 + }, + { + "epoch": 4.53419800317253, + "grad_norm": NaN, + "learning_rate": 4.465039253743605e-05, + "loss": 0.0, + "step": 48593 + }, + { + "epoch": 4.534291312867407, + "grad_norm": NaN, + "learning_rate": 4.464500838535412e-05, + "loss": 0.0, + "step": 48594 + }, + { + "epoch": 4.534384622562285, + "grad_norm": NaN, + "learning_rate": 4.463962450115872e-05, + "loss": 0.0, + "step": 48595 + }, + { + "epoch": 4.534477932257161, + "grad_norm": NaN, + "learning_rate": 4.4634240884863556e-05, + "loss": 0.0, + "step": 48596 + }, + { + "epoch": 4.5345712419520385, + "grad_norm": NaN, + "learning_rate": 4.46288575364823e-05, + "loss": 0.0, + "step": 48597 + }, + { + "epoch": 4.534664551646916, + "grad_norm": NaN, + "learning_rate": 4.462347445602865e-05, + "loss": 0.0, + "step": 48598 + }, + { + "epoch": 4.534757861341793, + "grad_norm": NaN, + "learning_rate": 4.4618091643516274e-05, + "loss": 0.0, + "step": 48599 + }, + { + "epoch": 4.534851171036671, + "grad_norm": NaN, + "learning_rate": 4.461270909895889e-05, + "loss": 0.0, + "step": 48600 + }, + { + "epoch": 4.534944480731548, + "grad_norm": NaN, + "learning_rate": 4.4607326822370174e-05, + "loss": 0.0, + "step": 48601 + }, + { + "epoch": 4.535037790426426, + "grad_norm": NaN, + "learning_rate": 4.4601944813763795e-05, + "loss": 0.0, + "step": 48602 + }, + { + "epoch": 4.535131100121303, + "grad_norm": NaN, + "learning_rate": 4.459656307315344e-05, + "loss": 0.0, + "step": 48603 + }, + { + "epoch": 4.53522440981618, + "grad_norm": NaN, + "learning_rate": 4.459118160055282e-05, + "loss": 0.0, + "step": 48604 + }, + { + "epoch": 4.535317719511057, + "grad_norm": NaN, + "learning_rate": 4.458580039597559e-05, + "loss": 0.0, + "step": 48605 + }, + { + "epoch": 4.535411029205934, + "grad_norm": NaN, + "learning_rate": 4.458041945943545e-05, + "loss": 0.0, + "step": 48606 + }, + { + "epoch": 4.535504338900812, + "grad_norm": NaN, + "learning_rate": 4.457503879094609e-05, + "loss": 0.0, + "step": 48607 + }, + { + "epoch": 4.535597648595689, + "grad_norm": NaN, + "learning_rate": 4.4569658390521145e-05, + "loss": 0.0, + "step": 48608 + }, + { + "epoch": 4.535690958290567, + "grad_norm": NaN, + "learning_rate": 4.4564278258174344e-05, + "loss": 0.0, + "step": 48609 + }, + { + "epoch": 4.535784267985443, + "grad_norm": NaN, + "learning_rate": 4.455889839391936e-05, + "loss": 0.0, + "step": 48610 + }, + { + "epoch": 4.535877577680321, + "grad_norm": NaN, + "learning_rate": 4.455351879776985e-05, + "loss": 0.0, + "step": 48611 + }, + { + "epoch": 4.535970887375198, + "grad_norm": NaN, + "learning_rate": 4.454813946973951e-05, + "loss": 0.0, + "step": 48612 + }, + { + "epoch": 4.5360641970700755, + "grad_norm": NaN, + "learning_rate": 4.454276040984203e-05, + "loss": 0.0, + "step": 48613 + }, + { + "epoch": 4.536157506764953, + "grad_norm": NaN, + "learning_rate": 4.453738161809105e-05, + "loss": 0.0, + "step": 48614 + }, + { + "epoch": 4.53625081645983, + "grad_norm": NaN, + "learning_rate": 4.453200309450032e-05, + "loss": 0.0, + "step": 48615 + }, + { + "epoch": 4.536344126154708, + "grad_norm": NaN, + "learning_rate": 4.452662483908337e-05, + "loss": 0.0, + "step": 48616 + }, + { + "epoch": 4.536437435849585, + "grad_norm": NaN, + "learning_rate": 4.452124685185402e-05, + "loss": 0.0, + "step": 48617 + }, + { + "epoch": 4.536530745544463, + "grad_norm": NaN, + "learning_rate": 4.45158691328259e-05, + "loss": 0.0, + "step": 48618 + }, + { + "epoch": 4.536624055239339, + "grad_norm": NaN, + "learning_rate": 4.451049168201266e-05, + "loss": 0.0, + "step": 48619 + }, + { + "epoch": 4.5367173649342165, + "grad_norm": NaN, + "learning_rate": 4.4505114499428014e-05, + "loss": 0.0, + "step": 48620 + }, + { + "epoch": 4.536810674629094, + "grad_norm": NaN, + "learning_rate": 4.4499737585085594e-05, + "loss": 0.0, + "step": 48621 + }, + { + "epoch": 4.536903984323971, + "grad_norm": NaN, + "learning_rate": 4.449436093899909e-05, + "loss": 0.0, + "step": 48622 + }, + { + "epoch": 4.536997294018849, + "grad_norm": NaN, + "learning_rate": 4.448898456118218e-05, + "loss": 0.0, + "step": 48623 + }, + { + "epoch": 4.537090603713726, + "grad_norm": NaN, + "learning_rate": 4.448360845164853e-05, + "loss": 0.0, + "step": 48624 + }, + { + "epoch": 4.537183913408603, + "grad_norm": NaN, + "learning_rate": 4.44782326104118e-05, + "loss": 0.0, + "step": 48625 + }, + { + "epoch": 4.53727722310348, + "grad_norm": NaN, + "learning_rate": 4.447285703748567e-05, + "loss": 0.0, + "step": 48626 + }, + { + "epoch": 4.537370532798358, + "grad_norm": NaN, + "learning_rate": 4.446748173288381e-05, + "loss": 0.0, + "step": 48627 + }, + { + "epoch": 4.537463842493235, + "grad_norm": NaN, + "learning_rate": 4.4462106696619885e-05, + "loss": 0.0, + "step": 48628 + }, + { + "epoch": 4.537557152188112, + "grad_norm": NaN, + "learning_rate": 4.4456731928707546e-05, + "loss": 0.0, + "step": 48629 + }, + { + "epoch": 4.53765046188299, + "grad_norm": NaN, + "learning_rate": 4.4451357429160486e-05, + "loss": 0.0, + "step": 48630 + }, + { + "epoch": 4.537743771577867, + "grad_norm": NaN, + "learning_rate": 4.4445983197992353e-05, + "loss": 0.0, + "step": 48631 + }, + { + "epoch": 4.537837081272745, + "grad_norm": NaN, + "learning_rate": 4.444060923521681e-05, + "loss": 0.0, + "step": 48632 + }, + { + "epoch": 4.537930390967621, + "grad_norm": NaN, + "learning_rate": 4.443523554084754e-05, + "loss": 0.0, + "step": 48633 + }, + { + "epoch": 4.538023700662499, + "grad_norm": NaN, + "learning_rate": 4.44298621148982e-05, + "loss": 0.0, + "step": 48634 + }, + { + "epoch": 4.538117010357376, + "grad_norm": NaN, + "learning_rate": 4.442448895738242e-05, + "loss": 0.0, + "step": 48635 + }, + { + "epoch": 4.538210320052253, + "grad_norm": NaN, + "learning_rate": 4.441911606831392e-05, + "loss": 0.0, + "step": 48636 + }, + { + "epoch": 4.538303629747131, + "grad_norm": NaN, + "learning_rate": 4.4413743447706334e-05, + "loss": 0.0, + "step": 48637 + }, + { + "epoch": 4.538396939442008, + "grad_norm": NaN, + "learning_rate": 4.440837109557331e-05, + "loss": 0.0, + "step": 48638 + }, + { + "epoch": 4.538490249136886, + "grad_norm": NaN, + "learning_rate": 4.440299901192851e-05, + "loss": 0.0, + "step": 48639 + }, + { + "epoch": 4.538583558831762, + "grad_norm": NaN, + "learning_rate": 4.4397627196785626e-05, + "loss": 0.0, + "step": 48640 + }, + { + "epoch": 4.53867686852664, + "grad_norm": NaN, + "learning_rate": 4.4392255650158274e-05, + "loss": 0.0, + "step": 48641 + }, + { + "epoch": 4.538770178221517, + "grad_norm": NaN, + "learning_rate": 4.438688437206015e-05, + "loss": 0.0, + "step": 48642 + }, + { + "epoch": 4.5388634879163945, + "grad_norm": NaN, + "learning_rate": 4.4381513362504884e-05, + "loss": 0.0, + "step": 48643 + }, + { + "epoch": 4.538956797611272, + "grad_norm": NaN, + "learning_rate": 4.4376142621506134e-05, + "loss": 0.0, + "step": 48644 + }, + { + "epoch": 4.539050107306149, + "grad_norm": NaN, + "learning_rate": 4.4370772149077584e-05, + "loss": 0.0, + "step": 48645 + }, + { + "epoch": 4.539143417001027, + "grad_norm": NaN, + "learning_rate": 4.4365401945232855e-05, + "loss": 0.0, + "step": 48646 + }, + { + "epoch": 4.539236726695904, + "grad_norm": NaN, + "learning_rate": 4.4360032009985634e-05, + "loss": 0.0, + "step": 48647 + }, + { + "epoch": 4.539330036390781, + "grad_norm": NaN, + "learning_rate": 4.435466234334955e-05, + "loss": 0.0, + "step": 48648 + }, + { + "epoch": 4.539423346085658, + "grad_norm": NaN, + "learning_rate": 4.434929294533825e-05, + "loss": 0.0, + "step": 48649 + }, + { + "epoch": 4.5395166557805355, + "grad_norm": NaN, + "learning_rate": 4.434392381596542e-05, + "loss": 0.0, + "step": 48650 + }, + { + "epoch": 4.539609965475413, + "grad_norm": NaN, + "learning_rate": 4.433855495524469e-05, + "loss": 0.0, + "step": 48651 + }, + { + "epoch": 4.53970327517029, + "grad_norm": NaN, + "learning_rate": 4.433318636318973e-05, + "loss": 0.0, + "step": 48652 + }, + { + "epoch": 4.539796584865168, + "grad_norm": NaN, + "learning_rate": 4.432781803981416e-05, + "loss": 0.0, + "step": 48653 + }, + { + "epoch": 4.539889894560044, + "grad_norm": NaN, + "learning_rate": 4.432244998513165e-05, + "loss": 0.0, + "step": 48654 + }, + { + "epoch": 4.539983204254922, + "grad_norm": NaN, + "learning_rate": 4.431708219915584e-05, + "loss": 0.0, + "step": 48655 + }, + { + "epoch": 4.540076513949799, + "grad_norm": NaN, + "learning_rate": 4.431171468190038e-05, + "loss": 0.0, + "step": 48656 + }, + { + "epoch": 4.540169823644677, + "grad_norm": NaN, + "learning_rate": 4.4306347433378934e-05, + "loss": 0.0, + "step": 48657 + }, + { + "epoch": 4.540263133339554, + "grad_norm": NaN, + "learning_rate": 4.430098045360513e-05, + "loss": 0.0, + "step": 48658 + }, + { + "epoch": 4.540356443034431, + "grad_norm": NaN, + "learning_rate": 4.429561374259263e-05, + "loss": 0.0, + "step": 48659 + }, + { + "epoch": 4.540449752729309, + "grad_norm": NaN, + "learning_rate": 4.429024730035508e-05, + "loss": 0.0, + "step": 48660 + }, + { + "epoch": 4.540543062424186, + "grad_norm": NaN, + "learning_rate": 4.4284881126906096e-05, + "loss": 0.0, + "step": 48661 + }, + { + "epoch": 4.540636372119063, + "grad_norm": NaN, + "learning_rate": 4.427951522225937e-05, + "loss": 0.0, + "step": 48662 + }, + { + "epoch": 4.54072968181394, + "grad_norm": NaN, + "learning_rate": 4.427414958642851e-05, + "loss": 0.0, + "step": 48663 + }, + { + "epoch": 4.540822991508818, + "grad_norm": NaN, + "learning_rate": 4.426878421942715e-05, + "loss": 0.0, + "step": 48664 + }, + { + "epoch": 4.540916301203695, + "grad_norm": NaN, + "learning_rate": 4.426341912126896e-05, + "loss": 0.0, + "step": 48665 + }, + { + "epoch": 4.5410096108985725, + "grad_norm": NaN, + "learning_rate": 4.425805429196759e-05, + "loss": 0.0, + "step": 48666 + }, + { + "epoch": 4.54110292059345, + "grad_norm": NaN, + "learning_rate": 4.425268973153665e-05, + "loss": 0.0, + "step": 48667 + }, + { + "epoch": 4.541196230288327, + "grad_norm": NaN, + "learning_rate": 4.424732543998981e-05, + "loss": 0.0, + "step": 48668 + }, + { + "epoch": 4.541289539983204, + "grad_norm": NaN, + "learning_rate": 4.424196141734068e-05, + "loss": 0.0, + "step": 48669 + }, + { + "epoch": 4.541382849678081, + "grad_norm": NaN, + "learning_rate": 4.4236597663602915e-05, + "loss": 0.0, + "step": 48670 + }, + { + "epoch": 4.541476159372959, + "grad_norm": NaN, + "learning_rate": 4.4231234178790174e-05, + "loss": 0.0, + "step": 48671 + }, + { + "epoch": 4.541569469067836, + "grad_norm": NaN, + "learning_rate": 4.4225870962916044e-05, + "loss": 0.0, + "step": 48672 + }, + { + "epoch": 4.5416627787627135, + "grad_norm": NaN, + "learning_rate": 4.422050801599421e-05, + "loss": 0.0, + "step": 48673 + }, + { + "epoch": 4.541756088457591, + "grad_norm": NaN, + "learning_rate": 4.421514533803828e-05, + "loss": 0.0, + "step": 48674 + }, + { + "epoch": 4.541849398152468, + "grad_norm": NaN, + "learning_rate": 4.42097829290619e-05, + "loss": 0.0, + "step": 48675 + }, + { + "epoch": 4.541942707847346, + "grad_norm": NaN, + "learning_rate": 4.420442078907872e-05, + "loss": 0.0, + "step": 48676 + }, + { + "epoch": 4.542036017542222, + "grad_norm": NaN, + "learning_rate": 4.4199058918102346e-05, + "loss": 0.0, + "step": 48677 + }, + { + "epoch": 4.5421293272371, + "grad_norm": NaN, + "learning_rate": 4.419369731614642e-05, + "loss": 0.0, + "step": 48678 + }, + { + "epoch": 4.542222636931977, + "grad_norm": NaN, + "learning_rate": 4.418833598322459e-05, + "loss": 0.0, + "step": 48679 + }, + { + "epoch": 4.542315946626855, + "grad_norm": NaN, + "learning_rate": 4.4182974919350475e-05, + "loss": 0.0, + "step": 48680 + }, + { + "epoch": 4.542409256321732, + "grad_norm": NaN, + "learning_rate": 4.4177614124537716e-05, + "loss": 0.0, + "step": 48681 + }, + { + "epoch": 4.542502566016609, + "grad_norm": NaN, + "learning_rate": 4.4172253598799915e-05, + "loss": 0.0, + "step": 48682 + }, + { + "epoch": 4.542595875711486, + "grad_norm": NaN, + "learning_rate": 4.416689334215074e-05, + "loss": 0.0, + "step": 48683 + }, + { + "epoch": 4.542689185406363, + "grad_norm": NaN, + "learning_rate": 4.41615333546038e-05, + "loss": 0.0, + "step": 48684 + }, + { + "epoch": 4.542782495101241, + "grad_norm": NaN, + "learning_rate": 4.4156173636172744e-05, + "loss": 0.0, + "step": 48685 + }, + { + "epoch": 4.542875804796118, + "grad_norm": NaN, + "learning_rate": 4.415081418687117e-05, + "loss": 0.0, + "step": 48686 + }, + { + "epoch": 4.542969114490996, + "grad_norm": NaN, + "learning_rate": 4.414545500671272e-05, + "loss": 0.0, + "step": 48687 + }, + { + "epoch": 4.543062424185873, + "grad_norm": NaN, + "learning_rate": 4.4140096095711034e-05, + "loss": 0.0, + "step": 48688 + }, + { + "epoch": 4.5431557338807504, + "grad_norm": NaN, + "learning_rate": 4.413473745387972e-05, + "loss": 0.0, + "step": 48689 + }, + { + "epoch": 4.543249043575628, + "grad_norm": NaN, + "learning_rate": 4.412937908123241e-05, + "loss": 0.0, + "step": 48690 + }, + { + "epoch": 4.543342353270505, + "grad_norm": NaN, + "learning_rate": 4.412402097778272e-05, + "loss": 0.0, + "step": 48691 + }, + { + "epoch": 4.543435662965382, + "grad_norm": NaN, + "learning_rate": 4.4118663143544296e-05, + "loss": 0.0, + "step": 48692 + }, + { + "epoch": 4.543528972660259, + "grad_norm": NaN, + "learning_rate": 4.411330557853073e-05, + "loss": 0.0, + "step": 48693 + }, + { + "epoch": 4.543622282355137, + "grad_norm": NaN, + "learning_rate": 4.410794828275567e-05, + "loss": 0.0, + "step": 48694 + }, + { + "epoch": 4.543715592050014, + "grad_norm": NaN, + "learning_rate": 4.410259125623274e-05, + "loss": 0.0, + "step": 48695 + }, + { + "epoch": 4.5438089017448915, + "grad_norm": NaN, + "learning_rate": 4.409723449897554e-05, + "loss": 0.0, + "step": 48696 + }, + { + "epoch": 4.543902211439769, + "grad_norm": NaN, + "learning_rate": 4.40918780109977e-05, + "loss": 0.0, + "step": 48697 + }, + { + "epoch": 4.543995521134645, + "grad_norm": NaN, + "learning_rate": 4.408652179231285e-05, + "loss": 0.0, + "step": 48698 + }, + { + "epoch": 4.544088830829523, + "grad_norm": NaN, + "learning_rate": 4.4081165842934594e-05, + "loss": 0.0, + "step": 48699 + }, + { + "epoch": 4.5441821405244, + "grad_norm": NaN, + "learning_rate": 4.407581016287656e-05, + "loss": 0.0, + "step": 48700 + }, + { + "epoch": 4.544275450219278, + "grad_norm": NaN, + "learning_rate": 4.4070454752152354e-05, + "loss": 0.0, + "step": 48701 + }, + { + "epoch": 4.544368759914155, + "grad_norm": NaN, + "learning_rate": 4.4065099610775614e-05, + "loss": 0.0, + "step": 48702 + }, + { + "epoch": 4.5444620696090325, + "grad_norm": NaN, + "learning_rate": 4.4059744738759946e-05, + "loss": 0.0, + "step": 48703 + }, + { + "epoch": 4.54455537930391, + "grad_norm": NaN, + "learning_rate": 4.405439013611896e-05, + "loss": 0.0, + "step": 48704 + }, + { + "epoch": 4.544648688998787, + "grad_norm": NaN, + "learning_rate": 4.404903580286629e-05, + "loss": 0.0, + "step": 48705 + }, + { + "epoch": 4.544741998693664, + "grad_norm": NaN, + "learning_rate": 4.404368173901552e-05, + "loss": 0.0, + "step": 48706 + }, + { + "epoch": 4.544835308388541, + "grad_norm": NaN, + "learning_rate": 4.4038327944580286e-05, + "loss": 0.0, + "step": 48707 + }, + { + "epoch": 4.544928618083419, + "grad_norm": NaN, + "learning_rate": 4.4032974419574205e-05, + "loss": 0.0, + "step": 48708 + }, + { + "epoch": 4.545021927778296, + "grad_norm": NaN, + "learning_rate": 4.402762116401087e-05, + "loss": 0.0, + "step": 48709 + }, + { + "epoch": 4.545115237473174, + "grad_norm": NaN, + "learning_rate": 4.402226817790391e-05, + "loss": 0.0, + "step": 48710 + }, + { + "epoch": 4.545208547168051, + "grad_norm": NaN, + "learning_rate": 4.4016915461266926e-05, + "loss": 0.0, + "step": 48711 + }, + { + "epoch": 4.545301856862928, + "grad_norm": NaN, + "learning_rate": 4.401156301411353e-05, + "loss": 0.0, + "step": 48712 + }, + { + "epoch": 4.545395166557805, + "grad_norm": NaN, + "learning_rate": 4.400621083645733e-05, + "loss": 0.0, + "step": 48713 + }, + { + "epoch": 4.545488476252682, + "grad_norm": NaN, + "learning_rate": 4.400085892831196e-05, + "loss": 0.0, + "step": 48714 + }, + { + "epoch": 4.54558178594756, + "grad_norm": NaN, + "learning_rate": 4.399550728969098e-05, + "loss": 0.0, + "step": 48715 + }, + { + "epoch": 4.545675095642437, + "grad_norm": NaN, + "learning_rate": 4.399015592060802e-05, + "loss": 0.0, + "step": 48716 + }, + { + "epoch": 4.545768405337315, + "grad_norm": NaN, + "learning_rate": 4.398480482107671e-05, + "loss": 0.0, + "step": 48717 + }, + { + "epoch": 4.545861715032192, + "grad_norm": NaN, + "learning_rate": 4.397945399111063e-05, + "loss": 0.0, + "step": 48718 + }, + { + "epoch": 4.5459550247270695, + "grad_norm": NaN, + "learning_rate": 4.3974103430723394e-05, + "loss": 0.0, + "step": 48719 + }, + { + "epoch": 4.546048334421947, + "grad_norm": NaN, + "learning_rate": 4.3968753139928606e-05, + "loss": 0.0, + "step": 48720 + }, + { + "epoch": 4.546141644116823, + "grad_norm": NaN, + "learning_rate": 4.396340311873988e-05, + "loss": 0.0, + "step": 48721 + }, + { + "epoch": 4.546234953811701, + "grad_norm": NaN, + "learning_rate": 4.3958053367170785e-05, + "loss": 0.0, + "step": 48722 + }, + { + "epoch": 4.546328263506578, + "grad_norm": NaN, + "learning_rate": 4.3952703885234965e-05, + "loss": 0.0, + "step": 48723 + }, + { + "epoch": 4.546421573201456, + "grad_norm": NaN, + "learning_rate": 4.394735467294601e-05, + "loss": 0.0, + "step": 48724 + }, + { + "epoch": 4.546514882896333, + "grad_norm": NaN, + "learning_rate": 4.39420057303175e-05, + "loss": 0.0, + "step": 48725 + }, + { + "epoch": 4.5466081925912105, + "grad_norm": NaN, + "learning_rate": 4.3936657057363075e-05, + "loss": 0.0, + "step": 48726 + }, + { + "epoch": 4.546701502286087, + "grad_norm": NaN, + "learning_rate": 4.393130865409629e-05, + "loss": 0.0, + "step": 48727 + }, + { + "epoch": 4.5467948119809645, + "grad_norm": NaN, + "learning_rate": 4.3925960520530786e-05, + "loss": 0.0, + "step": 48728 + }, + { + "epoch": 4.546888121675842, + "grad_norm": NaN, + "learning_rate": 4.392061265668014e-05, + "loss": 0.0, + "step": 48729 + }, + { + "epoch": 4.546981431370719, + "grad_norm": NaN, + "learning_rate": 4.3915265062557944e-05, + "loss": 0.0, + "step": 48730 + }, + { + "epoch": 4.547074741065597, + "grad_norm": NaN, + "learning_rate": 4.39099177381778e-05, + "loss": 0.0, + "step": 48731 + }, + { + "epoch": 4.547168050760474, + "grad_norm": NaN, + "learning_rate": 4.390457068355333e-05, + "loss": 0.0, + "step": 48732 + }, + { + "epoch": 4.547261360455352, + "grad_norm": NaN, + "learning_rate": 4.3899223898698085e-05, + "loss": 0.0, + "step": 48733 + }, + { + "epoch": 4.547354670150229, + "grad_norm": NaN, + "learning_rate": 4.3893877383625695e-05, + "loss": 0.0, + "step": 48734 + }, + { + "epoch": 4.547447979845106, + "grad_norm": NaN, + "learning_rate": 4.388853113834975e-05, + "loss": 0.0, + "step": 48735 + }, + { + "epoch": 4.547541289539983, + "grad_norm": NaN, + "learning_rate": 4.388318516288384e-05, + "loss": 0.0, + "step": 48736 + }, + { + "epoch": 4.54763459923486, + "grad_norm": NaN, + "learning_rate": 4.387783945724154e-05, + "loss": 0.0, + "step": 48737 + }, + { + "epoch": 4.547727908929738, + "grad_norm": NaN, + "learning_rate": 4.387249402143647e-05, + "loss": 0.0, + "step": 48738 + }, + { + "epoch": 4.547821218624615, + "grad_norm": NaN, + "learning_rate": 4.38671488554822e-05, + "loss": 0.0, + "step": 48739 + }, + { + "epoch": 4.547914528319493, + "grad_norm": NaN, + "learning_rate": 4.3861803959392344e-05, + "loss": 0.0, + "step": 48740 + }, + { + "epoch": 4.54800783801437, + "grad_norm": NaN, + "learning_rate": 4.3856459333180475e-05, + "loss": 0.0, + "step": 48741 + }, + { + "epoch": 4.548101147709247, + "grad_norm": NaN, + "learning_rate": 4.3851114976860183e-05, + "loss": 0.0, + "step": 48742 + }, + { + "epoch": 4.548194457404124, + "grad_norm": NaN, + "learning_rate": 4.384577089044508e-05, + "loss": 0.0, + "step": 48743 + }, + { + "epoch": 4.548287767099001, + "grad_norm": NaN, + "learning_rate": 4.384042707394872e-05, + "loss": 0.0, + "step": 48744 + }, + { + "epoch": 4.548381076793879, + "grad_norm": NaN, + "learning_rate": 4.3835083527384705e-05, + "loss": 0.0, + "step": 48745 + }, + { + "epoch": 4.548474386488756, + "grad_norm": NaN, + "learning_rate": 4.3829740250766626e-05, + "loss": 0.0, + "step": 48746 + }, + { + "epoch": 4.548567696183634, + "grad_norm": NaN, + "learning_rate": 4.382439724410807e-05, + "loss": 0.0, + "step": 48747 + }, + { + "epoch": 4.548661005878511, + "grad_norm": NaN, + "learning_rate": 4.381905450742262e-05, + "loss": 0.0, + "step": 48748 + }, + { + "epoch": 4.5487543155733885, + "grad_norm": NaN, + "learning_rate": 4.381371204072387e-05, + "loss": 0.0, + "step": 48749 + }, + { + "epoch": 4.548847625268265, + "grad_norm": NaN, + "learning_rate": 4.380836984402538e-05, + "loss": 0.0, + "step": 48750 + }, + { + "epoch": 4.5489409349631424, + "grad_norm": NaN, + "learning_rate": 4.380302791734076e-05, + "loss": 0.0, + "step": 48751 + }, + { + "epoch": 4.54903424465802, + "grad_norm": NaN, + "learning_rate": 4.3797686260683576e-05, + "loss": 0.0, + "step": 48752 + }, + { + "epoch": 4.549127554352897, + "grad_norm": NaN, + "learning_rate": 4.379234487406742e-05, + "loss": 0.0, + "step": 48753 + }, + { + "epoch": 4.549220864047775, + "grad_norm": NaN, + "learning_rate": 4.3787003757505875e-05, + "loss": 0.0, + "step": 48754 + }, + { + "epoch": 4.549314173742652, + "grad_norm": NaN, + "learning_rate": 4.378166291101251e-05, + "loss": 0.0, + "step": 48755 + }, + { + "epoch": 4.5494074834375295, + "grad_norm": NaN, + "learning_rate": 4.377632233460092e-05, + "loss": 0.0, + "step": 48756 + }, + { + "epoch": 4.549500793132406, + "grad_norm": NaN, + "learning_rate": 4.377098202828466e-05, + "loss": 0.0, + "step": 48757 + }, + { + "epoch": 4.5495941028272835, + "grad_norm": NaN, + "learning_rate": 4.3765641992077336e-05, + "loss": 0.0, + "step": 48758 + }, + { + "epoch": 4.549687412522161, + "grad_norm": NaN, + "learning_rate": 4.3760302225992524e-05, + "loss": 0.0, + "step": 48759 + }, + { + "epoch": 4.549780722217038, + "grad_norm": NaN, + "learning_rate": 4.3754962730043785e-05, + "loss": 0.0, + "step": 48760 + }, + { + "epoch": 4.549874031911916, + "grad_norm": NaN, + "learning_rate": 4.37496235042447e-05, + "loss": 0.0, + "step": 48761 + }, + { + "epoch": 4.549967341606793, + "grad_norm": NaN, + "learning_rate": 4.3744284548608844e-05, + "loss": 0.0, + "step": 48762 + }, + { + "epoch": 4.550060651301671, + "grad_norm": NaN, + "learning_rate": 4.3738945863149815e-05, + "loss": 0.0, + "step": 48763 + }, + { + "epoch": 4.550153960996548, + "grad_norm": NaN, + "learning_rate": 4.3733607447881156e-05, + "loss": 0.0, + "step": 48764 + }, + { + "epoch": 4.5502472706914245, + "grad_norm": NaN, + "learning_rate": 4.372826930281646e-05, + "loss": 0.0, + "step": 48765 + }, + { + "epoch": 4.550340580386302, + "grad_norm": NaN, + "learning_rate": 4.372293142796929e-05, + "loss": 0.0, + "step": 48766 + }, + { + "epoch": 4.550433890081179, + "grad_norm": NaN, + "learning_rate": 4.371759382335323e-05, + "loss": 0.0, + "step": 48767 + }, + { + "epoch": 4.550527199776057, + "grad_norm": NaN, + "learning_rate": 4.3712256488981845e-05, + "loss": 0.0, + "step": 48768 + }, + { + "epoch": 4.550620509470934, + "grad_norm": NaN, + "learning_rate": 4.370691942486869e-05, + "loss": 0.0, + "step": 48769 + }, + { + "epoch": 4.550713819165812, + "grad_norm": NaN, + "learning_rate": 4.3701582631027375e-05, + "loss": 0.0, + "step": 48770 + }, + { + "epoch": 4.550807128860688, + "grad_norm": NaN, + "learning_rate": 4.369624610747144e-05, + "loss": 0.0, + "step": 48771 + }, + { + "epoch": 4.550900438555566, + "grad_norm": NaN, + "learning_rate": 4.369090985421447e-05, + "loss": 0.0, + "step": 48772 + }, + { + "epoch": 4.550993748250443, + "grad_norm": NaN, + "learning_rate": 4.368557387127e-05, + "loss": 0.0, + "step": 48773 + }, + { + "epoch": 4.55108705794532, + "grad_norm": NaN, + "learning_rate": 4.368023815865165e-05, + "loss": 0.0, + "step": 48774 + }, + { + "epoch": 4.551180367640198, + "grad_norm": NaN, + "learning_rate": 4.367490271637295e-05, + "loss": 0.0, + "step": 48775 + }, + { + "epoch": 4.551273677335075, + "grad_norm": NaN, + "learning_rate": 4.366956754444748e-05, + "loss": 0.0, + "step": 48776 + }, + { + "epoch": 4.551366987029953, + "grad_norm": NaN, + "learning_rate": 4.36642326428888e-05, + "loss": 0.0, + "step": 48777 + }, + { + "epoch": 4.55146029672483, + "grad_norm": NaN, + "learning_rate": 4.3658898011710465e-05, + "loss": 0.0, + "step": 48778 + }, + { + "epoch": 4.551553606419707, + "grad_norm": NaN, + "learning_rate": 4.3653563650926074e-05, + "loss": 0.0, + "step": 48779 + }, + { + "epoch": 4.551646916114584, + "grad_norm": NaN, + "learning_rate": 4.3648229560549145e-05, + "loss": 0.0, + "step": 48780 + }, + { + "epoch": 4.5517402258094615, + "grad_norm": NaN, + "learning_rate": 4.3642895740593284e-05, + "loss": 0.0, + "step": 48781 + }, + { + "epoch": 4.551833535504339, + "grad_norm": NaN, + "learning_rate": 4.363756219107203e-05, + "loss": 0.0, + "step": 48782 + }, + { + "epoch": 4.551926845199216, + "grad_norm": NaN, + "learning_rate": 4.363222891199894e-05, + "loss": 0.0, + "step": 48783 + }, + { + "epoch": 4.552020154894094, + "grad_norm": NaN, + "learning_rate": 4.362689590338759e-05, + "loss": 0.0, + "step": 48784 + }, + { + "epoch": 4.552113464588971, + "grad_norm": NaN, + "learning_rate": 4.362156316525153e-05, + "loss": 0.0, + "step": 48785 + }, + { + "epoch": 4.552206774283848, + "grad_norm": NaN, + "learning_rate": 4.3616230697604334e-05, + "loss": 0.0, + "step": 48786 + }, + { + "epoch": 4.552300083978725, + "grad_norm": NaN, + "learning_rate": 4.361089850045954e-05, + "loss": 0.0, + "step": 48787 + }, + { + "epoch": 4.5523933936736025, + "grad_norm": NaN, + "learning_rate": 4.3605566573830705e-05, + "loss": 0.0, + "step": 48788 + }, + { + "epoch": 4.55248670336848, + "grad_norm": NaN, + "learning_rate": 4.360023491773142e-05, + "loss": 0.0, + "step": 48789 + }, + { + "epoch": 4.552580013063357, + "grad_norm": NaN, + "learning_rate": 4.3594903532175215e-05, + "loss": 0.0, + "step": 48790 + }, + { + "epoch": 4.552673322758235, + "grad_norm": NaN, + "learning_rate": 4.358957241717564e-05, + "loss": 0.0, + "step": 48791 + }, + { + "epoch": 4.552766632453112, + "grad_norm": NaN, + "learning_rate": 4.358424157274626e-05, + "loss": 0.0, + "step": 48792 + }, + { + "epoch": 4.55285994214799, + "grad_norm": NaN, + "learning_rate": 4.357891099890065e-05, + "loss": 0.0, + "step": 48793 + }, + { + "epoch": 4.552953251842866, + "grad_norm": NaN, + "learning_rate": 4.357358069565233e-05, + "loss": 0.0, + "step": 48794 + }, + { + "epoch": 4.553046561537744, + "grad_norm": NaN, + "learning_rate": 4.3568250663014874e-05, + "loss": 0.0, + "step": 48795 + }, + { + "epoch": 4.553139871232621, + "grad_norm": NaN, + "learning_rate": 4.356292090100183e-05, + "loss": 0.0, + "step": 48796 + }, + { + "epoch": 4.553233180927498, + "grad_norm": NaN, + "learning_rate": 4.3557591409626746e-05, + "loss": 0.0, + "step": 48797 + }, + { + "epoch": 4.553326490622376, + "grad_norm": NaN, + "learning_rate": 4.355226218890316e-05, + "loss": 0.0, + "step": 48798 + }, + { + "epoch": 4.553419800317253, + "grad_norm": NaN, + "learning_rate": 4.3546933238844675e-05, + "loss": 0.0, + "step": 48799 + }, + { + "epoch": 4.55351311001213, + "grad_norm": NaN, + "learning_rate": 4.3541604559464775e-05, + "loss": 0.0, + "step": 48800 + }, + { + "epoch": 4.553606419707007, + "grad_norm": NaN, + "learning_rate": 4.353627615077705e-05, + "loss": 0.0, + "step": 48801 + }, + { + "epoch": 4.553699729401885, + "grad_norm": NaN, + "learning_rate": 4.353094801279505e-05, + "loss": 0.0, + "step": 48802 + }, + { + "epoch": 4.553793039096762, + "grad_norm": NaN, + "learning_rate": 4.352562014553229e-05, + "loss": 0.0, + "step": 48803 + }, + { + "epoch": 4.5538863487916394, + "grad_norm": NaN, + "learning_rate": 4.352029254900234e-05, + "loss": 0.0, + "step": 48804 + }, + { + "epoch": 4.553979658486517, + "grad_norm": NaN, + "learning_rate": 4.351496522321876e-05, + "loss": 0.0, + "step": 48805 + }, + { + "epoch": 4.554072968181394, + "grad_norm": NaN, + "learning_rate": 4.3509638168195066e-05, + "loss": 0.0, + "step": 48806 + }, + { + "epoch": 4.554166277876272, + "grad_norm": NaN, + "learning_rate": 4.350431138394482e-05, + "loss": 0.0, + "step": 48807 + }, + { + "epoch": 4.554259587571149, + "grad_norm": NaN, + "learning_rate": 4.349898487048157e-05, + "loss": 0.0, + "step": 48808 + }, + { + "epoch": 4.554352897266026, + "grad_norm": NaN, + "learning_rate": 4.3493658627818854e-05, + "loss": 0.0, + "step": 48809 + }, + { + "epoch": 4.554446206960903, + "grad_norm": NaN, + "learning_rate": 4.348833265597021e-05, + "loss": 0.0, + "step": 48810 + }, + { + "epoch": 4.5545395166557805, + "grad_norm": NaN, + "learning_rate": 4.348300695494919e-05, + "loss": 0.0, + "step": 48811 + }, + { + "epoch": 4.554632826350658, + "grad_norm": NaN, + "learning_rate": 4.347768152476932e-05, + "loss": 0.0, + "step": 48812 + }, + { + "epoch": 4.554726136045535, + "grad_norm": NaN, + "learning_rate": 4.347235636544415e-05, + "loss": 0.0, + "step": 48813 + }, + { + "epoch": 4.554819445740413, + "grad_norm": NaN, + "learning_rate": 4.346703147698724e-05, + "loss": 0.0, + "step": 48814 + }, + { + "epoch": 4.554912755435289, + "grad_norm": NaN, + "learning_rate": 4.34617068594121e-05, + "loss": 0.0, + "step": 48815 + }, + { + "epoch": 4.555006065130167, + "grad_norm": NaN, + "learning_rate": 4.345638251273228e-05, + "loss": 0.0, + "step": 48816 + }, + { + "epoch": 4.555099374825044, + "grad_norm": NaN, + "learning_rate": 4.345105843696133e-05, + "loss": 0.0, + "step": 48817 + }, + { + "epoch": 4.5551926845199215, + "grad_norm": NaN, + "learning_rate": 4.3445734632112766e-05, + "loss": 0.0, + "step": 48818 + }, + { + "epoch": 4.555285994214799, + "grad_norm": NaN, + "learning_rate": 4.3440411098200136e-05, + "loss": 0.0, + "step": 48819 + }, + { + "epoch": 4.555379303909676, + "grad_norm": NaN, + "learning_rate": 4.3435087835236975e-05, + "loss": 0.0, + "step": 48820 + }, + { + "epoch": 4.555472613604554, + "grad_norm": NaN, + "learning_rate": 4.3429764843236814e-05, + "loss": 0.0, + "step": 48821 + }, + { + "epoch": 4.555565923299431, + "grad_norm": NaN, + "learning_rate": 4.342444212221319e-05, + "loss": 0.0, + "step": 48822 + }, + { + "epoch": 4.555659232994308, + "grad_norm": NaN, + "learning_rate": 4.3419119672179656e-05, + "loss": 0.0, + "step": 48823 + }, + { + "epoch": 4.555752542689185, + "grad_norm": NaN, + "learning_rate": 4.341379749314972e-05, + "loss": 0.0, + "step": 48824 + }, + { + "epoch": 4.555845852384063, + "grad_norm": NaN, + "learning_rate": 4.340847558513692e-05, + "loss": 0.0, + "step": 48825 + }, + { + "epoch": 4.55593916207894, + "grad_norm": NaN, + "learning_rate": 4.3403153948154806e-05, + "loss": 0.0, + "step": 48826 + }, + { + "epoch": 4.556032471773817, + "grad_norm": NaN, + "learning_rate": 4.339783258221688e-05, + "loss": 0.0, + "step": 48827 + }, + { + "epoch": 4.556125781468695, + "grad_norm": NaN, + "learning_rate": 4.33925114873367e-05, + "loss": 0.0, + "step": 48828 + }, + { + "epoch": 4.556219091163572, + "grad_norm": NaN, + "learning_rate": 4.3387190663527775e-05, + "loss": 0.0, + "step": 48829 + }, + { + "epoch": 4.556312400858449, + "grad_norm": NaN, + "learning_rate": 4.338187011080364e-05, + "loss": 0.0, + "step": 48830 + }, + { + "epoch": 4.556405710553326, + "grad_norm": NaN, + "learning_rate": 4.337654982917784e-05, + "loss": 0.0, + "step": 48831 + }, + { + "epoch": 4.556499020248204, + "grad_norm": NaN, + "learning_rate": 4.337122981866387e-05, + "loss": 0.0, + "step": 48832 + }, + { + "epoch": 4.556592329943081, + "grad_norm": NaN, + "learning_rate": 4.336591007927529e-05, + "loss": 0.0, + "step": 48833 + }, + { + "epoch": 4.5566856396379585, + "grad_norm": NaN, + "learning_rate": 4.336059061102562e-05, + "loss": 0.0, + "step": 48834 + }, + { + "epoch": 4.556778949332836, + "grad_norm": NaN, + "learning_rate": 4.335527141392837e-05, + "loss": 0.0, + "step": 48835 + }, + { + "epoch": 4.556872259027713, + "grad_norm": NaN, + "learning_rate": 4.334995248799707e-05, + "loss": 0.0, + "step": 48836 + }, + { + "epoch": 4.556965568722591, + "grad_norm": NaN, + "learning_rate": 4.334463383324525e-05, + "loss": 0.0, + "step": 48837 + }, + { + "epoch": 4.557058878417467, + "grad_norm": NaN, + "learning_rate": 4.33393154496864e-05, + "loss": 0.0, + "step": 48838 + }, + { + "epoch": 4.557152188112345, + "grad_norm": NaN, + "learning_rate": 4.333399733733416e-05, + "loss": 0.0, + "step": 48839 + }, + { + "epoch": 4.557245497807222, + "grad_norm": NaN, + "learning_rate": 4.3328679496201916e-05, + "loss": 0.0, + "step": 48840 + }, + { + "epoch": 4.5573388075020995, + "grad_norm": NaN, + "learning_rate": 4.332336192630325e-05, + "loss": 0.0, + "step": 48841 + }, + { + "epoch": 4.557432117196977, + "grad_norm": NaN, + "learning_rate": 4.3318044627651666e-05, + "loss": 0.0, + "step": 48842 + }, + { + "epoch": 4.557525426891854, + "grad_norm": NaN, + "learning_rate": 4.3312727600260696e-05, + "loss": 0.0, + "step": 48843 + }, + { + "epoch": 4.557618736586731, + "grad_norm": NaN, + "learning_rate": 4.330741084414386e-05, + "loss": 0.0, + "step": 48844 + }, + { + "epoch": 4.557712046281608, + "grad_norm": NaN, + "learning_rate": 4.3302094359314666e-05, + "loss": 0.0, + "step": 48845 + }, + { + "epoch": 4.557805355976486, + "grad_norm": NaN, + "learning_rate": 4.329677814578665e-05, + "loss": 0.0, + "step": 48846 + }, + { + "epoch": 4.557898665671363, + "grad_norm": NaN, + "learning_rate": 4.329146220357331e-05, + "loss": 0.0, + "step": 48847 + }, + { + "epoch": 4.557991975366241, + "grad_norm": NaN, + "learning_rate": 4.328614653268818e-05, + "loss": 0.0, + "step": 48848 + }, + { + "epoch": 4.558085285061118, + "grad_norm": NaN, + "learning_rate": 4.328083113314477e-05, + "loss": 0.0, + "step": 48849 + }, + { + "epoch": 4.558178594755995, + "grad_norm": NaN, + "learning_rate": 4.327551600495659e-05, + "loss": 0.0, + "step": 48850 + }, + { + "epoch": 4.558271904450873, + "grad_norm": NaN, + "learning_rate": 4.327020114813716e-05, + "loss": 0.0, + "step": 48851 + }, + { + "epoch": 4.558365214145749, + "grad_norm": NaN, + "learning_rate": 4.3264886562699974e-05, + "loss": 0.0, + "step": 48852 + }, + { + "epoch": 4.558458523840627, + "grad_norm": NaN, + "learning_rate": 4.325957224865858e-05, + "loss": 0.0, + "step": 48853 + }, + { + "epoch": 4.558551833535504, + "grad_norm": NaN, + "learning_rate": 4.325425820602648e-05, + "loss": 0.0, + "step": 48854 + }, + { + "epoch": 4.558645143230382, + "grad_norm": NaN, + "learning_rate": 4.3248944434817186e-05, + "loss": 0.0, + "step": 48855 + }, + { + "epoch": 4.558738452925259, + "grad_norm": NaN, + "learning_rate": 4.324363093504419e-05, + "loss": 0.0, + "step": 48856 + }, + { + "epoch": 4.5588317626201365, + "grad_norm": NaN, + "learning_rate": 4.3238317706721013e-05, + "loss": 0.0, + "step": 48857 + }, + { + "epoch": 4.558925072315014, + "grad_norm": NaN, + "learning_rate": 4.323300474986119e-05, + "loss": 0.0, + "step": 48858 + }, + { + "epoch": 4.55901838200989, + "grad_norm": NaN, + "learning_rate": 4.3227692064478185e-05, + "loss": 0.0, + "step": 48859 + }, + { + "epoch": 4.559111691704768, + "grad_norm": NaN, + "learning_rate": 4.3222379650585546e-05, + "loss": 0.0, + "step": 48860 + }, + { + "epoch": 4.559205001399645, + "grad_norm": NaN, + "learning_rate": 4.321706750819676e-05, + "loss": 0.0, + "step": 48861 + }, + { + "epoch": 4.559298311094523, + "grad_norm": NaN, + "learning_rate": 4.321175563732535e-05, + "loss": 0.0, + "step": 48862 + }, + { + "epoch": 4.5593916207894, + "grad_norm": NaN, + "learning_rate": 4.320644403798481e-05, + "loss": 0.0, + "step": 48863 + }, + { + "epoch": 4.5594849304842775, + "grad_norm": NaN, + "learning_rate": 4.320113271018863e-05, + "loss": 0.0, + "step": 48864 + }, + { + "epoch": 4.559578240179155, + "grad_norm": NaN, + "learning_rate": 4.3195821653950345e-05, + "loss": 0.0, + "step": 48865 + }, + { + "epoch": 4.559671549874032, + "grad_norm": NaN, + "learning_rate": 4.319051086928346e-05, + "loss": 0.0, + "step": 48866 + }, + { + "epoch": 4.559764859568909, + "grad_norm": NaN, + "learning_rate": 4.318520035620145e-05, + "loss": 0.0, + "step": 48867 + }, + { + "epoch": 4.559858169263786, + "grad_norm": NaN, + "learning_rate": 4.317989011471785e-05, + "loss": 0.0, + "step": 48868 + }, + { + "epoch": 4.559951478958664, + "grad_norm": NaN, + "learning_rate": 4.317458014484613e-05, + "loss": 0.0, + "step": 48869 + }, + { + "epoch": 4.560044788653541, + "grad_norm": NaN, + "learning_rate": 4.3169270446599814e-05, + "loss": 0.0, + "step": 48870 + }, + { + "epoch": 4.5601380983484185, + "grad_norm": NaN, + "learning_rate": 4.316396101999239e-05, + "loss": 0.0, + "step": 48871 + }, + { + "epoch": 4.560231408043296, + "grad_norm": NaN, + "learning_rate": 4.315865186503739e-05, + "loss": 0.0, + "step": 48872 + }, + { + "epoch": 4.560324717738173, + "grad_norm": NaN, + "learning_rate": 4.315334298174823e-05, + "loss": 0.0, + "step": 48873 + }, + { + "epoch": 4.56041802743305, + "grad_norm": NaN, + "learning_rate": 4.3148034370138565e-05, + "loss": 0.0, + "step": 48874 + }, + { + "epoch": 4.560511337127927, + "grad_norm": NaN, + "learning_rate": 4.314272603022173e-05, + "loss": 0.0, + "step": 48875 + }, + { + "epoch": 4.560604646822805, + "grad_norm": NaN, + "learning_rate": 4.313741796201126e-05, + "loss": 0.0, + "step": 48876 + }, + { + "epoch": 4.560697956517682, + "grad_norm": NaN, + "learning_rate": 4.3132110165520776e-05, + "loss": 0.0, + "step": 48877 + }, + { + "epoch": 4.56079126621256, + "grad_norm": NaN, + "learning_rate": 4.3126802640763634e-05, + "loss": 0.0, + "step": 48878 + }, + { + "epoch": 4.560884575907437, + "grad_norm": NaN, + "learning_rate": 4.3121495387753316e-05, + "loss": 0.0, + "step": 48879 + }, + { + "epoch": 4.560977885602314, + "grad_norm": NaN, + "learning_rate": 4.3116188406503484e-05, + "loss": 0.0, + "step": 48880 + }, + { + "epoch": 4.561071195297192, + "grad_norm": NaN, + "learning_rate": 4.311088169702747e-05, + "loss": 0.0, + "step": 48881 + }, + { + "epoch": 4.561164504992068, + "grad_norm": NaN, + "learning_rate": 4.310557525933876e-05, + "loss": 0.0, + "step": 48882 + }, + { + "epoch": 4.561257814686946, + "grad_norm": NaN, + "learning_rate": 4.310026909345102e-05, + "loss": 0.0, + "step": 48883 + }, + { + "epoch": 4.561351124381823, + "grad_norm": NaN, + "learning_rate": 4.309496319937758e-05, + "loss": 0.0, + "step": 48884 + }, + { + "epoch": 4.561444434076701, + "grad_norm": NaN, + "learning_rate": 4.308965757713197e-05, + "loss": 0.0, + "step": 48885 + }, + { + "epoch": 4.561537743771578, + "grad_norm": NaN, + "learning_rate": 4.308435222672771e-05, + "loss": 0.0, + "step": 48886 + }, + { + "epoch": 4.5616310534664555, + "grad_norm": NaN, + "learning_rate": 4.307904714817825e-05, + "loss": 0.0, + "step": 48887 + }, + { + "epoch": 4.561724363161332, + "grad_norm": NaN, + "learning_rate": 4.307374234149711e-05, + "loss": 0.0, + "step": 48888 + }, + { + "epoch": 4.561817672856209, + "grad_norm": NaN, + "learning_rate": 4.3068437806697784e-05, + "loss": 0.0, + "step": 48889 + }, + { + "epoch": 4.561910982551087, + "grad_norm": NaN, + "learning_rate": 4.3063133543793724e-05, + "loss": 0.0, + "step": 48890 + }, + { + "epoch": 4.562004292245964, + "grad_norm": NaN, + "learning_rate": 4.3057829552798454e-05, + "loss": 0.0, + "step": 48891 + }, + { + "epoch": 4.562097601940842, + "grad_norm": NaN, + "learning_rate": 4.3052525833725426e-05, + "loss": 0.0, + "step": 48892 + }, + { + "epoch": 4.562190911635719, + "grad_norm": NaN, + "learning_rate": 4.304722238658816e-05, + "loss": 0.0, + "step": 48893 + }, + { + "epoch": 4.5622842213305965, + "grad_norm": NaN, + "learning_rate": 4.304191921140012e-05, + "loss": 0.0, + "step": 48894 + }, + { + "epoch": 4.562377531025474, + "grad_norm": NaN, + "learning_rate": 4.30366163081748e-05, + "loss": 0.0, + "step": 48895 + }, + { + "epoch": 4.5624708407203505, + "grad_norm": NaN, + "learning_rate": 4.3031313676925677e-05, + "loss": 0.0, + "step": 48896 + }, + { + "epoch": 4.562564150415228, + "grad_norm": NaN, + "learning_rate": 4.302601131766623e-05, + "loss": 0.0, + "step": 48897 + }, + { + "epoch": 4.562657460110105, + "grad_norm": NaN, + "learning_rate": 4.3020709230409974e-05, + "loss": 0.0, + "step": 48898 + }, + { + "epoch": 4.562750769804983, + "grad_norm": NaN, + "learning_rate": 4.301540741517034e-05, + "loss": 0.0, + "step": 48899 + }, + { + "epoch": 4.56284407949986, + "grad_norm": NaN, + "learning_rate": 4.301010587196084e-05, + "loss": 0.0, + "step": 48900 + }, + { + "epoch": 4.562937389194738, + "grad_norm": NaN, + "learning_rate": 4.300480460079494e-05, + "loss": 0.0, + "step": 48901 + }, + { + "epoch": 4.563030698889615, + "grad_norm": NaN, + "learning_rate": 4.2999503601686136e-05, + "loss": 0.0, + "step": 48902 + }, + { + "epoch": 4.5631240085844915, + "grad_norm": NaN, + "learning_rate": 4.2994202874647896e-05, + "loss": 0.0, + "step": 48903 + }, + { + "epoch": 4.563217318279369, + "grad_norm": NaN, + "learning_rate": 4.2988902419693715e-05, + "loss": 0.0, + "step": 48904 + }, + { + "epoch": 4.563310627974246, + "grad_norm": NaN, + "learning_rate": 4.2983602236837044e-05, + "loss": 0.0, + "step": 48905 + }, + { + "epoch": 4.563403937669124, + "grad_norm": NaN, + "learning_rate": 4.2978302326091375e-05, + "loss": 0.0, + "step": 48906 + }, + { + "epoch": 4.563497247364001, + "grad_norm": NaN, + "learning_rate": 4.297300268747018e-05, + "loss": 0.0, + "step": 48907 + }, + { + "epoch": 4.563590557058879, + "grad_norm": NaN, + "learning_rate": 4.2967703320986923e-05, + "loss": 0.0, + "step": 48908 + }, + { + "epoch": 4.563683866753756, + "grad_norm": NaN, + "learning_rate": 4.2962404226655105e-05, + "loss": 0.0, + "step": 48909 + }, + { + "epoch": 4.5637771764486335, + "grad_norm": NaN, + "learning_rate": 4.295710540448818e-05, + "loss": 0.0, + "step": 48910 + }, + { + "epoch": 4.56387048614351, + "grad_norm": NaN, + "learning_rate": 4.2951806854499584e-05, + "loss": 0.0, + "step": 48911 + }, + { + "epoch": 4.563963795838387, + "grad_norm": NaN, + "learning_rate": 4.294650857670292e-05, + "loss": 0.0, + "step": 48912 + }, + { + "epoch": 4.564057105533265, + "grad_norm": NaN, + "learning_rate": 4.294121057111153e-05, + "loss": 0.0, + "step": 48913 + }, + { + "epoch": 4.564150415228142, + "grad_norm": NaN, + "learning_rate": 4.29359128377389e-05, + "loss": 0.0, + "step": 48914 + }, + { + "epoch": 4.56424372492302, + "grad_norm": NaN, + "learning_rate": 4.293061537659861e-05, + "loss": 0.0, + "step": 48915 + }, + { + "epoch": 4.564337034617897, + "grad_norm": NaN, + "learning_rate": 4.292531818770398e-05, + "loss": 0.0, + "step": 48916 + }, + { + "epoch": 4.564430344312774, + "grad_norm": NaN, + "learning_rate": 4.292002127106854e-05, + "loss": 0.0, + "step": 48917 + }, + { + "epoch": 4.564523654007651, + "grad_norm": NaN, + "learning_rate": 4.291472462670583e-05, + "loss": 0.0, + "step": 48918 + }, + { + "epoch": 4.5646169637025285, + "grad_norm": NaN, + "learning_rate": 4.290942825462922e-05, + "loss": 0.0, + "step": 48919 + }, + { + "epoch": 4.564710273397406, + "grad_norm": NaN, + "learning_rate": 4.290413215485215e-05, + "loss": 0.0, + "step": 48920 + }, + { + "epoch": 4.564803583092283, + "grad_norm": NaN, + "learning_rate": 4.2898836327388265e-05, + "loss": 0.0, + "step": 48921 + }, + { + "epoch": 4.564896892787161, + "grad_norm": NaN, + "learning_rate": 4.289354077225085e-05, + "loss": 0.0, + "step": 48922 + }, + { + "epoch": 4.564990202482038, + "grad_norm": NaN, + "learning_rate": 4.288824548945339e-05, + "loss": 0.0, + "step": 48923 + }, + { + "epoch": 4.5650835121769155, + "grad_norm": NaN, + "learning_rate": 4.28829504790095e-05, + "loss": 0.0, + "step": 48924 + }, + { + "epoch": 4.565176821871793, + "grad_norm": NaN, + "learning_rate": 4.287765574093247e-05, + "loss": 0.0, + "step": 48925 + }, + { + "epoch": 4.5652701315666695, + "grad_norm": NaN, + "learning_rate": 4.2872361275235794e-05, + "loss": 0.0, + "step": 48926 + }, + { + "epoch": 4.565363441261547, + "grad_norm": NaN, + "learning_rate": 4.286706708193306e-05, + "loss": 0.0, + "step": 48927 + }, + { + "epoch": 4.565456750956424, + "grad_norm": NaN, + "learning_rate": 4.286177316103759e-05, + "loss": 0.0, + "step": 48928 + }, + { + "epoch": 4.565550060651302, + "grad_norm": NaN, + "learning_rate": 4.28564795125629e-05, + "loss": 0.0, + "step": 48929 + }, + { + "epoch": 4.565643370346179, + "grad_norm": NaN, + "learning_rate": 4.285118613652244e-05, + "loss": 0.0, + "step": 48930 + }, + { + "epoch": 4.565736680041057, + "grad_norm": NaN, + "learning_rate": 4.2845893032929676e-05, + "loss": 0.0, + "step": 48931 + }, + { + "epoch": 4.565829989735933, + "grad_norm": NaN, + "learning_rate": 4.284060020179807e-05, + "loss": 0.0, + "step": 48932 + }, + { + "epoch": 4.5659232994308105, + "grad_norm": NaN, + "learning_rate": 4.283530764314106e-05, + "loss": 0.0, + "step": 48933 + }, + { + "epoch": 4.566016609125688, + "grad_norm": NaN, + "learning_rate": 4.283001535697212e-05, + "loss": 0.0, + "step": 48934 + }, + { + "epoch": 4.566109918820565, + "grad_norm": NaN, + "learning_rate": 4.282472334330472e-05, + "loss": 0.0, + "step": 48935 + }, + { + "epoch": 4.566203228515443, + "grad_norm": NaN, + "learning_rate": 4.281943160215229e-05, + "loss": 0.0, + "step": 48936 + }, + { + "epoch": 4.56629653821032, + "grad_norm": NaN, + "learning_rate": 4.28141401335283e-05, + "loss": 0.0, + "step": 48937 + }, + { + "epoch": 4.566389847905198, + "grad_norm": NaN, + "learning_rate": 4.280884893744621e-05, + "loss": 0.0, + "step": 48938 + }, + { + "epoch": 4.566483157600075, + "grad_norm": NaN, + "learning_rate": 4.280355801391944e-05, + "loss": 0.0, + "step": 48939 + }, + { + "epoch": 4.566576467294952, + "grad_norm": NaN, + "learning_rate": 4.279826736296149e-05, + "loss": 0.0, + "step": 48940 + }, + { + "epoch": 4.566669776989829, + "grad_norm": NaN, + "learning_rate": 4.2792976984585775e-05, + "loss": 0.0, + "step": 48941 + }, + { + "epoch": 4.566763086684706, + "grad_norm": NaN, + "learning_rate": 4.2787686878805775e-05, + "loss": 0.0, + "step": 48942 + }, + { + "epoch": 4.566856396379584, + "grad_norm": NaN, + "learning_rate": 4.2782397045634914e-05, + "loss": 0.0, + "step": 48943 + }, + { + "epoch": 4.566949706074461, + "grad_norm": NaN, + "learning_rate": 4.2777107485086675e-05, + "loss": 0.0, + "step": 48944 + }, + { + "epoch": 4.567043015769339, + "grad_norm": NaN, + "learning_rate": 4.277181819717448e-05, + "loss": 0.0, + "step": 48945 + }, + { + "epoch": 4.567136325464216, + "grad_norm": NaN, + "learning_rate": 4.2766529181911795e-05, + "loss": 0.0, + "step": 48946 + }, + { + "epoch": 4.567229635159093, + "grad_norm": NaN, + "learning_rate": 4.276124043931205e-05, + "loss": 0.0, + "step": 48947 + }, + { + "epoch": 4.56732294485397, + "grad_norm": NaN, + "learning_rate": 4.275595196938872e-05, + "loss": 0.0, + "step": 48948 + }, + { + "epoch": 4.5674162545488475, + "grad_norm": NaN, + "learning_rate": 4.2750663772155176e-05, + "loss": 0.0, + "step": 48949 + }, + { + "epoch": 4.567509564243725, + "grad_norm": NaN, + "learning_rate": 4.274537584762503e-05, + "loss": 0.0, + "step": 48950 + }, + { + "epoch": 4.567602873938602, + "grad_norm": NaN, + "learning_rate": 4.2740088195811556e-05, + "loss": 0.0, + "step": 48951 + }, + { + "epoch": 4.56769618363348, + "grad_norm": NaN, + "learning_rate": 4.2734800816728226e-05, + "loss": 0.0, + "step": 48952 + }, + { + "epoch": 4.567789493328357, + "grad_norm": NaN, + "learning_rate": 4.2729513710388624e-05, + "loss": 0.0, + "step": 48953 + }, + { + "epoch": 4.567882803023235, + "grad_norm": NaN, + "learning_rate": 4.272422687680604e-05, + "loss": 0.0, + "step": 48954 + }, + { + "epoch": 4.567976112718111, + "grad_norm": NaN, + "learning_rate": 4.2718940315993926e-05, + "loss": 0.0, + "step": 48955 + }, + { + "epoch": 4.5680694224129885, + "grad_norm": NaN, + "learning_rate": 4.271365402796586e-05, + "loss": 0.0, + "step": 48956 + }, + { + "epoch": 4.568162732107866, + "grad_norm": NaN, + "learning_rate": 4.2708368012735145e-05, + "loss": 0.0, + "step": 48957 + }, + { + "epoch": 4.568256041802743, + "grad_norm": NaN, + "learning_rate": 4.270308227031522e-05, + "loss": 0.0, + "step": 48958 + }, + { + "epoch": 4.568349351497621, + "grad_norm": NaN, + "learning_rate": 4.269779680071966e-05, + "loss": 0.0, + "step": 48959 + }, + { + "epoch": 4.568442661192498, + "grad_norm": NaN, + "learning_rate": 4.269251160396178e-05, + "loss": 0.0, + "step": 48960 + }, + { + "epoch": 4.568535970887375, + "grad_norm": NaN, + "learning_rate": 4.2687226680055016e-05, + "loss": 0.0, + "step": 48961 + }, + { + "epoch": 4.568629280582252, + "grad_norm": NaN, + "learning_rate": 4.268194202901293e-05, + "loss": 0.0, + "step": 48962 + }, + { + "epoch": 4.56872259027713, + "grad_norm": NaN, + "learning_rate": 4.267665765084883e-05, + "loss": 0.0, + "step": 48963 + }, + { + "epoch": 4.568815899972007, + "grad_norm": NaN, + "learning_rate": 4.2671373545576156e-05, + "loss": 0.0, + "step": 48964 + }, + { + "epoch": 4.568909209666884, + "grad_norm": NaN, + "learning_rate": 4.266608971320846e-05, + "loss": 0.0, + "step": 48965 + }, + { + "epoch": 4.569002519361762, + "grad_norm": NaN, + "learning_rate": 4.266080615375908e-05, + "loss": 0.0, + "step": 48966 + }, + { + "epoch": 4.569095829056639, + "grad_norm": NaN, + "learning_rate": 4.265552286724142e-05, + "loss": 0.0, + "step": 48967 + }, + { + "epoch": 4.569189138751517, + "grad_norm": NaN, + "learning_rate": 4.265023985366906e-05, + "loss": 0.0, + "step": 48968 + }, + { + "epoch": 4.569282448446393, + "grad_norm": NaN, + "learning_rate": 4.264495711305528e-05, + "loss": 0.0, + "step": 48969 + }, + { + "epoch": 4.569375758141271, + "grad_norm": NaN, + "learning_rate": 4.263967464541354e-05, + "loss": 0.0, + "step": 48970 + }, + { + "epoch": 4.569469067836148, + "grad_norm": NaN, + "learning_rate": 4.2634392450757396e-05, + "loss": 0.0, + "step": 48971 + }, + { + "epoch": 4.5695623775310255, + "grad_norm": NaN, + "learning_rate": 4.262911052910015e-05, + "loss": 0.0, + "step": 48972 + }, + { + "epoch": 4.569655687225903, + "grad_norm": NaN, + "learning_rate": 4.262382888045523e-05, + "loss": 0.0, + "step": 48973 + }, + { + "epoch": 4.56974899692078, + "grad_norm": NaN, + "learning_rate": 4.261854750483617e-05, + "loss": 0.0, + "step": 48974 + }, + { + "epoch": 4.569842306615658, + "grad_norm": NaN, + "learning_rate": 4.261326640225631e-05, + "loss": 0.0, + "step": 48975 + }, + { + "epoch": 4.569935616310534, + "grad_norm": NaN, + "learning_rate": 4.2607985572729095e-05, + "loss": 0.0, + "step": 48976 + }, + { + "epoch": 4.570028926005412, + "grad_norm": NaN, + "learning_rate": 4.2602705016267964e-05, + "loss": 0.0, + "step": 48977 + }, + { + "epoch": 4.570122235700289, + "grad_norm": NaN, + "learning_rate": 4.2597424732886346e-05, + "loss": 0.0, + "step": 48978 + }, + { + "epoch": 4.5702155453951665, + "grad_norm": NaN, + "learning_rate": 4.2592144722597657e-05, + "loss": 0.0, + "step": 48979 + }, + { + "epoch": 4.570308855090044, + "grad_norm": NaN, + "learning_rate": 4.258686498541532e-05, + "loss": 0.0, + "step": 48980 + }, + { + "epoch": 4.570402164784921, + "grad_norm": NaN, + "learning_rate": 4.258158552135278e-05, + "loss": 0.0, + "step": 48981 + }, + { + "epoch": 4.570495474479799, + "grad_norm": NaN, + "learning_rate": 4.257630633042344e-05, + "loss": 0.0, + "step": 48982 + }, + { + "epoch": 4.570588784174676, + "grad_norm": NaN, + "learning_rate": 4.257102741264074e-05, + "loss": 0.0, + "step": 48983 + }, + { + "epoch": 4.570682093869553, + "grad_norm": NaN, + "learning_rate": 4.256574876801808e-05, + "loss": 0.0, + "step": 48984 + }, + { + "epoch": 4.57077540356443, + "grad_norm": NaN, + "learning_rate": 4.2560470396568905e-05, + "loss": 0.0, + "step": 48985 + }, + { + "epoch": 4.5708687132593075, + "grad_norm": NaN, + "learning_rate": 4.2555192298306634e-05, + "loss": 0.0, + "step": 48986 + }, + { + "epoch": 4.570962022954185, + "grad_norm": NaN, + "learning_rate": 4.254991447324463e-05, + "loss": 0.0, + "step": 48987 + }, + { + "epoch": 4.571055332649062, + "grad_norm": NaN, + "learning_rate": 4.254463692139644e-05, + "loss": 0.0, + "step": 48988 + }, + { + "epoch": 4.57114864234394, + "grad_norm": NaN, + "learning_rate": 4.253935964277538e-05, + "loss": 0.0, + "step": 48989 + }, + { + "epoch": 4.571241952038816, + "grad_norm": NaN, + "learning_rate": 4.253408263739485e-05, + "loss": 0.0, + "step": 48990 + }, + { + "epoch": 4.571335261733694, + "grad_norm": NaN, + "learning_rate": 4.25288059052684e-05, + "loss": 0.0, + "step": 48991 + }, + { + "epoch": 4.571428571428571, + "grad_norm": NaN, + "learning_rate": 4.252352944640933e-05, + "loss": 0.0, + "step": 48992 + }, + { + "epoch": 4.571521881123449, + "grad_norm": NaN, + "learning_rate": 4.251825326083102e-05, + "loss": 0.0, + "step": 48993 + }, + { + "epoch": 4.571615190818326, + "grad_norm": NaN, + "learning_rate": 4.2512977348547066e-05, + "loss": 0.0, + "step": 48994 + }, + { + "epoch": 4.571708500513203, + "grad_norm": NaN, + "learning_rate": 4.250770170957071e-05, + "loss": 0.0, + "step": 48995 + }, + { + "epoch": 4.571801810208081, + "grad_norm": NaN, + "learning_rate": 4.250242634391538e-05, + "loss": 0.0, + "step": 48996 + }, + { + "epoch": 4.571895119902958, + "grad_norm": NaN, + "learning_rate": 4.249715125159464e-05, + "loss": 0.0, + "step": 48997 + }, + { + "epoch": 4.571988429597836, + "grad_norm": NaN, + "learning_rate": 4.2491876432621746e-05, + "loss": 0.0, + "step": 48998 + }, + { + "epoch": 4.572081739292712, + "grad_norm": NaN, + "learning_rate": 4.248660188701012e-05, + "loss": 0.0, + "step": 48999 + }, + { + "epoch": 4.57217504898759, + "grad_norm": NaN, + "learning_rate": 4.248132761477331e-05, + "loss": 0.0, + "step": 49000 + }, + { + "epoch": 4.572268358682467, + "grad_norm": NaN, + "learning_rate": 4.247605361592459e-05, + "loss": 0.0, + "step": 49001 + }, + { + "epoch": 4.5723616683773445, + "grad_norm": NaN, + "learning_rate": 4.2470779890477365e-05, + "loss": 0.0, + "step": 49002 + }, + { + "epoch": 4.572454978072222, + "grad_norm": NaN, + "learning_rate": 4.24655064384452e-05, + "loss": 0.0, + "step": 49003 + }, + { + "epoch": 4.572548287767099, + "grad_norm": NaN, + "learning_rate": 4.246023325984132e-05, + "loss": 0.0, + "step": 49004 + }, + { + "epoch": 4.572641597461976, + "grad_norm": NaN, + "learning_rate": 4.2454960354679193e-05, + "loss": 0.0, + "step": 49005 + }, + { + "epoch": 4.572734907156853, + "grad_norm": NaN, + "learning_rate": 4.244968772297233e-05, + "loss": 0.0, + "step": 49006 + }, + { + "epoch": 4.572828216851731, + "grad_norm": NaN, + "learning_rate": 4.2444415364734004e-05, + "loss": 0.0, + "step": 49007 + }, + { + "epoch": 4.572921526546608, + "grad_norm": NaN, + "learning_rate": 4.243914327997763e-05, + "loss": 0.0, + "step": 49008 + }, + { + "epoch": 4.5730148362414855, + "grad_norm": NaN, + "learning_rate": 4.243387146871673e-05, + "loss": 0.0, + "step": 49009 + }, + { + "epoch": 4.573108145936363, + "grad_norm": NaN, + "learning_rate": 4.2428599930964595e-05, + "loss": 0.0, + "step": 49010 + }, + { + "epoch": 4.57320145563124, + "grad_norm": NaN, + "learning_rate": 4.242332866673462e-05, + "loss": 0.0, + "step": 49011 + }, + { + "epoch": 4.573294765326118, + "grad_norm": NaN, + "learning_rate": 4.241805767604035e-05, + "loss": 0.0, + "step": 49012 + }, + { + "epoch": 4.573388075020994, + "grad_norm": NaN, + "learning_rate": 4.2412786958895035e-05, + "loss": 0.0, + "step": 49013 + }, + { + "epoch": 4.573481384715872, + "grad_norm": NaN, + "learning_rate": 4.240751651531209e-05, + "loss": 0.0, + "step": 49014 + }, + { + "epoch": 4.573574694410749, + "grad_norm": NaN, + "learning_rate": 4.240224634530506e-05, + "loss": 0.0, + "step": 49015 + }, + { + "epoch": 4.573668004105627, + "grad_norm": NaN, + "learning_rate": 4.239697644888719e-05, + "loss": 0.0, + "step": 49016 + }, + { + "epoch": 4.573761313800504, + "grad_norm": NaN, + "learning_rate": 4.239170682607188e-05, + "loss": 0.0, + "step": 49017 + }, + { + "epoch": 4.573854623495381, + "grad_norm": NaN, + "learning_rate": 4.238643747687269e-05, + "loss": 0.0, + "step": 49018 + }, + { + "epoch": 4.573947933190259, + "grad_norm": NaN, + "learning_rate": 4.2381168401302864e-05, + "loss": 0.0, + "step": 49019 + }, + { + "epoch": 4.574041242885135, + "grad_norm": NaN, + "learning_rate": 4.2375899599375854e-05, + "loss": 0.0, + "step": 49020 + }, + { + "epoch": 4.574134552580013, + "grad_norm": NaN, + "learning_rate": 4.237063107110505e-05, + "loss": 0.0, + "step": 49021 + }, + { + "epoch": 4.57422786227489, + "grad_norm": NaN, + "learning_rate": 4.2365362816503835e-05, + "loss": 0.0, + "step": 49022 + }, + { + "epoch": 4.574321171969768, + "grad_norm": NaN, + "learning_rate": 4.236009483558563e-05, + "loss": 0.0, + "step": 49023 + }, + { + "epoch": 4.574414481664645, + "grad_norm": NaN, + "learning_rate": 4.235482712836381e-05, + "loss": 0.0, + "step": 49024 + }, + { + "epoch": 4.5745077913595225, + "grad_norm": NaN, + "learning_rate": 4.2349559694851735e-05, + "loss": 0.0, + "step": 49025 + }, + { + "epoch": 4.5746011010544, + "grad_norm": NaN, + "learning_rate": 4.234429253506293e-05, + "loss": 0.0, + "step": 49026 + }, + { + "epoch": 4.574694410749277, + "grad_norm": NaN, + "learning_rate": 4.2339025649010655e-05, + "loss": 0.0, + "step": 49027 + }, + { + "epoch": 4.574787720444154, + "grad_norm": NaN, + "learning_rate": 4.2333759036708306e-05, + "loss": 0.0, + "step": 49028 + }, + { + "epoch": 4.574881030139031, + "grad_norm": NaN, + "learning_rate": 4.23284926981694e-05, + "loss": 0.0, + "step": 49029 + }, + { + "epoch": 4.574974339833909, + "grad_norm": NaN, + "learning_rate": 4.232322663340721e-05, + "loss": 0.0, + "step": 49030 + }, + { + "epoch": 4.575067649528786, + "grad_norm": NaN, + "learning_rate": 4.23179608424351e-05, + "loss": 0.0, + "step": 49031 + }, + { + "epoch": 4.5751609592236635, + "grad_norm": NaN, + "learning_rate": 4.2312695325266613e-05, + "loss": 0.0, + "step": 49032 + }, + { + "epoch": 4.575254268918541, + "grad_norm": NaN, + "learning_rate": 4.2307430081914994e-05, + "loss": 0.0, + "step": 49033 + }, + { + "epoch": 4.5753475786134175, + "grad_norm": NaN, + "learning_rate": 4.2302165112393636e-05, + "loss": 0.0, + "step": 49034 + }, + { + "epoch": 4.575440888308295, + "grad_norm": NaN, + "learning_rate": 4.2296900416716064e-05, + "loss": 0.0, + "step": 49035 + }, + { + "epoch": 4.575534198003172, + "grad_norm": NaN, + "learning_rate": 4.229163599489551e-05, + "loss": 0.0, + "step": 49036 + }, + { + "epoch": 4.57562750769805, + "grad_norm": NaN, + "learning_rate": 4.2286371846945394e-05, + "loss": 0.0, + "step": 49037 + }, + { + "epoch": 4.575720817392927, + "grad_norm": NaN, + "learning_rate": 4.22811079728792e-05, + "loss": 0.0, + "step": 49038 + }, + { + "epoch": 4.5758141270878046, + "grad_norm": NaN, + "learning_rate": 4.2275844372710204e-05, + "loss": 0.0, + "step": 49039 + }, + { + "epoch": 4.575907436782682, + "grad_norm": NaN, + "learning_rate": 4.227058104645177e-05, + "loss": 0.0, + "step": 49040 + }, + { + "epoch": 4.576000746477559, + "grad_norm": NaN, + "learning_rate": 4.2265317994117444e-05, + "loss": 0.0, + "step": 49041 + }, + { + "epoch": 4.576094056172437, + "grad_norm": NaN, + "learning_rate": 4.226005521572045e-05, + "loss": 0.0, + "step": 49042 + }, + { + "epoch": 4.576187365867313, + "grad_norm": NaN, + "learning_rate": 4.225479271127417e-05, + "loss": 0.0, + "step": 49043 + }, + { + "epoch": 4.576280675562191, + "grad_norm": NaN, + "learning_rate": 4.224953048079214e-05, + "loss": 0.0, + "step": 49044 + }, + { + "epoch": 4.576373985257068, + "grad_norm": NaN, + "learning_rate": 4.2244268524287574e-05, + "loss": 0.0, + "step": 49045 + }, + { + "epoch": 4.576467294951946, + "grad_norm": NaN, + "learning_rate": 4.223900684177388e-05, + "loss": 0.0, + "step": 49046 + }, + { + "epoch": 4.576560604646823, + "grad_norm": NaN, + "learning_rate": 4.223374543326456e-05, + "loss": 0.0, + "step": 49047 + }, + { + "epoch": 4.5766539143417, + "grad_norm": NaN, + "learning_rate": 4.222848429877285e-05, + "loss": 0.0, + "step": 49048 + }, + { + "epoch": 4.576747224036577, + "grad_norm": NaN, + "learning_rate": 4.222322343831213e-05, + "loss": 0.0, + "step": 49049 + }, + { + "epoch": 4.576840533731454, + "grad_norm": NaN, + "learning_rate": 4.221796285189594e-05, + "loss": 0.0, + "step": 49050 + }, + { + "epoch": 4.576933843426332, + "grad_norm": NaN, + "learning_rate": 4.2212702539537465e-05, + "loss": 0.0, + "step": 49051 + }, + { + "epoch": 4.577027153121209, + "grad_norm": NaN, + "learning_rate": 4.220744250125015e-05, + "loss": 0.0, + "step": 49052 + }, + { + "epoch": 4.577120462816087, + "grad_norm": NaN, + "learning_rate": 4.220218273704744e-05, + "loss": 0.0, + "step": 49053 + }, + { + "epoch": 4.577213772510964, + "grad_norm": NaN, + "learning_rate": 4.2196923246942615e-05, + "loss": 0.0, + "step": 49054 + }, + { + "epoch": 4.5773070822058415, + "grad_norm": NaN, + "learning_rate": 4.219166403094903e-05, + "loss": 0.0, + "step": 49055 + }, + { + "epoch": 4.577400391900719, + "grad_norm": NaN, + "learning_rate": 4.21864050890802e-05, + "loss": 0.0, + "step": 49056 + }, + { + "epoch": 4.577493701595595, + "grad_norm": NaN, + "learning_rate": 4.2181146421349364e-05, + "loss": 0.0, + "step": 49057 + }, + { + "epoch": 4.577587011290473, + "grad_norm": NaN, + "learning_rate": 4.2175888027769885e-05, + "loss": 0.0, + "step": 49058 + }, + { + "epoch": 4.57768032098535, + "grad_norm": NaN, + "learning_rate": 4.217062990835527e-05, + "loss": 0.0, + "step": 49059 + }, + { + "epoch": 4.577773630680228, + "grad_norm": NaN, + "learning_rate": 4.2165372063118765e-05, + "loss": 0.0, + "step": 49060 + }, + { + "epoch": 4.577866940375105, + "grad_norm": NaN, + "learning_rate": 4.2160114492073736e-05, + "loss": 0.0, + "step": 49061 + }, + { + "epoch": 4.5779602500699825, + "grad_norm": NaN, + "learning_rate": 4.215485719523367e-05, + "loss": 0.0, + "step": 49062 + }, + { + "epoch": 4.57805355976486, + "grad_norm": NaN, + "learning_rate": 4.214960017261177e-05, + "loss": 0.0, + "step": 49063 + }, + { + "epoch": 4.5781468694597365, + "grad_norm": NaN, + "learning_rate": 4.214434342422157e-05, + "loss": 0.0, + "step": 49064 + }, + { + "epoch": 4.578240179154614, + "grad_norm": NaN, + "learning_rate": 4.2139086950076314e-05, + "loss": 0.0, + "step": 49065 + }, + { + "epoch": 4.578333488849491, + "grad_norm": NaN, + "learning_rate": 4.213383075018938e-05, + "loss": 0.0, + "step": 49066 + }, + { + "epoch": 4.578426798544369, + "grad_norm": NaN, + "learning_rate": 4.2128574824574245e-05, + "loss": 0.0, + "step": 49067 + }, + { + "epoch": 4.578520108239246, + "grad_norm": NaN, + "learning_rate": 4.212331917324414e-05, + "loss": 0.0, + "step": 49068 + }, + { + "epoch": 4.578613417934124, + "grad_norm": NaN, + "learning_rate": 4.211806379621242e-05, + "loss": 0.0, + "step": 49069 + }, + { + "epoch": 4.578706727629001, + "grad_norm": NaN, + "learning_rate": 4.211280869349262e-05, + "loss": 0.0, + "step": 49070 + }, + { + "epoch": 4.578800037323878, + "grad_norm": NaN, + "learning_rate": 4.210755386509794e-05, + "loss": 0.0, + "step": 49071 + }, + { + "epoch": 4.578893347018755, + "grad_norm": NaN, + "learning_rate": 4.2102299311041736e-05, + "loss": 0.0, + "step": 49072 + }, + { + "epoch": 4.578986656713632, + "grad_norm": NaN, + "learning_rate": 4.209704503133751e-05, + "loss": 0.0, + "step": 49073 + }, + { + "epoch": 4.57907996640851, + "grad_norm": NaN, + "learning_rate": 4.20917910259985e-05, + "loss": 0.0, + "step": 49074 + }, + { + "epoch": 4.579173276103387, + "grad_norm": NaN, + "learning_rate": 4.208653729503806e-05, + "loss": 0.0, + "step": 49075 + }, + { + "epoch": 4.579266585798265, + "grad_norm": NaN, + "learning_rate": 4.2081283838469674e-05, + "loss": 0.0, + "step": 49076 + }, + { + "epoch": 4.579359895493142, + "grad_norm": NaN, + "learning_rate": 4.2076030656306576e-05, + "loss": 0.0, + "step": 49077 + }, + { + "epoch": 4.579453205188019, + "grad_norm": NaN, + "learning_rate": 4.2070777748562095e-05, + "loss": 0.0, + "step": 49078 + }, + { + "epoch": 4.579546514882896, + "grad_norm": NaN, + "learning_rate": 4.206552511524978e-05, + "loss": 0.0, + "step": 49079 + }, + { + "epoch": 4.579639824577773, + "grad_norm": NaN, + "learning_rate": 4.206027275638279e-05, + "loss": 0.0, + "step": 49080 + }, + { + "epoch": 4.579733134272651, + "grad_norm": NaN, + "learning_rate": 4.205502067197451e-05, + "loss": 0.0, + "step": 49081 + }, + { + "epoch": 4.579826443967528, + "grad_norm": NaN, + "learning_rate": 4.204976886203844e-05, + "loss": 0.0, + "step": 49082 + }, + { + "epoch": 4.579919753662406, + "grad_norm": NaN, + "learning_rate": 4.2044517326587776e-05, + "loss": 0.0, + "step": 49083 + }, + { + "epoch": 4.580013063357283, + "grad_norm": NaN, + "learning_rate": 4.203926606563588e-05, + "loss": 0.0, + "step": 49084 + }, + { + "epoch": 4.5801063730521605, + "grad_norm": NaN, + "learning_rate": 4.203401507919625e-05, + "loss": 0.0, + "step": 49085 + }, + { + "epoch": 4.580199682747037, + "grad_norm": NaN, + "learning_rate": 4.2028764367282086e-05, + "loss": 0.0, + "step": 49086 + }, + { + "epoch": 4.5802929924419145, + "grad_norm": NaN, + "learning_rate": 4.202351392990673e-05, + "loss": 0.0, + "step": 49087 + }, + { + "epoch": 4.580386302136792, + "grad_norm": NaN, + "learning_rate": 4.201826376708371e-05, + "loss": 0.0, + "step": 49088 + }, + { + "epoch": 4.580479611831669, + "grad_norm": NaN, + "learning_rate": 4.201301387882621e-05, + "loss": 0.0, + "step": 49089 + }, + { + "epoch": 4.580572921526547, + "grad_norm": NaN, + "learning_rate": 4.2007764265147567e-05, + "loss": 0.0, + "step": 49090 + }, + { + "epoch": 4.580666231221424, + "grad_norm": NaN, + "learning_rate": 4.2002514926061285e-05, + "loss": 0.0, + "step": 49091 + }, + { + "epoch": 4.5807595409163016, + "grad_norm": NaN, + "learning_rate": 4.199726586158058e-05, + "loss": 0.0, + "step": 49092 + }, + { + "epoch": 4.580852850611178, + "grad_norm": NaN, + "learning_rate": 4.199201707171877e-05, + "loss": 0.0, + "step": 49093 + }, + { + "epoch": 4.5809461603060555, + "grad_norm": NaN, + "learning_rate": 4.198676855648937e-05, + "loss": 0.0, + "step": 49094 + }, + { + "epoch": 4.581039470000933, + "grad_norm": NaN, + "learning_rate": 4.1981520315905585e-05, + "loss": 0.0, + "step": 49095 + }, + { + "epoch": 4.58113277969581, + "grad_norm": NaN, + "learning_rate": 4.197627234998074e-05, + "loss": 0.0, + "step": 49096 + }, + { + "epoch": 4.581226089390688, + "grad_norm": NaN, + "learning_rate": 4.1971024658728335e-05, + "loss": 0.0, + "step": 49097 + }, + { + "epoch": 4.581319399085565, + "grad_norm": NaN, + "learning_rate": 4.1965777242161516e-05, + "loss": 0.0, + "step": 49098 + }, + { + "epoch": 4.581412708780443, + "grad_norm": NaN, + "learning_rate": 4.1960530100293764e-05, + "loss": 0.0, + "step": 49099 + }, + { + "epoch": 4.58150601847532, + "grad_norm": NaN, + "learning_rate": 4.1955283233138434e-05, + "loss": 0.0, + "step": 49100 + }, + { + "epoch": 4.5815993281701966, + "grad_norm": NaN, + "learning_rate": 4.195003664070872e-05, + "loss": 0.0, + "step": 49101 + }, + { + "epoch": 4.581692637865074, + "grad_norm": NaN, + "learning_rate": 4.194479032301809e-05, + "loss": 0.0, + "step": 49102 + }, + { + "epoch": 4.581785947559951, + "grad_norm": NaN, + "learning_rate": 4.193954428007992e-05, + "loss": 0.0, + "step": 49103 + }, + { + "epoch": 4.581879257254829, + "grad_norm": NaN, + "learning_rate": 4.1934298511907374e-05, + "loss": 0.0, + "step": 49104 + }, + { + "epoch": 4.581972566949706, + "grad_norm": NaN, + "learning_rate": 4.192905301851395e-05, + "loss": 0.0, + "step": 49105 + }, + { + "epoch": 4.582065876644584, + "grad_norm": NaN, + "learning_rate": 4.192380779991296e-05, + "loss": 0.0, + "step": 49106 + }, + { + "epoch": 4.58215918633946, + "grad_norm": NaN, + "learning_rate": 4.1918562856117624e-05, + "loss": 0.0, + "step": 49107 + }, + { + "epoch": 4.582252496034338, + "grad_norm": NaN, + "learning_rate": 4.191331818714143e-05, + "loss": 0.0, + "step": 49108 + }, + { + "epoch": 4.582345805729215, + "grad_norm": NaN, + "learning_rate": 4.190807379299768e-05, + "loss": 0.0, + "step": 49109 + }, + { + "epoch": 4.582439115424092, + "grad_norm": NaN, + "learning_rate": 4.19028296736996e-05, + "loss": 0.0, + "step": 49110 + }, + { + "epoch": 4.58253242511897, + "grad_norm": NaN, + "learning_rate": 4.1897585829260674e-05, + "loss": 0.0, + "step": 49111 + }, + { + "epoch": 4.582625734813847, + "grad_norm": NaN, + "learning_rate": 4.189234225969414e-05, + "loss": 0.0, + "step": 49112 + }, + { + "epoch": 4.582719044508725, + "grad_norm": NaN, + "learning_rate": 4.1887098965013295e-05, + "loss": 0.0, + "step": 49113 + }, + { + "epoch": 4.582812354203602, + "grad_norm": NaN, + "learning_rate": 4.188185594523163e-05, + "loss": 0.0, + "step": 49114 + }, + { + "epoch": 4.5829056638984795, + "grad_norm": NaN, + "learning_rate": 4.1876613200362344e-05, + "loss": 0.0, + "step": 49115 + }, + { + "epoch": 4.582998973593356, + "grad_norm": NaN, + "learning_rate": 4.1871370730418735e-05, + "loss": 0.0, + "step": 49116 + }, + { + "epoch": 4.5830922832882335, + "grad_norm": NaN, + "learning_rate": 4.18661285354143e-05, + "loss": 0.0, + "step": 49117 + }, + { + "epoch": 4.583185592983111, + "grad_norm": NaN, + "learning_rate": 4.1860886615362226e-05, + "loss": 0.0, + "step": 49118 + }, + { + "epoch": 4.583278902677988, + "grad_norm": NaN, + "learning_rate": 4.1855644970275834e-05, + "loss": 0.0, + "step": 49119 + }, + { + "epoch": 4.583372212372866, + "grad_norm": NaN, + "learning_rate": 4.18504036001686e-05, + "loss": 0.0, + "step": 49120 + }, + { + "epoch": 4.583465522067743, + "grad_norm": NaN, + "learning_rate": 4.1845162505053715e-05, + "loss": 0.0, + "step": 49121 + }, + { + "epoch": 4.58355883176262, + "grad_norm": NaN, + "learning_rate": 4.1839921684944475e-05, + "loss": 0.0, + "step": 49122 + }, + { + "epoch": 4.583652141457497, + "grad_norm": NaN, + "learning_rate": 4.183468113985438e-05, + "loss": 0.0, + "step": 49123 + }, + { + "epoch": 4.5837454511523745, + "grad_norm": NaN, + "learning_rate": 4.182944086979662e-05, + "loss": 0.0, + "step": 49124 + }, + { + "epoch": 4.583838760847252, + "grad_norm": NaN, + "learning_rate": 4.182420087478447e-05, + "loss": 0.0, + "step": 49125 + }, + { + "epoch": 4.583932070542129, + "grad_norm": NaN, + "learning_rate": 4.181896115483145e-05, + "loss": 0.0, + "step": 49126 + }, + { + "epoch": 4.584025380237007, + "grad_norm": NaN, + "learning_rate": 4.181372170995071e-05, + "loss": 0.0, + "step": 49127 + }, + { + "epoch": 4.584118689931884, + "grad_norm": NaN, + "learning_rate": 4.1808482540155576e-05, + "loss": 0.0, + "step": 49128 + }, + { + "epoch": 4.584211999626762, + "grad_norm": NaN, + "learning_rate": 4.1803243645459534e-05, + "loss": 0.0, + "step": 49129 + }, + { + "epoch": 4.584305309321638, + "grad_norm": NaN, + "learning_rate": 4.179800502587573e-05, + "loss": 0.0, + "step": 49130 + }, + { + "epoch": 4.584398619016516, + "grad_norm": NaN, + "learning_rate": 4.1792766681417506e-05, + "loss": 0.0, + "step": 49131 + }, + { + "epoch": 4.584491928711393, + "grad_norm": NaN, + "learning_rate": 4.178752861209831e-05, + "loss": 0.0, + "step": 49132 + }, + { + "epoch": 4.58458523840627, + "grad_norm": NaN, + "learning_rate": 4.178229081793134e-05, + "loss": 0.0, + "step": 49133 + }, + { + "epoch": 4.584678548101148, + "grad_norm": NaN, + "learning_rate": 4.177705329892989e-05, + "loss": 0.0, + "step": 49134 + }, + { + "epoch": 4.584771857796025, + "grad_norm": NaN, + "learning_rate": 4.177181605510744e-05, + "loss": 0.0, + "step": 49135 + }, + { + "epoch": 4.584865167490903, + "grad_norm": NaN, + "learning_rate": 4.17665790864771e-05, + "loss": 0.0, + "step": 49136 + }, + { + "epoch": 4.584958477185779, + "grad_norm": NaN, + "learning_rate": 4.176134239305234e-05, + "loss": 0.0, + "step": 49137 + }, + { + "epoch": 4.585051786880657, + "grad_norm": NaN, + "learning_rate": 4.1756105974846463e-05, + "loss": 0.0, + "step": 49138 + }, + { + "epoch": 4.585145096575534, + "grad_norm": NaN, + "learning_rate": 4.175086983187266e-05, + "loss": 0.0, + "step": 49139 + }, + { + "epoch": 4.5852384062704115, + "grad_norm": NaN, + "learning_rate": 4.174563396414438e-05, + "loss": 0.0, + "step": 49140 + }, + { + "epoch": 4.585331715965289, + "grad_norm": NaN, + "learning_rate": 4.1740398371674925e-05, + "loss": 0.0, + "step": 49141 + }, + { + "epoch": 4.585425025660166, + "grad_norm": NaN, + "learning_rate": 4.1735163054477465e-05, + "loss": 0.0, + "step": 49142 + }, + { + "epoch": 4.585518335355044, + "grad_norm": NaN, + "learning_rate": 4.172992801256547e-05, + "loss": 0.0, + "step": 49143 + }, + { + "epoch": 4.585611645049921, + "grad_norm": NaN, + "learning_rate": 4.172469324595225e-05, + "loss": 0.0, + "step": 49144 + }, + { + "epoch": 4.585704954744798, + "grad_norm": NaN, + "learning_rate": 4.171945875465096e-05, + "loss": 0.0, + "step": 49145 + }, + { + "epoch": 4.585798264439675, + "grad_norm": NaN, + "learning_rate": 4.171422453867506e-05, + "loss": 0.0, + "step": 49146 + }, + { + "epoch": 4.5858915741345525, + "grad_norm": NaN, + "learning_rate": 4.170899059803785e-05, + "loss": 0.0, + "step": 49147 + }, + { + "epoch": 4.58598488382943, + "grad_norm": NaN, + "learning_rate": 4.170375693275251e-05, + "loss": 0.0, + "step": 49148 + }, + { + "epoch": 4.586078193524307, + "grad_norm": NaN, + "learning_rate": 4.169852354283249e-05, + "loss": 0.0, + "step": 49149 + }, + { + "epoch": 4.586171503219185, + "grad_norm": NaN, + "learning_rate": 4.1693290428291085e-05, + "loss": 0.0, + "step": 49150 + }, + { + "epoch": 4.586264812914061, + "grad_norm": NaN, + "learning_rate": 4.168805758914146e-05, + "loss": 0.0, + "step": 49151 + }, + { + "epoch": 4.586358122608939, + "grad_norm": NaN, + "learning_rate": 4.168282502539708e-05, + "loss": 0.0, + "step": 49152 + }, + { + "epoch": 4.586451432303816, + "grad_norm": NaN, + "learning_rate": 4.1677592737071225e-05, + "loss": 0.0, + "step": 49153 + }, + { + "epoch": 4.5865447419986936, + "grad_norm": NaN, + "learning_rate": 4.167236072417708e-05, + "loss": 0.0, + "step": 49154 + }, + { + "epoch": 4.586638051693571, + "grad_norm": NaN, + "learning_rate": 4.166712898672812e-05, + "loss": 0.0, + "step": 49155 + }, + { + "epoch": 4.586731361388448, + "grad_norm": NaN, + "learning_rate": 4.166189752473753e-05, + "loss": 0.0, + "step": 49156 + }, + { + "epoch": 4.586824671083326, + "grad_norm": NaN, + "learning_rate": 4.1656666338218566e-05, + "loss": 0.0, + "step": 49157 + }, + { + "epoch": 4.586917980778203, + "grad_norm": NaN, + "learning_rate": 4.165143542718472e-05, + "loss": 0.0, + "step": 49158 + }, + { + "epoch": 4.587011290473081, + "grad_norm": NaN, + "learning_rate": 4.164620479164912e-05, + "loss": 0.0, + "step": 49159 + }, + { + "epoch": 4.587104600167957, + "grad_norm": NaN, + "learning_rate": 4.16409744316251e-05, + "loss": 0.0, + "step": 49160 + }, + { + "epoch": 4.587197909862835, + "grad_norm": NaN, + "learning_rate": 4.163574434712604e-05, + "loss": 0.0, + "step": 49161 + }, + { + "epoch": 4.587291219557712, + "grad_norm": NaN, + "learning_rate": 4.163051453816517e-05, + "loss": 0.0, + "step": 49162 + }, + { + "epoch": 4.587384529252589, + "grad_norm": NaN, + "learning_rate": 4.162528500475574e-05, + "loss": 0.0, + "step": 49163 + }, + { + "epoch": 4.587477838947467, + "grad_norm": NaN, + "learning_rate": 4.1620055746911195e-05, + "loss": 0.0, + "step": 49164 + }, + { + "epoch": 4.587571148642344, + "grad_norm": NaN, + "learning_rate": 4.161482676464471e-05, + "loss": 0.0, + "step": 49165 + }, + { + "epoch": 4.587664458337221, + "grad_norm": NaN, + "learning_rate": 4.160959805796955e-05, + "loss": 0.0, + "step": 49166 + }, + { + "epoch": 4.587757768032098, + "grad_norm": NaN, + "learning_rate": 4.1604369626899194e-05, + "loss": 0.0, + "step": 49167 + }, + { + "epoch": 4.587851077726976, + "grad_norm": NaN, + "learning_rate": 4.1599141471446746e-05, + "loss": 0.0, + "step": 49168 + }, + { + "epoch": 4.587944387421853, + "grad_norm": NaN, + "learning_rate": 4.1593913591625535e-05, + "loss": 0.0, + "step": 49169 + }, + { + "epoch": 4.5880376971167305, + "grad_norm": NaN, + "learning_rate": 4.1588685987448976e-05, + "loss": 0.0, + "step": 49170 + }, + { + "epoch": 4.588131006811608, + "grad_norm": NaN, + "learning_rate": 4.158345865893024e-05, + "loss": 0.0, + "step": 49171 + }, + { + "epoch": 4.588224316506485, + "grad_norm": NaN, + "learning_rate": 4.157823160608259e-05, + "loss": 0.0, + "step": 49172 + }, + { + "epoch": 4.588317626201363, + "grad_norm": NaN, + "learning_rate": 4.157300482891948e-05, + "loss": 0.0, + "step": 49173 + }, + { + "epoch": 4.588410935896239, + "grad_norm": NaN, + "learning_rate": 4.156777832745402e-05, + "loss": 0.0, + "step": 49174 + }, + { + "epoch": 4.588504245591117, + "grad_norm": NaN, + "learning_rate": 4.156255210169962e-05, + "loss": 0.0, + "step": 49175 + }, + { + "epoch": 4.588597555285994, + "grad_norm": NaN, + "learning_rate": 4.155732615166957e-05, + "loss": 0.0, + "step": 49176 + }, + { + "epoch": 4.5886908649808715, + "grad_norm": NaN, + "learning_rate": 4.155210047737701e-05, + "loss": 0.0, + "step": 49177 + }, + { + "epoch": 4.588784174675749, + "grad_norm": NaN, + "learning_rate": 4.15468750788354e-05, + "loss": 0.0, + "step": 49178 + }, + { + "epoch": 4.588877484370626, + "grad_norm": NaN, + "learning_rate": 4.1541649956057996e-05, + "loss": 0.0, + "step": 49179 + }, + { + "epoch": 4.588970794065504, + "grad_norm": NaN, + "learning_rate": 4.1536425109057965e-05, + "loss": 0.0, + "step": 49180 + }, + { + "epoch": 4.58906410376038, + "grad_norm": NaN, + "learning_rate": 4.1531200537848716e-05, + "loss": 0.0, + "step": 49181 + }, + { + "epoch": 4.589157413455258, + "grad_norm": NaN, + "learning_rate": 4.152597624244355e-05, + "loss": 0.0, + "step": 49182 + }, + { + "epoch": 4.589250723150135, + "grad_norm": NaN, + "learning_rate": 4.1520752222855594e-05, + "loss": 0.0, + "step": 49183 + }, + { + "epoch": 4.589344032845013, + "grad_norm": NaN, + "learning_rate": 4.151552847909827e-05, + "loss": 0.0, + "step": 49184 + }, + { + "epoch": 4.58943734253989, + "grad_norm": NaN, + "learning_rate": 4.151030501118488e-05, + "loss": 0.0, + "step": 49185 + }, + { + "epoch": 4.589530652234767, + "grad_norm": NaN, + "learning_rate": 4.150508181912854e-05, + "loss": 0.0, + "step": 49186 + }, + { + "epoch": 4.589623961929645, + "grad_norm": NaN, + "learning_rate": 4.14998589029427e-05, + "loss": 0.0, + "step": 49187 + }, + { + "epoch": 4.589717271624522, + "grad_norm": NaN, + "learning_rate": 4.149463626264062e-05, + "loss": 0.0, + "step": 49188 + }, + { + "epoch": 4.589810581319399, + "grad_norm": NaN, + "learning_rate": 4.148941389823545e-05, + "loss": 0.0, + "step": 49189 + }, + { + "epoch": 4.589903891014276, + "grad_norm": NaN, + "learning_rate": 4.1484191809740594e-05, + "loss": 0.0, + "step": 49190 + }, + { + "epoch": 4.589997200709154, + "grad_norm": NaN, + "learning_rate": 4.147896999716935e-05, + "loss": 0.0, + "step": 49191 + }, + { + "epoch": 4.590090510404031, + "grad_norm": NaN, + "learning_rate": 4.147374846053484e-05, + "loss": 0.0, + "step": 49192 + }, + { + "epoch": 4.5901838200989085, + "grad_norm": NaN, + "learning_rate": 4.146852719985049e-05, + "loss": 0.0, + "step": 49193 + }, + { + "epoch": 4.590277129793786, + "grad_norm": NaN, + "learning_rate": 4.1463306215129567e-05, + "loss": 0.0, + "step": 49194 + }, + { + "epoch": 4.590370439488662, + "grad_norm": NaN, + "learning_rate": 4.145808550638522e-05, + "loss": 0.0, + "step": 49195 + }, + { + "epoch": 4.59046374918354, + "grad_norm": NaN, + "learning_rate": 4.1452865073630834e-05, + "loss": 0.0, + "step": 49196 + }, + { + "epoch": 4.590557058878417, + "grad_norm": NaN, + "learning_rate": 4.1447644916879724e-05, + "loss": 0.0, + "step": 49197 + }, + { + "epoch": 4.590650368573295, + "grad_norm": NaN, + "learning_rate": 4.1442425036145007e-05, + "loss": 0.0, + "step": 49198 + }, + { + "epoch": 4.590743678268172, + "grad_norm": NaN, + "learning_rate": 4.143720543144013e-05, + "loss": 0.0, + "step": 49199 + }, + { + "epoch": 4.5908369879630495, + "grad_norm": NaN, + "learning_rate": 4.143198610277823e-05, + "loss": 0.0, + "step": 49200 + }, + { + "epoch": 4.590930297657927, + "grad_norm": NaN, + "learning_rate": 4.14267670501726e-05, + "loss": 0.0, + "step": 49201 + }, + { + "epoch": 4.591023607352804, + "grad_norm": NaN, + "learning_rate": 4.142154827363663e-05, + "loss": 0.0, + "step": 49202 + }, + { + "epoch": 4.591116917047681, + "grad_norm": NaN, + "learning_rate": 4.141632977318343e-05, + "loss": 0.0, + "step": 49203 + }, + { + "epoch": 4.591210226742558, + "grad_norm": NaN, + "learning_rate": 4.1411111548826315e-05, + "loss": 0.0, + "step": 49204 + }, + { + "epoch": 4.591303536437436, + "grad_norm": NaN, + "learning_rate": 4.1405893600578666e-05, + "loss": 0.0, + "step": 49205 + }, + { + "epoch": 4.591396846132313, + "grad_norm": NaN, + "learning_rate": 4.1400675928453606e-05, + "loss": 0.0, + "step": 49206 + }, + { + "epoch": 4.5914901558271906, + "grad_norm": NaN, + "learning_rate": 4.139545853246442e-05, + "loss": 0.0, + "step": 49207 + }, + { + "epoch": 4.591583465522068, + "grad_norm": NaN, + "learning_rate": 4.139024141262451e-05, + "loss": 0.0, + "step": 49208 + }, + { + "epoch": 4.591676775216945, + "grad_norm": NaN, + "learning_rate": 4.138502456894698e-05, + "loss": 0.0, + "step": 49209 + }, + { + "epoch": 4.591770084911822, + "grad_norm": NaN, + "learning_rate": 4.1379808001445114e-05, + "loss": 0.0, + "step": 49210 + }, + { + "epoch": 4.591863394606699, + "grad_norm": NaN, + "learning_rate": 4.137459171013232e-05, + "loss": 0.0, + "step": 49211 + }, + { + "epoch": 4.591956704301577, + "grad_norm": NaN, + "learning_rate": 4.136937569502167e-05, + "loss": 0.0, + "step": 49212 + }, + { + "epoch": 4.592050013996454, + "grad_norm": NaN, + "learning_rate": 4.136415995612658e-05, + "loss": 0.0, + "step": 49213 + }, + { + "epoch": 4.592143323691332, + "grad_norm": NaN, + "learning_rate": 4.135894449346027e-05, + "loss": 0.0, + "step": 49214 + }, + { + "epoch": 4.592236633386209, + "grad_norm": NaN, + "learning_rate": 4.135372930703591e-05, + "loss": 0.0, + "step": 49215 + }, + { + "epoch": 4.592329943081086, + "grad_norm": NaN, + "learning_rate": 4.134851439686686e-05, + "loss": 0.0, + "step": 49216 + }, + { + "epoch": 4.592423252775964, + "grad_norm": NaN, + "learning_rate": 4.134329976296641e-05, + "loss": 0.0, + "step": 49217 + }, + { + "epoch": 4.59251656247084, + "grad_norm": NaN, + "learning_rate": 4.133808540534767e-05, + "loss": 0.0, + "step": 49218 + }, + { + "epoch": 4.592609872165718, + "grad_norm": NaN, + "learning_rate": 4.133287132402404e-05, + "loss": 0.0, + "step": 49219 + }, + { + "epoch": 4.592703181860595, + "grad_norm": NaN, + "learning_rate": 4.1327657519008776e-05, + "loss": 0.0, + "step": 49220 + }, + { + "epoch": 4.592796491555473, + "grad_norm": NaN, + "learning_rate": 4.132244399031499e-05, + "loss": 0.0, + "step": 49221 + }, + { + "epoch": 4.59288980125035, + "grad_norm": NaN, + "learning_rate": 4.131723073795608e-05, + "loss": 0.0, + "step": 49222 + }, + { + "epoch": 4.5929831109452275, + "grad_norm": NaN, + "learning_rate": 4.13120177619453e-05, + "loss": 0.0, + "step": 49223 + }, + { + "epoch": 4.593076420640104, + "grad_norm": NaN, + "learning_rate": 4.130680506229579e-05, + "loss": 0.0, + "step": 49224 + }, + { + "epoch": 4.593169730334981, + "grad_norm": NaN, + "learning_rate": 4.130159263902091e-05, + "loss": 0.0, + "step": 49225 + }, + { + "epoch": 4.593263040029859, + "grad_norm": NaN, + "learning_rate": 4.1296380492133934e-05, + "loss": 0.0, + "step": 49226 + }, + { + "epoch": 4.593356349724736, + "grad_norm": NaN, + "learning_rate": 4.1291168621647964e-05, + "loss": 0.0, + "step": 49227 + }, + { + "epoch": 4.593449659419614, + "grad_norm": NaN, + "learning_rate": 4.12859570275764e-05, + "loss": 0.0, + "step": 49228 + }, + { + "epoch": 4.593542969114491, + "grad_norm": NaN, + "learning_rate": 4.128074570993248e-05, + "loss": 0.0, + "step": 49229 + }, + { + "epoch": 4.5936362788093685, + "grad_norm": NaN, + "learning_rate": 4.1275534668729335e-05, + "loss": 0.0, + "step": 49230 + }, + { + "epoch": 4.593729588504246, + "grad_norm": NaN, + "learning_rate": 4.1270323903980334e-05, + "loss": 0.0, + "step": 49231 + }, + { + "epoch": 4.593822898199123, + "grad_norm": NaN, + "learning_rate": 4.126511341569875e-05, + "loss": 0.0, + "step": 49232 + }, + { + "epoch": 4.593916207894, + "grad_norm": NaN, + "learning_rate": 4.125990320389768e-05, + "loss": 0.0, + "step": 49233 + }, + { + "epoch": 4.594009517588877, + "grad_norm": NaN, + "learning_rate": 4.125469326859051e-05, + "loss": 0.0, + "step": 49234 + }, + { + "epoch": 4.594102827283755, + "grad_norm": NaN, + "learning_rate": 4.124948360979049e-05, + "loss": 0.0, + "step": 49235 + }, + { + "epoch": 4.594196136978632, + "grad_norm": NaN, + "learning_rate": 4.124427422751073e-05, + "loss": 0.0, + "step": 49236 + }, + { + "epoch": 4.59428944667351, + "grad_norm": NaN, + "learning_rate": 4.123906512176459e-05, + "loss": 0.0, + "step": 49237 + }, + { + "epoch": 4.594382756368387, + "grad_norm": NaN, + "learning_rate": 4.123385629256536e-05, + "loss": 0.0, + "step": 49238 + }, + { + "epoch": 4.5944760660632635, + "grad_norm": NaN, + "learning_rate": 4.12286477399261e-05, + "loss": 0.0, + "step": 49239 + }, + { + "epoch": 4.594569375758141, + "grad_norm": NaN, + "learning_rate": 4.122343946386024e-05, + "loss": 0.0, + "step": 49240 + }, + { + "epoch": 4.594662685453018, + "grad_norm": NaN, + "learning_rate": 4.1218231464380985e-05, + "loss": 0.0, + "step": 49241 + }, + { + "epoch": 4.594755995147896, + "grad_norm": NaN, + "learning_rate": 4.1213023741501454e-05, + "loss": 0.0, + "step": 49242 + }, + { + "epoch": 4.594849304842773, + "grad_norm": NaN, + "learning_rate": 4.120781629523506e-05, + "loss": 0.0, + "step": 49243 + }, + { + "epoch": 4.594942614537651, + "grad_norm": NaN, + "learning_rate": 4.120260912559493e-05, + "loss": 0.0, + "step": 49244 + }, + { + "epoch": 4.595035924232528, + "grad_norm": NaN, + "learning_rate": 4.1197402232594284e-05, + "loss": 0.0, + "step": 49245 + }, + { + "epoch": 4.5951292339274055, + "grad_norm": NaN, + "learning_rate": 4.11921956162465e-05, + "loss": 0.0, + "step": 49246 + }, + { + "epoch": 4.595222543622282, + "grad_norm": NaN, + "learning_rate": 4.1186989276564694e-05, + "loss": 0.0, + "step": 49247 + }, + { + "epoch": 4.595315853317159, + "grad_norm": NaN, + "learning_rate": 4.1181783213562105e-05, + "loss": 0.0, + "step": 49248 + }, + { + "epoch": 4.595409163012037, + "grad_norm": NaN, + "learning_rate": 4.11765774272521e-05, + "loss": 0.0, + "step": 49249 + }, + { + "epoch": 4.595502472706914, + "grad_norm": NaN, + "learning_rate": 4.117137191764771e-05, + "loss": 0.0, + "step": 49250 + }, + { + "epoch": 4.595595782401792, + "grad_norm": NaN, + "learning_rate": 4.116616668476234e-05, + "loss": 0.0, + "step": 49251 + }, + { + "epoch": 4.595689092096669, + "grad_norm": NaN, + "learning_rate": 4.116096172860921e-05, + "loss": 0.0, + "step": 49252 + }, + { + "epoch": 4.5957824017915465, + "grad_norm": NaN, + "learning_rate": 4.115575704920142e-05, + "loss": 0.0, + "step": 49253 + }, + { + "epoch": 4.595875711486423, + "grad_norm": NaN, + "learning_rate": 4.1150552646552345e-05, + "loss": 0.0, + "step": 49254 + }, + { + "epoch": 4.5959690211813005, + "grad_norm": NaN, + "learning_rate": 4.114534852067523e-05, + "loss": 0.0, + "step": 49255 + }, + { + "epoch": 4.596062330876178, + "grad_norm": NaN, + "learning_rate": 4.114014467158316e-05, + "loss": 0.0, + "step": 49256 + }, + { + "epoch": 4.596155640571055, + "grad_norm": NaN, + "learning_rate": 4.1134941099289484e-05, + "loss": 0.0, + "step": 49257 + }, + { + "epoch": 4.596248950265933, + "grad_norm": NaN, + "learning_rate": 4.1129737803807455e-05, + "loss": 0.0, + "step": 49258 + }, + { + "epoch": 4.59634225996081, + "grad_norm": NaN, + "learning_rate": 4.112453478515017e-05, + "loss": 0.0, + "step": 49259 + }, + { + "epoch": 4.596435569655688, + "grad_norm": NaN, + "learning_rate": 4.111933204333097e-05, + "loss": 0.0, + "step": 49260 + }, + { + "epoch": 4.596528879350565, + "grad_norm": NaN, + "learning_rate": 4.1114129578363115e-05, + "loss": 0.0, + "step": 49261 + }, + { + "epoch": 4.5966221890454415, + "grad_norm": NaN, + "learning_rate": 4.1108927390259686e-05, + "loss": 0.0, + "step": 49262 + }, + { + "epoch": 4.596715498740319, + "grad_norm": NaN, + "learning_rate": 4.1103725479034034e-05, + "loss": 0.0, + "step": 49263 + }, + { + "epoch": 4.596808808435196, + "grad_norm": NaN, + "learning_rate": 4.109852384469941e-05, + "loss": 0.0, + "step": 49264 + }, + { + "epoch": 4.596902118130074, + "grad_norm": NaN, + "learning_rate": 4.109332248726888e-05, + "loss": 0.0, + "step": 49265 + }, + { + "epoch": 4.596995427824951, + "grad_norm": NaN, + "learning_rate": 4.10881214067558e-05, + "loss": 0.0, + "step": 49266 + }, + { + "epoch": 4.597088737519829, + "grad_norm": NaN, + "learning_rate": 4.108292060317342e-05, + "loss": 0.0, + "step": 49267 + }, + { + "epoch": 4.597182047214705, + "grad_norm": NaN, + "learning_rate": 4.107772007653483e-05, + "loss": 0.0, + "step": 49268 + }, + { + "epoch": 4.5972753569095826, + "grad_norm": NaN, + "learning_rate": 4.107251982685338e-05, + "loss": 0.0, + "step": 49269 + }, + { + "epoch": 4.59736866660446, + "grad_norm": NaN, + "learning_rate": 4.106731985414229e-05, + "loss": 0.0, + "step": 49270 + }, + { + "epoch": 4.597461976299337, + "grad_norm": NaN, + "learning_rate": 4.106212015841464e-05, + "loss": 0.0, + "step": 49271 + }, + { + "epoch": 4.597555285994215, + "grad_norm": NaN, + "learning_rate": 4.105692073968379e-05, + "loss": 0.0, + "step": 49272 + }, + { + "epoch": 4.597648595689092, + "grad_norm": NaN, + "learning_rate": 4.105172159796296e-05, + "loss": 0.0, + "step": 49273 + }, + { + "epoch": 4.59774190538397, + "grad_norm": NaN, + "learning_rate": 4.104652273326525e-05, + "loss": 0.0, + "step": 49274 + }, + { + "epoch": 4.597835215078847, + "grad_norm": NaN, + "learning_rate": 4.1041324145604e-05, + "loss": 0.0, + "step": 49275 + }, + { + "epoch": 4.597928524773724, + "grad_norm": NaN, + "learning_rate": 4.103612583499244e-05, + "loss": 0.0, + "step": 49276 + }, + { + "epoch": 4.598021834468601, + "grad_norm": NaN, + "learning_rate": 4.103092780144363e-05, + "loss": 0.0, + "step": 49277 + }, + { + "epoch": 4.598115144163478, + "grad_norm": NaN, + "learning_rate": 4.102573004497095e-05, + "loss": 0.0, + "step": 49278 + }, + { + "epoch": 4.598208453858356, + "grad_norm": NaN, + "learning_rate": 4.102053256558761e-05, + "loss": 0.0, + "step": 49279 + }, + { + "epoch": 4.598301763553233, + "grad_norm": NaN, + "learning_rate": 4.101533536330667e-05, + "loss": 0.0, + "step": 49280 + }, + { + "epoch": 4.598395073248111, + "grad_norm": NaN, + "learning_rate": 4.101013843814148e-05, + "loss": 0.0, + "step": 49281 + }, + { + "epoch": 4.598488382942988, + "grad_norm": NaN, + "learning_rate": 4.100494179010529e-05, + "loss": 0.0, + "step": 49282 + }, + { + "epoch": 4.598581692637865, + "grad_norm": NaN, + "learning_rate": 4.0999745419211144e-05, + "loss": 0.0, + "step": 49283 + }, + { + "epoch": 4.598675002332742, + "grad_norm": NaN, + "learning_rate": 4.099454932547241e-05, + "loss": 0.0, + "step": 49284 + }, + { + "epoch": 4.5987683120276195, + "grad_norm": NaN, + "learning_rate": 4.098935350890229e-05, + "loss": 0.0, + "step": 49285 + }, + { + "epoch": 4.598861621722497, + "grad_norm": NaN, + "learning_rate": 4.098415796951386e-05, + "loss": 0.0, + "step": 49286 + }, + { + "epoch": 4.598954931417374, + "grad_norm": NaN, + "learning_rate": 4.0978962707320464e-05, + "loss": 0.0, + "step": 49287 + }, + { + "epoch": 4.599048241112252, + "grad_norm": NaN, + "learning_rate": 4.097376772233527e-05, + "loss": 0.0, + "step": 49288 + }, + { + "epoch": 4.599141550807129, + "grad_norm": NaN, + "learning_rate": 4.096857301457151e-05, + "loss": 0.0, + "step": 49289 + }, + { + "epoch": 4.599234860502007, + "grad_norm": NaN, + "learning_rate": 4.09633785840424e-05, + "loss": 0.0, + "step": 49290 + }, + { + "epoch": 4.599328170196883, + "grad_norm": NaN, + "learning_rate": 4.095818443076103e-05, + "loss": 0.0, + "step": 49291 + }, + { + "epoch": 4.5994214798917605, + "grad_norm": NaN, + "learning_rate": 4.095299055474075e-05, + "loss": 0.0, + "step": 49292 + }, + { + "epoch": 4.599514789586638, + "grad_norm": NaN, + "learning_rate": 4.094779695599474e-05, + "loss": 0.0, + "step": 49293 + }, + { + "epoch": 4.599608099281515, + "grad_norm": NaN, + "learning_rate": 4.09426036345361e-05, + "loss": 0.0, + "step": 49294 + }, + { + "epoch": 4.599701408976393, + "grad_norm": NaN, + "learning_rate": 4.093741059037816e-05, + "loss": 0.0, + "step": 49295 + }, + { + "epoch": 4.59979471867127, + "grad_norm": NaN, + "learning_rate": 4.0932217823534126e-05, + "loss": 0.0, + "step": 49296 + }, + { + "epoch": 4.599888028366148, + "grad_norm": NaN, + "learning_rate": 4.092702533401705e-05, + "loss": 0.0, + "step": 49297 + }, + { + "epoch": 4.599981338061024, + "grad_norm": NaN, + "learning_rate": 4.092183312184029e-05, + "loss": 0.0, + "step": 49298 + }, + { + "epoch": 4.600074647755902, + "grad_norm": NaN, + "learning_rate": 4.0916641187017054e-05, + "loss": 0.0, + "step": 49299 + }, + { + "epoch": 4.600167957450779, + "grad_norm": NaN, + "learning_rate": 4.091144952956039e-05, + "loss": 0.0, + "step": 49300 + }, + { + "epoch": 4.600261267145656, + "grad_norm": NaN, + "learning_rate": 4.090625814948363e-05, + "loss": 0.0, + "step": 49301 + }, + { + "epoch": 4.600354576840534, + "grad_norm": NaN, + "learning_rate": 4.090106704679999e-05, + "loss": 0.0, + "step": 49302 + }, + { + "epoch": 4.600447886535411, + "grad_norm": NaN, + "learning_rate": 4.0895876221522545e-05, + "loss": 0.0, + "step": 49303 + }, + { + "epoch": 4.600541196230289, + "grad_norm": NaN, + "learning_rate": 4.0890685673664585e-05, + "loss": 0.0, + "step": 49304 + }, + { + "epoch": 4.600634505925166, + "grad_norm": NaN, + "learning_rate": 4.088549540323935e-05, + "loss": 0.0, + "step": 49305 + }, + { + "epoch": 4.600727815620043, + "grad_norm": NaN, + "learning_rate": 4.088030541025989e-05, + "loss": 0.0, + "step": 49306 + }, + { + "epoch": 4.60082112531492, + "grad_norm": NaN, + "learning_rate": 4.0875115694739546e-05, + "loss": 0.0, + "step": 49307 + }, + { + "epoch": 4.6009144350097975, + "grad_norm": NaN, + "learning_rate": 4.086992625669148e-05, + "loss": 0.0, + "step": 49308 + }, + { + "epoch": 4.601007744704675, + "grad_norm": NaN, + "learning_rate": 4.086473709612878e-05, + "loss": 0.0, + "step": 49309 + }, + { + "epoch": 4.601101054399552, + "grad_norm": NaN, + "learning_rate": 4.085954821306478e-05, + "loss": 0.0, + "step": 49310 + }, + { + "epoch": 4.60119436409443, + "grad_norm": NaN, + "learning_rate": 4.085435960751266e-05, + "loss": 0.0, + "step": 49311 + }, + { + "epoch": 4.601287673789306, + "grad_norm": NaN, + "learning_rate": 4.084917127948548e-05, + "loss": 0.0, + "step": 49312 + }, + { + "epoch": 4.601380983484184, + "grad_norm": NaN, + "learning_rate": 4.084398322899657e-05, + "loss": 0.0, + "step": 49313 + }, + { + "epoch": 4.601474293179061, + "grad_norm": NaN, + "learning_rate": 4.083879545605912e-05, + "loss": 0.0, + "step": 49314 + }, + { + "epoch": 4.6015676028739385, + "grad_norm": NaN, + "learning_rate": 4.0833607960686196e-05, + "loss": 0.0, + "step": 49315 + }, + { + "epoch": 4.601660912568816, + "grad_norm": NaN, + "learning_rate": 4.082842074289111e-05, + "loss": 0.0, + "step": 49316 + }, + { + "epoch": 4.601754222263693, + "grad_norm": NaN, + "learning_rate": 4.082323380268704e-05, + "loss": 0.0, + "step": 49317 + }, + { + "epoch": 4.601847531958571, + "grad_norm": NaN, + "learning_rate": 4.081804714008708e-05, + "loss": 0.0, + "step": 49318 + }, + { + "epoch": 4.601940841653448, + "grad_norm": NaN, + "learning_rate": 4.081286075510452e-05, + "loss": 0.0, + "step": 49319 + }, + { + "epoch": 4.602034151348325, + "grad_norm": NaN, + "learning_rate": 4.080767464775256e-05, + "loss": 0.0, + "step": 49320 + }, + { + "epoch": 4.602127461043202, + "grad_norm": NaN, + "learning_rate": 4.0802488818044236e-05, + "loss": 0.0, + "step": 49321 + }, + { + "epoch": 4.60222077073808, + "grad_norm": NaN, + "learning_rate": 4.079730326599288e-05, + "loss": 0.0, + "step": 49322 + }, + { + "epoch": 4.602314080432957, + "grad_norm": NaN, + "learning_rate": 4.0792117991611644e-05, + "loss": 0.0, + "step": 49323 + }, + { + "epoch": 4.602407390127834, + "grad_norm": NaN, + "learning_rate": 4.07869329949137e-05, + "loss": 0.0, + "step": 49324 + }, + { + "epoch": 4.602500699822712, + "grad_norm": NaN, + "learning_rate": 4.078174827591224e-05, + "loss": 0.0, + "step": 49325 + }, + { + "epoch": 4.602594009517589, + "grad_norm": NaN, + "learning_rate": 4.0776563834620436e-05, + "loss": 0.0, + "step": 49326 + }, + { + "epoch": 4.602687319212466, + "grad_norm": NaN, + "learning_rate": 4.0771379671051474e-05, + "loss": 0.0, + "step": 49327 + }, + { + "epoch": 4.602780628907343, + "grad_norm": NaN, + "learning_rate": 4.0766195785218545e-05, + "loss": 0.0, + "step": 49328 + }, + { + "epoch": 4.602873938602221, + "grad_norm": NaN, + "learning_rate": 4.0761012177134814e-05, + "loss": 0.0, + "step": 49329 + }, + { + "epoch": 4.602967248297098, + "grad_norm": NaN, + "learning_rate": 4.075582884681347e-05, + "loss": 0.0, + "step": 49330 + }, + { + "epoch": 4.603060557991975, + "grad_norm": NaN, + "learning_rate": 4.07506457942677e-05, + "loss": 0.0, + "step": 49331 + }, + { + "epoch": 4.603153867686853, + "grad_norm": NaN, + "learning_rate": 4.074546301951067e-05, + "loss": 0.0, + "step": 49332 + }, + { + "epoch": 4.60324717738173, + "grad_norm": NaN, + "learning_rate": 4.0740280522555556e-05, + "loss": 0.0, + "step": 49333 + }, + { + "epoch": 4.603340487076608, + "grad_norm": NaN, + "learning_rate": 4.0735098303415594e-05, + "loss": 0.0, + "step": 49334 + }, + { + "epoch": 4.603433796771484, + "grad_norm": NaN, + "learning_rate": 4.072991636210383e-05, + "loss": 0.0, + "step": 49335 + }, + { + "epoch": 4.603527106466362, + "grad_norm": NaN, + "learning_rate": 4.072473469863357e-05, + "loss": 0.0, + "step": 49336 + }, + { + "epoch": 4.603620416161239, + "grad_norm": NaN, + "learning_rate": 4.071955331301797e-05, + "loss": 0.0, + "step": 49337 + }, + { + "epoch": 4.6037137258561165, + "grad_norm": NaN, + "learning_rate": 4.071437220527009e-05, + "loss": 0.0, + "step": 49338 + }, + { + "epoch": 4.603807035550994, + "grad_norm": NaN, + "learning_rate": 4.070919137540323e-05, + "loss": 0.0, + "step": 49339 + }, + { + "epoch": 4.603900345245871, + "grad_norm": NaN, + "learning_rate": 4.070401082343057e-05, + "loss": 0.0, + "step": 49340 + }, + { + "epoch": 4.603993654940748, + "grad_norm": NaN, + "learning_rate": 4.0698830549365145e-05, + "loss": 0.0, + "step": 49341 + }, + { + "epoch": 4.604086964635625, + "grad_norm": NaN, + "learning_rate": 4.069365055322026e-05, + "loss": 0.0, + "step": 49342 + }, + { + "epoch": 4.604180274330503, + "grad_norm": NaN, + "learning_rate": 4.068847083500908e-05, + "loss": 0.0, + "step": 49343 + }, + { + "epoch": 4.60427358402538, + "grad_norm": NaN, + "learning_rate": 4.068329139474467e-05, + "loss": 0.0, + "step": 49344 + }, + { + "epoch": 4.6043668937202575, + "grad_norm": NaN, + "learning_rate": 4.067811223244029e-05, + "loss": 0.0, + "step": 49345 + }, + { + "epoch": 4.604460203415135, + "grad_norm": NaN, + "learning_rate": 4.067293334810914e-05, + "loss": 0.0, + "step": 49346 + }, + { + "epoch": 4.604553513110012, + "grad_norm": NaN, + "learning_rate": 4.0667754741764246e-05, + "loss": 0.0, + "step": 49347 + }, + { + "epoch": 4.60464682280489, + "grad_norm": NaN, + "learning_rate": 4.0662576413418926e-05, + "loss": 0.0, + "step": 49348 + }, + { + "epoch": 4.604740132499767, + "grad_norm": NaN, + "learning_rate": 4.065739836308631e-05, + "loss": 0.0, + "step": 49349 + }, + { + "epoch": 4.604833442194644, + "grad_norm": NaN, + "learning_rate": 4.065222059077946e-05, + "loss": 0.0, + "step": 49350 + }, + { + "epoch": 4.604926751889521, + "grad_norm": NaN, + "learning_rate": 4.0647043096511656e-05, + "loss": 0.0, + "step": 49351 + }, + { + "epoch": 4.605020061584399, + "grad_norm": NaN, + "learning_rate": 4.0641865880296086e-05, + "loss": 0.0, + "step": 49352 + }, + { + "epoch": 4.605113371279276, + "grad_norm": NaN, + "learning_rate": 4.063668894214576e-05, + "loss": 0.0, + "step": 49353 + }, + { + "epoch": 4.605206680974153, + "grad_norm": NaN, + "learning_rate": 4.0631512282073984e-05, + "loss": 0.0, + "step": 49354 + }, + { + "epoch": 4.605299990669031, + "grad_norm": NaN, + "learning_rate": 4.0626335900093913e-05, + "loss": 0.0, + "step": 49355 + }, + { + "epoch": 4.605393300363907, + "grad_norm": NaN, + "learning_rate": 4.06211597962186e-05, + "loss": 0.0, + "step": 49356 + }, + { + "epoch": 4.605486610058785, + "grad_norm": NaN, + "learning_rate": 4.061598397046131e-05, + "loss": 0.0, + "step": 49357 + }, + { + "epoch": 4.605579919753662, + "grad_norm": NaN, + "learning_rate": 4.061080842283521e-05, + "loss": 0.0, + "step": 49358 + }, + { + "epoch": 4.60567322944854, + "grad_norm": NaN, + "learning_rate": 4.060563315335335e-05, + "loss": 0.0, + "step": 49359 + }, + { + "epoch": 4.605766539143417, + "grad_norm": NaN, + "learning_rate": 4.0600458162028984e-05, + "loss": 0.0, + "step": 49360 + }, + { + "epoch": 4.6058598488382945, + "grad_norm": NaN, + "learning_rate": 4.059528344887526e-05, + "loss": 0.0, + "step": 49361 + }, + { + "epoch": 4.605953158533172, + "grad_norm": NaN, + "learning_rate": 4.059010901390531e-05, + "loss": 0.0, + "step": 49362 + }, + { + "epoch": 4.606046468228049, + "grad_norm": NaN, + "learning_rate": 4.058493485713232e-05, + "loss": 0.0, + "step": 49363 + }, + { + "epoch": 4.606139777922926, + "grad_norm": NaN, + "learning_rate": 4.057976097856941e-05, + "loss": 0.0, + "step": 49364 + }, + { + "epoch": 4.606233087617803, + "grad_norm": NaN, + "learning_rate": 4.057458737822977e-05, + "loss": 0.0, + "step": 49365 + }, + { + "epoch": 4.606326397312681, + "grad_norm": NaN, + "learning_rate": 4.0569414056126534e-05, + "loss": 0.0, + "step": 49366 + }, + { + "epoch": 4.606419707007558, + "grad_norm": NaN, + "learning_rate": 4.0564241012272856e-05, + "loss": 0.0, + "step": 49367 + }, + { + "epoch": 4.6065130167024355, + "grad_norm": NaN, + "learning_rate": 4.055906824668191e-05, + "loss": 0.0, + "step": 49368 + }, + { + "epoch": 4.606606326397313, + "grad_norm": NaN, + "learning_rate": 4.055389575936683e-05, + "loss": 0.0, + "step": 49369 + }, + { + "epoch": 4.60669963609219, + "grad_norm": NaN, + "learning_rate": 4.0548723550340775e-05, + "loss": 0.0, + "step": 49370 + }, + { + "epoch": 4.606792945787067, + "grad_norm": NaN, + "learning_rate": 4.054355161961688e-05, + "loss": 0.0, + "step": 49371 + }, + { + "epoch": 4.606886255481944, + "grad_norm": NaN, + "learning_rate": 4.053837996720833e-05, + "loss": 0.0, + "step": 49372 + }, + { + "epoch": 4.606979565176822, + "grad_norm": NaN, + "learning_rate": 4.053320859312824e-05, + "loss": 0.0, + "step": 49373 + }, + { + "epoch": 4.607072874871699, + "grad_norm": NaN, + "learning_rate": 4.052803749738978e-05, + "loss": 0.0, + "step": 49374 + }, + { + "epoch": 4.607166184566577, + "grad_norm": NaN, + "learning_rate": 4.052286668000609e-05, + "loss": 0.0, + "step": 49375 + }, + { + "epoch": 4.607259494261454, + "grad_norm": NaN, + "learning_rate": 4.051769614099033e-05, + "loss": 0.0, + "step": 49376 + }, + { + "epoch": 4.607352803956331, + "grad_norm": NaN, + "learning_rate": 4.051252588035562e-05, + "loss": 0.0, + "step": 49377 + }, + { + "epoch": 4.607446113651209, + "grad_norm": NaN, + "learning_rate": 4.0507355898115175e-05, + "loss": 0.0, + "step": 49378 + }, + { + "epoch": 4.607539423346085, + "grad_norm": NaN, + "learning_rate": 4.0502186194282004e-05, + "loss": 0.0, + "step": 49379 + }, + { + "epoch": 4.607632733040963, + "grad_norm": NaN, + "learning_rate": 4.049701676886939e-05, + "loss": 0.0, + "step": 49380 + }, + { + "epoch": 4.60772604273584, + "grad_norm": NaN, + "learning_rate": 4.0491847621890464e-05, + "loss": 0.0, + "step": 49381 + }, + { + "epoch": 4.607819352430718, + "grad_norm": NaN, + "learning_rate": 4.0486678753358245e-05, + "loss": 0.0, + "step": 49382 + }, + { + "epoch": 4.607912662125595, + "grad_norm": NaN, + "learning_rate": 4.048151016328599e-05, + "loss": 0.0, + "step": 49383 + }, + { + "epoch": 4.6080059718204724, + "grad_norm": NaN, + "learning_rate": 4.047634185168687e-05, + "loss": 0.0, + "step": 49384 + }, + { + "epoch": 4.608099281515349, + "grad_norm": NaN, + "learning_rate": 4.047117381857388e-05, + "loss": 0.0, + "step": 49385 + }, + { + "epoch": 4.608192591210226, + "grad_norm": NaN, + "learning_rate": 4.0466006063960296e-05, + "loss": 0.0, + "step": 49386 + }, + { + "epoch": 4.608285900905104, + "grad_norm": NaN, + "learning_rate": 4.046083858785926e-05, + "loss": 0.0, + "step": 49387 + }, + { + "epoch": 4.608379210599981, + "grad_norm": NaN, + "learning_rate": 4.045567139028375e-05, + "loss": 0.0, + "step": 49388 + }, + { + "epoch": 4.608472520294859, + "grad_norm": NaN, + "learning_rate": 4.04505044712471e-05, + "loss": 0.0, + "step": 49389 + }, + { + "epoch": 4.608565829989736, + "grad_norm": NaN, + "learning_rate": 4.0445337830762385e-05, + "loss": 0.0, + "step": 49390 + }, + { + "epoch": 4.6086591396846135, + "grad_norm": NaN, + "learning_rate": 4.044017146884263e-05, + "loss": 0.0, + "step": 49391 + }, + { + "epoch": 4.608752449379491, + "grad_norm": NaN, + "learning_rate": 4.0435005385501136e-05, + "loss": 0.0, + "step": 49392 + }, + { + "epoch": 4.608845759074367, + "grad_norm": NaN, + "learning_rate": 4.0429839580750987e-05, + "loss": 0.0, + "step": 49393 + }, + { + "epoch": 4.608939068769245, + "grad_norm": NaN, + "learning_rate": 4.0424674054605214e-05, + "loss": 0.0, + "step": 49394 + }, + { + "epoch": 4.609032378464122, + "grad_norm": NaN, + "learning_rate": 4.041950880707709e-05, + "loss": 0.0, + "step": 49395 + }, + { + "epoch": 4.609125688159, + "grad_norm": NaN, + "learning_rate": 4.041434383817975e-05, + "loss": 0.0, + "step": 49396 + }, + { + "epoch": 4.609218997853877, + "grad_norm": NaN, + "learning_rate": 4.040917914792616e-05, + "loss": 0.0, + "step": 49397 + }, + { + "epoch": 4.6093123075487545, + "grad_norm": NaN, + "learning_rate": 4.0404014736329624e-05, + "loss": 0.0, + "step": 49398 + }, + { + "epoch": 4.609405617243632, + "grad_norm": NaN, + "learning_rate": 4.039885060340321e-05, + "loss": 0.0, + "step": 49399 + }, + { + "epoch": 4.6094989269385085, + "grad_norm": NaN, + "learning_rate": 4.039368674916004e-05, + "loss": 0.0, + "step": 49400 + }, + { + "epoch": 4.609592236633386, + "grad_norm": NaN, + "learning_rate": 4.0388523173613264e-05, + "loss": 0.0, + "step": 49401 + }, + { + "epoch": 4.609685546328263, + "grad_norm": NaN, + "learning_rate": 4.038335987677601e-05, + "loss": 0.0, + "step": 49402 + }, + { + "epoch": 4.609778856023141, + "grad_norm": NaN, + "learning_rate": 4.03781968586614e-05, + "loss": 0.0, + "step": 49403 + }, + { + "epoch": 4.609872165718018, + "grad_norm": NaN, + "learning_rate": 4.037303411928257e-05, + "loss": 0.0, + "step": 49404 + }, + { + "epoch": 4.609965475412896, + "grad_norm": NaN, + "learning_rate": 4.036787165865262e-05, + "loss": 0.0, + "step": 49405 + }, + { + "epoch": 4.610058785107773, + "grad_norm": NaN, + "learning_rate": 4.036270947678472e-05, + "loss": 0.0, + "step": 49406 + }, + { + "epoch": 4.61015209480265, + "grad_norm": NaN, + "learning_rate": 4.035754757369195e-05, + "loss": 0.0, + "step": 49407 + }, + { + "epoch": 4.610245404497527, + "grad_norm": NaN, + "learning_rate": 4.0352385949387476e-05, + "loss": 0.0, + "step": 49408 + }, + { + "epoch": 4.610338714192404, + "grad_norm": NaN, + "learning_rate": 4.0347224603884406e-05, + "loss": 0.0, + "step": 49409 + }, + { + "epoch": 4.610432023887282, + "grad_norm": NaN, + "learning_rate": 4.034206353719585e-05, + "loss": 0.0, + "step": 49410 + }, + { + "epoch": 4.610525333582159, + "grad_norm": NaN, + "learning_rate": 4.033690274933494e-05, + "loss": 0.0, + "step": 49411 + }, + { + "epoch": 4.610618643277037, + "grad_norm": NaN, + "learning_rate": 4.033174224031482e-05, + "loss": 0.0, + "step": 49412 + }, + { + "epoch": 4.610711952971914, + "grad_norm": NaN, + "learning_rate": 4.0326582010148585e-05, + "loss": 0.0, + "step": 49413 + }, + { + "epoch": 4.610805262666791, + "grad_norm": NaN, + "learning_rate": 4.0321422058849345e-05, + "loss": 0.0, + "step": 49414 + }, + { + "epoch": 4.610898572361668, + "grad_norm": NaN, + "learning_rate": 4.031626238643026e-05, + "loss": 0.0, + "step": 49415 + }, + { + "epoch": 4.610991882056545, + "grad_norm": NaN, + "learning_rate": 4.0311102992904435e-05, + "loss": 0.0, + "step": 49416 + }, + { + "epoch": 4.611085191751423, + "grad_norm": NaN, + "learning_rate": 4.030594387828497e-05, + "loss": 0.0, + "step": 49417 + }, + { + "epoch": 4.6111785014463, + "grad_norm": NaN, + "learning_rate": 4.030078504258499e-05, + "loss": 0.0, + "step": 49418 + }, + { + "epoch": 4.611271811141178, + "grad_norm": NaN, + "learning_rate": 4.029562648581763e-05, + "loss": 0.0, + "step": 49419 + }, + { + "epoch": 4.611365120836055, + "grad_norm": NaN, + "learning_rate": 4.0290468207996e-05, + "loss": 0.0, + "step": 49420 + }, + { + "epoch": 4.6114584305309325, + "grad_norm": NaN, + "learning_rate": 4.0285310209133207e-05, + "loss": 0.0, + "step": 49421 + }, + { + "epoch": 4.61155174022581, + "grad_norm": NaN, + "learning_rate": 4.028015248924236e-05, + "loss": 0.0, + "step": 49422 + }, + { + "epoch": 4.6116450499206865, + "grad_norm": NaN, + "learning_rate": 4.0274995048336595e-05, + "loss": 0.0, + "step": 49423 + }, + { + "epoch": 4.611738359615564, + "grad_norm": NaN, + "learning_rate": 4.0269837886429005e-05, + "loss": 0.0, + "step": 49424 + }, + { + "epoch": 4.611831669310441, + "grad_norm": NaN, + "learning_rate": 4.026468100353277e-05, + "loss": 0.0, + "step": 49425 + }, + { + "epoch": 4.611924979005319, + "grad_norm": NaN, + "learning_rate": 4.025952439966085e-05, + "loss": 0.0, + "step": 49426 + }, + { + "epoch": 4.612018288700196, + "grad_norm": NaN, + "learning_rate": 4.025436807482649e-05, + "loss": 0.0, + "step": 49427 + }, + { + "epoch": 4.612111598395074, + "grad_norm": NaN, + "learning_rate": 4.024921202904281e-05, + "loss": 0.0, + "step": 49428 + }, + { + "epoch": 4.61220490808995, + "grad_norm": NaN, + "learning_rate": 4.0244056262322786e-05, + "loss": 0.0, + "step": 49429 + }, + { + "epoch": 4.6122982177848275, + "grad_norm": NaN, + "learning_rate": 4.023890077467966e-05, + "loss": 0.0, + "step": 49430 + }, + { + "epoch": 4.612391527479705, + "grad_norm": NaN, + "learning_rate": 4.023374556612655e-05, + "loss": 0.0, + "step": 49431 + }, + { + "epoch": 4.612484837174582, + "grad_norm": NaN, + "learning_rate": 4.02285906366764e-05, + "loss": 0.0, + "step": 49432 + }, + { + "epoch": 4.61257814686946, + "grad_norm": NaN, + "learning_rate": 4.022343598634249e-05, + "loss": 0.0, + "step": 49433 + }, + { + "epoch": 4.612671456564337, + "grad_norm": NaN, + "learning_rate": 4.021828161513791e-05, + "loss": 0.0, + "step": 49434 + }, + { + "epoch": 4.612764766259215, + "grad_norm": NaN, + "learning_rate": 4.021312752307562e-05, + "loss": 0.0, + "step": 49435 + }, + { + "epoch": 4.612858075954092, + "grad_norm": NaN, + "learning_rate": 4.0207973710168874e-05, + "loss": 0.0, + "step": 49436 + }, + { + "epoch": 4.612951385648969, + "grad_norm": NaN, + "learning_rate": 4.020282017643072e-05, + "loss": 0.0, + "step": 49437 + }, + { + "epoch": 4.613044695343846, + "grad_norm": NaN, + "learning_rate": 4.0197666921874275e-05, + "loss": 0.0, + "step": 49438 + }, + { + "epoch": 4.613138005038723, + "grad_norm": NaN, + "learning_rate": 4.0192513946512644e-05, + "loss": 0.0, + "step": 49439 + }, + { + "epoch": 4.613231314733601, + "grad_norm": NaN, + "learning_rate": 4.0187361250358905e-05, + "loss": 0.0, + "step": 49440 + }, + { + "epoch": 4.613324624428478, + "grad_norm": NaN, + "learning_rate": 4.01822088334262e-05, + "loss": 0.0, + "step": 49441 + }, + { + "epoch": 4.613417934123356, + "grad_norm": NaN, + "learning_rate": 4.017705669572759e-05, + "loss": 0.0, + "step": 49442 + }, + { + "epoch": 4.613511243818233, + "grad_norm": NaN, + "learning_rate": 4.017190483727622e-05, + "loss": 0.0, + "step": 49443 + }, + { + "epoch": 4.61360455351311, + "grad_norm": NaN, + "learning_rate": 4.016675325808513e-05, + "loss": 0.0, + "step": 49444 + }, + { + "epoch": 4.613697863207987, + "grad_norm": NaN, + "learning_rate": 4.016160195816747e-05, + "loss": 0.0, + "step": 49445 + }, + { + "epoch": 4.6137911729028644, + "grad_norm": NaN, + "learning_rate": 4.015645093753631e-05, + "loss": 0.0, + "step": 49446 + }, + { + "epoch": 4.613884482597742, + "grad_norm": NaN, + "learning_rate": 4.0151300196204757e-05, + "loss": 0.0, + "step": 49447 + }, + { + "epoch": 4.613977792292619, + "grad_norm": NaN, + "learning_rate": 4.014614973418591e-05, + "loss": 0.0, + "step": 49448 + }, + { + "epoch": 4.614071101987497, + "grad_norm": NaN, + "learning_rate": 4.0140999551492864e-05, + "loss": 0.0, + "step": 49449 + }, + { + "epoch": 4.614164411682374, + "grad_norm": NaN, + "learning_rate": 4.013584964813872e-05, + "loss": 0.0, + "step": 49450 + }, + { + "epoch": 4.6142577213772515, + "grad_norm": NaN, + "learning_rate": 4.013070002413656e-05, + "loss": 0.0, + "step": 49451 + }, + { + "epoch": 4.614351031072128, + "grad_norm": NaN, + "learning_rate": 4.012555067949949e-05, + "loss": 0.0, + "step": 49452 + }, + { + "epoch": 4.6144443407670055, + "grad_norm": NaN, + "learning_rate": 4.012040161424059e-05, + "loss": 0.0, + "step": 49453 + }, + { + "epoch": 4.614537650461883, + "grad_norm": NaN, + "learning_rate": 4.011525282837296e-05, + "loss": 0.0, + "step": 49454 + }, + { + "epoch": 4.61463096015676, + "grad_norm": NaN, + "learning_rate": 4.01101043219097e-05, + "loss": 0.0, + "step": 49455 + }, + { + "epoch": 4.614724269851638, + "grad_norm": NaN, + "learning_rate": 4.0104956094863885e-05, + "loss": 0.0, + "step": 49456 + }, + { + "epoch": 4.614817579546515, + "grad_norm": NaN, + "learning_rate": 4.009980814724861e-05, + "loss": 0.0, + "step": 49457 + }, + { + "epoch": 4.614910889241392, + "grad_norm": NaN, + "learning_rate": 4.009466047907698e-05, + "loss": 0.0, + "step": 49458 + }, + { + "epoch": 4.615004198936269, + "grad_norm": NaN, + "learning_rate": 4.008951309036207e-05, + "loss": 0.0, + "step": 49459 + }, + { + "epoch": 4.6150975086311465, + "grad_norm": NaN, + "learning_rate": 4.0084365981116965e-05, + "loss": 0.0, + "step": 49460 + }, + { + "epoch": 4.615190818326024, + "grad_norm": NaN, + "learning_rate": 4.007921915135475e-05, + "loss": 0.0, + "step": 49461 + }, + { + "epoch": 4.615284128020901, + "grad_norm": NaN, + "learning_rate": 4.007407260108853e-05, + "loss": 0.0, + "step": 49462 + }, + { + "epoch": 4.615377437715779, + "grad_norm": NaN, + "learning_rate": 4.0068926330331394e-05, + "loss": 0.0, + "step": 49463 + }, + { + "epoch": 4.615470747410656, + "grad_norm": NaN, + "learning_rate": 4.006378033909639e-05, + "loss": 0.0, + "step": 49464 + }, + { + "epoch": 4.615564057105534, + "grad_norm": NaN, + "learning_rate": 4.0058634627396636e-05, + "loss": 0.0, + "step": 49465 + }, + { + "epoch": 4.615657366800411, + "grad_norm": NaN, + "learning_rate": 4.0053489195245216e-05, + "loss": 0.0, + "step": 49466 + }, + { + "epoch": 4.615750676495288, + "grad_norm": NaN, + "learning_rate": 4.0048344042655184e-05, + "loss": 0.0, + "step": 49467 + }, + { + "epoch": 4.615843986190165, + "grad_norm": NaN, + "learning_rate": 4.0043199169639666e-05, + "loss": 0.0, + "step": 49468 + }, + { + "epoch": 4.615937295885042, + "grad_norm": NaN, + "learning_rate": 4.003805457621175e-05, + "loss": 0.0, + "step": 49469 + }, + { + "epoch": 4.61603060557992, + "grad_norm": NaN, + "learning_rate": 4.00329102623844e-05, + "loss": 0.0, + "step": 49470 + }, + { + "epoch": 4.616123915274797, + "grad_norm": NaN, + "learning_rate": 4.0027766228170845e-05, + "loss": 0.0, + "step": 49471 + }, + { + "epoch": 4.616217224969675, + "grad_norm": NaN, + "learning_rate": 4.002262247358413e-05, + "loss": 0.0, + "step": 49472 + }, + { + "epoch": 4.616310534664551, + "grad_norm": NaN, + "learning_rate": 4.001747899863722e-05, + "loss": 0.0, + "step": 49473 + }, + { + "epoch": 4.616403844359429, + "grad_norm": NaN, + "learning_rate": 4.001233580334333e-05, + "loss": 0.0, + "step": 49474 + }, + { + "epoch": 4.616497154054306, + "grad_norm": NaN, + "learning_rate": 4.00071928877155e-05, + "loss": 0.0, + "step": 49475 + }, + { + "epoch": 4.6165904637491835, + "grad_norm": NaN, + "learning_rate": 4.000205025176678e-05, + "loss": 0.0, + "step": 49476 + }, + { + "epoch": 4.616683773444061, + "grad_norm": NaN, + "learning_rate": 3.9996907895510285e-05, + "loss": 0.0, + "step": 49477 + }, + { + "epoch": 4.616777083138938, + "grad_norm": NaN, + "learning_rate": 3.999176581895904e-05, + "loss": 0.0, + "step": 49478 + }, + { + "epoch": 4.616870392833816, + "grad_norm": NaN, + "learning_rate": 3.998662402212617e-05, + "loss": 0.0, + "step": 49479 + }, + { + "epoch": 4.616963702528693, + "grad_norm": NaN, + "learning_rate": 3.9981482505024713e-05, + "loss": 0.0, + "step": 49480 + }, + { + "epoch": 4.61705701222357, + "grad_norm": NaN, + "learning_rate": 3.9976341267667755e-05, + "loss": 0.0, + "step": 49481 + }, + { + "epoch": 4.617150321918447, + "grad_norm": NaN, + "learning_rate": 3.997120031006838e-05, + "loss": 0.0, + "step": 49482 + }, + { + "epoch": 4.6172436316133245, + "grad_norm": NaN, + "learning_rate": 3.996605963223965e-05, + "loss": 0.0, + "step": 49483 + }, + { + "epoch": 4.617336941308202, + "grad_norm": NaN, + "learning_rate": 3.996091923419462e-05, + "loss": 0.0, + "step": 49484 + }, + { + "epoch": 4.617430251003079, + "grad_norm": NaN, + "learning_rate": 3.99557791159464e-05, + "loss": 0.0, + "step": 49485 + }, + { + "epoch": 4.617523560697957, + "grad_norm": NaN, + "learning_rate": 3.995063927750802e-05, + "loss": 0.0, + "step": 49486 + }, + { + "epoch": 4.617616870392834, + "grad_norm": NaN, + "learning_rate": 3.994549971889258e-05, + "loss": 0.0, + "step": 49487 + }, + { + "epoch": 4.617710180087711, + "grad_norm": NaN, + "learning_rate": 3.994036044011312e-05, + "loss": 0.0, + "step": 49488 + }, + { + "epoch": 4.617803489782588, + "grad_norm": NaN, + "learning_rate": 3.9935221441182715e-05, + "loss": 0.0, + "step": 49489 + }, + { + "epoch": 4.617896799477466, + "grad_norm": NaN, + "learning_rate": 3.9930082722114456e-05, + "loss": 0.0, + "step": 49490 + }, + { + "epoch": 4.617990109172343, + "grad_norm": NaN, + "learning_rate": 3.992494428292138e-05, + "loss": 0.0, + "step": 49491 + }, + { + "epoch": 4.61808341886722, + "grad_norm": NaN, + "learning_rate": 3.9919806123616564e-05, + "loss": 0.0, + "step": 49492 + }, + { + "epoch": 4.618176728562098, + "grad_norm": NaN, + "learning_rate": 3.9914668244213075e-05, + "loss": 0.0, + "step": 49493 + }, + { + "epoch": 4.618270038256975, + "grad_norm": NaN, + "learning_rate": 3.990953064472397e-05, + "loss": 0.0, + "step": 49494 + }, + { + "epoch": 4.618363347951853, + "grad_norm": NaN, + "learning_rate": 3.990439332516233e-05, + "loss": 0.0, + "step": 49495 + }, + { + "epoch": 4.618456657646729, + "grad_norm": NaN, + "learning_rate": 3.9899256285541195e-05, + "loss": 0.0, + "step": 49496 + }, + { + "epoch": 4.618549967341607, + "grad_norm": NaN, + "learning_rate": 3.989411952587364e-05, + "loss": 0.0, + "step": 49497 + }, + { + "epoch": 4.618643277036484, + "grad_norm": NaN, + "learning_rate": 3.988898304617272e-05, + "loss": 0.0, + "step": 49498 + }, + { + "epoch": 4.6187365867313614, + "grad_norm": NaN, + "learning_rate": 3.98838468464515e-05, + "loss": 0.0, + "step": 49499 + }, + { + "epoch": 4.618829896426239, + "grad_norm": NaN, + "learning_rate": 3.987871092672304e-05, + "loss": 0.0, + "step": 49500 + }, + { + "epoch": 4.618923206121116, + "grad_norm": NaN, + "learning_rate": 3.9873575287000406e-05, + "loss": 0.0, + "step": 49501 + }, + { + "epoch": 4.619016515815993, + "grad_norm": NaN, + "learning_rate": 3.986843992729665e-05, + "loss": 0.0, + "step": 49502 + }, + { + "epoch": 4.61910982551087, + "grad_norm": NaN, + "learning_rate": 3.986330484762482e-05, + "loss": 0.0, + "step": 49503 + }, + { + "epoch": 4.619203135205748, + "grad_norm": NaN, + "learning_rate": 3.985817004799797e-05, + "loss": 0.0, + "step": 49504 + }, + { + "epoch": 4.619296444900625, + "grad_norm": NaN, + "learning_rate": 3.985303552842917e-05, + "loss": 0.0, + "step": 49505 + }, + { + "epoch": 4.6193897545955025, + "grad_norm": NaN, + "learning_rate": 3.984790128893149e-05, + "loss": 0.0, + "step": 49506 + }, + { + "epoch": 4.61948306429038, + "grad_norm": NaN, + "learning_rate": 3.984276732951796e-05, + "loss": 0.0, + "step": 49507 + }, + { + "epoch": 4.619576373985257, + "grad_norm": NaN, + "learning_rate": 3.983763365020163e-05, + "loss": 0.0, + "step": 49508 + }, + { + "epoch": 4.619669683680135, + "grad_norm": NaN, + "learning_rate": 3.983250025099558e-05, + "loss": 0.0, + "step": 49509 + }, + { + "epoch": 4.619762993375011, + "grad_norm": NaN, + "learning_rate": 3.982736713191284e-05, + "loss": 0.0, + "step": 49510 + }, + { + "epoch": 4.619856303069889, + "grad_norm": NaN, + "learning_rate": 3.982223429296646e-05, + "loss": 0.0, + "step": 49511 + }, + { + "epoch": 4.619949612764766, + "grad_norm": NaN, + "learning_rate": 3.9817101734169525e-05, + "loss": 0.0, + "step": 49512 + }, + { + "epoch": 4.6200429224596435, + "grad_norm": NaN, + "learning_rate": 3.981196945553505e-05, + "loss": 0.0, + "step": 49513 + }, + { + "epoch": 4.620136232154521, + "grad_norm": NaN, + "learning_rate": 3.980683745707608e-05, + "loss": 0.0, + "step": 49514 + }, + { + "epoch": 4.620229541849398, + "grad_norm": NaN, + "learning_rate": 3.9801705738805714e-05, + "loss": 0.0, + "step": 49515 + }, + { + "epoch": 4.620322851544276, + "grad_norm": NaN, + "learning_rate": 3.979657430073695e-05, + "loss": 0.0, + "step": 49516 + }, + { + "epoch": 4.620416161239152, + "grad_norm": NaN, + "learning_rate": 3.9791443142882855e-05, + "loss": 0.0, + "step": 49517 + }, + { + "epoch": 4.62050947093403, + "grad_norm": NaN, + "learning_rate": 3.9786312265256474e-05, + "loss": 0.0, + "step": 49518 + }, + { + "epoch": 4.620602780628907, + "grad_norm": NaN, + "learning_rate": 3.978118166787085e-05, + "loss": 0.0, + "step": 49519 + }, + { + "epoch": 4.620696090323785, + "grad_norm": NaN, + "learning_rate": 3.977605135073903e-05, + "loss": 0.0, + "step": 49520 + }, + { + "epoch": 4.620789400018662, + "grad_norm": NaN, + "learning_rate": 3.9770921313874055e-05, + "loss": 0.0, + "step": 49521 + }, + { + "epoch": 4.620882709713539, + "grad_norm": NaN, + "learning_rate": 3.9765791557288977e-05, + "loss": 0.0, + "step": 49522 + }, + { + "epoch": 4.620976019408417, + "grad_norm": NaN, + "learning_rate": 3.976066208099686e-05, + "loss": 0.0, + "step": 49523 + }, + { + "epoch": 4.621069329103294, + "grad_norm": NaN, + "learning_rate": 3.9755532885010696e-05, + "loss": 0.0, + "step": 49524 + }, + { + "epoch": 4.621162638798171, + "grad_norm": NaN, + "learning_rate": 3.9750403969343576e-05, + "loss": 0.0, + "step": 49525 + }, + { + "epoch": 4.621255948493048, + "grad_norm": NaN, + "learning_rate": 3.974527533400851e-05, + "loss": 0.0, + "step": 49526 + }, + { + "epoch": 4.621349258187926, + "grad_norm": NaN, + "learning_rate": 3.974014697901854e-05, + "loss": 0.0, + "step": 49527 + }, + { + "epoch": 4.621442567882803, + "grad_norm": NaN, + "learning_rate": 3.973501890438672e-05, + "loss": 0.0, + "step": 49528 + }, + { + "epoch": 4.6215358775776805, + "grad_norm": NaN, + "learning_rate": 3.97298911101261e-05, + "loss": 0.0, + "step": 49529 + }, + { + "epoch": 4.621629187272558, + "grad_norm": NaN, + "learning_rate": 3.9724763596249695e-05, + "loss": 0.0, + "step": 49530 + }, + { + "epoch": 4.621722496967434, + "grad_norm": NaN, + "learning_rate": 3.971963636277054e-05, + "loss": 0.0, + "step": 49531 + }, + { + "epoch": 4.621815806662312, + "grad_norm": NaN, + "learning_rate": 3.971450940970168e-05, + "loss": 0.0, + "step": 49532 + }, + { + "epoch": 4.621909116357189, + "grad_norm": NaN, + "learning_rate": 3.9709382737056175e-05, + "loss": 0.0, + "step": 49533 + }, + { + "epoch": 4.622002426052067, + "grad_norm": NaN, + "learning_rate": 3.9704256344847027e-05, + "loss": 0.0, + "step": 49534 + }, + { + "epoch": 4.622095735746944, + "grad_norm": NaN, + "learning_rate": 3.969913023308728e-05, + "loss": 0.0, + "step": 49535 + }, + { + "epoch": 4.6221890454418215, + "grad_norm": NaN, + "learning_rate": 3.969400440178998e-05, + "loss": 0.0, + "step": 49536 + }, + { + "epoch": 4.622282355136699, + "grad_norm": NaN, + "learning_rate": 3.968887885096814e-05, + "loss": 0.0, + "step": 49537 + }, + { + "epoch": 4.622375664831576, + "grad_norm": NaN, + "learning_rate": 3.96837535806348e-05, + "loss": 0.0, + "step": 49538 + }, + { + "epoch": 4.622468974526454, + "grad_norm": NaN, + "learning_rate": 3.967862859080302e-05, + "loss": 0.0, + "step": 49539 + }, + { + "epoch": 4.62256228422133, + "grad_norm": NaN, + "learning_rate": 3.9673503881485796e-05, + "loss": 0.0, + "step": 49540 + }, + { + "epoch": 4.622655593916208, + "grad_norm": NaN, + "learning_rate": 3.966837945269616e-05, + "loss": 0.0, + "step": 49541 + }, + { + "epoch": 4.622748903611085, + "grad_norm": NaN, + "learning_rate": 3.966325530444716e-05, + "loss": 0.0, + "step": 49542 + }, + { + "epoch": 4.622842213305963, + "grad_norm": NaN, + "learning_rate": 3.965813143675183e-05, + "loss": 0.0, + "step": 49543 + }, + { + "epoch": 4.62293552300084, + "grad_norm": NaN, + "learning_rate": 3.965300784962316e-05, + "loss": 0.0, + "step": 49544 + }, + { + "epoch": 4.623028832695717, + "grad_norm": NaN, + "learning_rate": 3.9647884543074225e-05, + "loss": 0.0, + "step": 49545 + }, + { + "epoch": 4.623122142390594, + "grad_norm": NaN, + "learning_rate": 3.964276151711804e-05, + "loss": 0.0, + "step": 49546 + }, + { + "epoch": 4.623215452085471, + "grad_norm": NaN, + "learning_rate": 3.9637638771767604e-05, + "loss": 0.0, + "step": 49547 + }, + { + "epoch": 4.623308761780349, + "grad_norm": NaN, + "learning_rate": 3.963251630703595e-05, + "loss": 0.0, + "step": 49548 + }, + { + "epoch": 4.623402071475226, + "grad_norm": NaN, + "learning_rate": 3.962739412293613e-05, + "loss": 0.0, + "step": 49549 + }, + { + "epoch": 4.623495381170104, + "grad_norm": NaN, + "learning_rate": 3.962227221948115e-05, + "loss": 0.0, + "step": 49550 + }, + { + "epoch": 4.623588690864981, + "grad_norm": NaN, + "learning_rate": 3.961715059668403e-05, + "loss": 0.0, + "step": 49551 + }, + { + "epoch": 4.6236820005598585, + "grad_norm": NaN, + "learning_rate": 3.961202925455782e-05, + "loss": 0.0, + "step": 49552 + }, + { + "epoch": 4.623775310254736, + "grad_norm": NaN, + "learning_rate": 3.96069081931155e-05, + "loss": 0.0, + "step": 49553 + }, + { + "epoch": 4.623868619949612, + "grad_norm": NaN, + "learning_rate": 3.960178741237011e-05, + "loss": 0.0, + "step": 49554 + }, + { + "epoch": 4.62396192964449, + "grad_norm": NaN, + "learning_rate": 3.959666691233468e-05, + "loss": 0.0, + "step": 49555 + }, + { + "epoch": 4.624055239339367, + "grad_norm": NaN, + "learning_rate": 3.9591546693022226e-05, + "loss": 0.0, + "step": 49556 + }, + { + "epoch": 4.624148549034245, + "grad_norm": NaN, + "learning_rate": 3.9586426754445757e-05, + "loss": 0.0, + "step": 49557 + }, + { + "epoch": 4.624241858729122, + "grad_norm": NaN, + "learning_rate": 3.958130709661829e-05, + "loss": 0.0, + "step": 49558 + }, + { + "epoch": 4.6243351684239995, + "grad_norm": NaN, + "learning_rate": 3.957618771955286e-05, + "loss": 0.0, + "step": 49559 + }, + { + "epoch": 4.624428478118877, + "grad_norm": NaN, + "learning_rate": 3.957106862326248e-05, + "loss": 0.0, + "step": 49560 + }, + { + "epoch": 4.6245217878137534, + "grad_norm": NaN, + "learning_rate": 3.9565949807760143e-05, + "loss": 0.0, + "step": 49561 + }, + { + "epoch": 4.624615097508631, + "grad_norm": NaN, + "learning_rate": 3.9560831273058906e-05, + "loss": 0.0, + "step": 49562 + }, + { + "epoch": 4.624708407203508, + "grad_norm": NaN, + "learning_rate": 3.9555713019171744e-05, + "loss": 0.0, + "step": 49563 + }, + { + "epoch": 4.624801716898386, + "grad_norm": NaN, + "learning_rate": 3.955059504611169e-05, + "loss": 0.0, + "step": 49564 + }, + { + "epoch": 4.624895026593263, + "grad_norm": NaN, + "learning_rate": 3.9545477353891754e-05, + "loss": 0.0, + "step": 49565 + }, + { + "epoch": 4.6249883362881405, + "grad_norm": NaN, + "learning_rate": 3.9540359942524954e-05, + "loss": 0.0, + "step": 49566 + }, + { + "epoch": 4.625081645983018, + "grad_norm": NaN, + "learning_rate": 3.953524281202431e-05, + "loss": 0.0, + "step": 49567 + }, + { + "epoch": 4.625174955677895, + "grad_norm": NaN, + "learning_rate": 3.95301259624028e-05, + "loss": 0.0, + "step": 49568 + }, + { + "epoch": 4.625268265372772, + "grad_norm": NaN, + "learning_rate": 3.952500939367346e-05, + "loss": 0.0, + "step": 49569 + }, + { + "epoch": 4.625361575067649, + "grad_norm": NaN, + "learning_rate": 3.951989310584931e-05, + "loss": 0.0, + "step": 49570 + }, + { + "epoch": 4.625454884762527, + "grad_norm": NaN, + "learning_rate": 3.951477709894333e-05, + "loss": 0.0, + "step": 49571 + }, + { + "epoch": 4.625548194457404, + "grad_norm": NaN, + "learning_rate": 3.950966137296856e-05, + "loss": 0.0, + "step": 49572 + }, + { + "epoch": 4.625641504152282, + "grad_norm": NaN, + "learning_rate": 3.9504545927937974e-05, + "loss": 0.0, + "step": 49573 + }, + { + "epoch": 4.625734813847159, + "grad_norm": NaN, + "learning_rate": 3.949943076386462e-05, + "loss": 0.0, + "step": 49574 + }, + { + "epoch": 4.6258281235420355, + "grad_norm": NaN, + "learning_rate": 3.9494315880761454e-05, + "loss": 0.0, + "step": 49575 + }, + { + "epoch": 4.625921433236913, + "grad_norm": NaN, + "learning_rate": 3.948920127864153e-05, + "loss": 0.0, + "step": 49576 + }, + { + "epoch": 4.62601474293179, + "grad_norm": NaN, + "learning_rate": 3.948408695751781e-05, + "loss": 0.0, + "step": 49577 + }, + { + "epoch": 4.626108052626668, + "grad_norm": NaN, + "learning_rate": 3.947897291740332e-05, + "loss": 0.0, + "step": 49578 + }, + { + "epoch": 4.626201362321545, + "grad_norm": NaN, + "learning_rate": 3.947385915831108e-05, + "loss": 0.0, + "step": 49579 + }, + { + "epoch": 4.626294672016423, + "grad_norm": NaN, + "learning_rate": 3.946874568025408e-05, + "loss": 0.0, + "step": 49580 + }, + { + "epoch": 4.6263879817113, + "grad_norm": NaN, + "learning_rate": 3.9463632483245294e-05, + "loss": 0.0, + "step": 49581 + }, + { + "epoch": 4.6264812914061775, + "grad_norm": NaN, + "learning_rate": 3.945851956729777e-05, + "loss": 0.0, + "step": 49582 + }, + { + "epoch": 4.626574601101055, + "grad_norm": NaN, + "learning_rate": 3.9453406932424474e-05, + "loss": 0.0, + "step": 49583 + }, + { + "epoch": 4.626667910795931, + "grad_norm": NaN, + "learning_rate": 3.944829457863842e-05, + "loss": 0.0, + "step": 49584 + }, + { + "epoch": 4.626761220490809, + "grad_norm": NaN, + "learning_rate": 3.9443182505952606e-05, + "loss": 0.0, + "step": 49585 + }, + { + "epoch": 4.626854530185686, + "grad_norm": NaN, + "learning_rate": 3.9438070714380035e-05, + "loss": 0.0, + "step": 49586 + }, + { + "epoch": 4.626947839880564, + "grad_norm": NaN, + "learning_rate": 3.943295920393369e-05, + "loss": 0.0, + "step": 49587 + }, + { + "epoch": 4.627041149575441, + "grad_norm": NaN, + "learning_rate": 3.942784797462658e-05, + "loss": 0.0, + "step": 49588 + }, + { + "epoch": 4.6271344592703185, + "grad_norm": NaN, + "learning_rate": 3.94227370264717e-05, + "loss": 0.0, + "step": 49589 + }, + { + "epoch": 4.627227768965195, + "grad_norm": NaN, + "learning_rate": 3.941762635948205e-05, + "loss": 0.0, + "step": 49590 + }, + { + "epoch": 4.6273210786600725, + "grad_norm": NaN, + "learning_rate": 3.9412515973670604e-05, + "loss": 0.0, + "step": 49591 + }, + { + "epoch": 4.62741438835495, + "grad_norm": NaN, + "learning_rate": 3.940740586905038e-05, + "loss": 0.0, + "step": 49592 + }, + { + "epoch": 4.627507698049827, + "grad_norm": NaN, + "learning_rate": 3.940229604563438e-05, + "loss": 0.0, + "step": 49593 + }, + { + "epoch": 4.627601007744705, + "grad_norm": NaN, + "learning_rate": 3.939718650343555e-05, + "loss": 0.0, + "step": 49594 + }, + { + "epoch": 4.627694317439582, + "grad_norm": NaN, + "learning_rate": 3.9392077242466924e-05, + "loss": 0.0, + "step": 49595 + }, + { + "epoch": 4.62778762713446, + "grad_norm": NaN, + "learning_rate": 3.938696826274148e-05, + "loss": 0.0, + "step": 49596 + }, + { + "epoch": 4.627880936829337, + "grad_norm": NaN, + "learning_rate": 3.9381859564272225e-05, + "loss": 0.0, + "step": 49597 + }, + { + "epoch": 4.6279742465242135, + "grad_norm": NaN, + "learning_rate": 3.9376751147072124e-05, + "loss": 0.0, + "step": 49598 + }, + { + "epoch": 4.628067556219091, + "grad_norm": NaN, + "learning_rate": 3.937164301115417e-05, + "loss": 0.0, + "step": 49599 + }, + { + "epoch": 4.628160865913968, + "grad_norm": NaN, + "learning_rate": 3.9366535156531356e-05, + "loss": 0.0, + "step": 49600 + }, + { + "epoch": 4.628254175608846, + "grad_norm": NaN, + "learning_rate": 3.936142758321667e-05, + "loss": 0.0, + "step": 49601 + }, + { + "epoch": 4.628347485303723, + "grad_norm": NaN, + "learning_rate": 3.935632029122311e-05, + "loss": 0.0, + "step": 49602 + }, + { + "epoch": 4.628440794998601, + "grad_norm": NaN, + "learning_rate": 3.9351213280563644e-05, + "loss": 0.0, + "step": 49603 + }, + { + "epoch": 4.628534104693478, + "grad_norm": NaN, + "learning_rate": 3.9346106551251274e-05, + "loss": 0.0, + "step": 49604 + }, + { + "epoch": 4.628627414388355, + "grad_norm": NaN, + "learning_rate": 3.934100010329897e-05, + "loss": 0.0, + "step": 49605 + }, + { + "epoch": 4.628720724083232, + "grad_norm": NaN, + "learning_rate": 3.933589393671973e-05, + "loss": 0.0, + "step": 49606 + }, + { + "epoch": 4.628814033778109, + "grad_norm": NaN, + "learning_rate": 3.933078805152651e-05, + "loss": 0.0, + "step": 49607 + }, + { + "epoch": 4.628907343472987, + "grad_norm": NaN, + "learning_rate": 3.932568244773234e-05, + "loss": 0.0, + "step": 49608 + }, + { + "epoch": 4.629000653167864, + "grad_norm": NaN, + "learning_rate": 3.932057712535016e-05, + "loss": 0.0, + "step": 49609 + }, + { + "epoch": 4.629093962862742, + "grad_norm": NaN, + "learning_rate": 3.931547208439297e-05, + "loss": 0.0, + "step": 49610 + }, + { + "epoch": 4.629187272557619, + "grad_norm": NaN, + "learning_rate": 3.9310367324873746e-05, + "loss": 0.0, + "step": 49611 + }, + { + "epoch": 4.6292805822524965, + "grad_norm": NaN, + "learning_rate": 3.930526284680548e-05, + "loss": 0.0, + "step": 49612 + }, + { + "epoch": 4.629373891947373, + "grad_norm": NaN, + "learning_rate": 3.9300158650201124e-05, + "loss": 0.0, + "step": 49613 + }, + { + "epoch": 4.6294672016422505, + "grad_norm": NaN, + "learning_rate": 3.929505473507369e-05, + "loss": 0.0, + "step": 49614 + }, + { + "epoch": 4.629560511337128, + "grad_norm": NaN, + "learning_rate": 3.928995110143614e-05, + "loss": 0.0, + "step": 49615 + }, + { + "epoch": 4.629653821032005, + "grad_norm": NaN, + "learning_rate": 3.9284847749301434e-05, + "loss": 0.0, + "step": 49616 + }, + { + "epoch": 4.629747130726883, + "grad_norm": NaN, + "learning_rate": 3.9279744678682564e-05, + "loss": 0.0, + "step": 49617 + }, + { + "epoch": 4.62984044042176, + "grad_norm": NaN, + "learning_rate": 3.927464188959251e-05, + "loss": 0.0, + "step": 49618 + }, + { + "epoch": 4.629933750116637, + "grad_norm": NaN, + "learning_rate": 3.9269539382044244e-05, + "loss": 0.0, + "step": 49619 + }, + { + "epoch": 4.630027059811514, + "grad_norm": NaN, + "learning_rate": 3.926443715605074e-05, + "loss": 0.0, + "step": 49620 + }, + { + "epoch": 4.6301203695063915, + "grad_norm": NaN, + "learning_rate": 3.925933521162497e-05, + "loss": 0.0, + "step": 49621 + }, + { + "epoch": 4.630213679201269, + "grad_norm": NaN, + "learning_rate": 3.92542335487799e-05, + "loss": 0.0, + "step": 49622 + }, + { + "epoch": 4.630306988896146, + "grad_norm": NaN, + "learning_rate": 3.924913216752852e-05, + "loss": 0.0, + "step": 49623 + }, + { + "epoch": 4.630400298591024, + "grad_norm": NaN, + "learning_rate": 3.924403106788378e-05, + "loss": 0.0, + "step": 49624 + }, + { + "epoch": 4.630493608285901, + "grad_norm": NaN, + "learning_rate": 3.923893024985865e-05, + "loss": 0.0, + "step": 49625 + }, + { + "epoch": 4.630586917980779, + "grad_norm": NaN, + "learning_rate": 3.923382971346614e-05, + "loss": 0.0, + "step": 49626 + }, + { + "epoch": 4.630680227675655, + "grad_norm": NaN, + "learning_rate": 3.922872945871917e-05, + "loss": 0.0, + "step": 49627 + }, + { + "epoch": 4.6307735373705325, + "grad_norm": NaN, + "learning_rate": 3.922362948563073e-05, + "loss": 0.0, + "step": 49628 + }, + { + "epoch": 4.63086684706541, + "grad_norm": NaN, + "learning_rate": 3.921852979421379e-05, + "loss": 0.0, + "step": 49629 + }, + { + "epoch": 4.630960156760287, + "grad_norm": NaN, + "learning_rate": 3.9213430384481314e-05, + "loss": 0.0, + "step": 49630 + }, + { + "epoch": 4.631053466455165, + "grad_norm": NaN, + "learning_rate": 3.9208331256446265e-05, + "loss": 0.0, + "step": 49631 + }, + { + "epoch": 4.631146776150042, + "grad_norm": NaN, + "learning_rate": 3.920323241012161e-05, + "loss": 0.0, + "step": 49632 + }, + { + "epoch": 4.63124008584492, + "grad_norm": NaN, + "learning_rate": 3.919813384552031e-05, + "loss": 0.0, + "step": 49633 + }, + { + "epoch": 4.631333395539796, + "grad_norm": NaN, + "learning_rate": 3.9193035562655335e-05, + "loss": 0.0, + "step": 49634 + }, + { + "epoch": 4.631426705234674, + "grad_norm": NaN, + "learning_rate": 3.9187937561539645e-05, + "loss": 0.0, + "step": 49635 + }, + { + "epoch": 4.631520014929551, + "grad_norm": NaN, + "learning_rate": 3.918283984218622e-05, + "loss": 0.0, + "step": 49636 + }, + { + "epoch": 4.631613324624428, + "grad_norm": NaN, + "learning_rate": 3.9177742404608e-05, + "loss": 0.0, + "step": 49637 + }, + { + "epoch": 4.631706634319306, + "grad_norm": NaN, + "learning_rate": 3.917264524881795e-05, + "loss": 0.0, + "step": 49638 + }, + { + "epoch": 4.631799944014183, + "grad_norm": NaN, + "learning_rate": 3.9167548374829025e-05, + "loss": 0.0, + "step": 49639 + }, + { + "epoch": 4.631893253709061, + "grad_norm": NaN, + "learning_rate": 3.91624517826542e-05, + "loss": 0.0, + "step": 49640 + }, + { + "epoch": 4.631986563403938, + "grad_norm": NaN, + "learning_rate": 3.9157355472306434e-05, + "loss": 0.0, + "step": 49641 + }, + { + "epoch": 4.632079873098815, + "grad_norm": NaN, + "learning_rate": 3.915225944379867e-05, + "loss": 0.0, + "step": 49642 + }, + { + "epoch": 4.632173182793692, + "grad_norm": NaN, + "learning_rate": 3.914716369714388e-05, + "loss": 0.0, + "step": 49643 + }, + { + "epoch": 4.6322664924885695, + "grad_norm": NaN, + "learning_rate": 3.914206823235503e-05, + "loss": 0.0, + "step": 49644 + }, + { + "epoch": 4.632359802183447, + "grad_norm": NaN, + "learning_rate": 3.913697304944504e-05, + "loss": 0.0, + "step": 49645 + }, + { + "epoch": 4.632453111878324, + "grad_norm": NaN, + "learning_rate": 3.91318781484269e-05, + "loss": 0.0, + "step": 49646 + }, + { + "epoch": 4.632546421573202, + "grad_norm": NaN, + "learning_rate": 3.912678352931354e-05, + "loss": 0.0, + "step": 49647 + }, + { + "epoch": 4.632639731268078, + "grad_norm": NaN, + "learning_rate": 3.912168919211793e-05, + "loss": 0.0, + "step": 49648 + }, + { + "epoch": 4.632733040962956, + "grad_norm": NaN, + "learning_rate": 3.911659513685302e-05, + "loss": 0.0, + "step": 49649 + }, + { + "epoch": 4.632826350657833, + "grad_norm": NaN, + "learning_rate": 3.9111501363531765e-05, + "loss": 0.0, + "step": 49650 + }, + { + "epoch": 4.6329196603527105, + "grad_norm": NaN, + "learning_rate": 3.910640787216712e-05, + "loss": 0.0, + "step": 49651 + }, + { + "epoch": 4.633012970047588, + "grad_norm": NaN, + "learning_rate": 3.910131466277203e-05, + "loss": 0.0, + "step": 49652 + }, + { + "epoch": 4.633106279742465, + "grad_norm": NaN, + "learning_rate": 3.909622173535944e-05, + "loss": 0.0, + "step": 49653 + }, + { + "epoch": 4.633199589437343, + "grad_norm": NaN, + "learning_rate": 3.909112908994231e-05, + "loss": 0.0, + "step": 49654 + }, + { + "epoch": 4.63329289913222, + "grad_norm": NaN, + "learning_rate": 3.9086036726533576e-05, + "loss": 0.0, + "step": 49655 + }, + { + "epoch": 4.633386208827098, + "grad_norm": NaN, + "learning_rate": 3.908094464514621e-05, + "loss": 0.0, + "step": 49656 + }, + { + "epoch": 4.633479518521974, + "grad_norm": NaN, + "learning_rate": 3.9075852845793127e-05, + "loss": 0.0, + "step": 49657 + }, + { + "epoch": 4.633572828216852, + "grad_norm": NaN, + "learning_rate": 3.907076132848731e-05, + "loss": 0.0, + "step": 49658 + }, + { + "epoch": 4.633666137911729, + "grad_norm": NaN, + "learning_rate": 3.9065670093241684e-05, + "loss": 0.0, + "step": 49659 + }, + { + "epoch": 4.633759447606606, + "grad_norm": NaN, + "learning_rate": 3.906057914006919e-05, + "loss": 0.0, + "step": 49660 + }, + { + "epoch": 4.633852757301484, + "grad_norm": NaN, + "learning_rate": 3.905548846898279e-05, + "loss": 0.0, + "step": 49661 + }, + { + "epoch": 4.633946066996361, + "grad_norm": NaN, + "learning_rate": 3.905039807999541e-05, + "loss": 0.0, + "step": 49662 + }, + { + "epoch": 4.634039376691238, + "grad_norm": NaN, + "learning_rate": 3.904530797312e-05, + "loss": 0.0, + "step": 49663 + }, + { + "epoch": 4.634132686386115, + "grad_norm": NaN, + "learning_rate": 3.904021814836951e-05, + "loss": 0.0, + "step": 49664 + }, + { + "epoch": 4.634225996080993, + "grad_norm": NaN, + "learning_rate": 3.903512860575687e-05, + "loss": 0.0, + "step": 49665 + }, + { + "epoch": 4.63431930577587, + "grad_norm": NaN, + "learning_rate": 3.903003934529503e-05, + "loss": 0.0, + "step": 49666 + }, + { + "epoch": 4.6344126154707475, + "grad_norm": NaN, + "learning_rate": 3.9024950366996936e-05, + "loss": 0.0, + "step": 49667 + }, + { + "epoch": 4.634505925165625, + "grad_norm": NaN, + "learning_rate": 3.901986167087552e-05, + "loss": 0.0, + "step": 49668 + }, + { + "epoch": 4.634599234860502, + "grad_norm": NaN, + "learning_rate": 3.9014773256943715e-05, + "loss": 0.0, + "step": 49669 + }, + { + "epoch": 4.63469254455538, + "grad_norm": NaN, + "learning_rate": 3.900968512521447e-05, + "loss": 0.0, + "step": 49670 + }, + { + "epoch": 4.634785854250256, + "grad_norm": NaN, + "learning_rate": 3.900459727570072e-05, + "loss": 0.0, + "step": 49671 + }, + { + "epoch": 4.634879163945134, + "grad_norm": NaN, + "learning_rate": 3.8999509708415385e-05, + "loss": 0.0, + "step": 49672 + }, + { + "epoch": 4.634972473640011, + "grad_norm": NaN, + "learning_rate": 3.899442242337142e-05, + "loss": 0.0, + "step": 49673 + }, + { + "epoch": 4.6350657833348885, + "grad_norm": NaN, + "learning_rate": 3.898933542058177e-05, + "loss": 0.0, + "step": 49674 + }, + { + "epoch": 4.635159093029766, + "grad_norm": NaN, + "learning_rate": 3.898424870005936e-05, + "loss": 0.0, + "step": 49675 + }, + { + "epoch": 4.635252402724643, + "grad_norm": NaN, + "learning_rate": 3.897916226181711e-05, + "loss": 0.0, + "step": 49676 + }, + { + "epoch": 4.635345712419521, + "grad_norm": NaN, + "learning_rate": 3.897407610586795e-05, + "loss": 0.0, + "step": 49677 + }, + { + "epoch": 4.635439022114397, + "grad_norm": NaN, + "learning_rate": 3.896899023222484e-05, + "loss": 0.0, + "step": 49678 + }, + { + "epoch": 4.635532331809275, + "grad_norm": NaN, + "learning_rate": 3.8963904640900696e-05, + "loss": 0.0, + "step": 49679 + }, + { + "epoch": 4.635625641504152, + "grad_norm": NaN, + "learning_rate": 3.895881933190846e-05, + "loss": 0.0, + "step": 49680 + }, + { + "epoch": 4.6357189511990295, + "grad_norm": NaN, + "learning_rate": 3.895373430526104e-05, + "loss": 0.0, + "step": 49681 + }, + { + "epoch": 4.635812260893907, + "grad_norm": NaN, + "learning_rate": 3.8948649560971384e-05, + "loss": 0.0, + "step": 49682 + }, + { + "epoch": 4.635905570588784, + "grad_norm": NaN, + "learning_rate": 3.894356509905243e-05, + "loss": 0.0, + "step": 49683 + }, + { + "epoch": 4.635998880283662, + "grad_norm": NaN, + "learning_rate": 3.8938480919517065e-05, + "loss": 0.0, + "step": 49684 + }, + { + "epoch": 4.636092189978539, + "grad_norm": NaN, + "learning_rate": 3.8933397022378266e-05, + "loss": 0.0, + "step": 49685 + }, + { + "epoch": 4.636185499673416, + "grad_norm": NaN, + "learning_rate": 3.8928313407648924e-05, + "loss": 0.0, + "step": 49686 + }, + { + "epoch": 4.636278809368293, + "grad_norm": NaN, + "learning_rate": 3.892323007534199e-05, + "loss": 0.0, + "step": 49687 + }, + { + "epoch": 4.636372119063171, + "grad_norm": NaN, + "learning_rate": 3.8918147025470373e-05, + "loss": 0.0, + "step": 49688 + }, + { + "epoch": 4.636465428758048, + "grad_norm": NaN, + "learning_rate": 3.891306425804701e-05, + "loss": 0.0, + "step": 49689 + }, + { + "epoch": 4.636558738452925, + "grad_norm": NaN, + "learning_rate": 3.890798177308482e-05, + "loss": 0.0, + "step": 49690 + }, + { + "epoch": 4.636652048147803, + "grad_norm": NaN, + "learning_rate": 3.890289957059672e-05, + "loss": 0.0, + "step": 49691 + }, + { + "epoch": 4.636745357842679, + "grad_norm": NaN, + "learning_rate": 3.889781765059562e-05, + "loss": 0.0, + "step": 49692 + }, + { + "epoch": 4.636838667537557, + "grad_norm": NaN, + "learning_rate": 3.889273601309448e-05, + "loss": 0.0, + "step": 49693 + }, + { + "epoch": 4.636931977232434, + "grad_norm": NaN, + "learning_rate": 3.88876546581062e-05, + "loss": 0.0, + "step": 49694 + }, + { + "epoch": 4.637025286927312, + "grad_norm": NaN, + "learning_rate": 3.888257358564369e-05, + "loss": 0.0, + "step": 49695 + }, + { + "epoch": 4.637118596622189, + "grad_norm": NaN, + "learning_rate": 3.8877492795719874e-05, + "loss": 0.0, + "step": 49696 + }, + { + "epoch": 4.6372119063170665, + "grad_norm": NaN, + "learning_rate": 3.8872412288347685e-05, + "loss": 0.0, + "step": 49697 + }, + { + "epoch": 4.637305216011944, + "grad_norm": NaN, + "learning_rate": 3.886733206354004e-05, + "loss": 0.0, + "step": 49698 + }, + { + "epoch": 4.637398525706821, + "grad_norm": NaN, + "learning_rate": 3.886225212130984e-05, + "loss": 0.0, + "step": 49699 + }, + { + "epoch": 4.637491835401698, + "grad_norm": NaN, + "learning_rate": 3.8857172461670024e-05, + "loss": 0.0, + "step": 49700 + }, + { + "epoch": 4.637585145096575, + "grad_norm": NaN, + "learning_rate": 3.885209308463348e-05, + "loss": 0.0, + "step": 49701 + }, + { + "epoch": 4.637678454791453, + "grad_norm": NaN, + "learning_rate": 3.884701399021315e-05, + "loss": 0.0, + "step": 49702 + }, + { + "epoch": 4.63777176448633, + "grad_norm": NaN, + "learning_rate": 3.884193517842194e-05, + "loss": 0.0, + "step": 49703 + }, + { + "epoch": 4.6378650741812075, + "grad_norm": NaN, + "learning_rate": 3.883685664927276e-05, + "loss": 0.0, + "step": 49704 + }, + { + "epoch": 4.637958383876085, + "grad_norm": NaN, + "learning_rate": 3.883177840277851e-05, + "loss": 0.0, + "step": 49705 + }, + { + "epoch": 4.638051693570962, + "grad_norm": NaN, + "learning_rate": 3.882670043895213e-05, + "loss": 0.0, + "step": 49706 + }, + { + "epoch": 4.638145003265839, + "grad_norm": NaN, + "learning_rate": 3.882162275780651e-05, + "loss": 0.0, + "step": 49707 + }, + { + "epoch": 4.638238312960716, + "grad_norm": NaN, + "learning_rate": 3.881654535935458e-05, + "loss": 0.0, + "step": 49708 + }, + { + "epoch": 4.638331622655594, + "grad_norm": NaN, + "learning_rate": 3.881146824360923e-05, + "loss": 0.0, + "step": 49709 + }, + { + "epoch": 4.638424932350471, + "grad_norm": NaN, + "learning_rate": 3.880639141058339e-05, + "loss": 0.0, + "step": 49710 + }, + { + "epoch": 4.638518242045349, + "grad_norm": NaN, + "learning_rate": 3.880131486028995e-05, + "loss": 0.0, + "step": 49711 + }, + { + "epoch": 4.638611551740226, + "grad_norm": NaN, + "learning_rate": 3.879623859274184e-05, + "loss": 0.0, + "step": 49712 + }, + { + "epoch": 4.638704861435103, + "grad_norm": NaN, + "learning_rate": 3.8791162607951947e-05, + "loss": 0.0, + "step": 49713 + }, + { + "epoch": 4.638798171129981, + "grad_norm": NaN, + "learning_rate": 3.878608690593318e-05, + "loss": 0.0, + "step": 49714 + }, + { + "epoch": 4.638891480824857, + "grad_norm": NaN, + "learning_rate": 3.8781011486698454e-05, + "loss": 0.0, + "step": 49715 + }, + { + "epoch": 4.638984790519735, + "grad_norm": NaN, + "learning_rate": 3.877593635026067e-05, + "loss": 0.0, + "step": 49716 + }, + { + "epoch": 4.639078100214612, + "grad_norm": NaN, + "learning_rate": 3.877086149663272e-05, + "loss": 0.0, + "step": 49717 + }, + { + "epoch": 4.63917140990949, + "grad_norm": NaN, + "learning_rate": 3.876578692582754e-05, + "loss": 0.0, + "step": 49718 + }, + { + "epoch": 4.639264719604367, + "grad_norm": NaN, + "learning_rate": 3.8760712637858e-05, + "loss": 0.0, + "step": 49719 + }, + { + "epoch": 4.6393580292992445, + "grad_norm": NaN, + "learning_rate": 3.8755638632737036e-05, + "loss": 0.0, + "step": 49720 + }, + { + "epoch": 4.639451338994122, + "grad_norm": NaN, + "learning_rate": 3.875056491047751e-05, + "loss": 0.0, + "step": 49721 + }, + { + "epoch": 4.639544648688998, + "grad_norm": NaN, + "learning_rate": 3.874549147109235e-05, + "loss": 0.0, + "step": 49722 + }, + { + "epoch": 4.639637958383876, + "grad_norm": NaN, + "learning_rate": 3.8740418314594456e-05, + "loss": 0.0, + "step": 49723 + }, + { + "epoch": 4.639731268078753, + "grad_norm": NaN, + "learning_rate": 3.8735345440996704e-05, + "loss": 0.0, + "step": 49724 + }, + { + "epoch": 4.639824577773631, + "grad_norm": NaN, + "learning_rate": 3.873027285031202e-05, + "loss": 0.0, + "step": 49725 + }, + { + "epoch": 4.639917887468508, + "grad_norm": NaN, + "learning_rate": 3.872520054255329e-05, + "loss": 0.0, + "step": 49726 + }, + { + "epoch": 4.6400111971633855, + "grad_norm": NaN, + "learning_rate": 3.872012851773342e-05, + "loss": 0.0, + "step": 49727 + }, + { + "epoch": 4.640104506858263, + "grad_norm": NaN, + "learning_rate": 3.871505677586529e-05, + "loss": 0.0, + "step": 49728 + }, + { + "epoch": 4.64019781655314, + "grad_norm": NaN, + "learning_rate": 3.870998531696181e-05, + "loss": 0.0, + "step": 49729 + }, + { + "epoch": 4.640291126248017, + "grad_norm": NaN, + "learning_rate": 3.8704914141035825e-05, + "loss": 0.0, + "step": 49730 + }, + { + "epoch": 4.640384435942894, + "grad_norm": NaN, + "learning_rate": 3.869984324810037e-05, + "loss": 0.0, + "step": 49731 + }, + { + "epoch": 4.640477745637772, + "grad_norm": NaN, + "learning_rate": 3.8694772638168186e-05, + "loss": 0.0, + "step": 49732 + }, + { + "epoch": 4.640571055332649, + "grad_norm": NaN, + "learning_rate": 3.86897023112522e-05, + "loss": 0.0, + "step": 49733 + }, + { + "epoch": 4.6406643650275266, + "grad_norm": NaN, + "learning_rate": 3.868463226736541e-05, + "loss": 0.0, + "step": 49734 + }, + { + "epoch": 4.640757674722404, + "grad_norm": NaN, + "learning_rate": 3.867956250652057e-05, + "loss": 0.0, + "step": 49735 + }, + { + "epoch": 4.6408509844172805, + "grad_norm": NaN, + "learning_rate": 3.867449302873058e-05, + "loss": 0.0, + "step": 49736 + }, + { + "epoch": 4.640944294112158, + "grad_norm": NaN, + "learning_rate": 3.866942383400848e-05, + "loss": 0.0, + "step": 49737 + }, + { + "epoch": 4.641037603807035, + "grad_norm": NaN, + "learning_rate": 3.866435492236701e-05, + "loss": 0.0, + "step": 49738 + }, + { + "epoch": 4.641130913501913, + "grad_norm": NaN, + "learning_rate": 3.86592862938191e-05, + "loss": 0.0, + "step": 49739 + }, + { + "epoch": 4.64122422319679, + "grad_norm": NaN, + "learning_rate": 3.865421794837764e-05, + "loss": 0.0, + "step": 49740 + }, + { + "epoch": 4.641317532891668, + "grad_norm": NaN, + "learning_rate": 3.864914988605553e-05, + "loss": 0.0, + "step": 49741 + }, + { + "epoch": 4.641410842586545, + "grad_norm": NaN, + "learning_rate": 3.864408210686566e-05, + "loss": 0.0, + "step": 49742 + }, + { + "epoch": 4.641504152281422, + "grad_norm": NaN, + "learning_rate": 3.863901461082088e-05, + "loss": 0.0, + "step": 49743 + }, + { + "epoch": 4.641597461976299, + "grad_norm": NaN, + "learning_rate": 3.863394739793411e-05, + "loss": 0.0, + "step": 49744 + }, + { + "epoch": 4.641690771671176, + "grad_norm": NaN, + "learning_rate": 3.862888046821821e-05, + "loss": 0.0, + "step": 49745 + }, + { + "epoch": 4.641784081366054, + "grad_norm": NaN, + "learning_rate": 3.8623813821686086e-05, + "loss": 0.0, + "step": 49746 + }, + { + "epoch": 4.641877391060931, + "grad_norm": NaN, + "learning_rate": 3.861874745835061e-05, + "loss": 0.0, + "step": 49747 + }, + { + "epoch": 4.641970700755809, + "grad_norm": NaN, + "learning_rate": 3.861368137822467e-05, + "loss": 0.0, + "step": 49748 + }, + { + "epoch": 4.642064010450686, + "grad_norm": NaN, + "learning_rate": 3.860861558132115e-05, + "loss": 0.0, + "step": 49749 + }, + { + "epoch": 4.6421573201455635, + "grad_norm": NaN, + "learning_rate": 3.860355006765291e-05, + "loss": 0.0, + "step": 49750 + }, + { + "epoch": 4.64225062984044, + "grad_norm": NaN, + "learning_rate": 3.859848483723284e-05, + "loss": 0.0, + "step": 49751 + }, + { + "epoch": 4.642343939535317, + "grad_norm": NaN, + "learning_rate": 3.859341989007382e-05, + "loss": 0.0, + "step": 49752 + }, + { + "epoch": 4.642437249230195, + "grad_norm": NaN, + "learning_rate": 3.858835522618876e-05, + "loss": 0.0, + "step": 49753 + }, + { + "epoch": 4.642530558925072, + "grad_norm": NaN, + "learning_rate": 3.858329084559049e-05, + "loss": 0.0, + "step": 49754 + }, + { + "epoch": 4.64262386861995, + "grad_norm": NaN, + "learning_rate": 3.8578226748291905e-05, + "loss": 0.0, + "step": 49755 + }, + { + "epoch": 4.642717178314827, + "grad_norm": NaN, + "learning_rate": 3.85731629343059e-05, + "loss": 0.0, + "step": 49756 + }, + { + "epoch": 4.6428104880097045, + "grad_norm": NaN, + "learning_rate": 3.8568099403645316e-05, + "loss": 0.0, + "step": 49757 + }, + { + "epoch": 4.642903797704582, + "grad_norm": NaN, + "learning_rate": 3.856303615632305e-05, + "loss": 0.0, + "step": 49758 + }, + { + "epoch": 4.6429971073994585, + "grad_norm": NaN, + "learning_rate": 3.855797319235198e-05, + "loss": 0.0, + "step": 49759 + }, + { + "epoch": 4.643090417094336, + "grad_norm": NaN, + "learning_rate": 3.855291051174498e-05, + "loss": 0.0, + "step": 49760 + }, + { + "epoch": 4.643183726789213, + "grad_norm": NaN, + "learning_rate": 3.854784811451489e-05, + "loss": 0.0, + "step": 49761 + }, + { + "epoch": 4.643277036484091, + "grad_norm": NaN, + "learning_rate": 3.854278600067464e-05, + "loss": 0.0, + "step": 49762 + }, + { + "epoch": 4.643370346178968, + "grad_norm": NaN, + "learning_rate": 3.853772417023704e-05, + "loss": 0.0, + "step": 49763 + }, + { + "epoch": 4.643463655873846, + "grad_norm": NaN, + "learning_rate": 3.853266262321501e-05, + "loss": 0.0, + "step": 49764 + }, + { + "epoch": 4.643556965568722, + "grad_norm": NaN, + "learning_rate": 3.852760135962139e-05, + "loss": 0.0, + "step": 49765 + }, + { + "epoch": 4.6436502752635995, + "grad_norm": NaN, + "learning_rate": 3.852254037946907e-05, + "loss": 0.0, + "step": 49766 + }, + { + "epoch": 4.643743584958477, + "grad_norm": NaN, + "learning_rate": 3.8517479682770885e-05, + "loss": 0.0, + "step": 49767 + }, + { + "epoch": 4.643836894653354, + "grad_norm": NaN, + "learning_rate": 3.85124192695397e-05, + "loss": 0.0, + "step": 49768 + }, + { + "epoch": 4.643930204348232, + "grad_norm": NaN, + "learning_rate": 3.850735913978849e-05, + "loss": 0.0, + "step": 49769 + }, + { + "epoch": 4.644023514043109, + "grad_norm": NaN, + "learning_rate": 3.850229929353001e-05, + "loss": 0.0, + "step": 49770 + }, + { + "epoch": 4.644116823737987, + "grad_norm": NaN, + "learning_rate": 3.8497239730777094e-05, + "loss": 0.0, + "step": 49771 + }, + { + "epoch": 4.644210133432864, + "grad_norm": NaN, + "learning_rate": 3.849218045154276e-05, + "loss": 0.0, + "step": 49772 + }, + { + "epoch": 4.6443034431277415, + "grad_norm": NaN, + "learning_rate": 3.8487121455839736e-05, + "loss": 0.0, + "step": 49773 + }, + { + "epoch": 4.644396752822618, + "grad_norm": NaN, + "learning_rate": 3.8482062743680885e-05, + "loss": 0.0, + "step": 49774 + }, + { + "epoch": 4.644490062517495, + "grad_norm": NaN, + "learning_rate": 3.8477004315079196e-05, + "loss": 0.0, + "step": 49775 + }, + { + "epoch": 4.644583372212373, + "grad_norm": NaN, + "learning_rate": 3.8471946170047406e-05, + "loss": 0.0, + "step": 49776 + }, + { + "epoch": 4.64467668190725, + "grad_norm": NaN, + "learning_rate": 3.846688830859839e-05, + "loss": 0.0, + "step": 49777 + }, + { + "epoch": 4.644769991602128, + "grad_norm": NaN, + "learning_rate": 3.846183073074512e-05, + "loss": 0.0, + "step": 49778 + }, + { + "epoch": 4.644863301297005, + "grad_norm": NaN, + "learning_rate": 3.8456773436500323e-05, + "loss": 0.0, + "step": 49779 + }, + { + "epoch": 4.644956610991882, + "grad_norm": NaN, + "learning_rate": 3.845171642587687e-05, + "loss": 0.0, + "step": 49780 + }, + { + "epoch": 4.645049920686759, + "grad_norm": NaN, + "learning_rate": 3.844665969888775e-05, + "loss": 0.0, + "step": 49781 + }, + { + "epoch": 4.6451432303816365, + "grad_norm": NaN, + "learning_rate": 3.844160325554568e-05, + "loss": 0.0, + "step": 49782 + }, + { + "epoch": 4.645236540076514, + "grad_norm": NaN, + "learning_rate": 3.8436547095863576e-05, + "loss": 0.0, + "step": 49783 + }, + { + "epoch": 4.645329849771391, + "grad_norm": NaN, + "learning_rate": 3.8431491219854274e-05, + "loss": 0.0, + "step": 49784 + }, + { + "epoch": 4.645423159466269, + "grad_norm": NaN, + "learning_rate": 3.842643562753065e-05, + "loss": 0.0, + "step": 49785 + }, + { + "epoch": 4.645516469161146, + "grad_norm": NaN, + "learning_rate": 3.8421380318905556e-05, + "loss": 0.0, + "step": 49786 + }, + { + "epoch": 4.6456097788560236, + "grad_norm": NaN, + "learning_rate": 3.841632529399183e-05, + "loss": 0.0, + "step": 49787 + }, + { + "epoch": 4.6457030885509, + "grad_norm": NaN, + "learning_rate": 3.841127055280233e-05, + "loss": 0.0, + "step": 49788 + }, + { + "epoch": 4.6457963982457775, + "grad_norm": NaN, + "learning_rate": 3.840621609534992e-05, + "loss": 0.0, + "step": 49789 + }, + { + "epoch": 4.645889707940655, + "grad_norm": NaN, + "learning_rate": 3.8401161921647443e-05, + "loss": 0.0, + "step": 49790 + }, + { + "epoch": 4.645983017635532, + "grad_norm": NaN, + "learning_rate": 3.839610803170777e-05, + "loss": 0.0, + "step": 49791 + }, + { + "epoch": 4.64607632733041, + "grad_norm": NaN, + "learning_rate": 3.839105442554371e-05, + "loss": 0.0, + "step": 49792 + }, + { + "epoch": 4.646169637025287, + "grad_norm": NaN, + "learning_rate": 3.838600110316814e-05, + "loss": 0.0, + "step": 49793 + }, + { + "epoch": 4.646262946720165, + "grad_norm": NaN, + "learning_rate": 3.838094806459393e-05, + "loss": 0.0, + "step": 49794 + }, + { + "epoch": 4.646356256415041, + "grad_norm": NaN, + "learning_rate": 3.8375895309833884e-05, + "loss": 0.0, + "step": 49795 + }, + { + "epoch": 4.6464495661099185, + "grad_norm": NaN, + "learning_rate": 3.837084283890087e-05, + "loss": 0.0, + "step": 49796 + }, + { + "epoch": 4.646542875804796, + "grad_norm": NaN, + "learning_rate": 3.836579065180776e-05, + "loss": 0.0, + "step": 49797 + }, + { + "epoch": 4.646636185499673, + "grad_norm": NaN, + "learning_rate": 3.836073874856736e-05, + "loss": 0.0, + "step": 49798 + }, + { + "epoch": 4.646729495194551, + "grad_norm": NaN, + "learning_rate": 3.8355687129192515e-05, + "loss": 0.0, + "step": 49799 + }, + { + "epoch": 4.646822804889428, + "grad_norm": NaN, + "learning_rate": 3.835063579369611e-05, + "loss": 0.0, + "step": 49800 + }, + { + "epoch": 4.646916114584306, + "grad_norm": NaN, + "learning_rate": 3.834558474209095e-05, + "loss": 0.0, + "step": 49801 + }, + { + "epoch": 4.647009424279183, + "grad_norm": NaN, + "learning_rate": 3.834053397438991e-05, + "loss": 0.0, + "step": 49802 + }, + { + "epoch": 4.64710273397406, + "grad_norm": NaN, + "learning_rate": 3.833548349060576e-05, + "loss": 0.0, + "step": 49803 + }, + { + "epoch": 4.647196043668937, + "grad_norm": NaN, + "learning_rate": 3.8330433290751484e-05, + "loss": 0.0, + "step": 49804 + }, + { + "epoch": 4.647289353363814, + "grad_norm": NaN, + "learning_rate": 3.83253833748398e-05, + "loss": 0.0, + "step": 49805 + }, + { + "epoch": 4.647382663058692, + "grad_norm": NaN, + "learning_rate": 3.8320333742883534e-05, + "loss": 0.0, + "step": 49806 + }, + { + "epoch": 4.647475972753569, + "grad_norm": NaN, + "learning_rate": 3.8315284394895676e-05, + "loss": 0.0, + "step": 49807 + }, + { + "epoch": 4.647569282448447, + "grad_norm": NaN, + "learning_rate": 3.8310235330888915e-05, + "loss": 0.0, + "step": 49808 + }, + { + "epoch": 4.647662592143323, + "grad_norm": NaN, + "learning_rate": 3.8305186550876085e-05, + "loss": 0.0, + "step": 49809 + }, + { + "epoch": 4.647755901838201, + "grad_norm": NaN, + "learning_rate": 3.830013805487019e-05, + "loss": 0.0, + "step": 49810 + }, + { + "epoch": 4.647849211533078, + "grad_norm": NaN, + "learning_rate": 3.829508984288389e-05, + "loss": 0.0, + "step": 49811 + }, + { + "epoch": 4.6479425212279555, + "grad_norm": NaN, + "learning_rate": 3.829004191493007e-05, + "loss": 0.0, + "step": 49812 + }, + { + "epoch": 4.648035830922833, + "grad_norm": NaN, + "learning_rate": 3.828499427102165e-05, + "loss": 0.0, + "step": 49813 + }, + { + "epoch": 4.64812914061771, + "grad_norm": NaN, + "learning_rate": 3.827994691117136e-05, + "loss": 0.0, + "step": 49814 + }, + { + "epoch": 4.648222450312588, + "grad_norm": NaN, + "learning_rate": 3.8274899835392017e-05, + "loss": 0.0, + "step": 49815 + }, + { + "epoch": 4.648315760007465, + "grad_norm": NaN, + "learning_rate": 3.8269853043696616e-05, + "loss": 0.0, + "step": 49816 + }, + { + "epoch": 4.648409069702342, + "grad_norm": NaN, + "learning_rate": 3.8264806536097814e-05, + "loss": 0.0, + "step": 49817 + }, + { + "epoch": 4.648502379397219, + "grad_norm": NaN, + "learning_rate": 3.825976031260849e-05, + "loss": 0.0, + "step": 49818 + }, + { + "epoch": 4.6485956890920965, + "grad_norm": NaN, + "learning_rate": 3.825471437324157e-05, + "loss": 0.0, + "step": 49819 + }, + { + "epoch": 4.648688998786974, + "grad_norm": NaN, + "learning_rate": 3.824966871800977e-05, + "loss": 0.0, + "step": 49820 + }, + { + "epoch": 4.648782308481851, + "grad_norm": NaN, + "learning_rate": 3.824462334692592e-05, + "loss": 0.0, + "step": 49821 + }, + { + "epoch": 4.648875618176729, + "grad_norm": NaN, + "learning_rate": 3.8239578260002976e-05, + "loss": 0.0, + "step": 49822 + }, + { + "epoch": 4.648968927871606, + "grad_norm": NaN, + "learning_rate": 3.823453345725361e-05, + "loss": 0.0, + "step": 49823 + }, + { + "epoch": 4.649062237566483, + "grad_norm": NaN, + "learning_rate": 3.822948893869069e-05, + "loss": 0.0, + "step": 49824 + }, + { + "epoch": 4.64915554726136, + "grad_norm": NaN, + "learning_rate": 3.8224444704327174e-05, + "loss": 0.0, + "step": 49825 + }, + { + "epoch": 4.649248856956238, + "grad_norm": NaN, + "learning_rate": 3.821940075417572e-05, + "loss": 0.0, + "step": 49826 + }, + { + "epoch": 4.649342166651115, + "grad_norm": NaN, + "learning_rate": 3.821435708824923e-05, + "loss": 0.0, + "step": 49827 + }, + { + "epoch": 4.649435476345992, + "grad_norm": NaN, + "learning_rate": 3.8209313706560525e-05, + "loss": 0.0, + "step": 49828 + }, + { + "epoch": 4.64952878604087, + "grad_norm": NaN, + "learning_rate": 3.8204270609122406e-05, + "loss": 0.0, + "step": 49829 + }, + { + "epoch": 4.649622095735747, + "grad_norm": NaN, + "learning_rate": 3.819922779594772e-05, + "loss": 0.0, + "step": 49830 + }, + { + "epoch": 4.649715405430625, + "grad_norm": NaN, + "learning_rate": 3.819418526704927e-05, + "loss": 0.0, + "step": 49831 + }, + { + "epoch": 4.649808715125501, + "grad_norm": NaN, + "learning_rate": 3.818914302243989e-05, + "loss": 0.0, + "step": 49832 + }, + { + "epoch": 4.649902024820379, + "grad_norm": NaN, + "learning_rate": 3.8184101062132396e-05, + "loss": 0.0, + "step": 49833 + }, + { + "epoch": 4.649995334515256, + "grad_norm": NaN, + "learning_rate": 3.817905938613962e-05, + "loss": 0.0, + "step": 49834 + }, + { + "epoch": 4.6500886442101335, + "grad_norm": NaN, + "learning_rate": 3.817401799447436e-05, + "loss": 0.0, + "step": 49835 + }, + { + "epoch": 4.650181953905011, + "grad_norm": NaN, + "learning_rate": 3.816897688714945e-05, + "loss": 0.0, + "step": 49836 + }, + { + "epoch": 4.650275263599888, + "grad_norm": NaN, + "learning_rate": 3.8163936064177715e-05, + "loss": 0.0, + "step": 49837 + }, + { + "epoch": 4.650368573294765, + "grad_norm": NaN, + "learning_rate": 3.815889552557196e-05, + "loss": 0.0, + "step": 49838 + }, + { + "epoch": 4.650461882989642, + "grad_norm": NaN, + "learning_rate": 3.815385527134499e-05, + "loss": 0.0, + "step": 49839 + }, + { + "epoch": 4.65055519268452, + "grad_norm": NaN, + "learning_rate": 3.814881530150965e-05, + "loss": 0.0, + "step": 49840 + }, + { + "epoch": 4.650648502379397, + "grad_norm": NaN, + "learning_rate": 3.814377561607869e-05, + "loss": 0.0, + "step": 49841 + }, + { + "epoch": 4.6507418120742745, + "grad_norm": NaN, + "learning_rate": 3.813873621506506e-05, + "loss": 0.0, + "step": 49842 + }, + { + "epoch": 4.650835121769152, + "grad_norm": NaN, + "learning_rate": 3.8133697098481453e-05, + "loss": 0.0, + "step": 49843 + }, + { + "epoch": 4.650928431464029, + "grad_norm": NaN, + "learning_rate": 3.8128658266340665e-05, + "loss": 0.0, + "step": 49844 + }, + { + "epoch": 4.651021741158907, + "grad_norm": NaN, + "learning_rate": 3.812361971865564e-05, + "loss": 0.0, + "step": 49845 + }, + { + "epoch": 4.651115050853784, + "grad_norm": NaN, + "learning_rate": 3.811858145543908e-05, + "loss": 0.0, + "step": 49846 + }, + { + "epoch": 4.651208360548661, + "grad_norm": NaN, + "learning_rate": 3.811354347670377e-05, + "loss": 0.0, + "step": 49847 + }, + { + "epoch": 4.651301670243538, + "grad_norm": NaN, + "learning_rate": 3.810850578246268e-05, + "loss": 0.0, + "step": 49848 + }, + { + "epoch": 4.6513949799384156, + "grad_norm": NaN, + "learning_rate": 3.810346837272847e-05, + "loss": 0.0, + "step": 49849 + }, + { + "epoch": 4.651488289633293, + "grad_norm": NaN, + "learning_rate": 3.8098431247513936e-05, + "loss": 0.0, + "step": 49850 + }, + { + "epoch": 4.65158159932817, + "grad_norm": NaN, + "learning_rate": 3.8093394406832026e-05, + "loss": 0.0, + "step": 49851 + }, + { + "epoch": 4.651674909023048, + "grad_norm": NaN, + "learning_rate": 3.8088357850695445e-05, + "loss": 0.0, + "step": 49852 + }, + { + "epoch": 4.651768218717924, + "grad_norm": NaN, + "learning_rate": 3.8083321579116954e-05, + "loss": 0.0, + "step": 49853 + }, + { + "epoch": 4.651861528412802, + "grad_norm": NaN, + "learning_rate": 3.807828559210953e-05, + "loss": 0.0, + "step": 49854 + }, + { + "epoch": 4.651954838107679, + "grad_norm": NaN, + "learning_rate": 3.8073249889685814e-05, + "loss": 0.0, + "step": 49855 + }, + { + "epoch": 4.652048147802557, + "grad_norm": NaN, + "learning_rate": 3.806821447185863e-05, + "loss": 0.0, + "step": 49856 + }, + { + "epoch": 4.652141457497434, + "grad_norm": NaN, + "learning_rate": 3.806317933864091e-05, + "loss": 0.0, + "step": 49857 + }, + { + "epoch": 4.652234767192311, + "grad_norm": NaN, + "learning_rate": 3.8058144490045316e-05, + "loss": 0.0, + "step": 49858 + }, + { + "epoch": 4.652328076887189, + "grad_norm": NaN, + "learning_rate": 3.8053109926084656e-05, + "loss": 0.0, + "step": 49859 + }, + { + "epoch": 4.652421386582066, + "grad_norm": NaN, + "learning_rate": 3.804807564677187e-05, + "loss": 0.0, + "step": 49860 + }, + { + "epoch": 4.652514696276943, + "grad_norm": NaN, + "learning_rate": 3.804304165211963e-05, + "loss": 0.0, + "step": 49861 + }, + { + "epoch": 4.65260800597182, + "grad_norm": NaN, + "learning_rate": 3.803800794214072e-05, + "loss": 0.0, + "step": 49862 + }, + { + "epoch": 4.652701315666698, + "grad_norm": NaN, + "learning_rate": 3.803297451684807e-05, + "loss": 0.0, + "step": 49863 + }, + { + "epoch": 4.652794625361575, + "grad_norm": NaN, + "learning_rate": 3.802794137625437e-05, + "loss": 0.0, + "step": 49864 + }, + { + "epoch": 4.6528879350564525, + "grad_norm": NaN, + "learning_rate": 3.802290852037239e-05, + "loss": 0.0, + "step": 49865 + }, + { + "epoch": 4.65298124475133, + "grad_norm": NaN, + "learning_rate": 3.8017875949215064e-05, + "loss": 0.0, + "step": 49866 + }, + { + "epoch": 4.653074554446207, + "grad_norm": NaN, + "learning_rate": 3.8012843662795076e-05, + "loss": 0.0, + "step": 49867 + }, + { + "epoch": 4.653167864141084, + "grad_norm": NaN, + "learning_rate": 3.8007811661125216e-05, + "loss": 0.0, + "step": 49868 + }, + { + "epoch": 4.653261173835961, + "grad_norm": NaN, + "learning_rate": 3.8002779944218395e-05, + "loss": 0.0, + "step": 49869 + }, + { + "epoch": 4.653354483530839, + "grad_norm": NaN, + "learning_rate": 3.799774851208728e-05, + "loss": 0.0, + "step": 49870 + }, + { + "epoch": 4.653447793225716, + "grad_norm": NaN, + "learning_rate": 3.799271736474468e-05, + "loss": 0.0, + "step": 49871 + }, + { + "epoch": 4.6535411029205935, + "grad_norm": NaN, + "learning_rate": 3.798768650220351e-05, + "loss": 0.0, + "step": 49872 + }, + { + "epoch": 4.653634412615471, + "grad_norm": NaN, + "learning_rate": 3.7982655924476406e-05, + "loss": 0.0, + "step": 49873 + }, + { + "epoch": 4.653727722310348, + "grad_norm": NaN, + "learning_rate": 3.7977625631576255e-05, + "loss": 0.0, + "step": 49874 + }, + { + "epoch": 4.653821032005226, + "grad_norm": NaN, + "learning_rate": 3.79725956235158e-05, + "loss": 0.0, + "step": 49875 + }, + { + "epoch": 4.653914341700102, + "grad_norm": NaN, + "learning_rate": 3.796756590030785e-05, + "loss": 0.0, + "step": 49876 + }, + { + "epoch": 4.65400765139498, + "grad_norm": NaN, + "learning_rate": 3.796253646196521e-05, + "loss": 0.0, + "step": 49877 + }, + { + "epoch": 4.654100961089857, + "grad_norm": NaN, + "learning_rate": 3.795750730850064e-05, + "loss": 0.0, + "step": 49878 + }, + { + "epoch": 4.654194270784735, + "grad_norm": NaN, + "learning_rate": 3.795247843992689e-05, + "loss": 0.0, + "step": 49879 + }, + { + "epoch": 4.654287580479612, + "grad_norm": NaN, + "learning_rate": 3.794744985625689e-05, + "loss": 0.0, + "step": 49880 + }, + { + "epoch": 4.654380890174489, + "grad_norm": NaN, + "learning_rate": 3.7942421557503285e-05, + "loss": 0.0, + "step": 49881 + }, + { + "epoch": 4.654474199869366, + "grad_norm": NaN, + "learning_rate": 3.793739354367887e-05, + "loss": 0.0, + "step": 49882 + }, + { + "epoch": 4.654567509564243, + "grad_norm": NaN, + "learning_rate": 3.793236581479655e-05, + "loss": 0.0, + "step": 49883 + }, + { + "epoch": 4.654660819259121, + "grad_norm": NaN, + "learning_rate": 3.792733837086899e-05, + "loss": 0.0, + "step": 49884 + }, + { + "epoch": 4.654754128953998, + "grad_norm": NaN, + "learning_rate": 3.792231121190896e-05, + "loss": 0.0, + "step": 49885 + }, + { + "epoch": 4.654847438648876, + "grad_norm": NaN, + "learning_rate": 3.7917284337929385e-05, + "loss": 0.0, + "step": 49886 + }, + { + "epoch": 4.654940748343753, + "grad_norm": NaN, + "learning_rate": 3.791225774894292e-05, + "loss": 0.0, + "step": 49887 + }, + { + "epoch": 4.6550340580386305, + "grad_norm": NaN, + "learning_rate": 3.790723144496231e-05, + "loss": 0.0, + "step": 49888 + }, + { + "epoch": 4.655127367733508, + "grad_norm": NaN, + "learning_rate": 3.790220542600053e-05, + "loss": 0.0, + "step": 49889 + }, + { + "epoch": 4.655220677428385, + "grad_norm": NaN, + "learning_rate": 3.7897179692070156e-05, + "loss": 0.0, + "step": 49890 + }, + { + "epoch": 4.655313987123262, + "grad_norm": NaN, + "learning_rate": 3.7892154243184023e-05, + "loss": 0.0, + "step": 49891 + }, + { + "epoch": 4.655407296818139, + "grad_norm": NaN, + "learning_rate": 3.7887129079355025e-05, + "loss": 0.0, + "step": 49892 + }, + { + "epoch": 4.655500606513017, + "grad_norm": NaN, + "learning_rate": 3.7882104200595785e-05, + "loss": 0.0, + "step": 49893 + }, + { + "epoch": 4.655593916207894, + "grad_norm": NaN, + "learning_rate": 3.787707960691911e-05, + "loss": 0.0, + "step": 49894 + }, + { + "epoch": 4.6556872259027715, + "grad_norm": NaN, + "learning_rate": 3.7872055298337896e-05, + "loss": 0.0, + "step": 49895 + }, + { + "epoch": 4.655780535597649, + "grad_norm": NaN, + "learning_rate": 3.786703127486479e-05, + "loss": 0.0, + "step": 49896 + }, + { + "epoch": 4.6558738452925255, + "grad_norm": NaN, + "learning_rate": 3.786200753651255e-05, + "loss": 0.0, + "step": 49897 + }, + { + "epoch": 4.655967154987403, + "grad_norm": NaN, + "learning_rate": 3.78569840832941e-05, + "loss": 0.0, + "step": 49898 + }, + { + "epoch": 4.65606046468228, + "grad_norm": NaN, + "learning_rate": 3.785196091522209e-05, + "loss": 0.0, + "step": 49899 + }, + { + "epoch": 4.656153774377158, + "grad_norm": NaN, + "learning_rate": 3.784693803230926e-05, + "loss": 0.0, + "step": 49900 + }, + { + "epoch": 4.656247084072035, + "grad_norm": NaN, + "learning_rate": 3.784191543456855e-05, + "loss": 0.0, + "step": 49901 + }, + { + "epoch": 4.6563403937669126, + "grad_norm": NaN, + "learning_rate": 3.783689312201256e-05, + "loss": 0.0, + "step": 49902 + }, + { + "epoch": 4.65643370346179, + "grad_norm": NaN, + "learning_rate": 3.7831871094654096e-05, + "loss": 0.0, + "step": 49903 + }, + { + "epoch": 4.656527013156667, + "grad_norm": NaN, + "learning_rate": 3.7826849352506045e-05, + "loss": 0.0, + "step": 49904 + }, + { + "epoch": 4.656620322851544, + "grad_norm": NaN, + "learning_rate": 3.782182789558104e-05, + "loss": 0.0, + "step": 49905 + }, + { + "epoch": 4.656713632546421, + "grad_norm": NaN, + "learning_rate": 3.7816806723891856e-05, + "loss": 0.0, + "step": 49906 + }, + { + "epoch": 4.656806942241299, + "grad_norm": NaN, + "learning_rate": 3.78117858374514e-05, + "loss": 0.0, + "step": 49907 + }, + { + "epoch": 4.656900251936176, + "grad_norm": NaN, + "learning_rate": 3.780676523627227e-05, + "loss": 0.0, + "step": 49908 + }, + { + "epoch": 4.656993561631054, + "grad_norm": NaN, + "learning_rate": 3.780174492036728e-05, + "loss": 0.0, + "step": 49909 + }, + { + "epoch": 4.657086871325931, + "grad_norm": NaN, + "learning_rate": 3.7796724889749296e-05, + "loss": 0.0, + "step": 49910 + }, + { + "epoch": 4.657180181020808, + "grad_norm": NaN, + "learning_rate": 3.779170514443095e-05, + "loss": 0.0, + "step": 49911 + }, + { + "epoch": 4.657273490715685, + "grad_norm": NaN, + "learning_rate": 3.7786685684425025e-05, + "loss": 0.0, + "step": 49912 + }, + { + "epoch": 4.657366800410562, + "grad_norm": NaN, + "learning_rate": 3.778166650974442e-05, + "loss": 0.0, + "step": 49913 + }, + { + "epoch": 4.65746011010544, + "grad_norm": NaN, + "learning_rate": 3.777664762040173e-05, + "loss": 0.0, + "step": 49914 + }, + { + "epoch": 4.657553419800317, + "grad_norm": NaN, + "learning_rate": 3.777162901640975e-05, + "loss": 0.0, + "step": 49915 + }, + { + "epoch": 4.657646729495195, + "grad_norm": NaN, + "learning_rate": 3.776661069778137e-05, + "loss": 0.0, + "step": 49916 + }, + { + "epoch": 4.657740039190072, + "grad_norm": NaN, + "learning_rate": 3.776159266452915e-05, + "loss": 0.0, + "step": 49917 + }, + { + "epoch": 4.6578333488849495, + "grad_norm": NaN, + "learning_rate": 3.7756574916666034e-05, + "loss": 0.0, + "step": 49918 + }, + { + "epoch": 4.657926658579827, + "grad_norm": NaN, + "learning_rate": 3.7751557454204676e-05, + "loss": 0.0, + "step": 49919 + }, + { + "epoch": 4.658019968274703, + "grad_norm": NaN, + "learning_rate": 3.774654027715779e-05, + "loss": 0.0, + "step": 49920 + }, + { + "epoch": 4.658113277969581, + "grad_norm": NaN, + "learning_rate": 3.77415233855383e-05, + "loss": 0.0, + "step": 49921 + }, + { + "epoch": 4.658206587664458, + "grad_norm": NaN, + "learning_rate": 3.773650677935882e-05, + "loss": 0.0, + "step": 49922 + }, + { + "epoch": 4.658299897359336, + "grad_norm": NaN, + "learning_rate": 3.7731490458632104e-05, + "loss": 0.0, + "step": 49923 + }, + { + "epoch": 4.658393207054213, + "grad_norm": NaN, + "learning_rate": 3.7726474423371036e-05, + "loss": 0.0, + "step": 49924 + }, + { + "epoch": 4.6584865167490905, + "grad_norm": NaN, + "learning_rate": 3.772145867358824e-05, + "loss": 0.0, + "step": 49925 + }, + { + "epoch": 4.658579826443967, + "grad_norm": NaN, + "learning_rate": 3.7716443209296476e-05, + "loss": 0.0, + "step": 49926 + }, + { + "epoch": 4.6586731361388445, + "grad_norm": NaN, + "learning_rate": 3.771142803050863e-05, + "loss": 0.0, + "step": 49927 + }, + { + "epoch": 4.658766445833722, + "grad_norm": NaN, + "learning_rate": 3.77064131372373e-05, + "loss": 0.0, + "step": 49928 + }, + { + "epoch": 4.658859755528599, + "grad_norm": NaN, + "learning_rate": 3.770139852949527e-05, + "loss": 0.0, + "step": 49929 + }, + { + "epoch": 4.658953065223477, + "grad_norm": NaN, + "learning_rate": 3.769638420729539e-05, + "loss": 0.0, + "step": 49930 + }, + { + "epoch": 4.659046374918354, + "grad_norm": NaN, + "learning_rate": 3.769137017065031e-05, + "loss": 0.0, + "step": 49931 + }, + { + "epoch": 4.659139684613232, + "grad_norm": NaN, + "learning_rate": 3.7686356419572754e-05, + "loss": 0.0, + "step": 49932 + }, + { + "epoch": 4.659232994308109, + "grad_norm": NaN, + "learning_rate": 3.768134295407562e-05, + "loss": 0.0, + "step": 49933 + }, + { + "epoch": 4.6593263040029855, + "grad_norm": NaN, + "learning_rate": 3.76763297741715e-05, + "loss": 0.0, + "step": 49934 + }, + { + "epoch": 4.659419613697863, + "grad_norm": NaN, + "learning_rate": 3.767131687987318e-05, + "loss": 0.0, + "step": 49935 + }, + { + "epoch": 4.65951292339274, + "grad_norm": NaN, + "learning_rate": 3.7666304271193495e-05, + "loss": 0.0, + "step": 49936 + }, + { + "epoch": 4.659606233087618, + "grad_norm": NaN, + "learning_rate": 3.766129194814508e-05, + "loss": 0.0, + "step": 49937 + }, + { + "epoch": 4.659699542782495, + "grad_norm": NaN, + "learning_rate": 3.765627991074069e-05, + "loss": 0.0, + "step": 49938 + }, + { + "epoch": 4.659792852477373, + "grad_norm": NaN, + "learning_rate": 3.765126815899318e-05, + "loss": 0.0, + "step": 49939 + }, + { + "epoch": 4.65988616217225, + "grad_norm": NaN, + "learning_rate": 3.764625669291518e-05, + "loss": 0.0, + "step": 49940 + }, + { + "epoch": 4.659979471867127, + "grad_norm": NaN, + "learning_rate": 3.76412455125194e-05, + "loss": 0.0, + "step": 49941 + }, + { + "epoch": 4.660072781562004, + "grad_norm": NaN, + "learning_rate": 3.763623461781876e-05, + "loss": 0.0, + "step": 49942 + }, + { + "epoch": 4.660166091256881, + "grad_norm": NaN, + "learning_rate": 3.7631224008825845e-05, + "loss": 0.0, + "step": 49943 + }, + { + "epoch": 4.660259400951759, + "grad_norm": NaN, + "learning_rate": 3.762621368555339e-05, + "loss": 0.0, + "step": 49944 + }, + { + "epoch": 4.660352710646636, + "grad_norm": NaN, + "learning_rate": 3.762120364801429e-05, + "loss": 0.0, + "step": 49945 + }, + { + "epoch": 4.660446020341514, + "grad_norm": NaN, + "learning_rate": 3.761619389622112e-05, + "loss": 0.0, + "step": 49946 + }, + { + "epoch": 4.660539330036391, + "grad_norm": NaN, + "learning_rate": 3.761118443018663e-05, + "loss": 0.0, + "step": 49947 + }, + { + "epoch": 4.6606326397312685, + "grad_norm": NaN, + "learning_rate": 3.7606175249923716e-05, + "loss": 0.0, + "step": 49948 + }, + { + "epoch": 4.660725949426145, + "grad_norm": NaN, + "learning_rate": 3.7601166355444944e-05, + "loss": 0.0, + "step": 49949 + }, + { + "epoch": 4.6608192591210225, + "grad_norm": NaN, + "learning_rate": 3.7596157746763064e-05, + "loss": 0.0, + "step": 49950 + }, + { + "epoch": 4.6609125688159, + "grad_norm": NaN, + "learning_rate": 3.759114942389095e-05, + "loss": 0.0, + "step": 49951 + }, + { + "epoch": 4.661005878510777, + "grad_norm": NaN, + "learning_rate": 3.758614138684121e-05, + "loss": 0.0, + "step": 49952 + }, + { + "epoch": 4.661099188205655, + "grad_norm": NaN, + "learning_rate": 3.7581133635626556e-05, + "loss": 0.0, + "step": 49953 + }, + { + "epoch": 4.661192497900532, + "grad_norm": NaN, + "learning_rate": 3.757612617025988e-05, + "loss": 0.0, + "step": 49954 + }, + { + "epoch": 4.661285807595409, + "grad_norm": NaN, + "learning_rate": 3.757111899075371e-05, + "loss": 0.0, + "step": 49955 + }, + { + "epoch": 4.661379117290286, + "grad_norm": NaN, + "learning_rate": 3.756611209712093e-05, + "loss": 0.0, + "step": 49956 + }, + { + "epoch": 4.6614724269851635, + "grad_norm": NaN, + "learning_rate": 3.7561105489374284e-05, + "loss": 0.0, + "step": 49957 + }, + { + "epoch": 4.661565736680041, + "grad_norm": NaN, + "learning_rate": 3.755609916752632e-05, + "loss": 0.0, + "step": 49958 + }, + { + "epoch": 4.661659046374918, + "grad_norm": NaN, + "learning_rate": 3.755109313158996e-05, + "loss": 0.0, + "step": 49959 + }, + { + "epoch": 4.661752356069796, + "grad_norm": NaN, + "learning_rate": 3.754608738157789e-05, + "loss": 0.0, + "step": 49960 + }, + { + "epoch": 4.661845665764673, + "grad_norm": NaN, + "learning_rate": 3.7541081917502714e-05, + "loss": 0.0, + "step": 49961 + }, + { + "epoch": 4.661938975459551, + "grad_norm": NaN, + "learning_rate": 3.753607673937735e-05, + "loss": 0.0, + "step": 49962 + }, + { + "epoch": 4.662032285154428, + "grad_norm": NaN, + "learning_rate": 3.753107184721439e-05, + "loss": 0.0, + "step": 49963 + }, + { + "epoch": 4.6621255948493046, + "grad_norm": NaN, + "learning_rate": 3.752606724102655e-05, + "loss": 0.0, + "step": 49964 + }, + { + "epoch": 4.662218904544182, + "grad_norm": NaN, + "learning_rate": 3.7521062920826686e-05, + "loss": 0.0, + "step": 49965 + }, + { + "epoch": 4.662312214239059, + "grad_norm": NaN, + "learning_rate": 3.751605888662741e-05, + "loss": 0.0, + "step": 49966 + }, + { + "epoch": 4.662405523933937, + "grad_norm": NaN, + "learning_rate": 3.7511055138441414e-05, + "loss": 0.0, + "step": 49967 + }, + { + "epoch": 4.662498833628814, + "grad_norm": NaN, + "learning_rate": 3.750605167628159e-05, + "loss": 0.0, + "step": 49968 + }, + { + "epoch": 4.662592143323692, + "grad_norm": NaN, + "learning_rate": 3.750104850016049e-05, + "loss": 0.0, + "step": 49969 + }, + { + "epoch": 4.662685453018568, + "grad_norm": NaN, + "learning_rate": 3.7496045610090874e-05, + "loss": 0.0, + "step": 49970 + }, + { + "epoch": 4.662778762713446, + "grad_norm": NaN, + "learning_rate": 3.749104300608556e-05, + "loss": 0.0, + "step": 49971 + }, + { + "epoch": 4.662872072408323, + "grad_norm": NaN, + "learning_rate": 3.748604068815716e-05, + "loss": 0.0, + "step": 49972 + }, + { + "epoch": 4.6629653821032, + "grad_norm": NaN, + "learning_rate": 3.748103865631838e-05, + "loss": 0.0, + "step": 49973 + }, + { + "epoch": 4.663058691798078, + "grad_norm": NaN, + "learning_rate": 3.747603691058208e-05, + "loss": 0.0, + "step": 49974 + }, + { + "epoch": 4.663152001492955, + "grad_norm": NaN, + "learning_rate": 3.747103545096084e-05, + "loss": 0.0, + "step": 49975 + }, + { + "epoch": 4.663245311187833, + "grad_norm": NaN, + "learning_rate": 3.746603427746739e-05, + "loss": 0.0, + "step": 49976 + }, + { + "epoch": 4.66333862088271, + "grad_norm": NaN, + "learning_rate": 3.7461033390114545e-05, + "loss": 0.0, + "step": 49977 + }, + { + "epoch": 4.663431930577587, + "grad_norm": NaN, + "learning_rate": 3.745603278891494e-05, + "loss": 0.0, + "step": 49978 + }, + { + "epoch": 4.663525240272464, + "grad_norm": NaN, + "learning_rate": 3.745103247388124e-05, + "loss": 0.0, + "step": 49979 + }, + { + "epoch": 4.6636185499673415, + "grad_norm": NaN, + "learning_rate": 3.744603244502633e-05, + "loss": 0.0, + "step": 49980 + }, + { + "epoch": 4.663711859662219, + "grad_norm": NaN, + "learning_rate": 3.744103270236276e-05, + "loss": 0.0, + "step": 49981 + }, + { + "epoch": 4.663805169357096, + "grad_norm": NaN, + "learning_rate": 3.7436033245903255e-05, + "loss": 0.0, + "step": 49982 + }, + { + "epoch": 4.663898479051974, + "grad_norm": NaN, + "learning_rate": 3.743103407566067e-05, + "loss": 0.0, + "step": 49983 + }, + { + "epoch": 4.663991788746851, + "grad_norm": NaN, + "learning_rate": 3.7426035191647574e-05, + "loss": 0.0, + "step": 49984 + }, + { + "epoch": 4.664085098441728, + "grad_norm": NaN, + "learning_rate": 3.742103659387669e-05, + "loss": 0.0, + "step": 49985 + }, + { + "epoch": 4.664178408136605, + "grad_norm": NaN, + "learning_rate": 3.7416038282360824e-05, + "loss": 0.0, + "step": 49986 + }, + { + "epoch": 4.6642717178314825, + "grad_norm": NaN, + "learning_rate": 3.74110402571126e-05, + "loss": 0.0, + "step": 49987 + }, + { + "epoch": 4.66436502752636, + "grad_norm": NaN, + "learning_rate": 3.74060425181447e-05, + "loss": 0.0, + "step": 49988 + }, + { + "epoch": 4.664458337221237, + "grad_norm": NaN, + "learning_rate": 3.740104506546997e-05, + "loss": 0.0, + "step": 49989 + }, + { + "epoch": 4.664551646916115, + "grad_norm": NaN, + "learning_rate": 3.7396047899100985e-05, + "loss": 0.0, + "step": 49990 + }, + { + "epoch": 4.664644956610992, + "grad_norm": NaN, + "learning_rate": 3.739105101905045e-05, + "loss": 0.0, + "step": 49991 + }, + { + "epoch": 4.66473826630587, + "grad_norm": NaN, + "learning_rate": 3.738605442533121e-05, + "loss": 0.0, + "step": 49992 + }, + { + "epoch": 4.664831576000746, + "grad_norm": NaN, + "learning_rate": 3.738105811795578e-05, + "loss": 0.0, + "step": 49993 + }, + { + "epoch": 4.664924885695624, + "grad_norm": NaN, + "learning_rate": 3.737606209693699e-05, + "loss": 0.0, + "step": 49994 + }, + { + "epoch": 4.665018195390501, + "grad_norm": NaN, + "learning_rate": 3.7371066362287564e-05, + "loss": 0.0, + "step": 49995 + }, + { + "epoch": 4.665111505085378, + "grad_norm": NaN, + "learning_rate": 3.736607091402007e-05, + "loss": 0.0, + "step": 49996 + }, + { + "epoch": 4.665204814780256, + "grad_norm": NaN, + "learning_rate": 3.736107575214734e-05, + "loss": 0.0, + "step": 49997 + }, + { + "epoch": 4.665298124475133, + "grad_norm": NaN, + "learning_rate": 3.7356080876682073e-05, + "loss": 0.0, + "step": 49998 + }, + { + "epoch": 4.66539143417001, + "grad_norm": NaN, + "learning_rate": 3.735108628763684e-05, + "loss": 0.0, + "step": 49999 + }, + { + "epoch": 4.665484743864887, + "grad_norm": NaN, + "learning_rate": 3.7346091985024454e-05, + "loss": 0.0, + "step": 50000 + }, + { + "epoch": 4.665578053559765, + "grad_norm": NaN, + "learning_rate": 3.734109796885763e-05, + "loss": 0.0, + "step": 50001 + }, + { + "epoch": 4.665671363254642, + "grad_norm": NaN, + "learning_rate": 3.733610423914895e-05, + "loss": 0.0, + "step": 50002 + }, + { + "epoch": 4.6657646729495195, + "grad_norm": NaN, + "learning_rate": 3.7331110795911215e-05, + "loss": 0.0, + "step": 50003 + }, + { + "epoch": 4.665857982644397, + "grad_norm": NaN, + "learning_rate": 3.732611763915714e-05, + "loss": 0.0, + "step": 50004 + }, + { + "epoch": 4.665951292339274, + "grad_norm": NaN, + "learning_rate": 3.7321124768899284e-05, + "loss": 0.0, + "step": 50005 + }, + { + "epoch": 4.666044602034152, + "grad_norm": NaN, + "learning_rate": 3.7316132185150514e-05, + "loss": 0.0, + "step": 50006 + }, + { + "epoch": 4.666137911729029, + "grad_norm": NaN, + "learning_rate": 3.7311139887923404e-05, + "loss": 0.0, + "step": 50007 + }, + { + "epoch": 4.666231221423906, + "grad_norm": NaN, + "learning_rate": 3.730614787723062e-05, + "loss": 0.0, + "step": 50008 + }, + { + "epoch": 4.666324531118783, + "grad_norm": NaN, + "learning_rate": 3.7301156153085024e-05, + "loss": 0.0, + "step": 50009 + }, + { + "epoch": 4.6664178408136605, + "grad_norm": NaN, + "learning_rate": 3.7296164715499165e-05, + "loss": 0.0, + "step": 50010 + }, + { + "epoch": 4.666511150508538, + "grad_norm": NaN, + "learning_rate": 3.729117356448571e-05, + "loss": 0.0, + "step": 50011 + }, + { + "epoch": 4.666604460203415, + "grad_norm": NaN, + "learning_rate": 3.7286182700057525e-05, + "loss": 0.0, + "step": 50012 + }, + { + "epoch": 4.666697769898293, + "grad_norm": NaN, + "learning_rate": 3.7281192122227135e-05, + "loss": 0.0, + "step": 50013 + }, + { + "epoch": 4.666791079593169, + "grad_norm": NaN, + "learning_rate": 3.7276201831007236e-05, + "loss": 0.0, + "step": 50014 + }, + { + "epoch": 4.666884389288047, + "grad_norm": NaN, + "learning_rate": 3.7271211826410656e-05, + "loss": 0.0, + "step": 50015 + }, + { + "epoch": 4.666977698982924, + "grad_norm": NaN, + "learning_rate": 3.7266222108449944e-05, + "loss": 0.0, + "step": 50016 + }, + { + "epoch": 4.667071008677802, + "grad_norm": NaN, + "learning_rate": 3.7261232677137785e-05, + "loss": 0.0, + "step": 50017 + }, + { + "epoch": 4.667164318372679, + "grad_norm": NaN, + "learning_rate": 3.725624353248702e-05, + "loss": 0.0, + "step": 50018 + }, + { + "epoch": 4.667257628067556, + "grad_norm": NaN, + "learning_rate": 3.725125467451018e-05, + "loss": 0.0, + "step": 50019 + }, + { + "epoch": 4.667350937762434, + "grad_norm": NaN, + "learning_rate": 3.724626610321996e-05, + "loss": 0.0, + "step": 50020 + }, + { + "epoch": 4.667444247457311, + "grad_norm": NaN, + "learning_rate": 3.724127781862917e-05, + "loss": 0.0, + "step": 50021 + }, + { + "epoch": 4.667537557152188, + "grad_norm": NaN, + "learning_rate": 3.7236289820750366e-05, + "loss": 0.0, + "step": 50022 + }, + { + "epoch": 4.667630866847065, + "grad_norm": NaN, + "learning_rate": 3.723130210959622e-05, + "loss": 0.0, + "step": 50023 + }, + { + "epoch": 4.667724176541943, + "grad_norm": NaN, + "learning_rate": 3.7226314685179565e-05, + "loss": 0.0, + "step": 50024 + }, + { + "epoch": 4.66781748623682, + "grad_norm": NaN, + "learning_rate": 3.7221327547512945e-05, + "loss": 0.0, + "step": 50025 + }, + { + "epoch": 4.667910795931697, + "grad_norm": NaN, + "learning_rate": 3.7216340696609024e-05, + "loss": 0.0, + "step": 50026 + }, + { + "epoch": 4.668004105626575, + "grad_norm": NaN, + "learning_rate": 3.7211354132480644e-05, + "loss": 0.0, + "step": 50027 + }, + { + "epoch": 4.668097415321452, + "grad_norm": NaN, + "learning_rate": 3.7206367855140286e-05, + "loss": 0.0, + "step": 50028 + }, + { + "epoch": 4.668190725016329, + "grad_norm": NaN, + "learning_rate": 3.720138186460076e-05, + "loss": 0.0, + "step": 50029 + }, + { + "epoch": 4.668284034711206, + "grad_norm": NaN, + "learning_rate": 3.7196396160874756e-05, + "loss": 0.0, + "step": 50030 + }, + { + "epoch": 4.668377344406084, + "grad_norm": NaN, + "learning_rate": 3.719141074397482e-05, + "loss": 0.0, + "step": 50031 + }, + { + "epoch": 4.668470654100961, + "grad_norm": NaN, + "learning_rate": 3.7186425613913734e-05, + "loss": 0.0, + "step": 50032 + }, + { + "epoch": 4.6685639637958385, + "grad_norm": NaN, + "learning_rate": 3.7181440770704205e-05, + "loss": 0.0, + "step": 50033 + }, + { + "epoch": 4.668657273490716, + "grad_norm": NaN, + "learning_rate": 3.717645621435878e-05, + "loss": 0.0, + "step": 50034 + }, + { + "epoch": 4.668750583185593, + "grad_norm": NaN, + "learning_rate": 3.717147194489024e-05, + "loss": 0.0, + "step": 50035 + }, + { + "epoch": 4.668843892880471, + "grad_norm": NaN, + "learning_rate": 3.716648796231127e-05, + "loss": 0.0, + "step": 50036 + }, + { + "epoch": 4.668937202575347, + "grad_norm": NaN, + "learning_rate": 3.716150426663439e-05, + "loss": 0.0, + "step": 50037 + }, + { + "epoch": 4.669030512270225, + "grad_norm": NaN, + "learning_rate": 3.715652085787244e-05, + "loss": 0.0, + "step": 50038 + }, + { + "epoch": 4.669123821965102, + "grad_norm": NaN, + "learning_rate": 3.7151537736038075e-05, + "loss": 0.0, + "step": 50039 + }, + { + "epoch": 4.6692171316599795, + "grad_norm": NaN, + "learning_rate": 3.7146554901143826e-05, + "loss": 0.0, + "step": 50040 + }, + { + "epoch": 4.669310441354857, + "grad_norm": NaN, + "learning_rate": 3.714157235320251e-05, + "loss": 0.0, + "step": 50041 + }, + { + "epoch": 4.669403751049734, + "grad_norm": NaN, + "learning_rate": 3.713659009222679e-05, + "loss": 0.0, + "step": 50042 + }, + { + "epoch": 4.669497060744611, + "grad_norm": NaN, + "learning_rate": 3.71316081182292e-05, + "loss": 0.0, + "step": 50043 + }, + { + "epoch": 4.669590370439488, + "grad_norm": NaN, + "learning_rate": 3.712662643122254e-05, + "loss": 0.0, + "step": 50044 + }, + { + "epoch": 4.669683680134366, + "grad_norm": NaN, + "learning_rate": 3.712164503121946e-05, + "loss": 0.0, + "step": 50045 + }, + { + "epoch": 4.669776989829243, + "grad_norm": NaN, + "learning_rate": 3.711666391823254e-05, + "loss": 0.0, + "step": 50046 + }, + { + "epoch": 4.669870299524121, + "grad_norm": NaN, + "learning_rate": 3.711168309227454e-05, + "loss": 0.0, + "step": 50047 + }, + { + "epoch": 4.669963609218998, + "grad_norm": NaN, + "learning_rate": 3.710670255335812e-05, + "loss": 0.0, + "step": 50048 + }, + { + "epoch": 4.670056918913875, + "grad_norm": NaN, + "learning_rate": 3.7101722301495846e-05, + "loss": 0.0, + "step": 50049 + }, + { + "epoch": 4.670150228608753, + "grad_norm": NaN, + "learning_rate": 3.709674233670048e-05, + "loss": 0.0, + "step": 50050 + }, + { + "epoch": 4.670243538303629, + "grad_norm": NaN, + "learning_rate": 3.70917626589847e-05, + "loss": 0.0, + "step": 50051 + }, + { + "epoch": 4.670336847998507, + "grad_norm": NaN, + "learning_rate": 3.7086783268361036e-05, + "loss": 0.0, + "step": 50052 + }, + { + "epoch": 4.670430157693384, + "grad_norm": NaN, + "learning_rate": 3.7081804164842314e-05, + "loss": 0.0, + "step": 50053 + }, + { + "epoch": 4.670523467388262, + "grad_norm": NaN, + "learning_rate": 3.7076825348441076e-05, + "loss": 0.0, + "step": 50054 + }, + { + "epoch": 4.670616777083139, + "grad_norm": NaN, + "learning_rate": 3.707184681916999e-05, + "loss": 0.0, + "step": 50055 + }, + { + "epoch": 4.6707100867780165, + "grad_norm": NaN, + "learning_rate": 3.706686857704183e-05, + "loss": 0.0, + "step": 50056 + }, + { + "epoch": 4.670803396472894, + "grad_norm": NaN, + "learning_rate": 3.706189062206911e-05, + "loss": 0.0, + "step": 50057 + }, + { + "epoch": 4.67089670616777, + "grad_norm": NaN, + "learning_rate": 3.705691295426451e-05, + "loss": 0.0, + "step": 50058 + }, + { + "epoch": 4.670990015862648, + "grad_norm": NaN, + "learning_rate": 3.705193557364081e-05, + "loss": 0.0, + "step": 50059 + }, + { + "epoch": 4.671083325557525, + "grad_norm": NaN, + "learning_rate": 3.7046958480210535e-05, + "loss": 0.0, + "step": 50060 + }, + { + "epoch": 4.671176635252403, + "grad_norm": NaN, + "learning_rate": 3.7041981673986356e-05, + "loss": 0.0, + "step": 50061 + }, + { + "epoch": 4.67126994494728, + "grad_norm": NaN, + "learning_rate": 3.7037005154981034e-05, + "loss": 0.0, + "step": 50062 + }, + { + "epoch": 4.6713632546421575, + "grad_norm": NaN, + "learning_rate": 3.7032028923207105e-05, + "loss": 0.0, + "step": 50063 + }, + { + "epoch": 4.671456564337035, + "grad_norm": NaN, + "learning_rate": 3.702705297867722e-05, + "loss": 0.0, + "step": 50064 + }, + { + "epoch": 4.671549874031912, + "grad_norm": NaN, + "learning_rate": 3.702207732140416e-05, + "loss": 0.0, + "step": 50065 + }, + { + "epoch": 4.671643183726789, + "grad_norm": NaN, + "learning_rate": 3.70171019514004e-05, + "loss": 0.0, + "step": 50066 + }, + { + "epoch": 4.671736493421666, + "grad_norm": NaN, + "learning_rate": 3.701212686867873e-05, + "loss": 0.0, + "step": 50067 + }, + { + "epoch": 4.671829803116544, + "grad_norm": NaN, + "learning_rate": 3.700715207325179e-05, + "loss": 0.0, + "step": 50068 + }, + { + "epoch": 4.671923112811421, + "grad_norm": NaN, + "learning_rate": 3.7002177565132115e-05, + "loss": 0.0, + "step": 50069 + }, + { + "epoch": 4.672016422506299, + "grad_norm": NaN, + "learning_rate": 3.699720334433245e-05, + "loss": 0.0, + "step": 50070 + }, + { + "epoch": 4.672109732201176, + "grad_norm": NaN, + "learning_rate": 3.699222941086549e-05, + "loss": 0.0, + "step": 50071 + }, + { + "epoch": 4.6722030418960525, + "grad_norm": NaN, + "learning_rate": 3.698725576474373e-05, + "loss": 0.0, + "step": 50072 + }, + { + "epoch": 4.67229635159093, + "grad_norm": NaN, + "learning_rate": 3.6982282405979935e-05, + "loss": 0.0, + "step": 50073 + }, + { + "epoch": 4.672389661285807, + "grad_norm": NaN, + "learning_rate": 3.697730933458676e-05, + "loss": 0.0, + "step": 50074 + }, + { + "epoch": 4.672482970980685, + "grad_norm": NaN, + "learning_rate": 3.6972336550576725e-05, + "loss": 0.0, + "step": 50075 + }, + { + "epoch": 4.672576280675562, + "grad_norm": NaN, + "learning_rate": 3.6967364053962595e-05, + "loss": 0.0, + "step": 50076 + }, + { + "epoch": 4.67266959037044, + "grad_norm": NaN, + "learning_rate": 3.696239184475703e-05, + "loss": 0.0, + "step": 50077 + }, + { + "epoch": 4.672762900065317, + "grad_norm": NaN, + "learning_rate": 3.695741992297251e-05, + "loss": 0.0, + "step": 50078 + }, + { + "epoch": 4.6728562097601944, + "grad_norm": NaN, + "learning_rate": 3.695244828862184e-05, + "loss": 0.0, + "step": 50079 + }, + { + "epoch": 4.672949519455072, + "grad_norm": NaN, + "learning_rate": 3.694747694171766e-05, + "loss": 0.0, + "step": 50080 + }, + { + "epoch": 4.673042829149948, + "grad_norm": NaN, + "learning_rate": 3.694250588227246e-05, + "loss": 0.0, + "step": 50081 + }, + { + "epoch": 4.673136138844826, + "grad_norm": NaN, + "learning_rate": 3.693753511029903e-05, + "loss": 0.0, + "step": 50082 + }, + { + "epoch": 4.673229448539703, + "grad_norm": NaN, + "learning_rate": 3.693256462581e-05, + "loss": 0.0, + "step": 50083 + }, + { + "epoch": 4.673322758234581, + "grad_norm": NaN, + "learning_rate": 3.6927594428817876e-05, + "loss": 0.0, + "step": 50084 + }, + { + "epoch": 4.673416067929458, + "grad_norm": NaN, + "learning_rate": 3.692262451933542e-05, + "loss": 0.0, + "step": 50085 + }, + { + "epoch": 4.6735093776243355, + "grad_norm": NaN, + "learning_rate": 3.691765489737529e-05, + "loss": 0.0, + "step": 50086 + }, + { + "epoch": 4.673602687319212, + "grad_norm": NaN, + "learning_rate": 3.6912685562949964e-05, + "loss": 0.0, + "step": 50087 + }, + { + "epoch": 4.673695997014089, + "grad_norm": NaN, + "learning_rate": 3.6907716516072234e-05, + "loss": 0.0, + "step": 50088 + }, + { + "epoch": 4.673789306708967, + "grad_norm": NaN, + "learning_rate": 3.690274775675472e-05, + "loss": 0.0, + "step": 50089 + }, + { + "epoch": 4.673882616403844, + "grad_norm": NaN, + "learning_rate": 3.689777928500993e-05, + "loss": 0.0, + "step": 50090 + }, + { + "epoch": 4.673975926098722, + "grad_norm": NaN, + "learning_rate": 3.6892811100850626e-05, + "loss": 0.0, + "step": 50091 + }, + { + "epoch": 4.674069235793599, + "grad_norm": NaN, + "learning_rate": 3.688784320428945e-05, + "loss": 0.0, + "step": 50092 + }, + { + "epoch": 4.6741625454884765, + "grad_norm": NaN, + "learning_rate": 3.688287559533888e-05, + "loss": 0.0, + "step": 50093 + }, + { + "epoch": 4.674255855183354, + "grad_norm": NaN, + "learning_rate": 3.687790827401171e-05, + "loss": 0.0, + "step": 50094 + }, + { + "epoch": 4.6743491648782305, + "grad_norm": NaN, + "learning_rate": 3.687294124032055e-05, + "loss": 0.0, + "step": 50095 + }, + { + "epoch": 4.674442474573108, + "grad_norm": NaN, + "learning_rate": 3.6867974494277886e-05, + "loss": 0.0, + "step": 50096 + }, + { + "epoch": 4.674535784267985, + "grad_norm": NaN, + "learning_rate": 3.686300803589655e-05, + "loss": 0.0, + "step": 50097 + }, + { + "epoch": 4.674629093962863, + "grad_norm": NaN, + "learning_rate": 3.6858041865189035e-05, + "loss": 0.0, + "step": 50098 + }, + { + "epoch": 4.67472240365774, + "grad_norm": NaN, + "learning_rate": 3.6853075982167946e-05, + "loss": 0.0, + "step": 50099 + }, + { + "epoch": 4.674815713352618, + "grad_norm": NaN, + "learning_rate": 3.684811038684607e-05, + "loss": 0.0, + "step": 50100 + }, + { + "epoch": 4.674909023047495, + "grad_norm": NaN, + "learning_rate": 3.684314507923588e-05, + "loss": 0.0, + "step": 50101 + }, + { + "epoch": 4.6750023327423715, + "grad_norm": NaN, + "learning_rate": 3.683818005935e-05, + "loss": 0.0, + "step": 50102 + }, + { + "epoch": 4.675095642437249, + "grad_norm": NaN, + "learning_rate": 3.683321532720121e-05, + "loss": 0.0, + "step": 50103 + }, + { + "epoch": 4.675188952132126, + "grad_norm": NaN, + "learning_rate": 3.682825088280194e-05, + "loss": 0.0, + "step": 50104 + }, + { + "epoch": 4.675282261827004, + "grad_norm": NaN, + "learning_rate": 3.682328672616495e-05, + "loss": 0.0, + "step": 50105 + }, + { + "epoch": 4.675375571521881, + "grad_norm": NaN, + "learning_rate": 3.681832285730285e-05, + "loss": 0.0, + "step": 50106 + }, + { + "epoch": 4.675468881216759, + "grad_norm": NaN, + "learning_rate": 3.681335927622814e-05, + "loss": 0.0, + "step": 50107 + }, + { + "epoch": 4.675562190911636, + "grad_norm": NaN, + "learning_rate": 3.6808395982953584e-05, + "loss": 0.0, + "step": 50108 + }, + { + "epoch": 4.6756555006065135, + "grad_norm": NaN, + "learning_rate": 3.6803432977491784e-05, + "loss": 0.0, + "step": 50109 + }, + { + "epoch": 4.67574881030139, + "grad_norm": NaN, + "learning_rate": 3.6798470259855236e-05, + "loss": 0.0, + "step": 50110 + }, + { + "epoch": 4.675842119996267, + "grad_norm": NaN, + "learning_rate": 3.67935078300567e-05, + "loss": 0.0, + "step": 50111 + }, + { + "epoch": 4.675935429691145, + "grad_norm": NaN, + "learning_rate": 3.678854568810876e-05, + "loss": 0.0, + "step": 50112 + }, + { + "epoch": 4.676028739386022, + "grad_norm": NaN, + "learning_rate": 3.6783583834023936e-05, + "loss": 0.0, + "step": 50113 + }, + { + "epoch": 4.6761220490809, + "grad_norm": NaN, + "learning_rate": 3.677862226781497e-05, + "loss": 0.0, + "step": 50114 + }, + { + "epoch": 4.676215358775777, + "grad_norm": NaN, + "learning_rate": 3.677366098949447e-05, + "loss": 0.0, + "step": 50115 + }, + { + "epoch": 4.676308668470654, + "grad_norm": NaN, + "learning_rate": 3.676869999907494e-05, + "loss": 0.0, + "step": 50116 + }, + { + "epoch": 4.676401978165531, + "grad_norm": NaN, + "learning_rate": 3.676373929656908e-05, + "loss": 0.0, + "step": 50117 + }, + { + "epoch": 4.6764952878604085, + "grad_norm": NaN, + "learning_rate": 3.6758778881989534e-05, + "loss": 0.0, + "step": 50118 + }, + { + "epoch": 4.676588597555286, + "grad_norm": NaN, + "learning_rate": 3.67538187553488e-05, + "loss": 0.0, + "step": 50119 + }, + { + "epoch": 4.676681907250163, + "grad_norm": NaN, + "learning_rate": 3.67488589166596e-05, + "loss": 0.0, + "step": 50120 + }, + { + "epoch": 4.676775216945041, + "grad_norm": NaN, + "learning_rate": 3.6743899365934526e-05, + "loss": 0.0, + "step": 50121 + }, + { + "epoch": 4.676868526639918, + "grad_norm": NaN, + "learning_rate": 3.67389401031861e-05, + "loss": 0.0, + "step": 50122 + }, + { + "epoch": 4.676961836334796, + "grad_norm": NaN, + "learning_rate": 3.673398112842704e-05, + "loss": 0.0, + "step": 50123 + }, + { + "epoch": 4.677055146029672, + "grad_norm": NaN, + "learning_rate": 3.672902244166995e-05, + "loss": 0.0, + "step": 50124 + }, + { + "epoch": 4.6771484557245495, + "grad_norm": NaN, + "learning_rate": 3.672406404292732e-05, + "loss": 0.0, + "step": 50125 + }, + { + "epoch": 4.677241765419427, + "grad_norm": NaN, + "learning_rate": 3.671910593221188e-05, + "loss": 0.0, + "step": 50126 + }, + { + "epoch": 4.677335075114304, + "grad_norm": NaN, + "learning_rate": 3.671414810953625e-05, + "loss": 0.0, + "step": 50127 + }, + { + "epoch": 4.677428384809182, + "grad_norm": NaN, + "learning_rate": 3.6709190574912876e-05, + "loss": 0.0, + "step": 50128 + }, + { + "epoch": 4.677521694504059, + "grad_norm": NaN, + "learning_rate": 3.6704233328354535e-05, + "loss": 0.0, + "step": 50129 + }, + { + "epoch": 4.677615004198937, + "grad_norm": NaN, + "learning_rate": 3.66992763698738e-05, + "loss": 0.0, + "step": 50130 + }, + { + "epoch": 4.677708313893813, + "grad_norm": NaN, + "learning_rate": 3.6694319699483166e-05, + "loss": 0.0, + "step": 50131 + }, + { + "epoch": 4.677801623588691, + "grad_norm": NaN, + "learning_rate": 3.668936331719536e-05, + "loss": 0.0, + "step": 50132 + }, + { + "epoch": 4.677894933283568, + "grad_norm": NaN, + "learning_rate": 3.6684407223022965e-05, + "loss": 0.0, + "step": 50133 + }, + { + "epoch": 4.677988242978445, + "grad_norm": NaN, + "learning_rate": 3.6679451416978476e-05, + "loss": 0.0, + "step": 50134 + }, + { + "epoch": 4.678081552673323, + "grad_norm": NaN, + "learning_rate": 3.667449589907462e-05, + "loss": 0.0, + "step": 50135 + }, + { + "epoch": 4.6781748623682, + "grad_norm": NaN, + "learning_rate": 3.666954066932399e-05, + "loss": 0.0, + "step": 50136 + }, + { + "epoch": 4.678268172063078, + "grad_norm": NaN, + "learning_rate": 3.666458572773908e-05, + "loss": 0.0, + "step": 50137 + }, + { + "epoch": 4.678361481757955, + "grad_norm": NaN, + "learning_rate": 3.665963107433257e-05, + "loss": 0.0, + "step": 50138 + }, + { + "epoch": 4.678454791452832, + "grad_norm": NaN, + "learning_rate": 3.6654676709117106e-05, + "loss": 0.0, + "step": 50139 + }, + { + "epoch": 4.678548101147709, + "grad_norm": NaN, + "learning_rate": 3.6649722632105136e-05, + "loss": 0.0, + "step": 50140 + }, + { + "epoch": 4.6786414108425864, + "grad_norm": NaN, + "learning_rate": 3.6644768843309436e-05, + "loss": 0.0, + "step": 50141 + }, + { + "epoch": 4.678734720537464, + "grad_norm": NaN, + "learning_rate": 3.6639815342742404e-05, + "loss": 0.0, + "step": 50142 + }, + { + "epoch": 4.678828030232341, + "grad_norm": NaN, + "learning_rate": 3.663486213041681e-05, + "loss": 0.0, + "step": 50143 + }, + { + "epoch": 4.678921339927219, + "grad_norm": NaN, + "learning_rate": 3.66299092063452e-05, + "loss": 0.0, + "step": 50144 + }, + { + "epoch": 4.679014649622096, + "grad_norm": NaN, + "learning_rate": 3.6624956570540084e-05, + "loss": 0.0, + "step": 50145 + }, + { + "epoch": 4.679107959316973, + "grad_norm": NaN, + "learning_rate": 3.662000422301415e-05, + "loss": 0.0, + "step": 50146 + }, + { + "epoch": 4.67920126901185, + "grad_norm": NaN, + "learning_rate": 3.661505216377999e-05, + "loss": 0.0, + "step": 50147 + }, + { + "epoch": 4.6792945787067275, + "grad_norm": NaN, + "learning_rate": 3.66101003928501e-05, + "loss": 0.0, + "step": 50148 + }, + { + "epoch": 4.679387888401605, + "grad_norm": NaN, + "learning_rate": 3.660514891023717e-05, + "loss": 0.0, + "step": 50149 + }, + { + "epoch": 4.679481198096482, + "grad_norm": NaN, + "learning_rate": 3.660019771595381e-05, + "loss": 0.0, + "step": 50150 + }, + { + "epoch": 4.67957450779136, + "grad_norm": NaN, + "learning_rate": 3.6595246810012455e-05, + "loss": 0.0, + "step": 50151 + }, + { + "epoch": 4.679667817486237, + "grad_norm": NaN, + "learning_rate": 3.659029619242583e-05, + "loss": 0.0, + "step": 50152 + }, + { + "epoch": 4.679761127181115, + "grad_norm": NaN, + "learning_rate": 3.6585345863206553e-05, + "loss": 0.0, + "step": 50153 + }, + { + "epoch": 4.679854436875991, + "grad_norm": NaN, + "learning_rate": 3.658039582236705e-05, + "loss": 0.0, + "step": 50154 + }, + { + "epoch": 4.6799477465708685, + "grad_norm": NaN, + "learning_rate": 3.657544606992005e-05, + "loss": 0.0, + "step": 50155 + }, + { + "epoch": 4.680041056265746, + "grad_norm": NaN, + "learning_rate": 3.6570496605878135e-05, + "loss": 0.0, + "step": 50156 + }, + { + "epoch": 4.680134365960623, + "grad_norm": NaN, + "learning_rate": 3.656554743025374e-05, + "loss": 0.0, + "step": 50157 + }, + { + "epoch": 4.680227675655501, + "grad_norm": NaN, + "learning_rate": 3.6560598543059634e-05, + "loss": 0.0, + "step": 50158 + }, + { + "epoch": 4.680320985350378, + "grad_norm": NaN, + "learning_rate": 3.655564994430834e-05, + "loss": 0.0, + "step": 50159 + }, + { + "epoch": 4.680414295045255, + "grad_norm": NaN, + "learning_rate": 3.655070163401234e-05, + "loss": 0.0, + "step": 50160 + }, + { + "epoch": 4.680507604740132, + "grad_norm": NaN, + "learning_rate": 3.654575361218436e-05, + "loss": 0.0, + "step": 50161 + }, + { + "epoch": 4.68060091443501, + "grad_norm": NaN, + "learning_rate": 3.654080587883695e-05, + "loss": 0.0, + "step": 50162 + }, + { + "epoch": 4.680694224129887, + "grad_norm": NaN, + "learning_rate": 3.653585843398257e-05, + "loss": 0.0, + "step": 50163 + }, + { + "epoch": 4.680787533824764, + "grad_norm": NaN, + "learning_rate": 3.653091127763394e-05, + "loss": 0.0, + "step": 50164 + }, + { + "epoch": 4.680880843519642, + "grad_norm": NaN, + "learning_rate": 3.652596440980364e-05, + "loss": 0.0, + "step": 50165 + }, + { + "epoch": 4.680974153214519, + "grad_norm": NaN, + "learning_rate": 3.65210178305041e-05, + "loss": 0.0, + "step": 50166 + }, + { + "epoch": 4.681067462909397, + "grad_norm": NaN, + "learning_rate": 3.651607153974806e-05, + "loss": 0.0, + "step": 50167 + }, + { + "epoch": 4.681160772604273, + "grad_norm": NaN, + "learning_rate": 3.651112553754807e-05, + "loss": 0.0, + "step": 50168 + }, + { + "epoch": 4.681254082299151, + "grad_norm": NaN, + "learning_rate": 3.650617982391657e-05, + "loss": 0.0, + "step": 50169 + }, + { + "epoch": 4.681347391994028, + "grad_norm": NaN, + "learning_rate": 3.650123439886628e-05, + "loss": 0.0, + "step": 50170 + }, + { + "epoch": 4.6814407016889055, + "grad_norm": NaN, + "learning_rate": 3.649628926240979e-05, + "loss": 0.0, + "step": 50171 + }, + { + "epoch": 4.681534011383783, + "grad_norm": NaN, + "learning_rate": 3.649134441455952e-05, + "loss": 0.0, + "step": 50172 + }, + { + "epoch": 4.68162732107866, + "grad_norm": NaN, + "learning_rate": 3.648639985532818e-05, + "loss": 0.0, + "step": 50173 + }, + { + "epoch": 4.681720630773538, + "grad_norm": NaN, + "learning_rate": 3.648145558472834e-05, + "loss": 0.0, + "step": 50174 + }, + { + "epoch": 4.681813940468414, + "grad_norm": NaN, + "learning_rate": 3.647651160277244e-05, + "loss": 0.0, + "step": 50175 + }, + { + "epoch": 4.681907250163292, + "grad_norm": NaN, + "learning_rate": 3.6471567909473195e-05, + "loss": 0.0, + "step": 50176 + }, + { + "epoch": 4.682000559858169, + "grad_norm": NaN, + "learning_rate": 3.646662450484317e-05, + "loss": 0.0, + "step": 50177 + }, + { + "epoch": 4.6820938695530465, + "grad_norm": NaN, + "learning_rate": 3.646168138889479e-05, + "loss": 0.0, + "step": 50178 + }, + { + "epoch": 4.682187179247924, + "grad_norm": NaN, + "learning_rate": 3.6456738561640764e-05, + "loss": 0.0, + "step": 50179 + }, + { + "epoch": 4.682280488942801, + "grad_norm": NaN, + "learning_rate": 3.6451796023093626e-05, + "loss": 0.0, + "step": 50180 + }, + { + "epoch": 4.682373798637679, + "grad_norm": NaN, + "learning_rate": 3.6446853773265935e-05, + "loss": 0.0, + "step": 50181 + }, + { + "epoch": 4.682467108332556, + "grad_norm": NaN, + "learning_rate": 3.6441911812170255e-05, + "loss": 0.0, + "step": 50182 + }, + { + "epoch": 4.682560418027433, + "grad_norm": NaN, + "learning_rate": 3.643697013981917e-05, + "loss": 0.0, + "step": 50183 + }, + { + "epoch": 4.68265372772231, + "grad_norm": NaN, + "learning_rate": 3.643202875622521e-05, + "loss": 0.0, + "step": 50184 + }, + { + "epoch": 4.682747037417188, + "grad_norm": NaN, + "learning_rate": 3.642708766140097e-05, + "loss": 0.0, + "step": 50185 + }, + { + "epoch": 4.682840347112065, + "grad_norm": NaN, + "learning_rate": 3.6422146855359e-05, + "loss": 0.0, + "step": 50186 + }, + { + "epoch": 4.682933656806942, + "grad_norm": NaN, + "learning_rate": 3.641720633811186e-05, + "loss": 0.0, + "step": 50187 + }, + { + "epoch": 4.68302696650182, + "grad_norm": NaN, + "learning_rate": 3.641226610967218e-05, + "loss": 0.0, + "step": 50188 + }, + { + "epoch": 4.683120276196696, + "grad_norm": NaN, + "learning_rate": 3.640732617005235e-05, + "loss": 0.0, + "step": 50189 + }, + { + "epoch": 4.683213585891574, + "grad_norm": NaN, + "learning_rate": 3.640238651926509e-05, + "loss": 0.0, + "step": 50190 + }, + { + "epoch": 4.683306895586451, + "grad_norm": NaN, + "learning_rate": 3.6397447157322967e-05, + "loss": 0.0, + "step": 50191 + }, + { + "epoch": 4.683400205281329, + "grad_norm": NaN, + "learning_rate": 3.63925080842384e-05, + "loss": 0.0, + "step": 50192 + }, + { + "epoch": 4.683493514976206, + "grad_norm": NaN, + "learning_rate": 3.6387569300024056e-05, + "loss": 0.0, + "step": 50193 + }, + { + "epoch": 4.6835868246710834, + "grad_norm": NaN, + "learning_rate": 3.638263080469253e-05, + "loss": 0.0, + "step": 50194 + }, + { + "epoch": 4.683680134365961, + "grad_norm": NaN, + "learning_rate": 3.637769259825621e-05, + "loss": 0.0, + "step": 50195 + }, + { + "epoch": 4.683773444060838, + "grad_norm": NaN, + "learning_rate": 3.6372754680727826e-05, + "loss": 0.0, + "step": 50196 + }, + { + "epoch": 4.683866753755716, + "grad_norm": NaN, + "learning_rate": 3.6367817052119896e-05, + "loss": 0.0, + "step": 50197 + }, + { + "epoch": 4.683960063450592, + "grad_norm": NaN, + "learning_rate": 3.636287971244487e-05, + "loss": 0.0, + "step": 50198 + }, + { + "epoch": 4.68405337314547, + "grad_norm": NaN, + "learning_rate": 3.635794266171542e-05, + "loss": 0.0, + "step": 50199 + }, + { + "epoch": 4.684146682840347, + "grad_norm": NaN, + "learning_rate": 3.6353005899944096e-05, + "loss": 0.0, + "step": 50200 + }, + { + "epoch": 4.6842399925352245, + "grad_norm": NaN, + "learning_rate": 3.634806942714333e-05, + "loss": 0.0, + "step": 50201 + }, + { + "epoch": 4.684333302230102, + "grad_norm": NaN, + "learning_rate": 3.63431332433258e-05, + "loss": 0.0, + "step": 50202 + }, + { + "epoch": 4.684426611924979, + "grad_norm": NaN, + "learning_rate": 3.6338197348504065e-05, + "loss": 0.0, + "step": 50203 + }, + { + "epoch": 4.684519921619856, + "grad_norm": NaN, + "learning_rate": 3.6333261742690536e-05, + "loss": 0.0, + "step": 50204 + }, + { + "epoch": 4.684613231314733, + "grad_norm": NaN, + "learning_rate": 3.632832642589789e-05, + "loss": 0.0, + "step": 50205 + }, + { + "epoch": 4.684706541009611, + "grad_norm": NaN, + "learning_rate": 3.6323391398138676e-05, + "loss": 0.0, + "step": 50206 + }, + { + "epoch": 4.684799850704488, + "grad_norm": NaN, + "learning_rate": 3.6318456659425335e-05, + "loss": 0.0, + "step": 50207 + }, + { + "epoch": 4.6848931603993655, + "grad_norm": NaN, + "learning_rate": 3.6313522209770505e-05, + "loss": 0.0, + "step": 50208 + }, + { + "epoch": 4.684986470094243, + "grad_norm": NaN, + "learning_rate": 3.630858804918678e-05, + "loss": 0.0, + "step": 50209 + }, + { + "epoch": 4.68507977978912, + "grad_norm": NaN, + "learning_rate": 3.630365417768654e-05, + "loss": 0.0, + "step": 50210 + }, + { + "epoch": 4.685173089483998, + "grad_norm": NaN, + "learning_rate": 3.629872059528246e-05, + "loss": 0.0, + "step": 50211 + }, + { + "epoch": 4.685266399178874, + "grad_norm": NaN, + "learning_rate": 3.6293787301987106e-05, + "loss": 0.0, + "step": 50212 + }, + { + "epoch": 4.685359708873752, + "grad_norm": NaN, + "learning_rate": 3.628885429781289e-05, + "loss": 0.0, + "step": 50213 + }, + { + "epoch": 4.685453018568629, + "grad_norm": NaN, + "learning_rate": 3.628392158277246e-05, + "loss": 0.0, + "step": 50214 + }, + { + "epoch": 4.685546328263507, + "grad_norm": NaN, + "learning_rate": 3.627898915687834e-05, + "loss": 0.0, + "step": 50215 + }, + { + "epoch": 4.685639637958384, + "grad_norm": NaN, + "learning_rate": 3.627405702014307e-05, + "loss": 0.0, + "step": 50216 + }, + { + "epoch": 4.685732947653261, + "grad_norm": NaN, + "learning_rate": 3.626912517257918e-05, + "loss": 0.0, + "step": 50217 + }, + { + "epoch": 4.685826257348139, + "grad_norm": NaN, + "learning_rate": 3.6264193614199214e-05, + "loss": 0.0, + "step": 50218 + }, + { + "epoch": 4.685919567043015, + "grad_norm": NaN, + "learning_rate": 3.6259262345015706e-05, + "loss": 0.0, + "step": 50219 + }, + { + "epoch": 4.686012876737893, + "grad_norm": NaN, + "learning_rate": 3.6254331365041216e-05, + "loss": 0.0, + "step": 50220 + }, + { + "epoch": 4.68610618643277, + "grad_norm": NaN, + "learning_rate": 3.624940067428826e-05, + "loss": 0.0, + "step": 50221 + }, + { + "epoch": 4.686199496127648, + "grad_norm": NaN, + "learning_rate": 3.624447027276939e-05, + "loss": 0.0, + "step": 50222 + }, + { + "epoch": 4.686292805822525, + "grad_norm": NaN, + "learning_rate": 3.6239540160497125e-05, + "loss": 0.0, + "step": 50223 + }, + { + "epoch": 4.6863861155174025, + "grad_norm": NaN, + "learning_rate": 3.6234610337484016e-05, + "loss": 0.0, + "step": 50224 + }, + { + "epoch": 4.68647942521228, + "grad_norm": NaN, + "learning_rate": 3.622968080374258e-05, + "loss": 0.0, + "step": 50225 + }, + { + "epoch": 4.686572734907157, + "grad_norm": NaN, + "learning_rate": 3.6224751559285384e-05, + "loss": 0.0, + "step": 50226 + }, + { + "epoch": 4.686666044602034, + "grad_norm": NaN, + "learning_rate": 3.6219822604124933e-05, + "loss": 0.0, + "step": 50227 + }, + { + "epoch": 4.686759354296911, + "grad_norm": NaN, + "learning_rate": 3.621489393827377e-05, + "loss": 0.0, + "step": 50228 + }, + { + "epoch": 4.686852663991789, + "grad_norm": NaN, + "learning_rate": 3.620996556174442e-05, + "loss": 0.0, + "step": 50229 + }, + { + "epoch": 4.686945973686666, + "grad_norm": NaN, + "learning_rate": 3.6205037474549434e-05, + "loss": 0.0, + "step": 50230 + }, + { + "epoch": 4.6870392833815435, + "grad_norm": NaN, + "learning_rate": 3.6200109676701314e-05, + "loss": 0.0, + "step": 50231 + }, + { + "epoch": 4.687132593076421, + "grad_norm": NaN, + "learning_rate": 3.619518216821266e-05, + "loss": 0.0, + "step": 50232 + }, + { + "epoch": 4.6872259027712975, + "grad_norm": NaN, + "learning_rate": 3.619025494909586e-05, + "loss": 0.0, + "step": 50233 + }, + { + "epoch": 4.687319212466175, + "grad_norm": NaN, + "learning_rate": 3.618532801936356e-05, + "loss": 0.0, + "step": 50234 + }, + { + "epoch": 4.687412522161052, + "grad_norm": NaN, + "learning_rate": 3.6180401379028304e-05, + "loss": 0.0, + "step": 50235 + }, + { + "epoch": 4.68750583185593, + "grad_norm": NaN, + "learning_rate": 3.61754750281025e-05, + "loss": 0.0, + "step": 50236 + }, + { + "epoch": 4.687599141550807, + "grad_norm": NaN, + "learning_rate": 3.617054896659877e-05, + "loss": 0.0, + "step": 50237 + }, + { + "epoch": 4.687692451245685, + "grad_norm": NaN, + "learning_rate": 3.6165623194529656e-05, + "loss": 0.0, + "step": 50238 + }, + { + "epoch": 4.687785760940562, + "grad_norm": NaN, + "learning_rate": 3.6160697711907576e-05, + "loss": 0.0, + "step": 50239 + }, + { + "epoch": 4.687879070635439, + "grad_norm": NaN, + "learning_rate": 3.615577251874515e-05, + "loss": 0.0, + "step": 50240 + }, + { + "epoch": 4.687972380330316, + "grad_norm": NaN, + "learning_rate": 3.6150847615054924e-05, + "loss": 0.0, + "step": 50241 + }, + { + "epoch": 4.688065690025193, + "grad_norm": NaN, + "learning_rate": 3.614592300084926e-05, + "loss": 0.0, + "step": 50242 + }, + { + "epoch": 4.688158999720071, + "grad_norm": NaN, + "learning_rate": 3.6140998676140845e-05, + "loss": 0.0, + "step": 50243 + }, + { + "epoch": 4.688252309414948, + "grad_norm": NaN, + "learning_rate": 3.61360746409422e-05, + "loss": 0.0, + "step": 50244 + }, + { + "epoch": 4.688345619109826, + "grad_norm": NaN, + "learning_rate": 3.613115089526569e-05, + "loss": 0.0, + "step": 50245 + }, + { + "epoch": 4.688438928804703, + "grad_norm": NaN, + "learning_rate": 3.612622743912396e-05, + "loss": 0.0, + "step": 50246 + }, + { + "epoch": 4.6885322384995805, + "grad_norm": NaN, + "learning_rate": 3.612130427252956e-05, + "loss": 0.0, + "step": 50247 + }, + { + "epoch": 4.688625548194457, + "grad_norm": NaN, + "learning_rate": 3.611638139549486e-05, + "loss": 0.0, + "step": 50248 + }, + { + "epoch": 4.688718857889334, + "grad_norm": NaN, + "learning_rate": 3.611145880803252e-05, + "loss": 0.0, + "step": 50249 + }, + { + "epoch": 4.688812167584212, + "grad_norm": NaN, + "learning_rate": 3.6106536510155045e-05, + "loss": 0.0, + "step": 50250 + }, + { + "epoch": 4.688905477279089, + "grad_norm": NaN, + "learning_rate": 3.610161450187481e-05, + "loss": 0.0, + "step": 50251 + }, + { + "epoch": 4.688998786973967, + "grad_norm": NaN, + "learning_rate": 3.6096692783204476e-05, + "loss": 0.0, + "step": 50252 + }, + { + "epoch": 4.689092096668844, + "grad_norm": NaN, + "learning_rate": 3.609177135415652e-05, + "loss": 0.0, + "step": 50253 + }, + { + "epoch": 4.6891854063637215, + "grad_norm": NaN, + "learning_rate": 3.608685021474345e-05, + "loss": 0.0, + "step": 50254 + }, + { + "epoch": 4.689278716058599, + "grad_norm": NaN, + "learning_rate": 3.6081929364977766e-05, + "loss": 0.0, + "step": 50255 + }, + { + "epoch": 4.6893720257534754, + "grad_norm": NaN, + "learning_rate": 3.6077008804872e-05, + "loss": 0.0, + "step": 50256 + }, + { + "epoch": 4.689465335448353, + "grad_norm": NaN, + "learning_rate": 3.607208853443865e-05, + "loss": 0.0, + "step": 50257 + }, + { + "epoch": 4.68955864514323, + "grad_norm": NaN, + "learning_rate": 3.606716855369024e-05, + "loss": 0.0, + "step": 50258 + }, + { + "epoch": 4.689651954838108, + "grad_norm": NaN, + "learning_rate": 3.606224886263925e-05, + "loss": 0.0, + "step": 50259 + }, + { + "epoch": 4.689745264532985, + "grad_norm": NaN, + "learning_rate": 3.6057329461298226e-05, + "loss": 0.0, + "step": 50260 + }, + { + "epoch": 4.6898385742278625, + "grad_norm": NaN, + "learning_rate": 3.605241034967966e-05, + "loss": 0.0, + "step": 50261 + }, + { + "epoch": 4.689931883922739, + "grad_norm": NaN, + "learning_rate": 3.6047491527796054e-05, + "loss": 0.0, + "step": 50262 + }, + { + "epoch": 4.6900251936176165, + "grad_norm": NaN, + "learning_rate": 3.6042572995659924e-05, + "loss": 0.0, + "step": 50263 + }, + { + "epoch": 4.690118503312494, + "grad_norm": NaN, + "learning_rate": 3.603765475328378e-05, + "loss": 0.0, + "step": 50264 + }, + { + "epoch": 4.690211813007371, + "grad_norm": NaN, + "learning_rate": 3.603273680068012e-05, + "loss": 0.0, + "step": 50265 + }, + { + "epoch": 4.690305122702249, + "grad_norm": NaN, + "learning_rate": 3.602781913786145e-05, + "loss": 0.0, + "step": 50266 + }, + { + "epoch": 4.690398432397126, + "grad_norm": NaN, + "learning_rate": 3.602290176484027e-05, + "loss": 0.0, + "step": 50267 + }, + { + "epoch": 4.690491742092004, + "grad_norm": NaN, + "learning_rate": 3.601798468162909e-05, + "loss": 0.0, + "step": 50268 + }, + { + "epoch": 4.690585051786881, + "grad_norm": NaN, + "learning_rate": 3.601306788824041e-05, + "loss": 0.0, + "step": 50269 + }, + { + "epoch": 4.690678361481758, + "grad_norm": NaN, + "learning_rate": 3.600815138468673e-05, + "loss": 0.0, + "step": 50270 + }, + { + "epoch": 4.690771671176635, + "grad_norm": NaN, + "learning_rate": 3.600323517098057e-05, + "loss": 0.0, + "step": 50271 + }, + { + "epoch": 4.690864980871512, + "grad_norm": NaN, + "learning_rate": 3.5998319247134394e-05, + "loss": 0.0, + "step": 50272 + }, + { + "epoch": 4.69095829056639, + "grad_norm": NaN, + "learning_rate": 3.599340361316074e-05, + "loss": 0.0, + "step": 50273 + }, + { + "epoch": 4.691051600261267, + "grad_norm": NaN, + "learning_rate": 3.5988488269072074e-05, + "loss": 0.0, + "step": 50274 + }, + { + "epoch": 4.691144909956145, + "grad_norm": NaN, + "learning_rate": 3.598357321488093e-05, + "loss": 0.0, + "step": 50275 + }, + { + "epoch": 4.691238219651022, + "grad_norm": NaN, + "learning_rate": 3.597865845059981e-05, + "loss": 0.0, + "step": 50276 + }, + { + "epoch": 4.691331529345899, + "grad_norm": NaN, + "learning_rate": 3.5973743976241096e-05, + "loss": 0.0, + "step": 50277 + }, + { + "epoch": 4.691424839040776, + "grad_norm": NaN, + "learning_rate": 3.5968829791817444e-05, + "loss": 0.0, + "step": 50278 + }, + { + "epoch": 4.691518148735653, + "grad_norm": NaN, + "learning_rate": 3.596391589734131e-05, + "loss": 0.0, + "step": 50279 + }, + { + "epoch": 4.691611458430531, + "grad_norm": NaN, + "learning_rate": 3.5959002292825054e-05, + "loss": 0.0, + "step": 50280 + }, + { + "epoch": 4.691704768125408, + "grad_norm": NaN, + "learning_rate": 3.5954088978281335e-05, + "loss": 0.0, + "step": 50281 + }, + { + "epoch": 4.691798077820286, + "grad_norm": NaN, + "learning_rate": 3.594917595372262e-05, + "loss": 0.0, + "step": 50282 + }, + { + "epoch": 4.691891387515163, + "grad_norm": NaN, + "learning_rate": 3.594426321916128e-05, + "loss": 0.0, + "step": 50283 + }, + { + "epoch": 4.6919846972100405, + "grad_norm": NaN, + "learning_rate": 3.5939350774609936e-05, + "loss": 0.0, + "step": 50284 + }, + { + "epoch": 4.692078006904917, + "grad_norm": NaN, + "learning_rate": 3.5934438620081084e-05, + "loss": 0.0, + "step": 50285 + }, + { + "epoch": 4.6921713165997945, + "grad_norm": NaN, + "learning_rate": 3.592952675558708e-05, + "loss": 0.0, + "step": 50286 + }, + { + "epoch": 4.692264626294672, + "grad_norm": NaN, + "learning_rate": 3.592461518114054e-05, + "loss": 0.0, + "step": 50287 + }, + { + "epoch": 4.692357935989549, + "grad_norm": NaN, + "learning_rate": 3.5919703896753946e-05, + "loss": 0.0, + "step": 50288 + }, + { + "epoch": 4.692451245684427, + "grad_norm": NaN, + "learning_rate": 3.591479290243967e-05, + "loss": 0.0, + "step": 50289 + }, + { + "epoch": 4.692544555379304, + "grad_norm": NaN, + "learning_rate": 3.5909882198210335e-05, + "loss": 0.0, + "step": 50290 + }, + { + "epoch": 4.692637865074182, + "grad_norm": NaN, + "learning_rate": 3.590497178407837e-05, + "loss": 0.0, + "step": 50291 + }, + { + "epoch": 4.692731174769058, + "grad_norm": NaN, + "learning_rate": 3.590006166005627e-05, + "loss": 0.0, + "step": 50292 + }, + { + "epoch": 4.6928244844639355, + "grad_norm": NaN, + "learning_rate": 3.5895151826156486e-05, + "loss": 0.0, + "step": 50293 + }, + { + "epoch": 4.692917794158813, + "grad_norm": NaN, + "learning_rate": 3.5890242282391554e-05, + "loss": 0.0, + "step": 50294 + }, + { + "epoch": 4.69301110385369, + "grad_norm": NaN, + "learning_rate": 3.588533302877393e-05, + "loss": 0.0, + "step": 50295 + }, + { + "epoch": 4.693104413548568, + "grad_norm": NaN, + "learning_rate": 3.58804240653161e-05, + "loss": 0.0, + "step": 50296 + }, + { + "epoch": 4.693197723243445, + "grad_norm": NaN, + "learning_rate": 3.5875515392030565e-05, + "loss": 0.0, + "step": 50297 + }, + { + "epoch": 4.693291032938323, + "grad_norm": NaN, + "learning_rate": 3.587060700892976e-05, + "loss": 0.0, + "step": 50298 + }, + { + "epoch": 4.6933843426332, + "grad_norm": NaN, + "learning_rate": 3.58656989160262e-05, + "loss": 0.0, + "step": 50299 + }, + { + "epoch": 4.693477652328077, + "grad_norm": NaN, + "learning_rate": 3.5860791113332374e-05, + "loss": 0.0, + "step": 50300 + }, + { + "epoch": 4.693570962022954, + "grad_norm": NaN, + "learning_rate": 3.585588360086072e-05, + "loss": 0.0, + "step": 50301 + }, + { + "epoch": 4.693664271717831, + "grad_norm": NaN, + "learning_rate": 3.585097637862376e-05, + "loss": 0.0, + "step": 50302 + }, + { + "epoch": 4.693757581412709, + "grad_norm": NaN, + "learning_rate": 3.5846069446633965e-05, + "loss": 0.0, + "step": 50303 + }, + { + "epoch": 4.693850891107586, + "grad_norm": NaN, + "learning_rate": 3.584116280490378e-05, + "loss": 0.0, + "step": 50304 + }, + { + "epoch": 4.693944200802464, + "grad_norm": NaN, + "learning_rate": 3.58362564534457e-05, + "loss": 0.0, + "step": 50305 + }, + { + "epoch": 4.69403751049734, + "grad_norm": NaN, + "learning_rate": 3.583135039227222e-05, + "loss": 0.0, + "step": 50306 + }, + { + "epoch": 4.694130820192218, + "grad_norm": NaN, + "learning_rate": 3.582644462139579e-05, + "loss": 0.0, + "step": 50307 + }, + { + "epoch": 4.694224129887095, + "grad_norm": NaN, + "learning_rate": 3.5821539140828886e-05, + "loss": 0.0, + "step": 50308 + }, + { + "epoch": 4.6943174395819725, + "grad_norm": NaN, + "learning_rate": 3.581663395058398e-05, + "loss": 0.0, + "step": 50309 + }, + { + "epoch": 4.69441074927685, + "grad_norm": NaN, + "learning_rate": 3.581172905067356e-05, + "loss": 0.0, + "step": 50310 + }, + { + "epoch": 4.694504058971727, + "grad_norm": NaN, + "learning_rate": 3.580682444111009e-05, + "loss": 0.0, + "step": 50311 + }, + { + "epoch": 4.694597368666605, + "grad_norm": NaN, + "learning_rate": 3.580192012190603e-05, + "loss": 0.0, + "step": 50312 + }, + { + "epoch": 4.694690678361482, + "grad_norm": NaN, + "learning_rate": 3.579701609307386e-05, + "loss": 0.0, + "step": 50313 + }, + { + "epoch": 4.6947839880563595, + "grad_norm": NaN, + "learning_rate": 3.579211235462605e-05, + "loss": 0.0, + "step": 50314 + }, + { + "epoch": 4.694877297751236, + "grad_norm": NaN, + "learning_rate": 3.578720890657507e-05, + "loss": 0.0, + "step": 50315 + }, + { + "epoch": 4.6949706074461135, + "grad_norm": NaN, + "learning_rate": 3.5782305748933384e-05, + "loss": 0.0, + "step": 50316 + }, + { + "epoch": 4.695063917140991, + "grad_norm": NaN, + "learning_rate": 3.577740288171347e-05, + "loss": 0.0, + "step": 50317 + }, + { + "epoch": 4.695157226835868, + "grad_norm": NaN, + "learning_rate": 3.577250030492777e-05, + "loss": 0.0, + "step": 50318 + }, + { + "epoch": 4.695250536530746, + "grad_norm": NaN, + "learning_rate": 3.576759801858878e-05, + "loss": 0.0, + "step": 50319 + }, + { + "epoch": 4.695343846225623, + "grad_norm": NaN, + "learning_rate": 3.5762696022708985e-05, + "loss": 0.0, + "step": 50320 + }, + { + "epoch": 4.6954371559205, + "grad_norm": NaN, + "learning_rate": 3.575779431730072e-05, + "loss": 0.0, + "step": 50321 + }, + { + "epoch": 4.695530465615377, + "grad_norm": NaN, + "learning_rate": 3.575289290237659e-05, + "loss": 0.0, + "step": 50322 + }, + { + "epoch": 4.6956237753102545, + "grad_norm": NaN, + "learning_rate": 3.574799177794907e-05, + "loss": 0.0, + "step": 50323 + }, + { + "epoch": 4.695717085005132, + "grad_norm": NaN, + "learning_rate": 3.5743090944030464e-05, + "loss": 0.0, + "step": 50324 + }, + { + "epoch": 4.695810394700009, + "grad_norm": NaN, + "learning_rate": 3.573819040063337e-05, + "loss": 0.0, + "step": 50325 + }, + { + "epoch": 4.695903704394887, + "grad_norm": NaN, + "learning_rate": 3.573329014777027e-05, + "loss": 0.0, + "step": 50326 + }, + { + "epoch": 4.695997014089764, + "grad_norm": NaN, + "learning_rate": 3.572839018545346e-05, + "loss": 0.0, + "step": 50327 + }, + { + "epoch": 4.696090323784642, + "grad_norm": NaN, + "learning_rate": 3.5723490513695556e-05, + "loss": 0.0, + "step": 50328 + }, + { + "epoch": 4.696183633479518, + "grad_norm": NaN, + "learning_rate": 3.571859113250897e-05, + "loss": 0.0, + "step": 50329 + }, + { + "epoch": 4.696276943174396, + "grad_norm": NaN, + "learning_rate": 3.571369204190614e-05, + "loss": 0.0, + "step": 50330 + }, + { + "epoch": 4.696370252869273, + "grad_norm": NaN, + "learning_rate": 3.5708793241899545e-05, + "loss": 0.0, + "step": 50331 + }, + { + "epoch": 4.69646356256415, + "grad_norm": NaN, + "learning_rate": 3.570389473250164e-05, + "loss": 0.0, + "step": 50332 + }, + { + "epoch": 4.696556872259028, + "grad_norm": NaN, + "learning_rate": 3.569899651372487e-05, + "loss": 0.0, + "step": 50333 + }, + { + "epoch": 4.696650181953905, + "grad_norm": NaN, + "learning_rate": 3.569409858558169e-05, + "loss": 0.0, + "step": 50334 + }, + { + "epoch": 4.696743491648783, + "grad_norm": NaN, + "learning_rate": 3.5689200948084564e-05, + "loss": 0.0, + "step": 50335 + }, + { + "epoch": 4.696836801343659, + "grad_norm": NaN, + "learning_rate": 3.5684303601245936e-05, + "loss": 0.0, + "step": 50336 + }, + { + "epoch": 4.696930111038537, + "grad_norm": NaN, + "learning_rate": 3.567940654507826e-05, + "loss": 0.0, + "step": 50337 + }, + { + "epoch": 4.697023420733414, + "grad_norm": NaN, + "learning_rate": 3.5674509779593976e-05, + "loss": 0.0, + "step": 50338 + }, + { + "epoch": 4.6971167304282915, + "grad_norm": NaN, + "learning_rate": 3.566961330480557e-05, + "loss": 0.0, + "step": 50339 + }, + { + "epoch": 4.697210040123169, + "grad_norm": NaN, + "learning_rate": 3.566471712072544e-05, + "loss": 0.0, + "step": 50340 + }, + { + "epoch": 4.697303349818046, + "grad_norm": NaN, + "learning_rate": 3.5659821227366104e-05, + "loss": 0.0, + "step": 50341 + }, + { + "epoch": 4.697396659512924, + "grad_norm": NaN, + "learning_rate": 3.565492562473995e-05, + "loss": 0.0, + "step": 50342 + }, + { + "epoch": 4.697489969207801, + "grad_norm": NaN, + "learning_rate": 3.5650030312859454e-05, + "loss": 0.0, + "step": 50343 + }, + { + "epoch": 4.697583278902678, + "grad_norm": NaN, + "learning_rate": 3.564513529173705e-05, + "loss": 0.0, + "step": 50344 + }, + { + "epoch": 4.697676588597555, + "grad_norm": NaN, + "learning_rate": 3.5640240561385195e-05, + "loss": 0.0, + "step": 50345 + }, + { + "epoch": 4.6977698982924325, + "grad_norm": NaN, + "learning_rate": 3.563534612181634e-05, + "loss": 0.0, + "step": 50346 + }, + { + "epoch": 4.69786320798731, + "grad_norm": NaN, + "learning_rate": 3.563045197304293e-05, + "loss": 0.0, + "step": 50347 + }, + { + "epoch": 4.697956517682187, + "grad_norm": NaN, + "learning_rate": 3.562555811507737e-05, + "loss": 0.0, + "step": 50348 + }, + { + "epoch": 4.698049827377065, + "grad_norm": NaN, + "learning_rate": 3.5620664547932174e-05, + "loss": 0.0, + "step": 50349 + }, + { + "epoch": 4.698143137071941, + "grad_norm": NaN, + "learning_rate": 3.561577127161973e-05, + "loss": 0.0, + "step": 50350 + }, + { + "epoch": 4.698236446766819, + "grad_norm": NaN, + "learning_rate": 3.561087828615249e-05, + "loss": 0.0, + "step": 50351 + }, + { + "epoch": 4.698329756461696, + "grad_norm": NaN, + "learning_rate": 3.5605985591542904e-05, + "loss": 0.0, + "step": 50352 + }, + { + "epoch": 4.698423066156574, + "grad_norm": NaN, + "learning_rate": 3.560109318780342e-05, + "loss": 0.0, + "step": 50353 + }, + { + "epoch": 4.698516375851451, + "grad_norm": NaN, + "learning_rate": 3.559620107494646e-05, + "loss": 0.0, + "step": 50354 + }, + { + "epoch": 4.698609685546328, + "grad_norm": NaN, + "learning_rate": 3.5591309252984466e-05, + "loss": 0.0, + "step": 50355 + }, + { + "epoch": 4.698702995241206, + "grad_norm": NaN, + "learning_rate": 3.5586417721929896e-05, + "loss": 0.0, + "step": 50356 + }, + { + "epoch": 4.698796304936083, + "grad_norm": NaN, + "learning_rate": 3.5581526481795175e-05, + "loss": 0.0, + "step": 50357 + }, + { + "epoch": 4.69888961463096, + "grad_norm": NaN, + "learning_rate": 3.557663553259272e-05, + "loss": 0.0, + "step": 50358 + }, + { + "epoch": 4.698982924325837, + "grad_norm": NaN, + "learning_rate": 3.5571744874335e-05, + "loss": 0.0, + "step": 50359 + }, + { + "epoch": 4.699076234020715, + "grad_norm": NaN, + "learning_rate": 3.556685450703443e-05, + "loss": 0.0, + "step": 50360 + }, + { + "epoch": 4.699169543715592, + "grad_norm": NaN, + "learning_rate": 3.5561964430703464e-05, + "loss": 0.0, + "step": 50361 + }, + { + "epoch": 4.6992628534104695, + "grad_norm": NaN, + "learning_rate": 3.555707464535452e-05, + "loss": 0.0, + "step": 50362 + }, + { + "epoch": 4.699356163105347, + "grad_norm": NaN, + "learning_rate": 3.555218515100002e-05, + "loss": 0.0, + "step": 50363 + }, + { + "epoch": 4.699449472800224, + "grad_norm": NaN, + "learning_rate": 3.554729594765243e-05, + "loss": 0.0, + "step": 50364 + }, + { + "epoch": 4.699542782495101, + "grad_norm": NaN, + "learning_rate": 3.554240703532415e-05, + "loss": 0.0, + "step": 50365 + }, + { + "epoch": 4.699636092189978, + "grad_norm": NaN, + "learning_rate": 3.553751841402763e-05, + "loss": 0.0, + "step": 50366 + }, + { + "epoch": 4.699729401884856, + "grad_norm": NaN, + "learning_rate": 3.55326300837753e-05, + "loss": 0.0, + "step": 50367 + }, + { + "epoch": 4.699822711579733, + "grad_norm": NaN, + "learning_rate": 3.552774204457958e-05, + "loss": 0.0, + "step": 50368 + }, + { + "epoch": 4.6999160212746105, + "grad_norm": NaN, + "learning_rate": 3.552285429645289e-05, + "loss": 0.0, + "step": 50369 + }, + { + "epoch": 4.700009330969488, + "grad_norm": NaN, + "learning_rate": 3.551796683940768e-05, + "loss": 0.0, + "step": 50370 + }, + { + "epoch": 4.700102640664365, + "grad_norm": NaN, + "learning_rate": 3.551307967345637e-05, + "loss": 0.0, + "step": 50371 + }, + { + "epoch": 4.700195950359243, + "grad_norm": NaN, + "learning_rate": 3.550819279861138e-05, + "loss": 0.0, + "step": 50372 + }, + { + "epoch": 4.700289260054119, + "grad_norm": NaN, + "learning_rate": 3.550330621488515e-05, + "loss": 0.0, + "step": 50373 + }, + { + "epoch": 4.700382569748997, + "grad_norm": NaN, + "learning_rate": 3.5498419922290094e-05, + "loss": 0.0, + "step": 50374 + }, + { + "epoch": 4.700475879443874, + "grad_norm": NaN, + "learning_rate": 3.5493533920838626e-05, + "loss": 0.0, + "step": 50375 + }, + { + "epoch": 4.7005691891387515, + "grad_norm": NaN, + "learning_rate": 3.5488648210543194e-05, + "loss": 0.0, + "step": 50376 + }, + { + "epoch": 4.700662498833629, + "grad_norm": NaN, + "learning_rate": 3.5483762791416205e-05, + "loss": 0.0, + "step": 50377 + }, + { + "epoch": 4.700755808528506, + "grad_norm": NaN, + "learning_rate": 3.547887766347009e-05, + "loss": 0.0, + "step": 50378 + }, + { + "epoch": 4.700849118223383, + "grad_norm": NaN, + "learning_rate": 3.547399282671726e-05, + "loss": 0.0, + "step": 50379 + }, + { + "epoch": 4.70094242791826, + "grad_norm": NaN, + "learning_rate": 3.5469108281170145e-05, + "loss": 0.0, + "step": 50380 + }, + { + "epoch": 4.701035737613138, + "grad_norm": NaN, + "learning_rate": 3.546422402684114e-05, + "loss": 0.0, + "step": 50381 + }, + { + "epoch": 4.701129047308015, + "grad_norm": NaN, + "learning_rate": 3.5459340063742716e-05, + "loss": 0.0, + "step": 50382 + }, + { + "epoch": 4.701222357002893, + "grad_norm": NaN, + "learning_rate": 3.545445639188725e-05, + "loss": 0.0, + "step": 50383 + }, + { + "epoch": 4.70131566669777, + "grad_norm": NaN, + "learning_rate": 3.544957301128717e-05, + "loss": 0.0, + "step": 50384 + }, + { + "epoch": 4.701408976392647, + "grad_norm": NaN, + "learning_rate": 3.544468992195488e-05, + "loss": 0.0, + "step": 50385 + }, + { + "epoch": 4.701502286087525, + "grad_norm": NaN, + "learning_rate": 3.543980712390283e-05, + "loss": 0.0, + "step": 50386 + }, + { + "epoch": 4.701595595782402, + "grad_norm": NaN, + "learning_rate": 3.54349246171434e-05, + "loss": 0.0, + "step": 50387 + }, + { + "epoch": 4.701688905477279, + "grad_norm": NaN, + "learning_rate": 3.543004240168904e-05, + "loss": 0.0, + "step": 50388 + }, + { + "epoch": 4.701782215172156, + "grad_norm": NaN, + "learning_rate": 3.5425160477552125e-05, + "loss": 0.0, + "step": 50389 + }, + { + "epoch": 4.701875524867034, + "grad_norm": NaN, + "learning_rate": 3.542027884474509e-05, + "loss": 0.0, + "step": 50390 + }, + { + "epoch": 4.701968834561911, + "grad_norm": NaN, + "learning_rate": 3.541539750328035e-05, + "loss": 0.0, + "step": 50391 + }, + { + "epoch": 4.7020621442567885, + "grad_norm": NaN, + "learning_rate": 3.5410516453170304e-05, + "loss": 0.0, + "step": 50392 + }, + { + "epoch": 4.702155453951666, + "grad_norm": NaN, + "learning_rate": 3.540563569442737e-05, + "loss": 0.0, + "step": 50393 + }, + { + "epoch": 4.702248763646542, + "grad_norm": NaN, + "learning_rate": 3.540075522706398e-05, + "loss": 0.0, + "step": 50394 + }, + { + "epoch": 4.70234207334142, + "grad_norm": NaN, + "learning_rate": 3.5395875051092515e-05, + "loss": 0.0, + "step": 50395 + }, + { + "epoch": 4.702435383036297, + "grad_norm": NaN, + "learning_rate": 3.539099516652538e-05, + "loss": 0.0, + "step": 50396 + }, + { + "epoch": 4.702528692731175, + "grad_norm": NaN, + "learning_rate": 3.538611557337499e-05, + "loss": 0.0, + "step": 50397 + }, + { + "epoch": 4.702622002426052, + "grad_norm": NaN, + "learning_rate": 3.538123627165378e-05, + "loss": 0.0, + "step": 50398 + }, + { + "epoch": 4.7027153121209295, + "grad_norm": NaN, + "learning_rate": 3.5376357261374125e-05, + "loss": 0.0, + "step": 50399 + }, + { + "epoch": 4.702808621815807, + "grad_norm": NaN, + "learning_rate": 3.5371478542548426e-05, + "loss": 0.0, + "step": 50400 + }, + { + "epoch": 4.702901931510684, + "grad_norm": NaN, + "learning_rate": 3.5366600115189116e-05, + "loss": 0.0, + "step": 50401 + }, + { + "epoch": 4.702995241205561, + "grad_norm": NaN, + "learning_rate": 3.5361721979308575e-05, + "loss": 0.0, + "step": 50402 + }, + { + "epoch": 4.703088550900438, + "grad_norm": NaN, + "learning_rate": 3.535684413491922e-05, + "loss": 0.0, + "step": 50403 + }, + { + "epoch": 4.703181860595316, + "grad_norm": NaN, + "learning_rate": 3.5351966582033455e-05, + "loss": 0.0, + "step": 50404 + }, + { + "epoch": 4.703275170290193, + "grad_norm": NaN, + "learning_rate": 3.5347089320663664e-05, + "loss": 0.0, + "step": 50405 + }, + { + "epoch": 4.703368479985071, + "grad_norm": NaN, + "learning_rate": 3.534221235082227e-05, + "loss": 0.0, + "step": 50406 + }, + { + "epoch": 4.703461789679948, + "grad_norm": NaN, + "learning_rate": 3.533733567252168e-05, + "loss": 0.0, + "step": 50407 + }, + { + "epoch": 4.703555099374825, + "grad_norm": NaN, + "learning_rate": 3.533245928577426e-05, + "loss": 0.0, + "step": 50408 + }, + { + "epoch": 4.703648409069702, + "grad_norm": NaN, + "learning_rate": 3.532758319059243e-05, + "loss": 0.0, + "step": 50409 + }, + { + "epoch": 4.703741718764579, + "grad_norm": NaN, + "learning_rate": 3.5322707386988604e-05, + "loss": 0.0, + "step": 50410 + }, + { + "epoch": 4.703835028459457, + "grad_norm": NaN, + "learning_rate": 3.5317831874975155e-05, + "loss": 0.0, + "step": 50411 + }, + { + "epoch": 4.703928338154334, + "grad_norm": NaN, + "learning_rate": 3.531295665456448e-05, + "loss": 0.0, + "step": 50412 + }, + { + "epoch": 4.704021647849212, + "grad_norm": NaN, + "learning_rate": 3.5308081725768996e-05, + "loss": 0.0, + "step": 50413 + }, + { + "epoch": 4.704114957544089, + "grad_norm": NaN, + "learning_rate": 3.530320708860108e-05, + "loss": 0.0, + "step": 50414 + }, + { + "epoch": 4.7042082672389665, + "grad_norm": NaN, + "learning_rate": 3.529833274307314e-05, + "loss": 0.0, + "step": 50415 + }, + { + "epoch": 4.704301576933844, + "grad_norm": NaN, + "learning_rate": 3.529345868919755e-05, + "loss": 0.0, + "step": 50416 + }, + { + "epoch": 4.70439488662872, + "grad_norm": NaN, + "learning_rate": 3.528858492698672e-05, + "loss": 0.0, + "step": 50417 + }, + { + "epoch": 4.704488196323598, + "grad_norm": NaN, + "learning_rate": 3.5283711456453044e-05, + "loss": 0.0, + "step": 50418 + }, + { + "epoch": 4.704581506018475, + "grad_norm": NaN, + "learning_rate": 3.5278838277608905e-05, + "loss": 0.0, + "step": 50419 + }, + { + "epoch": 4.704674815713353, + "grad_norm": NaN, + "learning_rate": 3.52739653904667e-05, + "loss": 0.0, + "step": 50420 + }, + { + "epoch": 4.70476812540823, + "grad_norm": NaN, + "learning_rate": 3.526909279503881e-05, + "loss": 0.0, + "step": 50421 + }, + { + "epoch": 4.7048614351031075, + "grad_norm": NaN, + "learning_rate": 3.5264220491337624e-05, + "loss": 0.0, + "step": 50422 + }, + { + "epoch": 4.704954744797984, + "grad_norm": NaN, + "learning_rate": 3.525934847937556e-05, + "loss": 0.0, + "step": 50423 + }, + { + "epoch": 4.7050480544928615, + "grad_norm": NaN, + "learning_rate": 3.525447675916497e-05, + "loss": 0.0, + "step": 50424 + }, + { + "epoch": 4.705141364187739, + "grad_norm": NaN, + "learning_rate": 3.524960533071826e-05, + "loss": 0.0, + "step": 50425 + }, + { + "epoch": 4.705234673882616, + "grad_norm": NaN, + "learning_rate": 3.52447341940478e-05, + "loss": 0.0, + "step": 50426 + }, + { + "epoch": 4.705327983577494, + "grad_norm": NaN, + "learning_rate": 3.5239863349165995e-05, + "loss": 0.0, + "step": 50427 + }, + { + "epoch": 4.705421293272371, + "grad_norm": NaN, + "learning_rate": 3.523499279608523e-05, + "loss": 0.0, + "step": 50428 + }, + { + "epoch": 4.7055146029672485, + "grad_norm": NaN, + "learning_rate": 3.523012253481789e-05, + "loss": 0.0, + "step": 50429 + }, + { + "epoch": 4.705607912662126, + "grad_norm": NaN, + "learning_rate": 3.522525256537633e-05, + "loss": 0.0, + "step": 50430 + }, + { + "epoch": 4.705701222357003, + "grad_norm": NaN, + "learning_rate": 3.522038288777297e-05, + "loss": 0.0, + "step": 50431 + }, + { + "epoch": 4.70579453205188, + "grad_norm": NaN, + "learning_rate": 3.521551350202017e-05, + "loss": 0.0, + "step": 50432 + }, + { + "epoch": 4.705887841746757, + "grad_norm": NaN, + "learning_rate": 3.521064440813031e-05, + "loss": 0.0, + "step": 50433 + }, + { + "epoch": 4.705981151441635, + "grad_norm": NaN, + "learning_rate": 3.5205775606115783e-05, + "loss": 0.0, + "step": 50434 + }, + { + "epoch": 4.706074461136512, + "grad_norm": NaN, + "learning_rate": 3.5200907095988975e-05, + "loss": 0.0, + "step": 50435 + }, + { + "epoch": 4.70616777083139, + "grad_norm": NaN, + "learning_rate": 3.5196038877762236e-05, + "loss": 0.0, + "step": 50436 + }, + { + "epoch": 4.706261080526267, + "grad_norm": NaN, + "learning_rate": 3.519117095144797e-05, + "loss": 0.0, + "step": 50437 + }, + { + "epoch": 4.7063543902211435, + "grad_norm": NaN, + "learning_rate": 3.518630331705856e-05, + "loss": 0.0, + "step": 50438 + }, + { + "epoch": 4.706447699916021, + "grad_norm": NaN, + "learning_rate": 3.518143597460636e-05, + "loss": 0.0, + "step": 50439 + }, + { + "epoch": 4.706541009610898, + "grad_norm": NaN, + "learning_rate": 3.5176568924103754e-05, + "loss": 0.0, + "step": 50440 + }, + { + "epoch": 4.706634319305776, + "grad_norm": NaN, + "learning_rate": 3.517170216556312e-05, + "loss": 0.0, + "step": 50441 + }, + { + "epoch": 4.706727629000653, + "grad_norm": NaN, + "learning_rate": 3.516683569899683e-05, + "loss": 0.0, + "step": 50442 + }, + { + "epoch": 4.706820938695531, + "grad_norm": NaN, + "learning_rate": 3.5161969524417276e-05, + "loss": 0.0, + "step": 50443 + }, + { + "epoch": 4.706914248390408, + "grad_norm": NaN, + "learning_rate": 3.51571036418368e-05, + "loss": 0.0, + "step": 50444 + }, + { + "epoch": 4.7070075580852855, + "grad_norm": NaN, + "learning_rate": 3.5152238051267803e-05, + "loss": 0.0, + "step": 50445 + }, + { + "epoch": 4.707100867780162, + "grad_norm": NaN, + "learning_rate": 3.5147372752722644e-05, + "loss": 0.0, + "step": 50446 + }, + { + "epoch": 4.707194177475039, + "grad_norm": NaN, + "learning_rate": 3.5142507746213675e-05, + "loss": 0.0, + "step": 50447 + }, + { + "epoch": 4.707287487169917, + "grad_norm": NaN, + "learning_rate": 3.51376430317533e-05, + "loss": 0.0, + "step": 50448 + }, + { + "epoch": 4.707380796864794, + "grad_norm": NaN, + "learning_rate": 3.513277860935388e-05, + "loss": 0.0, + "step": 50449 + }, + { + "epoch": 4.707474106559672, + "grad_norm": NaN, + "learning_rate": 3.512791447902776e-05, + "loss": 0.0, + "step": 50450 + }, + { + "epoch": 4.707567416254549, + "grad_norm": NaN, + "learning_rate": 3.512305064078734e-05, + "loss": 0.0, + "step": 50451 + }, + { + "epoch": 4.7076607259494265, + "grad_norm": NaN, + "learning_rate": 3.511818709464497e-05, + "loss": 0.0, + "step": 50452 + }, + { + "epoch": 4.707754035644303, + "grad_norm": NaN, + "learning_rate": 3.5113323840613025e-05, + "loss": 0.0, + "step": 50453 + }, + { + "epoch": 4.7078473453391805, + "grad_norm": NaN, + "learning_rate": 3.510846087870385e-05, + "loss": 0.0, + "step": 50454 + }, + { + "epoch": 4.707940655034058, + "grad_norm": NaN, + "learning_rate": 3.510359820892984e-05, + "loss": 0.0, + "step": 50455 + }, + { + "epoch": 4.708033964728935, + "grad_norm": NaN, + "learning_rate": 3.5098735831303334e-05, + "loss": 0.0, + "step": 50456 + }, + { + "epoch": 4.708127274423813, + "grad_norm": NaN, + "learning_rate": 3.5093873745836714e-05, + "loss": 0.0, + "step": 50457 + }, + { + "epoch": 4.70822058411869, + "grad_norm": NaN, + "learning_rate": 3.508901195254233e-05, + "loss": 0.0, + "step": 50458 + }, + { + "epoch": 4.708313893813568, + "grad_norm": NaN, + "learning_rate": 3.5084150451432554e-05, + "loss": 0.0, + "step": 50459 + }, + { + "epoch": 4.708407203508445, + "grad_norm": NaN, + "learning_rate": 3.507928924251974e-05, + "loss": 0.0, + "step": 50460 + }, + { + "epoch": 4.7085005132033215, + "grad_norm": NaN, + "learning_rate": 3.507442832581626e-05, + "loss": 0.0, + "step": 50461 + }, + { + "epoch": 4.708593822898199, + "grad_norm": NaN, + "learning_rate": 3.506956770133447e-05, + "loss": 0.0, + "step": 50462 + }, + { + "epoch": 4.708687132593076, + "grad_norm": NaN, + "learning_rate": 3.5064707369086695e-05, + "loss": 0.0, + "step": 50463 + }, + { + "epoch": 4.708780442287954, + "grad_norm": NaN, + "learning_rate": 3.505984732908535e-05, + "loss": 0.0, + "step": 50464 + }, + { + "epoch": 4.708873751982831, + "grad_norm": NaN, + "learning_rate": 3.505498758134276e-05, + "loss": 0.0, + "step": 50465 + }, + { + "epoch": 4.708967061677709, + "grad_norm": NaN, + "learning_rate": 3.505012812587128e-05, + "loss": 0.0, + "step": 50466 + }, + { + "epoch": 4.709060371372585, + "grad_norm": NaN, + "learning_rate": 3.5045268962683276e-05, + "loss": 0.0, + "step": 50467 + }, + { + "epoch": 4.709153681067463, + "grad_norm": NaN, + "learning_rate": 3.504041009179111e-05, + "loss": 0.0, + "step": 50468 + }, + { + "epoch": 4.70924699076234, + "grad_norm": NaN, + "learning_rate": 3.5035551513207125e-05, + "loss": 0.0, + "step": 50469 + }, + { + "epoch": 4.709340300457217, + "grad_norm": NaN, + "learning_rate": 3.5030693226943665e-05, + "loss": 0.0, + "step": 50470 + }, + { + "epoch": 4.709433610152095, + "grad_norm": NaN, + "learning_rate": 3.502583523301311e-05, + "loss": 0.0, + "step": 50471 + }, + { + "epoch": 4.709526919846972, + "grad_norm": NaN, + "learning_rate": 3.502097753142777e-05, + "loss": 0.0, + "step": 50472 + }, + { + "epoch": 4.70962022954185, + "grad_norm": NaN, + "learning_rate": 3.501612012220007e-05, + "loss": 0.0, + "step": 50473 + }, + { + "epoch": 4.709713539236727, + "grad_norm": NaN, + "learning_rate": 3.5011263005342284e-05, + "loss": 0.0, + "step": 50474 + }, + { + "epoch": 4.709806848931604, + "grad_norm": NaN, + "learning_rate": 3.5006406180866803e-05, + "loss": 0.0, + "step": 50475 + }, + { + "epoch": 4.709900158626481, + "grad_norm": NaN, + "learning_rate": 3.5001549648785964e-05, + "loss": 0.0, + "step": 50476 + }, + { + "epoch": 4.7099934683213585, + "grad_norm": NaN, + "learning_rate": 3.499669340911213e-05, + "loss": 0.0, + "step": 50477 + }, + { + "epoch": 4.710086778016236, + "grad_norm": NaN, + "learning_rate": 3.4991837461857624e-05, + "loss": 0.0, + "step": 50478 + }, + { + "epoch": 4.710180087711113, + "grad_norm": NaN, + "learning_rate": 3.49869818070348e-05, + "loss": 0.0, + "step": 50479 + }, + { + "epoch": 4.710273397405991, + "grad_norm": NaN, + "learning_rate": 3.4982126444656034e-05, + "loss": 0.0, + "step": 50480 + }, + { + "epoch": 4.710366707100868, + "grad_norm": NaN, + "learning_rate": 3.497727137473364e-05, + "loss": 0.0, + "step": 50481 + }, + { + "epoch": 4.710460016795745, + "grad_norm": NaN, + "learning_rate": 3.497241659727998e-05, + "loss": 0.0, + "step": 50482 + }, + { + "epoch": 4.710553326490622, + "grad_norm": NaN, + "learning_rate": 3.4967562112307377e-05, + "loss": 0.0, + "step": 50483 + }, + { + "epoch": 4.7106466361854995, + "grad_norm": NaN, + "learning_rate": 3.49627079198282e-05, + "loss": 0.0, + "step": 50484 + }, + { + "epoch": 4.710739945880377, + "grad_norm": NaN, + "learning_rate": 3.495785401985478e-05, + "loss": 0.0, + "step": 50485 + }, + { + "epoch": 4.710833255575254, + "grad_norm": NaN, + "learning_rate": 3.4953000412399446e-05, + "loss": 0.0, + "step": 50486 + }, + { + "epoch": 4.710926565270132, + "grad_norm": NaN, + "learning_rate": 3.4948147097474566e-05, + "loss": 0.0, + "step": 50487 + }, + { + "epoch": 4.711019874965009, + "grad_norm": NaN, + "learning_rate": 3.494329407509246e-05, + "loss": 0.0, + "step": 50488 + }, + { + "epoch": 4.711113184659887, + "grad_norm": NaN, + "learning_rate": 3.4938441345265486e-05, + "loss": 0.0, + "step": 50489 + }, + { + "epoch": 4.711206494354763, + "grad_norm": NaN, + "learning_rate": 3.493358890800596e-05, + "loss": 0.0, + "step": 50490 + }, + { + "epoch": 4.7112998040496405, + "grad_norm": NaN, + "learning_rate": 3.492873676332623e-05, + "loss": 0.0, + "step": 50491 + }, + { + "epoch": 4.711393113744518, + "grad_norm": NaN, + "learning_rate": 3.492388491123864e-05, + "loss": 0.0, + "step": 50492 + }, + { + "epoch": 4.711486423439395, + "grad_norm": NaN, + "learning_rate": 3.491903335175553e-05, + "loss": 0.0, + "step": 50493 + }, + { + "epoch": 4.711579733134273, + "grad_norm": NaN, + "learning_rate": 3.4914182084889225e-05, + "loss": 0.0, + "step": 50494 + }, + { + "epoch": 4.71167304282915, + "grad_norm": NaN, + "learning_rate": 3.4909331110652054e-05, + "loss": 0.0, + "step": 50495 + }, + { + "epoch": 4.711766352524027, + "grad_norm": NaN, + "learning_rate": 3.4904480429056363e-05, + "loss": 0.0, + "step": 50496 + }, + { + "epoch": 4.711859662218904, + "grad_norm": NaN, + "learning_rate": 3.489963004011451e-05, + "loss": 0.0, + "step": 50497 + }, + { + "epoch": 4.711952971913782, + "grad_norm": NaN, + "learning_rate": 3.489477994383878e-05, + "loss": 0.0, + "step": 50498 + }, + { + "epoch": 4.712046281608659, + "grad_norm": NaN, + "learning_rate": 3.4889930140241525e-05, + "loss": 0.0, + "step": 50499 + }, + { + "epoch": 4.712139591303536, + "grad_norm": NaN, + "learning_rate": 3.488508062933508e-05, + "loss": 0.0, + "step": 50500 + }, + { + "epoch": 4.712232900998414, + "grad_norm": NaN, + "learning_rate": 3.488023141113179e-05, + "loss": 0.0, + "step": 50501 + }, + { + "epoch": 4.712326210693291, + "grad_norm": NaN, + "learning_rate": 3.487538248564396e-05, + "loss": 0.0, + "step": 50502 + }, + { + "epoch": 4.712419520388169, + "grad_norm": NaN, + "learning_rate": 3.4870533852883934e-05, + "loss": 0.0, + "step": 50503 + }, + { + "epoch": 4.712512830083046, + "grad_norm": NaN, + "learning_rate": 3.486568551286403e-05, + "loss": 0.0, + "step": 50504 + }, + { + "epoch": 4.712606139777923, + "grad_norm": NaN, + "learning_rate": 3.4860837465596594e-05, + "loss": 0.0, + "step": 50505 + }, + { + "epoch": 4.7126994494728, + "grad_norm": NaN, + "learning_rate": 3.4855989711093934e-05, + "loss": 0.0, + "step": 50506 + }, + { + "epoch": 4.7127927591676775, + "grad_norm": NaN, + "learning_rate": 3.485114224936838e-05, + "loss": 0.0, + "step": 50507 + }, + { + "epoch": 4.712886068862555, + "grad_norm": NaN, + "learning_rate": 3.484629508043227e-05, + "loss": 0.0, + "step": 50508 + }, + { + "epoch": 4.712979378557432, + "grad_norm": NaN, + "learning_rate": 3.484144820429791e-05, + "loss": 0.0, + "step": 50509 + }, + { + "epoch": 4.71307268825231, + "grad_norm": NaN, + "learning_rate": 3.4836601620977635e-05, + "loss": 0.0, + "step": 50510 + }, + { + "epoch": 4.713165997947186, + "grad_norm": NaN, + "learning_rate": 3.483175533048377e-05, + "loss": 0.0, + "step": 50511 + }, + { + "epoch": 4.713259307642064, + "grad_norm": NaN, + "learning_rate": 3.482690933282865e-05, + "loss": 0.0, + "step": 50512 + }, + { + "epoch": 4.713352617336941, + "grad_norm": NaN, + "learning_rate": 3.4822063628024575e-05, + "loss": 0.0, + "step": 50513 + }, + { + "epoch": 4.7134459270318185, + "grad_norm": NaN, + "learning_rate": 3.481721821608388e-05, + "loss": 0.0, + "step": 50514 + }, + { + "epoch": 4.713539236726696, + "grad_norm": NaN, + "learning_rate": 3.481237309701887e-05, + "loss": 0.0, + "step": 50515 + }, + { + "epoch": 4.713632546421573, + "grad_norm": NaN, + "learning_rate": 3.480752827084186e-05, + "loss": 0.0, + "step": 50516 + }, + { + "epoch": 4.713725856116451, + "grad_norm": NaN, + "learning_rate": 3.4802683737565206e-05, + "loss": 0.0, + "step": 50517 + }, + { + "epoch": 4.713819165811328, + "grad_norm": NaN, + "learning_rate": 3.47978394972012e-05, + "loss": 0.0, + "step": 50518 + }, + { + "epoch": 4.713912475506205, + "grad_norm": NaN, + "learning_rate": 3.479299554976216e-05, + "loss": 0.0, + "step": 50519 + }, + { + "epoch": 4.714005785201082, + "grad_norm": NaN, + "learning_rate": 3.47881518952604e-05, + "loss": 0.0, + "step": 50520 + }, + { + "epoch": 4.71409909489596, + "grad_norm": NaN, + "learning_rate": 3.478330853370825e-05, + "loss": 0.0, + "step": 50521 + }, + { + "epoch": 4.714192404590837, + "grad_norm": NaN, + "learning_rate": 3.4778465465118e-05, + "loss": 0.0, + "step": 50522 + }, + { + "epoch": 4.714285714285714, + "grad_norm": NaN, + "learning_rate": 3.477362268950199e-05, + "loss": 0.0, + "step": 50523 + }, + { + "epoch": 4.714379023980592, + "grad_norm": NaN, + "learning_rate": 3.476878020687254e-05, + "loss": 0.0, + "step": 50524 + }, + { + "epoch": 4.714472333675469, + "grad_norm": NaN, + "learning_rate": 3.476393801724193e-05, + "loss": 0.0, + "step": 50525 + }, + { + "epoch": 4.714565643370346, + "grad_norm": NaN, + "learning_rate": 3.4759096120622506e-05, + "loss": 0.0, + "step": 50526 + }, + { + "epoch": 4.714658953065223, + "grad_norm": NaN, + "learning_rate": 3.475425451702655e-05, + "loss": 0.0, + "step": 50527 + }, + { + "epoch": 4.714752262760101, + "grad_norm": NaN, + "learning_rate": 3.474941320646639e-05, + "loss": 0.0, + "step": 50528 + }, + { + "epoch": 4.714845572454978, + "grad_norm": NaN, + "learning_rate": 3.474457218895433e-05, + "loss": 0.0, + "step": 50529 + }, + { + "epoch": 4.7149388821498555, + "grad_norm": NaN, + "learning_rate": 3.47397314645027e-05, + "loss": 0.0, + "step": 50530 + }, + { + "epoch": 4.715032191844733, + "grad_norm": NaN, + "learning_rate": 3.4734891033123766e-05, + "loss": 0.0, + "step": 50531 + }, + { + "epoch": 4.71512550153961, + "grad_norm": NaN, + "learning_rate": 3.473005089482987e-05, + "loss": 0.0, + "step": 50532 + }, + { + "epoch": 4.715218811234488, + "grad_norm": NaN, + "learning_rate": 3.4725211049633306e-05, + "loss": 0.0, + "step": 50533 + }, + { + "epoch": 4.715312120929364, + "grad_norm": NaN, + "learning_rate": 3.472037149754638e-05, + "loss": 0.0, + "step": 50534 + }, + { + "epoch": 4.715405430624242, + "grad_norm": NaN, + "learning_rate": 3.471553223858141e-05, + "loss": 0.0, + "step": 50535 + }, + { + "epoch": 4.715498740319119, + "grad_norm": NaN, + "learning_rate": 3.4710693272750695e-05, + "loss": 0.0, + "step": 50536 + }, + { + "epoch": 4.7155920500139965, + "grad_norm": NaN, + "learning_rate": 3.4705854600066515e-05, + "loss": 0.0, + "step": 50537 + }, + { + "epoch": 4.715685359708874, + "grad_norm": NaN, + "learning_rate": 3.4701016220541226e-05, + "loss": 0.0, + "step": 50538 + }, + { + "epoch": 4.715778669403751, + "grad_norm": NaN, + "learning_rate": 3.469617813418708e-05, + "loss": 0.0, + "step": 50539 + }, + { + "epoch": 4.715871979098628, + "grad_norm": NaN, + "learning_rate": 3.46913403410164e-05, + "loss": 0.0, + "step": 50540 + }, + { + "epoch": 4.715965288793505, + "grad_norm": NaN, + "learning_rate": 3.468650284104147e-05, + "loss": 0.0, + "step": 50541 + }, + { + "epoch": 4.716058598488383, + "grad_norm": NaN, + "learning_rate": 3.468166563427463e-05, + "loss": 0.0, + "step": 50542 + }, + { + "epoch": 4.71615190818326, + "grad_norm": NaN, + "learning_rate": 3.467682872072814e-05, + "loss": 0.0, + "step": 50543 + }, + { + "epoch": 4.7162452178781376, + "grad_norm": NaN, + "learning_rate": 3.46719921004143e-05, + "loss": 0.0, + "step": 50544 + }, + { + "epoch": 4.716338527573015, + "grad_norm": NaN, + "learning_rate": 3.466715577334545e-05, + "loss": 0.0, + "step": 50545 + }, + { + "epoch": 4.716431837267892, + "grad_norm": NaN, + "learning_rate": 3.466231973953384e-05, + "loss": 0.0, + "step": 50546 + }, + { + "epoch": 4.71652514696277, + "grad_norm": NaN, + "learning_rate": 3.4657483998991795e-05, + "loss": 0.0, + "step": 50547 + }, + { + "epoch": 4.716618456657646, + "grad_norm": NaN, + "learning_rate": 3.465264855173159e-05, + "loss": 0.0, + "step": 50548 + }, + { + "epoch": 4.716711766352524, + "grad_norm": NaN, + "learning_rate": 3.464781339776555e-05, + "loss": 0.0, + "step": 50549 + }, + { + "epoch": 4.716805076047401, + "grad_norm": NaN, + "learning_rate": 3.4642978537105936e-05, + "loss": 0.0, + "step": 50550 + }, + { + "epoch": 4.716898385742279, + "grad_norm": NaN, + "learning_rate": 3.463814396976506e-05, + "loss": 0.0, + "step": 50551 + }, + { + "epoch": 4.716991695437156, + "grad_norm": NaN, + "learning_rate": 3.4633309695755205e-05, + "loss": 0.0, + "step": 50552 + }, + { + "epoch": 4.717085005132033, + "grad_norm": NaN, + "learning_rate": 3.462847571508869e-05, + "loss": 0.0, + "step": 50553 + }, + { + "epoch": 4.717178314826911, + "grad_norm": NaN, + "learning_rate": 3.462364202777776e-05, + "loss": 0.0, + "step": 50554 + }, + { + "epoch": 4.717271624521787, + "grad_norm": NaN, + "learning_rate": 3.4618808633834744e-05, + "loss": 0.0, + "step": 50555 + }, + { + "epoch": 4.717364934216665, + "grad_norm": NaN, + "learning_rate": 3.4613975533271916e-05, + "loss": 0.0, + "step": 50556 + }, + { + "epoch": 4.717458243911542, + "grad_norm": NaN, + "learning_rate": 3.460914272610156e-05, + "loss": 0.0, + "step": 50557 + }, + { + "epoch": 4.71755155360642, + "grad_norm": NaN, + "learning_rate": 3.460431021233599e-05, + "loss": 0.0, + "step": 50558 + }, + { + "epoch": 4.717644863301297, + "grad_norm": NaN, + "learning_rate": 3.459947799198747e-05, + "loss": 0.0, + "step": 50559 + }, + { + "epoch": 4.7177381729961745, + "grad_norm": NaN, + "learning_rate": 3.4594646065068287e-05, + "loss": 0.0, + "step": 50560 + }, + { + "epoch": 4.717831482691052, + "grad_norm": NaN, + "learning_rate": 3.458981443159075e-05, + "loss": 0.0, + "step": 50561 + }, + { + "epoch": 4.717924792385929, + "grad_norm": NaN, + "learning_rate": 3.4584983091567116e-05, + "loss": 0.0, + "step": 50562 + }, + { + "epoch": 4.718018102080806, + "grad_norm": NaN, + "learning_rate": 3.4580152045009684e-05, + "loss": 0.0, + "step": 50563 + }, + { + "epoch": 4.718111411775683, + "grad_norm": NaN, + "learning_rate": 3.457532129193073e-05, + "loss": 0.0, + "step": 50564 + }, + { + "epoch": 4.718204721470561, + "grad_norm": NaN, + "learning_rate": 3.457049083234254e-05, + "loss": 0.0, + "step": 50565 + }, + { + "epoch": 4.718298031165438, + "grad_norm": NaN, + "learning_rate": 3.4565660666257404e-05, + "loss": 0.0, + "step": 50566 + }, + { + "epoch": 4.7183913408603155, + "grad_norm": NaN, + "learning_rate": 3.4560830793687597e-05, + "loss": 0.0, + "step": 50567 + }, + { + "epoch": 4.718484650555193, + "grad_norm": NaN, + "learning_rate": 3.4556001214645415e-05, + "loss": 0.0, + "step": 50568 + }, + { + "epoch": 4.71857796025007, + "grad_norm": NaN, + "learning_rate": 3.455117192914312e-05, + "loss": 0.0, + "step": 50569 + }, + { + "epoch": 4.718671269944947, + "grad_norm": NaN, + "learning_rate": 3.454634293719299e-05, + "loss": 0.0, + "step": 50570 + }, + { + "epoch": 4.718764579639824, + "grad_norm": NaN, + "learning_rate": 3.45415142388073e-05, + "loss": 0.0, + "step": 50571 + }, + { + "epoch": 4.718857889334702, + "grad_norm": NaN, + "learning_rate": 3.4536685833998353e-05, + "loss": 0.0, + "step": 50572 + }, + { + "epoch": 4.718951199029579, + "grad_norm": NaN, + "learning_rate": 3.4531857722778414e-05, + "loss": 0.0, + "step": 50573 + }, + { + "epoch": 4.719044508724457, + "grad_norm": NaN, + "learning_rate": 3.452702990515975e-05, + "loss": 0.0, + "step": 50574 + }, + { + "epoch": 4.719137818419334, + "grad_norm": NaN, + "learning_rate": 3.452220238115464e-05, + "loss": 0.0, + "step": 50575 + }, + { + "epoch": 4.719231128114211, + "grad_norm": NaN, + "learning_rate": 3.451737515077537e-05, + "loss": 0.0, + "step": 50576 + }, + { + "epoch": 4.719324437809089, + "grad_norm": NaN, + "learning_rate": 3.451254821403419e-05, + "loss": 0.0, + "step": 50577 + }, + { + "epoch": 4.719417747503965, + "grad_norm": NaN, + "learning_rate": 3.4507721570943404e-05, + "loss": 0.0, + "step": 50578 + }, + { + "epoch": 4.719511057198843, + "grad_norm": NaN, + "learning_rate": 3.4502895221515275e-05, + "loss": 0.0, + "step": 50579 + }, + { + "epoch": 4.71960436689372, + "grad_norm": NaN, + "learning_rate": 3.449806916576207e-05, + "loss": 0.0, + "step": 50580 + }, + { + "epoch": 4.719697676588598, + "grad_norm": NaN, + "learning_rate": 3.4493243403696046e-05, + "loss": 0.0, + "step": 50581 + }, + { + "epoch": 4.719790986283475, + "grad_norm": NaN, + "learning_rate": 3.448841793532949e-05, + "loss": 0.0, + "step": 50582 + }, + { + "epoch": 4.7198842959783525, + "grad_norm": NaN, + "learning_rate": 3.4483592760674694e-05, + "loss": 0.0, + "step": 50583 + }, + { + "epoch": 4.719977605673229, + "grad_norm": NaN, + "learning_rate": 3.447876787974384e-05, + "loss": 0.0, + "step": 50584 + }, + { + "epoch": 4.720070915368106, + "grad_norm": NaN, + "learning_rate": 3.447394329254934e-05, + "loss": 0.0, + "step": 50585 + }, + { + "epoch": 4.720164225062984, + "grad_norm": NaN, + "learning_rate": 3.446911899910335e-05, + "loss": 0.0, + "step": 50586 + }, + { + "epoch": 4.720257534757861, + "grad_norm": NaN, + "learning_rate": 3.4464294999418116e-05, + "loss": 0.0, + "step": 50587 + }, + { + "epoch": 4.720350844452739, + "grad_norm": NaN, + "learning_rate": 3.445947129350606e-05, + "loss": 0.0, + "step": 50588 + }, + { + "epoch": 4.720444154147616, + "grad_norm": NaN, + "learning_rate": 3.445464788137929e-05, + "loss": 0.0, + "step": 50589 + }, + { + "epoch": 4.7205374638424935, + "grad_norm": NaN, + "learning_rate": 3.4449824763050114e-05, + "loss": 0.0, + "step": 50590 + }, + { + "epoch": 4.720630773537371, + "grad_norm": NaN, + "learning_rate": 3.444500193853082e-05, + "loss": 0.0, + "step": 50591 + }, + { + "epoch": 4.7207240832322475, + "grad_norm": NaN, + "learning_rate": 3.444017940783364e-05, + "loss": 0.0, + "step": 50592 + }, + { + "epoch": 4.720817392927125, + "grad_norm": NaN, + "learning_rate": 3.4435357170970864e-05, + "loss": 0.0, + "step": 50593 + }, + { + "epoch": 4.720910702622002, + "grad_norm": NaN, + "learning_rate": 3.4430535227954744e-05, + "loss": 0.0, + "step": 50594 + }, + { + "epoch": 4.72100401231688, + "grad_norm": NaN, + "learning_rate": 3.442571357879752e-05, + "loss": 0.0, + "step": 50595 + }, + { + "epoch": 4.721097322011757, + "grad_norm": NaN, + "learning_rate": 3.4420892223511496e-05, + "loss": 0.0, + "step": 50596 + }, + { + "epoch": 4.7211906317066346, + "grad_norm": NaN, + "learning_rate": 3.4416071162108884e-05, + "loss": 0.0, + "step": 50597 + }, + { + "epoch": 4.721283941401512, + "grad_norm": NaN, + "learning_rate": 3.441125039460198e-05, + "loss": 0.0, + "step": 50598 + }, + { + "epoch": 4.7213772510963885, + "grad_norm": NaN, + "learning_rate": 3.440642992100301e-05, + "loss": 0.0, + "step": 50599 + }, + { + "epoch": 4.721470560791266, + "grad_norm": NaN, + "learning_rate": 3.440160974132426e-05, + "loss": 0.0, + "step": 50600 + }, + { + "epoch": 4.721563870486143, + "grad_norm": NaN, + "learning_rate": 3.439678985557797e-05, + "loss": 0.0, + "step": 50601 + }, + { + "epoch": 4.721657180181021, + "grad_norm": NaN, + "learning_rate": 3.4391970263776404e-05, + "loss": 0.0, + "step": 50602 + }, + { + "epoch": 4.721750489875898, + "grad_norm": NaN, + "learning_rate": 3.438715096593181e-05, + "loss": 0.0, + "step": 50603 + }, + { + "epoch": 4.721843799570776, + "grad_norm": NaN, + "learning_rate": 3.4382331962056446e-05, + "loss": 0.0, + "step": 50604 + }, + { + "epoch": 4.721937109265653, + "grad_norm": NaN, + "learning_rate": 3.437751325216256e-05, + "loss": 0.0, + "step": 50605 + }, + { + "epoch": 4.72203041896053, + "grad_norm": NaN, + "learning_rate": 3.4372694836262395e-05, + "loss": 0.0, + "step": 50606 + }, + { + "epoch": 4.722123728655407, + "grad_norm": NaN, + "learning_rate": 3.436787671436824e-05, + "loss": 0.0, + "step": 50607 + }, + { + "epoch": 4.722217038350284, + "grad_norm": NaN, + "learning_rate": 3.436305888649231e-05, + "loss": 0.0, + "step": 50608 + }, + { + "epoch": 4.722310348045162, + "grad_norm": NaN, + "learning_rate": 3.4358241352646865e-05, + "loss": 0.0, + "step": 50609 + }, + { + "epoch": 4.722403657740039, + "grad_norm": NaN, + "learning_rate": 3.4353424112844156e-05, + "loss": 0.0, + "step": 50610 + }, + { + "epoch": 4.722496967434917, + "grad_norm": NaN, + "learning_rate": 3.434860716709645e-05, + "loss": 0.0, + "step": 50611 + }, + { + "epoch": 4.722590277129794, + "grad_norm": NaN, + "learning_rate": 3.434379051541596e-05, + "loss": 0.0, + "step": 50612 + }, + { + "epoch": 4.722683586824671, + "grad_norm": NaN, + "learning_rate": 3.433897415781495e-05, + "loss": 0.0, + "step": 50613 + }, + { + "epoch": 4.722776896519548, + "grad_norm": NaN, + "learning_rate": 3.433415809430567e-05, + "loss": 0.0, + "step": 50614 + }, + { + "epoch": 4.722870206214425, + "grad_norm": NaN, + "learning_rate": 3.432934232490037e-05, + "loss": 0.0, + "step": 50615 + }, + { + "epoch": 4.722963515909303, + "grad_norm": NaN, + "learning_rate": 3.432452684961128e-05, + "loss": 0.0, + "step": 50616 + }, + { + "epoch": 4.72305682560418, + "grad_norm": NaN, + "learning_rate": 3.4319711668450646e-05, + "loss": 0.0, + "step": 50617 + }, + { + "epoch": 4.723150135299058, + "grad_norm": NaN, + "learning_rate": 3.431489678143074e-05, + "loss": 0.0, + "step": 50618 + }, + { + "epoch": 4.723243444993935, + "grad_norm": NaN, + "learning_rate": 3.431008218856377e-05, + "loss": 0.0, + "step": 50619 + }, + { + "epoch": 4.7233367546888125, + "grad_norm": NaN, + "learning_rate": 3.430526788986199e-05, + "loss": 0.0, + "step": 50620 + }, + { + "epoch": 4.72343006438369, + "grad_norm": NaN, + "learning_rate": 3.430045388533765e-05, + "loss": 0.0, + "step": 50621 + }, + { + "epoch": 4.7235233740785665, + "grad_norm": NaN, + "learning_rate": 3.429564017500292e-05, + "loss": 0.0, + "step": 50622 + }, + { + "epoch": 4.723616683773444, + "grad_norm": NaN, + "learning_rate": 3.429082675887022e-05, + "loss": 0.0, + "step": 50623 + }, + { + "epoch": 4.723709993468321, + "grad_norm": NaN, + "learning_rate": 3.428601363695159e-05, + "loss": 0.0, + "step": 50624 + }, + { + "epoch": 4.723803303163199, + "grad_norm": NaN, + "learning_rate": 3.428120080925934e-05, + "loss": 0.0, + "step": 50625 + }, + { + "epoch": 4.723896612858076, + "grad_norm": NaN, + "learning_rate": 3.42763882758058e-05, + "loss": 0.0, + "step": 50626 + }, + { + "epoch": 4.723989922552954, + "grad_norm": NaN, + "learning_rate": 3.427157603660308e-05, + "loss": 0.0, + "step": 50627 + }, + { + "epoch": 4.72408323224783, + "grad_norm": NaN, + "learning_rate": 3.42667640916634e-05, + "loss": 0.0, + "step": 50628 + }, + { + "epoch": 4.7241765419427075, + "grad_norm": NaN, + "learning_rate": 3.4261952440999166e-05, + "loss": 0.0, + "step": 50629 + }, + { + "epoch": 4.724269851637585, + "grad_norm": NaN, + "learning_rate": 3.4257141084622433e-05, + "loss": 0.0, + "step": 50630 + }, + { + "epoch": 4.724363161332462, + "grad_norm": NaN, + "learning_rate": 3.4252330022545475e-05, + "loss": 0.0, + "step": 50631 + }, + { + "epoch": 4.72445647102734, + "grad_norm": NaN, + "learning_rate": 3.424751925478066e-05, + "loss": 0.0, + "step": 50632 + }, + { + "epoch": 4.724549780722217, + "grad_norm": NaN, + "learning_rate": 3.424270878134006e-05, + "loss": 0.0, + "step": 50633 + }, + { + "epoch": 4.724643090417095, + "grad_norm": NaN, + "learning_rate": 3.4237898602235907e-05, + "loss": 0.0, + "step": 50634 + }, + { + "epoch": 4.724736400111972, + "grad_norm": NaN, + "learning_rate": 3.4233088717480576e-05, + "loss": 0.0, + "step": 50635 + }, + { + "epoch": 4.724829709806849, + "grad_norm": NaN, + "learning_rate": 3.4228279127086175e-05, + "loss": 0.0, + "step": 50636 + }, + { + "epoch": 4.724923019501726, + "grad_norm": NaN, + "learning_rate": 3.422346983106495e-05, + "loss": 0.0, + "step": 50637 + }, + { + "epoch": 4.725016329196603, + "grad_norm": NaN, + "learning_rate": 3.421866082942916e-05, + "loss": 0.0, + "step": 50638 + }, + { + "epoch": 4.725109638891481, + "grad_norm": NaN, + "learning_rate": 3.4213852122191004e-05, + "loss": 0.0, + "step": 50639 + }, + { + "epoch": 4.725202948586358, + "grad_norm": NaN, + "learning_rate": 3.420904370936273e-05, + "loss": 0.0, + "step": 50640 + }, + { + "epoch": 4.725296258281236, + "grad_norm": NaN, + "learning_rate": 3.420423559095658e-05, + "loss": 0.0, + "step": 50641 + }, + { + "epoch": 4.725389567976113, + "grad_norm": NaN, + "learning_rate": 3.419942776698474e-05, + "loss": 0.0, + "step": 50642 + }, + { + "epoch": 4.72548287767099, + "grad_norm": NaN, + "learning_rate": 3.419462023745945e-05, + "loss": 0.0, + "step": 50643 + }, + { + "epoch": 4.725576187365867, + "grad_norm": NaN, + "learning_rate": 3.418981300239293e-05, + "loss": 0.0, + "step": 50644 + }, + { + "epoch": 4.7256694970607445, + "grad_norm": NaN, + "learning_rate": 3.4185006061797425e-05, + "loss": 0.0, + "step": 50645 + }, + { + "epoch": 4.725762806755622, + "grad_norm": NaN, + "learning_rate": 3.4180199415685125e-05, + "loss": 0.0, + "step": 50646 + }, + { + "epoch": 4.725856116450499, + "grad_norm": NaN, + "learning_rate": 3.417539306406829e-05, + "loss": 0.0, + "step": 50647 + }, + { + "epoch": 4.725949426145377, + "grad_norm": NaN, + "learning_rate": 3.4170587006959106e-05, + "loss": 0.0, + "step": 50648 + }, + { + "epoch": 4.726042735840254, + "grad_norm": NaN, + "learning_rate": 3.416578124436981e-05, + "loss": 0.0, + "step": 50649 + }, + { + "epoch": 4.726136045535132, + "grad_norm": NaN, + "learning_rate": 3.4160975776312634e-05, + "loss": 0.0, + "step": 50650 + }, + { + "epoch": 4.726229355230008, + "grad_norm": NaN, + "learning_rate": 3.415617060279976e-05, + "loss": 0.0, + "step": 50651 + }, + { + "epoch": 4.7263226649248855, + "grad_norm": NaN, + "learning_rate": 3.415136572384345e-05, + "loss": 0.0, + "step": 50652 + }, + { + "epoch": 4.726415974619763, + "grad_norm": NaN, + "learning_rate": 3.414656113945589e-05, + "loss": 0.0, + "step": 50653 + }, + { + "epoch": 4.72650928431464, + "grad_norm": NaN, + "learning_rate": 3.414175684964931e-05, + "loss": 0.0, + "step": 50654 + }, + { + "epoch": 4.726602594009518, + "grad_norm": NaN, + "learning_rate": 3.413695285443591e-05, + "loss": 0.0, + "step": 50655 + }, + { + "epoch": 4.726695903704395, + "grad_norm": NaN, + "learning_rate": 3.413214915382794e-05, + "loss": 0.0, + "step": 50656 + }, + { + "epoch": 4.726789213399272, + "grad_norm": NaN, + "learning_rate": 3.4127345747837574e-05, + "loss": 0.0, + "step": 50657 + }, + { + "epoch": 4.726882523094149, + "grad_norm": NaN, + "learning_rate": 3.412254263647705e-05, + "loss": 0.0, + "step": 50658 + }, + { + "epoch": 4.7269758327890266, + "grad_norm": NaN, + "learning_rate": 3.4117739819758575e-05, + "loss": 0.0, + "step": 50659 + }, + { + "epoch": 4.727069142483904, + "grad_norm": NaN, + "learning_rate": 3.411293729769431e-05, + "loss": 0.0, + "step": 50660 + }, + { + "epoch": 4.727162452178781, + "grad_norm": NaN, + "learning_rate": 3.410813507029661e-05, + "loss": 0.0, + "step": 50661 + }, + { + "epoch": 4.727255761873659, + "grad_norm": NaN, + "learning_rate": 3.410333313757755e-05, + "loss": 0.0, + "step": 50662 + }, + { + "epoch": 4.727349071568536, + "grad_norm": NaN, + "learning_rate": 3.409853149954933e-05, + "loss": 0.0, + "step": 50663 + }, + { + "epoch": 4.727442381263414, + "grad_norm": NaN, + "learning_rate": 3.409373015622431e-05, + "loss": 0.0, + "step": 50664 + }, + { + "epoch": 4.72753569095829, + "grad_norm": NaN, + "learning_rate": 3.408892910761456e-05, + "loss": 0.0, + "step": 50665 + }, + { + "epoch": 4.727629000653168, + "grad_norm": NaN, + "learning_rate": 3.408412835373228e-05, + "loss": 0.0, + "step": 50666 + }, + { + "epoch": 4.727722310348045, + "grad_norm": NaN, + "learning_rate": 3.407932789458981e-05, + "loss": 0.0, + "step": 50667 + }, + { + "epoch": 4.727815620042922, + "grad_norm": NaN, + "learning_rate": 3.407452773019923e-05, + "loss": 0.0, + "step": 50668 + }, + { + "epoch": 4.7279089297378, + "grad_norm": NaN, + "learning_rate": 3.406972786057273e-05, + "loss": 0.0, + "step": 50669 + }, + { + "epoch": 4.728002239432677, + "grad_norm": NaN, + "learning_rate": 3.4064928285722675e-05, + "loss": 0.0, + "step": 50670 + }, + { + "epoch": 4.728095549127555, + "grad_norm": NaN, + "learning_rate": 3.4060129005661116e-05, + "loss": 0.0, + "step": 50671 + }, + { + "epoch": 4.728188858822431, + "grad_norm": NaN, + "learning_rate": 3.405533002040027e-05, + "loss": 0.0, + "step": 50672 + }, + { + "epoch": 4.728282168517309, + "grad_norm": NaN, + "learning_rate": 3.405053132995245e-05, + "loss": 0.0, + "step": 50673 + }, + { + "epoch": 4.728375478212186, + "grad_norm": NaN, + "learning_rate": 3.404573293432974e-05, + "loss": 0.0, + "step": 50674 + }, + { + "epoch": 4.7284687879070635, + "grad_norm": NaN, + "learning_rate": 3.4040934833544335e-05, + "loss": 0.0, + "step": 50675 + }, + { + "epoch": 4.728562097601941, + "grad_norm": NaN, + "learning_rate": 3.403613702760858e-05, + "loss": 0.0, + "step": 50676 + }, + { + "epoch": 4.728655407296818, + "grad_norm": NaN, + "learning_rate": 3.403133951653452e-05, + "loss": 0.0, + "step": 50677 + }, + { + "epoch": 4.728748716991696, + "grad_norm": NaN, + "learning_rate": 3.4026542300334375e-05, + "loss": 0.0, + "step": 50678 + }, + { + "epoch": 4.728842026686573, + "grad_norm": NaN, + "learning_rate": 3.4021745379020474e-05, + "loss": 0.0, + "step": 50679 + }, + { + "epoch": 4.72893533638145, + "grad_norm": NaN, + "learning_rate": 3.401694875260487e-05, + "loss": 0.0, + "step": 50680 + }, + { + "epoch": 4.729028646076327, + "grad_norm": NaN, + "learning_rate": 3.40121524210998e-05, + "loss": 0.0, + "step": 50681 + }, + { + "epoch": 4.7291219557712045, + "grad_norm": NaN, + "learning_rate": 3.400735638451747e-05, + "loss": 0.0, + "step": 50682 + }, + { + "epoch": 4.729215265466082, + "grad_norm": NaN, + "learning_rate": 3.400256064287009e-05, + "loss": 0.0, + "step": 50683 + }, + { + "epoch": 4.729308575160959, + "grad_norm": NaN, + "learning_rate": 3.399776519616982e-05, + "loss": 0.0, + "step": 50684 + }, + { + "epoch": 4.729401884855837, + "grad_norm": NaN, + "learning_rate": 3.399297004442888e-05, + "loss": 0.0, + "step": 50685 + }, + { + "epoch": 4.729495194550713, + "grad_norm": NaN, + "learning_rate": 3.398817518765945e-05, + "loss": 0.0, + "step": 50686 + }, + { + "epoch": 4.729588504245591, + "grad_norm": NaN, + "learning_rate": 3.3983380625873725e-05, + "loss": 0.0, + "step": 50687 + }, + { + "epoch": 4.729681813940468, + "grad_norm": NaN, + "learning_rate": 3.3978586359083906e-05, + "loss": 0.0, + "step": 50688 + }, + { + "epoch": 4.729775123635346, + "grad_norm": NaN, + "learning_rate": 3.3973792387302164e-05, + "loss": 0.0, + "step": 50689 + }, + { + "epoch": 4.729868433330223, + "grad_norm": NaN, + "learning_rate": 3.3968998710540704e-05, + "loss": 0.0, + "step": 50690 + }, + { + "epoch": 4.7299617430251, + "grad_norm": NaN, + "learning_rate": 3.39642053288117e-05, + "loss": 0.0, + "step": 50691 + }, + { + "epoch": 4.730055052719978, + "grad_norm": NaN, + "learning_rate": 3.395941224212735e-05, + "loss": 0.0, + "step": 50692 + }, + { + "epoch": 4.730148362414855, + "grad_norm": NaN, + "learning_rate": 3.395461945049984e-05, + "loss": 0.0, + "step": 50693 + }, + { + "epoch": 4.730241672109733, + "grad_norm": NaN, + "learning_rate": 3.394982695394137e-05, + "loss": 0.0, + "step": 50694 + }, + { + "epoch": 4.730334981804609, + "grad_norm": NaN, + "learning_rate": 3.394503475246411e-05, + "loss": 0.0, + "step": 50695 + }, + { + "epoch": 4.730428291499487, + "grad_norm": NaN, + "learning_rate": 3.394024284608025e-05, + "loss": 0.0, + "step": 50696 + }, + { + "epoch": 4.730521601194364, + "grad_norm": NaN, + "learning_rate": 3.3935451234801965e-05, + "loss": 0.0, + "step": 50697 + }, + { + "epoch": 4.7306149108892415, + "grad_norm": NaN, + "learning_rate": 3.393065991864141e-05, + "loss": 0.0, + "step": 50698 + }, + { + "epoch": 4.730708220584119, + "grad_norm": NaN, + "learning_rate": 3.392586889761088e-05, + "loss": 0.0, + "step": 50699 + }, + { + "epoch": 4.730801530278996, + "grad_norm": NaN, + "learning_rate": 3.392107817172244e-05, + "loss": 0.0, + "step": 50700 + }, + { + "epoch": 4.730894839973873, + "grad_norm": NaN, + "learning_rate": 3.391628774098828e-05, + "loss": 0.0, + "step": 50701 + }, + { + "epoch": 4.73098814966875, + "grad_norm": NaN, + "learning_rate": 3.39114976054207e-05, + "loss": 0.0, + "step": 50702 + }, + { + "epoch": 4.731081459363628, + "grad_norm": NaN, + "learning_rate": 3.3906707765031746e-05, + "loss": 0.0, + "step": 50703 + }, + { + "epoch": 4.731174769058505, + "grad_norm": NaN, + "learning_rate": 3.39019182198336e-05, + "loss": 0.0, + "step": 50704 + }, + { + "epoch": 4.7312680787533825, + "grad_norm": NaN, + "learning_rate": 3.3897128969838584e-05, + "loss": 0.0, + "step": 50705 + }, + { + "epoch": 4.73136138844826, + "grad_norm": NaN, + "learning_rate": 3.389234001505872e-05, + "loss": 0.0, + "step": 50706 + }, + { + "epoch": 4.731454698143137, + "grad_norm": NaN, + "learning_rate": 3.3887551355506196e-05, + "loss": 0.0, + "step": 50707 + }, + { + "epoch": 4.731548007838015, + "grad_norm": NaN, + "learning_rate": 3.3882762991193336e-05, + "loss": 0.0, + "step": 50708 + }, + { + "epoch": 4.731641317532891, + "grad_norm": NaN, + "learning_rate": 3.387797492213215e-05, + "loss": 0.0, + "step": 50709 + }, + { + "epoch": 4.731734627227769, + "grad_norm": NaN, + "learning_rate": 3.3873187148334846e-05, + "loss": 0.0, + "step": 50710 + }, + { + "epoch": 4.731827936922646, + "grad_norm": NaN, + "learning_rate": 3.386839966981371e-05, + "loss": 0.0, + "step": 50711 + }, + { + "epoch": 4.731921246617524, + "grad_norm": NaN, + "learning_rate": 3.3863612486580785e-05, + "loss": 0.0, + "step": 50712 + }, + { + "epoch": 4.732014556312401, + "grad_norm": NaN, + "learning_rate": 3.3858825598648264e-05, + "loss": 0.0, + "step": 50713 + }, + { + "epoch": 4.732107866007278, + "grad_norm": NaN, + "learning_rate": 3.385403900602841e-05, + "loss": 0.0, + "step": 50714 + }, + { + "epoch": 4.732201175702156, + "grad_norm": NaN, + "learning_rate": 3.3849252708733314e-05, + "loss": 0.0, + "step": 50715 + }, + { + "epoch": 4.732294485397032, + "grad_norm": NaN, + "learning_rate": 3.3844466706775104e-05, + "loss": 0.0, + "step": 50716 + }, + { + "epoch": 4.73238779509191, + "grad_norm": NaN, + "learning_rate": 3.383968100016609e-05, + "loss": 0.0, + "step": 50717 + }, + { + "epoch": 4.732481104786787, + "grad_norm": NaN, + "learning_rate": 3.383489558891832e-05, + "loss": 0.0, + "step": 50718 + }, + { + "epoch": 4.732574414481665, + "grad_norm": NaN, + "learning_rate": 3.383011047304395e-05, + "loss": 0.0, + "step": 50719 + }, + { + "epoch": 4.732667724176542, + "grad_norm": NaN, + "learning_rate": 3.382532565255529e-05, + "loss": 0.0, + "step": 50720 + }, + { + "epoch": 4.732761033871419, + "grad_norm": NaN, + "learning_rate": 3.3820541127464375e-05, + "loss": 0.0, + "step": 50721 + }, + { + "epoch": 4.732854343566297, + "grad_norm": NaN, + "learning_rate": 3.3815756897783355e-05, + "loss": 0.0, + "step": 50722 + }, + { + "epoch": 4.732947653261174, + "grad_norm": NaN, + "learning_rate": 3.381097296352454e-05, + "loss": 0.0, + "step": 50723 + }, + { + "epoch": 4.733040962956051, + "grad_norm": NaN, + "learning_rate": 3.380618932469996e-05, + "loss": 0.0, + "step": 50724 + }, + { + "epoch": 4.733134272650928, + "grad_norm": NaN, + "learning_rate": 3.380140598132182e-05, + "loss": 0.0, + "step": 50725 + }, + { + "epoch": 4.733227582345806, + "grad_norm": NaN, + "learning_rate": 3.3796622933402276e-05, + "loss": 0.0, + "step": 50726 + }, + { + "epoch": 4.733320892040683, + "grad_norm": NaN, + "learning_rate": 3.3791840180953505e-05, + "loss": 0.0, + "step": 50727 + }, + { + "epoch": 4.7334142017355605, + "grad_norm": NaN, + "learning_rate": 3.378705772398768e-05, + "loss": 0.0, + "step": 50728 + }, + { + "epoch": 4.733507511430438, + "grad_norm": NaN, + "learning_rate": 3.3782275562516906e-05, + "loss": 0.0, + "step": 50729 + }, + { + "epoch": 4.733600821125314, + "grad_norm": NaN, + "learning_rate": 3.377749369655341e-05, + "loss": 0.0, + "step": 50730 + }, + { + "epoch": 4.733694130820192, + "grad_norm": NaN, + "learning_rate": 3.37727121261093e-05, + "loss": 0.0, + "step": 50731 + }, + { + "epoch": 4.733787440515069, + "grad_norm": NaN, + "learning_rate": 3.376793085119676e-05, + "loss": 0.0, + "step": 50732 + }, + { + "epoch": 4.733880750209947, + "grad_norm": NaN, + "learning_rate": 3.376314987182791e-05, + "loss": 0.0, + "step": 50733 + }, + { + "epoch": 4.733974059904824, + "grad_norm": NaN, + "learning_rate": 3.3758369188015e-05, + "loss": 0.0, + "step": 50734 + }, + { + "epoch": 4.7340673695997015, + "grad_norm": NaN, + "learning_rate": 3.37535887997701e-05, + "loss": 0.0, + "step": 50735 + }, + { + "epoch": 4.734160679294579, + "grad_norm": NaN, + "learning_rate": 3.374880870710535e-05, + "loss": 0.0, + "step": 50736 + }, + { + "epoch": 4.734253988989456, + "grad_norm": NaN, + "learning_rate": 3.374402891003302e-05, + "loss": 0.0, + "step": 50737 + }, + { + "epoch": 4.734347298684334, + "grad_norm": NaN, + "learning_rate": 3.373924940856514e-05, + "loss": 0.0, + "step": 50738 + }, + { + "epoch": 4.73444060837921, + "grad_norm": NaN, + "learning_rate": 3.373447020271387e-05, + "loss": 0.0, + "step": 50739 + }, + { + "epoch": 4.734533918074088, + "grad_norm": NaN, + "learning_rate": 3.372969129249149e-05, + "loss": 0.0, + "step": 50740 + }, + { + "epoch": 4.734627227768965, + "grad_norm": NaN, + "learning_rate": 3.372491267791002e-05, + "loss": 0.0, + "step": 50741 + }, + { + "epoch": 4.734720537463843, + "grad_norm": NaN, + "learning_rate": 3.372013435898162e-05, + "loss": 0.0, + "step": 50742 + }, + { + "epoch": 4.73481384715872, + "grad_norm": NaN, + "learning_rate": 3.3715356335718546e-05, + "loss": 0.0, + "step": 50743 + }, + { + "epoch": 4.734907156853597, + "grad_norm": NaN, + "learning_rate": 3.3710578608132836e-05, + "loss": 0.0, + "step": 50744 + }, + { + "epoch": 4.735000466548474, + "grad_norm": NaN, + "learning_rate": 3.370580117623664e-05, + "loss": 0.0, + "step": 50745 + }, + { + "epoch": 4.735093776243351, + "grad_norm": NaN, + "learning_rate": 3.3701024040042216e-05, + "loss": 0.0, + "step": 50746 + }, + { + "epoch": 4.735187085938229, + "grad_norm": NaN, + "learning_rate": 3.369624719956159e-05, + "loss": 0.0, + "step": 50747 + }, + { + "epoch": 4.735280395633106, + "grad_norm": NaN, + "learning_rate": 3.369147065480694e-05, + "loss": 0.0, + "step": 50748 + }, + { + "epoch": 4.735373705327984, + "grad_norm": NaN, + "learning_rate": 3.3686694405790494e-05, + "loss": 0.0, + "step": 50749 + }, + { + "epoch": 4.735467015022861, + "grad_norm": NaN, + "learning_rate": 3.368191845252428e-05, + "loss": 0.0, + "step": 50750 + }, + { + "epoch": 4.7355603247177385, + "grad_norm": NaN, + "learning_rate": 3.3677142795020453e-05, + "loss": 0.0, + "step": 50751 + }, + { + "epoch": 4.735653634412616, + "grad_norm": NaN, + "learning_rate": 3.367236743329129e-05, + "loss": 0.0, + "step": 50752 + }, + { + "epoch": 4.735746944107492, + "grad_norm": NaN, + "learning_rate": 3.366759236734879e-05, + "loss": 0.0, + "step": 50753 + }, + { + "epoch": 4.73584025380237, + "grad_norm": NaN, + "learning_rate": 3.3662817597205105e-05, + "loss": 0.0, + "step": 50754 + }, + { + "epoch": 4.735933563497247, + "grad_norm": NaN, + "learning_rate": 3.3658043122872496e-05, + "loss": 0.0, + "step": 50755 + }, + { + "epoch": 4.736026873192125, + "grad_norm": NaN, + "learning_rate": 3.365326894436297e-05, + "loss": 0.0, + "step": 50756 + }, + { + "epoch": 4.736120182887002, + "grad_norm": NaN, + "learning_rate": 3.3648495061688686e-05, + "loss": 0.0, + "step": 50757 + }, + { + "epoch": 4.7362134925818795, + "grad_norm": NaN, + "learning_rate": 3.364372147486188e-05, + "loss": 0.0, + "step": 50758 + }, + { + "epoch": 4.736306802276757, + "grad_norm": NaN, + "learning_rate": 3.363894818389459e-05, + "loss": 0.0, + "step": 50759 + }, + { + "epoch": 4.7364001119716335, + "grad_norm": NaN, + "learning_rate": 3.363417518879895e-05, + "loss": 0.0, + "step": 50760 + }, + { + "epoch": 4.736493421666511, + "grad_norm": NaN, + "learning_rate": 3.362940248958721e-05, + "loss": 0.0, + "step": 50761 + }, + { + "epoch": 4.736586731361388, + "grad_norm": NaN, + "learning_rate": 3.3624630086271395e-05, + "loss": 0.0, + "step": 50762 + }, + { + "epoch": 4.736680041056266, + "grad_norm": NaN, + "learning_rate": 3.3619857978863626e-05, + "loss": 0.0, + "step": 50763 + }, + { + "epoch": 4.736773350751143, + "grad_norm": NaN, + "learning_rate": 3.361508616737616e-05, + "loss": 0.0, + "step": 50764 + }, + { + "epoch": 4.736866660446021, + "grad_norm": NaN, + "learning_rate": 3.361031465182102e-05, + "loss": 0.0, + "step": 50765 + }, + { + "epoch": 4.736959970140898, + "grad_norm": NaN, + "learning_rate": 3.3605543432210325e-05, + "loss": 0.0, + "step": 50766 + }, + { + "epoch": 4.737053279835775, + "grad_norm": NaN, + "learning_rate": 3.360077250855635e-05, + "loss": 0.0, + "step": 50767 + }, + { + "epoch": 4.737146589530652, + "grad_norm": NaN, + "learning_rate": 3.359600188087108e-05, + "loss": 0.0, + "step": 50768 + }, + { + "epoch": 4.737239899225529, + "grad_norm": NaN, + "learning_rate": 3.359123154916669e-05, + "loss": 0.0, + "step": 50769 + }, + { + "epoch": 4.737333208920407, + "grad_norm": NaN, + "learning_rate": 3.358646151345532e-05, + "loss": 0.0, + "step": 50770 + }, + { + "epoch": 4.737426518615284, + "grad_norm": NaN, + "learning_rate": 3.358169177374905e-05, + "loss": 0.0, + "step": 50771 + }, + { + "epoch": 4.737519828310162, + "grad_norm": NaN, + "learning_rate": 3.3576922330060144e-05, + "loss": 0.0, + "step": 50772 + }, + { + "epoch": 4.737613138005039, + "grad_norm": NaN, + "learning_rate": 3.357215318240059e-05, + "loss": 0.0, + "step": 50773 + }, + { + "epoch": 4.737706447699916, + "grad_norm": NaN, + "learning_rate": 3.356738433078252e-05, + "loss": 0.0, + "step": 50774 + }, + { + "epoch": 4.737799757394793, + "grad_norm": NaN, + "learning_rate": 3.356261577521819e-05, + "loss": 0.0, + "step": 50775 + }, + { + "epoch": 4.73789306708967, + "grad_norm": NaN, + "learning_rate": 3.35578475157196e-05, + "loss": 0.0, + "step": 50776 + }, + { + "epoch": 4.737986376784548, + "grad_norm": NaN, + "learning_rate": 3.355307955229885e-05, + "loss": 0.0, + "step": 50777 + }, + { + "epoch": 4.738079686479425, + "grad_norm": NaN, + "learning_rate": 3.354831188496822e-05, + "loss": 0.0, + "step": 50778 + }, + { + "epoch": 4.738172996174303, + "grad_norm": NaN, + "learning_rate": 3.35435445137397e-05, + "loss": 0.0, + "step": 50779 + }, + { + "epoch": 4.73826630586918, + "grad_norm": NaN, + "learning_rate": 3.3538777438625416e-05, + "loss": 0.0, + "step": 50780 + }, + { + "epoch": 4.7383596155640575, + "grad_norm": NaN, + "learning_rate": 3.353401065963758e-05, + "loss": 0.0, + "step": 50781 + }, + { + "epoch": 4.738452925258934, + "grad_norm": NaN, + "learning_rate": 3.3529244176788235e-05, + "loss": 0.0, + "step": 50782 + }, + { + "epoch": 4.738546234953811, + "grad_norm": NaN, + "learning_rate": 3.352447799008947e-05, + "loss": 0.0, + "step": 50783 + }, + { + "epoch": 4.738639544648689, + "grad_norm": NaN, + "learning_rate": 3.3519712099553534e-05, + "loss": 0.0, + "step": 50784 + }, + { + "epoch": 4.738732854343566, + "grad_norm": NaN, + "learning_rate": 3.351494650519244e-05, + "loss": 0.0, + "step": 50785 + }, + { + "epoch": 4.738826164038444, + "grad_norm": NaN, + "learning_rate": 3.351018120701826e-05, + "loss": 0.0, + "step": 50786 + }, + { + "epoch": 4.738919473733321, + "grad_norm": NaN, + "learning_rate": 3.350541620504328e-05, + "loss": 0.0, + "step": 50787 + }, + { + "epoch": 4.7390127834281985, + "grad_norm": NaN, + "learning_rate": 3.3500651499279493e-05, + "loss": 0.0, + "step": 50788 + }, + { + "epoch": 4.739106093123075, + "grad_norm": NaN, + "learning_rate": 3.349588708973899e-05, + "loss": 0.0, + "step": 50789 + }, + { + "epoch": 4.7391994028179525, + "grad_norm": NaN, + "learning_rate": 3.349112297643402e-05, + "loss": 0.0, + "step": 50790 + }, + { + "epoch": 4.73929271251283, + "grad_norm": NaN, + "learning_rate": 3.3486359159376553e-05, + "loss": 0.0, + "step": 50791 + }, + { + "epoch": 4.739386022207707, + "grad_norm": NaN, + "learning_rate": 3.348159563857874e-05, + "loss": 0.0, + "step": 50792 + }, + { + "epoch": 4.739479331902585, + "grad_norm": NaN, + "learning_rate": 3.347683241405278e-05, + "loss": 0.0, + "step": 50793 + }, + { + "epoch": 4.739572641597462, + "grad_norm": NaN, + "learning_rate": 3.3472069485810695e-05, + "loss": 0.0, + "step": 50794 + }, + { + "epoch": 4.73966595129234, + "grad_norm": NaN, + "learning_rate": 3.346730685386455e-05, + "loss": 0.0, + "step": 50795 + }, + { + "epoch": 4.739759260987217, + "grad_norm": NaN, + "learning_rate": 3.346254451822663e-05, + "loss": 0.0, + "step": 50796 + }, + { + "epoch": 4.7398525706820935, + "grad_norm": NaN, + "learning_rate": 3.345778247890889e-05, + "loss": 0.0, + "step": 50797 + }, + { + "epoch": 4.739945880376971, + "grad_norm": NaN, + "learning_rate": 3.3453020735923456e-05, + "loss": 0.0, + "step": 50798 + }, + { + "epoch": 4.740039190071848, + "grad_norm": NaN, + "learning_rate": 3.3448259289282545e-05, + "loss": 0.0, + "step": 50799 + }, + { + "epoch": 4.740132499766726, + "grad_norm": NaN, + "learning_rate": 3.3443498138998134e-05, + "loss": 0.0, + "step": 50800 + }, + { + "epoch": 4.740225809461603, + "grad_norm": NaN, + "learning_rate": 3.343873728508233e-05, + "loss": 0.0, + "step": 50801 + }, + { + "epoch": 4.740319119156481, + "grad_norm": NaN, + "learning_rate": 3.343397672754739e-05, + "loss": 0.0, + "step": 50802 + }, + { + "epoch": 4.740412428851357, + "grad_norm": NaN, + "learning_rate": 3.3429216466405274e-05, + "loss": 0.0, + "step": 50803 + }, + { + "epoch": 4.740505738546235, + "grad_norm": NaN, + "learning_rate": 3.342445650166809e-05, + "loss": 0.0, + "step": 50804 + }, + { + "epoch": 4.740599048241112, + "grad_norm": NaN, + "learning_rate": 3.341969683334806e-05, + "loss": 0.0, + "step": 50805 + }, + { + "epoch": 4.740692357935989, + "grad_norm": NaN, + "learning_rate": 3.341493746145717e-05, + "loss": 0.0, + "step": 50806 + }, + { + "epoch": 4.740785667630867, + "grad_norm": NaN, + "learning_rate": 3.341017838600751e-05, + "loss": 0.0, + "step": 50807 + }, + { + "epoch": 4.740878977325744, + "grad_norm": NaN, + "learning_rate": 3.340541960701132e-05, + "loss": 0.0, + "step": 50808 + }, + { + "epoch": 4.740972287020622, + "grad_norm": NaN, + "learning_rate": 3.3400661124480525e-05, + "loss": 0.0, + "step": 50809 + }, + { + "epoch": 4.741065596715499, + "grad_norm": NaN, + "learning_rate": 3.339590293842736e-05, + "loss": 0.0, + "step": 50810 + }, + { + "epoch": 4.7411589064103765, + "grad_norm": NaN, + "learning_rate": 3.339114504886389e-05, + "loss": 0.0, + "step": 50811 + }, + { + "epoch": 4.741252216105253, + "grad_norm": NaN, + "learning_rate": 3.338638745580213e-05, + "loss": 0.0, + "step": 50812 + }, + { + "epoch": 4.7413455258001305, + "grad_norm": NaN, + "learning_rate": 3.3381630159254284e-05, + "loss": 0.0, + "step": 50813 + }, + { + "epoch": 4.741438835495008, + "grad_norm": NaN, + "learning_rate": 3.337687315923246e-05, + "loss": 0.0, + "step": 50814 + }, + { + "epoch": 4.741532145189885, + "grad_norm": NaN, + "learning_rate": 3.3372116455748594e-05, + "loss": 0.0, + "step": 50815 + }, + { + "epoch": 4.741625454884763, + "grad_norm": NaN, + "learning_rate": 3.3367360048814996e-05, + "loss": 0.0, + "step": 50816 + }, + { + "epoch": 4.74171876457964, + "grad_norm": NaN, + "learning_rate": 3.336260393844359e-05, + "loss": 0.0, + "step": 50817 + }, + { + "epoch": 4.741812074274517, + "grad_norm": NaN, + "learning_rate": 3.3357848124646494e-05, + "loss": 0.0, + "step": 50818 + }, + { + "epoch": 4.741905383969394, + "grad_norm": NaN, + "learning_rate": 3.3353092607435925e-05, + "loss": 0.0, + "step": 50819 + }, + { + "epoch": 4.7419986936642715, + "grad_norm": NaN, + "learning_rate": 3.3348337386823855e-05, + "loss": 0.0, + "step": 50820 + }, + { + "epoch": 4.742092003359149, + "grad_norm": NaN, + "learning_rate": 3.334358246282235e-05, + "loss": 0.0, + "step": 50821 + }, + { + "epoch": 4.742185313054026, + "grad_norm": NaN, + "learning_rate": 3.333882783544365e-05, + "loss": 0.0, + "step": 50822 + }, + { + "epoch": 4.742278622748904, + "grad_norm": NaN, + "learning_rate": 3.333407350469971e-05, + "loss": 0.0, + "step": 50823 + }, + { + "epoch": 4.742371932443781, + "grad_norm": NaN, + "learning_rate": 3.332931947060262e-05, + "loss": 0.0, + "step": 50824 + }, + { + "epoch": 4.742465242138659, + "grad_norm": NaN, + "learning_rate": 3.33245657331646e-05, + "loss": 0.0, + "step": 50825 + }, + { + "epoch": 4.742558551833535, + "grad_norm": NaN, + "learning_rate": 3.3319812292397615e-05, + "loss": 0.0, + "step": 50826 + }, + { + "epoch": 4.742651861528413, + "grad_norm": NaN, + "learning_rate": 3.3315059148313724e-05, + "loss": 0.0, + "step": 50827 + }, + { + "epoch": 4.74274517122329, + "grad_norm": NaN, + "learning_rate": 3.3310306300925166e-05, + "loss": 0.0, + "step": 50828 + }, + { + "epoch": 4.742838480918167, + "grad_norm": NaN, + "learning_rate": 3.330555375024389e-05, + "loss": 0.0, + "step": 50829 + }, + { + "epoch": 4.742931790613045, + "grad_norm": NaN, + "learning_rate": 3.330080149628197e-05, + "loss": 0.0, + "step": 50830 + }, + { + "epoch": 4.743025100307922, + "grad_norm": NaN, + "learning_rate": 3.329604953905163e-05, + "loss": 0.0, + "step": 50831 + }, + { + "epoch": 4.7431184100028, + "grad_norm": NaN, + "learning_rate": 3.329129787856483e-05, + "loss": 0.0, + "step": 50832 + }, + { + "epoch": 4.743211719697676, + "grad_norm": NaN, + "learning_rate": 3.328654651483364e-05, + "loss": 0.0, + "step": 50833 + }, + { + "epoch": 4.743305029392554, + "grad_norm": NaN, + "learning_rate": 3.3281795447870294e-05, + "loss": 0.0, + "step": 50834 + }, + { + "epoch": 4.743398339087431, + "grad_norm": NaN, + "learning_rate": 3.327704467768671e-05, + "loss": 0.0, + "step": 50835 + }, + { + "epoch": 4.7434916487823084, + "grad_norm": NaN, + "learning_rate": 3.327229420429497e-05, + "loss": 0.0, + "step": 50836 + }, + { + "epoch": 4.743584958477186, + "grad_norm": NaN, + "learning_rate": 3.326754402770731e-05, + "loss": 0.0, + "step": 50837 + }, + { + "epoch": 4.743678268172063, + "grad_norm": NaN, + "learning_rate": 3.326279414793566e-05, + "loss": 0.0, + "step": 50838 + }, + { + "epoch": 4.743771577866941, + "grad_norm": NaN, + "learning_rate": 3.32580445649921e-05, + "loss": 0.0, + "step": 50839 + }, + { + "epoch": 4.743864887561818, + "grad_norm": NaN, + "learning_rate": 3.325329527888884e-05, + "loss": 0.0, + "step": 50840 + }, + { + "epoch": 4.743958197256695, + "grad_norm": NaN, + "learning_rate": 3.324854628963783e-05, + "loss": 0.0, + "step": 50841 + }, + { + "epoch": 4.744051506951572, + "grad_norm": NaN, + "learning_rate": 3.324379759725113e-05, + "loss": 0.0, + "step": 50842 + }, + { + "epoch": 4.7441448166464495, + "grad_norm": NaN, + "learning_rate": 3.323904920174096e-05, + "loss": 0.0, + "step": 50843 + }, + { + "epoch": 4.744238126341327, + "grad_norm": NaN, + "learning_rate": 3.3234301103119245e-05, + "loss": 0.0, + "step": 50844 + }, + { + "epoch": 4.744331436036204, + "grad_norm": NaN, + "learning_rate": 3.3229553301398076e-05, + "loss": 0.0, + "step": 50845 + }, + { + "epoch": 4.744424745731082, + "grad_norm": NaN, + "learning_rate": 3.322480579658965e-05, + "loss": 0.0, + "step": 50846 + }, + { + "epoch": 4.744518055425958, + "grad_norm": NaN, + "learning_rate": 3.322005858870587e-05, + "loss": 0.0, + "step": 50847 + }, + { + "epoch": 4.744611365120836, + "grad_norm": NaN, + "learning_rate": 3.321531167775894e-05, + "loss": 0.0, + "step": 50848 + }, + { + "epoch": 4.744704674815713, + "grad_norm": NaN, + "learning_rate": 3.32105650637609e-05, + "loss": 0.0, + "step": 50849 + }, + { + "epoch": 4.7447979845105905, + "grad_norm": NaN, + "learning_rate": 3.3205818746723723e-05, + "loss": 0.0, + "step": 50850 + }, + { + "epoch": 4.744891294205468, + "grad_norm": NaN, + "learning_rate": 3.320107272665959e-05, + "loss": 0.0, + "step": 50851 + }, + { + "epoch": 4.744984603900345, + "grad_norm": NaN, + "learning_rate": 3.319632700358058e-05, + "loss": 0.0, + "step": 50852 + }, + { + "epoch": 4.745077913595223, + "grad_norm": NaN, + "learning_rate": 3.3191581577498624e-05, + "loss": 0.0, + "step": 50853 + }, + { + "epoch": 4.7451712232901, + "grad_norm": NaN, + "learning_rate": 3.318683644842592e-05, + "loss": 0.0, + "step": 50854 + }, + { + "epoch": 4.745264532984978, + "grad_norm": NaN, + "learning_rate": 3.318209161637452e-05, + "loss": 0.0, + "step": 50855 + }, + { + "epoch": 4.745357842679854, + "grad_norm": NaN, + "learning_rate": 3.3177347081356376e-05, + "loss": 0.0, + "step": 50856 + }, + { + "epoch": 4.745451152374732, + "grad_norm": NaN, + "learning_rate": 3.3172602843383686e-05, + "loss": 0.0, + "step": 50857 + }, + { + "epoch": 4.745544462069609, + "grad_norm": NaN, + "learning_rate": 3.3167858902468485e-05, + "loss": 0.0, + "step": 50858 + }, + { + "epoch": 4.745637771764486, + "grad_norm": NaN, + "learning_rate": 3.316311525862274e-05, + "loss": 0.0, + "step": 50859 + }, + { + "epoch": 4.745731081459364, + "grad_norm": NaN, + "learning_rate": 3.3158371911858674e-05, + "loss": 0.0, + "step": 50860 + }, + { + "epoch": 4.745824391154241, + "grad_norm": NaN, + "learning_rate": 3.31536288621882e-05, + "loss": 0.0, + "step": 50861 + }, + { + "epoch": 4.745917700849118, + "grad_norm": NaN, + "learning_rate": 3.314888610962339e-05, + "loss": 0.0, + "step": 50862 + }, + { + "epoch": 4.746011010543995, + "grad_norm": NaN, + "learning_rate": 3.3144143654176445e-05, + "loss": 0.0, + "step": 50863 + }, + { + "epoch": 4.746104320238873, + "grad_norm": NaN, + "learning_rate": 3.313940149585928e-05, + "loss": 0.0, + "step": 50864 + }, + { + "epoch": 4.74619762993375, + "grad_norm": NaN, + "learning_rate": 3.3134659634683963e-05, + "loss": 0.0, + "step": 50865 + }, + { + "epoch": 4.7462909396286275, + "grad_norm": NaN, + "learning_rate": 3.3129918070662665e-05, + "loss": 0.0, + "step": 50866 + }, + { + "epoch": 4.746384249323505, + "grad_norm": NaN, + "learning_rate": 3.3125176803807325e-05, + "loss": 0.0, + "step": 50867 + }, + { + "epoch": 4.746477559018382, + "grad_norm": NaN, + "learning_rate": 3.312043583413e-05, + "loss": 0.0, + "step": 50868 + }, + { + "epoch": 4.74657086871326, + "grad_norm": NaN, + "learning_rate": 3.311569516164286e-05, + "loss": 0.0, + "step": 50869 + }, + { + "epoch": 4.746664178408136, + "grad_norm": NaN, + "learning_rate": 3.311095478635786e-05, + "loss": 0.0, + "step": 50870 + }, + { + "epoch": 4.746757488103014, + "grad_norm": NaN, + "learning_rate": 3.310621470828702e-05, + "loss": 0.0, + "step": 50871 + }, + { + "epoch": 4.746850797797891, + "grad_norm": NaN, + "learning_rate": 3.310147492744255e-05, + "loss": 0.0, + "step": 50872 + }, + { + "epoch": 4.7469441074927685, + "grad_norm": NaN, + "learning_rate": 3.3096735443836333e-05, + "loss": 0.0, + "step": 50873 + }, + { + "epoch": 4.747037417187646, + "grad_norm": NaN, + "learning_rate": 3.309199625748047e-05, + "loss": 0.0, + "step": 50874 + }, + { + "epoch": 4.747130726882523, + "grad_norm": NaN, + "learning_rate": 3.30872573683871e-05, + "loss": 0.0, + "step": 50875 + }, + { + "epoch": 4.747224036577401, + "grad_norm": NaN, + "learning_rate": 3.3082518776568164e-05, + "loss": 0.0, + "step": 50876 + }, + { + "epoch": 4.747317346272277, + "grad_norm": NaN, + "learning_rate": 3.307778048203571e-05, + "loss": 0.0, + "step": 50877 + }, + { + "epoch": 4.747410655967155, + "grad_norm": NaN, + "learning_rate": 3.307304248480192e-05, + "loss": 0.0, + "step": 50878 + }, + { + "epoch": 4.747503965662032, + "grad_norm": NaN, + "learning_rate": 3.306830478487867e-05, + "loss": 0.0, + "step": 50879 + }, + { + "epoch": 4.74759727535691, + "grad_norm": NaN, + "learning_rate": 3.306356738227807e-05, + "loss": 0.0, + "step": 50880 + }, + { + "epoch": 4.747690585051787, + "grad_norm": NaN, + "learning_rate": 3.3058830277012264e-05, + "loss": 0.0, + "step": 50881 + }, + { + "epoch": 4.747783894746664, + "grad_norm": NaN, + "learning_rate": 3.305409346909316e-05, + "loss": 0.0, + "step": 50882 + }, + { + "epoch": 4.747877204441542, + "grad_norm": NaN, + "learning_rate": 3.3049356958532816e-05, + "loss": 0.0, + "step": 50883 + }, + { + "epoch": 4.747970514136419, + "grad_norm": NaN, + "learning_rate": 3.304462074534339e-05, + "loss": 0.0, + "step": 50884 + }, + { + "epoch": 4.748063823831296, + "grad_norm": NaN, + "learning_rate": 3.3039884829536775e-05, + "loss": 0.0, + "step": 50885 + }, + { + "epoch": 4.748157133526173, + "grad_norm": NaN, + "learning_rate": 3.303514921112513e-05, + "loss": 0.0, + "step": 50886 + }, + { + "epoch": 4.748250443221051, + "grad_norm": NaN, + "learning_rate": 3.303041389012048e-05, + "loss": 0.0, + "step": 50887 + }, + { + "epoch": 4.748343752915928, + "grad_norm": NaN, + "learning_rate": 3.3025678866534774e-05, + "loss": 0.0, + "step": 50888 + }, + { + "epoch": 4.7484370626108054, + "grad_norm": NaN, + "learning_rate": 3.3020944140380154e-05, + "loss": 0.0, + "step": 50889 + }, + { + "epoch": 4.748530372305683, + "grad_norm": NaN, + "learning_rate": 3.301620971166865e-05, + "loss": 0.0, + "step": 50890 + }, + { + "epoch": 4.748623682000559, + "grad_norm": NaN, + "learning_rate": 3.30114755804122e-05, + "loss": 0.0, + "step": 50891 + }, + { + "epoch": 4.748716991695437, + "grad_norm": NaN, + "learning_rate": 3.3006741746622954e-05, + "loss": 0.0, + "step": 50892 + }, + { + "epoch": 4.748810301390314, + "grad_norm": NaN, + "learning_rate": 3.3002008210312936e-05, + "loss": 0.0, + "step": 50893 + }, + { + "epoch": 4.748903611085192, + "grad_norm": NaN, + "learning_rate": 3.2997274971494084e-05, + "loss": 0.0, + "step": 50894 + }, + { + "epoch": 4.748996920780069, + "grad_norm": NaN, + "learning_rate": 3.2992542030178545e-05, + "loss": 0.0, + "step": 50895 + }, + { + "epoch": 4.7490902304749465, + "grad_norm": NaN, + "learning_rate": 3.298780938637834e-05, + "loss": 0.0, + "step": 50896 + }, + { + "epoch": 4.749183540169824, + "grad_norm": NaN, + "learning_rate": 3.298307704010541e-05, + "loss": 0.0, + "step": 50897 + }, + { + "epoch": 4.749276849864701, + "grad_norm": NaN, + "learning_rate": 3.2978344991371884e-05, + "loss": 0.0, + "step": 50898 + }, + { + "epoch": 4.749370159559578, + "grad_norm": NaN, + "learning_rate": 3.297361324018979e-05, + "loss": 0.0, + "step": 50899 + }, + { + "epoch": 4.749463469254455, + "grad_norm": NaN, + "learning_rate": 3.2968881786571075e-05, + "loss": 0.0, + "step": 50900 + }, + { + "epoch": 4.749556778949333, + "grad_norm": NaN, + "learning_rate": 3.296415063052786e-05, + "loss": 0.0, + "step": 50901 + }, + { + "epoch": 4.74965008864421, + "grad_norm": NaN, + "learning_rate": 3.295941977207218e-05, + "loss": 0.0, + "step": 50902 + }, + { + "epoch": 4.7497433983390875, + "grad_norm": NaN, + "learning_rate": 3.295468921121594e-05, + "loss": 0.0, + "step": 50903 + }, + { + "epoch": 4.749836708033965, + "grad_norm": NaN, + "learning_rate": 3.2949958947971336e-05, + "loss": 0.0, + "step": 50904 + }, + { + "epoch": 4.749930017728842, + "grad_norm": NaN, + "learning_rate": 3.294522898235026e-05, + "loss": 0.0, + "step": 50905 + }, + { + "epoch": 4.750023327423719, + "grad_norm": NaN, + "learning_rate": 3.294049931436477e-05, + "loss": 0.0, + "step": 50906 + }, + { + "epoch": 4.750116637118596, + "grad_norm": NaN, + "learning_rate": 3.2935769944027e-05, + "loss": 0.0, + "step": 50907 + }, + { + "epoch": 4.750209946813474, + "grad_norm": NaN, + "learning_rate": 3.293104087134885e-05, + "loss": 0.0, + "step": 50908 + }, + { + "epoch": 4.750303256508351, + "grad_norm": NaN, + "learning_rate": 3.292631209634233e-05, + "loss": 0.0, + "step": 50909 + }, + { + "epoch": 4.750396566203229, + "grad_norm": NaN, + "learning_rate": 3.2921583619019606e-05, + "loss": 0.0, + "step": 50910 + }, + { + "epoch": 4.750489875898106, + "grad_norm": NaN, + "learning_rate": 3.291685543939256e-05, + "loss": 0.0, + "step": 50911 + }, + { + "epoch": 4.750583185592983, + "grad_norm": NaN, + "learning_rate": 3.291212755747325e-05, + "loss": 0.0, + "step": 50912 + }, + { + "epoch": 4.750676495287861, + "grad_norm": NaN, + "learning_rate": 3.290739997327378e-05, + "loss": 0.0, + "step": 50913 + }, + { + "epoch": 4.750769804982737, + "grad_norm": NaN, + "learning_rate": 3.290267268680608e-05, + "loss": 0.0, + "step": 50914 + }, + { + "epoch": 4.750863114677615, + "grad_norm": NaN, + "learning_rate": 3.289794569808215e-05, + "loss": 0.0, + "step": 50915 + }, + { + "epoch": 4.750956424372492, + "grad_norm": NaN, + "learning_rate": 3.289321900711413e-05, + "loss": 0.0, + "step": 50916 + }, + { + "epoch": 4.75104973406737, + "grad_norm": NaN, + "learning_rate": 3.2888492613913935e-05, + "loss": 0.0, + "step": 50917 + }, + { + "epoch": 4.751143043762247, + "grad_norm": NaN, + "learning_rate": 3.288376651849355e-05, + "loss": 0.0, + "step": 50918 + }, + { + "epoch": 4.7512363534571245, + "grad_norm": NaN, + "learning_rate": 3.2879040720865165e-05, + "loss": 0.0, + "step": 50919 + }, + { + "epoch": 4.751329663152001, + "grad_norm": NaN, + "learning_rate": 3.2874315221040585e-05, + "loss": 0.0, + "step": 50920 + }, + { + "epoch": 4.751422972846878, + "grad_norm": NaN, + "learning_rate": 3.2869590019031984e-05, + "loss": 0.0, + "step": 50921 + }, + { + "epoch": 4.751516282541756, + "grad_norm": NaN, + "learning_rate": 3.286486511485134e-05, + "loss": 0.0, + "step": 50922 + }, + { + "epoch": 4.751609592236633, + "grad_norm": NaN, + "learning_rate": 3.286014050851057e-05, + "loss": 0.0, + "step": 50923 + }, + { + "epoch": 4.751702901931511, + "grad_norm": NaN, + "learning_rate": 3.2855416200021804e-05, + "loss": 0.0, + "step": 50924 + }, + { + "epoch": 4.751796211626388, + "grad_norm": NaN, + "learning_rate": 3.285069218939706e-05, + "loss": 0.0, + "step": 50925 + }, + { + "epoch": 4.7518895213212655, + "grad_norm": NaN, + "learning_rate": 3.2845968476648224e-05, + "loss": 0.0, + "step": 50926 + }, + { + "epoch": 4.751982831016143, + "grad_norm": NaN, + "learning_rate": 3.284124506178743e-05, + "loss": 0.0, + "step": 50927 + }, + { + "epoch": 4.75207614071102, + "grad_norm": NaN, + "learning_rate": 3.2836521944826676e-05, + "loss": 0.0, + "step": 50928 + }, + { + "epoch": 4.752169450405897, + "grad_norm": NaN, + "learning_rate": 3.283179912577785e-05, + "loss": 0.0, + "step": 50929 + }, + { + "epoch": 4.752262760100774, + "grad_norm": NaN, + "learning_rate": 3.282707660465311e-05, + "loss": 0.0, + "step": 50930 + }, + { + "epoch": 4.752356069795652, + "grad_norm": NaN, + "learning_rate": 3.282235438146445e-05, + "loss": 0.0, + "step": 50931 + }, + { + "epoch": 4.752449379490529, + "grad_norm": NaN, + "learning_rate": 3.281763245622373e-05, + "loss": 0.0, + "step": 50932 + }, + { + "epoch": 4.752542689185407, + "grad_norm": NaN, + "learning_rate": 3.2812910828943115e-05, + "loss": 0.0, + "step": 50933 + }, + { + "epoch": 4.752635998880284, + "grad_norm": NaN, + "learning_rate": 3.2808189499634587e-05, + "loss": 0.0, + "step": 50934 + }, + { + "epoch": 4.7527293085751605, + "grad_norm": NaN, + "learning_rate": 3.280346846831005e-05, + "loss": 0.0, + "step": 50935 + }, + { + "epoch": 4.752822618270038, + "grad_norm": NaN, + "learning_rate": 3.27987477349816e-05, + "loss": 0.0, + "step": 50936 + }, + { + "epoch": 4.752915927964915, + "grad_norm": NaN, + "learning_rate": 3.2794027299661255e-05, + "loss": 0.0, + "step": 50937 + }, + { + "epoch": 4.753009237659793, + "grad_norm": NaN, + "learning_rate": 3.278930716236091e-05, + "loss": 0.0, + "step": 50938 + }, + { + "epoch": 4.75310254735467, + "grad_norm": NaN, + "learning_rate": 3.278458732309267e-05, + "loss": 0.0, + "step": 50939 + }, + { + "epoch": 4.753195857049548, + "grad_norm": NaN, + "learning_rate": 3.277986778186856e-05, + "loss": 0.0, + "step": 50940 + }, + { + "epoch": 4.753289166744425, + "grad_norm": NaN, + "learning_rate": 3.277514853870043e-05, + "loss": 0.0, + "step": 50941 + }, + { + "epoch": 4.7533824764393025, + "grad_norm": NaN, + "learning_rate": 3.277042959360041e-05, + "loss": 0.0, + "step": 50942 + }, + { + "epoch": 4.753475786134179, + "grad_norm": NaN, + "learning_rate": 3.2765710946580506e-05, + "loss": 0.0, + "step": 50943 + }, + { + "epoch": 4.753569095829056, + "grad_norm": NaN, + "learning_rate": 3.276099259765259e-05, + "loss": 0.0, + "step": 50944 + }, + { + "epoch": 4.753662405523934, + "grad_norm": NaN, + "learning_rate": 3.275627454682878e-05, + "loss": 0.0, + "step": 50945 + }, + { + "epoch": 4.753755715218811, + "grad_norm": NaN, + "learning_rate": 3.275155679412107e-05, + "loss": 0.0, + "step": 50946 + }, + { + "epoch": 4.753849024913689, + "grad_norm": NaN, + "learning_rate": 3.2746839339541346e-05, + "loss": 0.0, + "step": 50947 + }, + { + "epoch": 4.753942334608566, + "grad_norm": NaN, + "learning_rate": 3.274212218310172e-05, + "loss": 0.0, + "step": 50948 + }, + { + "epoch": 4.7540356443034435, + "grad_norm": NaN, + "learning_rate": 3.273740532481418e-05, + "loss": 0.0, + "step": 50949 + }, + { + "epoch": 4.75412895399832, + "grad_norm": NaN, + "learning_rate": 3.2732688764690604e-05, + "loss": 0.0, + "step": 50950 + }, + { + "epoch": 4.7542222636931974, + "grad_norm": NaN, + "learning_rate": 3.272797250274314e-05, + "loss": 0.0, + "step": 50951 + }, + { + "epoch": 4.754315573388075, + "grad_norm": NaN, + "learning_rate": 3.272325653898365e-05, + "loss": 0.0, + "step": 50952 + }, + { + "epoch": 4.754408883082952, + "grad_norm": NaN, + "learning_rate": 3.271854087342416e-05, + "loss": 0.0, + "step": 50953 + }, + { + "epoch": 4.75450219277783, + "grad_norm": NaN, + "learning_rate": 3.2713825506076755e-05, + "loss": 0.0, + "step": 50954 + }, + { + "epoch": 4.754595502472707, + "grad_norm": NaN, + "learning_rate": 3.27091104369533e-05, + "loss": 0.0, + "step": 50955 + }, + { + "epoch": 4.7546888121675845, + "grad_norm": NaN, + "learning_rate": 3.27043956660658e-05, + "loss": 0.0, + "step": 50956 + }, + { + "epoch": 4.754782121862462, + "grad_norm": NaN, + "learning_rate": 3.269968119342635e-05, + "loss": 0.0, + "step": 50957 + }, + { + "epoch": 4.7548754315573385, + "grad_norm": NaN, + "learning_rate": 3.269496701904679e-05, + "loss": 0.0, + "step": 50958 + }, + { + "epoch": 4.754968741252216, + "grad_norm": NaN, + "learning_rate": 3.269025314293922e-05, + "loss": 0.0, + "step": 50959 + }, + { + "epoch": 4.755062050947093, + "grad_norm": NaN, + "learning_rate": 3.268553956511563e-05, + "loss": 0.0, + "step": 50960 + }, + { + "epoch": 4.755155360641971, + "grad_norm": NaN, + "learning_rate": 3.268082628558788e-05, + "loss": 0.0, + "step": 50961 + }, + { + "epoch": 4.755248670336848, + "grad_norm": NaN, + "learning_rate": 3.2676113304368075e-05, + "loss": 0.0, + "step": 50962 + }, + { + "epoch": 4.755341980031726, + "grad_norm": NaN, + "learning_rate": 3.267140062146819e-05, + "loss": 0.0, + "step": 50963 + }, + { + "epoch": 4.755435289726602, + "grad_norm": NaN, + "learning_rate": 3.266668823690011e-05, + "loss": 0.0, + "step": 50964 + }, + { + "epoch": 4.7555285994214795, + "grad_norm": NaN, + "learning_rate": 3.2661976150675914e-05, + "loss": 0.0, + "step": 50965 + }, + { + "epoch": 4.755621909116357, + "grad_norm": NaN, + "learning_rate": 3.265726436280761e-05, + "loss": 0.0, + "step": 50966 + }, + { + "epoch": 4.755715218811234, + "grad_norm": NaN, + "learning_rate": 3.265255287330703e-05, + "loss": 0.0, + "step": 50967 + }, + { + "epoch": 4.755808528506112, + "grad_norm": NaN, + "learning_rate": 3.264784168218629e-05, + "loss": 0.0, + "step": 50968 + }, + { + "epoch": 4.755901838200989, + "grad_norm": NaN, + "learning_rate": 3.264313078945737e-05, + "loss": 0.0, + "step": 50969 + }, + { + "epoch": 4.755995147895867, + "grad_norm": NaN, + "learning_rate": 3.263842019513212e-05, + "loss": 0.0, + "step": 50970 + }, + { + "epoch": 4.756088457590744, + "grad_norm": NaN, + "learning_rate": 3.263370989922266e-05, + "loss": 0.0, + "step": 50971 + }, + { + "epoch": 4.7561817672856215, + "grad_norm": NaN, + "learning_rate": 3.262899990174093e-05, + "loss": 0.0, + "step": 50972 + }, + { + "epoch": 4.756275076980498, + "grad_norm": NaN, + "learning_rate": 3.262429020269882e-05, + "loss": 0.0, + "step": 50973 + }, + { + "epoch": 4.756368386675375, + "grad_norm": NaN, + "learning_rate": 3.261958080210839e-05, + "loss": 0.0, + "step": 50974 + }, + { + "epoch": 4.756461696370253, + "grad_norm": NaN, + "learning_rate": 3.2614871699981643e-05, + "loss": 0.0, + "step": 50975 + }, + { + "epoch": 4.75655500606513, + "grad_norm": NaN, + "learning_rate": 3.2610162896330425e-05, + "loss": 0.0, + "step": 50976 + }, + { + "epoch": 4.756648315760008, + "grad_norm": NaN, + "learning_rate": 3.260545439116685e-05, + "loss": 0.0, + "step": 50977 + }, + { + "epoch": 4.756741625454885, + "grad_norm": NaN, + "learning_rate": 3.260074618450284e-05, + "loss": 0.0, + "step": 50978 + }, + { + "epoch": 4.756834935149762, + "grad_norm": NaN, + "learning_rate": 3.25960382763503e-05, + "loss": 0.0, + "step": 50979 + }, + { + "epoch": 4.756928244844639, + "grad_norm": NaN, + "learning_rate": 3.2591330666721286e-05, + "loss": 0.0, + "step": 50980 + }, + { + "epoch": 4.7570215545395165, + "grad_norm": NaN, + "learning_rate": 3.258662335562778e-05, + "loss": 0.0, + "step": 50981 + }, + { + "epoch": 4.757114864234394, + "grad_norm": NaN, + "learning_rate": 3.2581916343081626e-05, + "loss": 0.0, + "step": 50982 + }, + { + "epoch": 4.757208173929271, + "grad_norm": NaN, + "learning_rate": 3.257720962909492e-05, + "loss": 0.0, + "step": 50983 + }, + { + "epoch": 4.757301483624149, + "grad_norm": NaN, + "learning_rate": 3.257250321367963e-05, + "loss": 0.0, + "step": 50984 + }, + { + "epoch": 4.757394793319026, + "grad_norm": NaN, + "learning_rate": 3.2567797096847594e-05, + "loss": 0.0, + "step": 50985 + }, + { + "epoch": 4.757488103013904, + "grad_norm": NaN, + "learning_rate": 3.256309127861092e-05, + "loss": 0.0, + "step": 50986 + }, + { + "epoch": 4.75758141270878, + "grad_norm": NaN, + "learning_rate": 3.255838575898154e-05, + "loss": 0.0, + "step": 50987 + }, + { + "epoch": 4.7576747224036575, + "grad_norm": NaN, + "learning_rate": 3.2553680537971304e-05, + "loss": 0.0, + "step": 50988 + }, + { + "epoch": 4.757768032098535, + "grad_norm": NaN, + "learning_rate": 3.254897561559233e-05, + "loss": 0.0, + "step": 50989 + }, + { + "epoch": 4.757861341793412, + "grad_norm": NaN, + "learning_rate": 3.254427099185656e-05, + "loss": 0.0, + "step": 50990 + }, + { + "epoch": 4.75795465148829, + "grad_norm": NaN, + "learning_rate": 3.253956666677582e-05, + "loss": 0.0, + "step": 50991 + }, + { + "epoch": 4.758047961183167, + "grad_norm": NaN, + "learning_rate": 3.253486264036223e-05, + "loss": 0.0, + "step": 50992 + }, + { + "epoch": 4.758141270878045, + "grad_norm": NaN, + "learning_rate": 3.253015891262772e-05, + "loss": 0.0, + "step": 50993 + }, + { + "epoch": 4.758234580572921, + "grad_norm": NaN, + "learning_rate": 3.252545548358413e-05, + "loss": 0.0, + "step": 50994 + }, + { + "epoch": 4.758327890267799, + "grad_norm": NaN, + "learning_rate": 3.25207523532436e-05, + "loss": 0.0, + "step": 50995 + }, + { + "epoch": 4.758421199962676, + "grad_norm": NaN, + "learning_rate": 3.251604952161791e-05, + "loss": 0.0, + "step": 50996 + }, + { + "epoch": 4.758514509657553, + "grad_norm": NaN, + "learning_rate": 3.2511346988719144e-05, + "loss": 0.0, + "step": 50997 + }, + { + "epoch": 4.758607819352431, + "grad_norm": NaN, + "learning_rate": 3.250664475455926e-05, + "loss": 0.0, + "step": 50998 + }, + { + "epoch": 4.758701129047308, + "grad_norm": NaN, + "learning_rate": 3.250194281915009e-05, + "loss": 0.0, + "step": 50999 + }, + { + "epoch": 4.758794438742186, + "grad_norm": NaN, + "learning_rate": 3.2497241182503714e-05, + "loss": 0.0, + "step": 51000 + }, + { + "epoch": 4.758887748437063, + "grad_norm": NaN, + "learning_rate": 3.249253984463209e-05, + "loss": 0.0, + "step": 51001 + }, + { + "epoch": 4.75898105813194, + "grad_norm": NaN, + "learning_rate": 3.248783880554705e-05, + "loss": 0.0, + "step": 51002 + }, + { + "epoch": 4.759074367826817, + "grad_norm": NaN, + "learning_rate": 3.2483138065260675e-05, + "loss": 0.0, + "step": 51003 + }, + { + "epoch": 4.7591676775216945, + "grad_norm": NaN, + "learning_rate": 3.24784376237849e-05, + "loss": 0.0, + "step": 51004 + }, + { + "epoch": 4.759260987216572, + "grad_norm": NaN, + "learning_rate": 3.247373748113157e-05, + "loss": 0.0, + "step": 51005 + }, + { + "epoch": 4.759354296911449, + "grad_norm": NaN, + "learning_rate": 3.2469037637312744e-05, + "loss": 0.0, + "step": 51006 + }, + { + "epoch": 4.759447606606327, + "grad_norm": NaN, + "learning_rate": 3.2464338092340385e-05, + "loss": 0.0, + "step": 51007 + }, + { + "epoch": 4.759540916301203, + "grad_norm": NaN, + "learning_rate": 3.2459638846226315e-05, + "loss": 0.0, + "step": 51008 + }, + { + "epoch": 4.759634225996081, + "grad_norm": NaN, + "learning_rate": 3.245493989898261e-05, + "loss": 0.0, + "step": 51009 + }, + { + "epoch": 4.759727535690958, + "grad_norm": NaN, + "learning_rate": 3.2450241250621215e-05, + "loss": 0.0, + "step": 51010 + }, + { + "epoch": 4.7598208453858355, + "grad_norm": NaN, + "learning_rate": 3.2445542901153956e-05, + "loss": 0.0, + "step": 51011 + }, + { + "epoch": 4.759914155080713, + "grad_norm": NaN, + "learning_rate": 3.244084485059289e-05, + "loss": 0.0, + "step": 51012 + }, + { + "epoch": 4.76000746477559, + "grad_norm": NaN, + "learning_rate": 3.2436147098949984e-05, + "loss": 0.0, + "step": 51013 + }, + { + "epoch": 4.760100774470468, + "grad_norm": NaN, + "learning_rate": 3.243144964623704e-05, + "loss": 0.0, + "step": 51014 + }, + { + "epoch": 4.760194084165345, + "grad_norm": NaN, + "learning_rate": 3.2426752492466136e-05, + "loss": 0.0, + "step": 51015 + }, + { + "epoch": 4.760287393860222, + "grad_norm": NaN, + "learning_rate": 3.242205563764921e-05, + "loss": 0.0, + "step": 51016 + }, + { + "epoch": 4.760380703555099, + "grad_norm": NaN, + "learning_rate": 3.241735908179808e-05, + "loss": 0.0, + "step": 51017 + }, + { + "epoch": 4.7604740132499765, + "grad_norm": NaN, + "learning_rate": 3.241266282492484e-05, + "loss": 0.0, + "step": 51018 + }, + { + "epoch": 4.760567322944854, + "grad_norm": NaN, + "learning_rate": 3.240796686704138e-05, + "loss": 0.0, + "step": 51019 + }, + { + "epoch": 4.760660632639731, + "grad_norm": NaN, + "learning_rate": 3.2403271208159576e-05, + "loss": 0.0, + "step": 51020 + }, + { + "epoch": 4.760753942334609, + "grad_norm": NaN, + "learning_rate": 3.239857584829143e-05, + "loss": 0.0, + "step": 51021 + }, + { + "epoch": 4.760847252029486, + "grad_norm": NaN, + "learning_rate": 3.239388078744892e-05, + "loss": 0.0, + "step": 51022 + }, + { + "epoch": 4.760940561724363, + "grad_norm": NaN, + "learning_rate": 3.238918602564386e-05, + "loss": 0.0, + "step": 51023 + }, + { + "epoch": 4.76103387141924, + "grad_norm": NaN, + "learning_rate": 3.23844915628883e-05, + "loss": 0.0, + "step": 51024 + }, + { + "epoch": 4.761127181114118, + "grad_norm": NaN, + "learning_rate": 3.2379797399194176e-05, + "loss": 0.0, + "step": 51025 + }, + { + "epoch": 4.761220490808995, + "grad_norm": NaN, + "learning_rate": 3.237510353457329e-05, + "loss": 0.0, + "step": 51026 + }, + { + "epoch": 4.761313800503872, + "grad_norm": NaN, + "learning_rate": 3.237040996903773e-05, + "loss": 0.0, + "step": 51027 + }, + { + "epoch": 4.76140711019875, + "grad_norm": NaN, + "learning_rate": 3.2365716702599415e-05, + "loss": 0.0, + "step": 51028 + }, + { + "epoch": 4.761500419893627, + "grad_norm": NaN, + "learning_rate": 3.2361023735270155e-05, + "loss": 0.0, + "step": 51029 + }, + { + "epoch": 4.761593729588505, + "grad_norm": NaN, + "learning_rate": 3.2356331067062e-05, + "loss": 0.0, + "step": 51030 + }, + { + "epoch": 4.761687039283381, + "grad_norm": NaN, + "learning_rate": 3.235163869798689e-05, + "loss": 0.0, + "step": 51031 + }, + { + "epoch": 4.761780348978259, + "grad_norm": NaN, + "learning_rate": 3.234694662805663e-05, + "loss": 0.0, + "step": 51032 + }, + { + "epoch": 4.761873658673136, + "grad_norm": NaN, + "learning_rate": 3.234225485728327e-05, + "loss": 0.0, + "step": 51033 + }, + { + "epoch": 4.7619669683680135, + "grad_norm": NaN, + "learning_rate": 3.2337563385678704e-05, + "loss": 0.0, + "step": 51034 + }, + { + "epoch": 4.762060278062891, + "grad_norm": NaN, + "learning_rate": 3.233287221325487e-05, + "loss": 0.0, + "step": 51035 + }, + { + "epoch": 4.762153587757768, + "grad_norm": NaN, + "learning_rate": 3.2328181340023665e-05, + "loss": 0.0, + "step": 51036 + }, + { + "epoch": 4.762246897452645, + "grad_norm": NaN, + "learning_rate": 3.2323490765997064e-05, + "loss": 0.0, + "step": 51037 + }, + { + "epoch": 4.762340207147522, + "grad_norm": NaN, + "learning_rate": 3.231880049118694e-05, + "loss": 0.0, + "step": 51038 + }, + { + "epoch": 4.7624335168424, + "grad_norm": NaN, + "learning_rate": 3.231411051560531e-05, + "loss": 0.0, + "step": 51039 + }, + { + "epoch": 4.762526826537277, + "grad_norm": NaN, + "learning_rate": 3.230942083926395e-05, + "loss": 0.0, + "step": 51040 + }, + { + "epoch": 4.7626201362321545, + "grad_norm": NaN, + "learning_rate": 3.230473146217491e-05, + "loss": 0.0, + "step": 51041 + }, + { + "epoch": 4.762713445927032, + "grad_norm": NaN, + "learning_rate": 3.23000423843501e-05, + "loss": 0.0, + "step": 51042 + }, + { + "epoch": 4.762806755621909, + "grad_norm": NaN, + "learning_rate": 3.229535360580136e-05, + "loss": 0.0, + "step": 51043 + }, + { + "epoch": 4.762900065316787, + "grad_norm": NaN, + "learning_rate": 3.229066512654068e-05, + "loss": 0.0, + "step": 51044 + }, + { + "epoch": 4.762993375011664, + "grad_norm": NaN, + "learning_rate": 3.228597694658004e-05, + "loss": 0.0, + "step": 51045 + }, + { + "epoch": 4.763086684706541, + "grad_norm": NaN, + "learning_rate": 3.2281289065931203e-05, + "loss": 0.0, + "step": 51046 + }, + { + "epoch": 4.763179994401418, + "grad_norm": NaN, + "learning_rate": 3.22766014846062e-05, + "loss": 0.0, + "step": 51047 + }, + { + "epoch": 4.763273304096296, + "grad_norm": NaN, + "learning_rate": 3.2271914202616986e-05, + "loss": 0.0, + "step": 51048 + }, + { + "epoch": 4.763366613791173, + "grad_norm": NaN, + "learning_rate": 3.226722721997534e-05, + "loss": 0.0, + "step": 51049 + }, + { + "epoch": 4.76345992348605, + "grad_norm": NaN, + "learning_rate": 3.226254053669329e-05, + "loss": 0.0, + "step": 51050 + }, + { + "epoch": 4.763553233180928, + "grad_norm": NaN, + "learning_rate": 3.225785415278278e-05, + "loss": 0.0, + "step": 51051 + }, + { + "epoch": 4.763646542875804, + "grad_norm": NaN, + "learning_rate": 3.225316806825557e-05, + "loss": 0.0, + "step": 51052 + }, + { + "epoch": 4.763739852570682, + "grad_norm": NaN, + "learning_rate": 3.2248482283123716e-05, + "loss": 0.0, + "step": 51053 + }, + { + "epoch": 4.763833162265559, + "grad_norm": NaN, + "learning_rate": 3.224379679739914e-05, + "loss": 0.0, + "step": 51054 + }, + { + "epoch": 4.763926471960437, + "grad_norm": NaN, + "learning_rate": 3.223911161109362e-05, + "loss": 0.0, + "step": 51055 + }, + { + "epoch": 4.764019781655314, + "grad_norm": NaN, + "learning_rate": 3.2234426724219196e-05, + "loss": 0.0, + "step": 51056 + }, + { + "epoch": 4.7641130913501915, + "grad_norm": NaN, + "learning_rate": 3.2229742136787793e-05, + "loss": 0.0, + "step": 51057 + }, + { + "epoch": 4.764206401045069, + "grad_norm": NaN, + "learning_rate": 3.222505784881117e-05, + "loss": 0.0, + "step": 51058 + }, + { + "epoch": 4.764299710739946, + "grad_norm": NaN, + "learning_rate": 3.222037386030139e-05, + "loss": 0.0, + "step": 51059 + }, + { + "epoch": 4.764393020434823, + "grad_norm": NaN, + "learning_rate": 3.221569017127035e-05, + "loss": 0.0, + "step": 51060 + }, + { + "epoch": 4.7644863301297, + "grad_norm": NaN, + "learning_rate": 3.221100678172985e-05, + "loss": 0.0, + "step": 51061 + }, + { + "epoch": 4.764579639824578, + "grad_norm": NaN, + "learning_rate": 3.220632369169189e-05, + "loss": 0.0, + "step": 51062 + }, + { + "epoch": 4.764672949519455, + "grad_norm": NaN, + "learning_rate": 3.220164090116839e-05, + "loss": 0.0, + "step": 51063 + }, + { + "epoch": 4.7647662592143325, + "grad_norm": NaN, + "learning_rate": 3.219695841017117e-05, + "loss": 0.0, + "step": 51064 + }, + { + "epoch": 4.76485956890921, + "grad_norm": NaN, + "learning_rate": 3.2192276218712195e-05, + "loss": 0.0, + "step": 51065 + }, + { + "epoch": 4.764952878604087, + "grad_norm": NaN, + "learning_rate": 3.218759432680343e-05, + "loss": 0.0, + "step": 51066 + }, + { + "epoch": 4.765046188298964, + "grad_norm": NaN, + "learning_rate": 3.218291273445663e-05, + "loss": 0.0, + "step": 51067 + }, + { + "epoch": 4.765139497993841, + "grad_norm": NaN, + "learning_rate": 3.217823144168382e-05, + "loss": 0.0, + "step": 51068 + }, + { + "epoch": 4.765232807688719, + "grad_norm": NaN, + "learning_rate": 3.21735504484969e-05, + "loss": 0.0, + "step": 51069 + }, + { + "epoch": 4.765326117383596, + "grad_norm": NaN, + "learning_rate": 3.216886975490767e-05, + "loss": 0.0, + "step": 51070 + }, + { + "epoch": 4.7654194270784735, + "grad_norm": NaN, + "learning_rate": 3.2164189360928134e-05, + "loss": 0.0, + "step": 51071 + }, + { + "epoch": 4.765512736773351, + "grad_norm": NaN, + "learning_rate": 3.215950926657016e-05, + "loss": 0.0, + "step": 51072 + }, + { + "epoch": 4.765606046468228, + "grad_norm": NaN, + "learning_rate": 3.215482947184564e-05, + "loss": 0.0, + "step": 51073 + }, + { + "epoch": 4.765699356163106, + "grad_norm": NaN, + "learning_rate": 3.2150149976766495e-05, + "loss": 0.0, + "step": 51074 + }, + { + "epoch": 4.765792665857982, + "grad_norm": NaN, + "learning_rate": 3.21454707813446e-05, + "loss": 0.0, + "step": 51075 + }, + { + "epoch": 4.76588597555286, + "grad_norm": NaN, + "learning_rate": 3.2140791885591857e-05, + "loss": 0.0, + "step": 51076 + }, + { + "epoch": 4.765979285247737, + "grad_norm": NaN, + "learning_rate": 3.213611328952018e-05, + "loss": 0.0, + "step": 51077 + }, + { + "epoch": 4.766072594942615, + "grad_norm": NaN, + "learning_rate": 3.2131434993141436e-05, + "loss": 0.0, + "step": 51078 + }, + { + "epoch": 4.766165904637492, + "grad_norm": NaN, + "learning_rate": 3.212675699646754e-05, + "loss": 0.0, + "step": 51079 + }, + { + "epoch": 4.766259214332369, + "grad_norm": NaN, + "learning_rate": 3.2122079299510394e-05, + "loss": 0.0, + "step": 51080 + }, + { + "epoch": 4.766352524027246, + "grad_norm": NaN, + "learning_rate": 3.211740190228187e-05, + "loss": 0.0, + "step": 51081 + }, + { + "epoch": 4.766445833722123, + "grad_norm": NaN, + "learning_rate": 3.211272480479389e-05, + "loss": 0.0, + "step": 51082 + }, + { + "epoch": 4.766539143417001, + "grad_norm": NaN, + "learning_rate": 3.210804800705832e-05, + "loss": 0.0, + "step": 51083 + }, + { + "epoch": 4.766632453111878, + "grad_norm": NaN, + "learning_rate": 3.210337150908707e-05, + "loss": 0.0, + "step": 51084 + }, + { + "epoch": 4.766725762806756, + "grad_norm": NaN, + "learning_rate": 3.209869531089202e-05, + "loss": 0.0, + "step": 51085 + }, + { + "epoch": 4.766819072501633, + "grad_norm": NaN, + "learning_rate": 3.20940194124851e-05, + "loss": 0.0, + "step": 51086 + }, + { + "epoch": 4.7669123821965105, + "grad_norm": NaN, + "learning_rate": 3.208934381387809e-05, + "loss": 0.0, + "step": 51087 + }, + { + "epoch": 4.767005691891388, + "grad_norm": NaN, + "learning_rate": 3.208466851508298e-05, + "loss": 0.0, + "step": 51088 + }, + { + "epoch": 4.767099001586264, + "grad_norm": NaN, + "learning_rate": 3.207999351611167e-05, + "loss": 0.0, + "step": 51089 + }, + { + "epoch": 4.767192311281142, + "grad_norm": NaN, + "learning_rate": 3.207531881697593e-05, + "loss": 0.0, + "step": 51090 + }, + { + "epoch": 4.767285620976019, + "grad_norm": NaN, + "learning_rate": 3.207064441768776e-05, + "loss": 0.0, + "step": 51091 + }, + { + "epoch": 4.767378930670897, + "grad_norm": NaN, + "learning_rate": 3.2065970318259056e-05, + "loss": 0.0, + "step": 51092 + }, + { + "epoch": 4.767472240365774, + "grad_norm": NaN, + "learning_rate": 3.2061296518701564e-05, + "loss": 0.0, + "step": 51093 + }, + { + "epoch": 4.7675655500606515, + "grad_norm": NaN, + "learning_rate": 3.2056623019027305e-05, + "loss": 0.0, + "step": 51094 + }, + { + "epoch": 4.767658859755529, + "grad_norm": NaN, + "learning_rate": 3.205194981924816e-05, + "loss": 0.0, + "step": 51095 + }, + { + "epoch": 4.7677521694504055, + "grad_norm": NaN, + "learning_rate": 3.204727691937588e-05, + "loss": 0.0, + "step": 51096 + }, + { + "epoch": 4.767845479145283, + "grad_norm": NaN, + "learning_rate": 3.204260431942247e-05, + "loss": 0.0, + "step": 51097 + }, + { + "epoch": 4.76793878884016, + "grad_norm": NaN, + "learning_rate": 3.203793201939982e-05, + "loss": 0.0, + "step": 51098 + }, + { + "epoch": 4.768032098535038, + "grad_norm": NaN, + "learning_rate": 3.20332600193197e-05, + "loss": 0.0, + "step": 51099 + }, + { + "epoch": 4.768125408229915, + "grad_norm": NaN, + "learning_rate": 3.202858831919408e-05, + "loss": 0.0, + "step": 51100 + }, + { + "epoch": 4.768218717924793, + "grad_norm": NaN, + "learning_rate": 3.2023916919034855e-05, + "loss": 0.0, + "step": 51101 + }, + { + "epoch": 4.76831202761967, + "grad_norm": NaN, + "learning_rate": 3.2019245818853784e-05, + "loss": 0.0, + "step": 51102 + }, + { + "epoch": 4.768405337314547, + "grad_norm": NaN, + "learning_rate": 3.201457501866286e-05, + "loss": 0.0, + "step": 51103 + }, + { + "epoch": 4.768498647009424, + "grad_norm": NaN, + "learning_rate": 3.200990451847397e-05, + "loss": 0.0, + "step": 51104 + }, + { + "epoch": 4.768591956704301, + "grad_norm": NaN, + "learning_rate": 3.200523431829885e-05, + "loss": 0.0, + "step": 51105 + }, + { + "epoch": 4.768685266399179, + "grad_norm": NaN, + "learning_rate": 3.200056441814952e-05, + "loss": 0.0, + "step": 51106 + }, + { + "epoch": 4.768778576094056, + "grad_norm": NaN, + "learning_rate": 3.199589481803784e-05, + "loss": 0.0, + "step": 51107 + }, + { + "epoch": 4.768871885788934, + "grad_norm": NaN, + "learning_rate": 3.1991225517975556e-05, + "loss": 0.0, + "step": 51108 + }, + { + "epoch": 4.768965195483811, + "grad_norm": NaN, + "learning_rate": 3.198655651797466e-05, + "loss": 0.0, + "step": 51109 + }, + { + "epoch": 4.769058505178688, + "grad_norm": NaN, + "learning_rate": 3.198188781804701e-05, + "loss": 0.0, + "step": 51110 + }, + { + "epoch": 4.769151814873565, + "grad_norm": NaN, + "learning_rate": 3.197721941820444e-05, + "loss": 0.0, + "step": 51111 + }, + { + "epoch": 4.769245124568442, + "grad_norm": NaN, + "learning_rate": 3.1972551318458854e-05, + "loss": 0.0, + "step": 51112 + }, + { + "epoch": 4.76933843426332, + "grad_norm": NaN, + "learning_rate": 3.196788351882212e-05, + "loss": 0.0, + "step": 51113 + }, + { + "epoch": 4.769431743958197, + "grad_norm": NaN, + "learning_rate": 3.196321601930607e-05, + "loss": 0.0, + "step": 51114 + }, + { + "epoch": 4.769525053653075, + "grad_norm": NaN, + "learning_rate": 3.195854881992262e-05, + "loss": 0.0, + "step": 51115 + }, + { + "epoch": 4.769618363347952, + "grad_norm": NaN, + "learning_rate": 3.1953881920683594e-05, + "loss": 0.0, + "step": 51116 + }, + { + "epoch": 4.7697116730428295, + "grad_norm": NaN, + "learning_rate": 3.194921532160088e-05, + "loss": 0.0, + "step": 51117 + }, + { + "epoch": 4.769804982737707, + "grad_norm": NaN, + "learning_rate": 3.194454902268637e-05, + "loss": 0.0, + "step": 51118 + }, + { + "epoch": 4.7698982924325835, + "grad_norm": NaN, + "learning_rate": 3.1939883023951865e-05, + "loss": 0.0, + "step": 51119 + }, + { + "epoch": 4.769991602127461, + "grad_norm": NaN, + "learning_rate": 3.193521732540928e-05, + "loss": 0.0, + "step": 51120 + }, + { + "epoch": 4.770084911822338, + "grad_norm": NaN, + "learning_rate": 3.1930551927070474e-05, + "loss": 0.0, + "step": 51121 + }, + { + "epoch": 4.770178221517216, + "grad_norm": NaN, + "learning_rate": 3.192588682894729e-05, + "loss": 0.0, + "step": 51122 + }, + { + "epoch": 4.770271531212093, + "grad_norm": NaN, + "learning_rate": 3.19212220310516e-05, + "loss": 0.0, + "step": 51123 + }, + { + "epoch": 4.7703648409069705, + "grad_norm": NaN, + "learning_rate": 3.1916557533395256e-05, + "loss": 0.0, + "step": 51124 + }, + { + "epoch": 4.770458150601847, + "grad_norm": NaN, + "learning_rate": 3.1911893335990136e-05, + "loss": 0.0, + "step": 51125 + }, + { + "epoch": 4.7705514602967245, + "grad_norm": NaN, + "learning_rate": 3.19072294388481e-05, + "loss": 0.0, + "step": 51126 + }, + { + "epoch": 4.770644769991602, + "grad_norm": NaN, + "learning_rate": 3.1902565841980985e-05, + "loss": 0.0, + "step": 51127 + }, + { + "epoch": 4.770738079686479, + "grad_norm": NaN, + "learning_rate": 3.189790254540066e-05, + "loss": 0.0, + "step": 51128 + }, + { + "epoch": 4.770831389381357, + "grad_norm": NaN, + "learning_rate": 3.189323954911899e-05, + "loss": 0.0, + "step": 51129 + }, + { + "epoch": 4.770924699076234, + "grad_norm": NaN, + "learning_rate": 3.188857685314787e-05, + "loss": 0.0, + "step": 51130 + }, + { + "epoch": 4.771018008771112, + "grad_norm": NaN, + "learning_rate": 3.188391445749903e-05, + "loss": 0.0, + "step": 51131 + }, + { + "epoch": 4.771111318465989, + "grad_norm": NaN, + "learning_rate": 3.1879252362184444e-05, + "loss": 0.0, + "step": 51132 + }, + { + "epoch": 4.7712046281608655, + "grad_norm": NaN, + "learning_rate": 3.187459056721597e-05, + "loss": 0.0, + "step": 51133 + }, + { + "epoch": 4.771297937855743, + "grad_norm": NaN, + "learning_rate": 3.1869929072605344e-05, + "loss": 0.0, + "step": 51134 + }, + { + "epoch": 4.77139124755062, + "grad_norm": NaN, + "learning_rate": 3.1865267878364524e-05, + "loss": 0.0, + "step": 51135 + }, + { + "epoch": 4.771484557245498, + "grad_norm": NaN, + "learning_rate": 3.186060698450539e-05, + "loss": 0.0, + "step": 51136 + }, + { + "epoch": 4.771577866940375, + "grad_norm": NaN, + "learning_rate": 3.185594639103964e-05, + "loss": 0.0, + "step": 51137 + }, + { + "epoch": 4.771671176635253, + "grad_norm": NaN, + "learning_rate": 3.185128609797927e-05, + "loss": 0.0, + "step": 51138 + }, + { + "epoch": 4.77176448633013, + "grad_norm": NaN, + "learning_rate": 3.184662610533612e-05, + "loss": 0.0, + "step": 51139 + }, + { + "epoch": 4.771857796025007, + "grad_norm": NaN, + "learning_rate": 3.184196641312192e-05, + "loss": 0.0, + "step": 51140 + }, + { + "epoch": 4.771951105719884, + "grad_norm": NaN, + "learning_rate": 3.1837307021348646e-05, + "loss": 0.0, + "step": 51141 + }, + { + "epoch": 4.772044415414761, + "grad_norm": NaN, + "learning_rate": 3.1832647930028135e-05, + "loss": 0.0, + "step": 51142 + }, + { + "epoch": 4.772137725109639, + "grad_norm": NaN, + "learning_rate": 3.1827989139172125e-05, + "loss": 0.0, + "step": 51143 + }, + { + "epoch": 4.772231034804516, + "grad_norm": NaN, + "learning_rate": 3.1823330648792556e-05, + "loss": 0.0, + "step": 51144 + }, + { + "epoch": 4.772324344499394, + "grad_norm": NaN, + "learning_rate": 3.181867245890126e-05, + "loss": 0.0, + "step": 51145 + }, + { + "epoch": 4.772417654194271, + "grad_norm": NaN, + "learning_rate": 3.1814014569510065e-05, + "loss": 0.0, + "step": 51146 + }, + { + "epoch": 4.7725109638891485, + "grad_norm": NaN, + "learning_rate": 3.1809356980630826e-05, + "loss": 0.0, + "step": 51147 + }, + { + "epoch": 4.772604273584025, + "grad_norm": NaN, + "learning_rate": 3.180469969227539e-05, + "loss": 0.0, + "step": 51148 + }, + { + "epoch": 4.7726975832789025, + "grad_norm": NaN, + "learning_rate": 3.180004270445558e-05, + "loss": 0.0, + "step": 51149 + }, + { + "epoch": 4.77279089297378, + "grad_norm": NaN, + "learning_rate": 3.179538601718324e-05, + "loss": 0.0, + "step": 51150 + }, + { + "epoch": 4.772884202668657, + "grad_norm": NaN, + "learning_rate": 3.179072963047024e-05, + "loss": 0.0, + "step": 51151 + }, + { + "epoch": 4.772977512363535, + "grad_norm": NaN, + "learning_rate": 3.178607354432838e-05, + "loss": 0.0, + "step": 51152 + }, + { + "epoch": 4.773070822058412, + "grad_norm": NaN, + "learning_rate": 3.1781417758769526e-05, + "loss": 0.0, + "step": 51153 + }, + { + "epoch": 4.773164131753289, + "grad_norm": NaN, + "learning_rate": 3.1776762273805514e-05, + "loss": 0.0, + "step": 51154 + }, + { + "epoch": 4.773257441448166, + "grad_norm": NaN, + "learning_rate": 3.177210708944815e-05, + "loss": 0.0, + "step": 51155 + }, + { + "epoch": 4.7733507511430435, + "grad_norm": NaN, + "learning_rate": 3.176745220570932e-05, + "loss": 0.0, + "step": 51156 + }, + { + "epoch": 4.773444060837921, + "grad_norm": NaN, + "learning_rate": 3.1762797622600826e-05, + "loss": 0.0, + "step": 51157 + }, + { + "epoch": 4.773537370532798, + "grad_norm": NaN, + "learning_rate": 3.175814334013453e-05, + "loss": 0.0, + "step": 51158 + }, + { + "epoch": 4.773630680227676, + "grad_norm": NaN, + "learning_rate": 3.1753489358322235e-05, + "loss": 0.0, + "step": 51159 + }, + { + "epoch": 4.773723989922553, + "grad_norm": NaN, + "learning_rate": 3.1748835677175776e-05, + "loss": 0.0, + "step": 51160 + }, + { + "epoch": 4.773817299617431, + "grad_norm": NaN, + "learning_rate": 3.174418229670702e-05, + "loss": 0.0, + "step": 51161 + }, + { + "epoch": 4.773910609312308, + "grad_norm": NaN, + "learning_rate": 3.173952921692777e-05, + "loss": 0.0, + "step": 51162 + }, + { + "epoch": 4.774003919007185, + "grad_norm": NaN, + "learning_rate": 3.173487643784987e-05, + "loss": 0.0, + "step": 51163 + }, + { + "epoch": 4.774097228702062, + "grad_norm": NaN, + "learning_rate": 3.173022395948514e-05, + "loss": 0.0, + "step": 51164 + }, + { + "epoch": 4.774190538396939, + "grad_norm": NaN, + "learning_rate": 3.172557178184542e-05, + "loss": 0.0, + "step": 51165 + }, + { + "epoch": 4.774283848091817, + "grad_norm": NaN, + "learning_rate": 3.172091990494254e-05, + "loss": 0.0, + "step": 51166 + }, + { + "epoch": 4.774377157786694, + "grad_norm": NaN, + "learning_rate": 3.171626832878833e-05, + "loss": 0.0, + "step": 51167 + }, + { + "epoch": 4.774470467481572, + "grad_norm": NaN, + "learning_rate": 3.171161705339459e-05, + "loss": 0.0, + "step": 51168 + }, + { + "epoch": 4.774563777176448, + "grad_norm": NaN, + "learning_rate": 3.170696607877319e-05, + "loss": 0.0, + "step": 51169 + }, + { + "epoch": 4.774657086871326, + "grad_norm": NaN, + "learning_rate": 3.1702315404935916e-05, + "loss": 0.0, + "step": 51170 + }, + { + "epoch": 4.774750396566203, + "grad_norm": NaN, + "learning_rate": 3.169766503189462e-05, + "loss": 0.0, + "step": 51171 + }, + { + "epoch": 4.7748437062610805, + "grad_norm": NaN, + "learning_rate": 3.169301495966112e-05, + "loss": 0.0, + "step": 51172 + }, + { + "epoch": 4.774937015955958, + "grad_norm": NaN, + "learning_rate": 3.1688365188247235e-05, + "loss": 0.0, + "step": 51173 + }, + { + "epoch": 4.775030325650835, + "grad_norm": NaN, + "learning_rate": 3.1683715717664844e-05, + "loss": 0.0, + "step": 51174 + }, + { + "epoch": 4.775123635345713, + "grad_norm": NaN, + "learning_rate": 3.167906654792562e-05, + "loss": 0.0, + "step": 51175 + }, + { + "epoch": 4.77521694504059, + "grad_norm": NaN, + "learning_rate": 3.167441767904152e-05, + "loss": 0.0, + "step": 51176 + }, + { + "epoch": 4.775310254735467, + "grad_norm": NaN, + "learning_rate": 3.166976911102437e-05, + "loss": 0.0, + "step": 51177 + }, + { + "epoch": 4.775403564430344, + "grad_norm": NaN, + "learning_rate": 3.166512084388587e-05, + "loss": 0.0, + "step": 51178 + }, + { + "epoch": 4.7754968741252215, + "grad_norm": NaN, + "learning_rate": 3.166047287763794e-05, + "loss": 0.0, + "step": 51179 + }, + { + "epoch": 4.775590183820099, + "grad_norm": NaN, + "learning_rate": 3.165582521229242e-05, + "loss": 0.0, + "step": 51180 + }, + { + "epoch": 4.775683493514976, + "grad_norm": NaN, + "learning_rate": 3.1651177847861e-05, + "loss": 0.0, + "step": 51181 + }, + { + "epoch": 4.775776803209854, + "grad_norm": NaN, + "learning_rate": 3.164653078435563e-05, + "loss": 0.0, + "step": 51182 + }, + { + "epoch": 4.775870112904731, + "grad_norm": NaN, + "learning_rate": 3.1641884021788055e-05, + "loss": 0.0, + "step": 51183 + }, + { + "epoch": 4.775963422599608, + "grad_norm": NaN, + "learning_rate": 3.1637237560170126e-05, + "loss": 0.0, + "step": 51184 + }, + { + "epoch": 4.776056732294485, + "grad_norm": NaN, + "learning_rate": 3.163259139951362e-05, + "loss": 0.0, + "step": 51185 + }, + { + "epoch": 4.7761500419893625, + "grad_norm": NaN, + "learning_rate": 3.162794553983039e-05, + "loss": 0.0, + "step": 51186 + }, + { + "epoch": 4.77624335168424, + "grad_norm": NaN, + "learning_rate": 3.162329998113222e-05, + "loss": 0.0, + "step": 51187 + }, + { + "epoch": 4.776336661379117, + "grad_norm": NaN, + "learning_rate": 3.1618654723430935e-05, + "loss": 0.0, + "step": 51188 + }, + { + "epoch": 4.776429971073995, + "grad_norm": NaN, + "learning_rate": 3.161400976673834e-05, + "loss": 0.0, + "step": 51189 + }, + { + "epoch": 4.776523280768872, + "grad_norm": NaN, + "learning_rate": 3.1609365111066254e-05, + "loss": 0.0, + "step": 51190 + }, + { + "epoch": 4.77661659046375, + "grad_norm": NaN, + "learning_rate": 3.1604720756426484e-05, + "loss": 0.0, + "step": 51191 + }, + { + "epoch": 4.776709900158626, + "grad_norm": NaN, + "learning_rate": 3.1600076702830836e-05, + "loss": 0.0, + "step": 51192 + }, + { + "epoch": 4.776803209853504, + "grad_norm": NaN, + "learning_rate": 3.1595432950291115e-05, + "loss": 0.0, + "step": 51193 + }, + { + "epoch": 4.776896519548381, + "grad_norm": NaN, + "learning_rate": 3.159078949881914e-05, + "loss": 0.0, + "step": 51194 + }, + { + "epoch": 4.776989829243258, + "grad_norm": NaN, + "learning_rate": 3.1586146348426716e-05, + "loss": 0.0, + "step": 51195 + }, + { + "epoch": 4.777083138938136, + "grad_norm": NaN, + "learning_rate": 3.1581503499125634e-05, + "loss": 0.0, + "step": 51196 + }, + { + "epoch": 4.777176448633013, + "grad_norm": NaN, + "learning_rate": 3.157686095092771e-05, + "loss": 0.0, + "step": 51197 + }, + { + "epoch": 4.77726975832789, + "grad_norm": NaN, + "learning_rate": 3.157221870384476e-05, + "loss": 0.0, + "step": 51198 + }, + { + "epoch": 4.777363068022767, + "grad_norm": NaN, + "learning_rate": 3.1567576757888564e-05, + "loss": 0.0, + "step": 51199 + }, + { + "epoch": 4.777456377717645, + "grad_norm": NaN, + "learning_rate": 3.156293511307096e-05, + "loss": 0.0, + "step": 51200 + }, + { + "epoch": 4.777549687412522, + "grad_norm": NaN, + "learning_rate": 3.1558293769403706e-05, + "loss": 0.0, + "step": 51201 + }, + { + "epoch": 4.7776429971073995, + "grad_norm": NaN, + "learning_rate": 3.1553652726898635e-05, + "loss": 0.0, + "step": 51202 + }, + { + "epoch": 4.777736306802277, + "grad_norm": NaN, + "learning_rate": 3.154901198556753e-05, + "loss": 0.0, + "step": 51203 + }, + { + "epoch": 4.777829616497154, + "grad_norm": NaN, + "learning_rate": 3.154437154542221e-05, + "loss": 0.0, + "step": 51204 + }, + { + "epoch": 4.777922926192032, + "grad_norm": NaN, + "learning_rate": 3.1539731406474466e-05, + "loss": 0.0, + "step": 51205 + }, + { + "epoch": 4.778016235886908, + "grad_norm": NaN, + "learning_rate": 3.1535091568736096e-05, + "loss": 0.0, + "step": 51206 + }, + { + "epoch": 4.778109545581786, + "grad_norm": NaN, + "learning_rate": 3.153045203221888e-05, + "loss": 0.0, + "step": 51207 + }, + { + "epoch": 4.778202855276663, + "grad_norm": NaN, + "learning_rate": 3.152581279693464e-05, + "loss": 0.0, + "step": 51208 + }, + { + "epoch": 4.7782961649715405, + "grad_norm": NaN, + "learning_rate": 3.1521173862895176e-05, + "loss": 0.0, + "step": 51209 + }, + { + "epoch": 4.778389474666418, + "grad_norm": NaN, + "learning_rate": 3.151653523011226e-05, + "loss": 0.0, + "step": 51210 + }, + { + "epoch": 4.778482784361295, + "grad_norm": NaN, + "learning_rate": 3.1511896898597685e-05, + "loss": 0.0, + "step": 51211 + }, + { + "epoch": 4.778576094056173, + "grad_norm": NaN, + "learning_rate": 3.1507258868363284e-05, + "loss": 0.0, + "step": 51212 + }, + { + "epoch": 4.778669403751049, + "grad_norm": NaN, + "learning_rate": 3.15026211394208e-05, + "loss": 0.0, + "step": 51213 + }, + { + "epoch": 4.778762713445927, + "grad_norm": NaN, + "learning_rate": 3.1497983711782056e-05, + "loss": 0.0, + "step": 51214 + }, + { + "epoch": 4.778856023140804, + "grad_norm": NaN, + "learning_rate": 3.1493346585458844e-05, + "loss": 0.0, + "step": 51215 + }, + { + "epoch": 4.778949332835682, + "grad_norm": NaN, + "learning_rate": 3.148870976046295e-05, + "loss": 0.0, + "step": 51216 + }, + { + "epoch": 4.779042642530559, + "grad_norm": NaN, + "learning_rate": 3.148407323680614e-05, + "loss": 0.0, + "step": 51217 + }, + { + "epoch": 4.779135952225436, + "grad_norm": NaN, + "learning_rate": 3.147943701450029e-05, + "loss": 0.0, + "step": 51218 + }, + { + "epoch": 4.779229261920314, + "grad_norm": NaN, + "learning_rate": 3.147480109355703e-05, + "loss": 0.0, + "step": 51219 + }, + { + "epoch": 4.779322571615191, + "grad_norm": NaN, + "learning_rate": 3.1470165473988273e-05, + "loss": 0.0, + "step": 51220 + }, + { + "epoch": 4.779415881310068, + "grad_norm": NaN, + "learning_rate": 3.146553015580578e-05, + "loss": 0.0, + "step": 51221 + }, + { + "epoch": 4.779509191004945, + "grad_norm": NaN, + "learning_rate": 3.146089513902135e-05, + "loss": 0.0, + "step": 51222 + }, + { + "epoch": 4.779602500699823, + "grad_norm": NaN, + "learning_rate": 3.145626042364674e-05, + "loss": 0.0, + "step": 51223 + }, + { + "epoch": 4.7796958103947, + "grad_norm": NaN, + "learning_rate": 3.1451626009693736e-05, + "loss": 0.0, + "step": 51224 + }, + { + "epoch": 4.7797891200895775, + "grad_norm": NaN, + "learning_rate": 3.1446991897174134e-05, + "loss": 0.0, + "step": 51225 + }, + { + "epoch": 4.779882429784455, + "grad_norm": NaN, + "learning_rate": 3.14423580860997e-05, + "loss": 0.0, + "step": 51226 + }, + { + "epoch": 4.779975739479331, + "grad_norm": NaN, + "learning_rate": 3.143772457648224e-05, + "loss": 0.0, + "step": 51227 + }, + { + "epoch": 4.780069049174209, + "grad_norm": NaN, + "learning_rate": 3.1433091368333535e-05, + "loss": 0.0, + "step": 51228 + }, + { + "epoch": 4.780162358869086, + "grad_norm": NaN, + "learning_rate": 3.142845846166535e-05, + "loss": 0.0, + "step": 51229 + }, + { + "epoch": 4.780255668563964, + "grad_norm": NaN, + "learning_rate": 3.1423825856489485e-05, + "loss": 0.0, + "step": 51230 + }, + { + "epoch": 4.780348978258841, + "grad_norm": NaN, + "learning_rate": 3.1419193552817686e-05, + "loss": 0.0, + "step": 51231 + }, + { + "epoch": 4.7804422879537185, + "grad_norm": NaN, + "learning_rate": 3.1414561550661766e-05, + "loss": 0.0, + "step": 51232 + }, + { + "epoch": 4.780535597648596, + "grad_norm": NaN, + "learning_rate": 3.140992985003347e-05, + "loss": 0.0, + "step": 51233 + }, + { + "epoch": 4.780628907343473, + "grad_norm": NaN, + "learning_rate": 3.140529845094461e-05, + "loss": 0.0, + "step": 51234 + }, + { + "epoch": 4.780722217038351, + "grad_norm": NaN, + "learning_rate": 3.1400667353406935e-05, + "loss": 0.0, + "step": 51235 + }, + { + "epoch": 4.780815526733227, + "grad_norm": NaN, + "learning_rate": 3.139603655743224e-05, + "loss": 0.0, + "step": 51236 + }, + { + "epoch": 4.780908836428105, + "grad_norm": NaN, + "learning_rate": 3.139140606303229e-05, + "loss": 0.0, + "step": 51237 + }, + { + "epoch": 4.781002146122982, + "grad_norm": NaN, + "learning_rate": 3.138677587021885e-05, + "loss": 0.0, + "step": 51238 + }, + { + "epoch": 4.7810954558178596, + "grad_norm": NaN, + "learning_rate": 3.138214597900372e-05, + "loss": 0.0, + "step": 51239 + }, + { + "epoch": 4.781188765512737, + "grad_norm": NaN, + "learning_rate": 3.137751638939864e-05, + "loss": 0.0, + "step": 51240 + }, + { + "epoch": 4.781282075207614, + "grad_norm": NaN, + "learning_rate": 3.1372887101415414e-05, + "loss": 0.0, + "step": 51241 + }, + { + "epoch": 4.781375384902491, + "grad_norm": NaN, + "learning_rate": 3.136825811506578e-05, + "loss": 0.0, + "step": 51242 + }, + { + "epoch": 4.781468694597368, + "grad_norm": NaN, + "learning_rate": 3.136362943036153e-05, + "loss": 0.0, + "step": 51243 + }, + { + "epoch": 4.781562004292246, + "grad_norm": NaN, + "learning_rate": 3.135900104731442e-05, + "loss": 0.0, + "step": 51244 + }, + { + "epoch": 4.781655313987123, + "grad_norm": NaN, + "learning_rate": 3.135437296593624e-05, + "loss": 0.0, + "step": 51245 + }, + { + "epoch": 4.781748623682001, + "grad_norm": NaN, + "learning_rate": 3.134974518623873e-05, + "loss": 0.0, + "step": 51246 + }, + { + "epoch": 4.781841933376878, + "grad_norm": NaN, + "learning_rate": 3.134511770823369e-05, + "loss": 0.0, + "step": 51247 + }, + { + "epoch": 4.781935243071755, + "grad_norm": NaN, + "learning_rate": 3.134049053193285e-05, + "loss": 0.0, + "step": 51248 + }, + { + "epoch": 4.782028552766633, + "grad_norm": NaN, + "learning_rate": 3.1335863657348e-05, + "loss": 0.0, + "step": 51249 + }, + { + "epoch": 4.782121862461509, + "grad_norm": NaN, + "learning_rate": 3.133123708449089e-05, + "loss": 0.0, + "step": 51250 + }, + { + "epoch": 4.782215172156387, + "grad_norm": NaN, + "learning_rate": 3.1326610813373315e-05, + "loss": 0.0, + "step": 51251 + }, + { + "epoch": 4.782308481851264, + "grad_norm": NaN, + "learning_rate": 3.1321984844007e-05, + "loss": 0.0, + "step": 51252 + }, + { + "epoch": 4.782401791546142, + "grad_norm": NaN, + "learning_rate": 3.131735917640371e-05, + "loss": 0.0, + "step": 51253 + }, + { + "epoch": 4.782495101241019, + "grad_norm": NaN, + "learning_rate": 3.131273381057523e-05, + "loss": 0.0, + "step": 51254 + }, + { + "epoch": 4.7825884109358965, + "grad_norm": NaN, + "learning_rate": 3.130810874653331e-05, + "loss": 0.0, + "step": 51255 + }, + { + "epoch": 4.782681720630774, + "grad_norm": NaN, + "learning_rate": 3.130348398428971e-05, + "loss": 0.0, + "step": 51256 + }, + { + "epoch": 4.78277503032565, + "grad_norm": NaN, + "learning_rate": 3.129885952385621e-05, + "loss": 0.0, + "step": 51257 + }, + { + "epoch": 4.782868340020528, + "grad_norm": NaN, + "learning_rate": 3.129423536524452e-05, + "loss": 0.0, + "step": 51258 + }, + { + "epoch": 4.782961649715405, + "grad_norm": NaN, + "learning_rate": 3.128961150846644e-05, + "loss": 0.0, + "step": 51259 + }, + { + "epoch": 4.783054959410283, + "grad_norm": NaN, + "learning_rate": 3.128498795353371e-05, + "loss": 0.0, + "step": 51260 + }, + { + "epoch": 4.78314826910516, + "grad_norm": NaN, + "learning_rate": 3.1280364700458085e-05, + "loss": 0.0, + "step": 51261 + }, + { + "epoch": 4.7832415788000375, + "grad_norm": NaN, + "learning_rate": 3.127574174925132e-05, + "loss": 0.0, + "step": 51262 + }, + { + "epoch": 4.783334888494915, + "grad_norm": NaN, + "learning_rate": 3.127111909992519e-05, + "loss": 0.0, + "step": 51263 + }, + { + "epoch": 4.783428198189792, + "grad_norm": NaN, + "learning_rate": 3.126649675249143e-05, + "loss": 0.0, + "step": 51264 + }, + { + "epoch": 4.783521507884669, + "grad_norm": NaN, + "learning_rate": 3.126187470696179e-05, + "loss": 0.0, + "step": 51265 + }, + { + "epoch": 4.783614817579546, + "grad_norm": NaN, + "learning_rate": 3.125725296334803e-05, + "loss": 0.0, + "step": 51266 + }, + { + "epoch": 4.783708127274424, + "grad_norm": NaN, + "learning_rate": 3.1252631521661915e-05, + "loss": 0.0, + "step": 51267 + }, + { + "epoch": 4.783801436969301, + "grad_norm": NaN, + "learning_rate": 3.1248010381915166e-05, + "loss": 0.0, + "step": 51268 + }, + { + "epoch": 4.783894746664179, + "grad_norm": NaN, + "learning_rate": 3.124338954411957e-05, + "loss": 0.0, + "step": 51269 + }, + { + "epoch": 4.783988056359056, + "grad_norm": NaN, + "learning_rate": 3.123876900828684e-05, + "loss": 0.0, + "step": 51270 + }, + { + "epoch": 4.7840813660539325, + "grad_norm": NaN, + "learning_rate": 3.123414877442874e-05, + "loss": 0.0, + "step": 51271 + }, + { + "epoch": 4.78417467574881, + "grad_norm": NaN, + "learning_rate": 3.122952884255703e-05, + "loss": 0.0, + "step": 51272 + }, + { + "epoch": 4.784267985443687, + "grad_norm": NaN, + "learning_rate": 3.122490921268344e-05, + "loss": 0.0, + "step": 51273 + }, + { + "epoch": 4.784361295138565, + "grad_norm": NaN, + "learning_rate": 3.1220289884819724e-05, + "loss": 0.0, + "step": 51274 + }, + { + "epoch": 4.784454604833442, + "grad_norm": NaN, + "learning_rate": 3.121567085897761e-05, + "loss": 0.0, + "step": 51275 + }, + { + "epoch": 4.78454791452832, + "grad_norm": NaN, + "learning_rate": 3.121105213516888e-05, + "loss": 0.0, + "step": 51276 + }, + { + "epoch": 4.784641224223197, + "grad_norm": NaN, + "learning_rate": 3.120643371340526e-05, + "loss": 0.0, + "step": 51277 + }, + { + "epoch": 4.7847345339180745, + "grad_norm": NaN, + "learning_rate": 3.120181559369846e-05, + "loss": 0.0, + "step": 51278 + }, + { + "epoch": 4.784827843612952, + "grad_norm": NaN, + "learning_rate": 3.1197197776060265e-05, + "loss": 0.0, + "step": 51279 + }, + { + "epoch": 4.784921153307828, + "grad_norm": NaN, + "learning_rate": 3.119258026050242e-05, + "loss": 0.0, + "step": 51280 + }, + { + "epoch": 4.785014463002706, + "grad_norm": NaN, + "learning_rate": 3.118796304703664e-05, + "loss": 0.0, + "step": 51281 + }, + { + "epoch": 4.785107772697583, + "grad_norm": NaN, + "learning_rate": 3.118334613567467e-05, + "loss": 0.0, + "step": 51282 + }, + { + "epoch": 4.785201082392461, + "grad_norm": NaN, + "learning_rate": 3.1178729526428266e-05, + "loss": 0.0, + "step": 51283 + }, + { + "epoch": 4.785294392087338, + "grad_norm": NaN, + "learning_rate": 3.117411321930915e-05, + "loss": 0.0, + "step": 51284 + }, + { + "epoch": 4.7853877017822155, + "grad_norm": NaN, + "learning_rate": 3.116949721432906e-05, + "loss": 0.0, + "step": 51285 + }, + { + "epoch": 4.785481011477092, + "grad_norm": NaN, + "learning_rate": 3.1164881511499736e-05, + "loss": 0.0, + "step": 51286 + }, + { + "epoch": 4.7855743211719695, + "grad_norm": NaN, + "learning_rate": 3.1160266110832934e-05, + "loss": 0.0, + "step": 51287 + }, + { + "epoch": 4.785667630866847, + "grad_norm": NaN, + "learning_rate": 3.115565101234035e-05, + "loss": 0.0, + "step": 51288 + }, + { + "epoch": 4.785760940561724, + "grad_norm": NaN, + "learning_rate": 3.115103621603376e-05, + "loss": 0.0, + "step": 51289 + }, + { + "epoch": 4.785854250256602, + "grad_norm": NaN, + "learning_rate": 3.114642172192487e-05, + "loss": 0.0, + "step": 51290 + }, + { + "epoch": 4.785947559951479, + "grad_norm": NaN, + "learning_rate": 3.114180753002542e-05, + "loss": 0.0, + "step": 51291 + }, + { + "epoch": 4.7860408696463566, + "grad_norm": NaN, + "learning_rate": 3.1137193640347155e-05, + "loss": 0.0, + "step": 51292 + }, + { + "epoch": 4.786134179341234, + "grad_norm": NaN, + "learning_rate": 3.113258005290179e-05, + "loss": 0.0, + "step": 51293 + }, + { + "epoch": 4.7862274890361105, + "grad_norm": NaN, + "learning_rate": 3.112796676770106e-05, + "loss": 0.0, + "step": 51294 + }, + { + "epoch": 4.786320798730988, + "grad_norm": NaN, + "learning_rate": 3.112335378475669e-05, + "loss": 0.0, + "step": 51295 + }, + { + "epoch": 4.786414108425865, + "grad_norm": NaN, + "learning_rate": 3.1118741104080443e-05, + "loss": 0.0, + "step": 51296 + }, + { + "epoch": 4.786507418120743, + "grad_norm": NaN, + "learning_rate": 3.111412872568401e-05, + "loss": 0.0, + "step": 51297 + }, + { + "epoch": 4.78660072781562, + "grad_norm": NaN, + "learning_rate": 3.110951664957913e-05, + "loss": 0.0, + "step": 51298 + }, + { + "epoch": 4.786694037510498, + "grad_norm": NaN, + "learning_rate": 3.110490487577752e-05, + "loss": 0.0, + "step": 51299 + }, + { + "epoch": 4.786787347205375, + "grad_norm": NaN, + "learning_rate": 3.110029340429093e-05, + "loss": 0.0, + "step": 51300 + }, + { + "epoch": 4.7868806569002516, + "grad_norm": NaN, + "learning_rate": 3.109568223513107e-05, + "loss": 0.0, + "step": 51301 + }, + { + "epoch": 4.786973966595129, + "grad_norm": NaN, + "learning_rate": 3.109107136830967e-05, + "loss": 0.0, + "step": 51302 + }, + { + "epoch": 4.787067276290006, + "grad_norm": NaN, + "learning_rate": 3.1086460803838446e-05, + "loss": 0.0, + "step": 51303 + }, + { + "epoch": 4.787160585984884, + "grad_norm": NaN, + "learning_rate": 3.1081850541729145e-05, + "loss": 0.0, + "step": 51304 + }, + { + "epoch": 4.787253895679761, + "grad_norm": NaN, + "learning_rate": 3.107724058199346e-05, + "loss": 0.0, + "step": 51305 + }, + { + "epoch": 4.787347205374639, + "grad_norm": NaN, + "learning_rate": 3.1072630924643114e-05, + "loss": 0.0, + "step": 51306 + }, + { + "epoch": 4.787440515069516, + "grad_norm": NaN, + "learning_rate": 3.106802156968986e-05, + "loss": 0.0, + "step": 51307 + }, + { + "epoch": 4.7875338247643935, + "grad_norm": NaN, + "learning_rate": 3.1063412517145386e-05, + "loss": 0.0, + "step": 51308 + }, + { + "epoch": 4.78762713445927, + "grad_norm": NaN, + "learning_rate": 3.105880376702142e-05, + "loss": 0.0, + "step": 51309 + }, + { + "epoch": 4.787720444154147, + "grad_norm": NaN, + "learning_rate": 3.105419531932968e-05, + "loss": 0.0, + "step": 51310 + }, + { + "epoch": 4.787813753849025, + "grad_norm": NaN, + "learning_rate": 3.104958717408189e-05, + "loss": 0.0, + "step": 51311 + }, + { + "epoch": 4.787907063543902, + "grad_norm": NaN, + "learning_rate": 3.1044979331289765e-05, + "loss": 0.0, + "step": 51312 + }, + { + "epoch": 4.78800037323878, + "grad_norm": NaN, + "learning_rate": 3.104037179096503e-05, + "loss": 0.0, + "step": 51313 + }, + { + "epoch": 4.788093682933657, + "grad_norm": NaN, + "learning_rate": 3.103576455311939e-05, + "loss": 0.0, + "step": 51314 + }, + { + "epoch": 4.788186992628534, + "grad_norm": NaN, + "learning_rate": 3.103115761776454e-05, + "loss": 0.0, + "step": 51315 + }, + { + "epoch": 4.788280302323411, + "grad_norm": NaN, + "learning_rate": 3.102655098491223e-05, + "loss": 0.0, + "step": 51316 + }, + { + "epoch": 4.7883736120182885, + "grad_norm": NaN, + "learning_rate": 3.102194465457417e-05, + "loss": 0.0, + "step": 51317 + }, + { + "epoch": 4.788466921713166, + "grad_norm": NaN, + "learning_rate": 3.101733862676204e-05, + "loss": 0.0, + "step": 51318 + }, + { + "epoch": 4.788560231408043, + "grad_norm": NaN, + "learning_rate": 3.1012732901487574e-05, + "loss": 0.0, + "step": 51319 + }, + { + "epoch": 4.788653541102921, + "grad_norm": NaN, + "learning_rate": 3.1008127478762496e-05, + "loss": 0.0, + "step": 51320 + }, + { + "epoch": 4.788746850797798, + "grad_norm": NaN, + "learning_rate": 3.1003522358598495e-05, + "loss": 0.0, + "step": 51321 + }, + { + "epoch": 4.788840160492676, + "grad_norm": NaN, + "learning_rate": 3.099891754100729e-05, + "loss": 0.0, + "step": 51322 + }, + { + "epoch": 4.788933470187552, + "grad_norm": NaN, + "learning_rate": 3.0994313026000575e-05, + "loss": 0.0, + "step": 51323 + }, + { + "epoch": 4.7890267798824295, + "grad_norm": NaN, + "learning_rate": 3.098970881359008e-05, + "loss": 0.0, + "step": 51324 + }, + { + "epoch": 4.789120089577307, + "grad_norm": NaN, + "learning_rate": 3.09851049037875e-05, + "loss": 0.0, + "step": 51325 + }, + { + "epoch": 4.789213399272184, + "grad_norm": NaN, + "learning_rate": 3.098050129660454e-05, + "loss": 0.0, + "step": 51326 + }, + { + "epoch": 4.789306708967062, + "grad_norm": NaN, + "learning_rate": 3.097589799205292e-05, + "loss": 0.0, + "step": 51327 + }, + { + "epoch": 4.789400018661939, + "grad_norm": NaN, + "learning_rate": 3.097129499014432e-05, + "loss": 0.0, + "step": 51328 + }, + { + "epoch": 4.789493328356817, + "grad_norm": NaN, + "learning_rate": 3.096669229089047e-05, + "loss": 0.0, + "step": 51329 + }, + { + "epoch": 4.789586638051693, + "grad_norm": NaN, + "learning_rate": 3.096208989430305e-05, + "loss": 0.0, + "step": 51330 + }, + { + "epoch": 4.789679947746571, + "grad_norm": NaN, + "learning_rate": 3.0957487800393796e-05, + "loss": 0.0, + "step": 51331 + }, + { + "epoch": 4.789773257441448, + "grad_norm": NaN, + "learning_rate": 3.095288600917436e-05, + "loss": 0.0, + "step": 51332 + }, + { + "epoch": 4.789866567136325, + "grad_norm": NaN, + "learning_rate": 3.094828452065649e-05, + "loss": 0.0, + "step": 51333 + }, + { + "epoch": 4.789959876831203, + "grad_norm": NaN, + "learning_rate": 3.0943683334851857e-05, + "loss": 0.0, + "step": 51334 + }, + { + "epoch": 4.79005318652608, + "grad_norm": NaN, + "learning_rate": 3.093908245177217e-05, + "loss": 0.0, + "step": 51335 + }, + { + "epoch": 4.790146496220958, + "grad_norm": NaN, + "learning_rate": 3.0934481871429124e-05, + "loss": 0.0, + "step": 51336 + }, + { + "epoch": 4.790239805915835, + "grad_norm": NaN, + "learning_rate": 3.0929881593834435e-05, + "loss": 0.0, + "step": 51337 + }, + { + "epoch": 4.790333115610712, + "grad_norm": NaN, + "learning_rate": 3.0925281618999774e-05, + "loss": 0.0, + "step": 51338 + }, + { + "epoch": 4.790426425305589, + "grad_norm": NaN, + "learning_rate": 3.092068194693684e-05, + "loss": 0.0, + "step": 51339 + }, + { + "epoch": 4.7905197350004665, + "grad_norm": NaN, + "learning_rate": 3.091608257765736e-05, + "loss": 0.0, + "step": 51340 + }, + { + "epoch": 4.790613044695344, + "grad_norm": NaN, + "learning_rate": 3.0911483511173e-05, + "loss": 0.0, + "step": 51341 + }, + { + "epoch": 4.790706354390221, + "grad_norm": NaN, + "learning_rate": 3.0906884747495446e-05, + "loss": 0.0, + "step": 51342 + }, + { + "epoch": 4.790799664085099, + "grad_norm": NaN, + "learning_rate": 3.0902286286636416e-05, + "loss": 0.0, + "step": 51343 + }, + { + "epoch": 4.790892973779975, + "grad_norm": NaN, + "learning_rate": 3.08976881286076e-05, + "loss": 0.0, + "step": 51344 + }, + { + "epoch": 4.790986283474853, + "grad_norm": NaN, + "learning_rate": 3.089309027342069e-05, + "loss": 0.0, + "step": 51345 + }, + { + "epoch": 4.79107959316973, + "grad_norm": NaN, + "learning_rate": 3.0888492721087346e-05, + "loss": 0.0, + "step": 51346 + }, + { + "epoch": 4.7911729028646075, + "grad_norm": NaN, + "learning_rate": 3.08838954716193e-05, + "loss": 0.0, + "step": 51347 + }, + { + "epoch": 4.791266212559485, + "grad_norm": NaN, + "learning_rate": 3.087929852502822e-05, + "loss": 0.0, + "step": 51348 + }, + { + "epoch": 4.791359522254362, + "grad_norm": NaN, + "learning_rate": 3.087470188132578e-05, + "loss": 0.0, + "step": 51349 + }, + { + "epoch": 4.79145283194924, + "grad_norm": NaN, + "learning_rate": 3.0870105540523696e-05, + "loss": 0.0, + "step": 51350 + }, + { + "epoch": 4.791546141644117, + "grad_norm": NaN, + "learning_rate": 3.086550950263364e-05, + "loss": 0.0, + "step": 51351 + }, + { + "epoch": 4.791639451338995, + "grad_norm": NaN, + "learning_rate": 3.0860913767667326e-05, + "loss": 0.0, + "step": 51352 + }, + { + "epoch": 4.791732761033871, + "grad_norm": NaN, + "learning_rate": 3.0856318335636406e-05, + "loss": 0.0, + "step": 51353 + }, + { + "epoch": 4.7918260707287486, + "grad_norm": NaN, + "learning_rate": 3.0851723206552566e-05, + "loss": 0.0, + "step": 51354 + }, + { + "epoch": 4.791919380423626, + "grad_norm": NaN, + "learning_rate": 3.08471283804275e-05, + "loss": 0.0, + "step": 51355 + }, + { + "epoch": 4.792012690118503, + "grad_norm": NaN, + "learning_rate": 3.0842533857272896e-05, + "loss": 0.0, + "step": 51356 + }, + { + "epoch": 4.792105999813381, + "grad_norm": NaN, + "learning_rate": 3.0837939637100424e-05, + "loss": 0.0, + "step": 51357 + }, + { + "epoch": 4.792199309508258, + "grad_norm": NaN, + "learning_rate": 3.0833345719921774e-05, + "loss": 0.0, + "step": 51358 + }, + { + "epoch": 4.792292619203135, + "grad_norm": NaN, + "learning_rate": 3.0828752105748635e-05, + "loss": 0.0, + "step": 51359 + }, + { + "epoch": 4.792385928898012, + "grad_norm": NaN, + "learning_rate": 3.0824158794592684e-05, + "loss": 0.0, + "step": 51360 + }, + { + "epoch": 4.79247923859289, + "grad_norm": NaN, + "learning_rate": 3.081956578646559e-05, + "loss": 0.0, + "step": 51361 + }, + { + "epoch": 4.792572548287767, + "grad_norm": NaN, + "learning_rate": 3.081497308137902e-05, + "loss": 0.0, + "step": 51362 + }, + { + "epoch": 4.792665857982644, + "grad_norm": NaN, + "learning_rate": 3.081038067934468e-05, + "loss": 0.0, + "step": 51363 + }, + { + "epoch": 4.792759167677522, + "grad_norm": NaN, + "learning_rate": 3.080578858037423e-05, + "loss": 0.0, + "step": 51364 + }, + { + "epoch": 4.792852477372399, + "grad_norm": NaN, + "learning_rate": 3.080119678447937e-05, + "loss": 0.0, + "step": 51365 + }, + { + "epoch": 4.792945787067277, + "grad_norm": NaN, + "learning_rate": 3.0796605291671746e-05, + "loss": 0.0, + "step": 51366 + }, + { + "epoch": 4.793039096762153, + "grad_norm": NaN, + "learning_rate": 3.079201410196303e-05, + "loss": 0.0, + "step": 51367 + }, + { + "epoch": 4.793132406457031, + "grad_norm": NaN, + "learning_rate": 3.078742321536492e-05, + "loss": 0.0, + "step": 51368 + }, + { + "epoch": 4.793225716151908, + "grad_norm": NaN, + "learning_rate": 3.0782832631889086e-05, + "loss": 0.0, + "step": 51369 + }, + { + "epoch": 4.7933190258467855, + "grad_norm": NaN, + "learning_rate": 3.077824235154719e-05, + "loss": 0.0, + "step": 51370 + }, + { + "epoch": 4.793412335541663, + "grad_norm": NaN, + "learning_rate": 3.0773652374350916e-05, + "loss": 0.0, + "step": 51371 + }, + { + "epoch": 4.79350564523654, + "grad_norm": NaN, + "learning_rate": 3.0769062700311905e-05, + "loss": 0.0, + "step": 51372 + }, + { + "epoch": 4.793598954931418, + "grad_norm": NaN, + "learning_rate": 3.076447332944188e-05, + "loss": 0.0, + "step": 51373 + }, + { + "epoch": 4.793692264626294, + "grad_norm": NaN, + "learning_rate": 3.075988426175245e-05, + "loss": 0.0, + "step": 51374 + }, + { + "epoch": 4.793785574321172, + "grad_norm": NaN, + "learning_rate": 3.075529549725532e-05, + "loss": 0.0, + "step": 51375 + }, + { + "epoch": 4.793878884016049, + "grad_norm": NaN, + "learning_rate": 3.0750707035962155e-05, + "loss": 0.0, + "step": 51376 + }, + { + "epoch": 4.7939721937109265, + "grad_norm": NaN, + "learning_rate": 3.074611887788462e-05, + "loss": 0.0, + "step": 51377 + }, + { + "epoch": 4.794065503405804, + "grad_norm": NaN, + "learning_rate": 3.074153102303438e-05, + "loss": 0.0, + "step": 51378 + }, + { + "epoch": 4.794158813100681, + "grad_norm": NaN, + "learning_rate": 3.073694347142309e-05, + "loss": 0.0, + "step": 51379 + }, + { + "epoch": 4.794252122795559, + "grad_norm": NaN, + "learning_rate": 3.073235622306244e-05, + "loss": 0.0, + "step": 51380 + }, + { + "epoch": 4.794345432490436, + "grad_norm": NaN, + "learning_rate": 3.0727769277964056e-05, + "loss": 0.0, + "step": 51381 + }, + { + "epoch": 4.794438742185313, + "grad_norm": NaN, + "learning_rate": 3.072318263613964e-05, + "loss": 0.0, + "step": 51382 + }, + { + "epoch": 4.79453205188019, + "grad_norm": NaN, + "learning_rate": 3.071859629760083e-05, + "loss": 0.0, + "step": 51383 + }, + { + "epoch": 4.794625361575068, + "grad_norm": NaN, + "learning_rate": 3.07140102623593e-05, + "loss": 0.0, + "step": 51384 + }, + { + "epoch": 4.794718671269945, + "grad_norm": NaN, + "learning_rate": 3.0709424530426716e-05, + "loss": 0.0, + "step": 51385 + }, + { + "epoch": 4.794811980964822, + "grad_norm": NaN, + "learning_rate": 3.070483910181472e-05, + "loss": 0.0, + "step": 51386 + }, + { + "epoch": 4.7949052906597, + "grad_norm": NaN, + "learning_rate": 3.070025397653498e-05, + "loss": 0.0, + "step": 51387 + }, + { + "epoch": 4.794998600354576, + "grad_norm": NaN, + "learning_rate": 3.0695669154599146e-05, + "loss": 0.0, + "step": 51388 + }, + { + "epoch": 4.795091910049454, + "grad_norm": NaN, + "learning_rate": 3.06910846360189e-05, + "loss": 0.0, + "step": 51389 + }, + { + "epoch": 4.795185219744331, + "grad_norm": NaN, + "learning_rate": 3.068650042080588e-05, + "loss": 0.0, + "step": 51390 + }, + { + "epoch": 4.795278529439209, + "grad_norm": NaN, + "learning_rate": 3.0681916508971736e-05, + "loss": 0.0, + "step": 51391 + }, + { + "epoch": 4.795371839134086, + "grad_norm": NaN, + "learning_rate": 3.067733290052814e-05, + "loss": 0.0, + "step": 51392 + }, + { + "epoch": 4.7954651488289635, + "grad_norm": NaN, + "learning_rate": 3.067274959548674e-05, + "loss": 0.0, + "step": 51393 + }, + { + "epoch": 4.795558458523841, + "grad_norm": NaN, + "learning_rate": 3.0668166593859196e-05, + "loss": 0.0, + "step": 51394 + }, + { + "epoch": 4.795651768218718, + "grad_norm": NaN, + "learning_rate": 3.066358389565716e-05, + "loss": 0.0, + "step": 51395 + }, + { + "epoch": 4.795745077913596, + "grad_norm": NaN, + "learning_rate": 3.065900150089227e-05, + "loss": 0.0, + "step": 51396 + }, + { + "epoch": 4.795838387608472, + "grad_norm": NaN, + "learning_rate": 3.0654419409576196e-05, + "loss": 0.0, + "step": 51397 + }, + { + "epoch": 4.79593169730335, + "grad_norm": NaN, + "learning_rate": 3.0649837621720564e-05, + "loss": 0.0, + "step": 51398 + }, + { + "epoch": 4.796025006998227, + "grad_norm": NaN, + "learning_rate": 3.064525613733706e-05, + "loss": 0.0, + "step": 51399 + }, + { + "epoch": 4.7961183166931045, + "grad_norm": NaN, + "learning_rate": 3.064067495643731e-05, + "loss": 0.0, + "step": 51400 + }, + { + "epoch": 4.796211626387982, + "grad_norm": NaN, + "learning_rate": 3.063609407903296e-05, + "loss": 0.0, + "step": 51401 + }, + { + "epoch": 4.796304936082859, + "grad_norm": NaN, + "learning_rate": 3.0631513505135656e-05, + "loss": 0.0, + "step": 51402 + }, + { + "epoch": 4.796398245777736, + "grad_norm": NaN, + "learning_rate": 3.0626933234757065e-05, + "loss": 0.0, + "step": 51403 + }, + { + "epoch": 4.796491555472613, + "grad_norm": NaN, + "learning_rate": 3.062235326790883e-05, + "loss": 0.0, + "step": 51404 + }, + { + "epoch": 4.796584865167491, + "grad_norm": NaN, + "learning_rate": 3.061777360460258e-05, + "loss": 0.0, + "step": 51405 + }, + { + "epoch": 4.796678174862368, + "grad_norm": NaN, + "learning_rate": 3.061319424484996e-05, + "loss": 0.0, + "step": 51406 + }, + { + "epoch": 4.796771484557246, + "grad_norm": NaN, + "learning_rate": 3.060861518866264e-05, + "loss": 0.0, + "step": 51407 + }, + { + "epoch": 4.796864794252123, + "grad_norm": NaN, + "learning_rate": 3.060403643605223e-05, + "loss": 0.0, + "step": 51408 + }, + { + "epoch": 4.796958103947, + "grad_norm": NaN, + "learning_rate": 3.0599457987030403e-05, + "loss": 0.0, + "step": 51409 + }, + { + "epoch": 4.797051413641878, + "grad_norm": NaN, + "learning_rate": 3.059487984160877e-05, + "loss": 0.0, + "step": 51410 + }, + { + "epoch": 4.797144723336754, + "grad_norm": NaN, + "learning_rate": 3.059030199979899e-05, + "loss": 0.0, + "step": 51411 + }, + { + "epoch": 4.797238033031632, + "grad_norm": NaN, + "learning_rate": 3.058572446161271e-05, + "loss": 0.0, + "step": 51412 + }, + { + "epoch": 4.797331342726509, + "grad_norm": NaN, + "learning_rate": 3.058114722706154e-05, + "loss": 0.0, + "step": 51413 + }, + { + "epoch": 4.797424652421387, + "grad_norm": NaN, + "learning_rate": 3.057657029615715e-05, + "loss": 0.0, + "step": 51414 + }, + { + "epoch": 4.797517962116264, + "grad_norm": NaN, + "learning_rate": 3.0571993668911174e-05, + "loss": 0.0, + "step": 51415 + }, + { + "epoch": 4.797611271811141, + "grad_norm": NaN, + "learning_rate": 3.056741734533523e-05, + "loss": 0.0, + "step": 51416 + }, + { + "epoch": 4.797704581506019, + "grad_norm": NaN, + "learning_rate": 3.056284132544097e-05, + "loss": 0.0, + "step": 51417 + }, + { + "epoch": 4.797797891200895, + "grad_norm": NaN, + "learning_rate": 3.055826560924002e-05, + "loss": 0.0, + "step": 51418 + }, + { + "epoch": 4.797891200895773, + "grad_norm": NaN, + "learning_rate": 3.055369019674403e-05, + "loss": 0.0, + "step": 51419 + }, + { + "epoch": 4.79798451059065, + "grad_norm": NaN, + "learning_rate": 3.054911508796461e-05, + "loss": 0.0, + "step": 51420 + }, + { + "epoch": 4.798077820285528, + "grad_norm": NaN, + "learning_rate": 3.054454028291342e-05, + "loss": 0.0, + "step": 51421 + }, + { + "epoch": 4.798171129980405, + "grad_norm": NaN, + "learning_rate": 3.0539965781602064e-05, + "loss": 0.0, + "step": 51422 + }, + { + "epoch": 4.7982644396752825, + "grad_norm": NaN, + "learning_rate": 3.053539158404219e-05, + "loss": 0.0, + "step": 51423 + }, + { + "epoch": 4.79835774937016, + "grad_norm": NaN, + "learning_rate": 3.053081769024543e-05, + "loss": 0.0, + "step": 51424 + }, + { + "epoch": 4.798451059065037, + "grad_norm": NaN, + "learning_rate": 3.0526244100223425e-05, + "loss": 0.0, + "step": 51425 + }, + { + "epoch": 4.798544368759914, + "grad_norm": NaN, + "learning_rate": 3.052167081398777e-05, + "loss": 0.0, + "step": 51426 + }, + { + "epoch": 4.798637678454791, + "grad_norm": NaN, + "learning_rate": 3.0517097831550125e-05, + "loss": 0.0, + "step": 51427 + }, + { + "epoch": 4.798730988149669, + "grad_norm": NaN, + "learning_rate": 3.0512525152922096e-05, + "loss": 0.0, + "step": 51428 + }, + { + "epoch": 4.798824297844546, + "grad_norm": NaN, + "learning_rate": 3.0507952778115337e-05, + "loss": 0.0, + "step": 51429 + }, + { + "epoch": 4.7989176075394235, + "grad_norm": NaN, + "learning_rate": 3.050338070714145e-05, + "loss": 0.0, + "step": 51430 + }, + { + "epoch": 4.799010917234301, + "grad_norm": NaN, + "learning_rate": 3.0498808940012064e-05, + "loss": 0.0, + "step": 51431 + }, + { + "epoch": 4.7991042269291775, + "grad_norm": NaN, + "learning_rate": 3.049423747673882e-05, + "loss": 0.0, + "step": 51432 + }, + { + "epoch": 4.799197536624055, + "grad_norm": NaN, + "learning_rate": 3.0489666317333322e-05, + "loss": 0.0, + "step": 51433 + }, + { + "epoch": 4.799290846318932, + "grad_norm": NaN, + "learning_rate": 3.0485095461807198e-05, + "loss": 0.0, + "step": 51434 + }, + { + "epoch": 4.79938415601381, + "grad_norm": NaN, + "learning_rate": 3.0480524910172084e-05, + "loss": 0.0, + "step": 51435 + }, + { + "epoch": 4.799477465708687, + "grad_norm": NaN, + "learning_rate": 3.0475954662439574e-05, + "loss": 0.0, + "step": 51436 + }, + { + "epoch": 4.799570775403565, + "grad_norm": NaN, + "learning_rate": 3.0471384718621323e-05, + "loss": 0.0, + "step": 51437 + }, + { + "epoch": 4.799664085098442, + "grad_norm": NaN, + "learning_rate": 3.046681507872889e-05, + "loss": 0.0, + "step": 51438 + }, + { + "epoch": 4.799757394793319, + "grad_norm": NaN, + "learning_rate": 3.0462245742774e-05, + "loss": 0.0, + "step": 51439 + }, + { + "epoch": 4.799850704488196, + "grad_norm": NaN, + "learning_rate": 3.045767671076819e-05, + "loss": 0.0, + "step": 51440 + }, + { + "epoch": 4.799944014183073, + "grad_norm": NaN, + "learning_rate": 3.045310798272306e-05, + "loss": 0.0, + "step": 51441 + }, + { + "epoch": 4.800037323877951, + "grad_norm": NaN, + "learning_rate": 3.044853955865032e-05, + "loss": 0.0, + "step": 51442 + }, + { + "epoch": 4.800130633572828, + "grad_norm": NaN, + "learning_rate": 3.0443971438561503e-05, + "loss": 0.0, + "step": 51443 + }, + { + "epoch": 4.800223943267706, + "grad_norm": NaN, + "learning_rate": 3.0439403622468267e-05, + "loss": 0.0, + "step": 51444 + }, + { + "epoch": 4.800317252962583, + "grad_norm": NaN, + "learning_rate": 3.0434836110382183e-05, + "loss": 0.0, + "step": 51445 + }, + { + "epoch": 4.8004105626574605, + "grad_norm": NaN, + "learning_rate": 3.043026890231491e-05, + "loss": 0.0, + "step": 51446 + }, + { + "epoch": 4.800503872352337, + "grad_norm": NaN, + "learning_rate": 3.042570199827805e-05, + "loss": 0.0, + "step": 51447 + }, + { + "epoch": 4.800597182047214, + "grad_norm": NaN, + "learning_rate": 3.0421135398283204e-05, + "loss": 0.0, + "step": 51448 + }, + { + "epoch": 4.800690491742092, + "grad_norm": NaN, + "learning_rate": 3.041656910234197e-05, + "loss": 0.0, + "step": 51449 + }, + { + "epoch": 4.800783801436969, + "grad_norm": NaN, + "learning_rate": 3.0412003110465993e-05, + "loss": 0.0, + "step": 51450 + }, + { + "epoch": 4.800877111131847, + "grad_norm": NaN, + "learning_rate": 3.040743742266688e-05, + "loss": 0.0, + "step": 51451 + }, + { + "epoch": 4.800970420826724, + "grad_norm": NaN, + "learning_rate": 3.0402872038956204e-05, + "loss": 0.0, + "step": 51452 + }, + { + "epoch": 4.8010637305216015, + "grad_norm": NaN, + "learning_rate": 3.039830695934561e-05, + "loss": 0.0, + "step": 51453 + }, + { + "epoch": 4.801157040216479, + "grad_norm": NaN, + "learning_rate": 3.0393742183846697e-05, + "loss": 0.0, + "step": 51454 + }, + { + "epoch": 4.8012503499113555, + "grad_norm": NaN, + "learning_rate": 3.0389177712471065e-05, + "loss": 0.0, + "step": 51455 + }, + { + "epoch": 4.801343659606233, + "grad_norm": NaN, + "learning_rate": 3.0384613545230313e-05, + "loss": 0.0, + "step": 51456 + }, + { + "epoch": 4.80143696930111, + "grad_norm": NaN, + "learning_rate": 3.0380049682136054e-05, + "loss": 0.0, + "step": 51457 + }, + { + "epoch": 4.801530278995988, + "grad_norm": NaN, + "learning_rate": 3.037548612319991e-05, + "loss": 0.0, + "step": 51458 + }, + { + "epoch": 4.801623588690865, + "grad_norm": NaN, + "learning_rate": 3.0370922868433456e-05, + "loss": 0.0, + "step": 51459 + }, + { + "epoch": 4.801716898385743, + "grad_norm": NaN, + "learning_rate": 3.0366359917848316e-05, + "loss": 0.0, + "step": 51460 + }, + { + "epoch": 4.801810208080619, + "grad_norm": NaN, + "learning_rate": 3.0361797271456095e-05, + "loss": 0.0, + "step": 51461 + }, + { + "epoch": 4.8019035177754965, + "grad_norm": NaN, + "learning_rate": 3.0357234929268372e-05, + "loss": 0.0, + "step": 51462 + }, + { + "epoch": 4.801996827470374, + "grad_norm": NaN, + "learning_rate": 3.0352672891296746e-05, + "loss": 0.0, + "step": 51463 + }, + { + "epoch": 4.802090137165251, + "grad_norm": NaN, + "learning_rate": 3.034811115755285e-05, + "loss": 0.0, + "step": 51464 + }, + { + "epoch": 4.802183446860129, + "grad_norm": NaN, + "learning_rate": 3.0343549728048256e-05, + "loss": 0.0, + "step": 51465 + }, + { + "epoch": 4.802276756555006, + "grad_norm": NaN, + "learning_rate": 3.0338988602794584e-05, + "loss": 0.0, + "step": 51466 + }, + { + "epoch": 4.802370066249884, + "grad_norm": NaN, + "learning_rate": 3.0334427781803393e-05, + "loss": 0.0, + "step": 51467 + }, + { + "epoch": 4.802463375944761, + "grad_norm": NaN, + "learning_rate": 3.032986726508633e-05, + "loss": 0.0, + "step": 51468 + }, + { + "epoch": 4.8025566856396384, + "grad_norm": NaN, + "learning_rate": 3.0325307052654958e-05, + "loss": 0.0, + "step": 51469 + }, + { + "epoch": 4.802649995334515, + "grad_norm": NaN, + "learning_rate": 3.0320747144520874e-05, + "loss": 0.0, + "step": 51470 + }, + { + "epoch": 4.802743305029392, + "grad_norm": NaN, + "learning_rate": 3.031618754069567e-05, + "loss": 0.0, + "step": 51471 + }, + { + "epoch": 4.80283661472427, + "grad_norm": NaN, + "learning_rate": 3.0311628241190968e-05, + "loss": 0.0, + "step": 51472 + }, + { + "epoch": 4.802929924419147, + "grad_norm": NaN, + "learning_rate": 3.0307069246018323e-05, + "loss": 0.0, + "step": 51473 + }, + { + "epoch": 4.803023234114025, + "grad_norm": NaN, + "learning_rate": 3.0302510555189362e-05, + "loss": 0.0, + "step": 51474 + }, + { + "epoch": 4.803116543808902, + "grad_norm": NaN, + "learning_rate": 3.0297952168715643e-05, + "loss": 0.0, + "step": 51475 + }, + { + "epoch": 4.803209853503779, + "grad_norm": NaN, + "learning_rate": 3.029339408660874e-05, + "loss": 0.0, + "step": 51476 + }, + { + "epoch": 4.803303163198656, + "grad_norm": NaN, + "learning_rate": 3.0288836308880358e-05, + "loss": 0.0, + "step": 51477 + }, + { + "epoch": 4.803396472893533, + "grad_norm": NaN, + "learning_rate": 3.0284278835541977e-05, + "loss": 0.0, + "step": 51478 + }, + { + "epoch": 4.803489782588411, + "grad_norm": NaN, + "learning_rate": 3.0279721666605146e-05, + "loss": 0.0, + "step": 51479 + }, + { + "epoch": 4.803583092283288, + "grad_norm": NaN, + "learning_rate": 3.0275164802081626e-05, + "loss": 0.0, + "step": 51480 + }, + { + "epoch": 4.803676401978166, + "grad_norm": NaN, + "learning_rate": 3.0270608241982842e-05, + "loss": 0.0, + "step": 51481 + }, + { + "epoch": 4.803769711673043, + "grad_norm": NaN, + "learning_rate": 3.0266051986320384e-05, + "loss": 0.0, + "step": 51482 + }, + { + "epoch": 4.8038630213679205, + "grad_norm": NaN, + "learning_rate": 3.0261496035105993e-05, + "loss": 0.0, + "step": 51483 + }, + { + "epoch": 4.803956331062797, + "grad_norm": NaN, + "learning_rate": 3.0256940388351092e-05, + "loss": 0.0, + "step": 51484 + }, + { + "epoch": 4.8040496407576745, + "grad_norm": NaN, + "learning_rate": 3.0252385046067273e-05, + "loss": 0.0, + "step": 51485 + }, + { + "epoch": 4.804142950452552, + "grad_norm": NaN, + "learning_rate": 3.024783000826626e-05, + "loss": 0.0, + "step": 51486 + }, + { + "epoch": 4.804236260147429, + "grad_norm": NaN, + "learning_rate": 3.024327527495951e-05, + "loss": 0.0, + "step": 51487 + }, + { + "epoch": 4.804329569842307, + "grad_norm": NaN, + "learning_rate": 3.023872084615863e-05, + "loss": 0.0, + "step": 51488 + }, + { + "epoch": 4.804422879537184, + "grad_norm": NaN, + "learning_rate": 3.0234166721875196e-05, + "loss": 0.0, + "step": 51489 + }, + { + "epoch": 4.804516189232062, + "grad_norm": NaN, + "learning_rate": 3.0229612902120815e-05, + "loss": 0.0, + "step": 51490 + }, + { + "epoch": 4.804609498926938, + "grad_norm": NaN, + "learning_rate": 3.022505938690704e-05, + "loss": 0.0, + "step": 51491 + }, + { + "epoch": 4.8047028086218155, + "grad_norm": NaN, + "learning_rate": 3.022050617624545e-05, + "loss": 0.0, + "step": 51492 + }, + { + "epoch": 4.804796118316693, + "grad_norm": NaN, + "learning_rate": 3.021595327014765e-05, + "loss": 0.0, + "step": 51493 + }, + { + "epoch": 4.80488942801157, + "grad_norm": NaN, + "learning_rate": 3.0211400668625186e-05, + "loss": 0.0, + "step": 51494 + }, + { + "epoch": 4.804982737706448, + "grad_norm": NaN, + "learning_rate": 3.0206848371689656e-05, + "loss": 0.0, + "step": 51495 + }, + { + "epoch": 4.805076047401325, + "grad_norm": NaN, + "learning_rate": 3.020229637935261e-05, + "loss": 0.0, + "step": 51496 + }, + { + "epoch": 4.805169357096203, + "grad_norm": NaN, + "learning_rate": 3.019774469162565e-05, + "loss": 0.0, + "step": 51497 + }, + { + "epoch": 4.80526266679108, + "grad_norm": NaN, + "learning_rate": 3.019319330852033e-05, + "loss": 0.0, + "step": 51498 + }, + { + "epoch": 4.805355976485957, + "grad_norm": NaN, + "learning_rate": 3.018864223004825e-05, + "loss": 0.0, + "step": 51499 + }, + { + "epoch": 4.805449286180834, + "grad_norm": NaN, + "learning_rate": 3.0184091456220944e-05, + "loss": 0.0, + "step": 51500 + }, + { + "epoch": 4.805542595875711, + "grad_norm": NaN, + "learning_rate": 3.0179540987050005e-05, + "loss": 0.0, + "step": 51501 + }, + { + "epoch": 4.805635905570589, + "grad_norm": NaN, + "learning_rate": 3.017499082254699e-05, + "loss": 0.0, + "step": 51502 + }, + { + "epoch": 4.805729215265466, + "grad_norm": NaN, + "learning_rate": 3.017044096272349e-05, + "loss": 0.0, + "step": 51503 + }, + { + "epoch": 4.805822524960344, + "grad_norm": NaN, + "learning_rate": 3.0165891407591066e-05, + "loss": 0.0, + "step": 51504 + }, + { + "epoch": 4.80591583465522, + "grad_norm": NaN, + "learning_rate": 3.0161342157161268e-05, + "loss": 0.0, + "step": 51505 + }, + { + "epoch": 4.806009144350098, + "grad_norm": NaN, + "learning_rate": 3.015679321144569e-05, + "loss": 0.0, + "step": 51506 + }, + { + "epoch": 4.806102454044975, + "grad_norm": NaN, + "learning_rate": 3.0152244570455876e-05, + "loss": 0.0, + "step": 51507 + }, + { + "epoch": 4.8061957637398525, + "grad_norm": NaN, + "learning_rate": 3.0147696234203412e-05, + "loss": 0.0, + "step": 51508 + }, + { + "epoch": 4.80628907343473, + "grad_norm": NaN, + "learning_rate": 3.0143148202699857e-05, + "loss": 0.0, + "step": 51509 + }, + { + "epoch": 4.806382383129607, + "grad_norm": NaN, + "learning_rate": 3.013860047595677e-05, + "loss": 0.0, + "step": 51510 + }, + { + "epoch": 4.806475692824485, + "grad_norm": NaN, + "learning_rate": 3.0134053053985707e-05, + "loss": 0.0, + "step": 51511 + }, + { + "epoch": 4.806569002519362, + "grad_norm": NaN, + "learning_rate": 3.0129505936798243e-05, + "loss": 0.0, + "step": 51512 + }, + { + "epoch": 4.806662312214239, + "grad_norm": NaN, + "learning_rate": 3.0124959124405933e-05, + "loss": 0.0, + "step": 51513 + }, + { + "epoch": 4.806755621909116, + "grad_norm": NaN, + "learning_rate": 3.0120412616820322e-05, + "loss": 0.0, + "step": 51514 + }, + { + "epoch": 4.8068489316039935, + "grad_norm": NaN, + "learning_rate": 3.0115866414053047e-05, + "loss": 0.0, + "step": 51515 + }, + { + "epoch": 4.806942241298871, + "grad_norm": NaN, + "learning_rate": 3.0111320516115585e-05, + "loss": 0.0, + "step": 51516 + }, + { + "epoch": 4.807035550993748, + "grad_norm": NaN, + "learning_rate": 3.010677492301949e-05, + "loss": 0.0, + "step": 51517 + }, + { + "epoch": 4.807128860688626, + "grad_norm": NaN, + "learning_rate": 3.0102229634776405e-05, + "loss": 0.0, + "step": 51518 + }, + { + "epoch": 4.807222170383503, + "grad_norm": NaN, + "learning_rate": 3.0097684651397818e-05, + "loss": 0.0, + "step": 51519 + }, + { + "epoch": 4.80731548007838, + "grad_norm": NaN, + "learning_rate": 3.009313997289524e-05, + "loss": 0.0, + "step": 51520 + }, + { + "epoch": 4.807408789773257, + "grad_norm": NaN, + "learning_rate": 3.0088595599280396e-05, + "loss": 0.0, + "step": 51521 + }, + { + "epoch": 4.807502099468135, + "grad_norm": NaN, + "learning_rate": 3.0084051530564672e-05, + "loss": 0.0, + "step": 51522 + }, + { + "epoch": 4.807595409163012, + "grad_norm": NaN, + "learning_rate": 3.0079507766759646e-05, + "loss": 0.0, + "step": 51523 + }, + { + "epoch": 4.807688718857889, + "grad_norm": NaN, + "learning_rate": 3.0074964307876992e-05, + "loss": 0.0, + "step": 51524 + }, + { + "epoch": 4.807782028552767, + "grad_norm": NaN, + "learning_rate": 3.0070421153928133e-05, + "loss": 0.0, + "step": 51525 + }, + { + "epoch": 4.807875338247644, + "grad_norm": NaN, + "learning_rate": 3.006587830492464e-05, + "loss": 0.0, + "step": 51526 + }, + { + "epoch": 4.807968647942522, + "grad_norm": NaN, + "learning_rate": 3.006133576087816e-05, + "loss": 0.0, + "step": 51527 + }, + { + "epoch": 4.808061957637398, + "grad_norm": NaN, + "learning_rate": 3.0056793521800143e-05, + "loss": 0.0, + "step": 51528 + }, + { + "epoch": 4.808155267332276, + "grad_norm": NaN, + "learning_rate": 3.005225158770212e-05, + "loss": 0.0, + "step": 51529 + }, + { + "epoch": 4.808248577027153, + "grad_norm": NaN, + "learning_rate": 3.0047709958595777e-05, + "loss": 0.0, + "step": 51530 + }, + { + "epoch": 4.8083418867220304, + "grad_norm": NaN, + "learning_rate": 3.0043168634492526e-05, + "loss": 0.0, + "step": 51531 + }, + { + "epoch": 4.808435196416908, + "grad_norm": NaN, + "learning_rate": 3.0038627615403986e-05, + "loss": 0.0, + "step": 51532 + }, + { + "epoch": 4.808528506111785, + "grad_norm": NaN, + "learning_rate": 3.0034086901341653e-05, + "loss": 0.0, + "step": 51533 + }, + { + "epoch": 4.808621815806662, + "grad_norm": NaN, + "learning_rate": 3.0029546492317115e-05, + "loss": 0.0, + "step": 51534 + }, + { + "epoch": 4.808715125501539, + "grad_norm": NaN, + "learning_rate": 3.002500638834191e-05, + "loss": 0.0, + "step": 51535 + }, + { + "epoch": 4.808808435196417, + "grad_norm": NaN, + "learning_rate": 3.0020466589427567e-05, + "loss": 0.0, + "step": 51536 + }, + { + "epoch": 4.808901744891294, + "grad_norm": NaN, + "learning_rate": 3.0015927095585624e-05, + "loss": 0.0, + "step": 51537 + }, + { + "epoch": 4.8089950545861715, + "grad_norm": NaN, + "learning_rate": 3.0011387906827656e-05, + "loss": 0.0, + "step": 51538 + }, + { + "epoch": 4.809088364281049, + "grad_norm": NaN, + "learning_rate": 3.0006849023165168e-05, + "loss": 0.0, + "step": 51539 + }, + { + "epoch": 4.809181673975926, + "grad_norm": NaN, + "learning_rate": 3.000231044460974e-05, + "loss": 0.0, + "step": 51540 + }, + { + "epoch": 4.809274983670804, + "grad_norm": NaN, + "learning_rate": 2.9997772171172873e-05, + "loss": 0.0, + "step": 51541 + }, + { + "epoch": 4.809368293365681, + "grad_norm": NaN, + "learning_rate": 2.9993234202866128e-05, + "loss": 0.0, + "step": 51542 + }, + { + "epoch": 4.809461603060558, + "grad_norm": NaN, + "learning_rate": 2.9988696539701044e-05, + "loss": 0.0, + "step": 51543 + }, + { + "epoch": 4.809554912755435, + "grad_norm": NaN, + "learning_rate": 2.9984159181689148e-05, + "loss": 0.0, + "step": 51544 + }, + { + "epoch": 4.8096482224503125, + "grad_norm": NaN, + "learning_rate": 2.9979622128841996e-05, + "loss": 0.0, + "step": 51545 + }, + { + "epoch": 4.80974153214519, + "grad_norm": NaN, + "learning_rate": 2.9975085381171094e-05, + "loss": 0.0, + "step": 51546 + }, + { + "epoch": 4.809834841840067, + "grad_norm": NaN, + "learning_rate": 2.9970548938688016e-05, + "loss": 0.0, + "step": 51547 + }, + { + "epoch": 4.809928151534945, + "grad_norm": NaN, + "learning_rate": 2.9966012801404255e-05, + "loss": 0.0, + "step": 51548 + }, + { + "epoch": 4.810021461229821, + "grad_norm": NaN, + "learning_rate": 2.996147696933138e-05, + "loss": 0.0, + "step": 51549 + }, + { + "epoch": 4.810114770924699, + "grad_norm": NaN, + "learning_rate": 2.9956941442480908e-05, + "loss": 0.0, + "step": 51550 + }, + { + "epoch": 4.810208080619576, + "grad_norm": NaN, + "learning_rate": 2.995240622086437e-05, + "loss": 0.0, + "step": 51551 + }, + { + "epoch": 4.810301390314454, + "grad_norm": NaN, + "learning_rate": 2.9947871304493283e-05, + "loss": 0.0, + "step": 51552 + }, + { + "epoch": 4.810394700009331, + "grad_norm": NaN, + "learning_rate": 2.994333669337926e-05, + "loss": 0.0, + "step": 51553 + }, + { + "epoch": 4.810488009704208, + "grad_norm": NaN, + "learning_rate": 2.9938802387533732e-05, + "loss": 0.0, + "step": 51554 + }, + { + "epoch": 4.810581319399086, + "grad_norm": NaN, + "learning_rate": 2.9934268386968223e-05, + "loss": 0.0, + "step": 51555 + }, + { + "epoch": 4.810674629093963, + "grad_norm": NaN, + "learning_rate": 2.9929734691694374e-05, + "loss": 0.0, + "step": 51556 + }, + { + "epoch": 4.81076793878884, + "grad_norm": NaN, + "learning_rate": 2.992520130172362e-05, + "loss": 0.0, + "step": 51557 + }, + { + "epoch": 4.810861248483717, + "grad_norm": NaN, + "learning_rate": 2.9920668217067462e-05, + "loss": 0.0, + "step": 51558 + }, + { + "epoch": 4.810954558178595, + "grad_norm": NaN, + "learning_rate": 2.991613543773755e-05, + "loss": 0.0, + "step": 51559 + }, + { + "epoch": 4.811047867873472, + "grad_norm": NaN, + "learning_rate": 2.9911602963745308e-05, + "loss": 0.0, + "step": 51560 + }, + { + "epoch": 4.8111411775683495, + "grad_norm": NaN, + "learning_rate": 2.990707079510223e-05, + "loss": 0.0, + "step": 51561 + }, + { + "epoch": 4.811234487263227, + "grad_norm": NaN, + "learning_rate": 2.9902538931819987e-05, + "loss": 0.0, + "step": 51562 + }, + { + "epoch": 4.811327796958104, + "grad_norm": NaN, + "learning_rate": 2.9898007373909972e-05, + "loss": 0.0, + "step": 51563 + }, + { + "epoch": 4.811421106652981, + "grad_norm": NaN, + "learning_rate": 2.989347612138371e-05, + "loss": 0.0, + "step": 51564 + }, + { + "epoch": 4.811514416347858, + "grad_norm": NaN, + "learning_rate": 2.988894517425282e-05, + "loss": 0.0, + "step": 51565 + }, + { + "epoch": 4.811607726042736, + "grad_norm": NaN, + "learning_rate": 2.988441453252875e-05, + "loss": 0.0, + "step": 51566 + }, + { + "epoch": 4.811701035737613, + "grad_norm": NaN, + "learning_rate": 2.987988419622297e-05, + "loss": 0.0, + "step": 51567 + }, + { + "epoch": 4.8117943454324905, + "grad_norm": NaN, + "learning_rate": 2.9875354165347153e-05, + "loss": 0.0, + "step": 51568 + }, + { + "epoch": 4.811887655127368, + "grad_norm": NaN, + "learning_rate": 2.9870824439912695e-05, + "loss": 0.0, + "step": 51569 + }, + { + "epoch": 4.811980964822245, + "grad_norm": NaN, + "learning_rate": 2.9866295019931097e-05, + "loss": 0.0, + "step": 51570 + }, + { + "epoch": 4.812074274517123, + "grad_norm": NaN, + "learning_rate": 2.9861765905414002e-05, + "loss": 0.0, + "step": 51571 + }, + { + "epoch": 4.812167584211999, + "grad_norm": NaN, + "learning_rate": 2.985723709637282e-05, + "loss": 0.0, + "step": 51572 + }, + { + "epoch": 4.812260893906877, + "grad_norm": NaN, + "learning_rate": 2.9852708592819042e-05, + "loss": 0.0, + "step": 51573 + }, + { + "epoch": 4.812354203601754, + "grad_norm": NaN, + "learning_rate": 2.984818039476432e-05, + "loss": 0.0, + "step": 51574 + }, + { + "epoch": 4.812447513296632, + "grad_norm": NaN, + "learning_rate": 2.984365250222005e-05, + "loss": 0.0, + "step": 51575 + }, + { + "epoch": 4.812540822991509, + "grad_norm": NaN, + "learning_rate": 2.9839124915197737e-05, + "loss": 0.0, + "step": 51576 + }, + { + "epoch": 4.812634132686386, + "grad_norm": NaN, + "learning_rate": 2.9834597633709006e-05, + "loss": 0.0, + "step": 51577 + }, + { + "epoch": 4.812727442381263, + "grad_norm": NaN, + "learning_rate": 2.9830070657765264e-05, + "loss": 0.0, + "step": 51578 + }, + { + "epoch": 4.81282075207614, + "grad_norm": NaN, + "learning_rate": 2.9825543987378052e-05, + "loss": 0.0, + "step": 51579 + }, + { + "epoch": 4.812914061771018, + "grad_norm": NaN, + "learning_rate": 2.982101762255888e-05, + "loss": 0.0, + "step": 51580 + }, + { + "epoch": 4.813007371465895, + "grad_norm": NaN, + "learning_rate": 2.9816491563319265e-05, + "loss": 0.0, + "step": 51581 + }, + { + "epoch": 4.813100681160773, + "grad_norm": NaN, + "learning_rate": 2.9811965809670723e-05, + "loss": 0.0, + "step": 51582 + }, + { + "epoch": 4.81319399085565, + "grad_norm": NaN, + "learning_rate": 2.980744036162474e-05, + "loss": 0.0, + "step": 51583 + }, + { + "epoch": 4.8132873005505274, + "grad_norm": NaN, + "learning_rate": 2.9802915219192824e-05, + "loss": 0.0, + "step": 51584 + }, + { + "epoch": 4.813380610245405, + "grad_norm": NaN, + "learning_rate": 2.97983903823865e-05, + "loss": 0.0, + "step": 51585 + }, + { + "epoch": 4.813473919940282, + "grad_norm": NaN, + "learning_rate": 2.9793865851217257e-05, + "loss": 0.0, + "step": 51586 + }, + { + "epoch": 4.813567229635159, + "grad_norm": NaN, + "learning_rate": 2.9789341625696606e-05, + "loss": 0.0, + "step": 51587 + }, + { + "epoch": 4.813660539330036, + "grad_norm": NaN, + "learning_rate": 2.978481770583605e-05, + "loss": 0.0, + "step": 51588 + }, + { + "epoch": 4.813753849024914, + "grad_norm": NaN, + "learning_rate": 2.97802940916471e-05, + "loss": 0.0, + "step": 51589 + }, + { + "epoch": 4.813847158719791, + "grad_norm": NaN, + "learning_rate": 2.9775770783141213e-05, + "loss": 0.0, + "step": 51590 + }, + { + "epoch": 4.8139404684146685, + "grad_norm": NaN, + "learning_rate": 2.9771247780329993e-05, + "loss": 0.0, + "step": 51591 + }, + { + "epoch": 4.814033778109546, + "grad_norm": NaN, + "learning_rate": 2.9766725083224835e-05, + "loss": 0.0, + "step": 51592 + }, + { + "epoch": 4.8141270878044224, + "grad_norm": NaN, + "learning_rate": 2.976220269183726e-05, + "loss": 0.0, + "step": 51593 + }, + { + "epoch": 4.8142203974993, + "grad_norm": NaN, + "learning_rate": 2.9757680606178847e-05, + "loss": 0.0, + "step": 51594 + }, + { + "epoch": 4.814313707194177, + "grad_norm": NaN, + "learning_rate": 2.9753158826261e-05, + "loss": 0.0, + "step": 51595 + }, + { + "epoch": 4.814407016889055, + "grad_norm": NaN, + "learning_rate": 2.9748637352095207e-05, + "loss": 0.0, + "step": 51596 + }, + { + "epoch": 4.814500326583932, + "grad_norm": NaN, + "learning_rate": 2.9744116183693097e-05, + "loss": 0.0, + "step": 51597 + }, + { + "epoch": 4.8145936362788095, + "grad_norm": NaN, + "learning_rate": 2.9739595321066038e-05, + "loss": 0.0, + "step": 51598 + }, + { + "epoch": 4.814686945973687, + "grad_norm": NaN, + "learning_rate": 2.9735074764225514e-05, + "loss": 0.0, + "step": 51599 + }, + { + "epoch": 4.814780255668564, + "grad_norm": NaN, + "learning_rate": 2.9730554513183158e-05, + "loss": 0.0, + "step": 51600 + }, + { + "epoch": 4.814873565363441, + "grad_norm": NaN, + "learning_rate": 2.972603456795033e-05, + "loss": 0.0, + "step": 51601 + }, + { + "epoch": 4.814966875058318, + "grad_norm": NaN, + "learning_rate": 2.972151492853852e-05, + "loss": 0.0, + "step": 51602 + }, + { + "epoch": 4.815060184753196, + "grad_norm": NaN, + "learning_rate": 2.971699559495937e-05, + "loss": 0.0, + "step": 51603 + }, + { + "epoch": 4.815153494448073, + "grad_norm": NaN, + "learning_rate": 2.971247656722422e-05, + "loss": 0.0, + "step": 51604 + }, + { + "epoch": 4.815246804142951, + "grad_norm": NaN, + "learning_rate": 2.970795784534456e-05, + "loss": 0.0, + "step": 51605 + }, + { + "epoch": 4.815340113837828, + "grad_norm": NaN, + "learning_rate": 2.9703439429332016e-05, + "loss": 0.0, + "step": 51606 + }, + { + "epoch": 4.815433423532705, + "grad_norm": NaN, + "learning_rate": 2.969892131919796e-05, + "loss": 0.0, + "step": 51607 + }, + { + "epoch": 4.815526733227582, + "grad_norm": NaN, + "learning_rate": 2.9694403514953853e-05, + "loss": 0.0, + "step": 51608 + }, + { + "epoch": 4.815620042922459, + "grad_norm": NaN, + "learning_rate": 2.968988601661133e-05, + "loss": 0.0, + "step": 51609 + }, + { + "epoch": 4.815713352617337, + "grad_norm": NaN, + "learning_rate": 2.9685368824181755e-05, + "loss": 0.0, + "step": 51610 + }, + { + "epoch": 4.815806662312214, + "grad_norm": NaN, + "learning_rate": 2.9680851937676614e-05, + "loss": 0.0, + "step": 51611 + }, + { + "epoch": 4.815899972007092, + "grad_norm": NaN, + "learning_rate": 2.9676335357107483e-05, + "loss": 0.0, + "step": 51612 + }, + { + "epoch": 4.815993281701969, + "grad_norm": NaN, + "learning_rate": 2.967181908248577e-05, + "loss": 0.0, + "step": 51613 + }, + { + "epoch": 4.8160865913968465, + "grad_norm": NaN, + "learning_rate": 2.9667303113822917e-05, + "loss": 0.0, + "step": 51614 + }, + { + "epoch": 4.816179901091724, + "grad_norm": NaN, + "learning_rate": 2.966278745113056e-05, + "loss": 0.0, + "step": 51615 + }, + { + "epoch": 4.8162732107866, + "grad_norm": NaN, + "learning_rate": 2.9658272094420044e-05, + "loss": 0.0, + "step": 51616 + }, + { + "epoch": 4.816366520481478, + "grad_norm": NaN, + "learning_rate": 2.9653757043702858e-05, + "loss": 0.0, + "step": 51617 + }, + { + "epoch": 4.816459830176355, + "grad_norm": NaN, + "learning_rate": 2.9649242298990595e-05, + "loss": 0.0, + "step": 51618 + }, + { + "epoch": 4.816553139871233, + "grad_norm": NaN, + "learning_rate": 2.9644727860294625e-05, + "loss": 0.0, + "step": 51619 + }, + { + "epoch": 4.81664644956611, + "grad_norm": NaN, + "learning_rate": 2.964021372762641e-05, + "loss": 0.0, + "step": 51620 + }, + { + "epoch": 4.8167397592609875, + "grad_norm": NaN, + "learning_rate": 2.9635699900997557e-05, + "loss": 0.0, + "step": 51621 + }, + { + "epoch": 4.816833068955864, + "grad_norm": NaN, + "learning_rate": 2.9631186380419437e-05, + "loss": 0.0, + "step": 51622 + }, + { + "epoch": 4.8169263786507415, + "grad_norm": NaN, + "learning_rate": 2.962667316590354e-05, + "loss": 0.0, + "step": 51623 + }, + { + "epoch": 4.817019688345619, + "grad_norm": NaN, + "learning_rate": 2.9622160257461363e-05, + "loss": 0.0, + "step": 51624 + }, + { + "epoch": 4.817112998040496, + "grad_norm": NaN, + "learning_rate": 2.9617647655104342e-05, + "loss": 0.0, + "step": 51625 + }, + { + "epoch": 4.817206307735374, + "grad_norm": NaN, + "learning_rate": 2.9613135358844053e-05, + "loss": 0.0, + "step": 51626 + }, + { + "epoch": 4.817299617430251, + "grad_norm": NaN, + "learning_rate": 2.960862336869187e-05, + "loss": 0.0, + "step": 51627 + }, + { + "epoch": 4.817392927125129, + "grad_norm": NaN, + "learning_rate": 2.960411168465925e-05, + "loss": 0.0, + "step": 51628 + }, + { + "epoch": 4.817486236820006, + "grad_norm": NaN, + "learning_rate": 2.959960030675778e-05, + "loss": 0.0, + "step": 51629 + }, + { + "epoch": 4.8175795465148825, + "grad_norm": NaN, + "learning_rate": 2.9595089234998826e-05, + "loss": 0.0, + "step": 51630 + }, + { + "epoch": 4.81767285620976, + "grad_norm": NaN, + "learning_rate": 2.9590578469393856e-05, + "loss": 0.0, + "step": 51631 + }, + { + "epoch": 4.817766165904637, + "grad_norm": NaN, + "learning_rate": 2.958606800995446e-05, + "loss": 0.0, + "step": 51632 + }, + { + "epoch": 4.817859475599515, + "grad_norm": NaN, + "learning_rate": 2.9581557856691983e-05, + "loss": 0.0, + "step": 51633 + }, + { + "epoch": 4.817952785294392, + "grad_norm": NaN, + "learning_rate": 2.9577048009617882e-05, + "loss": 0.0, + "step": 51634 + }, + { + "epoch": 4.81804609498927, + "grad_norm": NaN, + "learning_rate": 2.9572538468743777e-05, + "loss": 0.0, + "step": 51635 + }, + { + "epoch": 4.818139404684147, + "grad_norm": NaN, + "learning_rate": 2.9568029234080982e-05, + "loss": 0.0, + "step": 51636 + }, + { + "epoch": 4.818232714379024, + "grad_norm": NaN, + "learning_rate": 2.9563520305640964e-05, + "loss": 0.0, + "step": 51637 + }, + { + "epoch": 4.818326024073901, + "grad_norm": NaN, + "learning_rate": 2.9559011683435336e-05, + "loss": 0.0, + "step": 51638 + }, + { + "epoch": 4.818419333768778, + "grad_norm": NaN, + "learning_rate": 2.955450336747542e-05, + "loss": 0.0, + "step": 51639 + }, + { + "epoch": 4.818512643463656, + "grad_norm": NaN, + "learning_rate": 2.9549995357772673e-05, + "loss": 0.0, + "step": 51640 + }, + { + "epoch": 4.818605953158533, + "grad_norm": NaN, + "learning_rate": 2.95454876543387e-05, + "loss": 0.0, + "step": 51641 + }, + { + "epoch": 4.818699262853411, + "grad_norm": NaN, + "learning_rate": 2.9540980257184816e-05, + "loss": 0.0, + "step": 51642 + }, + { + "epoch": 4.818792572548288, + "grad_norm": NaN, + "learning_rate": 2.9536473166322506e-05, + "loss": 0.0, + "step": 51643 + }, + { + "epoch": 4.8188858822431655, + "grad_norm": NaN, + "learning_rate": 2.9531966381763345e-05, + "loss": 0.0, + "step": 51644 + }, + { + "epoch": 4.818979191938042, + "grad_norm": NaN, + "learning_rate": 2.9527459903518657e-05, + "loss": 0.0, + "step": 51645 + }, + { + "epoch": 4.8190725016329194, + "grad_norm": NaN, + "learning_rate": 2.9522953731599917e-05, + "loss": 0.0, + "step": 51646 + }, + { + "epoch": 4.819165811327797, + "grad_norm": NaN, + "learning_rate": 2.9518447866018703e-05, + "loss": 0.0, + "step": 51647 + }, + { + "epoch": 4.819259121022674, + "grad_norm": NaN, + "learning_rate": 2.951394230678635e-05, + "loss": 0.0, + "step": 51648 + }, + { + "epoch": 4.819352430717552, + "grad_norm": NaN, + "learning_rate": 2.9509437053914298e-05, + "loss": 0.0, + "step": 51649 + }, + { + "epoch": 4.819445740412429, + "grad_norm": NaN, + "learning_rate": 2.9504932107414142e-05, + "loss": 0.0, + "step": 51650 + }, + { + "epoch": 4.819539050107306, + "grad_norm": NaN, + "learning_rate": 2.950042746729722e-05, + "loss": 0.0, + "step": 51651 + }, + { + "epoch": 4.819632359802183, + "grad_norm": NaN, + "learning_rate": 2.949592313357496e-05, + "loss": 0.0, + "step": 51652 + }, + { + "epoch": 4.8197256694970605, + "grad_norm": NaN, + "learning_rate": 2.9491419106258967e-05, + "loss": 0.0, + "step": 51653 + }, + { + "epoch": 4.819818979191938, + "grad_norm": NaN, + "learning_rate": 2.9486915385360548e-05, + "loss": 0.0, + "step": 51654 + }, + { + "epoch": 4.819912288886815, + "grad_norm": NaN, + "learning_rate": 2.948241197089116e-05, + "loss": 0.0, + "step": 51655 + }, + { + "epoch": 4.820005598581693, + "grad_norm": NaN, + "learning_rate": 2.947790886286238e-05, + "loss": 0.0, + "step": 51656 + }, + { + "epoch": 4.82009890827657, + "grad_norm": NaN, + "learning_rate": 2.947340606128555e-05, + "loss": 0.0, + "step": 51657 + }, + { + "epoch": 4.820192217971448, + "grad_norm": NaN, + "learning_rate": 2.9468903566172085e-05, + "loss": 0.0, + "step": 51658 + }, + { + "epoch": 4.820285527666325, + "grad_norm": NaN, + "learning_rate": 2.9464401377533587e-05, + "loss": 0.0, + "step": 51659 + }, + { + "epoch": 4.8203788373612015, + "grad_norm": NaN, + "learning_rate": 2.9459899495381374e-05, + "loss": 0.0, + "step": 51660 + }, + { + "epoch": 4.820472147056079, + "grad_norm": NaN, + "learning_rate": 2.9455397919726888e-05, + "loss": 0.0, + "step": 51661 + }, + { + "epoch": 4.820565456750956, + "grad_norm": NaN, + "learning_rate": 2.9450896650581702e-05, + "loss": 0.0, + "step": 51662 + }, + { + "epoch": 4.820658766445834, + "grad_norm": NaN, + "learning_rate": 2.9446395687957076e-05, + "loss": 0.0, + "step": 51663 + }, + { + "epoch": 4.820752076140711, + "grad_norm": NaN, + "learning_rate": 2.9441895031864614e-05, + "loss": 0.0, + "step": 51664 + }, + { + "epoch": 4.820845385835589, + "grad_norm": NaN, + "learning_rate": 2.943739468231573e-05, + "loss": 0.0, + "step": 51665 + }, + { + "epoch": 4.820938695530465, + "grad_norm": NaN, + "learning_rate": 2.9432894639321774e-05, + "loss": 0.0, + "step": 51666 + }, + { + "epoch": 4.821032005225343, + "grad_norm": NaN, + "learning_rate": 2.9428394902894307e-05, + "loss": 0.0, + "step": 51667 + }, + { + "epoch": 4.82112531492022, + "grad_norm": NaN, + "learning_rate": 2.9423895473044707e-05, + "loss": 0.0, + "step": 51668 + }, + { + "epoch": 4.821218624615097, + "grad_norm": NaN, + "learning_rate": 2.9419396349784357e-05, + "loss": 0.0, + "step": 51669 + }, + { + "epoch": 4.821311934309975, + "grad_norm": NaN, + "learning_rate": 2.9414897533124855e-05, + "loss": 0.0, + "step": 51670 + }, + { + "epoch": 4.821405244004852, + "grad_norm": NaN, + "learning_rate": 2.9410399023077504e-05, + "loss": 0.0, + "step": 51671 + }, + { + "epoch": 4.82149855369973, + "grad_norm": NaN, + "learning_rate": 2.9405900819653762e-05, + "loss": 0.0, + "step": 51672 + }, + { + "epoch": 4.821591863394607, + "grad_norm": NaN, + "learning_rate": 2.9401402922865152e-05, + "loss": 0.0, + "step": 51673 + }, + { + "epoch": 4.821685173089484, + "grad_norm": NaN, + "learning_rate": 2.939690533272302e-05, + "loss": 0.0, + "step": 51674 + }, + { + "epoch": 4.821778482784361, + "grad_norm": NaN, + "learning_rate": 2.9392408049238786e-05, + "loss": 0.0, + "step": 51675 + }, + { + "epoch": 4.8218717924792385, + "grad_norm": NaN, + "learning_rate": 2.9387911072424025e-05, + "loss": 0.0, + "step": 51676 + }, + { + "epoch": 4.821965102174116, + "grad_norm": NaN, + "learning_rate": 2.938341440229003e-05, + "loss": 0.0, + "step": 51677 + }, + { + "epoch": 4.822058411868993, + "grad_norm": NaN, + "learning_rate": 2.9378918038848238e-05, + "loss": 0.0, + "step": 51678 + }, + { + "epoch": 4.822151721563871, + "grad_norm": NaN, + "learning_rate": 2.9374421982110208e-05, + "loss": 0.0, + "step": 51679 + }, + { + "epoch": 4.822245031258748, + "grad_norm": NaN, + "learning_rate": 2.9369926232087264e-05, + "loss": 0.0, + "step": 51680 + }, + { + "epoch": 4.822338340953625, + "grad_norm": NaN, + "learning_rate": 2.93654307887908e-05, + "loss": 0.0, + "step": 51681 + }, + { + "epoch": 4.822431650648502, + "grad_norm": NaN, + "learning_rate": 2.936093565223242e-05, + "loss": 0.0, + "step": 51682 + }, + { + "epoch": 4.8225249603433795, + "grad_norm": NaN, + "learning_rate": 2.9356440822423383e-05, + "loss": 0.0, + "step": 51683 + }, + { + "epoch": 4.822618270038257, + "grad_norm": NaN, + "learning_rate": 2.9351946299375144e-05, + "loss": 0.0, + "step": 51684 + }, + { + "epoch": 4.822711579733134, + "grad_norm": NaN, + "learning_rate": 2.934745208309925e-05, + "loss": 0.0, + "step": 51685 + }, + { + "epoch": 4.822804889428012, + "grad_norm": NaN, + "learning_rate": 2.9342958173607018e-05, + "loss": 0.0, + "step": 51686 + }, + { + "epoch": 4.822898199122889, + "grad_norm": NaN, + "learning_rate": 2.933846457090984e-05, + "loss": 0.0, + "step": 51687 + }, + { + "epoch": 4.822991508817767, + "grad_norm": NaN, + "learning_rate": 2.9333971275019297e-05, + "loss": 0.0, + "step": 51688 + }, + { + "epoch": 4.823084818512643, + "grad_norm": NaN, + "learning_rate": 2.9329478285946672e-05, + "loss": 0.0, + "step": 51689 + }, + { + "epoch": 4.823178128207521, + "grad_norm": NaN, + "learning_rate": 2.932498560370341e-05, + "loss": 0.0, + "step": 51690 + }, + { + "epoch": 4.823271437902398, + "grad_norm": NaN, + "learning_rate": 2.9320493228301033e-05, + "loss": 0.0, + "step": 51691 + }, + { + "epoch": 4.823364747597275, + "grad_norm": NaN, + "learning_rate": 2.931600115975087e-05, + "loss": 0.0, + "step": 51692 + }, + { + "epoch": 4.823458057292153, + "grad_norm": NaN, + "learning_rate": 2.9311509398064303e-05, + "loss": 0.0, + "step": 51693 + }, + { + "epoch": 4.82355136698703, + "grad_norm": NaN, + "learning_rate": 2.9307017943252914e-05, + "loss": 0.0, + "step": 51694 + }, + { + "epoch": 4.823644676681907, + "grad_norm": NaN, + "learning_rate": 2.9302526795327973e-05, + "loss": 0.0, + "step": 51695 + }, + { + "epoch": 4.823737986376784, + "grad_norm": NaN, + "learning_rate": 2.9298035954300924e-05, + "loss": 0.0, + "step": 51696 + }, + { + "epoch": 4.823831296071662, + "grad_norm": NaN, + "learning_rate": 2.929354542018329e-05, + "loss": 0.0, + "step": 51697 + }, + { + "epoch": 4.823924605766539, + "grad_norm": NaN, + "learning_rate": 2.9289055192986378e-05, + "loss": 0.0, + "step": 51698 + }, + { + "epoch": 4.8240179154614165, + "grad_norm": NaN, + "learning_rate": 2.9284565272721612e-05, + "loss": 0.0, + "step": 51699 + }, + { + "epoch": 4.824111225156294, + "grad_norm": NaN, + "learning_rate": 2.92800756594005e-05, + "loss": 0.0, + "step": 51700 + }, + { + "epoch": 4.824204534851171, + "grad_norm": NaN, + "learning_rate": 2.9275586353034313e-05, + "loss": 0.0, + "step": 51701 + }, + { + "epoch": 4.824297844546049, + "grad_norm": NaN, + "learning_rate": 2.92710973536346e-05, + "loss": 0.0, + "step": 51702 + }, + { + "epoch": 4.824391154240926, + "grad_norm": NaN, + "learning_rate": 2.926660866121276e-05, + "loss": 0.0, + "step": 51703 + }, + { + "epoch": 4.824484463935803, + "grad_norm": NaN, + "learning_rate": 2.9262120275780104e-05, + "loss": 0.0, + "step": 51704 + }, + { + "epoch": 4.82457777363068, + "grad_norm": NaN, + "learning_rate": 2.9257632197348123e-05, + "loss": 0.0, + "step": 51705 + }, + { + "epoch": 4.8246710833255575, + "grad_norm": NaN, + "learning_rate": 2.9253144425928278e-05, + "loss": 0.0, + "step": 51706 + }, + { + "epoch": 4.824764393020435, + "grad_norm": NaN, + "learning_rate": 2.9248656961531835e-05, + "loss": 0.0, + "step": 51707 + }, + { + "epoch": 4.824857702715312, + "grad_norm": NaN, + "learning_rate": 2.9244169804170324e-05, + "loss": 0.0, + "step": 51708 + }, + { + "epoch": 4.82495101241019, + "grad_norm": NaN, + "learning_rate": 2.923968295385517e-05, + "loss": 0.0, + "step": 51709 + }, + { + "epoch": 4.825044322105066, + "grad_norm": NaN, + "learning_rate": 2.923519641059763e-05, + "loss": 0.0, + "step": 51710 + }, + { + "epoch": 4.825137631799944, + "grad_norm": NaN, + "learning_rate": 2.9230710174409273e-05, + "loss": 0.0, + "step": 51711 + }, + { + "epoch": 4.825230941494821, + "grad_norm": NaN, + "learning_rate": 2.9226224245301477e-05, + "loss": 0.0, + "step": 51712 + }, + { + "epoch": 4.8253242511896985, + "grad_norm": NaN, + "learning_rate": 2.9221738623285534e-05, + "loss": 0.0, + "step": 51713 + }, + { + "epoch": 4.825417560884576, + "grad_norm": NaN, + "learning_rate": 2.9217253308373017e-05, + "loss": 0.0, + "step": 51714 + }, + { + "epoch": 4.825510870579453, + "grad_norm": NaN, + "learning_rate": 2.92127683005752e-05, + "loss": 0.0, + "step": 51715 + }, + { + "epoch": 4.825604180274331, + "grad_norm": NaN, + "learning_rate": 2.9208283599903492e-05, + "loss": 0.0, + "step": 51716 + }, + { + "epoch": 4.825697489969208, + "grad_norm": NaN, + "learning_rate": 2.9203799206369434e-05, + "loss": 0.0, + "step": 51717 + }, + { + "epoch": 4.825790799664085, + "grad_norm": NaN, + "learning_rate": 2.9199315119984278e-05, + "loss": 0.0, + "step": 51718 + }, + { + "epoch": 4.825884109358962, + "grad_norm": NaN, + "learning_rate": 2.919483134075944e-05, + "loss": 0.0, + "step": 51719 + }, + { + "epoch": 4.82597741905384, + "grad_norm": NaN, + "learning_rate": 2.9190347868706436e-05, + "loss": 0.0, + "step": 51720 + }, + { + "epoch": 4.826070728748717, + "grad_norm": NaN, + "learning_rate": 2.9185864703836565e-05, + "loss": 0.0, + "step": 51721 + }, + { + "epoch": 4.826164038443594, + "grad_norm": NaN, + "learning_rate": 2.918138184616121e-05, + "loss": 0.0, + "step": 51722 + }, + { + "epoch": 4.826257348138472, + "grad_norm": NaN, + "learning_rate": 2.9176899295691886e-05, + "loss": 0.0, + "step": 51723 + }, + { + "epoch": 4.826350657833349, + "grad_norm": NaN, + "learning_rate": 2.917241705243988e-05, + "loss": 0.0, + "step": 51724 + }, + { + "epoch": 4.826443967528226, + "grad_norm": NaN, + "learning_rate": 2.916793511641658e-05, + "loss": 0.0, + "step": 51725 + }, + { + "epoch": 4.826537277223103, + "grad_norm": NaN, + "learning_rate": 2.9163453487633514e-05, + "loss": 0.0, + "step": 51726 + }, + { + "epoch": 4.826630586917981, + "grad_norm": NaN, + "learning_rate": 2.9158972166101958e-05, + "loss": 0.0, + "step": 51727 + }, + { + "epoch": 4.826723896612858, + "grad_norm": NaN, + "learning_rate": 2.915449115183328e-05, + "loss": 0.0, + "step": 51728 + }, + { + "epoch": 4.8268172063077355, + "grad_norm": NaN, + "learning_rate": 2.9150010444839028e-05, + "loss": 0.0, + "step": 51729 + }, + { + "epoch": 4.826910516002613, + "grad_norm": NaN, + "learning_rate": 2.9145530045130454e-05, + "loss": 0.0, + "step": 51730 + }, + { + "epoch": 4.82700382569749, + "grad_norm": NaN, + "learning_rate": 2.914104995271897e-05, + "loss": 0.0, + "step": 51731 + }, + { + "epoch": 4.827097135392368, + "grad_norm": NaN, + "learning_rate": 2.9136570167616063e-05, + "loss": 0.0, + "step": 51732 + }, + { + "epoch": 4.827190445087244, + "grad_norm": NaN, + "learning_rate": 2.9132090689833025e-05, + "loss": 0.0, + "step": 51733 + }, + { + "epoch": 4.827283754782122, + "grad_norm": NaN, + "learning_rate": 2.9127611519381232e-05, + "loss": 0.0, + "step": 51734 + }, + { + "epoch": 4.827377064476999, + "grad_norm": NaN, + "learning_rate": 2.912313265627219e-05, + "loss": 0.0, + "step": 51735 + }, + { + "epoch": 4.8274703741718765, + "grad_norm": NaN, + "learning_rate": 2.911865410051719e-05, + "loss": 0.0, + "step": 51736 + }, + { + "epoch": 4.827563683866754, + "grad_norm": NaN, + "learning_rate": 2.911417585212759e-05, + "loss": 0.0, + "step": 51737 + }, + { + "epoch": 4.827656993561631, + "grad_norm": NaN, + "learning_rate": 2.9109697911114915e-05, + "loss": 0.0, + "step": 51738 + }, + { + "epoch": 4.827750303256508, + "grad_norm": NaN, + "learning_rate": 2.9105220277490404e-05, + "loss": 0.0, + "step": 51739 + }, + { + "epoch": 4.827843612951385, + "grad_norm": NaN, + "learning_rate": 2.910074295126553e-05, + "loss": 0.0, + "step": 51740 + }, + { + "epoch": 4.827936922646263, + "grad_norm": NaN, + "learning_rate": 2.909626593245169e-05, + "loss": 0.0, + "step": 51741 + }, + { + "epoch": 4.82803023234114, + "grad_norm": NaN, + "learning_rate": 2.9091789221060168e-05, + "loss": 0.0, + "step": 51742 + }, + { + "epoch": 4.828123542036018, + "grad_norm": NaN, + "learning_rate": 2.9087312817102443e-05, + "loss": 0.0, + "step": 51743 + }, + { + "epoch": 4.828216851730895, + "grad_norm": NaN, + "learning_rate": 2.9082836720589902e-05, + "loss": 0.0, + "step": 51744 + }, + { + "epoch": 4.828310161425772, + "grad_norm": NaN, + "learning_rate": 2.9078360931533808e-05, + "loss": 0.0, + "step": 51745 + }, + { + "epoch": 4.82840347112065, + "grad_norm": NaN, + "learning_rate": 2.9073885449945662e-05, + "loss": 0.0, + "step": 51746 + }, + { + "epoch": 4.828496780815526, + "grad_norm": NaN, + "learning_rate": 2.9069410275836842e-05, + "loss": 0.0, + "step": 51747 + }, + { + "epoch": 4.828590090510404, + "grad_norm": NaN, + "learning_rate": 2.906493540921862e-05, + "loss": 0.0, + "step": 51748 + }, + { + "epoch": 4.828683400205281, + "grad_norm": NaN, + "learning_rate": 2.9060460850102474e-05, + "loss": 0.0, + "step": 51749 + }, + { + "epoch": 4.828776709900159, + "grad_norm": NaN, + "learning_rate": 2.9055986598499792e-05, + "loss": 0.0, + "step": 51750 + }, + { + "epoch": 4.828870019595036, + "grad_norm": NaN, + "learning_rate": 2.9051512654421833e-05, + "loss": 0.0, + "step": 51751 + }, + { + "epoch": 4.8289633292899135, + "grad_norm": NaN, + "learning_rate": 2.9047039017880084e-05, + "loss": 0.0, + "step": 51752 + }, + { + "epoch": 4.829056638984791, + "grad_norm": NaN, + "learning_rate": 2.9042565688885943e-05, + "loss": 0.0, + "step": 51753 + }, + { + "epoch": 4.829149948679667, + "grad_norm": NaN, + "learning_rate": 2.903809266745063e-05, + "loss": 0.0, + "step": 51754 + }, + { + "epoch": 4.829243258374545, + "grad_norm": NaN, + "learning_rate": 2.9033619953585668e-05, + "loss": 0.0, + "step": 51755 + }, + { + "epoch": 4.829336568069422, + "grad_norm": NaN, + "learning_rate": 2.9029147547302402e-05, + "loss": 0.0, + "step": 51756 + }, + { + "epoch": 4.8294298777643, + "grad_norm": NaN, + "learning_rate": 2.9024675448612117e-05, + "loss": 0.0, + "step": 51757 + }, + { + "epoch": 4.829523187459177, + "grad_norm": NaN, + "learning_rate": 2.902020365752631e-05, + "loss": 0.0, + "step": 51758 + }, + { + "epoch": 4.8296164971540545, + "grad_norm": NaN, + "learning_rate": 2.9015732174056267e-05, + "loss": 0.0, + "step": 51759 + }, + { + "epoch": 4.829709806848932, + "grad_norm": NaN, + "learning_rate": 2.9011260998213314e-05, + "loss": 0.0, + "step": 51760 + }, + { + "epoch": 4.829803116543809, + "grad_norm": NaN, + "learning_rate": 2.9006790130008994e-05, + "loss": 0.0, + "step": 51761 + }, + { + "epoch": 4.829896426238686, + "grad_norm": NaN, + "learning_rate": 2.9002319569454498e-05, + "loss": 0.0, + "step": 51762 + }, + { + "epoch": 4.829989735933563, + "grad_norm": NaN, + "learning_rate": 2.8997849316561243e-05, + "loss": 0.0, + "step": 51763 + }, + { + "epoch": 4.830083045628441, + "grad_norm": NaN, + "learning_rate": 2.899337937134068e-05, + "loss": 0.0, + "step": 51764 + }, + { + "epoch": 4.830176355323318, + "grad_norm": NaN, + "learning_rate": 2.8988909733804077e-05, + "loss": 0.0, + "step": 51765 + }, + { + "epoch": 4.8302696650181955, + "grad_norm": NaN, + "learning_rate": 2.8984440403962778e-05, + "loss": 0.0, + "step": 51766 + }, + { + "epoch": 4.830362974713073, + "grad_norm": NaN, + "learning_rate": 2.8979971381828288e-05, + "loss": 0.0, + "step": 51767 + }, + { + "epoch": 4.8304562844079495, + "grad_norm": NaN, + "learning_rate": 2.8975502667411848e-05, + "loss": 0.0, + "step": 51768 + }, + { + "epoch": 4.830549594102827, + "grad_norm": NaN, + "learning_rate": 2.897103426072482e-05, + "loss": 0.0, + "step": 51769 + }, + { + "epoch": 4.830642903797704, + "grad_norm": NaN, + "learning_rate": 2.8966566161778653e-05, + "loss": 0.0, + "step": 51770 + }, + { + "epoch": 4.830736213492582, + "grad_norm": NaN, + "learning_rate": 2.896209837058463e-05, + "loss": 0.0, + "step": 51771 + }, + { + "epoch": 4.830829523187459, + "grad_norm": NaN, + "learning_rate": 2.8957630887154104e-05, + "loss": 0.0, + "step": 51772 + }, + { + "epoch": 4.830922832882337, + "grad_norm": NaN, + "learning_rate": 2.895316371149855e-05, + "loss": 0.0, + "step": 51773 + }, + { + "epoch": 4.831016142577214, + "grad_norm": NaN, + "learning_rate": 2.8948696843629195e-05, + "loss": 0.0, + "step": 51774 + }, + { + "epoch": 4.831109452272091, + "grad_norm": NaN, + "learning_rate": 2.8944230283557407e-05, + "loss": 0.0, + "step": 51775 + }, + { + "epoch": 4.831202761966969, + "grad_norm": NaN, + "learning_rate": 2.8939764031294647e-05, + "loss": 0.0, + "step": 51776 + }, + { + "epoch": 4.831296071661845, + "grad_norm": NaN, + "learning_rate": 2.8935298086852145e-05, + "loss": 0.0, + "step": 51777 + }, + { + "epoch": 4.831389381356723, + "grad_norm": NaN, + "learning_rate": 2.8930832450241364e-05, + "loss": 0.0, + "step": 51778 + }, + { + "epoch": 4.8314826910516, + "grad_norm": NaN, + "learning_rate": 2.8926367121473636e-05, + "loss": 0.0, + "step": 51779 + }, + { + "epoch": 4.831576000746478, + "grad_norm": NaN, + "learning_rate": 2.892190210056022e-05, + "loss": 0.0, + "step": 51780 + }, + { + "epoch": 4.831669310441355, + "grad_norm": NaN, + "learning_rate": 2.8917437387512583e-05, + "loss": 0.0, + "step": 51781 + }, + { + "epoch": 4.8317626201362325, + "grad_norm": NaN, + "learning_rate": 2.891297298234207e-05, + "loss": 0.0, + "step": 51782 + }, + { + "epoch": 4.831855929831109, + "grad_norm": NaN, + "learning_rate": 2.8908508885059924e-05, + "loss": 0.0, + "step": 51783 + }, + { + "epoch": 4.831949239525986, + "grad_norm": NaN, + "learning_rate": 2.8904045095677595e-05, + "loss": 0.0, + "step": 51784 + }, + { + "epoch": 4.832042549220864, + "grad_norm": NaN, + "learning_rate": 2.889958161420647e-05, + "loss": 0.0, + "step": 51785 + }, + { + "epoch": 4.832135858915741, + "grad_norm": NaN, + "learning_rate": 2.8895118440657744e-05, + "loss": 0.0, + "step": 51786 + }, + { + "epoch": 4.832229168610619, + "grad_norm": NaN, + "learning_rate": 2.8890655575042897e-05, + "loss": 0.0, + "step": 51787 + }, + { + "epoch": 4.832322478305496, + "grad_norm": NaN, + "learning_rate": 2.888619301737327e-05, + "loss": 0.0, + "step": 51788 + }, + { + "epoch": 4.8324157880003735, + "grad_norm": NaN, + "learning_rate": 2.8881730767660123e-05, + "loss": 0.0, + "step": 51789 + }, + { + "epoch": 4.832509097695251, + "grad_norm": NaN, + "learning_rate": 2.8877268825914872e-05, + "loss": 0.0, + "step": 51790 + }, + { + "epoch": 4.8326024073901275, + "grad_norm": NaN, + "learning_rate": 2.88728071921489e-05, + "loss": 0.0, + "step": 51791 + }, + { + "epoch": 4.832695717085005, + "grad_norm": NaN, + "learning_rate": 2.886834586637341e-05, + "loss": 0.0, + "step": 51792 + }, + { + "epoch": 4.832789026779882, + "grad_norm": NaN, + "learning_rate": 2.8863884848599893e-05, + "loss": 0.0, + "step": 51793 + }, + { + "epoch": 4.83288233647476, + "grad_norm": NaN, + "learning_rate": 2.885942413883966e-05, + "loss": 0.0, + "step": 51794 + }, + { + "epoch": 4.832975646169637, + "grad_norm": NaN, + "learning_rate": 2.8854963737103947e-05, + "loss": 0.0, + "step": 51795 + }, + { + "epoch": 4.833068955864515, + "grad_norm": NaN, + "learning_rate": 2.8850503643404217e-05, + "loss": 0.0, + "step": 51796 + }, + { + "epoch": 4.833162265559392, + "grad_norm": NaN, + "learning_rate": 2.8846043857751806e-05, + "loss": 0.0, + "step": 51797 + }, + { + "epoch": 4.8332555752542685, + "grad_norm": NaN, + "learning_rate": 2.884158438015794e-05, + "loss": 0.0, + "step": 51798 + }, + { + "epoch": 4.833348884949146, + "grad_norm": NaN, + "learning_rate": 2.883712521063406e-05, + "loss": 0.0, + "step": 51799 + }, + { + "epoch": 4.833442194644023, + "grad_norm": NaN, + "learning_rate": 2.883266634919154e-05, + "loss": 0.0, + "step": 51800 + }, + { + "epoch": 4.833535504338901, + "grad_norm": NaN, + "learning_rate": 2.8828207795841575e-05, + "loss": 0.0, + "step": 51801 + }, + { + "epoch": 4.833628814033778, + "grad_norm": NaN, + "learning_rate": 2.8823749550595648e-05, + "loss": 0.0, + "step": 51802 + }, + { + "epoch": 4.833722123728656, + "grad_norm": NaN, + "learning_rate": 2.8819291613465007e-05, + "loss": 0.0, + "step": 51803 + }, + { + "epoch": 4.833815433423533, + "grad_norm": NaN, + "learning_rate": 2.881483398446097e-05, + "loss": 0.0, + "step": 51804 + }, + { + "epoch": 4.8339087431184105, + "grad_norm": NaN, + "learning_rate": 2.8810376663594985e-05, + "loss": 0.0, + "step": 51805 + }, + { + "epoch": 4.834002052813287, + "grad_norm": NaN, + "learning_rate": 2.880591965087829e-05, + "loss": 0.0, + "step": 51806 + }, + { + "epoch": 4.834095362508164, + "grad_norm": NaN, + "learning_rate": 2.8801462946322174e-05, + "loss": 0.0, + "step": 51807 + }, + { + "epoch": 4.834188672203042, + "grad_norm": NaN, + "learning_rate": 2.879700654993813e-05, + "loss": 0.0, + "step": 51808 + }, + { + "epoch": 4.834281981897919, + "grad_norm": NaN, + "learning_rate": 2.8792550461737364e-05, + "loss": 0.0, + "step": 51809 + }, + { + "epoch": 4.834375291592797, + "grad_norm": NaN, + "learning_rate": 2.8788094681731184e-05, + "loss": 0.0, + "step": 51810 + }, + { + "epoch": 4.834468601287674, + "grad_norm": NaN, + "learning_rate": 2.8783639209931065e-05, + "loss": 0.0, + "step": 51811 + }, + { + "epoch": 4.834561910982551, + "grad_norm": NaN, + "learning_rate": 2.8779184046348196e-05, + "loss": 0.0, + "step": 51812 + }, + { + "epoch": 4.834655220677428, + "grad_norm": NaN, + "learning_rate": 2.8774729190993922e-05, + "loss": 0.0, + "step": 51813 + }, + { + "epoch": 4.8347485303723055, + "grad_norm": NaN, + "learning_rate": 2.8770274643879664e-05, + "loss": 0.0, + "step": 51814 + }, + { + "epoch": 4.834841840067183, + "grad_norm": NaN, + "learning_rate": 2.8765820405016633e-05, + "loss": 0.0, + "step": 51815 + }, + { + "epoch": 4.83493514976206, + "grad_norm": NaN, + "learning_rate": 2.876136647441623e-05, + "loss": 0.0, + "step": 51816 + }, + { + "epoch": 4.835028459456938, + "grad_norm": NaN, + "learning_rate": 2.8756912852089804e-05, + "loss": 0.0, + "step": 51817 + }, + { + "epoch": 4.835121769151815, + "grad_norm": NaN, + "learning_rate": 2.8752459538048556e-05, + "loss": 0.0, + "step": 51818 + }, + { + "epoch": 4.8352150788466925, + "grad_norm": NaN, + "learning_rate": 2.8748006532303913e-05, + "loss": 0.0, + "step": 51819 + }, + { + "epoch": 4.83530838854157, + "grad_norm": NaN, + "learning_rate": 2.8743553834867232e-05, + "loss": 0.0, + "step": 51820 + }, + { + "epoch": 4.8354016982364465, + "grad_norm": NaN, + "learning_rate": 2.8739101445749672e-05, + "loss": 0.0, + "step": 51821 + }, + { + "epoch": 4.835495007931324, + "grad_norm": NaN, + "learning_rate": 2.8734649364962724e-05, + "loss": 0.0, + "step": 51822 + }, + { + "epoch": 4.835588317626201, + "grad_norm": NaN, + "learning_rate": 2.8730197592517663e-05, + "loss": 0.0, + "step": 51823 + }, + { + "epoch": 4.835681627321079, + "grad_norm": NaN, + "learning_rate": 2.872574612842571e-05, + "loss": 0.0, + "step": 51824 + }, + { + "epoch": 4.835774937015956, + "grad_norm": NaN, + "learning_rate": 2.8721294972698304e-05, + "loss": 0.0, + "step": 51825 + }, + { + "epoch": 4.835868246710834, + "grad_norm": NaN, + "learning_rate": 2.871684412534676e-05, + "loss": 0.0, + "step": 51826 + }, + { + "epoch": 4.83596155640571, + "grad_norm": NaN, + "learning_rate": 2.8712393586382265e-05, + "loss": 0.0, + "step": 51827 + }, + { + "epoch": 4.8360548661005875, + "grad_norm": NaN, + "learning_rate": 2.8707943355816275e-05, + "loss": 0.0, + "step": 51828 + }, + { + "epoch": 4.836148175795465, + "grad_norm": NaN, + "learning_rate": 2.87034934336601e-05, + "loss": 0.0, + "step": 51829 + }, + { + "epoch": 4.836241485490342, + "grad_norm": NaN, + "learning_rate": 2.8699043819924917e-05, + "loss": 0.0, + "step": 51830 + }, + { + "epoch": 4.83633479518522, + "grad_norm": NaN, + "learning_rate": 2.8694594514622177e-05, + "loss": 0.0, + "step": 51831 + }, + { + "epoch": 4.836428104880097, + "grad_norm": NaN, + "learning_rate": 2.8690145517763192e-05, + "loss": 0.0, + "step": 51832 + }, + { + "epoch": 4.836521414574975, + "grad_norm": NaN, + "learning_rate": 2.8685696829359166e-05, + "loss": 0.0, + "step": 51833 + }, + { + "epoch": 4.836614724269852, + "grad_norm": NaN, + "learning_rate": 2.8681248449421496e-05, + "loss": 0.0, + "step": 51834 + }, + { + "epoch": 4.836708033964729, + "grad_norm": NaN, + "learning_rate": 2.8676800377961517e-05, + "loss": 0.0, + "step": 51835 + }, + { + "epoch": 4.836801343659606, + "grad_norm": NaN, + "learning_rate": 2.8672352614990425e-05, + "loss": 0.0, + "step": 51836 + }, + { + "epoch": 4.836894653354483, + "grad_norm": NaN, + "learning_rate": 2.866790516051964e-05, + "loss": 0.0, + "step": 51837 + }, + { + "epoch": 4.836987963049361, + "grad_norm": NaN, + "learning_rate": 2.866345801456047e-05, + "loss": 0.0, + "step": 51838 + }, + { + "epoch": 4.837081272744238, + "grad_norm": NaN, + "learning_rate": 2.865901117712409e-05, + "loss": 0.0, + "step": 51839 + }, + { + "epoch": 4.837174582439116, + "grad_norm": NaN, + "learning_rate": 2.865456464822196e-05, + "loss": 0.0, + "step": 51840 + }, + { + "epoch": 4.837267892133993, + "grad_norm": NaN, + "learning_rate": 2.8650118427865356e-05, + "loss": 0.0, + "step": 51841 + }, + { + "epoch": 4.83736120182887, + "grad_norm": NaN, + "learning_rate": 2.8645672516065478e-05, + "loss": 0.0, + "step": 51842 + }, + { + "epoch": 4.837454511523747, + "grad_norm": NaN, + "learning_rate": 2.8641226912833754e-05, + "loss": 0.0, + "step": 51843 + }, + { + "epoch": 4.8375478212186245, + "grad_norm": NaN, + "learning_rate": 2.8636781618181475e-05, + "loss": 0.0, + "step": 51844 + }, + { + "epoch": 4.837641130913502, + "grad_norm": NaN, + "learning_rate": 2.8632336632119816e-05, + "loss": 0.0, + "step": 51845 + }, + { + "epoch": 4.837734440608379, + "grad_norm": NaN, + "learning_rate": 2.8627891954660236e-05, + "loss": 0.0, + "step": 51846 + }, + { + "epoch": 4.837827750303257, + "grad_norm": NaN, + "learning_rate": 2.862344758581399e-05, + "loss": 0.0, + "step": 51847 + }, + { + "epoch": 4.837921059998134, + "grad_norm": NaN, + "learning_rate": 2.8619003525592305e-05, + "loss": 0.0, + "step": 51848 + }, + { + "epoch": 4.838014369693012, + "grad_norm": NaN, + "learning_rate": 2.86145597740066e-05, + "loss": 0.0, + "step": 51849 + }, + { + "epoch": 4.838107679387888, + "grad_norm": NaN, + "learning_rate": 2.8610116331068055e-05, + "loss": 0.0, + "step": 51850 + }, + { + "epoch": 4.8382009890827655, + "grad_norm": NaN, + "learning_rate": 2.8605673196788043e-05, + "loss": 0.0, + "step": 51851 + }, + { + "epoch": 4.838294298777643, + "grad_norm": NaN, + "learning_rate": 2.8601230371177903e-05, + "loss": 0.0, + "step": 51852 + }, + { + "epoch": 4.83838760847252, + "grad_norm": NaN, + "learning_rate": 2.8596787854248775e-05, + "loss": 0.0, + "step": 51853 + }, + { + "epoch": 4.838480918167398, + "grad_norm": NaN, + "learning_rate": 2.8592345646012104e-05, + "loss": 0.0, + "step": 51854 + }, + { + "epoch": 4.838574227862275, + "grad_norm": NaN, + "learning_rate": 2.858790374647916e-05, + "loss": 0.0, + "step": 51855 + }, + { + "epoch": 4.838667537557152, + "grad_norm": NaN, + "learning_rate": 2.8583462155661157e-05, + "loss": 0.0, + "step": 51856 + }, + { + "epoch": 4.838760847252029, + "grad_norm": NaN, + "learning_rate": 2.8579020873569463e-05, + "loss": 0.0, + "step": 51857 + }, + { + "epoch": 4.838854156946907, + "grad_norm": NaN, + "learning_rate": 2.8574579900215407e-05, + "loss": 0.0, + "step": 51858 + }, + { + "epoch": 4.838947466641784, + "grad_norm": NaN, + "learning_rate": 2.857013923561014e-05, + "loss": 0.0, + "step": 51859 + }, + { + "epoch": 4.839040776336661, + "grad_norm": NaN, + "learning_rate": 2.8565698879765076e-05, + "loss": 0.0, + "step": 51860 + }, + { + "epoch": 4.839134086031539, + "grad_norm": NaN, + "learning_rate": 2.8561258832691503e-05, + "loss": 0.0, + "step": 51861 + }, + { + "epoch": 4.839227395726416, + "grad_norm": NaN, + "learning_rate": 2.8556819094400595e-05, + "loss": 0.0, + "step": 51862 + }, + { + "epoch": 4.839320705421294, + "grad_norm": NaN, + "learning_rate": 2.855237966490378e-05, + "loss": 0.0, + "step": 51863 + }, + { + "epoch": 4.83941401511617, + "grad_norm": NaN, + "learning_rate": 2.854794054421231e-05, + "loss": 0.0, + "step": 51864 + }, + { + "epoch": 4.839507324811048, + "grad_norm": NaN, + "learning_rate": 2.8543501732337383e-05, + "loss": 0.0, + "step": 51865 + }, + { + "epoch": 4.839600634505925, + "grad_norm": NaN, + "learning_rate": 2.8539063229290384e-05, + "loss": 0.0, + "step": 51866 + }, + { + "epoch": 4.8396939442008025, + "grad_norm": NaN, + "learning_rate": 2.8534625035082605e-05, + "loss": 0.0, + "step": 51867 + }, + { + "epoch": 4.83978725389568, + "grad_norm": NaN, + "learning_rate": 2.853018714972522e-05, + "loss": 0.0, + "step": 51868 + }, + { + "epoch": 4.839880563590557, + "grad_norm": NaN, + "learning_rate": 2.8525749573229623e-05, + "loss": 0.0, + "step": 51869 + }, + { + "epoch": 4.839973873285435, + "grad_norm": NaN, + "learning_rate": 2.852131230560712e-05, + "loss": 0.0, + "step": 51870 + }, + { + "epoch": 4.840067182980311, + "grad_norm": NaN, + "learning_rate": 2.8516875346868835e-05, + "loss": 0.0, + "step": 51871 + }, + { + "epoch": 4.840160492675189, + "grad_norm": NaN, + "learning_rate": 2.851243869702619e-05, + "loss": 0.0, + "step": 51872 + }, + { + "epoch": 4.840253802370066, + "grad_norm": NaN, + "learning_rate": 2.850800235609048e-05, + "loss": 0.0, + "step": 51873 + }, + { + "epoch": 4.8403471120649435, + "grad_norm": NaN, + "learning_rate": 2.850356632407283e-05, + "loss": 0.0, + "step": 51874 + }, + { + "epoch": 4.840440421759821, + "grad_norm": NaN, + "learning_rate": 2.8499130600984677e-05, + "loss": 0.0, + "step": 51875 + }, + { + "epoch": 4.840533731454698, + "grad_norm": NaN, + "learning_rate": 2.849469518683728e-05, + "loss": 0.0, + "step": 51876 + }, + { + "epoch": 4.840627041149576, + "grad_norm": NaN, + "learning_rate": 2.8490260081641797e-05, + "loss": 0.0, + "step": 51877 + }, + { + "epoch": 4.840720350844453, + "grad_norm": NaN, + "learning_rate": 2.8485825285409635e-05, + "loss": 0.0, + "step": 51878 + }, + { + "epoch": 4.84081366053933, + "grad_norm": NaN, + "learning_rate": 2.848139079815207e-05, + "loss": 0.0, + "step": 51879 + }, + { + "epoch": 4.840906970234207, + "grad_norm": NaN, + "learning_rate": 2.8476956619880244e-05, + "loss": 0.0, + "step": 51880 + }, + { + "epoch": 4.8410002799290845, + "grad_norm": NaN, + "learning_rate": 2.847252275060556e-05, + "loss": 0.0, + "step": 51881 + }, + { + "epoch": 4.841093589623962, + "grad_norm": NaN, + "learning_rate": 2.8468089190339294e-05, + "loss": 0.0, + "step": 51882 + }, + { + "epoch": 4.841186899318839, + "grad_norm": NaN, + "learning_rate": 2.8463655939092588e-05, + "loss": 0.0, + "step": 51883 + }, + { + "epoch": 4.841280209013717, + "grad_norm": NaN, + "learning_rate": 2.845922299687685e-05, + "loss": 0.0, + "step": 51884 + }, + { + "epoch": 4.841373518708593, + "grad_norm": NaN, + "learning_rate": 2.845479036370334e-05, + "loss": 0.0, + "step": 51885 + }, + { + "epoch": 4.841466828403471, + "grad_norm": NaN, + "learning_rate": 2.8450358039583225e-05, + "loss": 0.0, + "step": 51886 + }, + { + "epoch": 4.841560138098348, + "grad_norm": NaN, + "learning_rate": 2.844592602452787e-05, + "loss": 0.0, + "step": 51887 + }, + { + "epoch": 4.841653447793226, + "grad_norm": NaN, + "learning_rate": 2.844149431854853e-05, + "loss": 0.0, + "step": 51888 + }, + { + "epoch": 4.841746757488103, + "grad_norm": NaN, + "learning_rate": 2.8437062921656443e-05, + "loss": 0.0, + "step": 51889 + }, + { + "epoch": 4.84184006718298, + "grad_norm": NaN, + "learning_rate": 2.8432631833862924e-05, + "loss": 0.0, + "step": 51890 + }, + { + "epoch": 4.841933376877858, + "grad_norm": NaN, + "learning_rate": 2.8428201055179188e-05, + "loss": 0.0, + "step": 51891 + }, + { + "epoch": 4.842026686572735, + "grad_norm": NaN, + "learning_rate": 2.8423770585616535e-05, + "loss": 0.0, + "step": 51892 + }, + { + "epoch": 4.842119996267613, + "grad_norm": NaN, + "learning_rate": 2.841934042518625e-05, + "loss": 0.0, + "step": 51893 + }, + { + "epoch": 4.842213305962489, + "grad_norm": NaN, + "learning_rate": 2.841491057389949e-05, + "loss": 0.0, + "step": 51894 + }, + { + "epoch": 4.842306615657367, + "grad_norm": NaN, + "learning_rate": 2.8410481031767652e-05, + "loss": 0.0, + "step": 51895 + }, + { + "epoch": 4.842399925352244, + "grad_norm": NaN, + "learning_rate": 2.840605179880197e-05, + "loss": 0.0, + "step": 51896 + }, + { + "epoch": 4.8424932350471215, + "grad_norm": NaN, + "learning_rate": 2.8401622875013606e-05, + "loss": 0.0, + "step": 51897 + }, + { + "epoch": 4.842586544741999, + "grad_norm": NaN, + "learning_rate": 2.839719426041392e-05, + "loss": 0.0, + "step": 51898 + }, + { + "epoch": 4.842679854436876, + "grad_norm": NaN, + "learning_rate": 2.8392765955014196e-05, + "loss": 0.0, + "step": 51899 + }, + { + "epoch": 4.842773164131753, + "grad_norm": NaN, + "learning_rate": 2.838833795882558e-05, + "loss": 0.0, + "step": 51900 + }, + { + "epoch": 4.84286647382663, + "grad_norm": NaN, + "learning_rate": 2.838391027185941e-05, + "loss": 0.0, + "step": 51901 + }, + { + "epoch": 4.842959783521508, + "grad_norm": NaN, + "learning_rate": 2.8379482894127e-05, + "loss": 0.0, + "step": 51902 + }, + { + "epoch": 4.843053093216385, + "grad_norm": NaN, + "learning_rate": 2.8375055825639432e-05, + "loss": 0.0, + "step": 51903 + }, + { + "epoch": 4.8431464029112625, + "grad_norm": NaN, + "learning_rate": 2.8370629066408117e-05, + "loss": 0.0, + "step": 51904 + }, + { + "epoch": 4.84323971260614, + "grad_norm": NaN, + "learning_rate": 2.8366202616444316e-05, + "loss": 0.0, + "step": 51905 + }, + { + "epoch": 4.843333022301017, + "grad_norm": NaN, + "learning_rate": 2.8361776475759136e-05, + "loss": 0.0, + "step": 51906 + }, + { + "epoch": 4.843426331995895, + "grad_norm": NaN, + "learning_rate": 2.835735064436398e-05, + "loss": 0.0, + "step": 51907 + }, + { + "epoch": 4.843519641690771, + "grad_norm": NaN, + "learning_rate": 2.8352925122270076e-05, + "loss": 0.0, + "step": 51908 + }, + { + "epoch": 4.843612951385649, + "grad_norm": NaN, + "learning_rate": 2.834849990948858e-05, + "loss": 0.0, + "step": 51909 + }, + { + "epoch": 4.843706261080526, + "grad_norm": NaN, + "learning_rate": 2.8344075006030837e-05, + "loss": 0.0, + "step": 51910 + }, + { + "epoch": 4.843799570775404, + "grad_norm": NaN, + "learning_rate": 2.8339650411908133e-05, + "loss": 0.0, + "step": 51911 + }, + { + "epoch": 4.843892880470281, + "grad_norm": NaN, + "learning_rate": 2.8335226127131562e-05, + "loss": 0.0, + "step": 51912 + }, + { + "epoch": 4.843986190165158, + "grad_norm": NaN, + "learning_rate": 2.833080215171251e-05, + "loss": 0.0, + "step": 51913 + }, + { + "epoch": 4.844079499860036, + "grad_norm": NaN, + "learning_rate": 2.8326378485662244e-05, + "loss": 0.0, + "step": 51914 + }, + { + "epoch": 4.844172809554912, + "grad_norm": NaN, + "learning_rate": 2.8321955128991864e-05, + "loss": 0.0, + "step": 51915 + }, + { + "epoch": 4.84426611924979, + "grad_norm": NaN, + "learning_rate": 2.8317532081712762e-05, + "loss": 0.0, + "step": 51916 + }, + { + "epoch": 4.844359428944667, + "grad_norm": NaN, + "learning_rate": 2.8313109343836148e-05, + "loss": 0.0, + "step": 51917 + }, + { + "epoch": 4.844452738639545, + "grad_norm": NaN, + "learning_rate": 2.8308686915373196e-05, + "loss": 0.0, + "step": 51918 + }, + { + "epoch": 4.844546048334422, + "grad_norm": NaN, + "learning_rate": 2.830426479633523e-05, + "loss": 0.0, + "step": 51919 + }, + { + "epoch": 4.8446393580292995, + "grad_norm": NaN, + "learning_rate": 2.8299842986733506e-05, + "loss": 0.0, + "step": 51920 + }, + { + "epoch": 4.844732667724177, + "grad_norm": NaN, + "learning_rate": 2.829542148657915e-05, + "loss": 0.0, + "step": 51921 + }, + { + "epoch": 4.844825977419054, + "grad_norm": NaN, + "learning_rate": 2.8291000295883537e-05, + "loss": 0.0, + "step": 51922 + }, + { + "epoch": 4.844919287113931, + "grad_norm": NaN, + "learning_rate": 2.828657941465789e-05, + "loss": 0.0, + "step": 51923 + }, + { + "epoch": 4.845012596808808, + "grad_norm": NaN, + "learning_rate": 2.828215884291335e-05, + "loss": 0.0, + "step": 51924 + }, + { + "epoch": 4.845105906503686, + "grad_norm": NaN, + "learning_rate": 2.8277738580661258e-05, + "loss": 0.0, + "step": 51925 + }, + { + "epoch": 4.845199216198563, + "grad_norm": NaN, + "learning_rate": 2.8273318627912807e-05, + "loss": 0.0, + "step": 51926 + }, + { + "epoch": 4.8452925258934405, + "grad_norm": NaN, + "learning_rate": 2.8268898984679268e-05, + "loss": 0.0, + "step": 51927 + }, + { + "epoch": 4.845385835588318, + "grad_norm": NaN, + "learning_rate": 2.826447965097185e-05, + "loss": 0.0, + "step": 51928 + }, + { + "epoch": 4.8454791452831945, + "grad_norm": NaN, + "learning_rate": 2.8260060626801794e-05, + "loss": 0.0, + "step": 51929 + }, + { + "epoch": 4.845572454978072, + "grad_norm": NaN, + "learning_rate": 2.8255641912180345e-05, + "loss": 0.0, + "step": 51930 + }, + { + "epoch": 4.845665764672949, + "grad_norm": NaN, + "learning_rate": 2.8251223507118735e-05, + "loss": 0.0, + "step": 51931 + }, + { + "epoch": 4.845759074367827, + "grad_norm": NaN, + "learning_rate": 2.8246805411628216e-05, + "loss": 0.0, + "step": 51932 + }, + { + "epoch": 4.845852384062704, + "grad_norm": NaN, + "learning_rate": 2.8242387625719986e-05, + "loss": 0.0, + "step": 51933 + }, + { + "epoch": 4.8459456937575816, + "grad_norm": NaN, + "learning_rate": 2.8237970149405293e-05, + "loss": 0.0, + "step": 51934 + }, + { + "epoch": 4.846039003452459, + "grad_norm": NaN, + "learning_rate": 2.823355298269539e-05, + "loss": 0.0, + "step": 51935 + }, + { + "epoch": 4.846132313147336, + "grad_norm": NaN, + "learning_rate": 2.8229136125601487e-05, + "loss": 0.0, + "step": 51936 + }, + { + "epoch": 4.846225622842213, + "grad_norm": NaN, + "learning_rate": 2.8224719578134857e-05, + "loss": 0.0, + "step": 51937 + }, + { + "epoch": 4.84631893253709, + "grad_norm": NaN, + "learning_rate": 2.822030334030661e-05, + "loss": 0.0, + "step": 51938 + }, + { + "epoch": 4.846412242231968, + "grad_norm": NaN, + "learning_rate": 2.8215887412128102e-05, + "loss": 0.0, + "step": 51939 + }, + { + "epoch": 4.846505551926845, + "grad_norm": NaN, + "learning_rate": 2.8211471793610542e-05, + "loss": 0.0, + "step": 51940 + }, + { + "epoch": 4.846598861621723, + "grad_norm": NaN, + "learning_rate": 2.8207056484765066e-05, + "loss": 0.0, + "step": 51941 + }, + { + "epoch": 4.8466921713166, + "grad_norm": NaN, + "learning_rate": 2.8202641485603005e-05, + "loss": 0.0, + "step": 51942 + }, + { + "epoch": 4.846785481011477, + "grad_norm": NaN, + "learning_rate": 2.8198226796135596e-05, + "loss": 0.0, + "step": 51943 + }, + { + "epoch": 4.846878790706354, + "grad_norm": NaN, + "learning_rate": 2.8193812416373933e-05, + "loss": 0.0, + "step": 51944 + }, + { + "epoch": 4.846972100401231, + "grad_norm": NaN, + "learning_rate": 2.8189398346329356e-05, + "loss": 0.0, + "step": 51945 + }, + { + "epoch": 4.847065410096109, + "grad_norm": NaN, + "learning_rate": 2.8184984586013105e-05, + "loss": 0.0, + "step": 51946 + }, + { + "epoch": 4.847158719790986, + "grad_norm": NaN, + "learning_rate": 2.818057113543627e-05, + "loss": 0.0, + "step": 51947 + }, + { + "epoch": 4.847252029485864, + "grad_norm": NaN, + "learning_rate": 2.8176157994610192e-05, + "loss": 0.0, + "step": 51948 + }, + { + "epoch": 4.847345339180741, + "grad_norm": NaN, + "learning_rate": 2.81717451635461e-05, + "loss": 0.0, + "step": 51949 + }, + { + "epoch": 4.8474386488756185, + "grad_norm": NaN, + "learning_rate": 2.8167332642255097e-05, + "loss": 0.0, + "step": 51950 + }, + { + "epoch": 4.847531958570496, + "grad_norm": NaN, + "learning_rate": 2.8162920430748508e-05, + "loss": 0.0, + "step": 51951 + }, + { + "epoch": 4.847625268265372, + "grad_norm": NaN, + "learning_rate": 2.815850852903758e-05, + "loss": 0.0, + "step": 51952 + }, + { + "epoch": 4.84771857796025, + "grad_norm": NaN, + "learning_rate": 2.8154096937133374e-05, + "loss": 0.0, + "step": 51953 + }, + { + "epoch": 4.847811887655127, + "grad_norm": NaN, + "learning_rate": 2.814968565504726e-05, + "loss": 0.0, + "step": 51954 + }, + { + "epoch": 4.847905197350005, + "grad_norm": NaN, + "learning_rate": 2.8145274682790425e-05, + "loss": 0.0, + "step": 51955 + }, + { + "epoch": 4.847998507044882, + "grad_norm": NaN, + "learning_rate": 2.8140864020373994e-05, + "loss": 0.0, + "step": 51956 + }, + { + "epoch": 4.8480918167397595, + "grad_norm": NaN, + "learning_rate": 2.813645366780929e-05, + "loss": 0.0, + "step": 51957 + }, + { + "epoch": 4.848185126434636, + "grad_norm": NaN, + "learning_rate": 2.81320436251075e-05, + "loss": 0.0, + "step": 51958 + }, + { + "epoch": 4.8482784361295135, + "grad_norm": NaN, + "learning_rate": 2.812763389227978e-05, + "loss": 0.0, + "step": 51959 + }, + { + "epoch": 4.848371745824391, + "grad_norm": NaN, + "learning_rate": 2.8123224469337403e-05, + "loss": 0.0, + "step": 51960 + }, + { + "epoch": 4.848465055519268, + "grad_norm": NaN, + "learning_rate": 2.8118815356291602e-05, + "loss": 0.0, + "step": 51961 + }, + { + "epoch": 4.848558365214146, + "grad_norm": NaN, + "learning_rate": 2.8114406553153483e-05, + "loss": 0.0, + "step": 51962 + }, + { + "epoch": 4.848651674909023, + "grad_norm": NaN, + "learning_rate": 2.810999805993436e-05, + "loss": 0.0, + "step": 51963 + }, + { + "epoch": 4.848744984603901, + "grad_norm": NaN, + "learning_rate": 2.8105589876645396e-05, + "loss": 0.0, + "step": 51964 + }, + { + "epoch": 4.848838294298778, + "grad_norm": NaN, + "learning_rate": 2.8101182003297806e-05, + "loss": 0.0, + "step": 51965 + }, + { + "epoch": 4.848931603993655, + "grad_norm": NaN, + "learning_rate": 2.8096774439902815e-05, + "loss": 0.0, + "step": 51966 + }, + { + "epoch": 4.849024913688532, + "grad_norm": NaN, + "learning_rate": 2.809236718647161e-05, + "loss": 0.0, + "step": 51967 + }, + { + "epoch": 4.849118223383409, + "grad_norm": NaN, + "learning_rate": 2.80879602430154e-05, + "loss": 0.0, + "step": 51968 + }, + { + "epoch": 4.849211533078287, + "grad_norm": NaN, + "learning_rate": 2.8083553609545395e-05, + "loss": 0.0, + "step": 51969 + }, + { + "epoch": 4.849304842773164, + "grad_norm": NaN, + "learning_rate": 2.8079147286072817e-05, + "loss": 0.0, + "step": 51970 + }, + { + "epoch": 4.849398152468042, + "grad_norm": NaN, + "learning_rate": 2.8074741272608826e-05, + "loss": 0.0, + "step": 51971 + }, + { + "epoch": 4.849491462162919, + "grad_norm": NaN, + "learning_rate": 2.807033556916466e-05, + "loss": 0.0, + "step": 51972 + }, + { + "epoch": 4.849584771857796, + "grad_norm": NaN, + "learning_rate": 2.8065930175751527e-05, + "loss": 0.0, + "step": 51973 + }, + { + "epoch": 4.849678081552673, + "grad_norm": NaN, + "learning_rate": 2.806152509238059e-05, + "loss": 0.0, + "step": 51974 + }, + { + "epoch": 4.84977139124755, + "grad_norm": NaN, + "learning_rate": 2.80571203190631e-05, + "loss": 0.0, + "step": 51975 + }, + { + "epoch": 4.849864700942428, + "grad_norm": NaN, + "learning_rate": 2.8052715855810215e-05, + "loss": 0.0, + "step": 51976 + }, + { + "epoch": 4.849958010637305, + "grad_norm": NaN, + "learning_rate": 2.8048311702633146e-05, + "loss": 0.0, + "step": 51977 + }, + { + "epoch": 4.850051320332183, + "grad_norm": NaN, + "learning_rate": 2.8043907859543102e-05, + "loss": 0.0, + "step": 51978 + }, + { + "epoch": 4.85014463002706, + "grad_norm": NaN, + "learning_rate": 2.803950432655127e-05, + "loss": 0.0, + "step": 51979 + }, + { + "epoch": 4.8502379397219375, + "grad_norm": NaN, + "learning_rate": 2.8035101103668862e-05, + "loss": 0.0, + "step": 51980 + }, + { + "epoch": 4.850331249416814, + "grad_norm": NaN, + "learning_rate": 2.803069819090708e-05, + "loss": 0.0, + "step": 51981 + }, + { + "epoch": 4.8504245591116915, + "grad_norm": NaN, + "learning_rate": 2.8026295588277053e-05, + "loss": 0.0, + "step": 51982 + }, + { + "epoch": 4.850517868806569, + "grad_norm": NaN, + "learning_rate": 2.802189329579004e-05, + "loss": 0.0, + "step": 51983 + }, + { + "epoch": 4.850611178501446, + "grad_norm": NaN, + "learning_rate": 2.8017491313457274e-05, + "loss": 0.0, + "step": 51984 + }, + { + "epoch": 4.850704488196324, + "grad_norm": NaN, + "learning_rate": 2.8013089641289804e-05, + "loss": 0.0, + "step": 51985 + }, + { + "epoch": 4.850797797891201, + "grad_norm": NaN, + "learning_rate": 2.800868827929897e-05, + "loss": 0.0, + "step": 51986 + }, + { + "epoch": 4.8508911075860786, + "grad_norm": NaN, + "learning_rate": 2.8004287227495926e-05, + "loss": 0.0, + "step": 51987 + }, + { + "epoch": 4.850984417280955, + "grad_norm": NaN, + "learning_rate": 2.7999886485891764e-05, + "loss": 0.0, + "step": 51988 + }, + { + "epoch": 4.8510777269758325, + "grad_norm": NaN, + "learning_rate": 2.7995486054497797e-05, + "loss": 0.0, + "step": 51989 + }, + { + "epoch": 4.85117103667071, + "grad_norm": NaN, + "learning_rate": 2.7991085933325207e-05, + "loss": 0.0, + "step": 51990 + }, + { + "epoch": 4.851264346365587, + "grad_norm": NaN, + "learning_rate": 2.798668612238506e-05, + "loss": 0.0, + "step": 51991 + }, + { + "epoch": 4.851357656060465, + "grad_norm": NaN, + "learning_rate": 2.7982286621688678e-05, + "loss": 0.0, + "step": 51992 + }, + { + "epoch": 4.851450965755342, + "grad_norm": NaN, + "learning_rate": 2.797788743124723e-05, + "loss": 0.0, + "step": 51993 + }, + { + "epoch": 4.85154427545022, + "grad_norm": NaN, + "learning_rate": 2.797348855107178e-05, + "loss": 0.0, + "step": 51994 + }, + { + "epoch": 4.851637585145097, + "grad_norm": NaN, + "learning_rate": 2.7969089981173655e-05, + "loss": 0.0, + "step": 51995 + }, + { + "epoch": 4.8517308948399736, + "grad_norm": NaN, + "learning_rate": 2.796469172156402e-05, + "loss": 0.0, + "step": 51996 + }, + { + "epoch": 4.851824204534851, + "grad_norm": NaN, + "learning_rate": 2.7960293772253956e-05, + "loss": 0.0, + "step": 51997 + }, + { + "epoch": 4.851917514229728, + "grad_norm": NaN, + "learning_rate": 2.7955896133254736e-05, + "loss": 0.0, + "step": 51998 + }, + { + "epoch": 4.852010823924606, + "grad_norm": NaN, + "learning_rate": 2.7951498804577565e-05, + "loss": 0.0, + "step": 51999 + }, + { + "epoch": 4.852104133619483, + "grad_norm": NaN, + "learning_rate": 2.7947101786233507e-05, + "loss": 0.0, + "step": 52000 + }, + { + "epoch": 4.852197443314361, + "grad_norm": NaN, + "learning_rate": 2.7942705078233846e-05, + "loss": 0.0, + "step": 52001 + }, + { + "epoch": 4.852290753009237, + "grad_norm": NaN, + "learning_rate": 2.7938308680589745e-05, + "loss": 0.0, + "step": 52002 + }, + { + "epoch": 4.852384062704115, + "grad_norm": NaN, + "learning_rate": 2.7933912593312356e-05, + "loss": 0.0, + "step": 52003 + }, + { + "epoch": 4.852477372398992, + "grad_norm": NaN, + "learning_rate": 2.792951681641286e-05, + "loss": 0.0, + "step": 52004 + }, + { + "epoch": 4.852570682093869, + "grad_norm": NaN, + "learning_rate": 2.7925121349902462e-05, + "loss": 0.0, + "step": 52005 + }, + { + "epoch": 4.852663991788747, + "grad_norm": NaN, + "learning_rate": 2.7920726193792302e-05, + "loss": 0.0, + "step": 52006 + }, + { + "epoch": 4.852757301483624, + "grad_norm": NaN, + "learning_rate": 2.7916331348093574e-05, + "loss": 0.0, + "step": 52007 + }, + { + "epoch": 4.852850611178502, + "grad_norm": NaN, + "learning_rate": 2.7911936812817466e-05, + "loss": 0.0, + "step": 52008 + }, + { + "epoch": 4.852943920873379, + "grad_norm": NaN, + "learning_rate": 2.7907542587975135e-05, + "loss": 0.0, + "step": 52009 + }, + { + "epoch": 4.8530372305682565, + "grad_norm": NaN, + "learning_rate": 2.7903148673577742e-05, + "loss": 0.0, + "step": 52010 + }, + { + "epoch": 4.853130540263133, + "grad_norm": NaN, + "learning_rate": 2.7898755069636474e-05, + "loss": 0.0, + "step": 52011 + }, + { + "epoch": 4.8532238499580105, + "grad_norm": NaN, + "learning_rate": 2.7894361776162526e-05, + "loss": 0.0, + "step": 52012 + }, + { + "epoch": 4.853317159652888, + "grad_norm": NaN, + "learning_rate": 2.7889968793167023e-05, + "loss": 0.0, + "step": 52013 + }, + { + "epoch": 4.853410469347765, + "grad_norm": NaN, + "learning_rate": 2.788557612066117e-05, + "loss": 0.0, + "step": 52014 + }, + { + "epoch": 4.853503779042643, + "grad_norm": NaN, + "learning_rate": 2.788118375865612e-05, + "loss": 0.0, + "step": 52015 + }, + { + "epoch": 4.85359708873752, + "grad_norm": NaN, + "learning_rate": 2.7876791707163043e-05, + "loss": 0.0, + "step": 52016 + }, + { + "epoch": 4.853690398432397, + "grad_norm": NaN, + "learning_rate": 2.7872399966193103e-05, + "loss": 0.0, + "step": 52017 + }, + { + "epoch": 4.853783708127274, + "grad_norm": NaN, + "learning_rate": 2.7868008535757497e-05, + "loss": 0.0, + "step": 52018 + }, + { + "epoch": 4.8538770178221515, + "grad_norm": NaN, + "learning_rate": 2.7863617415867344e-05, + "loss": 0.0, + "step": 52019 + }, + { + "epoch": 4.853970327517029, + "grad_norm": NaN, + "learning_rate": 2.785922660653384e-05, + "loss": 0.0, + "step": 52020 + }, + { + "epoch": 4.854063637211906, + "grad_norm": NaN, + "learning_rate": 2.7854836107768142e-05, + "loss": 0.0, + "step": 52021 + }, + { + "epoch": 4.854156946906784, + "grad_norm": NaN, + "learning_rate": 2.785044591958142e-05, + "loss": 0.0, + "step": 52022 + }, + { + "epoch": 4.854250256601661, + "grad_norm": NaN, + "learning_rate": 2.784605604198482e-05, + "loss": 0.0, + "step": 52023 + }, + { + "epoch": 4.854343566296539, + "grad_norm": NaN, + "learning_rate": 2.7841666474989516e-05, + "loss": 0.0, + "step": 52024 + }, + { + "epoch": 4.854436875991415, + "grad_norm": NaN, + "learning_rate": 2.783727721860668e-05, + "loss": 0.0, + "step": 52025 + }, + { + "epoch": 4.854530185686293, + "grad_norm": NaN, + "learning_rate": 2.7832888272847435e-05, + "loss": 0.0, + "step": 52026 + }, + { + "epoch": 4.85462349538117, + "grad_norm": NaN, + "learning_rate": 2.7828499637722995e-05, + "loss": 0.0, + "step": 52027 + }, + { + "epoch": 4.854716805076047, + "grad_norm": NaN, + "learning_rate": 2.7824111313244514e-05, + "loss": 0.0, + "step": 52028 + }, + { + "epoch": 4.854810114770925, + "grad_norm": NaN, + "learning_rate": 2.7819723299423047e-05, + "loss": 0.0, + "step": 52029 + }, + { + "epoch": 4.854903424465802, + "grad_norm": NaN, + "learning_rate": 2.781533559626989e-05, + "loss": 0.0, + "step": 52030 + }, + { + "epoch": 4.85499673416068, + "grad_norm": NaN, + "learning_rate": 2.7810948203796164e-05, + "loss": 0.0, + "step": 52031 + }, + { + "epoch": 4.855090043855556, + "grad_norm": NaN, + "learning_rate": 2.7806561122012927e-05, + "loss": 0.0, + "step": 52032 + }, + { + "epoch": 4.855183353550434, + "grad_norm": NaN, + "learning_rate": 2.7802174350931437e-05, + "loss": 0.0, + "step": 52033 + }, + { + "epoch": 4.855276663245311, + "grad_norm": NaN, + "learning_rate": 2.7797787890562873e-05, + "loss": 0.0, + "step": 52034 + }, + { + "epoch": 4.8553699729401885, + "grad_norm": NaN, + "learning_rate": 2.7793401740918252e-05, + "loss": 0.0, + "step": 52035 + }, + { + "epoch": 4.855463282635066, + "grad_norm": NaN, + "learning_rate": 2.7789015902008854e-05, + "loss": 0.0, + "step": 52036 + }, + { + "epoch": 4.855556592329943, + "grad_norm": NaN, + "learning_rate": 2.7784630373845818e-05, + "loss": 0.0, + "step": 52037 + }, + { + "epoch": 4.855649902024821, + "grad_norm": NaN, + "learning_rate": 2.7780245156440184e-05, + "loss": 0.0, + "step": 52038 + }, + { + "epoch": 4.855743211719698, + "grad_norm": NaN, + "learning_rate": 2.7775860249803227e-05, + "loss": 0.0, + "step": 52039 + }, + { + "epoch": 4.855836521414575, + "grad_norm": NaN, + "learning_rate": 2.7771475653946056e-05, + "loss": 0.0, + "step": 52040 + }, + { + "epoch": 4.855929831109452, + "grad_norm": NaN, + "learning_rate": 2.7767091368879812e-05, + "loss": 0.0, + "step": 52041 + }, + { + "epoch": 4.8560231408043295, + "grad_norm": NaN, + "learning_rate": 2.7762707394615652e-05, + "loss": 0.0, + "step": 52042 + }, + { + "epoch": 4.856116450499207, + "grad_norm": NaN, + "learning_rate": 2.77583237311647e-05, + "loss": 0.0, + "step": 52043 + }, + { + "epoch": 4.856209760194084, + "grad_norm": NaN, + "learning_rate": 2.775394037853813e-05, + "loss": 0.0, + "step": 52044 + }, + { + "epoch": 4.856303069888962, + "grad_norm": NaN, + "learning_rate": 2.7749557336747084e-05, + "loss": 0.0, + "step": 52045 + }, + { + "epoch": 4.856396379583838, + "grad_norm": NaN, + "learning_rate": 2.7745174605802706e-05, + "loss": 0.0, + "step": 52046 + }, + { + "epoch": 4.856489689278716, + "grad_norm": NaN, + "learning_rate": 2.7740792185716116e-05, + "loss": 0.0, + "step": 52047 + }, + { + "epoch": 4.856582998973593, + "grad_norm": NaN, + "learning_rate": 2.7736410076498488e-05, + "loss": 0.0, + "step": 52048 + }, + { + "epoch": 4.8566763086684706, + "grad_norm": NaN, + "learning_rate": 2.773202827816095e-05, + "loss": 0.0, + "step": 52049 + }, + { + "epoch": 4.856769618363348, + "grad_norm": NaN, + "learning_rate": 2.7727646790714643e-05, + "loss": 0.0, + "step": 52050 + }, + { + "epoch": 4.856862928058225, + "grad_norm": NaN, + "learning_rate": 2.7723265614170702e-05, + "loss": 0.0, + "step": 52051 + }, + { + "epoch": 4.856956237753103, + "grad_norm": NaN, + "learning_rate": 2.7718884748540294e-05, + "loss": 0.0, + "step": 52052 + }, + { + "epoch": 4.85704954744798, + "grad_norm": NaN, + "learning_rate": 2.7714504193834536e-05, + "loss": 0.0, + "step": 52053 + }, + { + "epoch": 4.857142857142857, + "grad_norm": NaN, + "learning_rate": 2.7710123950064555e-05, + "loss": 0.0, + "step": 52054 + }, + { + "epoch": 4.857236166837734, + "grad_norm": NaN, + "learning_rate": 2.770574401724151e-05, + "loss": 0.0, + "step": 52055 + }, + { + "epoch": 4.857329476532612, + "grad_norm": NaN, + "learning_rate": 2.7701364395376542e-05, + "loss": 0.0, + "step": 52056 + }, + { + "epoch": 4.857422786227489, + "grad_norm": NaN, + "learning_rate": 2.769698508448077e-05, + "loss": 0.0, + "step": 52057 + }, + { + "epoch": 4.857516095922366, + "grad_norm": NaN, + "learning_rate": 2.7692606084565343e-05, + "loss": 0.0, + "step": 52058 + }, + { + "epoch": 4.857609405617244, + "grad_norm": NaN, + "learning_rate": 2.7688227395641382e-05, + "loss": 0.0, + "step": 52059 + }, + { + "epoch": 4.857702715312121, + "grad_norm": NaN, + "learning_rate": 2.768384901772003e-05, + "loss": 0.0, + "step": 52060 + }, + { + "epoch": 4.857796025006998, + "grad_norm": NaN, + "learning_rate": 2.7679470950812406e-05, + "loss": 0.0, + "step": 52061 + }, + { + "epoch": 4.857889334701875, + "grad_norm": NaN, + "learning_rate": 2.7675093194929653e-05, + "loss": 0.0, + "step": 52062 + }, + { + "epoch": 4.857982644396753, + "grad_norm": NaN, + "learning_rate": 2.767071575008292e-05, + "loss": 0.0, + "step": 52063 + }, + { + "epoch": 4.85807595409163, + "grad_norm": NaN, + "learning_rate": 2.7666338616283317e-05, + "loss": 0.0, + "step": 52064 + }, + { + "epoch": 4.8581692637865075, + "grad_norm": NaN, + "learning_rate": 2.7661961793541963e-05, + "loss": 0.0, + "step": 52065 + }, + { + "epoch": 4.858262573481385, + "grad_norm": NaN, + "learning_rate": 2.7657585281870013e-05, + "loss": 0.0, + "step": 52066 + }, + { + "epoch": 4.858355883176262, + "grad_norm": NaN, + "learning_rate": 2.7653209081278577e-05, + "loss": 0.0, + "step": 52067 + }, + { + "epoch": 4.85844919287114, + "grad_norm": NaN, + "learning_rate": 2.7648833191778792e-05, + "loss": 0.0, + "step": 52068 + }, + { + "epoch": 4.858542502566016, + "grad_norm": NaN, + "learning_rate": 2.764445761338178e-05, + "loss": 0.0, + "step": 52069 + }, + { + "epoch": 4.858635812260894, + "grad_norm": NaN, + "learning_rate": 2.7640082346098673e-05, + "loss": 0.0, + "step": 52070 + }, + { + "epoch": 4.858729121955771, + "grad_norm": NaN, + "learning_rate": 2.7635707389940592e-05, + "loss": 0.0, + "step": 52071 + }, + { + "epoch": 4.8588224316506485, + "grad_norm": NaN, + "learning_rate": 2.7631332744918692e-05, + "loss": 0.0, + "step": 52072 + }, + { + "epoch": 4.858915741345526, + "grad_norm": NaN, + "learning_rate": 2.7626958411043988e-05, + "loss": 0.0, + "step": 52073 + }, + { + "epoch": 4.859009051040403, + "grad_norm": NaN, + "learning_rate": 2.762258438832771e-05, + "loss": 0.0, + "step": 52074 + }, + { + "epoch": 4.85910236073528, + "grad_norm": NaN, + "learning_rate": 2.761821067678096e-05, + "loss": 0.0, + "step": 52075 + }, + { + "epoch": 4.859195670430157, + "grad_norm": NaN, + "learning_rate": 2.761383727641484e-05, + "loss": 0.0, + "step": 52076 + }, + { + "epoch": 4.859288980125035, + "grad_norm": NaN, + "learning_rate": 2.7609464187240494e-05, + "loss": 0.0, + "step": 52077 + }, + { + "epoch": 4.859382289819912, + "grad_norm": NaN, + "learning_rate": 2.760509140926901e-05, + "loss": 0.0, + "step": 52078 + }, + { + "epoch": 4.85947559951479, + "grad_norm": NaN, + "learning_rate": 2.760071894251153e-05, + "loss": 0.0, + "step": 52079 + }, + { + "epoch": 4.859568909209667, + "grad_norm": NaN, + "learning_rate": 2.7596346786979163e-05, + "loss": 0.0, + "step": 52080 + }, + { + "epoch": 4.859662218904544, + "grad_norm": NaN, + "learning_rate": 2.7591974942683033e-05, + "loss": 0.0, + "step": 52081 + }, + { + "epoch": 4.859755528599422, + "grad_norm": NaN, + "learning_rate": 2.758760340963423e-05, + "loss": 0.0, + "step": 52082 + }, + { + "epoch": 4.859848838294299, + "grad_norm": NaN, + "learning_rate": 2.7583232187843913e-05, + "loss": 0.0, + "step": 52083 + }, + { + "epoch": 4.859942147989176, + "grad_norm": NaN, + "learning_rate": 2.7578861277323155e-05, + "loss": 0.0, + "step": 52084 + }, + { + "epoch": 4.860035457684053, + "grad_norm": NaN, + "learning_rate": 2.7574490678083116e-05, + "loss": 0.0, + "step": 52085 + }, + { + "epoch": 4.860128767378931, + "grad_norm": NaN, + "learning_rate": 2.7570120390134855e-05, + "loss": 0.0, + "step": 52086 + }, + { + "epoch": 4.860222077073808, + "grad_norm": NaN, + "learning_rate": 2.7565750413489524e-05, + "loss": 0.0, + "step": 52087 + }, + { + "epoch": 4.8603153867686855, + "grad_norm": NaN, + "learning_rate": 2.7561380748158217e-05, + "loss": 0.0, + "step": 52088 + }, + { + "epoch": 4.860408696463563, + "grad_norm": NaN, + "learning_rate": 2.755701139415206e-05, + "loss": 0.0, + "step": 52089 + }, + { + "epoch": 4.860502006158439, + "grad_norm": NaN, + "learning_rate": 2.7552642351482145e-05, + "loss": 0.0, + "step": 52090 + }, + { + "epoch": 4.860595315853317, + "grad_norm": NaN, + "learning_rate": 2.7548273620159595e-05, + "loss": 0.0, + "step": 52091 + }, + { + "epoch": 4.860688625548194, + "grad_norm": NaN, + "learning_rate": 2.7543905200195498e-05, + "loss": 0.0, + "step": 52092 + }, + { + "epoch": 4.860781935243072, + "grad_norm": NaN, + "learning_rate": 2.7539537091600983e-05, + "loss": 0.0, + "step": 52093 + }, + { + "epoch": 4.860875244937949, + "grad_norm": NaN, + "learning_rate": 2.7535169294387153e-05, + "loss": 0.0, + "step": 52094 + }, + { + "epoch": 4.8609685546328265, + "grad_norm": NaN, + "learning_rate": 2.7530801808565105e-05, + "loss": 0.0, + "step": 52095 + }, + { + "epoch": 4.861061864327704, + "grad_norm": NaN, + "learning_rate": 2.7526434634145962e-05, + "loss": 0.0, + "step": 52096 + }, + { + "epoch": 4.861155174022581, + "grad_norm": NaN, + "learning_rate": 2.752206777114081e-05, + "loss": 0.0, + "step": 52097 + }, + { + "epoch": 4.861248483717458, + "grad_norm": NaN, + "learning_rate": 2.751770121956076e-05, + "loss": 0.0, + "step": 52098 + }, + { + "epoch": 4.861341793412335, + "grad_norm": NaN, + "learning_rate": 2.751333497941691e-05, + "loss": 0.0, + "step": 52099 + }, + { + "epoch": 4.861435103107213, + "grad_norm": NaN, + "learning_rate": 2.750896905072037e-05, + "loss": 0.0, + "step": 52100 + }, + { + "epoch": 4.86152841280209, + "grad_norm": NaN, + "learning_rate": 2.7504603433482225e-05, + "loss": 0.0, + "step": 52101 + }, + { + "epoch": 4.861621722496968, + "grad_norm": NaN, + "learning_rate": 2.7500238127713597e-05, + "loss": 0.0, + "step": 52102 + }, + { + "epoch": 4.861715032191845, + "grad_norm": NaN, + "learning_rate": 2.7495873133425577e-05, + "loss": 0.0, + "step": 52103 + }, + { + "epoch": 4.861808341886722, + "grad_norm": NaN, + "learning_rate": 2.7491508450629274e-05, + "loss": 0.0, + "step": 52104 + }, + { + "epoch": 4.861901651581599, + "grad_norm": NaN, + "learning_rate": 2.748714407933576e-05, + "loss": 0.0, + "step": 52105 + }, + { + "epoch": 4.861994961276476, + "grad_norm": NaN, + "learning_rate": 2.748278001955616e-05, + "loss": 0.0, + "step": 52106 + }, + { + "epoch": 4.862088270971354, + "grad_norm": NaN, + "learning_rate": 2.747841627130155e-05, + "loss": 0.0, + "step": 52107 + }, + { + "epoch": 4.862181580666231, + "grad_norm": NaN, + "learning_rate": 2.747405283458302e-05, + "loss": 0.0, + "step": 52108 + }, + { + "epoch": 4.862274890361109, + "grad_norm": NaN, + "learning_rate": 2.7469689709411695e-05, + "loss": 0.0, + "step": 52109 + }, + { + "epoch": 4.862368200055986, + "grad_norm": NaN, + "learning_rate": 2.746532689579865e-05, + "loss": 0.0, + "step": 52110 + }, + { + "epoch": 4.862461509750863, + "grad_norm": NaN, + "learning_rate": 2.746096439375499e-05, + "loss": 0.0, + "step": 52111 + }, + { + "epoch": 4.862554819445741, + "grad_norm": NaN, + "learning_rate": 2.7456602203291793e-05, + "loss": 0.0, + "step": 52112 + }, + { + "epoch": 4.862648129140617, + "grad_norm": NaN, + "learning_rate": 2.745224032442015e-05, + "loss": 0.0, + "step": 52113 + }, + { + "epoch": 4.862741438835495, + "grad_norm": NaN, + "learning_rate": 2.7447878757151166e-05, + "loss": 0.0, + "step": 52114 + }, + { + "epoch": 4.862834748530372, + "grad_norm": NaN, + "learning_rate": 2.7443517501495915e-05, + "loss": 0.0, + "step": 52115 + }, + { + "epoch": 4.86292805822525, + "grad_norm": NaN, + "learning_rate": 2.7439156557465508e-05, + "loss": 0.0, + "step": 52116 + }, + { + "epoch": 4.863021367920127, + "grad_norm": NaN, + "learning_rate": 2.7434795925071e-05, + "loss": 0.0, + "step": 52117 + }, + { + "epoch": 4.8631146776150045, + "grad_norm": NaN, + "learning_rate": 2.743043560432352e-05, + "loss": 0.0, + "step": 52118 + }, + { + "epoch": 4.863207987309881, + "grad_norm": NaN, + "learning_rate": 2.742607559523412e-05, + "loss": 0.0, + "step": 52119 + }, + { + "epoch": 4.863301297004758, + "grad_norm": NaN, + "learning_rate": 2.742171589781391e-05, + "loss": 0.0, + "step": 52120 + }, + { + "epoch": 4.863394606699636, + "grad_norm": NaN, + "learning_rate": 2.7417356512073967e-05, + "loss": 0.0, + "step": 52121 + }, + { + "epoch": 4.863487916394513, + "grad_norm": NaN, + "learning_rate": 2.741299743802538e-05, + "loss": 0.0, + "step": 52122 + }, + { + "epoch": 4.863581226089391, + "grad_norm": NaN, + "learning_rate": 2.7408638675679206e-05, + "loss": 0.0, + "step": 52123 + }, + { + "epoch": 4.863674535784268, + "grad_norm": NaN, + "learning_rate": 2.740428022504657e-05, + "loss": 0.0, + "step": 52124 + }, + { + "epoch": 4.8637678454791455, + "grad_norm": NaN, + "learning_rate": 2.7399922086138533e-05, + "loss": 0.0, + "step": 52125 + }, + { + "epoch": 4.863861155174023, + "grad_norm": NaN, + "learning_rate": 2.7395564258966162e-05, + "loss": 0.0, + "step": 52126 + }, + { + "epoch": 4.8639544648689, + "grad_norm": NaN, + "learning_rate": 2.7391206743540574e-05, + "loss": 0.0, + "step": 52127 + }, + { + "epoch": 4.864047774563777, + "grad_norm": NaN, + "learning_rate": 2.73868495398728e-05, + "loss": 0.0, + "step": 52128 + }, + { + "epoch": 4.864141084258654, + "grad_norm": NaN, + "learning_rate": 2.7382492647973976e-05, + "loss": 0.0, + "step": 52129 + }, + { + "epoch": 4.864234393953532, + "grad_norm": NaN, + "learning_rate": 2.737813606785513e-05, + "loss": 0.0, + "step": 52130 + }, + { + "epoch": 4.864327703648409, + "grad_norm": NaN, + "learning_rate": 2.7373779799527367e-05, + "loss": 0.0, + "step": 52131 + }, + { + "epoch": 4.864421013343287, + "grad_norm": NaN, + "learning_rate": 2.736942384300177e-05, + "loss": 0.0, + "step": 52132 + }, + { + "epoch": 4.864514323038164, + "grad_norm": NaN, + "learning_rate": 2.7365068198289382e-05, + "loss": 0.0, + "step": 52133 + }, + { + "epoch": 4.8646076327330405, + "grad_norm": NaN, + "learning_rate": 2.7360712865401307e-05, + "loss": 0.0, + "step": 52134 + }, + { + "epoch": 4.864700942427918, + "grad_norm": NaN, + "learning_rate": 2.735635784434861e-05, + "loss": 0.0, + "step": 52135 + }, + { + "epoch": 4.864794252122795, + "grad_norm": NaN, + "learning_rate": 2.7352003135142375e-05, + "loss": 0.0, + "step": 52136 + }, + { + "epoch": 4.864887561817673, + "grad_norm": NaN, + "learning_rate": 2.734764873779365e-05, + "loss": 0.0, + "step": 52137 + }, + { + "epoch": 4.86498087151255, + "grad_norm": NaN, + "learning_rate": 2.7343294652313537e-05, + "loss": 0.0, + "step": 52138 + }, + { + "epoch": 4.865074181207428, + "grad_norm": NaN, + "learning_rate": 2.7338940878713077e-05, + "loss": 0.0, + "step": 52139 + }, + { + "epoch": 4.865167490902305, + "grad_norm": NaN, + "learning_rate": 2.7334587417003368e-05, + "loss": 0.0, + "step": 52140 + }, + { + "epoch": 4.8652608005971825, + "grad_norm": NaN, + "learning_rate": 2.733023426719546e-05, + "loss": 0.0, + "step": 52141 + }, + { + "epoch": 4.865354110292059, + "grad_norm": NaN, + "learning_rate": 2.7325881429300435e-05, + "loss": 0.0, + "step": 52142 + }, + { + "epoch": 4.865447419986936, + "grad_norm": NaN, + "learning_rate": 2.732152890332934e-05, + "loss": 0.0, + "step": 52143 + }, + { + "epoch": 4.865540729681814, + "grad_norm": NaN, + "learning_rate": 2.7317176689293262e-05, + "loss": 0.0, + "step": 52144 + }, + { + "epoch": 4.865634039376691, + "grad_norm": NaN, + "learning_rate": 2.731282478720327e-05, + "loss": 0.0, + "step": 52145 + }, + { + "epoch": 4.865727349071569, + "grad_norm": NaN, + "learning_rate": 2.730847319707042e-05, + "loss": 0.0, + "step": 52146 + }, + { + "epoch": 4.865820658766446, + "grad_norm": NaN, + "learning_rate": 2.7304121918905786e-05, + "loss": 0.0, + "step": 52147 + }, + { + "epoch": 4.8659139684613235, + "grad_norm": NaN, + "learning_rate": 2.7299770952720413e-05, + "loss": 0.0, + "step": 52148 + }, + { + "epoch": 4.8660072781562, + "grad_norm": NaN, + "learning_rate": 2.7295420298525372e-05, + "loss": 0.0, + "step": 52149 + }, + { + "epoch": 4.8661005878510775, + "grad_norm": NaN, + "learning_rate": 2.729106995633174e-05, + "loss": 0.0, + "step": 52150 + }, + { + "epoch": 4.866193897545955, + "grad_norm": NaN, + "learning_rate": 2.7286719926150574e-05, + "loss": 0.0, + "step": 52151 + }, + { + "epoch": 4.866287207240832, + "grad_norm": NaN, + "learning_rate": 2.7282370207992914e-05, + "loss": 0.0, + "step": 52152 + }, + { + "epoch": 4.86638051693571, + "grad_norm": NaN, + "learning_rate": 2.727802080186984e-05, + "loss": 0.0, + "step": 52153 + }, + { + "epoch": 4.866473826630587, + "grad_norm": NaN, + "learning_rate": 2.7273671707792417e-05, + "loss": 0.0, + "step": 52154 + }, + { + "epoch": 4.866567136325465, + "grad_norm": NaN, + "learning_rate": 2.7269322925771677e-05, + "loss": 0.0, + "step": 52155 + }, + { + "epoch": 4.866660446020342, + "grad_norm": NaN, + "learning_rate": 2.7264974455818688e-05, + "loss": 0.0, + "step": 52156 + }, + { + "epoch": 4.8667537557152185, + "grad_norm": NaN, + "learning_rate": 2.7260626297944533e-05, + "loss": 0.0, + "step": 52157 + }, + { + "epoch": 4.866847065410096, + "grad_norm": NaN, + "learning_rate": 2.725627845216023e-05, + "loss": 0.0, + "step": 52158 + }, + { + "epoch": 4.866940375104973, + "grad_norm": NaN, + "learning_rate": 2.725193091847687e-05, + "loss": 0.0, + "step": 52159 + }, + { + "epoch": 4.867033684799851, + "grad_norm": NaN, + "learning_rate": 2.7247583696905468e-05, + "loss": 0.0, + "step": 52160 + }, + { + "epoch": 4.867126994494728, + "grad_norm": NaN, + "learning_rate": 2.7243236787457107e-05, + "loss": 0.0, + "step": 52161 + }, + { + "epoch": 4.867220304189606, + "grad_norm": NaN, + "learning_rate": 2.7238890190142847e-05, + "loss": 0.0, + "step": 52162 + }, + { + "epoch": 4.867313613884482, + "grad_norm": NaN, + "learning_rate": 2.72345439049737e-05, + "loss": 0.0, + "step": 52163 + }, + { + "epoch": 4.86740692357936, + "grad_norm": NaN, + "learning_rate": 2.723019793196075e-05, + "loss": 0.0, + "step": 52164 + }, + { + "epoch": 4.867500233274237, + "grad_norm": NaN, + "learning_rate": 2.7225852271115045e-05, + "loss": 0.0, + "step": 52165 + }, + { + "epoch": 4.867593542969114, + "grad_norm": NaN, + "learning_rate": 2.7221506922447638e-05, + "loss": 0.0, + "step": 52166 + }, + { + "epoch": 4.867686852663992, + "grad_norm": NaN, + "learning_rate": 2.721716188596956e-05, + "loss": 0.0, + "step": 52167 + }, + { + "epoch": 4.867780162358869, + "grad_norm": NaN, + "learning_rate": 2.721281716169186e-05, + "loss": 0.0, + "step": 52168 + }, + { + "epoch": 4.867873472053747, + "grad_norm": NaN, + "learning_rate": 2.7208472749625597e-05, + "loss": 0.0, + "step": 52169 + }, + { + "epoch": 4.867966781748624, + "grad_norm": NaN, + "learning_rate": 2.720412864978182e-05, + "loss": 0.0, + "step": 52170 + }, + { + "epoch": 4.868060091443501, + "grad_norm": NaN, + "learning_rate": 2.7199784862171564e-05, + "loss": 0.0, + "step": 52171 + }, + { + "epoch": 4.868153401138378, + "grad_norm": NaN, + "learning_rate": 2.719544138680589e-05, + "loss": 0.0, + "step": 52172 + }, + { + "epoch": 4.868246710833255, + "grad_norm": NaN, + "learning_rate": 2.7191098223695817e-05, + "loss": 0.0, + "step": 52173 + }, + { + "epoch": 4.868340020528133, + "grad_norm": NaN, + "learning_rate": 2.718675537285241e-05, + "loss": 0.0, + "step": 52174 + }, + { + "epoch": 4.86843333022301, + "grad_norm": NaN, + "learning_rate": 2.7182412834286703e-05, + "loss": 0.0, + "step": 52175 + }, + { + "epoch": 4.868526639917888, + "grad_norm": NaN, + "learning_rate": 2.7178070608009727e-05, + "loss": 0.0, + "step": 52176 + }, + { + "epoch": 4.868619949612765, + "grad_norm": NaN, + "learning_rate": 2.7173728694032553e-05, + "loss": 0.0, + "step": 52177 + }, + { + "epoch": 4.868713259307642, + "grad_norm": NaN, + "learning_rate": 2.716938709236619e-05, + "loss": 0.0, + "step": 52178 + }, + { + "epoch": 4.868806569002519, + "grad_norm": NaN, + "learning_rate": 2.7165045803021696e-05, + "loss": 0.0, + "step": 52179 + }, + { + "epoch": 4.8688998786973965, + "grad_norm": NaN, + "learning_rate": 2.716070482601011e-05, + "loss": 0.0, + "step": 52180 + }, + { + "epoch": 4.868993188392274, + "grad_norm": NaN, + "learning_rate": 2.7156364161342458e-05, + "loss": 0.0, + "step": 52181 + }, + { + "epoch": 4.869086498087151, + "grad_norm": NaN, + "learning_rate": 2.7152023809029784e-05, + "loss": 0.0, + "step": 52182 + }, + { + "epoch": 4.869179807782029, + "grad_norm": NaN, + "learning_rate": 2.7147683769083122e-05, + "loss": 0.0, + "step": 52183 + }, + { + "epoch": 4.869273117476906, + "grad_norm": NaN, + "learning_rate": 2.7143344041513503e-05, + "loss": 0.0, + "step": 52184 + }, + { + "epoch": 4.869366427171784, + "grad_norm": NaN, + "learning_rate": 2.7139004626331984e-05, + "loss": 0.0, + "step": 52185 + }, + { + "epoch": 4.86945973686666, + "grad_norm": NaN, + "learning_rate": 2.713466552354957e-05, + "loss": 0.0, + "step": 52186 + }, + { + "epoch": 4.8695530465615375, + "grad_norm": NaN, + "learning_rate": 2.7130326733177304e-05, + "loss": 0.0, + "step": 52187 + }, + { + "epoch": 4.869646356256415, + "grad_norm": NaN, + "learning_rate": 2.7125988255226228e-05, + "loss": 0.0, + "step": 52188 + }, + { + "epoch": 4.869739665951292, + "grad_norm": NaN, + "learning_rate": 2.7121650089707364e-05, + "loss": 0.0, + "step": 52189 + }, + { + "epoch": 4.86983297564617, + "grad_norm": NaN, + "learning_rate": 2.7117312236631737e-05, + "loss": 0.0, + "step": 52190 + }, + { + "epoch": 4.869926285341047, + "grad_norm": NaN, + "learning_rate": 2.7112974696010393e-05, + "loss": 0.0, + "step": 52191 + }, + { + "epoch": 4.870019595035924, + "grad_norm": NaN, + "learning_rate": 2.710863746785435e-05, + "loss": 0.0, + "step": 52192 + }, + { + "epoch": 4.870112904730801, + "grad_norm": NaN, + "learning_rate": 2.7104300552174636e-05, + "loss": 0.0, + "step": 52193 + }, + { + "epoch": 4.870206214425679, + "grad_norm": NaN, + "learning_rate": 2.709996394898229e-05, + "loss": 0.0, + "step": 52194 + }, + { + "epoch": 4.870299524120556, + "grad_norm": NaN, + "learning_rate": 2.709562765828832e-05, + "loss": 0.0, + "step": 52195 + }, + { + "epoch": 4.870392833815433, + "grad_norm": NaN, + "learning_rate": 2.709129168010377e-05, + "loss": 0.0, + "step": 52196 + }, + { + "epoch": 4.870486143510311, + "grad_norm": NaN, + "learning_rate": 2.7086956014439648e-05, + "loss": 0.0, + "step": 52197 + }, + { + "epoch": 4.870579453205188, + "grad_norm": NaN, + "learning_rate": 2.7082620661306993e-05, + "loss": 0.0, + "step": 52198 + }, + { + "epoch": 4.870672762900066, + "grad_norm": NaN, + "learning_rate": 2.7078285620716827e-05, + "loss": 0.0, + "step": 52199 + }, + { + "epoch": 4.870766072594943, + "grad_norm": NaN, + "learning_rate": 2.7073950892680165e-05, + "loss": 0.0, + "step": 52200 + }, + { + "epoch": 4.87085938228982, + "grad_norm": NaN, + "learning_rate": 2.706961647720802e-05, + "loss": 0.0, + "step": 52201 + }, + { + "epoch": 4.870952691984697, + "grad_norm": NaN, + "learning_rate": 2.7065282374311446e-05, + "loss": 0.0, + "step": 52202 + }, + { + "epoch": 4.8710460016795745, + "grad_norm": NaN, + "learning_rate": 2.706094858400143e-05, + "loss": 0.0, + "step": 52203 + }, + { + "epoch": 4.871139311374452, + "grad_norm": NaN, + "learning_rate": 2.705661510628901e-05, + "loss": 0.0, + "step": 52204 + }, + { + "epoch": 4.871232621069329, + "grad_norm": NaN, + "learning_rate": 2.7052281941185196e-05, + "loss": 0.0, + "step": 52205 + }, + { + "epoch": 4.871325930764207, + "grad_norm": NaN, + "learning_rate": 2.704794908870101e-05, + "loss": 0.0, + "step": 52206 + }, + { + "epoch": 4.871419240459083, + "grad_norm": NaN, + "learning_rate": 2.704361654884748e-05, + "loss": 0.0, + "step": 52207 + }, + { + "epoch": 4.871512550153961, + "grad_norm": NaN, + "learning_rate": 2.7039284321635595e-05, + "loss": 0.0, + "step": 52208 + }, + { + "epoch": 4.871605859848838, + "grad_norm": NaN, + "learning_rate": 2.7034952407076384e-05, + "loss": 0.0, + "step": 52209 + }, + { + "epoch": 4.8716991695437155, + "grad_norm": NaN, + "learning_rate": 2.703062080518088e-05, + "loss": 0.0, + "step": 52210 + }, + { + "epoch": 4.871792479238593, + "grad_norm": NaN, + "learning_rate": 2.7026289515960066e-05, + "loss": 0.0, + "step": 52211 + }, + { + "epoch": 4.87188578893347, + "grad_norm": NaN, + "learning_rate": 2.7021958539424975e-05, + "loss": 0.0, + "step": 52212 + }, + { + "epoch": 4.871979098628348, + "grad_norm": NaN, + "learning_rate": 2.7017627875586622e-05, + "loss": 0.0, + "step": 52213 + }, + { + "epoch": 4.872072408323225, + "grad_norm": NaN, + "learning_rate": 2.7013297524456014e-05, + "loss": 0.0, + "step": 52214 + }, + { + "epoch": 4.872165718018102, + "grad_norm": NaN, + "learning_rate": 2.7008967486044136e-05, + "loss": 0.0, + "step": 52215 + }, + { + "epoch": 4.872259027712979, + "grad_norm": NaN, + "learning_rate": 2.7004637760362036e-05, + "loss": 0.0, + "step": 52216 + }, + { + "epoch": 4.872352337407857, + "grad_norm": NaN, + "learning_rate": 2.700030834742072e-05, + "loss": 0.0, + "step": 52217 + }, + { + "epoch": 4.872445647102734, + "grad_norm": NaN, + "learning_rate": 2.699597924723116e-05, + "loss": 0.0, + "step": 52218 + }, + { + "epoch": 4.872538956797611, + "grad_norm": NaN, + "learning_rate": 2.6991650459804416e-05, + "loss": 0.0, + "step": 52219 + }, + { + "epoch": 4.872632266492489, + "grad_norm": NaN, + "learning_rate": 2.6987321985151443e-05, + "loss": 0.0, + "step": 52220 + }, + { + "epoch": 4.872725576187366, + "grad_norm": NaN, + "learning_rate": 2.698299382328327e-05, + "loss": 0.0, + "step": 52221 + }, + { + "epoch": 4.872818885882243, + "grad_norm": NaN, + "learning_rate": 2.6978665974210923e-05, + "loss": 0.0, + "step": 52222 + }, + { + "epoch": 4.87291219557712, + "grad_norm": NaN, + "learning_rate": 2.697433843794537e-05, + "loss": 0.0, + "step": 52223 + }, + { + "epoch": 4.873005505271998, + "grad_norm": NaN, + "learning_rate": 2.6970011214497637e-05, + "loss": 0.0, + "step": 52224 + }, + { + "epoch": 4.873098814966875, + "grad_norm": NaN, + "learning_rate": 2.6965684303878736e-05, + "loss": 0.0, + "step": 52225 + }, + { + "epoch": 4.8731921246617524, + "grad_norm": NaN, + "learning_rate": 2.696135770609964e-05, + "loss": 0.0, + "step": 52226 + }, + { + "epoch": 4.87328543435663, + "grad_norm": NaN, + "learning_rate": 2.6957031421171372e-05, + "loss": 0.0, + "step": 52227 + }, + { + "epoch": 4.873378744051507, + "grad_norm": NaN, + "learning_rate": 2.6952705449104906e-05, + "loss": 0.0, + "step": 52228 + }, + { + "epoch": 4.873472053746385, + "grad_norm": NaN, + "learning_rate": 2.6948379789911285e-05, + "loss": 0.0, + "step": 52229 + }, + { + "epoch": 4.873565363441261, + "grad_norm": NaN, + "learning_rate": 2.6944054443601464e-05, + "loss": 0.0, + "step": 52230 + }, + { + "epoch": 4.873658673136139, + "grad_norm": NaN, + "learning_rate": 2.6939729410186474e-05, + "loss": 0.0, + "step": 52231 + }, + { + "epoch": 4.873751982831016, + "grad_norm": NaN, + "learning_rate": 2.69354046896773e-05, + "loss": 0.0, + "step": 52232 + }, + { + "epoch": 4.8738452925258935, + "grad_norm": NaN, + "learning_rate": 2.6931080282084932e-05, + "loss": 0.0, + "step": 52233 + }, + { + "epoch": 4.873938602220771, + "grad_norm": NaN, + "learning_rate": 2.692675618742037e-05, + "loss": 0.0, + "step": 52234 + }, + { + "epoch": 4.874031911915648, + "grad_norm": NaN, + "learning_rate": 2.692243240569461e-05, + "loss": 0.0, + "step": 52235 + }, + { + "epoch": 4.874125221610525, + "grad_norm": NaN, + "learning_rate": 2.6918108936918638e-05, + "loss": 0.0, + "step": 52236 + }, + { + "epoch": 4.874218531305402, + "grad_norm": NaN, + "learning_rate": 2.691378578110347e-05, + "loss": 0.0, + "step": 52237 + }, + { + "epoch": 4.87431184100028, + "grad_norm": NaN, + "learning_rate": 2.690946293826007e-05, + "loss": 0.0, + "step": 52238 + }, + { + "epoch": 4.874405150695157, + "grad_norm": NaN, + "learning_rate": 2.6905140408399457e-05, + "loss": 0.0, + "step": 52239 + }, + { + "epoch": 4.8744984603900345, + "grad_norm": NaN, + "learning_rate": 2.690081819153261e-05, + "loss": 0.0, + "step": 52240 + }, + { + "epoch": 4.874591770084912, + "grad_norm": NaN, + "learning_rate": 2.68964962876705e-05, + "loss": 0.0, + "step": 52241 + }, + { + "epoch": 4.874685079779789, + "grad_norm": NaN, + "learning_rate": 2.6892174696824155e-05, + "loss": 0.0, + "step": 52242 + }, + { + "epoch": 4.874778389474667, + "grad_norm": NaN, + "learning_rate": 2.6887853419004528e-05, + "loss": 0.0, + "step": 52243 + }, + { + "epoch": 4.874871699169544, + "grad_norm": NaN, + "learning_rate": 2.6883532454222633e-05, + "loss": 0.0, + "step": 52244 + }, + { + "epoch": 4.874965008864421, + "grad_norm": NaN, + "learning_rate": 2.6879211802489424e-05, + "loss": 0.0, + "step": 52245 + }, + { + "epoch": 4.875058318559298, + "grad_norm": NaN, + "learning_rate": 2.6874891463815927e-05, + "loss": 0.0, + "step": 52246 + }, + { + "epoch": 4.875151628254176, + "grad_norm": NaN, + "learning_rate": 2.6870571438213102e-05, + "loss": 0.0, + "step": 52247 + }, + { + "epoch": 4.875244937949053, + "grad_norm": NaN, + "learning_rate": 2.686625172569195e-05, + "loss": 0.0, + "step": 52248 + }, + { + "epoch": 4.87533824764393, + "grad_norm": NaN, + "learning_rate": 2.6861932326263436e-05, + "loss": 0.0, + "step": 52249 + }, + { + "epoch": 4.875431557338808, + "grad_norm": NaN, + "learning_rate": 2.6857613239938546e-05, + "loss": 0.0, + "step": 52250 + }, + { + "epoch": 4.875524867033684, + "grad_norm": NaN, + "learning_rate": 2.6853294466728276e-05, + "loss": 0.0, + "step": 52251 + }, + { + "epoch": 4.875618176728562, + "grad_norm": NaN, + "learning_rate": 2.6848976006643595e-05, + "loss": 0.0, + "step": 52252 + }, + { + "epoch": 4.875711486423439, + "grad_norm": NaN, + "learning_rate": 2.68446578596955e-05, + "loss": 0.0, + "step": 52253 + }, + { + "epoch": 4.875804796118317, + "grad_norm": NaN, + "learning_rate": 2.6840340025894946e-05, + "loss": 0.0, + "step": 52254 + }, + { + "epoch": 4.875898105813194, + "grad_norm": NaN, + "learning_rate": 2.683602250525294e-05, + "loss": 0.0, + "step": 52255 + }, + { + "epoch": 4.8759914155080715, + "grad_norm": NaN, + "learning_rate": 2.6831705297780426e-05, + "loss": 0.0, + "step": 52256 + }, + { + "epoch": 4.876084725202949, + "grad_norm": NaN, + "learning_rate": 2.682738840348841e-05, + "loss": 0.0, + "step": 52257 + }, + { + "epoch": 4.876178034897826, + "grad_norm": NaN, + "learning_rate": 2.6823071822387864e-05, + "loss": 0.0, + "step": 52258 + }, + { + "epoch": 4.876271344592703, + "grad_norm": NaN, + "learning_rate": 2.681875555448975e-05, + "loss": 0.0, + "step": 52259 + }, + { + "epoch": 4.87636465428758, + "grad_norm": NaN, + "learning_rate": 2.6814439599805042e-05, + "loss": 0.0, + "step": 52260 + }, + { + "epoch": 4.876457963982458, + "grad_norm": NaN, + "learning_rate": 2.6810123958344748e-05, + "loss": 0.0, + "step": 52261 + }, + { + "epoch": 4.876551273677335, + "grad_norm": NaN, + "learning_rate": 2.6805808630119792e-05, + "loss": 0.0, + "step": 52262 + }, + { + "epoch": 4.8766445833722125, + "grad_norm": NaN, + "learning_rate": 2.6801493615141196e-05, + "loss": 0.0, + "step": 52263 + }, + { + "epoch": 4.87673789306709, + "grad_norm": NaN, + "learning_rate": 2.6797178913419888e-05, + "loss": 0.0, + "step": 52264 + }, + { + "epoch": 4.876831202761967, + "grad_norm": NaN, + "learning_rate": 2.6792864524966874e-05, + "loss": 0.0, + "step": 52265 + }, + { + "epoch": 4.876924512456844, + "grad_norm": NaN, + "learning_rate": 2.6788550449793095e-05, + "loss": 0.0, + "step": 52266 + }, + { + "epoch": 4.877017822151721, + "grad_norm": NaN, + "learning_rate": 2.6784236687909543e-05, + "loss": 0.0, + "step": 52267 + }, + { + "epoch": 4.877111131846599, + "grad_norm": NaN, + "learning_rate": 2.6779923239327178e-05, + "loss": 0.0, + "step": 52268 + }, + { + "epoch": 4.877204441541476, + "grad_norm": NaN, + "learning_rate": 2.677561010405697e-05, + "loss": 0.0, + "step": 52269 + }, + { + "epoch": 4.877297751236354, + "grad_norm": NaN, + "learning_rate": 2.6771297282109867e-05, + "loss": 0.0, + "step": 52270 + }, + { + "epoch": 4.877391060931231, + "grad_norm": NaN, + "learning_rate": 2.676698477349687e-05, + "loss": 0.0, + "step": 52271 + }, + { + "epoch": 4.877484370626108, + "grad_norm": NaN, + "learning_rate": 2.6762672578228922e-05, + "loss": 0.0, + "step": 52272 + }, + { + "epoch": 4.877577680320986, + "grad_norm": NaN, + "learning_rate": 2.6758360696316982e-05, + "loss": 0.0, + "step": 52273 + }, + { + "epoch": 4.877670990015862, + "grad_norm": NaN, + "learning_rate": 2.6754049127772043e-05, + "loss": 0.0, + "step": 52274 + }, + { + "epoch": 4.87776429971074, + "grad_norm": NaN, + "learning_rate": 2.674973787260504e-05, + "loss": 0.0, + "step": 52275 + }, + { + "epoch": 4.877857609405617, + "grad_norm": NaN, + "learning_rate": 2.674542693082694e-05, + "loss": 0.0, + "step": 52276 + }, + { + "epoch": 4.877950919100495, + "grad_norm": NaN, + "learning_rate": 2.674111630244871e-05, + "loss": 0.0, + "step": 52277 + }, + { + "epoch": 4.878044228795372, + "grad_norm": NaN, + "learning_rate": 2.6736805987481315e-05, + "loss": 0.0, + "step": 52278 + }, + { + "epoch": 4.8781375384902494, + "grad_norm": NaN, + "learning_rate": 2.6732495985935703e-05, + "loss": 0.0, + "step": 52279 + }, + { + "epoch": 4.878230848185126, + "grad_norm": NaN, + "learning_rate": 2.672818629782284e-05, + "loss": 0.0, + "step": 52280 + }, + { + "epoch": 4.878324157880003, + "grad_norm": NaN, + "learning_rate": 2.6723876923153682e-05, + "loss": 0.0, + "step": 52281 + }, + { + "epoch": 4.878417467574881, + "grad_norm": NaN, + "learning_rate": 2.6719567861939183e-05, + "loss": 0.0, + "step": 52282 + }, + { + "epoch": 4.878510777269758, + "grad_norm": NaN, + "learning_rate": 2.6715259114190324e-05, + "loss": 0.0, + "step": 52283 + }, + { + "epoch": 4.878604086964636, + "grad_norm": NaN, + "learning_rate": 2.6710950679918025e-05, + "loss": 0.0, + "step": 52284 + }, + { + "epoch": 4.878697396659513, + "grad_norm": NaN, + "learning_rate": 2.6706642559133263e-05, + "loss": 0.0, + "step": 52285 + }, + { + "epoch": 4.8787907063543905, + "grad_norm": NaN, + "learning_rate": 2.6702334751846976e-05, + "loss": 0.0, + "step": 52286 + }, + { + "epoch": 4.878884016049268, + "grad_norm": NaN, + "learning_rate": 2.6698027258070144e-05, + "loss": 0.0, + "step": 52287 + }, + { + "epoch": 4.8789773257441444, + "grad_norm": NaN, + "learning_rate": 2.66937200778137e-05, + "loss": 0.0, + "step": 52288 + }, + { + "epoch": 4.879070635439022, + "grad_norm": NaN, + "learning_rate": 2.6689413211088594e-05, + "loss": 0.0, + "step": 52289 + }, + { + "epoch": 4.879163945133899, + "grad_norm": NaN, + "learning_rate": 2.668510665790578e-05, + "loss": 0.0, + "step": 52290 + }, + { + "epoch": 4.879257254828777, + "grad_norm": NaN, + "learning_rate": 2.6680800418276234e-05, + "loss": 0.0, + "step": 52291 + }, + { + "epoch": 4.879350564523654, + "grad_norm": NaN, + "learning_rate": 2.667649449221086e-05, + "loss": 0.0, + "step": 52292 + }, + { + "epoch": 4.8794438742185315, + "grad_norm": NaN, + "learning_rate": 2.6672188879720635e-05, + "loss": 0.0, + "step": 52293 + }, + { + "epoch": 4.879537183913409, + "grad_norm": NaN, + "learning_rate": 2.6667883580816502e-05, + "loss": 0.0, + "step": 52294 + }, + { + "epoch": 4.8796304936082855, + "grad_norm": NaN, + "learning_rate": 2.66635785955094e-05, + "loss": 0.0, + "step": 52295 + }, + { + "epoch": 4.879723803303163, + "grad_norm": NaN, + "learning_rate": 2.6659273923810287e-05, + "loss": 0.0, + "step": 52296 + }, + { + "epoch": 4.87981711299804, + "grad_norm": NaN, + "learning_rate": 2.665496956573012e-05, + "loss": 0.0, + "step": 52297 + }, + { + "epoch": 4.879910422692918, + "grad_norm": NaN, + "learning_rate": 2.6650665521279813e-05, + "loss": 0.0, + "step": 52298 + }, + { + "epoch": 4.880003732387795, + "grad_norm": NaN, + "learning_rate": 2.6646361790470316e-05, + "loss": 0.0, + "step": 52299 + }, + { + "epoch": 4.880097042082673, + "grad_norm": NaN, + "learning_rate": 2.664205837331261e-05, + "loss": 0.0, + "step": 52300 + }, + { + "epoch": 4.88019035177755, + "grad_norm": NaN, + "learning_rate": 2.663775526981758e-05, + "loss": 0.0, + "step": 52301 + }, + { + "epoch": 4.880283661472427, + "grad_norm": NaN, + "learning_rate": 2.6633452479996204e-05, + "loss": 0.0, + "step": 52302 + }, + { + "epoch": 4.880376971167304, + "grad_norm": NaN, + "learning_rate": 2.6629150003859423e-05, + "loss": 0.0, + "step": 52303 + }, + { + "epoch": 4.880470280862181, + "grad_norm": NaN, + "learning_rate": 2.662484784141816e-05, + "loss": 0.0, + "step": 52304 + }, + { + "epoch": 4.880563590557059, + "grad_norm": NaN, + "learning_rate": 2.6620545992683363e-05, + "loss": 0.0, + "step": 52305 + }, + { + "epoch": 4.880656900251936, + "grad_norm": NaN, + "learning_rate": 2.6616244457665964e-05, + "loss": 0.0, + "step": 52306 + }, + { + "epoch": 4.880750209946814, + "grad_norm": NaN, + "learning_rate": 2.6611943236376927e-05, + "loss": 0.0, + "step": 52307 + }, + { + "epoch": 4.880843519641691, + "grad_norm": NaN, + "learning_rate": 2.660764232882714e-05, + "loss": 0.0, + "step": 52308 + }, + { + "epoch": 4.880936829336568, + "grad_norm": NaN, + "learning_rate": 2.6603341735027594e-05, + "loss": 0.0, + "step": 52309 + }, + { + "epoch": 4.881030139031445, + "grad_norm": NaN, + "learning_rate": 2.6599041454989185e-05, + "loss": 0.0, + "step": 52310 + }, + { + "epoch": 4.881123448726322, + "grad_norm": NaN, + "learning_rate": 2.659474148872285e-05, + "loss": 0.0, + "step": 52311 + }, + { + "epoch": 4.8812167584212, + "grad_norm": NaN, + "learning_rate": 2.6590441836239546e-05, + "loss": 0.0, + "step": 52312 + }, + { + "epoch": 4.881310068116077, + "grad_norm": NaN, + "learning_rate": 2.6586142497550183e-05, + "loss": 0.0, + "step": 52313 + }, + { + "epoch": 4.881403377810955, + "grad_norm": NaN, + "learning_rate": 2.6581843472665704e-05, + "loss": 0.0, + "step": 52314 + }, + { + "epoch": 4.881496687505832, + "grad_norm": NaN, + "learning_rate": 2.6577544761597048e-05, + "loss": 0.0, + "step": 52315 + }, + { + "epoch": 4.8815899972007095, + "grad_norm": NaN, + "learning_rate": 2.6573246364355123e-05, + "loss": 0.0, + "step": 52316 + }, + { + "epoch": 4.881683306895587, + "grad_norm": NaN, + "learning_rate": 2.6568948280950868e-05, + "loss": 0.0, + "step": 52317 + }, + { + "epoch": 4.8817766165904635, + "grad_norm": NaN, + "learning_rate": 2.656465051139523e-05, + "loss": 0.0, + "step": 52318 + }, + { + "epoch": 4.881869926285341, + "grad_norm": NaN, + "learning_rate": 2.6560353055699113e-05, + "loss": 0.0, + "step": 52319 + }, + { + "epoch": 4.881963235980218, + "grad_norm": NaN, + "learning_rate": 2.6556055913873457e-05, + "loss": 0.0, + "step": 52320 + }, + { + "epoch": 4.882056545675096, + "grad_norm": NaN, + "learning_rate": 2.6551759085929175e-05, + "loss": 0.0, + "step": 52321 + }, + { + "epoch": 4.882149855369973, + "grad_norm": NaN, + "learning_rate": 2.6547462571877204e-05, + "loss": 0.0, + "step": 52322 + }, + { + "epoch": 4.882243165064851, + "grad_norm": NaN, + "learning_rate": 2.654316637172847e-05, + "loss": 0.0, + "step": 52323 + }, + { + "epoch": 4.882336474759727, + "grad_norm": NaN, + "learning_rate": 2.6538870485493897e-05, + "loss": 0.0, + "step": 52324 + }, + { + "epoch": 4.8824297844546045, + "grad_norm": NaN, + "learning_rate": 2.653457491318439e-05, + "loss": 0.0, + "step": 52325 + }, + { + "epoch": 4.882523094149482, + "grad_norm": NaN, + "learning_rate": 2.65302796548109e-05, + "loss": 0.0, + "step": 52326 + }, + { + "epoch": 4.882616403844359, + "grad_norm": NaN, + "learning_rate": 2.6525984710384323e-05, + "loss": 0.0, + "step": 52327 + }, + { + "epoch": 4.882709713539237, + "grad_norm": NaN, + "learning_rate": 2.6521690079915607e-05, + "loss": 0.0, + "step": 52328 + }, + { + "epoch": 4.882803023234114, + "grad_norm": NaN, + "learning_rate": 2.6517395763415645e-05, + "loss": 0.0, + "step": 52329 + }, + { + "epoch": 4.882896332928992, + "grad_norm": NaN, + "learning_rate": 2.6513101760895373e-05, + "loss": 0.0, + "step": 52330 + }, + { + "epoch": 4.882989642623869, + "grad_norm": NaN, + "learning_rate": 2.65088080723657e-05, + "loss": 0.0, + "step": 52331 + }, + { + "epoch": 4.883082952318746, + "grad_norm": NaN, + "learning_rate": 2.6504514697837538e-05, + "loss": 0.0, + "step": 52332 + }, + { + "epoch": 4.883176262013623, + "grad_norm": NaN, + "learning_rate": 2.6500221637321787e-05, + "loss": 0.0, + "step": 52333 + }, + { + "epoch": 4.8832695717085, + "grad_norm": NaN, + "learning_rate": 2.6495928890829465e-05, + "loss": 0.0, + "step": 52334 + }, + { + "epoch": 4.883362881403378, + "grad_norm": NaN, + "learning_rate": 2.6491636458371372e-05, + "loss": 0.0, + "step": 52335 + }, + { + "epoch": 4.883456191098255, + "grad_norm": NaN, + "learning_rate": 2.648734433995842e-05, + "loss": 0.0, + "step": 52336 + }, + { + "epoch": 4.883549500793133, + "grad_norm": NaN, + "learning_rate": 2.6483052535601636e-05, + "loss": 0.0, + "step": 52337 + }, + { + "epoch": 4.88364281048801, + "grad_norm": NaN, + "learning_rate": 2.6478761045311836e-05, + "loss": 0.0, + "step": 52338 + }, + { + "epoch": 4.883736120182887, + "grad_norm": NaN, + "learning_rate": 2.64744698690999e-05, + "loss": 0.0, + "step": 52339 + }, + { + "epoch": 4.883829429877764, + "grad_norm": NaN, + "learning_rate": 2.6470179006976887e-05, + "loss": 0.0, + "step": 52340 + }, + { + "epoch": 4.8839227395726414, + "grad_norm": NaN, + "learning_rate": 2.646588845895359e-05, + "loss": 0.0, + "step": 52341 + }, + { + "epoch": 4.884016049267519, + "grad_norm": NaN, + "learning_rate": 2.6461598225040925e-05, + "loss": 0.0, + "step": 52342 + }, + { + "epoch": 4.884109358962396, + "grad_norm": NaN, + "learning_rate": 2.6457308305249824e-05, + "loss": 0.0, + "step": 52343 + }, + { + "epoch": 4.884202668657274, + "grad_norm": NaN, + "learning_rate": 2.6453018699591194e-05, + "loss": 0.0, + "step": 52344 + }, + { + "epoch": 4.884295978352151, + "grad_norm": NaN, + "learning_rate": 2.6448729408075943e-05, + "loss": 0.0, + "step": 52345 + }, + { + "epoch": 4.8843892880470285, + "grad_norm": NaN, + "learning_rate": 2.6444440430714976e-05, + "loss": 0.0, + "step": 52346 + }, + { + "epoch": 4.884482597741905, + "grad_norm": NaN, + "learning_rate": 2.6440151767519187e-05, + "loss": 0.0, + "step": 52347 + }, + { + "epoch": 4.8845759074367825, + "grad_norm": NaN, + "learning_rate": 2.64358634184995e-05, + "loss": 0.0, + "step": 52348 + }, + { + "epoch": 4.88466921713166, + "grad_norm": NaN, + "learning_rate": 2.6431575383666803e-05, + "loss": 0.0, + "step": 52349 + }, + { + "epoch": 4.884762526826537, + "grad_norm": NaN, + "learning_rate": 2.642728766303201e-05, + "loss": 0.0, + "step": 52350 + }, + { + "epoch": 4.884855836521415, + "grad_norm": NaN, + "learning_rate": 2.6423000256606023e-05, + "loss": 0.0, + "step": 52351 + }, + { + "epoch": 4.884949146216292, + "grad_norm": NaN, + "learning_rate": 2.641871316439974e-05, + "loss": 0.0, + "step": 52352 + }, + { + "epoch": 4.885042455911169, + "grad_norm": NaN, + "learning_rate": 2.6414426386424064e-05, + "loss": 0.0, + "step": 52353 + }, + { + "epoch": 4.885135765606046, + "grad_norm": NaN, + "learning_rate": 2.641013992268989e-05, + "loss": 0.0, + "step": 52354 + }, + { + "epoch": 4.8852290753009235, + "grad_norm": NaN, + "learning_rate": 2.640585377320812e-05, + "loss": 0.0, + "step": 52355 + }, + { + "epoch": 4.885322384995801, + "grad_norm": NaN, + "learning_rate": 2.6401567937989647e-05, + "loss": 0.0, + "step": 52356 + }, + { + "epoch": 4.885415694690678, + "grad_norm": NaN, + "learning_rate": 2.639728241704537e-05, + "loss": 0.0, + "step": 52357 + }, + { + "epoch": 4.885509004385556, + "grad_norm": NaN, + "learning_rate": 2.63929972103862e-05, + "loss": 0.0, + "step": 52358 + }, + { + "epoch": 4.885602314080433, + "grad_norm": NaN, + "learning_rate": 2.6388712318023026e-05, + "loss": 0.0, + "step": 52359 + }, + { + "epoch": 4.885695623775311, + "grad_norm": NaN, + "learning_rate": 2.638442773996673e-05, + "loss": 0.0, + "step": 52360 + }, + { + "epoch": 4.885788933470187, + "grad_norm": NaN, + "learning_rate": 2.638014347622822e-05, + "loss": 0.0, + "step": 52361 + }, + { + "epoch": 4.885882243165065, + "grad_norm": NaN, + "learning_rate": 2.6375859526818393e-05, + "loss": 0.0, + "step": 52362 + }, + { + "epoch": 4.885975552859942, + "grad_norm": NaN, + "learning_rate": 2.6371575891748137e-05, + "loss": 0.0, + "step": 52363 + }, + { + "epoch": 4.886068862554819, + "grad_norm": NaN, + "learning_rate": 2.6367292571028324e-05, + "loss": 0.0, + "step": 52364 + }, + { + "epoch": 4.886162172249697, + "grad_norm": NaN, + "learning_rate": 2.6363009564669886e-05, + "loss": 0.0, + "step": 52365 + }, + { + "epoch": 4.886255481944574, + "grad_norm": NaN, + "learning_rate": 2.6358726872683673e-05, + "loss": 0.0, + "step": 52366 + }, + { + "epoch": 4.886348791639452, + "grad_norm": NaN, + "learning_rate": 2.6354444495080597e-05, + "loss": 0.0, + "step": 52367 + }, + { + "epoch": 4.886442101334328, + "grad_norm": NaN, + "learning_rate": 2.6350162431871512e-05, + "loss": 0.0, + "step": 52368 + }, + { + "epoch": 4.886535411029206, + "grad_norm": NaN, + "learning_rate": 2.63458806830674e-05, + "loss": 0.0, + "step": 52369 + }, + { + "epoch": 4.886628720724083, + "grad_norm": NaN, + "learning_rate": 2.6341599248679064e-05, + "loss": 0.0, + "step": 52370 + }, + { + "epoch": 4.8867220304189605, + "grad_norm": NaN, + "learning_rate": 2.633731812871736e-05, + "loss": 0.0, + "step": 52371 + }, + { + "epoch": 4.886815340113838, + "grad_norm": NaN, + "learning_rate": 2.63330373231933e-05, + "loss": 0.0, + "step": 52372 + }, + { + "epoch": 4.886908649808715, + "grad_norm": NaN, + "learning_rate": 2.6328756832117643e-05, + "loss": 0.0, + "step": 52373 + }, + { + "epoch": 4.887001959503593, + "grad_norm": NaN, + "learning_rate": 2.6324476655501308e-05, + "loss": 0.0, + "step": 52374 + }, + { + "epoch": 4.88709526919847, + "grad_norm": NaN, + "learning_rate": 2.6320196793355258e-05, + "loss": 0.0, + "step": 52375 + }, + { + "epoch": 4.887188578893347, + "grad_norm": NaN, + "learning_rate": 2.631591724569028e-05, + "loss": 0.0, + "step": 52376 + }, + { + "epoch": 4.887281888588224, + "grad_norm": NaN, + "learning_rate": 2.6311638012517237e-05, + "loss": 0.0, + "step": 52377 + }, + { + "epoch": 4.8873751982831015, + "grad_norm": NaN, + "learning_rate": 2.6307359093847154e-05, + "loss": 0.0, + "step": 52378 + }, + { + "epoch": 4.887468507977979, + "grad_norm": NaN, + "learning_rate": 2.6303080489690764e-05, + "loss": 0.0, + "step": 52379 + }, + { + "epoch": 4.887561817672856, + "grad_norm": NaN, + "learning_rate": 2.629880220005895e-05, + "loss": 0.0, + "step": 52380 + }, + { + "epoch": 4.887655127367734, + "grad_norm": NaN, + "learning_rate": 2.629452422496273e-05, + "loss": 0.0, + "step": 52381 + }, + { + "epoch": 4.88774843706261, + "grad_norm": NaN, + "learning_rate": 2.6290246564412853e-05, + "loss": 0.0, + "step": 52382 + }, + { + "epoch": 4.887841746757488, + "grad_norm": NaN, + "learning_rate": 2.628596921842019e-05, + "loss": 0.0, + "step": 52383 + }, + { + "epoch": 4.887935056452365, + "grad_norm": NaN, + "learning_rate": 2.6281692186995745e-05, + "loss": 0.0, + "step": 52384 + }, + { + "epoch": 4.888028366147243, + "grad_norm": NaN, + "learning_rate": 2.6277415470150263e-05, + "loss": 0.0, + "step": 52385 + }, + { + "epoch": 4.88812167584212, + "grad_norm": NaN, + "learning_rate": 2.6273139067894672e-05, + "loss": 0.0, + "step": 52386 + }, + { + "epoch": 4.888214985536997, + "grad_norm": NaN, + "learning_rate": 2.6268862980239823e-05, + "loss": 0.0, + "step": 52387 + }, + { + "epoch": 4.888308295231875, + "grad_norm": NaN, + "learning_rate": 2.6264587207196612e-05, + "loss": 0.0, + "step": 52388 + }, + { + "epoch": 4.888401604926752, + "grad_norm": NaN, + "learning_rate": 2.6260311748775912e-05, + "loss": 0.0, + "step": 52389 + }, + { + "epoch": 4.88849491462163, + "grad_norm": NaN, + "learning_rate": 2.6256036604988582e-05, + "loss": 0.0, + "step": 52390 + }, + { + "epoch": 4.888588224316506, + "grad_norm": NaN, + "learning_rate": 2.625176177584548e-05, + "loss": 0.0, + "step": 52391 + }, + { + "epoch": 4.888681534011384, + "grad_norm": NaN, + "learning_rate": 2.6247487261357498e-05, + "loss": 0.0, + "step": 52392 + }, + { + "epoch": 4.888774843706261, + "grad_norm": NaN, + "learning_rate": 2.6243213061535506e-05, + "loss": 0.0, + "step": 52393 + }, + { + "epoch": 4.8888681534011384, + "grad_norm": NaN, + "learning_rate": 2.623893917639035e-05, + "loss": 0.0, + "step": 52394 + }, + { + "epoch": 4.888961463096016, + "grad_norm": NaN, + "learning_rate": 2.623466560593292e-05, + "loss": 0.0, + "step": 52395 + }, + { + "epoch": 4.889054772790893, + "grad_norm": NaN, + "learning_rate": 2.6230392350174073e-05, + "loss": 0.0, + "step": 52396 + }, + { + "epoch": 4.88914808248577, + "grad_norm": NaN, + "learning_rate": 2.6226119409124684e-05, + "loss": 0.0, + "step": 52397 + }, + { + "epoch": 4.889241392180647, + "grad_norm": NaN, + "learning_rate": 2.6221846782795598e-05, + "loss": 0.0, + "step": 52398 + }, + { + "epoch": 4.889334701875525, + "grad_norm": NaN, + "learning_rate": 2.6217574471197683e-05, + "loss": 0.0, + "step": 52399 + }, + { + "epoch": 4.889428011570402, + "grad_norm": NaN, + "learning_rate": 2.621330247434182e-05, + "loss": 0.0, + "step": 52400 + }, + { + "epoch": 4.8895213212652795, + "grad_norm": NaN, + "learning_rate": 2.620903079223886e-05, + "loss": 0.0, + "step": 52401 + }, + { + "epoch": 4.889614630960157, + "grad_norm": NaN, + "learning_rate": 2.620475942489967e-05, + "loss": 0.0, + "step": 52402 + }, + { + "epoch": 4.889707940655034, + "grad_norm": NaN, + "learning_rate": 2.6200488372335082e-05, + "loss": 0.0, + "step": 52403 + }, + { + "epoch": 4.889801250349912, + "grad_norm": NaN, + "learning_rate": 2.619621763455601e-05, + "loss": 0.0, + "step": 52404 + }, + { + "epoch": 4.889894560044788, + "grad_norm": NaN, + "learning_rate": 2.619194721157326e-05, + "loss": 0.0, + "step": 52405 + }, + { + "epoch": 4.889987869739666, + "grad_norm": NaN, + "learning_rate": 2.6187677103397692e-05, + "loss": 0.0, + "step": 52406 + }, + { + "epoch": 4.890081179434543, + "grad_norm": NaN, + "learning_rate": 2.6183407310040245e-05, + "loss": 0.0, + "step": 52407 + }, + { + "epoch": 4.8901744891294205, + "grad_norm": NaN, + "learning_rate": 2.617913783151169e-05, + "loss": 0.0, + "step": 52408 + }, + { + "epoch": 4.890267798824298, + "grad_norm": NaN, + "learning_rate": 2.617486866782286e-05, + "loss": 0.0, + "step": 52409 + }, + { + "epoch": 4.890361108519175, + "grad_norm": NaN, + "learning_rate": 2.617059981898476e-05, + "loss": 0.0, + "step": 52410 + }, + { + "epoch": 4.890454418214053, + "grad_norm": NaN, + "learning_rate": 2.616633128500808e-05, + "loss": 0.0, + "step": 52411 + }, + { + "epoch": 4.890547727908929, + "grad_norm": NaN, + "learning_rate": 2.616206306590371e-05, + "loss": 0.0, + "step": 52412 + }, + { + "epoch": 4.890641037603807, + "grad_norm": NaN, + "learning_rate": 2.615779516168261e-05, + "loss": 0.0, + "step": 52413 + }, + { + "epoch": 4.890734347298684, + "grad_norm": NaN, + "learning_rate": 2.6153527572355504e-05, + "loss": 0.0, + "step": 52414 + }, + { + "epoch": 4.890827656993562, + "grad_norm": NaN, + "learning_rate": 2.6149260297933266e-05, + "loss": 0.0, + "step": 52415 + }, + { + "epoch": 4.890920966688439, + "grad_norm": NaN, + "learning_rate": 2.614499333842684e-05, + "loss": 0.0, + "step": 52416 + }, + { + "epoch": 4.891014276383316, + "grad_norm": NaN, + "learning_rate": 2.6140726693846964e-05, + "loss": 0.0, + "step": 52417 + }, + { + "epoch": 4.891107586078194, + "grad_norm": NaN, + "learning_rate": 2.61364603642045e-05, + "loss": 0.0, + "step": 52418 + }, + { + "epoch": 4.891200895773071, + "grad_norm": NaN, + "learning_rate": 2.61321943495104e-05, + "loss": 0.0, + "step": 52419 + }, + { + "epoch": 4.891294205467948, + "grad_norm": NaN, + "learning_rate": 2.6127928649775394e-05, + "loss": 0.0, + "step": 52420 + }, + { + "epoch": 4.891387515162825, + "grad_norm": NaN, + "learning_rate": 2.6123663265010335e-05, + "loss": 0.0, + "step": 52421 + }, + { + "epoch": 4.891480824857703, + "grad_norm": NaN, + "learning_rate": 2.6119398195226187e-05, + "loss": 0.0, + "step": 52422 + }, + { + "epoch": 4.89157413455258, + "grad_norm": NaN, + "learning_rate": 2.6115133440433673e-05, + "loss": 0.0, + "step": 52423 + }, + { + "epoch": 4.8916674442474575, + "grad_norm": NaN, + "learning_rate": 2.6110869000643646e-05, + "loss": 0.0, + "step": 52424 + }, + { + "epoch": 4.891760753942335, + "grad_norm": NaN, + "learning_rate": 2.6106604875867053e-05, + "loss": 0.0, + "step": 52425 + }, + { + "epoch": 4.891854063637211, + "grad_norm": NaN, + "learning_rate": 2.610234106611462e-05, + "loss": 0.0, + "step": 52426 + }, + { + "epoch": 4.891947373332089, + "grad_norm": NaN, + "learning_rate": 2.60980775713972e-05, + "loss": 0.0, + "step": 52427 + }, + { + "epoch": 4.892040683026966, + "grad_norm": NaN, + "learning_rate": 2.609381439172573e-05, + "loss": 0.0, + "step": 52428 + }, + { + "epoch": 4.892133992721844, + "grad_norm": NaN, + "learning_rate": 2.6089551527110962e-05, + "loss": 0.0, + "step": 52429 + }, + { + "epoch": 4.892227302416721, + "grad_norm": NaN, + "learning_rate": 2.608528897756375e-05, + "loss": 0.0, + "step": 52430 + }, + { + "epoch": 4.8923206121115985, + "grad_norm": NaN, + "learning_rate": 2.608102674309495e-05, + "loss": 0.0, + "step": 52431 + }, + { + "epoch": 4.892413921806476, + "grad_norm": NaN, + "learning_rate": 2.607676482371539e-05, + "loss": 0.0, + "step": 52432 + }, + { + "epoch": 4.892507231501353, + "grad_norm": NaN, + "learning_rate": 2.607250321943591e-05, + "loss": 0.0, + "step": 52433 + }, + { + "epoch": 4.892600541196231, + "grad_norm": NaN, + "learning_rate": 2.606824193026735e-05, + "loss": 0.0, + "step": 52434 + }, + { + "epoch": 4.892693850891107, + "grad_norm": NaN, + "learning_rate": 2.606398095622053e-05, + "loss": 0.0, + "step": 52435 + }, + { + "epoch": 4.892787160585985, + "grad_norm": NaN, + "learning_rate": 2.6059720297306286e-05, + "loss": 0.0, + "step": 52436 + }, + { + "epoch": 4.892880470280862, + "grad_norm": NaN, + "learning_rate": 2.6055459953535468e-05, + "loss": 0.0, + "step": 52437 + }, + { + "epoch": 4.89297377997574, + "grad_norm": NaN, + "learning_rate": 2.6051199924918904e-05, + "loss": 0.0, + "step": 52438 + }, + { + "epoch": 4.893067089670617, + "grad_norm": NaN, + "learning_rate": 2.6046940211467416e-05, + "loss": 0.0, + "step": 52439 + }, + { + "epoch": 4.893160399365494, + "grad_norm": NaN, + "learning_rate": 2.6042680813191846e-05, + "loss": 0.0, + "step": 52440 + }, + { + "epoch": 4.893253709060371, + "grad_norm": NaN, + "learning_rate": 2.6038421730103022e-05, + "loss": 0.0, + "step": 52441 + }, + { + "epoch": 4.893347018755248, + "grad_norm": NaN, + "learning_rate": 2.6034162962211764e-05, + "loss": 0.0, + "step": 52442 + }, + { + "epoch": 4.893440328450126, + "grad_norm": NaN, + "learning_rate": 2.6029904509528916e-05, + "loss": 0.0, + "step": 52443 + }, + { + "epoch": 4.893533638145003, + "grad_norm": NaN, + "learning_rate": 2.6025646372065252e-05, + "loss": 0.0, + "step": 52444 + }, + { + "epoch": 4.893626947839881, + "grad_norm": NaN, + "learning_rate": 2.6021388549831727e-05, + "loss": 0.0, + "step": 52445 + }, + { + "epoch": 4.893720257534758, + "grad_norm": NaN, + "learning_rate": 2.601713104283904e-05, + "loss": 0.0, + "step": 52446 + }, + { + "epoch": 4.8938135672296355, + "grad_norm": NaN, + "learning_rate": 2.6012873851098044e-05, + "loss": 0.0, + "step": 52447 + }, + { + "epoch": 4.893906876924513, + "grad_norm": NaN, + "learning_rate": 2.6008616974619644e-05, + "loss": 0.0, + "step": 52448 + }, + { + "epoch": 4.894000186619389, + "grad_norm": NaN, + "learning_rate": 2.6004360413414554e-05, + "loss": 0.0, + "step": 52449 + }, + { + "epoch": 4.894093496314267, + "grad_norm": NaN, + "learning_rate": 2.600010416749363e-05, + "loss": 0.0, + "step": 52450 + }, + { + "epoch": 4.894186806009144, + "grad_norm": NaN, + "learning_rate": 2.599584823686776e-05, + "loss": 0.0, + "step": 52451 + }, + { + "epoch": 4.894280115704022, + "grad_norm": NaN, + "learning_rate": 2.599159262154769e-05, + "loss": 0.0, + "step": 52452 + }, + { + "epoch": 4.894373425398899, + "grad_norm": NaN, + "learning_rate": 2.5987337321544243e-05, + "loss": 0.0, + "step": 52453 + }, + { + "epoch": 4.8944667350937765, + "grad_norm": NaN, + "learning_rate": 2.598308233686831e-05, + "loss": 0.0, + "step": 52454 + }, + { + "epoch": 4.894560044788654, + "grad_norm": NaN, + "learning_rate": 2.597882766753065e-05, + "loss": 0.0, + "step": 52455 + }, + { + "epoch": 4.8946533544835304, + "grad_norm": NaN, + "learning_rate": 2.5974573313542035e-05, + "loss": 0.0, + "step": 52456 + }, + { + "epoch": 4.894746664178408, + "grad_norm": NaN, + "learning_rate": 2.5970319274913426e-05, + "loss": 0.0, + "step": 52457 + }, + { + "epoch": 4.894839973873285, + "grad_norm": NaN, + "learning_rate": 2.5966065551655512e-05, + "loss": 0.0, + "step": 52458 + }, + { + "epoch": 4.894933283568163, + "grad_norm": NaN, + "learning_rate": 2.5961812143779122e-05, + "loss": 0.0, + "step": 52459 + }, + { + "epoch": 4.89502659326304, + "grad_norm": NaN, + "learning_rate": 2.5957559051295175e-05, + "loss": 0.0, + "step": 52460 + }, + { + "epoch": 4.8951199029579175, + "grad_norm": NaN, + "learning_rate": 2.5953306274214364e-05, + "loss": 0.0, + "step": 52461 + }, + { + "epoch": 4.895213212652795, + "grad_norm": NaN, + "learning_rate": 2.5949053812547515e-05, + "loss": 0.0, + "step": 52462 + }, + { + "epoch": 4.895306522347672, + "grad_norm": NaN, + "learning_rate": 2.594480166630557e-05, + "loss": 0.0, + "step": 52463 + }, + { + "epoch": 4.895399832042549, + "grad_norm": NaN, + "learning_rate": 2.5940549835499185e-05, + "loss": 0.0, + "step": 52464 + }, + { + "epoch": 4.895493141737426, + "grad_norm": NaN, + "learning_rate": 2.5936298320139205e-05, + "loss": 0.0, + "step": 52465 + }, + { + "epoch": 4.895586451432304, + "grad_norm": NaN, + "learning_rate": 2.5932047120236547e-05, + "loss": 0.0, + "step": 52466 + }, + { + "epoch": 4.895679761127181, + "grad_norm": NaN, + "learning_rate": 2.5927796235801896e-05, + "loss": 0.0, + "step": 52467 + }, + { + "epoch": 4.895773070822059, + "grad_norm": NaN, + "learning_rate": 2.5923545666846068e-05, + "loss": 0.0, + "step": 52468 + }, + { + "epoch": 4.895866380516936, + "grad_norm": NaN, + "learning_rate": 2.5919295413379994e-05, + "loss": 0.0, + "step": 52469 + }, + { + "epoch": 4.8959596902118125, + "grad_norm": NaN, + "learning_rate": 2.5915045475414343e-05, + "loss": 0.0, + "step": 52470 + }, + { + "epoch": 4.89605299990669, + "grad_norm": NaN, + "learning_rate": 2.5910795852959958e-05, + "loss": 0.0, + "step": 52471 + }, + { + "epoch": 4.896146309601567, + "grad_norm": NaN, + "learning_rate": 2.5906546546027717e-05, + "loss": 0.0, + "step": 52472 + }, + { + "epoch": 4.896239619296445, + "grad_norm": NaN, + "learning_rate": 2.5902297554628344e-05, + "loss": 0.0, + "step": 52473 + }, + { + "epoch": 4.896332928991322, + "grad_norm": NaN, + "learning_rate": 2.5898048878772627e-05, + "loss": 0.0, + "step": 52474 + }, + { + "epoch": 4.8964262386862, + "grad_norm": NaN, + "learning_rate": 2.5893800518471475e-05, + "loss": 0.0, + "step": 52475 + }, + { + "epoch": 4.896519548381077, + "grad_norm": NaN, + "learning_rate": 2.58895524737356e-05, + "loss": 0.0, + "step": 52476 + }, + { + "epoch": 4.8966128580759545, + "grad_norm": NaN, + "learning_rate": 2.5885304744575837e-05, + "loss": 0.0, + "step": 52477 + }, + { + "epoch": 4.896706167770831, + "grad_norm": NaN, + "learning_rate": 2.5881057331002964e-05, + "loss": 0.0, + "step": 52478 + }, + { + "epoch": 4.896799477465708, + "grad_norm": NaN, + "learning_rate": 2.587681023302779e-05, + "loss": 0.0, + "step": 52479 + }, + { + "epoch": 4.896892787160586, + "grad_norm": NaN, + "learning_rate": 2.5872563450661138e-05, + "loss": 0.0, + "step": 52480 + }, + { + "epoch": 4.896986096855463, + "grad_norm": NaN, + "learning_rate": 2.5868316983913768e-05, + "loss": 0.0, + "step": 52481 + }, + { + "epoch": 4.897079406550341, + "grad_norm": NaN, + "learning_rate": 2.5864070832796468e-05, + "loss": 0.0, + "step": 52482 + }, + { + "epoch": 4.897172716245218, + "grad_norm": NaN, + "learning_rate": 2.585982499732015e-05, + "loss": 0.0, + "step": 52483 + }, + { + "epoch": 4.8972660259400955, + "grad_norm": NaN, + "learning_rate": 2.585557947749547e-05, + "loss": 0.0, + "step": 52484 + }, + { + "epoch": 4.897359335634972, + "grad_norm": NaN, + "learning_rate": 2.585133427333325e-05, + "loss": 0.0, + "step": 52485 + }, + { + "epoch": 4.8974526453298495, + "grad_norm": NaN, + "learning_rate": 2.5847089384844387e-05, + "loss": 0.0, + "step": 52486 + }, + { + "epoch": 4.897545955024727, + "grad_norm": NaN, + "learning_rate": 2.5842844812039553e-05, + "loss": 0.0, + "step": 52487 + }, + { + "epoch": 4.897639264719604, + "grad_norm": NaN, + "learning_rate": 2.5838600554929556e-05, + "loss": 0.0, + "step": 52488 + }, + { + "epoch": 4.897732574414482, + "grad_norm": NaN, + "learning_rate": 2.5834356613525287e-05, + "loss": 0.0, + "step": 52489 + }, + { + "epoch": 4.897825884109359, + "grad_norm": NaN, + "learning_rate": 2.5830112987837438e-05, + "loss": 0.0, + "step": 52490 + }, + { + "epoch": 4.897919193804237, + "grad_norm": NaN, + "learning_rate": 2.58258696778768e-05, + "loss": 0.0, + "step": 52491 + }, + { + "epoch": 4.898012503499114, + "grad_norm": NaN, + "learning_rate": 2.5821626683654245e-05, + "loss": 0.0, + "step": 52492 + }, + { + "epoch": 4.8981058131939905, + "grad_norm": NaN, + "learning_rate": 2.5817384005180485e-05, + "loss": 0.0, + "step": 52493 + }, + { + "epoch": 4.898199122888868, + "grad_norm": NaN, + "learning_rate": 2.5813141642466295e-05, + "loss": 0.0, + "step": 52494 + }, + { + "epoch": 4.898292432583745, + "grad_norm": NaN, + "learning_rate": 2.5808899595522563e-05, + "loss": 0.0, + "step": 52495 + }, + { + "epoch": 4.898385742278623, + "grad_norm": NaN, + "learning_rate": 2.5804657864359985e-05, + "loss": 0.0, + "step": 52496 + }, + { + "epoch": 4.8984790519735, + "grad_norm": NaN, + "learning_rate": 2.580041644898933e-05, + "loss": 0.0, + "step": 52497 + }, + { + "epoch": 4.898572361668378, + "grad_norm": NaN, + "learning_rate": 2.579617534942151e-05, + "loss": 0.0, + "step": 52498 + }, + { + "epoch": 4.898665671363254, + "grad_norm": NaN, + "learning_rate": 2.5791934565667182e-05, + "loss": 0.0, + "step": 52499 + }, + { + "epoch": 4.898758981058132, + "grad_norm": NaN, + "learning_rate": 2.5787694097737137e-05, + "loss": 0.0, + "step": 52500 + }, + { + "epoch": 4.898852290753009, + "grad_norm": NaN, + "learning_rate": 2.5783453945642252e-05, + "loss": 0.0, + "step": 52501 + }, + { + "epoch": 4.898945600447886, + "grad_norm": NaN, + "learning_rate": 2.5779214109393215e-05, + "loss": 0.0, + "step": 52502 + }, + { + "epoch": 4.899038910142764, + "grad_norm": NaN, + "learning_rate": 2.5774974589000818e-05, + "loss": 0.0, + "step": 52503 + }, + { + "epoch": 4.899132219837641, + "grad_norm": NaN, + "learning_rate": 2.577073538447592e-05, + "loss": 0.0, + "step": 52504 + }, + { + "epoch": 4.899225529532519, + "grad_norm": NaN, + "learning_rate": 2.5766496495829215e-05, + "loss": 0.0, + "step": 52505 + }, + { + "epoch": 4.899318839227396, + "grad_norm": NaN, + "learning_rate": 2.5762257923071472e-05, + "loss": 0.0, + "step": 52506 + }, + { + "epoch": 4.8994121489222735, + "grad_norm": NaN, + "learning_rate": 2.575801966621357e-05, + "loss": 0.0, + "step": 52507 + }, + { + "epoch": 4.89950545861715, + "grad_norm": NaN, + "learning_rate": 2.5753781725266183e-05, + "loss": 0.0, + "step": 52508 + }, + { + "epoch": 4.8995987683120275, + "grad_norm": NaN, + "learning_rate": 2.57495441002401e-05, + "loss": 0.0, + "step": 52509 + }, + { + "epoch": 4.899692078006905, + "grad_norm": NaN, + "learning_rate": 2.574530679114618e-05, + "loss": 0.0, + "step": 52510 + }, + { + "epoch": 4.899785387701782, + "grad_norm": NaN, + "learning_rate": 2.574106979799512e-05, + "loss": 0.0, + "step": 52511 + }, + { + "epoch": 4.89987869739666, + "grad_norm": NaN, + "learning_rate": 2.573683312079765e-05, + "loss": 0.0, + "step": 52512 + }, + { + "epoch": 4.899972007091537, + "grad_norm": NaN, + "learning_rate": 2.5732596759564704e-05, + "loss": 0.0, + "step": 52513 + }, + { + "epoch": 4.900065316786414, + "grad_norm": NaN, + "learning_rate": 2.5728360714306906e-05, + "loss": 0.0, + "step": 52514 + }, + { + "epoch": 4.900158626481291, + "grad_norm": NaN, + "learning_rate": 2.5724124985035027e-05, + "loss": 0.0, + "step": 52515 + }, + { + "epoch": 4.9002519361761685, + "grad_norm": NaN, + "learning_rate": 2.5719889571759976e-05, + "loss": 0.0, + "step": 52516 + }, + { + "epoch": 4.900345245871046, + "grad_norm": NaN, + "learning_rate": 2.571565447449238e-05, + "loss": 0.0, + "step": 52517 + }, + { + "epoch": 4.900438555565923, + "grad_norm": NaN, + "learning_rate": 2.571141969324303e-05, + "loss": 0.0, + "step": 52518 + }, + { + "epoch": 4.900531865260801, + "grad_norm": NaN, + "learning_rate": 2.5707185228022797e-05, + "loss": 0.0, + "step": 52519 + }, + { + "epoch": 4.900625174955678, + "grad_norm": NaN, + "learning_rate": 2.5702951078842295e-05, + "loss": 0.0, + "step": 52520 + }, + { + "epoch": 4.900718484650556, + "grad_norm": NaN, + "learning_rate": 2.5698717245712442e-05, + "loss": 0.0, + "step": 52521 + }, + { + "epoch": 4.900811794345432, + "grad_norm": NaN, + "learning_rate": 2.56944837286439e-05, + "loss": 0.0, + "step": 52522 + }, + { + "epoch": 4.9009051040403095, + "grad_norm": NaN, + "learning_rate": 2.5690250527647428e-05, + "loss": 0.0, + "step": 52523 + }, + { + "epoch": 4.900998413735187, + "grad_norm": NaN, + "learning_rate": 2.568601764273388e-05, + "loss": 0.0, + "step": 52524 + }, + { + "epoch": 4.901091723430064, + "grad_norm": NaN, + "learning_rate": 2.5681785073913953e-05, + "loss": 0.0, + "step": 52525 + }, + { + "epoch": 4.901185033124942, + "grad_norm": NaN, + "learning_rate": 2.5677552821198366e-05, + "loss": 0.0, + "step": 52526 + }, + { + "epoch": 4.901278342819819, + "grad_norm": NaN, + "learning_rate": 2.5673320884598013e-05, + "loss": 0.0, + "step": 52527 + }, + { + "epoch": 4.901371652514697, + "grad_norm": NaN, + "learning_rate": 2.5669089264123553e-05, + "loss": 0.0, + "step": 52528 + }, + { + "epoch": 4.901464962209573, + "grad_norm": NaN, + "learning_rate": 2.5664857959785712e-05, + "loss": 0.0, + "step": 52529 + }, + { + "epoch": 4.901558271904451, + "grad_norm": NaN, + "learning_rate": 2.5660626971595394e-05, + "loss": 0.0, + "step": 52530 + }, + { + "epoch": 4.901651581599328, + "grad_norm": NaN, + "learning_rate": 2.5656396299563244e-05, + "loss": 0.0, + "step": 52531 + }, + { + "epoch": 4.901744891294205, + "grad_norm": NaN, + "learning_rate": 2.5652165943699982e-05, + "loss": 0.0, + "step": 52532 + }, + { + "epoch": 4.901838200989083, + "grad_norm": NaN, + "learning_rate": 2.564793590401652e-05, + "loss": 0.0, + "step": 52533 + }, + { + "epoch": 4.90193151068396, + "grad_norm": NaN, + "learning_rate": 2.5643706180523466e-05, + "loss": 0.0, + "step": 52534 + }, + { + "epoch": 4.902024820378838, + "grad_norm": NaN, + "learning_rate": 2.563947677323161e-05, + "loss": 0.0, + "step": 52535 + }, + { + "epoch": 4.902118130073715, + "grad_norm": NaN, + "learning_rate": 2.5635247682151794e-05, + "loss": 0.0, + "step": 52536 + }, + { + "epoch": 4.902211439768592, + "grad_norm": NaN, + "learning_rate": 2.5631018907294674e-05, + "loss": 0.0, + "step": 52537 + }, + { + "epoch": 4.902304749463469, + "grad_norm": NaN, + "learning_rate": 2.562679044867098e-05, + "loss": 0.0, + "step": 52538 + }, + { + "epoch": 4.9023980591583465, + "grad_norm": NaN, + "learning_rate": 2.56225623062916e-05, + "loss": 0.0, + "step": 52539 + }, + { + "epoch": 4.902491368853224, + "grad_norm": NaN, + "learning_rate": 2.5618334480167157e-05, + "loss": 0.0, + "step": 52540 + }, + { + "epoch": 4.902584678548101, + "grad_norm": NaN, + "learning_rate": 2.5614106970308414e-05, + "loss": 0.0, + "step": 52541 + }, + { + "epoch": 4.902677988242979, + "grad_norm": NaN, + "learning_rate": 2.5609879776726224e-05, + "loss": 0.0, + "step": 52542 + }, + { + "epoch": 4.902771297937855, + "grad_norm": NaN, + "learning_rate": 2.560565289943123e-05, + "loss": 0.0, + "step": 52543 + }, + { + "epoch": 4.902864607632733, + "grad_norm": NaN, + "learning_rate": 2.5601426338434172e-05, + "loss": 0.0, + "step": 52544 + }, + { + "epoch": 4.90295791732761, + "grad_norm": NaN, + "learning_rate": 2.5597200093745913e-05, + "loss": 0.0, + "step": 52545 + }, + { + "epoch": 4.9030512270224875, + "grad_norm": NaN, + "learning_rate": 2.5592974165377073e-05, + "loss": 0.0, + "step": 52546 + }, + { + "epoch": 4.903144536717365, + "grad_norm": NaN, + "learning_rate": 2.5588748553338428e-05, + "loss": 0.0, + "step": 52547 + }, + { + "epoch": 4.903237846412242, + "grad_norm": NaN, + "learning_rate": 2.5584523257640804e-05, + "loss": 0.0, + "step": 52548 + }, + { + "epoch": 4.90333115610712, + "grad_norm": NaN, + "learning_rate": 2.5580298278294858e-05, + "loss": 0.0, + "step": 52549 + }, + { + "epoch": 4.903424465801997, + "grad_norm": NaN, + "learning_rate": 2.5576073615311316e-05, + "loss": 0.0, + "step": 52550 + }, + { + "epoch": 4.903517775496875, + "grad_norm": NaN, + "learning_rate": 2.5571849268701035e-05, + "loss": 0.0, + "step": 52551 + }, + { + "epoch": 4.903611085191751, + "grad_norm": NaN, + "learning_rate": 2.5567625238474636e-05, + "loss": 0.0, + "step": 52552 + }, + { + "epoch": 4.903704394886629, + "grad_norm": NaN, + "learning_rate": 2.5563401524642886e-05, + "loss": 0.0, + "step": 52553 + }, + { + "epoch": 4.903797704581506, + "grad_norm": NaN, + "learning_rate": 2.55591781272166e-05, + "loss": 0.0, + "step": 52554 + }, + { + "epoch": 4.903891014276383, + "grad_norm": NaN, + "learning_rate": 2.555495504620641e-05, + "loss": 0.0, + "step": 52555 + }, + { + "epoch": 4.903984323971261, + "grad_norm": NaN, + "learning_rate": 2.555073228162312e-05, + "loss": 0.0, + "step": 52556 + }, + { + "epoch": 4.904077633666138, + "grad_norm": NaN, + "learning_rate": 2.5546509833477508e-05, + "loss": 0.0, + "step": 52557 + }, + { + "epoch": 4.904170943361015, + "grad_norm": NaN, + "learning_rate": 2.554228770178018e-05, + "loss": 0.0, + "step": 52558 + }, + { + "epoch": 4.904264253055892, + "grad_norm": NaN, + "learning_rate": 2.5538065886541975e-05, + "loss": 0.0, + "step": 52559 + }, + { + "epoch": 4.90435756275077, + "grad_norm": NaN, + "learning_rate": 2.5533844387773638e-05, + "loss": 0.0, + "step": 52560 + }, + { + "epoch": 4.904450872445647, + "grad_norm": NaN, + "learning_rate": 2.5529623205485795e-05, + "loss": 0.0, + "step": 52561 + }, + { + "epoch": 4.9045441821405245, + "grad_norm": NaN, + "learning_rate": 2.5525402339689282e-05, + "loss": 0.0, + "step": 52562 + }, + { + "epoch": 4.904637491835402, + "grad_norm": NaN, + "learning_rate": 2.5521181790394846e-05, + "loss": 0.0, + "step": 52563 + }, + { + "epoch": 4.904730801530279, + "grad_norm": NaN, + "learning_rate": 2.5516961557613092e-05, + "loss": 0.0, + "step": 52564 + }, + { + "epoch": 4.904824111225157, + "grad_norm": NaN, + "learning_rate": 2.5512741641354912e-05, + "loss": 0.0, + "step": 52565 + }, + { + "epoch": 4.904917420920033, + "grad_norm": NaN, + "learning_rate": 2.5508522041630895e-05, + "loss": 0.0, + "step": 52566 + }, + { + "epoch": 4.905010730614911, + "grad_norm": NaN, + "learning_rate": 2.5504302758451805e-05, + "loss": 0.0, + "step": 52567 + }, + { + "epoch": 4.905104040309788, + "grad_norm": NaN, + "learning_rate": 2.550008379182848e-05, + "loss": 0.0, + "step": 52568 + }, + { + "epoch": 4.9051973500046655, + "grad_norm": NaN, + "learning_rate": 2.549586514177151e-05, + "loss": 0.0, + "step": 52569 + }, + { + "epoch": 4.905290659699543, + "grad_norm": NaN, + "learning_rate": 2.5491646808291636e-05, + "loss": 0.0, + "step": 52570 + }, + { + "epoch": 4.90538396939442, + "grad_norm": NaN, + "learning_rate": 2.5487428791399704e-05, + "loss": 0.0, + "step": 52571 + }, + { + "epoch": 4.905477279089298, + "grad_norm": NaN, + "learning_rate": 2.548321109110632e-05, + "loss": 0.0, + "step": 52572 + }, + { + "epoch": 4.905570588784174, + "grad_norm": NaN, + "learning_rate": 2.5478993707422207e-05, + "loss": 0.0, + "step": 52573 + }, + { + "epoch": 4.905663898479052, + "grad_norm": NaN, + "learning_rate": 2.547477664035821e-05, + "loss": 0.0, + "step": 52574 + }, + { + "epoch": 4.905757208173929, + "grad_norm": NaN, + "learning_rate": 2.5470559889924914e-05, + "loss": 0.0, + "step": 52575 + }, + { + "epoch": 4.9058505178688065, + "grad_norm": NaN, + "learning_rate": 2.546634345613308e-05, + "loss": 0.0, + "step": 52576 + }, + { + "epoch": 4.905943827563684, + "grad_norm": NaN, + "learning_rate": 2.5462127338993504e-05, + "loss": 0.0, + "step": 52577 + }, + { + "epoch": 4.906037137258561, + "grad_norm": NaN, + "learning_rate": 2.5457911538516808e-05, + "loss": 0.0, + "step": 52578 + }, + { + "epoch": 4.906130446953439, + "grad_norm": NaN, + "learning_rate": 2.545369605471373e-05, + "loss": 0.0, + "step": 52579 + }, + { + "epoch": 4.906223756648316, + "grad_norm": NaN, + "learning_rate": 2.5449480887595064e-05, + "loss": 0.0, + "step": 52580 + }, + { + "epoch": 4.906317066343193, + "grad_norm": NaN, + "learning_rate": 2.5445266037171454e-05, + "loss": 0.0, + "step": 52581 + }, + { + "epoch": 4.90641037603807, + "grad_norm": NaN, + "learning_rate": 2.5441051503453587e-05, + "loss": 0.0, + "step": 52582 + }, + { + "epoch": 4.906503685732948, + "grad_norm": NaN, + "learning_rate": 2.5436837286452304e-05, + "loss": 0.0, + "step": 52583 + }, + { + "epoch": 4.906596995427825, + "grad_norm": NaN, + "learning_rate": 2.543262338617822e-05, + "loss": 0.0, + "step": 52584 + }, + { + "epoch": 4.906690305122702, + "grad_norm": NaN, + "learning_rate": 2.542840980264202e-05, + "loss": 0.0, + "step": 52585 + }, + { + "epoch": 4.90678361481758, + "grad_norm": NaN, + "learning_rate": 2.5424196535854562e-05, + "loss": 0.0, + "step": 52586 + }, + { + "epoch": 4.906876924512456, + "grad_norm": NaN, + "learning_rate": 2.5419983585826425e-05, + "loss": 0.0, + "step": 52587 + }, + { + "epoch": 4.906970234207334, + "grad_norm": NaN, + "learning_rate": 2.5415770952568332e-05, + "loss": 0.0, + "step": 52588 + }, + { + "epoch": 4.907063543902211, + "grad_norm": NaN, + "learning_rate": 2.5411558636091105e-05, + "loss": 0.0, + "step": 52589 + }, + { + "epoch": 4.907156853597089, + "grad_norm": NaN, + "learning_rate": 2.540734663640534e-05, + "loss": 0.0, + "step": 52590 + }, + { + "epoch": 4.907250163291966, + "grad_norm": NaN, + "learning_rate": 2.5403134953521743e-05, + "loss": 0.0, + "step": 52591 + }, + { + "epoch": 4.9073434729868435, + "grad_norm": NaN, + "learning_rate": 2.5398923587451154e-05, + "loss": 0.0, + "step": 52592 + }, + { + "epoch": 4.907436782681721, + "grad_norm": NaN, + "learning_rate": 2.53947125382041e-05, + "loss": 0.0, + "step": 52593 + }, + { + "epoch": 4.907530092376598, + "grad_norm": NaN, + "learning_rate": 2.5390501805791434e-05, + "loss": 0.0, + "step": 52594 + }, + { + "epoch": 4.907623402071475, + "grad_norm": NaN, + "learning_rate": 2.538629139022384e-05, + "loss": 0.0, + "step": 52595 + }, + { + "epoch": 4.907716711766352, + "grad_norm": NaN, + "learning_rate": 2.538208129151192e-05, + "loss": 0.0, + "step": 52596 + }, + { + "epoch": 4.90781002146123, + "grad_norm": NaN, + "learning_rate": 2.537787150966648e-05, + "loss": 0.0, + "step": 52597 + }, + { + "epoch": 4.907903331156107, + "grad_norm": NaN, + "learning_rate": 2.537366204469825e-05, + "loss": 0.0, + "step": 52598 + }, + { + "epoch": 4.9079966408509845, + "grad_norm": NaN, + "learning_rate": 2.5369452896617802e-05, + "loss": 0.0, + "step": 52599 + }, + { + "epoch": 4.908089950545862, + "grad_norm": NaN, + "learning_rate": 2.5365244065435947e-05, + "loss": 0.0, + "step": 52600 + }, + { + "epoch": 4.908183260240739, + "grad_norm": NaN, + "learning_rate": 2.5361035551163406e-05, + "loss": 0.0, + "step": 52601 + }, + { + "epoch": 4.908276569935616, + "grad_norm": NaN, + "learning_rate": 2.535682735381074e-05, + "loss": 0.0, + "step": 52602 + }, + { + "epoch": 4.908369879630493, + "grad_norm": NaN, + "learning_rate": 2.5352619473388785e-05, + "loss": 0.0, + "step": 52603 + }, + { + "epoch": 4.908463189325371, + "grad_norm": NaN, + "learning_rate": 2.534841190990824e-05, + "loss": 0.0, + "step": 52604 + }, + { + "epoch": 4.908556499020248, + "grad_norm": NaN, + "learning_rate": 2.5344204663379675e-05, + "loss": 0.0, + "step": 52605 + }, + { + "epoch": 4.908649808715126, + "grad_norm": NaN, + "learning_rate": 2.5339997733813915e-05, + "loss": 0.0, + "step": 52606 + }, + { + "epoch": 4.908743118410003, + "grad_norm": NaN, + "learning_rate": 2.5335791121221654e-05, + "loss": 0.0, + "step": 52607 + }, + { + "epoch": 4.90883642810488, + "grad_norm": NaN, + "learning_rate": 2.5331584825613465e-05, + "loss": 0.0, + "step": 52608 + }, + { + "epoch": 4.908929737799758, + "grad_norm": NaN, + "learning_rate": 2.5327378847000174e-05, + "loss": 0.0, + "step": 52609 + }, + { + "epoch": 4.909023047494634, + "grad_norm": NaN, + "learning_rate": 2.5323173185392454e-05, + "loss": 0.0, + "step": 52610 + }, + { + "epoch": 4.909116357189512, + "grad_norm": NaN, + "learning_rate": 2.5318967840800898e-05, + "loss": 0.0, + "step": 52611 + }, + { + "epoch": 4.909209666884389, + "grad_norm": NaN, + "learning_rate": 2.5314762813236363e-05, + "loss": 0.0, + "step": 52612 + }, + { + "epoch": 4.909302976579267, + "grad_norm": NaN, + "learning_rate": 2.531055810270939e-05, + "loss": 0.0, + "step": 52613 + }, + { + "epoch": 4.909396286274144, + "grad_norm": NaN, + "learning_rate": 2.530635370923072e-05, + "loss": 0.0, + "step": 52614 + }, + { + "epoch": 4.9094895959690215, + "grad_norm": NaN, + "learning_rate": 2.5302149632811115e-05, + "loss": 0.0, + "step": 52615 + }, + { + "epoch": 4.909582905663898, + "grad_norm": NaN, + "learning_rate": 2.529794587346116e-05, + "loss": 0.0, + "step": 52616 + }, + { + "epoch": 4.909676215358775, + "grad_norm": NaN, + "learning_rate": 2.529374243119157e-05, + "loss": 0.0, + "step": 52617 + }, + { + "epoch": 4.909769525053653, + "grad_norm": NaN, + "learning_rate": 2.5289539306013118e-05, + "loss": 0.0, + "step": 52618 + }, + { + "epoch": 4.90986283474853, + "grad_norm": NaN, + "learning_rate": 2.5285336497936394e-05, + "loss": 0.0, + "step": 52619 + }, + { + "epoch": 4.909956144443408, + "grad_norm": NaN, + "learning_rate": 2.528113400697207e-05, + "loss": 0.0, + "step": 52620 + }, + { + "epoch": 4.910049454138285, + "grad_norm": NaN, + "learning_rate": 2.5276931833130958e-05, + "loss": 0.0, + "step": 52621 + }, + { + "epoch": 4.9101427638331625, + "grad_norm": NaN, + "learning_rate": 2.5272729976423632e-05, + "loss": 0.0, + "step": 52622 + }, + { + "epoch": 4.91023607352804, + "grad_norm": NaN, + "learning_rate": 2.5268528436860753e-05, + "loss": 0.0, + "step": 52623 + }, + { + "epoch": 4.910329383222917, + "grad_norm": NaN, + "learning_rate": 2.526432721445314e-05, + "loss": 0.0, + "step": 52624 + }, + { + "epoch": 4.910422692917794, + "grad_norm": NaN, + "learning_rate": 2.526012630921137e-05, + "loss": 0.0, + "step": 52625 + }, + { + "epoch": 4.910516002612671, + "grad_norm": NaN, + "learning_rate": 2.5255925721146086e-05, + "loss": 0.0, + "step": 52626 + }, + { + "epoch": 4.910609312307549, + "grad_norm": NaN, + "learning_rate": 2.5251725450268128e-05, + "loss": 0.0, + "step": 52627 + }, + { + "epoch": 4.910702622002426, + "grad_norm": NaN, + "learning_rate": 2.5247525496588024e-05, + "loss": 0.0, + "step": 52628 + }, + { + "epoch": 4.9107959316973036, + "grad_norm": NaN, + "learning_rate": 2.5243325860116494e-05, + "loss": 0.0, + "step": 52629 + }, + { + "epoch": 4.910889241392181, + "grad_norm": NaN, + "learning_rate": 2.523912654086428e-05, + "loss": 0.0, + "step": 52630 + }, + { + "epoch": 4.9109825510870575, + "grad_norm": NaN, + "learning_rate": 2.5234927538841942e-05, + "loss": 0.0, + "step": 52631 + }, + { + "epoch": 4.911075860781935, + "grad_norm": NaN, + "learning_rate": 2.5230728854060274e-05, + "loss": 0.0, + "step": 52632 + }, + { + "epoch": 4.911169170476812, + "grad_norm": NaN, + "learning_rate": 2.5226530486529926e-05, + "loss": 0.0, + "step": 52633 + }, + { + "epoch": 4.91126248017169, + "grad_norm": NaN, + "learning_rate": 2.5222332436261482e-05, + "loss": 0.0, + "step": 52634 + }, + { + "epoch": 4.911355789866567, + "grad_norm": NaN, + "learning_rate": 2.521813470326573e-05, + "loss": 0.0, + "step": 52635 + }, + { + "epoch": 4.911449099561445, + "grad_norm": NaN, + "learning_rate": 2.5213937287553327e-05, + "loss": 0.0, + "step": 52636 + }, + { + "epoch": 4.911542409256322, + "grad_norm": NaN, + "learning_rate": 2.5209740189134847e-05, + "loss": 0.0, + "step": 52637 + }, + { + "epoch": 4.911635718951199, + "grad_norm": NaN, + "learning_rate": 2.520554340802107e-05, + "loss": 0.0, + "step": 52638 + }, + { + "epoch": 4.911729028646076, + "grad_norm": NaN, + "learning_rate": 2.520134694422266e-05, + "loss": 0.0, + "step": 52639 + }, + { + "epoch": 4.911822338340953, + "grad_norm": NaN, + "learning_rate": 2.519715079775019e-05, + "loss": 0.0, + "step": 52640 + }, + { + "epoch": 4.911915648035831, + "grad_norm": NaN, + "learning_rate": 2.5192954968614422e-05, + "loss": 0.0, + "step": 52641 + }, + { + "epoch": 4.912008957730708, + "grad_norm": NaN, + "learning_rate": 2.5188759456826034e-05, + "loss": 0.0, + "step": 52642 + }, + { + "epoch": 4.912102267425586, + "grad_norm": NaN, + "learning_rate": 2.5184564262395605e-05, + "loss": 0.0, + "step": 52643 + }, + { + "epoch": 4.912195577120463, + "grad_norm": NaN, + "learning_rate": 2.5180369385333877e-05, + "loss": 0.0, + "step": 52644 + }, + { + "epoch": 4.9122888868153405, + "grad_norm": NaN, + "learning_rate": 2.5176174825651535e-05, + "loss": 0.0, + "step": 52645 + }, + { + "epoch": 4.912382196510217, + "grad_norm": NaN, + "learning_rate": 2.5171980583359123e-05, + "loss": 0.0, + "step": 52646 + }, + { + "epoch": 4.912475506205094, + "grad_norm": NaN, + "learning_rate": 2.5167786658467436e-05, + "loss": 0.0, + "step": 52647 + }, + { + "epoch": 4.912568815899972, + "grad_norm": NaN, + "learning_rate": 2.516359305098713e-05, + "loss": 0.0, + "step": 52648 + }, + { + "epoch": 4.912662125594849, + "grad_norm": NaN, + "learning_rate": 2.5159399760928744e-05, + "loss": 0.0, + "step": 52649 + }, + { + "epoch": 4.912755435289727, + "grad_norm": NaN, + "learning_rate": 2.5155206788303073e-05, + "loss": 0.0, + "step": 52650 + }, + { + "epoch": 4.912848744984604, + "grad_norm": NaN, + "learning_rate": 2.5151014133120757e-05, + "loss": 0.0, + "step": 52651 + }, + { + "epoch": 4.9129420546794815, + "grad_norm": NaN, + "learning_rate": 2.5146821795392352e-05, + "loss": 0.0, + "step": 52652 + }, + { + "epoch": 4.913035364374359, + "grad_norm": NaN, + "learning_rate": 2.5142629775128637e-05, + "loss": 0.0, + "step": 52653 + }, + { + "epoch": 4.9131286740692355, + "grad_norm": NaN, + "learning_rate": 2.5138438072340266e-05, + "loss": 0.0, + "step": 52654 + }, + { + "epoch": 4.913221983764113, + "grad_norm": NaN, + "learning_rate": 2.5134246687037785e-05, + "loss": 0.0, + "step": 52655 + }, + { + "epoch": 4.91331529345899, + "grad_norm": NaN, + "learning_rate": 2.5130055619231997e-05, + "loss": 0.0, + "step": 52656 + }, + { + "epoch": 4.913408603153868, + "grad_norm": NaN, + "learning_rate": 2.5125864868933466e-05, + "loss": 0.0, + "step": 52657 + }, + { + "epoch": 4.913501912848745, + "grad_norm": NaN, + "learning_rate": 2.5121674436152812e-05, + "loss": 0.0, + "step": 52658 + }, + { + "epoch": 4.913595222543623, + "grad_norm": NaN, + "learning_rate": 2.5117484320900842e-05, + "loss": 0.0, + "step": 52659 + }, + { + "epoch": 4.913688532238499, + "grad_norm": NaN, + "learning_rate": 2.511329452318807e-05, + "loss": 0.0, + "step": 52660 + }, + { + "epoch": 4.9137818419333765, + "grad_norm": NaN, + "learning_rate": 2.510910504302515e-05, + "loss": 0.0, + "step": 52661 + }, + { + "epoch": 4.913875151628254, + "grad_norm": NaN, + "learning_rate": 2.5104915880422876e-05, + "loss": 0.0, + "step": 52662 + }, + { + "epoch": 4.913968461323131, + "grad_norm": NaN, + "learning_rate": 2.5100727035391756e-05, + "loss": 0.0, + "step": 52663 + }, + { + "epoch": 4.914061771018009, + "grad_norm": NaN, + "learning_rate": 2.5096538507942443e-05, + "loss": 0.0, + "step": 52664 + }, + { + "epoch": 4.914155080712886, + "grad_norm": NaN, + "learning_rate": 2.5092350298085718e-05, + "loss": 0.0, + "step": 52665 + }, + { + "epoch": 4.914248390407764, + "grad_norm": NaN, + "learning_rate": 2.50881624058321e-05, + "loss": 0.0, + "step": 52666 + }, + { + "epoch": 4.914341700102641, + "grad_norm": NaN, + "learning_rate": 2.5083974831192253e-05, + "loss": 0.0, + "step": 52667 + }, + { + "epoch": 4.9144350097975185, + "grad_norm": NaN, + "learning_rate": 2.5079787574176934e-05, + "loss": 0.0, + "step": 52668 + }, + { + "epoch": 4.914528319492395, + "grad_norm": NaN, + "learning_rate": 2.5075600634796632e-05, + "loss": 0.0, + "step": 52669 + }, + { + "epoch": 4.914621629187272, + "grad_norm": NaN, + "learning_rate": 2.507141401306211e-05, + "loss": 0.0, + "step": 52670 + }, + { + "epoch": 4.91471493888215, + "grad_norm": NaN, + "learning_rate": 2.5067227708984016e-05, + "loss": 0.0, + "step": 52671 + }, + { + "epoch": 4.914808248577027, + "grad_norm": NaN, + "learning_rate": 2.506304172257287e-05, + "loss": 0.0, + "step": 52672 + }, + { + "epoch": 4.914901558271905, + "grad_norm": NaN, + "learning_rate": 2.5058856053839455e-05, + "loss": 0.0, + "step": 52673 + }, + { + "epoch": 4.914994867966782, + "grad_norm": NaN, + "learning_rate": 2.5054670702794384e-05, + "loss": 0.0, + "step": 52674 + }, + { + "epoch": 4.915088177661659, + "grad_norm": NaN, + "learning_rate": 2.5050485669448196e-05, + "loss": 0.0, + "step": 52675 + }, + { + "epoch": 4.915181487356536, + "grad_norm": NaN, + "learning_rate": 2.504630095381165e-05, + "loss": 0.0, + "step": 52676 + }, + { + "epoch": 4.9152747970514135, + "grad_norm": NaN, + "learning_rate": 2.5042116555895386e-05, + "loss": 0.0, + "step": 52677 + }, + { + "epoch": 4.915368106746291, + "grad_norm": NaN, + "learning_rate": 2.5037932475709933e-05, + "loss": 0.0, + "step": 52678 + }, + { + "epoch": 4.915461416441168, + "grad_norm": NaN, + "learning_rate": 2.5033748713266026e-05, + "loss": 0.0, + "step": 52679 + }, + { + "epoch": 4.915554726136046, + "grad_norm": NaN, + "learning_rate": 2.5029565268574313e-05, + "loss": 0.0, + "step": 52680 + }, + { + "epoch": 4.915648035830923, + "grad_norm": NaN, + "learning_rate": 2.5025382141645335e-05, + "loss": 0.0, + "step": 52681 + }, + { + "epoch": 4.9157413455258006, + "grad_norm": NaN, + "learning_rate": 2.5021199332489828e-05, + "loss": 0.0, + "step": 52682 + }, + { + "epoch": 4.915834655220677, + "grad_norm": NaN, + "learning_rate": 2.501701684111842e-05, + "loss": 0.0, + "step": 52683 + }, + { + "epoch": 4.9159279649155545, + "grad_norm": NaN, + "learning_rate": 2.5012834667541653e-05, + "loss": 0.0, + "step": 52684 + }, + { + "epoch": 4.916021274610432, + "grad_norm": NaN, + "learning_rate": 2.5008652811770253e-05, + "loss": 0.0, + "step": 52685 + }, + { + "epoch": 4.916114584305309, + "grad_norm": NaN, + "learning_rate": 2.5004471273814857e-05, + "loss": 0.0, + "step": 52686 + }, + { + "epoch": 4.916207894000187, + "grad_norm": NaN, + "learning_rate": 2.5000290053685996e-05, + "loss": 0.0, + "step": 52687 + }, + { + "epoch": 4.916301203695064, + "grad_norm": NaN, + "learning_rate": 2.4996109151394404e-05, + "loss": 0.0, + "step": 52688 + }, + { + "epoch": 4.916394513389942, + "grad_norm": NaN, + "learning_rate": 2.4991928566950716e-05, + "loss": 0.0, + "step": 52689 + }, + { + "epoch": 4.916487823084818, + "grad_norm": NaN, + "learning_rate": 2.4987748300365445e-05, + "loss": 0.0, + "step": 52690 + }, + { + "epoch": 4.9165811327796956, + "grad_norm": NaN, + "learning_rate": 2.498356835164934e-05, + "loss": 0.0, + "step": 52691 + }, + { + "epoch": 4.916674442474573, + "grad_norm": NaN, + "learning_rate": 2.4979388720813028e-05, + "loss": 0.0, + "step": 52692 + }, + { + "epoch": 4.91676775216945, + "grad_norm": NaN, + "learning_rate": 2.4975209407867007e-05, + "loss": 0.0, + "step": 52693 + }, + { + "epoch": 4.916861061864328, + "grad_norm": NaN, + "learning_rate": 2.4971030412822046e-05, + "loss": 0.0, + "step": 52694 + }, + { + "epoch": 4.916954371559205, + "grad_norm": NaN, + "learning_rate": 2.4966851735688748e-05, + "loss": 0.0, + "step": 52695 + }, + { + "epoch": 4.917047681254083, + "grad_norm": NaN, + "learning_rate": 2.4962673376477638e-05, + "loss": 0.0, + "step": 52696 + }, + { + "epoch": 4.91714099094896, + "grad_norm": NaN, + "learning_rate": 2.4958495335199456e-05, + "loss": 0.0, + "step": 52697 + }, + { + "epoch": 4.917234300643837, + "grad_norm": NaN, + "learning_rate": 2.49543176118648e-05, + "loss": 0.0, + "step": 52698 + }, + { + "epoch": 4.917327610338714, + "grad_norm": NaN, + "learning_rate": 2.495014020648422e-05, + "loss": 0.0, + "step": 52699 + }, + { + "epoch": 4.917420920033591, + "grad_norm": NaN, + "learning_rate": 2.4945963119068445e-05, + "loss": 0.0, + "step": 52700 + }, + { + "epoch": 4.917514229728469, + "grad_norm": NaN, + "learning_rate": 2.494178634962802e-05, + "loss": 0.0, + "step": 52701 + }, + { + "epoch": 4.917607539423346, + "grad_norm": NaN, + "learning_rate": 2.4937609898173545e-05, + "loss": 0.0, + "step": 52702 + }, + { + "epoch": 4.917700849118224, + "grad_norm": NaN, + "learning_rate": 2.493343376471575e-05, + "loss": 0.0, + "step": 52703 + }, + { + "epoch": 4.9177941588131, + "grad_norm": NaN, + "learning_rate": 2.4929257949265156e-05, + "loss": 0.0, + "step": 52704 + }, + { + "epoch": 4.917887468507978, + "grad_norm": NaN, + "learning_rate": 2.4925082451832362e-05, + "loss": 0.0, + "step": 52705 + }, + { + "epoch": 4.917980778202855, + "grad_norm": NaN, + "learning_rate": 2.492090727242812e-05, + "loss": 0.0, + "step": 52706 + }, + { + "epoch": 4.9180740878977325, + "grad_norm": NaN, + "learning_rate": 2.4916732411062872e-05, + "loss": 0.0, + "step": 52707 + }, + { + "epoch": 4.91816739759261, + "grad_norm": NaN, + "learning_rate": 2.491255786774738e-05, + "loss": 0.0, + "step": 52708 + }, + { + "epoch": 4.918260707287487, + "grad_norm": NaN, + "learning_rate": 2.4908383642492214e-05, + "loss": 0.0, + "step": 52709 + }, + { + "epoch": 4.918354016982365, + "grad_norm": NaN, + "learning_rate": 2.4904209735307918e-05, + "loss": 0.0, + "step": 52710 + }, + { + "epoch": 4.918447326677242, + "grad_norm": NaN, + "learning_rate": 2.4900036146205182e-05, + "loss": 0.0, + "step": 52711 + }, + { + "epoch": 4.918540636372119, + "grad_norm": NaN, + "learning_rate": 2.489586287519465e-05, + "loss": 0.0, + "step": 52712 + }, + { + "epoch": 4.918633946066996, + "grad_norm": NaN, + "learning_rate": 2.4891689922286794e-05, + "loss": 0.0, + "step": 52713 + }, + { + "epoch": 4.9187272557618735, + "grad_norm": NaN, + "learning_rate": 2.4887517287492342e-05, + "loss": 0.0, + "step": 52714 + }, + { + "epoch": 4.918820565456751, + "grad_norm": NaN, + "learning_rate": 2.4883344970821916e-05, + "loss": 0.0, + "step": 52715 + }, + { + "epoch": 4.918913875151628, + "grad_norm": NaN, + "learning_rate": 2.487917297228601e-05, + "loss": 0.0, + "step": 52716 + }, + { + "epoch": 4.919007184846506, + "grad_norm": NaN, + "learning_rate": 2.4875001291895347e-05, + "loss": 0.0, + "step": 52717 + }, + { + "epoch": 4.919100494541383, + "grad_norm": NaN, + "learning_rate": 2.487082992966052e-05, + "loss": 0.0, + "step": 52718 + }, + { + "epoch": 4.91919380423626, + "grad_norm": NaN, + "learning_rate": 2.4866658885592034e-05, + "loss": 0.0, + "step": 52719 + }, + { + "epoch": 4.919287113931137, + "grad_norm": NaN, + "learning_rate": 2.48624881597006e-05, + "loss": 0.0, + "step": 52720 + }, + { + "epoch": 4.919380423626015, + "grad_norm": NaN, + "learning_rate": 2.4858317751996842e-05, + "loss": 0.0, + "step": 52721 + }, + { + "epoch": 4.919473733320892, + "grad_norm": NaN, + "learning_rate": 2.485414766249122e-05, + "loss": 0.0, + "step": 52722 + }, + { + "epoch": 4.919567043015769, + "grad_norm": NaN, + "learning_rate": 2.484997789119447e-05, + "loss": 0.0, + "step": 52723 + }, + { + "epoch": 4.919660352710647, + "grad_norm": NaN, + "learning_rate": 2.4845808438117194e-05, + "loss": 0.0, + "step": 52724 + }, + { + "epoch": 4.919753662405524, + "grad_norm": NaN, + "learning_rate": 2.484163930326989e-05, + "loss": 0.0, + "step": 52725 + }, + { + "epoch": 4.919846972100402, + "grad_norm": NaN, + "learning_rate": 2.4837470486663253e-05, + "loss": 0.0, + "step": 52726 + }, + { + "epoch": 4.919940281795278, + "grad_norm": NaN, + "learning_rate": 2.4833301988307873e-05, + "loss": 0.0, + "step": 52727 + }, + { + "epoch": 4.920033591490156, + "grad_norm": NaN, + "learning_rate": 2.482913380821428e-05, + "loss": 0.0, + "step": 52728 + }, + { + "epoch": 4.920126901185033, + "grad_norm": NaN, + "learning_rate": 2.4824965946393146e-05, + "loss": 0.0, + "step": 52729 + }, + { + "epoch": 4.9202202108799105, + "grad_norm": NaN, + "learning_rate": 2.4820798402855076e-05, + "loss": 0.0, + "step": 52730 + }, + { + "epoch": 4.920313520574788, + "grad_norm": NaN, + "learning_rate": 2.4816631177610565e-05, + "loss": 0.0, + "step": 52731 + }, + { + "epoch": 4.920406830269665, + "grad_norm": NaN, + "learning_rate": 2.481246427067032e-05, + "loss": 0.0, + "step": 52732 + }, + { + "epoch": 4.920500139964542, + "grad_norm": NaN, + "learning_rate": 2.4808297682044932e-05, + "loss": 0.0, + "step": 52733 + }, + { + "epoch": 4.920593449659419, + "grad_norm": NaN, + "learning_rate": 2.4804131411744877e-05, + "loss": 0.0, + "step": 52734 + }, + { + "epoch": 4.920686759354297, + "grad_norm": NaN, + "learning_rate": 2.479996545978088e-05, + "loss": 0.0, + "step": 52735 + }, + { + "epoch": 4.920780069049174, + "grad_norm": NaN, + "learning_rate": 2.4795799826163514e-05, + "loss": 0.0, + "step": 52736 + }, + { + "epoch": 4.9208733787440515, + "grad_norm": NaN, + "learning_rate": 2.4791634510903258e-05, + "loss": 0.0, + "step": 52737 + }, + { + "epoch": 4.920966688438929, + "grad_norm": NaN, + "learning_rate": 2.4787469514010832e-05, + "loss": 0.0, + "step": 52738 + }, + { + "epoch": 4.921059998133806, + "grad_norm": NaN, + "learning_rate": 2.4783304835496814e-05, + "loss": 0.0, + "step": 52739 + }, + { + "epoch": 4.921153307828684, + "grad_norm": NaN, + "learning_rate": 2.4779140475371696e-05, + "loss": 0.0, + "step": 52740 + }, + { + "epoch": 4.921246617523561, + "grad_norm": NaN, + "learning_rate": 2.4774976433646148e-05, + "loss": 0.0, + "step": 52741 + }, + { + "epoch": 4.921339927218438, + "grad_norm": NaN, + "learning_rate": 2.47708127103308e-05, + "loss": 0.0, + "step": 52742 + }, + { + "epoch": 4.921433236913315, + "grad_norm": NaN, + "learning_rate": 2.476664930543609e-05, + "loss": 0.0, + "step": 52743 + }, + { + "epoch": 4.9215265466081926, + "grad_norm": NaN, + "learning_rate": 2.4762486218972783e-05, + "loss": 0.0, + "step": 52744 + }, + { + "epoch": 4.92161985630307, + "grad_norm": NaN, + "learning_rate": 2.4758323450951277e-05, + "loss": 0.0, + "step": 52745 + }, + { + "epoch": 4.921713165997947, + "grad_norm": NaN, + "learning_rate": 2.475416100138232e-05, + "loss": 0.0, + "step": 52746 + }, + { + "epoch": 4.921806475692825, + "grad_norm": NaN, + "learning_rate": 2.4749998870276456e-05, + "loss": 0.0, + "step": 52747 + }, + { + "epoch": 4.921899785387701, + "grad_norm": NaN, + "learning_rate": 2.4745837057644167e-05, + "loss": 0.0, + "step": 52748 + }, + { + "epoch": 4.921993095082579, + "grad_norm": NaN, + "learning_rate": 2.4741675563496138e-05, + "loss": 0.0, + "step": 52749 + }, + { + "epoch": 4.922086404777456, + "grad_norm": NaN, + "learning_rate": 2.4737514387842973e-05, + "loss": 0.0, + "step": 52750 + }, + { + "epoch": 4.922179714472334, + "grad_norm": NaN, + "learning_rate": 2.4733353530695128e-05, + "loss": 0.0, + "step": 52751 + }, + { + "epoch": 4.922273024167211, + "grad_norm": NaN, + "learning_rate": 2.4729192992063296e-05, + "loss": 0.0, + "step": 52752 + }, + { + "epoch": 4.922366333862088, + "grad_norm": NaN, + "learning_rate": 2.4725032771958054e-05, + "loss": 0.0, + "step": 52753 + }, + { + "epoch": 4.922459643556966, + "grad_norm": NaN, + "learning_rate": 2.4720872870389873e-05, + "loss": 0.0, + "step": 52754 + }, + { + "epoch": 4.922552953251843, + "grad_norm": NaN, + "learning_rate": 2.471671328736943e-05, + "loss": 0.0, + "step": 52755 + }, + { + "epoch": 4.92264626294672, + "grad_norm": NaN, + "learning_rate": 2.471255402290732e-05, + "loss": 0.0, + "step": 52756 + }, + { + "epoch": 4.922739572641597, + "grad_norm": NaN, + "learning_rate": 2.4708395077014e-05, + "loss": 0.0, + "step": 52757 + }, + { + "epoch": 4.922832882336475, + "grad_norm": NaN, + "learning_rate": 2.4704236449700154e-05, + "loss": 0.0, + "step": 52758 + }, + { + "epoch": 4.922926192031352, + "grad_norm": NaN, + "learning_rate": 2.470007814097635e-05, + "loss": 0.0, + "step": 52759 + }, + { + "epoch": 4.9230195017262295, + "grad_norm": NaN, + "learning_rate": 2.4695920150853054e-05, + "loss": 0.0, + "step": 52760 + }, + { + "epoch": 4.923112811421107, + "grad_norm": NaN, + "learning_rate": 2.4691762479340965e-05, + "loss": 0.0, + "step": 52761 + }, + { + "epoch": 4.923206121115984, + "grad_norm": NaN, + "learning_rate": 2.468760512645064e-05, + "loss": 0.0, + "step": 52762 + }, + { + "epoch": 4.923299430810861, + "grad_norm": NaN, + "learning_rate": 2.4683448092192533e-05, + "loss": 0.0, + "step": 52763 + }, + { + "epoch": 4.923392740505738, + "grad_norm": NaN, + "learning_rate": 2.4679291376577338e-05, + "loss": 0.0, + "step": 52764 + }, + { + "epoch": 4.923486050200616, + "grad_norm": NaN, + "learning_rate": 2.4675134979615613e-05, + "loss": 0.0, + "step": 52765 + }, + { + "epoch": 4.923579359895493, + "grad_norm": NaN, + "learning_rate": 2.4670978901317816e-05, + "loss": 0.0, + "step": 52766 + }, + { + "epoch": 4.9236726695903705, + "grad_norm": NaN, + "learning_rate": 2.466682314169464e-05, + "loss": 0.0, + "step": 52767 + }, + { + "epoch": 4.923765979285248, + "grad_norm": NaN, + "learning_rate": 2.4662667700756646e-05, + "loss": 0.0, + "step": 52768 + }, + { + "epoch": 4.923859288980125, + "grad_norm": NaN, + "learning_rate": 2.4658512578514285e-05, + "loss": 0.0, + "step": 52769 + }, + { + "epoch": 4.923952598675003, + "grad_norm": NaN, + "learning_rate": 2.4654357774978234e-05, + "loss": 0.0, + "step": 52770 + }, + { + "epoch": 4.924045908369879, + "grad_norm": NaN, + "learning_rate": 2.4650203290159056e-05, + "loss": 0.0, + "step": 52771 + }, + { + "epoch": 4.924139218064757, + "grad_norm": NaN, + "learning_rate": 2.4646049124067218e-05, + "loss": 0.0, + "step": 52772 + }, + { + "epoch": 4.924232527759634, + "grad_norm": NaN, + "learning_rate": 2.464189527671337e-05, + "loss": 0.0, + "step": 52773 + }, + { + "epoch": 4.924325837454512, + "grad_norm": NaN, + "learning_rate": 2.463774174810808e-05, + "loss": 0.0, + "step": 52774 + }, + { + "epoch": 4.924419147149389, + "grad_norm": NaN, + "learning_rate": 2.4633588538261812e-05, + "loss": 0.0, + "step": 52775 + }, + { + "epoch": 4.924512456844266, + "grad_norm": NaN, + "learning_rate": 2.4629435647185235e-05, + "loss": 0.0, + "step": 52776 + }, + { + "epoch": 4.924605766539143, + "grad_norm": NaN, + "learning_rate": 2.4625283074888896e-05, + "loss": 0.0, + "step": 52777 + }, + { + "epoch": 4.92469907623402, + "grad_norm": NaN, + "learning_rate": 2.462113082138325e-05, + "loss": 0.0, + "step": 52778 + }, + { + "epoch": 4.924792385928898, + "grad_norm": NaN, + "learning_rate": 2.4616978886678956e-05, + "loss": 0.0, + "step": 52779 + }, + { + "epoch": 4.924885695623775, + "grad_norm": NaN, + "learning_rate": 2.4612827270786557e-05, + "loss": 0.0, + "step": 52780 + }, + { + "epoch": 4.924979005318653, + "grad_norm": NaN, + "learning_rate": 2.460867597371659e-05, + "loss": 0.0, + "step": 52781 + }, + { + "epoch": 4.92507231501353, + "grad_norm": NaN, + "learning_rate": 2.460452499547962e-05, + "loss": 0.0, + "step": 52782 + }, + { + "epoch": 4.9251656247084075, + "grad_norm": NaN, + "learning_rate": 2.460037433608618e-05, + "loss": 0.0, + "step": 52783 + }, + { + "epoch": 4.925258934403285, + "grad_norm": NaN, + "learning_rate": 2.4596223995546872e-05, + "loss": 0.0, + "step": 52784 + }, + { + "epoch": 4.925352244098161, + "grad_norm": NaN, + "learning_rate": 2.4592073973872195e-05, + "loss": 0.0, + "step": 52785 + }, + { + "epoch": 4.925445553793039, + "grad_norm": NaN, + "learning_rate": 2.458792427107273e-05, + "loss": 0.0, + "step": 52786 + }, + { + "epoch": 4.925538863487916, + "grad_norm": NaN, + "learning_rate": 2.458377488715903e-05, + "loss": 0.0, + "step": 52787 + }, + { + "epoch": 4.925632173182794, + "grad_norm": NaN, + "learning_rate": 2.457962582214164e-05, + "loss": 0.0, + "step": 52788 + }, + { + "epoch": 4.925725482877671, + "grad_norm": NaN, + "learning_rate": 2.4575477076031103e-05, + "loss": 0.0, + "step": 52789 + }, + { + "epoch": 4.9258187925725485, + "grad_norm": NaN, + "learning_rate": 2.4571328648837974e-05, + "loss": 0.0, + "step": 52790 + }, + { + "epoch": 4.925912102267426, + "grad_norm": NaN, + "learning_rate": 2.4567180540572845e-05, + "loss": 0.0, + "step": 52791 + }, + { + "epoch": 4.9260054119623025, + "grad_norm": NaN, + "learning_rate": 2.456303275124614e-05, + "loss": 0.0, + "step": 52792 + }, + { + "epoch": 4.92609872165718, + "grad_norm": NaN, + "learning_rate": 2.455888528086854e-05, + "loss": 0.0, + "step": 52793 + }, + { + "epoch": 4.926192031352057, + "grad_norm": NaN, + "learning_rate": 2.4554738129450546e-05, + "loss": 0.0, + "step": 52794 + }, + { + "epoch": 4.926285341046935, + "grad_norm": NaN, + "learning_rate": 2.4550591297002637e-05, + "loss": 0.0, + "step": 52795 + }, + { + "epoch": 4.926378650741812, + "grad_norm": NaN, + "learning_rate": 2.454644478353545e-05, + "loss": 0.0, + "step": 52796 + }, + { + "epoch": 4.92647196043669, + "grad_norm": NaN, + "learning_rate": 2.4542298589059517e-05, + "loss": 0.0, + "step": 52797 + }, + { + "epoch": 4.926565270131567, + "grad_norm": NaN, + "learning_rate": 2.4538152713585292e-05, + "loss": 0.0, + "step": 52798 + }, + { + "epoch": 4.926658579826444, + "grad_norm": NaN, + "learning_rate": 2.4534007157123414e-05, + "loss": 0.0, + "step": 52799 + }, + { + "epoch": 4.926751889521321, + "grad_norm": NaN, + "learning_rate": 2.452986191968443e-05, + "loss": 0.0, + "step": 52800 + }, + { + "epoch": 4.926845199216198, + "grad_norm": NaN, + "learning_rate": 2.452571700127876e-05, + "loss": 0.0, + "step": 52801 + }, + { + "epoch": 4.926938508911076, + "grad_norm": NaN, + "learning_rate": 2.452157240191707e-05, + "loss": 0.0, + "step": 52802 + }, + { + "epoch": 4.927031818605953, + "grad_norm": NaN, + "learning_rate": 2.451742812160989e-05, + "loss": 0.0, + "step": 52803 + }, + { + "epoch": 4.927125128300831, + "grad_norm": NaN, + "learning_rate": 2.4513284160367634e-05, + "loss": 0.0, + "step": 52804 + }, + { + "epoch": 4.927218437995708, + "grad_norm": NaN, + "learning_rate": 2.4509140518200966e-05, + "loss": 0.0, + "step": 52805 + }, + { + "epoch": 4.9273117476905846, + "grad_norm": NaN, + "learning_rate": 2.4504997195120418e-05, + "loss": 0.0, + "step": 52806 + }, + { + "epoch": 4.927405057385462, + "grad_norm": NaN, + "learning_rate": 2.4500854191136405e-05, + "loss": 0.0, + "step": 52807 + }, + { + "epoch": 4.927498367080339, + "grad_norm": NaN, + "learning_rate": 2.4496711506259597e-05, + "loss": 0.0, + "step": 52808 + }, + { + "epoch": 4.927591676775217, + "grad_norm": NaN, + "learning_rate": 2.4492569140500508e-05, + "loss": 0.0, + "step": 52809 + }, + { + "epoch": 4.927684986470094, + "grad_norm": NaN, + "learning_rate": 2.4488427093869555e-05, + "loss": 0.0, + "step": 52810 + }, + { + "epoch": 4.927778296164972, + "grad_norm": NaN, + "learning_rate": 2.4484285366377387e-05, + "loss": 0.0, + "step": 52811 + }, + { + "epoch": 4.927871605859849, + "grad_norm": NaN, + "learning_rate": 2.4480143958034542e-05, + "loss": 0.0, + "step": 52812 + }, + { + "epoch": 4.9279649155547265, + "grad_norm": NaN, + "learning_rate": 2.4476002868851445e-05, + "loss": 0.0, + "step": 52813 + }, + { + "epoch": 4.928058225249604, + "grad_norm": NaN, + "learning_rate": 2.4471862098838724e-05, + "loss": 0.0, + "step": 52814 + }, + { + "epoch": 4.92815153494448, + "grad_norm": NaN, + "learning_rate": 2.44677216480069e-05, + "loss": 0.0, + "step": 52815 + }, + { + "epoch": 4.928244844639358, + "grad_norm": NaN, + "learning_rate": 2.4463581516366415e-05, + "loss": 0.0, + "step": 52816 + }, + { + "epoch": 4.928338154334235, + "grad_norm": NaN, + "learning_rate": 2.445944170392788e-05, + "loss": 0.0, + "step": 52817 + }, + { + "epoch": 4.928431464029113, + "grad_norm": NaN, + "learning_rate": 2.44553022107018e-05, + "loss": 0.0, + "step": 52818 + }, + { + "epoch": 4.92852477372399, + "grad_norm": NaN, + "learning_rate": 2.44511630366987e-05, + "loss": 0.0, + "step": 52819 + }, + { + "epoch": 4.9286180834188675, + "grad_norm": NaN, + "learning_rate": 2.4447024181929108e-05, + "loss": 0.0, + "step": 52820 + }, + { + "epoch": 4.928711393113744, + "grad_norm": NaN, + "learning_rate": 2.4442885646403532e-05, + "loss": 0.0, + "step": 52821 + }, + { + "epoch": 4.9288047028086215, + "grad_norm": NaN, + "learning_rate": 2.443874743013251e-05, + "loss": 0.0, + "step": 52822 + }, + { + "epoch": 4.928898012503499, + "grad_norm": NaN, + "learning_rate": 2.4434609533126555e-05, + "loss": 0.0, + "step": 52823 + }, + { + "epoch": 4.928991322198376, + "grad_norm": NaN, + "learning_rate": 2.4430471955396186e-05, + "loss": 0.0, + "step": 52824 + }, + { + "epoch": 4.929084631893254, + "grad_norm": NaN, + "learning_rate": 2.442633469695195e-05, + "loss": 0.0, + "step": 52825 + }, + { + "epoch": 4.929177941588131, + "grad_norm": NaN, + "learning_rate": 2.4422197757804335e-05, + "loss": 0.0, + "step": 52826 + }, + { + "epoch": 4.929271251283009, + "grad_norm": NaN, + "learning_rate": 2.441806113796387e-05, + "loss": 0.0, + "step": 52827 + }, + { + "epoch": 4.929364560977886, + "grad_norm": NaN, + "learning_rate": 2.441392483744109e-05, + "loss": 0.0, + "step": 52828 + }, + { + "epoch": 4.9294578706727625, + "grad_norm": NaN, + "learning_rate": 2.4409788856246475e-05, + "loss": 0.0, + "step": 52829 + }, + { + "epoch": 4.92955118036764, + "grad_norm": NaN, + "learning_rate": 2.4405653194390586e-05, + "loss": 0.0, + "step": 52830 + }, + { + "epoch": 4.929644490062517, + "grad_norm": NaN, + "learning_rate": 2.4401517851883905e-05, + "loss": 0.0, + "step": 52831 + }, + { + "epoch": 4.929737799757395, + "grad_norm": NaN, + "learning_rate": 2.4397382828736966e-05, + "loss": 0.0, + "step": 52832 + }, + { + "epoch": 4.929831109452272, + "grad_norm": NaN, + "learning_rate": 2.4393248124960257e-05, + "loss": 0.0, + "step": 52833 + }, + { + "epoch": 4.92992441914715, + "grad_norm": NaN, + "learning_rate": 2.4389113740564337e-05, + "loss": 0.0, + "step": 52834 + }, + { + "epoch": 4.930017728842027, + "grad_norm": NaN, + "learning_rate": 2.4384979675559713e-05, + "loss": 0.0, + "step": 52835 + }, + { + "epoch": 4.930111038536904, + "grad_norm": NaN, + "learning_rate": 2.438084592995681e-05, + "loss": 0.0, + "step": 52836 + }, + { + "epoch": 4.930204348231781, + "grad_norm": NaN, + "learning_rate": 2.4376712503766238e-05, + "loss": 0.0, + "step": 52837 + }, + { + "epoch": 4.930297657926658, + "grad_norm": NaN, + "learning_rate": 2.4372579396998505e-05, + "loss": 0.0, + "step": 52838 + }, + { + "epoch": 4.930390967621536, + "grad_norm": NaN, + "learning_rate": 2.4368446609664017e-05, + "loss": 0.0, + "step": 52839 + }, + { + "epoch": 4.930484277316413, + "grad_norm": NaN, + "learning_rate": 2.43643141417734e-05, + "loss": 0.0, + "step": 52840 + }, + { + "epoch": 4.930577587011291, + "grad_norm": NaN, + "learning_rate": 2.4360181993337146e-05, + "loss": 0.0, + "step": 52841 + }, + { + "epoch": 4.930670896706168, + "grad_norm": NaN, + "learning_rate": 2.4356050164365665e-05, + "loss": 0.0, + "step": 52842 + }, + { + "epoch": 4.9307642064010455, + "grad_norm": NaN, + "learning_rate": 2.4351918654869563e-05, + "loss": 0.0, + "step": 52843 + }, + { + "epoch": 4.930857516095922, + "grad_norm": NaN, + "learning_rate": 2.4347787464859347e-05, + "loss": 0.0, + "step": 52844 + }, + { + "epoch": 4.9309508257907995, + "grad_norm": NaN, + "learning_rate": 2.4343656594345413e-05, + "loss": 0.0, + "step": 52845 + }, + { + "epoch": 4.931044135485677, + "grad_norm": NaN, + "learning_rate": 2.433952604333838e-05, + "loss": 0.0, + "step": 52846 + }, + { + "epoch": 4.931137445180554, + "grad_norm": NaN, + "learning_rate": 2.4335395811848744e-05, + "loss": 0.0, + "step": 52847 + }, + { + "epoch": 4.931230754875432, + "grad_norm": NaN, + "learning_rate": 2.4331265899886898e-05, + "loss": 0.0, + "step": 52848 + }, + { + "epoch": 4.931324064570309, + "grad_norm": NaN, + "learning_rate": 2.4327136307463447e-05, + "loss": 0.0, + "step": 52849 + }, + { + "epoch": 4.931417374265186, + "grad_norm": NaN, + "learning_rate": 2.43230070345889e-05, + "loss": 0.0, + "step": 52850 + }, + { + "epoch": 4.931510683960063, + "grad_norm": NaN, + "learning_rate": 2.4318878081273646e-05, + "loss": 0.0, + "step": 52851 + }, + { + "epoch": 4.9316039936549405, + "grad_norm": NaN, + "learning_rate": 2.43147494475283e-05, + "loss": 0.0, + "step": 52852 + }, + { + "epoch": 4.931697303349818, + "grad_norm": NaN, + "learning_rate": 2.4310621133363343e-05, + "loss": 0.0, + "step": 52853 + }, + { + "epoch": 4.931790613044695, + "grad_norm": NaN, + "learning_rate": 2.430649313878918e-05, + "loss": 0.0, + "step": 52854 + }, + { + "epoch": 4.931883922739573, + "grad_norm": NaN, + "learning_rate": 2.430236546381639e-05, + "loss": 0.0, + "step": 52855 + }, + { + "epoch": 4.93197723243445, + "grad_norm": NaN, + "learning_rate": 2.429823810845546e-05, + "loss": 0.0, + "step": 52856 + }, + { + "epoch": 4.932070542129328, + "grad_norm": NaN, + "learning_rate": 2.4294111072716886e-05, + "loss": 0.0, + "step": 52857 + }, + { + "epoch": 4.932163851824205, + "grad_norm": NaN, + "learning_rate": 2.4289984356611136e-05, + "loss": 0.0, + "step": 52858 + }, + { + "epoch": 4.932257161519082, + "grad_norm": NaN, + "learning_rate": 2.428585796014873e-05, + "loss": 0.0, + "step": 52859 + }, + { + "epoch": 4.932350471213959, + "grad_norm": NaN, + "learning_rate": 2.4281731883340144e-05, + "loss": 0.0, + "step": 52860 + }, + { + "epoch": 4.932443780908836, + "grad_norm": NaN, + "learning_rate": 2.427760612619587e-05, + "loss": 0.0, + "step": 52861 + }, + { + "epoch": 4.932537090603714, + "grad_norm": NaN, + "learning_rate": 2.42734806887264e-05, + "loss": 0.0, + "step": 52862 + }, + { + "epoch": 4.932630400298591, + "grad_norm": NaN, + "learning_rate": 2.426935557094222e-05, + "loss": 0.0, + "step": 52863 + }, + { + "epoch": 4.932723709993469, + "grad_norm": NaN, + "learning_rate": 2.426523077285385e-05, + "loss": 0.0, + "step": 52864 + }, + { + "epoch": 4.932817019688345, + "grad_norm": NaN, + "learning_rate": 2.4261106294471723e-05, + "loss": 0.0, + "step": 52865 + }, + { + "epoch": 4.932910329383223, + "grad_norm": NaN, + "learning_rate": 2.425698213580638e-05, + "loss": 0.0, + "step": 52866 + }, + { + "epoch": 4.9330036390781, + "grad_norm": NaN, + "learning_rate": 2.4252858296868265e-05, + "loss": 0.0, + "step": 52867 + }, + { + "epoch": 4.933096948772977, + "grad_norm": NaN, + "learning_rate": 2.4248734777667888e-05, + "loss": 0.0, + "step": 52868 + }, + { + "epoch": 4.933190258467855, + "grad_norm": NaN, + "learning_rate": 2.424461157821574e-05, + "loss": 0.0, + "step": 52869 + }, + { + "epoch": 4.933283568162732, + "grad_norm": NaN, + "learning_rate": 2.4240488698522274e-05, + "loss": 0.0, + "step": 52870 + }, + { + "epoch": 4.93337687785761, + "grad_norm": NaN, + "learning_rate": 2.4236366138598006e-05, + "loss": 0.0, + "step": 52871 + }, + { + "epoch": 4.933470187552487, + "grad_norm": NaN, + "learning_rate": 2.4232243898453406e-05, + "loss": 0.0, + "step": 52872 + }, + { + "epoch": 4.933563497247364, + "grad_norm": NaN, + "learning_rate": 2.4228121978098936e-05, + "loss": 0.0, + "step": 52873 + }, + { + "epoch": 4.933656806942241, + "grad_norm": NaN, + "learning_rate": 2.4224000377545116e-05, + "loss": 0.0, + "step": 52874 + }, + { + "epoch": 4.9337501166371185, + "grad_norm": NaN, + "learning_rate": 2.4219879096802392e-05, + "loss": 0.0, + "step": 52875 + }, + { + "epoch": 4.933843426331996, + "grad_norm": NaN, + "learning_rate": 2.421575813588127e-05, + "loss": 0.0, + "step": 52876 + }, + { + "epoch": 4.933936736026873, + "grad_norm": NaN, + "learning_rate": 2.4211637494792207e-05, + "loss": 0.0, + "step": 52877 + }, + { + "epoch": 4.934030045721751, + "grad_norm": NaN, + "learning_rate": 2.4207517173545682e-05, + "loss": 0.0, + "step": 52878 + }, + { + "epoch": 4.934123355416628, + "grad_norm": NaN, + "learning_rate": 2.4203397172152234e-05, + "loss": 0.0, + "step": 52879 + }, + { + "epoch": 4.934216665111505, + "grad_norm": NaN, + "learning_rate": 2.419927749062219e-05, + "loss": 0.0, + "step": 52880 + }, + { + "epoch": 4.934309974806382, + "grad_norm": NaN, + "learning_rate": 2.419515812896617e-05, + "loss": 0.0, + "step": 52881 + }, + { + "epoch": 4.9344032845012595, + "grad_norm": NaN, + "learning_rate": 2.4191039087194618e-05, + "loss": 0.0, + "step": 52882 + }, + { + "epoch": 4.934496594196137, + "grad_norm": NaN, + "learning_rate": 2.418692036531793e-05, + "loss": 0.0, + "step": 52883 + }, + { + "epoch": 4.934589903891014, + "grad_norm": NaN, + "learning_rate": 2.418280196334666e-05, + "loss": 0.0, + "step": 52884 + }, + { + "epoch": 4.934683213585892, + "grad_norm": NaN, + "learning_rate": 2.41786838812913e-05, + "loss": 0.0, + "step": 52885 + }, + { + "epoch": 4.934776523280769, + "grad_norm": NaN, + "learning_rate": 2.4174566119162192e-05, + "loss": 0.0, + "step": 52886 + }, + { + "epoch": 4.934869832975647, + "grad_norm": NaN, + "learning_rate": 2.4170448676969927e-05, + "loss": 0.0, + "step": 52887 + }, + { + "epoch": 4.934963142670523, + "grad_norm": NaN, + "learning_rate": 2.416633155472498e-05, + "loss": 0.0, + "step": 52888 + }, + { + "epoch": 4.935056452365401, + "grad_norm": NaN, + "learning_rate": 2.4162214752437713e-05, + "loss": 0.0, + "step": 52889 + }, + { + "epoch": 4.935149762060278, + "grad_norm": NaN, + "learning_rate": 2.415809827011868e-05, + "loss": 0.0, + "step": 52890 + }, + { + "epoch": 4.935243071755155, + "grad_norm": NaN, + "learning_rate": 2.4153982107778373e-05, + "loss": 0.0, + "step": 52891 + }, + { + "epoch": 4.935336381450033, + "grad_norm": NaN, + "learning_rate": 2.4149866265427136e-05, + "loss": 0.0, + "step": 52892 + }, + { + "epoch": 4.93542969114491, + "grad_norm": NaN, + "learning_rate": 2.414575074307554e-05, + "loss": 0.0, + "step": 52893 + }, + { + "epoch": 4.935523000839787, + "grad_norm": NaN, + "learning_rate": 2.4141635540734018e-05, + "loss": 0.0, + "step": 52894 + }, + { + "epoch": 4.935616310534664, + "grad_norm": NaN, + "learning_rate": 2.4137520658413033e-05, + "loss": 0.0, + "step": 52895 + }, + { + "epoch": 4.935709620229542, + "grad_norm": NaN, + "learning_rate": 2.413340609612307e-05, + "loss": 0.0, + "step": 52896 + }, + { + "epoch": 4.935802929924419, + "grad_norm": NaN, + "learning_rate": 2.412929185387455e-05, + "loss": 0.0, + "step": 52897 + }, + { + "epoch": 4.9358962396192965, + "grad_norm": NaN, + "learning_rate": 2.4125177931677964e-05, + "loss": 0.0, + "step": 52898 + }, + { + "epoch": 4.935989549314174, + "grad_norm": NaN, + "learning_rate": 2.4121064329543755e-05, + "loss": 0.0, + "step": 52899 + }, + { + "epoch": 4.936082859009051, + "grad_norm": NaN, + "learning_rate": 2.4116951047482396e-05, + "loss": 0.0, + "step": 52900 + }, + { + "epoch": 4.936176168703929, + "grad_norm": NaN, + "learning_rate": 2.411283808550435e-05, + "loss": 0.0, + "step": 52901 + }, + { + "epoch": 4.936269478398805, + "grad_norm": NaN, + "learning_rate": 2.4108725443620052e-05, + "loss": 0.0, + "step": 52902 + }, + { + "epoch": 4.936362788093683, + "grad_norm": NaN, + "learning_rate": 2.4104613121839982e-05, + "loss": 0.0, + "step": 52903 + }, + { + "epoch": 4.93645609778856, + "grad_norm": NaN, + "learning_rate": 2.410050112017458e-05, + "loss": 0.0, + "step": 52904 + }, + { + "epoch": 4.9365494074834375, + "grad_norm": NaN, + "learning_rate": 2.4096389438634323e-05, + "loss": 0.0, + "step": 52905 + }, + { + "epoch": 4.936642717178315, + "grad_norm": NaN, + "learning_rate": 2.4092278077229648e-05, + "loss": 0.0, + "step": 52906 + }, + { + "epoch": 4.936736026873192, + "grad_norm": NaN, + "learning_rate": 2.4088167035971e-05, + "loss": 0.0, + "step": 52907 + }, + { + "epoch": 4.93682933656807, + "grad_norm": NaN, + "learning_rate": 2.4084056314868853e-05, + "loss": 0.0, + "step": 52908 + }, + { + "epoch": 4.936922646262946, + "grad_norm": NaN, + "learning_rate": 2.407994591393367e-05, + "loss": 0.0, + "step": 52909 + }, + { + "epoch": 4.937015955957824, + "grad_norm": NaN, + "learning_rate": 2.4075835833175868e-05, + "loss": 0.0, + "step": 52910 + }, + { + "epoch": 4.937109265652701, + "grad_norm": NaN, + "learning_rate": 2.4071726072605906e-05, + "loss": 0.0, + "step": 52911 + }, + { + "epoch": 4.937202575347579, + "grad_norm": NaN, + "learning_rate": 2.406761663223425e-05, + "loss": 0.0, + "step": 52912 + }, + { + "epoch": 4.937295885042456, + "grad_norm": NaN, + "learning_rate": 2.4063507512071332e-05, + "loss": 0.0, + "step": 52913 + }, + { + "epoch": 4.937389194737333, + "grad_norm": NaN, + "learning_rate": 2.4059398712127636e-05, + "loss": 0.0, + "step": 52914 + }, + { + "epoch": 4.937482504432211, + "grad_norm": NaN, + "learning_rate": 2.4055290232413565e-05, + "loss": 0.0, + "step": 52915 + }, + { + "epoch": 4.937575814127088, + "grad_norm": NaN, + "learning_rate": 2.4051182072939578e-05, + "loss": 0.0, + "step": 52916 + }, + { + "epoch": 4.937669123821965, + "grad_norm": NaN, + "learning_rate": 2.4047074233716134e-05, + "loss": 0.0, + "step": 52917 + }, + { + "epoch": 4.937762433516842, + "grad_norm": NaN, + "learning_rate": 2.404296671475366e-05, + "loss": 0.0, + "step": 52918 + }, + { + "epoch": 4.93785574321172, + "grad_norm": NaN, + "learning_rate": 2.4038859516062625e-05, + "loss": 0.0, + "step": 52919 + }, + { + "epoch": 4.937949052906597, + "grad_norm": NaN, + "learning_rate": 2.4034752637653444e-05, + "loss": 0.0, + "step": 52920 + }, + { + "epoch": 4.9380423626014744, + "grad_norm": NaN, + "learning_rate": 2.4030646079536586e-05, + "loss": 0.0, + "step": 52921 + }, + { + "epoch": 4.938135672296352, + "grad_norm": NaN, + "learning_rate": 2.4026539841722482e-05, + "loss": 0.0, + "step": 52922 + }, + { + "epoch": 4.938228981991228, + "grad_norm": NaN, + "learning_rate": 2.402243392422155e-05, + "loss": 0.0, + "step": 52923 + }, + { + "epoch": 4.938322291686106, + "grad_norm": NaN, + "learning_rate": 2.4018328327044273e-05, + "loss": 0.0, + "step": 52924 + }, + { + "epoch": 4.938415601380983, + "grad_norm": NaN, + "learning_rate": 2.4014223050201055e-05, + "loss": 0.0, + "step": 52925 + }, + { + "epoch": 4.938508911075861, + "grad_norm": NaN, + "learning_rate": 2.4010118093702384e-05, + "loss": 0.0, + "step": 52926 + }, + { + "epoch": 4.938602220770738, + "grad_norm": NaN, + "learning_rate": 2.4006013457558608e-05, + "loss": 0.0, + "step": 52927 + }, + { + "epoch": 4.9386955304656155, + "grad_norm": NaN, + "learning_rate": 2.4001909141780233e-05, + "loss": 0.0, + "step": 52928 + }, + { + "epoch": 4.938788840160493, + "grad_norm": NaN, + "learning_rate": 2.3997805146377735e-05, + "loss": 0.0, + "step": 52929 + }, + { + "epoch": 4.93888214985537, + "grad_norm": NaN, + "learning_rate": 2.3993701471361404e-05, + "loss": 0.0, + "step": 52930 + }, + { + "epoch": 4.938975459550248, + "grad_norm": NaN, + "learning_rate": 2.3989598116741797e-05, + "loss": 0.0, + "step": 52931 + }, + { + "epoch": 4.939068769245124, + "grad_norm": NaN, + "learning_rate": 2.3985495082529326e-05, + "loss": 0.0, + "step": 52932 + }, + { + "epoch": 4.939162078940002, + "grad_norm": NaN, + "learning_rate": 2.3981392368734413e-05, + "loss": 0.0, + "step": 52933 + }, + { + "epoch": 4.939255388634879, + "grad_norm": NaN, + "learning_rate": 2.3977289975367504e-05, + "loss": 0.0, + "step": 52934 + }, + { + "epoch": 4.9393486983297565, + "grad_norm": NaN, + "learning_rate": 2.3973187902438982e-05, + "loss": 0.0, + "step": 52935 + }, + { + "epoch": 4.939442008024634, + "grad_norm": NaN, + "learning_rate": 2.3969086149959332e-05, + "loss": 0.0, + "step": 52936 + }, + { + "epoch": 4.939535317719511, + "grad_norm": NaN, + "learning_rate": 2.3964984717938975e-05, + "loss": 0.0, + "step": 52937 + }, + { + "epoch": 4.939628627414388, + "grad_norm": NaN, + "learning_rate": 2.3960883606388298e-05, + "loss": 0.0, + "step": 52938 + }, + { + "epoch": 4.939721937109265, + "grad_norm": NaN, + "learning_rate": 2.3956782815317782e-05, + "loss": 0.0, + "step": 52939 + }, + { + "epoch": 4.939815246804143, + "grad_norm": NaN, + "learning_rate": 2.3952682344737818e-05, + "loss": 0.0, + "step": 52940 + }, + { + "epoch": 4.93990855649902, + "grad_norm": NaN, + "learning_rate": 2.3948582194658844e-05, + "loss": 0.0, + "step": 52941 + }, + { + "epoch": 4.940001866193898, + "grad_norm": NaN, + "learning_rate": 2.3944482365091304e-05, + "loss": 0.0, + "step": 52942 + }, + { + "epoch": 4.940095175888775, + "grad_norm": NaN, + "learning_rate": 2.394038285604559e-05, + "loss": 0.0, + "step": 52943 + }, + { + "epoch": 4.940188485583652, + "grad_norm": NaN, + "learning_rate": 2.393628366753213e-05, + "loss": 0.0, + "step": 52944 + }, + { + "epoch": 4.94028179527853, + "grad_norm": NaN, + "learning_rate": 2.3932184799561376e-05, + "loss": 0.0, + "step": 52945 + }, + { + "epoch": 4.940375104973406, + "grad_norm": NaN, + "learning_rate": 2.3928086252143725e-05, + "loss": 0.0, + "step": 52946 + }, + { + "epoch": 4.940468414668284, + "grad_norm": NaN, + "learning_rate": 2.3923988025289616e-05, + "loss": 0.0, + "step": 52947 + }, + { + "epoch": 4.940561724363161, + "grad_norm": NaN, + "learning_rate": 2.3919890119009437e-05, + "loss": 0.0, + "step": 52948 + }, + { + "epoch": 4.940655034058039, + "grad_norm": NaN, + "learning_rate": 2.3915792533313655e-05, + "loss": 0.0, + "step": 52949 + }, + { + "epoch": 4.940748343752916, + "grad_norm": NaN, + "learning_rate": 2.3911695268212637e-05, + "loss": 0.0, + "step": 52950 + }, + { + "epoch": 4.9408416534477935, + "grad_norm": NaN, + "learning_rate": 2.390759832371685e-05, + "loss": 0.0, + "step": 52951 + }, + { + "epoch": 4.940934963142671, + "grad_norm": NaN, + "learning_rate": 2.3903501699836675e-05, + "loss": 0.0, + "step": 52952 + }, + { + "epoch": 4.941028272837547, + "grad_norm": NaN, + "learning_rate": 2.389940539658256e-05, + "loss": 0.0, + "step": 52953 + }, + { + "epoch": 4.941121582532425, + "grad_norm": NaN, + "learning_rate": 2.389530941396488e-05, + "loss": 0.0, + "step": 52954 + }, + { + "epoch": 4.941214892227302, + "grad_norm": NaN, + "learning_rate": 2.3891213751994094e-05, + "loss": 0.0, + "step": 52955 + }, + { + "epoch": 4.94130820192218, + "grad_norm": NaN, + "learning_rate": 2.388711841068059e-05, + "loss": 0.0, + "step": 52956 + }, + { + "epoch": 4.941401511617057, + "grad_norm": NaN, + "learning_rate": 2.3883023390034777e-05, + "loss": 0.0, + "step": 52957 + }, + { + "epoch": 4.9414948213119345, + "grad_norm": NaN, + "learning_rate": 2.3878928690067082e-05, + "loss": 0.0, + "step": 52958 + }, + { + "epoch": 4.941588131006812, + "grad_norm": NaN, + "learning_rate": 2.3874834310787915e-05, + "loss": 0.0, + "step": 52959 + }, + { + "epoch": 4.941681440701689, + "grad_norm": NaN, + "learning_rate": 2.3870740252207683e-05, + "loss": 0.0, + "step": 52960 + }, + { + "epoch": 4.941774750396566, + "grad_norm": NaN, + "learning_rate": 2.386664651433679e-05, + "loss": 0.0, + "step": 52961 + }, + { + "epoch": 4.941868060091443, + "grad_norm": NaN, + "learning_rate": 2.3862553097185666e-05, + "loss": 0.0, + "step": 52962 + }, + { + "epoch": 4.941961369786321, + "grad_norm": NaN, + "learning_rate": 2.3858460000764684e-05, + "loss": 0.0, + "step": 52963 + }, + { + "epoch": 4.942054679481198, + "grad_norm": NaN, + "learning_rate": 2.3854367225084287e-05, + "loss": 0.0, + "step": 52964 + }, + { + "epoch": 4.942147989176076, + "grad_norm": NaN, + "learning_rate": 2.3850274770154865e-05, + "loss": 0.0, + "step": 52965 + }, + { + "epoch": 4.942241298870953, + "grad_norm": NaN, + "learning_rate": 2.3846182635986828e-05, + "loss": 0.0, + "step": 52966 + }, + { + "epoch": 4.9423346085658295, + "grad_norm": NaN, + "learning_rate": 2.3842090822590564e-05, + "loss": 0.0, + "step": 52967 + }, + { + "epoch": 4.942427918260707, + "grad_norm": NaN, + "learning_rate": 2.3837999329976503e-05, + "loss": 0.0, + "step": 52968 + }, + { + "epoch": 4.942521227955584, + "grad_norm": NaN, + "learning_rate": 2.3833908158155035e-05, + "loss": 0.0, + "step": 52969 + }, + { + "epoch": 4.942614537650462, + "grad_norm": NaN, + "learning_rate": 2.3829817307136566e-05, + "loss": 0.0, + "step": 52970 + }, + { + "epoch": 4.942707847345339, + "grad_norm": NaN, + "learning_rate": 2.382572677693151e-05, + "loss": 0.0, + "step": 52971 + }, + { + "epoch": 4.942801157040217, + "grad_norm": NaN, + "learning_rate": 2.382163656755025e-05, + "loss": 0.0, + "step": 52972 + }, + { + "epoch": 4.942894466735094, + "grad_norm": NaN, + "learning_rate": 2.3817546679003186e-05, + "loss": 0.0, + "step": 52973 + }, + { + "epoch": 4.9429877764299714, + "grad_norm": NaN, + "learning_rate": 2.381345711130072e-05, + "loss": 0.0, + "step": 52974 + }, + { + "epoch": 4.943081086124849, + "grad_norm": NaN, + "learning_rate": 2.380936786445325e-05, + "loss": 0.0, + "step": 52975 + }, + { + "epoch": 4.943174395819725, + "grad_norm": NaN, + "learning_rate": 2.3805278938471178e-05, + "loss": 0.0, + "step": 52976 + }, + { + "epoch": 4.943267705514603, + "grad_norm": NaN, + "learning_rate": 2.380119033336492e-05, + "loss": 0.0, + "step": 52977 + }, + { + "epoch": 4.94336101520948, + "grad_norm": NaN, + "learning_rate": 2.3797102049144823e-05, + "loss": 0.0, + "step": 52978 + }, + { + "epoch": 4.943454324904358, + "grad_norm": NaN, + "learning_rate": 2.3793014085821337e-05, + "loss": 0.0, + "step": 52979 + }, + { + "epoch": 4.943547634599235, + "grad_norm": NaN, + "learning_rate": 2.378892644340482e-05, + "loss": 0.0, + "step": 52980 + }, + { + "epoch": 4.9436409442941125, + "grad_norm": NaN, + "learning_rate": 2.378483912190568e-05, + "loss": 0.0, + "step": 52981 + }, + { + "epoch": 4.943734253988989, + "grad_norm": NaN, + "learning_rate": 2.378075212133429e-05, + "loss": 0.0, + "step": 52982 + }, + { + "epoch": 4.943827563683866, + "grad_norm": NaN, + "learning_rate": 2.3776665441701075e-05, + "loss": 0.0, + "step": 52983 + }, + { + "epoch": 4.943920873378744, + "grad_norm": NaN, + "learning_rate": 2.3772579083016414e-05, + "loss": 0.0, + "step": 52984 + }, + { + "epoch": 4.944014183073621, + "grad_norm": NaN, + "learning_rate": 2.3768493045290678e-05, + "loss": 0.0, + "step": 52985 + }, + { + "epoch": 4.944107492768499, + "grad_norm": NaN, + "learning_rate": 2.3764407328534273e-05, + "loss": 0.0, + "step": 52986 + }, + { + "epoch": 4.944200802463376, + "grad_norm": NaN, + "learning_rate": 2.37603219327576e-05, + "loss": 0.0, + "step": 52987 + }, + { + "epoch": 4.9442941121582535, + "grad_norm": NaN, + "learning_rate": 2.3756236857971022e-05, + "loss": 0.0, + "step": 52988 + }, + { + "epoch": 4.944387421853131, + "grad_norm": NaN, + "learning_rate": 2.3752152104184942e-05, + "loss": 0.0, + "step": 52989 + }, + { + "epoch": 4.9444807315480075, + "grad_norm": NaN, + "learning_rate": 2.3748067671409743e-05, + "loss": 0.0, + "step": 52990 + }, + { + "epoch": 4.944574041242885, + "grad_norm": NaN, + "learning_rate": 2.3743983559655805e-05, + "loss": 0.0, + "step": 52991 + }, + { + "epoch": 4.944667350937762, + "grad_norm": NaN, + "learning_rate": 2.3739899768933518e-05, + "loss": 0.0, + "step": 52992 + }, + { + "epoch": 4.94476066063264, + "grad_norm": NaN, + "learning_rate": 2.3735816299253275e-05, + "loss": 0.0, + "step": 52993 + }, + { + "epoch": 4.944853970327517, + "grad_norm": NaN, + "learning_rate": 2.373173315062543e-05, + "loss": 0.0, + "step": 52994 + }, + { + "epoch": 4.944947280022395, + "grad_norm": NaN, + "learning_rate": 2.37276503230604e-05, + "loss": 0.0, + "step": 52995 + }, + { + "epoch": 4.945040589717272, + "grad_norm": NaN, + "learning_rate": 2.372356781656855e-05, + "loss": 0.0, + "step": 52996 + }, + { + "epoch": 4.9451338994121485, + "grad_norm": NaN, + "learning_rate": 2.3719485631160246e-05, + "loss": 0.0, + "step": 52997 + }, + { + "epoch": 4.945227209107026, + "grad_norm": NaN, + "learning_rate": 2.371540376684589e-05, + "loss": 0.0, + "step": 52998 + }, + { + "epoch": 4.945320518801903, + "grad_norm": NaN, + "learning_rate": 2.3711322223635866e-05, + "loss": 0.0, + "step": 52999 + }, + { + "epoch": 4.945413828496781, + "grad_norm": NaN, + "learning_rate": 2.3707241001540524e-05, + "loss": 0.0, + "step": 53000 + }, + { + "epoch": 4.945507138191658, + "grad_norm": NaN, + "learning_rate": 2.3703160100570273e-05, + "loss": 0.0, + "step": 53001 + }, + { + "epoch": 4.945600447886536, + "grad_norm": NaN, + "learning_rate": 2.369907952073546e-05, + "loss": 0.0, + "step": 53002 + }, + { + "epoch": 4.945693757581413, + "grad_norm": NaN, + "learning_rate": 2.3694999262046488e-05, + "loss": 0.0, + "step": 53003 + }, + { + "epoch": 4.9457870672762905, + "grad_norm": NaN, + "learning_rate": 2.3690919324513703e-05, + "loss": 0.0, + "step": 53004 + }, + { + "epoch": 4.945880376971167, + "grad_norm": NaN, + "learning_rate": 2.3686839708147508e-05, + "loss": 0.0, + "step": 53005 + }, + { + "epoch": 4.945973686666044, + "grad_norm": NaN, + "learning_rate": 2.3682760412958252e-05, + "loss": 0.0, + "step": 53006 + }, + { + "epoch": 4.946066996360922, + "grad_norm": NaN, + "learning_rate": 2.367868143895632e-05, + "loss": 0.0, + "step": 53007 + }, + { + "epoch": 4.946160306055799, + "grad_norm": NaN, + "learning_rate": 2.3674602786152092e-05, + "loss": 0.0, + "step": 53008 + }, + { + "epoch": 4.946253615750677, + "grad_norm": NaN, + "learning_rate": 2.3670524454555924e-05, + "loss": 0.0, + "step": 53009 + }, + { + "epoch": 4.946346925445554, + "grad_norm": NaN, + "learning_rate": 2.366644644417819e-05, + "loss": 0.0, + "step": 53010 + }, + { + "epoch": 4.946440235140431, + "grad_norm": NaN, + "learning_rate": 2.366236875502925e-05, + "loss": 0.0, + "step": 53011 + }, + { + "epoch": 4.946533544835308, + "grad_norm": NaN, + "learning_rate": 2.3658291387119494e-05, + "loss": 0.0, + "step": 53012 + }, + { + "epoch": 4.9466268545301855, + "grad_norm": NaN, + "learning_rate": 2.3654214340459282e-05, + "loss": 0.0, + "step": 53013 + }, + { + "epoch": 4.946720164225063, + "grad_norm": NaN, + "learning_rate": 2.3650137615058972e-05, + "loss": 0.0, + "step": 53014 + }, + { + "epoch": 4.94681347391994, + "grad_norm": NaN, + "learning_rate": 2.3646061210928924e-05, + "loss": 0.0, + "step": 53015 + }, + { + "epoch": 4.946906783614818, + "grad_norm": NaN, + "learning_rate": 2.3641985128079527e-05, + "loss": 0.0, + "step": 53016 + }, + { + "epoch": 4.947000093309695, + "grad_norm": NaN, + "learning_rate": 2.3637909366521125e-05, + "loss": 0.0, + "step": 53017 + }, + { + "epoch": 4.947093403004573, + "grad_norm": NaN, + "learning_rate": 2.3633833926264088e-05, + "loss": 0.0, + "step": 53018 + }, + { + "epoch": 4.947186712699449, + "grad_norm": NaN, + "learning_rate": 2.362975880731878e-05, + "loss": 0.0, + "step": 53019 + }, + { + "epoch": 4.9472800223943265, + "grad_norm": NaN, + "learning_rate": 2.3625684009695562e-05, + "loss": 0.0, + "step": 53020 + }, + { + "epoch": 4.947373332089204, + "grad_norm": NaN, + "learning_rate": 2.36216095334048e-05, + "loss": 0.0, + "step": 53021 + }, + { + "epoch": 4.947466641784081, + "grad_norm": NaN, + "learning_rate": 2.3617535378456843e-05, + "loss": 0.0, + "step": 53022 + }, + { + "epoch": 4.947559951478959, + "grad_norm": NaN, + "learning_rate": 2.361346154486206e-05, + "loss": 0.0, + "step": 53023 + }, + { + "epoch": 4.947653261173836, + "grad_norm": NaN, + "learning_rate": 2.36093880326308e-05, + "loss": 0.0, + "step": 53024 + }, + { + "epoch": 4.947746570868714, + "grad_norm": NaN, + "learning_rate": 2.3605314841773432e-05, + "loss": 0.0, + "step": 53025 + }, + { + "epoch": 4.94783988056359, + "grad_norm": NaN, + "learning_rate": 2.36012419723003e-05, + "loss": 0.0, + "step": 53026 + }, + { + "epoch": 4.947933190258468, + "grad_norm": NaN, + "learning_rate": 2.3597169424221778e-05, + "loss": 0.0, + "step": 53027 + }, + { + "epoch": 4.948026499953345, + "grad_norm": NaN, + "learning_rate": 2.3593097197548193e-05, + "loss": 0.0, + "step": 53028 + }, + { + "epoch": 4.948119809648222, + "grad_norm": NaN, + "learning_rate": 2.3589025292289937e-05, + "loss": 0.0, + "step": 53029 + }, + { + "epoch": 4.9482131193431, + "grad_norm": NaN, + "learning_rate": 2.3584953708457334e-05, + "loss": 0.0, + "step": 53030 + }, + { + "epoch": 4.948306429037977, + "grad_norm": NaN, + "learning_rate": 2.3580882446060758e-05, + "loss": 0.0, + "step": 53031 + }, + { + "epoch": 4.948399738732855, + "grad_norm": NaN, + "learning_rate": 2.3576811505110532e-05, + "loss": 0.0, + "step": 53032 + }, + { + "epoch": 4.948493048427732, + "grad_norm": NaN, + "learning_rate": 2.3572740885617038e-05, + "loss": 0.0, + "step": 53033 + }, + { + "epoch": 4.948586358122609, + "grad_norm": NaN, + "learning_rate": 2.356867058759061e-05, + "loss": 0.0, + "step": 53034 + }, + { + "epoch": 4.948679667817486, + "grad_norm": NaN, + "learning_rate": 2.356460061104161e-05, + "loss": 0.0, + "step": 53035 + }, + { + "epoch": 4.9487729775123634, + "grad_norm": NaN, + "learning_rate": 2.3560530955980366e-05, + "loss": 0.0, + "step": 53036 + }, + { + "epoch": 4.948866287207241, + "grad_norm": NaN, + "learning_rate": 2.3556461622417232e-05, + "loss": 0.0, + "step": 53037 + }, + { + "epoch": 4.948959596902118, + "grad_norm": NaN, + "learning_rate": 2.355239261036257e-05, + "loss": 0.0, + "step": 53038 + }, + { + "epoch": 4.949052906596996, + "grad_norm": NaN, + "learning_rate": 2.3548323919826717e-05, + "loss": 0.0, + "step": 53039 + }, + { + "epoch": 4.949146216291872, + "grad_norm": NaN, + "learning_rate": 2.3544255550820018e-05, + "loss": 0.0, + "step": 53040 + }, + { + "epoch": 4.94923952598675, + "grad_norm": NaN, + "learning_rate": 2.3540187503352832e-05, + "loss": 0.0, + "step": 53041 + }, + { + "epoch": 4.949332835681627, + "grad_norm": NaN, + "learning_rate": 2.3536119777435464e-05, + "loss": 0.0, + "step": 53042 + }, + { + "epoch": 4.9494261453765045, + "grad_norm": NaN, + "learning_rate": 2.353205237307829e-05, + "loss": 0.0, + "step": 53043 + }, + { + "epoch": 4.949519455071382, + "grad_norm": NaN, + "learning_rate": 2.3527985290291657e-05, + "loss": 0.0, + "step": 53044 + }, + { + "epoch": 4.949612764766259, + "grad_norm": NaN, + "learning_rate": 2.3523918529085882e-05, + "loss": 0.0, + "step": 53045 + }, + { + "epoch": 4.949706074461137, + "grad_norm": NaN, + "learning_rate": 2.3519852089471315e-05, + "loss": 0.0, + "step": 53046 + }, + { + "epoch": 4.949799384156014, + "grad_norm": NaN, + "learning_rate": 2.3515785971458308e-05, + "loss": 0.0, + "step": 53047 + }, + { + "epoch": 4.949892693850892, + "grad_norm": NaN, + "learning_rate": 2.3511720175057185e-05, + "loss": 0.0, + "step": 53048 + }, + { + "epoch": 4.949986003545768, + "grad_norm": NaN, + "learning_rate": 2.3507654700278278e-05, + "loss": 0.0, + "step": 53049 + }, + { + "epoch": 4.9500793132406455, + "grad_norm": NaN, + "learning_rate": 2.3503589547131957e-05, + "loss": 0.0, + "step": 53050 + }, + { + "epoch": 4.950172622935523, + "grad_norm": NaN, + "learning_rate": 2.3499524715628515e-05, + "loss": 0.0, + "step": 53051 + }, + { + "epoch": 4.9502659326304, + "grad_norm": NaN, + "learning_rate": 2.3495460205778326e-05, + "loss": 0.0, + "step": 53052 + }, + { + "epoch": 4.950359242325278, + "grad_norm": NaN, + "learning_rate": 2.34913960175917e-05, + "loss": 0.0, + "step": 53053 + }, + { + "epoch": 4.950452552020155, + "grad_norm": NaN, + "learning_rate": 2.348733215107898e-05, + "loss": 0.0, + "step": 53054 + }, + { + "epoch": 4.950545861715032, + "grad_norm": NaN, + "learning_rate": 2.3483268606250505e-05, + "loss": 0.0, + "step": 53055 + }, + { + "epoch": 4.950639171409909, + "grad_norm": NaN, + "learning_rate": 2.3479205383116584e-05, + "loss": 0.0, + "step": 53056 + }, + { + "epoch": 4.950732481104787, + "grad_norm": NaN, + "learning_rate": 2.3475142481687576e-05, + "loss": 0.0, + "step": 53057 + }, + { + "epoch": 4.950825790799664, + "grad_norm": NaN, + "learning_rate": 2.3471079901973804e-05, + "loss": 0.0, + "step": 53058 + }, + { + "epoch": 4.950919100494541, + "grad_norm": NaN, + "learning_rate": 2.3467017643985598e-05, + "loss": 0.0, + "step": 53059 + }, + { + "epoch": 4.951012410189419, + "grad_norm": NaN, + "learning_rate": 2.346295570773328e-05, + "loss": 0.0, + "step": 53060 + }, + { + "epoch": 4.951105719884296, + "grad_norm": NaN, + "learning_rate": 2.3458894093227187e-05, + "loss": 0.0, + "step": 53061 + }, + { + "epoch": 4.951199029579174, + "grad_norm": NaN, + "learning_rate": 2.3454832800477637e-05, + "loss": 0.0, + "step": 53062 + }, + { + "epoch": 4.95129233927405, + "grad_norm": NaN, + "learning_rate": 2.3450771829494964e-05, + "loss": 0.0, + "step": 53063 + }, + { + "epoch": 4.951385648968928, + "grad_norm": NaN, + "learning_rate": 2.344671118028948e-05, + "loss": 0.0, + "step": 53064 + }, + { + "epoch": 4.951478958663805, + "grad_norm": NaN, + "learning_rate": 2.344265085287154e-05, + "loss": 0.0, + "step": 53065 + }, + { + "epoch": 4.9515722683586825, + "grad_norm": NaN, + "learning_rate": 2.3438590847251443e-05, + "loss": 0.0, + "step": 53066 + }, + { + "epoch": 4.95166557805356, + "grad_norm": NaN, + "learning_rate": 2.343453116343951e-05, + "loss": 0.0, + "step": 53067 + }, + { + "epoch": 4.951758887748437, + "grad_norm": NaN, + "learning_rate": 2.3430471801446077e-05, + "loss": 0.0, + "step": 53068 + }, + { + "epoch": 4.951852197443315, + "grad_norm": NaN, + "learning_rate": 2.342641276128146e-05, + "loss": 0.0, + "step": 53069 + }, + { + "epoch": 4.951945507138191, + "grad_norm": NaN, + "learning_rate": 2.3422354042955997e-05, + "loss": 0.0, + "step": 53070 + }, + { + "epoch": 4.952038816833069, + "grad_norm": NaN, + "learning_rate": 2.3418295646479983e-05, + "loss": 0.0, + "step": 53071 + }, + { + "epoch": 4.952132126527946, + "grad_norm": NaN, + "learning_rate": 2.3414237571863742e-05, + "loss": 0.0, + "step": 53072 + }, + { + "epoch": 4.9522254362228235, + "grad_norm": NaN, + "learning_rate": 2.34101798191176e-05, + "loss": 0.0, + "step": 53073 + }, + { + "epoch": 4.952318745917701, + "grad_norm": NaN, + "learning_rate": 2.3406122388251876e-05, + "loss": 0.0, + "step": 53074 + }, + { + "epoch": 4.952412055612578, + "grad_norm": NaN, + "learning_rate": 2.340206527927688e-05, + "loss": 0.0, + "step": 53075 + }, + { + "epoch": 4.952505365307456, + "grad_norm": NaN, + "learning_rate": 2.339800849220293e-05, + "loss": 0.0, + "step": 53076 + }, + { + "epoch": 4.952598675002333, + "grad_norm": NaN, + "learning_rate": 2.339395202704034e-05, + "loss": 0.0, + "step": 53077 + }, + { + "epoch": 4.95269198469721, + "grad_norm": NaN, + "learning_rate": 2.3389895883799426e-05, + "loss": 0.0, + "step": 53078 + }, + { + "epoch": 4.952785294392087, + "grad_norm": NaN, + "learning_rate": 2.3385840062490508e-05, + "loss": 0.0, + "step": 53079 + }, + { + "epoch": 4.952878604086965, + "grad_norm": NaN, + "learning_rate": 2.3381784563123895e-05, + "loss": 0.0, + "step": 53080 + }, + { + "epoch": 4.952971913781842, + "grad_norm": NaN, + "learning_rate": 2.3377729385709897e-05, + "loss": 0.0, + "step": 53081 + }, + { + "epoch": 4.953065223476719, + "grad_norm": NaN, + "learning_rate": 2.337367453025881e-05, + "loss": 0.0, + "step": 53082 + }, + { + "epoch": 4.953158533171597, + "grad_norm": NaN, + "learning_rate": 2.3369619996780982e-05, + "loss": 0.0, + "step": 53083 + }, + { + "epoch": 4.953251842866473, + "grad_norm": NaN, + "learning_rate": 2.3365565785286678e-05, + "loss": 0.0, + "step": 53084 + }, + { + "epoch": 4.953345152561351, + "grad_norm": NaN, + "learning_rate": 2.336151189578624e-05, + "loss": 0.0, + "step": 53085 + }, + { + "epoch": 4.953438462256228, + "grad_norm": NaN, + "learning_rate": 2.3357458328289956e-05, + "loss": 0.0, + "step": 53086 + }, + { + "epoch": 4.953531771951106, + "grad_norm": NaN, + "learning_rate": 2.3353405082808156e-05, + "loss": 0.0, + "step": 53087 + }, + { + "epoch": 4.953625081645983, + "grad_norm": NaN, + "learning_rate": 2.3349352159351115e-05, + "loss": 0.0, + "step": 53088 + }, + { + "epoch": 4.9537183913408604, + "grad_norm": NaN, + "learning_rate": 2.334529955792917e-05, + "loss": 0.0, + "step": 53089 + }, + { + "epoch": 4.953811701035738, + "grad_norm": NaN, + "learning_rate": 2.33412472785526e-05, + "loss": 0.0, + "step": 53090 + }, + { + "epoch": 4.953905010730615, + "grad_norm": NaN, + "learning_rate": 2.333719532123171e-05, + "loss": 0.0, + "step": 53091 + }, + { + "epoch": 4.953998320425493, + "grad_norm": NaN, + "learning_rate": 2.3333143685976828e-05, + "loss": 0.0, + "step": 53092 + }, + { + "epoch": 4.954091630120369, + "grad_norm": NaN, + "learning_rate": 2.332909237279823e-05, + "loss": 0.0, + "step": 53093 + }, + { + "epoch": 4.954184939815247, + "grad_norm": NaN, + "learning_rate": 2.332504138170624e-05, + "loss": 0.0, + "step": 53094 + }, + { + "epoch": 4.954278249510124, + "grad_norm": NaN, + "learning_rate": 2.3320990712711145e-05, + "loss": 0.0, + "step": 53095 + }, + { + "epoch": 4.9543715592050015, + "grad_norm": NaN, + "learning_rate": 2.3316940365823226e-05, + "loss": 0.0, + "step": 53096 + }, + { + "epoch": 4.954464868899879, + "grad_norm": NaN, + "learning_rate": 2.3312890341052827e-05, + "loss": 0.0, + "step": 53097 + }, + { + "epoch": 4.954558178594756, + "grad_norm": NaN, + "learning_rate": 2.33088406384102e-05, + "loss": 0.0, + "step": 53098 + }, + { + "epoch": 4.954651488289633, + "grad_norm": NaN, + "learning_rate": 2.330479125790567e-05, + "loss": 0.0, + "step": 53099 + }, + { + "epoch": 4.95474479798451, + "grad_norm": NaN, + "learning_rate": 2.330074219954953e-05, + "loss": 0.0, + "step": 53100 + }, + { + "epoch": 4.954838107679388, + "grad_norm": NaN, + "learning_rate": 2.329669346335206e-05, + "loss": 0.0, + "step": 53101 + }, + { + "epoch": 4.954931417374265, + "grad_norm": NaN, + "learning_rate": 2.3292645049323583e-05, + "loss": 0.0, + "step": 53102 + }, + { + "epoch": 4.9550247270691425, + "grad_norm": NaN, + "learning_rate": 2.328859695747437e-05, + "loss": 0.0, + "step": 53103 + }, + { + "epoch": 4.95511803676402, + "grad_norm": NaN, + "learning_rate": 2.3284549187814717e-05, + "loss": 0.0, + "step": 53104 + }, + { + "epoch": 4.955211346458897, + "grad_norm": NaN, + "learning_rate": 2.3280501740354912e-05, + "loss": 0.0, + "step": 53105 + }, + { + "epoch": 4.955304656153775, + "grad_norm": NaN, + "learning_rate": 2.3276454615105268e-05, + "loss": 0.0, + "step": 53106 + }, + { + "epoch": 4.955397965848651, + "grad_norm": NaN, + "learning_rate": 2.3272407812076054e-05, + "loss": 0.0, + "step": 53107 + }, + { + "epoch": 4.955491275543529, + "grad_norm": NaN, + "learning_rate": 2.326836133127757e-05, + "loss": 0.0, + "step": 53108 + }, + { + "epoch": 4.955584585238406, + "grad_norm": NaN, + "learning_rate": 2.32643151727201e-05, + "loss": 0.0, + "step": 53109 + }, + { + "epoch": 4.955677894933284, + "grad_norm": NaN, + "learning_rate": 2.3260269336413938e-05, + "loss": 0.0, + "step": 53110 + }, + { + "epoch": 4.955771204628161, + "grad_norm": NaN, + "learning_rate": 2.3256223822369363e-05, + "loss": 0.0, + "step": 53111 + }, + { + "epoch": 4.955864514323038, + "grad_norm": NaN, + "learning_rate": 2.325217863059668e-05, + "loss": 0.0, + "step": 53112 + }, + { + "epoch": 4.955957824017916, + "grad_norm": NaN, + "learning_rate": 2.3248133761106146e-05, + "loss": 0.0, + "step": 53113 + }, + { + "epoch": 4.956051133712792, + "grad_norm": NaN, + "learning_rate": 2.3244089213908057e-05, + "loss": 0.0, + "step": 53114 + }, + { + "epoch": 4.95614444340767, + "grad_norm": NaN, + "learning_rate": 2.324004498901272e-05, + "loss": 0.0, + "step": 53115 + }, + { + "epoch": 4.956237753102547, + "grad_norm": NaN, + "learning_rate": 2.323600108643039e-05, + "loss": 0.0, + "step": 53116 + }, + { + "epoch": 4.956331062797425, + "grad_norm": NaN, + "learning_rate": 2.323195750617136e-05, + "loss": 0.0, + "step": 53117 + }, + { + "epoch": 4.956424372492302, + "grad_norm": NaN, + "learning_rate": 2.322791424824591e-05, + "loss": 0.0, + "step": 53118 + }, + { + "epoch": 4.9565176821871795, + "grad_norm": NaN, + "learning_rate": 2.322387131266431e-05, + "loss": 0.0, + "step": 53119 + }, + { + "epoch": 4.956610991882057, + "grad_norm": NaN, + "learning_rate": 2.321982869943687e-05, + "loss": 0.0, + "step": 53120 + }, + { + "epoch": 4.956704301576934, + "grad_norm": NaN, + "learning_rate": 2.321578640857385e-05, + "loss": 0.0, + "step": 53121 + }, + { + "epoch": 4.956797611271811, + "grad_norm": NaN, + "learning_rate": 2.3211744440085506e-05, + "loss": 0.0, + "step": 53122 + }, + { + "epoch": 4.956890920966688, + "grad_norm": NaN, + "learning_rate": 2.3207702793982163e-05, + "loss": 0.0, + "step": 53123 + }, + { + "epoch": 4.956984230661566, + "grad_norm": NaN, + "learning_rate": 2.3203661470274064e-05, + "loss": 0.0, + "step": 53124 + }, + { + "epoch": 4.957077540356443, + "grad_norm": NaN, + "learning_rate": 2.3199620468971486e-05, + "loss": 0.0, + "step": 53125 + }, + { + "epoch": 4.9571708500513205, + "grad_norm": NaN, + "learning_rate": 2.3195579790084735e-05, + "loss": 0.0, + "step": 53126 + }, + { + "epoch": 4.957264159746198, + "grad_norm": NaN, + "learning_rate": 2.3191539433624033e-05, + "loss": 0.0, + "step": 53127 + }, + { + "epoch": 4.9573574694410745, + "grad_norm": NaN, + "learning_rate": 2.3187499399599714e-05, + "loss": 0.0, + "step": 53128 + }, + { + "epoch": 4.957450779135952, + "grad_norm": NaN, + "learning_rate": 2.3183459688021994e-05, + "loss": 0.0, + "step": 53129 + }, + { + "epoch": 4.957544088830829, + "grad_norm": NaN, + "learning_rate": 2.3179420298901185e-05, + "loss": 0.0, + "step": 53130 + }, + { + "epoch": 4.957637398525707, + "grad_norm": NaN, + "learning_rate": 2.317538123224753e-05, + "loss": 0.0, + "step": 53131 + }, + { + "epoch": 4.957730708220584, + "grad_norm": NaN, + "learning_rate": 2.3171342488071322e-05, + "loss": 0.0, + "step": 53132 + }, + { + "epoch": 4.957824017915462, + "grad_norm": NaN, + "learning_rate": 2.3167304066382814e-05, + "loss": 0.0, + "step": 53133 + }, + { + "epoch": 4.957917327610339, + "grad_norm": NaN, + "learning_rate": 2.3163265967192285e-05, + "loss": 0.0, + "step": 53134 + }, + { + "epoch": 4.958010637305216, + "grad_norm": NaN, + "learning_rate": 2.315922819051001e-05, + "loss": 0.0, + "step": 53135 + }, + { + "epoch": 4.958103947000093, + "grad_norm": NaN, + "learning_rate": 2.315519073634623e-05, + "loss": 0.0, + "step": 53136 + }, + { + "epoch": 4.95819725669497, + "grad_norm": NaN, + "learning_rate": 2.315115360471122e-05, + "loss": 0.0, + "step": 53137 + }, + { + "epoch": 4.958290566389848, + "grad_norm": NaN, + "learning_rate": 2.3147116795615274e-05, + "loss": 0.0, + "step": 53138 + }, + { + "epoch": 4.958383876084725, + "grad_norm": NaN, + "learning_rate": 2.314308030906861e-05, + "loss": 0.0, + "step": 53139 + }, + { + "epoch": 4.958477185779603, + "grad_norm": NaN, + "learning_rate": 2.313904414508153e-05, + "loss": 0.0, + "step": 53140 + }, + { + "epoch": 4.95857049547448, + "grad_norm": NaN, + "learning_rate": 2.313500830366427e-05, + "loss": 0.0, + "step": 53141 + }, + { + "epoch": 4.9586638051693575, + "grad_norm": NaN, + "learning_rate": 2.3130972784827125e-05, + "loss": 0.0, + "step": 53142 + }, + { + "epoch": 4.958757114864234, + "grad_norm": NaN, + "learning_rate": 2.3126937588580313e-05, + "loss": 0.0, + "step": 53143 + }, + { + "epoch": 4.958850424559111, + "grad_norm": NaN, + "learning_rate": 2.3122902714934118e-05, + "loss": 0.0, + "step": 53144 + }, + { + "epoch": 4.958943734253989, + "grad_norm": NaN, + "learning_rate": 2.3118868163898807e-05, + "loss": 0.0, + "step": 53145 + }, + { + "epoch": 4.959037043948866, + "grad_norm": NaN, + "learning_rate": 2.311483393548463e-05, + "loss": 0.0, + "step": 53146 + }, + { + "epoch": 4.959130353643744, + "grad_norm": NaN, + "learning_rate": 2.3110800029701836e-05, + "loss": 0.0, + "step": 53147 + }, + { + "epoch": 4.959223663338621, + "grad_norm": NaN, + "learning_rate": 2.3106766446560693e-05, + "loss": 0.0, + "step": 53148 + }, + { + "epoch": 4.9593169730334985, + "grad_norm": NaN, + "learning_rate": 2.3102733186071453e-05, + "loss": 0.0, + "step": 53149 + }, + { + "epoch": 4.959410282728376, + "grad_norm": NaN, + "learning_rate": 2.3098700248244377e-05, + "loss": 0.0, + "step": 53150 + }, + { + "epoch": 4.9595035924232524, + "grad_norm": NaN, + "learning_rate": 2.309466763308971e-05, + "loss": 0.0, + "step": 53151 + }, + { + "epoch": 4.95959690211813, + "grad_norm": NaN, + "learning_rate": 2.309063534061772e-05, + "loss": 0.0, + "step": 53152 + }, + { + "epoch": 4.959690211813007, + "grad_norm": NaN, + "learning_rate": 2.3086603370838653e-05, + "loss": 0.0, + "step": 53153 + }, + { + "epoch": 4.959783521507885, + "grad_norm": NaN, + "learning_rate": 2.308257172376275e-05, + "loss": 0.0, + "step": 53154 + }, + { + "epoch": 4.959876831202762, + "grad_norm": NaN, + "learning_rate": 2.307854039940027e-05, + "loss": 0.0, + "step": 53155 + }, + { + "epoch": 4.9599701408976395, + "grad_norm": NaN, + "learning_rate": 2.307450939776147e-05, + "loss": 0.0, + "step": 53156 + }, + { + "epoch": 4.960063450592516, + "grad_norm": NaN, + "learning_rate": 2.3070478718856576e-05, + "loss": 0.0, + "step": 53157 + }, + { + "epoch": 4.9601567602873935, + "grad_norm": NaN, + "learning_rate": 2.3066448362695884e-05, + "loss": 0.0, + "step": 53158 + }, + { + "epoch": 4.960250069982271, + "grad_norm": NaN, + "learning_rate": 2.3062418329289595e-05, + "loss": 0.0, + "step": 53159 + }, + { + "epoch": 4.960343379677148, + "grad_norm": NaN, + "learning_rate": 2.305838861864799e-05, + "loss": 0.0, + "step": 53160 + }, + { + "epoch": 4.960436689372026, + "grad_norm": NaN, + "learning_rate": 2.3054359230781288e-05, + "loss": 0.0, + "step": 53161 + }, + { + "epoch": 4.960529999066903, + "grad_norm": NaN, + "learning_rate": 2.3050330165699737e-05, + "loss": 0.0, + "step": 53162 + }, + { + "epoch": 4.960623308761781, + "grad_norm": NaN, + "learning_rate": 2.304630142341361e-05, + "loss": 0.0, + "step": 53163 + }, + { + "epoch": 4.960716618456658, + "grad_norm": NaN, + "learning_rate": 2.304227300393313e-05, + "loss": 0.0, + "step": 53164 + }, + { + "epoch": 4.960809928151535, + "grad_norm": NaN, + "learning_rate": 2.3038244907268526e-05, + "loss": 0.0, + "step": 53165 + }, + { + "epoch": 4.960903237846412, + "grad_norm": NaN, + "learning_rate": 2.303421713343007e-05, + "loss": 0.0, + "step": 53166 + }, + { + "epoch": 4.960996547541289, + "grad_norm": NaN, + "learning_rate": 2.303018968242799e-05, + "loss": 0.0, + "step": 53167 + }, + { + "epoch": 4.961089857236167, + "grad_norm": NaN, + "learning_rate": 2.3026162554272525e-05, + "loss": 0.0, + "step": 53168 + }, + { + "epoch": 4.961183166931044, + "grad_norm": NaN, + "learning_rate": 2.302213574897392e-05, + "loss": 0.0, + "step": 53169 + }, + { + "epoch": 4.961276476625922, + "grad_norm": NaN, + "learning_rate": 2.3018109266542394e-05, + "loss": 0.0, + "step": 53170 + }, + { + "epoch": 4.961369786320799, + "grad_norm": NaN, + "learning_rate": 2.3014083106988214e-05, + "loss": 0.0, + "step": 53171 + }, + { + "epoch": 4.961463096015676, + "grad_norm": NaN, + "learning_rate": 2.3010057270321598e-05, + "loss": 0.0, + "step": 53172 + }, + { + "epoch": 4.961556405710553, + "grad_norm": NaN, + "learning_rate": 2.3006031756552808e-05, + "loss": 0.0, + "step": 53173 + }, + { + "epoch": 4.96164971540543, + "grad_norm": NaN, + "learning_rate": 2.3002006565692032e-05, + "loss": 0.0, + "step": 53174 + }, + { + "epoch": 4.961743025100308, + "grad_norm": NaN, + "learning_rate": 2.299798169774955e-05, + "loss": 0.0, + "step": 53175 + }, + { + "epoch": 4.961836334795185, + "grad_norm": NaN, + "learning_rate": 2.2993957152735588e-05, + "loss": 0.0, + "step": 53176 + }, + { + "epoch": 4.961929644490063, + "grad_norm": NaN, + "learning_rate": 2.298993293066035e-05, + "loss": 0.0, + "step": 53177 + }, + { + "epoch": 4.96202295418494, + "grad_norm": NaN, + "learning_rate": 2.2985909031534113e-05, + "loss": 0.0, + "step": 53178 + }, + { + "epoch": 4.9621162638798175, + "grad_norm": NaN, + "learning_rate": 2.2981885455367067e-05, + "loss": 0.0, + "step": 53179 + }, + { + "epoch": 4.962209573574694, + "grad_norm": NaN, + "learning_rate": 2.2977862202169473e-05, + "loss": 0.0, + "step": 53180 + }, + { + "epoch": 4.9623028832695715, + "grad_norm": NaN, + "learning_rate": 2.297383927195154e-05, + "loss": 0.0, + "step": 53181 + }, + { + "epoch": 4.962396192964449, + "grad_norm": NaN, + "learning_rate": 2.2969816664723507e-05, + "loss": 0.0, + "step": 53182 + }, + { + "epoch": 4.962489502659326, + "grad_norm": NaN, + "learning_rate": 2.29657943804956e-05, + "loss": 0.0, + "step": 53183 + }, + { + "epoch": 4.962582812354204, + "grad_norm": NaN, + "learning_rate": 2.2961772419278047e-05, + "loss": 0.0, + "step": 53184 + }, + { + "epoch": 4.962676122049081, + "grad_norm": NaN, + "learning_rate": 2.295775078108107e-05, + "loss": 0.0, + "step": 53185 + }, + { + "epoch": 4.962769431743959, + "grad_norm": NaN, + "learning_rate": 2.295372946591491e-05, + "loss": 0.0, + "step": 53186 + }, + { + "epoch": 4.962862741438835, + "grad_norm": NaN, + "learning_rate": 2.2949708473789746e-05, + "loss": 0.0, + "step": 53187 + }, + { + "epoch": 4.9629560511337125, + "grad_norm": NaN, + "learning_rate": 2.2945687804715917e-05, + "loss": 0.0, + "step": 53188 + }, + { + "epoch": 4.96304936082859, + "grad_norm": NaN, + "learning_rate": 2.2941667458703517e-05, + "loss": 0.0, + "step": 53189 + }, + { + "epoch": 4.963142670523467, + "grad_norm": NaN, + "learning_rate": 2.2937647435762785e-05, + "loss": 0.0, + "step": 53190 + }, + { + "epoch": 4.963235980218345, + "grad_norm": NaN, + "learning_rate": 2.2933627735904048e-05, + "loss": 0.0, + "step": 53191 + }, + { + "epoch": 4.963329289913222, + "grad_norm": NaN, + "learning_rate": 2.292960835913743e-05, + "loss": 0.0, + "step": 53192 + }, + { + "epoch": 4.9634225996081, + "grad_norm": NaN, + "learning_rate": 2.2925589305473175e-05, + "loss": 0.0, + "step": 53193 + }, + { + "epoch": 4.963515909302977, + "grad_norm": NaN, + "learning_rate": 2.2921570574921504e-05, + "loss": 0.0, + "step": 53194 + }, + { + "epoch": 4.963609218997854, + "grad_norm": NaN, + "learning_rate": 2.291755216749263e-05, + "loss": 0.0, + "step": 53195 + }, + { + "epoch": 4.963702528692731, + "grad_norm": NaN, + "learning_rate": 2.291353408319678e-05, + "loss": 0.0, + "step": 53196 + }, + { + "epoch": 4.963795838387608, + "grad_norm": NaN, + "learning_rate": 2.2909516322044168e-05, + "loss": 0.0, + "step": 53197 + }, + { + "epoch": 4.963889148082486, + "grad_norm": NaN, + "learning_rate": 2.2905498884044998e-05, + "loss": 0.0, + "step": 53198 + }, + { + "epoch": 4.963982457777363, + "grad_norm": NaN, + "learning_rate": 2.2901481769209507e-05, + "loss": 0.0, + "step": 53199 + }, + { + "epoch": 4.964075767472241, + "grad_norm": NaN, + "learning_rate": 2.28974649775479e-05, + "loss": 0.0, + "step": 53200 + }, + { + "epoch": 4.964169077167117, + "grad_norm": NaN, + "learning_rate": 2.2893448509070377e-05, + "loss": 0.0, + "step": 53201 + }, + { + "epoch": 4.964262386861995, + "grad_norm": NaN, + "learning_rate": 2.2889432363787175e-05, + "loss": 0.0, + "step": 53202 + }, + { + "epoch": 4.964355696556872, + "grad_norm": NaN, + "learning_rate": 2.2885416541708483e-05, + "loss": 0.0, + "step": 53203 + }, + { + "epoch": 4.9644490062517495, + "grad_norm": NaN, + "learning_rate": 2.2881401042844533e-05, + "loss": 0.0, + "step": 53204 + }, + { + "epoch": 4.964542315946627, + "grad_norm": NaN, + "learning_rate": 2.287738586720553e-05, + "loss": 0.0, + "step": 53205 + }, + { + "epoch": 4.964635625641504, + "grad_norm": NaN, + "learning_rate": 2.287337101480166e-05, + "loss": 0.0, + "step": 53206 + }, + { + "epoch": 4.964728935336382, + "grad_norm": NaN, + "learning_rate": 2.286935648564316e-05, + "loss": 0.0, + "step": 53207 + }, + { + "epoch": 4.964822245031259, + "grad_norm": NaN, + "learning_rate": 2.2865342279740228e-05, + "loss": 0.0, + "step": 53208 + }, + { + "epoch": 4.964915554726136, + "grad_norm": NaN, + "learning_rate": 2.286132839710308e-05, + "loss": 0.0, + "step": 53209 + }, + { + "epoch": 4.965008864421013, + "grad_norm": NaN, + "learning_rate": 2.28573148377419e-05, + "loss": 0.0, + "step": 53210 + }, + { + "epoch": 4.9651021741158905, + "grad_norm": NaN, + "learning_rate": 2.2853301601666906e-05, + "loss": 0.0, + "step": 53211 + }, + { + "epoch": 4.965195483810768, + "grad_norm": NaN, + "learning_rate": 2.28492886888883e-05, + "loss": 0.0, + "step": 53212 + }, + { + "epoch": 4.965288793505645, + "grad_norm": NaN, + "learning_rate": 2.284527609941629e-05, + "loss": 0.0, + "step": 53213 + }, + { + "epoch": 4.965382103200523, + "grad_norm": NaN, + "learning_rate": 2.2841263833261076e-05, + "loss": 0.0, + "step": 53214 + }, + { + "epoch": 4.9654754128954, + "grad_norm": NaN, + "learning_rate": 2.2837251890432874e-05, + "loss": 0.0, + "step": 53215 + }, + { + "epoch": 4.965568722590277, + "grad_norm": NaN, + "learning_rate": 2.283324027094186e-05, + "loss": 0.0, + "step": 53216 + }, + { + "epoch": 4.965662032285154, + "grad_norm": NaN, + "learning_rate": 2.2829228974798247e-05, + "loss": 0.0, + "step": 53217 + }, + { + "epoch": 4.9657553419800315, + "grad_norm": NaN, + "learning_rate": 2.282521800201224e-05, + "loss": 0.0, + "step": 53218 + }, + { + "epoch": 4.965848651674909, + "grad_norm": NaN, + "learning_rate": 2.282120735259402e-05, + "loss": 0.0, + "step": 53219 + }, + { + "epoch": 4.965941961369786, + "grad_norm": NaN, + "learning_rate": 2.2817197026553803e-05, + "loss": 0.0, + "step": 53220 + }, + { + "epoch": 4.966035271064664, + "grad_norm": NaN, + "learning_rate": 2.2813187023901784e-05, + "loss": 0.0, + "step": 53221 + }, + { + "epoch": 4.966128580759541, + "grad_norm": NaN, + "learning_rate": 2.2809177344648144e-05, + "loss": 0.0, + "step": 53222 + }, + { + "epoch": 4.966221890454419, + "grad_norm": NaN, + "learning_rate": 2.28051679888031e-05, + "loss": 0.0, + "step": 53223 + }, + { + "epoch": 4.966315200149295, + "grad_norm": NaN, + "learning_rate": 2.2801158956376837e-05, + "loss": 0.0, + "step": 53224 + }, + { + "epoch": 4.966408509844173, + "grad_norm": NaN, + "learning_rate": 2.2797150247379502e-05, + "loss": 0.0, + "step": 53225 + }, + { + "epoch": 4.96650181953905, + "grad_norm": NaN, + "learning_rate": 2.2793141861821413e-05, + "loss": 0.0, + "step": 53226 + }, + { + "epoch": 4.966595129233927, + "grad_norm": NaN, + "learning_rate": 2.2789133799712654e-05, + "loss": 0.0, + "step": 53227 + }, + { + "epoch": 4.966688438928805, + "grad_norm": NaN, + "learning_rate": 2.2785126061063392e-05, + "loss": 0.0, + "step": 53228 + }, + { + "epoch": 4.966781748623682, + "grad_norm": NaN, + "learning_rate": 2.2781118645883944e-05, + "loss": 0.0, + "step": 53229 + }, + { + "epoch": 4.966875058318559, + "grad_norm": NaN, + "learning_rate": 2.2777111554184407e-05, + "loss": 0.0, + "step": 53230 + }, + { + "epoch": 4.966968368013436, + "grad_norm": NaN, + "learning_rate": 2.2773104785974938e-05, + "loss": 0.0, + "step": 53231 + }, + { + "epoch": 4.967061677708314, + "grad_norm": NaN, + "learning_rate": 2.276909834126586e-05, + "loss": 0.0, + "step": 53232 + }, + { + "epoch": 4.967154987403191, + "grad_norm": NaN, + "learning_rate": 2.276509222006722e-05, + "loss": 0.0, + "step": 53233 + }, + { + "epoch": 4.9672482970980685, + "grad_norm": NaN, + "learning_rate": 2.2761086422389242e-05, + "loss": 0.0, + "step": 53234 + }, + { + "epoch": 4.967341606792946, + "grad_norm": NaN, + "learning_rate": 2.2757080948242195e-05, + "loss": 0.0, + "step": 53235 + }, + { + "epoch": 4.967434916487823, + "grad_norm": NaN, + "learning_rate": 2.2753075797636177e-05, + "loss": 0.0, + "step": 53236 + }, + { + "epoch": 4.967528226182701, + "grad_norm": NaN, + "learning_rate": 2.2749070970581346e-05, + "loss": 0.0, + "step": 53237 + }, + { + "epoch": 4.967621535877578, + "grad_norm": NaN, + "learning_rate": 2.2745066467088008e-05, + "loss": 0.0, + "step": 53238 + }, + { + "epoch": 4.967714845572455, + "grad_norm": NaN, + "learning_rate": 2.2741062287166223e-05, + "loss": 0.0, + "step": 53239 + }, + { + "epoch": 4.967808155267332, + "grad_norm": NaN, + "learning_rate": 2.2737058430826234e-05, + "loss": 0.0, + "step": 53240 + }, + { + "epoch": 4.9679014649622095, + "grad_norm": NaN, + "learning_rate": 2.2733054898078197e-05, + "loss": 0.0, + "step": 53241 + }, + { + "epoch": 4.967994774657087, + "grad_norm": NaN, + "learning_rate": 2.272905168893232e-05, + "loss": 0.0, + "step": 53242 + }, + { + "epoch": 4.968088084351964, + "grad_norm": NaN, + "learning_rate": 2.2725048803398748e-05, + "loss": 0.0, + "step": 53243 + }, + { + "epoch": 4.968181394046842, + "grad_norm": NaN, + "learning_rate": 2.2721046241487674e-05, + "loss": 0.0, + "step": 53244 + }, + { + "epoch": 4.968274703741718, + "grad_norm": NaN, + "learning_rate": 2.2717044003209282e-05, + "loss": 0.0, + "step": 53245 + }, + { + "epoch": 4.968368013436596, + "grad_norm": NaN, + "learning_rate": 2.271304208857374e-05, + "loss": 0.0, + "step": 53246 + }, + { + "epoch": 4.968461323131473, + "grad_norm": NaN, + "learning_rate": 2.2709040497591235e-05, + "loss": 0.0, + "step": 53247 + }, + { + "epoch": 4.968554632826351, + "grad_norm": NaN, + "learning_rate": 2.2705039230271928e-05, + "loss": 0.0, + "step": 53248 + }, + { + "epoch": 4.968647942521228, + "grad_norm": NaN, + "learning_rate": 2.2701038286626004e-05, + "loss": 0.0, + "step": 53249 + }, + { + "epoch": 4.968741252216105, + "grad_norm": NaN, + "learning_rate": 2.2697037666663614e-05, + "loss": 0.0, + "step": 53250 + }, + { + "epoch": 4.968834561910983, + "grad_norm": NaN, + "learning_rate": 2.269303737039496e-05, + "loss": 0.0, + "step": 53251 + }, + { + "epoch": 4.96892787160586, + "grad_norm": NaN, + "learning_rate": 2.2689037397830207e-05, + "loss": 0.0, + "step": 53252 + }, + { + "epoch": 4.969021181300737, + "grad_norm": NaN, + "learning_rate": 2.2685037748979507e-05, + "loss": 0.0, + "step": 53253 + }, + { + "epoch": 4.969114490995614, + "grad_norm": NaN, + "learning_rate": 2.2681038423853053e-05, + "loss": 0.0, + "step": 53254 + }, + { + "epoch": 4.969207800690492, + "grad_norm": NaN, + "learning_rate": 2.267703942246099e-05, + "loss": 0.0, + "step": 53255 + }, + { + "epoch": 4.969301110385369, + "grad_norm": NaN, + "learning_rate": 2.267304074481351e-05, + "loss": 0.0, + "step": 53256 + }, + { + "epoch": 4.9693944200802465, + "grad_norm": NaN, + "learning_rate": 2.2669042390920766e-05, + "loss": 0.0, + "step": 53257 + }, + { + "epoch": 4.969487729775124, + "grad_norm": NaN, + "learning_rate": 2.266504436079294e-05, + "loss": 0.0, + "step": 53258 + }, + { + "epoch": 4.969581039470001, + "grad_norm": NaN, + "learning_rate": 2.2661046654440184e-05, + "loss": 0.0, + "step": 53259 + }, + { + "epoch": 4.969674349164878, + "grad_norm": NaN, + "learning_rate": 2.2657049271872613e-05, + "loss": 0.0, + "step": 53260 + }, + { + "epoch": 4.969767658859755, + "grad_norm": NaN, + "learning_rate": 2.265305221310053e-05, + "loss": 0.0, + "step": 53261 + }, + { + "epoch": 4.969860968554633, + "grad_norm": NaN, + "learning_rate": 2.264905547813398e-05, + "loss": 0.0, + "step": 53262 + }, + { + "epoch": 4.96995427824951, + "grad_norm": NaN, + "learning_rate": 2.2645059066983107e-05, + "loss": 0.0, + "step": 53263 + }, + { + "epoch": 4.9700475879443875, + "grad_norm": NaN, + "learning_rate": 2.2641062979658214e-05, + "loss": 0.0, + "step": 53264 + }, + { + "epoch": 4.970140897639265, + "grad_norm": NaN, + "learning_rate": 2.2637067216169314e-05, + "loss": 0.0, + "step": 53265 + }, + { + "epoch": 4.970234207334142, + "grad_norm": NaN, + "learning_rate": 2.2633071776526612e-05, + "loss": 0.0, + "step": 53266 + }, + { + "epoch": 4.97032751702902, + "grad_norm": NaN, + "learning_rate": 2.2629076660740337e-05, + "loss": 0.0, + "step": 53267 + }, + { + "epoch": 4.970420826723896, + "grad_norm": NaN, + "learning_rate": 2.2625081868820562e-05, + "loss": 0.0, + "step": 53268 + }, + { + "epoch": 4.970514136418774, + "grad_norm": NaN, + "learning_rate": 2.2621087400777427e-05, + "loss": 0.0, + "step": 53269 + }, + { + "epoch": 4.970607446113651, + "grad_norm": NaN, + "learning_rate": 2.261709325662121e-05, + "loss": 0.0, + "step": 53270 + }, + { + "epoch": 4.9707007558085285, + "grad_norm": NaN, + "learning_rate": 2.2613099436361955e-05, + "loss": 0.0, + "step": 53271 + }, + { + "epoch": 4.970794065503406, + "grad_norm": NaN, + "learning_rate": 2.2609105940009813e-05, + "loss": 0.0, + "step": 53272 + }, + { + "epoch": 4.970887375198283, + "grad_norm": NaN, + "learning_rate": 2.260511276757505e-05, + "loss": 0.0, + "step": 53273 + }, + { + "epoch": 4.97098068489316, + "grad_norm": NaN, + "learning_rate": 2.2601119919067724e-05, + "loss": 0.0, + "step": 53274 + }, + { + "epoch": 4.971073994588037, + "grad_norm": NaN, + "learning_rate": 2.259712739449797e-05, + "loss": 0.0, + "step": 53275 + }, + { + "epoch": 4.971167304282915, + "grad_norm": NaN, + "learning_rate": 2.2593135193876055e-05, + "loss": 0.0, + "step": 53276 + }, + { + "epoch": 4.971260613977792, + "grad_norm": NaN, + "learning_rate": 2.2589143317212017e-05, + "loss": 0.0, + "step": 53277 + }, + { + "epoch": 4.97135392367267, + "grad_norm": NaN, + "learning_rate": 2.2585151764516012e-05, + "loss": 0.0, + "step": 53278 + }, + { + "epoch": 4.971447233367547, + "grad_norm": NaN, + "learning_rate": 2.2581160535798287e-05, + "loss": 0.0, + "step": 53279 + }, + { + "epoch": 4.971540543062424, + "grad_norm": NaN, + "learning_rate": 2.2577169631068897e-05, + "loss": 0.0, + "step": 53280 + }, + { + "epoch": 4.971633852757302, + "grad_norm": NaN, + "learning_rate": 2.2573179050337987e-05, + "loss": 0.0, + "step": 53281 + }, + { + "epoch": 4.971727162452179, + "grad_norm": NaN, + "learning_rate": 2.2569188793615812e-05, + "loss": 0.0, + "step": 53282 + }, + { + "epoch": 4.971820472147056, + "grad_norm": NaN, + "learning_rate": 2.25651988609124e-05, + "loss": 0.0, + "step": 53283 + }, + { + "epoch": 4.971913781841933, + "grad_norm": NaN, + "learning_rate": 2.2561209252237922e-05, + "loss": 0.0, + "step": 53284 + }, + { + "epoch": 4.972007091536811, + "grad_norm": NaN, + "learning_rate": 2.2557219967602558e-05, + "loss": 0.0, + "step": 53285 + }, + { + "epoch": 4.972100401231688, + "grad_norm": NaN, + "learning_rate": 2.2553231007016432e-05, + "loss": 0.0, + "step": 53286 + }, + { + "epoch": 4.9721937109265655, + "grad_norm": NaN, + "learning_rate": 2.2549242370489683e-05, + "loss": 0.0, + "step": 53287 + }, + { + "epoch": 4.972287020621443, + "grad_norm": NaN, + "learning_rate": 2.2545254058032457e-05, + "loss": 0.0, + "step": 53288 + }, + { + "epoch": 4.972380330316319, + "grad_norm": NaN, + "learning_rate": 2.2541266069654895e-05, + "loss": 0.0, + "step": 53289 + }, + { + "epoch": 4.972473640011197, + "grad_norm": NaN, + "learning_rate": 2.2537278405367138e-05, + "loss": 0.0, + "step": 53290 + }, + { + "epoch": 4.972566949706074, + "grad_norm": NaN, + "learning_rate": 2.2533291065179326e-05, + "loss": 0.0, + "step": 53291 + }, + { + "epoch": 4.972660259400952, + "grad_norm": NaN, + "learning_rate": 2.2529304049101587e-05, + "loss": 0.0, + "step": 53292 + }, + { + "epoch": 4.972753569095829, + "grad_norm": NaN, + "learning_rate": 2.252531735714408e-05, + "loss": 0.0, + "step": 53293 + }, + { + "epoch": 4.9728468787907065, + "grad_norm": NaN, + "learning_rate": 2.2521330989316928e-05, + "loss": 0.0, + "step": 53294 + }, + { + "epoch": 4.972940188485584, + "grad_norm": NaN, + "learning_rate": 2.251734494563027e-05, + "loss": 0.0, + "step": 53295 + }, + { + "epoch": 4.973033498180461, + "grad_norm": NaN, + "learning_rate": 2.251335922609424e-05, + "loss": 0.0, + "step": 53296 + }, + { + "epoch": 4.973126807875338, + "grad_norm": NaN, + "learning_rate": 2.250937383071896e-05, + "loss": 0.0, + "step": 53297 + }, + { + "epoch": 4.973220117570215, + "grad_norm": NaN, + "learning_rate": 2.2505388759514566e-05, + "loss": 0.0, + "step": 53298 + }, + { + "epoch": 4.973313427265093, + "grad_norm": NaN, + "learning_rate": 2.2501404012491255e-05, + "loss": 0.0, + "step": 53299 + }, + { + "epoch": 4.97340673695997, + "grad_norm": NaN, + "learning_rate": 2.2497419589659067e-05, + "loss": 0.0, + "step": 53300 + }, + { + "epoch": 4.973500046654848, + "grad_norm": NaN, + "learning_rate": 2.2493435491028143e-05, + "loss": 0.0, + "step": 53301 + }, + { + "epoch": 4.973593356349725, + "grad_norm": NaN, + "learning_rate": 2.2489451716608713e-05, + "loss": 0.0, + "step": 53302 + }, + { + "epoch": 4.973686666044602, + "grad_norm": NaN, + "learning_rate": 2.24854682664108e-05, + "loss": 0.0, + "step": 53303 + }, + { + "epoch": 4.973779975739479, + "grad_norm": NaN, + "learning_rate": 2.2481485140444545e-05, + "loss": 0.0, + "step": 53304 + }, + { + "epoch": 4.973873285434356, + "grad_norm": NaN, + "learning_rate": 2.2477502338720156e-05, + "loss": 0.0, + "step": 53305 + }, + { + "epoch": 4.973966595129234, + "grad_norm": NaN, + "learning_rate": 2.2473519861247674e-05, + "loss": 0.0, + "step": 53306 + }, + { + "epoch": 4.974059904824111, + "grad_norm": NaN, + "learning_rate": 2.2469537708037212e-05, + "loss": 0.0, + "step": 53307 + }, + { + "epoch": 4.974153214518989, + "grad_norm": NaN, + "learning_rate": 2.2465555879099023e-05, + "loss": 0.0, + "step": 53308 + }, + { + "epoch": 4.974246524213866, + "grad_norm": NaN, + "learning_rate": 2.2461574374443102e-05, + "loss": 0.0, + "step": 53309 + }, + { + "epoch": 4.9743398339087435, + "grad_norm": NaN, + "learning_rate": 2.2457593194079576e-05, + "loss": 0.0, + "step": 53310 + }, + { + "epoch": 4.974433143603621, + "grad_norm": NaN, + "learning_rate": 2.2453612338018683e-05, + "loss": 0.0, + "step": 53311 + }, + { + "epoch": 4.974526453298497, + "grad_norm": NaN, + "learning_rate": 2.2449631806270453e-05, + "loss": 0.0, + "step": 53312 + }, + { + "epoch": 4.974619762993375, + "grad_norm": NaN, + "learning_rate": 2.2445651598844972e-05, + "loss": 0.0, + "step": 53313 + }, + { + "epoch": 4.974713072688252, + "grad_norm": NaN, + "learning_rate": 2.2441671715752503e-05, + "loss": 0.0, + "step": 53314 + }, + { + "epoch": 4.97480638238313, + "grad_norm": NaN, + "learning_rate": 2.2437692157003017e-05, + "loss": 0.0, + "step": 53315 + }, + { + "epoch": 4.974899692078007, + "grad_norm": NaN, + "learning_rate": 2.2433712922606678e-05, + "loss": 0.0, + "step": 53316 + }, + { + "epoch": 4.9749930017728845, + "grad_norm": NaN, + "learning_rate": 2.2429734012573692e-05, + "loss": 0.0, + "step": 53317 + }, + { + "epoch": 4.975086311467761, + "grad_norm": NaN, + "learning_rate": 2.242575542691405e-05, + "loss": 0.0, + "step": 53318 + }, + { + "epoch": 4.9751796211626385, + "grad_norm": NaN, + "learning_rate": 2.2421777165637896e-05, + "loss": 0.0, + "step": 53319 + }, + { + "epoch": 4.975272930857516, + "grad_norm": NaN, + "learning_rate": 2.2417799228755452e-05, + "loss": 0.0, + "step": 53320 + }, + { + "epoch": 4.975366240552393, + "grad_norm": NaN, + "learning_rate": 2.2413821616276696e-05, + "loss": 0.0, + "step": 53321 + }, + { + "epoch": 4.975459550247271, + "grad_norm": NaN, + "learning_rate": 2.2409844328211786e-05, + "loss": 0.0, + "step": 53322 + }, + { + "epoch": 4.975552859942148, + "grad_norm": NaN, + "learning_rate": 2.2405867364570897e-05, + "loss": 0.0, + "step": 53323 + }, + { + "epoch": 4.9756461696370256, + "grad_norm": NaN, + "learning_rate": 2.240189072536407e-05, + "loss": 0.0, + "step": 53324 + }, + { + "epoch": 4.975739479331903, + "grad_norm": NaN, + "learning_rate": 2.2397914410601396e-05, + "loss": 0.0, + "step": 53325 + }, + { + "epoch": 4.9758327890267795, + "grad_norm": NaN, + "learning_rate": 2.239393842029309e-05, + "loss": 0.0, + "step": 53326 + }, + { + "epoch": 4.975926098721657, + "grad_norm": NaN, + "learning_rate": 2.238996275444917e-05, + "loss": 0.0, + "step": 53327 + }, + { + "epoch": 4.976019408416534, + "grad_norm": NaN, + "learning_rate": 2.238598741307978e-05, + "loss": 0.0, + "step": 53328 + }, + { + "epoch": 4.976112718111412, + "grad_norm": NaN, + "learning_rate": 2.2382012396195016e-05, + "loss": 0.0, + "step": 53329 + }, + { + "epoch": 4.976206027806289, + "grad_norm": NaN, + "learning_rate": 2.2378037703804995e-05, + "loss": 0.0, + "step": 53330 + }, + { + "epoch": 4.976299337501167, + "grad_norm": NaN, + "learning_rate": 2.2374063335919818e-05, + "loss": 0.0, + "step": 53331 + }, + { + "epoch": 4.976392647196044, + "grad_norm": NaN, + "learning_rate": 2.2370089292549586e-05, + "loss": 0.0, + "step": 53332 + }, + { + "epoch": 4.9764859568909205, + "grad_norm": NaN, + "learning_rate": 2.2366115573704414e-05, + "loss": 0.0, + "step": 53333 + }, + { + "epoch": 4.976579266585798, + "grad_norm": NaN, + "learning_rate": 2.2362142179394404e-05, + "loss": 0.0, + "step": 53334 + }, + { + "epoch": 4.976672576280675, + "grad_norm": NaN, + "learning_rate": 2.2358169109629652e-05, + "loss": 0.0, + "step": 53335 + }, + { + "epoch": 4.976765885975553, + "grad_norm": NaN, + "learning_rate": 2.2354196364420234e-05, + "loss": 0.0, + "step": 53336 + }, + { + "epoch": 4.97685919567043, + "grad_norm": NaN, + "learning_rate": 2.235022394377636e-05, + "loss": 0.0, + "step": 53337 + }, + { + "epoch": 4.976952505365308, + "grad_norm": NaN, + "learning_rate": 2.2346251847708015e-05, + "loss": 0.0, + "step": 53338 + }, + { + "epoch": 4.977045815060185, + "grad_norm": NaN, + "learning_rate": 2.2342280076225294e-05, + "loss": 0.0, + "step": 53339 + }, + { + "epoch": 4.9771391247550625, + "grad_norm": NaN, + "learning_rate": 2.2338308629338423e-05, + "loss": 0.0, + "step": 53340 + }, + { + "epoch": 4.977232434449939, + "grad_norm": NaN, + "learning_rate": 2.233433750705738e-05, + "loss": 0.0, + "step": 53341 + }, + { + "epoch": 4.977325744144816, + "grad_norm": NaN, + "learning_rate": 2.2330366709392254e-05, + "loss": 0.0, + "step": 53342 + }, + { + "epoch": 4.977419053839694, + "grad_norm": NaN, + "learning_rate": 2.232639623635327e-05, + "loss": 0.0, + "step": 53343 + }, + { + "epoch": 4.977512363534571, + "grad_norm": NaN, + "learning_rate": 2.232242608795039e-05, + "loss": 0.0, + "step": 53344 + }, + { + "epoch": 4.977605673229449, + "grad_norm": NaN, + "learning_rate": 2.2318456264193736e-05, + "loss": 0.0, + "step": 53345 + }, + { + "epoch": 4.977698982924326, + "grad_norm": NaN, + "learning_rate": 2.2314486765093497e-05, + "loss": 0.0, + "step": 53346 + }, + { + "epoch": 4.977792292619203, + "grad_norm": NaN, + "learning_rate": 2.2310517590659655e-05, + "loss": 0.0, + "step": 53347 + }, + { + "epoch": 4.97788560231408, + "grad_norm": NaN, + "learning_rate": 2.2306548740902313e-05, + "loss": 0.0, + "step": 53348 + }, + { + "epoch": 4.9779789120089575, + "grad_norm": NaN, + "learning_rate": 2.230258021583165e-05, + "loss": 0.0, + "step": 53349 + }, + { + "epoch": 4.978072221703835, + "grad_norm": NaN, + "learning_rate": 2.2298612015457673e-05, + "loss": 0.0, + "step": 53350 + }, + { + "epoch": 4.978165531398712, + "grad_norm": NaN, + "learning_rate": 2.2294644139790457e-05, + "loss": 0.0, + "step": 53351 + }, + { + "epoch": 4.97825884109359, + "grad_norm": NaN, + "learning_rate": 2.2290676588840213e-05, + "loss": 0.0, + "step": 53352 + }, + { + "epoch": 4.978352150788467, + "grad_norm": NaN, + "learning_rate": 2.2286709362616894e-05, + "loss": 0.0, + "step": 53353 + }, + { + "epoch": 4.978445460483345, + "grad_norm": NaN, + "learning_rate": 2.2282742461130633e-05, + "loss": 0.0, + "step": 53354 + }, + { + "epoch": 4.978538770178222, + "grad_norm": NaN, + "learning_rate": 2.2278775884391582e-05, + "loss": 0.0, + "step": 53355 + }, + { + "epoch": 4.9786320798730985, + "grad_norm": NaN, + "learning_rate": 2.2274809632409717e-05, + "loss": 0.0, + "step": 53356 + }, + { + "epoch": 4.978725389567976, + "grad_norm": NaN, + "learning_rate": 2.2270843705195167e-05, + "loss": 0.0, + "step": 53357 + }, + { + "epoch": 4.978818699262853, + "grad_norm": NaN, + "learning_rate": 2.2266878102758092e-05, + "loss": 0.0, + "step": 53358 + }, + { + "epoch": 4.978912008957731, + "grad_norm": NaN, + "learning_rate": 2.2262912825108458e-05, + "loss": 0.0, + "step": 53359 + }, + { + "epoch": 4.979005318652608, + "grad_norm": NaN, + "learning_rate": 2.2258947872256382e-05, + "loss": 0.0, + "step": 53360 + }, + { + "epoch": 4.979098628347486, + "grad_norm": NaN, + "learning_rate": 2.2254983244212023e-05, + "loss": 0.0, + "step": 53361 + }, + { + "epoch": 4.979191938042362, + "grad_norm": NaN, + "learning_rate": 2.2251018940985364e-05, + "loss": 0.0, + "step": 53362 + }, + { + "epoch": 4.97928524773724, + "grad_norm": NaN, + "learning_rate": 2.2247054962586476e-05, + "loss": 0.0, + "step": 53363 + }, + { + "epoch": 4.979378557432117, + "grad_norm": NaN, + "learning_rate": 2.2243091309025574e-05, + "loss": 0.0, + "step": 53364 + }, + { + "epoch": 4.979471867126994, + "grad_norm": NaN, + "learning_rate": 2.2239127980312605e-05, + "loss": 0.0, + "step": 53365 + }, + { + "epoch": 4.979565176821872, + "grad_norm": NaN, + "learning_rate": 2.2235164976457642e-05, + "loss": 0.0, + "step": 53366 + }, + { + "epoch": 4.979658486516749, + "grad_norm": NaN, + "learning_rate": 2.2231202297470897e-05, + "loss": 0.0, + "step": 53367 + }, + { + "epoch": 4.979751796211627, + "grad_norm": NaN, + "learning_rate": 2.2227239943362307e-05, + "loss": 0.0, + "step": 53368 + }, + { + "epoch": 4.979845105906504, + "grad_norm": NaN, + "learning_rate": 2.2223277914141967e-05, + "loss": 0.0, + "step": 53369 + }, + { + "epoch": 4.979938415601381, + "grad_norm": NaN, + "learning_rate": 2.2219316209820052e-05, + "loss": 0.0, + "step": 53370 + }, + { + "epoch": 4.980031725296258, + "grad_norm": NaN, + "learning_rate": 2.2215354830406522e-05, + "loss": 0.0, + "step": 53371 + }, + { + "epoch": 4.9801250349911355, + "grad_norm": NaN, + "learning_rate": 2.2211393775911466e-05, + "loss": 0.0, + "step": 53372 + }, + { + "epoch": 4.980218344686013, + "grad_norm": NaN, + "learning_rate": 2.2207433046345046e-05, + "loss": 0.0, + "step": 53373 + }, + { + "epoch": 4.98031165438089, + "grad_norm": NaN, + "learning_rate": 2.2203472641717218e-05, + "loss": 0.0, + "step": 53374 + }, + { + "epoch": 4.980404964075768, + "grad_norm": NaN, + "learning_rate": 2.2199512562038157e-05, + "loss": 0.0, + "step": 53375 + }, + { + "epoch": 4.980498273770645, + "grad_norm": NaN, + "learning_rate": 2.219555280731784e-05, + "loss": 0.0, + "step": 53376 + }, + { + "epoch": 4.980591583465522, + "grad_norm": NaN, + "learning_rate": 2.2191593377566343e-05, + "loss": 0.0, + "step": 53377 + }, + { + "epoch": 4.980684893160399, + "grad_norm": NaN, + "learning_rate": 2.2187634272793836e-05, + "loss": 0.0, + "step": 53378 + }, + { + "epoch": 4.9807782028552765, + "grad_norm": NaN, + "learning_rate": 2.2183675493010266e-05, + "loss": 0.0, + "step": 53379 + }, + { + "epoch": 4.980871512550154, + "grad_norm": NaN, + "learning_rate": 2.2179717038225726e-05, + "loss": 0.0, + "step": 53380 + }, + { + "epoch": 4.980964822245031, + "grad_norm": NaN, + "learning_rate": 2.2175758908450368e-05, + "loss": 0.0, + "step": 53381 + }, + { + "epoch": 4.981058131939909, + "grad_norm": NaN, + "learning_rate": 2.217180110369416e-05, + "loss": 0.0, + "step": 53382 + }, + { + "epoch": 4.981151441634786, + "grad_norm": NaN, + "learning_rate": 2.216784362396715e-05, + "loss": 0.0, + "step": 53383 + }, + { + "epoch": 4.981244751329664, + "grad_norm": NaN, + "learning_rate": 2.2163886469279524e-05, + "loss": 0.0, + "step": 53384 + }, + { + "epoch": 4.98133806102454, + "grad_norm": NaN, + "learning_rate": 2.2159929639641233e-05, + "loss": 0.0, + "step": 53385 + }, + { + "epoch": 4.9814313707194176, + "grad_norm": NaN, + "learning_rate": 2.215597313506234e-05, + "loss": 0.0, + "step": 53386 + }, + { + "epoch": 4.981524680414295, + "grad_norm": NaN, + "learning_rate": 2.2152016955553014e-05, + "loss": 0.0, + "step": 53387 + }, + { + "epoch": 4.981617990109172, + "grad_norm": NaN, + "learning_rate": 2.2148061101123187e-05, + "loss": 0.0, + "step": 53388 + }, + { + "epoch": 4.98171129980405, + "grad_norm": NaN, + "learning_rate": 2.2144105571782934e-05, + "loss": 0.0, + "step": 53389 + }, + { + "epoch": 4.981804609498927, + "grad_norm": NaN, + "learning_rate": 2.2140150367542424e-05, + "loss": 0.0, + "step": 53390 + }, + { + "epoch": 4.981897919193804, + "grad_norm": NaN, + "learning_rate": 2.2136195488411608e-05, + "loss": 0.0, + "step": 53391 + }, + { + "epoch": 4.981991228888681, + "grad_norm": NaN, + "learning_rate": 2.213224093440052e-05, + "loss": 0.0, + "step": 53392 + }, + { + "epoch": 4.982084538583559, + "grad_norm": NaN, + "learning_rate": 2.2128286705519337e-05, + "loss": 0.0, + "step": 53393 + }, + { + "epoch": 4.982177848278436, + "grad_norm": NaN, + "learning_rate": 2.2124332801778022e-05, + "loss": 0.0, + "step": 53394 + }, + { + "epoch": 4.982271157973313, + "grad_norm": NaN, + "learning_rate": 2.2120379223186614e-05, + "loss": 0.0, + "step": 53395 + }, + { + "epoch": 4.982364467668191, + "grad_norm": NaN, + "learning_rate": 2.2116425969755254e-05, + "loss": 0.0, + "step": 53396 + }, + { + "epoch": 4.982457777363068, + "grad_norm": NaN, + "learning_rate": 2.2112473041493916e-05, + "loss": 0.0, + "step": 53397 + }, + { + "epoch": 4.982551087057946, + "grad_norm": NaN, + "learning_rate": 2.210852043841263e-05, + "loss": 0.0, + "step": 53398 + }, + { + "epoch": 4.982644396752823, + "grad_norm": NaN, + "learning_rate": 2.2104568160521584e-05, + "loss": 0.0, + "step": 53399 + }, + { + "epoch": 4.9827377064477, + "grad_norm": NaN, + "learning_rate": 2.210061620783069e-05, + "loss": 0.0, + "step": 53400 + }, + { + "epoch": 4.982831016142577, + "grad_norm": NaN, + "learning_rate": 2.2096664580350004e-05, + "loss": 0.0, + "step": 53401 + }, + { + "epoch": 4.9829243258374545, + "grad_norm": NaN, + "learning_rate": 2.2092713278089687e-05, + "loss": 0.0, + "step": 53402 + }, + { + "epoch": 4.983017635532332, + "grad_norm": NaN, + "learning_rate": 2.208876230105966e-05, + "loss": 0.0, + "step": 53403 + }, + { + "epoch": 4.983110945227209, + "grad_norm": NaN, + "learning_rate": 2.208481164927e-05, + "loss": 0.0, + "step": 53404 + }, + { + "epoch": 4.983204254922087, + "grad_norm": NaN, + "learning_rate": 2.2080861322730853e-05, + "loss": 0.0, + "step": 53405 + }, + { + "epoch": 4.983297564616963, + "grad_norm": NaN, + "learning_rate": 2.2076911321452123e-05, + "loss": 0.0, + "step": 53406 + }, + { + "epoch": 4.983390874311841, + "grad_norm": NaN, + "learning_rate": 2.2072961645443904e-05, + "loss": 0.0, + "step": 53407 + }, + { + "epoch": 4.983484184006718, + "grad_norm": NaN, + "learning_rate": 2.2069012294716303e-05, + "loss": 0.0, + "step": 53408 + }, + { + "epoch": 4.9835774937015955, + "grad_norm": NaN, + "learning_rate": 2.206506326927928e-05, + "loss": 0.0, + "step": 53409 + }, + { + "epoch": 4.983670803396473, + "grad_norm": NaN, + "learning_rate": 2.2061114569142863e-05, + "loss": 0.0, + "step": 53410 + }, + { + "epoch": 4.98376411309135, + "grad_norm": NaN, + "learning_rate": 2.2057166194317205e-05, + "loss": 0.0, + "step": 53411 + }, + { + "epoch": 4.983857422786228, + "grad_norm": NaN, + "learning_rate": 2.2053218144812202e-05, + "loss": 0.0, + "step": 53412 + }, + { + "epoch": 4.983950732481105, + "grad_norm": NaN, + "learning_rate": 2.204927042063801e-05, + "loss": 0.0, + "step": 53413 + }, + { + "epoch": 4.984044042175982, + "grad_norm": NaN, + "learning_rate": 2.204532302180464e-05, + "loss": 0.0, + "step": 53414 + }, + { + "epoch": 4.984137351870859, + "grad_norm": NaN, + "learning_rate": 2.204137594832205e-05, + "loss": 0.0, + "step": 53415 + }, + { + "epoch": 4.984230661565737, + "grad_norm": NaN, + "learning_rate": 2.203742920020038e-05, + "loss": 0.0, + "step": 53416 + }, + { + "epoch": 4.984323971260614, + "grad_norm": NaN, + "learning_rate": 2.203348277744964e-05, + "loss": 0.0, + "step": 53417 + }, + { + "epoch": 4.984417280955491, + "grad_norm": NaN, + "learning_rate": 2.202953668007979e-05, + "loss": 0.0, + "step": 53418 + }, + { + "epoch": 4.984510590650369, + "grad_norm": NaN, + "learning_rate": 2.2025590908100988e-05, + "loss": 0.0, + "step": 53419 + }, + { + "epoch": 4.984603900345246, + "grad_norm": NaN, + "learning_rate": 2.2021645461523156e-05, + "loss": 0.0, + "step": 53420 + }, + { + "epoch": 4.984697210040123, + "grad_norm": NaN, + "learning_rate": 2.2017700340356342e-05, + "loss": 0.0, + "step": 53421 + }, + { + "epoch": 4.984790519735, + "grad_norm": NaN, + "learning_rate": 2.2013755544610685e-05, + "loss": 0.0, + "step": 53422 + }, + { + "epoch": 4.984883829429878, + "grad_norm": NaN, + "learning_rate": 2.200981107429609e-05, + "loss": 0.0, + "step": 53423 + }, + { + "epoch": 4.984977139124755, + "grad_norm": NaN, + "learning_rate": 2.2005866929422588e-05, + "loss": 0.0, + "step": 53424 + }, + { + "epoch": 4.9850704488196325, + "grad_norm": NaN, + "learning_rate": 2.2001923110000337e-05, + "loss": 0.0, + "step": 53425 + }, + { + "epoch": 4.98516375851451, + "grad_norm": NaN, + "learning_rate": 2.199797961603924e-05, + "loss": 0.0, + "step": 53426 + }, + { + "epoch": 4.985257068209387, + "grad_norm": NaN, + "learning_rate": 2.1994036447549328e-05, + "loss": 0.0, + "step": 53427 + }, + { + "epoch": 4.985350377904265, + "grad_norm": NaN, + "learning_rate": 2.1990093604540725e-05, + "loss": 0.0, + "step": 53428 + }, + { + "epoch": 4.985443687599141, + "grad_norm": NaN, + "learning_rate": 2.1986151087023375e-05, + "loss": 0.0, + "step": 53429 + }, + { + "epoch": 4.985536997294019, + "grad_norm": NaN, + "learning_rate": 2.198220889500728e-05, + "loss": 0.0, + "step": 53430 + }, + { + "epoch": 4.985630306988896, + "grad_norm": NaN, + "learning_rate": 2.1978267028502573e-05, + "loss": 0.0, + "step": 53431 + }, + { + "epoch": 4.9857236166837735, + "grad_norm": NaN, + "learning_rate": 2.1974325487519173e-05, + "loss": 0.0, + "step": 53432 + }, + { + "epoch": 4.985816926378651, + "grad_norm": NaN, + "learning_rate": 2.197038427206711e-05, + "loss": 0.0, + "step": 53433 + }, + { + "epoch": 4.985910236073528, + "grad_norm": NaN, + "learning_rate": 2.1966443382156507e-05, + "loss": 0.0, + "step": 53434 + }, + { + "epoch": 4.986003545768405, + "grad_norm": NaN, + "learning_rate": 2.1962502817797274e-05, + "loss": 0.0, + "step": 53435 + }, + { + "epoch": 4.986096855463282, + "grad_norm": NaN, + "learning_rate": 2.1958562578999433e-05, + "loss": 0.0, + "step": 53436 + }, + { + "epoch": 4.98619016515816, + "grad_norm": NaN, + "learning_rate": 2.1954622665773096e-05, + "loss": 0.0, + "step": 53437 + }, + { + "epoch": 4.986283474853037, + "grad_norm": NaN, + "learning_rate": 2.1950683078128205e-05, + "loss": 0.0, + "step": 53438 + }, + { + "epoch": 4.9863767845479146, + "grad_norm": NaN, + "learning_rate": 2.194674381607477e-05, + "loss": 0.0, + "step": 53439 + }, + { + "epoch": 4.986470094242792, + "grad_norm": NaN, + "learning_rate": 2.1942804879622876e-05, + "loss": 0.0, + "step": 53440 + }, + { + "epoch": 4.986563403937669, + "grad_norm": NaN, + "learning_rate": 2.1938866268782492e-05, + "loss": 0.0, + "step": 53441 + }, + { + "epoch": 4.986656713632547, + "grad_norm": NaN, + "learning_rate": 2.1934927983563582e-05, + "loss": 0.0, + "step": 53442 + }, + { + "epoch": 4.986750023327423, + "grad_norm": NaN, + "learning_rate": 2.193099002397628e-05, + "loss": 0.0, + "step": 53443 + }, + { + "epoch": 4.986843333022301, + "grad_norm": NaN, + "learning_rate": 2.1927052390030508e-05, + "loss": 0.0, + "step": 53444 + }, + { + "epoch": 4.986936642717178, + "grad_norm": NaN, + "learning_rate": 2.1923115081736258e-05, + "loss": 0.0, + "step": 53445 + }, + { + "epoch": 4.987029952412056, + "grad_norm": NaN, + "learning_rate": 2.1919178099103673e-05, + "loss": 0.0, + "step": 53446 + }, + { + "epoch": 4.987123262106933, + "grad_norm": NaN, + "learning_rate": 2.1915241442142645e-05, + "loss": 0.0, + "step": 53447 + }, + { + "epoch": 4.98721657180181, + "grad_norm": NaN, + "learning_rate": 2.1911305110863164e-05, + "loss": 0.0, + "step": 53448 + }, + { + "epoch": 4.987309881496688, + "grad_norm": NaN, + "learning_rate": 2.1907369105275375e-05, + "loss": 0.0, + "step": 53449 + }, + { + "epoch": 4.987403191191564, + "grad_norm": NaN, + "learning_rate": 2.1903433425389116e-05, + "loss": 0.0, + "step": 53450 + }, + { + "epoch": 4.987496500886442, + "grad_norm": NaN, + "learning_rate": 2.1899498071214533e-05, + "loss": 0.0, + "step": 53451 + }, + { + "epoch": 4.987589810581319, + "grad_norm": NaN, + "learning_rate": 2.1895563042761614e-05, + "loss": 0.0, + "step": 53452 + }, + { + "epoch": 4.987683120276197, + "grad_norm": NaN, + "learning_rate": 2.1891628340040257e-05, + "loss": 0.0, + "step": 53453 + }, + { + "epoch": 4.987776429971074, + "grad_norm": NaN, + "learning_rate": 2.1887693963060565e-05, + "loss": 0.0, + "step": 53454 + }, + { + "epoch": 4.9878697396659515, + "grad_norm": NaN, + "learning_rate": 2.1883759911832567e-05, + "loss": 0.0, + "step": 53455 + }, + { + "epoch": 4.987963049360829, + "grad_norm": NaN, + "learning_rate": 2.1879826186366152e-05, + "loss": 0.0, + "step": 53456 + }, + { + "epoch": 4.988056359055706, + "grad_norm": NaN, + "learning_rate": 2.187589278667141e-05, + "loss": 0.0, + "step": 53457 + }, + { + "epoch": 4.988149668750583, + "grad_norm": NaN, + "learning_rate": 2.1871959712758358e-05, + "loss": 0.0, + "step": 53458 + }, + { + "epoch": 4.98824297844546, + "grad_norm": NaN, + "learning_rate": 2.186802696463688e-05, + "loss": 0.0, + "step": 53459 + }, + { + "epoch": 4.988336288140338, + "grad_norm": NaN, + "learning_rate": 2.186409454231709e-05, + "loss": 0.0, + "step": 53460 + }, + { + "epoch": 4.988429597835215, + "grad_norm": NaN, + "learning_rate": 2.1860162445808992e-05, + "loss": 0.0, + "step": 53461 + }, + { + "epoch": 4.9885229075300925, + "grad_norm": NaN, + "learning_rate": 2.1856230675122482e-05, + "loss": 0.0, + "step": 53462 + }, + { + "epoch": 4.98861621722497, + "grad_norm": NaN, + "learning_rate": 2.1852299230267667e-05, + "loss": 0.0, + "step": 53463 + }, + { + "epoch": 4.9887095269198465, + "grad_norm": NaN, + "learning_rate": 2.1848368111254472e-05, + "loss": 0.0, + "step": 53464 + }, + { + "epoch": 4.988802836614724, + "grad_norm": NaN, + "learning_rate": 2.1844437318092888e-05, + "loss": 0.0, + "step": 53465 + }, + { + "epoch": 4.988896146309601, + "grad_norm": NaN, + "learning_rate": 2.1840506850792993e-05, + "loss": 0.0, + "step": 53466 + }, + { + "epoch": 4.988989456004479, + "grad_norm": NaN, + "learning_rate": 2.183657670936471e-05, + "loss": 0.0, + "step": 53467 + }, + { + "epoch": 4.989082765699356, + "grad_norm": NaN, + "learning_rate": 2.1832646893818e-05, + "loss": 0.0, + "step": 53468 + }, + { + "epoch": 4.989176075394234, + "grad_norm": NaN, + "learning_rate": 2.182871740416297e-05, + "loss": 0.0, + "step": 53469 + }, + { + "epoch": 4.989269385089111, + "grad_norm": NaN, + "learning_rate": 2.1824788240409515e-05, + "loss": 0.0, + "step": 53470 + }, + { + "epoch": 4.989362694783988, + "grad_norm": NaN, + "learning_rate": 2.182085940256762e-05, + "loss": 0.0, + "step": 53471 + }, + { + "epoch": 4.989456004478866, + "grad_norm": NaN, + "learning_rate": 2.1816930890647365e-05, + "loss": 0.0, + "step": 53472 + }, + { + "epoch": 4.989549314173742, + "grad_norm": NaN, + "learning_rate": 2.181300270465866e-05, + "loss": 0.0, + "step": 53473 + }, + { + "epoch": 4.98964262386862, + "grad_norm": NaN, + "learning_rate": 2.1809074844611497e-05, + "loss": 0.0, + "step": 53474 + }, + { + "epoch": 4.989735933563497, + "grad_norm": NaN, + "learning_rate": 2.180514731051593e-05, + "loss": 0.0, + "step": 53475 + }, + { + "epoch": 4.989829243258375, + "grad_norm": NaN, + "learning_rate": 2.1801220102381887e-05, + "loss": 0.0, + "step": 53476 + }, + { + "epoch": 4.989922552953252, + "grad_norm": NaN, + "learning_rate": 2.1797293220219313e-05, + "loss": 0.0, + "step": 53477 + }, + { + "epoch": 4.9900158626481295, + "grad_norm": NaN, + "learning_rate": 2.1793366664038314e-05, + "loss": 0.0, + "step": 53478 + }, + { + "epoch": 4.990109172343006, + "grad_norm": NaN, + "learning_rate": 2.1789440433848783e-05, + "loss": 0.0, + "step": 53479 + }, + { + "epoch": 4.990202482037883, + "grad_norm": NaN, + "learning_rate": 2.178551452966069e-05, + "loss": 0.0, + "step": 53480 + }, + { + "epoch": 4.990295791732761, + "grad_norm": NaN, + "learning_rate": 2.178158895148412e-05, + "loss": 0.0, + "step": 53481 + }, + { + "epoch": 4.990389101427638, + "grad_norm": NaN, + "learning_rate": 2.1777663699328957e-05, + "loss": 0.0, + "step": 53482 + }, + { + "epoch": 4.990482411122516, + "grad_norm": NaN, + "learning_rate": 2.1773738773205178e-05, + "loss": 0.0, + "step": 53483 + }, + { + "epoch": 4.990575720817393, + "grad_norm": NaN, + "learning_rate": 2.1769814173122863e-05, + "loss": 0.0, + "step": 53484 + }, + { + "epoch": 4.9906690305122705, + "grad_norm": NaN, + "learning_rate": 2.1765889899091864e-05, + "loss": 0.0, + "step": 53485 + }, + { + "epoch": 4.990762340207148, + "grad_norm": NaN, + "learning_rate": 2.1761965951122245e-05, + "loss": 0.0, + "step": 53486 + }, + { + "epoch": 4.9908556499020245, + "grad_norm": NaN, + "learning_rate": 2.1758042329223996e-05, + "loss": 0.0, + "step": 53487 + }, + { + "epoch": 4.990948959596902, + "grad_norm": NaN, + "learning_rate": 2.175411903340699e-05, + "loss": 0.0, + "step": 53488 + }, + { + "epoch": 4.991042269291779, + "grad_norm": NaN, + "learning_rate": 2.175019606368131e-05, + "loss": 0.0, + "step": 53489 + }, + { + "epoch": 4.991135578986657, + "grad_norm": NaN, + "learning_rate": 2.174627342005692e-05, + "loss": 0.0, + "step": 53490 + }, + { + "epoch": 4.991228888681534, + "grad_norm": NaN, + "learning_rate": 2.1742351102543702e-05, + "loss": 0.0, + "step": 53491 + }, + { + "epoch": 4.991322198376412, + "grad_norm": NaN, + "learning_rate": 2.1738429111151717e-05, + "loss": 0.0, + "step": 53492 + }, + { + "epoch": 4.991415508071289, + "grad_norm": NaN, + "learning_rate": 2.1734507445890953e-05, + "loss": 0.0, + "step": 53493 + }, + { + "epoch": 4.9915088177661655, + "grad_norm": NaN, + "learning_rate": 2.173058610677127e-05, + "loss": 0.0, + "step": 53494 + }, + { + "epoch": 4.991602127461043, + "grad_norm": NaN, + "learning_rate": 2.1726665093802757e-05, + "loss": 0.0, + "step": 53495 + }, + { + "epoch": 4.99169543715592, + "grad_norm": NaN, + "learning_rate": 2.1722744406995362e-05, + "loss": 0.0, + "step": 53496 + }, + { + "epoch": 4.991788746850798, + "grad_norm": NaN, + "learning_rate": 2.1718824046358955e-05, + "loss": 0.0, + "step": 53497 + }, + { + "epoch": 4.991882056545675, + "grad_norm": NaN, + "learning_rate": 2.1714904011903617e-05, + "loss": 0.0, + "step": 53498 + }, + { + "epoch": 4.991975366240553, + "grad_norm": NaN, + "learning_rate": 2.1710984303639318e-05, + "loss": 0.0, + "step": 53499 + }, + { + "epoch": 4.99206867593543, + "grad_norm": NaN, + "learning_rate": 2.1707064921575918e-05, + "loss": 0.0, + "step": 53500 + }, + { + "epoch": 4.992161985630307, + "grad_norm": NaN, + "learning_rate": 2.1703145865723477e-05, + "loss": 0.0, + "step": 53501 + }, + { + "epoch": 4.992255295325184, + "grad_norm": NaN, + "learning_rate": 2.169922713609195e-05, + "loss": 0.0, + "step": 53502 + }, + { + "epoch": 4.992348605020061, + "grad_norm": NaN, + "learning_rate": 2.1695308732691235e-05, + "loss": 0.0, + "step": 53503 + }, + { + "epoch": 4.992441914714939, + "grad_norm": NaN, + "learning_rate": 2.169139065553137e-05, + "loss": 0.0, + "step": 53504 + }, + { + "epoch": 4.992535224409816, + "grad_norm": NaN, + "learning_rate": 2.1687472904622333e-05, + "loss": 0.0, + "step": 53505 + }, + { + "epoch": 4.992628534104694, + "grad_norm": NaN, + "learning_rate": 2.1683555479973964e-05, + "loss": 0.0, + "step": 53506 + }, + { + "epoch": 4.992721843799571, + "grad_norm": NaN, + "learning_rate": 2.167963838159639e-05, + "loss": 0.0, + "step": 53507 + }, + { + "epoch": 4.992815153494448, + "grad_norm": NaN, + "learning_rate": 2.167572160949943e-05, + "loss": 0.0, + "step": 53508 + }, + { + "epoch": 4.992908463189325, + "grad_norm": NaN, + "learning_rate": 2.167180516369307e-05, + "loss": 0.0, + "step": 53509 + }, + { + "epoch": 4.993001772884202, + "grad_norm": NaN, + "learning_rate": 2.1667889044187357e-05, + "loss": 0.0, + "step": 53510 + }, + { + "epoch": 4.99309508257908, + "grad_norm": NaN, + "learning_rate": 2.166397325099216e-05, + "loss": 0.0, + "step": 53511 + }, + { + "epoch": 4.993188392273957, + "grad_norm": NaN, + "learning_rate": 2.1660057784117436e-05, + "loss": 0.0, + "step": 53512 + }, + { + "epoch": 4.993281701968835, + "grad_norm": NaN, + "learning_rate": 2.165614264357322e-05, + "loss": 0.0, + "step": 53513 + }, + { + "epoch": 4.993375011663712, + "grad_norm": NaN, + "learning_rate": 2.1652227829369396e-05, + "loss": 0.0, + "step": 53514 + }, + { + "epoch": 4.9934683213585895, + "grad_norm": NaN, + "learning_rate": 2.1648313341515898e-05, + "loss": 0.0, + "step": 53515 + }, + { + "epoch": 4.993561631053467, + "grad_norm": NaN, + "learning_rate": 2.1644399180022792e-05, + "loss": 0.0, + "step": 53516 + }, + { + "epoch": 4.9936549407483435, + "grad_norm": NaN, + "learning_rate": 2.1640485344899917e-05, + "loss": 0.0, + "step": 53517 + }, + { + "epoch": 4.993748250443221, + "grad_norm": NaN, + "learning_rate": 2.163657183615723e-05, + "loss": 0.0, + "step": 53518 + }, + { + "epoch": 4.993841560138098, + "grad_norm": NaN, + "learning_rate": 2.1632658653804775e-05, + "loss": 0.0, + "step": 53519 + }, + { + "epoch": 4.993934869832976, + "grad_norm": NaN, + "learning_rate": 2.1628745797852427e-05, + "loss": 0.0, + "step": 53520 + }, + { + "epoch": 4.994028179527853, + "grad_norm": NaN, + "learning_rate": 2.162483326831011e-05, + "loss": 0.0, + "step": 53521 + }, + { + "epoch": 4.994121489222731, + "grad_norm": NaN, + "learning_rate": 2.1620921065187885e-05, + "loss": 0.0, + "step": 53522 + }, + { + "epoch": 4.994214798917607, + "grad_norm": NaN, + "learning_rate": 2.161700918849556e-05, + "loss": 0.0, + "step": 53523 + }, + { + "epoch": 4.9943081086124845, + "grad_norm": NaN, + "learning_rate": 2.161309763824319e-05, + "loss": 0.0, + "step": 53524 + }, + { + "epoch": 4.994401418307362, + "grad_norm": NaN, + "learning_rate": 2.160918641444071e-05, + "loss": 0.0, + "step": 53525 + }, + { + "epoch": 4.994494728002239, + "grad_norm": NaN, + "learning_rate": 2.160527551709797e-05, + "loss": 0.0, + "step": 53526 + }, + { + "epoch": 4.994588037697117, + "grad_norm": NaN, + "learning_rate": 2.1601364946225013e-05, + "loss": 0.0, + "step": 53527 + }, + { + "epoch": 4.994681347391994, + "grad_norm": NaN, + "learning_rate": 2.1597454701831783e-05, + "loss": 0.0, + "step": 53528 + }, + { + "epoch": 4.994774657086872, + "grad_norm": NaN, + "learning_rate": 2.1593544783928125e-05, + "loss": 0.0, + "step": 53529 + }, + { + "epoch": 4.994867966781749, + "grad_norm": NaN, + "learning_rate": 2.1589635192524073e-05, + "loss": 0.0, + "step": 53530 + }, + { + "epoch": 4.994961276476626, + "grad_norm": NaN, + "learning_rate": 2.1585725927629593e-05, + "loss": 0.0, + "step": 53531 + }, + { + "epoch": 4.995054586171503, + "grad_norm": NaN, + "learning_rate": 2.158181698925449e-05, + "loss": 0.0, + "step": 53532 + }, + { + "epoch": 4.99514789586638, + "grad_norm": NaN, + "learning_rate": 2.1577908377408825e-05, + "loss": 0.0, + "step": 53533 + }, + { + "epoch": 4.995241205561258, + "grad_norm": NaN, + "learning_rate": 2.157400009210252e-05, + "loss": 0.0, + "step": 53534 + }, + { + "epoch": 4.995334515256135, + "grad_norm": NaN, + "learning_rate": 2.157009213334544e-05, + "loss": 0.0, + "step": 53535 + }, + { + "epoch": 4.995427824951013, + "grad_norm": NaN, + "learning_rate": 2.1566184501147606e-05, + "loss": 0.0, + "step": 53536 + }, + { + "epoch": 4.99552113464589, + "grad_norm": NaN, + "learning_rate": 2.156227719551894e-05, + "loss": 0.0, + "step": 53537 + }, + { + "epoch": 4.995614444340767, + "grad_norm": NaN, + "learning_rate": 2.155837021646929e-05, + "loss": 0.0, + "step": 53538 + }, + { + "epoch": 4.995707754035644, + "grad_norm": NaN, + "learning_rate": 2.1554463564008696e-05, + "loss": 0.0, + "step": 53539 + }, + { + "epoch": 4.9958010637305215, + "grad_norm": NaN, + "learning_rate": 2.1550557238147097e-05, + "loss": 0.0, + "step": 53540 + }, + { + "epoch": 4.995894373425399, + "grad_norm": NaN, + "learning_rate": 2.1546651238894307e-05, + "loss": 0.0, + "step": 53541 + }, + { + "epoch": 4.995987683120276, + "grad_norm": NaN, + "learning_rate": 2.154274556626035e-05, + "loss": 0.0, + "step": 53542 + }, + { + "epoch": 4.996080992815154, + "grad_norm": NaN, + "learning_rate": 2.1538840220255198e-05, + "loss": 0.0, + "step": 53543 + }, + { + "epoch": 4.996174302510031, + "grad_norm": NaN, + "learning_rate": 2.153493520088863e-05, + "loss": 0.0, + "step": 53544 + }, + { + "epoch": 4.996267612204909, + "grad_norm": NaN, + "learning_rate": 2.153103050817072e-05, + "loss": 0.0, + "step": 53545 + }, + { + "epoch": 4.996360921899785, + "grad_norm": NaN, + "learning_rate": 2.1527126142111377e-05, + "loss": 0.0, + "step": 53546 + }, + { + "epoch": 4.9964542315946625, + "grad_norm": NaN, + "learning_rate": 2.152322210272043e-05, + "loss": 0.0, + "step": 53547 + }, + { + "epoch": 4.99654754128954, + "grad_norm": NaN, + "learning_rate": 2.1519318390007896e-05, + "loss": 0.0, + "step": 53548 + }, + { + "epoch": 4.996640850984417, + "grad_norm": NaN, + "learning_rate": 2.1515415003983706e-05, + "loss": 0.0, + "step": 53549 + }, + { + "epoch": 4.996734160679295, + "grad_norm": NaN, + "learning_rate": 2.15115119446577e-05, + "loss": 0.0, + "step": 53550 + }, + { + "epoch": 4.996827470374172, + "grad_norm": NaN, + "learning_rate": 2.150760921203989e-05, + "loss": 0.0, + "step": 53551 + }, + { + "epoch": 4.996920780069049, + "grad_norm": NaN, + "learning_rate": 2.1503706806140197e-05, + "loss": 0.0, + "step": 53552 + }, + { + "epoch": 4.997014089763926, + "grad_norm": NaN, + "learning_rate": 2.1499804726968434e-05, + "loss": 0.0, + "step": 53553 + }, + { + "epoch": 4.997107399458804, + "grad_norm": NaN, + "learning_rate": 2.149590297453469e-05, + "loss": 0.0, + "step": 53554 + }, + { + "epoch": 4.997200709153681, + "grad_norm": NaN, + "learning_rate": 2.1492001548848757e-05, + "loss": 0.0, + "step": 53555 + }, + { + "epoch": 4.997294018848558, + "grad_norm": NaN, + "learning_rate": 2.1488100449920566e-05, + "loss": 0.0, + "step": 53556 + }, + { + "epoch": 4.997387328543436, + "grad_norm": NaN, + "learning_rate": 2.1484199677760132e-05, + "loss": 0.0, + "step": 53557 + }, + { + "epoch": 4.997480638238313, + "grad_norm": NaN, + "learning_rate": 2.148029923237729e-05, + "loss": 0.0, + "step": 53558 + }, + { + "epoch": 4.997573947933191, + "grad_norm": NaN, + "learning_rate": 2.147639911378193e-05, + "loss": 0.0, + "step": 53559 + }, + { + "epoch": 4.997667257628067, + "grad_norm": NaN, + "learning_rate": 2.1472499321984086e-05, + "loss": 0.0, + "step": 53560 + }, + { + "epoch": 4.997760567322945, + "grad_norm": NaN, + "learning_rate": 2.146859985699353e-05, + "loss": 0.0, + "step": 53561 + }, + { + "epoch": 4.997853877017822, + "grad_norm": NaN, + "learning_rate": 2.1464700718820292e-05, + "loss": 0.0, + "step": 53562 + }, + { + "epoch": 4.997947186712699, + "grad_norm": NaN, + "learning_rate": 2.1460801907474273e-05, + "loss": 0.0, + "step": 53563 + }, + { + "epoch": 4.998040496407577, + "grad_norm": NaN, + "learning_rate": 2.1456903422965295e-05, + "loss": 0.0, + "step": 53564 + }, + { + "epoch": 4.998133806102454, + "grad_norm": NaN, + "learning_rate": 2.1453005265303363e-05, + "loss": 0.0, + "step": 53565 + }, + { + "epoch": 4.998227115797332, + "grad_norm": NaN, + "learning_rate": 2.144910743449839e-05, + "loss": 0.0, + "step": 53566 + }, + { + "epoch": 4.998320425492208, + "grad_norm": NaN, + "learning_rate": 2.14452099305602e-05, + "loss": 0.0, + "step": 53567 + }, + { + "epoch": 4.998413735187086, + "grad_norm": NaN, + "learning_rate": 2.144131275349879e-05, + "loss": 0.0, + "step": 53568 + }, + { + "epoch": 4.998507044881963, + "grad_norm": NaN, + "learning_rate": 2.143741590332408e-05, + "loss": 0.0, + "step": 53569 + }, + { + "epoch": 4.9986003545768405, + "grad_norm": NaN, + "learning_rate": 2.1433519380045878e-05, + "loss": 0.0, + "step": 53570 + }, + { + "epoch": 4.998693664271718, + "grad_norm": NaN, + "learning_rate": 2.142962318367416e-05, + "loss": 0.0, + "step": 53571 + }, + { + "epoch": 4.998786973966595, + "grad_norm": NaN, + "learning_rate": 2.142572731421887e-05, + "loss": 0.0, + "step": 53572 + }, + { + "epoch": 4.998880283661473, + "grad_norm": NaN, + "learning_rate": 2.1421831771689817e-05, + "loss": 0.0, + "step": 53573 + }, + { + "epoch": 4.99897359335635, + "grad_norm": NaN, + "learning_rate": 2.1417936556096976e-05, + "loss": 0.0, + "step": 53574 + }, + { + "epoch": 4.999066903051227, + "grad_norm": NaN, + "learning_rate": 2.141404166745027e-05, + "loss": 0.0, + "step": 53575 + }, + { + "epoch": 4.999160212746104, + "grad_norm": NaN, + "learning_rate": 2.1410147105759496e-05, + "loss": 0.0, + "step": 53576 + }, + { + "epoch": 4.9992535224409815, + "grad_norm": NaN, + "learning_rate": 2.1406252871034657e-05, + "loss": 0.0, + "step": 53577 + }, + { + "epoch": 4.999346832135859, + "grad_norm": NaN, + "learning_rate": 2.1402358963285667e-05, + "loss": 0.0, + "step": 53578 + }, + { + "epoch": 4.999440141830736, + "grad_norm": NaN, + "learning_rate": 2.139846538252231e-05, + "loss": 0.0, + "step": 53579 + }, + { + "epoch": 4.999533451525614, + "grad_norm": NaN, + "learning_rate": 2.1394572128754605e-05, + "loss": 0.0, + "step": 53580 + }, + { + "epoch": 4.99962676122049, + "grad_norm": NaN, + "learning_rate": 2.139067920199244e-05, + "loss": 0.0, + "step": 53581 + }, + { + "epoch": 4.999720070915368, + "grad_norm": NaN, + "learning_rate": 2.1386786602245604e-05, + "loss": 0.0, + "step": 53582 + }, + { + "epoch": 4.999813380610245, + "grad_norm": NaN, + "learning_rate": 2.138289432952411e-05, + "loss": 0.0, + "step": 53583 + }, + { + "epoch": 4.999906690305123, + "grad_norm": NaN, + "learning_rate": 2.137900238383786e-05, + "loss": 0.0, + "step": 53584 + }, + { + "epoch": 5.0, + "grad_norm": NaN, + "learning_rate": 2.1375110765196624e-05, + "loss": 0.0, + "step": 53585 + }, + { + "epoch": 5.0, + "eval_loss": NaN, + "eval_runtime": 26.2964, + "eval_samples_per_second": 6.731, + "eval_steps_per_second": 6.731, + "step": 53585 + }, + { + "epoch": 5.000093309694877, + "grad_norm": NaN, + "learning_rate": 2.1371219473610417e-05, + "loss": 0.0, + "step": 53586 + }, + { + "epoch": 5.000186619389755, + "grad_norm": NaN, + "learning_rate": 2.1367328509089137e-05, + "loss": 0.0, + "step": 53587 + }, + { + "epoch": 5.000279929084632, + "grad_norm": NaN, + "learning_rate": 2.136343787164256e-05, + "loss": 0.0, + "step": 53588 + }, + { + "epoch": 5.000373238779509, + "grad_norm": NaN, + "learning_rate": 2.1359547561280687e-05, + "loss": 0.0, + "step": 53589 + }, + { + "epoch": 5.000466548474386, + "grad_norm": NaN, + "learning_rate": 2.1355657578013435e-05, + "loss": 0.0, + "step": 53590 + }, + { + "epoch": 5.000559858169264, + "grad_norm": NaN, + "learning_rate": 2.135176792185054e-05, + "loss": 0.0, + "step": 53591 + }, + { + "epoch": 5.000653167864141, + "grad_norm": NaN, + "learning_rate": 2.1347878592802055e-05, + "loss": 0.0, + "step": 53592 + }, + { + "epoch": 5.0007464775590185, + "grad_norm": NaN, + "learning_rate": 2.1343989590877808e-05, + "loss": 0.0, + "step": 53593 + }, + { + "epoch": 5.000839787253896, + "grad_norm": NaN, + "learning_rate": 2.1340100916087633e-05, + "loss": 0.0, + "step": 53594 + }, + { + "epoch": 5.000933096948773, + "grad_norm": NaN, + "learning_rate": 2.1336212568441517e-05, + "loss": 0.0, + "step": 53595 + }, + { + "epoch": 5.00102640664365, + "grad_norm": NaN, + "learning_rate": 2.1332324547949308e-05, + "loss": 0.0, + "step": 53596 + }, + { + "epoch": 5.001119716338527, + "grad_norm": NaN, + "learning_rate": 2.1328436854620827e-05, + "loss": 0.0, + "step": 53597 + }, + { + "epoch": 5.001213026033405, + "grad_norm": NaN, + "learning_rate": 2.1324549488466085e-05, + "loss": 0.0, + "step": 53598 + }, + { + "epoch": 5.001306335728282, + "grad_norm": NaN, + "learning_rate": 2.1320662449494825e-05, + "loss": 0.0, + "step": 53599 + }, + { + "epoch": 5.0013996454231595, + "grad_norm": NaN, + "learning_rate": 2.131677573771705e-05, + "loss": 0.0, + "step": 53600 + }, + { + "epoch": 5.001492955118037, + "grad_norm": NaN, + "learning_rate": 2.131288935314261e-05, + "loss": 0.0, + "step": 53601 + }, + { + "epoch": 5.001586264812914, + "grad_norm": NaN, + "learning_rate": 2.1309003295781324e-05, + "loss": 0.0, + "step": 53602 + }, + { + "epoch": 5.001679574507792, + "grad_norm": NaN, + "learning_rate": 2.1305117565643137e-05, + "loss": 0.0, + "step": 53603 + }, + { + "epoch": 5.001772884202668, + "grad_norm": NaN, + "learning_rate": 2.130123216273796e-05, + "loss": 0.0, + "step": 53604 + }, + { + "epoch": 5.001866193897546, + "grad_norm": NaN, + "learning_rate": 2.1297347087075545e-05, + "loss": 0.0, + "step": 53605 + }, + { + "epoch": 5.001959503592423, + "grad_norm": NaN, + "learning_rate": 2.1293462338665905e-05, + "loss": 0.0, + "step": 53606 + }, + { + "epoch": 5.002052813287301, + "grad_norm": NaN, + "learning_rate": 2.1289577917518886e-05, + "loss": 0.0, + "step": 53607 + }, + { + "epoch": 5.002146122982178, + "grad_norm": NaN, + "learning_rate": 2.1285693823644273e-05, + "loss": 0.0, + "step": 53608 + }, + { + "epoch": 5.002239432677055, + "grad_norm": NaN, + "learning_rate": 2.128181005705206e-05, + "loss": 0.0, + "step": 53609 + }, + { + "epoch": 5.002332742371933, + "grad_norm": NaN, + "learning_rate": 2.1277926617752095e-05, + "loss": 0.0, + "step": 53610 + }, + { + "epoch": 5.002426052066809, + "grad_norm": NaN, + "learning_rate": 2.127404350575418e-05, + "loss": 0.0, + "step": 53611 + }, + { + "epoch": 5.002519361761687, + "grad_norm": NaN, + "learning_rate": 2.1270160721068258e-05, + "loss": 0.0, + "step": 53612 + }, + { + "epoch": 5.002612671456564, + "grad_norm": NaN, + "learning_rate": 2.1266278263704223e-05, + "loss": 0.0, + "step": 53613 + }, + { + "epoch": 5.002705981151442, + "grad_norm": NaN, + "learning_rate": 2.126239613367185e-05, + "loss": 0.0, + "step": 53614 + }, + { + "epoch": 5.002799290846319, + "grad_norm": NaN, + "learning_rate": 2.1258514330981097e-05, + "loss": 0.0, + "step": 53615 + }, + { + "epoch": 5.002892600541196, + "grad_norm": NaN, + "learning_rate": 2.1254632855641823e-05, + "loss": 0.0, + "step": 53616 + }, + { + "epoch": 5.002985910236074, + "grad_norm": NaN, + "learning_rate": 2.125075170766384e-05, + "loss": 0.0, + "step": 53617 + }, + { + "epoch": 5.00307921993095, + "grad_norm": NaN, + "learning_rate": 2.1246870887057083e-05, + "loss": 0.0, + "step": 53618 + }, + { + "epoch": 5.003172529625828, + "grad_norm": NaN, + "learning_rate": 2.1242990393831417e-05, + "loss": 0.0, + "step": 53619 + }, + { + "epoch": 5.003265839320705, + "grad_norm": NaN, + "learning_rate": 2.123911022799663e-05, + "loss": 0.0, + "step": 53620 + }, + { + "epoch": 5.003359149015583, + "grad_norm": NaN, + "learning_rate": 2.123523038956267e-05, + "loss": 0.0, + "step": 53621 + }, + { + "epoch": 5.00345245871046, + "grad_norm": NaN, + "learning_rate": 2.1231350878539406e-05, + "loss": 0.0, + "step": 53622 + }, + { + "epoch": 5.0035457684053375, + "grad_norm": NaN, + "learning_rate": 2.1227471694936615e-05, + "loss": 0.0, + "step": 53623 + }, + { + "epoch": 5.003639078100215, + "grad_norm": NaN, + "learning_rate": 2.122359283876424e-05, + "loss": 0.0, + "step": 53624 + }, + { + "epoch": 5.003732387795092, + "grad_norm": NaN, + "learning_rate": 2.1219714310032176e-05, + "loss": 0.0, + "step": 53625 + }, + { + "epoch": 5.003825697489969, + "grad_norm": NaN, + "learning_rate": 2.1215836108750146e-05, + "loss": 0.0, + "step": 53626 + }, + { + "epoch": 5.003919007184846, + "grad_norm": NaN, + "learning_rate": 2.121195823492814e-05, + "loss": 0.0, + "step": 53627 + }, + { + "epoch": 5.004012316879724, + "grad_norm": NaN, + "learning_rate": 2.1208080688576e-05, + "loss": 0.0, + "step": 53628 + }, + { + "epoch": 5.004105626574601, + "grad_norm": NaN, + "learning_rate": 2.1204203469703507e-05, + "loss": 0.0, + "step": 53629 + }, + { + "epoch": 5.0041989362694785, + "grad_norm": NaN, + "learning_rate": 2.120032657832058e-05, + "loss": 0.0, + "step": 53630 + }, + { + "epoch": 5.004292245964356, + "grad_norm": NaN, + "learning_rate": 2.1196450014437134e-05, + "loss": 0.0, + "step": 53631 + }, + { + "epoch": 5.004385555659233, + "grad_norm": NaN, + "learning_rate": 2.119257377806287e-05, + "loss": 0.0, + "step": 53632 + }, + { + "epoch": 5.00447886535411, + "grad_norm": NaN, + "learning_rate": 2.1188697869207773e-05, + "loss": 0.0, + "step": 53633 + }, + { + "epoch": 5.004572175048987, + "grad_norm": NaN, + "learning_rate": 2.118482228788171e-05, + "loss": 0.0, + "step": 53634 + }, + { + "epoch": 5.004665484743865, + "grad_norm": NaN, + "learning_rate": 2.118094703409441e-05, + "loss": 0.0, + "step": 53635 + }, + { + "epoch": 5.004758794438742, + "grad_norm": NaN, + "learning_rate": 2.117707210785583e-05, + "loss": 0.0, + "step": 53636 + }, + { + "epoch": 5.00485210413362, + "grad_norm": NaN, + "learning_rate": 2.11731975091758e-05, + "loss": 0.0, + "step": 53637 + }, + { + "epoch": 5.004945413828497, + "grad_norm": NaN, + "learning_rate": 2.1169323238064157e-05, + "loss": 0.0, + "step": 53638 + }, + { + "epoch": 5.005038723523374, + "grad_norm": NaN, + "learning_rate": 2.1165449294530777e-05, + "loss": 0.0, + "step": 53639 + }, + { + "epoch": 5.005132033218251, + "grad_norm": NaN, + "learning_rate": 2.116157567858549e-05, + "loss": 0.0, + "step": 53640 + }, + { + "epoch": 5.005225342913128, + "grad_norm": NaN, + "learning_rate": 2.1157702390238153e-05, + "loss": 0.0, + "step": 53641 + }, + { + "epoch": 5.005318652608006, + "grad_norm": NaN, + "learning_rate": 2.1153829429498654e-05, + "loss": 0.0, + "step": 53642 + }, + { + "epoch": 5.005411962302883, + "grad_norm": NaN, + "learning_rate": 2.114995679637672e-05, + "loss": 0.0, + "step": 53643 + }, + { + "epoch": 5.005505271997761, + "grad_norm": NaN, + "learning_rate": 2.1146084490882315e-05, + "loss": 0.0, + "step": 53644 + }, + { + "epoch": 5.005598581692638, + "grad_norm": NaN, + "learning_rate": 2.1142212513025276e-05, + "loss": 0.0, + "step": 53645 + }, + { + "epoch": 5.0056918913875155, + "grad_norm": NaN, + "learning_rate": 2.1138340862815363e-05, + "loss": 0.0, + "step": 53646 + }, + { + "epoch": 5.005785201082393, + "grad_norm": NaN, + "learning_rate": 2.11344695402625e-05, + "loss": 0.0, + "step": 53647 + }, + { + "epoch": 5.005878510777269, + "grad_norm": NaN, + "learning_rate": 2.1130598545376548e-05, + "loss": 0.0, + "step": 53648 + }, + { + "epoch": 5.005971820472147, + "grad_norm": NaN, + "learning_rate": 2.112672787816725e-05, + "loss": 0.0, + "step": 53649 + }, + { + "epoch": 5.006065130167024, + "grad_norm": NaN, + "learning_rate": 2.1122857538644527e-05, + "loss": 0.0, + "step": 53650 + }, + { + "epoch": 5.006158439861902, + "grad_norm": NaN, + "learning_rate": 2.1118987526818244e-05, + "loss": 0.0, + "step": 53651 + }, + { + "epoch": 5.006251749556779, + "grad_norm": NaN, + "learning_rate": 2.1115117842698138e-05, + "loss": 0.0, + "step": 53652 + }, + { + "epoch": 5.0063450592516565, + "grad_norm": NaN, + "learning_rate": 2.1111248486294134e-05, + "loss": 0.0, + "step": 53653 + }, + { + "epoch": 5.006438368946534, + "grad_norm": NaN, + "learning_rate": 2.1107379457616082e-05, + "loss": 0.0, + "step": 53654 + }, + { + "epoch": 5.0065316786414105, + "grad_norm": NaN, + "learning_rate": 2.1103510756673715e-05, + "loss": 0.0, + "step": 53655 + }, + { + "epoch": 5.006624988336288, + "grad_norm": NaN, + "learning_rate": 2.109964238347696e-05, + "loss": 0.0, + "step": 53656 + }, + { + "epoch": 5.006718298031165, + "grad_norm": NaN, + "learning_rate": 2.109577433803568e-05, + "loss": 0.0, + "step": 53657 + }, + { + "epoch": 5.006811607726043, + "grad_norm": NaN, + "learning_rate": 2.1091906620359582e-05, + "loss": 0.0, + "step": 53658 + }, + { + "epoch": 5.00690491742092, + "grad_norm": NaN, + "learning_rate": 2.1088039230458626e-05, + "loss": 0.0, + "step": 53659 + }, + { + "epoch": 5.006998227115798, + "grad_norm": NaN, + "learning_rate": 2.1084172168342632e-05, + "loss": 0.0, + "step": 53660 + }, + { + "epoch": 5.007091536810675, + "grad_norm": NaN, + "learning_rate": 2.108030543402133e-05, + "loss": 0.0, + "step": 53661 + }, + { + "epoch": 5.0071848465055515, + "grad_norm": NaN, + "learning_rate": 2.1076439027504648e-05, + "loss": 0.0, + "step": 53662 + }, + { + "epoch": 5.007278156200429, + "grad_norm": NaN, + "learning_rate": 2.107257294880244e-05, + "loss": 0.0, + "step": 53663 + }, + { + "epoch": 5.007371465895306, + "grad_norm": NaN, + "learning_rate": 2.1068707197924416e-05, + "loss": 0.0, + "step": 53664 + }, + { + "epoch": 5.007464775590184, + "grad_norm": NaN, + "learning_rate": 2.1064841774880503e-05, + "loss": 0.0, + "step": 53665 + }, + { + "epoch": 5.007558085285061, + "grad_norm": NaN, + "learning_rate": 2.1060976679680543e-05, + "loss": 0.0, + "step": 53666 + }, + { + "epoch": 5.007651394979939, + "grad_norm": NaN, + "learning_rate": 2.1057111912334257e-05, + "loss": 0.0, + "step": 53667 + }, + { + "epoch": 5.007744704674816, + "grad_norm": NaN, + "learning_rate": 2.1053247472851574e-05, + "loss": 0.0, + "step": 53668 + }, + { + "epoch": 5.0078380143696934, + "grad_norm": NaN, + "learning_rate": 2.1049383361242322e-05, + "loss": 0.0, + "step": 53669 + }, + { + "epoch": 5.00793132406457, + "grad_norm": NaN, + "learning_rate": 2.1045519577516224e-05, + "loss": 0.0, + "step": 53670 + }, + { + "epoch": 5.008024633759447, + "grad_norm": NaN, + "learning_rate": 2.1041656121683186e-05, + "loss": 0.0, + "step": 53671 + }, + { + "epoch": 5.008117943454325, + "grad_norm": NaN, + "learning_rate": 2.103779299375304e-05, + "loss": 0.0, + "step": 53672 + }, + { + "epoch": 5.008211253149202, + "grad_norm": NaN, + "learning_rate": 2.1033930193735572e-05, + "loss": 0.0, + "step": 53673 + }, + { + "epoch": 5.00830456284408, + "grad_norm": NaN, + "learning_rate": 2.1030067721640626e-05, + "loss": 0.0, + "step": 53674 + }, + { + "epoch": 5.008397872538957, + "grad_norm": NaN, + "learning_rate": 2.1026205577478013e-05, + "loss": 0.0, + "step": 53675 + }, + { + "epoch": 5.0084911822338345, + "grad_norm": NaN, + "learning_rate": 2.1022343761257537e-05, + "loss": 0.0, + "step": 53676 + }, + { + "epoch": 5.008584491928711, + "grad_norm": NaN, + "learning_rate": 2.1018482272989045e-05, + "loss": 0.0, + "step": 53677 + }, + { + "epoch": 5.008677801623588, + "grad_norm": NaN, + "learning_rate": 2.1014621112682363e-05, + "loss": 0.0, + "step": 53678 + }, + { + "epoch": 5.008771111318466, + "grad_norm": NaN, + "learning_rate": 2.101076028034726e-05, + "loss": 0.0, + "step": 53679 + }, + { + "epoch": 5.008864421013343, + "grad_norm": NaN, + "learning_rate": 2.1006899775993603e-05, + "loss": 0.0, + "step": 53680 + }, + { + "epoch": 5.008957730708221, + "grad_norm": NaN, + "learning_rate": 2.1003039599631193e-05, + "loss": 0.0, + "step": 53681 + }, + { + "epoch": 5.009051040403098, + "grad_norm": NaN, + "learning_rate": 2.0999179751269828e-05, + "loss": 0.0, + "step": 53682 + }, + { + "epoch": 5.0091443500979755, + "grad_norm": NaN, + "learning_rate": 2.099532023091935e-05, + "loss": 0.0, + "step": 53683 + }, + { + "epoch": 5.009237659792852, + "grad_norm": NaN, + "learning_rate": 2.0991461038589548e-05, + "loss": 0.0, + "step": 53684 + }, + { + "epoch": 5.0093309694877295, + "grad_norm": NaN, + "learning_rate": 2.0987602174290247e-05, + "loss": 0.0, + "step": 53685 + }, + { + "epoch": 5.009424279182607, + "grad_norm": NaN, + "learning_rate": 2.0983743638031275e-05, + "loss": 0.0, + "step": 53686 + }, + { + "epoch": 5.009517588877484, + "grad_norm": NaN, + "learning_rate": 2.0979885429822408e-05, + "loss": 0.0, + "step": 53687 + }, + { + "epoch": 5.009610898572362, + "grad_norm": NaN, + "learning_rate": 2.097602754967347e-05, + "loss": 0.0, + "step": 53688 + }, + { + "epoch": 5.009704208267239, + "grad_norm": NaN, + "learning_rate": 2.0972169997594318e-05, + "loss": 0.0, + "step": 53689 + }, + { + "epoch": 5.009797517962117, + "grad_norm": NaN, + "learning_rate": 2.096831277359465e-05, + "loss": 0.0, + "step": 53690 + }, + { + "epoch": 5.009890827656994, + "grad_norm": NaN, + "learning_rate": 2.0964455877684365e-05, + "loss": 0.0, + "step": 53691 + }, + { + "epoch": 5.0099841373518705, + "grad_norm": NaN, + "learning_rate": 2.0960599309873283e-05, + "loss": 0.0, + "step": 53692 + }, + { + "epoch": 5.010077447046748, + "grad_norm": NaN, + "learning_rate": 2.0956743070171106e-05, + "loss": 0.0, + "step": 53693 + }, + { + "epoch": 5.010170756741625, + "grad_norm": NaN, + "learning_rate": 2.0952887158587744e-05, + "loss": 0.0, + "step": 53694 + }, + { + "epoch": 5.010264066436503, + "grad_norm": NaN, + "learning_rate": 2.094903157513299e-05, + "loss": 0.0, + "step": 53695 + }, + { + "epoch": 5.01035737613138, + "grad_norm": NaN, + "learning_rate": 2.0945176319816552e-05, + "loss": 0.0, + "step": 53696 + }, + { + "epoch": 5.010450685826258, + "grad_norm": NaN, + "learning_rate": 2.0941321392648343e-05, + "loss": 0.0, + "step": 53697 + }, + { + "epoch": 5.010543995521135, + "grad_norm": NaN, + "learning_rate": 2.0937466793638146e-05, + "loss": 0.0, + "step": 53698 + }, + { + "epoch": 5.010637305216012, + "grad_norm": NaN, + "learning_rate": 2.0933612522795696e-05, + "loss": 0.0, + "step": 53699 + }, + { + "epoch": 5.010730614910889, + "grad_norm": NaN, + "learning_rate": 2.0929758580130845e-05, + "loss": 0.0, + "step": 53700 + }, + { + "epoch": 5.010823924605766, + "grad_norm": NaN, + "learning_rate": 2.092590496565344e-05, + "loss": 0.0, + "step": 53701 + }, + { + "epoch": 5.010917234300644, + "grad_norm": NaN, + "learning_rate": 2.0922051679373136e-05, + "loss": 0.0, + "step": 53702 + }, + { + "epoch": 5.011010543995521, + "grad_norm": NaN, + "learning_rate": 2.091819872129986e-05, + "loss": 0.0, + "step": 53703 + }, + { + "epoch": 5.011103853690399, + "grad_norm": NaN, + "learning_rate": 2.0914346091443407e-05, + "loss": 0.0, + "step": 53704 + }, + { + "epoch": 5.011197163385276, + "grad_norm": NaN, + "learning_rate": 2.0910493789813466e-05, + "loss": 0.0, + "step": 53705 + }, + { + "epoch": 5.011290473080153, + "grad_norm": NaN, + "learning_rate": 2.090664181641993e-05, + "loss": 0.0, + "step": 53706 + }, + { + "epoch": 5.01138378277503, + "grad_norm": NaN, + "learning_rate": 2.0902790171272605e-05, + "loss": 0.0, + "step": 53707 + }, + { + "epoch": 5.0114770924699075, + "grad_norm": NaN, + "learning_rate": 2.0898938854381153e-05, + "loss": 0.0, + "step": 53708 + }, + { + "epoch": 5.011570402164785, + "grad_norm": NaN, + "learning_rate": 2.0895087865755517e-05, + "loss": 0.0, + "step": 53709 + }, + { + "epoch": 5.011663711859662, + "grad_norm": NaN, + "learning_rate": 2.089123720540542e-05, + "loss": 0.0, + "step": 53710 + }, + { + "epoch": 5.01175702155454, + "grad_norm": NaN, + "learning_rate": 2.0887386873340657e-05, + "loss": 0.0, + "step": 53711 + }, + { + "epoch": 5.011850331249417, + "grad_norm": NaN, + "learning_rate": 2.0883536869571033e-05, + "loss": 0.0, + "step": 53712 + }, + { + "epoch": 5.011943640944294, + "grad_norm": NaN, + "learning_rate": 2.0879687194106308e-05, + "loss": 0.0, + "step": 53713 + }, + { + "epoch": 5.012036950639171, + "grad_norm": NaN, + "learning_rate": 2.087583784695631e-05, + "loss": 0.0, + "step": 53714 + }, + { + "epoch": 5.0121302603340485, + "grad_norm": NaN, + "learning_rate": 2.0871988828130797e-05, + "loss": 0.0, + "step": 53715 + }, + { + "epoch": 5.012223570028926, + "grad_norm": NaN, + "learning_rate": 2.0868140137639578e-05, + "loss": 0.0, + "step": 53716 + }, + { + "epoch": 5.012316879723803, + "grad_norm": NaN, + "learning_rate": 2.0864291775492407e-05, + "loss": 0.0, + "step": 53717 + }, + { + "epoch": 5.012410189418681, + "grad_norm": NaN, + "learning_rate": 2.08604437416991e-05, + "loss": 0.0, + "step": 53718 + }, + { + "epoch": 5.012503499113558, + "grad_norm": NaN, + "learning_rate": 2.0856596036269445e-05, + "loss": 0.0, + "step": 53719 + }, + { + "epoch": 5.012596808808436, + "grad_norm": NaN, + "learning_rate": 2.0852748659213185e-05, + "loss": 0.0, + "step": 53720 + }, + { + "epoch": 5.012690118503312, + "grad_norm": NaN, + "learning_rate": 2.0848901610540148e-05, + "loss": 0.0, + "step": 53721 + }, + { + "epoch": 5.01278342819819, + "grad_norm": NaN, + "learning_rate": 2.0845054890260087e-05, + "loss": 0.0, + "step": 53722 + }, + { + "epoch": 5.012876737893067, + "grad_norm": NaN, + "learning_rate": 2.08412084983828e-05, + "loss": 0.0, + "step": 53723 + }, + { + "epoch": 5.012970047587944, + "grad_norm": NaN, + "learning_rate": 2.083736243491806e-05, + "loss": 0.0, + "step": 53724 + }, + { + "epoch": 5.013063357282822, + "grad_norm": NaN, + "learning_rate": 2.083351669987564e-05, + "loss": 0.0, + "step": 53725 + }, + { + "epoch": 5.013156666977699, + "grad_norm": NaN, + "learning_rate": 2.082967129326532e-05, + "loss": 0.0, + "step": 53726 + }, + { + "epoch": 5.013249976672577, + "grad_norm": NaN, + "learning_rate": 2.0825826215096893e-05, + "loss": 0.0, + "step": 53727 + }, + { + "epoch": 5.013343286367453, + "grad_norm": NaN, + "learning_rate": 2.0821981465380128e-05, + "loss": 0.0, + "step": 53728 + }, + { + "epoch": 5.013436596062331, + "grad_norm": NaN, + "learning_rate": 2.081813704412479e-05, + "loss": 0.0, + "step": 53729 + }, + { + "epoch": 5.013529905757208, + "grad_norm": NaN, + "learning_rate": 2.0814292951340667e-05, + "loss": 0.0, + "step": 53730 + }, + { + "epoch": 5.0136232154520854, + "grad_norm": NaN, + "learning_rate": 2.0810449187037537e-05, + "loss": 0.0, + "step": 53731 + }, + { + "epoch": 5.013716525146963, + "grad_norm": NaN, + "learning_rate": 2.0806605751225158e-05, + "loss": 0.0, + "step": 53732 + }, + { + "epoch": 5.01380983484184, + "grad_norm": NaN, + "learning_rate": 2.080276264391334e-05, + "loss": 0.0, + "step": 53733 + }, + { + "epoch": 5.013903144536718, + "grad_norm": NaN, + "learning_rate": 2.0798919865111758e-05, + "loss": 0.0, + "step": 53734 + }, + { + "epoch": 5.013996454231594, + "grad_norm": NaN, + "learning_rate": 2.0795077414830287e-05, + "loss": 0.0, + "step": 53735 + }, + { + "epoch": 5.014089763926472, + "grad_norm": NaN, + "learning_rate": 2.07912352930787e-05, + "loss": 0.0, + "step": 53736 + }, + { + "epoch": 5.014183073621349, + "grad_norm": NaN, + "learning_rate": 2.0787393499866663e-05, + "loss": 0.0, + "step": 53737 + }, + { + "epoch": 5.0142763833162265, + "grad_norm": NaN, + "learning_rate": 2.0783552035204025e-05, + "loss": 0.0, + "step": 53738 + }, + { + "epoch": 5.014369693011104, + "grad_norm": NaN, + "learning_rate": 2.077971089910059e-05, + "loss": 0.0, + "step": 53739 + }, + { + "epoch": 5.014463002705981, + "grad_norm": NaN, + "learning_rate": 2.077587009156599e-05, + "loss": 0.0, + "step": 53740 + }, + { + "epoch": 5.014556312400859, + "grad_norm": NaN, + "learning_rate": 2.077202961261012e-05, + "loss": 0.0, + "step": 53741 + }, + { + "epoch": 5.014649622095736, + "grad_norm": NaN, + "learning_rate": 2.0768189462242724e-05, + "loss": 0.0, + "step": 53742 + }, + { + "epoch": 5.014742931790613, + "grad_norm": NaN, + "learning_rate": 2.0764349640473478e-05, + "loss": 0.0, + "step": 53743 + }, + { + "epoch": 5.01483624148549, + "grad_norm": NaN, + "learning_rate": 2.0760510147312236e-05, + "loss": 0.0, + "step": 53744 + }, + { + "epoch": 5.0149295511803675, + "grad_norm": NaN, + "learning_rate": 2.075667098276878e-05, + "loss": 0.0, + "step": 53745 + }, + { + "epoch": 5.015022860875245, + "grad_norm": NaN, + "learning_rate": 2.0752832146852748e-05, + "loss": 0.0, + "step": 53746 + }, + { + "epoch": 5.015116170570122, + "grad_norm": NaN, + "learning_rate": 2.0748993639574e-05, + "loss": 0.0, + "step": 53747 + }, + { + "epoch": 5.015209480265, + "grad_norm": NaN, + "learning_rate": 2.07451554609423e-05, + "loss": 0.0, + "step": 53748 + }, + { + "epoch": 5.015302789959877, + "grad_norm": NaN, + "learning_rate": 2.0741317610967363e-05, + "loss": 0.0, + "step": 53749 + }, + { + "epoch": 5.015396099654754, + "grad_norm": NaN, + "learning_rate": 2.073748008965897e-05, + "loss": 0.0, + "step": 53750 + }, + { + "epoch": 5.015489409349631, + "grad_norm": NaN, + "learning_rate": 2.073364289702688e-05, + "loss": 0.0, + "step": 53751 + }, + { + "epoch": 5.015582719044509, + "grad_norm": NaN, + "learning_rate": 2.0729806033080837e-05, + "loss": 0.0, + "step": 53752 + }, + { + "epoch": 5.015676028739386, + "grad_norm": NaN, + "learning_rate": 2.0725969497830614e-05, + "loss": 0.0, + "step": 53753 + }, + { + "epoch": 5.015769338434263, + "grad_norm": NaN, + "learning_rate": 2.0722133291285936e-05, + "loss": 0.0, + "step": 53754 + }, + { + "epoch": 5.015862648129141, + "grad_norm": NaN, + "learning_rate": 2.07182974134566e-05, + "loss": 0.0, + "step": 53755 + }, + { + "epoch": 5.015955957824018, + "grad_norm": NaN, + "learning_rate": 2.071446186435232e-05, + "loss": 0.0, + "step": 53756 + }, + { + "epoch": 5.016049267518895, + "grad_norm": NaN, + "learning_rate": 2.0710626643982886e-05, + "loss": 0.0, + "step": 53757 + }, + { + "epoch": 5.016142577213772, + "grad_norm": NaN, + "learning_rate": 2.0706791752358017e-05, + "loss": 0.0, + "step": 53758 + }, + { + "epoch": 5.01623588690865, + "grad_norm": NaN, + "learning_rate": 2.0702957189487485e-05, + "loss": 0.0, + "step": 53759 + }, + { + "epoch": 5.016329196603527, + "grad_norm": NaN, + "learning_rate": 2.069912295538102e-05, + "loss": 0.0, + "step": 53760 + }, + { + "epoch": 5.0164225062984045, + "grad_norm": NaN, + "learning_rate": 2.0695289050048397e-05, + "loss": 0.0, + "step": 53761 + }, + { + "epoch": 5.016515815993282, + "grad_norm": NaN, + "learning_rate": 2.0691455473499343e-05, + "loss": 0.0, + "step": 53762 + }, + { + "epoch": 5.016609125688159, + "grad_norm": NaN, + "learning_rate": 2.0687622225743627e-05, + "loss": 0.0, + "step": 53763 + }, + { + "epoch": 5.016702435383037, + "grad_norm": NaN, + "learning_rate": 2.0683789306790966e-05, + "loss": 0.0, + "step": 53764 + }, + { + "epoch": 5.016795745077913, + "grad_norm": NaN, + "learning_rate": 2.067995671665113e-05, + "loss": 0.0, + "step": 53765 + }, + { + "epoch": 5.016889054772791, + "grad_norm": NaN, + "learning_rate": 2.067612445533385e-05, + "loss": 0.0, + "step": 53766 + }, + { + "epoch": 5.016982364467668, + "grad_norm": NaN, + "learning_rate": 2.0672292522848895e-05, + "loss": 0.0, + "step": 53767 + }, + { + "epoch": 5.0170756741625455, + "grad_norm": NaN, + "learning_rate": 2.0668460919205976e-05, + "loss": 0.0, + "step": 53768 + }, + { + "epoch": 5.017168983857423, + "grad_norm": NaN, + "learning_rate": 2.0664629644414838e-05, + "loss": 0.0, + "step": 53769 + }, + { + "epoch": 5.0172622935523, + "grad_norm": NaN, + "learning_rate": 2.0660798698485254e-05, + "loss": 0.0, + "step": 53770 + }, + { + "epoch": 5.017355603247178, + "grad_norm": NaN, + "learning_rate": 2.0656968081426932e-05, + "loss": 0.0, + "step": 53771 + }, + { + "epoch": 5.017448912942054, + "grad_norm": NaN, + "learning_rate": 2.0653137793249637e-05, + "loss": 0.0, + "step": 53772 + }, + { + "epoch": 5.017542222636932, + "grad_norm": NaN, + "learning_rate": 2.06493078339631e-05, + "loss": 0.0, + "step": 53773 + }, + { + "epoch": 5.017635532331809, + "grad_norm": NaN, + "learning_rate": 2.0645478203577044e-05, + "loss": 0.0, + "step": 53774 + }, + { + "epoch": 5.017728842026687, + "grad_norm": NaN, + "learning_rate": 2.0641648902101233e-05, + "loss": 0.0, + "step": 53775 + }, + { + "epoch": 5.017822151721564, + "grad_norm": NaN, + "learning_rate": 2.063781992954538e-05, + "loss": 0.0, + "step": 53776 + }, + { + "epoch": 5.017915461416441, + "grad_norm": NaN, + "learning_rate": 2.0633991285919276e-05, + "loss": 0.0, + "step": 53777 + }, + { + "epoch": 5.018008771111319, + "grad_norm": NaN, + "learning_rate": 2.0630162971232535e-05, + "loss": 0.0, + "step": 53778 + }, + { + "epoch": 5.018102080806195, + "grad_norm": NaN, + "learning_rate": 2.062633498549499e-05, + "loss": 0.0, + "step": 53779 + }, + { + "epoch": 5.018195390501073, + "grad_norm": NaN, + "learning_rate": 2.0622507328716408e-05, + "loss": 0.0, + "step": 53780 + }, + { + "epoch": 5.01828870019595, + "grad_norm": NaN, + "learning_rate": 2.0618680000906378e-05, + "loss": 0.0, + "step": 53781 + }, + { + "epoch": 5.018382009890828, + "grad_norm": NaN, + "learning_rate": 2.061485300207476e-05, + "loss": 0.0, + "step": 53782 + }, + { + "epoch": 5.018475319585705, + "grad_norm": NaN, + "learning_rate": 2.0611026332231297e-05, + "loss": 0.0, + "step": 53783 + }, + { + "epoch": 5.0185686292805824, + "grad_norm": NaN, + "learning_rate": 2.0607199991385575e-05, + "loss": 0.0, + "step": 53784 + }, + { + "epoch": 5.01866193897546, + "grad_norm": NaN, + "learning_rate": 2.060337397954746e-05, + "loss": 0.0, + "step": 53785 + }, + { + "epoch": 5.018755248670337, + "grad_norm": NaN, + "learning_rate": 2.0599548296726625e-05, + "loss": 0.0, + "step": 53786 + }, + { + "epoch": 5.018848558365214, + "grad_norm": NaN, + "learning_rate": 2.059572294293281e-05, + "loss": 0.0, + "step": 53787 + }, + { + "epoch": 5.018941868060091, + "grad_norm": NaN, + "learning_rate": 2.059189791817575e-05, + "loss": 0.0, + "step": 53788 + }, + { + "epoch": 5.019035177754969, + "grad_norm": NaN, + "learning_rate": 2.058807322246514e-05, + "loss": 0.0, + "step": 53789 + }, + { + "epoch": 5.019128487449846, + "grad_norm": NaN, + "learning_rate": 2.0584248855810746e-05, + "loss": 0.0, + "step": 53790 + }, + { + "epoch": 5.0192217971447235, + "grad_norm": NaN, + "learning_rate": 2.0580424818222263e-05, + "loss": 0.0, + "step": 53791 + }, + { + "epoch": 5.019315106839601, + "grad_norm": NaN, + "learning_rate": 2.0576601109709413e-05, + "loss": 0.0, + "step": 53792 + }, + { + "epoch": 5.019408416534478, + "grad_norm": NaN, + "learning_rate": 2.0572777730281935e-05, + "loss": 0.0, + "step": 53793 + }, + { + "epoch": 5.019501726229355, + "grad_norm": NaN, + "learning_rate": 2.0568954679949545e-05, + "loss": 0.0, + "step": 53794 + }, + { + "epoch": 5.019595035924232, + "grad_norm": NaN, + "learning_rate": 2.0565131958721965e-05, + "loss": 0.0, + "step": 53795 + }, + { + "epoch": 5.01968834561911, + "grad_norm": NaN, + "learning_rate": 2.0561309566608918e-05, + "loss": 0.0, + "step": 53796 + }, + { + "epoch": 5.019781655313987, + "grad_norm": NaN, + "learning_rate": 2.0557487503620117e-05, + "loss": 0.0, + "step": 53797 + }, + { + "epoch": 5.0198749650088645, + "grad_norm": NaN, + "learning_rate": 2.055366576976527e-05, + "loss": 0.0, + "step": 53798 + }, + { + "epoch": 5.019968274703742, + "grad_norm": NaN, + "learning_rate": 2.054984436505412e-05, + "loss": 0.0, + "step": 53799 + }, + { + "epoch": 5.020061584398619, + "grad_norm": NaN, + "learning_rate": 2.0546023289496355e-05, + "loss": 0.0, + "step": 53800 + }, + { + "epoch": 5.020154894093496, + "grad_norm": NaN, + "learning_rate": 2.054220254310172e-05, + "loss": 0.0, + "step": 53801 + }, + { + "epoch": 5.020248203788373, + "grad_norm": NaN, + "learning_rate": 2.053838212587991e-05, + "loss": 0.0, + "step": 53802 + }, + { + "epoch": 5.020341513483251, + "grad_norm": NaN, + "learning_rate": 2.0534562037840646e-05, + "loss": 0.0, + "step": 53803 + }, + { + "epoch": 5.020434823178128, + "grad_norm": NaN, + "learning_rate": 2.053074227899364e-05, + "loss": 0.0, + "step": 53804 + }, + { + "epoch": 5.020528132873006, + "grad_norm": NaN, + "learning_rate": 2.0526922849348598e-05, + "loss": 0.0, + "step": 53805 + }, + { + "epoch": 5.020621442567883, + "grad_norm": NaN, + "learning_rate": 2.0523103748915248e-05, + "loss": 0.0, + "step": 53806 + }, + { + "epoch": 5.02071475226276, + "grad_norm": NaN, + "learning_rate": 2.051928497770328e-05, + "loss": 0.0, + "step": 53807 + }, + { + "epoch": 5.020808061957637, + "grad_norm": NaN, + "learning_rate": 2.0515466535722424e-05, + "loss": 0.0, + "step": 53808 + }, + { + "epoch": 5.020901371652514, + "grad_norm": NaN, + "learning_rate": 2.0511648422982385e-05, + "loss": 0.0, + "step": 53809 + }, + { + "epoch": 5.020994681347392, + "grad_norm": NaN, + "learning_rate": 2.0507830639492856e-05, + "loss": 0.0, + "step": 53810 + }, + { + "epoch": 5.021087991042269, + "grad_norm": NaN, + "learning_rate": 2.0504013185263562e-05, + "loss": 0.0, + "step": 53811 + }, + { + "epoch": 5.021181300737147, + "grad_norm": NaN, + "learning_rate": 2.05001960603042e-05, + "loss": 0.0, + "step": 53812 + }, + { + "epoch": 5.021274610432024, + "grad_norm": NaN, + "learning_rate": 2.049637926462447e-05, + "loss": 0.0, + "step": 53813 + }, + { + "epoch": 5.0213679201269015, + "grad_norm": NaN, + "learning_rate": 2.0492562798234102e-05, + "loss": 0.0, + "step": 53814 + }, + { + "epoch": 5.021461229821779, + "grad_norm": NaN, + "learning_rate": 2.0488746661142773e-05, + "loss": 0.0, + "step": 53815 + }, + { + "epoch": 5.021554539516655, + "grad_norm": NaN, + "learning_rate": 2.048493085336021e-05, + "loss": 0.0, + "step": 53816 + }, + { + "epoch": 5.021647849211533, + "grad_norm": NaN, + "learning_rate": 2.0481115374896085e-05, + "loss": 0.0, + "step": 53817 + }, + { + "epoch": 5.02174115890641, + "grad_norm": NaN, + "learning_rate": 2.0477300225760123e-05, + "loss": 0.0, + "step": 53818 + }, + { + "epoch": 5.021834468601288, + "grad_norm": NaN, + "learning_rate": 2.047348540596202e-05, + "loss": 0.0, + "step": 53819 + }, + { + "epoch": 5.021927778296165, + "grad_norm": NaN, + "learning_rate": 2.046967091551148e-05, + "loss": 0.0, + "step": 53820 + }, + { + "epoch": 5.0220210879910425, + "grad_norm": NaN, + "learning_rate": 2.0465856754418202e-05, + "loss": 0.0, + "step": 53821 + }, + { + "epoch": 5.02211439768592, + "grad_norm": NaN, + "learning_rate": 2.0462042922691875e-05, + "loss": 0.0, + "step": 53822 + }, + { + "epoch": 5.0222077073807965, + "grad_norm": NaN, + "learning_rate": 2.0458229420342187e-05, + "loss": 0.0, + "step": 53823 + }, + { + "epoch": 5.022301017075674, + "grad_norm": NaN, + "learning_rate": 2.0454416247378868e-05, + "loss": 0.0, + "step": 53824 + }, + { + "epoch": 5.022394326770551, + "grad_norm": NaN, + "learning_rate": 2.045060340381158e-05, + "loss": 0.0, + "step": 53825 + }, + { + "epoch": 5.022487636465429, + "grad_norm": NaN, + "learning_rate": 2.044679088965004e-05, + "loss": 0.0, + "step": 53826 + }, + { + "epoch": 5.022580946160306, + "grad_norm": NaN, + "learning_rate": 2.044297870490393e-05, + "loss": 0.0, + "step": 53827 + }, + { + "epoch": 5.022674255855184, + "grad_norm": NaN, + "learning_rate": 2.0439166849582956e-05, + "loss": 0.0, + "step": 53828 + }, + { + "epoch": 5.022767565550061, + "grad_norm": NaN, + "learning_rate": 2.0435355323696796e-05, + "loss": 0.0, + "step": 53829 + }, + { + "epoch": 5.0228608752449375, + "grad_norm": NaN, + "learning_rate": 2.043154412725514e-05, + "loss": 0.0, + "step": 53830 + }, + { + "epoch": 5.022954184939815, + "grad_norm": NaN, + "learning_rate": 2.0427733260267696e-05, + "loss": 0.0, + "step": 53831 + }, + { + "epoch": 5.023047494634692, + "grad_norm": NaN, + "learning_rate": 2.0423922722744147e-05, + "loss": 0.0, + "step": 53832 + }, + { + "epoch": 5.02314080432957, + "grad_norm": NaN, + "learning_rate": 2.042011251469416e-05, + "loss": 0.0, + "step": 53833 + }, + { + "epoch": 5.023234114024447, + "grad_norm": NaN, + "learning_rate": 2.0416302636127464e-05, + "loss": 0.0, + "step": 53834 + }, + { + "epoch": 5.023327423719325, + "grad_norm": NaN, + "learning_rate": 2.0412493087053715e-05, + "loss": 0.0, + "step": 53835 + }, + { + "epoch": 5.023420733414202, + "grad_norm": NaN, + "learning_rate": 2.0408683867482613e-05, + "loss": 0.0, + "step": 53836 + }, + { + "epoch": 5.0235140431090795, + "grad_norm": NaN, + "learning_rate": 2.040487497742384e-05, + "loss": 0.0, + "step": 53837 + }, + { + "epoch": 5.023607352803956, + "grad_norm": NaN, + "learning_rate": 2.040106641688708e-05, + "loss": 0.0, + "step": 53838 + }, + { + "epoch": 5.023700662498833, + "grad_norm": NaN, + "learning_rate": 2.039725818588202e-05, + "loss": 0.0, + "step": 53839 + }, + { + "epoch": 5.023793972193711, + "grad_norm": NaN, + "learning_rate": 2.039345028441834e-05, + "loss": 0.0, + "step": 53840 + }, + { + "epoch": 5.023887281888588, + "grad_norm": NaN, + "learning_rate": 2.038964271250571e-05, + "loss": 0.0, + "step": 53841 + }, + { + "epoch": 5.023980591583466, + "grad_norm": NaN, + "learning_rate": 2.0385835470153843e-05, + "loss": 0.0, + "step": 53842 + }, + { + "epoch": 5.024073901278343, + "grad_norm": NaN, + "learning_rate": 2.0382028557372398e-05, + "loss": 0.0, + "step": 53843 + }, + { + "epoch": 5.0241672109732205, + "grad_norm": NaN, + "learning_rate": 2.037822197417105e-05, + "loss": 0.0, + "step": 53844 + }, + { + "epoch": 5.024260520668097, + "grad_norm": NaN, + "learning_rate": 2.0374415720559507e-05, + "loss": 0.0, + "step": 53845 + }, + { + "epoch": 5.0243538303629744, + "grad_norm": NaN, + "learning_rate": 2.037060979654741e-05, + "loss": 0.0, + "step": 53846 + }, + { + "epoch": 5.024447140057852, + "grad_norm": NaN, + "learning_rate": 2.0366804202144454e-05, + "loss": 0.0, + "step": 53847 + }, + { + "epoch": 5.024540449752729, + "grad_norm": NaN, + "learning_rate": 2.0362998937360313e-05, + "loss": 0.0, + "step": 53848 + }, + { + "epoch": 5.024633759447607, + "grad_norm": NaN, + "learning_rate": 2.0359194002204664e-05, + "loss": 0.0, + "step": 53849 + }, + { + "epoch": 5.024727069142484, + "grad_norm": NaN, + "learning_rate": 2.0355389396687198e-05, + "loss": 0.0, + "step": 53850 + }, + { + "epoch": 5.0248203788373615, + "grad_norm": NaN, + "learning_rate": 2.035158512081756e-05, + "loss": 0.0, + "step": 53851 + }, + { + "epoch": 5.024913688532238, + "grad_norm": NaN, + "learning_rate": 2.0347781174605437e-05, + "loss": 0.0, + "step": 53852 + }, + { + "epoch": 5.0250069982271155, + "grad_norm": NaN, + "learning_rate": 2.034397755806051e-05, + "loss": 0.0, + "step": 53853 + }, + { + "epoch": 5.025100307921993, + "grad_norm": NaN, + "learning_rate": 2.0340174271192432e-05, + "loss": 0.0, + "step": 53854 + }, + { + "epoch": 5.02519361761687, + "grad_norm": NaN, + "learning_rate": 2.0336371314010887e-05, + "loss": 0.0, + "step": 53855 + }, + { + "epoch": 5.025286927311748, + "grad_norm": NaN, + "learning_rate": 2.0332568686525525e-05, + "loss": 0.0, + "step": 53856 + }, + { + "epoch": 5.025380237006625, + "grad_norm": NaN, + "learning_rate": 2.0328766388746043e-05, + "loss": 0.0, + "step": 53857 + }, + { + "epoch": 5.025473546701503, + "grad_norm": NaN, + "learning_rate": 2.03249644206821e-05, + "loss": 0.0, + "step": 53858 + }, + { + "epoch": 5.02556685639638, + "grad_norm": NaN, + "learning_rate": 2.0321162782343354e-05, + "loss": 0.0, + "step": 53859 + }, + { + "epoch": 5.0256601660912565, + "grad_norm": NaN, + "learning_rate": 2.031736147373948e-05, + "loss": 0.0, + "step": 53860 + }, + { + "epoch": 5.025753475786134, + "grad_norm": NaN, + "learning_rate": 2.0313560494880153e-05, + "loss": 0.0, + "step": 53861 + }, + { + "epoch": 5.025846785481011, + "grad_norm": NaN, + "learning_rate": 2.0309759845775003e-05, + "loss": 0.0, + "step": 53862 + }, + { + "epoch": 5.025940095175889, + "grad_norm": NaN, + "learning_rate": 2.0305959526433734e-05, + "loss": 0.0, + "step": 53863 + }, + { + "epoch": 5.026033404870766, + "grad_norm": NaN, + "learning_rate": 2.030215953686599e-05, + "loss": 0.0, + "step": 53864 + }, + { + "epoch": 5.026126714565644, + "grad_norm": NaN, + "learning_rate": 2.0298359877081434e-05, + "loss": 0.0, + "step": 53865 + }, + { + "epoch": 5.026220024260521, + "grad_norm": NaN, + "learning_rate": 2.0294560547089715e-05, + "loss": 0.0, + "step": 53866 + }, + { + "epoch": 5.026313333955398, + "grad_norm": NaN, + "learning_rate": 2.029076154690052e-05, + "loss": 0.0, + "step": 53867 + }, + { + "epoch": 5.026406643650275, + "grad_norm": NaN, + "learning_rate": 2.0286962876523484e-05, + "loss": 0.0, + "step": 53868 + }, + { + "epoch": 5.026499953345152, + "grad_norm": NaN, + "learning_rate": 2.02831645359683e-05, + "loss": 0.0, + "step": 53869 + }, + { + "epoch": 5.02659326304003, + "grad_norm": NaN, + "learning_rate": 2.027936652524458e-05, + "loss": 0.0, + "step": 53870 + }, + { + "epoch": 5.026686572734907, + "grad_norm": NaN, + "learning_rate": 2.027556884436201e-05, + "loss": 0.0, + "step": 53871 + }, + { + "epoch": 5.026779882429785, + "grad_norm": NaN, + "learning_rate": 2.027177149333026e-05, + "loss": 0.0, + "step": 53872 + }, + { + "epoch": 5.026873192124662, + "grad_norm": NaN, + "learning_rate": 2.0267974472158942e-05, + "loss": 0.0, + "step": 53873 + }, + { + "epoch": 5.026966501819539, + "grad_norm": NaN, + "learning_rate": 2.0264177780857745e-05, + "loss": 0.0, + "step": 53874 + }, + { + "epoch": 5.027059811514416, + "grad_norm": NaN, + "learning_rate": 2.0260381419436317e-05, + "loss": 0.0, + "step": 53875 + }, + { + "epoch": 5.0271531212092935, + "grad_norm": NaN, + "learning_rate": 2.0256585387904306e-05, + "loss": 0.0, + "step": 53876 + }, + { + "epoch": 5.027246430904171, + "grad_norm": NaN, + "learning_rate": 2.0252789686271352e-05, + "loss": 0.0, + "step": 53877 + }, + { + "epoch": 5.027339740599048, + "grad_norm": NaN, + "learning_rate": 2.0248994314547134e-05, + "loss": 0.0, + "step": 53878 + }, + { + "epoch": 5.027433050293926, + "grad_norm": NaN, + "learning_rate": 2.024519927274129e-05, + "loss": 0.0, + "step": 53879 + }, + { + "epoch": 5.027526359988803, + "grad_norm": NaN, + "learning_rate": 2.0241404560863466e-05, + "loss": 0.0, + "step": 53880 + }, + { + "epoch": 5.027619669683681, + "grad_norm": NaN, + "learning_rate": 2.0237610178923318e-05, + "loss": 0.0, + "step": 53881 + }, + { + "epoch": 5.027712979378557, + "grad_norm": NaN, + "learning_rate": 2.0233816126930475e-05, + "loss": 0.0, + "step": 53882 + }, + { + "epoch": 5.0278062890734345, + "grad_norm": NaN, + "learning_rate": 2.0230022404894608e-05, + "loss": 0.0, + "step": 53883 + }, + { + "epoch": 5.027899598768312, + "grad_norm": NaN, + "learning_rate": 2.0226229012825345e-05, + "loss": 0.0, + "step": 53884 + }, + { + "epoch": 5.027992908463189, + "grad_norm": NaN, + "learning_rate": 2.0222435950732345e-05, + "loss": 0.0, + "step": 53885 + }, + { + "epoch": 5.028086218158067, + "grad_norm": NaN, + "learning_rate": 2.0218643218625252e-05, + "loss": 0.0, + "step": 53886 + }, + { + "epoch": 5.028179527852944, + "grad_norm": NaN, + "learning_rate": 2.0214850816513705e-05, + "loss": 0.0, + "step": 53887 + }, + { + "epoch": 5.028272837547822, + "grad_norm": NaN, + "learning_rate": 2.0211058744407332e-05, + "loss": 0.0, + "step": 53888 + }, + { + "epoch": 5.028366147242698, + "grad_norm": NaN, + "learning_rate": 2.0207267002315807e-05, + "loss": 0.0, + "step": 53889 + }, + { + "epoch": 5.028459456937576, + "grad_norm": NaN, + "learning_rate": 2.0203475590248742e-05, + "loss": 0.0, + "step": 53890 + }, + { + "epoch": 5.028552766632453, + "grad_norm": NaN, + "learning_rate": 2.0199684508215806e-05, + "loss": 0.0, + "step": 53891 + }, + { + "epoch": 5.02864607632733, + "grad_norm": NaN, + "learning_rate": 2.0195893756226616e-05, + "loss": 0.0, + "step": 53892 + }, + { + "epoch": 5.028739386022208, + "grad_norm": NaN, + "learning_rate": 2.019210333429081e-05, + "loss": 0.0, + "step": 53893 + }, + { + "epoch": 5.028832695717085, + "grad_norm": NaN, + "learning_rate": 2.018831324241803e-05, + "loss": 0.0, + "step": 53894 + }, + { + "epoch": 5.028926005411963, + "grad_norm": NaN, + "learning_rate": 2.0184523480617935e-05, + "loss": 0.0, + "step": 53895 + }, + { + "epoch": 5.029019315106839, + "grad_norm": NaN, + "learning_rate": 2.018073404890012e-05, + "loss": 0.0, + "step": 53896 + }, + { + "epoch": 5.029112624801717, + "grad_norm": NaN, + "learning_rate": 2.017694494727426e-05, + "loss": 0.0, + "step": 53897 + }, + { + "epoch": 5.029205934496594, + "grad_norm": NaN, + "learning_rate": 2.0173156175749977e-05, + "loss": 0.0, + "step": 53898 + }, + { + "epoch": 5.0292992441914715, + "grad_norm": NaN, + "learning_rate": 2.0169367734336884e-05, + "loss": 0.0, + "step": 53899 + }, + { + "epoch": 5.029392553886349, + "grad_norm": NaN, + "learning_rate": 2.0165579623044637e-05, + "loss": 0.0, + "step": 53900 + }, + { + "epoch": 5.029485863581226, + "grad_norm": NaN, + "learning_rate": 2.0161791841882864e-05, + "loss": 0.0, + "step": 53901 + }, + { + "epoch": 5.029579173276104, + "grad_norm": NaN, + "learning_rate": 2.015800439086119e-05, + "loss": 0.0, + "step": 53902 + }, + { + "epoch": 5.02967248297098, + "grad_norm": NaN, + "learning_rate": 2.0154217269989252e-05, + "loss": 0.0, + "step": 53903 + }, + { + "epoch": 5.029765792665858, + "grad_norm": NaN, + "learning_rate": 2.0150430479276668e-05, + "loss": 0.0, + "step": 53904 + }, + { + "epoch": 5.029859102360735, + "grad_norm": NaN, + "learning_rate": 2.0146644018733077e-05, + "loss": 0.0, + "step": 53905 + }, + { + "epoch": 5.0299524120556125, + "grad_norm": NaN, + "learning_rate": 2.014285788836812e-05, + "loss": 0.0, + "step": 53906 + }, + { + "epoch": 5.03004572175049, + "grad_norm": NaN, + "learning_rate": 2.013907208819139e-05, + "loss": 0.0, + "step": 53907 + }, + { + "epoch": 5.030139031445367, + "grad_norm": NaN, + "learning_rate": 2.013528661821254e-05, + "loss": 0.0, + "step": 53908 + }, + { + "epoch": 5.030232341140245, + "grad_norm": NaN, + "learning_rate": 2.013150147844119e-05, + "loss": 0.0, + "step": 53909 + }, + { + "epoch": 5.030325650835122, + "grad_norm": NaN, + "learning_rate": 2.012771666888696e-05, + "loss": 0.0, + "step": 53910 + }, + { + "epoch": 5.030418960529999, + "grad_norm": NaN, + "learning_rate": 2.012393218955946e-05, + "loss": 0.0, + "step": 53911 + }, + { + "epoch": 5.030512270224876, + "grad_norm": NaN, + "learning_rate": 2.012014804046835e-05, + "loss": 0.0, + "step": 53912 + }, + { + "epoch": 5.0306055799197535, + "grad_norm": NaN, + "learning_rate": 2.0116364221623217e-05, + "loss": 0.0, + "step": 53913 + }, + { + "epoch": 5.030698889614631, + "grad_norm": NaN, + "learning_rate": 2.0112580733033707e-05, + "loss": 0.0, + "step": 53914 + }, + { + "epoch": 5.030792199309508, + "grad_norm": NaN, + "learning_rate": 2.010879757470943e-05, + "loss": 0.0, + "step": 53915 + }, + { + "epoch": 5.030885509004386, + "grad_norm": NaN, + "learning_rate": 2.0105014746659993e-05, + "loss": 0.0, + "step": 53916 + }, + { + "epoch": 5.030978818699263, + "grad_norm": NaN, + "learning_rate": 2.010123224889502e-05, + "loss": 0.0, + "step": 53917 + }, + { + "epoch": 5.03107212839414, + "grad_norm": NaN, + "learning_rate": 2.009745008142416e-05, + "loss": 0.0, + "step": 53918 + }, + { + "epoch": 5.031165438089017, + "grad_norm": NaN, + "learning_rate": 2.0093668244256983e-05, + "loss": 0.0, + "step": 53919 + }, + { + "epoch": 5.031258747783895, + "grad_norm": NaN, + "learning_rate": 2.0089886737403134e-05, + "loss": 0.0, + "step": 53920 + }, + { + "epoch": 5.031352057478772, + "grad_norm": NaN, + "learning_rate": 2.008610556087222e-05, + "loss": 0.0, + "step": 53921 + }, + { + "epoch": 5.031445367173649, + "grad_norm": NaN, + "learning_rate": 2.008232471467385e-05, + "loss": 0.0, + "step": 53922 + }, + { + "epoch": 5.031538676868527, + "grad_norm": NaN, + "learning_rate": 2.0078544198817648e-05, + "loss": 0.0, + "step": 53923 + }, + { + "epoch": 5.031631986563404, + "grad_norm": NaN, + "learning_rate": 2.0074764013313228e-05, + "loss": 0.0, + "step": 53924 + }, + { + "epoch": 5.031725296258281, + "grad_norm": NaN, + "learning_rate": 2.0070984158170194e-05, + "loss": 0.0, + "step": 53925 + }, + { + "epoch": 5.031818605953158, + "grad_norm": NaN, + "learning_rate": 2.006720463339816e-05, + "loss": 0.0, + "step": 53926 + }, + { + "epoch": 5.031911915648036, + "grad_norm": NaN, + "learning_rate": 2.006342543900673e-05, + "loss": 0.0, + "step": 53927 + }, + { + "epoch": 5.032005225342913, + "grad_norm": NaN, + "learning_rate": 2.0059646575005517e-05, + "loss": 0.0, + "step": 53928 + }, + { + "epoch": 5.0320985350377905, + "grad_norm": NaN, + "learning_rate": 2.0055868041404144e-05, + "loss": 0.0, + "step": 53929 + }, + { + "epoch": 5.032191844732668, + "grad_norm": NaN, + "learning_rate": 2.0052089838212187e-05, + "loss": 0.0, + "step": 53930 + }, + { + "epoch": 5.032285154427545, + "grad_norm": NaN, + "learning_rate": 2.004831196543929e-05, + "loss": 0.0, + "step": 53931 + }, + { + "epoch": 5.032378464122423, + "grad_norm": NaN, + "learning_rate": 2.0044534423095045e-05, + "loss": 0.0, + "step": 53932 + }, + { + "epoch": 5.032471773817299, + "grad_norm": NaN, + "learning_rate": 2.004075721118904e-05, + "loss": 0.0, + "step": 53933 + }, + { + "epoch": 5.032565083512177, + "grad_norm": NaN, + "learning_rate": 2.003698032973089e-05, + "loss": 0.0, + "step": 53934 + }, + { + "epoch": 5.032658393207054, + "grad_norm": NaN, + "learning_rate": 2.0033203778730218e-05, + "loss": 0.0, + "step": 53935 + }, + { + "epoch": 5.0327517029019315, + "grad_norm": NaN, + "learning_rate": 2.0029427558196598e-05, + "loss": 0.0, + "step": 53936 + }, + { + "epoch": 5.032845012596809, + "grad_norm": NaN, + "learning_rate": 2.0025651668139643e-05, + "loss": 0.0, + "step": 53937 + }, + { + "epoch": 5.032938322291686, + "grad_norm": NaN, + "learning_rate": 2.0021876108568956e-05, + "loss": 0.0, + "step": 53938 + }, + { + "epoch": 5.033031631986564, + "grad_norm": NaN, + "learning_rate": 2.0018100879494137e-05, + "loss": 0.0, + "step": 53939 + }, + { + "epoch": 5.03312494168144, + "grad_norm": NaN, + "learning_rate": 2.001432598092477e-05, + "loss": 0.0, + "step": 53940 + }, + { + "epoch": 5.033218251376318, + "grad_norm": NaN, + "learning_rate": 2.001055141287049e-05, + "loss": 0.0, + "step": 53941 + }, + { + "epoch": 5.033311561071195, + "grad_norm": NaN, + "learning_rate": 2.0006777175340865e-05, + "loss": 0.0, + "step": 53942 + }, + { + "epoch": 5.033404870766073, + "grad_norm": NaN, + "learning_rate": 2.0003003268345492e-05, + "loss": 0.0, + "step": 53943 + }, + { + "epoch": 5.03349818046095, + "grad_norm": NaN, + "learning_rate": 1.9999229691893976e-05, + "loss": 0.0, + "step": 53944 + }, + { + "epoch": 5.033591490155827, + "grad_norm": NaN, + "learning_rate": 1.999545644599591e-05, + "loss": 0.0, + "step": 53945 + }, + { + "epoch": 5.033684799850705, + "grad_norm": NaN, + "learning_rate": 1.999168353066089e-05, + "loss": 0.0, + "step": 53946 + }, + { + "epoch": 5.033778109545581, + "grad_norm": NaN, + "learning_rate": 1.998791094589852e-05, + "loss": 0.0, + "step": 53947 + }, + { + "epoch": 5.033871419240459, + "grad_norm": NaN, + "learning_rate": 1.9984138691718365e-05, + "loss": 0.0, + "step": 53948 + }, + { + "epoch": 5.033964728935336, + "grad_norm": NaN, + "learning_rate": 1.998036676813003e-05, + "loss": 0.0, + "step": 53949 + }, + { + "epoch": 5.034058038630214, + "grad_norm": NaN, + "learning_rate": 1.9976595175143107e-05, + "loss": 0.0, + "step": 53950 + }, + { + "epoch": 5.034151348325091, + "grad_norm": NaN, + "learning_rate": 1.9972823912767206e-05, + "loss": 0.0, + "step": 53951 + }, + { + "epoch": 5.0342446580199685, + "grad_norm": NaN, + "learning_rate": 1.996905298101187e-05, + "loss": 0.0, + "step": 53952 + }, + { + "epoch": 5.034337967714846, + "grad_norm": NaN, + "learning_rate": 1.996528237988674e-05, + "loss": 0.0, + "step": 53953 + }, + { + "epoch": 5.034431277409723, + "grad_norm": NaN, + "learning_rate": 1.9961512109401357e-05, + "loss": 0.0, + "step": 53954 + }, + { + "epoch": 5.0345245871046, + "grad_norm": NaN, + "learning_rate": 1.9957742169565332e-05, + "loss": 0.0, + "step": 53955 + }, + { + "epoch": 5.034617896799477, + "grad_norm": NaN, + "learning_rate": 1.995397256038826e-05, + "loss": 0.0, + "step": 53956 + }, + { + "epoch": 5.034711206494355, + "grad_norm": NaN, + "learning_rate": 1.9950203281879696e-05, + "loss": 0.0, + "step": 53957 + }, + { + "epoch": 5.034804516189232, + "grad_norm": NaN, + "learning_rate": 1.9946434334049248e-05, + "loss": 0.0, + "step": 53958 + }, + { + "epoch": 5.0348978258841095, + "grad_norm": NaN, + "learning_rate": 1.9942665716906497e-05, + "loss": 0.0, + "step": 53959 + }, + { + "epoch": 5.034991135578987, + "grad_norm": NaN, + "learning_rate": 1.9938897430461014e-05, + "loss": 0.0, + "step": 53960 + }, + { + "epoch": 5.035084445273864, + "grad_norm": NaN, + "learning_rate": 1.9935129474722395e-05, + "loss": 0.0, + "step": 53961 + }, + { + "epoch": 5.035177754968741, + "grad_norm": NaN, + "learning_rate": 1.993136184970021e-05, + "loss": 0.0, + "step": 53962 + }, + { + "epoch": 5.035271064663618, + "grad_norm": NaN, + "learning_rate": 1.992759455540406e-05, + "loss": 0.0, + "step": 53963 + }, + { + "epoch": 5.035364374358496, + "grad_norm": NaN, + "learning_rate": 1.9923827591843477e-05, + "loss": 0.0, + "step": 53964 + }, + { + "epoch": 5.035457684053373, + "grad_norm": NaN, + "learning_rate": 1.9920060959028094e-05, + "loss": 0.0, + "step": 53965 + }, + { + "epoch": 5.0355509937482505, + "grad_norm": NaN, + "learning_rate": 1.9916294656967453e-05, + "loss": 0.0, + "step": 53966 + }, + { + "epoch": 5.035644303443128, + "grad_norm": NaN, + "learning_rate": 1.991252868567114e-05, + "loss": 0.0, + "step": 53967 + }, + { + "epoch": 5.035737613138005, + "grad_norm": NaN, + "learning_rate": 1.9908763045148723e-05, + "loss": 0.0, + "step": 53968 + }, + { + "epoch": 5.035830922832882, + "grad_norm": NaN, + "learning_rate": 1.9904997735409805e-05, + "loss": 0.0, + "step": 53969 + }, + { + "epoch": 5.035924232527759, + "grad_norm": NaN, + "learning_rate": 1.9901232756463932e-05, + "loss": 0.0, + "step": 53970 + }, + { + "epoch": 5.036017542222637, + "grad_norm": NaN, + "learning_rate": 1.9897468108320674e-05, + "loss": 0.0, + "step": 53971 + }, + { + "epoch": 5.036110851917514, + "grad_norm": NaN, + "learning_rate": 1.989370379098963e-05, + "loss": 0.0, + "step": 53972 + }, + { + "epoch": 5.036204161612392, + "grad_norm": NaN, + "learning_rate": 1.9889939804480354e-05, + "loss": 0.0, + "step": 53973 + }, + { + "epoch": 5.036297471307269, + "grad_norm": NaN, + "learning_rate": 1.9886176148802425e-05, + "loss": 0.0, + "step": 53974 + }, + { + "epoch": 5.036390781002146, + "grad_norm": NaN, + "learning_rate": 1.9882412823965398e-05, + "loss": 0.0, + "step": 53975 + }, + { + "epoch": 5.036484090697024, + "grad_norm": NaN, + "learning_rate": 1.9878649829978854e-05, + "loss": 0.0, + "step": 53976 + }, + { + "epoch": 5.0365774003919, + "grad_norm": NaN, + "learning_rate": 1.9874887166852364e-05, + "loss": 0.0, + "step": 53977 + }, + { + "epoch": 5.036670710086778, + "grad_norm": NaN, + "learning_rate": 1.9871124834595474e-05, + "loss": 0.0, + "step": 53978 + }, + { + "epoch": 5.036764019781655, + "grad_norm": NaN, + "learning_rate": 1.9867362833217794e-05, + "loss": 0.0, + "step": 53979 + }, + { + "epoch": 5.036857329476533, + "grad_norm": NaN, + "learning_rate": 1.9863601162728843e-05, + "loss": 0.0, + "step": 53980 + }, + { + "epoch": 5.03695063917141, + "grad_norm": NaN, + "learning_rate": 1.98598398231382e-05, + "loss": 0.0, + "step": 53981 + }, + { + "epoch": 5.0370439488662875, + "grad_norm": NaN, + "learning_rate": 1.9856078814455444e-05, + "loss": 0.0, + "step": 53982 + }, + { + "epoch": 5.037137258561165, + "grad_norm": NaN, + "learning_rate": 1.985231813669013e-05, + "loss": 0.0, + "step": 53983 + }, + { + "epoch": 5.037230568256041, + "grad_norm": NaN, + "learning_rate": 1.984855778985182e-05, + "loss": 0.0, + "step": 53984 + }, + { + "epoch": 5.037323877950919, + "grad_norm": NaN, + "learning_rate": 1.9844797773950072e-05, + "loss": 0.0, + "step": 53985 + }, + { + "epoch": 5.037417187645796, + "grad_norm": NaN, + "learning_rate": 1.9841038088994444e-05, + "loss": 0.0, + "step": 53986 + }, + { + "epoch": 5.037510497340674, + "grad_norm": NaN, + "learning_rate": 1.9837278734994494e-05, + "loss": 0.0, + "step": 53987 + }, + { + "epoch": 5.037603807035551, + "grad_norm": NaN, + "learning_rate": 1.98335197119598e-05, + "loss": 0.0, + "step": 53988 + }, + { + "epoch": 5.0376971167304285, + "grad_norm": NaN, + "learning_rate": 1.9829761019899905e-05, + "loss": 0.0, + "step": 53989 + }, + { + "epoch": 5.037790426425306, + "grad_norm": NaN, + "learning_rate": 1.9826002658824363e-05, + "loss": 0.0, + "step": 53990 + }, + { + "epoch": 5.0378837361201825, + "grad_norm": NaN, + "learning_rate": 1.982224462874274e-05, + "loss": 0.0, + "step": 53991 + }, + { + "epoch": 5.03797704581506, + "grad_norm": NaN, + "learning_rate": 1.9818486929664574e-05, + "loss": 0.0, + "step": 53992 + }, + { + "epoch": 5.038070355509937, + "grad_norm": NaN, + "learning_rate": 1.981472956159944e-05, + "loss": 0.0, + "step": 53993 + }, + { + "epoch": 5.038163665204815, + "grad_norm": NaN, + "learning_rate": 1.98109725245569e-05, + "loss": 0.0, + "step": 53994 + }, + { + "epoch": 5.038256974899692, + "grad_norm": NaN, + "learning_rate": 1.980721581854648e-05, + "loss": 0.0, + "step": 53995 + }, + { + "epoch": 5.03835028459457, + "grad_norm": NaN, + "learning_rate": 1.9803459443577753e-05, + "loss": 0.0, + "step": 53996 + }, + { + "epoch": 5.038443594289447, + "grad_norm": NaN, + "learning_rate": 1.9799703399660242e-05, + "loss": 0.0, + "step": 53997 + }, + { + "epoch": 5.038536903984324, + "grad_norm": NaN, + "learning_rate": 1.9795947686803533e-05, + "loss": 0.0, + "step": 53998 + }, + { + "epoch": 5.038630213679201, + "grad_norm": NaN, + "learning_rate": 1.979219230501716e-05, + "loss": 0.0, + "step": 53999 + }, + { + "epoch": 5.038723523374078, + "grad_norm": NaN, + "learning_rate": 1.978843725431066e-05, + "loss": 0.0, + "step": 54000 + }, + { + "epoch": 5.038816833068956, + "grad_norm": NaN, + "learning_rate": 1.9784682534693608e-05, + "loss": 0.0, + "step": 54001 + }, + { + "epoch": 5.038910142763833, + "grad_norm": NaN, + "learning_rate": 1.9780928146175534e-05, + "loss": 0.0, + "step": 54002 + }, + { + "epoch": 5.039003452458711, + "grad_norm": NaN, + "learning_rate": 1.977717408876597e-05, + "loss": 0.0, + "step": 54003 + }, + { + "epoch": 5.039096762153588, + "grad_norm": NaN, + "learning_rate": 1.9773420362474486e-05, + "loss": 0.0, + "step": 54004 + }, + { + "epoch": 5.0391900718484655, + "grad_norm": NaN, + "learning_rate": 1.9769666967310628e-05, + "loss": 0.0, + "step": 54005 + }, + { + "epoch": 5.039283381543342, + "grad_norm": NaN, + "learning_rate": 1.976591390328392e-05, + "loss": 0.0, + "step": 54006 + }, + { + "epoch": 5.039376691238219, + "grad_norm": NaN, + "learning_rate": 1.9762161170403905e-05, + "loss": 0.0, + "step": 54007 + }, + { + "epoch": 5.039470000933097, + "grad_norm": NaN, + "learning_rate": 1.9758408768680162e-05, + "loss": 0.0, + "step": 54008 + }, + { + "epoch": 5.039563310627974, + "grad_norm": NaN, + "learning_rate": 1.975465669812218e-05, + "loss": 0.0, + "step": 54009 + }, + { + "epoch": 5.039656620322852, + "grad_norm": NaN, + "learning_rate": 1.9750904958739532e-05, + "loss": 0.0, + "step": 54010 + }, + { + "epoch": 5.039749930017729, + "grad_norm": NaN, + "learning_rate": 1.974715355054175e-05, + "loss": 0.0, + "step": 54011 + }, + { + "epoch": 5.0398432397126065, + "grad_norm": NaN, + "learning_rate": 1.9743402473538374e-05, + "loss": 0.0, + "step": 54012 + }, + { + "epoch": 5.039936549407483, + "grad_norm": NaN, + "learning_rate": 1.973965172773894e-05, + "loss": 0.0, + "step": 54013 + }, + { + "epoch": 5.0400298591023605, + "grad_norm": NaN, + "learning_rate": 1.9735901313152985e-05, + "loss": 0.0, + "step": 54014 + }, + { + "epoch": 5.040123168797238, + "grad_norm": NaN, + "learning_rate": 1.973215122979006e-05, + "loss": 0.0, + "step": 54015 + }, + { + "epoch": 5.040216478492115, + "grad_norm": NaN, + "learning_rate": 1.972840147765966e-05, + "loss": 0.0, + "step": 54016 + }, + { + "epoch": 5.040309788186993, + "grad_norm": NaN, + "learning_rate": 1.9724652056771363e-05, + "loss": 0.0, + "step": 54017 + }, + { + "epoch": 5.04040309788187, + "grad_norm": NaN, + "learning_rate": 1.972090296713469e-05, + "loss": 0.0, + "step": 54018 + }, + { + "epoch": 5.0404964075767476, + "grad_norm": NaN, + "learning_rate": 1.9717154208759167e-05, + "loss": 0.0, + "step": 54019 + }, + { + "epoch": 5.040589717271624, + "grad_norm": NaN, + "learning_rate": 1.9713405781654306e-05, + "loss": 0.0, + "step": 54020 + }, + { + "epoch": 5.0406830269665015, + "grad_norm": NaN, + "learning_rate": 1.9709657685829683e-05, + "loss": 0.0, + "step": 54021 + }, + { + "epoch": 5.040776336661379, + "grad_norm": NaN, + "learning_rate": 1.9705909921294804e-05, + "loss": 0.0, + "step": 54022 + }, + { + "epoch": 5.040869646356256, + "grad_norm": NaN, + "learning_rate": 1.9702162488059197e-05, + "loss": 0.0, + "step": 54023 + }, + { + "epoch": 5.040962956051134, + "grad_norm": NaN, + "learning_rate": 1.969841538613239e-05, + "loss": 0.0, + "step": 54024 + }, + { + "epoch": 5.041056265746011, + "grad_norm": NaN, + "learning_rate": 1.9694668615523917e-05, + "loss": 0.0, + "step": 54025 + }, + { + "epoch": 5.041149575440889, + "grad_norm": NaN, + "learning_rate": 1.9690922176243296e-05, + "loss": 0.0, + "step": 54026 + }, + { + "epoch": 5.041242885135766, + "grad_norm": NaN, + "learning_rate": 1.9687176068300066e-05, + "loss": 0.0, + "step": 54027 + }, + { + "epoch": 5.0413361948306425, + "grad_norm": NaN, + "learning_rate": 1.968343029170375e-05, + "loss": 0.0, + "step": 54028 + }, + { + "epoch": 5.04142950452552, + "grad_norm": NaN, + "learning_rate": 1.967968484646386e-05, + "loss": 0.0, + "step": 54029 + }, + { + "epoch": 5.041522814220397, + "grad_norm": NaN, + "learning_rate": 1.9675939732589942e-05, + "loss": 0.0, + "step": 54030 + }, + { + "epoch": 5.041616123915275, + "grad_norm": NaN, + "learning_rate": 1.967219495009148e-05, + "loss": 0.0, + "step": 54031 + }, + { + "epoch": 5.041709433610152, + "grad_norm": NaN, + "learning_rate": 1.966845049897804e-05, + "loss": 0.0, + "step": 54032 + }, + { + "epoch": 5.04180274330503, + "grad_norm": NaN, + "learning_rate": 1.9664706379259125e-05, + "loss": 0.0, + "step": 54033 + }, + { + "epoch": 5.041896052999907, + "grad_norm": NaN, + "learning_rate": 1.9660962590944235e-05, + "loss": 0.0, + "step": 54034 + }, + { + "epoch": 5.041989362694784, + "grad_norm": NaN, + "learning_rate": 1.9657219134042924e-05, + "loss": 0.0, + "step": 54035 + }, + { + "epoch": 5.042082672389661, + "grad_norm": NaN, + "learning_rate": 1.9653476008564684e-05, + "loss": 0.0, + "step": 54036 + }, + { + "epoch": 5.042175982084538, + "grad_norm": NaN, + "learning_rate": 1.964973321451906e-05, + "loss": 0.0, + "step": 54037 + }, + { + "epoch": 5.042269291779416, + "grad_norm": NaN, + "learning_rate": 1.9645990751915542e-05, + "loss": 0.0, + "step": 54038 + }, + { + "epoch": 5.042362601474293, + "grad_norm": NaN, + "learning_rate": 1.964224862076364e-05, + "loss": 0.0, + "step": 54039 + }, + { + "epoch": 5.042455911169171, + "grad_norm": NaN, + "learning_rate": 1.9638506821072897e-05, + "loss": 0.0, + "step": 54040 + }, + { + "epoch": 5.042549220864048, + "grad_norm": NaN, + "learning_rate": 1.963476535285279e-05, + "loss": 0.0, + "step": 54041 + }, + { + "epoch": 5.042642530558925, + "grad_norm": NaN, + "learning_rate": 1.9631024216112923e-05, + "loss": 0.0, + "step": 54042 + }, + { + "epoch": 5.042735840253802, + "grad_norm": NaN, + "learning_rate": 1.9627283410862694e-05, + "loss": 0.0, + "step": 54043 + }, + { + "epoch": 5.0428291499486795, + "grad_norm": NaN, + "learning_rate": 1.9623542937111643e-05, + "loss": 0.0, + "step": 54044 + }, + { + "epoch": 5.042922459643557, + "grad_norm": NaN, + "learning_rate": 1.9619802794869375e-05, + "loss": 0.0, + "step": 54045 + }, + { + "epoch": 5.043015769338434, + "grad_norm": NaN, + "learning_rate": 1.9616062984145294e-05, + "loss": 0.0, + "step": 54046 + }, + { + "epoch": 5.043109079033312, + "grad_norm": NaN, + "learning_rate": 1.961232350494893e-05, + "loss": 0.0, + "step": 54047 + }, + { + "epoch": 5.043202388728189, + "grad_norm": NaN, + "learning_rate": 1.9608584357289797e-05, + "loss": 0.0, + "step": 54048 + }, + { + "epoch": 5.043295698423067, + "grad_norm": NaN, + "learning_rate": 1.9604845541177423e-05, + "loss": 0.0, + "step": 54049 + }, + { + "epoch": 5.043389008117943, + "grad_norm": NaN, + "learning_rate": 1.9601107056621297e-05, + "loss": 0.0, + "step": 54050 + }, + { + "epoch": 5.0434823178128205, + "grad_norm": NaN, + "learning_rate": 1.959736890363093e-05, + "loss": 0.0, + "step": 54051 + }, + { + "epoch": 5.043575627507698, + "grad_norm": NaN, + "learning_rate": 1.9593631082215826e-05, + "loss": 0.0, + "step": 54052 + }, + { + "epoch": 5.043668937202575, + "grad_norm": NaN, + "learning_rate": 1.9589893592385486e-05, + "loss": 0.0, + "step": 54053 + }, + { + "epoch": 5.043762246897453, + "grad_norm": NaN, + "learning_rate": 1.958615643414943e-05, + "loss": 0.0, + "step": 54054 + }, + { + "epoch": 5.04385555659233, + "grad_norm": NaN, + "learning_rate": 1.9582419607517135e-05, + "loss": 0.0, + "step": 54055 + }, + { + "epoch": 5.043948866287208, + "grad_norm": NaN, + "learning_rate": 1.957868311249811e-05, + "loss": 0.0, + "step": 54056 + }, + { + "epoch": 5.044042175982084, + "grad_norm": NaN, + "learning_rate": 1.9574946949101868e-05, + "loss": 0.0, + "step": 54057 + }, + { + "epoch": 5.044135485676962, + "grad_norm": NaN, + "learning_rate": 1.9571211117337892e-05, + "loss": 0.0, + "step": 54058 + }, + { + "epoch": 5.044228795371839, + "grad_norm": NaN, + "learning_rate": 1.95674756172157e-05, + "loss": 0.0, + "step": 54059 + }, + { + "epoch": 5.044322105066716, + "grad_norm": NaN, + "learning_rate": 1.956374044874478e-05, + "loss": 0.0, + "step": 54060 + }, + { + "epoch": 5.044415414761594, + "grad_norm": NaN, + "learning_rate": 1.9560005611934626e-05, + "loss": 0.0, + "step": 54061 + }, + { + "epoch": 5.044508724456471, + "grad_norm": NaN, + "learning_rate": 1.955627110679473e-05, + "loss": 0.0, + "step": 54062 + }, + { + "epoch": 5.044602034151349, + "grad_norm": NaN, + "learning_rate": 1.9552536933334618e-05, + "loss": 0.0, + "step": 54063 + }, + { + "epoch": 5.044695343846225, + "grad_norm": NaN, + "learning_rate": 1.9548803091563746e-05, + "loss": 0.0, + "step": 54064 + }, + { + "epoch": 5.044788653541103, + "grad_norm": NaN, + "learning_rate": 1.954506958149163e-05, + "loss": 0.0, + "step": 54065 + }, + { + "epoch": 5.04488196323598, + "grad_norm": NaN, + "learning_rate": 1.9541336403127754e-05, + "loss": 0.0, + "step": 54066 + }, + { + "epoch": 5.0449752729308575, + "grad_norm": NaN, + "learning_rate": 1.9537603556481618e-05, + "loss": 0.0, + "step": 54067 + }, + { + "epoch": 5.045068582625735, + "grad_norm": NaN, + "learning_rate": 1.953387104156271e-05, + "loss": 0.0, + "step": 54068 + }, + { + "epoch": 5.045161892320612, + "grad_norm": NaN, + "learning_rate": 1.9530138858380524e-05, + "loss": 0.0, + "step": 54069 + }, + { + "epoch": 5.04525520201549, + "grad_norm": NaN, + "learning_rate": 1.952640700694455e-05, + "loss": 0.0, + "step": 54070 + }, + { + "epoch": 5.045348511710367, + "grad_norm": NaN, + "learning_rate": 1.9522675487264265e-05, + "loss": 0.0, + "step": 54071 + }, + { + "epoch": 5.045441821405244, + "grad_norm": NaN, + "learning_rate": 1.9518944299349183e-05, + "loss": 0.0, + "step": 54072 + }, + { + "epoch": 5.045535131100121, + "grad_norm": NaN, + "learning_rate": 1.9515213443208754e-05, + "loss": 0.0, + "step": 54073 + }, + { + "epoch": 5.0456284407949985, + "grad_norm": NaN, + "learning_rate": 1.9511482918852493e-05, + "loss": 0.0, + "step": 54074 + }, + { + "epoch": 5.045721750489876, + "grad_norm": NaN, + "learning_rate": 1.9507752726289876e-05, + "loss": 0.0, + "step": 54075 + }, + { + "epoch": 5.045815060184753, + "grad_norm": NaN, + "learning_rate": 1.9504022865530394e-05, + "loss": 0.0, + "step": 54076 + }, + { + "epoch": 5.045908369879631, + "grad_norm": NaN, + "learning_rate": 1.950029333658352e-05, + "loss": 0.0, + "step": 54077 + }, + { + "epoch": 5.046001679574508, + "grad_norm": NaN, + "learning_rate": 1.9496564139458754e-05, + "loss": 0.0, + "step": 54078 + }, + { + "epoch": 5.046094989269385, + "grad_norm": NaN, + "learning_rate": 1.9492835274165513e-05, + "loss": 0.0, + "step": 54079 + }, + { + "epoch": 5.046188298964262, + "grad_norm": NaN, + "learning_rate": 1.948910674071341e-05, + "loss": 0.0, + "step": 54080 + }, + { + "epoch": 5.0462816086591396, + "grad_norm": NaN, + "learning_rate": 1.9485378539111824e-05, + "loss": 0.0, + "step": 54081 + }, + { + "epoch": 5.046374918354017, + "grad_norm": NaN, + "learning_rate": 1.9481650669370226e-05, + "loss": 0.0, + "step": 54082 + }, + { + "epoch": 5.046468228048894, + "grad_norm": NaN, + "learning_rate": 1.9477923131498174e-05, + "loss": 0.0, + "step": 54083 + }, + { + "epoch": 5.046561537743772, + "grad_norm": NaN, + "learning_rate": 1.9474195925505082e-05, + "loss": 0.0, + "step": 54084 + }, + { + "epoch": 5.046654847438649, + "grad_norm": NaN, + "learning_rate": 1.9470469051400424e-05, + "loss": 0.0, + "step": 54085 + }, + { + "epoch": 5.046748157133526, + "grad_norm": NaN, + "learning_rate": 1.946674250919374e-05, + "loss": 0.0, + "step": 54086 + }, + { + "epoch": 5.046841466828403, + "grad_norm": NaN, + "learning_rate": 1.946301629889444e-05, + "loss": 0.0, + "step": 54087 + }, + { + "epoch": 5.046934776523281, + "grad_norm": NaN, + "learning_rate": 1.9459290420511985e-05, + "loss": 0.0, + "step": 54088 + }, + { + "epoch": 5.047028086218158, + "grad_norm": NaN, + "learning_rate": 1.9455564874055967e-05, + "loss": 0.0, + "step": 54089 + }, + { + "epoch": 5.047121395913035, + "grad_norm": NaN, + "learning_rate": 1.945183965953573e-05, + "loss": 0.0, + "step": 54090 + }, + { + "epoch": 5.047214705607913, + "grad_norm": NaN, + "learning_rate": 1.9448114776960794e-05, + "loss": 0.0, + "step": 54091 + }, + { + "epoch": 5.04730801530279, + "grad_norm": NaN, + "learning_rate": 1.9444390226340623e-05, + "loss": 0.0, + "step": 54092 + }, + { + "epoch": 5.047401324997668, + "grad_norm": NaN, + "learning_rate": 1.944066600768471e-05, + "loss": 0.0, + "step": 54093 + }, + { + "epoch": 5.047494634692544, + "grad_norm": NaN, + "learning_rate": 1.9436942121002507e-05, + "loss": 0.0, + "step": 54094 + }, + { + "epoch": 5.047587944387422, + "grad_norm": NaN, + "learning_rate": 1.9433218566303465e-05, + "loss": 0.0, + "step": 54095 + }, + { + "epoch": 5.047681254082299, + "grad_norm": NaN, + "learning_rate": 1.942949534359709e-05, + "loss": 0.0, + "step": 54096 + }, + { + "epoch": 5.0477745637771765, + "grad_norm": NaN, + "learning_rate": 1.9425772452892823e-05, + "loss": 0.0, + "step": 54097 + }, + { + "epoch": 5.047867873472054, + "grad_norm": NaN, + "learning_rate": 1.942204989420014e-05, + "loss": 0.0, + "step": 54098 + }, + { + "epoch": 5.047961183166931, + "grad_norm": NaN, + "learning_rate": 1.9418327667528506e-05, + "loss": 0.0, + "step": 54099 + }, + { + "epoch": 5.048054492861809, + "grad_norm": NaN, + "learning_rate": 1.941460577288739e-05, + "loss": 0.0, + "step": 54100 + }, + { + "epoch": 5.048147802556685, + "grad_norm": NaN, + "learning_rate": 1.941088421028625e-05, + "loss": 0.0, + "step": 54101 + }, + { + "epoch": 5.048241112251563, + "grad_norm": NaN, + "learning_rate": 1.940716297973453e-05, + "loss": 0.0, + "step": 54102 + }, + { + "epoch": 5.04833442194644, + "grad_norm": NaN, + "learning_rate": 1.9403442081241728e-05, + "loss": 0.0, + "step": 54103 + }, + { + "epoch": 5.0484277316413175, + "grad_norm": NaN, + "learning_rate": 1.9399721514817278e-05, + "loss": 0.0, + "step": 54104 + }, + { + "epoch": 5.048521041336195, + "grad_norm": NaN, + "learning_rate": 1.9396001280470645e-05, + "loss": 0.0, + "step": 54105 + }, + { + "epoch": 5.048614351031072, + "grad_norm": NaN, + "learning_rate": 1.9392281378211306e-05, + "loss": 0.0, + "step": 54106 + }, + { + "epoch": 5.04870766072595, + "grad_norm": NaN, + "learning_rate": 1.9388561808048696e-05, + "loss": 0.0, + "step": 54107 + }, + { + "epoch": 5.048800970420826, + "grad_norm": NaN, + "learning_rate": 1.938484256999228e-05, + "loss": 0.0, + "step": 54108 + }, + { + "epoch": 5.048894280115704, + "grad_norm": NaN, + "learning_rate": 1.9381123664051533e-05, + "loss": 0.0, + "step": 54109 + }, + { + "epoch": 5.048987589810581, + "grad_norm": NaN, + "learning_rate": 1.9377405090235882e-05, + "loss": 0.0, + "step": 54110 + }, + { + "epoch": 5.049080899505459, + "grad_norm": NaN, + "learning_rate": 1.9373686848554797e-05, + "loss": 0.0, + "step": 54111 + }, + { + "epoch": 5.049174209200336, + "grad_norm": NaN, + "learning_rate": 1.9369968939017742e-05, + "loss": 0.0, + "step": 54112 + }, + { + "epoch": 5.049267518895213, + "grad_norm": NaN, + "learning_rate": 1.936625136163416e-05, + "loss": 0.0, + "step": 54113 + }, + { + "epoch": 5.049360828590091, + "grad_norm": NaN, + "learning_rate": 1.9362534116413507e-05, + "loss": 0.0, + "step": 54114 + }, + { + "epoch": 5.049454138284968, + "grad_norm": NaN, + "learning_rate": 1.9358817203365213e-05, + "loss": 0.0, + "step": 54115 + }, + { + "epoch": 5.049547447979845, + "grad_norm": NaN, + "learning_rate": 1.9355100622498764e-05, + "loss": 0.0, + "step": 54116 + }, + { + "epoch": 5.049640757674722, + "grad_norm": NaN, + "learning_rate": 1.935138437382356e-05, + "loss": 0.0, + "step": 54117 + }, + { + "epoch": 5.0497340673696, + "grad_norm": NaN, + "learning_rate": 1.9347668457349152e-05, + "loss": 0.0, + "step": 54118 + }, + { + "epoch": 5.049827377064477, + "grad_norm": NaN, + "learning_rate": 1.934395287308489e-05, + "loss": 0.0, + "step": 54119 + }, + { + "epoch": 5.0499206867593545, + "grad_norm": NaN, + "learning_rate": 1.934023762104021e-05, + "loss": 0.0, + "step": 54120 + }, + { + "epoch": 5.050013996454232, + "grad_norm": NaN, + "learning_rate": 1.9336522701224676e-05, + "loss": 0.0, + "step": 54121 + }, + { + "epoch": 5.050107306149109, + "grad_norm": NaN, + "learning_rate": 1.9332808113647614e-05, + "loss": 0.0, + "step": 54122 + }, + { + "epoch": 5.050200615843986, + "grad_norm": NaN, + "learning_rate": 1.9329093858318494e-05, + "loss": 0.0, + "step": 54123 + }, + { + "epoch": 5.050293925538863, + "grad_norm": NaN, + "learning_rate": 1.9325379935246833e-05, + "loss": 0.0, + "step": 54124 + }, + { + "epoch": 5.050387235233741, + "grad_norm": NaN, + "learning_rate": 1.9321666344442e-05, + "loss": 0.0, + "step": 54125 + }, + { + "epoch": 5.050480544928618, + "grad_norm": NaN, + "learning_rate": 1.9317953085913422e-05, + "loss": 0.0, + "step": 54126 + }, + { + "epoch": 5.0505738546234955, + "grad_norm": NaN, + "learning_rate": 1.9314240159670662e-05, + "loss": 0.0, + "step": 54127 + }, + { + "epoch": 5.050667164318373, + "grad_norm": NaN, + "learning_rate": 1.9310527565723027e-05, + "loss": 0.0, + "step": 54128 + }, + { + "epoch": 5.05076047401325, + "grad_norm": NaN, + "learning_rate": 1.9306815304079977e-05, + "loss": 0.0, + "step": 54129 + }, + { + "epoch": 5.050853783708127, + "grad_norm": NaN, + "learning_rate": 1.930310337475105e-05, + "loss": 0.0, + "step": 54130 + }, + { + "epoch": 5.050947093403004, + "grad_norm": NaN, + "learning_rate": 1.9299391777745577e-05, + "loss": 0.0, + "step": 54131 + }, + { + "epoch": 5.051040403097882, + "grad_norm": NaN, + "learning_rate": 1.9295680513072997e-05, + "loss": 0.0, + "step": 54132 + }, + { + "epoch": 5.051133712792759, + "grad_norm": NaN, + "learning_rate": 1.929196958074287e-05, + "loss": 0.0, + "step": 54133 + }, + { + "epoch": 5.0512270224876366, + "grad_norm": NaN, + "learning_rate": 1.9288258980764492e-05, + "loss": 0.0, + "step": 54134 + }, + { + "epoch": 5.051320332182514, + "grad_norm": NaN, + "learning_rate": 1.9284548713147334e-05, + "loss": 0.0, + "step": 54135 + }, + { + "epoch": 5.051413641877391, + "grad_norm": NaN, + "learning_rate": 1.9280838777900907e-05, + "loss": 0.0, + "step": 54136 + }, + { + "epoch": 5.051506951572268, + "grad_norm": NaN, + "learning_rate": 1.9277129175034556e-05, + "loss": 0.0, + "step": 54137 + }, + { + "epoch": 5.051600261267145, + "grad_norm": NaN, + "learning_rate": 1.9273419904557736e-05, + "loss": 0.0, + "step": 54138 + }, + { + "epoch": 5.051693570962023, + "grad_norm": NaN, + "learning_rate": 1.926971096647989e-05, + "loss": 0.0, + "step": 54139 + }, + { + "epoch": 5.0517868806569, + "grad_norm": NaN, + "learning_rate": 1.926600236081043e-05, + "loss": 0.0, + "step": 54140 + }, + { + "epoch": 5.051880190351778, + "grad_norm": NaN, + "learning_rate": 1.926229408755881e-05, + "loss": 0.0, + "step": 54141 + }, + { + "epoch": 5.051973500046655, + "grad_norm": NaN, + "learning_rate": 1.9258586146734462e-05, + "loss": 0.0, + "step": 54142 + }, + { + "epoch": 5.052066809741532, + "grad_norm": NaN, + "learning_rate": 1.925487853834679e-05, + "loss": 0.0, + "step": 54143 + }, + { + "epoch": 5.05216011943641, + "grad_norm": NaN, + "learning_rate": 1.9251171262405224e-05, + "loss": 0.0, + "step": 54144 + }, + { + "epoch": 5.052253429131286, + "grad_norm": NaN, + "learning_rate": 1.9247464318919204e-05, + "loss": 0.0, + "step": 54145 + }, + { + "epoch": 5.052346738826164, + "grad_norm": NaN, + "learning_rate": 1.9243757707898156e-05, + "loss": 0.0, + "step": 54146 + }, + { + "epoch": 5.052440048521041, + "grad_norm": NaN, + "learning_rate": 1.9240051429351506e-05, + "loss": 0.0, + "step": 54147 + }, + { + "epoch": 5.052533358215919, + "grad_norm": NaN, + "learning_rate": 1.9236345483288662e-05, + "loss": 0.0, + "step": 54148 + }, + { + "epoch": 5.052626667910796, + "grad_norm": NaN, + "learning_rate": 1.923263986971905e-05, + "loss": 0.0, + "step": 54149 + }, + { + "epoch": 5.0527199776056735, + "grad_norm": NaN, + "learning_rate": 1.92289345886521e-05, + "loss": 0.0, + "step": 54150 + }, + { + "epoch": 5.052813287300551, + "grad_norm": NaN, + "learning_rate": 1.9225229640097234e-05, + "loss": 0.0, + "step": 54151 + }, + { + "epoch": 5.052906596995427, + "grad_norm": NaN, + "learning_rate": 1.9221525024063878e-05, + "loss": 0.0, + "step": 54152 + }, + { + "epoch": 5.052999906690305, + "grad_norm": NaN, + "learning_rate": 1.921782074056144e-05, + "loss": 0.0, + "step": 54153 + }, + { + "epoch": 5.053093216385182, + "grad_norm": NaN, + "learning_rate": 1.921411678959935e-05, + "loss": 0.0, + "step": 54154 + }, + { + "epoch": 5.05318652608006, + "grad_norm": NaN, + "learning_rate": 1.921041317118698e-05, + "loss": 0.0, + "step": 54155 + }, + { + "epoch": 5.053279835774937, + "grad_norm": NaN, + "learning_rate": 1.9206709885333837e-05, + "loss": 0.0, + "step": 54156 + }, + { + "epoch": 5.0533731454698145, + "grad_norm": NaN, + "learning_rate": 1.920300693204927e-05, + "loss": 0.0, + "step": 54157 + }, + { + "epoch": 5.053466455164692, + "grad_norm": NaN, + "learning_rate": 1.9199304311342683e-05, + "loss": 0.0, + "step": 54158 + }, + { + "epoch": 5.0535597648595685, + "grad_norm": NaN, + "learning_rate": 1.919560202322359e-05, + "loss": 0.0, + "step": 54159 + }, + { + "epoch": 5.053653074554446, + "grad_norm": NaN, + "learning_rate": 1.919190006770128e-05, + "loss": 0.0, + "step": 54160 + }, + { + "epoch": 5.053746384249323, + "grad_norm": NaN, + "learning_rate": 1.9188198444785208e-05, + "loss": 0.0, + "step": 54161 + }, + { + "epoch": 5.053839693944201, + "grad_norm": NaN, + "learning_rate": 1.918449715448484e-05, + "loss": 0.0, + "step": 54162 + }, + { + "epoch": 5.053933003639078, + "grad_norm": NaN, + "learning_rate": 1.9180796196809535e-05, + "loss": 0.0, + "step": 54163 + }, + { + "epoch": 5.054026313333956, + "grad_norm": NaN, + "learning_rate": 1.9177095571768662e-05, + "loss": 0.0, + "step": 54164 + }, + { + "epoch": 5.054119623028833, + "grad_norm": NaN, + "learning_rate": 1.9173395279371756e-05, + "loss": 0.0, + "step": 54165 + }, + { + "epoch": 5.05421293272371, + "grad_norm": NaN, + "learning_rate": 1.916969531962812e-05, + "loss": 0.0, + "step": 54166 + }, + { + "epoch": 5.054306242418587, + "grad_norm": NaN, + "learning_rate": 1.9165995692547164e-05, + "loss": 0.0, + "step": 54167 + }, + { + "epoch": 5.054399552113464, + "grad_norm": NaN, + "learning_rate": 1.9162296398138398e-05, + "loss": 0.0, + "step": 54168 + }, + { + "epoch": 5.054492861808342, + "grad_norm": NaN, + "learning_rate": 1.9158597436411115e-05, + "loss": 0.0, + "step": 54169 + }, + { + "epoch": 5.054586171503219, + "grad_norm": NaN, + "learning_rate": 1.9154898807374725e-05, + "loss": 0.0, + "step": 54170 + }, + { + "epoch": 5.054679481198097, + "grad_norm": NaN, + "learning_rate": 1.9151200511038732e-05, + "loss": 0.0, + "step": 54171 + }, + { + "epoch": 5.054772790892974, + "grad_norm": NaN, + "learning_rate": 1.9147502547412452e-05, + "loss": 0.0, + "step": 54172 + }, + { + "epoch": 5.0548661005878515, + "grad_norm": NaN, + "learning_rate": 1.9143804916505274e-05, + "loss": 0.0, + "step": 54173 + }, + { + "epoch": 5.054959410282728, + "grad_norm": NaN, + "learning_rate": 1.9140107618326688e-05, + "loss": 0.0, + "step": 54174 + }, + { + "epoch": 5.055052719977605, + "grad_norm": NaN, + "learning_rate": 1.9136410652886026e-05, + "loss": 0.0, + "step": 54175 + }, + { + "epoch": 5.055146029672483, + "grad_norm": NaN, + "learning_rate": 1.9132714020192675e-05, + "loss": 0.0, + "step": 54176 + }, + { + "epoch": 5.05523933936736, + "grad_norm": NaN, + "learning_rate": 1.912901772025613e-05, + "loss": 0.0, + "step": 54177 + }, + { + "epoch": 5.055332649062238, + "grad_norm": NaN, + "learning_rate": 1.91253217530857e-05, + "loss": 0.0, + "step": 54178 + }, + { + "epoch": 5.055425958757115, + "grad_norm": NaN, + "learning_rate": 1.9121626118690774e-05, + "loss": 0.0, + "step": 54179 + }, + { + "epoch": 5.0555192684519925, + "grad_norm": NaN, + "learning_rate": 1.911793081708085e-05, + "loss": 0.0, + "step": 54180 + }, + { + "epoch": 5.055612578146869, + "grad_norm": NaN, + "learning_rate": 1.911423584826522e-05, + "loss": 0.0, + "step": 54181 + }, + { + "epoch": 5.0557058878417465, + "grad_norm": NaN, + "learning_rate": 1.9110541212253305e-05, + "loss": 0.0, + "step": 54182 + }, + { + "epoch": 5.055799197536624, + "grad_norm": NaN, + "learning_rate": 1.9106846909054537e-05, + "loss": 0.0, + "step": 54183 + }, + { + "epoch": 5.055892507231501, + "grad_norm": NaN, + "learning_rate": 1.910315293867827e-05, + "loss": 0.0, + "step": 54184 + }, + { + "epoch": 5.055985816926379, + "grad_norm": NaN, + "learning_rate": 1.909945930113391e-05, + "loss": 0.0, + "step": 54185 + }, + { + "epoch": 5.056079126621256, + "grad_norm": NaN, + "learning_rate": 1.9095765996430845e-05, + "loss": 0.0, + "step": 54186 + }, + { + "epoch": 5.056172436316134, + "grad_norm": NaN, + "learning_rate": 1.9092073024578476e-05, + "loss": 0.0, + "step": 54187 + }, + { + "epoch": 5.056265746011011, + "grad_norm": NaN, + "learning_rate": 1.9088380385586193e-05, + "loss": 0.0, + "step": 54188 + }, + { + "epoch": 5.0563590557058875, + "grad_norm": NaN, + "learning_rate": 1.9084688079463377e-05, + "loss": 0.0, + "step": 54189 + }, + { + "epoch": 5.056452365400765, + "grad_norm": NaN, + "learning_rate": 1.9080996106219387e-05, + "loss": 0.0, + "step": 54190 + }, + { + "epoch": 5.056545675095642, + "grad_norm": NaN, + "learning_rate": 1.9077304465863695e-05, + "loss": 0.0, + "step": 54191 + }, + { + "epoch": 5.05663898479052, + "grad_norm": NaN, + "learning_rate": 1.9073613158405616e-05, + "loss": 0.0, + "step": 54192 + }, + { + "epoch": 5.056732294485397, + "grad_norm": NaN, + "learning_rate": 1.906992218385452e-05, + "loss": 0.0, + "step": 54193 + }, + { + "epoch": 5.056825604180275, + "grad_norm": NaN, + "learning_rate": 1.90662315422199e-05, + "loss": 0.0, + "step": 54194 + }, + { + "epoch": 5.056918913875152, + "grad_norm": NaN, + "learning_rate": 1.906254123351102e-05, + "loss": 0.0, + "step": 54195 + }, + { + "epoch": 5.0570122235700286, + "grad_norm": NaN, + "learning_rate": 1.9058851257737283e-05, + "loss": 0.0, + "step": 54196 + }, + { + "epoch": 5.057105533264906, + "grad_norm": NaN, + "learning_rate": 1.9055161614908173e-05, + "loss": 0.0, + "step": 54197 + }, + { + "epoch": 5.057198842959783, + "grad_norm": NaN, + "learning_rate": 1.9051472305032972e-05, + "loss": 0.0, + "step": 54198 + }, + { + "epoch": 5.057292152654661, + "grad_norm": NaN, + "learning_rate": 1.904778332812103e-05, + "loss": 0.0, + "step": 54199 + }, + { + "epoch": 5.057385462349538, + "grad_norm": NaN, + "learning_rate": 1.904409468418187e-05, + "loss": 0.0, + "step": 54200 + }, + { + "epoch": 5.057478772044416, + "grad_norm": NaN, + "learning_rate": 1.9040406373224754e-05, + "loss": 0.0, + "step": 54201 + }, + { + "epoch": 5.057572081739293, + "grad_norm": NaN, + "learning_rate": 1.9036718395259037e-05, + "loss": 0.0, + "step": 54202 + }, + { + "epoch": 5.05766539143417, + "grad_norm": NaN, + "learning_rate": 1.903303075029423e-05, + "loss": 0.0, + "step": 54203 + }, + { + "epoch": 5.057758701129047, + "grad_norm": NaN, + "learning_rate": 1.9029343438339596e-05, + "loss": 0.0, + "step": 54204 + }, + { + "epoch": 5.057852010823924, + "grad_norm": NaN, + "learning_rate": 1.902565645940452e-05, + "loss": 0.0, + "step": 54205 + }, + { + "epoch": 5.057945320518802, + "grad_norm": NaN, + "learning_rate": 1.9021969813498466e-05, + "loss": 0.0, + "step": 54206 + }, + { + "epoch": 5.058038630213679, + "grad_norm": NaN, + "learning_rate": 1.901828350063071e-05, + "loss": 0.0, + "step": 54207 + }, + { + "epoch": 5.058131939908557, + "grad_norm": NaN, + "learning_rate": 1.9014597520810626e-05, + "loss": 0.0, + "step": 54208 + }, + { + "epoch": 5.058225249603434, + "grad_norm": NaN, + "learning_rate": 1.9010911874047675e-05, + "loss": 0.0, + "step": 54209 + }, + { + "epoch": 5.0583185592983115, + "grad_norm": NaN, + "learning_rate": 1.9007226560351147e-05, + "loss": 0.0, + "step": 54210 + }, + { + "epoch": 5.058411868993188, + "grad_norm": NaN, + "learning_rate": 1.9003541579730404e-05, + "loss": 0.0, + "step": 54211 + }, + { + "epoch": 5.0585051786880655, + "grad_norm": NaN, + "learning_rate": 1.8999856932194934e-05, + "loss": 0.0, + "step": 54212 + }, + { + "epoch": 5.058598488382943, + "grad_norm": NaN, + "learning_rate": 1.899617261775397e-05, + "loss": 0.0, + "step": 54213 + }, + { + "epoch": 5.05869179807782, + "grad_norm": NaN, + "learning_rate": 1.8992488636416896e-05, + "loss": 0.0, + "step": 54214 + }, + { + "epoch": 5.058785107772698, + "grad_norm": NaN, + "learning_rate": 1.8988804988193195e-05, + "loss": 0.0, + "step": 54215 + }, + { + "epoch": 5.058878417467575, + "grad_norm": NaN, + "learning_rate": 1.8985121673092126e-05, + "loss": 0.0, + "step": 54216 + }, + { + "epoch": 5.058971727162453, + "grad_norm": NaN, + "learning_rate": 1.898143869112304e-05, + "loss": 0.0, + "step": 54217 + }, + { + "epoch": 5.059065036857329, + "grad_norm": NaN, + "learning_rate": 1.897775604229542e-05, + "loss": 0.0, + "step": 54218 + }, + { + "epoch": 5.0591583465522065, + "grad_norm": NaN, + "learning_rate": 1.897407372661851e-05, + "loss": 0.0, + "step": 54219 + }, + { + "epoch": 5.059251656247084, + "grad_norm": NaN, + "learning_rate": 1.89703917441017e-05, + "loss": 0.0, + "step": 54220 + }, + { + "epoch": 5.059344965941961, + "grad_norm": NaN, + "learning_rate": 1.896671009475441e-05, + "loss": 0.0, + "step": 54221 + }, + { + "epoch": 5.059438275636839, + "grad_norm": NaN, + "learning_rate": 1.8963028778585944e-05, + "loss": 0.0, + "step": 54222 + }, + { + "epoch": 5.059531585331716, + "grad_norm": NaN, + "learning_rate": 1.8959347795605652e-05, + "loss": 0.0, + "step": 54223 + }, + { + "epoch": 5.059624895026594, + "grad_norm": NaN, + "learning_rate": 1.8955667145822978e-05, + "loss": 0.0, + "step": 54224 + }, + { + "epoch": 5.05971820472147, + "grad_norm": NaN, + "learning_rate": 1.8951986829247203e-05, + "loss": 0.0, + "step": 54225 + }, + { + "epoch": 5.059811514416348, + "grad_norm": NaN, + "learning_rate": 1.8948306845887694e-05, + "loss": 0.0, + "step": 54226 + }, + { + "epoch": 5.059904824111225, + "grad_norm": NaN, + "learning_rate": 1.8944627195753836e-05, + "loss": 0.0, + "step": 54227 + }, + { + "epoch": 5.059998133806102, + "grad_norm": NaN, + "learning_rate": 1.8940947878854917e-05, + "loss": 0.0, + "step": 54228 + }, + { + "epoch": 5.06009144350098, + "grad_norm": NaN, + "learning_rate": 1.893726889520043e-05, + "loss": 0.0, + "step": 54229 + }, + { + "epoch": 5.060184753195857, + "grad_norm": NaN, + "learning_rate": 1.89335902447996e-05, + "loss": 0.0, + "step": 54230 + }, + { + "epoch": 5.060278062890735, + "grad_norm": NaN, + "learning_rate": 1.892991192766179e-05, + "loss": 0.0, + "step": 54231 + }, + { + "epoch": 5.060371372585611, + "grad_norm": NaN, + "learning_rate": 1.8926233943796455e-05, + "loss": 0.0, + "step": 54232 + }, + { + "epoch": 5.060464682280489, + "grad_norm": NaN, + "learning_rate": 1.8922556293212852e-05, + "loss": 0.0, + "step": 54233 + }, + { + "epoch": 5.060557991975366, + "grad_norm": NaN, + "learning_rate": 1.8918878975920333e-05, + "loss": 0.0, + "step": 54234 + }, + { + "epoch": 5.0606513016702435, + "grad_norm": NaN, + "learning_rate": 1.8915201991928332e-05, + "loss": 0.0, + "step": 54235 + }, + { + "epoch": 5.060744611365121, + "grad_norm": NaN, + "learning_rate": 1.891152534124611e-05, + "loss": 0.0, + "step": 54236 + }, + { + "epoch": 5.060837921059998, + "grad_norm": NaN, + "learning_rate": 1.8907849023883008e-05, + "loss": 0.0, + "step": 54237 + }, + { + "epoch": 5.060931230754876, + "grad_norm": NaN, + "learning_rate": 1.8904173039848493e-05, + "loss": 0.0, + "step": 54238 + }, + { + "epoch": 5.061024540449753, + "grad_norm": NaN, + "learning_rate": 1.8900497389151796e-05, + "loss": 0.0, + "step": 54239 + }, + { + "epoch": 5.06111785014463, + "grad_norm": NaN, + "learning_rate": 1.8896822071802253e-05, + "loss": 0.0, + "step": 54240 + }, + { + "epoch": 5.061211159839507, + "grad_norm": NaN, + "learning_rate": 1.8893147087809336e-05, + "loss": 0.0, + "step": 54241 + }, + { + "epoch": 5.0613044695343845, + "grad_norm": NaN, + "learning_rate": 1.8889472437182286e-05, + "loss": 0.0, + "step": 54242 + }, + { + "epoch": 5.061397779229262, + "grad_norm": NaN, + "learning_rate": 1.888579811993041e-05, + "loss": 0.0, + "step": 54243 + }, + { + "epoch": 5.061491088924139, + "grad_norm": NaN, + "learning_rate": 1.8882124136063194e-05, + "loss": 0.0, + "step": 54244 + }, + { + "epoch": 5.061584398619017, + "grad_norm": NaN, + "learning_rate": 1.887845048558987e-05, + "loss": 0.0, + "step": 54245 + }, + { + "epoch": 5.061677708313894, + "grad_norm": NaN, + "learning_rate": 1.887477716851975e-05, + "loss": 0.0, + "step": 54246 + }, + { + "epoch": 5.061771018008771, + "grad_norm": NaN, + "learning_rate": 1.8871104184862312e-05, + "loss": 0.0, + "step": 54247 + }, + { + "epoch": 5.061864327703648, + "grad_norm": NaN, + "learning_rate": 1.8867431534626783e-05, + "loss": 0.0, + "step": 54248 + }, + { + "epoch": 5.0619576373985256, + "grad_norm": NaN, + "learning_rate": 1.8863759217822483e-05, + "loss": 0.0, + "step": 54249 + }, + { + "epoch": 5.062050947093403, + "grad_norm": NaN, + "learning_rate": 1.8860087234458876e-05, + "loss": 0.0, + "step": 54250 + }, + { + "epoch": 5.06214425678828, + "grad_norm": NaN, + "learning_rate": 1.8856415584545188e-05, + "loss": 0.0, + "step": 54251 + }, + { + "epoch": 5.062237566483158, + "grad_norm": NaN, + "learning_rate": 1.8852744268090743e-05, + "loss": 0.0, + "step": 54252 + }, + { + "epoch": 5.062330876178035, + "grad_norm": NaN, + "learning_rate": 1.8849073285105e-05, + "loss": 0.0, + "step": 54253 + }, + { + "epoch": 5.062424185872912, + "grad_norm": NaN, + "learning_rate": 1.884540263559717e-05, + "loss": 0.0, + "step": 54254 + }, + { + "epoch": 5.062517495567789, + "grad_norm": NaN, + "learning_rate": 1.8841732319576607e-05, + "loss": 0.0, + "step": 54255 + }, + { + "epoch": 5.062610805262667, + "grad_norm": NaN, + "learning_rate": 1.8838062337052727e-05, + "loss": 0.0, + "step": 54256 + }, + { + "epoch": 5.062704114957544, + "grad_norm": NaN, + "learning_rate": 1.8834392688034772e-05, + "loss": 0.0, + "step": 54257 + }, + { + "epoch": 5.062797424652421, + "grad_norm": NaN, + "learning_rate": 1.8830723372532065e-05, + "loss": 0.0, + "step": 54258 + }, + { + "epoch": 5.062890734347299, + "grad_norm": NaN, + "learning_rate": 1.8827054390554046e-05, + "loss": 0.0, + "step": 54259 + }, + { + "epoch": 5.062984044042176, + "grad_norm": NaN, + "learning_rate": 1.882338574210993e-05, + "loss": 0.0, + "step": 54260 + }, + { + "epoch": 5.063077353737054, + "grad_norm": NaN, + "learning_rate": 1.8819717427209058e-05, + "loss": 0.0, + "step": 54261 + }, + { + "epoch": 5.06317066343193, + "grad_norm": NaN, + "learning_rate": 1.8816049445860852e-05, + "loss": 0.0, + "step": 54262 + }, + { + "epoch": 5.063263973126808, + "grad_norm": NaN, + "learning_rate": 1.881238179807453e-05, + "loss": 0.0, + "step": 54263 + }, + { + "epoch": 5.063357282821685, + "grad_norm": NaN, + "learning_rate": 1.880871448385944e-05, + "loss": 0.0, + "step": 54264 + }, + { + "epoch": 5.0634505925165625, + "grad_norm": NaN, + "learning_rate": 1.8805047503224997e-05, + "loss": 0.0, + "step": 54265 + }, + { + "epoch": 5.06354390221144, + "grad_norm": NaN, + "learning_rate": 1.8801380856180376e-05, + "loss": 0.0, + "step": 54266 + }, + { + "epoch": 5.063637211906317, + "grad_norm": NaN, + "learning_rate": 1.8797714542735022e-05, + "loss": 0.0, + "step": 54267 + }, + { + "epoch": 5.063730521601195, + "grad_norm": NaN, + "learning_rate": 1.879404856289824e-05, + "loss": 0.0, + "step": 54268 + }, + { + "epoch": 5.063823831296071, + "grad_norm": NaN, + "learning_rate": 1.879038291667926e-05, + "loss": 0.0, + "step": 54269 + }, + { + "epoch": 5.063917140990949, + "grad_norm": NaN, + "learning_rate": 1.8786717604087537e-05, + "loss": 0.0, + "step": 54270 + }, + { + "epoch": 5.064010450685826, + "grad_norm": NaN, + "learning_rate": 1.87830526251323e-05, + "loss": 0.0, + "step": 54271 + }, + { + "epoch": 5.0641037603807035, + "grad_norm": NaN, + "learning_rate": 1.8779387979822858e-05, + "loss": 0.0, + "step": 54272 + }, + { + "epoch": 5.064197070075581, + "grad_norm": NaN, + "learning_rate": 1.877572366816862e-05, + "loss": 0.0, + "step": 54273 + }, + { + "epoch": 5.064290379770458, + "grad_norm": NaN, + "learning_rate": 1.8772059690178807e-05, + "loss": 0.0, + "step": 54274 + }, + { + "epoch": 5.064383689465336, + "grad_norm": NaN, + "learning_rate": 1.8768396045862755e-05, + "loss": 0.0, + "step": 54275 + }, + { + "epoch": 5.064476999160212, + "grad_norm": NaN, + "learning_rate": 1.8764732735229847e-05, + "loss": 0.0, + "step": 54276 + }, + { + "epoch": 5.06457030885509, + "grad_norm": NaN, + "learning_rate": 1.8761069758289333e-05, + "loss": 0.0, + "step": 54277 + }, + { + "epoch": 5.064663618549967, + "grad_norm": NaN, + "learning_rate": 1.87574071150505e-05, + "loss": 0.0, + "step": 54278 + }, + { + "epoch": 5.064756928244845, + "grad_norm": NaN, + "learning_rate": 1.875374480552276e-05, + "loss": 0.0, + "step": 54279 + }, + { + "epoch": 5.064850237939722, + "grad_norm": NaN, + "learning_rate": 1.875008282971535e-05, + "loss": 0.0, + "step": 54280 + }, + { + "epoch": 5.064943547634599, + "grad_norm": NaN, + "learning_rate": 1.8746421187637572e-05, + "loss": 0.0, + "step": 54281 + }, + { + "epoch": 5.065036857329477, + "grad_norm": NaN, + "learning_rate": 1.8742759879298815e-05, + "loss": 0.0, + "step": 54282 + }, + { + "epoch": 5.065130167024354, + "grad_norm": NaN, + "learning_rate": 1.8739098904708318e-05, + "loss": 0.0, + "step": 54283 + }, + { + "epoch": 5.065223476719231, + "grad_norm": NaN, + "learning_rate": 1.8735438263875375e-05, + "loss": 0.0, + "step": 54284 + }, + { + "epoch": 5.065316786414108, + "grad_norm": NaN, + "learning_rate": 1.8731777956809396e-05, + "loss": 0.0, + "step": 54285 + }, + { + "epoch": 5.065410096108986, + "grad_norm": NaN, + "learning_rate": 1.8728117983519575e-05, + "loss": 0.0, + "step": 54286 + }, + { + "epoch": 5.065503405803863, + "grad_norm": NaN, + "learning_rate": 1.8724458344015237e-05, + "loss": 0.0, + "step": 54287 + }, + { + "epoch": 5.0655967154987405, + "grad_norm": NaN, + "learning_rate": 1.872079903830579e-05, + "loss": 0.0, + "step": 54288 + }, + { + "epoch": 5.065690025193618, + "grad_norm": NaN, + "learning_rate": 1.8717140066400427e-05, + "loss": 0.0, + "step": 54289 + }, + { + "epoch": 5.065783334888495, + "grad_norm": NaN, + "learning_rate": 1.871348142830846e-05, + "loss": 0.0, + "step": 54290 + }, + { + "epoch": 5.065876644583372, + "grad_norm": NaN, + "learning_rate": 1.8709823124039276e-05, + "loss": 0.0, + "step": 54291 + }, + { + "epoch": 5.065969954278249, + "grad_norm": NaN, + "learning_rate": 1.8706165153602088e-05, + "loss": 0.0, + "step": 54292 + }, + { + "epoch": 5.066063263973127, + "grad_norm": NaN, + "learning_rate": 1.8702507517006206e-05, + "loss": 0.0, + "step": 54293 + }, + { + "epoch": 5.066156573668004, + "grad_norm": NaN, + "learning_rate": 1.8698850214261e-05, + "loss": 0.0, + "step": 54294 + }, + { + "epoch": 5.0662498833628815, + "grad_norm": NaN, + "learning_rate": 1.8695193245375705e-05, + "loss": 0.0, + "step": 54295 + }, + { + "epoch": 5.066343193057759, + "grad_norm": NaN, + "learning_rate": 1.869153661035961e-05, + "loss": 0.0, + "step": 54296 + }, + { + "epoch": 5.066436502752636, + "grad_norm": NaN, + "learning_rate": 1.8687880309222103e-05, + "loss": 0.0, + "step": 54297 + }, + { + "epoch": 5.066529812447513, + "grad_norm": NaN, + "learning_rate": 1.868422434197238e-05, + "loss": 0.0, + "step": 54298 + }, + { + "epoch": 5.06662312214239, + "grad_norm": NaN, + "learning_rate": 1.8680568708619752e-05, + "loss": 0.0, + "step": 54299 + }, + { + "epoch": 5.066716431837268, + "grad_norm": NaN, + "learning_rate": 1.8676913409173593e-05, + "loss": 0.0, + "step": 54300 + }, + { + "epoch": 5.066809741532145, + "grad_norm": NaN, + "learning_rate": 1.8673258443643113e-05, + "loss": 0.0, + "step": 54301 + }, + { + "epoch": 5.066903051227023, + "grad_norm": NaN, + "learning_rate": 1.866960381203762e-05, + "loss": 0.0, + "step": 54302 + }, + { + "epoch": 5.0669963609219, + "grad_norm": NaN, + "learning_rate": 1.8665949514366457e-05, + "loss": 0.0, + "step": 54303 + }, + { + "epoch": 5.067089670616777, + "grad_norm": NaN, + "learning_rate": 1.8662295550638835e-05, + "loss": 0.0, + "step": 54304 + }, + { + "epoch": 5.067182980311655, + "grad_norm": NaN, + "learning_rate": 1.8658641920864127e-05, + "loss": 0.0, + "step": 54305 + }, + { + "epoch": 5.067276290006531, + "grad_norm": NaN, + "learning_rate": 1.865498862505161e-05, + "loss": 0.0, + "step": 54306 + }, + { + "epoch": 5.067369599701409, + "grad_norm": NaN, + "learning_rate": 1.8651335663210475e-05, + "loss": 0.0, + "step": 54307 + }, + { + "epoch": 5.067462909396286, + "grad_norm": NaN, + "learning_rate": 1.8647683035350135e-05, + "loss": 0.0, + "step": 54308 + }, + { + "epoch": 5.067556219091164, + "grad_norm": NaN, + "learning_rate": 1.8644030741479847e-05, + "loss": 0.0, + "step": 54309 + }, + { + "epoch": 5.067649528786041, + "grad_norm": NaN, + "learning_rate": 1.8640378781608822e-05, + "loss": 0.0, + "step": 54310 + }, + { + "epoch": 5.067742838480918, + "grad_norm": NaN, + "learning_rate": 1.8636727155746433e-05, + "loss": 0.0, + "step": 54311 + }, + { + "epoch": 5.067836148175796, + "grad_norm": NaN, + "learning_rate": 1.863307586390196e-05, + "loss": 0.0, + "step": 54312 + }, + { + "epoch": 5.067929457870672, + "grad_norm": NaN, + "learning_rate": 1.8629424906084605e-05, + "loss": 0.0, + "step": 54313 + }, + { + "epoch": 5.06802276756555, + "grad_norm": NaN, + "learning_rate": 1.8625774282303734e-05, + "loss": 0.0, + "step": 54314 + }, + { + "epoch": 5.068116077260427, + "grad_norm": NaN, + "learning_rate": 1.862212399256864e-05, + "loss": 0.0, + "step": 54315 + }, + { + "epoch": 5.068209386955305, + "grad_norm": NaN, + "learning_rate": 1.8618474036888492e-05, + "loss": 0.0, + "step": 54316 + }, + { + "epoch": 5.068302696650182, + "grad_norm": NaN, + "learning_rate": 1.861482441527272e-05, + "loss": 0.0, + "step": 54317 + }, + { + "epoch": 5.0683960063450595, + "grad_norm": NaN, + "learning_rate": 1.8611175127730506e-05, + "loss": 0.0, + "step": 54318 + }, + { + "epoch": 5.068489316039937, + "grad_norm": NaN, + "learning_rate": 1.8607526174271115e-05, + "loss": 0.0, + "step": 54319 + }, + { + "epoch": 5.068582625734813, + "grad_norm": NaN, + "learning_rate": 1.860387755490393e-05, + "loss": 0.0, + "step": 54320 + }, + { + "epoch": 5.068675935429691, + "grad_norm": NaN, + "learning_rate": 1.8600229269638124e-05, + "loss": 0.0, + "step": 54321 + }, + { + "epoch": 5.068769245124568, + "grad_norm": NaN, + "learning_rate": 1.8596581318482974e-05, + "loss": 0.0, + "step": 54322 + }, + { + "epoch": 5.068862554819446, + "grad_norm": NaN, + "learning_rate": 1.8592933701447872e-05, + "loss": 0.0, + "step": 54323 + }, + { + "epoch": 5.068955864514323, + "grad_norm": NaN, + "learning_rate": 1.8589286418541978e-05, + "loss": 0.0, + "step": 54324 + }, + { + "epoch": 5.0690491742092005, + "grad_norm": NaN, + "learning_rate": 1.8585639469774566e-05, + "loss": 0.0, + "step": 54325 + }, + { + "epoch": 5.069142483904078, + "grad_norm": NaN, + "learning_rate": 1.8581992855154998e-05, + "loss": 0.0, + "step": 54326 + }, + { + "epoch": 5.0692357935989545, + "grad_norm": NaN, + "learning_rate": 1.8578346574692482e-05, + "loss": 0.0, + "step": 54327 + }, + { + "epoch": 5.069329103293832, + "grad_norm": NaN, + "learning_rate": 1.857470062839626e-05, + "loss": 0.0, + "step": 54328 + }, + { + "epoch": 5.069422412988709, + "grad_norm": NaN, + "learning_rate": 1.857105501627571e-05, + "loss": 0.0, + "step": 54329 + }, + { + "epoch": 5.069515722683587, + "grad_norm": NaN, + "learning_rate": 1.8567409738339988e-05, + "loss": 0.0, + "step": 54330 + }, + { + "epoch": 5.069609032378464, + "grad_norm": NaN, + "learning_rate": 1.856376479459839e-05, + "loss": 0.0, + "step": 54331 + }, + { + "epoch": 5.069702342073342, + "grad_norm": NaN, + "learning_rate": 1.8560120185060273e-05, + "loss": 0.0, + "step": 54332 + }, + { + "epoch": 5.069795651768219, + "grad_norm": NaN, + "learning_rate": 1.8556475909734798e-05, + "loss": 0.0, + "step": 54333 + }, + { + "epoch": 5.069888961463096, + "grad_norm": NaN, + "learning_rate": 1.8552831968631236e-05, + "loss": 0.0, + "step": 54334 + }, + { + "epoch": 5.069982271157973, + "grad_norm": NaN, + "learning_rate": 1.8549188361758955e-05, + "loss": 0.0, + "step": 54335 + }, + { + "epoch": 5.07007558085285, + "grad_norm": NaN, + "learning_rate": 1.8545545089127108e-05, + "loss": 0.0, + "step": 54336 + }, + { + "epoch": 5.070168890547728, + "grad_norm": NaN, + "learning_rate": 1.8541902150744968e-05, + "loss": 0.0, + "step": 54337 + }, + { + "epoch": 5.070262200242605, + "grad_norm": NaN, + "learning_rate": 1.8538259546621887e-05, + "loss": 0.0, + "step": 54338 + }, + { + "epoch": 5.070355509937483, + "grad_norm": NaN, + "learning_rate": 1.853461727676705e-05, + "loss": 0.0, + "step": 54339 + }, + { + "epoch": 5.07044881963236, + "grad_norm": NaN, + "learning_rate": 1.8530975341189704e-05, + "loss": 0.0, + "step": 54340 + }, + { + "epoch": 5.0705421293272375, + "grad_norm": NaN, + "learning_rate": 1.852733373989922e-05, + "loss": 0.0, + "step": 54341 + }, + { + "epoch": 5.070635439022114, + "grad_norm": NaN, + "learning_rate": 1.8523692472904693e-05, + "loss": 0.0, + "step": 54342 + }, + { + "epoch": 5.070728748716991, + "grad_norm": NaN, + "learning_rate": 1.8520051540215518e-05, + "loss": 0.0, + "step": 54343 + }, + { + "epoch": 5.070822058411869, + "grad_norm": NaN, + "learning_rate": 1.8516410941840937e-05, + "loss": 0.0, + "step": 54344 + }, + { + "epoch": 5.070915368106746, + "grad_norm": NaN, + "learning_rate": 1.8512770677790108e-05, + "loss": 0.0, + "step": 54345 + }, + { + "epoch": 5.071008677801624, + "grad_norm": NaN, + "learning_rate": 1.8509130748072388e-05, + "loss": 0.0, + "step": 54346 + }, + { + "epoch": 5.071101987496501, + "grad_norm": NaN, + "learning_rate": 1.850549115269702e-05, + "loss": 0.0, + "step": 54347 + }, + { + "epoch": 5.0711952971913785, + "grad_norm": NaN, + "learning_rate": 1.8501851891673185e-05, + "loss": 0.0, + "step": 54348 + }, + { + "epoch": 5.071288606886255, + "grad_norm": NaN, + "learning_rate": 1.8498212965010205e-05, + "loss": 0.0, + "step": 54349 + }, + { + "epoch": 5.0713819165811325, + "grad_norm": NaN, + "learning_rate": 1.849457437271736e-05, + "loss": 0.0, + "step": 54350 + }, + { + "epoch": 5.07147522627601, + "grad_norm": NaN, + "learning_rate": 1.8490936114803784e-05, + "loss": 0.0, + "step": 54351 + }, + { + "epoch": 5.071568535970887, + "grad_norm": NaN, + "learning_rate": 1.8487298191278843e-05, + "loss": 0.0, + "step": 54352 + }, + { + "epoch": 5.071661845665765, + "grad_norm": NaN, + "learning_rate": 1.848366060215178e-05, + "loss": 0.0, + "step": 54353 + }, + { + "epoch": 5.071755155360642, + "grad_norm": NaN, + "learning_rate": 1.848002334743173e-05, + "loss": 0.0, + "step": 54354 + }, + { + "epoch": 5.07184846505552, + "grad_norm": NaN, + "learning_rate": 1.8476386427128083e-05, + "loss": 0.0, + "step": 54355 + }, + { + "epoch": 5.071941774750397, + "grad_norm": NaN, + "learning_rate": 1.8472749841250034e-05, + "loss": 0.0, + "step": 54356 + }, + { + "epoch": 5.0720350844452735, + "grad_norm": NaN, + "learning_rate": 1.846911358980675e-05, + "loss": 0.0, + "step": 54357 + }, + { + "epoch": 5.072128394140151, + "grad_norm": NaN, + "learning_rate": 1.8465477672807606e-05, + "loss": 0.0, + "step": 54358 + }, + { + "epoch": 5.072221703835028, + "grad_norm": NaN, + "learning_rate": 1.846184209026179e-05, + "loss": 0.0, + "step": 54359 + }, + { + "epoch": 5.072315013529906, + "grad_norm": NaN, + "learning_rate": 1.8458206842178502e-05, + "loss": 0.0, + "step": 54360 + }, + { + "epoch": 5.072408323224783, + "grad_norm": NaN, + "learning_rate": 1.8454571928567097e-05, + "loss": 0.0, + "step": 54361 + }, + { + "epoch": 5.072501632919661, + "grad_norm": NaN, + "learning_rate": 1.84509373494367e-05, + "loss": 0.0, + "step": 54362 + }, + { + "epoch": 5.072594942614538, + "grad_norm": NaN, + "learning_rate": 1.8447303104796573e-05, + "loss": 0.0, + "step": 54363 + }, + { + "epoch": 5.072688252309415, + "grad_norm": NaN, + "learning_rate": 1.844366919465606e-05, + "loss": 0.0, + "step": 54364 + }, + { + "epoch": 5.072781562004292, + "grad_norm": NaN, + "learning_rate": 1.8440035619024297e-05, + "loss": 0.0, + "step": 54365 + }, + { + "epoch": 5.072874871699169, + "grad_norm": NaN, + "learning_rate": 1.8436402377910496e-05, + "loss": 0.0, + "step": 54366 + }, + { + "epoch": 5.072968181394047, + "grad_norm": NaN, + "learning_rate": 1.843276947132404e-05, + "loss": 0.0, + "step": 54367 + }, + { + "epoch": 5.073061491088924, + "grad_norm": NaN, + "learning_rate": 1.8429136899274027e-05, + "loss": 0.0, + "step": 54368 + }, + { + "epoch": 5.073154800783802, + "grad_norm": NaN, + "learning_rate": 1.8425504661769724e-05, + "loss": 0.0, + "step": 54369 + }, + { + "epoch": 5.073248110478679, + "grad_norm": NaN, + "learning_rate": 1.8421872758820457e-05, + "loss": 0.0, + "step": 54370 + }, + { + "epoch": 5.073341420173556, + "grad_norm": NaN, + "learning_rate": 1.8418241190435346e-05, + "loss": 0.0, + "step": 54371 + }, + { + "epoch": 5.073434729868433, + "grad_norm": NaN, + "learning_rate": 1.8414609956623655e-05, + "loss": 0.0, + "step": 54372 + }, + { + "epoch": 5.07352803956331, + "grad_norm": NaN, + "learning_rate": 1.8410979057394674e-05, + "loss": 0.0, + "step": 54373 + }, + { + "epoch": 5.073621349258188, + "grad_norm": NaN, + "learning_rate": 1.8407348492757585e-05, + "loss": 0.0, + "step": 54374 + }, + { + "epoch": 5.073714658953065, + "grad_norm": NaN, + "learning_rate": 1.840371826272159e-05, + "loss": 0.0, + "step": 54375 + }, + { + "epoch": 5.073807968647943, + "grad_norm": NaN, + "learning_rate": 1.8400088367296018e-05, + "loss": 0.0, + "step": 54376 + }, + { + "epoch": 5.07390127834282, + "grad_norm": NaN, + "learning_rate": 1.8396458806489994e-05, + "loss": 0.0, + "step": 54377 + }, + { + "epoch": 5.0739945880376975, + "grad_norm": NaN, + "learning_rate": 1.839282958031278e-05, + "loss": 0.0, + "step": 54378 + }, + { + "epoch": 5.074087897732574, + "grad_norm": NaN, + "learning_rate": 1.8389200688773664e-05, + "loss": 0.0, + "step": 54379 + }, + { + "epoch": 5.0741812074274515, + "grad_norm": NaN, + "learning_rate": 1.8385572131881775e-05, + "loss": 0.0, + "step": 54380 + }, + { + "epoch": 5.074274517122329, + "grad_norm": NaN, + "learning_rate": 1.8381943909646407e-05, + "loss": 0.0, + "step": 54381 + }, + { + "epoch": 5.074367826817206, + "grad_norm": NaN, + "learning_rate": 1.83783160220768e-05, + "loss": 0.0, + "step": 54382 + }, + { + "epoch": 5.074461136512084, + "grad_norm": NaN, + "learning_rate": 1.8374688469182096e-05, + "loss": 0.0, + "step": 54383 + }, + { + "epoch": 5.074554446206961, + "grad_norm": NaN, + "learning_rate": 1.837106125097159e-05, + "loss": 0.0, + "step": 54384 + }, + { + "epoch": 5.074647755901839, + "grad_norm": NaN, + "learning_rate": 1.8367434367454527e-05, + "loss": 0.0, + "step": 54385 + }, + { + "epoch": 5.074741065596715, + "grad_norm": NaN, + "learning_rate": 1.8363807818640008e-05, + "loss": 0.0, + "step": 54386 + }, + { + "epoch": 5.0748343752915925, + "grad_norm": NaN, + "learning_rate": 1.8360181604537368e-05, + "loss": 0.0, + "step": 54387 + }, + { + "epoch": 5.07492768498647, + "grad_norm": NaN, + "learning_rate": 1.8356555725155826e-05, + "loss": 0.0, + "step": 54388 + }, + { + "epoch": 5.075020994681347, + "grad_norm": NaN, + "learning_rate": 1.8352930180504494e-05, + "loss": 0.0, + "step": 54389 + }, + { + "epoch": 5.075114304376225, + "grad_norm": NaN, + "learning_rate": 1.8349304970592715e-05, + "loss": 0.0, + "step": 54390 + }, + { + "epoch": 5.075207614071102, + "grad_norm": NaN, + "learning_rate": 1.8345680095429664e-05, + "loss": 0.0, + "step": 54391 + }, + { + "epoch": 5.07530092376598, + "grad_norm": NaN, + "learning_rate": 1.8342055555024504e-05, + "loss": 0.0, + "step": 54392 + }, + { + "epoch": 5.075394233460856, + "grad_norm": NaN, + "learning_rate": 1.8338431349386524e-05, + "loss": 0.0, + "step": 54393 + }, + { + "epoch": 5.075487543155734, + "grad_norm": NaN, + "learning_rate": 1.833480747852495e-05, + "loss": 0.0, + "step": 54394 + }, + { + "epoch": 5.075580852850611, + "grad_norm": NaN, + "learning_rate": 1.8331183942448873e-05, + "loss": 0.0, + "step": 54395 + }, + { + "epoch": 5.075674162545488, + "grad_norm": NaN, + "learning_rate": 1.832756074116764e-05, + "loss": 0.0, + "step": 54396 + }, + { + "epoch": 5.075767472240366, + "grad_norm": NaN, + "learning_rate": 1.8323937874690457e-05, + "loss": 0.0, + "step": 54397 + }, + { + "epoch": 5.075860781935243, + "grad_norm": NaN, + "learning_rate": 1.832031534302642e-05, + "loss": 0.0, + "step": 54398 + }, + { + "epoch": 5.075954091630121, + "grad_norm": NaN, + "learning_rate": 1.8316693146184854e-05, + "loss": 0.0, + "step": 54399 + }, + { + "epoch": 5.076047401324998, + "grad_norm": NaN, + "learning_rate": 1.8313071284174946e-05, + "loss": 0.0, + "step": 54400 + }, + { + "epoch": 5.076140711019875, + "grad_norm": NaN, + "learning_rate": 1.830944975700583e-05, + "loss": 0.0, + "step": 54401 + }, + { + "epoch": 5.076234020714752, + "grad_norm": NaN, + "learning_rate": 1.830582856468681e-05, + "loss": 0.0, + "step": 54402 + }, + { + "epoch": 5.0763273304096295, + "grad_norm": NaN, + "learning_rate": 1.8302207707227095e-05, + "loss": 0.0, + "step": 54403 + }, + { + "epoch": 5.076420640104507, + "grad_norm": NaN, + "learning_rate": 1.8298587184635783e-05, + "loss": 0.0, + "step": 54404 + }, + { + "epoch": 5.076513949799384, + "grad_norm": NaN, + "learning_rate": 1.829496699692221e-05, + "loss": 0.0, + "step": 54405 + }, + { + "epoch": 5.076607259494262, + "grad_norm": NaN, + "learning_rate": 1.829134714409549e-05, + "loss": 0.0, + "step": 54406 + }, + { + "epoch": 5.076700569189139, + "grad_norm": NaN, + "learning_rate": 1.828772762616485e-05, + "loss": 0.0, + "step": 54407 + }, + { + "epoch": 5.076793878884016, + "grad_norm": NaN, + "learning_rate": 1.8284108443139546e-05, + "loss": 0.0, + "step": 54408 + }, + { + "epoch": 5.076887188578893, + "grad_norm": NaN, + "learning_rate": 1.82804895950287e-05, + "loss": 0.0, + "step": 54409 + }, + { + "epoch": 5.0769804982737705, + "grad_norm": NaN, + "learning_rate": 1.827687108184153e-05, + "loss": 0.0, + "step": 54410 + }, + { + "epoch": 5.077073807968648, + "grad_norm": NaN, + "learning_rate": 1.8273252903587327e-05, + "loss": 0.0, + "step": 54411 + }, + { + "epoch": 5.077167117663525, + "grad_norm": NaN, + "learning_rate": 1.8269635060275177e-05, + "loss": 0.0, + "step": 54412 + }, + { + "epoch": 5.077260427358403, + "grad_norm": NaN, + "learning_rate": 1.8266017551914296e-05, + "loss": 0.0, + "step": 54413 + }, + { + "epoch": 5.07735373705328, + "grad_norm": NaN, + "learning_rate": 1.8262400378513976e-05, + "loss": 0.0, + "step": 54414 + }, + { + "epoch": 5.077447046748157, + "grad_norm": NaN, + "learning_rate": 1.8258783540083273e-05, + "loss": 0.0, + "step": 54415 + }, + { + "epoch": 5.077540356443034, + "grad_norm": NaN, + "learning_rate": 1.8255167036631513e-05, + "loss": 0.0, + "step": 54416 + }, + { + "epoch": 5.077633666137912, + "grad_norm": NaN, + "learning_rate": 1.8251550868167842e-05, + "loss": 0.0, + "step": 54417 + }, + { + "epoch": 5.077726975832789, + "grad_norm": NaN, + "learning_rate": 1.8247935034701416e-05, + "loss": 0.0, + "step": 54418 + }, + { + "epoch": 5.077820285527666, + "grad_norm": NaN, + "learning_rate": 1.824431953624148e-05, + "loss": 0.0, + "step": 54419 + }, + { + "epoch": 5.077913595222544, + "grad_norm": NaN, + "learning_rate": 1.8240704372797243e-05, + "loss": 0.0, + "step": 54420 + }, + { + "epoch": 5.078006904917421, + "grad_norm": NaN, + "learning_rate": 1.8237089544377797e-05, + "loss": 0.0, + "step": 54421 + }, + { + "epoch": 5.078100214612299, + "grad_norm": NaN, + "learning_rate": 1.8233475050992436e-05, + "loss": 0.0, + "step": 54422 + }, + { + "epoch": 5.078193524307175, + "grad_norm": NaN, + "learning_rate": 1.822986089265035e-05, + "loss": 0.0, + "step": 54423 + }, + { + "epoch": 5.078286834002053, + "grad_norm": NaN, + "learning_rate": 1.8226247069360635e-05, + "loss": 0.0, + "step": 54424 + }, + { + "epoch": 5.07838014369693, + "grad_norm": NaN, + "learning_rate": 1.8222633581132562e-05, + "loss": 0.0, + "step": 54425 + }, + { + "epoch": 5.0784734533918074, + "grad_norm": NaN, + "learning_rate": 1.8219020427975327e-05, + "loss": 0.0, + "step": 54426 + }, + { + "epoch": 5.078566763086685, + "grad_norm": NaN, + "learning_rate": 1.8215407609898024e-05, + "loss": 0.0, + "step": 54427 + }, + { + "epoch": 5.078660072781562, + "grad_norm": NaN, + "learning_rate": 1.8211795126909944e-05, + "loss": 0.0, + "step": 54428 + }, + { + "epoch": 5.07875338247644, + "grad_norm": NaN, + "learning_rate": 1.8208182979020247e-05, + "loss": 0.0, + "step": 54429 + }, + { + "epoch": 5.078846692171316, + "grad_norm": NaN, + "learning_rate": 1.820457116623806e-05, + "loss": 0.0, + "step": 54430 + }, + { + "epoch": 5.078940001866194, + "grad_norm": NaN, + "learning_rate": 1.820095968857262e-05, + "loss": 0.0, + "step": 54431 + }, + { + "epoch": 5.079033311561071, + "grad_norm": NaN, + "learning_rate": 1.8197348546033143e-05, + "loss": 0.0, + "step": 54432 + }, + { + "epoch": 5.0791266212559485, + "grad_norm": NaN, + "learning_rate": 1.8193737738628684e-05, + "loss": 0.0, + "step": 54433 + }, + { + "epoch": 5.079219930950826, + "grad_norm": NaN, + "learning_rate": 1.8190127266368553e-05, + "loss": 0.0, + "step": 54434 + }, + { + "epoch": 5.079313240645703, + "grad_norm": NaN, + "learning_rate": 1.8186517129261913e-05, + "loss": 0.0, + "step": 54435 + }, + { + "epoch": 5.079406550340581, + "grad_norm": NaN, + "learning_rate": 1.818290732731785e-05, + "loss": 0.0, + "step": 54436 + }, + { + "epoch": 5.079499860035457, + "grad_norm": NaN, + "learning_rate": 1.8179297860545632e-05, + "loss": 0.0, + "step": 54437 + }, + { + "epoch": 5.079593169730335, + "grad_norm": NaN, + "learning_rate": 1.8175688728954447e-05, + "loss": 0.0, + "step": 54438 + }, + { + "epoch": 5.079686479425212, + "grad_norm": NaN, + "learning_rate": 1.817207993255337e-05, + "loss": 0.0, + "step": 54439 + }, + { + "epoch": 5.0797797891200895, + "grad_norm": NaN, + "learning_rate": 1.816847147135169e-05, + "loss": 0.0, + "step": 54440 + }, + { + "epoch": 5.079873098814967, + "grad_norm": NaN, + "learning_rate": 1.8164863345358543e-05, + "loss": 0.0, + "step": 54441 + }, + { + "epoch": 5.079966408509844, + "grad_norm": NaN, + "learning_rate": 1.8161255554583047e-05, + "loss": 0.0, + "step": 54442 + }, + { + "epoch": 5.080059718204722, + "grad_norm": NaN, + "learning_rate": 1.8157648099034467e-05, + "loss": 0.0, + "step": 54443 + }, + { + "epoch": 5.080153027899598, + "grad_norm": NaN, + "learning_rate": 1.815404097872194e-05, + "loss": 0.0, + "step": 54444 + }, + { + "epoch": 5.080246337594476, + "grad_norm": NaN, + "learning_rate": 1.8150434193654596e-05, + "loss": 0.0, + "step": 54445 + }, + { + "epoch": 5.080339647289353, + "grad_norm": NaN, + "learning_rate": 1.814682774384166e-05, + "loss": 0.0, + "step": 54446 + }, + { + "epoch": 5.080432956984231, + "grad_norm": NaN, + "learning_rate": 1.8143221629292303e-05, + "loss": 0.0, + "step": 54447 + }, + { + "epoch": 5.080526266679108, + "grad_norm": NaN, + "learning_rate": 1.8139615850015627e-05, + "loss": 0.0, + "step": 54448 + }, + { + "epoch": 5.080619576373985, + "grad_norm": NaN, + "learning_rate": 1.8136010406020867e-05, + "loss": 0.0, + "step": 54449 + }, + { + "epoch": 5.080712886068863, + "grad_norm": NaN, + "learning_rate": 1.8132405297317204e-05, + "loss": 0.0, + "step": 54450 + }, + { + "epoch": 5.08080619576374, + "grad_norm": NaN, + "learning_rate": 1.812880052391373e-05, + "loss": 0.0, + "step": 54451 + }, + { + "epoch": 5.080899505458617, + "grad_norm": NaN, + "learning_rate": 1.8125196085819697e-05, + "loss": 0.0, + "step": 54452 + }, + { + "epoch": 5.080992815153494, + "grad_norm": NaN, + "learning_rate": 1.812159198304416e-05, + "loss": 0.0, + "step": 54453 + }, + { + "epoch": 5.081086124848372, + "grad_norm": NaN, + "learning_rate": 1.8117988215596386e-05, + "loss": 0.0, + "step": 54454 + }, + { + "epoch": 5.081179434543249, + "grad_norm": NaN, + "learning_rate": 1.8114384783485537e-05, + "loss": 0.0, + "step": 54455 + }, + { + "epoch": 5.0812727442381265, + "grad_norm": NaN, + "learning_rate": 1.811078168672067e-05, + "loss": 0.0, + "step": 54456 + }, + { + "epoch": 5.081366053933004, + "grad_norm": NaN, + "learning_rate": 1.8107178925311065e-05, + "loss": 0.0, + "step": 54457 + }, + { + "epoch": 5.081459363627881, + "grad_norm": NaN, + "learning_rate": 1.810357649926585e-05, + "loss": 0.0, + "step": 54458 + }, + { + "epoch": 5.081552673322758, + "grad_norm": NaN, + "learning_rate": 1.809997440859411e-05, + "loss": 0.0, + "step": 54459 + }, + { + "epoch": 5.081645983017635, + "grad_norm": NaN, + "learning_rate": 1.8096372653305093e-05, + "loss": 0.0, + "step": 54460 + }, + { + "epoch": 5.081739292712513, + "grad_norm": NaN, + "learning_rate": 1.809277123340797e-05, + "loss": 0.0, + "step": 54461 + }, + { + "epoch": 5.08183260240739, + "grad_norm": NaN, + "learning_rate": 1.8089170148911774e-05, + "loss": 0.0, + "step": 54462 + }, + { + "epoch": 5.0819259121022675, + "grad_norm": NaN, + "learning_rate": 1.8085569399825796e-05, + "loss": 0.0, + "step": 54463 + }, + { + "epoch": 5.082019221797145, + "grad_norm": NaN, + "learning_rate": 1.8081968986159156e-05, + "loss": 0.0, + "step": 54464 + }, + { + "epoch": 5.082112531492022, + "grad_norm": NaN, + "learning_rate": 1.8078368907920937e-05, + "loss": 0.0, + "step": 54465 + }, + { + "epoch": 5.082205841186899, + "grad_norm": NaN, + "learning_rate": 1.8074769165120363e-05, + "loss": 0.0, + "step": 54466 + }, + { + "epoch": 5.082299150881776, + "grad_norm": NaN, + "learning_rate": 1.8071169757766624e-05, + "loss": 0.0, + "step": 54467 + }, + { + "epoch": 5.082392460576654, + "grad_norm": NaN, + "learning_rate": 1.8067570685868748e-05, + "loss": 0.0, + "step": 54468 + }, + { + "epoch": 5.082485770271531, + "grad_norm": NaN, + "learning_rate": 1.8063971949435997e-05, + "loss": 0.0, + "step": 54469 + }, + { + "epoch": 5.082579079966409, + "grad_norm": NaN, + "learning_rate": 1.8060373548477506e-05, + "loss": 0.0, + "step": 54470 + }, + { + "epoch": 5.082672389661286, + "grad_norm": NaN, + "learning_rate": 1.8056775483002328e-05, + "loss": 0.0, + "step": 54471 + }, + { + "epoch": 5.082765699356163, + "grad_norm": NaN, + "learning_rate": 1.8053177753019733e-05, + "loss": 0.0, + "step": 54472 + }, + { + "epoch": 5.082859009051041, + "grad_norm": NaN, + "learning_rate": 1.804958035853886e-05, + "loss": 0.0, + "step": 54473 + }, + { + "epoch": 5.082952318745917, + "grad_norm": NaN, + "learning_rate": 1.8045983299568746e-05, + "loss": 0.0, + "step": 54474 + }, + { + "epoch": 5.083045628440795, + "grad_norm": NaN, + "learning_rate": 1.8042386576118638e-05, + "loss": 0.0, + "step": 54475 + }, + { + "epoch": 5.083138938135672, + "grad_norm": NaN, + "learning_rate": 1.803879018819767e-05, + "loss": 0.0, + "step": 54476 + }, + { + "epoch": 5.08323224783055, + "grad_norm": NaN, + "learning_rate": 1.8035194135814933e-05, + "loss": 0.0, + "step": 54477 + }, + { + "epoch": 5.083325557525427, + "grad_norm": NaN, + "learning_rate": 1.803159841897962e-05, + "loss": 0.0, + "step": 54478 + }, + { + "epoch": 5.0834188672203044, + "grad_norm": NaN, + "learning_rate": 1.802800303770089e-05, + "loss": 0.0, + "step": 54479 + }, + { + "epoch": 5.083512176915182, + "grad_norm": NaN, + "learning_rate": 1.80244079919878e-05, + "loss": 0.0, + "step": 54480 + }, + { + "epoch": 5.083605486610058, + "grad_norm": NaN, + "learning_rate": 1.8020813281849577e-05, + "loss": 0.0, + "step": 54481 + }, + { + "epoch": 5.083698796304936, + "grad_norm": NaN, + "learning_rate": 1.8017218907295367e-05, + "loss": 0.0, + "step": 54482 + }, + { + "epoch": 5.083792105999813, + "grad_norm": NaN, + "learning_rate": 1.801362486833421e-05, + "loss": 0.0, + "step": 54483 + }, + { + "epoch": 5.083885415694691, + "grad_norm": NaN, + "learning_rate": 1.801003116497533e-05, + "loss": 0.0, + "step": 54484 + }, + { + "epoch": 5.083978725389568, + "grad_norm": NaN, + "learning_rate": 1.8006437797227873e-05, + "loss": 0.0, + "step": 54485 + }, + { + "epoch": 5.0840720350844455, + "grad_norm": NaN, + "learning_rate": 1.80028447651009e-05, + "loss": 0.0, + "step": 54486 + }, + { + "epoch": 5.084165344779323, + "grad_norm": NaN, + "learning_rate": 1.7999252068603598e-05, + "loss": 0.0, + "step": 54487 + }, + { + "epoch": 5.0842586544741994, + "grad_norm": NaN, + "learning_rate": 1.7995659707745148e-05, + "loss": 0.0, + "step": 54488 + }, + { + "epoch": 5.084351964169077, + "grad_norm": NaN, + "learning_rate": 1.799206768253457e-05, + "loss": 0.0, + "step": 54489 + }, + { + "epoch": 5.084445273863954, + "grad_norm": NaN, + "learning_rate": 1.7988475992981082e-05, + "loss": 0.0, + "step": 54490 + }, + { + "epoch": 5.084538583558832, + "grad_norm": NaN, + "learning_rate": 1.798488463909379e-05, + "loss": 0.0, + "step": 54491 + }, + { + "epoch": 5.084631893253709, + "grad_norm": NaN, + "learning_rate": 1.798129362088185e-05, + "loss": 0.0, + "step": 54492 + }, + { + "epoch": 5.0847252029485865, + "grad_norm": NaN, + "learning_rate": 1.7977702938354356e-05, + "loss": 0.0, + "step": 54493 + }, + { + "epoch": 5.084818512643464, + "grad_norm": NaN, + "learning_rate": 1.797411259152047e-05, + "loss": 0.0, + "step": 54494 + }, + { + "epoch": 5.084911822338341, + "grad_norm": NaN, + "learning_rate": 1.7970522580389297e-05, + "loss": 0.0, + "step": 54495 + }, + { + "epoch": 5.085005132033218, + "grad_norm": NaN, + "learning_rate": 1.796693290497e-05, + "loss": 0.0, + "step": 54496 + }, + { + "epoch": 5.085098441728095, + "grad_norm": NaN, + "learning_rate": 1.796334356527162e-05, + "loss": 0.0, + "step": 54497 + }, + { + "epoch": 5.085191751422973, + "grad_norm": NaN, + "learning_rate": 1.7959754561303385e-05, + "loss": 0.0, + "step": 54498 + }, + { + "epoch": 5.08528506111785, + "grad_norm": NaN, + "learning_rate": 1.795616589307442e-05, + "loss": 0.0, + "step": 54499 + }, + { + "epoch": 5.085378370812728, + "grad_norm": NaN, + "learning_rate": 1.7952577560593735e-05, + "loss": 0.0, + "step": 54500 + }, + { + "epoch": 5.085471680507605, + "grad_norm": NaN, + "learning_rate": 1.794898956387057e-05, + "loss": 0.0, + "step": 54501 + }, + { + "epoch": 5.085564990202482, + "grad_norm": NaN, + "learning_rate": 1.7945401902914037e-05, + "loss": 0.0, + "step": 54502 + }, + { + "epoch": 5.085658299897359, + "grad_norm": NaN, + "learning_rate": 1.794181457773316e-05, + "loss": 0.0, + "step": 54503 + }, + { + "epoch": 5.085751609592236, + "grad_norm": NaN, + "learning_rate": 1.7938227588337167e-05, + "loss": 0.0, + "step": 54504 + }, + { + "epoch": 5.085844919287114, + "grad_norm": NaN, + "learning_rate": 1.7934640934735168e-05, + "loss": 0.0, + "step": 54505 + }, + { + "epoch": 5.085938228981991, + "grad_norm": NaN, + "learning_rate": 1.7931054616936202e-05, + "loss": 0.0, + "step": 54506 + }, + { + "epoch": 5.086031538676869, + "grad_norm": NaN, + "learning_rate": 1.7927468634949467e-05, + "loss": 0.0, + "step": 54507 + }, + { + "epoch": 5.086124848371746, + "grad_norm": NaN, + "learning_rate": 1.7923882988784084e-05, + "loss": 0.0, + "step": 54508 + }, + { + "epoch": 5.0862181580666235, + "grad_norm": NaN, + "learning_rate": 1.792029767844908e-05, + "loss": 0.0, + "step": 54509 + }, + { + "epoch": 5.0863114677615, + "grad_norm": NaN, + "learning_rate": 1.7916712703953684e-05, + "loss": 0.0, + "step": 54510 + }, + { + "epoch": 5.086404777456377, + "grad_norm": NaN, + "learning_rate": 1.7913128065306968e-05, + "loss": 0.0, + "step": 54511 + }, + { + "epoch": 5.086498087151255, + "grad_norm": NaN, + "learning_rate": 1.7909543762517998e-05, + "loss": 0.0, + "step": 54512 + }, + { + "epoch": 5.086591396846132, + "grad_norm": NaN, + "learning_rate": 1.790595979559596e-05, + "loss": 0.0, + "step": 54513 + }, + { + "epoch": 5.08668470654101, + "grad_norm": NaN, + "learning_rate": 1.7902376164549964e-05, + "loss": 0.0, + "step": 54514 + }, + { + "epoch": 5.086778016235887, + "grad_norm": NaN, + "learning_rate": 1.7898792869389024e-05, + "loss": 0.0, + "step": 54515 + }, + { + "epoch": 5.0868713259307645, + "grad_norm": NaN, + "learning_rate": 1.789520991012236e-05, + "loss": 0.0, + "step": 54516 + }, + { + "epoch": 5.086964635625642, + "grad_norm": NaN, + "learning_rate": 1.7891627286759068e-05, + "loss": 0.0, + "step": 54517 + }, + { + "epoch": 5.0870579453205185, + "grad_norm": NaN, + "learning_rate": 1.7888044999308193e-05, + "loss": 0.0, + "step": 54518 + }, + { + "epoch": 5.087151255015396, + "grad_norm": NaN, + "learning_rate": 1.7884463047778892e-05, + "loss": 0.0, + "step": 54519 + }, + { + "epoch": 5.087244564710273, + "grad_norm": NaN, + "learning_rate": 1.788088143218032e-05, + "loss": 0.0, + "step": 54520 + }, + { + "epoch": 5.087337874405151, + "grad_norm": NaN, + "learning_rate": 1.7877300152521444e-05, + "loss": 0.0, + "step": 54521 + }, + { + "epoch": 5.087431184100028, + "grad_norm": NaN, + "learning_rate": 1.7873719208811504e-05, + "loss": 0.0, + "step": 54522 + }, + { + "epoch": 5.087524493794906, + "grad_norm": NaN, + "learning_rate": 1.787013860105959e-05, + "loss": 0.0, + "step": 54523 + }, + { + "epoch": 5.087617803489783, + "grad_norm": NaN, + "learning_rate": 1.7866558329274698e-05, + "loss": 0.0, + "step": 54524 + }, + { + "epoch": 5.0877111131846595, + "grad_norm": NaN, + "learning_rate": 1.7862978393466053e-05, + "loss": 0.0, + "step": 54525 + }, + { + "epoch": 5.087804422879537, + "grad_norm": NaN, + "learning_rate": 1.785939879364273e-05, + "loss": 0.0, + "step": 54526 + }, + { + "epoch": 5.087897732574414, + "grad_norm": NaN, + "learning_rate": 1.785581952981376e-05, + "loss": 0.0, + "step": 54527 + }, + { + "epoch": 5.087991042269292, + "grad_norm": NaN, + "learning_rate": 1.7852240601988326e-05, + "loss": 0.0, + "step": 54528 + }, + { + "epoch": 5.088084351964169, + "grad_norm": NaN, + "learning_rate": 1.7848662010175495e-05, + "loss": 0.0, + "step": 54529 + }, + { + "epoch": 5.088177661659047, + "grad_norm": NaN, + "learning_rate": 1.7845083754384375e-05, + "loss": 0.0, + "step": 54530 + }, + { + "epoch": 5.088270971353924, + "grad_norm": NaN, + "learning_rate": 1.784150583462406e-05, + "loss": 0.0, + "step": 54531 + }, + { + "epoch": 5.088364281048801, + "grad_norm": NaN, + "learning_rate": 1.7837928250903638e-05, + "loss": 0.0, + "step": 54532 + }, + { + "epoch": 5.088457590743678, + "grad_norm": NaN, + "learning_rate": 1.783435100323224e-05, + "loss": 0.0, + "step": 54533 + }, + { + "epoch": 5.088550900438555, + "grad_norm": NaN, + "learning_rate": 1.7830774091618903e-05, + "loss": 0.0, + "step": 54534 + }, + { + "epoch": 5.088644210133433, + "grad_norm": NaN, + "learning_rate": 1.7827197516072776e-05, + "loss": 0.0, + "step": 54535 + }, + { + "epoch": 5.08873751982831, + "grad_norm": NaN, + "learning_rate": 1.7823621276602933e-05, + "loss": 0.0, + "step": 54536 + }, + { + "epoch": 5.088830829523188, + "grad_norm": NaN, + "learning_rate": 1.7820045373218467e-05, + "loss": 0.0, + "step": 54537 + }, + { + "epoch": 5.088924139218065, + "grad_norm": NaN, + "learning_rate": 1.7816469805928468e-05, + "loss": 0.0, + "step": 54538 + }, + { + "epoch": 5.0890174489129425, + "grad_norm": NaN, + "learning_rate": 1.781289457474203e-05, + "loss": 0.0, + "step": 54539 + }, + { + "epoch": 5.089110758607819, + "grad_norm": NaN, + "learning_rate": 1.780931967966828e-05, + "loss": 0.0, + "step": 54540 + }, + { + "epoch": 5.0892040683026964, + "grad_norm": NaN, + "learning_rate": 1.7805745120716213e-05, + "loss": 0.0, + "step": 54541 + }, + { + "epoch": 5.089297377997574, + "grad_norm": NaN, + "learning_rate": 1.7802170897895e-05, + "loss": 0.0, + "step": 54542 + }, + { + "epoch": 5.089390687692451, + "grad_norm": NaN, + "learning_rate": 1.7798597011213738e-05, + "loss": 0.0, + "step": 54543 + }, + { + "epoch": 5.089483997387329, + "grad_norm": NaN, + "learning_rate": 1.7795023460681418e-05, + "loss": 0.0, + "step": 54544 + }, + { + "epoch": 5.089577307082206, + "grad_norm": NaN, + "learning_rate": 1.779145024630723e-05, + "loss": 0.0, + "step": 54545 + }, + { + "epoch": 5.0896706167770835, + "grad_norm": NaN, + "learning_rate": 1.7787877368100258e-05, + "loss": 0.0, + "step": 54546 + }, + { + "epoch": 5.08976392647196, + "grad_norm": NaN, + "learning_rate": 1.7784304826069467e-05, + "loss": 0.0, + "step": 54547 + }, + { + "epoch": 5.0898572361668375, + "grad_norm": NaN, + "learning_rate": 1.7780732620224075e-05, + "loss": 0.0, + "step": 54548 + }, + { + "epoch": 5.089950545861715, + "grad_norm": NaN, + "learning_rate": 1.7777160750573137e-05, + "loss": 0.0, + "step": 54549 + }, + { + "epoch": 5.090043855556592, + "grad_norm": NaN, + "learning_rate": 1.777358921712565e-05, + "loss": 0.0, + "step": 54550 + }, + { + "epoch": 5.09013716525147, + "grad_norm": NaN, + "learning_rate": 1.7770018019890787e-05, + "loss": 0.0, + "step": 54551 + }, + { + "epoch": 5.090230474946347, + "grad_norm": NaN, + "learning_rate": 1.7766447158877623e-05, + "loss": 0.0, + "step": 54552 + }, + { + "epoch": 5.090323784641225, + "grad_norm": NaN, + "learning_rate": 1.7762876634095153e-05, + "loss": 0.0, + "step": 54553 + }, + { + "epoch": 5.090417094336101, + "grad_norm": NaN, + "learning_rate": 1.775930644555257e-05, + "loss": 0.0, + "step": 54554 + }, + { + "epoch": 5.0905104040309785, + "grad_norm": NaN, + "learning_rate": 1.7755736593258897e-05, + "loss": 0.0, + "step": 54555 + }, + { + "epoch": 5.090603713725856, + "grad_norm": NaN, + "learning_rate": 1.775216707722318e-05, + "loss": 0.0, + "step": 54556 + }, + { + "epoch": 5.090697023420733, + "grad_norm": NaN, + "learning_rate": 1.774859789745454e-05, + "loss": 0.0, + "step": 54557 + }, + { + "epoch": 5.090790333115611, + "grad_norm": NaN, + "learning_rate": 1.7745029053962063e-05, + "loss": 0.0, + "step": 54558 + }, + { + "epoch": 5.090883642810488, + "grad_norm": NaN, + "learning_rate": 1.774146054675475e-05, + "loss": 0.0, + "step": 54559 + }, + { + "epoch": 5.090976952505366, + "grad_norm": NaN, + "learning_rate": 1.7737892375841763e-05, + "loss": 0.0, + "step": 54560 + }, + { + "epoch": 5.091070262200242, + "grad_norm": NaN, + "learning_rate": 1.773432454123216e-05, + "loss": 0.0, + "step": 54561 + }, + { + "epoch": 5.09116357189512, + "grad_norm": NaN, + "learning_rate": 1.7730757042934933e-05, + "loss": 0.0, + "step": 54562 + }, + { + "epoch": 5.091256881589997, + "grad_norm": NaN, + "learning_rate": 1.772718988095923e-05, + "loss": 0.0, + "step": 54563 + }, + { + "epoch": 5.091350191284874, + "grad_norm": NaN, + "learning_rate": 1.772362305531414e-05, + "loss": 0.0, + "step": 54564 + }, + { + "epoch": 5.091443500979752, + "grad_norm": NaN, + "learning_rate": 1.772005656600862e-05, + "loss": 0.0, + "step": 54565 + }, + { + "epoch": 5.091536810674629, + "grad_norm": NaN, + "learning_rate": 1.7716490413051854e-05, + "loss": 0.0, + "step": 54566 + }, + { + "epoch": 5.091630120369507, + "grad_norm": NaN, + "learning_rate": 1.7712924596452876e-05, + "loss": 0.0, + "step": 54567 + }, + { + "epoch": 5.091723430064384, + "grad_norm": NaN, + "learning_rate": 1.7709359116220735e-05, + "loss": 0.0, + "step": 54568 + }, + { + "epoch": 5.091816739759261, + "grad_norm": NaN, + "learning_rate": 1.7705793972364503e-05, + "loss": 0.0, + "step": 54569 + }, + { + "epoch": 5.091910049454138, + "grad_norm": NaN, + "learning_rate": 1.7702229164893256e-05, + "loss": 0.0, + "step": 54570 + }, + { + "epoch": 5.0920033591490155, + "grad_norm": NaN, + "learning_rate": 1.769866469381604e-05, + "loss": 0.0, + "step": 54571 + }, + { + "epoch": 5.092096668843893, + "grad_norm": NaN, + "learning_rate": 1.7695100559141928e-05, + "loss": 0.0, + "step": 54572 + }, + { + "epoch": 5.09218997853877, + "grad_norm": NaN, + "learning_rate": 1.769153676088e-05, + "loss": 0.0, + "step": 54573 + }, + { + "epoch": 5.092283288233648, + "grad_norm": NaN, + "learning_rate": 1.7687973299039294e-05, + "loss": 0.0, + "step": 54574 + }, + { + "epoch": 5.092376597928525, + "grad_norm": NaN, + "learning_rate": 1.7684410173628873e-05, + "loss": 0.0, + "step": 54575 + }, + { + "epoch": 5.092469907623402, + "grad_norm": NaN, + "learning_rate": 1.7680847384657808e-05, + "loss": 0.0, + "step": 54576 + }, + { + "epoch": 5.092563217318279, + "grad_norm": NaN, + "learning_rate": 1.767728493213515e-05, + "loss": 0.0, + "step": 54577 + }, + { + "epoch": 5.0926565270131565, + "grad_norm": NaN, + "learning_rate": 1.7673722816069953e-05, + "loss": 0.0, + "step": 54578 + }, + { + "epoch": 5.092749836708034, + "grad_norm": NaN, + "learning_rate": 1.7670161036471293e-05, + "loss": 0.0, + "step": 54579 + }, + { + "epoch": 5.092843146402911, + "grad_norm": NaN, + "learning_rate": 1.7666599593348198e-05, + "loss": 0.0, + "step": 54580 + }, + { + "epoch": 5.092936456097789, + "grad_norm": NaN, + "learning_rate": 1.7663038486709757e-05, + "loss": 0.0, + "step": 54581 + }, + { + "epoch": 5.093029765792666, + "grad_norm": NaN, + "learning_rate": 1.7659477716565e-05, + "loss": 0.0, + "step": 54582 + }, + { + "epoch": 5.093123075487543, + "grad_norm": NaN, + "learning_rate": 1.765591728292299e-05, + "loss": 0.0, + "step": 54583 + }, + { + "epoch": 5.09321638518242, + "grad_norm": NaN, + "learning_rate": 1.7652357185792777e-05, + "loss": 0.0, + "step": 54584 + }, + { + "epoch": 5.093309694877298, + "grad_norm": NaN, + "learning_rate": 1.7648797425183426e-05, + "loss": 0.0, + "step": 54585 + }, + { + "epoch": 5.093403004572175, + "grad_norm": NaN, + "learning_rate": 1.764523800110396e-05, + "loss": 0.0, + "step": 54586 + }, + { + "epoch": 5.093496314267052, + "grad_norm": NaN, + "learning_rate": 1.7641678913563496e-05, + "loss": 0.0, + "step": 54587 + }, + { + "epoch": 5.09358962396193, + "grad_norm": NaN, + "learning_rate": 1.7638120162570967e-05, + "loss": 0.0, + "step": 54588 + }, + { + "epoch": 5.093682933656807, + "grad_norm": NaN, + "learning_rate": 1.7634561748135524e-05, + "loss": 0.0, + "step": 54589 + }, + { + "epoch": 5.093776243351685, + "grad_norm": NaN, + "learning_rate": 1.7631003670266203e-05, + "loss": 0.0, + "step": 54590 + }, + { + "epoch": 5.093869553046561, + "grad_norm": NaN, + "learning_rate": 1.7627445928971967e-05, + "loss": 0.0, + "step": 54591 + }, + { + "epoch": 5.093962862741439, + "grad_norm": NaN, + "learning_rate": 1.762388852426196e-05, + "loss": 0.0, + "step": 54592 + }, + { + "epoch": 5.094056172436316, + "grad_norm": NaN, + "learning_rate": 1.7620331456145236e-05, + "loss": 0.0, + "step": 54593 + }, + { + "epoch": 5.0941494821311935, + "grad_norm": NaN, + "learning_rate": 1.7616774724630712e-05, + "loss": 0.0, + "step": 54594 + }, + { + "epoch": 5.094242791826071, + "grad_norm": NaN, + "learning_rate": 1.7613218329727556e-05, + "loss": 0.0, + "step": 54595 + }, + { + "epoch": 5.094336101520948, + "grad_norm": NaN, + "learning_rate": 1.76096622714448e-05, + "loss": 0.0, + "step": 54596 + }, + { + "epoch": 5.094429411215826, + "grad_norm": NaN, + "learning_rate": 1.7606106549791387e-05, + "loss": 0.0, + "step": 54597 + }, + { + "epoch": 5.094522720910702, + "grad_norm": NaN, + "learning_rate": 1.7602551164776473e-05, + "loss": 0.0, + "step": 54598 + }, + { + "epoch": 5.09461603060558, + "grad_norm": NaN, + "learning_rate": 1.759899611640907e-05, + "loss": 0.0, + "step": 54599 + }, + { + "epoch": 5.094709340300457, + "grad_norm": NaN, + "learning_rate": 1.7595441404698152e-05, + "loss": 0.0, + "step": 54600 + }, + { + "epoch": 5.0948026499953345, + "grad_norm": NaN, + "learning_rate": 1.7591887029652812e-05, + "loss": 0.0, + "step": 54601 + }, + { + "epoch": 5.094895959690212, + "grad_norm": NaN, + "learning_rate": 1.7588332991282096e-05, + "loss": 0.0, + "step": 54602 + }, + { + "epoch": 5.094989269385089, + "grad_norm": NaN, + "learning_rate": 1.7584779289595026e-05, + "loss": 0.0, + "step": 54603 + }, + { + "epoch": 5.095082579079967, + "grad_norm": NaN, + "learning_rate": 1.7581225924600645e-05, + "loss": 0.0, + "step": 54604 + }, + { + "epoch": 5.095175888774843, + "grad_norm": NaN, + "learning_rate": 1.757767289630798e-05, + "loss": 0.0, + "step": 54605 + }, + { + "epoch": 5.095269198469721, + "grad_norm": NaN, + "learning_rate": 1.757412020472606e-05, + "loss": 0.0, + "step": 54606 + }, + { + "epoch": 5.095362508164598, + "grad_norm": NaN, + "learning_rate": 1.7570567849863925e-05, + "loss": 0.0, + "step": 54607 + }, + { + "epoch": 5.0954558178594755, + "grad_norm": NaN, + "learning_rate": 1.7567015831730603e-05, + "loss": 0.0, + "step": 54608 + }, + { + "epoch": 5.095549127554353, + "grad_norm": NaN, + "learning_rate": 1.756346415033513e-05, + "loss": 0.0, + "step": 54609 + }, + { + "epoch": 5.09564243724923, + "grad_norm": NaN, + "learning_rate": 1.7559912805686542e-05, + "loss": 0.0, + "step": 54610 + }, + { + "epoch": 5.095735746944108, + "grad_norm": NaN, + "learning_rate": 1.755636179779386e-05, + "loss": 0.0, + "step": 54611 + }, + { + "epoch": 5.095829056638985, + "grad_norm": NaN, + "learning_rate": 1.7552811126666126e-05, + "loss": 0.0, + "step": 54612 + }, + { + "epoch": 5.095922366333862, + "grad_norm": NaN, + "learning_rate": 1.7549260792312354e-05, + "loss": 0.0, + "step": 54613 + }, + { + "epoch": 5.096015676028739, + "grad_norm": NaN, + "learning_rate": 1.754571079474158e-05, + "loss": 0.0, + "step": 54614 + }, + { + "epoch": 5.096108985723617, + "grad_norm": NaN, + "learning_rate": 1.7542161133962834e-05, + "loss": 0.0, + "step": 54615 + }, + { + "epoch": 5.096202295418494, + "grad_norm": NaN, + "learning_rate": 1.7538611809985127e-05, + "loss": 0.0, + "step": 54616 + }, + { + "epoch": 5.096295605113371, + "grad_norm": NaN, + "learning_rate": 1.7535062822817482e-05, + "loss": 0.0, + "step": 54617 + }, + { + "epoch": 5.096388914808249, + "grad_norm": NaN, + "learning_rate": 1.753151417246894e-05, + "loss": 0.0, + "step": 54618 + }, + { + "epoch": 5.096482224503126, + "grad_norm": NaN, + "learning_rate": 1.7527965858948532e-05, + "loss": 0.0, + "step": 54619 + }, + { + "epoch": 5.096575534198003, + "grad_norm": NaN, + "learning_rate": 1.752441788226525e-05, + "loss": 0.0, + "step": 54620 + }, + { + "epoch": 5.09666884389288, + "grad_norm": NaN, + "learning_rate": 1.7520870242428136e-05, + "loss": 0.0, + "step": 54621 + }, + { + "epoch": 5.096762153587758, + "grad_norm": NaN, + "learning_rate": 1.75173229394462e-05, + "loss": 0.0, + "step": 54622 + }, + { + "epoch": 5.096855463282635, + "grad_norm": NaN, + "learning_rate": 1.751377597332848e-05, + "loss": 0.0, + "step": 54623 + }, + { + "epoch": 5.0969487729775125, + "grad_norm": NaN, + "learning_rate": 1.7510229344083977e-05, + "loss": 0.0, + "step": 54624 + }, + { + "epoch": 5.09704208267239, + "grad_norm": NaN, + "learning_rate": 1.750668305172171e-05, + "loss": 0.0, + "step": 54625 + }, + { + "epoch": 5.097135392367267, + "grad_norm": NaN, + "learning_rate": 1.7503137096250708e-05, + "loss": 0.0, + "step": 54626 + }, + { + "epoch": 5.097228702062144, + "grad_norm": NaN, + "learning_rate": 1.7499591477679965e-05, + "loss": 0.0, + "step": 54627 + }, + { + "epoch": 5.097322011757021, + "grad_norm": NaN, + "learning_rate": 1.749604619601852e-05, + "loss": 0.0, + "step": 54628 + }, + { + "epoch": 5.097415321451899, + "grad_norm": NaN, + "learning_rate": 1.749250125127539e-05, + "loss": 0.0, + "step": 54629 + }, + { + "epoch": 5.097508631146776, + "grad_norm": NaN, + "learning_rate": 1.7488956643459574e-05, + "loss": 0.0, + "step": 54630 + }, + { + "epoch": 5.0976019408416535, + "grad_norm": NaN, + "learning_rate": 1.7485412372580103e-05, + "loss": 0.0, + "step": 54631 + }, + { + "epoch": 5.097695250536531, + "grad_norm": NaN, + "learning_rate": 1.7481868438645924e-05, + "loss": 0.0, + "step": 54632 + }, + { + "epoch": 5.097788560231408, + "grad_norm": NaN, + "learning_rate": 1.7478324841666123e-05, + "loss": 0.0, + "step": 54633 + }, + { + "epoch": 5.097881869926285, + "grad_norm": NaN, + "learning_rate": 1.7474781581649734e-05, + "loss": 0.0, + "step": 54634 + }, + { + "epoch": 5.097975179621162, + "grad_norm": NaN, + "learning_rate": 1.747123865860564e-05, + "loss": 0.0, + "step": 54635 + }, + { + "epoch": 5.09806848931604, + "grad_norm": NaN, + "learning_rate": 1.7467696072542976e-05, + "loss": 0.0, + "step": 54636 + }, + { + "epoch": 5.098161799010917, + "grad_norm": NaN, + "learning_rate": 1.7464153823470713e-05, + "loss": 0.0, + "step": 54637 + }, + { + "epoch": 5.098255108705795, + "grad_norm": NaN, + "learning_rate": 1.7460611911397792e-05, + "loss": 0.0, + "step": 54638 + }, + { + "epoch": 5.098348418400672, + "grad_norm": NaN, + "learning_rate": 1.745707033633331e-05, + "loss": 0.0, + "step": 54639 + }, + { + "epoch": 5.098441728095549, + "grad_norm": NaN, + "learning_rate": 1.7453529098286228e-05, + "loss": 0.0, + "step": 54640 + }, + { + "epoch": 5.098535037790427, + "grad_norm": NaN, + "learning_rate": 1.744998819726557e-05, + "loss": 0.0, + "step": 54641 + }, + { + "epoch": 5.098628347485303, + "grad_norm": NaN, + "learning_rate": 1.7446447633280325e-05, + "loss": 0.0, + "step": 54642 + }, + { + "epoch": 5.098721657180181, + "grad_norm": NaN, + "learning_rate": 1.744290740633949e-05, + "loss": 0.0, + "step": 54643 + }, + { + "epoch": 5.098814966875058, + "grad_norm": NaN, + "learning_rate": 1.7439367516452076e-05, + "loss": 0.0, + "step": 54644 + }, + { + "epoch": 5.098908276569936, + "grad_norm": NaN, + "learning_rate": 1.7435827963627087e-05, + "loss": 0.0, + "step": 54645 + }, + { + "epoch": 5.099001586264813, + "grad_norm": NaN, + "learning_rate": 1.7432288747873517e-05, + "loss": 0.0, + "step": 54646 + }, + { + "epoch": 5.0990948959596905, + "grad_norm": NaN, + "learning_rate": 1.7428749869200364e-05, + "loss": 0.0, + "step": 54647 + }, + { + "epoch": 5.099188205654568, + "grad_norm": NaN, + "learning_rate": 1.7425211327616634e-05, + "loss": 0.0, + "step": 54648 + }, + { + "epoch": 5.099281515349444, + "grad_norm": NaN, + "learning_rate": 1.742167312313132e-05, + "loss": 0.0, + "step": 54649 + }, + { + "epoch": 5.099374825044322, + "grad_norm": NaN, + "learning_rate": 1.741813525575343e-05, + "loss": 0.0, + "step": 54650 + }, + { + "epoch": 5.099468134739199, + "grad_norm": NaN, + "learning_rate": 1.7414597725491925e-05, + "loss": 0.0, + "step": 54651 + }, + { + "epoch": 5.099561444434077, + "grad_norm": NaN, + "learning_rate": 1.7411060532355847e-05, + "loss": 0.0, + "step": 54652 + }, + { + "epoch": 5.099654754128954, + "grad_norm": NaN, + "learning_rate": 1.7407523676354157e-05, + "loss": 0.0, + "step": 54653 + }, + { + "epoch": 5.0997480638238315, + "grad_norm": NaN, + "learning_rate": 1.7403987157495845e-05, + "loss": 0.0, + "step": 54654 + }, + { + "epoch": 5.099841373518709, + "grad_norm": NaN, + "learning_rate": 1.7400450975789925e-05, + "loss": 0.0, + "step": 54655 + }, + { + "epoch": 5.099934683213586, + "grad_norm": NaN, + "learning_rate": 1.7396915131245387e-05, + "loss": 0.0, + "step": 54656 + }, + { + "epoch": 5.100027992908463, + "grad_norm": NaN, + "learning_rate": 1.7393379623871205e-05, + "loss": 0.0, + "step": 54657 + }, + { + "epoch": 5.10012130260334, + "grad_norm": NaN, + "learning_rate": 1.7389844453676377e-05, + "loss": 0.0, + "step": 54658 + }, + { + "epoch": 5.100214612298218, + "grad_norm": NaN, + "learning_rate": 1.738630962066991e-05, + "loss": 0.0, + "step": 54659 + }, + { + "epoch": 5.100307921993095, + "grad_norm": NaN, + "learning_rate": 1.738277512486074e-05, + "loss": 0.0, + "step": 54660 + }, + { + "epoch": 5.1004012316879725, + "grad_norm": NaN, + "learning_rate": 1.7379240966257924e-05, + "loss": 0.0, + "step": 54661 + }, + { + "epoch": 5.10049454138285, + "grad_norm": NaN, + "learning_rate": 1.7375707144870392e-05, + "loss": 0.0, + "step": 54662 + }, + { + "epoch": 5.100587851077727, + "grad_norm": NaN, + "learning_rate": 1.737217366070716e-05, + "loss": 0.0, + "step": 54663 + }, + { + "epoch": 5.100681160772604, + "grad_norm": NaN, + "learning_rate": 1.7368640513777203e-05, + "loss": 0.0, + "step": 54664 + }, + { + "epoch": 5.100774470467481, + "grad_norm": NaN, + "learning_rate": 1.7365107704089497e-05, + "loss": 0.0, + "step": 54665 + }, + { + "epoch": 5.100867780162359, + "grad_norm": NaN, + "learning_rate": 1.7361575231653037e-05, + "loss": 0.0, + "step": 54666 + }, + { + "epoch": 5.100961089857236, + "grad_norm": NaN, + "learning_rate": 1.735804309647679e-05, + "loss": 0.0, + "step": 54667 + }, + { + "epoch": 5.101054399552114, + "grad_norm": NaN, + "learning_rate": 1.735451129856976e-05, + "loss": 0.0, + "step": 54668 + }, + { + "epoch": 5.101147709246991, + "grad_norm": NaN, + "learning_rate": 1.735097983794092e-05, + "loss": 0.0, + "step": 54669 + }, + { + "epoch": 5.101241018941868, + "grad_norm": NaN, + "learning_rate": 1.7347448714599245e-05, + "loss": 0.0, + "step": 54670 + }, + { + "epoch": 5.101334328636745, + "grad_norm": NaN, + "learning_rate": 1.7343917928553707e-05, + "loss": 0.0, + "step": 54671 + }, + { + "epoch": 5.101427638331622, + "grad_norm": NaN, + "learning_rate": 1.734038747981329e-05, + "loss": 0.0, + "step": 54672 + }, + { + "epoch": 5.1015209480265, + "grad_norm": NaN, + "learning_rate": 1.7336857368386964e-05, + "loss": 0.0, + "step": 54673 + }, + { + "epoch": 5.101614257721377, + "grad_norm": NaN, + "learning_rate": 1.7333327594283726e-05, + "loss": 0.0, + "step": 54674 + }, + { + "epoch": 5.101707567416255, + "grad_norm": NaN, + "learning_rate": 1.7329798157512566e-05, + "loss": 0.0, + "step": 54675 + }, + { + "epoch": 5.101800877111132, + "grad_norm": NaN, + "learning_rate": 1.732626905808236e-05, + "loss": 0.0, + "step": 54676 + }, + { + "epoch": 5.1018941868060095, + "grad_norm": NaN, + "learning_rate": 1.7322740296002184e-05, + "loss": 0.0, + "step": 54677 + }, + { + "epoch": 5.101987496500886, + "grad_norm": NaN, + "learning_rate": 1.7319211871280968e-05, + "loss": 0.0, + "step": 54678 + }, + { + "epoch": 5.102080806195763, + "grad_norm": NaN, + "learning_rate": 1.731568378392772e-05, + "loss": 0.0, + "step": 54679 + }, + { + "epoch": 5.102174115890641, + "grad_norm": NaN, + "learning_rate": 1.7312156033951364e-05, + "loss": 0.0, + "step": 54680 + }, + { + "epoch": 5.102267425585518, + "grad_norm": NaN, + "learning_rate": 1.7308628621360892e-05, + "loss": 0.0, + "step": 54681 + }, + { + "epoch": 5.102360735280396, + "grad_norm": NaN, + "learning_rate": 1.7305101546165266e-05, + "loss": 0.0, + "step": 54682 + }, + { + "epoch": 5.102454044975273, + "grad_norm": NaN, + "learning_rate": 1.7301574808373465e-05, + "loss": 0.0, + "step": 54683 + }, + { + "epoch": 5.1025473546701505, + "grad_norm": NaN, + "learning_rate": 1.7298048407994457e-05, + "loss": 0.0, + "step": 54684 + }, + { + "epoch": 5.102640664365028, + "grad_norm": NaN, + "learning_rate": 1.729452234503721e-05, + "loss": 0.0, + "step": 54685 + }, + { + "epoch": 5.1027339740599045, + "grad_norm": NaN, + "learning_rate": 1.729099661951066e-05, + "loss": 0.0, + "step": 54686 + }, + { + "epoch": 5.102827283754782, + "grad_norm": NaN, + "learning_rate": 1.7287471231423805e-05, + "loss": 0.0, + "step": 54687 + }, + { + "epoch": 5.102920593449659, + "grad_norm": NaN, + "learning_rate": 1.72839461807856e-05, + "loss": 0.0, + "step": 54688 + }, + { + "epoch": 5.103013903144537, + "grad_norm": NaN, + "learning_rate": 1.7280421467605012e-05, + "loss": 0.0, + "step": 54689 + }, + { + "epoch": 5.103107212839414, + "grad_norm": NaN, + "learning_rate": 1.7276897091890988e-05, + "loss": 0.0, + "step": 54690 + }, + { + "epoch": 5.103200522534292, + "grad_norm": NaN, + "learning_rate": 1.72733730536525e-05, + "loss": 0.0, + "step": 54691 + }, + { + "epoch": 5.103293832229169, + "grad_norm": NaN, + "learning_rate": 1.726984935289852e-05, + "loss": 0.0, + "step": 54692 + }, + { + "epoch": 5.1033871419240455, + "grad_norm": NaN, + "learning_rate": 1.726632598963797e-05, + "loss": 0.0, + "step": 54693 + }, + { + "epoch": 5.103480451618923, + "grad_norm": NaN, + "learning_rate": 1.7262802963879864e-05, + "loss": 0.0, + "step": 54694 + }, + { + "epoch": 5.1035737613138, + "grad_norm": NaN, + "learning_rate": 1.7259280275633108e-05, + "loss": 0.0, + "step": 54695 + }, + { + "epoch": 5.103667071008678, + "grad_norm": NaN, + "learning_rate": 1.7255757924906684e-05, + "loss": 0.0, + "step": 54696 + }, + { + "epoch": 5.103760380703555, + "grad_norm": NaN, + "learning_rate": 1.725223591170956e-05, + "loss": 0.0, + "step": 54697 + }, + { + "epoch": 5.103853690398433, + "grad_norm": NaN, + "learning_rate": 1.724871423605067e-05, + "loss": 0.0, + "step": 54698 + }, + { + "epoch": 5.10394700009331, + "grad_norm": NaN, + "learning_rate": 1.724519289793897e-05, + "loss": 0.0, + "step": 54699 + }, + { + "epoch": 5.104040309788187, + "grad_norm": NaN, + "learning_rate": 1.724167189738342e-05, + "loss": 0.0, + "step": 54700 + }, + { + "epoch": 5.104133619483064, + "grad_norm": NaN, + "learning_rate": 1.7238151234392976e-05, + "loss": 0.0, + "step": 54701 + }, + { + "epoch": 5.104226929177941, + "grad_norm": NaN, + "learning_rate": 1.7234630908976587e-05, + "loss": 0.0, + "step": 54702 + }, + { + "epoch": 5.104320238872819, + "grad_norm": NaN, + "learning_rate": 1.723111092114321e-05, + "loss": 0.0, + "step": 54703 + }, + { + "epoch": 5.104413548567696, + "grad_norm": NaN, + "learning_rate": 1.7227591270901787e-05, + "loss": 0.0, + "step": 54704 + }, + { + "epoch": 5.104506858262574, + "grad_norm": NaN, + "learning_rate": 1.722407195826126e-05, + "loss": 0.0, + "step": 54705 + }, + { + "epoch": 5.104600167957451, + "grad_norm": NaN, + "learning_rate": 1.722055298323059e-05, + "loss": 0.0, + "step": 54706 + }, + { + "epoch": 5.1046934776523285, + "grad_norm": NaN, + "learning_rate": 1.721703434581872e-05, + "loss": 0.0, + "step": 54707 + }, + { + "epoch": 5.104786787347205, + "grad_norm": NaN, + "learning_rate": 1.721351604603461e-05, + "loss": 0.0, + "step": 54708 + }, + { + "epoch": 5.1048800970420825, + "grad_norm": NaN, + "learning_rate": 1.7209998083887183e-05, + "loss": 0.0, + "step": 54709 + }, + { + "epoch": 5.10497340673696, + "grad_norm": NaN, + "learning_rate": 1.7206480459385402e-05, + "loss": 0.0, + "step": 54710 + }, + { + "epoch": 5.105066716431837, + "grad_norm": NaN, + "learning_rate": 1.720296317253821e-05, + "loss": 0.0, + "step": 54711 + }, + { + "epoch": 5.105160026126715, + "grad_norm": NaN, + "learning_rate": 1.719944622335453e-05, + "loss": 0.0, + "step": 54712 + }, + { + "epoch": 5.105253335821592, + "grad_norm": NaN, + "learning_rate": 1.7195929611843325e-05, + "loss": 0.0, + "step": 54713 + }, + { + "epoch": 5.1053466455164696, + "grad_norm": NaN, + "learning_rate": 1.7192413338013533e-05, + "loss": 0.0, + "step": 54714 + }, + { + "epoch": 5.105439955211346, + "grad_norm": NaN, + "learning_rate": 1.7188897401874103e-05, + "loss": 0.0, + "step": 54715 + }, + { + "epoch": 5.1055332649062235, + "grad_norm": NaN, + "learning_rate": 1.7185381803433957e-05, + "loss": 0.0, + "step": 54716 + }, + { + "epoch": 5.105626574601101, + "grad_norm": NaN, + "learning_rate": 1.718186654270204e-05, + "loss": 0.0, + "step": 54717 + }, + { + "epoch": 5.105719884295978, + "grad_norm": NaN, + "learning_rate": 1.7178351619687292e-05, + "loss": 0.0, + "step": 54718 + }, + { + "epoch": 5.105813193990856, + "grad_norm": NaN, + "learning_rate": 1.7174837034398643e-05, + "loss": 0.0, + "step": 54719 + }, + { + "epoch": 5.105906503685733, + "grad_norm": NaN, + "learning_rate": 1.717132278684505e-05, + "loss": 0.0, + "step": 54720 + }, + { + "epoch": 5.105999813380611, + "grad_norm": NaN, + "learning_rate": 1.7167808877035437e-05, + "loss": 0.0, + "step": 54721 + }, + { + "epoch": 5.106093123075487, + "grad_norm": NaN, + "learning_rate": 1.7164295304978736e-05, + "loss": 0.0, + "step": 54722 + }, + { + "epoch": 5.1061864327703645, + "grad_norm": NaN, + "learning_rate": 1.7160782070683887e-05, + "loss": 0.0, + "step": 54723 + }, + { + "epoch": 5.106279742465242, + "grad_norm": NaN, + "learning_rate": 1.7157269174159814e-05, + "loss": 0.0, + "step": 54724 + }, + { + "epoch": 5.106373052160119, + "grad_norm": NaN, + "learning_rate": 1.715375661541545e-05, + "loss": 0.0, + "step": 54725 + }, + { + "epoch": 5.106466361854997, + "grad_norm": NaN, + "learning_rate": 1.7150244394459745e-05, + "loss": 0.0, + "step": 54726 + }, + { + "epoch": 5.106559671549874, + "grad_norm": NaN, + "learning_rate": 1.7146732511301597e-05, + "loss": 0.0, + "step": 54727 + }, + { + "epoch": 5.106652981244752, + "grad_norm": NaN, + "learning_rate": 1.7143220965949966e-05, + "loss": 0.0, + "step": 54728 + }, + { + "epoch": 5.106746290939629, + "grad_norm": NaN, + "learning_rate": 1.7139709758413778e-05, + "loss": 0.0, + "step": 54729 + }, + { + "epoch": 5.106839600634506, + "grad_norm": NaN, + "learning_rate": 1.7136198888701942e-05, + "loss": 0.0, + "step": 54730 + }, + { + "epoch": 5.106932910329383, + "grad_norm": NaN, + "learning_rate": 1.71326883568234e-05, + "loss": 0.0, + "step": 54731 + }, + { + "epoch": 5.10702622002426, + "grad_norm": NaN, + "learning_rate": 1.712917816278706e-05, + "loss": 0.0, + "step": 54732 + }, + { + "epoch": 5.107119529719138, + "grad_norm": NaN, + "learning_rate": 1.7125668306601868e-05, + "loss": 0.0, + "step": 54733 + }, + { + "epoch": 5.107212839414015, + "grad_norm": NaN, + "learning_rate": 1.7122158788276737e-05, + "loss": 0.0, + "step": 54734 + }, + { + "epoch": 5.107306149108893, + "grad_norm": NaN, + "learning_rate": 1.71186496078206e-05, + "loss": 0.0, + "step": 54735 + }, + { + "epoch": 5.10739945880377, + "grad_norm": NaN, + "learning_rate": 1.7115140765242373e-05, + "loss": 0.0, + "step": 54736 + }, + { + "epoch": 5.107492768498647, + "grad_norm": NaN, + "learning_rate": 1.7111632260550985e-05, + "loss": 0.0, + "step": 54737 + }, + { + "epoch": 5.107586078193524, + "grad_norm": NaN, + "learning_rate": 1.7108124093755355e-05, + "loss": 0.0, + "step": 54738 + }, + { + "epoch": 5.1076793878884015, + "grad_norm": NaN, + "learning_rate": 1.7104616264864397e-05, + "loss": 0.0, + "step": 54739 + }, + { + "epoch": 5.107772697583279, + "grad_norm": NaN, + "learning_rate": 1.7101108773887017e-05, + "loss": 0.0, + "step": 54740 + }, + { + "epoch": 5.107866007278156, + "grad_norm": NaN, + "learning_rate": 1.709760162083216e-05, + "loss": 0.0, + "step": 54741 + }, + { + "epoch": 5.107959316973034, + "grad_norm": NaN, + "learning_rate": 1.709409480570874e-05, + "loss": 0.0, + "step": 54742 + }, + { + "epoch": 5.108052626667911, + "grad_norm": NaN, + "learning_rate": 1.709058832852566e-05, + "loss": 0.0, + "step": 54743 + }, + { + "epoch": 5.108145936362788, + "grad_norm": NaN, + "learning_rate": 1.7087082189291852e-05, + "loss": 0.0, + "step": 54744 + }, + { + "epoch": 5.108239246057665, + "grad_norm": NaN, + "learning_rate": 1.708357638801622e-05, + "loss": 0.0, + "step": 54745 + }, + { + "epoch": 5.1083325557525425, + "grad_norm": NaN, + "learning_rate": 1.708007092470767e-05, + "loss": 0.0, + "step": 54746 + }, + { + "epoch": 5.10842586544742, + "grad_norm": NaN, + "learning_rate": 1.7076565799375126e-05, + "loss": 0.0, + "step": 54747 + }, + { + "epoch": 5.108519175142297, + "grad_norm": NaN, + "learning_rate": 1.7073061012027516e-05, + "loss": 0.0, + "step": 54748 + }, + { + "epoch": 5.108612484837175, + "grad_norm": NaN, + "learning_rate": 1.7069556562673724e-05, + "loss": 0.0, + "step": 54749 + }, + { + "epoch": 5.108705794532052, + "grad_norm": NaN, + "learning_rate": 1.7066052451322676e-05, + "loss": 0.0, + "step": 54750 + }, + { + "epoch": 5.108799104226929, + "grad_norm": NaN, + "learning_rate": 1.706254867798328e-05, + "loss": 0.0, + "step": 54751 + }, + { + "epoch": 5.108892413921806, + "grad_norm": NaN, + "learning_rate": 1.7059045242664443e-05, + "loss": 0.0, + "step": 54752 + }, + { + "epoch": 5.108985723616684, + "grad_norm": NaN, + "learning_rate": 1.7055542145375063e-05, + "loss": 0.0, + "step": 54753 + }, + { + "epoch": 5.109079033311561, + "grad_norm": NaN, + "learning_rate": 1.7052039386124078e-05, + "loss": 0.0, + "step": 54754 + }, + { + "epoch": 5.109172343006438, + "grad_norm": NaN, + "learning_rate": 1.7048536964920367e-05, + "loss": 0.0, + "step": 54755 + }, + { + "epoch": 5.109265652701316, + "grad_norm": NaN, + "learning_rate": 1.704503488177284e-05, + "loss": 0.0, + "step": 54756 + }, + { + "epoch": 5.109358962396193, + "grad_norm": NaN, + "learning_rate": 1.7041533136690406e-05, + "loss": 0.0, + "step": 54757 + }, + { + "epoch": 5.109452272091071, + "grad_norm": NaN, + "learning_rate": 1.7038031729681974e-05, + "loss": 0.0, + "step": 54758 + }, + { + "epoch": 5.109545581785947, + "grad_norm": NaN, + "learning_rate": 1.703453066075642e-05, + "loss": 0.0, + "step": 54759 + }, + { + "epoch": 5.109638891480825, + "grad_norm": NaN, + "learning_rate": 1.7031029929922686e-05, + "loss": 0.0, + "step": 54760 + }, + { + "epoch": 5.109732201175702, + "grad_norm": NaN, + "learning_rate": 1.702752953718965e-05, + "loss": 0.0, + "step": 54761 + }, + { + "epoch": 5.1098255108705795, + "grad_norm": NaN, + "learning_rate": 1.702402948256622e-05, + "loss": 0.0, + "step": 54762 + }, + { + "epoch": 5.109918820565457, + "grad_norm": NaN, + "learning_rate": 1.702052976606129e-05, + "loss": 0.0, + "step": 54763 + }, + { + "epoch": 5.110012130260334, + "grad_norm": NaN, + "learning_rate": 1.7017030387683773e-05, + "loss": 0.0, + "step": 54764 + }, + { + "epoch": 5.110105439955212, + "grad_norm": NaN, + "learning_rate": 1.7013531347442538e-05, + "loss": 0.0, + "step": 54765 + }, + { + "epoch": 5.110198749650088, + "grad_norm": NaN, + "learning_rate": 1.70100326453465e-05, + "loss": 0.0, + "step": 54766 + }, + { + "epoch": 5.110292059344966, + "grad_norm": NaN, + "learning_rate": 1.7006534281404566e-05, + "loss": 0.0, + "step": 54767 + }, + { + "epoch": 5.110385369039843, + "grad_norm": NaN, + "learning_rate": 1.7003036255625614e-05, + "loss": 0.0, + "step": 54768 + }, + { + "epoch": 5.1104786787347205, + "grad_norm": NaN, + "learning_rate": 1.699953856801855e-05, + "loss": 0.0, + "step": 54769 + }, + { + "epoch": 5.110571988429598, + "grad_norm": NaN, + "learning_rate": 1.6996041218592255e-05, + "loss": 0.0, + "step": 54770 + }, + { + "epoch": 5.110665298124475, + "grad_norm": NaN, + "learning_rate": 1.6992544207355618e-05, + "loss": 0.0, + "step": 54771 + }, + { + "epoch": 5.110758607819353, + "grad_norm": NaN, + "learning_rate": 1.6989047534317566e-05, + "loss": 0.0, + "step": 54772 + }, + { + "epoch": 5.110851917514229, + "grad_norm": NaN, + "learning_rate": 1.6985551199486947e-05, + "loss": 0.0, + "step": 54773 + }, + { + "epoch": 5.110945227209107, + "grad_norm": NaN, + "learning_rate": 1.698205520287268e-05, + "loss": 0.0, + "step": 54774 + }, + { + "epoch": 5.111038536903984, + "grad_norm": NaN, + "learning_rate": 1.697855954448363e-05, + "loss": 0.0, + "step": 54775 + }, + { + "epoch": 5.1111318465988616, + "grad_norm": NaN, + "learning_rate": 1.6975064224328715e-05, + "loss": 0.0, + "step": 54776 + }, + { + "epoch": 5.111225156293739, + "grad_norm": NaN, + "learning_rate": 1.6971569242416788e-05, + "loss": 0.0, + "step": 54777 + }, + { + "epoch": 5.111318465988616, + "grad_norm": NaN, + "learning_rate": 1.696807459875677e-05, + "loss": 0.0, + "step": 54778 + }, + { + "epoch": 5.111411775683494, + "grad_norm": NaN, + "learning_rate": 1.6964580293357537e-05, + "loss": 0.0, + "step": 54779 + }, + { + "epoch": 5.111505085378371, + "grad_norm": NaN, + "learning_rate": 1.696108632622795e-05, + "loss": 0.0, + "step": 54780 + }, + { + "epoch": 5.111598395073248, + "grad_norm": NaN, + "learning_rate": 1.695759269737692e-05, + "loss": 0.0, + "step": 54781 + }, + { + "epoch": 5.111691704768125, + "grad_norm": NaN, + "learning_rate": 1.695409940681332e-05, + "loss": 0.0, + "step": 54782 + }, + { + "epoch": 5.111785014463003, + "grad_norm": NaN, + "learning_rate": 1.6950606454546045e-05, + "loss": 0.0, + "step": 54783 + }, + { + "epoch": 5.11187832415788, + "grad_norm": NaN, + "learning_rate": 1.6947113840583955e-05, + "loss": 0.0, + "step": 54784 + }, + { + "epoch": 5.111971633852757, + "grad_norm": NaN, + "learning_rate": 1.6943621564935938e-05, + "loss": 0.0, + "step": 54785 + }, + { + "epoch": 5.112064943547635, + "grad_norm": NaN, + "learning_rate": 1.6940129627610892e-05, + "loss": 0.0, + "step": 54786 + }, + { + "epoch": 5.112158253242512, + "grad_norm": NaN, + "learning_rate": 1.693663802861766e-05, + "loss": 0.0, + "step": 54787 + }, + { + "epoch": 5.112251562937389, + "grad_norm": NaN, + "learning_rate": 1.693314676796515e-05, + "loss": 0.0, + "step": 54788 + }, + { + "epoch": 5.112344872632266, + "grad_norm": NaN, + "learning_rate": 1.6929655845662236e-05, + "loss": 0.0, + "step": 54789 + }, + { + "epoch": 5.112438182327144, + "grad_norm": NaN, + "learning_rate": 1.6926165261717783e-05, + "loss": 0.0, + "step": 54790 + }, + { + "epoch": 5.112531492022021, + "grad_norm": NaN, + "learning_rate": 1.692267501614068e-05, + "loss": 0.0, + "step": 54791 + }, + { + "epoch": 5.1126248017168985, + "grad_norm": NaN, + "learning_rate": 1.6919185108939785e-05, + "loss": 0.0, + "step": 54792 + }, + { + "epoch": 5.112718111411776, + "grad_norm": NaN, + "learning_rate": 1.6915695540123997e-05, + "loss": 0.0, + "step": 54793 + }, + { + "epoch": 5.112811421106653, + "grad_norm": NaN, + "learning_rate": 1.6912206309702154e-05, + "loss": 0.0, + "step": 54794 + }, + { + "epoch": 5.11290473080153, + "grad_norm": NaN, + "learning_rate": 1.690871741768315e-05, + "loss": 0.0, + "step": 54795 + }, + { + "epoch": 5.112998040496407, + "grad_norm": NaN, + "learning_rate": 1.690522886407586e-05, + "loss": 0.0, + "step": 54796 + }, + { + "epoch": 5.113091350191285, + "grad_norm": NaN, + "learning_rate": 1.6901740648889146e-05, + "loss": 0.0, + "step": 54797 + }, + { + "epoch": 5.113184659886162, + "grad_norm": NaN, + "learning_rate": 1.6898252772131883e-05, + "loss": 0.0, + "step": 54798 + }, + { + "epoch": 5.1132779695810395, + "grad_norm": NaN, + "learning_rate": 1.689476523381293e-05, + "loss": 0.0, + "step": 54799 + }, + { + "epoch": 5.113371279275917, + "grad_norm": NaN, + "learning_rate": 1.6891278033941147e-05, + "loss": 0.0, + "step": 54800 + }, + { + "epoch": 5.113464588970794, + "grad_norm": NaN, + "learning_rate": 1.688779117252543e-05, + "loss": 0.0, + "step": 54801 + }, + { + "epoch": 5.113557898665672, + "grad_norm": NaN, + "learning_rate": 1.688430464957463e-05, + "loss": 0.0, + "step": 54802 + }, + { + "epoch": 5.113651208360548, + "grad_norm": NaN, + "learning_rate": 1.68808184650976e-05, + "loss": 0.0, + "step": 54803 + }, + { + "epoch": 5.113744518055426, + "grad_norm": NaN, + "learning_rate": 1.6877332619103228e-05, + "loss": 0.0, + "step": 54804 + }, + { + "epoch": 5.113837827750303, + "grad_norm": NaN, + "learning_rate": 1.6873847111600358e-05, + "loss": 0.0, + "step": 54805 + }, + { + "epoch": 5.113931137445181, + "grad_norm": NaN, + "learning_rate": 1.6870361942597864e-05, + "loss": 0.0, + "step": 54806 + }, + { + "epoch": 5.114024447140058, + "grad_norm": NaN, + "learning_rate": 1.686687711210461e-05, + "loss": 0.0, + "step": 54807 + }, + { + "epoch": 5.114117756834935, + "grad_norm": NaN, + "learning_rate": 1.6863392620129436e-05, + "loss": 0.0, + "step": 54808 + }, + { + "epoch": 5.114211066529813, + "grad_norm": NaN, + "learning_rate": 1.685990846668123e-05, + "loss": 0.0, + "step": 54809 + }, + { + "epoch": 5.114304376224689, + "grad_norm": NaN, + "learning_rate": 1.685642465176883e-05, + "loss": 0.0, + "step": 54810 + }, + { + "epoch": 5.114397685919567, + "grad_norm": NaN, + "learning_rate": 1.6852941175401104e-05, + "loss": 0.0, + "step": 54811 + }, + { + "epoch": 5.114490995614444, + "grad_norm": NaN, + "learning_rate": 1.684945803758691e-05, + "loss": 0.0, + "step": 54812 + }, + { + "epoch": 5.114584305309322, + "grad_norm": NaN, + "learning_rate": 1.6845975238335098e-05, + "loss": 0.0, + "step": 54813 + }, + { + "epoch": 5.114677615004199, + "grad_norm": NaN, + "learning_rate": 1.684249277765452e-05, + "loss": 0.0, + "step": 54814 + }, + { + "epoch": 5.1147709246990765, + "grad_norm": NaN, + "learning_rate": 1.6839010655554058e-05, + "loss": 0.0, + "step": 54815 + }, + { + "epoch": 5.114864234393954, + "grad_norm": NaN, + "learning_rate": 1.6835528872042535e-05, + "loss": 0.0, + "step": 54816 + }, + { + "epoch": 5.11495754408883, + "grad_norm": NaN, + "learning_rate": 1.6832047427128814e-05, + "loss": 0.0, + "step": 54817 + }, + { + "epoch": 5.115050853783708, + "grad_norm": NaN, + "learning_rate": 1.6828566320821752e-05, + "loss": 0.0, + "step": 54818 + }, + { + "epoch": 5.115144163478585, + "grad_norm": NaN, + "learning_rate": 1.682508555313021e-05, + "loss": 0.0, + "step": 54819 + }, + { + "epoch": 5.115237473173463, + "grad_norm": NaN, + "learning_rate": 1.682160512406301e-05, + "loss": 0.0, + "step": 54820 + }, + { + "epoch": 5.11533078286834, + "grad_norm": NaN, + "learning_rate": 1.6818125033629016e-05, + "loss": 0.0, + "step": 54821 + }, + { + "epoch": 5.1154240925632175, + "grad_norm": NaN, + "learning_rate": 1.6814645281837087e-05, + "loss": 0.0, + "step": 54822 + }, + { + "epoch": 5.115517402258095, + "grad_norm": NaN, + "learning_rate": 1.6811165868696062e-05, + "loss": 0.0, + "step": 54823 + }, + { + "epoch": 5.115610711952972, + "grad_norm": NaN, + "learning_rate": 1.680768679421479e-05, + "loss": 0.0, + "step": 54824 + }, + { + "epoch": 5.115704021647849, + "grad_norm": NaN, + "learning_rate": 1.6804208058402115e-05, + "loss": 0.0, + "step": 54825 + }, + { + "epoch": 5.115797331342726, + "grad_norm": NaN, + "learning_rate": 1.680072966126687e-05, + "loss": 0.0, + "step": 54826 + }, + { + "epoch": 5.115890641037604, + "grad_norm": NaN, + "learning_rate": 1.6797251602817924e-05, + "loss": 0.0, + "step": 54827 + }, + { + "epoch": 5.115983950732481, + "grad_norm": NaN, + "learning_rate": 1.6793773883064114e-05, + "loss": 0.0, + "step": 54828 + }, + { + "epoch": 5.1160772604273586, + "grad_norm": NaN, + "learning_rate": 1.6790296502014273e-05, + "loss": 0.0, + "step": 54829 + }, + { + "epoch": 5.116170570122236, + "grad_norm": NaN, + "learning_rate": 1.6786819459677254e-05, + "loss": 0.0, + "step": 54830 + }, + { + "epoch": 5.116263879817113, + "grad_norm": NaN, + "learning_rate": 1.6783342756061884e-05, + "loss": 0.0, + "step": 54831 + }, + { + "epoch": 5.11635718951199, + "grad_norm": NaN, + "learning_rate": 1.6779866391177012e-05, + "loss": 0.0, + "step": 54832 + }, + { + "epoch": 5.116450499206867, + "grad_norm": NaN, + "learning_rate": 1.6776390365031488e-05, + "loss": 0.0, + "step": 54833 + }, + { + "epoch": 5.116543808901745, + "grad_norm": NaN, + "learning_rate": 1.6772914677634132e-05, + "loss": 0.0, + "step": 54834 + }, + { + "epoch": 5.116637118596622, + "grad_norm": NaN, + "learning_rate": 1.6769439328993796e-05, + "loss": 0.0, + "step": 54835 + }, + { + "epoch": 5.1167304282915, + "grad_norm": NaN, + "learning_rate": 1.676596431911929e-05, + "loss": 0.0, + "step": 54836 + }, + { + "epoch": 5.116823737986377, + "grad_norm": NaN, + "learning_rate": 1.6762489648019494e-05, + "loss": 0.0, + "step": 54837 + }, + { + "epoch": 5.116917047681254, + "grad_norm": NaN, + "learning_rate": 1.67590153157032e-05, + "loss": 0.0, + "step": 54838 + }, + { + "epoch": 5.117010357376131, + "grad_norm": NaN, + "learning_rate": 1.6755541322179266e-05, + "loss": 0.0, + "step": 54839 + }, + { + "epoch": 5.117103667071008, + "grad_norm": NaN, + "learning_rate": 1.6752067667456536e-05, + "loss": 0.0, + "step": 54840 + }, + { + "epoch": 5.117196976765886, + "grad_norm": NaN, + "learning_rate": 1.6748594351543804e-05, + "loss": 0.0, + "step": 54841 + }, + { + "epoch": 5.117290286460763, + "grad_norm": NaN, + "learning_rate": 1.6745121374449943e-05, + "loss": 0.0, + "step": 54842 + }, + { + "epoch": 5.117383596155641, + "grad_norm": NaN, + "learning_rate": 1.6741648736183765e-05, + "loss": 0.0, + "step": 54843 + }, + { + "epoch": 5.117476905850518, + "grad_norm": NaN, + "learning_rate": 1.673817643675408e-05, + "loss": 0.0, + "step": 54844 + }, + { + "epoch": 5.1175702155453955, + "grad_norm": NaN, + "learning_rate": 1.6734704476169765e-05, + "loss": 0.0, + "step": 54845 + }, + { + "epoch": 5.117663525240273, + "grad_norm": NaN, + "learning_rate": 1.673123285443959e-05, + "loss": 0.0, + "step": 54846 + }, + { + "epoch": 5.117756834935149, + "grad_norm": NaN, + "learning_rate": 1.672776157157244e-05, + "loss": 0.0, + "step": 54847 + }, + { + "epoch": 5.117850144630027, + "grad_norm": NaN, + "learning_rate": 1.6724290627577103e-05, + "loss": 0.0, + "step": 54848 + }, + { + "epoch": 5.117943454324904, + "grad_norm": NaN, + "learning_rate": 1.6720820022462405e-05, + "loss": 0.0, + "step": 54849 + }, + { + "epoch": 5.118036764019782, + "grad_norm": NaN, + "learning_rate": 1.6717349756237193e-05, + "loss": 0.0, + "step": 54850 + }, + { + "epoch": 5.118130073714659, + "grad_norm": NaN, + "learning_rate": 1.6713879828910286e-05, + "loss": 0.0, + "step": 54851 + }, + { + "epoch": 5.1182233834095365, + "grad_norm": NaN, + "learning_rate": 1.6710410240490485e-05, + "loss": 0.0, + "step": 54852 + }, + { + "epoch": 5.118316693104414, + "grad_norm": NaN, + "learning_rate": 1.6706940990986644e-05, + "loss": 0.0, + "step": 54853 + }, + { + "epoch": 5.1184100027992905, + "grad_norm": NaN, + "learning_rate": 1.670347208040756e-05, + "loss": 0.0, + "step": 54854 + }, + { + "epoch": 5.118503312494168, + "grad_norm": NaN, + "learning_rate": 1.6700003508762057e-05, + "loss": 0.0, + "step": 54855 + }, + { + "epoch": 5.118596622189045, + "grad_norm": NaN, + "learning_rate": 1.669653527605896e-05, + "loss": 0.0, + "step": 54856 + }, + { + "epoch": 5.118689931883923, + "grad_norm": NaN, + "learning_rate": 1.6693067382307095e-05, + "loss": 0.0, + "step": 54857 + }, + { + "epoch": 5.1187832415788, + "grad_norm": NaN, + "learning_rate": 1.668959982751526e-05, + "loss": 0.0, + "step": 54858 + }, + { + "epoch": 5.118876551273678, + "grad_norm": NaN, + "learning_rate": 1.6686132611692294e-05, + "loss": 0.0, + "step": 54859 + }, + { + "epoch": 5.118969860968555, + "grad_norm": NaN, + "learning_rate": 1.668266573484699e-05, + "loss": 0.0, + "step": 54860 + }, + { + "epoch": 5.1190631706634315, + "grad_norm": NaN, + "learning_rate": 1.6679199196988173e-05, + "loss": 0.0, + "step": 54861 + }, + { + "epoch": 5.119156480358309, + "grad_norm": NaN, + "learning_rate": 1.6675732998124673e-05, + "loss": 0.0, + "step": 54862 + }, + { + "epoch": 5.119249790053186, + "grad_norm": NaN, + "learning_rate": 1.6672267138265284e-05, + "loss": 0.0, + "step": 54863 + }, + { + "epoch": 5.119343099748064, + "grad_norm": NaN, + "learning_rate": 1.6668801617418827e-05, + "loss": 0.0, + "step": 54864 + }, + { + "epoch": 5.119436409442941, + "grad_norm": NaN, + "learning_rate": 1.6665336435594113e-05, + "loss": 0.0, + "step": 54865 + }, + { + "epoch": 5.119529719137819, + "grad_norm": NaN, + "learning_rate": 1.6661871592799953e-05, + "loss": 0.0, + "step": 54866 + }, + { + "epoch": 5.119623028832696, + "grad_norm": NaN, + "learning_rate": 1.6658407089045157e-05, + "loss": 0.0, + "step": 54867 + }, + { + "epoch": 5.119716338527573, + "grad_norm": NaN, + "learning_rate": 1.665494292433853e-05, + "loss": 0.0, + "step": 54868 + }, + { + "epoch": 5.11980964822245, + "grad_norm": NaN, + "learning_rate": 1.6651479098688873e-05, + "loss": 0.0, + "step": 54869 + }, + { + "epoch": 5.119902957917327, + "grad_norm": NaN, + "learning_rate": 1.6648015612105027e-05, + "loss": 0.0, + "step": 54870 + }, + { + "epoch": 5.119996267612205, + "grad_norm": NaN, + "learning_rate": 1.664455246459576e-05, + "loss": 0.0, + "step": 54871 + }, + { + "epoch": 5.120089577307082, + "grad_norm": NaN, + "learning_rate": 1.6641089656169894e-05, + "loss": 0.0, + "step": 54872 + }, + { + "epoch": 5.12018288700196, + "grad_norm": NaN, + "learning_rate": 1.663762718683623e-05, + "loss": 0.0, + "step": 54873 + }, + { + "epoch": 5.120276196696837, + "grad_norm": NaN, + "learning_rate": 1.6634165056603578e-05, + "loss": 0.0, + "step": 54874 + }, + { + "epoch": 5.1203695063917145, + "grad_norm": NaN, + "learning_rate": 1.663070326548075e-05, + "loss": 0.0, + "step": 54875 + }, + { + "epoch": 5.120462816086591, + "grad_norm": NaN, + "learning_rate": 1.6627241813476522e-05, + "loss": 0.0, + "step": 54876 + }, + { + "epoch": 5.1205561257814685, + "grad_norm": NaN, + "learning_rate": 1.662378070059972e-05, + "loss": 0.0, + "step": 54877 + }, + { + "epoch": 5.120649435476346, + "grad_norm": NaN, + "learning_rate": 1.662031992685912e-05, + "loss": 0.0, + "step": 54878 + }, + { + "epoch": 5.120742745171223, + "grad_norm": NaN, + "learning_rate": 1.6616859492263553e-05, + "loss": 0.0, + "step": 54879 + }, + { + "epoch": 5.120836054866101, + "grad_norm": NaN, + "learning_rate": 1.6613399396821786e-05, + "loss": 0.0, + "step": 54880 + }, + { + "epoch": 5.120929364560978, + "grad_norm": NaN, + "learning_rate": 1.660993964054265e-05, + "loss": 0.0, + "step": 54881 + }, + { + "epoch": 5.1210226742558556, + "grad_norm": NaN, + "learning_rate": 1.660648022343491e-05, + "loss": 0.0, + "step": 54882 + }, + { + "epoch": 5.121115983950732, + "grad_norm": NaN, + "learning_rate": 1.6603021145507383e-05, + "loss": 0.0, + "step": 54883 + }, + { + "epoch": 5.1212092936456095, + "grad_norm": NaN, + "learning_rate": 1.659956240676887e-05, + "loss": 0.0, + "step": 54884 + }, + { + "epoch": 5.121302603340487, + "grad_norm": NaN, + "learning_rate": 1.6596104007228138e-05, + "loss": 0.0, + "step": 54885 + }, + { + "epoch": 5.121395913035364, + "grad_norm": NaN, + "learning_rate": 1.659264594689399e-05, + "loss": 0.0, + "step": 54886 + }, + { + "epoch": 5.121489222730242, + "grad_norm": NaN, + "learning_rate": 1.6589188225775247e-05, + "loss": 0.0, + "step": 54887 + }, + { + "epoch": 5.121582532425119, + "grad_norm": NaN, + "learning_rate": 1.658573084388065e-05, + "loss": 0.0, + "step": 54888 + }, + { + "epoch": 5.121675842119997, + "grad_norm": NaN, + "learning_rate": 1.6582273801219048e-05, + "loss": 0.0, + "step": 54889 + }, + { + "epoch": 5.121769151814873, + "grad_norm": NaN, + "learning_rate": 1.657881709779918e-05, + "loss": 0.0, + "step": 54890 + }, + { + "epoch": 5.1218624615097506, + "grad_norm": NaN, + "learning_rate": 1.6575360733629874e-05, + "loss": 0.0, + "step": 54891 + }, + { + "epoch": 5.121955771204628, + "grad_norm": NaN, + "learning_rate": 1.65719047087199e-05, + "loss": 0.0, + "step": 54892 + }, + { + "epoch": 5.122049080899505, + "grad_norm": NaN, + "learning_rate": 1.6568449023078046e-05, + "loss": 0.0, + "step": 54893 + }, + { + "epoch": 5.122142390594383, + "grad_norm": NaN, + "learning_rate": 1.6564993676713096e-05, + "loss": 0.0, + "step": 54894 + }, + { + "epoch": 5.12223570028926, + "grad_norm": NaN, + "learning_rate": 1.656153866963381e-05, + "loss": 0.0, + "step": 54895 + }, + { + "epoch": 5.122329009984138, + "grad_norm": NaN, + "learning_rate": 1.655808400184907e-05, + "loss": 0.0, + "step": 54896 + }, + { + "epoch": 5.122422319679015, + "grad_norm": NaN, + "learning_rate": 1.6554629673367566e-05, + "loss": 0.0, + "step": 54897 + }, + { + "epoch": 5.122515629373892, + "grad_norm": NaN, + "learning_rate": 1.655117568419807e-05, + "loss": 0.0, + "step": 54898 + }, + { + "epoch": 5.122608939068769, + "grad_norm": NaN, + "learning_rate": 1.6547722034349464e-05, + "loss": 0.0, + "step": 54899 + }, + { + "epoch": 5.122702248763646, + "grad_norm": NaN, + "learning_rate": 1.6544268723830456e-05, + "loss": 0.0, + "step": 54900 + }, + { + "epoch": 5.122795558458524, + "grad_norm": NaN, + "learning_rate": 1.654081575264982e-05, + "loss": 0.0, + "step": 54901 + }, + { + "epoch": 5.122888868153401, + "grad_norm": NaN, + "learning_rate": 1.6537363120816372e-05, + "loss": 0.0, + "step": 54902 + }, + { + "epoch": 5.122982177848279, + "grad_norm": NaN, + "learning_rate": 1.6533910828338864e-05, + "loss": 0.0, + "step": 54903 + }, + { + "epoch": 5.123075487543156, + "grad_norm": NaN, + "learning_rate": 1.6530458875226092e-05, + "loss": 0.0, + "step": 54904 + }, + { + "epoch": 5.123168797238033, + "grad_norm": NaN, + "learning_rate": 1.6527007261486818e-05, + "loss": 0.0, + "step": 54905 + }, + { + "epoch": 5.12326210693291, + "grad_norm": NaN, + "learning_rate": 1.6523555987129834e-05, + "loss": 0.0, + "step": 54906 + }, + { + "epoch": 5.1233554166277875, + "grad_norm": NaN, + "learning_rate": 1.6520105052163912e-05, + "loss": 0.0, + "step": 54907 + }, + { + "epoch": 5.123448726322665, + "grad_norm": NaN, + "learning_rate": 1.6516654456597818e-05, + "loss": 0.0, + "step": 54908 + }, + { + "epoch": 5.123542036017542, + "grad_norm": NaN, + "learning_rate": 1.6513204200440322e-05, + "loss": 0.0, + "step": 54909 + }, + { + "epoch": 5.12363534571242, + "grad_norm": NaN, + "learning_rate": 1.650975428370022e-05, + "loss": 0.0, + "step": 54910 + }, + { + "epoch": 5.123728655407297, + "grad_norm": NaN, + "learning_rate": 1.650630470638626e-05, + "loss": 0.0, + "step": 54911 + }, + { + "epoch": 5.123821965102174, + "grad_norm": NaN, + "learning_rate": 1.6502855468507226e-05, + "loss": 0.0, + "step": 54912 + }, + { + "epoch": 5.123915274797051, + "grad_norm": NaN, + "learning_rate": 1.64994065700719e-05, + "loss": 0.0, + "step": 54913 + }, + { + "epoch": 5.1240085844919285, + "grad_norm": NaN, + "learning_rate": 1.649595801108902e-05, + "loss": 0.0, + "step": 54914 + }, + { + "epoch": 5.124101894186806, + "grad_norm": NaN, + "learning_rate": 1.649250979156737e-05, + "loss": 0.0, + "step": 54915 + }, + { + "epoch": 5.124195203881683, + "grad_norm": NaN, + "learning_rate": 1.6489061911515737e-05, + "loss": 0.0, + "step": 54916 + }, + { + "epoch": 5.124288513576561, + "grad_norm": NaN, + "learning_rate": 1.648561437094285e-05, + "loss": 0.0, + "step": 54917 + }, + { + "epoch": 5.124381823271438, + "grad_norm": NaN, + "learning_rate": 1.648216716985752e-05, + "loss": 0.0, + "step": 54918 + }, + { + "epoch": 5.124475132966316, + "grad_norm": NaN, + "learning_rate": 1.6478720308268466e-05, + "loss": 0.0, + "step": 54919 + }, + { + "epoch": 5.124568442661192, + "grad_norm": NaN, + "learning_rate": 1.647527378618449e-05, + "loss": 0.0, + "step": 54920 + }, + { + "epoch": 5.12466175235607, + "grad_norm": NaN, + "learning_rate": 1.647182760361433e-05, + "loss": 0.0, + "step": 54921 + }, + { + "epoch": 5.124755062050947, + "grad_norm": NaN, + "learning_rate": 1.646838176056676e-05, + "loss": 0.0, + "step": 54922 + }, + { + "epoch": 5.124848371745824, + "grad_norm": NaN, + "learning_rate": 1.646493625705055e-05, + "loss": 0.0, + "step": 54923 + }, + { + "epoch": 5.124941681440702, + "grad_norm": NaN, + "learning_rate": 1.646149109307443e-05, + "loss": 0.0, + "step": 54924 + }, + { + "epoch": 5.125034991135579, + "grad_norm": NaN, + "learning_rate": 1.6458046268647202e-05, + "loss": 0.0, + "step": 54925 + }, + { + "epoch": 5.125128300830457, + "grad_norm": NaN, + "learning_rate": 1.6454601783777587e-05, + "loss": 0.0, + "step": 54926 + }, + { + "epoch": 5.125221610525333, + "grad_norm": NaN, + "learning_rate": 1.6451157638474366e-05, + "loss": 0.0, + "step": 54927 + }, + { + "epoch": 5.125314920220211, + "grad_norm": NaN, + "learning_rate": 1.6447713832746294e-05, + "loss": 0.0, + "step": 54928 + }, + { + "epoch": 5.125408229915088, + "grad_norm": NaN, + "learning_rate": 1.6444270366602137e-05, + "loss": 0.0, + "step": 54929 + }, + { + "epoch": 5.1255015396099655, + "grad_norm": NaN, + "learning_rate": 1.6440827240050615e-05, + "loss": 0.0, + "step": 54930 + }, + { + "epoch": 5.125594849304843, + "grad_norm": NaN, + "learning_rate": 1.643738445310052e-05, + "loss": 0.0, + "step": 54931 + }, + { + "epoch": 5.12568815899972, + "grad_norm": NaN, + "learning_rate": 1.6433942005760587e-05, + "loss": 0.0, + "step": 54932 + }, + { + "epoch": 5.125781468694598, + "grad_norm": NaN, + "learning_rate": 1.6430499898039552e-05, + "loss": 0.0, + "step": 54933 + }, + { + "epoch": 5.125874778389474, + "grad_norm": NaN, + "learning_rate": 1.642705812994624e-05, + "loss": 0.0, + "step": 54934 + }, + { + "epoch": 5.125968088084352, + "grad_norm": NaN, + "learning_rate": 1.6423616701489333e-05, + "loss": 0.0, + "step": 54935 + }, + { + "epoch": 5.126061397779229, + "grad_norm": NaN, + "learning_rate": 1.6420175612677573e-05, + "loss": 0.0, + "step": 54936 + }, + { + "epoch": 5.1261547074741065, + "grad_norm": NaN, + "learning_rate": 1.641673486351978e-05, + "loss": 0.0, + "step": 54937 + }, + { + "epoch": 5.126248017168984, + "grad_norm": NaN, + "learning_rate": 1.641329445402464e-05, + "loss": 0.0, + "step": 54938 + }, + { + "epoch": 5.126341326863861, + "grad_norm": NaN, + "learning_rate": 1.6409854384200888e-05, + "loss": 0.0, + "step": 54939 + }, + { + "epoch": 5.126434636558739, + "grad_norm": NaN, + "learning_rate": 1.640641465405737e-05, + "loss": 0.0, + "step": 54940 + }, + { + "epoch": 5.126527946253615, + "grad_norm": NaN, + "learning_rate": 1.640297526360273e-05, + "loss": 0.0, + "step": 54941 + }, + { + "epoch": 5.126621255948493, + "grad_norm": NaN, + "learning_rate": 1.6399536212845708e-05, + "loss": 0.0, + "step": 54942 + }, + { + "epoch": 5.12671456564337, + "grad_norm": NaN, + "learning_rate": 1.6396097501795168e-05, + "loss": 0.0, + "step": 54943 + }, + { + "epoch": 5.1268078753382476, + "grad_norm": NaN, + "learning_rate": 1.639265913045975e-05, + "loss": 0.0, + "step": 54944 + }, + { + "epoch": 5.126901185033125, + "grad_norm": NaN, + "learning_rate": 1.63892210988482e-05, + "loss": 0.0, + "step": 54945 + }, + { + "epoch": 5.126994494728002, + "grad_norm": NaN, + "learning_rate": 1.6385783406969304e-05, + "loss": 0.0, + "step": 54946 + }, + { + "epoch": 5.12708780442288, + "grad_norm": NaN, + "learning_rate": 1.6382346054831763e-05, + "loss": 0.0, + "step": 54947 + }, + { + "epoch": 5.127181114117757, + "grad_norm": NaN, + "learning_rate": 1.6378909042444348e-05, + "loss": 0.0, + "step": 54948 + }, + { + "epoch": 5.127274423812634, + "grad_norm": NaN, + "learning_rate": 1.637547236981579e-05, + "loss": 0.0, + "step": 54949 + }, + { + "epoch": 5.127367733507511, + "grad_norm": NaN, + "learning_rate": 1.637203603695481e-05, + "loss": 0.0, + "step": 54950 + }, + { + "epoch": 5.127461043202389, + "grad_norm": NaN, + "learning_rate": 1.6368600043870172e-05, + "loss": 0.0, + "step": 54951 + }, + { + "epoch": 5.127554352897266, + "grad_norm": NaN, + "learning_rate": 1.6365164390570584e-05, + "loss": 0.0, + "step": 54952 + }, + { + "epoch": 5.127647662592143, + "grad_norm": NaN, + "learning_rate": 1.6361729077064805e-05, + "loss": 0.0, + "step": 54953 + }, + { + "epoch": 5.127740972287021, + "grad_norm": NaN, + "learning_rate": 1.6358294103361563e-05, + "loss": 0.0, + "step": 54954 + }, + { + "epoch": 5.127834281981898, + "grad_norm": NaN, + "learning_rate": 1.63548594694696e-05, + "loss": 0.0, + "step": 54955 + }, + { + "epoch": 5.127927591676775, + "grad_norm": NaN, + "learning_rate": 1.6351425175397625e-05, + "loss": 0.0, + "step": 54956 + }, + { + "epoch": 5.128020901371652, + "grad_norm": NaN, + "learning_rate": 1.63479912211544e-05, + "loss": 0.0, + "step": 54957 + }, + { + "epoch": 5.12811421106653, + "grad_norm": NaN, + "learning_rate": 1.634455760674863e-05, + "loss": 0.0, + "step": 54958 + }, + { + "epoch": 5.128207520761407, + "grad_norm": NaN, + "learning_rate": 1.6341124332189064e-05, + "loss": 0.0, + "step": 54959 + }, + { + "epoch": 5.1283008304562845, + "grad_norm": NaN, + "learning_rate": 1.6337691397484427e-05, + "loss": 0.0, + "step": 54960 + }, + { + "epoch": 5.128394140151162, + "grad_norm": NaN, + "learning_rate": 1.6334258802643456e-05, + "loss": 0.0, + "step": 54961 + }, + { + "epoch": 5.128487449846039, + "grad_norm": NaN, + "learning_rate": 1.6330826547674852e-05, + "loss": 0.0, + "step": 54962 + }, + { + "epoch": 5.128580759540917, + "grad_norm": NaN, + "learning_rate": 1.6327394632587372e-05, + "loss": 0.0, + "step": 54963 + }, + { + "epoch": 5.128674069235793, + "grad_norm": NaN, + "learning_rate": 1.632396305738974e-05, + "loss": 0.0, + "step": 54964 + }, + { + "epoch": 5.128767378930671, + "grad_norm": NaN, + "learning_rate": 1.6320531822090648e-05, + "loss": 0.0, + "step": 54965 + }, + { + "epoch": 5.128860688625548, + "grad_norm": NaN, + "learning_rate": 1.6317100926698863e-05, + "loss": 0.0, + "step": 54966 + }, + { + "epoch": 5.1289539983204255, + "grad_norm": NaN, + "learning_rate": 1.6313670371223086e-05, + "loss": 0.0, + "step": 54967 + }, + { + "epoch": 5.129047308015303, + "grad_norm": NaN, + "learning_rate": 1.6310240155672033e-05, + "loss": 0.0, + "step": 54968 + }, + { + "epoch": 5.12914061771018, + "grad_norm": NaN, + "learning_rate": 1.630681028005446e-05, + "loss": 0.0, + "step": 54969 + }, + { + "epoch": 5.129233927405058, + "grad_norm": NaN, + "learning_rate": 1.6303380744379048e-05, + "loss": 0.0, + "step": 54970 + }, + { + "epoch": 5.129327237099934, + "grad_norm": NaN, + "learning_rate": 1.6299951548654515e-05, + "loss": 0.0, + "step": 54971 + }, + { + "epoch": 5.129420546794812, + "grad_norm": NaN, + "learning_rate": 1.6296522692889647e-05, + "loss": 0.0, + "step": 54972 + }, + { + "epoch": 5.129513856489689, + "grad_norm": NaN, + "learning_rate": 1.6293094177093097e-05, + "loss": 0.0, + "step": 54973 + }, + { + "epoch": 5.129607166184567, + "grad_norm": NaN, + "learning_rate": 1.6289666001273576e-05, + "loss": 0.0, + "step": 54974 + }, + { + "epoch": 5.129700475879444, + "grad_norm": NaN, + "learning_rate": 1.628623816543988e-05, + "loss": 0.0, + "step": 54975 + }, + { + "epoch": 5.129793785574321, + "grad_norm": NaN, + "learning_rate": 1.6282810669600633e-05, + "loss": 0.0, + "step": 54976 + }, + { + "epoch": 5.129887095269199, + "grad_norm": NaN, + "learning_rate": 1.627938351376456e-05, + "loss": 0.0, + "step": 54977 + }, + { + "epoch": 5.129980404964075, + "grad_norm": NaN, + "learning_rate": 1.6275956697940472e-05, + "loss": 0.0, + "step": 54978 + }, + { + "epoch": 5.130073714658953, + "grad_norm": NaN, + "learning_rate": 1.627253022213698e-05, + "loss": 0.0, + "step": 54979 + }, + { + "epoch": 5.13016702435383, + "grad_norm": NaN, + "learning_rate": 1.626910408636281e-05, + "loss": 0.0, + "step": 54980 + }, + { + "epoch": 5.130260334048708, + "grad_norm": NaN, + "learning_rate": 1.6265678290626754e-05, + "loss": 0.0, + "step": 54981 + }, + { + "epoch": 5.130353643743585, + "grad_norm": NaN, + "learning_rate": 1.6262252834937422e-05, + "loss": 0.0, + "step": 54982 + }, + { + "epoch": 5.1304469534384625, + "grad_norm": NaN, + "learning_rate": 1.625882771930354e-05, + "loss": 0.0, + "step": 54983 + }, + { + "epoch": 5.13054026313334, + "grad_norm": NaN, + "learning_rate": 1.6255402943733904e-05, + "loss": 0.0, + "step": 54984 + }, + { + "epoch": 5.130633572828216, + "grad_norm": NaN, + "learning_rate": 1.625197850823712e-05, + "loss": 0.0, + "step": 54985 + }, + { + "epoch": 5.130726882523094, + "grad_norm": NaN, + "learning_rate": 1.6248554412821913e-05, + "loss": 0.0, + "step": 54986 + }, + { + "epoch": 5.130820192217971, + "grad_norm": NaN, + "learning_rate": 1.624513065749708e-05, + "loss": 0.0, + "step": 54987 + }, + { + "epoch": 5.130913501912849, + "grad_norm": NaN, + "learning_rate": 1.6241707242271213e-05, + "loss": 0.0, + "step": 54988 + }, + { + "epoch": 5.131006811607726, + "grad_norm": NaN, + "learning_rate": 1.623828416715307e-05, + "loss": 0.0, + "step": 54989 + }, + { + "epoch": 5.1311001213026035, + "grad_norm": NaN, + "learning_rate": 1.6234861432151347e-05, + "loss": 0.0, + "step": 54990 + }, + { + "epoch": 5.131193430997481, + "grad_norm": NaN, + "learning_rate": 1.623143903727475e-05, + "loss": 0.0, + "step": 54991 + }, + { + "epoch": 5.131286740692358, + "grad_norm": NaN, + "learning_rate": 1.6228016982531976e-05, + "loss": 0.0, + "step": 54992 + }, + { + "epoch": 5.131380050387235, + "grad_norm": NaN, + "learning_rate": 1.6224595267931734e-05, + "loss": 0.0, + "step": 54993 + }, + { + "epoch": 5.131473360082112, + "grad_norm": NaN, + "learning_rate": 1.6221173893482715e-05, + "loss": 0.0, + "step": 54994 + }, + { + "epoch": 5.13156666977699, + "grad_norm": NaN, + "learning_rate": 1.621775285919361e-05, + "loss": 0.0, + "step": 54995 + }, + { + "epoch": 5.131659979471867, + "grad_norm": NaN, + "learning_rate": 1.621433216507315e-05, + "loss": 0.0, + "step": 54996 + }, + { + "epoch": 5.131753289166745, + "grad_norm": NaN, + "learning_rate": 1.6210911811129997e-05, + "loss": 0.0, + "step": 54997 + }, + { + "epoch": 5.131846598861622, + "grad_norm": NaN, + "learning_rate": 1.6207491797372868e-05, + "loss": 0.0, + "step": 54998 + }, + { + "epoch": 5.131939908556499, + "grad_norm": NaN, + "learning_rate": 1.6204072123810463e-05, + "loss": 0.0, + "step": 54999 + }, + { + "epoch": 5.132033218251376, + "grad_norm": NaN, + "learning_rate": 1.6200652790451473e-05, + "loss": 0.0, + "step": 55000 + }, + { + "epoch": 5.132126527946253, + "grad_norm": NaN, + "learning_rate": 1.6197233797304576e-05, + "loss": 0.0, + "step": 55001 + }, + { + "epoch": 5.132219837641131, + "grad_norm": NaN, + "learning_rate": 1.619381514437848e-05, + "loss": 0.0, + "step": 55002 + }, + { + "epoch": 5.132313147336008, + "grad_norm": NaN, + "learning_rate": 1.619039683168188e-05, + "loss": 0.0, + "step": 55003 + }, + { + "epoch": 5.132406457030886, + "grad_norm": NaN, + "learning_rate": 1.6186978859223465e-05, + "loss": 0.0, + "step": 55004 + }, + { + "epoch": 5.132499766725763, + "grad_norm": NaN, + "learning_rate": 1.6183561227011916e-05, + "loss": 0.0, + "step": 55005 + }, + { + "epoch": 5.13259307642064, + "grad_norm": NaN, + "learning_rate": 1.618014393505594e-05, + "loss": 0.0, + "step": 55006 + }, + { + "epoch": 5.132686386115517, + "grad_norm": NaN, + "learning_rate": 1.6176726983364213e-05, + "loss": 0.0, + "step": 55007 + }, + { + "epoch": 5.132779695810394, + "grad_norm": NaN, + "learning_rate": 1.617331037194543e-05, + "loss": 0.0, + "step": 55008 + }, + { + "epoch": 5.132873005505272, + "grad_norm": NaN, + "learning_rate": 1.616989410080825e-05, + "loss": 0.0, + "step": 55009 + }, + { + "epoch": 5.132966315200149, + "grad_norm": NaN, + "learning_rate": 1.616647816996143e-05, + "loss": 0.0, + "step": 55010 + }, + { + "epoch": 5.133059624895027, + "grad_norm": NaN, + "learning_rate": 1.6163062579413606e-05, + "loss": 0.0, + "step": 55011 + }, + { + "epoch": 5.133152934589904, + "grad_norm": NaN, + "learning_rate": 1.6159647329173424e-05, + "loss": 0.0, + "step": 55012 + }, + { + "epoch": 5.1332462442847815, + "grad_norm": NaN, + "learning_rate": 1.615623241924967e-05, + "loss": 0.0, + "step": 55013 + }, + { + "epoch": 5.133339553979659, + "grad_norm": NaN, + "learning_rate": 1.6152817849650935e-05, + "loss": 0.0, + "step": 55014 + }, + { + "epoch": 5.133432863674535, + "grad_norm": NaN, + "learning_rate": 1.6149403620385924e-05, + "loss": 0.0, + "step": 55015 + }, + { + "epoch": 5.133526173369413, + "grad_norm": NaN, + "learning_rate": 1.614598973146337e-05, + "loss": 0.0, + "step": 55016 + }, + { + "epoch": 5.13361948306429, + "grad_norm": NaN, + "learning_rate": 1.6142576182891897e-05, + "loss": 0.0, + "step": 55017 + }, + { + "epoch": 5.133712792759168, + "grad_norm": NaN, + "learning_rate": 1.6139162974680164e-05, + "loss": 0.0, + "step": 55018 + }, + { + "epoch": 5.133806102454045, + "grad_norm": NaN, + "learning_rate": 1.6135750106836966e-05, + "loss": 0.0, + "step": 55019 + }, + { + "epoch": 5.1338994121489225, + "grad_norm": NaN, + "learning_rate": 1.6132337579370857e-05, + "loss": 0.0, + "step": 55020 + }, + { + "epoch": 5.1339927218438, + "grad_norm": NaN, + "learning_rate": 1.6128925392290522e-05, + "loss": 0.0, + "step": 55021 + }, + { + "epoch": 5.1340860315386765, + "grad_norm": NaN, + "learning_rate": 1.6125513545604745e-05, + "loss": 0.0, + "step": 55022 + }, + { + "epoch": 5.134179341233554, + "grad_norm": NaN, + "learning_rate": 1.6122102039322093e-05, + "loss": 0.0, + "step": 55023 + }, + { + "epoch": 5.134272650928431, + "grad_norm": NaN, + "learning_rate": 1.6118690873451256e-05, + "loss": 0.0, + "step": 55024 + }, + { + "epoch": 5.134365960623309, + "grad_norm": NaN, + "learning_rate": 1.6115280048000978e-05, + "loss": 0.0, + "step": 55025 + }, + { + "epoch": 5.134459270318186, + "grad_norm": NaN, + "learning_rate": 1.6111869562979868e-05, + "loss": 0.0, + "step": 55026 + }, + { + "epoch": 5.134552580013064, + "grad_norm": NaN, + "learning_rate": 1.6108459418396567e-05, + "loss": 0.0, + "step": 55027 + }, + { + "epoch": 5.134645889707941, + "grad_norm": NaN, + "learning_rate": 1.6105049614259874e-05, + "loss": 0.0, + "step": 55028 + }, + { + "epoch": 5.1347391994028175, + "grad_norm": NaN, + "learning_rate": 1.6101640150578326e-05, + "loss": 0.0, + "step": 55029 + }, + { + "epoch": 5.134832509097695, + "grad_norm": NaN, + "learning_rate": 1.609823102736062e-05, + "loss": 0.0, + "step": 55030 + }, + { + "epoch": 5.134925818792572, + "grad_norm": NaN, + "learning_rate": 1.6094822244615513e-05, + "loss": 0.0, + "step": 55031 + }, + { + "epoch": 5.13501912848745, + "grad_norm": NaN, + "learning_rate": 1.6091413802351565e-05, + "loss": 0.0, + "step": 55032 + }, + { + "epoch": 5.135112438182327, + "grad_norm": NaN, + "learning_rate": 1.608800570057747e-05, + "loss": 0.0, + "step": 55033 + }, + { + "epoch": 5.135205747877205, + "grad_norm": NaN, + "learning_rate": 1.6084597939301957e-05, + "loss": 0.0, + "step": 55034 + }, + { + "epoch": 5.135299057572082, + "grad_norm": NaN, + "learning_rate": 1.608119051853363e-05, + "loss": 0.0, + "step": 55035 + }, + { + "epoch": 5.1353923672669595, + "grad_norm": NaN, + "learning_rate": 1.6077783438281156e-05, + "loss": 0.0, + "step": 55036 + }, + { + "epoch": 5.135485676961836, + "grad_norm": NaN, + "learning_rate": 1.6074376698553205e-05, + "loss": 0.0, + "step": 55037 + }, + { + "epoch": 5.135578986656713, + "grad_norm": NaN, + "learning_rate": 1.607097029935844e-05, + "loss": 0.0, + "step": 55038 + }, + { + "epoch": 5.135672296351591, + "grad_norm": NaN, + "learning_rate": 1.6067564240705534e-05, + "loss": 0.0, + "step": 55039 + }, + { + "epoch": 5.135765606046468, + "grad_norm": NaN, + "learning_rate": 1.6064158522603133e-05, + "loss": 0.0, + "step": 55040 + }, + { + "epoch": 5.135858915741346, + "grad_norm": NaN, + "learning_rate": 1.6060753145059896e-05, + "loss": 0.0, + "step": 55041 + }, + { + "epoch": 5.135952225436223, + "grad_norm": NaN, + "learning_rate": 1.6057348108084502e-05, + "loss": 0.0, + "step": 55042 + }, + { + "epoch": 5.1360455351311005, + "grad_norm": NaN, + "learning_rate": 1.605394341168557e-05, + "loss": 0.0, + "step": 55043 + }, + { + "epoch": 5.136138844825977, + "grad_norm": NaN, + "learning_rate": 1.6050539055871803e-05, + "loss": 0.0, + "step": 55044 + }, + { + "epoch": 5.1362321545208545, + "grad_norm": NaN, + "learning_rate": 1.604713504065184e-05, + "loss": 0.0, + "step": 55045 + }, + { + "epoch": 5.136325464215732, + "grad_norm": NaN, + "learning_rate": 1.604373136603432e-05, + "loss": 0.0, + "step": 55046 + }, + { + "epoch": 5.136418773910609, + "grad_norm": NaN, + "learning_rate": 1.6040328032027887e-05, + "loss": 0.0, + "step": 55047 + }, + { + "epoch": 5.136512083605487, + "grad_norm": NaN, + "learning_rate": 1.6036925038641274e-05, + "loss": 0.0, + "step": 55048 + }, + { + "epoch": 5.136605393300364, + "grad_norm": NaN, + "learning_rate": 1.6033522385883052e-05, + "loss": 0.0, + "step": 55049 + }, + { + "epoch": 5.136698702995242, + "grad_norm": NaN, + "learning_rate": 1.6030120073761864e-05, + "loss": 0.0, + "step": 55050 + }, + { + "epoch": 5.136792012690118, + "grad_norm": NaN, + "learning_rate": 1.602671810228647e-05, + "loss": 0.0, + "step": 55051 + }, + { + "epoch": 5.1368853223849955, + "grad_norm": NaN, + "learning_rate": 1.6023316471465403e-05, + "loss": 0.0, + "step": 55052 + }, + { + "epoch": 5.136978632079873, + "grad_norm": NaN, + "learning_rate": 1.601991518130733e-05, + "loss": 0.0, + "step": 55053 + }, + { + "epoch": 5.13707194177475, + "grad_norm": NaN, + "learning_rate": 1.601651423182098e-05, + "loss": 0.0, + "step": 55054 + }, + { + "epoch": 5.137165251469628, + "grad_norm": NaN, + "learning_rate": 1.6013113623014918e-05, + "loss": 0.0, + "step": 55055 + }, + { + "epoch": 5.137258561164505, + "grad_norm": NaN, + "learning_rate": 1.6009713354897795e-05, + "loss": 0.0, + "step": 55056 + }, + { + "epoch": 5.137351870859383, + "grad_norm": NaN, + "learning_rate": 1.600631342747833e-05, + "loss": 0.0, + "step": 55057 + }, + { + "epoch": 5.137445180554259, + "grad_norm": NaN, + "learning_rate": 1.600291384076509e-05, + "loss": 0.0, + "step": 55058 + }, + { + "epoch": 5.137538490249137, + "grad_norm": NaN, + "learning_rate": 1.599951459476671e-05, + "loss": 0.0, + "step": 55059 + }, + { + "epoch": 5.137631799944014, + "grad_norm": NaN, + "learning_rate": 1.5996115689491946e-05, + "loss": 0.0, + "step": 55060 + }, + { + "epoch": 5.137725109638891, + "grad_norm": NaN, + "learning_rate": 1.5992717124949323e-05, + "loss": 0.0, + "step": 55061 + }, + { + "epoch": 5.137818419333769, + "grad_norm": NaN, + "learning_rate": 1.5989318901147503e-05, + "loss": 0.0, + "step": 55062 + }, + { + "epoch": 5.137911729028646, + "grad_norm": NaN, + "learning_rate": 1.598592101809521e-05, + "loss": 0.0, + "step": 55063 + }, + { + "epoch": 5.138005038723524, + "grad_norm": NaN, + "learning_rate": 1.598252347580099e-05, + "loss": 0.0, + "step": 55064 + }, + { + "epoch": 5.138098348418401, + "grad_norm": NaN, + "learning_rate": 1.5979126274273468e-05, + "loss": 0.0, + "step": 55065 + }, + { + "epoch": 5.138191658113278, + "grad_norm": NaN, + "learning_rate": 1.5975729413521403e-05, + "loss": 0.0, + "step": 55066 + }, + { + "epoch": 5.138284967808155, + "grad_norm": NaN, + "learning_rate": 1.5972332893553325e-05, + "loss": 0.0, + "step": 55067 + }, + { + "epoch": 5.138378277503032, + "grad_norm": NaN, + "learning_rate": 1.596893671437787e-05, + "loss": 0.0, + "step": 55068 + }, + { + "epoch": 5.13847158719791, + "grad_norm": NaN, + "learning_rate": 1.5965540876003776e-05, + "loss": 0.0, + "step": 55069 + }, + { + "epoch": 5.138564896892787, + "grad_norm": NaN, + "learning_rate": 1.5962145378439573e-05, + "loss": 0.0, + "step": 55070 + }, + { + "epoch": 5.138658206587665, + "grad_norm": NaN, + "learning_rate": 1.5958750221693893e-05, + "loss": 0.0, + "step": 55071 + }, + { + "epoch": 5.138751516282542, + "grad_norm": NaN, + "learning_rate": 1.5955355405775467e-05, + "loss": 0.0, + "step": 55072 + }, + { + "epoch": 5.138844825977419, + "grad_norm": NaN, + "learning_rate": 1.5951960930692833e-05, + "loss": 0.0, + "step": 55073 + }, + { + "epoch": 5.138938135672296, + "grad_norm": NaN, + "learning_rate": 1.594856679645463e-05, + "loss": 0.0, + "step": 55074 + }, + { + "epoch": 5.1390314453671735, + "grad_norm": NaN, + "learning_rate": 1.5945173003069577e-05, + "loss": 0.0, + "step": 55075 + }, + { + "epoch": 5.139124755062051, + "grad_norm": NaN, + "learning_rate": 1.5941779550546197e-05, + "loss": 0.0, + "step": 55076 + }, + { + "epoch": 5.139218064756928, + "grad_norm": NaN, + "learning_rate": 1.593838643889315e-05, + "loss": 0.0, + "step": 55077 + }, + { + "epoch": 5.139311374451806, + "grad_norm": NaN, + "learning_rate": 1.5934993668119107e-05, + "loss": 0.0, + "step": 55078 + }, + { + "epoch": 5.139404684146683, + "grad_norm": NaN, + "learning_rate": 1.593160123823265e-05, + "loss": 0.0, + "step": 55079 + }, + { + "epoch": 5.139497993841561, + "grad_norm": NaN, + "learning_rate": 1.592820914924242e-05, + "loss": 0.0, + "step": 55080 + }, + { + "epoch": 5.139591303536437, + "grad_norm": NaN, + "learning_rate": 1.592481740115703e-05, + "loss": 0.0, + "step": 55081 + }, + { + "epoch": 5.1396846132313145, + "grad_norm": NaN, + "learning_rate": 1.5921425993985105e-05, + "loss": 0.0, + "step": 55082 + }, + { + "epoch": 5.139777922926192, + "grad_norm": NaN, + "learning_rate": 1.59180349277353e-05, + "loss": 0.0, + "step": 55083 + }, + { + "epoch": 5.139871232621069, + "grad_norm": NaN, + "learning_rate": 1.59146442024162e-05, + "loss": 0.0, + "step": 55084 + }, + { + "epoch": 5.139964542315947, + "grad_norm": NaN, + "learning_rate": 1.5911253818036407e-05, + "loss": 0.0, + "step": 55085 + }, + { + "epoch": 5.140057852010824, + "grad_norm": NaN, + "learning_rate": 1.5907863774604633e-05, + "loss": 0.0, + "step": 55086 + }, + { + "epoch": 5.140151161705702, + "grad_norm": NaN, + "learning_rate": 1.5904474072129425e-05, + "loss": 0.0, + "step": 55087 + }, + { + "epoch": 5.140244471400578, + "grad_norm": NaN, + "learning_rate": 1.5901084710619388e-05, + "loss": 0.0, + "step": 55088 + }, + { + "epoch": 5.140337781095456, + "grad_norm": NaN, + "learning_rate": 1.589769569008322e-05, + "loss": 0.0, + "step": 55089 + }, + { + "epoch": 5.140431090790333, + "grad_norm": NaN, + "learning_rate": 1.5894307010529457e-05, + "loss": 0.0, + "step": 55090 + }, + { + "epoch": 5.14052440048521, + "grad_norm": NaN, + "learning_rate": 1.5890918671966714e-05, + "loss": 0.0, + "step": 55091 + }, + { + "epoch": 5.140617710180088, + "grad_norm": NaN, + "learning_rate": 1.5887530674403715e-05, + "loss": 0.0, + "step": 55092 + }, + { + "epoch": 5.140711019874965, + "grad_norm": NaN, + "learning_rate": 1.5884143017848956e-05, + "loss": 0.0, + "step": 55093 + }, + { + "epoch": 5.140804329569843, + "grad_norm": NaN, + "learning_rate": 1.588075570231106e-05, + "loss": 0.0, + "step": 55094 + }, + { + "epoch": 5.140897639264719, + "grad_norm": NaN, + "learning_rate": 1.587736872779874e-05, + "loss": 0.0, + "step": 55095 + }, + { + "epoch": 5.140990948959597, + "grad_norm": NaN, + "learning_rate": 1.58739820943205e-05, + "loss": 0.0, + "step": 55096 + }, + { + "epoch": 5.141084258654474, + "grad_norm": NaN, + "learning_rate": 1.5870595801884976e-05, + "loss": 0.0, + "step": 55097 + }, + { + "epoch": 5.1411775683493515, + "grad_norm": NaN, + "learning_rate": 1.5867209850500857e-05, + "loss": 0.0, + "step": 55098 + }, + { + "epoch": 5.141270878044229, + "grad_norm": NaN, + "learning_rate": 1.5863824240176647e-05, + "loss": 0.0, + "step": 55099 + }, + { + "epoch": 5.141364187739106, + "grad_norm": NaN, + "learning_rate": 1.586043897092098e-05, + "loss": 0.0, + "step": 55100 + }, + { + "epoch": 5.141457497433984, + "grad_norm": NaN, + "learning_rate": 1.5857054042742528e-05, + "loss": 0.0, + "step": 55101 + }, + { + "epoch": 5.14155080712886, + "grad_norm": NaN, + "learning_rate": 1.5853669455649836e-05, + "loss": 0.0, + "step": 55102 + }, + { + "epoch": 5.141644116823738, + "grad_norm": NaN, + "learning_rate": 1.585028520965148e-05, + "loss": 0.0, + "step": 55103 + }, + { + "epoch": 5.141737426518615, + "grad_norm": NaN, + "learning_rate": 1.5846901304756188e-05, + "loss": 0.0, + "step": 55104 + }, + { + "epoch": 5.1418307362134925, + "grad_norm": NaN, + "learning_rate": 1.584351774097245e-05, + "loss": 0.0, + "step": 55105 + }, + { + "epoch": 5.14192404590837, + "grad_norm": NaN, + "learning_rate": 1.5840134518308877e-05, + "loss": 0.0, + "step": 55106 + }, + { + "epoch": 5.142017355603247, + "grad_norm": NaN, + "learning_rate": 1.583675163677415e-05, + "loss": 0.0, + "step": 55107 + }, + { + "epoch": 5.142110665298125, + "grad_norm": NaN, + "learning_rate": 1.5833369096376803e-05, + "loss": 0.0, + "step": 55108 + }, + { + "epoch": 5.142203974993002, + "grad_norm": NaN, + "learning_rate": 1.582998689712542e-05, + "loss": 0.0, + "step": 55109 + }, + { + "epoch": 5.142297284687879, + "grad_norm": NaN, + "learning_rate": 1.5826605039028705e-05, + "loss": 0.0, + "step": 55110 + }, + { + "epoch": 5.142390594382756, + "grad_norm": NaN, + "learning_rate": 1.5823223522095158e-05, + "loss": 0.0, + "step": 55111 + }, + { + "epoch": 5.142483904077634, + "grad_norm": NaN, + "learning_rate": 1.5819842346333367e-05, + "loss": 0.0, + "step": 55112 + }, + { + "epoch": 5.142577213772511, + "grad_norm": NaN, + "learning_rate": 1.5816461511752028e-05, + "loss": 0.0, + "step": 55113 + }, + { + "epoch": 5.142670523467388, + "grad_norm": NaN, + "learning_rate": 1.5813081018359664e-05, + "loss": 0.0, + "step": 55114 + }, + { + "epoch": 5.142763833162266, + "grad_norm": NaN, + "learning_rate": 1.580970086616484e-05, + "loss": 0.0, + "step": 55115 + }, + { + "epoch": 5.142857142857143, + "grad_norm": NaN, + "learning_rate": 1.5806321055176274e-05, + "loss": 0.0, + "step": 55116 + }, + { + "epoch": 5.14295045255202, + "grad_norm": NaN, + "learning_rate": 1.5802941585402433e-05, + "loss": 0.0, + "step": 55117 + }, + { + "epoch": 5.143043762246897, + "grad_norm": NaN, + "learning_rate": 1.5799562456851943e-05, + "loss": 0.0, + "step": 55118 + }, + { + "epoch": 5.143137071941775, + "grad_norm": NaN, + "learning_rate": 1.5796183669533463e-05, + "loss": 0.0, + "step": 55119 + }, + { + "epoch": 5.143230381636652, + "grad_norm": NaN, + "learning_rate": 1.5792805223455483e-05, + "loss": 0.0, + "step": 55120 + }, + { + "epoch": 5.1433236913315294, + "grad_norm": NaN, + "learning_rate": 1.578942711862667e-05, + "loss": 0.0, + "step": 55121 + }, + { + "epoch": 5.143417001026407, + "grad_norm": NaN, + "learning_rate": 1.5786049355055624e-05, + "loss": 0.0, + "step": 55122 + }, + { + "epoch": 5.143510310721284, + "grad_norm": NaN, + "learning_rate": 1.578267193275083e-05, + "loss": 0.0, + "step": 55123 + }, + { + "epoch": 5.143603620416161, + "grad_norm": NaN, + "learning_rate": 1.5779294851721007e-05, + "loss": 0.0, + "step": 55124 + }, + { + "epoch": 5.143696930111038, + "grad_norm": NaN, + "learning_rate": 1.577591811197464e-05, + "loss": 0.0, + "step": 55125 + }, + { + "epoch": 5.143790239805916, + "grad_norm": NaN, + "learning_rate": 1.577254171352033e-05, + "loss": 0.0, + "step": 55126 + }, + { + "epoch": 5.143883549500793, + "grad_norm": NaN, + "learning_rate": 1.5769165656366745e-05, + "loss": 0.0, + "step": 55127 + }, + { + "epoch": 5.1439768591956705, + "grad_norm": NaN, + "learning_rate": 1.576578994052239e-05, + "loss": 0.0, + "step": 55128 + }, + { + "epoch": 5.144070168890548, + "grad_norm": NaN, + "learning_rate": 1.5762414565995823e-05, + "loss": 0.0, + "step": 55129 + }, + { + "epoch": 5.144163478585425, + "grad_norm": NaN, + "learning_rate": 1.5759039532795724e-05, + "loss": 0.0, + "step": 55130 + }, + { + "epoch": 5.144256788280303, + "grad_norm": NaN, + "learning_rate": 1.5755664840930604e-05, + "loss": 0.0, + "step": 55131 + }, + { + "epoch": 5.144350097975179, + "grad_norm": NaN, + "learning_rate": 1.5752290490409035e-05, + "loss": 0.0, + "step": 55132 + }, + { + "epoch": 5.144443407670057, + "grad_norm": NaN, + "learning_rate": 1.5748916481239683e-05, + "loss": 0.0, + "step": 55133 + }, + { + "epoch": 5.144536717364934, + "grad_norm": NaN, + "learning_rate": 1.5745542813431034e-05, + "loss": 0.0, + "step": 55134 + }, + { + "epoch": 5.1446300270598115, + "grad_norm": NaN, + "learning_rate": 1.5742169486991684e-05, + "loss": 0.0, + "step": 55135 + }, + { + "epoch": 5.144723336754689, + "grad_norm": NaN, + "learning_rate": 1.573879650193028e-05, + "loss": 0.0, + "step": 55136 + }, + { + "epoch": 5.144816646449566, + "grad_norm": NaN, + "learning_rate": 1.573542385825531e-05, + "loss": 0.0, + "step": 55137 + }, + { + "epoch": 5.144909956144444, + "grad_norm": NaN, + "learning_rate": 1.573205155597535e-05, + "loss": 0.0, + "step": 55138 + }, + { + "epoch": 5.14500326583932, + "grad_norm": NaN, + "learning_rate": 1.572867959509908e-05, + "loss": 0.0, + "step": 55139 + }, + { + "epoch": 5.145096575534198, + "grad_norm": NaN, + "learning_rate": 1.572530797563496e-05, + "loss": 0.0, + "step": 55140 + }, + { + "epoch": 5.145189885229075, + "grad_norm": NaN, + "learning_rate": 1.5721936697591596e-05, + "loss": 0.0, + "step": 55141 + }, + { + "epoch": 5.145283194923953, + "grad_norm": NaN, + "learning_rate": 1.571856576097762e-05, + "loss": 0.0, + "step": 55142 + }, + { + "epoch": 5.14537650461883, + "grad_norm": NaN, + "learning_rate": 1.571519516580152e-05, + "loss": 0.0, + "step": 55143 + }, + { + "epoch": 5.145469814313707, + "grad_norm": NaN, + "learning_rate": 1.571182491207189e-05, + "loss": 0.0, + "step": 55144 + }, + { + "epoch": 5.145563124008585, + "grad_norm": NaN, + "learning_rate": 1.5708454999797343e-05, + "loss": 0.0, + "step": 55145 + }, + { + "epoch": 5.145656433703461, + "grad_norm": NaN, + "learning_rate": 1.57050854289864e-05, + "loss": 0.0, + "step": 55146 + }, + { + "epoch": 5.145749743398339, + "grad_norm": NaN, + "learning_rate": 1.5701716199647613e-05, + "loss": 0.0, + "step": 55147 + }, + { + "epoch": 5.145843053093216, + "grad_norm": NaN, + "learning_rate": 1.569834731178965e-05, + "loss": 0.0, + "step": 55148 + }, + { + "epoch": 5.145936362788094, + "grad_norm": NaN, + "learning_rate": 1.5694978765420958e-05, + "loss": 0.0, + "step": 55149 + }, + { + "epoch": 5.146029672482971, + "grad_norm": NaN, + "learning_rate": 1.5691610560550126e-05, + "loss": 0.0, + "step": 55150 + }, + { + "epoch": 5.1461229821778485, + "grad_norm": NaN, + "learning_rate": 1.5688242697185804e-05, + "loss": 0.0, + "step": 55151 + }, + { + "epoch": 5.146216291872726, + "grad_norm": NaN, + "learning_rate": 1.5684875175336465e-05, + "loss": 0.0, + "step": 55152 + }, + { + "epoch": 5.146309601567603, + "grad_norm": NaN, + "learning_rate": 1.5681507995010685e-05, + "loss": 0.0, + "step": 55153 + }, + { + "epoch": 5.14640291126248, + "grad_norm": NaN, + "learning_rate": 1.5678141156217087e-05, + "loss": 0.0, + "step": 55154 + }, + { + "epoch": 5.146496220957357, + "grad_norm": NaN, + "learning_rate": 1.5674774658964155e-05, + "loss": 0.0, + "step": 55155 + }, + { + "epoch": 5.146589530652235, + "grad_norm": NaN, + "learning_rate": 1.5671408503260464e-05, + "loss": 0.0, + "step": 55156 + }, + { + "epoch": 5.146682840347112, + "grad_norm": NaN, + "learning_rate": 1.5668042689114637e-05, + "loss": 0.0, + "step": 55157 + }, + { + "epoch": 5.1467761500419895, + "grad_norm": NaN, + "learning_rate": 1.5664677216535133e-05, + "loss": 0.0, + "step": 55158 + }, + { + "epoch": 5.146869459736867, + "grad_norm": NaN, + "learning_rate": 1.566131208553058e-05, + "loss": 0.0, + "step": 55159 + }, + { + "epoch": 5.146962769431744, + "grad_norm": NaN, + "learning_rate": 1.565794729610956e-05, + "loss": 0.0, + "step": 55160 + }, + { + "epoch": 5.147056079126621, + "grad_norm": NaN, + "learning_rate": 1.5654582848280523e-05, + "loss": 0.0, + "step": 55161 + }, + { + "epoch": 5.147149388821498, + "grad_norm": NaN, + "learning_rate": 1.5651218742052117e-05, + "loss": 0.0, + "step": 55162 + }, + { + "epoch": 5.147242698516376, + "grad_norm": NaN, + "learning_rate": 1.5647854977432887e-05, + "loss": 0.0, + "step": 55163 + }, + { + "epoch": 5.147336008211253, + "grad_norm": NaN, + "learning_rate": 1.564449155443131e-05, + "loss": 0.0, + "step": 55164 + }, + { + "epoch": 5.147429317906131, + "grad_norm": NaN, + "learning_rate": 1.5641128473056007e-05, + "loss": 0.0, + "step": 55165 + }, + { + "epoch": 5.147522627601008, + "grad_norm": NaN, + "learning_rate": 1.563776573331556e-05, + "loss": 0.0, + "step": 55166 + }, + { + "epoch": 5.147615937295885, + "grad_norm": NaN, + "learning_rate": 1.563440333521841e-05, + "loss": 0.0, + "step": 55167 + }, + { + "epoch": 5.147709246990762, + "grad_norm": NaN, + "learning_rate": 1.5631041278773233e-05, + "loss": 0.0, + "step": 55168 + }, + { + "epoch": 5.147802556685639, + "grad_norm": NaN, + "learning_rate": 1.5627679563988487e-05, + "loss": 0.0, + "step": 55169 + }, + { + "epoch": 5.147895866380517, + "grad_norm": NaN, + "learning_rate": 1.562431819087272e-05, + "loss": 0.0, + "step": 55170 + }, + { + "epoch": 5.147989176075394, + "grad_norm": NaN, + "learning_rate": 1.5620957159434556e-05, + "loss": 0.0, + "step": 55171 + }, + { + "epoch": 5.148082485770272, + "grad_norm": NaN, + "learning_rate": 1.561759646968247e-05, + "loss": 0.0, + "step": 55172 + }, + { + "epoch": 5.148175795465149, + "grad_norm": NaN, + "learning_rate": 1.561423612162499e-05, + "loss": 0.0, + "step": 55173 + }, + { + "epoch": 5.1482691051600264, + "grad_norm": NaN, + "learning_rate": 1.561087611527076e-05, + "loss": 0.0, + "step": 55174 + }, + { + "epoch": 5.148362414854903, + "grad_norm": NaN, + "learning_rate": 1.560751645062824e-05, + "loss": 0.0, + "step": 55175 + }, + { + "epoch": 5.14845572454978, + "grad_norm": NaN, + "learning_rate": 1.560415712770597e-05, + "loss": 0.0, + "step": 55176 + }, + { + "epoch": 5.148549034244658, + "grad_norm": NaN, + "learning_rate": 1.560079814651258e-05, + "loss": 0.0, + "step": 55177 + }, + { + "epoch": 5.148642343939535, + "grad_norm": NaN, + "learning_rate": 1.559743950705649e-05, + "loss": 0.0, + "step": 55178 + }, + { + "epoch": 5.148735653634413, + "grad_norm": NaN, + "learning_rate": 1.5594081209346305e-05, + "loss": 0.0, + "step": 55179 + }, + { + "epoch": 5.14882896332929, + "grad_norm": NaN, + "learning_rate": 1.559072325339059e-05, + "loss": 0.0, + "step": 55180 + }, + { + "epoch": 5.1489222730241675, + "grad_norm": NaN, + "learning_rate": 1.558736563919783e-05, + "loss": 0.0, + "step": 55181 + }, + { + "epoch": 5.149015582719045, + "grad_norm": NaN, + "learning_rate": 1.5584008366776563e-05, + "loss": 0.0, + "step": 55182 + }, + { + "epoch": 5.1491088924139214, + "grad_norm": NaN, + "learning_rate": 1.55806514361354e-05, + "loss": 0.0, + "step": 55183 + }, + { + "epoch": 5.149202202108799, + "grad_norm": NaN, + "learning_rate": 1.5577294847282786e-05, + "loss": 0.0, + "step": 55184 + }, + { + "epoch": 5.149295511803676, + "grad_norm": NaN, + "learning_rate": 1.557393860022726e-05, + "loss": 0.0, + "step": 55185 + }, + { + "epoch": 5.149388821498554, + "grad_norm": NaN, + "learning_rate": 1.5570582694977452e-05, + "loss": 0.0, + "step": 55186 + }, + { + "epoch": 5.149482131193431, + "grad_norm": NaN, + "learning_rate": 1.5567227131541804e-05, + "loss": 0.0, + "step": 55187 + }, + { + "epoch": 5.1495754408883085, + "grad_norm": NaN, + "learning_rate": 1.5563871909928843e-05, + "loss": 0.0, + "step": 55188 + }, + { + "epoch": 5.149668750583186, + "grad_norm": NaN, + "learning_rate": 1.5560517030147195e-05, + "loss": 0.0, + "step": 55189 + }, + { + "epoch": 5.1497620602780625, + "grad_norm": NaN, + "learning_rate": 1.5557162492205306e-05, + "loss": 0.0, + "step": 55190 + }, + { + "epoch": 5.14985536997294, + "grad_norm": NaN, + "learning_rate": 1.5553808296111682e-05, + "loss": 0.0, + "step": 55191 + }, + { + "epoch": 5.149948679667817, + "grad_norm": NaN, + "learning_rate": 1.5550454441874965e-05, + "loss": 0.0, + "step": 55192 + }, + { + "epoch": 5.150041989362695, + "grad_norm": NaN, + "learning_rate": 1.5547100929503587e-05, + "loss": 0.0, + "step": 55193 + }, + { + "epoch": 5.150135299057572, + "grad_norm": NaN, + "learning_rate": 1.5543747759006085e-05, + "loss": 0.0, + "step": 55194 + }, + { + "epoch": 5.15022860875245, + "grad_norm": NaN, + "learning_rate": 1.5540394930391043e-05, + "loss": 0.0, + "step": 55195 + }, + { + "epoch": 5.150321918447327, + "grad_norm": NaN, + "learning_rate": 1.5537042443666898e-05, + "loss": 0.0, + "step": 55196 + }, + { + "epoch": 5.150415228142204, + "grad_norm": NaN, + "learning_rate": 1.5533690298842246e-05, + "loss": 0.0, + "step": 55197 + }, + { + "epoch": 5.150508537837081, + "grad_norm": NaN, + "learning_rate": 1.5530338495925627e-05, + "loss": 0.0, + "step": 55198 + }, + { + "epoch": 5.150601847531958, + "grad_norm": NaN, + "learning_rate": 1.5526987034925452e-05, + "loss": 0.0, + "step": 55199 + }, + { + "epoch": 5.150695157226836, + "grad_norm": NaN, + "learning_rate": 1.5523635915850353e-05, + "loss": 0.0, + "step": 55200 + }, + { + "epoch": 5.150788466921713, + "grad_norm": NaN, + "learning_rate": 1.552028513870885e-05, + "loss": 0.0, + "step": 55201 + }, + { + "epoch": 5.150881776616591, + "grad_norm": NaN, + "learning_rate": 1.5516934703509354e-05, + "loss": 0.0, + "step": 55202 + }, + { + "epoch": 5.150975086311468, + "grad_norm": NaN, + "learning_rate": 1.5513584610260492e-05, + "loss": 0.0, + "step": 55203 + }, + { + "epoch": 5.1510683960063455, + "grad_norm": NaN, + "learning_rate": 1.5510234858970778e-05, + "loss": 0.0, + "step": 55204 + }, + { + "epoch": 5.151161705701222, + "grad_norm": NaN, + "learning_rate": 1.550688544964863e-05, + "loss": 0.0, + "step": 55205 + }, + { + "epoch": 5.151255015396099, + "grad_norm": NaN, + "learning_rate": 1.5503536382302666e-05, + "loss": 0.0, + "step": 55206 + }, + { + "epoch": 5.151348325090977, + "grad_norm": NaN, + "learning_rate": 1.550018765694141e-05, + "loss": 0.0, + "step": 55207 + }, + { + "epoch": 5.151441634785854, + "grad_norm": NaN, + "learning_rate": 1.549683927357325e-05, + "loss": 0.0, + "step": 55208 + }, + { + "epoch": 5.151534944480732, + "grad_norm": NaN, + "learning_rate": 1.549349123220684e-05, + "loss": 0.0, + "step": 55209 + }, + { + "epoch": 5.151628254175609, + "grad_norm": NaN, + "learning_rate": 1.5490143532850646e-05, + "loss": 0.0, + "step": 55210 + }, + { + "epoch": 5.1517215638704865, + "grad_norm": NaN, + "learning_rate": 1.5486796175513132e-05, + "loss": 0.0, + "step": 55211 + }, + { + "epoch": 5.151814873565363, + "grad_norm": NaN, + "learning_rate": 1.5483449160202877e-05, + "loss": 0.0, + "step": 55212 + }, + { + "epoch": 5.1519081832602405, + "grad_norm": NaN, + "learning_rate": 1.5480102486928386e-05, + "loss": 0.0, + "step": 55213 + }, + { + "epoch": 5.152001492955118, + "grad_norm": NaN, + "learning_rate": 1.547675615569809e-05, + "loss": 0.0, + "step": 55214 + }, + { + "epoch": 5.152094802649995, + "grad_norm": NaN, + "learning_rate": 1.547341016652061e-05, + "loss": 0.0, + "step": 55215 + }, + { + "epoch": 5.152188112344873, + "grad_norm": NaN, + "learning_rate": 1.5470064519404362e-05, + "loss": 0.0, + "step": 55216 + }, + { + "epoch": 5.15228142203975, + "grad_norm": NaN, + "learning_rate": 1.546671921435787e-05, + "loss": 0.0, + "step": 55217 + }, + { + "epoch": 5.152374731734628, + "grad_norm": NaN, + "learning_rate": 1.5463374251389727e-05, + "loss": 0.0, + "step": 55218 + }, + { + "epoch": 5.152468041429504, + "grad_norm": NaN, + "learning_rate": 1.5460029630508324e-05, + "loss": 0.0, + "step": 55219 + }, + { + "epoch": 5.1525613511243815, + "grad_norm": NaN, + "learning_rate": 1.545668535172219e-05, + "loss": 0.0, + "step": 55220 + }, + { + "epoch": 5.152654660819259, + "grad_norm": NaN, + "learning_rate": 1.5453341415039922e-05, + "loss": 0.0, + "step": 55221 + }, + { + "epoch": 5.152747970514136, + "grad_norm": NaN, + "learning_rate": 1.5449997820469907e-05, + "loss": 0.0, + "step": 55222 + }, + { + "epoch": 5.152841280209014, + "grad_norm": NaN, + "learning_rate": 1.5446654568020657e-05, + "loss": 0.0, + "step": 55223 + }, + { + "epoch": 5.152934589903891, + "grad_norm": NaN, + "learning_rate": 1.544331165770078e-05, + "loss": 0.0, + "step": 55224 + }, + { + "epoch": 5.153027899598769, + "grad_norm": NaN, + "learning_rate": 1.5439969089518656e-05, + "loss": 0.0, + "step": 55225 + }, + { + "epoch": 5.153121209293646, + "grad_norm": NaN, + "learning_rate": 1.5436626863482827e-05, + "loss": 0.0, + "step": 55226 + }, + { + "epoch": 5.153214518988523, + "grad_norm": NaN, + "learning_rate": 1.5433284979601833e-05, + "loss": 0.0, + "step": 55227 + }, + { + "epoch": 5.1533078286834, + "grad_norm": NaN, + "learning_rate": 1.5429943437884108e-05, + "loss": 0.0, + "step": 55228 + }, + { + "epoch": 5.153401138378277, + "grad_norm": NaN, + "learning_rate": 1.5426602238338152e-05, + "loss": 0.0, + "step": 55229 + }, + { + "epoch": 5.153494448073155, + "grad_norm": NaN, + "learning_rate": 1.5423261380972535e-05, + "loss": 0.0, + "step": 55230 + }, + { + "epoch": 5.153587757768032, + "grad_norm": NaN, + "learning_rate": 1.5419920865795658e-05, + "loss": 0.0, + "step": 55231 + }, + { + "epoch": 5.15368106746291, + "grad_norm": NaN, + "learning_rate": 1.5416580692816057e-05, + "loss": 0.0, + "step": 55232 + }, + { + "epoch": 5.153774377157787, + "grad_norm": NaN, + "learning_rate": 1.5413240862042266e-05, + "loss": 0.0, + "step": 55233 + }, + { + "epoch": 5.153867686852664, + "grad_norm": NaN, + "learning_rate": 1.540990137348268e-05, + "loss": 0.0, + "step": 55234 + }, + { + "epoch": 5.153960996547541, + "grad_norm": NaN, + "learning_rate": 1.5406562227145863e-05, + "loss": 0.0, + "step": 55235 + }, + { + "epoch": 5.1540543062424184, + "grad_norm": NaN, + "learning_rate": 1.540322342304034e-05, + "loss": 0.0, + "step": 55236 + }, + { + "epoch": 5.154147615937296, + "grad_norm": NaN, + "learning_rate": 1.539988496117447e-05, + "loss": 0.0, + "step": 55237 + }, + { + "epoch": 5.154240925632173, + "grad_norm": NaN, + "learning_rate": 1.5396546841556862e-05, + "loss": 0.0, + "step": 55238 + }, + { + "epoch": 5.154334235327051, + "grad_norm": NaN, + "learning_rate": 1.5393209064195995e-05, + "loss": 0.0, + "step": 55239 + }, + { + "epoch": 5.154427545021928, + "grad_norm": NaN, + "learning_rate": 1.5389871629100257e-05, + "loss": 0.0, + "step": 55240 + }, + { + "epoch": 5.154520854716805, + "grad_norm": NaN, + "learning_rate": 1.5386534536278235e-05, + "loss": 0.0, + "step": 55241 + }, + { + "epoch": 5.154614164411682, + "grad_norm": NaN, + "learning_rate": 1.538319778573841e-05, + "loss": 0.0, + "step": 55242 + }, + { + "epoch": 5.1547074741065595, + "grad_norm": NaN, + "learning_rate": 1.5379861377489183e-05, + "loss": 0.0, + "step": 55243 + }, + { + "epoch": 5.154800783801437, + "grad_norm": NaN, + "learning_rate": 1.537652531153911e-05, + "loss": 0.0, + "step": 55244 + }, + { + "epoch": 5.154894093496314, + "grad_norm": NaN, + "learning_rate": 1.537318958789669e-05, + "loss": 0.0, + "step": 55245 + }, + { + "epoch": 5.154987403191192, + "grad_norm": NaN, + "learning_rate": 1.536985420657031e-05, + "loss": 0.0, + "step": 55246 + }, + { + "epoch": 5.155080712886069, + "grad_norm": NaN, + "learning_rate": 1.536651916756853e-05, + "loss": 0.0, + "step": 55247 + }, + { + "epoch": 5.155174022580946, + "grad_norm": NaN, + "learning_rate": 1.5363184470899847e-05, + "loss": 0.0, + "step": 55248 + }, + { + "epoch": 5.155267332275823, + "grad_norm": NaN, + "learning_rate": 1.5359850116572654e-05, + "loss": 0.0, + "step": 55249 + }, + { + "epoch": 5.1553606419707005, + "grad_norm": NaN, + "learning_rate": 1.5356516104595506e-05, + "loss": 0.0, + "step": 55250 + }, + { + "epoch": 5.155453951665578, + "grad_norm": NaN, + "learning_rate": 1.5353182434976868e-05, + "loss": 0.0, + "step": 55251 + }, + { + "epoch": 5.155547261360455, + "grad_norm": NaN, + "learning_rate": 1.534984910772515e-05, + "loss": 0.0, + "step": 55252 + }, + { + "epoch": 5.155640571055333, + "grad_norm": NaN, + "learning_rate": 1.5346516122848922e-05, + "loss": 0.0, + "step": 55253 + }, + { + "epoch": 5.15573388075021, + "grad_norm": NaN, + "learning_rate": 1.5343183480356636e-05, + "loss": 0.0, + "step": 55254 + }, + { + "epoch": 5.155827190445088, + "grad_norm": NaN, + "learning_rate": 1.533985118025668e-05, + "loss": 0.0, + "step": 55255 + }, + { + "epoch": 5.155920500139964, + "grad_norm": NaN, + "learning_rate": 1.533651922255763e-05, + "loss": 0.0, + "step": 55256 + }, + { + "epoch": 5.156013809834842, + "grad_norm": NaN, + "learning_rate": 1.533318760726795e-05, + "loss": 0.0, + "step": 55257 + }, + { + "epoch": 5.156107119529719, + "grad_norm": NaN, + "learning_rate": 1.5329856334396014e-05, + "loss": 0.0, + "step": 55258 + }, + { + "epoch": 5.156200429224596, + "grad_norm": NaN, + "learning_rate": 1.532652540395043e-05, + "loss": 0.0, + "step": 55259 + }, + { + "epoch": 5.156293738919474, + "grad_norm": NaN, + "learning_rate": 1.5323194815939577e-05, + "loss": 0.0, + "step": 55260 + }, + { + "epoch": 5.156387048614351, + "grad_norm": NaN, + "learning_rate": 1.53198645703719e-05, + "loss": 0.0, + "step": 55261 + }, + { + "epoch": 5.156480358309229, + "grad_norm": NaN, + "learning_rate": 1.5316534667255986e-05, + "loss": 0.0, + "step": 55262 + }, + { + "epoch": 5.156573668004105, + "grad_norm": NaN, + "learning_rate": 1.5313205106600185e-05, + "loss": 0.0, + "step": 55263 + }, + { + "epoch": 5.156666977698983, + "grad_norm": NaN, + "learning_rate": 1.5309875888412986e-05, + "loss": 0.0, + "step": 55264 + }, + { + "epoch": 5.15676028739386, + "grad_norm": NaN, + "learning_rate": 1.5306547012702918e-05, + "loss": 0.0, + "step": 55265 + }, + { + "epoch": 5.1568535970887375, + "grad_norm": NaN, + "learning_rate": 1.530321847947839e-05, + "loss": 0.0, + "step": 55266 + }, + { + "epoch": 5.156946906783615, + "grad_norm": NaN, + "learning_rate": 1.5299890288747845e-05, + "loss": 0.0, + "step": 55267 + }, + { + "epoch": 5.157040216478492, + "grad_norm": NaN, + "learning_rate": 1.5296562440519828e-05, + "loss": 0.0, + "step": 55268 + }, + { + "epoch": 5.15713352617337, + "grad_norm": NaN, + "learning_rate": 1.529323493480273e-05, + "loss": 0.0, + "step": 55269 + }, + { + "epoch": 5.157226835868247, + "grad_norm": NaN, + "learning_rate": 1.528990777160501e-05, + "loss": 0.0, + "step": 55270 + }, + { + "epoch": 5.157320145563124, + "grad_norm": NaN, + "learning_rate": 1.5286580950935195e-05, + "loss": 0.0, + "step": 55271 + }, + { + "epoch": 5.157413455258001, + "grad_norm": NaN, + "learning_rate": 1.528325447280165e-05, + "loss": 0.0, + "step": 55272 + }, + { + "epoch": 5.1575067649528785, + "grad_norm": NaN, + "learning_rate": 1.5279928337212892e-05, + "loss": 0.0, + "step": 55273 + }, + { + "epoch": 5.157600074647756, + "grad_norm": NaN, + "learning_rate": 1.5276602544177426e-05, + "loss": 0.0, + "step": 55274 + }, + { + "epoch": 5.157693384342633, + "grad_norm": NaN, + "learning_rate": 1.5273277093703584e-05, + "loss": 0.0, + "step": 55275 + }, + { + "epoch": 5.157786694037511, + "grad_norm": NaN, + "learning_rate": 1.5269951985799916e-05, + "loss": 0.0, + "step": 55276 + }, + { + "epoch": 5.157880003732388, + "grad_norm": NaN, + "learning_rate": 1.5266627220474864e-05, + "loss": 0.0, + "step": 55277 + }, + { + "epoch": 5.157973313427265, + "grad_norm": NaN, + "learning_rate": 1.5263302797736838e-05, + "loss": 0.0, + "step": 55278 + }, + { + "epoch": 5.158066623122142, + "grad_norm": NaN, + "learning_rate": 1.525997871759433e-05, + "loss": 0.0, + "step": 55279 + }, + { + "epoch": 5.15815993281702, + "grad_norm": NaN, + "learning_rate": 1.5256654980055816e-05, + "loss": 0.0, + "step": 55280 + }, + { + "epoch": 5.158253242511897, + "grad_norm": NaN, + "learning_rate": 1.5253331585129659e-05, + "loss": 0.0, + "step": 55281 + }, + { + "epoch": 5.158346552206774, + "grad_norm": NaN, + "learning_rate": 1.5250008532824398e-05, + "loss": 0.0, + "step": 55282 + }, + { + "epoch": 5.158439861901652, + "grad_norm": NaN, + "learning_rate": 1.524668582314848e-05, + "loss": 0.0, + "step": 55283 + }, + { + "epoch": 5.158533171596529, + "grad_norm": NaN, + "learning_rate": 1.5243363456110263e-05, + "loss": 0.0, + "step": 55284 + }, + { + "epoch": 5.158626481291406, + "grad_norm": NaN, + "learning_rate": 1.5240041431718275e-05, + "loss": 0.0, + "step": 55285 + }, + { + "epoch": 5.158719790986283, + "grad_norm": NaN, + "learning_rate": 1.5236719749980974e-05, + "loss": 0.0, + "step": 55286 + }, + { + "epoch": 5.158813100681161, + "grad_norm": NaN, + "learning_rate": 1.5233398410906721e-05, + "loss": 0.0, + "step": 55287 + }, + { + "epoch": 5.158906410376038, + "grad_norm": NaN, + "learning_rate": 1.5230077414504044e-05, + "loss": 0.0, + "step": 55288 + }, + { + "epoch": 5.1589997200709155, + "grad_norm": NaN, + "learning_rate": 1.5226756760781384e-05, + "loss": 0.0, + "step": 55289 + }, + { + "epoch": 5.159093029765793, + "grad_norm": NaN, + "learning_rate": 1.5223436449747101e-05, + "loss": 0.0, + "step": 55290 + }, + { + "epoch": 5.15918633946067, + "grad_norm": NaN, + "learning_rate": 1.5220116481409738e-05, + "loss": 0.0, + "step": 55291 + }, + { + "epoch": 5.159279649155547, + "grad_norm": NaN, + "learning_rate": 1.5216796855777707e-05, + "loss": 0.0, + "step": 55292 + }, + { + "epoch": 5.159372958850424, + "grad_norm": NaN, + "learning_rate": 1.5213477572859383e-05, + "loss": 0.0, + "step": 55293 + }, + { + "epoch": 5.159466268545302, + "grad_norm": NaN, + "learning_rate": 1.5210158632663293e-05, + "loss": 0.0, + "step": 55294 + }, + { + "epoch": 5.159559578240179, + "grad_norm": NaN, + "learning_rate": 1.5206840035197866e-05, + "loss": 0.0, + "step": 55295 + }, + { + "epoch": 5.1596528879350565, + "grad_norm": NaN, + "learning_rate": 1.5203521780471455e-05, + "loss": 0.0, + "step": 55296 + }, + { + "epoch": 5.159746197629934, + "grad_norm": NaN, + "learning_rate": 1.5200203868492594e-05, + "loss": 0.0, + "step": 55297 + }, + { + "epoch": 5.159839507324811, + "grad_norm": NaN, + "learning_rate": 1.5196886299269707e-05, + "loss": 0.0, + "step": 55298 + }, + { + "epoch": 5.159932817019689, + "grad_norm": NaN, + "learning_rate": 1.5193569072811152e-05, + "loss": 0.0, + "step": 55299 + }, + { + "epoch": 5.160026126714565, + "grad_norm": NaN, + "learning_rate": 1.519025218912544e-05, + "loss": 0.0, + "step": 55300 + }, + { + "epoch": 5.160119436409443, + "grad_norm": NaN, + "learning_rate": 1.5186935648221015e-05, + "loss": 0.0, + "step": 55301 + }, + { + "epoch": 5.16021274610432, + "grad_norm": NaN, + "learning_rate": 1.5183619450106222e-05, + "loss": 0.0, + "step": 55302 + }, + { + "epoch": 5.1603060557991975, + "grad_norm": NaN, + "learning_rate": 1.5180303594789616e-05, + "loss": 0.0, + "step": 55303 + }, + { + "epoch": 5.160399365494075, + "grad_norm": NaN, + "learning_rate": 1.5176988082279513e-05, + "loss": 0.0, + "step": 55304 + }, + { + "epoch": 5.160492675188952, + "grad_norm": NaN, + "learning_rate": 1.5173672912584383e-05, + "loss": 0.0, + "step": 55305 + }, + { + "epoch": 5.16058598488383, + "grad_norm": NaN, + "learning_rate": 1.5170358085712709e-05, + "loss": 0.0, + "step": 55306 + }, + { + "epoch": 5.160679294578706, + "grad_norm": NaN, + "learning_rate": 1.5167043601672829e-05, + "loss": 0.0, + "step": 55307 + }, + { + "epoch": 5.160772604273584, + "grad_norm": NaN, + "learning_rate": 1.5163729460473239e-05, + "loss": 0.0, + "step": 55308 + }, + { + "epoch": 5.160865913968461, + "grad_norm": NaN, + "learning_rate": 1.5160415662122366e-05, + "loss": 0.0, + "step": 55309 + }, + { + "epoch": 5.160959223663339, + "grad_norm": NaN, + "learning_rate": 1.5157102206628553e-05, + "loss": 0.0, + "step": 55310 + }, + { + "epoch": 5.161052533358216, + "grad_norm": NaN, + "learning_rate": 1.5153789094000324e-05, + "loss": 0.0, + "step": 55311 + }, + { + "epoch": 5.161145843053093, + "grad_norm": NaN, + "learning_rate": 1.5150476324246092e-05, + "loss": 0.0, + "step": 55312 + }, + { + "epoch": 5.161239152747971, + "grad_norm": NaN, + "learning_rate": 1.5147163897374199e-05, + "loss": 0.0, + "step": 55313 + }, + { + "epoch": 5.161332462442848, + "grad_norm": NaN, + "learning_rate": 1.5143851813393154e-05, + "loss": 0.0, + "step": 55314 + }, + { + "epoch": 5.161425772137725, + "grad_norm": NaN, + "learning_rate": 1.5140540072311368e-05, + "loss": 0.0, + "step": 55315 + }, + { + "epoch": 5.161519081832602, + "grad_norm": NaN, + "learning_rate": 1.5137228674137186e-05, + "loss": 0.0, + "step": 55316 + }, + { + "epoch": 5.16161239152748, + "grad_norm": NaN, + "learning_rate": 1.51339176188791e-05, + "loss": 0.0, + "step": 55317 + }, + { + "epoch": 5.161705701222357, + "grad_norm": NaN, + "learning_rate": 1.513060690654555e-05, + "loss": 0.0, + "step": 55318 + }, + { + "epoch": 5.1617990109172345, + "grad_norm": NaN, + "learning_rate": 1.5127296537144867e-05, + "loss": 0.0, + "step": 55319 + }, + { + "epoch": 5.161892320612112, + "grad_norm": NaN, + "learning_rate": 1.5123986510685527e-05, + "loss": 0.0, + "step": 55320 + }, + { + "epoch": 5.161985630306989, + "grad_norm": NaN, + "learning_rate": 1.5120676827175971e-05, + "loss": 0.0, + "step": 55321 + }, + { + "epoch": 5.162078940001866, + "grad_norm": NaN, + "learning_rate": 1.5117367486624526e-05, + "loss": 0.0, + "step": 55322 + }, + { + "epoch": 5.162172249696743, + "grad_norm": NaN, + "learning_rate": 1.5114058489039688e-05, + "loss": 0.0, + "step": 55323 + }, + { + "epoch": 5.162265559391621, + "grad_norm": NaN, + "learning_rate": 1.5110749834429864e-05, + "loss": 0.0, + "step": 55324 + }, + { + "epoch": 5.162358869086498, + "grad_norm": NaN, + "learning_rate": 1.5107441522803398e-05, + "loss": 0.0, + "step": 55325 + }, + { + "epoch": 5.1624521787813755, + "grad_norm": NaN, + "learning_rate": 1.5104133554168785e-05, + "loss": 0.0, + "step": 55326 + }, + { + "epoch": 5.162545488476253, + "grad_norm": NaN, + "learning_rate": 1.5100825928534416e-05, + "loss": 0.0, + "step": 55327 + }, + { + "epoch": 5.16263879817113, + "grad_norm": NaN, + "learning_rate": 1.5097518645908635e-05, + "loss": 0.0, + "step": 55328 + }, + { + "epoch": 5.162732107866007, + "grad_norm": NaN, + "learning_rate": 1.5094211706299936e-05, + "loss": 0.0, + "step": 55329 + }, + { + "epoch": 5.162825417560884, + "grad_norm": NaN, + "learning_rate": 1.5090905109716711e-05, + "loss": 0.0, + "step": 55330 + }, + { + "epoch": 5.162918727255762, + "grad_norm": NaN, + "learning_rate": 1.5087598856167304e-05, + "loss": 0.0, + "step": 55331 + }, + { + "epoch": 5.163012036950639, + "grad_norm": NaN, + "learning_rate": 1.5084292945660192e-05, + "loss": 0.0, + "step": 55332 + }, + { + "epoch": 5.163105346645517, + "grad_norm": NaN, + "learning_rate": 1.5080987378203802e-05, + "loss": 0.0, + "step": 55333 + }, + { + "epoch": 5.163198656340394, + "grad_norm": NaN, + "learning_rate": 1.5077682153806425e-05, + "loss": 0.0, + "step": 55334 + }, + { + "epoch": 5.163291966035271, + "grad_norm": NaN, + "learning_rate": 1.5074377272476557e-05, + "loss": 0.0, + "step": 55335 + }, + { + "epoch": 5.163385275730148, + "grad_norm": NaN, + "learning_rate": 1.5071072734222623e-05, + "loss": 0.0, + "step": 55336 + }, + { + "epoch": 5.163478585425025, + "grad_norm": NaN, + "learning_rate": 1.5067768539052915e-05, + "loss": 0.0, + "step": 55337 + }, + { + "epoch": 5.163571895119903, + "grad_norm": NaN, + "learning_rate": 1.506446468697593e-05, + "loss": 0.0, + "step": 55338 + }, + { + "epoch": 5.16366520481478, + "grad_norm": NaN, + "learning_rate": 1.5061161178000075e-05, + "loss": 0.0, + "step": 55339 + }, + { + "epoch": 5.163758514509658, + "grad_norm": NaN, + "learning_rate": 1.5057858012133644e-05, + "loss": 0.0, + "step": 55340 + }, + { + "epoch": 5.163851824204535, + "grad_norm": NaN, + "learning_rate": 1.5054555189385147e-05, + "loss": 0.0, + "step": 55341 + }, + { + "epoch": 5.1639451338994125, + "grad_norm": NaN, + "learning_rate": 1.5051252709762979e-05, + "loss": 0.0, + "step": 55342 + }, + { + "epoch": 5.16403844359429, + "grad_norm": NaN, + "learning_rate": 1.504795057327543e-05, + "loss": 0.0, + "step": 55343 + }, + { + "epoch": 5.164131753289166, + "grad_norm": NaN, + "learning_rate": 1.5044648779931012e-05, + "loss": 0.0, + "step": 55344 + }, + { + "epoch": 5.164225062984044, + "grad_norm": NaN, + "learning_rate": 1.5041347329738068e-05, + "loss": 0.0, + "step": 55345 + }, + { + "epoch": 5.164318372678921, + "grad_norm": NaN, + "learning_rate": 1.503804622270499e-05, + "loss": 0.0, + "step": 55346 + }, + { + "epoch": 5.164411682373799, + "grad_norm": NaN, + "learning_rate": 1.5034745458840192e-05, + "loss": 0.0, + "step": 55347 + }, + { + "epoch": 5.164504992068676, + "grad_norm": NaN, + "learning_rate": 1.5031445038152046e-05, + "loss": 0.0, + "step": 55348 + }, + { + "epoch": 5.1645983017635535, + "grad_norm": NaN, + "learning_rate": 1.5028144960648964e-05, + "loss": 0.0, + "step": 55349 + }, + { + "epoch": 5.164691611458431, + "grad_norm": NaN, + "learning_rate": 1.5024845226339355e-05, + "loss": 0.0, + "step": 55350 + }, + { + "epoch": 5.1647849211533075, + "grad_norm": NaN, + "learning_rate": 1.502154583523153e-05, + "loss": 0.0, + "step": 55351 + }, + { + "epoch": 5.164878230848185, + "grad_norm": NaN, + "learning_rate": 1.5018246787333949e-05, + "loss": 0.0, + "step": 55352 + }, + { + "epoch": 5.164971540543062, + "grad_norm": NaN, + "learning_rate": 1.5014948082655004e-05, + "loss": 0.0, + "step": 55353 + }, + { + "epoch": 5.16506485023794, + "grad_norm": NaN, + "learning_rate": 1.5011649721203023e-05, + "loss": 0.0, + "step": 55354 + }, + { + "epoch": 5.165158159932817, + "grad_norm": NaN, + "learning_rate": 1.5008351702986449e-05, + "loss": 0.0, + "step": 55355 + }, + { + "epoch": 5.1652514696276945, + "grad_norm": NaN, + "learning_rate": 1.5005054028013674e-05, + "loss": 0.0, + "step": 55356 + }, + { + "epoch": 5.165344779322572, + "grad_norm": NaN, + "learning_rate": 1.500175669629301e-05, + "loss": 0.0, + "step": 55357 + }, + { + "epoch": 5.1654380890174485, + "grad_norm": NaN, + "learning_rate": 1.4998459707832916e-05, + "loss": 0.0, + "step": 55358 + }, + { + "epoch": 5.165531398712326, + "grad_norm": NaN, + "learning_rate": 1.4995163062641768e-05, + "loss": 0.0, + "step": 55359 + }, + { + "epoch": 5.165624708407203, + "grad_norm": NaN, + "learning_rate": 1.4991866760727878e-05, + "loss": 0.0, + "step": 55360 + }, + { + "epoch": 5.165718018102081, + "grad_norm": NaN, + "learning_rate": 1.4988570802099703e-05, + "loss": 0.0, + "step": 55361 + }, + { + "epoch": 5.165811327796958, + "grad_norm": NaN, + "learning_rate": 1.4985275186765639e-05, + "loss": 0.0, + "step": 55362 + }, + { + "epoch": 5.165904637491836, + "grad_norm": NaN, + "learning_rate": 1.498197991473396e-05, + "loss": 0.0, + "step": 55363 + }, + { + "epoch": 5.165997947186713, + "grad_norm": NaN, + "learning_rate": 1.4978684986013145e-05, + "loss": 0.0, + "step": 55364 + }, + { + "epoch": 5.1660912568815895, + "grad_norm": NaN, + "learning_rate": 1.4975390400611554e-05, + "loss": 0.0, + "step": 55365 + }, + { + "epoch": 5.166184566576467, + "grad_norm": NaN, + "learning_rate": 1.4972096158537494e-05, + "loss": 0.0, + "step": 55366 + }, + { + "epoch": 5.166277876271344, + "grad_norm": NaN, + "learning_rate": 1.4968802259799428e-05, + "loss": 0.0, + "step": 55367 + }, + { + "epoch": 5.166371185966222, + "grad_norm": NaN, + "learning_rate": 1.4965508704405733e-05, + "loss": 0.0, + "step": 55368 + }, + { + "epoch": 5.166464495661099, + "grad_norm": NaN, + "learning_rate": 1.4962215492364682e-05, + "loss": 0.0, + "step": 55369 + }, + { + "epoch": 5.166557805355977, + "grad_norm": NaN, + "learning_rate": 1.4958922623684755e-05, + "loss": 0.0, + "step": 55370 + }, + { + "epoch": 5.166651115050854, + "grad_norm": NaN, + "learning_rate": 1.495563009837431e-05, + "loss": 0.0, + "step": 55371 + }, + { + "epoch": 5.1667444247457315, + "grad_norm": NaN, + "learning_rate": 1.4952337916441643e-05, + "loss": 0.0, + "step": 55372 + }, + { + "epoch": 5.166837734440608, + "grad_norm": NaN, + "learning_rate": 1.4949046077895194e-05, + "loss": 0.0, + "step": 55373 + }, + { + "epoch": 5.166931044135485, + "grad_norm": NaN, + "learning_rate": 1.494575458274334e-05, + "loss": 0.0, + "step": 55374 + }, + { + "epoch": 5.167024353830363, + "grad_norm": NaN, + "learning_rate": 1.4942463430994379e-05, + "loss": 0.0, + "step": 55375 + }, + { + "epoch": 5.16711766352524, + "grad_norm": NaN, + "learning_rate": 1.4939172622656765e-05, + "loss": 0.0, + "step": 55376 + }, + { + "epoch": 5.167210973220118, + "grad_norm": NaN, + "learning_rate": 1.4935882157738844e-05, + "loss": 0.0, + "step": 55377 + }, + { + "epoch": 5.167304282914995, + "grad_norm": NaN, + "learning_rate": 1.4932592036248909e-05, + "loss": 0.0, + "step": 55378 + }, + { + "epoch": 5.1673975926098725, + "grad_norm": NaN, + "learning_rate": 1.4929302258195402e-05, + "loss": 0.0, + "step": 55379 + }, + { + "epoch": 5.167490902304749, + "grad_norm": NaN, + "learning_rate": 1.4926012823586703e-05, + "loss": 0.0, + "step": 55380 + }, + { + "epoch": 5.1675842119996265, + "grad_norm": NaN, + "learning_rate": 1.4922723732431085e-05, + "loss": 0.0, + "step": 55381 + }, + { + "epoch": 5.167677521694504, + "grad_norm": NaN, + "learning_rate": 1.491943498473701e-05, + "loss": 0.0, + "step": 55382 + }, + { + "epoch": 5.167770831389381, + "grad_norm": NaN, + "learning_rate": 1.4916146580512789e-05, + "loss": 0.0, + "step": 55383 + }, + { + "epoch": 5.167864141084259, + "grad_norm": NaN, + "learning_rate": 1.491285851976678e-05, + "loss": 0.0, + "step": 55384 + }, + { + "epoch": 5.167957450779136, + "grad_norm": NaN, + "learning_rate": 1.4909570802507376e-05, + "loss": 0.0, + "step": 55385 + }, + { + "epoch": 5.168050760474014, + "grad_norm": NaN, + "learning_rate": 1.4906283428742904e-05, + "loss": 0.0, + "step": 55386 + }, + { + "epoch": 5.168144070168891, + "grad_norm": NaN, + "learning_rate": 1.4902996398481743e-05, + "loss": 0.0, + "step": 55387 + }, + { + "epoch": 5.1682373798637675, + "grad_norm": NaN, + "learning_rate": 1.4899709711732233e-05, + "loss": 0.0, + "step": 55388 + }, + { + "epoch": 5.168330689558645, + "grad_norm": NaN, + "learning_rate": 1.4896423368502752e-05, + "loss": 0.0, + "step": 55389 + }, + { + "epoch": 5.168423999253522, + "grad_norm": NaN, + "learning_rate": 1.4893137368801644e-05, + "loss": 0.0, + "step": 55390 + }, + { + "epoch": 5.1685173089484, + "grad_norm": NaN, + "learning_rate": 1.4889851712637252e-05, + "loss": 0.0, + "step": 55391 + }, + { + "epoch": 5.168610618643277, + "grad_norm": NaN, + "learning_rate": 1.4886566400017951e-05, + "loss": 0.0, + "step": 55392 + }, + { + "epoch": 5.168703928338155, + "grad_norm": NaN, + "learning_rate": 1.4883281430952105e-05, + "loss": 0.0, + "step": 55393 + }, + { + "epoch": 5.168797238033032, + "grad_norm": NaN, + "learning_rate": 1.4879996805448069e-05, + "loss": 0.0, + "step": 55394 + }, + { + "epoch": 5.168890547727909, + "grad_norm": NaN, + "learning_rate": 1.4876712523514106e-05, + "loss": 0.0, + "step": 55395 + }, + { + "epoch": 5.168983857422786, + "grad_norm": NaN, + "learning_rate": 1.4873428585158675e-05, + "loss": 0.0, + "step": 55396 + }, + { + "epoch": 5.169077167117663, + "grad_norm": NaN, + "learning_rate": 1.487014499039012e-05, + "loss": 0.0, + "step": 55397 + }, + { + "epoch": 5.169170476812541, + "grad_norm": NaN, + "learning_rate": 1.4866861739216702e-05, + "loss": 0.0, + "step": 55398 + }, + { + "epoch": 5.169263786507418, + "grad_norm": NaN, + "learning_rate": 1.4863578831646844e-05, + "loss": 0.0, + "step": 55399 + }, + { + "epoch": 5.169357096202296, + "grad_norm": NaN, + "learning_rate": 1.486029626768891e-05, + "loss": 0.0, + "step": 55400 + }, + { + "epoch": 5.169450405897173, + "grad_norm": NaN, + "learning_rate": 1.4857014047351141e-05, + "loss": 0.0, + "step": 55401 + }, + { + "epoch": 5.16954371559205, + "grad_norm": NaN, + "learning_rate": 1.4853732170641996e-05, + "loss": 0.0, + "step": 55402 + }, + { + "epoch": 5.169637025286927, + "grad_norm": NaN, + "learning_rate": 1.4850450637569805e-05, + "loss": 0.0, + "step": 55403 + }, + { + "epoch": 5.1697303349818045, + "grad_norm": NaN, + "learning_rate": 1.4847169448142826e-05, + "loss": 0.0, + "step": 55404 + }, + { + "epoch": 5.169823644676682, + "grad_norm": NaN, + "learning_rate": 1.4843888602369485e-05, + "loss": 0.0, + "step": 55405 + }, + { + "epoch": 5.169916954371559, + "grad_norm": NaN, + "learning_rate": 1.4840608100258128e-05, + "loss": 0.0, + "step": 55406 + }, + { + "epoch": 5.170010264066437, + "grad_norm": NaN, + "learning_rate": 1.4837327941817011e-05, + "loss": 0.0, + "step": 55407 + }, + { + "epoch": 5.170103573761314, + "grad_norm": NaN, + "learning_rate": 1.4834048127054565e-05, + "loss": 0.0, + "step": 55408 + }, + { + "epoch": 5.170196883456191, + "grad_norm": NaN, + "learning_rate": 1.4830768655979114e-05, + "loss": 0.0, + "step": 55409 + }, + { + "epoch": 5.170290193151068, + "grad_norm": NaN, + "learning_rate": 1.4827489528598918e-05, + "loss": 0.0, + "step": 55410 + }, + { + "epoch": 5.1703835028459455, + "grad_norm": NaN, + "learning_rate": 1.4824210744922421e-05, + "loss": 0.0, + "step": 55411 + }, + { + "epoch": 5.170476812540823, + "grad_norm": NaN, + "learning_rate": 1.4820932304957933e-05, + "loss": 0.0, + "step": 55412 + }, + { + "epoch": 5.1705701222357, + "grad_norm": NaN, + "learning_rate": 1.4817654208713714e-05, + "loss": 0.0, + "step": 55413 + }, + { + "epoch": 5.170663431930578, + "grad_norm": NaN, + "learning_rate": 1.4814376456198174e-05, + "loss": 0.0, + "step": 55414 + }, + { + "epoch": 5.170756741625455, + "grad_norm": NaN, + "learning_rate": 1.4811099047419673e-05, + "loss": 0.0, + "step": 55415 + }, + { + "epoch": 5.170850051320333, + "grad_norm": NaN, + "learning_rate": 1.4807821982386437e-05, + "loss": 0.0, + "step": 55416 + }, + { + "epoch": 5.170943361015209, + "grad_norm": NaN, + "learning_rate": 1.4804545261106893e-05, + "loss": 0.0, + "step": 55417 + }, + { + "epoch": 5.1710366707100865, + "grad_norm": NaN, + "learning_rate": 1.4801268883589384e-05, + "loss": 0.0, + "step": 55418 + }, + { + "epoch": 5.171129980404964, + "grad_norm": NaN, + "learning_rate": 1.4797992849842121e-05, + "loss": 0.0, + "step": 55419 + }, + { + "epoch": 5.171223290099841, + "grad_norm": NaN, + "learning_rate": 1.4794717159873548e-05, + "loss": 0.0, + "step": 55420 + }, + { + "epoch": 5.171316599794719, + "grad_norm": NaN, + "learning_rate": 1.4791441813691973e-05, + "loss": 0.0, + "step": 55421 + }, + { + "epoch": 5.171409909489596, + "grad_norm": NaN, + "learning_rate": 1.4788166811305691e-05, + "loss": 0.0, + "step": 55422 + }, + { + "epoch": 5.171503219184474, + "grad_norm": NaN, + "learning_rate": 1.478489215272306e-05, + "loss": 0.0, + "step": 55423 + }, + { + "epoch": 5.17159652887935, + "grad_norm": NaN, + "learning_rate": 1.4781617837952392e-05, + "loss": 0.0, + "step": 55424 + }, + { + "epoch": 5.171689838574228, + "grad_norm": NaN, + "learning_rate": 1.4778343867002013e-05, + "loss": 0.0, + "step": 55425 + }, + { + "epoch": 5.171783148269105, + "grad_norm": NaN, + "learning_rate": 1.4775070239880249e-05, + "loss": 0.0, + "step": 55426 + }, + { + "epoch": 5.171876457963982, + "grad_norm": NaN, + "learning_rate": 1.4771796956595428e-05, + "loss": 0.0, + "step": 55427 + }, + { + "epoch": 5.17196976765886, + "grad_norm": NaN, + "learning_rate": 1.4768524017155858e-05, + "loss": 0.0, + "step": 55428 + }, + { + "epoch": 5.172063077353737, + "grad_norm": NaN, + "learning_rate": 1.4765251421569885e-05, + "loss": 0.0, + "step": 55429 + }, + { + "epoch": 5.172156387048615, + "grad_norm": NaN, + "learning_rate": 1.4761979169845816e-05, + "loss": 0.0, + "step": 55430 + }, + { + "epoch": 5.172249696743491, + "grad_norm": NaN, + "learning_rate": 1.475870726199198e-05, + "loss": 0.0, + "step": 55431 + }, + { + "epoch": 5.172343006438369, + "grad_norm": NaN, + "learning_rate": 1.475543569801667e-05, + "loss": 0.0, + "step": 55432 + }, + { + "epoch": 5.172436316133246, + "grad_norm": NaN, + "learning_rate": 1.4752164477928246e-05, + "loss": 0.0, + "step": 55433 + }, + { + "epoch": 5.1725296258281235, + "grad_norm": NaN, + "learning_rate": 1.4748893601735e-05, + "loss": 0.0, + "step": 55434 + }, + { + "epoch": 5.172622935523001, + "grad_norm": NaN, + "learning_rate": 1.4745623069445244e-05, + "loss": 0.0, + "step": 55435 + }, + { + "epoch": 5.172716245217878, + "grad_norm": NaN, + "learning_rate": 1.4742352881067304e-05, + "loss": 0.0, + "step": 55436 + }, + { + "epoch": 5.172809554912756, + "grad_norm": NaN, + "learning_rate": 1.4739083036609505e-05, + "loss": 0.0, + "step": 55437 + }, + { + "epoch": 5.172902864607633, + "grad_norm": NaN, + "learning_rate": 1.4735813536080177e-05, + "loss": 0.0, + "step": 55438 + }, + { + "epoch": 5.17299617430251, + "grad_norm": NaN, + "learning_rate": 1.4732544379487544e-05, + "loss": 0.0, + "step": 55439 + }, + { + "epoch": 5.173089483997387, + "grad_norm": NaN, + "learning_rate": 1.4729275566840015e-05, + "loss": 0.0, + "step": 55440 + }, + { + "epoch": 5.1731827936922645, + "grad_norm": NaN, + "learning_rate": 1.4726007098145887e-05, + "loss": 0.0, + "step": 55441 + }, + { + "epoch": 5.173276103387142, + "grad_norm": NaN, + "learning_rate": 1.47227389734134e-05, + "loss": 0.0, + "step": 55442 + }, + { + "epoch": 5.173369413082019, + "grad_norm": NaN, + "learning_rate": 1.471947119265095e-05, + "loss": 0.0, + "step": 55443 + }, + { + "epoch": 5.173462722776897, + "grad_norm": NaN, + "learning_rate": 1.4716203755866828e-05, + "loss": 0.0, + "step": 55444 + }, + { + "epoch": 5.173556032471774, + "grad_norm": NaN, + "learning_rate": 1.471293666306928e-05, + "loss": 0.0, + "step": 55445 + }, + { + "epoch": 5.173649342166651, + "grad_norm": NaN, + "learning_rate": 1.4709669914266681e-05, + "loss": 0.0, + "step": 55446 + }, + { + "epoch": 5.173742651861528, + "grad_norm": NaN, + "learning_rate": 1.4706403509467357e-05, + "loss": 0.0, + "step": 55447 + }, + { + "epoch": 5.173835961556406, + "grad_norm": NaN, + "learning_rate": 1.4703137448679503e-05, + "loss": 0.0, + "step": 55448 + }, + { + "epoch": 5.173929271251283, + "grad_norm": NaN, + "learning_rate": 1.4699871731911527e-05, + "loss": 0.0, + "step": 55449 + }, + { + "epoch": 5.17402258094616, + "grad_norm": NaN, + "learning_rate": 1.4696606359171708e-05, + "loss": 0.0, + "step": 55450 + }, + { + "epoch": 5.174115890641038, + "grad_norm": NaN, + "learning_rate": 1.4693341330468306e-05, + "loss": 0.0, + "step": 55451 + }, + { + "epoch": 5.174209200335915, + "grad_norm": NaN, + "learning_rate": 1.4690076645809679e-05, + "loss": 0.0, + "step": 55452 + }, + { + "epoch": 5.174302510030792, + "grad_norm": NaN, + "learning_rate": 1.468681230520412e-05, + "loss": 0.0, + "step": 55453 + }, + { + "epoch": 5.174395819725669, + "grad_norm": NaN, + "learning_rate": 1.4683548308659876e-05, + "loss": 0.0, + "step": 55454 + }, + { + "epoch": 5.174489129420547, + "grad_norm": NaN, + "learning_rate": 1.4680284656185303e-05, + "loss": 0.0, + "step": 55455 + }, + { + "epoch": 5.174582439115424, + "grad_norm": NaN, + "learning_rate": 1.4677021347788714e-05, + "loss": 0.0, + "step": 55456 + }, + { + "epoch": 5.1746757488103015, + "grad_norm": NaN, + "learning_rate": 1.4673758383478317e-05, + "loss": 0.0, + "step": 55457 + }, + { + "epoch": 5.174769058505179, + "grad_norm": NaN, + "learning_rate": 1.467049576326249e-05, + "loss": 0.0, + "step": 55458 + }, + { + "epoch": 5.174862368200056, + "grad_norm": NaN, + "learning_rate": 1.466723348714951e-05, + "loss": 0.0, + "step": 55459 + }, + { + "epoch": 5.174955677894934, + "grad_norm": NaN, + "learning_rate": 1.4663971555147668e-05, + "loss": 0.0, + "step": 55460 + }, + { + "epoch": 5.17504898758981, + "grad_norm": NaN, + "learning_rate": 1.4660709967265244e-05, + "loss": 0.0, + "step": 55461 + }, + { + "epoch": 5.175142297284688, + "grad_norm": NaN, + "learning_rate": 1.4657448723510562e-05, + "loss": 0.0, + "step": 55462 + }, + { + "epoch": 5.175235606979565, + "grad_norm": NaN, + "learning_rate": 1.4654187823891882e-05, + "loss": 0.0, + "step": 55463 + }, + { + "epoch": 5.1753289166744425, + "grad_norm": NaN, + "learning_rate": 1.4650927268417517e-05, + "loss": 0.0, + "step": 55464 + }, + { + "epoch": 5.17542222636932, + "grad_norm": NaN, + "learning_rate": 1.4647667057095759e-05, + "loss": 0.0, + "step": 55465 + }, + { + "epoch": 5.175515536064197, + "grad_norm": NaN, + "learning_rate": 1.4644407189934881e-05, + "loss": 0.0, + "step": 55466 + }, + { + "epoch": 5.175608845759075, + "grad_norm": NaN, + "learning_rate": 1.4641147666943165e-05, + "loss": 0.0, + "step": 55467 + }, + { + "epoch": 5.175702155453951, + "grad_norm": NaN, + "learning_rate": 1.4637888488128934e-05, + "loss": 0.0, + "step": 55468 + }, + { + "epoch": 5.175795465148829, + "grad_norm": NaN, + "learning_rate": 1.4634629653500452e-05, + "loss": 0.0, + "step": 55469 + }, + { + "epoch": 5.175888774843706, + "grad_norm": NaN, + "learning_rate": 1.4631371163066008e-05, + "loss": 0.0, + "step": 55470 + }, + { + "epoch": 5.1759820845385835, + "grad_norm": NaN, + "learning_rate": 1.462811301683388e-05, + "loss": 0.0, + "step": 55471 + }, + { + "epoch": 5.176075394233461, + "grad_norm": NaN, + "learning_rate": 1.4624855214812365e-05, + "loss": 0.0, + "step": 55472 + }, + { + "epoch": 5.176168703928338, + "grad_norm": NaN, + "learning_rate": 1.4621597757009751e-05, + "loss": 0.0, + "step": 55473 + }, + { + "epoch": 5.176262013623216, + "grad_norm": NaN, + "learning_rate": 1.46183406434343e-05, + "loss": 0.0, + "step": 55474 + }, + { + "epoch": 5.176355323318092, + "grad_norm": NaN, + "learning_rate": 1.4615083874094308e-05, + "loss": 0.0, + "step": 55475 + }, + { + "epoch": 5.17644863301297, + "grad_norm": NaN, + "learning_rate": 1.4611827448998064e-05, + "loss": 0.0, + "step": 55476 + }, + { + "epoch": 5.176541942707847, + "grad_norm": NaN, + "learning_rate": 1.4608571368153833e-05, + "loss": 0.0, + "step": 55477 + }, + { + "epoch": 5.176635252402725, + "grad_norm": NaN, + "learning_rate": 1.4605315631569887e-05, + "loss": 0.0, + "step": 55478 + }, + { + "epoch": 5.176728562097602, + "grad_norm": NaN, + "learning_rate": 1.4602060239254537e-05, + "loss": 0.0, + "step": 55479 + }, + { + "epoch": 5.176821871792479, + "grad_norm": NaN, + "learning_rate": 1.4598805191216028e-05, + "loss": 0.0, + "step": 55480 + }, + { + "epoch": 5.176915181487357, + "grad_norm": NaN, + "learning_rate": 1.4595550487462638e-05, + "loss": 0.0, + "step": 55481 + }, + { + "epoch": 5.177008491182233, + "grad_norm": NaN, + "learning_rate": 1.4592296128002706e-05, + "loss": 0.0, + "step": 55482 + }, + { + "epoch": 5.177101800877111, + "grad_norm": NaN, + "learning_rate": 1.4589042112844379e-05, + "loss": 0.0, + "step": 55483 + }, + { + "epoch": 5.177195110571988, + "grad_norm": NaN, + "learning_rate": 1.4585788441996033e-05, + "loss": 0.0, + "step": 55484 + }, + { + "epoch": 5.177288420266866, + "grad_norm": NaN, + "learning_rate": 1.458253511546596e-05, + "loss": 0.0, + "step": 55485 + }, + { + "epoch": 5.177381729961743, + "grad_norm": NaN, + "learning_rate": 1.4579282133262322e-05, + "loss": 0.0, + "step": 55486 + }, + { + "epoch": 5.1774750396566205, + "grad_norm": NaN, + "learning_rate": 1.4576029495393477e-05, + "loss": 0.0, + "step": 55487 + }, + { + "epoch": 5.177568349351498, + "grad_norm": NaN, + "learning_rate": 1.4572777201867702e-05, + "loss": 0.0, + "step": 55488 + }, + { + "epoch": 5.177661659046375, + "grad_norm": NaN, + "learning_rate": 1.4569525252693193e-05, + "loss": 0.0, + "step": 55489 + }, + { + "epoch": 5.177754968741252, + "grad_norm": NaN, + "learning_rate": 1.4566273647878274e-05, + "loss": 0.0, + "step": 55490 + }, + { + "epoch": 5.177848278436129, + "grad_norm": NaN, + "learning_rate": 1.456302238743124e-05, + "loss": 0.0, + "step": 55491 + }, + { + "epoch": 5.177941588131007, + "grad_norm": NaN, + "learning_rate": 1.4559771471360265e-05, + "loss": 0.0, + "step": 55492 + }, + { + "epoch": 5.178034897825884, + "grad_norm": NaN, + "learning_rate": 1.4556520899673695e-05, + "loss": 0.0, + "step": 55493 + }, + { + "epoch": 5.1781282075207615, + "grad_norm": NaN, + "learning_rate": 1.4553270672379808e-05, + "loss": 0.0, + "step": 55494 + }, + { + "epoch": 5.178221517215639, + "grad_norm": NaN, + "learning_rate": 1.4550020789486777e-05, + "loss": 0.0, + "step": 55495 + }, + { + "epoch": 5.178314826910516, + "grad_norm": NaN, + "learning_rate": 1.4546771251002947e-05, + "loss": 0.0, + "step": 55496 + }, + { + "epoch": 5.178408136605393, + "grad_norm": NaN, + "learning_rate": 1.4543522056936546e-05, + "loss": 0.0, + "step": 55497 + }, + { + "epoch": 5.17850144630027, + "grad_norm": NaN, + "learning_rate": 1.454027320729585e-05, + "loss": 0.0, + "step": 55498 + }, + { + "epoch": 5.178594755995148, + "grad_norm": NaN, + "learning_rate": 1.45370247020891e-05, + "loss": 0.0, + "step": 55499 + }, + { + "epoch": 5.178688065690025, + "grad_norm": NaN, + "learning_rate": 1.4533776541324576e-05, + "loss": 0.0, + "step": 55500 + }, + { + "epoch": 5.178781375384903, + "grad_norm": NaN, + "learning_rate": 1.4530528725010537e-05, + "loss": 0.0, + "step": 55501 + }, + { + "epoch": 5.17887468507978, + "grad_norm": NaN, + "learning_rate": 1.4527281253155226e-05, + "loss": 0.0, + "step": 55502 + }, + { + "epoch": 5.178967994774657, + "grad_norm": NaN, + "learning_rate": 1.452403412576692e-05, + "loss": 0.0, + "step": 55503 + }, + { + "epoch": 5.179061304469535, + "grad_norm": NaN, + "learning_rate": 1.4520787342853846e-05, + "loss": 0.0, + "step": 55504 + }, + { + "epoch": 5.179154614164411, + "grad_norm": NaN, + "learning_rate": 1.4517540904424297e-05, + "loss": 0.0, + "step": 55505 + }, + { + "epoch": 5.179247923859289, + "grad_norm": NaN, + "learning_rate": 1.4514294810486499e-05, + "loss": 0.0, + "step": 55506 + }, + { + "epoch": 5.179341233554166, + "grad_norm": NaN, + "learning_rate": 1.4511049061048713e-05, + "loss": 0.0, + "step": 55507 + }, + { + "epoch": 5.179434543249044, + "grad_norm": NaN, + "learning_rate": 1.4507803656119199e-05, + "loss": 0.0, + "step": 55508 + }, + { + "epoch": 5.179527852943921, + "grad_norm": NaN, + "learning_rate": 1.45045585957062e-05, + "loss": 0.0, + "step": 55509 + }, + { + "epoch": 5.1796211626387985, + "grad_norm": NaN, + "learning_rate": 1.4501313879817978e-05, + "loss": 0.0, + "step": 55510 + }, + { + "epoch": 5.179714472333676, + "grad_norm": NaN, + "learning_rate": 1.4498069508462773e-05, + "loss": 0.0, + "step": 55511 + }, + { + "epoch": 5.179807782028552, + "grad_norm": NaN, + "learning_rate": 1.449482548164883e-05, + "loss": 0.0, + "step": 55512 + }, + { + "epoch": 5.17990109172343, + "grad_norm": NaN, + "learning_rate": 1.4491581799384427e-05, + "loss": 0.0, + "step": 55513 + }, + { + "epoch": 5.179994401418307, + "grad_norm": NaN, + "learning_rate": 1.4488338461677773e-05, + "loss": 0.0, + "step": 55514 + }, + { + "epoch": 5.180087711113185, + "grad_norm": NaN, + "learning_rate": 1.4485095468537145e-05, + "loss": 0.0, + "step": 55515 + }, + { + "epoch": 5.180181020808062, + "grad_norm": NaN, + "learning_rate": 1.4481852819970768e-05, + "loss": 0.0, + "step": 55516 + }, + { + "epoch": 5.1802743305029395, + "grad_norm": NaN, + "learning_rate": 1.4478610515986904e-05, + "loss": 0.0, + "step": 55517 + }, + { + "epoch": 5.180367640197817, + "grad_norm": NaN, + "learning_rate": 1.447536855659378e-05, + "loss": 0.0, + "step": 55518 + }, + { + "epoch": 5.1804609498926935, + "grad_norm": NaN, + "learning_rate": 1.4472126941799655e-05, + "loss": 0.0, + "step": 55519 + }, + { + "epoch": 5.180554259587571, + "grad_norm": NaN, + "learning_rate": 1.4468885671612756e-05, + "loss": 0.0, + "step": 55520 + }, + { + "epoch": 5.180647569282448, + "grad_norm": NaN, + "learning_rate": 1.4465644746041344e-05, + "loss": 0.0, + "step": 55521 + }, + { + "epoch": 5.180740878977326, + "grad_norm": NaN, + "learning_rate": 1.4462404165093643e-05, + "loss": 0.0, + "step": 55522 + }, + { + "epoch": 5.180834188672203, + "grad_norm": NaN, + "learning_rate": 1.44591639287779e-05, + "loss": 0.0, + "step": 55523 + }, + { + "epoch": 5.1809274983670806, + "grad_norm": NaN, + "learning_rate": 1.4455924037102356e-05, + "loss": 0.0, + "step": 55524 + }, + { + "epoch": 5.181020808061958, + "grad_norm": NaN, + "learning_rate": 1.4452684490075256e-05, + "loss": 0.0, + "step": 55525 + }, + { + "epoch": 5.1811141177568345, + "grad_norm": NaN, + "learning_rate": 1.4449445287704809e-05, + "loss": 0.0, + "step": 55526 + }, + { + "epoch": 5.181207427451712, + "grad_norm": NaN, + "learning_rate": 1.4446206429999275e-05, + "loss": 0.0, + "step": 55527 + }, + { + "epoch": 5.181300737146589, + "grad_norm": NaN, + "learning_rate": 1.444296791696688e-05, + "loss": 0.0, + "step": 55528 + }, + { + "epoch": 5.181394046841467, + "grad_norm": NaN, + "learning_rate": 1.4439729748615902e-05, + "loss": 0.0, + "step": 55529 + }, + { + "epoch": 5.181487356536344, + "grad_norm": NaN, + "learning_rate": 1.443649192495447e-05, + "loss": 0.0, + "step": 55530 + }, + { + "epoch": 5.181580666231222, + "grad_norm": NaN, + "learning_rate": 1.4433254445990921e-05, + "loss": 0.0, + "step": 55531 + }, + { + "epoch": 5.181673975926099, + "grad_norm": NaN, + "learning_rate": 1.4430017311733438e-05, + "loss": 0.0, + "step": 55532 + }, + { + "epoch": 5.181767285620976, + "grad_norm": NaN, + "learning_rate": 1.4426780522190261e-05, + "loss": 0.0, + "step": 55533 + }, + { + "epoch": 5.181860595315853, + "grad_norm": NaN, + "learning_rate": 1.4423544077369636e-05, + "loss": 0.0, + "step": 55534 + }, + { + "epoch": 5.18195390501073, + "grad_norm": NaN, + "learning_rate": 1.4420307977279754e-05, + "loss": 0.0, + "step": 55535 + }, + { + "epoch": 5.182047214705608, + "grad_norm": NaN, + "learning_rate": 1.4417072221928893e-05, + "loss": 0.0, + "step": 55536 + }, + { + "epoch": 5.182140524400485, + "grad_norm": NaN, + "learning_rate": 1.441383681132523e-05, + "loss": 0.0, + "step": 55537 + }, + { + "epoch": 5.182233834095363, + "grad_norm": NaN, + "learning_rate": 1.4410601745477041e-05, + "loss": 0.0, + "step": 55538 + }, + { + "epoch": 5.18232714379024, + "grad_norm": NaN, + "learning_rate": 1.4407367024392503e-05, + "loss": 0.0, + "step": 55539 + }, + { + "epoch": 5.1824204534851175, + "grad_norm": NaN, + "learning_rate": 1.4404132648079874e-05, + "loss": 0.0, + "step": 55540 + }, + { + "epoch": 5.182513763179994, + "grad_norm": NaN, + "learning_rate": 1.4400898616547385e-05, + "loss": 0.0, + "step": 55541 + }, + { + "epoch": 5.182607072874871, + "grad_norm": NaN, + "learning_rate": 1.4397664929803227e-05, + "loss": 0.0, + "step": 55542 + }, + { + "epoch": 5.182700382569749, + "grad_norm": NaN, + "learning_rate": 1.4394431587855643e-05, + "loss": 0.0, + "step": 55543 + }, + { + "epoch": 5.182793692264626, + "grad_norm": NaN, + "learning_rate": 1.4391198590712844e-05, + "loss": 0.0, + "step": 55544 + }, + { + "epoch": 5.182887001959504, + "grad_norm": NaN, + "learning_rate": 1.4387965938383056e-05, + "loss": 0.0, + "step": 55545 + }, + { + "epoch": 5.182980311654381, + "grad_norm": NaN, + "learning_rate": 1.4384733630874506e-05, + "loss": 0.0, + "step": 55546 + }, + { + "epoch": 5.1830736213492585, + "grad_norm": NaN, + "learning_rate": 1.4381501668195405e-05, + "loss": 0.0, + "step": 55547 + }, + { + "epoch": 5.183166931044135, + "grad_norm": NaN, + "learning_rate": 1.437827005035398e-05, + "loss": 0.0, + "step": 55548 + }, + { + "epoch": 5.1832602407390125, + "grad_norm": NaN, + "learning_rate": 1.4375038777358422e-05, + "loss": 0.0, + "step": 55549 + }, + { + "epoch": 5.18335355043389, + "grad_norm": NaN, + "learning_rate": 1.4371807849216977e-05, + "loss": 0.0, + "step": 55550 + }, + { + "epoch": 5.183446860128767, + "grad_norm": NaN, + "learning_rate": 1.4368577265937853e-05, + "loss": 0.0, + "step": 55551 + }, + { + "epoch": 5.183540169823645, + "grad_norm": NaN, + "learning_rate": 1.4365347027529245e-05, + "loss": 0.0, + "step": 55552 + }, + { + "epoch": 5.183633479518522, + "grad_norm": NaN, + "learning_rate": 1.4362117133999395e-05, + "loss": 0.0, + "step": 55553 + }, + { + "epoch": 5.1837267892134, + "grad_norm": NaN, + "learning_rate": 1.43588875853565e-05, + "loss": 0.0, + "step": 55554 + }, + { + "epoch": 5.183820098908277, + "grad_norm": NaN, + "learning_rate": 1.435565838160878e-05, + "loss": 0.0, + "step": 55555 + }, + { + "epoch": 5.1839134086031535, + "grad_norm": NaN, + "learning_rate": 1.435242952276442e-05, + "loss": 0.0, + "step": 55556 + }, + { + "epoch": 5.184006718298031, + "grad_norm": NaN, + "learning_rate": 1.4349201008831673e-05, + "loss": 0.0, + "step": 55557 + }, + { + "epoch": 5.184100027992908, + "grad_norm": NaN, + "learning_rate": 1.4345972839818703e-05, + "loss": 0.0, + "step": 55558 + }, + { + "epoch": 5.184193337687786, + "grad_norm": NaN, + "learning_rate": 1.4342745015733753e-05, + "loss": 0.0, + "step": 55559 + }, + { + "epoch": 5.184286647382663, + "grad_norm": NaN, + "learning_rate": 1.4339517536585015e-05, + "loss": 0.0, + "step": 55560 + }, + { + "epoch": 5.184379957077541, + "grad_norm": NaN, + "learning_rate": 1.43362904023807e-05, + "loss": 0.0, + "step": 55561 + }, + { + "epoch": 5.184473266772418, + "grad_norm": NaN, + "learning_rate": 1.433306361312902e-05, + "loss": 0.0, + "step": 55562 + }, + { + "epoch": 5.184566576467295, + "grad_norm": NaN, + "learning_rate": 1.4329837168838165e-05, + "loss": 0.0, + "step": 55563 + }, + { + "epoch": 5.184659886162172, + "grad_norm": NaN, + "learning_rate": 1.4326611069516347e-05, + "loss": 0.0, + "step": 55564 + }, + { + "epoch": 5.184753195857049, + "grad_norm": NaN, + "learning_rate": 1.4323385315171759e-05, + "loss": 0.0, + "step": 55565 + }, + { + "epoch": 5.184846505551927, + "grad_norm": NaN, + "learning_rate": 1.4320159905812612e-05, + "loss": 0.0, + "step": 55566 + }, + { + "epoch": 5.184939815246804, + "grad_norm": NaN, + "learning_rate": 1.4316934841447114e-05, + "loss": 0.0, + "step": 55567 + }, + { + "epoch": 5.185033124941682, + "grad_norm": NaN, + "learning_rate": 1.431371012208346e-05, + "loss": 0.0, + "step": 55568 + }, + { + "epoch": 5.185126434636559, + "grad_norm": NaN, + "learning_rate": 1.4310485747729844e-05, + "loss": 0.0, + "step": 55569 + }, + { + "epoch": 5.185219744331436, + "grad_norm": NaN, + "learning_rate": 1.4307261718394475e-05, + "loss": 0.0, + "step": 55570 + }, + { + "epoch": 5.185313054026313, + "grad_norm": NaN, + "learning_rate": 1.430403803408553e-05, + "loss": 0.0, + "step": 55571 + }, + { + "epoch": 5.1854063637211905, + "grad_norm": NaN, + "learning_rate": 1.4300814694811235e-05, + "loss": 0.0, + "step": 55572 + }, + { + "epoch": 5.185499673416068, + "grad_norm": NaN, + "learning_rate": 1.4297591700579752e-05, + "loss": 0.0, + "step": 55573 + }, + { + "epoch": 5.185592983110945, + "grad_norm": NaN, + "learning_rate": 1.4294369051399308e-05, + "loss": 0.0, + "step": 55574 + }, + { + "epoch": 5.185686292805823, + "grad_norm": NaN, + "learning_rate": 1.4291146747278092e-05, + "loss": 0.0, + "step": 55575 + }, + { + "epoch": 5.1857796025007, + "grad_norm": NaN, + "learning_rate": 1.4287924788224286e-05, + "loss": 0.0, + "step": 55576 + }, + { + "epoch": 5.1858729121955776, + "grad_norm": NaN, + "learning_rate": 1.4284703174246082e-05, + "loss": 0.0, + "step": 55577 + }, + { + "epoch": 5.185966221890454, + "grad_norm": NaN, + "learning_rate": 1.4281481905351672e-05, + "loss": 0.0, + "step": 55578 + }, + { + "epoch": 5.1860595315853315, + "grad_norm": NaN, + "learning_rate": 1.427826098154925e-05, + "loss": 0.0, + "step": 55579 + }, + { + "epoch": 5.186152841280209, + "grad_norm": NaN, + "learning_rate": 1.4275040402847027e-05, + "loss": 0.0, + "step": 55580 + }, + { + "epoch": 5.186246150975086, + "grad_norm": NaN, + "learning_rate": 1.4271820169253145e-05, + "loss": 0.0, + "step": 55581 + }, + { + "epoch": 5.186339460669964, + "grad_norm": NaN, + "learning_rate": 1.4268600280775832e-05, + "loss": 0.0, + "step": 55582 + }, + { + "epoch": 5.186432770364841, + "grad_norm": NaN, + "learning_rate": 1.4265380737423265e-05, + "loss": 0.0, + "step": 55583 + }, + { + "epoch": 5.186526080059719, + "grad_norm": NaN, + "learning_rate": 1.426216153920362e-05, + "loss": 0.0, + "step": 55584 + }, + { + "epoch": 5.186619389754595, + "grad_norm": NaN, + "learning_rate": 1.4258942686125092e-05, + "loss": 0.0, + "step": 55585 + }, + { + "epoch": 5.1867126994494726, + "grad_norm": NaN, + "learning_rate": 1.4255724178195871e-05, + "loss": 0.0, + "step": 55586 + }, + { + "epoch": 5.18680600914435, + "grad_norm": NaN, + "learning_rate": 1.425250601542412e-05, + "loss": 0.0, + "step": 55587 + }, + { + "epoch": 5.186899318839227, + "grad_norm": NaN, + "learning_rate": 1.4249288197818033e-05, + "loss": 0.0, + "step": 55588 + }, + { + "epoch": 5.186992628534105, + "grad_norm": NaN, + "learning_rate": 1.42460707253858e-05, + "loss": 0.0, + "step": 55589 + }, + { + "epoch": 5.187085938228982, + "grad_norm": NaN, + "learning_rate": 1.42428535981356e-05, + "loss": 0.0, + "step": 55590 + }, + { + "epoch": 5.18717924792386, + "grad_norm": NaN, + "learning_rate": 1.4239636816075611e-05, + "loss": 0.0, + "step": 55591 + }, + { + "epoch": 5.187272557618736, + "grad_norm": NaN, + "learning_rate": 1.4236420379214009e-05, + "loss": 0.0, + "step": 55592 + }, + { + "epoch": 5.187365867313614, + "grad_norm": NaN, + "learning_rate": 1.4233204287558968e-05, + "loss": 0.0, + "step": 55593 + }, + { + "epoch": 5.187459177008491, + "grad_norm": NaN, + "learning_rate": 1.422998854111867e-05, + "loss": 0.0, + "step": 55594 + }, + { + "epoch": 5.187552486703368, + "grad_norm": NaN, + "learning_rate": 1.4226773139901288e-05, + "loss": 0.0, + "step": 55595 + }, + { + "epoch": 5.187645796398246, + "grad_norm": NaN, + "learning_rate": 1.4223558083915016e-05, + "loss": 0.0, + "step": 55596 + }, + { + "epoch": 5.187739106093123, + "grad_norm": NaN, + "learning_rate": 1.4220343373168015e-05, + "loss": 0.0, + "step": 55597 + }, + { + "epoch": 5.187832415788001, + "grad_norm": NaN, + "learning_rate": 1.421712900766846e-05, + "loss": 0.0, + "step": 55598 + }, + { + "epoch": 5.187925725482877, + "grad_norm": NaN, + "learning_rate": 1.4213914987424512e-05, + "loss": 0.0, + "step": 55599 + }, + { + "epoch": 5.188019035177755, + "grad_norm": NaN, + "learning_rate": 1.4210701312444368e-05, + "loss": 0.0, + "step": 55600 + }, + { + "epoch": 5.188112344872632, + "grad_norm": NaN, + "learning_rate": 1.4207487982736182e-05, + "loss": 0.0, + "step": 55601 + }, + { + "epoch": 5.1882056545675095, + "grad_norm": NaN, + "learning_rate": 1.4204274998308135e-05, + "loss": 0.0, + "step": 55602 + }, + { + "epoch": 5.188298964262387, + "grad_norm": NaN, + "learning_rate": 1.4201062359168387e-05, + "loss": 0.0, + "step": 55603 + }, + { + "epoch": 5.188392273957264, + "grad_norm": NaN, + "learning_rate": 1.4197850065325112e-05, + "loss": 0.0, + "step": 55604 + }, + { + "epoch": 5.188485583652142, + "grad_norm": NaN, + "learning_rate": 1.4194638116786472e-05, + "loss": 0.0, + "step": 55605 + }, + { + "epoch": 5.188578893347019, + "grad_norm": NaN, + "learning_rate": 1.4191426513560628e-05, + "loss": 0.0, + "step": 55606 + }, + { + "epoch": 5.188672203041896, + "grad_norm": NaN, + "learning_rate": 1.4188215255655772e-05, + "loss": 0.0, + "step": 55607 + }, + { + "epoch": 5.188765512736773, + "grad_norm": NaN, + "learning_rate": 1.4185004343080065e-05, + "loss": 0.0, + "step": 55608 + }, + { + "epoch": 5.1888588224316505, + "grad_norm": NaN, + "learning_rate": 1.4181793775841649e-05, + "loss": 0.0, + "step": 55609 + }, + { + "epoch": 5.188952132126528, + "grad_norm": NaN, + "learning_rate": 1.4178583553948685e-05, + "loss": 0.0, + "step": 55610 + }, + { + "epoch": 5.189045441821405, + "grad_norm": NaN, + "learning_rate": 1.417537367740937e-05, + "loss": 0.0, + "step": 55611 + }, + { + "epoch": 5.189138751516283, + "grad_norm": NaN, + "learning_rate": 1.417216414623184e-05, + "loss": 0.0, + "step": 55612 + }, + { + "epoch": 5.18923206121116, + "grad_norm": NaN, + "learning_rate": 1.4168954960424262e-05, + "loss": 0.0, + "step": 55613 + }, + { + "epoch": 5.189325370906037, + "grad_norm": NaN, + "learning_rate": 1.4165746119994792e-05, + "loss": 0.0, + "step": 55614 + }, + { + "epoch": 5.189418680600914, + "grad_norm": NaN, + "learning_rate": 1.4162537624951593e-05, + "loss": 0.0, + "step": 55615 + }, + { + "epoch": 5.189511990295792, + "grad_norm": NaN, + "learning_rate": 1.4159329475302822e-05, + "loss": 0.0, + "step": 55616 + }, + { + "epoch": 5.189605299990669, + "grad_norm": NaN, + "learning_rate": 1.4156121671056642e-05, + "loss": 0.0, + "step": 55617 + }, + { + "epoch": 5.189698609685546, + "grad_norm": NaN, + "learning_rate": 1.4152914212221194e-05, + "loss": 0.0, + "step": 55618 + }, + { + "epoch": 5.189791919380424, + "grad_norm": NaN, + "learning_rate": 1.4149707098804657e-05, + "loss": 0.0, + "step": 55619 + }, + { + "epoch": 5.189885229075301, + "grad_norm": NaN, + "learning_rate": 1.4146500330815158e-05, + "loss": 0.0, + "step": 55620 + }, + { + "epoch": 5.189978538770179, + "grad_norm": NaN, + "learning_rate": 1.4143293908260873e-05, + "loss": 0.0, + "step": 55621 + }, + { + "epoch": 5.190071848465055, + "grad_norm": NaN, + "learning_rate": 1.414008783114996e-05, + "loss": 0.0, + "step": 55622 + }, + { + "epoch": 5.190165158159933, + "grad_norm": NaN, + "learning_rate": 1.4136882099490548e-05, + "loss": 0.0, + "step": 55623 + }, + { + "epoch": 5.19025846785481, + "grad_norm": NaN, + "learning_rate": 1.4133676713290798e-05, + "loss": 0.0, + "step": 55624 + }, + { + "epoch": 5.1903517775496875, + "grad_norm": NaN, + "learning_rate": 1.4130471672558868e-05, + "loss": 0.0, + "step": 55625 + }, + { + "epoch": 5.190445087244565, + "grad_norm": NaN, + "learning_rate": 1.4127266977302887e-05, + "loss": 0.0, + "step": 55626 + }, + { + "epoch": 5.190538396939442, + "grad_norm": NaN, + "learning_rate": 1.412406262753103e-05, + "loss": 0.0, + "step": 55627 + }, + { + "epoch": 5.19063170663432, + "grad_norm": NaN, + "learning_rate": 1.4120858623251441e-05, + "loss": 0.0, + "step": 55628 + }, + { + "epoch": 5.190725016329196, + "grad_norm": NaN, + "learning_rate": 1.4117654964472247e-05, + "loss": 0.0, + "step": 55629 + }, + { + "epoch": 5.190818326024074, + "grad_norm": NaN, + "learning_rate": 1.4114451651201592e-05, + "loss": 0.0, + "step": 55630 + }, + { + "epoch": 5.190911635718951, + "grad_norm": NaN, + "learning_rate": 1.4111248683447652e-05, + "loss": 0.0, + "step": 55631 + }, + { + "epoch": 5.1910049454138285, + "grad_norm": NaN, + "learning_rate": 1.4108046061218553e-05, + "loss": 0.0, + "step": 55632 + }, + { + "epoch": 5.191098255108706, + "grad_norm": NaN, + "learning_rate": 1.4104843784522424e-05, + "loss": 0.0, + "step": 55633 + }, + { + "epoch": 5.191191564803583, + "grad_norm": NaN, + "learning_rate": 1.410164185336744e-05, + "loss": 0.0, + "step": 55634 + }, + { + "epoch": 5.191284874498461, + "grad_norm": NaN, + "learning_rate": 1.4098440267761712e-05, + "loss": 0.0, + "step": 55635 + }, + { + "epoch": 5.191378184193337, + "grad_norm": NaN, + "learning_rate": 1.40952390277134e-05, + "loss": 0.0, + "step": 55636 + }, + { + "epoch": 5.191471493888215, + "grad_norm": NaN, + "learning_rate": 1.4092038133230633e-05, + "loss": 0.0, + "step": 55637 + }, + { + "epoch": 5.191564803583092, + "grad_norm": NaN, + "learning_rate": 1.4088837584321566e-05, + "loss": 0.0, + "step": 55638 + }, + { + "epoch": 5.1916581132779696, + "grad_norm": NaN, + "learning_rate": 1.4085637380994314e-05, + "loss": 0.0, + "step": 55639 + }, + { + "epoch": 5.191751422972847, + "grad_norm": NaN, + "learning_rate": 1.4082437523257035e-05, + "loss": 0.0, + "step": 55640 + }, + { + "epoch": 5.191844732667724, + "grad_norm": NaN, + "learning_rate": 1.4079238011117839e-05, + "loss": 0.0, + "step": 55641 + }, + { + "epoch": 5.191938042362602, + "grad_norm": NaN, + "learning_rate": 1.4076038844584886e-05, + "loss": 0.0, + "step": 55642 + }, + { + "epoch": 5.192031352057478, + "grad_norm": NaN, + "learning_rate": 1.4072840023666303e-05, + "loss": 0.0, + "step": 55643 + }, + { + "epoch": 5.192124661752356, + "grad_norm": NaN, + "learning_rate": 1.406964154837022e-05, + "loss": 0.0, + "step": 55644 + }, + { + "epoch": 5.192217971447233, + "grad_norm": NaN, + "learning_rate": 1.4066443418704777e-05, + "loss": 0.0, + "step": 55645 + }, + { + "epoch": 5.192311281142111, + "grad_norm": NaN, + "learning_rate": 1.4063245634678116e-05, + "loss": 0.0, + "step": 55646 + }, + { + "epoch": 5.192404590836988, + "grad_norm": NaN, + "learning_rate": 1.4060048196298334e-05, + "loss": 0.0, + "step": 55647 + }, + { + "epoch": 5.192497900531865, + "grad_norm": NaN, + "learning_rate": 1.4056851103573591e-05, + "loss": 0.0, + "step": 55648 + }, + { + "epoch": 5.192591210226743, + "grad_norm": NaN, + "learning_rate": 1.4053654356511995e-05, + "loss": 0.0, + "step": 55649 + }, + { + "epoch": 5.19268451992162, + "grad_norm": NaN, + "learning_rate": 1.4050457955121691e-05, + "loss": 0.0, + "step": 55650 + }, + { + "epoch": 5.192777829616497, + "grad_norm": NaN, + "learning_rate": 1.4047261899410805e-05, + "loss": 0.0, + "step": 55651 + }, + { + "epoch": 5.192871139311374, + "grad_norm": NaN, + "learning_rate": 1.4044066189387465e-05, + "loss": 0.0, + "step": 55652 + }, + { + "epoch": 5.192964449006252, + "grad_norm": NaN, + "learning_rate": 1.404087082505978e-05, + "loss": 0.0, + "step": 55653 + }, + { + "epoch": 5.193057758701129, + "grad_norm": NaN, + "learning_rate": 1.4037675806435894e-05, + "loss": 0.0, + "step": 55654 + }, + { + "epoch": 5.1931510683960065, + "grad_norm": NaN, + "learning_rate": 1.4034481133523918e-05, + "loss": 0.0, + "step": 55655 + }, + { + "epoch": 5.193244378090884, + "grad_norm": NaN, + "learning_rate": 1.4031286806331976e-05, + "loss": 0.0, + "step": 55656 + }, + { + "epoch": 5.193337687785761, + "grad_norm": NaN, + "learning_rate": 1.4028092824868198e-05, + "loss": 0.0, + "step": 55657 + }, + { + "epoch": 5.193430997480638, + "grad_norm": NaN, + "learning_rate": 1.402489918914071e-05, + "loss": 0.0, + "step": 55658 + }, + { + "epoch": 5.193524307175515, + "grad_norm": NaN, + "learning_rate": 1.402170589915762e-05, + "loss": 0.0, + "step": 55659 + }, + { + "epoch": 5.193617616870393, + "grad_norm": NaN, + "learning_rate": 1.4018512954927058e-05, + "loss": 0.0, + "step": 55660 + }, + { + "epoch": 5.19371092656527, + "grad_norm": NaN, + "learning_rate": 1.4015320356457132e-05, + "loss": 0.0, + "step": 55661 + }, + { + "epoch": 5.1938042362601475, + "grad_norm": NaN, + "learning_rate": 1.4012128103755953e-05, + "loss": 0.0, + "step": 55662 + }, + { + "epoch": 5.193897545955025, + "grad_norm": NaN, + "learning_rate": 1.4008936196831665e-05, + "loss": 0.0, + "step": 55663 + }, + { + "epoch": 5.193990855649902, + "grad_norm": NaN, + "learning_rate": 1.4005744635692361e-05, + "loss": 0.0, + "step": 55664 + }, + { + "epoch": 5.194084165344779, + "grad_norm": NaN, + "learning_rate": 1.4002553420346168e-05, + "loss": 0.0, + "step": 55665 + }, + { + "epoch": 5.194177475039656, + "grad_norm": NaN, + "learning_rate": 1.3999362550801197e-05, + "loss": 0.0, + "step": 55666 + }, + { + "epoch": 5.194270784734534, + "grad_norm": NaN, + "learning_rate": 1.3996172027065555e-05, + "loss": 0.0, + "step": 55667 + }, + { + "epoch": 5.194364094429411, + "grad_norm": NaN, + "learning_rate": 1.3992981849147373e-05, + "loss": 0.0, + "step": 55668 + }, + { + "epoch": 5.194457404124289, + "grad_norm": NaN, + "learning_rate": 1.398979201705474e-05, + "loss": 0.0, + "step": 55669 + }, + { + "epoch": 5.194550713819166, + "grad_norm": NaN, + "learning_rate": 1.3986602530795771e-05, + "loss": 0.0, + "step": 55670 + }, + { + "epoch": 5.194644023514043, + "grad_norm": NaN, + "learning_rate": 1.398341339037859e-05, + "loss": 0.0, + "step": 55671 + }, + { + "epoch": 5.19473733320892, + "grad_norm": NaN, + "learning_rate": 1.3980224595811308e-05, + "loss": 0.0, + "step": 55672 + }, + { + "epoch": 5.194830642903797, + "grad_norm": NaN, + "learning_rate": 1.3977036147102e-05, + "loss": 0.0, + "step": 55673 + }, + { + "epoch": 5.194923952598675, + "grad_norm": NaN, + "learning_rate": 1.3973848044258811e-05, + "loss": 0.0, + "step": 55674 + }, + { + "epoch": 5.195017262293552, + "grad_norm": NaN, + "learning_rate": 1.397066028728982e-05, + "loss": 0.0, + "step": 55675 + }, + { + "epoch": 5.19511057198843, + "grad_norm": NaN, + "learning_rate": 1.3967472876203168e-05, + "loss": 0.0, + "step": 55676 + }, + { + "epoch": 5.195203881683307, + "grad_norm": NaN, + "learning_rate": 1.3964285811006915e-05, + "loss": 0.0, + "step": 55677 + }, + { + "epoch": 5.1952971913781845, + "grad_norm": NaN, + "learning_rate": 1.3961099091709189e-05, + "loss": 0.0, + "step": 55678 + }, + { + "epoch": 5.195390501073062, + "grad_norm": NaN, + "learning_rate": 1.3957912718318099e-05, + "loss": 0.0, + "step": 55679 + }, + { + "epoch": 5.195483810767938, + "grad_norm": NaN, + "learning_rate": 1.3954726690841739e-05, + "loss": 0.0, + "step": 55680 + }, + { + "epoch": 5.195577120462816, + "grad_norm": NaN, + "learning_rate": 1.3951541009288203e-05, + "loss": 0.0, + "step": 55681 + }, + { + "epoch": 5.195670430157693, + "grad_norm": NaN, + "learning_rate": 1.3948355673665602e-05, + "loss": 0.0, + "step": 55682 + }, + { + "epoch": 5.195763739852571, + "grad_norm": NaN, + "learning_rate": 1.3945170683982027e-05, + "loss": 0.0, + "step": 55683 + }, + { + "epoch": 5.195857049547448, + "grad_norm": NaN, + "learning_rate": 1.3941986040245572e-05, + "loss": 0.0, + "step": 55684 + }, + { + "epoch": 5.1959503592423255, + "grad_norm": NaN, + "learning_rate": 1.3938801742464367e-05, + "loss": 0.0, + "step": 55685 + }, + { + "epoch": 5.196043668937203, + "grad_norm": NaN, + "learning_rate": 1.393561779064647e-05, + "loss": 0.0, + "step": 55686 + }, + { + "epoch": 5.1961369786320795, + "grad_norm": NaN, + "learning_rate": 1.393243418479999e-05, + "loss": 0.0, + "step": 55687 + }, + { + "epoch": 5.196230288326957, + "grad_norm": NaN, + "learning_rate": 1.3929250924933021e-05, + "loss": 0.0, + "step": 55688 + }, + { + "epoch": 5.196323598021834, + "grad_norm": NaN, + "learning_rate": 1.392606801105366e-05, + "loss": 0.0, + "step": 55689 + }, + { + "epoch": 5.196416907716712, + "grad_norm": NaN, + "learning_rate": 1.3922885443170012e-05, + "loss": 0.0, + "step": 55690 + }, + { + "epoch": 5.196510217411589, + "grad_norm": NaN, + "learning_rate": 1.3919703221290158e-05, + "loss": 0.0, + "step": 55691 + }, + { + "epoch": 5.196603527106467, + "grad_norm": NaN, + "learning_rate": 1.3916521345422171e-05, + "loss": 0.0, + "step": 55692 + }, + { + "epoch": 5.196696836801344, + "grad_norm": NaN, + "learning_rate": 1.3913339815574165e-05, + "loss": 0.0, + "step": 55693 + }, + { + "epoch": 5.196790146496221, + "grad_norm": NaN, + "learning_rate": 1.3910158631754231e-05, + "loss": 0.0, + "step": 55694 + }, + { + "epoch": 5.196883456191098, + "grad_norm": NaN, + "learning_rate": 1.3906977793970448e-05, + "loss": 0.0, + "step": 55695 + }, + { + "epoch": 5.196976765885975, + "grad_norm": NaN, + "learning_rate": 1.3903797302230907e-05, + "loss": 0.0, + "step": 55696 + }, + { + "epoch": 5.197070075580853, + "grad_norm": NaN, + "learning_rate": 1.3900617156543703e-05, + "loss": 0.0, + "step": 55697 + }, + { + "epoch": 5.19716338527573, + "grad_norm": NaN, + "learning_rate": 1.3897437356916897e-05, + "loss": 0.0, + "step": 55698 + }, + { + "epoch": 5.197256694970608, + "grad_norm": NaN, + "learning_rate": 1.3894257903358613e-05, + "loss": 0.0, + "step": 55699 + }, + { + "epoch": 5.197350004665485, + "grad_norm": NaN, + "learning_rate": 1.3891078795876897e-05, + "loss": 0.0, + "step": 55700 + }, + { + "epoch": 5.197443314360362, + "grad_norm": NaN, + "learning_rate": 1.3887900034479843e-05, + "loss": 0.0, + "step": 55701 + }, + { + "epoch": 5.197536624055239, + "grad_norm": NaN, + "learning_rate": 1.388472161917556e-05, + "loss": 0.0, + "step": 55702 + }, + { + "epoch": 5.197629933750116, + "grad_norm": NaN, + "learning_rate": 1.3881543549972091e-05, + "loss": 0.0, + "step": 55703 + }, + { + "epoch": 5.197723243444994, + "grad_norm": NaN, + "learning_rate": 1.3878365826877546e-05, + "loss": 0.0, + "step": 55704 + }, + { + "epoch": 5.197816553139871, + "grad_norm": NaN, + "learning_rate": 1.3875188449899987e-05, + "loss": 0.0, + "step": 55705 + }, + { + "epoch": 5.197909862834749, + "grad_norm": NaN, + "learning_rate": 1.3872011419047508e-05, + "loss": 0.0, + "step": 55706 + }, + { + "epoch": 5.198003172529626, + "grad_norm": NaN, + "learning_rate": 1.3868834734328183e-05, + "loss": 0.0, + "step": 55707 + }, + { + "epoch": 5.1980964822245035, + "grad_norm": NaN, + "learning_rate": 1.386565839575009e-05, + "loss": 0.0, + "step": 55708 + }, + { + "epoch": 5.19818979191938, + "grad_norm": NaN, + "learning_rate": 1.386248240332129e-05, + "loss": 0.0, + "step": 55709 + }, + { + "epoch": 5.198283101614257, + "grad_norm": NaN, + "learning_rate": 1.3859306757049877e-05, + "loss": 0.0, + "step": 55710 + }, + { + "epoch": 5.198376411309135, + "grad_norm": NaN, + "learning_rate": 1.3856131456943908e-05, + "loss": 0.0, + "step": 55711 + }, + { + "epoch": 5.198469721004012, + "grad_norm": NaN, + "learning_rate": 1.385295650301148e-05, + "loss": 0.0, + "step": 55712 + }, + { + "epoch": 5.19856303069889, + "grad_norm": NaN, + "learning_rate": 1.3849781895260637e-05, + "loss": 0.0, + "step": 55713 + }, + { + "epoch": 5.198656340393767, + "grad_norm": NaN, + "learning_rate": 1.3846607633699486e-05, + "loss": 0.0, + "step": 55714 + }, + { + "epoch": 5.1987496500886445, + "grad_norm": NaN, + "learning_rate": 1.3843433718336071e-05, + "loss": 0.0, + "step": 55715 + }, + { + "epoch": 5.198842959783521, + "grad_norm": NaN, + "learning_rate": 1.3840260149178473e-05, + "loss": 0.0, + "step": 55716 + }, + { + "epoch": 5.1989362694783985, + "grad_norm": NaN, + "learning_rate": 1.3837086926234763e-05, + "loss": 0.0, + "step": 55717 + }, + { + "epoch": 5.199029579173276, + "grad_norm": NaN, + "learning_rate": 1.3833914049513006e-05, + "loss": 0.0, + "step": 55718 + }, + { + "epoch": 5.199122888868153, + "grad_norm": NaN, + "learning_rate": 1.3830741519021277e-05, + "loss": 0.0, + "step": 55719 + }, + { + "epoch": 5.199216198563031, + "grad_norm": NaN, + "learning_rate": 1.3827569334767619e-05, + "loss": 0.0, + "step": 55720 + }, + { + "epoch": 5.199309508257908, + "grad_norm": NaN, + "learning_rate": 1.3824397496760125e-05, + "loss": 0.0, + "step": 55721 + }, + { + "epoch": 5.199402817952786, + "grad_norm": NaN, + "learning_rate": 1.3821226005006858e-05, + "loss": 0.0, + "step": 55722 + }, + { + "epoch": 5.199496127647663, + "grad_norm": NaN, + "learning_rate": 1.381805485951586e-05, + "loss": 0.0, + "step": 55723 + }, + { + "epoch": 5.1995894373425395, + "grad_norm": NaN, + "learning_rate": 1.3814884060295205e-05, + "loss": 0.0, + "step": 55724 + }, + { + "epoch": 5.199682747037417, + "grad_norm": NaN, + "learning_rate": 1.3811713607352975e-05, + "loss": 0.0, + "step": 55725 + }, + { + "epoch": 5.199776056732294, + "grad_norm": NaN, + "learning_rate": 1.380854350069721e-05, + "loss": 0.0, + "step": 55726 + }, + { + "epoch": 5.199869366427172, + "grad_norm": NaN, + "learning_rate": 1.3805373740335974e-05, + "loss": 0.0, + "step": 55727 + }, + { + "epoch": 5.199962676122049, + "grad_norm": NaN, + "learning_rate": 1.3802204326277322e-05, + "loss": 0.0, + "step": 55728 + }, + { + "epoch": 5.200055985816927, + "grad_norm": NaN, + "learning_rate": 1.3799035258529335e-05, + "loss": 0.0, + "step": 55729 + }, + { + "epoch": 5.200149295511804, + "grad_norm": NaN, + "learning_rate": 1.3795866537100036e-05, + "loss": 0.0, + "step": 55730 + }, + { + "epoch": 5.200242605206681, + "grad_norm": NaN, + "learning_rate": 1.3792698161997523e-05, + "loss": 0.0, + "step": 55731 + }, + { + "epoch": 5.200335914901558, + "grad_norm": NaN, + "learning_rate": 1.378953013322982e-05, + "loss": 0.0, + "step": 55732 + }, + { + "epoch": 5.200429224596435, + "grad_norm": NaN, + "learning_rate": 1.3786362450804989e-05, + "loss": 0.0, + "step": 55733 + }, + { + "epoch": 5.200522534291313, + "grad_norm": NaN, + "learning_rate": 1.3783195114731104e-05, + "loss": 0.0, + "step": 55734 + }, + { + "epoch": 5.20061584398619, + "grad_norm": NaN, + "learning_rate": 1.378002812501618e-05, + "loss": 0.0, + "step": 55735 + }, + { + "epoch": 5.200709153681068, + "grad_norm": NaN, + "learning_rate": 1.3776861481668306e-05, + "loss": 0.0, + "step": 55736 + }, + { + "epoch": 5.200802463375945, + "grad_norm": NaN, + "learning_rate": 1.3773695184695527e-05, + "loss": 0.0, + "step": 55737 + }, + { + "epoch": 5.2008957730708225, + "grad_norm": NaN, + "learning_rate": 1.3770529234105887e-05, + "loss": 0.0, + "step": 55738 + }, + { + "epoch": 5.200989082765699, + "grad_norm": NaN, + "learning_rate": 1.376736362990743e-05, + "loss": 0.0, + "step": 55739 + }, + { + "epoch": 5.2010823924605765, + "grad_norm": NaN, + "learning_rate": 1.3764198372108215e-05, + "loss": 0.0, + "step": 55740 + }, + { + "epoch": 5.201175702155454, + "grad_norm": NaN, + "learning_rate": 1.3761033460716287e-05, + "loss": 0.0, + "step": 55741 + }, + { + "epoch": 5.201269011850331, + "grad_norm": NaN, + "learning_rate": 1.3757868895739688e-05, + "loss": 0.0, + "step": 55742 + }, + { + "epoch": 5.201362321545209, + "grad_norm": NaN, + "learning_rate": 1.3754704677186478e-05, + "loss": 0.0, + "step": 55743 + }, + { + "epoch": 5.201455631240086, + "grad_norm": NaN, + "learning_rate": 1.3751540805064704e-05, + "loss": 0.0, + "step": 55744 + }, + { + "epoch": 5.201548940934964, + "grad_norm": NaN, + "learning_rate": 1.3748377279382389e-05, + "loss": 0.0, + "step": 55745 + }, + { + "epoch": 5.20164225062984, + "grad_norm": NaN, + "learning_rate": 1.3745214100147594e-05, + "loss": 0.0, + "step": 55746 + }, + { + "epoch": 5.2017355603247175, + "grad_norm": NaN, + "learning_rate": 1.3742051267368348e-05, + "loss": 0.0, + "step": 55747 + }, + { + "epoch": 5.201828870019595, + "grad_norm": NaN, + "learning_rate": 1.373888878105271e-05, + "loss": 0.0, + "step": 55748 + }, + { + "epoch": 5.201922179714472, + "grad_norm": NaN, + "learning_rate": 1.3735726641208722e-05, + "loss": 0.0, + "step": 55749 + }, + { + "epoch": 5.20201548940935, + "grad_norm": NaN, + "learning_rate": 1.3732564847844413e-05, + "loss": 0.0, + "step": 55750 + }, + { + "epoch": 5.202108799104227, + "grad_norm": NaN, + "learning_rate": 1.3729403400967826e-05, + "loss": 0.0, + "step": 55751 + }, + { + "epoch": 5.202202108799105, + "grad_norm": NaN, + "learning_rate": 1.3726242300586988e-05, + "loss": 0.0, + "step": 55752 + }, + { + "epoch": 5.202295418493981, + "grad_norm": NaN, + "learning_rate": 1.3723081546709957e-05, + "loss": 0.0, + "step": 55753 + }, + { + "epoch": 5.202388728188859, + "grad_norm": NaN, + "learning_rate": 1.3719921139344765e-05, + "loss": 0.0, + "step": 55754 + }, + { + "epoch": 5.202482037883736, + "grad_norm": NaN, + "learning_rate": 1.371676107849945e-05, + "loss": 0.0, + "step": 55755 + }, + { + "epoch": 5.202575347578613, + "grad_norm": NaN, + "learning_rate": 1.3713601364182025e-05, + "loss": 0.0, + "step": 55756 + }, + { + "epoch": 5.202668657273491, + "grad_norm": NaN, + "learning_rate": 1.371044199640055e-05, + "loss": 0.0, + "step": 55757 + }, + { + "epoch": 5.202761966968368, + "grad_norm": NaN, + "learning_rate": 1.3707282975163053e-05, + "loss": 0.0, + "step": 55758 + }, + { + "epoch": 5.202855276663246, + "grad_norm": NaN, + "learning_rate": 1.3704124300477559e-05, + "loss": 0.0, + "step": 55759 + }, + { + "epoch": 5.202948586358122, + "grad_norm": NaN, + "learning_rate": 1.3700965972352096e-05, + "loss": 0.0, + "step": 55760 + }, + { + "epoch": 5.203041896053, + "grad_norm": NaN, + "learning_rate": 1.3697807990794707e-05, + "loss": 0.0, + "step": 55761 + }, + { + "epoch": 5.203135205747877, + "grad_norm": NaN, + "learning_rate": 1.369465035581342e-05, + "loss": 0.0, + "step": 55762 + }, + { + "epoch": 5.203228515442754, + "grad_norm": NaN, + "learning_rate": 1.3691493067416259e-05, + "loss": 0.0, + "step": 55763 + }, + { + "epoch": 5.203321825137632, + "grad_norm": NaN, + "learning_rate": 1.3688336125611255e-05, + "loss": 0.0, + "step": 55764 + }, + { + "epoch": 5.203415134832509, + "grad_norm": NaN, + "learning_rate": 1.3685179530406431e-05, + "loss": 0.0, + "step": 55765 + }, + { + "epoch": 5.203508444527387, + "grad_norm": NaN, + "learning_rate": 1.3682023281809818e-05, + "loss": 0.0, + "step": 55766 + }, + { + "epoch": 5.203601754222264, + "grad_norm": NaN, + "learning_rate": 1.3678867379829438e-05, + "loss": 0.0, + "step": 55767 + }, + { + "epoch": 5.203695063917141, + "grad_norm": NaN, + "learning_rate": 1.3675711824473306e-05, + "loss": 0.0, + "step": 55768 + }, + { + "epoch": 5.203788373612018, + "grad_norm": NaN, + "learning_rate": 1.3672556615749464e-05, + "loss": 0.0, + "step": 55769 + }, + { + "epoch": 5.2038816833068955, + "grad_norm": NaN, + "learning_rate": 1.3669401753665938e-05, + "loss": 0.0, + "step": 55770 + }, + { + "epoch": 5.203974993001773, + "grad_norm": NaN, + "learning_rate": 1.3666247238230737e-05, + "loss": 0.0, + "step": 55771 + }, + { + "epoch": 5.20406830269665, + "grad_norm": NaN, + "learning_rate": 1.3663093069451874e-05, + "loss": 0.0, + "step": 55772 + }, + { + "epoch": 5.204161612391528, + "grad_norm": NaN, + "learning_rate": 1.3659939247337376e-05, + "loss": 0.0, + "step": 55773 + }, + { + "epoch": 5.204254922086405, + "grad_norm": NaN, + "learning_rate": 1.3656785771895284e-05, + "loss": 0.0, + "step": 55774 + }, + { + "epoch": 5.204348231781282, + "grad_norm": NaN, + "learning_rate": 1.3653632643133577e-05, + "loss": 0.0, + "step": 55775 + }, + { + "epoch": 5.204441541476159, + "grad_norm": NaN, + "learning_rate": 1.3650479861060314e-05, + "loss": 0.0, + "step": 55776 + }, + { + "epoch": 5.2045348511710365, + "grad_norm": NaN, + "learning_rate": 1.3647327425683475e-05, + "loss": 0.0, + "step": 55777 + }, + { + "epoch": 5.204628160865914, + "grad_norm": NaN, + "learning_rate": 1.3644175337011098e-05, + "loss": 0.0, + "step": 55778 + }, + { + "epoch": 5.204721470560791, + "grad_norm": NaN, + "learning_rate": 1.3641023595051198e-05, + "loss": 0.0, + "step": 55779 + }, + { + "epoch": 5.204814780255669, + "grad_norm": NaN, + "learning_rate": 1.3637872199811768e-05, + "loss": 0.0, + "step": 55780 + }, + { + "epoch": 5.204908089950546, + "grad_norm": NaN, + "learning_rate": 1.3634721151300848e-05, + "loss": 0.0, + "step": 55781 + }, + { + "epoch": 5.205001399645423, + "grad_norm": NaN, + "learning_rate": 1.3631570449526436e-05, + "loss": 0.0, + "step": 55782 + }, + { + "epoch": 5.2050947093403, + "grad_norm": NaN, + "learning_rate": 1.362842009449654e-05, + "loss": 0.0, + "step": 55783 + }, + { + "epoch": 5.205188019035178, + "grad_norm": NaN, + "learning_rate": 1.362527008621917e-05, + "loss": 0.0, + "step": 55784 + }, + { + "epoch": 5.205281328730055, + "grad_norm": NaN, + "learning_rate": 1.3622120424702355e-05, + "loss": 0.0, + "step": 55785 + }, + { + "epoch": 5.205374638424932, + "grad_norm": NaN, + "learning_rate": 1.3618971109954086e-05, + "loss": 0.0, + "step": 55786 + }, + { + "epoch": 5.20546794811981, + "grad_norm": NaN, + "learning_rate": 1.3615822141982374e-05, + "loss": 0.0, + "step": 55787 + }, + { + "epoch": 5.205561257814687, + "grad_norm": NaN, + "learning_rate": 1.3612673520795215e-05, + "loss": 0.0, + "step": 55788 + }, + { + "epoch": 5.205654567509564, + "grad_norm": NaN, + "learning_rate": 1.3609525246400632e-05, + "loss": 0.0, + "step": 55789 + }, + { + "epoch": 5.205747877204441, + "grad_norm": NaN, + "learning_rate": 1.3606377318806606e-05, + "loss": 0.0, + "step": 55790 + }, + { + "epoch": 5.205841186899319, + "grad_norm": NaN, + "learning_rate": 1.360322973802121e-05, + "loss": 0.0, + "step": 55791 + }, + { + "epoch": 5.205934496594196, + "grad_norm": NaN, + "learning_rate": 1.3600082504052373e-05, + "loss": 0.0, + "step": 55792 + }, + { + "epoch": 5.2060278062890735, + "grad_norm": NaN, + "learning_rate": 1.359693561690809e-05, + "loss": 0.0, + "step": 55793 + }, + { + "epoch": 5.206121115983951, + "grad_norm": NaN, + "learning_rate": 1.3593789076596451e-05, + "loss": 0.0, + "step": 55794 + }, + { + "epoch": 5.206214425678828, + "grad_norm": NaN, + "learning_rate": 1.3590642883125368e-05, + "loss": 0.0, + "step": 55795 + }, + { + "epoch": 5.206307735373706, + "grad_norm": NaN, + "learning_rate": 1.3587497036502837e-05, + "loss": 0.0, + "step": 55796 + }, + { + "epoch": 5.206401045068582, + "grad_norm": NaN, + "learning_rate": 1.3584351536736965e-05, + "loss": 0.0, + "step": 55797 + }, + { + "epoch": 5.20649435476346, + "grad_norm": NaN, + "learning_rate": 1.3581206383835646e-05, + "loss": 0.0, + "step": 55798 + }, + { + "epoch": 5.206587664458337, + "grad_norm": NaN, + "learning_rate": 1.357806157780691e-05, + "loss": 0.0, + "step": 55799 + }, + { + "epoch": 5.2066809741532145, + "grad_norm": NaN, + "learning_rate": 1.3574917118658746e-05, + "loss": 0.0, + "step": 55800 + }, + { + "epoch": 5.206774283848092, + "grad_norm": NaN, + "learning_rate": 1.357177300639915e-05, + "loss": 0.0, + "step": 55801 + }, + { + "epoch": 5.206867593542969, + "grad_norm": NaN, + "learning_rate": 1.3568629241036132e-05, + "loss": 0.0, + "step": 55802 + }, + { + "epoch": 5.206960903237847, + "grad_norm": NaN, + "learning_rate": 1.3565485822577671e-05, + "loss": 0.0, + "step": 55803 + }, + { + "epoch": 5.207054212932723, + "grad_norm": NaN, + "learning_rate": 1.3562342751031773e-05, + "loss": 0.0, + "step": 55804 + }, + { + "epoch": 5.207147522627601, + "grad_norm": NaN, + "learning_rate": 1.3559200026406419e-05, + "loss": 0.0, + "step": 55805 + }, + { + "epoch": 5.207240832322478, + "grad_norm": NaN, + "learning_rate": 1.3556057648709601e-05, + "loss": 0.0, + "step": 55806 + }, + { + "epoch": 5.207334142017356, + "grad_norm": NaN, + "learning_rate": 1.3552915617949312e-05, + "loss": 0.0, + "step": 55807 + }, + { + "epoch": 5.207427451712233, + "grad_norm": NaN, + "learning_rate": 1.3549773934133528e-05, + "loss": 0.0, + "step": 55808 + }, + { + "epoch": 5.20752076140711, + "grad_norm": NaN, + "learning_rate": 1.3546632597270262e-05, + "loss": 0.0, + "step": 55809 + }, + { + "epoch": 5.207614071101988, + "grad_norm": NaN, + "learning_rate": 1.3543491607367491e-05, + "loss": 0.0, + "step": 55810 + }, + { + "epoch": 5.207707380796865, + "grad_norm": NaN, + "learning_rate": 1.354035096443319e-05, + "loss": 0.0, + "step": 55811 + }, + { + "epoch": 5.207800690491742, + "grad_norm": NaN, + "learning_rate": 1.3537210668475368e-05, + "loss": 0.0, + "step": 55812 + }, + { + "epoch": 5.207894000186619, + "grad_norm": NaN, + "learning_rate": 1.3534070719501972e-05, + "loss": 0.0, + "step": 55813 + }, + { + "epoch": 5.207987309881497, + "grad_norm": NaN, + "learning_rate": 1.3530931117521027e-05, + "loss": 0.0, + "step": 55814 + }, + { + "epoch": 5.208080619576374, + "grad_norm": NaN, + "learning_rate": 1.3527791862540494e-05, + "loss": 0.0, + "step": 55815 + }, + { + "epoch": 5.2081739292712514, + "grad_norm": NaN, + "learning_rate": 1.3524652954568366e-05, + "loss": 0.0, + "step": 55816 + }, + { + "epoch": 5.208267238966129, + "grad_norm": NaN, + "learning_rate": 1.352151439361262e-05, + "loss": 0.0, + "step": 55817 + }, + { + "epoch": 5.208360548661006, + "grad_norm": NaN, + "learning_rate": 1.3518376179681233e-05, + "loss": 0.0, + "step": 55818 + }, + { + "epoch": 5.208453858355883, + "grad_norm": NaN, + "learning_rate": 1.3515238312782184e-05, + "loss": 0.0, + "step": 55819 + }, + { + "epoch": 5.20854716805076, + "grad_norm": NaN, + "learning_rate": 1.3512100792923446e-05, + "loss": 0.0, + "step": 55820 + }, + { + "epoch": 5.208640477745638, + "grad_norm": NaN, + "learning_rate": 1.3508963620113017e-05, + "loss": 0.0, + "step": 55821 + }, + { + "epoch": 5.208733787440515, + "grad_norm": NaN, + "learning_rate": 1.3505826794358853e-05, + "loss": 0.0, + "step": 55822 + }, + { + "epoch": 5.2088270971353925, + "grad_norm": NaN, + "learning_rate": 1.3502690315668952e-05, + "loss": 0.0, + "step": 55823 + }, + { + "epoch": 5.20892040683027, + "grad_norm": NaN, + "learning_rate": 1.349955418405127e-05, + "loss": 0.0, + "step": 55824 + }, + { + "epoch": 5.209013716525147, + "grad_norm": NaN, + "learning_rate": 1.3496418399513753e-05, + "loss": 0.0, + "step": 55825 + }, + { + "epoch": 5.209107026220024, + "grad_norm": NaN, + "learning_rate": 1.349328296206446e-05, + "loss": 0.0, + "step": 55826 + }, + { + "epoch": 5.209200335914901, + "grad_norm": NaN, + "learning_rate": 1.3490147871711305e-05, + "loss": 0.0, + "step": 55827 + }, + { + "epoch": 5.209293645609779, + "grad_norm": NaN, + "learning_rate": 1.3487013128462227e-05, + "loss": 0.0, + "step": 55828 + }, + { + "epoch": 5.209386955304656, + "grad_norm": NaN, + "learning_rate": 1.3483878732325287e-05, + "loss": 0.0, + "step": 55829 + }, + { + "epoch": 5.2094802649995335, + "grad_norm": NaN, + "learning_rate": 1.3480744683308382e-05, + "loss": 0.0, + "step": 55830 + }, + { + "epoch": 5.209573574694411, + "grad_norm": NaN, + "learning_rate": 1.3477610981419485e-05, + "loss": 0.0, + "step": 55831 + }, + { + "epoch": 5.209666884389288, + "grad_norm": NaN, + "learning_rate": 1.3474477626666625e-05, + "loss": 0.0, + "step": 55832 + }, + { + "epoch": 5.209760194084165, + "grad_norm": NaN, + "learning_rate": 1.3471344619057695e-05, + "loss": 0.0, + "step": 55833 + }, + { + "epoch": 5.209853503779042, + "grad_norm": NaN, + "learning_rate": 1.3468211958600672e-05, + "loss": 0.0, + "step": 55834 + }, + { + "epoch": 5.20994681347392, + "grad_norm": NaN, + "learning_rate": 1.3465079645303601e-05, + "loss": 0.0, + "step": 55835 + }, + { + "epoch": 5.210040123168797, + "grad_norm": NaN, + "learning_rate": 1.3461947679174357e-05, + "loss": 0.0, + "step": 55836 + }, + { + "epoch": 5.210133432863675, + "grad_norm": NaN, + "learning_rate": 1.3458816060220901e-05, + "loss": 0.0, + "step": 55837 + }, + { + "epoch": 5.210226742558552, + "grad_norm": NaN, + "learning_rate": 1.3455684788451293e-05, + "loss": 0.0, + "step": 55838 + }, + { + "epoch": 5.210320052253429, + "grad_norm": NaN, + "learning_rate": 1.3452553863873394e-05, + "loss": 0.0, + "step": 55839 + }, + { + "epoch": 5.210413361948307, + "grad_norm": NaN, + "learning_rate": 1.3449423286495181e-05, + "loss": 0.0, + "step": 55840 + }, + { + "epoch": 5.210506671643183, + "grad_norm": NaN, + "learning_rate": 1.3446293056324697e-05, + "loss": 0.0, + "step": 55841 + }, + { + "epoch": 5.210599981338061, + "grad_norm": NaN, + "learning_rate": 1.3443163173369803e-05, + "loss": 0.0, + "step": 55842 + }, + { + "epoch": 5.210693291032938, + "grad_norm": NaN, + "learning_rate": 1.3440033637638509e-05, + "loss": 0.0, + "step": 55843 + }, + { + "epoch": 5.210786600727816, + "grad_norm": NaN, + "learning_rate": 1.3436904449138742e-05, + "loss": 0.0, + "step": 55844 + }, + { + "epoch": 5.210879910422693, + "grad_norm": NaN, + "learning_rate": 1.3433775607878477e-05, + "loss": 0.0, + "step": 55845 + }, + { + "epoch": 5.2109732201175705, + "grad_norm": NaN, + "learning_rate": 1.3430647113865662e-05, + "loss": 0.0, + "step": 55846 + }, + { + "epoch": 5.211066529812448, + "grad_norm": NaN, + "learning_rate": 1.3427518967108269e-05, + "loss": 0.0, + "step": 55847 + }, + { + "epoch": 5.211159839507324, + "grad_norm": NaN, + "learning_rate": 1.3424391167614229e-05, + "loss": 0.0, + "step": 55848 + }, + { + "epoch": 5.211253149202202, + "grad_norm": NaN, + "learning_rate": 1.34212637153915e-05, + "loss": 0.0, + "step": 55849 + }, + { + "epoch": 5.211346458897079, + "grad_norm": NaN, + "learning_rate": 1.3418136610448061e-05, + "loss": 0.0, + "step": 55850 + }, + { + "epoch": 5.211439768591957, + "grad_norm": NaN, + "learning_rate": 1.341500985279182e-05, + "loss": 0.0, + "step": 55851 + }, + { + "epoch": 5.211533078286834, + "grad_norm": NaN, + "learning_rate": 1.3411883442430755e-05, + "loss": 0.0, + "step": 55852 + }, + { + "epoch": 5.2116263879817115, + "grad_norm": NaN, + "learning_rate": 1.3408757379372825e-05, + "loss": 0.0, + "step": 55853 + }, + { + "epoch": 5.211719697676589, + "grad_norm": NaN, + "learning_rate": 1.3405631663625942e-05, + "loss": 0.0, + "step": 55854 + }, + { + "epoch": 5.2118130073714655, + "grad_norm": NaN, + "learning_rate": 1.3402506295198084e-05, + "loss": 0.0, + "step": 55855 + }, + { + "epoch": 5.211906317066343, + "grad_norm": NaN, + "learning_rate": 1.3399381274097192e-05, + "loss": 0.0, + "step": 55856 + }, + { + "epoch": 5.21199962676122, + "grad_norm": NaN, + "learning_rate": 1.3396256600331212e-05, + "loss": 0.0, + "step": 55857 + }, + { + "epoch": 5.212092936456098, + "grad_norm": NaN, + "learning_rate": 1.3393132273908086e-05, + "loss": 0.0, + "step": 55858 + }, + { + "epoch": 5.212186246150975, + "grad_norm": NaN, + "learning_rate": 1.3390008294835758e-05, + "loss": 0.0, + "step": 55859 + }, + { + "epoch": 5.212279555845853, + "grad_norm": NaN, + "learning_rate": 1.3386884663122172e-05, + "loss": 0.0, + "step": 55860 + }, + { + "epoch": 5.21237286554073, + "grad_norm": NaN, + "learning_rate": 1.338376137877527e-05, + "loss": 0.0, + "step": 55861 + }, + { + "epoch": 5.212466175235607, + "grad_norm": NaN, + "learning_rate": 1.3380638441802982e-05, + "loss": 0.0, + "step": 55862 + }, + { + "epoch": 5.212559484930484, + "grad_norm": NaN, + "learning_rate": 1.337751585221325e-05, + "loss": 0.0, + "step": 55863 + }, + { + "epoch": 5.212652794625361, + "grad_norm": NaN, + "learning_rate": 1.3374393610014066e-05, + "loss": 0.0, + "step": 55864 + }, + { + "epoch": 5.212746104320239, + "grad_norm": NaN, + "learning_rate": 1.337127171521331e-05, + "loss": 0.0, + "step": 55865 + }, + { + "epoch": 5.212839414015116, + "grad_norm": NaN, + "learning_rate": 1.336815016781891e-05, + "loss": 0.0, + "step": 55866 + }, + { + "epoch": 5.212932723709994, + "grad_norm": NaN, + "learning_rate": 1.3365028967838887e-05, + "loss": 0.0, + "step": 55867 + }, + { + "epoch": 5.213026033404871, + "grad_norm": NaN, + "learning_rate": 1.336190811528109e-05, + "loss": 0.0, + "step": 55868 + }, + { + "epoch": 5.2131193430997484, + "grad_norm": NaN, + "learning_rate": 1.3358787610153459e-05, + "loss": 0.0, + "step": 55869 + }, + { + "epoch": 5.213212652794625, + "grad_norm": NaN, + "learning_rate": 1.3355667452464009e-05, + "loss": 0.0, + "step": 55870 + }, + { + "epoch": 5.213305962489502, + "grad_norm": NaN, + "learning_rate": 1.3352547642220579e-05, + "loss": 0.0, + "step": 55871 + }, + { + "epoch": 5.21339927218438, + "grad_norm": NaN, + "learning_rate": 1.3349428179431132e-05, + "loss": 0.0, + "step": 55872 + }, + { + "epoch": 5.213492581879257, + "grad_norm": NaN, + "learning_rate": 1.3346309064103661e-05, + "loss": 0.0, + "step": 55873 + }, + { + "epoch": 5.213585891574135, + "grad_norm": NaN, + "learning_rate": 1.3343190296246027e-05, + "loss": 0.0, + "step": 55874 + }, + { + "epoch": 5.213679201269012, + "grad_norm": NaN, + "learning_rate": 1.3340071875866137e-05, + "loss": 0.0, + "step": 55875 + }, + { + "epoch": 5.2137725109638895, + "grad_norm": NaN, + "learning_rate": 1.3336953802972022e-05, + "loss": 0.0, + "step": 55876 + }, + { + "epoch": 5.213865820658766, + "grad_norm": NaN, + "learning_rate": 1.3333836077571524e-05, + "loss": 0.0, + "step": 55877 + }, + { + "epoch": 5.2139591303536434, + "grad_norm": NaN, + "learning_rate": 1.333071869967257e-05, + "loss": 0.0, + "step": 55878 + }, + { + "epoch": 5.214052440048521, + "grad_norm": NaN, + "learning_rate": 1.3327601669283172e-05, + "loss": 0.0, + "step": 55879 + }, + { + "epoch": 5.214145749743398, + "grad_norm": NaN, + "learning_rate": 1.332448498641117e-05, + "loss": 0.0, + "step": 55880 + }, + { + "epoch": 5.214239059438276, + "grad_norm": NaN, + "learning_rate": 1.3321368651064496e-05, + "loss": 0.0, + "step": 55881 + }, + { + "epoch": 5.214332369133153, + "grad_norm": NaN, + "learning_rate": 1.331825266325114e-05, + "loss": 0.0, + "step": 55882 + }, + { + "epoch": 5.2144256788280305, + "grad_norm": NaN, + "learning_rate": 1.3315137022978945e-05, + "loss": 0.0, + "step": 55883 + }, + { + "epoch": 5.214518988522908, + "grad_norm": NaN, + "learning_rate": 1.3312021730255857e-05, + "loss": 0.0, + "step": 55884 + }, + { + "epoch": 5.2146122982177845, + "grad_norm": NaN, + "learning_rate": 1.3308906785089852e-05, + "loss": 0.0, + "step": 55885 + }, + { + "epoch": 5.214705607912662, + "grad_norm": NaN, + "learning_rate": 1.3305792187488773e-05, + "loss": 0.0, + "step": 55886 + }, + { + "epoch": 5.214798917607539, + "grad_norm": NaN, + "learning_rate": 1.3302677937460581e-05, + "loss": 0.0, + "step": 55887 + }, + { + "epoch": 5.214892227302417, + "grad_norm": NaN, + "learning_rate": 1.3299564035013188e-05, + "loss": 0.0, + "step": 55888 + }, + { + "epoch": 5.214985536997294, + "grad_norm": NaN, + "learning_rate": 1.3296450480154503e-05, + "loss": 0.0, + "step": 55889 + }, + { + "epoch": 5.215078846692172, + "grad_norm": NaN, + "learning_rate": 1.3293337272892452e-05, + "loss": 0.0, + "step": 55890 + }, + { + "epoch": 5.215172156387049, + "grad_norm": NaN, + "learning_rate": 1.3290224413234962e-05, + "loss": 0.0, + "step": 55891 + }, + { + "epoch": 5.2152654660819255, + "grad_norm": NaN, + "learning_rate": 1.3287111901189912e-05, + "loss": 0.0, + "step": 55892 + }, + { + "epoch": 5.215358775776803, + "grad_norm": NaN, + "learning_rate": 1.328399973676526e-05, + "loss": 0.0, + "step": 55893 + }, + { + "epoch": 5.21545208547168, + "grad_norm": NaN, + "learning_rate": 1.3280887919968886e-05, + "loss": 0.0, + "step": 55894 + }, + { + "epoch": 5.215545395166558, + "grad_norm": NaN, + "learning_rate": 1.3277776450808713e-05, + "loss": 0.0, + "step": 55895 + }, + { + "epoch": 5.215638704861435, + "grad_norm": NaN, + "learning_rate": 1.3274665329292655e-05, + "loss": 0.0, + "step": 55896 + }, + { + "epoch": 5.215732014556313, + "grad_norm": NaN, + "learning_rate": 1.3271554555428637e-05, + "loss": 0.0, + "step": 55897 + }, + { + "epoch": 5.21582532425119, + "grad_norm": NaN, + "learning_rate": 1.3268444129224538e-05, + "loss": 0.0, + "step": 55898 + }, + { + "epoch": 5.215918633946067, + "grad_norm": NaN, + "learning_rate": 1.3265334050688298e-05, + "loss": 0.0, + "step": 55899 + }, + { + "epoch": 5.216011943640944, + "grad_norm": NaN, + "learning_rate": 1.3262224319827796e-05, + "loss": 0.0, + "step": 55900 + }, + { + "epoch": 5.216105253335821, + "grad_norm": NaN, + "learning_rate": 1.3259114936650927e-05, + "loss": 0.0, + "step": 55901 + }, + { + "epoch": 5.216198563030699, + "grad_norm": NaN, + "learning_rate": 1.3256005901165685e-05, + "loss": 0.0, + "step": 55902 + }, + { + "epoch": 5.216291872725576, + "grad_norm": NaN, + "learning_rate": 1.3252897213379876e-05, + "loss": 0.0, + "step": 55903 + }, + { + "epoch": 5.216385182420454, + "grad_norm": NaN, + "learning_rate": 1.3249788873301415e-05, + "loss": 0.0, + "step": 55904 + }, + { + "epoch": 5.216478492115331, + "grad_norm": NaN, + "learning_rate": 1.3246680880938292e-05, + "loss": 0.0, + "step": 55905 + }, + { + "epoch": 5.216571801810208, + "grad_norm": NaN, + "learning_rate": 1.3243573236298322e-05, + "loss": 0.0, + "step": 55906 + }, + { + "epoch": 5.216665111505085, + "grad_norm": NaN, + "learning_rate": 1.324046593938941e-05, + "loss": 0.0, + "step": 55907 + }, + { + "epoch": 5.2167584211999625, + "grad_norm": NaN, + "learning_rate": 1.3237358990219538e-05, + "loss": 0.0, + "step": 55908 + }, + { + "epoch": 5.21685173089484, + "grad_norm": NaN, + "learning_rate": 1.3234252388796512e-05, + "loss": 0.0, + "step": 55909 + }, + { + "epoch": 5.216945040589717, + "grad_norm": NaN, + "learning_rate": 1.3231146135128245e-05, + "loss": 0.0, + "step": 55910 + }, + { + "epoch": 5.217038350284595, + "grad_norm": NaN, + "learning_rate": 1.322804022922273e-05, + "loss": 0.0, + "step": 55911 + }, + { + "epoch": 5.217131659979472, + "grad_norm": NaN, + "learning_rate": 1.3224934671087745e-05, + "loss": 0.0, + "step": 55912 + }, + { + "epoch": 5.21722496967435, + "grad_norm": NaN, + "learning_rate": 1.322182946073123e-05, + "loss": 0.0, + "step": 55913 + }, + { + "epoch": 5.217318279369226, + "grad_norm": NaN, + "learning_rate": 1.3218724598161118e-05, + "loss": 0.0, + "step": 55914 + }, + { + "epoch": 5.2174115890641035, + "grad_norm": NaN, + "learning_rate": 1.3215620083385265e-05, + "loss": 0.0, + "step": 55915 + }, + { + "epoch": 5.217504898758981, + "grad_norm": NaN, + "learning_rate": 1.321251591641153e-05, + "loss": 0.0, + "step": 55916 + }, + { + "epoch": 5.217598208453858, + "grad_norm": NaN, + "learning_rate": 1.3209412097247912e-05, + "loss": 0.0, + "step": 55917 + }, + { + "epoch": 5.217691518148736, + "grad_norm": NaN, + "learning_rate": 1.32063086259022e-05, + "loss": 0.0, + "step": 55918 + }, + { + "epoch": 5.217784827843613, + "grad_norm": NaN, + "learning_rate": 1.3203205502382308e-05, + "loss": 0.0, + "step": 55919 + }, + { + "epoch": 5.217878137538491, + "grad_norm": NaN, + "learning_rate": 1.3200102726696177e-05, + "loss": 0.0, + "step": 55920 + }, + { + "epoch": 5.217971447233367, + "grad_norm": NaN, + "learning_rate": 1.3197000298851651e-05, + "loss": 0.0, + "step": 55921 + }, + { + "epoch": 5.218064756928245, + "grad_norm": NaN, + "learning_rate": 1.3193898218856607e-05, + "loss": 0.0, + "step": 55922 + }, + { + "epoch": 5.218158066623122, + "grad_norm": NaN, + "learning_rate": 1.319079648671899e-05, + "loss": 0.0, + "step": 55923 + }, + { + "epoch": 5.218251376317999, + "grad_norm": NaN, + "learning_rate": 1.3187695102446643e-05, + "loss": 0.0, + "step": 55924 + }, + { + "epoch": 5.218344686012877, + "grad_norm": NaN, + "learning_rate": 1.3184594066047427e-05, + "loss": 0.0, + "step": 55925 + }, + { + "epoch": 5.218437995707754, + "grad_norm": NaN, + "learning_rate": 1.3181493377529317e-05, + "loss": 0.0, + "step": 55926 + }, + { + "epoch": 5.218531305402632, + "grad_norm": NaN, + "learning_rate": 1.317839303690011e-05, + "loss": 0.0, + "step": 55927 + }, + { + "epoch": 5.218624615097509, + "grad_norm": NaN, + "learning_rate": 1.3175293044167694e-05, + "loss": 0.0, + "step": 55928 + }, + { + "epoch": 5.218717924792386, + "grad_norm": NaN, + "learning_rate": 1.3172193399340036e-05, + "loss": 0.0, + "step": 55929 + }, + { + "epoch": 5.218811234487263, + "grad_norm": NaN, + "learning_rate": 1.3169094102424926e-05, + "loss": 0.0, + "step": 55930 + }, + { + "epoch": 5.2189045441821404, + "grad_norm": NaN, + "learning_rate": 1.3165995153430276e-05, + "loss": 0.0, + "step": 55931 + }, + { + "epoch": 5.218997853877018, + "grad_norm": NaN, + "learning_rate": 1.3162896552363978e-05, + "loss": 0.0, + "step": 55932 + }, + { + "epoch": 5.219091163571895, + "grad_norm": NaN, + "learning_rate": 1.3159798299233876e-05, + "loss": 0.0, + "step": 55933 + }, + { + "epoch": 5.219184473266773, + "grad_norm": NaN, + "learning_rate": 1.3156700394047898e-05, + "loss": 0.0, + "step": 55934 + }, + { + "epoch": 5.21927778296165, + "grad_norm": NaN, + "learning_rate": 1.3153602836813887e-05, + "loss": 0.0, + "step": 55935 + }, + { + "epoch": 5.219371092656527, + "grad_norm": NaN, + "learning_rate": 1.315050562753972e-05, + "loss": 0.0, + "step": 55936 + }, + { + "epoch": 5.219464402351404, + "grad_norm": NaN, + "learning_rate": 1.3147408766233276e-05, + "loss": 0.0, + "step": 55937 + }, + { + "epoch": 5.2195577120462815, + "grad_norm": NaN, + "learning_rate": 1.3144312252902445e-05, + "loss": 0.0, + "step": 55938 + }, + { + "epoch": 5.219651021741159, + "grad_norm": NaN, + "learning_rate": 1.3141216087555056e-05, + "loss": 0.0, + "step": 55939 + }, + { + "epoch": 5.219744331436036, + "grad_norm": NaN, + "learning_rate": 1.313812027019907e-05, + "loss": 0.0, + "step": 55940 + }, + { + "epoch": 5.219837641130914, + "grad_norm": NaN, + "learning_rate": 1.3135024800842282e-05, + "loss": 0.0, + "step": 55941 + }, + { + "epoch": 5.219930950825791, + "grad_norm": NaN, + "learning_rate": 1.3131929679492547e-05, + "loss": 0.0, + "step": 55942 + }, + { + "epoch": 5.220024260520668, + "grad_norm": NaN, + "learning_rate": 1.312883490615783e-05, + "loss": 0.0, + "step": 55943 + }, + { + "epoch": 5.220117570215545, + "grad_norm": NaN, + "learning_rate": 1.312574048084591e-05, + "loss": 0.0, + "step": 55944 + }, + { + "epoch": 5.2202108799104225, + "grad_norm": NaN, + "learning_rate": 1.3122646403564658e-05, + "loss": 0.0, + "step": 55945 + }, + { + "epoch": 5.2203041896053, + "grad_norm": NaN, + "learning_rate": 1.311955267432202e-05, + "loss": 0.0, + "step": 55946 + }, + { + "epoch": 5.220397499300177, + "grad_norm": NaN, + "learning_rate": 1.3116459293125809e-05, + "loss": 0.0, + "step": 55947 + }, + { + "epoch": 5.220490808995055, + "grad_norm": NaN, + "learning_rate": 1.3113366259983848e-05, + "loss": 0.0, + "step": 55948 + }, + { + "epoch": 5.220584118689932, + "grad_norm": NaN, + "learning_rate": 1.31102735749041e-05, + "loss": 0.0, + "step": 55949 + }, + { + "epoch": 5.220677428384809, + "grad_norm": NaN, + "learning_rate": 1.310718123789436e-05, + "loss": 0.0, + "step": 55950 + }, + { + "epoch": 5.220770738079686, + "grad_norm": NaN, + "learning_rate": 1.3104089248962469e-05, + "loss": 0.0, + "step": 55951 + }, + { + "epoch": 5.220864047774564, + "grad_norm": NaN, + "learning_rate": 1.3100997608116387e-05, + "loss": 0.0, + "step": 55952 + }, + { + "epoch": 5.220957357469441, + "grad_norm": NaN, + "learning_rate": 1.3097906315363894e-05, + "loss": 0.0, + "step": 55953 + }, + { + "epoch": 5.221050667164318, + "grad_norm": NaN, + "learning_rate": 1.3094815370712846e-05, + "loss": 0.0, + "step": 55954 + }, + { + "epoch": 5.221143976859196, + "grad_norm": NaN, + "learning_rate": 1.3091724774171176e-05, + "loss": 0.0, + "step": 55955 + }, + { + "epoch": 5.221237286554073, + "grad_norm": NaN, + "learning_rate": 1.3088634525746672e-05, + "loss": 0.0, + "step": 55956 + }, + { + "epoch": 5.221330596248951, + "grad_norm": NaN, + "learning_rate": 1.308554462544718e-05, + "loss": 0.0, + "step": 55957 + }, + { + "epoch": 5.221423905943827, + "grad_norm": NaN, + "learning_rate": 1.3082455073280646e-05, + "loss": 0.0, + "step": 55958 + }, + { + "epoch": 5.221517215638705, + "grad_norm": NaN, + "learning_rate": 1.3079365869254843e-05, + "loss": 0.0, + "step": 55959 + }, + { + "epoch": 5.221610525333582, + "grad_norm": NaN, + "learning_rate": 1.3076277013377634e-05, + "loss": 0.0, + "step": 55960 + }, + { + "epoch": 5.2217038350284595, + "grad_norm": NaN, + "learning_rate": 1.3073188505656946e-05, + "loss": 0.0, + "step": 55961 + }, + { + "epoch": 5.221797144723337, + "grad_norm": NaN, + "learning_rate": 1.3070100346100538e-05, + "loss": 0.0, + "step": 55962 + }, + { + "epoch": 5.221890454418214, + "grad_norm": NaN, + "learning_rate": 1.3067012534716287e-05, + "loss": 0.0, + "step": 55963 + }, + { + "epoch": 5.221983764113092, + "grad_norm": NaN, + "learning_rate": 1.306392507151212e-05, + "loss": 0.0, + "step": 55964 + }, + { + "epoch": 5.222077073807968, + "grad_norm": NaN, + "learning_rate": 1.3060837956495801e-05, + "loss": 0.0, + "step": 55965 + }, + { + "epoch": 5.222170383502846, + "grad_norm": NaN, + "learning_rate": 1.305775118967517e-05, + "loss": 0.0, + "step": 55966 + }, + { + "epoch": 5.222263693197723, + "grad_norm": NaN, + "learning_rate": 1.3054664771058172e-05, + "loss": 0.0, + "step": 55967 + }, + { + "epoch": 5.2223570028926005, + "grad_norm": NaN, + "learning_rate": 1.3051578700652566e-05, + "loss": 0.0, + "step": 55968 + }, + { + "epoch": 5.222450312587478, + "grad_norm": NaN, + "learning_rate": 1.3048492978466196e-05, + "loss": 0.0, + "step": 55969 + }, + { + "epoch": 5.222543622282355, + "grad_norm": NaN, + "learning_rate": 1.3045407604507007e-05, + "loss": 0.0, + "step": 55970 + }, + { + "epoch": 5.222636931977233, + "grad_norm": NaN, + "learning_rate": 1.3042322578782743e-05, + "loss": 0.0, + "step": 55971 + }, + { + "epoch": 5.222730241672109, + "grad_norm": NaN, + "learning_rate": 1.3039237901301264e-05, + "loss": 0.0, + "step": 55972 + }, + { + "epoch": 5.222823551366987, + "grad_norm": NaN, + "learning_rate": 1.303615357207048e-05, + "loss": 0.0, + "step": 55973 + }, + { + "epoch": 5.222916861061864, + "grad_norm": NaN, + "learning_rate": 1.303306959109815e-05, + "loss": 0.0, + "step": 55974 + }, + { + "epoch": 5.223010170756742, + "grad_norm": NaN, + "learning_rate": 1.3029985958392136e-05, + "loss": 0.0, + "step": 55975 + }, + { + "epoch": 5.223103480451619, + "grad_norm": NaN, + "learning_rate": 1.302690267396035e-05, + "loss": 0.0, + "step": 55976 + }, + { + "epoch": 5.223196790146496, + "grad_norm": NaN, + "learning_rate": 1.3023819737810514e-05, + "loss": 0.0, + "step": 55977 + }, + { + "epoch": 5.223290099841374, + "grad_norm": NaN, + "learning_rate": 1.3020737149950578e-05, + "loss": 0.0, + "step": 55978 + }, + { + "epoch": 5.223383409536251, + "grad_norm": NaN, + "learning_rate": 1.3017654910388315e-05, + "loss": 0.0, + "step": 55979 + }, + { + "epoch": 5.223476719231128, + "grad_norm": NaN, + "learning_rate": 1.3014573019131552e-05, + "loss": 0.0, + "step": 55980 + }, + { + "epoch": 5.223570028926005, + "grad_norm": NaN, + "learning_rate": 1.3011491476188202e-05, + "loss": 0.0, + "step": 55981 + }, + { + "epoch": 5.223663338620883, + "grad_norm": NaN, + "learning_rate": 1.3008410281566023e-05, + "loss": 0.0, + "step": 55982 + }, + { + "epoch": 5.22375664831576, + "grad_norm": NaN, + "learning_rate": 1.3005329435272844e-05, + "loss": 0.0, + "step": 55983 + }, + { + "epoch": 5.2238499580106375, + "grad_norm": NaN, + "learning_rate": 1.300224893731659e-05, + "loss": 0.0, + "step": 55984 + }, + { + "epoch": 5.223943267705515, + "grad_norm": NaN, + "learning_rate": 1.2999168787705021e-05, + "loss": 0.0, + "step": 55985 + }, + { + "epoch": 5.224036577400392, + "grad_norm": NaN, + "learning_rate": 1.2996088986445935e-05, + "loss": 0.0, + "step": 55986 + }, + { + "epoch": 5.224129887095269, + "grad_norm": NaN, + "learning_rate": 1.2993009533547272e-05, + "loss": 0.0, + "step": 55987 + }, + { + "epoch": 5.224223196790146, + "grad_norm": NaN, + "learning_rate": 1.2989930429016776e-05, + "loss": 0.0, + "step": 55988 + }, + { + "epoch": 5.224316506485024, + "grad_norm": NaN, + "learning_rate": 1.2986851672862275e-05, + "loss": 0.0, + "step": 55989 + }, + { + "epoch": 5.224409816179901, + "grad_norm": NaN, + "learning_rate": 1.2983773265091679e-05, + "loss": 0.0, + "step": 55990 + }, + { + "epoch": 5.2245031258747785, + "grad_norm": NaN, + "learning_rate": 1.2980695205712748e-05, + "loss": 0.0, + "step": 55991 + }, + { + "epoch": 5.224596435569656, + "grad_norm": NaN, + "learning_rate": 1.2977617494733278e-05, + "loss": 0.0, + "step": 55992 + }, + { + "epoch": 5.224689745264533, + "grad_norm": NaN, + "learning_rate": 1.2974540132161192e-05, + "loss": 0.0, + "step": 55993 + }, + { + "epoch": 5.22478305495941, + "grad_norm": NaN, + "learning_rate": 1.2971463118004222e-05, + "loss": 0.0, + "step": 55994 + }, + { + "epoch": 5.224876364654287, + "grad_norm": NaN, + "learning_rate": 1.2968386452270224e-05, + "loss": 0.0, + "step": 55995 + }, + { + "epoch": 5.224969674349165, + "grad_norm": NaN, + "learning_rate": 1.2965310134967077e-05, + "loss": 0.0, + "step": 55996 + }, + { + "epoch": 5.225062984044042, + "grad_norm": NaN, + "learning_rate": 1.2962234166102525e-05, + "loss": 0.0, + "step": 55997 + }, + { + "epoch": 5.2251562937389195, + "grad_norm": NaN, + "learning_rate": 1.2959158545684377e-05, + "loss": 0.0, + "step": 55998 + }, + { + "epoch": 5.225249603433797, + "grad_norm": NaN, + "learning_rate": 1.2956083273720563e-05, + "loss": 0.0, + "step": 55999 + }, + { + "epoch": 5.225342913128674, + "grad_norm": NaN, + "learning_rate": 1.2953008350218808e-05, + "loss": 0.0, + "step": 56000 + }, + { + "epoch": 5.225436222823552, + "grad_norm": NaN, + "learning_rate": 1.2949933775186921e-05, + "loss": 0.0, + "step": 56001 + }, + { + "epoch": 5.225529532518428, + "grad_norm": NaN, + "learning_rate": 1.2946859548632816e-05, + "loss": 0.0, + "step": 56002 + }, + { + "epoch": 5.225622842213306, + "grad_norm": NaN, + "learning_rate": 1.2943785670564216e-05, + "loss": 0.0, + "step": 56003 + }, + { + "epoch": 5.225716151908183, + "grad_norm": NaN, + "learning_rate": 1.2940712140988952e-05, + "loss": 0.0, + "step": 56004 + }, + { + "epoch": 5.225809461603061, + "grad_norm": NaN, + "learning_rate": 1.2937638959914897e-05, + "loss": 0.0, + "step": 56005 + }, + { + "epoch": 5.225902771297938, + "grad_norm": NaN, + "learning_rate": 1.2934566127349816e-05, + "loss": 0.0, + "step": 56006 + }, + { + "epoch": 5.225996080992815, + "grad_norm": NaN, + "learning_rate": 1.2931493643301499e-05, + "loss": 0.0, + "step": 56007 + }, + { + "epoch": 5.226089390687693, + "grad_norm": NaN, + "learning_rate": 1.2928421507777841e-05, + "loss": 0.0, + "step": 56008 + }, + { + "epoch": 5.226182700382569, + "grad_norm": NaN, + "learning_rate": 1.2925349720786571e-05, + "loss": 0.0, + "step": 56009 + }, + { + "epoch": 5.226276010077447, + "grad_norm": NaN, + "learning_rate": 1.2922278282335514e-05, + "loss": 0.0, + "step": 56010 + }, + { + "epoch": 5.226369319772324, + "grad_norm": NaN, + "learning_rate": 1.2919207192432546e-05, + "loss": 0.0, + "step": 56011 + }, + { + "epoch": 5.226462629467202, + "grad_norm": NaN, + "learning_rate": 1.291613645108538e-05, + "loss": 0.0, + "step": 56012 + }, + { + "epoch": 5.226555939162079, + "grad_norm": NaN, + "learning_rate": 1.2913066058301891e-05, + "loss": 0.0, + "step": 56013 + }, + { + "epoch": 5.2266492488569565, + "grad_norm": NaN, + "learning_rate": 1.2909996014089908e-05, + "loss": 0.0, + "step": 56014 + }, + { + "epoch": 5.226742558551834, + "grad_norm": NaN, + "learning_rate": 1.2906926318457123e-05, + "loss": 0.0, + "step": 56015 + }, + { + "epoch": 5.22683586824671, + "grad_norm": NaN, + "learning_rate": 1.2903856971411446e-05, + "loss": 0.0, + "step": 56016 + }, + { + "epoch": 5.226929177941588, + "grad_norm": NaN, + "learning_rate": 1.2900787972960691e-05, + "loss": 0.0, + "step": 56017 + }, + { + "epoch": 5.227022487636465, + "grad_norm": NaN, + "learning_rate": 1.2897719323112548e-05, + "loss": 0.0, + "step": 56018 + }, + { + "epoch": 5.227115797331343, + "grad_norm": NaN, + "learning_rate": 1.289465102187493e-05, + "loss": 0.0, + "step": 56019 + }, + { + "epoch": 5.22720910702622, + "grad_norm": NaN, + "learning_rate": 1.2891583069255628e-05, + "loss": 0.0, + "step": 56020 + }, + { + "epoch": 5.2273024167210975, + "grad_norm": NaN, + "learning_rate": 1.2888515465262355e-05, + "loss": 0.0, + "step": 56021 + }, + { + "epoch": 5.227395726415975, + "grad_norm": NaN, + "learning_rate": 1.2885448209903038e-05, + "loss": 0.0, + "step": 56022 + }, + { + "epoch": 5.2274890361108515, + "grad_norm": NaN, + "learning_rate": 1.2882381303185369e-05, + "loss": 0.0, + "step": 56023 + }, + { + "epoch": 5.227582345805729, + "grad_norm": NaN, + "learning_rate": 1.2879314745117174e-05, + "loss": 0.0, + "step": 56024 + }, + { + "epoch": 5.227675655500606, + "grad_norm": NaN, + "learning_rate": 1.2876248535706301e-05, + "loss": 0.0, + "step": 56025 + }, + { + "epoch": 5.227768965195484, + "grad_norm": NaN, + "learning_rate": 1.287318267496049e-05, + "loss": 0.0, + "step": 56026 + }, + { + "epoch": 5.227862274890361, + "grad_norm": NaN, + "learning_rate": 1.2870117162887538e-05, + "loss": 0.0, + "step": 56027 + }, + { + "epoch": 5.227955584585239, + "grad_norm": NaN, + "learning_rate": 1.2867051999495303e-05, + "loss": 0.0, + "step": 56028 + }, + { + "epoch": 5.228048894280116, + "grad_norm": NaN, + "learning_rate": 1.2863987184791513e-05, + "loss": 0.0, + "step": 56029 + }, + { + "epoch": 5.228142203974993, + "grad_norm": NaN, + "learning_rate": 1.2860922718783946e-05, + "loss": 0.0, + "step": 56030 + }, + { + "epoch": 5.22823551366987, + "grad_norm": NaN, + "learning_rate": 1.2857858601480475e-05, + "loss": 0.0, + "step": 56031 + }, + { + "epoch": 5.228328823364747, + "grad_norm": NaN, + "learning_rate": 1.2854794832888832e-05, + "loss": 0.0, + "step": 56032 + }, + { + "epoch": 5.228422133059625, + "grad_norm": NaN, + "learning_rate": 1.2851731413016791e-05, + "loss": 0.0, + "step": 56033 + }, + { + "epoch": 5.228515442754502, + "grad_norm": NaN, + "learning_rate": 1.2848668341872215e-05, + "loss": 0.0, + "step": 56034 + }, + { + "epoch": 5.22860875244938, + "grad_norm": NaN, + "learning_rate": 1.284560561946283e-05, + "loss": 0.0, + "step": 56035 + }, + { + "epoch": 5.228702062144257, + "grad_norm": NaN, + "learning_rate": 1.284254324579641e-05, + "loss": 0.0, + "step": 56036 + }, + { + "epoch": 5.2287953718391345, + "grad_norm": NaN, + "learning_rate": 1.2839481220880837e-05, + "loss": 0.0, + "step": 56037 + }, + { + "epoch": 5.228888681534011, + "grad_norm": NaN, + "learning_rate": 1.2836419544723786e-05, + "loss": 0.0, + "step": 56038 + }, + { + "epoch": 5.228981991228888, + "grad_norm": NaN, + "learning_rate": 1.2833358217333084e-05, + "loss": 0.0, + "step": 56039 + }, + { + "epoch": 5.229075300923766, + "grad_norm": NaN, + "learning_rate": 1.2830297238716558e-05, + "loss": 0.0, + "step": 56040 + }, + { + "epoch": 5.229168610618643, + "grad_norm": NaN, + "learning_rate": 1.2827236608881936e-05, + "loss": 0.0, + "step": 56041 + }, + { + "epoch": 5.229261920313521, + "grad_norm": NaN, + "learning_rate": 1.2824176327836993e-05, + "loss": 0.0, + "step": 56042 + }, + { + "epoch": 5.229355230008398, + "grad_norm": NaN, + "learning_rate": 1.2821116395589593e-05, + "loss": 0.0, + "step": 56043 + }, + { + "epoch": 5.2294485397032755, + "grad_norm": NaN, + "learning_rate": 1.2818056812147409e-05, + "loss": 0.0, + "step": 56044 + }, + { + "epoch": 5.229541849398153, + "grad_norm": NaN, + "learning_rate": 1.2814997577518272e-05, + "loss": 0.0, + "step": 56045 + }, + { + "epoch": 5.2296351590930295, + "grad_norm": NaN, + "learning_rate": 1.2811938691709989e-05, + "loss": 0.0, + "step": 56046 + }, + { + "epoch": 5.229728468787907, + "grad_norm": NaN, + "learning_rate": 1.2808880154730289e-05, + "loss": 0.0, + "step": 56047 + }, + { + "epoch": 5.229821778482784, + "grad_norm": NaN, + "learning_rate": 1.280582196658695e-05, + "loss": 0.0, + "step": 56048 + }, + { + "epoch": 5.229915088177662, + "grad_norm": NaN, + "learning_rate": 1.2802764127287812e-05, + "loss": 0.0, + "step": 56049 + }, + { + "epoch": 5.230008397872539, + "grad_norm": NaN, + "learning_rate": 1.2799706636840556e-05, + "loss": 0.0, + "step": 56050 + }, + { + "epoch": 5.2301017075674165, + "grad_norm": NaN, + "learning_rate": 1.2796649495253025e-05, + "loss": 0.0, + "step": 56051 + }, + { + "epoch": 5.230195017262294, + "grad_norm": NaN, + "learning_rate": 1.2793592702532996e-05, + "loss": 0.0, + "step": 56052 + }, + { + "epoch": 5.2302883269571705, + "grad_norm": NaN, + "learning_rate": 1.279053625868816e-05, + "loss": 0.0, + "step": 56053 + }, + { + "epoch": 5.230381636652048, + "grad_norm": NaN, + "learning_rate": 1.2787480163726383e-05, + "loss": 0.0, + "step": 56054 + }, + { + "epoch": 5.230474946346925, + "grad_norm": NaN, + "learning_rate": 1.278442441765542e-05, + "loss": 0.0, + "step": 56055 + }, + { + "epoch": 5.230568256041803, + "grad_norm": NaN, + "learning_rate": 1.2781369020482967e-05, + "loss": 0.0, + "step": 56056 + }, + { + "epoch": 5.23066156573668, + "grad_norm": NaN, + "learning_rate": 1.277831397221687e-05, + "loss": 0.0, + "step": 56057 + }, + { + "epoch": 5.230754875431558, + "grad_norm": NaN, + "learning_rate": 1.2775259272864902e-05, + "loss": 0.0, + "step": 56058 + }, + { + "epoch": 5.230848185126435, + "grad_norm": NaN, + "learning_rate": 1.2772204922434758e-05, + "loss": 0.0, + "step": 56059 + }, + { + "epoch": 5.2309414948213115, + "grad_norm": NaN, + "learning_rate": 1.276915092093425e-05, + "loss": 0.0, + "step": 56060 + }, + { + "epoch": 5.231034804516189, + "grad_norm": NaN, + "learning_rate": 1.276609726837119e-05, + "loss": 0.0, + "step": 56061 + }, + { + "epoch": 5.231128114211066, + "grad_norm": NaN, + "learning_rate": 1.2763043964753233e-05, + "loss": 0.0, + "step": 56062 + }, + { + "epoch": 5.231221423905944, + "grad_norm": NaN, + "learning_rate": 1.2759991010088227e-05, + "loss": 0.0, + "step": 56063 + }, + { + "epoch": 5.231314733600821, + "grad_norm": NaN, + "learning_rate": 1.2756938404383931e-05, + "loss": 0.0, + "step": 56064 + }, + { + "epoch": 5.231408043295699, + "grad_norm": NaN, + "learning_rate": 1.2753886147648041e-05, + "loss": 0.0, + "step": 56065 + }, + { + "epoch": 5.231501352990576, + "grad_norm": NaN, + "learning_rate": 1.2750834239888414e-05, + "loss": 0.0, + "step": 56066 + }, + { + "epoch": 5.231594662685453, + "grad_norm": NaN, + "learning_rate": 1.2747782681112728e-05, + "loss": 0.0, + "step": 56067 + }, + { + "epoch": 5.23168797238033, + "grad_norm": NaN, + "learning_rate": 1.2744731471328746e-05, + "loss": 0.0, + "step": 56068 + }, + { + "epoch": 5.231781282075207, + "grad_norm": NaN, + "learning_rate": 1.2741680610544325e-05, + "loss": 0.0, + "step": 56069 + }, + { + "epoch": 5.231874591770085, + "grad_norm": NaN, + "learning_rate": 1.273863009876711e-05, + "loss": 0.0, + "step": 56070 + }, + { + "epoch": 5.231967901464962, + "grad_norm": NaN, + "learning_rate": 1.2735579936004864e-05, + "loss": 0.0, + "step": 56071 + }, + { + "epoch": 5.23206121115984, + "grad_norm": NaN, + "learning_rate": 1.2732530122265444e-05, + "loss": 0.0, + "step": 56072 + }, + { + "epoch": 5.232154520854717, + "grad_norm": NaN, + "learning_rate": 1.2729480657556513e-05, + "loss": 0.0, + "step": 56073 + }, + { + "epoch": 5.2322478305495945, + "grad_norm": NaN, + "learning_rate": 1.2726431541885812e-05, + "loss": 0.0, + "step": 56074 + }, + { + "epoch": 5.232341140244471, + "grad_norm": NaN, + "learning_rate": 1.2723382775261205e-05, + "loss": 0.0, + "step": 56075 + }, + { + "epoch": 5.2324344499393485, + "grad_norm": NaN, + "learning_rate": 1.2720334357690332e-05, + "loss": 0.0, + "step": 56076 + }, + { + "epoch": 5.232527759634226, + "grad_norm": NaN, + "learning_rate": 1.2717286289180956e-05, + "loss": 0.0, + "step": 56077 + }, + { + "epoch": 5.232621069329103, + "grad_norm": NaN, + "learning_rate": 1.2714238569740903e-05, + "loss": 0.0, + "step": 56078 + }, + { + "epoch": 5.232714379023981, + "grad_norm": NaN, + "learning_rate": 1.271119119937785e-05, + "loss": 0.0, + "step": 56079 + }, + { + "epoch": 5.232807688718858, + "grad_norm": NaN, + "learning_rate": 1.2708144178099544e-05, + "loss": 0.0, + "step": 56080 + }, + { + "epoch": 5.232900998413736, + "grad_norm": NaN, + "learning_rate": 1.2705097505913824e-05, + "loss": 0.0, + "step": 56081 + }, + { + "epoch": 5.232994308108612, + "grad_norm": NaN, + "learning_rate": 1.2702051182828337e-05, + "loss": 0.0, + "step": 56082 + }, + { + "epoch": 5.2330876178034895, + "grad_norm": NaN, + "learning_rate": 1.2699005208850827e-05, + "loss": 0.0, + "step": 56083 + }, + { + "epoch": 5.233180927498367, + "grad_norm": NaN, + "learning_rate": 1.2695959583989135e-05, + "loss": 0.0, + "step": 56084 + }, + { + "epoch": 5.233274237193244, + "grad_norm": NaN, + "learning_rate": 1.2692914308250924e-05, + "loss": 0.0, + "step": 56085 + }, + { + "epoch": 5.233367546888122, + "grad_norm": NaN, + "learning_rate": 1.2689869381643936e-05, + "loss": 0.0, + "step": 56086 + }, + { + "epoch": 5.233460856582999, + "grad_norm": NaN, + "learning_rate": 1.2686824804175983e-05, + "loss": 0.0, + "step": 56087 + }, + { + "epoch": 5.233554166277877, + "grad_norm": NaN, + "learning_rate": 1.2683780575854691e-05, + "loss": 0.0, + "step": 56088 + }, + { + "epoch": 5.233647475972753, + "grad_norm": NaN, + "learning_rate": 1.2680736696687922e-05, + "loss": 0.0, + "step": 56089 + }, + { + "epoch": 5.233740785667631, + "grad_norm": NaN, + "learning_rate": 1.2677693166683366e-05, + "loss": 0.0, + "step": 56090 + }, + { + "epoch": 5.233834095362508, + "grad_norm": NaN, + "learning_rate": 1.2674649985848723e-05, + "loss": 0.0, + "step": 56091 + }, + { + "epoch": 5.233927405057385, + "grad_norm": NaN, + "learning_rate": 1.2671607154191782e-05, + "loss": 0.0, + "step": 56092 + }, + { + "epoch": 5.234020714752263, + "grad_norm": NaN, + "learning_rate": 1.2668564671720288e-05, + "loss": 0.0, + "step": 56093 + }, + { + "epoch": 5.23411402444714, + "grad_norm": NaN, + "learning_rate": 1.2665522538441919e-05, + "loss": 0.0, + "step": 56094 + }, + { + "epoch": 5.234207334142018, + "grad_norm": NaN, + "learning_rate": 1.2662480754364451e-05, + "loss": 0.0, + "step": 56095 + }, + { + "epoch": 5.234300643836894, + "grad_norm": NaN, + "learning_rate": 1.2659439319495645e-05, + "loss": 0.0, + "step": 56096 + }, + { + "epoch": 5.234393953531772, + "grad_norm": NaN, + "learning_rate": 1.2656398233843146e-05, + "loss": 0.0, + "step": 56097 + }, + { + "epoch": 5.234487263226649, + "grad_norm": NaN, + "learning_rate": 1.265335749741478e-05, + "loss": 0.0, + "step": 56098 + }, + { + "epoch": 5.2345805729215265, + "grad_norm": NaN, + "learning_rate": 1.2650317110218256e-05, + "loss": 0.0, + "step": 56099 + }, + { + "epoch": 5.234673882616404, + "grad_norm": NaN, + "learning_rate": 1.2647277072261237e-05, + "loss": 0.0, + "step": 56100 + }, + { + "epoch": 5.234767192311281, + "grad_norm": NaN, + "learning_rate": 1.2644237383551548e-05, + "loss": 0.0, + "step": 56101 + }, + { + "epoch": 5.234860502006159, + "grad_norm": NaN, + "learning_rate": 1.2641198044096884e-05, + "loss": 0.0, + "step": 56102 + }, + { + "epoch": 5.234953811701036, + "grad_norm": NaN, + "learning_rate": 1.2638159053904923e-05, + "loss": 0.0, + "step": 56103 + }, + { + "epoch": 5.235047121395913, + "grad_norm": NaN, + "learning_rate": 1.2635120412983441e-05, + "loss": 0.0, + "step": 56104 + }, + { + "epoch": 5.23514043109079, + "grad_norm": NaN, + "learning_rate": 1.2632082121340198e-05, + "loss": 0.0, + "step": 56105 + }, + { + "epoch": 5.2352337407856675, + "grad_norm": NaN, + "learning_rate": 1.2629044178982823e-05, + "loss": 0.0, + "step": 56106 + }, + { + "epoch": 5.235327050480545, + "grad_norm": NaN, + "learning_rate": 1.2626006585919124e-05, + "loss": 0.0, + "step": 56107 + }, + { + "epoch": 5.235420360175422, + "grad_norm": NaN, + "learning_rate": 1.262296934215683e-05, + "loss": 0.0, + "step": 56108 + }, + { + "epoch": 5.2355136698703, + "grad_norm": NaN, + "learning_rate": 1.2619932447703567e-05, + "loss": 0.0, + "step": 56109 + }, + { + "epoch": 5.235606979565177, + "grad_norm": NaN, + "learning_rate": 1.2616895902567164e-05, + "loss": 0.0, + "step": 56110 + }, + { + "epoch": 5.235700289260054, + "grad_norm": NaN, + "learning_rate": 1.2613859706755313e-05, + "loss": 0.0, + "step": 56111 + }, + { + "epoch": 5.235793598954931, + "grad_norm": NaN, + "learning_rate": 1.261082386027566e-05, + "loss": 0.0, + "step": 56112 + }, + { + "epoch": 5.2358869086498085, + "grad_norm": NaN, + "learning_rate": 1.2607788363136045e-05, + "loss": 0.0, + "step": 56113 + }, + { + "epoch": 5.235980218344686, + "grad_norm": NaN, + "learning_rate": 1.2604753215344098e-05, + "loss": 0.0, + "step": 56114 + }, + { + "epoch": 5.236073528039563, + "grad_norm": NaN, + "learning_rate": 1.2601718416907547e-05, + "loss": 0.0, + "step": 56115 + }, + { + "epoch": 5.236166837734441, + "grad_norm": NaN, + "learning_rate": 1.2598683967834168e-05, + "loss": 0.0, + "step": 56116 + }, + { + "epoch": 5.236260147429318, + "grad_norm": NaN, + "learning_rate": 1.2595649868131619e-05, + "loss": 0.0, + "step": 56117 + }, + { + "epoch": 5.236353457124196, + "grad_norm": NaN, + "learning_rate": 1.2592616117807596e-05, + "loss": 0.0, + "step": 56118 + }, + { + "epoch": 5.236446766819072, + "grad_norm": NaN, + "learning_rate": 1.2589582716869895e-05, + "loss": 0.0, + "step": 56119 + }, + { + "epoch": 5.23654007651395, + "grad_norm": NaN, + "learning_rate": 1.2586549665326157e-05, + "loss": 0.0, + "step": 56120 + }, + { + "epoch": 5.236633386208827, + "grad_norm": NaN, + "learning_rate": 1.2583516963184093e-05, + "loss": 0.0, + "step": 56121 + }, + { + "epoch": 5.236726695903704, + "grad_norm": NaN, + "learning_rate": 1.2580484610451497e-05, + "loss": 0.0, + "step": 56122 + }, + { + "epoch": 5.236820005598582, + "grad_norm": NaN, + "learning_rate": 1.2577452607135996e-05, + "loss": 0.0, + "step": 56123 + }, + { + "epoch": 5.236913315293459, + "grad_norm": NaN, + "learning_rate": 1.2574420953245284e-05, + "loss": 0.0, + "step": 56124 + }, + { + "epoch": 5.237006624988337, + "grad_norm": NaN, + "learning_rate": 1.257138964878719e-05, + "loss": 0.0, + "step": 56125 + }, + { + "epoch": 5.237099934683213, + "grad_norm": NaN, + "learning_rate": 1.2568358693769271e-05, + "loss": 0.0, + "step": 56126 + }, + { + "epoch": 5.237193244378091, + "grad_norm": NaN, + "learning_rate": 1.2565328088199338e-05, + "loss": 0.0, + "step": 56127 + }, + { + "epoch": 5.237286554072968, + "grad_norm": NaN, + "learning_rate": 1.256229783208509e-05, + "loss": 0.0, + "step": 56128 + }, + { + "epoch": 5.2373798637678455, + "grad_norm": NaN, + "learning_rate": 1.2559267925434147e-05, + "loss": 0.0, + "step": 56129 + }, + { + "epoch": 5.237473173462723, + "grad_norm": NaN, + "learning_rate": 1.2556238368254307e-05, + "loss": 0.0, + "step": 56130 + }, + { + "epoch": 5.2375664831576, + "grad_norm": NaN, + "learning_rate": 1.2553209160553262e-05, + "loss": 0.0, + "step": 56131 + }, + { + "epoch": 5.237659792852478, + "grad_norm": NaN, + "learning_rate": 1.2550180302338642e-05, + "loss": 0.0, + "step": 56132 + }, + { + "epoch": 5.237753102547354, + "grad_norm": NaN, + "learning_rate": 1.2547151793618221e-05, + "loss": 0.0, + "step": 56133 + }, + { + "epoch": 5.237846412242232, + "grad_norm": NaN, + "learning_rate": 1.2544123634399694e-05, + "loss": 0.0, + "step": 56134 + }, + { + "epoch": 5.237939721937109, + "grad_norm": NaN, + "learning_rate": 1.2541095824690706e-05, + "loss": 0.0, + "step": 56135 + }, + { + "epoch": 5.2380330316319865, + "grad_norm": NaN, + "learning_rate": 1.2538068364499016e-05, + "loss": 0.0, + "step": 56136 + }, + { + "epoch": 5.238126341326864, + "grad_norm": NaN, + "learning_rate": 1.2535041253832318e-05, + "loss": 0.0, + "step": 56137 + }, + { + "epoch": 5.238219651021741, + "grad_norm": NaN, + "learning_rate": 1.253201449269824e-05, + "loss": 0.0, + "step": 56138 + }, + { + "epoch": 5.238312960716619, + "grad_norm": NaN, + "learning_rate": 1.2528988081104557e-05, + "loss": 0.0, + "step": 56139 + }, + { + "epoch": 5.238406270411495, + "grad_norm": NaN, + "learning_rate": 1.2525962019058966e-05, + "loss": 0.0, + "step": 56140 + }, + { + "epoch": 5.238499580106373, + "grad_norm": NaN, + "learning_rate": 1.2522936306569076e-05, + "loss": 0.0, + "step": 56141 + }, + { + "epoch": 5.23859288980125, + "grad_norm": NaN, + "learning_rate": 1.2519910943642646e-05, + "loss": 0.0, + "step": 56142 + }, + { + "epoch": 5.238686199496128, + "grad_norm": NaN, + "learning_rate": 1.2516885930287407e-05, + "loss": 0.0, + "step": 56143 + }, + { + "epoch": 5.238779509191005, + "grad_norm": NaN, + "learning_rate": 1.2513861266510933e-05, + "loss": 0.0, + "step": 56144 + }, + { + "epoch": 5.238872818885882, + "grad_norm": NaN, + "learning_rate": 1.2510836952321019e-05, + "loss": 0.0, + "step": 56145 + }, + { + "epoch": 5.23896612858076, + "grad_norm": NaN, + "learning_rate": 1.250781298772534e-05, + "loss": 0.0, + "step": 56146 + }, + { + "epoch": 5.239059438275637, + "grad_norm": NaN, + "learning_rate": 1.2504789372731511e-05, + "loss": 0.0, + "step": 56147 + }, + { + "epoch": 5.239152747970514, + "grad_norm": NaN, + "learning_rate": 1.2501766107347306e-05, + "loss": 0.0, + "step": 56148 + }, + { + "epoch": 5.239246057665391, + "grad_norm": NaN, + "learning_rate": 1.2498743191580401e-05, + "loss": 0.0, + "step": 56149 + }, + { + "epoch": 5.239339367360269, + "grad_norm": NaN, + "learning_rate": 1.2495720625438411e-05, + "loss": 0.0, + "step": 56150 + }, + { + "epoch": 5.239432677055146, + "grad_norm": NaN, + "learning_rate": 1.2492698408929109e-05, + "loss": 0.0, + "step": 56151 + }, + { + "epoch": 5.2395259867500235, + "grad_norm": NaN, + "learning_rate": 1.2489676542060157e-05, + "loss": 0.0, + "step": 56152 + }, + { + "epoch": 5.239619296444901, + "grad_norm": NaN, + "learning_rate": 1.2486655024839165e-05, + "loss": 0.0, + "step": 56153 + }, + { + "epoch": 5.239712606139778, + "grad_norm": NaN, + "learning_rate": 1.2483633857273912e-05, + "loss": 0.0, + "step": 56154 + }, + { + "epoch": 5.239805915834655, + "grad_norm": NaN, + "learning_rate": 1.2480613039372056e-05, + "loss": 0.0, + "step": 56155 + }, + { + "epoch": 5.239899225529532, + "grad_norm": NaN, + "learning_rate": 1.2477592571141227e-05, + "loss": 0.0, + "step": 56156 + }, + { + "epoch": 5.23999253522441, + "grad_norm": NaN, + "learning_rate": 1.2474572452589198e-05, + "loss": 0.0, + "step": 56157 + }, + { + "epoch": 5.240085844919287, + "grad_norm": NaN, + "learning_rate": 1.2471552683723551e-05, + "loss": 0.0, + "step": 56158 + }, + { + "epoch": 5.2401791546141645, + "grad_norm": NaN, + "learning_rate": 1.2468533264551994e-05, + "loss": 0.0, + "step": 56159 + }, + { + "epoch": 5.240272464309042, + "grad_norm": NaN, + "learning_rate": 1.2465514195082255e-05, + "loss": 0.0, + "step": 56160 + }, + { + "epoch": 5.240365774003919, + "grad_norm": NaN, + "learning_rate": 1.246249547532196e-05, + "loss": 0.0, + "step": 56161 + }, + { + "epoch": 5.240459083698797, + "grad_norm": NaN, + "learning_rate": 1.245947710527877e-05, + "loss": 0.0, + "step": 56162 + }, + { + "epoch": 5.240552393393673, + "grad_norm": NaN, + "learning_rate": 1.245645908496043e-05, + "loss": 0.0, + "step": 56163 + }, + { + "epoch": 5.240645703088551, + "grad_norm": NaN, + "learning_rate": 1.2453441414374515e-05, + "loss": 0.0, + "step": 56164 + }, + { + "epoch": 5.240739012783428, + "grad_norm": NaN, + "learning_rate": 1.2450424093528805e-05, + "loss": 0.0, + "step": 56165 + }, + { + "epoch": 5.2408323224783055, + "grad_norm": NaN, + "learning_rate": 1.2447407122430924e-05, + "loss": 0.0, + "step": 56166 + }, + { + "epoch": 5.240925632173183, + "grad_norm": NaN, + "learning_rate": 1.2444390501088503e-05, + "loss": 0.0, + "step": 56167 + }, + { + "epoch": 5.24101894186806, + "grad_norm": NaN, + "learning_rate": 1.2441374229509265e-05, + "loss": 0.0, + "step": 56168 + }, + { + "epoch": 5.241112251562938, + "grad_norm": NaN, + "learning_rate": 1.243835830770089e-05, + "loss": 0.0, + "step": 56169 + }, + { + "epoch": 5.241205561257814, + "grad_norm": NaN, + "learning_rate": 1.2435342735670972e-05, + "loss": 0.0, + "step": 56170 + }, + { + "epoch": 5.241298870952692, + "grad_norm": NaN, + "learning_rate": 1.2432327513427271e-05, + "loss": 0.0, + "step": 56171 + }, + { + "epoch": 5.241392180647569, + "grad_norm": NaN, + "learning_rate": 1.2429312640977413e-05, + "loss": 0.0, + "step": 56172 + }, + { + "epoch": 5.241485490342447, + "grad_norm": NaN, + "learning_rate": 1.2426298118329025e-05, + "loss": 0.0, + "step": 56173 + }, + { + "epoch": 5.241578800037324, + "grad_norm": NaN, + "learning_rate": 1.242328394548982e-05, + "loss": 0.0, + "step": 56174 + }, + { + "epoch": 5.241672109732201, + "grad_norm": NaN, + "learning_rate": 1.2420270122467491e-05, + "loss": 0.0, + "step": 56175 + }, + { + "epoch": 5.241765419427079, + "grad_norm": NaN, + "learning_rate": 1.2417256649269597e-05, + "loss": 0.0, + "step": 56176 + }, + { + "epoch": 5.241858729121955, + "grad_norm": NaN, + "learning_rate": 1.24142435259039e-05, + "loss": 0.0, + "step": 56177 + }, + { + "epoch": 5.241952038816833, + "grad_norm": NaN, + "learning_rate": 1.2411230752378043e-05, + "loss": 0.0, + "step": 56178 + }, + { + "epoch": 5.24204534851171, + "grad_norm": NaN, + "learning_rate": 1.2408218328699621e-05, + "loss": 0.0, + "step": 56179 + }, + { + "epoch": 5.242138658206588, + "grad_norm": NaN, + "learning_rate": 1.2405206254876377e-05, + "loss": 0.0, + "step": 56180 + }, + { + "epoch": 5.242231967901465, + "grad_norm": NaN, + "learning_rate": 1.2402194530915938e-05, + "loss": 0.0, + "step": 56181 + }, + { + "epoch": 5.2423252775963425, + "grad_norm": NaN, + "learning_rate": 1.2399183156825931e-05, + "loss": 0.0, + "step": 56182 + }, + { + "epoch": 5.24241858729122, + "grad_norm": NaN, + "learning_rate": 1.2396172132614052e-05, + "loss": 0.0, + "step": 56183 + }, + { + "epoch": 5.242511896986096, + "grad_norm": NaN, + "learning_rate": 1.2393161458287976e-05, + "loss": 0.0, + "step": 56184 + }, + { + "epoch": 5.242605206680974, + "grad_norm": NaN, + "learning_rate": 1.2390151133855281e-05, + "loss": 0.0, + "step": 56185 + }, + { + "epoch": 5.242698516375851, + "grad_norm": NaN, + "learning_rate": 1.2387141159323693e-05, + "loss": 0.0, + "step": 56186 + }, + { + "epoch": 5.242791826070729, + "grad_norm": NaN, + "learning_rate": 1.2384131534700858e-05, + "loss": 0.0, + "step": 56187 + }, + { + "epoch": 5.242885135765606, + "grad_norm": NaN, + "learning_rate": 1.2381122259994369e-05, + "loss": 0.0, + "step": 56188 + }, + { + "epoch": 5.2429784454604835, + "grad_norm": NaN, + "learning_rate": 1.2378113335211954e-05, + "loss": 0.0, + "step": 56189 + }, + { + "epoch": 5.243071755155361, + "grad_norm": NaN, + "learning_rate": 1.2375104760361238e-05, + "loss": 0.0, + "step": 56190 + }, + { + "epoch": 5.243165064850238, + "grad_norm": NaN, + "learning_rate": 1.2372096535449833e-05, + "loss": 0.0, + "step": 56191 + }, + { + "epoch": 5.243258374545115, + "grad_norm": NaN, + "learning_rate": 1.2369088660485415e-05, + "loss": 0.0, + "step": 56192 + }, + { + "epoch": 5.243351684239992, + "grad_norm": NaN, + "learning_rate": 1.2366081135475681e-05, + "loss": 0.0, + "step": 56193 + }, + { + "epoch": 5.24344499393487, + "grad_norm": NaN, + "learning_rate": 1.2363073960428189e-05, + "loss": 0.0, + "step": 56194 + }, + { + "epoch": 5.243538303629747, + "grad_norm": NaN, + "learning_rate": 1.2360067135350632e-05, + "loss": 0.0, + "step": 56195 + }, + { + "epoch": 5.243631613324625, + "grad_norm": NaN, + "learning_rate": 1.235706066025069e-05, + "loss": 0.0, + "step": 56196 + }, + { + "epoch": 5.243724923019502, + "grad_norm": NaN, + "learning_rate": 1.235405453513592e-05, + "loss": 0.0, + "step": 56197 + }, + { + "epoch": 5.243818232714379, + "grad_norm": NaN, + "learning_rate": 1.2351048760014038e-05, + "loss": 0.0, + "step": 56198 + }, + { + "epoch": 5.243911542409256, + "grad_norm": NaN, + "learning_rate": 1.2348043334892682e-05, + "loss": 0.0, + "step": 56199 + }, + { + "epoch": 5.244004852104133, + "grad_norm": NaN, + "learning_rate": 1.2345038259779433e-05, + "loss": 0.0, + "step": 56200 + }, + { + "epoch": 5.244098161799011, + "grad_norm": NaN, + "learning_rate": 1.2342033534682033e-05, + "loss": 0.0, + "step": 56201 + }, + { + "epoch": 5.244191471493888, + "grad_norm": NaN, + "learning_rate": 1.233902915960801e-05, + "loss": 0.0, + "step": 56202 + }, + { + "epoch": 5.244284781188766, + "grad_norm": NaN, + "learning_rate": 1.2336025134565075e-05, + "loss": 0.0, + "step": 56203 + }, + { + "epoch": 5.244378090883643, + "grad_norm": NaN, + "learning_rate": 1.2333021459560888e-05, + "loss": 0.0, + "step": 56204 + }, + { + "epoch": 5.2444714005785205, + "grad_norm": NaN, + "learning_rate": 1.2330018134602976e-05, + "loss": 0.0, + "step": 56205 + }, + { + "epoch": 5.244564710273397, + "grad_norm": NaN, + "learning_rate": 1.2327015159699099e-05, + "loss": 0.0, + "step": 56206 + }, + { + "epoch": 5.244658019968274, + "grad_norm": NaN, + "learning_rate": 1.2324012534856853e-05, + "loss": 0.0, + "step": 56207 + }, + { + "epoch": 5.244751329663152, + "grad_norm": NaN, + "learning_rate": 1.2321010260083796e-05, + "loss": 0.0, + "step": 56208 + }, + { + "epoch": 5.244844639358029, + "grad_norm": NaN, + "learning_rate": 1.2318008335387674e-05, + "loss": 0.0, + "step": 56209 + }, + { + "epoch": 5.244937949052907, + "grad_norm": NaN, + "learning_rate": 1.231500676077608e-05, + "loss": 0.0, + "step": 56210 + }, + { + "epoch": 5.245031258747784, + "grad_norm": NaN, + "learning_rate": 1.2312005536256591e-05, + "loss": 0.0, + "step": 56211 + }, + { + "epoch": 5.2451245684426615, + "grad_norm": NaN, + "learning_rate": 1.230900466183692e-05, + "loss": 0.0, + "step": 56212 + }, + { + "epoch": 5.245217878137538, + "grad_norm": NaN, + "learning_rate": 1.2306004137524689e-05, + "loss": 0.0, + "step": 56213 + }, + { + "epoch": 5.2453111878324155, + "grad_norm": NaN, + "learning_rate": 1.2303003963327446e-05, + "loss": 0.0, + "step": 56214 + }, + { + "epoch": 5.245404497527293, + "grad_norm": NaN, + "learning_rate": 1.2300004139252901e-05, + "loss": 0.0, + "step": 56215 + }, + { + "epoch": 5.24549780722217, + "grad_norm": NaN, + "learning_rate": 1.229700466530868e-05, + "loss": 0.0, + "step": 56216 + }, + { + "epoch": 5.245591116917048, + "grad_norm": NaN, + "learning_rate": 1.2294005541502327e-05, + "loss": 0.0, + "step": 56217 + }, + { + "epoch": 5.245684426611925, + "grad_norm": NaN, + "learning_rate": 1.2291006767841572e-05, + "loss": 0.0, + "step": 56218 + }, + { + "epoch": 5.2457777363068026, + "grad_norm": NaN, + "learning_rate": 1.2288008344334004e-05, + "loss": 0.0, + "step": 56219 + }, + { + "epoch": 5.24587104600168, + "grad_norm": NaN, + "learning_rate": 1.228501027098719e-05, + "loss": 0.0, + "step": 56220 + }, + { + "epoch": 5.2459643556965565, + "grad_norm": NaN, + "learning_rate": 1.2282012547808834e-05, + "loss": 0.0, + "step": 56221 + }, + { + "epoch": 5.246057665391434, + "grad_norm": NaN, + "learning_rate": 1.2279015174806533e-05, + "loss": 0.0, + "step": 56222 + }, + { + "epoch": 5.246150975086311, + "grad_norm": NaN, + "learning_rate": 1.2276018151987865e-05, + "loss": 0.0, + "step": 56223 + }, + { + "epoch": 5.246244284781189, + "grad_norm": NaN, + "learning_rate": 1.2273021479360507e-05, + "loss": 0.0, + "step": 56224 + }, + { + "epoch": 5.246337594476066, + "grad_norm": NaN, + "learning_rate": 1.2270025156932084e-05, + "loss": 0.0, + "step": 56225 + }, + { + "epoch": 5.246430904170944, + "grad_norm": NaN, + "learning_rate": 1.2267029184710126e-05, + "loss": 0.0, + "step": 56226 + }, + { + "epoch": 5.246524213865821, + "grad_norm": NaN, + "learning_rate": 1.2264033562702357e-05, + "loss": 0.0, + "step": 56227 + }, + { + "epoch": 5.2466175235606975, + "grad_norm": NaN, + "learning_rate": 1.2261038290916358e-05, + "loss": 0.0, + "step": 56228 + }, + { + "epoch": 5.246710833255575, + "grad_norm": NaN, + "learning_rate": 1.2258043369359704e-05, + "loss": 0.0, + "step": 56229 + }, + { + "epoch": 5.246804142950452, + "grad_norm": NaN, + "learning_rate": 1.2255048798040057e-05, + "loss": 0.0, + "step": 56230 + }, + { + "epoch": 5.24689745264533, + "grad_norm": NaN, + "learning_rate": 1.2252054576965042e-05, + "loss": 0.0, + "step": 56231 + }, + { + "epoch": 5.246990762340207, + "grad_norm": NaN, + "learning_rate": 1.2249060706142205e-05, + "loss": 0.0, + "step": 56232 + }, + { + "epoch": 5.247084072035085, + "grad_norm": NaN, + "learning_rate": 1.2246067185579223e-05, + "loss": 0.0, + "step": 56233 + }, + { + "epoch": 5.247177381729962, + "grad_norm": NaN, + "learning_rate": 1.2243074015283722e-05, + "loss": 0.0, + "step": 56234 + }, + { + "epoch": 5.2472706914248395, + "grad_norm": NaN, + "learning_rate": 1.2240081195263213e-05, + "loss": 0.0, + "step": 56235 + }, + { + "epoch": 5.247364001119716, + "grad_norm": NaN, + "learning_rate": 1.2237088725525408e-05, + "loss": 0.0, + "step": 56236 + }, + { + "epoch": 5.247457310814593, + "grad_norm": NaN, + "learning_rate": 1.2234096606077865e-05, + "loss": 0.0, + "step": 56237 + }, + { + "epoch": 5.247550620509471, + "grad_norm": NaN, + "learning_rate": 1.2231104836928213e-05, + "loss": 0.0, + "step": 56238 + }, + { + "epoch": 5.247643930204348, + "grad_norm": NaN, + "learning_rate": 1.2228113418084046e-05, + "loss": 0.0, + "step": 56239 + }, + { + "epoch": 5.247737239899226, + "grad_norm": NaN, + "learning_rate": 1.2225122349552973e-05, + "loss": 0.0, + "step": 56240 + }, + { + "epoch": 5.247830549594103, + "grad_norm": NaN, + "learning_rate": 1.2222131631342608e-05, + "loss": 0.0, + "step": 56241 + }, + { + "epoch": 5.2479238592889805, + "grad_norm": NaN, + "learning_rate": 1.2219141263460557e-05, + "loss": 0.0, + "step": 56242 + }, + { + "epoch": 5.248017168983857, + "grad_norm": NaN, + "learning_rate": 1.2216151245914401e-05, + "loss": 0.0, + "step": 56243 + }, + { + "epoch": 5.2481104786787345, + "grad_norm": NaN, + "learning_rate": 1.2213161578711766e-05, + "loss": 0.0, + "step": 56244 + }, + { + "epoch": 5.248203788373612, + "grad_norm": NaN, + "learning_rate": 1.2210172261860263e-05, + "loss": 0.0, + "step": 56245 + }, + { + "epoch": 5.248297098068489, + "grad_norm": NaN, + "learning_rate": 1.2207183295367433e-05, + "loss": 0.0, + "step": 56246 + }, + { + "epoch": 5.248390407763367, + "grad_norm": NaN, + "learning_rate": 1.2204194679240941e-05, + "loss": 0.0, + "step": 56247 + }, + { + "epoch": 5.248483717458244, + "grad_norm": NaN, + "learning_rate": 1.2201206413488395e-05, + "loss": 0.0, + "step": 56248 + }, + { + "epoch": 5.248577027153122, + "grad_norm": NaN, + "learning_rate": 1.2198218498117307e-05, + "loss": 0.0, + "step": 56249 + }, + { + "epoch": 5.248670336847998, + "grad_norm": NaN, + "learning_rate": 1.2195230933135352e-05, + "loss": 0.0, + "step": 56250 + }, + { + "epoch": 5.2487636465428755, + "grad_norm": NaN, + "learning_rate": 1.2192243718550127e-05, + "loss": 0.0, + "step": 56251 + }, + { + "epoch": 5.248856956237753, + "grad_norm": NaN, + "learning_rate": 1.2189256854369172e-05, + "loss": 0.0, + "step": 56252 + }, + { + "epoch": 5.24895026593263, + "grad_norm": NaN, + "learning_rate": 1.2186270340600119e-05, + "loss": 0.0, + "step": 56253 + }, + { + "epoch": 5.249043575627508, + "grad_norm": NaN, + "learning_rate": 1.2183284177250591e-05, + "loss": 0.0, + "step": 56254 + }, + { + "epoch": 5.249136885322385, + "grad_norm": NaN, + "learning_rate": 1.21802983643281e-05, + "loss": 0.0, + "step": 56255 + }, + { + "epoch": 5.249230195017263, + "grad_norm": NaN, + "learning_rate": 1.2177312901840291e-05, + "loss": 0.0, + "step": 56256 + }, + { + "epoch": 5.249323504712139, + "grad_norm": NaN, + "learning_rate": 1.2174327789794791e-05, + "loss": 0.0, + "step": 56257 + }, + { + "epoch": 5.249416814407017, + "grad_norm": NaN, + "learning_rate": 1.2171343028199093e-05, + "loss": 0.0, + "step": 56258 + }, + { + "epoch": 5.249510124101894, + "grad_norm": NaN, + "learning_rate": 1.2168358617060875e-05, + "loss": 0.0, + "step": 56259 + }, + { + "epoch": 5.249603433796771, + "grad_norm": NaN, + "learning_rate": 1.2165374556387714e-05, + "loss": 0.0, + "step": 56260 + }, + { + "epoch": 5.249696743491649, + "grad_norm": NaN, + "learning_rate": 1.216239084618712e-05, + "loss": 0.0, + "step": 56261 + }, + { + "epoch": 5.249790053186526, + "grad_norm": NaN, + "learning_rate": 1.2159407486466771e-05, + "loss": 0.0, + "step": 56262 + }, + { + "epoch": 5.249883362881404, + "grad_norm": NaN, + "learning_rate": 1.2156424477234227e-05, + "loss": 0.0, + "step": 56263 + }, + { + "epoch": 5.249976672576281, + "grad_norm": NaN, + "learning_rate": 1.2153441818497034e-05, + "loss": 0.0, + "step": 56264 + }, + { + "epoch": 5.250069982271158, + "grad_norm": NaN, + "learning_rate": 1.2150459510262816e-05, + "loss": 0.0, + "step": 56265 + }, + { + "epoch": 5.250163291966035, + "grad_norm": NaN, + "learning_rate": 1.214747755253917e-05, + "loss": 0.0, + "step": 56266 + }, + { + "epoch": 5.2502566016609125, + "grad_norm": NaN, + "learning_rate": 1.2144495945333604e-05, + "loss": 0.0, + "step": 56267 + }, + { + "epoch": 5.25034991135579, + "grad_norm": NaN, + "learning_rate": 1.214151468865378e-05, + "loss": 0.0, + "step": 56268 + }, + { + "epoch": 5.250443221050667, + "grad_norm": NaN, + "learning_rate": 1.2138533782507276e-05, + "loss": 0.0, + "step": 56269 + }, + { + "epoch": 5.250536530745545, + "grad_norm": NaN, + "learning_rate": 1.2135553226901584e-05, + "loss": 0.0, + "step": 56270 + }, + { + "epoch": 5.250629840440422, + "grad_norm": NaN, + "learning_rate": 1.2132573021844366e-05, + "loss": 0.0, + "step": 56271 + }, + { + "epoch": 5.250723150135299, + "grad_norm": NaN, + "learning_rate": 1.21295931673432e-05, + "loss": 0.0, + "step": 56272 + }, + { + "epoch": 5.250816459830176, + "grad_norm": NaN, + "learning_rate": 1.2126613663405593e-05, + "loss": 0.0, + "step": 56273 + }, + { + "epoch": 5.2509097695250535, + "grad_norm": NaN, + "learning_rate": 1.2123634510039193e-05, + "loss": 0.0, + "step": 56274 + }, + { + "epoch": 5.251003079219931, + "grad_norm": NaN, + "learning_rate": 1.2120655707251543e-05, + "loss": 0.0, + "step": 56275 + }, + { + "epoch": 5.251096388914808, + "grad_norm": NaN, + "learning_rate": 1.2117677255050217e-05, + "loss": 0.0, + "step": 56276 + }, + { + "epoch": 5.251189698609686, + "grad_norm": NaN, + "learning_rate": 1.2114699153442814e-05, + "loss": 0.0, + "step": 56277 + }, + { + "epoch": 5.251283008304563, + "grad_norm": NaN, + "learning_rate": 1.2111721402436875e-05, + "loss": 0.0, + "step": 56278 + }, + { + "epoch": 5.251376317999441, + "grad_norm": NaN, + "learning_rate": 1.2108744002039977e-05, + "loss": 0.0, + "step": 56279 + }, + { + "epoch": 5.251469627694317, + "grad_norm": NaN, + "learning_rate": 1.2105766952259698e-05, + "loss": 0.0, + "step": 56280 + }, + { + "epoch": 5.2515629373891946, + "grad_norm": NaN, + "learning_rate": 1.2102790253103617e-05, + "loss": 0.0, + "step": 56281 + }, + { + "epoch": 5.251656247084072, + "grad_norm": NaN, + "learning_rate": 1.2099813904579276e-05, + "loss": 0.0, + "step": 56282 + }, + { + "epoch": 5.251749556778949, + "grad_norm": NaN, + "learning_rate": 1.2096837906694267e-05, + "loss": 0.0, + "step": 56283 + }, + { + "epoch": 5.251842866473827, + "grad_norm": NaN, + "learning_rate": 1.2093862259456155e-05, + "loss": 0.0, + "step": 56284 + }, + { + "epoch": 5.251936176168704, + "grad_norm": NaN, + "learning_rate": 1.2090886962872499e-05, + "loss": 0.0, + "step": 56285 + }, + { + "epoch": 5.252029485863581, + "grad_norm": NaN, + "learning_rate": 1.2087912016950857e-05, + "loss": 0.0, + "step": 56286 + }, + { + "epoch": 5.252122795558458, + "grad_norm": NaN, + "learning_rate": 1.208493742169881e-05, + "loss": 0.0, + "step": 56287 + }, + { + "epoch": 5.252216105253336, + "grad_norm": NaN, + "learning_rate": 1.2081963177123916e-05, + "loss": 0.0, + "step": 56288 + }, + { + "epoch": 5.252309414948213, + "grad_norm": NaN, + "learning_rate": 1.2078989283233737e-05, + "loss": 0.0, + "step": 56289 + }, + { + "epoch": 5.25240272464309, + "grad_norm": NaN, + "learning_rate": 1.2076015740035817e-05, + "loss": 0.0, + "step": 56290 + }, + { + "epoch": 5.252496034337968, + "grad_norm": NaN, + "learning_rate": 1.2073042547537748e-05, + "loss": 0.0, + "step": 56291 + }, + { + "epoch": 5.252589344032845, + "grad_norm": NaN, + "learning_rate": 1.2070069705747093e-05, + "loss": 0.0, + "step": 56292 + }, + { + "epoch": 5.252682653727723, + "grad_norm": NaN, + "learning_rate": 1.2067097214671346e-05, + "loss": 0.0, + "step": 56293 + }, + { + "epoch": 5.252775963422599, + "grad_norm": NaN, + "learning_rate": 1.2064125074318132e-05, + "loss": 0.0, + "step": 56294 + }, + { + "epoch": 5.252869273117477, + "grad_norm": NaN, + "learning_rate": 1.206115328469503e-05, + "loss": 0.0, + "step": 56295 + }, + { + "epoch": 5.252962582812354, + "grad_norm": NaN, + "learning_rate": 1.2058181845809483e-05, + "loss": 0.0, + "step": 56296 + }, + { + "epoch": 5.2530558925072315, + "grad_norm": NaN, + "learning_rate": 1.2055210757669154e-05, + "loss": 0.0, + "step": 56297 + }, + { + "epoch": 5.253149202202109, + "grad_norm": NaN, + "learning_rate": 1.2052240020281585e-05, + "loss": 0.0, + "step": 56298 + }, + { + "epoch": 5.253242511896986, + "grad_norm": NaN, + "learning_rate": 1.2049269633654252e-05, + "loss": 0.0, + "step": 56299 + }, + { + "epoch": 5.253335821591864, + "grad_norm": NaN, + "learning_rate": 1.2046299597794801e-05, + "loss": 0.0, + "step": 56300 + }, + { + "epoch": 5.25342913128674, + "grad_norm": NaN, + "learning_rate": 1.2043329912710759e-05, + "loss": 0.0, + "step": 56301 + }, + { + "epoch": 5.253522440981618, + "grad_norm": NaN, + "learning_rate": 1.2040360578409602e-05, + "loss": 0.0, + "step": 56302 + }, + { + "epoch": 5.253615750676495, + "grad_norm": NaN, + "learning_rate": 1.2037391594898992e-05, + "loss": 0.0, + "step": 56303 + }, + { + "epoch": 5.2537090603713725, + "grad_norm": NaN, + "learning_rate": 1.203442296218644e-05, + "loss": 0.0, + "step": 56304 + }, + { + "epoch": 5.25380237006625, + "grad_norm": NaN, + "learning_rate": 1.2031454680279423e-05, + "loss": 0.0, + "step": 56305 + }, + { + "epoch": 5.253895679761127, + "grad_norm": NaN, + "learning_rate": 1.2028486749185583e-05, + "loss": 0.0, + "step": 56306 + }, + { + "epoch": 5.253988989456005, + "grad_norm": NaN, + "learning_rate": 1.2025519168912451e-05, + "loss": 0.0, + "step": 56307 + }, + { + "epoch": 5.254082299150882, + "grad_norm": NaN, + "learning_rate": 1.2022551939467517e-05, + "loss": 0.0, + "step": 56308 + }, + { + "epoch": 5.254175608845759, + "grad_norm": NaN, + "learning_rate": 1.2019585060858377e-05, + "loss": 0.0, + "step": 56309 + }, + { + "epoch": 5.254268918540636, + "grad_norm": NaN, + "learning_rate": 1.2016618533092576e-05, + "loss": 0.0, + "step": 56310 + }, + { + "epoch": 5.254362228235514, + "grad_norm": NaN, + "learning_rate": 1.2013652356177605e-05, + "loss": 0.0, + "step": 56311 + }, + { + "epoch": 5.254455537930391, + "grad_norm": NaN, + "learning_rate": 1.2010686530121044e-05, + "loss": 0.0, + "step": 56312 + }, + { + "epoch": 5.254548847625268, + "grad_norm": NaN, + "learning_rate": 1.2007721054930452e-05, + "loss": 0.0, + "step": 56313 + }, + { + "epoch": 5.254642157320146, + "grad_norm": NaN, + "learning_rate": 1.2004755930613341e-05, + "loss": 0.0, + "step": 56314 + }, + { + "epoch": 5.254735467015023, + "grad_norm": NaN, + "learning_rate": 1.2001791157177271e-05, + "loss": 0.0, + "step": 56315 + }, + { + "epoch": 5.2548287767099, + "grad_norm": NaN, + "learning_rate": 1.1998826734629751e-05, + "loss": 0.0, + "step": 56316 + }, + { + "epoch": 5.254922086404777, + "grad_norm": NaN, + "learning_rate": 1.1995862662978345e-05, + "loss": 0.0, + "step": 56317 + }, + { + "epoch": 5.255015396099655, + "grad_norm": NaN, + "learning_rate": 1.1992898942230578e-05, + "loss": 0.0, + "step": 56318 + }, + { + "epoch": 5.255108705794532, + "grad_norm": NaN, + "learning_rate": 1.1989935572393994e-05, + "loss": 0.0, + "step": 56319 + }, + { + "epoch": 5.2552020154894095, + "grad_norm": NaN, + "learning_rate": 1.1986972553476121e-05, + "loss": 0.0, + "step": 56320 + }, + { + "epoch": 5.255295325184287, + "grad_norm": NaN, + "learning_rate": 1.1984009885484486e-05, + "loss": 0.0, + "step": 56321 + }, + { + "epoch": 5.255388634879164, + "grad_norm": NaN, + "learning_rate": 1.1981047568426633e-05, + "loss": 0.0, + "step": 56322 + }, + { + "epoch": 5.255481944574041, + "grad_norm": NaN, + "learning_rate": 1.197808560231009e-05, + "loss": 0.0, + "step": 56323 + }, + { + "epoch": 5.255575254268918, + "grad_norm": NaN, + "learning_rate": 1.19751239871424e-05, + "loss": 0.0, + "step": 56324 + }, + { + "epoch": 5.255668563963796, + "grad_norm": NaN, + "learning_rate": 1.1972162722931072e-05, + "loss": 0.0, + "step": 56325 + }, + { + "epoch": 5.255761873658673, + "grad_norm": NaN, + "learning_rate": 1.1969201809683654e-05, + "loss": 0.0, + "step": 56326 + }, + { + "epoch": 5.2558551833535505, + "grad_norm": NaN, + "learning_rate": 1.1966241247407671e-05, + "loss": 0.0, + "step": 56327 + }, + { + "epoch": 5.255948493048428, + "grad_norm": NaN, + "learning_rate": 1.1963281036110633e-05, + "loss": 0.0, + "step": 56328 + }, + { + "epoch": 5.256041802743305, + "grad_norm": NaN, + "learning_rate": 1.1960321175800103e-05, + "loss": 0.0, + "step": 56329 + }, + { + "epoch": 5.256135112438182, + "grad_norm": NaN, + "learning_rate": 1.1957361666483573e-05, + "loss": 0.0, + "step": 56330 + }, + { + "epoch": 5.256228422133059, + "grad_norm": NaN, + "learning_rate": 1.1954402508168586e-05, + "loss": 0.0, + "step": 56331 + }, + { + "epoch": 5.256321731827937, + "grad_norm": NaN, + "learning_rate": 1.1951443700862657e-05, + "loss": 0.0, + "step": 56332 + }, + { + "epoch": 5.256415041522814, + "grad_norm": NaN, + "learning_rate": 1.1948485244573308e-05, + "loss": 0.0, + "step": 56333 + }, + { + "epoch": 5.2565083512176916, + "grad_norm": NaN, + "learning_rate": 1.194552713930807e-05, + "loss": 0.0, + "step": 56334 + }, + { + "epoch": 5.256601660912569, + "grad_norm": NaN, + "learning_rate": 1.1942569385074468e-05, + "loss": 0.0, + "step": 56335 + }, + { + "epoch": 5.256694970607446, + "grad_norm": NaN, + "learning_rate": 1.1939611981880049e-05, + "loss": 0.0, + "step": 56336 + }, + { + "epoch": 5.256788280302324, + "grad_norm": NaN, + "learning_rate": 1.1936654929732236e-05, + "loss": 0.0, + "step": 56337 + }, + { + "epoch": 5.2568815899972, + "grad_norm": NaN, + "learning_rate": 1.1933698228638645e-05, + "loss": 0.0, + "step": 56338 + }, + { + "epoch": 5.256974899692078, + "grad_norm": NaN, + "learning_rate": 1.1930741878606781e-05, + "loss": 0.0, + "step": 56339 + }, + { + "epoch": 5.257068209386955, + "grad_norm": NaN, + "learning_rate": 1.1927785879644092e-05, + "loss": 0.0, + "step": 56340 + }, + { + "epoch": 5.257161519081833, + "grad_norm": NaN, + "learning_rate": 1.192483023175817e-05, + "loss": 0.0, + "step": 56341 + }, + { + "epoch": 5.25725482877671, + "grad_norm": NaN, + "learning_rate": 1.1921874934956526e-05, + "loss": 0.0, + "step": 56342 + }, + { + "epoch": 5.257348138471587, + "grad_norm": NaN, + "learning_rate": 1.1918919989246605e-05, + "loss": 0.0, + "step": 56343 + }, + { + "epoch": 5.257441448166465, + "grad_norm": NaN, + "learning_rate": 1.1915965394636001e-05, + "loss": 0.0, + "step": 56344 + }, + { + "epoch": 5.257534757861341, + "grad_norm": NaN, + "learning_rate": 1.1913011151132207e-05, + "loss": 0.0, + "step": 56345 + }, + { + "epoch": 5.257628067556219, + "grad_norm": NaN, + "learning_rate": 1.1910057258742667e-05, + "loss": 0.0, + "step": 56346 + }, + { + "epoch": 5.257721377251096, + "grad_norm": NaN, + "learning_rate": 1.1907103717474992e-05, + "loss": 0.0, + "step": 56347 + }, + { + "epoch": 5.257814686945974, + "grad_norm": NaN, + "learning_rate": 1.1904150527336658e-05, + "loss": 0.0, + "step": 56348 + }, + { + "epoch": 5.257907996640851, + "grad_norm": NaN, + "learning_rate": 1.1901197688335113e-05, + "loss": 0.0, + "step": 56349 + }, + { + "epoch": 5.2580013063357285, + "grad_norm": NaN, + "learning_rate": 1.189824520047793e-05, + "loss": 0.0, + "step": 56350 + }, + { + "epoch": 5.258094616030606, + "grad_norm": NaN, + "learning_rate": 1.1895293063772604e-05, + "loss": 0.0, + "step": 56351 + }, + { + "epoch": 5.258187925725483, + "grad_norm": NaN, + "learning_rate": 1.1892341278226647e-05, + "loss": 0.0, + "step": 56352 + }, + { + "epoch": 5.25828123542036, + "grad_norm": NaN, + "learning_rate": 1.1889389843847553e-05, + "loss": 0.0, + "step": 56353 + }, + { + "epoch": 5.258374545115237, + "grad_norm": NaN, + "learning_rate": 1.188643876064283e-05, + "loss": 0.0, + "step": 56354 + }, + { + "epoch": 5.258467854810115, + "grad_norm": NaN, + "learning_rate": 1.1883488028619976e-05, + "loss": 0.0, + "step": 56355 + }, + { + "epoch": 5.258561164504992, + "grad_norm": NaN, + "learning_rate": 1.1880537647786497e-05, + "loss": 0.0, + "step": 56356 + }, + { + "epoch": 5.2586544741998695, + "grad_norm": NaN, + "learning_rate": 1.1877587618149909e-05, + "loss": 0.0, + "step": 56357 + }, + { + "epoch": 5.258747783894747, + "grad_norm": NaN, + "learning_rate": 1.1874637939717685e-05, + "loss": 0.0, + "step": 56358 + }, + { + "epoch": 5.258841093589624, + "grad_norm": NaN, + "learning_rate": 1.1871688612497355e-05, + "loss": 0.0, + "step": 56359 + }, + { + "epoch": 5.258934403284501, + "grad_norm": NaN, + "learning_rate": 1.1868739636496393e-05, + "loss": 0.0, + "step": 56360 + }, + { + "epoch": 5.259027712979378, + "grad_norm": NaN, + "learning_rate": 1.1865791011722314e-05, + "loss": 0.0, + "step": 56361 + }, + { + "epoch": 5.259121022674256, + "grad_norm": NaN, + "learning_rate": 1.186284273818261e-05, + "loss": 0.0, + "step": 56362 + }, + { + "epoch": 5.259214332369133, + "grad_norm": NaN, + "learning_rate": 1.1859894815884791e-05, + "loss": 0.0, + "step": 56363 + }, + { + "epoch": 5.259307642064011, + "grad_norm": NaN, + "learning_rate": 1.1856947244836319e-05, + "loss": 0.0, + "step": 56364 + }, + { + "epoch": 5.259400951758888, + "grad_norm": NaN, + "learning_rate": 1.1854000025044718e-05, + "loss": 0.0, + "step": 56365 + }, + { + "epoch": 5.259494261453765, + "grad_norm": NaN, + "learning_rate": 1.1851053156517471e-05, + "loss": 0.0, + "step": 56366 + }, + { + "epoch": 5.259587571148642, + "grad_norm": NaN, + "learning_rate": 1.1848106639262068e-05, + "loss": 0.0, + "step": 56367 + }, + { + "epoch": 5.259680880843519, + "grad_norm": NaN, + "learning_rate": 1.1845160473286019e-05, + "loss": 0.0, + "step": 56368 + }, + { + "epoch": 5.259774190538397, + "grad_norm": NaN, + "learning_rate": 1.1842214658596788e-05, + "loss": 0.0, + "step": 56369 + }, + { + "epoch": 5.259867500233274, + "grad_norm": NaN, + "learning_rate": 1.1839269195201901e-05, + "loss": 0.0, + "step": 56370 + }, + { + "epoch": 5.259960809928152, + "grad_norm": NaN, + "learning_rate": 1.18363240831088e-05, + "loss": 0.0, + "step": 56371 + }, + { + "epoch": 5.260054119623029, + "grad_norm": NaN, + "learning_rate": 1.1833379322325015e-05, + "loss": 0.0, + "step": 56372 + }, + { + "epoch": 5.2601474293179065, + "grad_norm": NaN, + "learning_rate": 1.1830434912858023e-05, + "loss": 0.0, + "step": 56373 + }, + { + "epoch": 5.260240739012783, + "grad_norm": NaN, + "learning_rate": 1.18274908547153e-05, + "loss": 0.0, + "step": 56374 + }, + { + "epoch": 5.26033404870766, + "grad_norm": NaN, + "learning_rate": 1.1824547147904323e-05, + "loss": 0.0, + "step": 56375 + }, + { + "epoch": 5.260427358402538, + "grad_norm": NaN, + "learning_rate": 1.1821603792432604e-05, + "loss": 0.0, + "step": 56376 + }, + { + "epoch": 5.260520668097415, + "grad_norm": NaN, + "learning_rate": 1.1818660788307621e-05, + "loss": 0.0, + "step": 56377 + }, + { + "epoch": 5.260613977792293, + "grad_norm": NaN, + "learning_rate": 1.181571813553685e-05, + "loss": 0.0, + "step": 56378 + }, + { + "epoch": 5.26070728748717, + "grad_norm": NaN, + "learning_rate": 1.1812775834127768e-05, + "loss": 0.0, + "step": 56379 + }, + { + "epoch": 5.2608005971820475, + "grad_norm": NaN, + "learning_rate": 1.1809833884087905e-05, + "loss": 0.0, + "step": 56380 + }, + { + "epoch": 5.260893906876925, + "grad_norm": NaN, + "learning_rate": 1.1806892285424635e-05, + "loss": 0.0, + "step": 56381 + }, + { + "epoch": 5.2609872165718015, + "grad_norm": NaN, + "learning_rate": 1.180395103814552e-05, + "loss": 0.0, + "step": 56382 + }, + { + "epoch": 5.261080526266679, + "grad_norm": NaN, + "learning_rate": 1.1801010142258055e-05, + "loss": 0.0, + "step": 56383 + }, + { + "epoch": 5.261173835961556, + "grad_norm": NaN, + "learning_rate": 1.1798069597769633e-05, + "loss": 0.0, + "step": 56384 + }, + { + "epoch": 5.261267145656434, + "grad_norm": NaN, + "learning_rate": 1.1795129404687798e-05, + "loss": 0.0, + "step": 56385 + }, + { + "epoch": 5.261360455351311, + "grad_norm": NaN, + "learning_rate": 1.1792189563020044e-05, + "loss": 0.0, + "step": 56386 + }, + { + "epoch": 5.261453765046189, + "grad_norm": NaN, + "learning_rate": 1.1789250072773765e-05, + "loss": 0.0, + "step": 56387 + }, + { + "epoch": 5.261547074741066, + "grad_norm": NaN, + "learning_rate": 1.1786310933956505e-05, + "loss": 0.0, + "step": 56388 + }, + { + "epoch": 5.2616403844359425, + "grad_norm": NaN, + "learning_rate": 1.1783372146575709e-05, + "loss": 0.0, + "step": 56389 + }, + { + "epoch": 5.26173369413082, + "grad_norm": NaN, + "learning_rate": 1.1780433710638854e-05, + "loss": 0.0, + "step": 56390 + }, + { + "epoch": 5.261827003825697, + "grad_norm": NaN, + "learning_rate": 1.1777495626153416e-05, + "loss": 0.0, + "step": 56391 + }, + { + "epoch": 5.261920313520575, + "grad_norm": NaN, + "learning_rate": 1.1774557893126874e-05, + "loss": 0.0, + "step": 56392 + }, + { + "epoch": 5.262013623215452, + "grad_norm": NaN, + "learning_rate": 1.177162051156667e-05, + "loss": 0.0, + "step": 56393 + }, + { + "epoch": 5.26210693291033, + "grad_norm": NaN, + "learning_rate": 1.1768683481480285e-05, + "loss": 0.0, + "step": 56394 + }, + { + "epoch": 5.262200242605207, + "grad_norm": NaN, + "learning_rate": 1.176574680287521e-05, + "loss": 0.0, + "step": 56395 + }, + { + "epoch": 5.262293552300084, + "grad_norm": NaN, + "learning_rate": 1.1762810475758871e-05, + "loss": 0.0, + "step": 56396 + }, + { + "epoch": 5.262386861994961, + "grad_norm": NaN, + "learning_rate": 1.1759874500138767e-05, + "loss": 0.0, + "step": 56397 + }, + { + "epoch": 5.262480171689838, + "grad_norm": NaN, + "learning_rate": 1.1756938876022353e-05, + "loss": 0.0, + "step": 56398 + }, + { + "epoch": 5.262573481384716, + "grad_norm": NaN, + "learning_rate": 1.1754003603417094e-05, + "loss": 0.0, + "step": 56399 + }, + { + "epoch": 5.262666791079593, + "grad_norm": NaN, + "learning_rate": 1.175106868233045e-05, + "loss": 0.0, + "step": 56400 + }, + { + "epoch": 5.262760100774471, + "grad_norm": NaN, + "learning_rate": 1.1748134112769897e-05, + "loss": 0.0, + "step": 56401 + }, + { + "epoch": 5.262853410469348, + "grad_norm": NaN, + "learning_rate": 1.1745199894742862e-05, + "loss": 0.0, + "step": 56402 + }, + { + "epoch": 5.262946720164225, + "grad_norm": NaN, + "learning_rate": 1.1742266028256841e-05, + "loss": 0.0, + "step": 56403 + }, + { + "epoch": 5.263040029859102, + "grad_norm": NaN, + "learning_rate": 1.1739332513319293e-05, + "loss": 0.0, + "step": 56404 + }, + { + "epoch": 5.263133339553979, + "grad_norm": NaN, + "learning_rate": 1.1736399349937665e-05, + "loss": 0.0, + "step": 56405 + }, + { + "epoch": 5.263226649248857, + "grad_norm": NaN, + "learning_rate": 1.1733466538119395e-05, + "loss": 0.0, + "step": 56406 + }, + { + "epoch": 5.263319958943734, + "grad_norm": NaN, + "learning_rate": 1.1730534077871984e-05, + "loss": 0.0, + "step": 56407 + }, + { + "epoch": 5.263413268638612, + "grad_norm": NaN, + "learning_rate": 1.1727601969202854e-05, + "loss": 0.0, + "step": 56408 + }, + { + "epoch": 5.263506578333489, + "grad_norm": NaN, + "learning_rate": 1.1724670212119486e-05, + "loss": 0.0, + "step": 56409 + }, + { + "epoch": 5.2635998880283665, + "grad_norm": NaN, + "learning_rate": 1.1721738806629305e-05, + "loss": 0.0, + "step": 56410 + }, + { + "epoch": 5.263693197723243, + "grad_norm": NaN, + "learning_rate": 1.1718807752739789e-05, + "loss": 0.0, + "step": 56411 + }, + { + "epoch": 5.2637865074181205, + "grad_norm": NaN, + "learning_rate": 1.1715877050458382e-05, + "loss": 0.0, + "step": 56412 + }, + { + "epoch": 5.263879817112998, + "grad_norm": NaN, + "learning_rate": 1.1712946699792548e-05, + "loss": 0.0, + "step": 56413 + }, + { + "epoch": 5.263973126807875, + "grad_norm": NaN, + "learning_rate": 1.1710016700749709e-05, + "loss": 0.0, + "step": 56414 + }, + { + "epoch": 5.264066436502753, + "grad_norm": NaN, + "learning_rate": 1.1707087053337344e-05, + "loss": 0.0, + "step": 56415 + }, + { + "epoch": 5.26415974619763, + "grad_norm": NaN, + "learning_rate": 1.1704157757562883e-05, + "loss": 0.0, + "step": 56416 + }, + { + "epoch": 5.264253055892508, + "grad_norm": NaN, + "learning_rate": 1.1701228813433799e-05, + "loss": 0.0, + "step": 56417 + }, + { + "epoch": 5.264346365587384, + "grad_norm": NaN, + "learning_rate": 1.1698300220957507e-05, + "loss": 0.0, + "step": 56418 + }, + { + "epoch": 5.2644396752822615, + "grad_norm": NaN, + "learning_rate": 1.1695371980141467e-05, + "loss": 0.0, + "step": 56419 + }, + { + "epoch": 5.264532984977139, + "grad_norm": NaN, + "learning_rate": 1.1692444090993137e-05, + "loss": 0.0, + "step": 56420 + }, + { + "epoch": 5.264626294672016, + "grad_norm": NaN, + "learning_rate": 1.1689516553519962e-05, + "loss": 0.0, + "step": 56421 + }, + { + "epoch": 5.264719604366894, + "grad_norm": NaN, + "learning_rate": 1.1686589367729355e-05, + "loss": 0.0, + "step": 56422 + }, + { + "epoch": 5.264812914061771, + "grad_norm": NaN, + "learning_rate": 1.168366253362879e-05, + "loss": 0.0, + "step": 56423 + }, + { + "epoch": 5.264906223756649, + "grad_norm": NaN, + "learning_rate": 1.1680736051225698e-05, + "loss": 0.0, + "step": 56424 + }, + { + "epoch": 5.264999533451526, + "grad_norm": NaN, + "learning_rate": 1.167780992052752e-05, + "loss": 0.0, + "step": 56425 + }, + { + "epoch": 5.265092843146403, + "grad_norm": NaN, + "learning_rate": 1.1674884141541701e-05, + "loss": 0.0, + "step": 56426 + }, + { + "epoch": 5.26518615284128, + "grad_norm": NaN, + "learning_rate": 1.167195871427567e-05, + "loss": 0.0, + "step": 56427 + }, + { + "epoch": 5.265279462536157, + "grad_norm": NaN, + "learning_rate": 1.166903363873687e-05, + "loss": 0.0, + "step": 56428 + }, + { + "epoch": 5.265372772231035, + "grad_norm": NaN, + "learning_rate": 1.1666108914932743e-05, + "loss": 0.0, + "step": 56429 + }, + { + "epoch": 5.265466081925912, + "grad_norm": NaN, + "learning_rate": 1.1663184542870734e-05, + "loss": 0.0, + "step": 56430 + }, + { + "epoch": 5.26555939162079, + "grad_norm": NaN, + "learning_rate": 1.1660260522558257e-05, + "loss": 0.0, + "step": 56431 + }, + { + "epoch": 5.265652701315667, + "grad_norm": NaN, + "learning_rate": 1.165733685400277e-05, + "loss": 0.0, + "step": 56432 + }, + { + "epoch": 5.265746011010544, + "grad_norm": NaN, + "learning_rate": 1.1654413537211682e-05, + "loss": 0.0, + "step": 56433 + }, + { + "epoch": 5.265839320705421, + "grad_norm": NaN, + "learning_rate": 1.165149057219244e-05, + "loss": 0.0, + "step": 56434 + }, + { + "epoch": 5.2659326304002985, + "grad_norm": NaN, + "learning_rate": 1.1648567958952487e-05, + "loss": 0.0, + "step": 56435 + }, + { + "epoch": 5.266025940095176, + "grad_norm": NaN, + "learning_rate": 1.1645645697499234e-05, + "loss": 0.0, + "step": 56436 + }, + { + "epoch": 5.266119249790053, + "grad_norm": NaN, + "learning_rate": 1.1642723787840124e-05, + "loss": 0.0, + "step": 56437 + }, + { + "epoch": 5.266212559484931, + "grad_norm": NaN, + "learning_rate": 1.1639802229982586e-05, + "loss": 0.0, + "step": 56438 + }, + { + "epoch": 5.266305869179808, + "grad_norm": NaN, + "learning_rate": 1.163688102393403e-05, + "loss": 0.0, + "step": 56439 + }, + { + "epoch": 5.266399178874685, + "grad_norm": NaN, + "learning_rate": 1.1633960169701917e-05, + "loss": 0.0, + "step": 56440 + }, + { + "epoch": 5.266492488569562, + "grad_norm": NaN, + "learning_rate": 1.1631039667293656e-05, + "loss": 0.0, + "step": 56441 + }, + { + "epoch": 5.2665857982644395, + "grad_norm": NaN, + "learning_rate": 1.162811951671666e-05, + "loss": 0.0, + "step": 56442 + }, + { + "epoch": 5.266679107959317, + "grad_norm": NaN, + "learning_rate": 1.1625199717978373e-05, + "loss": 0.0, + "step": 56443 + }, + { + "epoch": 5.266772417654194, + "grad_norm": NaN, + "learning_rate": 1.1622280271086221e-05, + "loss": 0.0, + "step": 56444 + }, + { + "epoch": 5.266865727349072, + "grad_norm": NaN, + "learning_rate": 1.1619361176047615e-05, + "loss": 0.0, + "step": 56445 + }, + { + "epoch": 5.266959037043949, + "grad_norm": NaN, + "learning_rate": 1.1616442432869982e-05, + "loss": 0.0, + "step": 56446 + }, + { + "epoch": 5.267052346738826, + "grad_norm": NaN, + "learning_rate": 1.1613524041560734e-05, + "loss": 0.0, + "step": 56447 + }, + { + "epoch": 5.267145656433703, + "grad_norm": NaN, + "learning_rate": 1.1610606002127315e-05, + "loss": 0.0, + "step": 56448 + }, + { + "epoch": 5.267238966128581, + "grad_norm": NaN, + "learning_rate": 1.1607688314577118e-05, + "loss": 0.0, + "step": 56449 + }, + { + "epoch": 5.267332275823458, + "grad_norm": NaN, + "learning_rate": 1.1604770978917587e-05, + "loss": 0.0, + "step": 56450 + }, + { + "epoch": 5.267425585518335, + "grad_norm": NaN, + "learning_rate": 1.1601853995156134e-05, + "loss": 0.0, + "step": 56451 + }, + { + "epoch": 5.267518895213213, + "grad_norm": NaN, + "learning_rate": 1.1598937363300154e-05, + "loss": 0.0, + "step": 56452 + }, + { + "epoch": 5.26761220490809, + "grad_norm": NaN, + "learning_rate": 1.1596021083357088e-05, + "loss": 0.0, + "step": 56453 + }, + { + "epoch": 5.267705514602968, + "grad_norm": NaN, + "learning_rate": 1.1593105155334331e-05, + "loss": 0.0, + "step": 56454 + }, + { + "epoch": 5.267798824297844, + "grad_norm": NaN, + "learning_rate": 1.1590189579239311e-05, + "loss": 0.0, + "step": 56455 + }, + { + "epoch": 5.267892133992722, + "grad_norm": NaN, + "learning_rate": 1.1587274355079456e-05, + "loss": 0.0, + "step": 56456 + }, + { + "epoch": 5.267985443687599, + "grad_norm": NaN, + "learning_rate": 1.1584359482862143e-05, + "loss": 0.0, + "step": 56457 + }, + { + "epoch": 5.268078753382476, + "grad_norm": NaN, + "learning_rate": 1.1581444962594798e-05, + "loss": 0.0, + "step": 56458 + }, + { + "epoch": 5.268172063077354, + "grad_norm": NaN, + "learning_rate": 1.1578530794284851e-05, + "loss": 0.0, + "step": 56459 + }, + { + "epoch": 5.268265372772231, + "grad_norm": NaN, + "learning_rate": 1.1575616977939694e-05, + "loss": 0.0, + "step": 56460 + }, + { + "epoch": 5.268358682467109, + "grad_norm": NaN, + "learning_rate": 1.1572703513566738e-05, + "loss": 0.0, + "step": 56461 + }, + { + "epoch": 5.268451992161985, + "grad_norm": NaN, + "learning_rate": 1.1569790401173396e-05, + "loss": 0.0, + "step": 56462 + }, + { + "epoch": 5.268545301856863, + "grad_norm": NaN, + "learning_rate": 1.1566877640767058e-05, + "loss": 0.0, + "step": 56463 + }, + { + "epoch": 5.26863861155174, + "grad_norm": NaN, + "learning_rate": 1.1563965232355155e-05, + "loss": 0.0, + "step": 56464 + }, + { + "epoch": 5.2687319212466175, + "grad_norm": NaN, + "learning_rate": 1.1561053175945079e-05, + "loss": 0.0, + "step": 56465 + }, + { + "epoch": 5.268825230941495, + "grad_norm": NaN, + "learning_rate": 1.1558141471544225e-05, + "loss": 0.0, + "step": 56466 + }, + { + "epoch": 5.268918540636372, + "grad_norm": NaN, + "learning_rate": 1.155523011916002e-05, + "loss": 0.0, + "step": 56467 + }, + { + "epoch": 5.26901185033125, + "grad_norm": NaN, + "learning_rate": 1.1552319118799842e-05, + "loss": 0.0, + "step": 56468 + }, + { + "epoch": 5.269105160026127, + "grad_norm": NaN, + "learning_rate": 1.1549408470471117e-05, + "loss": 0.0, + "step": 56469 + }, + { + "epoch": 5.269198469721004, + "grad_norm": NaN, + "learning_rate": 1.1546498174181224e-05, + "loss": 0.0, + "step": 56470 + }, + { + "epoch": 5.269291779415881, + "grad_norm": NaN, + "learning_rate": 1.154358822993759e-05, + "loss": 0.0, + "step": 56471 + }, + { + "epoch": 5.2693850891107585, + "grad_norm": NaN, + "learning_rate": 1.1540678637747592e-05, + "loss": 0.0, + "step": 56472 + }, + { + "epoch": 5.269478398805636, + "grad_norm": NaN, + "learning_rate": 1.1537769397618624e-05, + "loss": 0.0, + "step": 56473 + }, + { + "epoch": 5.269571708500513, + "grad_norm": NaN, + "learning_rate": 1.1534860509558097e-05, + "loss": 0.0, + "step": 56474 + }, + { + "epoch": 5.269665018195391, + "grad_norm": NaN, + "learning_rate": 1.1531951973573405e-05, + "loss": 0.0, + "step": 56475 + }, + { + "epoch": 5.269758327890268, + "grad_norm": NaN, + "learning_rate": 1.1529043789671944e-05, + "loss": 0.0, + "step": 56476 + }, + { + "epoch": 5.269851637585145, + "grad_norm": NaN, + "learning_rate": 1.1526135957861088e-05, + "loss": 0.0, + "step": 56477 + }, + { + "epoch": 5.269944947280022, + "grad_norm": NaN, + "learning_rate": 1.1523228478148266e-05, + "loss": 0.0, + "step": 56478 + }, + { + "epoch": 5.2700382569749, + "grad_norm": NaN, + "learning_rate": 1.1520321350540856e-05, + "loss": 0.0, + "step": 56479 + }, + { + "epoch": 5.270131566669777, + "grad_norm": NaN, + "learning_rate": 1.1517414575046252e-05, + "loss": 0.0, + "step": 56480 + }, + { + "epoch": 5.270224876364654, + "grad_norm": NaN, + "learning_rate": 1.151450815167183e-05, + "loss": 0.0, + "step": 56481 + }, + { + "epoch": 5.270318186059532, + "grad_norm": NaN, + "learning_rate": 1.1511602080424986e-05, + "loss": 0.0, + "step": 56482 + }, + { + "epoch": 5.270411495754409, + "grad_norm": NaN, + "learning_rate": 1.150869636131313e-05, + "loss": 0.0, + "step": 56483 + }, + { + "epoch": 5.270504805449286, + "grad_norm": NaN, + "learning_rate": 1.1505790994343639e-05, + "loss": 0.0, + "step": 56484 + }, + { + "epoch": 5.270598115144163, + "grad_norm": NaN, + "learning_rate": 1.1502885979523874e-05, + "loss": 0.0, + "step": 56485 + }, + { + "epoch": 5.270691424839041, + "grad_norm": NaN, + "learning_rate": 1.1499981316861263e-05, + "loss": 0.0, + "step": 56486 + }, + { + "epoch": 5.270784734533918, + "grad_norm": NaN, + "learning_rate": 1.1497077006363165e-05, + "loss": 0.0, + "step": 56487 + }, + { + "epoch": 5.2708780442287955, + "grad_norm": NaN, + "learning_rate": 1.1494173048036975e-05, + "loss": 0.0, + "step": 56488 + }, + { + "epoch": 5.270971353923673, + "grad_norm": NaN, + "learning_rate": 1.1491269441890072e-05, + "loss": 0.0, + "step": 56489 + }, + { + "epoch": 5.27106466361855, + "grad_norm": NaN, + "learning_rate": 1.1488366187929848e-05, + "loss": 0.0, + "step": 56490 + }, + { + "epoch": 5.271157973313427, + "grad_norm": NaN, + "learning_rate": 1.1485463286163665e-05, + "loss": 0.0, + "step": 56491 + }, + { + "epoch": 5.271251283008304, + "grad_norm": NaN, + "learning_rate": 1.1482560736598934e-05, + "loss": 0.0, + "step": 56492 + }, + { + "epoch": 5.271344592703182, + "grad_norm": NaN, + "learning_rate": 1.1479658539242997e-05, + "loss": 0.0, + "step": 56493 + }, + { + "epoch": 5.271437902398059, + "grad_norm": NaN, + "learning_rate": 1.1476756694103284e-05, + "loss": 0.0, + "step": 56494 + }, + { + "epoch": 5.2715312120929365, + "grad_norm": NaN, + "learning_rate": 1.147385520118712e-05, + "loss": 0.0, + "step": 56495 + }, + { + "epoch": 5.271624521787814, + "grad_norm": NaN, + "learning_rate": 1.1470954060501918e-05, + "loss": 0.0, + "step": 56496 + }, + { + "epoch": 5.271717831482691, + "grad_norm": NaN, + "learning_rate": 1.1468053272055056e-05, + "loss": 0.0, + "step": 56497 + }, + { + "epoch": 5.271811141177569, + "grad_norm": NaN, + "learning_rate": 1.1465152835853874e-05, + "loss": 0.0, + "step": 56498 + }, + { + "epoch": 5.271904450872445, + "grad_norm": NaN, + "learning_rate": 1.1462252751905788e-05, + "loss": 0.0, + "step": 56499 + }, + { + "epoch": 5.271997760567323, + "grad_norm": NaN, + "learning_rate": 1.1459353020218154e-05, + "loss": 0.0, + "step": 56500 + }, + { + "epoch": 5.2720910702622, + "grad_norm": NaN, + "learning_rate": 1.1456453640798352e-05, + "loss": 0.0, + "step": 56501 + }, + { + "epoch": 5.272184379957078, + "grad_norm": NaN, + "learning_rate": 1.1453554613653743e-05, + "loss": 0.0, + "step": 56502 + }, + { + "epoch": 5.272277689651955, + "grad_norm": NaN, + "learning_rate": 1.1450655938791703e-05, + "loss": 0.0, + "step": 56503 + }, + { + "epoch": 5.272370999346832, + "grad_norm": NaN, + "learning_rate": 1.1447757616219594e-05, + "loss": 0.0, + "step": 56504 + }, + { + "epoch": 5.27246430904171, + "grad_norm": NaN, + "learning_rate": 1.1444859645944809e-05, + "loss": 0.0, + "step": 56505 + }, + { + "epoch": 5.272557618736586, + "grad_norm": NaN, + "learning_rate": 1.1441962027974694e-05, + "loss": 0.0, + "step": 56506 + }, + { + "epoch": 5.272650928431464, + "grad_norm": NaN, + "learning_rate": 1.143906476231664e-05, + "loss": 0.0, + "step": 56507 + }, + { + "epoch": 5.272744238126341, + "grad_norm": NaN, + "learning_rate": 1.1436167848977978e-05, + "loss": 0.0, + "step": 56508 + }, + { + "epoch": 5.272837547821219, + "grad_norm": NaN, + "learning_rate": 1.1433271287966116e-05, + "loss": 0.0, + "step": 56509 + }, + { + "epoch": 5.272930857516096, + "grad_norm": NaN, + "learning_rate": 1.1430375079288384e-05, + "loss": 0.0, + "step": 56510 + }, + { + "epoch": 5.2730241672109734, + "grad_norm": NaN, + "learning_rate": 1.1427479222952173e-05, + "loss": 0.0, + "step": 56511 + }, + { + "epoch": 5.273117476905851, + "grad_norm": NaN, + "learning_rate": 1.1424583718964814e-05, + "loss": 0.0, + "step": 56512 + }, + { + "epoch": 5.273210786600728, + "grad_norm": NaN, + "learning_rate": 1.1421688567333715e-05, + "loss": 0.0, + "step": 56513 + }, + { + "epoch": 5.273304096295605, + "grad_norm": NaN, + "learning_rate": 1.1418793768066204e-05, + "loss": 0.0, + "step": 56514 + }, + { + "epoch": 5.273397405990482, + "grad_norm": NaN, + "learning_rate": 1.1415899321169641e-05, + "loss": 0.0, + "step": 56515 + }, + { + "epoch": 5.27349071568536, + "grad_norm": NaN, + "learning_rate": 1.141300522665139e-05, + "loss": 0.0, + "step": 56516 + }, + { + "epoch": 5.273584025380237, + "grad_norm": NaN, + "learning_rate": 1.1410111484518825e-05, + "loss": 0.0, + "step": 56517 + }, + { + "epoch": 5.2736773350751145, + "grad_norm": NaN, + "learning_rate": 1.1407218094779291e-05, + "loss": 0.0, + "step": 56518 + }, + { + "epoch": 5.273770644769992, + "grad_norm": NaN, + "learning_rate": 1.140432505744015e-05, + "loss": 0.0, + "step": 56519 + }, + { + "epoch": 5.273863954464868, + "grad_norm": NaN, + "learning_rate": 1.1401432372508763e-05, + "loss": 0.0, + "step": 56520 + }, + { + "epoch": 5.273957264159746, + "grad_norm": NaN, + "learning_rate": 1.1398540039992454e-05, + "loss": 0.0, + "step": 56521 + }, + { + "epoch": 5.274050573854623, + "grad_norm": NaN, + "learning_rate": 1.139564805989862e-05, + "loss": 0.0, + "step": 56522 + }, + { + "epoch": 5.274143883549501, + "grad_norm": NaN, + "learning_rate": 1.1392756432234589e-05, + "loss": 0.0, + "step": 56523 + }, + { + "epoch": 5.274237193244378, + "grad_norm": NaN, + "learning_rate": 1.1389865157007721e-05, + "loss": 0.0, + "step": 56524 + }, + { + "epoch": 5.2743305029392555, + "grad_norm": NaN, + "learning_rate": 1.138697423422536e-05, + "loss": 0.0, + "step": 56525 + }, + { + "epoch": 5.274423812634133, + "grad_norm": NaN, + "learning_rate": 1.1384083663894883e-05, + "loss": 0.0, + "step": 56526 + }, + { + "epoch": 5.27451712232901, + "grad_norm": NaN, + "learning_rate": 1.13811934460236e-05, + "loss": 0.0, + "step": 56527 + }, + { + "epoch": 5.274610432023887, + "grad_norm": NaN, + "learning_rate": 1.1378303580618892e-05, + "loss": 0.0, + "step": 56528 + }, + { + "epoch": 5.274703741718764, + "grad_norm": NaN, + "learning_rate": 1.1375414067688083e-05, + "loss": 0.0, + "step": 56529 + }, + { + "epoch": 5.274797051413642, + "grad_norm": NaN, + "learning_rate": 1.1372524907238534e-05, + "loss": 0.0, + "step": 56530 + }, + { + "epoch": 5.274890361108519, + "grad_norm": NaN, + "learning_rate": 1.1369636099277607e-05, + "loss": 0.0, + "step": 56531 + }, + { + "epoch": 5.274983670803397, + "grad_norm": NaN, + "learning_rate": 1.1366747643812612e-05, + "loss": 0.0, + "step": 56532 + }, + { + "epoch": 5.275076980498274, + "grad_norm": NaN, + "learning_rate": 1.136385954085091e-05, + "loss": 0.0, + "step": 56533 + }, + { + "epoch": 5.275170290193151, + "grad_norm": NaN, + "learning_rate": 1.1360971790399864e-05, + "loss": 0.0, + "step": 56534 + }, + { + "epoch": 5.275263599888028, + "grad_norm": NaN, + "learning_rate": 1.1358084392466782e-05, + "loss": 0.0, + "step": 56535 + }, + { + "epoch": 5.275356909582905, + "grad_norm": NaN, + "learning_rate": 1.1355197347059042e-05, + "loss": 0.0, + "step": 56536 + }, + { + "epoch": 5.275450219277783, + "grad_norm": NaN, + "learning_rate": 1.1352310654183939e-05, + "loss": 0.0, + "step": 56537 + }, + { + "epoch": 5.27554352897266, + "grad_norm": NaN, + "learning_rate": 1.1349424313848866e-05, + "loss": 0.0, + "step": 56538 + }, + { + "epoch": 5.275636838667538, + "grad_norm": NaN, + "learning_rate": 1.1346538326061116e-05, + "loss": 0.0, + "step": 56539 + }, + { + "epoch": 5.275730148362415, + "grad_norm": NaN, + "learning_rate": 1.1343652690828054e-05, + "loss": 0.0, + "step": 56540 + }, + { + "epoch": 5.2758234580572925, + "grad_norm": NaN, + "learning_rate": 1.1340767408157003e-05, + "loss": 0.0, + "step": 56541 + }, + { + "epoch": 5.27591676775217, + "grad_norm": NaN, + "learning_rate": 1.1337882478055327e-05, + "loss": 0.0, + "step": 56542 + }, + { + "epoch": 5.276010077447046, + "grad_norm": NaN, + "learning_rate": 1.1334997900530317e-05, + "loss": 0.0, + "step": 56543 + }, + { + "epoch": 5.276103387141924, + "grad_norm": NaN, + "learning_rate": 1.1332113675589355e-05, + "loss": 0.0, + "step": 56544 + }, + { + "epoch": 5.276196696836801, + "grad_norm": NaN, + "learning_rate": 1.132922980323973e-05, + "loss": 0.0, + "step": 56545 + }, + { + "epoch": 5.276290006531679, + "grad_norm": NaN, + "learning_rate": 1.1326346283488808e-05, + "loss": 0.0, + "step": 56546 + }, + { + "epoch": 5.276383316226556, + "grad_norm": NaN, + "learning_rate": 1.1323463116343912e-05, + "loss": 0.0, + "step": 56547 + }, + { + "epoch": 5.2764766259214335, + "grad_norm": NaN, + "learning_rate": 1.1320580301812354e-05, + "loss": 0.0, + "step": 56548 + }, + { + "epoch": 5.276569935616311, + "grad_norm": NaN, + "learning_rate": 1.1317697839901496e-05, + "loss": 0.0, + "step": 56549 + }, + { + "epoch": 5.2766632453111875, + "grad_norm": NaN, + "learning_rate": 1.1314815730618648e-05, + "loss": 0.0, + "step": 56550 + }, + { + "epoch": 5.276756555006065, + "grad_norm": NaN, + "learning_rate": 1.1311933973971137e-05, + "loss": 0.0, + "step": 56551 + }, + { + "epoch": 5.276849864700942, + "grad_norm": NaN, + "learning_rate": 1.1309052569966309e-05, + "loss": 0.0, + "step": 56552 + }, + { + "epoch": 5.27694317439582, + "grad_norm": NaN, + "learning_rate": 1.1306171518611456e-05, + "loss": 0.0, + "step": 56553 + }, + { + "epoch": 5.277036484090697, + "grad_norm": NaN, + "learning_rate": 1.130329081991394e-05, + "loss": 0.0, + "step": 56554 + }, + { + "epoch": 5.277129793785575, + "grad_norm": NaN, + "learning_rate": 1.1300410473881071e-05, + "loss": 0.0, + "step": 56555 + }, + { + "epoch": 5.277223103480452, + "grad_norm": NaN, + "learning_rate": 1.1297530480520162e-05, + "loss": 0.0, + "step": 56556 + }, + { + "epoch": 5.2773164131753285, + "grad_norm": NaN, + "learning_rate": 1.1294650839838553e-05, + "loss": 0.0, + "step": 56557 + }, + { + "epoch": 5.277409722870206, + "grad_norm": NaN, + "learning_rate": 1.1291771551843559e-05, + "loss": 0.0, + "step": 56558 + }, + { + "epoch": 5.277503032565083, + "grad_norm": NaN, + "learning_rate": 1.1288892616542505e-05, + "loss": 0.0, + "step": 56559 + }, + { + "epoch": 5.277596342259961, + "grad_norm": NaN, + "learning_rate": 1.1286014033942702e-05, + "loss": 0.0, + "step": 56560 + }, + { + "epoch": 5.277689651954838, + "grad_norm": NaN, + "learning_rate": 1.128313580405148e-05, + "loss": 0.0, + "step": 56561 + }, + { + "epoch": 5.277782961649716, + "grad_norm": NaN, + "learning_rate": 1.1280257926876147e-05, + "loss": 0.0, + "step": 56562 + }, + { + "epoch": 5.277876271344593, + "grad_norm": NaN, + "learning_rate": 1.127738040242403e-05, + "loss": 0.0, + "step": 56563 + }, + { + "epoch": 5.27796958103947, + "grad_norm": NaN, + "learning_rate": 1.1274503230702442e-05, + "loss": 0.0, + "step": 56564 + }, + { + "epoch": 5.278062890734347, + "grad_norm": NaN, + "learning_rate": 1.1271626411718692e-05, + "loss": 0.0, + "step": 56565 + }, + { + "epoch": 5.278156200429224, + "grad_norm": NaN, + "learning_rate": 1.126874994548011e-05, + "loss": 0.0, + "step": 56566 + }, + { + "epoch": 5.278249510124102, + "grad_norm": NaN, + "learning_rate": 1.1265873831993989e-05, + "loss": 0.0, + "step": 56567 + }, + { + "epoch": 5.278342819818979, + "grad_norm": NaN, + "learning_rate": 1.126299807126767e-05, + "loss": 0.0, + "step": 56568 + }, + { + "epoch": 5.278436129513857, + "grad_norm": NaN, + "learning_rate": 1.1260122663308451e-05, + "loss": 0.0, + "step": 56569 + }, + { + "epoch": 5.278529439208734, + "grad_norm": NaN, + "learning_rate": 1.1257247608123626e-05, + "loss": 0.0, + "step": 56570 + }, + { + "epoch": 5.2786227489036115, + "grad_norm": NaN, + "learning_rate": 1.1254372905720538e-05, + "loss": 0.0, + "step": 56571 + }, + { + "epoch": 5.278716058598488, + "grad_norm": NaN, + "learning_rate": 1.1251498556106481e-05, + "loss": 0.0, + "step": 56572 + }, + { + "epoch": 5.2788093682933654, + "grad_norm": NaN, + "learning_rate": 1.1248624559288748e-05, + "loss": 0.0, + "step": 56573 + }, + { + "epoch": 5.278902677988243, + "grad_norm": NaN, + "learning_rate": 1.124575091527467e-05, + "loss": 0.0, + "step": 56574 + }, + { + "epoch": 5.27899598768312, + "grad_norm": NaN, + "learning_rate": 1.1242877624071538e-05, + "loss": 0.0, + "step": 56575 + }, + { + "epoch": 5.279089297377998, + "grad_norm": NaN, + "learning_rate": 1.1240004685686683e-05, + "loss": 0.0, + "step": 56576 + }, + { + "epoch": 5.279182607072875, + "grad_norm": NaN, + "learning_rate": 1.1237132100127377e-05, + "loss": 0.0, + "step": 56577 + }, + { + "epoch": 5.2792759167677525, + "grad_norm": NaN, + "learning_rate": 1.1234259867400935e-05, + "loss": 0.0, + "step": 56578 + }, + { + "epoch": 5.279369226462629, + "grad_norm": NaN, + "learning_rate": 1.1231387987514685e-05, + "loss": 0.0, + "step": 56579 + }, + { + "epoch": 5.2794625361575065, + "grad_norm": NaN, + "learning_rate": 1.12285164604759e-05, + "loss": 0.0, + "step": 56580 + }, + { + "epoch": 5.279555845852384, + "grad_norm": NaN, + "learning_rate": 1.1225645286291879e-05, + "loss": 0.0, + "step": 56581 + }, + { + "epoch": 5.279649155547261, + "grad_norm": NaN, + "learning_rate": 1.1222774464969947e-05, + "loss": 0.0, + "step": 56582 + }, + { + "epoch": 5.279742465242139, + "grad_norm": NaN, + "learning_rate": 1.1219903996517382e-05, + "loss": 0.0, + "step": 56583 + }, + { + "epoch": 5.279835774937016, + "grad_norm": NaN, + "learning_rate": 1.1217033880941495e-05, + "loss": 0.0, + "step": 56584 + }, + { + "epoch": 5.279929084631894, + "grad_norm": NaN, + "learning_rate": 1.121416411824958e-05, + "loss": 0.0, + "step": 56585 + }, + { + "epoch": 5.280022394326771, + "grad_norm": NaN, + "learning_rate": 1.1211294708448947e-05, + "loss": 0.0, + "step": 56586 + }, + { + "epoch": 5.2801157040216475, + "grad_norm": NaN, + "learning_rate": 1.120842565154686e-05, + "loss": 0.0, + "step": 56587 + }, + { + "epoch": 5.280209013716525, + "grad_norm": NaN, + "learning_rate": 1.1205556947550643e-05, + "loss": 0.0, + "step": 56588 + }, + { + "epoch": 5.280302323411402, + "grad_norm": NaN, + "learning_rate": 1.1202688596467573e-05, + "loss": 0.0, + "step": 56589 + }, + { + "epoch": 5.28039563310628, + "grad_norm": NaN, + "learning_rate": 1.1199820598304965e-05, + "loss": 0.0, + "step": 56590 + }, + { + "epoch": 5.280488942801157, + "grad_norm": NaN, + "learning_rate": 1.1196952953070093e-05, + "loss": 0.0, + "step": 56591 + }, + { + "epoch": 5.280582252496035, + "grad_norm": NaN, + "learning_rate": 1.1194085660770252e-05, + "loss": 0.0, + "step": 56592 + }, + { + "epoch": 5.280675562190911, + "grad_norm": NaN, + "learning_rate": 1.119121872141272e-05, + "loss": 0.0, + "step": 56593 + }, + { + "epoch": 5.280768871885789, + "grad_norm": NaN, + "learning_rate": 1.1188352135004824e-05, + "loss": 0.0, + "step": 56594 + }, + { + "epoch": 5.280862181580666, + "grad_norm": NaN, + "learning_rate": 1.1185485901553809e-05, + "loss": 0.0, + "step": 56595 + }, + { + "epoch": 5.280955491275543, + "grad_norm": NaN, + "learning_rate": 1.1182620021067002e-05, + "loss": 0.0, + "step": 56596 + }, + { + "epoch": 5.281048800970421, + "grad_norm": NaN, + "learning_rate": 1.1179754493551663e-05, + "loss": 0.0, + "step": 56597 + }, + { + "epoch": 5.281142110665298, + "grad_norm": NaN, + "learning_rate": 1.1176889319015086e-05, + "loss": 0.0, + "step": 56598 + }, + { + "epoch": 5.281235420360176, + "grad_norm": NaN, + "learning_rate": 1.1174024497464551e-05, + "loss": 0.0, + "step": 56599 + }, + { + "epoch": 5.281328730055053, + "grad_norm": NaN, + "learning_rate": 1.117116002890735e-05, + "loss": 0.0, + "step": 56600 + }, + { + "epoch": 5.28142203974993, + "grad_norm": NaN, + "learning_rate": 1.1168295913350778e-05, + "loss": 0.0, + "step": 56601 + }, + { + "epoch": 5.281515349444807, + "grad_norm": NaN, + "learning_rate": 1.1165432150802094e-05, + "loss": 0.0, + "step": 56602 + }, + { + "epoch": 5.2816086591396845, + "grad_norm": NaN, + "learning_rate": 1.116256874126858e-05, + "loss": 0.0, + "step": 56603 + }, + { + "epoch": 5.281701968834562, + "grad_norm": NaN, + "learning_rate": 1.1159705684757541e-05, + "loss": 0.0, + "step": 56604 + }, + { + "epoch": 5.281795278529439, + "grad_norm": NaN, + "learning_rate": 1.1156842981276242e-05, + "loss": 0.0, + "step": 56605 + }, + { + "epoch": 5.281888588224317, + "grad_norm": NaN, + "learning_rate": 1.115398063083196e-05, + "loss": 0.0, + "step": 56606 + }, + { + "epoch": 5.281981897919194, + "grad_norm": NaN, + "learning_rate": 1.1151118633431988e-05, + "loss": 0.0, + "step": 56607 + }, + { + "epoch": 5.282075207614071, + "grad_norm": NaN, + "learning_rate": 1.114825698908357e-05, + "loss": 0.0, + "step": 56608 + }, + { + "epoch": 5.282168517308948, + "grad_norm": NaN, + "learning_rate": 1.1145395697794019e-05, + "loss": 0.0, + "step": 56609 + }, + { + "epoch": 5.2822618270038255, + "grad_norm": NaN, + "learning_rate": 1.1142534759570592e-05, + "loss": 0.0, + "step": 56610 + }, + { + "epoch": 5.282355136698703, + "grad_norm": NaN, + "learning_rate": 1.113967417442057e-05, + "loss": 0.0, + "step": 56611 + }, + { + "epoch": 5.28244844639358, + "grad_norm": NaN, + "learning_rate": 1.1136813942351214e-05, + "loss": 0.0, + "step": 56612 + }, + { + "epoch": 5.282541756088458, + "grad_norm": NaN, + "learning_rate": 1.1133954063369799e-05, + "loss": 0.0, + "step": 56613 + }, + { + "epoch": 5.282635065783335, + "grad_norm": NaN, + "learning_rate": 1.113109453748362e-05, + "loss": 0.0, + "step": 56614 + }, + { + "epoch": 5.282728375478213, + "grad_norm": NaN, + "learning_rate": 1.1128235364699922e-05, + "loss": 0.0, + "step": 56615 + }, + { + "epoch": 5.282821685173089, + "grad_norm": NaN, + "learning_rate": 1.1125376545025999e-05, + "loss": 0.0, + "step": 56616 + }, + { + "epoch": 5.282914994867967, + "grad_norm": NaN, + "learning_rate": 1.1122518078469095e-05, + "loss": 0.0, + "step": 56617 + }, + { + "epoch": 5.283008304562844, + "grad_norm": NaN, + "learning_rate": 1.1119659965036488e-05, + "loss": 0.0, + "step": 56618 + }, + { + "epoch": 5.283101614257721, + "grad_norm": NaN, + "learning_rate": 1.1116802204735453e-05, + "loss": 0.0, + "step": 56619 + }, + { + "epoch": 5.283194923952599, + "grad_norm": NaN, + "learning_rate": 1.1113944797573237e-05, + "loss": 0.0, + "step": 56620 + }, + { + "epoch": 5.283288233647476, + "grad_norm": NaN, + "learning_rate": 1.1111087743557118e-05, + "loss": 0.0, + "step": 56621 + }, + { + "epoch": 5.283381543342354, + "grad_norm": NaN, + "learning_rate": 1.1108231042694372e-05, + "loss": 0.0, + "step": 56622 + }, + { + "epoch": 5.28347485303723, + "grad_norm": NaN, + "learning_rate": 1.1105374694992259e-05, + "loss": 0.0, + "step": 56623 + }, + { + "epoch": 5.283568162732108, + "grad_norm": NaN, + "learning_rate": 1.1102518700458024e-05, + "loss": 0.0, + "step": 56624 + }, + { + "epoch": 5.283661472426985, + "grad_norm": NaN, + "learning_rate": 1.1099663059098929e-05, + "loss": 0.0, + "step": 56625 + }, + { + "epoch": 5.2837547821218624, + "grad_norm": NaN, + "learning_rate": 1.1096807770922267e-05, + "loss": 0.0, + "step": 56626 + }, + { + "epoch": 5.28384809181674, + "grad_norm": NaN, + "learning_rate": 1.1093952835935265e-05, + "loss": 0.0, + "step": 56627 + }, + { + "epoch": 5.283941401511617, + "grad_norm": NaN, + "learning_rate": 1.1091098254145186e-05, + "loss": 0.0, + "step": 56628 + }, + { + "epoch": 5.284034711206495, + "grad_norm": NaN, + "learning_rate": 1.1088244025559306e-05, + "loss": 0.0, + "step": 56629 + }, + { + "epoch": 5.284128020901371, + "grad_norm": NaN, + "learning_rate": 1.1085390150184869e-05, + "loss": 0.0, + "step": 56630 + }, + { + "epoch": 5.284221330596249, + "grad_norm": NaN, + "learning_rate": 1.1082536628029138e-05, + "loss": 0.0, + "step": 56631 + }, + { + "epoch": 5.284314640291126, + "grad_norm": NaN, + "learning_rate": 1.107968345909937e-05, + "loss": 0.0, + "step": 56632 + }, + { + "epoch": 5.2844079499860035, + "grad_norm": NaN, + "learning_rate": 1.1076830643402812e-05, + "loss": 0.0, + "step": 56633 + }, + { + "epoch": 5.284501259680881, + "grad_norm": NaN, + "learning_rate": 1.1073978180946725e-05, + "loss": 0.0, + "step": 56634 + }, + { + "epoch": 5.284594569375758, + "grad_norm": NaN, + "learning_rate": 1.107112607173835e-05, + "loss": 0.0, + "step": 56635 + }, + { + "epoch": 5.284687879070636, + "grad_norm": NaN, + "learning_rate": 1.1068274315784953e-05, + "loss": 0.0, + "step": 56636 + }, + { + "epoch": 5.284781188765512, + "grad_norm": NaN, + "learning_rate": 1.1065422913093775e-05, + "loss": 0.0, + "step": 56637 + }, + { + "epoch": 5.28487449846039, + "grad_norm": NaN, + "learning_rate": 1.1062571863672076e-05, + "loss": 0.0, + "step": 56638 + }, + { + "epoch": 5.284967808155267, + "grad_norm": NaN, + "learning_rate": 1.1059721167527102e-05, + "loss": 0.0, + "step": 56639 + }, + { + "epoch": 5.2850611178501445, + "grad_norm": NaN, + "learning_rate": 1.1056870824666097e-05, + "loss": 0.0, + "step": 56640 + }, + { + "epoch": 5.285154427545022, + "grad_norm": NaN, + "learning_rate": 1.1054020835096306e-05, + "loss": 0.0, + "step": 56641 + }, + { + "epoch": 5.285247737239899, + "grad_norm": NaN, + "learning_rate": 1.1051171198824988e-05, + "loss": 0.0, + "step": 56642 + }, + { + "epoch": 5.285341046934777, + "grad_norm": NaN, + "learning_rate": 1.1048321915859387e-05, + "loss": 0.0, + "step": 56643 + }, + { + "epoch": 5.285434356629654, + "grad_norm": NaN, + "learning_rate": 1.1045472986206716e-05, + "loss": 0.0, + "step": 56644 + }, + { + "epoch": 5.285527666324531, + "grad_norm": NaN, + "learning_rate": 1.1042624409874285e-05, + "loss": 0.0, + "step": 56645 + }, + { + "epoch": 5.285620976019408, + "grad_norm": NaN, + "learning_rate": 1.1039776186869287e-05, + "loss": 0.0, + "step": 56646 + }, + { + "epoch": 5.285714285714286, + "grad_norm": NaN, + "learning_rate": 1.1036928317198934e-05, + "loss": 0.0, + "step": 56647 + }, + { + "epoch": 5.285807595409163, + "grad_norm": NaN, + "learning_rate": 1.103408080087057e-05, + "loss": 0.0, + "step": 56648 + }, + { + "epoch": 5.28590090510404, + "grad_norm": NaN, + "learning_rate": 1.103123363789134e-05, + "loss": 0.0, + "step": 56649 + }, + { + "epoch": 5.285994214798918, + "grad_norm": NaN, + "learning_rate": 1.1028386828268538e-05, + "loss": 0.0, + "step": 56650 + }, + { + "epoch": 5.286087524493795, + "grad_norm": NaN, + "learning_rate": 1.1025540372009356e-05, + "loss": 0.0, + "step": 56651 + }, + { + "epoch": 5.286180834188672, + "grad_norm": NaN, + "learning_rate": 1.1022694269121074e-05, + "loss": 0.0, + "step": 56652 + }, + { + "epoch": 5.286274143883549, + "grad_norm": NaN, + "learning_rate": 1.1019848519610902e-05, + "loss": 0.0, + "step": 56653 + }, + { + "epoch": 5.286367453578427, + "grad_norm": NaN, + "learning_rate": 1.10170031234861e-05, + "loss": 0.0, + "step": 56654 + }, + { + "epoch": 5.286460763273304, + "grad_norm": NaN, + "learning_rate": 1.1014158080753882e-05, + "loss": 0.0, + "step": 56655 + }, + { + "epoch": 5.2865540729681815, + "grad_norm": NaN, + "learning_rate": 1.1011313391421472e-05, + "loss": 0.0, + "step": 56656 + }, + { + "epoch": 5.286647382663059, + "grad_norm": NaN, + "learning_rate": 1.100846905549615e-05, + "loss": 0.0, + "step": 56657 + }, + { + "epoch": 5.286740692357936, + "grad_norm": NaN, + "learning_rate": 1.1005625072985107e-05, + "loss": 0.0, + "step": 56658 + }, + { + "epoch": 5.286834002052814, + "grad_norm": NaN, + "learning_rate": 1.1002781443895575e-05, + "loss": 0.0, + "step": 56659 + }, + { + "epoch": 5.28692731174769, + "grad_norm": NaN, + "learning_rate": 1.0999938168234813e-05, + "loss": 0.0, + "step": 56660 + }, + { + "epoch": 5.287020621442568, + "grad_norm": NaN, + "learning_rate": 1.0997095246010013e-05, + "loss": 0.0, + "step": 56661 + }, + { + "epoch": 5.287113931137445, + "grad_norm": NaN, + "learning_rate": 1.0994252677228438e-05, + "loss": 0.0, + "step": 56662 + }, + { + "epoch": 5.2872072408323225, + "grad_norm": NaN, + "learning_rate": 1.0991410461897315e-05, + "loss": 0.0, + "step": 56663 + }, + { + "epoch": 5.2873005505272, + "grad_norm": NaN, + "learning_rate": 1.098856860002384e-05, + "loss": 0.0, + "step": 56664 + }, + { + "epoch": 5.287393860222077, + "grad_norm": NaN, + "learning_rate": 1.0985727091615254e-05, + "loss": 0.0, + "step": 56665 + }, + { + "epoch": 5.287487169916955, + "grad_norm": NaN, + "learning_rate": 1.0982885936678786e-05, + "loss": 0.0, + "step": 56666 + }, + { + "epoch": 5.287580479611831, + "grad_norm": NaN, + "learning_rate": 1.0980045135221666e-05, + "loss": 0.0, + "step": 56667 + }, + { + "epoch": 5.287673789306709, + "grad_norm": NaN, + "learning_rate": 1.0977204687251118e-05, + "loss": 0.0, + "step": 56668 + }, + { + "epoch": 5.287767099001586, + "grad_norm": NaN, + "learning_rate": 1.0974364592774337e-05, + "loss": 0.0, + "step": 56669 + }, + { + "epoch": 5.287860408696464, + "grad_norm": NaN, + "learning_rate": 1.0971524851798586e-05, + "loss": 0.0, + "step": 56670 + }, + { + "epoch": 5.287953718391341, + "grad_norm": NaN, + "learning_rate": 1.0968685464331056e-05, + "loss": 0.0, + "step": 56671 + }, + { + "epoch": 5.288047028086218, + "grad_norm": NaN, + "learning_rate": 1.096584643037896e-05, + "loss": 0.0, + "step": 56672 + }, + { + "epoch": 5.288140337781096, + "grad_norm": NaN, + "learning_rate": 1.0963007749949543e-05, + "loss": 0.0, + "step": 56673 + }, + { + "epoch": 5.288233647475972, + "grad_norm": NaN, + "learning_rate": 1.0960169423050013e-05, + "loss": 0.0, + "step": 56674 + }, + { + "epoch": 5.28832695717085, + "grad_norm": NaN, + "learning_rate": 1.0957331449687583e-05, + "loss": 0.0, + "step": 56675 + }, + { + "epoch": 5.288420266865727, + "grad_norm": NaN, + "learning_rate": 1.0954493829869482e-05, + "loss": 0.0, + "step": 56676 + }, + { + "epoch": 5.288513576560605, + "grad_norm": NaN, + "learning_rate": 1.0951656563602901e-05, + "loss": 0.0, + "step": 56677 + }, + { + "epoch": 5.288606886255482, + "grad_norm": NaN, + "learning_rate": 1.0948819650895085e-05, + "loss": 0.0, + "step": 56678 + }, + { + "epoch": 5.2887001959503595, + "grad_norm": NaN, + "learning_rate": 1.0945983091753213e-05, + "loss": 0.0, + "step": 56679 + }, + { + "epoch": 5.288793505645237, + "grad_norm": NaN, + "learning_rate": 1.0943146886184512e-05, + "loss": 0.0, + "step": 56680 + }, + { + "epoch": 5.288886815340113, + "grad_norm": NaN, + "learning_rate": 1.0940311034196209e-05, + "loss": 0.0, + "step": 56681 + }, + { + "epoch": 5.288980125034991, + "grad_norm": NaN, + "learning_rate": 1.0937475535795481e-05, + "loss": 0.0, + "step": 56682 + }, + { + "epoch": 5.289073434729868, + "grad_norm": NaN, + "learning_rate": 1.0934640390989607e-05, + "loss": 0.0, + "step": 56683 + }, + { + "epoch": 5.289166744424746, + "grad_norm": NaN, + "learning_rate": 1.0931805599785715e-05, + "loss": 0.0, + "step": 56684 + }, + { + "epoch": 5.289260054119623, + "grad_norm": NaN, + "learning_rate": 1.092897116219103e-05, + "loss": 0.0, + "step": 56685 + }, + { + "epoch": 5.2893533638145005, + "grad_norm": NaN, + "learning_rate": 1.0926137078212815e-05, + "loss": 0.0, + "step": 56686 + }, + { + "epoch": 5.289446673509378, + "grad_norm": NaN, + "learning_rate": 1.0923303347858214e-05, + "loss": 0.0, + "step": 56687 + }, + { + "epoch": 5.289539983204255, + "grad_norm": NaN, + "learning_rate": 1.0920469971134438e-05, + "loss": 0.0, + "step": 56688 + }, + { + "epoch": 5.289633292899132, + "grad_norm": NaN, + "learning_rate": 1.0917636948048747e-05, + "loss": 0.0, + "step": 56689 + }, + { + "epoch": 5.289726602594009, + "grad_norm": NaN, + "learning_rate": 1.0914804278608286e-05, + "loss": 0.0, + "step": 56690 + }, + { + "epoch": 5.289819912288887, + "grad_norm": NaN, + "learning_rate": 1.0911971962820265e-05, + "loss": 0.0, + "step": 56691 + }, + { + "epoch": 5.289913221983764, + "grad_norm": NaN, + "learning_rate": 1.0909140000691929e-05, + "loss": 0.0, + "step": 56692 + }, + { + "epoch": 5.2900065316786415, + "grad_norm": NaN, + "learning_rate": 1.0906308392230439e-05, + "loss": 0.0, + "step": 56693 + }, + { + "epoch": 5.290099841373519, + "grad_norm": NaN, + "learning_rate": 1.090347713744299e-05, + "loss": 0.0, + "step": 56694 + }, + { + "epoch": 5.290193151068396, + "grad_norm": NaN, + "learning_rate": 1.0900646236336808e-05, + "loss": 0.0, + "step": 56695 + }, + { + "epoch": 5.290286460763273, + "grad_norm": NaN, + "learning_rate": 1.089781568891907e-05, + "loss": 0.0, + "step": 56696 + }, + { + "epoch": 5.29037977045815, + "grad_norm": NaN, + "learning_rate": 1.089498549519699e-05, + "loss": 0.0, + "step": 56697 + }, + { + "epoch": 5.290473080153028, + "grad_norm": NaN, + "learning_rate": 1.0892155655177743e-05, + "loss": 0.0, + "step": 56698 + }, + { + "epoch": 5.290566389847905, + "grad_norm": NaN, + "learning_rate": 1.0889326168868556e-05, + "loss": 0.0, + "step": 56699 + }, + { + "epoch": 5.290659699542783, + "grad_norm": NaN, + "learning_rate": 1.0886497036276593e-05, + "loss": 0.0, + "step": 56700 + }, + { + "epoch": 5.29075300923766, + "grad_norm": NaN, + "learning_rate": 1.0883668257409062e-05, + "loss": 0.0, + "step": 56701 + }, + { + "epoch": 5.290846318932537, + "grad_norm": NaN, + "learning_rate": 1.0880839832273142e-05, + "loss": 0.0, + "step": 56702 + }, + { + "epoch": 5.290939628627415, + "grad_norm": NaN, + "learning_rate": 1.0878011760876042e-05, + "loss": 0.0, + "step": 56703 + }, + { + "epoch": 5.291032938322291, + "grad_norm": NaN, + "learning_rate": 1.087518404322496e-05, + "loss": 0.0, + "step": 56704 + }, + { + "epoch": 5.291126248017169, + "grad_norm": NaN, + "learning_rate": 1.087235667932707e-05, + "loss": 0.0, + "step": 56705 + }, + { + "epoch": 5.291219557712046, + "grad_norm": NaN, + "learning_rate": 1.0869529669189553e-05, + "loss": 0.0, + "step": 56706 + }, + { + "epoch": 5.291312867406924, + "grad_norm": NaN, + "learning_rate": 1.0866703012819617e-05, + "loss": 0.0, + "step": 56707 + }, + { + "epoch": 5.291406177101801, + "grad_norm": NaN, + "learning_rate": 1.086387671022444e-05, + "loss": 0.0, + "step": 56708 + }, + { + "epoch": 5.2914994867966785, + "grad_norm": NaN, + "learning_rate": 1.08610507614112e-05, + "loss": 0.0, + "step": 56709 + }, + { + "epoch": 5.291592796491555, + "grad_norm": NaN, + "learning_rate": 1.0858225166387107e-05, + "loss": 0.0, + "step": 56710 + }, + { + "epoch": 5.291686106186432, + "grad_norm": NaN, + "learning_rate": 1.0855399925159324e-05, + "loss": 0.0, + "step": 56711 + }, + { + "epoch": 5.29177941588131, + "grad_norm": NaN, + "learning_rate": 1.0852575037735028e-05, + "loss": 0.0, + "step": 56712 + }, + { + "epoch": 5.291872725576187, + "grad_norm": NaN, + "learning_rate": 1.0849750504121429e-05, + "loss": 0.0, + "step": 56713 + }, + { + "epoch": 5.291966035271065, + "grad_norm": NaN, + "learning_rate": 1.0846926324325706e-05, + "loss": 0.0, + "step": 56714 + }, + { + "epoch": 5.292059344965942, + "grad_norm": NaN, + "learning_rate": 1.0844102498355001e-05, + "loss": 0.0, + "step": 56715 + }, + { + "epoch": 5.2921526546608195, + "grad_norm": NaN, + "learning_rate": 1.0841279026216542e-05, + "loss": 0.0, + "step": 56716 + }, + { + "epoch": 5.292245964355697, + "grad_norm": NaN, + "learning_rate": 1.0838455907917459e-05, + "loss": 0.0, + "step": 56717 + }, + { + "epoch": 5.2923392740505735, + "grad_norm": NaN, + "learning_rate": 1.0835633143465011e-05, + "loss": 0.0, + "step": 56718 + }, + { + "epoch": 5.292432583745451, + "grad_norm": NaN, + "learning_rate": 1.0832810732866309e-05, + "loss": 0.0, + "step": 56719 + }, + { + "epoch": 5.292525893440328, + "grad_norm": NaN, + "learning_rate": 1.0829988676128499e-05, + "loss": 0.0, + "step": 56720 + }, + { + "epoch": 5.292619203135206, + "grad_norm": NaN, + "learning_rate": 1.0827166973258872e-05, + "loss": 0.0, + "step": 56721 + }, + { + "epoch": 5.292712512830083, + "grad_norm": NaN, + "learning_rate": 1.082434562426449e-05, + "loss": 0.0, + "step": 56722 + }, + { + "epoch": 5.292805822524961, + "grad_norm": NaN, + "learning_rate": 1.0821524629152567e-05, + "loss": 0.0, + "step": 56723 + }, + { + "epoch": 5.292899132219838, + "grad_norm": NaN, + "learning_rate": 1.081870398793031e-05, + "loss": 0.0, + "step": 56724 + }, + { + "epoch": 5.2929924419147145, + "grad_norm": NaN, + "learning_rate": 1.081588370060485e-05, + "loss": 0.0, + "step": 56725 + }, + { + "epoch": 5.293085751609592, + "grad_norm": NaN, + "learning_rate": 1.0813063767183328e-05, + "loss": 0.0, + "step": 56726 + }, + { + "epoch": 5.293179061304469, + "grad_norm": NaN, + "learning_rate": 1.0810244187673006e-05, + "loss": 0.0, + "step": 56727 + }, + { + "epoch": 5.293272370999347, + "grad_norm": NaN, + "learning_rate": 1.0807424962080996e-05, + "loss": 0.0, + "step": 56728 + }, + { + "epoch": 5.293365680694224, + "grad_norm": NaN, + "learning_rate": 1.0804606090414424e-05, + "loss": 0.0, + "step": 56729 + }, + { + "epoch": 5.293458990389102, + "grad_norm": NaN, + "learning_rate": 1.080178757268057e-05, + "loss": 0.0, + "step": 56730 + }, + { + "epoch": 5.293552300083979, + "grad_norm": NaN, + "learning_rate": 1.079896940888651e-05, + "loss": 0.0, + "step": 56731 + }, + { + "epoch": 5.2936456097788565, + "grad_norm": NaN, + "learning_rate": 1.0796151599039404e-05, + "loss": 0.0, + "step": 56732 + }, + { + "epoch": 5.293738919473733, + "grad_norm": NaN, + "learning_rate": 1.0793334143146515e-05, + "loss": 0.0, + "step": 56733 + }, + { + "epoch": 5.29383222916861, + "grad_norm": NaN, + "learning_rate": 1.0790517041214903e-05, + "loss": 0.0, + "step": 56734 + }, + { + "epoch": 5.293925538863488, + "grad_norm": NaN, + "learning_rate": 1.0787700293251744e-05, + "loss": 0.0, + "step": 56735 + }, + { + "epoch": 5.294018848558365, + "grad_norm": NaN, + "learning_rate": 1.0784883899264284e-05, + "loss": 0.0, + "step": 56736 + }, + { + "epoch": 5.294112158253243, + "grad_norm": NaN, + "learning_rate": 1.0782067859259586e-05, + "loss": 0.0, + "step": 56737 + }, + { + "epoch": 5.29420546794812, + "grad_norm": NaN, + "learning_rate": 1.077925217324484e-05, + "loss": 0.0, + "step": 56738 + }, + { + "epoch": 5.2942987776429975, + "grad_norm": NaN, + "learning_rate": 1.0776436841227259e-05, + "loss": 0.0, + "step": 56739 + }, + { + "epoch": 5.294392087337874, + "grad_norm": NaN, + "learning_rate": 1.0773621863213938e-05, + "loss": 0.0, + "step": 56740 + }, + { + "epoch": 5.2944853970327515, + "grad_norm": NaN, + "learning_rate": 1.0770807239212053e-05, + "loss": 0.0, + "step": 56741 + }, + { + "epoch": 5.294578706727629, + "grad_norm": NaN, + "learning_rate": 1.076799296922875e-05, + "loss": 0.0, + "step": 56742 + }, + { + "epoch": 5.294672016422506, + "grad_norm": NaN, + "learning_rate": 1.0765179053271222e-05, + "loss": 0.0, + "step": 56743 + }, + { + "epoch": 5.294765326117384, + "grad_norm": NaN, + "learning_rate": 1.076236549134658e-05, + "loss": 0.0, + "step": 56744 + }, + { + "epoch": 5.294858635812261, + "grad_norm": NaN, + "learning_rate": 1.0759552283462003e-05, + "loss": 0.0, + "step": 56745 + }, + { + "epoch": 5.2949519455071385, + "grad_norm": NaN, + "learning_rate": 1.0756739429624633e-05, + "loss": 0.0, + "step": 56746 + }, + { + "epoch": 5.295045255202015, + "grad_norm": NaN, + "learning_rate": 1.0753926929841633e-05, + "loss": 0.0, + "step": 56747 + }, + { + "epoch": 5.2951385648968925, + "grad_norm": NaN, + "learning_rate": 1.0751114784120145e-05, + "loss": 0.0, + "step": 56748 + }, + { + "epoch": 5.29523187459177, + "grad_norm": NaN, + "learning_rate": 1.0748302992467334e-05, + "loss": 0.0, + "step": 56749 + }, + { + "epoch": 5.295325184286647, + "grad_norm": NaN, + "learning_rate": 1.0745491554890323e-05, + "loss": 0.0, + "step": 56750 + }, + { + "epoch": 5.295418493981525, + "grad_norm": NaN, + "learning_rate": 1.0742680471396292e-05, + "loss": 0.0, + "step": 56751 + }, + { + "epoch": 5.295511803676402, + "grad_norm": NaN, + "learning_rate": 1.0739869741992368e-05, + "loss": 0.0, + "step": 56752 + }, + { + "epoch": 5.29560511337128, + "grad_norm": NaN, + "learning_rate": 1.0737059366685696e-05, + "loss": 0.0, + "step": 56753 + }, + { + "epoch": 5.295698423066156, + "grad_norm": NaN, + "learning_rate": 1.0734249345483437e-05, + "loss": 0.0, + "step": 56754 + }, + { + "epoch": 5.2957917327610335, + "grad_norm": NaN, + "learning_rate": 1.0731439678392716e-05, + "loss": 0.0, + "step": 56755 + }, + { + "epoch": 5.295885042455911, + "grad_norm": NaN, + "learning_rate": 1.0728630365420714e-05, + "loss": 0.0, + "step": 56756 + }, + { + "epoch": 5.295978352150788, + "grad_norm": NaN, + "learning_rate": 1.0725821406574541e-05, + "loss": 0.0, + "step": 56757 + }, + { + "epoch": 5.296071661845666, + "grad_norm": NaN, + "learning_rate": 1.0723012801861326e-05, + "loss": 0.0, + "step": 56758 + }, + { + "epoch": 5.296164971540543, + "grad_norm": NaN, + "learning_rate": 1.0720204551288275e-05, + "loss": 0.0, + "step": 56759 + }, + { + "epoch": 5.296258281235421, + "grad_norm": NaN, + "learning_rate": 1.0717396654862453e-05, + "loss": 0.0, + "step": 56760 + }, + { + "epoch": 5.296351590930298, + "grad_norm": NaN, + "learning_rate": 1.0714589112591022e-05, + "loss": 0.0, + "step": 56761 + }, + { + "epoch": 5.296444900625175, + "grad_norm": NaN, + "learning_rate": 1.0711781924481172e-05, + "loss": 0.0, + "step": 56762 + }, + { + "epoch": 5.296538210320052, + "grad_norm": NaN, + "learning_rate": 1.0708975090539984e-05, + "loss": 0.0, + "step": 56763 + }, + { + "epoch": 5.296631520014929, + "grad_norm": NaN, + "learning_rate": 1.0706168610774584e-05, + "loss": 0.0, + "step": 56764 + }, + { + "epoch": 5.296724829709807, + "grad_norm": NaN, + "learning_rate": 1.0703362485192186e-05, + "loss": 0.0, + "step": 56765 + }, + { + "epoch": 5.296818139404684, + "grad_norm": NaN, + "learning_rate": 1.070055671379983e-05, + "loss": 0.0, + "step": 56766 + }, + { + "epoch": 5.296911449099562, + "grad_norm": NaN, + "learning_rate": 1.0697751296604696e-05, + "loss": 0.0, + "step": 56767 + }, + { + "epoch": 5.297004758794439, + "grad_norm": NaN, + "learning_rate": 1.0694946233613961e-05, + "loss": 0.0, + "step": 56768 + }, + { + "epoch": 5.297098068489316, + "grad_norm": NaN, + "learning_rate": 1.0692141524834668e-05, + "loss": 0.0, + "step": 56769 + }, + { + "epoch": 5.297191378184193, + "grad_norm": NaN, + "learning_rate": 1.0689337170273982e-05, + "loss": 0.0, + "step": 56770 + }, + { + "epoch": 5.2972846878790705, + "grad_norm": NaN, + "learning_rate": 1.068653316993911e-05, + "loss": 0.0, + "step": 56771 + }, + { + "epoch": 5.297377997573948, + "grad_norm": NaN, + "learning_rate": 1.0683729523837065e-05, + "loss": 0.0, + "step": 56772 + }, + { + "epoch": 5.297471307268825, + "grad_norm": NaN, + "learning_rate": 1.0680926231975023e-05, + "loss": 0.0, + "step": 56773 + }, + { + "epoch": 5.297564616963703, + "grad_norm": NaN, + "learning_rate": 1.0678123294360148e-05, + "loss": 0.0, + "step": 56774 + }, + { + "epoch": 5.29765792665858, + "grad_norm": NaN, + "learning_rate": 1.067532071099953e-05, + "loss": 0.0, + "step": 56775 + }, + { + "epoch": 5.297751236353458, + "grad_norm": NaN, + "learning_rate": 1.0672518481900267e-05, + "loss": 0.0, + "step": 56776 + }, + { + "epoch": 5.297844546048334, + "grad_norm": NaN, + "learning_rate": 1.0669716607069551e-05, + "loss": 0.0, + "step": 56777 + }, + { + "epoch": 5.2979378557432115, + "grad_norm": NaN, + "learning_rate": 1.0666915086514477e-05, + "loss": 0.0, + "step": 56778 + }, + { + "epoch": 5.298031165438089, + "grad_norm": NaN, + "learning_rate": 1.0664113920242124e-05, + "loss": 0.0, + "step": 56779 + }, + { + "epoch": 5.298124475132966, + "grad_norm": NaN, + "learning_rate": 1.06613131082597e-05, + "loss": 0.0, + "step": 56780 + }, + { + "epoch": 5.298217784827844, + "grad_norm": NaN, + "learning_rate": 1.065851265057427e-05, + "loss": 0.0, + "step": 56781 + }, + { + "epoch": 5.298311094522721, + "grad_norm": NaN, + "learning_rate": 1.0655712547192925e-05, + "loss": 0.0, + "step": 56782 + }, + { + "epoch": 5.298404404217599, + "grad_norm": NaN, + "learning_rate": 1.0652912798122892e-05, + "loss": 0.0, + "step": 56783 + }, + { + "epoch": 5.298497713912475, + "grad_norm": NaN, + "learning_rate": 1.0650113403371186e-05, + "loss": 0.0, + "step": 56784 + }, + { + "epoch": 5.298591023607353, + "grad_norm": NaN, + "learning_rate": 1.0647314362944963e-05, + "loss": 0.0, + "step": 56785 + }, + { + "epoch": 5.29868433330223, + "grad_norm": NaN, + "learning_rate": 1.0644515676851355e-05, + "loss": 0.0, + "step": 56786 + }, + { + "epoch": 5.298777642997107, + "grad_norm": NaN, + "learning_rate": 1.0641717345097455e-05, + "loss": 0.0, + "step": 56787 + }, + { + "epoch": 5.298870952691985, + "grad_norm": NaN, + "learning_rate": 1.063891936769039e-05, + "loss": 0.0, + "step": 56788 + }, + { + "epoch": 5.298964262386862, + "grad_norm": NaN, + "learning_rate": 1.0636121744637271e-05, + "loss": 0.0, + "step": 56789 + }, + { + "epoch": 5.29905757208174, + "grad_norm": NaN, + "learning_rate": 1.063332447594521e-05, + "loss": 0.0, + "step": 56790 + }, + { + "epoch": 5.299150881776616, + "grad_norm": NaN, + "learning_rate": 1.0630527561621316e-05, + "loss": 0.0, + "step": 56791 + }, + { + "epoch": 5.299244191471494, + "grad_norm": NaN, + "learning_rate": 1.0627731001672702e-05, + "loss": 0.0, + "step": 56792 + }, + { + "epoch": 5.299337501166371, + "grad_norm": NaN, + "learning_rate": 1.0624934796106477e-05, + "loss": 0.0, + "step": 56793 + }, + { + "epoch": 5.2994308108612485, + "grad_norm": NaN, + "learning_rate": 1.0622138944929787e-05, + "loss": 0.0, + "step": 56794 + }, + { + "epoch": 5.299524120556126, + "grad_norm": NaN, + "learning_rate": 1.0619343448149709e-05, + "loss": 0.0, + "step": 56795 + }, + { + "epoch": 5.299617430251003, + "grad_norm": NaN, + "learning_rate": 1.0616548305773304e-05, + "loss": 0.0, + "step": 56796 + }, + { + "epoch": 5.299710739945881, + "grad_norm": NaN, + "learning_rate": 1.06137535178078e-05, + "loss": 0.0, + "step": 56797 + }, + { + "epoch": 5.299804049640757, + "grad_norm": NaN, + "learning_rate": 1.061095908426019e-05, + "loss": 0.0, + "step": 56798 + }, + { + "epoch": 5.299897359335635, + "grad_norm": NaN, + "learning_rate": 1.0608165005137603e-05, + "loss": 0.0, + "step": 56799 + }, + { + "epoch": 5.299990669030512, + "grad_norm": NaN, + "learning_rate": 1.06053712804472e-05, + "loss": 0.0, + "step": 56800 + }, + { + "epoch": 5.3000839787253895, + "grad_norm": NaN, + "learning_rate": 1.0602577910196041e-05, + "loss": 0.0, + "step": 56801 + }, + { + "epoch": 5.300177288420267, + "grad_norm": NaN, + "learning_rate": 1.0599784894391189e-05, + "loss": 0.0, + "step": 56802 + }, + { + "epoch": 5.300270598115144, + "grad_norm": NaN, + "learning_rate": 1.0596992233039853e-05, + "loss": 0.0, + "step": 56803 + }, + { + "epoch": 5.300363907810022, + "grad_norm": NaN, + "learning_rate": 1.0594199926149045e-05, + "loss": 0.0, + "step": 56804 + }, + { + "epoch": 5.300457217504899, + "grad_norm": NaN, + "learning_rate": 1.0591407973725858e-05, + "loss": 0.0, + "step": 56805 + }, + { + "epoch": 5.300550527199776, + "grad_norm": NaN, + "learning_rate": 1.0588616375777486e-05, + "loss": 0.0, + "step": 56806 + }, + { + "epoch": 5.300643836894653, + "grad_norm": NaN, + "learning_rate": 1.0585825132310944e-05, + "loss": 0.0, + "step": 56807 + }, + { + "epoch": 5.3007371465895305, + "grad_norm": NaN, + "learning_rate": 1.0583034243333321e-05, + "loss": 0.0, + "step": 56808 + }, + { + "epoch": 5.300830456284408, + "grad_norm": NaN, + "learning_rate": 1.0580243708851782e-05, + "loss": 0.0, + "step": 56809 + }, + { + "epoch": 5.300923765979285, + "grad_norm": NaN, + "learning_rate": 1.057745352887337e-05, + "loss": 0.0, + "step": 56810 + }, + { + "epoch": 5.301017075674163, + "grad_norm": NaN, + "learning_rate": 1.0574663703405161e-05, + "loss": 0.0, + "step": 56811 + }, + { + "epoch": 5.30111038536904, + "grad_norm": NaN, + "learning_rate": 1.0571874232454337e-05, + "loss": 0.0, + "step": 56812 + }, + { + "epoch": 5.301203695063917, + "grad_norm": NaN, + "learning_rate": 1.056908511602792e-05, + "loss": 0.0, + "step": 56813 + }, + { + "epoch": 5.301297004758794, + "grad_norm": NaN, + "learning_rate": 1.0566296354132974e-05, + "loss": 0.0, + "step": 56814 + }, + { + "epoch": 5.301390314453672, + "grad_norm": NaN, + "learning_rate": 1.0563507946776695e-05, + "loss": 0.0, + "step": 56815 + }, + { + "epoch": 5.301483624148549, + "grad_norm": NaN, + "learning_rate": 1.0560719893966091e-05, + "loss": 0.0, + "step": 56816 + }, + { + "epoch": 5.301576933843426, + "grad_norm": NaN, + "learning_rate": 1.055793219570824e-05, + "loss": 0.0, + "step": 56817 + }, + { + "epoch": 5.301670243538304, + "grad_norm": NaN, + "learning_rate": 1.0555144852010306e-05, + "loss": 0.0, + "step": 56818 + }, + { + "epoch": 5.301763553233181, + "grad_norm": NaN, + "learning_rate": 1.0552357862879312e-05, + "loss": 0.0, + "step": 56819 + }, + { + "epoch": 5.301856862928059, + "grad_norm": NaN, + "learning_rate": 1.054957122832234e-05, + "loss": 0.0, + "step": 56820 + }, + { + "epoch": 5.301950172622935, + "grad_norm": NaN, + "learning_rate": 1.0546784948346548e-05, + "loss": 0.0, + "step": 56821 + }, + { + "epoch": 5.302043482317813, + "grad_norm": NaN, + "learning_rate": 1.0543999022958948e-05, + "loss": 0.0, + "step": 56822 + }, + { + "epoch": 5.30213679201269, + "grad_norm": NaN, + "learning_rate": 1.054121345216662e-05, + "loss": 0.0, + "step": 56823 + }, + { + "epoch": 5.3022301017075675, + "grad_norm": NaN, + "learning_rate": 1.053842823597672e-05, + "loss": 0.0, + "step": 56824 + }, + { + "epoch": 5.302323411402445, + "grad_norm": NaN, + "learning_rate": 1.053564337439628e-05, + "loss": 0.0, + "step": 56825 + }, + { + "epoch": 5.302416721097322, + "grad_norm": NaN, + "learning_rate": 1.0532858867432342e-05, + "loss": 0.0, + "step": 56826 + }, + { + "epoch": 5.302510030792199, + "grad_norm": NaN, + "learning_rate": 1.0530074715092102e-05, + "loss": 0.0, + "step": 56827 + }, + { + "epoch": 5.302603340487076, + "grad_norm": NaN, + "learning_rate": 1.052729091738252e-05, + "loss": 0.0, + "step": 56828 + }, + { + "epoch": 5.302696650181954, + "grad_norm": NaN, + "learning_rate": 1.0524507474310739e-05, + "loss": 0.0, + "step": 56829 + }, + { + "epoch": 5.302789959876831, + "grad_norm": NaN, + "learning_rate": 1.0521724385883806e-05, + "loss": 0.0, + "step": 56830 + }, + { + "epoch": 5.3028832695717085, + "grad_norm": NaN, + "learning_rate": 1.0518941652108798e-05, + "loss": 0.0, + "step": 56831 + }, + { + "epoch": 5.302976579266586, + "grad_norm": NaN, + "learning_rate": 1.0516159272992858e-05, + "loss": 0.0, + "step": 56832 + }, + { + "epoch": 5.303069888961463, + "grad_norm": NaN, + "learning_rate": 1.0513377248542964e-05, + "loss": 0.0, + "step": 56833 + }, + { + "epoch": 5.303163198656341, + "grad_norm": NaN, + "learning_rate": 1.0510595578766229e-05, + "loss": 0.0, + "step": 56834 + }, + { + "epoch": 5.303256508351217, + "grad_norm": NaN, + "learning_rate": 1.0507814263669762e-05, + "loss": 0.0, + "step": 56835 + }, + { + "epoch": 5.303349818046095, + "grad_norm": NaN, + "learning_rate": 1.0505033303260591e-05, + "loss": 0.0, + "step": 56836 + }, + { + "epoch": 5.303443127740972, + "grad_norm": NaN, + "learning_rate": 1.050225269754576e-05, + "loss": 0.0, + "step": 56837 + }, + { + "epoch": 5.30353643743585, + "grad_norm": NaN, + "learning_rate": 1.049947244653243e-05, + "loss": 0.0, + "step": 56838 + }, + { + "epoch": 5.303629747130727, + "grad_norm": NaN, + "learning_rate": 1.0496692550227614e-05, + "loss": 0.0, + "step": 56839 + }, + { + "epoch": 5.303723056825604, + "grad_norm": NaN, + "learning_rate": 1.0493913008638339e-05, + "loss": 0.0, + "step": 56840 + }, + { + "epoch": 5.303816366520482, + "grad_norm": NaN, + "learning_rate": 1.0491133821771779e-05, + "loss": 0.0, + "step": 56841 + }, + { + "epoch": 5.303909676215358, + "grad_norm": NaN, + "learning_rate": 1.04883549896349e-05, + "loss": 0.0, + "step": 56842 + }, + { + "epoch": 5.304002985910236, + "grad_norm": NaN, + "learning_rate": 1.0485576512234795e-05, + "loss": 0.0, + "step": 56843 + }, + { + "epoch": 5.304096295605113, + "grad_norm": NaN, + "learning_rate": 1.048279838957859e-05, + "loss": 0.0, + "step": 56844 + }, + { + "epoch": 5.304189605299991, + "grad_norm": NaN, + "learning_rate": 1.048002062167328e-05, + "loss": 0.0, + "step": 56845 + }, + { + "epoch": 5.304282914994868, + "grad_norm": NaN, + "learning_rate": 1.047724320852591e-05, + "loss": 0.0, + "step": 56846 + }, + { + "epoch": 5.3043762246897455, + "grad_norm": NaN, + "learning_rate": 1.047446615014364e-05, + "loss": 0.0, + "step": 56847 + }, + { + "epoch": 5.304469534384623, + "grad_norm": NaN, + "learning_rate": 1.0471689446533449e-05, + "loss": 0.0, + "step": 56848 + }, + { + "epoch": 5.3045628440795, + "grad_norm": NaN, + "learning_rate": 1.0468913097702397e-05, + "loss": 0.0, + "step": 56849 + }, + { + "epoch": 5.304656153774377, + "grad_norm": NaN, + "learning_rate": 1.0466137103657613e-05, + "loss": 0.0, + "step": 56850 + }, + { + "epoch": 5.304749463469254, + "grad_norm": NaN, + "learning_rate": 1.046336146440609e-05, + "loss": 0.0, + "step": 56851 + }, + { + "epoch": 5.304842773164132, + "grad_norm": NaN, + "learning_rate": 1.0460586179954871e-05, + "loss": 0.0, + "step": 56852 + }, + { + "epoch": 5.304936082859009, + "grad_norm": NaN, + "learning_rate": 1.0457811250311104e-05, + "loss": 0.0, + "step": 56853 + }, + { + "epoch": 5.3050293925538865, + "grad_norm": NaN, + "learning_rate": 1.0455036675481747e-05, + "loss": 0.0, + "step": 56854 + }, + { + "epoch": 5.305122702248764, + "grad_norm": NaN, + "learning_rate": 1.0452262455473898e-05, + "loss": 0.0, + "step": 56855 + }, + { + "epoch": 5.305216011943641, + "grad_norm": NaN, + "learning_rate": 1.044948859029463e-05, + "loss": 0.0, + "step": 56856 + }, + { + "epoch": 5.305309321638518, + "grad_norm": NaN, + "learning_rate": 1.0446715079950973e-05, + "loss": 0.0, + "step": 56857 + }, + { + "epoch": 5.305402631333395, + "grad_norm": NaN, + "learning_rate": 1.044394192444994e-05, + "loss": 0.0, + "step": 56858 + }, + { + "epoch": 5.305495941028273, + "grad_norm": NaN, + "learning_rate": 1.044116912379867e-05, + "loss": 0.0, + "step": 56859 + }, + { + "epoch": 5.30558925072315, + "grad_norm": NaN, + "learning_rate": 1.0438396678004162e-05, + "loss": 0.0, + "step": 56860 + }, + { + "epoch": 5.3056825604180275, + "grad_norm": NaN, + "learning_rate": 1.0435624587073426e-05, + "loss": 0.0, + "step": 56861 + }, + { + "epoch": 5.305775870112905, + "grad_norm": NaN, + "learning_rate": 1.0432852851013607e-05, + "loss": 0.0, + "step": 56862 + }, + { + "epoch": 5.305869179807782, + "grad_norm": NaN, + "learning_rate": 1.043008146983168e-05, + "loss": 0.0, + "step": 56863 + }, + { + "epoch": 5.305962489502659, + "grad_norm": NaN, + "learning_rate": 1.0427310443534693e-05, + "loss": 0.0, + "step": 56864 + }, + { + "epoch": 5.306055799197536, + "grad_norm": NaN, + "learning_rate": 1.0424539772129752e-05, + "loss": 0.0, + "step": 56865 + }, + { + "epoch": 5.306149108892414, + "grad_norm": NaN, + "learning_rate": 1.0421769455623841e-05, + "loss": 0.0, + "step": 56866 + }, + { + "epoch": 5.306242418587291, + "grad_norm": NaN, + "learning_rate": 1.0418999494024e-05, + "loss": 0.0, + "step": 56867 + }, + { + "epoch": 5.306335728282169, + "grad_norm": NaN, + "learning_rate": 1.0416229887337341e-05, + "loss": 0.0, + "step": 56868 + }, + { + "epoch": 5.306429037977046, + "grad_norm": NaN, + "learning_rate": 1.0413460635570825e-05, + "loss": 0.0, + "step": 56869 + }, + { + "epoch": 5.306522347671923, + "grad_norm": NaN, + "learning_rate": 1.041069173873153e-05, + "loss": 0.0, + "step": 56870 + }, + { + "epoch": 5.3066156573668, + "grad_norm": NaN, + "learning_rate": 1.040792319682655e-05, + "loss": 0.0, + "step": 56871 + }, + { + "epoch": 5.306708967061677, + "grad_norm": NaN, + "learning_rate": 1.0405155009862798e-05, + "loss": 0.0, + "step": 56872 + }, + { + "epoch": 5.306802276756555, + "grad_norm": NaN, + "learning_rate": 1.0402387177847415e-05, + "loss": 0.0, + "step": 56873 + }, + { + "epoch": 5.306895586451432, + "grad_norm": NaN, + "learning_rate": 1.0399619700787449e-05, + "loss": 0.0, + "step": 56874 + }, + { + "epoch": 5.30698889614631, + "grad_norm": NaN, + "learning_rate": 1.039685257868984e-05, + "loss": 0.0, + "step": 56875 + }, + { + "epoch": 5.307082205841187, + "grad_norm": NaN, + "learning_rate": 1.039408581156172e-05, + "loss": 0.0, + "step": 56876 + }, + { + "epoch": 5.3071755155360645, + "grad_norm": NaN, + "learning_rate": 1.039131939941008e-05, + "loss": 0.0, + "step": 56877 + }, + { + "epoch": 5.307268825230942, + "grad_norm": NaN, + "learning_rate": 1.0388553342241918e-05, + "loss": 0.0, + "step": 56878 + }, + { + "epoch": 5.307362134925818, + "grad_norm": NaN, + "learning_rate": 1.0385787640064357e-05, + "loss": 0.0, + "step": 56879 + }, + { + "epoch": 5.307455444620696, + "grad_norm": NaN, + "learning_rate": 1.038302229288438e-05, + "loss": 0.0, + "step": 56880 + }, + { + "epoch": 5.307548754315573, + "grad_norm": NaN, + "learning_rate": 1.0380257300708977e-05, + "loss": 0.0, + "step": 56881 + }, + { + "epoch": 5.307642064010451, + "grad_norm": NaN, + "learning_rate": 1.0377492663545262e-05, + "loss": 0.0, + "step": 56882 + }, + { + "epoch": 5.307735373705328, + "grad_norm": NaN, + "learning_rate": 1.037472838140021e-05, + "loss": 0.0, + "step": 56883 + }, + { + "epoch": 5.3078286834002055, + "grad_norm": NaN, + "learning_rate": 1.037196445428085e-05, + "loss": 0.0, + "step": 56884 + }, + { + "epoch": 5.307921993095083, + "grad_norm": NaN, + "learning_rate": 1.0369200882194261e-05, + "loss": 0.0, + "step": 56885 + }, + { + "epoch": 5.3080153027899595, + "grad_norm": NaN, + "learning_rate": 1.0366437665147403e-05, + "loss": 0.0, + "step": 56886 + }, + { + "epoch": 5.308108612484837, + "grad_norm": NaN, + "learning_rate": 1.036367480314732e-05, + "loss": 0.0, + "step": 56887 + }, + { + "epoch": 5.308201922179714, + "grad_norm": NaN, + "learning_rate": 1.036091229620109e-05, + "loss": 0.0, + "step": 56888 + }, + { + "epoch": 5.308295231874592, + "grad_norm": NaN, + "learning_rate": 1.0358150144315675e-05, + "loss": 0.0, + "step": 56889 + }, + { + "epoch": 5.308388541569469, + "grad_norm": NaN, + "learning_rate": 1.0355388347498084e-05, + "loss": 0.0, + "step": 56890 + }, + { + "epoch": 5.308481851264347, + "grad_norm": NaN, + "learning_rate": 1.0352626905755428e-05, + "loss": 0.0, + "step": 56891 + }, + { + "epoch": 5.308575160959224, + "grad_norm": NaN, + "learning_rate": 1.0349865819094655e-05, + "loss": 0.0, + "step": 56892 + }, + { + "epoch": 5.308668470654101, + "grad_norm": NaN, + "learning_rate": 1.034710508752279e-05, + "loss": 0.0, + "step": 56893 + }, + { + "epoch": 5.308761780348978, + "grad_norm": NaN, + "learning_rate": 1.0344344711046892e-05, + "loss": 0.0, + "step": 56894 + }, + { + "epoch": 5.308855090043855, + "grad_norm": NaN, + "learning_rate": 1.0341584689673959e-05, + "loss": 0.0, + "step": 56895 + }, + { + "epoch": 5.308948399738733, + "grad_norm": NaN, + "learning_rate": 1.0338825023410968e-05, + "loss": 0.0, + "step": 56896 + }, + { + "epoch": 5.30904170943361, + "grad_norm": NaN, + "learning_rate": 1.033606571226503e-05, + "loss": 0.0, + "step": 56897 + }, + { + "epoch": 5.309135019128488, + "grad_norm": NaN, + "learning_rate": 1.033330675624307e-05, + "loss": 0.0, + "step": 56898 + }, + { + "epoch": 5.309228328823365, + "grad_norm": NaN, + "learning_rate": 1.0330548155352119e-05, + "loss": 0.0, + "step": 56899 + }, + { + "epoch": 5.3093216385182425, + "grad_norm": NaN, + "learning_rate": 1.0327789909599271e-05, + "loss": 0.0, + "step": 56900 + }, + { + "epoch": 5.309414948213119, + "grad_norm": NaN, + "learning_rate": 1.0325032018991435e-05, + "loss": 0.0, + "step": 56901 + }, + { + "epoch": 5.309508257907996, + "grad_norm": NaN, + "learning_rate": 1.0322274483535658e-05, + "loss": 0.0, + "step": 56902 + }, + { + "epoch": 5.309601567602874, + "grad_norm": NaN, + "learning_rate": 1.0319517303239e-05, + "loss": 0.0, + "step": 56903 + }, + { + "epoch": 5.309694877297751, + "grad_norm": NaN, + "learning_rate": 1.0316760478108421e-05, + "loss": 0.0, + "step": 56904 + }, + { + "epoch": 5.309788186992629, + "grad_norm": NaN, + "learning_rate": 1.0314004008150918e-05, + "loss": 0.0, + "step": 56905 + }, + { + "epoch": 5.309881496687506, + "grad_norm": NaN, + "learning_rate": 1.0311247893373565e-05, + "loss": 0.0, + "step": 56906 + }, + { + "epoch": 5.3099748063823835, + "grad_norm": NaN, + "learning_rate": 1.0308492133783275e-05, + "loss": 0.0, + "step": 56907 + }, + { + "epoch": 5.31006811607726, + "grad_norm": NaN, + "learning_rate": 1.0305736729387142e-05, + "loss": 0.0, + "step": 56908 + }, + { + "epoch": 5.3101614257721375, + "grad_norm": NaN, + "learning_rate": 1.0302981680192162e-05, + "loss": 0.0, + "step": 56909 + }, + { + "epoch": 5.310254735467015, + "grad_norm": NaN, + "learning_rate": 1.030022698620528e-05, + "loss": 0.0, + "step": 56910 + }, + { + "epoch": 5.310348045161892, + "grad_norm": NaN, + "learning_rate": 1.0297472647433553e-05, + "loss": 0.0, + "step": 56911 + }, + { + "epoch": 5.31044135485677, + "grad_norm": NaN, + "learning_rate": 1.0294718663883994e-05, + "loss": 0.0, + "step": 56912 + }, + { + "epoch": 5.310534664551647, + "grad_norm": NaN, + "learning_rate": 1.0291965035563549e-05, + "loss": 0.0, + "step": 56913 + }, + { + "epoch": 5.3106279742465246, + "grad_norm": NaN, + "learning_rate": 1.028921176247926e-05, + "loss": 0.0, + "step": 56914 + }, + { + "epoch": 5.310721283941401, + "grad_norm": NaN, + "learning_rate": 1.028645884463814e-05, + "loss": 0.0, + "step": 56915 + }, + { + "epoch": 5.3108145936362785, + "grad_norm": NaN, + "learning_rate": 1.0283706282047132e-05, + "loss": 0.0, + "step": 56916 + }, + { + "epoch": 5.310907903331156, + "grad_norm": NaN, + "learning_rate": 1.02809540747133e-05, + "loss": 0.0, + "step": 56917 + }, + { + "epoch": 5.311001213026033, + "grad_norm": NaN, + "learning_rate": 1.0278202222643616e-05, + "loss": 0.0, + "step": 56918 + }, + { + "epoch": 5.311094522720911, + "grad_norm": NaN, + "learning_rate": 1.0275450725845047e-05, + "loss": 0.0, + "step": 56919 + }, + { + "epoch": 5.311187832415788, + "grad_norm": NaN, + "learning_rate": 1.0272699584324667e-05, + "loss": 0.0, + "step": 56920 + }, + { + "epoch": 5.311281142110666, + "grad_norm": NaN, + "learning_rate": 1.0269948798089389e-05, + "loss": 0.0, + "step": 56921 + }, + { + "epoch": 5.311374451805543, + "grad_norm": NaN, + "learning_rate": 1.0267198367146223e-05, + "loss": 0.0, + "step": 56922 + }, + { + "epoch": 5.3114677615004195, + "grad_norm": NaN, + "learning_rate": 1.0264448291502214e-05, + "loss": 0.0, + "step": 56923 + }, + { + "epoch": 5.311561071195297, + "grad_norm": NaN, + "learning_rate": 1.0261698571164306e-05, + "loss": 0.0, + "step": 56924 + }, + { + "epoch": 5.311654380890174, + "grad_norm": NaN, + "learning_rate": 1.0258949206139478e-05, + "loss": 0.0, + "step": 56925 + }, + { + "epoch": 5.311747690585052, + "grad_norm": NaN, + "learning_rate": 1.0256200196434804e-05, + "loss": 0.0, + "step": 56926 + }, + { + "epoch": 5.311841000279929, + "grad_norm": NaN, + "learning_rate": 1.0253451542057183e-05, + "loss": 0.0, + "step": 56927 + }, + { + "epoch": 5.311934309974807, + "grad_norm": NaN, + "learning_rate": 1.0250703243013625e-05, + "loss": 0.0, + "step": 56928 + }, + { + "epoch": 5.312027619669684, + "grad_norm": NaN, + "learning_rate": 1.0247955299311172e-05, + "loss": 0.0, + "step": 56929 + }, + { + "epoch": 5.312120929364561, + "grad_norm": NaN, + "learning_rate": 1.0245207710956754e-05, + "loss": 0.0, + "step": 56930 + }, + { + "epoch": 5.312214239059438, + "grad_norm": NaN, + "learning_rate": 1.0242460477957332e-05, + "loss": 0.0, + "step": 56931 + }, + { + "epoch": 5.312307548754315, + "grad_norm": NaN, + "learning_rate": 1.0239713600319999e-05, + "loss": 0.0, + "step": 56932 + }, + { + "epoch": 5.312400858449193, + "grad_norm": NaN, + "learning_rate": 1.023696707805165e-05, + "loss": 0.0, + "step": 56933 + }, + { + "epoch": 5.31249416814407, + "grad_norm": NaN, + "learning_rate": 1.0234220911159263e-05, + "loss": 0.0, + "step": 56934 + }, + { + "epoch": 5.312587477838948, + "grad_norm": NaN, + "learning_rate": 1.02314750996499e-05, + "loss": 0.0, + "step": 56935 + }, + { + "epoch": 5.312680787533825, + "grad_norm": NaN, + "learning_rate": 1.022872964353047e-05, + "loss": 0.0, + "step": 56936 + }, + { + "epoch": 5.3127740972287025, + "grad_norm": NaN, + "learning_rate": 1.0225984542807952e-05, + "loss": 0.0, + "step": 56937 + }, + { + "epoch": 5.312867406923579, + "grad_norm": NaN, + "learning_rate": 1.0223239797489408e-05, + "loss": 0.0, + "step": 56938 + }, + { + "epoch": 5.3129607166184565, + "grad_norm": NaN, + "learning_rate": 1.022049540758173e-05, + "loss": 0.0, + "step": 56939 + }, + { + "epoch": 5.313054026313334, + "grad_norm": NaN, + "learning_rate": 1.0217751373091897e-05, + "loss": 0.0, + "step": 56940 + }, + { + "epoch": 5.313147336008211, + "grad_norm": NaN, + "learning_rate": 1.0215007694026973e-05, + "loss": 0.0, + "step": 56941 + }, + { + "epoch": 5.313240645703089, + "grad_norm": NaN, + "learning_rate": 1.0212264370393813e-05, + "loss": 0.0, + "step": 56942 + }, + { + "epoch": 5.313333955397966, + "grad_norm": NaN, + "learning_rate": 1.02095214021995e-05, + "loss": 0.0, + "step": 56943 + }, + { + "epoch": 5.313427265092843, + "grad_norm": NaN, + "learning_rate": 1.0206778789450975e-05, + "loss": 0.0, + "step": 56944 + }, + { + "epoch": 5.31352057478772, + "grad_norm": NaN, + "learning_rate": 1.0204036532155169e-05, + "loss": 0.0, + "step": 56945 + }, + { + "epoch": 5.3136138844825975, + "grad_norm": NaN, + "learning_rate": 1.020129463031909e-05, + "loss": 0.0, + "step": 56946 + }, + { + "epoch": 5.313707194177475, + "grad_norm": NaN, + "learning_rate": 1.0198553083949751e-05, + "loss": 0.0, + "step": 56947 + }, + { + "epoch": 5.313800503872352, + "grad_norm": NaN, + "learning_rate": 1.0195811893054012e-05, + "loss": 0.0, + "step": 56948 + }, + { + "epoch": 5.31389381356723, + "grad_norm": NaN, + "learning_rate": 1.0193071057638952e-05, + "loss": 0.0, + "step": 56949 + }, + { + "epoch": 5.313987123262107, + "grad_norm": NaN, + "learning_rate": 1.0190330577711514e-05, + "loss": 0.0, + "step": 56950 + }, + { + "epoch": 5.314080432956985, + "grad_norm": NaN, + "learning_rate": 1.018759045327861e-05, + "loss": 0.0, + "step": 56951 + }, + { + "epoch": 5.314173742651861, + "grad_norm": NaN, + "learning_rate": 1.0184850684347267e-05, + "loss": 0.0, + "step": 56952 + }, + { + "epoch": 5.314267052346739, + "grad_norm": NaN, + "learning_rate": 1.0182111270924447e-05, + "loss": 0.0, + "step": 56953 + }, + { + "epoch": 5.314360362041616, + "grad_norm": NaN, + "learning_rate": 1.0179372213017061e-05, + "loss": 0.0, + "step": 56954 + }, + { + "epoch": 5.314453671736493, + "grad_norm": NaN, + "learning_rate": 1.0176633510632154e-05, + "loss": 0.0, + "step": 56955 + }, + { + "epoch": 5.314546981431371, + "grad_norm": NaN, + "learning_rate": 1.0173895163776652e-05, + "loss": 0.0, + "step": 56956 + }, + { + "epoch": 5.314640291126248, + "grad_norm": NaN, + "learning_rate": 1.0171157172457467e-05, + "loss": 0.0, + "step": 56957 + }, + { + "epoch": 5.314733600821126, + "grad_norm": NaN, + "learning_rate": 1.0168419536681643e-05, + "loss": 0.0, + "step": 56958 + }, + { + "epoch": 5.314826910516002, + "grad_norm": NaN, + "learning_rate": 1.0165682256456125e-05, + "loss": 0.0, + "step": 56959 + }, + { + "epoch": 5.31492022021088, + "grad_norm": NaN, + "learning_rate": 1.0162945331787808e-05, + "loss": 0.0, + "step": 56960 + }, + { + "epoch": 5.315013529905757, + "grad_norm": NaN, + "learning_rate": 1.0160208762683719e-05, + "loss": 0.0, + "step": 56961 + }, + { + "epoch": 5.3151068396006345, + "grad_norm": NaN, + "learning_rate": 1.0157472549150835e-05, + "loss": 0.0, + "step": 56962 + }, + { + "epoch": 5.315200149295512, + "grad_norm": NaN, + "learning_rate": 1.0154736691196003e-05, + "loss": 0.0, + "step": 56963 + }, + { + "epoch": 5.315293458990389, + "grad_norm": NaN, + "learning_rate": 1.0152001188826314e-05, + "loss": 0.0, + "step": 56964 + }, + { + "epoch": 5.315386768685267, + "grad_norm": NaN, + "learning_rate": 1.0149266042048631e-05, + "loss": 0.0, + "step": 56965 + }, + { + "epoch": 5.315480078380144, + "grad_norm": NaN, + "learning_rate": 1.0146531250869915e-05, + "loss": 0.0, + "step": 56966 + }, + { + "epoch": 5.315573388075021, + "grad_norm": NaN, + "learning_rate": 1.0143796815297194e-05, + "loss": 0.0, + "step": 56967 + }, + { + "epoch": 5.315666697769898, + "grad_norm": NaN, + "learning_rate": 1.0141062735337346e-05, + "loss": 0.0, + "step": 56968 + }, + { + "epoch": 5.3157600074647755, + "grad_norm": NaN, + "learning_rate": 1.0138329010997315e-05, + "loss": 0.0, + "step": 56969 + }, + { + "epoch": 5.315853317159653, + "grad_norm": NaN, + "learning_rate": 1.0135595642284128e-05, + "loss": 0.0, + "step": 56970 + }, + { + "epoch": 5.31594662685453, + "grad_norm": NaN, + "learning_rate": 1.013286262920468e-05, + "loss": 0.0, + "step": 56971 + }, + { + "epoch": 5.316039936549408, + "grad_norm": NaN, + "learning_rate": 1.0130129971765898e-05, + "loss": 0.0, + "step": 56972 + }, + { + "epoch": 5.316133246244285, + "grad_norm": NaN, + "learning_rate": 1.0127397669974814e-05, + "loss": 0.0, + "step": 56973 + }, + { + "epoch": 5.316226555939162, + "grad_norm": NaN, + "learning_rate": 1.0124665723838282e-05, + "loss": 0.0, + "step": 56974 + }, + { + "epoch": 5.316319865634039, + "grad_norm": NaN, + "learning_rate": 1.0121934133363284e-05, + "loss": 0.0, + "step": 56975 + }, + { + "epoch": 5.3164131753289166, + "grad_norm": NaN, + "learning_rate": 1.0119202898556833e-05, + "loss": 0.0, + "step": 56976 + }, + { + "epoch": 5.316506485023794, + "grad_norm": NaN, + "learning_rate": 1.011647201942577e-05, + "loss": 0.0, + "step": 56977 + }, + { + "epoch": 5.316599794718671, + "grad_norm": NaN, + "learning_rate": 1.0113741495977057e-05, + "loss": 0.0, + "step": 56978 + }, + { + "epoch": 5.316693104413549, + "grad_norm": NaN, + "learning_rate": 1.0111011328217722e-05, + "loss": 0.0, + "step": 56979 + }, + { + "epoch": 5.316786414108426, + "grad_norm": NaN, + "learning_rate": 1.0108281516154593e-05, + "loss": 0.0, + "step": 56980 + }, + { + "epoch": 5.316879723803303, + "grad_norm": NaN, + "learning_rate": 1.0105552059794697e-05, + "loss": 0.0, + "step": 56981 + }, + { + "epoch": 5.31697303349818, + "grad_norm": NaN, + "learning_rate": 1.0102822959144947e-05, + "loss": 0.0, + "step": 56982 + }, + { + "epoch": 5.317066343193058, + "grad_norm": NaN, + "learning_rate": 1.010009421421225e-05, + "loss": 0.0, + "step": 56983 + }, + { + "epoch": 5.317159652887935, + "grad_norm": NaN, + "learning_rate": 1.0097365825003589e-05, + "loss": 0.0, + "step": 56984 + }, + { + "epoch": 5.317252962582812, + "grad_norm": NaN, + "learning_rate": 1.0094637791525905e-05, + "loss": 0.0, + "step": 56985 + }, + { + "epoch": 5.31734627227769, + "grad_norm": NaN, + "learning_rate": 1.009191011378606e-05, + "loss": 0.0, + "step": 56986 + }, + { + "epoch": 5.317439581972567, + "grad_norm": NaN, + "learning_rate": 1.0089182791791084e-05, + "loss": 0.0, + "step": 56987 + }, + { + "epoch": 5.317532891667444, + "grad_norm": NaN, + "learning_rate": 1.0086455825547885e-05, + "loss": 0.0, + "step": 56988 + }, + { + "epoch": 5.317626201362321, + "grad_norm": NaN, + "learning_rate": 1.0083729215063341e-05, + "loss": 0.0, + "step": 56989 + }, + { + "epoch": 5.317719511057199, + "grad_norm": NaN, + "learning_rate": 1.0081002960344447e-05, + "loss": 0.0, + "step": 56990 + }, + { + "epoch": 5.317812820752076, + "grad_norm": NaN, + "learning_rate": 1.0078277061398149e-05, + "loss": 0.0, + "step": 56991 + }, + { + "epoch": 5.3179061304469535, + "grad_norm": NaN, + "learning_rate": 1.0075551518231272e-05, + "loss": 0.0, + "step": 56992 + }, + { + "epoch": 5.317999440141831, + "grad_norm": NaN, + "learning_rate": 1.0072826330850863e-05, + "loss": 0.0, + "step": 56993 + }, + { + "epoch": 5.318092749836708, + "grad_norm": NaN, + "learning_rate": 1.0070101499263833e-05, + "loss": 0.0, + "step": 56994 + }, + { + "epoch": 5.318186059531586, + "grad_norm": NaN, + "learning_rate": 1.0067377023477024e-05, + "loss": 0.0, + "step": 56995 + }, + { + "epoch": 5.318279369226462, + "grad_norm": NaN, + "learning_rate": 1.0064652903497467e-05, + "loss": 0.0, + "step": 56996 + }, + { + "epoch": 5.31837267892134, + "grad_norm": NaN, + "learning_rate": 1.0061929139332053e-05, + "loss": 0.0, + "step": 56997 + }, + { + "epoch": 5.318465988616217, + "grad_norm": NaN, + "learning_rate": 1.0059205730987662e-05, + "loss": 0.0, + "step": 56998 + }, + { + "epoch": 5.3185592983110945, + "grad_norm": NaN, + "learning_rate": 1.0056482678471273e-05, + "loss": 0.0, + "step": 56999 + }, + { + "epoch": 5.318652608005972, + "grad_norm": NaN, + "learning_rate": 1.0053759981789827e-05, + "loss": 0.0, + "step": 57000 + }, + { + "epoch": 5.318745917700849, + "grad_norm": NaN, + "learning_rate": 1.0051037640950155e-05, + "loss": 0.0, + "step": 57001 + }, + { + "epoch": 5.318839227395727, + "grad_norm": NaN, + "learning_rate": 1.0048315655959266e-05, + "loss": 0.0, + "step": 57002 + }, + { + "epoch": 5.318932537090603, + "grad_norm": NaN, + "learning_rate": 1.0045594026824088e-05, + "loss": 0.0, + "step": 57003 + }, + { + "epoch": 5.319025846785481, + "grad_norm": NaN, + "learning_rate": 1.004287275355145e-05, + "loss": 0.0, + "step": 57004 + }, + { + "epoch": 5.319119156480358, + "grad_norm": NaN, + "learning_rate": 1.0040151836148347e-05, + "loss": 0.0, + "step": 57005 + }, + { + "epoch": 5.319212466175236, + "grad_norm": NaN, + "learning_rate": 1.0037431274621705e-05, + "loss": 0.0, + "step": 57006 + }, + { + "epoch": 5.319305775870113, + "grad_norm": NaN, + "learning_rate": 1.0034711068978368e-05, + "loss": 0.0, + "step": 57007 + }, + { + "epoch": 5.31939908556499, + "grad_norm": NaN, + "learning_rate": 1.003199121922535e-05, + "loss": 0.0, + "step": 57008 + }, + { + "epoch": 5.319492395259868, + "grad_norm": NaN, + "learning_rate": 1.0029271725369508e-05, + "loss": 0.0, + "step": 57009 + }, + { + "epoch": 5.319585704954745, + "grad_norm": NaN, + "learning_rate": 1.0026552587417725e-05, + "loss": 0.0, + "step": 57010 + }, + { + "epoch": 5.319679014649622, + "grad_norm": NaN, + "learning_rate": 1.0023833805377008e-05, + "loss": 0.0, + "step": 57011 + }, + { + "epoch": 5.319772324344499, + "grad_norm": NaN, + "learning_rate": 1.0021115379254185e-05, + "loss": 0.0, + "step": 57012 + }, + { + "epoch": 5.319865634039377, + "grad_norm": NaN, + "learning_rate": 1.0018397309056187e-05, + "loss": 0.0, + "step": 57013 + }, + { + "epoch": 5.319958943734254, + "grad_norm": NaN, + "learning_rate": 1.0015679594789972e-05, + "loss": 0.0, + "step": 57014 + }, + { + "epoch": 5.3200522534291315, + "grad_norm": NaN, + "learning_rate": 1.0012962236462418e-05, + "loss": 0.0, + "step": 57015 + }, + { + "epoch": 5.320145563124009, + "grad_norm": NaN, + "learning_rate": 1.0010245234080388e-05, + "loss": 0.0, + "step": 57016 + }, + { + "epoch": 5.320238872818885, + "grad_norm": NaN, + "learning_rate": 1.0007528587650893e-05, + "loss": 0.0, + "step": 57017 + }, + { + "epoch": 5.320332182513763, + "grad_norm": NaN, + "learning_rate": 1.0004812297180743e-05, + "loss": 0.0, + "step": 57018 + }, + { + "epoch": 5.32042549220864, + "grad_norm": NaN, + "learning_rate": 1.0002096362676898e-05, + "loss": 0.0, + "step": 57019 + }, + { + "epoch": 5.320518801903518, + "grad_norm": NaN, + "learning_rate": 9.999380784146272e-06, + "loss": 0.0, + "step": 57020 + }, + { + "epoch": 5.320612111598395, + "grad_norm": NaN, + "learning_rate": 9.996665561595724e-06, + "loss": 0.0, + "step": 57021 + }, + { + "epoch": 5.3207054212932725, + "grad_norm": NaN, + "learning_rate": 9.993950695032183e-06, + "loss": 0.0, + "step": 57022 + }, + { + "epoch": 5.32079873098815, + "grad_norm": NaN, + "learning_rate": 9.991236184462592e-06, + "loss": 0.0, + "step": 57023 + }, + { + "epoch": 5.320892040683027, + "grad_norm": NaN, + "learning_rate": 9.988522029893764e-06, + "loss": 0.0, + "step": 57024 + }, + { + "epoch": 5.320985350377904, + "grad_norm": NaN, + "learning_rate": 9.985808231332659e-06, + "loss": 0.0, + "step": 57025 + }, + { + "epoch": 5.321078660072781, + "grad_norm": NaN, + "learning_rate": 9.983094788786206e-06, + "loss": 0.0, + "step": 57026 + }, + { + "epoch": 5.321171969767659, + "grad_norm": NaN, + "learning_rate": 9.980381702261215e-06, + "loss": 0.0, + "step": 57027 + }, + { + "epoch": 5.321265279462536, + "grad_norm": NaN, + "learning_rate": 9.977668971764663e-06, + "loss": 0.0, + "step": 57028 + }, + { + "epoch": 5.3213585891574136, + "grad_norm": NaN, + "learning_rate": 9.974956597303446e-06, + "loss": 0.0, + "step": 57029 + }, + { + "epoch": 5.321451898852291, + "grad_norm": NaN, + "learning_rate": 9.972244578884391e-06, + "loss": 0.0, + "step": 57030 + }, + { + "epoch": 5.321545208547168, + "grad_norm": NaN, + "learning_rate": 9.969532916514444e-06, + "loss": 0.0, + "step": 57031 + }, + { + "epoch": 5.321638518242045, + "grad_norm": NaN, + "learning_rate": 9.966821610200548e-06, + "loss": 0.0, + "step": 57032 + }, + { + "epoch": 5.321731827936922, + "grad_norm": NaN, + "learning_rate": 9.96411065994948e-06, + "loss": 0.0, + "step": 57033 + }, + { + "epoch": 5.3218251376318, + "grad_norm": NaN, + "learning_rate": 9.961400065768221e-06, + "loss": 0.0, + "step": 57034 + }, + { + "epoch": 5.321918447326677, + "grad_norm": NaN, + "learning_rate": 9.958689827663664e-06, + "loss": 0.0, + "step": 57035 + }, + { + "epoch": 5.322011757021555, + "grad_norm": NaN, + "learning_rate": 9.955979945642617e-06, + "loss": 0.0, + "step": 57036 + }, + { + "epoch": 5.322105066716432, + "grad_norm": NaN, + "learning_rate": 9.953270419712062e-06, + "loss": 0.0, + "step": 57037 + }, + { + "epoch": 5.322198376411309, + "grad_norm": NaN, + "learning_rate": 9.950561249878874e-06, + "loss": 0.0, + "step": 57038 + }, + { + "epoch": 5.322291686106187, + "grad_norm": NaN, + "learning_rate": 9.947852436149884e-06, + "loss": 0.0, + "step": 57039 + }, + { + "epoch": 5.322384995801063, + "grad_norm": NaN, + "learning_rate": 9.94514397853205e-06, + "loss": 0.0, + "step": 57040 + }, + { + "epoch": 5.322478305495941, + "grad_norm": NaN, + "learning_rate": 9.942435877032234e-06, + "loss": 0.0, + "step": 57041 + }, + { + "epoch": 5.322571615190818, + "grad_norm": NaN, + "learning_rate": 9.939728131657283e-06, + "loss": 0.0, + "step": 57042 + }, + { + "epoch": 5.322664924885696, + "grad_norm": NaN, + "learning_rate": 9.93702074241412e-06, + "loss": 0.0, + "step": 57043 + }, + { + "epoch": 5.322758234580573, + "grad_norm": NaN, + "learning_rate": 9.93431370930966e-06, + "loss": 0.0, + "step": 57044 + }, + { + "epoch": 5.3228515442754505, + "grad_norm": NaN, + "learning_rate": 9.931607032350696e-06, + "loss": 0.0, + "step": 57045 + }, + { + "epoch": 5.322944853970328, + "grad_norm": NaN, + "learning_rate": 9.92890071154419e-06, + "loss": 0.0, + "step": 57046 + }, + { + "epoch": 5.323038163665204, + "grad_norm": NaN, + "learning_rate": 9.926194746897038e-06, + "loss": 0.0, + "step": 57047 + }, + { + "epoch": 5.323131473360082, + "grad_norm": NaN, + "learning_rate": 9.923489138416013e-06, + "loss": 0.0, + "step": 57048 + }, + { + "epoch": 5.323224783054959, + "grad_norm": NaN, + "learning_rate": 9.92078388610808e-06, + "loss": 0.0, + "step": 57049 + }, + { + "epoch": 5.323318092749837, + "grad_norm": NaN, + "learning_rate": 9.918078989980132e-06, + "loss": 0.0, + "step": 57050 + }, + { + "epoch": 5.323411402444714, + "grad_norm": NaN, + "learning_rate": 9.915374450038965e-06, + "loss": 0.0, + "step": 57051 + }, + { + "epoch": 5.3235047121395915, + "grad_norm": NaN, + "learning_rate": 9.91267026629154e-06, + "loss": 0.0, + "step": 57052 + }, + { + "epoch": 5.323598021834469, + "grad_norm": NaN, + "learning_rate": 9.909966438744698e-06, + "loss": 0.0, + "step": 57053 + }, + { + "epoch": 5.3236913315293455, + "grad_norm": NaN, + "learning_rate": 9.90726296740529e-06, + "loss": 0.0, + "step": 57054 + }, + { + "epoch": 5.323784641224223, + "grad_norm": NaN, + "learning_rate": 9.904559852280253e-06, + "loss": 0.0, + "step": 57055 + }, + { + "epoch": 5.3238779509191, + "grad_norm": NaN, + "learning_rate": 9.90185709337637e-06, + "loss": 0.0, + "step": 57056 + }, + { + "epoch": 5.323971260613978, + "grad_norm": NaN, + "learning_rate": 9.899154690700583e-06, + "loss": 0.0, + "step": 57057 + }, + { + "epoch": 5.324064570308855, + "grad_norm": NaN, + "learning_rate": 9.896452644259773e-06, + "loss": 0.0, + "step": 57058 + }, + { + "epoch": 5.324157880003733, + "grad_norm": NaN, + "learning_rate": 9.89375095406073e-06, + "loss": 0.0, + "step": 57059 + }, + { + "epoch": 5.32425118969861, + "grad_norm": NaN, + "learning_rate": 9.89104962011042e-06, + "loss": 0.0, + "step": 57060 + }, + { + "epoch": 5.3243444993934865, + "grad_norm": NaN, + "learning_rate": 9.888348642415667e-06, + "loss": 0.0, + "step": 57061 + }, + { + "epoch": 5.324437809088364, + "grad_norm": NaN, + "learning_rate": 9.8856480209833e-06, + "loss": 0.0, + "step": 57062 + }, + { + "epoch": 5.324531118783241, + "grad_norm": NaN, + "learning_rate": 9.882947755820247e-06, + "loss": 0.0, + "step": 57063 + }, + { + "epoch": 5.324624428478119, + "grad_norm": NaN, + "learning_rate": 9.880247846933386e-06, + "loss": 0.0, + "step": 57064 + }, + { + "epoch": 5.324717738172996, + "grad_norm": NaN, + "learning_rate": 9.877548294329497e-06, + "loss": 0.0, + "step": 57065 + }, + { + "epoch": 5.324811047867874, + "grad_norm": NaN, + "learning_rate": 9.87484909801552e-06, + "loss": 0.0, + "step": 57066 + }, + { + "epoch": 5.324904357562751, + "grad_norm": NaN, + "learning_rate": 9.8721502579983e-06, + "loss": 0.0, + "step": 57067 + }, + { + "epoch": 5.3249976672576285, + "grad_norm": NaN, + "learning_rate": 9.86945177428467e-06, + "loss": 0.0, + "step": 57068 + }, + { + "epoch": 5.325090976952505, + "grad_norm": NaN, + "learning_rate": 9.86675364688152e-06, + "loss": 0.0, + "step": 57069 + }, + { + "epoch": 5.325184286647382, + "grad_norm": NaN, + "learning_rate": 9.864055875795746e-06, + "loss": 0.0, + "step": 57070 + }, + { + "epoch": 5.32527759634226, + "grad_norm": NaN, + "learning_rate": 9.861358461034108e-06, + "loss": 0.0, + "step": 57071 + }, + { + "epoch": 5.325370906037137, + "grad_norm": NaN, + "learning_rate": 9.858661402603534e-06, + "loss": 0.0, + "step": 57072 + }, + { + "epoch": 5.325464215732015, + "grad_norm": NaN, + "learning_rate": 9.855964700510921e-06, + "loss": 0.0, + "step": 57073 + }, + { + "epoch": 5.325557525426892, + "grad_norm": NaN, + "learning_rate": 9.853268354763011e-06, + "loss": 0.0, + "step": 57074 + }, + { + "epoch": 5.3256508351217695, + "grad_norm": NaN, + "learning_rate": 9.850572365366749e-06, + "loss": 0.0, + "step": 57075 + }, + { + "epoch": 5.325744144816646, + "grad_norm": NaN, + "learning_rate": 9.847876732328996e-06, + "loss": 0.0, + "step": 57076 + }, + { + "epoch": 5.3258374545115235, + "grad_norm": NaN, + "learning_rate": 9.84518145565653e-06, + "loss": 0.0, + "step": 57077 + }, + { + "epoch": 5.325930764206401, + "grad_norm": NaN, + "learning_rate": 9.842486535356264e-06, + "loss": 0.0, + "step": 57078 + }, + { + "epoch": 5.326024073901278, + "grad_norm": NaN, + "learning_rate": 9.839791971435074e-06, + "loss": 0.0, + "step": 57079 + }, + { + "epoch": 5.326117383596156, + "grad_norm": NaN, + "learning_rate": 9.83709776389972e-06, + "loss": 0.0, + "step": 57080 + }, + { + "epoch": 5.326210693291033, + "grad_norm": NaN, + "learning_rate": 9.834403912757132e-06, + "loss": 0.0, + "step": 57081 + }, + { + "epoch": 5.326304002985911, + "grad_norm": NaN, + "learning_rate": 9.831710418014171e-06, + "loss": 0.0, + "step": 57082 + }, + { + "epoch": 5.326397312680788, + "grad_norm": NaN, + "learning_rate": 9.82901727967758e-06, + "loss": 0.0, + "step": 57083 + }, + { + "epoch": 5.3264906223756645, + "grad_norm": NaN, + "learning_rate": 9.826324497754307e-06, + "loss": 0.0, + "step": 57084 + }, + { + "epoch": 5.326583932070542, + "grad_norm": NaN, + "learning_rate": 9.823632072251209e-06, + "loss": 0.0, + "step": 57085 + }, + { + "epoch": 5.326677241765419, + "grad_norm": NaN, + "learning_rate": 9.820940003175032e-06, + "loss": 0.0, + "step": 57086 + }, + { + "epoch": 5.326770551460297, + "grad_norm": NaN, + "learning_rate": 9.818248290532687e-06, + "loss": 0.0, + "step": 57087 + }, + { + "epoch": 5.326863861155174, + "grad_norm": NaN, + "learning_rate": 9.815556934331054e-06, + "loss": 0.0, + "step": 57088 + }, + { + "epoch": 5.326957170850052, + "grad_norm": NaN, + "learning_rate": 9.81286593457689e-06, + "loss": 0.0, + "step": 57089 + }, + { + "epoch": 5.327050480544929, + "grad_norm": NaN, + "learning_rate": 9.810175291277095e-06, + "loss": 0.0, + "step": 57090 + }, + { + "epoch": 5.3271437902398056, + "grad_norm": NaN, + "learning_rate": 9.807485004438525e-06, + "loss": 0.0, + "step": 57091 + }, + { + "epoch": 5.327237099934683, + "grad_norm": NaN, + "learning_rate": 9.804795074067945e-06, + "loss": 0.0, + "step": 57092 + }, + { + "epoch": 5.32733040962956, + "grad_norm": NaN, + "learning_rate": 9.802105500172263e-06, + "loss": 0.0, + "step": 57093 + }, + { + "epoch": 5.327423719324438, + "grad_norm": NaN, + "learning_rate": 9.799416282758294e-06, + "loss": 0.0, + "step": 57094 + }, + { + "epoch": 5.327517029019315, + "grad_norm": NaN, + "learning_rate": 9.796727421832895e-06, + "loss": 0.0, + "step": 57095 + }, + { + "epoch": 5.327610338714193, + "grad_norm": NaN, + "learning_rate": 9.79403891740288e-06, + "loss": 0.0, + "step": 57096 + }, + { + "epoch": 5.32770364840907, + "grad_norm": NaN, + "learning_rate": 9.791350769475076e-06, + "loss": 0.0, + "step": 57097 + }, + { + "epoch": 5.327796958103947, + "grad_norm": NaN, + "learning_rate": 9.788662978056345e-06, + "loss": 0.0, + "step": 57098 + }, + { + "epoch": 5.327890267798824, + "grad_norm": NaN, + "learning_rate": 9.785975543153546e-06, + "loss": 0.0, + "step": 57099 + }, + { + "epoch": 5.327983577493701, + "grad_norm": NaN, + "learning_rate": 9.78328846477341e-06, + "loss": 0.0, + "step": 57100 + }, + { + "epoch": 5.328076887188579, + "grad_norm": NaN, + "learning_rate": 9.780601742922878e-06, + "loss": 0.0, + "step": 57101 + }, + { + "epoch": 5.328170196883456, + "grad_norm": NaN, + "learning_rate": 9.777915377608748e-06, + "loss": 0.0, + "step": 57102 + }, + { + "epoch": 5.328263506578334, + "grad_norm": NaN, + "learning_rate": 9.775229368837795e-06, + "loss": 0.0, + "step": 57103 + }, + { + "epoch": 5.328356816273211, + "grad_norm": NaN, + "learning_rate": 9.772543716616932e-06, + "loss": 0.0, + "step": 57104 + }, + { + "epoch": 5.328450125968088, + "grad_norm": NaN, + "learning_rate": 9.76985842095297e-06, + "loss": 0.0, + "step": 57105 + }, + { + "epoch": 5.328543435662965, + "grad_norm": NaN, + "learning_rate": 9.76717348185267e-06, + "loss": 0.0, + "step": 57106 + }, + { + "epoch": 5.3286367453578425, + "grad_norm": NaN, + "learning_rate": 9.764488899322925e-06, + "loss": 0.0, + "step": 57107 + }, + { + "epoch": 5.32873005505272, + "grad_norm": NaN, + "learning_rate": 9.761804673370583e-06, + "loss": 0.0, + "step": 57108 + }, + { + "epoch": 5.328823364747597, + "grad_norm": NaN, + "learning_rate": 9.759120804002368e-06, + "loss": 0.0, + "step": 57109 + }, + { + "epoch": 5.328916674442475, + "grad_norm": NaN, + "learning_rate": 9.756437291225194e-06, + "loss": 0.0, + "step": 57110 + }, + { + "epoch": 5.329009984137352, + "grad_norm": NaN, + "learning_rate": 9.753754135045889e-06, + "loss": 0.0, + "step": 57111 + }, + { + "epoch": 5.32910329383223, + "grad_norm": NaN, + "learning_rate": 9.751071335471195e-06, + "loss": 0.0, + "step": 57112 + }, + { + "epoch": 5.329196603527106, + "grad_norm": NaN, + "learning_rate": 9.748388892507992e-06, + "loss": 0.0, + "step": 57113 + }, + { + "epoch": 5.3292899132219835, + "grad_norm": NaN, + "learning_rate": 9.745706806163123e-06, + "loss": 0.0, + "step": 57114 + }, + { + "epoch": 5.329383222916861, + "grad_norm": NaN, + "learning_rate": 9.743025076443334e-06, + "loss": 0.0, + "step": 57115 + }, + { + "epoch": 5.329476532611738, + "grad_norm": NaN, + "learning_rate": 9.740343703355502e-06, + "loss": 0.0, + "step": 57116 + }, + { + "epoch": 5.329569842306616, + "grad_norm": NaN, + "learning_rate": 9.737662686906455e-06, + "loss": 0.0, + "step": 57117 + }, + { + "epoch": 5.329663152001493, + "grad_norm": NaN, + "learning_rate": 9.734982027102921e-06, + "loss": 0.0, + "step": 57118 + }, + { + "epoch": 5.329756461696371, + "grad_norm": NaN, + "learning_rate": 9.732301723951814e-06, + "loss": 0.0, + "step": 57119 + }, + { + "epoch": 5.329849771391247, + "grad_norm": NaN, + "learning_rate": 9.72962177745994e-06, + "loss": 0.0, + "step": 57120 + }, + { + "epoch": 5.329943081086125, + "grad_norm": NaN, + "learning_rate": 9.726942187634029e-06, + "loss": 0.0, + "step": 57121 + }, + { + "epoch": 5.330036390781002, + "grad_norm": NaN, + "learning_rate": 9.724262954480993e-06, + "loss": 0.0, + "step": 57122 + }, + { + "epoch": 5.330129700475879, + "grad_norm": NaN, + "learning_rate": 9.721584078007611e-06, + "loss": 0.0, + "step": 57123 + }, + { + "epoch": 5.330223010170757, + "grad_norm": NaN, + "learning_rate": 9.718905558220641e-06, + "loss": 0.0, + "step": 57124 + }, + { + "epoch": 5.330316319865634, + "grad_norm": NaN, + "learning_rate": 9.71622739512698e-06, + "loss": 0.0, + "step": 57125 + }, + { + "epoch": 5.330409629560512, + "grad_norm": NaN, + "learning_rate": 9.713549588733404e-06, + "loss": 0.0, + "step": 57126 + }, + { + "epoch": 5.330502939255389, + "grad_norm": NaN, + "learning_rate": 9.710872139046678e-06, + "loss": 0.0, + "step": 57127 + }, + { + "epoch": 5.330596248950266, + "grad_norm": NaN, + "learning_rate": 9.708195046073659e-06, + "loss": 0.0, + "step": 57128 + }, + { + "epoch": 5.330689558645143, + "grad_norm": NaN, + "learning_rate": 9.705518309821175e-06, + "loss": 0.0, + "step": 57129 + }, + { + "epoch": 5.3307828683400205, + "grad_norm": NaN, + "learning_rate": 9.702841930295957e-06, + "loss": 0.0, + "step": 57130 + }, + { + "epoch": 5.330876178034898, + "grad_norm": NaN, + "learning_rate": 9.700165907504864e-06, + "loss": 0.0, + "step": 57131 + }, + { + "epoch": 5.330969487729775, + "grad_norm": NaN, + "learning_rate": 9.697490241454691e-06, + "loss": 0.0, + "step": 57132 + }, + { + "epoch": 5.331062797424653, + "grad_norm": NaN, + "learning_rate": 9.694814932152251e-06, + "loss": 0.0, + "step": 57133 + }, + { + "epoch": 5.331156107119529, + "grad_norm": NaN, + "learning_rate": 9.692139979604318e-06, + "loss": 0.0, + "step": 57134 + }, + { + "epoch": 5.331249416814407, + "grad_norm": NaN, + "learning_rate": 9.689465383817724e-06, + "loss": 0.0, + "step": 57135 + }, + { + "epoch": 5.331342726509284, + "grad_norm": NaN, + "learning_rate": 9.686791144799261e-06, + "loss": 0.0, + "step": 57136 + }, + { + "epoch": 5.3314360362041615, + "grad_norm": NaN, + "learning_rate": 9.684117262555708e-06, + "loss": 0.0, + "step": 57137 + }, + { + "epoch": 5.331529345899039, + "grad_norm": NaN, + "learning_rate": 9.681443737093891e-06, + "loss": 0.0, + "step": 57138 + }, + { + "epoch": 5.331622655593916, + "grad_norm": NaN, + "learning_rate": 9.678770568420591e-06, + "loss": 0.0, + "step": 57139 + }, + { + "epoch": 5.331715965288794, + "grad_norm": NaN, + "learning_rate": 9.676097756542617e-06, + "loss": 0.0, + "step": 57140 + }, + { + "epoch": 5.331809274983671, + "grad_norm": NaN, + "learning_rate": 9.673425301466764e-06, + "loss": 0.0, + "step": 57141 + }, + { + "epoch": 5.331902584678548, + "grad_norm": NaN, + "learning_rate": 9.67075320319981e-06, + "loss": 0.0, + "step": 57142 + }, + { + "epoch": 5.331995894373425, + "grad_norm": NaN, + "learning_rate": 9.6680814617486e-06, + "loss": 0.0, + "step": 57143 + }, + { + "epoch": 5.332089204068303, + "grad_norm": NaN, + "learning_rate": 9.665410077119829e-06, + "loss": 0.0, + "step": 57144 + }, + { + "epoch": 5.33218251376318, + "grad_norm": NaN, + "learning_rate": 9.662739049320389e-06, + "loss": 0.0, + "step": 57145 + }, + { + "epoch": 5.332275823458057, + "grad_norm": NaN, + "learning_rate": 9.660068378357044e-06, + "loss": 0.0, + "step": 57146 + }, + { + "epoch": 5.332369133152935, + "grad_norm": NaN, + "learning_rate": 9.657398064236522e-06, + "loss": 0.0, + "step": 57147 + }, + { + "epoch": 5.332462442847812, + "grad_norm": NaN, + "learning_rate": 9.654728106965698e-06, + "loss": 0.0, + "step": 57148 + }, + { + "epoch": 5.332555752542689, + "grad_norm": NaN, + "learning_rate": 9.652058506551353e-06, + "loss": 0.0, + "step": 57149 + }, + { + "epoch": 5.332649062237566, + "grad_norm": NaN, + "learning_rate": 9.649389263000196e-06, + "loss": 0.0, + "step": 57150 + }, + { + "epoch": 5.332742371932444, + "grad_norm": NaN, + "learning_rate": 9.64672037631909e-06, + "loss": 0.0, + "step": 57151 + }, + { + "epoch": 5.332835681627321, + "grad_norm": NaN, + "learning_rate": 9.644051846514811e-06, + "loss": 0.0, + "step": 57152 + }, + { + "epoch": 5.332928991322198, + "grad_norm": NaN, + "learning_rate": 9.641383673594106e-06, + "loss": 0.0, + "step": 57153 + }, + { + "epoch": 5.333022301017076, + "grad_norm": NaN, + "learning_rate": 9.6387158575638e-06, + "loss": 0.0, + "step": 57154 + }, + { + "epoch": 5.333115610711953, + "grad_norm": NaN, + "learning_rate": 9.636048398430673e-06, + "loss": 0.0, + "step": 57155 + }, + { + "epoch": 5.333208920406831, + "grad_norm": NaN, + "learning_rate": 9.63338129620147e-06, + "loss": 0.0, + "step": 57156 + }, + { + "epoch": 5.333302230101707, + "grad_norm": NaN, + "learning_rate": 9.630714550883001e-06, + "loss": 0.0, + "step": 57157 + }, + { + "epoch": 5.333395539796585, + "grad_norm": NaN, + "learning_rate": 9.628048162482078e-06, + "loss": 0.0, + "step": 57158 + }, + { + "epoch": 5.333488849491462, + "grad_norm": NaN, + "learning_rate": 9.62538213100541e-06, + "loss": 0.0, + "step": 57159 + }, + { + "epoch": 5.3335821591863395, + "grad_norm": NaN, + "learning_rate": 9.622716456459829e-06, + "loss": 0.0, + "step": 57160 + }, + { + "epoch": 5.333675468881217, + "grad_norm": NaN, + "learning_rate": 9.62005113885211e-06, + "loss": 0.0, + "step": 57161 + }, + { + "epoch": 5.333768778576094, + "grad_norm": NaN, + "learning_rate": 9.61738617818898e-06, + "loss": 0.0, + "step": 57162 + }, + { + "epoch": 5.333862088270972, + "grad_norm": NaN, + "learning_rate": 9.614721574477285e-06, + "loss": 0.0, + "step": 57163 + }, + { + "epoch": 5.333955397965848, + "grad_norm": NaN, + "learning_rate": 9.61205732772377e-06, + "loss": 0.0, + "step": 57164 + }, + { + "epoch": 5.334048707660726, + "grad_norm": NaN, + "learning_rate": 9.60939343793518e-06, + "loss": 0.0, + "step": 57165 + }, + { + "epoch": 5.334142017355603, + "grad_norm": NaN, + "learning_rate": 9.606729905118327e-06, + "loss": 0.0, + "step": 57166 + }, + { + "epoch": 5.3342353270504805, + "grad_norm": NaN, + "learning_rate": 9.604066729279986e-06, + "loss": 0.0, + "step": 57167 + }, + { + "epoch": 5.334328636745358, + "grad_norm": NaN, + "learning_rate": 9.601403910426903e-06, + "loss": 0.0, + "step": 57168 + }, + { + "epoch": 5.334421946440235, + "grad_norm": NaN, + "learning_rate": 9.598741448565856e-06, + "loss": 0.0, + "step": 57169 + }, + { + "epoch": 5.334515256135113, + "grad_norm": NaN, + "learning_rate": 9.59607934370364e-06, + "loss": 0.0, + "step": 57170 + }, + { + "epoch": 5.334608565829989, + "grad_norm": NaN, + "learning_rate": 9.59341759584698e-06, + "loss": 0.0, + "step": 57171 + }, + { + "epoch": 5.334701875524867, + "grad_norm": NaN, + "learning_rate": 9.590756205002676e-06, + "loss": 0.0, + "step": 57172 + }, + { + "epoch": 5.334795185219744, + "grad_norm": NaN, + "learning_rate": 9.588095171177502e-06, + "loss": 0.0, + "step": 57173 + }, + { + "epoch": 5.334888494914622, + "grad_norm": NaN, + "learning_rate": 9.585434494378186e-06, + "loss": 0.0, + "step": 57174 + }, + { + "epoch": 5.334981804609499, + "grad_norm": NaN, + "learning_rate": 9.582774174611524e-06, + "loss": 0.0, + "step": 57175 + }, + { + "epoch": 5.335075114304376, + "grad_norm": NaN, + "learning_rate": 9.580114211884277e-06, + "loss": 0.0, + "step": 57176 + }, + { + "epoch": 5.335168423999254, + "grad_norm": NaN, + "learning_rate": 9.577454606203205e-06, + "loss": 0.0, + "step": 57177 + }, + { + "epoch": 5.33526173369413, + "grad_norm": NaN, + "learning_rate": 9.574795357575072e-06, + "loss": 0.0, + "step": 57178 + }, + { + "epoch": 5.335355043389008, + "grad_norm": NaN, + "learning_rate": 9.572136466006619e-06, + "loss": 0.0, + "step": 57179 + }, + { + "epoch": 5.335448353083885, + "grad_norm": NaN, + "learning_rate": 9.569477931504644e-06, + "loss": 0.0, + "step": 57180 + }, + { + "epoch": 5.335541662778763, + "grad_norm": NaN, + "learning_rate": 9.566819754075889e-06, + "loss": 0.0, + "step": 57181 + }, + { + "epoch": 5.33563497247364, + "grad_norm": NaN, + "learning_rate": 9.564161933727099e-06, + "loss": 0.0, + "step": 57182 + }, + { + "epoch": 5.3357282821685175, + "grad_norm": NaN, + "learning_rate": 9.561504470465053e-06, + "loss": 0.0, + "step": 57183 + }, + { + "epoch": 5.335821591863395, + "grad_norm": NaN, + "learning_rate": 9.558847364296496e-06, + "loss": 0.0, + "step": 57184 + }, + { + "epoch": 5.335914901558272, + "grad_norm": NaN, + "learning_rate": 9.556190615228188e-06, + "loss": 0.0, + "step": 57185 + }, + { + "epoch": 5.336008211253149, + "grad_norm": NaN, + "learning_rate": 9.55353422326689e-06, + "loss": 0.0, + "step": 57186 + }, + { + "epoch": 5.336101520948026, + "grad_norm": NaN, + "learning_rate": 9.550878188419347e-06, + "loss": 0.0, + "step": 57187 + }, + { + "epoch": 5.336194830642904, + "grad_norm": NaN, + "learning_rate": 9.548222510692322e-06, + "loss": 0.0, + "step": 57188 + }, + { + "epoch": 5.336288140337781, + "grad_norm": NaN, + "learning_rate": 9.545567190092557e-06, + "loss": 0.0, + "step": 57189 + }, + { + "epoch": 5.3363814500326585, + "grad_norm": NaN, + "learning_rate": 9.54291222662683e-06, + "loss": 0.0, + "step": 57190 + }, + { + "epoch": 5.336474759727536, + "grad_norm": NaN, + "learning_rate": 9.54025762030184e-06, + "loss": 0.0, + "step": 57191 + }, + { + "epoch": 5.336568069422413, + "grad_norm": NaN, + "learning_rate": 9.537603371124375e-06, + "loss": 0.0, + "step": 57192 + }, + { + "epoch": 5.33666137911729, + "grad_norm": NaN, + "learning_rate": 9.5349494791012e-06, + "loss": 0.0, + "step": 57193 + }, + { + "epoch": 5.336754688812167, + "grad_norm": NaN, + "learning_rate": 9.532295944238994e-06, + "loss": 0.0, + "step": 57194 + }, + { + "epoch": 5.336847998507045, + "grad_norm": NaN, + "learning_rate": 9.529642766544583e-06, + "loss": 0.0, + "step": 57195 + }, + { + "epoch": 5.336941308201922, + "grad_norm": NaN, + "learning_rate": 9.526989946024694e-06, + "loss": 0.0, + "step": 57196 + }, + { + "epoch": 5.3370346178968, + "grad_norm": NaN, + "learning_rate": 9.524337482686007e-06, + "loss": 0.0, + "step": 57197 + }, + { + "epoch": 5.337127927591677, + "grad_norm": NaN, + "learning_rate": 9.521685376535349e-06, + "loss": 0.0, + "step": 57198 + }, + { + "epoch": 5.337221237286554, + "grad_norm": NaN, + "learning_rate": 9.519033627579464e-06, + "loss": 0.0, + "step": 57199 + }, + { + "epoch": 5.337314546981432, + "grad_norm": NaN, + "learning_rate": 9.516382235825e-06, + "loss": 0.0, + "step": 57200 + }, + { + "epoch": 5.337407856676308, + "grad_norm": NaN, + "learning_rate": 9.513731201278796e-06, + "loss": 0.0, + "step": 57201 + }, + { + "epoch": 5.337501166371186, + "grad_norm": NaN, + "learning_rate": 9.511080523947584e-06, + "loss": 0.0, + "step": 57202 + }, + { + "epoch": 5.337594476066063, + "grad_norm": NaN, + "learning_rate": 9.508430203838024e-06, + "loss": 0.0, + "step": 57203 + }, + { + "epoch": 5.337687785760941, + "grad_norm": NaN, + "learning_rate": 9.505780240956928e-06, + "loss": 0.0, + "step": 57204 + }, + { + "epoch": 5.337781095455818, + "grad_norm": NaN, + "learning_rate": 9.503130635311024e-06, + "loss": 0.0, + "step": 57205 + }, + { + "epoch": 5.3378744051506954, + "grad_norm": NaN, + "learning_rate": 9.500481386907038e-06, + "loss": 0.0, + "step": 57206 + }, + { + "epoch": 5.337967714845573, + "grad_norm": NaN, + "learning_rate": 9.4978324957517e-06, + "loss": 0.0, + "step": 57207 + }, + { + "epoch": 5.338061024540449, + "grad_norm": NaN, + "learning_rate": 9.49518396185177e-06, + "loss": 0.0, + "step": 57208 + }, + { + "epoch": 5.338154334235327, + "grad_norm": NaN, + "learning_rate": 9.492535785213962e-06, + "loss": 0.0, + "step": 57209 + }, + { + "epoch": 5.338247643930204, + "grad_norm": NaN, + "learning_rate": 9.489887965845017e-06, + "loss": 0.0, + "step": 57210 + }, + { + "epoch": 5.338340953625082, + "grad_norm": NaN, + "learning_rate": 9.487240503751648e-06, + "loss": 0.0, + "step": 57211 + }, + { + "epoch": 5.338434263319959, + "grad_norm": NaN, + "learning_rate": 9.484593398940615e-06, + "loss": 0.0, + "step": 57212 + }, + { + "epoch": 5.3385275730148365, + "grad_norm": NaN, + "learning_rate": 9.481946651418647e-06, + "loss": 0.0, + "step": 57213 + }, + { + "epoch": 5.338620882709714, + "grad_norm": NaN, + "learning_rate": 9.479300261192458e-06, + "loss": 0.0, + "step": 57214 + }, + { + "epoch": 5.33871419240459, + "grad_norm": NaN, + "learning_rate": 9.47665422826877e-06, + "loss": 0.0, + "step": 57215 + }, + { + "epoch": 5.338807502099468, + "grad_norm": NaN, + "learning_rate": 9.474008552654334e-06, + "loss": 0.0, + "step": 57216 + }, + { + "epoch": 5.338900811794345, + "grad_norm": NaN, + "learning_rate": 9.471363234355872e-06, + "loss": 0.0, + "step": 57217 + }, + { + "epoch": 5.338994121489223, + "grad_norm": NaN, + "learning_rate": 9.4687182733801e-06, + "loss": 0.0, + "step": 57218 + }, + { + "epoch": 5.3390874311841, + "grad_norm": NaN, + "learning_rate": 9.466073669733742e-06, + "loss": 0.0, + "step": 57219 + }, + { + "epoch": 5.3391807408789775, + "grad_norm": NaN, + "learning_rate": 9.463429423423547e-06, + "loss": 0.0, + "step": 57220 + }, + { + "epoch": 5.339274050573855, + "grad_norm": NaN, + "learning_rate": 9.460785534456206e-06, + "loss": 0.0, + "step": 57221 + }, + { + "epoch": 5.3393673602687315, + "grad_norm": NaN, + "learning_rate": 9.458142002838465e-06, + "loss": 0.0, + "step": 57222 + }, + { + "epoch": 5.339460669963609, + "grad_norm": NaN, + "learning_rate": 9.455498828577035e-06, + "loss": 0.0, + "step": 57223 + }, + { + "epoch": 5.339553979658486, + "grad_norm": NaN, + "learning_rate": 9.452856011678628e-06, + "loss": 0.0, + "step": 57224 + }, + { + "epoch": 5.339647289353364, + "grad_norm": NaN, + "learning_rate": 9.450213552149987e-06, + "loss": 0.0, + "step": 57225 + }, + { + "epoch": 5.339740599048241, + "grad_norm": NaN, + "learning_rate": 9.447571449997827e-06, + "loss": 0.0, + "step": 57226 + }, + { + "epoch": 5.339833908743119, + "grad_norm": NaN, + "learning_rate": 9.444929705228838e-06, + "loss": 0.0, + "step": 57227 + }, + { + "epoch": 5.339927218437996, + "grad_norm": NaN, + "learning_rate": 9.442288317849767e-06, + "loss": 0.0, + "step": 57228 + }, + { + "epoch": 5.340020528132873, + "grad_norm": NaN, + "learning_rate": 9.439647287867324e-06, + "loss": 0.0, + "step": 57229 + }, + { + "epoch": 5.34011383782775, + "grad_norm": NaN, + "learning_rate": 9.437006615288224e-06, + "loss": 0.0, + "step": 57230 + }, + { + "epoch": 5.340207147522627, + "grad_norm": NaN, + "learning_rate": 9.434366300119173e-06, + "loss": 0.0, + "step": 57231 + }, + { + "epoch": 5.340300457217505, + "grad_norm": NaN, + "learning_rate": 9.431726342366885e-06, + "loss": 0.0, + "step": 57232 + }, + { + "epoch": 5.340393766912382, + "grad_norm": NaN, + "learning_rate": 9.429086742038088e-06, + "loss": 0.0, + "step": 57233 + }, + { + "epoch": 5.34048707660726, + "grad_norm": NaN, + "learning_rate": 9.426447499139494e-06, + "loss": 0.0, + "step": 57234 + }, + { + "epoch": 5.340580386302137, + "grad_norm": NaN, + "learning_rate": 9.423808613677764e-06, + "loss": 0.0, + "step": 57235 + }, + { + "epoch": 5.3406736959970145, + "grad_norm": NaN, + "learning_rate": 9.421170085659675e-06, + "loss": 0.0, + "step": 57236 + }, + { + "epoch": 5.340767005691891, + "grad_norm": NaN, + "learning_rate": 9.41853191509192e-06, + "loss": 0.0, + "step": 57237 + }, + { + "epoch": 5.340860315386768, + "grad_norm": NaN, + "learning_rate": 9.415894101981147e-06, + "loss": 0.0, + "step": 57238 + }, + { + "epoch": 5.340953625081646, + "grad_norm": NaN, + "learning_rate": 9.41325664633415e-06, + "loss": 0.0, + "step": 57239 + }, + { + "epoch": 5.341046934776523, + "grad_norm": NaN, + "learning_rate": 9.410619548157605e-06, + "loss": 0.0, + "step": 57240 + }, + { + "epoch": 5.341140244471401, + "grad_norm": NaN, + "learning_rate": 9.407982807458175e-06, + "loss": 0.0, + "step": 57241 + }, + { + "epoch": 5.341233554166278, + "grad_norm": NaN, + "learning_rate": 9.405346424242605e-06, + "loss": 0.0, + "step": 57242 + }, + { + "epoch": 5.3413268638611555, + "grad_norm": NaN, + "learning_rate": 9.402710398517604e-06, + "loss": 0.0, + "step": 57243 + }, + { + "epoch": 5.341420173556033, + "grad_norm": NaN, + "learning_rate": 9.40007473028987e-06, + "loss": 0.0, + "step": 57244 + }, + { + "epoch": 5.3415134832509095, + "grad_norm": NaN, + "learning_rate": 9.397439419566077e-06, + "loss": 0.0, + "step": 57245 + }, + { + "epoch": 5.341606792945787, + "grad_norm": NaN, + "learning_rate": 9.394804466352957e-06, + "loss": 0.0, + "step": 57246 + }, + { + "epoch": 5.341700102640664, + "grad_norm": NaN, + "learning_rate": 9.3921698706572e-06, + "loss": 0.0, + "step": 57247 + }, + { + "epoch": 5.341793412335542, + "grad_norm": NaN, + "learning_rate": 9.389535632485507e-06, + "loss": 0.0, + "step": 57248 + }, + { + "epoch": 5.341886722030419, + "grad_norm": NaN, + "learning_rate": 9.386901751844566e-06, + "loss": 0.0, + "step": 57249 + }, + { + "epoch": 5.341980031725297, + "grad_norm": NaN, + "learning_rate": 9.384268228741093e-06, + "loss": 0.0, + "step": 57250 + }, + { + "epoch": 5.342073341420173, + "grad_norm": NaN, + "learning_rate": 9.381635063181747e-06, + "loss": 0.0, + "step": 57251 + }, + { + "epoch": 5.3421666511150505, + "grad_norm": NaN, + "learning_rate": 9.379002255173273e-06, + "loss": 0.0, + "step": 57252 + }, + { + "epoch": 5.342259960809928, + "grad_norm": NaN, + "learning_rate": 9.376369804722333e-06, + "loss": 0.0, + "step": 57253 + }, + { + "epoch": 5.342353270504805, + "grad_norm": NaN, + "learning_rate": 9.373737711835622e-06, + "loss": 0.0, + "step": 57254 + }, + { + "epoch": 5.342446580199683, + "grad_norm": NaN, + "learning_rate": 9.371105976519849e-06, + "loss": 0.0, + "step": 57255 + }, + { + "epoch": 5.34253988989456, + "grad_norm": NaN, + "learning_rate": 9.368474598781694e-06, + "loss": 0.0, + "step": 57256 + }, + { + "epoch": 5.342633199589438, + "grad_norm": NaN, + "learning_rate": 9.365843578627853e-06, + "loss": 0.0, + "step": 57257 + }, + { + "epoch": 5.342726509284315, + "grad_norm": NaN, + "learning_rate": 9.363212916064999e-06, + "loss": 0.0, + "step": 57258 + }, + { + "epoch": 5.342819818979192, + "grad_norm": NaN, + "learning_rate": 9.360582611099849e-06, + "loss": 0.0, + "step": 57259 + }, + { + "epoch": 5.342913128674069, + "grad_norm": NaN, + "learning_rate": 9.357952663739076e-06, + "loss": 0.0, + "step": 57260 + }, + { + "epoch": 5.343006438368946, + "grad_norm": NaN, + "learning_rate": 9.355323073989363e-06, + "loss": 0.0, + "step": 57261 + }, + { + "epoch": 5.343099748063824, + "grad_norm": NaN, + "learning_rate": 9.3526938418574e-06, + "loss": 0.0, + "step": 57262 + }, + { + "epoch": 5.343193057758701, + "grad_norm": NaN, + "learning_rate": 9.350064967349885e-06, + "loss": 0.0, + "step": 57263 + }, + { + "epoch": 5.343286367453579, + "grad_norm": NaN, + "learning_rate": 9.347436450473478e-06, + "loss": 0.0, + "step": 57264 + }, + { + "epoch": 5.343379677148456, + "grad_norm": NaN, + "learning_rate": 9.34480829123489e-06, + "loss": 0.0, + "step": 57265 + }, + { + "epoch": 5.343472986843333, + "grad_norm": NaN, + "learning_rate": 9.342180489640782e-06, + "loss": 0.0, + "step": 57266 + }, + { + "epoch": 5.34356629653821, + "grad_norm": NaN, + "learning_rate": 9.339553045697833e-06, + "loss": 0.0, + "step": 57267 + }, + { + "epoch": 5.3436596062330874, + "grad_norm": NaN, + "learning_rate": 9.336925959412739e-06, + "loss": 0.0, + "step": 57268 + }, + { + "epoch": 5.343752915927965, + "grad_norm": NaN, + "learning_rate": 9.334299230792192e-06, + "loss": 0.0, + "step": 57269 + }, + { + "epoch": 5.343846225622842, + "grad_norm": NaN, + "learning_rate": 9.331672859842837e-06, + "loss": 0.0, + "step": 57270 + }, + { + "epoch": 5.34393953531772, + "grad_norm": NaN, + "learning_rate": 9.32904684657137e-06, + "loss": 0.0, + "step": 57271 + }, + { + "epoch": 5.344032845012597, + "grad_norm": NaN, + "learning_rate": 9.326421190984468e-06, + "loss": 0.0, + "step": 57272 + }, + { + "epoch": 5.3441261547074745, + "grad_norm": NaN, + "learning_rate": 9.323795893088809e-06, + "loss": 0.0, + "step": 57273 + }, + { + "epoch": 5.344219464402351, + "grad_norm": NaN, + "learning_rate": 9.321170952891055e-06, + "loss": 0.0, + "step": 57274 + }, + { + "epoch": 5.3443127740972285, + "grad_norm": NaN, + "learning_rate": 9.3185463703979e-06, + "loss": 0.0, + "step": 57275 + }, + { + "epoch": 5.344406083792106, + "grad_norm": NaN, + "learning_rate": 9.315922145616023e-06, + "loss": 0.0, + "step": 57276 + }, + { + "epoch": 5.344499393486983, + "grad_norm": NaN, + "learning_rate": 9.313298278552067e-06, + "loss": 0.0, + "step": 57277 + }, + { + "epoch": 5.344592703181861, + "grad_norm": NaN, + "learning_rate": 9.310674769212728e-06, + "loss": 0.0, + "step": 57278 + }, + { + "epoch": 5.344686012876738, + "grad_norm": NaN, + "learning_rate": 9.30805161760465e-06, + "loss": 0.0, + "step": 57279 + }, + { + "epoch": 5.344779322571616, + "grad_norm": NaN, + "learning_rate": 9.305428823734529e-06, + "loss": 0.0, + "step": 57280 + }, + { + "epoch": 5.344872632266492, + "grad_norm": NaN, + "learning_rate": 9.302806387609024e-06, + "loss": 0.0, + "step": 57281 + }, + { + "epoch": 5.3449659419613695, + "grad_norm": NaN, + "learning_rate": 9.300184309234815e-06, + "loss": 0.0, + "step": 57282 + }, + { + "epoch": 5.345059251656247, + "grad_norm": NaN, + "learning_rate": 9.297562588618545e-06, + "loss": 0.0, + "step": 57283 + }, + { + "epoch": 5.345152561351124, + "grad_norm": NaN, + "learning_rate": 9.294941225766895e-06, + "loss": 0.0, + "step": 57284 + }, + { + "epoch": 5.345245871046002, + "grad_norm": NaN, + "learning_rate": 9.292320220686539e-06, + "loss": 0.0, + "step": 57285 + }, + { + "epoch": 5.345339180740879, + "grad_norm": NaN, + "learning_rate": 9.289699573384125e-06, + "loss": 0.0, + "step": 57286 + }, + { + "epoch": 5.345432490435757, + "grad_norm": NaN, + "learning_rate": 9.287079283866328e-06, + "loss": 0.0, + "step": 57287 + }, + { + "epoch": 5.345525800130633, + "grad_norm": NaN, + "learning_rate": 9.284459352139811e-06, + "loss": 0.0, + "step": 57288 + }, + { + "epoch": 5.345619109825511, + "grad_norm": NaN, + "learning_rate": 9.28183977821122e-06, + "loss": 0.0, + "step": 57289 + }, + { + "epoch": 5.345712419520388, + "grad_norm": NaN, + "learning_rate": 9.27922056208723e-06, + "loss": 0.0, + "step": 57290 + }, + { + "epoch": 5.345805729215265, + "grad_norm": NaN, + "learning_rate": 9.276601703774505e-06, + "loss": 0.0, + "step": 57291 + }, + { + "epoch": 5.345899038910143, + "grad_norm": NaN, + "learning_rate": 9.27398320327969e-06, + "loss": 0.0, + "step": 57292 + }, + { + "epoch": 5.34599234860502, + "grad_norm": NaN, + "learning_rate": 9.27136506060946e-06, + "loss": 0.0, + "step": 57293 + }, + { + "epoch": 5.346085658299898, + "grad_norm": NaN, + "learning_rate": 9.268747275770445e-06, + "loss": 0.0, + "step": 57294 + }, + { + "epoch": 5.346178967994774, + "grad_norm": NaN, + "learning_rate": 9.266129848769321e-06, + "loss": 0.0, + "step": 57295 + }, + { + "epoch": 5.346272277689652, + "grad_norm": NaN, + "learning_rate": 9.263512779612753e-06, + "loss": 0.0, + "step": 57296 + }, + { + "epoch": 5.346365587384529, + "grad_norm": NaN, + "learning_rate": 9.260896068307367e-06, + "loss": 0.0, + "step": 57297 + }, + { + "epoch": 5.3464588970794065, + "grad_norm": NaN, + "learning_rate": 9.258279714859839e-06, + "loss": 0.0, + "step": 57298 + }, + { + "epoch": 5.346552206774284, + "grad_norm": NaN, + "learning_rate": 9.255663719276818e-06, + "loss": 0.0, + "step": 57299 + }, + { + "epoch": 5.346645516469161, + "grad_norm": NaN, + "learning_rate": 9.253048081564945e-06, + "loss": 0.0, + "step": 57300 + }, + { + "epoch": 5.346738826164039, + "grad_norm": NaN, + "learning_rate": 9.250432801730884e-06, + "loss": 0.0, + "step": 57301 + }, + { + "epoch": 5.346832135858916, + "grad_norm": NaN, + "learning_rate": 9.247817879781294e-06, + "loss": 0.0, + "step": 57302 + }, + { + "epoch": 5.346925445553793, + "grad_norm": NaN, + "learning_rate": 9.245203315722787e-06, + "loss": 0.0, + "step": 57303 + }, + { + "epoch": 5.34701875524867, + "grad_norm": NaN, + "learning_rate": 9.242589109562043e-06, + "loss": 0.0, + "step": 57304 + }, + { + "epoch": 5.3471120649435475, + "grad_norm": NaN, + "learning_rate": 9.239975261305688e-06, + "loss": 0.0, + "step": 57305 + }, + { + "epoch": 5.347205374638425, + "grad_norm": NaN, + "learning_rate": 9.2373617709604e-06, + "loss": 0.0, + "step": 57306 + }, + { + "epoch": 5.347298684333302, + "grad_norm": NaN, + "learning_rate": 9.23474863853279e-06, + "loss": 0.0, + "step": 57307 + }, + { + "epoch": 5.34739199402818, + "grad_norm": NaN, + "learning_rate": 9.232135864029522e-06, + "loss": 0.0, + "step": 57308 + }, + { + "epoch": 5.347485303723057, + "grad_norm": NaN, + "learning_rate": 9.229523447457221e-06, + "loss": 0.0, + "step": 57309 + }, + { + "epoch": 5.347578613417934, + "grad_norm": NaN, + "learning_rate": 9.22691138882255e-06, + "loss": 0.0, + "step": 57310 + }, + { + "epoch": 5.347671923112811, + "grad_norm": NaN, + "learning_rate": 9.224299688132153e-06, + "loss": 0.0, + "step": 57311 + }, + { + "epoch": 5.347765232807689, + "grad_norm": NaN, + "learning_rate": 9.221688345392659e-06, + "loss": 0.0, + "step": 57312 + }, + { + "epoch": 5.347858542502566, + "grad_norm": NaN, + "learning_rate": 9.219077360610694e-06, + "loss": 0.0, + "step": 57313 + }, + { + "epoch": 5.347951852197443, + "grad_norm": NaN, + "learning_rate": 9.216466733792937e-06, + "loss": 0.0, + "step": 57314 + }, + { + "epoch": 5.348045161892321, + "grad_norm": NaN, + "learning_rate": 9.213856464945985e-06, + "loss": 0.0, + "step": 57315 + }, + { + "epoch": 5.348138471587198, + "grad_norm": NaN, + "learning_rate": 9.211246554076496e-06, + "loss": 0.0, + "step": 57316 + }, + { + "epoch": 5.348231781282076, + "grad_norm": NaN, + "learning_rate": 9.2086370011911e-06, + "loss": 0.0, + "step": 57317 + }, + { + "epoch": 5.348325090976952, + "grad_norm": NaN, + "learning_rate": 9.20602780629644e-06, + "loss": 0.0, + "step": 57318 + }, + { + "epoch": 5.34841840067183, + "grad_norm": NaN, + "learning_rate": 9.203418969399145e-06, + "loss": 0.0, + "step": 57319 + }, + { + "epoch": 5.348511710366707, + "grad_norm": NaN, + "learning_rate": 9.200810490505844e-06, + "loss": 0.0, + "step": 57320 + }, + { + "epoch": 5.3486050200615844, + "grad_norm": NaN, + "learning_rate": 9.198202369623164e-06, + "loss": 0.0, + "step": 57321 + }, + { + "epoch": 5.348698329756462, + "grad_norm": NaN, + "learning_rate": 9.195594606757766e-06, + "loss": 0.0, + "step": 57322 + }, + { + "epoch": 5.348791639451339, + "grad_norm": NaN, + "learning_rate": 9.192987201916247e-06, + "loss": 0.0, + "step": 57323 + }, + { + "epoch": 5.348884949146217, + "grad_norm": NaN, + "learning_rate": 9.190380155105264e-06, + "loss": 0.0, + "step": 57324 + }, + { + "epoch": 5.348978258841093, + "grad_norm": NaN, + "learning_rate": 9.187773466331433e-06, + "loss": 0.0, + "step": 57325 + }, + { + "epoch": 5.349071568535971, + "grad_norm": NaN, + "learning_rate": 9.185167135601379e-06, + "loss": 0.0, + "step": 57326 + }, + { + "epoch": 5.349164878230848, + "grad_norm": NaN, + "learning_rate": 9.182561162921731e-06, + "loss": 0.0, + "step": 57327 + }, + { + "epoch": 5.3492581879257255, + "grad_norm": NaN, + "learning_rate": 9.179955548299116e-06, + "loss": 0.0, + "step": 57328 + }, + { + "epoch": 5.349351497620603, + "grad_norm": NaN, + "learning_rate": 9.177350291740149e-06, + "loss": 0.0, + "step": 57329 + }, + { + "epoch": 5.34944480731548, + "grad_norm": NaN, + "learning_rate": 9.174745393251487e-06, + "loss": 0.0, + "step": 57330 + }, + { + "epoch": 5.349538117010358, + "grad_norm": NaN, + "learning_rate": 9.17214085283971e-06, + "loss": 0.0, + "step": 57331 + }, + { + "epoch": 5.349631426705234, + "grad_norm": NaN, + "learning_rate": 9.169536670511479e-06, + "loss": 0.0, + "step": 57332 + }, + { + "epoch": 5.349724736400112, + "grad_norm": NaN, + "learning_rate": 9.166932846273389e-06, + "loss": 0.0, + "step": 57333 + }, + { + "epoch": 5.349818046094989, + "grad_norm": NaN, + "learning_rate": 9.164329380132085e-06, + "loss": 0.0, + "step": 57334 + }, + { + "epoch": 5.3499113557898665, + "grad_norm": NaN, + "learning_rate": 9.16172627209416e-06, + "loss": 0.0, + "step": 57335 + }, + { + "epoch": 5.350004665484744, + "grad_norm": NaN, + "learning_rate": 9.159123522166262e-06, + "loss": 0.0, + "step": 57336 + }, + { + "epoch": 5.350097975179621, + "grad_norm": NaN, + "learning_rate": 9.156521130354966e-06, + "loss": 0.0, + "step": 57337 + }, + { + "epoch": 5.350191284874499, + "grad_norm": NaN, + "learning_rate": 9.153919096666935e-06, + "loss": 0.0, + "step": 57338 + }, + { + "epoch": 5.350284594569375, + "grad_norm": NaN, + "learning_rate": 9.151317421108762e-06, + "loss": 0.0, + "step": 57339 + }, + { + "epoch": 5.350377904264253, + "grad_norm": NaN, + "learning_rate": 9.148716103687077e-06, + "loss": 0.0, + "step": 57340 + }, + { + "epoch": 5.35047121395913, + "grad_norm": NaN, + "learning_rate": 9.146115144408457e-06, + "loss": 0.0, + "step": 57341 + }, + { + "epoch": 5.350564523654008, + "grad_norm": NaN, + "learning_rate": 9.143514543279561e-06, + "loss": 0.0, + "step": 57342 + }, + { + "epoch": 5.350657833348885, + "grad_norm": NaN, + "learning_rate": 9.140914300306989e-06, + "loss": 0.0, + "step": 57343 + }, + { + "epoch": 5.350751143043762, + "grad_norm": NaN, + "learning_rate": 9.138314415497333e-06, + "loss": 0.0, + "step": 57344 + }, + { + "epoch": 5.35084445273864, + "grad_norm": NaN, + "learning_rate": 9.135714888857221e-06, + "loss": 0.0, + "step": 57345 + }, + { + "epoch": 5.350937762433517, + "grad_norm": NaN, + "learning_rate": 9.133115720393263e-06, + "loss": 0.0, + "step": 57346 + }, + { + "epoch": 5.351031072128394, + "grad_norm": NaN, + "learning_rate": 9.130516910112057e-06, + "loss": 0.0, + "step": 57347 + }, + { + "epoch": 5.351124381823271, + "grad_norm": NaN, + "learning_rate": 9.127918458020212e-06, + "loss": 0.0, + "step": 57348 + }, + { + "epoch": 5.351217691518149, + "grad_norm": NaN, + "learning_rate": 9.125320364124355e-06, + "loss": 0.0, + "step": 57349 + }, + { + "epoch": 5.351311001213026, + "grad_norm": NaN, + "learning_rate": 9.122722628431067e-06, + "loss": 0.0, + "step": 57350 + }, + { + "epoch": 5.3514043109079035, + "grad_norm": NaN, + "learning_rate": 9.120125250946974e-06, + "loss": 0.0, + "step": 57351 + }, + { + "epoch": 5.351497620602781, + "grad_norm": NaN, + "learning_rate": 9.117528231678672e-06, + "loss": 0.0, + "step": 57352 + }, + { + "epoch": 5.351590930297658, + "grad_norm": NaN, + "learning_rate": 9.114931570632755e-06, + "loss": 0.0, + "step": 57353 + }, + { + "epoch": 5.351684239992535, + "grad_norm": NaN, + "learning_rate": 9.112335267815834e-06, + "loss": 0.0, + "step": 57354 + }, + { + "epoch": 5.351777549687412, + "grad_norm": NaN, + "learning_rate": 9.109739323234522e-06, + "loss": 0.0, + "step": 57355 + }, + { + "epoch": 5.35187085938229, + "grad_norm": NaN, + "learning_rate": 9.107143736895395e-06, + "loss": 0.0, + "step": 57356 + }, + { + "epoch": 5.351964169077167, + "grad_norm": NaN, + "learning_rate": 9.104548508805065e-06, + "loss": 0.0, + "step": 57357 + }, + { + "epoch": 5.3520574787720445, + "grad_norm": NaN, + "learning_rate": 9.101953638970144e-06, + "loss": 0.0, + "step": 57358 + }, + { + "epoch": 5.352150788466922, + "grad_norm": NaN, + "learning_rate": 9.09935912739721e-06, + "loss": 0.0, + "step": 57359 + }, + { + "epoch": 5.352244098161799, + "grad_norm": NaN, + "learning_rate": 9.096764974092874e-06, + "loss": 0.0, + "step": 57360 + }, + { + "epoch": 5.352337407856677, + "grad_norm": NaN, + "learning_rate": 9.094171179063715e-06, + "loss": 0.0, + "step": 57361 + }, + { + "epoch": 5.352430717551553, + "grad_norm": NaN, + "learning_rate": 9.091577742316358e-06, + "loss": 0.0, + "step": 57362 + }, + { + "epoch": 5.352524027246431, + "grad_norm": NaN, + "learning_rate": 9.088984663857351e-06, + "loss": 0.0, + "step": 57363 + }, + { + "epoch": 5.352617336941308, + "grad_norm": NaN, + "learning_rate": 9.086391943693339e-06, + "loss": 0.0, + "step": 57364 + }, + { + "epoch": 5.352710646636186, + "grad_norm": NaN, + "learning_rate": 9.08379958183088e-06, + "loss": 0.0, + "step": 57365 + }, + { + "epoch": 5.352803956331063, + "grad_norm": NaN, + "learning_rate": 9.081207578276572e-06, + "loss": 0.0, + "step": 57366 + }, + { + "epoch": 5.35289726602594, + "grad_norm": NaN, + "learning_rate": 9.078615933037026e-06, + "loss": 0.0, + "step": 57367 + }, + { + "epoch": 5.352990575720817, + "grad_norm": NaN, + "learning_rate": 9.076024646118802e-06, + "loss": 0.0, + "step": 57368 + }, + { + "epoch": 5.353083885415694, + "grad_norm": NaN, + "learning_rate": 9.073433717528512e-06, + "loss": 0.0, + "step": 57369 + }, + { + "epoch": 5.353177195110572, + "grad_norm": NaN, + "learning_rate": 9.070843147272716e-06, + "loss": 0.0, + "step": 57370 + }, + { + "epoch": 5.353270504805449, + "grad_norm": NaN, + "learning_rate": 9.068252935358044e-06, + "loss": 0.0, + "step": 57371 + }, + { + "epoch": 5.353363814500327, + "grad_norm": NaN, + "learning_rate": 9.065663081791042e-06, + "loss": 0.0, + "step": 57372 + }, + { + "epoch": 5.353457124195204, + "grad_norm": NaN, + "learning_rate": 9.063073586578318e-06, + "loss": 0.0, + "step": 57373 + }, + { + "epoch": 5.3535504338900815, + "grad_norm": NaN, + "learning_rate": 9.060484449726452e-06, + "loss": 0.0, + "step": 57374 + }, + { + "epoch": 5.353643743584959, + "grad_norm": NaN, + "learning_rate": 9.057895671242004e-06, + "loss": 0.0, + "step": 57375 + }, + { + "epoch": 5.353737053279835, + "grad_norm": NaN, + "learning_rate": 9.055307251131605e-06, + "loss": 0.0, + "step": 57376 + }, + { + "epoch": 5.353830362974713, + "grad_norm": NaN, + "learning_rate": 9.052719189401797e-06, + "loss": 0.0, + "step": 57377 + }, + { + "epoch": 5.35392367266959, + "grad_norm": NaN, + "learning_rate": 9.050131486059159e-06, + "loss": 0.0, + "step": 57378 + }, + { + "epoch": 5.354016982364468, + "grad_norm": NaN, + "learning_rate": 9.047544141110302e-06, + "loss": 0.0, + "step": 57379 + }, + { + "epoch": 5.354110292059345, + "grad_norm": NaN, + "learning_rate": 9.044957154561788e-06, + "loss": 0.0, + "step": 57380 + }, + { + "epoch": 5.3542036017542225, + "grad_norm": NaN, + "learning_rate": 9.042370526420178e-06, + "loss": 0.0, + "step": 57381 + }, + { + "epoch": 5.3542969114491, + "grad_norm": NaN, + "learning_rate": 9.039784256692083e-06, + "loss": 0.0, + "step": 57382 + }, + { + "epoch": 5.3543902211439764, + "grad_norm": NaN, + "learning_rate": 9.03719834538405e-06, + "loss": 0.0, + "step": 57383 + }, + { + "epoch": 5.354483530838854, + "grad_norm": NaN, + "learning_rate": 9.034612792502671e-06, + "loss": 0.0, + "step": 57384 + }, + { + "epoch": 5.354576840533731, + "grad_norm": NaN, + "learning_rate": 9.032027598054509e-06, + "loss": 0.0, + "step": 57385 + }, + { + "epoch": 5.354670150228609, + "grad_norm": NaN, + "learning_rate": 9.02944276204614e-06, + "loss": 0.0, + "step": 57386 + }, + { + "epoch": 5.354763459923486, + "grad_norm": NaN, + "learning_rate": 9.026858284484145e-06, + "loss": 0.0, + "step": 57387 + }, + { + "epoch": 5.3548567696183635, + "grad_norm": NaN, + "learning_rate": 9.0242741653751e-06, + "loss": 0.0, + "step": 57388 + }, + { + "epoch": 5.354950079313241, + "grad_norm": NaN, + "learning_rate": 9.02169040472555e-06, + "loss": 0.0, + "step": 57389 + }, + { + "epoch": 5.355043389008118, + "grad_norm": NaN, + "learning_rate": 9.019107002542092e-06, + "loss": 0.0, + "step": 57390 + }, + { + "epoch": 5.355136698702995, + "grad_norm": NaN, + "learning_rate": 9.016523958831268e-06, + "loss": 0.0, + "step": 57391 + }, + { + "epoch": 5.355230008397872, + "grad_norm": NaN, + "learning_rate": 9.013941273599673e-06, + "loss": 0.0, + "step": 57392 + }, + { + "epoch": 5.35532331809275, + "grad_norm": NaN, + "learning_rate": 9.011358946853869e-06, + "loss": 0.0, + "step": 57393 + }, + { + "epoch": 5.355416627787627, + "grad_norm": NaN, + "learning_rate": 9.008776978600402e-06, + "loss": 0.0, + "step": 57394 + }, + { + "epoch": 5.355509937482505, + "grad_norm": NaN, + "learning_rate": 9.006195368845848e-06, + "loss": 0.0, + "step": 57395 + }, + { + "epoch": 5.355603247177382, + "grad_norm": NaN, + "learning_rate": 9.003614117596785e-06, + "loss": 0.0, + "step": 57396 + }, + { + "epoch": 5.355696556872259, + "grad_norm": NaN, + "learning_rate": 9.00103322485976e-06, + "loss": 0.0, + "step": 57397 + }, + { + "epoch": 5.355789866567136, + "grad_norm": NaN, + "learning_rate": 8.99845269064135e-06, + "loss": 0.0, + "step": 57398 + }, + { + "epoch": 5.355883176262013, + "grad_norm": NaN, + "learning_rate": 8.995872514948083e-06, + "loss": 0.0, + "step": 57399 + }, + { + "epoch": 5.355976485956891, + "grad_norm": NaN, + "learning_rate": 8.99329269778657e-06, + "loss": 0.0, + "step": 57400 + }, + { + "epoch": 5.356069795651768, + "grad_norm": NaN, + "learning_rate": 8.990713239163321e-06, + "loss": 0.0, + "step": 57401 + }, + { + "epoch": 5.356163105346646, + "grad_norm": NaN, + "learning_rate": 8.988134139084935e-06, + "loss": 0.0, + "step": 57402 + }, + { + "epoch": 5.356256415041523, + "grad_norm": NaN, + "learning_rate": 8.985555397557952e-06, + "loss": 0.0, + "step": 57403 + }, + { + "epoch": 5.3563497247364005, + "grad_norm": NaN, + "learning_rate": 8.982977014588922e-06, + "loss": 0.0, + "step": 57404 + }, + { + "epoch": 5.356443034431277, + "grad_norm": NaN, + "learning_rate": 8.980398990184418e-06, + "loss": 0.0, + "step": 57405 + }, + { + "epoch": 5.356536344126154, + "grad_norm": NaN, + "learning_rate": 8.977821324350987e-06, + "loss": 0.0, + "step": 57406 + }, + { + "epoch": 5.356629653821032, + "grad_norm": NaN, + "learning_rate": 8.975244017095173e-06, + "loss": 0.0, + "step": 57407 + }, + { + "epoch": 5.356722963515909, + "grad_norm": NaN, + "learning_rate": 8.97266706842354e-06, + "loss": 0.0, + "step": 57408 + }, + { + "epoch": 5.356816273210787, + "grad_norm": NaN, + "learning_rate": 8.970090478342645e-06, + "loss": 0.0, + "step": 57409 + }, + { + "epoch": 5.356909582905664, + "grad_norm": NaN, + "learning_rate": 8.967514246859037e-06, + "loss": 0.0, + "step": 57410 + }, + { + "epoch": 5.3570028926005415, + "grad_norm": NaN, + "learning_rate": 8.964938373979258e-06, + "loss": 0.0, + "step": 57411 + }, + { + "epoch": 5.357096202295418, + "grad_norm": NaN, + "learning_rate": 8.962362859709854e-06, + "loss": 0.0, + "step": 57412 + }, + { + "epoch": 5.3571895119902955, + "grad_norm": NaN, + "learning_rate": 8.959787704057402e-06, + "loss": 0.0, + "step": 57413 + }, + { + "epoch": 5.357282821685173, + "grad_norm": NaN, + "learning_rate": 8.957212907028415e-06, + "loss": 0.0, + "step": 57414 + }, + { + "epoch": 5.35737613138005, + "grad_norm": NaN, + "learning_rate": 8.954638468629455e-06, + "loss": 0.0, + "step": 57415 + }, + { + "epoch": 5.357469441074928, + "grad_norm": NaN, + "learning_rate": 8.95206438886708e-06, + "loss": 0.0, + "step": 57416 + }, + { + "epoch": 5.357562750769805, + "grad_norm": NaN, + "learning_rate": 8.949490667747821e-06, + "loss": 0.0, + "step": 57417 + }, + { + "epoch": 5.357656060464683, + "grad_norm": NaN, + "learning_rate": 8.946917305278223e-06, + "loss": 0.0, + "step": 57418 + }, + { + "epoch": 5.35774937015956, + "grad_norm": NaN, + "learning_rate": 8.944344301464829e-06, + "loss": 0.0, + "step": 57419 + }, + { + "epoch": 5.3578426798544365, + "grad_norm": NaN, + "learning_rate": 8.941771656314184e-06, + "loss": 0.0, + "step": 57420 + }, + { + "epoch": 5.357935989549314, + "grad_norm": NaN, + "learning_rate": 8.939199369832833e-06, + "loss": 0.0, + "step": 57421 + }, + { + "epoch": 5.358029299244191, + "grad_norm": NaN, + "learning_rate": 8.936627442027305e-06, + "loss": 0.0, + "step": 57422 + }, + { + "epoch": 5.358122608939069, + "grad_norm": NaN, + "learning_rate": 8.93405587290416e-06, + "loss": 0.0, + "step": 57423 + }, + { + "epoch": 5.358215918633946, + "grad_norm": NaN, + "learning_rate": 8.931484662469911e-06, + "loss": 0.0, + "step": 57424 + }, + { + "epoch": 5.358309228328824, + "grad_norm": NaN, + "learning_rate": 8.92891381073112e-06, + "loss": 0.0, + "step": 57425 + }, + { + "epoch": 5.358402538023701, + "grad_norm": NaN, + "learning_rate": 8.926343317694312e-06, + "loss": 0.0, + "step": 57426 + }, + { + "epoch": 5.358495847718578, + "grad_norm": NaN, + "learning_rate": 8.923773183366018e-06, + "loss": 0.0, + "step": 57427 + }, + { + "epoch": 5.358589157413455, + "grad_norm": NaN, + "learning_rate": 8.92120340775278e-06, + "loss": 0.0, + "step": 57428 + }, + { + "epoch": 5.358682467108332, + "grad_norm": NaN, + "learning_rate": 8.918633990861129e-06, + "loss": 0.0, + "step": 57429 + }, + { + "epoch": 5.35877577680321, + "grad_norm": NaN, + "learning_rate": 8.916064932697608e-06, + "loss": 0.0, + "step": 57430 + }, + { + "epoch": 5.358869086498087, + "grad_norm": NaN, + "learning_rate": 8.91349623326873e-06, + "loss": 0.0, + "step": 57431 + }, + { + "epoch": 5.358962396192965, + "grad_norm": NaN, + "learning_rate": 8.910927892581038e-06, + "loss": 0.0, + "step": 57432 + }, + { + "epoch": 5.359055705887842, + "grad_norm": NaN, + "learning_rate": 8.908359910641077e-06, + "loss": 0.0, + "step": 57433 + }, + { + "epoch": 5.3591490155827195, + "grad_norm": NaN, + "learning_rate": 8.90579228745536e-06, + "loss": 0.0, + "step": 57434 + }, + { + "epoch": 5.359242325277596, + "grad_norm": NaN, + "learning_rate": 8.903225023030414e-06, + "loss": 0.0, + "step": 57435 + }, + { + "epoch": 5.3593356349724734, + "grad_norm": NaN, + "learning_rate": 8.900658117372767e-06, + "loss": 0.0, + "step": 57436 + }, + { + "epoch": 5.359428944667351, + "grad_norm": NaN, + "learning_rate": 8.898091570488947e-06, + "loss": 0.0, + "step": 57437 + }, + { + "epoch": 5.359522254362228, + "grad_norm": NaN, + "learning_rate": 8.895525382385482e-06, + "loss": 0.0, + "step": 57438 + }, + { + "epoch": 5.359615564057106, + "grad_norm": NaN, + "learning_rate": 8.892959553068901e-06, + "loss": 0.0, + "step": 57439 + }, + { + "epoch": 5.359708873751983, + "grad_norm": NaN, + "learning_rate": 8.890394082545731e-06, + "loss": 0.0, + "step": 57440 + }, + { + "epoch": 5.35980218344686, + "grad_norm": NaN, + "learning_rate": 8.887828970822486e-06, + "loss": 0.0, + "step": 57441 + }, + { + "epoch": 5.359895493141737, + "grad_norm": NaN, + "learning_rate": 8.885264217905691e-06, + "loss": 0.0, + "step": 57442 + }, + { + "epoch": 5.3599888028366145, + "grad_norm": NaN, + "learning_rate": 8.882699823801875e-06, + "loss": 0.0, + "step": 57443 + }, + { + "epoch": 5.360082112531492, + "grad_norm": NaN, + "learning_rate": 8.880135788517533e-06, + "loss": 0.0, + "step": 57444 + }, + { + "epoch": 5.360175422226369, + "grad_norm": NaN, + "learning_rate": 8.877572112059228e-06, + "loss": 0.0, + "step": 57445 + }, + { + "epoch": 5.360268731921247, + "grad_norm": NaN, + "learning_rate": 8.875008794433435e-06, + "loss": 0.0, + "step": 57446 + }, + { + "epoch": 5.360362041616124, + "grad_norm": NaN, + "learning_rate": 8.872445835646701e-06, + "loss": 0.0, + "step": 57447 + }, + { + "epoch": 5.360455351311002, + "grad_norm": NaN, + "learning_rate": 8.869883235705522e-06, + "loss": 0.0, + "step": 57448 + }, + { + "epoch": 5.360548661005878, + "grad_norm": NaN, + "learning_rate": 8.867320994616423e-06, + "loss": 0.0, + "step": 57449 + }, + { + "epoch": 5.3606419707007555, + "grad_norm": NaN, + "learning_rate": 8.864759112385934e-06, + "loss": 0.0, + "step": 57450 + }, + { + "epoch": 5.360735280395633, + "grad_norm": NaN, + "learning_rate": 8.862197589020548e-06, + "loss": 0.0, + "step": 57451 + }, + { + "epoch": 5.36082859009051, + "grad_norm": NaN, + "learning_rate": 8.859636424526778e-06, + "loss": 0.0, + "step": 57452 + }, + { + "epoch": 5.360921899785388, + "grad_norm": NaN, + "learning_rate": 8.857075618911152e-06, + "loss": 0.0, + "step": 57453 + }, + { + "epoch": 5.361015209480265, + "grad_norm": NaN, + "learning_rate": 8.854515172180166e-06, + "loss": 0.0, + "step": 57454 + }, + { + "epoch": 5.361108519175143, + "grad_norm": NaN, + "learning_rate": 8.851955084340346e-06, + "loss": 0.0, + "step": 57455 + }, + { + "epoch": 5.361201828870019, + "grad_norm": NaN, + "learning_rate": 8.849395355398187e-06, + "loss": 0.0, + "step": 57456 + }, + { + "epoch": 5.361295138564897, + "grad_norm": NaN, + "learning_rate": 8.846835985360201e-06, + "loss": 0.0, + "step": 57457 + }, + { + "epoch": 5.361388448259774, + "grad_norm": NaN, + "learning_rate": 8.8442769742329e-06, + "loss": 0.0, + "step": 57458 + }, + { + "epoch": 5.361481757954651, + "grad_norm": NaN, + "learning_rate": 8.841718322022795e-06, + "loss": 0.0, + "step": 57459 + }, + { + "epoch": 5.361575067649529, + "grad_norm": NaN, + "learning_rate": 8.839160028736398e-06, + "loss": 0.0, + "step": 57460 + }, + { + "epoch": 5.361668377344406, + "grad_norm": NaN, + "learning_rate": 8.836602094380185e-06, + "loss": 0.0, + "step": 57461 + }, + { + "epoch": 5.361761687039284, + "grad_norm": NaN, + "learning_rate": 8.83404451896067e-06, + "loss": 0.0, + "step": 57462 + }, + { + "epoch": 5.361854996734161, + "grad_norm": NaN, + "learning_rate": 8.83148730248438e-06, + "loss": 0.0, + "step": 57463 + }, + { + "epoch": 5.361948306429038, + "grad_norm": NaN, + "learning_rate": 8.828930444957793e-06, + "loss": 0.0, + "step": 57464 + }, + { + "epoch": 5.362041616123915, + "grad_norm": NaN, + "learning_rate": 8.826373946387422e-06, + "loss": 0.0, + "step": 57465 + }, + { + "epoch": 5.3621349258187925, + "grad_norm": NaN, + "learning_rate": 8.82381780677976e-06, + "loss": 0.0, + "step": 57466 + }, + { + "epoch": 5.36222823551367, + "grad_norm": NaN, + "learning_rate": 8.821262026141301e-06, + "loss": 0.0, + "step": 57467 + }, + { + "epoch": 5.362321545208547, + "grad_norm": NaN, + "learning_rate": 8.818706604478575e-06, + "loss": 0.0, + "step": 57468 + }, + { + "epoch": 5.362414854903425, + "grad_norm": NaN, + "learning_rate": 8.816151541798045e-06, + "loss": 0.0, + "step": 57469 + }, + { + "epoch": 5.362508164598302, + "grad_norm": NaN, + "learning_rate": 8.813596838106218e-06, + "loss": 0.0, + "step": 57470 + }, + { + "epoch": 5.362601474293179, + "grad_norm": NaN, + "learning_rate": 8.811042493409593e-06, + "loss": 0.0, + "step": 57471 + }, + { + "epoch": 5.362694783988056, + "grad_norm": NaN, + "learning_rate": 8.808488507714662e-06, + "loss": 0.0, + "step": 57472 + }, + { + "epoch": 5.3627880936829335, + "grad_norm": NaN, + "learning_rate": 8.805934881027921e-06, + "loss": 0.0, + "step": 57473 + }, + { + "epoch": 5.362881403377811, + "grad_norm": NaN, + "learning_rate": 8.803381613355863e-06, + "loss": 0.0, + "step": 57474 + }, + { + "epoch": 5.362974713072688, + "grad_norm": NaN, + "learning_rate": 8.800828704704972e-06, + "loss": 0.0, + "step": 57475 + }, + { + "epoch": 5.363068022767566, + "grad_norm": NaN, + "learning_rate": 8.798276155081768e-06, + "loss": 0.0, + "step": 57476 + }, + { + "epoch": 5.363161332462443, + "grad_norm": NaN, + "learning_rate": 8.795723964492701e-06, + "loss": 0.0, + "step": 57477 + }, + { + "epoch": 5.36325464215732, + "grad_norm": NaN, + "learning_rate": 8.793172132944282e-06, + "loss": 0.0, + "step": 57478 + }, + { + "epoch": 5.363347951852197, + "grad_norm": NaN, + "learning_rate": 8.790620660443003e-06, + "loss": 0.0, + "step": 57479 + }, + { + "epoch": 5.363441261547075, + "grad_norm": NaN, + "learning_rate": 8.788069546995346e-06, + "loss": 0.0, + "step": 57480 + }, + { + "epoch": 5.363534571241952, + "grad_norm": NaN, + "learning_rate": 8.785518792607804e-06, + "loss": 0.0, + "step": 57481 + }, + { + "epoch": 5.363627880936829, + "grad_norm": NaN, + "learning_rate": 8.782968397286855e-06, + "loss": 0.0, + "step": 57482 + }, + { + "epoch": 5.363721190631707, + "grad_norm": NaN, + "learning_rate": 8.780418361038977e-06, + "loss": 0.0, + "step": 57483 + }, + { + "epoch": 5.363814500326584, + "grad_norm": NaN, + "learning_rate": 8.777868683870682e-06, + "loss": 0.0, + "step": 57484 + }, + { + "epoch": 5.363907810021461, + "grad_norm": NaN, + "learning_rate": 8.775319365788414e-06, + "loss": 0.0, + "step": 57485 + }, + { + "epoch": 5.364001119716338, + "grad_norm": NaN, + "learning_rate": 8.772770406798684e-06, + "loss": 0.0, + "step": 57486 + }, + { + "epoch": 5.364094429411216, + "grad_norm": NaN, + "learning_rate": 8.770221806907957e-06, + "loss": 0.0, + "step": 57487 + }, + { + "epoch": 5.364187739106093, + "grad_norm": NaN, + "learning_rate": 8.767673566122724e-06, + "loss": 0.0, + "step": 57488 + }, + { + "epoch": 5.3642810488009705, + "grad_norm": NaN, + "learning_rate": 8.765125684449464e-06, + "loss": 0.0, + "step": 57489 + }, + { + "epoch": 5.364374358495848, + "grad_norm": NaN, + "learning_rate": 8.762578161894657e-06, + "loss": 0.0, + "step": 57490 + }, + { + "epoch": 5.364467668190725, + "grad_norm": NaN, + "learning_rate": 8.760030998464762e-06, + "loss": 0.0, + "step": 57491 + }, + { + "epoch": 5.364560977885603, + "grad_norm": NaN, + "learning_rate": 8.757484194166276e-06, + "loss": 0.0, + "step": 57492 + }, + { + "epoch": 5.364654287580479, + "grad_norm": NaN, + "learning_rate": 8.754937749005675e-06, + "loss": 0.0, + "step": 57493 + }, + { + "epoch": 5.364747597275357, + "grad_norm": NaN, + "learning_rate": 8.752391662989422e-06, + "loss": 0.0, + "step": 57494 + }, + { + "epoch": 5.364840906970234, + "grad_norm": NaN, + "learning_rate": 8.749845936123995e-06, + "loss": 0.0, + "step": 57495 + }, + { + "epoch": 5.3649342166651115, + "grad_norm": NaN, + "learning_rate": 8.74730056841587e-06, + "loss": 0.0, + "step": 57496 + }, + { + "epoch": 5.365027526359989, + "grad_norm": NaN, + "learning_rate": 8.744755559871514e-06, + "loss": 0.0, + "step": 57497 + }, + { + "epoch": 5.365120836054866, + "grad_norm": NaN, + "learning_rate": 8.742210910497382e-06, + "loss": 0.0, + "step": 57498 + }, + { + "epoch": 5.365214145749744, + "grad_norm": NaN, + "learning_rate": 8.739666620300022e-06, + "loss": 0.0, + "step": 57499 + }, + { + "epoch": 5.36530745544462, + "grad_norm": NaN, + "learning_rate": 8.737122689285814e-06, + "loss": 0.0, + "step": 57500 + }, + { + "epoch": 5.365400765139498, + "grad_norm": NaN, + "learning_rate": 8.734579117461232e-06, + "loss": 0.0, + "step": 57501 + }, + { + "epoch": 5.365494074834375, + "grad_norm": NaN, + "learning_rate": 8.732035904832806e-06, + "loss": 0.0, + "step": 57502 + }, + { + "epoch": 5.3655873845292525, + "grad_norm": NaN, + "learning_rate": 8.729493051406949e-06, + "loss": 0.0, + "step": 57503 + }, + { + "epoch": 5.36568069422413, + "grad_norm": NaN, + "learning_rate": 8.726950557190154e-06, + "loss": 0.0, + "step": 57504 + }, + { + "epoch": 5.365774003919007, + "grad_norm": NaN, + "learning_rate": 8.724408422188867e-06, + "loss": 0.0, + "step": 57505 + }, + { + "epoch": 5.365867313613885, + "grad_norm": NaN, + "learning_rate": 8.721866646409564e-06, + "loss": 0.0, + "step": 57506 + }, + { + "epoch": 5.365960623308762, + "grad_norm": NaN, + "learning_rate": 8.719325229858709e-06, + "loss": 0.0, + "step": 57507 + }, + { + "epoch": 5.366053933003639, + "grad_norm": NaN, + "learning_rate": 8.716784172542746e-06, + "loss": 0.0, + "step": 57508 + }, + { + "epoch": 5.366147242698516, + "grad_norm": NaN, + "learning_rate": 8.714243474468153e-06, + "loss": 0.0, + "step": 57509 + }, + { + "epoch": 5.366240552393394, + "grad_norm": NaN, + "learning_rate": 8.71170313564139e-06, + "loss": 0.0, + "step": 57510 + }, + { + "epoch": 5.366333862088271, + "grad_norm": NaN, + "learning_rate": 8.70916315606892e-06, + "loss": 0.0, + "step": 57511 + }, + { + "epoch": 5.366427171783148, + "grad_norm": NaN, + "learning_rate": 8.70662353575719e-06, + "loss": 0.0, + "step": 57512 + }, + { + "epoch": 5.366520481478026, + "grad_norm": NaN, + "learning_rate": 8.704084274712658e-06, + "loss": 0.0, + "step": 57513 + }, + { + "epoch": 5.366613791172903, + "grad_norm": NaN, + "learning_rate": 8.701545372941787e-06, + "loss": 0.0, + "step": 57514 + }, + { + "epoch": 5.36670710086778, + "grad_norm": NaN, + "learning_rate": 8.699006830451038e-06, + "loss": 0.0, + "step": 57515 + }, + { + "epoch": 5.366800410562657, + "grad_norm": NaN, + "learning_rate": 8.69646864724684e-06, + "loss": 0.0, + "step": 57516 + }, + { + "epoch": 5.366893720257535, + "grad_norm": NaN, + "learning_rate": 8.693930823335688e-06, + "loss": 0.0, + "step": 57517 + }, + { + "epoch": 5.366987029952412, + "grad_norm": NaN, + "learning_rate": 8.691393358723991e-06, + "loss": 0.0, + "step": 57518 + }, + { + "epoch": 5.3670803396472895, + "grad_norm": NaN, + "learning_rate": 8.68885625341823e-06, + "loss": 0.0, + "step": 57519 + }, + { + "epoch": 5.367173649342167, + "grad_norm": NaN, + "learning_rate": 8.686319507424849e-06, + "loss": 0.0, + "step": 57520 + }, + { + "epoch": 5.367266959037044, + "grad_norm": NaN, + "learning_rate": 8.68378312075031e-06, + "loss": 0.0, + "step": 57521 + }, + { + "epoch": 5.367360268731921, + "grad_norm": NaN, + "learning_rate": 8.681247093401023e-06, + "loss": 0.0, + "step": 57522 + }, + { + "epoch": 5.367453578426798, + "grad_norm": NaN, + "learning_rate": 8.678711425383484e-06, + "loss": 0.0, + "step": 57523 + }, + { + "epoch": 5.367546888121676, + "grad_norm": NaN, + "learning_rate": 8.67617611670412e-06, + "loss": 0.0, + "step": 57524 + }, + { + "epoch": 5.367640197816553, + "grad_norm": NaN, + "learning_rate": 8.67364116736936e-06, + "loss": 0.0, + "step": 57525 + }, + { + "epoch": 5.3677335075114305, + "grad_norm": NaN, + "learning_rate": 8.671106577385684e-06, + "loss": 0.0, + "step": 57526 + }, + { + "epoch": 5.367826817206308, + "grad_norm": NaN, + "learning_rate": 8.6685723467595e-06, + "loss": 0.0, + "step": 57527 + }, + { + "epoch": 5.367920126901185, + "grad_norm": NaN, + "learning_rate": 8.666038475497288e-06, + "loss": 0.0, + "step": 57528 + }, + { + "epoch": 5.368013436596062, + "grad_norm": NaN, + "learning_rate": 8.66350496360546e-06, + "loss": 0.0, + "step": 57529 + }, + { + "epoch": 5.368106746290939, + "grad_norm": NaN, + "learning_rate": 8.660971811090495e-06, + "loss": 0.0, + "step": 57530 + }, + { + "epoch": 5.368200055985817, + "grad_norm": NaN, + "learning_rate": 8.658439017958784e-06, + "loss": 0.0, + "step": 57531 + }, + { + "epoch": 5.368293365680694, + "grad_norm": NaN, + "learning_rate": 8.655906584216809e-06, + "loss": 0.0, + "step": 57532 + }, + { + "epoch": 5.368386675375572, + "grad_norm": NaN, + "learning_rate": 8.653374509870998e-06, + "loss": 0.0, + "step": 57533 + }, + { + "epoch": 5.368479985070449, + "grad_norm": NaN, + "learning_rate": 8.650842794927777e-06, + "loss": 0.0, + "step": 57534 + }, + { + "epoch": 5.368573294765326, + "grad_norm": NaN, + "learning_rate": 8.648311439393591e-06, + "loss": 0.0, + "step": 57535 + }, + { + "epoch": 5.368666604460204, + "grad_norm": NaN, + "learning_rate": 8.645780443274852e-06, + "loss": 0.0, + "step": 57536 + }, + { + "epoch": 5.36875991415508, + "grad_norm": NaN, + "learning_rate": 8.643249806578074e-06, + "loss": 0.0, + "step": 57537 + }, + { + "epoch": 5.368853223849958, + "grad_norm": NaN, + "learning_rate": 8.640719529309614e-06, + "loss": 0.0, + "step": 57538 + }, + { + "epoch": 5.368946533544835, + "grad_norm": NaN, + "learning_rate": 8.63818961147592e-06, + "loss": 0.0, + "step": 57539 + }, + { + "epoch": 5.369039843239713, + "grad_norm": NaN, + "learning_rate": 8.63566005308347e-06, + "loss": 0.0, + "step": 57540 + }, + { + "epoch": 5.36913315293459, + "grad_norm": NaN, + "learning_rate": 8.633130854138626e-06, + "loss": 0.0, + "step": 57541 + }, + { + "epoch": 5.3692264626294675, + "grad_norm": NaN, + "learning_rate": 8.630602014647848e-06, + "loss": 0.0, + "step": 57542 + }, + { + "epoch": 5.369319772324345, + "grad_norm": NaN, + "learning_rate": 8.628073534617614e-06, + "loss": 0.0, + "step": 57543 + }, + { + "epoch": 5.369413082019221, + "grad_norm": NaN, + "learning_rate": 8.625545414054286e-06, + "loss": 0.0, + "step": 57544 + }, + { + "epoch": 5.369506391714099, + "grad_norm": NaN, + "learning_rate": 8.62301765296431e-06, + "loss": 0.0, + "step": 57545 + }, + { + "epoch": 5.369599701408976, + "grad_norm": NaN, + "learning_rate": 8.620490251354145e-06, + "loss": 0.0, + "step": 57546 + }, + { + "epoch": 5.369693011103854, + "grad_norm": NaN, + "learning_rate": 8.617963209230172e-06, + "loss": 0.0, + "step": 57547 + }, + { + "epoch": 5.369786320798731, + "grad_norm": NaN, + "learning_rate": 8.615436526598851e-06, + "loss": 0.0, + "step": 57548 + }, + { + "epoch": 5.3698796304936085, + "grad_norm": NaN, + "learning_rate": 8.612910203466595e-06, + "loss": 0.0, + "step": 57549 + }, + { + "epoch": 5.369972940188486, + "grad_norm": NaN, + "learning_rate": 8.610384239839813e-06, + "loss": 0.0, + "step": 57550 + }, + { + "epoch": 5.370066249883363, + "grad_norm": NaN, + "learning_rate": 8.607858635724952e-06, + "loss": 0.0, + "step": 57551 + }, + { + "epoch": 5.37015955957824, + "grad_norm": NaN, + "learning_rate": 8.605333391128421e-06, + "loss": 0.0, + "step": 57552 + }, + { + "epoch": 5.370252869273117, + "grad_norm": NaN, + "learning_rate": 8.602808506056636e-06, + "loss": 0.0, + "step": 57553 + }, + { + "epoch": 5.370346178967995, + "grad_norm": NaN, + "learning_rate": 8.600283980516037e-06, + "loss": 0.0, + "step": 57554 + }, + { + "epoch": 5.370439488662872, + "grad_norm": NaN, + "learning_rate": 8.597759814513022e-06, + "loss": 0.0, + "step": 57555 + }, + { + "epoch": 5.3705327983577495, + "grad_norm": NaN, + "learning_rate": 8.595236008054018e-06, + "loss": 0.0, + "step": 57556 + }, + { + "epoch": 5.370626108052627, + "grad_norm": NaN, + "learning_rate": 8.592712561145437e-06, + "loss": 0.0, + "step": 57557 + }, + { + "epoch": 5.3707194177475035, + "grad_norm": NaN, + "learning_rate": 8.590189473793708e-06, + "loss": 0.0, + "step": 57558 + }, + { + "epoch": 5.370812727442381, + "grad_norm": NaN, + "learning_rate": 8.587666746005223e-06, + "loss": 0.0, + "step": 57559 + }, + { + "epoch": 5.370906037137258, + "grad_norm": NaN, + "learning_rate": 8.585144377786429e-06, + "loss": 0.0, + "step": 57560 + }, + { + "epoch": 5.370999346832136, + "grad_norm": NaN, + "learning_rate": 8.582622369143721e-06, + "loss": 0.0, + "step": 57561 + }, + { + "epoch": 5.371092656527013, + "grad_norm": NaN, + "learning_rate": 8.580100720083511e-06, + "loss": 0.0, + "step": 57562 + }, + { + "epoch": 5.371185966221891, + "grad_norm": NaN, + "learning_rate": 8.577579430612207e-06, + "loss": 0.0, + "step": 57563 + }, + { + "epoch": 5.371279275916768, + "grad_norm": NaN, + "learning_rate": 8.575058500736226e-06, + "loss": 0.0, + "step": 57564 + }, + { + "epoch": 5.371372585611645, + "grad_norm": NaN, + "learning_rate": 8.572537930461992e-06, + "loss": 0.0, + "step": 57565 + }, + { + "epoch": 5.371465895306522, + "grad_norm": NaN, + "learning_rate": 8.570017719795885e-06, + "loss": 0.0, + "step": 57566 + }, + { + "epoch": 5.371559205001399, + "grad_norm": NaN, + "learning_rate": 8.567497868744334e-06, + "loss": 0.0, + "step": 57567 + }, + { + "epoch": 5.371652514696277, + "grad_norm": NaN, + "learning_rate": 8.564978377313747e-06, + "loss": 0.0, + "step": 57568 + }, + { + "epoch": 5.371745824391154, + "grad_norm": NaN, + "learning_rate": 8.562459245510522e-06, + "loss": 0.0, + "step": 57569 + }, + { + "epoch": 5.371839134086032, + "grad_norm": NaN, + "learning_rate": 8.559940473341054e-06, + "loss": 0.0, + "step": 57570 + }, + { + "epoch": 5.371932443780909, + "grad_norm": NaN, + "learning_rate": 8.557422060811769e-06, + "loss": 0.0, + "step": 57571 + }, + { + "epoch": 5.3720257534757865, + "grad_norm": NaN, + "learning_rate": 8.554904007929065e-06, + "loss": 0.0, + "step": 57572 + }, + { + "epoch": 5.372119063170663, + "grad_norm": NaN, + "learning_rate": 8.552386314699333e-06, + "loss": 0.0, + "step": 57573 + }, + { + "epoch": 5.37221237286554, + "grad_norm": NaN, + "learning_rate": 8.549868981128971e-06, + "loss": 0.0, + "step": 57574 + }, + { + "epoch": 5.372305682560418, + "grad_norm": NaN, + "learning_rate": 8.547352007224423e-06, + "loss": 0.0, + "step": 57575 + }, + { + "epoch": 5.372398992255295, + "grad_norm": NaN, + "learning_rate": 8.54483539299205e-06, + "loss": 0.0, + "step": 57576 + }, + { + "epoch": 5.372492301950173, + "grad_norm": NaN, + "learning_rate": 8.542319138438214e-06, + "loss": 0.0, + "step": 57577 + }, + { + "epoch": 5.37258561164505, + "grad_norm": NaN, + "learning_rate": 8.539803243569427e-06, + "loss": 0.0, + "step": 57578 + }, + { + "epoch": 5.3726789213399275, + "grad_norm": NaN, + "learning_rate": 8.537287708391982e-06, + "loss": 0.0, + "step": 57579 + }, + { + "epoch": 5.372772231034805, + "grad_norm": NaN, + "learning_rate": 8.534772532912293e-06, + "loss": 0.0, + "step": 57580 + }, + { + "epoch": 5.3728655407296815, + "grad_norm": NaN, + "learning_rate": 8.532257717136803e-06, + "loss": 0.0, + "step": 57581 + }, + { + "epoch": 5.372958850424559, + "grad_norm": NaN, + "learning_rate": 8.529743261071858e-06, + "loss": 0.0, + "step": 57582 + }, + { + "epoch": 5.373052160119436, + "grad_norm": NaN, + "learning_rate": 8.527229164723854e-06, + "loss": 0.0, + "step": 57583 + }, + { + "epoch": 5.373145469814314, + "grad_norm": NaN, + "learning_rate": 8.52471542809925e-06, + "loss": 0.0, + "step": 57584 + }, + { + "epoch": 5.373238779509191, + "grad_norm": NaN, + "learning_rate": 8.522202051204358e-06, + "loss": 0.0, + "step": 57585 + }, + { + "epoch": 5.373332089204069, + "grad_norm": NaN, + "learning_rate": 8.519689034045574e-06, + "loss": 0.0, + "step": 57586 + }, + { + "epoch": 5.373425398898946, + "grad_norm": NaN, + "learning_rate": 8.517176376629342e-06, + "loss": 0.0, + "step": 57587 + }, + { + "epoch": 5.3735187085938225, + "grad_norm": NaN, + "learning_rate": 8.514664078962024e-06, + "loss": 0.0, + "step": 57588 + }, + { + "epoch": 5.3736120182887, + "grad_norm": NaN, + "learning_rate": 8.512152141049966e-06, + "loss": 0.0, + "step": 57589 + }, + { + "epoch": 5.373705327983577, + "grad_norm": NaN, + "learning_rate": 8.509640562899628e-06, + "loss": 0.0, + "step": 57590 + }, + { + "epoch": 5.373798637678455, + "grad_norm": NaN, + "learning_rate": 8.507129344517354e-06, + "loss": 0.0, + "step": 57591 + }, + { + "epoch": 5.373891947373332, + "grad_norm": NaN, + "learning_rate": 8.504618485909543e-06, + "loss": 0.0, + "step": 57592 + }, + { + "epoch": 5.37398525706821, + "grad_norm": NaN, + "learning_rate": 8.502107987082568e-06, + "loss": 0.0, + "step": 57593 + }, + { + "epoch": 5.374078566763087, + "grad_norm": NaN, + "learning_rate": 8.499597848042811e-06, + "loss": 0.0, + "step": 57594 + }, + { + "epoch": 5.374171876457964, + "grad_norm": NaN, + "learning_rate": 8.497088068796664e-06, + "loss": 0.0, + "step": 57595 + }, + { + "epoch": 5.374265186152841, + "grad_norm": NaN, + "learning_rate": 8.494578649350509e-06, + "loss": 0.0, + "step": 57596 + }, + { + "epoch": 5.374358495847718, + "grad_norm": NaN, + "learning_rate": 8.492069589710721e-06, + "loss": 0.0, + "step": 57597 + }, + { + "epoch": 5.374451805542596, + "grad_norm": NaN, + "learning_rate": 8.489560889883678e-06, + "loss": 0.0, + "step": 57598 + }, + { + "epoch": 5.374545115237473, + "grad_norm": NaN, + "learning_rate": 8.487052549875762e-06, + "loss": 0.0, + "step": 57599 + }, + { + "epoch": 5.374638424932351, + "grad_norm": NaN, + "learning_rate": 8.484544569693363e-06, + "loss": 0.0, + "step": 57600 + }, + { + "epoch": 5.374731734627228, + "grad_norm": NaN, + "learning_rate": 8.482036949342846e-06, + "loss": 0.0, + "step": 57601 + }, + { + "epoch": 5.374825044322105, + "grad_norm": NaN, + "learning_rate": 8.479529688830573e-06, + "loss": 0.0, + "step": 57602 + }, + { + "epoch": 5.374918354016982, + "grad_norm": NaN, + "learning_rate": 8.477022788162952e-06, + "loss": 0.0, + "step": 57603 + }, + { + "epoch": 5.3750116637118595, + "grad_norm": NaN, + "learning_rate": 8.47451624734633e-06, + "loss": 0.0, + "step": 57604 + }, + { + "epoch": 5.375104973406737, + "grad_norm": NaN, + "learning_rate": 8.472010066387087e-06, + "loss": 0.0, + "step": 57605 + }, + { + "epoch": 5.375198283101614, + "grad_norm": NaN, + "learning_rate": 8.469504245291614e-06, + "loss": 0.0, + "step": 57606 + }, + { + "epoch": 5.375291592796492, + "grad_norm": NaN, + "learning_rate": 8.46699878406626e-06, + "loss": 0.0, + "step": 57607 + }, + { + "epoch": 5.375384902491369, + "grad_norm": NaN, + "learning_rate": 8.4644936827174e-06, + "loss": 0.0, + "step": 57608 + }, + { + "epoch": 5.3754782121862466, + "grad_norm": NaN, + "learning_rate": 8.461988941251414e-06, + "loss": 0.0, + "step": 57609 + }, + { + "epoch": 5.375571521881123, + "grad_norm": NaN, + "learning_rate": 8.459484559674645e-06, + "loss": 0.0, + "step": 57610 + }, + { + "epoch": 5.3756648315760005, + "grad_norm": NaN, + "learning_rate": 8.45698053799349e-06, + "loss": 0.0, + "step": 57611 + }, + { + "epoch": 5.375758141270878, + "grad_norm": NaN, + "learning_rate": 8.454476876214294e-06, + "loss": 0.0, + "step": 57612 + }, + { + "epoch": 5.375851450965755, + "grad_norm": NaN, + "learning_rate": 8.451973574343467e-06, + "loss": 0.0, + "step": 57613 + }, + { + "epoch": 5.375944760660633, + "grad_norm": NaN, + "learning_rate": 8.449470632387323e-06, + "loss": 0.0, + "step": 57614 + }, + { + "epoch": 5.37603807035551, + "grad_norm": NaN, + "learning_rate": 8.44696805035222e-06, + "loss": 0.0, + "step": 57615 + }, + { + "epoch": 5.376131380050388, + "grad_norm": NaN, + "learning_rate": 8.44446582824459e-06, + "loss": 0.0, + "step": 57616 + }, + { + "epoch": 5.376224689745264, + "grad_norm": NaN, + "learning_rate": 8.441963966070741e-06, + "loss": 0.0, + "step": 57617 + }, + { + "epoch": 5.3763179994401415, + "grad_norm": NaN, + "learning_rate": 8.439462463837004e-06, + "loss": 0.0, + "step": 57618 + }, + { + "epoch": 5.376411309135019, + "grad_norm": NaN, + "learning_rate": 8.43696132154984e-06, + "loss": 0.0, + "step": 57619 + }, + { + "epoch": 5.376504618829896, + "grad_norm": NaN, + "learning_rate": 8.434460539215525e-06, + "loss": 0.0, + "step": 57620 + }, + { + "epoch": 5.376597928524774, + "grad_norm": NaN, + "learning_rate": 8.431960116840408e-06, + "loss": 0.0, + "step": 57621 + }, + { + "epoch": 5.376691238219651, + "grad_norm": NaN, + "learning_rate": 8.42946005443093e-06, + "loss": 0.0, + "step": 57622 + }, + { + "epoch": 5.376784547914529, + "grad_norm": NaN, + "learning_rate": 8.426960351993389e-06, + "loss": 0.0, + "step": 57623 + }, + { + "epoch": 5.376877857609406, + "grad_norm": NaN, + "learning_rate": 8.424461009534112e-06, + "loss": 0.0, + "step": 57624 + }, + { + "epoch": 5.376971167304283, + "grad_norm": NaN, + "learning_rate": 8.42196202705956e-06, + "loss": 0.0, + "step": 57625 + }, + { + "epoch": 5.37706447699916, + "grad_norm": NaN, + "learning_rate": 8.419463404575977e-06, + "loss": 0.0, + "step": 57626 + }, + { + "epoch": 5.377157786694037, + "grad_norm": NaN, + "learning_rate": 8.416965142089743e-06, + "loss": 0.0, + "step": 57627 + }, + { + "epoch": 5.377251096388915, + "grad_norm": NaN, + "learning_rate": 8.41446723960727e-06, + "loss": 0.0, + "step": 57628 + }, + { + "epoch": 5.377344406083792, + "grad_norm": NaN, + "learning_rate": 8.411969697134852e-06, + "loss": 0.0, + "step": 57629 + }, + { + "epoch": 5.37743771577867, + "grad_norm": NaN, + "learning_rate": 8.409472514678816e-06, + "loss": 0.0, + "step": 57630 + }, + { + "epoch": 5.377531025473547, + "grad_norm": NaN, + "learning_rate": 8.40697569224561e-06, + "loss": 0.0, + "step": 57631 + }, + { + "epoch": 5.377624335168424, + "grad_norm": NaN, + "learning_rate": 8.404479229841494e-06, + "loss": 0.0, + "step": 57632 + }, + { + "epoch": 5.377717644863301, + "grad_norm": NaN, + "learning_rate": 8.401983127472811e-06, + "loss": 0.0, + "step": 57633 + }, + { + "epoch": 5.3778109545581785, + "grad_norm": NaN, + "learning_rate": 8.399487385145992e-06, + "loss": 0.0, + "step": 57634 + }, + { + "epoch": 5.377904264253056, + "grad_norm": NaN, + "learning_rate": 8.396992002867314e-06, + "loss": 0.0, + "step": 57635 + }, + { + "epoch": 5.377997573947933, + "grad_norm": NaN, + "learning_rate": 8.394496980643106e-06, + "loss": 0.0, + "step": 57636 + }, + { + "epoch": 5.378090883642811, + "grad_norm": NaN, + "learning_rate": 8.392002318479796e-06, + "loss": 0.0, + "step": 57637 + }, + { + "epoch": 5.378184193337688, + "grad_norm": NaN, + "learning_rate": 8.389508016383662e-06, + "loss": 0.0, + "step": 57638 + }, + { + "epoch": 5.378277503032565, + "grad_norm": NaN, + "learning_rate": 8.387014074361064e-06, + "loss": 0.0, + "step": 57639 + }, + { + "epoch": 5.378370812727442, + "grad_norm": NaN, + "learning_rate": 8.384520492418317e-06, + "loss": 0.0, + "step": 57640 + }, + { + "epoch": 5.3784641224223195, + "grad_norm": NaN, + "learning_rate": 8.382027270561814e-06, + "loss": 0.0, + "step": 57641 + }, + { + "epoch": 5.378557432117197, + "grad_norm": NaN, + "learning_rate": 8.37953440879785e-06, + "loss": 0.0, + "step": 57642 + }, + { + "epoch": 5.378650741812074, + "grad_norm": NaN, + "learning_rate": 8.377041907132769e-06, + "loss": 0.0, + "step": 57643 + }, + { + "epoch": 5.378744051506952, + "grad_norm": NaN, + "learning_rate": 8.374549765572935e-06, + "loss": 0.0, + "step": 57644 + }, + { + "epoch": 5.378837361201829, + "grad_norm": NaN, + "learning_rate": 8.372057984124659e-06, + "loss": 0.0, + "step": 57645 + }, + { + "epoch": 5.378930670896706, + "grad_norm": NaN, + "learning_rate": 8.369566562794283e-06, + "loss": 0.0, + "step": 57646 + }, + { + "epoch": 5.379023980591583, + "grad_norm": NaN, + "learning_rate": 8.367075501588139e-06, + "loss": 0.0, + "step": 57647 + }, + { + "epoch": 5.379117290286461, + "grad_norm": NaN, + "learning_rate": 8.364584800512603e-06, + "loss": 0.0, + "step": 57648 + }, + { + "epoch": 5.379210599981338, + "grad_norm": NaN, + "learning_rate": 8.362094459573954e-06, + "loss": 0.0, + "step": 57649 + }, + { + "epoch": 5.379303909676215, + "grad_norm": NaN, + "learning_rate": 8.359604478778504e-06, + "loss": 0.0, + "step": 57650 + }, + { + "epoch": 5.379397219371093, + "grad_norm": NaN, + "learning_rate": 8.35711485813268e-06, + "loss": 0.0, + "step": 57651 + }, + { + "epoch": 5.37949052906597, + "grad_norm": NaN, + "learning_rate": 8.354625597642729e-06, + "loss": 0.0, + "step": 57652 + }, + { + "epoch": 5.379583838760848, + "grad_norm": NaN, + "learning_rate": 8.352136697314993e-06, + "loss": 0.0, + "step": 57653 + }, + { + "epoch": 5.379677148455724, + "grad_norm": NaN, + "learning_rate": 8.349648157155852e-06, + "loss": 0.0, + "step": 57654 + }, + { + "epoch": 5.379770458150602, + "grad_norm": NaN, + "learning_rate": 8.347159977171569e-06, + "loss": 0.0, + "step": 57655 + }, + { + "epoch": 5.379863767845479, + "grad_norm": NaN, + "learning_rate": 8.344672157368488e-06, + "loss": 0.0, + "step": 57656 + }, + { + "epoch": 5.3799570775403565, + "grad_norm": NaN, + "learning_rate": 8.342184697752985e-06, + "loss": 0.0, + "step": 57657 + }, + { + "epoch": 5.380050387235234, + "grad_norm": NaN, + "learning_rate": 8.339697598331307e-06, + "loss": 0.0, + "step": 57658 + }, + { + "epoch": 5.380143696930111, + "grad_norm": NaN, + "learning_rate": 8.337210859109817e-06, + "loss": 0.0, + "step": 57659 + }, + { + "epoch": 5.380237006624989, + "grad_norm": NaN, + "learning_rate": 8.334724480094857e-06, + "loss": 0.0, + "step": 57660 + }, + { + "epoch": 5.380330316319865, + "grad_norm": NaN, + "learning_rate": 8.332238461292724e-06, + "loss": 0.0, + "step": 57661 + }, + { + "epoch": 5.380423626014743, + "grad_norm": NaN, + "learning_rate": 8.32975280270971e-06, + "loss": 0.0, + "step": 57662 + }, + { + "epoch": 5.38051693570962, + "grad_norm": NaN, + "learning_rate": 8.327267504352232e-06, + "loss": 0.0, + "step": 57663 + }, + { + "epoch": 5.3806102454044975, + "grad_norm": NaN, + "learning_rate": 8.324782566226495e-06, + "loss": 0.0, + "step": 57664 + }, + { + "epoch": 5.380703555099375, + "grad_norm": NaN, + "learning_rate": 8.322297988338866e-06, + "loss": 0.0, + "step": 57665 + }, + { + "epoch": 5.380796864794252, + "grad_norm": NaN, + "learning_rate": 8.319813770695704e-06, + "loss": 0.0, + "step": 57666 + }, + { + "epoch": 5.38089017448913, + "grad_norm": NaN, + "learning_rate": 8.317329913303272e-06, + "loss": 0.0, + "step": 57667 + }, + { + "epoch": 5.380983484184007, + "grad_norm": NaN, + "learning_rate": 8.314846416167864e-06, + "loss": 0.0, + "step": 57668 + }, + { + "epoch": 5.381076793878884, + "grad_norm": NaN, + "learning_rate": 8.312363279295891e-06, + "loss": 0.0, + "step": 57669 + }, + { + "epoch": 5.381170103573761, + "grad_norm": NaN, + "learning_rate": 8.309880502693566e-06, + "loss": 0.0, + "step": 57670 + }, + { + "epoch": 5.3812634132686386, + "grad_norm": NaN, + "learning_rate": 8.307398086367234e-06, + "loss": 0.0, + "step": 57671 + }, + { + "epoch": 5.381356722963516, + "grad_norm": NaN, + "learning_rate": 8.304916030323256e-06, + "loss": 0.0, + "step": 57672 + }, + { + "epoch": 5.381450032658393, + "grad_norm": NaN, + "learning_rate": 8.302434334567892e-06, + "loss": 0.0, + "step": 57673 + }, + { + "epoch": 5.381543342353271, + "grad_norm": NaN, + "learning_rate": 8.29995299910744e-06, + "loss": 0.0, + "step": 57674 + }, + { + "epoch": 5.381636652048147, + "grad_norm": NaN, + "learning_rate": 8.297472023948259e-06, + "loss": 0.0, + "step": 57675 + }, + { + "epoch": 5.381729961743025, + "grad_norm": NaN, + "learning_rate": 8.29499140909663e-06, + "loss": 0.0, + "step": 57676 + }, + { + "epoch": 5.381823271437902, + "grad_norm": NaN, + "learning_rate": 8.292511154558828e-06, + "loss": 0.0, + "step": 57677 + }, + { + "epoch": 5.38191658113278, + "grad_norm": NaN, + "learning_rate": 8.290031260341235e-06, + "loss": 0.0, + "step": 57678 + }, + { + "epoch": 5.382009890827657, + "grad_norm": NaN, + "learning_rate": 8.287551726450109e-06, + "loss": 0.0, + "step": 57679 + }, + { + "epoch": 5.382103200522534, + "grad_norm": NaN, + "learning_rate": 8.285072552891713e-06, + "loss": 0.0, + "step": 57680 + }, + { + "epoch": 5.382196510217412, + "grad_norm": NaN, + "learning_rate": 8.28259373967246e-06, + "loss": 0.0, + "step": 57681 + }, + { + "epoch": 5.382289819912289, + "grad_norm": NaN, + "learning_rate": 8.280115286798561e-06, + "loss": 0.0, + "step": 57682 + }, + { + "epoch": 5.382383129607166, + "grad_norm": NaN, + "learning_rate": 8.277637194276359e-06, + "loss": 0.0, + "step": 57683 + }, + { + "epoch": 5.382476439302043, + "grad_norm": NaN, + "learning_rate": 8.275159462112136e-06, + "loss": 0.0, + "step": 57684 + }, + { + "epoch": 5.382569748996921, + "grad_norm": NaN, + "learning_rate": 8.272682090312183e-06, + "loss": 0.0, + "step": 57685 + }, + { + "epoch": 5.382663058691798, + "grad_norm": NaN, + "learning_rate": 8.270205078882863e-06, + "loss": 0.0, + "step": 57686 + }, + { + "epoch": 5.3827563683866755, + "grad_norm": NaN, + "learning_rate": 8.267728427830406e-06, + "loss": 0.0, + "step": 57687 + }, + { + "epoch": 5.382849678081553, + "grad_norm": NaN, + "learning_rate": 8.265252137161105e-06, + "loss": 0.0, + "step": 57688 + }, + { + "epoch": 5.38294298777643, + "grad_norm": NaN, + "learning_rate": 8.262776206881322e-06, + "loss": 0.0, + "step": 57689 + }, + { + "epoch": 5.383036297471307, + "grad_norm": NaN, + "learning_rate": 8.260300636997302e-06, + "loss": 0.0, + "step": 57690 + }, + { + "epoch": 5.383129607166184, + "grad_norm": NaN, + "learning_rate": 8.257825427515324e-06, + "loss": 0.0, + "step": 57691 + }, + { + "epoch": 5.383222916861062, + "grad_norm": NaN, + "learning_rate": 8.25535057844175e-06, + "loss": 0.0, + "step": 57692 + }, + { + "epoch": 5.383316226555939, + "grad_norm": NaN, + "learning_rate": 8.252876089782807e-06, + "loss": 0.0, + "step": 57693 + }, + { + "epoch": 5.3834095362508165, + "grad_norm": NaN, + "learning_rate": 8.250401961544806e-06, + "loss": 0.0, + "step": 57694 + }, + { + "epoch": 5.383502845945694, + "grad_norm": NaN, + "learning_rate": 8.247928193734076e-06, + "loss": 0.0, + "step": 57695 + }, + { + "epoch": 5.383596155640571, + "grad_norm": NaN, + "learning_rate": 8.245454786356848e-06, + "loss": 0.0, + "step": 57696 + }, + { + "epoch": 5.383689465335449, + "grad_norm": NaN, + "learning_rate": 8.242981739419414e-06, + "loss": 0.0, + "step": 57697 + }, + { + "epoch": 5.383782775030325, + "grad_norm": NaN, + "learning_rate": 8.240509052928134e-06, + "loss": 0.0, + "step": 57698 + }, + { + "epoch": 5.383876084725203, + "grad_norm": NaN, + "learning_rate": 8.238036726889242e-06, + "loss": 0.0, + "step": 57699 + }, + { + "epoch": 5.38396939442008, + "grad_norm": NaN, + "learning_rate": 8.235564761308994e-06, + "loss": 0.0, + "step": 57700 + }, + { + "epoch": 5.384062704114958, + "grad_norm": NaN, + "learning_rate": 8.233093156193755e-06, + "loss": 0.0, + "step": 57701 + }, + { + "epoch": 5.384156013809835, + "grad_norm": NaN, + "learning_rate": 8.230621911549734e-06, + "loss": 0.0, + "step": 57702 + }, + { + "epoch": 5.384249323504712, + "grad_norm": NaN, + "learning_rate": 8.228151027383228e-06, + "loss": 0.0, + "step": 57703 + }, + { + "epoch": 5.38434263319959, + "grad_norm": NaN, + "learning_rate": 8.225680503700599e-06, + "loss": 0.0, + "step": 57704 + }, + { + "epoch": 5.384435942894466, + "grad_norm": NaN, + "learning_rate": 8.223210340508024e-06, + "loss": 0.0, + "step": 57705 + }, + { + "epoch": 5.384529252589344, + "grad_norm": NaN, + "learning_rate": 8.220740537811815e-06, + "loss": 0.0, + "step": 57706 + }, + { + "epoch": 5.384622562284221, + "grad_norm": NaN, + "learning_rate": 8.218271095618317e-06, + "loss": 0.0, + "step": 57707 + }, + { + "epoch": 5.384715871979099, + "grad_norm": NaN, + "learning_rate": 8.215802013933725e-06, + "loss": 0.0, + "step": 57708 + }, + { + "epoch": 5.384809181673976, + "grad_norm": NaN, + "learning_rate": 8.213333292764334e-06, + "loss": 0.0, + "step": 57709 + }, + { + "epoch": 5.3849024913688535, + "grad_norm": NaN, + "learning_rate": 8.210864932116473e-06, + "loss": 0.0, + "step": 57710 + }, + { + "epoch": 5.384995801063731, + "grad_norm": NaN, + "learning_rate": 8.208396931996352e-06, + "loss": 0.0, + "step": 57711 + }, + { + "epoch": 5.385089110758607, + "grad_norm": NaN, + "learning_rate": 8.205929292410268e-06, + "loss": 0.0, + "step": 57712 + }, + { + "epoch": 5.385182420453485, + "grad_norm": NaN, + "learning_rate": 8.203462013364548e-06, + "loss": 0.0, + "step": 57713 + }, + { + "epoch": 5.385275730148362, + "grad_norm": NaN, + "learning_rate": 8.200995094865387e-06, + "loss": 0.0, + "step": 57714 + }, + { + "epoch": 5.38536903984324, + "grad_norm": NaN, + "learning_rate": 8.19852853691908e-06, + "loss": 0.0, + "step": 57715 + }, + { + "epoch": 5.385462349538117, + "grad_norm": NaN, + "learning_rate": 8.196062339531956e-06, + "loss": 0.0, + "step": 57716 + }, + { + "epoch": 5.3855556592329945, + "grad_norm": NaN, + "learning_rate": 8.193596502710208e-06, + "loss": 0.0, + "step": 57717 + }, + { + "epoch": 5.385648968927872, + "grad_norm": NaN, + "learning_rate": 8.191131026460118e-06, + "loss": 0.0, + "step": 57718 + }, + { + "epoch": 5.3857422786227485, + "grad_norm": NaN, + "learning_rate": 8.188665910788029e-06, + "loss": 0.0, + "step": 57719 + }, + { + "epoch": 5.385835588317626, + "grad_norm": NaN, + "learning_rate": 8.186201155700116e-06, + "loss": 0.0, + "step": 57720 + }, + { + "epoch": 5.385928898012503, + "grad_norm": NaN, + "learning_rate": 8.18373676120268e-06, + "loss": 0.0, + "step": 57721 + }, + { + "epoch": 5.386022207707381, + "grad_norm": NaN, + "learning_rate": 8.181272727302029e-06, + "loss": 0.0, + "step": 57722 + }, + { + "epoch": 5.386115517402258, + "grad_norm": NaN, + "learning_rate": 8.178809054004342e-06, + "loss": 0.0, + "step": 57723 + }, + { + "epoch": 5.3862088270971356, + "grad_norm": NaN, + "learning_rate": 8.176345741315964e-06, + "loss": 0.0, + "step": 57724 + }, + { + "epoch": 5.386302136792013, + "grad_norm": NaN, + "learning_rate": 8.17388278924314e-06, + "loss": 0.0, + "step": 57725 + }, + { + "epoch": 5.38639544648689, + "grad_norm": NaN, + "learning_rate": 8.171420197792084e-06, + "loss": 0.0, + "step": 57726 + }, + { + "epoch": 5.386488756181767, + "grad_norm": NaN, + "learning_rate": 8.168957966969136e-06, + "loss": 0.0, + "step": 57727 + }, + { + "epoch": 5.386582065876644, + "grad_norm": NaN, + "learning_rate": 8.166496096780495e-06, + "loss": 0.0, + "step": 57728 + }, + { + "epoch": 5.386675375571522, + "grad_norm": NaN, + "learning_rate": 8.164034587232421e-06, + "loss": 0.0, + "step": 57729 + }, + { + "epoch": 5.386768685266399, + "grad_norm": NaN, + "learning_rate": 8.161573438331227e-06, + "loss": 0.0, + "step": 57730 + }, + { + "epoch": 5.386861994961277, + "grad_norm": NaN, + "learning_rate": 8.159112650083105e-06, + "loss": 0.0, + "step": 57731 + }, + { + "epoch": 5.386955304656154, + "grad_norm": NaN, + "learning_rate": 8.156652222494337e-06, + "loss": 0.0, + "step": 57732 + }, + { + "epoch": 5.387048614351031, + "grad_norm": NaN, + "learning_rate": 8.154192155571232e-06, + "loss": 0.0, + "step": 57733 + }, + { + "epoch": 5.387141924045908, + "grad_norm": NaN, + "learning_rate": 8.15173244931997e-06, + "loss": 0.0, + "step": 57734 + }, + { + "epoch": 5.387235233740785, + "grad_norm": NaN, + "learning_rate": 8.149273103746812e-06, + "loss": 0.0, + "step": 57735 + }, + { + "epoch": 5.387328543435663, + "grad_norm": NaN, + "learning_rate": 8.146814118858085e-06, + "loss": 0.0, + "step": 57736 + }, + { + "epoch": 5.38742185313054, + "grad_norm": NaN, + "learning_rate": 8.144355494659955e-06, + "loss": 0.0, + "step": 57737 + }, + { + "epoch": 5.387515162825418, + "grad_norm": NaN, + "learning_rate": 8.141897231158696e-06, + "loss": 0.0, + "step": 57738 + }, + { + "epoch": 5.387608472520295, + "grad_norm": NaN, + "learning_rate": 8.139439328360603e-06, + "loss": 0.0, + "step": 57739 + }, + { + "epoch": 5.3877017822151725, + "grad_norm": NaN, + "learning_rate": 8.136981786271874e-06, + "loss": 0.0, + "step": 57740 + }, + { + "epoch": 5.38779509191005, + "grad_norm": NaN, + "learning_rate": 8.134524604898768e-06, + "loss": 0.0, + "step": 57741 + }, + { + "epoch": 5.387888401604926, + "grad_norm": NaN, + "learning_rate": 8.132067784247581e-06, + "loss": 0.0, + "step": 57742 + }, + { + "epoch": 5.387981711299804, + "grad_norm": NaN, + "learning_rate": 8.129611324324492e-06, + "loss": 0.0, + "step": 57743 + }, + { + "epoch": 5.388075020994681, + "grad_norm": NaN, + "learning_rate": 8.127155225135761e-06, + "loss": 0.0, + "step": 57744 + }, + { + "epoch": 5.388168330689559, + "grad_norm": NaN, + "learning_rate": 8.124699486687685e-06, + "loss": 0.0, + "step": 57745 + }, + { + "epoch": 5.388261640384436, + "grad_norm": NaN, + "learning_rate": 8.122244108986459e-06, + "loss": 0.0, + "step": 57746 + }, + { + "epoch": 5.3883549500793135, + "grad_norm": NaN, + "learning_rate": 8.119789092038308e-06, + "loss": 0.0, + "step": 57747 + }, + { + "epoch": 5.388448259774191, + "grad_norm": NaN, + "learning_rate": 8.117334435849565e-06, + "loss": 0.0, + "step": 57748 + }, + { + "epoch": 5.3885415694690675, + "grad_norm": NaN, + "learning_rate": 8.114880140426371e-06, + "loss": 0.0, + "step": 57749 + }, + { + "epoch": 5.388634879163945, + "grad_norm": NaN, + "learning_rate": 8.11242620577499e-06, + "loss": 0.0, + "step": 57750 + }, + { + "epoch": 5.388728188858822, + "grad_norm": NaN, + "learning_rate": 8.109972631901734e-06, + "loss": 0.0, + "step": 57751 + }, + { + "epoch": 5.3888214985537, + "grad_norm": NaN, + "learning_rate": 8.107519418812763e-06, + "loss": 0.0, + "step": 57752 + }, + { + "epoch": 5.388914808248577, + "grad_norm": NaN, + "learning_rate": 8.105066566514307e-06, + "loss": 0.0, + "step": 57753 + }, + { + "epoch": 5.389008117943455, + "grad_norm": NaN, + "learning_rate": 8.102614075012676e-06, + "loss": 0.0, + "step": 57754 + }, + { + "epoch": 5.389101427638332, + "grad_norm": NaN, + "learning_rate": 8.100161944314048e-06, + "loss": 0.0, + "step": 57755 + }, + { + "epoch": 5.3891947373332085, + "grad_norm": NaN, + "learning_rate": 8.097710174424654e-06, + "loss": 0.0, + "step": 57756 + }, + { + "epoch": 5.389288047028086, + "grad_norm": NaN, + "learning_rate": 8.095258765350787e-06, + "loss": 0.0, + "step": 57757 + }, + { + "epoch": 5.389381356722963, + "grad_norm": NaN, + "learning_rate": 8.092807717098626e-06, + "loss": 0.0, + "step": 57758 + }, + { + "epoch": 5.389474666417841, + "grad_norm": NaN, + "learning_rate": 8.090357029674399e-06, + "loss": 0.0, + "step": 57759 + }, + { + "epoch": 5.389567976112718, + "grad_norm": NaN, + "learning_rate": 8.087906703084401e-06, + "loss": 0.0, + "step": 57760 + }, + { + "epoch": 5.389661285807596, + "grad_norm": NaN, + "learning_rate": 8.085456737334778e-06, + "loss": 0.0, + "step": 57761 + }, + { + "epoch": 5.389754595502473, + "grad_norm": NaN, + "learning_rate": 8.083007132431807e-06, + "loss": 0.0, + "step": 57762 + }, + { + "epoch": 5.38984790519735, + "grad_norm": NaN, + "learning_rate": 8.080557888381734e-06, + "loss": 0.0, + "step": 57763 + }, + { + "epoch": 5.389941214892227, + "grad_norm": NaN, + "learning_rate": 8.078109005190736e-06, + "loss": 0.0, + "step": 57764 + }, + { + "epoch": 5.390034524587104, + "grad_norm": NaN, + "learning_rate": 8.075660482865076e-06, + "loss": 0.0, + "step": 57765 + }, + { + "epoch": 5.390127834281982, + "grad_norm": NaN, + "learning_rate": 8.073212321410982e-06, + "loss": 0.0, + "step": 57766 + }, + { + "epoch": 5.390221143976859, + "grad_norm": NaN, + "learning_rate": 8.07076452083465e-06, + "loss": 0.0, + "step": 57767 + }, + { + "epoch": 5.390314453671737, + "grad_norm": NaN, + "learning_rate": 8.068317081142323e-06, + "loss": 0.0, + "step": 57768 + }, + { + "epoch": 5.390407763366614, + "grad_norm": NaN, + "learning_rate": 8.065870002340247e-06, + "loss": 0.0, + "step": 57769 + }, + { + "epoch": 5.3905010730614915, + "grad_norm": NaN, + "learning_rate": 8.063423284434583e-06, + "loss": 0.0, + "step": 57770 + }, + { + "epoch": 5.390594382756368, + "grad_norm": NaN, + "learning_rate": 8.060976927431596e-06, + "loss": 0.0, + "step": 57771 + }, + { + "epoch": 5.3906876924512455, + "grad_norm": NaN, + "learning_rate": 8.058530931337543e-06, + "loss": 0.0, + "step": 57772 + }, + { + "epoch": 5.390781002146123, + "grad_norm": NaN, + "learning_rate": 8.056085296158537e-06, + "loss": 0.0, + "step": 57773 + }, + { + "epoch": 5.390874311841, + "grad_norm": NaN, + "learning_rate": 8.053640021900909e-06, + "loss": 0.0, + "step": 57774 + }, + { + "epoch": 5.390967621535878, + "grad_norm": NaN, + "learning_rate": 8.0511951085708e-06, + "loss": 0.0, + "step": 57775 + }, + { + "epoch": 5.391060931230755, + "grad_norm": NaN, + "learning_rate": 8.048750556174444e-06, + "loss": 0.0, + "step": 57776 + }, + { + "epoch": 5.391154240925633, + "grad_norm": NaN, + "learning_rate": 8.046306364718097e-06, + "loss": 0.0, + "step": 57777 + }, + { + "epoch": 5.391247550620509, + "grad_norm": NaN, + "learning_rate": 8.04386253420794e-06, + "loss": 0.0, + "step": 57778 + }, + { + "epoch": 5.3913408603153865, + "grad_norm": NaN, + "learning_rate": 8.041419064650151e-06, + "loss": 0.0, + "step": 57779 + }, + { + "epoch": 5.391434170010264, + "grad_norm": NaN, + "learning_rate": 8.038975956051024e-06, + "loss": 0.0, + "step": 57780 + }, + { + "epoch": 5.391527479705141, + "grad_norm": NaN, + "learning_rate": 8.036533208416723e-06, + "loss": 0.0, + "step": 57781 + }, + { + "epoch": 5.391620789400019, + "grad_norm": NaN, + "learning_rate": 8.034090821753441e-06, + "loss": 0.0, + "step": 57782 + }, + { + "epoch": 5.391714099094896, + "grad_norm": NaN, + "learning_rate": 8.031648796067441e-06, + "loss": 0.0, + "step": 57783 + }, + { + "epoch": 5.391807408789774, + "grad_norm": NaN, + "learning_rate": 8.029207131364885e-06, + "loss": 0.0, + "step": 57784 + }, + { + "epoch": 5.391900718484651, + "grad_norm": NaN, + "learning_rate": 8.026765827651998e-06, + "loss": 0.0, + "step": 57785 + }, + { + "epoch": 5.3919940281795276, + "grad_norm": NaN, + "learning_rate": 8.024324884935013e-06, + "loss": 0.0, + "step": 57786 + }, + { + "epoch": 5.392087337874405, + "grad_norm": NaN, + "learning_rate": 8.021884303220105e-06, + "loss": 0.0, + "step": 57787 + }, + { + "epoch": 5.392180647569282, + "grad_norm": NaN, + "learning_rate": 8.019444082513454e-06, + "loss": 0.0, + "step": 57788 + }, + { + "epoch": 5.39227395726416, + "grad_norm": NaN, + "learning_rate": 8.017004222821355e-06, + "loss": 0.0, + "step": 57789 + }, + { + "epoch": 5.392367266959037, + "grad_norm": NaN, + "learning_rate": 8.01456472414992e-06, + "loss": 0.0, + "step": 57790 + }, + { + "epoch": 5.392460576653915, + "grad_norm": NaN, + "learning_rate": 8.012125586505375e-06, + "loss": 0.0, + "step": 57791 + }, + { + "epoch": 5.392553886348791, + "grad_norm": NaN, + "learning_rate": 8.009686809893984e-06, + "loss": 0.0, + "step": 57792 + }, + { + "epoch": 5.392647196043669, + "grad_norm": NaN, + "learning_rate": 8.007248394321858e-06, + "loss": 0.0, + "step": 57793 + }, + { + "epoch": 5.392740505738546, + "grad_norm": NaN, + "learning_rate": 8.004810339795243e-06, + "loss": 0.0, + "step": 57794 + }, + { + "epoch": 5.392833815433423, + "grad_norm": NaN, + "learning_rate": 8.002372646320349e-06, + "loss": 0.0, + "step": 57795 + }, + { + "epoch": 5.392927125128301, + "grad_norm": NaN, + "learning_rate": 7.999935313903355e-06, + "loss": 0.0, + "step": 57796 + }, + { + "epoch": 5.393020434823178, + "grad_norm": NaN, + "learning_rate": 7.99749834255044e-06, + "loss": 0.0, + "step": 57797 + }, + { + "epoch": 5.393113744518056, + "grad_norm": NaN, + "learning_rate": 7.995061732267866e-06, + "loss": 0.0, + "step": 57798 + }, + { + "epoch": 5.393207054212933, + "grad_norm": NaN, + "learning_rate": 7.992625483061726e-06, + "loss": 0.0, + "step": 57799 + }, + { + "epoch": 5.39330036390781, + "grad_norm": NaN, + "learning_rate": 7.9901895949383e-06, + "loss": 0.0, + "step": 57800 + }, + { + "epoch": 5.393393673602687, + "grad_norm": NaN, + "learning_rate": 7.987754067903784e-06, + "loss": 0.0, + "step": 57801 + }, + { + "epoch": 5.3934869832975645, + "grad_norm": NaN, + "learning_rate": 7.985318901964305e-06, + "loss": 0.0, + "step": 57802 + }, + { + "epoch": 5.393580292992442, + "grad_norm": NaN, + "learning_rate": 7.982884097126108e-06, + "loss": 0.0, + "step": 57803 + }, + { + "epoch": 5.393673602687319, + "grad_norm": NaN, + "learning_rate": 7.980449653395388e-06, + "loss": 0.0, + "step": 57804 + }, + { + "epoch": 5.393766912382197, + "grad_norm": NaN, + "learning_rate": 7.978015570778273e-06, + "loss": 0.0, + "step": 57805 + }, + { + "epoch": 5.393860222077074, + "grad_norm": NaN, + "learning_rate": 7.97558184928101e-06, + "loss": 0.0, + "step": 57806 + }, + { + "epoch": 5.393953531771951, + "grad_norm": NaN, + "learning_rate": 7.973148488909809e-06, + "loss": 0.0, + "step": 57807 + }, + { + "epoch": 5.394046841466828, + "grad_norm": NaN, + "learning_rate": 7.970715489670764e-06, + "loss": 0.0, + "step": 57808 + }, + { + "epoch": 5.3941401511617055, + "grad_norm": NaN, + "learning_rate": 7.96828285157014e-06, + "loss": 0.0, + "step": 57809 + }, + { + "epoch": 5.394233460856583, + "grad_norm": NaN, + "learning_rate": 7.96585057461413e-06, + "loss": 0.0, + "step": 57810 + }, + { + "epoch": 5.39432677055146, + "grad_norm": NaN, + "learning_rate": 7.96341865880883e-06, + "loss": 0.0, + "step": 57811 + }, + { + "epoch": 5.394420080246338, + "grad_norm": NaN, + "learning_rate": 7.960987104160516e-06, + "loss": 0.0, + "step": 57812 + }, + { + "epoch": 5.394513389941215, + "grad_norm": NaN, + "learning_rate": 7.958555910675351e-06, + "loss": 0.0, + "step": 57813 + }, + { + "epoch": 5.394606699636093, + "grad_norm": NaN, + "learning_rate": 7.956125078359465e-06, + "loss": 0.0, + "step": 57814 + }, + { + "epoch": 5.394700009330969, + "grad_norm": NaN, + "learning_rate": 7.953694607219102e-06, + "loss": 0.0, + "step": 57815 + }, + { + "epoch": 5.394793319025847, + "grad_norm": NaN, + "learning_rate": 7.951264497260424e-06, + "loss": 0.0, + "step": 57816 + }, + { + "epoch": 5.394886628720724, + "grad_norm": NaN, + "learning_rate": 7.948834748489573e-06, + "loss": 0.0, + "step": 57817 + }, + { + "epoch": 5.394979938415601, + "grad_norm": NaN, + "learning_rate": 7.946405360912783e-06, + "loss": 0.0, + "step": 57818 + }, + { + "epoch": 5.395073248110479, + "grad_norm": NaN, + "learning_rate": 7.943976334536194e-06, + "loss": 0.0, + "step": 57819 + }, + { + "epoch": 5.395166557805356, + "grad_norm": NaN, + "learning_rate": 7.941547669365973e-06, + "loss": 0.0, + "step": 57820 + }, + { + "epoch": 5.395259867500234, + "grad_norm": NaN, + "learning_rate": 7.939119365408359e-06, + "loss": 0.0, + "step": 57821 + }, + { + "epoch": 5.39535317719511, + "grad_norm": NaN, + "learning_rate": 7.936691422669466e-06, + "loss": 0.0, + "step": 57822 + }, + { + "epoch": 5.395446486889988, + "grad_norm": NaN, + "learning_rate": 7.934263841155459e-06, + "loss": 0.0, + "step": 57823 + }, + { + "epoch": 5.395539796584865, + "grad_norm": NaN, + "learning_rate": 7.931836620872561e-06, + "loss": 0.0, + "step": 57824 + }, + { + "epoch": 5.3956331062797425, + "grad_norm": NaN, + "learning_rate": 7.929409761826922e-06, + "loss": 0.0, + "step": 57825 + }, + { + "epoch": 5.39572641597462, + "grad_norm": NaN, + "learning_rate": 7.926983264024683e-06, + "loss": 0.0, + "step": 57826 + }, + { + "epoch": 5.395819725669497, + "grad_norm": NaN, + "learning_rate": 7.924557127472075e-06, + "loss": 0.0, + "step": 57827 + }, + { + "epoch": 5.395913035364375, + "grad_norm": NaN, + "learning_rate": 7.92213135217521e-06, + "loss": 0.0, + "step": 57828 + }, + { + "epoch": 5.396006345059251, + "grad_norm": NaN, + "learning_rate": 7.919705938140263e-06, + "loss": 0.0, + "step": 57829 + }, + { + "epoch": 5.396099654754129, + "grad_norm": NaN, + "learning_rate": 7.91728088537345e-06, + "loss": 0.0, + "step": 57830 + }, + { + "epoch": 5.396192964449006, + "grad_norm": NaN, + "learning_rate": 7.914856193880897e-06, + "loss": 0.0, + "step": 57831 + }, + { + "epoch": 5.3962862741438835, + "grad_norm": NaN, + "learning_rate": 7.91243186366875e-06, + "loss": 0.0, + "step": 57832 + }, + { + "epoch": 5.396379583838761, + "grad_norm": NaN, + "learning_rate": 7.910007894743236e-06, + "loss": 0.0, + "step": 57833 + }, + { + "epoch": 5.396472893533638, + "grad_norm": NaN, + "learning_rate": 7.907584287110469e-06, + "loss": 0.0, + "step": 57834 + }, + { + "epoch": 5.396566203228516, + "grad_norm": NaN, + "learning_rate": 7.905161040776591e-06, + "loss": 0.0, + "step": 57835 + }, + { + "epoch": 5.396659512923392, + "grad_norm": NaN, + "learning_rate": 7.902738155747852e-06, + "loss": 0.0, + "step": 57836 + }, + { + "epoch": 5.39675282261827, + "grad_norm": NaN, + "learning_rate": 7.900315632030307e-06, + "loss": 0.0, + "step": 57837 + }, + { + "epoch": 5.396846132313147, + "grad_norm": NaN, + "learning_rate": 7.89789346963019e-06, + "loss": 0.0, + "step": 57838 + }, + { + "epoch": 5.396939442008025, + "grad_norm": NaN, + "learning_rate": 7.89547166855366e-06, + "loss": 0.0, + "step": 57839 + }, + { + "epoch": 5.397032751702902, + "grad_norm": NaN, + "learning_rate": 7.893050228806813e-06, + "loss": 0.0, + "step": 57840 + }, + { + "epoch": 5.397126061397779, + "grad_norm": NaN, + "learning_rate": 7.890629150395861e-06, + "loss": 0.0, + "step": 57841 + }, + { + "epoch": 5.397219371092657, + "grad_norm": NaN, + "learning_rate": 7.888208433326965e-06, + "loss": 0.0, + "step": 57842 + }, + { + "epoch": 5.397312680787534, + "grad_norm": NaN, + "learning_rate": 7.885788077606203e-06, + "loss": 0.0, + "step": 57843 + }, + { + "epoch": 5.397405990482411, + "grad_norm": NaN, + "learning_rate": 7.883368083239822e-06, + "loss": 0.0, + "step": 57844 + }, + { + "epoch": 5.397499300177288, + "grad_norm": NaN, + "learning_rate": 7.880948450233948e-06, + "loss": 0.0, + "step": 57845 + }, + { + "epoch": 5.397592609872166, + "grad_norm": NaN, + "learning_rate": 7.878529178594695e-06, + "loss": 0.0, + "step": 57846 + }, + { + "epoch": 5.397685919567043, + "grad_norm": NaN, + "learning_rate": 7.876110268328257e-06, + "loss": 0.0, + "step": 57847 + }, + { + "epoch": 5.39777922926192, + "grad_norm": NaN, + "learning_rate": 7.873691719440777e-06, + "loss": 0.0, + "step": 57848 + }, + { + "epoch": 5.397872538956798, + "grad_norm": NaN, + "learning_rate": 7.871273531938371e-06, + "loss": 0.0, + "step": 57849 + }, + { + "epoch": 5.397965848651675, + "grad_norm": NaN, + "learning_rate": 7.868855705827231e-06, + "loss": 0.0, + "step": 57850 + }, + { + "epoch": 5.398059158346552, + "grad_norm": NaN, + "learning_rate": 7.866438241113521e-06, + "loss": 0.0, + "step": 57851 + }, + { + "epoch": 5.398152468041429, + "grad_norm": NaN, + "learning_rate": 7.864021137803301e-06, + "loss": 0.0, + "step": 57852 + }, + { + "epoch": 5.398245777736307, + "grad_norm": NaN, + "learning_rate": 7.861604395902798e-06, + "loss": 0.0, + "step": 57853 + }, + { + "epoch": 5.398339087431184, + "grad_norm": NaN, + "learning_rate": 7.859188015418144e-06, + "loss": 0.0, + "step": 57854 + }, + { + "epoch": 5.3984323971260615, + "grad_norm": NaN, + "learning_rate": 7.856771996355431e-06, + "loss": 0.0, + "step": 57855 + }, + { + "epoch": 5.398525706820939, + "grad_norm": NaN, + "learning_rate": 7.854356338720875e-06, + "loss": 0.0, + "step": 57856 + }, + { + "epoch": 5.398619016515816, + "grad_norm": NaN, + "learning_rate": 7.851941042520582e-06, + "loss": 0.0, + "step": 57857 + }, + { + "epoch": 5.398712326210694, + "grad_norm": NaN, + "learning_rate": 7.849526107760668e-06, + "loss": 0.0, + "step": 57858 + }, + { + "epoch": 5.39880563590557, + "grad_norm": NaN, + "learning_rate": 7.847111534447325e-06, + "loss": 0.0, + "step": 57859 + }, + { + "epoch": 5.398898945600448, + "grad_norm": NaN, + "learning_rate": 7.844697322586685e-06, + "loss": 0.0, + "step": 57860 + }, + { + "epoch": 5.398992255295325, + "grad_norm": NaN, + "learning_rate": 7.842283472184823e-06, + "loss": 0.0, + "step": 57861 + }, + { + "epoch": 5.3990855649902025, + "grad_norm": NaN, + "learning_rate": 7.83986998324797e-06, + "loss": 0.0, + "step": 57862 + }, + { + "epoch": 5.39917887468508, + "grad_norm": NaN, + "learning_rate": 7.837456855782203e-06, + "loss": 0.0, + "step": 57863 + }, + { + "epoch": 5.399272184379957, + "grad_norm": NaN, + "learning_rate": 7.835044089793635e-06, + "loss": 0.0, + "step": 57864 + }, + { + "epoch": 5.399365494074834, + "grad_norm": NaN, + "learning_rate": 7.832631685288492e-06, + "loss": 0.0, + "step": 57865 + }, + { + "epoch": 5.399458803769711, + "grad_norm": NaN, + "learning_rate": 7.830219642272822e-06, + "loss": 0.0, + "step": 57866 + }, + { + "epoch": 5.399552113464589, + "grad_norm": NaN, + "learning_rate": 7.827807960752786e-06, + "loss": 0.0, + "step": 57867 + }, + { + "epoch": 5.399645423159466, + "grad_norm": NaN, + "learning_rate": 7.825396640734543e-06, + "loss": 0.0, + "step": 57868 + }, + { + "epoch": 5.399738732854344, + "grad_norm": NaN, + "learning_rate": 7.822985682224192e-06, + "loss": 0.0, + "step": 57869 + }, + { + "epoch": 5.399832042549221, + "grad_norm": NaN, + "learning_rate": 7.820575085227843e-06, + "loss": 0.0, + "step": 57870 + }, + { + "epoch": 5.399925352244098, + "grad_norm": NaN, + "learning_rate": 7.818164849751707e-06, + "loss": 0.0, + "step": 57871 + }, + { + "epoch": 5.400018661938976, + "grad_norm": NaN, + "learning_rate": 7.815754975801813e-06, + "loss": 0.0, + "step": 57872 + }, + { + "epoch": 5.400111971633852, + "grad_norm": NaN, + "learning_rate": 7.813345463384358e-06, + "loss": 0.0, + "step": 57873 + }, + { + "epoch": 5.40020528132873, + "grad_norm": NaN, + "learning_rate": 7.810936312505467e-06, + "loss": 0.0, + "step": 57874 + }, + { + "epoch": 5.400298591023607, + "grad_norm": NaN, + "learning_rate": 7.808527523171205e-06, + "loss": 0.0, + "step": 57875 + }, + { + "epoch": 5.400391900718485, + "grad_norm": NaN, + "learning_rate": 7.806119095387748e-06, + "loss": 0.0, + "step": 57876 + }, + { + "epoch": 5.400485210413362, + "grad_norm": NaN, + "learning_rate": 7.803711029161242e-06, + "loss": 0.0, + "step": 57877 + }, + { + "epoch": 5.4005785201082395, + "grad_norm": NaN, + "learning_rate": 7.801303324497732e-06, + "loss": 0.0, + "step": 57878 + }, + { + "epoch": 5.400671829803117, + "grad_norm": NaN, + "learning_rate": 7.798895981403397e-06, + "loss": 0.0, + "step": 57879 + }, + { + "epoch": 5.400765139497993, + "grad_norm": NaN, + "learning_rate": 7.796488999884382e-06, + "loss": 0.0, + "step": 57880 + }, + { + "epoch": 5.400858449192871, + "grad_norm": NaN, + "learning_rate": 7.794082379946731e-06, + "loss": 0.0, + "step": 57881 + }, + { + "epoch": 5.400951758887748, + "grad_norm": NaN, + "learning_rate": 7.791676121596607e-06, + "loss": 0.0, + "step": 57882 + }, + { + "epoch": 5.401045068582626, + "grad_norm": NaN, + "learning_rate": 7.789270224840173e-06, + "loss": 0.0, + "step": 57883 + }, + { + "epoch": 5.401138378277503, + "grad_norm": NaN, + "learning_rate": 7.786864689683436e-06, + "loss": 0.0, + "step": 57884 + }, + { + "epoch": 5.4012316879723805, + "grad_norm": NaN, + "learning_rate": 7.784459516132597e-06, + "loss": 0.0, + "step": 57885 + }, + { + "epoch": 5.401324997667258, + "grad_norm": NaN, + "learning_rate": 7.78205470419378e-06, + "loss": 0.0, + "step": 57886 + }, + { + "epoch": 5.401418307362135, + "grad_norm": NaN, + "learning_rate": 7.779650253873016e-06, + "loss": 0.0, + "step": 57887 + }, + { + "epoch": 5.401511617057012, + "grad_norm": NaN, + "learning_rate": 7.777246165176515e-06, + "loss": 0.0, + "step": 57888 + }, + { + "epoch": 5.401604926751889, + "grad_norm": NaN, + "learning_rate": 7.77484243811034e-06, + "loss": 0.0, + "step": 57889 + }, + { + "epoch": 5.401698236446767, + "grad_norm": NaN, + "learning_rate": 7.772439072680586e-06, + "loss": 0.0, + "step": 57890 + }, + { + "epoch": 5.401791546141644, + "grad_norm": NaN, + "learning_rate": 7.770036068893415e-06, + "loss": 0.0, + "step": 57891 + }, + { + "epoch": 5.401884855836522, + "grad_norm": NaN, + "learning_rate": 7.767633426754921e-06, + "loss": 0.0, + "step": 57892 + }, + { + "epoch": 5.401978165531399, + "grad_norm": NaN, + "learning_rate": 7.765231146271166e-06, + "loss": 0.0, + "step": 57893 + }, + { + "epoch": 5.402071475226276, + "grad_norm": NaN, + "learning_rate": 7.762829227448313e-06, + "loss": 0.0, + "step": 57894 + }, + { + "epoch": 5.402164784921153, + "grad_norm": NaN, + "learning_rate": 7.760427670292474e-06, + "loss": 0.0, + "step": 57895 + }, + { + "epoch": 5.40225809461603, + "grad_norm": NaN, + "learning_rate": 7.758026474809693e-06, + "loss": 0.0, + "step": 57896 + }, + { + "epoch": 5.402351404310908, + "grad_norm": NaN, + "learning_rate": 7.755625641006131e-06, + "loss": 0.0, + "step": 57897 + }, + { + "epoch": 5.402444714005785, + "grad_norm": NaN, + "learning_rate": 7.7532251688879e-06, + "loss": 0.0, + "step": 57898 + }, + { + "epoch": 5.402538023700663, + "grad_norm": NaN, + "learning_rate": 7.750825058461047e-06, + "loss": 0.0, + "step": 57899 + }, + { + "epoch": 5.40263133339554, + "grad_norm": NaN, + "learning_rate": 7.748425309731716e-06, + "loss": 0.0, + "step": 57900 + }, + { + "epoch": 5.4027246430904174, + "grad_norm": NaN, + "learning_rate": 7.746025922706034e-06, + "loss": 0.0, + "step": 57901 + }, + { + "epoch": 5.402817952785295, + "grad_norm": NaN, + "learning_rate": 7.743626897390016e-06, + "loss": 0.0, + "step": 57902 + }, + { + "epoch": 5.402911262480171, + "grad_norm": NaN, + "learning_rate": 7.741228233789836e-06, + "loss": 0.0, + "step": 57903 + }, + { + "epoch": 5.403004572175049, + "grad_norm": NaN, + "learning_rate": 7.73882993191161e-06, + "loss": 0.0, + "step": 57904 + }, + { + "epoch": 5.403097881869926, + "grad_norm": NaN, + "learning_rate": 7.736431991761333e-06, + "loss": 0.0, + "step": 57905 + }, + { + "epoch": 5.403191191564804, + "grad_norm": NaN, + "learning_rate": 7.73403441334523e-06, + "loss": 0.0, + "step": 57906 + }, + { + "epoch": 5.403284501259681, + "grad_norm": NaN, + "learning_rate": 7.731637196669316e-06, + "loss": 0.0, + "step": 57907 + }, + { + "epoch": 5.4033778109545585, + "grad_norm": NaN, + "learning_rate": 7.729240341739667e-06, + "loss": 0.0, + "step": 57908 + }, + { + "epoch": 5.403471120649435, + "grad_norm": NaN, + "learning_rate": 7.726843848562465e-06, + "loss": 0.0, + "step": 57909 + }, + { + "epoch": 5.403564430344312, + "grad_norm": NaN, + "learning_rate": 7.724447717143716e-06, + "loss": 0.0, + "step": 57910 + }, + { + "epoch": 5.40365774003919, + "grad_norm": NaN, + "learning_rate": 7.722051947489572e-06, + "loss": 0.0, + "step": 57911 + }, + { + "epoch": 5.403751049734067, + "grad_norm": NaN, + "learning_rate": 7.719656539606123e-06, + "loss": 0.0, + "step": 57912 + }, + { + "epoch": 5.403844359428945, + "grad_norm": NaN, + "learning_rate": 7.7172614934994e-06, + "loss": 0.0, + "step": 57913 + }, + { + "epoch": 5.403937669123822, + "grad_norm": NaN, + "learning_rate": 7.714866809175563e-06, + "loss": 0.0, + "step": 57914 + }, + { + "epoch": 5.4040309788186995, + "grad_norm": NaN, + "learning_rate": 7.712472486640675e-06, + "loss": 0.0, + "step": 57915 + }, + { + "epoch": 5.404124288513577, + "grad_norm": NaN, + "learning_rate": 7.710078525900798e-06, + "loss": 0.0, + "step": 57916 + }, + { + "epoch": 5.4042175982084535, + "grad_norm": NaN, + "learning_rate": 7.707684926962043e-06, + "loss": 0.0, + "step": 57917 + }, + { + "epoch": 5.404310907903331, + "grad_norm": NaN, + "learning_rate": 7.705291689830523e-06, + "loss": 0.0, + "step": 57918 + }, + { + "epoch": 5.404404217598208, + "grad_norm": NaN, + "learning_rate": 7.702898814512266e-06, + "loss": 0.0, + "step": 57919 + }, + { + "epoch": 5.404497527293086, + "grad_norm": NaN, + "learning_rate": 7.700506301013398e-06, + "loss": 0.0, + "step": 57920 + }, + { + "epoch": 5.404590836987963, + "grad_norm": NaN, + "learning_rate": 7.698114149340001e-06, + "loss": 0.0, + "step": 57921 + }, + { + "epoch": 5.404684146682841, + "grad_norm": NaN, + "learning_rate": 7.695722359498118e-06, + "loss": 0.0, + "step": 57922 + }, + { + "epoch": 5.404777456377718, + "grad_norm": NaN, + "learning_rate": 7.693330931493879e-06, + "loss": 0.0, + "step": 57923 + }, + { + "epoch": 5.4048707660725945, + "grad_norm": NaN, + "learning_rate": 7.690939865333361e-06, + "loss": 0.0, + "step": 57924 + }, + { + "epoch": 5.404964075767472, + "grad_norm": NaN, + "learning_rate": 7.688549161022595e-06, + "loss": 0.0, + "step": 57925 + }, + { + "epoch": 5.405057385462349, + "grad_norm": NaN, + "learning_rate": 7.686158818567707e-06, + "loss": 0.0, + "step": 57926 + }, + { + "epoch": 5.405150695157227, + "grad_norm": NaN, + "learning_rate": 7.68376883797479e-06, + "loss": 0.0, + "step": 57927 + }, + { + "epoch": 5.405244004852104, + "grad_norm": NaN, + "learning_rate": 7.681379219249828e-06, + "loss": 0.0, + "step": 57928 + }, + { + "epoch": 5.405337314546982, + "grad_norm": NaN, + "learning_rate": 7.678989962398996e-06, + "loss": 0.0, + "step": 57929 + }, + { + "epoch": 5.405430624241859, + "grad_norm": NaN, + "learning_rate": 7.676601067428356e-06, + "loss": 0.0, + "step": 57930 + }, + { + "epoch": 5.4055239339367365, + "grad_norm": NaN, + "learning_rate": 7.674212534343921e-06, + "loss": 0.0, + "step": 57931 + }, + { + "epoch": 5.405617243631613, + "grad_norm": NaN, + "learning_rate": 7.67182436315182e-06, + "loss": 0.0, + "step": 57932 + }, + { + "epoch": 5.40571055332649, + "grad_norm": NaN, + "learning_rate": 7.669436553858127e-06, + "loss": 0.0, + "step": 57933 + }, + { + "epoch": 5.405803863021368, + "grad_norm": NaN, + "learning_rate": 7.667049106468859e-06, + "loss": 0.0, + "step": 57934 + }, + { + "epoch": 5.405897172716245, + "grad_norm": NaN, + "learning_rate": 7.664662020990143e-06, + "loss": 0.0, + "step": 57935 + }, + { + "epoch": 5.405990482411123, + "grad_norm": NaN, + "learning_rate": 7.662275297428055e-06, + "loss": 0.0, + "step": 57936 + }, + { + "epoch": 5.406083792106, + "grad_norm": NaN, + "learning_rate": 7.659888935788578e-06, + "loss": 0.0, + "step": 57937 + }, + { + "epoch": 5.4061771018008775, + "grad_norm": NaN, + "learning_rate": 7.65750293607787e-06, + "loss": 0.0, + "step": 57938 + }, + { + "epoch": 5.406270411495754, + "grad_norm": NaN, + "learning_rate": 7.655117298301994e-06, + "loss": 0.0, + "step": 57939 + }, + { + "epoch": 5.4063637211906315, + "grad_norm": NaN, + "learning_rate": 7.652732022466945e-06, + "loss": 0.0, + "step": 57940 + }, + { + "epoch": 5.406457030885509, + "grad_norm": NaN, + "learning_rate": 7.650347108578852e-06, + "loss": 0.0, + "step": 57941 + }, + { + "epoch": 5.406550340580386, + "grad_norm": NaN, + "learning_rate": 7.647962556643778e-06, + "loss": 0.0, + "step": 57942 + }, + { + "epoch": 5.406643650275264, + "grad_norm": NaN, + "learning_rate": 7.645578366667732e-06, + "loss": 0.0, + "step": 57943 + }, + { + "epoch": 5.406736959970141, + "grad_norm": NaN, + "learning_rate": 7.643194538656827e-06, + "loss": 0.0, + "step": 57944 + }, + { + "epoch": 5.406830269665019, + "grad_norm": NaN, + "learning_rate": 7.640811072617125e-06, + "loss": 0.0, + "step": 57945 + }, + { + "epoch": 5.406923579359895, + "grad_norm": NaN, + "learning_rate": 7.638427968554623e-06, + "loss": 0.0, + "step": 57946 + }, + { + "epoch": 5.4070168890547725, + "grad_norm": NaN, + "learning_rate": 7.636045226475462e-06, + "loss": 0.0, + "step": 57947 + }, + { + "epoch": 5.40711019874965, + "grad_norm": NaN, + "learning_rate": 7.633662846385674e-06, + "loss": 0.0, + "step": 57948 + }, + { + "epoch": 5.407203508444527, + "grad_norm": NaN, + "learning_rate": 7.631280828291286e-06, + "loss": 0.0, + "step": 57949 + }, + { + "epoch": 5.407296818139405, + "grad_norm": NaN, + "learning_rate": 7.628899172198394e-06, + "loss": 0.0, + "step": 57950 + }, + { + "epoch": 5.407390127834282, + "grad_norm": NaN, + "learning_rate": 7.626517878113042e-06, + "loss": 0.0, + "step": 57951 + }, + { + "epoch": 5.40748343752916, + "grad_norm": NaN, + "learning_rate": 7.624136946041276e-06, + "loss": 0.0, + "step": 57952 + }, + { + "epoch": 5.407576747224036, + "grad_norm": NaN, + "learning_rate": 7.621756375989174e-06, + "loss": 0.0, + "step": 57953 + }, + { + "epoch": 5.407670056918914, + "grad_norm": NaN, + "learning_rate": 7.619376167962732e-06, + "loss": 0.0, + "step": 57954 + }, + { + "epoch": 5.407763366613791, + "grad_norm": NaN, + "learning_rate": 7.616996321968044e-06, + "loss": 0.0, + "step": 57955 + }, + { + "epoch": 5.407856676308668, + "grad_norm": NaN, + "learning_rate": 7.6146168380112065e-06, + "loss": 0.0, + "step": 57956 + }, + { + "epoch": 5.407949986003546, + "grad_norm": NaN, + "learning_rate": 7.612237716098163e-06, + "loss": 0.0, + "step": 57957 + }, + { + "epoch": 5.408043295698423, + "grad_norm": NaN, + "learning_rate": 7.6098589562350436e-06, + "loss": 0.0, + "step": 57958 + }, + { + "epoch": 5.408136605393301, + "grad_norm": NaN, + "learning_rate": 7.607480558427892e-06, + "loss": 0.0, + "step": 57959 + }, + { + "epoch": 5.408229915088178, + "grad_norm": NaN, + "learning_rate": 7.605102522682688e-06, + "loss": 0.0, + "step": 57960 + }, + { + "epoch": 5.408323224783055, + "grad_norm": NaN, + "learning_rate": 7.602724849005559e-06, + "loss": 0.0, + "step": 57961 + }, + { + "epoch": 5.408416534477932, + "grad_norm": NaN, + "learning_rate": 7.6003475374025345e-06, + "loss": 0.0, + "step": 57962 + }, + { + "epoch": 5.4085098441728094, + "grad_norm": NaN, + "learning_rate": 7.597970587879609e-06, + "loss": 0.0, + "step": 57963 + }, + { + "epoch": 5.408603153867687, + "grad_norm": NaN, + "learning_rate": 7.595594000442895e-06, + "loss": 0.0, + "step": 57964 + }, + { + "epoch": 5.408696463562564, + "grad_norm": NaN, + "learning_rate": 7.593217775098403e-06, + "loss": 0.0, + "step": 57965 + }, + { + "epoch": 5.408789773257442, + "grad_norm": NaN, + "learning_rate": 7.59084191185213e-06, + "loss": 0.0, + "step": 57966 + }, + { + "epoch": 5.408883082952319, + "grad_norm": NaN, + "learning_rate": 7.5884664107102034e-06, + "loss": 0.0, + "step": 57967 + }, + { + "epoch": 5.408976392647196, + "grad_norm": NaN, + "learning_rate": 7.5860912716786185e-06, + "loss": 0.0, + "step": 57968 + }, + { + "epoch": 5.409069702342073, + "grad_norm": NaN, + "learning_rate": 7.5837164947633866e-06, + "loss": 0.0, + "step": 57969 + }, + { + "epoch": 5.4091630120369505, + "grad_norm": NaN, + "learning_rate": 7.581342079970604e-06, + "loss": 0.0, + "step": 57970 + }, + { + "epoch": 5.409256321731828, + "grad_norm": NaN, + "learning_rate": 7.578968027306298e-06, + "loss": 0.0, + "step": 57971 + }, + { + "epoch": 5.409349631426705, + "grad_norm": NaN, + "learning_rate": 7.576594336776448e-06, + "loss": 0.0, + "step": 57972 + }, + { + "epoch": 5.409442941121583, + "grad_norm": NaN, + "learning_rate": 7.574221008387132e-06, + "loss": 0.0, + "step": 57973 + }, + { + "epoch": 5.40953625081646, + "grad_norm": NaN, + "learning_rate": 7.57184804214443e-06, + "loss": 0.0, + "step": 57974 + }, + { + "epoch": 5.409629560511338, + "grad_norm": NaN, + "learning_rate": 7.569475438054268e-06, + "loss": 0.0, + "step": 57975 + }, + { + "epoch": 5.409722870206214, + "grad_norm": NaN, + "learning_rate": 7.567103196122776e-06, + "loss": 0.0, + "step": 57976 + }, + { + "epoch": 5.4098161799010915, + "grad_norm": NaN, + "learning_rate": 7.564731316355949e-06, + "loss": 0.0, + "step": 57977 + }, + { + "epoch": 5.409909489595969, + "grad_norm": NaN, + "learning_rate": 7.562359798759782e-06, + "loss": 0.0, + "step": 57978 + }, + { + "epoch": 5.410002799290846, + "grad_norm": NaN, + "learning_rate": 7.55998864334037e-06, + "loss": 0.0, + "step": 57979 + }, + { + "epoch": 5.410096108985724, + "grad_norm": NaN, + "learning_rate": 7.5576178501037425e-06, + "loss": 0.0, + "step": 57980 + }, + { + "epoch": 5.410189418680601, + "grad_norm": NaN, + "learning_rate": 7.5552474190558435e-06, + "loss": 0.0, + "step": 57981 + }, + { + "epoch": 5.410282728375478, + "grad_norm": NaN, + "learning_rate": 7.552877350202768e-06, + "loss": 0.0, + "step": 57982 + }, + { + "epoch": 5.410376038070355, + "grad_norm": NaN, + "learning_rate": 7.550507643550563e-06, + "loss": 0.0, + "step": 57983 + }, + { + "epoch": 5.410469347765233, + "grad_norm": NaN, + "learning_rate": 7.548138299105172e-06, + "loss": 0.0, + "step": 57984 + }, + { + "epoch": 5.41056265746011, + "grad_norm": NaN, + "learning_rate": 7.54576931687269e-06, + "loss": 0.0, + "step": 57985 + }, + { + "epoch": 5.410655967154987, + "grad_norm": NaN, + "learning_rate": 7.543400696859115e-06, + "loss": 0.0, + "step": 57986 + }, + { + "epoch": 5.410749276849865, + "grad_norm": NaN, + "learning_rate": 7.541032439070488e-06, + "loss": 0.0, + "step": 57987 + }, + { + "epoch": 5.410842586544742, + "grad_norm": NaN, + "learning_rate": 7.538664543512807e-06, + "loss": 0.0, + "step": 57988 + }, + { + "epoch": 5.41093589623962, + "grad_norm": NaN, + "learning_rate": 7.5362970101920996e-06, + "loss": 0.0, + "step": 57989 + }, + { + "epoch": 5.411029205934496, + "grad_norm": NaN, + "learning_rate": 7.533929839114395e-06, + "loss": 0.0, + "step": 57990 + }, + { + "epoch": 5.411122515629374, + "grad_norm": NaN, + "learning_rate": 7.531563030285686e-06, + "loss": 0.0, + "step": 57991 + }, + { + "epoch": 5.411215825324251, + "grad_norm": NaN, + "learning_rate": 7.529196583712039e-06, + "loss": 0.0, + "step": 57992 + }, + { + "epoch": 5.4113091350191285, + "grad_norm": NaN, + "learning_rate": 7.526830499399428e-06, + "loss": 0.0, + "step": 57993 + }, + { + "epoch": 5.411402444714006, + "grad_norm": NaN, + "learning_rate": 7.524464777353884e-06, + "loss": 0.0, + "step": 57994 + }, + { + "epoch": 5.411495754408883, + "grad_norm": NaN, + "learning_rate": 7.522099417581417e-06, + "loss": 0.0, + "step": 57995 + }, + { + "epoch": 5.411589064103761, + "grad_norm": NaN, + "learning_rate": 7.5197344200880416e-06, + "loss": 0.0, + "step": 57996 + }, + { + "epoch": 5.411682373798637, + "grad_norm": NaN, + "learning_rate": 7.517369784879801e-06, + "loss": 0.0, + "step": 57997 + }, + { + "epoch": 5.411775683493515, + "grad_norm": NaN, + "learning_rate": 7.515005511962657e-06, + "loss": 0.0, + "step": 57998 + }, + { + "epoch": 5.411868993188392, + "grad_norm": NaN, + "learning_rate": 7.512641601342656e-06, + "loss": 0.0, + "step": 57999 + }, + { + "epoch": 5.4119623028832695, + "grad_norm": NaN, + "learning_rate": 7.510278053025809e-06, + "loss": 0.0, + "step": 58000 + }, + { + "epoch": 5.412055612578147, + "grad_norm": NaN, + "learning_rate": 7.507914867018095e-06, + "loss": 0.0, + "step": 58001 + }, + { + "epoch": 5.412148922273024, + "grad_norm": NaN, + "learning_rate": 7.505552043325541e-06, + "loss": 0.0, + "step": 58002 + }, + { + "epoch": 5.412242231967902, + "grad_norm": NaN, + "learning_rate": 7.503189581954194e-06, + "loss": 0.0, + "step": 58003 + }, + { + "epoch": 5.412335541662779, + "grad_norm": NaN, + "learning_rate": 7.5008274829099825e-06, + "loss": 0.0, + "step": 58004 + }, + { + "epoch": 5.412428851357656, + "grad_norm": NaN, + "learning_rate": 7.498465746198984e-06, + "loss": 0.0, + "step": 58005 + }, + { + "epoch": 5.412522161052533, + "grad_norm": NaN, + "learning_rate": 7.496104371827194e-06, + "loss": 0.0, + "step": 58006 + }, + { + "epoch": 5.412615470747411, + "grad_norm": NaN, + "learning_rate": 7.493743359800541e-06, + "loss": 0.0, + "step": 58007 + }, + { + "epoch": 5.412708780442288, + "grad_norm": NaN, + "learning_rate": 7.491382710125121e-06, + "loss": 0.0, + "step": 58008 + }, + { + "epoch": 5.412802090137165, + "grad_norm": NaN, + "learning_rate": 7.4890224228069116e-06, + "loss": 0.0, + "step": 58009 + }, + { + "epoch": 5.412895399832043, + "grad_norm": NaN, + "learning_rate": 7.486662497851859e-06, + "loss": 0.0, + "step": 58010 + }, + { + "epoch": 5.41298870952692, + "grad_norm": NaN, + "learning_rate": 7.48430293526604e-06, + "loss": 0.0, + "step": 58011 + }, + { + "epoch": 5.413082019221797, + "grad_norm": NaN, + "learning_rate": 7.481943735055434e-06, + "loss": 0.0, + "step": 58012 + }, + { + "epoch": 5.413175328916674, + "grad_norm": NaN, + "learning_rate": 7.479584897226005e-06, + "loss": 0.0, + "step": 58013 + }, + { + "epoch": 5.413268638611552, + "grad_norm": NaN, + "learning_rate": 7.4772264217837776e-06, + "loss": 0.0, + "step": 58014 + }, + { + "epoch": 5.413361948306429, + "grad_norm": NaN, + "learning_rate": 7.474868308734766e-06, + "loss": 0.0, + "step": 58015 + }, + { + "epoch": 5.4134552580013064, + "grad_norm": NaN, + "learning_rate": 7.4725105580849155e-06, + "loss": 0.0, + "step": 58016 + }, + { + "epoch": 5.413548567696184, + "grad_norm": NaN, + "learning_rate": 7.470153169840254e-06, + "loss": 0.0, + "step": 58017 + }, + { + "epoch": 5.413641877391061, + "grad_norm": NaN, + "learning_rate": 7.4677961440068104e-06, + "loss": 0.0, + "step": 58018 + }, + { + "epoch": 5.413735187085938, + "grad_norm": NaN, + "learning_rate": 7.465439480590479e-06, + "loss": 0.0, + "step": 58019 + }, + { + "epoch": 5.413828496780815, + "grad_norm": NaN, + "learning_rate": 7.463083179597357e-06, + "loss": 0.0, + "step": 58020 + }, + { + "epoch": 5.413921806475693, + "grad_norm": NaN, + "learning_rate": 7.460727241033404e-06, + "loss": 0.0, + "step": 58021 + }, + { + "epoch": 5.41401511617057, + "grad_norm": NaN, + "learning_rate": 7.45837166490455e-06, + "loss": 0.0, + "step": 58022 + }, + { + "epoch": 5.4141084258654475, + "grad_norm": NaN, + "learning_rate": 7.4560164512168564e-06, + "loss": 0.0, + "step": 58023 + }, + { + "epoch": 5.414201735560325, + "grad_norm": NaN, + "learning_rate": 7.453661599976285e-06, + "loss": 0.0, + "step": 58024 + }, + { + "epoch": 5.414295045255202, + "grad_norm": NaN, + "learning_rate": 7.451307111188831e-06, + "loss": 0.0, + "step": 58025 + }, + { + "epoch": 5.414388354950079, + "grad_norm": NaN, + "learning_rate": 7.4489529848604735e-06, + "loss": 0.0, + "step": 58026 + }, + { + "epoch": 5.414481664644956, + "grad_norm": NaN, + "learning_rate": 7.446599220997207e-06, + "loss": 0.0, + "step": 58027 + }, + { + "epoch": 5.414574974339834, + "grad_norm": NaN, + "learning_rate": 7.44424581960501e-06, + "loss": 0.0, + "step": 58028 + }, + { + "epoch": 5.414668284034711, + "grad_norm": NaN, + "learning_rate": 7.441892780689879e-06, + "loss": 0.0, + "step": 58029 + }, + { + "epoch": 5.4147615937295885, + "grad_norm": NaN, + "learning_rate": 7.439540104257774e-06, + "loss": 0.0, + "step": 58030 + }, + { + "epoch": 5.414854903424466, + "grad_norm": NaN, + "learning_rate": 7.437187790314692e-06, + "loss": 0.0, + "step": 58031 + }, + { + "epoch": 5.414948213119343, + "grad_norm": NaN, + "learning_rate": 7.43483583886661e-06, + "loss": 0.0, + "step": 58032 + }, + { + "epoch": 5.415041522814221, + "grad_norm": NaN, + "learning_rate": 7.432484249919507e-06, + "loss": 0.0, + "step": 58033 + }, + { + "epoch": 5.415134832509097, + "grad_norm": NaN, + "learning_rate": 7.4301330234793625e-06, + "loss": 0.0, + "step": 58034 + }, + { + "epoch": 5.415228142203975, + "grad_norm": NaN, + "learning_rate": 7.427782159552153e-06, + "loss": 0.0, + "step": 58035 + }, + { + "epoch": 5.415321451898852, + "grad_norm": NaN, + "learning_rate": 7.4254316581438765e-06, + "loss": 0.0, + "step": 58036 + }, + { + "epoch": 5.41541476159373, + "grad_norm": NaN, + "learning_rate": 7.423081519260477e-06, + "loss": 0.0, + "step": 58037 + }, + { + "epoch": 5.415508071288607, + "grad_norm": NaN, + "learning_rate": 7.420731742907965e-06, + "loss": 0.0, + "step": 58038 + }, + { + "epoch": 5.415601380983484, + "grad_norm": NaN, + "learning_rate": 7.418382329092287e-06, + "loss": 0.0, + "step": 58039 + }, + { + "epoch": 5.415694690678362, + "grad_norm": NaN, + "learning_rate": 7.416033277819422e-06, + "loss": 0.0, + "step": 58040 + }, + { + "epoch": 5.415788000373238, + "grad_norm": NaN, + "learning_rate": 7.413684589095381e-06, + "loss": 0.0, + "step": 58041 + }, + { + "epoch": 5.415881310068116, + "grad_norm": NaN, + "learning_rate": 7.4113362629260596e-06, + "loss": 0.0, + "step": 58042 + }, + { + "epoch": 5.415974619762993, + "grad_norm": NaN, + "learning_rate": 7.408988299317486e-06, + "loss": 0.0, + "step": 58043 + }, + { + "epoch": 5.416067929457871, + "grad_norm": NaN, + "learning_rate": 7.40664069827564e-06, + "loss": 0.0, + "step": 58044 + }, + { + "epoch": 5.416161239152748, + "grad_norm": NaN, + "learning_rate": 7.404293459806415e-06, + "loss": 0.0, + "step": 58045 + }, + { + "epoch": 5.4162545488476255, + "grad_norm": NaN, + "learning_rate": 7.401946583915874e-06, + "loss": 0.0, + "step": 58046 + }, + { + "epoch": 5.416347858542503, + "grad_norm": NaN, + "learning_rate": 7.399600070609946e-06, + "loss": 0.0, + "step": 58047 + }, + { + "epoch": 5.41644116823738, + "grad_norm": NaN, + "learning_rate": 7.397253919894558e-06, + "loss": 0.0, + "step": 58048 + }, + { + "epoch": 5.416534477932257, + "grad_norm": NaN, + "learning_rate": 7.394908131775723e-06, + "loss": 0.0, + "step": 58049 + }, + { + "epoch": 5.416627787627134, + "grad_norm": NaN, + "learning_rate": 7.392562706259436e-06, + "loss": 0.0, + "step": 58050 + }, + { + "epoch": 5.416721097322012, + "grad_norm": NaN, + "learning_rate": 7.390217643351559e-06, + "loss": 0.0, + "step": 58051 + }, + { + "epoch": 5.416814407016889, + "grad_norm": NaN, + "learning_rate": 7.387872943058138e-06, + "loss": 0.0, + "step": 58052 + }, + { + "epoch": 5.4169077167117665, + "grad_norm": NaN, + "learning_rate": 7.3855286053851325e-06, + "loss": 0.0, + "step": 58053 + }, + { + "epoch": 5.417001026406644, + "grad_norm": NaN, + "learning_rate": 7.383184630338457e-06, + "loss": 0.0, + "step": 58054 + }, + { + "epoch": 5.417094336101521, + "grad_norm": NaN, + "learning_rate": 7.380841017924088e-06, + "loss": 0.0, + "step": 58055 + }, + { + "epoch": 5.417187645796398, + "grad_norm": NaN, + "learning_rate": 7.3784977681480395e-06, + "loss": 0.0, + "step": 58056 + }, + { + "epoch": 5.417280955491275, + "grad_norm": NaN, + "learning_rate": 7.376154881016172e-06, + "loss": 0.0, + "step": 58057 + }, + { + "epoch": 5.417374265186153, + "grad_norm": NaN, + "learning_rate": 7.373812356534514e-06, + "loss": 0.0, + "step": 58058 + }, + { + "epoch": 5.41746757488103, + "grad_norm": NaN, + "learning_rate": 7.371470194709011e-06, + "loss": 0.0, + "step": 58059 + }, + { + "epoch": 5.417560884575908, + "grad_norm": NaN, + "learning_rate": 7.369128395545593e-06, + "loss": 0.0, + "step": 58060 + }, + { + "epoch": 5.417654194270785, + "grad_norm": NaN, + "learning_rate": 7.366786959050236e-06, + "loss": 0.0, + "step": 58061 + }, + { + "epoch": 5.417747503965662, + "grad_norm": NaN, + "learning_rate": 7.3644458852289035e-06, + "loss": 0.0, + "step": 58062 + }, + { + "epoch": 5.417840813660539, + "grad_norm": NaN, + "learning_rate": 7.362105174087524e-06, + "loss": 0.0, + "step": 58063 + }, + { + "epoch": 5.417934123355416, + "grad_norm": NaN, + "learning_rate": 7.3597648256320585e-06, + "loss": 0.0, + "step": 58064 + }, + { + "epoch": 5.418027433050294, + "grad_norm": NaN, + "learning_rate": 7.357424839868454e-06, + "loss": 0.0, + "step": 58065 + }, + { + "epoch": 5.418120742745171, + "grad_norm": NaN, + "learning_rate": 7.3550852168026694e-06, + "loss": 0.0, + "step": 58066 + }, + { + "epoch": 5.418214052440049, + "grad_norm": NaN, + "learning_rate": 7.352745956440637e-06, + "loss": 0.0, + "step": 58067 + }, + { + "epoch": 5.418307362134926, + "grad_norm": NaN, + "learning_rate": 7.350407058788333e-06, + "loss": 0.0, + "step": 58068 + }, + { + "epoch": 5.4184006718298034, + "grad_norm": NaN, + "learning_rate": 7.348068523851669e-06, + "loss": 0.0, + "step": 58069 + }, + { + "epoch": 5.41849398152468, + "grad_norm": NaN, + "learning_rate": 7.345730351636625e-06, + "loss": 0.0, + "step": 58070 + }, + { + "epoch": 5.418587291219557, + "grad_norm": NaN, + "learning_rate": 7.343392542149129e-06, + "loss": 0.0, + "step": 58071 + }, + { + "epoch": 5.418680600914435, + "grad_norm": NaN, + "learning_rate": 7.341055095395127e-06, + "loss": 0.0, + "step": 58072 + }, + { + "epoch": 5.418773910609312, + "grad_norm": NaN, + "learning_rate": 7.3387180113805625e-06, + "loss": 0.0, + "step": 58073 + }, + { + "epoch": 5.41886722030419, + "grad_norm": NaN, + "learning_rate": 7.336381290111382e-06, + "loss": 0.0, + "step": 58074 + }, + { + "epoch": 5.418960529999067, + "grad_norm": NaN, + "learning_rate": 7.3340449315935135e-06, + "loss": 0.0, + "step": 58075 + }, + { + "epoch": 5.4190538396939445, + "grad_norm": NaN, + "learning_rate": 7.331708935832919e-06, + "loss": 0.0, + "step": 58076 + }, + { + "epoch": 5.419147149388822, + "grad_norm": NaN, + "learning_rate": 7.329373302835528e-06, + "loss": 0.0, + "step": 58077 + }, + { + "epoch": 5.4192404590836984, + "grad_norm": NaN, + "learning_rate": 7.327038032607269e-06, + "loss": 0.0, + "step": 58078 + }, + { + "epoch": 5.419333768778576, + "grad_norm": NaN, + "learning_rate": 7.324703125154102e-06, + "loss": 0.0, + "step": 58079 + }, + { + "epoch": 5.419427078473453, + "grad_norm": NaN, + "learning_rate": 7.322368580481941e-06, + "loss": 0.0, + "step": 58080 + }, + { + "epoch": 5.419520388168331, + "grad_norm": NaN, + "learning_rate": 7.32003439859673e-06, + "loss": 0.0, + "step": 58081 + }, + { + "epoch": 5.419613697863208, + "grad_norm": NaN, + "learning_rate": 7.317700579504415e-06, + "loss": 0.0, + "step": 58082 + }, + { + "epoch": 5.4197070075580855, + "grad_norm": NaN, + "learning_rate": 7.3153671232109245e-06, + "loss": 0.0, + "step": 58083 + }, + { + "epoch": 5.419800317252963, + "grad_norm": NaN, + "learning_rate": 7.313034029722187e-06, + "loss": 0.0, + "step": 58084 + }, + { + "epoch": 5.4198936269478395, + "grad_norm": NaN, + "learning_rate": 7.31070129904413e-06, + "loss": 0.0, + "step": 58085 + }, + { + "epoch": 5.419986936642717, + "grad_norm": NaN, + "learning_rate": 7.308368931182684e-06, + "loss": 0.0, + "step": 58086 + }, + { + "epoch": 5.420080246337594, + "grad_norm": NaN, + "learning_rate": 7.3060369261438104e-06, + "loss": 0.0, + "step": 58087 + }, + { + "epoch": 5.420173556032472, + "grad_norm": NaN, + "learning_rate": 7.30370528393342e-06, + "loss": 0.0, + "step": 58088 + }, + { + "epoch": 5.420266865727349, + "grad_norm": NaN, + "learning_rate": 7.301374004557409e-06, + "loss": 0.0, + "step": 58089 + }, + { + "epoch": 5.420360175422227, + "grad_norm": NaN, + "learning_rate": 7.299043088021739e-06, + "loss": 0.0, + "step": 58090 + }, + { + "epoch": 5.420453485117104, + "grad_norm": NaN, + "learning_rate": 7.296712534332355e-06, + "loss": 0.0, + "step": 58091 + }, + { + "epoch": 5.420546794811981, + "grad_norm": NaN, + "learning_rate": 7.2943823434951196e-06, + "loss": 0.0, + "step": 58092 + }, + { + "epoch": 5.420640104506858, + "grad_norm": NaN, + "learning_rate": 7.292052515516028e-06, + "loss": 0.0, + "step": 58093 + }, + { + "epoch": 5.420733414201735, + "grad_norm": NaN, + "learning_rate": 7.2897230504009916e-06, + "loss": 0.0, + "step": 58094 + }, + { + "epoch": 5.420826723896613, + "grad_norm": NaN, + "learning_rate": 7.287393948155856e-06, + "loss": 0.0, + "step": 58095 + }, + { + "epoch": 5.42092003359149, + "grad_norm": NaN, + "learning_rate": 7.2850652087866504e-06, + "loss": 0.0, + "step": 58096 + }, + { + "epoch": 5.421013343286368, + "grad_norm": NaN, + "learning_rate": 7.282736832299235e-06, + "loss": 0.0, + "step": 58097 + }, + { + "epoch": 5.421106652981245, + "grad_norm": NaN, + "learning_rate": 7.28040881869954e-06, + "loss": 0.0, + "step": 58098 + }, + { + "epoch": 5.421199962676122, + "grad_norm": NaN, + "learning_rate": 7.278081167993493e-06, + "loss": 0.0, + "step": 58099 + }, + { + "epoch": 5.421293272370999, + "grad_norm": NaN, + "learning_rate": 7.275753880187007e-06, + "loss": 0.0, + "step": 58100 + }, + { + "epoch": 5.421386582065876, + "grad_norm": NaN, + "learning_rate": 7.273426955285993e-06, + "loss": 0.0, + "step": 58101 + }, + { + "epoch": 5.421479891760754, + "grad_norm": NaN, + "learning_rate": 7.271100393296398e-06, + "loss": 0.0, + "step": 58102 + }, + { + "epoch": 5.421573201455631, + "grad_norm": NaN, + "learning_rate": 7.268774194224097e-06, + "loss": 0.0, + "step": 58103 + }, + { + "epoch": 5.421666511150509, + "grad_norm": NaN, + "learning_rate": 7.2664483580750224e-06, + "loss": 0.0, + "step": 58104 + }, + { + "epoch": 5.421759820845386, + "grad_norm": NaN, + "learning_rate": 7.2641228848551e-06, + "loss": 0.0, + "step": 58105 + }, + { + "epoch": 5.4218531305402635, + "grad_norm": NaN, + "learning_rate": 7.261797774570227e-06, + "loss": 0.0, + "step": 58106 + }, + { + "epoch": 5.42194644023514, + "grad_norm": NaN, + "learning_rate": 7.259473027226315e-06, + "loss": 0.0, + "step": 58107 + }, + { + "epoch": 5.4220397499300175, + "grad_norm": NaN, + "learning_rate": 7.257148642829291e-06, + "loss": 0.0, + "step": 58108 + }, + { + "epoch": 5.422133059624895, + "grad_norm": NaN, + "learning_rate": 7.254824621385036e-06, + "loss": 0.0, + "step": 58109 + }, + { + "epoch": 5.422226369319772, + "grad_norm": NaN, + "learning_rate": 7.252500962899493e-06, + "loss": 0.0, + "step": 58110 + }, + { + "epoch": 5.42231967901465, + "grad_norm": NaN, + "learning_rate": 7.250177667378559e-06, + "loss": 0.0, + "step": 58111 + }, + { + "epoch": 5.422412988709527, + "grad_norm": NaN, + "learning_rate": 7.247854734828129e-06, + "loss": 0.0, + "step": 58112 + }, + { + "epoch": 5.422506298404405, + "grad_norm": NaN, + "learning_rate": 7.245532165254114e-06, + "loss": 0.0, + "step": 58113 + }, + { + "epoch": 5.422599608099281, + "grad_norm": NaN, + "learning_rate": 7.243209958662427e-06, + "loss": 0.0, + "step": 58114 + }, + { + "epoch": 5.4226929177941585, + "grad_norm": NaN, + "learning_rate": 7.2408881150589795e-06, + "loss": 0.0, + "step": 58115 + }, + { + "epoch": 5.422786227489036, + "grad_norm": NaN, + "learning_rate": 7.2385666344496495e-06, + "loss": 0.0, + "step": 58116 + }, + { + "epoch": 5.422879537183913, + "grad_norm": NaN, + "learning_rate": 7.236245516840367e-06, + "loss": 0.0, + "step": 58117 + }, + { + "epoch": 5.422972846878791, + "grad_norm": NaN, + "learning_rate": 7.233924762237009e-06, + "loss": 0.0, + "step": 58118 + }, + { + "epoch": 5.423066156573668, + "grad_norm": NaN, + "learning_rate": 7.231604370645505e-06, + "loss": 0.0, + "step": 58119 + }, + { + "epoch": 5.423159466268546, + "grad_norm": NaN, + "learning_rate": 7.229284342071734e-06, + "loss": 0.0, + "step": 58120 + }, + { + "epoch": 5.423252775963423, + "grad_norm": NaN, + "learning_rate": 7.226964676521607e-06, + "loss": 0.0, + "step": 58121 + }, + { + "epoch": 5.4233460856583, + "grad_norm": NaN, + "learning_rate": 7.224645374001004e-06, + "loss": 0.0, + "step": 58122 + }, + { + "epoch": 5.423439395353177, + "grad_norm": NaN, + "learning_rate": 7.222326434515835e-06, + "loss": 0.0, + "step": 58123 + }, + { + "epoch": 5.423532705048054, + "grad_norm": NaN, + "learning_rate": 7.220007858072014e-06, + "loss": 0.0, + "step": 58124 + }, + { + "epoch": 5.423626014742932, + "grad_norm": NaN, + "learning_rate": 7.217689644675401e-06, + "loss": 0.0, + "step": 58125 + }, + { + "epoch": 5.423719324437809, + "grad_norm": NaN, + "learning_rate": 7.2153717943319085e-06, + "loss": 0.0, + "step": 58126 + }, + { + "epoch": 5.423812634132687, + "grad_norm": NaN, + "learning_rate": 7.213054307047433e-06, + "loss": 0.0, + "step": 58127 + }, + { + "epoch": 5.423905943827564, + "grad_norm": NaN, + "learning_rate": 7.2107371828278695e-06, + "loss": 0.0, + "step": 58128 + }, + { + "epoch": 5.423999253522441, + "grad_norm": NaN, + "learning_rate": 7.208420421679096e-06, + "loss": 0.0, + "step": 58129 + }, + { + "epoch": 5.424092563217318, + "grad_norm": NaN, + "learning_rate": 7.206104023607023e-06, + "loss": 0.0, + "step": 58130 + }, + { + "epoch": 5.4241858729121954, + "grad_norm": NaN, + "learning_rate": 7.203787988617515e-06, + "loss": 0.0, + "step": 58131 + }, + { + "epoch": 5.424279182607073, + "grad_norm": NaN, + "learning_rate": 7.2014723167165005e-06, + "loss": 0.0, + "step": 58132 + }, + { + "epoch": 5.42437249230195, + "grad_norm": NaN, + "learning_rate": 7.199157007909806e-06, + "loss": 0.0, + "step": 58133 + }, + { + "epoch": 5.424465801996828, + "grad_norm": NaN, + "learning_rate": 7.196842062203378e-06, + "loss": 0.0, + "step": 58134 + }, + { + "epoch": 5.424559111691705, + "grad_norm": NaN, + "learning_rate": 7.194527479603063e-06, + "loss": 0.0, + "step": 58135 + }, + { + "epoch": 5.424652421386582, + "grad_norm": NaN, + "learning_rate": 7.192213260114771e-06, + "loss": 0.0, + "step": 58136 + }, + { + "epoch": 5.424745731081459, + "grad_norm": NaN, + "learning_rate": 7.189899403744381e-06, + "loss": 0.0, + "step": 58137 + }, + { + "epoch": 5.4248390407763365, + "grad_norm": NaN, + "learning_rate": 7.187585910497773e-06, + "loss": 0.0, + "step": 58138 + }, + { + "epoch": 5.424932350471214, + "grad_norm": NaN, + "learning_rate": 7.185272780380824e-06, + "loss": 0.0, + "step": 58139 + }, + { + "epoch": 5.425025660166091, + "grad_norm": NaN, + "learning_rate": 7.1829600133994304e-06, + "loss": 0.0, + "step": 58140 + }, + { + "epoch": 5.425118969860969, + "grad_norm": NaN, + "learning_rate": 7.180647609559453e-06, + "loss": 0.0, + "step": 58141 + }, + { + "epoch": 5.425212279555846, + "grad_norm": NaN, + "learning_rate": 7.178335568866789e-06, + "loss": 0.0, + "step": 58142 + }, + { + "epoch": 5.425305589250723, + "grad_norm": NaN, + "learning_rate": 7.176023891327315e-06, + "loss": 0.0, + "step": 58143 + }, + { + "epoch": 5.4253988989456, + "grad_norm": NaN, + "learning_rate": 7.173712576946894e-06, + "loss": 0.0, + "step": 58144 + }, + { + "epoch": 5.4254922086404775, + "grad_norm": NaN, + "learning_rate": 7.171401625731421e-06, + "loss": 0.0, + "step": 58145 + }, + { + "epoch": 5.425585518335355, + "grad_norm": NaN, + "learning_rate": 7.169091037686758e-06, + "loss": 0.0, + "step": 58146 + }, + { + "epoch": 5.425678828030232, + "grad_norm": NaN, + "learning_rate": 7.1667808128188005e-06, + "loss": 0.0, + "step": 58147 + }, + { + "epoch": 5.42577213772511, + "grad_norm": NaN, + "learning_rate": 7.164470951133394e-06, + "loss": 0.0, + "step": 58148 + }, + { + "epoch": 5.425865447419987, + "grad_norm": NaN, + "learning_rate": 7.162161452636434e-06, + "loss": 0.0, + "step": 58149 + }, + { + "epoch": 5.425958757114865, + "grad_norm": NaN, + "learning_rate": 7.1598523173337805e-06, + "loss": 0.0, + "step": 58150 + }, + { + "epoch": 5.426052066809741, + "grad_norm": NaN, + "learning_rate": 7.157543545231315e-06, + "loss": 0.0, + "step": 58151 + }, + { + "epoch": 5.426145376504619, + "grad_norm": NaN, + "learning_rate": 7.1552351363349135e-06, + "loss": 0.0, + "step": 58152 + }, + { + "epoch": 5.426238686199496, + "grad_norm": NaN, + "learning_rate": 7.1529270906504234e-06, + "loss": 0.0, + "step": 58153 + }, + { + "epoch": 5.426331995894373, + "grad_norm": NaN, + "learning_rate": 7.150619408183722e-06, + "loss": 0.0, + "step": 58154 + }, + { + "epoch": 5.426425305589251, + "grad_norm": NaN, + "learning_rate": 7.148312088940689e-06, + "loss": 0.0, + "step": 58155 + }, + { + "epoch": 5.426518615284128, + "grad_norm": NaN, + "learning_rate": 7.146005132927185e-06, + "loss": 0.0, + "step": 58156 + }, + { + "epoch": 5.426611924979006, + "grad_norm": NaN, + "learning_rate": 7.143698540149057e-06, + "loss": 0.0, + "step": 58157 + }, + { + "epoch": 5.426705234673882, + "grad_norm": NaN, + "learning_rate": 7.141392310612198e-06, + "loss": 0.0, + "step": 58158 + }, + { + "epoch": 5.42679854436876, + "grad_norm": NaN, + "learning_rate": 7.139086444322456e-06, + "loss": 0.0, + "step": 58159 + }, + { + "epoch": 5.426891854063637, + "grad_norm": NaN, + "learning_rate": 7.136780941285708e-06, + "loss": 0.0, + "step": 58160 + }, + { + "epoch": 5.4269851637585145, + "grad_norm": NaN, + "learning_rate": 7.1344758015077995e-06, + "loss": 0.0, + "step": 58161 + }, + { + "epoch": 5.427078473453392, + "grad_norm": NaN, + "learning_rate": 7.132171024994593e-06, + "loss": 0.0, + "step": 58162 + }, + { + "epoch": 5.427171783148269, + "grad_norm": NaN, + "learning_rate": 7.1298666117519664e-06, + "loss": 0.0, + "step": 58163 + }, + { + "epoch": 5.427265092843147, + "grad_norm": NaN, + "learning_rate": 7.1275625617857656e-06, + "loss": 0.0, + "step": 58164 + }, + { + "epoch": 5.427358402538024, + "grad_norm": NaN, + "learning_rate": 7.1252588751018525e-06, + "loss": 0.0, + "step": 58165 + }, + { + "epoch": 5.427451712232901, + "grad_norm": NaN, + "learning_rate": 7.122955551706072e-06, + "loss": 0.0, + "step": 58166 + }, + { + "epoch": 5.427545021927778, + "grad_norm": NaN, + "learning_rate": 7.120652591604304e-06, + "loss": 0.0, + "step": 58167 + }, + { + "epoch": 5.4276383316226555, + "grad_norm": NaN, + "learning_rate": 7.118349994802392e-06, + "loss": 0.0, + "step": 58168 + }, + { + "epoch": 5.427731641317533, + "grad_norm": NaN, + "learning_rate": 7.116047761306181e-06, + "loss": 0.0, + "step": 58169 + }, + { + "epoch": 5.42782495101241, + "grad_norm": NaN, + "learning_rate": 7.1137458911215525e-06, + "loss": 0.0, + "step": 58170 + }, + { + "epoch": 5.427918260707288, + "grad_norm": NaN, + "learning_rate": 7.111444384254317e-06, + "loss": 0.0, + "step": 58171 + }, + { + "epoch": 5.428011570402165, + "grad_norm": NaN, + "learning_rate": 7.109143240710369e-06, + "loss": 0.0, + "step": 58172 + }, + { + "epoch": 5.428104880097042, + "grad_norm": NaN, + "learning_rate": 7.106842460495538e-06, + "loss": 0.0, + "step": 58173 + }, + { + "epoch": 5.428198189791919, + "grad_norm": NaN, + "learning_rate": 7.104542043615685e-06, + "loss": 0.0, + "step": 58174 + }, + { + "epoch": 5.428291499486797, + "grad_norm": NaN, + "learning_rate": 7.102241990076657e-06, + "loss": 0.0, + "step": 58175 + }, + { + "epoch": 5.428384809181674, + "grad_norm": NaN, + "learning_rate": 7.099942299884281e-06, + "loss": 0.0, + "step": 58176 + }, + { + "epoch": 5.428478118876551, + "grad_norm": NaN, + "learning_rate": 7.097642973044437e-06, + "loss": 0.0, + "step": 58177 + }, + { + "epoch": 5.428571428571429, + "grad_norm": NaN, + "learning_rate": 7.095344009562937e-06, + "loss": 0.0, + "step": 58178 + }, + { + "epoch": 5.428664738266306, + "grad_norm": NaN, + "learning_rate": 7.093045409445658e-06, + "loss": 0.0, + "step": 58179 + }, + { + "epoch": 5.428758047961183, + "grad_norm": NaN, + "learning_rate": 7.090747172698447e-06, + "loss": 0.0, + "step": 58180 + }, + { + "epoch": 5.42885135765606, + "grad_norm": NaN, + "learning_rate": 7.088449299327115e-06, + "loss": 0.0, + "step": 58181 + }, + { + "epoch": 5.428944667350938, + "grad_norm": NaN, + "learning_rate": 7.086151789337524e-06, + "loss": 0.0, + "step": 58182 + }, + { + "epoch": 5.429037977045815, + "grad_norm": NaN, + "learning_rate": 7.083854642735521e-06, + "loss": 0.0, + "step": 58183 + }, + { + "epoch": 5.4291312867406925, + "grad_norm": NaN, + "learning_rate": 7.081557859526931e-06, + "loss": 0.0, + "step": 58184 + }, + { + "epoch": 5.42922459643557, + "grad_norm": NaN, + "learning_rate": 7.079261439717604e-06, + "loss": 0.0, + "step": 58185 + }, + { + "epoch": 5.429317906130447, + "grad_norm": NaN, + "learning_rate": 7.0769653833133984e-06, + "loss": 0.0, + "step": 58186 + }, + { + "epoch": 5.429411215825324, + "grad_norm": NaN, + "learning_rate": 7.074669690320112e-06, + "loss": 0.0, + "step": 58187 + }, + { + "epoch": 5.429504525520201, + "grad_norm": NaN, + "learning_rate": 7.072374360743605e-06, + "loss": 0.0, + "step": 58188 + }, + { + "epoch": 5.429597835215079, + "grad_norm": NaN, + "learning_rate": 7.070079394589723e-06, + "loss": 0.0, + "step": 58189 + }, + { + "epoch": 5.429691144909956, + "grad_norm": NaN, + "learning_rate": 7.0677847918642784e-06, + "loss": 0.0, + "step": 58190 + }, + { + "epoch": 5.4297844546048335, + "grad_norm": NaN, + "learning_rate": 7.065490552573133e-06, + "loss": 0.0, + "step": 58191 + }, + { + "epoch": 5.429877764299711, + "grad_norm": NaN, + "learning_rate": 7.063196676722083e-06, + "loss": 0.0, + "step": 58192 + }, + { + "epoch": 5.429971073994588, + "grad_norm": NaN, + "learning_rate": 7.060903164317006e-06, + "loss": 0.0, + "step": 58193 + }, + { + "epoch": 5.430064383689466, + "grad_norm": NaN, + "learning_rate": 7.058610015363697e-06, + "loss": 0.0, + "step": 58194 + }, + { + "epoch": 5.430157693384342, + "grad_norm": NaN, + "learning_rate": 7.056317229868002e-06, + "loss": 0.0, + "step": 58195 + }, + { + "epoch": 5.43025100307922, + "grad_norm": NaN, + "learning_rate": 7.054024807835751e-06, + "loss": 0.0, + "step": 58196 + }, + { + "epoch": 5.430344312774097, + "grad_norm": NaN, + "learning_rate": 7.051732749272771e-06, + "loss": 0.0, + "step": 58197 + }, + { + "epoch": 5.4304376224689745, + "grad_norm": NaN, + "learning_rate": 7.04944105418489e-06, + "loss": 0.0, + "step": 58198 + }, + { + "epoch": 5.430530932163852, + "grad_norm": NaN, + "learning_rate": 7.047149722577922e-06, + "loss": 0.0, + "step": 58199 + }, + { + "epoch": 5.430624241858729, + "grad_norm": NaN, + "learning_rate": 7.0448587544577275e-06, + "loss": 0.0, + "step": 58200 + }, + { + "epoch": 5.430717551553607, + "grad_norm": NaN, + "learning_rate": 7.042568149830102e-06, + "loss": 0.0, + "step": 58201 + }, + { + "epoch": 5.430810861248483, + "grad_norm": NaN, + "learning_rate": 7.040277908700876e-06, + "loss": 0.0, + "step": 58202 + }, + { + "epoch": 5.430904170943361, + "grad_norm": NaN, + "learning_rate": 7.037988031075875e-06, + "loss": 0.0, + "step": 58203 + }, + { + "epoch": 5.430997480638238, + "grad_norm": NaN, + "learning_rate": 7.0356985169609304e-06, + "loss": 0.0, + "step": 58204 + }, + { + "epoch": 5.431090790333116, + "grad_norm": NaN, + "learning_rate": 7.033409366361853e-06, + "loss": 0.0, + "step": 58205 + }, + { + "epoch": 5.431184100027993, + "grad_norm": NaN, + "learning_rate": 7.0311205792844545e-06, + "loss": 0.0, + "step": 58206 + }, + { + "epoch": 5.43127740972287, + "grad_norm": NaN, + "learning_rate": 7.028832155734582e-06, + "loss": 0.0, + "step": 58207 + }, + { + "epoch": 5.431370719417748, + "grad_norm": NaN, + "learning_rate": 7.026544095718029e-06, + "loss": 0.0, + "step": 58208 + }, + { + "epoch": 5.431464029112625, + "grad_norm": NaN, + "learning_rate": 7.024256399240624e-06, + "loss": 0.0, + "step": 58209 + }, + { + "epoch": 5.431557338807502, + "grad_norm": NaN, + "learning_rate": 7.02196906630818e-06, + "loss": 0.0, + "step": 58210 + }, + { + "epoch": 5.431650648502379, + "grad_norm": NaN, + "learning_rate": 7.01968209692651e-06, + "loss": 0.0, + "step": 58211 + }, + { + "epoch": 5.431743958197257, + "grad_norm": NaN, + "learning_rate": 7.017395491101457e-06, + "loss": 0.0, + "step": 58212 + }, + { + "epoch": 5.431837267892134, + "grad_norm": NaN, + "learning_rate": 7.015109248838785e-06, + "loss": 0.0, + "step": 58213 + }, + { + "epoch": 5.4319305775870115, + "grad_norm": NaN, + "learning_rate": 7.012823370144355e-06, + "loss": 0.0, + "step": 58214 + }, + { + "epoch": 5.432023887281889, + "grad_norm": NaN, + "learning_rate": 7.0105378550239455e-06, + "loss": 0.0, + "step": 58215 + }, + { + "epoch": 5.432117196976765, + "grad_norm": NaN, + "learning_rate": 7.008252703483386e-06, + "loss": 0.0, + "step": 58216 + }, + { + "epoch": 5.432210506671643, + "grad_norm": NaN, + "learning_rate": 7.005967915528488e-06, + "loss": 0.0, + "step": 58217 + }, + { + "epoch": 5.43230381636652, + "grad_norm": NaN, + "learning_rate": 7.003683491165063e-06, + "loss": 0.0, + "step": 58218 + }, + { + "epoch": 5.432397126061398, + "grad_norm": NaN, + "learning_rate": 7.001399430398891e-06, + "loss": 0.0, + "step": 58219 + }, + { + "epoch": 5.432490435756275, + "grad_norm": NaN, + "learning_rate": 6.999115733235816e-06, + "loss": 0.0, + "step": 58220 + }, + { + "epoch": 5.4325837454511525, + "grad_norm": NaN, + "learning_rate": 6.9968323996816176e-06, + "loss": 0.0, + "step": 58221 + }, + { + "epoch": 5.43267705514603, + "grad_norm": NaN, + "learning_rate": 6.994549429742125e-06, + "loss": 0.0, + "step": 58222 + }, + { + "epoch": 5.432770364840907, + "grad_norm": NaN, + "learning_rate": 6.9922668234231315e-06, + "loss": 0.0, + "step": 58223 + }, + { + "epoch": 5.432863674535784, + "grad_norm": NaN, + "learning_rate": 6.989984580730434e-06, + "loss": 0.0, + "step": 58224 + }, + { + "epoch": 5.432956984230661, + "grad_norm": NaN, + "learning_rate": 6.987702701669845e-06, + "loss": 0.0, + "step": 58225 + }, + { + "epoch": 5.433050293925539, + "grad_norm": NaN, + "learning_rate": 6.985421186247176e-06, + "loss": 0.0, + "step": 58226 + }, + { + "epoch": 5.433143603620416, + "grad_norm": NaN, + "learning_rate": 6.9831400344682045e-06, + "loss": 0.0, + "step": 58227 + }, + { + "epoch": 5.433236913315294, + "grad_norm": NaN, + "learning_rate": 6.980859246338744e-06, + "loss": 0.0, + "step": 58228 + }, + { + "epoch": 5.433330223010171, + "grad_norm": NaN, + "learning_rate": 6.978578821864605e-06, + "loss": 0.0, + "step": 58229 + }, + { + "epoch": 5.433423532705048, + "grad_norm": NaN, + "learning_rate": 6.976298761051552e-06, + "loss": 0.0, + "step": 58230 + }, + { + "epoch": 5.433516842399925, + "grad_norm": NaN, + "learning_rate": 6.974019063905428e-06, + "loss": 0.0, + "step": 58231 + }, + { + "epoch": 5.433610152094802, + "grad_norm": NaN, + "learning_rate": 6.9717397304319966e-06, + "loss": 0.0, + "step": 58232 + }, + { + "epoch": 5.43370346178968, + "grad_norm": NaN, + "learning_rate": 6.9694607606370525e-06, + "loss": 0.0, + "step": 58233 + }, + { + "epoch": 5.433796771484557, + "grad_norm": NaN, + "learning_rate": 6.9671821545264075e-06, + "loss": 0.0, + "step": 58234 + }, + { + "epoch": 5.433890081179435, + "grad_norm": NaN, + "learning_rate": 6.964903912105857e-06, + "loss": 0.0, + "step": 58235 + }, + { + "epoch": 5.433983390874312, + "grad_norm": NaN, + "learning_rate": 6.96262603338118e-06, + "loss": 0.0, + "step": 58236 + }, + { + "epoch": 5.4340767005691895, + "grad_norm": NaN, + "learning_rate": 6.960348518358172e-06, + "loss": 0.0, + "step": 58237 + }, + { + "epoch": 5.434170010264067, + "grad_norm": NaN, + "learning_rate": 6.958071367042628e-06, + "loss": 0.0, + "step": 58238 + }, + { + "epoch": 5.434263319958943, + "grad_norm": NaN, + "learning_rate": 6.955794579440344e-06, + "loss": 0.0, + "step": 58239 + }, + { + "epoch": 5.434356629653821, + "grad_norm": NaN, + "learning_rate": 6.953518155557097e-06, + "loss": 0.0, + "step": 58240 + }, + { + "epoch": 5.434449939348698, + "grad_norm": NaN, + "learning_rate": 6.951242095398668e-06, + "loss": 0.0, + "step": 58241 + }, + { + "epoch": 5.434543249043576, + "grad_norm": NaN, + "learning_rate": 6.948966398970868e-06, + "loss": 0.0, + "step": 58242 + }, + { + "epoch": 5.434636558738453, + "grad_norm": NaN, + "learning_rate": 6.946691066279475e-06, + "loss": 0.0, + "step": 58243 + }, + { + "epoch": 5.4347298684333305, + "grad_norm": NaN, + "learning_rate": 6.9444160973302685e-06, + "loss": 0.0, + "step": 58244 + }, + { + "epoch": 5.434823178128208, + "grad_norm": NaN, + "learning_rate": 6.942141492129028e-06, + "loss": 0.0, + "step": 58245 + }, + { + "epoch": 5.4349164878230845, + "grad_norm": NaN, + "learning_rate": 6.939867250681547e-06, + "loss": 0.0, + "step": 58246 + }, + { + "epoch": 5.435009797517962, + "grad_norm": NaN, + "learning_rate": 6.937593372993605e-06, + "loss": 0.0, + "step": 58247 + }, + { + "epoch": 5.435103107212839, + "grad_norm": NaN, + "learning_rate": 6.935319859070998e-06, + "loss": 0.0, + "step": 58248 + }, + { + "epoch": 5.435196416907717, + "grad_norm": NaN, + "learning_rate": 6.933046708919471e-06, + "loss": 0.0, + "step": 58249 + }, + { + "epoch": 5.435289726602594, + "grad_norm": NaN, + "learning_rate": 6.930773922544852e-06, + "loss": 0.0, + "step": 58250 + }, + { + "epoch": 5.4353830362974715, + "grad_norm": NaN, + "learning_rate": 6.928501499952871e-06, + "loss": 0.0, + "step": 58251 + }, + { + "epoch": 5.435476345992349, + "grad_norm": NaN, + "learning_rate": 6.926229441149339e-06, + "loss": 0.0, + "step": 58252 + }, + { + "epoch": 5.4355696556872255, + "grad_norm": NaN, + "learning_rate": 6.923957746140036e-06, + "loss": 0.0, + "step": 58253 + }, + { + "epoch": 5.435662965382103, + "grad_norm": NaN, + "learning_rate": 6.921686414930705e-06, + "loss": 0.0, + "step": 58254 + }, + { + "epoch": 5.43575627507698, + "grad_norm": NaN, + "learning_rate": 6.91941544752716e-06, + "loss": 0.0, + "step": 58255 + }, + { + "epoch": 5.435849584771858, + "grad_norm": NaN, + "learning_rate": 6.917144843935146e-06, + "loss": 0.0, + "step": 58256 + }, + { + "epoch": 5.435942894466735, + "grad_norm": NaN, + "learning_rate": 6.914874604160459e-06, + "loss": 0.0, + "step": 58257 + }, + { + "epoch": 5.436036204161613, + "grad_norm": NaN, + "learning_rate": 6.91260472820886e-06, + "loss": 0.0, + "step": 58258 + }, + { + "epoch": 5.43612951385649, + "grad_norm": NaN, + "learning_rate": 6.910335216086111e-06, + "loss": 0.0, + "step": 58259 + }, + { + "epoch": 5.4362228235513665, + "grad_norm": NaN, + "learning_rate": 6.908066067798007e-06, + "loss": 0.0, + "step": 58260 + }, + { + "epoch": 5.436316133246244, + "grad_norm": NaN, + "learning_rate": 6.905797283350295e-06, + "loss": 0.0, + "step": 58261 + }, + { + "epoch": 5.436409442941121, + "grad_norm": NaN, + "learning_rate": 6.9035288627487525e-06, + "loss": 0.0, + "step": 58262 + }, + { + "epoch": 5.436502752635999, + "grad_norm": NaN, + "learning_rate": 6.901260805999143e-06, + "loss": 0.0, + "step": 58263 + }, + { + "epoch": 5.436596062330876, + "grad_norm": NaN, + "learning_rate": 6.898993113107243e-06, + "loss": 0.0, + "step": 58264 + }, + { + "epoch": 5.436689372025754, + "grad_norm": NaN, + "learning_rate": 6.896725784078816e-06, + "loss": 0.0, + "step": 58265 + }, + { + "epoch": 5.436782681720631, + "grad_norm": NaN, + "learning_rate": 6.894458818919624e-06, + "loss": 0.0, + "step": 58266 + }, + { + "epoch": 5.4368759914155085, + "grad_norm": NaN, + "learning_rate": 6.892192217635445e-06, + "loss": 0.0, + "step": 58267 + }, + { + "epoch": 5.436969301110385, + "grad_norm": NaN, + "learning_rate": 6.889925980232009e-06, + "loss": 0.0, + "step": 58268 + }, + { + "epoch": 5.437062610805262, + "grad_norm": NaN, + "learning_rate": 6.887660106715109e-06, + "loss": 0.0, + "step": 58269 + }, + { + "epoch": 5.43715592050014, + "grad_norm": NaN, + "learning_rate": 6.8853945970904936e-06, + "loss": 0.0, + "step": 58270 + }, + { + "epoch": 5.437249230195017, + "grad_norm": NaN, + "learning_rate": 6.883129451363939e-06, + "loss": 0.0, + "step": 58271 + }, + { + "epoch": 5.437342539889895, + "grad_norm": NaN, + "learning_rate": 6.880864669541175e-06, + "loss": 0.0, + "step": 58272 + }, + { + "epoch": 5.437435849584772, + "grad_norm": NaN, + "learning_rate": 6.878600251627981e-06, + "loss": 0.0, + "step": 58273 + }, + { + "epoch": 5.4375291592796495, + "grad_norm": NaN, + "learning_rate": 6.876336197630117e-06, + "loss": 0.0, + "step": 58274 + }, + { + "epoch": 5.437622468974526, + "grad_norm": NaN, + "learning_rate": 6.87407250755333e-06, + "loss": 0.0, + "step": 58275 + }, + { + "epoch": 5.4377157786694035, + "grad_norm": NaN, + "learning_rate": 6.871809181403365e-06, + "loss": 0.0, + "step": 58276 + }, + { + "epoch": 5.437809088364281, + "grad_norm": NaN, + "learning_rate": 6.869546219186017e-06, + "loss": 0.0, + "step": 58277 + }, + { + "epoch": 5.437902398059158, + "grad_norm": NaN, + "learning_rate": 6.867283620906999e-06, + "loss": 0.0, + "step": 58278 + }, + { + "epoch": 5.437995707754036, + "grad_norm": NaN, + "learning_rate": 6.865021386572089e-06, + "loss": 0.0, + "step": 58279 + }, + { + "epoch": 5.438089017448913, + "grad_norm": NaN, + "learning_rate": 6.862759516187016e-06, + "loss": 0.0, + "step": 58280 + }, + { + "epoch": 5.438182327143791, + "grad_norm": NaN, + "learning_rate": 6.860498009757559e-06, + "loss": 0.0, + "step": 58281 + }, + { + "epoch": 5.438275636838668, + "grad_norm": NaN, + "learning_rate": 6.858236867289446e-06, + "loss": 0.0, + "step": 58282 + }, + { + "epoch": 5.4383689465335445, + "grad_norm": NaN, + "learning_rate": 6.85597608878844e-06, + "loss": 0.0, + "step": 58283 + }, + { + "epoch": 5.438462256228422, + "grad_norm": NaN, + "learning_rate": 6.853715674260285e-06, + "loss": 0.0, + "step": 58284 + }, + { + "epoch": 5.438555565923299, + "grad_norm": NaN, + "learning_rate": 6.851455623710728e-06, + "loss": 0.0, + "step": 58285 + }, + { + "epoch": 5.438648875618177, + "grad_norm": NaN, + "learning_rate": 6.849195937145513e-06, + "loss": 0.0, + "step": 58286 + }, + { + "epoch": 5.438742185313054, + "grad_norm": NaN, + "learning_rate": 6.8469366145703866e-06, + "loss": 0.0, + "step": 58287 + }, + { + "epoch": 5.438835495007932, + "grad_norm": NaN, + "learning_rate": 6.844677655991094e-06, + "loss": 0.0, + "step": 58288 + }, + { + "epoch": 5.438928804702808, + "grad_norm": NaN, + "learning_rate": 6.842419061413379e-06, + "loss": 0.0, + "step": 58289 + }, + { + "epoch": 5.439022114397686, + "grad_norm": NaN, + "learning_rate": 6.840160830842989e-06, + "loss": 0.0, + "step": 58290 + }, + { + "epoch": 5.439115424092563, + "grad_norm": NaN, + "learning_rate": 6.837902964285669e-06, + "loss": 0.0, + "step": 58291 + }, + { + "epoch": 5.43920873378744, + "grad_norm": NaN, + "learning_rate": 6.8356454617471465e-06, + "loss": 0.0, + "step": 58292 + }, + { + "epoch": 5.439302043482318, + "grad_norm": NaN, + "learning_rate": 6.8333883232331684e-06, + "loss": 0.0, + "step": 58293 + }, + { + "epoch": 5.439395353177195, + "grad_norm": NaN, + "learning_rate": 6.83113154874948e-06, + "loss": 0.0, + "step": 58294 + }, + { + "epoch": 5.439488662872073, + "grad_norm": NaN, + "learning_rate": 6.828875138301809e-06, + "loss": 0.0, + "step": 58295 + }, + { + "epoch": 5.43958197256695, + "grad_norm": NaN, + "learning_rate": 6.8266190918959016e-06, + "loss": 0.0, + "step": 58296 + }, + { + "epoch": 5.439675282261827, + "grad_norm": NaN, + "learning_rate": 6.824363409537503e-06, + "loss": 0.0, + "step": 58297 + }, + { + "epoch": 5.439768591956704, + "grad_norm": NaN, + "learning_rate": 6.822108091232326e-06, + "loss": 0.0, + "step": 58298 + }, + { + "epoch": 5.4398619016515815, + "grad_norm": NaN, + "learning_rate": 6.819853136986114e-06, + "loss": 0.0, + "step": 58299 + }, + { + "epoch": 5.439955211346459, + "grad_norm": NaN, + "learning_rate": 6.817598546804598e-06, + "loss": 0.0, + "step": 58300 + }, + { + "epoch": 5.440048521041336, + "grad_norm": NaN, + "learning_rate": 6.815344320693522e-06, + "loss": 0.0, + "step": 58301 + }, + { + "epoch": 5.440141830736214, + "grad_norm": NaN, + "learning_rate": 6.8130904586585986e-06, + "loss": 0.0, + "step": 58302 + }, + { + "epoch": 5.440235140431091, + "grad_norm": NaN, + "learning_rate": 6.81083696070559e-06, + "loss": 0.0, + "step": 58303 + }, + { + "epoch": 5.440328450125968, + "grad_norm": NaN, + "learning_rate": 6.808583826840191e-06, + "loss": 0.0, + "step": 58304 + }, + { + "epoch": 5.440421759820845, + "grad_norm": NaN, + "learning_rate": 6.806331057068165e-06, + "loss": 0.0, + "step": 58305 + }, + { + "epoch": 5.4405150695157225, + "grad_norm": NaN, + "learning_rate": 6.8040786513952065e-06, + "loss": 0.0, + "step": 58306 + }, + { + "epoch": 5.4406083792106, + "grad_norm": NaN, + "learning_rate": 6.80182660982706e-06, + "loss": 0.0, + "step": 58307 + }, + { + "epoch": 5.440701688905477, + "grad_norm": NaN, + "learning_rate": 6.799574932369456e-06, + "loss": 0.0, + "step": 58308 + }, + { + "epoch": 5.440794998600355, + "grad_norm": NaN, + "learning_rate": 6.797323619028106e-06, + "loss": 0.0, + "step": 58309 + }, + { + "epoch": 5.440888308295232, + "grad_norm": NaN, + "learning_rate": 6.795072669808754e-06, + "loss": 0.0, + "step": 58310 + }, + { + "epoch": 5.44098161799011, + "grad_norm": NaN, + "learning_rate": 6.792822084717098e-06, + "loss": 0.0, + "step": 58311 + }, + { + "epoch": 5.441074927684986, + "grad_norm": NaN, + "learning_rate": 6.790571863758881e-06, + "loss": 0.0, + "step": 58312 + }, + { + "epoch": 5.4411682373798635, + "grad_norm": NaN, + "learning_rate": 6.788322006939817e-06, + "loss": 0.0, + "step": 58313 + }, + { + "epoch": 5.441261547074741, + "grad_norm": NaN, + "learning_rate": 6.786072514265634e-06, + "loss": 0.0, + "step": 58314 + }, + { + "epoch": 5.441354856769618, + "grad_norm": NaN, + "learning_rate": 6.783823385742043e-06, + "loss": 0.0, + "step": 58315 + }, + { + "epoch": 5.441448166464496, + "grad_norm": NaN, + "learning_rate": 6.781574621374758e-06, + "loss": 0.0, + "step": 58316 + }, + { + "epoch": 5.441541476159373, + "grad_norm": NaN, + "learning_rate": 6.779326221169506e-06, + "loss": 0.0, + "step": 58317 + }, + { + "epoch": 5.441634785854251, + "grad_norm": NaN, + "learning_rate": 6.7770781851320015e-06, + "loss": 0.0, + "step": 58318 + }, + { + "epoch": 5.441728095549127, + "grad_norm": NaN, + "learning_rate": 6.774830513267971e-06, + "loss": 0.0, + "step": 58319 + }, + { + "epoch": 5.441821405244005, + "grad_norm": NaN, + "learning_rate": 6.7725832055831095e-06, + "loss": 0.0, + "step": 58320 + }, + { + "epoch": 5.441914714938882, + "grad_norm": NaN, + "learning_rate": 6.770336262083148e-06, + "loss": 0.0, + "step": 58321 + }, + { + "epoch": 5.442008024633759, + "grad_norm": NaN, + "learning_rate": 6.768089682773798e-06, + "loss": 0.0, + "step": 58322 + }, + { + "epoch": 5.442101334328637, + "grad_norm": NaN, + "learning_rate": 6.765843467660753e-06, + "loss": 0.0, + "step": 58323 + }, + { + "epoch": 5.442194644023514, + "grad_norm": NaN, + "learning_rate": 6.763597616749761e-06, + "loss": 0.0, + "step": 58324 + }, + { + "epoch": 5.442287953718392, + "grad_norm": NaN, + "learning_rate": 6.761352130046499e-06, + "loss": 0.0, + "step": 58325 + }, + { + "epoch": 5.442381263413269, + "grad_norm": NaN, + "learning_rate": 6.75910700755668e-06, + "loss": 0.0, + "step": 58326 + }, + { + "epoch": 5.442474573108146, + "grad_norm": NaN, + "learning_rate": 6.756862249286032e-06, + "loss": 0.0, + "step": 58327 + }, + { + "epoch": 5.442567882803023, + "grad_norm": NaN, + "learning_rate": 6.754617855240235e-06, + "loss": 0.0, + "step": 58328 + }, + { + "epoch": 5.4426611924979005, + "grad_norm": NaN, + "learning_rate": 6.752373825425034e-06, + "loss": 0.0, + "step": 58329 + }, + { + "epoch": 5.442754502192778, + "grad_norm": NaN, + "learning_rate": 6.750130159846107e-06, + "loss": 0.0, + "step": 58330 + }, + { + "epoch": 5.442847811887655, + "grad_norm": NaN, + "learning_rate": 6.74788685850915e-06, + "loss": 0.0, + "step": 58331 + }, + { + "epoch": 5.442941121582533, + "grad_norm": NaN, + "learning_rate": 6.745643921419891e-06, + "loss": 0.0, + "step": 58332 + }, + { + "epoch": 5.443034431277409, + "grad_norm": NaN, + "learning_rate": 6.7434013485840435e-06, + "loss": 0.0, + "step": 58333 + }, + { + "epoch": 5.443127740972287, + "grad_norm": NaN, + "learning_rate": 6.741159140007269e-06, + "loss": 0.0, + "step": 58334 + }, + { + "epoch": 5.443221050667164, + "grad_norm": NaN, + "learning_rate": 6.738917295695295e-06, + "loss": 0.0, + "step": 58335 + }, + { + "epoch": 5.4433143603620415, + "grad_norm": NaN, + "learning_rate": 6.736675815653819e-06, + "loss": 0.0, + "step": 58336 + }, + { + "epoch": 5.443407670056919, + "grad_norm": NaN, + "learning_rate": 6.734434699888536e-06, + "loss": 0.0, + "step": 58337 + }, + { + "epoch": 5.443500979751796, + "grad_norm": NaN, + "learning_rate": 6.73219394840514e-06, + "loss": 0.0, + "step": 58338 + }, + { + "epoch": 5.443594289446674, + "grad_norm": NaN, + "learning_rate": 6.7299535612093615e-06, + "loss": 0.0, + "step": 58339 + }, + { + "epoch": 5.443687599141551, + "grad_norm": NaN, + "learning_rate": 6.727713538306845e-06, + "loss": 0.0, + "step": 58340 + }, + { + "epoch": 5.443780908836428, + "grad_norm": NaN, + "learning_rate": 6.7254738797033194e-06, + "loss": 0.0, + "step": 58341 + }, + { + "epoch": 5.443874218531305, + "grad_norm": NaN, + "learning_rate": 6.72323458540448e-06, + "loss": 0.0, + "step": 58342 + }, + { + "epoch": 5.443967528226183, + "grad_norm": NaN, + "learning_rate": 6.720995655416006e-06, + "loss": 0.0, + "step": 58343 + }, + { + "epoch": 5.44406083792106, + "grad_norm": NaN, + "learning_rate": 6.718757089743609e-06, + "loss": 0.0, + "step": 58344 + }, + { + "epoch": 5.444154147615937, + "grad_norm": NaN, + "learning_rate": 6.716518888392952e-06, + "loss": 0.0, + "step": 58345 + }, + { + "epoch": 5.444247457310815, + "grad_norm": NaN, + "learning_rate": 6.714281051369763e-06, + "loss": 0.0, + "step": 58346 + }, + { + "epoch": 5.444340767005692, + "grad_norm": NaN, + "learning_rate": 6.712043578679705e-06, + "loss": 0.0, + "step": 58347 + }, + { + "epoch": 5.444434076700569, + "grad_norm": NaN, + "learning_rate": 6.709806470328471e-06, + "loss": 0.0, + "step": 58348 + }, + { + "epoch": 5.444527386395446, + "grad_norm": NaN, + "learning_rate": 6.707569726321743e-06, + "loss": 0.0, + "step": 58349 + }, + { + "epoch": 5.444620696090324, + "grad_norm": NaN, + "learning_rate": 6.705333346665231e-06, + "loss": 0.0, + "step": 58350 + }, + { + "epoch": 5.444714005785201, + "grad_norm": NaN, + "learning_rate": 6.7030973313646155e-06, + "loss": 0.0, + "step": 58351 + }, + { + "epoch": 5.4448073154800785, + "grad_norm": NaN, + "learning_rate": 6.7008616804255565e-06, + "loss": 0.0, + "step": 58352 + }, + { + "epoch": 5.444900625174956, + "grad_norm": NaN, + "learning_rate": 6.698626393853785e-06, + "loss": 0.0, + "step": 58353 + }, + { + "epoch": 5.444993934869833, + "grad_norm": NaN, + "learning_rate": 6.696391471654944e-06, + "loss": 0.0, + "step": 58354 + }, + { + "epoch": 5.445087244564711, + "grad_norm": NaN, + "learning_rate": 6.694156913834731e-06, + "loss": 0.0, + "step": 58355 + }, + { + "epoch": 5.445180554259587, + "grad_norm": NaN, + "learning_rate": 6.6919227203988234e-06, + "loss": 0.0, + "step": 58356 + }, + { + "epoch": 5.445273863954465, + "grad_norm": NaN, + "learning_rate": 6.689688891352901e-06, + "loss": 0.0, + "step": 58357 + }, + { + "epoch": 5.445367173649342, + "grad_norm": NaN, + "learning_rate": 6.687455426702659e-06, + "loss": 0.0, + "step": 58358 + }, + { + "epoch": 5.4454604833442195, + "grad_norm": NaN, + "learning_rate": 6.68522232645376e-06, + "loss": 0.0, + "step": 58359 + }, + { + "epoch": 5.445553793039097, + "grad_norm": NaN, + "learning_rate": 6.682989590611898e-06, + "loss": 0.0, + "step": 58360 + }, + { + "epoch": 5.445647102733974, + "grad_norm": NaN, + "learning_rate": 6.68075721918272e-06, + "loss": 0.0, + "step": 58361 + }, + { + "epoch": 5.445740412428852, + "grad_norm": NaN, + "learning_rate": 6.678525212171937e-06, + "loss": 0.0, + "step": 58362 + }, + { + "epoch": 5.445833722123728, + "grad_norm": NaN, + "learning_rate": 6.6762935695852115e-06, + "loss": 0.0, + "step": 58363 + }, + { + "epoch": 5.445927031818606, + "grad_norm": NaN, + "learning_rate": 6.674062291428206e-06, + "loss": 0.0, + "step": 58364 + }, + { + "epoch": 5.446020341513483, + "grad_norm": NaN, + "learning_rate": 6.671831377706599e-06, + "loss": 0.0, + "step": 58365 + }, + { + "epoch": 5.4461136512083606, + "grad_norm": NaN, + "learning_rate": 6.669600828426086e-06, + "loss": 0.0, + "step": 58366 + }, + { + "epoch": 5.446206960903238, + "grad_norm": NaN, + "learning_rate": 6.667370643592313e-06, + "loss": 0.0, + "step": 58367 + }, + { + "epoch": 5.446300270598115, + "grad_norm": NaN, + "learning_rate": 6.665140823210957e-06, + "loss": 0.0, + "step": 58368 + }, + { + "epoch": 5.446393580292993, + "grad_norm": NaN, + "learning_rate": 6.6629113672876825e-06, + "loss": 0.0, + "step": 58369 + }, + { + "epoch": 5.446486889987869, + "grad_norm": NaN, + "learning_rate": 6.660682275828167e-06, + "loss": 0.0, + "step": 58370 + }, + { + "epoch": 5.446580199682747, + "grad_norm": NaN, + "learning_rate": 6.658453548838072e-06, + "loss": 0.0, + "step": 58371 + }, + { + "epoch": 5.446673509377624, + "grad_norm": NaN, + "learning_rate": 6.656225186323077e-06, + "loss": 0.0, + "step": 58372 + }, + { + "epoch": 5.446766819072502, + "grad_norm": NaN, + "learning_rate": 6.6539971882888456e-06, + "loss": 0.0, + "step": 58373 + }, + { + "epoch": 5.446860128767379, + "grad_norm": NaN, + "learning_rate": 6.651769554741038e-06, + "loss": 0.0, + "step": 58374 + }, + { + "epoch": 5.446953438462256, + "grad_norm": NaN, + "learning_rate": 6.6495422856853e-06, + "loss": 0.0, + "step": 58375 + }, + { + "epoch": 5.447046748157134, + "grad_norm": NaN, + "learning_rate": 6.647315381127327e-06, + "loss": 0.0, + "step": 58376 + }, + { + "epoch": 5.44714005785201, + "grad_norm": NaN, + "learning_rate": 6.645088841072765e-06, + "loss": 0.0, + "step": 58377 + }, + { + "epoch": 5.447233367546888, + "grad_norm": NaN, + "learning_rate": 6.642862665527276e-06, + "loss": 0.0, + "step": 58378 + }, + { + "epoch": 5.447326677241765, + "grad_norm": NaN, + "learning_rate": 6.640636854496506e-06, + "loss": 0.0, + "step": 58379 + }, + { + "epoch": 5.447419986936643, + "grad_norm": NaN, + "learning_rate": 6.63841140798615e-06, + "loss": 0.0, + "step": 58380 + }, + { + "epoch": 5.44751329663152, + "grad_norm": NaN, + "learning_rate": 6.636186326001835e-06, + "loss": 0.0, + "step": 58381 + }, + { + "epoch": 5.4476066063263975, + "grad_norm": NaN, + "learning_rate": 6.633961608549243e-06, + "loss": 0.0, + "step": 58382 + }, + { + "epoch": 5.447699916021275, + "grad_norm": NaN, + "learning_rate": 6.631737255634017e-06, + "loss": 0.0, + "step": 58383 + }, + { + "epoch": 5.447793225716152, + "grad_norm": NaN, + "learning_rate": 6.629513267261805e-06, + "loss": 0.0, + "step": 58384 + }, + { + "epoch": 5.447886535411029, + "grad_norm": NaN, + "learning_rate": 6.627289643438283e-06, + "loss": 0.0, + "step": 58385 + }, + { + "epoch": 5.447979845105906, + "grad_norm": NaN, + "learning_rate": 6.625066384169081e-06, + "loss": 0.0, + "step": 58386 + }, + { + "epoch": 5.448073154800784, + "grad_norm": NaN, + "learning_rate": 6.622843489459862e-06, + "loss": 0.0, + "step": 58387 + }, + { + "epoch": 5.448166464495661, + "grad_norm": NaN, + "learning_rate": 6.6206209593163025e-06, + "loss": 0.0, + "step": 58388 + }, + { + "epoch": 5.4482597741905385, + "grad_norm": NaN, + "learning_rate": 6.618398793744018e-06, + "loss": 0.0, + "step": 58389 + }, + { + "epoch": 5.448353083885416, + "grad_norm": NaN, + "learning_rate": 6.616176992748651e-06, + "loss": 0.0, + "step": 58390 + }, + { + "epoch": 5.448446393580293, + "grad_norm": NaN, + "learning_rate": 6.6139555563359145e-06, + "loss": 0.0, + "step": 58391 + }, + { + "epoch": 5.44853970327517, + "grad_norm": NaN, + "learning_rate": 6.611734484511388e-06, + "loss": 0.0, + "step": 58392 + }, + { + "epoch": 5.448633012970047, + "grad_norm": NaN, + "learning_rate": 6.609513777280734e-06, + "loss": 0.0, + "step": 58393 + }, + { + "epoch": 5.448726322664925, + "grad_norm": NaN, + "learning_rate": 6.607293434649646e-06, + "loss": 0.0, + "step": 58394 + }, + { + "epoch": 5.448819632359802, + "grad_norm": NaN, + "learning_rate": 6.605073456623722e-06, + "loss": 0.0, + "step": 58395 + }, + { + "epoch": 5.44891294205468, + "grad_norm": NaN, + "learning_rate": 6.602853843208589e-06, + "loss": 0.0, + "step": 58396 + }, + { + "epoch": 5.449006251749557, + "grad_norm": NaN, + "learning_rate": 6.600634594409976e-06, + "loss": 0.0, + "step": 58397 + }, + { + "epoch": 5.449099561444434, + "grad_norm": NaN, + "learning_rate": 6.5984157102334285e-06, + "loss": 0.0, + "step": 58398 + }, + { + "epoch": 5.449192871139312, + "grad_norm": NaN, + "learning_rate": 6.596197190684627e-06, + "loss": 0.0, + "step": 58399 + }, + { + "epoch": 5.449286180834188, + "grad_norm": NaN, + "learning_rate": 6.5939790357692645e-06, + "loss": 0.0, + "step": 58400 + }, + { + "epoch": 5.449379490529066, + "grad_norm": NaN, + "learning_rate": 6.591761245492905e-06, + "loss": 0.0, + "step": 58401 + }, + { + "epoch": 5.449472800223943, + "grad_norm": NaN, + "learning_rate": 6.58954381986121e-06, + "loss": 0.0, + "step": 58402 + }, + { + "epoch": 5.449566109918821, + "grad_norm": NaN, + "learning_rate": 6.587326758879824e-06, + "loss": 0.0, + "step": 58403 + }, + { + "epoch": 5.449659419613698, + "grad_norm": NaN, + "learning_rate": 6.585110062554394e-06, + "loss": 0.0, + "step": 58404 + }, + { + "epoch": 5.4497527293085755, + "grad_norm": NaN, + "learning_rate": 6.5828937308905484e-06, + "loss": 0.0, + "step": 58405 + }, + { + "epoch": 5.449846039003452, + "grad_norm": NaN, + "learning_rate": 6.580677763893916e-06, + "loss": 0.0, + "step": 58406 + }, + { + "epoch": 5.449939348698329, + "grad_norm": NaN, + "learning_rate": 6.578462161570125e-06, + "loss": 0.0, + "step": 58407 + }, + { + "epoch": 5.450032658393207, + "grad_norm": NaN, + "learning_rate": 6.576246923924839e-06, + "loss": 0.0, + "step": 58408 + }, + { + "epoch": 5.450125968088084, + "grad_norm": NaN, + "learning_rate": 6.574032050963652e-06, + "loss": 0.0, + "step": 58409 + }, + { + "epoch": 5.450219277782962, + "grad_norm": NaN, + "learning_rate": 6.571817542692226e-06, + "loss": 0.0, + "step": 58410 + }, + { + "epoch": 5.450312587477839, + "grad_norm": NaN, + "learning_rate": 6.569603399116175e-06, + "loss": 0.0, + "step": 58411 + }, + { + "epoch": 5.4504058971727165, + "grad_norm": NaN, + "learning_rate": 6.567389620241143e-06, + "loss": 0.0, + "step": 58412 + }, + { + "epoch": 5.450499206867594, + "grad_norm": NaN, + "learning_rate": 6.565176206072759e-06, + "loss": 0.0, + "step": 58413 + }, + { + "epoch": 5.4505925165624705, + "grad_norm": NaN, + "learning_rate": 6.562963156616635e-06, + "loss": 0.0, + "step": 58414 + }, + { + "epoch": 5.450685826257348, + "grad_norm": NaN, + "learning_rate": 6.560750471878401e-06, + "loss": 0.0, + "step": 58415 + }, + { + "epoch": 5.450779135952225, + "grad_norm": NaN, + "learning_rate": 6.558538151863685e-06, + "loss": 0.0, + "step": 58416 + }, + { + "epoch": 5.450872445647103, + "grad_norm": NaN, + "learning_rate": 6.556326196578132e-06, + "loss": 0.0, + "step": 58417 + }, + { + "epoch": 5.45096575534198, + "grad_norm": NaN, + "learning_rate": 6.554114606027338e-06, + "loss": 0.0, + "step": 58418 + }, + { + "epoch": 5.4510590650368576, + "grad_norm": NaN, + "learning_rate": 6.55190338021695e-06, + "loss": 0.0, + "step": 58419 + }, + { + "epoch": 5.451152374731735, + "grad_norm": NaN, + "learning_rate": 6.549692519152561e-06, + "loss": 0.0, + "step": 58420 + }, + { + "epoch": 5.4512456844266115, + "grad_norm": NaN, + "learning_rate": 6.547482022839834e-06, + "loss": 0.0, + "step": 58421 + }, + { + "epoch": 5.451338994121489, + "grad_norm": NaN, + "learning_rate": 6.5452718912843486e-06, + "loss": 0.0, + "step": 58422 + }, + { + "epoch": 5.451432303816366, + "grad_norm": NaN, + "learning_rate": 6.5430621244917495e-06, + "loss": 0.0, + "step": 58423 + }, + { + "epoch": 5.451525613511244, + "grad_norm": NaN, + "learning_rate": 6.540852722467649e-06, + "loss": 0.0, + "step": 58424 + }, + { + "epoch": 5.451618923206121, + "grad_norm": NaN, + "learning_rate": 6.538643685217659e-06, + "loss": 0.0, + "step": 58425 + }, + { + "epoch": 5.451712232900999, + "grad_norm": NaN, + "learning_rate": 6.536435012747409e-06, + "loss": 0.0, + "step": 58426 + }, + { + "epoch": 5.451805542595876, + "grad_norm": NaN, + "learning_rate": 6.534226705062496e-06, + "loss": 0.0, + "step": 58427 + }, + { + "epoch": 5.451898852290753, + "grad_norm": NaN, + "learning_rate": 6.532018762168545e-06, + "loss": 0.0, + "step": 58428 + }, + { + "epoch": 5.45199216198563, + "grad_norm": NaN, + "learning_rate": 6.529811184071204e-06, + "loss": 0.0, + "step": 58429 + }, + { + "epoch": 5.452085471680507, + "grad_norm": NaN, + "learning_rate": 6.527603970776018e-06, + "loss": 0.0, + "step": 58430 + }, + { + "epoch": 5.452178781375385, + "grad_norm": NaN, + "learning_rate": 6.525397122288634e-06, + "loss": 0.0, + "step": 58431 + }, + { + "epoch": 5.452272091070262, + "grad_norm": NaN, + "learning_rate": 6.523190638614695e-06, + "loss": 0.0, + "step": 58432 + }, + { + "epoch": 5.45236540076514, + "grad_norm": NaN, + "learning_rate": 6.520984519759781e-06, + "loss": 0.0, + "step": 58433 + }, + { + "epoch": 5.452458710460017, + "grad_norm": NaN, + "learning_rate": 6.518778765729455e-06, + "loss": 0.0, + "step": 58434 + }, + { + "epoch": 5.4525520201548945, + "grad_norm": NaN, + "learning_rate": 6.516573376529427e-06, + "loss": 0.0, + "step": 58435 + }, + { + "epoch": 5.452645329849771, + "grad_norm": NaN, + "learning_rate": 6.5143683521652276e-06, + "loss": 0.0, + "step": 58436 + }, + { + "epoch": 5.452738639544648, + "grad_norm": NaN, + "learning_rate": 6.5121636926424695e-06, + "loss": 0.0, + "step": 58437 + }, + { + "epoch": 5.452831949239526, + "grad_norm": NaN, + "learning_rate": 6.509959397966813e-06, + "loss": 0.0, + "step": 58438 + }, + { + "epoch": 5.452925258934403, + "grad_norm": NaN, + "learning_rate": 6.507755468143805e-06, + "loss": 0.0, + "step": 58439 + }, + { + "epoch": 5.453018568629281, + "grad_norm": NaN, + "learning_rate": 6.5055519031790405e-06, + "loss": 0.0, + "step": 58440 + }, + { + "epoch": 5.453111878324158, + "grad_norm": NaN, + "learning_rate": 6.503348703078198e-06, + "loss": 0.0, + "step": 58441 + }, + { + "epoch": 5.4532051880190355, + "grad_norm": NaN, + "learning_rate": 6.501145867846808e-06, + "loss": 0.0, + "step": 58442 + }, + { + "epoch": 5.453298497713912, + "grad_norm": NaN, + "learning_rate": 6.498943397490481e-06, + "loss": 0.0, + "step": 58443 + }, + { + "epoch": 5.4533918074087895, + "grad_norm": NaN, + "learning_rate": 6.496741292014878e-06, + "loss": 0.0, + "step": 58444 + }, + { + "epoch": 5.453485117103667, + "grad_norm": NaN, + "learning_rate": 6.494539551425532e-06, + "loss": 0.0, + "step": 58445 + }, + { + "epoch": 5.453578426798544, + "grad_norm": NaN, + "learning_rate": 6.492338175728052e-06, + "loss": 0.0, + "step": 58446 + }, + { + "epoch": 5.453671736493422, + "grad_norm": NaN, + "learning_rate": 6.49013716492805e-06, + "loss": 0.0, + "step": 58447 + }, + { + "epoch": 5.453765046188299, + "grad_norm": NaN, + "learning_rate": 6.487936519031123e-06, + "loss": 0.0, + "step": 58448 + }, + { + "epoch": 5.453858355883177, + "grad_norm": NaN, + "learning_rate": 6.485736238042866e-06, + "loss": 0.0, + "step": 58449 + }, + { + "epoch": 5.453951665578053, + "grad_norm": NaN, + "learning_rate": 6.483536321968857e-06, + "loss": 0.0, + "step": 58450 + }, + { + "epoch": 5.4540449752729305, + "grad_norm": NaN, + "learning_rate": 6.4813367708147105e-06, + "loss": 0.0, + "step": 58451 + }, + { + "epoch": 5.454138284967808, + "grad_norm": NaN, + "learning_rate": 6.479137584586019e-06, + "loss": 0.0, + "step": 58452 + }, + { + "epoch": 5.454231594662685, + "grad_norm": NaN, + "learning_rate": 6.476938763288348e-06, + "loss": 0.0, + "step": 58453 + }, + { + "epoch": 5.454324904357563, + "grad_norm": NaN, + "learning_rate": 6.474740306927323e-06, + "loss": 0.0, + "step": 58454 + }, + { + "epoch": 5.45441821405244, + "grad_norm": NaN, + "learning_rate": 6.472542215508508e-06, + "loss": 0.0, + "step": 58455 + }, + { + "epoch": 5.454511523747318, + "grad_norm": NaN, + "learning_rate": 6.4703444890375156e-06, + "loss": 0.0, + "step": 58456 + }, + { + "epoch": 5.454604833442195, + "grad_norm": NaN, + "learning_rate": 6.4681471275199075e-06, + "loss": 0.0, + "step": 58457 + }, + { + "epoch": 5.454698143137072, + "grad_norm": NaN, + "learning_rate": 6.465950130961278e-06, + "loss": 0.0, + "step": 58458 + }, + { + "epoch": 5.454791452831949, + "grad_norm": NaN, + "learning_rate": 6.463753499367241e-06, + "loss": 0.0, + "step": 58459 + }, + { + "epoch": 5.454884762526826, + "grad_norm": NaN, + "learning_rate": 6.461557232743342e-06, + "loss": 0.0, + "step": 58460 + }, + { + "epoch": 5.454978072221704, + "grad_norm": NaN, + "learning_rate": 6.459361331095192e-06, + "loss": 0.0, + "step": 58461 + }, + { + "epoch": 5.455071381916581, + "grad_norm": NaN, + "learning_rate": 6.457165794428354e-06, + "loss": 0.0, + "step": 58462 + }, + { + "epoch": 5.455164691611459, + "grad_norm": NaN, + "learning_rate": 6.4549706227484396e-06, + "loss": 0.0, + "step": 58463 + }, + { + "epoch": 5.455258001306336, + "grad_norm": NaN, + "learning_rate": 6.4527758160610124e-06, + "loss": 0.0, + "step": 58464 + }, + { + "epoch": 5.455351311001213, + "grad_norm": NaN, + "learning_rate": 6.450581374371649e-06, + "loss": 0.0, + "step": 58465 + }, + { + "epoch": 5.45544462069609, + "grad_norm": NaN, + "learning_rate": 6.4483872976859145e-06, + "loss": 0.0, + "step": 58466 + }, + { + "epoch": 5.4555379303909675, + "grad_norm": NaN, + "learning_rate": 6.446193586009452e-06, + "loss": 0.0, + "step": 58467 + }, + { + "epoch": 5.455631240085845, + "grad_norm": NaN, + "learning_rate": 6.444000239347757e-06, + "loss": 0.0, + "step": 58468 + }, + { + "epoch": 5.455724549780722, + "grad_norm": NaN, + "learning_rate": 6.4418072577064435e-06, + "loss": 0.0, + "step": 58469 + }, + { + "epoch": 5.4558178594756, + "grad_norm": NaN, + "learning_rate": 6.439614641091123e-06, + "loss": 0.0, + "step": 58470 + }, + { + "epoch": 5.455911169170477, + "grad_norm": NaN, + "learning_rate": 6.437422389507324e-06, + "loss": 0.0, + "step": 58471 + }, + { + "epoch": 5.456004478865355, + "grad_norm": NaN, + "learning_rate": 6.435230502960609e-06, + "loss": 0.0, + "step": 58472 + }, + { + "epoch": 5.456097788560231, + "grad_norm": NaN, + "learning_rate": 6.433038981456623e-06, + "loss": 0.0, + "step": 58473 + }, + { + "epoch": 5.4561910982551085, + "grad_norm": NaN, + "learning_rate": 6.430847825000862e-06, + "loss": 0.0, + "step": 58474 + }, + { + "epoch": 5.456284407949986, + "grad_norm": NaN, + "learning_rate": 6.428657033598905e-06, + "loss": 0.0, + "step": 58475 + }, + { + "epoch": 5.456377717644863, + "grad_norm": NaN, + "learning_rate": 6.426466607256381e-06, + "loss": 0.0, + "step": 58476 + }, + { + "epoch": 5.456471027339741, + "grad_norm": NaN, + "learning_rate": 6.424276545978818e-06, + "loss": 0.0, + "step": 58477 + }, + { + "epoch": 5.456564337034618, + "grad_norm": NaN, + "learning_rate": 6.422086849771763e-06, + "loss": 0.0, + "step": 58478 + }, + { + "epoch": 5.456657646729496, + "grad_norm": NaN, + "learning_rate": 6.419897518640843e-06, + "loss": 0.0, + "step": 58479 + }, + { + "epoch": 5.456750956424372, + "grad_norm": NaN, + "learning_rate": 6.417708552591571e-06, + "loss": 0.0, + "step": 58480 + }, + { + "epoch": 5.4568442661192496, + "grad_norm": NaN, + "learning_rate": 6.4155199516295275e-06, + "loss": 0.0, + "step": 58481 + }, + { + "epoch": 5.456937575814127, + "grad_norm": NaN, + "learning_rate": 6.413331715760306e-06, + "loss": 0.0, + "step": 58482 + }, + { + "epoch": 5.457030885509004, + "grad_norm": NaN, + "learning_rate": 6.411143844989436e-06, + "loss": 0.0, + "step": 58483 + }, + { + "epoch": 5.457124195203882, + "grad_norm": NaN, + "learning_rate": 6.40895633932248e-06, + "loss": 0.0, + "step": 58484 + }, + { + "epoch": 5.457217504898759, + "grad_norm": NaN, + "learning_rate": 6.406769198765049e-06, + "loss": 0.0, + "step": 58485 + }, + { + "epoch": 5.457310814593637, + "grad_norm": NaN, + "learning_rate": 6.404582423322658e-06, + "loss": 0.0, + "step": 58486 + }, + { + "epoch": 5.457404124288513, + "grad_norm": NaN, + "learning_rate": 6.402396013000849e-06, + "loss": 0.0, + "step": 58487 + }, + { + "epoch": 5.457497433983391, + "grad_norm": NaN, + "learning_rate": 6.4002099678052365e-06, + "loss": 0.0, + "step": 58488 + }, + { + "epoch": 5.457590743678268, + "grad_norm": NaN, + "learning_rate": 6.398024287741349e-06, + "loss": 0.0, + "step": 58489 + }, + { + "epoch": 5.457684053373145, + "grad_norm": NaN, + "learning_rate": 6.3958389728147484e-06, + "loss": 0.0, + "step": 58490 + }, + { + "epoch": 5.457777363068023, + "grad_norm": NaN, + "learning_rate": 6.393654023030981e-06, + "loss": 0.0, + "step": 58491 + }, + { + "epoch": 5.4578706727629, + "grad_norm": NaN, + "learning_rate": 6.3914694383956245e-06, + "loss": 0.0, + "step": 58492 + }, + { + "epoch": 5.457963982457778, + "grad_norm": NaN, + "learning_rate": 6.389285218914209e-06, + "loss": 0.0, + "step": 58493 + }, + { + "epoch": 5.458057292152654, + "grad_norm": NaN, + "learning_rate": 6.387101364592312e-06, + "loss": 0.0, + "step": 58494 + }, + { + "epoch": 5.458150601847532, + "grad_norm": NaN, + "learning_rate": 6.384917875435463e-06, + "loss": 0.0, + "step": 58495 + }, + { + "epoch": 5.458243911542409, + "grad_norm": NaN, + "learning_rate": 6.382734751449226e-06, + "loss": 0.0, + "step": 58496 + }, + { + "epoch": 5.4583372212372865, + "grad_norm": NaN, + "learning_rate": 6.38055199263916e-06, + "loss": 0.0, + "step": 58497 + }, + { + "epoch": 5.458430530932164, + "grad_norm": NaN, + "learning_rate": 6.378369599010813e-06, + "loss": 0.0, + "step": 58498 + }, + { + "epoch": 5.458523840627041, + "grad_norm": NaN, + "learning_rate": 6.376187570569713e-06, + "loss": 0.0, + "step": 58499 + }, + { + "epoch": 5.458617150321919, + "grad_norm": NaN, + "learning_rate": 6.374005907321439e-06, + "loss": 0.0, + "step": 58500 + }, + { + "epoch": 5.458710460016796, + "grad_norm": NaN, + "learning_rate": 6.371824609271503e-06, + "loss": 0.0, + "step": 58501 + }, + { + "epoch": 5.458803769711673, + "grad_norm": NaN, + "learning_rate": 6.3696436764254846e-06, + "loss": 0.0, + "step": 58502 + }, + { + "epoch": 5.45889707940655, + "grad_norm": NaN, + "learning_rate": 6.367463108788911e-06, + "loss": 0.0, + "step": 58503 + }, + { + "epoch": 5.4589903891014275, + "grad_norm": NaN, + "learning_rate": 6.365282906367314e-06, + "loss": 0.0, + "step": 58504 + }, + { + "epoch": 5.459083698796305, + "grad_norm": NaN, + "learning_rate": 6.363103069166303e-06, + "loss": 0.0, + "step": 58505 + }, + { + "epoch": 5.459177008491182, + "grad_norm": NaN, + "learning_rate": 6.360923597191342e-06, + "loss": 0.0, + "step": 58506 + }, + { + "epoch": 5.45927031818606, + "grad_norm": NaN, + "learning_rate": 6.358744490447992e-06, + "loss": 0.0, + "step": 58507 + }, + { + "epoch": 5.459363627880937, + "grad_norm": NaN, + "learning_rate": 6.356565748941833e-06, + "loss": 0.0, + "step": 58508 + }, + { + "epoch": 5.459456937575814, + "grad_norm": NaN, + "learning_rate": 6.35438737267836e-06, + "loss": 0.0, + "step": 58509 + }, + { + "epoch": 5.459550247270691, + "grad_norm": NaN, + "learning_rate": 6.352209361663119e-06, + "loss": 0.0, + "step": 58510 + }, + { + "epoch": 5.459643556965569, + "grad_norm": NaN, + "learning_rate": 6.350031715901688e-06, + "loss": 0.0, + "step": 58511 + }, + { + "epoch": 5.459736866660446, + "grad_norm": NaN, + "learning_rate": 6.347854435399563e-06, + "loss": 0.0, + "step": 58512 + }, + { + "epoch": 5.459830176355323, + "grad_norm": NaN, + "learning_rate": 6.345677520162257e-06, + "loss": 0.0, + "step": 58513 + }, + { + "epoch": 5.459923486050201, + "grad_norm": NaN, + "learning_rate": 6.343500970195381e-06, + "loss": 0.0, + "step": 58514 + }, + { + "epoch": 5.460016795745078, + "grad_norm": NaN, + "learning_rate": 6.341324785504415e-06, + "loss": 0.0, + "step": 58515 + }, + { + "epoch": 5.460110105439956, + "grad_norm": NaN, + "learning_rate": 6.339148966094887e-06, + "loss": 0.0, + "step": 58516 + }, + { + "epoch": 5.460203415134832, + "grad_norm": NaN, + "learning_rate": 6.336973511972393e-06, + "loss": 0.0, + "step": 58517 + }, + { + "epoch": 5.46029672482971, + "grad_norm": NaN, + "learning_rate": 6.334798423142395e-06, + "loss": 0.0, + "step": 58518 + }, + { + "epoch": 5.460390034524587, + "grad_norm": NaN, + "learning_rate": 6.332623699610423e-06, + "loss": 0.0, + "step": 58519 + }, + { + "epoch": 5.4604833442194645, + "grad_norm": NaN, + "learning_rate": 6.330449341382071e-06, + "loss": 0.0, + "step": 58520 + }, + { + "epoch": 5.460576653914342, + "grad_norm": NaN, + "learning_rate": 6.328275348462803e-06, + "loss": 0.0, + "step": 58521 + }, + { + "epoch": 5.460669963609219, + "grad_norm": NaN, + "learning_rate": 6.326101720858162e-06, + "loss": 0.0, + "step": 58522 + }, + { + "epoch": 5.460763273304096, + "grad_norm": NaN, + "learning_rate": 6.3239284585737295e-06, + "loss": 0.0, + "step": 58523 + }, + { + "epoch": 5.460856582998973, + "grad_norm": NaN, + "learning_rate": 6.3217555616149495e-06, + "loss": 0.0, + "step": 58524 + }, + { + "epoch": 5.460949892693851, + "grad_norm": NaN, + "learning_rate": 6.319583029987385e-06, + "loss": 0.0, + "step": 58525 + }, + { + "epoch": 5.461043202388728, + "grad_norm": NaN, + "learning_rate": 6.317410863696581e-06, + "loss": 0.0, + "step": 58526 + }, + { + "epoch": 5.4611365120836055, + "grad_norm": NaN, + "learning_rate": 6.315239062748034e-06, + "loss": 0.0, + "step": 58527 + }, + { + "epoch": 5.461229821778483, + "grad_norm": NaN, + "learning_rate": 6.313067627147239e-06, + "loss": 0.0, + "step": 58528 + }, + { + "epoch": 5.46132313147336, + "grad_norm": NaN, + "learning_rate": 6.310896556899808e-06, + "loss": 0.0, + "step": 58529 + }, + { + "epoch": 5.461416441168238, + "grad_norm": NaN, + "learning_rate": 6.30872585201117e-06, + "loss": 0.0, + "step": 58530 + }, + { + "epoch": 5.461509750863114, + "grad_norm": NaN, + "learning_rate": 6.306555512486855e-06, + "loss": 0.0, + "step": 58531 + }, + { + "epoch": 5.461603060557992, + "grad_norm": NaN, + "learning_rate": 6.30438553833244e-06, + "loss": 0.0, + "step": 58532 + }, + { + "epoch": 5.461696370252869, + "grad_norm": NaN, + "learning_rate": 6.302215929553406e-06, + "loss": 0.0, + "step": 58533 + }, + { + "epoch": 5.461789679947747, + "grad_norm": NaN, + "learning_rate": 6.30004668615523e-06, + "loss": 0.0, + "step": 58534 + }, + { + "epoch": 5.461882989642624, + "grad_norm": NaN, + "learning_rate": 6.297877808143526e-06, + "loss": 0.0, + "step": 58535 + }, + { + "epoch": 5.461976299337501, + "grad_norm": NaN, + "learning_rate": 6.2957092955237205e-06, + "loss": 0.0, + "step": 58536 + }, + { + "epoch": 5.462069609032379, + "grad_norm": NaN, + "learning_rate": 6.293541148301362e-06, + "loss": 0.0, + "step": 58537 + }, + { + "epoch": 5.462162918727255, + "grad_norm": NaN, + "learning_rate": 6.2913733664819446e-06, + "loss": 0.0, + "step": 58538 + }, + { + "epoch": 5.462256228422133, + "grad_norm": NaN, + "learning_rate": 6.289205950071013e-06, + "loss": 0.0, + "step": 58539 + }, + { + "epoch": 5.46234953811701, + "grad_norm": NaN, + "learning_rate": 6.287038899074065e-06, + "loss": 0.0, + "step": 58540 + }, + { + "epoch": 5.462442847811888, + "grad_norm": NaN, + "learning_rate": 6.284872213496595e-06, + "loss": 0.0, + "step": 58541 + }, + { + "epoch": 5.462536157506765, + "grad_norm": NaN, + "learning_rate": 6.282705893344098e-06, + "loss": 0.0, + "step": 58542 + }, + { + "epoch": 5.462629467201642, + "grad_norm": NaN, + "learning_rate": 6.280539938622153e-06, + "loss": 0.0, + "step": 58543 + }, + { + "epoch": 5.46272277689652, + "grad_norm": NaN, + "learning_rate": 6.278374349336207e-06, + "loss": 0.0, + "step": 58544 + }, + { + "epoch": 5.462816086591397, + "grad_norm": NaN, + "learning_rate": 6.276209125491754e-06, + "loss": 0.0, + "step": 58545 + }, + { + "epoch": 5.462909396286274, + "grad_norm": NaN, + "learning_rate": 6.274044267094375e-06, + "loss": 0.0, + "step": 58546 + }, + { + "epoch": 5.463002705981151, + "grad_norm": NaN, + "learning_rate": 6.271879774149497e-06, + "loss": 0.0, + "step": 58547 + }, + { + "epoch": 5.463096015676029, + "grad_norm": NaN, + "learning_rate": 6.269715646662649e-06, + "loss": 0.0, + "step": 58548 + }, + { + "epoch": 5.463189325370906, + "grad_norm": NaN, + "learning_rate": 6.26755188463936e-06, + "loss": 0.0, + "step": 58549 + }, + { + "epoch": 5.4632826350657835, + "grad_norm": NaN, + "learning_rate": 6.265388488085094e-06, + "loss": 0.0, + "step": 58550 + }, + { + "epoch": 5.463375944760661, + "grad_norm": NaN, + "learning_rate": 6.263225457005361e-06, + "loss": 0.0, + "step": 58551 + }, + { + "epoch": 5.463469254455538, + "grad_norm": NaN, + "learning_rate": 6.261062791405691e-06, + "loss": 0.0, + "step": 58552 + }, + { + "epoch": 5.463562564150415, + "grad_norm": NaN, + "learning_rate": 6.258900491291547e-06, + "loss": 0.0, + "step": 58553 + }, + { + "epoch": 5.463655873845292, + "grad_norm": NaN, + "learning_rate": 6.256738556668423e-06, + "loss": 0.0, + "step": 58554 + }, + { + "epoch": 5.46374918354017, + "grad_norm": NaN, + "learning_rate": 6.254576987541865e-06, + "loss": 0.0, + "step": 58555 + }, + { + "epoch": 5.463842493235047, + "grad_norm": NaN, + "learning_rate": 6.2524157839173365e-06, + "loss": 0.0, + "step": 58556 + }, + { + "epoch": 5.4639358029299245, + "grad_norm": NaN, + "learning_rate": 6.250254945800298e-06, + "loss": 0.0, + "step": 58557 + }, + { + "epoch": 5.464029112624802, + "grad_norm": NaN, + "learning_rate": 6.2480944731963136e-06, + "loss": 0.0, + "step": 58558 + }, + { + "epoch": 5.464122422319679, + "grad_norm": NaN, + "learning_rate": 6.245934366110828e-06, + "loss": 0.0, + "step": 58559 + }, + { + "epoch": 5.464215732014556, + "grad_norm": NaN, + "learning_rate": 6.243774624549336e-06, + "loss": 0.0, + "step": 58560 + }, + { + "epoch": 5.464309041709433, + "grad_norm": NaN, + "learning_rate": 6.241615248517368e-06, + "loss": 0.0, + "step": 58561 + }, + { + "epoch": 5.464402351404311, + "grad_norm": NaN, + "learning_rate": 6.239456238020369e-06, + "loss": 0.0, + "step": 58562 + }, + { + "epoch": 5.464495661099188, + "grad_norm": NaN, + "learning_rate": 6.237297593063834e-06, + "loss": 0.0, + "step": 58563 + }, + { + "epoch": 5.464588970794066, + "grad_norm": NaN, + "learning_rate": 6.235139313653309e-06, + "loss": 0.0, + "step": 58564 + }, + { + "epoch": 5.464682280488943, + "grad_norm": NaN, + "learning_rate": 6.232981399794223e-06, + "loss": 0.0, + "step": 58565 + }, + { + "epoch": 5.46477559018382, + "grad_norm": NaN, + "learning_rate": 6.23082385149204e-06, + "loss": 0.0, + "step": 58566 + }, + { + "epoch": 5.464868899878697, + "grad_norm": NaN, + "learning_rate": 6.2286666687523355e-06, + "loss": 0.0, + "step": 58567 + }, + { + "epoch": 5.464962209573574, + "grad_norm": NaN, + "learning_rate": 6.226509851580525e-06, + "loss": 0.0, + "step": 58568 + }, + { + "epoch": 5.465055519268452, + "grad_norm": NaN, + "learning_rate": 6.224353399982085e-06, + "loss": 0.0, + "step": 58569 + }, + { + "epoch": 5.465148828963329, + "grad_norm": NaN, + "learning_rate": 6.2221973139625795e-06, + "loss": 0.0, + "step": 58570 + }, + { + "epoch": 5.465242138658207, + "grad_norm": NaN, + "learning_rate": 6.220041593527403e-06, + "loss": 0.0, + "step": 58571 + }, + { + "epoch": 5.465335448353084, + "grad_norm": NaN, + "learning_rate": 6.217886238682052e-06, + "loss": 0.0, + "step": 58572 + }, + { + "epoch": 5.4654287580479615, + "grad_norm": NaN, + "learning_rate": 6.215731249432054e-06, + "loss": 0.0, + "step": 58573 + }, + { + "epoch": 5.465522067742839, + "grad_norm": NaN, + "learning_rate": 6.213576625782857e-06, + "loss": 0.0, + "step": 58574 + }, + { + "epoch": 5.465615377437715, + "grad_norm": NaN, + "learning_rate": 6.211422367739921e-06, + "loss": 0.0, + "step": 58575 + }, + { + "epoch": 5.465708687132593, + "grad_norm": NaN, + "learning_rate": 6.209268475308777e-06, + "loss": 0.0, + "step": 58576 + }, + { + "epoch": 5.46580199682747, + "grad_norm": NaN, + "learning_rate": 6.207114948494835e-06, + "loss": 0.0, + "step": 58577 + }, + { + "epoch": 5.465895306522348, + "grad_norm": NaN, + "learning_rate": 6.204961787303608e-06, + "loss": 0.0, + "step": 58578 + }, + { + "epoch": 5.465988616217225, + "grad_norm": NaN, + "learning_rate": 6.2028089917406094e-06, + "loss": 0.0, + "step": 58579 + }, + { + "epoch": 5.4660819259121025, + "grad_norm": NaN, + "learning_rate": 6.200656561811218e-06, + "loss": 0.0, + "step": 58580 + }, + { + "epoch": 5.46617523560698, + "grad_norm": NaN, + "learning_rate": 6.198504497521012e-06, + "loss": 0.0, + "step": 58581 + }, + { + "epoch": 5.4662685453018565, + "grad_norm": NaN, + "learning_rate": 6.196352798875387e-06, + "loss": 0.0, + "step": 58582 + }, + { + "epoch": 5.466361854996734, + "grad_norm": NaN, + "learning_rate": 6.194201465879806e-06, + "loss": 0.0, + "step": 58583 + }, + { + "epoch": 5.466455164691611, + "grad_norm": NaN, + "learning_rate": 6.19205049853983e-06, + "loss": 0.0, + "step": 58584 + }, + { + "epoch": 5.466548474386489, + "grad_norm": NaN, + "learning_rate": 6.189899896860839e-06, + "loss": 0.0, + "step": 58585 + }, + { + "epoch": 5.466641784081366, + "grad_norm": NaN, + "learning_rate": 6.187749660848329e-06, + "loss": 0.0, + "step": 58586 + }, + { + "epoch": 5.466735093776244, + "grad_norm": NaN, + "learning_rate": 6.1855997905077946e-06, + "loss": 0.0, + "step": 58587 + }, + { + "epoch": 5.466828403471121, + "grad_norm": NaN, + "learning_rate": 6.183450285844666e-06, + "loss": 0.0, + "step": 58588 + }, + { + "epoch": 5.466921713165998, + "grad_norm": NaN, + "learning_rate": 6.181301146864403e-06, + "loss": 0.0, + "step": 58589 + }, + { + "epoch": 5.467015022860875, + "grad_norm": NaN, + "learning_rate": 6.179152373572521e-06, + "loss": 0.0, + "step": 58590 + }, + { + "epoch": 5.467108332555752, + "grad_norm": NaN, + "learning_rate": 6.177003965974447e-06, + "loss": 0.0, + "step": 58591 + }, + { + "epoch": 5.46720164225063, + "grad_norm": NaN, + "learning_rate": 6.174855924075611e-06, + "loss": 0.0, + "step": 58592 + }, + { + "epoch": 5.467294951945507, + "grad_norm": NaN, + "learning_rate": 6.172708247881558e-06, + "loss": 0.0, + "step": 58593 + }, + { + "epoch": 5.467388261640385, + "grad_norm": NaN, + "learning_rate": 6.1705609373976836e-06, + "loss": 0.0, + "step": 58594 + }, + { + "epoch": 5.467481571335262, + "grad_norm": NaN, + "learning_rate": 6.1684139926294505e-06, + "loss": 0.0, + "step": 58595 + }, + { + "epoch": 5.4675748810301394, + "grad_norm": NaN, + "learning_rate": 6.166267413582371e-06, + "loss": 0.0, + "step": 58596 + }, + { + "epoch": 5.467668190725016, + "grad_norm": NaN, + "learning_rate": 6.164121200261857e-06, + "loss": 0.0, + "step": 58597 + }, + { + "epoch": 5.467761500419893, + "grad_norm": NaN, + "learning_rate": 6.161975352673337e-06, + "loss": 0.0, + "step": 58598 + }, + { + "epoch": 5.467854810114771, + "grad_norm": NaN, + "learning_rate": 6.159829870822358e-06, + "loss": 0.0, + "step": 58599 + }, + { + "epoch": 5.467948119809648, + "grad_norm": NaN, + "learning_rate": 6.1576847547142985e-06, + "loss": 0.0, + "step": 58600 + }, + { + "epoch": 5.468041429504526, + "grad_norm": NaN, + "learning_rate": 6.15554000435462e-06, + "loss": 0.0, + "step": 58601 + }, + { + "epoch": 5.468134739199403, + "grad_norm": NaN, + "learning_rate": 6.153395619748835e-06, + "loss": 0.0, + "step": 58602 + }, + { + "epoch": 5.4682280488942805, + "grad_norm": NaN, + "learning_rate": 6.1512516009023404e-06, + "loss": 0.0, + "step": 58603 + }, + { + "epoch": 5.468321358589157, + "grad_norm": NaN, + "learning_rate": 6.149107947820564e-06, + "loss": 0.0, + "step": 58604 + }, + { + "epoch": 5.468414668284034, + "grad_norm": NaN, + "learning_rate": 6.1469646605090514e-06, + "loss": 0.0, + "step": 58605 + }, + { + "epoch": 5.468507977978912, + "grad_norm": NaN, + "learning_rate": 6.144821738973165e-06, + "loss": 0.0, + "step": 58606 + }, + { + "epoch": 5.468601287673789, + "grad_norm": NaN, + "learning_rate": 6.1426791832183685e-06, + "loss": 0.0, + "step": 58607 + }, + { + "epoch": 5.468694597368667, + "grad_norm": NaN, + "learning_rate": 6.140536993250173e-06, + "loss": 0.0, + "step": 58608 + }, + { + "epoch": 5.468787907063544, + "grad_norm": NaN, + "learning_rate": 6.138395169073939e-06, + "loss": 0.0, + "step": 58609 + }, + { + "epoch": 5.4688812167584215, + "grad_norm": NaN, + "learning_rate": 6.136253710695149e-06, + "loss": 0.0, + "step": 58610 + }, + { + "epoch": 5.468974526453298, + "grad_norm": NaN, + "learning_rate": 6.13411261811928e-06, + "loss": 0.0, + "step": 58611 + }, + { + "epoch": 5.4690678361481755, + "grad_norm": NaN, + "learning_rate": 6.131971891351728e-06, + "loss": 0.0, + "step": 58612 + }, + { + "epoch": 5.469161145843053, + "grad_norm": NaN, + "learning_rate": 6.129831530397922e-06, + "loss": 0.0, + "step": 58613 + }, + { + "epoch": 5.46925445553793, + "grad_norm": NaN, + "learning_rate": 6.12769153526339e-06, + "loss": 0.0, + "step": 58614 + }, + { + "epoch": 5.469347765232808, + "grad_norm": NaN, + "learning_rate": 6.125551905953479e-06, + "loss": 0.0, + "step": 58615 + }, + { + "epoch": 5.469441074927685, + "grad_norm": NaN, + "learning_rate": 6.123412642473702e-06, + "loss": 0.0, + "step": 58616 + }, + { + "epoch": 5.469534384622563, + "grad_norm": NaN, + "learning_rate": 6.1212737448294695e-06, + "loss": 0.0, + "step": 58617 + }, + { + "epoch": 5.46962769431744, + "grad_norm": NaN, + "learning_rate": 6.119135213026194e-06, + "loss": 0.0, + "step": 58618 + }, + { + "epoch": 5.4697210040123165, + "grad_norm": NaN, + "learning_rate": 6.116997047069339e-06, + "loss": 0.0, + "step": 58619 + }, + { + "epoch": 5.469814313707194, + "grad_norm": NaN, + "learning_rate": 6.114859246964365e-06, + "loss": 0.0, + "step": 58620 + }, + { + "epoch": 5.469907623402071, + "grad_norm": NaN, + "learning_rate": 6.112721812716653e-06, + "loss": 0.0, + "step": 58621 + }, + { + "epoch": 5.470000933096949, + "grad_norm": NaN, + "learning_rate": 6.110584744331681e-06, + "loss": 0.0, + "step": 58622 + }, + { + "epoch": 5.470094242791826, + "grad_norm": NaN, + "learning_rate": 6.108448041814879e-06, + "loss": 0.0, + "step": 58623 + }, + { + "epoch": 5.470187552486704, + "grad_norm": NaN, + "learning_rate": 6.10631170517164e-06, + "loss": 0.0, + "step": 58624 + }, + { + "epoch": 5.470280862181581, + "grad_norm": NaN, + "learning_rate": 6.104175734407463e-06, + "loss": 0.0, + "step": 58625 + }, + { + "epoch": 5.470374171876458, + "grad_norm": NaN, + "learning_rate": 6.102040129527725e-06, + "loss": 0.0, + "step": 58626 + }, + { + "epoch": 5.470467481571335, + "grad_norm": NaN, + "learning_rate": 6.0999048905378545e-06, + "loss": 0.0, + "step": 58627 + }, + { + "epoch": 5.470560791266212, + "grad_norm": NaN, + "learning_rate": 6.097770017443332e-06, + "loss": 0.0, + "step": 58628 + }, + { + "epoch": 5.47065410096109, + "grad_norm": NaN, + "learning_rate": 6.095635510249536e-06, + "loss": 0.0, + "step": 58629 + }, + { + "epoch": 5.470747410655967, + "grad_norm": NaN, + "learning_rate": 6.093501368961895e-06, + "loss": 0.0, + "step": 58630 + }, + { + "epoch": 5.470840720350845, + "grad_norm": NaN, + "learning_rate": 6.091367593585889e-06, + "loss": 0.0, + "step": 58631 + }, + { + "epoch": 5.470934030045722, + "grad_norm": NaN, + "learning_rate": 6.089234184126895e-06, + "loss": 0.0, + "step": 58632 + }, + { + "epoch": 5.4710273397405995, + "grad_norm": NaN, + "learning_rate": 6.087101140590312e-06, + "loss": 0.0, + "step": 58633 + }, + { + "epoch": 5.471120649435476, + "grad_norm": NaN, + "learning_rate": 6.084968462981649e-06, + "loss": 0.0, + "step": 58634 + }, + { + "epoch": 5.4712139591303535, + "grad_norm": NaN, + "learning_rate": 6.082836151306269e-06, + "loss": 0.0, + "step": 58635 + }, + { + "epoch": 5.471307268825231, + "grad_norm": NaN, + "learning_rate": 6.0807042055695695e-06, + "loss": 0.0, + "step": 58636 + }, + { + "epoch": 5.471400578520108, + "grad_norm": NaN, + "learning_rate": 6.078572625777062e-06, + "loss": 0.0, + "step": 58637 + }, + { + "epoch": 5.471493888214986, + "grad_norm": NaN, + "learning_rate": 6.076441411934074e-06, + "loss": 0.0, + "step": 58638 + }, + { + "epoch": 5.471587197909863, + "grad_norm": NaN, + "learning_rate": 6.0743105640460695e-06, + "loss": 0.0, + "step": 58639 + }, + { + "epoch": 5.47168050760474, + "grad_norm": NaN, + "learning_rate": 6.0721800821184775e-06, + "loss": 0.0, + "step": 58640 + }, + { + "epoch": 5.471773817299617, + "grad_norm": NaN, + "learning_rate": 6.070049966156676e-06, + "loss": 0.0, + "step": 58641 + }, + { + "epoch": 5.4718671269944945, + "grad_norm": NaN, + "learning_rate": 6.067920216166094e-06, + "loss": 0.0, + "step": 58642 + }, + { + "epoch": 5.471960436689372, + "grad_norm": NaN, + "learning_rate": 6.065790832152179e-06, + "loss": 0.0, + "step": 58643 + }, + { + "epoch": 5.472053746384249, + "grad_norm": NaN, + "learning_rate": 6.063661814120324e-06, + "loss": 0.0, + "step": 58644 + }, + { + "epoch": 5.472147056079127, + "grad_norm": NaN, + "learning_rate": 6.06153316207591e-06, + "loss": 0.0, + "step": 58645 + }, + { + "epoch": 5.472240365774004, + "grad_norm": NaN, + "learning_rate": 6.059404876024415e-06, + "loss": 0.0, + "step": 58646 + }, + { + "epoch": 5.472333675468882, + "grad_norm": NaN, + "learning_rate": 6.057276955971185e-06, + "loss": 0.0, + "step": 58647 + }, + { + "epoch": 5.472426985163758, + "grad_norm": NaN, + "learning_rate": 6.0551494019216665e-06, + "loss": 0.0, + "step": 58648 + }, + { + "epoch": 5.472520294858636, + "grad_norm": NaN, + "learning_rate": 6.0530222138812864e-06, + "loss": 0.0, + "step": 58649 + }, + { + "epoch": 5.472613604553513, + "grad_norm": NaN, + "learning_rate": 6.050895391855409e-06, + "loss": 0.0, + "step": 58650 + }, + { + "epoch": 5.47270691424839, + "grad_norm": NaN, + "learning_rate": 6.048768935849446e-06, + "loss": 0.0, + "step": 58651 + }, + { + "epoch": 5.472800223943268, + "grad_norm": NaN, + "learning_rate": 6.0466428458688594e-06, + "loss": 0.0, + "step": 58652 + }, + { + "epoch": 5.472893533638145, + "grad_norm": NaN, + "learning_rate": 6.044517121918996e-06, + "loss": 0.0, + "step": 58653 + }, + { + "epoch": 5.472986843333023, + "grad_norm": NaN, + "learning_rate": 6.042391764005283e-06, + "loss": 0.0, + "step": 58654 + }, + { + "epoch": 5.473080153027899, + "grad_norm": NaN, + "learning_rate": 6.040266772133151e-06, + "loss": 0.0, + "step": 58655 + }, + { + "epoch": 5.473173462722777, + "grad_norm": NaN, + "learning_rate": 6.038142146307945e-06, + "loss": 0.0, + "step": 58656 + }, + { + "epoch": 5.473266772417654, + "grad_norm": NaN, + "learning_rate": 6.036017886535111e-06, + "loss": 0.0, + "step": 58657 + }, + { + "epoch": 5.473360082112531, + "grad_norm": NaN, + "learning_rate": 6.03389399282006e-06, + "loss": 0.0, + "step": 58658 + }, + { + "epoch": 5.473453391807409, + "grad_norm": NaN, + "learning_rate": 6.031770465168123e-06, + "loss": 0.0, + "step": 58659 + }, + { + "epoch": 5.473546701502286, + "grad_norm": NaN, + "learning_rate": 6.029647303584778e-06, + "loss": 0.0, + "step": 58660 + }, + { + "epoch": 5.473640011197164, + "grad_norm": NaN, + "learning_rate": 6.027524508075404e-06, + "loss": 0.0, + "step": 58661 + }, + { + "epoch": 5.473733320892041, + "grad_norm": NaN, + "learning_rate": 6.025402078645347e-06, + "loss": 0.0, + "step": 58662 + }, + { + "epoch": 5.473826630586918, + "grad_norm": NaN, + "learning_rate": 6.0232800153000695e-06, + "loss": 0.0, + "step": 58663 + }, + { + "epoch": 5.473919940281795, + "grad_norm": NaN, + "learning_rate": 6.021158318044966e-06, + "loss": 0.0, + "step": 58664 + }, + { + "epoch": 5.4740132499766725, + "grad_norm": NaN, + "learning_rate": 6.019036986885351e-06, + "loss": 0.0, + "step": 58665 + }, + { + "epoch": 5.47410655967155, + "grad_norm": NaN, + "learning_rate": 6.016916021826717e-06, + "loss": 0.0, + "step": 58666 + }, + { + "epoch": 5.474199869366427, + "grad_norm": NaN, + "learning_rate": 6.014795422874413e-06, + "loss": 0.0, + "step": 58667 + }, + { + "epoch": 5.474293179061305, + "grad_norm": NaN, + "learning_rate": 6.0126751900338e-06, + "loss": 0.0, + "step": 58668 + }, + { + "epoch": 5.474386488756182, + "grad_norm": NaN, + "learning_rate": 6.010555323310357e-06, + "loss": 0.0, + "step": 58669 + }, + { + "epoch": 5.474479798451059, + "grad_norm": NaN, + "learning_rate": 6.008435822709379e-06, + "loss": 0.0, + "step": 58670 + }, + { + "epoch": 5.474573108145936, + "grad_norm": NaN, + "learning_rate": 6.006316688236279e-06, + "loss": 0.0, + "step": 58671 + }, + { + "epoch": 5.4746664178408135, + "grad_norm": NaN, + "learning_rate": 6.004197919896503e-06, + "loss": 0.0, + "step": 58672 + }, + { + "epoch": 5.474759727535691, + "grad_norm": NaN, + "learning_rate": 6.002079517695379e-06, + "loss": 0.0, + "step": 58673 + }, + { + "epoch": 5.474853037230568, + "grad_norm": NaN, + "learning_rate": 5.999961481638304e-06, + "loss": 0.0, + "step": 58674 + }, + { + "epoch": 5.474946346925446, + "grad_norm": NaN, + "learning_rate": 5.997843811730691e-06, + "loss": 0.0, + "step": 58675 + }, + { + "epoch": 5.475039656620323, + "grad_norm": NaN, + "learning_rate": 5.995726507977899e-06, + "loss": 0.0, + "step": 58676 + }, + { + "epoch": 5.4751329663152, + "grad_norm": NaN, + "learning_rate": 5.9936095703852936e-06, + "loss": 0.0, + "step": 58677 + }, + { + "epoch": 5.475226276010077, + "grad_norm": NaN, + "learning_rate": 5.991492998958319e-06, + "loss": 0.0, + "step": 58678 + }, + { + "epoch": 5.475319585704955, + "grad_norm": NaN, + "learning_rate": 5.989376793702288e-06, + "loss": 0.0, + "step": 58679 + }, + { + "epoch": 5.475412895399832, + "grad_norm": NaN, + "learning_rate": 5.987260954622614e-06, + "loss": 0.0, + "step": 58680 + }, + { + "epoch": 5.475506205094709, + "grad_norm": NaN, + "learning_rate": 5.98514548172469e-06, + "loss": 0.0, + "step": 58681 + }, + { + "epoch": 5.475599514789587, + "grad_norm": NaN, + "learning_rate": 5.983030375013881e-06, + "loss": 0.0, + "step": 58682 + }, + { + "epoch": 5.475692824484464, + "grad_norm": NaN, + "learning_rate": 5.980915634495548e-06, + "loss": 0.0, + "step": 58683 + }, + { + "epoch": 5.475786134179341, + "grad_norm": NaN, + "learning_rate": 5.978801260175103e-06, + "loss": 0.0, + "step": 58684 + }, + { + "epoch": 5.475879443874218, + "grad_norm": NaN, + "learning_rate": 5.97668725205791e-06, + "loss": 0.0, + "step": 58685 + }, + { + "epoch": 5.475972753569096, + "grad_norm": NaN, + "learning_rate": 5.974573610149297e-06, + "loss": 0.0, + "step": 58686 + }, + { + "epoch": 5.476066063263973, + "grad_norm": NaN, + "learning_rate": 5.9724603344547266e-06, + "loss": 0.0, + "step": 58687 + }, + { + "epoch": 5.4761593729588505, + "grad_norm": NaN, + "learning_rate": 5.97034742497951e-06, + "loss": 0.0, + "step": 58688 + }, + { + "epoch": 5.476252682653728, + "grad_norm": NaN, + "learning_rate": 5.968234881729011e-06, + "loss": 0.0, + "step": 58689 + }, + { + "epoch": 5.476345992348605, + "grad_norm": NaN, + "learning_rate": 5.966122704708676e-06, + "loss": 0.0, + "step": 58690 + }, + { + "epoch": 5.476439302043483, + "grad_norm": NaN, + "learning_rate": 5.964010893923765e-06, + "loss": 0.0, + "step": 58691 + }, + { + "epoch": 5.476532611738359, + "grad_norm": NaN, + "learning_rate": 5.961899449379742e-06, + "loss": 0.0, + "step": 58692 + }, + { + "epoch": 5.476625921433237, + "grad_norm": NaN, + "learning_rate": 5.9597883710819525e-06, + "loss": 0.0, + "step": 58693 + }, + { + "epoch": 5.476719231128114, + "grad_norm": NaN, + "learning_rate": 5.957677659035725e-06, + "loss": 0.0, + "step": 58694 + }, + { + "epoch": 5.4768125408229915, + "grad_norm": NaN, + "learning_rate": 5.9555673132464736e-06, + "loss": 0.0, + "step": 58695 + }, + { + "epoch": 5.476905850517869, + "grad_norm": NaN, + "learning_rate": 5.953457333719558e-06, + "loss": 0.0, + "step": 58696 + }, + { + "epoch": 5.476999160212746, + "grad_norm": NaN, + "learning_rate": 5.9513477204602925e-06, + "loss": 0.0, + "step": 58697 + }, + { + "epoch": 5.477092469907624, + "grad_norm": NaN, + "learning_rate": 5.949238473474105e-06, + "loss": 0.0, + "step": 58698 + }, + { + "epoch": 5.4771857796025, + "grad_norm": NaN, + "learning_rate": 5.947129592766342e-06, + "loss": 0.0, + "step": 58699 + }, + { + "epoch": 5.477279089297378, + "grad_norm": NaN, + "learning_rate": 5.945021078342332e-06, + "loss": 0.0, + "step": 58700 + }, + { + "epoch": 5.477372398992255, + "grad_norm": NaN, + "learning_rate": 5.942912930207472e-06, + "loss": 0.0, + "step": 58701 + }, + { + "epoch": 5.477465708687133, + "grad_norm": NaN, + "learning_rate": 5.940805148367122e-06, + "loss": 0.0, + "step": 58702 + }, + { + "epoch": 5.47755901838201, + "grad_norm": NaN, + "learning_rate": 5.938697732826597e-06, + "loss": 0.0, + "step": 58703 + }, + { + "epoch": 5.477652328076887, + "grad_norm": NaN, + "learning_rate": 5.936590683591325e-06, + "loss": 0.0, + "step": 58704 + }, + { + "epoch": 5.477745637771765, + "grad_norm": NaN, + "learning_rate": 5.934484000666634e-06, + "loss": 0.0, + "step": 58705 + }, + { + "epoch": 5.477838947466642, + "grad_norm": NaN, + "learning_rate": 5.932377684057837e-06, + "loss": 0.0, + "step": 58706 + }, + { + "epoch": 5.477932257161519, + "grad_norm": NaN, + "learning_rate": 5.930271733770347e-06, + "loss": 0.0, + "step": 58707 + }, + { + "epoch": 5.478025566856396, + "grad_norm": NaN, + "learning_rate": 5.9281661498095265e-06, + "loss": 0.0, + "step": 58708 + }, + { + "epoch": 5.478118876551274, + "grad_norm": NaN, + "learning_rate": 5.926060932180654e-06, + "loss": 0.0, + "step": 58709 + }, + { + "epoch": 5.478212186246151, + "grad_norm": NaN, + "learning_rate": 5.923956080889142e-06, + "loss": 0.0, + "step": 58710 + }, + { + "epoch": 5.4783054959410284, + "grad_norm": NaN, + "learning_rate": 5.921851595940369e-06, + "loss": 0.0, + "step": 58711 + }, + { + "epoch": 5.478398805635906, + "grad_norm": NaN, + "learning_rate": 5.919747477339598e-06, + "loss": 0.0, + "step": 58712 + }, + { + "epoch": 5.478492115330782, + "grad_norm": NaN, + "learning_rate": 5.917643725092258e-06, + "loss": 0.0, + "step": 58713 + }, + { + "epoch": 5.47858542502566, + "grad_norm": NaN, + "learning_rate": 5.915540339203678e-06, + "loss": 0.0, + "step": 58714 + }, + { + "epoch": 5.478678734720537, + "grad_norm": NaN, + "learning_rate": 5.913437319679171e-06, + "loss": 0.0, + "step": 58715 + }, + { + "epoch": 5.478772044415415, + "grad_norm": NaN, + "learning_rate": 5.911334666524148e-06, + "loss": 0.0, + "step": 58716 + }, + { + "epoch": 5.478865354110292, + "grad_norm": NaN, + "learning_rate": 5.909232379743889e-06, + "loss": 0.0, + "step": 58717 + }, + { + "epoch": 5.4789586638051695, + "grad_norm": NaN, + "learning_rate": 5.907130459343757e-06, + "loss": 0.0, + "step": 58718 + }, + { + "epoch": 5.479051973500047, + "grad_norm": NaN, + "learning_rate": 5.9050289053291455e-06, + "loss": 0.0, + "step": 58719 + }, + { + "epoch": 5.479145283194924, + "grad_norm": NaN, + "learning_rate": 5.902927717705336e-06, + "loss": 0.0, + "step": 58720 + }, + { + "epoch": 5.479238592889801, + "grad_norm": NaN, + "learning_rate": 5.90082689647769e-06, + "loss": 0.0, + "step": 58721 + }, + { + "epoch": 5.479331902584678, + "grad_norm": NaN, + "learning_rate": 5.898726441651586e-06, + "loss": 0.0, + "step": 58722 + }, + { + "epoch": 5.479425212279556, + "grad_norm": NaN, + "learning_rate": 5.89662635323232e-06, + "loss": 0.0, + "step": 58723 + }, + { + "epoch": 5.479518521974433, + "grad_norm": NaN, + "learning_rate": 5.894526631225222e-06, + "loss": 0.0, + "step": 58724 + }, + { + "epoch": 5.4796118316693105, + "grad_norm": NaN, + "learning_rate": 5.892427275635703e-06, + "loss": 0.0, + "step": 58725 + }, + { + "epoch": 5.479705141364188, + "grad_norm": NaN, + "learning_rate": 5.890328286469026e-06, + "loss": 0.0, + "step": 58726 + }, + { + "epoch": 5.479798451059065, + "grad_norm": NaN, + "learning_rate": 5.8882296637305375e-06, + "loss": 0.0, + "step": 58727 + }, + { + "epoch": 5.479891760753942, + "grad_norm": NaN, + "learning_rate": 5.886131407425632e-06, + "loss": 0.0, + "step": 58728 + }, + { + "epoch": 5.479985070448819, + "grad_norm": NaN, + "learning_rate": 5.884033517559572e-06, + "loss": 0.0, + "step": 58729 + }, + { + "epoch": 5.480078380143697, + "grad_norm": NaN, + "learning_rate": 5.881935994137737e-06, + "loss": 0.0, + "step": 58730 + }, + { + "epoch": 5.480171689838574, + "grad_norm": NaN, + "learning_rate": 5.879838837165474e-06, + "loss": 0.0, + "step": 58731 + }, + { + "epoch": 5.480264999533452, + "grad_norm": NaN, + "learning_rate": 5.877742046648043e-06, + "loss": 0.0, + "step": 58732 + }, + { + "epoch": 5.480358309228329, + "grad_norm": NaN, + "learning_rate": 5.875645622590841e-06, + "loss": 0.0, + "step": 58733 + }, + { + "epoch": 5.480451618923206, + "grad_norm": NaN, + "learning_rate": 5.873549564999197e-06, + "loss": 0.0, + "step": 58734 + }, + { + "epoch": 5.480544928618084, + "grad_norm": NaN, + "learning_rate": 5.87145387387839e-06, + "loss": 0.0, + "step": 58735 + }, + { + "epoch": 5.48063823831296, + "grad_norm": NaN, + "learning_rate": 5.869358549233799e-06, + "loss": 0.0, + "step": 58736 + }, + { + "epoch": 5.480731548007838, + "grad_norm": NaN, + "learning_rate": 5.867263591070753e-06, + "loss": 0.0, + "step": 58737 + }, + { + "epoch": 5.480824857702715, + "grad_norm": NaN, + "learning_rate": 5.865168999394532e-06, + "loss": 0.0, + "step": 58738 + }, + { + "epoch": 5.480918167397593, + "grad_norm": NaN, + "learning_rate": 5.863074774210497e-06, + "loss": 0.0, + "step": 58739 + }, + { + "epoch": 5.48101147709247, + "grad_norm": NaN, + "learning_rate": 5.860980915523994e-06, + "loss": 0.0, + "step": 58740 + }, + { + "epoch": 5.4811047867873475, + "grad_norm": NaN, + "learning_rate": 5.85888742334027e-06, + "loss": 0.0, + "step": 58741 + }, + { + "epoch": 5.481198096482225, + "grad_norm": NaN, + "learning_rate": 5.856794297664735e-06, + "loss": 0.0, + "step": 58742 + }, + { + "epoch": 5.481291406177101, + "grad_norm": NaN, + "learning_rate": 5.854701538502671e-06, + "loss": 0.0, + "step": 58743 + }, + { + "epoch": 5.481384715871979, + "grad_norm": NaN, + "learning_rate": 5.852609145859372e-06, + "loss": 0.0, + "step": 58744 + }, + { + "epoch": 5.481478025566856, + "grad_norm": NaN, + "learning_rate": 5.850517119740217e-06, + "loss": 0.0, + "step": 58745 + }, + { + "epoch": 5.481571335261734, + "grad_norm": NaN, + "learning_rate": 5.848425460150502e-06, + "loss": 0.0, + "step": 58746 + }, + { + "epoch": 5.481664644956611, + "grad_norm": NaN, + "learning_rate": 5.8463341670955066e-06, + "loss": 0.0, + "step": 58747 + }, + { + "epoch": 5.4817579546514885, + "grad_norm": NaN, + "learning_rate": 5.84424324058061e-06, + "loss": 0.0, + "step": 58748 + }, + { + "epoch": 5.481851264346366, + "grad_norm": NaN, + "learning_rate": 5.842152680611106e-06, + "loss": 0.0, + "step": 58749 + }, + { + "epoch": 5.481944574041243, + "grad_norm": NaN, + "learning_rate": 5.840062487192277e-06, + "loss": 0.0, + "step": 58750 + }, + { + "epoch": 5.48203788373612, + "grad_norm": NaN, + "learning_rate": 5.837972660329482e-06, + "loss": 0.0, + "step": 58751 + }, + { + "epoch": 5.482131193430997, + "grad_norm": NaN, + "learning_rate": 5.8358832000280355e-06, + "loss": 0.0, + "step": 58752 + }, + { + "epoch": 5.482224503125875, + "grad_norm": NaN, + "learning_rate": 5.833794106293199e-06, + "loss": 0.0, + "step": 58753 + }, + { + "epoch": 5.482317812820752, + "grad_norm": NaN, + "learning_rate": 5.831705379130336e-06, + "loss": 0.0, + "step": 58754 + }, + { + "epoch": 5.48241112251563, + "grad_norm": NaN, + "learning_rate": 5.829617018544758e-06, + "loss": 0.0, + "step": 58755 + }, + { + "epoch": 5.482504432210507, + "grad_norm": NaN, + "learning_rate": 5.8275290245417275e-06, + "loss": 0.0, + "step": 58756 + }, + { + "epoch": 5.4825977419053835, + "grad_norm": NaN, + "learning_rate": 5.82544139712659e-06, + "loss": 0.0, + "step": 58757 + }, + { + "epoch": 5.482691051600261, + "grad_norm": NaN, + "learning_rate": 5.823354136304676e-06, + "loss": 0.0, + "step": 58758 + }, + { + "epoch": 5.482784361295138, + "grad_norm": NaN, + "learning_rate": 5.821267242081213e-06, + "loss": 0.0, + "step": 58759 + }, + { + "epoch": 5.482877670990016, + "grad_norm": NaN, + "learning_rate": 5.819180714461613e-06, + "loss": 0.0, + "step": 58760 + }, + { + "epoch": 5.482970980684893, + "grad_norm": NaN, + "learning_rate": 5.8170945534510906e-06, + "loss": 0.0, + "step": 58761 + }, + { + "epoch": 5.483064290379771, + "grad_norm": NaN, + "learning_rate": 5.81500875905499e-06, + "loss": 0.0, + "step": 58762 + }, + { + "epoch": 5.483157600074648, + "grad_norm": NaN, + "learning_rate": 5.812923331278624e-06, + "loss": 0.0, + "step": 58763 + }, + { + "epoch": 5.4832509097695254, + "grad_norm": NaN, + "learning_rate": 5.810838270127271e-06, + "loss": 0.0, + "step": 58764 + }, + { + "epoch": 5.483344219464402, + "grad_norm": NaN, + "learning_rate": 5.808753575606245e-06, + "loss": 0.0, + "step": 58765 + }, + { + "epoch": 5.483437529159279, + "grad_norm": NaN, + "learning_rate": 5.806669247720874e-06, + "loss": 0.0, + "step": 58766 + }, + { + "epoch": 5.483530838854157, + "grad_norm": NaN, + "learning_rate": 5.804585286476388e-06, + "loss": 0.0, + "step": 58767 + }, + { + "epoch": 5.483624148549034, + "grad_norm": NaN, + "learning_rate": 5.802501691878147e-06, + "loss": 0.0, + "step": 58768 + }, + { + "epoch": 5.483717458243912, + "grad_norm": NaN, + "learning_rate": 5.80041846393145e-06, + "loss": 0.0, + "step": 58769 + }, + { + "epoch": 5.483810767938789, + "grad_norm": NaN, + "learning_rate": 5.798335602641524e-06, + "loss": 0.0, + "step": 58770 + }, + { + "epoch": 5.4839040776336665, + "grad_norm": NaN, + "learning_rate": 5.796253108013749e-06, + "loss": 0.0, + "step": 58771 + }, + { + "epoch": 5.483997387328543, + "grad_norm": NaN, + "learning_rate": 5.7941709800534034e-06, + "loss": 0.0, + "step": 58772 + }, + { + "epoch": 5.4840906970234204, + "grad_norm": NaN, + "learning_rate": 5.7920892187657166e-06, + "loss": 0.0, + "step": 58773 + }, + { + "epoch": 5.484184006718298, + "grad_norm": NaN, + "learning_rate": 5.790007824156051e-06, + "loss": 0.0, + "step": 58774 + }, + { + "epoch": 5.484277316413175, + "grad_norm": NaN, + "learning_rate": 5.787926796229703e-06, + "loss": 0.0, + "step": 58775 + }, + { + "epoch": 5.484370626108053, + "grad_norm": NaN, + "learning_rate": 5.7858461349919006e-06, + "loss": 0.0, + "step": 58776 + }, + { + "epoch": 5.48446393580293, + "grad_norm": NaN, + "learning_rate": 5.78376584044799e-06, + "loss": 0.0, + "step": 58777 + }, + { + "epoch": 5.4845572454978075, + "grad_norm": NaN, + "learning_rate": 5.781685912603251e-06, + "loss": 0.0, + "step": 58778 + }, + { + "epoch": 5.484650555192685, + "grad_norm": NaN, + "learning_rate": 5.779606351462945e-06, + "loss": 0.0, + "step": 58779 + }, + { + "epoch": 5.4847438648875615, + "grad_norm": NaN, + "learning_rate": 5.777527157032386e-06, + "loss": 0.0, + "step": 58780 + }, + { + "epoch": 5.484837174582439, + "grad_norm": NaN, + "learning_rate": 5.7754483293168675e-06, + "loss": 0.0, + "step": 58781 + }, + { + "epoch": 5.484930484277316, + "grad_norm": NaN, + "learning_rate": 5.773369868321637e-06, + "loss": 0.0, + "step": 58782 + }, + { + "epoch": 5.485023793972194, + "grad_norm": NaN, + "learning_rate": 5.771291774052006e-06, + "loss": 0.0, + "step": 58783 + }, + { + "epoch": 5.485117103667071, + "grad_norm": NaN, + "learning_rate": 5.769214046513288e-06, + "loss": 0.0, + "step": 58784 + }, + { + "epoch": 5.485210413361949, + "grad_norm": NaN, + "learning_rate": 5.767136685710694e-06, + "loss": 0.0, + "step": 58785 + }, + { + "epoch": 5.485303723056826, + "grad_norm": NaN, + "learning_rate": 5.765059691649571e-06, + "loss": 0.0, + "step": 58786 + }, + { + "epoch": 5.4853970327517025, + "grad_norm": NaN, + "learning_rate": 5.76298306433518e-06, + "loss": 0.0, + "step": 58787 + }, + { + "epoch": 5.48549034244658, + "grad_norm": NaN, + "learning_rate": 5.760906803772769e-06, + "loss": 0.0, + "step": 58788 + }, + { + "epoch": 5.485583652141457, + "grad_norm": NaN, + "learning_rate": 5.758830909967666e-06, + "loss": 0.0, + "step": 58789 + }, + { + "epoch": 5.485676961836335, + "grad_norm": NaN, + "learning_rate": 5.756755382925149e-06, + "loss": 0.0, + "step": 58790 + }, + { + "epoch": 5.485770271531212, + "grad_norm": NaN, + "learning_rate": 5.754680222650432e-06, + "loss": 0.0, + "step": 58791 + }, + { + "epoch": 5.48586358122609, + "grad_norm": NaN, + "learning_rate": 5.75260542914886e-06, + "loss": 0.0, + "step": 58792 + }, + { + "epoch": 5.485956890920967, + "grad_norm": NaN, + "learning_rate": 5.750531002425696e-06, + "loss": 0.0, + "step": 58793 + }, + { + "epoch": 5.486050200615844, + "grad_norm": NaN, + "learning_rate": 5.748456942486151e-06, + "loss": 0.0, + "step": 58794 + }, + { + "epoch": 5.486143510310721, + "grad_norm": NaN, + "learning_rate": 5.74638324933559e-06, + "loss": 0.0, + "step": 58795 + }, + { + "epoch": 5.486236820005598, + "grad_norm": NaN, + "learning_rate": 5.744309922979257e-06, + "loss": 0.0, + "step": 58796 + }, + { + "epoch": 5.486330129700476, + "grad_norm": NaN, + "learning_rate": 5.742236963422364e-06, + "loss": 0.0, + "step": 58797 + }, + { + "epoch": 5.486423439395353, + "grad_norm": NaN, + "learning_rate": 5.740164370670258e-06, + "loss": 0.0, + "step": 58798 + }, + { + "epoch": 5.486516749090231, + "grad_norm": NaN, + "learning_rate": 5.7380921447282015e-06, + "loss": 0.0, + "step": 58799 + }, + { + "epoch": 5.486610058785108, + "grad_norm": NaN, + "learning_rate": 5.736020285601406e-06, + "loss": 0.0, + "step": 58800 + }, + { + "epoch": 5.486703368479985, + "grad_norm": NaN, + "learning_rate": 5.733948793295201e-06, + "loss": 0.0, + "step": 58801 + }, + { + "epoch": 5.486796678174862, + "grad_norm": NaN, + "learning_rate": 5.731877667814816e-06, + "loss": 0.0, + "step": 58802 + }, + { + "epoch": 5.4868899878697395, + "grad_norm": NaN, + "learning_rate": 5.729806909165546e-06, + "loss": 0.0, + "step": 58803 + }, + { + "epoch": 5.486983297564617, + "grad_norm": NaN, + "learning_rate": 5.727736517352638e-06, + "loss": 0.0, + "step": 58804 + }, + { + "epoch": 5.487076607259494, + "grad_norm": NaN, + "learning_rate": 5.725666492381337e-06, + "loss": 0.0, + "step": 58805 + }, + { + "epoch": 5.487169916954372, + "grad_norm": NaN, + "learning_rate": 5.723596834256955e-06, + "loss": 0.0, + "step": 58806 + }, + { + "epoch": 5.487263226649249, + "grad_norm": NaN, + "learning_rate": 5.721527542984739e-06, + "loss": 0.0, + "step": 58807 + }, + { + "epoch": 5.487356536344127, + "grad_norm": NaN, + "learning_rate": 5.7194586185698995e-06, + "loss": 0.0, + "step": 58808 + }, + { + "epoch": 5.487449846039003, + "grad_norm": NaN, + "learning_rate": 5.717390061017751e-06, + "loss": 0.0, + "step": 58809 + }, + { + "epoch": 5.4875431557338805, + "grad_norm": NaN, + "learning_rate": 5.715321870333573e-06, + "loss": 0.0, + "step": 58810 + }, + { + "epoch": 5.487636465428758, + "grad_norm": NaN, + "learning_rate": 5.713254046522542e-06, + "loss": 0.0, + "step": 58811 + }, + { + "epoch": 5.487729775123635, + "grad_norm": NaN, + "learning_rate": 5.711186589589989e-06, + "loss": 0.0, + "step": 58812 + }, + { + "epoch": 5.487823084818513, + "grad_norm": NaN, + "learning_rate": 5.709119499541159e-06, + "loss": 0.0, + "step": 58813 + }, + { + "epoch": 5.48791639451339, + "grad_norm": NaN, + "learning_rate": 5.707052776381265e-06, + "loss": 0.0, + "step": 58814 + }, + { + "epoch": 5.488009704208268, + "grad_norm": NaN, + "learning_rate": 5.704986420115621e-06, + "loss": 0.0, + "step": 58815 + }, + { + "epoch": 5.488103013903144, + "grad_norm": NaN, + "learning_rate": 5.702920430749453e-06, + "loss": 0.0, + "step": 58816 + }, + { + "epoch": 5.488196323598022, + "grad_norm": NaN, + "learning_rate": 5.7008548082879915e-06, + "loss": 0.0, + "step": 58817 + }, + { + "epoch": 5.488289633292899, + "grad_norm": NaN, + "learning_rate": 5.698789552736532e-06, + "loss": 0.0, + "step": 58818 + }, + { + "epoch": 5.488382942987776, + "grad_norm": NaN, + "learning_rate": 5.696724664100305e-06, + "loss": 0.0, + "step": 58819 + }, + { + "epoch": 5.488476252682654, + "grad_norm": NaN, + "learning_rate": 5.694660142384538e-06, + "loss": 0.0, + "step": 58820 + }, + { + "epoch": 5.488569562377531, + "grad_norm": NaN, + "learning_rate": 5.6925959875945285e-06, + "loss": 0.0, + "step": 58821 + }, + { + "epoch": 5.488662872072409, + "grad_norm": NaN, + "learning_rate": 5.690532199735504e-06, + "loss": 0.0, + "step": 58822 + }, + { + "epoch": 5.488756181767286, + "grad_norm": NaN, + "learning_rate": 5.688468778812677e-06, + "loss": 0.0, + "step": 58823 + }, + { + "epoch": 5.488849491462163, + "grad_norm": NaN, + "learning_rate": 5.6864057248313445e-06, + "loss": 0.0, + "step": 58824 + }, + { + "epoch": 5.48894280115704, + "grad_norm": NaN, + "learning_rate": 5.6843430377967515e-06, + "loss": 0.0, + "step": 58825 + }, + { + "epoch": 5.4890361108519174, + "grad_norm": NaN, + "learning_rate": 5.682280717714094e-06, + "loss": 0.0, + "step": 58826 + }, + { + "epoch": 5.489129420546795, + "grad_norm": NaN, + "learning_rate": 5.680218764588668e-06, + "loss": 0.0, + "step": 58827 + }, + { + "epoch": 5.489222730241672, + "grad_norm": NaN, + "learning_rate": 5.678157178425702e-06, + "loss": 0.0, + "step": 58828 + }, + { + "epoch": 5.48931603993655, + "grad_norm": NaN, + "learning_rate": 5.676095959230392e-06, + "loss": 0.0, + "step": 58829 + }, + { + "epoch": 5.489409349631426, + "grad_norm": NaN, + "learning_rate": 5.674035107008052e-06, + "loss": 0.0, + "step": 58830 + }, + { + "epoch": 5.489502659326304, + "grad_norm": NaN, + "learning_rate": 5.671974621763892e-06, + "loss": 0.0, + "step": 58831 + }, + { + "epoch": 5.489595969021181, + "grad_norm": NaN, + "learning_rate": 5.669914503503109e-06, + "loss": 0.0, + "step": 58832 + }, + { + "epoch": 5.4896892787160585, + "grad_norm": NaN, + "learning_rate": 5.667854752231016e-06, + "loss": 0.0, + "step": 58833 + }, + { + "epoch": 5.489782588410936, + "grad_norm": NaN, + "learning_rate": 5.665795367952825e-06, + "loss": 0.0, + "step": 58834 + }, + { + "epoch": 5.489875898105813, + "grad_norm": NaN, + "learning_rate": 5.663736350673714e-06, + "loss": 0.0, + "step": 58835 + }, + { + "epoch": 5.489969207800691, + "grad_norm": NaN, + "learning_rate": 5.661677700398998e-06, + "loss": 0.0, + "step": 58836 + }, + { + "epoch": 5.490062517495568, + "grad_norm": NaN, + "learning_rate": 5.659619417133887e-06, + "loss": 0.0, + "step": 58837 + }, + { + "epoch": 5.490155827190445, + "grad_norm": NaN, + "learning_rate": 5.6575615008835785e-06, + "loss": 0.0, + "step": 58838 + }, + { + "epoch": 5.490249136885322, + "grad_norm": NaN, + "learning_rate": 5.655503951653334e-06, + "loss": 0.0, + "step": 58839 + }, + { + "epoch": 5.4903424465801995, + "grad_norm": NaN, + "learning_rate": 5.6534467694483994e-06, + "loss": 0.0, + "step": 58840 + }, + { + "epoch": 5.490435756275077, + "grad_norm": NaN, + "learning_rate": 5.6513899542740046e-06, + "loss": 0.0, + "step": 58841 + }, + { + "epoch": 5.490529065969954, + "grad_norm": NaN, + "learning_rate": 5.649333506135345e-06, + "loss": 0.0, + "step": 58842 + }, + { + "epoch": 5.490622375664832, + "grad_norm": NaN, + "learning_rate": 5.647277425037683e-06, + "loss": 0.0, + "step": 58843 + }, + { + "epoch": 5.490715685359709, + "grad_norm": NaN, + "learning_rate": 5.645221710986231e-06, + "loss": 0.0, + "step": 58844 + }, + { + "epoch": 5.490808995054586, + "grad_norm": NaN, + "learning_rate": 5.643166363986201e-06, + "loss": 0.0, + "step": 58845 + }, + { + "epoch": 5.490902304749463, + "grad_norm": NaN, + "learning_rate": 5.641111384042857e-06, + "loss": 0.0, + "step": 58846 + }, + { + "epoch": 5.490995614444341, + "grad_norm": NaN, + "learning_rate": 5.63905677116141e-06, + "loss": 0.0, + "step": 58847 + }, + { + "epoch": 5.491088924139218, + "grad_norm": NaN, + "learning_rate": 5.637002525347057e-06, + "loss": 0.0, + "step": 58848 + }, + { + "epoch": 5.491182233834095, + "grad_norm": NaN, + "learning_rate": 5.634948646605059e-06, + "loss": 0.0, + "step": 58849 + }, + { + "epoch": 5.491275543528973, + "grad_norm": NaN, + "learning_rate": 5.6328951349406125e-06, + "loss": 0.0, + "step": 58850 + }, + { + "epoch": 5.49136885322385, + "grad_norm": NaN, + "learning_rate": 5.630841990358981e-06, + "loss": 0.0, + "step": 58851 + }, + { + "epoch": 5.491462162918728, + "grad_norm": NaN, + "learning_rate": 5.628789212865309e-06, + "loss": 0.0, + "step": 58852 + }, + { + "epoch": 5.491555472613604, + "grad_norm": NaN, + "learning_rate": 5.626736802464876e-06, + "loss": 0.0, + "step": 58853 + }, + { + "epoch": 5.491648782308482, + "grad_norm": NaN, + "learning_rate": 5.6246847591629115e-06, + "loss": 0.0, + "step": 58854 + }, + { + "epoch": 5.491742092003359, + "grad_norm": NaN, + "learning_rate": 5.622633082964562e-06, + "loss": 0.0, + "step": 58855 + }, + { + "epoch": 5.4918354016982365, + "grad_norm": NaN, + "learning_rate": 5.620581773875104e-06, + "loss": 0.0, + "step": 58856 + }, + { + "epoch": 5.491928711393114, + "grad_norm": NaN, + "learning_rate": 5.61853083189977e-06, + "loss": 0.0, + "step": 58857 + }, + { + "epoch": 5.492022021087991, + "grad_norm": NaN, + "learning_rate": 5.616480257043688e-06, + "loss": 0.0, + "step": 58858 + }, + { + "epoch": 5.492115330782869, + "grad_norm": NaN, + "learning_rate": 5.614430049312169e-06, + "loss": 0.0, + "step": 58859 + }, + { + "epoch": 5.492208640477745, + "grad_norm": NaN, + "learning_rate": 5.612380208710376e-06, + "loss": 0.0, + "step": 58860 + }, + { + "epoch": 5.492301950172623, + "grad_norm": NaN, + "learning_rate": 5.610330735243506e-06, + "loss": 0.0, + "step": 58861 + }, + { + "epoch": 5.4923952598675, + "grad_norm": NaN, + "learning_rate": 5.608281628916822e-06, + "loss": 0.0, + "step": 58862 + }, + { + "epoch": 5.4924885695623775, + "grad_norm": NaN, + "learning_rate": 5.6062328897355016e-06, + "loss": 0.0, + "step": 58863 + }, + { + "epoch": 5.492581879257255, + "grad_norm": NaN, + "learning_rate": 5.604184517704741e-06, + "loss": 0.0, + "step": 58864 + }, + { + "epoch": 5.492675188952132, + "grad_norm": NaN, + "learning_rate": 5.602136512829786e-06, + "loss": 0.0, + "step": 58865 + }, + { + "epoch": 5.49276849864701, + "grad_norm": NaN, + "learning_rate": 5.600088875115832e-06, + "loss": 0.0, + "step": 58866 + }, + { + "epoch": 5.492861808341886, + "grad_norm": NaN, + "learning_rate": 5.598041604568043e-06, + "loss": 0.0, + "step": 58867 + }, + { + "epoch": 5.492955118036764, + "grad_norm": NaN, + "learning_rate": 5.595994701191664e-06, + "loss": 0.0, + "step": 58868 + }, + { + "epoch": 5.493048427731641, + "grad_norm": NaN, + "learning_rate": 5.5939481649919405e-06, + "loss": 0.0, + "step": 58869 + }, + { + "epoch": 5.493141737426519, + "grad_norm": NaN, + "learning_rate": 5.591901995973969e-06, + "loss": 0.0, + "step": 58870 + }, + { + "epoch": 5.493235047121396, + "grad_norm": NaN, + "learning_rate": 5.589856194143044e-06, + "loss": 0.0, + "step": 58871 + }, + { + "epoch": 5.493328356816273, + "grad_norm": NaN, + "learning_rate": 5.587810759504363e-06, + "loss": 0.0, + "step": 58872 + }, + { + "epoch": 5.493421666511151, + "grad_norm": NaN, + "learning_rate": 5.585765692063054e-06, + "loss": 0.0, + "step": 58873 + }, + { + "epoch": 5.493514976206027, + "grad_norm": NaN, + "learning_rate": 5.583720991824381e-06, + "loss": 0.0, + "step": 58874 + }, + { + "epoch": 5.493608285900905, + "grad_norm": NaN, + "learning_rate": 5.581676658793555e-06, + "loss": 0.0, + "step": 58875 + }, + { + "epoch": 5.493701595595782, + "grad_norm": NaN, + "learning_rate": 5.579632692975688e-06, + "loss": 0.0, + "step": 58876 + }, + { + "epoch": 5.49379490529066, + "grad_norm": NaN, + "learning_rate": 5.577589094376078e-06, + "loss": 0.0, + "step": 58877 + }, + { + "epoch": 5.493888214985537, + "grad_norm": NaN, + "learning_rate": 5.57554586299987e-06, + "loss": 0.0, + "step": 58878 + }, + { + "epoch": 5.4939815246804145, + "grad_norm": NaN, + "learning_rate": 5.5735029988522585e-06, + "loss": 0.0, + "step": 58879 + }, + { + "epoch": 5.494074834375292, + "grad_norm": NaN, + "learning_rate": 5.57146050193844e-06, + "loss": 0.0, + "step": 58880 + }, + { + "epoch": 5.494168144070169, + "grad_norm": NaN, + "learning_rate": 5.5694183722636286e-06, + "loss": 0.0, + "step": 58881 + }, + { + "epoch": 5.494261453765046, + "grad_norm": NaN, + "learning_rate": 5.567376609833002e-06, + "loss": 0.0, + "step": 58882 + }, + { + "epoch": 5.494354763459923, + "grad_norm": NaN, + "learning_rate": 5.565335214651756e-06, + "loss": 0.0, + "step": 58883 + }, + { + "epoch": 5.494448073154801, + "grad_norm": NaN, + "learning_rate": 5.56329418672507e-06, + "loss": 0.0, + "step": 58884 + }, + { + "epoch": 5.494541382849678, + "grad_norm": NaN, + "learning_rate": 5.56125352605814e-06, + "loss": 0.0, + "step": 58885 + }, + { + "epoch": 5.4946346925445555, + "grad_norm": NaN, + "learning_rate": 5.559213232656162e-06, + "loss": 0.0, + "step": 58886 + }, + { + "epoch": 5.494728002239433, + "grad_norm": NaN, + "learning_rate": 5.5571733065243315e-06, + "loss": 0.0, + "step": 58887 + }, + { + "epoch": 5.49482131193431, + "grad_norm": NaN, + "learning_rate": 5.555133747667795e-06, + "loss": 0.0, + "step": 58888 + }, + { + "epoch": 5.494914621629187, + "grad_norm": NaN, + "learning_rate": 5.553094556091781e-06, + "loss": 0.0, + "step": 58889 + }, + { + "epoch": 5.495007931324064, + "grad_norm": NaN, + "learning_rate": 5.551055731801468e-06, + "loss": 0.0, + "step": 58890 + }, + { + "epoch": 5.495101241018942, + "grad_norm": NaN, + "learning_rate": 5.54901727480202e-06, + "loss": 0.0, + "step": 58891 + }, + { + "epoch": 5.495194550713819, + "grad_norm": NaN, + "learning_rate": 5.5469791850986325e-06, + "loss": 0.0, + "step": 58892 + }, + { + "epoch": 5.4952878604086965, + "grad_norm": NaN, + "learning_rate": 5.544941462696484e-06, + "loss": 0.0, + "step": 58893 + }, + { + "epoch": 5.495381170103574, + "grad_norm": NaN, + "learning_rate": 5.542904107600754e-06, + "loss": 0.0, + "step": 58894 + }, + { + "epoch": 5.495474479798451, + "grad_norm": NaN, + "learning_rate": 5.540867119816655e-06, + "loss": 0.0, + "step": 58895 + }, + { + "epoch": 5.495567789493329, + "grad_norm": NaN, + "learning_rate": 5.5388304993493e-06, + "loss": 0.0, + "step": 58896 + }, + { + "epoch": 5.495661099188205, + "grad_norm": NaN, + "learning_rate": 5.536794246203919e-06, + "loss": 0.0, + "step": 58897 + }, + { + "epoch": 5.495754408883083, + "grad_norm": NaN, + "learning_rate": 5.5347583603857046e-06, + "loss": 0.0, + "step": 58898 + }, + { + "epoch": 5.49584771857796, + "grad_norm": NaN, + "learning_rate": 5.532722841899772e-06, + "loss": 0.0, + "step": 58899 + }, + { + "epoch": 5.495941028272838, + "grad_norm": NaN, + "learning_rate": 5.530687690751335e-06, + "loss": 0.0, + "step": 58900 + }, + { + "epoch": 5.496034337967715, + "grad_norm": NaN, + "learning_rate": 5.528652906945585e-06, + "loss": 0.0, + "step": 58901 + }, + { + "epoch": 5.496127647662592, + "grad_norm": NaN, + "learning_rate": 5.526618490487655e-06, + "loss": 0.0, + "step": 58902 + }, + { + "epoch": 5.49622095735747, + "grad_norm": NaN, + "learning_rate": 5.52458444138274e-06, + "loss": 0.0, + "step": 58903 + }, + { + "epoch": 5.496314267052346, + "grad_norm": NaN, + "learning_rate": 5.522550759636035e-06, + "loss": 0.0, + "step": 58904 + }, + { + "epoch": 5.496407576747224, + "grad_norm": NaN, + "learning_rate": 5.520517445252653e-06, + "loss": 0.0, + "step": 58905 + }, + { + "epoch": 5.496500886442101, + "grad_norm": NaN, + "learning_rate": 5.518484498237807e-06, + "loss": 0.0, + "step": 58906 + }, + { + "epoch": 5.496594196136979, + "grad_norm": NaN, + "learning_rate": 5.516451918596676e-06, + "loss": 0.0, + "step": 58907 + }, + { + "epoch": 5.496687505831856, + "grad_norm": NaN, + "learning_rate": 5.514419706334372e-06, + "loss": 0.0, + "step": 58908 + }, + { + "epoch": 5.4967808155267335, + "grad_norm": NaN, + "learning_rate": 5.5123878614561245e-06, + "loss": 0.0, + "step": 58909 + }, + { + "epoch": 5.496874125221611, + "grad_norm": NaN, + "learning_rate": 5.510356383967079e-06, + "loss": 0.0, + "step": 58910 + }, + { + "epoch": 5.496967434916487, + "grad_norm": NaN, + "learning_rate": 5.508325273872366e-06, + "loss": 0.0, + "step": 58911 + }, + { + "epoch": 5.497060744611365, + "grad_norm": NaN, + "learning_rate": 5.5062945311772136e-06, + "loss": 0.0, + "step": 58912 + }, + { + "epoch": 5.497154054306242, + "grad_norm": NaN, + "learning_rate": 5.504264155886751e-06, + "loss": 0.0, + "step": 58913 + }, + { + "epoch": 5.49724736400112, + "grad_norm": NaN, + "learning_rate": 5.502234148006107e-06, + "loss": 0.0, + "step": 58914 + }, + { + "epoch": 5.497340673695997, + "grad_norm": NaN, + "learning_rate": 5.500204507540496e-06, + "loss": 0.0, + "step": 58915 + }, + { + "epoch": 5.4974339833908745, + "grad_norm": NaN, + "learning_rate": 5.498175234495078e-06, + "loss": 0.0, + "step": 58916 + }, + { + "epoch": 5.497527293085752, + "grad_norm": NaN, + "learning_rate": 5.496146328874984e-06, + "loss": 0.0, + "step": 58917 + }, + { + "epoch": 5.4976206027806285, + "grad_norm": NaN, + "learning_rate": 5.494117790685376e-06, + "loss": 0.0, + "step": 58918 + }, + { + "epoch": 5.497713912475506, + "grad_norm": NaN, + "learning_rate": 5.492089619931434e-06, + "loss": 0.0, + "step": 58919 + }, + { + "epoch": 5.497807222170383, + "grad_norm": NaN, + "learning_rate": 5.4900618166182855e-06, + "loss": 0.0, + "step": 58920 + }, + { + "epoch": 5.497900531865261, + "grad_norm": NaN, + "learning_rate": 5.488034380751127e-06, + "loss": 0.0, + "step": 58921 + }, + { + "epoch": 5.497993841560138, + "grad_norm": NaN, + "learning_rate": 5.486007312335073e-06, + "loss": 0.0, + "step": 58922 + }, + { + "epoch": 5.498087151255016, + "grad_norm": NaN, + "learning_rate": 5.4839806113752995e-06, + "loss": 0.0, + "step": 58923 + }, + { + "epoch": 5.498180460949893, + "grad_norm": NaN, + "learning_rate": 5.481954277876955e-06, + "loss": 0.0, + "step": 58924 + }, + { + "epoch": 5.49827377064477, + "grad_norm": NaN, + "learning_rate": 5.479928311845183e-06, + "loss": 0.0, + "step": 58925 + }, + { + "epoch": 5.498367080339647, + "grad_norm": NaN, + "learning_rate": 5.477902713285148e-06, + "loss": 0.0, + "step": 58926 + }, + { + "epoch": 5.498460390034524, + "grad_norm": NaN, + "learning_rate": 5.475877482201996e-06, + "loss": 0.0, + "step": 58927 + }, + { + "epoch": 5.498553699729402, + "grad_norm": NaN, + "learning_rate": 5.473852618600872e-06, + "loss": 0.0, + "step": 58928 + }, + { + "epoch": 5.498647009424279, + "grad_norm": NaN, + "learning_rate": 5.471828122486937e-06, + "loss": 0.0, + "step": 58929 + }, + { + "epoch": 5.498740319119157, + "grad_norm": NaN, + "learning_rate": 5.469803993865341e-06, + "loss": 0.0, + "step": 58930 + }, + { + "epoch": 5.498833628814034, + "grad_norm": NaN, + "learning_rate": 5.467780232741193e-06, + "loss": 0.0, + "step": 58931 + }, + { + "epoch": 5.4989269385089115, + "grad_norm": NaN, + "learning_rate": 5.465756839119689e-06, + "loss": 0.0, + "step": 58932 + }, + { + "epoch": 5.499020248203788, + "grad_norm": NaN, + "learning_rate": 5.463733813005944e-06, + "loss": 0.0, + "step": 58933 + }, + { + "epoch": 5.499113557898665, + "grad_norm": NaN, + "learning_rate": 5.461711154405102e-06, + "loss": 0.0, + "step": 58934 + }, + { + "epoch": 5.499206867593543, + "grad_norm": NaN, + "learning_rate": 5.459688863322326e-06, + "loss": 0.0, + "step": 58935 + }, + { + "epoch": 5.49930017728842, + "grad_norm": NaN, + "learning_rate": 5.4576669397627285e-06, + "loss": 0.0, + "step": 58936 + }, + { + "epoch": 5.499393486983298, + "grad_norm": NaN, + "learning_rate": 5.455645383731488e-06, + "loss": 0.0, + "step": 58937 + }, + { + "epoch": 5.499486796678175, + "grad_norm": NaN, + "learning_rate": 5.453624195233703e-06, + "loss": 0.0, + "step": 58938 + }, + { + "epoch": 5.4995801063730525, + "grad_norm": NaN, + "learning_rate": 5.451603374274566e-06, + "loss": 0.0, + "step": 58939 + }, + { + "epoch": 5.49967341606793, + "grad_norm": NaN, + "learning_rate": 5.4495829208591415e-06, + "loss": 0.0, + "step": 58940 + }, + { + "epoch": 5.4997667257628065, + "grad_norm": NaN, + "learning_rate": 5.447562834992625e-06, + "loss": 0.0, + "step": 58941 + }, + { + "epoch": 5.499860035457684, + "grad_norm": NaN, + "learning_rate": 5.445543116680162e-06, + "loss": 0.0, + "step": 58942 + }, + { + "epoch": 5.499953345152561, + "grad_norm": NaN, + "learning_rate": 5.443523765926833e-06, + "loss": 0.0, + "step": 58943 + }, + { + "epoch": 5.500046654847439, + "grad_norm": NaN, + "learning_rate": 5.441504782737799e-06, + "loss": 0.0, + "step": 58944 + }, + { + "epoch": 5.500139964542316, + "grad_norm": NaN, + "learning_rate": 5.4394861671182234e-06, + "loss": 0.0, + "step": 58945 + }, + { + "epoch": 5.5002332742371935, + "grad_norm": NaN, + "learning_rate": 5.437467919073185e-06, + "loss": 0.0, + "step": 58946 + }, + { + "epoch": 5.50032658393207, + "grad_norm": NaN, + "learning_rate": 5.435450038607847e-06, + "loss": 0.0, + "step": 58947 + }, + { + "epoch": 5.5004198936269475, + "grad_norm": NaN, + "learning_rate": 5.433432525727371e-06, + "loss": 0.0, + "step": 58948 + }, + { + "epoch": 5.500513203321825, + "grad_norm": NaN, + "learning_rate": 5.431415380436804e-06, + "loss": 0.0, + "step": 58949 + }, + { + "epoch": 5.500606513016702, + "grad_norm": NaN, + "learning_rate": 5.429398602741342e-06, + "loss": 0.0, + "step": 58950 + }, + { + "epoch": 5.50069982271158, + "grad_norm": NaN, + "learning_rate": 5.427382192646129e-06, + "loss": 0.0, + "step": 58951 + }, + { + "epoch": 5.500793132406457, + "grad_norm": NaN, + "learning_rate": 5.425366150156196e-06, + "loss": 0.0, + "step": 58952 + }, + { + "epoch": 5.500886442101335, + "grad_norm": NaN, + "learning_rate": 5.423350475276772e-06, + "loss": 0.0, + "step": 58953 + }, + { + "epoch": 5.500979751796212, + "grad_norm": NaN, + "learning_rate": 5.421335168012919e-06, + "loss": 0.0, + "step": 58954 + }, + { + "epoch": 5.5010730614910885, + "grad_norm": NaN, + "learning_rate": 5.4193202283698e-06, + "loss": 0.0, + "step": 58955 + }, + { + "epoch": 5.501166371185966, + "grad_norm": NaN, + "learning_rate": 5.4173056563525105e-06, + "loss": 0.0, + "step": 58956 + }, + { + "epoch": 5.501259680880843, + "grad_norm": NaN, + "learning_rate": 5.415291451966197e-06, + "loss": 0.0, + "step": 58957 + }, + { + "epoch": 5.501352990575721, + "grad_norm": NaN, + "learning_rate": 5.413277615215955e-06, + "loss": 0.0, + "step": 58958 + }, + { + "epoch": 5.501446300270598, + "grad_norm": NaN, + "learning_rate": 5.411264146106931e-06, + "loss": 0.0, + "step": 58959 + }, + { + "epoch": 5.501539609965476, + "grad_norm": NaN, + "learning_rate": 5.409251044644219e-06, + "loss": 0.0, + "step": 58960 + }, + { + "epoch": 5.501632919660353, + "grad_norm": NaN, + "learning_rate": 5.407238310832951e-06, + "loss": 0.0, + "step": 58961 + }, + { + "epoch": 5.50172622935523, + "grad_norm": NaN, + "learning_rate": 5.4052259446782545e-06, + "loss": 0.0, + "step": 58962 + }, + { + "epoch": 5.501819539050107, + "grad_norm": NaN, + "learning_rate": 5.4032139461852255e-06, + "loss": 0.0, + "step": 58963 + }, + { + "epoch": 5.501912848744984, + "grad_norm": NaN, + "learning_rate": 5.40120231535901e-06, + "loss": 0.0, + "step": 58964 + }, + { + "epoch": 5.502006158439862, + "grad_norm": NaN, + "learning_rate": 5.399191052204688e-06, + "loss": 0.0, + "step": 58965 + }, + { + "epoch": 5.502099468134739, + "grad_norm": NaN, + "learning_rate": 5.397180156727387e-06, + "loss": 0.0, + "step": 58966 + }, + { + "epoch": 5.502192777829617, + "grad_norm": NaN, + "learning_rate": 5.395169628932222e-06, + "loss": 0.0, + "step": 58967 + }, + { + "epoch": 5.502286087524494, + "grad_norm": NaN, + "learning_rate": 5.393159468824321e-06, + "loss": 0.0, + "step": 58968 + }, + { + "epoch": 5.5023793972193715, + "grad_norm": NaN, + "learning_rate": 5.391149676408762e-06, + "loss": 0.0, + "step": 58969 + }, + { + "epoch": 5.502472706914248, + "grad_norm": NaN, + "learning_rate": 5.389140251690694e-06, + "loss": 0.0, + "step": 58970 + }, + { + "epoch": 5.5025660166091255, + "grad_norm": NaN, + "learning_rate": 5.387131194675176e-06, + "loss": 0.0, + "step": 58971 + }, + { + "epoch": 5.502659326304003, + "grad_norm": NaN, + "learning_rate": 5.385122505367373e-06, + "loss": 0.0, + "step": 58972 + }, + { + "epoch": 5.50275263599888, + "grad_norm": NaN, + "learning_rate": 5.383114183772347e-06, + "loss": 0.0, + "step": 58973 + }, + { + "epoch": 5.502845945693758, + "grad_norm": NaN, + "learning_rate": 5.381106229895227e-06, + "loss": 0.0, + "step": 58974 + }, + { + "epoch": 5.502939255388635, + "grad_norm": NaN, + "learning_rate": 5.379098643741125e-06, + "loss": 0.0, + "step": 58975 + }, + { + "epoch": 5.503032565083512, + "grad_norm": NaN, + "learning_rate": 5.377091425315122e-06, + "loss": 0.0, + "step": 58976 + }, + { + "epoch": 5.503125874778389, + "grad_norm": NaN, + "learning_rate": 5.375084574622346e-06, + "loss": 0.0, + "step": 58977 + }, + { + "epoch": 5.5032191844732665, + "grad_norm": NaN, + "learning_rate": 5.373078091667876e-06, + "loss": 0.0, + "step": 58978 + }, + { + "epoch": 5.503312494168144, + "grad_norm": NaN, + "learning_rate": 5.371071976456842e-06, + "loss": 0.0, + "step": 58979 + }, + { + "epoch": 5.503405803863021, + "grad_norm": NaN, + "learning_rate": 5.369066228994323e-06, + "loss": 0.0, + "step": 58980 + }, + { + "epoch": 5.503499113557899, + "grad_norm": NaN, + "learning_rate": 5.367060849285432e-06, + "loss": 0.0, + "step": 58981 + }, + { + "epoch": 5.503592423252776, + "grad_norm": NaN, + "learning_rate": 5.365055837335247e-06, + "loss": 0.0, + "step": 58982 + }, + { + "epoch": 5.503685732947654, + "grad_norm": NaN, + "learning_rate": 5.363051193148915e-06, + "loss": 0.0, + "step": 58983 + }, + { + "epoch": 5.503779042642531, + "grad_norm": NaN, + "learning_rate": 5.361046916731449e-06, + "loss": 0.0, + "step": 58984 + }, + { + "epoch": 5.503872352337408, + "grad_norm": NaN, + "learning_rate": 5.359043008088026e-06, + "loss": 0.0, + "step": 58985 + }, + { + "epoch": 5.503965662032285, + "grad_norm": NaN, + "learning_rate": 5.357039467223711e-06, + "loss": 0.0, + "step": 58986 + }, + { + "epoch": 5.504058971727162, + "grad_norm": NaN, + "learning_rate": 5.355036294143583e-06, + "loss": 0.0, + "step": 58987 + }, + { + "epoch": 5.50415228142204, + "grad_norm": NaN, + "learning_rate": 5.3530334888527536e-06, + "loss": 0.0, + "step": 58988 + }, + { + "epoch": 5.504245591116917, + "grad_norm": NaN, + "learning_rate": 5.351031051356319e-06, + "loss": 0.0, + "step": 58989 + }, + { + "epoch": 5.504338900811795, + "grad_norm": NaN, + "learning_rate": 5.349028981659359e-06, + "loss": 0.0, + "step": 58990 + }, + { + "epoch": 5.504432210506671, + "grad_norm": NaN, + "learning_rate": 5.347027279766985e-06, + "loss": 0.0, + "step": 58991 + }, + { + "epoch": 5.504525520201549, + "grad_norm": NaN, + "learning_rate": 5.345025945684261e-06, + "loss": 0.0, + "step": 58992 + }, + { + "epoch": 5.504618829896426, + "grad_norm": NaN, + "learning_rate": 5.343024979416282e-06, + "loss": 0.0, + "step": 58993 + }, + { + "epoch": 5.5047121395913035, + "grad_norm": NaN, + "learning_rate": 5.341024380968145e-06, + "loss": 0.0, + "step": 58994 + }, + { + "epoch": 5.504805449286181, + "grad_norm": NaN, + "learning_rate": 5.3390241503449285e-06, + "loss": 0.0, + "step": 58995 + }, + { + "epoch": 5.504898758981058, + "grad_norm": NaN, + "learning_rate": 5.337024287551711e-06, + "loss": 0.0, + "step": 58996 + }, + { + "epoch": 5.504992068675936, + "grad_norm": NaN, + "learning_rate": 5.3350247925936065e-06, + "loss": 0.0, + "step": 58997 + }, + { + "epoch": 5.505085378370813, + "grad_norm": NaN, + "learning_rate": 5.333025665475677e-06, + "loss": 0.0, + "step": 58998 + }, + { + "epoch": 5.50517868806569, + "grad_norm": NaN, + "learning_rate": 5.331026906203001e-06, + "loss": 0.0, + "step": 58999 + }, + { + "epoch": 5.505271997760567, + "grad_norm": NaN, + "learning_rate": 5.3290285147806925e-06, + "loss": 0.0, + "step": 59000 + }, + { + "epoch": 5.5053653074554445, + "grad_norm": NaN, + "learning_rate": 5.32703049121378e-06, + "loss": 0.0, + "step": 59001 + }, + { + "epoch": 5.505458617150322, + "grad_norm": NaN, + "learning_rate": 5.325032835507393e-06, + "loss": 0.0, + "step": 59002 + }, + { + "epoch": 5.505551926845199, + "grad_norm": NaN, + "learning_rate": 5.323035547666593e-06, + "loss": 0.0, + "step": 59003 + }, + { + "epoch": 5.505645236540077, + "grad_norm": NaN, + "learning_rate": 5.321038627696445e-06, + "loss": 0.0, + "step": 59004 + }, + { + "epoch": 5.505738546234954, + "grad_norm": NaN, + "learning_rate": 5.319042075602043e-06, + "loss": 0.0, + "step": 59005 + }, + { + "epoch": 5.505831855929831, + "grad_norm": NaN, + "learning_rate": 5.317045891388466e-06, + "loss": 0.0, + "step": 59006 + }, + { + "epoch": 5.505925165624708, + "grad_norm": NaN, + "learning_rate": 5.3150500750607784e-06, + "loss": 0.0, + "step": 59007 + }, + { + "epoch": 5.5060184753195855, + "grad_norm": NaN, + "learning_rate": 5.313054626624058e-06, + "loss": 0.0, + "step": 59008 + }, + { + "epoch": 5.506111785014463, + "grad_norm": NaN, + "learning_rate": 5.311059546083369e-06, + "loss": 0.0, + "step": 59009 + }, + { + "epoch": 5.50620509470934, + "grad_norm": NaN, + "learning_rate": 5.309064833443805e-06, + "loss": 0.0, + "step": 59010 + }, + { + "epoch": 5.506298404404218, + "grad_norm": NaN, + "learning_rate": 5.3070704887104295e-06, + "loss": 0.0, + "step": 59011 + }, + { + "epoch": 5.506391714099095, + "grad_norm": NaN, + "learning_rate": 5.3050765118883235e-06, + "loss": 0.0, + "step": 59012 + }, + { + "epoch": 5.506485023793973, + "grad_norm": NaN, + "learning_rate": 5.30308290298253e-06, + "loss": 0.0, + "step": 59013 + }, + { + "epoch": 5.506578333488849, + "grad_norm": NaN, + "learning_rate": 5.301089661998148e-06, + "loss": 0.0, + "step": 59014 + }, + { + "epoch": 5.506671643183727, + "grad_norm": NaN, + "learning_rate": 5.299096788940221e-06, + "loss": 0.0, + "step": 59015 + }, + { + "epoch": 5.506764952878604, + "grad_norm": NaN, + "learning_rate": 5.29710428381383e-06, + "loss": 0.0, + "step": 59016 + }, + { + "epoch": 5.506858262573481, + "grad_norm": NaN, + "learning_rate": 5.2951121466240364e-06, + "loss": 0.0, + "step": 59017 + }, + { + "epoch": 5.506951572268359, + "grad_norm": NaN, + "learning_rate": 5.29312037737592e-06, + "loss": 0.0, + "step": 59018 + }, + { + "epoch": 5.507044881963236, + "grad_norm": NaN, + "learning_rate": 5.291128976074527e-06, + "loss": 0.0, + "step": 59019 + }, + { + "epoch": 5.507138191658113, + "grad_norm": NaN, + "learning_rate": 5.289137942724919e-06, + "loss": 0.0, + "step": 59020 + }, + { + "epoch": 5.50723150135299, + "grad_norm": NaN, + "learning_rate": 5.287147277332177e-06, + "loss": 0.0, + "step": 59021 + }, + { + "epoch": 5.507324811047868, + "grad_norm": NaN, + "learning_rate": 5.285156979901345e-06, + "loss": 0.0, + "step": 59022 + }, + { + "epoch": 5.507418120742745, + "grad_norm": NaN, + "learning_rate": 5.283167050437487e-06, + "loss": 0.0, + "step": 59023 + }, + { + "epoch": 5.5075114304376225, + "grad_norm": NaN, + "learning_rate": 5.281177488945682e-06, + "loss": 0.0, + "step": 59024 + }, + { + "epoch": 5.5076047401325, + "grad_norm": NaN, + "learning_rate": 5.279188295430958e-06, + "loss": 0.0, + "step": 59025 + }, + { + "epoch": 5.507698049827377, + "grad_norm": NaN, + "learning_rate": 5.277199469898397e-06, + "loss": 0.0, + "step": 59026 + }, + { + "epoch": 5.507791359522255, + "grad_norm": NaN, + "learning_rate": 5.275211012353042e-06, + "loss": 0.0, + "step": 59027 + }, + { + "epoch": 5.507884669217132, + "grad_norm": NaN, + "learning_rate": 5.273222922799958e-06, + "loss": 0.0, + "step": 59028 + }, + { + "epoch": 5.507977978912009, + "grad_norm": NaN, + "learning_rate": 5.271235201244206e-06, + "loss": 0.0, + "step": 59029 + }, + { + "epoch": 5.508071288606886, + "grad_norm": NaN, + "learning_rate": 5.269247847690816e-06, + "loss": 0.0, + "step": 59030 + }, + { + "epoch": 5.5081645983017635, + "grad_norm": NaN, + "learning_rate": 5.267260862144867e-06, + "loss": 0.0, + "step": 59031 + }, + { + "epoch": 5.508257907996641, + "grad_norm": NaN, + "learning_rate": 5.265274244611406e-06, + "loss": 0.0, + "step": 59032 + }, + { + "epoch": 5.508351217691518, + "grad_norm": NaN, + "learning_rate": 5.263287995095461e-06, + "loss": 0.0, + "step": 59033 + }, + { + "epoch": 5.508444527386396, + "grad_norm": NaN, + "learning_rate": 5.261302113602112e-06, + "loss": 0.0, + "step": 59034 + }, + { + "epoch": 5.508537837081272, + "grad_norm": NaN, + "learning_rate": 5.259316600136387e-06, + "loss": 0.0, + "step": 59035 + }, + { + "epoch": 5.50863114677615, + "grad_norm": NaN, + "learning_rate": 5.257331454703367e-06, + "loss": 0.0, + "step": 59036 + }, + { + "epoch": 5.508724456471027, + "grad_norm": NaN, + "learning_rate": 5.255346677308047e-06, + "loss": 0.0, + "step": 59037 + }, + { + "epoch": 5.508817766165905, + "grad_norm": NaN, + "learning_rate": 5.253362267955524e-06, + "loss": 0.0, + "step": 59038 + }, + { + "epoch": 5.508911075860782, + "grad_norm": NaN, + "learning_rate": 5.251378226650826e-06, + "loss": 0.0, + "step": 59039 + }, + { + "epoch": 5.509004385555659, + "grad_norm": NaN, + "learning_rate": 5.249394553398983e-06, + "loss": 0.0, + "step": 59040 + }, + { + "epoch": 5.509097695250537, + "grad_norm": NaN, + "learning_rate": 5.247411248205058e-06, + "loss": 0.0, + "step": 59041 + }, + { + "epoch": 5.509191004945414, + "grad_norm": NaN, + "learning_rate": 5.2454283110740795e-06, + "loss": 0.0, + "step": 59042 + }, + { + "epoch": 5.509284314640291, + "grad_norm": NaN, + "learning_rate": 5.2434457420111094e-06, + "loss": 0.0, + "step": 59043 + }, + { + "epoch": 5.509377624335168, + "grad_norm": NaN, + "learning_rate": 5.2414635410211795e-06, + "loss": 0.0, + "step": 59044 + }, + { + "epoch": 5.509470934030046, + "grad_norm": NaN, + "learning_rate": 5.239481708109317e-06, + "loss": 0.0, + "step": 59045 + }, + { + "epoch": 5.509564243724923, + "grad_norm": NaN, + "learning_rate": 5.237500243280568e-06, + "loss": 0.0, + "step": 59046 + }, + { + "epoch": 5.5096575534198005, + "grad_norm": NaN, + "learning_rate": 5.235519146539979e-06, + "loss": 0.0, + "step": 59047 + }, + { + "epoch": 5.509750863114678, + "grad_norm": NaN, + "learning_rate": 5.2335384178925795e-06, + "loss": 0.0, + "step": 59048 + }, + { + "epoch": 5.509844172809555, + "grad_norm": NaN, + "learning_rate": 5.231558057343416e-06, + "loss": 0.0, + "step": 59049 + }, + { + "epoch": 5.509937482504432, + "grad_norm": NaN, + "learning_rate": 5.229578064897516e-06, + "loss": 0.0, + "step": 59050 + }, + { + "epoch": 5.510030792199309, + "grad_norm": NaN, + "learning_rate": 5.22759844055991e-06, + "loss": 0.0, + "step": 59051 + }, + { + "epoch": 5.510124101894187, + "grad_norm": NaN, + "learning_rate": 5.225619184335628e-06, + "loss": 0.0, + "step": 59052 + }, + { + "epoch": 5.510217411589064, + "grad_norm": NaN, + "learning_rate": 5.2236402962297305e-06, + "loss": 0.0, + "step": 59053 + }, + { + "epoch": 5.5103107212839415, + "grad_norm": NaN, + "learning_rate": 5.221661776247216e-06, + "loss": 0.0, + "step": 59054 + }, + { + "epoch": 5.510404030978819, + "grad_norm": NaN, + "learning_rate": 5.219683624393128e-06, + "loss": 0.0, + "step": 59055 + }, + { + "epoch": 5.510497340673696, + "grad_norm": NaN, + "learning_rate": 5.2177058406725146e-06, + "loss": 0.0, + "step": 59056 + }, + { + "epoch": 5.510590650368574, + "grad_norm": NaN, + "learning_rate": 5.215728425090371e-06, + "loss": 0.0, + "step": 59057 + }, + { + "epoch": 5.51068396006345, + "grad_norm": NaN, + "learning_rate": 5.213751377651743e-06, + "loss": 0.0, + "step": 59058 + }, + { + "epoch": 5.510777269758328, + "grad_norm": NaN, + "learning_rate": 5.2117746983616595e-06, + "loss": 0.0, + "step": 59059 + }, + { + "epoch": 5.510870579453205, + "grad_norm": NaN, + "learning_rate": 5.2097983872251505e-06, + "loss": 0.0, + "step": 59060 + }, + { + "epoch": 5.5109638891480826, + "grad_norm": NaN, + "learning_rate": 5.207822444247229e-06, + "loss": 0.0, + "step": 59061 + }, + { + "epoch": 5.51105719884296, + "grad_norm": NaN, + "learning_rate": 5.205846869432923e-06, + "loss": 0.0, + "step": 59062 + }, + { + "epoch": 5.511150508537837, + "grad_norm": NaN, + "learning_rate": 5.203871662787245e-06, + "loss": 0.0, + "step": 59063 + }, + { + "epoch": 5.511243818232714, + "grad_norm": NaN, + "learning_rate": 5.201896824315244e-06, + "loss": 0.0, + "step": 59064 + }, + { + "epoch": 5.511337127927591, + "grad_norm": NaN, + "learning_rate": 5.199922354021929e-06, + "loss": 0.0, + "step": 59065 + }, + { + "epoch": 5.511430437622469, + "grad_norm": NaN, + "learning_rate": 5.197948251912315e-06, + "loss": 0.0, + "step": 59066 + }, + { + "epoch": 5.511523747317346, + "grad_norm": NaN, + "learning_rate": 5.1959745179914306e-06, + "loss": 0.0, + "step": 59067 + }, + { + "epoch": 5.511617057012224, + "grad_norm": NaN, + "learning_rate": 5.194001152264271e-06, + "loss": 0.0, + "step": 59068 + }, + { + "epoch": 5.511710366707101, + "grad_norm": NaN, + "learning_rate": 5.1920281547358834e-06, + "loss": 0.0, + "step": 59069 + }, + { + "epoch": 5.511803676401978, + "grad_norm": NaN, + "learning_rate": 5.190055525411279e-06, + "loss": 0.0, + "step": 59070 + }, + { + "epoch": 5.511896986096856, + "grad_norm": NaN, + "learning_rate": 5.188083264295456e-06, + "loss": 0.0, + "step": 59071 + }, + { + "epoch": 5.511990295791732, + "grad_norm": NaN, + "learning_rate": 5.186111371393459e-06, + "loss": 0.0, + "step": 59072 + }, + { + "epoch": 5.51208360548661, + "grad_norm": NaN, + "learning_rate": 5.184139846710267e-06, + "loss": 0.0, + "step": 59073 + }, + { + "epoch": 5.512176915181487, + "grad_norm": NaN, + "learning_rate": 5.18216869025091e-06, + "loss": 0.0, + "step": 59074 + }, + { + "epoch": 5.512270224876365, + "grad_norm": NaN, + "learning_rate": 5.1801979020204e-06, + "loss": 0.0, + "step": 59075 + }, + { + "epoch": 5.512363534571242, + "grad_norm": NaN, + "learning_rate": 5.17822748202375e-06, + "loss": 0.0, + "step": 59076 + }, + { + "epoch": 5.5124568442661195, + "grad_norm": NaN, + "learning_rate": 5.176257430265973e-06, + "loss": 0.0, + "step": 59077 + }, + { + "epoch": 5.512550153960997, + "grad_norm": NaN, + "learning_rate": 5.174287746752065e-06, + "loss": 0.0, + "step": 59078 + }, + { + "epoch": 5.512643463655873, + "grad_norm": NaN, + "learning_rate": 5.172318431487055e-06, + "loss": 0.0, + "step": 59079 + }, + { + "epoch": 5.512736773350751, + "grad_norm": NaN, + "learning_rate": 5.170349484475922e-06, + "loss": 0.0, + "step": 59080 + }, + { + "epoch": 5.512830083045628, + "grad_norm": NaN, + "learning_rate": 5.168380905723713e-06, + "loss": 0.0, + "step": 59081 + }, + { + "epoch": 5.512923392740506, + "grad_norm": NaN, + "learning_rate": 5.166412695235389e-06, + "loss": 0.0, + "step": 59082 + }, + { + "epoch": 5.513016702435383, + "grad_norm": NaN, + "learning_rate": 5.164444853015981e-06, + "loss": 0.0, + "step": 59083 + }, + { + "epoch": 5.5131100121302605, + "grad_norm": NaN, + "learning_rate": 5.1624773790704845e-06, + "loss": 0.0, + "step": 59084 + }, + { + "epoch": 5.513203321825138, + "grad_norm": NaN, + "learning_rate": 5.160510273403895e-06, + "loss": 0.0, + "step": 59085 + }, + { + "epoch": 5.513296631520015, + "grad_norm": NaN, + "learning_rate": 5.158543536021242e-06, + "loss": 0.0, + "step": 59086 + }, + { + "epoch": 5.513389941214892, + "grad_norm": NaN, + "learning_rate": 5.1565771669274895e-06, + "loss": 0.0, + "step": 59087 + }, + { + "epoch": 5.513483250909769, + "grad_norm": NaN, + "learning_rate": 5.154611166127665e-06, + "loss": 0.0, + "step": 59088 + }, + { + "epoch": 5.513576560604647, + "grad_norm": NaN, + "learning_rate": 5.152645533626765e-06, + "loss": 0.0, + "step": 59089 + }, + { + "epoch": 5.513669870299524, + "grad_norm": NaN, + "learning_rate": 5.150680269429769e-06, + "loss": 0.0, + "step": 59090 + }, + { + "epoch": 5.513763179994402, + "grad_norm": NaN, + "learning_rate": 5.1487153735416895e-06, + "loss": 0.0, + "step": 59091 + }, + { + "epoch": 5.513856489689279, + "grad_norm": NaN, + "learning_rate": 5.146750845967506e-06, + "loss": 0.0, + "step": 59092 + }, + { + "epoch": 5.5139497993841555, + "grad_norm": NaN, + "learning_rate": 5.144786686712249e-06, + "loss": 0.0, + "step": 59093 + }, + { + "epoch": 5.514043109079033, + "grad_norm": NaN, + "learning_rate": 5.1428228957808795e-06, + "loss": 0.0, + "step": 59094 + }, + { + "epoch": 5.51413641877391, + "grad_norm": NaN, + "learning_rate": 5.140859473178394e-06, + "loss": 0.0, + "step": 59095 + }, + { + "epoch": 5.514229728468788, + "grad_norm": NaN, + "learning_rate": 5.138896418909804e-06, + "loss": 0.0, + "step": 59096 + }, + { + "epoch": 5.514323038163665, + "grad_norm": NaN, + "learning_rate": 5.1369337329800754e-06, + "loss": 0.0, + "step": 59097 + }, + { + "epoch": 5.514416347858543, + "grad_norm": NaN, + "learning_rate": 5.134971415394235e-06, + "loss": 0.0, + "step": 59098 + }, + { + "epoch": 5.51450965755342, + "grad_norm": NaN, + "learning_rate": 5.133009466157229e-06, + "loss": 0.0, + "step": 59099 + }, + { + "epoch": 5.5146029672482975, + "grad_norm": NaN, + "learning_rate": 5.131047885274087e-06, + "loss": 0.0, + "step": 59100 + }, + { + "epoch": 5.514696276943175, + "grad_norm": NaN, + "learning_rate": 5.129086672749755e-06, + "loss": 0.0, + "step": 59101 + }, + { + "epoch": 5.514789586638051, + "grad_norm": NaN, + "learning_rate": 5.127125828589263e-06, + "loss": 0.0, + "step": 59102 + }, + { + "epoch": 5.514882896332929, + "grad_norm": NaN, + "learning_rate": 5.125165352797555e-06, + "loss": 0.0, + "step": 59103 + }, + { + "epoch": 5.514976206027806, + "grad_norm": NaN, + "learning_rate": 5.123205245379647e-06, + "loss": 0.0, + "step": 59104 + }, + { + "epoch": 5.515069515722684, + "grad_norm": NaN, + "learning_rate": 5.121245506340516e-06, + "loss": 0.0, + "step": 59105 + }, + { + "epoch": 5.515162825417561, + "grad_norm": NaN, + "learning_rate": 5.119286135685141e-06, + "loss": 0.0, + "step": 59106 + }, + { + "epoch": 5.5152561351124385, + "grad_norm": NaN, + "learning_rate": 5.117327133418486e-06, + "loss": 0.0, + "step": 59107 + }, + { + "epoch": 5.515349444807315, + "grad_norm": NaN, + "learning_rate": 5.11536849954558e-06, + "loss": 0.0, + "step": 59108 + }, + { + "epoch": 5.5154427545021925, + "grad_norm": NaN, + "learning_rate": 5.113410234071352e-06, + "loss": 0.0, + "step": 59109 + }, + { + "epoch": 5.51553606419707, + "grad_norm": NaN, + "learning_rate": 5.111452337000815e-06, + "loss": 0.0, + "step": 59110 + }, + { + "epoch": 5.515629373891947, + "grad_norm": NaN, + "learning_rate": 5.109494808338932e-06, + "loss": 0.0, + "step": 59111 + }, + { + "epoch": 5.515722683586825, + "grad_norm": NaN, + "learning_rate": 5.107537648090682e-06, + "loss": 0.0, + "step": 59112 + }, + { + "epoch": 5.515815993281702, + "grad_norm": NaN, + "learning_rate": 5.105580856261044e-06, + "loss": 0.0, + "step": 59113 + }, + { + "epoch": 5.5159093029765796, + "grad_norm": NaN, + "learning_rate": 5.103624432854997e-06, + "loss": 0.0, + "step": 59114 + }, + { + "epoch": 5.516002612671457, + "grad_norm": NaN, + "learning_rate": 5.101668377877505e-06, + "loss": 0.0, + "step": 59115 + }, + { + "epoch": 5.5160959223663335, + "grad_norm": NaN, + "learning_rate": 5.099712691333546e-06, + "loss": 0.0, + "step": 59116 + }, + { + "epoch": 5.516189232061211, + "grad_norm": NaN, + "learning_rate": 5.097757373228101e-06, + "loss": 0.0, + "step": 59117 + }, + { + "epoch": 5.516282541756088, + "grad_norm": NaN, + "learning_rate": 5.095802423566131e-06, + "loss": 0.0, + "step": 59118 + }, + { + "epoch": 5.516375851450966, + "grad_norm": NaN, + "learning_rate": 5.093847842352616e-06, + "loss": 0.0, + "step": 59119 + }, + { + "epoch": 5.516469161145843, + "grad_norm": NaN, + "learning_rate": 5.09189362959252e-06, + "loss": 0.0, + "step": 59120 + }, + { + "epoch": 5.516562470840721, + "grad_norm": NaN, + "learning_rate": 5.089939785290819e-06, + "loss": 0.0, + "step": 59121 + }, + { + "epoch": 5.516655780535598, + "grad_norm": NaN, + "learning_rate": 5.0879863094524634e-06, + "loss": 0.0, + "step": 59122 + }, + { + "epoch": 5.5167490902304746, + "grad_norm": NaN, + "learning_rate": 5.086033202082446e-06, + "loss": 0.0, + "step": 59123 + }, + { + "epoch": 5.516842399925352, + "grad_norm": NaN, + "learning_rate": 5.084080463185714e-06, + "loss": 0.0, + "step": 59124 + }, + { + "epoch": 5.516935709620229, + "grad_norm": NaN, + "learning_rate": 5.082128092767246e-06, + "loss": 0.0, + "step": 59125 + }, + { + "epoch": 5.517029019315107, + "grad_norm": NaN, + "learning_rate": 5.080176090831989e-06, + "loss": 0.0, + "step": 59126 + }, + { + "epoch": 5.517122329009984, + "grad_norm": NaN, + "learning_rate": 5.078224457384905e-06, + "loss": 0.0, + "step": 59127 + }, + { + "epoch": 5.517215638704862, + "grad_norm": NaN, + "learning_rate": 5.076273192430991e-06, + "loss": 0.0, + "step": 59128 + }, + { + "epoch": 5.517308948399739, + "grad_norm": NaN, + "learning_rate": 5.074322295975175e-06, + "loss": 0.0, + "step": 59129 + }, + { + "epoch": 5.5174022580946165, + "grad_norm": NaN, + "learning_rate": 5.072371768022421e-06, + "loss": 0.0, + "step": 59130 + }, + { + "epoch": 5.517495567789493, + "grad_norm": NaN, + "learning_rate": 5.07042160857769e-06, + "loss": 0.0, + "step": 59131 + }, + { + "epoch": 5.51758887748437, + "grad_norm": NaN, + "learning_rate": 5.068471817645964e-06, + "loss": 0.0, + "step": 59132 + }, + { + "epoch": 5.517682187179248, + "grad_norm": NaN, + "learning_rate": 5.06652239523217e-06, + "loss": 0.0, + "step": 59133 + }, + { + "epoch": 5.517775496874125, + "grad_norm": NaN, + "learning_rate": 5.064573341341288e-06, + "loss": 0.0, + "step": 59134 + }, + { + "epoch": 5.517868806569003, + "grad_norm": NaN, + "learning_rate": 5.062624655978248e-06, + "loss": 0.0, + "step": 59135 + }, + { + "epoch": 5.51796211626388, + "grad_norm": NaN, + "learning_rate": 5.060676339148029e-06, + "loss": 0.0, + "step": 59136 + }, + { + "epoch": 5.518055425958757, + "grad_norm": NaN, + "learning_rate": 5.058728390855576e-06, + "loss": 0.0, + "step": 59137 + }, + { + "epoch": 5.518148735653634, + "grad_norm": NaN, + "learning_rate": 5.056780811105837e-06, + "loss": 0.0, + "step": 59138 + }, + { + "epoch": 5.5182420453485115, + "grad_norm": NaN, + "learning_rate": 5.054833599903774e-06, + "loss": 0.0, + "step": 59139 + }, + { + "epoch": 5.518335355043389, + "grad_norm": NaN, + "learning_rate": 5.052886757254332e-06, + "loss": 0.0, + "step": 59140 + }, + { + "epoch": 5.518428664738266, + "grad_norm": NaN, + "learning_rate": 5.050940283162458e-06, + "loss": 0.0, + "step": 59141 + }, + { + "epoch": 5.518521974433144, + "grad_norm": NaN, + "learning_rate": 5.048994177633115e-06, + "loss": 0.0, + "step": 59142 + }, + { + "epoch": 5.518615284128021, + "grad_norm": NaN, + "learning_rate": 5.047048440671231e-06, + "loss": 0.0, + "step": 59143 + }, + { + "epoch": 5.518708593822899, + "grad_norm": NaN, + "learning_rate": 5.04510307228177e-06, + "loss": 0.0, + "step": 59144 + }, + { + "epoch": 5.518801903517775, + "grad_norm": NaN, + "learning_rate": 5.043158072469677e-06, + "loss": 0.0, + "step": 59145 + }, + { + "epoch": 5.5188952132126525, + "grad_norm": NaN, + "learning_rate": 5.041213441239883e-06, + "loss": 0.0, + "step": 59146 + }, + { + "epoch": 5.51898852290753, + "grad_norm": NaN, + "learning_rate": 5.03926917859735e-06, + "loss": 0.0, + "step": 59147 + }, + { + "epoch": 5.519081832602407, + "grad_norm": NaN, + "learning_rate": 5.037325284547022e-06, + "loss": 0.0, + "step": 59148 + }, + { + "epoch": 5.519175142297285, + "grad_norm": NaN, + "learning_rate": 5.035381759093815e-06, + "loss": 0.0, + "step": 59149 + }, + { + "epoch": 5.519268451992162, + "grad_norm": NaN, + "learning_rate": 5.033438602242706e-06, + "loss": 0.0, + "step": 59150 + }, + { + "epoch": 5.51936176168704, + "grad_norm": NaN, + "learning_rate": 5.031495813998626e-06, + "loss": 0.0, + "step": 59151 + }, + { + "epoch": 5.519455071381916, + "grad_norm": NaN, + "learning_rate": 5.029553394366504e-06, + "loss": 0.0, + "step": 59152 + }, + { + "epoch": 5.519548381076794, + "grad_norm": NaN, + "learning_rate": 5.027611343351268e-06, + "loss": 0.0, + "step": 59153 + }, + { + "epoch": 5.519641690771671, + "grad_norm": NaN, + "learning_rate": 5.025669660957898e-06, + "loss": 0.0, + "step": 59154 + }, + { + "epoch": 5.519735000466548, + "grad_norm": NaN, + "learning_rate": 5.023728347191292e-06, + "loss": 0.0, + "step": 59155 + }, + { + "epoch": 5.519828310161426, + "grad_norm": NaN, + "learning_rate": 5.021787402056393e-06, + "loss": 0.0, + "step": 59156 + }, + { + "epoch": 5.519921619856303, + "grad_norm": NaN, + "learning_rate": 5.0198468255581495e-06, + "loss": 0.0, + "step": 59157 + }, + { + "epoch": 5.520014929551181, + "grad_norm": NaN, + "learning_rate": 5.017906617701489e-06, + "loss": 0.0, + "step": 59158 + }, + { + "epoch": 5.520108239246058, + "grad_norm": NaN, + "learning_rate": 5.015966778491343e-06, + "loss": 0.0, + "step": 59159 + }, + { + "epoch": 5.520201548940935, + "grad_norm": NaN, + "learning_rate": 5.014027307932655e-06, + "loss": 0.0, + "step": 59160 + }, + { + "epoch": 5.520294858635812, + "grad_norm": NaN, + "learning_rate": 5.01208820603034e-06, + "loss": 0.0, + "step": 59161 + }, + { + "epoch": 5.5203881683306895, + "grad_norm": NaN, + "learning_rate": 5.010149472789343e-06, + "loss": 0.0, + "step": 59162 + }, + { + "epoch": 5.520481478025567, + "grad_norm": NaN, + "learning_rate": 5.008211108214577e-06, + "loss": 0.0, + "step": 59163 + }, + { + "epoch": 5.520574787720444, + "grad_norm": NaN, + "learning_rate": 5.006273112310988e-06, + "loss": 0.0, + "step": 59164 + }, + { + "epoch": 5.520668097415322, + "grad_norm": NaN, + "learning_rate": 5.004335485083488e-06, + "loss": 0.0, + "step": 59165 + }, + { + "epoch": 5.520761407110199, + "grad_norm": NaN, + "learning_rate": 5.002398226537025e-06, + "loss": 0.0, + "step": 59166 + }, + { + "epoch": 5.520854716805076, + "grad_norm": NaN, + "learning_rate": 5.000461336676509e-06, + "loss": 0.0, + "step": 59167 + }, + { + "epoch": 5.520948026499953, + "grad_norm": NaN, + "learning_rate": 4.998524815506855e-06, + "loss": 0.0, + "step": 59168 + }, + { + "epoch": 5.5210413361948305, + "grad_norm": NaN, + "learning_rate": 4.996588663033008e-06, + "loss": 0.0, + "step": 59169 + }, + { + "epoch": 5.521134645889708, + "grad_norm": NaN, + "learning_rate": 4.99465287925988e-06, + "loss": 0.0, + "step": 59170 + }, + { + "epoch": 5.521227955584585, + "grad_norm": NaN, + "learning_rate": 4.992717464192403e-06, + "loss": 0.0, + "step": 59171 + }, + { + "epoch": 5.521321265279463, + "grad_norm": NaN, + "learning_rate": 4.990782417835487e-06, + "loss": 0.0, + "step": 59172 + }, + { + "epoch": 5.52141457497434, + "grad_norm": NaN, + "learning_rate": 4.988847740194046e-06, + "loss": 0.0, + "step": 59173 + }, + { + "epoch": 5.521507884669218, + "grad_norm": NaN, + "learning_rate": 4.9869134312730264e-06, + "loss": 0.0, + "step": 59174 + }, + { + "epoch": 5.521601194364094, + "grad_norm": NaN, + "learning_rate": 4.984979491077323e-06, + "loss": 0.0, + "step": 59175 + }, + { + "epoch": 5.5216945040589716, + "grad_norm": NaN, + "learning_rate": 4.983045919611867e-06, + "loss": 0.0, + "step": 59176 + }, + { + "epoch": 5.521787813753849, + "grad_norm": NaN, + "learning_rate": 4.9811127168815525e-06, + "loss": 0.0, + "step": 59177 + }, + { + "epoch": 5.521881123448726, + "grad_norm": NaN, + "learning_rate": 4.979179882891326e-06, + "loss": 0.0, + "step": 59178 + }, + { + "epoch": 5.521974433143604, + "grad_norm": NaN, + "learning_rate": 4.977247417646085e-06, + "loss": 0.0, + "step": 59179 + }, + { + "epoch": 5.522067742838481, + "grad_norm": NaN, + "learning_rate": 4.975315321150741e-06, + "loss": 0.0, + "step": 59180 + }, + { + "epoch": 5.522161052533358, + "grad_norm": NaN, + "learning_rate": 4.973383593410208e-06, + "loss": 0.0, + "step": 59181 + }, + { + "epoch": 5.522254362228235, + "grad_norm": NaN, + "learning_rate": 4.9714522344293965e-06, + "loss": 0.0, + "step": 59182 + }, + { + "epoch": 5.522347671923113, + "grad_norm": NaN, + "learning_rate": 4.96952124421322e-06, + "loss": 0.0, + "step": 59183 + }, + { + "epoch": 5.52244098161799, + "grad_norm": NaN, + "learning_rate": 4.9675906227666095e-06, + "loss": 0.0, + "step": 59184 + }, + { + "epoch": 5.522534291312867, + "grad_norm": NaN, + "learning_rate": 4.965660370094443e-06, + "loss": 0.0, + "step": 59185 + }, + { + "epoch": 5.522627601007745, + "grad_norm": NaN, + "learning_rate": 4.963730486201634e-06, + "loss": 0.0, + "step": 59186 + }, + { + "epoch": 5.522720910702622, + "grad_norm": NaN, + "learning_rate": 4.961800971093094e-06, + "loss": 0.0, + "step": 59187 + }, + { + "epoch": 5.5228142203975, + "grad_norm": NaN, + "learning_rate": 4.959871824773737e-06, + "loss": 0.0, + "step": 59188 + }, + { + "epoch": 5.522907530092376, + "grad_norm": NaN, + "learning_rate": 4.957943047248458e-06, + "loss": 0.0, + "step": 59189 + }, + { + "epoch": 5.523000839787254, + "grad_norm": NaN, + "learning_rate": 4.956014638522171e-06, + "loss": 0.0, + "step": 59190 + }, + { + "epoch": 5.523094149482131, + "grad_norm": NaN, + "learning_rate": 4.954086598599771e-06, + "loss": 0.0, + "step": 59191 + }, + { + "epoch": 5.5231874591770085, + "grad_norm": NaN, + "learning_rate": 4.952158927486172e-06, + "loss": 0.0, + "step": 59192 + }, + { + "epoch": 5.523280768871886, + "grad_norm": NaN, + "learning_rate": 4.950231625186252e-06, + "loss": 0.0, + "step": 59193 + }, + { + "epoch": 5.523374078566763, + "grad_norm": NaN, + "learning_rate": 4.9483046917049405e-06, + "loss": 0.0, + "step": 59194 + }, + { + "epoch": 5.523467388261641, + "grad_norm": NaN, + "learning_rate": 4.9463781270471015e-06, + "loss": 0.0, + "step": 59195 + }, + { + "epoch": 5.523560697956517, + "grad_norm": NaN, + "learning_rate": 4.94445193121768e-06, + "loss": 0.0, + "step": 59196 + }, + { + "epoch": 5.523654007651395, + "grad_norm": NaN, + "learning_rate": 4.942526104221522e-06, + "loss": 0.0, + "step": 59197 + }, + { + "epoch": 5.523747317346272, + "grad_norm": NaN, + "learning_rate": 4.940600646063575e-06, + "loss": 0.0, + "step": 59198 + }, + { + "epoch": 5.5238406270411495, + "grad_norm": NaN, + "learning_rate": 4.9386755567487006e-06, + "loss": 0.0, + "step": 59199 + }, + { + "epoch": 5.523933936736027, + "grad_norm": NaN, + "learning_rate": 4.936750836281811e-06, + "loss": 0.0, + "step": 59200 + }, + { + "epoch": 5.524027246430904, + "grad_norm": NaN, + "learning_rate": 4.93482648466777e-06, + "loss": 0.0, + "step": 59201 + }, + { + "epoch": 5.524120556125782, + "grad_norm": NaN, + "learning_rate": 4.932902501911523e-06, + "loss": 0.0, + "step": 59202 + }, + { + "epoch": 5.524213865820659, + "grad_norm": NaN, + "learning_rate": 4.930978888017917e-06, + "loss": 0.0, + "step": 59203 + }, + { + "epoch": 5.524307175515536, + "grad_norm": NaN, + "learning_rate": 4.9290556429918635e-06, + "loss": 0.0, + "step": 59204 + }, + { + "epoch": 5.524400485210413, + "grad_norm": NaN, + "learning_rate": 4.9271327668382436e-06, + "loss": 0.0, + "step": 59205 + }, + { + "epoch": 5.524493794905291, + "grad_norm": NaN, + "learning_rate": 4.925210259561951e-06, + "loss": 0.0, + "step": 59206 + }, + { + "epoch": 5.524587104600168, + "grad_norm": NaN, + "learning_rate": 4.923288121167884e-06, + "loss": 0.0, + "step": 59207 + }, + { + "epoch": 5.524680414295045, + "grad_norm": NaN, + "learning_rate": 4.921366351660905e-06, + "loss": 0.0, + "step": 59208 + }, + { + "epoch": 5.524773723989923, + "grad_norm": NaN, + "learning_rate": 4.919444951045925e-06, + "loss": 0.0, + "step": 59209 + }, + { + "epoch": 5.524867033684799, + "grad_norm": NaN, + "learning_rate": 4.917523919327826e-06, + "loss": 0.0, + "step": 59210 + }, + { + "epoch": 5.524960343379677, + "grad_norm": NaN, + "learning_rate": 4.915603256511486e-06, + "loss": 0.0, + "step": 59211 + }, + { + "epoch": 5.525053653074554, + "grad_norm": NaN, + "learning_rate": 4.913682962601784e-06, + "loss": 0.0, + "step": 59212 + }, + { + "epoch": 5.525146962769432, + "grad_norm": NaN, + "learning_rate": 4.911763037603617e-06, + "loss": 0.0, + "step": 59213 + }, + { + "epoch": 5.525240272464309, + "grad_norm": NaN, + "learning_rate": 4.9098434815218465e-06, + "loss": 0.0, + "step": 59214 + }, + { + "epoch": 5.5253335821591865, + "grad_norm": NaN, + "learning_rate": 4.907924294361371e-06, + "loss": 0.0, + "step": 59215 + }, + { + "epoch": 5.525426891854064, + "grad_norm": NaN, + "learning_rate": 4.906005476127067e-06, + "loss": 0.0, + "step": 59216 + }, + { + "epoch": 5.525520201548941, + "grad_norm": NaN, + "learning_rate": 4.9040870268238e-06, + "loss": 0.0, + "step": 59217 + }, + { + "epoch": 5.525613511243819, + "grad_norm": NaN, + "learning_rate": 4.9021689464564636e-06, + "loss": 0.0, + "step": 59218 + }, + { + "epoch": 5.525706820938695, + "grad_norm": NaN, + "learning_rate": 4.90025123502994e-06, + "loss": 0.0, + "step": 59219 + }, + { + "epoch": 5.525800130633573, + "grad_norm": NaN, + "learning_rate": 4.898333892549089e-06, + "loss": 0.0, + "step": 59220 + }, + { + "epoch": 5.52589344032845, + "grad_norm": NaN, + "learning_rate": 4.896416919018792e-06, + "loss": 0.0, + "step": 59221 + }, + { + "epoch": 5.5259867500233275, + "grad_norm": NaN, + "learning_rate": 4.894500314443928e-06, + "loss": 0.0, + "step": 59222 + }, + { + "epoch": 5.526080059718205, + "grad_norm": NaN, + "learning_rate": 4.892584078829359e-06, + "loss": 0.0, + "step": 59223 + }, + { + "epoch": 5.526173369413082, + "grad_norm": NaN, + "learning_rate": 4.890668212179966e-06, + "loss": 0.0, + "step": 59224 + }, + { + "epoch": 5.526266679107959, + "grad_norm": NaN, + "learning_rate": 4.888752714500627e-06, + "loss": 0.0, + "step": 59225 + }, + { + "epoch": 5.526359988802836, + "grad_norm": NaN, + "learning_rate": 4.886837585796189e-06, + "loss": 0.0, + "step": 59226 + }, + { + "epoch": 5.526453298497714, + "grad_norm": NaN, + "learning_rate": 4.884922826071563e-06, + "loss": 0.0, + "step": 59227 + }, + { + "epoch": 5.526546608192591, + "grad_norm": NaN, + "learning_rate": 4.883008435331565e-06, + "loss": 0.0, + "step": 59228 + }, + { + "epoch": 5.526639917887469, + "grad_norm": NaN, + "learning_rate": 4.881094413581105e-06, + "loss": 0.0, + "step": 59229 + }, + { + "epoch": 5.526733227582346, + "grad_norm": NaN, + "learning_rate": 4.87918076082503e-06, + "loss": 0.0, + "step": 59230 + }, + { + "epoch": 5.526826537277223, + "grad_norm": NaN, + "learning_rate": 4.8772674770682196e-06, + "loss": 0.0, + "step": 59231 + }, + { + "epoch": 5.526919846972101, + "grad_norm": NaN, + "learning_rate": 4.87535456231552e-06, + "loss": 0.0, + "step": 59232 + }, + { + "epoch": 5.527013156666977, + "grad_norm": NaN, + "learning_rate": 4.8734420165718105e-06, + "loss": 0.0, + "step": 59233 + }, + { + "epoch": 5.527106466361855, + "grad_norm": NaN, + "learning_rate": 4.8715298398419366e-06, + "loss": 0.0, + "step": 59234 + }, + { + "epoch": 5.527199776056732, + "grad_norm": NaN, + "learning_rate": 4.869618032130779e-06, + "loss": 0.0, + "step": 59235 + }, + { + "epoch": 5.52729308575161, + "grad_norm": NaN, + "learning_rate": 4.8677065934431995e-06, + "loss": 0.0, + "step": 59236 + }, + { + "epoch": 5.527386395446487, + "grad_norm": NaN, + "learning_rate": 4.865795523784044e-06, + "loss": 0.0, + "step": 59237 + }, + { + "epoch": 5.527479705141364, + "grad_norm": NaN, + "learning_rate": 4.8638848231581764e-06, + "loss": 0.0, + "step": 59238 + }, + { + "epoch": 5.527573014836242, + "grad_norm": NaN, + "learning_rate": 4.861974491570458e-06, + "loss": 0.0, + "step": 59239 + }, + { + "epoch": 5.527666324531118, + "grad_norm": NaN, + "learning_rate": 4.860064529025737e-06, + "loss": 0.0, + "step": 59240 + }, + { + "epoch": 5.527759634225996, + "grad_norm": NaN, + "learning_rate": 4.858154935528891e-06, + "loss": 0.0, + "step": 59241 + }, + { + "epoch": 5.527852943920873, + "grad_norm": NaN, + "learning_rate": 4.8562457110847665e-06, + "loss": 0.0, + "step": 59242 + }, + { + "epoch": 5.527946253615751, + "grad_norm": NaN, + "learning_rate": 4.85433685569821e-06, + "loss": 0.0, + "step": 59243 + }, + { + "epoch": 5.528039563310628, + "grad_norm": NaN, + "learning_rate": 4.852428369374067e-06, + "loss": 0.0, + "step": 59244 + }, + { + "epoch": 5.5281328730055055, + "grad_norm": NaN, + "learning_rate": 4.850520252117218e-06, + "loss": 0.0, + "step": 59245 + }, + { + "epoch": 5.528226182700383, + "grad_norm": NaN, + "learning_rate": 4.8486125039324916e-06, + "loss": 0.0, + "step": 59246 + }, + { + "epoch": 5.52831949239526, + "grad_norm": NaN, + "learning_rate": 4.846705124824735e-06, + "loss": 0.0, + "step": 59247 + }, + { + "epoch": 5.528412802090137, + "grad_norm": NaN, + "learning_rate": 4.844798114798842e-06, + "loss": 0.0, + "step": 59248 + }, + { + "epoch": 5.528506111785014, + "grad_norm": NaN, + "learning_rate": 4.842891473859611e-06, + "loss": 0.0, + "step": 59249 + }, + { + "epoch": 5.528599421479892, + "grad_norm": NaN, + "learning_rate": 4.8409852020118875e-06, + "loss": 0.0, + "step": 59250 + }, + { + "epoch": 5.528692731174769, + "grad_norm": NaN, + "learning_rate": 4.839079299260584e-06, + "loss": 0.0, + "step": 59251 + }, + { + "epoch": 5.5287860408696465, + "grad_norm": NaN, + "learning_rate": 4.837173765610464e-06, + "loss": 0.0, + "step": 59252 + }, + { + "epoch": 5.528879350564524, + "grad_norm": NaN, + "learning_rate": 4.835268601066439e-06, + "loss": 0.0, + "step": 59253 + }, + { + "epoch": 5.5289726602594005, + "grad_norm": NaN, + "learning_rate": 4.833363805633305e-06, + "loss": 0.0, + "step": 59254 + }, + { + "epoch": 5.529065969954278, + "grad_norm": NaN, + "learning_rate": 4.8314593793159276e-06, + "loss": 0.0, + "step": 59255 + }, + { + "epoch": 5.529159279649155, + "grad_norm": NaN, + "learning_rate": 4.829555322119149e-06, + "loss": 0.0, + "step": 59256 + }, + { + "epoch": 5.529252589344033, + "grad_norm": NaN, + "learning_rate": 4.8276516340478185e-06, + "loss": 0.0, + "step": 59257 + }, + { + "epoch": 5.52934589903891, + "grad_norm": NaN, + "learning_rate": 4.825748315106748e-06, + "loss": 0.0, + "step": 59258 + }, + { + "epoch": 5.529439208733788, + "grad_norm": NaN, + "learning_rate": 4.823845365300816e-06, + "loss": 0.0, + "step": 59259 + }, + { + "epoch": 5.529532518428665, + "grad_norm": NaN, + "learning_rate": 4.821942784634836e-06, + "loss": 0.0, + "step": 59260 + }, + { + "epoch": 5.529625828123542, + "grad_norm": NaN, + "learning_rate": 4.820040573113637e-06, + "loss": 0.0, + "step": 59261 + }, + { + "epoch": 5.529719137818419, + "grad_norm": NaN, + "learning_rate": 4.818138730742082e-06, + "loss": 0.0, + "step": 59262 + }, + { + "epoch": 5.529812447513296, + "grad_norm": NaN, + "learning_rate": 4.816237257524986e-06, + "loss": 0.0, + "step": 59263 + }, + { + "epoch": 5.529905757208174, + "grad_norm": NaN, + "learning_rate": 4.814336153467191e-06, + "loss": 0.0, + "step": 59264 + }, + { + "epoch": 5.529999066903051, + "grad_norm": NaN, + "learning_rate": 4.812435418573546e-06, + "loss": 0.0, + "step": 59265 + }, + { + "epoch": 5.530092376597929, + "grad_norm": NaN, + "learning_rate": 4.810535052848863e-06, + "loss": 0.0, + "step": 59266 + }, + { + "epoch": 5.530185686292806, + "grad_norm": NaN, + "learning_rate": 4.808635056297971e-06, + "loss": 0.0, + "step": 59267 + }, + { + "epoch": 5.5302789959876835, + "grad_norm": NaN, + "learning_rate": 4.8067354289257175e-06, + "loss": 0.0, + "step": 59268 + }, + { + "epoch": 5.53037230568256, + "grad_norm": NaN, + "learning_rate": 4.804836170736931e-06, + "loss": 0.0, + "step": 59269 + }, + { + "epoch": 5.530465615377437, + "grad_norm": NaN, + "learning_rate": 4.802937281736441e-06, + "loss": 0.0, + "step": 59270 + }, + { + "epoch": 5.530558925072315, + "grad_norm": NaN, + "learning_rate": 4.80103876192906e-06, + "loss": 0.0, + "step": 59271 + }, + { + "epoch": 5.530652234767192, + "grad_norm": NaN, + "learning_rate": 4.799140611319635e-06, + "loss": 0.0, + "step": 59272 + }, + { + "epoch": 5.53074554446207, + "grad_norm": NaN, + "learning_rate": 4.7972428299129785e-06, + "loss": 0.0, + "step": 59273 + }, + { + "epoch": 5.530838854156947, + "grad_norm": NaN, + "learning_rate": 4.7953454177139196e-06, + "loss": 0.0, + "step": 59274 + }, + { + "epoch": 5.5309321638518245, + "grad_norm": NaN, + "learning_rate": 4.793448374727288e-06, + "loss": 0.0, + "step": 59275 + }, + { + "epoch": 5.531025473546702, + "grad_norm": NaN, + "learning_rate": 4.7915517009578966e-06, + "loss": 0.0, + "step": 59276 + }, + { + "epoch": 5.5311187832415785, + "grad_norm": NaN, + "learning_rate": 4.789655396410574e-06, + "loss": 0.0, + "step": 59277 + }, + { + "epoch": 5.531212092936456, + "grad_norm": NaN, + "learning_rate": 4.787759461090152e-06, + "loss": 0.0, + "step": 59278 + }, + { + "epoch": 5.531305402631333, + "grad_norm": NaN, + "learning_rate": 4.78586389500144e-06, + "loss": 0.0, + "step": 59279 + }, + { + "epoch": 5.531398712326211, + "grad_norm": NaN, + "learning_rate": 4.783968698149271e-06, + "loss": 0.0, + "step": 59280 + }, + { + "epoch": 5.531492022021088, + "grad_norm": NaN, + "learning_rate": 4.782073870538438e-06, + "loss": 0.0, + "step": 59281 + }, + { + "epoch": 5.531585331715966, + "grad_norm": NaN, + "learning_rate": 4.780179412173757e-06, + "loss": 0.0, + "step": 59282 + }, + { + "epoch": 5.531678641410842, + "grad_norm": NaN, + "learning_rate": 4.7782853230601045e-06, + "loss": 0.0, + "step": 59283 + }, + { + "epoch": 5.5317719511057195, + "grad_norm": NaN, + "learning_rate": 4.776391603202245e-06, + "loss": 0.0, + "step": 59284 + }, + { + "epoch": 5.531865260800597, + "grad_norm": NaN, + "learning_rate": 4.7744982526049744e-06, + "loss": 0.0, + "step": 59285 + }, + { + "epoch": 5.531958570495474, + "grad_norm": NaN, + "learning_rate": 4.772605271273172e-06, + "loss": 0.0, + "step": 59286 + }, + { + "epoch": 5.532051880190352, + "grad_norm": NaN, + "learning_rate": 4.770712659211601e-06, + "loss": 0.0, + "step": 59287 + }, + { + "epoch": 5.532145189885229, + "grad_norm": NaN, + "learning_rate": 4.768820416425057e-06, + "loss": 0.0, + "step": 59288 + }, + { + "epoch": 5.532238499580107, + "grad_norm": NaN, + "learning_rate": 4.766928542918436e-06, + "loss": 0.0, + "step": 59289 + }, + { + "epoch": 5.532331809274984, + "grad_norm": NaN, + "learning_rate": 4.765037038696468e-06, + "loss": 0.0, + "step": 59290 + }, + { + "epoch": 5.532425118969861, + "grad_norm": NaN, + "learning_rate": 4.763145903763965e-06, + "loss": 0.0, + "step": 59291 + }, + { + "epoch": 5.532518428664738, + "grad_norm": NaN, + "learning_rate": 4.761255138125808e-06, + "loss": 0.0, + "step": 59292 + }, + { + "epoch": 5.532611738359615, + "grad_norm": NaN, + "learning_rate": 4.7593647417867255e-06, + "loss": 0.0, + "step": 59293 + }, + { + "epoch": 5.532705048054493, + "grad_norm": NaN, + "learning_rate": 4.757474714751547e-06, + "loss": 0.0, + "step": 59294 + }, + { + "epoch": 5.53279835774937, + "grad_norm": NaN, + "learning_rate": 4.755585057025119e-06, + "loss": 0.0, + "step": 59295 + }, + { + "epoch": 5.532891667444248, + "grad_norm": NaN, + "learning_rate": 4.753695768612204e-06, + "loss": 0.0, + "step": 59296 + }, + { + "epoch": 5.532984977139125, + "grad_norm": NaN, + "learning_rate": 4.751806849517581e-06, + "loss": 0.0, + "step": 59297 + }, + { + "epoch": 5.533078286834002, + "grad_norm": NaN, + "learning_rate": 4.749918299746131e-06, + "loss": 0.0, + "step": 59298 + }, + { + "epoch": 5.533171596528879, + "grad_norm": NaN, + "learning_rate": 4.748030119302598e-06, + "loss": 0.0, + "step": 59299 + }, + { + "epoch": 5.533264906223756, + "grad_norm": NaN, + "learning_rate": 4.746142308191797e-06, + "loss": 0.0, + "step": 59300 + }, + { + "epoch": 5.533358215918634, + "grad_norm": NaN, + "learning_rate": 4.744254866418523e-06, + "loss": 0.0, + "step": 59301 + }, + { + "epoch": 5.533451525613511, + "grad_norm": NaN, + "learning_rate": 4.74236779398759e-06, + "loss": 0.0, + "step": 59302 + }, + { + "epoch": 5.533544835308389, + "grad_norm": NaN, + "learning_rate": 4.740481090903775e-06, + "loss": 0.0, + "step": 59303 + }, + { + "epoch": 5.533638145003266, + "grad_norm": NaN, + "learning_rate": 4.738594757171893e-06, + "loss": 0.0, + "step": 59304 + }, + { + "epoch": 5.5337314546981435, + "grad_norm": NaN, + "learning_rate": 4.736708792796739e-06, + "loss": 0.0, + "step": 59305 + }, + { + "epoch": 5.53382476439302, + "grad_norm": NaN, + "learning_rate": 4.734823197783094e-06, + "loss": 0.0, + "step": 59306 + }, + { + "epoch": 5.5339180740878975, + "grad_norm": NaN, + "learning_rate": 4.732937972135769e-06, + "loss": 0.0, + "step": 59307 + }, + { + "epoch": 5.534011383782775, + "grad_norm": NaN, + "learning_rate": 4.731053115859562e-06, + "loss": 0.0, + "step": 59308 + }, + { + "epoch": 5.534104693477652, + "grad_norm": NaN, + "learning_rate": 4.729168628959235e-06, + "loss": 0.0, + "step": 59309 + }, + { + "epoch": 5.53419800317253, + "grad_norm": NaN, + "learning_rate": 4.727284511439616e-06, + "loss": 0.0, + "step": 59310 + }, + { + "epoch": 5.534291312867407, + "grad_norm": NaN, + "learning_rate": 4.725400763305471e-06, + "loss": 0.0, + "step": 59311 + }, + { + "epoch": 5.534384622562285, + "grad_norm": NaN, + "learning_rate": 4.723517384561609e-06, + "loss": 0.0, + "step": 59312 + }, + { + "epoch": 5.534477932257161, + "grad_norm": NaN, + "learning_rate": 4.721634375212796e-06, + "loss": 0.0, + "step": 59313 + }, + { + "epoch": 5.5345712419520385, + "grad_norm": NaN, + "learning_rate": 4.719751735263844e-06, + "loss": 0.0, + "step": 59314 + }, + { + "epoch": 5.534664551646916, + "grad_norm": NaN, + "learning_rate": 4.717869464719515e-06, + "loss": 0.0, + "step": 59315 + }, + { + "epoch": 5.534757861341793, + "grad_norm": NaN, + "learning_rate": 4.715987563584622e-06, + "loss": 0.0, + "step": 59316 + }, + { + "epoch": 5.534851171036671, + "grad_norm": NaN, + "learning_rate": 4.7141060318639286e-06, + "loss": 0.0, + "step": 59317 + }, + { + "epoch": 5.534944480731548, + "grad_norm": NaN, + "learning_rate": 4.712224869562215e-06, + "loss": 0.0, + "step": 59318 + }, + { + "epoch": 5.535037790426426, + "grad_norm": NaN, + "learning_rate": 4.7103440766842915e-06, + "loss": 0.0, + "step": 59319 + }, + { + "epoch": 5.535131100121303, + "grad_norm": NaN, + "learning_rate": 4.708463653234906e-06, + "loss": 0.0, + "step": 59320 + }, + { + "epoch": 5.53522440981618, + "grad_norm": NaN, + "learning_rate": 4.706583599218888e-06, + "loss": 0.0, + "step": 59321 + }, + { + "epoch": 5.535317719511057, + "grad_norm": NaN, + "learning_rate": 4.704703914640983e-06, + "loss": 0.0, + "step": 59322 + }, + { + "epoch": 5.535411029205934, + "grad_norm": NaN, + "learning_rate": 4.702824599505956e-06, + "loss": 0.0, + "step": 59323 + }, + { + "epoch": 5.535504338900812, + "grad_norm": NaN, + "learning_rate": 4.700945653818633e-06, + "loss": 0.0, + "step": 59324 + }, + { + "epoch": 5.535597648595689, + "grad_norm": NaN, + "learning_rate": 4.699067077583746e-06, + "loss": 0.0, + "step": 59325 + }, + { + "epoch": 5.535690958290567, + "grad_norm": NaN, + "learning_rate": 4.697188870806073e-06, + "loss": 0.0, + "step": 59326 + }, + { + "epoch": 5.535784267985443, + "grad_norm": NaN, + "learning_rate": 4.695311033490445e-06, + "loss": 0.0, + "step": 59327 + }, + { + "epoch": 5.535877577680321, + "grad_norm": NaN, + "learning_rate": 4.6934335656415744e-06, + "loss": 0.0, + "step": 59328 + }, + { + "epoch": 5.535970887375198, + "grad_norm": NaN, + "learning_rate": 4.691556467264257e-06, + "loss": 0.0, + "step": 59329 + }, + { + "epoch": 5.5360641970700755, + "grad_norm": NaN, + "learning_rate": 4.689679738363289e-06, + "loss": 0.0, + "step": 59330 + }, + { + "epoch": 5.536157506764953, + "grad_norm": NaN, + "learning_rate": 4.687803378943416e-06, + "loss": 0.0, + "step": 59331 + }, + { + "epoch": 5.53625081645983, + "grad_norm": NaN, + "learning_rate": 4.6859273890093864e-06, + "loss": 0.0, + "step": 59332 + }, + { + "epoch": 5.536344126154708, + "grad_norm": NaN, + "learning_rate": 4.684051768566044e-06, + "loss": 0.0, + "step": 59333 + }, + { + "epoch": 5.536437435849585, + "grad_norm": NaN, + "learning_rate": 4.682176517618086e-06, + "loss": 0.0, + "step": 59334 + }, + { + "epoch": 5.536530745544463, + "grad_norm": NaN, + "learning_rate": 4.680301636170291e-06, + "loss": 0.0, + "step": 59335 + }, + { + "epoch": 5.536624055239339, + "grad_norm": NaN, + "learning_rate": 4.678427124227474e-06, + "loss": 0.0, + "step": 59336 + }, + { + "epoch": 5.5367173649342165, + "grad_norm": NaN, + "learning_rate": 4.676552981794363e-06, + "loss": 0.0, + "step": 59337 + }, + { + "epoch": 5.536810674629094, + "grad_norm": NaN, + "learning_rate": 4.674679208875704e-06, + "loss": 0.0, + "step": 59338 + }, + { + "epoch": 5.536903984323971, + "grad_norm": NaN, + "learning_rate": 4.67280580547631e-06, + "loss": 0.0, + "step": 59339 + }, + { + "epoch": 5.536997294018849, + "grad_norm": NaN, + "learning_rate": 4.670932771600927e-06, + "loss": 0.0, + "step": 59340 + }, + { + "epoch": 5.537090603713726, + "grad_norm": NaN, + "learning_rate": 4.669060107254285e-06, + "loss": 0.0, + "step": 59341 + }, + { + "epoch": 5.537183913408603, + "grad_norm": NaN, + "learning_rate": 4.667187812441198e-06, + "loss": 0.0, + "step": 59342 + }, + { + "epoch": 5.53727722310348, + "grad_norm": NaN, + "learning_rate": 4.6653158871663934e-06, + "loss": 0.0, + "step": 59343 + }, + { + "epoch": 5.537370532798358, + "grad_norm": NaN, + "learning_rate": 4.663444331434652e-06, + "loss": 0.0, + "step": 59344 + }, + { + "epoch": 5.537463842493235, + "grad_norm": NaN, + "learning_rate": 4.661573145250702e-06, + "loss": 0.0, + "step": 59345 + }, + { + "epoch": 5.537557152188112, + "grad_norm": NaN, + "learning_rate": 4.659702328619325e-06, + "loss": 0.0, + "step": 59346 + }, + { + "epoch": 5.53765046188299, + "grad_norm": NaN, + "learning_rate": 4.657831881545265e-06, + "loss": 0.0, + "step": 59347 + }, + { + "epoch": 5.537743771577867, + "grad_norm": NaN, + "learning_rate": 4.655961804033287e-06, + "loss": 0.0, + "step": 59348 + }, + { + "epoch": 5.537837081272745, + "grad_norm": NaN, + "learning_rate": 4.654092096088153e-06, + "loss": 0.0, + "step": 59349 + }, + { + "epoch": 5.537930390967621, + "grad_norm": NaN, + "learning_rate": 4.652222757714607e-06, + "loss": 0.0, + "step": 59350 + }, + { + "epoch": 5.538023700662499, + "grad_norm": NaN, + "learning_rate": 4.650353788917399e-06, + "loss": 0.0, + "step": 59351 + }, + { + "epoch": 5.538117010357376, + "grad_norm": NaN, + "learning_rate": 4.64848518970129e-06, + "loss": 0.0, + "step": 59352 + }, + { + "epoch": 5.538210320052253, + "grad_norm": NaN, + "learning_rate": 4.646616960071026e-06, + "loss": 0.0, + "step": 59353 + }, + { + "epoch": 5.538303629747131, + "grad_norm": NaN, + "learning_rate": 4.644749100031353e-06, + "loss": 0.0, + "step": 59354 + }, + { + "epoch": 5.538396939442008, + "grad_norm": NaN, + "learning_rate": 4.642881609587018e-06, + "loss": 0.0, + "step": 59355 + }, + { + "epoch": 5.538490249136886, + "grad_norm": NaN, + "learning_rate": 4.641014488742801e-06, + "loss": 0.0, + "step": 59356 + }, + { + "epoch": 5.538583558831762, + "grad_norm": NaN, + "learning_rate": 4.639147737503413e-06, + "loss": 0.0, + "step": 59357 + }, + { + "epoch": 5.53867686852664, + "grad_norm": NaN, + "learning_rate": 4.637281355873585e-06, + "loss": 0.0, + "step": 59358 + }, + { + "epoch": 5.538770178221517, + "grad_norm": NaN, + "learning_rate": 4.635415343858145e-06, + "loss": 0.0, + "step": 59359 + }, + { + "epoch": 5.5388634879163945, + "grad_norm": NaN, + "learning_rate": 4.633549701461742e-06, + "loss": 0.0, + "step": 59360 + }, + { + "epoch": 5.538956797611272, + "grad_norm": NaN, + "learning_rate": 4.63168442868917e-06, + "loss": 0.0, + "step": 59361 + }, + { + "epoch": 5.539050107306149, + "grad_norm": NaN, + "learning_rate": 4.6298195255451755e-06, + "loss": 0.0, + "step": 59362 + }, + { + "epoch": 5.539143417001027, + "grad_norm": NaN, + "learning_rate": 4.627954992034488e-06, + "loss": 0.0, + "step": 59363 + }, + { + "epoch": 5.539236726695904, + "grad_norm": NaN, + "learning_rate": 4.626090828161821e-06, + "loss": 0.0, + "step": 59364 + }, + { + "epoch": 5.539330036390781, + "grad_norm": NaN, + "learning_rate": 4.624227033931988e-06, + "loss": 0.0, + "step": 59365 + }, + { + "epoch": 5.539423346085658, + "grad_norm": NaN, + "learning_rate": 4.62236360934965e-06, + "loss": 0.0, + "step": 59366 + }, + { + "epoch": 5.5395166557805355, + "grad_norm": NaN, + "learning_rate": 4.62050055441957e-06, + "loss": 0.0, + "step": 59367 + }, + { + "epoch": 5.539609965475413, + "grad_norm": NaN, + "learning_rate": 4.61863786914653e-06, + "loss": 0.0, + "step": 59368 + }, + { + "epoch": 5.53970327517029, + "grad_norm": NaN, + "learning_rate": 4.616775553535207e-06, + "loss": 0.0, + "step": 59369 + }, + { + "epoch": 5.539796584865168, + "grad_norm": NaN, + "learning_rate": 4.6149136075903485e-06, + "loss": 0.0, + "step": 59370 + }, + { + "epoch": 5.539889894560044, + "grad_norm": NaN, + "learning_rate": 4.613052031316717e-06, + "loss": 0.0, + "step": 59371 + }, + { + "epoch": 5.539983204254922, + "grad_norm": NaN, + "learning_rate": 4.611190824719024e-06, + "loss": 0.0, + "step": 59372 + }, + { + "epoch": 5.540076513949799, + "grad_norm": NaN, + "learning_rate": 4.609329987801969e-06, + "loss": 0.0, + "step": 59373 + }, + { + "epoch": 5.540169823644677, + "grad_norm": NaN, + "learning_rate": 4.607469520570378e-06, + "loss": 0.0, + "step": 59374 + }, + { + "epoch": 5.540263133339554, + "grad_norm": NaN, + "learning_rate": 4.605609423028883e-06, + "loss": 0.0, + "step": 59375 + }, + { + "epoch": 5.540356443034431, + "grad_norm": NaN, + "learning_rate": 4.603749695182263e-06, + "loss": 0.0, + "step": 59376 + }, + { + "epoch": 5.540449752729309, + "grad_norm": NaN, + "learning_rate": 4.601890337035247e-06, + "loss": 0.0, + "step": 59377 + }, + { + "epoch": 5.540543062424186, + "grad_norm": NaN, + "learning_rate": 4.600031348592548e-06, + "loss": 0.0, + "step": 59378 + }, + { + "epoch": 5.540636372119063, + "grad_norm": NaN, + "learning_rate": 4.598172729858896e-06, + "loss": 0.0, + "step": 59379 + }, + { + "epoch": 5.54072968181394, + "grad_norm": NaN, + "learning_rate": 4.5963144808390365e-06, + "loss": 0.0, + "step": 59380 + }, + { + "epoch": 5.540822991508818, + "grad_norm": NaN, + "learning_rate": 4.594456601537666e-06, + "loss": 0.0, + "step": 59381 + }, + { + "epoch": 5.540916301203695, + "grad_norm": NaN, + "learning_rate": 4.592599091959498e-06, + "loss": 0.0, + "step": 59382 + }, + { + "epoch": 5.5410096108985725, + "grad_norm": NaN, + "learning_rate": 4.590741952109312e-06, + "loss": 0.0, + "step": 59383 + }, + { + "epoch": 5.54110292059345, + "grad_norm": NaN, + "learning_rate": 4.588885181991786e-06, + "loss": 0.0, + "step": 59384 + }, + { + "epoch": 5.541196230288327, + "grad_norm": NaN, + "learning_rate": 4.587028781611618e-06, + "loss": 0.0, + "step": 59385 + }, + { + "epoch": 5.541289539983204, + "grad_norm": NaN, + "learning_rate": 4.5851727509736205e-06, + "loss": 0.0, + "step": 59386 + }, + { + "epoch": 5.541382849678081, + "grad_norm": NaN, + "learning_rate": 4.583317090082422e-06, + "loss": 0.0, + "step": 59387 + }, + { + "epoch": 5.541476159372959, + "grad_norm": NaN, + "learning_rate": 4.58146179894277e-06, + "loss": 0.0, + "step": 59388 + }, + { + "epoch": 5.541569469067836, + "grad_norm": NaN, + "learning_rate": 4.579606877559378e-06, + "loss": 0.0, + "step": 59389 + }, + { + "epoch": 5.5416627787627135, + "grad_norm": NaN, + "learning_rate": 4.57775232593699e-06, + "loss": 0.0, + "step": 59390 + }, + { + "epoch": 5.541756088457591, + "grad_norm": NaN, + "learning_rate": 4.575898144080287e-06, + "loss": 0.0, + "step": 59391 + }, + { + "epoch": 5.541849398152468, + "grad_norm": NaN, + "learning_rate": 4.574044331993998e-06, + "loss": 0.0, + "step": 59392 + }, + { + "epoch": 5.541942707847346, + "grad_norm": NaN, + "learning_rate": 4.572190889682836e-06, + "loss": 0.0, + "step": 59393 + }, + { + "epoch": 5.542036017542222, + "grad_norm": NaN, + "learning_rate": 4.570337817151515e-06, + "loss": 0.0, + "step": 59394 + }, + { + "epoch": 5.5421293272371, + "grad_norm": NaN, + "learning_rate": 4.568485114404747e-06, + "loss": 0.0, + "step": 59395 + }, + { + "epoch": 5.542222636931977, + "grad_norm": NaN, + "learning_rate": 4.566632781447227e-06, + "loss": 0.0, + "step": 59396 + }, + { + "epoch": 5.542315946626855, + "grad_norm": NaN, + "learning_rate": 4.564780818283703e-06, + "loss": 0.0, + "step": 59397 + }, + { + "epoch": 5.542409256321732, + "grad_norm": NaN, + "learning_rate": 4.562929224918855e-06, + "loss": 0.0, + "step": 59398 + }, + { + "epoch": 5.542502566016609, + "grad_norm": NaN, + "learning_rate": 4.561078001357376e-06, + "loss": 0.0, + "step": 59399 + }, + { + "epoch": 5.542595875711486, + "grad_norm": NaN, + "learning_rate": 4.5592271476040165e-06, + "loss": 0.0, + "step": 59400 + }, + { + "epoch": 5.542689185406363, + "grad_norm": NaN, + "learning_rate": 4.557376663663453e-06, + "loss": 0.0, + "step": 59401 + }, + { + "epoch": 5.542782495101241, + "grad_norm": NaN, + "learning_rate": 4.555526549540384e-06, + "loss": 0.0, + "step": 59402 + }, + { + "epoch": 5.542875804796118, + "grad_norm": NaN, + "learning_rate": 4.553676805239553e-06, + "loss": 0.0, + "step": 59403 + }, + { + "epoch": 5.542969114490996, + "grad_norm": NaN, + "learning_rate": 4.551827430765625e-06, + "loss": 0.0, + "step": 59404 + }, + { + "epoch": 5.543062424185873, + "grad_norm": NaN, + "learning_rate": 4.549978426123297e-06, + "loss": 0.0, + "step": 59405 + }, + { + "epoch": 5.5431557338807504, + "grad_norm": NaN, + "learning_rate": 4.548129791317329e-06, + "loss": 0.0, + "step": 59406 + }, + { + "epoch": 5.543249043575628, + "grad_norm": NaN, + "learning_rate": 4.546281526352352e-06, + "loss": 0.0, + "step": 59407 + }, + { + "epoch": 5.543342353270505, + "grad_norm": NaN, + "learning_rate": 4.5444336312330795e-06, + "loss": 0.0, + "step": 59408 + }, + { + "epoch": 5.543435662965382, + "grad_norm": NaN, + "learning_rate": 4.5425861059642575e-06, + "loss": 0.0, + "step": 59409 + }, + { + "epoch": 5.543528972660259, + "grad_norm": NaN, + "learning_rate": 4.540738950550549e-06, + "loss": 0.0, + "step": 59410 + }, + { + "epoch": 5.543622282355137, + "grad_norm": NaN, + "learning_rate": 4.538892164996632e-06, + "loss": 0.0, + "step": 59411 + }, + { + "epoch": 5.543715592050014, + "grad_norm": NaN, + "learning_rate": 4.537045749307239e-06, + "loss": 0.0, + "step": 59412 + }, + { + "epoch": 5.5438089017448915, + "grad_norm": NaN, + "learning_rate": 4.535199703487047e-06, + "loss": 0.0, + "step": 59413 + }, + { + "epoch": 5.543902211439769, + "grad_norm": NaN, + "learning_rate": 4.533354027540736e-06, + "loss": 0.0, + "step": 59414 + }, + { + "epoch": 5.543995521134645, + "grad_norm": NaN, + "learning_rate": 4.531508721473054e-06, + "loss": 0.0, + "step": 59415 + }, + { + "epoch": 5.544088830829523, + "grad_norm": NaN, + "learning_rate": 4.5296637852886284e-06, + "loss": 0.0, + "step": 59416 + }, + { + "epoch": 5.5441821405244, + "grad_norm": NaN, + "learning_rate": 4.527819218992157e-06, + "loss": 0.0, + "step": 59417 + }, + { + "epoch": 5.544275450219278, + "grad_norm": NaN, + "learning_rate": 4.525975022588385e-06, + "loss": 0.0, + "step": 59418 + }, + { + "epoch": 5.544368759914155, + "grad_norm": NaN, + "learning_rate": 4.5241311960819425e-06, + "loss": 0.0, + "step": 59419 + }, + { + "epoch": 5.5444620696090325, + "grad_norm": NaN, + "learning_rate": 4.522287739477542e-06, + "loss": 0.0, + "step": 59420 + }, + { + "epoch": 5.54455537930391, + "grad_norm": NaN, + "learning_rate": 4.520444652779898e-06, + "loss": 0.0, + "step": 59421 + }, + { + "epoch": 5.544648688998787, + "grad_norm": NaN, + "learning_rate": 4.5186019359936376e-06, + "loss": 0.0, + "step": 59422 + }, + { + "epoch": 5.544741998693664, + "grad_norm": NaN, + "learning_rate": 4.516759589123476e-06, + "loss": 0.0, + "step": 59423 + }, + { + "epoch": 5.544835308388541, + "grad_norm": NaN, + "learning_rate": 4.514917612174123e-06, + "loss": 0.0, + "step": 59424 + }, + { + "epoch": 5.544928618083419, + "grad_norm": NaN, + "learning_rate": 4.513076005150229e-06, + "loss": 0.0, + "step": 59425 + }, + { + "epoch": 5.545021927778296, + "grad_norm": NaN, + "learning_rate": 4.511234768056471e-06, + "loss": 0.0, + "step": 59426 + }, + { + "epoch": 5.545115237473174, + "grad_norm": NaN, + "learning_rate": 4.509393900897579e-06, + "loss": 0.0, + "step": 59427 + }, + { + "epoch": 5.545208547168051, + "grad_norm": NaN, + "learning_rate": 4.507553403678182e-06, + "loss": 0.0, + "step": 59428 + }, + { + "epoch": 5.545301856862928, + "grad_norm": NaN, + "learning_rate": 4.505713276402962e-06, + "loss": 0.0, + "step": 59429 + }, + { + "epoch": 5.545395166557805, + "grad_norm": NaN, + "learning_rate": 4.503873519076645e-06, + "loss": 0.0, + "step": 59430 + }, + { + "epoch": 5.545488476252682, + "grad_norm": NaN, + "learning_rate": 4.50203413170388e-06, + "loss": 0.0, + "step": 59431 + }, + { + "epoch": 5.54558178594756, + "grad_norm": NaN, + "learning_rate": 4.500195114289312e-06, + "loss": 0.0, + "step": 59432 + }, + { + "epoch": 5.545675095642437, + "grad_norm": NaN, + "learning_rate": 4.498356466837672e-06, + "loss": 0.0, + "step": 59433 + }, + { + "epoch": 5.545768405337315, + "grad_norm": NaN, + "learning_rate": 4.496518189353587e-06, + "loss": 0.0, + "step": 59434 + }, + { + "epoch": 5.545861715032192, + "grad_norm": NaN, + "learning_rate": 4.494680281841789e-06, + "loss": 0.0, + "step": 59435 + }, + { + "epoch": 5.5459550247270695, + "grad_norm": NaN, + "learning_rate": 4.492842744306891e-06, + "loss": 0.0, + "step": 59436 + }, + { + "epoch": 5.546048334421947, + "grad_norm": NaN, + "learning_rate": 4.491005576753587e-06, + "loss": 0.0, + "step": 59437 + }, + { + "epoch": 5.546141644116823, + "grad_norm": NaN, + "learning_rate": 4.489168779186575e-06, + "loss": 0.0, + "step": 59438 + }, + { + "epoch": 5.546234953811701, + "grad_norm": NaN, + "learning_rate": 4.487332351610501e-06, + "loss": 0.0, + "step": 59439 + }, + { + "epoch": 5.546328263506578, + "grad_norm": NaN, + "learning_rate": 4.485496294030011e-06, + "loss": 0.0, + "step": 59440 + }, + { + "epoch": 5.546421573201456, + "grad_norm": NaN, + "learning_rate": 4.483660606449835e-06, + "loss": 0.0, + "step": 59441 + }, + { + "epoch": 5.546514882896333, + "grad_norm": NaN, + "learning_rate": 4.4818252888745855e-06, + "loss": 0.0, + "step": 59442 + }, + { + "epoch": 5.5466081925912105, + "grad_norm": NaN, + "learning_rate": 4.4799903413089255e-06, + "loss": 0.0, + "step": 59443 + }, + { + "epoch": 5.546701502286087, + "grad_norm": NaN, + "learning_rate": 4.478155763757585e-06, + "loss": 0.0, + "step": 59444 + }, + { + "epoch": 5.5467948119809645, + "grad_norm": NaN, + "learning_rate": 4.476321556225177e-06, + "loss": 0.0, + "step": 59445 + }, + { + "epoch": 5.546888121675842, + "grad_norm": NaN, + "learning_rate": 4.474487718716347e-06, + "loss": 0.0, + "step": 59446 + }, + { + "epoch": 5.546981431370719, + "grad_norm": NaN, + "learning_rate": 4.472654251235825e-06, + "loss": 0.0, + "step": 59447 + }, + { + "epoch": 5.547074741065597, + "grad_norm": NaN, + "learning_rate": 4.470821153788223e-06, + "loss": 0.0, + "step": 59448 + }, + { + "epoch": 5.547168050760474, + "grad_norm": NaN, + "learning_rate": 4.468988426378189e-06, + "loss": 0.0, + "step": 59449 + }, + { + "epoch": 5.547261360455352, + "grad_norm": NaN, + "learning_rate": 4.467156069010452e-06, + "loss": 0.0, + "step": 59450 + }, + { + "epoch": 5.547354670150229, + "grad_norm": NaN, + "learning_rate": 4.465324081689608e-06, + "loss": 0.0, + "step": 59451 + }, + { + "epoch": 5.547447979845106, + "grad_norm": NaN, + "learning_rate": 4.463492464420304e-06, + "loss": 0.0, + "step": 59452 + }, + { + "epoch": 5.547541289539983, + "grad_norm": NaN, + "learning_rate": 4.461661217207268e-06, + "loss": 0.0, + "step": 59453 + }, + { + "epoch": 5.54763459923486, + "grad_norm": NaN, + "learning_rate": 4.459830340055115e-06, + "loss": 0.0, + "step": 59454 + }, + { + "epoch": 5.547727908929738, + "grad_norm": NaN, + "learning_rate": 4.457999832968473e-06, + "loss": 0.0, + "step": 59455 + }, + { + "epoch": 5.547821218624615, + "grad_norm": NaN, + "learning_rate": 4.456169695952055e-06, + "loss": 0.0, + "step": 59456 + }, + { + "epoch": 5.547914528319493, + "grad_norm": NaN, + "learning_rate": 4.4543399290104585e-06, + "loss": 0.0, + "step": 59457 + }, + { + "epoch": 5.54800783801437, + "grad_norm": NaN, + "learning_rate": 4.452510532148362e-06, + "loss": 0.0, + "step": 59458 + }, + { + "epoch": 5.548101147709247, + "grad_norm": NaN, + "learning_rate": 4.450681505370446e-06, + "loss": 0.0, + "step": 59459 + }, + { + "epoch": 5.548194457404124, + "grad_norm": NaN, + "learning_rate": 4.448852848681306e-06, + "loss": 0.0, + "step": 59460 + }, + { + "epoch": 5.548287767099001, + "grad_norm": NaN, + "learning_rate": 4.447024562085621e-06, + "loss": 0.0, + "step": 59461 + }, + { + "epoch": 5.548381076793879, + "grad_norm": NaN, + "learning_rate": 4.445196645588056e-06, + "loss": 0.0, + "step": 59462 + }, + { + "epoch": 5.548474386488756, + "grad_norm": NaN, + "learning_rate": 4.443369099193222e-06, + "loss": 0.0, + "step": 59463 + }, + { + "epoch": 5.548567696183634, + "grad_norm": NaN, + "learning_rate": 4.441541922905767e-06, + "loss": 0.0, + "step": 59464 + }, + { + "epoch": 5.548661005878511, + "grad_norm": NaN, + "learning_rate": 4.439715116730386e-06, + "loss": 0.0, + "step": 59465 + }, + { + "epoch": 5.5487543155733885, + "grad_norm": NaN, + "learning_rate": 4.437888680671675e-06, + "loss": 0.0, + "step": 59466 + }, + { + "epoch": 5.548847625268265, + "grad_norm": NaN, + "learning_rate": 4.436062614734282e-06, + "loss": 0.0, + "step": 59467 + }, + { + "epoch": 5.5489409349631424, + "grad_norm": NaN, + "learning_rate": 4.434236918922901e-06, + "loss": 0.0, + "step": 59468 + }, + { + "epoch": 5.54903424465802, + "grad_norm": NaN, + "learning_rate": 4.4324115932420966e-06, + "loss": 0.0, + "step": 59469 + }, + { + "epoch": 5.549127554352897, + "grad_norm": NaN, + "learning_rate": 4.430586637696565e-06, + "loss": 0.0, + "step": 59470 + }, + { + "epoch": 5.549220864047775, + "grad_norm": NaN, + "learning_rate": 4.428762052290935e-06, + "loss": 0.0, + "step": 59471 + }, + { + "epoch": 5.549314173742652, + "grad_norm": NaN, + "learning_rate": 4.4269378370298205e-06, + "loss": 0.0, + "step": 59472 + }, + { + "epoch": 5.5494074834375295, + "grad_norm": NaN, + "learning_rate": 4.4251139919179e-06, + "loss": 0.0, + "step": 59473 + }, + { + "epoch": 5.549500793132406, + "grad_norm": NaN, + "learning_rate": 4.423290516959804e-06, + "loss": 0.0, + "step": 59474 + }, + { + "epoch": 5.5495941028272835, + "grad_norm": NaN, + "learning_rate": 4.421467412160129e-06, + "loss": 0.0, + "step": 59475 + }, + { + "epoch": 5.549687412522161, + "grad_norm": NaN, + "learning_rate": 4.419644677523554e-06, + "loss": 0.0, + "step": 59476 + }, + { + "epoch": 5.549780722217038, + "grad_norm": NaN, + "learning_rate": 4.417822313054725e-06, + "loss": 0.0, + "step": 59477 + }, + { + "epoch": 5.549874031911916, + "grad_norm": NaN, + "learning_rate": 4.416000318758206e-06, + "loss": 0.0, + "step": 59478 + }, + { + "epoch": 5.549967341606793, + "grad_norm": NaN, + "learning_rate": 4.4141786946386925e-06, + "loss": 0.0, + "step": 59479 + }, + { + "epoch": 5.550060651301671, + "grad_norm": NaN, + "learning_rate": 4.412357440700798e-06, + "loss": 0.0, + "step": 59480 + }, + { + "epoch": 5.550153960996548, + "grad_norm": NaN, + "learning_rate": 4.410536556949135e-06, + "loss": 0.0, + "step": 59481 + }, + { + "epoch": 5.5502472706914245, + "grad_norm": NaN, + "learning_rate": 4.408716043388383e-06, + "loss": 0.0, + "step": 59482 + }, + { + "epoch": 5.550340580386302, + "grad_norm": NaN, + "learning_rate": 4.406895900023105e-06, + "loss": 0.0, + "step": 59483 + }, + { + "epoch": 5.550433890081179, + "grad_norm": NaN, + "learning_rate": 4.405076126857965e-06, + "loss": 0.0, + "step": 59484 + }, + { + "epoch": 5.550527199776057, + "grad_norm": NaN, + "learning_rate": 4.403256723897625e-06, + "loss": 0.0, + "step": 59485 + }, + { + "epoch": 5.550620509470934, + "grad_norm": NaN, + "learning_rate": 4.401437691146647e-06, + "loss": 0.0, + "step": 59486 + }, + { + "epoch": 5.550713819165812, + "grad_norm": NaN, + "learning_rate": 4.399619028609663e-06, + "loss": 0.0, + "step": 59487 + }, + { + "epoch": 5.550807128860688, + "grad_norm": NaN, + "learning_rate": 4.397800736291351e-06, + "loss": 0.0, + "step": 59488 + }, + { + "epoch": 5.550900438555566, + "grad_norm": NaN, + "learning_rate": 4.395982814196292e-06, + "loss": 0.0, + "step": 59489 + }, + { + "epoch": 5.550993748250443, + "grad_norm": NaN, + "learning_rate": 4.394165262329097e-06, + "loss": 0.0, + "step": 59490 + }, + { + "epoch": 5.55108705794532, + "grad_norm": NaN, + "learning_rate": 4.392348080694447e-06, + "loss": 0.0, + "step": 59491 + }, + { + "epoch": 5.551180367640198, + "grad_norm": NaN, + "learning_rate": 4.390531269296904e-06, + "loss": 0.0, + "step": 59492 + }, + { + "epoch": 5.551273677335075, + "grad_norm": NaN, + "learning_rate": 4.388714828141099e-06, + "loss": 0.0, + "step": 59493 + }, + { + "epoch": 5.551366987029953, + "grad_norm": NaN, + "learning_rate": 4.386898757231677e-06, + "loss": 0.0, + "step": 59494 + }, + { + "epoch": 5.55146029672483, + "grad_norm": NaN, + "learning_rate": 4.385083056573236e-06, + "loss": 0.0, + "step": 59495 + }, + { + "epoch": 5.551553606419707, + "grad_norm": NaN, + "learning_rate": 4.38326772617037e-06, + "loss": 0.0, + "step": 59496 + }, + { + "epoch": 5.551646916114584, + "grad_norm": NaN, + "learning_rate": 4.381452766027743e-06, + "loss": 0.0, + "step": 59497 + }, + { + "epoch": 5.5517402258094615, + "grad_norm": NaN, + "learning_rate": 4.379638176149936e-06, + "loss": 0.0, + "step": 59498 + }, + { + "epoch": 5.551833535504339, + "grad_norm": NaN, + "learning_rate": 4.37782395654156e-06, + "loss": 0.0, + "step": 59499 + }, + { + "epoch": 5.551926845199216, + "grad_norm": NaN, + "learning_rate": 4.376010107207279e-06, + "loss": 0.0, + "step": 59500 + }, + { + "epoch": 5.552020154894094, + "grad_norm": NaN, + "learning_rate": 4.374196628151638e-06, + "loss": 0.0, + "step": 59501 + }, + { + "epoch": 5.552113464588971, + "grad_norm": NaN, + "learning_rate": 4.372383519379269e-06, + "loss": 0.0, + "step": 59502 + }, + { + "epoch": 5.552206774283848, + "grad_norm": NaN, + "learning_rate": 4.370570780894833e-06, + "loss": 0.0, + "step": 59503 + }, + { + "epoch": 5.552300083978725, + "grad_norm": NaN, + "learning_rate": 4.368758412702861e-06, + "loss": 0.0, + "step": 59504 + }, + { + "epoch": 5.5523933936736025, + "grad_norm": NaN, + "learning_rate": 4.366946414807998e-06, + "loss": 0.0, + "step": 59505 + }, + { + "epoch": 5.55248670336848, + "grad_norm": NaN, + "learning_rate": 4.3651347872148745e-06, + "loss": 0.0, + "step": 59506 + }, + { + "epoch": 5.552580013063357, + "grad_norm": NaN, + "learning_rate": 4.363323529928037e-06, + "loss": 0.0, + "step": 59507 + }, + { + "epoch": 5.552673322758235, + "grad_norm": NaN, + "learning_rate": 4.361512642952148e-06, + "loss": 0.0, + "step": 59508 + }, + { + "epoch": 5.552766632453112, + "grad_norm": NaN, + "learning_rate": 4.359702126291803e-06, + "loss": 0.0, + "step": 59509 + }, + { + "epoch": 5.55285994214799, + "grad_norm": NaN, + "learning_rate": 4.357891979951566e-06, + "loss": 0.0, + "step": 59510 + }, + { + "epoch": 5.552953251842866, + "grad_norm": NaN, + "learning_rate": 4.3560822039360845e-06, + "loss": 0.0, + "step": 59511 + }, + { + "epoch": 5.553046561537744, + "grad_norm": NaN, + "learning_rate": 4.354272798249952e-06, + "loss": 0.0, + "step": 59512 + }, + { + "epoch": 5.553139871232621, + "grad_norm": NaN, + "learning_rate": 4.3524637628977345e-06, + "loss": 0.0, + "step": 59513 + }, + { + "epoch": 5.553233180927498, + "grad_norm": NaN, + "learning_rate": 4.350655097884076e-06, + "loss": 0.0, + "step": 59514 + }, + { + "epoch": 5.553326490622376, + "grad_norm": NaN, + "learning_rate": 4.348846803213574e-06, + "loss": 0.0, + "step": 59515 + }, + { + "epoch": 5.553419800317253, + "grad_norm": NaN, + "learning_rate": 4.3470388788907736e-06, + "loss": 0.0, + "step": 59516 + }, + { + "epoch": 5.55351311001213, + "grad_norm": NaN, + "learning_rate": 4.345231324920323e-06, + "loss": 0.0, + "step": 59517 + }, + { + "epoch": 5.553606419707007, + "grad_norm": NaN, + "learning_rate": 4.343424141306817e-06, + "loss": 0.0, + "step": 59518 + }, + { + "epoch": 5.553699729401885, + "grad_norm": NaN, + "learning_rate": 4.3416173280548024e-06, + "loss": 0.0, + "step": 59519 + }, + { + "epoch": 5.553793039096762, + "grad_norm": NaN, + "learning_rate": 4.339810885168926e-06, + "loss": 0.0, + "step": 59520 + }, + { + "epoch": 5.5538863487916394, + "grad_norm": NaN, + "learning_rate": 4.338004812653784e-06, + "loss": 0.0, + "step": 59521 + }, + { + "epoch": 5.553979658486517, + "grad_norm": NaN, + "learning_rate": 4.336199110513922e-06, + "loss": 0.0, + "step": 59522 + }, + { + "epoch": 5.554072968181394, + "grad_norm": NaN, + "learning_rate": 4.33439377875397e-06, + "loss": 0.0, + "step": 59523 + }, + { + "epoch": 5.554166277876272, + "grad_norm": NaN, + "learning_rate": 4.332588817378507e-06, + "loss": 0.0, + "step": 59524 + }, + { + "epoch": 5.554259587571149, + "grad_norm": NaN, + "learning_rate": 4.330784226392098e-06, + "loss": 0.0, + "step": 59525 + }, + { + "epoch": 5.554352897266026, + "grad_norm": NaN, + "learning_rate": 4.328980005799387e-06, + "loss": 0.0, + "step": 59526 + }, + { + "epoch": 5.554446206960903, + "grad_norm": NaN, + "learning_rate": 4.327176155604906e-06, + "loss": 0.0, + "step": 59527 + }, + { + "epoch": 5.5545395166557805, + "grad_norm": NaN, + "learning_rate": 4.325372675813249e-06, + "loss": 0.0, + "step": 59528 + }, + { + "epoch": 5.554632826350658, + "grad_norm": NaN, + "learning_rate": 4.323569566429064e-06, + "loss": 0.0, + "step": 59529 + }, + { + "epoch": 5.554726136045535, + "grad_norm": NaN, + "learning_rate": 4.321766827456863e-06, + "loss": 0.0, + "step": 59530 + }, + { + "epoch": 5.554819445740413, + "grad_norm": NaN, + "learning_rate": 4.319964458901226e-06, + "loss": 0.0, + "step": 59531 + }, + { + "epoch": 5.554912755435289, + "grad_norm": NaN, + "learning_rate": 4.318162460766817e-06, + "loss": 0.0, + "step": 59532 + }, + { + "epoch": 5.555006065130167, + "grad_norm": NaN, + "learning_rate": 4.316360833058147e-06, + "loss": 0.0, + "step": 59533 + }, + { + "epoch": 5.555099374825044, + "grad_norm": NaN, + "learning_rate": 4.314559575779797e-06, + "loss": 0.0, + "step": 59534 + }, + { + "epoch": 5.5551926845199215, + "grad_norm": NaN, + "learning_rate": 4.312758688936396e-06, + "loss": 0.0, + "step": 59535 + }, + { + "epoch": 5.555285994214799, + "grad_norm": NaN, + "learning_rate": 4.310958172532492e-06, + "loss": 0.0, + "step": 59536 + }, + { + "epoch": 5.555379303909676, + "grad_norm": NaN, + "learning_rate": 4.309158026572629e-06, + "loss": 0.0, + "step": 59537 + }, + { + "epoch": 5.555472613604554, + "grad_norm": NaN, + "learning_rate": 4.307358251061472e-06, + "loss": 0.0, + "step": 59538 + }, + { + "epoch": 5.555565923299431, + "grad_norm": NaN, + "learning_rate": 4.305558846003515e-06, + "loss": 0.0, + "step": 59539 + }, + { + "epoch": 5.555659232994308, + "grad_norm": NaN, + "learning_rate": 4.3037598114033564e-06, + "loss": 0.0, + "step": 59540 + }, + { + "epoch": 5.555752542689185, + "grad_norm": NaN, + "learning_rate": 4.301961147265609e-06, + "loss": 0.0, + "step": 59541 + }, + { + "epoch": 5.555845852384063, + "grad_norm": NaN, + "learning_rate": 4.3001628535948015e-06, + "loss": 0.0, + "step": 59542 + }, + { + "epoch": 5.55593916207894, + "grad_norm": NaN, + "learning_rate": 4.298364930395498e-06, + "loss": 0.0, + "step": 59543 + }, + { + "epoch": 5.556032471773817, + "grad_norm": NaN, + "learning_rate": 4.2965673776723275e-06, + "loss": 0.0, + "step": 59544 + }, + { + "epoch": 5.556125781468695, + "grad_norm": NaN, + "learning_rate": 4.2947701954297866e-06, + "loss": 0.0, + "step": 59545 + }, + { + "epoch": 5.556219091163572, + "grad_norm": NaN, + "learning_rate": 4.292973383672504e-06, + "loss": 0.0, + "step": 59546 + }, + { + "epoch": 5.556312400858449, + "grad_norm": NaN, + "learning_rate": 4.291176942405061e-06, + "loss": 0.0, + "step": 59547 + }, + { + "epoch": 5.556405710553326, + "grad_norm": NaN, + "learning_rate": 4.289380871631936e-06, + "loss": 0.0, + "step": 59548 + }, + { + "epoch": 5.556499020248204, + "grad_norm": NaN, + "learning_rate": 4.287585171357777e-06, + "loss": 0.0, + "step": 59549 + }, + { + "epoch": 5.556592329943081, + "grad_norm": NaN, + "learning_rate": 4.285789841587145e-06, + "loss": 0.0, + "step": 59550 + }, + { + "epoch": 5.5566856396379585, + "grad_norm": NaN, + "learning_rate": 4.283994882324554e-06, + "loss": 0.0, + "step": 59551 + }, + { + "epoch": 5.556778949332836, + "grad_norm": NaN, + "learning_rate": 4.2822002935746165e-06, + "loss": 0.0, + "step": 59552 + }, + { + "epoch": 5.556872259027713, + "grad_norm": NaN, + "learning_rate": 4.280406075341897e-06, + "loss": 0.0, + "step": 59553 + }, + { + "epoch": 5.556965568722591, + "grad_norm": NaN, + "learning_rate": 4.278612227630906e-06, + "loss": 0.0, + "step": 59554 + }, + { + "epoch": 5.557058878417467, + "grad_norm": NaN, + "learning_rate": 4.276818750446242e-06, + "loss": 0.0, + "step": 59555 + }, + { + "epoch": 5.557152188112345, + "grad_norm": NaN, + "learning_rate": 4.2750256437925015e-06, + "loss": 0.0, + "step": 59556 + }, + { + "epoch": 5.557245497807222, + "grad_norm": NaN, + "learning_rate": 4.273232907674146e-06, + "loss": 0.0, + "step": 59557 + }, + { + "epoch": 5.5573388075020995, + "grad_norm": NaN, + "learning_rate": 4.271440542095822e-06, + "loss": 0.0, + "step": 59558 + }, + { + "epoch": 5.557432117196977, + "grad_norm": NaN, + "learning_rate": 4.269648547062077e-06, + "loss": 0.0, + "step": 59559 + }, + { + "epoch": 5.557525426891854, + "grad_norm": NaN, + "learning_rate": 4.267856922577406e-06, + "loss": 0.0, + "step": 59560 + }, + { + "epoch": 5.557618736586731, + "grad_norm": NaN, + "learning_rate": 4.2660656686464235e-06, + "loss": 0.0, + "step": 59561 + }, + { + "epoch": 5.557712046281608, + "grad_norm": NaN, + "learning_rate": 4.264274785273691e-06, + "loss": 0.0, + "step": 59562 + }, + { + "epoch": 5.557805355976486, + "grad_norm": NaN, + "learning_rate": 4.2624842724636895e-06, + "loss": 0.0, + "step": 59563 + }, + { + "epoch": 5.557898665671363, + "grad_norm": NaN, + "learning_rate": 4.260694130221049e-06, + "loss": 0.0, + "step": 59564 + }, + { + "epoch": 5.557991975366241, + "grad_norm": NaN, + "learning_rate": 4.258904358550297e-06, + "loss": 0.0, + "step": 59565 + }, + { + "epoch": 5.558085285061118, + "grad_norm": NaN, + "learning_rate": 4.257114957455965e-06, + "loss": 0.0, + "step": 59566 + }, + { + "epoch": 5.558178594755995, + "grad_norm": NaN, + "learning_rate": 4.2553259269426324e-06, + "loss": 0.0, + "step": 59567 + }, + { + "epoch": 5.558271904450873, + "grad_norm": NaN, + "learning_rate": 4.253537267014828e-06, + "loss": 0.0, + "step": 59568 + }, + { + "epoch": 5.558365214145749, + "grad_norm": NaN, + "learning_rate": 4.251748977677083e-06, + "loss": 0.0, + "step": 59569 + }, + { + "epoch": 5.558458523840627, + "grad_norm": NaN, + "learning_rate": 4.249961058933993e-06, + "loss": 0.0, + "step": 59570 + }, + { + "epoch": 5.558551833535504, + "grad_norm": NaN, + "learning_rate": 4.248173510790071e-06, + "loss": 0.0, + "step": 59571 + }, + { + "epoch": 5.558645143230382, + "grad_norm": NaN, + "learning_rate": 4.246386333249846e-06, + "loss": 0.0, + "step": 59572 + }, + { + "epoch": 5.558738452925259, + "grad_norm": NaN, + "learning_rate": 4.244599526317932e-06, + "loss": 0.0, + "step": 59573 + }, + { + "epoch": 5.5588317626201365, + "grad_norm": NaN, + "learning_rate": 4.242813089998792e-06, + "loss": 0.0, + "step": 59574 + }, + { + "epoch": 5.558925072315014, + "grad_norm": NaN, + "learning_rate": 4.241027024296989e-06, + "loss": 0.0, + "step": 59575 + }, + { + "epoch": 5.55901838200989, + "grad_norm": NaN, + "learning_rate": 4.2392413292171194e-06, + "loss": 0.0, + "step": 59576 + }, + { + "epoch": 5.559111691704768, + "grad_norm": NaN, + "learning_rate": 4.237456004763662e-06, + "loss": 0.0, + "step": 59577 + }, + { + "epoch": 5.559205001399645, + "grad_norm": NaN, + "learning_rate": 4.235671050941147e-06, + "loss": 0.0, + "step": 59578 + }, + { + "epoch": 5.559298311094523, + "grad_norm": NaN, + "learning_rate": 4.233886467754188e-06, + "loss": 0.0, + "step": 59579 + }, + { + "epoch": 5.5593916207894, + "grad_norm": NaN, + "learning_rate": 4.232102255207265e-06, + "loss": 0.0, + "step": 59580 + }, + { + "epoch": 5.5594849304842775, + "grad_norm": NaN, + "learning_rate": 4.230318413304889e-06, + "loss": 0.0, + "step": 59581 + }, + { + "epoch": 5.559578240179155, + "grad_norm": NaN, + "learning_rate": 4.228534942051692e-06, + "loss": 0.0, + "step": 59582 + }, + { + "epoch": 5.559671549874032, + "grad_norm": NaN, + "learning_rate": 4.226751841452103e-06, + "loss": 0.0, + "step": 59583 + }, + { + "epoch": 5.559764859568909, + "grad_norm": NaN, + "learning_rate": 4.224969111510717e-06, + "loss": 0.0, + "step": 59584 + }, + { + "epoch": 5.559858169263786, + "grad_norm": NaN, + "learning_rate": 4.2231867522320666e-06, + "loss": 0.0, + "step": 59585 + }, + { + "epoch": 5.559951478958664, + "grad_norm": NaN, + "learning_rate": 4.221404763620645e-06, + "loss": 0.0, + "step": 59586 + }, + { + "epoch": 5.560044788653541, + "grad_norm": NaN, + "learning_rate": 4.219623145681017e-06, + "loss": 0.0, + "step": 59587 + }, + { + "epoch": 5.5601380983484185, + "grad_norm": NaN, + "learning_rate": 4.2178418984177285e-06, + "loss": 0.0, + "step": 59588 + }, + { + "epoch": 5.560231408043296, + "grad_norm": NaN, + "learning_rate": 4.216061021835259e-06, + "loss": 0.0, + "step": 59589 + }, + { + "epoch": 5.560324717738173, + "grad_norm": NaN, + "learning_rate": 4.214280515938157e-06, + "loss": 0.0, + "step": 59590 + }, + { + "epoch": 5.56041802743305, + "grad_norm": NaN, + "learning_rate": 4.212500380730982e-06, + "loss": 0.0, + "step": 59591 + }, + { + "epoch": 5.560511337127927, + "grad_norm": NaN, + "learning_rate": 4.2107206162181995e-06, + "loss": 0.0, + "step": 59592 + }, + { + "epoch": 5.560604646822805, + "grad_norm": NaN, + "learning_rate": 4.208941222404388e-06, + "loss": 0.0, + "step": 59593 + }, + { + "epoch": 5.560697956517682, + "grad_norm": NaN, + "learning_rate": 4.207162199294079e-06, + "loss": 0.0, + "step": 59594 + }, + { + "epoch": 5.56079126621256, + "grad_norm": NaN, + "learning_rate": 4.205383546891716e-06, + "loss": 0.0, + "step": 59595 + }, + { + "epoch": 5.560884575907437, + "grad_norm": NaN, + "learning_rate": 4.203605265201898e-06, + "loss": 0.0, + "step": 59596 + }, + { + "epoch": 5.560977885602314, + "grad_norm": NaN, + "learning_rate": 4.201827354229137e-06, + "loss": 0.0, + "step": 59597 + }, + { + "epoch": 5.561071195297192, + "grad_norm": NaN, + "learning_rate": 4.200049813977929e-06, + "loss": 0.0, + "step": 59598 + }, + { + "epoch": 5.561164504992068, + "grad_norm": NaN, + "learning_rate": 4.198272644452805e-06, + "loss": 0.0, + "step": 59599 + }, + { + "epoch": 5.561257814686946, + "grad_norm": NaN, + "learning_rate": 4.196495845658293e-06, + "loss": 0.0, + "step": 59600 + }, + { + "epoch": 5.561351124381823, + "grad_norm": NaN, + "learning_rate": 4.194719417598874e-06, + "loss": 0.0, + "step": 59601 + }, + { + "epoch": 5.561444434076701, + "grad_norm": NaN, + "learning_rate": 4.192943360279111e-06, + "loss": 0.0, + "step": 59602 + }, + { + "epoch": 5.561537743771578, + "grad_norm": NaN, + "learning_rate": 4.191167673703516e-06, + "loss": 0.0, + "step": 59603 + }, + { + "epoch": 5.5616310534664555, + "grad_norm": NaN, + "learning_rate": 4.18939235787657e-06, + "loss": 0.0, + "step": 59604 + }, + { + "epoch": 5.561724363161332, + "grad_norm": NaN, + "learning_rate": 4.187617412802818e-06, + "loss": 0.0, + "step": 59605 + }, + { + "epoch": 5.561817672856209, + "grad_norm": NaN, + "learning_rate": 4.185842838486775e-06, + "loss": 0.0, + "step": 59606 + }, + { + "epoch": 5.561910982551087, + "grad_norm": NaN, + "learning_rate": 4.1840686349329024e-06, + "loss": 0.0, + "step": 59607 + }, + { + "epoch": 5.562004292245964, + "grad_norm": NaN, + "learning_rate": 4.1822948021457635e-06, + "loss": 0.0, + "step": 59608 + }, + { + "epoch": 5.562097601940842, + "grad_norm": NaN, + "learning_rate": 4.180521340129889e-06, + "loss": 0.0, + "step": 59609 + }, + { + "epoch": 5.562190911635719, + "grad_norm": NaN, + "learning_rate": 4.178748248889707e-06, + "loss": 0.0, + "step": 59610 + }, + { + "epoch": 5.5622842213305965, + "grad_norm": NaN, + "learning_rate": 4.176975528429782e-06, + "loss": 0.0, + "step": 59611 + }, + { + "epoch": 5.562377531025474, + "grad_norm": NaN, + "learning_rate": 4.175203178754643e-06, + "loss": 0.0, + "step": 59612 + }, + { + "epoch": 5.5624708407203505, + "grad_norm": NaN, + "learning_rate": 4.173431199868721e-06, + "loss": 0.0, + "step": 59613 + }, + { + "epoch": 5.562564150415228, + "grad_norm": NaN, + "learning_rate": 4.1716595917766105e-06, + "loss": 0.0, + "step": 59614 + }, + { + "epoch": 5.562657460110105, + "grad_norm": NaN, + "learning_rate": 4.169888354482759e-06, + "loss": 0.0, + "step": 59615 + }, + { + "epoch": 5.562750769804983, + "grad_norm": NaN, + "learning_rate": 4.1681174879916625e-06, + "loss": 0.0, + "step": 59616 + }, + { + "epoch": 5.56284407949986, + "grad_norm": NaN, + "learning_rate": 4.166346992307867e-06, + "loss": 0.0, + "step": 59617 + }, + { + "epoch": 5.562937389194738, + "grad_norm": NaN, + "learning_rate": 4.164576867435837e-06, + "loss": 0.0, + "step": 59618 + }, + { + "epoch": 5.563030698889615, + "grad_norm": NaN, + "learning_rate": 4.162807113380084e-06, + "loss": 0.0, + "step": 59619 + }, + { + "epoch": 5.5631240085844915, + "grad_norm": NaN, + "learning_rate": 4.161037730145139e-06, + "loss": 0.0, + "step": 59620 + }, + { + "epoch": 5.563217318279369, + "grad_norm": NaN, + "learning_rate": 4.1592687177354474e-06, + "loss": 0.0, + "step": 59621 + }, + { + "epoch": 5.563310627974246, + "grad_norm": NaN, + "learning_rate": 4.15750007615554e-06, + "loss": 0.0, + "step": 59622 + }, + { + "epoch": 5.563403937669124, + "grad_norm": NaN, + "learning_rate": 4.155731805409928e-06, + "loss": 0.0, + "step": 59623 + }, + { + "epoch": 5.563497247364001, + "grad_norm": NaN, + "learning_rate": 4.153963905503061e-06, + "loss": 0.0, + "step": 59624 + }, + { + "epoch": 5.563590557058879, + "grad_norm": NaN, + "learning_rate": 4.152196376439465e-06, + "loss": 0.0, + "step": 59625 + }, + { + "epoch": 5.563683866753756, + "grad_norm": NaN, + "learning_rate": 4.150429218223656e-06, + "loss": 0.0, + "step": 59626 + }, + { + "epoch": 5.5637771764486335, + "grad_norm": NaN, + "learning_rate": 4.148662430860078e-06, + "loss": 0.0, + "step": 59627 + }, + { + "epoch": 5.56387048614351, + "grad_norm": NaN, + "learning_rate": 4.146896014353246e-06, + "loss": 0.0, + "step": 59628 + }, + { + "epoch": 5.563963795838387, + "grad_norm": NaN, + "learning_rate": 4.145129968707672e-06, + "loss": 0.0, + "step": 59629 + }, + { + "epoch": 5.564057105533265, + "grad_norm": NaN, + "learning_rate": 4.1433642939278034e-06, + "loss": 0.0, + "step": 59630 + }, + { + "epoch": 5.564150415228142, + "grad_norm": NaN, + "learning_rate": 4.141598990018169e-06, + "loss": 0.0, + "step": 59631 + }, + { + "epoch": 5.56424372492302, + "grad_norm": NaN, + "learning_rate": 4.139834056983249e-06, + "loss": 0.0, + "step": 59632 + }, + { + "epoch": 5.564337034617897, + "grad_norm": NaN, + "learning_rate": 4.138069494827506e-06, + "loss": 0.0, + "step": 59633 + }, + { + "epoch": 5.564430344312774, + "grad_norm": NaN, + "learning_rate": 4.136305303555454e-06, + "loss": 0.0, + "step": 59634 + }, + { + "epoch": 5.564523654007651, + "grad_norm": NaN, + "learning_rate": 4.134541483171571e-06, + "loss": 0.0, + "step": 59635 + }, + { + "epoch": 5.5646169637025285, + "grad_norm": NaN, + "learning_rate": 4.132778033680323e-06, + "loss": 0.0, + "step": 59636 + }, + { + "epoch": 5.564710273397406, + "grad_norm": NaN, + "learning_rate": 4.13101495508622e-06, + "loss": 0.0, + "step": 59637 + }, + { + "epoch": 5.564803583092283, + "grad_norm": NaN, + "learning_rate": 4.129252247393761e-06, + "loss": 0.0, + "step": 59638 + }, + { + "epoch": 5.564896892787161, + "grad_norm": NaN, + "learning_rate": 4.127489910607373e-06, + "loss": 0.0, + "step": 59639 + }, + { + "epoch": 5.564990202482038, + "grad_norm": NaN, + "learning_rate": 4.125727944731571e-06, + "loss": 0.0, + "step": 59640 + }, + { + "epoch": 5.5650835121769155, + "grad_norm": NaN, + "learning_rate": 4.123966349770868e-06, + "loss": 0.0, + "step": 59641 + }, + { + "epoch": 5.565176821871793, + "grad_norm": NaN, + "learning_rate": 4.12220512572966e-06, + "loss": 0.0, + "step": 59642 + }, + { + "epoch": 5.5652701315666695, + "grad_norm": NaN, + "learning_rate": 4.120444272612494e-06, + "loss": 0.0, + "step": 59643 + }, + { + "epoch": 5.565363441261547, + "grad_norm": NaN, + "learning_rate": 4.118683790423832e-06, + "loss": 0.0, + "step": 59644 + }, + { + "epoch": 5.565456750956424, + "grad_norm": NaN, + "learning_rate": 4.116923679168121e-06, + "loss": 0.0, + "step": 59645 + }, + { + "epoch": 5.565550060651302, + "grad_norm": NaN, + "learning_rate": 4.115163938849858e-06, + "loss": 0.0, + "step": 59646 + }, + { + "epoch": 5.565643370346179, + "grad_norm": NaN, + "learning_rate": 4.113404569473555e-06, + "loss": 0.0, + "step": 59647 + }, + { + "epoch": 5.565736680041057, + "grad_norm": NaN, + "learning_rate": 4.1116455710436095e-06, + "loss": 0.0, + "step": 59648 + }, + { + "epoch": 5.565829989735933, + "grad_norm": NaN, + "learning_rate": 4.10988694356455e-06, + "loss": 0.0, + "step": 59649 + }, + { + "epoch": 5.5659232994308105, + "grad_norm": NaN, + "learning_rate": 4.108128687040857e-06, + "loss": 0.0, + "step": 59650 + }, + { + "epoch": 5.566016609125688, + "grad_norm": NaN, + "learning_rate": 4.1063708014769435e-06, + "loss": 0.0, + "step": 59651 + }, + { + "epoch": 5.566109918820565, + "grad_norm": NaN, + "learning_rate": 4.1046132868773065e-06, + "loss": 0.0, + "step": 59652 + }, + { + "epoch": 5.566203228515443, + "grad_norm": NaN, + "learning_rate": 4.102856143246458e-06, + "loss": 0.0, + "step": 59653 + }, + { + "epoch": 5.56629653821032, + "grad_norm": NaN, + "learning_rate": 4.101099370588795e-06, + "loss": 0.0, + "step": 59654 + }, + { + "epoch": 5.566389847905198, + "grad_norm": NaN, + "learning_rate": 4.099342968908831e-06, + "loss": 0.0, + "step": 59655 + }, + { + "epoch": 5.566483157600075, + "grad_norm": NaN, + "learning_rate": 4.097586938211028e-06, + "loss": 0.0, + "step": 59656 + }, + { + "epoch": 5.566576467294952, + "grad_norm": NaN, + "learning_rate": 4.095831278499817e-06, + "loss": 0.0, + "step": 59657 + }, + { + "epoch": 5.566669776989829, + "grad_norm": NaN, + "learning_rate": 4.09407598977971e-06, + "loss": 0.0, + "step": 59658 + }, + { + "epoch": 5.566763086684706, + "grad_norm": NaN, + "learning_rate": 4.0923210720551215e-06, + "loss": 0.0, + "step": 59659 + }, + { + "epoch": 5.566856396379584, + "grad_norm": NaN, + "learning_rate": 4.090566525330563e-06, + "loss": 0.0, + "step": 59660 + }, + { + "epoch": 5.566949706074461, + "grad_norm": NaN, + "learning_rate": 4.0888123496104816e-06, + "loss": 0.0, + "step": 59661 + }, + { + "epoch": 5.567043015769339, + "grad_norm": NaN, + "learning_rate": 4.087058544899291e-06, + "loss": 0.0, + "step": 59662 + }, + { + "epoch": 5.567136325464216, + "grad_norm": NaN, + "learning_rate": 4.08530511120152e-06, + "loss": 0.0, + "step": 59663 + }, + { + "epoch": 5.567229635159093, + "grad_norm": NaN, + "learning_rate": 4.0835520485215985e-06, + "loss": 0.0, + "step": 59664 + }, + { + "epoch": 5.56732294485397, + "grad_norm": NaN, + "learning_rate": 4.081799356863941e-06, + "loss": 0.0, + "step": 59665 + }, + { + "epoch": 5.5674162545488475, + "grad_norm": NaN, + "learning_rate": 4.080047036233075e-06, + "loss": 0.0, + "step": 59666 + }, + { + "epoch": 5.567509564243725, + "grad_norm": NaN, + "learning_rate": 4.078295086633432e-06, + "loss": 0.0, + "step": 59667 + }, + { + "epoch": 5.567602873938602, + "grad_norm": NaN, + "learning_rate": 4.076543508069424e-06, + "loss": 0.0, + "step": 59668 + }, + { + "epoch": 5.56769618363348, + "grad_norm": NaN, + "learning_rate": 4.074792300545565e-06, + "loss": 0.0, + "step": 59669 + }, + { + "epoch": 5.567789493328357, + "grad_norm": NaN, + "learning_rate": 4.073041464066301e-06, + "loss": 0.0, + "step": 59670 + }, + { + "epoch": 5.567882803023235, + "grad_norm": NaN, + "learning_rate": 4.071290998636029e-06, + "loss": 0.0, + "step": 59671 + }, + { + "epoch": 5.567976112718111, + "grad_norm": NaN, + "learning_rate": 4.0695409042592445e-06, + "loss": 0.0, + "step": 59672 + }, + { + "epoch": 5.5680694224129885, + "grad_norm": NaN, + "learning_rate": 4.067791180940411e-06, + "loss": 0.0, + "step": 59673 + }, + { + "epoch": 5.568162732107866, + "grad_norm": NaN, + "learning_rate": 4.066041828683941e-06, + "loss": 0.0, + "step": 59674 + }, + { + "epoch": 5.568256041802743, + "grad_norm": NaN, + "learning_rate": 4.064292847494299e-06, + "loss": 0.0, + "step": 59675 + }, + { + "epoch": 5.568349351497621, + "grad_norm": NaN, + "learning_rate": 4.062544237375948e-06, + "loss": 0.0, + "step": 59676 + }, + { + "epoch": 5.568442661192498, + "grad_norm": NaN, + "learning_rate": 4.060795998333283e-06, + "loss": 0.0, + "step": 59677 + }, + { + "epoch": 5.568535970887375, + "grad_norm": NaN, + "learning_rate": 4.059048130370801e-06, + "loss": 0.0, + "step": 59678 + }, + { + "epoch": 5.568629280582252, + "grad_norm": NaN, + "learning_rate": 4.057300633492933e-06, + "loss": 0.0, + "step": 59679 + }, + { + "epoch": 5.56872259027713, + "grad_norm": NaN, + "learning_rate": 4.055553507704107e-06, + "loss": 0.0, + "step": 59680 + }, + { + "epoch": 5.568815899972007, + "grad_norm": NaN, + "learning_rate": 4.0538067530087866e-06, + "loss": 0.0, + "step": 59681 + }, + { + "epoch": 5.568909209666884, + "grad_norm": NaN, + "learning_rate": 4.0520603694114025e-06, + "loss": 0.0, + "step": 59682 + }, + { + "epoch": 5.569002519361762, + "grad_norm": NaN, + "learning_rate": 4.050314356916385e-06, + "loss": 0.0, + "step": 59683 + }, + { + "epoch": 5.569095829056639, + "grad_norm": NaN, + "learning_rate": 4.048568715528194e-06, + "loss": 0.0, + "step": 59684 + }, + { + "epoch": 5.569189138751517, + "grad_norm": NaN, + "learning_rate": 4.046823445251279e-06, + "loss": 0.0, + "step": 59685 + }, + { + "epoch": 5.569282448446393, + "grad_norm": NaN, + "learning_rate": 4.04507854609002e-06, + "loss": 0.0, + "step": 59686 + }, + { + "epoch": 5.569375758141271, + "grad_norm": NaN, + "learning_rate": 4.0433340180488935e-06, + "loss": 0.0, + "step": 59687 + }, + { + "epoch": 5.569469067836148, + "grad_norm": NaN, + "learning_rate": 4.041589861132366e-06, + "loss": 0.0, + "step": 59688 + }, + { + "epoch": 5.5695623775310255, + "grad_norm": NaN, + "learning_rate": 4.0398460753448155e-06, + "loss": 0.0, + "step": 59689 + }, + { + "epoch": 5.569655687225903, + "grad_norm": NaN, + "learning_rate": 4.0381026606906896e-06, + "loss": 0.0, + "step": 59690 + }, + { + "epoch": 5.56974899692078, + "grad_norm": NaN, + "learning_rate": 4.036359617174467e-06, + "loss": 0.0, + "step": 59691 + }, + { + "epoch": 5.569842306615658, + "grad_norm": NaN, + "learning_rate": 4.034616944800511e-06, + "loss": 0.0, + "step": 59692 + }, + { + "epoch": 5.569935616310534, + "grad_norm": NaN, + "learning_rate": 4.032874643573303e-06, + "loss": 0.0, + "step": 59693 + }, + { + "epoch": 5.570028926005412, + "grad_norm": NaN, + "learning_rate": 4.031132713497254e-06, + "loss": 0.0, + "step": 59694 + }, + { + "epoch": 5.570122235700289, + "grad_norm": NaN, + "learning_rate": 4.0293911545767945e-06, + "loss": 0.0, + "step": 59695 + }, + { + "epoch": 5.5702155453951665, + "grad_norm": NaN, + "learning_rate": 4.027649966816338e-06, + "loss": 0.0, + "step": 59696 + }, + { + "epoch": 5.570308855090044, + "grad_norm": NaN, + "learning_rate": 4.0259091502203465e-06, + "loss": 0.0, + "step": 59697 + }, + { + "epoch": 5.570402164784921, + "grad_norm": NaN, + "learning_rate": 4.024168704793218e-06, + "loss": 0.0, + "step": 59698 + }, + { + "epoch": 5.570495474479799, + "grad_norm": NaN, + "learning_rate": 4.022428630539398e-06, + "loss": 0.0, + "step": 59699 + }, + { + "epoch": 5.570588784174676, + "grad_norm": NaN, + "learning_rate": 4.0206889274633e-06, + "loss": 0.0, + "step": 59700 + }, + { + "epoch": 5.570682093869553, + "grad_norm": NaN, + "learning_rate": 4.018949595569338e-06, + "loss": 0.0, + "step": 59701 + }, + { + "epoch": 5.57077540356443, + "grad_norm": NaN, + "learning_rate": 4.017210634861956e-06, + "loss": 0.0, + "step": 59702 + }, + { + "epoch": 5.5708687132593075, + "grad_norm": NaN, + "learning_rate": 4.015472045345552e-06, + "loss": 0.0, + "step": 59703 + }, + { + "epoch": 5.570962022954185, + "grad_norm": NaN, + "learning_rate": 4.0137338270245565e-06, + "loss": 0.0, + "step": 59704 + }, + { + "epoch": 5.571055332649062, + "grad_norm": NaN, + "learning_rate": 4.0119959799034155e-06, + "loss": 0.0, + "step": 59705 + }, + { + "epoch": 5.57114864234394, + "grad_norm": NaN, + "learning_rate": 4.010258503986491e-06, + "loss": 0.0, + "step": 59706 + }, + { + "epoch": 5.571241952038816, + "grad_norm": NaN, + "learning_rate": 4.008521399278247e-06, + "loss": 0.0, + "step": 59707 + }, + { + "epoch": 5.571335261733694, + "grad_norm": NaN, + "learning_rate": 4.006784665783113e-06, + "loss": 0.0, + "step": 59708 + }, + { + "epoch": 5.571428571428571, + "grad_norm": NaN, + "learning_rate": 4.0050483035054374e-06, + "loss": 0.0, + "step": 59709 + }, + { + "epoch": 5.571521881123449, + "grad_norm": NaN, + "learning_rate": 4.003312312449697e-06, + "loss": 0.0, + "step": 59710 + }, + { + "epoch": 5.571615190818326, + "grad_norm": NaN, + "learning_rate": 4.0015766926202895e-06, + "loss": 0.0, + "step": 59711 + }, + { + "epoch": 5.571708500513203, + "grad_norm": NaN, + "learning_rate": 3.999841444021612e-06, + "loss": 0.0, + "step": 59712 + }, + { + "epoch": 5.571801810208081, + "grad_norm": NaN, + "learning_rate": 3.998106566658094e-06, + "loss": 0.0, + "step": 59713 + }, + { + "epoch": 5.571895119902958, + "grad_norm": NaN, + "learning_rate": 3.996372060534165e-06, + "loss": 0.0, + "step": 59714 + }, + { + "epoch": 5.571988429597836, + "grad_norm": NaN, + "learning_rate": 3.99463792565417e-06, + "loss": 0.0, + "step": 59715 + }, + { + "epoch": 5.572081739292712, + "grad_norm": NaN, + "learning_rate": 3.992904162022592e-06, + "loss": 0.0, + "step": 59716 + }, + { + "epoch": 5.57217504898759, + "grad_norm": NaN, + "learning_rate": 3.991170769643809e-06, + "loss": 0.0, + "step": 59717 + }, + { + "epoch": 5.572268358682467, + "grad_norm": NaN, + "learning_rate": 3.9894377485222005e-06, + "loss": 0.0, + "step": 59718 + }, + { + "epoch": 5.5723616683773445, + "grad_norm": NaN, + "learning_rate": 3.987705098662214e-06, + "loss": 0.0, + "step": 59719 + }, + { + "epoch": 5.572454978072222, + "grad_norm": NaN, + "learning_rate": 3.985972820068262e-06, + "loss": 0.0, + "step": 59720 + }, + { + "epoch": 5.572548287767099, + "grad_norm": NaN, + "learning_rate": 3.98424091274469e-06, + "loss": 0.0, + "step": 59721 + }, + { + "epoch": 5.572641597461976, + "grad_norm": NaN, + "learning_rate": 3.9825093766959635e-06, + "loss": 0.0, + "step": 59722 + }, + { + "epoch": 5.572734907156853, + "grad_norm": NaN, + "learning_rate": 3.980778211926478e-06, + "loss": 0.0, + "step": 59723 + }, + { + "epoch": 5.572828216851731, + "grad_norm": NaN, + "learning_rate": 3.979047418440595e-06, + "loss": 0.0, + "step": 59724 + }, + { + "epoch": 5.572921526546608, + "grad_norm": NaN, + "learning_rate": 3.977316996242746e-06, + "loss": 0.0, + "step": 59725 + }, + { + "epoch": 5.5730148362414855, + "grad_norm": NaN, + "learning_rate": 3.9755869453373445e-06, + "loss": 0.0, + "step": 59726 + }, + { + "epoch": 5.573108145936363, + "grad_norm": NaN, + "learning_rate": 3.973857265728736e-06, + "loss": 0.0, + "step": 59727 + }, + { + "epoch": 5.57320145563124, + "grad_norm": NaN, + "learning_rate": 3.972127957421367e-06, + "loss": 0.0, + "step": 59728 + }, + { + "epoch": 5.573294765326118, + "grad_norm": NaN, + "learning_rate": 3.970399020419635e-06, + "loss": 0.0, + "step": 59729 + }, + { + "epoch": 5.573388075020994, + "grad_norm": NaN, + "learning_rate": 3.968670454727885e-06, + "loss": 0.0, + "step": 59730 + }, + { + "epoch": 5.573481384715872, + "grad_norm": NaN, + "learning_rate": 3.966942260350564e-06, + "loss": 0.0, + "step": 59731 + }, + { + "epoch": 5.573574694410749, + "grad_norm": NaN, + "learning_rate": 3.965214437292069e-06, + "loss": 0.0, + "step": 59732 + }, + { + "epoch": 5.573668004105627, + "grad_norm": NaN, + "learning_rate": 3.963486985556746e-06, + "loss": 0.0, + "step": 59733 + }, + { + "epoch": 5.573761313800504, + "grad_norm": NaN, + "learning_rate": 3.961759905149043e-06, + "loss": 0.0, + "step": 59734 + }, + { + "epoch": 5.573854623495381, + "grad_norm": NaN, + "learning_rate": 3.96003319607332e-06, + "loss": 0.0, + "step": 59735 + }, + { + "epoch": 5.573947933190259, + "grad_norm": NaN, + "learning_rate": 3.95830685833396e-06, + "loss": 0.0, + "step": 59736 + }, + { + "epoch": 5.574041242885135, + "grad_norm": NaN, + "learning_rate": 3.956580891935374e-06, + "loss": 0.0, + "step": 59737 + }, + { + "epoch": 5.574134552580013, + "grad_norm": NaN, + "learning_rate": 3.954855296881942e-06, + "loss": 0.0, + "step": 59738 + }, + { + "epoch": 5.57422786227489, + "grad_norm": NaN, + "learning_rate": 3.953130073178046e-06, + "loss": 0.0, + "step": 59739 + }, + { + "epoch": 5.574321171969768, + "grad_norm": NaN, + "learning_rate": 3.95140522082808e-06, + "loss": 0.0, + "step": 59740 + }, + { + "epoch": 5.574414481664645, + "grad_norm": NaN, + "learning_rate": 3.949680739836425e-06, + "loss": 0.0, + "step": 59741 + }, + { + "epoch": 5.5745077913595225, + "grad_norm": NaN, + "learning_rate": 3.947956630207477e-06, + "loss": 0.0, + "step": 59742 + }, + { + "epoch": 5.5746011010544, + "grad_norm": NaN, + "learning_rate": 3.9462328919456e-06, + "loss": 0.0, + "step": 59743 + }, + { + "epoch": 5.574694410749277, + "grad_norm": NaN, + "learning_rate": 3.944509525055189e-06, + "loss": 0.0, + "step": 59744 + }, + { + "epoch": 5.574787720444154, + "grad_norm": NaN, + "learning_rate": 3.942786529540643e-06, + "loss": 0.0, + "step": 59745 + }, + { + "epoch": 5.574881030139031, + "grad_norm": NaN, + "learning_rate": 3.941063905406306e-06, + "loss": 0.0, + "step": 59746 + }, + { + "epoch": 5.574974339833909, + "grad_norm": NaN, + "learning_rate": 3.939341652656592e-06, + "loss": 0.0, + "step": 59747 + }, + { + "epoch": 5.575067649528786, + "grad_norm": NaN, + "learning_rate": 3.937619771295847e-06, + "loss": 0.0, + "step": 59748 + }, + { + "epoch": 5.5751609592236635, + "grad_norm": NaN, + "learning_rate": 3.935898261328502e-06, + "loss": 0.0, + "step": 59749 + }, + { + "epoch": 5.575254268918541, + "grad_norm": NaN, + "learning_rate": 3.934177122758869e-06, + "loss": 0.0, + "step": 59750 + }, + { + "epoch": 5.5753475786134175, + "grad_norm": NaN, + "learning_rate": 3.932456355591362e-06, + "loss": 0.0, + "step": 59751 + }, + { + "epoch": 5.575440888308295, + "grad_norm": NaN, + "learning_rate": 3.930735959830361e-06, + "loss": 0.0, + "step": 59752 + }, + { + "epoch": 5.575534198003172, + "grad_norm": NaN, + "learning_rate": 3.929015935480212e-06, + "loss": 0.0, + "step": 59753 + }, + { + "epoch": 5.57562750769805, + "grad_norm": NaN, + "learning_rate": 3.927296282545312e-06, + "loss": 0.0, + "step": 59754 + }, + { + "epoch": 5.575720817392927, + "grad_norm": NaN, + "learning_rate": 3.925577001030056e-06, + "loss": 0.0, + "step": 59755 + }, + { + "epoch": 5.5758141270878046, + "grad_norm": NaN, + "learning_rate": 3.923858090938742e-06, + "loss": 0.0, + "step": 59756 + }, + { + "epoch": 5.575907436782682, + "grad_norm": NaN, + "learning_rate": 3.9221395522757994e-06, + "loss": 0.0, + "step": 59757 + }, + { + "epoch": 5.576000746477559, + "grad_norm": NaN, + "learning_rate": 3.920421385045608e-06, + "loss": 0.0, + "step": 59758 + }, + { + "epoch": 5.576094056172437, + "grad_norm": NaN, + "learning_rate": 3.918703589252481e-06, + "loss": 0.0, + "step": 59759 + }, + { + "epoch": 5.576187365867313, + "grad_norm": NaN, + "learning_rate": 3.9169861649008485e-06, + "loss": 0.0, + "step": 59760 + }, + { + "epoch": 5.576280675562191, + "grad_norm": NaN, + "learning_rate": 3.91526911199504e-06, + "loss": 0.0, + "step": 59761 + }, + { + "epoch": 5.576373985257068, + "grad_norm": NaN, + "learning_rate": 3.913552430539401e-06, + "loss": 0.0, + "step": 59762 + }, + { + "epoch": 5.576467294951946, + "grad_norm": NaN, + "learning_rate": 3.911836120538347e-06, + "loss": 0.0, + "step": 59763 + }, + { + "epoch": 5.576560604646823, + "grad_norm": NaN, + "learning_rate": 3.91012018199624e-06, + "loss": 0.0, + "step": 59764 + }, + { + "epoch": 5.5766539143417, + "grad_norm": NaN, + "learning_rate": 3.908404614917376e-06, + "loss": 0.0, + "step": 59765 + }, + { + "epoch": 5.576747224036577, + "grad_norm": NaN, + "learning_rate": 3.9066894193061855e-06, + "loss": 0.0, + "step": 59766 + }, + { + "epoch": 5.576840533731454, + "grad_norm": NaN, + "learning_rate": 3.904974595167032e-06, + "loss": 0.0, + "step": 59767 + }, + { + "epoch": 5.576933843426332, + "grad_norm": NaN, + "learning_rate": 3.903260142504211e-06, + "loss": 0.0, + "step": 59768 + }, + { + "epoch": 5.577027153121209, + "grad_norm": NaN, + "learning_rate": 3.901546061322136e-06, + "loss": 0.0, + "step": 59769 + }, + { + "epoch": 5.577120462816087, + "grad_norm": NaN, + "learning_rate": 3.899832351625154e-06, + "loss": 0.0, + "step": 59770 + }, + { + "epoch": 5.577213772510964, + "grad_norm": NaN, + "learning_rate": 3.898119013417627e-06, + "loss": 0.0, + "step": 59771 + }, + { + "epoch": 5.5773070822058415, + "grad_norm": NaN, + "learning_rate": 3.896406046703904e-06, + "loss": 0.0, + "step": 59772 + }, + { + "epoch": 5.577400391900719, + "grad_norm": NaN, + "learning_rate": 3.894693451488329e-06, + "loss": 0.0, + "step": 59773 + }, + { + "epoch": 5.577493701595595, + "grad_norm": NaN, + "learning_rate": 3.892981227775283e-06, + "loss": 0.0, + "step": 59774 + }, + { + "epoch": 5.577587011290473, + "grad_norm": NaN, + "learning_rate": 3.891269375569095e-06, + "loss": 0.0, + "step": 59775 + }, + { + "epoch": 5.57768032098535, + "grad_norm": NaN, + "learning_rate": 3.8895578948741295e-06, + "loss": 0.0, + "step": 59776 + }, + { + "epoch": 5.577773630680228, + "grad_norm": NaN, + "learning_rate": 3.887846785694748e-06, + "loss": 0.0, + "step": 59777 + }, + { + "epoch": 5.577866940375105, + "grad_norm": NaN, + "learning_rate": 3.886136048035282e-06, + "loss": 0.0, + "step": 59778 + }, + { + "epoch": 5.5779602500699825, + "grad_norm": NaN, + "learning_rate": 3.884425681900094e-06, + "loss": 0.0, + "step": 59779 + }, + { + "epoch": 5.57805355976486, + "grad_norm": NaN, + "learning_rate": 3.882715687293531e-06, + "loss": 0.0, + "step": 59780 + }, + { + "epoch": 5.5781468694597365, + "grad_norm": NaN, + "learning_rate": 3.881006064219922e-06, + "loss": 0.0, + "step": 59781 + }, + { + "epoch": 5.578240179154614, + "grad_norm": NaN, + "learning_rate": 3.879296812683646e-06, + "loss": 0.0, + "step": 59782 + }, + { + "epoch": 5.578333488849491, + "grad_norm": NaN, + "learning_rate": 3.877587932689036e-06, + "loss": 0.0, + "step": 59783 + }, + { + "epoch": 5.578426798544369, + "grad_norm": NaN, + "learning_rate": 3.875879424240436e-06, + "loss": 0.0, + "step": 59784 + }, + { + "epoch": 5.578520108239246, + "grad_norm": NaN, + "learning_rate": 3.874171287342176e-06, + "loss": 0.0, + "step": 59785 + }, + { + "epoch": 5.578613417934124, + "grad_norm": NaN, + "learning_rate": 3.87246352199862e-06, + "loss": 0.0, + "step": 59786 + }, + { + "epoch": 5.578706727629001, + "grad_norm": NaN, + "learning_rate": 3.870756128214097e-06, + "loss": 0.0, + "step": 59787 + }, + { + "epoch": 5.578800037323878, + "grad_norm": NaN, + "learning_rate": 3.869049105992972e-06, + "loss": 0.0, + "step": 59788 + }, + { + "epoch": 5.578893347018755, + "grad_norm": NaN, + "learning_rate": 3.867342455339556e-06, + "loss": 0.0, + "step": 59789 + }, + { + "epoch": 5.578986656713632, + "grad_norm": NaN, + "learning_rate": 3.865636176258197e-06, + "loss": 0.0, + "step": 59790 + }, + { + "epoch": 5.57907996640851, + "grad_norm": NaN, + "learning_rate": 3.863930268753241e-06, + "loss": 0.0, + "step": 59791 + }, + { + "epoch": 5.579173276103387, + "grad_norm": NaN, + "learning_rate": 3.862224732829017e-06, + "loss": 0.0, + "step": 59792 + }, + { + "epoch": 5.579266585798265, + "grad_norm": NaN, + "learning_rate": 3.8605195684898895e-06, + "loss": 0.0, + "step": 59793 + }, + { + "epoch": 5.579359895493142, + "grad_norm": NaN, + "learning_rate": 3.8588147757401375e-06, + "loss": 0.0, + "step": 59794 + }, + { + "epoch": 5.579453205188019, + "grad_norm": NaN, + "learning_rate": 3.857110354584142e-06, + "loss": 0.0, + "step": 59795 + }, + { + "epoch": 5.579546514882896, + "grad_norm": NaN, + "learning_rate": 3.8554063050262486e-06, + "loss": 0.0, + "step": 59796 + }, + { + "epoch": 5.579639824577773, + "grad_norm": NaN, + "learning_rate": 3.853702627070721e-06, + "loss": 0.0, + "step": 59797 + }, + { + "epoch": 5.579733134272651, + "grad_norm": NaN, + "learning_rate": 3.851999320721971e-06, + "loss": 0.0, + "step": 59798 + }, + { + "epoch": 5.579826443967528, + "grad_norm": NaN, + "learning_rate": 3.850296385984297e-06, + "loss": 0.0, + "step": 59799 + }, + { + "epoch": 5.579919753662406, + "grad_norm": NaN, + "learning_rate": 3.848593822861995e-06, + "loss": 0.0, + "step": 59800 + }, + { + "epoch": 5.580013063357283, + "grad_norm": NaN, + "learning_rate": 3.8468916313594446e-06, + "loss": 0.0, + "step": 59801 + }, + { + "epoch": 5.5801063730521605, + "grad_norm": NaN, + "learning_rate": 3.8451898114809585e-06, + "loss": 0.0, + "step": 59802 + }, + { + "epoch": 5.580199682747037, + "grad_norm": NaN, + "learning_rate": 3.843488363230851e-06, + "loss": 0.0, + "step": 59803 + }, + { + "epoch": 5.5802929924419145, + "grad_norm": NaN, + "learning_rate": 3.841787286613451e-06, + "loss": 0.0, + "step": 59804 + }, + { + "epoch": 5.580386302136792, + "grad_norm": NaN, + "learning_rate": 3.840086581633123e-06, + "loss": 0.0, + "step": 59805 + }, + { + "epoch": 5.580479611831669, + "grad_norm": NaN, + "learning_rate": 3.8383862482941115e-06, + "loss": 0.0, + "step": 59806 + }, + { + "epoch": 5.580572921526547, + "grad_norm": NaN, + "learning_rate": 3.836686286600815e-06, + "loss": 0.0, + "step": 59807 + }, + { + "epoch": 5.580666231221424, + "grad_norm": NaN, + "learning_rate": 3.8349866965575295e-06, + "loss": 0.0, + "step": 59808 + }, + { + "epoch": 5.5807595409163016, + "grad_norm": NaN, + "learning_rate": 3.833287478168584e-06, + "loss": 0.0, + "step": 59809 + }, + { + "epoch": 5.580852850611178, + "grad_norm": NaN, + "learning_rate": 3.831588631438276e-06, + "loss": 0.0, + "step": 59810 + }, + { + "epoch": 5.5809461603060555, + "grad_norm": NaN, + "learning_rate": 3.829890156370935e-06, + "loss": 0.0, + "step": 59811 + }, + { + "epoch": 5.581039470000933, + "grad_norm": NaN, + "learning_rate": 3.828192052970891e-06, + "loss": 0.0, + "step": 59812 + }, + { + "epoch": 5.58113277969581, + "grad_norm": NaN, + "learning_rate": 3.826494321242474e-06, + "loss": 0.0, + "step": 59813 + }, + { + "epoch": 5.581226089390688, + "grad_norm": NaN, + "learning_rate": 3.824796961189963e-06, + "loss": 0.0, + "step": 59814 + }, + { + "epoch": 5.581319399085565, + "grad_norm": NaN, + "learning_rate": 3.823099972817689e-06, + "loss": 0.0, + "step": 59815 + }, + { + "epoch": 5.581412708780443, + "grad_norm": NaN, + "learning_rate": 3.821403356129982e-06, + "loss": 0.0, + "step": 59816 + }, + { + "epoch": 5.58150601847532, + "grad_norm": NaN, + "learning_rate": 3.819707111131137e-06, + "loss": 0.0, + "step": 59817 + }, + { + "epoch": 5.5815993281701966, + "grad_norm": NaN, + "learning_rate": 3.818011237825486e-06, + "loss": 0.0, + "step": 59818 + }, + { + "epoch": 5.581692637865074, + "grad_norm": NaN, + "learning_rate": 3.816315736217324e-06, + "loss": 0.0, + "step": 59819 + }, + { + "epoch": 5.581785947559951, + "grad_norm": NaN, + "learning_rate": 3.8146206063109643e-06, + "loss": 0.0, + "step": 59820 + }, + { + "epoch": 5.581879257254829, + "grad_norm": NaN, + "learning_rate": 3.8129258481107205e-06, + "loss": 0.0, + "step": 59821 + }, + { + "epoch": 5.581972566949706, + "grad_norm": NaN, + "learning_rate": 3.811231461620906e-06, + "loss": 0.0, + "step": 59822 + }, + { + "epoch": 5.582065876644584, + "grad_norm": NaN, + "learning_rate": 3.8095374468458172e-06, + "loss": 0.0, + "step": 59823 + }, + { + "epoch": 5.58215918633946, + "grad_norm": NaN, + "learning_rate": 3.807843803789784e-06, + "loss": 0.0, + "step": 59824 + }, + { + "epoch": 5.582252496034338, + "grad_norm": NaN, + "learning_rate": 3.8061505324570863e-06, + "loss": 0.0, + "step": 59825 + }, + { + "epoch": 5.582345805729215, + "grad_norm": NaN, + "learning_rate": 3.804457632852054e-06, + "loss": 0.0, + "step": 59826 + }, + { + "epoch": 5.582439115424092, + "grad_norm": NaN, + "learning_rate": 3.8027651049789667e-06, + "loss": 0.0, + "step": 59827 + }, + { + "epoch": 5.58253242511897, + "grad_norm": NaN, + "learning_rate": 3.8010729488421545e-06, + "loss": 0.0, + "step": 59828 + }, + { + "epoch": 5.582625734813847, + "grad_norm": NaN, + "learning_rate": 3.799381164445897e-06, + "loss": 0.0, + "step": 59829 + }, + { + "epoch": 5.582719044508725, + "grad_norm": NaN, + "learning_rate": 3.7976897517945083e-06, + "loss": 0.0, + "step": 59830 + }, + { + "epoch": 5.582812354203602, + "grad_norm": NaN, + "learning_rate": 3.795998710892284e-06, + "loss": 0.0, + "step": 59831 + }, + { + "epoch": 5.5829056638984795, + "grad_norm": NaN, + "learning_rate": 3.794308041743538e-06, + "loss": 0.0, + "step": 59832 + }, + { + "epoch": 5.582998973593356, + "grad_norm": NaN, + "learning_rate": 3.7926177443525494e-06, + "loss": 0.0, + "step": 59833 + }, + { + "epoch": 5.5830922832882335, + "grad_norm": NaN, + "learning_rate": 3.790927818723616e-06, + "loss": 0.0, + "step": 59834 + }, + { + "epoch": 5.583185592983111, + "grad_norm": NaN, + "learning_rate": 3.7892382648610496e-06, + "loss": 0.0, + "step": 59835 + }, + { + "epoch": 5.583278902677988, + "grad_norm": NaN, + "learning_rate": 3.787549082769148e-06, + "loss": 0.0, + "step": 59836 + }, + { + "epoch": 5.583372212372866, + "grad_norm": NaN, + "learning_rate": 3.78586027245219e-06, + "loss": 0.0, + "step": 59837 + }, + { + "epoch": 5.583465522067743, + "grad_norm": NaN, + "learning_rate": 3.784171833914473e-06, + "loss": 0.0, + "step": 59838 + }, + { + "epoch": 5.58355883176262, + "grad_norm": NaN, + "learning_rate": 3.7824837671602937e-06, + "loss": 0.0, + "step": 59839 + }, + { + "epoch": 5.583652141457497, + "grad_norm": NaN, + "learning_rate": 3.7807960721939646e-06, + "loss": 0.0, + "step": 59840 + }, + { + "epoch": 5.5837454511523745, + "grad_norm": NaN, + "learning_rate": 3.7791087490197323e-06, + "loss": 0.0, + "step": 59841 + }, + { + "epoch": 5.583838760847252, + "grad_norm": NaN, + "learning_rate": 3.7774217976419275e-06, + "loss": 0.0, + "step": 59842 + }, + { + "epoch": 5.583932070542129, + "grad_norm": NaN, + "learning_rate": 3.775735218064829e-06, + "loss": 0.0, + "step": 59843 + }, + { + "epoch": 5.584025380237007, + "grad_norm": NaN, + "learning_rate": 3.774049010292701e-06, + "loss": 0.0, + "step": 59844 + }, + { + "epoch": 5.584118689931884, + "grad_norm": NaN, + "learning_rate": 3.7723631743298568e-06, + "loss": 0.0, + "step": 59845 + }, + { + "epoch": 5.584211999626762, + "grad_norm": NaN, + "learning_rate": 3.7706777101805917e-06, + "loss": 0.0, + "step": 59846 + }, + { + "epoch": 5.584305309321638, + "grad_norm": NaN, + "learning_rate": 3.7689926178491705e-06, + "loss": 0.0, + "step": 59847 + }, + { + "epoch": 5.584398619016516, + "grad_norm": NaN, + "learning_rate": 3.7673078973398716e-06, + "loss": 0.0, + "step": 59848 + }, + { + "epoch": 5.584491928711393, + "grad_norm": NaN, + "learning_rate": 3.76562354865701e-06, + "loss": 0.0, + "step": 59849 + }, + { + "epoch": 5.58458523840627, + "grad_norm": NaN, + "learning_rate": 3.7639395718048305e-06, + "loss": 0.0, + "step": 59850 + }, + { + "epoch": 5.584678548101148, + "grad_norm": NaN, + "learning_rate": 3.762255966787647e-06, + "loss": 0.0, + "step": 59851 + }, + { + "epoch": 5.584771857796025, + "grad_norm": NaN, + "learning_rate": 3.7605727336097237e-06, + "loss": 0.0, + "step": 59852 + }, + { + "epoch": 5.584865167490903, + "grad_norm": NaN, + "learning_rate": 3.758889872275356e-06, + "loss": 0.0, + "step": 59853 + }, + { + "epoch": 5.584958477185779, + "grad_norm": NaN, + "learning_rate": 3.757207382788807e-06, + "loss": 0.0, + "step": 59854 + }, + { + "epoch": 5.585051786880657, + "grad_norm": NaN, + "learning_rate": 3.7555252651543577e-06, + "loss": 0.0, + "step": 59855 + }, + { + "epoch": 5.585145096575534, + "grad_norm": NaN, + "learning_rate": 3.7538435193762706e-06, + "loss": 0.0, + "step": 59856 + }, + { + "epoch": 5.5852384062704115, + "grad_norm": NaN, + "learning_rate": 3.752162145458859e-06, + "loss": 0.0, + "step": 59857 + }, + { + "epoch": 5.585331715965289, + "grad_norm": NaN, + "learning_rate": 3.75048114340637e-06, + "loss": 0.0, + "step": 59858 + }, + { + "epoch": 5.585425025660166, + "grad_norm": NaN, + "learning_rate": 3.7488005132230827e-06, + "loss": 0.0, + "step": 59859 + }, + { + "epoch": 5.585518335355044, + "grad_norm": NaN, + "learning_rate": 3.7471202549132774e-06, + "loss": 0.0, + "step": 59860 + }, + { + "epoch": 5.585611645049921, + "grad_norm": NaN, + "learning_rate": 3.7454403684812173e-06, + "loss": 0.0, + "step": 59861 + }, + { + "epoch": 5.585704954744798, + "grad_norm": NaN, + "learning_rate": 3.7437608539311825e-06, + "loss": 0.0, + "step": 59862 + }, + { + "epoch": 5.585798264439675, + "grad_norm": NaN, + "learning_rate": 3.7420817112674362e-06, + "loss": 0.0, + "step": 59863 + }, + { + "epoch": 5.5858915741345525, + "grad_norm": NaN, + "learning_rate": 3.740402940494258e-06, + "loss": 0.0, + "step": 59864 + }, + { + "epoch": 5.58598488382943, + "grad_norm": NaN, + "learning_rate": 3.738724541615912e-06, + "loss": 0.0, + "step": 59865 + }, + { + "epoch": 5.586078193524307, + "grad_norm": NaN, + "learning_rate": 3.7370465146366434e-06, + "loss": 0.0, + "step": 59866 + }, + { + "epoch": 5.586171503219185, + "grad_norm": NaN, + "learning_rate": 3.735368859560767e-06, + "loss": 0.0, + "step": 59867 + }, + { + "epoch": 5.586264812914061, + "grad_norm": NaN, + "learning_rate": 3.7336915763924946e-06, + "loss": 0.0, + "step": 59868 + }, + { + "epoch": 5.586358122608939, + "grad_norm": NaN, + "learning_rate": 3.7320146651361404e-06, + "loss": 0.0, + "step": 59869 + }, + { + "epoch": 5.586451432303816, + "grad_norm": NaN, + "learning_rate": 3.7303381257959342e-06, + "loss": 0.0, + "step": 59870 + }, + { + "epoch": 5.5865447419986936, + "grad_norm": NaN, + "learning_rate": 3.7286619583761392e-06, + "loss": 0.0, + "step": 59871 + }, + { + "epoch": 5.586638051693571, + "grad_norm": NaN, + "learning_rate": 3.726986162881035e-06, + "loss": 0.0, + "step": 59872 + }, + { + "epoch": 5.586731361388448, + "grad_norm": NaN, + "learning_rate": 3.725310739314885e-06, + "loss": 0.0, + "step": 59873 + }, + { + "epoch": 5.586824671083326, + "grad_norm": NaN, + "learning_rate": 3.7236356876819363e-06, + "loss": 0.0, + "step": 59874 + }, + { + "epoch": 5.586917980778203, + "grad_norm": NaN, + "learning_rate": 3.7219610079864514e-06, + "loss": 0.0, + "step": 59875 + }, + { + "epoch": 5.587011290473081, + "grad_norm": NaN, + "learning_rate": 3.7202867002326776e-06, + "loss": 0.0, + "step": 59876 + }, + { + "epoch": 5.587104600167957, + "grad_norm": NaN, + "learning_rate": 3.718612764424894e-06, + "loss": 0.0, + "step": 59877 + }, + { + "epoch": 5.587197909862835, + "grad_norm": NaN, + "learning_rate": 3.716939200567348e-06, + "loss": 0.0, + "step": 59878 + }, + { + "epoch": 5.587291219557712, + "grad_norm": NaN, + "learning_rate": 3.7152660086643027e-06, + "loss": 0.0, + "step": 59879 + }, + { + "epoch": 5.587384529252589, + "grad_norm": NaN, + "learning_rate": 3.7135931887199875e-06, + "loss": 0.0, + "step": 59880 + }, + { + "epoch": 5.587477838947467, + "grad_norm": NaN, + "learning_rate": 3.711920740738683e-06, + "loss": 0.0, + "step": 59881 + }, + { + "epoch": 5.587571148642344, + "grad_norm": NaN, + "learning_rate": 3.710248664724619e-06, + "loss": 0.0, + "step": 59882 + }, + { + "epoch": 5.587664458337221, + "grad_norm": NaN, + "learning_rate": 3.7085769606820583e-06, + "loss": 0.0, + "step": 59883 + }, + { + "epoch": 5.587757768032098, + "grad_norm": NaN, + "learning_rate": 3.7069056286152476e-06, + "loss": 0.0, + "step": 59884 + }, + { + "epoch": 5.587851077726976, + "grad_norm": NaN, + "learning_rate": 3.705234668528451e-06, + "loss": 0.0, + "step": 59885 + }, + { + "epoch": 5.587944387421853, + "grad_norm": NaN, + "learning_rate": 3.7035640804259137e-06, + "loss": 0.0, + "step": 59886 + }, + { + "epoch": 5.5880376971167305, + "grad_norm": NaN, + "learning_rate": 3.701893864311867e-06, + "loss": 0.0, + "step": 59887 + }, + { + "epoch": 5.588131006811608, + "grad_norm": NaN, + "learning_rate": 3.700224020190556e-06, + "loss": 0.0, + "step": 59888 + }, + { + "epoch": 5.588224316506485, + "grad_norm": NaN, + "learning_rate": 3.6985545480662625e-06, + "loss": 0.0, + "step": 59889 + }, + { + "epoch": 5.588317626201363, + "grad_norm": NaN, + "learning_rate": 3.696885447943182e-06, + "loss": 0.0, + "step": 59890 + }, + { + "epoch": 5.588410935896239, + "grad_norm": NaN, + "learning_rate": 3.695216719825611e-06, + "loss": 0.0, + "step": 59891 + }, + { + "epoch": 5.588504245591117, + "grad_norm": NaN, + "learning_rate": 3.6935483637177466e-06, + "loss": 0.0, + "step": 59892 + }, + { + "epoch": 5.588597555285994, + "grad_norm": NaN, + "learning_rate": 3.6918803796238515e-06, + "loss": 0.0, + "step": 59893 + }, + { + "epoch": 5.5886908649808715, + "grad_norm": NaN, + "learning_rate": 3.690212767548173e-06, + "loss": 0.0, + "step": 59894 + }, + { + "epoch": 5.588784174675749, + "grad_norm": NaN, + "learning_rate": 3.688545527494924e-06, + "loss": 0.0, + "step": 59895 + }, + { + "epoch": 5.588877484370626, + "grad_norm": NaN, + "learning_rate": 3.6868786594683842e-06, + "loss": 0.0, + "step": 59896 + }, + { + "epoch": 5.588970794065504, + "grad_norm": NaN, + "learning_rate": 3.6852121634727673e-06, + "loss": 0.0, + "step": 59897 + }, + { + "epoch": 5.58906410376038, + "grad_norm": NaN, + "learning_rate": 3.6835460395123027e-06, + "loss": 0.0, + "step": 59898 + }, + { + "epoch": 5.589157413455258, + "grad_norm": NaN, + "learning_rate": 3.6818802875912544e-06, + "loss": 0.0, + "step": 59899 + }, + { + "epoch": 5.589250723150135, + "grad_norm": NaN, + "learning_rate": 3.6802149077138187e-06, + "loss": 0.0, + "step": 59900 + }, + { + "epoch": 5.589344032845013, + "grad_norm": NaN, + "learning_rate": 3.6785498998842756e-06, + "loss": 0.0, + "step": 59901 + }, + { + "epoch": 5.58943734253989, + "grad_norm": NaN, + "learning_rate": 3.6768852641068212e-06, + "loss": 0.0, + "step": 59902 + }, + { + "epoch": 5.589530652234767, + "grad_norm": NaN, + "learning_rate": 3.6752210003857197e-06, + "loss": 0.0, + "step": 59903 + }, + { + "epoch": 5.589623961929645, + "grad_norm": NaN, + "learning_rate": 3.6735571087251836e-06, + "loss": 0.0, + "step": 59904 + }, + { + "epoch": 5.589717271624522, + "grad_norm": NaN, + "learning_rate": 3.6718935891294265e-06, + "loss": 0.0, + "step": 59905 + }, + { + "epoch": 5.589810581319399, + "grad_norm": NaN, + "learning_rate": 3.6702304416027285e-06, + "loss": 0.0, + "step": 59906 + }, + { + "epoch": 5.589903891014276, + "grad_norm": NaN, + "learning_rate": 3.668567666149269e-06, + "loss": 0.0, + "step": 59907 + }, + { + "epoch": 5.589997200709154, + "grad_norm": NaN, + "learning_rate": 3.666905262773295e-06, + "loss": 0.0, + "step": 59908 + }, + { + "epoch": 5.590090510404031, + "grad_norm": NaN, + "learning_rate": 3.665243231479037e-06, + "loss": 0.0, + "step": 59909 + }, + { + "epoch": 5.5901838200989085, + "grad_norm": NaN, + "learning_rate": 3.663581572270724e-06, + "loss": 0.0, + "step": 59910 + }, + { + "epoch": 5.590277129793786, + "grad_norm": NaN, + "learning_rate": 3.66192028515257e-06, + "loss": 0.0, + "step": 59911 + }, + { + "epoch": 5.590370439488662, + "grad_norm": NaN, + "learning_rate": 3.660259370128804e-06, + "loss": 0.0, + "step": 59912 + }, + { + "epoch": 5.59046374918354, + "grad_norm": NaN, + "learning_rate": 3.6585988272036404e-06, + "loss": 0.0, + "step": 59913 + }, + { + "epoch": 5.590557058878417, + "grad_norm": NaN, + "learning_rate": 3.656938656381325e-06, + "loss": 0.0, + "step": 59914 + }, + { + "epoch": 5.590650368573295, + "grad_norm": NaN, + "learning_rate": 3.6552788576660553e-06, + "loss": 0.0, + "step": 59915 + }, + { + "epoch": 5.590743678268172, + "grad_norm": NaN, + "learning_rate": 3.6536194310620603e-06, + "loss": 0.0, + "step": 59916 + }, + { + "epoch": 5.5908369879630495, + "grad_norm": NaN, + "learning_rate": 3.6519603765735707e-06, + "loss": 0.0, + "step": 59917 + }, + { + "epoch": 5.590930297657927, + "grad_norm": NaN, + "learning_rate": 3.6503016942047825e-06, + "loss": 0.0, + "step": 59918 + }, + { + "epoch": 5.591023607352804, + "grad_norm": NaN, + "learning_rate": 3.648643383959943e-06, + "loss": 0.0, + "step": 59919 + }, + { + "epoch": 5.591116917047681, + "grad_norm": NaN, + "learning_rate": 3.6469854458432315e-06, + "loss": 0.0, + "step": 59920 + }, + { + "epoch": 5.591210226742558, + "grad_norm": NaN, + "learning_rate": 3.645327879858895e-06, + "loss": 0.0, + "step": 59921 + }, + { + "epoch": 5.591303536437436, + "grad_norm": NaN, + "learning_rate": 3.6436706860111297e-06, + "loss": 0.0, + "step": 59922 + }, + { + "epoch": 5.591396846132313, + "grad_norm": NaN, + "learning_rate": 3.6420138643041665e-06, + "loss": 0.0, + "step": 59923 + }, + { + "epoch": 5.5914901558271906, + "grad_norm": NaN, + "learning_rate": 3.6403574147421845e-06, + "loss": 0.0, + "step": 59924 + }, + { + "epoch": 5.591583465522068, + "grad_norm": NaN, + "learning_rate": 3.638701337329447e-06, + "loss": 0.0, + "step": 59925 + }, + { + "epoch": 5.591676775216945, + "grad_norm": NaN, + "learning_rate": 3.637045632070118e-06, + "loss": 0.0, + "step": 59926 + }, + { + "epoch": 5.591770084911822, + "grad_norm": NaN, + "learning_rate": 3.635390298968427e-06, + "loss": 0.0, + "step": 59927 + }, + { + "epoch": 5.591863394606699, + "grad_norm": NaN, + "learning_rate": 3.633735338028587e-06, + "loss": 0.0, + "step": 59928 + }, + { + "epoch": 5.591956704301577, + "grad_norm": NaN, + "learning_rate": 3.6320807492547954e-06, + "loss": 0.0, + "step": 59929 + }, + { + "epoch": 5.592050013996454, + "grad_norm": NaN, + "learning_rate": 3.6304265326512816e-06, + "loss": 0.0, + "step": 59930 + }, + { + "epoch": 5.592143323691332, + "grad_norm": NaN, + "learning_rate": 3.628772688222209e-06, + "loss": 0.0, + "step": 59931 + }, + { + "epoch": 5.592236633386209, + "grad_norm": NaN, + "learning_rate": 3.6271192159718245e-06, + "loss": 0.0, + "step": 59932 + }, + { + "epoch": 5.592329943081086, + "grad_norm": NaN, + "learning_rate": 3.625466115904324e-06, + "loss": 0.0, + "step": 59933 + }, + { + "epoch": 5.592423252775964, + "grad_norm": NaN, + "learning_rate": 3.6238133880238882e-06, + "loss": 0.0, + "step": 59934 + }, + { + "epoch": 5.59251656247084, + "grad_norm": NaN, + "learning_rate": 3.6221610323347473e-06, + "loss": 0.0, + "step": 59935 + }, + { + "epoch": 5.592609872165718, + "grad_norm": NaN, + "learning_rate": 3.6205090488410805e-06, + "loss": 0.0, + "step": 59936 + }, + { + "epoch": 5.592703181860595, + "grad_norm": NaN, + "learning_rate": 3.6188574375471014e-06, + "loss": 0.0, + "step": 59937 + }, + { + "epoch": 5.592796491555473, + "grad_norm": NaN, + "learning_rate": 3.6172061984570235e-06, + "loss": 0.0, + "step": 59938 + }, + { + "epoch": 5.59288980125035, + "grad_norm": NaN, + "learning_rate": 3.6155553315750096e-06, + "loss": 0.0, + "step": 59939 + }, + { + "epoch": 5.5929831109452275, + "grad_norm": NaN, + "learning_rate": 3.6139048369052737e-06, + "loss": 0.0, + "step": 59940 + }, + { + "epoch": 5.593076420640104, + "grad_norm": NaN, + "learning_rate": 3.612254714452029e-06, + "loss": 0.0, + "step": 59941 + }, + { + "epoch": 5.593169730334981, + "grad_norm": NaN, + "learning_rate": 3.6106049642194544e-06, + "loss": 0.0, + "step": 59942 + }, + { + "epoch": 5.593263040029859, + "grad_norm": NaN, + "learning_rate": 3.6089555862117313e-06, + "loss": 0.0, + "step": 59943 + }, + { + "epoch": 5.593356349724736, + "grad_norm": NaN, + "learning_rate": 3.607306580433089e-06, + "loss": 0.0, + "step": 59944 + }, + { + "epoch": 5.593449659419614, + "grad_norm": NaN, + "learning_rate": 3.605657946887691e-06, + "loss": 0.0, + "step": 59945 + }, + { + "epoch": 5.593542969114491, + "grad_norm": NaN, + "learning_rate": 3.604009685579751e-06, + "loss": 0.0, + "step": 59946 + }, + { + "epoch": 5.5936362788093685, + "grad_norm": NaN, + "learning_rate": 3.602361796513431e-06, + "loss": 0.0, + "step": 59947 + }, + { + "epoch": 5.593729588504246, + "grad_norm": NaN, + "learning_rate": 3.6007142796929457e-06, + "loss": 0.0, + "step": 59948 + }, + { + "epoch": 5.593822898199123, + "grad_norm": NaN, + "learning_rate": 3.599067135122491e-06, + "loss": 0.0, + "step": 59949 + }, + { + "epoch": 5.593916207894, + "grad_norm": NaN, + "learning_rate": 3.5974203628062305e-06, + "loss": 0.0, + "step": 59950 + }, + { + "epoch": 5.594009517588877, + "grad_norm": NaN, + "learning_rate": 3.5957739627483605e-06, + "loss": 0.0, + "step": 59951 + }, + { + "epoch": 5.594102827283755, + "grad_norm": NaN, + "learning_rate": 3.5941279349530615e-06, + "loss": 0.0, + "step": 59952 + }, + { + "epoch": 5.594196136978632, + "grad_norm": NaN, + "learning_rate": 3.5924822794245466e-06, + "loss": 0.0, + "step": 59953 + }, + { + "epoch": 5.59428944667351, + "grad_norm": NaN, + "learning_rate": 3.5908369961669624e-06, + "loss": 0.0, + "step": 59954 + }, + { + "epoch": 5.594382756368387, + "grad_norm": NaN, + "learning_rate": 3.5891920851845226e-06, + "loss": 0.0, + "step": 59955 + }, + { + "epoch": 5.5944760660632635, + "grad_norm": NaN, + "learning_rate": 3.58754754648139e-06, + "loss": 0.0, + "step": 59956 + }, + { + "epoch": 5.594569375758141, + "grad_norm": NaN, + "learning_rate": 3.585903380061761e-06, + "loss": 0.0, + "step": 59957 + }, + { + "epoch": 5.594662685453018, + "grad_norm": NaN, + "learning_rate": 3.5842595859297995e-06, + "loss": 0.0, + "step": 59958 + }, + { + "epoch": 5.594755995147896, + "grad_norm": NaN, + "learning_rate": 3.582616164089702e-06, + "loss": 0.0, + "step": 59959 + }, + { + "epoch": 5.594849304842773, + "grad_norm": NaN, + "learning_rate": 3.580973114545632e-06, + "loss": 0.0, + "step": 59960 + }, + { + "epoch": 5.594942614537651, + "grad_norm": NaN, + "learning_rate": 3.5793304373017694e-06, + "loss": 0.0, + "step": 59961 + }, + { + "epoch": 5.595035924232528, + "grad_norm": NaN, + "learning_rate": 3.577688132362311e-06, + "loss": 0.0, + "step": 59962 + }, + { + "epoch": 5.5951292339274055, + "grad_norm": NaN, + "learning_rate": 3.5760461997314194e-06, + "loss": 0.0, + "step": 59963 + }, + { + "epoch": 5.595222543622282, + "grad_norm": NaN, + "learning_rate": 3.5744046394132585e-06, + "loss": 0.0, + "step": 59964 + }, + { + "epoch": 5.595315853317159, + "grad_norm": NaN, + "learning_rate": 3.5727634514120087e-06, + "loss": 0.0, + "step": 59965 + }, + { + "epoch": 5.595409163012037, + "grad_norm": NaN, + "learning_rate": 3.571122635731849e-06, + "loss": 0.0, + "step": 59966 + }, + { + "epoch": 5.595502472706914, + "grad_norm": NaN, + "learning_rate": 3.5694821923769602e-06, + "loss": 0.0, + "step": 59967 + }, + { + "epoch": 5.595595782401792, + "grad_norm": NaN, + "learning_rate": 3.5678421213514884e-06, + "loss": 0.0, + "step": 59968 + }, + { + "epoch": 5.595689092096669, + "grad_norm": NaN, + "learning_rate": 3.5662024226596307e-06, + "loss": 0.0, + "step": 59969 + }, + { + "epoch": 5.5957824017915465, + "grad_norm": NaN, + "learning_rate": 3.5645630963055338e-06, + "loss": 0.0, + "step": 59970 + }, + { + "epoch": 5.595875711486423, + "grad_norm": NaN, + "learning_rate": 3.5629241422933774e-06, + "loss": 0.0, + "step": 59971 + }, + { + "epoch": 5.5959690211813005, + "grad_norm": NaN, + "learning_rate": 3.561285560627325e-06, + "loss": 0.0, + "step": 59972 + }, + { + "epoch": 5.596062330876178, + "grad_norm": NaN, + "learning_rate": 3.5596473513115566e-06, + "loss": 0.0, + "step": 59973 + }, + { + "epoch": 5.596155640571055, + "grad_norm": NaN, + "learning_rate": 3.5580095143502185e-06, + "loss": 0.0, + "step": 59974 + }, + { + "epoch": 5.596248950265933, + "grad_norm": NaN, + "learning_rate": 3.5563720497474745e-06, + "loss": 0.0, + "step": 59975 + }, + { + "epoch": 5.59634225996081, + "grad_norm": NaN, + "learning_rate": 3.5547349575075046e-06, + "loss": 0.0, + "step": 59976 + }, + { + "epoch": 5.596435569655688, + "grad_norm": NaN, + "learning_rate": 3.553098237634472e-06, + "loss": 0.0, + "step": 59977 + }, + { + "epoch": 5.596528879350565, + "grad_norm": NaN, + "learning_rate": 3.5514618901325232e-06, + "loss": 0.0, + "step": 59978 + }, + { + "epoch": 5.5966221890454415, + "grad_norm": NaN, + "learning_rate": 3.5498259150058216e-06, + "loss": 0.0, + "step": 59979 + }, + { + "epoch": 5.596715498740319, + "grad_norm": NaN, + "learning_rate": 3.5481903122585307e-06, + "loss": 0.0, + "step": 59980 + }, + { + "epoch": 5.596808808435196, + "grad_norm": NaN, + "learning_rate": 3.546555081894814e-06, + "loss": 0.0, + "step": 59981 + }, + { + "epoch": 5.596902118130074, + "grad_norm": NaN, + "learning_rate": 3.544920223918818e-06, + "loss": 0.0, + "step": 59982 + }, + { + "epoch": 5.596995427824951, + "grad_norm": NaN, + "learning_rate": 3.5432857383347057e-06, + "loss": 0.0, + "step": 59983 + }, + { + "epoch": 5.597088737519829, + "grad_norm": NaN, + "learning_rate": 3.541651625146641e-06, + "loss": 0.0, + "step": 59984 + }, + { + "epoch": 5.597182047214705, + "grad_norm": NaN, + "learning_rate": 3.54001788435877e-06, + "loss": 0.0, + "step": 59985 + }, + { + "epoch": 5.5972753569095826, + "grad_norm": NaN, + "learning_rate": 3.5383845159752565e-06, + "loss": 0.0, + "step": 59986 + }, + { + "epoch": 5.59736866660446, + "grad_norm": NaN, + "learning_rate": 3.5367515200002474e-06, + "loss": 0.0, + "step": 59987 + }, + { + "epoch": 5.597461976299337, + "grad_norm": NaN, + "learning_rate": 3.5351188964378718e-06, + "loss": 0.0, + "step": 59988 + }, + { + "epoch": 5.597555285994215, + "grad_norm": NaN, + "learning_rate": 3.533486645292327e-06, + "loss": 0.0, + "step": 59989 + }, + { + "epoch": 5.597648595689092, + "grad_norm": NaN, + "learning_rate": 3.5318547665677264e-06, + "loss": 0.0, + "step": 59990 + }, + { + "epoch": 5.59774190538397, + "grad_norm": NaN, + "learning_rate": 3.5302232602682335e-06, + "loss": 0.0, + "step": 59991 + }, + { + "epoch": 5.597835215078847, + "grad_norm": NaN, + "learning_rate": 3.5285921263979945e-06, + "loss": 0.0, + "step": 59992 + }, + { + "epoch": 5.597928524773724, + "grad_norm": NaN, + "learning_rate": 3.526961364961156e-06, + "loss": 0.0, + "step": 59993 + }, + { + "epoch": 5.598021834468601, + "grad_norm": NaN, + "learning_rate": 3.5253309759618654e-06, + "loss": 0.0, + "step": 59994 + }, + { + "epoch": 5.598115144163478, + "grad_norm": NaN, + "learning_rate": 3.5237009594042855e-06, + "loss": 0.0, + "step": 59995 + }, + { + "epoch": 5.598208453858356, + "grad_norm": NaN, + "learning_rate": 3.5220713152925295e-06, + "loss": 0.0, + "step": 59996 + }, + { + "epoch": 5.598301763553233, + "grad_norm": NaN, + "learning_rate": 3.5204420436307612e-06, + "loss": 0.0, + "step": 59997 + }, + { + "epoch": 5.598395073248111, + "grad_norm": NaN, + "learning_rate": 3.5188131444231107e-06, + "loss": 0.0, + "step": 59998 + }, + { + "epoch": 5.598488382942988, + "grad_norm": NaN, + "learning_rate": 3.5171846176737406e-06, + "loss": 0.0, + "step": 59999 + }, + { + "epoch": 5.598581692637865, + "grad_norm": NaN, + "learning_rate": 3.515556463386765e-06, + "loss": 0.0, + "step": 60000 + }, + { + "epoch": 5.598675002332742, + "grad_norm": NaN, + "learning_rate": 3.513928681566347e-06, + "loss": 0.0, + "step": 60001 + }, + { + "epoch": 5.5987683120276195, + "grad_norm": NaN, + "learning_rate": 3.5123012722166e-06, + "loss": 0.0, + "step": 60002 + }, + { + "epoch": 5.598861621722497, + "grad_norm": NaN, + "learning_rate": 3.510674235341704e-06, + "loss": 0.0, + "step": 60003 + }, + { + "epoch": 5.598954931417374, + "grad_norm": NaN, + "learning_rate": 3.5090475709457557e-06, + "loss": 0.0, + "step": 60004 + }, + { + "epoch": 5.599048241112252, + "grad_norm": NaN, + "learning_rate": 3.5074212790329016e-06, + "loss": 0.0, + "step": 60005 + }, + { + "epoch": 5.599141550807129, + "grad_norm": NaN, + "learning_rate": 3.5057953596072885e-06, + "loss": 0.0, + "step": 60006 + }, + { + "epoch": 5.599234860502007, + "grad_norm": NaN, + "learning_rate": 3.50416981267303e-06, + "loss": 0.0, + "step": 60007 + }, + { + "epoch": 5.599328170196883, + "grad_norm": NaN, + "learning_rate": 3.502544638234289e-06, + "loss": 0.0, + "step": 60008 + }, + { + "epoch": 5.5994214798917605, + "grad_norm": NaN, + "learning_rate": 3.5009198362951797e-06, + "loss": 0.0, + "step": 60009 + }, + { + "epoch": 5.599514789586638, + "grad_norm": NaN, + "learning_rate": 3.4992954068598144e-06, + "loss": 0.0, + "step": 60010 + }, + { + "epoch": 5.599608099281515, + "grad_norm": NaN, + "learning_rate": 3.4976713499323738e-06, + "loss": 0.0, + "step": 60011 + }, + { + "epoch": 5.599701408976393, + "grad_norm": NaN, + "learning_rate": 3.496047665516938e-06, + "loss": 0.0, + "step": 60012 + }, + { + "epoch": 5.59979471867127, + "grad_norm": NaN, + "learning_rate": 3.494424353617653e-06, + "loss": 0.0, + "step": 60013 + }, + { + "epoch": 5.599888028366148, + "grad_norm": NaN, + "learning_rate": 3.492801414238666e-06, + "loss": 0.0, + "step": 60014 + }, + { + "epoch": 5.599981338061024, + "grad_norm": NaN, + "learning_rate": 3.491178847384074e-06, + "loss": 0.0, + "step": 60015 + }, + { + "epoch": 5.600074647755902, + "grad_norm": NaN, + "learning_rate": 3.4895566530580232e-06, + "loss": 0.0, + "step": 60016 + }, + { + "epoch": 5.600167957450779, + "grad_norm": NaN, + "learning_rate": 3.487934831264627e-06, + "loss": 0.0, + "step": 60017 + }, + { + "epoch": 5.600261267145656, + "grad_norm": NaN, + "learning_rate": 3.486313382007999e-06, + "loss": 0.0, + "step": 60018 + }, + { + "epoch": 5.600354576840534, + "grad_norm": NaN, + "learning_rate": 3.484692305292286e-06, + "loss": 0.0, + "step": 60019 + }, + { + "epoch": 5.600447886535411, + "grad_norm": NaN, + "learning_rate": 3.483071601121601e-06, + "loss": 0.0, + "step": 60020 + }, + { + "epoch": 5.600541196230289, + "grad_norm": NaN, + "learning_rate": 3.481451269500074e-06, + "loss": 0.0, + "step": 60021 + }, + { + "epoch": 5.600634505925166, + "grad_norm": NaN, + "learning_rate": 3.479831310431802e-06, + "loss": 0.0, + "step": 60022 + }, + { + "epoch": 5.600727815620043, + "grad_norm": NaN, + "learning_rate": 3.4782117239209153e-06, + "loss": 0.0, + "step": 60023 + }, + { + "epoch": 5.60082112531492, + "grad_norm": NaN, + "learning_rate": 3.4765925099715266e-06, + "loss": 0.0, + "step": 60024 + }, + { + "epoch": 5.6009144350097975, + "grad_norm": NaN, + "learning_rate": 3.474973668587783e-06, + "loss": 0.0, + "step": 60025 + }, + { + "epoch": 5.601007744704675, + "grad_norm": NaN, + "learning_rate": 3.4733551997737475e-06, + "loss": 0.0, + "step": 60026 + }, + { + "epoch": 5.601101054399552, + "grad_norm": NaN, + "learning_rate": 3.471737103533584e-06, + "loss": 0.0, + "step": 60027 + }, + { + "epoch": 5.60119436409443, + "grad_norm": NaN, + "learning_rate": 3.470119379871389e-06, + "loss": 0.0, + "step": 60028 + }, + { + "epoch": 5.601287673789306, + "grad_norm": NaN, + "learning_rate": 3.468502028791276e-06, + "loss": 0.0, + "step": 60029 + }, + { + "epoch": 5.601380983484184, + "grad_norm": NaN, + "learning_rate": 3.466885050297341e-06, + "loss": 0.0, + "step": 60030 + }, + { + "epoch": 5.601474293179061, + "grad_norm": NaN, + "learning_rate": 3.465268444393732e-06, + "loss": 0.0, + "step": 60031 + }, + { + "epoch": 5.6015676028739385, + "grad_norm": NaN, + "learning_rate": 3.463652211084528e-06, + "loss": 0.0, + "step": 60032 + }, + { + "epoch": 5.601660912568816, + "grad_norm": NaN, + "learning_rate": 3.4620363503738422e-06, + "loss": 0.0, + "step": 60033 + }, + { + "epoch": 5.601754222263693, + "grad_norm": NaN, + "learning_rate": 3.4604208622658057e-06, + "loss": 0.0, + "step": 60034 + }, + { + "epoch": 5.601847531958571, + "grad_norm": NaN, + "learning_rate": 3.458805746764498e-06, + "loss": 0.0, + "step": 60035 + }, + { + "epoch": 5.601940841653448, + "grad_norm": NaN, + "learning_rate": 3.4571910038740493e-06, + "loss": 0.0, + "step": 60036 + }, + { + "epoch": 5.602034151348325, + "grad_norm": NaN, + "learning_rate": 3.455576633598539e-06, + "loss": 0.0, + "step": 60037 + }, + { + "epoch": 5.602127461043202, + "grad_norm": NaN, + "learning_rate": 3.453962635942098e-06, + "loss": 0.0, + "step": 60038 + }, + { + "epoch": 5.60222077073808, + "grad_norm": NaN, + "learning_rate": 3.4523490109088224e-06, + "loss": 0.0, + "step": 60039 + }, + { + "epoch": 5.602314080432957, + "grad_norm": NaN, + "learning_rate": 3.4507357585028095e-06, + "loss": 0.0, + "step": 60040 + }, + { + "epoch": 5.602407390127834, + "grad_norm": NaN, + "learning_rate": 3.449122878728172e-06, + "loss": 0.0, + "step": 60041 + }, + { + "epoch": 5.602500699822712, + "grad_norm": NaN, + "learning_rate": 3.4475103715889908e-06, + "loss": 0.0, + "step": 60042 + }, + { + "epoch": 5.602594009517589, + "grad_norm": NaN, + "learning_rate": 3.4458982370893783e-06, + "loss": 0.0, + "step": 60043 + }, + { + "epoch": 5.602687319212466, + "grad_norm": NaN, + "learning_rate": 3.4442864752334486e-06, + "loss": 0.0, + "step": 60044 + }, + { + "epoch": 5.602780628907343, + "grad_norm": NaN, + "learning_rate": 3.442675086025265e-06, + "loss": 0.0, + "step": 60045 + }, + { + "epoch": 5.602873938602221, + "grad_norm": NaN, + "learning_rate": 3.4410640694689572e-06, + "loss": 0.0, + "step": 60046 + }, + { + "epoch": 5.602967248297098, + "grad_norm": NaN, + "learning_rate": 3.439453425568606e-06, + "loss": 0.0, + "step": 60047 + }, + { + "epoch": 5.603060557991975, + "grad_norm": NaN, + "learning_rate": 3.4378431543283235e-06, + "loss": 0.0, + "step": 60048 + }, + { + "epoch": 5.603153867686853, + "grad_norm": NaN, + "learning_rate": 3.4362332557521744e-06, + "loss": 0.0, + "step": 60049 + }, + { + "epoch": 5.60324717738173, + "grad_norm": NaN, + "learning_rate": 3.4346237298442713e-06, + "loss": 0.0, + "step": 60050 + }, + { + "epoch": 5.603340487076608, + "grad_norm": NaN, + "learning_rate": 3.4330145766087113e-06, + "loss": 0.0, + "step": 60051 + }, + { + "epoch": 5.603433796771484, + "grad_norm": NaN, + "learning_rate": 3.4314057960495744e-06, + "loss": 0.0, + "step": 60052 + }, + { + "epoch": 5.603527106466362, + "grad_norm": NaN, + "learning_rate": 3.429797388170957e-06, + "loss": 0.0, + "step": 60053 + }, + { + "epoch": 5.603620416161239, + "grad_norm": NaN, + "learning_rate": 3.4281893529769568e-06, + "loss": 0.0, + "step": 60054 + }, + { + "epoch": 5.6037137258561165, + "grad_norm": NaN, + "learning_rate": 3.426581690471636e-06, + "loss": 0.0, + "step": 60055 + }, + { + "epoch": 5.603807035550994, + "grad_norm": NaN, + "learning_rate": 3.4249744006591087e-06, + "loss": 0.0, + "step": 60056 + }, + { + "epoch": 5.603900345245871, + "grad_norm": NaN, + "learning_rate": 3.4233674835434542e-06, + "loss": 0.0, + "step": 60057 + }, + { + "epoch": 5.603993654940748, + "grad_norm": NaN, + "learning_rate": 3.4217609391287535e-06, + "loss": 0.0, + "step": 60058 + }, + { + "epoch": 5.604086964635625, + "grad_norm": NaN, + "learning_rate": 3.4201547674190867e-06, + "loss": 0.0, + "step": 60059 + }, + { + "epoch": 5.604180274330503, + "grad_norm": NaN, + "learning_rate": 3.418548968418566e-06, + "loss": 0.0, + "step": 60060 + }, + { + "epoch": 5.60427358402538, + "grad_norm": NaN, + "learning_rate": 3.4169435421312397e-06, + "loss": 0.0, + "step": 60061 + }, + { + "epoch": 5.6043668937202575, + "grad_norm": NaN, + "learning_rate": 3.4153384885612034e-06, + "loss": 0.0, + "step": 60062 + }, + { + "epoch": 5.604460203415135, + "grad_norm": NaN, + "learning_rate": 3.413733807712554e-06, + "loss": 0.0, + "step": 60063 + }, + { + "epoch": 5.604553513110012, + "grad_norm": NaN, + "learning_rate": 3.4121294995893387e-06, + "loss": 0.0, + "step": 60064 + }, + { + "epoch": 5.60464682280489, + "grad_norm": NaN, + "learning_rate": 3.4105255641956707e-06, + "loss": 0.0, + "step": 60065 + }, + { + "epoch": 5.604740132499767, + "grad_norm": NaN, + "learning_rate": 3.4089220015356134e-06, + "loss": 0.0, + "step": 60066 + }, + { + "epoch": 5.604833442194644, + "grad_norm": NaN, + "learning_rate": 3.40731881161323e-06, + "loss": 0.0, + "step": 60067 + }, + { + "epoch": 5.604926751889521, + "grad_norm": NaN, + "learning_rate": 3.4057159944326175e-06, + "loss": 0.0, + "step": 60068 + }, + { + "epoch": 5.605020061584399, + "grad_norm": NaN, + "learning_rate": 3.4041135499978557e-06, + "loss": 0.0, + "step": 60069 + }, + { + "epoch": 5.605113371279276, + "grad_norm": NaN, + "learning_rate": 3.4025114783129913e-06, + "loss": 0.0, + "step": 60070 + }, + { + "epoch": 5.605206680974153, + "grad_norm": NaN, + "learning_rate": 3.4009097793821215e-06, + "loss": 0.0, + "step": 60071 + }, + { + "epoch": 5.605299990669031, + "grad_norm": NaN, + "learning_rate": 3.3993084532093263e-06, + "loss": 0.0, + "step": 60072 + }, + { + "epoch": 5.605393300363907, + "grad_norm": NaN, + "learning_rate": 3.397707499798652e-06, + "loss": 0.0, + "step": 60073 + }, + { + "epoch": 5.605486610058785, + "grad_norm": NaN, + "learning_rate": 3.396106919154179e-06, + "loss": 0.0, + "step": 60074 + }, + { + "epoch": 5.605579919753662, + "grad_norm": NaN, + "learning_rate": 3.394506711279971e-06, + "loss": 0.0, + "step": 60075 + }, + { + "epoch": 5.60567322944854, + "grad_norm": NaN, + "learning_rate": 3.392906876180124e-06, + "loss": 0.0, + "step": 60076 + }, + { + "epoch": 5.605766539143417, + "grad_norm": NaN, + "learning_rate": 3.3913074138586684e-06, + "loss": 0.0, + "step": 60077 + }, + { + "epoch": 5.6058598488382945, + "grad_norm": NaN, + "learning_rate": 3.3897083243197008e-06, + "loss": 0.0, + "step": 60078 + }, + { + "epoch": 5.605953158533172, + "grad_norm": NaN, + "learning_rate": 3.3881096075672686e-06, + "loss": 0.0, + "step": 60079 + }, + { + "epoch": 5.606046468228049, + "grad_norm": NaN, + "learning_rate": 3.3865112636054515e-06, + "loss": 0.0, + "step": 60080 + }, + { + "epoch": 5.606139777922926, + "grad_norm": NaN, + "learning_rate": 3.384913292438296e-06, + "loss": 0.0, + "step": 60081 + }, + { + "epoch": 5.606233087617803, + "grad_norm": NaN, + "learning_rate": 3.3833156940698826e-06, + "loss": 0.0, + "step": 60082 + }, + { + "epoch": 5.606326397312681, + "grad_norm": NaN, + "learning_rate": 3.3817184685042576e-06, + "loss": 0.0, + "step": 60083 + }, + { + "epoch": 5.606419707007558, + "grad_norm": NaN, + "learning_rate": 3.3801216157455014e-06, + "loss": 0.0, + "step": 60084 + }, + { + "epoch": 5.6065130167024355, + "grad_norm": NaN, + "learning_rate": 3.3785251357976604e-06, + "loss": 0.0, + "step": 60085 + }, + { + "epoch": 5.606606326397313, + "grad_norm": NaN, + "learning_rate": 3.376929028664799e-06, + "loss": 0.0, + "step": 60086 + }, + { + "epoch": 5.60669963609219, + "grad_norm": NaN, + "learning_rate": 3.375333294350979e-06, + "loss": 0.0, + "step": 60087 + }, + { + "epoch": 5.606792945787067, + "grad_norm": NaN, + "learning_rate": 3.3737379328602487e-06, + "loss": 0.0, + "step": 60088 + }, + { + "epoch": 5.606886255481944, + "grad_norm": NaN, + "learning_rate": 3.372142944196654e-06, + "loss": 0.0, + "step": 60089 + }, + { + "epoch": 5.606979565176822, + "grad_norm": NaN, + "learning_rate": 3.370548328364292e-06, + "loss": 0.0, + "step": 60090 + }, + { + "epoch": 5.607072874871699, + "grad_norm": NaN, + "learning_rate": 3.368954085367176e-06, + "loss": 0.0, + "step": 60091 + }, + { + "epoch": 5.607166184566577, + "grad_norm": NaN, + "learning_rate": 3.367360215209369e-06, + "loss": 0.0, + "step": 60092 + }, + { + "epoch": 5.607259494261454, + "grad_norm": NaN, + "learning_rate": 3.3657667178949355e-06, + "loss": 0.0, + "step": 60093 + }, + { + "epoch": 5.607352803956331, + "grad_norm": NaN, + "learning_rate": 3.364173593427921e-06, + "loss": 0.0, + "step": 60094 + }, + { + "epoch": 5.607446113651209, + "grad_norm": NaN, + "learning_rate": 3.3625808418123733e-06, + "loss": 0.0, + "step": 60095 + }, + { + "epoch": 5.607539423346085, + "grad_norm": NaN, + "learning_rate": 3.3609884630523554e-06, + "loss": 0.0, + "step": 60096 + }, + { + "epoch": 5.607632733040963, + "grad_norm": NaN, + "learning_rate": 3.3593964571518973e-06, + "loss": 0.0, + "step": 60097 + }, + { + "epoch": 5.60772604273584, + "grad_norm": NaN, + "learning_rate": 3.3578048241150622e-06, + "loss": 0.0, + "step": 60098 + }, + { + "epoch": 5.607819352430718, + "grad_norm": NaN, + "learning_rate": 3.3562135639458808e-06, + "loss": 0.0, + "step": 60099 + }, + { + "epoch": 5.607912662125595, + "grad_norm": NaN, + "learning_rate": 3.354622676648416e-06, + "loss": 0.0, + "step": 60100 + }, + { + "epoch": 5.6080059718204724, + "grad_norm": NaN, + "learning_rate": 3.3530321622266985e-06, + "loss": 0.0, + "step": 60101 + }, + { + "epoch": 5.608099281515349, + "grad_norm": NaN, + "learning_rate": 3.3514420206847916e-06, + "loss": 0.0, + "step": 60102 + }, + { + "epoch": 5.608192591210226, + "grad_norm": NaN, + "learning_rate": 3.3498522520267246e-06, + "loss": 0.0, + "step": 60103 + }, + { + "epoch": 5.608285900905104, + "grad_norm": NaN, + "learning_rate": 3.3482628562565283e-06, + "loss": 0.0, + "step": 60104 + }, + { + "epoch": 5.608379210599981, + "grad_norm": NaN, + "learning_rate": 3.3466738333782827e-06, + "loss": 0.0, + "step": 60105 + }, + { + "epoch": 5.608472520294859, + "grad_norm": NaN, + "learning_rate": 3.3450851833959846e-06, + "loss": 0.0, + "step": 60106 + }, + { + "epoch": 5.608565829989736, + "grad_norm": NaN, + "learning_rate": 3.3434969063136972e-06, + "loss": 0.0, + "step": 60107 + }, + { + "epoch": 5.6086591396846135, + "grad_norm": NaN, + "learning_rate": 3.3419090021354676e-06, + "loss": 0.0, + "step": 60108 + }, + { + "epoch": 5.608752449379491, + "grad_norm": NaN, + "learning_rate": 3.340321470865309e-06, + "loss": 0.0, + "step": 60109 + }, + { + "epoch": 5.608845759074367, + "grad_norm": NaN, + "learning_rate": 3.3387343125072684e-06, + "loss": 0.0, + "step": 60110 + }, + { + "epoch": 5.608939068769245, + "grad_norm": NaN, + "learning_rate": 3.3371475270653756e-06, + "loss": 0.0, + "step": 60111 + }, + { + "epoch": 5.609032378464122, + "grad_norm": NaN, + "learning_rate": 3.335561114543678e-06, + "loss": 0.0, + "step": 60112 + }, + { + "epoch": 5.609125688159, + "grad_norm": NaN, + "learning_rate": 3.3339750749462046e-06, + "loss": 0.0, + "step": 60113 + }, + { + "epoch": 5.609218997853877, + "grad_norm": NaN, + "learning_rate": 3.3323894082769863e-06, + "loss": 0.0, + "step": 60114 + }, + { + "epoch": 5.6093123075487545, + "grad_norm": NaN, + "learning_rate": 3.3308041145400532e-06, + "loss": 0.0, + "step": 60115 + }, + { + "epoch": 5.609405617243632, + "grad_norm": NaN, + "learning_rate": 3.329219193739435e-06, + "loss": 0.0, + "step": 60116 + }, + { + "epoch": 5.6094989269385085, + "grad_norm": NaN, + "learning_rate": 3.327634645879179e-06, + "loss": 0.0, + "step": 60117 + }, + { + "epoch": 5.609592236633386, + "grad_norm": NaN, + "learning_rate": 3.3260504709632983e-06, + "loss": 0.0, + "step": 60118 + }, + { + "epoch": 5.609685546328263, + "grad_norm": NaN, + "learning_rate": 3.3244666689958066e-06, + "loss": 0.0, + "step": 60119 + }, + { + "epoch": 5.609778856023141, + "grad_norm": NaN, + "learning_rate": 3.322883239980767e-06, + "loss": 0.0, + "step": 60120 + }, + { + "epoch": 5.609872165718018, + "grad_norm": NaN, + "learning_rate": 3.3213001839221764e-06, + "loss": 0.0, + "step": 60121 + }, + { + "epoch": 5.609965475412896, + "grad_norm": NaN, + "learning_rate": 3.319717500824065e-06, + "loss": 0.0, + "step": 60122 + }, + { + "epoch": 5.610058785107773, + "grad_norm": NaN, + "learning_rate": 3.3181351906904797e-06, + "loss": 0.0, + "step": 60123 + }, + { + "epoch": 5.61015209480265, + "grad_norm": NaN, + "learning_rate": 3.316553253525417e-06, + "loss": 0.0, + "step": 60124 + }, + { + "epoch": 5.610245404497527, + "grad_norm": NaN, + "learning_rate": 3.3149716893328904e-06, + "loss": 0.0, + "step": 60125 + }, + { + "epoch": 5.610338714192404, + "grad_norm": NaN, + "learning_rate": 3.3133904981169633e-06, + "loss": 0.0, + "step": 60126 + }, + { + "epoch": 5.610432023887282, + "grad_norm": NaN, + "learning_rate": 3.311809679881633e-06, + "loss": 0.0, + "step": 60127 + }, + { + "epoch": 5.610525333582159, + "grad_norm": NaN, + "learning_rate": 3.3102292346308956e-06, + "loss": 0.0, + "step": 60128 + }, + { + "epoch": 5.610618643277037, + "grad_norm": NaN, + "learning_rate": 3.308649162368815e-06, + "loss": 0.0, + "step": 60129 + }, + { + "epoch": 5.610711952971914, + "grad_norm": NaN, + "learning_rate": 3.307069463099371e-06, + "loss": 0.0, + "step": 60130 + }, + { + "epoch": 5.610805262666791, + "grad_norm": NaN, + "learning_rate": 3.3054901368266107e-06, + "loss": 0.0, + "step": 60131 + }, + { + "epoch": 5.610898572361668, + "grad_norm": NaN, + "learning_rate": 3.3039111835545307e-06, + "loss": 0.0, + "step": 60132 + }, + { + "epoch": 5.610991882056545, + "grad_norm": NaN, + "learning_rate": 3.3023326032871445e-06, + "loss": 0.0, + "step": 60133 + }, + { + "epoch": 5.611085191751423, + "grad_norm": NaN, + "learning_rate": 3.3007543960284654e-06, + "loss": 0.0, + "step": 60134 + }, + { + "epoch": 5.6111785014463, + "grad_norm": NaN, + "learning_rate": 3.299176561782524e-06, + "loss": 0.0, + "step": 60135 + }, + { + "epoch": 5.611271811141178, + "grad_norm": NaN, + "learning_rate": 3.2975991005533163e-06, + "loss": 0.0, + "step": 60136 + }, + { + "epoch": 5.611365120836055, + "grad_norm": NaN, + "learning_rate": 3.2960220123448564e-06, + "loss": 0.0, + "step": 60137 + }, + { + "epoch": 5.6114584305309325, + "grad_norm": NaN, + "learning_rate": 3.294445297161158e-06, + "loss": 0.0, + "step": 60138 + }, + { + "epoch": 5.61155174022581, + "grad_norm": NaN, + "learning_rate": 3.2928689550062172e-06, + "loss": 0.0, + "step": 60139 + }, + { + "epoch": 5.6116450499206865, + "grad_norm": NaN, + "learning_rate": 3.291292985884081e-06, + "loss": 0.0, + "step": 60140 + }, + { + "epoch": 5.611738359615564, + "grad_norm": NaN, + "learning_rate": 3.289717389798713e-06, + "loss": 0.0, + "step": 60141 + }, + { + "epoch": 5.611831669310441, + "grad_norm": NaN, + "learning_rate": 3.28814216675411e-06, + "loss": 0.0, + "step": 60142 + }, + { + "epoch": 5.611924979005319, + "grad_norm": NaN, + "learning_rate": 3.2865673167543515e-06, + "loss": 0.0, + "step": 60143 + }, + { + "epoch": 5.612018288700196, + "grad_norm": NaN, + "learning_rate": 3.284992839803352e-06, + "loss": 0.0, + "step": 60144 + }, + { + "epoch": 5.612111598395074, + "grad_norm": NaN, + "learning_rate": 3.2834187359051577e-06, + "loss": 0.0, + "step": 60145 + }, + { + "epoch": 5.61220490808995, + "grad_norm": NaN, + "learning_rate": 3.281845005063799e-06, + "loss": 0.0, + "step": 60146 + }, + { + "epoch": 5.6122982177848275, + "grad_norm": NaN, + "learning_rate": 3.2802716472832223e-06, + "loss": 0.0, + "step": 60147 + }, + { + "epoch": 5.612391527479705, + "grad_norm": NaN, + "learning_rate": 3.2786986625674584e-06, + "loss": 0.0, + "step": 60148 + }, + { + "epoch": 5.612484837174582, + "grad_norm": NaN, + "learning_rate": 3.277126050920503e-06, + "loss": 0.0, + "step": 60149 + }, + { + "epoch": 5.61257814686946, + "grad_norm": NaN, + "learning_rate": 3.275553812346354e-06, + "loss": 0.0, + "step": 60150 + }, + { + "epoch": 5.612671456564337, + "grad_norm": NaN, + "learning_rate": 3.273981946849008e-06, + "loss": 0.0, + "step": 60151 + }, + { + "epoch": 5.612764766259215, + "grad_norm": NaN, + "learning_rate": 3.2724104544324614e-06, + "loss": 0.0, + "step": 60152 + }, + { + "epoch": 5.612858075954092, + "grad_norm": NaN, + "learning_rate": 3.2708393351007276e-06, + "loss": 0.0, + "step": 60153 + }, + { + "epoch": 5.612951385648969, + "grad_norm": NaN, + "learning_rate": 3.269268588857754e-06, + "loss": 0.0, + "step": 60154 + }, + { + "epoch": 5.613044695343846, + "grad_norm": NaN, + "learning_rate": 3.2676982157075872e-06, + "loss": 0.0, + "step": 60155 + }, + { + "epoch": 5.613138005038723, + "grad_norm": NaN, + "learning_rate": 3.26612821565419e-06, + "loss": 0.0, + "step": 60156 + }, + { + "epoch": 5.613231314733601, + "grad_norm": NaN, + "learning_rate": 3.264558588701577e-06, + "loss": 0.0, + "step": 60157 + }, + { + "epoch": 5.613324624428478, + "grad_norm": NaN, + "learning_rate": 3.2629893348537105e-06, + "loss": 0.0, + "step": 60158 + }, + { + "epoch": 5.613417934123356, + "grad_norm": NaN, + "learning_rate": 3.2614204541145884e-06, + "loss": 0.0, + "step": 60159 + }, + { + "epoch": 5.613511243818233, + "grad_norm": NaN, + "learning_rate": 3.2598519464882233e-06, + "loss": 0.0, + "step": 60160 + }, + { + "epoch": 5.61360455351311, + "grad_norm": NaN, + "learning_rate": 3.2582838119785793e-06, + "loss": 0.0, + "step": 60161 + }, + { + "epoch": 5.613697863207987, + "grad_norm": NaN, + "learning_rate": 3.2567160505896527e-06, + "loss": 0.0, + "step": 60162 + }, + { + "epoch": 5.6137911729028644, + "grad_norm": NaN, + "learning_rate": 3.2551486623254242e-06, + "loss": 0.0, + "step": 60163 + }, + { + "epoch": 5.613884482597742, + "grad_norm": NaN, + "learning_rate": 3.25358164718989e-06, + "loss": 0.0, + "step": 60164 + }, + { + "epoch": 5.613977792292619, + "grad_norm": NaN, + "learning_rate": 3.252015005187031e-06, + "loss": 0.0, + "step": 60165 + }, + { + "epoch": 5.614071101987497, + "grad_norm": NaN, + "learning_rate": 3.2504487363208264e-06, + "loss": 0.0, + "step": 60166 + }, + { + "epoch": 5.614164411682374, + "grad_norm": NaN, + "learning_rate": 3.248882840595257e-06, + "loss": 0.0, + "step": 60167 + }, + { + "epoch": 5.6142577213772515, + "grad_norm": NaN, + "learning_rate": 3.247317318014303e-06, + "loss": 0.0, + "step": 60168 + }, + { + "epoch": 5.614351031072128, + "grad_norm": NaN, + "learning_rate": 3.245752168581961e-06, + "loss": 0.0, + "step": 60169 + }, + { + "epoch": 5.6144443407670055, + "grad_norm": NaN, + "learning_rate": 3.2441873923021945e-06, + "loss": 0.0, + "step": 60170 + }, + { + "epoch": 5.614537650461883, + "grad_norm": NaN, + "learning_rate": 3.2426229891789835e-06, + "loss": 0.0, + "step": 60171 + }, + { + "epoch": 5.61463096015676, + "grad_norm": NaN, + "learning_rate": 3.241058959216308e-06, + "loss": 0.0, + "step": 60172 + }, + { + "epoch": 5.614724269851638, + "grad_norm": NaN, + "learning_rate": 3.239495302418149e-06, + "loss": 0.0, + "step": 60173 + }, + { + "epoch": 5.614817579546515, + "grad_norm": NaN, + "learning_rate": 3.2379320187884694e-06, + "loss": 0.0, + "step": 60174 + }, + { + "epoch": 5.614910889241392, + "grad_norm": NaN, + "learning_rate": 3.2363691083312824e-06, + "loss": 0.0, + "step": 60175 + }, + { + "epoch": 5.615004198936269, + "grad_norm": NaN, + "learning_rate": 3.234806571050519e-06, + "loss": 0.0, + "step": 60176 + }, + { + "epoch": 5.6150975086311465, + "grad_norm": NaN, + "learning_rate": 3.2332444069501585e-06, + "loss": 0.0, + "step": 60177 + }, + { + "epoch": 5.615190818326024, + "grad_norm": NaN, + "learning_rate": 3.231682616034198e-06, + "loss": 0.0, + "step": 60178 + }, + { + "epoch": 5.615284128020901, + "grad_norm": NaN, + "learning_rate": 3.2301211983065842e-06, + "loss": 0.0, + "step": 60179 + }, + { + "epoch": 5.615377437715779, + "grad_norm": NaN, + "learning_rate": 3.228560153771281e-06, + "loss": 0.0, + "step": 60180 + }, + { + "epoch": 5.615470747410656, + "grad_norm": NaN, + "learning_rate": 3.226999482432302e-06, + "loss": 0.0, + "step": 60181 + }, + { + "epoch": 5.615564057105534, + "grad_norm": NaN, + "learning_rate": 3.225439184293577e-06, + "loss": 0.0, + "step": 60182 + }, + { + "epoch": 5.615657366800411, + "grad_norm": NaN, + "learning_rate": 3.223879259359052e-06, + "loss": 0.0, + "step": 60183 + }, + { + "epoch": 5.615750676495288, + "grad_norm": NaN, + "learning_rate": 3.222319707632759e-06, + "loss": 0.0, + "step": 60184 + }, + { + "epoch": 5.615843986190165, + "grad_norm": NaN, + "learning_rate": 3.2207605291186265e-06, + "loss": 0.0, + "step": 60185 + }, + { + "epoch": 5.615937295885042, + "grad_norm": NaN, + "learning_rate": 3.2192017238206026e-06, + "loss": 0.0, + "step": 60186 + }, + { + "epoch": 5.61603060557992, + "grad_norm": NaN, + "learning_rate": 3.217643291742683e-06, + "loss": 0.0, + "step": 60187 + }, + { + "epoch": 5.616123915274797, + "grad_norm": NaN, + "learning_rate": 3.216085232888815e-06, + "loss": 0.0, + "step": 60188 + }, + { + "epoch": 5.616217224969675, + "grad_norm": NaN, + "learning_rate": 3.214527547262946e-06, + "loss": 0.0, + "step": 60189 + }, + { + "epoch": 5.616310534664551, + "grad_norm": NaN, + "learning_rate": 3.212970234869072e-06, + "loss": 0.0, + "step": 60190 + }, + { + "epoch": 5.616403844359429, + "grad_norm": NaN, + "learning_rate": 3.211413295711124e-06, + "loss": 0.0, + "step": 60191 + }, + { + "epoch": 5.616497154054306, + "grad_norm": NaN, + "learning_rate": 3.2098567297930643e-06, + "loss": 0.0, + "step": 60192 + }, + { + "epoch": 5.6165904637491835, + "grad_norm": NaN, + "learning_rate": 3.2083005371188743e-06, + "loss": 0.0, + "step": 60193 + }, + { + "epoch": 5.616683773444061, + "grad_norm": NaN, + "learning_rate": 3.2067447176924834e-06, + "loss": 0.0, + "step": 60194 + }, + { + "epoch": 5.616777083138938, + "grad_norm": NaN, + "learning_rate": 3.205189271517855e-06, + "loss": 0.0, + "step": 60195 + }, + { + "epoch": 5.616870392833816, + "grad_norm": NaN, + "learning_rate": 3.2036341985989534e-06, + "loss": 0.0, + "step": 60196 + }, + { + "epoch": 5.616963702528693, + "grad_norm": NaN, + "learning_rate": 3.202079498939725e-06, + "loss": 0.0, + "step": 60197 + }, + { + "epoch": 5.61705701222357, + "grad_norm": NaN, + "learning_rate": 3.200525172544116e-06, + "loss": 0.0, + "step": 60198 + }, + { + "epoch": 5.617150321918447, + "grad_norm": NaN, + "learning_rate": 3.198971219416091e-06, + "loss": 0.0, + "step": 60199 + }, + { + "epoch": 5.6172436316133245, + "grad_norm": NaN, + "learning_rate": 3.1974176395595964e-06, + "loss": 0.0, + "step": 60200 + }, + { + "epoch": 5.617336941308202, + "grad_norm": NaN, + "learning_rate": 3.1958644329785788e-06, + "loss": 0.0, + "step": 60201 + }, + { + "epoch": 5.617430251003079, + "grad_norm": NaN, + "learning_rate": 3.194311599676985e-06, + "loss": 0.0, + "step": 60202 + }, + { + "epoch": 5.617523560697957, + "grad_norm": NaN, + "learning_rate": 3.1927591396587793e-06, + "loss": 0.0, + "step": 60203 + }, + { + "epoch": 5.617616870392834, + "grad_norm": NaN, + "learning_rate": 3.191207052927908e-06, + "loss": 0.0, + "step": 60204 + }, + { + "epoch": 5.617710180087711, + "grad_norm": NaN, + "learning_rate": 3.1896553394883005e-06, + "loss": 0.0, + "step": 60205 + }, + { + "epoch": 5.617803489782588, + "grad_norm": NaN, + "learning_rate": 3.188103999343905e-06, + "loss": 0.0, + "step": 60206 + }, + { + "epoch": 5.617896799477466, + "grad_norm": NaN, + "learning_rate": 3.1865530324986843e-06, + "loss": 0.0, + "step": 60207 + }, + { + "epoch": 5.617990109172343, + "grad_norm": NaN, + "learning_rate": 3.185002438956552e-06, + "loss": 0.0, + "step": 60208 + }, + { + "epoch": 5.61808341886722, + "grad_norm": NaN, + "learning_rate": 3.1834522187214885e-06, + "loss": 0.0, + "step": 60209 + }, + { + "epoch": 5.618176728562098, + "grad_norm": NaN, + "learning_rate": 3.1819023717974068e-06, + "loss": 0.0, + "step": 60210 + }, + { + "epoch": 5.618270038256975, + "grad_norm": NaN, + "learning_rate": 3.180352898188254e-06, + "loss": 0.0, + "step": 60211 + }, + { + "epoch": 5.618363347951853, + "grad_norm": NaN, + "learning_rate": 3.178803797897961e-06, + "loss": 0.0, + "step": 60212 + }, + { + "epoch": 5.618456657646729, + "grad_norm": NaN, + "learning_rate": 3.1772550709305067e-06, + "loss": 0.0, + "step": 60213 + }, + { + "epoch": 5.618549967341607, + "grad_norm": NaN, + "learning_rate": 3.1757067172897887e-06, + "loss": 0.0, + "step": 60214 + }, + { + "epoch": 5.618643277036484, + "grad_norm": NaN, + "learning_rate": 3.174158736979737e-06, + "loss": 0.0, + "step": 60215 + }, + { + "epoch": 5.6187365867313614, + "grad_norm": NaN, + "learning_rate": 3.172611130004349e-06, + "loss": 0.0, + "step": 60216 + }, + { + "epoch": 5.618829896426239, + "grad_norm": NaN, + "learning_rate": 3.1710638963674873e-06, + "loss": 0.0, + "step": 60217 + }, + { + "epoch": 5.618923206121116, + "grad_norm": NaN, + "learning_rate": 3.1695170360730992e-06, + "loss": 0.0, + "step": 60218 + }, + { + "epoch": 5.619016515815993, + "grad_norm": NaN, + "learning_rate": 3.1679705491251815e-06, + "loss": 0.0, + "step": 60219 + }, + { + "epoch": 5.61910982551087, + "grad_norm": NaN, + "learning_rate": 3.166424435527598e-06, + "loss": 0.0, + "step": 60220 + }, + { + "epoch": 5.619203135205748, + "grad_norm": NaN, + "learning_rate": 3.1648786952842785e-06, + "loss": 0.0, + "step": 60221 + }, + { + "epoch": 5.619296444900625, + "grad_norm": NaN, + "learning_rate": 3.1633333283992037e-06, + "loss": 0.0, + "step": 60222 + }, + { + "epoch": 5.6193897545955025, + "grad_norm": NaN, + "learning_rate": 3.1617883348762698e-06, + "loss": 0.0, + "step": 60223 + }, + { + "epoch": 5.61948306429038, + "grad_norm": NaN, + "learning_rate": 3.160243714719407e-06, + "loss": 0.0, + "step": 60224 + }, + { + "epoch": 5.619576373985257, + "grad_norm": NaN, + "learning_rate": 3.1586994679325627e-06, + "loss": 0.0, + "step": 60225 + }, + { + "epoch": 5.619669683680135, + "grad_norm": NaN, + "learning_rate": 3.1571555945196336e-06, + "loss": 0.0, + "step": 60226 + }, + { + "epoch": 5.619762993375011, + "grad_norm": NaN, + "learning_rate": 3.15561209448455e-06, + "loss": 0.0, + "step": 60227 + }, + { + "epoch": 5.619856303069889, + "grad_norm": NaN, + "learning_rate": 3.1540689678312745e-06, + "loss": 0.0, + "step": 60228 + }, + { + "epoch": 5.619949612764766, + "grad_norm": NaN, + "learning_rate": 3.152526214563689e-06, + "loss": 0.0, + "step": 60229 + }, + { + "epoch": 5.6200429224596435, + "grad_norm": NaN, + "learning_rate": 3.1509838346857054e-06, + "loss": 0.0, + "step": 60230 + }, + { + "epoch": 5.620136232154521, + "grad_norm": NaN, + "learning_rate": 3.149441828201321e-06, + "loss": 0.0, + "step": 60231 + }, + { + "epoch": 5.620229541849398, + "grad_norm": NaN, + "learning_rate": 3.1479001951143667e-06, + "loss": 0.0, + "step": 60232 + }, + { + "epoch": 5.620322851544276, + "grad_norm": NaN, + "learning_rate": 3.1463589354288056e-06, + "loss": 0.0, + "step": 60233 + }, + { + "epoch": 5.620416161239152, + "grad_norm": NaN, + "learning_rate": 3.1448180491485675e-06, + "loss": 0.0, + "step": 60234 + }, + { + "epoch": 5.62050947093403, + "grad_norm": NaN, + "learning_rate": 3.1432775362775497e-06, + "loss": 0.0, + "step": 60235 + }, + { + "epoch": 5.620602780628907, + "grad_norm": NaN, + "learning_rate": 3.1417373968196493e-06, + "loss": 0.0, + "step": 60236 + }, + { + "epoch": 5.620696090323785, + "grad_norm": NaN, + "learning_rate": 3.1401976307788456e-06, + "loss": 0.0, + "step": 60237 + }, + { + "epoch": 5.620789400018662, + "grad_norm": NaN, + "learning_rate": 3.138658238159003e-06, + "loss": 0.0, + "step": 60238 + }, + { + "epoch": 5.620882709713539, + "grad_norm": NaN, + "learning_rate": 3.137119218964035e-06, + "loss": 0.0, + "step": 60239 + }, + { + "epoch": 5.620976019408417, + "grad_norm": NaN, + "learning_rate": 3.1355805731979044e-06, + "loss": 0.0, + "step": 60240 + }, + { + "epoch": 5.621069329103294, + "grad_norm": NaN, + "learning_rate": 3.1340423008644586e-06, + "loss": 0.0, + "step": 60241 + }, + { + "epoch": 5.621162638798171, + "grad_norm": NaN, + "learning_rate": 3.1325044019676448e-06, + "loss": 0.0, + "step": 60242 + }, + { + "epoch": 5.621255948493048, + "grad_norm": NaN, + "learning_rate": 3.130966876511376e-06, + "loss": 0.0, + "step": 60243 + }, + { + "epoch": 5.621349258187926, + "grad_norm": NaN, + "learning_rate": 3.1294297244995325e-06, + "loss": 0.0, + "step": 60244 + }, + { + "epoch": 5.621442567882803, + "grad_norm": NaN, + "learning_rate": 3.1278929459360615e-06, + "loss": 0.0, + "step": 60245 + }, + { + "epoch": 5.6215358775776805, + "grad_norm": NaN, + "learning_rate": 3.1263565408248425e-06, + "loss": 0.0, + "step": 60246 + }, + { + "epoch": 5.621629187272558, + "grad_norm": NaN, + "learning_rate": 3.1248205091697898e-06, + "loss": 0.0, + "step": 60247 + }, + { + "epoch": 5.621722496967434, + "grad_norm": NaN, + "learning_rate": 3.1232848509748165e-06, + "loss": 0.0, + "step": 60248 + }, + { + "epoch": 5.621815806662312, + "grad_norm": NaN, + "learning_rate": 3.12174956624382e-06, + "loss": 0.0, + "step": 60249 + }, + { + "epoch": 5.621909116357189, + "grad_norm": NaN, + "learning_rate": 3.1202146549806962e-06, + "loss": 0.0, + "step": 60250 + }, + { + "epoch": 5.622002426052067, + "grad_norm": NaN, + "learning_rate": 3.118680117189376e-06, + "loss": 0.0, + "step": 60251 + }, + { + "epoch": 5.622095735746944, + "grad_norm": NaN, + "learning_rate": 3.1171459528737395e-06, + "loss": 0.0, + "step": 60252 + }, + { + "epoch": 5.6221890454418215, + "grad_norm": NaN, + "learning_rate": 3.1156121620376673e-06, + "loss": 0.0, + "step": 60253 + }, + { + "epoch": 5.622282355136699, + "grad_norm": NaN, + "learning_rate": 3.1140787446851056e-06, + "loss": 0.0, + "step": 60254 + }, + { + "epoch": 5.622375664831576, + "grad_norm": NaN, + "learning_rate": 3.112545700819935e-06, + "loss": 0.0, + "step": 60255 + }, + { + "epoch": 5.622468974526454, + "grad_norm": NaN, + "learning_rate": 3.111013030446019e-06, + "loss": 0.0, + "step": 60256 + }, + { + "epoch": 5.62256228422133, + "grad_norm": NaN, + "learning_rate": 3.109480733567321e-06, + "loss": 0.0, + "step": 60257 + }, + { + "epoch": 5.622655593916208, + "grad_norm": NaN, + "learning_rate": 3.1079488101876883e-06, + "loss": 0.0, + "step": 60258 + }, + { + "epoch": 5.622748903611085, + "grad_norm": NaN, + "learning_rate": 3.1064172603110173e-06, + "loss": 0.0, + "step": 60259 + }, + { + "epoch": 5.622842213305963, + "grad_norm": NaN, + "learning_rate": 3.104886083941238e-06, + "loss": 0.0, + "step": 60260 + }, + { + "epoch": 5.62293552300084, + "grad_norm": NaN, + "learning_rate": 3.103355281082215e-06, + "loss": 0.0, + "step": 60261 + }, + { + "epoch": 5.623028832695717, + "grad_norm": NaN, + "learning_rate": 3.1018248517378107e-06, + "loss": 0.0, + "step": 60262 + }, + { + "epoch": 5.623122142390594, + "grad_norm": NaN, + "learning_rate": 3.100294795912006e-06, + "loss": 0.0, + "step": 60263 + }, + { + "epoch": 5.623215452085471, + "grad_norm": NaN, + "learning_rate": 3.0987651136085977e-06, + "loss": 0.0, + "step": 60264 + }, + { + "epoch": 5.623308761780349, + "grad_norm": NaN, + "learning_rate": 3.097235804831516e-06, + "loss": 0.0, + "step": 60265 + }, + { + "epoch": 5.623402071475226, + "grad_norm": NaN, + "learning_rate": 3.0957068695846743e-06, + "loss": 0.0, + "step": 60266 + }, + { + "epoch": 5.623495381170104, + "grad_norm": NaN, + "learning_rate": 3.094178307871903e-06, + "loss": 0.0, + "step": 60267 + }, + { + "epoch": 5.623588690864981, + "grad_norm": NaN, + "learning_rate": 3.092650119697132e-06, + "loss": 0.0, + "step": 60268 + }, + { + "epoch": 5.6236820005598585, + "grad_norm": NaN, + "learning_rate": 3.091122305064242e-06, + "loss": 0.0, + "step": 60269 + }, + { + "epoch": 5.623775310254736, + "grad_norm": NaN, + "learning_rate": 3.0895948639771128e-06, + "loss": 0.0, + "step": 60270 + }, + { + "epoch": 5.623868619949612, + "grad_norm": NaN, + "learning_rate": 3.0880677964395917e-06, + "loss": 0.0, + "step": 60271 + }, + { + "epoch": 5.62396192964449, + "grad_norm": NaN, + "learning_rate": 3.0865411024556417e-06, + "loss": 0.0, + "step": 60272 + }, + { + "epoch": 5.624055239339367, + "grad_norm": NaN, + "learning_rate": 3.085014782029077e-06, + "loss": 0.0, + "step": 60273 + }, + { + "epoch": 5.624148549034245, + "grad_norm": NaN, + "learning_rate": 3.083488835163794e-06, + "loss": 0.0, + "step": 60274 + }, + { + "epoch": 5.624241858729122, + "grad_norm": NaN, + "learning_rate": 3.08196326186369e-06, + "loss": 0.0, + "step": 60275 + }, + { + "epoch": 5.6243351684239995, + "grad_norm": NaN, + "learning_rate": 3.0804380621326284e-06, + "loss": 0.0, + "step": 60276 + }, + { + "epoch": 5.624428478118877, + "grad_norm": NaN, + "learning_rate": 3.0789132359744895e-06, + "loss": 0.0, + "step": 60277 + }, + { + "epoch": 5.6245217878137534, + "grad_norm": NaN, + "learning_rate": 3.07738878339317e-06, + "loss": 0.0, + "step": 60278 + }, + { + "epoch": 5.624615097508631, + "grad_norm": NaN, + "learning_rate": 3.0758647043925167e-06, + "loss": 0.0, + "step": 60279 + }, + { + "epoch": 5.624708407203508, + "grad_norm": NaN, + "learning_rate": 3.0743409989763933e-06, + "loss": 0.0, + "step": 60280 + }, + { + "epoch": 5.624801716898386, + "grad_norm": NaN, + "learning_rate": 3.0728176671487304e-06, + "loss": 0.0, + "step": 60281 + }, + { + "epoch": 5.624895026593263, + "grad_norm": NaN, + "learning_rate": 3.0712947089133577e-06, + "loss": 0.0, + "step": 60282 + }, + { + "epoch": 5.6249883362881405, + "grad_norm": NaN, + "learning_rate": 3.0697721242741557e-06, + "loss": 0.0, + "step": 60283 + }, + { + "epoch": 5.625081645983018, + "grad_norm": NaN, + "learning_rate": 3.0682499132350214e-06, + "loss": 0.0, + "step": 60284 + }, + { + "epoch": 5.625174955677895, + "grad_norm": NaN, + "learning_rate": 3.0667280757997847e-06, + "loss": 0.0, + "step": 60285 + }, + { + "epoch": 5.625268265372772, + "grad_norm": NaN, + "learning_rate": 3.0652066119723263e-06, + "loss": 0.0, + "step": 60286 + }, + { + "epoch": 5.625361575067649, + "grad_norm": NaN, + "learning_rate": 3.0636855217565426e-06, + "loss": 0.0, + "step": 60287 + }, + { + "epoch": 5.625454884762527, + "grad_norm": NaN, + "learning_rate": 3.0621648051562473e-06, + "loss": 0.0, + "step": 60288 + }, + { + "epoch": 5.625548194457404, + "grad_norm": NaN, + "learning_rate": 3.060644462175388e-06, + "loss": 0.0, + "step": 60289 + }, + { + "epoch": 5.625641504152282, + "grad_norm": NaN, + "learning_rate": 3.059124492817744e-06, + "loss": 0.0, + "step": 60290 + }, + { + "epoch": 5.625734813847159, + "grad_norm": NaN, + "learning_rate": 3.057604897087229e-06, + "loss": 0.0, + "step": 60291 + }, + { + "epoch": 5.6258281235420355, + "grad_norm": NaN, + "learning_rate": 3.056085674987707e-06, + "loss": 0.0, + "step": 60292 + }, + { + "epoch": 5.625921433236913, + "grad_norm": NaN, + "learning_rate": 3.054566826523025e-06, + "loss": 0.0, + "step": 60293 + }, + { + "epoch": 5.62601474293179, + "grad_norm": NaN, + "learning_rate": 3.0530483516970463e-06, + "loss": 0.0, + "step": 60294 + }, + { + "epoch": 5.626108052626668, + "grad_norm": NaN, + "learning_rate": 3.0515302505136507e-06, + "loss": 0.0, + "step": 60295 + }, + { + "epoch": 5.626201362321545, + "grad_norm": NaN, + "learning_rate": 3.0500125229766857e-06, + "loss": 0.0, + "step": 60296 + }, + { + "epoch": 5.626294672016423, + "grad_norm": NaN, + "learning_rate": 3.048495169089982e-06, + "loss": 0.0, + "step": 60297 + }, + { + "epoch": 5.6263879817113, + "grad_norm": NaN, + "learning_rate": 3.0469781888574517e-06, + "loss": 0.0, + "step": 60298 + }, + { + "epoch": 5.6264812914061775, + "grad_norm": NaN, + "learning_rate": 3.04546158228291e-06, + "loss": 0.0, + "step": 60299 + }, + { + "epoch": 5.626574601101055, + "grad_norm": NaN, + "learning_rate": 3.0439453493702193e-06, + "loss": 0.0, + "step": 60300 + }, + { + "epoch": 5.626667910795931, + "grad_norm": NaN, + "learning_rate": 3.042429490123277e-06, + "loss": 0.0, + "step": 60301 + }, + { + "epoch": 5.626761220490809, + "grad_norm": NaN, + "learning_rate": 3.04091400454588e-06, + "loss": 0.0, + "step": 60302 + }, + { + "epoch": 5.626854530185686, + "grad_norm": NaN, + "learning_rate": 3.039398892641909e-06, + "loss": 0.0, + "step": 60303 + }, + { + "epoch": 5.626947839880564, + "grad_norm": NaN, + "learning_rate": 3.0378841544152266e-06, + "loss": 0.0, + "step": 60304 + }, + { + "epoch": 5.627041149575441, + "grad_norm": NaN, + "learning_rate": 3.036369789869664e-06, + "loss": 0.0, + "step": 60305 + }, + { + "epoch": 5.6271344592703185, + "grad_norm": NaN, + "learning_rate": 3.0348557990090673e-06, + "loss": 0.0, + "step": 60306 + }, + { + "epoch": 5.627227768965195, + "grad_norm": NaN, + "learning_rate": 3.0333421818373173e-06, + "loss": 0.0, + "step": 60307 + }, + { + "epoch": 5.6273210786600725, + "grad_norm": NaN, + "learning_rate": 3.0318289383582273e-06, + "loss": 0.0, + "step": 60308 + }, + { + "epoch": 5.62741438835495, + "grad_norm": NaN, + "learning_rate": 3.0303160685756612e-06, + "loss": 0.0, + "step": 60309 + }, + { + "epoch": 5.627507698049827, + "grad_norm": NaN, + "learning_rate": 3.028803572493482e-06, + "loss": 0.0, + "step": 60310 + }, + { + "epoch": 5.627601007744705, + "grad_norm": NaN, + "learning_rate": 3.0272914501155043e-06, + "loss": 0.0, + "step": 60311 + }, + { + "epoch": 5.627694317439582, + "grad_norm": NaN, + "learning_rate": 3.025779701445591e-06, + "loss": 0.0, + "step": 60312 + }, + { + "epoch": 5.62778762713446, + "grad_norm": NaN, + "learning_rate": 3.0242683264875887e-06, + "loss": 0.0, + "step": 60313 + }, + { + "epoch": 5.627880936829337, + "grad_norm": NaN, + "learning_rate": 3.0227573252453285e-06, + "loss": 0.0, + "step": 60314 + }, + { + "epoch": 5.6279742465242135, + "grad_norm": NaN, + "learning_rate": 3.021246697722657e-06, + "loss": 0.0, + "step": 60315 + }, + { + "epoch": 5.628067556219091, + "grad_norm": NaN, + "learning_rate": 3.0197364439234373e-06, + "loss": 0.0, + "step": 60316 + }, + { + "epoch": 5.628160865913968, + "grad_norm": NaN, + "learning_rate": 3.018226563851467e-06, + "loss": 0.0, + "step": 60317 + }, + { + "epoch": 5.628254175608846, + "grad_norm": NaN, + "learning_rate": 3.0167170575105925e-06, + "loss": 0.0, + "step": 60318 + }, + { + "epoch": 5.628347485303723, + "grad_norm": NaN, + "learning_rate": 3.015207924904711e-06, + "loss": 0.0, + "step": 60319 + }, + { + "epoch": 5.628440794998601, + "grad_norm": NaN, + "learning_rate": 3.0136991660375864e-06, + "loss": 0.0, + "step": 60320 + }, + { + "epoch": 5.628534104693478, + "grad_norm": NaN, + "learning_rate": 3.0121907809130818e-06, + "loss": 0.0, + "step": 60321 + }, + { + "epoch": 5.628627414388355, + "grad_norm": NaN, + "learning_rate": 3.0106827695350443e-06, + "loss": 0.0, + "step": 60322 + }, + { + "epoch": 5.628720724083232, + "grad_norm": NaN, + "learning_rate": 3.0091751319073044e-06, + "loss": 0.0, + "step": 60323 + }, + { + "epoch": 5.628814033778109, + "grad_norm": NaN, + "learning_rate": 3.0076678680336754e-06, + "loss": 0.0, + "step": 60324 + }, + { + "epoch": 5.628907343472987, + "grad_norm": NaN, + "learning_rate": 3.0061609779180206e-06, + "loss": 0.0, + "step": 60325 + }, + { + "epoch": 5.629000653167864, + "grad_norm": NaN, + "learning_rate": 3.0046544615641376e-06, + "loss": 0.0, + "step": 60326 + }, + { + "epoch": 5.629093962862742, + "grad_norm": NaN, + "learning_rate": 3.0031483189758898e-06, + "loss": 0.0, + "step": 60327 + }, + { + "epoch": 5.629187272557619, + "grad_norm": NaN, + "learning_rate": 3.0016425501570905e-06, + "loss": 0.0, + "step": 60328 + }, + { + "epoch": 5.6292805822524965, + "grad_norm": NaN, + "learning_rate": 3.000137155111554e-06, + "loss": 0.0, + "step": 60329 + }, + { + "epoch": 5.629373891947373, + "grad_norm": NaN, + "learning_rate": 2.9986321338431594e-06, + "loss": 0.0, + "step": 60330 + }, + { + "epoch": 5.6294672016422505, + "grad_norm": NaN, + "learning_rate": 2.9971274863556717e-06, + "loss": 0.0, + "step": 60331 + }, + { + "epoch": 5.629560511337128, + "grad_norm": NaN, + "learning_rate": 2.9956232126529367e-06, + "loss": 0.0, + "step": 60332 + }, + { + "epoch": 5.629653821032005, + "grad_norm": NaN, + "learning_rate": 2.994119312738802e-06, + "loss": 0.0, + "step": 60333 + }, + { + "epoch": 5.629747130726883, + "grad_norm": NaN, + "learning_rate": 2.9926157866170807e-06, + "loss": 0.0, + "step": 60334 + }, + { + "epoch": 5.62984044042176, + "grad_norm": NaN, + "learning_rate": 2.9911126342915537e-06, + "loss": 0.0, + "step": 60335 + }, + { + "epoch": 5.629933750116637, + "grad_norm": NaN, + "learning_rate": 2.989609855766134e-06, + "loss": 0.0, + "step": 60336 + }, + { + "epoch": 5.630027059811514, + "grad_norm": NaN, + "learning_rate": 2.9881074510445525e-06, + "loss": 0.0, + "step": 60337 + }, + { + "epoch": 5.6301203695063915, + "grad_norm": NaN, + "learning_rate": 2.9866054201306723e-06, + "loss": 0.0, + "step": 60338 + }, + { + "epoch": 5.630213679201269, + "grad_norm": NaN, + "learning_rate": 2.9851037630283072e-06, + "loss": 0.0, + "step": 60339 + }, + { + "epoch": 5.630306988896146, + "grad_norm": NaN, + "learning_rate": 2.983602479741287e-06, + "loss": 0.0, + "step": 60340 + }, + { + "epoch": 5.630400298591024, + "grad_norm": NaN, + "learning_rate": 2.982101570273393e-06, + "loss": 0.0, + "step": 60341 + }, + { + "epoch": 5.630493608285901, + "grad_norm": NaN, + "learning_rate": 2.980601034628488e-06, + "loss": 0.0, + "step": 60342 + }, + { + "epoch": 5.630586917980779, + "grad_norm": NaN, + "learning_rate": 2.979100872810353e-06, + "loss": 0.0, + "step": 60343 + }, + { + "epoch": 5.630680227675655, + "grad_norm": NaN, + "learning_rate": 2.9776010848228004e-06, + "loss": 0.0, + "step": 60344 + }, + { + "epoch": 5.6307735373705325, + "grad_norm": NaN, + "learning_rate": 2.9761016706696783e-06, + "loss": 0.0, + "step": 60345 + }, + { + "epoch": 5.63086684706541, + "grad_norm": NaN, + "learning_rate": 2.9746026303547665e-06, + "loss": 0.0, + "step": 60346 + }, + { + "epoch": 5.630960156760287, + "grad_norm": NaN, + "learning_rate": 2.9731039638818787e-06, + "loss": 0.0, + "step": 60347 + }, + { + "epoch": 5.631053466455165, + "grad_norm": NaN, + "learning_rate": 2.971605671254862e-06, + "loss": 0.0, + "step": 60348 + }, + { + "epoch": 5.631146776150042, + "grad_norm": NaN, + "learning_rate": 2.9701077524774796e-06, + "loss": 0.0, + "step": 60349 + }, + { + "epoch": 5.63124008584492, + "grad_norm": NaN, + "learning_rate": 2.9686102075535624e-06, + "loss": 0.0, + "step": 60350 + }, + { + "epoch": 5.631333395539796, + "grad_norm": NaN, + "learning_rate": 2.9671130364869235e-06, + "loss": 0.0, + "step": 60351 + }, + { + "epoch": 5.631426705234674, + "grad_norm": NaN, + "learning_rate": 2.96561623928136e-06, + "loss": 0.0, + "step": 60352 + }, + { + "epoch": 5.631520014929551, + "grad_norm": NaN, + "learning_rate": 2.9641198159406522e-06, + "loss": 0.0, + "step": 60353 + }, + { + "epoch": 5.631613324624428, + "grad_norm": NaN, + "learning_rate": 2.96262376646868e-06, + "loss": 0.0, + "step": 60354 + }, + { + "epoch": 5.631706634319306, + "grad_norm": NaN, + "learning_rate": 2.9611280908691748e-06, + "loss": 0.0, + "step": 60355 + }, + { + "epoch": 5.631799944014183, + "grad_norm": NaN, + "learning_rate": 2.959632789145949e-06, + "loss": 0.0, + "step": 60356 + }, + { + "epoch": 5.631893253709061, + "grad_norm": NaN, + "learning_rate": 2.9581378613028504e-06, + "loss": 0.0, + "step": 60357 + }, + { + "epoch": 5.631986563403938, + "grad_norm": NaN, + "learning_rate": 2.9566433073436424e-06, + "loss": 0.0, + "step": 60358 + }, + { + "epoch": 5.632079873098815, + "grad_norm": NaN, + "learning_rate": 2.955149127272105e-06, + "loss": 0.0, + "step": 60359 + }, + { + "epoch": 5.632173182793692, + "grad_norm": NaN, + "learning_rate": 2.953655321092102e-06, + "loss": 0.0, + "step": 60360 + }, + { + "epoch": 5.6322664924885695, + "grad_norm": NaN, + "learning_rate": 2.9521618888073805e-06, + "loss": 0.0, + "step": 60361 + }, + { + "epoch": 5.632359802183447, + "grad_norm": NaN, + "learning_rate": 2.9506688304217375e-06, + "loss": 0.0, + "step": 60362 + }, + { + "epoch": 5.632453111878324, + "grad_norm": NaN, + "learning_rate": 2.9491761459390197e-06, + "loss": 0.0, + "step": 60363 + }, + { + "epoch": 5.632546421573202, + "grad_norm": NaN, + "learning_rate": 2.947683835362957e-06, + "loss": 0.0, + "step": 60364 + }, + { + "epoch": 5.632639731268078, + "grad_norm": NaN, + "learning_rate": 2.9461918986973808e-06, + "loss": 0.0, + "step": 60365 + }, + { + "epoch": 5.632733040962956, + "grad_norm": NaN, + "learning_rate": 2.9447003359460875e-06, + "loss": 0.0, + "step": 60366 + }, + { + "epoch": 5.632826350657833, + "grad_norm": NaN, + "learning_rate": 2.9432091471128573e-06, + "loss": 0.0, + "step": 60367 + }, + { + "epoch": 5.6329196603527105, + "grad_norm": NaN, + "learning_rate": 2.9417183322014703e-06, + "loss": 0.0, + "step": 60368 + }, + { + "epoch": 5.633012970047588, + "grad_norm": NaN, + "learning_rate": 2.9402278912157574e-06, + "loss": 0.0, + "step": 60369 + }, + { + "epoch": 5.633106279742465, + "grad_norm": NaN, + "learning_rate": 2.938737824159465e-06, + "loss": 0.0, + "step": 60370 + }, + { + "epoch": 5.633199589437343, + "grad_norm": NaN, + "learning_rate": 2.9372481310363905e-06, + "loss": 0.0, + "step": 60371 + }, + { + "epoch": 5.63329289913222, + "grad_norm": NaN, + "learning_rate": 2.935758811850364e-06, + "loss": 0.0, + "step": 60372 + }, + { + "epoch": 5.633386208827098, + "grad_norm": NaN, + "learning_rate": 2.934269866605099e-06, + "loss": 0.0, + "step": 60373 + }, + { + "epoch": 5.633479518521974, + "grad_norm": NaN, + "learning_rate": 2.932781295304443e-06, + "loss": 0.0, + "step": 60374 + }, + { + "epoch": 5.633572828216852, + "grad_norm": NaN, + "learning_rate": 2.931293097952175e-06, + "loss": 0.0, + "step": 60375 + }, + { + "epoch": 5.633666137911729, + "grad_norm": NaN, + "learning_rate": 2.929805274552044e-06, + "loss": 0.0, + "step": 60376 + }, + { + "epoch": 5.633759447606606, + "grad_norm": NaN, + "learning_rate": 2.928317825107862e-06, + "loss": 0.0, + "step": 60377 + }, + { + "epoch": 5.633852757301484, + "grad_norm": NaN, + "learning_rate": 2.9268307496233935e-06, + "loss": 0.0, + "step": 60378 + }, + { + "epoch": 5.633946066996361, + "grad_norm": NaN, + "learning_rate": 2.925344048102418e-06, + "loss": 0.0, + "step": 60379 + }, + { + "epoch": 5.634039376691238, + "grad_norm": NaN, + "learning_rate": 2.9238577205487503e-06, + "loss": 0.0, + "step": 60380 + }, + { + "epoch": 5.634132686386115, + "grad_norm": NaN, + "learning_rate": 2.922371766966136e-06, + "loss": 0.0, + "step": 60381 + }, + { + "epoch": 5.634225996080993, + "grad_norm": NaN, + "learning_rate": 2.92088618735834e-06, + "loss": 0.0, + "step": 60382 + }, + { + "epoch": 5.63431930577587, + "grad_norm": NaN, + "learning_rate": 2.9194009817292086e-06, + "loss": 0.0, + "step": 60383 + }, + { + "epoch": 5.6344126154707475, + "grad_norm": NaN, + "learning_rate": 2.9179161500824387e-06, + "loss": 0.0, + "step": 60384 + }, + { + "epoch": 5.634505925165625, + "grad_norm": NaN, + "learning_rate": 2.916431692421828e-06, + "loss": 0.0, + "step": 60385 + }, + { + "epoch": 5.634599234860502, + "grad_norm": NaN, + "learning_rate": 2.914947608751189e-06, + "loss": 0.0, + "step": 60386 + }, + { + "epoch": 5.63469254455538, + "grad_norm": NaN, + "learning_rate": 2.9134638990742697e-06, + "loss": 0.0, + "step": 60387 + }, + { + "epoch": 5.634785854250256, + "grad_norm": NaN, + "learning_rate": 2.9119805633948166e-06, + "loss": 0.0, + "step": 60388 + }, + { + "epoch": 5.634879163945134, + "grad_norm": NaN, + "learning_rate": 2.91049760171666e-06, + "loss": 0.0, + "step": 60389 + }, + { + "epoch": 5.634972473640011, + "grad_norm": NaN, + "learning_rate": 2.9090150140435143e-06, + "loss": 0.0, + "step": 60390 + }, + { + "epoch": 5.6350657833348885, + "grad_norm": NaN, + "learning_rate": 2.9075328003791587e-06, + "loss": 0.0, + "step": 60391 + }, + { + "epoch": 5.635159093029766, + "grad_norm": NaN, + "learning_rate": 2.9060509607274073e-06, + "loss": 0.0, + "step": 60392 + }, + { + "epoch": 5.635252402724643, + "grad_norm": NaN, + "learning_rate": 2.904569495091974e-06, + "loss": 0.0, + "step": 60393 + }, + { + "epoch": 5.635345712419521, + "grad_norm": NaN, + "learning_rate": 2.9030884034766387e-06, + "loss": 0.0, + "step": 60394 + }, + { + "epoch": 5.635439022114397, + "grad_norm": NaN, + "learning_rate": 2.9016076858852156e-06, + "loss": 0.0, + "step": 60395 + }, + { + "epoch": 5.635532331809275, + "grad_norm": NaN, + "learning_rate": 2.900127342321401e-06, + "loss": 0.0, + "step": 60396 + }, + { + "epoch": 5.635625641504152, + "grad_norm": NaN, + "learning_rate": 2.898647372788976e-06, + "loss": 0.0, + "step": 60397 + }, + { + "epoch": 5.6357189511990295, + "grad_norm": NaN, + "learning_rate": 2.8971677772917536e-06, + "loss": 0.0, + "step": 60398 + }, + { + "epoch": 5.635812260893907, + "grad_norm": NaN, + "learning_rate": 2.895688555833431e-06, + "loss": 0.0, + "step": 60399 + }, + { + "epoch": 5.635905570588784, + "grad_norm": NaN, + "learning_rate": 2.894209708417805e-06, + "loss": 0.0, + "step": 60400 + }, + { + "epoch": 5.635998880283662, + "grad_norm": NaN, + "learning_rate": 2.8927312350486398e-06, + "loss": 0.0, + "step": 60401 + }, + { + "epoch": 5.636092189978539, + "grad_norm": NaN, + "learning_rate": 2.8912531357296487e-06, + "loss": 0.0, + "step": 60402 + }, + { + "epoch": 5.636185499673416, + "grad_norm": NaN, + "learning_rate": 2.8897754104646452e-06, + "loss": 0.0, + "step": 60403 + }, + { + "epoch": 5.636278809368293, + "grad_norm": NaN, + "learning_rate": 2.8882980592573767e-06, + "loss": 0.0, + "step": 60404 + }, + { + "epoch": 5.636372119063171, + "grad_norm": NaN, + "learning_rate": 2.8868210821115734e-06, + "loss": 0.0, + "step": 60405 + }, + { + "epoch": 5.636465428758048, + "grad_norm": NaN, + "learning_rate": 2.885344479031015e-06, + "loss": 0.0, + "step": 60406 + }, + { + "epoch": 5.636558738452925, + "grad_norm": NaN, + "learning_rate": 2.88386825001945e-06, + "loss": 0.0, + "step": 60407 + }, + { + "epoch": 5.636652048147803, + "grad_norm": NaN, + "learning_rate": 2.8823923950806072e-06, + "loss": 0.0, + "step": 60408 + }, + { + "epoch": 5.636745357842679, + "grad_norm": NaN, + "learning_rate": 2.8809169142182677e-06, + "loss": 0.0, + "step": 60409 + }, + { + "epoch": 5.636838667537557, + "grad_norm": NaN, + "learning_rate": 2.8794418074362114e-06, + "loss": 0.0, + "step": 60410 + }, + { + "epoch": 5.636931977232434, + "grad_norm": NaN, + "learning_rate": 2.8779670747381024e-06, + "loss": 0.0, + "step": 60411 + }, + { + "epoch": 5.637025286927312, + "grad_norm": NaN, + "learning_rate": 2.876492716127771e-06, + "loss": 0.0, + "step": 60412 + }, + { + "epoch": 5.637118596622189, + "grad_norm": NaN, + "learning_rate": 2.8750187316089467e-06, + "loss": 0.0, + "step": 60413 + }, + { + "epoch": 5.6372119063170665, + "grad_norm": NaN, + "learning_rate": 2.8735451211853446e-06, + "loss": 0.0, + "step": 60414 + }, + { + "epoch": 5.637305216011944, + "grad_norm": NaN, + "learning_rate": 2.872071884860744e-06, + "loss": 0.0, + "step": 60415 + }, + { + "epoch": 5.637398525706821, + "grad_norm": NaN, + "learning_rate": 2.870599022638892e-06, + "loss": 0.0, + "step": 60416 + }, + { + "epoch": 5.637491835401698, + "grad_norm": NaN, + "learning_rate": 2.8691265345235025e-06, + "loss": 0.0, + "step": 60417 + }, + { + "epoch": 5.637585145096575, + "grad_norm": NaN, + "learning_rate": 2.867654420518356e-06, + "loss": 0.0, + "step": 60418 + }, + { + "epoch": 5.637678454791453, + "grad_norm": NaN, + "learning_rate": 2.866182680627199e-06, + "loss": 0.0, + "step": 60419 + }, + { + "epoch": 5.63777176448633, + "grad_norm": NaN, + "learning_rate": 2.8647113148537127e-06, + "loss": 0.0, + "step": 60420 + }, + { + "epoch": 5.6378650741812075, + "grad_norm": NaN, + "learning_rate": 2.8632403232017097e-06, + "loss": 0.0, + "step": 60421 + }, + { + "epoch": 5.637958383876085, + "grad_norm": NaN, + "learning_rate": 2.8617697056749044e-06, + "loss": 0.0, + "step": 60422 + }, + { + "epoch": 5.638051693570962, + "grad_norm": NaN, + "learning_rate": 2.86029946227701e-06, + "loss": 0.0, + "step": 60423 + }, + { + "epoch": 5.638145003265839, + "grad_norm": NaN, + "learning_rate": 2.8588295930118078e-06, + "loss": 0.0, + "step": 60424 + }, + { + "epoch": 5.638238312960716, + "grad_norm": NaN, + "learning_rate": 2.85736009788301e-06, + "loss": 0.0, + "step": 60425 + }, + { + "epoch": 5.638331622655594, + "grad_norm": NaN, + "learning_rate": 2.855890976894348e-06, + "loss": 0.0, + "step": 60426 + }, + { + "epoch": 5.638424932350471, + "grad_norm": NaN, + "learning_rate": 2.8544222300495854e-06, + "loss": 0.0, + "step": 60427 + }, + { + "epoch": 5.638518242045349, + "grad_norm": NaN, + "learning_rate": 2.852953857352436e-06, + "loss": 0.0, + "step": 60428 + }, + { + "epoch": 5.638611551740226, + "grad_norm": NaN, + "learning_rate": 2.8514858588066124e-06, + "loss": 0.0, + "step": 60429 + }, + { + "epoch": 5.638704861435103, + "grad_norm": NaN, + "learning_rate": 2.8500182344159128e-06, + "loss": 0.0, + "step": 60430 + }, + { + "epoch": 5.638798171129981, + "grad_norm": NaN, + "learning_rate": 2.8485509841840003e-06, + "loss": 0.0, + "step": 60431 + }, + { + "epoch": 5.638891480824857, + "grad_norm": NaN, + "learning_rate": 2.8470841081146224e-06, + "loss": 0.0, + "step": 60432 + }, + { + "epoch": 5.638984790519735, + "grad_norm": NaN, + "learning_rate": 2.845617606211559e-06, + "loss": 0.0, + "step": 60433 + }, + { + "epoch": 5.639078100214612, + "grad_norm": NaN, + "learning_rate": 2.8441514784784734e-06, + "loss": 0.0, + "step": 60434 + }, + { + "epoch": 5.63917140990949, + "grad_norm": NaN, + "learning_rate": 2.8426857249191135e-06, + "loss": 0.0, + "step": 60435 + }, + { + "epoch": 5.639264719604367, + "grad_norm": NaN, + "learning_rate": 2.8412203455372264e-06, + "loss": 0.0, + "step": 60436 + }, + { + "epoch": 5.6393580292992445, + "grad_norm": NaN, + "learning_rate": 2.839755340336508e-06, + "loss": 0.0, + "step": 60437 + }, + { + "epoch": 5.639451338994122, + "grad_norm": NaN, + "learning_rate": 2.838290709320723e-06, + "loss": 0.0, + "step": 60438 + }, + { + "epoch": 5.639544648688998, + "grad_norm": NaN, + "learning_rate": 2.836826452493568e-06, + "loss": 0.0, + "step": 60439 + }, + { + "epoch": 5.639637958383876, + "grad_norm": NaN, + "learning_rate": 2.8353625698587568e-06, + "loss": 0.0, + "step": 60440 + }, + { + "epoch": 5.639731268078753, + "grad_norm": NaN, + "learning_rate": 2.8338990614200195e-06, + "loss": 0.0, + "step": 60441 + }, + { + "epoch": 5.639824577773631, + "grad_norm": NaN, + "learning_rate": 2.8324359271811204e-06, + "loss": 0.0, + "step": 60442 + }, + { + "epoch": 5.639917887468508, + "grad_norm": NaN, + "learning_rate": 2.830973167145706e-06, + "loss": 0.0, + "step": 60443 + }, + { + "epoch": 5.6400111971633855, + "grad_norm": NaN, + "learning_rate": 2.8295107813175397e-06, + "loss": 0.0, + "step": 60444 + }, + { + "epoch": 5.640104506858263, + "grad_norm": NaN, + "learning_rate": 2.8280487697003527e-06, + "loss": 0.0, + "step": 60445 + }, + { + "epoch": 5.64019781655314, + "grad_norm": NaN, + "learning_rate": 2.8265871322978084e-06, + "loss": 0.0, + "step": 60446 + }, + { + "epoch": 5.640291126248017, + "grad_norm": NaN, + "learning_rate": 2.82512586911367e-06, + "loss": 0.0, + "step": 60447 + }, + { + "epoch": 5.640384435942894, + "grad_norm": NaN, + "learning_rate": 2.8236649801516686e-06, + "loss": 0.0, + "step": 60448 + }, + { + "epoch": 5.640477745637772, + "grad_norm": NaN, + "learning_rate": 2.822204465415451e-06, + "loss": 0.0, + "step": 60449 + }, + { + "epoch": 5.640571055332649, + "grad_norm": NaN, + "learning_rate": 2.82074432490878e-06, + "loss": 0.0, + "step": 60450 + }, + { + "epoch": 5.6406643650275266, + "grad_norm": NaN, + "learning_rate": 2.819284558635371e-06, + "loss": 0.0, + "step": 60451 + }, + { + "epoch": 5.640757674722404, + "grad_norm": NaN, + "learning_rate": 2.817825166598919e-06, + "loss": 0.0, + "step": 60452 + }, + { + "epoch": 5.6408509844172805, + "grad_norm": NaN, + "learning_rate": 2.816366148803123e-06, + "loss": 0.0, + "step": 60453 + }, + { + "epoch": 5.640944294112158, + "grad_norm": NaN, + "learning_rate": 2.8149075052517458e-06, + "loss": 0.0, + "step": 60454 + }, + { + "epoch": 5.641037603807035, + "grad_norm": NaN, + "learning_rate": 2.8134492359484173e-06, + "loss": 0.0, + "step": 60455 + }, + { + "epoch": 5.641130913501913, + "grad_norm": NaN, + "learning_rate": 2.811991340896902e-06, + "loss": 0.0, + "step": 60456 + }, + { + "epoch": 5.64122422319679, + "grad_norm": NaN, + "learning_rate": 2.8105338201009133e-06, + "loss": 0.0, + "step": 60457 + }, + { + "epoch": 5.641317532891668, + "grad_norm": NaN, + "learning_rate": 2.8090766735641145e-06, + "loss": 0.0, + "step": 60458 + }, + { + "epoch": 5.641410842586545, + "grad_norm": NaN, + "learning_rate": 2.807619901290237e-06, + "loss": 0.0, + "step": 60459 + }, + { + "epoch": 5.641504152281422, + "grad_norm": NaN, + "learning_rate": 2.806163503282993e-06, + "loss": 0.0, + "step": 60460 + }, + { + "epoch": 5.641597461976299, + "grad_norm": NaN, + "learning_rate": 2.804707479546048e-06, + "loss": 0.0, + "step": 60461 + }, + { + "epoch": 5.641690771671176, + "grad_norm": NaN, + "learning_rate": 2.8032518300831307e-06, + "loss": 0.0, + "step": 60462 + }, + { + "epoch": 5.641784081366054, + "grad_norm": NaN, + "learning_rate": 2.8017965548979727e-06, + "loss": 0.0, + "step": 60463 + }, + { + "epoch": 5.641877391060931, + "grad_norm": NaN, + "learning_rate": 2.8003416539942036e-06, + "loss": 0.0, + "step": 60464 + }, + { + "epoch": 5.641970700755809, + "grad_norm": NaN, + "learning_rate": 2.798887127375604e-06, + "loss": 0.0, + "step": 60465 + }, + { + "epoch": 5.642064010450686, + "grad_norm": NaN, + "learning_rate": 2.7974329750458045e-06, + "loss": 0.0, + "step": 60466 + }, + { + "epoch": 5.6421573201455635, + "grad_norm": NaN, + "learning_rate": 2.795979197008519e-06, + "loss": 0.0, + "step": 60467 + }, + { + "epoch": 5.64225062984044, + "grad_norm": NaN, + "learning_rate": 2.794525793267477e-06, + "loss": 0.0, + "step": 60468 + }, + { + "epoch": 5.642343939535317, + "grad_norm": NaN, + "learning_rate": 2.793072763826343e-06, + "loss": 0.0, + "step": 60469 + }, + { + "epoch": 5.642437249230195, + "grad_norm": NaN, + "learning_rate": 2.791620108688797e-06, + "loss": 0.0, + "step": 60470 + }, + { + "epoch": 5.642530558925072, + "grad_norm": NaN, + "learning_rate": 2.790167827858586e-06, + "loss": 0.0, + "step": 60471 + }, + { + "epoch": 5.64262386861995, + "grad_norm": NaN, + "learning_rate": 2.788715921339357e-06, + "loss": 0.0, + "step": 60472 + }, + { + "epoch": 5.642717178314827, + "grad_norm": NaN, + "learning_rate": 2.7872643891348077e-06, + "loss": 0.0, + "step": 60473 + }, + { + "epoch": 5.6428104880097045, + "grad_norm": NaN, + "learning_rate": 2.785813231248668e-06, + "loss": 0.0, + "step": 60474 + }, + { + "epoch": 5.642903797704582, + "grad_norm": NaN, + "learning_rate": 2.7843624476845515e-06, + "loss": 0.0, + "step": 60475 + }, + { + "epoch": 5.6429971073994585, + "grad_norm": NaN, + "learning_rate": 2.7829120384462224e-06, + "loss": 0.0, + "step": 60476 + }, + { + "epoch": 5.643090417094336, + "grad_norm": NaN, + "learning_rate": 2.7814620035373436e-06, + "loss": 0.0, + "step": 60477 + }, + { + "epoch": 5.643183726789213, + "grad_norm": NaN, + "learning_rate": 2.7800123429615795e-06, + "loss": 0.0, + "step": 60478 + }, + { + "epoch": 5.643277036484091, + "grad_norm": NaN, + "learning_rate": 2.7785630567226434e-06, + "loss": 0.0, + "step": 60479 + }, + { + "epoch": 5.643370346178968, + "grad_norm": NaN, + "learning_rate": 2.777114144824233e-06, + "loss": 0.0, + "step": 60480 + }, + { + "epoch": 5.643463655873846, + "grad_norm": NaN, + "learning_rate": 2.775665607269961e-06, + "loss": 0.0, + "step": 60481 + }, + { + "epoch": 5.643556965568722, + "grad_norm": NaN, + "learning_rate": 2.774217444063592e-06, + "loss": 0.0, + "step": 60482 + }, + { + "epoch": 5.6436502752635995, + "grad_norm": NaN, + "learning_rate": 2.7727696552087895e-06, + "loss": 0.0, + "step": 60483 + }, + { + "epoch": 5.643743584958477, + "grad_norm": NaN, + "learning_rate": 2.771322240709184e-06, + "loss": 0.0, + "step": 60484 + }, + { + "epoch": 5.643836894653354, + "grad_norm": NaN, + "learning_rate": 2.769875200568522e-06, + "loss": 0.0, + "step": 60485 + }, + { + "epoch": 5.643930204348232, + "grad_norm": NaN, + "learning_rate": 2.7684285347904677e-06, + "loss": 0.0, + "step": 60486 + }, + { + "epoch": 5.644023514043109, + "grad_norm": NaN, + "learning_rate": 2.7669822433786517e-06, + "loss": 0.0, + "step": 60487 + }, + { + "epoch": 5.644116823737987, + "grad_norm": NaN, + "learning_rate": 2.765536326336787e-06, + "loss": 0.0, + "step": 60488 + }, + { + "epoch": 5.644210133432864, + "grad_norm": NaN, + "learning_rate": 2.7640907836685877e-06, + "loss": 0.0, + "step": 60489 + }, + { + "epoch": 5.6443034431277415, + "grad_norm": NaN, + "learning_rate": 2.762645615377651e-06, + "loss": 0.0, + "step": 60490 + }, + { + "epoch": 5.644396752822618, + "grad_norm": NaN, + "learning_rate": 2.7612008214677073e-06, + "loss": 0.0, + "step": 60491 + }, + { + "epoch": 5.644490062517495, + "grad_norm": NaN, + "learning_rate": 2.7597564019424202e-06, + "loss": 0.0, + "step": 60492 + }, + { + "epoch": 5.644583372212373, + "grad_norm": NaN, + "learning_rate": 2.7583123568054367e-06, + "loss": 0.0, + "step": 60493 + }, + { + "epoch": 5.64467668190725, + "grad_norm": NaN, + "learning_rate": 2.7568686860604705e-06, + "loss": 0.0, + "step": 60494 + }, + { + "epoch": 5.644769991602128, + "grad_norm": NaN, + "learning_rate": 2.755425389711169e-06, + "loss": 0.0, + "step": 60495 + }, + { + "epoch": 5.644863301297005, + "grad_norm": NaN, + "learning_rate": 2.7539824677611953e-06, + "loss": 0.0, + "step": 60496 + }, + { + "epoch": 5.644956610991882, + "grad_norm": NaN, + "learning_rate": 2.7525399202142306e-06, + "loss": 0.0, + "step": 60497 + }, + { + "epoch": 5.645049920686759, + "grad_norm": NaN, + "learning_rate": 2.751097747073938e-06, + "loss": 0.0, + "step": 60498 + }, + { + "epoch": 5.6451432303816365, + "grad_norm": NaN, + "learning_rate": 2.7496559483439818e-06, + "loss": 0.0, + "step": 60499 + }, + { + "epoch": 5.645236540076514, + "grad_norm": NaN, + "learning_rate": 2.7482145240280253e-06, + "loss": 0.0, + "step": 60500 + }, + { + "epoch": 5.645329849771391, + "grad_norm": NaN, + "learning_rate": 2.7467734741297653e-06, + "loss": 0.0, + "step": 60501 + }, + { + "epoch": 5.645423159466269, + "grad_norm": NaN, + "learning_rate": 2.7453327986528163e-06, + "loss": 0.0, + "step": 60502 + }, + { + "epoch": 5.645516469161146, + "grad_norm": NaN, + "learning_rate": 2.743892497600875e-06, + "loss": 0.0, + "step": 60503 + }, + { + "epoch": 5.6456097788560236, + "grad_norm": NaN, + "learning_rate": 2.7424525709776045e-06, + "loss": 0.0, + "step": 60504 + }, + { + "epoch": 5.6457030885509, + "grad_norm": NaN, + "learning_rate": 2.741013018786636e-06, + "loss": 0.0, + "step": 60505 + }, + { + "epoch": 5.6457963982457775, + "grad_norm": NaN, + "learning_rate": 2.7395738410316505e-06, + "loss": 0.0, + "step": 60506 + }, + { + "epoch": 5.645889707940655, + "grad_norm": NaN, + "learning_rate": 2.738135037716327e-06, + "loss": 0.0, + "step": 60507 + }, + { + "epoch": 5.645983017635532, + "grad_norm": NaN, + "learning_rate": 2.73669660884428e-06, + "loss": 0.0, + "step": 60508 + }, + { + "epoch": 5.64607632733041, + "grad_norm": NaN, + "learning_rate": 2.7352585544192064e-06, + "loss": 0.0, + "step": 60509 + }, + { + "epoch": 5.646169637025287, + "grad_norm": NaN, + "learning_rate": 2.733820874444753e-06, + "loss": 0.0, + "step": 60510 + }, + { + "epoch": 5.646262946720165, + "grad_norm": NaN, + "learning_rate": 2.732383568924551e-06, + "loss": 0.0, + "step": 60511 + }, + { + "epoch": 5.646356256415041, + "grad_norm": NaN, + "learning_rate": 2.730946637862297e-06, + "loss": 0.0, + "step": 60512 + }, + { + "epoch": 5.6464495661099185, + "grad_norm": NaN, + "learning_rate": 2.7295100812616043e-06, + "loss": 0.0, + "step": 60513 + }, + { + "epoch": 5.646542875804796, + "grad_norm": NaN, + "learning_rate": 2.7280738991261377e-06, + "loss": 0.0, + "step": 60514 + }, + { + "epoch": 5.646636185499673, + "grad_norm": NaN, + "learning_rate": 2.7266380914595764e-06, + "loss": 0.0, + "step": 60515 + }, + { + "epoch": 5.646729495194551, + "grad_norm": NaN, + "learning_rate": 2.725202658265535e-06, + "loss": 0.0, + "step": 60516 + }, + { + "epoch": 5.646822804889428, + "grad_norm": NaN, + "learning_rate": 2.723767599547677e-06, + "loss": 0.0, + "step": 60517 + }, + { + "epoch": 5.646916114584306, + "grad_norm": NaN, + "learning_rate": 2.722332915309666e-06, + "loss": 0.0, + "step": 60518 + }, + { + "epoch": 5.647009424279183, + "grad_norm": NaN, + "learning_rate": 2.7208986055550996e-06, + "loss": 0.0, + "step": 60519 + }, + { + "epoch": 5.64710273397406, + "grad_norm": NaN, + "learning_rate": 2.719464670287691e-06, + "loss": 0.0, + "step": 60520 + }, + { + "epoch": 5.647196043668937, + "grad_norm": NaN, + "learning_rate": 2.718031109511054e-06, + "loss": 0.0, + "step": 60521 + }, + { + "epoch": 5.647289353363814, + "grad_norm": NaN, + "learning_rate": 2.7165979232288193e-06, + "loss": 0.0, + "step": 60522 + }, + { + "epoch": 5.647382663058692, + "grad_norm": NaN, + "learning_rate": 2.715165111444667e-06, + "loss": 0.0, + "step": 60523 + }, + { + "epoch": 5.647475972753569, + "grad_norm": NaN, + "learning_rate": 2.713732674162211e-06, + "loss": 0.0, + "step": 60524 + }, + { + "epoch": 5.647569282448447, + "grad_norm": NaN, + "learning_rate": 2.7123006113850985e-06, + "loss": 0.0, + "step": 60525 + }, + { + "epoch": 5.647662592143323, + "grad_norm": NaN, + "learning_rate": 2.7108689231169767e-06, + "loss": 0.0, + "step": 60526 + }, + { + "epoch": 5.647755901838201, + "grad_norm": NaN, + "learning_rate": 2.709437609361509e-06, + "loss": 0.0, + "step": 60527 + }, + { + "epoch": 5.647849211533078, + "grad_norm": NaN, + "learning_rate": 2.708006670122276e-06, + "loss": 0.0, + "step": 60528 + }, + { + "epoch": 5.6479425212279555, + "grad_norm": NaN, + "learning_rate": 2.7065761054029743e-06, + "loss": 0.0, + "step": 60529 + }, + { + "epoch": 5.648035830922833, + "grad_norm": NaN, + "learning_rate": 2.7051459152072185e-06, + "loss": 0.0, + "step": 60530 + }, + { + "epoch": 5.64812914061771, + "grad_norm": NaN, + "learning_rate": 2.703716099538622e-06, + "loss": 0.0, + "step": 60531 + }, + { + "epoch": 5.648222450312588, + "grad_norm": NaN, + "learning_rate": 2.702286658400865e-06, + "loss": 0.0, + "step": 60532 + }, + { + "epoch": 5.648315760007465, + "grad_norm": NaN, + "learning_rate": 2.700857591797562e-06, + "loss": 0.0, + "step": 60533 + }, + { + "epoch": 5.648409069702342, + "grad_norm": NaN, + "learning_rate": 2.6994288997323255e-06, + "loss": 0.0, + "step": 60534 + }, + { + "epoch": 5.648502379397219, + "grad_norm": NaN, + "learning_rate": 2.6980005822088203e-06, + "loss": 0.0, + "step": 60535 + }, + { + "epoch": 5.6485956890920965, + "grad_norm": NaN, + "learning_rate": 2.6965726392306764e-06, + "loss": 0.0, + "step": 60536 + }, + { + "epoch": 5.648688998786974, + "grad_norm": NaN, + "learning_rate": 2.6951450708014745e-06, + "loss": 0.0, + "step": 60537 + }, + { + "epoch": 5.648782308481851, + "grad_norm": NaN, + "learning_rate": 2.6937178769249115e-06, + "loss": 0.0, + "step": 60538 + }, + { + "epoch": 5.648875618176729, + "grad_norm": NaN, + "learning_rate": 2.6922910576046008e-06, + "loss": 0.0, + "step": 60539 + }, + { + "epoch": 5.648968927871606, + "grad_norm": NaN, + "learning_rate": 2.6908646128441402e-06, + "loss": 0.0, + "step": 60540 + }, + { + "epoch": 5.649062237566483, + "grad_norm": NaN, + "learning_rate": 2.6894385426471767e-06, + "loss": 0.0, + "step": 60541 + }, + { + "epoch": 5.64915554726136, + "grad_norm": NaN, + "learning_rate": 2.68801284701734e-06, + "loss": 0.0, + "step": 60542 + }, + { + "epoch": 5.649248856956238, + "grad_norm": NaN, + "learning_rate": 2.6865875259582448e-06, + "loss": 0.0, + "step": 60543 + }, + { + "epoch": 5.649342166651115, + "grad_norm": NaN, + "learning_rate": 2.685162579473521e-06, + "loss": 0.0, + "step": 60544 + }, + { + "epoch": 5.649435476345992, + "grad_norm": NaN, + "learning_rate": 2.683738007566799e-06, + "loss": 0.0, + "step": 60545 + }, + { + "epoch": 5.64952878604087, + "grad_norm": NaN, + "learning_rate": 2.6823138102416763e-06, + "loss": 0.0, + "step": 60546 + }, + { + "epoch": 5.649622095735747, + "grad_norm": NaN, + "learning_rate": 2.6808899875018e-06, + "loss": 0.0, + "step": 60547 + }, + { + "epoch": 5.649715405430625, + "grad_norm": NaN, + "learning_rate": 2.6794665393507832e-06, + "loss": 0.0, + "step": 60548 + }, + { + "epoch": 5.649808715125501, + "grad_norm": NaN, + "learning_rate": 2.67804346579224e-06, + "loss": 0.0, + "step": 60549 + }, + { + "epoch": 5.649902024820379, + "grad_norm": NaN, + "learning_rate": 2.6766207668297845e-06, + "loss": 0.0, + "step": 60550 + }, + { + "epoch": 5.649995334515256, + "grad_norm": NaN, + "learning_rate": 2.6751984424670636e-06, + "loss": 0.0, + "step": 60551 + }, + { + "epoch": 5.6500886442101335, + "grad_norm": NaN, + "learning_rate": 2.673776492707658e-06, + "loss": 0.0, + "step": 60552 + }, + { + "epoch": 5.650181953905011, + "grad_norm": NaN, + "learning_rate": 2.6723549175551973e-06, + "loss": 0.0, + "step": 60553 + }, + { + "epoch": 5.650275263599888, + "grad_norm": NaN, + "learning_rate": 2.6709337170132962e-06, + "loss": 0.0, + "step": 60554 + }, + { + "epoch": 5.650368573294765, + "grad_norm": NaN, + "learning_rate": 2.669512891085568e-06, + "loss": 0.0, + "step": 60555 + }, + { + "epoch": 5.650461882989642, + "grad_norm": NaN, + "learning_rate": 2.668092439775643e-06, + "loss": 0.0, + "step": 60556 + }, + { + "epoch": 5.65055519268452, + "grad_norm": NaN, + "learning_rate": 2.6666723630870856e-06, + "loss": 0.0, + "step": 60557 + }, + { + "epoch": 5.650648502379397, + "grad_norm": NaN, + "learning_rate": 2.665252661023559e-06, + "loss": 0.0, + "step": 60558 + }, + { + "epoch": 5.6507418120742745, + "grad_norm": NaN, + "learning_rate": 2.6638333335886607e-06, + "loss": 0.0, + "step": 60559 + }, + { + "epoch": 5.650835121769152, + "grad_norm": NaN, + "learning_rate": 2.6624143807859544e-06, + "loss": 0.0, + "step": 60560 + }, + { + "epoch": 5.650928431464029, + "grad_norm": NaN, + "learning_rate": 2.6609958026191034e-06, + "loss": 0.0, + "step": 60561 + }, + { + "epoch": 5.651021741158907, + "grad_norm": NaN, + "learning_rate": 2.659577599091706e-06, + "loss": 0.0, + "step": 60562 + }, + { + "epoch": 5.651115050853784, + "grad_norm": NaN, + "learning_rate": 2.6581597702073407e-06, + "loss": 0.0, + "step": 60563 + }, + { + "epoch": 5.651208360548661, + "grad_norm": NaN, + "learning_rate": 2.65674231596964e-06, + "loss": 0.0, + "step": 60564 + }, + { + "epoch": 5.651301670243538, + "grad_norm": NaN, + "learning_rate": 2.6553252363821998e-06, + "loss": 0.0, + "step": 60565 + }, + { + "epoch": 5.6513949799384156, + "grad_norm": NaN, + "learning_rate": 2.6539085314486008e-06, + "loss": 0.0, + "step": 60566 + }, + { + "epoch": 5.651488289633293, + "grad_norm": NaN, + "learning_rate": 2.6524922011724737e-06, + "loss": 0.0, + "step": 60567 + }, + { + "epoch": 5.65158159932817, + "grad_norm": NaN, + "learning_rate": 2.651076245557432e-06, + "loss": 0.0, + "step": 60568 + }, + { + "epoch": 5.651674909023048, + "grad_norm": NaN, + "learning_rate": 2.649660664607023e-06, + "loss": 0.0, + "step": 60569 + }, + { + "epoch": 5.651768218717924, + "grad_norm": NaN, + "learning_rate": 2.6482454583248937e-06, + "loss": 0.0, + "step": 60570 + }, + { + "epoch": 5.651861528412802, + "grad_norm": NaN, + "learning_rate": 2.646830626714641e-06, + "loss": 0.0, + "step": 60571 + }, + { + "epoch": 5.651954838107679, + "grad_norm": NaN, + "learning_rate": 2.645416169779829e-06, + "loss": 0.0, + "step": 60572 + }, + { + "epoch": 5.652048147802557, + "grad_norm": NaN, + "learning_rate": 2.6440020875240886e-06, + "loss": 0.0, + "step": 60573 + }, + { + "epoch": 5.652141457497434, + "grad_norm": NaN, + "learning_rate": 2.6425883799509997e-06, + "loss": 0.0, + "step": 60574 + }, + { + "epoch": 5.652234767192311, + "grad_norm": NaN, + "learning_rate": 2.6411750470641424e-06, + "loss": 0.0, + "step": 60575 + }, + { + "epoch": 5.652328076887189, + "grad_norm": NaN, + "learning_rate": 2.639762088867131e-06, + "loss": 0.0, + "step": 60576 + }, + { + "epoch": 5.652421386582066, + "grad_norm": NaN, + "learning_rate": 2.6383495053635793e-06, + "loss": 0.0, + "step": 60577 + }, + { + "epoch": 5.652514696276943, + "grad_norm": NaN, + "learning_rate": 2.636937296557018e-06, + "loss": 0.0, + "step": 60578 + }, + { + "epoch": 5.65260800597182, + "grad_norm": NaN, + "learning_rate": 2.635525462451077e-06, + "loss": 0.0, + "step": 60579 + }, + { + "epoch": 5.652701315666698, + "grad_norm": NaN, + "learning_rate": 2.63411400304937e-06, + "loss": 0.0, + "step": 60580 + }, + { + "epoch": 5.652794625361575, + "grad_norm": NaN, + "learning_rate": 2.632702918355428e-06, + "loss": 0.0, + "step": 60581 + }, + { + "epoch": 5.6528879350564525, + "grad_norm": NaN, + "learning_rate": 2.6312922083728815e-06, + "loss": 0.0, + "step": 60582 + }, + { + "epoch": 5.65298124475133, + "grad_norm": NaN, + "learning_rate": 2.6298818731053106e-06, + "loss": 0.0, + "step": 60583 + }, + { + "epoch": 5.653074554446207, + "grad_norm": NaN, + "learning_rate": 2.6284719125562792e-06, + "loss": 0.0, + "step": 60584 + }, + { + "epoch": 5.653167864141084, + "grad_norm": NaN, + "learning_rate": 2.627062326729401e-06, + "loss": 0.0, + "step": 60585 + }, + { + "epoch": 5.653261173835961, + "grad_norm": NaN, + "learning_rate": 2.6256531156282566e-06, + "loss": 0.0, + "step": 60586 + }, + { + "epoch": 5.653354483530839, + "grad_norm": NaN, + "learning_rate": 2.62424427925641e-06, + "loss": 0.0, + "step": 60587 + }, + { + "epoch": 5.653447793225716, + "grad_norm": NaN, + "learning_rate": 2.622835817617441e-06, + "loss": 0.0, + "step": 60588 + }, + { + "epoch": 5.6535411029205935, + "grad_norm": NaN, + "learning_rate": 2.6214277307149644e-06, + "loss": 0.0, + "step": 60589 + }, + { + "epoch": 5.653634412615471, + "grad_norm": NaN, + "learning_rate": 2.6200200185525267e-06, + "loss": 0.0, + "step": 60590 + }, + { + "epoch": 5.653727722310348, + "grad_norm": NaN, + "learning_rate": 2.6186126811337414e-06, + "loss": 0.0, + "step": 60591 + }, + { + "epoch": 5.653821032005226, + "grad_norm": NaN, + "learning_rate": 2.6172057184621563e-06, + "loss": 0.0, + "step": 60592 + }, + { + "epoch": 5.653914341700102, + "grad_norm": NaN, + "learning_rate": 2.615799130541352e-06, + "loss": 0.0, + "step": 60593 + }, + { + "epoch": 5.65400765139498, + "grad_norm": NaN, + "learning_rate": 2.6143929173749077e-06, + "loss": 0.0, + "step": 60594 + }, + { + "epoch": 5.654100961089857, + "grad_norm": NaN, + "learning_rate": 2.612987078966422e-06, + "loss": 0.0, + "step": 60595 + }, + { + "epoch": 5.654194270784735, + "grad_norm": NaN, + "learning_rate": 2.611581615319441e-06, + "loss": 0.0, + "step": 60596 + }, + { + "epoch": 5.654287580479612, + "grad_norm": NaN, + "learning_rate": 2.610176526437546e-06, + "loss": 0.0, + "step": 60597 + }, + { + "epoch": 5.654380890174489, + "grad_norm": NaN, + "learning_rate": 2.608771812324317e-06, + "loss": 0.0, + "step": 60598 + }, + { + "epoch": 5.654474199869366, + "grad_norm": NaN, + "learning_rate": 2.6073674729833184e-06, + "loss": 0.0, + "step": 60599 + }, + { + "epoch": 5.654567509564243, + "grad_norm": NaN, + "learning_rate": 2.6059635084181297e-06, + "loss": 0.0, + "step": 60600 + }, + { + "epoch": 5.654660819259121, + "grad_norm": NaN, + "learning_rate": 2.6045599186322983e-06, + "loss": 0.0, + "step": 60601 + }, + { + "epoch": 5.654754128953998, + "grad_norm": NaN, + "learning_rate": 2.6031567036294222e-06, + "loss": 0.0, + "step": 60602 + }, + { + "epoch": 5.654847438648876, + "grad_norm": NaN, + "learning_rate": 2.601753863413064e-06, + "loss": 0.0, + "step": 60603 + }, + { + "epoch": 5.654940748343753, + "grad_norm": NaN, + "learning_rate": 2.600351397986755e-06, + "loss": 0.0, + "step": 60604 + }, + { + "epoch": 5.6550340580386305, + "grad_norm": NaN, + "learning_rate": 2.598949307354109e-06, + "loss": 0.0, + "step": 60605 + }, + { + "epoch": 5.655127367733508, + "grad_norm": NaN, + "learning_rate": 2.5975475915186896e-06, + "loss": 0.0, + "step": 60606 + }, + { + "epoch": 5.655220677428385, + "grad_norm": NaN, + "learning_rate": 2.59614625048401e-06, + "loss": 0.0, + "step": 60607 + }, + { + "epoch": 5.655313987123262, + "grad_norm": NaN, + "learning_rate": 2.594745284253669e-06, + "loss": 0.0, + "step": 60608 + }, + { + "epoch": 5.655407296818139, + "grad_norm": NaN, + "learning_rate": 2.5933446928312452e-06, + "loss": 0.0, + "step": 60609 + }, + { + "epoch": 5.655500606513017, + "grad_norm": NaN, + "learning_rate": 2.59194447622027e-06, + "loss": 0.0, + "step": 60610 + }, + { + "epoch": 5.655593916207894, + "grad_norm": NaN, + "learning_rate": 2.5905446344243075e-06, + "loss": 0.0, + "step": 60611 + }, + { + "epoch": 5.6556872259027715, + "grad_norm": NaN, + "learning_rate": 2.5891451674469543e-06, + "loss": 0.0, + "step": 60612 + }, + { + "epoch": 5.655780535597649, + "grad_norm": NaN, + "learning_rate": 2.5877460752916912e-06, + "loss": 0.0, + "step": 60613 + }, + { + "epoch": 5.6558738452925255, + "grad_norm": NaN, + "learning_rate": 2.5863473579621484e-06, + "loss": 0.0, + "step": 60614 + }, + { + "epoch": 5.655967154987403, + "grad_norm": NaN, + "learning_rate": 2.5849490154618736e-06, + "loss": 0.0, + "step": 60615 + }, + { + "epoch": 5.65606046468228, + "grad_norm": NaN, + "learning_rate": 2.58355104779438e-06, + "loss": 0.0, + "step": 60616 + }, + { + "epoch": 5.656153774377158, + "grad_norm": NaN, + "learning_rate": 2.582153454963265e-06, + "loss": 0.0, + "step": 60617 + }, + { + "epoch": 5.656247084072035, + "grad_norm": NaN, + "learning_rate": 2.5807562369720592e-06, + "loss": 0.0, + "step": 60618 + }, + { + "epoch": 5.6563403937669126, + "grad_norm": NaN, + "learning_rate": 2.5793593938243094e-06, + "loss": 0.0, + "step": 60619 + }, + { + "epoch": 5.65643370346179, + "grad_norm": NaN, + "learning_rate": 2.5779629255235966e-06, + "loss": 0.0, + "step": 60620 + }, + { + "epoch": 5.656527013156667, + "grad_norm": NaN, + "learning_rate": 2.5765668320734513e-06, + "loss": 0.0, + "step": 60621 + }, + { + "epoch": 5.656620322851544, + "grad_norm": NaN, + "learning_rate": 2.57517111347742e-06, + "loss": 0.0, + "step": 60622 + }, + { + "epoch": 5.656713632546421, + "grad_norm": NaN, + "learning_rate": 2.573775769739067e-06, + "loss": 0.0, + "step": 60623 + }, + { + "epoch": 5.656806942241299, + "grad_norm": NaN, + "learning_rate": 2.572380800861923e-06, + "loss": 0.0, + "step": 60624 + }, + { + "epoch": 5.656900251936176, + "grad_norm": NaN, + "learning_rate": 2.5709862068495344e-06, + "loss": 0.0, + "step": 60625 + }, + { + "epoch": 5.656993561631054, + "grad_norm": NaN, + "learning_rate": 2.5695919877054825e-06, + "loss": 0.0, + "step": 60626 + }, + { + "epoch": 5.657086871325931, + "grad_norm": NaN, + "learning_rate": 2.5681981434332645e-06, + "loss": 0.0, + "step": 60627 + }, + { + "epoch": 5.657180181020808, + "grad_norm": NaN, + "learning_rate": 2.56680467403646e-06, + "loss": 0.0, + "step": 60628 + }, + { + "epoch": 5.657273490715685, + "grad_norm": NaN, + "learning_rate": 2.565411579518584e-06, + "loss": 0.0, + "step": 60629 + }, + { + "epoch": 5.657366800410562, + "grad_norm": NaN, + "learning_rate": 2.564018859883199e-06, + "loss": 0.0, + "step": 60630 + }, + { + "epoch": 5.65746011010544, + "grad_norm": NaN, + "learning_rate": 2.5626265151338367e-06, + "loss": 0.0, + "step": 60631 + }, + { + "epoch": 5.657553419800317, + "grad_norm": NaN, + "learning_rate": 2.56123454527406e-06, + "loss": 0.0, + "step": 60632 + }, + { + "epoch": 5.657646729495195, + "grad_norm": NaN, + "learning_rate": 2.559842950307367e-06, + "loss": 0.0, + "step": 60633 + }, + { + "epoch": 5.657740039190072, + "grad_norm": NaN, + "learning_rate": 2.5584517302373375e-06, + "loss": 0.0, + "step": 60634 + }, + { + "epoch": 5.6578333488849495, + "grad_norm": NaN, + "learning_rate": 2.557060885067469e-06, + "loss": 0.0, + "step": 60635 + }, + { + "epoch": 5.657926658579827, + "grad_norm": NaN, + "learning_rate": 2.5556704148013253e-06, + "loss": 0.0, + "step": 60636 + }, + { + "epoch": 5.658019968274703, + "grad_norm": NaN, + "learning_rate": 2.5542803194424368e-06, + "loss": 0.0, + "step": 60637 + }, + { + "epoch": 5.658113277969581, + "grad_norm": NaN, + "learning_rate": 2.5528905989943503e-06, + "loss": 0.0, + "step": 60638 + }, + { + "epoch": 5.658206587664458, + "grad_norm": NaN, + "learning_rate": 2.551501253460564e-06, + "loss": 0.0, + "step": 60639 + }, + { + "epoch": 5.658299897359336, + "grad_norm": NaN, + "learning_rate": 2.5501122828446573e-06, + "loss": 0.0, + "step": 60640 + }, + { + "epoch": 5.658393207054213, + "grad_norm": NaN, + "learning_rate": 2.5487236871501115e-06, + "loss": 0.0, + "step": 60641 + }, + { + "epoch": 5.6584865167490905, + "grad_norm": NaN, + "learning_rate": 2.547335466380507e-06, + "loss": 0.0, + "step": 60642 + }, + { + "epoch": 5.658579826443967, + "grad_norm": NaN, + "learning_rate": 2.5459476205393237e-06, + "loss": 0.0, + "step": 60643 + }, + { + "epoch": 5.6586731361388445, + "grad_norm": NaN, + "learning_rate": 2.5445601496301425e-06, + "loss": 0.0, + "step": 60644 + }, + { + "epoch": 5.658766445833722, + "grad_norm": NaN, + "learning_rate": 2.5431730536564442e-06, + "loss": 0.0, + "step": 60645 + }, + { + "epoch": 5.658859755528599, + "grad_norm": NaN, + "learning_rate": 2.5417863326217924e-06, + "loss": 0.0, + "step": 60646 + }, + { + "epoch": 5.658953065223477, + "grad_norm": NaN, + "learning_rate": 2.540399986529701e-06, + "loss": 0.0, + "step": 60647 + }, + { + "epoch": 5.659046374918354, + "grad_norm": NaN, + "learning_rate": 2.539014015383667e-06, + "loss": 0.0, + "step": 60648 + }, + { + "epoch": 5.659139684613232, + "grad_norm": NaN, + "learning_rate": 2.5376284191872543e-06, + "loss": 0.0, + "step": 60649 + }, + { + "epoch": 5.659232994308109, + "grad_norm": NaN, + "learning_rate": 2.536243197943977e-06, + "loss": 0.0, + "step": 60650 + }, + { + "epoch": 5.6593263040029855, + "grad_norm": NaN, + "learning_rate": 2.5348583516573483e-06, + "loss": 0.0, + "step": 60651 + }, + { + "epoch": 5.659419613697863, + "grad_norm": NaN, + "learning_rate": 2.533473880330883e-06, + "loss": 0.0, + "step": 60652 + }, + { + "epoch": 5.65951292339274, + "grad_norm": NaN, + "learning_rate": 2.5320897839681275e-06, + "loss": 0.0, + "step": 60653 + }, + { + "epoch": 5.659606233087618, + "grad_norm": NaN, + "learning_rate": 2.530706062572563e-06, + "loss": 0.0, + "step": 60654 + }, + { + "epoch": 5.659699542782495, + "grad_norm": NaN, + "learning_rate": 2.529322716147736e-06, + "loss": 0.0, + "step": 60655 + }, + { + "epoch": 5.659792852477373, + "grad_norm": NaN, + "learning_rate": 2.5279397446971773e-06, + "loss": 0.0, + "step": 60656 + }, + { + "epoch": 5.65988616217225, + "grad_norm": NaN, + "learning_rate": 2.5265571482243507e-06, + "loss": 0.0, + "step": 60657 + }, + { + "epoch": 5.659979471867127, + "grad_norm": NaN, + "learning_rate": 2.52517492673282e-06, + "loss": 0.0, + "step": 60658 + }, + { + "epoch": 5.660072781562004, + "grad_norm": NaN, + "learning_rate": 2.5237930802260995e-06, + "loss": 0.0, + "step": 60659 + }, + { + "epoch": 5.660166091256881, + "grad_norm": NaN, + "learning_rate": 2.5224116087076697e-06, + "loss": 0.0, + "step": 60660 + }, + { + "epoch": 5.660259400951759, + "grad_norm": NaN, + "learning_rate": 2.5210305121810603e-06, + "loss": 0.0, + "step": 60661 + }, + { + "epoch": 5.660352710646636, + "grad_norm": NaN, + "learning_rate": 2.519649790649786e-06, + "loss": 0.0, + "step": 60662 + }, + { + "epoch": 5.660446020341514, + "grad_norm": NaN, + "learning_rate": 2.51826944411736e-06, + "loss": 0.0, + "step": 60663 + }, + { + "epoch": 5.660539330036391, + "grad_norm": NaN, + "learning_rate": 2.51688947258728e-06, + "loss": 0.0, + "step": 60664 + }, + { + "epoch": 5.6606326397312685, + "grad_norm": NaN, + "learning_rate": 2.5155098760630766e-06, + "loss": 0.0, + "step": 60665 + }, + { + "epoch": 5.660725949426145, + "grad_norm": NaN, + "learning_rate": 2.51413065454823e-06, + "loss": 0.0, + "step": 60666 + }, + { + "epoch": 5.6608192591210225, + "grad_norm": NaN, + "learning_rate": 2.5127518080462704e-06, + "loss": 0.0, + "step": 60667 + }, + { + "epoch": 5.6609125688159, + "grad_norm": NaN, + "learning_rate": 2.511373336560679e-06, + "loss": 0.0, + "step": 60668 + }, + { + "epoch": 5.661005878510777, + "grad_norm": NaN, + "learning_rate": 2.5099952400949865e-06, + "loss": 0.0, + "step": 60669 + }, + { + "epoch": 5.661099188205655, + "grad_norm": NaN, + "learning_rate": 2.508617518652689e-06, + "loss": 0.0, + "step": 60670 + }, + { + "epoch": 5.661192497900532, + "grad_norm": NaN, + "learning_rate": 2.5072401722372848e-06, + "loss": 0.0, + "step": 60671 + }, + { + "epoch": 5.661285807595409, + "grad_norm": NaN, + "learning_rate": 2.5058632008522705e-06, + "loss": 0.0, + "step": 60672 + }, + { + "epoch": 5.661379117290286, + "grad_norm": NaN, + "learning_rate": 2.50448660450116e-06, + "loss": 0.0, + "step": 60673 + }, + { + "epoch": 5.6614724269851635, + "grad_norm": NaN, + "learning_rate": 2.5031103831874512e-06, + "loss": 0.0, + "step": 60674 + }, + { + "epoch": 5.661565736680041, + "grad_norm": NaN, + "learning_rate": 2.50173453691464e-06, + "loss": 0.0, + "step": 60675 + }, + { + "epoch": 5.661659046374918, + "grad_norm": NaN, + "learning_rate": 2.500359065686225e-06, + "loss": 0.0, + "step": 60676 + }, + { + "epoch": 5.661752356069796, + "grad_norm": NaN, + "learning_rate": 2.4989839695057023e-06, + "loss": 0.0, + "step": 60677 + }, + { + "epoch": 5.661845665764673, + "grad_norm": NaN, + "learning_rate": 2.4976092483765698e-06, + "loss": 0.0, + "step": 60678 + }, + { + "epoch": 5.661938975459551, + "grad_norm": NaN, + "learning_rate": 2.496234902302341e-06, + "loss": 0.0, + "step": 60679 + }, + { + "epoch": 5.662032285154428, + "grad_norm": NaN, + "learning_rate": 2.494860931286463e-06, + "loss": 0.0, + "step": 60680 + }, + { + "epoch": 5.6621255948493046, + "grad_norm": NaN, + "learning_rate": 2.4934873353324836e-06, + "loss": 0.0, + "step": 60681 + }, + { + "epoch": 5.662218904544182, + "grad_norm": NaN, + "learning_rate": 2.492114114443866e-06, + "loss": 0.0, + "step": 60682 + }, + { + "epoch": 5.662312214239059, + "grad_norm": NaN, + "learning_rate": 2.4907412686240913e-06, + "loss": 0.0, + "step": 60683 + }, + { + "epoch": 5.662405523933937, + "grad_norm": NaN, + "learning_rate": 2.4893687978766897e-06, + "loss": 0.0, + "step": 60684 + }, + { + "epoch": 5.662498833628814, + "grad_norm": NaN, + "learning_rate": 2.4879967022051084e-06, + "loss": 0.0, + "step": 60685 + }, + { + "epoch": 5.662592143323692, + "grad_norm": NaN, + "learning_rate": 2.4866249816128613e-06, + "loss": 0.0, + "step": 60686 + }, + { + "epoch": 5.662685453018568, + "grad_norm": NaN, + "learning_rate": 2.4852536361034457e-06, + "loss": 0.0, + "step": 60687 + }, + { + "epoch": 5.662778762713446, + "grad_norm": NaN, + "learning_rate": 2.483882665680309e-06, + "loss": 0.0, + "step": 60688 + }, + { + "epoch": 5.662872072408323, + "grad_norm": NaN, + "learning_rate": 2.4825120703469814e-06, + "loss": 0.0, + "step": 60689 + }, + { + "epoch": 5.6629653821032, + "grad_norm": NaN, + "learning_rate": 2.48114185010691e-06, + "loss": 0.0, + "step": 60690 + }, + { + "epoch": 5.663058691798078, + "grad_norm": NaN, + "learning_rate": 2.4797720049636093e-06, + "loss": 0.0, + "step": 60691 + }, + { + "epoch": 5.663152001492955, + "grad_norm": NaN, + "learning_rate": 2.4784025349205426e-06, + "loss": 0.0, + "step": 60692 + }, + { + "epoch": 5.663245311187833, + "grad_norm": NaN, + "learning_rate": 2.4770334399811907e-06, + "loss": 0.0, + "step": 60693 + }, + { + "epoch": 5.66333862088271, + "grad_norm": NaN, + "learning_rate": 2.4756647201490674e-06, + "loss": 0.0, + "step": 60694 + }, + { + "epoch": 5.663431930577587, + "grad_norm": NaN, + "learning_rate": 2.4742963754276037e-06, + "loss": 0.0, + "step": 60695 + }, + { + "epoch": 5.663525240272464, + "grad_norm": NaN, + "learning_rate": 2.4729284058203125e-06, + "loss": 0.0, + "step": 60696 + }, + { + "epoch": 5.6636185499673415, + "grad_norm": NaN, + "learning_rate": 2.471560811330692e-06, + "loss": 0.0, + "step": 60697 + }, + { + "epoch": 5.663711859662219, + "grad_norm": NaN, + "learning_rate": 2.4701935919621386e-06, + "loss": 0.0, + "step": 60698 + }, + { + "epoch": 5.663805169357096, + "grad_norm": NaN, + "learning_rate": 2.4688267477182167e-06, + "loss": 0.0, + "step": 60699 + }, + { + "epoch": 5.663898479051974, + "grad_norm": NaN, + "learning_rate": 2.4674602786023567e-06, + "loss": 0.0, + "step": 60700 + }, + { + "epoch": 5.663991788746851, + "grad_norm": NaN, + "learning_rate": 2.4660941846180393e-06, + "loss": 0.0, + "step": 60701 + }, + { + "epoch": 5.664085098441728, + "grad_norm": NaN, + "learning_rate": 2.4647284657687448e-06, + "loss": 0.0, + "step": 60702 + }, + { + "epoch": 5.664178408136605, + "grad_norm": NaN, + "learning_rate": 2.4633631220579543e-06, + "loss": 0.0, + "step": 60703 + }, + { + "epoch": 5.6642717178314825, + "grad_norm": NaN, + "learning_rate": 2.4619981534891144e-06, + "loss": 0.0, + "step": 60704 + }, + { + "epoch": 5.66436502752636, + "grad_norm": NaN, + "learning_rate": 2.4606335600657056e-06, + "loss": 0.0, + "step": 60705 + }, + { + "epoch": 5.664458337221237, + "grad_norm": NaN, + "learning_rate": 2.459269341791209e-06, + "loss": 0.0, + "step": 60706 + }, + { + "epoch": 5.664551646916115, + "grad_norm": NaN, + "learning_rate": 2.4579054986690883e-06, + "loss": 0.0, + "step": 60707 + }, + { + "epoch": 5.664644956610992, + "grad_norm": NaN, + "learning_rate": 2.4565420307028073e-06, + "loss": 0.0, + "step": 60708 + }, + { + "epoch": 5.66473826630587, + "grad_norm": NaN, + "learning_rate": 2.4551789378958296e-06, + "loss": 0.0, + "step": 60709 + }, + { + "epoch": 5.664831576000746, + "grad_norm": NaN, + "learning_rate": 2.4538162202516366e-06, + "loss": 0.0, + "step": 60710 + }, + { + "epoch": 5.664924885695624, + "grad_norm": NaN, + "learning_rate": 2.4524538777736745e-06, + "loss": 0.0, + "step": 60711 + }, + { + "epoch": 5.665018195390501, + "grad_norm": NaN, + "learning_rate": 2.451091910465408e-06, + "loss": 0.0, + "step": 60712 + }, + { + "epoch": 5.665111505085378, + "grad_norm": NaN, + "learning_rate": 2.4497303183303175e-06, + "loss": 0.0, + "step": 60713 + }, + { + "epoch": 5.665204814780256, + "grad_norm": NaN, + "learning_rate": 2.4483691013718498e-06, + "loss": 0.0, + "step": 60714 + }, + { + "epoch": 5.665298124475133, + "grad_norm": NaN, + "learning_rate": 2.447008259593486e-06, + "loss": 0.0, + "step": 60715 + }, + { + "epoch": 5.66539143417001, + "grad_norm": NaN, + "learning_rate": 2.445647792998656e-06, + "loss": 0.0, + "step": 60716 + }, + { + "epoch": 5.665484743864887, + "grad_norm": NaN, + "learning_rate": 2.4442877015908413e-06, + "loss": 0.0, + "step": 60717 + }, + { + "epoch": 5.665578053559765, + "grad_norm": NaN, + "learning_rate": 2.4429279853734883e-06, + "loss": 0.0, + "step": 60718 + }, + { + "epoch": 5.665671363254642, + "grad_norm": NaN, + "learning_rate": 2.4415686443500616e-06, + "loss": 0.0, + "step": 60719 + }, + { + "epoch": 5.6657646729495195, + "grad_norm": NaN, + "learning_rate": 2.4402096785240242e-06, + "loss": 0.0, + "step": 60720 + }, + { + "epoch": 5.665857982644397, + "grad_norm": NaN, + "learning_rate": 2.4388510878988076e-06, + "loss": 0.0, + "step": 60721 + }, + { + "epoch": 5.665951292339274, + "grad_norm": NaN, + "learning_rate": 2.437492872477892e-06, + "loss": 0.0, + "step": 60722 + }, + { + "epoch": 5.666044602034152, + "grad_norm": NaN, + "learning_rate": 2.4361350322647244e-06, + "loss": 0.0, + "step": 60723 + }, + { + "epoch": 5.666137911729029, + "grad_norm": NaN, + "learning_rate": 2.4347775672627357e-06, + "loss": 0.0, + "step": 60724 + }, + { + "epoch": 5.666231221423906, + "grad_norm": NaN, + "learning_rate": 2.433420477475423e-06, + "loss": 0.0, + "step": 60725 + }, + { + "epoch": 5.666324531118783, + "grad_norm": NaN, + "learning_rate": 2.4320637629061834e-06, + "loss": 0.0, + "step": 60726 + }, + { + "epoch": 5.6664178408136605, + "grad_norm": NaN, + "learning_rate": 2.4307074235585143e-06, + "loss": 0.0, + "step": 60727 + }, + { + "epoch": 5.666511150508538, + "grad_norm": NaN, + "learning_rate": 2.42935145943583e-06, + "loss": 0.0, + "step": 60728 + }, + { + "epoch": 5.666604460203415, + "grad_norm": NaN, + "learning_rate": 2.4279958705415936e-06, + "loss": 0.0, + "step": 60729 + }, + { + "epoch": 5.666697769898293, + "grad_norm": NaN, + "learning_rate": 2.426640656879253e-06, + "loss": 0.0, + "step": 60730 + }, + { + "epoch": 5.666791079593169, + "grad_norm": NaN, + "learning_rate": 2.4252858184522384e-06, + "loss": 0.0, + "step": 60731 + }, + { + "epoch": 5.666884389288047, + "grad_norm": NaN, + "learning_rate": 2.423931355264014e-06, + "loss": 0.0, + "step": 60732 + }, + { + "epoch": 5.666977698982924, + "grad_norm": NaN, + "learning_rate": 2.4225772673180267e-06, + "loss": 0.0, + "step": 60733 + }, + { + "epoch": 5.667071008677802, + "grad_norm": NaN, + "learning_rate": 2.421223554617707e-06, + "loss": 0.0, + "step": 60734 + }, + { + "epoch": 5.667164318372679, + "grad_norm": NaN, + "learning_rate": 2.419870217166503e-06, + "loss": 0.0, + "step": 60735 + }, + { + "epoch": 5.667257628067556, + "grad_norm": NaN, + "learning_rate": 2.418517254967828e-06, + "loss": 0.0, + "step": 60736 + }, + { + "epoch": 5.667350937762434, + "grad_norm": NaN, + "learning_rate": 2.417164668025162e-06, + "loss": 0.0, + "step": 60737 + }, + { + "epoch": 5.667444247457311, + "grad_norm": NaN, + "learning_rate": 2.4158124563419367e-06, + "loss": 0.0, + "step": 60738 + }, + { + "epoch": 5.667537557152188, + "grad_norm": NaN, + "learning_rate": 2.4144606199215823e-06, + "loss": 0.0, + "step": 60739 + }, + { + "epoch": 5.667630866847065, + "grad_norm": NaN, + "learning_rate": 2.413109158767529e-06, + "loss": 0.0, + "step": 60740 + }, + { + "epoch": 5.667724176541943, + "grad_norm": NaN, + "learning_rate": 2.411758072883224e-06, + "loss": 0.0, + "step": 60741 + }, + { + "epoch": 5.66781748623682, + "grad_norm": NaN, + "learning_rate": 2.410407362272099e-06, + "loss": 0.0, + "step": 60742 + }, + { + "epoch": 5.667910795931697, + "grad_norm": NaN, + "learning_rate": 2.409057026937583e-06, + "loss": 0.0, + "step": 60743 + }, + { + "epoch": 5.668004105626575, + "grad_norm": NaN, + "learning_rate": 2.4077070668831244e-06, + "loss": 0.0, + "step": 60744 + }, + { + "epoch": 5.668097415321452, + "grad_norm": NaN, + "learning_rate": 2.4063574821121367e-06, + "loss": 0.0, + "step": 60745 + }, + { + "epoch": 5.668190725016329, + "grad_norm": NaN, + "learning_rate": 2.405008272628067e-06, + "loss": 0.0, + "step": 60746 + }, + { + "epoch": 5.668284034711206, + "grad_norm": NaN, + "learning_rate": 2.4036594384343466e-06, + "loss": 0.0, + "step": 60747 + }, + { + "epoch": 5.668377344406084, + "grad_norm": NaN, + "learning_rate": 2.402310979534389e-06, + "loss": 0.0, + "step": 60748 + }, + { + "epoch": 5.668470654100961, + "grad_norm": NaN, + "learning_rate": 2.4009628959316407e-06, + "loss": 0.0, + "step": 60749 + }, + { + "epoch": 5.6685639637958385, + "grad_norm": NaN, + "learning_rate": 2.3996151876295166e-06, + "loss": 0.0, + "step": 60750 + }, + { + "epoch": 5.668657273490716, + "grad_norm": NaN, + "learning_rate": 2.398267854631447e-06, + "loss": 0.0, + "step": 60751 + }, + { + "epoch": 5.668750583185593, + "grad_norm": NaN, + "learning_rate": 2.3969208969408625e-06, + "loss": 0.0, + "step": 60752 + }, + { + "epoch": 5.668843892880471, + "grad_norm": NaN, + "learning_rate": 2.3955743145611773e-06, + "loss": 0.0, + "step": 60753 + }, + { + "epoch": 5.668937202575347, + "grad_norm": NaN, + "learning_rate": 2.394228107495838e-06, + "loss": 0.0, + "step": 60754 + }, + { + "epoch": 5.669030512270225, + "grad_norm": NaN, + "learning_rate": 2.3928822757482424e-06, + "loss": 0.0, + "step": 60755 + }, + { + "epoch": 5.669123821965102, + "grad_norm": NaN, + "learning_rate": 2.391536819321821e-06, + "loss": 0.0, + "step": 60756 + }, + { + "epoch": 5.6692171316599795, + "grad_norm": NaN, + "learning_rate": 2.3901917382200044e-06, + "loss": 0.0, + "step": 60757 + }, + { + "epoch": 5.669310441354857, + "grad_norm": NaN, + "learning_rate": 2.388847032446206e-06, + "loss": 0.0, + "step": 60758 + }, + { + "epoch": 5.669403751049734, + "grad_norm": NaN, + "learning_rate": 2.3875027020038407e-06, + "loss": 0.0, + "step": 60759 + }, + { + "epoch": 5.669497060744611, + "grad_norm": NaN, + "learning_rate": 2.3861587468963216e-06, + "loss": 0.0, + "step": 60760 + }, + { + "epoch": 5.669590370439488, + "grad_norm": NaN, + "learning_rate": 2.38481516712708e-06, + "loss": 0.0, + "step": 60761 + }, + { + "epoch": 5.669683680134366, + "grad_norm": NaN, + "learning_rate": 2.3834719626995292e-06, + "loss": 0.0, + "step": 60762 + }, + { + "epoch": 5.669776989829243, + "grad_norm": NaN, + "learning_rate": 2.3821291336170836e-06, + "loss": 0.0, + "step": 60763 + }, + { + "epoch": 5.669870299524121, + "grad_norm": NaN, + "learning_rate": 2.3807866798831566e-06, + "loss": 0.0, + "step": 60764 + }, + { + "epoch": 5.669963609218998, + "grad_norm": NaN, + "learning_rate": 2.3794446015011627e-06, + "loss": 0.0, + "step": 60765 + }, + { + "epoch": 5.670056918913875, + "grad_norm": NaN, + "learning_rate": 2.378102898474499e-06, + "loss": 0.0, + "step": 60766 + }, + { + "epoch": 5.670150228608753, + "grad_norm": NaN, + "learning_rate": 2.3767615708065956e-06, + "loss": 0.0, + "step": 60767 + }, + { + "epoch": 5.670243538303629, + "grad_norm": NaN, + "learning_rate": 2.3754206185008673e-06, + "loss": 0.0, + "step": 60768 + }, + { + "epoch": 5.670336847998507, + "grad_norm": NaN, + "learning_rate": 2.374080041560711e-06, + "loss": 0.0, + "step": 60769 + }, + { + "epoch": 5.670430157693384, + "grad_norm": NaN, + "learning_rate": 2.3727398399895403e-06, + "loss": 0.0, + "step": 60770 + }, + { + "epoch": 5.670523467388262, + "grad_norm": NaN, + "learning_rate": 2.371400013790753e-06, + "loss": 0.0, + "step": 60771 + }, + { + "epoch": 5.670616777083139, + "grad_norm": NaN, + "learning_rate": 2.3700605629677795e-06, + "loss": 0.0, + "step": 60772 + }, + { + "epoch": 5.6707100867780165, + "grad_norm": NaN, + "learning_rate": 2.368721487524e-06, + "loss": 0.0, + "step": 60773 + }, + { + "epoch": 5.670803396472894, + "grad_norm": NaN, + "learning_rate": 2.3673827874628293e-06, + "loss": 0.0, + "step": 60774 + }, + { + "epoch": 5.67089670616777, + "grad_norm": NaN, + "learning_rate": 2.366044462787664e-06, + "loss": 0.0, + "step": 60775 + }, + { + "epoch": 5.670990015862648, + "grad_norm": NaN, + "learning_rate": 2.364706513501935e-06, + "loss": 0.0, + "step": 60776 + }, + { + "epoch": 5.671083325557525, + "grad_norm": NaN, + "learning_rate": 2.363368939609023e-06, + "loss": 0.0, + "step": 60777 + }, + { + "epoch": 5.671176635252403, + "grad_norm": NaN, + "learning_rate": 2.362031741112308e-06, + "loss": 0.0, + "step": 60778 + }, + { + "epoch": 5.67126994494728, + "grad_norm": NaN, + "learning_rate": 2.360694918015238e-06, + "loss": 0.0, + "step": 60779 + }, + { + "epoch": 5.6713632546421575, + "grad_norm": NaN, + "learning_rate": 2.3593584703211764e-06, + "loss": 0.0, + "step": 60780 + }, + { + "epoch": 5.671456564337035, + "grad_norm": NaN, + "learning_rate": 2.3580223980335377e-06, + "loss": 0.0, + "step": 60781 + }, + { + "epoch": 5.671549874031912, + "grad_norm": NaN, + "learning_rate": 2.356686701155702e-06, + "loss": 0.0, + "step": 60782 + }, + { + "epoch": 5.671643183726789, + "grad_norm": NaN, + "learning_rate": 2.355351379691084e-06, + "loss": 0.0, + "step": 60783 + }, + { + "epoch": 5.671736493421666, + "grad_norm": NaN, + "learning_rate": 2.3540164336430632e-06, + "loss": 0.0, + "step": 60784 + }, + { + "epoch": 5.671829803116544, + "grad_norm": NaN, + "learning_rate": 2.3526818630150546e-06, + "loss": 0.0, + "step": 60785 + }, + { + "epoch": 5.671923112811421, + "grad_norm": NaN, + "learning_rate": 2.3513476678104382e-06, + "loss": 0.0, + "step": 60786 + }, + { + "epoch": 5.672016422506299, + "grad_norm": NaN, + "learning_rate": 2.350013848032595e-06, + "loss": 0.0, + "step": 60787 + }, + { + "epoch": 5.672109732201176, + "grad_norm": NaN, + "learning_rate": 2.348680403684938e-06, + "loss": 0.0, + "step": 60788 + }, + { + "epoch": 5.6722030418960525, + "grad_norm": NaN, + "learning_rate": 2.347347334770866e-06, + "loss": 0.0, + "step": 60789 + }, + { + "epoch": 5.67229635159093, + "grad_norm": NaN, + "learning_rate": 2.3460146412937253e-06, + "loss": 0.0, + "step": 60790 + }, + { + "epoch": 5.672389661285807, + "grad_norm": NaN, + "learning_rate": 2.3446823232569467e-06, + "loss": 0.0, + "step": 60791 + }, + { + "epoch": 5.672482970980685, + "grad_norm": NaN, + "learning_rate": 2.343350380663894e-06, + "loss": 0.0, + "step": 60792 + }, + { + "epoch": 5.672576280675562, + "grad_norm": NaN, + "learning_rate": 2.342018813517965e-06, + "loss": 0.0, + "step": 60793 + }, + { + "epoch": 5.67266959037044, + "grad_norm": NaN, + "learning_rate": 2.34068762182254e-06, + "loss": 0.0, + "step": 60794 + }, + { + "epoch": 5.672762900065317, + "grad_norm": NaN, + "learning_rate": 2.3393568055810162e-06, + "loss": 0.0, + "step": 60795 + }, + { + "epoch": 5.6728562097601944, + "grad_norm": NaN, + "learning_rate": 2.338026364796758e-06, + "loss": 0.0, + "step": 60796 + }, + { + "epoch": 5.672949519455072, + "grad_norm": NaN, + "learning_rate": 2.3366962994731617e-06, + "loss": 0.0, + "step": 60797 + }, + { + "epoch": 5.673042829149948, + "grad_norm": NaN, + "learning_rate": 2.3353666096136093e-06, + "loss": 0.0, + "step": 60798 + }, + { + "epoch": 5.673136138844826, + "grad_norm": NaN, + "learning_rate": 2.334037295221464e-06, + "loss": 0.0, + "step": 60799 + }, + { + "epoch": 5.673229448539703, + "grad_norm": NaN, + "learning_rate": 2.3327083563001393e-06, + "loss": 0.0, + "step": 60800 + }, + { + "epoch": 5.673322758234581, + "grad_norm": NaN, + "learning_rate": 2.331379792852983e-06, + "loss": 0.0, + "step": 60801 + }, + { + "epoch": 5.673416067929458, + "grad_norm": NaN, + "learning_rate": 2.330051604883393e-06, + "loss": 0.0, + "step": 60802 + }, + { + "epoch": 5.6735093776243355, + "grad_norm": NaN, + "learning_rate": 2.3287237923947323e-06, + "loss": 0.0, + "step": 60803 + }, + { + "epoch": 5.673602687319212, + "grad_norm": NaN, + "learning_rate": 2.3273963553903818e-06, + "loss": 0.0, + "step": 60804 + }, + { + "epoch": 5.673695997014089, + "grad_norm": NaN, + "learning_rate": 2.3260692938737225e-06, + "loss": 0.0, + "step": 60805 + }, + { + "epoch": 5.673789306708967, + "grad_norm": NaN, + "learning_rate": 2.324742607848118e-06, + "loss": 0.0, + "step": 60806 + }, + { + "epoch": 5.673882616403844, + "grad_norm": NaN, + "learning_rate": 2.323416297316966e-06, + "loss": 0.0, + "step": 60807 + }, + { + "epoch": 5.673975926098722, + "grad_norm": NaN, + "learning_rate": 2.322090362283596e-06, + "loss": 0.0, + "step": 60808 + }, + { + "epoch": 5.674069235793599, + "grad_norm": NaN, + "learning_rate": 2.3207648027514236e-06, + "loss": 0.0, + "step": 60809 + }, + { + "epoch": 5.6741625454884765, + "grad_norm": NaN, + "learning_rate": 2.3194396187237953e-06, + "loss": 0.0, + "step": 60810 + }, + { + "epoch": 5.674255855183354, + "grad_norm": NaN, + "learning_rate": 2.3181148102040913e-06, + "loss": 0.0, + "step": 60811 + }, + { + "epoch": 5.6743491648782305, + "grad_norm": NaN, + "learning_rate": 2.3167903771956597e-06, + "loss": 0.0, + "step": 60812 + }, + { + "epoch": 5.674442474573108, + "grad_norm": NaN, + "learning_rate": 2.3154663197018972e-06, + "loss": 0.0, + "step": 60813 + }, + { + "epoch": 5.674535784267985, + "grad_norm": NaN, + "learning_rate": 2.314142637726152e-06, + "loss": 0.0, + "step": 60814 + }, + { + "epoch": 5.674629093962863, + "grad_norm": NaN, + "learning_rate": 2.312819331271787e-06, + "loss": 0.0, + "step": 60815 + }, + { + "epoch": 5.67472240365774, + "grad_norm": NaN, + "learning_rate": 2.3114964003421834e-06, + "loss": 0.0, + "step": 60816 + }, + { + "epoch": 5.674815713352618, + "grad_norm": NaN, + "learning_rate": 2.310173844940705e-06, + "loss": 0.0, + "step": 60817 + }, + { + "epoch": 5.674909023047495, + "grad_norm": NaN, + "learning_rate": 2.3088516650706823e-06, + "loss": 0.0, + "step": 60818 + }, + { + "epoch": 5.6750023327423715, + "grad_norm": NaN, + "learning_rate": 2.307529860735513e-06, + "loss": 0.0, + "step": 60819 + }, + { + "epoch": 5.675095642437249, + "grad_norm": NaN, + "learning_rate": 2.3062084319385445e-06, + "loss": 0.0, + "step": 60820 + }, + { + "epoch": 5.675188952132126, + "grad_norm": NaN, + "learning_rate": 2.30488737868314e-06, + "loss": 0.0, + "step": 60821 + }, + { + "epoch": 5.675282261827004, + "grad_norm": NaN, + "learning_rate": 2.303566700972664e-06, + "loss": 0.0, + "step": 60822 + }, + { + "epoch": 5.675375571521881, + "grad_norm": NaN, + "learning_rate": 2.3022463988104476e-06, + "loss": 0.0, + "step": 60823 + }, + { + "epoch": 5.675468881216759, + "grad_norm": NaN, + "learning_rate": 2.3009264721998877e-06, + "loss": 0.0, + "step": 60824 + }, + { + "epoch": 5.675562190911636, + "grad_norm": NaN, + "learning_rate": 2.299606921144298e-06, + "loss": 0.0, + "step": 60825 + }, + { + "epoch": 5.6756555006065135, + "grad_norm": NaN, + "learning_rate": 2.298287745647076e-06, + "loss": 0.0, + "step": 60826 + }, + { + "epoch": 5.67574881030139, + "grad_norm": NaN, + "learning_rate": 2.2969689457115527e-06, + "loss": 0.0, + "step": 60827 + }, + { + "epoch": 5.675842119996267, + "grad_norm": NaN, + "learning_rate": 2.2956505213410746e-06, + "loss": 0.0, + "step": 60828 + }, + { + "epoch": 5.675935429691145, + "grad_norm": NaN, + "learning_rate": 2.2943324725390067e-06, + "loss": 0.0, + "step": 60829 + }, + { + "epoch": 5.676028739386022, + "grad_norm": NaN, + "learning_rate": 2.2930147993086956e-06, + "loss": 0.0, + "step": 60830 + }, + { + "epoch": 5.6761220490809, + "grad_norm": NaN, + "learning_rate": 2.2916975016535056e-06, + "loss": 0.0, + "step": 60831 + }, + { + "epoch": 5.676215358775777, + "grad_norm": NaN, + "learning_rate": 2.2903805795767673e-06, + "loss": 0.0, + "step": 60832 + }, + { + "epoch": 5.676308668470654, + "grad_norm": NaN, + "learning_rate": 2.2890640330818277e-06, + "loss": 0.0, + "step": 60833 + }, + { + "epoch": 5.676401978165531, + "grad_norm": NaN, + "learning_rate": 2.2877478621720514e-06, + "loss": 0.0, + "step": 60834 + }, + { + "epoch": 5.6764952878604085, + "grad_norm": NaN, + "learning_rate": 2.286432066850785e-06, + "loss": 0.0, + "step": 60835 + }, + { + "epoch": 5.676588597555286, + "grad_norm": NaN, + "learning_rate": 2.28511664712136e-06, + "loss": 0.0, + "step": 60836 + }, + { + "epoch": 5.676681907250163, + "grad_norm": NaN, + "learning_rate": 2.283801602987123e-06, + "loss": 0.0, + "step": 60837 + }, + { + "epoch": 5.676775216945041, + "grad_norm": NaN, + "learning_rate": 2.282486934451422e-06, + "loss": 0.0, + "step": 60838 + }, + { + "epoch": 5.676868526639918, + "grad_norm": NaN, + "learning_rate": 2.2811726415176037e-06, + "loss": 0.0, + "step": 60839 + }, + { + "epoch": 5.676961836334796, + "grad_norm": NaN, + "learning_rate": 2.2798587241889986e-06, + "loss": 0.0, + "step": 60840 + }, + { + "epoch": 5.677055146029672, + "grad_norm": NaN, + "learning_rate": 2.2785451824689548e-06, + "loss": 0.0, + "step": 60841 + }, + { + "epoch": 5.6771484557245495, + "grad_norm": NaN, + "learning_rate": 2.277232016360819e-06, + "loss": 0.0, + "step": 60842 + }, + { + "epoch": 5.677241765419427, + "grad_norm": NaN, + "learning_rate": 2.2759192258679227e-06, + "loss": 0.0, + "step": 60843 + }, + { + "epoch": 5.677335075114304, + "grad_norm": NaN, + "learning_rate": 2.2746068109935956e-06, + "loss": 0.0, + "step": 60844 + }, + { + "epoch": 5.677428384809182, + "grad_norm": NaN, + "learning_rate": 2.273294771741202e-06, + "loss": 0.0, + "step": 60845 + }, + { + "epoch": 5.677521694504059, + "grad_norm": NaN, + "learning_rate": 2.2719831081140396e-06, + "loss": 0.0, + "step": 60846 + }, + { + "epoch": 5.677615004198937, + "grad_norm": NaN, + "learning_rate": 2.2706718201154717e-06, + "loss": 0.0, + "step": 60847 + }, + { + "epoch": 5.677708313893813, + "grad_norm": NaN, + "learning_rate": 2.2693609077488296e-06, + "loss": 0.0, + "step": 60848 + }, + { + "epoch": 5.677801623588691, + "grad_norm": NaN, + "learning_rate": 2.2680503710174434e-06, + "loss": 0.0, + "step": 60849 + }, + { + "epoch": 5.677894933283568, + "grad_norm": NaN, + "learning_rate": 2.266740209924628e-06, + "loss": 0.0, + "step": 60850 + }, + { + "epoch": 5.677988242978445, + "grad_norm": NaN, + "learning_rate": 2.2654304244737466e-06, + "loss": 0.0, + "step": 60851 + }, + { + "epoch": 5.678081552673323, + "grad_norm": NaN, + "learning_rate": 2.264121014668113e-06, + "loss": 0.0, + "step": 60852 + }, + { + "epoch": 5.6781748623682, + "grad_norm": NaN, + "learning_rate": 2.262811980511042e-06, + "loss": 0.0, + "step": 60853 + }, + { + "epoch": 5.678268172063078, + "grad_norm": NaN, + "learning_rate": 2.2615033220058977e-06, + "loss": 0.0, + "step": 60854 + }, + { + "epoch": 5.678361481757955, + "grad_norm": NaN, + "learning_rate": 2.2601950391559763e-06, + "loss": 0.0, + "step": 60855 + }, + { + "epoch": 5.678454791452832, + "grad_norm": NaN, + "learning_rate": 2.2588871319646097e-06, + "loss": 0.0, + "step": 60856 + }, + { + "epoch": 5.678548101147709, + "grad_norm": NaN, + "learning_rate": 2.2575796004351444e-06, + "loss": 0.0, + "step": 60857 + }, + { + "epoch": 5.6786414108425864, + "grad_norm": NaN, + "learning_rate": 2.256272444570878e-06, + "loss": 0.0, + "step": 60858 + }, + { + "epoch": 5.678734720537464, + "grad_norm": NaN, + "learning_rate": 2.254965664375141e-06, + "loss": 0.0, + "step": 60859 + }, + { + "epoch": 5.678828030232341, + "grad_norm": NaN, + "learning_rate": 2.253659259851265e-06, + "loss": 0.0, + "step": 60860 + }, + { + "epoch": 5.678921339927219, + "grad_norm": NaN, + "learning_rate": 2.2523532310025793e-06, + "loss": 0.0, + "step": 60861 + }, + { + "epoch": 5.679014649622096, + "grad_norm": NaN, + "learning_rate": 2.251047577832382e-06, + "loss": 0.0, + "step": 60862 + }, + { + "epoch": 5.679107959316973, + "grad_norm": NaN, + "learning_rate": 2.249742300344004e-06, + "loss": 0.0, + "step": 60863 + }, + { + "epoch": 5.67920126901185, + "grad_norm": NaN, + "learning_rate": 2.2484373985407753e-06, + "loss": 0.0, + "step": 60864 + }, + { + "epoch": 5.6792945787067275, + "grad_norm": NaN, + "learning_rate": 2.247132872425994e-06, + "loss": 0.0, + "step": 60865 + }, + { + "epoch": 5.679387888401605, + "grad_norm": NaN, + "learning_rate": 2.2458287220029735e-06, + "loss": 0.0, + "step": 60866 + }, + { + "epoch": 5.679481198096482, + "grad_norm": NaN, + "learning_rate": 2.2445249472750615e-06, + "loss": 0.0, + "step": 60867 + }, + { + "epoch": 5.67957450779136, + "grad_norm": NaN, + "learning_rate": 2.243221548245555e-06, + "loss": 0.0, + "step": 60868 + }, + { + "epoch": 5.679667817486237, + "grad_norm": NaN, + "learning_rate": 2.2419185249177517e-06, + "loss": 0.0, + "step": 60869 + }, + { + "epoch": 5.679761127181115, + "grad_norm": NaN, + "learning_rate": 2.240615877294999e-06, + "loss": 0.0, + "step": 60870 + }, + { + "epoch": 5.679854436875991, + "grad_norm": NaN, + "learning_rate": 2.239313605380577e-06, + "loss": 0.0, + "step": 60871 + }, + { + "epoch": 5.6799477465708685, + "grad_norm": NaN, + "learning_rate": 2.2380117091778172e-06, + "loss": 0.0, + "step": 60872 + }, + { + "epoch": 5.680041056265746, + "grad_norm": NaN, + "learning_rate": 2.236710188690016e-06, + "loss": 0.0, + "step": 60873 + }, + { + "epoch": 5.680134365960623, + "grad_norm": NaN, + "learning_rate": 2.235409043920505e-06, + "loss": 0.0, + "step": 60874 + }, + { + "epoch": 5.680227675655501, + "grad_norm": NaN, + "learning_rate": 2.2341082748725647e-06, + "loss": 0.0, + "step": 60875 + }, + { + "epoch": 5.680320985350378, + "grad_norm": NaN, + "learning_rate": 2.2328078815495085e-06, + "loss": 0.0, + "step": 60876 + }, + { + "epoch": 5.680414295045255, + "grad_norm": NaN, + "learning_rate": 2.231507863954668e-06, + "loss": 0.0, + "step": 60877 + }, + { + "epoch": 5.680507604740132, + "grad_norm": NaN, + "learning_rate": 2.230208222091323e-06, + "loss": 0.0, + "step": 60878 + }, + { + "epoch": 5.68060091443501, + "grad_norm": NaN, + "learning_rate": 2.228908955962788e-06, + "loss": 0.0, + "step": 60879 + }, + { + "epoch": 5.680694224129887, + "grad_norm": NaN, + "learning_rate": 2.2276100655723605e-06, + "loss": 0.0, + "step": 60880 + }, + { + "epoch": 5.680787533824764, + "grad_norm": NaN, + "learning_rate": 2.2263115509233544e-06, + "loss": 0.0, + "step": 60881 + }, + { + "epoch": 5.680880843519642, + "grad_norm": NaN, + "learning_rate": 2.22501341201905e-06, + "loss": 0.0, + "step": 60882 + }, + { + "epoch": 5.680974153214519, + "grad_norm": NaN, + "learning_rate": 2.2237156488627784e-06, + "loss": 0.0, + "step": 60883 + }, + { + "epoch": 5.681067462909397, + "grad_norm": NaN, + "learning_rate": 2.2224182614578033e-06, + "loss": 0.0, + "step": 60884 + }, + { + "epoch": 5.681160772604273, + "grad_norm": NaN, + "learning_rate": 2.221121249807473e-06, + "loss": 0.0, + "step": 60885 + }, + { + "epoch": 5.681254082299151, + "grad_norm": NaN, + "learning_rate": 2.2198246139150333e-06, + "loss": 0.0, + "step": 60886 + }, + { + "epoch": 5.681347391994028, + "grad_norm": NaN, + "learning_rate": 2.218528353783816e-06, + "loss": 0.0, + "step": 60887 + }, + { + "epoch": 5.6814407016889055, + "grad_norm": NaN, + "learning_rate": 2.217232469417085e-06, + "loss": 0.0, + "step": 60888 + }, + { + "epoch": 5.681534011383783, + "grad_norm": NaN, + "learning_rate": 2.215936960818171e-06, + "loss": 0.0, + "step": 60889 + }, + { + "epoch": 5.68162732107866, + "grad_norm": NaN, + "learning_rate": 2.2146418279903545e-06, + "loss": 0.0, + "step": 60890 + }, + { + "epoch": 5.681720630773538, + "grad_norm": NaN, + "learning_rate": 2.213347070936916e-06, + "loss": 0.0, + "step": 60891 + }, + { + "epoch": 5.681813940468414, + "grad_norm": NaN, + "learning_rate": 2.21205268966117e-06, + "loss": 0.0, + "step": 60892 + }, + { + "epoch": 5.681907250163292, + "grad_norm": NaN, + "learning_rate": 2.210758684166397e-06, + "loss": 0.0, + "step": 60893 + }, + { + "epoch": 5.682000559858169, + "grad_norm": NaN, + "learning_rate": 2.209465054455878e-06, + "loss": 0.0, + "step": 60894 + }, + { + "epoch": 5.6820938695530465, + "grad_norm": NaN, + "learning_rate": 2.2081718005329096e-06, + "loss": 0.0, + "step": 60895 + }, + { + "epoch": 5.682187179247924, + "grad_norm": NaN, + "learning_rate": 2.2068789224007897e-06, + "loss": 0.0, + "step": 60896 + }, + { + "epoch": 5.682280488942801, + "grad_norm": NaN, + "learning_rate": 2.2055864200627825e-06, + "loss": 0.0, + "step": 60897 + }, + { + "epoch": 5.682373798637679, + "grad_norm": NaN, + "learning_rate": 2.204294293522202e-06, + "loss": 0.0, + "step": 60898 + }, + { + "epoch": 5.682467108332556, + "grad_norm": NaN, + "learning_rate": 2.2030025427823286e-06, + "loss": 0.0, + "step": 60899 + }, + { + "epoch": 5.682560418027433, + "grad_norm": NaN, + "learning_rate": 2.201711167846426e-06, + "loss": 0.0, + "step": 60900 + }, + { + "epoch": 5.68265372772231, + "grad_norm": NaN, + "learning_rate": 2.2004201687177923e-06, + "loss": 0.0, + "step": 60901 + }, + { + "epoch": 5.682747037417188, + "grad_norm": NaN, + "learning_rate": 2.1991295453997082e-06, + "loss": 0.0, + "step": 60902 + }, + { + "epoch": 5.682840347112065, + "grad_norm": NaN, + "learning_rate": 2.197839297895454e-06, + "loss": 0.0, + "step": 60903 + }, + { + "epoch": 5.682933656806942, + "grad_norm": NaN, + "learning_rate": 2.1965494262083104e-06, + "loss": 0.0, + "step": 60904 + }, + { + "epoch": 5.68302696650182, + "grad_norm": NaN, + "learning_rate": 2.1952599303415585e-06, + "loss": 0.0, + "step": 60905 + }, + { + "epoch": 5.683120276196696, + "grad_norm": NaN, + "learning_rate": 2.1939708102984955e-06, + "loss": 0.0, + "step": 60906 + }, + { + "epoch": 5.683213585891574, + "grad_norm": NaN, + "learning_rate": 2.1926820660823686e-06, + "loss": 0.0, + "step": 60907 + }, + { + "epoch": 5.683306895586451, + "grad_norm": NaN, + "learning_rate": 2.1913936976964587e-06, + "loss": 0.0, + "step": 60908 + }, + { + "epoch": 5.683400205281329, + "grad_norm": NaN, + "learning_rate": 2.1901057051440628e-06, + "loss": 0.0, + "step": 60909 + }, + { + "epoch": 5.683493514976206, + "grad_norm": NaN, + "learning_rate": 2.1888180884284454e-06, + "loss": 0.0, + "step": 60910 + }, + { + "epoch": 5.6835868246710834, + "grad_norm": NaN, + "learning_rate": 2.1875308475528707e-06, + "loss": 0.0, + "step": 60911 + }, + { + "epoch": 5.683680134365961, + "grad_norm": NaN, + "learning_rate": 2.186243982520619e-06, + "loss": 0.0, + "step": 60912 + }, + { + "epoch": 5.683773444060838, + "grad_norm": NaN, + "learning_rate": 2.184957493334971e-06, + "loss": 0.0, + "step": 60913 + }, + { + "epoch": 5.683866753755716, + "grad_norm": NaN, + "learning_rate": 2.1836713799991745e-06, + "loss": 0.0, + "step": 60914 + }, + { + "epoch": 5.683960063450592, + "grad_norm": NaN, + "learning_rate": 2.1823856425165265e-06, + "loss": 0.0, + "step": 60915 + }, + { + "epoch": 5.68405337314547, + "grad_norm": NaN, + "learning_rate": 2.1811002808902913e-06, + "loss": 0.0, + "step": 60916 + }, + { + "epoch": 5.684146682840347, + "grad_norm": NaN, + "learning_rate": 2.179815295123716e-06, + "loss": 0.0, + "step": 60917 + }, + { + "epoch": 5.6842399925352245, + "grad_norm": NaN, + "learning_rate": 2.178530685220098e-06, + "loss": 0.0, + "step": 60918 + }, + { + "epoch": 5.684333302230102, + "grad_norm": NaN, + "learning_rate": 2.177246451182685e-06, + "loss": 0.0, + "step": 60919 + }, + { + "epoch": 5.684426611924979, + "grad_norm": NaN, + "learning_rate": 2.1759625930147406e-06, + "loss": 0.0, + "step": 60920 + }, + { + "epoch": 5.684519921619856, + "grad_norm": NaN, + "learning_rate": 2.1746791107195294e-06, + "loss": 0.0, + "step": 60921 + }, + { + "epoch": 5.684613231314733, + "grad_norm": NaN, + "learning_rate": 2.1733960043003318e-06, + "loss": 0.0, + "step": 60922 + }, + { + "epoch": 5.684706541009611, + "grad_norm": NaN, + "learning_rate": 2.172113273760395e-06, + "loss": 0.0, + "step": 60923 + }, + { + "epoch": 5.684799850704488, + "grad_norm": NaN, + "learning_rate": 2.1708309191029836e-06, + "loss": 0.0, + "step": 60924 + }, + { + "epoch": 5.6848931603993655, + "grad_norm": NaN, + "learning_rate": 2.1695489403313616e-06, + "loss": 0.0, + "step": 60925 + }, + { + "epoch": 5.684986470094243, + "grad_norm": NaN, + "learning_rate": 2.168267337448776e-06, + "loss": 0.0, + "step": 60926 + }, + { + "epoch": 5.68507977978912, + "grad_norm": NaN, + "learning_rate": 2.1669861104585074e-06, + "loss": 0.0, + "step": 60927 + }, + { + "epoch": 5.685173089483998, + "grad_norm": NaN, + "learning_rate": 2.1657052593638036e-06, + "loss": 0.0, + "step": 60928 + }, + { + "epoch": 5.685266399178874, + "grad_norm": NaN, + "learning_rate": 2.1644247841679117e-06, + "loss": 0.0, + "step": 60929 + }, + { + "epoch": 5.685359708873752, + "grad_norm": NaN, + "learning_rate": 2.1631446848740965e-06, + "loss": 0.0, + "step": 60930 + }, + { + "epoch": 5.685453018568629, + "grad_norm": NaN, + "learning_rate": 2.1618649614856043e-06, + "loss": 0.0, + "step": 60931 + }, + { + "epoch": 5.685546328263507, + "grad_norm": NaN, + "learning_rate": 2.1605856140057e-06, + "loss": 0.0, + "step": 60932 + }, + { + "epoch": 5.685639637958384, + "grad_norm": NaN, + "learning_rate": 2.159306642437647e-06, + "loss": 0.0, + "step": 60933 + }, + { + "epoch": 5.685732947653261, + "grad_norm": NaN, + "learning_rate": 2.1580280467846767e-06, + "loss": 0.0, + "step": 60934 + }, + { + "epoch": 5.685826257348139, + "grad_norm": NaN, + "learning_rate": 2.156749827050036e-06, + "loss": 0.0, + "step": 60935 + }, + { + "epoch": 5.685919567043015, + "grad_norm": NaN, + "learning_rate": 2.155471983237006e-06, + "loss": 0.0, + "step": 60936 + }, + { + "epoch": 5.686012876737893, + "grad_norm": NaN, + "learning_rate": 2.1541945153488003e-06, + "loss": 0.0, + "step": 60937 + }, + { + "epoch": 5.68610618643277, + "grad_norm": NaN, + "learning_rate": 2.1529174233886835e-06, + "loss": 0.0, + "step": 60938 + }, + { + "epoch": 5.686199496127648, + "grad_norm": NaN, + "learning_rate": 2.151640707359903e-06, + "loss": 0.0, + "step": 60939 + }, + { + "epoch": 5.686292805822525, + "grad_norm": NaN, + "learning_rate": 2.1503643672657056e-06, + "loss": 0.0, + "step": 60940 + }, + { + "epoch": 5.6863861155174025, + "grad_norm": NaN, + "learning_rate": 2.149088403109339e-06, + "loss": 0.0, + "step": 60941 + }, + { + "epoch": 5.68647942521228, + "grad_norm": NaN, + "learning_rate": 2.147812814894051e-06, + "loss": 0.0, + "step": 60942 + }, + { + "epoch": 5.686572734907157, + "grad_norm": NaN, + "learning_rate": 2.1465376026230553e-06, + "loss": 0.0, + "step": 60943 + }, + { + "epoch": 5.686666044602034, + "grad_norm": NaN, + "learning_rate": 2.1452627662996322e-06, + "loss": 0.0, + "step": 60944 + }, + { + "epoch": 5.686759354296911, + "grad_norm": NaN, + "learning_rate": 2.1439883059270136e-06, + "loss": 0.0, + "step": 60945 + }, + { + "epoch": 5.686852663991789, + "grad_norm": NaN, + "learning_rate": 2.142714221508429e-06, + "loss": 0.0, + "step": 60946 + }, + { + "epoch": 5.686945973686666, + "grad_norm": NaN, + "learning_rate": 2.14144051304711e-06, + "loss": 0.0, + "step": 60947 + }, + { + "epoch": 5.6870392833815435, + "grad_norm": NaN, + "learning_rate": 2.1401671805463373e-06, + "loss": 0.0, + "step": 60948 + }, + { + "epoch": 5.687132593076421, + "grad_norm": NaN, + "learning_rate": 2.138894224009291e-06, + "loss": 0.0, + "step": 60949 + }, + { + "epoch": 5.6872259027712975, + "grad_norm": NaN, + "learning_rate": 2.137621643439252e-06, + "loss": 0.0, + "step": 60950 + }, + { + "epoch": 5.687319212466175, + "grad_norm": NaN, + "learning_rate": 2.1363494388394352e-06, + "loss": 0.0, + "step": 60951 + }, + { + "epoch": 5.687412522161052, + "grad_norm": NaN, + "learning_rate": 2.1350776102130872e-06, + "loss": 0.0, + "step": 60952 + }, + { + "epoch": 5.68750583185593, + "grad_norm": NaN, + "learning_rate": 2.133806157563439e-06, + "loss": 0.0, + "step": 60953 + }, + { + "epoch": 5.687599141550807, + "grad_norm": NaN, + "learning_rate": 2.1325350808937046e-06, + "loss": 0.0, + "step": 60954 + }, + { + "epoch": 5.687692451245685, + "grad_norm": NaN, + "learning_rate": 2.1312643802071315e-06, + "loss": 0.0, + "step": 60955 + }, + { + "epoch": 5.687785760940562, + "grad_norm": NaN, + "learning_rate": 2.129994055506984e-06, + "loss": 0.0, + "step": 60956 + }, + { + "epoch": 5.687879070635439, + "grad_norm": NaN, + "learning_rate": 2.1287241067964422e-06, + "loss": 0.0, + "step": 60957 + }, + { + "epoch": 5.687972380330316, + "grad_norm": NaN, + "learning_rate": 2.1274545340787376e-06, + "loss": 0.0, + "step": 60958 + }, + { + "epoch": 5.688065690025193, + "grad_norm": NaN, + "learning_rate": 2.126185337357117e-06, + "loss": 0.0, + "step": 60959 + }, + { + "epoch": 5.688158999720071, + "grad_norm": NaN, + "learning_rate": 2.1249165166348113e-06, + "loss": 0.0, + "step": 60960 + }, + { + "epoch": 5.688252309414948, + "grad_norm": NaN, + "learning_rate": 2.123648071915035e-06, + "loss": 0.0, + "step": 60961 + }, + { + "epoch": 5.688345619109826, + "grad_norm": NaN, + "learning_rate": 2.122380003201035e-06, + "loss": 0.0, + "step": 60962 + }, + { + "epoch": 5.688438928804703, + "grad_norm": NaN, + "learning_rate": 2.1211123104959927e-06, + "loss": 0.0, + "step": 60963 + }, + { + "epoch": 5.6885322384995805, + "grad_norm": NaN, + "learning_rate": 2.1198449938031715e-06, + "loss": 0.0, + "step": 60964 + }, + { + "epoch": 5.688625548194457, + "grad_norm": NaN, + "learning_rate": 2.118578053125769e-06, + "loss": 0.0, + "step": 60965 + }, + { + "epoch": 5.688718857889334, + "grad_norm": NaN, + "learning_rate": 2.117311488467033e-06, + "loss": 0.0, + "step": 60966 + }, + { + "epoch": 5.688812167584212, + "grad_norm": NaN, + "learning_rate": 2.116045299830144e-06, + "loss": 0.0, + "step": 60967 + }, + { + "epoch": 5.688905477279089, + "grad_norm": NaN, + "learning_rate": 2.1147794872183654e-06, + "loss": 0.0, + "step": 60968 + }, + { + "epoch": 5.688998786973967, + "grad_norm": NaN, + "learning_rate": 2.1135140506348958e-06, + "loss": 0.0, + "step": 60969 + }, + { + "epoch": 5.689092096668844, + "grad_norm": NaN, + "learning_rate": 2.112248990082932e-06, + "loss": 0.0, + "step": 60970 + }, + { + "epoch": 5.6891854063637215, + "grad_norm": NaN, + "learning_rate": 2.1109843055657383e-06, + "loss": 0.0, + "step": 60971 + }, + { + "epoch": 5.689278716058599, + "grad_norm": NaN, + "learning_rate": 2.1097199970864786e-06, + "loss": 0.0, + "step": 60972 + }, + { + "epoch": 5.6893720257534754, + "grad_norm": NaN, + "learning_rate": 2.1084560646484174e-06, + "loss": 0.0, + "step": 60973 + }, + { + "epoch": 5.689465335448353, + "grad_norm": NaN, + "learning_rate": 2.107192508254718e-06, + "loss": 0.0, + "step": 60974 + }, + { + "epoch": 5.68955864514323, + "grad_norm": NaN, + "learning_rate": 2.1059293279086455e-06, + "loss": 0.0, + "step": 60975 + }, + { + "epoch": 5.689651954838108, + "grad_norm": NaN, + "learning_rate": 2.104666523613363e-06, + "loss": 0.0, + "step": 60976 + }, + { + "epoch": 5.689745264532985, + "grad_norm": NaN, + "learning_rate": 2.1034040953721187e-06, + "loss": 0.0, + "step": 60977 + }, + { + "epoch": 5.6898385742278625, + "grad_norm": NaN, + "learning_rate": 2.1021420431880933e-06, + "loss": 0.0, + "step": 60978 + }, + { + "epoch": 5.689931883922739, + "grad_norm": NaN, + "learning_rate": 2.100880367064517e-06, + "loss": 0.0, + "step": 60979 + }, + { + "epoch": 5.6900251936176165, + "grad_norm": NaN, + "learning_rate": 2.0996190670045875e-06, + "loss": 0.0, + "step": 60980 + }, + { + "epoch": 5.690118503312494, + "grad_norm": NaN, + "learning_rate": 2.0983581430115192e-06, + "loss": 0.0, + "step": 60981 + }, + { + "epoch": 5.690211813007371, + "grad_norm": NaN, + "learning_rate": 2.097097595088526e-06, + "loss": 0.0, + "step": 60982 + }, + { + "epoch": 5.690305122702249, + "grad_norm": NaN, + "learning_rate": 2.0958374232387886e-06, + "loss": 0.0, + "step": 60983 + }, + { + "epoch": 5.690398432397126, + "grad_norm": NaN, + "learning_rate": 2.0945776274655213e-06, + "loss": 0.0, + "step": 60984 + }, + { + "epoch": 5.690491742092004, + "grad_norm": NaN, + "learning_rate": 2.093318207771938e-06, + "loss": 0.0, + "step": 60985 + }, + { + "epoch": 5.690585051786881, + "grad_norm": NaN, + "learning_rate": 2.092059164161236e-06, + "loss": 0.0, + "step": 60986 + }, + { + "epoch": 5.690678361481758, + "grad_norm": NaN, + "learning_rate": 2.0908004966365966e-06, + "loss": 0.0, + "step": 60987 + }, + { + "epoch": 5.690771671176635, + "grad_norm": NaN, + "learning_rate": 2.08954220520125e-06, + "loss": 0.0, + "step": 60988 + }, + { + "epoch": 5.690864980871512, + "grad_norm": NaN, + "learning_rate": 2.088284289858394e-06, + "loss": 0.0, + "step": 60989 + }, + { + "epoch": 5.69095829056639, + "grad_norm": NaN, + "learning_rate": 2.0870267506111926e-06, + "loss": 0.0, + "step": 60990 + }, + { + "epoch": 5.691051600261267, + "grad_norm": NaN, + "learning_rate": 2.0857695874628932e-06, + "loss": 0.0, + "step": 60991 + }, + { + "epoch": 5.691144909956145, + "grad_norm": NaN, + "learning_rate": 2.084512800416643e-06, + "loss": 0.0, + "step": 60992 + }, + { + "epoch": 5.691238219651022, + "grad_norm": NaN, + "learning_rate": 2.083256389475657e-06, + "loss": 0.0, + "step": 60993 + }, + { + "epoch": 5.691331529345899, + "grad_norm": NaN, + "learning_rate": 2.082000354643165e-06, + "loss": 0.0, + "step": 60994 + }, + { + "epoch": 5.691424839040776, + "grad_norm": NaN, + "learning_rate": 2.080744695922315e-06, + "loss": 0.0, + "step": 60995 + }, + { + "epoch": 5.691518148735653, + "grad_norm": NaN, + "learning_rate": 2.0794894133163043e-06, + "loss": 0.0, + "step": 60996 + }, + { + "epoch": 5.691611458430531, + "grad_norm": NaN, + "learning_rate": 2.078234506828347e-06, + "loss": 0.0, + "step": 60997 + }, + { + "epoch": 5.691704768125408, + "grad_norm": NaN, + "learning_rate": 2.0769799764616067e-06, + "loss": 0.0, + "step": 60998 + }, + { + "epoch": 5.691798077820286, + "grad_norm": NaN, + "learning_rate": 2.0757258222192987e-06, + "loss": 0.0, + "step": 60999 + }, + { + "epoch": 5.691891387515163, + "grad_norm": NaN, + "learning_rate": 2.0744720441046027e-06, + "loss": 0.0, + "step": 61000 + }, + { + "epoch": 5.6919846972100405, + "grad_norm": NaN, + "learning_rate": 2.0732186421207166e-06, + "loss": 0.0, + "step": 61001 + }, + { + "epoch": 5.692078006904917, + "grad_norm": NaN, + "learning_rate": 2.0719656162707876e-06, + "loss": 0.0, + "step": 61002 + }, + { + "epoch": 5.6921713165997945, + "grad_norm": NaN, + "learning_rate": 2.070712966558047e-06, + "loss": 0.0, + "step": 61003 + }, + { + "epoch": 5.692264626294672, + "grad_norm": NaN, + "learning_rate": 2.069460692985675e-06, + "loss": 0.0, + "step": 61004 + }, + { + "epoch": 5.692357935989549, + "grad_norm": NaN, + "learning_rate": 2.068208795556836e-06, + "loss": 0.0, + "step": 61005 + }, + { + "epoch": 5.692451245684427, + "grad_norm": NaN, + "learning_rate": 2.066957274274711e-06, + "loss": 0.0, + "step": 61006 + }, + { + "epoch": 5.692544555379304, + "grad_norm": NaN, + "learning_rate": 2.065706129142497e-06, + "loss": 0.0, + "step": 61007 + }, + { + "epoch": 5.692637865074182, + "grad_norm": NaN, + "learning_rate": 2.064455360163375e-06, + "loss": 0.0, + "step": 61008 + }, + { + "epoch": 5.692731174769058, + "grad_norm": NaN, + "learning_rate": 2.0632049673405093e-06, + "loss": 0.0, + "step": 61009 + }, + { + "epoch": 5.6928244844639355, + "grad_norm": NaN, + "learning_rate": 2.0619549506771137e-06, + "loss": 0.0, + "step": 61010 + }, + { + "epoch": 5.692917794158813, + "grad_norm": NaN, + "learning_rate": 2.0607053101763192e-06, + "loss": 0.0, + "step": 61011 + }, + { + "epoch": 5.69301110385369, + "grad_norm": NaN, + "learning_rate": 2.0594560458413566e-06, + "loss": 0.0, + "step": 61012 + }, + { + "epoch": 5.693104413548568, + "grad_norm": NaN, + "learning_rate": 2.058207157675357e-06, + "loss": 0.0, + "step": 61013 + }, + { + "epoch": 5.693197723243445, + "grad_norm": NaN, + "learning_rate": 2.0569586456815e-06, + "loss": 0.0, + "step": 61014 + }, + { + "epoch": 5.693291032938323, + "grad_norm": NaN, + "learning_rate": 2.055710509863001e-06, + "loss": 0.0, + "step": 61015 + }, + { + "epoch": 5.6933843426332, + "grad_norm": NaN, + "learning_rate": 2.0544627502229905e-06, + "loss": 0.0, + "step": 61016 + }, + { + "epoch": 5.693477652328077, + "grad_norm": NaN, + "learning_rate": 2.053215366764649e-06, + "loss": 0.0, + "step": 61017 + }, + { + "epoch": 5.693570962022954, + "grad_norm": NaN, + "learning_rate": 2.051968359491174e-06, + "loss": 0.0, + "step": 61018 + }, + { + "epoch": 5.693664271717831, + "grad_norm": NaN, + "learning_rate": 2.0507217284057133e-06, + "loss": 0.0, + "step": 61019 + }, + { + "epoch": 5.693757581412709, + "grad_norm": NaN, + "learning_rate": 2.0494754735114303e-06, + "loss": 0.0, + "step": 61020 + }, + { + "epoch": 5.693850891107586, + "grad_norm": NaN, + "learning_rate": 2.048229594811507e-06, + "loss": 0.0, + "step": 61021 + }, + { + "epoch": 5.693944200802464, + "grad_norm": NaN, + "learning_rate": 2.046984092309123e-06, + "loss": 0.0, + "step": 61022 + }, + { + "epoch": 5.69403751049734, + "grad_norm": NaN, + "learning_rate": 2.0457389660074097e-06, + "loss": 0.0, + "step": 61023 + }, + { + "epoch": 5.694130820192218, + "grad_norm": NaN, + "learning_rate": 2.044494215909581e-06, + "loss": 0.0, + "step": 61024 + }, + { + "epoch": 5.694224129887095, + "grad_norm": NaN, + "learning_rate": 2.0432498420187516e-06, + "loss": 0.0, + "step": 61025 + }, + { + "epoch": 5.6943174395819725, + "grad_norm": NaN, + "learning_rate": 2.042005844338135e-06, + "loss": 0.0, + "step": 61026 + }, + { + "epoch": 5.69441074927685, + "grad_norm": NaN, + "learning_rate": 2.040762222870845e-06, + "loss": 0.0, + "step": 61027 + }, + { + "epoch": 5.694504058971727, + "grad_norm": NaN, + "learning_rate": 2.0395189776200805e-06, + "loss": 0.0, + "step": 61028 + }, + { + "epoch": 5.694597368666605, + "grad_norm": NaN, + "learning_rate": 2.0382761085889875e-06, + "loss": 0.0, + "step": 61029 + }, + { + "epoch": 5.694690678361482, + "grad_norm": NaN, + "learning_rate": 2.037033615780731e-06, + "loss": 0.0, + "step": 61030 + }, + { + "epoch": 5.6947839880563595, + "grad_norm": NaN, + "learning_rate": 2.035791499198458e-06, + "loss": 0.0, + "step": 61031 + }, + { + "epoch": 5.694877297751236, + "grad_norm": NaN, + "learning_rate": 2.0345497588453495e-06, + "loss": 0.0, + "step": 61032 + }, + { + "epoch": 5.6949706074461135, + "grad_norm": NaN, + "learning_rate": 2.033308394724553e-06, + "loss": 0.0, + "step": 61033 + }, + { + "epoch": 5.695063917140991, + "grad_norm": NaN, + "learning_rate": 2.0320674068391997e-06, + "loss": 0.0, + "step": 61034 + }, + { + "epoch": 5.695157226835868, + "grad_norm": NaN, + "learning_rate": 2.030826795192486e-06, + "loss": 0.0, + "step": 61035 + }, + { + "epoch": 5.695250536530746, + "grad_norm": NaN, + "learning_rate": 2.0295865597875603e-06, + "loss": 0.0, + "step": 61036 + }, + { + "epoch": 5.695343846225623, + "grad_norm": NaN, + "learning_rate": 2.0283467006275368e-06, + "loss": 0.0, + "step": 61037 + }, + { + "epoch": 5.6954371559205, + "grad_norm": NaN, + "learning_rate": 2.0271072177156123e-06, + "loss": 0.0, + "step": 61038 + }, + { + "epoch": 5.695530465615377, + "grad_norm": NaN, + "learning_rate": 2.025868111054918e-06, + "loss": 0.0, + "step": 61039 + }, + { + "epoch": 5.6956237753102545, + "grad_norm": NaN, + "learning_rate": 2.024629380648585e-06, + "loss": 0.0, + "step": 61040 + }, + { + "epoch": 5.695717085005132, + "grad_norm": NaN, + "learning_rate": 2.0233910264998263e-06, + "loss": 0.0, + "step": 61041 + }, + { + "epoch": 5.695810394700009, + "grad_norm": NaN, + "learning_rate": 2.0221530486117245e-06, + "loss": 0.0, + "step": 61042 + }, + { + "epoch": 5.695903704394887, + "grad_norm": NaN, + "learning_rate": 2.020915446987459e-06, + "loss": 0.0, + "step": 61043 + }, + { + "epoch": 5.695997014089764, + "grad_norm": NaN, + "learning_rate": 2.0196782216301777e-06, + "loss": 0.0, + "step": 61044 + }, + { + "epoch": 5.696090323784642, + "grad_norm": NaN, + "learning_rate": 2.0184413725430114e-06, + "loss": 0.0, + "step": 61045 + }, + { + "epoch": 5.696183633479518, + "grad_norm": NaN, + "learning_rate": 2.0172048997291078e-06, + "loss": 0.0, + "step": 61046 + }, + { + "epoch": 5.696276943174396, + "grad_norm": NaN, + "learning_rate": 2.015968803191631e-06, + "loss": 0.0, + "step": 61047 + }, + { + "epoch": 5.696370252869273, + "grad_norm": NaN, + "learning_rate": 2.0147330829337114e-06, + "loss": 0.0, + "step": 61048 + }, + { + "epoch": 5.69646356256415, + "grad_norm": NaN, + "learning_rate": 2.0134977389584804e-06, + "loss": 0.0, + "step": 61049 + }, + { + "epoch": 5.696556872259028, + "grad_norm": NaN, + "learning_rate": 2.0122627712690854e-06, + "loss": 0.0, + "step": 61050 + }, + { + "epoch": 5.696650181953905, + "grad_norm": NaN, + "learning_rate": 2.0110281798686735e-06, + "loss": 0.0, + "step": 61051 + }, + { + "epoch": 5.696743491648783, + "grad_norm": NaN, + "learning_rate": 2.0097939647603755e-06, + "loss": 0.0, + "step": 61052 + }, + { + "epoch": 5.696836801343659, + "grad_norm": NaN, + "learning_rate": 2.0085601259473392e-06, + "loss": 0.0, + "step": 61053 + }, + { + "epoch": 5.696930111038537, + "grad_norm": NaN, + "learning_rate": 2.007326663432679e-06, + "loss": 0.0, + "step": 61054 + }, + { + "epoch": 5.697023420733414, + "grad_norm": NaN, + "learning_rate": 2.006093577219575e-06, + "loss": 0.0, + "step": 61055 + }, + { + "epoch": 5.6971167304282915, + "grad_norm": NaN, + "learning_rate": 2.0048608673111255e-06, + "loss": 0.0, + "step": 61056 + }, + { + "epoch": 5.697210040123169, + "grad_norm": NaN, + "learning_rate": 2.0036285337104606e-06, + "loss": 0.0, + "step": 61057 + }, + { + "epoch": 5.697303349818046, + "grad_norm": NaN, + "learning_rate": 2.002396576420745e-06, + "loss": 0.0, + "step": 61058 + }, + { + "epoch": 5.697396659512924, + "grad_norm": NaN, + "learning_rate": 2.0011649954450927e-06, + "loss": 0.0, + "step": 61059 + }, + { + "epoch": 5.697489969207801, + "grad_norm": NaN, + "learning_rate": 1.999933790786634e-06, + "loss": 0.0, + "step": 61060 + }, + { + "epoch": 5.697583278902678, + "grad_norm": NaN, + "learning_rate": 1.9987029624485005e-06, + "loss": 0.0, + "step": 61061 + }, + { + "epoch": 5.697676588597555, + "grad_norm": NaN, + "learning_rate": 1.997472510433823e-06, + "loss": 0.0, + "step": 61062 + }, + { + "epoch": 5.6977698982924325, + "grad_norm": NaN, + "learning_rate": 1.9962424347457486e-06, + "loss": 0.0, + "step": 61063 + }, + { + "epoch": 5.69786320798731, + "grad_norm": NaN, + "learning_rate": 1.9950127353873745e-06, + "loss": 0.0, + "step": 61064 + }, + { + "epoch": 5.697956517682187, + "grad_norm": NaN, + "learning_rate": 1.9937834123618324e-06, + "loss": 0.0, + "step": 61065 + }, + { + "epoch": 5.698049827377065, + "grad_norm": NaN, + "learning_rate": 1.992554465672269e-06, + "loss": 0.0, + "step": 61066 + }, + { + "epoch": 5.698143137071941, + "grad_norm": NaN, + "learning_rate": 1.991325895321799e-06, + "loss": 0.0, + "step": 61067 + }, + { + "epoch": 5.698236446766819, + "grad_norm": NaN, + "learning_rate": 1.9900977013135367e-06, + "loss": 0.0, + "step": 61068 + }, + { + "epoch": 5.698329756461696, + "grad_norm": NaN, + "learning_rate": 1.9888698836505957e-06, + "loss": 0.0, + "step": 61069 + }, + { + "epoch": 5.698423066156574, + "grad_norm": NaN, + "learning_rate": 1.98764244233614e-06, + "loss": 0.0, + "step": 61070 + }, + { + "epoch": 5.698516375851451, + "grad_norm": NaN, + "learning_rate": 1.9864153773732518e-06, + "loss": 0.0, + "step": 61071 + }, + { + "epoch": 5.698609685546328, + "grad_norm": NaN, + "learning_rate": 1.98518868876506e-06, + "loss": 0.0, + "step": 61072 + }, + { + "epoch": 5.698702995241206, + "grad_norm": NaN, + "learning_rate": 1.983962376514714e-06, + "loss": 0.0, + "step": 61073 + }, + { + "epoch": 5.698796304936083, + "grad_norm": NaN, + "learning_rate": 1.9827364406252934e-06, + "loss": 0.0, + "step": 61074 + }, + { + "epoch": 5.69888961463096, + "grad_norm": NaN, + "learning_rate": 1.9815108810999126e-06, + "loss": 0.0, + "step": 61075 + }, + { + "epoch": 5.698982924325837, + "grad_norm": NaN, + "learning_rate": 1.9802856979417194e-06, + "loss": 0.0, + "step": 61076 + }, + { + "epoch": 5.699076234020715, + "grad_norm": NaN, + "learning_rate": 1.979060891153811e-06, + "loss": 0.0, + "step": 61077 + }, + { + "epoch": 5.699169543715592, + "grad_norm": NaN, + "learning_rate": 1.977836460739285e-06, + "loss": 0.0, + "step": 61078 + }, + { + "epoch": 5.6992628534104695, + "grad_norm": NaN, + "learning_rate": 1.9766124067013058e-06, + "loss": 0.0, + "step": 61079 + }, + { + "epoch": 5.699356163105347, + "grad_norm": NaN, + "learning_rate": 1.9753887290429372e-06, + "loss": 0.0, + "step": 61080 + }, + { + "epoch": 5.699449472800224, + "grad_norm": NaN, + "learning_rate": 1.9741654277672934e-06, + "loss": 0.0, + "step": 61081 + }, + { + "epoch": 5.699542782495101, + "grad_norm": NaN, + "learning_rate": 1.9729425028775223e-06, + "loss": 0.0, + "step": 61082 + }, + { + "epoch": 5.699636092189978, + "grad_norm": NaN, + "learning_rate": 1.9717199543767046e-06, + "loss": 0.0, + "step": 61083 + }, + { + "epoch": 5.699729401884856, + "grad_norm": NaN, + "learning_rate": 1.970497782267938e-06, + "loss": 0.0, + "step": 61084 + }, + { + "epoch": 5.699822711579733, + "grad_norm": NaN, + "learning_rate": 1.9692759865543694e-06, + "loss": 0.0, + "step": 61085 + }, + { + "epoch": 5.6999160212746105, + "grad_norm": NaN, + "learning_rate": 1.9680545672390635e-06, + "loss": 0.0, + "step": 61086 + }, + { + "epoch": 5.700009330969488, + "grad_norm": NaN, + "learning_rate": 1.966833524325151e-06, + "loss": 0.0, + "step": 61087 + }, + { + "epoch": 5.700102640664365, + "grad_norm": NaN, + "learning_rate": 1.9656128578157294e-06, + "loss": 0.0, + "step": 61088 + }, + { + "epoch": 5.700195950359243, + "grad_norm": NaN, + "learning_rate": 1.964392567713913e-06, + "loss": 0.0, + "step": 61089 + }, + { + "epoch": 5.700289260054119, + "grad_norm": NaN, + "learning_rate": 1.9631726540227655e-06, + "loss": 0.0, + "step": 61090 + }, + { + "epoch": 5.700382569748997, + "grad_norm": NaN, + "learning_rate": 1.9619531167454515e-06, + "loss": 0.0, + "step": 61091 + }, + { + "epoch": 5.700475879443874, + "grad_norm": NaN, + "learning_rate": 1.9607339558850352e-06, + "loss": 0.0, + "step": 61092 + }, + { + "epoch": 5.7005691891387515, + "grad_norm": NaN, + "learning_rate": 1.959515171444598e-06, + "loss": 0.0, + "step": 61093 + }, + { + "epoch": 5.700662498833629, + "grad_norm": NaN, + "learning_rate": 1.958296763427286e-06, + "loss": 0.0, + "step": 61094 + }, + { + "epoch": 5.700755808528506, + "grad_norm": NaN, + "learning_rate": 1.957078731836148e-06, + "loss": 0.0, + "step": 61095 + }, + { + "epoch": 5.700849118223383, + "grad_norm": NaN, + "learning_rate": 1.9558610766743145e-06, + "loss": 0.0, + "step": 61096 + }, + { + "epoch": 5.70094242791826, + "grad_norm": NaN, + "learning_rate": 1.9546437979448825e-06, + "loss": 0.0, + "step": 61097 + }, + { + "epoch": 5.701035737613138, + "grad_norm": NaN, + "learning_rate": 1.9534268956509334e-06, + "loss": 0.0, + "step": 61098 + }, + { + "epoch": 5.701129047308015, + "grad_norm": NaN, + "learning_rate": 1.952210369795548e-06, + "loss": 0.0, + "step": 61099 + }, + { + "epoch": 5.701222357002893, + "grad_norm": NaN, + "learning_rate": 1.9509942203818575e-06, + "loss": 0.0, + "step": 61100 + }, + { + "epoch": 5.70131566669777, + "grad_norm": NaN, + "learning_rate": 1.9497784474129254e-06, + "loss": 0.0, + "step": 61101 + }, + { + "epoch": 5.701408976392647, + "grad_norm": NaN, + "learning_rate": 1.9485630508918493e-06, + "loss": 0.0, + "step": 61102 + }, + { + "epoch": 5.701502286087525, + "grad_norm": NaN, + "learning_rate": 1.947348030821727e-06, + "loss": 0.0, + "step": 61103 + }, + { + "epoch": 5.701595595782402, + "grad_norm": NaN, + "learning_rate": 1.9461333872056227e-06, + "loss": 0.0, + "step": 61104 + }, + { + "epoch": 5.701688905477279, + "grad_norm": NaN, + "learning_rate": 1.9449191200466674e-06, + "loss": 0.0, + "step": 61105 + }, + { + "epoch": 5.701782215172156, + "grad_norm": NaN, + "learning_rate": 1.943705229347925e-06, + "loss": 0.0, + "step": 61106 + }, + { + "epoch": 5.701875524867034, + "grad_norm": NaN, + "learning_rate": 1.94249171511246e-06, + "loss": 0.0, + "step": 61107 + }, + { + "epoch": 5.701968834561911, + "grad_norm": NaN, + "learning_rate": 1.9412785773434025e-06, + "loss": 0.0, + "step": 61108 + }, + { + "epoch": 5.7020621442567885, + "grad_norm": NaN, + "learning_rate": 1.940065816043818e-06, + "loss": 0.0, + "step": 61109 + }, + { + "epoch": 5.702155453951666, + "grad_norm": NaN, + "learning_rate": 1.9388534312167527e-06, + "loss": 0.0, + "step": 61110 + }, + { + "epoch": 5.702248763646542, + "grad_norm": NaN, + "learning_rate": 1.9376414228653712e-06, + "loss": 0.0, + "step": 61111 + }, + { + "epoch": 5.70234207334142, + "grad_norm": NaN, + "learning_rate": 1.9364297909926884e-06, + "loss": 0.0, + "step": 61112 + }, + { + "epoch": 5.702435383036297, + "grad_norm": NaN, + "learning_rate": 1.935218535601785e-06, + "loss": 0.0, + "step": 61113 + }, + { + "epoch": 5.702528692731175, + "grad_norm": NaN, + "learning_rate": 1.934007656695791e-06, + "loss": 0.0, + "step": 61114 + }, + { + "epoch": 5.702622002426052, + "grad_norm": NaN, + "learning_rate": 1.9327971542777544e-06, + "loss": 0.0, + "step": 61115 + }, + { + "epoch": 5.7027153121209295, + "grad_norm": NaN, + "learning_rate": 1.9315870283507227e-06, + "loss": 0.0, + "step": 61116 + }, + { + "epoch": 5.702808621815807, + "grad_norm": NaN, + "learning_rate": 1.930377278917844e-06, + "loss": 0.0, + "step": 61117 + }, + { + "epoch": 5.702901931510684, + "grad_norm": NaN, + "learning_rate": 1.9291679059821318e-06, + "loss": 0.0, + "step": 61118 + }, + { + "epoch": 5.702995241205561, + "grad_norm": NaN, + "learning_rate": 1.9279589095466674e-06, + "loss": 0.0, + "step": 61119 + }, + { + "epoch": 5.703088550900438, + "grad_norm": NaN, + "learning_rate": 1.9267502896145816e-06, + "loss": 0.0, + "step": 61120 + }, + { + "epoch": 5.703181860595316, + "grad_norm": NaN, + "learning_rate": 1.9255420461888882e-06, + "loss": 0.0, + "step": 61121 + }, + { + "epoch": 5.703275170290193, + "grad_norm": NaN, + "learning_rate": 1.924334179272652e-06, + "loss": 0.0, + "step": 61122 + }, + { + "epoch": 5.703368479985071, + "grad_norm": NaN, + "learning_rate": 1.92312668886902e-06, + "loss": 0.0, + "step": 61123 + }, + { + "epoch": 5.703461789679948, + "grad_norm": NaN, + "learning_rate": 1.9219195749809737e-06, + "loss": 0.0, + "step": 61124 + }, + { + "epoch": 5.703555099374825, + "grad_norm": NaN, + "learning_rate": 1.9207128376116264e-06, + "loss": 0.0, + "step": 61125 + }, + { + "epoch": 5.703648409069702, + "grad_norm": NaN, + "learning_rate": 1.919506476764077e-06, + "loss": 0.0, + "step": 61126 + }, + { + "epoch": 5.703741718764579, + "grad_norm": NaN, + "learning_rate": 1.9183004924413215e-06, + "loss": 0.0, + "step": 61127 + }, + { + "epoch": 5.703835028459457, + "grad_norm": NaN, + "learning_rate": 1.917094884646475e-06, + "loss": 0.0, + "step": 61128 + }, + { + "epoch": 5.703928338154334, + "grad_norm": NaN, + "learning_rate": 1.9158896533826016e-06, + "loss": 0.0, + "step": 61129 + }, + { + "epoch": 5.704021647849212, + "grad_norm": NaN, + "learning_rate": 1.9146847986527327e-06, + "loss": 0.0, + "step": 61130 + }, + { + "epoch": 5.704114957544089, + "grad_norm": NaN, + "learning_rate": 1.913480320459965e-06, + "loss": 0.0, + "step": 61131 + }, + { + "epoch": 5.7042082672389665, + "grad_norm": NaN, + "learning_rate": 1.9122762188073624e-06, + "loss": 0.0, + "step": 61132 + }, + { + "epoch": 5.704301576933844, + "grad_norm": NaN, + "learning_rate": 1.911072493697957e-06, + "loss": 0.0, + "step": 61133 + }, + { + "epoch": 5.70439488662872, + "grad_norm": NaN, + "learning_rate": 1.9098691451348293e-06, + "loss": 0.0, + "step": 61134 + }, + { + "epoch": 5.704488196323598, + "grad_norm": NaN, + "learning_rate": 1.9086661731210427e-06, + "loss": 0.0, + "step": 61135 + }, + { + "epoch": 5.704581506018475, + "grad_norm": NaN, + "learning_rate": 1.907463577659646e-06, + "loss": 0.0, + "step": 61136 + }, + { + "epoch": 5.704674815713353, + "grad_norm": NaN, + "learning_rate": 1.9062613587537023e-06, + "loss": 0.0, + "step": 61137 + }, + { + "epoch": 5.70476812540823, + "grad_norm": NaN, + "learning_rate": 1.9050595164062765e-06, + "loss": 0.0, + "step": 61138 + }, + { + "epoch": 5.7048614351031075, + "grad_norm": NaN, + "learning_rate": 1.9038580506203993e-06, + "loss": 0.0, + "step": 61139 + }, + { + "epoch": 5.704954744797984, + "grad_norm": NaN, + "learning_rate": 1.9026569613991516e-06, + "loss": 0.0, + "step": 61140 + }, + { + "epoch": 5.7050480544928615, + "grad_norm": NaN, + "learning_rate": 1.9014562487455809e-06, + "loss": 0.0, + "step": 61141 + }, + { + "epoch": 5.705141364187739, + "grad_norm": NaN, + "learning_rate": 1.9002559126627181e-06, + "loss": 0.0, + "step": 61142 + }, + { + "epoch": 5.705234673882616, + "grad_norm": NaN, + "learning_rate": 1.8990559531536608e-06, + "loss": 0.0, + "step": 61143 + }, + { + "epoch": 5.705327983577494, + "grad_norm": NaN, + "learning_rate": 1.8978563702214233e-06, + "loss": 0.0, + "step": 61144 + }, + { + "epoch": 5.705421293272371, + "grad_norm": NaN, + "learning_rate": 1.8966571638690532e-06, + "loss": 0.0, + "step": 61145 + }, + { + "epoch": 5.7055146029672485, + "grad_norm": NaN, + "learning_rate": 1.895458334099631e-06, + "loss": 0.0, + "step": 61146 + }, + { + "epoch": 5.705607912662126, + "grad_norm": NaN, + "learning_rate": 1.8942598809161713e-06, + "loss": 0.0, + "step": 61147 + }, + { + "epoch": 5.705701222357003, + "grad_norm": NaN, + "learning_rate": 1.8930618043217383e-06, + "loss": 0.0, + "step": 61148 + }, + { + "epoch": 5.70579453205188, + "grad_norm": NaN, + "learning_rate": 1.891864104319396e-06, + "loss": 0.0, + "step": 61149 + }, + { + "epoch": 5.705887841746757, + "grad_norm": NaN, + "learning_rate": 1.890666780912159e-06, + "loss": 0.0, + "step": 61150 + }, + { + "epoch": 5.705981151441635, + "grad_norm": NaN, + "learning_rate": 1.8894698341030578e-06, + "loss": 0.0, + "step": 61151 + }, + { + "epoch": 5.706074461136512, + "grad_norm": NaN, + "learning_rate": 1.8882732638951902e-06, + "loss": 0.0, + "step": 61152 + }, + { + "epoch": 5.70616777083139, + "grad_norm": NaN, + "learning_rate": 1.8870770702915705e-06, + "loss": 0.0, + "step": 61153 + }, + { + "epoch": 5.706261080526267, + "grad_norm": NaN, + "learning_rate": 1.8858812532952294e-06, + "loss": 0.0, + "step": 61154 + }, + { + "epoch": 5.7063543902211435, + "grad_norm": NaN, + "learning_rate": 1.8846858129092313e-06, + "loss": 0.0, + "step": 61155 + }, + { + "epoch": 5.706447699916021, + "grad_norm": NaN, + "learning_rate": 1.8834907491365903e-06, + "loss": 0.0, + "step": 61156 + }, + { + "epoch": 5.706541009610898, + "grad_norm": NaN, + "learning_rate": 1.8822960619803373e-06, + "loss": 0.0, + "step": 61157 + }, + { + "epoch": 5.706634319305776, + "grad_norm": NaN, + "learning_rate": 1.88110175144357e-06, + "loss": 0.0, + "step": 61158 + }, + { + "epoch": 5.706727629000653, + "grad_norm": NaN, + "learning_rate": 1.8799078175292525e-06, + "loss": 0.0, + "step": 61159 + }, + { + "epoch": 5.706820938695531, + "grad_norm": NaN, + "learning_rate": 1.8787142602404658e-06, + "loss": 0.0, + "step": 61160 + }, + { + "epoch": 5.706914248390408, + "grad_norm": NaN, + "learning_rate": 1.8775210795802408e-06, + "loss": 0.0, + "step": 61161 + }, + { + "epoch": 5.7070075580852855, + "grad_norm": NaN, + "learning_rate": 1.8763282755515918e-06, + "loss": 0.0, + "step": 61162 + }, + { + "epoch": 5.707100867780162, + "grad_norm": NaN, + "learning_rate": 1.8751358481575497e-06, + "loss": 0.0, + "step": 61163 + }, + { + "epoch": 5.707194177475039, + "grad_norm": NaN, + "learning_rate": 1.873943797401195e-06, + "loss": 0.0, + "step": 61164 + }, + { + "epoch": 5.707287487169917, + "grad_norm": NaN, + "learning_rate": 1.8727521232854924e-06, + "loss": 0.0, + "step": 61165 + }, + { + "epoch": 5.707380796864794, + "grad_norm": NaN, + "learning_rate": 1.8715608258135062e-06, + "loss": 0.0, + "step": 61166 + }, + { + "epoch": 5.707474106559672, + "grad_norm": NaN, + "learning_rate": 1.8703699049882837e-06, + "loss": 0.0, + "step": 61167 + }, + { + "epoch": 5.707567416254549, + "grad_norm": NaN, + "learning_rate": 1.8691793608128057e-06, + "loss": 0.0, + "step": 61168 + }, + { + "epoch": 5.7076607259494265, + "grad_norm": NaN, + "learning_rate": 1.8679891932901202e-06, + "loss": 0.0, + "step": 61169 + }, + { + "epoch": 5.707754035644303, + "grad_norm": NaN, + "learning_rate": 1.8667994024232914e-06, + "loss": 0.0, + "step": 61170 + }, + { + "epoch": 5.7078473453391805, + "grad_norm": NaN, + "learning_rate": 1.8656099882152997e-06, + "loss": 0.0, + "step": 61171 + }, + { + "epoch": 5.707940655034058, + "grad_norm": NaN, + "learning_rate": 1.86442095066916e-06, + "loss": 0.0, + "step": 61172 + }, + { + "epoch": 5.708033964728935, + "grad_norm": NaN, + "learning_rate": 1.863232289787936e-06, + "loss": 0.0, + "step": 61173 + }, + { + "epoch": 5.708127274423813, + "grad_norm": NaN, + "learning_rate": 1.8620440055746254e-06, + "loss": 0.0, + "step": 61174 + }, + { + "epoch": 5.70822058411869, + "grad_norm": NaN, + "learning_rate": 1.860856098032243e-06, + "loss": 0.0, + "step": 61175 + }, + { + "epoch": 5.708313893813568, + "grad_norm": NaN, + "learning_rate": 1.8596685671638523e-06, + "loss": 0.0, + "step": 61176 + }, + { + "epoch": 5.708407203508445, + "grad_norm": NaN, + "learning_rate": 1.858481412972418e-06, + "loss": 0.0, + "step": 61177 + }, + { + "epoch": 5.7085005132033215, + "grad_norm": NaN, + "learning_rate": 1.8572946354609874e-06, + "loss": 0.0, + "step": 61178 + }, + { + "epoch": 5.708593822898199, + "grad_norm": NaN, + "learning_rate": 1.8561082346325751e-06, + "loss": 0.0, + "step": 61179 + }, + { + "epoch": 5.708687132593076, + "grad_norm": NaN, + "learning_rate": 1.854922210490195e-06, + "loss": 0.0, + "step": 61180 + }, + { + "epoch": 5.708780442287954, + "grad_norm": NaN, + "learning_rate": 1.8537365630368617e-06, + "loss": 0.0, + "step": 61181 + }, + { + "epoch": 5.708873751982831, + "grad_norm": NaN, + "learning_rate": 1.8525512922756059e-06, + "loss": 0.0, + "step": 61182 + }, + { + "epoch": 5.708967061677709, + "grad_norm": NaN, + "learning_rate": 1.8513663982094084e-06, + "loss": 0.0, + "step": 61183 + }, + { + "epoch": 5.709060371372585, + "grad_norm": NaN, + "learning_rate": 1.8501818808413172e-06, + "loss": 0.0, + "step": 61184 + }, + { + "epoch": 5.709153681067463, + "grad_norm": NaN, + "learning_rate": 1.8489977401743129e-06, + "loss": 0.0, + "step": 61185 + }, + { + "epoch": 5.70924699076234, + "grad_norm": NaN, + "learning_rate": 1.8478139762114264e-06, + "loss": 0.0, + "step": 61186 + }, + { + "epoch": 5.709340300457217, + "grad_norm": NaN, + "learning_rate": 1.8466305889556722e-06, + "loss": 0.0, + "step": 61187 + }, + { + "epoch": 5.709433610152095, + "grad_norm": NaN, + "learning_rate": 1.8454475784100475e-06, + "loss": 0.0, + "step": 61188 + }, + { + "epoch": 5.709526919846972, + "grad_norm": NaN, + "learning_rate": 1.8442649445775504e-06, + "loss": 0.0, + "step": 61189 + }, + { + "epoch": 5.70962022954185, + "grad_norm": NaN, + "learning_rate": 1.843082687461228e-06, + "loss": 0.0, + "step": 61190 + }, + { + "epoch": 5.709713539236727, + "grad_norm": NaN, + "learning_rate": 1.8419008070640284e-06, + "loss": 0.0, + "step": 61191 + }, + { + "epoch": 5.709806848931604, + "grad_norm": NaN, + "learning_rate": 1.8407193033889989e-06, + "loss": 0.0, + "step": 61192 + }, + { + "epoch": 5.709900158626481, + "grad_norm": NaN, + "learning_rate": 1.839538176439137e-06, + "loss": 0.0, + "step": 61193 + }, + { + "epoch": 5.7099934683213585, + "grad_norm": NaN, + "learning_rate": 1.8383574262174405e-06, + "loss": 0.0, + "step": 61194 + }, + { + "epoch": 5.710086778016236, + "grad_norm": NaN, + "learning_rate": 1.8371770527269069e-06, + "loss": 0.0, + "step": 61195 + }, + { + "epoch": 5.710180087711113, + "grad_norm": NaN, + "learning_rate": 1.8359970559705506e-06, + "loss": 0.0, + "step": 61196 + }, + { + "epoch": 5.710273397405991, + "grad_norm": NaN, + "learning_rate": 1.8348174359513523e-06, + "loss": 0.0, + "step": 61197 + }, + { + "epoch": 5.710366707100868, + "grad_norm": NaN, + "learning_rate": 1.8336381926723264e-06, + "loss": 0.0, + "step": 61198 + }, + { + "epoch": 5.710460016795745, + "grad_norm": NaN, + "learning_rate": 1.8324593261364874e-06, + "loss": 0.0, + "step": 61199 + }, + { + "epoch": 5.710553326490622, + "grad_norm": NaN, + "learning_rate": 1.831280836346799e-06, + "loss": 0.0, + "step": 61200 + }, + { + "epoch": 5.7106466361854995, + "grad_norm": NaN, + "learning_rate": 1.8301027233062593e-06, + "loss": 0.0, + "step": 61201 + }, + { + "epoch": 5.710739945880377, + "grad_norm": NaN, + "learning_rate": 1.828924987017899e-06, + "loss": 0.0, + "step": 61202 + }, + { + "epoch": 5.710833255575254, + "grad_norm": NaN, + "learning_rate": 1.8277476274846826e-06, + "loss": 0.0, + "step": 61203 + }, + { + "epoch": 5.710926565270132, + "grad_norm": NaN, + "learning_rate": 1.8265706447096074e-06, + "loss": 0.0, + "step": 61204 + }, + { + "epoch": 5.711019874965009, + "grad_norm": NaN, + "learning_rate": 1.8253940386956877e-06, + "loss": 0.0, + "step": 61205 + }, + { + "epoch": 5.711113184659887, + "grad_norm": NaN, + "learning_rate": 1.8242178094458715e-06, + "loss": 0.0, + "step": 61206 + }, + { + "epoch": 5.711206494354763, + "grad_norm": NaN, + "learning_rate": 1.8230419569631894e-06, + "loss": 0.0, + "step": 61207 + }, + { + "epoch": 5.7112998040496405, + "grad_norm": NaN, + "learning_rate": 1.8218664812506224e-06, + "loss": 0.0, + "step": 61208 + }, + { + "epoch": 5.711393113744518, + "grad_norm": NaN, + "learning_rate": 1.8206913823111513e-06, + "loss": 0.0, + "step": 61209 + }, + { + "epoch": 5.711486423439395, + "grad_norm": NaN, + "learning_rate": 1.819516660147774e-06, + "loss": 0.0, + "step": 61210 + }, + { + "epoch": 5.711579733134273, + "grad_norm": NaN, + "learning_rate": 1.8183423147634712e-06, + "loss": 0.0, + "step": 61211 + }, + { + "epoch": 5.71167304282915, + "grad_norm": NaN, + "learning_rate": 1.8171683461612241e-06, + "loss": 0.0, + "step": 61212 + }, + { + "epoch": 5.711766352524027, + "grad_norm": NaN, + "learning_rate": 1.8159947543440134e-06, + "loss": 0.0, + "step": 61213 + }, + { + "epoch": 5.711859662218904, + "grad_norm": NaN, + "learning_rate": 1.8148215393148534e-06, + "loss": 0.0, + "step": 61214 + }, + { + "epoch": 5.711952971913782, + "grad_norm": NaN, + "learning_rate": 1.813648701076692e-06, + "loss": 0.0, + "step": 61215 + }, + { + "epoch": 5.712046281608659, + "grad_norm": NaN, + "learning_rate": 1.8124762396325266e-06, + "loss": 0.0, + "step": 61216 + }, + { + "epoch": 5.712139591303536, + "grad_norm": NaN, + "learning_rate": 1.8113041549853547e-06, + "loss": 0.0, + "step": 61217 + }, + { + "epoch": 5.712232900998414, + "grad_norm": NaN, + "learning_rate": 1.8101324471381074e-06, + "loss": 0.0, + "step": 61218 + }, + { + "epoch": 5.712326210693291, + "grad_norm": NaN, + "learning_rate": 1.8089611160938156e-06, + "loss": 0.0, + "step": 61219 + }, + { + "epoch": 5.712419520388169, + "grad_norm": NaN, + "learning_rate": 1.8077901618554601e-06, + "loss": 0.0, + "step": 61220 + }, + { + "epoch": 5.712512830083046, + "grad_norm": NaN, + "learning_rate": 1.8066195844259556e-06, + "loss": 0.0, + "step": 61221 + }, + { + "epoch": 5.712606139777923, + "grad_norm": NaN, + "learning_rate": 1.8054493838083329e-06, + "loss": 0.0, + "step": 61222 + }, + { + "epoch": 5.7126994494728, + "grad_norm": NaN, + "learning_rate": 1.8042795600055726e-06, + "loss": 0.0, + "step": 61223 + }, + { + "epoch": 5.7127927591676775, + "grad_norm": NaN, + "learning_rate": 1.803110113020606e-06, + "loss": 0.0, + "step": 61224 + }, + { + "epoch": 5.712886068862555, + "grad_norm": NaN, + "learning_rate": 1.8019410428564474e-06, + "loss": 0.0, + "step": 61225 + }, + { + "epoch": 5.712979378557432, + "grad_norm": NaN, + "learning_rate": 1.800772349516061e-06, + "loss": 0.0, + "step": 61226 + }, + { + "epoch": 5.71307268825231, + "grad_norm": NaN, + "learning_rate": 1.7996040330023776e-06, + "loss": 0.0, + "step": 61227 + }, + { + "epoch": 5.713165997947186, + "grad_norm": NaN, + "learning_rate": 1.798436093318445e-06, + "loss": 0.0, + "step": 61228 + }, + { + "epoch": 5.713259307642064, + "grad_norm": NaN, + "learning_rate": 1.7972685304671608e-06, + "loss": 0.0, + "step": 61229 + }, + { + "epoch": 5.713352617336941, + "grad_norm": NaN, + "learning_rate": 1.796101344451506e-06, + "loss": 0.0, + "step": 61230 + }, + { + "epoch": 5.7134459270318185, + "grad_norm": NaN, + "learning_rate": 1.7949345352744948e-06, + "loss": 0.0, + "step": 61231 + }, + { + "epoch": 5.713539236726696, + "grad_norm": NaN, + "learning_rate": 1.7937681029390582e-06, + "loss": 0.0, + "step": 61232 + }, + { + "epoch": 5.713632546421573, + "grad_norm": NaN, + "learning_rate": 1.7926020474481606e-06, + "loss": 0.0, + "step": 61233 + }, + { + "epoch": 5.713725856116451, + "grad_norm": NaN, + "learning_rate": 1.7914363688047829e-06, + "loss": 0.0, + "step": 61234 + }, + { + "epoch": 5.713819165811328, + "grad_norm": NaN, + "learning_rate": 1.7902710670118724e-06, + "loss": 0.0, + "step": 61235 + }, + { + "epoch": 5.713912475506205, + "grad_norm": NaN, + "learning_rate": 1.789106142072394e-06, + "loss": 0.0, + "step": 61236 + }, + { + "epoch": 5.714005785201082, + "grad_norm": NaN, + "learning_rate": 1.7879415939893282e-06, + "loss": 0.0, + "step": 61237 + }, + { + "epoch": 5.71409909489596, + "grad_norm": NaN, + "learning_rate": 1.7867774227656062e-06, + "loss": 0.0, + "step": 61238 + }, + { + "epoch": 5.714192404590837, + "grad_norm": NaN, + "learning_rate": 1.7856136284042088e-06, + "loss": 0.0, + "step": 61239 + }, + { + "epoch": 5.714285714285714, + "grad_norm": NaN, + "learning_rate": 1.7844502109081172e-06, + "loss": 0.0, + "step": 61240 + }, + { + "epoch": 5.714379023980592, + "grad_norm": NaN, + "learning_rate": 1.7832871702802288e-06, + "loss": 0.0, + "step": 61241 + }, + { + "epoch": 5.714472333675469, + "grad_norm": NaN, + "learning_rate": 1.7821245065235579e-06, + "loss": 0.0, + "step": 61242 + }, + { + "epoch": 5.714565643370346, + "grad_norm": NaN, + "learning_rate": 1.7809622196410356e-06, + "loss": 0.0, + "step": 61243 + }, + { + "epoch": 5.714658953065223, + "grad_norm": NaN, + "learning_rate": 1.7798003096356096e-06, + "loss": 0.0, + "step": 61244 + }, + { + "epoch": 5.714752262760101, + "grad_norm": NaN, + "learning_rate": 1.7786387765102438e-06, + "loss": 0.0, + "step": 61245 + }, + { + "epoch": 5.714845572454978, + "grad_norm": NaN, + "learning_rate": 1.7774776202679198e-06, + "loss": 0.0, + "step": 61246 + }, + { + "epoch": 5.7149388821498555, + "grad_norm": NaN, + "learning_rate": 1.7763168409115514e-06, + "loss": 0.0, + "step": 61247 + }, + { + "epoch": 5.715032191844733, + "grad_norm": NaN, + "learning_rate": 1.7751564384440864e-06, + "loss": 0.0, + "step": 61248 + }, + { + "epoch": 5.71512550153961, + "grad_norm": NaN, + "learning_rate": 1.7739964128685059e-06, + "loss": 0.0, + "step": 61249 + }, + { + "epoch": 5.715218811234488, + "grad_norm": NaN, + "learning_rate": 1.7728367641877406e-06, + "loss": 0.0, + "step": 61250 + }, + { + "epoch": 5.715312120929364, + "grad_norm": NaN, + "learning_rate": 1.7716774924047384e-06, + "loss": 0.0, + "step": 61251 + }, + { + "epoch": 5.715405430624242, + "grad_norm": NaN, + "learning_rate": 1.7705185975224633e-06, + "loss": 0.0, + "step": 61252 + }, + { + "epoch": 5.715498740319119, + "grad_norm": NaN, + "learning_rate": 1.7693600795438467e-06, + "loss": 0.0, + "step": 61253 + }, + { + "epoch": 5.7155920500139965, + "grad_norm": NaN, + "learning_rate": 1.768201938471836e-06, + "loss": 0.0, + "step": 61254 + }, + { + "epoch": 5.715685359708874, + "grad_norm": NaN, + "learning_rate": 1.767044174309379e-06, + "loss": 0.0, + "step": 61255 + }, + { + "epoch": 5.715778669403751, + "grad_norm": NaN, + "learning_rate": 1.7658867870594062e-06, + "loss": 0.0, + "step": 61256 + }, + { + "epoch": 5.715871979098628, + "grad_norm": NaN, + "learning_rate": 1.7647297767248826e-06, + "loss": 0.0, + "step": 61257 + }, + { + "epoch": 5.715965288793505, + "grad_norm": NaN, + "learning_rate": 1.7635731433087552e-06, + "loss": 0.0, + "step": 61258 + }, + { + "epoch": 5.716058598488383, + "grad_norm": NaN, + "learning_rate": 1.7624168868139388e-06, + "loss": 0.0, + "step": 61259 + }, + { + "epoch": 5.71615190818326, + "grad_norm": NaN, + "learning_rate": 1.7612610072433808e-06, + "loss": 0.0, + "step": 61260 + }, + { + "epoch": 5.7162452178781376, + "grad_norm": NaN, + "learning_rate": 1.7601055046000457e-06, + "loss": 0.0, + "step": 61261 + }, + { + "epoch": 5.716338527573015, + "grad_norm": NaN, + "learning_rate": 1.7589503788868143e-06, + "loss": 0.0, + "step": 61262 + }, + { + "epoch": 5.716431837267892, + "grad_norm": NaN, + "learning_rate": 1.7577956301066842e-06, + "loss": 0.0, + "step": 61263 + }, + { + "epoch": 5.71652514696277, + "grad_norm": NaN, + "learning_rate": 1.75664125826257e-06, + "loss": 0.0, + "step": 61264 + }, + { + "epoch": 5.716618456657646, + "grad_norm": NaN, + "learning_rate": 1.7554872633573857e-06, + "loss": 0.0, + "step": 61265 + }, + { + "epoch": 5.716711766352524, + "grad_norm": NaN, + "learning_rate": 1.754333645394096e-06, + "loss": 0.0, + "step": 61266 + }, + { + "epoch": 5.716805076047401, + "grad_norm": NaN, + "learning_rate": 1.7531804043756314e-06, + "loss": 0.0, + "step": 61267 + }, + { + "epoch": 5.716898385742279, + "grad_norm": NaN, + "learning_rate": 1.75202754030489e-06, + "loss": 0.0, + "step": 61268 + }, + { + "epoch": 5.716991695437156, + "grad_norm": NaN, + "learning_rate": 1.7508750531848358e-06, + "loss": 0.0, + "step": 61269 + }, + { + "epoch": 5.717085005132033, + "grad_norm": NaN, + "learning_rate": 1.7497229430184167e-06, + "loss": 0.0, + "step": 61270 + }, + { + "epoch": 5.717178314826911, + "grad_norm": NaN, + "learning_rate": 1.7485712098085136e-06, + "loss": 0.0, + "step": 61271 + }, + { + "epoch": 5.717271624521787, + "grad_norm": NaN, + "learning_rate": 1.7474198535580741e-06, + "loss": 0.0, + "step": 61272 + }, + { + "epoch": 5.717364934216665, + "grad_norm": NaN, + "learning_rate": 1.7462688742700458e-06, + "loss": 0.0, + "step": 61273 + }, + { + "epoch": 5.717458243911542, + "grad_norm": NaN, + "learning_rate": 1.7451182719473267e-06, + "loss": 0.0, + "step": 61274 + }, + { + "epoch": 5.71755155360642, + "grad_norm": NaN, + "learning_rate": 1.7439680465928808e-06, + "loss": 0.0, + "step": 61275 + }, + { + "epoch": 5.717644863301297, + "grad_norm": NaN, + "learning_rate": 1.742818198209589e-06, + "loss": 0.0, + "step": 61276 + }, + { + "epoch": 5.7177381729961745, + "grad_norm": NaN, + "learning_rate": 1.7416687268003826e-06, + "loss": 0.0, + "step": 61277 + }, + { + "epoch": 5.717831482691052, + "grad_norm": NaN, + "learning_rate": 1.7405196323682092e-06, + "loss": 0.0, + "step": 61278 + }, + { + "epoch": 5.717924792385929, + "grad_norm": NaN, + "learning_rate": 1.7393709149159829e-06, + "loss": 0.0, + "step": 61279 + }, + { + "epoch": 5.718018102080806, + "grad_norm": NaN, + "learning_rate": 1.7382225744466016e-06, + "loss": 0.0, + "step": 61280 + }, + { + "epoch": 5.718111411775683, + "grad_norm": NaN, + "learning_rate": 1.737074610963013e-06, + "loss": 0.0, + "step": 61281 + }, + { + "epoch": 5.718204721470561, + "grad_norm": NaN, + "learning_rate": 1.7359270244681311e-06, + "loss": 0.0, + "step": 61282 + }, + { + "epoch": 5.718298031165438, + "grad_norm": NaN, + "learning_rate": 1.7347798149648374e-06, + "loss": 0.0, + "step": 61283 + }, + { + "epoch": 5.7183913408603155, + "grad_norm": NaN, + "learning_rate": 1.7336329824561124e-06, + "loss": 0.0, + "step": 61284 + }, + { + "epoch": 5.718484650555193, + "grad_norm": NaN, + "learning_rate": 1.7324865269448208e-06, + "loss": 0.0, + "step": 61285 + }, + { + "epoch": 5.71857796025007, + "grad_norm": NaN, + "learning_rate": 1.7313404484338933e-06, + "loss": 0.0, + "step": 61286 + }, + { + "epoch": 5.718671269944947, + "grad_norm": NaN, + "learning_rate": 1.7301947469262777e-06, + "loss": 0.0, + "step": 61287 + }, + { + "epoch": 5.718764579639824, + "grad_norm": NaN, + "learning_rate": 1.729049422424822e-06, + "loss": 0.0, + "step": 61288 + }, + { + "epoch": 5.718857889334702, + "grad_norm": NaN, + "learning_rate": 1.7279044749324733e-06, + "loss": 0.0, + "step": 61289 + }, + { + "epoch": 5.718951199029579, + "grad_norm": NaN, + "learning_rate": 1.7267599044521629e-06, + "loss": 0.0, + "step": 61290 + }, + { + "epoch": 5.719044508724457, + "grad_norm": NaN, + "learning_rate": 1.7256157109867719e-06, + "loss": 0.0, + "step": 61291 + }, + { + "epoch": 5.719137818419334, + "grad_norm": NaN, + "learning_rate": 1.7244718945392145e-06, + "loss": 0.0, + "step": 61292 + }, + { + "epoch": 5.719231128114211, + "grad_norm": NaN, + "learning_rate": 1.723328455112405e-06, + "loss": 0.0, + "step": 61293 + }, + { + "epoch": 5.719324437809089, + "grad_norm": NaN, + "learning_rate": 1.722185392709241e-06, + "loss": 0.0, + "step": 61294 + }, + { + "epoch": 5.719417747503965, + "grad_norm": NaN, + "learning_rate": 1.721042707332654e-06, + "loss": 0.0, + "step": 61295 + }, + { + "epoch": 5.719511057198843, + "grad_norm": NaN, + "learning_rate": 1.7199003989855243e-06, + "loss": 0.0, + "step": 61296 + }, + { + "epoch": 5.71960436689372, + "grad_norm": NaN, + "learning_rate": 1.71875846767075e-06, + "loss": 0.0, + "step": 61297 + }, + { + "epoch": 5.719697676588598, + "grad_norm": NaN, + "learning_rate": 1.717616913391262e-06, + "loss": 0.0, + "step": 61298 + }, + { + "epoch": 5.719790986283475, + "grad_norm": NaN, + "learning_rate": 1.7164757361499581e-06, + "loss": 0.0, + "step": 61299 + }, + { + "epoch": 5.7198842959783525, + "grad_norm": NaN, + "learning_rate": 1.7153349359497026e-06, + "loss": 0.0, + "step": 61300 + }, + { + "epoch": 5.719977605673229, + "grad_norm": NaN, + "learning_rate": 1.7141945127934431e-06, + "loss": 0.0, + "step": 61301 + }, + { + "epoch": 5.720070915368106, + "grad_norm": NaN, + "learning_rate": 1.7130544666840773e-06, + "loss": 0.0, + "step": 61302 + }, + { + "epoch": 5.720164225062984, + "grad_norm": NaN, + "learning_rate": 1.7119147976244696e-06, + "loss": 0.0, + "step": 61303 + }, + { + "epoch": 5.720257534757861, + "grad_norm": NaN, + "learning_rate": 1.7107755056175342e-06, + "loss": 0.0, + "step": 61304 + }, + { + "epoch": 5.720350844452739, + "grad_norm": NaN, + "learning_rate": 1.7096365906661858e-06, + "loss": 0.0, + "step": 61305 + }, + { + "epoch": 5.720444154147616, + "grad_norm": NaN, + "learning_rate": 1.7084980527732883e-06, + "loss": 0.0, + "step": 61306 + }, + { + "epoch": 5.7205374638424935, + "grad_norm": NaN, + "learning_rate": 1.7073598919417565e-06, + "loss": 0.0, + "step": 61307 + }, + { + "epoch": 5.720630773537371, + "grad_norm": NaN, + "learning_rate": 1.7062221081744875e-06, + "loss": 0.0, + "step": 61308 + }, + { + "epoch": 5.7207240832322475, + "grad_norm": NaN, + "learning_rate": 1.7050847014743629e-06, + "loss": 0.0, + "step": 61309 + }, + { + "epoch": 5.720817392927125, + "grad_norm": NaN, + "learning_rate": 1.7039476718442802e-06, + "loss": 0.0, + "step": 61310 + }, + { + "epoch": 5.720910702622002, + "grad_norm": NaN, + "learning_rate": 1.7028110192871368e-06, + "loss": 0.0, + "step": 61311 + }, + { + "epoch": 5.72100401231688, + "grad_norm": NaN, + "learning_rate": 1.7016747438057976e-06, + "loss": 0.0, + "step": 61312 + }, + { + "epoch": 5.721097322011757, + "grad_norm": NaN, + "learning_rate": 1.7005388454031932e-06, + "loss": 0.0, + "step": 61313 + }, + { + "epoch": 5.7211906317066346, + "grad_norm": NaN, + "learning_rate": 1.699403324082188e-06, + "loss": 0.0, + "step": 61314 + }, + { + "epoch": 5.721283941401512, + "grad_norm": NaN, + "learning_rate": 1.6982681798456466e-06, + "loss": 0.0, + "step": 61315 + }, + { + "epoch": 5.7213772510963885, + "grad_norm": NaN, + "learning_rate": 1.697133412696483e-06, + "loss": 0.0, + "step": 61316 + }, + { + "epoch": 5.721470560791266, + "grad_norm": NaN, + "learning_rate": 1.6959990226376119e-06, + "loss": 0.0, + "step": 61317 + }, + { + "epoch": 5.721563870486143, + "grad_norm": NaN, + "learning_rate": 1.6948650096718476e-06, + "loss": 0.0, + "step": 61318 + }, + { + "epoch": 5.721657180181021, + "grad_norm": NaN, + "learning_rate": 1.6937313738021374e-06, + "loss": 0.0, + "step": 61319 + }, + { + "epoch": 5.721750489875898, + "grad_norm": NaN, + "learning_rate": 1.6925981150313128e-06, + "loss": 0.0, + "step": 61320 + }, + { + "epoch": 5.721843799570776, + "grad_norm": NaN, + "learning_rate": 1.691465233362288e-06, + "loss": 0.0, + "step": 61321 + }, + { + "epoch": 5.721937109265653, + "grad_norm": NaN, + "learning_rate": 1.690332728797944e-06, + "loss": 0.0, + "step": 61322 + }, + { + "epoch": 5.72203041896053, + "grad_norm": NaN, + "learning_rate": 1.6892006013411453e-06, + "loss": 0.0, + "step": 61323 + }, + { + "epoch": 5.722123728655407, + "grad_norm": NaN, + "learning_rate": 1.6880688509947727e-06, + "loss": 0.0, + "step": 61324 + }, + { + "epoch": 5.722217038350284, + "grad_norm": NaN, + "learning_rate": 1.6869374777617073e-06, + "loss": 0.0, + "step": 61325 + }, + { + "epoch": 5.722310348045162, + "grad_norm": NaN, + "learning_rate": 1.6858064816448302e-06, + "loss": 0.0, + "step": 61326 + }, + { + "epoch": 5.722403657740039, + "grad_norm": NaN, + "learning_rate": 1.6846758626470057e-06, + "loss": 0.0, + "step": 61327 + }, + { + "epoch": 5.722496967434917, + "grad_norm": NaN, + "learning_rate": 1.6835456207711318e-06, + "loss": 0.0, + "step": 61328 + }, + { + "epoch": 5.722590277129794, + "grad_norm": NaN, + "learning_rate": 1.6824157560200558e-06, + "loss": 0.0, + "step": 61329 + }, + { + "epoch": 5.722683586824671, + "grad_norm": NaN, + "learning_rate": 1.6812862683966588e-06, + "loss": 0.0, + "step": 61330 + }, + { + "epoch": 5.722776896519548, + "grad_norm": NaN, + "learning_rate": 1.6801571579038387e-06, + "loss": 0.0, + "step": 61331 + }, + { + "epoch": 5.722870206214425, + "grad_norm": NaN, + "learning_rate": 1.6790284245444096e-06, + "loss": 0.0, + "step": 61332 + }, + { + "epoch": 5.722963515909303, + "grad_norm": NaN, + "learning_rate": 1.6779000683212863e-06, + "loss": 0.0, + "step": 61333 + }, + { + "epoch": 5.72305682560418, + "grad_norm": NaN, + "learning_rate": 1.6767720892373493e-06, + "loss": 0.0, + "step": 61334 + }, + { + "epoch": 5.723150135299058, + "grad_norm": NaN, + "learning_rate": 1.6756444872954133e-06, + "loss": 0.0, + "step": 61335 + }, + { + "epoch": 5.723243444993935, + "grad_norm": NaN, + "learning_rate": 1.6745172624983927e-06, + "loss": 0.0, + "step": 61336 + }, + { + "epoch": 5.7233367546888125, + "grad_norm": NaN, + "learning_rate": 1.673390414849135e-06, + "loss": 0.0, + "step": 61337 + }, + { + "epoch": 5.72343006438369, + "grad_norm": NaN, + "learning_rate": 1.672263944350505e-06, + "loss": 0.0, + "step": 61338 + }, + { + "epoch": 5.7235233740785665, + "grad_norm": NaN, + "learning_rate": 1.6711378510053663e-06, + "loss": 0.0, + "step": 61339 + }, + { + "epoch": 5.723616683773444, + "grad_norm": NaN, + "learning_rate": 1.6700121348166006e-06, + "loss": 0.0, + "step": 61340 + }, + { + "epoch": 5.723709993468321, + "grad_norm": NaN, + "learning_rate": 1.668886795787039e-06, + "loss": 0.0, + "step": 61341 + }, + { + "epoch": 5.723803303163199, + "grad_norm": NaN, + "learning_rate": 1.667761833919562e-06, + "loss": 0.0, + "step": 61342 + }, + { + "epoch": 5.723896612858076, + "grad_norm": NaN, + "learning_rate": 1.666637249217051e-06, + "loss": 0.0, + "step": 61343 + }, + { + "epoch": 5.723989922552954, + "grad_norm": NaN, + "learning_rate": 1.6655130416823204e-06, + "loss": 0.0, + "step": 61344 + }, + { + "epoch": 5.72408323224783, + "grad_norm": NaN, + "learning_rate": 1.664389211318251e-06, + "loss": 0.0, + "step": 61345 + }, + { + "epoch": 5.7241765419427075, + "grad_norm": NaN, + "learning_rate": 1.6632657581277075e-06, + "loss": 0.0, + "step": 61346 + }, + { + "epoch": 5.724269851637585, + "grad_norm": NaN, + "learning_rate": 1.6621426821135208e-06, + "loss": 0.0, + "step": 61347 + }, + { + "epoch": 5.724363161332462, + "grad_norm": NaN, + "learning_rate": 1.661019983278572e-06, + "loss": 0.0, + "step": 61348 + }, + { + "epoch": 5.72445647102734, + "grad_norm": NaN, + "learning_rate": 1.6598976616257253e-06, + "loss": 0.0, + "step": 61349 + }, + { + "epoch": 5.724549780722217, + "grad_norm": NaN, + "learning_rate": 1.6587757171577953e-06, + "loss": 0.0, + "step": 61350 + }, + { + "epoch": 5.724643090417095, + "grad_norm": NaN, + "learning_rate": 1.657654149877663e-06, + "loss": 0.0, + "step": 61351 + }, + { + "epoch": 5.724736400111972, + "grad_norm": NaN, + "learning_rate": 1.6565329597881927e-06, + "loss": 0.0, + "step": 61352 + }, + { + "epoch": 5.724829709806849, + "grad_norm": NaN, + "learning_rate": 1.6554121468921822e-06, + "loss": 0.0, + "step": 61353 + }, + { + "epoch": 5.724923019501726, + "grad_norm": NaN, + "learning_rate": 1.6542917111925457e-06, + "loss": 0.0, + "step": 61354 + }, + { + "epoch": 5.725016329196603, + "grad_norm": NaN, + "learning_rate": 1.653171652692098e-06, + "loss": 0.0, + "step": 61355 + }, + { + "epoch": 5.725109638891481, + "grad_norm": NaN, + "learning_rate": 1.6520519713936698e-06, + "loss": 0.0, + "step": 61356 + }, + { + "epoch": 5.725202948586358, + "grad_norm": NaN, + "learning_rate": 1.6509326673001423e-06, + "loss": 0.0, + "step": 61357 + }, + { + "epoch": 5.725296258281236, + "grad_norm": NaN, + "learning_rate": 1.649813740414363e-06, + "loss": 0.0, + "step": 61358 + }, + { + "epoch": 5.725389567976113, + "grad_norm": NaN, + "learning_rate": 1.6486951907391465e-06, + "loss": 0.0, + "step": 61359 + }, + { + "epoch": 5.72548287767099, + "grad_norm": NaN, + "learning_rate": 1.6475770182773573e-06, + "loss": 0.0, + "step": 61360 + }, + { + "epoch": 5.725576187365867, + "grad_norm": NaN, + "learning_rate": 1.646459223031843e-06, + "loss": 0.0, + "step": 61361 + }, + { + "epoch": 5.7256694970607445, + "grad_norm": NaN, + "learning_rate": 1.645341805005418e-06, + "loss": 0.0, + "step": 61362 + }, + { + "epoch": 5.725762806755622, + "grad_norm": NaN, + "learning_rate": 1.6442247642009631e-06, + "loss": 0.0, + "step": 61363 + }, + { + "epoch": 5.725856116450499, + "grad_norm": NaN, + "learning_rate": 1.6431081006212766e-06, + "loss": 0.0, + "step": 61364 + }, + { + "epoch": 5.725949426145377, + "grad_norm": NaN, + "learning_rate": 1.6419918142692223e-06, + "loss": 0.0, + "step": 61365 + }, + { + "epoch": 5.726042735840254, + "grad_norm": NaN, + "learning_rate": 1.640875905147665e-06, + "loss": 0.0, + "step": 61366 + }, + { + "epoch": 5.726136045535132, + "grad_norm": NaN, + "learning_rate": 1.6397603732593855e-06, + "loss": 0.0, + "step": 61367 + }, + { + "epoch": 5.726229355230008, + "grad_norm": NaN, + "learning_rate": 1.6386452186072485e-06, + "loss": 0.0, + "step": 61368 + }, + { + "epoch": 5.7263226649248855, + "grad_norm": NaN, + "learning_rate": 1.6375304411941014e-06, + "loss": 0.0, + "step": 61369 + }, + { + "epoch": 5.726415974619763, + "grad_norm": NaN, + "learning_rate": 1.636416041022759e-06, + "loss": 0.0, + "step": 61370 + }, + { + "epoch": 5.72650928431464, + "grad_norm": NaN, + "learning_rate": 1.6353020180960518e-06, + "loss": 0.0, + "step": 61371 + }, + { + "epoch": 5.726602594009518, + "grad_norm": NaN, + "learning_rate": 1.6341883724168447e-06, + "loss": 0.0, + "step": 61372 + }, + { + "epoch": 5.726695903704395, + "grad_norm": NaN, + "learning_rate": 1.6330751039879353e-06, + "loss": 0.0, + "step": 61373 + }, + { + "epoch": 5.726789213399272, + "grad_norm": NaN, + "learning_rate": 1.6319622128121711e-06, + "loss": 0.0, + "step": 61374 + }, + { + "epoch": 5.726882523094149, + "grad_norm": NaN, + "learning_rate": 1.6308496988923835e-06, + "loss": 0.0, + "step": 61375 + }, + { + "epoch": 5.7269758327890266, + "grad_norm": NaN, + "learning_rate": 1.6297375622313702e-06, + "loss": 0.0, + "step": 61376 + }, + { + "epoch": 5.727069142483904, + "grad_norm": NaN, + "learning_rate": 1.6286258028319953e-06, + "loss": 0.0, + "step": 61377 + }, + { + "epoch": 5.727162452178781, + "grad_norm": NaN, + "learning_rate": 1.6275144206970902e-06, + "loss": 0.0, + "step": 61378 + }, + { + "epoch": 5.727255761873659, + "grad_norm": NaN, + "learning_rate": 1.6264034158294526e-06, + "loss": 0.0, + "step": 61379 + }, + { + "epoch": 5.727349071568536, + "grad_norm": NaN, + "learning_rate": 1.6252927882319134e-06, + "loss": 0.0, + "step": 61380 + }, + { + "epoch": 5.727442381263414, + "grad_norm": NaN, + "learning_rate": 1.6241825379073204e-06, + "loss": 0.0, + "step": 61381 + }, + { + "epoch": 5.72753569095829, + "grad_norm": NaN, + "learning_rate": 1.6230726648584547e-06, + "loss": 0.0, + "step": 61382 + }, + { + "epoch": 5.727629000653168, + "grad_norm": NaN, + "learning_rate": 1.6219631690881808e-06, + "loss": 0.0, + "step": 61383 + }, + { + "epoch": 5.727722310348045, + "grad_norm": NaN, + "learning_rate": 1.6208540505992961e-06, + "loss": 0.0, + "step": 61384 + }, + { + "epoch": 5.727815620042922, + "grad_norm": NaN, + "learning_rate": 1.6197453093946156e-06, + "loss": 0.0, + "step": 61385 + }, + { + "epoch": 5.7279089297378, + "grad_norm": NaN, + "learning_rate": 1.6186369454769866e-06, + "loss": 0.0, + "step": 61386 + }, + { + "epoch": 5.728002239432677, + "grad_norm": NaN, + "learning_rate": 1.6175289588492236e-06, + "loss": 0.0, + "step": 61387 + }, + { + "epoch": 5.728095549127555, + "grad_norm": NaN, + "learning_rate": 1.616421349514091e-06, + "loss": 0.0, + "step": 61388 + }, + { + "epoch": 5.728188858822431, + "grad_norm": NaN, + "learning_rate": 1.6153141174744698e-06, + "loss": 0.0, + "step": 61389 + }, + { + "epoch": 5.728282168517309, + "grad_norm": NaN, + "learning_rate": 1.614207262733158e-06, + "loss": 0.0, + "step": 61390 + }, + { + "epoch": 5.728375478212186, + "grad_norm": NaN, + "learning_rate": 1.6131007852929367e-06, + "loss": 0.0, + "step": 61391 + }, + { + "epoch": 5.7284687879070635, + "grad_norm": NaN, + "learning_rate": 1.6119946851566533e-06, + "loss": 0.0, + "step": 61392 + }, + { + "epoch": 5.728562097601941, + "grad_norm": NaN, + "learning_rate": 1.6108889623271225e-06, + "loss": 0.0, + "step": 61393 + }, + { + "epoch": 5.728655407296818, + "grad_norm": NaN, + "learning_rate": 1.6097836168071255e-06, + "loss": 0.0, + "step": 61394 + }, + { + "epoch": 5.728748716991696, + "grad_norm": NaN, + "learning_rate": 1.6086786485995096e-06, + "loss": 0.0, + "step": 61395 + }, + { + "epoch": 5.728842026686573, + "grad_norm": NaN, + "learning_rate": 1.607574057707073e-06, + "loss": 0.0, + "step": 61396 + }, + { + "epoch": 5.72893533638145, + "grad_norm": NaN, + "learning_rate": 1.6064698441325963e-06, + "loss": 0.0, + "step": 61397 + }, + { + "epoch": 5.729028646076327, + "grad_norm": NaN, + "learning_rate": 1.605366007878911e-06, + "loss": 0.0, + "step": 61398 + }, + { + "epoch": 5.7291219557712045, + "grad_norm": NaN, + "learning_rate": 1.6042625489488481e-06, + "loss": 0.0, + "step": 61399 + }, + { + "epoch": 5.729215265466082, + "grad_norm": NaN, + "learning_rate": 1.6031594673451553e-06, + "loss": 0.0, + "step": 61400 + }, + { + "epoch": 5.729308575160959, + "grad_norm": NaN, + "learning_rate": 1.602056763070697e-06, + "loss": 0.0, + "step": 61401 + }, + { + "epoch": 5.729401884855837, + "grad_norm": NaN, + "learning_rate": 1.6009544361282377e-06, + "loss": 0.0, + "step": 61402 + }, + { + "epoch": 5.729495194550713, + "grad_norm": NaN, + "learning_rate": 1.5998524865205919e-06, + "loss": 0.0, + "step": 61403 + }, + { + "epoch": 5.729588504245591, + "grad_norm": NaN, + "learning_rate": 1.5987509142505573e-06, + "loss": 0.0, + "step": 61404 + }, + { + "epoch": 5.729681813940468, + "grad_norm": NaN, + "learning_rate": 1.5976497193209482e-06, + "loss": 0.0, + "step": 61405 + }, + { + "epoch": 5.729775123635346, + "grad_norm": NaN, + "learning_rate": 1.5965489017345456e-06, + "loss": 0.0, + "step": 61406 + }, + { + "epoch": 5.729868433330223, + "grad_norm": NaN, + "learning_rate": 1.595448461494181e-06, + "loss": 0.0, + "step": 61407 + }, + { + "epoch": 5.7299617430251, + "grad_norm": NaN, + "learning_rate": 1.5943483986026018e-06, + "loss": 0.0, + "step": 61408 + }, + { + "epoch": 5.730055052719978, + "grad_norm": NaN, + "learning_rate": 1.5932487130626392e-06, + "loss": 0.0, + "step": 61409 + }, + { + "epoch": 5.730148362414855, + "grad_norm": NaN, + "learning_rate": 1.5921494048771077e-06, + "loss": 0.0, + "step": 61410 + }, + { + "epoch": 5.730241672109733, + "grad_norm": NaN, + "learning_rate": 1.591050474048755e-06, + "loss": 0.0, + "step": 61411 + }, + { + "epoch": 5.730334981804609, + "grad_norm": NaN, + "learning_rate": 1.5899519205803958e-06, + "loss": 0.0, + "step": 61412 + }, + { + "epoch": 5.730428291499487, + "grad_norm": NaN, + "learning_rate": 1.588853744474844e-06, + "loss": 0.0, + "step": 61413 + }, + { + "epoch": 5.730521601194364, + "grad_norm": NaN, + "learning_rate": 1.5877559457348642e-06, + "loss": 0.0, + "step": 61414 + }, + { + "epoch": 5.7306149108892415, + "grad_norm": NaN, + "learning_rate": 1.5866585243632546e-06, + "loss": 0.0, + "step": 61415 + }, + { + "epoch": 5.730708220584119, + "grad_norm": NaN, + "learning_rate": 1.5855614803628292e-06, + "loss": 0.0, + "step": 61416 + }, + { + "epoch": 5.730801530278996, + "grad_norm": NaN, + "learning_rate": 1.5844648137363359e-06, + "loss": 0.0, + "step": 61417 + }, + { + "epoch": 5.730894839973873, + "grad_norm": NaN, + "learning_rate": 1.583368524486589e-06, + "loss": 0.0, + "step": 61418 + }, + { + "epoch": 5.73098814966875, + "grad_norm": NaN, + "learning_rate": 1.5822726126163866e-06, + "loss": 0.0, + "step": 61419 + }, + { + "epoch": 5.731081459363628, + "grad_norm": NaN, + "learning_rate": 1.5811770781284761e-06, + "loss": 0.0, + "step": 61420 + }, + { + "epoch": 5.731174769058505, + "grad_norm": NaN, + "learning_rate": 1.5800819210256887e-06, + "loss": 0.0, + "step": 61421 + }, + { + "epoch": 5.7312680787533825, + "grad_norm": NaN, + "learning_rate": 1.578987141310789e-06, + "loss": 0.0, + "step": 61422 + }, + { + "epoch": 5.73136138844826, + "grad_norm": NaN, + "learning_rate": 1.5778927389865415e-06, + "loss": 0.0, + "step": 61423 + }, + { + "epoch": 5.731454698143137, + "grad_norm": NaN, + "learning_rate": 1.5767987140557437e-06, + "loss": 0.0, + "step": 61424 + }, + { + "epoch": 5.731548007838015, + "grad_norm": NaN, + "learning_rate": 1.57570506652121e-06, + "loss": 0.0, + "step": 61425 + }, + { + "epoch": 5.731641317532891, + "grad_norm": NaN, + "learning_rate": 1.5746117963856553e-06, + "loss": 0.0, + "step": 61426 + }, + { + "epoch": 5.731734627227769, + "grad_norm": NaN, + "learning_rate": 1.5735189036519269e-06, + "loss": 0.0, + "step": 61427 + }, + { + "epoch": 5.731827936922646, + "grad_norm": NaN, + "learning_rate": 1.5724263883227562e-06, + "loss": 0.0, + "step": 61428 + }, + { + "epoch": 5.731921246617524, + "grad_norm": NaN, + "learning_rate": 1.5713342504009408e-06, + "loss": 0.0, + "step": 61429 + }, + { + "epoch": 5.732014556312401, + "grad_norm": NaN, + "learning_rate": 1.5702424898892451e-06, + "loss": 0.0, + "step": 61430 + }, + { + "epoch": 5.732107866007278, + "grad_norm": NaN, + "learning_rate": 1.5691511067904838e-06, + "loss": 0.0, + "step": 61431 + }, + { + "epoch": 5.732201175702156, + "grad_norm": NaN, + "learning_rate": 1.5680601011073712e-06, + "loss": 0.0, + "step": 61432 + }, + { + "epoch": 5.732294485397032, + "grad_norm": NaN, + "learning_rate": 1.5669694728427218e-06, + "loss": 0.0, + "step": 61433 + }, + { + "epoch": 5.73238779509191, + "grad_norm": NaN, + "learning_rate": 1.5658792219993166e-06, + "loss": 0.0, + "step": 61434 + }, + { + "epoch": 5.732481104786787, + "grad_norm": NaN, + "learning_rate": 1.5647893485798867e-06, + "loss": 0.0, + "step": 61435 + }, + { + "epoch": 5.732574414481665, + "grad_norm": NaN, + "learning_rate": 1.563699852587247e-06, + "loss": 0.0, + "step": 61436 + }, + { + "epoch": 5.732667724176542, + "grad_norm": NaN, + "learning_rate": 1.5626107340241445e-06, + "loss": 0.0, + "step": 61437 + }, + { + "epoch": 5.732761033871419, + "grad_norm": NaN, + "learning_rate": 1.5615219928933442e-06, + "loss": 0.0, + "step": 61438 + }, + { + "epoch": 5.732854343566297, + "grad_norm": NaN, + "learning_rate": 1.5604336291976272e-06, + "loss": 0.0, + "step": 61439 + }, + { + "epoch": 5.732947653261174, + "grad_norm": NaN, + "learning_rate": 1.5593456429397577e-06, + "loss": 0.0, + "step": 61440 + }, + { + "epoch": 5.733040962956051, + "grad_norm": NaN, + "learning_rate": 1.5582580341225004e-06, + "loss": 0.0, + "step": 61441 + }, + { + "epoch": 5.733134272650928, + "grad_norm": NaN, + "learning_rate": 1.5571708027486196e-06, + "loss": 0.0, + "step": 61442 + }, + { + "epoch": 5.733227582345806, + "grad_norm": NaN, + "learning_rate": 1.5560839488208798e-06, + "loss": 0.0, + "step": 61443 + }, + { + "epoch": 5.733320892040683, + "grad_norm": NaN, + "learning_rate": 1.5549974723420455e-06, + "loss": 0.0, + "step": 61444 + }, + { + "epoch": 5.7334142017355605, + "grad_norm": NaN, + "learning_rate": 1.553911373314881e-06, + "loss": 0.0, + "step": 61445 + }, + { + "epoch": 5.733507511430438, + "grad_norm": NaN, + "learning_rate": 1.5528256517421512e-06, + "loss": 0.0, + "step": 61446 + }, + { + "epoch": 5.733600821125314, + "grad_norm": NaN, + "learning_rate": 1.55174030762662e-06, + "loss": 0.0, + "step": 61447 + }, + { + "epoch": 5.733694130820192, + "grad_norm": NaN, + "learning_rate": 1.5506553409710187e-06, + "loss": 0.0, + "step": 61448 + }, + { + "epoch": 5.733787440515069, + "grad_norm": NaN, + "learning_rate": 1.5495707517781453e-06, + "loss": 0.0, + "step": 61449 + }, + { + "epoch": 5.733880750209947, + "grad_norm": NaN, + "learning_rate": 1.5484865400507307e-06, + "loss": 0.0, + "step": 61450 + }, + { + "epoch": 5.733974059904824, + "grad_norm": NaN, + "learning_rate": 1.5474027057915395e-06, + "loss": 0.0, + "step": 61451 + }, + { + "epoch": 5.7340673695997015, + "grad_norm": NaN, + "learning_rate": 1.546319249003336e-06, + "loss": 0.0, + "step": 61452 + }, + { + "epoch": 5.734160679294579, + "grad_norm": NaN, + "learning_rate": 1.5452361696888515e-06, + "loss": 0.0, + "step": 61453 + }, + { + "epoch": 5.734253988989456, + "grad_norm": NaN, + "learning_rate": 1.5441534678508838e-06, + "loss": 0.0, + "step": 61454 + }, + { + "epoch": 5.734347298684334, + "grad_norm": NaN, + "learning_rate": 1.5430711434921305e-06, + "loss": 0.0, + "step": 61455 + }, + { + "epoch": 5.73444060837921, + "grad_norm": NaN, + "learning_rate": 1.5419891966153898e-06, + "loss": 0.0, + "step": 61456 + }, + { + "epoch": 5.734533918074088, + "grad_norm": NaN, + "learning_rate": 1.5409076272233922e-06, + "loss": 0.0, + "step": 61457 + }, + { + "epoch": 5.734627227768965, + "grad_norm": NaN, + "learning_rate": 1.5398264353188695e-06, + "loss": 0.0, + "step": 61458 + }, + { + "epoch": 5.734720537463843, + "grad_norm": NaN, + "learning_rate": 1.5387456209046023e-06, + "loss": 0.0, + "step": 61459 + }, + { + "epoch": 5.73481384715872, + "grad_norm": NaN, + "learning_rate": 1.5376651839833387e-06, + "loss": 0.0, + "step": 61460 + }, + { + "epoch": 5.734907156853597, + "grad_norm": NaN, + "learning_rate": 1.536585124557793e-06, + "loss": 0.0, + "step": 61461 + }, + { + "epoch": 5.735000466548474, + "grad_norm": NaN, + "learning_rate": 1.5355054426307467e-06, + "loss": 0.0, + "step": 61462 + }, + { + "epoch": 5.735093776243351, + "grad_norm": NaN, + "learning_rate": 1.5344261382049473e-06, + "loss": 0.0, + "step": 61463 + }, + { + "epoch": 5.735187085938229, + "grad_norm": NaN, + "learning_rate": 1.5333472112830924e-06, + "loss": 0.0, + "step": 61464 + }, + { + "epoch": 5.735280395633106, + "grad_norm": NaN, + "learning_rate": 1.5322686618679803e-06, + "loss": 0.0, + "step": 61465 + }, + { + "epoch": 5.735373705327984, + "grad_norm": NaN, + "learning_rate": 1.531190489962325e-06, + "loss": 0.0, + "step": 61466 + }, + { + "epoch": 5.735467015022861, + "grad_norm": NaN, + "learning_rate": 1.530112695568858e-06, + "loss": 0.0, + "step": 61467 + }, + { + "epoch": 5.7355603247177385, + "grad_norm": NaN, + "learning_rate": 1.5290352786903437e-06, + "loss": 0.0, + "step": 61468 + }, + { + "epoch": 5.735653634412616, + "grad_norm": NaN, + "learning_rate": 1.527958239329513e-06, + "loss": 0.0, + "step": 61469 + }, + { + "epoch": 5.735746944107492, + "grad_norm": NaN, + "learning_rate": 1.5268815774890975e-06, + "loss": 0.0, + "step": 61470 + }, + { + "epoch": 5.73584025380237, + "grad_norm": NaN, + "learning_rate": 1.5258052931718445e-06, + "loss": 0.0, + "step": 61471 + }, + { + "epoch": 5.735933563497247, + "grad_norm": NaN, + "learning_rate": 1.524729386380502e-06, + "loss": 0.0, + "step": 61472 + }, + { + "epoch": 5.736026873192125, + "grad_norm": NaN, + "learning_rate": 1.5236538571177682e-06, + "loss": 0.0, + "step": 61473 + }, + { + "epoch": 5.736120182887002, + "grad_norm": NaN, + "learning_rate": 1.5225787053864236e-06, + "loss": 0.0, + "step": 61474 + }, + { + "epoch": 5.7362134925818795, + "grad_norm": NaN, + "learning_rate": 1.5215039311891663e-06, + "loss": 0.0, + "step": 61475 + }, + { + "epoch": 5.736306802276757, + "grad_norm": NaN, + "learning_rate": 1.5204295345287443e-06, + "loss": 0.0, + "step": 61476 + }, + { + "epoch": 5.7364001119716335, + "grad_norm": NaN, + "learning_rate": 1.5193555154078884e-06, + "loss": 0.0, + "step": 61477 + }, + { + "epoch": 5.736493421666511, + "grad_norm": NaN, + "learning_rate": 1.51828187382933e-06, + "loss": 0.0, + "step": 61478 + }, + { + "epoch": 5.736586731361388, + "grad_norm": NaN, + "learning_rate": 1.5172086097957837e-06, + "loss": 0.0, + "step": 61479 + }, + { + "epoch": 5.736680041056266, + "grad_norm": NaN, + "learning_rate": 1.516135723309997e-06, + "loss": 0.0, + "step": 61480 + }, + { + "epoch": 5.736773350751143, + "grad_norm": NaN, + "learning_rate": 1.515063214374701e-06, + "loss": 0.0, + "step": 61481 + }, + { + "epoch": 5.736866660446021, + "grad_norm": NaN, + "learning_rate": 1.5139910829926105e-06, + "loss": 0.0, + "step": 61482 + }, + { + "epoch": 5.736959970140898, + "grad_norm": NaN, + "learning_rate": 1.5129193291664565e-06, + "loss": 0.0, + "step": 61483 + }, + { + "epoch": 5.737053279835775, + "grad_norm": NaN, + "learning_rate": 1.5118479528989535e-06, + "loss": 0.0, + "step": 61484 + }, + { + "epoch": 5.737146589530652, + "grad_norm": NaN, + "learning_rate": 1.5107769541928493e-06, + "loss": 0.0, + "step": 61485 + }, + { + "epoch": 5.737239899225529, + "grad_norm": NaN, + "learning_rate": 1.5097063330508585e-06, + "loss": 0.0, + "step": 61486 + }, + { + "epoch": 5.737333208920407, + "grad_norm": NaN, + "learning_rate": 1.5086360894756789e-06, + "loss": 0.0, + "step": 61487 + }, + { + "epoch": 5.737426518615284, + "grad_norm": NaN, + "learning_rate": 1.5075662234700746e-06, + "loss": 0.0, + "step": 61488 + }, + { + "epoch": 5.737519828310162, + "grad_norm": NaN, + "learning_rate": 1.5064967350367274e-06, + "loss": 0.0, + "step": 61489 + }, + { + "epoch": 5.737613138005039, + "grad_norm": NaN, + "learning_rate": 1.505427624178368e-06, + "loss": 0.0, + "step": 61490 + }, + { + "epoch": 5.737706447699916, + "grad_norm": NaN, + "learning_rate": 1.5043588908977278e-06, + "loss": 0.0, + "step": 61491 + }, + { + "epoch": 5.737799757394793, + "grad_norm": NaN, + "learning_rate": 1.503290535197521e-06, + "loss": 0.0, + "step": 61492 + }, + { + "epoch": 5.73789306708967, + "grad_norm": NaN, + "learning_rate": 1.5022225570804459e-06, + "loss": 0.0, + "step": 61493 + }, + { + "epoch": 5.737986376784548, + "grad_norm": NaN, + "learning_rate": 1.501154956549233e-06, + "loss": 0.0, + "step": 61494 + }, + { + "epoch": 5.738079686479425, + "grad_norm": NaN, + "learning_rate": 1.5000877336065975e-06, + "loss": 0.0, + "step": 61495 + }, + { + "epoch": 5.738172996174303, + "grad_norm": NaN, + "learning_rate": 1.4990208882552535e-06, + "loss": 0.0, + "step": 61496 + }, + { + "epoch": 5.73826630586918, + "grad_norm": NaN, + "learning_rate": 1.4979544204978988e-06, + "loss": 0.0, + "step": 61497 + }, + { + "epoch": 5.7383596155640575, + "grad_norm": NaN, + "learning_rate": 1.4968883303372814e-06, + "loss": 0.0, + "step": 61498 + }, + { + "epoch": 5.738452925258934, + "grad_norm": NaN, + "learning_rate": 1.495822617776049e-06, + "loss": 0.0, + "step": 61499 + }, + { + "epoch": 5.738546234953811, + "grad_norm": NaN, + "learning_rate": 1.4947572828169829e-06, + "loss": 0.0, + "step": 61500 + }, + { + "epoch": 5.738639544648689, + "grad_norm": NaN, + "learning_rate": 1.4936923254627474e-06, + "loss": 0.0, + "step": 61501 + }, + { + "epoch": 5.738732854343566, + "grad_norm": NaN, + "learning_rate": 1.4926277457160406e-06, + "loss": 0.0, + "step": 61502 + }, + { + "epoch": 5.738826164038444, + "grad_norm": NaN, + "learning_rate": 1.4915635435796103e-06, + "loss": 0.0, + "step": 61503 + }, + { + "epoch": 5.738919473733321, + "grad_norm": NaN, + "learning_rate": 1.4904997190561542e-06, + "loss": 0.0, + "step": 61504 + }, + { + "epoch": 5.7390127834281985, + "grad_norm": NaN, + "learning_rate": 1.4894362721483533e-06, + "loss": 0.0, + "step": 61505 + }, + { + "epoch": 5.739106093123075, + "grad_norm": NaN, + "learning_rate": 1.4883732028589223e-06, + "loss": 0.0, + "step": 61506 + }, + { + "epoch": 5.7391994028179525, + "grad_norm": NaN, + "learning_rate": 1.487310511190576e-06, + "loss": 0.0, + "step": 61507 + }, + { + "epoch": 5.73929271251283, + "grad_norm": NaN, + "learning_rate": 1.4862481971459784e-06, + "loss": 0.0, + "step": 61508 + }, + { + "epoch": 5.739386022207707, + "grad_norm": NaN, + "learning_rate": 1.4851862607278776e-06, + "loss": 0.0, + "step": 61509 + }, + { + "epoch": 5.739479331902585, + "grad_norm": NaN, + "learning_rate": 1.4841247019389712e-06, + "loss": 0.0, + "step": 61510 + }, + { + "epoch": 5.739572641597462, + "grad_norm": NaN, + "learning_rate": 1.4830635207819074e-06, + "loss": 0.0, + "step": 61511 + }, + { + "epoch": 5.73966595129234, + "grad_norm": NaN, + "learning_rate": 1.4820027172594339e-06, + "loss": 0.0, + "step": 61512 + }, + { + "epoch": 5.739759260987217, + "grad_norm": NaN, + "learning_rate": 1.4809422913742487e-06, + "loss": 0.0, + "step": 61513 + }, + { + "epoch": 5.7398525706820935, + "grad_norm": NaN, + "learning_rate": 1.479882243129016e-06, + "loss": 0.0, + "step": 61514 + }, + { + "epoch": 5.739945880376971, + "grad_norm": NaN, + "learning_rate": 1.47882257252645e-06, + "loss": 0.0, + "step": 61515 + }, + { + "epoch": 5.740039190071848, + "grad_norm": NaN, + "learning_rate": 1.4777632795692495e-06, + "loss": 0.0, + "step": 61516 + }, + { + "epoch": 5.740132499766726, + "grad_norm": NaN, + "learning_rate": 1.4767043642600952e-06, + "loss": 0.0, + "step": 61517 + }, + { + "epoch": 5.740225809461603, + "grad_norm": NaN, + "learning_rate": 1.4756458266016847e-06, + "loss": 0.0, + "step": 61518 + }, + { + "epoch": 5.740319119156481, + "grad_norm": NaN, + "learning_rate": 1.474587666596716e-06, + "loss": 0.0, + "step": 61519 + }, + { + "epoch": 5.740412428851357, + "grad_norm": NaN, + "learning_rate": 1.4735298842478704e-06, + "loss": 0.0, + "step": 61520 + }, + { + "epoch": 5.740505738546235, + "grad_norm": NaN, + "learning_rate": 1.4724724795578458e-06, + "loss": 0.0, + "step": 61521 + }, + { + "epoch": 5.740599048241112, + "grad_norm": NaN, + "learning_rate": 1.4714154525293231e-06, + "loss": 0.0, + "step": 61522 + }, + { + "epoch": 5.740692357935989, + "grad_norm": NaN, + "learning_rate": 1.4703588031650004e-06, + "loss": 0.0, + "step": 61523 + }, + { + "epoch": 5.740785667630867, + "grad_norm": NaN, + "learning_rate": 1.469302531467542e-06, + "loss": 0.0, + "step": 61524 + }, + { + "epoch": 5.740878977325744, + "grad_norm": NaN, + "learning_rate": 1.4682466374396629e-06, + "loss": 0.0, + "step": 61525 + }, + { + "epoch": 5.740972287020622, + "grad_norm": NaN, + "learning_rate": 1.4671911210840271e-06, + "loss": 0.0, + "step": 61526 + }, + { + "epoch": 5.741065596715499, + "grad_norm": NaN, + "learning_rate": 1.4661359824033324e-06, + "loss": 0.0, + "step": 61527 + }, + { + "epoch": 5.7411589064103765, + "grad_norm": NaN, + "learning_rate": 1.465081221400244e-06, + "loss": 0.0, + "step": 61528 + }, + { + "epoch": 5.741252216105253, + "grad_norm": NaN, + "learning_rate": 1.4640268380774756e-06, + "loss": 0.0, + "step": 61529 + }, + { + "epoch": 5.7413455258001305, + "grad_norm": NaN, + "learning_rate": 1.4629728324376589e-06, + "loss": 0.0, + "step": 61530 + }, + { + "epoch": 5.741438835495008, + "grad_norm": NaN, + "learning_rate": 1.4619192044835249e-06, + "loss": 0.0, + "step": 61531 + }, + { + "epoch": 5.741532145189885, + "grad_norm": NaN, + "learning_rate": 1.4608659542177214e-06, + "loss": 0.0, + "step": 61532 + }, + { + "epoch": 5.741625454884763, + "grad_norm": NaN, + "learning_rate": 1.4598130816429299e-06, + "loss": 0.0, + "step": 61533 + }, + { + "epoch": 5.74171876457964, + "grad_norm": NaN, + "learning_rate": 1.458760586761848e-06, + "loss": 0.0, + "step": 61534 + }, + { + "epoch": 5.741812074274517, + "grad_norm": NaN, + "learning_rate": 1.4577084695771235e-06, + "loss": 0.0, + "step": 61535 + }, + { + "epoch": 5.741905383969394, + "grad_norm": NaN, + "learning_rate": 1.4566567300914545e-06, + "loss": 0.0, + "step": 61536 + }, + { + "epoch": 5.7419986936642715, + "grad_norm": NaN, + "learning_rate": 1.4556053683075052e-06, + "loss": 0.0, + "step": 61537 + }, + { + "epoch": 5.742092003359149, + "grad_norm": NaN, + "learning_rate": 1.4545543842279405e-06, + "loss": 0.0, + "step": 61538 + }, + { + "epoch": 5.742185313054026, + "grad_norm": NaN, + "learning_rate": 1.4535037778554414e-06, + "loss": 0.0, + "step": 61539 + }, + { + "epoch": 5.742278622748904, + "grad_norm": NaN, + "learning_rate": 1.4524535491926892e-06, + "loss": 0.0, + "step": 61540 + }, + { + "epoch": 5.742371932443781, + "grad_norm": NaN, + "learning_rate": 1.4514036982423315e-06, + "loss": 0.0, + "step": 61541 + }, + { + "epoch": 5.742465242138659, + "grad_norm": NaN, + "learning_rate": 1.4503542250070666e-06, + "loss": 0.0, + "step": 61542 + }, + { + "epoch": 5.742558551833535, + "grad_norm": NaN, + "learning_rate": 1.449305129489542e-06, + "loss": 0.0, + "step": 61543 + }, + { + "epoch": 5.742651861528413, + "grad_norm": NaN, + "learning_rate": 1.4482564116924223e-06, + "loss": 0.0, + "step": 61544 + }, + { + "epoch": 5.74274517122329, + "grad_norm": NaN, + "learning_rate": 1.4472080716183887e-06, + "loss": 0.0, + "step": 61545 + }, + { + "epoch": 5.742838480918167, + "grad_norm": NaN, + "learning_rate": 1.4461601092700892e-06, + "loss": 0.0, + "step": 61546 + }, + { + "epoch": 5.742931790613045, + "grad_norm": NaN, + "learning_rate": 1.4451125246502215e-06, + "loss": 0.0, + "step": 61547 + }, + { + "epoch": 5.743025100307922, + "grad_norm": NaN, + "learning_rate": 1.444065317761417e-06, + "loss": 0.0, + "step": 61548 + }, + { + "epoch": 5.7431184100028, + "grad_norm": NaN, + "learning_rate": 1.44301848860634e-06, + "loss": 0.0, + "step": 61549 + }, + { + "epoch": 5.743211719697676, + "grad_norm": NaN, + "learning_rate": 1.441972037187672e-06, + "loss": 0.0, + "step": 61550 + }, + { + "epoch": 5.743305029392554, + "grad_norm": NaN, + "learning_rate": 1.4409259635080606e-06, + "loss": 0.0, + "step": 61551 + }, + { + "epoch": 5.743398339087431, + "grad_norm": NaN, + "learning_rate": 1.4398802675701703e-06, + "loss": 0.0, + "step": 61552 + }, + { + "epoch": 5.7434916487823084, + "grad_norm": NaN, + "learning_rate": 1.4388349493766494e-06, + "loss": 0.0, + "step": 61553 + }, + { + "epoch": 5.743584958477186, + "grad_norm": NaN, + "learning_rate": 1.4377900089301785e-06, + "loss": 0.0, + "step": 61554 + }, + { + "epoch": 5.743678268172063, + "grad_norm": NaN, + "learning_rate": 1.4367454462333894e-06, + "loss": 0.0, + "step": 61555 + }, + { + "epoch": 5.743771577866941, + "grad_norm": NaN, + "learning_rate": 1.4357012612889463e-06, + "loss": 0.0, + "step": 61556 + }, + { + "epoch": 5.743864887561818, + "grad_norm": NaN, + "learning_rate": 1.4346574540995137e-06, + "loss": 0.0, + "step": 61557 + }, + { + "epoch": 5.743958197256695, + "grad_norm": NaN, + "learning_rate": 1.4336140246677398e-06, + "loss": 0.0, + "step": 61558 + }, + { + "epoch": 5.744051506951572, + "grad_norm": NaN, + "learning_rate": 1.432570972996272e-06, + "loss": 0.0, + "step": 61559 + }, + { + "epoch": 5.7441448166464495, + "grad_norm": NaN, + "learning_rate": 1.4315282990877586e-06, + "loss": 0.0, + "step": 61560 + }, + { + "epoch": 5.744238126341327, + "grad_norm": NaN, + "learning_rate": 1.4304860029448807e-06, + "loss": 0.0, + "step": 61561 + }, + { + "epoch": 5.744331436036204, + "grad_norm": NaN, + "learning_rate": 1.4294440845702527e-06, + "loss": 0.0, + "step": 61562 + }, + { + "epoch": 5.744424745731082, + "grad_norm": NaN, + "learning_rate": 1.428402543966539e-06, + "loss": 0.0, + "step": 61563 + }, + { + "epoch": 5.744518055425958, + "grad_norm": NaN, + "learning_rate": 1.4273613811363883e-06, + "loss": 0.0, + "step": 61564 + }, + { + "epoch": 5.744611365120836, + "grad_norm": NaN, + "learning_rate": 1.4263205960824476e-06, + "loss": 0.0, + "step": 61565 + }, + { + "epoch": 5.744704674815713, + "grad_norm": NaN, + "learning_rate": 1.425280188807365e-06, + "loss": 0.0, + "step": 61566 + }, + { + "epoch": 5.7447979845105905, + "grad_norm": NaN, + "learning_rate": 1.4242401593137886e-06, + "loss": 0.0, + "step": 61567 + }, + { + "epoch": 5.744891294205468, + "grad_norm": NaN, + "learning_rate": 1.4232005076043496e-06, + "loss": 0.0, + "step": 61568 + }, + { + "epoch": 5.744984603900345, + "grad_norm": NaN, + "learning_rate": 1.4221612336817122e-06, + "loss": 0.0, + "step": 61569 + }, + { + "epoch": 5.745077913595223, + "grad_norm": NaN, + "learning_rate": 1.4211223375484914e-06, + "loss": 0.0, + "step": 61570 + }, + { + "epoch": 5.7451712232901, + "grad_norm": NaN, + "learning_rate": 1.4200838192073516e-06, + "loss": 0.0, + "step": 61571 + }, + { + "epoch": 5.745264532984978, + "grad_norm": NaN, + "learning_rate": 1.419045678660924e-06, + "loss": 0.0, + "step": 61572 + }, + { + "epoch": 5.745357842679854, + "grad_norm": NaN, + "learning_rate": 1.4180079159118562e-06, + "loss": 0.0, + "step": 61573 + }, + { + "epoch": 5.745451152374732, + "grad_norm": NaN, + "learning_rate": 1.41697053096278e-06, + "loss": 0.0, + "step": 61574 + }, + { + "epoch": 5.745544462069609, + "grad_norm": NaN, + "learning_rate": 1.4159335238163261e-06, + "loss": 0.0, + "step": 61575 + }, + { + "epoch": 5.745637771764486, + "grad_norm": NaN, + "learning_rate": 1.4148968944751427e-06, + "loss": 0.0, + "step": 61576 + }, + { + "epoch": 5.745731081459364, + "grad_norm": NaN, + "learning_rate": 1.413860642941861e-06, + "loss": 0.0, + "step": 61577 + }, + { + "epoch": 5.745824391154241, + "grad_norm": NaN, + "learning_rate": 1.4128247692191285e-06, + "loss": 0.0, + "step": 61578 + }, + { + "epoch": 5.745917700849118, + "grad_norm": NaN, + "learning_rate": 1.4117892733095437e-06, + "loss": 0.0, + "step": 61579 + }, + { + "epoch": 5.746011010543995, + "grad_norm": NaN, + "learning_rate": 1.4107541552157875e-06, + "loss": 0.0, + "step": 61580 + }, + { + "epoch": 5.746104320238873, + "grad_norm": NaN, + "learning_rate": 1.4097194149404579e-06, + "loss": 0.0, + "step": 61581 + }, + { + "epoch": 5.74619762993375, + "grad_norm": NaN, + "learning_rate": 1.4086850524862026e-06, + "loss": 0.0, + "step": 61582 + }, + { + "epoch": 5.7462909396286275, + "grad_norm": NaN, + "learning_rate": 1.4076510678556364e-06, + "loss": 0.0, + "step": 61583 + }, + { + "epoch": 5.746384249323505, + "grad_norm": NaN, + "learning_rate": 1.406617461051407e-06, + "loss": 0.0, + "step": 61584 + }, + { + "epoch": 5.746477559018382, + "grad_norm": NaN, + "learning_rate": 1.4055842320761291e-06, + "loss": 0.0, + "step": 61585 + }, + { + "epoch": 5.74657086871326, + "grad_norm": NaN, + "learning_rate": 1.4045513809324171e-06, + "loss": 0.0, + "step": 61586 + }, + { + "epoch": 5.746664178408136, + "grad_norm": NaN, + "learning_rate": 1.4035189076229358e-06, + "loss": 0.0, + "step": 61587 + }, + { + "epoch": 5.746757488103014, + "grad_norm": NaN, + "learning_rate": 1.402486812150283e-06, + "loss": 0.0, + "step": 61588 + }, + { + "epoch": 5.746850797797891, + "grad_norm": NaN, + "learning_rate": 1.4014550945170899e-06, + "loss": 0.0, + "step": 61589 + }, + { + "epoch": 5.7469441074927685, + "grad_norm": NaN, + "learning_rate": 1.4004237547259711e-06, + "loss": 0.0, + "step": 61590 + }, + { + "epoch": 5.747037417187646, + "grad_norm": NaN, + "learning_rate": 1.3993927927795579e-06, + "loss": 0.0, + "step": 61591 + }, + { + "epoch": 5.747130726882523, + "grad_norm": NaN, + "learning_rate": 1.3983622086804647e-06, + "loss": 0.0, + "step": 61592 + }, + { + "epoch": 5.747224036577401, + "grad_norm": NaN, + "learning_rate": 1.3973320024313394e-06, + "loss": 0.0, + "step": 61593 + }, + { + "epoch": 5.747317346272277, + "grad_norm": NaN, + "learning_rate": 1.3963021740347634e-06, + "loss": 0.0, + "step": 61594 + }, + { + "epoch": 5.747410655967155, + "grad_norm": NaN, + "learning_rate": 1.395272723493368e-06, + "loss": 0.0, + "step": 61595 + }, + { + "epoch": 5.747503965662032, + "grad_norm": NaN, + "learning_rate": 1.3942436508097843e-06, + "loss": 0.0, + "step": 61596 + }, + { + "epoch": 5.74759727535691, + "grad_norm": NaN, + "learning_rate": 1.3932149559866102e-06, + "loss": 0.0, + "step": 61597 + }, + { + "epoch": 5.747690585051787, + "grad_norm": NaN, + "learning_rate": 1.3921866390264769e-06, + "loss": 0.0, + "step": 61598 + }, + { + "epoch": 5.747783894746664, + "grad_norm": NaN, + "learning_rate": 1.3911586999319824e-06, + "loss": 0.0, + "step": 61599 + }, + { + "epoch": 5.747877204441542, + "grad_norm": NaN, + "learning_rate": 1.390131138705758e-06, + "loss": 0.0, + "step": 61600 + }, + { + "epoch": 5.747970514136419, + "grad_norm": NaN, + "learning_rate": 1.3891039553504012e-06, + "loss": 0.0, + "step": 61601 + }, + { + "epoch": 5.748063823831296, + "grad_norm": NaN, + "learning_rate": 1.3880771498685439e-06, + "loss": 0.0, + "step": 61602 + }, + { + "epoch": 5.748157133526173, + "grad_norm": NaN, + "learning_rate": 1.387050722262767e-06, + "loss": 0.0, + "step": 61603 + }, + { + "epoch": 5.748250443221051, + "grad_norm": NaN, + "learning_rate": 1.3860246725357183e-06, + "loss": 0.0, + "step": 61604 + }, + { + "epoch": 5.748343752915928, + "grad_norm": NaN, + "learning_rate": 1.3849990006899626e-06, + "loss": 0.0, + "step": 61605 + }, + { + "epoch": 5.7484370626108054, + "grad_norm": NaN, + "learning_rate": 1.3839737067281476e-06, + "loss": 0.0, + "step": 61606 + }, + { + "epoch": 5.748530372305683, + "grad_norm": NaN, + "learning_rate": 1.3829487906528713e-06, + "loss": 0.0, + "step": 61607 + }, + { + "epoch": 5.748623682000559, + "grad_norm": NaN, + "learning_rate": 1.381924252466715e-06, + "loss": 0.0, + "step": 61608 + }, + { + "epoch": 5.748716991695437, + "grad_norm": NaN, + "learning_rate": 1.3809000921723101e-06, + "loss": 0.0, + "step": 61609 + }, + { + "epoch": 5.748810301390314, + "grad_norm": NaN, + "learning_rate": 1.3798763097722542e-06, + "loss": 0.0, + "step": 61610 + }, + { + "epoch": 5.748903611085192, + "grad_norm": NaN, + "learning_rate": 1.3788529052691456e-06, + "loss": 0.0, + "step": 61611 + }, + { + "epoch": 5.748996920780069, + "grad_norm": NaN, + "learning_rate": 1.3778298786655817e-06, + "loss": 0.0, + "step": 61612 + }, + { + "epoch": 5.7490902304749465, + "grad_norm": NaN, + "learning_rate": 1.3768072299641776e-06, + "loss": 0.0, + "step": 61613 + }, + { + "epoch": 5.749183540169824, + "grad_norm": NaN, + "learning_rate": 1.3757849591675308e-06, + "loss": 0.0, + "step": 61614 + }, + { + "epoch": 5.749276849864701, + "grad_norm": NaN, + "learning_rate": 1.3747630662782228e-06, + "loss": 0.0, + "step": 61615 + }, + { + "epoch": 5.749370159559578, + "grad_norm": NaN, + "learning_rate": 1.3737415512988848e-06, + "loss": 0.0, + "step": 61616 + }, + { + "epoch": 5.749463469254455, + "grad_norm": NaN, + "learning_rate": 1.3727204142320812e-06, + "loss": 0.0, + "step": 61617 + }, + { + "epoch": 5.749556778949333, + "grad_norm": NaN, + "learning_rate": 1.3716996550804438e-06, + "loss": 0.0, + "step": 61618 + }, + { + "epoch": 5.74965008864421, + "grad_norm": NaN, + "learning_rate": 1.37067927384652e-06, + "loss": 0.0, + "step": 61619 + }, + { + "epoch": 5.7497433983390875, + "grad_norm": NaN, + "learning_rate": 1.3696592705329578e-06, + "loss": 0.0, + "step": 61620 + }, + { + "epoch": 5.749836708033965, + "grad_norm": NaN, + "learning_rate": 1.3686396451423054e-06, + "loss": 0.0, + "step": 61621 + }, + { + "epoch": 5.749930017728842, + "grad_norm": NaN, + "learning_rate": 1.367620397677177e-06, + "loss": 0.0, + "step": 61622 + }, + { + "epoch": 5.750023327423719, + "grad_norm": NaN, + "learning_rate": 1.366601528140171e-06, + "loss": 0.0, + "step": 61623 + }, + { + "epoch": 5.750116637118596, + "grad_norm": NaN, + "learning_rate": 1.3655830365338683e-06, + "loss": 0.0, + "step": 61624 + }, + { + "epoch": 5.750209946813474, + "grad_norm": NaN, + "learning_rate": 1.3645649228608502e-06, + "loss": 0.0, + "step": 61625 + }, + { + "epoch": 5.750303256508351, + "grad_norm": NaN, + "learning_rate": 1.3635471871237148e-06, + "loss": 0.0, + "step": 61626 + }, + { + "epoch": 5.750396566203229, + "grad_norm": NaN, + "learning_rate": 1.3625298293250597e-06, + "loss": 0.0, + "step": 61627 + }, + { + "epoch": 5.750489875898106, + "grad_norm": NaN, + "learning_rate": 1.3615128494674499e-06, + "loss": 0.0, + "step": 61628 + }, + { + "epoch": 5.750583185592983, + "grad_norm": NaN, + "learning_rate": 1.360496247553483e-06, + "loss": 0.0, + "step": 61629 + }, + { + "epoch": 5.750676495287861, + "grad_norm": NaN, + "learning_rate": 1.3594800235857573e-06, + "loss": 0.0, + "step": 61630 + }, + { + "epoch": 5.750769804982737, + "grad_norm": NaN, + "learning_rate": 1.358464177566837e-06, + "loss": 0.0, + "step": 61631 + }, + { + "epoch": 5.750863114677615, + "grad_norm": NaN, + "learning_rate": 1.3574487094993036e-06, + "loss": 0.0, + "step": 61632 + }, + { + "epoch": 5.750956424372492, + "grad_norm": NaN, + "learning_rate": 1.356433619385755e-06, + "loss": 0.0, + "step": 61633 + }, + { + "epoch": 5.75104973406737, + "grad_norm": NaN, + "learning_rate": 1.3554189072287557e-06, + "loss": 0.0, + "step": 61634 + }, + { + "epoch": 5.751143043762247, + "grad_norm": NaN, + "learning_rate": 1.3544045730309039e-06, + "loss": 0.0, + "step": 61635 + }, + { + "epoch": 5.7512363534571245, + "grad_norm": NaN, + "learning_rate": 1.353390616794764e-06, + "loss": 0.0, + "step": 61636 + }, + { + "epoch": 5.751329663152001, + "grad_norm": NaN, + "learning_rate": 1.3523770385229172e-06, + "loss": 0.0, + "step": 61637 + }, + { + "epoch": 5.751422972846878, + "grad_norm": NaN, + "learning_rate": 1.3513638382179615e-06, + "loss": 0.0, + "step": 61638 + }, + { + "epoch": 5.751516282541756, + "grad_norm": NaN, + "learning_rate": 1.3503510158824282e-06, + "loss": 0.0, + "step": 61639 + }, + { + "epoch": 5.751609592236633, + "grad_norm": NaN, + "learning_rate": 1.349338571518932e-06, + "loss": 0.0, + "step": 61640 + }, + { + "epoch": 5.751702901931511, + "grad_norm": NaN, + "learning_rate": 1.3483265051300373e-06, + "loss": 0.0, + "step": 61641 + }, + { + "epoch": 5.751796211626388, + "grad_norm": NaN, + "learning_rate": 1.3473148167183089e-06, + "loss": 0.0, + "step": 61642 + }, + { + "epoch": 5.7518895213212655, + "grad_norm": NaN, + "learning_rate": 1.3463035062863282e-06, + "loss": 0.0, + "step": 61643 + }, + { + "epoch": 5.751982831016143, + "grad_norm": NaN, + "learning_rate": 1.3452925738366592e-06, + "loss": 0.0, + "step": 61644 + }, + { + "epoch": 5.75207614071102, + "grad_norm": NaN, + "learning_rate": 1.344282019371884e-06, + "loss": 0.0, + "step": 61645 + }, + { + "epoch": 5.752169450405897, + "grad_norm": NaN, + "learning_rate": 1.3432718428945498e-06, + "loss": 0.0, + "step": 61646 + }, + { + "epoch": 5.752262760100774, + "grad_norm": NaN, + "learning_rate": 1.3422620444072551e-06, + "loss": 0.0, + "step": 61647 + }, + { + "epoch": 5.752356069795652, + "grad_norm": NaN, + "learning_rate": 1.3412526239125477e-06, + "loss": 0.0, + "step": 61648 + }, + { + "epoch": 5.752449379490529, + "grad_norm": NaN, + "learning_rate": 1.340243581412992e-06, + "loss": 0.0, + "step": 61649 + }, + { + "epoch": 5.752542689185407, + "grad_norm": NaN, + "learning_rate": 1.3392349169111693e-06, + "loss": 0.0, + "step": 61650 + }, + { + "epoch": 5.752635998880284, + "grad_norm": NaN, + "learning_rate": 1.3382266304096279e-06, + "loss": 0.0, + "step": 61651 + }, + { + "epoch": 5.7527293085751605, + "grad_norm": NaN, + "learning_rate": 1.3372187219109487e-06, + "loss": 0.0, + "step": 61652 + }, + { + "epoch": 5.752822618270038, + "grad_norm": NaN, + "learning_rate": 1.3362111914176798e-06, + "loss": 0.0, + "step": 61653 + }, + { + "epoch": 5.752915927964915, + "grad_norm": NaN, + "learning_rate": 1.3352040389323859e-06, + "loss": 0.0, + "step": 61654 + }, + { + "epoch": 5.753009237659793, + "grad_norm": NaN, + "learning_rate": 1.3341972644576315e-06, + "loss": 0.0, + "step": 61655 + }, + { + "epoch": 5.75310254735467, + "grad_norm": NaN, + "learning_rate": 1.3331908679959814e-06, + "loss": 0.0, + "step": 61656 + }, + { + "epoch": 5.753195857049548, + "grad_norm": NaN, + "learning_rate": 1.332184849549983e-06, + "loss": 0.0, + "step": 61657 + }, + { + "epoch": 5.753289166744425, + "grad_norm": NaN, + "learning_rate": 1.3311792091222018e-06, + "loss": 0.0, + "step": 61658 + }, + { + "epoch": 5.7533824764393025, + "grad_norm": NaN, + "learning_rate": 1.330173946715185e-06, + "loss": 0.0, + "step": 61659 + }, + { + "epoch": 5.753475786134179, + "grad_norm": NaN, + "learning_rate": 1.3291690623315144e-06, + "loss": 0.0, + "step": 61660 + }, + { + "epoch": 5.753569095829056, + "grad_norm": NaN, + "learning_rate": 1.3281645559737208e-06, + "loss": 0.0, + "step": 61661 + }, + { + "epoch": 5.753662405523934, + "grad_norm": NaN, + "learning_rate": 1.3271604276443527e-06, + "loss": 0.0, + "step": 61662 + }, + { + "epoch": 5.753755715218811, + "grad_norm": NaN, + "learning_rate": 1.326156677345991e-06, + "loss": 0.0, + "step": 61663 + }, + { + "epoch": 5.753849024913689, + "grad_norm": NaN, + "learning_rate": 1.3251533050811669e-06, + "loss": 0.0, + "step": 61664 + }, + { + "epoch": 5.753942334608566, + "grad_norm": NaN, + "learning_rate": 1.3241503108524287e-06, + "loss": 0.0, + "step": 61665 + }, + { + "epoch": 5.7540356443034435, + "grad_norm": NaN, + "learning_rate": 1.3231476946623409e-06, + "loss": 0.0, + "step": 61666 + }, + { + "epoch": 5.75412895399832, + "grad_norm": NaN, + "learning_rate": 1.3221454565134514e-06, + "loss": 0.0, + "step": 61667 + }, + { + "epoch": 5.7542222636931974, + "grad_norm": NaN, + "learning_rate": 1.3211435964082918e-06, + "loss": 0.0, + "step": 61668 + }, + { + "epoch": 5.754315573388075, + "grad_norm": NaN, + "learning_rate": 1.3201421143494428e-06, + "loss": 0.0, + "step": 61669 + }, + { + "epoch": 5.754408883082952, + "grad_norm": NaN, + "learning_rate": 1.3191410103394196e-06, + "loss": 0.0, + "step": 61670 + }, + { + "epoch": 5.75450219277783, + "grad_norm": NaN, + "learning_rate": 1.3181402843807698e-06, + "loss": 0.0, + "step": 61671 + }, + { + "epoch": 5.754595502472707, + "grad_norm": NaN, + "learning_rate": 1.3171399364760582e-06, + "loss": 0.0, + "step": 61672 + }, + { + "epoch": 5.7546888121675845, + "grad_norm": NaN, + "learning_rate": 1.316139966627816e-06, + "loss": 0.0, + "step": 61673 + }, + { + "epoch": 5.754782121862462, + "grad_norm": NaN, + "learning_rate": 1.3151403748385747e-06, + "loss": 0.0, + "step": 61674 + }, + { + "epoch": 5.7548754315573385, + "grad_norm": NaN, + "learning_rate": 1.3141411611108988e-06, + "loss": 0.0, + "step": 61675 + }, + { + "epoch": 5.754968741252216, + "grad_norm": NaN, + "learning_rate": 1.3131423254473195e-06, + "loss": 0.0, + "step": 61676 + }, + { + "epoch": 5.755062050947093, + "grad_norm": NaN, + "learning_rate": 1.3121438678503682e-06, + "loss": 0.0, + "step": 61677 + }, + { + "epoch": 5.755155360641971, + "grad_norm": NaN, + "learning_rate": 1.3111457883225928e-06, + "loss": 0.0, + "step": 61678 + }, + { + "epoch": 5.755248670336848, + "grad_norm": NaN, + "learning_rate": 1.3101480868665248e-06, + "loss": 0.0, + "step": 61679 + }, + { + "epoch": 5.755341980031726, + "grad_norm": NaN, + "learning_rate": 1.309150763484712e-06, + "loss": 0.0, + "step": 61680 + }, + { + "epoch": 5.755435289726602, + "grad_norm": NaN, + "learning_rate": 1.3081538181796858e-06, + "loss": 0.0, + "step": 61681 + }, + { + "epoch": 5.7555285994214795, + "grad_norm": NaN, + "learning_rate": 1.3071572509539773e-06, + "loss": 0.0, + "step": 61682 + }, + { + "epoch": 5.755621909116357, + "grad_norm": NaN, + "learning_rate": 1.3061610618101014e-06, + "loss": 0.0, + "step": 61683 + }, + { + "epoch": 5.755715218811234, + "grad_norm": NaN, + "learning_rate": 1.3051652507506393e-06, + "loss": 0.0, + "step": 61684 + }, + { + "epoch": 5.755808528506112, + "grad_norm": NaN, + "learning_rate": 1.304169817778089e-06, + "loss": 0.0, + "step": 61685 + }, + { + "epoch": 5.755901838200989, + "grad_norm": NaN, + "learning_rate": 1.3031747628949818e-06, + "loss": 0.0, + "step": 61686 + }, + { + "epoch": 5.755995147895867, + "grad_norm": NaN, + "learning_rate": 1.302180086103849e-06, + "loss": 0.0, + "step": 61687 + }, + { + "epoch": 5.756088457590744, + "grad_norm": NaN, + "learning_rate": 1.3011857874072385e-06, + "loss": 0.0, + "step": 61688 + }, + { + "epoch": 5.7561817672856215, + "grad_norm": NaN, + "learning_rate": 1.3001918668076482e-06, + "loss": 0.0, + "step": 61689 + }, + { + "epoch": 5.756275076980498, + "grad_norm": NaN, + "learning_rate": 1.2991983243076265e-06, + "loss": 0.0, + "step": 61690 + }, + { + "epoch": 5.756368386675375, + "grad_norm": NaN, + "learning_rate": 1.2982051599097044e-06, + "loss": 0.0, + "step": 61691 + }, + { + "epoch": 5.756461696370253, + "grad_norm": NaN, + "learning_rate": 1.2972123736163964e-06, + "loss": 0.0, + "step": 61692 + }, + { + "epoch": 5.75655500606513, + "grad_norm": NaN, + "learning_rate": 1.2962199654302174e-06, + "loss": 0.0, + "step": 61693 + }, + { + "epoch": 5.756648315760008, + "grad_norm": NaN, + "learning_rate": 1.2952279353537154e-06, + "loss": 0.0, + "step": 61694 + }, + { + "epoch": 5.756741625454885, + "grad_norm": NaN, + "learning_rate": 1.2942362833893883e-06, + "loss": 0.0, + "step": 61695 + }, + { + "epoch": 5.756834935149762, + "grad_norm": NaN, + "learning_rate": 1.2932450095397674e-06, + "loss": 0.0, + "step": 61696 + }, + { + "epoch": 5.756928244844639, + "grad_norm": NaN, + "learning_rate": 1.292254113807384e-06, + "loss": 0.0, + "step": 61697 + }, + { + "epoch": 5.7570215545395165, + "grad_norm": NaN, + "learning_rate": 1.2912635961947527e-06, + "loss": 0.0, + "step": 61698 + }, + { + "epoch": 5.757114864234394, + "grad_norm": NaN, + "learning_rate": 1.2902734567043714e-06, + "loss": 0.0, + "step": 61699 + }, + { + "epoch": 5.757208173929271, + "grad_norm": NaN, + "learning_rate": 1.289283695338772e-06, + "loss": 0.0, + "step": 61700 + }, + { + "epoch": 5.757301483624149, + "grad_norm": NaN, + "learning_rate": 1.2882943121004851e-06, + "loss": 0.0, + "step": 61701 + }, + { + "epoch": 5.757394793319026, + "grad_norm": NaN, + "learning_rate": 1.2873053069920091e-06, + "loss": 0.0, + "step": 61702 + }, + { + "epoch": 5.757488103013904, + "grad_norm": NaN, + "learning_rate": 1.2863166800158753e-06, + "loss": 0.0, + "step": 61703 + }, + { + "epoch": 5.75758141270878, + "grad_norm": NaN, + "learning_rate": 1.285328431174565e-06, + "loss": 0.0, + "step": 61704 + }, + { + "epoch": 5.7576747224036575, + "grad_norm": NaN, + "learning_rate": 1.2843405604706258e-06, + "loss": 0.0, + "step": 61705 + }, + { + "epoch": 5.757768032098535, + "grad_norm": NaN, + "learning_rate": 1.2833530679065563e-06, + "loss": 0.0, + "step": 61706 + }, + { + "epoch": 5.757861341793412, + "grad_norm": NaN, + "learning_rate": 1.282365953484854e-06, + "loss": 0.0, + "step": 61707 + }, + { + "epoch": 5.75795465148829, + "grad_norm": NaN, + "learning_rate": 1.2813792172080506e-06, + "loss": 0.0, + "step": 61708 + }, + { + "epoch": 5.758047961183167, + "grad_norm": NaN, + "learning_rate": 1.2803928590786604e-06, + "loss": 0.0, + "step": 61709 + }, + { + "epoch": 5.758141270878045, + "grad_norm": NaN, + "learning_rate": 1.2794068790991484e-06, + "loss": 0.0, + "step": 61710 + }, + { + "epoch": 5.758234580572921, + "grad_norm": NaN, + "learning_rate": 1.2784212772720792e-06, + "loss": 0.0, + "step": 61711 + }, + { + "epoch": 5.758327890267799, + "grad_norm": NaN, + "learning_rate": 1.2774360535999006e-06, + "loss": 0.0, + "step": 61712 + }, + { + "epoch": 5.758421199962676, + "grad_norm": NaN, + "learning_rate": 1.2764512080851774e-06, + "loss": 0.0, + "step": 61713 + }, + { + "epoch": 5.758514509657553, + "grad_norm": NaN, + "learning_rate": 1.2754667407303575e-06, + "loss": 0.0, + "step": 61714 + }, + { + "epoch": 5.758607819352431, + "grad_norm": NaN, + "learning_rate": 1.274482651537989e-06, + "loss": 0.0, + "step": 61715 + }, + { + "epoch": 5.758701129047308, + "grad_norm": NaN, + "learning_rate": 1.2734989405105368e-06, + "loss": 0.0, + "step": 61716 + }, + { + "epoch": 5.758794438742186, + "grad_norm": NaN, + "learning_rate": 1.2725156076505315e-06, + "loss": 0.0, + "step": 61717 + }, + { + "epoch": 5.758887748437063, + "grad_norm": NaN, + "learning_rate": 1.271532652960472e-06, + "loss": 0.0, + "step": 61718 + }, + { + "epoch": 5.75898105813194, + "grad_norm": NaN, + "learning_rate": 1.270550076442839e-06, + "loss": 0.0, + "step": 61719 + }, + { + "epoch": 5.759074367826817, + "grad_norm": NaN, + "learning_rate": 1.2695678781001473e-06, + "loss": 0.0, + "step": 61720 + }, + { + "epoch": 5.7591676775216945, + "grad_norm": NaN, + "learning_rate": 1.2685860579348783e-06, + "loss": 0.0, + "step": 61721 + }, + { + "epoch": 5.759260987216572, + "grad_norm": NaN, + "learning_rate": 1.2676046159495301e-06, + "loss": 0.0, + "step": 61722 + }, + { + "epoch": 5.759354296911449, + "grad_norm": NaN, + "learning_rate": 1.2666235521466173e-06, + "loss": 0.0, + "step": 61723 + }, + { + "epoch": 5.759447606606327, + "grad_norm": NaN, + "learning_rate": 1.265642866528621e-06, + "loss": 0.0, + "step": 61724 + }, + { + "epoch": 5.759540916301203, + "grad_norm": NaN, + "learning_rate": 1.2646625590980398e-06, + "loss": 0.0, + "step": 61725 + }, + { + "epoch": 5.759634225996081, + "grad_norm": NaN, + "learning_rate": 1.2636826298573543e-06, + "loss": 0.0, + "step": 61726 + }, + { + "epoch": 5.759727535690958, + "grad_norm": NaN, + "learning_rate": 1.2627030788090797e-06, + "loss": 0.0, + "step": 61727 + }, + { + "epoch": 5.7598208453858355, + "grad_norm": NaN, + "learning_rate": 1.2617239059556805e-06, + "loss": 0.0, + "step": 61728 + }, + { + "epoch": 5.759914155080713, + "grad_norm": NaN, + "learning_rate": 1.2607451112996547e-06, + "loss": 0.0, + "step": 61729 + }, + { + "epoch": 5.76000746477559, + "grad_norm": NaN, + "learning_rate": 1.2597666948435003e-06, + "loss": 0.0, + "step": 61730 + }, + { + "epoch": 5.760100774470468, + "grad_norm": NaN, + "learning_rate": 1.2587886565896987e-06, + "loss": 0.0, + "step": 61731 + }, + { + "epoch": 5.760194084165345, + "grad_norm": NaN, + "learning_rate": 1.2578109965407314e-06, + "loss": 0.0, + "step": 61732 + }, + { + "epoch": 5.760287393860222, + "grad_norm": NaN, + "learning_rate": 1.256833714699096e-06, + "loss": 0.0, + "step": 61733 + }, + { + "epoch": 5.760380703555099, + "grad_norm": NaN, + "learning_rate": 1.255856811067274e-06, + "loss": 0.0, + "step": 61734 + }, + { + "epoch": 5.7604740132499765, + "grad_norm": NaN, + "learning_rate": 1.2548802856477468e-06, + "loss": 0.0, + "step": 61735 + }, + { + "epoch": 5.760567322944854, + "grad_norm": NaN, + "learning_rate": 1.2539041384429793e-06, + "loss": 0.0, + "step": 61736 + }, + { + "epoch": 5.760660632639731, + "grad_norm": NaN, + "learning_rate": 1.2529283694554858e-06, + "loss": 0.0, + "step": 61737 + }, + { + "epoch": 5.760753942334609, + "grad_norm": NaN, + "learning_rate": 1.2519529786877481e-06, + "loss": 0.0, + "step": 61738 + }, + { + "epoch": 5.760847252029486, + "grad_norm": NaN, + "learning_rate": 1.2509779661422137e-06, + "loss": 0.0, + "step": 61739 + }, + { + "epoch": 5.760940561724363, + "grad_norm": NaN, + "learning_rate": 1.250003331821381e-06, + "loss": 0.0, + "step": 61740 + }, + { + "epoch": 5.76103387141924, + "grad_norm": NaN, + "learning_rate": 1.249029075727731e-06, + "loss": 0.0, + "step": 61741 + }, + { + "epoch": 5.761127181114118, + "grad_norm": NaN, + "learning_rate": 1.2480551978637288e-06, + "loss": 0.0, + "step": 61742 + }, + { + "epoch": 5.761220490808995, + "grad_norm": NaN, + "learning_rate": 1.247081698231872e-06, + "loss": 0.0, + "step": 61743 + }, + { + "epoch": 5.761313800503872, + "grad_norm": NaN, + "learning_rate": 1.246108576834609e-06, + "loss": 0.0, + "step": 61744 + }, + { + "epoch": 5.76140711019875, + "grad_norm": NaN, + "learning_rate": 1.2451358336744377e-06, + "loss": 0.0, + "step": 61745 + }, + { + "epoch": 5.761500419893627, + "grad_norm": NaN, + "learning_rate": 1.2441634687538226e-06, + "loss": 0.0, + "step": 61746 + }, + { + "epoch": 5.761593729588505, + "grad_norm": NaN, + "learning_rate": 1.2431914820752286e-06, + "loss": 0.0, + "step": 61747 + }, + { + "epoch": 5.761687039283381, + "grad_norm": NaN, + "learning_rate": 1.2422198736411372e-06, + "loss": 0.0, + "step": 61748 + }, + { + "epoch": 5.761780348978259, + "grad_norm": NaN, + "learning_rate": 1.2412486434540124e-06, + "loss": 0.0, + "step": 61749 + }, + { + "epoch": 5.761873658673136, + "grad_norm": NaN, + "learning_rate": 1.2402777915163198e-06, + "loss": 0.0, + "step": 61750 + }, + { + "epoch": 5.7619669683680135, + "grad_norm": NaN, + "learning_rate": 1.23930731783054e-06, + "loss": 0.0, + "step": 61751 + }, + { + "epoch": 5.762060278062891, + "grad_norm": NaN, + "learning_rate": 1.2383372223991384e-06, + "loss": 0.0, + "step": 61752 + }, + { + "epoch": 5.762153587757768, + "grad_norm": NaN, + "learning_rate": 1.2373675052245791e-06, + "loss": 0.0, + "step": 61753 + }, + { + "epoch": 5.762246897452645, + "grad_norm": NaN, + "learning_rate": 1.236398166309327e-06, + "loss": 0.0, + "step": 61754 + }, + { + "epoch": 5.762340207147522, + "grad_norm": NaN, + "learning_rate": 1.2354292056558468e-06, + "loss": 0.0, + "step": 61755 + }, + { + "epoch": 5.7624335168424, + "grad_norm": NaN, + "learning_rate": 1.2344606232666032e-06, + "loss": 0.0, + "step": 61756 + }, + { + "epoch": 5.762526826537277, + "grad_norm": NaN, + "learning_rate": 1.2334924191440609e-06, + "loss": 0.0, + "step": 61757 + }, + { + "epoch": 5.7626201362321545, + "grad_norm": NaN, + "learning_rate": 1.2325245932906846e-06, + "loss": 0.0, + "step": 61758 + }, + { + "epoch": 5.762713445927032, + "grad_norm": NaN, + "learning_rate": 1.2315571457089224e-06, + "loss": 0.0, + "step": 61759 + }, + { + "epoch": 5.762806755621909, + "grad_norm": NaN, + "learning_rate": 1.2305900764012388e-06, + "loss": 0.0, + "step": 61760 + }, + { + "epoch": 5.762900065316787, + "grad_norm": NaN, + "learning_rate": 1.2296233853700988e-06, + "loss": 0.0, + "step": 61761 + }, + { + "epoch": 5.762993375011664, + "grad_norm": NaN, + "learning_rate": 1.2286570726179668e-06, + "loss": 0.0, + "step": 61762 + }, + { + "epoch": 5.763086684706541, + "grad_norm": NaN, + "learning_rate": 1.2276911381472742e-06, + "loss": 0.0, + "step": 61763 + }, + { + "epoch": 5.763179994401418, + "grad_norm": NaN, + "learning_rate": 1.2267255819605027e-06, + "loss": 0.0, + "step": 61764 + }, + { + "epoch": 5.763273304096296, + "grad_norm": NaN, + "learning_rate": 1.2257604040600999e-06, + "loss": 0.0, + "step": 61765 + }, + { + "epoch": 5.763366613791173, + "grad_norm": NaN, + "learning_rate": 1.2247956044485142e-06, + "loss": 0.0, + "step": 61766 + }, + { + "epoch": 5.76345992348605, + "grad_norm": NaN, + "learning_rate": 1.2238311831282098e-06, + "loss": 0.0, + "step": 61767 + }, + { + "epoch": 5.763553233180928, + "grad_norm": NaN, + "learning_rate": 1.2228671401016187e-06, + "loss": 0.0, + "step": 61768 + }, + { + "epoch": 5.763646542875804, + "grad_norm": NaN, + "learning_rate": 1.2219034753712053e-06, + "loss": 0.0, + "step": 61769 + }, + { + "epoch": 5.763739852570682, + "grad_norm": NaN, + "learning_rate": 1.2209401889394344e-06, + "loss": 0.0, + "step": 61770 + }, + { + "epoch": 5.763833162265559, + "grad_norm": NaN, + "learning_rate": 1.219977280808737e-06, + "loss": 0.0, + "step": 61771 + }, + { + "epoch": 5.763926471960437, + "grad_norm": NaN, + "learning_rate": 1.2190147509815617e-06, + "loss": 0.0, + "step": 61772 + }, + { + "epoch": 5.764019781655314, + "grad_norm": NaN, + "learning_rate": 1.2180525994603562e-06, + "loss": 0.0, + "step": 61773 + }, + { + "epoch": 5.7641130913501915, + "grad_norm": NaN, + "learning_rate": 1.2170908262475853e-06, + "loss": 0.0, + "step": 61774 + }, + { + "epoch": 5.764206401045069, + "grad_norm": NaN, + "learning_rate": 1.2161294313456638e-06, + "loss": 0.0, + "step": 61775 + }, + { + "epoch": 5.764299710739946, + "grad_norm": NaN, + "learning_rate": 1.2151684147570562e-06, + "loss": 0.0, + "step": 61776 + }, + { + "epoch": 5.764393020434823, + "grad_norm": NaN, + "learning_rate": 1.2142077764841939e-06, + "loss": 0.0, + "step": 61777 + }, + { + "epoch": 5.7644863301297, + "grad_norm": NaN, + "learning_rate": 1.2132475165295418e-06, + "loss": 0.0, + "step": 61778 + }, + { + "epoch": 5.764579639824578, + "grad_norm": NaN, + "learning_rate": 1.2122876348955314e-06, + "loss": 0.0, + "step": 61779 + }, + { + "epoch": 5.764672949519455, + "grad_norm": NaN, + "learning_rate": 1.2113281315845769e-06, + "loss": 0.0, + "step": 61780 + }, + { + "epoch": 5.7647662592143325, + "grad_norm": NaN, + "learning_rate": 1.21036900659916e-06, + "loss": 0.0, + "step": 61781 + }, + { + "epoch": 5.76485956890921, + "grad_norm": NaN, + "learning_rate": 1.2094102599416955e-06, + "loss": 0.0, + "step": 61782 + }, + { + "epoch": 5.764952878604087, + "grad_norm": NaN, + "learning_rate": 1.2084518916146313e-06, + "loss": 0.0, + "step": 61783 + }, + { + "epoch": 5.765046188298964, + "grad_norm": NaN, + "learning_rate": 1.2074939016203823e-06, + "loss": 0.0, + "step": 61784 + }, + { + "epoch": 5.765139497993841, + "grad_norm": NaN, + "learning_rate": 1.206536289961413e-06, + "loss": 0.0, + "step": 61785 + }, + { + "epoch": 5.765232807688719, + "grad_norm": NaN, + "learning_rate": 1.205579056640138e-06, + "loss": 0.0, + "step": 61786 + }, + { + "epoch": 5.765326117383596, + "grad_norm": NaN, + "learning_rate": 1.2046222016590056e-06, + "loss": 0.0, + "step": 61787 + }, + { + "epoch": 5.7654194270784735, + "grad_norm": NaN, + "learning_rate": 1.2036657250204473e-06, + "loss": 0.0, + "step": 61788 + }, + { + "epoch": 5.765512736773351, + "grad_norm": NaN, + "learning_rate": 1.2027096267268776e-06, + "loss": 0.0, + "step": 61789 + }, + { + "epoch": 5.765606046468228, + "grad_norm": NaN, + "learning_rate": 1.2017539067807447e-06, + "loss": 0.0, + "step": 61790 + }, + { + "epoch": 5.765699356163106, + "grad_norm": NaN, + "learning_rate": 1.2007985651844797e-06, + "loss": 0.0, + "step": 61791 + }, + { + "epoch": 5.765792665857982, + "grad_norm": NaN, + "learning_rate": 1.1998436019404978e-06, + "loss": 0.0, + "step": 61792 + }, + { + "epoch": 5.76588597555286, + "grad_norm": NaN, + "learning_rate": 1.1988890170512467e-06, + "loss": 0.0, + "step": 61793 + }, + { + "epoch": 5.765979285247737, + "grad_norm": NaN, + "learning_rate": 1.197934810519141e-06, + "loss": 0.0, + "step": 61794 + }, + { + "epoch": 5.766072594942615, + "grad_norm": NaN, + "learning_rate": 1.196980982346596e-06, + "loss": 0.0, + "step": 61795 + }, + { + "epoch": 5.766165904637492, + "grad_norm": NaN, + "learning_rate": 1.1960275325360592e-06, + "loss": 0.0, + "step": 61796 + }, + { + "epoch": 5.766259214332369, + "grad_norm": NaN, + "learning_rate": 1.1950744610899454e-06, + "loss": 0.0, + "step": 61797 + }, + { + "epoch": 5.766352524027246, + "grad_norm": NaN, + "learning_rate": 1.1941217680106863e-06, + "loss": 0.0, + "step": 61798 + }, + { + "epoch": 5.766445833722123, + "grad_norm": NaN, + "learning_rate": 1.1931694533006797e-06, + "loss": 0.0, + "step": 61799 + }, + { + "epoch": 5.766539143417001, + "grad_norm": NaN, + "learning_rate": 1.1922175169623739e-06, + "loss": 0.0, + "step": 61800 + }, + { + "epoch": 5.766632453111878, + "grad_norm": NaN, + "learning_rate": 1.1912659589981831e-06, + "loss": 0.0, + "step": 61801 + }, + { + "epoch": 5.766725762806756, + "grad_norm": NaN, + "learning_rate": 1.190314779410506e-06, + "loss": 0.0, + "step": 61802 + }, + { + "epoch": 5.766819072501633, + "grad_norm": NaN, + "learning_rate": 1.1893639782017905e-06, + "loss": 0.0, + "step": 61803 + }, + { + "epoch": 5.7669123821965105, + "grad_norm": NaN, + "learning_rate": 1.1884135553744346e-06, + "loss": 0.0, + "step": 61804 + }, + { + "epoch": 5.767005691891388, + "grad_norm": NaN, + "learning_rate": 1.1874635109308694e-06, + "loss": 0.0, + "step": 61805 + }, + { + "epoch": 5.767099001586264, + "grad_norm": NaN, + "learning_rate": 1.1865138448734934e-06, + "loss": 0.0, + "step": 61806 + }, + { + "epoch": 5.767192311281142, + "grad_norm": NaN, + "learning_rate": 1.1855645572047378e-06, + "loss": 0.0, + "step": 61807 + }, + { + "epoch": 5.767285620976019, + "grad_norm": NaN, + "learning_rate": 1.1846156479270009e-06, + "loss": 0.0, + "step": 61808 + }, + { + "epoch": 5.767378930670897, + "grad_norm": NaN, + "learning_rate": 1.1836671170427136e-06, + "loss": 0.0, + "step": 61809 + }, + { + "epoch": 5.767472240365774, + "grad_norm": NaN, + "learning_rate": 1.1827189645542745e-06, + "loss": 0.0, + "step": 61810 + }, + { + "epoch": 5.7675655500606515, + "grad_norm": NaN, + "learning_rate": 1.181771190464098e-06, + "loss": 0.0, + "step": 61811 + }, + { + "epoch": 5.767658859755529, + "grad_norm": NaN, + "learning_rate": 1.180823794774599e-06, + "loss": 0.0, + "step": 61812 + }, + { + "epoch": 5.7677521694504055, + "grad_norm": NaN, + "learning_rate": 1.1798767774881756e-06, + "loss": 0.0, + "step": 61813 + }, + { + "epoch": 5.767845479145283, + "grad_norm": NaN, + "learning_rate": 1.1789301386072425e-06, + "loss": 0.0, + "step": 61814 + }, + { + "epoch": 5.76793878884016, + "grad_norm": NaN, + "learning_rate": 1.1779838781342144e-06, + "loss": 0.0, + "step": 61815 + }, + { + "epoch": 5.768032098535038, + "grad_norm": NaN, + "learning_rate": 1.1770379960714727e-06, + "loss": 0.0, + "step": 61816 + }, + { + "epoch": 5.768125408229915, + "grad_norm": NaN, + "learning_rate": 1.1760924924214488e-06, + "loss": 0.0, + "step": 61817 + }, + { + "epoch": 5.768218717924793, + "grad_norm": NaN, + "learning_rate": 1.1751473671865408e-06, + "loss": 0.0, + "step": 61818 + }, + { + "epoch": 5.76831202761967, + "grad_norm": NaN, + "learning_rate": 1.1742026203691468e-06, + "loss": 0.0, + "step": 61819 + }, + { + "epoch": 5.768405337314547, + "grad_norm": NaN, + "learning_rate": 1.1732582519716649e-06, + "loss": 0.0, + "step": 61820 + }, + { + "epoch": 5.768498647009424, + "grad_norm": NaN, + "learning_rate": 1.1723142619964931e-06, + "loss": 0.0, + "step": 61821 + }, + { + "epoch": 5.768591956704301, + "grad_norm": NaN, + "learning_rate": 1.171370650446063e-06, + "loss": 0.0, + "step": 61822 + }, + { + "epoch": 5.768685266399179, + "grad_norm": NaN, + "learning_rate": 1.1704274173227391e-06, + "loss": 0.0, + "step": 61823 + }, + { + "epoch": 5.768778576094056, + "grad_norm": NaN, + "learning_rate": 1.1694845626289195e-06, + "loss": 0.0, + "step": 61824 + }, + { + "epoch": 5.768871885788934, + "grad_norm": NaN, + "learning_rate": 1.1685420863670358e-06, + "loss": 0.0, + "step": 61825 + }, + { + "epoch": 5.768965195483811, + "grad_norm": NaN, + "learning_rate": 1.1675999885394361e-06, + "loss": 0.0, + "step": 61826 + }, + { + "epoch": 5.769058505178688, + "grad_norm": NaN, + "learning_rate": 1.1666582691485682e-06, + "loss": 0.0, + "step": 61827 + }, + { + "epoch": 5.769151814873565, + "grad_norm": NaN, + "learning_rate": 1.1657169281967805e-06, + "loss": 0.0, + "step": 61828 + }, + { + "epoch": 5.769245124568442, + "grad_norm": NaN, + "learning_rate": 1.1647759656865041e-06, + "loss": 0.0, + "step": 61829 + }, + { + "epoch": 5.76933843426332, + "grad_norm": NaN, + "learning_rate": 1.163835381620104e-06, + "loss": 0.0, + "step": 61830 + }, + { + "epoch": 5.769431743958197, + "grad_norm": NaN, + "learning_rate": 1.1628951759999782e-06, + "loss": 0.0, + "step": 61831 + }, + { + "epoch": 5.769525053653075, + "grad_norm": NaN, + "learning_rate": 1.1619553488285417e-06, + "loss": 0.0, + "step": 61832 + }, + { + "epoch": 5.769618363347952, + "grad_norm": NaN, + "learning_rate": 1.161015900108142e-06, + "loss": 0.0, + "step": 61833 + }, + { + "epoch": 5.7697116730428295, + "grad_norm": NaN, + "learning_rate": 1.1600768298411944e-06, + "loss": 0.0, + "step": 61834 + }, + { + "epoch": 5.769804982737707, + "grad_norm": NaN, + "learning_rate": 1.1591381380300802e-06, + "loss": 0.0, + "step": 61835 + }, + { + "epoch": 5.7698982924325835, + "grad_norm": NaN, + "learning_rate": 1.1581998246771973e-06, + "loss": 0.0, + "step": 61836 + }, + { + "epoch": 5.769991602127461, + "grad_norm": NaN, + "learning_rate": 1.1572618897849105e-06, + "loss": 0.0, + "step": 61837 + }, + { + "epoch": 5.770084911822338, + "grad_norm": NaN, + "learning_rate": 1.156324333355635e-06, + "loss": 0.0, + "step": 61838 + }, + { + "epoch": 5.770178221517216, + "grad_norm": NaN, + "learning_rate": 1.1553871553917183e-06, + "loss": 0.0, + "step": 61839 + }, + { + "epoch": 5.770271531212093, + "grad_norm": NaN, + "learning_rate": 1.1544503558955588e-06, + "loss": 0.0, + "step": 61840 + }, + { + "epoch": 5.7703648409069705, + "grad_norm": NaN, + "learning_rate": 1.1535139348695543e-06, + "loss": 0.0, + "step": 61841 + }, + { + "epoch": 5.770458150601847, + "grad_norm": NaN, + "learning_rate": 1.15257789231607e-06, + "loss": 0.0, + "step": 61842 + }, + { + "epoch": 5.7705514602967245, + "grad_norm": NaN, + "learning_rate": 1.151642228237487e-06, + "loss": 0.0, + "step": 61843 + }, + { + "epoch": 5.770644769991602, + "grad_norm": NaN, + "learning_rate": 1.1507069426361703e-06, + "loss": 0.0, + "step": 61844 + }, + { + "epoch": 5.770738079686479, + "grad_norm": NaN, + "learning_rate": 1.1497720355145345e-06, + "loss": 0.0, + "step": 61845 + }, + { + "epoch": 5.770831389381357, + "grad_norm": NaN, + "learning_rate": 1.1488375068749279e-06, + "loss": 0.0, + "step": 61846 + }, + { + "epoch": 5.770924699076234, + "grad_norm": NaN, + "learning_rate": 1.147903356719715e-06, + "loss": 0.0, + "step": 61847 + }, + { + "epoch": 5.771018008771112, + "grad_norm": NaN, + "learning_rate": 1.1469695850513271e-06, + "loss": 0.0, + "step": 61848 + }, + { + "epoch": 5.771111318465989, + "grad_norm": NaN, + "learning_rate": 1.1460361918720795e-06, + "loss": 0.0, + "step": 61849 + }, + { + "epoch": 5.7712046281608655, + "grad_norm": NaN, + "learning_rate": 1.1451031771843699e-06, + "loss": 0.0, + "step": 61850 + }, + { + "epoch": 5.771297937855743, + "grad_norm": NaN, + "learning_rate": 1.1441705409905799e-06, + "loss": 0.0, + "step": 61851 + }, + { + "epoch": 5.77139124755062, + "grad_norm": NaN, + "learning_rate": 1.1432382832930575e-06, + "loss": 0.0, + "step": 61852 + }, + { + "epoch": 5.771484557245498, + "grad_norm": NaN, + "learning_rate": 1.1423064040941843e-06, + "loss": 0.0, + "step": 61853 + }, + { + "epoch": 5.771577866940375, + "grad_norm": NaN, + "learning_rate": 1.1413749033963414e-06, + "loss": 0.0, + "step": 61854 + }, + { + "epoch": 5.771671176635253, + "grad_norm": NaN, + "learning_rate": 1.1404437812018774e-06, + "loss": 0.0, + "step": 61855 + }, + { + "epoch": 5.77176448633013, + "grad_norm": NaN, + "learning_rate": 1.1395130375131735e-06, + "loss": 0.0, + "step": 61856 + }, + { + "epoch": 5.771857796025007, + "grad_norm": NaN, + "learning_rate": 1.138582672332594e-06, + "loss": 0.0, + "step": 61857 + }, + { + "epoch": 5.771951105719884, + "grad_norm": NaN, + "learning_rate": 1.1376526856625046e-06, + "loss": 0.0, + "step": 61858 + }, + { + "epoch": 5.772044415414761, + "grad_norm": NaN, + "learning_rate": 1.1367230775052694e-06, + "loss": 0.0, + "step": 61859 + }, + { + "epoch": 5.772137725109639, + "grad_norm": NaN, + "learning_rate": 1.1357938478632367e-06, + "loss": 0.0, + "step": 61860 + }, + { + "epoch": 5.772231034804516, + "grad_norm": NaN, + "learning_rate": 1.1348649967388046e-06, + "loss": 0.0, + "step": 61861 + }, + { + "epoch": 5.772324344499394, + "grad_norm": NaN, + "learning_rate": 1.1339365241343045e-06, + "loss": 0.0, + "step": 61862 + }, + { + "epoch": 5.772417654194271, + "grad_norm": NaN, + "learning_rate": 1.1330084300521015e-06, + "loss": 0.0, + "step": 61863 + }, + { + "epoch": 5.7725109638891485, + "grad_norm": NaN, + "learning_rate": 1.13208071449456e-06, + "loss": 0.0, + "step": 61864 + }, + { + "epoch": 5.772604273584025, + "grad_norm": NaN, + "learning_rate": 1.1311533774640447e-06, + "loss": 0.0, + "step": 61865 + }, + { + "epoch": 5.7726975832789025, + "grad_norm": NaN, + "learning_rate": 1.130226418962904e-06, + "loss": 0.0, + "step": 61866 + }, + { + "epoch": 5.77279089297378, + "grad_norm": NaN, + "learning_rate": 1.1292998389935027e-06, + "loss": 0.0, + "step": 61867 + }, + { + "epoch": 5.772884202668657, + "grad_norm": NaN, + "learning_rate": 1.1283736375581886e-06, + "loss": 0.0, + "step": 61868 + }, + { + "epoch": 5.772977512363535, + "grad_norm": NaN, + "learning_rate": 1.1274478146593269e-06, + "loss": 0.0, + "step": 61869 + }, + { + "epoch": 5.773070822058412, + "grad_norm": NaN, + "learning_rate": 1.1265223702992654e-06, + "loss": 0.0, + "step": 61870 + }, + { + "epoch": 5.773164131753289, + "grad_norm": NaN, + "learning_rate": 1.1255973044803524e-06, + "loss": 0.0, + "step": 61871 + }, + { + "epoch": 5.773257441448166, + "grad_norm": NaN, + "learning_rate": 1.1246726172049524e-06, + "loss": 0.0, + "step": 61872 + }, + { + "epoch": 5.7733507511430435, + "grad_norm": NaN, + "learning_rate": 1.1237483084753973e-06, + "loss": 0.0, + "step": 61873 + }, + { + "epoch": 5.773444060837921, + "grad_norm": NaN, + "learning_rate": 1.1228243782940682e-06, + "loss": 0.0, + "step": 61874 + }, + { + "epoch": 5.773537370532798, + "grad_norm": NaN, + "learning_rate": 1.12190082666328e-06, + "loss": 0.0, + "step": 61875 + }, + { + "epoch": 5.773630680227676, + "grad_norm": NaN, + "learning_rate": 1.1209776535854142e-06, + "loss": 0.0, + "step": 61876 + }, + { + "epoch": 5.773723989922553, + "grad_norm": NaN, + "learning_rate": 1.1200548590627856e-06, + "loss": 0.0, + "step": 61877 + }, + { + "epoch": 5.773817299617431, + "grad_norm": NaN, + "learning_rate": 1.1191324430977589e-06, + "loss": 0.0, + "step": 61878 + }, + { + "epoch": 5.773910609312308, + "grad_norm": NaN, + "learning_rate": 1.1182104056926823e-06, + "loss": 0.0, + "step": 61879 + }, + { + "epoch": 5.774003919007185, + "grad_norm": NaN, + "learning_rate": 1.117288746849887e-06, + "loss": 0.0, + "step": 61880 + }, + { + "epoch": 5.774097228702062, + "grad_norm": NaN, + "learning_rate": 1.1163674665717216e-06, + "loss": 0.0, + "step": 61881 + }, + { + "epoch": 5.774190538396939, + "grad_norm": NaN, + "learning_rate": 1.1154465648605504e-06, + "loss": 0.0, + "step": 61882 + }, + { + "epoch": 5.774283848091817, + "grad_norm": NaN, + "learning_rate": 1.1145260417186719e-06, + "loss": 0.0, + "step": 61883 + }, + { + "epoch": 5.774377157786694, + "grad_norm": NaN, + "learning_rate": 1.1136058971484674e-06, + "loss": 0.0, + "step": 61884 + }, + { + "epoch": 5.774470467481572, + "grad_norm": NaN, + "learning_rate": 1.1126861311522518e-06, + "loss": 0.0, + "step": 61885 + }, + { + "epoch": 5.774563777176448, + "grad_norm": NaN, + "learning_rate": 1.111766743732373e-06, + "loss": 0.0, + "step": 61886 + }, + { + "epoch": 5.774657086871326, + "grad_norm": NaN, + "learning_rate": 1.1108477348911792e-06, + "loss": 0.0, + "step": 61887 + }, + { + "epoch": 5.774750396566203, + "grad_norm": NaN, + "learning_rate": 1.1099291046309689e-06, + "loss": 0.0, + "step": 61888 + }, + { + "epoch": 5.7748437062610805, + "grad_norm": NaN, + "learning_rate": 1.109010852954123e-06, + "loss": 0.0, + "step": 61889 + }, + { + "epoch": 5.774937015955958, + "grad_norm": NaN, + "learning_rate": 1.1080929798629568e-06, + "loss": 0.0, + "step": 61890 + }, + { + "epoch": 5.775030325650835, + "grad_norm": NaN, + "learning_rate": 1.1071754853598015e-06, + "loss": 0.0, + "step": 61891 + }, + { + "epoch": 5.775123635345713, + "grad_norm": NaN, + "learning_rate": 1.106258369447005e-06, + "loss": 0.0, + "step": 61892 + }, + { + "epoch": 5.77521694504059, + "grad_norm": NaN, + "learning_rate": 1.105341632126866e-06, + "loss": 0.0, + "step": 61893 + }, + { + "epoch": 5.775310254735467, + "grad_norm": NaN, + "learning_rate": 1.1044252734017488e-06, + "loss": 0.0, + "step": 61894 + }, + { + "epoch": 5.775403564430344, + "grad_norm": NaN, + "learning_rate": 1.1035092932739853e-06, + "loss": 0.0, + "step": 61895 + }, + { + "epoch": 5.7754968741252215, + "grad_norm": NaN, + "learning_rate": 1.1025936917458734e-06, + "loss": 0.0, + "step": 61896 + }, + { + "epoch": 5.775590183820099, + "grad_norm": NaN, + "learning_rate": 1.101678468819761e-06, + "loss": 0.0, + "step": 61897 + }, + { + "epoch": 5.775683493514976, + "grad_norm": NaN, + "learning_rate": 1.10076362449798e-06, + "loss": 0.0, + "step": 61898 + }, + { + "epoch": 5.775776803209854, + "grad_norm": NaN, + "learning_rate": 1.099849158782845e-06, + "loss": 0.0, + "step": 61899 + }, + { + "epoch": 5.775870112904731, + "grad_norm": NaN, + "learning_rate": 1.0989350716766877e-06, + "loss": 0.0, + "step": 61900 + }, + { + "epoch": 5.775963422599608, + "grad_norm": NaN, + "learning_rate": 1.0980213631818391e-06, + "loss": 0.0, + "step": 61901 + }, + { + "epoch": 5.776056732294485, + "grad_norm": NaN, + "learning_rate": 1.0971080333005977e-06, + "loss": 0.0, + "step": 61902 + }, + { + "epoch": 5.7761500419893625, + "grad_norm": NaN, + "learning_rate": 1.0961950820353117e-06, + "loss": 0.0, + "step": 61903 + }, + { + "epoch": 5.77624335168424, + "grad_norm": NaN, + "learning_rate": 1.0952825093882955e-06, + "loss": 0.0, + "step": 61904 + }, + { + "epoch": 5.776336661379117, + "grad_norm": NaN, + "learning_rate": 1.0943703153618644e-06, + "loss": 0.0, + "step": 61905 + }, + { + "epoch": 5.776429971073995, + "grad_norm": NaN, + "learning_rate": 1.093458499958333e-06, + "loss": 0.0, + "step": 61906 + }, + { + "epoch": 5.776523280768872, + "grad_norm": NaN, + "learning_rate": 1.0925470631800327e-06, + "loss": 0.0, + "step": 61907 + }, + { + "epoch": 5.77661659046375, + "grad_norm": NaN, + "learning_rate": 1.0916360050292782e-06, + "loss": 0.0, + "step": 61908 + }, + { + "epoch": 5.776709900158626, + "grad_norm": NaN, + "learning_rate": 1.0907253255083848e-06, + "loss": 0.0, + "step": 61909 + }, + { + "epoch": 5.776803209853504, + "grad_norm": NaN, + "learning_rate": 1.0898150246196669e-06, + "loss": 0.0, + "step": 61910 + }, + { + "epoch": 5.776896519548381, + "grad_norm": NaN, + "learning_rate": 1.0889051023654227e-06, + "loss": 0.0, + "step": 61911 + }, + { + "epoch": 5.776989829243258, + "grad_norm": NaN, + "learning_rate": 1.0879955587480006e-06, + "loss": 0.0, + "step": 61912 + }, + { + "epoch": 5.777083138938136, + "grad_norm": NaN, + "learning_rate": 1.0870863937696982e-06, + "loss": 0.0, + "step": 61913 + }, + { + "epoch": 5.777176448633013, + "grad_norm": NaN, + "learning_rate": 1.0861776074328144e-06, + "loss": 0.0, + "step": 61914 + }, + { + "epoch": 5.77726975832789, + "grad_norm": NaN, + "learning_rate": 1.0852691997396635e-06, + "loss": 0.0, + "step": 61915 + }, + { + "epoch": 5.777363068022767, + "grad_norm": NaN, + "learning_rate": 1.0843611706925603e-06, + "loss": 0.0, + "step": 61916 + }, + { + "epoch": 5.777456377717645, + "grad_norm": NaN, + "learning_rate": 1.0834535202938366e-06, + "loss": 0.0, + "step": 61917 + }, + { + "epoch": 5.777549687412522, + "grad_norm": NaN, + "learning_rate": 1.082546248545757e-06, + "loss": 0.0, + "step": 61918 + }, + { + "epoch": 5.7776429971073995, + "grad_norm": NaN, + "learning_rate": 1.081639355450653e-06, + "loss": 0.0, + "step": 61919 + }, + { + "epoch": 5.777736306802277, + "grad_norm": NaN, + "learning_rate": 1.0807328410108396e-06, + "loss": 0.0, + "step": 61920 + }, + { + "epoch": 5.777829616497154, + "grad_norm": NaN, + "learning_rate": 1.0798267052285981e-06, + "loss": 0.0, + "step": 61921 + }, + { + "epoch": 5.777922926192032, + "grad_norm": NaN, + "learning_rate": 1.0789209481062433e-06, + "loss": 0.0, + "step": 61922 + }, + { + "epoch": 5.778016235886908, + "grad_norm": NaN, + "learning_rate": 1.0780155696460735e-06, + "loss": 0.0, + "step": 61923 + }, + { + "epoch": 5.778109545581786, + "grad_norm": NaN, + "learning_rate": 1.0771105698504034e-06, + "loss": 0.0, + "step": 61924 + }, + { + "epoch": 5.778202855276663, + "grad_norm": NaN, + "learning_rate": 1.0762059487215314e-06, + "loss": 0.0, + "step": 61925 + }, + { + "epoch": 5.7782961649715405, + "grad_norm": NaN, + "learning_rate": 1.0753017062617386e-06, + "loss": 0.0, + "step": 61926 + }, + { + "epoch": 5.778389474666418, + "grad_norm": NaN, + "learning_rate": 1.0743978424733568e-06, + "loss": 0.0, + "step": 61927 + }, + { + "epoch": 5.778482784361295, + "grad_norm": NaN, + "learning_rate": 1.0734943573586674e-06, + "loss": 0.0, + "step": 61928 + }, + { + "epoch": 5.778576094056173, + "grad_norm": NaN, + "learning_rate": 1.0725912509199518e-06, + "loss": 0.0, + "step": 61929 + }, + { + "epoch": 5.778669403751049, + "grad_norm": NaN, + "learning_rate": 1.0716885231595419e-06, + "loss": 0.0, + "step": 61930 + }, + { + "epoch": 5.778762713445927, + "grad_norm": NaN, + "learning_rate": 1.0707861740796853e-06, + "loss": 0.0, + "step": 61931 + }, + { + "epoch": 5.778856023140804, + "grad_norm": NaN, + "learning_rate": 1.0698842036827137e-06, + "loss": 0.0, + "step": 61932 + }, + { + "epoch": 5.778949332835682, + "grad_norm": NaN, + "learning_rate": 1.0689826119709255e-06, + "loss": 0.0, + "step": 61933 + }, + { + "epoch": 5.779042642530559, + "grad_norm": NaN, + "learning_rate": 1.0680813989465852e-06, + "loss": 0.0, + "step": 61934 + }, + { + "epoch": 5.779135952225436, + "grad_norm": NaN, + "learning_rate": 1.0671805646119914e-06, + "loss": 0.0, + "step": 61935 + }, + { + "epoch": 5.779229261920314, + "grad_norm": NaN, + "learning_rate": 1.0662801089694418e-06, + "loss": 0.0, + "step": 61936 + }, + { + "epoch": 5.779322571615191, + "grad_norm": NaN, + "learning_rate": 1.0653800320212347e-06, + "loss": 0.0, + "step": 61937 + }, + { + "epoch": 5.779415881310068, + "grad_norm": NaN, + "learning_rate": 1.064480333769635e-06, + "loss": 0.0, + "step": 61938 + }, + { + "epoch": 5.779509191004945, + "grad_norm": NaN, + "learning_rate": 1.0635810142169576e-06, + "loss": 0.0, + "step": 61939 + }, + { + "epoch": 5.779602500699823, + "grad_norm": NaN, + "learning_rate": 1.0626820733654673e-06, + "loss": 0.0, + "step": 61940 + }, + { + "epoch": 5.7796958103947, + "grad_norm": NaN, + "learning_rate": 1.0617835112174456e-06, + "loss": 0.0, + "step": 61941 + }, + { + "epoch": 5.7797891200895775, + "grad_norm": NaN, + "learning_rate": 1.060885327775224e-06, + "loss": 0.0, + "step": 61942 + }, + { + "epoch": 5.779882429784455, + "grad_norm": NaN, + "learning_rate": 1.0599875230410338e-06, + "loss": 0.0, + "step": 61943 + }, + { + "epoch": 5.779975739479331, + "grad_norm": NaN, + "learning_rate": 1.0590900970171568e-06, + "loss": 0.0, + "step": 61944 + }, + { + "epoch": 5.780069049174209, + "grad_norm": NaN, + "learning_rate": 1.0581930497059244e-06, + "loss": 0.0, + "step": 61945 + }, + { + "epoch": 5.780162358869086, + "grad_norm": NaN, + "learning_rate": 1.057296381109568e-06, + "loss": 0.0, + "step": 61946 + }, + { + "epoch": 5.780255668563964, + "grad_norm": NaN, + "learning_rate": 1.0564000912304027e-06, + "loss": 0.0, + "step": 61947 + }, + { + "epoch": 5.780348978258841, + "grad_norm": NaN, + "learning_rate": 1.0555041800706764e-06, + "loss": 0.0, + "step": 61948 + }, + { + "epoch": 5.7804422879537185, + "grad_norm": NaN, + "learning_rate": 1.0546086476326876e-06, + "loss": 0.0, + "step": 61949 + }, + { + "epoch": 5.780535597648596, + "grad_norm": NaN, + "learning_rate": 1.053713493918701e-06, + "loss": 0.0, + "step": 61950 + }, + { + "epoch": 5.780628907343473, + "grad_norm": NaN, + "learning_rate": 1.0528187189310144e-06, + "loss": 0.0, + "step": 61951 + }, + { + "epoch": 5.780722217038351, + "grad_norm": NaN, + "learning_rate": 1.0519243226718765e-06, + "loss": 0.0, + "step": 61952 + }, + { + "epoch": 5.780815526733227, + "grad_norm": NaN, + "learning_rate": 1.0510303051435853e-06, + "loss": 0.0, + "step": 61953 + }, + { + "epoch": 5.780908836428105, + "grad_norm": NaN, + "learning_rate": 1.050136666348389e-06, + "loss": 0.0, + "step": 61954 + }, + { + "epoch": 5.781002146122982, + "grad_norm": NaN, + "learning_rate": 1.0492434062885858e-06, + "loss": 0.0, + "step": 61955 + }, + { + "epoch": 5.7810954558178596, + "grad_norm": NaN, + "learning_rate": 1.0483505249664236e-06, + "loss": 0.0, + "step": 61956 + }, + { + "epoch": 5.781188765512737, + "grad_norm": NaN, + "learning_rate": 1.0474580223841844e-06, + "loss": 0.0, + "step": 61957 + }, + { + "epoch": 5.781282075207614, + "grad_norm": NaN, + "learning_rate": 1.0465658985441327e-06, + "loss": 0.0, + "step": 61958 + }, + { + "epoch": 5.781375384902491, + "grad_norm": NaN, + "learning_rate": 1.0456741534485503e-06, + "loss": 0.0, + "step": 61959 + }, + { + "epoch": 5.781468694597368, + "grad_norm": NaN, + "learning_rate": 1.0447827870997016e-06, + "loss": 0.0, + "step": 61960 + }, + { + "epoch": 5.781562004292246, + "grad_norm": NaN, + "learning_rate": 1.0438917994998186e-06, + "loss": 0.0, + "step": 61961 + }, + { + "epoch": 5.781655313987123, + "grad_norm": NaN, + "learning_rate": 1.043001190651216e-06, + "loss": 0.0, + "step": 61962 + }, + { + "epoch": 5.781748623682001, + "grad_norm": NaN, + "learning_rate": 1.0421109605561418e-06, + "loss": 0.0, + "step": 61963 + }, + { + "epoch": 5.781841933376878, + "grad_norm": NaN, + "learning_rate": 1.0412211092168444e-06, + "loss": 0.0, + "step": 61964 + }, + { + "epoch": 5.781935243071755, + "grad_norm": NaN, + "learning_rate": 1.040331636635605e-06, + "loss": 0.0, + "step": 61965 + }, + { + "epoch": 5.782028552766633, + "grad_norm": NaN, + "learning_rate": 1.0394425428146724e-06, + "loss": 0.0, + "step": 61966 + }, + { + "epoch": 5.782121862461509, + "grad_norm": NaN, + "learning_rate": 1.0385538277562943e-06, + "loss": 0.0, + "step": 61967 + }, + { + "epoch": 5.782215172156387, + "grad_norm": NaN, + "learning_rate": 1.0376654914627691e-06, + "loss": 0.0, + "step": 61968 + }, + { + "epoch": 5.782308481851264, + "grad_norm": NaN, + "learning_rate": 1.0367775339363448e-06, + "loss": 0.0, + "step": 61969 + }, + { + "epoch": 5.782401791546142, + "grad_norm": NaN, + "learning_rate": 1.0358899551792366e-06, + "loss": 0.0, + "step": 61970 + }, + { + "epoch": 5.782495101241019, + "grad_norm": NaN, + "learning_rate": 1.035002755193759e-06, + "loss": 0.0, + "step": 61971 + }, + { + "epoch": 5.7825884109358965, + "grad_norm": NaN, + "learning_rate": 1.0341159339821437e-06, + "loss": 0.0, + "step": 61972 + }, + { + "epoch": 5.782681720630774, + "grad_norm": NaN, + "learning_rate": 1.0332294915466222e-06, + "loss": 0.0, + "step": 61973 + }, + { + "epoch": 5.78277503032565, + "grad_norm": NaN, + "learning_rate": 1.0323434278894927e-06, + "loss": 0.0, + "step": 61974 + }, + { + "epoch": 5.782868340020528, + "grad_norm": NaN, + "learning_rate": 1.0314577430129866e-06, + "loss": 0.0, + "step": 61975 + }, + { + "epoch": 5.782961649715405, + "grad_norm": NaN, + "learning_rate": 1.0305724369193358e-06, + "loss": 0.0, + "step": 61976 + }, + { + "epoch": 5.783054959410283, + "grad_norm": NaN, + "learning_rate": 1.0296875096108215e-06, + "loss": 0.0, + "step": 61977 + }, + { + "epoch": 5.78314826910516, + "grad_norm": NaN, + "learning_rate": 1.028802961089692e-06, + "loss": 0.0, + "step": 61978 + }, + { + "epoch": 5.7832415788000375, + "grad_norm": NaN, + "learning_rate": 1.0279187913581787e-06, + "loss": 0.0, + "step": 61979 + }, + { + "epoch": 5.783334888494915, + "grad_norm": NaN, + "learning_rate": 1.0270350004185467e-06, + "loss": 0.0, + "step": 61980 + }, + { + "epoch": 5.783428198189792, + "grad_norm": NaN, + "learning_rate": 1.0261515882730275e-06, + "loss": 0.0, + "step": 61981 + }, + { + "epoch": 5.783521507884669, + "grad_norm": NaN, + "learning_rate": 1.0252685549238692e-06, + "loss": 0.0, + "step": 61982 + }, + { + "epoch": 5.783614817579546, + "grad_norm": NaN, + "learning_rate": 1.0243859003733367e-06, + "loss": 0.0, + "step": 61983 + }, + { + "epoch": 5.783708127274424, + "grad_norm": NaN, + "learning_rate": 1.0235036246236617e-06, + "loss": 0.0, + "step": 61984 + }, + { + "epoch": 5.783801436969301, + "grad_norm": NaN, + "learning_rate": 1.0226217276770755e-06, + "loss": 0.0, + "step": 61985 + }, + { + "epoch": 5.783894746664179, + "grad_norm": NaN, + "learning_rate": 1.021740209535843e-06, + "loss": 0.0, + "step": 61986 + }, + { + "epoch": 5.783988056359056, + "grad_norm": NaN, + "learning_rate": 1.0208590702021957e-06, + "loss": 0.0, + "step": 61987 + }, + { + "epoch": 5.7840813660539325, + "grad_norm": NaN, + "learning_rate": 1.0199783096783654e-06, + "loss": 0.0, + "step": 61988 + }, + { + "epoch": 5.78417467574881, + "grad_norm": NaN, + "learning_rate": 1.0190979279666e-06, + "loss": 0.0, + "step": 61989 + }, + { + "epoch": 5.784267985443687, + "grad_norm": NaN, + "learning_rate": 1.018217925069148e-06, + "loss": 0.0, + "step": 61990 + }, + { + "epoch": 5.784361295138565, + "grad_norm": NaN, + "learning_rate": 1.017338300988224e-06, + "loss": 0.0, + "step": 61991 + }, + { + "epoch": 5.784454604833442, + "grad_norm": NaN, + "learning_rate": 1.0164590557260766e-06, + "loss": 0.0, + "step": 61992 + }, + { + "epoch": 5.78454791452832, + "grad_norm": NaN, + "learning_rate": 1.0155801892849536e-06, + "loss": 0.0, + "step": 61993 + }, + { + "epoch": 5.784641224223197, + "grad_norm": NaN, + "learning_rate": 1.0147017016670534e-06, + "loss": 0.0, + "step": 61994 + }, + { + "epoch": 5.7847345339180745, + "grad_norm": NaN, + "learning_rate": 1.0138235928746574e-06, + "loss": 0.0, + "step": 61995 + }, + { + "epoch": 5.784827843612952, + "grad_norm": NaN, + "learning_rate": 1.0129458629099641e-06, + "loss": 0.0, + "step": 61996 + }, + { + "epoch": 5.784921153307828, + "grad_norm": NaN, + "learning_rate": 1.0120685117752213e-06, + "loss": 0.0, + "step": 61997 + }, + { + "epoch": 5.785014463002706, + "grad_norm": NaN, + "learning_rate": 1.011191539472661e-06, + "loss": 0.0, + "step": 61998 + }, + { + "epoch": 5.785107772697583, + "grad_norm": NaN, + "learning_rate": 1.0103149460044813e-06, + "loss": 0.0, + "step": 61999 + }, + { + "epoch": 5.785201082392461, + "grad_norm": NaN, + "learning_rate": 1.0094387313729635e-06, + "loss": 0.0, + "step": 62000 + }, + { + "epoch": 5.785294392087338, + "grad_norm": NaN, + "learning_rate": 1.008562895580306e-06, + "loss": 0.0, + "step": 62001 + }, + { + "epoch": 5.7853877017822155, + "grad_norm": NaN, + "learning_rate": 1.0076874386287236e-06, + "loss": 0.0, + "step": 62002 + }, + { + "epoch": 5.785481011477092, + "grad_norm": NaN, + "learning_rate": 1.0068123605204647e-06, + "loss": 0.0, + "step": 62003 + }, + { + "epoch": 5.7855743211719695, + "grad_norm": NaN, + "learning_rate": 1.0059376612577442e-06, + "loss": 0.0, + "step": 62004 + }, + { + "epoch": 5.785667630866847, + "grad_norm": NaN, + "learning_rate": 1.0050633408427933e-06, + "loss": 0.0, + "step": 62005 + }, + { + "epoch": 5.785760940561724, + "grad_norm": NaN, + "learning_rate": 1.004189399277827e-06, + "loss": 0.0, + "step": 62006 + }, + { + "epoch": 5.785854250256602, + "grad_norm": NaN, + "learning_rate": 1.0033158365650773e-06, + "loss": 0.0, + "step": 62007 + }, + { + "epoch": 5.785947559951479, + "grad_norm": NaN, + "learning_rate": 1.0024426527067586e-06, + "loss": 0.0, + "step": 62008 + }, + { + "epoch": 5.7860408696463566, + "grad_norm": NaN, + "learning_rate": 1.001569847705086e-06, + "loss": 0.0, + "step": 62009 + }, + { + "epoch": 5.786134179341234, + "grad_norm": NaN, + "learning_rate": 1.000697421562291e-06, + "loss": 0.0, + "step": 62010 + }, + { + "epoch": 5.7862274890361105, + "grad_norm": NaN, + "learning_rate": 9.998253742805717e-07, + "loss": 0.0, + "step": 62011 + }, + { + "epoch": 5.786320798730988, + "grad_norm": NaN, + "learning_rate": 9.989537058621766e-07, + "loss": 0.0, + "step": 62012 + }, + { + "epoch": 5.786414108425865, + "grad_norm": NaN, + "learning_rate": 9.980824163093038e-07, + "loss": 0.0, + "step": 62013 + }, + { + "epoch": 5.786507418120743, + "grad_norm": NaN, + "learning_rate": 9.972115056241513e-07, + "loss": 0.0, + "step": 62014 + }, + { + "epoch": 5.78660072781562, + "grad_norm": NaN, + "learning_rate": 9.963409738089844e-07, + "loss": 0.0, + "step": 62015 + }, + { + "epoch": 5.786694037510498, + "grad_norm": NaN, + "learning_rate": 9.954708208659678e-07, + "loss": 0.0, + "step": 62016 + }, + { + "epoch": 5.786787347205375, + "grad_norm": NaN, + "learning_rate": 9.946010467973165e-07, + "loss": 0.0, + "step": 62017 + }, + { + "epoch": 5.7868806569002516, + "grad_norm": NaN, + "learning_rate": 9.937316516052784e-07, + "loss": 0.0, + "step": 62018 + }, + { + "epoch": 5.786973966595129, + "grad_norm": NaN, + "learning_rate": 9.928626352920188e-07, + "loss": 0.0, + "step": 62019 + }, + { + "epoch": 5.787067276290006, + "grad_norm": NaN, + "learning_rate": 9.919939978597858e-07, + "loss": 0.0, + "step": 62020 + }, + { + "epoch": 5.787160585984884, + "grad_norm": NaN, + "learning_rate": 9.911257393107775e-07, + "loss": 0.0, + "step": 62021 + }, + { + "epoch": 5.787253895679761, + "grad_norm": NaN, + "learning_rate": 9.902578596471755e-07, + "loss": 0.0, + "step": 62022 + }, + { + "epoch": 5.787347205374639, + "grad_norm": NaN, + "learning_rate": 9.893903588712282e-07, + "loss": 0.0, + "step": 62023 + }, + { + "epoch": 5.787440515069516, + "grad_norm": NaN, + "learning_rate": 9.88523236985117e-07, + "loss": 0.0, + "step": 62024 + }, + { + "epoch": 5.7875338247643935, + "grad_norm": NaN, + "learning_rate": 9.876564939910403e-07, + "loss": 0.0, + "step": 62025 + }, + { + "epoch": 5.78762713445927, + "grad_norm": NaN, + "learning_rate": 9.867901298912128e-07, + "loss": 0.0, + "step": 62026 + }, + { + "epoch": 5.787720444154147, + "grad_norm": NaN, + "learning_rate": 9.859241446878497e-07, + "loss": 0.0, + "step": 62027 + }, + { + "epoch": 5.787813753849025, + "grad_norm": NaN, + "learning_rate": 9.850585383831321e-07, + "loss": 0.0, + "step": 62028 + }, + { + "epoch": 5.787907063543902, + "grad_norm": NaN, + "learning_rate": 9.841933109792755e-07, + "loss": 0.0, + "step": 62029 + }, + { + "epoch": 5.78800037323878, + "grad_norm": NaN, + "learning_rate": 9.833284624784776e-07, + "loss": 0.0, + "step": 62030 + }, + { + "epoch": 5.788093682933657, + "grad_norm": NaN, + "learning_rate": 9.824639928829204e-07, + "loss": 0.0, + "step": 62031 + }, + { + "epoch": 5.788186992628534, + "grad_norm": NaN, + "learning_rate": 9.815999021948183e-07, + "loss": 0.0, + "step": 62032 + }, + { + "epoch": 5.788280302323411, + "grad_norm": NaN, + "learning_rate": 9.807361904163702e-07, + "loss": 0.0, + "step": 62033 + }, + { + "epoch": 5.7883736120182885, + "grad_norm": NaN, + "learning_rate": 9.798728575497572e-07, + "loss": 0.0, + "step": 62034 + }, + { + "epoch": 5.788466921713166, + "grad_norm": NaN, + "learning_rate": 9.790099035971943e-07, + "loss": 0.0, + "step": 62035 + }, + { + "epoch": 5.788560231408043, + "grad_norm": NaN, + "learning_rate": 9.781473285608632e-07, + "loss": 0.0, + "step": 62036 + }, + { + "epoch": 5.788653541102921, + "grad_norm": NaN, + "learning_rate": 9.772851324429453e-07, + "loss": 0.0, + "step": 62037 + }, + { + "epoch": 5.788746850797798, + "grad_norm": NaN, + "learning_rate": 9.764233152456724e-07, + "loss": 0.0, + "step": 62038 + }, + { + "epoch": 5.788840160492676, + "grad_norm": NaN, + "learning_rate": 9.755618769712093e-07, + "loss": 0.0, + "step": 62039 + }, + { + "epoch": 5.788933470187552, + "grad_norm": NaN, + "learning_rate": 9.747008176217208e-07, + "loss": 0.0, + "step": 62040 + }, + { + "epoch": 5.7890267798824295, + "grad_norm": NaN, + "learning_rate": 9.738401371994552e-07, + "loss": 0.0, + "step": 62041 + }, + { + "epoch": 5.789120089577307, + "grad_norm": NaN, + "learning_rate": 9.72979835706561e-07, + "loss": 0.0, + "step": 62042 + }, + { + "epoch": 5.789213399272184, + "grad_norm": NaN, + "learning_rate": 9.721199131452195e-07, + "loss": 0.0, + "step": 62043 + }, + { + "epoch": 5.789306708967062, + "grad_norm": NaN, + "learning_rate": 9.712603695176458e-07, + "loss": 0.0, + "step": 62044 + }, + { + "epoch": 5.789400018661939, + "grad_norm": NaN, + "learning_rate": 9.704012048260045e-07, + "loss": 0.0, + "step": 62045 + }, + { + "epoch": 5.789493328356817, + "grad_norm": NaN, + "learning_rate": 9.695424190724777e-07, + "loss": 0.0, + "step": 62046 + }, + { + "epoch": 5.789586638051693, + "grad_norm": NaN, + "learning_rate": 9.6868401225928e-07, + "loss": 0.0, + "step": 62047 + }, + { + "epoch": 5.789679947746571, + "grad_norm": NaN, + "learning_rate": 9.678259843885593e-07, + "loss": 0.0, + "step": 62048 + }, + { + "epoch": 5.789773257441448, + "grad_norm": NaN, + "learning_rate": 9.669683354625145e-07, + "loss": 0.0, + "step": 62049 + }, + { + "epoch": 5.789866567136325, + "grad_norm": NaN, + "learning_rate": 9.661110654833271e-07, + "loss": 0.0, + "step": 62050 + }, + { + "epoch": 5.789959876831203, + "grad_norm": NaN, + "learning_rate": 9.652541744531617e-07, + "loss": 0.0, + "step": 62051 + }, + { + "epoch": 5.79005318652608, + "grad_norm": NaN, + "learning_rate": 9.643976623742166e-07, + "loss": 0.0, + "step": 62052 + }, + { + "epoch": 5.790146496220958, + "grad_norm": NaN, + "learning_rate": 9.63541529248657e-07, + "loss": 0.0, + "step": 62053 + }, + { + "epoch": 5.790239805915835, + "grad_norm": NaN, + "learning_rate": 9.626857750786642e-07, + "loss": 0.0, + "step": 62054 + }, + { + "epoch": 5.790333115610712, + "grad_norm": NaN, + "learning_rate": 9.618303998664034e-07, + "loss": 0.0, + "step": 62055 + }, + { + "epoch": 5.790426425305589, + "grad_norm": NaN, + "learning_rate": 9.609754036140726e-07, + "loss": 0.0, + "step": 62056 + }, + { + "epoch": 5.7905197350004665, + "grad_norm": NaN, + "learning_rate": 9.60120786323837e-07, + "loss": 0.0, + "step": 62057 + }, + { + "epoch": 5.790613044695344, + "grad_norm": NaN, + "learning_rate": 9.592665479978446e-07, + "loss": 0.0, + "step": 62058 + }, + { + "epoch": 5.790706354390221, + "grad_norm": NaN, + "learning_rate": 9.584126886383103e-07, + "loss": 0.0, + "step": 62059 + }, + { + "epoch": 5.790799664085099, + "grad_norm": NaN, + "learning_rate": 9.57559208247366e-07, + "loss": 0.0, + "step": 62060 + }, + { + "epoch": 5.790892973779975, + "grad_norm": NaN, + "learning_rate": 9.567061068272096e-07, + "loss": 0.0, + "step": 62061 + }, + { + "epoch": 5.790986283474853, + "grad_norm": NaN, + "learning_rate": 9.558533843800064e-07, + "loss": 0.0, + "step": 62062 + }, + { + "epoch": 5.79107959316973, + "grad_norm": NaN, + "learning_rate": 9.550010409079045e-07, + "loss": 0.0, + "step": 62063 + }, + { + "epoch": 5.7911729028646075, + "grad_norm": NaN, + "learning_rate": 9.541490764130855e-07, + "loss": 0.0, + "step": 62064 + }, + { + "epoch": 5.791266212559485, + "grad_norm": NaN, + "learning_rate": 9.532974908977143e-07, + "loss": 0.0, + "step": 62065 + }, + { + "epoch": 5.791359522254362, + "grad_norm": NaN, + "learning_rate": 9.524462843639557e-07, + "loss": 0.0, + "step": 62066 + }, + { + "epoch": 5.79145283194924, + "grad_norm": NaN, + "learning_rate": 9.51595456813975e-07, + "loss": 0.0, + "step": 62067 + }, + { + "epoch": 5.791546141644117, + "grad_norm": NaN, + "learning_rate": 9.507450082499535e-07, + "loss": 0.0, + "step": 62068 + }, + { + "epoch": 5.791639451338995, + "grad_norm": NaN, + "learning_rate": 9.498949386740229e-07, + "loss": 0.0, + "step": 62069 + }, + { + "epoch": 5.791732761033871, + "grad_norm": NaN, + "learning_rate": 9.490452480883481e-07, + "loss": 0.0, + "step": 62070 + }, + { + "epoch": 5.7918260707287486, + "grad_norm": NaN, + "learning_rate": 9.481959364951108e-07, + "loss": 0.0, + "step": 62071 + }, + { + "epoch": 5.791919380423626, + "grad_norm": NaN, + "learning_rate": 9.473470038964426e-07, + "loss": 0.0, + "step": 62072 + }, + { + "epoch": 5.792012690118503, + "grad_norm": NaN, + "learning_rate": 9.464984502945417e-07, + "loss": 0.0, + "step": 62073 + }, + { + "epoch": 5.792105999813381, + "grad_norm": NaN, + "learning_rate": 9.456502756915397e-07, + "loss": 0.0, + "step": 62074 + }, + { + "epoch": 5.792199309508258, + "grad_norm": NaN, + "learning_rate": 9.448024800895848e-07, + "loss": 0.0, + "step": 62075 + }, + { + "epoch": 5.792292619203135, + "grad_norm": NaN, + "learning_rate": 9.439550634908422e-07, + "loss": 0.0, + "step": 62076 + }, + { + "epoch": 5.792385928898012, + "grad_norm": NaN, + "learning_rate": 9.431080258974766e-07, + "loss": 0.0, + "step": 62077 + }, + { + "epoch": 5.79247923859289, + "grad_norm": NaN, + "learning_rate": 9.422613673116197e-07, + "loss": 0.0, + "step": 62078 + }, + { + "epoch": 5.792572548287767, + "grad_norm": NaN, + "learning_rate": 9.414150877354532e-07, + "loss": 0.0, + "step": 62079 + }, + { + "epoch": 5.792665857982644, + "grad_norm": NaN, + "learning_rate": 9.405691871711086e-07, + "loss": 0.0, + "step": 62080 + }, + { + "epoch": 5.792759167677522, + "grad_norm": NaN, + "learning_rate": 9.397236656207341e-07, + "loss": 0.0, + "step": 62081 + }, + { + "epoch": 5.792852477372399, + "grad_norm": NaN, + "learning_rate": 9.388785230865114e-07, + "loss": 0.0, + "step": 62082 + }, + { + "epoch": 5.792945787067277, + "grad_norm": NaN, + "learning_rate": 9.380337595705556e-07, + "loss": 0.0, + "step": 62083 + }, + { + "epoch": 5.793039096762153, + "grad_norm": NaN, + "learning_rate": 9.371893750749981e-07, + "loss": 0.0, + "step": 62084 + }, + { + "epoch": 5.793132406457031, + "grad_norm": NaN, + "learning_rate": 9.363453696020539e-07, + "loss": 0.0, + "step": 62085 + }, + { + "epoch": 5.793225716151908, + "grad_norm": NaN, + "learning_rate": 9.355017431538047e-07, + "loss": 0.0, + "step": 62086 + }, + { + "epoch": 5.7933190258467855, + "grad_norm": NaN, + "learning_rate": 9.346584957323988e-07, + "loss": 0.0, + "step": 62087 + }, + { + "epoch": 5.793412335541663, + "grad_norm": NaN, + "learning_rate": 9.338156273400344e-07, + "loss": 0.0, + "step": 62088 + }, + { + "epoch": 5.79350564523654, + "grad_norm": NaN, + "learning_rate": 9.329731379788096e-07, + "loss": 0.0, + "step": 62089 + }, + { + "epoch": 5.793598954931418, + "grad_norm": NaN, + "learning_rate": 9.321310276508731e-07, + "loss": 0.0, + "step": 62090 + }, + { + "epoch": 5.793692264626294, + "grad_norm": NaN, + "learning_rate": 9.312892963583729e-07, + "loss": 0.0, + "step": 62091 + }, + { + "epoch": 5.793785574321172, + "grad_norm": NaN, + "learning_rate": 9.304479441034407e-07, + "loss": 0.0, + "step": 62092 + }, + { + "epoch": 5.793878884016049, + "grad_norm": NaN, + "learning_rate": 9.296069708882247e-07, + "loss": 0.0, + "step": 62093 + }, + { + "epoch": 5.7939721937109265, + "grad_norm": NaN, + "learning_rate": 9.287663767148734e-07, + "loss": 0.0, + "step": 62094 + }, + { + "epoch": 5.794065503405804, + "grad_norm": NaN, + "learning_rate": 9.279261615855015e-07, + "loss": 0.0, + "step": 62095 + }, + { + "epoch": 5.794158813100681, + "grad_norm": NaN, + "learning_rate": 9.270863255022409e-07, + "loss": 0.0, + "step": 62096 + }, + { + "epoch": 5.794252122795559, + "grad_norm": NaN, + "learning_rate": 9.262468684672564e-07, + "loss": 0.0, + "step": 62097 + }, + { + "epoch": 5.794345432490436, + "grad_norm": NaN, + "learning_rate": 9.254077904826629e-07, + "loss": 0.0, + "step": 62098 + }, + { + "epoch": 5.794438742185313, + "grad_norm": NaN, + "learning_rate": 9.245690915505921e-07, + "loss": 0.0, + "step": 62099 + }, + { + "epoch": 5.79453205188019, + "grad_norm": NaN, + "learning_rate": 9.237307716731923e-07, + "loss": 0.0, + "step": 62100 + }, + { + "epoch": 5.794625361575068, + "grad_norm": NaN, + "learning_rate": 9.228928308525951e-07, + "loss": 0.0, + "step": 62101 + }, + { + "epoch": 5.794718671269945, + "grad_norm": NaN, + "learning_rate": 9.220552690908989e-07, + "loss": 0.0, + "step": 62102 + }, + { + "epoch": 5.794811980964822, + "grad_norm": NaN, + "learning_rate": 9.212180863902686e-07, + "loss": 0.0, + "step": 62103 + }, + { + "epoch": 5.7949052906597, + "grad_norm": NaN, + "learning_rate": 9.203812827528189e-07, + "loss": 0.0, + "step": 62104 + }, + { + "epoch": 5.794998600354576, + "grad_norm": NaN, + "learning_rate": 9.195448581806819e-07, + "loss": 0.0, + "step": 62105 + }, + { + "epoch": 5.795091910049454, + "grad_norm": NaN, + "learning_rate": 9.187088126759723e-07, + "loss": 0.0, + "step": 62106 + }, + { + "epoch": 5.795185219744331, + "grad_norm": NaN, + "learning_rate": 9.178731462408384e-07, + "loss": 0.0, + "step": 62107 + }, + { + "epoch": 5.795278529439209, + "grad_norm": NaN, + "learning_rate": 9.170378588773786e-07, + "loss": 0.0, + "step": 62108 + }, + { + "epoch": 5.795371839134086, + "grad_norm": NaN, + "learning_rate": 9.16202950587741e-07, + "loss": 0.0, + "step": 62109 + }, + { + "epoch": 5.7954651488289635, + "grad_norm": NaN, + "learning_rate": 9.153684213740242e-07, + "loss": 0.0, + "step": 62110 + }, + { + "epoch": 5.795558458523841, + "grad_norm": NaN, + "learning_rate": 9.145342712383763e-07, + "loss": 0.0, + "step": 62111 + }, + { + "epoch": 5.795651768218718, + "grad_norm": NaN, + "learning_rate": 9.137005001828956e-07, + "loss": 0.0, + "step": 62112 + }, + { + "epoch": 5.795745077913596, + "grad_norm": NaN, + "learning_rate": 9.128671082097139e-07, + "loss": 0.0, + "step": 62113 + }, + { + "epoch": 5.795838387608472, + "grad_norm": NaN, + "learning_rate": 9.12034095320946e-07, + "loss": 0.0, + "step": 62114 + }, + { + "epoch": 5.79593169730335, + "grad_norm": NaN, + "learning_rate": 9.112014615187236e-07, + "loss": 0.0, + "step": 62115 + }, + { + "epoch": 5.796025006998227, + "grad_norm": NaN, + "learning_rate": 9.103692068051449e-07, + "loss": 0.0, + "step": 62116 + }, + { + "epoch": 5.7961183166931045, + "grad_norm": NaN, + "learning_rate": 9.095373311823418e-07, + "loss": 0.0, + "step": 62117 + }, + { + "epoch": 5.796211626387982, + "grad_norm": NaN, + "learning_rate": 9.087058346524123e-07, + "loss": 0.0, + "step": 62118 + }, + { + "epoch": 5.796304936082859, + "grad_norm": NaN, + "learning_rate": 9.078747172174716e-07, + "loss": 0.0, + "step": 62119 + }, + { + "epoch": 5.796398245777736, + "grad_norm": NaN, + "learning_rate": 9.070439788796513e-07, + "loss": 0.0, + "step": 62120 + }, + { + "epoch": 5.796491555472613, + "grad_norm": NaN, + "learning_rate": 9.062136196410664e-07, + "loss": 0.0, + "step": 62121 + }, + { + "epoch": 5.796584865167491, + "grad_norm": NaN, + "learning_rate": 9.053836395037984e-07, + "loss": 0.0, + "step": 62122 + }, + { + "epoch": 5.796678174862368, + "grad_norm": NaN, + "learning_rate": 9.04554038469979e-07, + "loss": 0.0, + "step": 62123 + }, + { + "epoch": 5.796771484557246, + "grad_norm": NaN, + "learning_rate": 9.037248165417233e-07, + "loss": 0.0, + "step": 62124 + }, + { + "epoch": 5.796864794252123, + "grad_norm": NaN, + "learning_rate": 9.028959737211128e-07, + "loss": 0.0, + "step": 62125 + }, + { + "epoch": 5.796958103947, + "grad_norm": NaN, + "learning_rate": 9.020675100102792e-07, + "loss": 0.0, + "step": 62126 + }, + { + "epoch": 5.797051413641878, + "grad_norm": NaN, + "learning_rate": 9.012394254113209e-07, + "loss": 0.0, + "step": 62127 + }, + { + "epoch": 5.797144723336754, + "grad_norm": NaN, + "learning_rate": 9.004117199263361e-07, + "loss": 0.0, + "step": 62128 + }, + { + "epoch": 5.797238033031632, + "grad_norm": NaN, + "learning_rate": 8.995843935574565e-07, + "loss": 0.0, + "step": 62129 + }, + { + "epoch": 5.797331342726509, + "grad_norm": NaN, + "learning_rate": 8.987574463067638e-07, + "loss": 0.0, + "step": 62130 + }, + { + "epoch": 5.797424652421387, + "grad_norm": NaN, + "learning_rate": 8.979308781763395e-07, + "loss": 0.0, + "step": 62131 + }, + { + "epoch": 5.797517962116264, + "grad_norm": NaN, + "learning_rate": 8.971046891683487e-07, + "loss": 0.0, + "step": 62132 + }, + { + "epoch": 5.797611271811141, + "grad_norm": NaN, + "learning_rate": 8.96278879284823e-07, + "loss": 0.0, + "step": 62133 + }, + { + "epoch": 5.797704581506019, + "grad_norm": NaN, + "learning_rate": 8.954534485278942e-07, + "loss": 0.0, + "step": 62134 + }, + { + "epoch": 5.797797891200895, + "grad_norm": NaN, + "learning_rate": 8.946283968996771e-07, + "loss": 0.0, + "step": 62135 + }, + { + "epoch": 5.797891200895773, + "grad_norm": NaN, + "learning_rate": 8.938037244022367e-07, + "loss": 0.0, + "step": 62136 + }, + { + "epoch": 5.79798451059065, + "grad_norm": NaN, + "learning_rate": 8.929794310376881e-07, + "loss": 0.0, + "step": 62137 + }, + { + "epoch": 5.798077820285528, + "grad_norm": NaN, + "learning_rate": 8.921555168081296e-07, + "loss": 0.0, + "step": 62138 + }, + { + "epoch": 5.798171129980405, + "grad_norm": NaN, + "learning_rate": 8.913319817156428e-07, + "loss": 0.0, + "step": 62139 + }, + { + "epoch": 5.7982644396752825, + "grad_norm": NaN, + "learning_rate": 8.905088257623261e-07, + "loss": 0.0, + "step": 62140 + }, + { + "epoch": 5.79835774937016, + "grad_norm": NaN, + "learning_rate": 8.896860489502943e-07, + "loss": 0.0, + "step": 62141 + }, + { + "epoch": 5.798451059065037, + "grad_norm": NaN, + "learning_rate": 8.888636512816128e-07, + "loss": 0.0, + "step": 62142 + }, + { + "epoch": 5.798544368759914, + "grad_norm": NaN, + "learning_rate": 8.880416327583628e-07, + "loss": 0.0, + "step": 62143 + }, + { + "epoch": 5.798637678454791, + "grad_norm": NaN, + "learning_rate": 8.872199933826762e-07, + "loss": 0.0, + "step": 62144 + }, + { + "epoch": 5.798730988149669, + "grad_norm": NaN, + "learning_rate": 8.86398733156618e-07, + "loss": 0.0, + "step": 62145 + }, + { + "epoch": 5.798824297844546, + "grad_norm": NaN, + "learning_rate": 8.855778520822532e-07, + "loss": 0.0, + "step": 62146 + }, + { + "epoch": 5.7989176075394235, + "grad_norm": NaN, + "learning_rate": 8.847573501617134e-07, + "loss": 0.0, + "step": 62147 + }, + { + "epoch": 5.799010917234301, + "grad_norm": NaN, + "learning_rate": 8.839372273970635e-07, + "loss": 0.0, + "step": 62148 + }, + { + "epoch": 5.7991042269291775, + "grad_norm": NaN, + "learning_rate": 8.831174837903854e-07, + "loss": 0.0, + "step": 62149 + }, + { + "epoch": 5.799197536624055, + "grad_norm": NaN, + "learning_rate": 8.822981193437773e-07, + "loss": 0.0, + "step": 62150 + }, + { + "epoch": 5.799290846318932, + "grad_norm": NaN, + "learning_rate": 8.814791340592875e-07, + "loss": 0.0, + "step": 62151 + }, + { + "epoch": 5.79938415601381, + "grad_norm": NaN, + "learning_rate": 8.806605279390478e-07, + "loss": 0.0, + "step": 62152 + }, + { + "epoch": 5.799477465708687, + "grad_norm": NaN, + "learning_rate": 8.79842300985123e-07, + "loss": 0.0, + "step": 62153 + }, + { + "epoch": 5.799570775403565, + "grad_norm": NaN, + "learning_rate": 8.790244531995616e-07, + "loss": 0.0, + "step": 62154 + }, + { + "epoch": 5.799664085098442, + "grad_norm": NaN, + "learning_rate": 8.782069845844786e-07, + "loss": 0.0, + "step": 62155 + }, + { + "epoch": 5.799757394793319, + "grad_norm": NaN, + "learning_rate": 8.773898951419556e-07, + "loss": 0.0, + "step": 62156 + }, + { + "epoch": 5.799850704488196, + "grad_norm": NaN, + "learning_rate": 8.765731848740243e-07, + "loss": 0.0, + "step": 62157 + }, + { + "epoch": 5.799944014183073, + "grad_norm": NaN, + "learning_rate": 8.757568537828163e-07, + "loss": 0.0, + "step": 62158 + }, + { + "epoch": 5.800037323877951, + "grad_norm": NaN, + "learning_rate": 8.749409018703968e-07, + "loss": 0.0, + "step": 62159 + }, + { + "epoch": 5.800130633572828, + "grad_norm": NaN, + "learning_rate": 8.741253291387973e-07, + "loss": 0.0, + "step": 62160 + }, + { + "epoch": 5.800223943267706, + "grad_norm": NaN, + "learning_rate": 8.733101355901495e-07, + "loss": 0.0, + "step": 62161 + }, + { + "epoch": 5.800317252962583, + "grad_norm": NaN, + "learning_rate": 8.724953212265018e-07, + "loss": 0.0, + "step": 62162 + }, + { + "epoch": 5.8004105626574605, + "grad_norm": NaN, + "learning_rate": 8.716808860499025e-07, + "loss": 0.0, + "step": 62163 + }, + { + "epoch": 5.800503872352337, + "grad_norm": NaN, + "learning_rate": 8.7086683006245e-07, + "loss": 0.0, + "step": 62164 + }, + { + "epoch": 5.800597182047214, + "grad_norm": NaN, + "learning_rate": 8.700531532662092e-07, + "loss": 0.0, + "step": 62165 + }, + { + "epoch": 5.800690491742092, + "grad_norm": NaN, + "learning_rate": 8.692398556632451e-07, + "loss": 0.0, + "step": 62166 + }, + { + "epoch": 5.800783801436969, + "grad_norm": NaN, + "learning_rate": 8.684269372556396e-07, + "loss": 0.0, + "step": 62167 + }, + { + "epoch": 5.800877111131847, + "grad_norm": NaN, + "learning_rate": 8.676143980454409e-07, + "loss": 0.0, + "step": 62168 + }, + { + "epoch": 5.800970420826724, + "grad_norm": NaN, + "learning_rate": 8.668022380347139e-07, + "loss": 0.0, + "step": 62169 + }, + { + "epoch": 5.8010637305216015, + "grad_norm": NaN, + "learning_rate": 8.659904572255572e-07, + "loss": 0.0, + "step": 62170 + }, + { + "epoch": 5.801157040216479, + "grad_norm": NaN, + "learning_rate": 8.651790556199856e-07, + "loss": 0.0, + "step": 62171 + }, + { + "epoch": 5.8012503499113555, + "grad_norm": NaN, + "learning_rate": 8.643680332200809e-07, + "loss": 0.0, + "step": 62172 + }, + { + "epoch": 5.801343659606233, + "grad_norm": NaN, + "learning_rate": 8.635573900279413e-07, + "loss": 0.0, + "step": 62173 + }, + { + "epoch": 5.80143696930111, + "grad_norm": NaN, + "learning_rate": 8.627471260455821e-07, + "loss": 0.0, + "step": 62174 + }, + { + "epoch": 5.801530278995988, + "grad_norm": NaN, + "learning_rate": 8.619372412750681e-07, + "loss": 0.0, + "step": 62175 + }, + { + "epoch": 5.801623588690865, + "grad_norm": NaN, + "learning_rate": 8.611277357184809e-07, + "loss": 0.0, + "step": 62176 + }, + { + "epoch": 5.801716898385743, + "grad_norm": NaN, + "learning_rate": 8.603186093778691e-07, + "loss": 0.0, + "step": 62177 + }, + { + "epoch": 5.801810208080619, + "grad_norm": NaN, + "learning_rate": 8.59509862255281e-07, + "loss": 0.0, + "step": 62178 + }, + { + "epoch": 5.8019035177754965, + "grad_norm": NaN, + "learning_rate": 8.587014943527981e-07, + "loss": 0.0, + "step": 62179 + }, + { + "epoch": 5.801996827470374, + "grad_norm": NaN, + "learning_rate": 8.578935056724357e-07, + "loss": 0.0, + "step": 62180 + }, + { + "epoch": 5.802090137165251, + "grad_norm": NaN, + "learning_rate": 8.570858962162752e-07, + "loss": 0.0, + "step": 62181 + }, + { + "epoch": 5.802183446860129, + "grad_norm": NaN, + "learning_rate": 8.562786659863818e-07, + "loss": 0.0, + "step": 62182 + }, + { + "epoch": 5.802276756555006, + "grad_norm": NaN, + "learning_rate": 8.554718149847872e-07, + "loss": 0.0, + "step": 62183 + }, + { + "epoch": 5.802370066249884, + "grad_norm": NaN, + "learning_rate": 8.546653432135398e-07, + "loss": 0.0, + "step": 62184 + }, + { + "epoch": 5.802463375944761, + "grad_norm": NaN, + "learning_rate": 8.538592506747044e-07, + "loss": 0.0, + "step": 62185 + }, + { + "epoch": 5.8025566856396384, + "grad_norm": NaN, + "learning_rate": 8.530535373703129e-07, + "loss": 0.0, + "step": 62186 + }, + { + "epoch": 5.802649995334515, + "grad_norm": NaN, + "learning_rate": 8.522482033024303e-07, + "loss": 0.0, + "step": 62187 + }, + { + "epoch": 5.802743305029392, + "grad_norm": NaN, + "learning_rate": 8.514432484731215e-07, + "loss": 0.0, + "step": 62188 + }, + { + "epoch": 5.80283661472427, + "grad_norm": NaN, + "learning_rate": 8.50638672884385e-07, + "loss": 0.0, + "step": 62189 + }, + { + "epoch": 5.802929924419147, + "grad_norm": NaN, + "learning_rate": 8.498344765383025e-07, + "loss": 0.0, + "step": 62190 + }, + { + "epoch": 5.803023234114025, + "grad_norm": NaN, + "learning_rate": 8.490306594369056e-07, + "loss": 0.0, + "step": 62191 + }, + { + "epoch": 5.803116543808902, + "grad_norm": NaN, + "learning_rate": 8.482272215822428e-07, + "loss": 0.0, + "step": 62192 + }, + { + "epoch": 5.803209853503779, + "grad_norm": NaN, + "learning_rate": 8.474241629763623e-07, + "loss": 0.0, + "step": 62193 + }, + { + "epoch": 5.803303163198656, + "grad_norm": NaN, + "learning_rate": 8.466214836212959e-07, + "loss": 0.0, + "step": 62194 + }, + { + "epoch": 5.803396472893533, + "grad_norm": NaN, + "learning_rate": 8.458191835190919e-07, + "loss": 0.0, + "step": 62195 + }, + { + "epoch": 5.803489782588411, + "grad_norm": NaN, + "learning_rate": 8.450172626717821e-07, + "loss": 0.0, + "step": 62196 + }, + { + "epoch": 5.803583092283288, + "grad_norm": NaN, + "learning_rate": 8.442157210814149e-07, + "loss": 0.0, + "step": 62197 + }, + { + "epoch": 5.803676401978166, + "grad_norm": NaN, + "learning_rate": 8.434145587500219e-07, + "loss": 0.0, + "step": 62198 + }, + { + "epoch": 5.803769711673043, + "grad_norm": NaN, + "learning_rate": 8.426137756796348e-07, + "loss": 0.0, + "step": 62199 + }, + { + "epoch": 5.8038630213679205, + "grad_norm": NaN, + "learning_rate": 8.418133718723185e-07, + "loss": 0.0, + "step": 62200 + }, + { + "epoch": 5.803956331062797, + "grad_norm": NaN, + "learning_rate": 8.410133473300551e-07, + "loss": 0.0, + "step": 62201 + }, + { + "epoch": 5.8040496407576745, + "grad_norm": NaN, + "learning_rate": 8.40213702054926e-07, + "loss": 0.0, + "step": 62202 + }, + { + "epoch": 5.804142950452552, + "grad_norm": NaN, + "learning_rate": 8.39414436048963e-07, + "loss": 0.0, + "step": 62203 + }, + { + "epoch": 5.804236260147429, + "grad_norm": NaN, + "learning_rate": 8.386155493141644e-07, + "loss": 0.0, + "step": 62204 + }, + { + "epoch": 5.804329569842307, + "grad_norm": NaN, + "learning_rate": 8.378170418525953e-07, + "loss": 0.0, + "step": 62205 + }, + { + "epoch": 5.804422879537184, + "grad_norm": NaN, + "learning_rate": 8.370189136662708e-07, + "loss": 0.0, + "step": 62206 + }, + { + "epoch": 5.804516189232062, + "grad_norm": NaN, + "learning_rate": 8.362211647572059e-07, + "loss": 0.0, + "step": 62207 + }, + { + "epoch": 5.804609498926938, + "grad_norm": NaN, + "learning_rate": 8.354237951274656e-07, + "loss": 0.0, + "step": 62208 + }, + { + "epoch": 5.8047028086218155, + "grad_norm": NaN, + "learning_rate": 8.346268047790483e-07, + "loss": 0.0, + "step": 62209 + }, + { + "epoch": 5.804796118316693, + "grad_norm": NaN, + "learning_rate": 8.338301937139858e-07, + "loss": 0.0, + "step": 62210 + }, + { + "epoch": 5.80488942801157, + "grad_norm": NaN, + "learning_rate": 8.330339619343096e-07, + "loss": 0.0, + "step": 62211 + }, + { + "epoch": 5.804982737706448, + "grad_norm": NaN, + "learning_rate": 8.322381094420516e-07, + "loss": 0.0, + "step": 62212 + }, + { + "epoch": 5.805076047401325, + "grad_norm": NaN, + "learning_rate": 8.314426362392101e-07, + "loss": 0.0, + "step": 62213 + }, + { + "epoch": 5.805169357096203, + "grad_norm": NaN, + "learning_rate": 8.306475423278169e-07, + "loss": 0.0, + "step": 62214 + }, + { + "epoch": 5.80526266679108, + "grad_norm": NaN, + "learning_rate": 8.298528277099203e-07, + "loss": 0.0, + "step": 62215 + }, + { + "epoch": 5.805355976485957, + "grad_norm": NaN, + "learning_rate": 8.290584923875021e-07, + "loss": 0.0, + "step": 62216 + }, + { + "epoch": 5.805449286180834, + "grad_norm": NaN, + "learning_rate": 8.282645363626273e-07, + "loss": 0.0, + "step": 62217 + }, + { + "epoch": 5.805542595875711, + "grad_norm": NaN, + "learning_rate": 8.274709596372608e-07, + "loss": 0.0, + "step": 62218 + }, + { + "epoch": 5.805635905570589, + "grad_norm": NaN, + "learning_rate": 8.266777622134513e-07, + "loss": 0.0, + "step": 62219 + }, + { + "epoch": 5.805729215265466, + "grad_norm": NaN, + "learning_rate": 8.258849440932303e-07, + "loss": 0.0, + "step": 62220 + }, + { + "epoch": 5.805822524960344, + "grad_norm": NaN, + "learning_rate": 8.250925052785795e-07, + "loss": 0.0, + "step": 62221 + }, + { + "epoch": 5.80591583465522, + "grad_norm": NaN, + "learning_rate": 8.243004457715308e-07, + "loss": 0.0, + "step": 62222 + }, + { + "epoch": 5.806009144350098, + "grad_norm": NaN, + "learning_rate": 8.235087655740991e-07, + "loss": 0.0, + "step": 62223 + }, + { + "epoch": 5.806102454044975, + "grad_norm": NaN, + "learning_rate": 8.227174646882828e-07, + "loss": 0.0, + "step": 62224 + }, + { + "epoch": 5.8061957637398525, + "grad_norm": NaN, + "learning_rate": 8.219265431161304e-07, + "loss": 0.0, + "step": 62225 + }, + { + "epoch": 5.80628907343473, + "grad_norm": NaN, + "learning_rate": 8.211360008596235e-07, + "loss": 0.0, + "step": 62226 + }, + { + "epoch": 5.806382383129607, + "grad_norm": NaN, + "learning_rate": 8.203458379207605e-07, + "loss": 0.0, + "step": 62227 + }, + { + "epoch": 5.806475692824485, + "grad_norm": NaN, + "learning_rate": 8.195560543015734e-07, + "loss": 0.0, + "step": 62228 + }, + { + "epoch": 5.806569002519362, + "grad_norm": NaN, + "learning_rate": 8.187666500040768e-07, + "loss": 0.0, + "step": 62229 + }, + { + "epoch": 5.806662312214239, + "grad_norm": NaN, + "learning_rate": 8.179776250302527e-07, + "loss": 0.0, + "step": 62230 + }, + { + "epoch": 5.806755621909116, + "grad_norm": NaN, + "learning_rate": 8.171889793821329e-07, + "loss": 0.0, + "step": 62231 + }, + { + "epoch": 5.8068489316039935, + "grad_norm": NaN, + "learning_rate": 8.164007130617156e-07, + "loss": 0.0, + "step": 62232 + }, + { + "epoch": 5.806942241298871, + "grad_norm": NaN, + "learning_rate": 8.156128260709826e-07, + "loss": 0.0, + "step": 62233 + }, + { + "epoch": 5.807035550993748, + "grad_norm": NaN, + "learning_rate": 8.148253184119491e-07, + "loss": 0.0, + "step": 62234 + }, + { + "epoch": 5.807128860688626, + "grad_norm": NaN, + "learning_rate": 8.140381900866466e-07, + "loss": 0.0, + "step": 62235 + }, + { + "epoch": 5.807222170383503, + "grad_norm": NaN, + "learning_rate": 8.132514410970404e-07, + "loss": 0.0, + "step": 62236 + }, + { + "epoch": 5.80731548007838, + "grad_norm": NaN, + "learning_rate": 8.124650714451286e-07, + "loss": 0.0, + "step": 62237 + }, + { + "epoch": 5.807408789773257, + "grad_norm": NaN, + "learning_rate": 8.116790811329598e-07, + "loss": 0.0, + "step": 62238 + }, + { + "epoch": 5.807502099468135, + "grad_norm": NaN, + "learning_rate": 8.108934701624659e-07, + "loss": 0.0, + "step": 62239 + }, + { + "epoch": 5.807595409163012, + "grad_norm": NaN, + "learning_rate": 8.10108238535695e-07, + "loss": 0.0, + "step": 62240 + }, + { + "epoch": 5.807688718857889, + "grad_norm": NaN, + "learning_rate": 8.093233862546288e-07, + "loss": 0.0, + "step": 62241 + }, + { + "epoch": 5.807782028552767, + "grad_norm": NaN, + "learning_rate": 8.085389133212328e-07, + "loss": 0.0, + "step": 62242 + }, + { + "epoch": 5.807875338247644, + "grad_norm": NaN, + "learning_rate": 8.077548197375549e-07, + "loss": 0.0, + "step": 62243 + }, + { + "epoch": 5.807968647942522, + "grad_norm": NaN, + "learning_rate": 8.069711055055606e-07, + "loss": 0.0, + "step": 62244 + }, + { + "epoch": 5.808061957637398, + "grad_norm": NaN, + "learning_rate": 8.061877706272313e-07, + "loss": 0.0, + "step": 62245 + }, + { + "epoch": 5.808155267332276, + "grad_norm": NaN, + "learning_rate": 8.054048151045823e-07, + "loss": 0.0, + "step": 62246 + }, + { + "epoch": 5.808248577027153, + "grad_norm": NaN, + "learning_rate": 8.046222389395951e-07, + "loss": 0.0, + "step": 62247 + }, + { + "epoch": 5.8083418867220304, + "grad_norm": NaN, + "learning_rate": 8.038400421342517e-07, + "loss": 0.0, + "step": 62248 + }, + { + "epoch": 5.808435196416908, + "grad_norm": NaN, + "learning_rate": 8.030582246905503e-07, + "loss": 0.0, + "step": 62249 + }, + { + "epoch": 5.808528506111785, + "grad_norm": NaN, + "learning_rate": 8.022767866104729e-07, + "loss": 0.0, + "step": 62250 + }, + { + "epoch": 5.808621815806662, + "grad_norm": NaN, + "learning_rate": 8.014957278960176e-07, + "loss": 0.0, + "step": 62251 + }, + { + "epoch": 5.808715125501539, + "grad_norm": NaN, + "learning_rate": 8.007150485491499e-07, + "loss": 0.0, + "step": 62252 + }, + { + "epoch": 5.808808435196417, + "grad_norm": NaN, + "learning_rate": 7.999347485718844e-07, + "loss": 0.0, + "step": 62253 + }, + { + "epoch": 5.808901744891294, + "grad_norm": NaN, + "learning_rate": 7.991548279661864e-07, + "loss": 0.0, + "step": 62254 + }, + { + "epoch": 5.8089950545861715, + "grad_norm": NaN, + "learning_rate": 7.983752867340376e-07, + "loss": 0.0, + "step": 62255 + }, + { + "epoch": 5.809088364281049, + "grad_norm": NaN, + "learning_rate": 7.975961248774198e-07, + "loss": 0.0, + "step": 62256 + }, + { + "epoch": 5.809181673975926, + "grad_norm": NaN, + "learning_rate": 7.968173423983149e-07, + "loss": 0.0, + "step": 62257 + }, + { + "epoch": 5.809274983670804, + "grad_norm": NaN, + "learning_rate": 7.960389392987044e-07, + "loss": 0.0, + "step": 62258 + }, + { + "epoch": 5.809368293365681, + "grad_norm": NaN, + "learning_rate": 7.952609155805867e-07, + "loss": 0.0, + "step": 62259 + }, + { + "epoch": 5.809461603060558, + "grad_norm": NaN, + "learning_rate": 7.944832712459104e-07, + "loss": 0.0, + "step": 62260 + }, + { + "epoch": 5.809554912755435, + "grad_norm": NaN, + "learning_rate": 7.937060062966738e-07, + "loss": 0.0, + "step": 62261 + }, + { + "epoch": 5.8096482224503125, + "grad_norm": NaN, + "learning_rate": 7.929291207348254e-07, + "loss": 0.0, + "step": 62262 + }, + { + "epoch": 5.80974153214519, + "grad_norm": NaN, + "learning_rate": 7.921526145623802e-07, + "loss": 0.0, + "step": 62263 + }, + { + "epoch": 5.809834841840067, + "grad_norm": NaN, + "learning_rate": 7.913764877812867e-07, + "loss": 0.0, + "step": 62264 + }, + { + "epoch": 5.809928151534945, + "grad_norm": NaN, + "learning_rate": 7.906007403935266e-07, + "loss": 0.0, + "step": 62265 + }, + { + "epoch": 5.810021461229821, + "grad_norm": NaN, + "learning_rate": 7.89825372401065e-07, + "loss": 0.0, + "step": 62266 + }, + { + "epoch": 5.810114770924699, + "grad_norm": NaN, + "learning_rate": 7.890503838058837e-07, + "loss": 0.0, + "step": 62267 + }, + { + "epoch": 5.810208080619576, + "grad_norm": NaN, + "learning_rate": 7.882757746099311e-07, + "loss": 0.0, + "step": 62268 + }, + { + "epoch": 5.810301390314454, + "grad_norm": NaN, + "learning_rate": 7.875015448152055e-07, + "loss": 0.0, + "step": 62269 + }, + { + "epoch": 5.810394700009331, + "grad_norm": NaN, + "learning_rate": 7.867276944236722e-07, + "loss": 0.0, + "step": 62270 + }, + { + "epoch": 5.810488009704208, + "grad_norm": NaN, + "learning_rate": 7.859542234372629e-07, + "loss": 0.0, + "step": 62271 + }, + { + "epoch": 5.810581319399086, + "grad_norm": NaN, + "learning_rate": 7.851811318579926e-07, + "loss": 0.0, + "step": 62272 + }, + { + "epoch": 5.810674629093963, + "grad_norm": NaN, + "learning_rate": 7.844084196878098e-07, + "loss": 0.0, + "step": 62273 + }, + { + "epoch": 5.81076793878884, + "grad_norm": NaN, + "learning_rate": 7.836360869286628e-07, + "loss": 0.0, + "step": 62274 + }, + { + "epoch": 5.810861248483717, + "grad_norm": NaN, + "learning_rate": 7.828641335825336e-07, + "loss": 0.0, + "step": 62275 + }, + { + "epoch": 5.810954558178595, + "grad_norm": NaN, + "learning_rate": 7.820925596513872e-07, + "loss": 0.0, + "step": 62276 + }, + { + "epoch": 5.811047867873472, + "grad_norm": NaN, + "learning_rate": 7.813213651371719e-07, + "loss": 0.0, + "step": 62277 + }, + { + "epoch": 5.8111411775683495, + "grad_norm": NaN, + "learning_rate": 7.805505500418529e-07, + "loss": 0.0, + "step": 62278 + }, + { + "epoch": 5.811234487263227, + "grad_norm": NaN, + "learning_rate": 7.797801143673954e-07, + "loss": 0.0, + "step": 62279 + }, + { + "epoch": 5.811327796958104, + "grad_norm": NaN, + "learning_rate": 7.79010058115731e-07, + "loss": 0.0, + "step": 62280 + }, + { + "epoch": 5.811421106652981, + "grad_norm": NaN, + "learning_rate": 7.782403812888749e-07, + "loss": 0.0, + "step": 62281 + }, + { + "epoch": 5.811514416347858, + "grad_norm": NaN, + "learning_rate": 7.774710838887421e-07, + "loss": 0.0, + "step": 62282 + }, + { + "epoch": 5.811607726042736, + "grad_norm": NaN, + "learning_rate": 7.767021659172812e-07, + "loss": 0.0, + "step": 62283 + }, + { + "epoch": 5.811701035737613, + "grad_norm": NaN, + "learning_rate": 7.759336273764738e-07, + "loss": 0.0, + "step": 62284 + }, + { + "epoch": 5.8117943454324905, + "grad_norm": NaN, + "learning_rate": 7.751654682682685e-07, + "loss": 0.0, + "step": 62285 + }, + { + "epoch": 5.811887655127368, + "grad_norm": NaN, + "learning_rate": 7.743976885946135e-07, + "loss": 0.0, + "step": 62286 + }, + { + "epoch": 5.811980964822245, + "grad_norm": NaN, + "learning_rate": 7.736302883574574e-07, + "loss": 0.0, + "step": 62287 + }, + { + "epoch": 5.812074274517123, + "grad_norm": NaN, + "learning_rate": 7.728632675587654e-07, + "loss": 0.0, + "step": 62288 + }, + { + "epoch": 5.812167584211999, + "grad_norm": NaN, + "learning_rate": 7.720966262004524e-07, + "loss": 0.0, + "step": 62289 + }, + { + "epoch": 5.812260893906877, + "grad_norm": NaN, + "learning_rate": 7.713303642845169e-07, + "loss": 0.0, + "step": 62290 + }, + { + "epoch": 5.812354203601754, + "grad_norm": NaN, + "learning_rate": 7.705644818128742e-07, + "loss": 0.0, + "step": 62291 + }, + { + "epoch": 5.812447513296632, + "grad_norm": NaN, + "learning_rate": 7.697989787874726e-07, + "loss": 0.0, + "step": 62292 + }, + { + "epoch": 5.812540822991509, + "grad_norm": NaN, + "learning_rate": 7.69033855210277e-07, + "loss": 0.0, + "step": 62293 + }, + { + "epoch": 5.812634132686386, + "grad_norm": NaN, + "learning_rate": 7.682691110832362e-07, + "loss": 0.0, + "step": 62294 + }, + { + "epoch": 5.812727442381263, + "grad_norm": NaN, + "learning_rate": 7.675047464082651e-07, + "loss": 0.0, + "step": 62295 + }, + { + "epoch": 5.81282075207614, + "grad_norm": NaN, + "learning_rate": 7.667407611873288e-07, + "loss": 0.0, + "step": 62296 + }, + { + "epoch": 5.812914061771018, + "grad_norm": NaN, + "learning_rate": 7.659771554223593e-07, + "loss": 0.0, + "step": 62297 + }, + { + "epoch": 5.813007371465895, + "grad_norm": NaN, + "learning_rate": 7.652139291153048e-07, + "loss": 0.0, + "step": 62298 + }, + { + "epoch": 5.813100681160773, + "grad_norm": NaN, + "learning_rate": 7.644510822680971e-07, + "loss": 0.0, + "step": 62299 + }, + { + "epoch": 5.81319399085565, + "grad_norm": NaN, + "learning_rate": 7.636886148827015e-07, + "loss": 0.0, + "step": 62300 + }, + { + "epoch": 5.8132873005505274, + "grad_norm": NaN, + "learning_rate": 7.62926526961033e-07, + "loss": 0.0, + "step": 62301 + }, + { + "epoch": 5.813380610245405, + "grad_norm": NaN, + "learning_rate": 7.621648185050233e-07, + "loss": 0.0, + "step": 62302 + }, + { + "epoch": 5.813473919940282, + "grad_norm": NaN, + "learning_rate": 7.614034895166377e-07, + "loss": 0.0, + "step": 62303 + }, + { + "epoch": 5.813567229635159, + "grad_norm": NaN, + "learning_rate": 7.606425399977744e-07, + "loss": 0.0, + "step": 62304 + }, + { + "epoch": 5.813660539330036, + "grad_norm": NaN, + "learning_rate": 7.598819699504155e-07, + "loss": 0.0, + "step": 62305 + }, + { + "epoch": 5.813753849024914, + "grad_norm": NaN, + "learning_rate": 7.591217793764426e-07, + "loss": 0.0, + "step": 62306 + }, + { + "epoch": 5.813847158719791, + "grad_norm": NaN, + "learning_rate": 7.583619682778374e-07, + "loss": 0.0, + "step": 62307 + }, + { + "epoch": 5.8139404684146685, + "grad_norm": NaN, + "learning_rate": 7.576025366564987e-07, + "loss": 0.0, + "step": 62308 + }, + { + "epoch": 5.814033778109546, + "grad_norm": NaN, + "learning_rate": 7.568434845143579e-07, + "loss": 0.0, + "step": 62309 + }, + { + "epoch": 5.8141270878044224, + "grad_norm": NaN, + "learning_rate": 7.560848118533636e-07, + "loss": 0.0, + "step": 62310 + }, + { + "epoch": 5.8142203974993, + "grad_norm": NaN, + "learning_rate": 7.553265186754476e-07, + "loss": 0.0, + "step": 62311 + }, + { + "epoch": 5.814313707194177, + "grad_norm": NaN, + "learning_rate": 7.545686049825083e-07, + "loss": 0.0, + "step": 62312 + }, + { + "epoch": 5.814407016889055, + "grad_norm": NaN, + "learning_rate": 7.538110707764944e-07, + "loss": 0.0, + "step": 62313 + }, + { + "epoch": 5.814500326583932, + "grad_norm": NaN, + "learning_rate": 7.530539160593541e-07, + "loss": 0.0, + "step": 62314 + }, + { + "epoch": 5.8145936362788095, + "grad_norm": NaN, + "learning_rate": 7.522971408329526e-07, + "loss": 0.0, + "step": 62315 + }, + { + "epoch": 5.814686945973687, + "grad_norm": NaN, + "learning_rate": 7.515407450992716e-07, + "loss": 0.0, + "step": 62316 + }, + { + "epoch": 5.814780255668564, + "grad_norm": NaN, + "learning_rate": 7.507847288602098e-07, + "loss": 0.0, + "step": 62317 + }, + { + "epoch": 5.814873565363441, + "grad_norm": NaN, + "learning_rate": 7.500290921176821e-07, + "loss": 0.0, + "step": 62318 + }, + { + "epoch": 5.814966875058318, + "grad_norm": NaN, + "learning_rate": 7.492738348736205e-07, + "loss": 0.0, + "step": 62319 + }, + { + "epoch": 5.815060184753196, + "grad_norm": NaN, + "learning_rate": 7.485189571299566e-07, + "loss": 0.0, + "step": 62320 + }, + { + "epoch": 5.815153494448073, + "grad_norm": NaN, + "learning_rate": 7.477644588885723e-07, + "loss": 0.0, + "step": 62321 + }, + { + "epoch": 5.815246804142951, + "grad_norm": NaN, + "learning_rate": 7.470103401514327e-07, + "loss": 0.0, + "step": 62322 + }, + { + "epoch": 5.815340113837828, + "grad_norm": NaN, + "learning_rate": 7.462566009204363e-07, + "loss": 0.0, + "step": 62323 + }, + { + "epoch": 5.815433423532705, + "grad_norm": NaN, + "learning_rate": 7.455032411974815e-07, + "loss": 0.0, + "step": 62324 + }, + { + "epoch": 5.815526733227582, + "grad_norm": NaN, + "learning_rate": 7.447502609845002e-07, + "loss": 0.0, + "step": 62325 + }, + { + "epoch": 5.815620042922459, + "grad_norm": NaN, + "learning_rate": 7.439976602834241e-07, + "loss": 0.0, + "step": 62326 + }, + { + "epoch": 5.815713352617337, + "grad_norm": NaN, + "learning_rate": 7.432454390961351e-07, + "loss": 0.0, + "step": 62327 + }, + { + "epoch": 5.815806662312214, + "grad_norm": NaN, + "learning_rate": 7.424935974245483e-07, + "loss": 0.0, + "step": 62328 + }, + { + "epoch": 5.815899972007092, + "grad_norm": NaN, + "learning_rate": 7.417421352706121e-07, + "loss": 0.0, + "step": 62329 + }, + { + "epoch": 5.815993281701969, + "grad_norm": NaN, + "learning_rate": 7.409910526361918e-07, + "loss": 0.0, + "step": 62330 + }, + { + "epoch": 5.8160865913968465, + "grad_norm": NaN, + "learning_rate": 7.402403495232356e-07, + "loss": 0.0, + "step": 62331 + }, + { + "epoch": 5.816179901091724, + "grad_norm": NaN, + "learning_rate": 7.394900259336257e-07, + "loss": 0.0, + "step": 62332 + }, + { + "epoch": 5.8162732107866, + "grad_norm": NaN, + "learning_rate": 7.387400818692768e-07, + "loss": 0.0, + "step": 62333 + }, + { + "epoch": 5.816366520481478, + "grad_norm": NaN, + "learning_rate": 7.379905173320877e-07, + "loss": 0.0, + "step": 62334 + }, + { + "epoch": 5.816459830176355, + "grad_norm": NaN, + "learning_rate": 7.372413323239901e-07, + "loss": 0.0, + "step": 62335 + }, + { + "epoch": 5.816553139871233, + "grad_norm": NaN, + "learning_rate": 7.364925268468658e-07, + "loss": 0.0, + "step": 62336 + }, + { + "epoch": 5.81664644956611, + "grad_norm": NaN, + "learning_rate": 7.3574410090263e-07, + "loss": 0.0, + "step": 62337 + }, + { + "epoch": 5.8167397592609875, + "grad_norm": NaN, + "learning_rate": 7.349960544931809e-07, + "loss": 0.0, + "step": 62338 + }, + { + "epoch": 5.816833068955864, + "grad_norm": NaN, + "learning_rate": 7.342483876204175e-07, + "loss": 0.0, + "step": 62339 + }, + { + "epoch": 5.8169263786507415, + "grad_norm": NaN, + "learning_rate": 7.335011002862545e-07, + "loss": 0.0, + "step": 62340 + }, + { + "epoch": 5.817019688345619, + "grad_norm": NaN, + "learning_rate": 7.32754192492574e-07, + "loss": 0.0, + "step": 62341 + }, + { + "epoch": 5.817112998040496, + "grad_norm": NaN, + "learning_rate": 7.320076642412743e-07, + "loss": 0.0, + "step": 62342 + }, + { + "epoch": 5.817206307735374, + "grad_norm": NaN, + "learning_rate": 7.312615155342705e-07, + "loss": 0.0, + "step": 62343 + }, + { + "epoch": 5.817299617430251, + "grad_norm": NaN, + "learning_rate": 7.305157463734613e-07, + "loss": 0.0, + "step": 62344 + }, + { + "epoch": 5.817392927125129, + "grad_norm": NaN, + "learning_rate": 7.297703567607283e-07, + "loss": 0.0, + "step": 62345 + }, + { + "epoch": 5.817486236820006, + "grad_norm": NaN, + "learning_rate": 7.2902534669797e-07, + "loss": 0.0, + "step": 62346 + }, + { + "epoch": 5.8175795465148825, + "grad_norm": NaN, + "learning_rate": 7.282807161870851e-07, + "loss": 0.0, + "step": 62347 + }, + { + "epoch": 5.81767285620976, + "grad_norm": NaN, + "learning_rate": 7.275364652299553e-07, + "loss": 0.0, + "step": 62348 + }, + { + "epoch": 5.817766165904637, + "grad_norm": NaN, + "learning_rate": 7.267925938284957e-07, + "loss": 0.0, + "step": 62349 + }, + { + "epoch": 5.817859475599515, + "grad_norm": NaN, + "learning_rate": 7.260491019845716e-07, + "loss": 0.0, + "step": 62350 + }, + { + "epoch": 5.817952785294392, + "grad_norm": NaN, + "learning_rate": 7.253059897000979e-07, + "loss": 0.0, + "step": 62351 + }, + { + "epoch": 5.81804609498927, + "grad_norm": NaN, + "learning_rate": 7.245632569769566e-07, + "loss": 0.0, + "step": 62352 + }, + { + "epoch": 5.818139404684147, + "grad_norm": NaN, + "learning_rate": 7.238209038170128e-07, + "loss": 0.0, + "step": 62353 + }, + { + "epoch": 5.818232714379024, + "grad_norm": NaN, + "learning_rate": 7.230789302221985e-07, + "loss": 0.0, + "step": 62354 + }, + { + "epoch": 5.818326024073901, + "grad_norm": NaN, + "learning_rate": 7.223373361943618e-07, + "loss": 0.0, + "step": 62355 + }, + { + "epoch": 5.818419333768778, + "grad_norm": NaN, + "learning_rate": 7.215961217354016e-07, + "loss": 0.0, + "step": 62356 + }, + { + "epoch": 5.818512643463656, + "grad_norm": NaN, + "learning_rate": 7.208552868471996e-07, + "loss": 0.0, + "step": 62357 + }, + { + "epoch": 5.818605953158533, + "grad_norm": NaN, + "learning_rate": 7.201148315316541e-07, + "loss": 0.0, + "step": 62358 + }, + { + "epoch": 5.818699262853411, + "grad_norm": NaN, + "learning_rate": 7.193747557906137e-07, + "loss": 0.0, + "step": 62359 + }, + { + "epoch": 5.818792572548288, + "grad_norm": NaN, + "learning_rate": 7.186350596260104e-07, + "loss": 0.0, + "step": 62360 + }, + { + "epoch": 5.8188858822431655, + "grad_norm": NaN, + "learning_rate": 7.178957430396759e-07, + "loss": 0.0, + "step": 62361 + }, + { + "epoch": 5.818979191938042, + "grad_norm": NaN, + "learning_rate": 7.171568060335254e-07, + "loss": 0.0, + "step": 62362 + }, + { + "epoch": 5.8190725016329194, + "grad_norm": NaN, + "learning_rate": 7.164182486094239e-07, + "loss": 0.0, + "step": 62363 + }, + { + "epoch": 5.819165811327797, + "grad_norm": NaN, + "learning_rate": 7.156800707692534e-07, + "loss": 0.0, + "step": 62364 + }, + { + "epoch": 5.819259121022674, + "grad_norm": NaN, + "learning_rate": 7.149422725148624e-07, + "loss": 0.0, + "step": 62365 + }, + { + "epoch": 5.819352430717552, + "grad_norm": NaN, + "learning_rate": 7.142048538481826e-07, + "loss": 0.0, + "step": 62366 + }, + { + "epoch": 5.819445740412429, + "grad_norm": NaN, + "learning_rate": 7.13467814771046e-07, + "loss": 0.0, + "step": 62367 + }, + { + "epoch": 5.819539050107306, + "grad_norm": NaN, + "learning_rate": 7.127311552853343e-07, + "loss": 0.0, + "step": 62368 + }, + { + "epoch": 5.819632359802183, + "grad_norm": NaN, + "learning_rate": 7.119948753929294e-07, + "loss": 0.0, + "step": 62369 + }, + { + "epoch": 5.8197256694970605, + "grad_norm": NaN, + "learning_rate": 7.112589750956965e-07, + "loss": 0.0, + "step": 62370 + }, + { + "epoch": 5.819818979191938, + "grad_norm": NaN, + "learning_rate": 7.105234543955173e-07, + "loss": 0.0, + "step": 62371 + }, + { + "epoch": 5.819912288886815, + "grad_norm": NaN, + "learning_rate": 7.097883132942405e-07, + "loss": 0.0, + "step": 62372 + }, + { + "epoch": 5.820005598581693, + "grad_norm": NaN, + "learning_rate": 7.090535517937645e-07, + "loss": 0.0, + "step": 62373 + }, + { + "epoch": 5.82009890827657, + "grad_norm": NaN, + "learning_rate": 7.083191698959212e-07, + "loss": 0.0, + "step": 62374 + }, + { + "epoch": 5.820192217971448, + "grad_norm": NaN, + "learning_rate": 7.075851676026256e-07, + "loss": 0.0, + "step": 62375 + }, + { + "epoch": 5.820285527666325, + "grad_norm": NaN, + "learning_rate": 7.068515449156931e-07, + "loss": 0.0, + "step": 62376 + }, + { + "epoch": 5.8203788373612015, + "grad_norm": NaN, + "learning_rate": 7.06118301837022e-07, + "loss": 0.0, + "step": 62377 + }, + { + "epoch": 5.820472147056079, + "grad_norm": NaN, + "learning_rate": 7.053854383684776e-07, + "loss": 0.0, + "step": 62378 + }, + { + "epoch": 5.820565456750956, + "grad_norm": NaN, + "learning_rate": 7.046529545119084e-07, + "loss": 0.0, + "step": 62379 + }, + { + "epoch": 5.820658766445834, + "grad_norm": NaN, + "learning_rate": 7.039208502691795e-07, + "loss": 0.0, + "step": 62380 + }, + { + "epoch": 5.820752076140711, + "grad_norm": NaN, + "learning_rate": 7.031891256421563e-07, + "loss": 0.0, + "step": 62381 + }, + { + "epoch": 5.820845385835589, + "grad_norm": NaN, + "learning_rate": 7.02457780632687e-07, + "loss": 0.0, + "step": 62382 + }, + { + "epoch": 5.820938695530465, + "grad_norm": NaN, + "learning_rate": 7.017268152426536e-07, + "loss": 0.0, + "step": 62383 + }, + { + "epoch": 5.821032005225343, + "grad_norm": NaN, + "learning_rate": 7.009962294739047e-07, + "loss": 0.0, + "step": 62384 + }, + { + "epoch": 5.82112531492022, + "grad_norm": NaN, + "learning_rate": 7.002660233282887e-07, + "loss": 0.0, + "step": 62385 + }, + { + "epoch": 5.821218624615097, + "grad_norm": NaN, + "learning_rate": 6.995361968076874e-07, + "loss": 0.0, + "step": 62386 + }, + { + "epoch": 5.821311934309975, + "grad_norm": NaN, + "learning_rate": 6.988067499139161e-07, + "loss": 0.0, + "step": 62387 + }, + { + "epoch": 5.821405244004852, + "grad_norm": NaN, + "learning_rate": 6.980776826488732e-07, + "loss": 0.0, + "step": 62388 + }, + { + "epoch": 5.82149855369973, + "grad_norm": NaN, + "learning_rate": 6.97348995014374e-07, + "loss": 0.0, + "step": 62389 + }, + { + "epoch": 5.821591863394607, + "grad_norm": NaN, + "learning_rate": 6.966206870123003e-07, + "loss": 0.0, + "step": 62390 + }, + { + "epoch": 5.821685173089484, + "grad_norm": NaN, + "learning_rate": 6.958927586445007e-07, + "loss": 0.0, + "step": 62391 + }, + { + "epoch": 5.821778482784361, + "grad_norm": NaN, + "learning_rate": 6.951652099128069e-07, + "loss": 0.0, + "step": 62392 + }, + { + "epoch": 5.8218717924792385, + "grad_norm": NaN, + "learning_rate": 6.944380408190841e-07, + "loss": 0.0, + "step": 62393 + }, + { + "epoch": 5.821965102174116, + "grad_norm": NaN, + "learning_rate": 6.937112513651811e-07, + "loss": 0.0, + "step": 62394 + }, + { + "epoch": 5.822058411868993, + "grad_norm": NaN, + "learning_rate": 6.92984841552946e-07, + "loss": 0.0, + "step": 62395 + }, + { + "epoch": 5.822151721563871, + "grad_norm": NaN, + "learning_rate": 6.922588113842109e-07, + "loss": 0.0, + "step": 62396 + }, + { + "epoch": 5.822245031258748, + "grad_norm": NaN, + "learning_rate": 6.915331608608409e-07, + "loss": 0.0, + "step": 62397 + }, + { + "epoch": 5.822338340953625, + "grad_norm": NaN, + "learning_rate": 6.908078899846681e-07, + "loss": 0.0, + "step": 62398 + }, + { + "epoch": 5.822431650648502, + "grad_norm": NaN, + "learning_rate": 6.900829987575574e-07, + "loss": 0.0, + "step": 62399 + }, + { + "epoch": 5.8225249603433795, + "grad_norm": NaN, + "learning_rate": 6.893584871813074e-07, + "loss": 0.0, + "step": 62400 + }, + { + "epoch": 5.822618270038257, + "grad_norm": NaN, + "learning_rate": 6.886343552578167e-07, + "loss": 0.0, + "step": 62401 + }, + { + "epoch": 5.822711579733134, + "grad_norm": NaN, + "learning_rate": 6.879106029888837e-07, + "loss": 0.0, + "step": 62402 + }, + { + "epoch": 5.822804889428012, + "grad_norm": NaN, + "learning_rate": 6.871872303763737e-07, + "loss": 0.0, + "step": 62403 + }, + { + "epoch": 5.822898199122889, + "grad_norm": NaN, + "learning_rate": 6.864642374221185e-07, + "loss": 0.0, + "step": 62404 + }, + { + "epoch": 5.822991508817767, + "grad_norm": NaN, + "learning_rate": 6.857416241279501e-07, + "loss": 0.0, + "step": 62405 + }, + { + "epoch": 5.823084818512643, + "grad_norm": NaN, + "learning_rate": 6.850193904957002e-07, + "loss": 0.0, + "step": 62406 + }, + { + "epoch": 5.823178128207521, + "grad_norm": NaN, + "learning_rate": 6.842975365272341e-07, + "loss": 0.0, + "step": 62407 + }, + { + "epoch": 5.823271437902398, + "grad_norm": NaN, + "learning_rate": 6.835760622243668e-07, + "loss": 0.0, + "step": 62408 + }, + { + "epoch": 5.823364747597275, + "grad_norm": NaN, + "learning_rate": 6.828549675889139e-07, + "loss": 0.0, + "step": 62409 + }, + { + "epoch": 5.823458057292153, + "grad_norm": NaN, + "learning_rate": 6.821342526227569e-07, + "loss": 0.0, + "step": 62410 + }, + { + "epoch": 5.82355136698703, + "grad_norm": NaN, + "learning_rate": 6.814139173276778e-07, + "loss": 0.0, + "step": 62411 + }, + { + "epoch": 5.823644676681907, + "grad_norm": NaN, + "learning_rate": 6.806939617055418e-07, + "loss": 0.0, + "step": 62412 + }, + { + "epoch": 5.823737986376784, + "grad_norm": NaN, + "learning_rate": 6.799743857581808e-07, + "loss": 0.0, + "step": 62413 + }, + { + "epoch": 5.823831296071662, + "grad_norm": NaN, + "learning_rate": 6.792551894873932e-07, + "loss": 0.0, + "step": 62414 + }, + { + "epoch": 5.823924605766539, + "grad_norm": NaN, + "learning_rate": 6.785363728950443e-07, + "loss": 0.0, + "step": 62415 + }, + { + "epoch": 5.8240179154614165, + "grad_norm": NaN, + "learning_rate": 6.778179359829328e-07, + "loss": 0.0, + "step": 62416 + }, + { + "epoch": 5.824111225156294, + "grad_norm": NaN, + "learning_rate": 6.77099878752907e-07, + "loss": 0.0, + "step": 62417 + }, + { + "epoch": 5.824204534851171, + "grad_norm": NaN, + "learning_rate": 6.763822012067821e-07, + "loss": 0.0, + "step": 62418 + }, + { + "epoch": 5.824297844546049, + "grad_norm": NaN, + "learning_rate": 6.756649033463735e-07, + "loss": 0.0, + "step": 62419 + }, + { + "epoch": 5.824391154240926, + "grad_norm": NaN, + "learning_rate": 6.749479851735295e-07, + "loss": 0.0, + "step": 62420 + }, + { + "epoch": 5.824484463935803, + "grad_norm": NaN, + "learning_rate": 6.742314466900489e-07, + "loss": 0.0, + "step": 62421 + }, + { + "epoch": 5.82457777363068, + "grad_norm": NaN, + "learning_rate": 6.735152878977801e-07, + "loss": 0.0, + "step": 62422 + }, + { + "epoch": 5.8246710833255575, + "grad_norm": NaN, + "learning_rate": 6.72799508798505e-07, + "loss": 0.0, + "step": 62423 + }, + { + "epoch": 5.824764393020435, + "grad_norm": NaN, + "learning_rate": 6.720841093940887e-07, + "loss": 0.0, + "step": 62424 + }, + { + "epoch": 5.824857702715312, + "grad_norm": NaN, + "learning_rate": 6.713690896863134e-07, + "loss": 0.0, + "step": 62425 + }, + { + "epoch": 5.82495101241019, + "grad_norm": NaN, + "learning_rate": 6.706544496770272e-07, + "loss": 0.0, + "step": 62426 + }, + { + "epoch": 5.825044322105066, + "grad_norm": NaN, + "learning_rate": 6.699401893680123e-07, + "loss": 0.0, + "step": 62427 + }, + { + "epoch": 5.825137631799944, + "grad_norm": NaN, + "learning_rate": 6.692263087611171e-07, + "loss": 0.0, + "step": 62428 + }, + { + "epoch": 5.825230941494821, + "grad_norm": NaN, + "learning_rate": 6.685128078581403e-07, + "loss": 0.0, + "step": 62429 + }, + { + "epoch": 5.8253242511896985, + "grad_norm": NaN, + "learning_rate": 6.67799686660897e-07, + "loss": 0.0, + "step": 62430 + }, + { + "epoch": 5.825417560884576, + "grad_norm": NaN, + "learning_rate": 6.67086945171219e-07, + "loss": 0.0, + "step": 62431 + }, + { + "epoch": 5.825510870579453, + "grad_norm": NaN, + "learning_rate": 6.663745833908885e-07, + "loss": 0.0, + "step": 62432 + }, + { + "epoch": 5.825604180274331, + "grad_norm": NaN, + "learning_rate": 6.65662601321737e-07, + "loss": 0.0, + "step": 62433 + }, + { + "epoch": 5.825697489969208, + "grad_norm": NaN, + "learning_rate": 6.649509989655632e-07, + "loss": 0.0, + "step": 62434 + }, + { + "epoch": 5.825790799664085, + "grad_norm": NaN, + "learning_rate": 6.642397763241991e-07, + "loss": 0.0, + "step": 62435 + }, + { + "epoch": 5.825884109358962, + "grad_norm": NaN, + "learning_rate": 6.635289333994264e-07, + "loss": 0.0, + "step": 62436 + }, + { + "epoch": 5.82597741905384, + "grad_norm": NaN, + "learning_rate": 6.628184701930605e-07, + "loss": 0.0, + "step": 62437 + }, + { + "epoch": 5.826070728748717, + "grad_norm": NaN, + "learning_rate": 6.621083867068999e-07, + "loss": 0.0, + "step": 62438 + }, + { + "epoch": 5.826164038443594, + "grad_norm": NaN, + "learning_rate": 6.613986829427764e-07, + "loss": 0.0, + "step": 62439 + }, + { + "epoch": 5.826257348138472, + "grad_norm": NaN, + "learning_rate": 6.606893589024886e-07, + "loss": 0.0, + "step": 62440 + }, + { + "epoch": 5.826350657833349, + "grad_norm": NaN, + "learning_rate": 6.599804145878018e-07, + "loss": 0.0, + "step": 62441 + }, + { + "epoch": 5.826443967528226, + "grad_norm": NaN, + "learning_rate": 6.592718500005645e-07, + "loss": 0.0, + "step": 62442 + }, + { + "epoch": 5.826537277223103, + "grad_norm": NaN, + "learning_rate": 6.585636651425586e-07, + "loss": 0.0, + "step": 62443 + }, + { + "epoch": 5.826630586917981, + "grad_norm": NaN, + "learning_rate": 6.578558600155825e-07, + "loss": 0.0, + "step": 62444 + }, + { + "epoch": 5.826723896612858, + "grad_norm": NaN, + "learning_rate": 6.571484346214517e-07, + "loss": 0.0, + "step": 62445 + }, + { + "epoch": 5.8268172063077355, + "grad_norm": NaN, + "learning_rate": 6.56441388961948e-07, + "loss": 0.0, + "step": 62446 + }, + { + "epoch": 5.826910516002613, + "grad_norm": NaN, + "learning_rate": 6.557347230388699e-07, + "loss": 0.0, + "step": 62447 + }, + { + "epoch": 5.82700382569749, + "grad_norm": NaN, + "learning_rate": 6.55028436854016e-07, + "loss": 0.0, + "step": 62448 + }, + { + "epoch": 5.827097135392368, + "grad_norm": NaN, + "learning_rate": 6.543225304092014e-07, + "loss": 0.0, + "step": 62449 + }, + { + "epoch": 5.827190445087244, + "grad_norm": NaN, + "learning_rate": 6.536170037061916e-07, + "loss": 0.0, + "step": 62450 + }, + { + "epoch": 5.827283754782122, + "grad_norm": NaN, + "learning_rate": 6.529118567468017e-07, + "loss": 0.0, + "step": 62451 + }, + { + "epoch": 5.827377064476999, + "grad_norm": NaN, + "learning_rate": 6.522070895328135e-07, + "loss": 0.0, + "step": 62452 + }, + { + "epoch": 5.8274703741718765, + "grad_norm": NaN, + "learning_rate": 6.515027020660258e-07, + "loss": 0.0, + "step": 62453 + }, + { + "epoch": 5.827563683866754, + "grad_norm": NaN, + "learning_rate": 6.507986943482202e-07, + "loss": 0.0, + "step": 62454 + }, + { + "epoch": 5.827656993561631, + "grad_norm": NaN, + "learning_rate": 6.500950663811955e-07, + "loss": 0.0, + "step": 62455 + }, + { + "epoch": 5.827750303256508, + "grad_norm": NaN, + "learning_rate": 6.493918181667335e-07, + "loss": 0.0, + "step": 62456 + }, + { + "epoch": 5.827843612951385, + "grad_norm": NaN, + "learning_rate": 6.486889497066328e-07, + "loss": 0.0, + "step": 62457 + }, + { + "epoch": 5.827936922646263, + "grad_norm": NaN, + "learning_rate": 6.479864610026752e-07, + "loss": 0.0, + "step": 62458 + }, + { + "epoch": 5.82803023234114, + "grad_norm": NaN, + "learning_rate": 6.472843520566595e-07, + "loss": 0.0, + "step": 62459 + }, + { + "epoch": 5.828123542036018, + "grad_norm": NaN, + "learning_rate": 6.465826228703341e-07, + "loss": 0.0, + "step": 62460 + }, + { + "epoch": 5.828216851730895, + "grad_norm": NaN, + "learning_rate": 6.458812734455309e-07, + "loss": 0.0, + "step": 62461 + }, + { + "epoch": 5.828310161425772, + "grad_norm": NaN, + "learning_rate": 6.451803037839987e-07, + "loss": 0.0, + "step": 62462 + }, + { + "epoch": 5.82840347112065, + "grad_norm": NaN, + "learning_rate": 6.44479713887519e-07, + "loss": 0.0, + "step": 62463 + }, + { + "epoch": 5.828496780815526, + "grad_norm": NaN, + "learning_rate": 6.437795037578907e-07, + "loss": 0.0, + "step": 62464 + }, + { + "epoch": 5.828590090510404, + "grad_norm": NaN, + "learning_rate": 6.430796733968957e-07, + "loss": 0.0, + "step": 62465 + }, + { + "epoch": 5.828683400205281, + "grad_norm": NaN, + "learning_rate": 6.423802228063158e-07, + "loss": 0.0, + "step": 62466 + }, + { + "epoch": 5.828776709900159, + "grad_norm": NaN, + "learning_rate": 6.416811519878995e-07, + "loss": 0.0, + "step": 62467 + }, + { + "epoch": 5.828870019595036, + "grad_norm": NaN, + "learning_rate": 6.409824609434621e-07, + "loss": 0.0, + "step": 62468 + }, + { + "epoch": 5.8289633292899135, + "grad_norm": NaN, + "learning_rate": 6.402841496747524e-07, + "loss": 0.0, + "step": 62469 + }, + { + "epoch": 5.829056638984791, + "grad_norm": NaN, + "learning_rate": 6.39586218183552e-07, + "loss": 0.0, + "step": 62470 + }, + { + "epoch": 5.829149948679667, + "grad_norm": NaN, + "learning_rate": 6.388886664716431e-07, + "loss": 0.0, + "step": 62471 + }, + { + "epoch": 5.829243258374545, + "grad_norm": NaN, + "learning_rate": 6.381914945408073e-07, + "loss": 0.0, + "step": 62472 + }, + { + "epoch": 5.829336568069422, + "grad_norm": NaN, + "learning_rate": 6.374947023927934e-07, + "loss": 0.0, + "step": 62473 + }, + { + "epoch": 5.8294298777643, + "grad_norm": NaN, + "learning_rate": 6.367982900294e-07, + "loss": 0.0, + "step": 62474 + }, + { + "epoch": 5.829523187459177, + "grad_norm": NaN, + "learning_rate": 6.361022574523755e-07, + "loss": 0.0, + "step": 62475 + }, + { + "epoch": 5.8296164971540545, + "grad_norm": NaN, + "learning_rate": 6.35406604663502e-07, + "loss": 0.0, + "step": 62476 + }, + { + "epoch": 5.829709806848932, + "grad_norm": NaN, + "learning_rate": 6.34711331664528e-07, + "loss": 0.0, + "step": 62477 + }, + { + "epoch": 5.829803116543809, + "grad_norm": NaN, + "learning_rate": 6.340164384572521e-07, + "loss": 0.0, + "step": 62478 + }, + { + "epoch": 5.829896426238686, + "grad_norm": NaN, + "learning_rate": 6.33321925043423e-07, + "loss": 0.0, + "step": 62479 + }, + { + "epoch": 5.829989735933563, + "grad_norm": NaN, + "learning_rate": 6.326277914248224e-07, + "loss": 0.0, + "step": 62480 + }, + { + "epoch": 5.830083045628441, + "grad_norm": NaN, + "learning_rate": 6.319340376031823e-07, + "loss": 0.0, + "step": 62481 + }, + { + "epoch": 5.830176355323318, + "grad_norm": NaN, + "learning_rate": 6.312406635803013e-07, + "loss": 0.0, + "step": 62482 + }, + { + "epoch": 5.8302696650181955, + "grad_norm": NaN, + "learning_rate": 6.305476693579281e-07, + "loss": 0.0, + "step": 62483 + }, + { + "epoch": 5.830362974713073, + "grad_norm": NaN, + "learning_rate": 6.298550549378278e-07, + "loss": 0.0, + "step": 62484 + }, + { + "epoch": 5.8304562844079495, + "grad_norm": NaN, + "learning_rate": 6.291628203217491e-07, + "loss": 0.0, + "step": 62485 + }, + { + "epoch": 5.830549594102827, + "grad_norm": NaN, + "learning_rate": 6.284709655114739e-07, + "loss": 0.0, + "step": 62486 + }, + { + "epoch": 5.830642903797704, + "grad_norm": NaN, + "learning_rate": 6.277794905087507e-07, + "loss": 0.0, + "step": 62487 + }, + { + "epoch": 5.830736213492582, + "grad_norm": NaN, + "learning_rate": 6.270883953153283e-07, + "loss": 0.0, + "step": 62488 + }, + { + "epoch": 5.830829523187459, + "grad_norm": NaN, + "learning_rate": 6.263976799329717e-07, + "loss": 0.0, + "step": 62489 + }, + { + "epoch": 5.830922832882337, + "grad_norm": NaN, + "learning_rate": 6.257073443634463e-07, + "loss": 0.0, + "step": 62490 + }, + { + "epoch": 5.831016142577214, + "grad_norm": NaN, + "learning_rate": 6.250173886085008e-07, + "loss": 0.0, + "step": 62491 + }, + { + "epoch": 5.831109452272091, + "grad_norm": NaN, + "learning_rate": 6.243278126698836e-07, + "loss": 0.0, + "step": 62492 + }, + { + "epoch": 5.831202761966969, + "grad_norm": NaN, + "learning_rate": 6.236386165493435e-07, + "loss": 0.0, + "step": 62493 + }, + { + "epoch": 5.831296071661845, + "grad_norm": NaN, + "learning_rate": 6.229498002486455e-07, + "loss": 0.0, + "step": 62494 + }, + { + "epoch": 5.831389381356723, + "grad_norm": NaN, + "learning_rate": 6.222613637695384e-07, + "loss": 0.0, + "step": 62495 + }, + { + "epoch": 5.8314826910516, + "grad_norm": NaN, + "learning_rate": 6.215733071137874e-07, + "loss": 0.0, + "step": 62496 + }, + { + "epoch": 5.831576000746478, + "grad_norm": NaN, + "learning_rate": 6.208856302831078e-07, + "loss": 0.0, + "step": 62497 + }, + { + "epoch": 5.831669310441355, + "grad_norm": NaN, + "learning_rate": 6.201983332792815e-07, + "loss": 0.0, + "step": 62498 + }, + { + "epoch": 5.8317626201362325, + "grad_norm": NaN, + "learning_rate": 6.195114161040238e-07, + "loss": 0.0, + "step": 62499 + }, + { + "epoch": 5.831855929831109, + "grad_norm": NaN, + "learning_rate": 6.188248787591e-07, + "loss": 0.0, + "step": 62500 + }, + { + "epoch": 5.831949239525986, + "grad_norm": NaN, + "learning_rate": 6.181387212462752e-07, + "loss": 0.0, + "step": 62501 + }, + { + "epoch": 5.832042549220864, + "grad_norm": NaN, + "learning_rate": 6.174529435672482e-07, + "loss": 0.0, + "step": 62502 + }, + { + "epoch": 5.832135858915741, + "grad_norm": NaN, + "learning_rate": 6.167675457238008e-07, + "loss": 0.0, + "step": 62503 + }, + { + "epoch": 5.832229168610619, + "grad_norm": NaN, + "learning_rate": 6.160825277176651e-07, + "loss": 0.0, + "step": 62504 + }, + { + "epoch": 5.832322478305496, + "grad_norm": NaN, + "learning_rate": 6.153978895505729e-07, + "loss": 0.0, + "step": 62505 + }, + { + "epoch": 5.8324157880003735, + "grad_norm": NaN, + "learning_rate": 6.147136312242895e-07, + "loss": 0.0, + "step": 62506 + }, + { + "epoch": 5.832509097695251, + "grad_norm": NaN, + "learning_rate": 6.140297527405302e-07, + "loss": 0.0, + "step": 62507 + }, + { + "epoch": 5.8326024073901275, + "grad_norm": NaN, + "learning_rate": 6.13346254101027e-07, + "loss": 0.0, + "step": 62508 + }, + { + "epoch": 5.832695717085005, + "grad_norm": NaN, + "learning_rate": 6.12663135307545e-07, + "loss": 0.0, + "step": 62509 + }, + { + "epoch": 5.832789026779882, + "grad_norm": NaN, + "learning_rate": 6.119803963618164e-07, + "loss": 0.0, + "step": 62510 + }, + { + "epoch": 5.83288233647476, + "grad_norm": NaN, + "learning_rate": 6.112980372655562e-07, + "loss": 0.0, + "step": 62511 + }, + { + "epoch": 5.832975646169637, + "grad_norm": NaN, + "learning_rate": 6.106160580205133e-07, + "loss": 0.0, + "step": 62512 + }, + { + "epoch": 5.833068955864515, + "grad_norm": NaN, + "learning_rate": 6.099344586284194e-07, + "loss": 0.0, + "step": 62513 + }, + { + "epoch": 5.833162265559392, + "grad_norm": NaN, + "learning_rate": 6.092532390910231e-07, + "loss": 0.0, + "step": 62514 + }, + { + "epoch": 5.8332555752542685, + "grad_norm": NaN, + "learning_rate": 6.0857239941004e-07, + "loss": 0.0, + "step": 62515 + }, + { + "epoch": 5.833348884949146, + "grad_norm": NaN, + "learning_rate": 6.078919395872017e-07, + "loss": 0.0, + "step": 62516 + }, + { + "epoch": 5.833442194644023, + "grad_norm": NaN, + "learning_rate": 6.072118596242403e-07, + "loss": 0.0, + "step": 62517 + }, + { + "epoch": 5.833535504338901, + "grad_norm": NaN, + "learning_rate": 6.065321595228711e-07, + "loss": 0.0, + "step": 62518 + }, + { + "epoch": 5.833628814033778, + "grad_norm": NaN, + "learning_rate": 6.058528392848594e-07, + "loss": 0.0, + "step": 62519 + }, + { + "epoch": 5.833722123728656, + "grad_norm": NaN, + "learning_rate": 6.051738989119037e-07, + "loss": 0.0, + "step": 62520 + }, + { + "epoch": 5.833815433423533, + "grad_norm": NaN, + "learning_rate": 6.04495338405736e-07, + "loss": 0.0, + "step": 62521 + }, + { + "epoch": 5.8339087431184105, + "grad_norm": NaN, + "learning_rate": 6.038171577680717e-07, + "loss": 0.0, + "step": 62522 + }, + { + "epoch": 5.834002052813287, + "grad_norm": NaN, + "learning_rate": 6.031393570006593e-07, + "loss": 0.0, + "step": 62523 + }, + { + "epoch": 5.834095362508164, + "grad_norm": NaN, + "learning_rate": 6.024619361051974e-07, + "loss": 0.0, + "step": 62524 + }, + { + "epoch": 5.834188672203042, + "grad_norm": NaN, + "learning_rate": 6.017848950834347e-07, + "loss": 0.0, + "step": 62525 + }, + { + "epoch": 5.834281981897919, + "grad_norm": NaN, + "learning_rate": 6.011082339370699e-07, + "loss": 0.0, + "step": 62526 + }, + { + "epoch": 5.834375291592797, + "grad_norm": NaN, + "learning_rate": 6.004319526678347e-07, + "loss": 0.0, + "step": 62527 + }, + { + "epoch": 5.834468601287674, + "grad_norm": NaN, + "learning_rate": 5.997560512774447e-07, + "loss": 0.0, + "step": 62528 + }, + { + "epoch": 5.834561910982551, + "grad_norm": NaN, + "learning_rate": 5.990805297676149e-07, + "loss": 0.0, + "step": 62529 + }, + { + "epoch": 5.834655220677428, + "grad_norm": NaN, + "learning_rate": 5.984053881400607e-07, + "loss": 0.0, + "step": 62530 + }, + { + "epoch": 5.8347485303723055, + "grad_norm": NaN, + "learning_rate": 5.977306263965309e-07, + "loss": 0.0, + "step": 62531 + }, + { + "epoch": 5.834841840067183, + "grad_norm": NaN, + "learning_rate": 5.970562445386906e-07, + "loss": 0.0, + "step": 62532 + }, + { + "epoch": 5.83493514976206, + "grad_norm": NaN, + "learning_rate": 5.963822425682885e-07, + "loss": 0.0, + "step": 62533 + }, + { + "epoch": 5.835028459456938, + "grad_norm": NaN, + "learning_rate": 5.957086204870398e-07, + "loss": 0.0, + "step": 62534 + }, + { + "epoch": 5.835121769151815, + "grad_norm": NaN, + "learning_rate": 5.950353782966266e-07, + "loss": 0.0, + "step": 62535 + }, + { + "epoch": 5.8352150788466925, + "grad_norm": NaN, + "learning_rate": 5.943625159987975e-07, + "loss": 0.0, + "step": 62536 + }, + { + "epoch": 5.83530838854157, + "grad_norm": NaN, + "learning_rate": 5.93690033595251e-07, + "loss": 0.0, + "step": 62537 + }, + { + "epoch": 5.8354016982364465, + "grad_norm": NaN, + "learning_rate": 5.930179310876859e-07, + "loss": 0.0, + "step": 62538 + }, + { + "epoch": 5.835495007931324, + "grad_norm": NaN, + "learning_rate": 5.923462084778175e-07, + "loss": 0.0, + "step": 62539 + }, + { + "epoch": 5.835588317626201, + "grad_norm": NaN, + "learning_rate": 5.916748657673443e-07, + "loss": 0.0, + "step": 62540 + }, + { + "epoch": 5.835681627321079, + "grad_norm": NaN, + "learning_rate": 5.910039029579983e-07, + "loss": 0.0, + "step": 62541 + }, + { + "epoch": 5.835774937015956, + "grad_norm": NaN, + "learning_rate": 5.903333200514615e-07, + "loss": 0.0, + "step": 62542 + }, + { + "epoch": 5.835868246710834, + "grad_norm": NaN, + "learning_rate": 5.896631170494659e-07, + "loss": 0.0, + "step": 62543 + }, + { + "epoch": 5.83596155640571, + "grad_norm": NaN, + "learning_rate": 5.889932939536767e-07, + "loss": 0.0, + "step": 62544 + }, + { + "epoch": 5.8360548661005875, + "grad_norm": NaN, + "learning_rate": 5.883238507658428e-07, + "loss": 0.0, + "step": 62545 + }, + { + "epoch": 5.836148175795465, + "grad_norm": NaN, + "learning_rate": 5.876547874876291e-07, + "loss": 0.0, + "step": 62546 + }, + { + "epoch": 5.836241485490342, + "grad_norm": NaN, + "learning_rate": 5.869861041207513e-07, + "loss": 0.0, + "step": 62547 + }, + { + "epoch": 5.83633479518522, + "grad_norm": NaN, + "learning_rate": 5.863178006669078e-07, + "loss": 0.0, + "step": 62548 + }, + { + "epoch": 5.836428104880097, + "grad_norm": NaN, + "learning_rate": 5.856498771277973e-07, + "loss": 0.0, + "step": 62549 + }, + { + "epoch": 5.836521414574975, + "grad_norm": NaN, + "learning_rate": 5.849823335051184e-07, + "loss": 0.0, + "step": 62550 + }, + { + "epoch": 5.836614724269852, + "grad_norm": NaN, + "learning_rate": 5.843151698005699e-07, + "loss": 0.0, + "step": 62551 + }, + { + "epoch": 5.836708033964729, + "grad_norm": NaN, + "learning_rate": 5.836483860158503e-07, + "loss": 0.0, + "step": 62552 + }, + { + "epoch": 5.836801343659606, + "grad_norm": NaN, + "learning_rate": 5.829819821526416e-07, + "loss": 0.0, + "step": 62553 + }, + { + "epoch": 5.836894653354483, + "grad_norm": NaN, + "learning_rate": 5.82315958212659e-07, + "loss": 0.0, + "step": 62554 + }, + { + "epoch": 5.836987963049361, + "grad_norm": NaN, + "learning_rate": 5.816503141975848e-07, + "loss": 0.0, + "step": 62555 + }, + { + "epoch": 5.837081272744238, + "grad_norm": NaN, + "learning_rate": 5.809850501091173e-07, + "loss": 0.0, + "step": 62556 + }, + { + "epoch": 5.837174582439116, + "grad_norm": NaN, + "learning_rate": 5.803201659489387e-07, + "loss": 0.0, + "step": 62557 + }, + { + "epoch": 5.837267892133993, + "grad_norm": NaN, + "learning_rate": 5.796556617187475e-07, + "loss": 0.0, + "step": 62558 + }, + { + "epoch": 5.83736120182887, + "grad_norm": NaN, + "learning_rate": 5.789915374202258e-07, + "loss": 0.0, + "step": 62559 + }, + { + "epoch": 5.837454511523747, + "grad_norm": NaN, + "learning_rate": 5.783277930550722e-07, + "loss": 0.0, + "step": 62560 + }, + { + "epoch": 5.8375478212186245, + "grad_norm": NaN, + "learning_rate": 5.776644286249688e-07, + "loss": 0.0, + "step": 62561 + }, + { + "epoch": 5.837641130913502, + "grad_norm": NaN, + "learning_rate": 5.770014441315974e-07, + "loss": 0.0, + "step": 62562 + }, + { + "epoch": 5.837734440608379, + "grad_norm": NaN, + "learning_rate": 5.763388395766566e-07, + "loss": 0.0, + "step": 62563 + }, + { + "epoch": 5.837827750303257, + "grad_norm": NaN, + "learning_rate": 5.75676614961812e-07, + "loss": 0.0, + "step": 62564 + }, + { + "epoch": 5.837921059998134, + "grad_norm": NaN, + "learning_rate": 5.75014770288762e-07, + "loss": 0.0, + "step": 62565 + }, + { + "epoch": 5.838014369693012, + "grad_norm": NaN, + "learning_rate": 5.743533055592053e-07, + "loss": 0.0, + "step": 62566 + }, + { + "epoch": 5.838107679387888, + "grad_norm": NaN, + "learning_rate": 5.736922207747907e-07, + "loss": 0.0, + "step": 62567 + }, + { + "epoch": 5.8382009890827655, + "grad_norm": NaN, + "learning_rate": 5.730315159372001e-07, + "loss": 0.0, + "step": 62568 + }, + { + "epoch": 5.838294298777643, + "grad_norm": NaN, + "learning_rate": 5.723711910481488e-07, + "loss": 0.0, + "step": 62569 + }, + { + "epoch": 5.83838760847252, + "grad_norm": NaN, + "learning_rate": 5.717112461092855e-07, + "loss": 0.0, + "step": 62570 + }, + { + "epoch": 5.838480918167398, + "grad_norm": NaN, + "learning_rate": 5.710516811222921e-07, + "loss": 0.0, + "step": 62571 + }, + { + "epoch": 5.838574227862275, + "grad_norm": NaN, + "learning_rate": 5.703924960888506e-07, + "loss": 0.0, + "step": 62572 + }, + { + "epoch": 5.838667537557152, + "grad_norm": NaN, + "learning_rate": 5.697336910106431e-07, + "loss": 0.0, + "step": 62573 + }, + { + "epoch": 5.838760847252029, + "grad_norm": NaN, + "learning_rate": 5.69075265889335e-07, + "loss": 0.0, + "step": 62574 + }, + { + "epoch": 5.838854156946907, + "grad_norm": NaN, + "learning_rate": 5.68417220726608e-07, + "loss": 0.0, + "step": 62575 + }, + { + "epoch": 5.838947466641784, + "grad_norm": NaN, + "learning_rate": 5.677595555241277e-07, + "loss": 0.0, + "step": 62576 + }, + { + "epoch": 5.839040776336661, + "grad_norm": NaN, + "learning_rate": 5.671022702835592e-07, + "loss": 0.0, + "step": 62577 + }, + { + "epoch": 5.839134086031539, + "grad_norm": NaN, + "learning_rate": 5.664453650066015e-07, + "loss": 0.0, + "step": 62578 + }, + { + "epoch": 5.839227395726416, + "grad_norm": NaN, + "learning_rate": 5.657888396949029e-07, + "loss": 0.0, + "step": 62579 + }, + { + "epoch": 5.839320705421294, + "grad_norm": NaN, + "learning_rate": 5.651326943501289e-07, + "loss": 0.0, + "step": 62580 + }, + { + "epoch": 5.83941401511617, + "grad_norm": NaN, + "learning_rate": 5.644769289739615e-07, + "loss": 0.0, + "step": 62581 + }, + { + "epoch": 5.839507324811048, + "grad_norm": NaN, + "learning_rate": 5.638215435680659e-07, + "loss": 0.0, + "step": 62582 + }, + { + "epoch": 5.839600634505925, + "grad_norm": NaN, + "learning_rate": 5.631665381341077e-07, + "loss": 0.0, + "step": 62583 + }, + { + "epoch": 5.8396939442008025, + "grad_norm": NaN, + "learning_rate": 5.625119126737521e-07, + "loss": 0.0, + "step": 62584 + }, + { + "epoch": 5.83978725389568, + "grad_norm": NaN, + "learning_rate": 5.618576671886476e-07, + "loss": 0.0, + "step": 62585 + }, + { + "epoch": 5.839880563590557, + "grad_norm": NaN, + "learning_rate": 5.612038016804932e-07, + "loss": 0.0, + "step": 62586 + }, + { + "epoch": 5.839973873285435, + "grad_norm": NaN, + "learning_rate": 5.60550316150904e-07, + "loss": 0.0, + "step": 62587 + }, + { + "epoch": 5.840067182980311, + "grad_norm": NaN, + "learning_rate": 5.598972106015953e-07, + "loss": 0.0, + "step": 62588 + }, + { + "epoch": 5.840160492675189, + "grad_norm": NaN, + "learning_rate": 5.592444850341826e-07, + "loss": 0.0, + "step": 62589 + }, + { + "epoch": 5.840253802370066, + "grad_norm": NaN, + "learning_rate": 5.585921394503478e-07, + "loss": 0.0, + "step": 62590 + }, + { + "epoch": 5.8403471120649435, + "grad_norm": NaN, + "learning_rate": 5.579401738517563e-07, + "loss": 0.0, + "step": 62591 + }, + { + "epoch": 5.840440421759821, + "grad_norm": NaN, + "learning_rate": 5.572885882400402e-07, + "loss": 0.0, + "step": 62592 + }, + { + "epoch": 5.840533731454698, + "grad_norm": NaN, + "learning_rate": 5.566373826168813e-07, + "loss": 0.0, + "step": 62593 + }, + { + "epoch": 5.840627041149576, + "grad_norm": NaN, + "learning_rate": 5.559865569839117e-07, + "loss": 0.0, + "step": 62594 + }, + { + "epoch": 5.840720350844453, + "grad_norm": NaN, + "learning_rate": 5.553361113428134e-07, + "loss": 0.0, + "step": 62595 + }, + { + "epoch": 5.84081366053933, + "grad_norm": NaN, + "learning_rate": 5.546860456952184e-07, + "loss": 0.0, + "step": 62596 + }, + { + "epoch": 5.840906970234207, + "grad_norm": NaN, + "learning_rate": 5.540363600427922e-07, + "loss": 0.0, + "step": 62597 + }, + { + "epoch": 5.8410002799290845, + "grad_norm": NaN, + "learning_rate": 5.533870543871832e-07, + "loss": 0.0, + "step": 62598 + }, + { + "epoch": 5.841093589623962, + "grad_norm": NaN, + "learning_rate": 5.527381287300403e-07, + "loss": 0.0, + "step": 62599 + }, + { + "epoch": 5.841186899318839, + "grad_norm": NaN, + "learning_rate": 5.520895830730121e-07, + "loss": 0.0, + "step": 62600 + }, + { + "epoch": 5.841280209013717, + "grad_norm": NaN, + "learning_rate": 5.514414174177473e-07, + "loss": 0.0, + "step": 62601 + }, + { + "epoch": 5.841373518708593, + "grad_norm": NaN, + "learning_rate": 5.507936317659112e-07, + "loss": 0.0, + "step": 62602 + }, + { + "epoch": 5.841466828403471, + "grad_norm": NaN, + "learning_rate": 5.501462261191192e-07, + "loss": 0.0, + "step": 62603 + }, + { + "epoch": 5.841560138098348, + "grad_norm": NaN, + "learning_rate": 5.494992004790533e-07, + "loss": 0.0, + "step": 62604 + }, + { + "epoch": 5.841653447793226, + "grad_norm": NaN, + "learning_rate": 5.488525548473288e-07, + "loss": 0.0, + "step": 62605 + }, + { + "epoch": 5.841746757488103, + "grad_norm": NaN, + "learning_rate": 5.482062892255945e-07, + "loss": 0.0, + "step": 62606 + }, + { + "epoch": 5.84184006718298, + "grad_norm": NaN, + "learning_rate": 5.475604036155156e-07, + "loss": 0.0, + "step": 62607 + }, + { + "epoch": 5.841933376877858, + "grad_norm": NaN, + "learning_rate": 5.469148980187077e-07, + "loss": 0.0, + "step": 62608 + }, + { + "epoch": 5.842026686572735, + "grad_norm": NaN, + "learning_rate": 5.462697724368359e-07, + "loss": 0.0, + "step": 62609 + }, + { + "epoch": 5.842119996267613, + "grad_norm": NaN, + "learning_rate": 5.456250268715156e-07, + "loss": 0.0, + "step": 62610 + }, + { + "epoch": 5.842213305962489, + "grad_norm": NaN, + "learning_rate": 5.449806613244124e-07, + "loss": 0.0, + "step": 62611 + }, + { + "epoch": 5.842306615657367, + "grad_norm": NaN, + "learning_rate": 5.443366757971412e-07, + "loss": 0.0, + "step": 62612 + }, + { + "epoch": 5.842399925352244, + "grad_norm": NaN, + "learning_rate": 5.436930702913511e-07, + "loss": 0.0, + "step": 62613 + }, + { + "epoch": 5.8424932350471215, + "grad_norm": NaN, + "learning_rate": 5.430498448086906e-07, + "loss": 0.0, + "step": 62614 + }, + { + "epoch": 5.842586544741999, + "grad_norm": NaN, + "learning_rate": 5.424069993507585e-07, + "loss": 0.0, + "step": 62615 + }, + { + "epoch": 5.842679854436876, + "grad_norm": NaN, + "learning_rate": 5.417645339192367e-07, + "loss": 0.0, + "step": 62616 + }, + { + "epoch": 5.842773164131753, + "grad_norm": NaN, + "learning_rate": 5.41122448515724e-07, + "loss": 0.0, + "step": 62617 + }, + { + "epoch": 5.84286647382663, + "grad_norm": NaN, + "learning_rate": 5.404807431418523e-07, + "loss": 0.0, + "step": 62618 + }, + { + "epoch": 5.842959783521508, + "grad_norm": NaN, + "learning_rate": 5.398394177992871e-07, + "loss": 0.0, + "step": 62619 + }, + { + "epoch": 5.843053093216385, + "grad_norm": NaN, + "learning_rate": 5.39198472489627e-07, + "loss": 0.0, + "step": 62620 + }, + { + "epoch": 5.8431464029112625, + "grad_norm": NaN, + "learning_rate": 5.385579072145041e-07, + "loss": 0.0, + "step": 62621 + }, + { + "epoch": 5.84323971260614, + "grad_norm": NaN, + "learning_rate": 5.37917721975567e-07, + "loss": 0.0, + "step": 62622 + }, + { + "epoch": 5.843333022301017, + "grad_norm": NaN, + "learning_rate": 5.372779167744146e-07, + "loss": 0.0, + "step": 62623 + }, + { + "epoch": 5.843426331995895, + "grad_norm": NaN, + "learning_rate": 5.36638491612712e-07, + "loss": 0.0, + "step": 62624 + }, + { + "epoch": 5.843519641690771, + "grad_norm": NaN, + "learning_rate": 5.359994464920414e-07, + "loss": 0.0, + "step": 62625 + }, + { + "epoch": 5.843612951385649, + "grad_norm": NaN, + "learning_rate": 5.353607814140682e-07, + "loss": 0.0, + "step": 62626 + }, + { + "epoch": 5.843706261080526, + "grad_norm": NaN, + "learning_rate": 5.347224963803909e-07, + "loss": 0.0, + "step": 62627 + }, + { + "epoch": 5.843799570775404, + "grad_norm": NaN, + "learning_rate": 5.340845913926417e-07, + "loss": 0.0, + "step": 62628 + }, + { + "epoch": 5.843892880470281, + "grad_norm": NaN, + "learning_rate": 5.33447066452436e-07, + "loss": 0.0, + "step": 62629 + }, + { + "epoch": 5.843986190165158, + "grad_norm": NaN, + "learning_rate": 5.328099215614057e-07, + "loss": 0.0, + "step": 62630 + }, + { + "epoch": 5.844079499860036, + "grad_norm": NaN, + "learning_rate": 5.321731567211496e-07, + "loss": 0.0, + "step": 62631 + }, + { + "epoch": 5.844172809554912, + "grad_norm": NaN, + "learning_rate": 5.315367719333164e-07, + "loss": 0.0, + "step": 62632 + }, + { + "epoch": 5.84426611924979, + "grad_norm": NaN, + "learning_rate": 5.309007671995047e-07, + "loss": 0.0, + "step": 62633 + }, + { + "epoch": 5.844359428944667, + "grad_norm": NaN, + "learning_rate": 5.3026514252133e-07, + "loss": 0.0, + "step": 62634 + }, + { + "epoch": 5.844452738639545, + "grad_norm": NaN, + "learning_rate": 5.296298979004243e-07, + "loss": 0.0, + "step": 62635 + }, + { + "epoch": 5.844546048334422, + "grad_norm": NaN, + "learning_rate": 5.289950333383863e-07, + "loss": 0.0, + "step": 62636 + }, + { + "epoch": 5.8446393580292995, + "grad_norm": NaN, + "learning_rate": 5.28360548836848e-07, + "loss": 0.0, + "step": 62637 + }, + { + "epoch": 5.844732667724177, + "grad_norm": NaN, + "learning_rate": 5.277264443974083e-07, + "loss": 0.0, + "step": 62638 + }, + { + "epoch": 5.844825977419054, + "grad_norm": NaN, + "learning_rate": 5.270927200216823e-07, + "loss": 0.0, + "step": 62639 + }, + { + "epoch": 5.844919287113931, + "grad_norm": NaN, + "learning_rate": 5.26459375711269e-07, + "loss": 0.0, + "step": 62640 + }, + { + "epoch": 5.845012596808808, + "grad_norm": NaN, + "learning_rate": 5.258264114678001e-07, + "loss": 0.0, + "step": 62641 + }, + { + "epoch": 5.845105906503686, + "grad_norm": NaN, + "learning_rate": 5.251938272928913e-07, + "loss": 0.0, + "step": 62642 + }, + { + "epoch": 5.845199216198563, + "grad_norm": NaN, + "learning_rate": 5.245616231881245e-07, + "loss": 0.0, + "step": 62643 + }, + { + "epoch": 5.8452925258934405, + "grad_norm": NaN, + "learning_rate": 5.23929799155115e-07, + "loss": 0.0, + "step": 62644 + }, + { + "epoch": 5.845385835588318, + "grad_norm": NaN, + "learning_rate": 5.232983551954784e-07, + "loss": 0.0, + "step": 62645 + }, + { + "epoch": 5.8454791452831945, + "grad_norm": NaN, + "learning_rate": 5.226672913108132e-07, + "loss": 0.0, + "step": 62646 + }, + { + "epoch": 5.845572454978072, + "grad_norm": NaN, + "learning_rate": 5.220366075027349e-07, + "loss": 0.0, + "step": 62647 + }, + { + "epoch": 5.845665764672949, + "grad_norm": NaN, + "learning_rate": 5.214063037728422e-07, + "loss": 0.0, + "step": 62648 + }, + { + "epoch": 5.845759074367827, + "grad_norm": NaN, + "learning_rate": 5.207763801227172e-07, + "loss": 0.0, + "step": 62649 + }, + { + "epoch": 5.845852384062704, + "grad_norm": NaN, + "learning_rate": 5.201468365539918e-07, + "loss": 0.0, + "step": 62650 + }, + { + "epoch": 5.8459456937575816, + "grad_norm": NaN, + "learning_rate": 5.195176730682482e-07, + "loss": 0.0, + "step": 62651 + }, + { + "epoch": 5.846039003452459, + "grad_norm": NaN, + "learning_rate": 5.188888896671017e-07, + "loss": 0.0, + "step": 62652 + }, + { + "epoch": 5.846132313147336, + "grad_norm": NaN, + "learning_rate": 5.182604863521345e-07, + "loss": 0.0, + "step": 62653 + }, + { + "epoch": 5.846225622842213, + "grad_norm": NaN, + "learning_rate": 5.176324631249451e-07, + "loss": 0.0, + "step": 62654 + }, + { + "epoch": 5.84631893253709, + "grad_norm": NaN, + "learning_rate": 5.17004819987149e-07, + "loss": 0.0, + "step": 62655 + }, + { + "epoch": 5.846412242231968, + "grad_norm": NaN, + "learning_rate": 5.163775569403117e-07, + "loss": 0.0, + "step": 62656 + }, + { + "epoch": 5.846505551926845, + "grad_norm": NaN, + "learning_rate": 5.15750673986065e-07, + "loss": 0.0, + "step": 62657 + }, + { + "epoch": 5.846598861621723, + "grad_norm": NaN, + "learning_rate": 5.151241711259746e-07, + "loss": 0.0, + "step": 62658 + }, + { + "epoch": 5.8466921713166, + "grad_norm": NaN, + "learning_rate": 5.14498048361639e-07, + "loss": 0.0, + "step": 62659 + }, + { + "epoch": 5.846785481011477, + "grad_norm": NaN, + "learning_rate": 5.13872305694657e-07, + "loss": 0.0, + "step": 62660 + }, + { + "epoch": 5.846878790706354, + "grad_norm": NaN, + "learning_rate": 5.132469431266272e-07, + "loss": 0.0, + "step": 62661 + }, + { + "epoch": 5.846972100401231, + "grad_norm": NaN, + "learning_rate": 5.126219606591153e-07, + "loss": 0.0, + "step": 62662 + }, + { + "epoch": 5.847065410096109, + "grad_norm": NaN, + "learning_rate": 5.119973582937198e-07, + "loss": 0.0, + "step": 62663 + }, + { + "epoch": 5.847158719790986, + "grad_norm": NaN, + "learning_rate": 5.113731360320561e-07, + "loss": 0.0, + "step": 62664 + }, + { + "epoch": 5.847252029485864, + "grad_norm": NaN, + "learning_rate": 5.10749293875673e-07, + "loss": 0.0, + "step": 62665 + }, + { + "epoch": 5.847345339180741, + "grad_norm": NaN, + "learning_rate": 5.101258318261691e-07, + "loss": 0.0, + "step": 62666 + }, + { + "epoch": 5.8474386488756185, + "grad_norm": NaN, + "learning_rate": 5.095027498851434e-07, + "loss": 0.0, + "step": 62667 + }, + { + "epoch": 5.847531958570496, + "grad_norm": NaN, + "learning_rate": 5.08880048054161e-07, + "loss": 0.0, + "step": 62668 + }, + { + "epoch": 5.847625268265372, + "grad_norm": NaN, + "learning_rate": 5.082577263348208e-07, + "loss": 0.0, + "step": 62669 + }, + { + "epoch": 5.84771857796025, + "grad_norm": NaN, + "learning_rate": 5.076357847286883e-07, + "loss": 0.0, + "step": 62670 + }, + { + "epoch": 5.847811887655127, + "grad_norm": NaN, + "learning_rate": 5.070142232373619e-07, + "loss": 0.0, + "step": 62671 + }, + { + "epoch": 5.847905197350005, + "grad_norm": NaN, + "learning_rate": 5.063930418624241e-07, + "loss": 0.0, + "step": 62672 + }, + { + "epoch": 5.847998507044882, + "grad_norm": NaN, + "learning_rate": 5.057722406054399e-07, + "loss": 0.0, + "step": 62673 + }, + { + "epoch": 5.8480918167397595, + "grad_norm": NaN, + "learning_rate": 5.051518194679915e-07, + "loss": 0.0, + "step": 62674 + }, + { + "epoch": 5.848185126434636, + "grad_norm": NaN, + "learning_rate": 5.045317784516611e-07, + "loss": 0.0, + "step": 62675 + }, + { + "epoch": 5.8482784361295135, + "grad_norm": NaN, + "learning_rate": 5.03912117558014e-07, + "loss": 0.0, + "step": 62676 + }, + { + "epoch": 5.848371745824391, + "grad_norm": NaN, + "learning_rate": 5.032928367886324e-07, + "loss": 0.0, + "step": 62677 + }, + { + "epoch": 5.848465055519268, + "grad_norm": NaN, + "learning_rate": 5.026739361450982e-07, + "loss": 0.0, + "step": 62678 + }, + { + "epoch": 5.848558365214146, + "grad_norm": NaN, + "learning_rate": 5.020554156289935e-07, + "loss": 0.0, + "step": 62679 + }, + { + "epoch": 5.848651674909023, + "grad_norm": NaN, + "learning_rate": 5.014372752418672e-07, + "loss": 0.0, + "step": 62680 + }, + { + "epoch": 5.848744984603901, + "grad_norm": NaN, + "learning_rate": 5.008195149852845e-07, + "loss": 0.0, + "step": 62681 + }, + { + "epoch": 5.848838294298778, + "grad_norm": NaN, + "learning_rate": 5.002021348608609e-07, + "loss": 0.0, + "step": 62682 + }, + { + "epoch": 5.848931603993655, + "grad_norm": NaN, + "learning_rate": 4.995851348701119e-07, + "loss": 0.0, + "step": 62683 + }, + { + "epoch": 5.849024913688532, + "grad_norm": NaN, + "learning_rate": 4.989685150146528e-07, + "loss": 0.0, + "step": 62684 + }, + { + "epoch": 5.849118223383409, + "grad_norm": NaN, + "learning_rate": 4.983522752960323e-07, + "loss": 0.0, + "step": 62685 + }, + { + "epoch": 5.849211533078287, + "grad_norm": NaN, + "learning_rate": 4.977364157157992e-07, + "loss": 0.0, + "step": 62686 + }, + { + "epoch": 5.849304842773164, + "grad_norm": NaN, + "learning_rate": 4.971209362755524e-07, + "loss": 0.0, + "step": 62687 + }, + { + "epoch": 5.849398152468042, + "grad_norm": NaN, + "learning_rate": 4.965058369768237e-07, + "loss": 0.0, + "step": 62688 + }, + { + "epoch": 5.849491462162919, + "grad_norm": NaN, + "learning_rate": 4.958911178212122e-07, + "loss": 0.0, + "step": 62689 + }, + { + "epoch": 5.849584771857796, + "grad_norm": NaN, + "learning_rate": 4.952767788102497e-07, + "loss": 0.0, + "step": 62690 + }, + { + "epoch": 5.849678081552673, + "grad_norm": NaN, + "learning_rate": 4.946628199455182e-07, + "loss": 0.0, + "step": 62691 + }, + { + "epoch": 5.84977139124755, + "grad_norm": NaN, + "learning_rate": 4.940492412285834e-07, + "loss": 0.0, + "step": 62692 + }, + { + "epoch": 5.849864700942428, + "grad_norm": NaN, + "learning_rate": 4.934360426609774e-07, + "loss": 0.0, + "step": 62693 + }, + { + "epoch": 5.849958010637305, + "grad_norm": NaN, + "learning_rate": 4.92823224244282e-07, + "loss": 0.0, + "step": 62694 + }, + { + "epoch": 5.850051320332183, + "grad_norm": NaN, + "learning_rate": 4.922107859800461e-07, + "loss": 0.0, + "step": 62695 + }, + { + "epoch": 5.85014463002706, + "grad_norm": NaN, + "learning_rate": 4.915987278698352e-07, + "loss": 0.0, + "step": 62696 + }, + { + "epoch": 5.8502379397219375, + "grad_norm": NaN, + "learning_rate": 4.90987049915198e-07, + "loss": 0.0, + "step": 62697 + }, + { + "epoch": 5.850331249416814, + "grad_norm": NaN, + "learning_rate": 4.903757521176999e-07, + "loss": 0.0, + "step": 62698 + }, + { + "epoch": 5.8504245591116915, + "grad_norm": NaN, + "learning_rate": 4.897648344788896e-07, + "loss": 0.0, + "step": 62699 + }, + { + "epoch": 5.850517868806569, + "grad_norm": NaN, + "learning_rate": 4.89154297000316e-07, + "loss": 0.0, + "step": 62700 + }, + { + "epoch": 5.850611178501446, + "grad_norm": NaN, + "learning_rate": 4.885441396835277e-07, + "loss": 0.0, + "step": 62701 + }, + { + "epoch": 5.850704488196324, + "grad_norm": NaN, + "learning_rate": 4.879343625300902e-07, + "loss": 0.0, + "step": 62702 + }, + { + "epoch": 5.850797797891201, + "grad_norm": NaN, + "learning_rate": 4.873249655415357e-07, + "loss": 0.0, + "step": 62703 + }, + { + "epoch": 5.8508911075860786, + "grad_norm": NaN, + "learning_rate": 4.867159487194294e-07, + "loss": 0.0, + "step": 62704 + }, + { + "epoch": 5.850984417280955, + "grad_norm": NaN, + "learning_rate": 4.861073120653369e-07, + "loss": 0.0, + "step": 62705 + }, + { + "epoch": 5.8510777269758325, + "grad_norm": NaN, + "learning_rate": 4.854990555807569e-07, + "loss": 0.0, + "step": 62706 + }, + { + "epoch": 5.85117103667071, + "grad_norm": NaN, + "learning_rate": 4.848911792672716e-07, + "loss": 0.0, + "step": 62707 + }, + { + "epoch": 5.851264346365587, + "grad_norm": NaN, + "learning_rate": 4.842836831264296e-07, + "loss": 0.0, + "step": 62708 + }, + { + "epoch": 5.851357656060465, + "grad_norm": NaN, + "learning_rate": 4.836765671597464e-07, + "loss": 0.0, + "step": 62709 + }, + { + "epoch": 5.851450965755342, + "grad_norm": NaN, + "learning_rate": 4.830698313687875e-07, + "loss": 0.0, + "step": 62710 + }, + { + "epoch": 5.85154427545022, + "grad_norm": NaN, + "learning_rate": 4.824634757551016e-07, + "loss": 0.0, + "step": 62711 + }, + { + "epoch": 5.851637585145097, + "grad_norm": NaN, + "learning_rate": 4.818575003202208e-07, + "loss": 0.0, + "step": 62712 + }, + { + "epoch": 5.8517308948399736, + "grad_norm": NaN, + "learning_rate": 4.812519050656771e-07, + "loss": 0.0, + "step": 62713 + }, + { + "epoch": 5.851824204534851, + "grad_norm": NaN, + "learning_rate": 4.806466899930195e-07, + "loss": 0.0, + "step": 62714 + }, + { + "epoch": 5.851917514229728, + "grad_norm": NaN, + "learning_rate": 4.800418551037965e-07, + "loss": 0.0, + "step": 62715 + }, + { + "epoch": 5.852010823924606, + "grad_norm": NaN, + "learning_rate": 4.794374003995238e-07, + "loss": 0.0, + "step": 62716 + }, + { + "epoch": 5.852104133619483, + "grad_norm": NaN, + "learning_rate": 4.7883332588175e-07, + "loss": 0.0, + "step": 62717 + }, + { + "epoch": 5.852197443314361, + "grad_norm": NaN, + "learning_rate": 4.78229631552024e-07, + "loss": 0.0, + "step": 62718 + }, + { + "epoch": 5.852290753009237, + "grad_norm": NaN, + "learning_rate": 4.77626317411861e-07, + "loss": 0.0, + "step": 62719 + }, + { + "epoch": 5.852384062704115, + "grad_norm": NaN, + "learning_rate": 4.770233834628101e-07, + "loss": 0.0, + "step": 62720 + }, + { + "epoch": 5.852477372398992, + "grad_norm": NaN, + "learning_rate": 4.7642082970638653e-07, + "loss": 0.0, + "step": 62721 + }, + { + "epoch": 5.852570682093869, + "grad_norm": NaN, + "learning_rate": 4.7581865614413904e-07, + "loss": 0.0, + "step": 62722 + }, + { + "epoch": 5.852663991788747, + "grad_norm": NaN, + "learning_rate": 4.752168627775832e-07, + "loss": 0.0, + "step": 62723 + }, + { + "epoch": 5.852757301483624, + "grad_norm": NaN, + "learning_rate": 4.746154496082677e-07, + "loss": 0.0, + "step": 62724 + }, + { + "epoch": 5.852850611178502, + "grad_norm": NaN, + "learning_rate": 4.74014416637708e-07, + "loss": 0.0, + "step": 62725 + }, + { + "epoch": 5.852943920873379, + "grad_norm": NaN, + "learning_rate": 4.734137638674362e-07, + "loss": 0.0, + "step": 62726 + }, + { + "epoch": 5.8530372305682565, + "grad_norm": NaN, + "learning_rate": 4.728134912989845e-07, + "loss": 0.0, + "step": 62727 + }, + { + "epoch": 5.853130540263133, + "grad_norm": NaN, + "learning_rate": 4.722135989338682e-07, + "loss": 0.0, + "step": 62728 + }, + { + "epoch": 5.8532238499580105, + "grad_norm": NaN, + "learning_rate": 4.7161408677361957e-07, + "loss": 0.0, + "step": 62729 + }, + { + "epoch": 5.853317159652888, + "grad_norm": NaN, + "learning_rate": 4.7101495481977056e-07, + "loss": 0.0, + "step": 62730 + }, + { + "epoch": 5.853410469347765, + "grad_norm": NaN, + "learning_rate": 4.7041620307383675e-07, + "loss": 0.0, + "step": 62731 + }, + { + "epoch": 5.853503779042643, + "grad_norm": NaN, + "learning_rate": 4.6981783153733354e-07, + "loss": 0.0, + "step": 62732 + }, + { + "epoch": 5.85359708873752, + "grad_norm": NaN, + "learning_rate": 4.6921984021179304e-07, + "loss": 0.0, + "step": 62733 + }, + { + "epoch": 5.853690398432397, + "grad_norm": NaN, + "learning_rate": 4.686222290987307e-07, + "loss": 0.0, + "step": 62734 + }, + { + "epoch": 5.853783708127274, + "grad_norm": NaN, + "learning_rate": 4.68024998199662e-07, + "loss": 0.0, + "step": 62735 + }, + { + "epoch": 5.8538770178221515, + "grad_norm": NaN, + "learning_rate": 4.6742814751611904e-07, + "loss": 0.0, + "step": 62736 + }, + { + "epoch": 5.853970327517029, + "grad_norm": NaN, + "learning_rate": 4.6683167704961723e-07, + "loss": 0.0, + "step": 62737 + }, + { + "epoch": 5.854063637211906, + "grad_norm": NaN, + "learning_rate": 4.662355868016554e-07, + "loss": 0.0, + "step": 62738 + }, + { + "epoch": 5.854156946906784, + "grad_norm": NaN, + "learning_rate": 4.6563987677376566e-07, + "loss": 0.0, + "step": 62739 + }, + { + "epoch": 5.854250256601661, + "grad_norm": NaN, + "learning_rate": 4.650445469674635e-07, + "loss": 0.0, + "step": 62740 + }, + { + "epoch": 5.854343566296539, + "grad_norm": NaN, + "learning_rate": 4.644495973842477e-07, + "loss": 0.0, + "step": 62741 + }, + { + "epoch": 5.854436875991415, + "grad_norm": NaN, + "learning_rate": 4.638550280256503e-07, + "loss": 0.0, + "step": 62742 + }, + { + "epoch": 5.854530185686293, + "grad_norm": NaN, + "learning_rate": 4.6326083889317025e-07, + "loss": 0.0, + "step": 62743 + }, + { + "epoch": 5.85462349538117, + "grad_norm": NaN, + "learning_rate": 4.6266702998832283e-07, + "loss": 0.0, + "step": 62744 + }, + { + "epoch": 5.854716805076047, + "grad_norm": NaN, + "learning_rate": 4.62073601312607e-07, + "loss": 0.0, + "step": 62745 + }, + { + "epoch": 5.854810114770925, + "grad_norm": NaN, + "learning_rate": 4.614805528675547e-07, + "loss": 0.0, + "step": 62746 + }, + { + "epoch": 5.854903424465802, + "grad_norm": NaN, + "learning_rate": 4.608878846546649e-07, + "loss": 0.0, + "step": 62747 + }, + { + "epoch": 5.85499673416068, + "grad_norm": NaN, + "learning_rate": 4.602955966754196e-07, + "loss": 0.0, + "step": 62748 + }, + { + "epoch": 5.855090043855556, + "grad_norm": NaN, + "learning_rate": 4.597036889313677e-07, + "loss": 0.0, + "step": 62749 + }, + { + "epoch": 5.855183353550434, + "grad_norm": NaN, + "learning_rate": 4.5911216142397455e-07, + "loss": 0.0, + "step": 62750 + }, + { + "epoch": 5.855276663245311, + "grad_norm": NaN, + "learning_rate": 4.585210141547724e-07, + "loss": 0.0, + "step": 62751 + }, + { + "epoch": 5.8553699729401885, + "grad_norm": NaN, + "learning_rate": 4.5793024712524327e-07, + "loss": 0.0, + "step": 62752 + }, + { + "epoch": 5.855463282635066, + "grad_norm": NaN, + "learning_rate": 4.573398603369194e-07, + "loss": 0.0, + "step": 62753 + }, + { + "epoch": 5.855556592329943, + "grad_norm": NaN, + "learning_rate": 4.5674985379126616e-07, + "loss": 0.0, + "step": 62754 + }, + { + "epoch": 5.855649902024821, + "grad_norm": NaN, + "learning_rate": 4.561602274897991e-07, + "loss": 0.0, + "step": 62755 + }, + { + "epoch": 5.855743211719698, + "grad_norm": NaN, + "learning_rate": 4.555709814340336e-07, + "loss": 0.0, + "step": 62756 + }, + { + "epoch": 5.855836521414575, + "grad_norm": NaN, + "learning_rate": 4.5498211562543516e-07, + "loss": 0.0, + "step": 62757 + }, + { + "epoch": 5.855929831109452, + "grad_norm": NaN, + "learning_rate": 4.5439363006553596e-07, + "loss": 0.0, + "step": 62758 + }, + { + "epoch": 5.8560231408043295, + "grad_norm": NaN, + "learning_rate": 4.5380552475580144e-07, + "loss": 0.0, + "step": 62759 + }, + { + "epoch": 5.856116450499207, + "grad_norm": NaN, + "learning_rate": 4.532177996977471e-07, + "loss": 0.0, + "step": 62760 + }, + { + "epoch": 5.856209760194084, + "grad_norm": NaN, + "learning_rate": 4.52630454892855e-07, + "loss": 0.0, + "step": 62761 + }, + { + "epoch": 5.856303069888962, + "grad_norm": NaN, + "learning_rate": 4.52043490342624e-07, + "loss": 0.0, + "step": 62762 + }, + { + "epoch": 5.856396379583838, + "grad_norm": NaN, + "learning_rate": 4.514569060485529e-07, + "loss": 0.0, + "step": 62763 + }, + { + "epoch": 5.856489689278716, + "grad_norm": NaN, + "learning_rate": 4.508707020121238e-07, + "loss": 0.0, + "step": 62764 + }, + { + "epoch": 5.856582998973593, + "grad_norm": NaN, + "learning_rate": 4.502848782348356e-07, + "loss": 0.0, + "step": 62765 + }, + { + "epoch": 5.8566763086684706, + "grad_norm": NaN, + "learning_rate": 4.496994347181704e-07, + "loss": 0.0, + "step": 62766 + }, + { + "epoch": 5.856769618363348, + "grad_norm": NaN, + "learning_rate": 4.491143714636103e-07, + "loss": 0.0, + "step": 62767 + }, + { + "epoch": 5.856862928058225, + "grad_norm": NaN, + "learning_rate": 4.485296884726708e-07, + "loss": 0.0, + "step": 62768 + }, + { + "epoch": 5.856956237753103, + "grad_norm": NaN, + "learning_rate": 4.479453857468007e-07, + "loss": 0.0, + "step": 62769 + }, + { + "epoch": 5.85704954744798, + "grad_norm": NaN, + "learning_rate": 4.4736146328751556e-07, + "loss": 0.0, + "step": 62770 + }, + { + "epoch": 5.857142857142857, + "grad_norm": NaN, + "learning_rate": 4.467779210962974e-07, + "loss": 0.0, + "step": 62771 + }, + { + "epoch": 5.857236166837734, + "grad_norm": NaN, + "learning_rate": 4.4619475917461177e-07, + "loss": 0.0, + "step": 62772 + }, + { + "epoch": 5.857329476532612, + "grad_norm": NaN, + "learning_rate": 4.4561197752394086e-07, + "loss": 0.0, + "step": 62773 + }, + { + "epoch": 5.857422786227489, + "grad_norm": NaN, + "learning_rate": 4.4502957614580005e-07, + "loss": 0.0, + "step": 62774 + }, + { + "epoch": 5.857516095922366, + "grad_norm": NaN, + "learning_rate": 4.444475550416382e-07, + "loss": 0.0, + "step": 62775 + }, + { + "epoch": 5.857609405617244, + "grad_norm": NaN, + "learning_rate": 4.438659142129375e-07, + "loss": 0.0, + "step": 62776 + }, + { + "epoch": 5.857702715312121, + "grad_norm": NaN, + "learning_rate": 4.4328465366118e-07, + "loss": 0.0, + "step": 62777 + }, + { + "epoch": 5.857796025006998, + "grad_norm": NaN, + "learning_rate": 4.427037733878647e-07, + "loss": 0.0, + "step": 62778 + }, + { + "epoch": 5.857889334701875, + "grad_norm": NaN, + "learning_rate": 4.421232733944402e-07, + "loss": 0.0, + "step": 62779 + }, + { + "epoch": 5.857982644396753, + "grad_norm": NaN, + "learning_rate": 4.4154315368240545e-07, + "loss": 0.0, + "step": 62780 + }, + { + "epoch": 5.85807595409163, + "grad_norm": NaN, + "learning_rate": 4.4096341425320927e-07, + "loss": 0.0, + "step": 62781 + }, + { + "epoch": 5.8581692637865075, + "grad_norm": NaN, + "learning_rate": 4.403840551083504e-07, + "loss": 0.0, + "step": 62782 + }, + { + "epoch": 5.858262573481385, + "grad_norm": NaN, + "learning_rate": 4.398050762492944e-07, + "loss": 0.0, + "step": 62783 + }, + { + "epoch": 5.858355883176262, + "grad_norm": NaN, + "learning_rate": 4.3922647767750674e-07, + "loss": 0.0, + "step": 62784 + }, + { + "epoch": 5.85844919287114, + "grad_norm": NaN, + "learning_rate": 4.3864825939445293e-07, + "loss": 0.0, + "step": 62785 + }, + { + "epoch": 5.858542502566016, + "grad_norm": NaN, + "learning_rate": 4.3807042140161506e-07, + "loss": 0.0, + "step": 62786 + }, + { + "epoch": 5.858635812260894, + "grad_norm": NaN, + "learning_rate": 4.3749296370047537e-07, + "loss": 0.0, + "step": 62787 + }, + { + "epoch": 5.858729121955771, + "grad_norm": NaN, + "learning_rate": 4.369158862924826e-07, + "loss": 0.0, + "step": 62788 + }, + { + "epoch": 5.8588224316506485, + "grad_norm": NaN, + "learning_rate": 4.363391891791024e-07, + "loss": 0.0, + "step": 62789 + }, + { + "epoch": 5.858915741345526, + "grad_norm": NaN, + "learning_rate": 4.357628723618167e-07, + "loss": 0.0, + "step": 62790 + }, + { + "epoch": 5.859009051040403, + "grad_norm": NaN, + "learning_rate": 4.3518693584207454e-07, + "loss": 0.0, + "step": 62791 + }, + { + "epoch": 5.85910236073528, + "grad_norm": NaN, + "learning_rate": 4.346113796213413e-07, + "loss": 0.0, + "step": 62792 + }, + { + "epoch": 5.859195670430157, + "grad_norm": NaN, + "learning_rate": 4.3403620370109915e-07, + "loss": 0.0, + "step": 62793 + }, + { + "epoch": 5.859288980125035, + "grad_norm": NaN, + "learning_rate": 4.3346140808279693e-07, + "loss": 0.0, + "step": 62794 + }, + { + "epoch": 5.859382289819912, + "grad_norm": NaN, + "learning_rate": 4.3288699276790017e-07, + "loss": 0.0, + "step": 62795 + }, + { + "epoch": 5.85947559951479, + "grad_norm": NaN, + "learning_rate": 4.323129577578577e-07, + "loss": 0.0, + "step": 62796 + }, + { + "epoch": 5.859568909209667, + "grad_norm": NaN, + "learning_rate": 4.317393030541516e-07, + "loss": 0.0, + "step": 62797 + }, + { + "epoch": 5.859662218904544, + "grad_norm": NaN, + "learning_rate": 4.311660286581975e-07, + "loss": 0.0, + "step": 62798 + }, + { + "epoch": 5.859755528599422, + "grad_norm": NaN, + "learning_rate": 4.305931345715108e-07, + "loss": 0.0, + "step": 62799 + }, + { + "epoch": 5.859848838294299, + "grad_norm": NaN, + "learning_rate": 4.3002062079552366e-07, + "loss": 0.0, + "step": 62800 + }, + { + "epoch": 5.859942147989176, + "grad_norm": NaN, + "learning_rate": 4.294484873316684e-07, + "loss": 0.0, + "step": 62801 + }, + { + "epoch": 5.860035457684053, + "grad_norm": NaN, + "learning_rate": 4.2887673418142696e-07, + "loss": 0.0, + "step": 62802 + }, + { + "epoch": 5.860128767378931, + "grad_norm": NaN, + "learning_rate": 4.283053613462484e-07, + "loss": 0.0, + "step": 62803 + }, + { + "epoch": 5.860222077073808, + "grad_norm": NaN, + "learning_rate": 4.2773436882758136e-07, + "loss": 0.0, + "step": 62804 + }, + { + "epoch": 5.8603153867686855, + "grad_norm": NaN, + "learning_rate": 4.2716375662687483e-07, + "loss": 0.0, + "step": 62805 + }, + { + "epoch": 5.860408696463563, + "grad_norm": NaN, + "learning_rate": 4.265935247455943e-07, + "loss": 0.0, + "step": 62806 + }, + { + "epoch": 5.860502006158439, + "grad_norm": NaN, + "learning_rate": 4.260236731851718e-07, + "loss": 0.0, + "step": 62807 + }, + { + "epoch": 5.860595315853317, + "grad_norm": NaN, + "learning_rate": 4.2545420194705635e-07, + "loss": 0.0, + "step": 62808 + }, + { + "epoch": 5.860688625548194, + "grad_norm": NaN, + "learning_rate": 4.248851110327134e-07, + "loss": 0.0, + "step": 62809 + }, + { + "epoch": 5.860781935243072, + "grad_norm": NaN, + "learning_rate": 4.2431640044357506e-07, + "loss": 0.0, + "step": 62810 + }, + { + "epoch": 5.860875244937949, + "grad_norm": NaN, + "learning_rate": 4.237480701811069e-07, + "loss": 0.0, + "step": 62811 + }, + { + "epoch": 5.8609685546328265, + "grad_norm": NaN, + "learning_rate": 4.231801202467244e-07, + "loss": 0.0, + "step": 62812 + }, + { + "epoch": 5.861061864327704, + "grad_norm": NaN, + "learning_rate": 4.2261255064189314e-07, + "loss": 0.0, + "step": 62813 + }, + { + "epoch": 5.861155174022581, + "grad_norm": NaN, + "learning_rate": 4.2204536136804524e-07, + "loss": 0.0, + "step": 62814 + }, + { + "epoch": 5.861248483717458, + "grad_norm": NaN, + "learning_rate": 4.2147855242662956e-07, + "loss": 0.0, + "step": 62815 + }, + { + "epoch": 5.861341793412335, + "grad_norm": NaN, + "learning_rate": 4.209121238190949e-07, + "loss": 0.0, + "step": 62816 + }, + { + "epoch": 5.861435103107213, + "grad_norm": NaN, + "learning_rate": 4.2034607554685684e-07, + "loss": 0.0, + "step": 62817 + }, + { + "epoch": 5.86152841280209, + "grad_norm": NaN, + "learning_rate": 4.197804076113809e-07, + "loss": 0.0, + "step": 62818 + }, + { + "epoch": 5.861621722496968, + "grad_norm": NaN, + "learning_rate": 4.192151200140992e-07, + "loss": 0.0, + "step": 62819 + }, + { + "epoch": 5.861715032191845, + "grad_norm": NaN, + "learning_rate": 4.186502127564273e-07, + "loss": 0.0, + "step": 62820 + }, + { + "epoch": 5.861808341886722, + "grad_norm": NaN, + "learning_rate": 4.180856858398307e-07, + "loss": 0.0, + "step": 62821 + }, + { + "epoch": 5.861901651581599, + "grad_norm": NaN, + "learning_rate": 4.1752153926572495e-07, + "loss": 0.0, + "step": 62822 + }, + { + "epoch": 5.861994961276476, + "grad_norm": NaN, + "learning_rate": 4.169577730355589e-07, + "loss": 0.0, + "step": 62823 + }, + { + "epoch": 5.862088270971354, + "grad_norm": NaN, + "learning_rate": 4.163943871507647e-07, + "loss": 0.0, + "step": 62824 + }, + { + "epoch": 5.862181580666231, + "grad_norm": NaN, + "learning_rate": 4.1583138161275784e-07, + "loss": 0.0, + "step": 62825 + }, + { + "epoch": 5.862274890361109, + "grad_norm": NaN, + "learning_rate": 4.152687564229873e-07, + "loss": 0.0, + "step": 62826 + }, + { + "epoch": 5.862368200055986, + "grad_norm": NaN, + "learning_rate": 4.147065115828685e-07, + "loss": 0.0, + "step": 62827 + }, + { + "epoch": 5.862461509750863, + "grad_norm": NaN, + "learning_rate": 4.1414464709386697e-07, + "loss": 0.0, + "step": 62828 + }, + { + "epoch": 5.862554819445741, + "grad_norm": NaN, + "learning_rate": 4.1358316295736493e-07, + "loss": 0.0, + "step": 62829 + }, + { + "epoch": 5.862648129140617, + "grad_norm": NaN, + "learning_rate": 4.130220591748112e-07, + "loss": 0.0, + "step": 62830 + }, + { + "epoch": 5.862741438835495, + "grad_norm": NaN, + "learning_rate": 4.124613357476381e-07, + "loss": 0.0, + "step": 62831 + }, + { + "epoch": 5.862834748530372, + "grad_norm": NaN, + "learning_rate": 4.119009926772443e-07, + "loss": 0.0, + "step": 62832 + }, + { + "epoch": 5.86292805822525, + "grad_norm": NaN, + "learning_rate": 4.113410299650954e-07, + "loss": 0.0, + "step": 62833 + }, + { + "epoch": 5.863021367920127, + "grad_norm": NaN, + "learning_rate": 4.1078144761259036e-07, + "loss": 0.0, + "step": 62834 + }, + { + "epoch": 5.8631146776150045, + "grad_norm": NaN, + "learning_rate": 4.1022224562116126e-07, + "loss": 0.0, + "step": 62835 + }, + { + "epoch": 5.863207987309881, + "grad_norm": NaN, + "learning_rate": 4.09663423992207e-07, + "loss": 0.0, + "step": 62836 + }, + { + "epoch": 5.863301297004758, + "grad_norm": NaN, + "learning_rate": 4.0910498272719303e-07, + "loss": 0.0, + "step": 62837 + }, + { + "epoch": 5.863394606699636, + "grad_norm": NaN, + "learning_rate": 4.085469218275017e-07, + "loss": 0.0, + "step": 62838 + }, + { + "epoch": 5.863487916394513, + "grad_norm": NaN, + "learning_rate": 4.0798924129454844e-07, + "loss": 0.0, + "step": 62839 + }, + { + "epoch": 5.863581226089391, + "grad_norm": NaN, + "learning_rate": 4.0743194112978217e-07, + "loss": 0.0, + "step": 62840 + }, + { + "epoch": 5.863674535784268, + "grad_norm": NaN, + "learning_rate": 4.068750213346017e-07, + "loss": 0.0, + "step": 62841 + }, + { + "epoch": 5.8637678454791455, + "grad_norm": NaN, + "learning_rate": 4.063184819104226e-07, + "loss": 0.0, + "step": 62842 + }, + { + "epoch": 5.863861155174023, + "grad_norm": NaN, + "learning_rate": 4.057623228586604e-07, + "loss": 0.0, + "step": 62843 + }, + { + "epoch": 5.8639544648689, + "grad_norm": NaN, + "learning_rate": 4.0520654418074727e-07, + "loss": 0.0, + "step": 62844 + }, + { + "epoch": 5.864047774563777, + "grad_norm": NaN, + "learning_rate": 4.0465114587806547e-07, + "loss": 0.0, + "step": 62845 + }, + { + "epoch": 5.864141084258654, + "grad_norm": NaN, + "learning_rate": 4.0409612795204714e-07, + "loss": 0.0, + "step": 62846 + }, + { + "epoch": 5.864234393953532, + "grad_norm": NaN, + "learning_rate": 4.0354149040409123e-07, + "loss": 0.0, + "step": 62847 + }, + { + "epoch": 5.864327703648409, + "grad_norm": NaN, + "learning_rate": 4.0298723323561323e-07, + "loss": 0.0, + "step": 62848 + }, + { + "epoch": 5.864421013343287, + "grad_norm": NaN, + "learning_rate": 4.024333564480453e-07, + "loss": 0.0, + "step": 62849 + }, + { + "epoch": 5.864514323038164, + "grad_norm": NaN, + "learning_rate": 4.0187986004275305e-07, + "loss": 0.0, + "step": 62850 + }, + { + "epoch": 5.8646076327330405, + "grad_norm": NaN, + "learning_rate": 4.0132674402118535e-07, + "loss": 0.0, + "step": 62851 + }, + { + "epoch": 5.864700942427918, + "grad_norm": NaN, + "learning_rate": 4.007740083847244e-07, + "loss": 0.0, + "step": 62852 + }, + { + "epoch": 5.864794252122795, + "grad_norm": NaN, + "learning_rate": 4.0022165313476906e-07, + "loss": 0.0, + "step": 62853 + }, + { + "epoch": 5.864887561817673, + "grad_norm": NaN, + "learning_rate": 3.996696782727349e-07, + "loss": 0.0, + "step": 62854 + }, + { + "epoch": 5.86498087151255, + "grad_norm": NaN, + "learning_rate": 3.991180838000374e-07, + "loss": 0.0, + "step": 62855 + }, + { + "epoch": 5.865074181207428, + "grad_norm": NaN, + "learning_rate": 3.9856686971805887e-07, + "loss": 0.0, + "step": 62856 + }, + { + "epoch": 5.865167490902305, + "grad_norm": NaN, + "learning_rate": 3.9801603602821475e-07, + "loss": 0.0, + "step": 62857 + }, + { + "epoch": 5.8652608005971825, + "grad_norm": NaN, + "learning_rate": 3.97465582731904e-07, + "loss": 0.0, + "step": 62858 + }, + { + "epoch": 5.865354110292059, + "grad_norm": NaN, + "learning_rate": 3.969155098305088e-07, + "loss": 0.0, + "step": 62859 + }, + { + "epoch": 5.865447419986936, + "grad_norm": NaN, + "learning_rate": 3.9636581732546136e-07, + "loss": 0.0, + "step": 62860 + }, + { + "epoch": 5.865540729681814, + "grad_norm": NaN, + "learning_rate": 3.9581650521812727e-07, + "loss": 0.0, + "step": 62861 + }, + { + "epoch": 5.865634039376691, + "grad_norm": NaN, + "learning_rate": 3.95267573509922e-07, + "loss": 0.0, + "step": 62862 + }, + { + "epoch": 5.865727349071569, + "grad_norm": NaN, + "learning_rate": 3.947190222022278e-07, + "loss": 0.0, + "step": 62863 + }, + { + "epoch": 5.865820658766446, + "grad_norm": NaN, + "learning_rate": 3.941708512964603e-07, + "loss": 0.0, + "step": 62864 + }, + { + "epoch": 5.8659139684613235, + "grad_norm": NaN, + "learning_rate": 3.93623060793985e-07, + "loss": 0.0, + "step": 62865 + }, + { + "epoch": 5.8660072781562, + "grad_norm": NaN, + "learning_rate": 3.930756506962174e-07, + "loss": 0.0, + "step": 62866 + }, + { + "epoch": 5.8661005878510775, + "grad_norm": NaN, + "learning_rate": 3.9252862100453973e-07, + "loss": 0.0, + "step": 62867 + }, + { + "epoch": 5.866193897545955, + "grad_norm": NaN, + "learning_rate": 3.919819717203343e-07, + "loss": 0.0, + "step": 62868 + }, + { + "epoch": 5.866287207240832, + "grad_norm": NaN, + "learning_rate": 3.914357028450166e-07, + "loss": 0.0, + "step": 62869 + }, + { + "epoch": 5.86638051693571, + "grad_norm": NaN, + "learning_rate": 3.9088981437995213e-07, + "loss": 0.0, + "step": 62870 + }, + { + "epoch": 5.866473826630587, + "grad_norm": NaN, + "learning_rate": 3.903443063265399e-07, + "loss": 0.0, + "step": 62871 + }, + { + "epoch": 5.866567136325465, + "grad_norm": NaN, + "learning_rate": 3.8979917868617874e-07, + "loss": 0.0, + "step": 62872 + }, + { + "epoch": 5.866660446020342, + "grad_norm": NaN, + "learning_rate": 3.892544314602175e-07, + "loss": 0.0, + "step": 62873 + }, + { + "epoch": 5.8667537557152185, + "grad_norm": NaN, + "learning_rate": 3.887100646500718e-07, + "loss": 0.0, + "step": 62874 + }, + { + "epoch": 5.866847065410096, + "grad_norm": NaN, + "learning_rate": 3.8816607825712387e-07, + "loss": 0.0, + "step": 62875 + }, + { + "epoch": 5.866940375104973, + "grad_norm": NaN, + "learning_rate": 3.876224722827559e-07, + "loss": 0.0, + "step": 62876 + }, + { + "epoch": 5.867033684799851, + "grad_norm": NaN, + "learning_rate": 3.870792467283168e-07, + "loss": 0.0, + "step": 62877 + }, + { + "epoch": 5.867126994494728, + "grad_norm": NaN, + "learning_rate": 3.8653640159525546e-07, + "loss": 0.0, + "step": 62878 + }, + { + "epoch": 5.867220304189606, + "grad_norm": NaN, + "learning_rate": 3.8599393688488745e-07, + "loss": 0.0, + "step": 62879 + }, + { + "epoch": 5.867313613884482, + "grad_norm": NaN, + "learning_rate": 3.854518525986283e-07, + "loss": 0.0, + "step": 62880 + }, + { + "epoch": 5.86740692357936, + "grad_norm": NaN, + "learning_rate": 3.8491014873784366e-07, + "loss": 0.0, + "step": 62881 + }, + { + "epoch": 5.867500233274237, + "grad_norm": NaN, + "learning_rate": 3.8436882530391565e-07, + "loss": 0.0, + "step": 62882 + }, + { + "epoch": 5.867593542969114, + "grad_norm": NaN, + "learning_rate": 3.838278822982099e-07, + "loss": 0.0, + "step": 62883 + }, + { + "epoch": 5.867686852663992, + "grad_norm": NaN, + "learning_rate": 3.8328731972210867e-07, + "loss": 0.0, + "step": 62884 + }, + { + "epoch": 5.867780162358869, + "grad_norm": NaN, + "learning_rate": 3.8274713757699413e-07, + "loss": 0.0, + "step": 62885 + }, + { + "epoch": 5.867873472053747, + "grad_norm": NaN, + "learning_rate": 3.822073358642319e-07, + "loss": 0.0, + "step": 62886 + }, + { + "epoch": 5.867966781748624, + "grad_norm": NaN, + "learning_rate": 3.8166791458520416e-07, + "loss": 0.0, + "step": 62887 + }, + { + "epoch": 5.868060091443501, + "grad_norm": NaN, + "learning_rate": 3.8112887374127656e-07, + "loss": 0.0, + "step": 62888 + }, + { + "epoch": 5.868153401138378, + "grad_norm": NaN, + "learning_rate": 3.805902133337979e-07, + "loss": 0.0, + "step": 62889 + }, + { + "epoch": 5.868246710833255, + "grad_norm": NaN, + "learning_rate": 3.800519333641838e-07, + "loss": 0.0, + "step": 62890 + }, + { + "epoch": 5.868340020528133, + "grad_norm": NaN, + "learning_rate": 3.795140338337499e-07, + "loss": 0.0, + "step": 62891 + }, + { + "epoch": 5.86843333022301, + "grad_norm": NaN, + "learning_rate": 3.7897651474391164e-07, + "loss": 0.0, + "step": 62892 + }, + { + "epoch": 5.868526639917888, + "grad_norm": NaN, + "learning_rate": 3.78439376096018e-07, + "loss": 0.0, + "step": 62893 + }, + { + "epoch": 5.868619949612765, + "grad_norm": NaN, + "learning_rate": 3.779026178914013e-07, + "loss": 0.0, + "step": 62894 + }, + { + "epoch": 5.868713259307642, + "grad_norm": NaN, + "learning_rate": 3.773662401314936e-07, + "loss": 0.0, + "step": 62895 + }, + { + "epoch": 5.868806569002519, + "grad_norm": NaN, + "learning_rate": 3.7683024281761067e-07, + "loss": 0.0, + "step": 62896 + }, + { + "epoch": 5.8688998786973965, + "grad_norm": NaN, + "learning_rate": 3.7629462595110125e-07, + "loss": 0.0, + "step": 62897 + }, + { + "epoch": 5.868993188392274, + "grad_norm": NaN, + "learning_rate": 3.75759389533381e-07, + "loss": 0.0, + "step": 62898 + }, + { + "epoch": 5.869086498087151, + "grad_norm": NaN, + "learning_rate": 3.752245335657822e-07, + "loss": 0.0, + "step": 62899 + }, + { + "epoch": 5.869179807782029, + "grad_norm": NaN, + "learning_rate": 3.746900580496537e-07, + "loss": 0.0, + "step": 62900 + }, + { + "epoch": 5.869273117476906, + "grad_norm": NaN, + "learning_rate": 3.7415596298637773e-07, + "loss": 0.0, + "step": 62901 + }, + { + "epoch": 5.869366427171784, + "grad_norm": NaN, + "learning_rate": 3.736222483772866e-07, + "loss": 0.0, + "step": 62902 + }, + { + "epoch": 5.86945973686666, + "grad_norm": NaN, + "learning_rate": 3.7308891422376254e-07, + "loss": 0.0, + "step": 62903 + }, + { + "epoch": 5.8695530465615375, + "grad_norm": NaN, + "learning_rate": 3.725559605271544e-07, + "loss": 0.0, + "step": 62904 + }, + { + "epoch": 5.869646356256415, + "grad_norm": NaN, + "learning_rate": 3.7202338728879453e-07, + "loss": 0.0, + "step": 62905 + }, + { + "epoch": 5.869739665951292, + "grad_norm": NaN, + "learning_rate": 3.714911945100651e-07, + "loss": 0.0, + "step": 62906 + }, + { + "epoch": 5.86983297564617, + "grad_norm": NaN, + "learning_rate": 3.7095938219231514e-07, + "loss": 0.0, + "step": 62907 + }, + { + "epoch": 5.869926285341047, + "grad_norm": NaN, + "learning_rate": 3.704279503368934e-07, + "loss": 0.0, + "step": 62908 + }, + { + "epoch": 5.870019595035924, + "grad_norm": NaN, + "learning_rate": 3.69896898945149e-07, + "loss": 0.0, + "step": 62909 + }, + { + "epoch": 5.870112904730801, + "grad_norm": NaN, + "learning_rate": 3.6936622801843065e-07, + "loss": 0.0, + "step": 62910 + }, + { + "epoch": 5.870206214425679, + "grad_norm": NaN, + "learning_rate": 3.688359375580874e-07, + "loss": 0.0, + "step": 62911 + }, + { + "epoch": 5.870299524120556, + "grad_norm": NaN, + "learning_rate": 3.683060275654681e-07, + "loss": 0.0, + "step": 62912 + }, + { + "epoch": 5.870392833815433, + "grad_norm": NaN, + "learning_rate": 3.6777649804192176e-07, + "loss": 0.0, + "step": 62913 + }, + { + "epoch": 5.870486143510311, + "grad_norm": NaN, + "learning_rate": 3.672473489888139e-07, + "loss": 0.0, + "step": 62914 + }, + { + "epoch": 5.870579453205188, + "grad_norm": NaN, + "learning_rate": 3.667185804074435e-07, + "loss": 0.0, + "step": 62915 + }, + { + "epoch": 5.870672762900066, + "grad_norm": NaN, + "learning_rate": 3.6619019229920945e-07, + "loss": 0.0, + "step": 62916 + }, + { + "epoch": 5.870766072594943, + "grad_norm": NaN, + "learning_rate": 3.6566218466541065e-07, + "loss": 0.0, + "step": 62917 + }, + { + "epoch": 5.87085938228982, + "grad_norm": NaN, + "learning_rate": 3.651345575074127e-07, + "loss": 0.0, + "step": 62918 + }, + { + "epoch": 5.870952691984697, + "grad_norm": NaN, + "learning_rate": 3.6460731082656457e-07, + "loss": 0.0, + "step": 62919 + }, + { + "epoch": 5.8710460016795745, + "grad_norm": NaN, + "learning_rate": 3.6408044462418185e-07, + "loss": 0.0, + "step": 62920 + }, + { + "epoch": 5.871139311374452, + "grad_norm": NaN, + "learning_rate": 3.6355395890161343e-07, + "loss": 0.0, + "step": 62921 + }, + { + "epoch": 5.871232621069329, + "grad_norm": NaN, + "learning_rate": 3.6302785366020827e-07, + "loss": 0.0, + "step": 62922 + }, + { + "epoch": 5.871325930764207, + "grad_norm": NaN, + "learning_rate": 3.625021289012986e-07, + "loss": 0.0, + "step": 62923 + }, + { + "epoch": 5.871419240459083, + "grad_norm": NaN, + "learning_rate": 3.619767846262167e-07, + "loss": 0.0, + "step": 62924 + }, + { + "epoch": 5.871512550153961, + "grad_norm": NaN, + "learning_rate": 3.614518208363115e-07, + "loss": 0.0, + "step": 62925 + }, + { + "epoch": 5.871605859848838, + "grad_norm": NaN, + "learning_rate": 3.6092723753289863e-07, + "loss": 0.0, + "step": 62926 + }, + { + "epoch": 5.8716991695437155, + "grad_norm": NaN, + "learning_rate": 3.6040303471731035e-07, + "loss": 0.0, + "step": 62927 + }, + { + "epoch": 5.871792479238593, + "grad_norm": NaN, + "learning_rate": 3.5987921239091223e-07, + "loss": 0.0, + "step": 62928 + }, + { + "epoch": 5.87188578893347, + "grad_norm": NaN, + "learning_rate": 3.593557705550032e-07, + "loss": 0.0, + "step": 62929 + }, + { + "epoch": 5.871979098628348, + "grad_norm": NaN, + "learning_rate": 3.588327092109322e-07, + "loss": 0.0, + "step": 62930 + }, + { + "epoch": 5.872072408323225, + "grad_norm": NaN, + "learning_rate": 3.5831002836001487e-07, + "loss": 0.0, + "step": 62931 + }, + { + "epoch": 5.872165718018102, + "grad_norm": NaN, + "learning_rate": 3.577877280036001e-07, + "loss": 0.0, + "step": 62932 + }, + { + "epoch": 5.872259027712979, + "grad_norm": NaN, + "learning_rate": 3.572658081429869e-07, + "loss": 0.0, + "step": 62933 + }, + { + "epoch": 5.872352337407857, + "grad_norm": NaN, + "learning_rate": 3.5674426877954075e-07, + "loss": 0.0, + "step": 62934 + }, + { + "epoch": 5.872445647102734, + "grad_norm": NaN, + "learning_rate": 3.5622310991454407e-07, + "loss": 0.0, + "step": 62935 + }, + { + "epoch": 5.872538956797611, + "grad_norm": NaN, + "learning_rate": 3.5570233154936236e-07, + "loss": 0.0, + "step": 62936 + }, + { + "epoch": 5.872632266492489, + "grad_norm": NaN, + "learning_rate": 3.5518193368529455e-07, + "loss": 0.0, + "step": 62937 + }, + { + "epoch": 5.872725576187366, + "grad_norm": NaN, + "learning_rate": 3.5466191632365636e-07, + "loss": 0.0, + "step": 62938 + }, + { + "epoch": 5.872818885882243, + "grad_norm": NaN, + "learning_rate": 3.541422794658133e-07, + "loss": 0.0, + "step": 62939 + }, + { + "epoch": 5.87291219557712, + "grad_norm": NaN, + "learning_rate": 3.53623023113031e-07, + "loss": 0.0, + "step": 62940 + }, + { + "epoch": 5.873005505271998, + "grad_norm": NaN, + "learning_rate": 3.5310414726667516e-07, + "loss": 0.0, + "step": 62941 + }, + { + "epoch": 5.873098814966875, + "grad_norm": NaN, + "learning_rate": 3.5258565192804456e-07, + "loss": 0.0, + "step": 62942 + }, + { + "epoch": 5.8731921246617524, + "grad_norm": NaN, + "learning_rate": 3.5206753709845494e-07, + "loss": 0.0, + "step": 62943 + }, + { + "epoch": 5.87328543435663, + "grad_norm": NaN, + "learning_rate": 3.5154980277922184e-07, + "loss": 0.0, + "step": 62944 + }, + { + "epoch": 5.873378744051507, + "grad_norm": NaN, + "learning_rate": 3.510324489716776e-07, + "loss": 0.0, + "step": 62945 + }, + { + "epoch": 5.873472053746385, + "grad_norm": NaN, + "learning_rate": 3.5051547567712114e-07, + "loss": 0.0, + "step": 62946 + }, + { + "epoch": 5.873565363441261, + "grad_norm": NaN, + "learning_rate": 3.499988828968847e-07, + "loss": 0.0, + "step": 62947 + }, + { + "epoch": 5.873658673136139, + "grad_norm": NaN, + "learning_rate": 3.494826706322673e-07, + "loss": 0.0, + "step": 62948 + }, + { + "epoch": 5.873751982831016, + "grad_norm": NaN, + "learning_rate": 3.4896683888458456e-07, + "loss": 0.0, + "step": 62949 + }, + { + "epoch": 5.8738452925258935, + "grad_norm": NaN, + "learning_rate": 3.48451387655152e-07, + "loss": 0.0, + "step": 62950 + }, + { + "epoch": 5.873938602220771, + "grad_norm": NaN, + "learning_rate": 3.479363169452687e-07, + "loss": 0.0, + "step": 62951 + }, + { + "epoch": 5.874031911915648, + "grad_norm": NaN, + "learning_rate": 3.4742162675626685e-07, + "loss": 0.0, + "step": 62952 + }, + { + "epoch": 5.874125221610525, + "grad_norm": NaN, + "learning_rate": 3.4690731708942877e-07, + "loss": 0.0, + "step": 62953 + }, + { + "epoch": 5.874218531305402, + "grad_norm": NaN, + "learning_rate": 3.4639338794608675e-07, + "loss": 0.0, + "step": 62954 + }, + { + "epoch": 5.87431184100028, + "grad_norm": NaN, + "learning_rate": 3.4587983932752305e-07, + "loss": 0.0, + "step": 62955 + }, + { + "epoch": 5.874405150695157, + "grad_norm": NaN, + "learning_rate": 3.4536667123505336e-07, + "loss": 0.0, + "step": 62956 + }, + { + "epoch": 5.8744984603900345, + "grad_norm": NaN, + "learning_rate": 3.4485388367000986e-07, + "loss": 0.0, + "step": 62957 + }, + { + "epoch": 5.874591770084912, + "grad_norm": NaN, + "learning_rate": 3.443414766336583e-07, + "loss": 0.0, + "step": 62958 + }, + { + "epoch": 5.874685079779789, + "grad_norm": NaN, + "learning_rate": 3.4382945012729756e-07, + "loss": 0.0, + "step": 62959 + }, + { + "epoch": 5.874778389474667, + "grad_norm": NaN, + "learning_rate": 3.4331780415227663e-07, + "loss": 0.0, + "step": 62960 + }, + { + "epoch": 5.874871699169544, + "grad_norm": NaN, + "learning_rate": 3.4280653870984445e-07, + "loss": 0.0, + "step": 62961 + }, + { + "epoch": 5.874965008864421, + "grad_norm": NaN, + "learning_rate": 3.422956538013333e-07, + "loss": 0.0, + "step": 62962 + }, + { + "epoch": 5.875058318559298, + "grad_norm": NaN, + "learning_rate": 3.4178514942802547e-07, + "loss": 0.0, + "step": 62963 + }, + { + "epoch": 5.875151628254176, + "grad_norm": NaN, + "learning_rate": 3.412750255912367e-07, + "loss": 0.0, + "step": 62964 + }, + { + "epoch": 5.875244937949053, + "grad_norm": NaN, + "learning_rate": 3.4076528229224906e-07, + "loss": 0.0, + "step": 62965 + }, + { + "epoch": 5.87533824764393, + "grad_norm": NaN, + "learning_rate": 3.4025591953236177e-07, + "loss": 0.0, + "step": 62966 + }, + { + "epoch": 5.875431557338808, + "grad_norm": NaN, + "learning_rate": 3.397469373128736e-07, + "loss": 0.0, + "step": 62967 + }, + { + "epoch": 5.875524867033684, + "grad_norm": NaN, + "learning_rate": 3.39238335635067e-07, + "loss": 0.0, + "step": 62968 + }, + { + "epoch": 5.875618176728562, + "grad_norm": NaN, + "learning_rate": 3.3873011450025745e-07, + "loss": 0.0, + "step": 62969 + }, + { + "epoch": 5.875711486423439, + "grad_norm": NaN, + "learning_rate": 3.382222739097107e-07, + "loss": 0.0, + "step": 62970 + }, + { + "epoch": 5.875804796118317, + "grad_norm": NaN, + "learning_rate": 3.377148138647423e-07, + "loss": 0.0, + "step": 62971 + }, + { + "epoch": 5.875898105813194, + "grad_norm": NaN, + "learning_rate": 3.3720773436663464e-07, + "loss": 0.0, + "step": 62972 + }, + { + "epoch": 5.8759914155080715, + "grad_norm": NaN, + "learning_rate": 3.3670103541666995e-07, + "loss": 0.0, + "step": 62973 + }, + { + "epoch": 5.876084725202949, + "grad_norm": NaN, + "learning_rate": 3.3619471701613053e-07, + "loss": 0.0, + "step": 62974 + }, + { + "epoch": 5.876178034897826, + "grad_norm": NaN, + "learning_rate": 3.356887791663321e-07, + "loss": 0.0, + "step": 62975 + }, + { + "epoch": 5.876271344592703, + "grad_norm": NaN, + "learning_rate": 3.3518322186852355e-07, + "loss": 0.0, + "step": 62976 + }, + { + "epoch": 5.87636465428758, + "grad_norm": NaN, + "learning_rate": 3.3467804512402053e-07, + "loss": 0.0, + "step": 62977 + }, + { + "epoch": 5.876457963982458, + "grad_norm": NaN, + "learning_rate": 3.3417324893410535e-07, + "loss": 0.0, + "step": 62978 + }, + { + "epoch": 5.876551273677335, + "grad_norm": NaN, + "learning_rate": 3.336688333000437e-07, + "loss": 0.0, + "step": 62979 + }, + { + "epoch": 5.8766445833722125, + "grad_norm": NaN, + "learning_rate": 3.3316479822313446e-07, + "loss": 0.0, + "step": 62980 + }, + { + "epoch": 5.87673789306709, + "grad_norm": NaN, + "learning_rate": 3.3266114370464335e-07, + "loss": 0.0, + "step": 62981 + }, + { + "epoch": 5.876831202761967, + "grad_norm": NaN, + "learning_rate": 3.3215786974586934e-07, + "loss": 0.0, + "step": 62982 + }, + { + "epoch": 5.876924512456844, + "grad_norm": NaN, + "learning_rate": 3.31654976348078e-07, + "loss": 0.0, + "step": 62983 + }, + { + "epoch": 5.877017822151721, + "grad_norm": NaN, + "learning_rate": 3.3115246351255174e-07, + "loss": 0.0, + "step": 62984 + }, + { + "epoch": 5.877111131846599, + "grad_norm": NaN, + "learning_rate": 3.306503312405728e-07, + "loss": 0.0, + "step": 62985 + }, + { + "epoch": 5.877204441541476, + "grad_norm": NaN, + "learning_rate": 3.3014857953342356e-07, + "loss": 0.0, + "step": 62986 + }, + { + "epoch": 5.877297751236354, + "grad_norm": NaN, + "learning_rate": 3.2964720839235294e-07, + "loss": 0.0, + "step": 62987 + }, + { + "epoch": 5.877391060931231, + "grad_norm": NaN, + "learning_rate": 3.2914621781865993e-07, + "loss": 0.0, + "step": 62988 + }, + { + "epoch": 5.877484370626108, + "grad_norm": NaN, + "learning_rate": 3.2864560781361016e-07, + "loss": 0.0, + "step": 62989 + }, + { + "epoch": 5.877577680320986, + "grad_norm": NaN, + "learning_rate": 3.28145378378486e-07, + "loss": 0.0, + "step": 62990 + }, + { + "epoch": 5.877670990015862, + "grad_norm": NaN, + "learning_rate": 3.27645529514553e-07, + "loss": 0.0, + "step": 62991 + }, + { + "epoch": 5.87776429971074, + "grad_norm": NaN, + "learning_rate": 3.2714606122307695e-07, + "loss": 0.0, + "step": 62992 + }, + { + "epoch": 5.877857609405617, + "grad_norm": NaN, + "learning_rate": 3.2664697350532345e-07, + "loss": 0.0, + "step": 62993 + }, + { + "epoch": 5.877950919100495, + "grad_norm": NaN, + "learning_rate": 3.2614826636257473e-07, + "loss": 0.0, + "step": 62994 + }, + { + "epoch": 5.878044228795372, + "grad_norm": NaN, + "learning_rate": 3.256499397961132e-07, + "loss": 0.0, + "step": 62995 + }, + { + "epoch": 5.8781375384902494, + "grad_norm": NaN, + "learning_rate": 3.2515199380717115e-07, + "loss": 0.0, + "step": 62996 + }, + { + "epoch": 5.878230848185126, + "grad_norm": NaN, + "learning_rate": 3.246544283970143e-07, + "loss": 0.0, + "step": 62997 + }, + { + "epoch": 5.878324157880003, + "grad_norm": NaN, + "learning_rate": 3.2415724356695813e-07, + "loss": 0.0, + "step": 62998 + }, + { + "epoch": 5.878417467574881, + "grad_norm": NaN, + "learning_rate": 3.236604393182185e-07, + "loss": 0.0, + "step": 62999 + }, + { + "epoch": 5.878510777269758, + "grad_norm": NaN, + "learning_rate": 3.2316401565206094e-07, + "loss": 0.0, + "step": 63000 + }, + { + "epoch": 5.878604086964636, + "grad_norm": NaN, + "learning_rate": 3.2266797256976785e-07, + "loss": 0.0, + "step": 63001 + }, + { + "epoch": 5.878697396659513, + "grad_norm": NaN, + "learning_rate": 3.221723100725881e-07, + "loss": 0.0, + "step": 63002 + }, + { + "epoch": 5.8787907063543905, + "grad_norm": NaN, + "learning_rate": 3.216770281618042e-07, + "loss": 0.0, + "step": 63003 + }, + { + "epoch": 5.878884016049268, + "grad_norm": NaN, + "learning_rate": 3.211821268386483e-07, + "loss": 0.0, + "step": 63004 + }, + { + "epoch": 5.8789773257441444, + "grad_norm": NaN, + "learning_rate": 3.206876061043695e-07, + "loss": 0.0, + "step": 63005 + }, + { + "epoch": 5.879070635439022, + "grad_norm": NaN, + "learning_rate": 3.2019346596026673e-07, + "loss": 0.0, + "step": 63006 + }, + { + "epoch": 5.879163945133899, + "grad_norm": NaN, + "learning_rate": 3.196997064075724e-07, + "loss": 0.0, + "step": 63007 + }, + { + "epoch": 5.879257254828777, + "grad_norm": NaN, + "learning_rate": 3.192063274475354e-07, + "loss": 0.0, + "step": 63008 + }, + { + "epoch": 5.879350564523654, + "grad_norm": NaN, + "learning_rate": 3.1871332908142144e-07, + "loss": 0.0, + "step": 63009 + }, + { + "epoch": 5.8794438742185315, + "grad_norm": NaN, + "learning_rate": 3.1822071131047954e-07, + "loss": 0.0, + "step": 63010 + }, + { + "epoch": 5.879537183913409, + "grad_norm": NaN, + "learning_rate": 3.177284741359587e-07, + "loss": 0.0, + "step": 63011 + }, + { + "epoch": 5.8796304936082855, + "grad_norm": NaN, + "learning_rate": 3.1723661755912455e-07, + "loss": 0.0, + "step": 63012 + }, + { + "epoch": 5.879723803303163, + "grad_norm": NaN, + "learning_rate": 3.167451415812095e-07, + "loss": 0.0, + "step": 63013 + }, + { + "epoch": 5.87981711299804, + "grad_norm": NaN, + "learning_rate": 3.162540462034791e-07, + "loss": 0.0, + "step": 63014 + }, + { + "epoch": 5.879910422692918, + "grad_norm": NaN, + "learning_rate": 3.157633314271657e-07, + "loss": 0.0, + "step": 63015 + }, + { + "epoch": 5.880003732387795, + "grad_norm": NaN, + "learning_rate": 3.152729972535351e-07, + "loss": 0.0, + "step": 63016 + }, + { + "epoch": 5.880097042082673, + "grad_norm": NaN, + "learning_rate": 3.147830436838195e-07, + "loss": 0.0, + "step": 63017 + }, + { + "epoch": 5.88019035177755, + "grad_norm": NaN, + "learning_rate": 3.142934707192679e-07, + "loss": 0.0, + "step": 63018 + }, + { + "epoch": 5.880283661472427, + "grad_norm": NaN, + "learning_rate": 3.138042783611461e-07, + "loss": 0.0, + "step": 63019 + }, + { + "epoch": 5.880376971167304, + "grad_norm": NaN, + "learning_rate": 3.1331546661065296e-07, + "loss": 0.0, + "step": 63020 + }, + { + "epoch": 5.880470280862181, + "grad_norm": NaN, + "learning_rate": 3.1282703546907093e-07, + "loss": 0.0, + "step": 63021 + }, + { + "epoch": 5.880563590557059, + "grad_norm": NaN, + "learning_rate": 3.12338984937649e-07, + "loss": 0.0, + "step": 63022 + }, + { + "epoch": 5.880656900251936, + "grad_norm": NaN, + "learning_rate": 3.118513150175861e-07, + "loss": 0.0, + "step": 63023 + }, + { + "epoch": 5.880750209946814, + "grad_norm": NaN, + "learning_rate": 3.1136402571014797e-07, + "loss": 0.0, + "step": 63024 + }, + { + "epoch": 5.880843519641691, + "grad_norm": NaN, + "learning_rate": 3.108771170165669e-07, + "loss": 0.0, + "step": 63025 + }, + { + "epoch": 5.880936829336568, + "grad_norm": NaN, + "learning_rate": 3.10390588938092e-07, + "loss": 0.0, + "step": 63026 + }, + { + "epoch": 5.881030139031445, + "grad_norm": NaN, + "learning_rate": 3.0990444147593887e-07, + "loss": 0.0, + "step": 63027 + }, + { + "epoch": 5.881123448726322, + "grad_norm": NaN, + "learning_rate": 3.0941867463137315e-07, + "loss": 0.0, + "step": 63028 + }, + { + "epoch": 5.8812167584212, + "grad_norm": NaN, + "learning_rate": 3.0893328840559396e-07, + "loss": 0.0, + "step": 63029 + }, + { + "epoch": 5.881310068116077, + "grad_norm": NaN, + "learning_rate": 3.0844828279988355e-07, + "loss": 0.0, + "step": 63030 + }, + { + "epoch": 5.881403377810955, + "grad_norm": NaN, + "learning_rate": 3.079636578154243e-07, + "loss": 0.0, + "step": 63031 + }, + { + "epoch": 5.881496687505832, + "grad_norm": NaN, + "learning_rate": 3.0747941345348194e-07, + "loss": 0.0, + "step": 63032 + }, + { + "epoch": 5.8815899972007095, + "grad_norm": NaN, + "learning_rate": 3.069955497152721e-07, + "loss": 0.0, + "step": 63033 + }, + { + "epoch": 5.881683306895587, + "grad_norm": NaN, + "learning_rate": 3.0651206660204374e-07, + "loss": 0.0, + "step": 63034 + }, + { + "epoch": 5.8817766165904635, + "grad_norm": NaN, + "learning_rate": 3.06028964114996e-07, + "loss": 0.0, + "step": 63035 + }, + { + "epoch": 5.881869926285341, + "grad_norm": NaN, + "learning_rate": 3.0554624225537785e-07, + "loss": 0.0, + "step": 63036 + }, + { + "epoch": 5.881963235980218, + "grad_norm": NaN, + "learning_rate": 3.0506390102442157e-07, + "loss": 0.0, + "step": 63037 + }, + { + "epoch": 5.882056545675096, + "grad_norm": NaN, + "learning_rate": 3.045819404233263e-07, + "loss": 0.0, + "step": 63038 + }, + { + "epoch": 5.882149855369973, + "grad_norm": NaN, + "learning_rate": 3.041003604533576e-07, + "loss": 0.0, + "step": 63039 + }, + { + "epoch": 5.882243165064851, + "grad_norm": NaN, + "learning_rate": 3.036191611157146e-07, + "loss": 0.0, + "step": 63040 + }, + { + "epoch": 5.882336474759727, + "grad_norm": NaN, + "learning_rate": 3.0313834241161293e-07, + "loss": 0.0, + "step": 63041 + }, + { + "epoch": 5.8824297844546045, + "grad_norm": NaN, + "learning_rate": 3.0265790434230164e-07, + "loss": 0.0, + "step": 63042 + }, + { + "epoch": 5.882523094149482, + "grad_norm": NaN, + "learning_rate": 3.0217784690896306e-07, + "loss": 0.0, + "step": 63043 + }, + { + "epoch": 5.882616403844359, + "grad_norm": NaN, + "learning_rate": 3.016981701128629e-07, + "loss": 0.0, + "step": 63044 + }, + { + "epoch": 5.882709713539237, + "grad_norm": NaN, + "learning_rate": 3.0121887395520015e-07, + "loss": 0.0, + "step": 63045 + }, + { + "epoch": 5.882803023234114, + "grad_norm": NaN, + "learning_rate": 3.007399584371906e-07, + "loss": 0.0, + "step": 63046 + }, + { + "epoch": 5.882896332928992, + "grad_norm": NaN, + "learning_rate": 3.002614235600498e-07, + "loss": 0.0, + "step": 63047 + }, + { + "epoch": 5.882989642623869, + "grad_norm": NaN, + "learning_rate": 2.997832693250102e-07, + "loss": 0.0, + "step": 63048 + }, + { + "epoch": 5.883082952318746, + "grad_norm": NaN, + "learning_rate": 2.9930549573327077e-07, + "loss": 0.0, + "step": 63049 + }, + { + "epoch": 5.883176262013623, + "grad_norm": NaN, + "learning_rate": 2.988281027860473e-07, + "loss": 0.0, + "step": 63050 + }, + { + "epoch": 5.8832695717085, + "grad_norm": NaN, + "learning_rate": 2.9835109048457204e-07, + "loss": 0.0, + "step": 63051 + }, + { + "epoch": 5.883362881403378, + "grad_norm": NaN, + "learning_rate": 2.9787445883004414e-07, + "loss": 0.0, + "step": 63052 + }, + { + "epoch": 5.883456191098255, + "grad_norm": NaN, + "learning_rate": 2.973982078236625e-07, + "loss": 0.0, + "step": 63053 + }, + { + "epoch": 5.883549500793133, + "grad_norm": NaN, + "learning_rate": 2.969223374666596e-07, + "loss": 0.0, + "step": 63054 + }, + { + "epoch": 5.88364281048801, + "grad_norm": NaN, + "learning_rate": 2.964468477602344e-07, + "loss": 0.0, + "step": 63055 + }, + { + "epoch": 5.883736120182887, + "grad_norm": NaN, + "learning_rate": 2.9597173870560267e-07, + "loss": 0.0, + "step": 63056 + }, + { + "epoch": 5.883829429877764, + "grad_norm": NaN, + "learning_rate": 2.9549701030398e-07, + "loss": 0.0, + "step": 63057 + }, + { + "epoch": 5.8839227395726414, + "grad_norm": NaN, + "learning_rate": 2.9502266255654885e-07, + "loss": 0.0, + "step": 63058 + }, + { + "epoch": 5.884016049267519, + "grad_norm": NaN, + "learning_rate": 2.9454869546454154e-07, + "loss": 0.0, + "step": 63059 + }, + { + "epoch": 5.884109358962396, + "grad_norm": NaN, + "learning_rate": 2.9407510902914046e-07, + "loss": 0.0, + "step": 63060 + }, + { + "epoch": 5.884202668657274, + "grad_norm": NaN, + "learning_rate": 2.936019032515613e-07, + "loss": 0.0, + "step": 63061 + }, + { + "epoch": 5.884295978352151, + "grad_norm": NaN, + "learning_rate": 2.9312907813300315e-07, + "loss": 0.0, + "step": 63062 + }, + { + "epoch": 5.8843892880470285, + "grad_norm": NaN, + "learning_rate": 2.9265663367468164e-07, + "loss": 0.0, + "step": 63063 + }, + { + "epoch": 5.884482597741905, + "grad_norm": NaN, + "learning_rate": 2.9218456987777915e-07, + "loss": 0.0, + "step": 63064 + }, + { + "epoch": 5.8845759074367825, + "grad_norm": NaN, + "learning_rate": 2.917128867434948e-07, + "loss": 0.0, + "step": 63065 + }, + { + "epoch": 5.88466921713166, + "grad_norm": NaN, + "learning_rate": 2.912415842730609e-07, + "loss": 0.0, + "step": 63066 + }, + { + "epoch": 5.884762526826537, + "grad_norm": NaN, + "learning_rate": 2.9077066246764315e-07, + "loss": 0.0, + "step": 63067 + }, + { + "epoch": 5.884855836521415, + "grad_norm": NaN, + "learning_rate": 2.903001213284406e-07, + "loss": 0.0, + "step": 63068 + }, + { + "epoch": 5.884949146216292, + "grad_norm": NaN, + "learning_rate": 2.8982996085666896e-07, + "loss": 0.0, + "step": 63069 + }, + { + "epoch": 5.885042455911169, + "grad_norm": NaN, + "learning_rate": 2.893601810535107e-07, + "loss": 0.0, + "step": 63070 + }, + { + "epoch": 5.885135765606046, + "grad_norm": NaN, + "learning_rate": 2.8889078192014805e-07, + "loss": 0.0, + "step": 63071 + }, + { + "epoch": 5.8852290753009235, + "grad_norm": NaN, + "learning_rate": 2.8842176345781345e-07, + "loss": 0.0, + "step": 63072 + }, + { + "epoch": 5.885322384995801, + "grad_norm": NaN, + "learning_rate": 2.879531256676559e-07, + "loss": 0.0, + "step": 63073 + }, + { + "epoch": 5.885415694690678, + "grad_norm": NaN, + "learning_rate": 2.8748486855089123e-07, + "loss": 0.0, + "step": 63074 + }, + { + "epoch": 5.885509004385556, + "grad_norm": NaN, + "learning_rate": 2.870169921087018e-07, + "loss": 0.0, + "step": 63075 + }, + { + "epoch": 5.885602314080433, + "grad_norm": NaN, + "learning_rate": 2.865494963422699e-07, + "loss": 0.0, + "step": 63076 + }, + { + "epoch": 5.885695623775311, + "grad_norm": NaN, + "learning_rate": 2.8608238125281126e-07, + "loss": 0.0, + "step": 63077 + }, + { + "epoch": 5.885788933470187, + "grad_norm": NaN, + "learning_rate": 2.856156468414916e-07, + "loss": 0.0, + "step": 63078 + }, + { + "epoch": 5.885882243165065, + "grad_norm": NaN, + "learning_rate": 2.851492931094934e-07, + "loss": 0.0, + "step": 63079 + }, + { + "epoch": 5.885975552859942, + "grad_norm": NaN, + "learning_rate": 2.846833200580323e-07, + "loss": 0.0, + "step": 63080 + }, + { + "epoch": 5.886068862554819, + "grad_norm": NaN, + "learning_rate": 2.8421772768825736e-07, + "loss": 0.0, + "step": 63081 + }, + { + "epoch": 5.886162172249697, + "grad_norm": NaN, + "learning_rate": 2.837525160013676e-07, + "loss": 0.0, + "step": 63082 + }, + { + "epoch": 5.886255481944574, + "grad_norm": NaN, + "learning_rate": 2.832876849985455e-07, + "loss": 0.0, + "step": 63083 + }, + { + "epoch": 5.886348791639452, + "grad_norm": NaN, + "learning_rate": 2.828232346809733e-07, + "loss": 0.0, + "step": 63084 + }, + { + "epoch": 5.886442101334328, + "grad_norm": NaN, + "learning_rate": 2.823591650498336e-07, + "loss": 0.0, + "step": 63085 + }, + { + "epoch": 5.886535411029206, + "grad_norm": NaN, + "learning_rate": 2.8189547610630857e-07, + "loss": 0.0, + "step": 63086 + }, + { + "epoch": 5.886628720724083, + "grad_norm": NaN, + "learning_rate": 2.8143216785158075e-07, + "loss": 0.0, + "step": 63087 + }, + { + "epoch": 5.8867220304189605, + "grad_norm": NaN, + "learning_rate": 2.809692402867991e-07, + "loss": 0.0, + "step": 63088 + }, + { + "epoch": 5.886815340113838, + "grad_norm": NaN, + "learning_rate": 2.805066934131794e-07, + "loss": 0.0, + "step": 63089 + }, + { + "epoch": 5.886908649808715, + "grad_norm": NaN, + "learning_rate": 2.800445272318874e-07, + "loss": 0.0, + "step": 63090 + }, + { + "epoch": 5.887001959503593, + "grad_norm": NaN, + "learning_rate": 2.7958274174407213e-07, + "loss": 0.0, + "step": 63091 + }, + { + "epoch": 5.88709526919847, + "grad_norm": NaN, + "learning_rate": 2.791213369509493e-07, + "loss": 0.0, + "step": 63092 + }, + { + "epoch": 5.887188578893347, + "grad_norm": NaN, + "learning_rate": 2.786603128536513e-07, + "loss": 0.0, + "step": 63093 + }, + { + "epoch": 5.887281888588224, + "grad_norm": NaN, + "learning_rate": 2.7819966945337725e-07, + "loss": 0.0, + "step": 63094 + }, + { + "epoch": 5.8873751982831015, + "grad_norm": NaN, + "learning_rate": 2.7773940675129283e-07, + "loss": 0.0, + "step": 63095 + }, + { + "epoch": 5.887468507977979, + "grad_norm": NaN, + "learning_rate": 2.772795247485471e-07, + "loss": 0.0, + "step": 63096 + }, + { + "epoch": 5.887561817672856, + "grad_norm": NaN, + "learning_rate": 2.768200234463558e-07, + "loss": 0.0, + "step": 63097 + }, + { + "epoch": 5.887655127367734, + "grad_norm": NaN, + "learning_rate": 2.7636090284585135e-07, + "loss": 0.0, + "step": 63098 + }, + { + "epoch": 5.88774843706261, + "grad_norm": NaN, + "learning_rate": 2.759021629481994e-07, + "loss": 0.0, + "step": 63099 + }, + { + "epoch": 5.887841746757488, + "grad_norm": NaN, + "learning_rate": 2.7544380375458255e-07, + "loss": 0.0, + "step": 63100 + }, + { + "epoch": 5.887935056452365, + "grad_norm": NaN, + "learning_rate": 2.749858252661663e-07, + "loss": 0.0, + "step": 63101 + }, + { + "epoch": 5.888028366147243, + "grad_norm": NaN, + "learning_rate": 2.7452822748409985e-07, + "loss": 0.0, + "step": 63102 + }, + { + "epoch": 5.88812167584212, + "grad_norm": NaN, + "learning_rate": 2.740710104095656e-07, + "loss": 0.0, + "step": 63103 + }, + { + "epoch": 5.888214985536997, + "grad_norm": NaN, + "learning_rate": 2.7361417404371254e-07, + "loss": 0.0, + "step": 63104 + }, + { + "epoch": 5.888308295231875, + "grad_norm": NaN, + "learning_rate": 2.731577183877065e-07, + "loss": 0.0, + "step": 63105 + }, + { + "epoch": 5.888401604926752, + "grad_norm": NaN, + "learning_rate": 2.727016434426965e-07, + "loss": 0.0, + "step": 63106 + }, + { + "epoch": 5.88849491462163, + "grad_norm": NaN, + "learning_rate": 2.722459492098816e-07, + "loss": 0.0, + "step": 63107 + }, + { + "epoch": 5.888588224316506, + "grad_norm": NaN, + "learning_rate": 2.7179063569036097e-07, + "loss": 0.0, + "step": 63108 + }, + { + "epoch": 5.888681534011384, + "grad_norm": NaN, + "learning_rate": 2.7133570288533357e-07, + "loss": 0.0, + "step": 63109 + }, + { + "epoch": 5.888774843706261, + "grad_norm": NaN, + "learning_rate": 2.7088115079596516e-07, + "loss": 0.0, + "step": 63110 + }, + { + "epoch": 5.8888681534011384, + "grad_norm": NaN, + "learning_rate": 2.7042697942337155e-07, + "loss": 0.0, + "step": 63111 + }, + { + "epoch": 5.888961463096016, + "grad_norm": NaN, + "learning_rate": 2.6997318876873507e-07, + "loss": 0.0, + "step": 63112 + }, + { + "epoch": 5.889054772790893, + "grad_norm": NaN, + "learning_rate": 2.6951977883320485e-07, + "loss": 0.0, + "step": 63113 + }, + { + "epoch": 5.88914808248577, + "grad_norm": NaN, + "learning_rate": 2.6906674961792994e-07, + "loss": 0.0, + "step": 63114 + }, + { + "epoch": 5.889241392180647, + "grad_norm": NaN, + "learning_rate": 2.686141011240761e-07, + "loss": 0.0, + "step": 63115 + }, + { + "epoch": 5.889334701875525, + "grad_norm": NaN, + "learning_rate": 2.6816183335277574e-07, + "loss": 0.0, + "step": 63116 + }, + { + "epoch": 5.889428011570402, + "grad_norm": NaN, + "learning_rate": 2.6770994630517795e-07, + "loss": 0.0, + "step": 63117 + }, + { + "epoch": 5.8895213212652795, + "grad_norm": NaN, + "learning_rate": 2.6725843998244845e-07, + "loss": 0.0, + "step": 63118 + }, + { + "epoch": 5.889614630960157, + "grad_norm": NaN, + "learning_rate": 2.668073143857197e-07, + "loss": 0.0, + "step": 63119 + }, + { + "epoch": 5.889707940655034, + "grad_norm": NaN, + "learning_rate": 2.6635656951615737e-07, + "loss": 0.0, + "step": 63120 + }, + { + "epoch": 5.889801250349912, + "grad_norm": NaN, + "learning_rate": 2.6590620537489395e-07, + "loss": 0.0, + "step": 63121 + }, + { + "epoch": 5.889894560044788, + "grad_norm": NaN, + "learning_rate": 2.654562219630618e-07, + "loss": 0.0, + "step": 63122 + }, + { + "epoch": 5.889987869739666, + "grad_norm": NaN, + "learning_rate": 2.6500661928182674e-07, + "loss": 0.0, + "step": 63123 + }, + { + "epoch": 5.890081179434543, + "grad_norm": NaN, + "learning_rate": 2.6455739733233784e-07, + "loss": 0.0, + "step": 63124 + }, + { + "epoch": 5.8901744891294205, + "grad_norm": NaN, + "learning_rate": 2.641085561157108e-07, + "loss": 0.0, + "step": 63125 + }, + { + "epoch": 5.890267798824298, + "grad_norm": NaN, + "learning_rate": 2.6366009563311143e-07, + "loss": 0.0, + "step": 63126 + }, + { + "epoch": 5.890361108519175, + "grad_norm": NaN, + "learning_rate": 2.632120158856554e-07, + "loss": 0.0, + "step": 63127 + }, + { + "epoch": 5.890454418214053, + "grad_norm": NaN, + "learning_rate": 2.627643168745086e-07, + "loss": 0.0, + "step": 63128 + }, + { + "epoch": 5.890547727908929, + "grad_norm": NaN, + "learning_rate": 2.623169986007867e-07, + "loss": 0.0, + "step": 63129 + }, + { + "epoch": 5.890641037603807, + "grad_norm": NaN, + "learning_rate": 2.618700610656388e-07, + "loss": 0.0, + "step": 63130 + }, + { + "epoch": 5.890734347298684, + "grad_norm": NaN, + "learning_rate": 2.614235042701973e-07, + "loss": 0.0, + "step": 63131 + }, + { + "epoch": 5.890827656993562, + "grad_norm": NaN, + "learning_rate": 2.6097732821561133e-07, + "loss": 0.0, + "step": 63132 + }, + { + "epoch": 5.890920966688439, + "grad_norm": NaN, + "learning_rate": 2.6053153290299664e-07, + "loss": 0.0, + "step": 63133 + }, + { + "epoch": 5.891014276383316, + "grad_norm": NaN, + "learning_rate": 2.6008611833350233e-07, + "loss": 0.0, + "step": 63134 + }, + { + "epoch": 5.891107586078194, + "grad_norm": NaN, + "learning_rate": 2.5964108450824415e-07, + "loss": 0.0, + "step": 63135 + }, + { + "epoch": 5.891200895773071, + "grad_norm": NaN, + "learning_rate": 2.591964314283712e-07, + "loss": 0.0, + "step": 63136 + }, + { + "epoch": 5.891294205467948, + "grad_norm": NaN, + "learning_rate": 2.587521590949826e-07, + "loss": 0.0, + "step": 63137 + }, + { + "epoch": 5.891387515162825, + "grad_norm": NaN, + "learning_rate": 2.5830826750924404e-07, + "loss": 0.0, + "step": 63138 + }, + { + "epoch": 5.891480824857703, + "grad_norm": NaN, + "learning_rate": 2.57864756672288e-07, + "loss": 0.0, + "step": 63139 + }, + { + "epoch": 5.89157413455258, + "grad_norm": NaN, + "learning_rate": 2.574216265851969e-07, + "loss": 0.0, + "step": 63140 + }, + { + "epoch": 5.8916674442474575, + "grad_norm": NaN, + "learning_rate": 2.5697887724913655e-07, + "loss": 0.0, + "step": 63141 + }, + { + "epoch": 5.891760753942335, + "grad_norm": NaN, + "learning_rate": 2.565365086652393e-07, + "loss": 0.0, + "step": 63142 + }, + { + "epoch": 5.891854063637211, + "grad_norm": NaN, + "learning_rate": 2.560945208345877e-07, + "loss": 0.0, + "step": 63143 + }, + { + "epoch": 5.891947373332089, + "grad_norm": NaN, + "learning_rate": 2.556529137583307e-07, + "loss": 0.0, + "step": 63144 + }, + { + "epoch": 5.892040683026966, + "grad_norm": NaN, + "learning_rate": 2.5521168743761756e-07, + "loss": 0.0, + "step": 63145 + }, + { + "epoch": 5.892133992721844, + "grad_norm": NaN, + "learning_rate": 2.5477084187351393e-07, + "loss": 0.0, + "step": 63146 + }, + { + "epoch": 5.892227302416721, + "grad_norm": NaN, + "learning_rate": 2.543303770671856e-07, + "loss": 0.0, + "step": 63147 + }, + { + "epoch": 5.8923206121115985, + "grad_norm": NaN, + "learning_rate": 2.5389029301973175e-07, + "loss": 0.0, + "step": 63148 + }, + { + "epoch": 5.892413921806476, + "grad_norm": NaN, + "learning_rate": 2.534505897322681e-07, + "loss": 0.0, + "step": 63149 + }, + { + "epoch": 5.892507231501353, + "grad_norm": NaN, + "learning_rate": 2.530112672059437e-07, + "loss": 0.0, + "step": 63150 + }, + { + "epoch": 5.892600541196231, + "grad_norm": NaN, + "learning_rate": 2.5257232544184103e-07, + "loss": 0.0, + "step": 63151 + }, + { + "epoch": 5.892693850891107, + "grad_norm": NaN, + "learning_rate": 2.521337644410759e-07, + "loss": 0.0, + "step": 63152 + }, + { + "epoch": 5.892787160585985, + "grad_norm": NaN, + "learning_rate": 2.516955842047974e-07, + "loss": 0.0, + "step": 63153 + }, + { + "epoch": 5.892880470280862, + "grad_norm": NaN, + "learning_rate": 2.51257784734088e-07, + "loss": 0.0, + "step": 63154 + }, + { + "epoch": 5.89297377997574, + "grad_norm": NaN, + "learning_rate": 2.5082036603008007e-07, + "loss": 0.0, + "step": 63155 + }, + { + "epoch": 5.893067089670617, + "grad_norm": NaN, + "learning_rate": 2.5038332809385607e-07, + "loss": 0.0, + "step": 63156 + }, + { + "epoch": 5.893160399365494, + "grad_norm": NaN, + "learning_rate": 2.499466709265652e-07, + "loss": 0.0, + "step": 63157 + }, + { + "epoch": 5.893253709060371, + "grad_norm": NaN, + "learning_rate": 2.4951039452930646e-07, + "loss": 0.0, + "step": 63158 + }, + { + "epoch": 5.893347018755248, + "grad_norm": NaN, + "learning_rate": 2.49074498903179e-07, + "loss": 0.0, + "step": 63159 + }, + { + "epoch": 5.893440328450126, + "grad_norm": NaN, + "learning_rate": 2.4863898404928197e-07, + "loss": 0.0, + "step": 63160 + }, + { + "epoch": 5.893533638145003, + "grad_norm": NaN, + "learning_rate": 2.4820384996874777e-07, + "loss": 0.0, + "step": 63161 + }, + { + "epoch": 5.893626947839881, + "grad_norm": NaN, + "learning_rate": 2.4776909666267554e-07, + "loss": 0.0, + "step": 63162 + }, + { + "epoch": 5.893720257534758, + "grad_norm": NaN, + "learning_rate": 2.4733472413214774e-07, + "loss": 0.0, + "step": 63163 + }, + { + "epoch": 5.8938135672296355, + "grad_norm": NaN, + "learning_rate": 2.4690073237829675e-07, + "loss": 0.0, + "step": 63164 + }, + { + "epoch": 5.893906876924513, + "grad_norm": NaN, + "learning_rate": 2.464671214022218e-07, + "loss": 0.0, + "step": 63165 + }, + { + "epoch": 5.894000186619389, + "grad_norm": NaN, + "learning_rate": 2.460338912050219e-07, + "loss": 0.0, + "step": 63166 + }, + { + "epoch": 5.894093496314267, + "grad_norm": NaN, + "learning_rate": 2.456010417877963e-07, + "loss": 0.0, + "step": 63167 + }, + { + "epoch": 5.894186806009144, + "grad_norm": NaN, + "learning_rate": 2.4516857315164397e-07, + "loss": 0.0, + "step": 63168 + }, + { + "epoch": 5.894280115704022, + "grad_norm": NaN, + "learning_rate": 2.447364852976641e-07, + "loss": 0.0, + "step": 63169 + }, + { + "epoch": 5.894373425398899, + "grad_norm": NaN, + "learning_rate": 2.443047782269725e-07, + "loss": 0.0, + "step": 63170 + }, + { + "epoch": 5.8944667350937765, + "grad_norm": NaN, + "learning_rate": 2.438734519406349e-07, + "loss": 0.0, + "step": 63171 + }, + { + "epoch": 5.894560044788654, + "grad_norm": NaN, + "learning_rate": 2.434425064397838e-07, + "loss": 0.0, + "step": 63172 + }, + { + "epoch": 5.8946533544835304, + "grad_norm": NaN, + "learning_rate": 2.43011941725485e-07, + "loss": 0.0, + "step": 63173 + }, + { + "epoch": 5.894746664178408, + "grad_norm": NaN, + "learning_rate": 2.4258175779885426e-07, + "loss": 0.0, + "step": 63174 + }, + { + "epoch": 5.894839973873285, + "grad_norm": NaN, + "learning_rate": 2.4215195466095736e-07, + "loss": 0.0, + "step": 63175 + }, + { + "epoch": 5.894933283568163, + "grad_norm": NaN, + "learning_rate": 2.4172253231292684e-07, + "loss": 0.0, + "step": 63176 + }, + { + "epoch": 5.89502659326304, + "grad_norm": NaN, + "learning_rate": 2.412934907558284e-07, + "loss": 0.0, + "step": 63177 + }, + { + "epoch": 5.8951199029579175, + "grad_norm": NaN, + "learning_rate": 2.408648299907612e-07, + "loss": 0.0, + "step": 63178 + }, + { + "epoch": 5.895213212652795, + "grad_norm": NaN, + "learning_rate": 2.4043655001880766e-07, + "loss": 0.0, + "step": 63179 + }, + { + "epoch": 5.895306522347672, + "grad_norm": NaN, + "learning_rate": 2.40008650841067e-07, + "loss": 0.0, + "step": 63180 + }, + { + "epoch": 5.895399832042549, + "grad_norm": NaN, + "learning_rate": 2.39581132458605e-07, + "loss": 0.0, + "step": 63181 + }, + { + "epoch": 5.895493141737426, + "grad_norm": NaN, + "learning_rate": 2.391539948725374e-07, + "loss": 0.0, + "step": 63182 + }, + { + "epoch": 5.895586451432304, + "grad_norm": NaN, + "learning_rate": 2.387272380839467e-07, + "loss": 0.0, + "step": 63183 + }, + { + "epoch": 5.895679761127181, + "grad_norm": NaN, + "learning_rate": 2.3830086209389864e-07, + "loss": 0.0, + "step": 63184 + }, + { + "epoch": 5.895773070822059, + "grad_norm": NaN, + "learning_rate": 2.3787486690349245e-07, + "loss": 0.0, + "step": 63185 + }, + { + "epoch": 5.895866380516936, + "grad_norm": NaN, + "learning_rate": 2.374492525138272e-07, + "loss": 0.0, + "step": 63186 + }, + { + "epoch": 5.8959596902118125, + "grad_norm": NaN, + "learning_rate": 2.3702401892593537e-07, + "loss": 0.0, + "step": 63187 + }, + { + "epoch": 5.89605299990669, + "grad_norm": NaN, + "learning_rate": 2.365991661409494e-07, + "loss": 0.0, + "step": 63188 + }, + { + "epoch": 5.896146309601567, + "grad_norm": NaN, + "learning_rate": 2.361746941599185e-07, + "loss": 0.0, + "step": 63189 + }, + { + "epoch": 5.896239619296445, + "grad_norm": NaN, + "learning_rate": 2.357506029839251e-07, + "loss": 0.0, + "step": 63190 + }, + { + "epoch": 5.896332928991322, + "grad_norm": NaN, + "learning_rate": 2.3532689261406835e-07, + "loss": 0.0, + "step": 63191 + }, + { + "epoch": 5.8964262386862, + "grad_norm": NaN, + "learning_rate": 2.34903563051414e-07, + "loss": 0.0, + "step": 63192 + }, + { + "epoch": 5.896519548381077, + "grad_norm": NaN, + "learning_rate": 2.344806142970279e-07, + "loss": 0.0, + "step": 63193 + }, + { + "epoch": 5.8966128580759545, + "grad_norm": NaN, + "learning_rate": 2.3405804635199253e-07, + "loss": 0.0, + "step": 63194 + }, + { + "epoch": 5.896706167770831, + "grad_norm": NaN, + "learning_rate": 2.336358592173737e-07, + "loss": 0.0, + "step": 63195 + }, + { + "epoch": 5.896799477465708, + "grad_norm": NaN, + "learning_rate": 2.3321405289427054e-07, + "loss": 0.0, + "step": 63196 + }, + { + "epoch": 5.896892787160586, + "grad_norm": NaN, + "learning_rate": 2.3279262738374882e-07, + "loss": 0.0, + "step": 63197 + }, + { + "epoch": 5.896986096855463, + "grad_norm": NaN, + "learning_rate": 2.3237158268685775e-07, + "loss": 0.0, + "step": 63198 + }, + { + "epoch": 5.897079406550341, + "grad_norm": NaN, + "learning_rate": 2.3195091880467974e-07, + "loss": 0.0, + "step": 63199 + }, + { + "epoch": 5.897172716245218, + "grad_norm": NaN, + "learning_rate": 2.3153063573828068e-07, + "loss": 0.0, + "step": 63200 + }, + { + "epoch": 5.8972660259400955, + "grad_norm": NaN, + "learning_rate": 2.3111073348874298e-07, + "loss": 0.0, + "step": 63201 + }, + { + "epoch": 5.897359335634972, + "grad_norm": NaN, + "learning_rate": 2.306912120571325e-07, + "loss": 0.0, + "step": 63202 + }, + { + "epoch": 5.8974526453298495, + "grad_norm": NaN, + "learning_rate": 2.3027207144449833e-07, + "loss": 0.0, + "step": 63203 + }, + { + "epoch": 5.897545955024727, + "grad_norm": NaN, + "learning_rate": 2.29853311651923e-07, + "loss": 0.0, + "step": 63204 + }, + { + "epoch": 5.897639264719604, + "grad_norm": NaN, + "learning_rate": 2.2943493268045564e-07, + "loss": 0.0, + "step": 63205 + }, + { + "epoch": 5.897732574414482, + "grad_norm": NaN, + "learning_rate": 2.2901693453117875e-07, + "loss": 0.0, + "step": 63206 + }, + { + "epoch": 5.897825884109359, + "grad_norm": NaN, + "learning_rate": 2.2859931720514146e-07, + "loss": 0.0, + "step": 63207 + }, + { + "epoch": 5.897919193804237, + "grad_norm": NaN, + "learning_rate": 2.2818208070342626e-07, + "loss": 0.0, + "step": 63208 + }, + { + "epoch": 5.898012503499114, + "grad_norm": NaN, + "learning_rate": 2.277652250270656e-07, + "loss": 0.0, + "step": 63209 + }, + { + "epoch": 5.8981058131939905, + "grad_norm": NaN, + "learning_rate": 2.273487501771254e-07, + "loss": 0.0, + "step": 63210 + }, + { + "epoch": 5.898199122888868, + "grad_norm": NaN, + "learning_rate": 2.2693265615467137e-07, + "loss": 0.0, + "step": 63211 + }, + { + "epoch": 5.898292432583745, + "grad_norm": NaN, + "learning_rate": 2.2651694296078605e-07, + "loss": 0.0, + "step": 63212 + }, + { + "epoch": 5.898385742278623, + "grad_norm": NaN, + "learning_rate": 2.261016105964686e-07, + "loss": 0.0, + "step": 63213 + }, + { + "epoch": 5.8984790519735, + "grad_norm": NaN, + "learning_rate": 2.256866590628348e-07, + "loss": 0.0, + "step": 63214 + }, + { + "epoch": 5.898572361668378, + "grad_norm": NaN, + "learning_rate": 2.252720883609005e-07, + "loss": 0.0, + "step": 63215 + }, + { + "epoch": 5.898665671363254, + "grad_norm": NaN, + "learning_rate": 2.2485789849173152e-07, + "loss": 0.0, + "step": 63216 + }, + { + "epoch": 5.898758981058132, + "grad_norm": NaN, + "learning_rate": 2.2444408945639369e-07, + "loss": 0.0, + "step": 63217 + }, + { + "epoch": 5.898852290753009, + "grad_norm": NaN, + "learning_rate": 2.240306612559195e-07, + "loss": 0.0, + "step": 63218 + }, + { + "epoch": 5.898945600447886, + "grad_norm": NaN, + "learning_rate": 2.236176138913748e-07, + "loss": 0.0, + "step": 63219 + }, + { + "epoch": 5.899038910142764, + "grad_norm": NaN, + "learning_rate": 2.23204947363792e-07, + "loss": 0.0, + "step": 63220 + }, + { + "epoch": 5.899132219837641, + "grad_norm": NaN, + "learning_rate": 2.2279266167423705e-07, + "loss": 0.0, + "step": 63221 + }, + { + "epoch": 5.899225529532519, + "grad_norm": NaN, + "learning_rate": 2.22380756823759e-07, + "loss": 0.0, + "step": 63222 + }, + { + "epoch": 5.899318839227396, + "grad_norm": NaN, + "learning_rate": 2.219692328133904e-07, + "loss": 0.0, + "step": 63223 + }, + { + "epoch": 5.8994121489222735, + "grad_norm": NaN, + "learning_rate": 2.2155808964419707e-07, + "loss": 0.0, + "step": 63224 + }, + { + "epoch": 5.89950545861715, + "grad_norm": NaN, + "learning_rate": 2.2114732731719487e-07, + "loss": 0.0, + "step": 63225 + }, + { + "epoch": 5.8995987683120275, + "grad_norm": NaN, + "learning_rate": 2.2073694583346624e-07, + "loss": 0.0, + "step": 63226 + }, + { + "epoch": 5.899692078006905, + "grad_norm": NaN, + "learning_rate": 2.2032694519402706e-07, + "loss": 0.0, + "step": 63227 + }, + { + "epoch": 5.899785387701782, + "grad_norm": NaN, + "learning_rate": 2.1991732539992646e-07, + "loss": 0.0, + "step": 63228 + }, + { + "epoch": 5.89987869739666, + "grad_norm": NaN, + "learning_rate": 2.1950808645221364e-07, + "loss": 0.0, + "step": 63229 + }, + { + "epoch": 5.899972007091537, + "grad_norm": NaN, + "learning_rate": 2.1909922835192107e-07, + "loss": 0.0, + "step": 63230 + }, + { + "epoch": 5.900065316786414, + "grad_norm": NaN, + "learning_rate": 2.1869075110009794e-07, + "loss": 0.0, + "step": 63231 + }, + { + "epoch": 5.900158626481291, + "grad_norm": NaN, + "learning_rate": 2.1828265469776008e-07, + "loss": 0.0, + "step": 63232 + }, + { + "epoch": 5.9002519361761685, + "grad_norm": NaN, + "learning_rate": 2.1787493914597332e-07, + "loss": 0.0, + "step": 63233 + }, + { + "epoch": 5.900345245871046, + "grad_norm": NaN, + "learning_rate": 2.1746760444575352e-07, + "loss": 0.0, + "step": 63234 + }, + { + "epoch": 5.900438555565923, + "grad_norm": NaN, + "learning_rate": 2.170606505981498e-07, + "loss": 0.0, + "step": 63235 + }, + { + "epoch": 5.900531865260801, + "grad_norm": NaN, + "learning_rate": 2.1665407760419473e-07, + "loss": 0.0, + "step": 63236 + }, + { + "epoch": 5.900625174955678, + "grad_norm": NaN, + "learning_rate": 2.1624788546492077e-07, + "loss": 0.0, + "step": 63237 + }, + { + "epoch": 5.900718484650556, + "grad_norm": NaN, + "learning_rate": 2.158420741813438e-07, + "loss": 0.0, + "step": 63238 + }, + { + "epoch": 5.900811794345432, + "grad_norm": NaN, + "learning_rate": 2.1543664375452962e-07, + "loss": 0.0, + "step": 63239 + }, + { + "epoch": 5.9009051040403095, + "grad_norm": NaN, + "learning_rate": 2.1503159418547745e-07, + "loss": 0.0, + "step": 63240 + }, + { + "epoch": 5.900998413735187, + "grad_norm": NaN, + "learning_rate": 2.146269254752364e-07, + "loss": 0.0, + "step": 63241 + }, + { + "epoch": 5.901091723430064, + "grad_norm": NaN, + "learning_rate": 2.1422263762483903e-07, + "loss": 0.0, + "step": 63242 + }, + { + "epoch": 5.901185033124942, + "grad_norm": NaN, + "learning_rate": 2.1381873063528455e-07, + "loss": 0.0, + "step": 63243 + }, + { + "epoch": 5.901278342819819, + "grad_norm": NaN, + "learning_rate": 2.1341520450762206e-07, + "loss": 0.0, + "step": 63244 + }, + { + "epoch": 5.901371652514697, + "grad_norm": NaN, + "learning_rate": 2.130120592428841e-07, + "loss": 0.0, + "step": 63245 + }, + { + "epoch": 5.901464962209573, + "grad_norm": NaN, + "learning_rate": 2.1260929484208654e-07, + "loss": 0.0, + "step": 63246 + }, + { + "epoch": 5.901558271904451, + "grad_norm": NaN, + "learning_rate": 2.1220691130624522e-07, + "loss": 0.0, + "step": 63247 + }, + { + "epoch": 5.901651581599328, + "grad_norm": NaN, + "learning_rate": 2.1180490863639266e-07, + "loss": 0.0, + "step": 63248 + }, + { + "epoch": 5.901744891294205, + "grad_norm": NaN, + "learning_rate": 2.1140328683356135e-07, + "loss": 0.0, + "step": 63249 + }, + { + "epoch": 5.901838200989083, + "grad_norm": NaN, + "learning_rate": 2.1100204589875048e-07, + "loss": 0.0, + "step": 63250 + }, + { + "epoch": 5.90193151068396, + "grad_norm": NaN, + "learning_rate": 2.1060118583299256e-07, + "loss": 0.0, + "step": 63251 + }, + { + "epoch": 5.902024820378838, + "grad_norm": NaN, + "learning_rate": 2.102007066373035e-07, + "loss": 0.0, + "step": 63252 + }, + { + "epoch": 5.902118130073715, + "grad_norm": NaN, + "learning_rate": 2.0980060831269907e-07, + "loss": 0.0, + "step": 63253 + }, + { + "epoch": 5.902211439768592, + "grad_norm": NaN, + "learning_rate": 2.0940089086021184e-07, + "loss": 0.0, + "step": 63254 + }, + { + "epoch": 5.902304749463469, + "grad_norm": NaN, + "learning_rate": 2.0900155428084098e-07, + "loss": 0.0, + "step": 63255 + }, + { + "epoch": 5.9023980591583465, + "grad_norm": NaN, + "learning_rate": 2.0860259857560235e-07, + "loss": 0.0, + "step": 63256 + }, + { + "epoch": 5.902491368853224, + "grad_norm": NaN, + "learning_rate": 2.0820402374552847e-07, + "loss": 0.0, + "step": 63257 + }, + { + "epoch": 5.902584678548101, + "grad_norm": NaN, + "learning_rate": 2.0780582979160188e-07, + "loss": 0.0, + "step": 63258 + }, + { + "epoch": 5.902677988242979, + "grad_norm": NaN, + "learning_rate": 2.0740801671487173e-07, + "loss": 0.0, + "step": 63259 + }, + { + "epoch": 5.902771297937855, + "grad_norm": NaN, + "learning_rate": 2.0701058451632058e-07, + "loss": 0.0, + "step": 63260 + }, + { + "epoch": 5.902864607632733, + "grad_norm": NaN, + "learning_rate": 2.0661353319698093e-07, + "loss": 0.0, + "step": 63261 + }, + { + "epoch": 5.90295791732761, + "grad_norm": NaN, + "learning_rate": 2.0621686275783532e-07, + "loss": 0.0, + "step": 63262 + }, + { + "epoch": 5.9030512270224875, + "grad_norm": NaN, + "learning_rate": 2.058205731999163e-07, + "loss": 0.0, + "step": 63263 + }, + { + "epoch": 5.903144536717365, + "grad_norm": NaN, + "learning_rate": 2.0542466452422303e-07, + "loss": 0.0, + "step": 63264 + }, + { + "epoch": 5.903237846412242, + "grad_norm": NaN, + "learning_rate": 2.050291367317547e-07, + "loss": 0.0, + "step": 63265 + }, + { + "epoch": 5.90333115610712, + "grad_norm": NaN, + "learning_rate": 2.046339898235272e-07, + "loss": 0.0, + "step": 63266 + }, + { + "epoch": 5.903424465801997, + "grad_norm": NaN, + "learning_rate": 2.0423922380055635e-07, + "loss": 0.0, + "step": 63267 + }, + { + "epoch": 5.903517775496875, + "grad_norm": NaN, + "learning_rate": 2.0384483866382473e-07, + "loss": 0.0, + "step": 63268 + }, + { + "epoch": 5.903611085191751, + "grad_norm": NaN, + "learning_rate": 2.0345083441433153e-07, + "loss": 0.0, + "step": 63269 + }, + { + "epoch": 5.903704394886629, + "grad_norm": NaN, + "learning_rate": 2.0305721105310924e-07, + "loss": 0.0, + "step": 63270 + }, + { + "epoch": 5.903797704581506, + "grad_norm": NaN, + "learning_rate": 2.0266396858112377e-07, + "loss": 0.0, + "step": 63271 + }, + { + "epoch": 5.903891014276383, + "grad_norm": NaN, + "learning_rate": 2.0227110699940762e-07, + "loss": 0.0, + "step": 63272 + }, + { + "epoch": 5.903984323971261, + "grad_norm": NaN, + "learning_rate": 2.018786263089267e-07, + "loss": 0.0, + "step": 63273 + }, + { + "epoch": 5.904077633666138, + "grad_norm": NaN, + "learning_rate": 2.0148652651069685e-07, + "loss": 0.0, + "step": 63274 + }, + { + "epoch": 5.904170943361015, + "grad_norm": NaN, + "learning_rate": 2.010948076057173e-07, + "loss": 0.0, + "step": 63275 + }, + { + "epoch": 5.904264253055892, + "grad_norm": NaN, + "learning_rate": 2.0070346959497053e-07, + "loss": 0.0, + "step": 63276 + }, + { + "epoch": 5.90435756275077, + "grad_norm": NaN, + "learning_rate": 2.0031251247947245e-07, + "loss": 0.0, + "step": 63277 + }, + { + "epoch": 5.904450872445647, + "grad_norm": NaN, + "learning_rate": 1.9992193626020557e-07, + "loss": 0.0, + "step": 63278 + }, + { + "epoch": 5.9045441821405245, + "grad_norm": NaN, + "learning_rate": 1.995317409381525e-07, + "loss": 0.0, + "step": 63279 + }, + { + "epoch": 5.904637491835402, + "grad_norm": NaN, + "learning_rate": 1.9914192651432904e-07, + "loss": 0.0, + "step": 63280 + }, + { + "epoch": 5.904730801530279, + "grad_norm": NaN, + "learning_rate": 1.9875249298971773e-07, + "loss": 0.0, + "step": 63281 + }, + { + "epoch": 5.904824111225157, + "grad_norm": NaN, + "learning_rate": 1.9836344036528452e-07, + "loss": 0.0, + "step": 63282 + }, + { + "epoch": 5.904917420920033, + "grad_norm": NaN, + "learning_rate": 1.979747686420452e-07, + "loss": 0.0, + "step": 63283 + }, + { + "epoch": 5.905010730614911, + "grad_norm": NaN, + "learning_rate": 1.9758647782098236e-07, + "loss": 0.0, + "step": 63284 + }, + { + "epoch": 5.905104040309788, + "grad_norm": NaN, + "learning_rate": 1.9719856790309518e-07, + "loss": 0.0, + "step": 63285 + }, + { + "epoch": 5.9051973500046655, + "grad_norm": NaN, + "learning_rate": 1.9681103888934958e-07, + "loss": 0.0, + "step": 63286 + }, + { + "epoch": 5.905290659699543, + "grad_norm": NaN, + "learning_rate": 1.9642389078072807e-07, + "loss": 0.0, + "step": 63287 + }, + { + "epoch": 5.90538396939442, + "grad_norm": NaN, + "learning_rate": 1.9603712357822987e-07, + "loss": 0.0, + "step": 63288 + }, + { + "epoch": 5.905477279089298, + "grad_norm": NaN, + "learning_rate": 1.9565073728283754e-07, + "loss": 0.0, + "step": 63289 + }, + { + "epoch": 5.905570588784174, + "grad_norm": NaN, + "learning_rate": 1.952647318955336e-07, + "loss": 0.0, + "step": 63290 + }, + { + "epoch": 5.905663898479052, + "grad_norm": NaN, + "learning_rate": 1.9487910741730063e-07, + "loss": 0.0, + "step": 63291 + }, + { + "epoch": 5.905757208173929, + "grad_norm": NaN, + "learning_rate": 1.9449386384910447e-07, + "loss": 0.0, + "step": 63292 + }, + { + "epoch": 5.9058505178688065, + "grad_norm": NaN, + "learning_rate": 1.9410900119192772e-07, + "loss": 0.0, + "step": 63293 + }, + { + "epoch": 5.905943827563684, + "grad_norm": NaN, + "learning_rate": 1.937245194467696e-07, + "loss": 0.0, + "step": 63294 + }, + { + "epoch": 5.906037137258561, + "grad_norm": NaN, + "learning_rate": 1.9334041861459593e-07, + "loss": 0.0, + "step": 63295 + }, + { + "epoch": 5.906130446953439, + "grad_norm": NaN, + "learning_rate": 1.9295669869637264e-07, + "loss": 0.0, + "step": 63296 + }, + { + "epoch": 5.906223756648316, + "grad_norm": NaN, + "learning_rate": 1.925733596930823e-07, + "loss": 0.0, + "step": 63297 + }, + { + "epoch": 5.906317066343193, + "grad_norm": NaN, + "learning_rate": 1.921904016057074e-07, + "loss": 0.0, + "step": 63298 + }, + { + "epoch": 5.90641037603807, + "grad_norm": NaN, + "learning_rate": 1.918078244352139e-07, + "loss": 0.0, + "step": 63299 + }, + { + "epoch": 5.906503685732948, + "grad_norm": NaN, + "learning_rate": 1.9142562818258434e-07, + "loss": 0.0, + "step": 63300 + }, + { + "epoch": 5.906596995427825, + "grad_norm": NaN, + "learning_rate": 1.9104381284876792e-07, + "loss": 0.0, + "step": 63301 + }, + { + "epoch": 5.906690305122702, + "grad_norm": NaN, + "learning_rate": 1.9066237843476384e-07, + "loss": 0.0, + "step": 63302 + }, + { + "epoch": 5.90678361481758, + "grad_norm": NaN, + "learning_rate": 1.9028132494152137e-07, + "loss": 0.0, + "step": 63303 + }, + { + "epoch": 5.906876924512456, + "grad_norm": NaN, + "learning_rate": 1.899006523700064e-07, + "loss": 0.0, + "step": 63304 + }, + { + "epoch": 5.906970234207334, + "grad_norm": NaN, + "learning_rate": 1.8952036072121812e-07, + "loss": 0.0, + "step": 63305 + }, + { + "epoch": 5.907063543902211, + "grad_norm": NaN, + "learning_rate": 1.8914044999608912e-07, + "loss": 0.0, + "step": 63306 + }, + { + "epoch": 5.907156853597089, + "grad_norm": NaN, + "learning_rate": 1.8876092019560196e-07, + "loss": 0.0, + "step": 63307 + }, + { + "epoch": 5.907250163291966, + "grad_norm": NaN, + "learning_rate": 1.8838177132072253e-07, + "loss": 0.0, + "step": 63308 + }, + { + "epoch": 5.9073434729868435, + "grad_norm": NaN, + "learning_rate": 1.8800300337240005e-07, + "loss": 0.0, + "step": 63309 + }, + { + "epoch": 5.907436782681721, + "grad_norm": NaN, + "learning_rate": 1.8762461635161709e-07, + "loss": 0.0, + "step": 63310 + }, + { + "epoch": 5.907530092376598, + "grad_norm": NaN, + "learning_rate": 1.8724661025933952e-07, + "loss": 0.0, + "step": 63311 + }, + { + "epoch": 5.907623402071475, + "grad_norm": NaN, + "learning_rate": 1.8686898509649995e-07, + "loss": 0.0, + "step": 63312 + }, + { + "epoch": 5.907716711766352, + "grad_norm": NaN, + "learning_rate": 1.8649174086406427e-07, + "loss": 0.0, + "step": 63313 + }, + { + "epoch": 5.90781002146123, + "grad_norm": NaN, + "learning_rate": 1.8611487756301502e-07, + "loss": 0.0, + "step": 63314 + }, + { + "epoch": 5.907903331156107, + "grad_norm": NaN, + "learning_rate": 1.8573839519430144e-07, + "loss": 0.0, + "step": 63315 + }, + { + "epoch": 5.9079966408509845, + "grad_norm": NaN, + "learning_rate": 1.853622937588728e-07, + "loss": 0.0, + "step": 63316 + }, + { + "epoch": 5.908089950545862, + "grad_norm": NaN, + "learning_rate": 1.8498657325769495e-07, + "loss": 0.0, + "step": 63317 + }, + { + "epoch": 5.908183260240739, + "grad_norm": NaN, + "learning_rate": 1.846112336917005e-07, + "loss": 0.0, + "step": 63318 + }, + { + "epoch": 5.908276569935616, + "grad_norm": NaN, + "learning_rate": 1.84236275061872e-07, + "loss": 0.0, + "step": 63319 + }, + { + "epoch": 5.908369879630493, + "grad_norm": NaN, + "learning_rate": 1.8386169736915869e-07, + "loss": 0.0, + "step": 63320 + }, + { + "epoch": 5.908463189325371, + "grad_norm": NaN, + "learning_rate": 1.834875006145098e-07, + "loss": 0.0, + "step": 63321 + }, + { + "epoch": 5.908556499020248, + "grad_norm": NaN, + "learning_rate": 1.831136847988579e-07, + "loss": 0.0, + "step": 63322 + }, + { + "epoch": 5.908649808715126, + "grad_norm": NaN, + "learning_rate": 1.8274024992316894e-07, + "loss": 0.0, + "step": 63323 + }, + { + "epoch": 5.908743118410003, + "grad_norm": NaN, + "learning_rate": 1.8236719598840876e-07, + "loss": 0.0, + "step": 63324 + }, + { + "epoch": 5.90883642810488, + "grad_norm": NaN, + "learning_rate": 1.8199452299549333e-07, + "loss": 0.0, + "step": 63325 + }, + { + "epoch": 5.908929737799758, + "grad_norm": NaN, + "learning_rate": 1.8162223094538853e-07, + "loss": 0.0, + "step": 63326 + }, + { + "epoch": 5.909023047494634, + "grad_norm": NaN, + "learning_rate": 1.8125031983904358e-07, + "loss": 0.0, + "step": 63327 + }, + { + "epoch": 5.909116357189512, + "grad_norm": NaN, + "learning_rate": 1.808787896773911e-07, + "loss": 0.0, + "step": 63328 + }, + { + "epoch": 5.909209666884389, + "grad_norm": NaN, + "learning_rate": 1.8050764046139699e-07, + "loss": 0.0, + "step": 63329 + }, + { + "epoch": 5.909302976579267, + "grad_norm": NaN, + "learning_rate": 1.801368721919938e-07, + "loss": 0.0, + "step": 63330 + }, + { + "epoch": 5.909396286274144, + "grad_norm": NaN, + "learning_rate": 1.7976648487011412e-07, + "loss": 0.0, + "step": 63331 + }, + { + "epoch": 5.9094895959690215, + "grad_norm": NaN, + "learning_rate": 1.7939647849670723e-07, + "loss": 0.0, + "step": 63332 + }, + { + "epoch": 5.909582905663898, + "grad_norm": NaN, + "learning_rate": 1.7902685307272236e-07, + "loss": 0.0, + "step": 63333 + }, + { + "epoch": 5.909676215358775, + "grad_norm": NaN, + "learning_rate": 1.786576085990754e-07, + "loss": 0.0, + "step": 63334 + }, + { + "epoch": 5.909769525053653, + "grad_norm": NaN, + "learning_rate": 1.782887450767323e-07, + "loss": 0.0, + "step": 63335 + }, + { + "epoch": 5.90986283474853, + "grad_norm": NaN, + "learning_rate": 1.7792026250662562e-07, + "loss": 0.0, + "step": 63336 + }, + { + "epoch": 5.909956144443408, + "grad_norm": NaN, + "learning_rate": 1.7755216088968794e-07, + "loss": 0.0, + "step": 63337 + }, + { + "epoch": 5.910049454138285, + "grad_norm": NaN, + "learning_rate": 1.7718444022685185e-07, + "loss": 0.0, + "step": 63338 + }, + { + "epoch": 5.9101427638331625, + "grad_norm": NaN, + "learning_rate": 1.7681710051904996e-07, + "loss": 0.0, + "step": 63339 + }, + { + "epoch": 5.91023607352804, + "grad_norm": NaN, + "learning_rate": 1.7645014176721484e-07, + "loss": 0.0, + "step": 63340 + }, + { + "epoch": 5.910329383222917, + "grad_norm": NaN, + "learning_rate": 1.7608356397229573e-07, + "loss": 0.0, + "step": 63341 + }, + { + "epoch": 5.910422692917794, + "grad_norm": NaN, + "learning_rate": 1.7571736713520855e-07, + "loss": 0.0, + "step": 63342 + }, + { + "epoch": 5.910516002612671, + "grad_norm": NaN, + "learning_rate": 1.7535155125690258e-07, + "loss": 0.0, + "step": 63343 + }, + { + "epoch": 5.910609312307549, + "grad_norm": NaN, + "learning_rate": 1.7498611633829374e-07, + "loss": 0.0, + "step": 63344 + }, + { + "epoch": 5.910702622002426, + "grad_norm": NaN, + "learning_rate": 1.746210623803146e-07, + "loss": 0.0, + "step": 63345 + }, + { + "epoch": 5.9107959316973036, + "grad_norm": NaN, + "learning_rate": 1.7425638938388108e-07, + "loss": 0.0, + "step": 63346 + }, + { + "epoch": 5.910889241392181, + "grad_norm": NaN, + "learning_rate": 1.7389209734994246e-07, + "loss": 0.0, + "step": 63347 + }, + { + "epoch": 5.9109825510870575, + "grad_norm": NaN, + "learning_rate": 1.7352818627941468e-07, + "loss": 0.0, + "step": 63348 + }, + { + "epoch": 5.911075860781935, + "grad_norm": NaN, + "learning_rate": 1.7316465617321363e-07, + "loss": 0.0, + "step": 63349 + }, + { + "epoch": 5.911169170476812, + "grad_norm": NaN, + "learning_rate": 1.728015070322719e-07, + "loss": 0.0, + "step": 63350 + }, + { + "epoch": 5.91126248017169, + "grad_norm": NaN, + "learning_rate": 1.7243873885752213e-07, + "loss": 0.0, + "step": 63351 + }, + { + "epoch": 5.911355789866567, + "grad_norm": NaN, + "learning_rate": 1.720763516498802e-07, + "loss": 0.0, + "step": 63352 + }, + { + "epoch": 5.911449099561445, + "grad_norm": NaN, + "learning_rate": 1.7171434541026208e-07, + "loss": 0.0, + "step": 63353 + }, + { + "epoch": 5.911542409256322, + "grad_norm": NaN, + "learning_rate": 1.7135272013958367e-07, + "loss": 0.0, + "step": 63354 + }, + { + "epoch": 5.911635718951199, + "grad_norm": NaN, + "learning_rate": 1.7099147583879426e-07, + "loss": 0.0, + "step": 63355 + }, + { + "epoch": 5.911729028646076, + "grad_norm": NaN, + "learning_rate": 1.706306125087764e-07, + "loss": 0.0, + "step": 63356 + }, + { + "epoch": 5.911822338340953, + "grad_norm": NaN, + "learning_rate": 1.7027013015046275e-07, + "loss": 0.0, + "step": 63357 + }, + { + "epoch": 5.911915648035831, + "grad_norm": NaN, + "learning_rate": 1.6991002876476923e-07, + "loss": 0.0, + "step": 63358 + }, + { + "epoch": 5.912008957730708, + "grad_norm": NaN, + "learning_rate": 1.6955030835261175e-07, + "loss": 0.0, + "step": 63359 + }, + { + "epoch": 5.912102267425586, + "grad_norm": NaN, + "learning_rate": 1.6919096891490624e-07, + "loss": 0.0, + "step": 63360 + }, + { + "epoch": 5.912195577120463, + "grad_norm": NaN, + "learning_rate": 1.6883201045256867e-07, + "loss": 0.0, + "step": 63361 + }, + { + "epoch": 5.9122888868153405, + "grad_norm": NaN, + "learning_rate": 1.684734329664983e-07, + "loss": 0.0, + "step": 63362 + }, + { + "epoch": 5.912382196510217, + "grad_norm": NaN, + "learning_rate": 1.6811523645761103e-07, + "loss": 0.0, + "step": 63363 + }, + { + "epoch": 5.912475506205094, + "grad_norm": NaN, + "learning_rate": 1.6775742092683952e-07, + "loss": 0.0, + "step": 63364 + }, + { + "epoch": 5.912568815899972, + "grad_norm": NaN, + "learning_rate": 1.6739998637506635e-07, + "loss": 0.0, + "step": 63365 + }, + { + "epoch": 5.912662125594849, + "grad_norm": NaN, + "learning_rate": 1.6704293280322413e-07, + "loss": 0.0, + "step": 63366 + }, + { + "epoch": 5.912755435289727, + "grad_norm": NaN, + "learning_rate": 1.6668626021219544e-07, + "loss": 0.0, + "step": 63367 + }, + { + "epoch": 5.912848744984604, + "grad_norm": NaN, + "learning_rate": 1.6632996860291291e-07, + "loss": 0.0, + "step": 63368 + }, + { + "epoch": 5.9129420546794815, + "grad_norm": NaN, + "learning_rate": 1.6597405797625918e-07, + "loss": 0.0, + "step": 63369 + }, + { + "epoch": 5.913035364374359, + "grad_norm": NaN, + "learning_rate": 1.6561852833315016e-07, + "loss": 0.0, + "step": 63370 + }, + { + "epoch": 5.9131286740692355, + "grad_norm": NaN, + "learning_rate": 1.652633796744851e-07, + "loss": 0.0, + "step": 63371 + }, + { + "epoch": 5.913221983764113, + "grad_norm": NaN, + "learning_rate": 1.6490861200118e-07, + "loss": 0.0, + "step": 63372 + }, + { + "epoch": 5.91331529345899, + "grad_norm": NaN, + "learning_rate": 1.6455422531413408e-07, + "loss": 0.0, + "step": 63373 + }, + { + "epoch": 5.913408603153868, + "grad_norm": NaN, + "learning_rate": 1.6420021961423002e-07, + "loss": 0.0, + "step": 63374 + }, + { + "epoch": 5.913501912848745, + "grad_norm": NaN, + "learning_rate": 1.6384659490238373e-07, + "loss": 0.0, + "step": 63375 + }, + { + "epoch": 5.913595222543623, + "grad_norm": NaN, + "learning_rate": 1.634933511794778e-07, + "loss": 0.0, + "step": 63376 + }, + { + "epoch": 5.913688532238499, + "grad_norm": NaN, + "learning_rate": 1.631404884464449e-07, + "loss": 0.0, + "step": 63377 + }, + { + "epoch": 5.9137818419333765, + "grad_norm": NaN, + "learning_rate": 1.6278800670415094e-07, + "loss": 0.0, + "step": 63378 + }, + { + "epoch": 5.913875151628254, + "grad_norm": NaN, + "learning_rate": 1.624359059534952e-07, + "loss": 0.0, + "step": 63379 + }, + { + "epoch": 5.913968461323131, + "grad_norm": NaN, + "learning_rate": 1.6208418619539366e-07, + "loss": 0.0, + "step": 63380 + }, + { + "epoch": 5.914061771018009, + "grad_norm": NaN, + "learning_rate": 1.6173284743071224e-07, + "loss": 0.0, + "step": 63381 + }, + { + "epoch": 5.914155080712886, + "grad_norm": NaN, + "learning_rate": 1.6138188966036693e-07, + "loss": 0.0, + "step": 63382 + }, + { + "epoch": 5.914248390407764, + "grad_norm": NaN, + "learning_rate": 1.610313128852403e-07, + "loss": 0.0, + "step": 63383 + }, + { + "epoch": 5.914341700102641, + "grad_norm": NaN, + "learning_rate": 1.6068111710623165e-07, + "loss": 0.0, + "step": 63384 + }, + { + "epoch": 5.9144350097975185, + "grad_norm": NaN, + "learning_rate": 1.60331302324207e-07, + "loss": 0.0, + "step": 63385 + }, + { + "epoch": 5.914528319492395, + "grad_norm": NaN, + "learning_rate": 1.5998186854009887e-07, + "loss": 0.0, + "step": 63386 + }, + { + "epoch": 5.914621629187272, + "grad_norm": NaN, + "learning_rate": 1.5963281575475662e-07, + "loss": 0.0, + "step": 63387 + }, + { + "epoch": 5.91471493888215, + "grad_norm": NaN, + "learning_rate": 1.5928414396907952e-07, + "loss": 0.0, + "step": 63388 + }, + { + "epoch": 5.914808248577027, + "grad_norm": NaN, + "learning_rate": 1.5893585318396683e-07, + "loss": 0.0, + "step": 63389 + }, + { + "epoch": 5.914901558271905, + "grad_norm": NaN, + "learning_rate": 1.5858794340028458e-07, + "loss": 0.0, + "step": 63390 + }, + { + "epoch": 5.914994867966782, + "grad_norm": NaN, + "learning_rate": 1.58240414618932e-07, + "loss": 0.0, + "step": 63391 + }, + { + "epoch": 5.915088177661659, + "grad_norm": NaN, + "learning_rate": 1.5789326684079173e-07, + "loss": 0.0, + "step": 63392 + }, + { + "epoch": 5.915181487356536, + "grad_norm": NaN, + "learning_rate": 1.5754650006672975e-07, + "loss": 0.0, + "step": 63393 + }, + { + "epoch": 5.9152747970514135, + "grad_norm": NaN, + "learning_rate": 1.5720011429764533e-07, + "loss": 0.0, + "step": 63394 + }, + { + "epoch": 5.915368106746291, + "grad_norm": NaN, + "learning_rate": 1.5685410953442113e-07, + "loss": 0.0, + "step": 63395 + }, + { + "epoch": 5.915461416441168, + "grad_norm": NaN, + "learning_rate": 1.5650848577792308e-07, + "loss": 0.0, + "step": 63396 + }, + { + "epoch": 5.915554726136046, + "grad_norm": NaN, + "learning_rate": 1.5616324302905048e-07, + "loss": 0.0, + "step": 63397 + }, + { + "epoch": 5.915648035830923, + "grad_norm": NaN, + "learning_rate": 1.5581838128865266e-07, + "loss": 0.0, + "step": 63398 + }, + { + "epoch": 5.9157413455258006, + "grad_norm": NaN, + "learning_rate": 1.5547390055762888e-07, + "loss": 0.0, + "step": 63399 + }, + { + "epoch": 5.915834655220677, + "grad_norm": NaN, + "learning_rate": 1.5512980083684513e-07, + "loss": 0.0, + "step": 63400 + }, + { + "epoch": 5.9159279649155545, + "grad_norm": NaN, + "learning_rate": 1.54786082127184e-07, + "loss": 0.0, + "step": 63401 + }, + { + "epoch": 5.916021274610432, + "grad_norm": NaN, + "learning_rate": 1.5444274442951153e-07, + "loss": 0.0, + "step": 63402 + }, + { + "epoch": 5.916114584305309, + "grad_norm": NaN, + "learning_rate": 1.5409978774469366e-07, + "loss": 0.0, + "step": 63403 + }, + { + "epoch": 5.916207894000187, + "grad_norm": NaN, + "learning_rate": 1.5375721207361303e-07, + "loss": 0.0, + "step": 63404 + }, + { + "epoch": 5.916301203695064, + "grad_norm": NaN, + "learning_rate": 1.5341501741715222e-07, + "loss": 0.0, + "step": 63405 + }, + { + "epoch": 5.916394513389942, + "grad_norm": NaN, + "learning_rate": 1.5307320377616062e-07, + "loss": 0.0, + "step": 63406 + }, + { + "epoch": 5.916487823084818, + "grad_norm": NaN, + "learning_rate": 1.527317711515208e-07, + "loss": 0.0, + "step": 63407 + }, + { + "epoch": 5.9165811327796956, + "grad_norm": NaN, + "learning_rate": 1.5239071954409875e-07, + "loss": 0.0, + "step": 63408 + }, + { + "epoch": 5.916674442474573, + "grad_norm": NaN, + "learning_rate": 1.5205004895476048e-07, + "loss": 0.0, + "step": 63409 + }, + { + "epoch": 5.91676775216945, + "grad_norm": NaN, + "learning_rate": 1.5170975938435525e-07, + "loss": 0.0, + "step": 63410 + }, + { + "epoch": 5.916861061864328, + "grad_norm": NaN, + "learning_rate": 1.5136985083378238e-07, + "loss": 0.0, + "step": 63411 + }, + { + "epoch": 5.916954371559205, + "grad_norm": NaN, + "learning_rate": 1.5103032330387456e-07, + "loss": 0.0, + "step": 63412 + }, + { + "epoch": 5.917047681254083, + "grad_norm": NaN, + "learning_rate": 1.5069117679551435e-07, + "loss": 0.0, + "step": 63413 + }, + { + "epoch": 5.91714099094896, + "grad_norm": NaN, + "learning_rate": 1.503524113095511e-07, + "loss": 0.0, + "step": 63414 + }, + { + "epoch": 5.917234300643837, + "grad_norm": NaN, + "learning_rate": 1.5001402684686748e-07, + "loss": 0.0, + "step": 63415 + }, + { + "epoch": 5.917327610338714, + "grad_norm": NaN, + "learning_rate": 1.4967602340829609e-07, + "loss": 0.0, + "step": 63416 + }, + { + "epoch": 5.917420920033591, + "grad_norm": NaN, + "learning_rate": 1.493384009947196e-07, + "loss": 0.0, + "step": 63417 + }, + { + "epoch": 5.917514229728469, + "grad_norm": NaN, + "learning_rate": 1.4900115960697067e-07, + "loss": 0.0, + "step": 63418 + }, + { + "epoch": 5.917607539423346, + "grad_norm": NaN, + "learning_rate": 1.4866429924593193e-07, + "loss": 0.0, + "step": 63419 + }, + { + "epoch": 5.917700849118224, + "grad_norm": NaN, + "learning_rate": 1.4832781991245267e-07, + "loss": 0.0, + "step": 63420 + }, + { + "epoch": 5.9177941588131, + "grad_norm": NaN, + "learning_rate": 1.4799172160738227e-07, + "loss": 0.0, + "step": 63421 + }, + { + "epoch": 5.917887468507978, + "grad_norm": NaN, + "learning_rate": 1.4765600433158663e-07, + "loss": 0.0, + "step": 63422 + }, + { + "epoch": 5.917980778202855, + "grad_norm": NaN, + "learning_rate": 1.473206680858985e-07, + "loss": 0.0, + "step": 63423 + }, + { + "epoch": 5.9180740878977325, + "grad_norm": NaN, + "learning_rate": 1.4698571287120042e-07, + "loss": 0.0, + "step": 63424 + }, + { + "epoch": 5.91816739759261, + "grad_norm": NaN, + "learning_rate": 1.4665113868832513e-07, + "loss": 0.0, + "step": 63425 + }, + { + "epoch": 5.918260707287487, + "grad_norm": NaN, + "learning_rate": 1.463169455381219e-07, + "loss": 0.0, + "step": 63426 + }, + { + "epoch": 5.918354016982365, + "grad_norm": NaN, + "learning_rate": 1.4598313342144008e-07, + "loss": 0.0, + "step": 63427 + }, + { + "epoch": 5.918447326677242, + "grad_norm": NaN, + "learning_rate": 1.4564970233914563e-07, + "loss": 0.0, + "step": 63428 + }, + { + "epoch": 5.918540636372119, + "grad_norm": NaN, + "learning_rate": 1.453166522920546e-07, + "loss": 0.0, + "step": 63429 + }, + { + "epoch": 5.918633946066996, + "grad_norm": NaN, + "learning_rate": 1.4498398328104954e-07, + "loss": 0.0, + "step": 63430 + }, + { + "epoch": 5.9187272557618735, + "grad_norm": NaN, + "learning_rate": 1.4465169530694652e-07, + "loss": 0.0, + "step": 63431 + }, + { + "epoch": 5.918820565456751, + "grad_norm": NaN, + "learning_rate": 1.4431978837061152e-07, + "loss": 0.0, + "step": 63432 + }, + { + "epoch": 5.918913875151628, + "grad_norm": NaN, + "learning_rate": 1.4398826247289387e-07, + "loss": 0.0, + "step": 63433 + }, + { + "epoch": 5.919007184846506, + "grad_norm": NaN, + "learning_rate": 1.4365711761460952e-07, + "loss": 0.0, + "step": 63434 + }, + { + "epoch": 5.919100494541383, + "grad_norm": NaN, + "learning_rate": 1.4332635379660784e-07, + "loss": 0.0, + "step": 63435 + }, + { + "epoch": 5.91919380423626, + "grad_norm": NaN, + "learning_rate": 1.4299597101975479e-07, + "loss": 0.0, + "step": 63436 + }, + { + "epoch": 5.919287113931137, + "grad_norm": NaN, + "learning_rate": 1.4266596928484975e-07, + "loss": 0.0, + "step": 63437 + }, + { + "epoch": 5.919380423626015, + "grad_norm": NaN, + "learning_rate": 1.423363485927753e-07, + "loss": 0.0, + "step": 63438 + }, + { + "epoch": 5.919473733320892, + "grad_norm": NaN, + "learning_rate": 1.4200710894433088e-07, + "loss": 0.0, + "step": 63439 + }, + { + "epoch": 5.919567043015769, + "grad_norm": NaN, + "learning_rate": 1.4167825034038237e-07, + "loss": 0.0, + "step": 63440 + }, + { + "epoch": 5.919660352710647, + "grad_norm": NaN, + "learning_rate": 1.4134977278174587e-07, + "loss": 0.0, + "step": 63441 + }, + { + "epoch": 5.919753662405524, + "grad_norm": NaN, + "learning_rate": 1.4102167626927063e-07, + "loss": 0.0, + "step": 63442 + }, + { + "epoch": 5.919846972100402, + "grad_norm": NaN, + "learning_rate": 1.4069396080378936e-07, + "loss": 0.0, + "step": 63443 + }, + { + "epoch": 5.919940281795278, + "grad_norm": NaN, + "learning_rate": 1.403666263861347e-07, + "loss": 0.0, + "step": 63444 + }, + { + "epoch": 5.920033591490156, + "grad_norm": NaN, + "learning_rate": 1.400396730171227e-07, + "loss": 0.0, + "step": 63445 + }, + { + "epoch": 5.920126901185033, + "grad_norm": NaN, + "learning_rate": 1.3971310069760267e-07, + "loss": 0.0, + "step": 63446 + }, + { + "epoch": 5.9202202108799105, + "grad_norm": NaN, + "learning_rate": 1.3938690942840723e-07, + "loss": 0.0, + "step": 63447 + }, + { + "epoch": 5.920313520574788, + "grad_norm": NaN, + "learning_rate": 1.3906109921035247e-07, + "loss": 0.0, + "step": 63448 + }, + { + "epoch": 5.920406830269665, + "grad_norm": NaN, + "learning_rate": 1.3873567004428765e-07, + "loss": 0.0, + "step": 63449 + }, + { + "epoch": 5.920500139964542, + "grad_norm": NaN, + "learning_rate": 1.3841062193101214e-07, + "loss": 0.0, + "step": 63450 + }, + { + "epoch": 5.920593449659419, + "grad_norm": NaN, + "learning_rate": 1.3808595487135865e-07, + "loss": 0.0, + "step": 63451 + }, + { + "epoch": 5.920686759354297, + "grad_norm": NaN, + "learning_rate": 1.3776166886617645e-07, + "loss": 0.0, + "step": 63452 + }, + { + "epoch": 5.920780069049174, + "grad_norm": NaN, + "learning_rate": 1.3743776391626492e-07, + "loss": 0.0, + "step": 63453 + }, + { + "epoch": 5.9208733787440515, + "grad_norm": NaN, + "learning_rate": 1.3711424002245674e-07, + "loss": 0.0, + "step": 63454 + }, + { + "epoch": 5.920966688438929, + "grad_norm": NaN, + "learning_rate": 1.367910971855679e-07, + "loss": 0.0, + "step": 63455 + }, + { + "epoch": 5.921059998133806, + "grad_norm": NaN, + "learning_rate": 1.3646833540644775e-07, + "loss": 0.0, + "step": 63456 + }, + { + "epoch": 5.921153307828684, + "grad_norm": NaN, + "learning_rate": 1.3614595468587897e-07, + "loss": 0.0, + "step": 63457 + }, + { + "epoch": 5.921246617523561, + "grad_norm": NaN, + "learning_rate": 1.3582395502469422e-07, + "loss": 0.0, + "step": 63458 + }, + { + "epoch": 5.921339927218438, + "grad_norm": NaN, + "learning_rate": 1.355023364237262e-07, + "loss": 0.0, + "step": 63459 + }, + { + "epoch": 5.921433236913315, + "grad_norm": NaN, + "learning_rate": 1.351810988837909e-07, + "loss": 0.0, + "step": 63460 + }, + { + "epoch": 5.9215265466081926, + "grad_norm": NaN, + "learning_rate": 1.3486024240567106e-07, + "loss": 0.0, + "step": 63461 + }, + { + "epoch": 5.92161985630307, + "grad_norm": NaN, + "learning_rate": 1.345397669902326e-07, + "loss": 0.0, + "step": 63462 + }, + { + "epoch": 5.921713165997947, + "grad_norm": NaN, + "learning_rate": 1.3421967263825827e-07, + "loss": 0.0, + "step": 63463 + }, + { + "epoch": 5.921806475692825, + "grad_norm": NaN, + "learning_rate": 1.3389995935056408e-07, + "loss": 0.0, + "step": 63464 + }, + { + "epoch": 5.921899785387701, + "grad_norm": NaN, + "learning_rate": 1.33580627127966e-07, + "loss": 0.0, + "step": 63465 + }, + { + "epoch": 5.921993095082579, + "grad_norm": NaN, + "learning_rate": 1.332616759712968e-07, + "loss": 0.0, + "step": 63466 + }, + { + "epoch": 5.922086404777456, + "grad_norm": NaN, + "learning_rate": 1.3294310588133906e-07, + "loss": 0.0, + "step": 63467 + }, + { + "epoch": 5.922179714472334, + "grad_norm": NaN, + "learning_rate": 1.3262491685890885e-07, + "loss": 0.0, + "step": 63468 + }, + { + "epoch": 5.922273024167211, + "grad_norm": NaN, + "learning_rate": 1.323071089048222e-07, + "loss": 0.0, + "step": 63469 + }, + { + "epoch": 5.922366333862088, + "grad_norm": NaN, + "learning_rate": 1.3198968201987848e-07, + "loss": 0.0, + "step": 63470 + }, + { + "epoch": 5.922459643556966, + "grad_norm": NaN, + "learning_rate": 1.3167263620491031e-07, + "loss": 0.0, + "step": 63471 + }, + { + "epoch": 5.922552953251843, + "grad_norm": NaN, + "learning_rate": 1.3135597146068378e-07, + "loss": 0.0, + "step": 63472 + }, + { + "epoch": 5.92264626294672, + "grad_norm": NaN, + "learning_rate": 1.3103968778803153e-07, + "loss": 0.0, + "step": 63473 + }, + { + "epoch": 5.922739572641597, + "grad_norm": NaN, + "learning_rate": 1.3072378518775294e-07, + "loss": 0.0, + "step": 63474 + }, + { + "epoch": 5.922832882336475, + "grad_norm": NaN, + "learning_rate": 1.3040826366064738e-07, + "loss": 0.0, + "step": 63475 + }, + { + "epoch": 5.922926192031352, + "grad_norm": NaN, + "learning_rate": 1.300931232075142e-07, + "loss": 0.0, + "step": 63476 + }, + { + "epoch": 5.9230195017262295, + "grad_norm": NaN, + "learning_rate": 1.297783638291694e-07, + "loss": 0.0, + "step": 63477 + }, + { + "epoch": 5.923112811421107, + "grad_norm": NaN, + "learning_rate": 1.294639855263957e-07, + "loss": 0.0, + "step": 63478 + }, + { + "epoch": 5.923206121115984, + "grad_norm": NaN, + "learning_rate": 1.2914998830000912e-07, + "loss": 0.0, + "step": 63479 + }, + { + "epoch": 5.923299430810861, + "grad_norm": NaN, + "learning_rate": 1.2883637215079235e-07, + "loss": 0.0, + "step": 63480 + }, + { + "epoch": 5.923392740505738, + "grad_norm": NaN, + "learning_rate": 1.2852313707954477e-07, + "loss": 0.0, + "step": 63481 + }, + { + "epoch": 5.923486050200616, + "grad_norm": NaN, + "learning_rate": 1.2821028308708237e-07, + "loss": 0.0, + "step": 63482 + }, + { + "epoch": 5.923579359895493, + "grad_norm": NaN, + "learning_rate": 1.2789781017417123e-07, + "loss": 0.0, + "step": 63483 + }, + { + "epoch": 5.9236726695903705, + "grad_norm": NaN, + "learning_rate": 1.2758571834162734e-07, + "loss": 0.0, + "step": 63484 + }, + { + "epoch": 5.923765979285248, + "grad_norm": NaN, + "learning_rate": 1.2727400759023342e-07, + "loss": 0.0, + "step": 63485 + }, + { + "epoch": 5.923859288980125, + "grad_norm": NaN, + "learning_rate": 1.2696267792078883e-07, + "loss": 0.0, + "step": 63486 + }, + { + "epoch": 5.923952598675003, + "grad_norm": NaN, + "learning_rate": 1.266517293340763e-07, + "loss": 0.0, + "step": 63487 + }, + { + "epoch": 5.924045908369879, + "grad_norm": NaN, + "learning_rate": 1.263411618309118e-07, + "loss": 0.0, + "step": 63488 + }, + { + "epoch": 5.924139218064757, + "grad_norm": NaN, + "learning_rate": 1.2603097541204478e-07, + "loss": 0.0, + "step": 63489 + }, + { + "epoch": 5.924232527759634, + "grad_norm": NaN, + "learning_rate": 1.257211700782912e-07, + "loss": 0.0, + "step": 63490 + }, + { + "epoch": 5.924325837454512, + "grad_norm": NaN, + "learning_rate": 1.254117458304338e-07, + "loss": 0.0, + "step": 63491 + }, + { + "epoch": 5.924419147149389, + "grad_norm": NaN, + "learning_rate": 1.2510270266927192e-07, + "loss": 0.0, + "step": 63492 + }, + { + "epoch": 5.924512456844266, + "grad_norm": NaN, + "learning_rate": 1.24794040595555e-07, + "loss": 0.0, + "step": 63493 + }, + { + "epoch": 5.924605766539143, + "grad_norm": NaN, + "learning_rate": 1.2448575961009898e-07, + "loss": 0.0, + "step": 63494 + }, + { + "epoch": 5.92469907623402, + "grad_norm": NaN, + "learning_rate": 1.2417785971368665e-07, + "loss": 0.0, + "step": 63495 + }, + { + "epoch": 5.924792385928898, + "grad_norm": NaN, + "learning_rate": 1.2387034090708404e-07, + "loss": 0.0, + "step": 63496 + }, + { + "epoch": 5.924885695623775, + "grad_norm": NaN, + "learning_rate": 1.2356320319109048e-07, + "loss": 0.0, + "step": 63497 + }, + { + "epoch": 5.924979005318653, + "grad_norm": NaN, + "learning_rate": 1.2325644656647206e-07, + "loss": 0.0, + "step": 63498 + }, + { + "epoch": 5.92507231501353, + "grad_norm": NaN, + "learning_rate": 1.2295007103401144e-07, + "loss": 0.0, + "step": 63499 + }, + { + "epoch": 5.9251656247084075, + "grad_norm": NaN, + "learning_rate": 1.2264407659450803e-07, + "loss": 0.0, + "step": 63500 + }, + { + "epoch": 5.925258934403285, + "grad_norm": NaN, + "learning_rate": 1.2233846324871122e-07, + "loss": 0.0, + "step": 63501 + }, + { + "epoch": 5.925352244098161, + "grad_norm": NaN, + "learning_rate": 1.2203323099742034e-07, + "loss": 0.0, + "step": 63502 + }, + { + "epoch": 5.925445553793039, + "grad_norm": NaN, + "learning_rate": 1.2172837984138484e-07, + "loss": 0.0, + "step": 63503 + }, + { + "epoch": 5.925538863487916, + "grad_norm": NaN, + "learning_rate": 1.2142390978140404e-07, + "loss": 0.0, + "step": 63504 + }, + { + "epoch": 5.925632173182794, + "grad_norm": NaN, + "learning_rate": 1.2111982081826066e-07, + "loss": 0.0, + "step": 63505 + }, + { + "epoch": 5.925725482877671, + "grad_norm": NaN, + "learning_rate": 1.2081611295268744e-07, + "loss": 0.0, + "step": 63506 + }, + { + "epoch": 5.9258187925725485, + "grad_norm": NaN, + "learning_rate": 1.2051278618550043e-07, + "loss": 0.0, + "step": 63507 + }, + { + "epoch": 5.925912102267426, + "grad_norm": NaN, + "learning_rate": 1.2020984051744898e-07, + "loss": 0.0, + "step": 63508 + }, + { + "epoch": 5.9260054119623025, + "grad_norm": NaN, + "learning_rate": 1.1990727594929916e-07, + "loss": 0.0, + "step": 63509 + }, + { + "epoch": 5.92609872165718, + "grad_norm": NaN, + "learning_rate": 1.196050924818337e-07, + "loss": 0.0, + "step": 63510 + }, + { + "epoch": 5.926192031352057, + "grad_norm": NaN, + "learning_rate": 1.1930329011580196e-07, + "loss": 0.0, + "step": 63511 + }, + { + "epoch": 5.926285341046935, + "grad_norm": NaN, + "learning_rate": 1.1900186885200336e-07, + "loss": 0.0, + "step": 63512 + }, + { + "epoch": 5.926378650741812, + "grad_norm": NaN, + "learning_rate": 1.187008286911706e-07, + "loss": 0.0, + "step": 63513 + }, + { + "epoch": 5.92647196043669, + "grad_norm": NaN, + "learning_rate": 1.1840016963408638e-07, + "loss": 0.0, + "step": 63514 + }, + { + "epoch": 5.926565270131567, + "grad_norm": NaN, + "learning_rate": 1.1809989168151679e-07, + "loss": 0.0, + "step": 63515 + }, + { + "epoch": 5.926658579826444, + "grad_norm": NaN, + "learning_rate": 1.1779999483422786e-07, + "loss": 0.0, + "step": 63516 + }, + { + "epoch": 5.926751889521321, + "grad_norm": NaN, + "learning_rate": 1.1750047909295235e-07, + "loss": 0.0, + "step": 63517 + }, + { + "epoch": 5.926845199216198, + "grad_norm": NaN, + "learning_rate": 1.1720134445848961e-07, + "loss": 0.0, + "step": 63518 + }, + { + "epoch": 5.926938508911076, + "grad_norm": NaN, + "learning_rate": 1.1690259093158905e-07, + "loss": 0.0, + "step": 63519 + }, + { + "epoch": 5.927031818605953, + "grad_norm": NaN, + "learning_rate": 1.1660421851300006e-07, + "loss": 0.0, + "step": 63520 + }, + { + "epoch": 5.927125128300831, + "grad_norm": NaN, + "learning_rate": 1.163062272034887e-07, + "loss": 0.0, + "step": 63521 + }, + { + "epoch": 5.927218437995708, + "grad_norm": NaN, + "learning_rate": 1.1600861700380437e-07, + "loss": 0.0, + "step": 63522 + }, + { + "epoch": 5.9273117476905846, + "grad_norm": NaN, + "learning_rate": 1.1571138791472978e-07, + "loss": 0.0, + "step": 63523 + }, + { + "epoch": 5.927405057385462, + "grad_norm": NaN, + "learning_rate": 1.1541453993698102e-07, + "loss": 0.0, + "step": 63524 + }, + { + "epoch": 5.927498367080339, + "grad_norm": NaN, + "learning_rate": 1.1511807307134079e-07, + "loss": 0.0, + "step": 63525 + }, + { + "epoch": 5.927591676775217, + "grad_norm": NaN, + "learning_rate": 1.1482198731857517e-07, + "loss": 0.0, + "step": 63526 + }, + { + "epoch": 5.927684986470094, + "grad_norm": NaN, + "learning_rate": 1.1452628267940022e-07, + "loss": 0.0, + "step": 63527 + }, + { + "epoch": 5.927778296164972, + "grad_norm": NaN, + "learning_rate": 1.1423095915458203e-07, + "loss": 0.0, + "step": 63528 + }, + { + "epoch": 5.927871605859849, + "grad_norm": NaN, + "learning_rate": 1.1393601674488661e-07, + "loss": 0.0, + "step": 63529 + }, + { + "epoch": 5.9279649155547265, + "grad_norm": NaN, + "learning_rate": 1.1364145545104675e-07, + "loss": 0.0, + "step": 63530 + }, + { + "epoch": 5.928058225249604, + "grad_norm": NaN, + "learning_rate": 1.1334727527381182e-07, + "loss": 0.0, + "step": 63531 + }, + { + "epoch": 5.92815153494448, + "grad_norm": NaN, + "learning_rate": 1.1305347621393124e-07, + "loss": 0.0, + "step": 63532 + }, + { + "epoch": 5.928244844639358, + "grad_norm": NaN, + "learning_rate": 1.1276005827217105e-07, + "loss": 0.0, + "step": 63533 + }, + { + "epoch": 5.928338154334235, + "grad_norm": NaN, + "learning_rate": 1.1246702144924736e-07, + "loss": 0.0, + "step": 63534 + }, + { + "epoch": 5.928431464029113, + "grad_norm": NaN, + "learning_rate": 1.1217436574594286e-07, + "loss": 0.0, + "step": 63535 + }, + { + "epoch": 5.92852477372399, + "grad_norm": NaN, + "learning_rate": 1.1188209116295698e-07, + "loss": 0.0, + "step": 63536 + }, + { + "epoch": 5.9286180834188675, + "grad_norm": NaN, + "learning_rate": 1.1159019770105581e-07, + "loss": 0.0, + "step": 63537 + }, + { + "epoch": 5.928711393113744, + "grad_norm": NaN, + "learning_rate": 1.1129868536098873e-07, + "loss": 0.0, + "step": 63538 + }, + { + "epoch": 5.9288047028086215, + "grad_norm": NaN, + "learning_rate": 1.110075541434885e-07, + "loss": 0.0, + "step": 63539 + }, + { + "epoch": 5.928898012503499, + "grad_norm": NaN, + "learning_rate": 1.1071680404930449e-07, + "loss": 0.0, + "step": 63540 + }, + { + "epoch": 5.928991322198376, + "grad_norm": NaN, + "learning_rate": 1.1042643507915284e-07, + "loss": 0.0, + "step": 63541 + }, + { + "epoch": 5.929084631893254, + "grad_norm": NaN, + "learning_rate": 1.101364472337829e-07, + "loss": 0.0, + "step": 63542 + }, + { + "epoch": 5.929177941588131, + "grad_norm": NaN, + "learning_rate": 1.098468405139441e-07, + "loss": 0.0, + "step": 63543 + }, + { + "epoch": 5.929271251283009, + "grad_norm": NaN, + "learning_rate": 1.0955761492036919e-07, + "loss": 0.0, + "step": 63544 + }, + { + "epoch": 5.929364560977886, + "grad_norm": NaN, + "learning_rate": 1.0926877045377425e-07, + "loss": 0.0, + "step": 63545 + }, + { + "epoch": 5.9294578706727625, + "grad_norm": NaN, + "learning_rate": 1.0898030711492534e-07, + "loss": 0.0, + "step": 63546 + }, + { + "epoch": 5.92955118036764, + "grad_norm": NaN, + "learning_rate": 1.0869222490452189e-07, + "loss": 0.0, + "step": 63547 + }, + { + "epoch": 5.929644490062517, + "grad_norm": NaN, + "learning_rate": 1.0840452382331333e-07, + "loss": 0.0, + "step": 63548 + }, + { + "epoch": 5.929737799757395, + "grad_norm": NaN, + "learning_rate": 1.0811720387204903e-07, + "loss": 0.0, + "step": 63549 + }, + { + "epoch": 5.929831109452272, + "grad_norm": NaN, + "learning_rate": 1.0783026505142844e-07, + "loss": 0.0, + "step": 63550 + }, + { + "epoch": 5.92992441914715, + "grad_norm": NaN, + "learning_rate": 1.0754370736218432e-07, + "loss": 0.0, + "step": 63551 + }, + { + "epoch": 5.930017728842027, + "grad_norm": NaN, + "learning_rate": 1.0725753080506605e-07, + "loss": 0.0, + "step": 63552 + }, + { + "epoch": 5.930111038536904, + "grad_norm": NaN, + "learning_rate": 1.0697173538078974e-07, + "loss": 0.0, + "step": 63553 + }, + { + "epoch": 5.930204348231781, + "grad_norm": NaN, + "learning_rate": 1.0668632109008813e-07, + "loss": 0.0, + "step": 63554 + }, + { + "epoch": 5.930297657926658, + "grad_norm": NaN, + "learning_rate": 1.0640128793367731e-07, + "loss": 0.0, + "step": 63555 + }, + { + "epoch": 5.930390967621536, + "grad_norm": NaN, + "learning_rate": 1.0611663591229003e-07, + "loss": 0.0, + "step": 63556 + }, + { + "epoch": 5.930484277316413, + "grad_norm": NaN, + "learning_rate": 1.0583236502664238e-07, + "loss": 0.0, + "step": 63557 + }, + { + "epoch": 5.930577587011291, + "grad_norm": NaN, + "learning_rate": 1.0554847527745048e-07, + "loss": 0.0, + "step": 63558 + }, + { + "epoch": 5.930670896706168, + "grad_norm": NaN, + "learning_rate": 1.0526496666546369e-07, + "loss": 0.0, + "step": 63559 + }, + { + "epoch": 5.9307642064010455, + "grad_norm": NaN, + "learning_rate": 1.0498183919138147e-07, + "loss": 0.0, + "step": 63560 + }, + { + "epoch": 5.930857516095922, + "grad_norm": NaN, + "learning_rate": 1.0469909285591993e-07, + "loss": 0.0, + "step": 63561 + }, + { + "epoch": 5.9309508257907995, + "grad_norm": NaN, + "learning_rate": 1.0441672765981179e-07, + "loss": 0.0, + "step": 63562 + }, + { + "epoch": 5.931044135485677, + "grad_norm": NaN, + "learning_rate": 1.0413474360377316e-07, + "loss": 0.0, + "step": 63563 + }, + { + "epoch": 5.931137445180554, + "grad_norm": NaN, + "learning_rate": 1.0385314068852013e-07, + "loss": 0.0, + "step": 63564 + }, + { + "epoch": 5.931230754875432, + "grad_norm": NaN, + "learning_rate": 1.0357191891475214e-07, + "loss": 0.0, + "step": 63565 + }, + { + "epoch": 5.931324064570309, + "grad_norm": NaN, + "learning_rate": 1.0329107828321858e-07, + "loss": 0.0, + "step": 63566 + }, + { + "epoch": 5.931417374265186, + "grad_norm": NaN, + "learning_rate": 1.0301061879460227e-07, + "loss": 0.0, + "step": 63567 + }, + { + "epoch": 5.931510683960063, + "grad_norm": NaN, + "learning_rate": 1.0273054044961926e-07, + "loss": 0.0, + "step": 63568 + }, + { + "epoch": 5.9316039936549405, + "grad_norm": NaN, + "learning_rate": 1.0245084324900232e-07, + "loss": 0.0, + "step": 63569 + }, + { + "epoch": 5.931697303349818, + "grad_norm": NaN, + "learning_rate": 1.0217152719343424e-07, + "loss": 0.0, + "step": 63570 + }, + { + "epoch": 5.931790613044695, + "grad_norm": NaN, + "learning_rate": 1.0189259228364776e-07, + "loss": 0.0, + "step": 63571 + }, + { + "epoch": 5.931883922739573, + "grad_norm": NaN, + "learning_rate": 1.0161403852035899e-07, + "loss": 0.0, + "step": 63572 + }, + { + "epoch": 5.93197723243445, + "grad_norm": NaN, + "learning_rate": 1.0133586590425069e-07, + "loss": 0.0, + "step": 63573 + }, + { + "epoch": 5.932070542129328, + "grad_norm": NaN, + "learning_rate": 1.0105807443603898e-07, + "loss": 0.0, + "step": 63574 + }, + { + "epoch": 5.932163851824205, + "grad_norm": NaN, + "learning_rate": 1.0078066411643992e-07, + "loss": 0.0, + "step": 63575 + }, + { + "epoch": 5.932257161519082, + "grad_norm": NaN, + "learning_rate": 1.0050363494615299e-07, + "loss": 0.0, + "step": 63576 + }, + { + "epoch": 5.932350471213959, + "grad_norm": NaN, + "learning_rate": 1.0022698692587761e-07, + "loss": 0.0, + "step": 63577 + }, + { + "epoch": 5.932443780908836, + "grad_norm": NaN, + "learning_rate": 9.995072005631322e-08, + "loss": 0.0, + "step": 63578 + }, + { + "epoch": 5.932537090603714, + "grad_norm": NaN, + "learning_rate": 9.967483433819257e-08, + "loss": 0.0, + "step": 63579 + }, + { + "epoch": 5.932630400298591, + "grad_norm": NaN, + "learning_rate": 9.939932977218179e-08, + "loss": 0.0, + "step": 63580 + }, + { + "epoch": 5.932723709993469, + "grad_norm": NaN, + "learning_rate": 9.912420635898033e-08, + "loss": 0.0, + "step": 63581 + }, + { + "epoch": 5.932817019688345, + "grad_norm": NaN, + "learning_rate": 9.884946409932093e-08, + "loss": 0.0, + "step": 63582 + }, + { + "epoch": 5.932910329383223, + "grad_norm": NaN, + "learning_rate": 9.857510299388638e-08, + "loss": 0.0, + "step": 63583 + }, + { + "epoch": 5.9330036390781, + "grad_norm": NaN, + "learning_rate": 9.830112304335947e-08, + "loss": 0.0, + "step": 63584 + }, + { + "epoch": 5.933096948772977, + "grad_norm": NaN, + "learning_rate": 9.802752424845627e-08, + "loss": 0.0, + "step": 63585 + }, + { + "epoch": 5.933190258467855, + "grad_norm": NaN, + "learning_rate": 9.775430660985961e-08, + "loss": 0.0, + "step": 63586 + }, + { + "epoch": 5.933283568162732, + "grad_norm": NaN, + "learning_rate": 9.748147012826891e-08, + "loss": 0.0, + "step": 63587 + }, + { + "epoch": 5.93337687785761, + "grad_norm": NaN, + "learning_rate": 9.720901480438359e-08, + "loss": 0.0, + "step": 63588 + }, + { + "epoch": 5.933470187552487, + "grad_norm": NaN, + "learning_rate": 9.693694063888646e-08, + "loss": 0.0, + "step": 63589 + }, + { + "epoch": 5.933563497247364, + "grad_norm": NaN, + "learning_rate": 9.666524763247696e-08, + "loss": 0.0, + "step": 63590 + }, + { + "epoch": 5.933656806942241, + "grad_norm": NaN, + "learning_rate": 9.639393578583787e-08, + "loss": 0.0, + "step": 63591 + }, + { + "epoch": 5.9337501166371185, + "grad_norm": NaN, + "learning_rate": 9.612300509966863e-08, + "loss": 0.0, + "step": 63592 + }, + { + "epoch": 5.933843426331996, + "grad_norm": NaN, + "learning_rate": 9.58524555746687e-08, + "loss": 0.0, + "step": 63593 + }, + { + "epoch": 5.933936736026873, + "grad_norm": NaN, + "learning_rate": 9.558228721148753e-08, + "loss": 0.0, + "step": 63594 + }, + { + "epoch": 5.934030045721751, + "grad_norm": NaN, + "learning_rate": 9.531250001084123e-08, + "loss": 0.0, + "step": 63595 + }, + { + "epoch": 5.934123355416628, + "grad_norm": NaN, + "learning_rate": 9.504309397341259e-08, + "loss": 0.0, + "step": 63596 + }, + { + "epoch": 5.934216665111505, + "grad_norm": NaN, + "learning_rate": 9.477406909988439e-08, + "loss": 0.0, + "step": 63597 + }, + { + "epoch": 5.934309974806382, + "grad_norm": NaN, + "learning_rate": 9.450542539095607e-08, + "loss": 0.0, + "step": 63598 + }, + { + "epoch": 5.9344032845012595, + "grad_norm": NaN, + "learning_rate": 9.423716284727711e-08, + "loss": 0.0, + "step": 63599 + }, + { + "epoch": 5.934496594196137, + "grad_norm": NaN, + "learning_rate": 9.396928146954697e-08, + "loss": 0.0, + "step": 63600 + }, + { + "epoch": 5.934589903891014, + "grad_norm": NaN, + "learning_rate": 9.37017812584484e-08, + "loss": 0.0, + "step": 63601 + }, + { + "epoch": 5.934683213585892, + "grad_norm": NaN, + "learning_rate": 9.343466221466423e-08, + "loss": 0.0, + "step": 63602 + }, + { + "epoch": 5.934776523280769, + "grad_norm": NaN, + "learning_rate": 9.31679243388772e-08, + "loss": 0.0, + "step": 63603 + }, + { + "epoch": 5.934869832975647, + "grad_norm": NaN, + "learning_rate": 9.290156763175349e-08, + "loss": 0.0, + "step": 63604 + }, + { + "epoch": 5.934963142670523, + "grad_norm": NaN, + "learning_rate": 9.26355920939592e-08, + "loss": 0.0, + "step": 63605 + }, + { + "epoch": 5.935056452365401, + "grad_norm": NaN, + "learning_rate": 9.236999772621045e-08, + "loss": 0.0, + "step": 63606 + }, + { + "epoch": 5.935149762060278, + "grad_norm": NaN, + "learning_rate": 9.210478452914005e-08, + "loss": 0.0, + "step": 63607 + }, + { + "epoch": 5.935243071755155, + "grad_norm": NaN, + "learning_rate": 9.183995250344744e-08, + "loss": 0.0, + "step": 63608 + }, + { + "epoch": 5.935336381450033, + "grad_norm": NaN, + "learning_rate": 9.157550164981542e-08, + "loss": 0.0, + "step": 63609 + }, + { + "epoch": 5.93542969114491, + "grad_norm": NaN, + "learning_rate": 9.13114319688768e-08, + "loss": 0.0, + "step": 63610 + }, + { + "epoch": 5.935523000839787, + "grad_norm": NaN, + "learning_rate": 9.10477434613477e-08, + "loss": 0.0, + "step": 63611 + }, + { + "epoch": 5.935616310534664, + "grad_norm": NaN, + "learning_rate": 9.07844361278609e-08, + "loss": 0.0, + "step": 63612 + }, + { + "epoch": 5.935709620229542, + "grad_norm": NaN, + "learning_rate": 9.052150996911589e-08, + "loss": 0.0, + "step": 63613 + }, + { + "epoch": 5.935802929924419, + "grad_norm": NaN, + "learning_rate": 9.025896498576212e-08, + "loss": 0.0, + "step": 63614 + }, + { + "epoch": 5.9358962396192965, + "grad_norm": NaN, + "learning_rate": 8.999680117846575e-08, + "loss": 0.0, + "step": 63615 + }, + { + "epoch": 5.935989549314174, + "grad_norm": NaN, + "learning_rate": 8.973501854790954e-08, + "loss": 0.0, + "step": 63616 + }, + { + "epoch": 5.936082859009051, + "grad_norm": NaN, + "learning_rate": 8.947361709475964e-08, + "loss": 0.0, + "step": 63617 + }, + { + "epoch": 5.936176168703929, + "grad_norm": NaN, + "learning_rate": 8.921259681964887e-08, + "loss": 0.0, + "step": 63618 + }, + { + "epoch": 5.936269478398805, + "grad_norm": NaN, + "learning_rate": 8.895195772327667e-08, + "loss": 0.0, + "step": 63619 + }, + { + "epoch": 5.936362788093683, + "grad_norm": NaN, + "learning_rate": 8.869169980629253e-08, + "loss": 0.0, + "step": 63620 + }, + { + "epoch": 5.93645609778856, + "grad_norm": NaN, + "learning_rate": 8.843182306936259e-08, + "loss": 0.0, + "step": 63621 + }, + { + "epoch": 5.9365494074834375, + "grad_norm": NaN, + "learning_rate": 8.81723275131363e-08, + "loss": 0.0, + "step": 63622 + }, + { + "epoch": 5.936642717178315, + "grad_norm": NaN, + "learning_rate": 8.791321313827982e-08, + "loss": 0.0, + "step": 63623 + }, + { + "epoch": 5.936736026873192, + "grad_norm": NaN, + "learning_rate": 8.765447994545926e-08, + "loss": 0.0, + "step": 63624 + }, + { + "epoch": 5.93682933656807, + "grad_norm": NaN, + "learning_rate": 8.739612793532414e-08, + "loss": 0.0, + "step": 63625 + }, + { + "epoch": 5.936922646262946, + "grad_norm": NaN, + "learning_rate": 8.713815710854055e-08, + "loss": 0.0, + "step": 63626 + }, + { + "epoch": 5.937015955957824, + "grad_norm": NaN, + "learning_rate": 8.688056746574134e-08, + "loss": 0.0, + "step": 63627 + }, + { + "epoch": 5.937109265652701, + "grad_norm": NaN, + "learning_rate": 8.662335900760931e-08, + "loss": 0.0, + "step": 63628 + }, + { + "epoch": 5.937202575347579, + "grad_norm": NaN, + "learning_rate": 8.636653173477726e-08, + "loss": 0.0, + "step": 63629 + }, + { + "epoch": 5.937295885042456, + "grad_norm": NaN, + "learning_rate": 8.611008564791133e-08, + "loss": 0.0, + "step": 63630 + }, + { + "epoch": 5.937389194737333, + "grad_norm": NaN, + "learning_rate": 8.585402074766102e-08, + "loss": 0.0, + "step": 63631 + }, + { + "epoch": 5.937482504432211, + "grad_norm": NaN, + "learning_rate": 8.559833703467578e-08, + "loss": 0.0, + "step": 63632 + }, + { + "epoch": 5.937575814127088, + "grad_norm": NaN, + "learning_rate": 8.534303450960511e-08, + "loss": 0.0, + "step": 63633 + }, + { + "epoch": 5.937669123821965, + "grad_norm": NaN, + "learning_rate": 8.508811317309849e-08, + "loss": 0.0, + "step": 63634 + }, + { + "epoch": 5.937762433516842, + "grad_norm": NaN, + "learning_rate": 8.48335730258054e-08, + "loss": 0.0, + "step": 63635 + }, + { + "epoch": 5.93785574321172, + "grad_norm": NaN, + "learning_rate": 8.457941406837532e-08, + "loss": 0.0, + "step": 63636 + }, + { + "epoch": 5.937949052906597, + "grad_norm": NaN, + "learning_rate": 8.432563630144107e-08, + "loss": 0.0, + "step": 63637 + }, + { + "epoch": 5.9380423626014744, + "grad_norm": NaN, + "learning_rate": 8.407223972566879e-08, + "loss": 0.0, + "step": 63638 + }, + { + "epoch": 5.938135672296352, + "grad_norm": NaN, + "learning_rate": 8.38192243416913e-08, + "loss": 0.0, + "step": 63639 + }, + { + "epoch": 5.938228981991228, + "grad_norm": NaN, + "learning_rate": 8.35665901501581e-08, + "loss": 0.0, + "step": 63640 + }, + { + "epoch": 5.938322291686106, + "grad_norm": NaN, + "learning_rate": 8.331433715170199e-08, + "loss": 0.0, + "step": 63641 + }, + { + "epoch": 5.938415601380983, + "grad_norm": NaN, + "learning_rate": 8.306246534697247e-08, + "loss": 0.0, + "step": 63642 + }, + { + "epoch": 5.938508911075861, + "grad_norm": NaN, + "learning_rate": 8.281097473658571e-08, + "loss": 0.0, + "step": 63643 + }, + { + "epoch": 5.938602220770738, + "grad_norm": NaN, + "learning_rate": 8.255986532122449e-08, + "loss": 0.0, + "step": 63644 + }, + { + "epoch": 5.9386955304656155, + "grad_norm": NaN, + "learning_rate": 8.2309137101505e-08, + "loss": 0.0, + "step": 63645 + }, + { + "epoch": 5.938788840160493, + "grad_norm": NaN, + "learning_rate": 8.205879007804339e-08, + "loss": 0.0, + "step": 63646 + }, + { + "epoch": 5.93888214985537, + "grad_norm": NaN, + "learning_rate": 8.180882425152246e-08, + "loss": 0.0, + "step": 63647 + }, + { + "epoch": 5.938975459550248, + "grad_norm": NaN, + "learning_rate": 8.155923962254174e-08, + "loss": 0.0, + "step": 63648 + }, + { + "epoch": 5.939068769245124, + "grad_norm": NaN, + "learning_rate": 8.131003619173404e-08, + "loss": 0.0, + "step": 63649 + }, + { + "epoch": 5.939162078940002, + "grad_norm": NaN, + "learning_rate": 8.106121395976551e-08, + "loss": 0.0, + "step": 63650 + }, + { + "epoch": 5.939255388634879, + "grad_norm": NaN, + "learning_rate": 8.081277292723564e-08, + "loss": 0.0, + "step": 63651 + }, + { + "epoch": 5.9393486983297565, + "grad_norm": NaN, + "learning_rate": 8.05647130948106e-08, + "loss": 0.0, + "step": 63652 + }, + { + "epoch": 5.939442008024634, + "grad_norm": NaN, + "learning_rate": 8.031703446307325e-08, + "loss": 0.0, + "step": 63653 + }, + { + "epoch": 5.939535317719511, + "grad_norm": NaN, + "learning_rate": 8.00697370326897e-08, + "loss": 0.0, + "step": 63654 + }, + { + "epoch": 5.939628627414388, + "grad_norm": NaN, + "learning_rate": 7.982282080429281e-08, + "loss": 0.0, + "step": 63655 + }, + { + "epoch": 5.939721937109265, + "grad_norm": NaN, + "learning_rate": 7.957628577848208e-08, + "loss": 0.0, + "step": 63656 + }, + { + "epoch": 5.939815246804143, + "grad_norm": NaN, + "learning_rate": 7.933013195590698e-08, + "loss": 0.0, + "step": 63657 + }, + { + "epoch": 5.93990855649902, + "grad_norm": NaN, + "learning_rate": 7.908435933716706e-08, + "loss": 0.0, + "step": 63658 + }, + { + "epoch": 5.940001866193898, + "grad_norm": NaN, + "learning_rate": 7.883896792292843e-08, + "loss": 0.0, + "step": 63659 + }, + { + "epoch": 5.940095175888775, + "grad_norm": NaN, + "learning_rate": 7.859395771379063e-08, + "loss": 0.0, + "step": 63660 + }, + { + "epoch": 5.940188485583652, + "grad_norm": NaN, + "learning_rate": 7.834932871036981e-08, + "loss": 0.0, + "step": 63661 + }, + { + "epoch": 5.94028179527853, + "grad_norm": NaN, + "learning_rate": 7.81050809132988e-08, + "loss": 0.0, + "step": 63662 + }, + { + "epoch": 5.940375104973406, + "grad_norm": NaN, + "learning_rate": 7.78612143231938e-08, + "loss": 0.0, + "step": 63663 + }, + { + "epoch": 5.940468414668284, + "grad_norm": NaN, + "learning_rate": 7.761772894067098e-08, + "loss": 0.0, + "step": 63664 + }, + { + "epoch": 5.940561724363161, + "grad_norm": NaN, + "learning_rate": 7.737462476636313e-08, + "loss": 0.0, + "step": 63665 + }, + { + "epoch": 5.940655034058039, + "grad_norm": NaN, + "learning_rate": 7.713190180088646e-08, + "loss": 0.0, + "step": 63666 + }, + { + "epoch": 5.940748343752916, + "grad_norm": NaN, + "learning_rate": 7.688956004485713e-08, + "loss": 0.0, + "step": 63667 + }, + { + "epoch": 5.9408416534477935, + "grad_norm": NaN, + "learning_rate": 7.664759949887467e-08, + "loss": 0.0, + "step": 63668 + }, + { + "epoch": 5.940934963142671, + "grad_norm": NaN, + "learning_rate": 7.640602016357189e-08, + "loss": 0.0, + "step": 63669 + }, + { + "epoch": 5.941028272837547, + "grad_norm": NaN, + "learning_rate": 7.616482203954832e-08, + "loss": 0.0, + "step": 63670 + }, + { + "epoch": 5.941121582532425, + "grad_norm": NaN, + "learning_rate": 7.59240051274368e-08, + "loss": 0.0, + "step": 63671 + }, + { + "epoch": 5.941214892227302, + "grad_norm": NaN, + "learning_rate": 7.568356942783682e-08, + "loss": 0.0, + "step": 63672 + }, + { + "epoch": 5.94130820192218, + "grad_norm": NaN, + "learning_rate": 7.544351494138123e-08, + "loss": 0.0, + "step": 63673 + }, + { + "epoch": 5.941401511617057, + "grad_norm": NaN, + "learning_rate": 7.520384166863625e-08, + "loss": 0.0, + "step": 63674 + }, + { + "epoch": 5.9414948213119345, + "grad_norm": NaN, + "learning_rate": 7.496454961025133e-08, + "loss": 0.0, + "step": 63675 + }, + { + "epoch": 5.941588131006812, + "grad_norm": NaN, + "learning_rate": 7.472563876680937e-08, + "loss": 0.0, + "step": 63676 + }, + { + "epoch": 5.941681440701689, + "grad_norm": NaN, + "learning_rate": 7.448710913894318e-08, + "loss": 0.0, + "step": 63677 + }, + { + "epoch": 5.941774750396566, + "grad_norm": NaN, + "learning_rate": 7.424896072723563e-08, + "loss": 0.0, + "step": 63678 + }, + { + "epoch": 5.941868060091443, + "grad_norm": NaN, + "learning_rate": 7.40111935323029e-08, + "loss": 0.0, + "step": 63679 + }, + { + "epoch": 5.941961369786321, + "grad_norm": NaN, + "learning_rate": 7.377380755474449e-08, + "loss": 0.0, + "step": 63680 + }, + { + "epoch": 5.942054679481198, + "grad_norm": NaN, + "learning_rate": 7.353680279515995e-08, + "loss": 0.0, + "step": 63681 + }, + { + "epoch": 5.942147989176076, + "grad_norm": NaN, + "learning_rate": 7.330017925418208e-08, + "loss": 0.0, + "step": 63682 + }, + { + "epoch": 5.942241298870953, + "grad_norm": NaN, + "learning_rate": 7.30639369323771e-08, + "loss": 0.0, + "step": 63683 + }, + { + "epoch": 5.9423346085658295, + "grad_norm": NaN, + "learning_rate": 7.282807583036121e-08, + "loss": 0.0, + "step": 63684 + }, + { + "epoch": 5.942427918260707, + "grad_norm": NaN, + "learning_rate": 7.25925959487339e-08, + "loss": 0.0, + "step": 63685 + }, + { + "epoch": 5.942521227955584, + "grad_norm": NaN, + "learning_rate": 7.23574972880947e-08, + "loss": 0.0, + "step": 63686 + }, + { + "epoch": 5.942614537650462, + "grad_norm": NaN, + "learning_rate": 7.212277984902647e-08, + "loss": 0.0, + "step": 63687 + }, + { + "epoch": 5.942707847345339, + "grad_norm": NaN, + "learning_rate": 7.188844363214541e-08, + "loss": 0.0, + "step": 63688 + }, + { + "epoch": 5.942801157040217, + "grad_norm": NaN, + "learning_rate": 7.165448863803435e-08, + "loss": 0.0, + "step": 63689 + }, + { + "epoch": 5.942894466735094, + "grad_norm": NaN, + "learning_rate": 7.14209148673095e-08, + "loss": 0.0, + "step": 63690 + }, + { + "epoch": 5.9429877764299714, + "grad_norm": NaN, + "learning_rate": 7.118772232053704e-08, + "loss": 0.0, + "step": 63691 + }, + { + "epoch": 5.943081086124849, + "grad_norm": NaN, + "learning_rate": 7.095491099831652e-08, + "loss": 0.0, + "step": 63692 + }, + { + "epoch": 5.943174395819725, + "grad_norm": NaN, + "learning_rate": 7.072248090126408e-08, + "loss": 0.0, + "step": 63693 + }, + { + "epoch": 5.943267705514603, + "grad_norm": NaN, + "learning_rate": 7.049043202994597e-08, + "loss": 0.0, + "step": 63694 + }, + { + "epoch": 5.94336101520948, + "grad_norm": NaN, + "learning_rate": 7.025876438494504e-08, + "loss": 0.0, + "step": 63695 + }, + { + "epoch": 5.943454324904358, + "grad_norm": NaN, + "learning_rate": 7.002747796687746e-08, + "loss": 0.0, + "step": 63696 + }, + { + "epoch": 5.943547634599235, + "grad_norm": NaN, + "learning_rate": 6.979657277630945e-08, + "loss": 0.0, + "step": 63697 + }, + { + "epoch": 5.9436409442941125, + "grad_norm": NaN, + "learning_rate": 6.956604881384054e-08, + "loss": 0.0, + "step": 63698 + }, + { + "epoch": 5.943734253988989, + "grad_norm": NaN, + "learning_rate": 6.933590608005357e-08, + "loss": 0.0, + "step": 63699 + }, + { + "epoch": 5.943827563683866, + "grad_norm": NaN, + "learning_rate": 6.910614457551478e-08, + "loss": 0.0, + "step": 63700 + }, + { + "epoch": 5.943920873378744, + "grad_norm": NaN, + "learning_rate": 6.887676430085698e-08, + "loss": 0.0, + "step": 63701 + }, + { + "epoch": 5.944014183073621, + "grad_norm": NaN, + "learning_rate": 6.864776525661308e-08, + "loss": 0.0, + "step": 63702 + }, + { + "epoch": 5.944107492768499, + "grad_norm": NaN, + "learning_rate": 6.841914744338261e-08, + "loss": 0.0, + "step": 63703 + }, + { + "epoch": 5.944200802463376, + "grad_norm": NaN, + "learning_rate": 6.81909108617651e-08, + "loss": 0.0, + "step": 63704 + }, + { + "epoch": 5.9442941121582535, + "grad_norm": NaN, + "learning_rate": 6.796305551231008e-08, + "loss": 0.0, + "step": 63705 + }, + { + "epoch": 5.944387421853131, + "grad_norm": NaN, + "learning_rate": 6.773558139561708e-08, + "loss": 0.0, + "step": 63706 + }, + { + "epoch": 5.9444807315480075, + "grad_norm": NaN, + "learning_rate": 6.750848851225233e-08, + "loss": 0.0, + "step": 63707 + }, + { + "epoch": 5.944574041242885, + "grad_norm": NaN, + "learning_rate": 6.728177686281533e-08, + "loss": 0.0, + "step": 63708 + }, + { + "epoch": 5.944667350937762, + "grad_norm": NaN, + "learning_rate": 6.705544644785565e-08, + "loss": 0.0, + "step": 63709 + }, + { + "epoch": 5.94476066063264, + "grad_norm": NaN, + "learning_rate": 6.682949726795617e-08, + "loss": 0.0, + "step": 63710 + }, + { + "epoch": 5.944853970327517, + "grad_norm": NaN, + "learning_rate": 6.660392932371639e-08, + "loss": 0.0, + "step": 63711 + }, + { + "epoch": 5.944947280022395, + "grad_norm": NaN, + "learning_rate": 6.637874261566923e-08, + "loss": 0.0, + "step": 63712 + }, + { + "epoch": 5.945040589717272, + "grad_norm": NaN, + "learning_rate": 6.615393714441419e-08, + "loss": 0.0, + "step": 63713 + }, + { + "epoch": 5.9451338994121485, + "grad_norm": NaN, + "learning_rate": 6.592951291051751e-08, + "loss": 0.0, + "step": 63714 + }, + { + "epoch": 5.945227209107026, + "grad_norm": NaN, + "learning_rate": 6.570546991454539e-08, + "loss": 0.0, + "step": 63715 + }, + { + "epoch": 5.945320518801903, + "grad_norm": NaN, + "learning_rate": 6.548180815708071e-08, + "loss": 0.0, + "step": 63716 + }, + { + "epoch": 5.945413828496781, + "grad_norm": NaN, + "learning_rate": 6.525852763867301e-08, + "loss": 0.0, + "step": 63717 + }, + { + "epoch": 5.945507138191658, + "grad_norm": NaN, + "learning_rate": 6.503562835988851e-08, + "loss": 0.0, + "step": 63718 + }, + { + "epoch": 5.945600447886536, + "grad_norm": NaN, + "learning_rate": 6.481311032132674e-08, + "loss": 0.0, + "step": 63719 + }, + { + "epoch": 5.945693757581413, + "grad_norm": NaN, + "learning_rate": 6.45909735235206e-08, + "loss": 0.0, + "step": 63720 + }, + { + "epoch": 5.9457870672762905, + "grad_norm": NaN, + "learning_rate": 6.436921796703632e-08, + "loss": 0.0, + "step": 63721 + }, + { + "epoch": 5.945880376971167, + "grad_norm": NaN, + "learning_rate": 6.414784365245673e-08, + "loss": 0.0, + "step": 63722 + }, + { + "epoch": 5.945973686666044, + "grad_norm": NaN, + "learning_rate": 6.392685058033143e-08, + "loss": 0.0, + "step": 63723 + }, + { + "epoch": 5.946066996360922, + "grad_norm": NaN, + "learning_rate": 6.37062387512266e-08, + "loss": 0.0, + "step": 63724 + }, + { + "epoch": 5.946160306055799, + "grad_norm": NaN, + "learning_rate": 6.348600816570848e-08, + "loss": 0.0, + "step": 63725 + }, + { + "epoch": 5.946253615750677, + "grad_norm": NaN, + "learning_rate": 6.326615882432662e-08, + "loss": 0.0, + "step": 63726 + }, + { + "epoch": 5.946346925445554, + "grad_norm": NaN, + "learning_rate": 6.304669072764724e-08, + "loss": 0.0, + "step": 63727 + }, + { + "epoch": 5.946440235140431, + "grad_norm": NaN, + "learning_rate": 6.282760387621988e-08, + "loss": 0.0, + "step": 63728 + }, + { + "epoch": 5.946533544835308, + "grad_norm": NaN, + "learning_rate": 6.260889827061078e-08, + "loss": 0.0, + "step": 63729 + }, + { + "epoch": 5.9466268545301855, + "grad_norm": NaN, + "learning_rate": 6.239057391136947e-08, + "loss": 0.0, + "step": 63730 + }, + { + "epoch": 5.946720164225063, + "grad_norm": NaN, + "learning_rate": 6.217263079906221e-08, + "loss": 0.0, + "step": 63731 + }, + { + "epoch": 5.94681347391994, + "grad_norm": NaN, + "learning_rate": 6.195506893422186e-08, + "loss": 0.0, + "step": 63732 + }, + { + "epoch": 5.946906783614818, + "grad_norm": NaN, + "learning_rate": 6.173788831741466e-08, + "loss": 0.0, + "step": 63733 + }, + { + "epoch": 5.947000093309695, + "grad_norm": NaN, + "learning_rate": 6.152108894920683e-08, + "loss": 0.0, + "step": 63734 + }, + { + "epoch": 5.947093403004573, + "grad_norm": NaN, + "learning_rate": 6.130467083013124e-08, + "loss": 0.0, + "step": 63735 + }, + { + "epoch": 5.947186712699449, + "grad_norm": NaN, + "learning_rate": 6.108863396073749e-08, + "loss": 0.0, + "step": 63736 + }, + { + "epoch": 5.9472800223943265, + "grad_norm": NaN, + "learning_rate": 6.087297834159177e-08, + "loss": 0.0, + "step": 63737 + }, + { + "epoch": 5.947373332089204, + "grad_norm": NaN, + "learning_rate": 6.065770397321034e-08, + "loss": 0.0, + "step": 63738 + }, + { + "epoch": 5.947466641784081, + "grad_norm": NaN, + "learning_rate": 6.044281085617608e-08, + "loss": 0.0, + "step": 63739 + }, + { + "epoch": 5.947559951478959, + "grad_norm": NaN, + "learning_rate": 6.022829899102189e-08, + "loss": 0.0, + "step": 63740 + }, + { + "epoch": 5.947653261173836, + "grad_norm": NaN, + "learning_rate": 6.001416837828066e-08, + "loss": 0.0, + "step": 63741 + }, + { + "epoch": 5.947746570868714, + "grad_norm": NaN, + "learning_rate": 5.980041901851862e-08, + "loss": 0.0, + "step": 63742 + }, + { + "epoch": 5.94783988056359, + "grad_norm": NaN, + "learning_rate": 5.958705091226868e-08, + "loss": 0.0, + "step": 63743 + }, + { + "epoch": 5.947933190258468, + "grad_norm": NaN, + "learning_rate": 5.937406406006373e-08, + "loss": 0.0, + "step": 63744 + }, + { + "epoch": 5.948026499953345, + "grad_norm": NaN, + "learning_rate": 5.916145846245335e-08, + "loss": 0.0, + "step": 63745 + }, + { + "epoch": 5.948119809648222, + "grad_norm": NaN, + "learning_rate": 5.894923411998709e-08, + "loss": 0.0, + "step": 63746 + }, + { + "epoch": 5.9482131193431, + "grad_norm": NaN, + "learning_rate": 5.8737391033197856e-08, + "loss": 0.0, + "step": 63747 + }, + { + "epoch": 5.948306429037977, + "grad_norm": NaN, + "learning_rate": 5.852592920261856e-08, + "loss": 0.0, + "step": 63748 + }, + { + "epoch": 5.948399738732855, + "grad_norm": NaN, + "learning_rate": 5.831484862879876e-08, + "loss": 0.0, + "step": 63749 + }, + { + "epoch": 5.948493048427732, + "grad_norm": NaN, + "learning_rate": 5.810414931227136e-08, + "loss": 0.0, + "step": 63750 + }, + { + "epoch": 5.948586358122609, + "grad_norm": NaN, + "learning_rate": 5.789383125355263e-08, + "loss": 0.0, + "step": 63751 + }, + { + "epoch": 5.948679667817486, + "grad_norm": NaN, + "learning_rate": 5.768389445320876e-08, + "loss": 0.0, + "step": 63752 + }, + { + "epoch": 5.9487729775123634, + "grad_norm": NaN, + "learning_rate": 5.747433891175601e-08, + "loss": 0.0, + "step": 63753 + }, + { + "epoch": 5.948866287207241, + "grad_norm": NaN, + "learning_rate": 5.72651646297273e-08, + "loss": 0.0, + "step": 63754 + }, + { + "epoch": 5.948959596902118, + "grad_norm": NaN, + "learning_rate": 5.705637160765553e-08, + "loss": 0.0, + "step": 63755 + }, + { + "epoch": 5.949052906596996, + "grad_norm": NaN, + "learning_rate": 5.68479598460736e-08, + "loss": 0.0, + "step": 63756 + }, + { + "epoch": 5.949146216291872, + "grad_norm": NaN, + "learning_rate": 5.6639929345514424e-08, + "loss": 0.0, + "step": 63757 + }, + { + "epoch": 5.94923952598675, + "grad_norm": NaN, + "learning_rate": 5.643228010651091e-08, + "loss": 0.0, + "step": 63758 + }, + { + "epoch": 5.949332835681627, + "grad_norm": NaN, + "learning_rate": 5.6225012129562655e-08, + "loss": 0.0, + "step": 63759 + }, + { + "epoch": 5.9494261453765045, + "grad_norm": NaN, + "learning_rate": 5.601812541523587e-08, + "loss": 0.0, + "step": 63760 + }, + { + "epoch": 5.949519455071382, + "grad_norm": NaN, + "learning_rate": 5.5811619964030165e-08, + "loss": 0.0, + "step": 63761 + }, + { + "epoch": 5.949612764766259, + "grad_norm": NaN, + "learning_rate": 5.5605495776478434e-08, + "loss": 0.0, + "step": 63762 + }, + { + "epoch": 5.949706074461137, + "grad_norm": NaN, + "learning_rate": 5.539975285309695e-08, + "loss": 0.0, + "step": 63763 + }, + { + "epoch": 5.949799384156014, + "grad_norm": NaN, + "learning_rate": 5.519439119443525e-08, + "loss": 0.0, + "step": 63764 + }, + { + "epoch": 5.949892693850892, + "grad_norm": NaN, + "learning_rate": 5.4989410800976294e-08, + "loss": 0.0, + "step": 63765 + }, + { + "epoch": 5.949986003545768, + "grad_norm": NaN, + "learning_rate": 5.47848116732863e-08, + "loss": 0.0, + "step": 63766 + }, + { + "epoch": 5.9500793132406455, + "grad_norm": NaN, + "learning_rate": 5.458059381184821e-08, + "loss": 0.0, + "step": 63767 + }, + { + "epoch": 5.950172622935523, + "grad_norm": NaN, + "learning_rate": 5.437675721719492e-08, + "loss": 0.0, + "step": 63768 + }, + { + "epoch": 5.9502659326304, + "grad_norm": NaN, + "learning_rate": 5.41733018898427e-08, + "loss": 0.0, + "step": 63769 + }, + { + "epoch": 5.950359242325278, + "grad_norm": NaN, + "learning_rate": 5.39702278303078e-08, + "loss": 0.0, + "step": 63770 + }, + { + "epoch": 5.950452552020155, + "grad_norm": NaN, + "learning_rate": 5.3767535039123124e-08, + "loss": 0.0, + "step": 63771 + }, + { + "epoch": 5.950545861715032, + "grad_norm": NaN, + "learning_rate": 5.3565223516771617e-08, + "loss": 0.0, + "step": 63772 + }, + { + "epoch": 5.950639171409909, + "grad_norm": NaN, + "learning_rate": 5.336329326380284e-08, + "loss": 0.0, + "step": 63773 + }, + { + "epoch": 5.950732481104787, + "grad_norm": NaN, + "learning_rate": 5.3161744280699745e-08, + "loss": 0.0, + "step": 63774 + }, + { + "epoch": 5.950825790799664, + "grad_norm": NaN, + "learning_rate": 5.296057656799524e-08, + "loss": 0.0, + "step": 63775 + }, + { + "epoch": 5.950919100494541, + "grad_norm": NaN, + "learning_rate": 5.275979012620557e-08, + "loss": 0.0, + "step": 63776 + }, + { + "epoch": 5.951012410189419, + "grad_norm": NaN, + "learning_rate": 5.255938495581369e-08, + "loss": 0.0, + "step": 63777 + }, + { + "epoch": 5.951105719884296, + "grad_norm": NaN, + "learning_rate": 5.23593610573525e-08, + "loss": 0.0, + "step": 63778 + }, + { + "epoch": 5.951199029579174, + "grad_norm": NaN, + "learning_rate": 5.2159718431321606e-08, + "loss": 0.0, + "step": 63779 + }, + { + "epoch": 5.95129233927405, + "grad_norm": NaN, + "learning_rate": 5.1960457078237264e-08, + "loss": 0.0, + "step": 63780 + }, + { + "epoch": 5.951385648968928, + "grad_norm": NaN, + "learning_rate": 5.1761576998599065e-08, + "loss": 0.0, + "step": 63781 + }, + { + "epoch": 5.951478958663805, + "grad_norm": NaN, + "learning_rate": 5.1563078192906614e-08, + "loss": 0.0, + "step": 63782 + }, + { + "epoch": 5.9515722683586825, + "grad_norm": NaN, + "learning_rate": 5.136496066165951e-08, + "loss": 0.0, + "step": 63783 + }, + { + "epoch": 5.95166557805356, + "grad_norm": NaN, + "learning_rate": 5.116722440539067e-08, + "loss": 0.0, + "step": 63784 + }, + { + "epoch": 5.951758887748437, + "grad_norm": NaN, + "learning_rate": 5.0969869424583034e-08, + "loss": 0.0, + "step": 63785 + }, + { + "epoch": 5.951852197443315, + "grad_norm": NaN, + "learning_rate": 5.0772895719736194e-08, + "loss": 0.0, + "step": 63786 + }, + { + "epoch": 5.951945507138191, + "grad_norm": NaN, + "learning_rate": 5.057630329136641e-08, + "loss": 0.0, + "step": 63787 + }, + { + "epoch": 5.952038816833069, + "grad_norm": NaN, + "learning_rate": 5.038009213995664e-08, + "loss": 0.0, + "step": 63788 + }, + { + "epoch": 5.952132126527946, + "grad_norm": NaN, + "learning_rate": 5.018426226600647e-08, + "loss": 0.0, + "step": 63789 + }, + { + "epoch": 5.9522254362228235, + "grad_norm": NaN, + "learning_rate": 4.998881367003216e-08, + "loss": 0.0, + "step": 63790 + }, + { + "epoch": 5.952318745917701, + "grad_norm": NaN, + "learning_rate": 4.9793746352499995e-08, + "loss": 0.0, + "step": 63791 + }, + { + "epoch": 5.952412055612578, + "grad_norm": NaN, + "learning_rate": 4.9599060313926245e-08, + "loss": 0.0, + "step": 63792 + }, + { + "epoch": 5.952505365307456, + "grad_norm": NaN, + "learning_rate": 4.9404755554827144e-08, + "loss": 0.0, + "step": 63793 + }, + { + "epoch": 5.952598675002333, + "grad_norm": NaN, + "learning_rate": 4.921083207565235e-08, + "loss": 0.0, + "step": 63794 + }, + { + "epoch": 5.95269198469721, + "grad_norm": NaN, + "learning_rate": 4.901728987691811e-08, + "loss": 0.0, + "step": 63795 + }, + { + "epoch": 5.952785294392087, + "grad_norm": NaN, + "learning_rate": 4.8824128959107366e-08, + "loss": 0.0, + "step": 63796 + }, + { + "epoch": 5.952878604086965, + "grad_norm": NaN, + "learning_rate": 4.863134932273638e-08, + "loss": 0.0, + "step": 63797 + }, + { + "epoch": 5.952971913781842, + "grad_norm": NaN, + "learning_rate": 4.843895096825479e-08, + "loss": 0.0, + "step": 63798 + }, + { + "epoch": 5.953065223476719, + "grad_norm": NaN, + "learning_rate": 4.82469338961955e-08, + "loss": 0.0, + "step": 63799 + }, + { + "epoch": 5.953158533171597, + "grad_norm": NaN, + "learning_rate": 4.8055298107008145e-08, + "loss": 0.0, + "step": 63800 + }, + { + "epoch": 5.953251842866473, + "grad_norm": NaN, + "learning_rate": 4.786404360120899e-08, + "loss": 0.0, + "step": 63801 + }, + { + "epoch": 5.953345152561351, + "grad_norm": NaN, + "learning_rate": 4.767317037926432e-08, + "loss": 0.0, + "step": 63802 + }, + { + "epoch": 5.953438462256228, + "grad_norm": NaN, + "learning_rate": 4.748267844167375e-08, + "loss": 0.0, + "step": 63803 + }, + { + "epoch": 5.953531771951106, + "grad_norm": NaN, + "learning_rate": 4.729256778892021e-08, + "loss": 0.0, + "step": 63804 + }, + { + "epoch": 5.953625081645983, + "grad_norm": NaN, + "learning_rate": 4.710283842147e-08, + "loss": 0.0, + "step": 63805 + }, + { + "epoch": 5.9537183913408604, + "grad_norm": NaN, + "learning_rate": 4.6913490339822725e-08, + "loss": 0.0, + "step": 63806 + }, + { + "epoch": 5.953811701035738, + "grad_norm": NaN, + "learning_rate": 4.6724523544461324e-08, + "loss": 0.0, + "step": 63807 + }, + { + "epoch": 5.953905010730615, + "grad_norm": NaN, + "learning_rate": 4.653593803585209e-08, + "loss": 0.0, + "step": 63808 + }, + { + "epoch": 5.953998320425493, + "grad_norm": NaN, + "learning_rate": 4.634773381447798e-08, + "loss": 0.0, + "step": 63809 + }, + { + "epoch": 5.954091630120369, + "grad_norm": NaN, + "learning_rate": 4.615991088083859e-08, + "loss": 0.0, + "step": 63810 + }, + { + "epoch": 5.954184939815247, + "grad_norm": NaN, + "learning_rate": 4.597246923538356e-08, + "loss": 0.0, + "step": 63811 + }, + { + "epoch": 5.954278249510124, + "grad_norm": NaN, + "learning_rate": 4.578540887859583e-08, + "loss": 0.0, + "step": 63812 + }, + { + "epoch": 5.9543715592050015, + "grad_norm": NaN, + "learning_rate": 4.559872981097501e-08, + "loss": 0.0, + "step": 63813 + }, + { + "epoch": 5.954464868899879, + "grad_norm": NaN, + "learning_rate": 4.5412432032954083e-08, + "loss": 0.0, + "step": 63814 + }, + { + "epoch": 5.954558178594756, + "grad_norm": NaN, + "learning_rate": 4.52265155450493e-08, + "loss": 0.0, + "step": 63815 + }, + { + "epoch": 5.954651488289633, + "grad_norm": NaN, + "learning_rate": 4.504098034769366e-08, + "loss": 0.0, + "step": 63816 + }, + { + "epoch": 5.95474479798451, + "grad_norm": NaN, + "learning_rate": 4.48558264414034e-08, + "loss": 0.0, + "step": 63817 + }, + { + "epoch": 5.954838107679388, + "grad_norm": NaN, + "learning_rate": 4.4671053826611516e-08, + "loss": 0.0, + "step": 63818 + }, + { + "epoch": 5.954931417374265, + "grad_norm": NaN, + "learning_rate": 4.44866625037843e-08, + "loss": 0.0, + "step": 63819 + }, + { + "epoch": 5.9550247270691425, + "grad_norm": NaN, + "learning_rate": 4.4302652473438006e-08, + "loss": 0.0, + "step": 63820 + }, + { + "epoch": 5.95511803676402, + "grad_norm": NaN, + "learning_rate": 4.411902373598897e-08, + "loss": 0.0, + "step": 63821 + }, + { + "epoch": 5.955211346458897, + "grad_norm": NaN, + "learning_rate": 4.3935776291936786e-08, + "loss": 0.0, + "step": 63822 + }, + { + "epoch": 5.955304656153775, + "grad_norm": NaN, + "learning_rate": 4.3752910141731104e-08, + "loss": 0.0, + "step": 63823 + }, + { + "epoch": 5.955397965848651, + "grad_norm": NaN, + "learning_rate": 4.357042528583821e-08, + "loss": 0.0, + "step": 63824 + }, + { + "epoch": 5.955491275543529, + "grad_norm": NaN, + "learning_rate": 4.33883217247244e-08, + "loss": 0.0, + "step": 63825 + }, + { + "epoch": 5.955584585238406, + "grad_norm": NaN, + "learning_rate": 4.3206599458855964e-08, + "loss": 0.0, + "step": 63826 + }, + { + "epoch": 5.955677894933284, + "grad_norm": NaN, + "learning_rate": 4.30252584886992e-08, + "loss": 0.0, + "step": 63827 + }, + { + "epoch": 5.955771204628161, + "grad_norm": NaN, + "learning_rate": 4.284429881470375e-08, + "loss": 0.0, + "step": 63828 + }, + { + "epoch": 5.955864514323038, + "grad_norm": NaN, + "learning_rate": 4.26637204373359e-08, + "loss": 0.0, + "step": 63829 + }, + { + "epoch": 5.955957824017916, + "grad_norm": NaN, + "learning_rate": 4.24835233570453e-08, + "loss": 0.0, + "step": 63830 + }, + { + "epoch": 5.956051133712792, + "grad_norm": NaN, + "learning_rate": 4.2303707574314896e-08, + "loss": 0.0, + "step": 63831 + }, + { + "epoch": 5.95614444340767, + "grad_norm": NaN, + "learning_rate": 4.212427308957766e-08, + "loss": 0.0, + "step": 63832 + }, + { + "epoch": 5.956237753102547, + "grad_norm": NaN, + "learning_rate": 4.194521990328326e-08, + "loss": 0.0, + "step": 63833 + }, + { + "epoch": 5.956331062797425, + "grad_norm": NaN, + "learning_rate": 4.176654801591461e-08, + "loss": 0.0, + "step": 63834 + }, + { + "epoch": 5.956424372492302, + "grad_norm": NaN, + "learning_rate": 4.1588257427921377e-08, + "loss": 0.0, + "step": 63835 + }, + { + "epoch": 5.9565176821871795, + "grad_norm": NaN, + "learning_rate": 4.141034813973654e-08, + "loss": 0.0, + "step": 63836 + }, + { + "epoch": 5.956610991882057, + "grad_norm": NaN, + "learning_rate": 4.1232820151826386e-08, + "loss": 0.0, + "step": 63837 + }, + { + "epoch": 5.956704301576934, + "grad_norm": NaN, + "learning_rate": 4.105567346464056e-08, + "loss": 0.0, + "step": 63838 + }, + { + "epoch": 5.956797611271811, + "grad_norm": NaN, + "learning_rate": 4.08789080786287e-08, + "loss": 0.0, + "step": 63839 + }, + { + "epoch": 5.956890920966688, + "grad_norm": NaN, + "learning_rate": 4.070252399424045e-08, + "loss": 0.0, + "step": 63840 + }, + { + "epoch": 5.956984230661566, + "grad_norm": NaN, + "learning_rate": 4.052652121192546e-08, + "loss": 0.0, + "step": 63841 + }, + { + "epoch": 5.957077540356443, + "grad_norm": NaN, + "learning_rate": 4.0350899732116695e-08, + "loss": 0.0, + "step": 63842 + }, + { + "epoch": 5.9571708500513205, + "grad_norm": NaN, + "learning_rate": 4.0175659555297114e-08, + "loss": 0.0, + "step": 63843 + }, + { + "epoch": 5.957264159746198, + "grad_norm": NaN, + "learning_rate": 4.000080068188305e-08, + "loss": 0.0, + "step": 63844 + }, + { + "epoch": 5.9573574694410745, + "grad_norm": NaN, + "learning_rate": 3.982632311230749e-08, + "loss": 0.0, + "step": 63845 + }, + { + "epoch": 5.957450779135952, + "grad_norm": NaN, + "learning_rate": 3.965222684705338e-08, + "loss": 0.0, + "step": 63846 + }, + { + "epoch": 5.957544088830829, + "grad_norm": NaN, + "learning_rate": 3.9478511886537054e-08, + "loss": 0.0, + "step": 63847 + }, + { + "epoch": 5.957637398525707, + "grad_norm": NaN, + "learning_rate": 3.9305178231191506e-08, + "loss": 0.0, + "step": 63848 + }, + { + "epoch": 5.957730708220584, + "grad_norm": NaN, + "learning_rate": 3.913222588149967e-08, + "loss": 0.0, + "step": 63849 + }, + { + "epoch": 5.957824017915462, + "grad_norm": NaN, + "learning_rate": 3.895965483784458e-08, + "loss": 0.0, + "step": 63850 + }, + { + "epoch": 5.957917327610339, + "grad_norm": NaN, + "learning_rate": 3.878746510070918e-08, + "loss": 0.0, + "step": 63851 + }, + { + "epoch": 5.958010637305216, + "grad_norm": NaN, + "learning_rate": 3.861565667050981e-08, + "loss": 0.0, + "step": 63852 + }, + { + "epoch": 5.958103947000093, + "grad_norm": NaN, + "learning_rate": 3.84442295476961e-08, + "loss": 0.0, + "step": 63853 + }, + { + "epoch": 5.95819725669497, + "grad_norm": NaN, + "learning_rate": 3.8273183732684396e-08, + "loss": 0.0, + "step": 63854 + }, + { + "epoch": 5.958290566389848, + "grad_norm": NaN, + "learning_rate": 3.8102519225924336e-08, + "loss": 0.0, + "step": 63855 + }, + { + "epoch": 5.958383876084725, + "grad_norm": NaN, + "learning_rate": 3.7932236027848896e-08, + "loss": 0.0, + "step": 63856 + }, + { + "epoch": 5.958477185779603, + "grad_norm": NaN, + "learning_rate": 3.7762334138907724e-08, + "loss": 0.0, + "step": 63857 + }, + { + "epoch": 5.95857049547448, + "grad_norm": NaN, + "learning_rate": 3.75928135595005e-08, + "loss": 0.0, + "step": 63858 + }, + { + "epoch": 5.9586638051693575, + "grad_norm": NaN, + "learning_rate": 3.742367429006021e-08, + "loss": 0.0, + "step": 63859 + }, + { + "epoch": 5.958757114864234, + "grad_norm": NaN, + "learning_rate": 3.725491633105315e-08, + "loss": 0.0, + "step": 63860 + }, + { + "epoch": 5.958850424559111, + "grad_norm": NaN, + "learning_rate": 3.7086539682862347e-08, + "loss": 0.0, + "step": 63861 + }, + { + "epoch": 5.958943734253989, + "grad_norm": NaN, + "learning_rate": 3.691854434595409e-08, + "loss": 0.0, + "step": 63862 + }, + { + "epoch": 5.959037043948866, + "grad_norm": NaN, + "learning_rate": 3.6750930320728066e-08, + "loss": 0.0, + "step": 63863 + }, + { + "epoch": 5.959130353643744, + "grad_norm": NaN, + "learning_rate": 3.658369760763391e-08, + "loss": 0.0, + "step": 63864 + }, + { + "epoch": 5.959223663338621, + "grad_norm": NaN, + "learning_rate": 3.6416846207071304e-08, + "loss": 0.0, + "step": 63865 + }, + { + "epoch": 5.9593169730334985, + "grad_norm": NaN, + "learning_rate": 3.625037611948989e-08, + "loss": 0.0, + "step": 63866 + }, + { + "epoch": 5.959410282728376, + "grad_norm": NaN, + "learning_rate": 3.6084287345305994e-08, + "loss": 0.0, + "step": 63867 + }, + { + "epoch": 5.9595035924232524, + "grad_norm": NaN, + "learning_rate": 3.591857988491931e-08, + "loss": 0.0, + "step": 63868 + }, + { + "epoch": 5.95959690211813, + "grad_norm": NaN, + "learning_rate": 3.5753253738779464e-08, + "loss": 0.0, + "step": 63869 + }, + { + "epoch": 5.959690211813007, + "grad_norm": NaN, + "learning_rate": 3.5588308907302795e-08, + "loss": 0.0, + "step": 63870 + }, + { + "epoch": 5.959783521507885, + "grad_norm": NaN, + "learning_rate": 3.542374539088899e-08, + "loss": 0.0, + "step": 63871 + }, + { + "epoch": 5.959876831202762, + "grad_norm": NaN, + "learning_rate": 3.525956318997103e-08, + "loss": 0.0, + "step": 63872 + }, + { + "epoch": 5.9599701408976395, + "grad_norm": NaN, + "learning_rate": 3.509576230496525e-08, + "loss": 0.0, + "step": 63873 + }, + { + "epoch": 5.960063450592516, + "grad_norm": NaN, + "learning_rate": 3.493234273630463e-08, + "loss": 0.0, + "step": 63874 + }, + { + "epoch": 5.9601567602873935, + "grad_norm": NaN, + "learning_rate": 3.476930448437221e-08, + "loss": 0.0, + "step": 63875 + }, + { + "epoch": 5.960250069982271, + "grad_norm": NaN, + "learning_rate": 3.4606647549600964e-08, + "loss": 0.0, + "step": 63876 + }, + { + "epoch": 5.960343379677148, + "grad_norm": NaN, + "learning_rate": 3.4444371932407234e-08, + "loss": 0.0, + "step": 63877 + }, + { + "epoch": 5.960436689372026, + "grad_norm": NaN, + "learning_rate": 3.42824776331907e-08, + "loss": 0.0, + "step": 63878 + }, + { + "epoch": 5.960529999066903, + "grad_norm": NaN, + "learning_rate": 3.412096465236769e-08, + "loss": 0.0, + "step": 63879 + }, + { + "epoch": 5.960623308761781, + "grad_norm": NaN, + "learning_rate": 3.395983299037119e-08, + "loss": 0.0, + "step": 63880 + }, + { + "epoch": 5.960716618456658, + "grad_norm": NaN, + "learning_rate": 3.379908264756759e-08, + "loss": 0.0, + "step": 63881 + }, + { + "epoch": 5.960809928151535, + "grad_norm": NaN, + "learning_rate": 3.363871362440651e-08, + "loss": 0.0, + "step": 63882 + }, + { + "epoch": 5.960903237846412, + "grad_norm": NaN, + "learning_rate": 3.347872592127099e-08, + "loss": 0.0, + "step": 63883 + }, + { + "epoch": 5.960996547541289, + "grad_norm": NaN, + "learning_rate": 3.3319119538577353e-08, + "loss": 0.0, + "step": 63884 + }, + { + "epoch": 5.961089857236167, + "grad_norm": NaN, + "learning_rate": 3.315989447672529e-08, + "loss": 0.0, + "step": 63885 + }, + { + "epoch": 5.961183166931044, + "grad_norm": NaN, + "learning_rate": 3.300105073613113e-08, + "loss": 0.0, + "step": 63886 + }, + { + "epoch": 5.961276476625922, + "grad_norm": NaN, + "learning_rate": 3.284258831719455e-08, + "loss": 0.0, + "step": 63887 + }, + { + "epoch": 5.961369786320799, + "grad_norm": NaN, + "learning_rate": 3.268450722031524e-08, + "loss": 0.0, + "step": 63888 + }, + { + "epoch": 5.961463096015676, + "grad_norm": NaN, + "learning_rate": 3.2526807445892866e-08, + "loss": 0.0, + "step": 63889 + }, + { + "epoch": 5.961556405710553, + "grad_norm": NaN, + "learning_rate": 3.236948899432712e-08, + "loss": 0.0, + "step": 63890 + }, + { + "epoch": 5.96164971540543, + "grad_norm": NaN, + "learning_rate": 3.221255186601768e-08, + "loss": 0.0, + "step": 63891 + }, + { + "epoch": 5.961743025100308, + "grad_norm": NaN, + "learning_rate": 3.2055996061380876e-08, + "loss": 0.0, + "step": 63892 + }, + { + "epoch": 5.961836334795185, + "grad_norm": NaN, + "learning_rate": 3.189982158079973e-08, + "loss": 0.0, + "step": 63893 + }, + { + "epoch": 5.961929644490063, + "grad_norm": NaN, + "learning_rate": 3.174402842467394e-08, + "loss": 0.0, + "step": 63894 + }, + { + "epoch": 5.96202295418494, + "grad_norm": NaN, + "learning_rate": 3.158861659338652e-08, + "loss": 0.0, + "step": 63895 + }, + { + "epoch": 5.9621162638798175, + "grad_norm": NaN, + "learning_rate": 3.14335860873538e-08, + "loss": 0.0, + "step": 63896 + }, + { + "epoch": 5.962209573574694, + "grad_norm": NaN, + "learning_rate": 3.127893690695882e-08, + "loss": 0.0, + "step": 63897 + }, + { + "epoch": 5.9623028832695715, + "grad_norm": NaN, + "learning_rate": 3.112466905260125e-08, + "loss": 0.0, + "step": 63898 + }, + { + "epoch": 5.962396192964449, + "grad_norm": NaN, + "learning_rate": 3.097078252468077e-08, + "loss": 0.0, + "step": 63899 + }, + { + "epoch": 5.962489502659326, + "grad_norm": NaN, + "learning_rate": 3.081727732356376e-08, + "loss": 0.0, + "step": 63900 + }, + { + "epoch": 5.962582812354204, + "grad_norm": NaN, + "learning_rate": 3.06641534496499e-08, + "loss": 0.0, + "step": 63901 + }, + { + "epoch": 5.962676122049081, + "grad_norm": NaN, + "learning_rate": 3.051141090333886e-08, + "loss": 0.0, + "step": 63902 + }, + { + "epoch": 5.962769431743959, + "grad_norm": NaN, + "learning_rate": 3.035904968501368e-08, + "loss": 0.0, + "step": 63903 + }, + { + "epoch": 5.962862741438835, + "grad_norm": NaN, + "learning_rate": 3.0207069795057384e-08, + "loss": 0.0, + "step": 63904 + }, + { + "epoch": 5.9629560511337125, + "grad_norm": NaN, + "learning_rate": 3.005547123386964e-08, + "loss": 0.0, + "step": 63905 + }, + { + "epoch": 5.96304936082859, + "grad_norm": NaN, + "learning_rate": 2.990425400181684e-08, + "loss": 0.0, + "step": 63906 + }, + { + "epoch": 5.963142670523467, + "grad_norm": NaN, + "learning_rate": 2.9753418099298654e-08, + "loss": 0.0, + "step": 63907 + }, + { + "epoch": 5.963235980218345, + "grad_norm": NaN, + "learning_rate": 2.9602963526681456e-08, + "loss": 0.0, + "step": 63908 + }, + { + "epoch": 5.963329289913222, + "grad_norm": NaN, + "learning_rate": 2.945289028438158e-08, + "loss": 0.0, + "step": 63909 + }, + { + "epoch": 5.9634225996081, + "grad_norm": NaN, + "learning_rate": 2.9303198372732095e-08, + "loss": 0.0, + "step": 63910 + }, + { + "epoch": 5.963515909302977, + "grad_norm": NaN, + "learning_rate": 2.9153887792165986e-08, + "loss": 0.0, + "step": 63911 + }, + { + "epoch": 5.963609218997854, + "grad_norm": NaN, + "learning_rate": 2.9004958543016322e-08, + "loss": 0.0, + "step": 63912 + }, + { + "epoch": 5.963702528692731, + "grad_norm": NaN, + "learning_rate": 2.8856410625699432e-08, + "loss": 0.0, + "step": 63913 + }, + { + "epoch": 5.963795838387608, + "grad_norm": NaN, + "learning_rate": 2.8708244040565043e-08, + "loss": 0.0, + "step": 63914 + }, + { + "epoch": 5.963889148082486, + "grad_norm": NaN, + "learning_rate": 2.8560458787996176e-08, + "loss": 0.0, + "step": 63915 + }, + { + "epoch": 5.963982457777363, + "grad_norm": NaN, + "learning_rate": 2.8413054868375863e-08, + "loss": 0.0, + "step": 63916 + }, + { + "epoch": 5.964075767472241, + "grad_norm": NaN, + "learning_rate": 2.8266032282087124e-08, + "loss": 0.0, + "step": 63917 + }, + { + "epoch": 5.964169077167117, + "grad_norm": NaN, + "learning_rate": 2.811939102947969e-08, + "loss": 0.0, + "step": 63918 + }, + { + "epoch": 5.964262386861995, + "grad_norm": NaN, + "learning_rate": 2.797313111095323e-08, + "loss": 0.0, + "step": 63919 + }, + { + "epoch": 5.964355696556872, + "grad_norm": NaN, + "learning_rate": 2.782725252685747e-08, + "loss": 0.0, + "step": 63920 + }, + { + "epoch": 5.9644490062517495, + "grad_norm": NaN, + "learning_rate": 2.7681755277575434e-08, + "loss": 0.0, + "step": 63921 + }, + { + "epoch": 5.964542315946627, + "grad_norm": NaN, + "learning_rate": 2.7536639363473502e-08, + "loss": 0.0, + "step": 63922 + }, + { + "epoch": 5.964635625641504, + "grad_norm": NaN, + "learning_rate": 2.739190478491804e-08, + "loss": 0.0, + "step": 63923 + }, + { + "epoch": 5.964728935336382, + "grad_norm": NaN, + "learning_rate": 2.7247551542275425e-08, + "loss": 0.0, + "step": 63924 + }, + { + "epoch": 5.964822245031259, + "grad_norm": NaN, + "learning_rate": 2.7103579635928687e-08, + "loss": 0.0, + "step": 63925 + }, + { + "epoch": 5.964915554726136, + "grad_norm": NaN, + "learning_rate": 2.695998906622754e-08, + "loss": 0.0, + "step": 63926 + }, + { + "epoch": 5.965008864421013, + "grad_norm": NaN, + "learning_rate": 2.681677983353836e-08, + "loss": 0.0, + "step": 63927 + }, + { + "epoch": 5.9651021741158905, + "grad_norm": NaN, + "learning_rate": 2.6673951938227524e-08, + "loss": 0.0, + "step": 63928 + }, + { + "epoch": 5.965195483810768, + "grad_norm": NaN, + "learning_rate": 2.65315053806614e-08, + "loss": 0.0, + "step": 63929 + }, + { + "epoch": 5.965288793505645, + "grad_norm": NaN, + "learning_rate": 2.6389440161206364e-08, + "loss": 0.0, + "step": 63930 + }, + { + "epoch": 5.965382103200523, + "grad_norm": NaN, + "learning_rate": 2.624775628021214e-08, + "loss": 0.0, + "step": 63931 + }, + { + "epoch": 5.9654754128954, + "grad_norm": NaN, + "learning_rate": 2.6106453738045097e-08, + "loss": 0.0, + "step": 63932 + }, + { + "epoch": 5.965568722590277, + "grad_norm": NaN, + "learning_rate": 2.5965532535054955e-08, + "loss": 0.0, + "step": 63933 + }, + { + "epoch": 5.965662032285154, + "grad_norm": NaN, + "learning_rate": 2.5824992671624744e-08, + "loss": 0.0, + "step": 63934 + }, + { + "epoch": 5.9657553419800315, + "grad_norm": NaN, + "learning_rate": 2.568483414808753e-08, + "loss": 0.0, + "step": 63935 + }, + { + "epoch": 5.965848651674909, + "grad_norm": NaN, + "learning_rate": 2.5545056964809685e-08, + "loss": 0.0, + "step": 63936 + }, + { + "epoch": 5.965941961369786, + "grad_norm": NaN, + "learning_rate": 2.540566112214093e-08, + "loss": 0.0, + "step": 63937 + }, + { + "epoch": 5.966035271064664, + "grad_norm": NaN, + "learning_rate": 2.526664662044764e-08, + "loss": 0.0, + "step": 63938 + }, + { + "epoch": 5.966128580759541, + "grad_norm": NaN, + "learning_rate": 2.5128013460079533e-08, + "loss": 0.0, + "step": 63939 + }, + { + "epoch": 5.966221890454419, + "grad_norm": NaN, + "learning_rate": 2.4989761641369676e-08, + "loss": 0.0, + "step": 63940 + }, + { + "epoch": 5.966315200149295, + "grad_norm": NaN, + "learning_rate": 2.4851891164684445e-08, + "loss": 0.0, + "step": 63941 + }, + { + "epoch": 5.966408509844173, + "grad_norm": NaN, + "learning_rate": 2.4714402030390213e-08, + "loss": 0.0, + "step": 63942 + }, + { + "epoch": 5.96650181953905, + "grad_norm": NaN, + "learning_rate": 2.4577294238820043e-08, + "loss": 0.0, + "step": 63943 + }, + { + "epoch": 5.966595129233927, + "grad_norm": NaN, + "learning_rate": 2.444056779030701e-08, + "loss": 0.0, + "step": 63944 + }, + { + "epoch": 5.966688438928805, + "grad_norm": NaN, + "learning_rate": 2.430422268523413e-08, + "loss": 0.0, + "step": 63945 + }, + { + "epoch": 5.966781748623682, + "grad_norm": NaN, + "learning_rate": 2.4168258923917828e-08, + "loss": 0.0, + "step": 63946 + }, + { + "epoch": 5.966875058318559, + "grad_norm": NaN, + "learning_rate": 2.4032676506724467e-08, + "loss": 0.0, + "step": 63947 + }, + { + "epoch": 5.966968368013436, + "grad_norm": NaN, + "learning_rate": 2.3897475433987122e-08, + "loss": 0.0, + "step": 63948 + }, + { + "epoch": 5.967061677708314, + "grad_norm": NaN, + "learning_rate": 2.376265570603886e-08, + "loss": 0.0, + "step": 63949 + }, + { + "epoch": 5.967154987403191, + "grad_norm": NaN, + "learning_rate": 2.3628217323246045e-08, + "loss": 0.0, + "step": 63950 + }, + { + "epoch": 5.9672482970980685, + "grad_norm": NaN, + "learning_rate": 2.3494160285941753e-08, + "loss": 0.0, + "step": 63951 + }, + { + "epoch": 5.967341606792946, + "grad_norm": NaN, + "learning_rate": 2.3360484594475705e-08, + "loss": 0.0, + "step": 63952 + }, + { + "epoch": 5.967434916487823, + "grad_norm": NaN, + "learning_rate": 2.3227190249164308e-08, + "loss": 0.0, + "step": 63953 + }, + { + "epoch": 5.967528226182701, + "grad_norm": NaN, + "learning_rate": 2.3094277250357284e-08, + "loss": 0.0, + "step": 63954 + }, + { + "epoch": 5.967621535877578, + "grad_norm": NaN, + "learning_rate": 2.2961745598404358e-08, + "loss": 0.0, + "step": 63955 + }, + { + "epoch": 5.967714845572455, + "grad_norm": NaN, + "learning_rate": 2.282959529362194e-08, + "loss": 0.0, + "step": 63956 + }, + { + "epoch": 5.967808155267332, + "grad_norm": NaN, + "learning_rate": 2.2697826336359748e-08, + "loss": 0.0, + "step": 63957 + }, + { + "epoch": 5.9679014649622095, + "grad_norm": NaN, + "learning_rate": 2.2566438726967505e-08, + "loss": 0.0, + "step": 63958 + }, + { + "epoch": 5.967994774657087, + "grad_norm": NaN, + "learning_rate": 2.2435432465744974e-08, + "loss": 0.0, + "step": 63959 + }, + { + "epoch": 5.968088084351964, + "grad_norm": NaN, + "learning_rate": 2.230480755304187e-08, + "loss": 0.0, + "step": 63960 + }, + { + "epoch": 5.968181394046842, + "grad_norm": NaN, + "learning_rate": 2.2174563989207915e-08, + "loss": 0.0, + "step": 63961 + }, + { + "epoch": 5.968274703741718, + "grad_norm": NaN, + "learning_rate": 2.2044701774542872e-08, + "loss": 0.0, + "step": 63962 + }, + { + "epoch": 5.968368013436596, + "grad_norm": NaN, + "learning_rate": 2.1915220909396458e-08, + "loss": 0.0, + "step": 63963 + }, + { + "epoch": 5.968461323131473, + "grad_norm": NaN, + "learning_rate": 2.178612139408509e-08, + "loss": 0.0, + "step": 63964 + }, + { + "epoch": 5.968554632826351, + "grad_norm": NaN, + "learning_rate": 2.1657403228941828e-08, + "loss": 0.0, + "step": 63965 + }, + { + "epoch": 5.968647942521228, + "grad_norm": NaN, + "learning_rate": 2.1529066414316397e-08, + "loss": 0.0, + "step": 63966 + }, + { + "epoch": 5.968741252216105, + "grad_norm": NaN, + "learning_rate": 2.140111095050856e-08, + "loss": 0.0, + "step": 63967 + }, + { + "epoch": 5.968834561910983, + "grad_norm": NaN, + "learning_rate": 2.1273536837834725e-08, + "loss": 0.0, + "step": 63968 + }, + { + "epoch": 5.96892787160586, + "grad_norm": NaN, + "learning_rate": 2.114634407666127e-08, + "loss": 0.0, + "step": 63969 + }, + { + "epoch": 5.969021181300737, + "grad_norm": NaN, + "learning_rate": 2.10195326672713e-08, + "loss": 0.0, + "step": 63970 + }, + { + "epoch": 5.969114490995614, + "grad_norm": NaN, + "learning_rate": 2.089310260999788e-08, + "loss": 0.0, + "step": 63971 + }, + { + "epoch": 5.969207800690492, + "grad_norm": NaN, + "learning_rate": 2.076705390517408e-08, + "loss": 0.0, + "step": 63972 + }, + { + "epoch": 5.969301110385369, + "grad_norm": NaN, + "learning_rate": 2.064138655309966e-08, + "loss": 0.0, + "step": 63973 + }, + { + "epoch": 5.9693944200802465, + "grad_norm": NaN, + "learning_rate": 2.0516100554124337e-08, + "loss": 0.0, + "step": 63974 + }, + { + "epoch": 5.969487729775124, + "grad_norm": NaN, + "learning_rate": 2.039119590853122e-08, + "loss": 0.0, + "step": 63975 + }, + { + "epoch": 5.969581039470001, + "grad_norm": NaN, + "learning_rate": 2.0266672616670032e-08, + "loss": 0.0, + "step": 63976 + }, + { + "epoch": 5.969674349164878, + "grad_norm": NaN, + "learning_rate": 2.014253067884053e-08, + "loss": 0.0, + "step": 63977 + }, + { + "epoch": 5.969767658859755, + "grad_norm": NaN, + "learning_rate": 2.0018770095359126e-08, + "loss": 0.0, + "step": 63978 + }, + { + "epoch": 5.969860968554633, + "grad_norm": NaN, + "learning_rate": 1.989539086654224e-08, + "loss": 0.0, + "step": 63979 + }, + { + "epoch": 5.96995427824951, + "grad_norm": NaN, + "learning_rate": 1.9772392992706275e-08, + "loss": 0.0, + "step": 63980 + }, + { + "epoch": 5.9700475879443875, + "grad_norm": NaN, + "learning_rate": 1.9649776474167656e-08, + "loss": 0.0, + "step": 63981 + }, + { + "epoch": 5.970140897639265, + "grad_norm": NaN, + "learning_rate": 1.9527541311226137e-08, + "loss": 0.0, + "step": 63982 + }, + { + "epoch": 5.970234207334142, + "grad_norm": NaN, + "learning_rate": 1.940568750419813e-08, + "loss": 0.0, + "step": 63983 + }, + { + "epoch": 5.97032751702902, + "grad_norm": NaN, + "learning_rate": 1.92842150533834e-08, + "loss": 0.0, + "step": 63984 + }, + { + "epoch": 5.970420826723896, + "grad_norm": NaN, + "learning_rate": 1.916312395911501e-08, + "loss": 0.0, + "step": 63985 + }, + { + "epoch": 5.970514136418774, + "grad_norm": NaN, + "learning_rate": 1.9042414221692725e-08, + "loss": 0.0, + "step": 63986 + }, + { + "epoch": 5.970607446113651, + "grad_norm": NaN, + "learning_rate": 1.892208584139965e-08, + "loss": 0.0, + "step": 63987 + }, + { + "epoch": 5.9707007558085285, + "grad_norm": NaN, + "learning_rate": 1.8802138818568845e-08, + "loss": 0.0, + "step": 63988 + }, + { + "epoch": 5.970794065503406, + "grad_norm": NaN, + "learning_rate": 1.868257315350008e-08, + "loss": 0.0, + "step": 63989 + }, + { + "epoch": 5.970887375198283, + "grad_norm": NaN, + "learning_rate": 1.856338884649311e-08, + "loss": 0.0, + "step": 63990 + }, + { + "epoch": 5.97098068489316, + "grad_norm": NaN, + "learning_rate": 1.8444585897847695e-08, + "loss": 0.0, + "step": 63991 + }, + { + "epoch": 5.971073994588037, + "grad_norm": NaN, + "learning_rate": 1.8326164307880254e-08, + "loss": 0.0, + "step": 63992 + }, + { + "epoch": 5.971167304282915, + "grad_norm": NaN, + "learning_rate": 1.8208124076873888e-08, + "loss": 0.0, + "step": 63993 + }, + { + "epoch": 5.971260613977792, + "grad_norm": NaN, + "learning_rate": 1.8090465205128356e-08, + "loss": 0.0, + "step": 63994 + }, + { + "epoch": 5.97135392367267, + "grad_norm": NaN, + "learning_rate": 1.7973187692960078e-08, + "loss": 0.0, + "step": 63995 + }, + { + "epoch": 5.971447233367547, + "grad_norm": NaN, + "learning_rate": 1.7856291540652158e-08, + "loss": 0.0, + "step": 63996 + }, + { + "epoch": 5.971540543062424, + "grad_norm": NaN, + "learning_rate": 1.7739776748504354e-08, + "loss": 0.0, + "step": 63997 + }, + { + "epoch": 5.971633852757302, + "grad_norm": NaN, + "learning_rate": 1.762364331683308e-08, + "loss": 0.0, + "step": 63998 + }, + { + "epoch": 5.971727162452179, + "grad_norm": NaN, + "learning_rate": 1.7507891245904794e-08, + "loss": 0.0, + "step": 63999 + }, + { + "epoch": 5.971820472147056, + "grad_norm": NaN, + "learning_rate": 1.739252053601925e-08, + "loss": 0.0, + "step": 64000 + }, + { + "epoch": 5.971913781841933, + "grad_norm": NaN, + "learning_rate": 1.7277531187492864e-08, + "loss": 0.0, + "step": 64001 + }, + { + "epoch": 5.972007091536811, + "grad_norm": NaN, + "learning_rate": 1.7162923200592094e-08, + "loss": 0.0, + "step": 64002 + }, + { + "epoch": 5.972100401231688, + "grad_norm": NaN, + "learning_rate": 1.7048696575616693e-08, + "loss": 0.0, + "step": 64003 + }, + { + "epoch": 5.9721937109265655, + "grad_norm": NaN, + "learning_rate": 1.6934851312866427e-08, + "loss": 0.0, + "step": 64004 + }, + { + "epoch": 5.972287020621443, + "grad_norm": NaN, + "learning_rate": 1.6821387412624397e-08, + "loss": 0.0, + "step": 64005 + }, + { + "epoch": 5.972380330316319, + "grad_norm": NaN, + "learning_rate": 1.6708304875157062e-08, + "loss": 0.0, + "step": 64006 + }, + { + "epoch": 5.972473640011197, + "grad_norm": NaN, + "learning_rate": 1.6595603700797488e-08, + "loss": 0.0, + "step": 64007 + }, + { + "epoch": 5.972566949706074, + "grad_norm": NaN, + "learning_rate": 1.6483283889795472e-08, + "loss": 0.0, + "step": 64008 + }, + { + "epoch": 5.972660259400952, + "grad_norm": NaN, + "learning_rate": 1.6371345442434126e-08, + "loss": 0.0, + "step": 64009 + }, + { + "epoch": 5.972753569095829, + "grad_norm": NaN, + "learning_rate": 1.6259788359029857e-08, + "loss": 0.0, + "step": 64010 + }, + { + "epoch": 5.9728468787907065, + "grad_norm": NaN, + "learning_rate": 1.6148612639849122e-08, + "loss": 0.0, + "step": 64011 + }, + { + "epoch": 5.972940188485584, + "grad_norm": NaN, + "learning_rate": 1.6037818285158376e-08, + "loss": 0.0, + "step": 64012 + }, + { + "epoch": 5.973033498180461, + "grad_norm": NaN, + "learning_rate": 1.5927405295274032e-08, + "loss": 0.0, + "step": 64013 + }, + { + "epoch": 5.973126807875338, + "grad_norm": NaN, + "learning_rate": 1.581737367044589e-08, + "loss": 0.0, + "step": 64014 + }, + { + "epoch": 5.973220117570215, + "grad_norm": NaN, + "learning_rate": 1.5707723410973704e-08, + "loss": 0.0, + "step": 64015 + }, + { + "epoch": 5.973313427265093, + "grad_norm": NaN, + "learning_rate": 1.559845451712394e-08, + "loss": 0.0, + "step": 64016 + }, + { + "epoch": 5.97340673695997, + "grad_norm": NaN, + "learning_rate": 1.5489566989179693e-08, + "loss": 0.0, + "step": 64017 + }, + { + "epoch": 5.973500046654848, + "grad_norm": NaN, + "learning_rate": 1.5381060827407422e-08, + "loss": 0.0, + "step": 64018 + }, + { + "epoch": 5.973593356349725, + "grad_norm": NaN, + "learning_rate": 1.5272936032106886e-08, + "loss": 0.0, + "step": 64019 + }, + { + "epoch": 5.973686666044602, + "grad_norm": NaN, + "learning_rate": 1.5165192603527888e-08, + "loss": 0.0, + "step": 64020 + }, + { + "epoch": 5.973779975739479, + "grad_norm": NaN, + "learning_rate": 1.505783054195353e-08, + "loss": 0.0, + "step": 64021 + }, + { + "epoch": 5.973873285434356, + "grad_norm": NaN, + "learning_rate": 1.4950849847666922e-08, + "loss": 0.0, + "step": 64022 + }, + { + "epoch": 5.973966595129234, + "grad_norm": NaN, + "learning_rate": 1.484425052093452e-08, + "loss": 0.0, + "step": 64023 + }, + { + "epoch": 5.974059904824111, + "grad_norm": NaN, + "learning_rate": 1.4738032562006119e-08, + "loss": 0.0, + "step": 64024 + }, + { + "epoch": 5.974153214518989, + "grad_norm": NaN, + "learning_rate": 1.4632195971181482e-08, + "loss": 0.0, + "step": 64025 + }, + { + "epoch": 5.974246524213866, + "grad_norm": NaN, + "learning_rate": 1.4526740748727061e-08, + "loss": 0.0, + "step": 64026 + }, + { + "epoch": 5.9743398339087435, + "grad_norm": NaN, + "learning_rate": 1.4421666894892659e-08, + "loss": 0.0, + "step": 64027 + }, + { + "epoch": 5.974433143603621, + "grad_norm": NaN, + "learning_rate": 1.4316974409961379e-08, + "loss": 0.0, + "step": 64028 + }, + { + "epoch": 5.974526453298497, + "grad_norm": NaN, + "learning_rate": 1.4212663294183024e-08, + "loss": 0.0, + "step": 64029 + }, + { + "epoch": 5.974619762993375, + "grad_norm": NaN, + "learning_rate": 1.41087335478407e-08, + "loss": 0.0, + "step": 64030 + }, + { + "epoch": 5.974713072688252, + "grad_norm": NaN, + "learning_rate": 1.4005185171184209e-08, + "loss": 0.0, + "step": 64031 + }, + { + "epoch": 5.97480638238313, + "grad_norm": NaN, + "learning_rate": 1.390201816448e-08, + "loss": 0.0, + "step": 64032 + }, + { + "epoch": 5.974899692078007, + "grad_norm": NaN, + "learning_rate": 1.379923252799453e-08, + "loss": 0.0, + "step": 64033 + }, + { + "epoch": 5.9749930017728845, + "grad_norm": NaN, + "learning_rate": 1.3696828261994252e-08, + "loss": 0.0, + "step": 64034 + }, + { + "epoch": 5.975086311467761, + "grad_norm": NaN, + "learning_rate": 1.359480536674562e-08, + "loss": 0.0, + "step": 64035 + }, + { + "epoch": 5.9751796211626385, + "grad_norm": NaN, + "learning_rate": 1.349316384248178e-08, + "loss": 0.0, + "step": 64036 + }, + { + "epoch": 5.975272930857516, + "grad_norm": NaN, + "learning_rate": 1.3391903689469186e-08, + "loss": 0.0, + "step": 64037 + }, + { + "epoch": 5.975366240552393, + "grad_norm": NaN, + "learning_rate": 1.3291024907990943e-08, + "loss": 0.0, + "step": 64038 + }, + { + "epoch": 5.975459550247271, + "grad_norm": NaN, + "learning_rate": 1.3190527498263548e-08, + "loss": 0.0, + "step": 64039 + }, + { + "epoch": 5.975552859942148, + "grad_norm": NaN, + "learning_rate": 1.3090411460586758e-08, + "loss": 0.0, + "step": 64040 + }, + { + "epoch": 5.9756461696370256, + "grad_norm": NaN, + "learning_rate": 1.2990676795177069e-08, + "loss": 0.0, + "step": 64041 + }, + { + "epoch": 5.975739479331903, + "grad_norm": NaN, + "learning_rate": 1.2891323502300932e-08, + "loss": 0.0, + "step": 64042 + }, + { + "epoch": 5.9758327890267795, + "grad_norm": NaN, + "learning_rate": 1.2792351582208148e-08, + "loss": 0.0, + "step": 64043 + }, + { + "epoch": 5.975926098721657, + "grad_norm": NaN, + "learning_rate": 1.2693761035165173e-08, + "loss": 0.0, + "step": 64044 + }, + { + "epoch": 5.976019408416534, + "grad_norm": NaN, + "learning_rate": 1.259555186140515e-08, + "loss": 0.0, + "step": 64045 + }, + { + "epoch": 5.976112718111412, + "grad_norm": NaN, + "learning_rate": 1.2497724061194536e-08, + "loss": 0.0, + "step": 64046 + }, + { + "epoch": 5.976206027806289, + "grad_norm": NaN, + "learning_rate": 1.2400277634766475e-08, + "loss": 0.0, + "step": 64047 + }, + { + "epoch": 5.976299337501167, + "grad_norm": NaN, + "learning_rate": 1.2303212582370769e-08, + "loss": 0.0, + "step": 64048 + }, + { + "epoch": 5.976392647196044, + "grad_norm": NaN, + "learning_rate": 1.2206528904273872e-08, + "loss": 0.0, + "step": 64049 + }, + { + "epoch": 5.9764859568909205, + "grad_norm": NaN, + "learning_rate": 1.2110226600692274e-08, + "loss": 0.0, + "step": 64050 + }, + { + "epoch": 5.976579266585798, + "grad_norm": NaN, + "learning_rate": 1.2014305671892432e-08, + "loss": 0.0, + "step": 64051 + }, + { + "epoch": 5.976672576280675, + "grad_norm": NaN, + "learning_rate": 1.1918766118107492e-08, + "loss": 0.0, + "step": 64052 + }, + { + "epoch": 5.976765885975553, + "grad_norm": NaN, + "learning_rate": 1.1823607939570601e-08, + "loss": 0.0, + "step": 64053 + }, + { + "epoch": 5.97685919567043, + "grad_norm": NaN, + "learning_rate": 1.172883113654821e-08, + "loss": 0.0, + "step": 64054 + }, + { + "epoch": 5.976952505365308, + "grad_norm": NaN, + "learning_rate": 1.163443570927347e-08, + "loss": 0.0, + "step": 64055 + }, + { + "epoch": 5.977045815060185, + "grad_norm": NaN, + "learning_rate": 1.1540421657979526e-08, + "loss": 0.0, + "step": 64056 + }, + { + "epoch": 5.9771391247550625, + "grad_norm": NaN, + "learning_rate": 1.1446788982916178e-08, + "loss": 0.0, + "step": 64057 + }, + { + "epoch": 5.977232434449939, + "grad_norm": NaN, + "learning_rate": 1.1353537684299919e-08, + "loss": 0.0, + "step": 64058 + }, + { + "epoch": 5.977325744144816, + "grad_norm": NaN, + "learning_rate": 1.126066776238055e-08, + "loss": 0.0, + "step": 64059 + }, + { + "epoch": 5.977419053839694, + "grad_norm": NaN, + "learning_rate": 1.1168179217407869e-08, + "loss": 0.0, + "step": 64060 + }, + { + "epoch": 5.977512363534571, + "grad_norm": NaN, + "learning_rate": 1.1076072049598372e-08, + "loss": 0.0, + "step": 64061 + }, + { + "epoch": 5.977605673229449, + "grad_norm": NaN, + "learning_rate": 1.0984346259185207e-08, + "loss": 0.0, + "step": 64062 + }, + { + "epoch": 5.977698982924326, + "grad_norm": NaN, + "learning_rate": 1.089300184641817e-08, + "loss": 0.0, + "step": 64063 + }, + { + "epoch": 5.977792292619203, + "grad_norm": NaN, + "learning_rate": 1.0802038811530412e-08, + "loss": 0.0, + "step": 64064 + }, + { + "epoch": 5.97788560231408, + "grad_norm": NaN, + "learning_rate": 1.071145715472177e-08, + "loss": 0.0, + "step": 64065 + }, + { + "epoch": 5.9779789120089575, + "grad_norm": NaN, + "learning_rate": 1.06212568762587e-08, + "loss": 0.0, + "step": 64066 + }, + { + "epoch": 5.978072221703835, + "grad_norm": NaN, + "learning_rate": 1.0531437976341039e-08, + "loss": 0.0, + "step": 64067 + }, + { + "epoch": 5.978165531398712, + "grad_norm": NaN, + "learning_rate": 1.0442000455218591e-08, + "loss": 0.0, + "step": 64068 + }, + { + "epoch": 5.97825884109359, + "grad_norm": NaN, + "learning_rate": 1.03529443131245e-08, + "loss": 0.0, + "step": 64069 + }, + { + "epoch": 5.978352150788467, + "grad_norm": NaN, + "learning_rate": 1.0264269550258607e-08, + "loss": 0.0, + "step": 64070 + }, + { + "epoch": 5.978445460483345, + "grad_norm": NaN, + "learning_rate": 1.0175976166854061e-08, + "loss": 0.0, + "step": 64071 + }, + { + "epoch": 5.978538770178222, + "grad_norm": NaN, + "learning_rate": 1.0088064163160659e-08, + "loss": 0.0, + "step": 64072 + }, + { + "epoch": 5.9786320798730985, + "grad_norm": NaN, + "learning_rate": 1.0000533539378242e-08, + "loss": 0.0, + "step": 64073 + }, + { + "epoch": 5.978725389567976, + "grad_norm": NaN, + "learning_rate": 9.913384295723303e-09, + "loss": 0.0, + "step": 64074 + }, + { + "epoch": 5.978818699262853, + "grad_norm": NaN, + "learning_rate": 9.826616432428992e-09, + "loss": 0.0, + "step": 64075 + }, + { + "epoch": 5.978912008957731, + "grad_norm": NaN, + "learning_rate": 9.740229949728451e-09, + "loss": 0.0, + "step": 64076 + }, + { + "epoch": 5.979005318652608, + "grad_norm": NaN, + "learning_rate": 9.654224847821524e-09, + "loss": 0.0, + "step": 64077 + }, + { + "epoch": 5.979098628347486, + "grad_norm": NaN, + "learning_rate": 9.568601126924702e-09, + "loss": 0.0, + "step": 64078 + }, + { + "epoch": 5.979191938042362, + "grad_norm": NaN, + "learning_rate": 9.483358787271133e-09, + "loss": 0.0, + "step": 64079 + }, + { + "epoch": 5.97928524773724, + "grad_norm": NaN, + "learning_rate": 9.398497829077312e-09, + "loss": 0.0, + "step": 64080 + }, + { + "epoch": 5.979378557432117, + "grad_norm": NaN, + "learning_rate": 9.314018252543076e-09, + "loss": 0.0, + "step": 64081 + }, + { + "epoch": 5.979471867126994, + "grad_norm": NaN, + "learning_rate": 9.229920057901574e-09, + "loss": 0.0, + "step": 64082 + }, + { + "epoch": 5.979565176821872, + "grad_norm": NaN, + "learning_rate": 9.146203245352645e-09, + "loss": 0.0, + "step": 64083 + }, + { + "epoch": 5.979658486516749, + "grad_norm": NaN, + "learning_rate": 9.062867815112784e-09, + "loss": 0.0, + "step": 64084 + }, + { + "epoch": 5.979751796211627, + "grad_norm": NaN, + "learning_rate": 8.979913767398483e-09, + "loss": 0.0, + "step": 64085 + }, + { + "epoch": 5.979845105906504, + "grad_norm": NaN, + "learning_rate": 8.897341102426236e-09, + "loss": 0.0, + "step": 64086 + }, + { + "epoch": 5.979938415601381, + "grad_norm": NaN, + "learning_rate": 8.81514982037923e-09, + "loss": 0.0, + "step": 64087 + }, + { + "epoch": 5.980031725296258, + "grad_norm": NaN, + "learning_rate": 8.733339921490613e-09, + "loss": 0.0, + "step": 64088 + }, + { + "epoch": 5.9801250349911355, + "grad_norm": NaN, + "learning_rate": 8.651911405976875e-09, + "loss": 0.0, + "step": 64089 + }, + { + "epoch": 5.980218344686013, + "grad_norm": NaN, + "learning_rate": 8.570864274004551e-09, + "loss": 0.0, + "step": 64090 + }, + { + "epoch": 5.98031165438089, + "grad_norm": NaN, + "learning_rate": 8.490198525823444e-09, + "loss": 0.0, + "step": 64091 + }, + { + "epoch": 5.980404964075768, + "grad_norm": NaN, + "learning_rate": 8.409914161616738e-09, + "loss": 0.0, + "step": 64092 + }, + { + "epoch": 5.980498273770645, + "grad_norm": NaN, + "learning_rate": 8.330011181600927e-09, + "loss": 0.0, + "step": 64093 + }, + { + "epoch": 5.980591583465522, + "grad_norm": NaN, + "learning_rate": 8.250489585959196e-09, + "loss": 0.0, + "step": 64094 + }, + { + "epoch": 5.980684893160399, + "grad_norm": NaN, + "learning_rate": 8.171349374908043e-09, + "loss": 0.0, + "step": 64095 + }, + { + "epoch": 5.9807782028552765, + "grad_norm": NaN, + "learning_rate": 8.092590548647304e-09, + "loss": 0.0, + "step": 64096 + }, + { + "epoch": 5.980871512550154, + "grad_norm": NaN, + "learning_rate": 8.014213107376821e-09, + "loss": 0.0, + "step": 64097 + }, + { + "epoch": 5.980964822245031, + "grad_norm": NaN, + "learning_rate": 7.936217051296435e-09, + "loss": 0.0, + "step": 64098 + }, + { + "epoch": 5.981058131939909, + "grad_norm": NaN, + "learning_rate": 7.858602380605983e-09, + "loss": 0.0, + "step": 64099 + }, + { + "epoch": 5.981151441634786, + "grad_norm": NaN, + "learning_rate": 7.781369095505308e-09, + "loss": 0.0, + "step": 64100 + }, + { + "epoch": 5.981244751329664, + "grad_norm": NaN, + "learning_rate": 7.704517196177595e-09, + "loss": 0.0, + "step": 64101 + }, + { + "epoch": 5.98133806102454, + "grad_norm": NaN, + "learning_rate": 7.628046682822687e-09, + "loss": 0.0, + "step": 64102 + }, + { + "epoch": 5.9814313707194176, + "grad_norm": NaN, + "learning_rate": 7.55195755564042e-09, + "loss": 0.0, + "step": 64103 + }, + { + "epoch": 5.981524680414295, + "grad_norm": NaN, + "learning_rate": 7.476249814830637e-09, + "loss": 0.0, + "step": 64104 + }, + { + "epoch": 5.981617990109172, + "grad_norm": NaN, + "learning_rate": 7.400923460559871e-09, + "loss": 0.0, + "step": 64105 + }, + { + "epoch": 5.98171129980405, + "grad_norm": NaN, + "learning_rate": 7.3259784930612685e-09, + "loss": 0.0, + "step": 64106 + }, + { + "epoch": 5.981804609498927, + "grad_norm": NaN, + "learning_rate": 7.2514149124847096e-09, + "loss": 0.0, + "step": 64107 + }, + { + "epoch": 5.981897919193804, + "grad_norm": NaN, + "learning_rate": 7.177232719046688e-09, + "loss": 0.0, + "step": 64108 + }, + { + "epoch": 5.981991228888681, + "grad_norm": NaN, + "learning_rate": 7.103431912913737e-09, + "loss": 0.0, + "step": 64109 + }, + { + "epoch": 5.982084538583559, + "grad_norm": NaN, + "learning_rate": 7.030012494285697e-09, + "loss": 0.0, + "step": 64110 + }, + { + "epoch": 5.982177848278436, + "grad_norm": NaN, + "learning_rate": 6.956974463362408e-09, + "loss": 0.0, + "step": 64111 + }, + { + "epoch": 5.982271157973313, + "grad_norm": NaN, + "learning_rate": 6.8843178203104035e-09, + "loss": 0.0, + "step": 64112 + }, + { + "epoch": 5.982364467668191, + "grad_norm": NaN, + "learning_rate": 6.81204256531287e-09, + "loss": 0.0, + "step": 64113 + }, + { + "epoch": 5.982457777363068, + "grad_norm": NaN, + "learning_rate": 6.740148698569647e-09, + "loss": 0.0, + "step": 64114 + }, + { + "epoch": 5.982551087057946, + "grad_norm": NaN, + "learning_rate": 6.6686362202472695e-09, + "loss": 0.0, + "step": 64115 + }, + { + "epoch": 5.982644396752823, + "grad_norm": NaN, + "learning_rate": 6.597505130545577e-09, + "loss": 0.0, + "step": 64116 + }, + { + "epoch": 5.9827377064477, + "grad_norm": NaN, + "learning_rate": 6.526755429631103e-09, + "loss": 0.0, + "step": 64117 + }, + { + "epoch": 5.982831016142577, + "grad_norm": NaN, + "learning_rate": 6.456387117687034e-09, + "loss": 0.0, + "step": 64118 + }, + { + "epoch": 5.9829243258374545, + "grad_norm": NaN, + "learning_rate": 6.386400194896557e-09, + "loss": 0.0, + "step": 64119 + }, + { + "epoch": 5.983017635532332, + "grad_norm": NaN, + "learning_rate": 6.316794661426205e-09, + "loss": 0.0, + "step": 64120 + }, + { + "epoch": 5.983110945227209, + "grad_norm": NaN, + "learning_rate": 6.247570517459166e-09, + "loss": 0.0, + "step": 64121 + }, + { + "epoch": 5.983204254922087, + "grad_norm": NaN, + "learning_rate": 6.178727763178626e-09, + "loss": 0.0, + "step": 64122 + }, + { + "epoch": 5.983297564616963, + "grad_norm": NaN, + "learning_rate": 6.110266398767772e-09, + "loss": 0.0, + "step": 64123 + }, + { + "epoch": 5.983390874311841, + "grad_norm": NaN, + "learning_rate": 6.042186424376483e-09, + "loss": 0.0, + "step": 64124 + }, + { + "epoch": 5.983484184006718, + "grad_norm": NaN, + "learning_rate": 5.974487840171294e-09, + "loss": 0.0, + "step": 64125 + }, + { + "epoch": 5.9835774937015955, + "grad_norm": NaN, + "learning_rate": 5.907170646368697e-09, + "loss": 0.0, + "step": 64126 + }, + { + "epoch": 5.983670803396473, + "grad_norm": NaN, + "learning_rate": 5.840234843101921e-09, + "loss": 0.0, + "step": 64127 + }, + { + "epoch": 5.98376411309135, + "grad_norm": NaN, + "learning_rate": 5.773680430537497e-09, + "loss": 0.0, + "step": 64128 + }, + { + "epoch": 5.983857422786228, + "grad_norm": NaN, + "learning_rate": 5.707507408875267e-09, + "loss": 0.0, + "step": 64129 + }, + { + "epoch": 5.983950732481105, + "grad_norm": NaN, + "learning_rate": 5.64171577826511e-09, + "loss": 0.0, + "step": 64130 + }, + { + "epoch": 5.984044042175982, + "grad_norm": NaN, + "learning_rate": 5.57630553887356e-09, + "loss": 0.0, + "step": 64131 + }, + { + "epoch": 5.984137351870859, + "grad_norm": NaN, + "learning_rate": 5.51127669086715e-09, + "loss": 0.0, + "step": 64132 + }, + { + "epoch": 5.984230661565737, + "grad_norm": NaN, + "learning_rate": 5.446629234429068e-09, + "loss": 0.0, + "step": 64133 + }, + { + "epoch": 5.984323971260614, + "grad_norm": NaN, + "learning_rate": 5.382363169692538e-09, + "loss": 0.0, + "step": 64134 + }, + { + "epoch": 5.984417280955491, + "grad_norm": NaN, + "learning_rate": 5.31847849684075e-09, + "loss": 0.0, + "step": 64135 + }, + { + "epoch": 5.984510590650369, + "grad_norm": NaN, + "learning_rate": 5.254975216023582e-09, + "loss": 0.0, + "step": 64136 + }, + { + "epoch": 5.984603900345246, + "grad_norm": NaN, + "learning_rate": 5.191853327424223e-09, + "loss": 0.0, + "step": 64137 + }, + { + "epoch": 5.984697210040123, + "grad_norm": NaN, + "learning_rate": 5.12911283119255e-09, + "loss": 0.0, + "step": 64138 + }, + { + "epoch": 5.984790519735, + "grad_norm": NaN, + "learning_rate": 5.066753727478445e-09, + "loss": 0.0, + "step": 64139 + }, + { + "epoch": 5.984883829429878, + "grad_norm": NaN, + "learning_rate": 5.0047760164484415e-09, + "loss": 0.0, + "step": 64140 + }, + { + "epoch": 5.984977139124755, + "grad_norm": NaN, + "learning_rate": 4.943179698269073e-09, + "loss": 0.0, + "step": 64141 + }, + { + "epoch": 5.9850704488196325, + "grad_norm": NaN, + "learning_rate": 4.881964773073565e-09, + "loss": 0.0, + "step": 64142 + }, + { + "epoch": 5.98516375851451, + "grad_norm": NaN, + "learning_rate": 4.821131241028453e-09, + "loss": 0.0, + "step": 64143 + }, + { + "epoch": 5.985257068209387, + "grad_norm": NaN, + "learning_rate": 4.760679102300269e-09, + "loss": 0.0, + "step": 64144 + }, + { + "epoch": 5.985350377904265, + "grad_norm": NaN, + "learning_rate": 4.700608357038893e-09, + "loss": 0.0, + "step": 64145 + }, + { + "epoch": 5.985443687599141, + "grad_norm": NaN, + "learning_rate": 4.640919005394206e-09, + "loss": 0.0, + "step": 64146 + }, + { + "epoch": 5.985536997294019, + "grad_norm": NaN, + "learning_rate": 4.581611047499434e-09, + "loss": 0.0, + "step": 64147 + }, + { + "epoch": 5.985630306988896, + "grad_norm": NaN, + "learning_rate": 4.522684483537764e-09, + "loss": 0.0, + "step": 64148 + }, + { + "epoch": 5.9857236166837735, + "grad_norm": NaN, + "learning_rate": 4.46413931362577e-09, + "loss": 0.0, + "step": 64149 + }, + { + "epoch": 5.985816926378651, + "grad_norm": NaN, + "learning_rate": 4.405975537946638e-09, + "loss": 0.0, + "step": 64150 + }, + { + "epoch": 5.985910236073528, + "grad_norm": NaN, + "learning_rate": 4.348193156616942e-09, + "loss": 0.0, + "step": 64151 + }, + { + "epoch": 5.986003545768405, + "grad_norm": NaN, + "learning_rate": 4.290792169803214e-09, + "loss": 0.0, + "step": 64152 + }, + { + "epoch": 5.986096855463282, + "grad_norm": NaN, + "learning_rate": 4.2337725776553366e-09, + "loss": 0.0, + "step": 64153 + }, + { + "epoch": 5.98619016515816, + "grad_norm": NaN, + "learning_rate": 4.177134380289881e-09, + "loss": 0.0, + "step": 64154 + }, + { + "epoch": 5.986283474853037, + "grad_norm": NaN, + "learning_rate": 4.120877577890036e-09, + "loss": 0.0, + "step": 64155 + }, + { + "epoch": 5.9863767845479146, + "grad_norm": NaN, + "learning_rate": 4.065002170572373e-09, + "loss": 0.0, + "step": 64156 + }, + { + "epoch": 5.986470094242792, + "grad_norm": NaN, + "learning_rate": 4.0095081584701206e-09, + "loss": 0.0, + "step": 64157 + }, + { + "epoch": 5.986563403937669, + "grad_norm": NaN, + "learning_rate": 3.954395541766464e-09, + "loss": 0.0, + "step": 64158 + }, + { + "epoch": 5.986656713632547, + "grad_norm": NaN, + "learning_rate": 3.899664320544671e-09, + "loss": 0.0, + "step": 64159 + }, + { + "epoch": 5.986750023327423, + "grad_norm": NaN, + "learning_rate": 3.845314495004581e-09, + "loss": 0.0, + "step": 64160 + }, + { + "epoch": 5.986843333022301, + "grad_norm": NaN, + "learning_rate": 3.7913460652294614e-09, + "loss": 0.0, + "step": 64161 + }, + { + "epoch": 5.986936642717178, + "grad_norm": NaN, + "learning_rate": 3.737759031385845e-09, + "loss": 0.0, + "step": 64162 + }, + { + "epoch": 5.987029952412056, + "grad_norm": NaN, + "learning_rate": 3.684553393606959e-09, + "loss": 0.0, + "step": 64163 + }, + { + "epoch": 5.987123262106933, + "grad_norm": NaN, + "learning_rate": 3.631729152042684e-09, + "loss": 0.0, + "step": 64164 + }, + { + "epoch": 5.98721657180181, + "grad_norm": NaN, + "learning_rate": 3.5792863067929387e-09, + "loss": 0.0, + "step": 64165 + }, + { + "epoch": 5.987309881496688, + "grad_norm": NaN, + "learning_rate": 3.5272248580076047e-09, + "loss": 0.0, + "step": 64166 + }, + { + "epoch": 5.987403191191564, + "grad_norm": NaN, + "learning_rate": 3.4755448058199076e-09, + "loss": 0.0, + "step": 64167 + }, + { + "epoch": 5.987496500886442, + "grad_norm": NaN, + "learning_rate": 3.424246150379728e-09, + "loss": 0.0, + "step": 64168 + }, + { + "epoch": 5.987589810581319, + "grad_norm": NaN, + "learning_rate": 3.373328891786986e-09, + "loss": 0.0, + "step": 64169 + }, + { + "epoch": 5.987683120276197, + "grad_norm": NaN, + "learning_rate": 3.3227930301749085e-09, + "loss": 0.0, + "step": 64170 + }, + { + "epoch": 5.987776429971074, + "grad_norm": NaN, + "learning_rate": 3.272638565693375e-09, + "loss": 0.0, + "step": 64171 + }, + { + "epoch": 5.9878697396659515, + "grad_norm": NaN, + "learning_rate": 3.2228654984423063e-09, + "loss": 0.0, + "step": 64172 + }, + { + "epoch": 5.987963049360829, + "grad_norm": NaN, + "learning_rate": 3.173473828571582e-09, + "loss": 0.0, + "step": 64173 + }, + { + "epoch": 5.988056359055706, + "grad_norm": NaN, + "learning_rate": 3.1244635562144294e-09, + "loss": 0.0, + "step": 64174 + }, + { + "epoch": 5.988149668750583, + "grad_norm": NaN, + "learning_rate": 3.0758346814541147e-09, + "loss": 0.0, + "step": 64175 + }, + { + "epoch": 5.98824297844546, + "grad_norm": NaN, + "learning_rate": 3.0275872044571716e-09, + "loss": 0.0, + "step": 64176 + }, + { + "epoch": 5.988336288140338, + "grad_norm": NaN, + "learning_rate": 2.97972112532352e-09, + "loss": 0.0, + "step": 64177 + }, + { + "epoch": 5.988429597835215, + "grad_norm": NaN, + "learning_rate": 2.932236444169733e-09, + "loss": 0.0, + "step": 64178 + }, + { + "epoch": 5.9885229075300925, + "grad_norm": NaN, + "learning_rate": 2.885133161145692e-09, + "loss": 0.0, + "step": 64179 + }, + { + "epoch": 5.98861621722497, + "grad_norm": NaN, + "learning_rate": 2.8384112763346624e-09, + "loss": 0.0, + "step": 64180 + }, + { + "epoch": 5.9887095269198465, + "grad_norm": NaN, + "learning_rate": 2.7920707898865247e-09, + "loss": 0.0, + "step": 64181 + }, + { + "epoch": 5.988802836614724, + "grad_norm": NaN, + "learning_rate": 2.7461117019011992e-09, + "loss": 0.0, + "step": 64182 + }, + { + "epoch": 5.988896146309601, + "grad_norm": NaN, + "learning_rate": 2.700534012495259e-09, + "loss": 0.0, + "step": 64183 + }, + { + "epoch": 5.988989456004479, + "grad_norm": NaN, + "learning_rate": 2.6553377217852777e-09, + "loss": 0.0, + "step": 64184 + }, + { + "epoch": 5.989082765699356, + "grad_norm": NaN, + "learning_rate": 2.6105228299044823e-09, + "loss": 0.0, + "step": 64185 + }, + { + "epoch": 5.989176075394234, + "grad_norm": NaN, + "learning_rate": 2.5660893369527925e-09, + "loss": 0.0, + "step": 64186 + }, + { + "epoch": 5.989269385089111, + "grad_norm": NaN, + "learning_rate": 2.5220372430301282e-09, + "loss": 0.0, + "step": 64187 + }, + { + "epoch": 5.989362694783988, + "grad_norm": NaN, + "learning_rate": 2.478366548269717e-09, + "loss": 0.0, + "step": 64188 + }, + { + "epoch": 5.989456004478866, + "grad_norm": NaN, + "learning_rate": 2.435077252771478e-09, + "loss": 0.0, + "step": 64189 + }, + { + "epoch": 5.989549314173742, + "grad_norm": NaN, + "learning_rate": 2.392169356651985e-09, + "loss": 0.0, + "step": 64190 + }, + { + "epoch": 5.98964262386862, + "grad_norm": NaN, + "learning_rate": 2.3496428600111583e-09, + "loss": 0.0, + "step": 64191 + }, + { + "epoch": 5.989735933563497, + "grad_norm": NaN, + "learning_rate": 2.3074977629655712e-09, + "loss": 0.0, + "step": 64192 + }, + { + "epoch": 5.989829243258375, + "grad_norm": NaN, + "learning_rate": 2.2657340656151436e-09, + "loss": 0.0, + "step": 64193 + }, + { + "epoch": 5.989922552953252, + "grad_norm": NaN, + "learning_rate": 2.224351768076449e-09, + "loss": 0.0, + "step": 64194 + }, + { + "epoch": 5.9900158626481295, + "grad_norm": NaN, + "learning_rate": 2.1833508704494075e-09, + "loss": 0.0, + "step": 64195 + }, + { + "epoch": 5.990109172343006, + "grad_norm": NaN, + "learning_rate": 2.142731372833939e-09, + "loss": 0.0, + "step": 64196 + }, + { + "epoch": 5.990202482037883, + "grad_norm": NaN, + "learning_rate": 2.102493275329964e-09, + "loss": 0.0, + "step": 64197 + }, + { + "epoch": 5.990295791732761, + "grad_norm": NaN, + "learning_rate": 2.0626365780540554e-09, + "loss": 0.0, + "step": 64198 + }, + { + "epoch": 5.990389101427638, + "grad_norm": NaN, + "learning_rate": 2.023161281106134e-09, + "loss": 0.0, + "step": 64199 + }, + { + "epoch": 5.990482411122516, + "grad_norm": NaN, + "learning_rate": 1.9840673845694653e-09, + "loss": 0.0, + "step": 64200 + }, + { + "epoch": 5.990575720817393, + "grad_norm": NaN, + "learning_rate": 1.945354888560624e-09, + "loss": 0.0, + "step": 64201 + }, + { + "epoch": 5.9906690305122705, + "grad_norm": NaN, + "learning_rate": 1.9070237931795297e-09, + "loss": 0.0, + "step": 64202 + }, + { + "epoch": 5.990762340207148, + "grad_norm": NaN, + "learning_rate": 1.8690740984927956e-09, + "loss": 0.0, + "step": 64203 + }, + { + "epoch": 5.9908556499020245, + "grad_norm": NaN, + "learning_rate": 1.8315058046503018e-09, + "loss": 0.0, + "step": 64204 + }, + { + "epoch": 5.990948959596902, + "grad_norm": NaN, + "learning_rate": 1.7943189117020084e-09, + "loss": 0.0, + "step": 64205 + }, + { + "epoch": 5.991042269291779, + "grad_norm": NaN, + "learning_rate": 1.7575134197478358e-09, + "loss": 0.0, + "step": 64206 + }, + { + "epoch": 5.991135578986657, + "grad_norm": NaN, + "learning_rate": 1.7210893289043571e-09, + "loss": 0.0, + "step": 64207 + }, + { + "epoch": 5.991228888681534, + "grad_norm": NaN, + "learning_rate": 1.685046639254839e-09, + "loss": 0.0, + "step": 64208 + }, + { + "epoch": 5.991322198376412, + "grad_norm": NaN, + "learning_rate": 1.6493853508825484e-09, + "loss": 0.0, + "step": 64209 + }, + { + "epoch": 5.991415508071289, + "grad_norm": NaN, + "learning_rate": 1.6141054638707517e-09, + "loss": 0.0, + "step": 64210 + }, + { + "epoch": 5.9915088177661655, + "grad_norm": NaN, + "learning_rate": 1.5792069783360228e-09, + "loss": 0.0, + "step": 64211 + }, + { + "epoch": 5.991602127461043, + "grad_norm": NaN, + "learning_rate": 1.5446898943449747e-09, + "loss": 0.0, + "step": 64212 + }, + { + "epoch": 5.99169543715592, + "grad_norm": NaN, + "learning_rate": 1.5105542119975277e-09, + "loss": 0.0, + "step": 64213 + }, + { + "epoch": 5.991788746850798, + "grad_norm": NaN, + "learning_rate": 1.4767999313769484e-09, + "loss": 0.0, + "step": 64214 + }, + { + "epoch": 5.991882056545675, + "grad_norm": NaN, + "learning_rate": 1.4434270525665038e-09, + "loss": 0.0, + "step": 64215 + }, + { + "epoch": 5.991975366240553, + "grad_norm": NaN, + "learning_rate": 1.4104355756661134e-09, + "loss": 0.0, + "step": 64216 + }, + { + "epoch": 5.99206867593543, + "grad_norm": NaN, + "learning_rate": 1.3778255007257377e-09, + "loss": 0.0, + "step": 64217 + }, + { + "epoch": 5.992161985630307, + "grad_norm": NaN, + "learning_rate": 1.34559682786195e-09, + "loss": 0.0, + "step": 64218 + }, + { + "epoch": 5.992255295325184, + "grad_norm": NaN, + "learning_rate": 1.3137495571413637e-09, + "loss": 0.0, + "step": 64219 + }, + { + "epoch": 5.992348605020061, + "grad_norm": NaN, + "learning_rate": 1.282283688630592e-09, + "loss": 0.0, + "step": 64220 + }, + { + "epoch": 5.992441914714939, + "grad_norm": NaN, + "learning_rate": 1.2511992224462086e-09, + "loss": 0.0, + "step": 64221 + }, + { + "epoch": 5.992535224409816, + "grad_norm": NaN, + "learning_rate": 1.2204961586381734e-09, + "loss": 0.0, + "step": 64222 + }, + { + "epoch": 5.992628534104694, + "grad_norm": NaN, + "learning_rate": 1.190174497289753e-09, + "loss": 0.0, + "step": 64223 + }, + { + "epoch": 5.992721843799571, + "grad_norm": NaN, + "learning_rate": 1.1602342384842146e-09, + "loss": 0.0, + "step": 64224 + }, + { + "epoch": 5.992815153494448, + "grad_norm": NaN, + "learning_rate": 1.1306753823048242e-09, + "loss": 0.0, + "step": 64225 + }, + { + "epoch": 5.992908463189325, + "grad_norm": NaN, + "learning_rate": 1.1014979288181957e-09, + "loss": 0.0, + "step": 64226 + }, + { + "epoch": 5.993001772884202, + "grad_norm": NaN, + "learning_rate": 1.0727018780909423e-09, + "loss": 0.0, + "step": 64227 + }, + { + "epoch": 5.99309508257908, + "grad_norm": NaN, + "learning_rate": 1.0442872301896777e-09, + "loss": 0.0, + "step": 64228 + }, + { + "epoch": 5.993188392273957, + "grad_norm": NaN, + "learning_rate": 1.0162539852143214e-09, + "loss": 0.0, + "step": 64229 + }, + { + "epoch": 5.993281701968835, + "grad_norm": NaN, + "learning_rate": 9.886021432148338e-10, + "loss": 0.0, + "step": 64230 + }, + { + "epoch": 5.993375011663712, + "grad_norm": NaN, + "learning_rate": 9.613317042744816e-10, + "loss": 0.0, + "step": 64231 + }, + { + "epoch": 5.9934683213585895, + "grad_norm": NaN, + "learning_rate": 9.344426684598782e-10, + "loss": 0.0, + "step": 64232 + }, + { + "epoch": 5.993561631053467, + "grad_norm": NaN, + "learning_rate": 9.079350358209836e-10, + "loss": 0.0, + "step": 64233 + }, + { + "epoch": 5.9936549407483435, + "grad_norm": NaN, + "learning_rate": 8.818088064577178e-10, + "loss": 0.0, + "step": 64234 + }, + { + "epoch": 5.993748250443221, + "grad_norm": NaN, + "learning_rate": 8.560639804033875e-10, + "loss": 0.0, + "step": 64235 + }, + { + "epoch": 5.993841560138098, + "grad_norm": NaN, + "learning_rate": 8.307005577579129e-10, + "loss": 0.0, + "step": 64236 + }, + { + "epoch": 5.993934869832976, + "grad_norm": NaN, + "learning_rate": 8.057185385546005e-10, + "loss": 0.0, + "step": 64237 + }, + { + "epoch": 5.994028179527853, + "grad_norm": NaN, + "learning_rate": 7.811179228767173e-10, + "loss": 0.0, + "step": 64238 + }, + { + "epoch": 5.994121489222731, + "grad_norm": NaN, + "learning_rate": 7.568987107575697e-10, + "loss": 0.0, + "step": 64239 + }, + { + "epoch": 5.994214798917607, + "grad_norm": NaN, + "learning_rate": 7.330609022970779e-10, + "loss": 0.0, + "step": 64240 + }, + { + "epoch": 5.9943081086124845, + "grad_norm": NaN, + "learning_rate": 7.096044975452019e-10, + "loss": 0.0, + "step": 64241 + }, + { + "epoch": 5.994401418307362, + "grad_norm": NaN, + "learning_rate": 6.865294965519019e-10, + "loss": 0.0, + "step": 64242 + }, + { + "epoch": 5.994494728002239, + "grad_norm": NaN, + "learning_rate": 6.638358993671378e-10, + "loss": 0.0, + "step": 64243 + }, + { + "epoch": 5.994588037697117, + "grad_norm": NaN, + "learning_rate": 6.415237060741763e-10, + "loss": 0.0, + "step": 64244 + }, + { + "epoch": 5.994681347391994, + "grad_norm": NaN, + "learning_rate": 6.195929167229774e-10, + "loss": 0.0, + "step": 64245 + }, + { + "epoch": 5.994774657086872, + "grad_norm": NaN, + "learning_rate": 5.980435313468479e-10, + "loss": 0.0, + "step": 64246 + }, + { + "epoch": 5.994867966781749, + "grad_norm": NaN, + "learning_rate": 5.768755500290545e-10, + "loss": 0.0, + "step": 64247 + }, + { + "epoch": 5.994961276476626, + "grad_norm": NaN, + "learning_rate": 5.560889728029038e-10, + "loss": 0.0, + "step": 64248 + }, + { + "epoch": 5.995054586171503, + "grad_norm": NaN, + "learning_rate": 5.356837997350094e-10, + "loss": 0.0, + "step": 64249 + }, + { + "epoch": 5.99514789586638, + "grad_norm": NaN, + "learning_rate": 5.156600308586778e-10, + "loss": 0.0, + "step": 64250 + }, + { + "epoch": 5.995241205561258, + "grad_norm": NaN, + "learning_rate": 4.960176662571757e-10, + "loss": 0.0, + "step": 64251 + }, + { + "epoch": 5.995334515256135, + "grad_norm": NaN, + "learning_rate": 4.767567059471567e-10, + "loss": 0.0, + "step": 64252 + }, + { + "epoch": 5.995427824951013, + "grad_norm": NaN, + "learning_rate": 4.5787714999523383e-10, + "loss": 0.0, + "step": 64253 + }, + { + "epoch": 5.99552113464589, + "grad_norm": NaN, + "learning_rate": 4.393789984513674e-10, + "loss": 0.0, + "step": 64254 + }, + { + "epoch": 5.995614444340767, + "grad_norm": NaN, + "learning_rate": 4.2126225134886393e-10, + "loss": 0.0, + "step": 64255 + }, + { + "epoch": 5.995707754035644, + "grad_norm": NaN, + "learning_rate": 4.035269087543369e-10, + "loss": 0.0, + "step": 64256 + }, + { + "epoch": 5.9958010637305215, + "grad_norm": NaN, + "learning_rate": 3.8617297068443966e-10, + "loss": 0.0, + "step": 64257 + }, + { + "epoch": 5.995894373425399, + "grad_norm": NaN, + "learning_rate": 3.692004372057855e-10, + "loss": 0.0, + "step": 64258 + }, + { + "epoch": 5.995987683120276, + "grad_norm": NaN, + "learning_rate": 3.5260930835168124e-10, + "loss": 0.0, + "step": 64259 + }, + { + "epoch": 5.996080992815154, + "grad_norm": NaN, + "learning_rate": 3.3639958417208677e-10, + "loss": 0.0, + "step": 64260 + }, + { + "epoch": 5.996174302510031, + "grad_norm": NaN, + "learning_rate": 3.205712647169623e-10, + "loss": 0.0, + "step": 64261 + }, + { + "epoch": 5.996267612204909, + "grad_norm": NaN, + "learning_rate": 3.0512435000296096e-10, + "loss": 0.0, + "step": 64262 + }, + { + "epoch": 5.996360921899785, + "grad_norm": NaN, + "learning_rate": 2.900588400633896e-10, + "loss": 0.0, + "step": 64263 + }, + { + "epoch": 5.9964542315946625, + "grad_norm": NaN, + "learning_rate": 2.7537473498151494e-10, + "loss": 0.0, + "step": 64264 + }, + { + "epoch": 5.99654754128954, + "grad_norm": NaN, + "learning_rate": 2.610720347406836e-10, + "loss": 0.0, + "step": 64265 + }, + { + "epoch": 5.996640850984417, + "grad_norm": NaN, + "learning_rate": 2.471507394241623e-10, + "loss": 0.0, + "step": 64266 + }, + { + "epoch": 5.996734160679295, + "grad_norm": NaN, + "learning_rate": 2.3361084903195104e-10, + "loss": 0.0, + "step": 64267 + }, + { + "epoch": 5.996827470374172, + "grad_norm": NaN, + "learning_rate": 2.2045236361400986e-10, + "loss": 0.0, + "step": 64268 + }, + { + "epoch": 5.996920780069049, + "grad_norm": NaN, + "learning_rate": 2.0767528320364545e-10, + "loss": 0.0, + "step": 64269 + }, + { + "epoch": 5.997014089763926, + "grad_norm": NaN, + "learning_rate": 1.952796078341645e-10, + "loss": 0.0, + "step": 64270 + }, + { + "epoch": 5.997107399458804, + "grad_norm": NaN, + "learning_rate": 1.832653375222204e-10, + "loss": 0.0, + "step": 64271 + }, + { + "epoch": 5.997200709153681, + "grad_norm": NaN, + "learning_rate": 1.7163247233442644e-10, + "loss": 0.0, + "step": 64272 + }, + { + "epoch": 5.997294018848558, + "grad_norm": NaN, + "learning_rate": 1.6038101225412936e-10, + "loss": 0.0, + "step": 64273 + }, + { + "epoch": 5.997387328543436, + "grad_norm": NaN, + "learning_rate": 1.495109573479425e-10, + "loss": 0.0, + "step": 64274 + }, + { + "epoch": 5.997480638238313, + "grad_norm": NaN, + "learning_rate": 1.390223076325192e-10, + "loss": 0.0, + "step": 64275 + }, + { + "epoch": 5.997573947933191, + "grad_norm": NaN, + "learning_rate": 1.2891506310785947e-10, + "loss": 0.0, + "step": 64276 + }, + { + "epoch": 5.997667257628067, + "grad_norm": NaN, + "learning_rate": 1.1918922384057673e-10, + "loss": 0.0, + "step": 64277 + }, + { + "epoch": 5.997760567322945, + "grad_norm": NaN, + "learning_rate": 1.0984478983067091e-10, + "loss": 0.0, + "step": 64278 + }, + { + "epoch": 5.997853877017822, + "grad_norm": NaN, + "learning_rate": 1.0088176111144874e-10, + "loss": 0.0, + "step": 64279 + }, + { + "epoch": 5.997947186712699, + "grad_norm": NaN, + "learning_rate": 9.230013769956356e-11, + "loss": 0.0, + "step": 64280 + }, + { + "epoch": 5.998040496407577, + "grad_norm": NaN, + "learning_rate": 8.409991962832207e-11, + "loss": 0.0, + "step": 64281 + }, + { + "epoch": 5.998133806102454, + "grad_norm": NaN, + "learning_rate": 7.628110689772427e-11, + "loss": 0.0, + "step": 64282 + }, + { + "epoch": 5.998227115797332, + "grad_norm": NaN, + "learning_rate": 6.884369954107682e-11, + "loss": 0.0, + "step": 64283 + }, + { + "epoch": 5.998320425492208, + "grad_norm": NaN, + "learning_rate": 6.178769757503311e-11, + "loss": 0.0, + "step": 64284 + }, + { + "epoch": 5.998413735187086, + "grad_norm": NaN, + "learning_rate": 5.511310103289979e-11, + "loss": 0.0, + "step": 64285 + }, + { + "epoch": 5.998507044881963, + "grad_norm": NaN, + "learning_rate": 4.881990989802354e-11, + "loss": 0.0, + "step": 64286 + }, + { + "epoch": 5.9986003545768405, + "grad_norm": NaN, + "learning_rate": 4.290812422036438e-11, + "loss": 0.0, + "step": 64287 + }, + { + "epoch": 5.998693664271718, + "grad_norm": NaN, + "learning_rate": 3.737774399992233e-11, + "loss": 0.0, + "step": 64288 + }, + { + "epoch": 5.998786973966595, + "grad_norm": NaN, + "learning_rate": 3.222876923669737e-11, + "loss": 0.0, + "step": 64289 + }, + { + "epoch": 5.998880283661473, + "grad_norm": NaN, + "learning_rate": 2.7461199963996207e-11, + "loss": 0.0, + "step": 64290 + }, + { + "epoch": 5.99897359335635, + "grad_norm": NaN, + "learning_rate": 2.3075036198472175e-11, + "loss": 0.0, + "step": 64291 + }, + { + "epoch": 5.999066903051227, + "grad_norm": NaN, + "learning_rate": 1.907027794012528e-11, + "loss": 0.0, + "step": 64292 + }, + { + "epoch": 5.999160212746104, + "grad_norm": NaN, + "learning_rate": 1.5446925188955518e-11, + "loss": 0.0, + "step": 64293 + }, + { + "epoch": 5.9992535224409815, + "grad_norm": NaN, + "learning_rate": 1.2204977978269581e-11, + "loss": 0.0, + "step": 64294 + }, + { + "epoch": 5.999346832135859, + "grad_norm": NaN, + "learning_rate": 9.344436291414126e-12, + "loss": 0.0, + "step": 64295 + }, + { + "epoch": 5.999440141830736, + "grad_norm": NaN, + "learning_rate": 6.865300145042496e-12, + "loss": 0.0, + "step": 64296 + }, + { + "epoch": 5.999533451525614, + "grad_norm": NaN, + "learning_rate": 4.7675695558080376e-12, + "loss": 0.0, + "step": 64297 + }, + { + "epoch": 5.99962676122049, + "grad_norm": NaN, + "learning_rate": 3.0512445237107498e-12, + "loss": 0.0, + "step": 64298 + }, + { + "epoch": 5.999720070915368, + "grad_norm": NaN, + "learning_rate": 1.7163250487506331e-12, + "loss": 0.0, + "step": 64299 + }, + { + "epoch": 5.999813380610245, + "grad_norm": NaN, + "learning_rate": 7.628111309276874e-13, + "loss": 0.0, + "step": 64300 + }, + { + "epoch": 5.999906690305123, + "grad_norm": NaN, + "learning_rate": 1.907027868952582e-13, + "loss": 0.0, + "step": 64301 + }, + { + "epoch": 6.0, + "grad_norm": NaN, + "learning_rate": 0.0, + "loss": 0.0, + "step": 64302 + }, + { + "epoch": 6.0, + "eval_loss": NaN, + "eval_runtime": 26.3862, + "eval_samples_per_second": 6.708, + "eval_steps_per_second": 6.708, + "step": 64302 + }, + { + "epoch": 6.0, + "step": 64302, + "total_flos": 358029585285120.0, + "train_loss": 2.4681696535317337, + "train_runtime": 44136.3163, + "train_samples_per_second": 1.457, + "train_steps_per_second": 1.457 + } + ], + "logging_steps": 1.0, + "max_steps": 64302, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 358029585285120.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}